Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,14 @@ annotations_test/**
log_conversion/**
debug_model_inputs/**
temp_dir/**
datasets/
**/datasets/
apikey.txt
slurm-*.out
slurmoutputs/
*.log
.inductor_cache/

scratch/
external_ckpts/
external/MANO/
28 changes: 22 additions & 6 deletions egomimic/algo/pi.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,10 @@
)
from egomimic.rldb.embodiment.embodiment import get_embodiment, get_embodiment_id
from egomimic.utils.action_utils import (
ConverterRegistry,
PI05_CARTESIAN_ACTION_ENCODING_LEGACY,
PI05_CARTESIAN_ACTION_ENCODING_NORM_ROT_6D,
PI05_CARTESIAN_ACTION_ENCODING_RAW_ROT_6D,
ConverterRegistry,
)

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -306,7 +307,9 @@ def _action_stats(self, embodiment_id: int, ac_key: str) -> dict:
f"and embodiment id {embodiment_id}"
) from exc

def _unnormalize_action(self, action: torch.Tensor, embodiment_id: int, ac_key: str):
def _unnormalize_action(
self, action: torch.Tensor, embodiment_id: int, ac_key: str
):
return self.norm_stats.unnormalize(
{ac_key: action.clone(), "embodiment": embodiment_id},
embodiment_id,
Expand Down Expand Up @@ -467,22 +470,30 @@ def forward_eval(self, batch):
num_steps=self.num_steps,
)

pred_actions = pred_actions.clone()

predictions = OrderedDict()
ref = _batch[ac_key]
B, T, D = ref.shape

converter = self.action_registry.get(embodiment_id, ac_key)
if (
self.action_encoding
== PI05_CARTESIAN_ACTION_ENCODING_RAW_ROT_6D
):
if self.action_encoding == PI05_CARTESIAN_ACTION_ENCODING_RAW_ROT_6D:
pred_actions_orig = converter.from32_raw_rotation(
pred_actions,
stats=self._action_stats(embodiment_id, ac_key),
norm_mode=self.norm_stats.norm_mode,
unnormalize_non_rotation=True,
)
unnorm_actions = {ac_key: pred_actions_orig[:, :T, :D]}
elif self.action_encoding == PI05_CARTESIAN_ACTION_ENCODING_NORM_ROT_6D:
# Extract the normalized xyz+6D(+gripper) action, then
# unnormalize via the standard pipeline (stats were computed
# over the 6D representation) to get raw 6D actions.
pred_6d = converter.from32_norm_6d(pred_actions)
predictions[ac_key] = pred_6d[:, :T, :D]
unnorm_actions = self.norm_stats.unnormalize(
predictions, embodiment_id
)
elif self.action_encoding == PI05_CARTESIAN_ACTION_ENCODING_LEGACY:
pred_actions_orig = converter.from32(pred_actions)
pred = pred_actions_orig[:, :T, :D]
Expand Down Expand Up @@ -576,6 +587,11 @@ def _robomimic_to_pi_data(
stats=self._action_stats(emb_id, ac_key),
norm_mode=self.norm_stats.norm_mode,
)
elif self.action_encoding == PI05_CARTESIAN_ACTION_ENCODING_NORM_ROT_6D:
# Action is already a normalized xyz+6D(+gripper) chunk (the
# ypr->6D conversion happened in the CartesianYPRToRot6D data
# transform). Just pack it into the 32D vector.
action32 = converter.to32_norm_6d(action)
elif self.action_encoding == PI05_CARTESIAN_ACTION_ENCODING_LEGACY:
action32 = converter.to32(action)
else:
Expand Down
3 changes: 2 additions & 1 deletion egomimic/hydra_configs/callbacks/checkpoints.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ model_checkpoint:
filename: "epoch_{epoch}"
save_last: true
save_top_k: -1
every_n_epochs: 100
every_n_epochs: 50
save_on_train_epoch_end: true
16 changes: 8 additions & 8 deletions egomimic/hydra_configs/data/cotrain_pi_base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,17 @@ train_datasets:

valid_datasets:
eva_bimanual:
_target_: ${train_datasets.eva_bimanual._target_}
resolver: ${train_datasets.eva_bimanual.resolver}
filters: ${train_datasets.eva_bimanual.filters}
_target_: ${...train_datasets.eva_bimanual._target_}
resolver: ${...train_datasets.eva_bimanual.resolver}
filters: ${...train_datasets.eva_bimanual.filters}
mode: valid
valid_ratio: ${train_datasets.eva_bimanual.valid_ratio}
valid_ratio: ${...train_datasets.eva_bimanual.valid_ratio}
aria_bimanual:
_target_: ${train_datasets.aria_bimanual._target_}
resolver: ${train_datasets.aria_bimanual.resolver}
filters: ${train_datasets.aria_bimanual.filters}
_target_: ${...train_datasets.aria_bimanual._target_}
resolver: ${...train_datasets.aria_bimanual.resolver}
filters: ${...train_datasets.aria_bimanual.filters}
mode: valid
valid_ratio: ${train_datasets.aria_bimanual.valid_ratio}
valid_ratio: ${...train_datasets.aria_bimanual.valid_ratio}

train_dataloader_params:
eva_bimanual:
Expand Down
11 changes: 7 additions & 4 deletions egomimic/hydra_configs/data/cotrain_pi_lang.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,27 @@ train_datasets:
_target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter
project_name: "dense-language"
filter_lambdas:
- "lambda row: (row['robot_name'] == 'eva_bimanual') & (row['task'] == 'pick_place') & (row['zarr_processed_path'] != '')"
- "lambda row: (row.get('embodiment') == 'eva_bimanual') & (row['task'] == 'pick_place') & (row['zarr_processed_path'] != '')"
aria_bimanual:
resolver:
key_map:
keymap_mode: cartesian_pi
filters:
_target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter
project_name: "dense-language"
filter_lambdas:
- "lambda row: (row['robot_name'] == 'aria_bimanual') & (row['task'] == 'pick_place') & (row['zarr_processed_path'] != '')"
- "lambda row: (row.get('embodiment') == 'aria_bimanual') & (row['task'] == 'pick_place') & (row['zarr_processed_path'] != '')"

valid_datasets:
eva_bimanual:
filters:
_target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter
project_name: "dense-language"
filter_lambdas:
- "lambda row: row['robot_name'] == 'eva_bimanual' and row['task'] == 'pick_place' and row['zarr_processed_path'] != '' and 'alignment' not in (row.get('task_description') or '')"
- "lambda row: row.get('embodiment') == 'eva_bimanual' and row['task'] == 'pick_place' and row['zarr_processed_path'] != '' and 'alignment' not in (row.get('task_description') or '')"
aria_bimanual:
filters:
_target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter
project_name: "dense-language"
filter_lambdas:
- "lambda row: row['robot_name'] == 'aria_bimanual' and row['task'] == 'pick_place' and row['zarr_processed_path'] != '' and 'alignment' not in (row.get('task_description') or '')"
- "lambda row: row.get('embodiment') == 'aria_bimanual' and row['task'] == 'pick_place' and row['zarr_processed_path'] != '' and 'alignment' not in (row.get('task_description') or '')"
19 changes: 19 additions & 0 deletions egomimic/hydra_configs/data/cotrain_pi_lang_6d.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
defaults:
- cotrain_pi_lang
- _self_

# Same dense-language cotrain data as `cotrain_pi_lang`, but the cartesian action
# chunk is expressed with the continuous 6D rotation representation
# (xyz+6D per arm, +gripper for eva) instead of xyz+ypr. Pairs with the
# `cartesian_normalized_rot6d` action_encoding (model=pi0.5_cotrain_eva_aria_6d).
# Valid datasets inherit the train resolver via the relative
# `${...train_datasets.<name>.resolver}` interpolation in cotrain_pi_base, so
# overriding the train transform mode is sufficient.
train_datasets:
eva_bimanual:
resolver:
transform_list:
mode: cartesian_6d
aria_bimanual:
resolver:
transform_list:
mode: cartesian_6d
33 changes: 33 additions & 0 deletions egomimic/hydra_configs/data/obj_gen_pi_lang.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
defaults:
- cotrain_pi_base
- _self_

# Motion-generalization cotrain: annotated-only eva + annotated-only aria
# (base/object descriptions). PI-style camera keys for both embodiments.

train_datasets:
eva_bimanual:
resolver:
folder_path: /storage/home/hcoda1/5/agao81/r-dxu345-0/pick_place
key_map:
keymap_mode: cartesian_pi
filters:
_target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter
project_name: "dense-language"
filter_lambdas:
- "lambda row: row.get('task') == 'pick_place' and row.get('embodiment') == 'eva_bimanual' and (row.get('zarr_processed_path') or '') != '' and 'alignment' not in ((row.get('task_description') or '').lower()) and (row.get('episode_hash') or '') != '2026-04-22-02-30-32-296000'"
aria_bimanual:
resolver:
folder_path: /storage/home/hcoda1/5/agao81/r-dxu345-0/pick_place
key_map:
keymap_mode: cartesian_pi
filters:
_target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter
project_name: "dense-language"
filter_lambdas:
# Exclude these aria episodes: every base_0_rgb JPEG decode fails (the
# entire episode is corrupted), which exhausts the random-retry budget at
# train time.
# 2026-04-26-00-17-53-000000
# 2026-04-26-00-27-26-000000
# 2026-05-01-02-52-58-000000
- "lambda row: row.get('task') == 'pick_place' and row.get('embodiment') == 'aria_bimanual' and (row.get('zarr_processed_path') or '') != '' and any(s in ((row.get('task_description') or '').lower()) for s in ('base', 'object')) and (row.get('episode_hash') or '') not in ('2026-04-26-00-17-53-000000', '2026-04-26-00-27-26-000000', '2026-05-01-02-52-58-000000')"
25 changes: 25 additions & 0 deletions egomimic/hydra_configs/data/obj_gen_pi_lang_6d.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
defaults:
- obj_gen_pi_lang
- _self_

# 6D-rotation variant of the motion-generalization cotrain (obj_gen_pi_lang):
# - cartesian action chunk expressed with the continuous 6D rotation
# representation (xyz+6D per arm, +gripper for eva) via the `cartesian_6d`
# transform mode. Pairs with model=pi0.5_cotrain_eva_aria_6d
# (action_encoding=cartesian_normalized_rot6d) and evaluator=eval_pi_camframe_6d.
# - dataset switched from the plain Scale-annotation resolver to the
# AnnotationCutoff resolver, which clamps each action chunk at the end of the
# enclosing language-annotation span (ZarrAnnotationCutoffDataset).
# Valid datasets inherit the train resolver via the relative
# `${...train_datasets.<name>.resolver}` interpolation in cotrain_pi_base, so
# overriding the train resolver is sufficient.
train_datasets:
eva_bimanual:
resolver:
_target_: egomimic.rldb.zarr.zarr_dataset_multi.S3AnnotationCutoffEpisodeResolver
transform_list:
mode: cartesian_6d
aria_bimanual:
resolver:
_target_: egomimic.rldb.zarr.zarr_dataset_multi.S3AnnotationCutoffEpisodeResolver
transform_list:
mode: cartesian_6d
40 changes: 40 additions & 0 deletions egomimic/hydra_configs/data/obj_gen_pi_lang_wristframe.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
defaults:
- cotrain_pi_base
- _self_

# Wristframe variant of obj_gen_pi_lang: annotated-only eva + annotated-only
# aria (base/object descriptions). Actions are expressed in each wrist's own
# frame (cartesian_wristframe_ypr) instead of the head/camera frame. Pair with
# evaluator=eval_pi so the revert transform projects predictions back to cam
# frame for the viz video.

train_datasets:
eva_bimanual:
resolver:
folder_path: /storage/home/hcoda1/5/agao81/r-dxu345-0/pick_place
key_map:
keymap_mode: cartesian_pi
transform_list:
mode: cartesian_wristframe_ypr
filters:
_target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter
project_name: "dense-language"
filter_lambdas:
- "lambda row: row.get('task') == 'pick_place' and row.get('embodiment') == 'eva_bimanual' and (row.get('zarr_processed_path') or '') != '' and 'alignment' not in ((row.get('task_description') or '').lower()) and (row.get('episode_hash') or '') != '2026-04-22-02-30-32-296000'"
aria_bimanual:
resolver:
folder_path: /storage/home/hcoda1/5/agao81/r-dxu345-0/pick_place
key_map:
keymap_mode: cartesian_pi
transform_list:
mode: cartesian_wristframe_ypr
filters:
_target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter
project_name: "dense-language"
filter_lambdas:
# Exclude these aria episodes: every base_0_rgb JPEG decode fails (the
# entire episode is corrupted), which exhausts the random-retry budget at
# train time.
# 2026-04-26-00-17-53-000000
# 2026-04-26-00-27-26-000000
# 2026-05-01-02-52-58-000000
- "lambda row: row.get('task') == 'pick_place' and row.get('embodiment') == 'aria_bimanual' and (row.get('zarr_processed_path') or '') != '' and any(s in ((row.get('task_description') or '').lower()) for s in ('base', 'object')) and (row.get('episode_hash') or '') not in ('2026-04-26-00-17-53-000000', '2026-04-26-00-27-26-000000', '2026-05-01-02-52-58-000000')"
25 changes: 25 additions & 0 deletions egomimic/hydra_configs/data/obj_gen_pi_lang_wristframe_6d.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
defaults:
- obj_gen_pi_lang_wristframe
- _self_

# 6D-rotation + AnnotationCutoff variant of the wrist-frame motion-generalization
# cotrain (obj_gen_pi_lang_wristframe):
# - actions in each wrist's own frame, rotation expressed as the continuous 6D
# representation via the `cartesian_wristframe_6d` transform mode. Pairs with
# model=pi0.5_cotrain_eva_aria_6d and evaluator=eval_pi_wristframe_6d (which
# un-6Ds then projects wrist-frame preds back to cam frame for viz/MSE).
# - dataset switched to the AnnotationCutoff resolver (clamps each action chunk
# at the end of its enclosing language-annotation span).
# Valid datasets inherit the train resolver via the relative
# `${...train_datasets.<name>.resolver}` interpolation in cotrain_pi_base, so
# overriding the train resolver is sufficient.
train_datasets:
eva_bimanual:
resolver:
_target_: egomimic.rldb.zarr.zarr_dataset_multi.S3AnnotationCutoffEpisodeResolver
transform_list:
mode: cartesian_wristframe_6d
aria_bimanual:
resolver:
_target_: egomimic.rldb.zarr.zarr_dataset_multi.S3AnnotationCutoffEpisodeResolver
transform_list:
mode: cartesian_wristframe_6d
18 changes: 18 additions & 0 deletions egomimic/hydra_configs/evaluator/eval_pi_camframe_6d.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
defaults:
- viz@viz_func: pi_cartesian_lang
- _self_

_target_: egomimic.eval.eval_pi.PIEvalVideo

# Cam-frame 6D-rotation variant: use when the data config expresses cartesian
# actions in head/camera frame with the continuous 6D rotation representation
# (e.g. a data config with `transform_list: mode: cartesian_6d`). Actions are
# already in cam (head) frame, so no frame change is needed — the revert only
# converts the rotation back from xyz+6D (9/arm) to xyz+ypr (6/arm) so the viz
# video and cam-frame MSE see the same ypr layout as the plain cartesian mode.
# Each value resolves to a list[Transform] via its ``_target_``.
transform_lists:
eva_bimanual:
_target_: egomimic.rldb.embodiment.eva._build_eva_cartesian_revert_6d_transform_list
aria_bimanual:
_target_: egomimic.rldb.embodiment.human._build_aria_cartesian_revert_6d_transform_list
18 changes: 18 additions & 0 deletions egomimic/hydra_configs/evaluator/eval_pi_wristframe_6d.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
defaults:
- viz@viz_func: pi_cartesian_lang
- _self_

_target_: egomimic.eval.eval_pi.PIEvalVideo

# Wrist-frame 6D-rotation variant: use with a data config that applies the
# `cartesian_wristframe_6d` transform mode (e.g. obj_gen_pi_lang_wristframe_6d).
# The model predicts in each wrist's local frame using the continuous 6D
# rotation; these revert transforms first convert the rotation xyz+6D -> xyz+ypr
# and then project predictions + gt back to cam (head) frame for the cam-frame
# MSE and the viz video. Each value resolves to a list[Transform] via its
# ``_target_``. These transform lists must match the viz config mounted above.
transform_lists:
eva_bimanual:
_target_: egomimic.rldb.embodiment.eva._build_eva_cartesian_revert_6d_wristframe_transform_list
aria_bimanual:
_target_: egomimic.rldb.embodiment.human._build_aria_cartesian_revert_6d_wristframe_transform_list
11 changes: 11 additions & 0 deletions egomimic/hydra_configs/model/pi0.5_cotrain_eva_aria_6d.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
defaults:
- pi0.5_cotrain_eva_aria
- _self_

# Normalized continuous-6D rotation encoding. Actions arrive already in
# xyz+6D(+gripper) layout (via the `cartesian_6d` transform mode) and normalized
# by the standard pipeline; the forward pass only packs them into the 32D vector.
robomimic_model:
action_encoding: "cartesian_normalized_rot6d"
# Do not splice "Embodiment: <name>" into the prompt (pi0.5_base defaults it on).
embodiment_label: false
Loading