diff --git a/.gitignore b/.gitignore index e85860466..4d617a414 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,14 @@ annotations_test/** log_conversion/** debug_model_inputs/** temp_dir/** +datasets/ +**/datasets/ +apikey.txt +slurm-*.out +slurmoutputs/ +*.log +.inductor_cache/ + scratch/ external_ckpts/ external/MANO/ diff --git a/egomimic/algo/pi.py b/egomimic/algo/pi.py index 952d28c93..98a70791c 100644 --- a/egomimic/algo/pi.py +++ b/egomimic/algo/pi.py @@ -26,9 +26,10 @@ ) from egomimic.rldb.embodiment.embodiment import get_embodiment, get_embodiment_id from egomimic.utils.action_utils import ( - ConverterRegistry, PI05_CARTESIAN_ACTION_ENCODING_LEGACY, + PI05_CARTESIAN_ACTION_ENCODING_NORM_ROT_6D, PI05_CARTESIAN_ACTION_ENCODING_RAW_ROT_6D, + ConverterRegistry, ) logger = logging.getLogger(__name__) @@ -306,7 +307,9 @@ def _action_stats(self, embodiment_id: int, ac_key: str) -> dict: f"and embodiment id {embodiment_id}" ) from exc - def _unnormalize_action(self, action: torch.Tensor, embodiment_id: int, ac_key: str): + def _unnormalize_action( + self, action: torch.Tensor, embodiment_id: int, ac_key: str + ): return self.norm_stats.unnormalize( {ac_key: action.clone(), "embodiment": embodiment_id}, embodiment_id, @@ -467,15 +470,14 @@ def forward_eval(self, batch): num_steps=self.num_steps, ) + pred_actions = pred_actions.clone() + predictions = OrderedDict() ref = _batch[ac_key] B, T, D = ref.shape converter = self.action_registry.get(embodiment_id, ac_key) - if ( - self.action_encoding - == PI05_CARTESIAN_ACTION_ENCODING_RAW_ROT_6D - ): + if self.action_encoding == PI05_CARTESIAN_ACTION_ENCODING_RAW_ROT_6D: pred_actions_orig = converter.from32_raw_rotation( pred_actions, stats=self._action_stats(embodiment_id, ac_key), @@ -483,6 +485,15 @@ def forward_eval(self, batch): unnormalize_non_rotation=True, ) unnorm_actions = {ac_key: pred_actions_orig[:, :T, :D]} + elif self.action_encoding == PI05_CARTESIAN_ACTION_ENCODING_NORM_ROT_6D: + # Extract the 
normalized xyz+6D(+gripper) action, then + # unnormalize via the standard pipeline (stats were computed + # over the 6D representation) to get raw 6D actions. + pred_6d = converter.from32_norm_6d(pred_actions) + predictions[ac_key] = pred_6d[:, :T, :D] + unnorm_actions = self.norm_stats.unnormalize( + predictions, embodiment_id + ) elif self.action_encoding == PI05_CARTESIAN_ACTION_ENCODING_LEGACY: pred_actions_orig = converter.from32(pred_actions) pred = pred_actions_orig[:, :T, :D] @@ -576,6 +587,11 @@ def _robomimic_to_pi_data( stats=self._action_stats(emb_id, ac_key), norm_mode=self.norm_stats.norm_mode, ) + elif self.action_encoding == PI05_CARTESIAN_ACTION_ENCODING_NORM_ROT_6D: + # Action is already a normalized xyz+6D(+gripper) chunk (the + # ypr->6D conversion happened in the CartesianYPRToRot6D data + # transform). Just pack it into the 32D vector. + action32 = converter.to32_norm_6d(action) elif self.action_encoding == PI05_CARTESIAN_ACTION_ENCODING_LEGACY: action32 = converter.to32(action) else: diff --git a/egomimic/hydra_configs/callbacks/checkpoints.yaml b/egomimic/hydra_configs/callbacks/checkpoints.yaml index 86d21a792..a73448099 100644 --- a/egomimic/hydra_configs/callbacks/checkpoints.yaml +++ b/egomimic/hydra_configs/callbacks/checkpoints.yaml @@ -8,4 +8,5 @@ model_checkpoint: filename: "epoch_{epoch}" save_last: true save_top_k: -1 - every_n_epochs: 100 \ No newline at end of file + every_n_epochs: 50 + save_on_train_epoch_end: true \ No newline at end of file diff --git a/egomimic/hydra_configs/data/cotrain_pi_base.yaml b/egomimic/hydra_configs/data/cotrain_pi_base.yaml index ab4c2c3fc..09966a319 100644 --- a/egomimic/hydra_configs/data/cotrain_pi_base.yaml +++ b/egomimic/hydra_configs/data/cotrain_pi_base.yaml @@ -34,17 +34,17 @@ train_datasets: valid_datasets: eva_bimanual: - _target_: ${train_datasets.eva_bimanual._target_} - resolver: ${train_datasets.eva_bimanual.resolver} - filters: ${train_datasets.eva_bimanual.filters} + _target_: 
${...train_datasets.eva_bimanual._target_} + resolver: ${...train_datasets.eva_bimanual.resolver} + filters: ${...train_datasets.eva_bimanual.filters} mode: valid - valid_ratio: ${train_datasets.eva_bimanual.valid_ratio} + valid_ratio: ${...train_datasets.eva_bimanual.valid_ratio} aria_bimanual: - _target_: ${train_datasets.aria_bimanual._target_} - resolver: ${train_datasets.aria_bimanual.resolver} - filters: ${train_datasets.aria_bimanual.filters} + _target_: ${...train_datasets.aria_bimanual._target_} + resolver: ${...train_datasets.aria_bimanual.resolver} + filters: ${...train_datasets.aria_bimanual.filters} mode: valid - valid_ratio: ${train_datasets.aria_bimanual.valid_ratio} + valid_ratio: ${...train_datasets.aria_bimanual.valid_ratio} train_dataloader_params: eva_bimanual: diff --git a/egomimic/hydra_configs/data/cotrain_pi_lang.yaml b/egomimic/hydra_configs/data/cotrain_pi_lang.yaml index 5b9766b15..73ca4f9cc 100644 --- a/egomimic/hydra_configs/data/cotrain_pi_lang.yaml +++ b/egomimic/hydra_configs/data/cotrain_pi_lang.yaml @@ -11,13 +11,16 @@ train_datasets: _target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter project_name: "dense-language" filter_lambdas: - - "lambda row: (row['robot_name'] == 'eva_bimanual') & (row['task'] == 'pick_place') & (row['zarr_processed_path'] != '')" + - "lambda row: (row.get('embodiment') == 'eva_bimanual') & (row['task'] == 'pick_place') & (row['zarr_processed_path'] != '')" aria_bimanual: + resolver: + key_map: + keymap_mode: cartesian_pi filters: _target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter project_name: "dense-language" filter_lambdas: - - "lambda row: (row['robot_name'] == 'aria_bimanual') & (row['task'] == 'pick_place') & (row['zarr_processed_path'] != '')" + - "lambda row: (row.get('embodiment') == 'aria_bimanual') & (row['task'] == 'pick_place') & (row['zarr_processed_path'] != '')" valid_datasets: eva_bimanual: @@ -25,10 +28,10 @@ valid_datasets: _target_: 
egomimic.rldb.filters.ScaleAnnotationDatasetFilter project_name: "dense-language" filter_lambdas: - - "lambda row: row['robot_name'] == 'eva_bimanual' and row['task'] == 'pick_place' and row['zarr_processed_path'] != '' and 'alignment' not in (row.get('task_description') or '')" + - "lambda row: row.get('embodiment') == 'eva_bimanual' and row['task'] == 'pick_place' and row['zarr_processed_path'] != '' and 'alignment' not in (row.get('task_description') or '')" aria_bimanual: filters: _target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter project_name: "dense-language" filter_lambdas: - - "lambda row: row['robot_name'] == 'aria_bimanual' and row['task'] == 'pick_place' and row['zarr_processed_path'] != '' and 'alignment' not in (row.get('task_description') or '')" + - "lambda row: row.get('embodiment') == 'aria_bimanual' and row['task'] == 'pick_place' and row['zarr_processed_path'] != '' and 'alignment' not in (row.get('task_description') or '')" diff --git a/egomimic/hydra_configs/data/cotrain_pi_lang_6d.yaml b/egomimic/hydra_configs/data/cotrain_pi_lang_6d.yaml new file mode 100644 index 000000000..ddc032f1e --- /dev/null +++ b/egomimic/hydra_configs/data/cotrain_pi_lang_6d.yaml @@ -0,0 +1,19 @@ +defaults: + - cotrain_pi_lang + - _self_ + +# Same dense-language cotrain data as `cotrain_pi_lang`, but the cartesian action +# chunk is expressed with the continuous 6D rotation representation +# (xyz+6D per arm, +gripper for eva) instead of xyz+ypr. Pairs with the +# `cartesian_normalized_rot6d` action_encoding (model=pi0.5_cotrain_eva_aria_6d). +# Valid datasets inherit the train resolver via `${...}` interpolation in +# cotrain_pi_base, so overriding the train transform mode is sufficient. 
+train_datasets: + eva_bimanual: + resolver: + transform_list: + mode: cartesian_6d + aria_bimanual: + resolver: + transform_list: + mode: cartesian_6d diff --git a/egomimic/hydra_configs/data/obj_gen_pi_lang.yaml b/egomimic/hydra_configs/data/obj_gen_pi_lang.yaml new file mode 100644 index 000000000..08651b1dd --- /dev/null +++ b/egomimic/hydra_configs/data/obj_gen_pi_lang.yaml @@ -0,0 +1,33 @@ +defaults: + - cotrain_pi_base + - _self_ + +# Motion-generalization cotrain: annotated-only eva + annotated-only aria +# (base/object descriptions). PI-style camera keys for both embodiments. + +train_datasets: + eva_bimanual: + resolver: + folder_path: /storage/home/hcoda1/5/agao81/r-dxu345-0/pick_place + key_map: + keymap_mode: cartesian_pi + filters: + _target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter + project_name: "dense-language" + filter_lambdas: + - "lambda row: row.get('task') == 'pick_place' and row.get('embodiment') == 'eva_bimanual' and (row.get('zarr_processed_path') or '') != '' and 'alignment' not in ((row.get('task_description') or '').lower()) and (row.get('episode_hash') or '') != '2026-04-22-02-30-32-296000'" + aria_bimanual: + resolver: + folder_path: /storage/home/hcoda1/5/agao81/r-dxu345-0/pick_place + key_map: + keymap_mode: cartesian_pi + filters: + _target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter + project_name: "dense-language" + filter_lambdas: + # Exclude these aria episodes: every base_0_rgb JPEG decode fails + # (entire episode is corrupted), exhausts random-retry budget at train time. 
+ # 2026-04-26-00-17-53-000000 + # 2026-04-26-00-27-26-000000 + # 2026-05-01-02-52-58-000000 + - "lambda row: row.get('task') == 'pick_place' and row.get('embodiment') == 'aria_bimanual' and (row.get('zarr_processed_path') or '') != '' and any(s in ((row.get('task_description') or '').lower()) for s in ('base', 'object')) and (row.get('episode_hash') or '') not in ('2026-04-26-00-17-53-000000', '2026-04-26-00-27-26-000000', '2026-05-01-02-52-58-000000')" diff --git a/egomimic/hydra_configs/data/obj_gen_pi_lang_6d.yaml b/egomimic/hydra_configs/data/obj_gen_pi_lang_6d.yaml new file mode 100644 index 000000000..daf77f990 --- /dev/null +++ b/egomimic/hydra_configs/data/obj_gen_pi_lang_6d.yaml @@ -0,0 +1,25 @@ +defaults: + - obj_gen_pi_lang + - _self_ + +# 6D-rotation variant of the motion-generalization cotrain (obj_gen_pi_lang): +# - cartesian action chunk expressed with the continuous 6D rotation +# representation (xyz+6D per arm, +gripper for eva) via the `cartesian_6d` +# transform mode. Pairs with model=pi0.5_cotrain_eva_aria_6d +# (action_encoding=cartesian_normalized_rot6d) and evaluator=eval_pi_camframe_6d. +# - dataset switched from the plain Scale-annotation resolver to the +# AnnotationCutoff resolver, which clamps each action chunk at the end of the +# enclosing language-annotation span (ZarrAnnotationCutoffDataset). +# Valid datasets inherit the train resolver via `${...}` interpolation in +# cotrain_pi_base, so overriding the train resolver is sufficient. 
+train_datasets: + eva_bimanual: + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3AnnotationCutoffEpisodeResolver + transform_list: + mode: cartesian_6d + aria_bimanual: + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3AnnotationCutoffEpisodeResolver + transform_list: + mode: cartesian_6d diff --git a/egomimic/hydra_configs/data/obj_gen_pi_lang_wristframe.yaml b/egomimic/hydra_configs/data/obj_gen_pi_lang_wristframe.yaml new file mode 100644 index 000000000..c5d58a3e5 --- /dev/null +++ b/egomimic/hydra_configs/data/obj_gen_pi_lang_wristframe.yaml @@ -0,0 +1,40 @@ +defaults: + - cotrain_pi_base + - _self_ + +# Wristframe variant of motion_gen_pi_lang: annotated-only eva + annotated-only +# aria (base/object descriptions). Actions are expressed in each wrist's own +# frame (cartesian_wristframe_ypr) instead of the head/camera frame. Pair with +# evaluator=eval_pi so the revert transform projects predictions back to cam +# frame for the viz video. + +train_datasets: + eva_bimanual: + resolver: + folder_path: /storage/home/hcoda1/5/agao81/r-dxu345-0/pick_place + key_map: + keymap_mode: cartesian_pi + transform_list: + mode: cartesian_wristframe_ypr + filters: + _target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter + project_name: "dense-language" + filter_lambdas: + - "lambda row: row.get('task') == 'pick_place' and row.get('embodiment') == 'eva_bimanual' and (row.get('zarr_processed_path') or '') != '' and 'alignment' not in ((row.get('task_description') or '').lower()) and (row.get('episode_hash') or '') != '2026-04-22-02-30-32-296000'" + aria_bimanual: + resolver: + folder_path: /storage/home/hcoda1/5/agao81/r-dxu345-0/pick_place + key_map: + keymap_mode: cartesian_pi + transform_list: + mode: cartesian_wristframe_ypr + filters: + _target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter + project_name: "dense-language" + filter_lambdas: + # Exclude these aria episodes: every base_0_rgb JPEG decode fails + # (entire episode is 
corrupted), exhausts random-retry budget at train time. + # 2026-04-26-00-17-53-000000 + # 2026-04-26-00-27-26-000000 + # 2026-05-01-02-52-58-000000 + - "lambda row: row.get('task') == 'pick_place' and row.get('embodiment') == 'aria_bimanual' and (row.get('zarr_processed_path') or '') != '' and any(s in ((row.get('task_description') or '').lower()) for s in ('base', 'object')) and (row.get('episode_hash') or '') not in ('2026-04-26-00-17-53-000000', '2026-04-26-00-27-26-000000', '2026-05-01-02-52-58-000000')" diff --git a/egomimic/hydra_configs/data/obj_gen_pi_lang_wristframe_6d.yaml b/egomimic/hydra_configs/data/obj_gen_pi_lang_wristframe_6d.yaml new file mode 100644 index 000000000..8d6b2121d --- /dev/null +++ b/egomimic/hydra_configs/data/obj_gen_pi_lang_wristframe_6d.yaml @@ -0,0 +1,25 @@ +defaults: + - obj_gen_pi_lang_wristframe + - _self_ + +# 6D-rotation + AnnotationCutoff variant of the wrist-frame motion-generalization +# cotrain (obj_gen_pi_lang_wristframe): +# - actions in each wrist's own frame, rotation expressed as the continuous 6D +# representation via the `cartesian_wristframe_6d` transform mode. Pairs with +# model=pi0.5_cotrain_eva_aria_6d and evaluator=eval_pi_wristframe_6d (which +# un-6Ds then projects wrist-frame preds back to cam frame for viz/MSE). +# - dataset switched to the AnnotationCutoff resolver (clamps each action chunk +# at the end of its enclosing language-annotation span). +# Valid datasets inherit the train resolver via `${...}` interpolation in +# cotrain_pi_base, so overriding the train resolver is sufficient. 
+train_datasets: + eva_bimanual: + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3AnnotationCutoffEpisodeResolver + transform_list: + mode: cartesian_wristframe_6d + aria_bimanual: + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3AnnotationCutoffEpisodeResolver + transform_list: + mode: cartesian_wristframe_6d diff --git a/egomimic/hydra_configs/evaluator/eval_pi_camframe_6d.yaml b/egomimic/hydra_configs/evaluator/eval_pi_camframe_6d.yaml new file mode 100644 index 000000000..1a50964d4 --- /dev/null +++ b/egomimic/hydra_configs/evaluator/eval_pi_camframe_6d.yaml @@ -0,0 +1,18 @@ +defaults: + - viz@viz_func: pi_cartesian_lang + - _self_ + +_target_: egomimic.eval.eval_pi.PIEvalVideo + +# Cam-frame 6D-rotation variant: use when the data config expresses cartesian +# actions in head/camera frame with the continuous 6D rotation representation +# (e.g. a data config with `transform_list: mode: cartesian_6d`). Actions are +# already in cam (head) frame, so no frame change is needed — the revert only +# converts the rotation back from xyz+6D (9/arm) to xyz+ypr (6/arm) so the viz +# video and cam-frame MSE see the same ypr layout as the plain cartesian mode. +# Each value resolves to a list[Transform] via its ``_target_``. 
+transform_lists: + eva_bimanual: + _target_: egomimic.rldb.embodiment.eva._build_eva_cartesian_revert_6d_transform_list + aria_bimanual: + _target_: egomimic.rldb.embodiment.human._build_aria_cartesian_revert_6d_transform_list diff --git a/egomimic/hydra_configs/evaluator/eval_pi_wristframe_6d.yaml b/egomimic/hydra_configs/evaluator/eval_pi_wristframe_6d.yaml new file mode 100644 index 000000000..d99c9722c --- /dev/null +++ b/egomimic/hydra_configs/evaluator/eval_pi_wristframe_6d.yaml @@ -0,0 +1,18 @@ +defaults: + - viz@viz_func: pi_cartesian_lang + - _self_ + +_target_: egomimic.eval.eval_pi.PIEvalVideo + +# Wrist-frame 6D-rotation variant: use with a data config that applies the +# `cartesian_wristframe_6d` transform mode (e.g. obj_gen_pi_lang_wristframe_6d). +# The model predicts in each wrist's local frame using the continuous 6D +# rotation; these revert transforms first convert the rotation xyz+6D -> xyz+ypr +# and then project predictions + gt back to cam (head) frame for the cam-frame +# MSE and the viz video. Each value resolves to a list[Transform] via its +# ``_target_``. Must match the viz config mounted above. +transform_lists: + eva_bimanual: + _target_: egomimic.rldb.embodiment.eva._build_eva_cartesian_revert_6d_wristframe_transform_list + aria_bimanual: + _target_: egomimic.rldb.embodiment.human._build_aria_cartesian_revert_6d_wristframe_transform_list diff --git a/egomimic/hydra_configs/model/pi0.5_cotrain_eva_aria_6d.yaml b/egomimic/hydra_configs/model/pi0.5_cotrain_eva_aria_6d.yaml new file mode 100644 index 000000000..d0e237ca6 --- /dev/null +++ b/egomimic/hydra_configs/model/pi0.5_cotrain_eva_aria_6d.yaml @@ -0,0 +1,11 @@ +defaults: + - pi0.5_cotrain_eva_aria + - _self_ + +# Normalized continuous-6D rotation encoding. Actions arrive already in +# xyz+6D(+gripper) layout (via the `cartesian_6d` transform mode) and normalized +# by the standard pipeline; the forward pass only packs them into the 32D vector. 
+robomimic_model: + action_encoding: "cartesian_normalized_rot6d" + # Do not splice "Embodiment: " into the prompt (pi0.5_base defaults it on). + embodiment_label: false diff --git a/egomimic/rldb/embodiment/eva.py b/egomimic/rldb/embodiment/eva.py index 519510447..27bd548a1 100644 --- a/egomimic/rldb/embodiment/eva.py +++ b/egomimic/rldb/embodiment/eva.py @@ -6,6 +6,8 @@ from egomimic.rldb.zarr.action_chunk_transforms import ( ActionChunkCoordinateFrameTransform, BatchQuaternionPoseToYPR, + CartesianRot6DToYPR, + CartesianYPRToRot6D, ConcatKeys, DeleteKeys, InterpolateLinear, @@ -29,13 +31,31 @@ class Eva(Embodiment): @staticmethod def get_transform_list( mode: Literal[ - "cartesian", "cartesian_wristframe_ypr", "cartesian_wristframe_quat" + "cartesian", + "cartesian_6d", + "cartesian_wristframe_ypr", + "cartesian_wristframe_6d", + "cartesian_wristframe_quat", ], ) -> list[Transform]: if mode == "cartesian": return _build_eva_bimanual_transform_list(is_quat=True) + elif mode == "cartesian_6d": + # Camera-frame cartesian (14D = 7/arm: xyz+ypr+gripper) with the + # rotation re-expressed as the continuous 6D representation + # (20D = 10/arm: xyz+6d+gripper) for pi0.5 normalized-rot6d encoding. + return _build_eva_bimanual_transform_list(is_quat=True) + [ + CartesianYPRToRot6D(action_key="actions_cartesian") + ] elif mode == "cartesian_wristframe_ypr": return _build_eva_bimanual_eef_frame_transform_list(is_quat=False) + elif mode == "cartesian_wristframe_6d": + # Wrist-frame cartesian (14D = 7/arm: xyz+ypr+gripper) with the + # rotation re-expressed as the continuous 6D representation + # (20D) for pi0.5 normalized-rot6d encoding. 
+ return _build_eva_bimanual_eef_frame_transform_list(is_quat=False) + [ + CartesianYPRToRot6D(action_key="actions_cartesian") + ] elif mode == "cartesian_wristframe_quat": return _build_eva_bimanual_eef_frame_transform_list(is_quat=True) @@ -131,6 +151,39 @@ def dinov3_keymap(cls): } +def _build_eva_cartesian_revert_6d_transform_list( + *, + action_key: str = "actions_cartesian", +) -> list[Transform]: + """Revert camera-frame 6D-rotation EVA cartesian actions back to ypr. + + Used by the cam-frame 6D evaluator: the action chunk is already in camera + frame (produced by the ``cartesian_6d`` transform mode), so only the + rotation representation is converted from xyz+6D (+gripper, 10/arm) back to + xyz+ypr (+gripper, 7/arm) so cam-frame MSE and the viz video see the same + ypr layout as the plain ``cartesian`` mode. + """ + return [CartesianRot6DToYPR(action_key=action_key)] + + +def _build_eva_cartesian_revert_6d_wristframe_transform_list( + *, + action_key: str = "actions_cartesian", +) -> list[Transform]: + """Revert wrist-frame 6D-rotation EVA actions back to camera-frame ypr. + + Two stages for the wrist-frame 6D evaluator: (1) convert the action + rotation from xyz+6D (+gripper) back to xyz+ypr (+gripper) via + ``CartesianRot6DToYPR``; (2) project the wrist-frame ypr actions back into + camera frame using the standard eef-frame revert (which reads the proprio + ``observations.state.ee_pose``, left untouched as ypr by the 6D transform). 
+ """ + return [ + CartesianRot6DToYPR(action_key=action_key), + *_build_eva_bimanual_revert_eef_frame_transform_list(is_quat=False), + ] + + def _build_eva_bimanual_revert_eef_frame_transform_list( *, action_key: str = "actions_cartesian", diff --git a/egomimic/rldb/embodiment/human.py b/egomimic/rldb/embodiment/human.py index a8f6f1ed1..7e0e40831 100644 --- a/egomimic/rldb/embodiment/human.py +++ b/egomimic/rldb/embodiment/human.py @@ -7,6 +7,8 @@ from egomimic.rldb.zarr.action_chunk_transforms import ( ActionChunkCoordinateFrameTransform, BatchQuaternionPoseToYPR, + CartesianRot6DToYPR, + CartesianYPRToRot6D, ConcatKeys, DeleteKeys, InterpolatePose, @@ -98,8 +100,10 @@ def get_transform_list( cls, mode: Literal[ "cartesian", + "cartesian_6d", "cartesian_padded", "cartesian_wristframe_ypr", + "cartesian_wristframe_6d", "keypoints_headframe_ypr", "keypoints_headframe_quat", "keypoints_wristframe_ypr", @@ -107,14 +111,32 @@ def get_transform_list( ], ) -> list[Transform]: if mode == "cartesian": - return _build_human_cartesian_bimanual_transform_list(stride=cls.ACTION_STRIDE) - if mode == "cartesian_padded": + return _build_human_cartesian_bimanual_transform_list( + stride=cls.ACTION_STRIDE + ) + elif mode == "cartesian_6d": + # Head/camera-frame cartesian (12D = 6/arm: xyz+ypr) with rotation + # re-expressed as the continuous 6D representation (18D = 9/arm: + # xyz+6d) for pi0.5 normalized-rot6d encoding. 
+ return _build_human_cartesian_bimanual_transform_list( + stride=cls.ACTION_STRIDE + ) + [CartesianYPRToRot6D(action_key="actions_cartesian")] + elif mode == "cartesian_padded": return _build_human_cartesian_bimanual_transform_list( stride=cls.ACTION_STRIDE ) + [PadGripperZeros(action_key="actions_cartesian")] - if mode == "cartesian_wristframe_ypr": - return _build_human_cartesian_eef_frame_transform_list(stride=cls.ACTION_STRIDE) - if mode == "keypoints_headframe_ypr": + elif mode == "cartesian_wristframe_ypr": + return _build_aria_cartesian_eef_frame_transform_list( + stride=cls.ACTION_STRIDE + ) + elif mode == "cartesian_wristframe_6d": + # Wrist-frame cartesian (12D = 6/arm: xyz+ypr) with rotation + # re-expressed as the continuous 6D representation (18D) for pi0.5 + # normalized-rot6d encoding. + return _build_aria_cartesian_eef_frame_transform_list( + stride=cls.ACTION_STRIDE + ) + [CartesianYPRToRot6D(action_key="actions_cartesian")] + elif mode == "keypoints_headframe_ypr": return _build_human_keypoints_bimanual_transform_list( stride=cls.ACTION_STRIDE, is_quat=False ) @@ -974,7 +996,39 @@ def _build_human_cartesian_revert_eef_frame_transform_list( return transform_list -def _build_human_cartesian_eef_frame_transform_list( +def _build_aria_cartesian_revert_6d_transform_list( + *, + action_key: str = "actions_cartesian", +) -> list[Transform]: + """Revert head/camera-frame 6D-rotation cartesian actions back to ypr. + + Used by the cam-frame 6D evaluator: the action chunk is already in + head/camera frame (produced by the ``cartesian_6d`` transform mode), so no + coordinate-frame change is needed — only the rotation representation is + converted from xyz+6D (9/arm) back to xyz+ypr (6/arm) so cam-frame MSE and + the viz video see the same ypr layout as the plain ``cartesian`` mode. 
+ """ + return [CartesianRot6DToYPR(action_key=action_key)] + + +def _build_aria_cartesian_revert_6d_wristframe_transform_list( + *, + action_key: str = "actions_cartesian", +) -> list[Transform]: + """Revert wrist-frame 6D-rotation ARIA actions back to head/camera-frame ypr. + + (1) ``CartesianRot6DToYPR`` converts the action rotation xyz+6D -> xyz+ypr; + (2) the standard eef-frame revert projects wrist-frame ypr actions back into + head frame using the proprio ``observations.state.ee_pose`` (left as ypr by + the 6D transform). + """ + return [ + CartesianRot6DToYPR(action_key=action_key), + *_build_human_cartesian_revert_eef_frame_transform_list(is_quat=False), + ] + + +def _build_aria_cartesian_eef_frame_transform_list( *, target_world: str = "obs_head_pose", target_world_ypr: str = "obs_head_pose_ypr", diff --git a/egomimic/rldb/zarr/action_chunk_transforms.py b/egomimic/rldb/zarr/action_chunk_transforms.py index 0388d386a..3b4d19cd0 100644 --- a/egomimic/rldb/zarr/action_chunk_transforms.py +++ b/egomimic/rldb/zarr/action_chunk_transforms.py @@ -28,9 +28,11 @@ _matrix_to_xyz, _matrix_to_xyzwxyz, _matrix_to_xyzypr, + _rot6d_to_ypr, _xyz_to_matrix, _xyzwxyz_to_matrix, _xyzypr_to_matrix, + _ypr_to_rot6d, wxyz_to_xyzw, xyzw_to_wxyz, ) @@ -387,6 +389,101 @@ def transform(self, batch: dict) -> dict: return batch +class CartesianYPRToRot6D(Transform): + """Convert a bimanual cartesian action chunk from per-arm xyz+ypr(+gripper) + to per-arm xyz+rot6d(+gripper). + + ``rot6d`` is the continuous 6D rotation representation = the first two + columns of the rotation matrix, packed as [col0(3), col1(3)] (see + :func:`egomimic.utils.pose_utils._ypr_to_rot6d`). This matches the column + convention of the ``to32``/``from32`` packers in + ``egomimic.utils.action_utils``, so the resulting per-arm layout maps + directly into the pi0.5 32D action blocks. 
+ + Input layouts (last dim): + 12 -> [L xyz ypr, R xyz ypr] -> 18 [L xyz 6d, R xyz 6d] + 14 -> [L xyz ypr g, R xyz ypr g] -> 20 [L xyz 6d g, R xyz 6d g] + + Preserves the numpy/tensor type of the input (like ``PadGripperZeros``). + """ + + def __init__( + self, action_key: str = "actions_cartesian", output_key: str | None = None + ): + self.action_key = action_key + self.output_key = output_key or action_key + + def transform(self, batch: dict) -> dict: + actions = batch[self.action_key] + is_tensor = isinstance(actions, torch.Tensor) + arr = actions.cpu().numpy() if is_tensor else np.asarray(actions) + D = arr.shape[-1] + if D == 14: + l_xyz, l_ypr, l_g = arr[..., 0:3], arr[..., 3:6], arr[..., 6:7] + r_xyz, r_ypr, r_g = arr[..., 7:10], arr[..., 10:13], arr[..., 13:14] + out = np.concatenate( + [l_xyz, _ypr_to_rot6d(l_ypr), l_g, r_xyz, _ypr_to_rot6d(r_ypr), r_g], + axis=-1, + ) + elif D == 12: + l_xyz, l_ypr = arr[..., 0:3], arr[..., 3:6] + r_xyz, r_ypr = arr[..., 6:9], arr[..., 9:12] + out = np.concatenate( + [l_xyz, _ypr_to_rot6d(l_ypr), r_xyz, _ypr_to_rot6d(r_ypr)], + axis=-1, + ) + else: + raise ValueError( + f"CartesianYPRToRot6D expects last-dim 12 or 14, got {arr.shape} " + f"for '{self.action_key}'" + ) + batch[self.output_key] = torch.from_numpy(out) if is_tensor else out + return batch + + +class CartesianRot6DToYPR(Transform): + """Inverse of :class:`CartesianYPRToRot6D`: per-arm xyz+rot6d(+gripper) -> + xyz+ypr(+gripper). 
+ + Input layouts (last dim): + 18 -> [L xyz 6d, R xyz 6d] -> 12 [L xyz ypr, R xyz ypr] + 20 -> [L xyz 6d g, R xyz 6d g] -> 14 [L xyz ypr g, R xyz ypr g] + """ + + def __init__( + self, action_key: str = "actions_cartesian", output_key: str | None = None + ): + self.action_key = action_key + self.output_key = output_key or action_key + + def transform(self, batch: dict) -> dict: + actions = batch[self.action_key] + is_tensor = isinstance(actions, torch.Tensor) + arr = actions.cpu().numpy() if is_tensor else np.asarray(actions) + D = arr.shape[-1] + if D == 20: + l_xyz, l_6d, l_g = arr[..., 0:3], arr[..., 3:9], arr[..., 9:10] + r_xyz, r_6d, r_g = arr[..., 10:13], arr[..., 13:19], arr[..., 19:20] + out = np.concatenate( + [l_xyz, _rot6d_to_ypr(l_6d), l_g, r_xyz, _rot6d_to_ypr(r_6d), r_g], + axis=-1, + ) + elif D == 18: + l_xyz, l_6d = arr[..., 0:3], arr[..., 3:9] + r_xyz, r_6d = arr[..., 9:12], arr[..., 12:18] + out = np.concatenate( + [l_xyz, _rot6d_to_ypr(l_6d), r_xyz, _rot6d_to_ypr(r_6d)], + axis=-1, + ) + else: + raise ValueError( + f"CartesianRot6DToYPR expects last-dim 18 or 20, got {arr.shape} " + f"for '{self.action_key}'" + ) + batch[self.output_key] = torch.from_numpy(out) if is_tensor else out + return batch + + class CartesianWithGripperCoordinateTransform(Transform): def __init__( self, @@ -535,12 +632,8 @@ def transform(self, batch: dict) -> dict: ) pad_shape = (*arr.shape[:-1], 1) pad = np.zeros(pad_shape, dtype=arr.dtype) - padded = np.concatenate( - (arr[..., :6], pad, arr[..., 6:], pad), axis=-1 - ) - batch[self.action_key] = ( - torch.from_numpy(padded) if is_tensor else padded - ) + padded = np.concatenate((arr[..., :6], pad, arr[..., 6:], pad), axis=-1) + batch[self.action_key] = torch.from_numpy(padded) if is_tensor else padded return batch diff --git a/egomimic/rldb/zarr/zarr_dataset_multi.py b/egomimic/rldb/zarr/zarr_dataset_multi.py index 069eb3a70..7c941a553 100644 --- a/egomimic/rldb/zarr/zarr_dataset_multi.py +++ 
b/egomimic/rldb/zarr/zarr_dataset_multi.py @@ -1250,6 +1250,8 @@ def cache_stats(self, save_cache_dir: str): # ---- normalize / unnormalize ---- def _apply_norm_one(self, tensor, stats): + if self.norm_mode == "none": + return tensor if self.norm_mode == "zscore": mean = torch.as_tensor( stats["mean"], device=tensor.device, dtype=torch.float32 @@ -1277,6 +1279,8 @@ def _apply_norm_one(self, tensor, stats): raise ValueError(f"Invalid normalization mode: {self.norm_mode}") def _apply_unnorm_one(self, tensor, stats): + if self.norm_mode == "none": + return tensor if self.norm_mode == "zscore": mean = torch.as_tensor( stats["mean"], device=tensor.device, dtype=torch.float32 @@ -1723,7 +1727,9 @@ def _next(reason: str, key: str = "") -> int: data["embodiment"] = get_embodiment_id(self.embodiment) ep_name = Path(self.episode_path).name - data["episode_hash"] = ep_name[:-5] if ep_name.endswith(".zarr") else ep_name + data["episode_hash"] = ( + ep_name[:-5] if ep_name.endswith(".zarr") else ep_name + ) _ = origin # preserved for symmetry with prior API return data @@ -1749,13 +1755,26 @@ def _build_frame_to_ann_end(self) -> dict[int, int]: annotation span. Annotations use half-open ``[start_idx, end_idx)``. """ mapping: dict[int, int] = {} + n_spans = 0 for ann in self._load_annotations(): start_idx = int(ann.get("start_idx", -1)) end_idx = int(ann.get("end_idx", -1)) if start_idx < 0 or end_idx <= start_idx: continue + n_spans += 1 for idx in range(start_idx, end_idx): mapping[idx] = end_idx + # One-time per-episode visibility into annotation-cutoff usage: if + # spans/frames_covered are 0 the cutoff is a no-op (episode has no usable + # annotations); >0 confirms action chunks are being clamped at EOS. 
+ ep = Path(self.episode_path).name + logger.info( + "[AnnotationCutoff] ep=%s spans=%d frames_covered=%d/%d", + ep, + n_spans, + len(mapping), + self.total_frames, + ) return mapping def _chunk_end_idx(self, start_idx: int, horizon: int, key_type: str | None) -> int: @@ -1770,11 +1789,67 @@ def _chunk_end_idx(self, start_idx: int, horizon: int, key_type: str | None) -> return min(end_idx, ann_end) +def _episode_has_annotation_spans(ds: "ZarrDataset") -> bool: + """True if the episode has at least one usable ``[start_idx, end_idx)`` span. + + Many Scale-"completed" episodes have an empty (or span-less) zarr + ``annotations`` array because the annotation-injection step lagged; the + AnnotationCutoff is a no-op for those, so they should be dropped when the + point of the run is to clamp chunks at annotation boundaries. + """ + try: + anns = ds._load_annotations() + except Exception: + return False + return any( + isinstance(a, dict) + and 0 <= int(a.get("start_idx", -1)) < int(a.get("end_idx", -1)) + for a in anns + ) + + class S3AnnotationCutoffEpisodeResolver(S3EpisodeResolver): - """S3EpisodeResolver that loads ZarrAnnotationCutoffDataset instances.""" + """S3EpisodeResolver that loads ZarrAnnotationCutoffDataset instances. + + When ``require_annotations`` is set (default), episodes whose zarr + ``annotations`` array has no usable span are dropped — otherwise the + annotation cutoff would silently no-op on them. 
+ """ _dataset_class = ZarrAnnotationCutoffDataset + def __init__(self, *args, require_annotations: bool = True, **kwargs): + super().__init__(*args, **kwargs) + self.require_annotations = require_annotations + + def resolve(self, filters=None): + datasets = super().resolve(filters=filters) + if not self.require_annotations: + return datasets + kept = { + h: ds for h, ds in datasets.items() if _episode_has_annotation_spans(ds) + } + dropped = sorted(set(datasets) - set(kept)) + if dropped: + logger.warning( + "[AnnotationCutoff] dropped %d/%d episodes with no usable " + "annotation spans (e.g. %s)", + len(dropped), + len(datasets), + dropped[:5], + ) + logger.info( + "[AnnotationCutoff] kept %d/%d episodes with usable annotations", + len(kept), + len(datasets), + ) + if not kept: + raise ValueError( + "[AnnotationCutoff] no resolved episodes contain usable annotation " + "spans — check the filter / annotation injection for this dataset." + ) + return kept + class LocalAnnotationCutoffEpisodeResolver(LocalEpisodeResolver): """LocalEpisodeResolver that loads ZarrAnnotationCutoffDataset instances.""" diff --git a/egomimic/utils/action_utils.py b/egomimic/utils/action_utils.py index 57602f5a9..a755f4983 100644 --- a/egomimic/utils/action_utils.py +++ b/egomimic/utils/action_utils.py @@ -4,6 +4,11 @@ PI05_CARTESIAN_ACTION_ENCODING_RAW_ROT_6D = "cartesian_ypr_raw_rot6d" PI05_CARTESIAN_ACTION_ENCODING_LEGACY = "legacy_normalized_ypr_rot6d" +# Actions arrive already in xyz+6D(+gripper) layout (the ypr->6D conversion is +# done by the ``CartesianYPRToRot6D`` data transform) and already normalized by +# the standard MultiDataset pipeline. The forward pass only *packs* the +# normalized 6D action into the 32D vector (see ``to32_norm_6d`` below). +PI05_CARTESIAN_ACTION_ENCODING_NORM_ROT_6D = "cartesian_normalized_rot6d" # Bimanual robot Cartesian layout: [x, y, z, yaw, pitch, roll, gripper] x 2. 
ROBOT_BIMANUAL_CARTESIAN_ROT_DIMS = (3, 4, 5, 10, 11, 12) @@ -243,6 +248,25 @@ def from32_raw_rotation( f"{type(self).__name__} does not support raw-rotation action decoding" ) + def to32_norm_6d(self, actions: torch.Tensor) -> torch.Tensor: + """Pack an already-normalized xyz+6D(+gripper) action into the 32D vector. + + The ypr->6D conversion happens upstream in the ``CartesianYPRToRot6D`` + data transform and the result is normalized by the standard data + pipeline, so this is a pure rearrange (no rotation math, no + normalization). + """ + raise NotImplementedError( + f"{type(self).__name__} does not support normalized-rot6d encoding" + ) + + def from32_norm_6d(self, actions32: torch.Tensor) -> torch.Tensor: + """Inverse of :meth:`to32_norm_6d`: extract the normalized xyz+6D(+gripper) + action from the 32D vector (pure rearrange).""" + raise NotImplementedError( + f"{type(self).__name__} does not support normalized-rot6d decoding" + ) + # ============================================================ # ROBOT CONVERTERS @@ -380,7 +404,9 @@ def to20_raw_rotation( ) if normalized_actions is None: if stats is None: - raise ValueError("stats are required when normalized_actions is omitted") + raise ValueError( + "stats are required when normalized_actions is omitted" + ) model_actions = _normalize_robot_bimanual_non_rot( raw_actions, stats, norm_mode ) @@ -449,6 +475,25 @@ def from32_raw_rotation( unnormalize_non_rotation=unnormalize_non_rotation, ) + def to32_norm_6d(self, actions: torch.Tensor) -> torch.Tensor: + # actions: (B,S,20) = [L xyz(3) 6d(6) g(1), R xyz(3) 6d(6) g(1)] — already + # the canonical 32D block layout (left 0..9, right 10..19), just pad. 
+ actions = _ensure_bsd(actions) + if actions.shape[-1] != 20: + raise ValueError( + f"RobotBimanual.to32_norm_6d expected 20-dim, got {actions.shape[-1]}" + ) + return _pad32(actions) + + def from32_norm_6d(self, actions32: torch.Tensor) -> torch.Tensor: + actions32 = _ensure_bsd(actions32) + if actions32.shape[-1] < 20: + raise ValueError( + f"RobotBimanual.from32_norm_6d expected >=20 dims, got " + f"{actions32.shape[-1]}" + ) + return actions32[..., 0:20] + # ============================================================ # HUMAN CONVERTERS @@ -545,3 +590,30 @@ def from32(self, actions32: torch.Tensor) -> torch.Tensor: R_R = _reconstruct_R_from_cols(R_c1, R_c2) R_ypr = _matrix_to_ypr(R_R) return torch.cat([L_xyz, L_ypr, R_xyz, R_ypr], dim=-1) # (B,S,12) + + def to32_norm_6d(self, actions: torch.Tensor) -> torch.Tensor: + # actions: (B,S,18) = [L xyz(3) 6d(6), R xyz(3) 6d(6)]. Human has no + # gripper, so insert a zero gripper slot at the end of each arm block to + # match the 32D block layout [xyz(3) c1(3) c2(3) g(1)] x 2. 
+ actions = _ensure_bsd(actions) + if actions.shape[-1] != 18: + raise ValueError( + f"HumanBimanual.to32_norm_6d expected 18-dim, got {actions.shape[-1]}" + ) + L = actions[..., 0:9] + R = actions[..., 9:18] + g0 = torch.zeros_like(actions[..., :1]) + Lblock = torch.cat([L, g0], dim=-1) # (B,S,10) + Rblock = torch.cat([R, g0], dim=-1) # (B,S,10) + return _pad32(torch.cat([Lblock, Rblock], dim=-1)) + + def from32_norm_6d(self, actions32: torch.Tensor) -> torch.Tensor: + actions32 = _ensure_bsd(actions32) + if actions32.shape[-1] < 20: + raise ValueError( + f"HumanBimanual.from32_norm_6d expected >=20 dims, got " + f"{actions32.shape[-1]}" + ) + L = actions32[..., 0:9] # drop left gripper slot at idx 9 + R = actions32[..., 10:19] # drop right gripper slot at idx 19 + return torch.cat([L, R], dim=-1) # (B,S,18) diff --git a/egomimic/utils/pose_utils.py b/egomimic/utils/pose_utils.py index 0bbe0a6f7..e5870b83d 100644 --- a/egomimic/utils/pose_utils.py +++ b/egomimic/utils/pose_utils.py @@ -129,6 +129,61 @@ def _xyzypr_to_matrix(xyzypr: np.ndarray) -> np.ndarray: return mats +def _ypr_to_rot6d(ypr: np.ndarray) -> np.ndarray: + """Convert euler ypr to the continuous 6D rotation representation. + + args: + ypr: (..., 3) array of [yaw, pitch, roll] (radians, ZYX convention) + returns: + (..., 6) array = first two columns of the rotation matrix, + concatenated as [col0(3), col1(3)]. + + Matches the column convention used by the torch packers in + ``egomimic.utils.action_utils`` (``_ypr_to_matrix`` = Rz@Ry@Rx, and + ``to32`` taking ``R[..., 0]`` / ``R[..., 1]``). 
+ """ + ypr = np.asarray(ypr) + if ypr.shape[-1] != 3: + raise ValueError(f"Expected (..., 3) ypr, got shape {ypr.shape}") + dtype = ypr.dtype if np.issubdtype(ypr.dtype, np.floating) else np.float64 + shape = ypr.shape[:-1] + flat = ypr.reshape(-1, 3).astype(np.float64) + mats = R.from_euler("ZYX", flat, degrees=False).as_matrix() # (N, 3, 3) + six = np.concatenate([mats[:, :, 0], mats[:, :, 1]], axis=-1) # cols 0,1 + return six.reshape(*shape, 6).astype(dtype, copy=False) + + +def _rot6d_to_ypr(six: np.ndarray) -> np.ndarray: + """Inverse of :func:`_ypr_to_rot6d`. + + args: + six: (..., 6) array = [col0(3), col1(3)] of a rotation matrix. + returns: + (..., 3) array of [yaw, pitch, roll] (radians, ZYX convention). + + Reconstructs a proper rotation via Gram-Schmidt (mirroring + ``_reconstruct_R_from_cols`` in ``action_utils``) before extracting euler + angles, so ``_rot6d_to_ypr(_ypr_to_rot6d(ypr)) == ypr``. + """ + six = np.asarray(six) + if six.shape[-1] != 6: + raise ValueError(f"Expected (..., 6) rot6d, got shape {six.shape}") + dtype = six.dtype if np.issubdtype(six.dtype, np.floating) else np.float64 + shape = six.shape[:-1] + flat = six.reshape(-1, 6).astype(np.float64) + c1 = flat[:, 0:3] + c2 = flat[:, 3:6] + eps = 1e-8 + c1n = c1 / np.clip(np.linalg.norm(c1, axis=-1, keepdims=True), eps, None) + proj = np.sum(c2 * c1n, axis=-1, keepdims=True) * c1n + c2o = c2 - proj + c2n = c2o / np.clip(np.linalg.norm(c2o, axis=-1, keepdims=True), eps, None) + c3n = np.cross(c1n, c2n) + mats = np.stack([c1n, c2n, c3n], axis=-1) # columns + ypr = R.from_matrix(mats).as_euler("ZYX", degrees=False) + return ypr.reshape(*shape, 3).astype(dtype, copy=False) + + def _matrix_to_xyzwxyz(mats: np.ndarray) -> np.ndarray: """ args: diff --git a/egomimic/utils/test_pi05_norm_rot6d.py b/egomimic/utils/test_pi05_norm_rot6d.py new file mode 100644 index 000000000..9ba7a3ba7 --- /dev/null +++ b/egomimic/utils/test_pi05_norm_rot6d.py @@ -0,0 +1,115 @@ +"""Round-trip tests for the 
normalized continuous-6D rotation encoding. + +Covers the data transform (ypr <-> 6D) and the converter 32D packers +(``to32_norm_6d`` / ``from32_norm_6d``) for both the robot bimanual (14D ypr / +20D 6D, with gripper) and human bimanual (12D ypr / 18D 6D, no gripper) layouts. +""" + +import numpy as np +import pytest +import torch + +from egomimic.rldb.zarr.action_chunk_transforms import ( + CartesianRot6DToYPR, + CartesianYPRToRot6D, +) +from egomimic.utils.action_utils import ( + BaseActionConverter, + HumanBimanualCartesianEuler, + RobotBimanualCartesianEuler, +) +from egomimic.utils.pose_utils import _rot6d_to_ypr, _ypr_to_rot6d + + +def _eva_ypr_chunk(T: int = 5) -> np.ndarray: + # [L xyz ypr g, R xyz ypr g]; moderate angles to avoid gimbal/wrap ambiguity. + rng = np.random.default_rng(0) + xyz = rng.uniform(-1.0, 1.0, size=(T, 3)) + ypr = rng.uniform(-1.0, 1.0, size=(T, 3)) # radians, well inside (-pi, pi) + g = rng.uniform(0.0, 1.0, size=(T, 1)) + arm = np.concatenate([xyz, ypr, g], axis=-1) + return np.concatenate([arm, arm], axis=-1) # 14D + + +def _aria_ypr_chunk(T: int = 5) -> np.ndarray: + rng = np.random.default_rng(1) + xyz = rng.uniform(-1.0, 1.0, size=(T, 3)) + ypr = rng.uniform(-1.0, 1.0, size=(T, 3)) + arm = np.concatenate([xyz, ypr], axis=-1) + return np.concatenate([arm, arm], axis=-1) # 12D + + +def test_ypr_rot6d_helpers_round_trip(): + ypr = np.random.default_rng(2).uniform(-1.0, 1.0, size=(7, 3)) + six = _ypr_to_rot6d(ypr) + assert six.shape == (7, 6) + np.testing.assert_allclose(_rot6d_to_ypr(six), ypr, atol=1e-6) + + +@pytest.mark.parametrize( + "chunk_fn,ypr_dim,six_dim", + [(_eva_ypr_chunk, 14, 20), (_aria_ypr_chunk, 12, 18)], +) +def test_cartesian_ypr_rot6d_transform_round_trips(chunk_fn, ypr_dim, six_dim): + ypr = chunk_fn() + assert ypr.shape[-1] == ypr_dim + + fwd = CartesianYPRToRot6D(action_key="actions_cartesian") + rev = CartesianRot6DToYPR(action_key="actions_cartesian") + + batch = {"actions_cartesian": ypr.copy()} + batch = 
fwd.transform(batch) + assert batch["actions_cartesian"].shape[-1] == six_dim + + batch = rev.transform(batch) + np.testing.assert_allclose(batch["actions_cartesian"], ypr, atol=1e-6) + + +def test_transform_preserves_tensor_type(): + ypr = torch.from_numpy(_eva_ypr_chunk()) + out = CartesianYPRToRot6D().transform({"actions_cartesian": ypr})[ + "actions_cartesian" + ] + assert isinstance(out, torch.Tensor) + assert out.shape[-1] == 20 + + +def test_robot_bimanual_norm_6d_pack_round_trips(): + converter = RobotBimanualCartesianEuler() + six = torch.from_numpy(_eva_ypr_chunk()).float() + six6d = torch.from_numpy( + CartesianYPRToRot6D().transform({"actions_cartesian": six.numpy()})[ + "actions_cartesian" + ] + ).float()[None] # (1, T, 20) + + packed = converter.to32_norm_6d(six6d) + assert packed.shape[-1] == 32 + decoded = converter.from32_norm_6d(packed) + torch.testing.assert_close(decoded, six6d, atol=1e-6, rtol=1e-6) + + +def test_human_bimanual_norm_6d_pack_round_trips_and_zeros_gripper(): + converter = HumanBimanualCartesianEuler() + six6d = torch.from_numpy( + CartesianYPRToRot6D().transform({"actions_cartesian": _aria_ypr_chunk()})[ + "actions_cartesian" + ] + ).float()[None] # (1, T, 18) + + packed = converter.to32_norm_6d(six6d) + assert packed.shape[-1] == 32 + # gripper slots (9, 19) must be zero for human (no gripper signal). + torch.testing.assert_close(packed[..., 9], torch.zeros_like(packed[..., 9])) + torch.testing.assert_close(packed[..., 19], torch.zeros_like(packed[..., 19])) + + decoded = converter.from32_norm_6d(packed) + torch.testing.assert_close(decoded, six6d, atol=1e-6, rtol=1e-6) + + +def test_base_converter_rejects_norm_6d_encoding(): + converter = BaseActionConverter() + with pytest.raises(NotImplementedError, match="normalized-rot6d"): + converter.to32_norm_6d(torch.zeros(1, 1, 20)) + with pytest.raises(NotImplementedError, match="normalized-rot6d"): + converter.from32_norm_6d(torch.zeros(1, 1, 32))