diff --git a/egomimic/eval/eval_pi.py b/egomimic/eval/eval_pi.py index 5e6b90faa..a508f7943 100644 --- a/egomimic/eval/eval_pi.py +++ b/egomimic/eval/eval_pi.py @@ -5,6 +5,7 @@ from egomimic.eval.eval_video import EvalVideo from egomimic.rldb.embodiment.embodiment import Embodiment, get_embodiment +from egomimic.utils.pose_utils import bimanual_cartesian_layout class PIEvalVideo(EvalVideo): @@ -27,24 +28,31 @@ def compute_metrics_and_viz(self, batch): total_loss = None n_loss_embodiments = 0 - # Bimanual 12-D layout from `HumanBimanualCartesianEuler.from32`: - # [L_xyz(3), L_ypr(3), R_xyz(3), R_ypr(3)]. Split lets us tell a - # translation problem apart from a rotation-reconstruction artifact - # (6D-cols → matrix → YPR can blow up near gimbal lock / ±π wrap). + # Split the bimanual cartesian vector into a translation MSE and a + # rotation MSE so a translation problem reads apart from a rotation + # one. Handles all four native widths via ``bimanual_cartesian_layout``: + # - native model output: 18D (human) / 20D (robot) continuous 6D cols + # — this metric is clean (6D has no ±π wrap). + # - reverted cam-frame output: 12D (human) / 14D (robot) ypr — the + # rotation MSE here keeps the old gimbal/±π caveat, but it's a + # secondary viz metric. + # The metric keys keep the historical ``_ypr_`` name for dashboard + # continuity; it denotes "rotation channels" regardless of encoding. 
def _split_mse(pred_t, gt_t): - if pred_t.shape[-1] != 12: + layout = bimanual_cartesian_layout(pred_t.shape[-1]) + if layout is None: return None, None - xyz_idx = [0, 1, 2, 6, 7, 8] - ypr_idx = [3, 4, 5, 9, 10, 11] + xyz_idx = list(layout["xyz"]) + rot_idx = list(layout["rot"]) xyz = MeanSquaredError()( pred_t[..., xyz_idx].cpu().contiguous(), gt_t[..., xyz_idx].cpu().contiguous(), ) - ypr = MeanSquaredError()( - pred_t[..., ypr_idx].cpu().contiguous(), - gt_t[..., ypr_idx].cpu().contiguous(), + rot = MeanSquaredError()( + pred_t[..., rot_idx].cpu().contiguous(), + gt_t[..., rot_idx].cpu().contiguous(), ) - return xyz, ypr + return xyz, rot for embodiment_id, _batch in batch.items(): _batch = algo.norm_stats.unnormalize(_batch, embodiment_id) diff --git a/egomimic/hydra_configs/data/cotrain_pi_base.yaml b/egomimic/hydra_configs/data/cotrain_pi_base.yaml index ab4c2c3fc..f5375eff6 100644 --- a/egomimic/hydra_configs/data/cotrain_pi_base.yaml +++ b/egomimic/hydra_configs/data/cotrain_pi_base.yaml @@ -12,7 +12,7 @@ train_datasets: annotation_key: annotations transform_list: _target_: egomimic.rldb.embodiment.eva.Eva.get_transform_list - mode: cartesian + mode: cartesian_6d filters: null mode: train valid_ratio: 0.05 @@ -27,7 +27,7 @@ train_datasets: annotation_key: annotations transform_list: _target_: egomimic.rldb.embodiment.human.Aria.get_transform_list - mode: cartesian + mode: cartesian_6d filters: null mode: train valid_ratio: 0.05 diff --git a/egomimic/hydra_configs/data/cotrain_pi_lang_wrist.yaml b/egomimic/hydra_configs/data/cotrain_pi_lang_wrist.yaml index 1f10d1c15..14783e782 100644 --- a/egomimic/hydra_configs/data/cotrain_pi_lang_wrist.yaml +++ b/egomimic/hydra_configs/data/cotrain_pi_lang_wrist.yaml @@ -8,10 +8,10 @@ train_datasets: key_map: keymap_mode: cartesian_wristframe_ypr transform_list: - mode: cartesian_wristframe_ypr + mode: cartesian_wristframe_6d aria_bimanual: resolver: key_map: keymap_mode: cartesian transform_list: - mode: 
cartesian_wristframe_ypr + mode: cartesian_wristframe_6d diff --git a/egomimic/hydra_configs/data/mecka_pi.yaml b/egomimic/hydra_configs/data/mecka_pi.yaml index a01c9adea..2bd23921a 100644 --- a/egomimic/hydra_configs/data/mecka_pi.yaml +++ b/egomimic/hydra_configs/data/mecka_pi.yaml @@ -12,7 +12,7 @@ train_datasets: annotation_key: annotations transform_list: _target_: egomimic.rldb.embodiment.human.Mecka.get_transform_list - mode: cartesian + mode: cartesian_6d filters: _target_: egomimic.rldb.filters.DatasetFilter filter_lambdas: diff --git a/egomimic/hydra_configs/data/mecka_pi_10_hrs.yaml b/egomimic/hydra_configs/data/mecka_pi_10_hrs.yaml index 78e66d320..9b1a2a78a 100644 --- a/egomimic/hydra_configs/data/mecka_pi_10_hrs.yaml +++ b/egomimic/hydra_configs/data/mecka_pi_10_hrs.yaml @@ -1,5 +1,8 @@ _target_: egomimic.pl_utils.pl_data_utils.MultiDataModuleWrapper +# Visualization task allowlist consumed by evaluator.tasks by default. +viz_tasks: [cutting_fabric, hanging_clothes, brushing_shoes] + train_datasets: mecka_bimanual: _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver @@ -12,11 +15,11 @@ train_datasets: annotation_key: annotations transform_list: _target_: egomimic.rldb.embodiment.human.Mecka.get_transform_list - mode: cartesian + mode: cartesian_6d filters: _target_: egomimic.rldb.filters.DatasetFilter filter_lambdas: - - "lambda row: row['lab'] == 'mecka' and row['task'] in {'packaging_coffee', 'wrapping_gifts', 'cleaning_tools', 'folding_napkins', 'repairing_electronics', 'polishing_jewelry', 'disassembling_phone', 'assembling_flowers', 'making_dumplings', 'peeling_vegetables'}" + - "lambda row: row['lab'] == 'mecka' and row['task'] in {'cutting_fabric', 'hanging_clothes', 'brushing_shoes'}" mode: train valid_datasets: diff --git a/egomimic/hydra_configs/data/mecka_pi_50_hrs.yaml b/egomimic/hydra_configs/data/mecka_pi_50_hrs.yaml index 307894211..9e318cf46 100644 --- a/egomimic/hydra_configs/data/mecka_pi_50_hrs.yaml +++ 
b/egomimic/hydra_configs/data/mecka_pi_50_hrs.yaml @@ -12,7 +12,7 @@ train_datasets: annotation_key: annotations transform_list: _target_: egomimic.rldb.embodiment.human.Mecka.get_transform_list - mode: cartesian + mode: cartesian_6d filters: _target_: egomimic.rldb.filters.DatasetFilter filter_lambdas: diff --git a/egomimic/hydra_configs/data/mecka_pi_brushing_shoes.yaml b/egomimic/hydra_configs/data/mecka_pi_brushing_shoes.yaml new file mode 100644 index 000000000..571cfffa2 --- /dev/null +++ b/egomimic/hydra_configs/data/mecka_pi_brushing_shoes.yaml @@ -0,0 +1,51 @@ +_target_: egomimic.pl_utils.pl_data_utils.MultiDataModuleWrapper + +# Visualization task allowlist consumed by evaluator.tasks by default. +viz_tasks: [brushing_shoes] + +train_datasets: + mecka_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: ${paths.dataset_dir} + key_map: + _target_: egomimic.rldb.embodiment.human.Mecka.get_keymap + mode: cartesian_pi + annotation_key: annotations + transform_list: + _target_: egomimic.rldb.embodiment.human.Mecka.get_transform_list + mode: cartesian_6d + filters: + _target_: egomimic.rldb.filters.DatasetFilter + filter_lambdas: + - "lambda row: row['lab'] == 'mecka' and row['task'] == 'brushing_shoes'" + mode: train + +valid_datasets: + mecka_bimanual: + _target_: ${data.train_datasets.mecka_bimanual._target_} + resolver: ${data.train_datasets.mecka_bimanual.resolver} + filters: ${data.train_datasets.mecka_bimanual.filters} + mode: valid + +train_dataloader_params: + mecka_bimanual: + batch_size: 64 + num_workers: 10 +valid_dataloader_params: + mecka_bimanual: + batch_size: 64 + num_workers: 10 + +# `+evaluator@train_viz_evaluator=train_viz_pi`. 
+train_viz_datasets: + mecka_bimanual: + _target_: ${data.train_datasets.mecka_bimanual._target_} + resolver: ${data.train_datasets.mecka_bimanual.resolver} + filters: ${data.train_datasets.mecka_bimanual.filters} + mode: train +train_viz_dataloader_params: + mecka_bimanual: + batch_size: 64 + num_workers: 10 diff --git a/egomimic/hydra_configs/data/mecka_pi_eval.yaml b/egomimic/hydra_configs/data/mecka_pi_eval.yaml index 9ca0dbc44..76d680999 100644 --- a/egomimic/hydra_configs/data/mecka_pi_eval.yaml +++ b/egomimic/hydra_configs/data/mecka_pi_eval.yaml @@ -16,7 +16,7 @@ train_datasets: annotation_key: annotations transform_list: _target_: egomimic.rldb.embodiment.human.Mecka.get_transform_list - mode: cartesian + mode: cartesian_6d filters: _target_: egomimic.rldb.filters.DatasetFilter filter_lambdas: diff --git a/egomimic/hydra_configs/data/mecka_pi_fold_clothes_freeform.yaml b/egomimic/hydra_configs/data/mecka_pi_fold_clothes_freeform.yaml index e34d0a8e5..e2ef3e95e 100644 --- a/egomimic/hydra_configs/data/mecka_pi_fold_clothes_freeform.yaml +++ b/egomimic/hydra_configs/data/mecka_pi_fold_clothes_freeform.yaml @@ -12,7 +12,7 @@ train_datasets: annotation_key: annotations transform_list: _target_: egomimic.rldb.embodiment.human.Mecka.get_transform_list - mode: cartesian + mode: cartesian_6d filters: _target_: egomimic.rldb.filters.DatasetFilter filter_lambdas: diff --git a/egomimic/hydra_configs/evaluator/eval_hpt_wrist.yaml b/egomimic/hydra_configs/evaluator/eval_hpt_wrist.yaml index 507f47a5e..897f326b1 100644 --- a/egomimic/hydra_configs/evaluator/eval_hpt_wrist.yaml +++ b/egomimic/hydra_configs/evaluator/eval_hpt_wrist.yaml @@ -12,5 +12,5 @@ transform_lists: _target_: egomimic.rldb.embodiment.eva._build_eva_bimanual_revert_eef_frame_transform_list is_quat: false aria_bimanual: - _target_: egomimic.rldb.embodiment.human._build_aria_cartesian_revert_eef_frame_transform_list + _target_: 
egomimic.rldb.embodiment.human._build_human_cartesian_revert_eef_frame_transform_list is_quat: false diff --git a/egomimic/hydra_configs/evaluator/eval_pi.yaml b/egomimic/hydra_configs/evaluator/eval_pi.yaml index 13fc4945e..3b4485c24 100644 --- a/egomimic/hydra_configs/evaluator/eval_pi.yaml +++ b/egomimic/hydra_configs/evaluator/eval_pi.yaml @@ -20,7 +20,7 @@ max_frames_per_task: 1000 transform_lists: eva_bimanual: _target_: egomimic.rldb.embodiment.eva._build_eva_bimanual_revert_eef_frame_transform_list - is_quat: false + rot_repr: "6d" aria_bimanual: - _target_: egomimic.rldb.embodiment.human._build_aria_cartesian_revert_eef_frame_transform_list - is_quat: false + _target_: egomimic.rldb.embodiment.human._build_human_cartesian_revert_eef_frame_transform_list + rot_repr: "6d" diff --git a/egomimic/hydra_configs/hydra/launcher/submitit_cpu_pace.yaml b/egomimic/hydra_configs/hydra/launcher/submitit_cpu_pace.yaml index 178b91c90..db297164d 100644 --- a/egomimic/hydra_configs/hydra/launcher/submitit_cpu_pace.yaml +++ b/egomimic/hydra_configs/hydra/launcher/submitit_cpu_pace.yaml @@ -7,7 +7,7 @@ _target_: hydra_plugins.hydra_submitit_launcher.submitit_launcher.SlurmLauncher name: ${hydra.job.name} partition: "cpu-small" # PACE CPU partition — confirm before first submit account: "gts-dxu345-rl2" -cpus_per_task: 24 +cpus_per_task: 12 nodes: 1 tasks_per_node: 1 qos: "inferno" diff --git a/egomimic/hydra_configs/model/pi0.5_bc_aria.yaml b/egomimic/hydra_configs/model/pi0.5_bc_aria.yaml index 4d53e813d..fc330ffa6 100644 --- a/egomimic/hydra_configs/model/pi0.5_bc_aria.yaml +++ b/egomimic/hydra_configs/model/pi0.5_bc_aria.yaml @@ -14,7 +14,7 @@ robomimic_model: action_converters: rules: ARIA_BIMANUAL: - _target_: egomimic.utils.action_utils.HumanBimanualCartesianEuler + _target_: egomimic.utils.action_utils.HumanBimanualCartesian6D # optional fallback if no match is found fallback: _target_: egomimic.utils.action_utils.BaseActionConverter diff --git 
a/egomimic/hydra_configs/model/pi0.5_bc_eva.yaml b/egomimic/hydra_configs/model/pi0.5_bc_eva.yaml index b0b6a6a10..35f66c48a 100644 --- a/egomimic/hydra_configs/model/pi0.5_bc_eva.yaml +++ b/egomimic/hydra_configs/model/pi0.5_bc_eva.yaml @@ -14,7 +14,7 @@ robomimic_model: action_converters: rules: EVA_BIMANUAL: - _target_: egomimic.utils.action_utils.RobotBimanualCartesianEuler + _target_: egomimic.utils.action_utils.RobotBimanualCartesian6D # optional fallback if no match is found fallback: _target_: egomimic.utils.action_utils.BaseActionConverter diff --git a/egomimic/hydra_configs/model/pi0.5_bc_mecka.yaml b/egomimic/hydra_configs/model/pi0.5_bc_mecka.yaml index 70aa29301..477584566 100644 --- a/egomimic/hydra_configs/model/pi0.5_bc_mecka.yaml +++ b/egomimic/hydra_configs/model/pi0.5_bc_mecka.yaml @@ -14,7 +14,7 @@ robomimic_model: action_converters: rules: MECKA_BIMANUAL: - _target_: egomimic.utils.action_utils.HumanBimanualCartesianEuler + _target_: egomimic.utils.action_utils.HumanBimanualCartesian6D # optional fallback if no match is found fallback: _target_: egomimic.utils.action_utils.BaseActionConverter diff --git a/egomimic/hydra_configs/model/pi0.5_bc_scale.yaml b/egomimic/hydra_configs/model/pi0.5_bc_scale.yaml index 293904155..ad46499cd 100644 --- a/egomimic/hydra_configs/model/pi0.5_bc_scale.yaml +++ b/egomimic/hydra_configs/model/pi0.5_bc_scale.yaml @@ -14,7 +14,7 @@ robomimic_model: action_converters: rules: SCALE_BIMANUAL: - _target_: egomimic.utils.action_utils.HumanBimanualCartesianEuler + _target_: egomimic.utils.action_utils.HumanBimanualCartesian6D # optional fallback if no match is found fallback: _target_: egomimic.utils.action_utils.BaseActionConverter diff --git a/egomimic/hydra_configs/model/pi0.5_cotrain_eva_aria.yaml b/egomimic/hydra_configs/model/pi0.5_cotrain_eva_aria.yaml index 3b10646a2..d8aa33fe0 100644 --- a/egomimic/hydra_configs/model/pi0.5_cotrain_eva_aria.yaml +++ b/egomimic/hydra_configs/model/pi0.5_cotrain_eva_aria.yaml @@ 
-20,9 +20,9 @@ robomimic_model: action_converters: rules: EVA_BIMANUAL: - _target_: egomimic.utils.action_utils.RobotBimanualCartesianEuler + _target_: egomimic.utils.action_utils.RobotBimanualCartesian6D ARIA_BIMANUAL: - _target_: egomimic.utils.action_utils.HumanBimanualCartesianEuler + _target_: egomimic.utils.action_utils.HumanBimanualCartesian6D # optional fallback if no match is found fallback: _target_: egomimic.utils.action_utils.BaseActionConverter \ No newline at end of file diff --git a/egomimic/hydra_configs/model/pi0.5_cotrain_mecka_scale.yaml b/egomimic/hydra_configs/model/pi0.5_cotrain_mecka_scale.yaml index d7ccb63db..a002fb73e 100644 --- a/egomimic/hydra_configs/model/pi0.5_cotrain_mecka_scale.yaml +++ b/egomimic/hydra_configs/model/pi0.5_cotrain_mecka_scale.yaml @@ -20,9 +20,9 @@ robomimic_model: action_converters: rules: MECKA_BIMANUAL: - _target_: egomimic.utils.action_utils.HumanBimanualCartesianEuler + _target_: egomimic.utils.action_utils.HumanBimanualCartesian6D SCALE_BIMANUAL: - _target_: egomimic.utils.action_utils.HumanBimanualCartesianEuler + _target_: egomimic.utils.action_utils.HumanBimanualCartesian6D # optional fallback if no match is found fallback: _target_: egomimic.utils.action_utils.BaseActionConverter diff --git a/egomimic/hydra_configs/trainer/ddp_pi.yaml b/egomimic/hydra_configs/trainer/ddp_pi.yaml index 4fdba78d1..372cff7dc 100644 --- a/egomimic/hydra_configs/trainer/ddp_pi.yaml +++ b/egomimic/hydra_configs/trainer/ddp_pi.yaml @@ -7,5 +7,5 @@ accelerator: gpu devices: ${eval:'${launch_params.gpus_per_node} * ${launch_params.nodes}'} num_nodes: ${launch_params.nodes} sync_batchnorm: True -check_val_every_n_epoch: 150 +check_val_every_n_epoch: 50 num_sanity_val_steps: 0 diff --git a/egomimic/hydra_configs/trainer/default.yaml b/egomimic/hydra_configs/trainer/default.yaml index 62f147c27..0c5a48331 100644 --- a/egomimic/hydra_configs/trainer/default.yaml +++ b/egomimic/hydra_configs/trainer/default.yaml @@ -13,8 +13,8 @@ 
precision: bf16 limit_train_batches: 100 limit_val_batches: 80 # perform a validation loop every N training epochs -check_val_every_n_epoch: 100 +check_val_every_n_epoch: 50 # set True to to ensure deterministic results # makes training slower but gives more reproducibility than just setting seeds -deterministic: False \ No newline at end of file +deterministic: False diff --git a/egomimic/rldb/embodiment/eva.py b/egomimic/rldb/embodiment/eva.py index 519510447..7b68ab249 100644 --- a/egomimic/rldb/embodiment/eva.py +++ b/egomimic/rldb/embodiment/eva.py @@ -5,6 +5,7 @@ from egomimic.rldb.embodiment.embodiment import Embodiment from egomimic.rldb.zarr.action_chunk_transforms import ( ActionChunkCoordinateFrameTransform, + BatchQuaternionPoseToXYZ6D, BatchQuaternionPoseToYPR, ConcatKeys, DeleteKeys, @@ -12,9 +13,12 @@ InterpolatePose, NumpyToTensor, PoseCoordinateFrameTransform, + QuaternionPoseToXYZ6D, QuaternionPoseToYPR, SplitKeys, Transform, + XYZ6D_to_XYZYPR, + XYZWXYZ_to_XYZ6D, XYZWXYZ_to_XYZYPR, ) from egomimic.utils.egomimicUtils import ( @@ -29,13 +33,21 @@ class Eva(Embodiment): @staticmethod def get_transform_list( mode: Literal[ - "cartesian", "cartesian_wristframe_ypr", "cartesian_wristframe_quat" + "cartesian", + "cartesian_6d", + "cartesian_wristframe_ypr", + "cartesian_wristframe_6d", + "cartesian_wristframe_quat", ], ) -> list[Transform]: if mode == "cartesian": return _build_eva_bimanual_transform_list(is_quat=True) + elif mode == "cartesian_6d": + return _build_eva_bimanual_transform_list(is_quat=True, rot_repr="6d") elif mode == "cartesian_wristframe_ypr": return _build_eva_bimanual_eef_frame_transform_list(is_quat=False) + elif mode == "cartesian_wristframe_6d": + return _build_eva_bimanual_eef_frame_transform_list(rot_repr="6d") elif mode == "cartesian_wristframe_quat": return _build_eva_bimanual_eef_frame_transform_list(is_quat=True) @@ -146,12 +158,21 @@ def _build_eva_bimanual_revert_eef_frame_transform_list( left_cmd_camframe: str = 
"left.cmd_ee_pose_camframe", right_cmd_camframe: str = "right.cmd_ee_pose_camframe", is_quat: bool = True, + rot_repr: str = "ypr", ) -> list[Transform]: - """Revert wrist-frame EVA actions back to camera frame for visualization.""" - if is_quat: - pose_shape = 7 + """Revert wrist-frame EVA actions back to camera frame for visualization. + + ``rot_repr="6d"`` reverts a model 6D prediction: the per-arm pose width is 9, + the coordinate transform runs in ``xyz6d`` mode (Gram-Schmidt happens there), + and the reverted cam-frame poses are finally converted to ypr so downstream + viz / deploy keep their ypr contract. + """ + if rot_repr == "6d": + pose_shape = 9 + revert_mode = "xyz6d" else: - pose_shape = 6 + pose_shape = 7 if is_quat else 6 + revert_mode = "xyzypr" transform_list = [ # Extract obs camframe poses from the concatenated obs key SplitKeys( @@ -178,16 +199,23 @@ def _build_eva_bimanual_revert_eef_frame_transform_list( target_world=left_obs_camframe, chunk_world=left_cmd_wristframe, transformed_key_name=left_cmd_camframe, - mode="xyzypr", + mode=revert_mode, inverse=False, ), ActionChunkCoordinateFrameTransform( target_world=right_obs_camframe, chunk_world=right_cmd_wristframe, transformed_key_name=right_cmd_camframe, - mode="xyzypr", + mode=revert_mode, inverse=False, ), + ] + if rot_repr == "6d": + # Collapse 6D columns back to ypr for viz / deploy after the revert. 
+ transform_list.append( + XYZ6D_to_XYZYPR(keys=[left_cmd_camframe, right_cmd_camframe]) + ) + transform_list.append( ConcatKeys( key_list=[ left_cmd_camframe, @@ -198,7 +226,7 @@ def _build_eva_bimanual_revert_eef_frame_transform_list( new_key_name=action_key, delete_old_keys=True, ), - ] + ) return transform_list @@ -226,9 +254,14 @@ def _build_eva_bimanual_eef_frame_transform_list( stride: int = 1, extrinsics_key: str = "x5Dec13_2", is_quat: bool = True, + rot_repr: str = "ypr", ) -> list[Transform]: """EVA bimanual transform pipeline with actions expressed relative to the - current EEF pose (wrist frame), analogous to keypoints relative to wrist pose.""" + current EEF pose (wrist frame), analogous to keypoints relative to wrist pose. + + The frame math always runs in quaternion; ``rot_repr`` controls the final + rotation encoding: ``"ypr"`` (when ``is_quat=False``), raw quaternion (when + ``is_quat=True``), or continuous ``"6d"`` columns.""" extrinsics = EXTRINSICS[extrinsics_key] left_extrinsics_pose = _matrix_to_xyzwxyz(extrinsics["left"][None, :])[0] right_extrinsics_pose = _matrix_to_xyzwxyz(extrinsics["right"][None, :])[0] @@ -304,7 +337,28 @@ def _build_eva_bimanual_eef_frame_transform_list( ), ] - if not is_quat: + if rot_repr == "6d": + transform_list.extend( + [ + BatchQuaternionPoseToXYZ6D( + pose_key=left_cmd_wristframe, + output_key=left_cmd_wristframe, + ), + BatchQuaternionPoseToXYZ6D( + pose_key=right_cmd_wristframe, + output_key=right_cmd_wristframe, + ), + QuaternionPoseToXYZ6D( + pose_key=left_obs_camframe, + output_key=left_obs_camframe, + ), + QuaternionPoseToXYZ6D( + pose_key=right_obs_camframe, + output_key=right_obs_camframe, + ), + ] + ) + elif not is_quat: transform_list.extend( [ BatchQuaternionPoseToYPR( @@ -391,15 +445,23 @@ def _build_eva_bimanual_transform_list( stride: int = 1, extrinsics_key: str = "x5Dec13_2", is_quat: bool = True, + rot_repr: str = "ypr", ) -> list[Transform]: - """Canonical EVA bimanual transform pipeline used 
by tests and notebooks.""" + """Canonical EVA bimanual transform pipeline used by tests and notebooks. + + ``rot_repr="6d"`` keeps the frame math in quaternion (SLERP interpolation) + and emits continuous 6D rotation columns instead of ypr. + """ extrinsics = EXTRINSICS[extrinsics_key] left_extrinsics_pose = _matrix_to_xyzwxyz(extrinsics["left"][None, :])[0] right_extrinsics_pose = _matrix_to_xyzwxyz(extrinsics["right"][None, :])[0] left_extra_batch_key = {"left_extrinsics_pose": left_extrinsics_pose} right_extra_batch_key = {"right_extrinsics_pose": right_extrinsics_pose} - mode = "xyzwxyz" if is_quat else "xyzypr" + # 6D conversion needs the quaternion intermediate so the columns come from a + # continuous rotation, not a wrapped Euler one. + use_quat = is_quat or rot_repr == "6d" + mode = "xyzwxyz" if use_quat else "xyzypr" transform_list = [ ActionChunkCoordinateFrameTransform( target_world=left_target_world, @@ -455,7 +517,18 @@ def _build_eva_bimanual_transform_list( ), ] - if is_quat: + if rot_repr == "6d": + transform_list.append( + XYZWXYZ_to_XYZ6D( + keys=[ + left_cmd_camframe, + right_cmd_camframe, + left_obs_pose, + right_obs_pose, + ] + ) + ) + elif is_quat: transform_list.append( XYZWXYZ_to_XYZYPR( keys=[ diff --git a/egomimic/rldb/embodiment/human.py b/egomimic/rldb/embodiment/human.py index e97de31e2..957b43535 100644 --- a/egomimic/rldb/embodiment/human.py +++ b/egomimic/rldb/embodiment/human.py @@ -16,6 +16,8 @@ Reshape, SplitKeys, Transform, + XYZ6D_to_XYZYPR, + XYZWXYZ_to_XYZ6D, XYZWXYZ_to_XYZYPR, ) from egomimic.utils.viz_utils import ( @@ -99,7 +101,9 @@ def get_transform_list( mode: Literal[ "cartesian", "cartesian_padded", + "cartesian_6d", "cartesian_wristframe_ypr", + "cartesian_wristframe_6d", "keypoints_headframe_ypr", "keypoints_headframe_quat", "keypoints_wristframe_ypr", @@ -112,8 +116,16 @@ def get_transform_list( return _build_human_cartesian_bimanual_transform_list( stride=cls.ACTION_STRIDE ) + 
[PadGripperZeros(action_key="actions_cartesian")] + if mode == "cartesian_6d": + return _build_human_cartesian_bimanual_transform_list( + stride=cls.ACTION_STRIDE, rot_repr="6d" + ) if mode == "cartesian_wristframe_ypr": return _build_human_cartesian_eef_frame_transform_list(stride=cls.ACTION_STRIDE) + if mode == "cartesian_wristframe_6d": + return _build_human_cartesian_eef_frame_transform_list( + stride=cls.ACTION_STRIDE, rot_repr="6d" + ) if mode == "keypoints_headframe_ypr": return _build_human_keypoints_bimanual_transform_list( stride=cls.ACTION_STRIDE, is_quat=False @@ -328,13 +340,19 @@ class Mecka(Human): @classmethod def get_transform_list( cls, - mode: Literal["cartesian",] = "cartesian", + mode: Literal["cartesian", "cartesian_6d"] = "cartesian", chunk_length: int = 100, ) -> list[Transform]: if mode == "cartesian": - return _build_aria_cartesian_bimanual_transform_list( + return _build_human_cartesian_bimanual_transform_list( + stride=cls.ACTION_STRIDE, + chunk_length=chunk_length, + ) + elif mode == "cartesian_6d": + return _build_human_cartesian_bimanual_transform_list( stride=cls.ACTION_STRIDE, chunk_length=chunk_length, + rot_repr="6d", ) @classmethod @@ -959,6 +977,7 @@ def _build_human_cartesian_revert_eef_frame_transform_list( left_action_headframe: str = "left.action_ee_pose_headframe", right_action_headframe: str = "right.action_ee_pose_headframe", is_quat: bool = False, + rot_repr: str = "ypr", ) -> list[Transform]: """Revert wrist-frame ARIA cartesian actions back to head (camera) frame. @@ -966,9 +985,16 @@ def _build_human_cartesian_revert_eef_frame_transform_list( action chunks live in each side's wrist frame, the proprio ee-poses live in headframe (= Aria camera frame). Re-composes ``target_headframe @ chunk_wristframe`` so action chunks are back in headframe / camera frame. + + ``rot_repr="6d"`` reverts a model 6D prediction (per-arm width 9, ``xyz6d`` + coordinate transform with Gram-Schmidt), then collapses back to ypr. 
""" - pose_shape = 7 if is_quat else 6 - mode = "xyzwxyz" if is_quat else "xyzypr" + if rot_repr == "6d": + pose_shape = 9 + mode = "xyz6d" + else: + pose_shape = 7 if is_quat else 6 + mode = "xyzwxyz" if is_quat else "xyzypr" transform_list = [ SplitKeys( input_key=obs_key, @@ -998,12 +1024,18 @@ def _build_human_cartesian_revert_eef_frame_transform_list( mode=mode, inverse=False, ), + ] + if rot_repr == "6d": + transform_list.append( + XYZ6D_to_XYZYPR(keys=[left_action_headframe, right_action_headframe]) + ) + transform_list.append( ConcatKeys( key_list=[left_action_headframe, right_action_headframe], new_key_name=action_key, delete_old_keys=True, ), - ] + ) return transform_list @@ -1027,6 +1059,7 @@ def _build_human_cartesian_eef_frame_transform_list( chunk_length: int = 100, stride: int = 3, delete_target_world: bool = True, + rot_repr: str = "ypr", ) -> list[Transform]: """ARIA bimanual cartesian pipeline expressed in the current wrist frame. @@ -1034,7 +1067,7 @@ def _build_human_cartesian_eef_frame_transform_list( ``obs_head_pose``), then headframe → wristframe (via the proprio ``*.obs_ee_pose_headframe`` for each side). Proprio ee-poses remain in headframe (wristframe of the wrist itself is identity). All retained poses - are converted to xyz-ypr. + are converted to xyz-ypr, or to continuous 6D columns when ``rot_repr="6d"``. """ keys_to_delete = list( { @@ -1102,7 +1135,7 @@ def _build_human_cartesian_eef_frame_transform_list( transformed_key_name=right_action_wristframe, mode="xyzwxyz", ), - XYZWXYZ_to_XYZYPR( + (XYZWXYZ_to_XYZ6D if rot_repr == "6d" else XYZWXYZ_to_XYZYPR)( keys=[ left_action_wristframe, right_action_wristframe, @@ -1143,12 +1176,14 @@ def _build_human_cartesian_bimanual_transform_list( chunk_length: int = 100, stride: int = 3, delete_target_world: bool = True, + rot_repr: str = "ypr", ) -> list[Transform]: """Canonical ARIA bimanual transform pipeline used by tests and notebooks. 
Aria human data does not have commanded ee poses; action chunks are built from stacked observed ee poses (typically with a horizon on ``left/right.action_ee_pose`` mapped from ``left/right.obs_ee_pose``). + ``rot_repr="6d"`` emits continuous 6D rotation columns instead of ypr. """ keys_to_delete = list( { @@ -1207,7 +1242,7 @@ def _build_human_cartesian_bimanual_transform_list( if target_world_is_quat: transform_list.append( - XYZWXYZ_to_XYZYPR( + (XYZWXYZ_to_XYZ6D if rot_repr == "6d" else XYZWXYZ_to_XYZYPR)( keys=[ left_action_headframe, right_action_headframe, diff --git a/egomimic/rldb/zarr/action_chunk_transforms.py b/egomimic/rldb/zarr/action_chunk_transforms.py index 0388d386a..0e7aeafb0 100644 --- a/egomimic/rldb/zarr/action_chunk_transforms.py +++ b/egomimic/rldb/zarr/action_chunk_transforms.py @@ -26,8 +26,10 @@ _interpolate_quat_wxyz, _interpolate_xyz, _matrix_to_xyz, + _matrix_to_xyz6d, _matrix_to_xyzwxyz, _matrix_to_xyzypr, + _xyz6d_to_matrix, _xyz_to_matrix, _xyzwxyz_to_matrix, _xyzypr_to_matrix, @@ -149,7 +151,7 @@ def __init__( chunk_world: str, transformed_key_name: str, extra_batch_key: dict = None, - mode: Literal["xyz", "xyzwxyz", "xyzypr"] = "xyzwxyz", + mode: Literal["xyz", "xyzwxyz", "xyzypr", "xyz6d"] = "xyzwxyz", inverse: bool = True, ): """ @@ -196,14 +198,24 @@ def transform(self, batch): to_matrix_fn = _xyzwxyz_to_matrix elif self.mode == "xyzypr": to_matrix_fn = _xyzypr_to_matrix + elif self.mode == "xyz6d": + # Gram-Schmidt re-orthonormalization happens here when reverting a + # (possibly non-orthonormal) model 6D prediction back to a frame. + to_matrix_fn = _xyz6d_to_matrix elif self.mode == "xyz": to_matrix_fn = _xyz_to_matrix else: raise ValueError(f"Invalid mode: {self.mode}") - target_world_to_matrix_fn = ( - _xyzwxyz_to_matrix if target_world.shape[-1] == 7 else _xyzypr_to_matrix - ) + # Dispatch the target-world parser by its width: 7 -> xyz+quat(wxyz), + # 9 -> xyz+6D columns, else xyz+ypr. 
+ target_width = target_world.shape[-1] + if target_width == 7: + target_world_to_matrix_fn = _xyzwxyz_to_matrix + elif target_width == 9: + target_world_to_matrix_fn = _xyz6d_to_matrix + else: + target_world_to_matrix_fn = _xyzypr_to_matrix # Convert to SE3 for transformation target_se3 = SE3.from_matrix( target_world_to_matrix_fn(target_world[None, :])[0] @@ -223,6 +235,8 @@ def transform(self, batch): chunk_in_target_frame = _matrix_to_xyzwxyz(chunk_mats) elif self.mode == "xyzypr": chunk_in_target_frame = _matrix_to_xyzypr(chunk_mats) + elif self.mode == "xyz6d": + chunk_in_target_frame = _matrix_to_xyz6d(chunk_mats) elif self.mode == "xyz": chunk_in_target_frame = _matrix_to_xyz(chunk_mats) else: @@ -387,6 +401,95 @@ def transform(self, batch: dict) -> dict: return batch +class XYZWXYZ_to_XYZ6D(Transform): + """Convert listed keys from xyz+quat(wxyz) to xyz+6D-columns in-place. + + The 6D representation (Zhou et al. / 6DRepNet) is the first two columns of + the rotation matrix and is continuous everywhere (no +-pi wraparound), + which is what makes per-dimension normalization meaningful. + """ + + def __init__(self, keys: list[str]): + self.keys = list(keys) + + def transform(self, batch: dict) -> dict: + for key in self.keys: + value = np.asarray(batch[key]) + if value.ndim == 1 and value.shape[0] == 7: + batch[key] = _matrix_to_xyz6d(_xyzwxyz_to_matrix(value[None, :]))[0] + elif value.ndim == 2 and value.shape[1] == 7: + batch[key] = _matrix_to_xyz6d(_xyzwxyz_to_matrix(value)) + else: + raise ValueError( + f"XYZWXYZ_to_XYZ6D expects key '{key}' to have shape (7,) " + f"or (T, 7), got {value.shape}" + ) + return batch + + +class XYZ6D_to_XYZYPR(Transform): + """Convert listed keys from xyz+6D-columns to xyz+ypr in-place. + + Runs Gram-Schmidt (via ``_xyz6d_to_matrix``) to re-orthonormalize the two + columns, then reads Euler angles off the matrix. 
Used at the tail of the + revert pipelines so downstream viz / deploy keep seeing ypr while the model + natively predicts 6D. + """ + + def __init__(self, keys: list[str]): + self.keys = list(keys) + + def transform(self, batch: dict) -> dict: + for key in self.keys: + value = np.asarray(batch[key]) + if value.ndim == 1 and value.shape[0] == 9: + batch[key] = _matrix_to_xyzypr(_xyz6d_to_matrix(value[None, :]))[0] + elif value.ndim == 2 and value.shape[1] == 9: + batch[key] = _matrix_to_xyzypr(_xyz6d_to_matrix(value)) + else: + raise ValueError( + f"XYZ6D_to_XYZYPR expects key '{key}' to have shape (9,) " + f"or (T, 9), got {value.shape}" + ) + return batch + + +class QuaternionPoseToXYZ6D(Transform): + """Convert a single pose from xyz + quat(wxyz) to xyz + 6D-columns.""" + + def __init__(self, pose_key: str, output_key: str): + self.pose_key = pose_key + self.output_key = output_key + + def transform(self, batch: dict) -> dict: + pose = np.asarray(batch[self.pose_key]) + if pose.shape != (7,): + raise ValueError( + f"QuaternionPoseToXYZ6D expects shape (7,), got {pose.shape} for key " + f"'{self.pose_key}'" + ) + batch[self.output_key] = _matrix_to_xyz6d(_xyzwxyz_to_matrix(pose[None, :]))[0] + return batch + + +class BatchQuaternionPoseToXYZ6D(Transform): + """Convert a batch of poses from xyz + quat(wxyz) to xyz + 6D-columns.""" + + def __init__(self, pose_key: str, output_key: str): + self.pose_key = pose_key + self.output_key = output_key + + def transform(self, batch: dict) -> dict: + pose = np.asarray(batch[self.pose_key]) + if pose.ndim != 2 or pose.shape[-1] != 7: + raise ValueError( + f"BatchQuaternionPoseToXYZ6D expects shape (N, 7), got {pose.shape} " + f"for key '{self.pose_key}'" + ) + batch[self.output_key] = _matrix_to_xyz6d(_xyzwxyz_to_matrix(pose)) + return batch + + class CartesianWithGripperCoordinateTransform(Transform): def __init__( self, diff --git a/egomimic/rldb/zarr/test_6d_rotation_unit.py b/egomimic/rldb/zarr/test_6d_rotation_unit.py new 
file mode 100644 index 000000000..88d1738e7 --- /dev/null +++ b/egomimic/rldb/zarr/test_6d_rotation_unit.py @@ -0,0 +1,544 @@ +"""Unit + integration tests for the continuous 6D rotation migration. + +Covers the whole 6D path end to end: + * numpy pose helpers (``_matrix_to_xyz6d`` / ``_xyz6d_to_matrix``) and their + numerical parity with the torch Gram-Schmidt used at decode time; + * the new ``action_chunk_transforms`` building blocks (``XYZWXYZ_to_XYZ6D``, + ``XYZ6D_to_XYZYPR``, the ``xyz6d`` coordinate-frame mode); + * the per-embodiment ``cartesian_6d`` / ``cartesian_wristframe_6d`` builders + (shape progression to 18D human / 20D robot); + * the 32-dim action converters (``RobotBimanualCartesian6D`` / + ``HumanBimanualCartesian6D``) round-trips; + * the wrist-frame revert producing cam-frame ypr, validated to be equivalent + to the legacy ypr revert; + * the HPT gripper-padding generalization (TODO: no test for it in this file yet); + * the shared bimanual-cartesian index layout used by bounds + eval. +""" + +import numpy as np +import pytest +import torch +from scipy.spatial.transform import Rotation as R + +from egomimic.rldb.embodiment.eva import ( + Eva, + _build_eva_bimanual_revert_eef_frame_transform_list, +) +from egomimic.rldb.embodiment.human import ( + Aria, + Mecka, + Scale, + _build_human_cartesian_revert_eef_frame_transform_list, +) +from egomimic.rldb.zarr.action_chunk_transforms import ( + ActionChunkCoordinateFrameTransform, + BatchQuaternionPoseToXYZ6D, + ConcatKeys, + InterpolatePose, + QuaternionPoseToXYZ6D, + XYZ6D_to_XYZYPR, + XYZWXYZ_to_XYZ6D, + XYZWXYZ_to_XYZYPR, +) +from egomimic.utils.action_utils import ( + HumanBimanualCartesian6D, + RobotBimanualCartesian6D, + _reconstruct_R_from_cols, +) +from egomimic.utils.pose_utils import ( + _matrix_to_xyz6d, + _matrix_to_xyzwxyz, + _matrix_to_xyzypr, + _xyz6d_to_matrix, + bimanual_cartesian_layout, +) + + +# --------------------------------------------------------------------------- +# helpers +# 
--------------------------------------------------------------------------- +def _random_se3(n: int, seed: int) -> np.ndarray: + """n random SE3 matrices (proper rotation + translation).""" + rng = np.random.default_rng(seed) + rots = R.random(n, random_state=seed).as_matrix() + mats = np.tile(np.eye(4), (n, 1, 1)) + mats[:, :3, :3] = rots + mats[:, :3, 3] = rng.uniform(-2.0, 2.0, size=(n, 3)) + return mats + + +def _xyzwxyz_from_mats(mats: np.ndarray) -> np.ndarray: + return _matrix_to_xyzwxyz(mats) + + +# --------------------------------------------------------------------------- +# A. pose_utils 6D helpers +# --------------------------------------------------------------------------- +def test_matrix_to_xyz6d_extracts_translation_and_first_two_columns(): + yaw = 0.7 + mat = np.eye(4) + mat[:3, :3] = R.from_euler("Z", yaw).as_matrix() + mat[:3, 3] = [1.0, 2.0, 3.0] + + out = _matrix_to_xyz6d(mat[None]) + assert out.shape == (1, 9) + np.testing.assert_allclose(out[0, :3], [1.0, 2.0, 3.0]) + # column 1 = [cos, sin, 0], column 2 = [-sin, cos, 0] + np.testing.assert_allclose(out[0, 3:6], [np.cos(yaw), np.sin(yaw), 0.0], atol=1e-12) + np.testing.assert_allclose( + out[0, 6:9], [-np.sin(yaw), np.cos(yaw), 0.0], atol=1e-12 + ) + + +def test_xyz6d_matrix_round_trip_is_identity(): + mats = _random_se3(64, seed=0) + six = _matrix_to_xyz6d(mats) + back = _xyz6d_to_matrix(six) + np.testing.assert_allclose(back, mats, atol=1e-10) + + +def test_xyz6d_to_matrix_round_trip_from_6d(): + mats = _random_se3(32, seed=1) + six = _matrix_to_xyz6d(mats) + six2 = _matrix_to_xyz6d(_xyz6d_to_matrix(six)) + np.testing.assert_allclose(six2, six, atol=1e-10) + + +def test_xyz6d_to_matrix_orthonormalizes_non_orthonormal_columns(): + # Two arbitrary, non-orthonormal, non-unit columns. + six = np.array([[5.0, -6.0, 7.0, 2.0, 0.0, 0.0, 0.3, 4.0, 0.0]]) + mat = _xyz6d_to_matrix(six) + Rm = mat[0, :3, :3] + # Proper rotation: orthonormal columns, det +1. 
+ np.testing.assert_allclose(Rm @ Rm.T, np.eye(3), atol=1e-10) + np.testing.assert_allclose(np.linalg.det(Rm), 1.0, atol=1e-10) + # Translation preserved verbatim. + np.testing.assert_allclose(mat[0, :3, 3], [5.0, -6.0, 7.0]) + # First column points along the (normalized) input c1 direction. + np.testing.assert_allclose(Rm[:, 0], [1.0, 0.0, 0.0], atol=1e-10) + + +def test_xyz6d_to_matrix_is_scale_invariant_in_columns(): + """Gram-Schmidt only uses column *directions*, so scaling them must not + change the recovered rotation (key property the model relies on).""" + six = np.array([[0.0, 0.0, 0.0, 1.0, 0.2, -0.1, -0.2, 1.0, 0.05]]) + scaled = six.copy() + scaled[:, 3:6] *= 3.7 + scaled[:, 6:9] *= 0.25 + np.testing.assert_allclose( + _xyz6d_to_matrix(six), _xyz6d_to_matrix(scaled), atol=1e-10 + ) + + +def test_numpy_and_torch_gram_schmidt_are_bit_compatible(): + """The numpy revert (``_xyz6d_to_matrix``) and the torch decode + (``_reconstruct_R_from_cols``) must agree, including on non-orthonormal + input, or train/eval/deploy would disagree.""" + rng = np.random.default_rng(7) + c1 = rng.uniform(-3, 3, size=(50, 3)) + c2 = rng.uniform(-3, 3, size=(50, 3)) + xyz = rng.uniform(-1, 1, size=(50, 3)) + six = np.concatenate([xyz, c1, c2], axis=-1) + + np_mats = _xyz6d_to_matrix(six) + torch_R = _reconstruct_R_from_cols( + torch.from_numpy(c1)[:, None, :], torch.from_numpy(c2)[:, None, :] + )[:, 0].numpy() + np.testing.assert_allclose(np_mats[:, :3, :3], torch_R, atol=1e-12) + + +def test_xyz6d_helpers_reject_bad_shapes(): + with pytest.raises(ValueError, match=r"Expected \(B, 9\) array"): + _xyz6d_to_matrix(np.zeros((3, 6))) + with pytest.raises(ValueError, match=r"Expected \(B, 4, 4\) array"): + _matrix_to_xyz6d(np.zeros((3, 4))) + + +# --------------------------------------------------------------------------- +# A'. 
shared bimanual-cartesian index layout +# --------------------------------------------------------------------------- +@pytest.mark.parametrize("width", [12, 14, 18, 20]) +def test_bimanual_layout_partitions_every_channel(width): + layout = bimanual_cartesian_layout(width) + assert layout is not None + all_idx = list(layout["xyz"]) + list(layout["rot"]) + list(layout["grip"]) + assert sorted(all_idx) == list(range(width)) # no overlap, full cover + assert len(layout["xyz"]) == 6 # 3 per arm + + +@pytest.mark.parametrize("width,n_grip", [(12, 0), (14, 2), (18, 0), (20, 2)]) +def test_bimanual_layout_gripper_counts(width, n_grip): + assert len(bimanual_cartesian_layout(width)["grip"]) == n_grip + + +def test_bimanual_layout_unknown_width_is_none(): + for w in (0, 6, 7, 9, 13, 32): + assert bimanual_cartesian_layout(w) is None + + +# --------------------------------------------------------------------------- +# B. action_chunk_transforms 6D building blocks +# --------------------------------------------------------------------------- +def test_xyzwxyz_to_xyz6d_single_and_chunk_shapes(): + yaw = np.pi / 3 + qw, qz = np.cos(yaw / 2), np.sin(yaw / 2) + single = XYZWXYZ_to_XYZ6D(keys=["p"]).transform( + {"p": np.array([1.0, 2.0, 3.0, qw, 0.0, 0.0, qz])} + )["p"] + assert single.shape == (9,) + np.testing.assert_allclose(single[3:6], [np.cos(yaw), np.sin(yaw), 0.0], atol=1e-7) + + chunk = XYZWXYZ_to_XYZ6D(keys=["p"]).transform( + {"p": np.tile([0, 0, 0, qw, 0, 0, qz], (4, 1)).astype(float)} + )["p"] + assert chunk.shape == (4, 9) + + +def test_xyzwxyz_to_xyz6d_then_xyz6d_to_ypr_matches_direct_ypr(): + """quat -> 6d -> ypr equals quat -> ypr away from gimbal lock.""" + mats = _random_se3(20, seed=3) + poses = _xyzwxyz_from_mats(mats) + + via6d = XYZWXYZ_to_XYZ6D(keys=["p"]).transform({"p": poses.copy()})["p"] + via6d = XYZ6D_to_XYZYPR(keys=["p"]).transform({"p": via6d})["p"] + direct = XYZWXYZ_to_XYZYPR(keys=["p"]).transform({"p": poses.copy()})["p"] + 
np.testing.assert_allclose(via6d, direct, atol=1e-7) + + +def test_xyz6d_to_xypr_rejects_bad_shape(): + with pytest.raises(ValueError, match=r"XYZ6D_to_XYZYPR expects key 'p'"): + XYZ6D_to_XYZYPR(keys=["p"]).transform({"p": np.zeros((2, 6))}) + + +def test_quaternion_pose_to_xyz6d_single_and_batch_agree(): + mats = _random_se3(5, seed=4) + poses = _xyzwxyz_from_mats(mats) + batch = BatchQuaternionPoseToXYZ6D(pose_key="p", output_key="o").transform( + {"p": poses.copy()} + )["o"] + assert batch.shape == (5, 9) + for i in range(5): + single = QuaternionPoseToXYZ6D(pose_key="p", output_key="o").transform( + {"p": poses[i].copy()} + )["o"] + np.testing.assert_allclose(single, batch[i], atol=1e-9) + + +def test_quaternion_pose_to_xyz6d_rejects_bad_shape(): + with pytest.raises(ValueError, match="QuaternionPoseToXYZ6D expects shape"): + QuaternionPoseToXYZ6D(pose_key="p", output_key="o").transform( + {"p": np.zeros(9)} + ) + + +def test_action_chunk_xyz6d_mode_identity_target_preserves_chunk(): + """With an identity target frame, the xyz6d coordinate transform should + return the input rotation unchanged (columns already orthonormal).""" + mats = _random_se3(6, seed=5) + chunk6d = _matrix_to_xyz6d(mats) + identity6d = _matrix_to_xyz6d(np.eye(4)[None])[0] # (9,) + + out = ActionChunkCoordinateFrameTransform( + target_world="t", + chunk_world="c", + transformed_key_name="o", + mode="xyz6d", + inverse=False, + ).transform({"t": identity6d, "c": chunk6d})["o"] + + assert out.shape == (6, 9) + np.testing.assert_allclose(out, chunk6d, atol=1e-9) + + +def test_action_chunk_xyz6d_mode_dispatches_width9_target(): + """A width-9 target world must be parsed as xyz6d (revert path), giving the + same result as composing the rotations by hand.""" + target_mat = _random_se3(1, seed=6)[0] + chunk_mats = _random_se3(4, seed=7) + target6d = _matrix_to_xyz6d(target_mat[None])[0] + chunk6d = _matrix_to_xyz6d(chunk_mats) + + out = ActionChunkCoordinateFrameTransform( + target_world="t", + 
chunk_world="c", + transformed_key_name="o", + mode="xyz6d", + inverse=False, + ).transform({"t": target6d, "c": chunk6d})["o"] + + expected = _matrix_to_xyz6d(target_mat[None] @ chunk_mats) + np.testing.assert_allclose(out, expected, atol=1e-9) + + +# --------------------------------------------------------------------------- +# C. 32-dim action converters +# --------------------------------------------------------------------------- +def test_robot6d_converter_round_trip_and_layout(): + conv = RobotBimanualCartesian6D() + x = torch.randn(2, 5, 20) + out32 = conv.to32(x) + assert out32.shape == (2, 5, 32) + # 20 native dims land verbatim in the first 20 slots; rest is zero pad. + torch.testing.assert_close(out32[..., :20], x) + torch.testing.assert_close(out32[..., 20:], torch.zeros(2, 5, 12)) + torch.testing.assert_close(conv.from32(out32), x) + + +def test_robot6d_converter_rejects_wrong_width(): + with pytest.raises(ValueError, match="expected 20-dim"): + RobotBimanualCartesian6D().to32(torch.randn(1, 1, 18)) + + +def test_human6d_converter_round_trip_and_gripper_slots(): + conv = HumanBimanualCartesian6D() + x = torch.randn(3, 4, 18) + out32 = conv.to32(x) + assert out32.shape == (3, 4, 32) + # left block 0:9 == left arm, right block 10:19 == right arm. + torch.testing.assert_close(out32[..., 0:9], x[..., 0:9]) + torch.testing.assert_close(out32[..., 10:19], x[..., 9:18]) + # inserted gripper slots (9, 19) are zero. + torch.testing.assert_close(out32[..., 9], torch.zeros(3, 4)) + torch.testing.assert_close(out32[..., 19], torch.zeros(3, 4)) + torch.testing.assert_close(conv.from32(out32), x) + + +def test_human6d_converter_rejects_wrong_width(): + with pytest.raises(ValueError, match="expected 18-dim"): + HumanBimanualCartesian6D().to32(torch.randn(1, 1, 20)) + + +def test_human6d_from32_drops_gripper_slots(): + # from32 must ignore whatever sits in the gripper slots (model may emit + # non-zero there). 
+ conv = HumanBimanualCartesian6D() + a32 = torch.randn(2, 2, 32) + out = conv.from32(a32) + assert out.shape == (2, 2, 18) + torch.testing.assert_close(out[..., 0:9], a32[..., 0:9]) + torch.testing.assert_close(out[..., 9:18], a32[..., 10:19]) + + +def test_2d_input_is_promoted_to_bsd(): + # converters accept (B, D) and unsqueeze a sequence axis. + out = RobotBimanualCartesian6D().to32(torch.randn(4, 20)) + assert out.shape == (4, 1, 32) + + +# --------------------------------------------------------------------------- +# D. builder shape progression (integration over the real transform lists) +# --------------------------------------------------------------------------- +def _eva_cartesian_batch(T=5): + cmd = np.zeros((T, 7)) + cmd[:, 3] = 1.0 + obs = np.zeros((7,)) + obs[3] = 1.0 + return { + "left.cmd_ee_pose": cmd.copy(), + "right.cmd_ee_pose": cmd.copy(), + "left.obs_ee_pose": obs.copy(), + "right.obs_ee_pose": obs.copy(), + "left.cmd_gripper": np.zeros((T, 1)), + "right.cmd_gripper": np.zeros((T, 1)), + "left.obs_gripper": np.zeros((1,)), + "right.obs_gripper": np.zeros((1,)), + } + + +def _run(transform_list, batch): + data = {k: np.asarray(v).copy() for k, v in batch.items()} + for t in transform_list: + data = t.transform(data) + return data + + +def test_eva_cartesian_6d_produces_20d_action_and_obs(): + out = _run(Eva.get_transform_list("cartesian_6d"), _eva_cartesian_batch()) + assert tuple(out["actions_cartesian"].shape)[-1] == 20 + assert tuple(out["observations.state.ee_pose"].shape) == (20,) + + +def test_eva_cartesian_wristframe_6d_produces_20d(): + out = _run( + Eva.get_transform_list("cartesian_wristframe_6d"), _eva_cartesian_batch() + ) + assert tuple(out["actions_cartesian"].shape)[-1] == 20 + assert tuple(out["observations.state.ee_pose"].shape) == (20,) + + +def _aria_cartesian_batch(T=6): + act = np.zeros((T, 7)) + act[:, 3] = 1.0 + obs = np.zeros((7,)) + obs[3] = 1.0 + return { + "obs_head_pose": np.array([0.0, 0, 0, 1.0, 0, 0, 0]), + 
"left.action_ee_pose": act.copy(), + "right.action_ee_pose": act.copy(), + "left.obs_ee_pose": obs.copy(), + "right.obs_ee_pose": obs.copy(), + } + + +@pytest.mark.parametrize("emb", [Aria, Mecka, Scale]) +def test_human_cartesian_6d_produces_18d(emb): + out = _run(emb.get_transform_list("cartesian_6d"), _aria_cartesian_batch()) + assert tuple(out["actions_cartesian"].shape)[-1] == 18 + assert tuple(out["observations.state.ee_pose"].shape) == (18,) + + +def test_eva_6d_builder_converter_placed_after_interpolate_before_concat(): + tl = Eva.get_transform_list("cartesian_6d") + conv = [i for i, t in enumerate(tl) if isinstance(t, XYZWXYZ_to_XYZ6D)] + interp = [i for i, t in enumerate(tl) if isinstance(t, InterpolatePose)] + concat = [i for i, t in enumerate(tl) if isinstance(t, ConcatKeys)] + assert len(conv) == 1 + assert max(interp) < conv[0] < min(concat) + + +def test_cartesian_6d_rotation_columns_are_bounded(): + """Random ee poses -> the 6D rotation columns stay within [-1, 1] (unit + matrix columns), which is what makes per-dim normalization meaningful.""" + mats = _random_se3(8, seed=11) + cmd = _matrix_to_xyzwxyz(mats) + obs = _matrix_to_xyzwxyz(_random_se3(1, seed=12))[0] + batch = { + "obs_head_pose": np.array([0.0, 0, 0, 1.0, 0, 0, 0]), + "left.action_ee_pose": cmd.copy(), + "right.action_ee_pose": cmd.copy(), + "left.obs_ee_pose": obs.copy(), + "right.obs_ee_pose": obs.copy(), + } + out = _run(Aria.get_transform_list("cartesian_6d"), batch) + layout = bimanual_cartesian_layout(18) + rot = np.asarray(out["actions_cartesian"])[..., list(layout["rot"])] + assert rot.min() >= -1.0 - 1e-6 and rot.max() <= 1.0 + 1e-6 + + +# --------------------------------------------------------------------------- +# E. 
revert: model 6D wrist-frame -> cam-frame ypr, equivalent to ypr revert +# --------------------------------------------------------------------------- +def test_human_6d_revert_matches_ypr_revert(): + """The 6D revert (Gram-Schmidt + xyz6d frame compose + collapse to ypr) + must yield the same cam-frame ypr as the legacy ypr revert fed the + equivalent rotations.""" + T = 5 + obs_l = _random_se3(1, seed=20)[0] + obs_r = _random_se3(1, seed=21)[0] + wrist_l = _random_se3(T, seed=22) + wrist_r = _random_se3(T, seed=23) + + # ypr inputs (12D obs / per-frame action) + obs_ypr = np.concatenate( + [_matrix_to_xyzypr(obs_l[None])[0], _matrix_to_xyzypr(obs_r[None])[0]] + ) + act_ypr = np.concatenate( + [_matrix_to_xyzypr(wrist_l), _matrix_to_xyzypr(wrist_r)], axis=-1 + ) + ypr_out = _run( + _build_human_cartesian_revert_eef_frame_transform_list(rot_repr="ypr"), + {"observations.state.ee_pose": obs_ypr, "actions_cartesian": act_ypr}, + )["actions_cartesian"] + + # 6D inputs (18D obs / per-frame action) + obs_6d = np.concatenate( + [_matrix_to_xyz6d(obs_l[None])[0], _matrix_to_xyz6d(obs_r[None])[0]] + ) + act_6d = np.concatenate( + [_matrix_to_xyz6d(wrist_l), _matrix_to_xyz6d(wrist_r)], axis=-1 + ) + six_out = _run( + _build_human_cartesian_revert_eef_frame_transform_list(rot_repr="6d"), + {"observations.state.ee_pose": obs_6d, "actions_cartesian": act_6d}, + )["actions_cartesian"] + + assert six_out.shape == ypr_out.shape == (T, 12) + np.testing.assert_allclose(six_out, ypr_out, atol=1e-6) + + +def test_eva_6d_revert_matches_ypr_revert_with_grippers(): + T = 4 + obs_l = _random_se3(1, seed=30)[0] + obs_r = _random_se3(1, seed=31)[0] + wrist_l = _random_se3(T, seed=32) + wrist_r = _random_se3(T, seed=33) + glo, gro = 0.3, 0.7 # obs grippers + gla = np.linspace(0, 1, T)[:, None] + gra = np.linspace(1, 0, T)[:, None] + + def obs_concat(to_pose): + return np.concatenate( + [to_pose(obs_l[None])[0], [glo], to_pose(obs_r[None])[0], [gro]] + ) + + def act_concat(to_pose): + return 
np.concatenate([to_pose(wrist_l), gla, to_pose(wrist_r), gra], axis=-1) + + ypr_out = _run( + _build_eva_bimanual_revert_eef_frame_transform_list( + rot_repr="ypr", is_quat=False + ), + { + "observations.state.ee_pose": obs_concat(_matrix_to_xyzypr), + "actions_cartesian": act_concat(_matrix_to_xyzypr), + }, + )["actions_cartesian"] + + six_out = _run( + _build_eva_bimanual_revert_eef_frame_transform_list(rot_repr="6d"), + { + "observations.state.ee_pose": obs_concat(_matrix_to_xyz6d), + "actions_cartesian": act_concat(_matrix_to_xyz6d), + }, + )["actions_cartesian"] + + assert six_out.shape == ypr_out.shape == (T, 14) + np.testing.assert_allclose(six_out, ypr_out, atol=1e-6) + + +def test_eva_6d_revert_orthonormalizes_noisy_model_output(): + """A non-orthonormal 6D 'prediction' must still revert to a finite ypr + chunk of the right shape (only shape/finiteness is asserted), exercising + the decode-time Gram-Schmidt inside the human xyz6d revert.""" + T = 3 + obs_l = _random_se3(1, seed=40)[0] + obs_r = _random_se3(1, seed=41)[0] + wrist_l = _random_se3(T, seed=42) + wrist_r = _random_se3(T, seed=43) + + clean_l = _matrix_to_xyz6d(wrist_l) + clean_r = _matrix_to_xyz6d(wrist_r) + noisy_l = clean_l.copy() + noisy_r = clean_r.copy() + # left arm: scale c1 (direction kept); right arm: jitter c2 off-orthogonal + noisy_l[:, 3:6] *= 1.5 + noisy_r[:, 6:9] += 0.05 + + obs_6d = np.concatenate( + [_matrix_to_xyz6d(obs_l[None])[0], _matrix_to_xyz6d(obs_r[None])[0]] + ) + out = _run( + _build_human_cartesian_revert_eef_frame_transform_list(rot_repr="6d"), + { + "observations.state.ee_pose": obs_6d, + "actions_cartesian": np.concatenate([noisy_l, noisy_r], axis=-1), + }, + )["actions_cartesian"] + assert out.shape == (T, 12) + assert np.all(np.isfinite(out)) + + +# --------------------------------------------------------------------------- +# F. 
full chain: data 6D -> converter to32 -> from32 -> revert (pi0.5 path) +# --------------------------------------------------------------------------- +def test_human6d_converter_preserves_data_pipeline_output(): + """A native 18D vector from the data pipeline survives to32/from32 intact, + so the (un)normalized values the model is trained on are exactly the ones + decoded back.""" + out = _run(Aria.get_transform_list("cartesian_6d"), _aria_cartesian_batch()) + native = torch.as_tensor(np.asarray(out["actions_cartesian"]))[ + None + ].float() # (1,T,18) + conv = HumanBimanualCartesian6D() + round_tripped = conv.from32(conv.to32(native)) + torch.testing.assert_close(round_tripped, native) diff --git a/egomimic/rldb/zarr/test_multi_retry.py b/egomimic/rldb/zarr/test_multi_retry.py index b3ecc7a9d..3b3b10a25 100644 --- a/egomimic/rldb/zarr/test_multi_retry.py +++ b/egomimic/rldb/zarr/test_multi_retry.py @@ -154,3 +154,55 @@ def test_reject_outliers_true_rejects_xyz_quantile_violation(): data = {"embodiment": 9, "actions_cartesian": actions} assert "Bounds violation" in mds._check_bounds(data, _DummyLeaf("ep", 1), 0, "ep") + + +# --- 6D (18D human / 20D robot) bounds checks: rotation columns excluded ----- +from egomimic.utils.pose_utils import bimanual_cartesian_layout # noqa: E402 + + +def _make_cartesian_mds_width(reject_outliers: bool, width: int) -> MultiDataset: + mds = MultiDataset( + datasets={"ep": _DummyLeaf("ep", 1)}, + mode="total", + reject_outliers=reject_outliers, + ) + q_low = torch.full((1, width), -1.0) + q_high = torch.full((1, width), 1.0) + stats = { + "quantile_0_01": q_low, + "quantile_99_99": q_high, + "quantile_1": q_low, + "quantile_99": q_high, + } + mds.key_types = {9: {"actions_cartesian": "action_keys"}} + mds.zarr_keys = {9: {"actions_cartesian": "actions_cartesian"}} + mds.norm_stats = {9: {"actions_cartesian": stats}} + return mds + + +@pytest.mark.parametrize("width", [18, 20]) +def 
test_6d_rotation_columns_are_not_quantile_checked(width): + mds = _make_cartesian_mds_width(reject_outliers=True, width=width) + actions = torch.zeros(1, width) + # Push every rotation column way out of [-1, 1]; must still pass. + actions[..., list(bimanual_cartesian_layout(width)["rot"])] = 9.0 + data = {"embodiment": 9, "actions_cartesian": actions} + assert mds._check_bounds(data, _DummyLeaf("ep", 1), 0, "ep") is None + + +@pytest.mark.parametrize("width", [18, 20]) +def test_6d_xyz_outlier_still_rejected(width): + mds = _make_cartesian_mds_width(reject_outliers=True, width=width) + actions = torch.zeros(1, width) + actions[..., bimanual_cartesian_layout(width)["xyz"][0]] = 5.0 + data = {"embodiment": 9, "actions_cartesian": actions} + assert "Bounds violation" in mds._check_bounds(data, _DummyLeaf("ep", 1), 0, "ep") + + +def test_6d_robot_gripper_outlier_still_rejected(): + # 20D robot keeps grippers in the bounds check. + mds = _make_cartesian_mds_width(reject_outliers=True, width=20) + actions = torch.zeros(1, 20) + actions[..., bimanual_cartesian_layout(20)["grip"][0]] = 5.0 + data = {"embodiment": 9, "actions_cartesian": actions} + assert "Bounds violation" in mds._check_bounds(data, _DummyLeaf("ep", 1), 0, "ep") diff --git a/egomimic/rldb/zarr/zarr_dataset_multi.py b/egomimic/rldb/zarr/zarr_dataset_multi.py index e7060cdc9..37e6ec0bb 100644 --- a/egomimic/rldb/zarr/zarr_dataset_multi.py +++ b/egomimic/rldb/zarr/zarr_dataset_multi.py @@ -48,6 +48,7 @@ create_default_engine, episode_table_to_df, ) +from egomimic.utils.pose_utils import bimanual_cartesian_layout if TYPE_CHECKING: # Annotation-only import — avoids a runtime circular import with @@ -911,11 +912,19 @@ def _check_bounds( logger.warning(prefix) return prefix - is_cartesian_action = ( - key_type == "action_keys" - and key_name == "actions_cartesian" - and arr.shape[-1] == 12 - ) + # The bimanual cartesian action chunk and the ee_pose proprio share a + # [L | R] layout whose rotation channels are 
either Euler ypr (wraps + # at ±π) or continuous 6D columns. In both cases quantile bounds on + # the rotation channels are meaningless and reject otherwise-valid + # frames, so we only bounds-check the translation (and gripper) + # channels. ``bimanual_cartesian_layout`` recognizes widths + # 12/14 (ypr) and 18/20 (6D); any other width falls through to a + # full-vector check. + cartesian_layout = None + if (key_type == "action_keys" and key_name == "actions_cartesian") or ( + key_type == "proprio_keys" and key_name == "observations.state.ee_pose" + ): + cartesian_layout = bimanual_cartesian_layout(arr.shape[-1]) if not self.reject_outliers: continue @@ -933,11 +942,13 @@ def _check_bounds( except RuntimeError: continue - if is_cartesian_action: - xyz_idx = list(self.CARTESIAN_ACTION_XYZ_INDICES) - arr_for_quantiles = arr[..., xyz_idx] - q_low = q_low[..., xyz_idx] - q_high = q_high[..., xyz_idx] + if cartesian_layout is not None: + check_idx = list(cartesian_layout["xyz"]) + list( + cartesian_layout["grip"] + ) + arr_for_quantiles = arr[..., check_idx] + q_low = q_low[..., check_idx] + q_high = q_high[..., check_idx] else: arr_for_quantiles = arr diff --git a/egomimic/scripts/print_valid_annotations.py b/egomimic/scripts/print_valid_annotations.py new file mode 100644 index 000000000..326e64fa2 --- /dev/null +++ b/egomimic/scripts/print_valid_annotations.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python +""" +Print the language annotations (and the frames they span) for the episode(s) +that a run's validation video was rendered on. + +The validation video buffers frames from the *valid* split of a single +``data=`` filter (here: ``lab == 'mecka' and task == 'folding_clothes'``), +capped at ``max_frames_per_task * videos_per_task`` frames. This script +reproduces that exact selection: + + 1. apply the run's SQL filter to ``app.episodes`` (lab + task), + 2. 
take the deterministic valid split (seed 42, valid_ratio 0.2) via + ``split_dataset_names`` -- the same call the dataloader makes, + 3. for each valid episode, read the zarr ``annotations`` array and print + each span's text and ``[start_idx, end_idx)`` frame range. + +Defaults match logs/.../mecka_pi_fold_clothes_freeform_2026-06-01_19-23-33. +Override with --lab / --task / --dataset-dir if you point it at another run. +""" + +import argparse +import json +import os + +import numpy as np +import zarr + +from egomimic.rldb.filters import DatasetFilter +from egomimic.rldb.zarr.zarr_dataset_multi import ( + S3EpisodeResolver, + split_dataset_names, +) +from egomimic.utils.aws.aws_data_utils import load_env + +# Defaults pulled from the run's .hydra/config.yaml (data.*.filters + +# evaluator settings). Change these if you target a different run. +DEFAULT_DATASET_DIR = "/storage/project/r-dxu345-0/shared/egoverseS3ZarrDatasets" +DEFAULT_LAB = "mecka" +DEFAULT_TASK = "folding_clothes" +VALID_RATIO = 0.2 # MultiDataset default; the config does not override it +# evaluator.max_frames_per_task * evaluator.videos_per_task from the run config. 
+DEFAULT_VIDEO_FRAME_CAP = 1000 * 1 + + +def decode_entry(value): + """Decode one raw annotations[] element into a dict (mirrors + ZarrDataset._decode_json_entry).""" + if isinstance(value, np.void): + value = value.item() + if isinstance(value, memoryview): + value = value.tobytes() + if isinstance(value, bytearray): + value = bytes(value) + if isinstance(value, bytes): + return json.loads(value.decode("utf-8")) + if isinstance(value, str): + return json.loads(value) + return value + + +def main(): + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("--lab", default=DEFAULT_LAB) + ap.add_argument("--task", default=DEFAULT_TASK) + ap.add_argument("--dataset-dir", default=DEFAULT_DATASET_DIR) + ap.add_argument( + "--all-splits", + action="store_true", + help="Print every matching episode, not just the ones the video shows.", + ) + ap.add_argument( + "--valid-split", + action="store_true", + help="Print all 92 valid-split episodes (not just those in the video).", + ) + ap.add_argument( + "--frame-cap", + type=int, + default=DEFAULT_VIDEO_FRAME_CAP, + help="max_frames_per_task * videos_per_task; the video's frame budget.", + ) + args = ap.parse_args() + + load_env() # AWS/SQL credentials, same as trainHydra.py + + filt = DatasetFilter( + filter_lambdas=[ + f"lambda row: row['lab'] == '{args.lab}' " + f"and row['task'] == '{args.task}'" + ] + ) + + # SQL-only lookup: returns [(zarr_processed_path, episode_hash), ...]. + # No download is triggered here. + paths, hash_to_task = S3EpisodeResolver._get_filtered_paths(filt) + all_hashes = sorted(h for _, h in paths) + if not all_hashes: + print(f"No episodes matched lab={args.lab!r} task={args.task!r}.") + return + + train_set, valid_set = split_dataset_names(all_hashes, valid_ratio=VALID_RATIO) + # The val loader is shuffle=False and episodes load in sorted(iterdir()) + # order, i.e. sorted episode_hash. 
The viz renders one frame per validation + # sample in dataset order (one sample == one frame index), so the video is + # the valid episodes in this order, played frame-by-frame, truncated at the + # frame cap. Walk the sorted valid hashes accumulating frames to find which + # episode(s) the video actually shows. + sorted_valid = sorted(valid_set) + coverage = {} # hash -> frames shown in video [0, shown) + remaining = args.frame_cap + for h in sorted_valid: + if remaining <= 0: + break + ep_path = os.path.join(args.dataset_dir, h) + if not os.path.isdir(ep_path): + # Episode not cached locally: we can't read its length to advance + # the cumulative offset. Flag it and stop -- ordering past this + # point is unknown. + coverage[h] = None + break + total = int(dict(zarr.open_group(ep_path, mode="r").attrs)["total_frames"]) + shown = min(total, remaining) + coverage[h] = shown + remaining -= shown + + if args.all_splits: + chosen = all_hashes + elif args.valid_split: + chosen = sorted_valid + else: + chosen = list(coverage.keys()) # only episodes shown in the video + + print( + f"Filter: lab={args.lab!r} task={args.task!r}\n" + f"Matched {len(all_hashes)} episode(s); " + f"{len(valid_set)} in valid split (seed 42, ratio {VALID_RATIO}).\n" + f"Validation video frame budget: {args.frame_cap} frames " + f"(shuffle=False, sorted-hash order, one frame per frame-index).\n" + f"=> Video covers {len(coverage)} episode(s); " + f"annotations below.\n" + ) + + for h in chosen: + split = "valid" if h in valid_set else "train" + shown = coverage.get(h) # frames shown in video, or None + ep_path = os.path.join(args.dataset_dir, h) + print("=" * 78) + tag = ( + f" [shows frames 0-{shown - 1} in video]" + if shown + else ( + " [in valid split, NOT reached by video]" if split == "valid" else "" + ) + ) + print(f"episode_hash: {h} [{split} split]{tag}") + if not os.path.isdir(ep_path): + print(f" (not in local cache: {ep_path})") + continue + g = zarr.open_group(ep_path, mode="r") + 
attrs = dict(g.attrs) + total = attrs.get("total_frames", "?") + print( + f" embodiment={attrs.get('embodiment')!r} " + f"task_name={attrs.get('task_name')!r} total_frames={total}" + ) + if "annotations" not in g: + print(" no 'annotations' array in store.") + continue + raw = g["annotations"][:] + anns = [decode_entry(x) for x in raw] + anns = [a for a in anns if isinstance(a, dict)] + if not anns: + print(" annotations array is empty (0 spans).") + continue + anns.sort(key=lambda a: int(a.get("start_idx", -1))) + print(f" {len(anns)} annotation span(s):") + for a in anns: + s, e = int(a.get("start_idx", -1)), int(a.get("end_idx", -1)) + # Flag whether this span is visible within the rendered window. + if shown is None: + vis = "" + elif s >= shown: + vis = " (not shown)" + elif e > shown: + vis = f" (partly shown: up to frame {shown - 1})" + else: + vis = " (shown)" + print( + f" frames [{s:>5} , {e:>5}) ({e - s:>5} frames) " + f"{a.get('text', '')!r}{vis}" + ) + print("=" * 78) + + +if __name__ == "__main__": + main() diff --git a/egomimic/utils/action_utils.py b/egomimic/utils/action_utils.py index 75c4fac11..4dbd4c8d4 100644 --- a/egomimic/utils/action_utils.py +++ b/egomimic/utils/action_utils.py @@ -345,3 +345,62 @@ def from32(self, actions32: torch.Tensor) -> torch.Tensor: R_R = _reconstruct_R_from_cols(R_c1, R_c2) R_ypr = _matrix_to_ypr(R_R) return torch.cat([L_xyz, L_ypr, R_xyz, R_ypr], dim=-1) # (B,S,12) + + +# ============================================================ +# 6D ROTATION CONVERTERS +# ============================================================ +# These consume the continuous 6D rotation representation that the data pipeline +# now produces (the ``cartesian_6d`` / ``cartesian_wristframe_6d`` modes), rather +# than Euler ypr. The native per-arm layout is already the model's 32-dim block +# layout — ``[xyz(3), col1(3), col2(3), grip(1)]`` — so the converters are +# trivial repacks: no trig, no Gram-Schmidt. 
class RobotBimanualCartesian6D(BaseActionConverter):
    """Repack robot bimanual 6D-rotation actions to and from the 32-wide layout.

    Native layout, last axis of width 20 (nominally ``(B, S, 20)``)::

        [L: xyz(3) c1(3) c2(3) grip(1) | R: xyz(3) c1(3) c2(3) grip(1)]

    The left arm occupies slots 0..9 and the right arm slots 10..19, so the
    native vector is handed to ``_pad32`` unchanged.
    """

    def to32(self, actions: torch.Tensor) -> torch.Tensor:
        """Check the input is 20 wide, then pass it through ``_pad32``.

        Raises:
            ValueError: if the last dimension (after ``_ensure_bsd``) is not 20.
        """
        packed = _ensure_bsd(actions)
        width = packed.shape[-1]
        if width != 20:
            raise ValueError(f"RobotBimanual6D: expected 20-dim, got {width}")
        return _pad32(packed)

    def from32(self, actions32: torch.Tensor) -> torch.Tensor:
        """Return the first 20 channels of the packed tensor.

        This is a plain slice: the six rotation-column values per arm come
        back exactly as stored, with no re-orthonormalisation.
        """
        return _ensure_bsd(actions32)[..., :20]  # nominally (B,S,20)


class HumanBimanualCartesian6D(BaseActionConverter):
    """Repack human bimanual 6D-rotation actions to and from the 32-wide layout.

    Native layout, last axis of width 18 (nominally ``(B, S, 18)``), with no
    gripper channel::

        [L: xyz(3) c1(3) c2(3) | R: xyz(3) c1(3) c2(3)]

    Packing appends a zero "grip" slot after each 9-wide arm so the arms land
    in slots 0..9 and 10..19 before ``_pad32`` is applied.
    """

    def to32(self, actions: torch.Tensor) -> torch.Tensor:
        """Insert a zero gripper slot per arm and pass the result to ``_pad32``.

        Raises:
            ValueError: if the last dimension (after ``_ensure_bsd``) is not 18.
        """
        pose = _ensure_bsd(actions)
        width = pose.shape[-1]
        if width != 18:
            raise ValueError(f"HumanBimanual6D: expected 18-dim, got {width}")
        left_arm = pose[..., :9]
        right_arm = pose[..., 9:18]
        # One-channel zero tensor; it inherits dtype/device from the input and
        # is used for both arms' gripper slots.
        zero_grip = torch.zeros_like(pose[..., :1])
        blocks = torch.cat([left_arm, zero_grip, right_arm, zero_grip], dim=-1)
        return _pad32(blocks)  # blocks: two 10-wide arm blocks, 20 wide total

    def from32(self, actions32: torch.Tensor) -> torch.Tensor:
        """Drop the two gripper slots (indices 9 and 19) and return 18 channels.

        The rotation-column values are returned exactly as stored, with no
        re-orthonormalisation.
        """
        packed = _ensure_bsd(actions32)
        return torch.cat([packed[..., 0:9], packed[..., 10:19]], dim=-1)
def _matrix_to_xyz6d(mats: np.ndarray) -> np.ndarray:
    """Convert SE3 matrices to translation + continuous 6D rotation.

    The 6D rotation representation (Zhou et al. / 6DRepNet) keeps the first
    two columns of the rotation matrix; the translation is prepended.

    args:
        mats: (B, 4, 4) array (or array-like) of SE3 transformation matrices
    returns:
        (B, 9) np.array of [[x, y, z, c1x, c1y, c1z, c2x, c2y, c2z]] where
        c1 / c2 are the first / second columns of the rotation matrix.
        Floating inputs keep their dtype; any other dtype becomes float64.
    raises:
        ValueError: if the input is not shaped (B, 4, 4).
    """
    # Coerce before validating so array-likes (e.g. nested lists) get the
    # shape check instead of failing on a missing ``.ndim`` attribute.
    mats = np.asarray(mats)
    if mats.ndim != 3 or mats.shape[-2:] != (4, 4):
        raise ValueError(f"Expected (B, 4, 4) array, got shape {mats.shape}")

    dtype = mats.dtype if np.issubdtype(mats.dtype, np.floating) else np.float64

    xyz = mats[:, :3, 3]
    c1 = mats[:, :3, 0]
    c2 = mats[:, :3, 1]

    return np.concatenate([xyz, c1, c2], axis=-1).astype(dtype, copy=False)


def _xyz6d_to_matrix(xyz6d: np.ndarray) -> np.ndarray:
    """Inverse of :func:`_matrix_to_xyz6d`.

    Rebuilds a rotation matrix from the first two (possibly non-orthonormal)
    columns via Gram-Schmidt: normalise ``c1``, remove its component from
    ``c2`` and normalise, then ``c3 = c1 x c2``. Norms are floored at
    ``eps = 1e-8`` so zero-length columns do not divide by zero. Degenerate
    inputs (zero or parallel columns) therefore do not raise, but the result
    is not a valid rotation.

    NOTE(review): per the original author's note this is meant to mirror the
    torch implementation ``egomimic.utils.action_utils._reconstruct_R_from_cols``
    (same eps floor, same column order) so train / eval / deploy agree. That
    function is not visible here; keep the two in sync when editing either.

    args:
        xyz6d: (B, 9) np.array (or array-like) of [[x, y, z, c1(3), c2(3)]]
    returns:
        (B, 4, 4) array of SE3 transformation matrices. Floating inputs keep
        their dtype; any other dtype becomes float64.
    raises:
        ValueError: if the input is not shaped (B, 9).
    """
    # Accept array-likes, consistent with _matrix_to_xyz6d.
    xyz6d = np.asarray(xyz6d)
    if xyz6d.ndim != 2 or xyz6d.shape[-1] != 9:
        raise ValueError(f"Expected (B, 9) array, got shape {xyz6d.shape}")

    B = xyz6d.shape[0]
    dtype = xyz6d.dtype if np.issubdtype(xyz6d.dtype, np.floating) else np.float64
    xyz6d = xyz6d.astype(dtype, copy=False)

    eps = 1e-8
    xyz = xyz6d[:, :3]
    c1 = xyz6d[:, 3:6]
    c2 = xyz6d[:, 6:9]

    # Gram-Schmidt on the two stored columns.
    c1n = c1 / np.clip(np.linalg.norm(c1, axis=-1, keepdims=True), eps, None)
    proj = np.sum(c2 * c1n, axis=-1, keepdims=True) * c1n
    c2o = c2 - proj
    c2n = c2o / np.clip(np.linalg.norm(c2o, axis=-1, keepdims=True), eps, None)
    c3n = np.cross(c1n, c2n)

    # Start from identity so the bottom row is [0, 0, 0, 1]; broadcast_to is
    # read-only, hence the copy before writing.
    mats = np.broadcast_to(np.eye(4, dtype=dtype), (B, 4, 4)).copy()
    mats[:, :3, 0] = c1n
    mats[:, :3, 1] = c2n
    mats[:, :3, 2] = c3n
    mats[:, :3, 3] = xyz
    return mats
Both norm-stat bounds +# checking and the eval MSE split consume this so they agree on which channel is +# which. +BIMANUAL_CARTESIAN_LAYOUTS = { + 12: { # human ypr: [L xyz ypr | R xyz ypr] + "xyz": (0, 1, 2, 6, 7, 8), + "rot": (3, 4, 5, 9, 10, 11), + "grip": (), + }, + 14: { # robot ypr: [L xyz ypr g | R xyz ypr g] + "xyz": (0, 1, 2, 7, 8, 9), + "rot": (3, 4, 5, 10, 11, 12), + "grip": (6, 13), + }, + 18: { # human 6d: [L xyz c1 c2 | R xyz c1 c2] + "xyz": (0, 1, 2, 9, 10, 11), + "rot": (3, 4, 5, 6, 7, 8, 12, 13, 14, 15, 16, 17), + "grip": (), + }, + 20: { # robot 6d: [L xyz c1 c2 g | R xyz c1 c2 g] + "xyz": (0, 1, 2, 10, 11, 12), + "rot": (3, 4, 5, 6, 7, 8, 13, 14, 15, 16, 17, 18), + "grip": (9, 19), + }, +} + + +def bimanual_cartesian_layout(width: int) -> dict | None: + """Index layout for a bimanual cartesian action/proprio vector. + + Returns a dict with ``xyz`` / ``rot`` / ``grip`` index tuples, or ``None`` + if ``width`` is not a recognized native width (12/14 ypr, 18/20 6D). + """ + return BIMANUAL_CARTESIAN_LAYOUTS.get(int(width)) + + +def _sixd_cols_to_ypr(cols: np.ndarray) -> np.ndarray: + """Convert continuous 6D rotation columns to Euler ``ZYX`` (yaw, pitch, roll). + + ``cols`` is a ``(..., 6)`` array of ``[c1(3), c2(3)]`` (the first two + columns of a rotation matrix). Returns a ``(..., 3)`` ypr array. A proper + rotation is reconstructed with the same Gram-Schmidt as + :func:`_xyz6d_to_matrix` so the visualization matches what train / eval + reconstruct from the model's 6D output. + """ + cols = np.asarray(cols) + lead = cols.shape[:-1] + flat = cols.reshape(-1, 6) + if flat.shape[0] == 0: + return np.zeros((*lead, 3), dtype=flat.dtype) + # Reuse _xyz6d_to_matrix with a zero translation; only the rotation matters. 
+ xyz6d = np.concatenate( + [np.zeros((flat.shape[0], 3), dtype=flat.dtype), flat], axis=-1 + ) + mats = _xyz6d_to_matrix(xyz6d) + ypr = R.from_matrix(mats[:, :3, :3]).as_euler("ZYX", degrees=False) + return ypr.reshape(*lead, 3) + + def _split_action_pose(actions): # 14D layout: [L xyz ypr g, R xyz ypr g] # 12D layout: [L xyz ypr, R xyz ypr] + # 20D layout: [L xyz c1 c2 g, R xyz c1 c2 g] (continuous 6D rotation) + # 18D layout: [L xyz c1 c2, R xyz c1 c2] (continuous 6D rotation) if actions.shape[-1] == 14: left_xyz = actions[..., :3] left_ypr = actions[..., 3:6] @@ -224,6 +357,16 @@ def _split_action_pose(actions): left_ypr = actions[..., 3:6] right_xyz = actions[..., 6:9] right_ypr = actions[..., 9:12] + elif actions.shape[-1] == 20: + left_xyz = actions[..., :3] + left_ypr = _sixd_cols_to_ypr(actions[..., 3:9]) + right_xyz = actions[..., 10:13] + right_ypr = _sixd_cols_to_ypr(actions[..., 13:19]) + elif actions.shape[-1] == 18: + left_xyz = actions[..., :3] + left_ypr = _sixd_cols_to_ypr(actions[..., 3:9]) + right_xyz = actions[..., 9:12] + right_ypr = _sixd_cols_to_ypr(actions[..., 12:18]) else: raise ValueError(f"Unsupported action dim {actions.shape[-1]}") return left_xyz, left_ypr, right_xyz, right_ypr diff --git a/norm_stats.md b/norm_stats.md new file mode 100644 index 000000000..0ac8652a9 --- /dev/null +++ b/norm_stats.md @@ -0,0 +1,66 @@ +# Norm Stats + +During a normal training run, norm stats are computed on the fly over +`norm_stats.sample_frac` of the data and cached to the run's `norm_stats/` +dir. Lower `norm_stats.sample_frac` (default `0.2`, set in +`train_zarr_cartesian.yaml`) when training on large datasets. + +## Computing norm stats standalone (no training) + +`egomimic/scripts/compute_norm_stats.py` computes and caches norm stats +**without** instantiating the model or trainer. It accepts the same Hydra +composition as `trainHydra.py` (`--config-name`, `data=`, `model=`, …), so +existing run recipes can be reused. 
It writes +`<save_cache_dir>/norm_stats/norm_stats.json`; point a follow-up training run +at it via `norm_stats.precomputed_norm_path=<path>` to skip recompute. + +Key options: + +- `norm_stats.sample_frac` — fraction of data to sample (default `0.2`). +- `norm_stats.num_workers` — dataloader workers; set to roughly match the + allocated CPUs. +- `norm_stats.save_cache_dir` — output dir (defaults to the Hydra run dir). +- The script **always recomputes** — it ignores any + `norm_stats.precomputed_norm_path` from the config. + +### Interactive + +``` bash +python egomimic/scripts/compute_norm_stats.py \ + --config-name=train_zarr_cartesian_pi \ + data=<data_config> \ + norm_stats.sample_frac=0.4 +``` + +### SLURM (CPU-only, via submitit) + +Use the CPU launcher `hydra/launcher/submitit_cpu_pace.yaml` and add `-m` to +trigger submitit. It defaults to `cpus_per_task: 12`; set +`norm_stats.num_workers` to match. Override `hydra.launcher.cpus_per_task` (and +`norm_stats.num_workers`) for a different CPU count. Example — 0.4 sample_frac +for `mecka_pi_fold_clothes_freeform` on the default 12 CPUs: + +``` bash +python egomimic/scripts/compute_norm_stats.py -m \ + --config-name=train_zarr_cartesian_pi \ + data=mecka_pi_fold_clothes_freeform \ + name=mecka_fold_clothes_freeform \ + description=norm_stats_frac0.4 \ + norm_stats.sample_frac=0.4 \ + norm_stats.num_workers=12 \ + hydra/launcher=submitit_cpu_pace +``` + +`name`/`description` set the output dir +(`logs/<name>/<description>_<timestamp>/`); the stats land under +`<run_dir>/0/norm_stats/norm_stats.json` (the `0` is the submitit multirun +subdir), with submitit logs in `<run_dir>/.submitit/<job_id>/`. + +Tip: validate the recipe cheaply on the login node before queuing with a +config-only dry run (no dataset load, no submission): + +``` bash +python egomimic/scripts/compute_norm_stats.py \ + --config-name=train_zarr_cartesian_pi data=<data_config> \ + norm_stats.sample_frac=0.4 hydra/launcher=submitit_cpu_pace --cfg job +```