diff --git a/.gitignore b/.gitignore index e85860466..577c41703 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +aws *.pth *.pyc egomimic.egg-info/ diff --git a/CONTRIBUTING_DATA.md b/CONTRIBUTING_DATA.md index 8e953784a..0efc05d69 100644 --- a/CONTRIBUTING_DATA.md +++ b/CONTRIBUTING_DATA.md @@ -495,6 +495,9 @@ The `embodiment` field in the DB row and in `zarr.attrs` must be one of the foll | `scale_bimanual` | 12 | Scale AI EgoDex + bimanual | | `scale_right_arm` | 13 | Scale AI EgoDex + right arm | | `scale_left_arm` | 14 | Scale AI EgoDex + left arm | +| `microagi_bimanual` | 15 | MicroAGI egocentric capture + bimanual | +| `microagi_right_arm` | 16 | MicroAGI egocentric capture + right arm | +| `microagi_left_arm` | 17 | MicroAGI egocentric capture + left arm | **If your hardware is not in this list**, contact the consortium leads to register a new embodiment identifier before submitting data. @@ -514,6 +517,7 @@ s3://rldb/processed_v3//.zarr/ | `eva_*` | `eva` | | `mecka_*` | `mecka` | | `scale_*` | `scale` | +| `microagi_*` | `microagi` | Examples: ``` diff --git a/egomimic/hydra_configs/data/microagi_keypoints.yaml b/egomimic/hydra_configs/data/microagi_keypoints.yaml new file mode 100644 index 000000000..32b126ff1 --- /dev/null +++ b/egomimic/hydra_configs/data/microagi_keypoints.yaml @@ -0,0 +1,32 @@ +# MicroAGI keypoints (wrist-frame) data config; mirrors aria_keypoints.yaml. 
+_target_: egomimic.pl_utils.pl_data_utils.MultiDataModuleWrapper + +train_datasets: + microagi_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: ${paths.dataset_dir} + key_map: + _target_: egomimic.rldb.embodiment.human.Microagi.get_keymap + keymap_mode: keypoints + transform_list: + _target_: egomimic.rldb.embodiment.human.Microagi.get_transform_list + mode: keypoints_wristframe_ypr + filters: + _target_: egomimic.rldb.filters.DatasetFilter + filter_lambdas: + - "lambda row: row['embodiment'] == 'microagi_bimanual'" + mode: total + +valid_datasets: + microagi_bimanual: ${train_datasets.microagi_bimanual} + +train_dataloader_params: + microagi_bimanual: + batch_size: 32 + num_workers: 6 +valid_dataloader_params: + microagi_bimanual: + batch_size: 32 + num_workers: 6 diff --git a/egomimic/hydra_configs/evaluator/viz/keypoints.yaml b/egomimic/hydra_configs/evaluator/viz/keypoints.yaml index 87a15f545..8421dcac6 100644 --- a/egomimic/hydra_configs/evaluator/viz/keypoints.yaml +++ b/egomimic/hydra_configs/evaluator/viz/keypoints.yaml @@ -22,3 +22,9 @@ scale_bimanual: image_key: front_img_1 action_key: actions_keypoints mode: keypoints +microagi_bimanual: + _target_: egomimic.rldb.embodiment.human.Microagi.viz_gt_preds + _partial_: true + image_key: front_img_1 + action_key: actions_keypoints + mode: keypoints diff --git a/egomimic/hydra_configs/evaluator/viz/keypoints_wrist.yaml b/egomimic/hydra_configs/evaluator/viz/keypoints_wrist.yaml index 87a15f545..8421dcac6 100644 --- a/egomimic/hydra_configs/evaluator/viz/keypoints_wrist.yaml +++ b/egomimic/hydra_configs/evaluator/viz/keypoints_wrist.yaml @@ -22,3 +22,9 @@ scale_bimanual: image_key: front_img_1 action_key: actions_keypoints mode: keypoints +microagi_bimanual: + _target_: egomimic.rldb.embodiment.human.Microagi.viz_gt_preds + _partial_: true + image_key: front_img_1 + action_key: 
class Microagi(Human):
    """MicroAGI egocentric-capture embodiment.

    Provides the keymap that ties dataset-facing key names to the zarr
    arrays stored in MicroAGI episodes.
    """

    # Name of this embodiment's entry in the INTRINSICS lookup table.
    VIZ_INTRINSICS_KEY = "microagi"
    # NOTE(review): presumably the frame stride applied when sampling action
    # horizons; its consumer is not visible here -- confirm against Human.
    ACTION_STRIDE = 3

    @classmethod
    def _get_keymap(
        cls,
        keymap_mode: Literal["cartesian", "cartesian_pi", "keypoints"],
    ):
        """Return the keymap for ``keymap_mode``.

        Each entry maps an output key to a spec dict holding ``key_type``,
        ``zarr_key`` and, for entries read over a window, ``horizon``.

        Args:
            keymap_mode: ``"cartesian"`` / ``"cartesian_pi"`` select the
                end-effector-pose layout (they differ only in the name of
                the front camera key); ``"keypoints"`` selects the hand
                keypoint + wrist-pose layout.

        Returns:
            dict mapping output key names to their spec dicts.

        Raises:
            ValueError: If ``keymap_mode`` is not one of the supported modes
                (previously this silently returned ``None``).
        """
        # Layout is intentionally identical to Aria's; kept as its own copy so
        # MicroAGI is a Human sibling rather than coupled to Aria's keymap.
        if keymap_mode in ("cartesian", "cartesian_pi"):
            # VIZ_IMAGE_KEY is not defined in this class; presumably it is
            # inherited from Human.
            front_key = (
                "base_0_rgb" if keymap_mode == "cartesian_pi" else cls.VIZ_IMAGE_KEY
            )
            return {
                front_key: {
                    "key_type": "camera_keys",
                    "zarr_key": "images.front_1",
                },
                "right.action_ee_pose": {
                    "key_type": "action_keys",
                    "zarr_key": "right.obs_ee_pose",
                    "horizon": 30,
                },
                "left.action_ee_pose": {
                    "key_type": "action_keys",
                    "zarr_key": "left.obs_ee_pose",
                    "horizon": 30,
                },
                "right.obs_ee_pose": {
                    "key_type": "proprio_keys",
                    "zarr_key": "right.obs_ee_pose",
                },
                "left.obs_ee_pose": {
                    "key_type": "proprio_keys",
                    "zarr_key": "left.obs_ee_pose",
                },
                "obs_head_pose": {
                    "key_type": "proprio_keys",
                    "zarr_key": "obs_head_pose",
                },
            }

        if keymap_mode == "keypoints":
            return {
                cls.VIZ_IMAGE_KEY: {
                    "key_type": "camera_keys",
                    "zarr_key": "images.front_1",
                },
                "left.action_keypoints": {
                    "key_type": "action_keys",
                    "zarr_key": "left.obs_keypoints",
                    "horizon": 30,
                },
                "right.action_keypoints": {
                    "key_type": "action_keys",
                    "zarr_key": "right.obs_keypoints",
                    "horizon": 30,
                },
                # NOTE(review): the two wrist-pose "action" entries are typed
                # as proprio_keys yet carry a horizon like the action_keys
                # entries above. Left unchanged because the layout mirrors
                # Aria's -- confirm this is intended.
                "left.action_wrist_pose": {
                    "key_type": "proprio_keys",
                    "zarr_key": "left.obs_wrist_pose",
                    "horizon": 30,
                },
                "right.action_wrist_pose": {
                    "key_type": "proprio_keys",
                    "zarr_key": "right.obs_wrist_pose",
                    "horizon": 30,
                },
                "left.obs_keypoints": {
                    "key_type": "proprio_keys",
                    "zarr_key": "left.obs_keypoints",
                },
                "right.obs_keypoints": {
                    "key_type": "proprio_keys",
                    "zarr_key": "right.obs_keypoints",
                },
                "left.obs_wrist_pose": {
                    "key_type": "proprio_keys",
                    "zarr_key": "left.obs_wrist_pose",
                },
                "right.obs_wrist_pose": {
                    "key_type": "proprio_keys",
                    "zarr_key": "right.obs_wrist_pose",
                },
                "obs_head_pose": {
                    "key_type": "proprio_keys",
                    "zarr_key": "obs_head_pose",
                },
            }

        # Fail loudly instead of handing None to whatever consumes the keymap.
        raise ValueError(f"Unsupported keymap_mode for Microagi: {keymap_mode!r}")
7cb23f1dc..361783aa1 100644 --- a/egomimic/scripts/viz_language.py +++ b/egomimic/scripts/viz_language.py @@ -19,8 +19,9 @@ from egomimic.rldb.embodiment.embodiment import Embodiment from egomimic.rldb.embodiment.eva import Eva -from egomimic.rldb.embodiment.human import Aria, Mecka, Scale +from egomimic.rldb.embodiment.human import Aria, Mecka, Microagi, Scale from egomimic.utils.aws.aws_data_utils import load_env +from egomimic.utils.egomimicUtils import intrinsics_from_metadata from egomimic.utils.viz_utils import _prepare_viz_image OmegaConf.register_new_resolver("eval", eval) @@ -38,6 +39,9 @@ "mecka_bimanual": Mecka, "mecka_right_arm": Mecka, "mecka_left_arm": Mecka, + "microagi_bimanual": Microagi, + "microagi_right_arm": Microagi, + "microagi_left_arm": Microagi, } @@ -125,10 +129,16 @@ def _viz_batch( annotations: list[str], mode: str, viz_transform_list=None, + intrinsics=None, ) -> list: """Visualize one batch and return a list of uint8 HWC numpy frames.""" from egomimic.utils.type_utils import _to_numpy + if image_key not in batch: + matches = [k for k in batch if k.rsplit(".", 1)[-1] == image_key] + if matches: + image_key = matches[0] + if action_key in batch: vis_batch = embodiment_cls.viz_transformed_batch( batch, @@ -137,6 +147,7 @@ def _viz_batch( image_key=image_key, color="Greens", transform_list=viz_transform_list, + intrinsics=intrinsics, ) frames = vis_batch if isinstance(vis_batch, list) else [vis_batch] else: @@ -208,6 +219,9 @@ def _run_viz_for_datasets( file_counter = 0 print(f" {len(dataset.datasets)} episode(s) found") for ep_name, ep_ds in dataset.datasets.items(): + # Per-episode calibration (zarr attrs["intrinsics"]) when the + # episode carries it; None falls back to INTRINSICS[VIZ_INTRINSICS_KEY]. 
+ ep_intrinsics = intrinsics_from_metadata(getattr(ep_ds, "metadata", None)) ep_loader = torch.utils.data.DataLoader( ep_ds, batch_size=1, shuffle=False, num_workers=0 ) @@ -231,6 +245,7 @@ def _run_viz_for_datasets( carried_annotation, mode, viz_transform_list, + intrinsics=ep_intrinsics, ) except Exception as e: print(f" [warn] {ep_name} batch {batch_idx} failed: {e}") diff --git a/egomimic/utils/egomimicUtils.py b/egomimic/utils/egomimicUtils.py index ba73e521c..e78ad7588 100644 --- a/egomimic/utils/egomimicUtils.py +++ b/egomimic/utils/egomimicUtils.py @@ -249,14 +249,40 @@ "right": np.eye(4), }, } +# For accurate intrinsics use the per-episode metdata. +MICROAGI_INTRINSICS = np.array( + [ + [347.5209147135417, 0.0, 323.0985514322917, 0], + [0.0, 347.50667317708336, 177.64398193359373, 0], + [0.0, 0.0, 1.0, 0], + ] +) INTRINSICS = { "base": ARIA_INTRINSICS, "base_half": ARIA_INTRINSICS_HALF, "mecka": MECKA_INTRINSICS, "scale": SCALE_INTRINSICS, + "microagi": MICROAGI_INTRINSICS, } + +def intrinsics_from_metadata(metadata) -> np.ndarray | None: + """Build a 3x4 intrinsics matrix from episode zarr attrs, if present. + + attrs["intrinsics"] is either {"K": row-major 3x3, "width", ...} or a + bare 3x3 nested list, at the stored image resolution. Returns None when + the episode carries no calibration, so callers can fall back to the + per-embodiment INTRINSICS entry. 
+ """ + info = (metadata or {}).get("intrinsics") + if isinstance(info, dict): + info = info.get("K") + if info is None: + return None + K = np.asarray(info, dtype=np.float64).reshape(3, 3) + return np.concatenate([K, np.zeros((3, 1))], axis=1) + ARIA_T_RGB_CPF = np.array( [ [-0.99989084, 0.01251132, -0.00786028, 0.05686918], diff --git a/egomimic/utils/viz_utils.py b/egomimic/utils/viz_utils.py index f9d4723d1..de02f82f5 100644 --- a/egomimic/utils/viz_utils.py +++ b/egomimic/utils/viz_utils.py @@ -49,6 +49,20 @@ def _prepare_viz_image(img): return img +def _resolve_intrinsics(intrinsics, intrinsics_key): + """Prefer an explicit (per-episode) intrinsics matrix over the keyed constant. + + Accepts 3x3 or 3x4; pads 3x3 with a zero column since + cam_frame_to_cam_pixels expects 3x4. + """ + if intrinsics is None: + return INTRINSICS[intrinsics_key] + intrinsics = np.asarray(intrinsics, dtype=np.float64) + if intrinsics.shape == (3, 3): + intrinsics = np.concatenate([intrinsics, np.zeros((3, 1))], axis=1) + return intrinsics + + def _format_rotation_values(rot): rot = np.asarray(rot).reshape(-1) return ", ".join(f"{value:.2f}" for value in rot) @@ -139,14 +153,14 @@ def _viz_rotation_txt(image, actions, **kwargs): return vis -def _viz_traj(image, actions, intrinsics_key, **kwargs): +def _viz_traj(image, actions, intrinsics_key, intrinsics=None, **kwargs): color = kwargs.get("color", "Blues") alpha = kwargs.get("alpha", 1.0) if not ColorPalette.is_valid(color): raise ValueError(f"Invalid color palette: {color}") image = _prepare_viz_image(image) - intrinsics = INTRINSICS[intrinsics_key] + intrinsics = _resolve_intrinsics(intrinsics, intrinsics_key) left_xyz, _, right_xyz, _ = _split_action_pose(actions) base = image.copy() @@ -175,10 +189,10 @@ def _viz_traj(image, actions, intrinsics_key, **kwargs): return vis -def _viz_axes(image, actions, intrinsics_key, axis_len_m=0.04, **kwargs): +def _viz_axes(image, actions, intrinsics_key, axis_len_m=0.04, intrinsics=None, 
**kwargs): alpha = kwargs.get("alpha", 1.0) image = _prepare_viz_image(image) - intrinsics = INTRINSICS[intrinsics_key] + intrinsics = _resolve_intrinsics(intrinsics, intrinsics_key) left_xyz, left_ypr, right_xyz, right_ypr = _split_action_pose(actions) base = image.copy() vis = base.copy() @@ -276,13 +290,14 @@ def _viz_keypoints( colors, edge_ranges, dot_color=None, + intrinsics=None, **kwargs, ): """Visualize all 21 MANO keypoints per hand, projected onto the image.""" alpha = kwargs.get("alpha", 1.0) image = _prepare_viz_image(image) - intrinsics = INTRINSICS[intrinsics_key] + intrinsics = _resolve_intrinsics(intrinsics, intrinsics_key) base = image.copy() vis = base.copy()