diff --git a/egomimic/algo/pi.py b/egomimic/algo/pi.py index abc7ede62..98a70791c 100644 --- a/egomimic/algo/pi.py +++ b/egomimic/algo/pi.py @@ -25,7 +25,12 @@ _to_minus1_1, ) from egomimic.rldb.embodiment.embodiment import get_embodiment, get_embodiment_id -from egomimic.utils.action_utils import ConverterRegistry +from egomimic.utils.action_utils import ( + PI05_CARTESIAN_ACTION_ENCODING_LEGACY, + PI05_CARTESIAN_ACTION_ENCODING_NORM_ROT_6D, + PI05_CARTESIAN_ACTION_ENCODING_RAW_ROT_6D, + ConverterRegistry, +) logger = logging.getLogger(__name__) # Ensure logger propagates to root logger and has appropriate level @@ -70,6 +75,7 @@ def __init__( state_num_bins: int = 256, control_mode: dict[str, str] | None = None, proprio_keys_for_prompt: list[str] | None = None, + action_encoding: str = PI05_CARTESIAN_ACTION_ENCODING_LEGACY, **kwargs, ): self.nets = nn.ModuleDict() @@ -103,6 +109,7 @@ def __init__( "pi_cam_keys", ["base_0_rgb", "left_wrist_0_rgb", "right_wrist_0_rgb"] ) self.config = config + self.action_encoding = action_encoding self.ac_keys = ac_keys @@ -291,6 +298,23 @@ def _tokenize_prompts(self, prompts: list[str]) -> dict: "token_ar_mask": attention_mask.clone().requires_grad_(False), } + def _action_stats(self, embodiment_id: int, ac_key: str) -> dict: + try: + return self.norm_stats.norm_stats[embodiment_id][ac_key] + except KeyError as exc: + raise KeyError( + f"Missing norm stats for action key {ac_key!r} " + f"and embodiment id {embodiment_id}" + ) from exc + + def _unnormalize_action( + self, action: torch.Tensor, embodiment_id: int, ac_key: str + ): + return self.norm_stats.unnormalize( + {ac_key: action.clone(), "embodiment": embodiment_id}, + embodiment_id, + )[ac_key].to(action.device) + @override def process_batch_for_training(self, batch): """ @@ -446,17 +470,41 @@ def forward_eval(self, batch): num_steps=self.num_steps, ) + pred_actions = pred_actions.clone() + predictions = OrderedDict() ref = _batch[ac_key] B, T, D = ref.shape converter = self.action_registry.get(embodiment_id, ac_key) - pred_actions_orig = converter.from32(pred_actions) - - pred = pred_actions_orig[:, :T, :D] - predictions[ac_key] = pred - - unnorm_actions = self.norm_stats.unnormalize(predictions, embodiment_id) + if self.action_encoding == PI05_CARTESIAN_ACTION_ENCODING_RAW_ROT_6D: + pred_actions_orig = converter.from32_raw_rotation( + pred_actions, + stats=self._action_stats(embodiment_id, ac_key), + norm_mode=self.norm_stats.norm_mode, + unnormalize_non_rotation=True, + ) + unnorm_actions = {ac_key: pred_actions_orig[:, :T, :D]} + elif self.action_encoding == PI05_CARTESIAN_ACTION_ENCODING_NORM_ROT_6D: + # Extract the normalized xyz+6D(+gripper) action, then + # unnormalize via the standard pipeline (stats were computed + # over the 6D representation) to get raw 6D actions. + pred_6d = converter.from32_norm_6d(pred_actions) + predictions[ac_key] = pred_6d[:, :T, :D] + unnorm_actions = self.norm_stats.unnormalize( + predictions, embodiment_id + ) + elif self.action_encoding == PI05_CARTESIAN_ACTION_ENCODING_LEGACY: + pred_actions_orig = converter.from32(pred_actions) + pred = pred_actions_orig[:, :T, :D] + predictions[ac_key] = pred + unnorm_actions = self.norm_stats.unnormalize( + predictions, embodiment_id + ) + else: + raise ValueError( + f"Unsupported PI0.5 action_encoding: {self.action_encoding!r}" + ) for key in unnorm_actions: unnorm_preds[f"{embodiment_name}_{key}"] = unnorm_actions[key] @@ -531,7 +579,25 @@ def _robomimic_to_pi_data( emb_id = get_embodiment_id(embodiment) # embodiment is a name string converter = self.action_registry.get(emb_id, ac_key) - action32 = converter.to32(action) + if self.action_encoding == PI05_CARTESIAN_ACTION_ENCODING_RAW_ROT_6D: + raw_action = self._unnormalize_action(action, emb_id, ac_key) + action32 = converter.to32_raw_rotation( + raw_action, + normalized_actions=action, + stats=self._action_stats(emb_id, ac_key), + norm_mode=self.norm_stats.norm_mode, + ) + elif self.action_encoding == PI05_CARTESIAN_ACTION_ENCODING_NORM_ROT_6D: + # Action is already a normalized xyz+6D(+gripper) chunk (the + # ypr->6D conversion happened in the CartesianYPRToRot6D data + # transform). Just pack it into the 32D vector. + action32 = converter.to32_norm_6d(action) + elif self.action_encoding == PI05_CARTESIAN_ACTION_ENCODING_LEGACY: + action32 = converter.to32(action) + else: + raise ValueError( + f"Unsupported PI0.5 action_encoding: {self.action_encoding!r}" + ) # OpenPI expects a fixed camera tuple. Human datasets only provide # `base_0_rgb`, so duplicate that view into the missing wrist slots and diff --git a/egomimic/rldb/embodiment/eva.py b/egomimic/rldb/embodiment/eva.py index 519510447..27bd548a1 100644 --- a/egomimic/rldb/embodiment/eva.py +++ b/egomimic/rldb/embodiment/eva.py @@ -6,6 +6,8 @@ from egomimic.rldb.zarr.action_chunk_transforms import ( ActionChunkCoordinateFrameTransform, BatchQuaternionPoseToYPR, + CartesianRot6DToYPR, + CartesianYPRToRot6D, ConcatKeys, DeleteKeys, InterpolateLinear, @@ -29,13 +31,31 @@ class Eva(Embodiment): @staticmethod def get_transform_list( mode: Literal[ - "cartesian", "cartesian_wristframe_ypr", "cartesian_wristframe_quat" + "cartesian", + "cartesian_6d", + "cartesian_wristframe_ypr", + "cartesian_wristframe_6d", + "cartesian_wristframe_quat", ], ) -> list[Transform]: if mode == "cartesian": return _build_eva_bimanual_transform_list(is_quat=True) + elif mode == "cartesian_6d": + # Camera-frame cartesian (14D xyz+ypr+gripper per arm) with the + # rotation re-expressed as the continuous 6D representation + # (20D xyz+6d+gripper per arm) for pi0.5 normalized-rot6d encoding. + return _build_eva_bimanual_transform_list(is_quat=True) + [ + CartesianYPRToRot6D(action_key="actions_cartesian") + ] elif mode == "cartesian_wristframe_ypr": return _build_eva_bimanual_eef_frame_transform_list(is_quat=False) + elif mode == "cartesian_wristframe_6d": + # Wrist-frame cartesian (14D xyz+ypr+gripper per arm) with the + # rotation re-expressed as the continuous 6D representation + # (20D) for pi0.5 normalized-rot6d encoding. + return _build_eva_bimanual_eef_frame_transform_list(is_quat=False) + [ + CartesianYPRToRot6D(action_key="actions_cartesian") + ] elif mode == "cartesian_wristframe_quat": return _build_eva_bimanual_eef_frame_transform_list(is_quat=True) @@ -131,6 +151,39 @@ def dinov3_keymap(cls): } +def _build_eva_cartesian_revert_6d_transform_list( + *, + action_key: str = "actions_cartesian", +) -> list[Transform]: + """Revert camera-frame 6D-rotation EVA cartesian actions back to ypr. + + Used by the cam-frame 6D evaluator: the action chunk is already in camera + frame (produced by the ``cartesian_6d`` transform mode), so only the + rotation representation is converted from xyz+6D (+gripper, 10/arm) back to + xyz+ypr (+gripper, 7/arm) so cam-frame MSE and the viz video see the same + ypr layout as the plain ``cartesian`` mode. + """ + return [CartesianRot6DToYPR(action_key=action_key)] + + +def _build_eva_cartesian_revert_6d_wristframe_transform_list( + *, + action_key: str = "actions_cartesian", +) -> list[Transform]: + """Revert wrist-frame 6D-rotation EVA actions back to camera-frame ypr. + + Two stages for the cam-frame 6D wristframe evaluator: (1) convert the action + rotation from xyz+6D (+gripper) back to xyz+ypr (+gripper) via + ``CartesianRot6DToYPR``; (2) project the wrist-frame ypr actions back into + camera frame using the standard eef-frame revert (which reads the proprio + ``observations.state.ee_pose``, left untouched as ypr by the 6D transform). + """ + return [ + CartesianRot6DToYPR(action_key=action_key), + *_build_eva_bimanual_revert_eef_frame_transform_list(is_quat=False), + ] + + def _build_eva_bimanual_revert_eef_frame_transform_list( *, action_key: str = "actions_cartesian", diff --git a/egomimic/rldb/zarr/action_chunk_transforms.py b/egomimic/rldb/zarr/action_chunk_transforms.py index 0388d386a..3b4d19cd0 100644 --- a/egomimic/rldb/zarr/action_chunk_transforms.py +++ b/egomimic/rldb/zarr/action_chunk_transforms.py @@ -28,9 +28,11 @@ _matrix_to_xyz, _matrix_to_xyzwxyz, _matrix_to_xyzypr, + _rot6d_to_ypr, _xyz_to_matrix, _xyzwxyz_to_matrix, _xyzypr_to_matrix, + _ypr_to_rot6d, wxyz_to_xyzw, xyzw_to_wxyz, ) @@ -387,6 +389,101 @@ def transform(self, batch: dict) -> dict: return batch +class CartesianYPRToRot6D(Transform): + """Convert a bimanual cartesian action chunk from per-arm xyz+ypr(+gripper) + to per-arm xyz+rot6d(+gripper). + + ``rot6d`` is the continuous 6D rotation representation = the first two + columns of the rotation matrix, packed as [col0(3), col1(3)] (see + :func:`egomimic.utils.pose_utils._ypr_to_rot6d`). This matches the column + convention of the ``to32``/``from32`` packers in + ``egomimic.utils.action_utils``, so the resulting per-arm layout maps + directly into the pi0.5 32D action blocks. + + Input layouts (last dim): + 12 -> [L xyz ypr, R xyz ypr] -> 18 [L xyz 6d, R xyz 6d] + 14 -> [L xyz ypr g, R xyz ypr g] -> 20 [L xyz 6d g, R xyz 6d g] + + Preserves the numpy/tensor type of the input (like ``PadGripperZeros``). + """ + + def __init__( + self, action_key: str = "actions_cartesian", output_key: str | None = None + ): + self.action_key = action_key + self.output_key = output_key or action_key + + def transform(self, batch: dict) -> dict: + actions = batch[self.action_key] + is_tensor = isinstance(actions, torch.Tensor) + arr = actions.cpu().numpy() if is_tensor else np.asarray(actions) + D = arr.shape[-1] + if D == 14: + l_xyz, l_ypr, l_g = arr[..., 0:3], arr[..., 3:6], arr[..., 6:7] + r_xyz, r_ypr, r_g = arr[..., 7:10], arr[..., 10:13], arr[..., 13:14] + out = np.concatenate( + [l_xyz, _ypr_to_rot6d(l_ypr), l_g, r_xyz, _ypr_to_rot6d(r_ypr), r_g], + axis=-1, + ) + elif D == 12: + l_xyz, l_ypr = arr[..., 0:3], arr[..., 3:6] + r_xyz, r_ypr = arr[..., 6:9], arr[..., 9:12] + out = np.concatenate( + [l_xyz, _ypr_to_rot6d(l_ypr), r_xyz, _ypr_to_rot6d(r_ypr)], + axis=-1, + ) + else: + raise ValueError( + f"CartesianYPRToRot6D expects last-dim 12 or 14, got {arr.shape} " + f"for '{self.action_key}'" + ) + batch[self.output_key] = torch.from_numpy(out) if is_tensor else out + return batch + + +class CartesianRot6DToYPR(Transform): + """Inverse of :class:`CartesianYPRToRot6D`: per-arm xyz+rot6d(+gripper) -> + xyz+ypr(+gripper). + + Input layouts (last dim): + 18 -> [L xyz 6d, R xyz 6d] -> 12 [L xyz ypr, R xyz ypr] + 20 -> [L xyz 6d g, R xyz 6d g] -> 14 [L xyz ypr g, R xyz ypr g] + """ + + def __init__( + self, action_key: str = "actions_cartesian", output_key: str | None = None + ): + self.action_key = action_key + self.output_key = output_key or action_key + + def transform(self, batch: dict) -> dict: + actions = batch[self.action_key] + is_tensor = isinstance(actions, torch.Tensor) + arr = actions.cpu().numpy() if is_tensor else np.asarray(actions) + D = arr.shape[-1] + if D == 20: + l_xyz, l_6d, l_g = arr[..., 0:3], arr[..., 3:9], arr[..., 9:10] + r_xyz, r_6d, r_g = arr[..., 10:13], arr[..., 13:19], arr[..., 19:20] + out = np.concatenate( + [l_xyz, _rot6d_to_ypr(l_6d), l_g, r_xyz, _rot6d_to_ypr(r_6d), r_g], + axis=-1, + ) + elif D == 18: + l_xyz, l_6d = arr[..., 0:3], arr[..., 3:9] + r_xyz, r_6d = arr[..., 9:12], arr[..., 12:18] + out = np.concatenate( + [l_xyz, _rot6d_to_ypr(l_6d), r_xyz, _rot6d_to_ypr(r_6d)], + axis=-1, + ) + else: + raise ValueError( + f"CartesianRot6DToYPR expects last-dim 18 or 20, got {arr.shape} " + f"for '{self.action_key}'" + ) + batch[self.output_key] = torch.from_numpy(out) if is_tensor else out + return batch + + class CartesianWithGripperCoordinateTransform(Transform): def __init__( self, @@ -535,12 +632,8 @@ def transform(self, batch: dict) -> dict: ) pad_shape = (*arr.shape[:-1], 1) pad = np.zeros(pad_shape, dtype=arr.dtype) - padded = np.concatenate( - (arr[..., :6], pad, arr[..., 6:], pad), axis=-1 - ) - batch[self.action_key] = ( - torch.from_numpy(padded) if is_tensor else padded - ) + padded = np.concatenate((arr[..., :6], pad, arr[..., 6:], pad), axis=-1) + batch[self.action_key] = torch.from_numpy(padded) if is_tensor else padded return batch diff --git a/egomimic/robot/rollout.py b/egomimic/robot/rollout.py index d3548c59b..d788e2995 100644 --- a/egomimic/robot/rollout.py +++ b/egomimic/robot/rollout.py @@ -264,10 +264,29 @@ def __init__( self.debug_actions = None self.resampled_action_len = resampled_action_len self.debug = debug - self.transform_list = Eva.get_transform_list(mode="cartesian_wristframe_ypr") + self.transform_list = self._build_transform_list_from_config() self.annotation = None self.collate_fn = annotation_collate self._proprio_debug_printed = False + # Prediction visualizer. Built from ``evaluator.viz_func`` in the + # checkpoint's .hydra/config.yaml so the rollout uses whatever viz + # the training pipeline declared (image_key / action_key / mode). + self.viz_func = self._build_viz_func_from_config() + # Two independent viz modes — both off by default; toggle from the + # intervention menu. ``viz_enabled`` saves at the intrinsics' native + # 640x480 (matches what training viz uses). ``viz_model_enabled`` + # saves at 224x224-with-pad — i.e. the exact tensor the model sees + # after resize_with_pad_torch — and uses scaled intrinsics so the + # projection still lands on the right pixels at that resolution. + self.viz_enabled = False + self.viz_model_enabled = False + self.viz_model_target = 224 + # Revert transform_list. For wrist-frame models, the training + # pipeline declares an ``evaluator.transform_lists.`` block + # that converts model output from wrist frame back to cam frame + # before viz/MSE. The rollout needs the same conversion so the + # cam→base post-processing and the viz projection both work. + self.revert_transform_list = self._build_revert_transform_from_config() if annotation_path is not None: if not os.path.isfile(annotation_path): print(f"[rollout] WARNING: annotation file not found: {annotation_path} (continuing without annotation)") @@ -296,7 +315,21 @@ def _load_checkpoint_cfg(cls, ckpt_path): @classmethod def _patch_checkpoint_paths(cls, ckpt_path): """Rewrite pytorch_weight_path in the checkpoint's saved config - to point to the local base model weights. Returns (patched_path, cfg).""" + to point to the local base model weights. Returns (patched_path, cfg). + + Memory note: torch.load loads the entire checkpoint into RAM (~14GB + for pi05). We immediately get a second copy when ModelWrapper.load_from_checkpoint + runs on the patched file. To avoid OOM we: + - Reuse an existing ``.patched`` file when present, skipping the + load+save entirely on subsequent launches. + - Explicitly ``del`` the in-RAM checkpoint and trigger gc before + returning so the patched copy is freed before the main load runs. + """ + patched_path = ckpt_path + ".patched" + if os.path.isfile(patched_path): + print(f"[rollout] Reusing existing patched checkpoint: {patched_path}") + return patched_path, None + import gc import torch as _torch from omegaconf import OmegaConf cfg, ckpt = cls._load_checkpoint_cfg(ckpt_path) @@ -307,27 +340,31 @@ def _patch_checkpoint_paths(cls, ckpt_path): config = robomimic.get("config", {}) old_path = config.get("pytorch_weight_path") if old_path is None or old_path == cls.LOCAL_WEIGHT_PATH: + del ckpt + gc.collect() return ckpt_path, cfg print(f"[rollout] Patching pytorch_weight_path: {old_path} -> {cls.LOCAL_WEIGHT_PATH}") config["pytorch_weight_path"] = cls.LOCAL_WEIGHT_PATH ckpt["hyper_parameters"]["config_tree"] = OmegaConf.create(cfg) - patched_path = ckpt_path + ".patched" _torch.save(ckpt, patched_path) print(f"[rollout] Patched checkpoint saved to {patched_path}") - return patched_path, cfg + del ckpt, cfg + gc.collect() + return patched_path, None def _apply_annotation_to_algo(self): - """Wire the rollout-time annotation into the PI algo. + """Wire the rollout-time annotation into the loaded algo. + + Duck-typed: sets whichever of the standard annotation knobs the + model actually has, so this works for PI (``sampling_mode``), + QWEN-HPT (``annotation_sampling_mode``), and any future variant + that follows the same naming conventions. Algos that have none of + these attributes are left untouched. - The algo was loaded with its trained-in ``annotation_key`` / - ``sampling_mode`` / ``default_prompt``. Override them so the prompt - the user supplies via --annotation-path is what actually gets - tokenized: - - ``annotation_key="annotations"`` matches the key we stuff into + - ``annotation_key="annotations"`` matches the key inserted into each per-step sample in ``process_obs_for_transform_list``. - - ``sampling_mode="first"`` makes inference deterministic — there's - only ever one annotation per rollout, but if a list ever shows up - we always pick the same element. + - ``sampling_mode="first"`` / ``annotation_sampling_mode="first"`` + make inference deterministic. - ``default_prompt=self.annotation`` is the fallback path for edge cases (e.g. the annotations key gets dropped). """ @@ -338,11 +375,392 @@ def _apply_annotation_to_algo(self): model.annotation_key = "annotations" if hasattr(model, "sampling_mode"): model.sampling_mode = "first" + if hasattr(model, "annotation_sampling_mode"): + model.annotation_sampling_mode = "first" if self.annotation is not None and hasattr(model, "default_prompt"): model.default_prompt = self.annotation + def _build_transform_list_from_config(self): + """Build the rollout transform_list from the training config so the + live observations are pre-processed in the same coordinate frame / + action representation the model was trained on. Reads the eva + embodiment's ``transform_list`` block from the .hydra/config.yaml + next to the checkpoint. Falls back to the legacy hardcoded + ``cartesian_wristframe_ypr`` if nothing is found. + """ + import yaml + ckpt_dir = os.path.dirname(self.policy_path) + candidates = [ + os.path.join(ckpt_dir, "..", ".hydra", "config.yaml"), + os.path.join(ckpt_dir, ".hydra", "config.yaml"), + ] + cfg = None + for p in candidates: + p = os.path.normpath(p) + if os.path.isfile(p): + with open(p) as f: + cfg = yaml.safe_load(f) + break + if cfg is None: + print("[rollout] WARNING: no .hydra/config.yaml found — falling back to mode='cartesian_wristframe_ypr'") + return Eva.get_transform_list(mode="cartesian_wristframe_ypr") + + # Pull the eva embodiment's transform_list block out of either + # ``data.train_datasets.eva_bimanual.resolver.transform_list`` (the + # MultiDataModuleWrapper layout) or a similar nested path. We only + # need ``mode`` (or to detect a direct _target_ to the builder fn). + train_datasets = ( + (cfg.get("data") or {}).get("train_datasets") or {} + ) + eva_block = ( + train_datasets.get("eva_bimanual") + or train_datasets.get("eva_right_arm") + or train_datasets.get("eva_left_arm") + or {} + ) + resolver = eva_block.get("resolver") or {} + tl = resolver.get("transform_list") or {} + + target = tl.get("_target_", "") + mode = tl.get("mode") + + # Mode-based call: Eva.get_transform_list(mode=...) + if "Eva.get_transform_list" in target and mode: + print(f"[rollout] Using transform_list mode='{mode}' (from {os.path.relpath(p)})") + return Eva.get_transform_list(mode=mode) + + # Direct call to the canonical eva-bimanual builder: equivalent to mode='cartesian' + if target.endswith("_build_eva_bimanual_transform_list"): + print(f"[rollout] Using mode='cartesian' (config calls _build_eva_bimanual_transform_list)") + return Eva.get_transform_list(mode="cartesian") + + print(f"[rollout] WARNING: could not parse eva transform_list from config (target={target!r}, mode={mode!r}) — falling back to 'cartesian_wristframe_ypr'") + return Eva.get_transform_list(mode="cartesian_wristframe_ypr") + + def _build_viz_func_from_config(self): + """Instantiate the prediction visualizer from ``evaluator.viz_func`` + in the training .hydra/config.yaml. Returns a callable taking + ``(predictions, batch)`` and producing a numpy image stack, or + ``None`` if no viz_func is declared / the config can't be read. + """ + import yaml + from hydra.utils import instantiate + ckpt_dir = os.path.dirname(self.policy_path) + candidates = [ + os.path.normpath(os.path.join(ckpt_dir, "..", ".hydra", "config.yaml")), + os.path.normpath(os.path.join(ckpt_dir, ".hydra", "config.yaml")), + ] + self._viz_image_key = None + self._viz_action_key = "actions_cartesian" + self._viz_annotation_key = None + cfg = None + for p in candidates: + if os.path.isfile(p): + with open(p) as f: + cfg = yaml.safe_load(f) + break + if cfg is None: + return None + viz_block = ((cfg.get("evaluator") or {}).get("viz_func") or {}) + # Pick the eva entry that matches the active arm config. + for emb_name in ("eva_bimanual", "eva_right_arm", "eva_left_arm"): + entry = viz_block.get(emb_name) + if entry: + try: + fn = instantiate(entry) + except Exception as e: + print(f"[rollout] WARNING: failed to instantiate viz_func from config: {e}") + return None + print( + f"[rollout] viz_func loaded: {entry.get('_target_')} " + f"image_key={entry.get('image_key')!r} " + f"action_key={entry.get('action_key')!r} " + f"mode={entry.get('mode')!r}" + ) + self._viz_image_key = entry.get("image_key") + self._viz_action_key = entry.get("action_key", "actions_cartesian") + self._viz_annotation_key = entry.get("annotation_key") + return fn + return None + + def _build_revert_transform_from_config(self): + """Instantiate the revert transform_list from + ``evaluator.transform_lists.`` in the training + .hydra/config.yaml. The revert takes wrist-frame model output and + produces cam-frame actions using the current ``observations.state.ee_pose`` + as the reference. Returns a list of Transform objects, or ``None`` + if the config has no transform_lists block (i.e. the model was + trained directly in cam frame and no revert is needed). + """ + import yaml + from hydra.utils import instantiate + ckpt_dir = os.path.dirname(self.policy_path) + candidates = [ + os.path.normpath(os.path.join(ckpt_dir, "..", ".hydra", "config.yaml")), + os.path.normpath(os.path.join(ckpt_dir, ".hydra", "config.yaml")), + ] + cfg = None + for p in candidates: + if os.path.isfile(p): + with open(p) as f: + cfg = yaml.safe_load(f) + break + if cfg is None: + return None + block = ((cfg.get("evaluator") or {}).get("transform_lists") or {}) + for emb_name in ("eva_bimanual", "eva_right_arm", "eva_left_arm"): + entry = block.get(emb_name) + if entry: + try: + fn = instantiate(entry) + except Exception as e: + print(f"[rollout] WARNING: failed to instantiate revert transform_list: {e}") + return None + print( + f"[rollout] revert transform_list loaded: " + f"{entry.get('_target_')!r}" + ) + return fn + return None + + def _apply_revert_to_actions(self, preds, batch_unnorm): + """Apply the revert transform to convert wrist-frame model output + to cam frame. Reads ``observations.state.ee_pose`` from + ``batch_unnorm`` as the reference frame and returns a new preds + tensor with the same shape as the input. No-op if no revert is + configured. + """ + if self.revert_transform_list is None: + return preds + from egomimic.rldb.embodiment.embodiment import Embodiment + obs_key = "observations.state.ee_pose" + if obs_key not in batch_unnorm: + print( + f"[rollout] WARNING: '{obs_key}' missing from batch — " + "cannot revert wristframe actions" + ) + return preds + # Build a minimal batch with the predictions plugged in as the + # action chunk plus the obs_ee_pose reference. apply_transform + # splits per-sample and re-batches. + pred_batch = { + "actions_cartesian": preds.detach().cpu().float(), + obs_key: batch_unnorm[obs_key], + } + reverted = Embodiment.apply_transform(pred_batch, self.revert_transform_list) + out = reverted["actions_cartesian"] + if not isinstance(out, torch.Tensor): + out = torch.as_tensor(out) + return out.to(preds.device, dtype=preds.dtype) + + def _save_viz_model_res(self, batch_for_model, preds, embodiment_name, step_i): + """Save the prediction viz at the model's input resolution + (224x224 with padding, mirroring resize_with_pad_torch). Uses + scaled-and-padded intrinsics so the cam-frame xyz projection still + lands on the correct pixel even though the image is small. + Written to ``debug/viz_model_.png`` (separate from the + ``viz_.png`` files produced by the standard viz mode). + """ + if self.viz_func is None: + print("[rollout] viz_model toggle is on but no viz_func is configured — skipping") + return + try: + import torch.nn.functional as F + from egomimic.utils import egomimicUtils + + batch = dict(batch_for_model) + img_key = self._viz_image_key + if not (img_key and img_key in batch): + print(f"[rollout] viz_model: '{img_key}' not in batch — skipping") + return + img_t = batch[img_key] + if isinstance(img_t, torch.Tensor): + if img_t.dim() == 3: + img_t = img_t.unsqueeze(0) + if img_t.shape[1] != 3 and img_t.shape[-1] == 3: + img_t = img_t.permute(0, 3, 1, 2) + src_h, src_w = img_t.shape[-2:] + + # resize_with_pad to target x target + target = int(self.viz_model_target) + ratio = max(src_w / target, src_h / target) + resized_h = int(src_h / ratio) + resized_w = int(src_w / ratio) + img_resized = F.interpolate( + img_t.float(), size=(resized_h, resized_w), + mode="bilinear", align_corners=False, + ) + pad_h0 = (target - resized_h) // 2 + pad_h1 = target - resized_h - pad_h0 + pad_w0 = (target - resized_w) // 2 + pad_w1 = target - resized_w - pad_w0 + img_padded = F.pad( + img_resized, (pad_w0, pad_w1, pad_h0, pad_h1), + mode="constant", value=0, + ) + batch[img_key] = img_padded + + # Scale intrinsics to match the resize+pad. The base intrinsics + # are calibrated for cx*2 x cy*2 (e.g. 640x480 for ARIA). + # Scale them up to the source camera resolution first, then + # apply the resize-with-pad scaling on top. + orig_K = egomimicUtils.INTRINSICS["base"].copy() + ref_w = float(orig_K[0, 2] * 2.0) + ref_h = float(orig_K[1, 2] * 2.0) + cam_scale_x = src_w / ref_w + cam_scale_y = src_h / ref_h + fx = orig_K[0, 0] * cam_scale_x + fy = orig_K[1, 1] * cam_scale_y + cx = orig_K[0, 2] * cam_scale_x + cy = orig_K[1, 2] * cam_scale_y + new_fx, new_fy = fx / ratio, fy / ratio + new_cx = cx / ratio + pad_w0 + new_cy = cy / ratio + pad_h0 + scaled_K = np.array([ + [new_fx, 0.0, new_cx, 0.0], + [0.0, new_fy, new_cy, 0.0], + [0.0, 0.0, 1.0, 0.0], + ]) + + # viz_gt_preds expects batch["embodiment"][0].item() to work. + emb = batch.get("embodiment") + if not isinstance(emb, torch.Tensor): + batch["embodiment"] = torch.tensor( + [int(self.embodiment_id)], dtype=torch.int64 + ) + predictions = { + f"{embodiment_name}_{self._viz_action_key}": preds.detach(), + } + # Swap in scaled intrinsics for the duration of the viz_func call, + # then restore. viz_func reads INTRINSICS["base"] internally. + # ``pred_alpha=0.0`` makes the red prediction overlay fully + # transparent — model-res viz shows the image as the model sees + # it, without the prediction trajectory drawn on top. + try: + egomimicUtils.INTRINSICS["base"] = scaled_K + ims = self.viz_func(predictions, batch, pred_alpha=0.0) + finally: + egomimicUtils.INTRINSICS["base"] = orig_K + + ims = np.asarray(ims) + out_im = ims[0] if ims.ndim == 4 else ims + out_im = cv2.cvtColor(out_im, cv2.COLOR_RGB2BGR) + out_dir = os.path.abspath("debug") + os.makedirs(out_dir, exist_ok=True) + out_path = os.path.join(out_dir, f"viz_model_{step_i:06d}.png") + cv2.imwrite(out_path, out_im) + print(f"[rollout] saved viz_model -> {out_path}") + except Exception as e: + print(f"[rollout] viz_model failed at step {step_i}: {e}") + + def _save_viz(self, batch_for_model, preds, embodiment_name, step_i): + """Render and save a per-inference prediction visualization to + ``debug/viz_.png``. Caller is expected to have already + unnormalized the batch and applied any revert transform_list so + both ``batch[actions_cartesian]`` and ``preds`` are in cam frame. + """ + if self.viz_func is None: + print("[rollout] viz toggle is on but no viz_func is configured — skipping") + return + try: + batch = dict(batch_for_model) + # viz_gt_preds projects cam-frame xyz via INTRINSICS["base"] + # (= ARIA_INTRINSICS, calibrated for 640x480). If the live + # camera publishes at a different resolution (e.g. configs.yaml + # has Aria front at 960x720), the projection lands in the wrong + # pixels even though the xyz values are correct. Resize the + # image to the intrinsics' native size so they match. + img_key = self._viz_image_key + if img_key and img_key in batch: + import torch.nn.functional as F + img_t = batch[img_key] + if isinstance(img_t, torch.Tensor): + # Force BCHW: collated transform_list image is [B, C, H, W]. + if img_t.dim() == 3: + img_t = img_t.unsqueeze(0) + if img_t.shape[1] != 3 and img_t.shape[-1] == 3: + img_t = img_t.permute(0, 3, 1, 2) + if img_t.shape[-2:] != (480, 640): + img_t = F.interpolate( + img_t.float(), size=(480, 640), + mode="bilinear", align_corners=False, + ) + batch[img_key] = img_t + # viz_gt_preds expects batch["embodiment"][0].item() to work, + # so make sure embodiment is a tensor with a batch dim. + emb = batch.get("embodiment") + if not isinstance(emb, torch.Tensor): + batch["embodiment"] = torch.tensor( + [int(self.embodiment_id)], dtype=torch.int64 + ) + # If the viz_func config sets ``annotation_key`` (the partial + # will then do ``batch[annotation_key]`` unconditionally), make + # sure the key exists. Use the loaded rollout annotation when + # present, else an empty string so the text overlay just draws + # blank. + ak = self._viz_annotation_key + if ak and ak not in batch: + batch[ak] = [self.annotation if self.annotation is not None else ""] + predictions = { + f"{embodiment_name}_{self._viz_action_key}": preds.detach(), + } + if not getattr(self, "_viz_debug_printed", False): + act_key = self._viz_action_key + gt = batch.get(act_key) + pred_t = predictions[f"{embodiment_name}_{act_key}"] + img = batch.get(self._viz_image_key) + print( + f"[rollout][viz-debug] image_key={self._viz_image_key!r} " + f"action_key={act_key!r}" + ) + if isinstance(img, torch.Tensor): + print( + f"[rollout][viz-debug] image shape={tuple(img.shape)} " + f"dtype={img.dtype} min={img.float().min().item():.3f} " + f"max={img.float().max().item():.3f}" + ) + if isinstance(pred_t, torch.Tensor): + pf = pred_t.float()[0] # (T, D) + T = pf.shape[0] + print( + f"[rollout][viz-debug] pred shape={tuple(pred_t.shape)} " + f"L_xyz t=0: {pf[0, :3].tolist()}\n" + f"[rollout][viz-debug] " + f"L_xyz t={T//2}: {pf[T//2, :3].tolist()}\n" + f"[rollout][viz-debug] " + f"L_xyz t={T-1}: {pf[-1, :3].tolist()}\n" + f"[rollout][viz-debug] " + f"R_xyz t=0: {pf[0, 7:10].tolist()}\n" + f"[rollout][viz-debug] " + f"R_xyz t={T-1}: {pf[-1, 7:10].tolist()}" + ) + if isinstance(gt, torch.Tensor): + gf = gt.float()[0] + print( + f"[rollout][viz-debug] GT (current EE held) " + f"L_xyz: {gf[0, :3].tolist()} " + f"R_xyz: {gf[0, 7:10].tolist()}" + ) + self._viz_debug_printed = True + ims = self.viz_func(predictions, batch) + ims = np.asarray(ims) + out_dir = os.path.abspath("debug") + os.makedirs(out_dir, exist_ok=True) + out_path = os.path.join(out_dir, f"viz_{step_i:06d}.png") + # viz_gt_preds returns RGB (matches the training-eval pipeline, + # which writes via TensorBoard). cv2.imwrite expects BGR, so + # swap channels here to avoid the inverted-colors save bug. + out_im = ims[0] if ims.ndim == 4 else ims + out_im = cv2.cvtColor(out_im, cv2.COLOR_RGB2BGR) + cv2.imwrite(out_path, out_im) + print(f"[rollout] saved viz -> {out_path}") + except Exception as e: + print(f"[rollout] viz failed at step {step_i}: {e}") + def _load_policy(self): + import gc patched_path, _ = self._patch_checkpoint_paths(self.policy_path) + gc.collect() policy = ModelWrapper.load_from_checkpoint( patched_path, weights_only=False, map_location="cpu" ) @@ -431,6 +849,28 @@ def rollout_step(self, i, obs): preds = self.policy.model.forward_eval(processed_batch)[ f"{embodiment_name}_actions_cartesian" ] + # Wrist-frame models: revert preds to cam frame BEFORE viz and + # BEFORE the cam→base post-processing. For cam-frame models + # (no transform_lists in config), revert is None and these + # calls are no-ops. + batch_for_viz = self.policy.model.norm_stats.unnormalize( + dict(transform_list_batch), self.embodiment_id + ) + if self.revert_transform_list is not None: + preds = self._apply_revert_to_actions(preds, batch_for_viz) + from egomimic.rldb.embodiment.embodiment import Embodiment + gt_only = { + k: v for k, v in batch_for_viz.items() + if k in ("actions_cartesian", "observations.state.ee_pose") + } + gt_reverted = Embodiment.apply_transform( + gt_only, self.revert_transform_list + ) + batch_for_viz = {**batch_for_viz, **gt_reverted} + if self.viz_enabled: + self._save_viz(batch_for_viz, preds, embodiment_name, i) + if self.viz_model_enabled: + self._save_viz_model_res(batch_for_viz, preds, embodiment_name, i) self.actions = preds.detach().cpu().numpy().squeeze() self.debug_actions = self.actions.copy() if self.cartesian: @@ -703,9 +1143,17 @@ def _enter_intervention(kp, policy, rollout_type): """ # Restore normal terminal so the user can type freely termios.tcsetattr(kp.fd, termios.TCSADRAIN, kp.old) + viz_state = ( + "ON" if (isinstance(policy, PolicyRollout) and policy.viz_enabled) else "OFF" + ) + viz_model_state = ( + "ON" if (isinstance(policy, PolicyRollout) and policy.viz_model_enabled) else "OFF" + ) print("\n--- INTERVENTION (rollout paused) ---") print(" c : continue rollout") print(" a : load new annotation file") + print(f" v : toggle prediction viz @ 640x480 (currently {viz_state})") + print(f" m : toggle prediction viz @ model res 224x224 (currently {viz_model_state})") print(" r : restart rollout") print(" q : quit") @@ -735,8 +1183,20 @@ def _enter_intervention(kp, policy, rollout_type): print("Annotation loading is only supported for policy rollouts.") continue policy.load_annotation(ann_path) + elif cmd == "v": + if rollout_type != "policy" or not isinstance(policy, PolicyRollout): + print("Prediction viz is only supported for policy rollouts.") + continue + policy.viz_enabled = not policy.viz_enabled + print(f"[rollout] viz now {'ON' if policy.viz_enabled else 'OFF'}") + elif cmd == "m": + if rollout_type != "policy" or not isinstance(policy, PolicyRollout): + print("Prediction viz is only supported for policy rollouts.") + continue + policy.viz_model_enabled = not policy.viz_model_enabled + print(f"[rollout] viz_model now {'ON' if policy.viz_model_enabled else 'OFF'}") else: - print(f"Unknown command: '{cmd}'. Use c / a / r / q.") + print(f"Unknown command: '{cmd}'. Use c / a / v / m / r / q.") try: with _KeyPoll() as kp: diff --git a/egomimic/utils/action_utils.py b/egomimic/utils/action_utils.py index 75c4fac11..a755f4983 100644 --- a/egomimic/utils/action_utils.py +++ b/egomimic/utils/action_utils.py @@ -1,7 +1,19 @@ -from typing import Dict, Tuple +from typing import Any, Dict, Tuple import torch +PI05_CARTESIAN_ACTION_ENCODING_RAW_ROT_6D = "cartesian_ypr_raw_rot6d" +PI05_CARTESIAN_ACTION_ENCODING_LEGACY = "legacy_normalized_ypr_rot6d" +# Actions arrive already in xyz+6D(+gripper) layout (the ypr->6D conversion is +# done by the ``CartesianYPRToRot6D`` data transform) and already normalized by +# the standard MultiDataset pipeline. The forward pass only *packs* the +# normalized 6D action into the 32D vector (see ``to32_norm_6d`` below). +PI05_CARTESIAN_ACTION_ENCODING_NORM_ROT_6D = "cartesian_normalized_rot6d" + +# Bimanual robot Cartesian layout: [x, y, z, yaw, pitch, roll, gripper] x 2. +ROBOT_BIMANUAL_CARTESIAN_ROT_DIMS = (3, 4, 5, 10, 11, 12) +ROBOT_BIMANUAL_CARTESIAN_NON_ROT_DIMS = (0, 1, 2, 6, 7, 8, 9, 13) + # ---------- registry that stores *objects* ---------- class ConverterRegistry: @@ -43,6 +55,77 @@ def _pad32(x: torch.Tensor) -> torch.Tensor: return x[..., :32] +def _stat_tensor(stats: dict[str, Any], key: str, ref: torch.Tensor) -> torch.Tensor: + value = torch.as_tensor(stats[key], device=ref.device, dtype=torch.float32) + return value.to(dtype=ref.dtype if ref.is_floating_point() else torch.float32) + + +def _apply_norm_one( + tensor: torch.Tensor, + stats: dict[str, Any], + norm_mode: str, +) -> torch.Tensor: + if norm_mode == "zscore": + mean = _stat_tensor(stats, "mean", tensor) + std = _stat_tensor(stats, "std", tensor) + return (tensor - mean) / (std + 1e-6) + if norm_mode == "minmax": + mn = _stat_tensor(stats, "min", tensor) + mx = _stat_tensor(stats, "max", tensor) + return 2.0 * ((tensor - mn) / (mx - mn + 1e-6)) - 1.0 + if norm_mode == "quantile": + q1 = _stat_tensor(stats, "quantile_1", tensor) + q99 = _stat_tensor(stats, "quantile_99", tensor) + return 2.0 * ((tensor - q1) / (q99 - q1 + 1e-6)) - 1.0 + raise ValueError(f"Invalid normalization mode: {norm_mode}") + + +def _apply_unnorm_one( + tensor: torch.Tensor, + stats: dict[str, Any], + norm_mode: str, +) -> torch.Tensor: + if norm_mode == "zscore": + mean = _stat_tensor(stats, "mean", tensor) + std = _stat_tensor(stats, "std", tensor) + return tensor * (std + 1e-6) + mean + if norm_mode == "minmax": + mn = _stat_tensor(stats, "min", tensor) + mx = _stat_tensor(stats, "max", tensor) + return (tensor + 1) * 0.5 * (mx - mn + 1e-6) + mn + if norm_mode == "quantile": + q1 = _stat_tensor(stats, "quantile_1", tensor) + q99 = _stat_tensor(stats, "quantile_99", tensor) + return (tensor + 1) * 0.5 * (q99 - q1 + 1e-6) + q1 + raise ValueError(f"Invalid normalization mode: {norm_mode}") + + +def _normalize_robot_bimanual_non_rot( + raw_actions: torch.Tensor, + stats: dict[str, Any], + norm_mode: str, +) -> torch.Tensor: + normalized = raw_actions.clone() + all_dims = _apply_norm_one(raw_actions, stats, norm_mode) + normalized[..., ROBOT_BIMANUAL_CARTESIAN_NON_ROT_DIMS] = all_dims[ + ..., ROBOT_BIMANUAL_CARTESIAN_NON_ROT_DIMS + ] + return normalized + + +def _unnormalize_robot_bimanual_non_rot( + model_actions: torch.Tensor, + stats: dict[str, Any], + norm_mode: str, +) -> torch.Tensor: + raw_actions = model_actions.clone() + all_dims = _apply_unnorm_one(model_actions, stats, norm_mode) + raw_actions[..., ROBOT_BIMANUAL_CARTESIAN_NON_ROT_DIMS] = all_dims[ + ..., ROBOT_BIMANUAL_CARTESIAN_NON_ROT_DIMS + ] + return raw_actions + + def _ypr_to_matrix(ypr: torch.Tensor, degrees: bool = False) -> torch.Tensor: if degrees: ypr = ypr * (torch.pi / 180.0) @@ -137,6 +220,53 @@ def to32(self, actions: torch.Tensor) -> torch.Tensor: def from32(self, actions32: torch.Tensor) -> torch.Tensor: raise NotImplementedError + def to32_raw_rotation( + self, + raw_actions: torch.Tensor, + *, + normalized_actions: torch.Tensor | None = None, + stats: dict[str, Any] | None = None, + norm_mode: str = "quantile", + ) -> torch.Tensor: + """Pack actions with raw YPR rotations and normalized non-rotation dims.""" + del normalized_actions, stats, norm_mode + raise NotImplementedError( + f"{type(self).__name__} does not support raw-rotation action encoding" + ) + + def from32_raw_rotation( + self, + actions32: torch.Tensor, + *, + stats: dict[str, Any] | None = None, + norm_mode: str = "quantile", + unnormalize_non_rotation: bool = False, + ) -> torch.Tensor: + """Decode actions whose 6D rotation columns represent raw YPR rotations.""" + del stats, norm_mode, unnormalize_non_rotation + raise NotImplementedError( + f"{type(self).__name__} does not support raw-rotation action decoding" + ) + + def to32_norm_6d(self, actions: torch.Tensor) -> torch.Tensor: + """Pack an already-normalized xyz+6D(+gripper) action into the 32D vector. + + The ypr->6D conversion happens upstream in the ``CartesianYPRToRot6D`` + data transform and the result is normalized by the standard data + pipeline, so this is a pure rearrange (no rotation math, no + normalization). + """ + raise NotImplementedError( + f"{type(self).__name__} does not support normalized-rot6d encoding" + ) + + def from32_norm_6d(self, actions32: torch.Tensor) -> torch.Tensor: + """Inverse of :meth:`to32_norm_6d`: extract the normalized xyz+6D(+gripper) + action from the 32D vector (pure rearrange).""" + raise NotImplementedError( + f"{type(self).__name__} does not support normalized-rot6d decoding" + ) + # ============================================================ # ROBOT CONVERTERS @@ -210,7 +340,7 @@ class RobotBimanualCartesianEuler(BaseActionConverter): 32-pack: left block 0..9, right block 10..19 """ - def to32(self, actions: torch.Tensor) -> torch.Tensor: + def to20(self, actions: torch.Tensor) -> torch.Tensor: actions = _ensure_bsd(actions) if actions.shape[-1] != 14: raise ValueError(f"RobotBimanual: expected 14-dim, got {actions.shape[-1]}") @@ -228,12 +358,19 @@ def to32(self, actions: torch.Tensor) -> torch.Tensor: R_c1, R_c2 = R_R[..., 0], R_R[..., 1] right_block = torch.cat([R_xyz, R_c1, R_c2, R_g], dim=-1) # (B,S,10) - return _pad32(torch.cat([left_block, right_block], dim=-1)) # (B,S,20+) -> 32 + return torch.cat([left_block, right_block], dim=-1) # (B,S,20) - def from32(self, actions32: torch.Tensor) -> torch.Tensor: - actions32 = _ensure_bsd(actions32) - Lb = actions32[..., 0:10] - Rb = actions32[..., 10:20] + def to32(self, actions: torch.Tensor) -> torch.Tensor: + return _pad32(self.to20(actions)) + + def from20(self, actions20: torch.Tensor) -> torch.Tensor: + actions20 = _ensure_bsd(actions20) + if actions20.shape[-1] < 20: + raise ValueError( + f"RobotBimanual: expected at least 20 dims, got {actions20.shape[-1]}" + ) + Lb = actions20[..., 0:10] + Rb = actions20[..., 10:20] # left L_xyz, L_c1, L_c2, L_g = Lb[..., 0:3], Lb[..., 3:6], Lb[..., 6:9], Lb[..., 9:10] @@ -249,6 +386,114 @@ def from32(self, actions32: torch.Tensor) -> torch.Tensor: R7 = torch.cat([R_xyz, R_ypr, R_g], dim=-1) return torch.cat([L7, R7], dim=-1) # (B,S,14) + def from32(self, actions32: torch.Tensor) -> torch.Tensor: + return self.from20(actions32) + + def to20_raw_rotation( + self, + raw_actions: torch.Tensor, + *, + normalized_actions: torch.Tensor | None = None, + stats: dict[str, Any] | None = None, + norm_mode: str = "quantile", + ) -> torch.Tensor: + raw_actions = _ensure_bsd(raw_actions) + if raw_actions.shape[-1] != 14: + raise ValueError( + f"RobotBimanual: expected 14-dim, got {raw_actions.shape[-1]}" + ) + if normalized_actions is None: + if stats is None: + raise ValueError( + "stats are required when normalized_actions is omitted" + ) + model_actions = _normalize_robot_bimanual_non_rot( + raw_actions, stats, norm_mode + ) + else: + normalized_actions = _ensure_bsd(normalized_actions).to(raw_actions.device) + if normalized_actions.shape != raw_actions.shape: + raise ValueError( + "normalized_actions must match raw_actions shape; got " + f"{tuple(normalized_actions.shape)} vs {tuple(raw_actions.shape)}" + ) + model_actions = raw_actions.clone() + model_actions[..., ROBOT_BIMANUAL_CARTESIAN_NON_ROT_DIMS] = ( + normalized_actions[..., ROBOT_BIMANUAL_CARTESIAN_NON_ROT_DIMS] + ) + return self.to20(model_actions) + + def from20_raw_rotation( + self, + actions20: torch.Tensor, + *, + stats: dict[str, Any] | None = None, + norm_mode: str = "quantile", + unnormalize_non_rotation: bool = False, + ) -> torch.Tensor: + model_actions = self.from20(actions20) + if not unnormalize_non_rotation: + return model_actions + if model_actions.shape[-1] != 14: + raise ValueError( + "RobotBimanual raw-rotation decoding expected 14D Cartesian actions; " + f"got {model_actions.shape[-1]} dims" + ) + if stats is None: + raise ValueError("stats are required to unnormalize non-rotation dims") + return _unnormalize_robot_bimanual_non_rot(model_actions, stats, norm_mode) + + def to32_raw_rotation( + self, + raw_actions: torch.Tensor, + *, + normalized_actions: torch.Tensor | None = None, + stats: dict[str, Any] | None = None, + norm_mode: str = "quantile", + ) -> torch.Tensor: + return _pad32( + self.to20_raw_rotation( + raw_actions, + normalized_actions=normalized_actions, + stats=stats, + norm_mode=norm_mode, + ) + ) + + def from32_raw_rotation( + self, + actions32: torch.Tensor, + *, + stats: dict[str, Any] | None = None, + norm_mode: str = "quantile", + unnormalize_non_rotation: bool = False, + ) -> torch.Tensor: + return self.from20_raw_rotation( + actions32, + stats=stats, + norm_mode=norm_mode, + unnormalize_non_rotation=unnormalize_non_rotation, + ) + + def to32_norm_6d(self, actions: torch.Tensor) -> torch.Tensor: + # actions: (B,S,20) = [L xyz(3) 6d(6) g(1), R xyz(3) 6d(6) g(1)] — already + # the canonical 32D block layout (left 0..9, right 10..19), just pad. + actions = _ensure_bsd(actions) + if actions.shape[-1] != 20: + raise ValueError( + f"RobotBimanual.to32_norm_6d expected 20-dim, got {actions.shape[-1]}" + ) + return _pad32(actions) + + def from32_norm_6d(self, actions32: torch.Tensor) -> torch.Tensor: + actions32 = _ensure_bsd(actions32) + if actions32.shape[-1] < 20: + raise ValueError( + f"RobotBimanual.from32_norm_6d expected >=20 dims, got " + f"{actions32.shape[-1]}" + ) + return actions32[..., 0:20] + # ============================================================ # HUMAN CONVERTERS @@ -345,3 +590,30 @@ def from32(self, actions32: torch.Tensor) -> torch.Tensor: R_R = _reconstruct_R_from_cols(R_c1, R_c2) R_ypr = _matrix_to_ypr(R_R) return torch.cat([L_xyz, L_ypr, R_xyz, R_ypr], dim=-1) # (B,S,12) + + def to32_norm_6d(self, actions: torch.Tensor) -> torch.Tensor: + # actions: (B,S,18) = [L xyz(3) 6d(6), R xyz(3) 6d(6)]. Human has no + # gripper, so insert a zero gripper slot at the end of each arm block to + # match the 32D block layout [xyz(3) c1(3) c2(3) g(1)] x 2. + actions = _ensure_bsd(actions) + if actions.shape[-1] != 18: + raise ValueError( + f"HumanBimanual.to32_norm_6d expected 18-dim, got {actions.shape[-1]}" + ) + L = actions[..., 0:9] + R = actions[..., 9:18] + g0 = torch.zeros_like(actions[..., :1]) + Lblock = torch.cat([L, g0], dim=-1) # (B,S,10) + Rblock = torch.cat([R, g0], dim=-1) # (B,S,10) + return _pad32(torch.cat([Lblock, Rblock], dim=-1)) + + def from32_norm_6d(self, actions32: torch.Tensor) -> torch.Tensor: + actions32 = _ensure_bsd(actions32) + if actions32.shape[-1] < 20: + raise ValueError( + f"HumanBimanual.from32_norm_6d expected >=20 dims, got " + f"{actions32.shape[-1]}" + ) + L = actions32[..., 0:9] # drop left gripper slot at idx 9 + R = actions32[..., 10:19] # drop right gripper slot at idx 19 + return torch.cat([L, R], dim=-1) # (B,S,18) diff --git a/egomimic/utils/pose_utils.py b/egomimic/utils/pose_utils.py index 0bbe0a6f7..e5870b83d 100644 --- a/egomimic/utils/pose_utils.py +++ b/egomimic/utils/pose_utils.py @@ -129,6 +129,61 @@ def _xyzypr_to_matrix(xyzypr: np.ndarray) -> np.ndarray: return mats +def _ypr_to_rot6d(ypr: np.ndarray) -> np.ndarray: + """Convert euler ypr to the continuous 6D rotation representation. + + args: + ypr: (..., 3) array of [yaw, pitch, roll] (radians, ZYX convention) + returns: + (..., 6) array = first two columns of the rotation matrix, + concatenated as [col0(3), col1(3)]. + + Matches the column convention used by the torch packers in + ``egomimic.utils.action_utils`` (``_ypr_to_matrix`` = Rz@Ry@Rx, and + ``to32`` taking ``R[..., 0]`` / ``R[..., 1]``). + """ + ypr = np.asarray(ypr) + if ypr.shape[-1] != 3: + raise ValueError(f"Expected (..., 3) ypr, got shape {ypr.shape}") + dtype = ypr.dtype if np.issubdtype(ypr.dtype, np.floating) else np.float64 + shape = ypr.shape[:-1] + flat = ypr.reshape(-1, 3).astype(np.float64) + mats = R.from_euler("ZYX", flat, degrees=False).as_matrix() # (N, 3, 3) + six = np.concatenate([mats[:, :, 0], mats[:, :, 1]], axis=-1) # cols 0,1 + return six.reshape(*shape, 6).astype(dtype, copy=False) + + +def _rot6d_to_ypr(six: np.ndarray) -> np.ndarray: + """Inverse of :func:`_ypr_to_rot6d`. + + args: + six: (..., 6) array = [col0(3), col1(3)] of a rotation matrix. + returns: + (..., 3) array of [yaw, pitch, roll] (radians, ZYX convention). + + Reconstructs a proper rotation via Gram-Schmidt (mirroring + ``_reconstruct_R_from_cols`` in ``action_utils``) before extracting euler + angles, so ``_rot6d_to_ypr(_ypr_to_rot6d(ypr)) == ypr``. + """ + six = np.asarray(six) + if six.shape[-1] != 6: + raise ValueError(f"Expected (..., 6) rot6d, got shape {six.shape}") + dtype = six.dtype if np.issubdtype(six.dtype, np.floating) else np.float64 + shape = six.shape[:-1] + flat = six.reshape(-1, 6).astype(np.float64) + c1 = flat[:, 0:3] + c2 = flat[:, 3:6] + eps = 1e-8 + c1n = c1 / np.clip(np.linalg.norm(c1, axis=-1, keepdims=True), eps, None) + proj = np.sum(c2 * c1n, axis=-1, keepdims=True) * c1n + c2o = c2 - proj + c2n = c2o / np.clip(np.linalg.norm(c2o, axis=-1, keepdims=True), eps, None) + c3n = np.cross(c1n, c2n) + mats = np.stack([c1n, c2n, c3n], axis=-1) # columns + ypr = R.from_matrix(mats).as_euler("ZYX", degrees=False) + return ypr.reshape(*shape, 3).astype(dtype, copy=False) + + def _matrix_to_xyzwxyz(mats: np.ndarray) -> np.ndarray: """ args: