diff --git a/.gitignore b/.gitignore index 0de09a269..a9f0c6894 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ *.pyc egomimic.egg-info/ *.hdf5 +*.out *.out.* egomimic/debug trained_models_highlevel/ @@ -44,4 +45,7 @@ annotations/** annotations_test/** log_conversion/** debug_model_inputs/** -temp_dir/** \ No newline at end of file +temp_dir/** +datasets/** +CLAUDE.md +apikey.txt \ No newline at end of file diff --git a/egomimic/hydra_configs/callbacks/checkpoints.yaml b/egomimic/hydra_configs/callbacks/checkpoints.yaml index 86d21a792..0018fb126 100644 --- a/egomimic/hydra_configs/callbacks/checkpoints.yaml +++ b/egomimic/hydra_configs/callbacks/checkpoints.yaml @@ -8,4 +8,9 @@ model_checkpoint: filename: "epoch_{epoch}" save_last: true save_top_k: -1 - every_n_epochs: 100 \ No newline at end of file + every_n_epochs: 50 + # Force the save to fire at train_epoch_end; without this Lightning defers + # to the end of the first validation loop, which (with + # check_val_every_n_epoch=100) means no ckpt is written for the first 100 + # epochs — leaving a long restart-from-scratch window if a job dies. 
+ save_on_train_epoch_end: true \ No newline at end of file diff --git a/egomimic/hydra_configs/data/cotrain_pi_base.yaml b/egomimic/hydra_configs/data/cotrain_pi_base.yaml index ab4c2c3fc..ba7f2c458 100644 --- a/egomimic/hydra_configs/data/cotrain_pi_base.yaml +++ b/egomimic/hydra_configs/data/cotrain_pi_base.yaml @@ -4,8 +4,8 @@ train_datasets: eva_bimanual: _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver resolver: - _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver - folder_path: ${paths.dataset_dir} + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3AnnotationCutoffEpisodeResolver + folder_path: /storage/home/hcoda1/5/agao81/r-dxu345-0/EgoVerse/datasets/pick_place_cotrain key_map: _target_: egomimic.rldb.embodiment.eva.Eva.get_keymap keymap_mode: cartesian @@ -19,8 +19,8 @@ train_datasets: aria_bimanual: _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver resolver: - _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver - folder_path: ${paths.dataset_dir} + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3AnnotationCutoffEpisodeResolver + folder_path: /storage/home/hcoda1/5/agao81/r-dxu345-0/EgoVerse/datasets/pick_place_cotrain key_map: _target_: egomimic.rldb.embodiment.human.Aria.get_keymap keymap_mode: cartesian @@ -34,30 +34,33 @@ train_datasets: valid_datasets: eva_bimanual: - _target_: ${train_datasets.eva_bimanual._target_} - resolver: ${train_datasets.eva_bimanual.resolver} - filters: ${train_datasets.eva_bimanual.filters} + _target_: ${...train_datasets.eva_bimanual._target_} + resolver: ${...train_datasets.eva_bimanual.resolver} + filters: ${...train_datasets.eva_bimanual.filters} mode: valid - valid_ratio: ${train_datasets.eva_bimanual.valid_ratio} + valid_ratio: ${...train_datasets.eva_bimanual.valid_ratio} aria_bimanual: - _target_: ${train_datasets.aria_bimanual._target_} - resolver: ${train_datasets.aria_bimanual.resolver} - filters: 
${train_datasets.aria_bimanual.filters} + _target_: ${...train_datasets.aria_bimanual._target_} + resolver: ${...train_datasets.aria_bimanual.resolver} + filters: ${...train_datasets.aria_bimanual.filters} mode: valid - valid_ratio: ${train_datasets.aria_bimanual.valid_ratio} + valid_ratio: ${...train_datasets.aria_bimanual.valid_ratio} train_dataloader_params: eva_bimanual: batch_size: 32 - num_workers: 6 + num_workers: 8 + timeout: 300 aria_bimanual: batch_size: 32 - num_workers: 6 - + num_workers: 8 + timeout: 300 valid_dataloader_params: eva_bimanual: batch_size: 32 - num_workers: 6 + num_workers: 8 + timeout: 300 aria_bimanual: batch_size: 32 - num_workers: 6 + num_workers: 8 + timeout: 300 diff --git a/egomimic/hydra_configs/data/eva_fold_clothes.yaml b/egomimic/hydra_configs/data/eva_fold_clothes.yaml new file mode 100644 index 000000000..ac93fca8e --- /dev/null +++ b/egomimic/hydra_configs/data/eva_fold_clothes.yaml @@ -0,0 +1,74 @@ +_target_: egomimic.pl_utils.pl_data_utils.MultiDataModuleWrapper + +train_datasets: + eva_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /storage/project/r-dxu345-0/shared/egoverseS3ZarrDatasets + key_map: + _target_: egomimic.rldb.embodiment.eva.Eva.get_keymap + keymap_mode: cartesian_pi + transform_list: + _target_: egomimic.rldb.embodiment.eva.Eva.get_transform_list + mode: cartesian + filters: + _target_: egomimic.rldb.filters.DatasetFilter + filter_lambdas: + - "lambda row: row.get('embodiment') == 'eva_bimanual'" + - "lambda row: row.get('task') == 'fold_clothes'" + - "lambda row: (row.get('processing_error') or '') == '' and (row.get('zarr_processing_error') or '') == ''" + - "lambda row: (row.get('num_frames') or -1) > 0" + - "lambda row: row.get('episode_hash') not in {'2025-09-20-18-44-29-000000', '2025-09-24-04-02-36-000000', '2025-10-13-00-26-50-123000', 
'2025-10-13-03-55-42-588000', '2025-10-29-22-02-32-491000', '2025-10-29-22-05-17-842000', '2025-10-29-22-06-00-422000', '2025-10-29-22-06-00-423000', '2025-11-11-22-57-35-210000', '2025-11-11-22-59-20-286000', '2026-01-20-21-28-58-868000'}" + mode: train + valid_ratio: 0.05 + aria_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /storage/project/r-dxu345-0/shared/egoverseS3ZarrDatasets + key_map: + _target_: egomimic.rldb.embodiment.human.Aria.get_keymap + keymap_mode: cartesian_pi + transform_list: + _target_: egomimic.rldb.embodiment.human.Aria.get_transform_list + mode: cartesian + filters: + _target_: egomimic.rldb.filters.DatasetFilter + filter_lambdas: + - "lambda row: row.get('embodiment') == 'aria_bimanual'" + - "lambda row: row.get('task') == 'fold_clothes'" + - "lambda row: (row.get('processing_error') or '') == '' and (row.get('zarr_processing_error') or '') == ''" + - "lambda row: (row.get('num_frames') or -1) > 0" + - "lambda row: row.get('episode_hash') not in {'2025-09-20-17-42-51-000000', '2025-09-20-18-44-29-000000', '2025-09-20-19-04-22-000000', '2025-09-20-20-23-06-000000', '2025-09-20-20-55-13-000000', '2025-09-21-21-35-21-000000', '2025-09-21-22-21-41-000000', '2025-09-22-03-09-27-000000', '2025-09-22-03-13-58-000000', '2025-09-22-21-00-06-000000', '2025-09-23-19-32-44-000000', '2025-09-23-21-58-00-000000', '2025-09-24-04-02-36-000000', '2025-09-24-04-07-27-000000', '2025-09-24-15-01-42-000000', '2025-09-25-18-17-20-000000', '2025-09-25-19-11-49-000000', '2025-09-26-20-12-50-000000', '2025-09-27-20-24-52-000000', '2025-09-27-20-49-54-000000', '2025-10-03-17-06-02-000000', '2025-10-12-01-38-11-933000', '2025-10-12-04-18-32-000000', '2025-10-12-16-27-30-000000', '2025-10-13-00-01-37-879000', '2025-10-13-00-26-50-123000', '2025-10-13-00-27-04-770000', '2025-10-13-02-24-10-000000', '2025-10-13-03-55-42-588000', 
'2025-10-13-03-56-52-773000', '2025-10-14-02-53-00-652000', '2025-10-14-03-46-49-000000', '2025-10-14-04-00-10-000000', '2025-10-14-04-15-30-000000', '2025-10-14-04-21-05-000000', '2025-10-14-04-29-58-059000', '2025-10-29-21-54-36-213000', '2025-10-29-22-00-54-821000', '2025-10-29-22-01-11-720000', '2025-10-29-22-01-58-834000', '2025-10-29-22-02-01-699000', '2025-10-29-22-02-23-166000', '2025-10-29-22-02-26-821000', '2025-10-29-22-02-29-525000', '2025-10-29-22-02-32-490000', '2025-10-29-22-02-32-491000', '2025-10-29-22-04-01-785000', '2025-10-29-22-04-05-568000', '2025-10-29-22-05-17-804000', '2025-10-29-22-05-17-816000', '2025-10-29-22-05-17-818000', '2025-10-29-22-05-17-820000', '2025-10-29-22-05-17-830000', '2025-10-29-22-05-17-842000', '2025-10-29-22-06-00-406000', '2025-10-29-22-06-00-414000', '2025-10-29-22-06-00-418000', '2025-10-29-22-06-00-422000', '2025-10-29-22-06-00-423000', '2025-10-29-22-06-00-426000', '2025-10-29-22-06-00-429000', '2025-10-29-22-06-00-436000', '2025-10-29-22-06-00-462000', '2025-11-11-21-41-21-185000', '2025-11-11-21-47-01-466000', '2025-11-11-21-50-28-322000', '2025-11-11-21-52-08-271000', '2025-11-11-21-52-24-163000', '2025-11-11-21-53-45-616000', '2025-11-11-21-58-53-436000', '2025-11-11-22-02-04-599000', '2025-11-11-22-12-22-396000', '2025-11-11-22-16-59-783000', '2025-11-11-22-31-19-721000', '2025-11-11-22-38-54-397000', '2025-11-11-22-39-07-435000', '2025-11-11-22-44-33-571000', '2025-11-11-22-45-09-166000', '2025-11-11-22-46-01-494000', '2025-11-11-22-46-39-752000', '2025-11-11-22-57-35-210000', '2025-11-11-22-59-20-286000', '2025-11-11-22-59-43-138000', '2025-11-11-23-05-19-609000', '2025-11-22-23-48-44-089000', '2025-11-23-19-45-26-311000', '2025-11-23-19-45-45-117000', '2025-11-24-20-23-30-277000', '2025-11-24-20-23-46-303000', '2025-11-24-20-27-24-975000', '2025-12-09-23-02-21-846000', '2026-01-08-21-58-04-000000', '2026-01-08-22-41-22-000000', '2026-01-09-20-31-25-000000', '2026-01-20-21-28-58-868000', 
'2026-01-20-21-30-45-970000', '2026-01-20-21-32-08-684000'}" + mode: train + valid_ratio: 0.05 + +valid_datasets: + eva_bimanual: + _target_: ${...train_datasets.eva_bimanual._target_} + resolver: ${...train_datasets.eva_bimanual.resolver} + filters: ${...train_datasets.eva_bimanual.filters} + mode: valid + valid_ratio: ${...train_datasets.eva_bimanual.valid_ratio} + aria_bimanual: + _target_: ${...train_datasets.aria_bimanual._target_} + resolver: ${...train_datasets.aria_bimanual.resolver} + filters: ${...train_datasets.aria_bimanual.filters} + mode: valid + valid_ratio: ${...train_datasets.aria_bimanual.valid_ratio} + +train_dataloader_params: + eva_bimanual: + batch_size: 32 + num_workers: 10 + aria_bimanual: + batch_size: 32 + num_workers: 10 +valid_dataloader_params: + eva_bimanual: + batch_size: 32 + num_workers: 10 + aria_bimanual: + batch_size: 32 + num_workers: 10 diff --git a/egomimic/hydra_configs/data/eva_fold_clothes_eva_only.yaml b/egomimic/hydra_configs/data/eva_fold_clothes_eva_only.yaml new file mode 100644 index 000000000..b8916a9ac --- /dev/null +++ b/egomimic/hydra_configs/data/eva_fold_clothes_eva_only.yaml @@ -0,0 +1,41 @@ +_target_: egomimic.pl_utils.pl_data_utils.MultiDataModuleWrapper + +train_datasets: + eva_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /storage/project/r-dxu345-0/shared/egoverseS3ZarrDatasets + key_map: + _target_: egomimic.rldb.embodiment.eva.Eva.get_keymap + keymap_mode: cartesian_pi + transform_list: + _target_: egomimic.rldb.embodiment.eva.Eva.get_transform_list + mode: cartesian + filters: + _target_: egomimic.rldb.filters.DatasetFilter + filter_lambdas: + - "lambda row: row.get('embodiment') == 'eva_bimanual'" + - "lambda row: row.get('task') == 'fold_clothes'" + - "lambda row: (row.get('processing_error') or '') == '' and (row.get('zarr_processing_error') or '') == ''" + - 
"lambda row: (row.get('num_frames') or -1) > 0" + - "lambda row: row.get('episode_hash') not in {'2025-09-20-18-44-29-000000', '2025-09-24-04-02-36-000000', '2025-10-13-00-26-50-123000', '2025-10-13-03-55-42-588000', '2025-10-29-22-02-32-491000', '2025-10-29-22-05-17-842000', '2025-10-29-22-06-00-422000', '2025-10-29-22-06-00-423000', '2025-11-11-22-57-35-210000', '2025-11-11-22-59-20-286000', '2026-01-20-21-28-58-868000'}" + mode: train + valid_ratio: 0.05 + +valid_datasets: + eva_bimanual: + _target_: ${...train_datasets.eva_bimanual._target_} + resolver: ${...train_datasets.eva_bimanual.resolver} + filters: ${...train_datasets.eva_bimanual.filters} + mode: valid + valid_ratio: ${...train_datasets.eva_bimanual.valid_ratio} + +train_dataloader_params: + eva_bimanual: + batch_size: 32 + num_workers: 10 +valid_dataloader_params: + eva_bimanual: + batch_size: 32 + num_workers: 10 diff --git a/egomimic/hydra_configs/data/obj_gen_pi_lang.yaml b/egomimic/hydra_configs/data/obj_gen_pi_lang.yaml new file mode 100644 index 000000000..08651b1dd --- /dev/null +++ b/egomimic/hydra_configs/data/obj_gen_pi_lang.yaml @@ -0,0 +1,33 @@ +defaults: + - cotrain_pi_base + - _self_ + +# Motion-generalization cotrain: annotated-only eva + annotated-only aria +# (base/object descriptions). PI-style camera keys for both embodiments. 
+ +train_datasets: + eva_bimanual: + resolver: + folder_path: /storage/home/hcoda1/5/agao81/r-dxu345-0/pick_place + key_map: + keymap_mode: cartesian_pi + filters: + _target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter + project_name: "dense-language" + filter_lambdas: + - "lambda row: row.get('task') == 'pick_place' and row.get('embodiment') == 'eva_bimanual' and (row.get('zarr_processed_path') or '') != '' and 'alignment' not in ((row.get('task_description') or '').lower()) and (row.get('episode_hash') or '') != '2026-04-22-02-30-32-296000'" + aria_bimanual: + resolver: + folder_path: /storage/home/hcoda1/5/agao81/r-dxu345-0/pick_place + key_map: + keymap_mode: cartesian_pi + filters: + _target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter + project_name: "dense-language" + filter_lambdas: + # Exclude these aria episodes: every base_0_rgb JPEG decode fails + # (entire episode is corrupted), exhausts random-retry budget at train time. + # 2026-04-26-00-17-53-000000 + # 2026-04-26-00-27-26-000000 + # 2026-05-01-02-52-58-000000 + - "lambda row: row.get('task') == 'pick_place' and row.get('embodiment') == 'aria_bimanual' and (row.get('zarr_processed_path') or '') != '' and any(s in ((row.get('task_description') or '').lower()) for s in ('base', 'object')) and (row.get('episode_hash') or '') not in ('2026-04-26-00-17-53-000000', '2026-04-26-00-27-26-000000', '2026-05-01-02-52-58-000000')" diff --git a/egomimic/hydra_configs/data/obj_gen_pi_lang_aria_no_objgen.yaml b/egomimic/hydra_configs/data/obj_gen_pi_lang_aria_no_objgen.yaml new file mode 100644 index 000000000..d80080507 --- /dev/null +++ b/egomimic/hydra_configs/data/obj_gen_pi_lang_aria_no_objgen.yaml @@ -0,0 +1,17 @@ +defaults: + - obj_gen_pi_lang + - _self_ + +# Cotrain variant of obj_gen_pi_lang: keeps the eva (robot) obj-gen subset +# intact, but removes the "object" slice from the aria (human) side — i.e. 
+# aria keeps annotated pick_place episodes whose task_description contains +# "base" but does NOT contain "object". Bad-JPEG aria episodes are still +# excluded. + +train_datasets: + aria_bimanual: + filters: + _target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter + project_name: "dense-language" + filter_lambdas: + - "lambda row: row.get('task') == 'pick_place' and row.get('embodiment') == 'aria_bimanual' and (row.get('zarr_processed_path') or '') != '' and 'base' in ((row.get('task_description') or '').lower()) and 'object' not in ((row.get('task_description') or '').lower()) and (row.get('episode_hash') or '') not in ('2026-04-26-00-17-53-000000', '2026-04-26-00-27-26-000000', '2026-05-01-02-52-58-000000')" diff --git a/egomimic/hydra_configs/data/obj_gen_pi_lang_robot_only.yaml b/egomimic/hydra_configs/data/obj_gen_pi_lang_robot_only.yaml new file mode 100644 index 000000000..c78c4966b --- /dev/null +++ b/egomimic/hydra_configs/data/obj_gen_pi_lang_robot_only.yaml @@ -0,0 +1,12 @@ +defaults: + - obj_gen_pi_lang + - _self_ + +# Robot-only variant of obj_gen_pi_lang: annotated eva (robot) data only, +# aria (human) data dropped. Inherits eva filters/resolver from obj_gen_pi_lang. + +train_datasets: + aria_bimanual: null + +valid_datasets: + aria_bimanual: null diff --git a/egomimic/hydra_configs/data/obj_gen_pi_lang_wristframe.yaml b/egomimic/hydra_configs/data/obj_gen_pi_lang_wristframe.yaml new file mode 100644 index 000000000..c5d58a3e5 --- /dev/null +++ b/egomimic/hydra_configs/data/obj_gen_pi_lang_wristframe.yaml @@ -0,0 +1,40 @@ +defaults: + - cotrain_pi_base + - _self_ + +# Wristframe variant of obj_gen_pi_lang: annotated-only eva + annotated-only +# aria (base/object descriptions). Actions are expressed in each wrist's own +# frame (cartesian_wristframe_ypr) instead of the head/camera frame. Pair with +# evaluator=eval_pi_wristframe so the revert transform projects predictions back to cam +# frame for the viz video. 
+ +train_datasets: + eva_bimanual: + resolver: + folder_path: /storage/home/hcoda1/5/agao81/r-dxu345-0/pick_place + key_map: + keymap_mode: cartesian_pi + transform_list: + mode: cartesian_wristframe_ypr + filters: + _target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter + project_name: "dense-language" + filter_lambdas: + - "lambda row: row.get('task') == 'pick_place' and row.get('embodiment') == 'eva_bimanual' and (row.get('zarr_processed_path') or '') != '' and 'alignment' not in ((row.get('task_description') or '').lower()) and (row.get('episode_hash') or '') != '2026-04-22-02-30-32-296000'" + aria_bimanual: + resolver: + folder_path: /storage/home/hcoda1/5/agao81/r-dxu345-0/pick_place + key_map: + keymap_mode: cartesian_pi + transform_list: + mode: cartesian_wristframe_ypr + filters: + _target_: egomimic.rldb.filters.ScaleAnnotationDatasetFilter + project_name: "dense-language" + filter_lambdas: + # Exclude these aria episodes: every base_0_rgb JPEG decode fails + # (entire episode is corrupted), exhausts random-retry budget at train time. + # 2026-04-26-00-17-53-000000 + # 2026-04-26-00-27-26-000000 + # 2026-05-01-02-52-58-000000 + - "lambda row: row.get('task') == 'pick_place' and row.get('embodiment') == 'aria_bimanual' and (row.get('zarr_processed_path') or '') != '' and any(s in ((row.get('task_description') or '').lower()) for s in ('base', 'object')) and (row.get('episode_hash') or '') not in ('2026-04-26-00-17-53-000000', '2026-04-26-00-27-26-000000', '2026-05-01-02-52-58-000000')" diff --git a/egomimic/hydra_configs/evaluator/eval_pi_camframe.yaml b/egomimic/hydra_configs/evaluator/eval_pi_camframe.yaml new file mode 100644 index 000000000..84405e7eb --- /dev/null +++ b/egomimic/hydra_configs/evaluator/eval_pi_camframe.yaml @@ -0,0 +1,12 @@ +defaults: + - viz@viz_func: pi_cartesian_lang + - _self_ + +_target_: egomimic.eval.eval_pi.PIEvalVideo + +# Cam-frame variant: use when the data config does NOT apply a wristframe +# transform (e.g. 
motion_gen_pi_lang / obj_gen_pi_lang with plain cartesian_pi +# keymap). Actions are already in cam (head) frame, so no revert is needed — +# applying the wrist->cam revert from eval_pi_wristframe.yaml to cam-frame +# actions shifts both gt and pred by a constant offset in the viz video. +transform_lists: {} diff --git a/egomimic/hydra_configs/evaluator/eval_pi_wristframe.yaml b/egomimic/hydra_configs/evaluator/eval_pi_wristframe.yaml new file mode 100644 index 000000000..db168c72f --- /dev/null +++ b/egomimic/hydra_configs/evaluator/eval_pi_wristframe.yaml @@ -0,0 +1,20 @@ +defaults: + - viz@viz_func: pi_cartesian_lang + - _self_ + +_target_: egomimic.eval.eval_pi.PIEvalVideo + +# Wristframe variant: use when the data config applies a wristframe transform +# (e.g. motion_gen_pi_lang_wristframe / obj_gen_pi_lang_wristframe with +# `transform_list: mode: cartesian_wristframe_ypr`). The model predicts in +# each wrist's local frame; these revert transforms project predictions and +# gt back to cam (head) frame for the cam-frame MSE and the viz video. Each +# value resolves to a list[Transform] via its ``_target_``. Must match the +# viz config mounted above. 
+transform_lists: + eva_bimanual: + _target_: egomimic.rldb.embodiment.eva._build_eva_bimanual_revert_eef_frame_transform_list + is_quat: false + aria_bimanual: + _target_: egomimic.rldb.embodiment.human._build_aria_cartesian_revert_eef_frame_transform_list + is_quat: false diff --git a/egomimic/hydra_configs/evaluator/viz/pi_cartesian_lang.yaml b/egomimic/hydra_configs/evaluator/viz/pi_cartesian_lang.yaml index 3b7859dc6..0d670f5f2 100644 --- a/egomimic/hydra_configs/evaluator/viz/pi_cartesian_lang.yaml +++ b/egomimic/hydra_configs/evaluator/viz/pi_cartesian_lang.yaml @@ -3,8 +3,8 @@ defaults: - _self_ eva_bimanual: - annotation_key: sampled_prompt + annotation_key: annotations aria_bimanual: - annotation_key: sampled_prompt + annotation_key: annotations scale_bimanual: - annotation_key: sampled_prompt + annotation_key: annotations diff --git a/egomimic/hydra_configs/logger/wandb.yaml b/egomimic/hydra_configs/logger/wandb.yaml index 6f574bd5f..4ad06ce91 100644 --- a/egomimic/hydra_configs/logger/wandb.yaml +++ b/egomimic/hydra_configs/logger/wandb.yaml @@ -5,7 +5,8 @@ wandb: # name: "" # name of the run (normally generated by wandb) save_dir: "${paths.output_dir}" offline: False - id: "${name}_${description}_${now:%Y-%m-%d_%H-%M-%S}" # pass correct id to resume experiment! 
+ id: "${name}_${description}_${oc.env:SLURM_JOB_ID,${now:%Y-%m-%d_%H-%M-%S}}" # pinned to SLURM_JOB_ID so Slurm requeues resume the same wandb run + resume: allow anonymous: null # enable anonymous logging project: "zarr_test" log_model: False # upload lightning ckpts diff --git a/egomimic/hydra_configs/model/pi0.5_base.yaml b/egomimic/hydra_configs/model/pi0.5_base.yaml index 1711ecc75..4c78c4b20 100644 --- a/egomimic/hydra_configs/model/pi0.5_base.yaml +++ b/egomimic/hydra_configs/model/pi0.5_base.yaml @@ -3,7 +3,7 @@ robomimic_model: _target_: egomimic.algo.pi.PI config: pytorch_training_precision: bfloat16 - pytorch_weight_path: /storage/home/hcoda1/4/paphiwetsa3/r-dxu345-0/projects/EgoVerse/egomimic/algo/pi_checkpoints/pi05_base_pytorch + pytorch_weight_path: /storage/home/hcoda1/5/agao81/r-dxu345-0/EgoVerse/egomimic/algo/pi_checkpoints/pi05_base_pytorch model: pi05: true action_dim: 32 diff --git a/egomimic/hydra_configs/train_zarr_cartesian.yaml b/egomimic/hydra_configs/train_zarr_cartesian.yaml index 31fe7bd20..92ed2724d 100644 --- a/egomimic/hydra_configs/train_zarr_cartesian.yaml +++ b/egomimic/hydra_configs/train_zarr_cartesian.yaml @@ -17,10 +17,11 @@ mode: train hydra: run: - # Dir should be experiment_name/description_{timestamp} - dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + # Pin to SLURM_JOB_ID so Slurm requeues land in the same dir and Lightning + # autoresume finds last.ckpt; fall back to a timestamp when run outside Slurm. 
+ dir: ./logs/${name}/${description}_${oc.env:SLURM_JOB_ID,${now:%Y-%m-%d_%H-%M-%S}} sweep: - dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + dir: ./logs/${name}/${description}_${oc.env:SLURM_JOB_ID,${now:%Y-%m-%d_%H-%M-%S}} launch_params: gpus_per_node: 1 diff --git a/egomimic/hydra_configs/trainer/ddp_pi.yaml b/egomimic/hydra_configs/trainer/ddp_pi.yaml index 731316953..6d4f95f59 100644 --- a/egomimic/hydra_configs/trainer/ddp_pi.yaml +++ b/egomimic/hydra_configs/trainer/ddp_pi.yaml @@ -7,5 +7,5 @@ accelerator: gpu devices: ${eval:'${launch_params.gpus_per_node} * ${launch_params.nodes}'} num_nodes: ${launch_params.nodes} sync_batchnorm: True -check_val_every_n_epoch: 200 +check_val_every_n_epoch: 100 num_sanity_val_steps: 0 \ No newline at end of file diff --git a/egomimic/rldb/embodiment/embodiment.py b/egomimic/rldb/embodiment/embodiment.py index cb7bbf4f9..2edbc0fb7 100644 --- a/egomimic/rldb/embodiment/embodiment.py +++ b/egomimic/rldb/embodiment/embodiment.py @@ -178,8 +178,10 @@ def viz_gt_preds( images = batch[image_key] actions = batch[action_key] - if annotation_key is not None: + if annotation_key is not None and annotation_key in batch: annotations = batch[annotation_key] + else: + annotation_key = None ims_list = [] images = _to_numpy(images) actions = _to_numpy(actions) diff --git a/egomimic/rldb/zarr/zarr_dataset_multi.py b/egomimic/rldb/zarr/zarr_dataset_multi.py index 069eb3a70..a9a1dac54 100644 --- a/egomimic/rldb/zarr/zarr_dataset_multi.py +++ b/egomimic/rldb/zarr/zarr_dataset_multi.py @@ -130,6 +130,13 @@ def _normalize_filter_row( if _is_missing_filter_value(normalized.get("is_deleted")): normalized["is_deleted"] = False + embodiment = _first_present( + normalized.get("embodiment"), + normalized.get("robot_type"), + ) + if embodiment is not None: + normalized["embodiment"] = embodiment + return normalized @@ -1723,7 +1730,9 @@ def _next(reason: str, key: str = "") -> int: data["embodiment"] = get_embodiment_id(self.embodiment) 
ep_name = Path(self.episode_path).name - data["episode_hash"] = ep_name[:-5] if ep_name.endswith(".zarr") else ep_name + data["episode_hash"] = ( + ep_name[:-5] if ep_name.endswith(".zarr") else ep_name + ) _ = origin # preserved for symmetry with prior API return data diff --git a/train.sbatch b/train.sbatch new file mode 100755 index 000000000..10adfa14f --- /dev/null +++ b/train.sbatch @@ -0,0 +1,90 @@ +#!/bin/bash +#SBATCH --account=gts-dxu345-rl2 +#SBATCH --qos=inferno +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=16 +#SBATCH --mem=128G +#SBATCH --gres=gpu:h200:4 +#SBATCH --time=3-00:00:00 +#SBATCH --job-name=egomimic +#SBATCH --output=slurm-%j.out + +# Usage: +# sbatch train.sbatch +# sbatch --gres=gpu:h200:4 --ntasks=4 train.sbatch +# +# Defaults to a 4-GPU h200 allocation with a single task (per CLAUDE.md: pi runs use +# h100/h200, everything else l40s — override --gres for l40s jobs). + +set -euo pipefail + +REPO_DIR="/storage/project/r-dxu345-0/agao81/EgoVerse" +VENV_PYTHON="${REPO_DIR}/emimic/bin/python" +APIKEY_FILE="${REPO_DIR}/apikey.txt" + +cd "${REPO_DIR}" + +# Source API keys (SCALE_API_KEY, OPENAI_API_KEY, ...). The Scale filter and +# annotation utilities read these from os.environ at instantiate time. +if [[ -f "${APIKEY_FILE}" ]]; then + set -a + # shellcheck disable=SC1090 + source "${APIKEY_FILE}" + set +a +else + echo "WARNING: ${APIKEY_FILE} not found — runs using Scale/OpenAI will fail." >&2 +fi + +echo "Job ${SLURM_JOB_ID} on $(hostname)" +echo "Python: ${VENV_PYTHON}" +echo "Overrides: $*" + +# Wandb fix: spawn its service in /tmp instead of the (slow) shared FS, and +# extend the service-startup poll timeout 30s → 180s. 
+export WANDB_DIR="/tmp/wandb_${SLURM_JOB_ID:-$$}" +export WANDB_CACHE_DIR="/tmp/wandb_cache_${SLURM_JOB_ID:-$$}" +export WANDB_CONFIG_DIR="/tmp/wandb_config_${SLURM_JOB_ID:-$$}" +export WANDB_SERVICE_WAIT=180 +mkdir -p "$WANDB_DIR" "$WANDB_CACHE_DIR" "$WANDB_CONFIG_DIR" +# Reduce GPU memory fragmentation under heavy allocs (paligemma + action expert). +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + +# Convert NCCL collective hangs into exceptions (with a flight-recorder dump) +# rather than silent freezes. Pairs with the heartbeat watchdog in pl_model. +export TORCH_NCCL_BLOCKING_WAIT=1 +export NCCL_ASYNC_ERROR_HANDLING=1 +export TORCH_NCCL_DUMP_ON_TIMEOUT=1 + +# Silence the tokenizer fork warning (and its known-buggy auto-disable path). +export TOKENIZERS_PARALLELISM=false + +# Optional: stage a dataset folder to node-local /tmp SSD before training. +# Removes shared-NFS reads from the hot path; combined with the worker +# timeout in dataloader_params this turns "silent NFS stall" into "clean +# requeue". Triggered when STAGE_FROM is exported. +# +# sbatch --export=ALL,STAGE_FROM=/path/to/zarr[,STAGE_EMBODIMENTS=eva_bimanual,aria_bimanual] \ +# train.sbatch +# +# rsyncs $STAGE_FROM -> /tmp/$SLURM_JOB_ID/$(basename $STAGE_FROM) once and +# appends `data.train_datasets.<emb>.resolver.folder_path=<staged dir>` to the +# hydra overrides for every comma-separated embodiment in STAGE_EMBODIMENTS. 
+STAGE_OVERRIDES=() +if [[ -n "${STAGE_FROM:-}" ]]; then + STAGE_DST="/tmp/${SLURM_JOB_ID:-$$}/$(basename "${STAGE_FROM}")" + echo "Staging dataset ${STAGE_FROM} -> ${STAGE_DST}" + mkdir -p "$(dirname "${STAGE_DST}")" + time rsync -a "${STAGE_FROM%/}/" "${STAGE_DST}/" + EMBS="${STAGE_EMBODIMENTS:-eva_bimanual,aria_bimanual}" + IFS=',' read -r -a EMB_ARR <<< "${EMBS}" + for emb in "${EMB_ARR[@]}"; do + STAGE_OVERRIDES+=("data.train_datasets.${emb}.resolver.folder_path=${STAGE_DST}") + done +fi + +if [[ ${#STAGE_OVERRIDES[@]} -gt 0 ]]; then + srun "${VENV_PYTHON}" egomimic/trainHydra.py "$@" "${STAGE_OVERRIDES[@]}" +else + srun "${VENV_PYTHON}" egomimic/trainHydra.py "$@" +fi