-
Notifications
You must be signed in to change notification settings - Fork 29
Expand file tree
/
Copy pathtrain.env
More file actions
48 lines (42 loc) · 1.97 KB
/
train.env
File metadata and controls
48 lines (42 loc) · 1.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# Environment file for running HoloMotion scripts.
#
# Intended to be sourced so the exports below land in the calling shell.
# It needs `conda` on PATH, exports the variables used by the training
# scripts, and prints a short summary of the resulting values.

# Resolve the conda base directory separately from `export`:
# `export VAR=$(cmd)` always reports success, which would hide a missing or
# failing conda. Running it as an `if` condition also keeps a failure from
# aborting callers that source this file under `set -e`.
if ! CONDA_BASE=$(conda info --base); then
  echo "WARNING: 'conda info --base' failed; Train_CONDA_PREFIX will not point at a real conda env" >&2
fi
export CONDA_BASE
export Train_CONDA_PREFIX="$CONDA_BASE/envs/holomotion_train"

# CUDA toolkit location. The conda-provided toolkit and library paths are
# kept below as commented-out alternatives.
# export CUDA_HOME=$Train_CONDA_PREFIX
export CUDA_HOME=/usr/local/cuda
# export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$Train_CONDA_PREFIX/lib/:$Train_CONDA_PREFIX/lib/stubs"
# export LIBRARY_PATH="$Train_CONDA_PREFIX/lib/stubs:$Train_CONDA_PREFIX/lib:$LIBRARY_PATH"

# Settings for third-party tools and libraries used by the training stack.
export HYDRA_FULL_ERROR=1
export OMNI_KIT_ACCEPT_EULA="YES"
export ACCEPT_EULA="YES"
# export CUDA_LAUNCH_BLOCKING=1
export USE_NVRTC=1
export HDF5_USE_FILE_LOCKING=FALSE

# HoloMotion-specific knobs; the code that reads them is not part of this file.
export HOLOMOTION_ISAAC_STAGGER_SEC=1
export HOLOMOTION_HDF5_RDCC_NBYTES=$((4 * 1024 * 1024)) # 4MB
export HOLOMOTION_HDF5_MAX_OPEN_SHARDS=16 # 16 shards

export TORCH_DISTRIBUTED_DEBUG=INFO
# export TORCHDYNAMO_VERBOSE=0

# Summary of the effective configuration.
# LD_LIBRARY_PATH and LIBRARY_PATH are not set by this file, so they are
# expanded with an empty default to stay safe when sourced under `set -u`.
echo "--------------------------------"
echo "Train_CONDA_PREFIX: $Train_CONDA_PREFIX"
echo "CUDA_HOME: $CUDA_HOME"
echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH:-}"
echo "LIBRARY_PATH: ${LIBRARY_PATH:-}"
echo "HYDRA_FULL_ERROR: $HYDRA_FULL_ERROR"
echo "OMNI_KIT_ACCEPT_EULA: $OMNI_KIT_ACCEPT_EULA"
echo "HDF5_USE_FILE_LOCKING: $HDF5_USE_FILE_LOCKING"
echo "HOLOMOTION_ISAAC_STAGGER_SEC: $HOLOMOTION_ISAAC_STAGGER_SEC"
echo "HOLOMOTION_HDF5_RDCC_NBYTES: $HOLOMOTION_HDF5_RDCC_NBYTES"
echo "HOLOMOTION_HDF5_MAX_OPEN_SHARDS: $HOLOMOTION_HDF5_MAX_OPEN_SHARDS"
echo "--------------------------------"
# Graceful shutdown function for training scripts.
#
# Usage: scripts must set TRAIN_PID to the PID of the training process and
# register the handler with: trap cleanup SIGINT SIGTERM
#
# Globals:
#   TRAIN_PID (read) - PID to signal. When unset or empty, nothing is
#                      signalled and the grace period is skipped.
# Outputs:
#   Progress messages on stdout. stderr is silenced only while the signals
#   are being sent and is back to its previous target when the function
#   returns.
# Returns:
#   0 always.
cleanup() {
  # Default to empty so the handler also works under `set -u` when the trap
  # fires before TRAIN_PID has been assigned.
  local pid="${TRAIN_PID:-}"

  echo ""
  echo "🛑 Cleanup triggered - shutting down training process ${pid}..."

  if [[ -n "${pid}" ]]; then
    # Suppress error messages during cleanup. The redirection is attached to
    # the group so the shell restores stderr on its own; an
    # `exec 2>/dev/null` ... `exec 2>&1` pair would instead leave stderr
    # pointing at stdout for the rest of the calling script.
    {
      if kill -TERM "${pid}"; then
        echo " ✓ Sent TERM signal to process ${pid}"
      fi
      # Grace period before going after the remaining children.
      sleep 2
      # NOTE(review): pkill -P matches by parent PID, so it presumably only
      # finds children while ${pid} itself is still alive - confirm this is
      # the intended behavior.
      if pkill -P "${pid}"; then
        echo " ✓ Killed child processes"
      fi
    } 2>/dev/null
  fi

  echo " ✓ Cleanup complete"
  return 0
}