-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtranslate-runner.sh
More file actions
68 lines (59 loc) · 2.83 KB
/
translate-runner.sh
File metadata and controls
68 lines (59 loc) · 2.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env bash
#
# translate-runner.sh -- helper sourced by the translate-sweep.sbatch job array.
#
# Defines run_translate_array_task, which maps one array task onto a single
# (method, translate_pair, app) combination and runs translate.py
# TRANSLATE_NUM times for that combination.
#
# Usage: source this file from translate-sweep.sbatch, then call
#   run_translate_array_task
#
# Environment read by the function:
#   TRANSLATE_APPS, TRANSLATE_PAIRS, TRANSLATE_METHODS, TRANSLATE_NUM
#       must be set and non-empty (the lists are defined in
#       translate-sweep.sbatch)
#   SLURM_ARRAY_TASK_ID  array index provided by Slurm; treated as 0 if unset
#   SLURM_JOB_ID, USER   expanded without a default, so they must be set
#
# Strict mode. Because this file is sourced, these options also apply to the
# sourcing script from this point on.
set -o errexit -o nounset -o pipefail
#######################################
# Run the translate.py sweep for one Slurm array task.
#
# The task id is decomposed, with the app index varying fastest, into one
# (method, translate_pair, app) combination:
#   method_idx = task_id / (n_pairs * n_apps)
#   pair_idx   = (task_id / n_apps) % n_pairs
#   app_idx    = task_id % n_apps
# translate.py is then invoked TRANSLATE_NUM times, with --output-id running
# from 0 to TRANSLATE_NUM - 1.
#
# Globals (read):
#   SLURM_ARRAY_TASK_ID  task index; defaults to 0 when unset or empty
#   SLURM_JOB_ID         required; used in cache dir names and keepalive id
#   USER                 required; names the per-user directory in /dev/shm
#   TRANSLATE_APPS       required; app names, split on IFS
#   TRANSLATE_PAIRS      required; "src,dst" model pairs, split on IFS
#   TRANSLATE_METHODS    required; method names, split on IFS
#   TRANSLATE_NUM        required; non-negative count of translate.py calls
# Globals (exported):
#   TORCHINDUCTOR_CACHE_DIR, VLLM_CACHE_ROOT
# Arguments:
#   None
# Outputs:
#   One "Task ..." summary line on stdout; diagnostics on stderr.
# Returns:
#   0 on success; 1 for an invalid task id, invalid count, or empty list;
#   the failing command's status if mkdir or python fails. A missing
#   required variable aborts through the ${VAR:?} expansion instead.
#######################################
run_translate_array_task() {
  local task_id="${SLURM_ARRAY_TASK_ID:-0}"
  local job_id="${SLURM_JOB_ID:?SLURM_JOB_ID not set}"
  local user_name="${USER:?USER not set}"

  # Task index decomposition — lists are defined in translate-sweep.sbatch.
  local -a app_names=()
  local -a translate_pairs=()
  local -a methods=()
  read -ra app_names <<< "${TRANSLATE_APPS:?TRANSLATE_APPS not set}"
  read -ra translate_pairs <<< "${TRANSLATE_PAIRS:?TRANSLATE_PAIRS not set}"
  read -ra methods <<< "${TRANSLATE_METHODS:?TRANSLATE_METHODS not set}"
  local num_translations="${TRANSLATE_NUM:?TRANSLATE_NUM not set}"

  # Reject non-numeric or negative values up front: a negative task id would
  # otherwise index the arrays from the end and silently pick the wrong entry.
  if [[ ! "${task_id}" =~ ^[0-9]+$ ]]; then
    printf 'run_translate_array_task: SLURM_ARRAY_TASK_ID must be a non-negative integer, got "%s"\n' \
      "${task_id}" >&2
    return 1
  fi
  if [[ ! "${num_translations}" =~ ^[0-9]+$ ]]; then
    printf 'run_translate_array_task: TRANSLATE_NUM must be a non-negative integer, got "%s"\n' \
      "${num_translations}" >&2
    return 1
  fi
  # Force base 10 so a leading zero is not parsed as octal.
  local task_num=$(( 10#${task_id} ))
  local call_count=$(( 10#${num_translations} ))

  local n_apps=${#app_names[@]}
  local n_pairs=${#translate_pairs[@]}
  local n_methods=${#methods[@]}
  # A whitespace-only list passes the :? check but yields zero elements,
  # which would cause a division by zero below.
  if (( n_apps == 0 || n_pairs == 0 || n_methods == 0 )); then
    printf 'run_translate_array_task: TRANSLATE_APPS, TRANSLATE_PAIRS and TRANSLATE_METHODS must each contain at least one entry\n' >&2
    return 1
  fi
  local n_tasks=$(( n_methods * n_pairs * n_apps ))
  if (( task_num >= n_tasks )); then
    printf 'run_translate_array_task: task id %s is out of range (%d combinations: %d methods x %d pairs x %d apps)\n' \
      "${task_id}" "${n_tasks}" "${n_methods}" "${n_pairs}" "${n_apps}" >&2
    return 1
  fi

  local method_idx=$(( task_num / (n_pairs * n_apps) ))
  local pair_idx=$(( (task_num / n_apps) % n_pairs ))
  local app_idx=$(( task_num % n_apps ))
  local method="${methods[method_idx]}"
  local translate_pair="${translate_pairs[pair_idx]}"
  local app_name="${app_names[app_idx]}"

  # Split "src,dst" without spawning subshells. Matches `cut -d, -f1` and
  # `cut -d, -f2`: fields past the second are ignored, and a pair without a
  # comma yields the whole string for both fields.
  local src_model="${translate_pair%%,*}"
  local dst_model="${translate_pair}"
  if [[ "${translate_pair}" == *,* ]]; then
    dst_model="${translate_pair#*,}"
    dst_model="${dst_model%%,*}"
  fi

  printf 'Task %s: method=%s src=%s dst=%s app=%s\n' \
    "${task_id}" "${method}" "${src_model}" "${dst_model}" "${app_name}"

  # Per-task isolated cache dirs in node-local memory (/dev/shm).
  # Each array task gets its own slot so concurrent tasks don't collide.
  local lmem_cache="/dev/shm/${user_name}/.cache"
  mkdir -p "${lmem_cache}" || return
  local slot="${task_id}"
  export TORCHINDUCTOR_CACHE_DIR="${lmem_cache}/torchinductor_${job_id}_${slot}"
  export VLLM_CACHE_ROOT="${lmem_cache}/vllm_${job_id}_${slot}"

  # Keepalive id unique to this array task. Per the original author's note it
  # lets the vLLM server persist across all translate.py calls below; that
  # mechanism lives outside this file.
  local keepalive_id="${job_id}_${slot}"

  # `i` is local so the loop counter does not leak into the sourcing script.
  local i
  for (( i = 0; i < call_count; i++ )); do
    # Stop at the first failing call, even if the caller suppressed `set -e`.
    python src/translate/translate.py \
      -i "targets/${app_name}/${src_model}/" \
      -o ../restate-results/ \
      -c "targets/${app_name}/${dst_model}/" \
      --method "${method}" \
      --src-model "${src_model}" \
      --dst-model "${dst_model}" \
      -n 1 \
      --output-id "${i}" \
      --app-name "${app_name}" \
      --vllm-environment ../serve/.venv/ \
      --vllm-yaml-config config/perlmutter-vllm-oss.yaml \
      --vllm-keepalive-id "${keepalive_id}" \
      --naive-backend vllm \
      --naive-llm-name openai/gpt-oss-120b \
      || return
  done
}