modelscope · tastelikefeet · Jun 14, 2026 · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026
diff --git a/INSTALL_MEGATRON.sh b/INSTALL_MEGATRON.sh
@@ -5,6 +5,7 @@
 
 set -e  # Exit immediately on error
 export SETUPTOOLS_USE_DISTUTILS=local
+export UV_INDEX_URL=${UV_INDEX_URL:-https://mirrors.aliyun.com/pypi/simple/}
 echo "=========================================="
 echo "Starting deep learning dependencies installation..."
 echo "=========================================="
@@ -53,42 +54,46 @@ TORCH_CUDA_ARCH_LIST=$(get_cuda_arch "$GPU_NAME")
 export TORCH_CUDA_ARCH_LIST
 echo "Using CUDA architecture: $TORCH_CUDA_ARCH_LIST"
 
-# Install latest base packages
+# Pin vllm to the 0.21.x series (built with the CUDA 12 toolchain, which avoids CUDA 13 CUTLASS conflicts)
 echo ""
-echo "Installing peft, accelerate, transformers, modelscope..."
-pip install --upgrade peft accelerate transformers "modelscope[framework]" --no-cache-dir
+echo "Installing vllm 0.21..."
+uv pip install "vllm>=0.21,<0.22"
 
-# Install latest vllm
+# Install latest base packages
 echo ""
-echo "Installing latest vllm..."
-pip install --upgrade vllm --no-cache-dir
+echo "Installing peft, accelerate, transformers, modelscope..."
+uv pip install --upgrade peft accelerate transformers "modelscope[framework]"
 
 # Get site-packages path and install transformer_engine and megatron_core
 echo ""
 echo "Installing transformer_engine and megatron_core..."
 SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
 echo "Site-packages path: $SITE_PACKAGES"
 
-CUDNN_PATH=$SITE_PACKAGES/nvidia/cudnn \
-CPLUS_INCLUDE_PATH=$SITE_PACKAGES/nvidia/cudnn/include \
-pip install --no-build-isolation "transformer_engine[pytorch]" --no-cache-dir
+export CUDA_HOME="${SITE_PACKAGES}/nvidia/cu13"
+export PATH="$CUDA_HOME/bin:$PATH"
+export CPATH="$CUDA_HOME/include${CPATH:+:$CPATH}"  # ':+' form: no trailing ':' when unset (empty entry = cwd)
+export LIBRARY_PATH="$CUDA_HOME/lib${LIBRARY_PATH:+:$LIBRARY_PATH}"
+export LD_LIBRARY_PATH="$CUDA_HOME/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+uv pip install transformer_engine_torch --no-build-isolation
 
-pip install megatron_core mcore_bridge --no-cache-dir
+uv pip install megatron_core mcore_bridge
 
-# Install flash-attention (force local build)
+# Install flash-attention
+# Prefer prebuilt wheel; fall back to source build only if needed.
 echo ""
-echo "Installing flash-attention (local build for $GPU_NAME)..."
-TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" \
-MAX_JOBS=8 \
-FLASH_ATTENTION_FORCE_BUILD=TRUE \
-pip install flash-attn --no-build-isolation --no-cache-dir
+echo "Installing flash-attention..."
+export TORCH_CUDA_ARCH_LIST
+export MAX_JOBS=8
+pip install flash-attn --no-build-isolation --no-cache-dir || \
+    FLASH_ATTENTION_FORCE_BUILD=TRUE pip install flash-attn --no-build-isolation --no-cache-dir
 
-pip install flash-linear-attention -U --no-cache-dir
+uv pip install flash-linear-attention --upgrade
 
 # Install numpy
 echo ""
 echo "Installing numpy==2.2 and deep_gemm..."
-pip install numpy==2.2 --no-cache-dir
+uv pip install numpy==2.2
 
 # Verify installation
 echo ""

diff --git a/README.md b/README.md
@@ -18,7 +18,7 @@
 </p>
 
 <p align="center">
-        <a href="https://twinkle-kit.readthedocs.io/en/latest/">English Documentation</a> &nbsp ｜ &nbsp <a href="https://twinkle-kit.readthedocs.io/zh-cn/latest/">中文文档</a> &nbsp ｜ &nbsp <a href="https://modelscope.github.io/twinkle-web/">Twinkle Web</a> &nbsp
+        <a href="https://modelscope.github.io/twinkle-web/docs/">English Documentation</a> &nbsp; ｜ &nbsp; <a href="https://modelscope.github.io/twinkle-web/zh/docs/">中文文档</a> &nbsp; ｜ &nbsp; <a href="https://modelscope.github.io/twinkle-web/">Twinkle Web</a> &nbsp;
 </p>
 
 ## ✨ What is Twinkle?
@@ -100,11 +100,11 @@ sh INSTALL_MEGATRON.sh
 | DPO multi-LoRA training              | transformers    | [Script](cookbook/rl/dpo_multi_lora.py)                |
 | GKD on-policy distillation           | megatron        | [Script](cookbook/rl/gkd_on_policy.py)                 |
 | GKD off-policy distillation          | megatron        | [Script](cookbook/rl/gkd_off_policy.py)                |
-| Tinker client finetuning (self-host) | transformers    | [Script](cookbook/client/tinker/self_host)             |
-| Tinker client finetuning (ModelScope) | transformers   | [Script](cookbook/client/tinker/modelscope)            |
-| Twinkle client finetuning (self-host) | transformers   | [Script](cookbook/client/twinkle/self_host)            |
-| Twinkle client finetuning (ModelScope) | transformers  | [Script](cookbook/client/twinkle/modelscope)           |
-| Server startup scripts               | transformers/megatron | [Script](cookbook/client/server)                 |
+| Tinker client finetuning (self-host) | transformers    | [Script](cookbook/server_mode/tinker/self_host)             |
+| Tinker client finetuning (ModelScope) | transformers   | [Script](cookbook/server_mode/tinker/modelscope)            |
+| Twinkle client finetuning (self-host) | transformers   | [Script](cookbook/server_mode/twinkle/self_host)            |
+| Twinkle client finetuning (ModelScope) | transformers  | [Script](cookbook/server_mode/twinkle/modelscope)           |
+| Server startup scripts               | transformers/megatron | [Script](cookbook/server_mode/server)                 |
 
 ## Changelog
 - 🎉2026-05-20 Support DeepSeek-V4-Flash and DeepSeek-V4-Pro models.
@@ -122,7 +122,7 @@ sh INSTALL_MEGATRON.sh
 
 We are rolling out training service built atop Twinkle✨ on ModelScope. You may
 train via API endpoint  `base_url=https://www.modelscope.cn/twinkle`. For more details, please refer to
-our [documentation](docs/source_en/Usage%20Guide/Train-as-a-Service.md).
+our [documentation](https://modelscope.github.io/twinkle-web/docs/usage-guide/train-as-a-service/).
 
 ## Supported Hardware
 
@@ -177,7 +177,7 @@ supported on Twinkle✨ framework.
 ## Sample Code
 
 Below are some of the capabilities demonstrated in the example code. For a complete introduction to training capabilities,
-please refer to [Quick Start](docs/source_en/Usage%20Guide/Quick-Start.md) and [cookbook](cookbook).
+please refer to [Quick Start](https://modelscope.github.io/twinkle-web/docs/usage-guide/quick-start/) and [cookbook](cookbook).
 
 ### Train with Ray
 

diff --git a/README_ZH.md b/README_ZH.md
@@ -19,7 +19,7 @@ by <a href="https://modelscope.cn/home">ModelScope</a> & <a href="https://www.cm
 </p>
 
 <p align="center">
-        <a href="https://twinkle-kit.readthedocs.io/en/latest/">英文文档</a> &nbsp ｜ &nbsp <a href="https://twinkle-kit.readthedocs.io/zh-cn/latest/">中文文档</a> &nbsp ｜ &nbsp <a href="https://modelscope.github.io/twinkle-web/">Twinkle 站点</a> &nbsp
+        <a href="https://modelscope.github.io/twinkle-web/docs/">英文文档</a> &nbsp; ｜ &nbsp; <a href="https://modelscope.github.io/twinkle-web/zh/docs/">中文文档</a> &nbsp; ｜ &nbsp; <a href="https://modelscope.github.io/twinkle-web/">Twinkle 站点</a> &nbsp;
 </p>
 
 ## ✨ Twinkle 是什么？
@@ -94,13 +94,13 @@ sh INSTALL_MEGATRON.sh
 | DPO 多 LoRA 训练                    | transformers          | [脚本](cookbook/rl/dpo_multi_lora.py)                  |
 | GKD 在线蒸馏                        | megatron              | [脚本](cookbook/rl/gkd_on_policy.py)                   |
 | GKD 离线蒸馏                        | megatron              | [脚本](cookbook/rl/gkd_off_policy.py)                  |
-| Tinker 客户端微调（自部署）         | transformers          | [脚本](cookbook/client/tinker/self_host)               |
-| Tinker 客户端微调（ModelScope）      | transformers          | [脚本](cookbook/client/tinker/modelscope)              |
-| Twinkle 客户端微调（自部署）        | transformers          | [脚本](cookbook/client/twinkle/self_host)              |
-| Twinkle 客户端微调（ModelScope）     | transformers          | [脚本](cookbook/client/twinkle/modelscope)             |
-| 服务端启动脚本                      | transformers/megatron | [脚本](cookbook/client/server)                         |
+| Tinker 客户端微调（自部署）         | transformers          | [脚本](cookbook/server_mode/tinker/self_host)               |
+| Tinker 客户端微调（ModelScope）      | transformers          | [脚本](cookbook/server_mode/tinker/modelscope)              |
+| Twinkle 客户端微调（自部署）        | transformers          | [脚本](cookbook/server_mode/twinkle/self_host)              |
+| Twinkle 客户端微调（ModelScope）     | transformers          | [脚本](cookbook/server_mode/twinkle/modelscope)             |
+| 服务端启动脚本                      | transformers/megatron | [脚本](cookbook/server_mode/server)                         |
 
-Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Client等各场景下。其算法过程是外露的，非常便于修改和调试。完整的框架介绍请查看[快速开始](docs/source_zh/使用指引/快速开始.md)
+Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Client等各场景下。其算法过程是外露的，非常便于修改和调试。完整的框架介绍请查看[快速开始](https://modelscope.github.io/twinkle-web/zh/docs/usage-guide/quick-start/)
 
 ## 更新日志
 - 🎉2026-05-20 支持DeepSeek-V4-Flash and DeepSeek-V4-Pro系列模型。
@@ -116,7 +116,7 @@ Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Cl
 
 ## ModelScope 的训练服务
 
-我们正在 ModelScope 上推出基于 Twinkle✨ 构建的训练服务。你可以通过 API 端点 `base_url=https://www.modelscope.cn/twinkle` 进行训练。更多详情请参阅我们的[文档](docs/source_zh/使用指引/训练服务.md)。
+我们正在 ModelScope 上推出基于 Twinkle✨ 构建的训练服务。你可以通过 API 端点 `base_url=https://www.modelscope.cn/twinkle` 进行训练。更多详情请参阅我们的[文档](https://modelscope.github.io/twinkle-web/zh/docs/usage-guide/train-as-a-service/)。
 
 ## 支持的硬件
 
@@ -166,7 +166,7 @@ Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Cl
 
 ## 示例代码
 
-下面列出了示例代码的一部分能力。完整的训练能力介绍请参考[快速开始](docs/source_zh/使用指引/快速开始.md)以及[cookbook](cookbook)。
+下面列出了示例代码的一部分能力。完整的训练能力介绍请参考[快速开始](https://modelscope.github.io/twinkle-web/zh/docs/usage-guide/quick-start/)以及[cookbook](cookbook)。
 
 ### 使用 Ray 训练
 

diff --git a/cookbook/exp/condenser/untested/eval_condensed_compressed.sh b/cookbook/exp/condenser/untested/eval_condensed_compressed.sh
@@ -3,14 +3,14 @@
 # Identical --dataset / --limit / --model_id as eval_condensed_native.sh for an A/B comparison.
 set -euo pipefail
 
-DATASET="${DATASET:-/mnt/data/yzhao/datasets/musique_ans_v1.0_dev.jsonl}"
-MODEL_ID="${MODEL_ID:-ms://Qwen/Qwen3.5-4B}"
-CONDENSER_LORA="${CONDENSER_LORA:-ms://twinkle-kit/Qwen3.5-4B-Condenser}"
-LIMIT="${LIMIT:-500}"
-NUM_GPUS="${NUM_GPUS:-4}"
-OUT_DIR="${OUT_DIR:-eval_out}"
+DATASET="/mnt/data/yzhao/datasets/musique_ans_v1.0_dev.jsonl"
+MODEL_ID="ms://Qwen/Qwen3.5-4B"
+CONDENSER_LORA="ms://twinkle-kit/Qwen3.5-4B-Condenser"
+LIMIT="500"
+NUM_GPUS="4"
+OUT_DIR="eval_out"
 
-CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1,2,3} \
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
 python cookbook/exp/eval_condensed.py \
     --mode condensed \
     --dataset_format musique \

diff --git a/cookbook/exp/condenser/untested/eval_condensed_native.sh b/cookbook/exp/condenser/untested/eval_condensed_native.sh
@@ -3,13 +3,13 @@
 # Compare against eval_condensed_compressed.sh on identical --dataset / --limit / --model_id.
 set -euo pipefail
 
-DATASET="${DATASET:-/mnt/data/yzhao/datasets/musique_ans_v1.0_dev.jsonl}"
-MODEL_ID="${MODEL_ID:-ms://Qwen/Qwen3.5-4B}"
-LIMIT="${LIMIT:-500}"
-NUM_GPUS="${NUM_GPUS:-4}"
-OUT_DIR="${OUT_DIR:-eval_out}"
+DATASET="/mnt/data/yzhao/datasets/musique_ans_v1.0_dev.jsonl"
+MODEL_ID="ms://Qwen/Qwen3.5-4B"
+LIMIT="500"
+NUM_GPUS="4"
+OUT_DIR="eval_out"
 
-CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1,2,3} \
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
 python cookbook/exp/eval_condensed.py \
     --mode native \
     --dataset_format musique \