diff --git a/INSTALL_MEGATRON.sh b/INSTALL_MEGATRON.sh
index 276598c50..951b0171d 100644
--- a/INSTALL_MEGATRON.sh
+++ b/INSTALL_MEGATRON.sh
@@ -5,6 +5,7 @@
 
 set -e  # Exit immediately on error
 export SETUPTOOLS_USE_DISTUTILS=local
+export UV_INDEX_URL=${UV_INDEX_URL:-https://mirrors.aliyun.com/pypi/simple/}
 echo "=========================================="
 echo "Starting deep learning dependencies installation..."
 echo "=========================================="
@@ -53,15 +54,15 @@ TORCH_CUDA_ARCH_LIST=$(get_cuda_arch "$GPU_NAME")
 export TORCH_CUDA_ARCH_LIST
 echo "Using CUDA architecture: $TORCH_CUDA_ARCH_LIST"
 
-# Install latest base packages
+# Install vllm 0.21.x (latest 0.2x uses CUDA 12 toolchain, avoids CUDA 13 CUTLASS conflicts)
 echo ""
-echo "Installing peft, accelerate, transformers, modelscope..."
-pip install --upgrade peft accelerate transformers "modelscope[framework]" --no-cache-dir
+echo "Installing vllm 0.21..."
+uv pip install "vllm>=0.21,<0.22"
 
-# Install latest vllm
+# Install latest base packages
 echo ""
-echo "Installing latest vllm..."
-pip install --upgrade vllm --no-cache-dir
+echo "Installing peft, accelerate, transformers, modelscope..."
+uv pip install --upgrade peft accelerate transformers "modelscope[framework]"
 
 # Get site-packages path and install transformer_engine and megatron_core
 echo ""
@@ -69,26 +70,30 @@ echo "Installing transformer_engine and megatron_core..."
 SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
 echo "Site-packages path: $SITE_PACKAGES"
 
-CUDNN_PATH=$SITE_PACKAGES/nvidia/cudnn \
-CPLUS_INCLUDE_PATH=$SITE_PACKAGES/nvidia/cudnn/include \
-pip install --no-build-isolation "transformer_engine[pytorch]" --no-cache-dir
+export CUDA_HOME="${SITE_PACKAGES}/nvidia/cu13"
+export PATH="$CUDA_HOME/bin:$PATH"
+export CPATH="$CUDA_HOME/include${CPATH:+:$CPATH}"  # no trailing ':' (= cwd) when unset
+export LIBRARY_PATH="$CUDA_HOME/lib${LIBRARY_PATH:+:$LIBRARY_PATH}"
+export LD_LIBRARY_PATH="$CUDA_HOME/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+uv pip install transformer_engine_torch --no-build-isolation
 
-pip install megatron_core mcore_bridge --no-cache-dir
+uv pip install megatron_core mcore_bridge
 
-# Install flash-attention (force local build)
+# Install flash-attention
+# Prefer prebuilt wheel; fall back to source build only if needed.
 echo ""
-echo "Installing flash-attention (local build for $GPU_NAME)..."
-TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" \
-MAX_JOBS=8 \
-FLASH_ATTENTION_FORCE_BUILD=TRUE \
-pip install flash-attn --no-build-isolation --no-cache-dir
+echo "Installing flash-attention..."
+export TORCH_CUDA_ARCH_LIST
+export MAX_JOBS=8
+pip install flash-attn --no-cache-dir || \
+    FLASH_ATTENTION_FORCE_BUILD=TRUE pip install flash-attn --no-build-isolation --no-cache-dir
 
-pip install flash-linear-attention -U --no-cache-dir
+uv pip install flash-linear-attention --upgrade
 
 # Install numpy
 echo ""
 echo "Installing numpy==2.2 and deep_gemm..."
-pip install numpy==2.2 --no-cache-dir
+uv pip install numpy==2.2
 
 # Verify installation
 echo ""
diff --git a/README.md b/README.md
index 4dd203cfb..4867358a3 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@
 </p>
 
 <p align="center">
-        <a href="https://twinkle-kit.readthedocs.io/en/latest/">English Documentation</a> &nbsp ｜ &nbsp <a href="https://twinkle-kit.readthedocs.io/zh-cn/latest/">中文文档</a> &nbsp ｜ &nbsp <a href="https://modelscope.github.io/twinkle-web/">Twinkle Web</a> &nbsp
+        <a href="https://modelscope.github.io/twinkle-web/docs/">English Documentation</a> &nbsp ｜ &nbsp <a href="https://modelscope.github.io/twinkle-web/zh/docs/">中文文档</a> &nbsp ｜ &nbsp <a href="https://modelscope.github.io/twinkle-web/">Twinkle Web</a> &nbsp
 </p>
 
 ## ✨ What is Twinkle?
@@ -100,11 +100,11 @@ sh INSTALL_MEGATRON.sh
 | DPO multi-LoRA training              | transformers    | [Script](cookbook/rl/dpo_multi_lora.py)                |
 | GKD on-policy distillation           | megatron        | [Script](cookbook/rl/gkd_on_policy.py)                 |
 | GKD off-policy distillation          | megatron        | [Script](cookbook/rl/gkd_off_policy.py)                |
-| Tinker client finetuning (self-host) | transformers    | [Script](cookbook/client/tinker/self_host)             |
-| Tinker client finetuning (ModelScope) | transformers   | [Script](cookbook/client/tinker/modelscope)            |
-| Twinkle client finetuning (self-host) | transformers   | [Script](cookbook/client/twinkle/self_host)            |
-| Twinkle client finetuning (ModelScope) | transformers  | [Script](cookbook/client/twinkle/modelscope)           |
-| Server startup scripts               | transformers/megatron | [Script](cookbook/client/server)                 |
+| Tinker client finetuning (self-host) | transformers    | [Script](cookbook/server_mode/tinker/self_host)             |
+| Tinker client finetuning (ModelScope) | transformers   | [Script](cookbook/server_mode/tinker/modelscope)            |
+| Twinkle client finetuning (self-host) | transformers   | [Script](cookbook/server_mode/twinkle/self_host)            |
+| Twinkle client finetuning (ModelScope) | transformers  | [Script](cookbook/server_mode/twinkle/modelscope)           |
+| Server startup scripts               | transformers/megatron | [Script](cookbook/server_mode/server)                 |
 
 ## Changelog
 - 🎉2026-05-20 Support DeepSeek-V4-Flash and DeepSeek-V4-Pro models.
@@ -122,7 +122,7 @@ sh INSTALL_MEGATRON.sh
 
 We are rolling out training service built atop Twinkle✨ on ModelScope. You may
 train via API endpoint  `base_url=https://www.modelscope.cn/twinkle`. For more details, please refer to
-our [documentation](docs/source_en/Usage%20Guide/Train-as-a-Service.md).
+our [documentation](https://modelscope.github.io/twinkle-web/docs/usage-guide/train-as-a-service/).
 
 ## Supported Hardware
 
@@ -177,7 +177,7 @@ supported on Twinkle✨ framework.
 ## Sample Code
 
 Below are some of the capabilities demonstrated in the example code. For a complete introduction to training capabilities,
-please refer to [Quick Start](docs/source_en/Usage%20Guide/Quick-Start.md) and [cookbook](cookbook).
+please refer to [Quick Start](https://modelscope.github.io/twinkle-web/docs/usage-guide/quick-start/) and [cookbook](cookbook).
 
 ### Train with Ray
 
diff --git a/README_ZH.md b/README_ZH.md
index 5d588b393..92923209c 100644
--- a/README_ZH.md
+++ b/README_ZH.md
@@ -19,7 +19,7 @@ by <a href="https://modelscope.cn/home">ModelScope</a> & <a href="https://www.cm
 </p>
 
 <p align="center">
-        <a href="https://twinkle-kit.readthedocs.io/en/latest/">英文文档</a> &nbsp ｜ &nbsp <a href="https://twinkle-kit.readthedocs.io/zh-cn/latest/">中文文档</a> &nbsp ｜ &nbsp <a href="https://modelscope.github.io/twinkle-web/">Twinkle 站点</a> &nbsp
+        <a href="https://modelscope.github.io/twinkle-web/docs/">英文文档</a> &nbsp ｜ &nbsp <a href="https://modelscope.github.io/twinkle-web/zh/docs/">中文文档</a> &nbsp ｜ &nbsp <a href="https://modelscope.github.io/twinkle-web/">Twinkle 站点</a> &nbsp
 </p>
 
 ## ✨ Twinkle 是什么？
@@ -94,13 +94,13 @@ sh INSTALL_MEGATRON.sh
 | DPO 多 LoRA 训练                    | transformers          | [脚本](cookbook/rl/dpo_multi_lora.py)                  |
 | GKD 在线蒸馏                        | megatron              | [脚本](cookbook/rl/gkd_on_policy.py)                   |
 | GKD 离线蒸馏                        | megatron              | [脚本](cookbook/rl/gkd_off_policy.py)                  |
-| Tinker 客户端微调（自部署）         | transformers          | [脚本](cookbook/client/tinker/self_host)               |
-| Tinker 客户端微调（ModelScope）      | transformers          | [脚本](cookbook/client/tinker/modelscope)              |
-| Twinkle 客户端微调（自部署）        | transformers          | [脚本](cookbook/client/twinkle/self_host)              |
-| Twinkle 客户端微调（ModelScope）     | transformers          | [脚本](cookbook/client/twinkle/modelscope)             |
-| 服务端启动脚本                      | transformers/megatron | [脚本](cookbook/client/server)                         |
+| Tinker 客户端微调（自部署）         | transformers          | [脚本](cookbook/server_mode/tinker/self_host)               |
+| Tinker 客户端微调（ModelScope）      | transformers          | [脚本](cookbook/server_mode/tinker/modelscope)              |
+| Twinkle 客户端微调（自部署）        | transformers          | [脚本](cookbook/server_mode/twinkle/self_host)              |
+| Twinkle 客户端微调（ModelScope）     | transformers          | [脚本](cookbook/server_mode/twinkle/modelscope)             |
+| 服务端启动脚本                      | transformers/megatron | [脚本](cookbook/server_mode/server)                         |
 
-Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Client等各场景下。其算法过程是外露的，非常便于修改和调试。完整的框架介绍请查看[快速开始](docs/source_zh/使用指引/快速开始.md)
+Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Client等各场景下。其算法过程是外露的，非常便于修改和调试。完整的框架介绍请查看[快速开始](https://modelscope.github.io/twinkle-web/zh/docs/usage-guide/quick-start/)
 
 ## 更新日志
 - 🎉2026-05-20 支持DeepSeek-V4-Flash and DeepSeek-V4-Pro系列模型。
@@ -116,7 +116,7 @@ Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Cl
 
 ## ModelScope 的训练服务
 
-我们正在 ModelScope 上推出基于 Twinkle✨ 构建的训练服务。你可以通过 API 端点 `base_url=https://www.modelscope.cn/twinkle` 进行训练。更多详情请参阅我们的[文档](docs/source_zh/使用指引/训练服务.md)。
+我们正在 ModelScope 上推出基于 Twinkle✨ 构建的训练服务。你可以通过 API 端点 `base_url=https://www.modelscope.cn/twinkle` 进行训练。更多详情请参阅我们的[文档](https://modelscope.github.io/twinkle-web/zh/docs/usage-guide/train-as-a-service/)。
 
 ## 支持的硬件
 
@@ -166,7 +166,7 @@ Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Cl
 
 ## 示例代码
 
-下面列出了示例代码的一部分能力。完整的训练能力介绍请参考[快速开始](docs/source_zh/使用指引/快速开始.md)以及[cookbook](cookbook)。
+下面列出了示例代码的一部分能力。完整的训练能力介绍请参考[快速开始](https://modelscope.github.io/twinkle-web/zh/docs/usage-guide/quick-start/)以及[cookbook](cookbook)。
 
 ### 使用 Ray 训练
 
diff --git a/cookbook/exp/condenser/untested/eval_condensed_compressed.sh b/cookbook/exp/condenser/untested/eval_condensed_compressed.sh
index 5567a1a3b..833b446fa 100755
--- a/cookbook/exp/condenser/untested/eval_condensed_compressed.sh
+++ b/cookbook/exp/condenser/untested/eval_condensed_compressed.sh
@@ -3,14 +3,14 @@
 # Identical --dataset / --limit / --model_id as eval_condensed_native.sh for an A/B comparison.
 set -euo pipefail
 
-DATASET="${DATASET:-/mnt/data/yzhao/datasets/musique_ans_v1.0_dev.jsonl}"
-MODEL_ID="${MODEL_ID:-ms://Qwen/Qwen3.5-4B}"
-CONDENSER_LORA="${CONDENSER_LORA:-ms://twinkle-kit/Qwen3.5-4B-Condenser}"
-LIMIT="${LIMIT:-500}"
-NUM_GPUS="${NUM_GPUS:-4}"
-OUT_DIR="${OUT_DIR:-eval_out}"
+DATASET="/mnt/data/yzhao/datasets/musique_ans_v1.0_dev.jsonl"
+MODEL_ID="ms://Qwen/Qwen3.5-4B"
+CONDENSER_LORA="ms://twinkle-kit/Qwen3.5-4B-Condenser"
+LIMIT="500"
+NUM_GPUS="4"
+OUT_DIR="eval_out"
 
-CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1,2,3} \
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
 python cookbook/exp/eval_condensed.py \
     --mode condensed \
     --dataset_format musique \
diff --git a/cookbook/exp/condenser/untested/eval_condensed_native.sh b/cookbook/exp/condenser/untested/eval_condensed_native.sh
index 0849e9378..176c767b6 100755
--- a/cookbook/exp/condenser/untested/eval_condensed_native.sh
+++ b/cookbook/exp/condenser/untested/eval_condensed_native.sh
@@ -3,13 +3,13 @@
 # Compare against eval_condensed_compressed.sh on identical --dataset / --limit / --model_id.
 set -euo pipefail
 
-DATASET="${DATASET:-/mnt/data/yzhao/datasets/musique_ans_v1.0_dev.jsonl}"
-MODEL_ID="${MODEL_ID:-ms://Qwen/Qwen3.5-4B}"
-LIMIT="${LIMIT:-500}"
-NUM_GPUS="${NUM_GPUS:-4}"
-OUT_DIR="${OUT_DIR:-eval_out}"
+DATASET="/mnt/data/yzhao/datasets/musique_ans_v1.0_dev.jsonl"
+MODEL_ID="ms://Qwen/Qwen3.5-4B"
+LIMIT="500"
+NUM_GPUS="4"
+OUT_DIR="eval_out"
 
-CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1,2,3} \
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
 python cookbook/exp/eval_condensed.py \
     --mode native \
     --dataset_format musique \
diff --git a/cookbook/exp/embedding/build_thinking_rag_index.py b/cookbook/exp/embedding/build_thinking_rag_index.py
new file mode 100644
index 000000000..d228a597a
--- /dev/null
+++ b/cookbook/exp/embedding/build_thinking_rag_index.py
@@ -0,0 +1,935 @@
+"""Build a thinking-trace RAG index from condensed (query, cot) pairs.
+
+Pipeline (per row, batched):
+  1. Load (user_query, reasoning_content) pairs via ``get_dataset`` (default module: ``dataset_index``).
+  2. Compress query with ``RAG_QUERY_HINT`` and cot with ``RAG_THINKING_HINT``
+     (a symmetric Problem/Skill/Knowledge schema defined in this file) using a
+     Twinkle ``vLLMSampler`` (TP=4 across GPUs 0-3). Reuses the system/user
+     wrappers from ``cookbook/exp/condenser/make_condenser_dataset.py``.
+  3. On condenser truncation (``stop_reason='length'`` or skeleton-incomplete
+     output), fall back to an external OpenAI-compatible API.
+  4. Encode the condensed pair via the trained embedding model — Twinkle
+     ``TransformersModel`` on the ``emb_model`` device group (DP=4 across GPUs
+     4-7) using ``forward_only(task='embedding')``, the same code path as
+     training.
+  5. Compute cosine similarity for each (query, thinking) pair, drop pairs with
+     ``sim < SIM_THRESHOLD``, and insert kept rows into LanceDB. The vector
+     column carries the **positive (compressed-skill)** embedding so a search
+     keyed by an anchor-encoded query retrieves the matching thinking trace.
+  6. Each row stores the **raw thinking** alongside its embedding, so a hit
+     in the index can directly surface the original CoT.
+
+Eval mode (``--mode eval`` or ``--mode both``):
+  * Self-recall test — encode a sample of dataset queries (whose corresponding
+    rows are already in the index) as anchors and report recall@1/5/10 plus
+    a per-source breakdown.
+
+Architecture (8 GPUs):
+  * GPU 0-3: vLLM condenser (tensor-parallel, ``DeviceGroup name='sampler'``)
+  * GPU 4-7: TransformersModel embedding (data-parallel, ``DeviceGroup name='emb_model'``)
+  * Single ``twinkle.initialize(mode='ray', ...)`` call wires both groups.
+
+Launch examples:
+  python build_thinking_rag_index.py --mode build --total 500000
+  python build_thinking_rag_index.py --mode eval  --eval-size 1000
+  python build_thinking_rag_index.py --mode both  --total 200000 --eval-size 500
+"""
+import argparse
+import json
+import os
+import re
+import sys
+from pathlib import Path
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from tqdm import tqdm
+
+# ---------------------------------------------------------------------------
+# Compress prompts — MUST match train_embedding_full_ddp.py exactly.
+# ---------------------------------------------------------------------------
+_HERE = Path(__file__).resolve().parent
+sys.path.insert(0, str(_HERE))
+
+COMPRESS_SYSTEM = """\
+You are a compression and summary assistant. For the (query, source) pair, emit a Markdown \
+answer with TWO sections, designed to pair with the `extract_compressed` tool: \
+the reader absorbs `## Summary` directly, then calls `extract_compressed` \
+on any topic-key listed under `## More` to recover its \
+fuller content.
+
+  `## Summary`               \u2014 extreme-density text the reader reads directly.
+  `## More` \u2014 a topic index whose keys are valid arguments \
+to `extract_compressed` for recovering material not captured inline.
+
+Together the two sections must form a COMPLETE, NON-DISTORTING inventory of the \
+source for the query \u2014 nothing essential lost, nothing implied that the source \
+does not support. NO preamble, NO meta-commentary, NO code fences wrapping the \
+whole output.
+
+Output skeleton:
+
+## Summary
+Topic: <what the source is about + scope, one line>
+<dense body answering the query>
+
+## More
+- <topic-key>: <one-line hint of what is revealed when expanded>
+- ...
+
+Format selection for the inline body (pick the MOST COMPACT form per query, mix \
+when helpful):
+- Interface / signature \u2192 code notation directly: `func(a:int)->str`
+- Factual / entity \u2192 telegraphic prose; drop function words; \":\" for \"is\", \",\" \
+for \"has\"
+- Skill / how-to / usage \u2192 lead with `Use when: <trigger>`; numbered telegraphic \
+steps `1.do X 2.then Y`; close with `Output: <result>` when relevant
+- Procedural \u2192 numbered short steps
+- Analytical / design \u2192 hierarchical bullets with abbreviations
+
+`## Summary` rules:
+1. TOPIC LINE \u2014 line 1 is ALWAYS `Topic: <subject \u2014 scope>`, even when the \
+query is narrow. Anchors both the reader and the tool.
+2. DENSITY \u2014 every token in the body carries query-relevant signal; cut filler.
+3. PRIMARY-COMPLETE \u2014 never silently drop a fact essential to answering the \
+query. Anything cut for length MUST appear as a key under \
+`## More`.
+4. NON-MISLEADING \u2014 phrasing must not let the reader infer anything the source \
+does not support; partial truths that mislead are worse than honest omissions \
+flagged in the index.
+5. SELF-CONTAINED \u2014 the reader can act on the answer without re-opening the source.
+6. FAITHFUL \u2014 only content the source supports; no fabrication, no extrapolation.
+7. LANGUAGE \u2014 match the source language.
+8. NO outer code fences around the whole answer; no meta-commentary.
+
+`## More` rules (MANDATORY \u2014 this section is never omitted):
+1. FORMAT \u2014 each bullet is `- <topic-key>: <one-line hint>`:
+   \u2022 topic-key \u2014 short, unambiguous, grounded in source vocabulary so the \
+`extract_compressed` tool can locate the aspect (e.g. `decorators`, \
+`error handling`, `pitfalls`).
+   \u2022 hint \u2014 tells WHAT the reader gains by expanding (concrete numbers, code \
+listings, secondary cases, edge details, related context, \u2026); do NOT restate \
+the inline answer.
+2. CRITERION \u2014 each bullet names an aspect that EXISTS in the source but is \
+NOT fully captured inline. Material that genuinely fits inline without \
+distortion MUST NOT be duplicated here.
+3. FAITHFUL \u2014 hints must be grounded in the source; never speculate or invent.
+4. ORDER \u2014 by relevance to the query, then by importance.
+5. EMPTY CASE \u2014 if the source is so short / single-purpose that everything \
+fits inline, write a single line `- (none)`.
+
+Now begin.\
+"""
+
+COMPRESS_USER = (
+    'Downstream model will read your compressed block to decide whether to '
+    'expand it. Compress faithfully: preserve the passage topic + core facts. '
+    'Do NOT invent facts. Do NOT drop major facts. Do NOT write meta-commentary '
+    'about the Query (never write "Query info: absent", "no X mention", etc.); '
+    'if the passage does not address the Query, still summarize the passage. '
+    'CRITICAL LANGUAGE RULE: detect the dominant language of the Passage '
+    '(NOT the Query, NOT this instruction) and write the ENTIRE output in that '
+    'same language; English passage \u2192 English output, Chinese passage \u2192 '
+    'Chinese output, Japanese passage \u2192 Japanese output. NEVER translate, '
+    'NEVER mix languages, NEVER copy these instructions into the output.\n\n'
+    '## Query (ordering hint only \u2014 still summarize the whole passage)\n{query}\n\n'
+    '## Passage\n{text}')
+
+# Default dataset loader is the index-time corpus (broader retrieval profile);
+# pass --dataset-module dataset_think to fall back to the training mix.
+from dataset_index import get_dataset as _default_get_dataset  # noqa: E402
+
+_GET_DATASET = _default_get_dataset
+
+import twinkle  # noqa: E402
+from twinkle import DeviceGroup, DeviceMesh, get_logger  # noqa: E402
+from twinkle.data_format import SamplingParams as TwinkleSamplingParams  # noqa: E402
+from twinkle.loss import InfonceLoss  # noqa: E402
+from twinkle.model import TransformersModel  # noqa: E402
+from twinkle.processor import InputProcessor  # noqa: E402
+from twinkle.sampler import vLLMSampler  # noqa: E402
+from twinkle.template import Qwen3_5Template  # noqa: E402
+from twinkle.utils.parallel import PosixFileLock  # noqa: E402
+from twinkle_agentic.protocol.openai import OpenAI as OpenAIClient  # noqa: E402
+
+logger = get_logger()
+
+
+# ===========================================================================
+# Config (most fields overridable via CLI / env)
+# ===========================================================================
+
+EMBED_MODEL_ID = os.environ.get(
+    'EMBED_MODEL_ID',
+    'output/embedding_lora_transformers/step_4000',
+)
+CONDENSE_MODEL_ID = os.environ.get('CONDENSE_MODEL_ID', 'ms://twinkle-kit/Qwen3.5-4B-CM-v2')
+
+# Twinkle device topology: TP=4 sampler on 0-3, DP=4 embedding on 4-7.
+SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 4))
+EMB_GPUS = int(os.environ.get('EMB_GPUS', 4))
+NUM_GPUS = SAMPLER_GPUS + EMB_GPUS
+
+# vLLM engine sizing.
+CONDENSE_GPU_MEM = float(os.environ.get('CONDENSE_GPU_MEM', 0.85))
+CONDENSE_MAX_MODEL_LEN = int(os.environ.get('CONDENSE_MAX_MODEL_LEN', 32768))
+CONDENSE_MAX_TOKENS = int(os.environ.get('CONDENSE_MAX_TOKENS', 8192))
+COMPRESS_TEMPERATURE = float(os.environ.get('COMPRESS_TEMPERATURE', 0.2))
+COMPRESS_TOP_P = float(os.environ.get('COMPRESS_TOP_P', 0.5))
+
+# Embedding sizing.
+EMBED_MAX_LENGTH = int(os.environ.get('EMBED_MAX_LENGTH', 8192))
+
+SIM_THRESHOLD = float(os.environ.get('SIM_THRESHOLD', 0.65))
+MIN_TEXT_CHARS = int(os.environ.get('MIN_TEXT_CHARS', 256))
+
+# Hard-templated hints: the condenser SFT prior maps `Skill` to the legacy
+# `Use when: / numbered steps / Output:` skeleton on long inputs; embedding the
+# exact 4-line body template + explicit negative constraints is the only way to
+# override it deterministically across query and cot sides.
+RAG_QUERY_HINT = (
+    'Summarize this query for retrieval. '
+    'The body of ## Summary MUST follow this EXACT 4-line template — '
+    'do NOT emit "Use when:", numbered procedure steps, or "Output:":\n'
+    'Topic: <specific pattern name — scope>\n'
+    'Problem: <what concrete problem is being asked>\n'
+    'Skill: <which specific method/technique/pattern is required to solve it>\n'
+    'Knowledge: <which domains/concepts/facts must be invoked>\n'
+    'Then emit the mandatory ## More section as usual. '
+    'Topic must name the specific pattern, never generic labels.')
+RAG_THINKING_HINT = (
+    'Summarize this reasoning trace for retrieval. '
+    'The body of ## Summary MUST follow this EXACT 4-line template — '
+    'do NOT emit "Use when:", numbered procedure steps, or "Output:":\n'
+    'Topic: <specific pattern name — scope>\n'
+    'Problem: <what concrete problem this trace tackled>\n'
+    'Skill: <which specific method/technique/pattern was applied>\n'
+    'Knowledge: <which domains/concepts/facts were used>\n'
+    'Then emit the mandatory ## More section as usual. '
+    'Topic must name the specific pattern, never generic labels.')
+
+# OpenAI API fallback (used when vLLM truncates).
+COMPRESS_API_KEY = os.environ.get('COMPRESS_API_KEY', '')
+COMPRESS_BASE_URL = os.environ.get(
+    'COMPRESS_BASE_URL', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
+COMPRESS_API_MODEL = os.environ.get('COMPRESS_API_MODEL', 'qwen3.7-max')
+
+# Source → coarse domain (for filtered eval).
+DOMAIN_MAP = {
+    'CodeX-2M-Thinking': 'code',
+    'OpenThoughts3-1.2M': 'reasoning',
+    'LIMO-v2': 'math',
+    'Chinese-DeepSeek-R1-Distill-data-110k': 'reasoning_zh',
+    'Opus-4.6-Reasoning-3000x-filtered': 'reasoning',
+    'claude-opus-4.6-10000x': 'mixed',
+    'angrygiraffe-claude-opus-4.6-4.7-reasoning-8.7k': 'mixed',
+}
+
+
+# ===========================================================================
+# Small helpers
+# ===========================================================================
+
+_LEGACY_USE_WHEN_RE = re.compile(r'(?im)^\s*Use when\s*:')
+_SCHEMA_MARKERS = ('Problem:', 'Skill:', 'Knowledge:')
+
+
+def _is_truncated_compression(text: str) -> bool:
+    """Reject structurally incomplete OR schema-regressed condenser output.
+
+    Triggers API fallback when the vLLM output:
+      * lacks ``## Summary`` / ``## More``,
+      * has an empty or unterminated ``## More`` bullet list, or
+      * regresses to the legacy ``Use when: / numbered-steps / Output:`` skeleton
+        instead of the mandated Problem/Skill/Knowledge 4-line body — the
+        dominant cot-side failure mode that drives sim < 0.45 drops.
+    """
+    if not text or not text.strip():
+        return True
+    if '## More' not in text or '## Summary' not in text:
+        return True
+    after_more = text.split('## More', 1)[1].strip()
+    if not after_more:
+        return True
+    last_line = after_more.splitlines()[-1].strip()
+    if not (last_line.startswith('-') or last_line.endswith(')')):
+        return True
+    summary_body = text.split('## Summary', 1)[1].split('## More', 1)[0]
+    if _LEGACY_USE_WHEN_RE.search(summary_body):
+        return True
+    if not all(marker in summary_body for marker in _SCHEMA_MARKERS):
+        return True
+    return False
+
+
+def _strip_outer_codefence(text: str) -> str:
+    m = re.match(r'^```[a-zA-Z]*\n(.*?)\n```\s*$', text, re.DOTALL)
+    if m:
+        return m.group(1).strip()
+    return text.strip()
+
+
+def _wrap_anchor(text: str) -> List[Dict[str, str]]:
+    """Anchor-side message wrapping (must match training)."""
+    return [
+        {'role': 'user', 'content': text},
+        {'role': 'assistant', 'content': 'Match the correct response here.'},
+    ]
+
+
+def _wrap_positive(text: str) -> List[Dict[str, str]]:
+    """Positive-side message wrapping (must match training)."""
+    return [
+        {'role': 'user', 'content': 'Match the correct query here.'},
+        {'role': 'assistant', 'content': text},
+    ]
+
+
+def _short(text: str, n: int = 96) -> str:
+    text = (text or '').replace('\n', ' ').strip()
+    return text[:n] + ('…' if len(text) > n else '')
+
+
+def _detect_lang(text: str) -> str:
+    if not text:
+        return 'unknown'
+    cjk = sum(1 for ch in text[:512] if '\u4e00' <= ch <= '\u9fff')
+    return 'zh' if cjk >= 8 else 'en'
+
+
+def _build_compress_messages(text: str, query: str) -> List[Dict[str, str]]:
+    return [
+        {'role': 'system', 'content': COMPRESS_SYSTEM},
+        {'role': 'user', 'content': COMPRESS_USER.format(query=query, text=text)},
+    ]
+
+
+# ===========================================================================
+# Twinkle component wrappers
+# ===========================================================================
+
+def initialize_twinkle() -> Tuple[DeviceMesh, DeviceMesh]:
+    """Initialise Twinkle in Ray mode with 'sampler' / 'emb_model' groups; return both meshes."""
+    device_groups = [
+        DeviceGroup(
+            name='sampler',
+            ranks=list(range(SAMPLER_GPUS)),
+            device_type='GPU',
+            gpus_per_worker=SAMPLER_GPUS,  # a single worker owns every sampler GPU (tp_size == SAMPLER_GPUS)
+        ),
+        DeviceGroup(
+            name='emb_model',
+            ranks=list(range(SAMPLER_GPUS, NUM_GPUS)),
+            device_type='GPU',
+        ),
+    ]
+    sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, tp_size=SAMPLER_GPUS)
+    emb_mesh = DeviceMesh.from_sizes(world_size=EMB_GPUS, dp_size=EMB_GPUS)
+    twinkle.initialize(
+        mode='ray',
+        nproc_per_node=NUM_GPUS,
+        groups=device_groups,
+        lazy_collect=False,
+    )
+    return sampler_mesh, emb_mesh
+
+
+def build_sampler(sampler_mesh: DeviceMesh) -> vLLMSampler:
+    sampler = vLLMSampler(
+        model_id=CONDENSE_MODEL_ID,
+        engine_args={
+            'gpu_memory_utilization': CONDENSE_GPU_MEM,
+            'max_model_len': CONDENSE_MAX_MODEL_LEN,
+        },
+        device_mesh=sampler_mesh,
+        remote_group='sampler',
+    )
+    sampler.set_template(
+        'Qwen3_5Template',
+        model_id=CONDENSE_MODEL_ID,
+        enable_thinking=False,
+        max_length=CONDENSE_MAX_MODEL_LEN,
+    )
+    return sampler
+
+
+def build_emb_model(emb_mesh: DeviceMesh) -> Tuple[TransformersModel, Qwen3_5Template]:
+    model = TransformersModel(
+        model_id=EMBED_MODEL_ID,
+        device_mesh=emb_mesh,
+        remote_group='emb_model',
+    )
+    model.set_processor(InputProcessor)
+    # InfonceLoss is required by the framework even though forward_only does
+    # not actually invoke it; matches the training-time configuration.
+    model.set_loss(InfonceLoss, temperature=0.03, use_batch=True)
+    # Qwen3.5-specific subclass applies orphan-</think> chat-template patches.
+    template = Qwen3_5Template(
+        model_id=EMBED_MODEL_ID,
+        max_length=EMBED_MAX_LENGTH,
+        truncation_strategy='delete',
+        enable_thinking=False,
+    )
+    return model, template
+
+
+# ===========================================================================
+# Compression helpers (vLLMSampler) + API fallback
+# ===========================================================================
+
+def _vllm_compress(sampler: vLLMSampler, texts: List[str], query_hint: str
+                   ) -> List[Tuple[str, str]]:
+    """Compress ``texts`` via the sampler; return ``(decoded, stop_reason)``."""
+    if not texts:
+        return []
+    prompts = [{'messages': _build_compress_messages(t, query_hint)} for t in texts]
+    params = TwinkleSamplingParams(
+        max_tokens=CONDENSE_MAX_TOKENS,
+        temperature=COMPRESS_TEMPERATURE,
+        top_p=COMPRESS_TOP_P,
+        num_samples=1,
+    )
+    responses = sampler.sample(prompts, params)
+    results: List[Tuple[str, str]] = []
+    for resp in responses:
+        seq = resp.sequences[0] if resp and resp.sequences else None
+        if seq is None:
+            results.append(('', 'error'))
+            continue
+        text = seq.decoded or ''
+        # Strip any leaked chat-template special tokens like ``<|im_end|>``.
+        text = re.sub(r'<\|[^|]+\|>', '', text).rstrip()
+        text = _strip_outer_codefence(text)
+        results.append((text, seq.stop_reason or 'stop'))
+    return results
+
+
+def _api_compress(api: OpenAIClient, messages: List[Dict[str, str]]) -> Optional[str]:
+    sp = TwinkleSamplingParams(temperature=COMPRESS_TEMPERATURE, max_tokens=CONDENSE_MAX_TOKENS)
+    try:
+        reply = api({'messages': messages}, sp, extra_body={'enable_thinking': False})
+    except Exception as exc:  # noqa: BLE001 — broad catch is intentional
+        sys.stderr.write(f'[api_fallback] error: {exc}\n')
+        return None
+    content = (reply.get('content') or '').strip()
+    if not content:
+        return None
+    return _strip_outer_codefence(content)
+
+
+def _resolve_compressed(sampler: vLLMSampler, api: Optional[OpenAIClient],
+                        texts: List[str], query_hint: str) -> List[Optional[str]]:
+    """Run vLLM batch; replace truncations / skeleton-incomplete with API output."""
+    pairs = _vllm_compress(sampler, texts, query_hint)
+    results: List[Optional[str]] = []
+    for (text, stop), src_text in zip(pairs, texts):
+        if stop != 'length' and not _is_truncated_compression(text):
+            results.append(text)
+            continue
+        if api is None:
+            results.append(None)
+            continue
+        api_text = _api_compress(api, _build_compress_messages(src_text, query_hint))
+        if api_text is None or _is_truncated_compression(api_text):
+            results.append(None)
+        else:
+            results.append(api_text)
+    return results
+
+
+# ===========================================================================
+# Embedding helpers (TransformersModel.forward_only(task='embedding'))
+# ===========================================================================
+
+def _build_features(template: Qwen3_5Template, texts: List[str], role: str
+                    ) -> List[Dict[str, Any]]:
+    """Encode every text as an anchor or positive feature dict, in input order.
+
+    ``role == 'anchor'`` wraps the text with ``_wrap_anchor`` and labels the
+    feature ``[1]``; any other role uses ``_wrap_positive`` and label ``[0]``.
+    Empty / whitespace-only texts are replaced by a single space so that the
+    output stays index-aligned with ``texts``.
+    """
+    is_anchor = role == 'anchor'
+    wrap = _wrap_anchor if is_anchor else _wrap_positive
+    encoded_rows: List[Dict[str, Any]] = []
+    for raw in texts:
+        # Placeholder keeps one feature per input slot even for blank text.
+        content = raw if (raw and raw.strip()) else ' '
+        encoded = template.encode({'messages': wrap(content)})
+        # Fresh list per feature so rows never share a label object.
+        encoded['labels'] = [1] if is_anchor else [0]
+        encoded_rows.append(encoded)
+    return encoded_rows
+
+
+def get_embeddings(model: TransformersModel, template: Qwen3_5Template,
+                   texts: List[str], role: str) -> np.ndarray:
+    """Return a 2-D float32 array of embeddings, one row per entry of ``texts``.
+
+    Inputs are padded up to a multiple of ``EMB_GPUS`` and sliced back to the
+    original ``N``: the dispatch layer (``_dispatch_args``) starves any rank
+    whose chunk lands beyond ``len(texts)``, so a single forward of fewer than
+    ``EMB_GPUS`` items (e.g. the probe) would otherwise raise
+    ``Batch too small for {EMB_GPUS} workers``.
+
+    Args:
+        model: Object exposing ``forward_only(inputs=..., task='embedding',
+            return_logits=True)`` whose result has an ``'embeddings'`` entry.
+        template: Template used by ``_build_features`` to encode each text.
+        texts: Texts to embed.
+        role: ``'anchor'`` or anything else (treated as positive).
+
+    Returns:
+        ``[N, H]`` float32 array. For empty ``texts`` the result is ``(0, 0)``
+        because ``H`` is unknown without a forward pass. Callers treat the dot
+        product of rows as a similarity score, which presumes the model emits
+        L2-normalised vectors; that is not enforced here.
+    """
+    if not texts:
+        # Stay 2-D so ``shape[1]`` and ``axis=1`` reductions remain valid on an
+        # empty batch (a 1-D ``(0,)`` array would raise there).
+        return np.zeros((0, 0), dtype=np.float32)
+    n = len(texts)
+    pad_n = (-n) % EMB_GPUS
+    padded = list(texts) + [' '] * pad_n
+    features = _build_features(template, padded, role)
+    out = model.forward_only(inputs=features, task='embedding', return_logits=True)
+    emb = out['embeddings']
+    if isinstance(emb, torch.Tensor):
+        emb = emb.detach().to(torch.float32).cpu().numpy()
+    emb = np.asarray(emb, dtype=np.float32)
+    # Drop the rows that belong to padding inputs.
+    return emb[:n] if pad_n else emb
+
+
+def _probe_hidden_size(model: TransformersModel, template: Qwen3_5Template) -> int:
+    """Embed one dummy anchor text and return the width of the resulting row.
+
+    Raises:
+        RuntimeError: if the probe result is not a non-empty 2-D array.
+    """
+    probe_emb = get_embeddings(model, template, ['probe'], role='anchor')
+    well_formed = probe_emb.ndim == 2 and probe_emb.shape[0] != 0
+    if not well_formed:
+        raise RuntimeError(f'unexpected embedding shape from probe: {probe_emb.shape}')
+    _, width = probe_emb.shape
+    return int(width)
+
+
+# ===========================================================================
+# LanceDB I/O
+# ===========================================================================
+
+def _make_arrow_schema(hidden_size: int):
+    """Build the Arrow schema for index rows.
+
+    ``vector`` is a fixed-size float32 list of length ``hidden_size``, ``sim``
+    is a float32 scalar, and every other column is a string.
+    """
+    import pyarrow as pa
+    text_columns = ('thinking_raw', 'query_raw', 'cot_compressed',
+                    'query_compressed', 'source', 'domain', 'language')
+    # Column order matters: it is the on-disk layout of the table.
+    columns = [
+        ('id', pa.string()),
+        ('vector', pa.list_(pa.float32(), hidden_size)),
+    ]
+    columns.extend((name, pa.string()) for name in text_columns)
+    columns.append(('sim', pa.float32()))
+    return pa.schema([pa.field(name, dtype) for name, dtype in columns])
+
+
+def _open_or_create_table(db_path: str, table_name: str, hidden_size: int,
+                          mode: str):
+    """Connect to LanceDB at ``db_path`` and return ``(connection, table)``.
+
+    * Table missing: it is created with the schema for ``hidden_size``.
+    * Table present and ``mode == 'overwrite'``: it is dropped and recreated.
+    * Table present, any other mode: it is opened as-is (its stored schema is
+      not compared against ``hidden_size``).
+    """
+    import lancedb
+    conn = lancedb.connect(db_path)
+    arrow_schema = _make_arrow_schema(hidden_size)
+    if table_name not in conn.table_names():
+        return conn, conn.create_table(table_name, schema=arrow_schema, mode='create')
+    if mode != 'overwrite':
+        return conn, conn.open_table(table_name)
+    conn.drop_table(table_name)
+    return conn, conn.create_table(table_name, schema=arrow_schema, mode='overwrite')
+
+
+def _existing_ids(table) -> set:
+    """Return the set of ``id`` values (as strings) currently stored in ``table``.
+
+    Best-effort: any failure while reading the table yields an empty set, but
+    the failure is reported on stderr. An empty result makes the build path
+    re-insert every row and the eval path report an empty index, so a silent
+    fallback would hide real problems.
+    """
+    try:
+        # NOTE(review): relies on ``to_pandas`` accepting ``columns=``; confirm
+        # against the installed LanceDB version.
+        frame = table.to_pandas(columns=['id'])
+        return set(frame['id'].astype(str).tolist())
+    except Exception as exc:  # noqa: BLE001 — best-effort by design
+        sys.stderr.write(
+            f'[index] WARNING: could not read existing ids '
+            f'({type(exc).__name__}: {exc}); treating table as empty.\n')
+        return set()
+
+
+# ===========================================================================
+# Build pipeline
+# ===========================================================================
+
+def _stream_corpus(total: Optional[int], load_from_cache_file: bool,
+                   max_rows: int = 0) -> Iterator[Dict[str, Any]]:
+    """Yield rows from the active dataset loader, optionally capped.
+
+    The dataset comes from the module-level ``_GET_DATASET`` callable. A
+    ``max_rows`` that is non-zero and smaller than the dataset length limits
+    how many rows are yielded; otherwise every row is yielded. A one-line
+    summary is written to stderr when iteration starts.
+    """
+    dataset = _GET_DATASET(total=total, load_from_cache_file=load_from_cache_file)
+    available = len(dataset)
+    if max_rows and max_rows < available:
+        limit = max_rows
+    else:
+        limit = available
+    suffix = f' → yielding first {limit}\n' if limit < available else '\n'
+    sys.stderr.write(f'[corpus] get_dataset: {available} rows' + suffix)
+    remaining = limit
+    for record in dataset:
+        if remaining <= 0:
+            break
+        yield record
+        remaining -= 1
+
+
+def _extract_query_cot(row: Dict[str, Any]) -> Tuple[str, str]:
+    """Return ``(first user content, first assistant reasoning_content)``.
+
+    Scanning stops at the first assistant message, so a user message that
+    follows it is ignored. Non-dict entries are skipped; missing values come
+    back as empty strings. Both results are whitespace-stripped.
+    """
+    question = ''
+    for msg in row.get('messages') or []:
+        if not isinstance(msg, dict):
+            continue
+        speaker = msg.get('role') or ''
+        if speaker == 'assistant':
+            # The first assistant turn ends the scan, with or without a CoT.
+            return question, (msg.get('reasoning_content') or '').strip()
+        if speaker == 'user' and not question:
+            question = (msg.get('content') or '').strip()
+    return question, ''
+
+
+def _log_miss(misses_path: str, lock: PosixFileLock, record: Dict[str, Any]) -> None:
+    """Append ``record`` as one JSON line to ``misses_path`` while holding ``lock``.
+
+    Values that are not JSON-serialisable are stringified (``default=str``) and
+    non-ASCII text is written as-is.
+    """
+    payload = json.dumps(record, ensure_ascii=False, default=str)
+    # Serialise before taking the lock so it is held only for the write.
+    with lock, open(misses_path, 'a', encoding='utf-8') as sink:
+        sink.write(payload + '\n')
+
+
+def build_index(args: argparse.Namespace,
+                sampler: vLLMSampler,
+                emb_model: TransformersModel,
+                emb_template: Qwen3_5Template,
+                api: Optional[OpenAIClient]) -> None:
+    """Stream the corpus, compress + embed each row, and insert keepers into LanceDB.
+
+    Per batch of ``args.batch_size`` rows:
+
+    1. Compress the query and the CoT. Queries shorter than
+       ``MIN_TEXT_CHARS`` are passed through uncompressed. Rows where either
+       side has no usable compression are dropped and logged.
+    2. Embed the compressed query (anchor) and compressed CoT (positive).
+    3. Drop rows whose anchor·positive dot product is below
+       ``SIM_THRESHOLD`` (logged); insert the rest keyed by the positive vector.
+
+    Rows without an id, rows already in the table or already queued in the
+    current batch, and rows with no user query or a CoT shorter than
+    ``MIN_TEXT_CHARS`` are skipped before batching. ``args.limit`` is checked
+    against rows kept so far, so the final count can exceed it by the rows
+    still in flight. Finally an IVF_PQ index is (re)built unless
+    ``args.skip_index`` is set.
+
+    Args:
+        args: Parsed CLI options (``db_path``, ``table``, ``overwrite``,
+            ``misses_log``, ``total``, ``no_cache``, ``max_rows``, ``limit``,
+            ``batch_size``, ``skip_index``).
+        sampler: Local condenser passed to ``_resolve_compressed``.
+        emb_model: Embedding model passed to ``get_embeddings``.
+        emb_template: Template used to encode embedding inputs.
+        api: Optional remote fallback client for failed compressions.
+    """
+    # ---- Probe embedding dimension -----------------------------------------
+    sys.stderr.write('[build] probing embedding hidden size...\n')
+    hidden_size = _probe_hidden_size(emb_model, emb_template)
+    sys.stderr.write(f'[build] hidden_size={hidden_size}\n')
+
+    # ---- LanceDB ------------------------------------------------------------
+    _, tbl = _open_or_create_table(
+        args.db_path, args.table, hidden_size,
+        mode='overwrite' if args.overwrite else 'append',
+    )
+    indexed = _existing_ids(tbl) if not args.overwrite else set()
+    sys.stderr.write(f'[build] table "{args.table}" — {len(indexed)} existing rows.\n')
+
+    misses_path = args.misses_log or (str(Path(args.db_path) / f'{args.table}.misses.jsonl'))
+    Path(misses_path).parent.mkdir(parents=True, exist_ok=True)
+    misses_lock = PosixFileLock(misses_path + '.lock')
+
+    # ---- Streaming loop -----------------------------------------------------
+    n_seen = n_kept = n_dropped_short = n_dropped_compress = n_dropped_sim = 0
+    n_dropped_dup = 0
+    pbar = tqdm(desc='index', unit='row', dynamic_ncols=True)
+
+    batch: List[Dict[str, Any]] = []
+    # Ids queued in the current, not-yet-flushed batch. ``indexed`` only learns
+    # about a row after its batch is inserted, so without this set a duplicate
+    # id inside one batch window would be inserted twice.
+    pending_ids: set = set()
+
+    def _flush(rows: List[Dict[str, Any]]) -> None:
+        nonlocal n_kept, n_dropped_compress, n_dropped_sim
+        if not rows:
+            return
+        # Phase 1 — compress query (RAG_QUERY_HINT) and cot (RAG_THINKING_HINT).
+        # Short queries bypass condenser (passthrough) — matches training behaviour.
+        long_q_indices = [i for i, r in enumerate(rows) if len(r['query_raw']) >= MIN_TEXT_CHARS]
+        q_compressed: List[Optional[str]] = [None] * len(rows)
+        for i, r in enumerate(rows):
+            if len(r['query_raw']) < MIN_TEXT_CHARS:
+                q_compressed[i] = r['query_raw']
+        if long_q_indices:
+            long_results = _resolve_compressed(
+                sampler, api, [rows[i]['query_raw'] for i in long_q_indices], RAG_QUERY_HINT)
+            for idx, res in zip(long_q_indices, long_results):
+                q_compressed[idx] = res
+        c_compressed = _resolve_compressed(
+            sampler, api, [r['cot_raw'] for r in rows], RAG_THINKING_HINT)
+        kept_rows: List[Dict[str, Any]] = []
+        for r, q_cmp, c_cmp in zip(rows, q_compressed, c_compressed):
+            if not q_cmp or not c_cmp:
+                n_dropped_compress += 1
+                _log_miss(misses_path, misses_lock, {
+                    'id': r['id'], 'source': r['source'], 'reason': 'compress_fail',
+                    'query_raw_head': _short(r['query_raw'], 200),
+                    'cot_raw_head': _short(r['cot_raw'], 200),
+                })
+                continue
+            r['query_compressed'] = q_cmp
+            r['cot_compressed'] = c_cmp
+            kept_rows.append(r)
+        if not kept_rows:
+            return
+        # Phase 2 — encode anchor (compressed query) + positive (compressed cot).
+        anchor_emb = get_embeddings(
+            emb_model, emb_template, [r['query_compressed'] for r in kept_rows], role='anchor')
+        positive_emb = get_embeddings(
+            emb_model, emb_template, [r['cot_compressed'] for r in kept_rows], role='positive')
+        # Row-wise dot product between each anchor and its own positive.
+        sims = (anchor_emb * positive_emb).sum(axis=1).astype(np.float32)
+        # Phase 3 — sim filter + LanceDB insert.
+        to_insert: List[Dict[str, Any]] = []
+        for idx, (r, sim_val) in enumerate(zip(kept_rows, sims)):
+            tag = 'KEEP' if sim_val >= SIM_THRESHOLD else 'DROP'
+            print(f'[{tag} sim={sim_val:.4f}] {r["source"][:24]} '
+                  f'q={_short(r["query_raw"], 60)!r} '
+                  f'cot={_short(r["cot_raw"], 60)!r}', flush=True)
+            if sim_val < SIM_THRESHOLD:
+                n_dropped_sim += 1
+                _log_miss(misses_path, misses_lock, {
+                    'id': r['id'], 'source': r['source'], 'reason': 'sim_low',
+                    'sim': float(sim_val),
+                    'query_raw': r['query_raw'],
+                    'cot_raw': r['cot_raw'],
+                    'query_compressed': r['query_compressed'],
+                    'cot_compressed': r['cot_compressed'],
+                })
+                continue
+            to_insert.append({
+                'id': r['id'],
+                # The stored vector is the CoT (positive) side; queries are
+                # matched against it at retrieval time.
+                'vector': positive_emb[idx].tolist(),
+                'thinking_raw': r['cot_raw'],
+                'query_raw': r['query_raw'],
+                'cot_compressed': r['cot_compressed'],
+                'query_compressed': r['query_compressed'],
+                'source': r['source'],
+                'domain': DOMAIN_MAP.get(r['source'], 'mixed'),
+                'language': _detect_lang(r['cot_raw']),
+                'sim': float(sim_val),
+            })
+        if to_insert:
+            tbl.add(to_insert)
+            n_kept += len(to_insert)
+            indexed.update(r['id'] for r in to_insert)
+
+    try:
+        for row in _stream_corpus(total=args.total, load_from_cache_file=not args.no_cache,
+                                  max_rows=args.max_rows):
+            n_seen += 1
+            if args.limit and n_kept >= args.limit:
+                break
+            rid = row.get('id') or ''
+            if not rid:
+                continue
+            if rid in indexed or rid in pending_ids:
+                n_dropped_dup += 1
+                continue
+            user_query, cot = _extract_query_cot(row)
+            if not user_query or len(cot) < MIN_TEXT_CHARS:
+                n_dropped_short += 1
+                continue
+            batch.append({
+                'id': rid,
+                'source': row.get('source') or 'unknown',
+                'query_raw': user_query,
+                'cot_raw': cot,
+            })
+            pending_ids.add(rid)
+            if len(batch) >= args.batch_size:
+                _flush(batch)
+                batch.clear()
+                pending_ids.clear()
+                pbar.set_postfix(kept=n_kept, sim_drop=n_dropped_sim,
+                                 cmp_drop=n_dropped_compress, refresh=False)
+            pbar.update(1)
+        # Flush whatever is left, including after an early ``--limit`` break.
+        if batch:
+            _flush(batch)
+            batch.clear()
+            pending_ids.clear()
+    finally:
+        pbar.close()
+
+    sys.stderr.write(
+        f'[build] seen={n_seen} kept={n_kept} sim_drop={n_dropped_sim} '
+        f'cmp_drop={n_dropped_compress} short_drop={n_dropped_short} '
+        f'dup_skip={n_dropped_dup}\n')
+
+    # ---- Build vector index for fast retrieval ------------------------------
+    # Gate and size the index on the table's total row count rather than on
+    # this run's additions: in append mode the index is rebuilt (replace=True)
+    # for a table that also holds rows from earlier runs. Nothing new added
+    # means nothing to rebuild.
+    total_rows = tbl.count_rows()
+    if n_kept > 0 and total_rows >= 64 and not args.skip_index:
+        sys.stderr.write('[build] creating IVF_PQ index (metric=dot)...\n')
+        n_partitions = max(8, min(256, total_rows // 1000 + 1))
+        try:
+            tbl.create_index(
+                metric='dot',
+                vector_column_name='vector',
+                num_partitions=n_partitions,
+                num_sub_vectors=16,
+                index_type='IVF_PQ',
+                replace=True,
+            )
+        except Exception as exc:  # noqa: BLE001
+            sys.stderr.write(f'[build] index build failed: {exc} '
+                             '(table is still queryable via brute-force scan)\n')
+    sys.stderr.write(f'[build] done. table rows={tbl.count_rows()}\n')
+
+
+# ===========================================================================
+# Eval pipeline (self-recall on indexed rows)
+# ===========================================================================
+
+def eval_recall(args: argparse.Namespace,
+                sampler: vLLMSampler,
+                emb_model: TransformersModel,
+                emb_template: Qwen3_5Template,
+                api: Optional[OpenAIClient]) -> None:
+    """Probe each gold query against the index; report recall@k.
+
+    Self-recall semantics: only rows whose ``id`` is already present in the
+    index are probed. The corresponding ``cot``-keyed vector must be retrieved
+    by encoding the **raw user query** through the condenser → embedder
+    pipeline (anchor side). The match is correct iff the retrieved row's
+    ``id`` equals the probe row's ``id``.
+
+    Results are printed to stdout: overall recall for each k in
+    ``{1, 5, 10, args.top_k}`` and a per-source recall@10 table. Progress and
+    diagnostics go to stderr.
+
+    Raises:
+        SystemExit: if ``args.table`` does not exist under ``args.db_path``.
+    """
+    import lancedb
+    db = lancedb.connect(args.db_path)
+    if args.table not in db.table_names():
+        raise SystemExit(f'[eval] table "{args.table}" does not exist in {args.db_path}')
+    tbl = db.open_table(args.table)
+    indexed_ids = _existing_ids(tbl)
+    sys.stderr.write(f'[eval] table rows={tbl.count_rows()} indexed_ids={len(indexed_ids)}\n')
+    if not indexed_ids:
+        sys.stderr.write('[eval] empty index — nothing to evaluate.\n')
+        return
+
+    # 10 is always part of ``ks``; the per-source report below relies on it.
+    ks = sorted({1, 5, 10, args.top_k})
+    hits = {k: 0 for k in ks}
+    per_source_hits: Dict[str, Dict[int, int]] = {}
+    per_source_total: Dict[str, int] = {}
+    probed = 0
+
+    pbar = tqdm(desc='eval', unit='probe', dynamic_ncols=True)
+    batch_rows: List[Dict[str, Any]] = []
+
+    def _flush(rows: List[Dict[str, Any]]) -> None:
+        # Compress + embed one batch of probe queries, search the table for
+        # each, and fold the outcome into the shared counters above.
+        nonlocal probed
+        if not rows:
+            return
+        compressed = _resolve_compressed(
+            sampler, api, [r['query_raw'] for r in rows], RAG_QUERY_HINT)
+        # Rows whose query could not be compressed are not counted as probes.
+        useful = [(r, c) for r, c in zip(rows, compressed) if c]
+        if not useful:
+            return
+        anchor_emb = get_embeddings(
+            emb_model, emb_template, [c for _, c in useful], role='anchor')
+        for (r, _), vec in zip(useful, anchor_emb):
+            # One search per probe, fetching enough results for the largest k.
+            res = (
+                tbl.search(vec.astype(np.float32).tolist())
+                .metric('dot')
+                .limit(max(ks))
+                .select(['id', 'source'])
+                .to_list()
+            )
+            hit_ids = [item['id'] for item in res]
+            try:
+                # 0-based position of the gold row among the results.
+                rank = hit_ids.index(r['id'])
+            except ValueError:
+                rank = -1  # gold row not retrieved
+            for k in ks:
+                if 0 <= rank < k:
+                    hits[k] += 1
+                    per_source_hits.setdefault(r['source'], {kk: 0 for kk in ks})[k] += 1
+            per_source_total[r['source']] = per_source_total.get(r['source'], 0) + 1
+            # Make sure a source with zero hits still has an entry.
+            per_source_hits.setdefault(r['source'], {kk: 0 for kk in ks})
+            probed += 1
+        pbar.update(len(useful))
+
+    try:
+        for row in _stream_corpus(total=args.total, load_from_cache_file=not args.no_cache,
+                                  max_rows=args.max_rows):
+            # Stop once finished probes plus queued rows reach the budget.
+            if probed + len(batch_rows) >= args.eval_size:
+                break
+            rid = row.get('id') or ''
+            if not rid or rid not in indexed_ids:
+                continue
+            user_query, _ = _extract_query_cot(row)
+            # Queries shorter than MIN_TEXT_CHARS are not probed.
+            if not user_query or len(user_query) < MIN_TEXT_CHARS:
+                continue
+            batch_rows.append({
+                'id': rid,
+                'source': row.get('source') or 'unknown',
+                'query_raw': user_query,
+            })
+            if len(batch_rows) >= args.batch_size:
+                _flush(batch_rows)
+                batch_rows.clear()
+        # Final partial batch.
+        if batch_rows:
+            _flush(batch_rows)
+    finally:
+        pbar.close()
+
+    if probed == 0:
+        sys.stderr.write(
+            '[eval] no probed rows — index empty, queries too short, or '
+            'corpus exhausted before eval-size?\n')
+        return
+
+    print('\n=== Recall @ k (self-recall, gold present in index) ===')
+    print(f'probed = {probed}')
+    for k in ks:
+        print(f'  recall@{k:<3} = {hits[k]/probed:.4f}  ({hits[k]}/{probed})')
+
+    print('\n=== Per-source recall@10 ===')
+    for src in sorted(per_source_total):
+        tot = per_source_total[src]
+        h10 = per_source_hits.get(src, {}).get(10, 0)
+        print(f'  {src:<48s} {h10/tot:.4f}  ({h10}/{tot})')
+
+
+# ===========================================================================
+# CLI
+# ===========================================================================
+
+def parse_args() -> argparse.Namespace:
+    """Declare the command-line interface and parse ``sys.argv``.
+
+    The module docstring is used verbatim as the help description.
+    """
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    add = parser.add_argument
+
+    # Shared by build and eval.
+    add('--mode', choices=['build', 'eval', 'both'], default='build')
+    add('--db-path', default='./output/thinking_rag/lance.db',
+        help='LanceDB on-disk directory (persisted across runs).')
+    add('--table', default='thinking_traces',
+        help='LanceDB table name within --db-path.')
+    add('--total', type=int, default=0,
+        help='Total dataset rows to scale corpus to (0 = base sizes from the loader module).')
+    add('--dataset-module', default='dataset_index',
+        choices=['dataset_index', 'dataset_think'],
+        help='Which loader to use: dataset_index (RAG profile) or '
+             'dataset_think (training mix).')
+    add('--limit', type=int, default=0,
+        help='Stop building once this many rows are kept (0 = no cap).')
+    add('--max-rows', type=int, default=0,
+        help='Truncate corpus to this many rows AFTER get_dataset (0 = no cap). '
+             'Use this instead of --total to avoid invalidating the dataset cache.')
+    add('--batch-size', type=int, default=64,
+        help='Rows per condense+encode batch.')
+    add('--no-cache', action='store_true',
+        help='Disable load_from_cache_file in dataset_think.get_dataset.')
+    add('--overwrite', action='store_true',
+        help='Drop the table before build and start fresh.')
+    add('--skip-index', action='store_true',
+        help='Skip IVF_PQ index build at the end (debug).')
+    add('--misses-log', default='',
+        help='Path for filtered-row JSONL log (defaults to <db-path>/<table>.misses.jsonl).')
+
+    # Only consulted by the eval pipeline.
+    add('--eval-size', type=int, default=500,
+        help='Number of probes for self-recall evaluation.')
+    add('--top-k', type=int, default=10,
+        help='Largest k to report. Smaller ks (1, 5) are always reported.')
+
+    return parser.parse_args()
+
+
+def main() -> None:
+    """CLI entry point: set up the model stack once, then run build and/or eval."""
+    global _GET_DATASET
+
+    args = parse_args()
+    Path(args.db_path).mkdir(parents=True, exist_ok=True)
+
+    # The default loader stays in place unless the training mix is requested.
+    if args.dataset_module == 'dataset_think':
+        from dataset_think import get_dataset as think_loader
+        _GET_DATASET = think_loader
+    sys.stderr.write(f'[main] dataset loader: {args.dataset_module}\n')
+
+    # Build/eval both depend on the same Twinkle stack — initialize once.
+    sampler_mesh, emb_mesh = initialize_twinkle()
+    last_sampler_rank = SAMPLER_GPUS - 1
+    last_rank = NUM_GPUS - 1
+    sys.stderr.write(
+        '[main] twinkle initialized: '
+        f'sampler ranks 0-{last_sampler_rank} (TP={SAMPLER_GPUS}), '
+        f'emb_model ranks {SAMPLER_GPUS}-{last_rank} (DP={EMB_GPUS}).\n')
+
+    sys.stderr.write('[main] starting vLLM condenser sampler...\n')
+    sampler = build_sampler(sampler_mesh)
+    sys.stderr.write('[main] starting embedding TransformersModel...\n')
+    emb_model, emb_template = build_emb_model(emb_mesh)
+
+    api: Optional[OpenAIClient]
+    if not COMPRESS_API_KEY:
+        # Without a fallback client, failed local compressions are dropped.
+        sys.stderr.write(
+            '[main] WARNING: COMPRESS_API_KEY unset — truncated rows will be dropped.\n')
+        api = None
+    else:
+        api = OpenAIClient(
+            model=COMPRESS_API_MODEL,
+            api_key=COMPRESS_API_KEY,
+            base_url=COMPRESS_BASE_URL,
+        )
+
+    run_build = args.mode in ('build', 'both')
+    run_eval = args.mode in ('eval', 'both')
+    if run_build:
+        build_index(args, sampler, emb_model, emb_template, api)
+    if run_eval:
+        eval_recall(args, sampler, emb_model, emb_template, api)
+
+
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    main()
diff --git a/cookbook/exp/embedding/dataset_index.py b/cookbook/exp/embedding/dataset_index.py
new file mode 100644
index 000000000..7d2905a59
--- /dev/null
+++ b/cookbook/exp/embedding/dataset_index.py
@@ -0,0 +1,718 @@
+"""RAG-index corpus loader — abstract reasoning skills + textbook-style methods.
+
+Distinct from training-time ``dataset_think.py``. Optimizes for **abstraction
+density**, not raw coverage: every row should encode a transferable method,
+theorem, or solution pattern that downstream queries can retrieve as a
+"use-when-X-do-Y" recipe.
+
+Single-table design (``thinking_traces``); EMBED_QUERY_COT condense step in
+``build_thinking_rag_index`` homogenizes thinking-style and textbook-style
+content into the same retrieval form, so dual-table is unnecessary. The
+``source`` field carries the original dataset name for eval-time
+domain-bucket diagnostics.
+
+Output schema matches ``dataset_think.get_dataset()``: ``{id, source, messages}``
+with ``messages[1].reasoning_content`` carrying the CoT.
+
+Mix (≈3.6M rows base, 11 datasets):
+    Math thinking      23% — OpenMathReasoning + OpenR1-Math-220k + s1K-1.1
+    Code thinking      19% — OpenCodeReasoning-2 + codeforces-cots
+    Cross-domain R1    39% — Bespoke-Stratos + dolphin-r1 + reasoning-v1-20m
+                              + natural_reasoning
+    Textbook synth     17% — cosmopedia v1 (auto_math_text, chunked by H2)
+    Olympiad solutions <1% — Omni-MATH
+
+Dropped: camel-ai/{physics,chemistry,biology} (zip-only, no parquet/jsonl) and
+swift/stack-exchange-paired (dataset_infos.json/data layout mismatch); the
+textbook-density gap is covered by a larger cosmopedia slice.
+
+Textbook processors synthesize a question from the chapter heading and place
+the explanatory body into the ``cot`` field — embedding+condense reads
+``query | cot`` so the textbook prose becomes a retrievable method.
+
+Field extraction is defensive: each processor tries multiple plausible column
+names and silently drops rows that miss a usable signal. Inspect
+``dropped_index.jsonl`` after the first run to verify field-name guesses.
+"""
+import re
+from typing import Any, Dict, List, Optional
+
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.preprocessor import Preprocessor
+
+from dataset_think import _THINK_RE, _hash_id, _register, ToMessagesProcessor
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+# Sky-T1 / Bespoke-Stratos custom markers (used in place of <think>).
+# Each pattern captures, non-greedily, the text between its begin/end marker
+# pair; DOTALL lets the captured body span multiple lines.
+_BOT_RE = re.compile(
+    r'<\|begin_of_thought\|>(.*?)<\|end_of_thought\|>', re.DOTALL)
+_BOS_RE = re.compile(
+    r'<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>', re.DOTALL)
+
+# H2 heading split for cosmopedia-style markdown chunks.
+# MULTILINE anchors the match to whole lines; group(1) is the heading text
+# without trailing whitespace. Lines starting with '###' do not match because
+# whitespace is required right after the second '#'.
+_H2_RE = re.compile(r'^##\s+(.+?)\s*$', re.MULTILINE)
+
+
+def _split_think(text: str) -> tuple:
+    """Split ``text`` into ``(cot, response)`` around the ``_THINK_RE`` match.
+
+    ``cot`` is the first capture group of the match and ``response`` is
+    everything after the match, both stripped. With no match the whole text is
+    returned as the response and ``cot`` is empty; falsy input gives two empty
+    strings.
+    """
+    if not text:
+        return '', ''
+    found = _THINK_RE.search(text)
+    if found is None:
+        return '', text.strip()
+    reasoning = found.group(1).strip()
+    remainder = text[found.end():].strip()
+    return reasoning, remainder
+
+
+def _split_sky_t1(text: str) -> tuple:
+    """Extract ``(cot, response)`` from Sky-T1 / Bespoke-Stratos marked text.
+
+    ``cot`` is the body of the first thought block and ``response`` the body
+    of the first solution block, each stripped; a missing block yields ``''``.
+    """
+    if not text:
+        return '', ''
+
+    def _capture(pattern) -> str:
+        hit = pattern.search(text)
+        return hit.group(1).strip() if hit else ''
+
+    return _capture(_BOT_RE), _capture(_BOS_RE)
+
+
+def _from_messages(messages: Any) -> tuple:
+    """Return ``(first user text, first assistant text)`` from a chat list.
+
+    Accepts OpenAI-style (``role`` / ``content``) and ShareGPT-style
+    (``from`` / ``value``) entries. Non-dict entries and entries whose content
+    is not a string are skipped. Scanning stops at the first assistant entry,
+    so a user turn that comes after it is ignored. Anything that is not a list
+    yields two empty strings.
+    """
+    if not isinstance(messages, list):
+        return '', ''
+    user_roles = ('user', 'human')
+    bot_roles = ('assistant', 'gpt')
+    first_user = ''
+    for entry in messages:
+        if not isinstance(entry, dict):
+            continue
+        speaker = entry.get('role') or entry.get('from') or ''
+        body = entry.get('content') or entry.get('value') or ''
+        if not isinstance(body, str):
+            continue
+        if speaker in user_roles:
+            # Only the first user turn is kept; later ones are ignored.
+            if not first_user:
+                first_user = body.strip()
+        elif speaker in bot_roles:
+            return first_user, body.strip()
+    return first_user, ''
+
+
+def _chunk_by_h2(text: str, min_chars: int = 200, max_chars: int = 6000):
+    """Yield ``(title, body)`` for each ``## `` section of markdown ``text``.
+
+    A section body runs from the end of its heading line to the start of the
+    next heading (or the end of the text). Sections are emitted only when the
+    title is non-empty and the stripped body length is within
+    ``[min_chars, max_chars]``. Text before the first heading is not emitted.
+
+    When the text has no H2 heading at all, the whole stripped text is treated
+    as one candidate chunk, titled with the first line of its first 80
+    characters.
+    """
+    if not text:
+        return
+    headings = list(_H2_RE.finditer(text))
+    if not headings:
+        whole = text.strip()
+        first_line = whole[:80].splitlines()[0] if whole else ''
+        if first_line and min_chars <= len(whole) <= max_chars:
+            yield first_line, whole
+        return
+    # Each section ends where the next heading begins; the last runs to EOF.
+    boundaries = [h.start() for h in headings[1:]] + [len(text)]
+    for heading, stop in zip(headings, boundaries):
+        name = heading.group(1).strip()
+        section = text[heading.end():stop].strip()
+        if name and min_chars <= len(section) <= max_chars:
+            yield name, section
+
+
+# ===========================================================================
+# Math thinking
+# ===========================================================================
+
+OPEN_MATH_REASONING_REPO = 'ms://AI-ModelScope/OpenMathReasoning'
+
+
+class OpenMathReasoningProcessor(Preprocessor):
+    """OpenMathReasoning → ``{id, source, query, cot, response}``.
+
+    Expected columns: ``problem``, ``generated_solution`` (R1 trace with
+    ``<think>``) and ``expected_answer``; a few alternative column names are
+    tried as fallbacks. Per the dataset layout, the long-CoT portion is the
+    ``cot`` *split* (not a column), so TIR / genselect / additional_problems
+    are excluded at load time rather than per row.
+
+    Rows are dropped when the question or trace is empty, when the trace has
+    no think block, or when no final answer can be found.
+    """
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        records: List[Dict[str, Any]] = []
+        for item in self.map_col_to_row(rows):
+            question = (item.get('problem') or item.get('question') or '').strip()
+            trace = (item.get('generated_solution') or item.get('solution')
+                     or item.get('output') or '').strip()
+            if not (question and trace):
+                continue
+            cot, answer = _split_think(trace)
+            if not cot:
+                continue
+            # Nothing after the think block: fall back to the answer column.
+            answer = answer or (item.get('expected_answer') or item.get('answer') or '').strip()
+            if not answer:
+                continue
+            records.append({
+                'id': _hash_id('open_math_reasoning', f'{question}\n{answer}'),
+                'source': 'OpenMathReasoning',
+                'query': question,
+                'cot': cot,
+                'response': answer,
+            })
+        return self.map_row_to_col(records)
+
+
+OPEN_R1_MATH_REPO = 'ms://open-r1/OpenR1-Math-220k'
+
+
+class OpenR1MathProcessor(Preprocessor):
+    """OpenR1-Math-220k → ``{id, source, query, cot, response}``.
+
+    Expected columns: ``problem``, ``solution``, ``answer``, ``generations``
+    (list of R1 traces) and ``correctness_math_verify`` (parallel bool list).
+
+    Trace selection, in order:
+      1. the first non-empty generation whose verify flag is truthy (only when
+         the flag list has the same length as ``generations``);
+      2. otherwise the first non-empty generation, verified or not;
+      3. otherwise the ``solution`` column.
+
+    Rows are dropped when no trace is found, when the trace has no think
+    block, or when no final answer can be found.
+    """
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        records: List[Dict[str, Any]] = []
+        for item in self.map_col_to_row(rows):
+            question = (item.get('problem') or item.get('question') or '').strip()
+            if not question:
+                continue
+            candidates = item.get('generations')
+            flags = item.get('correctness_math_verify')
+            trace = ''
+            if isinstance(candidates, list):
+                flags_usable = isinstance(flags, list) and len(flags) == len(candidates)
+                if flags_usable:
+                    verified = (g for g, ok in zip(candidates, flags)
+                                if ok and isinstance(g, str) and g.strip())
+                    trace = next(verified, '').strip()
+                if not trace:
+                    any_text = (g for g in candidates if isinstance(g, str) and g.strip())
+                    trace = next(any_text, '').strip()
+            trace = trace or (item.get('solution') or '').strip()
+            if not trace:
+                continue
+            cot, answer = _split_think(trace)
+            if not cot:
+                continue
+            answer = answer or (item.get('answer') or '').strip()
+            if not answer:
+                continue
+            records.append({
+                'id': _hash_id('open_r1_math', f'{question}\n{answer}'),
+                'source': 'OpenR1-Math-220k',
+                'query': question,
+                'cot': cot,
+                'response': answer,
+            })
+        return self.map_row_to_col(records)
+
+
+S1K_REPO = 'ms://simplescaling/s1K-1.1'
+
+
+class S1KProcessor(Preprocessor):
+    """s1K-1.1 → ``{id, source, query, cot, response}``.
+
+    Expected columns: ``question``, ``deepseek_thinking_trajectory`` (or the
+    legacy ``thinking_trajectories``) and ``deepseek_attempt`` (final answer);
+    a few alternative column names are tried as fallbacks. A list-valued
+    trajectory is joined with blank lines, keeping only its string items.
+    The set is hand-curated, so no sub-sampling happens here; rows are dropped
+    only when the question, trace or answer is empty.
+    """
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        records: List[Dict[str, Any]] = []
+        for item in self.map_col_to_row(rows):
+            question = (item.get('question') or item.get('problem') or '').strip()
+            raw_trace = (item.get('deepseek_thinking_trajectory')
+                         or item.get('thinking_trajectories')
+                         or item.get('thinking') or '')
+            if isinstance(raw_trace, list):
+                raw_trace = '\n\n'.join(
+                    filter(lambda part: isinstance(part, str), raw_trace))
+            cot = (raw_trace or '').strip()
+            answer = (item.get('deepseek_attempt') or item.get('attempt')
+                      or item.get('answer') or item.get('solution') or '').strip()
+            if not all((question, cot, answer)):
+                continue
+            records.append({
+                'id': _hash_id('s1k', f'{question}\n{answer}'),
+                'source': 's1K-1.1',
+                'query': question,
+                'cot': cot,
+                'response': answer,
+            })
+        return self.map_row_to_col(records)
+
+
+# ===========================================================================
+# Code thinking
+# ===========================================================================
+
+OPEN_CODE_REASONING_REPO = 'ms://nv-community/OpenCodeReasoning-2'
+
+
+class OpenCodeReasoning2Processor(Preprocessor):
+    """OpenCodeReasoning-2 → ``{id, source, query, cot, response}``.
+
+    Expected columns: ``input`` / ``problem`` for the prompt and an R1-style
+    trace column. ``r1_generation`` is preferred, then ``reasoning_content``,
+    ``solution`` and ``output``.
+
+    Rows are dropped when the prompt is empty or the literal ``'-'``, when the
+    trace is empty or has no think block, or when no final answer can be
+    found.
+    """
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        records: List[Dict[str, Any]] = []
+        for item in self.map_col_to_row(rows):
+            question = (item.get('input') or item.get('problem')
+                        or item.get('question') or '').strip()
+            # OCR-2 'python' split ships dirty rows where question is literally '-';
+            # the real prompt is buried in r1_generation and not recoverable here.
+            if question in ('', '-'):
+                continue
+            trace = (item.get('r1_generation') or item.get('reasoning_content')
+                     or item.get('solution') or item.get('output') or '').strip()
+            if not trace:
+                continue
+            cot, answer = _split_think(trace)
+            if not cot:
+                continue
+            answer = answer or (item.get('expected_solution') or item.get('answer') or '').strip()
+            if not answer:
+                continue
+            records.append({
+                'id': _hash_id('opencode_reasoning2', f'{question}\n{answer}'),
+                'source': 'OpenCodeReasoning-2',
+                'query': question,
+                'cot': cot,
+                'response': answer,
+            })
+        return self.map_row_to_col(records)
+
+
+CODEFORCES_COTS_REPO = 'ms://open-r1/codeforces-cots'
+
+
+class CodeforcesCotsProcessor(Preprocessor):
+    """Normalize codeforces-cots rows into ``{id, source, query, cot, response}``.
+
+    The statement is read from ``description``/``problem``/``input``/``question``
+    and the R1 trace from ``generation``/``solution``/``output``; the trace is
+    split by ``_split_think`` and rows missing either half are skipped.
+    """
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        kept: List[Dict[str, Any]] = []
+        for rec in self.map_col_to_row(rows):
+            statement = (rec.get('description') or rec.get('problem')
+                         or rec.get('input') or rec.get('question') or '').strip()
+            trace = (rec.get('generation') or rec.get('solution')
+                     or rec.get('output') or '').strip()
+            if not (statement and trace):
+                continue
+            thought, code = _split_think(trace)
+            # Both the reasoning and the final answer must be present.
+            if not (thought and code):
+                continue
+            kept.append({
+                'id': _hash_id('codeforces_cots', f'{statement}\n{code}'),
+                'source': 'codeforces-cots',
+                'query': statement,
+                'cot': thought,
+                'response': code,
+            })
+        return self.map_row_to_col(kept)
+
+
+# ===========================================================================
+# Cross-domain R1
+# ===========================================================================
+
+BESPOKE_STRATOS_REPO = 'ms://bespokelabs/Bespoke-Stratos-17k'
+
+
+class BespokeStratosProcessor(Preprocessor):
+    """Normalize Bespoke-Stratos-17k rows into ``{id, source, query, cot, response}``.
+
+    Rows carry ShareGPT-style ``conversations`` (or ``messages``). The assistant
+    text is split on the Sky-T1 ``<|begin_of_thought|>``/``<|begin_of_solution|>``
+    markers first, then via ``_split_think`` when that yields no CoT.
+    """
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        kept: List[Dict[str, Any]] = []
+        for rec in self.map_col_to_row(rows):
+            question, reply = _from_messages(
+                rec.get('conversations') or rec.get('messages'))
+            if not (question and reply):
+                continue
+            thought, final = _split_sky_t1(reply)
+            if not thought:
+                # Sky-T1 markers gave no CoT: retry with the generic splitter.
+                thought, final = _split_think(reply)
+            if not (thought and final):
+                continue
+            kept.append({
+                'id': _hash_id('bespoke_stratos', f'{question}\n{final}'),
+                'source': 'Bespoke-Stratos-17k',
+                'query': question,
+                'cot': thought,
+                'response': final,
+            })
+        return self.map_row_to_col(kept)
+
+
+DOLPHIN_R1_REPO = 'ms://AI-ModelScope/dolphin-r1'
+
+
+class DolphinR1Processor(Preprocessor):
+    """Normalize dolphin-r1 rows into ``{id, source, query, cot, response}``.
+
+    The reasoning-deepseek subset stores ``messages=[system, user]`` without an
+    assistant turn, plus flat ``reasoning`` (CoT), ``answer`` (final response)
+    and ``model`` columns. The user turn supplies the query; when either flat
+    column is empty, an assistant turn split by ``_split_think`` fills the gap.
+    """
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        kept: List[Dict[str, Any]] = []
+        for rec in self.map_col_to_row(rows):
+            turns = rec.get('messages') or rec.get('conversations')
+            has_turns = isinstance(turns, list)
+            question = ''
+            for turn in (turns if has_turns else ()):
+                if not isinstance(turn, dict):
+                    continue
+                speaker = turn.get('role') or turn.get('from') or ''
+                text = turn.get('content') or turn.get('value') or ''
+                # No break: when several user turns exist, the last one wins.
+                if speaker in ('user', 'human') and isinstance(text, str):
+                    question = text.strip()
+            thought = (rec.get('reasoning') or rec.get('reasoning_content') or '').strip()
+            final = (rec.get('answer') or '').strip()
+            if has_turns and not (thought and final):
+                _, reply = _from_messages(turns)
+                if reply:
+                    embedded_thought, embedded_final = _split_think(reply)
+                    if embedded_thought:
+                        thought = thought or embedded_thought
+                        final = final or embedded_final or reply
+            if not (question and thought and final):
+                continue
+            kept.append({
+                'id': _hash_id('dolphin_r1', f'{question}\n{final}'),
+                'source': 'dolphin-r1',
+                'query': question,
+                'cot': thought,
+                'response': final,
+            })
+        return self.map_row_to_col(kept)
+
+
+GLAIVE_REASONING_REPO = 'ms://glaiveai/reasoning-v1-20m'
+
+
+class GlaiveReasoningProcessor(Preprocessor):
+    """Normalize reasoning-v1-20m rows into ``{id, source, query, cot, response}``.
+
+    ``prompt`` is the query; ``response`` is an R1 trace split by ``_split_think``.
+    Largest cross-domain corpus in the mix; downsample aggressively.
+    """
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        kept: List[Dict[str, Any]] = []
+        for rec in self.map_col_to_row(rows):
+            question = (rec.get('prompt') or rec.get('question')
+                        or rec.get('input') or '').strip()
+            trace = (rec.get('response') or rec.get('output')
+                     or rec.get('answer') or '').strip()
+            if not (question and trace):
+                continue
+            thought, final = _split_think(trace)
+            # Skip rows whose trace lacks either the CoT or the final answer.
+            if not (thought and final):
+                continue
+            kept.append({
+                'id': _hash_id('glaive_reasoning', f'{question}\n{final}'),
+                'source': 'reasoning-v1-20m',
+                'query': question,
+                'cot': thought,
+                'response': final,
+            })
+        return self.map_row_to_col(kept)
+
+
+NATURAL_REASONING_REPO = 'ms://facebook/natural_reasoning'
+
+
+class NaturalReasoningProcessor(Preprocessor):
+    """Normalize natural_reasoning rows into ``{id, source, query, cot, response}``.
+
+    Columns: ``question``, ``reference_answer`` and ``responses=[{response_model,
+    response}]``. A ``responses[i].response`` text is itself the step-by-step
+    CoT (``## Step 1...## Step 2...``); no separate ``reasoning`` key exists.
+    ``responses[i].response`` → cot, ``reference_answer`` → response. Rows
+    with an empty ``reference_answer`` (~18% per README) are dropped.
+    """
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        kept: List[Dict[str, Any]] = []
+        for rec in self.map_col_to_row(rows):
+            question = (rec.get('question') or '').strip()
+            if not question:
+                continue
+            # CoT: the first non-empty text among the ``responses`` entries.
+            thought = ''
+            candidates = rec.get('responses')
+            for cand in (candidates if isinstance(candidates, list) else ()):
+                if isinstance(cand, dict):
+                    thought = (cand.get('response') or cand.get('reasoning')
+                               or cand.get('thinking') or cand.get('answer') or '').strip()
+                    if thought:
+                        break
+            # Otherwise fall back to flat row-level columns.
+            if not thought:
+                thought = (rec.get('reasoning') or rec.get('thinking')
+                           or rec.get('response') or '').strip()
+            # The row-level reference answer is the final response; rows that
+            # end up without a CoT or without an answer are dropped below.
+            final = (rec.get('reference_answer') or rec.get('answer') or '').strip()
+            if not (thought and final):
+                continue
+            kept.append({
+                'id': _hash_id('natural_reasoning', f'{question}\n{final}'),
+                'source': 'natural_reasoning',
+                'query': question,
+                'cot': thought,
+                'response': final,
+            })
+        return self.map_row_to_col(kept)
+
+
+# ===========================================================================
+# Textbook-style — Cosmopedia query = heading + lead paragraph; rest of body → cot
+# ===========================================================================
+
+COSMOPEDIA_REPO = 'ms://HuggingFaceTB/cosmopedia'
+
+class CosmopediaProcessor(Preprocessor):
+    """cosmopedia v1 → ``{id, source, query, cot, response}``.
+
+    Schema: ``prompt`` (writing instruction), ``text`` (full chapter body),
+    ``format``/``audience``/``seed_data``; the subset (``auto_math_text``) is
+    chosen at load time. Each ``_chunk_by_h2`` chunk yields one row: heading +
+    lead paragraph → ``query``, remaining body → ``cot``, ``response`` empty.
+    Chunks whose lead paragraph or remainder is under 256 chars are skipped.
+    """
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        rows = self.map_col_to_row(rows)
+        out: List[Dict[str, Any]] = []
+        for row in rows:
+            text = (row.get('text') or row.get('content') or '').strip()
+            if not text:
+                continue
+            for title, body in _chunk_by_h2(text):
+                # A heading-only query ("Explain: X") was only 1-2 tokens and could
+                # not be aligned with a full-section cot, so the section's lead
+                # paragraph is promoted into the query to give the anchor real content.
+                parts = body.split('\n\n', 1)
+                first_para = parts[0].strip()
+                rest = parts[1].strip() if len(parts) > 1 else ''
+                if len(first_para) < 256 or len(rest) < 256:
+                    continue
+                query = f'{title}\n\n{first_para}' if title else first_para
+                out.append({
+                    'id': _hash_id('cosmopedia', f'{title}\n{first_para[:200]}'),
+                    'source': 'cosmopedia-v1',
+                    'query': query,
+                    'cot': rest,
+                    'response': '',
+                })
+        return self.map_row_to_col(out)
+
+
+OMNI_MATH_REPO = 'ms://AI-ModelScope/Omni-MATH'
+
+
+class OmniMathProcessor(Preprocessor):
+    """Normalize Omni-MATH rows into ``{id, source, query, cot, response}``.
+
+    Columns: ``problem``, ``solution`` (full proof), ``answer``, ``domain`` and
+    ``difficulty``. The solution body becomes ``cot`` and the answer becomes
+    ``response``; the id hashes the problem plus the first 200 solution chars.
+    """
+
+    def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+        kept: List[Dict[str, Any]] = []
+        for rec in self.map_col_to_row(rows):
+            problem = (rec.get('problem') or rec.get('question') or '').strip()
+            proof = (rec.get('solution') or '').strip()
+            final = (rec.get('answer') or rec.get('expected_answer') or '').strip()
+            # Only problem and proof are mandatory; the answer may be empty.
+            if not (problem and proof):
+                continue
+            kept.append({
+                'id': _hash_id('omni_math', f'{problem}\n{proof[:200]}'),
+                'source': 'Omni-MATH',
+                'query': problem,
+                'cot': proof,
+                'response': final,
+            })
+        return self.map_row_to_col(kept)
+
+
+# ===========================================================================
+# Mix configuration — base sizes target ≈3.6M total rows
+# ===========================================================================
+
+_BASE_SIZES = {
+    'open_math_reasoning': 600_000,
+    'open_r1_math': 220_000,
+    's1k': 1_000,
+    'opencode_reasoning2': 500_000,
+    'codeforces_cots': 200_000,
+    'bespoke_stratos': 17_000,
+    'dolphin_r1': 400_000,
+    'glaive_reasoning': 800_000,
+    'natural_reasoning': 200_000,
+    'cosmopedia': 700_000,
+    'omni_math': 4_000,
+}
+
+
+def _scaled_sizes(total: Optional[int]) -> Dict[str, int]:
+    if total is not None and total > 0:
+        factor = total / sum(_BASE_SIZES.values())
+        return {name: max(1, round(count * factor)) for name, count in _BASE_SIZES.items()}
+    return dict(_BASE_SIZES)
+
+
+def _build_dataset(total: Optional[int] = None,
+                   load_from_cache_file: bool = True) -> Dataset:
+    sizes = _scaled_sizes(total)
+    dataset = Dataset()
+    # One _register call per source; sizes[...] sets data_slice where a cap applies.
+    _register(dataset, OpenMathReasoningProcessor,
+              DatasetMeta(dataset_id=OPEN_MATH_REASONING_REPO, split='cot',
+                          data_slice=range(sizes['open_math_reasoning'])),
+              load_from_cache_file=load_from_cache_file)
+
+    _register(dataset, OpenR1MathProcessor,
+              DatasetMeta(dataset_id=OPEN_R1_MATH_REPO, split='train',
+                          data_slice=range(sizes['open_r1_math'])),
+              load_from_cache_file=load_from_cache_file)
+    # Registered without data_slice, so sizes['s1k'] (and thus ``total``) is unused.
+    _register(dataset, S1KProcessor,
+              DatasetMeta(dataset_id=S1K_REPO, split='train'),
+              load_from_cache_file=load_from_cache_file)
+
+    _register(dataset, OpenCodeReasoning2Processor,
+              DatasetMeta(dataset_id=OPEN_CODE_REASONING_REPO,
+                          subset_name='train', split='python',
+                          data_slice=range(sizes['opencode_reasoning2'])),
+              load_from_cache_file=load_from_cache_file)
+
+    _register(dataset, CodeforcesCotsProcessor,
+              DatasetMeta(dataset_id=CODEFORCES_COTS_REPO,
+                          subset_name='solutions_w_editorials_decontaminated',
+                          split='train',
+                          data_slice=range(sizes['codeforces_cots'])),
+              load_from_cache_file=load_from_cache_file)
+    # No data_slice: sizes['bespoke_stratos'] is not applied to this source.
+    _register(dataset, BespokeStratosProcessor,
+              DatasetMeta(dataset_id=BESPOKE_STRATOS_REPO, split='train'),
+              load_from_cache_file=load_from_cache_file)
+
+    _register(dataset, DolphinR1Processor,
+              DatasetMeta(dataset_id=DOLPHIN_R1_REPO,
+                          subset_name='reasoning-deepseek', split='train',
+                          data_slice=range(sizes['dolphin_r1'])),
+              load_from_cache_file=load_from_cache_file)
+
+    _register(dataset, GlaiveReasoningProcessor,
+              DatasetMeta(dataset_id=GLAIVE_REASONING_REPO, split='train',
+                          data_slice=range(sizes['glaive_reasoning'])),
+              load_from_cache_file=load_from_cache_file)
+
+    _register(dataset, NaturalReasoningProcessor,
+              DatasetMeta(dataset_id=NATURAL_REASONING_REPO, split='train',
+                          data_slice=range(sizes['natural_reasoning'])),
+              load_from_cache_file=load_from_cache_file)
+
+    _register(dataset, CosmopediaProcessor,
+              DatasetMeta(dataset_id=COSMOPEDIA_REPO,
+                          subset_name='auto_math_text', split='train',
+                          data_slice=range(sizes['cosmopedia'])),
+              load_from_cache_file=load_from_cache_file)
+    # No data_slice: sizes['omni_math'] is not applied to this source.
+    _register(dataset, OmniMathProcessor,
+              DatasetMeta(dataset_id=OMNI_MATH_REPO, split='test'),
+              load_from_cache_file=load_from_cache_file)
+
+    dataset.mix_dataset(False)
+    # Mix is concatenated in registration order; shuffle so the streaming
+    # consumer sees all sources interleaved instead of 600k OpenMathReasoning
+    # rows before it ever reaches code/textbook splits.
+    dataset.dataset = dataset.dataset.shuffle(seed=42)
+    return dataset
+
+
+def get_dataset(total: Optional[int] = None,
+                dropped_log: Optional[str] = None,
+                load_from_cache_file: bool = True) -> Dataset:
+    """Build, convert to messages, and quality-filter the RAG-index corpus.
+
+    Mirrors ``dataset_think.get_dataset`` (same signature and output schema).
+    ``dropped_log`` is handed to ``QualityPreprocessor`` as ``dropped_log_path``.
+    """
+    from twinkle_agentic.preprocessor import (
+        DeadLoopFilter,
+        FixUnicodeFilter,
+        HardFilter,
+        MessageSanityFilter,
+        QualityPreprocessor,
+        RefuseFilter,
+        RemoveRepeatSentencesFilter,
+        TokenNumFilter,
+        TokenSoupFilter,
+    )
+    # The filter classes above are imported lazily, inside the function body.
+    dataset = _build_dataset(total=total, load_from_cache_file=load_from_cache_file)
+    # Keep only rows whose stripped query has at least 100 characters: shorter
+    # ones (one-line math problems, OmniMath stubs) embed poorly as anchors.
+    dataset.dataset = dataset.dataset.filter(
+        lambda x: len((x.get('query') or '').strip()) >= 100,
+        num_proc=32, load_from_cache_file=load_from_cache_file)
+    dataset.map(ToMessagesProcessor(), remove_columns=['query', 'cot', 'response'],
+                load_from_cache_file=load_from_cache_file)
+    qp = QualityPreprocessor(
+        pipeline=[
+            HardFilter(),
+            RefuseFilter(),
+            DeadLoopFilter(),
+            TokenSoupFilter(),
+            MessageSanityFilter(min_turns=1, max_msg_chars=200000),
+            FixUnicodeFilter(),
+            RemoveRepeatSentencesFilter(),
+            TokenNumFilter(max_num=32768),
+        ],
+        dropped_log_path=dropped_log or '',
+    )
+    dataset.map(qp, num_proc=32, load_from_cache_file=load_from_cache_file)
+    return dataset
+
+
+if __name__ == '__main__':
+    import os
+    dropped_log = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'dropped_index.jsonl')
+    if os.path.exists(dropped_log):
+        os.remove(dropped_log)
+    # Hand the cleared log path on so QualityPreprocessor receives it as dropped_log_path.
+    dataset = get_dataset(dropped_log=dropped_log, load_from_cache_file=False)
+    print(len(dataset))
diff --git a/cookbook/exp/embedding/train_embedding_full_ddp.py b/cookbook/exp/embedding/train_embedding_full_ddp.py
index 492e29aae..5db9f786c 100644
--- a/cookbook/exp/embedding/train_embedding_full_ddp.py
+++ b/cookbook/exp/embedding/train_embedding_full_ddp.py
@@ -1,14 +1,12 @@
-"""LoRA embedding training with online condenser self-improvement.
+"""LoRA embedding training with online compression via frozen vLLM condenser.
 
 Architecture (8 GPUs total):
   - Ranks 0-3 (``model``): Trainable embedding model with LoRA, InfoNCE loss.
-  - Ranks 4-5 (``condenser_sampler``): Frozen vLLM condenser for online compression.
-  - Ranks 6-7 (``condenser_model``): Trainable condenser with LoRA for self-improvement.
+  - Ranks 4-7 (``condenser_sampler``): Frozen vLLM condenser for online compression.
 
-When the condenser sampler truncates (stop_reason='length'), an external OpenAI-
-compatible API produces the correct compression. The failure is logged as SFT
-training data. A background thread retrains the condenser on accumulated failures
-mixed with condense_300K, then syncs weights back to the sampler.
+When the condenser sampler truncates or regresses to the legacy schema, an
+external OpenAI-compatible API produces the correct compression. The failure is
+logged to failures.jsonl for offline SFT data regeneration.
 
 Launch:
     python cookbook/exp/train_embedding_lora_ddp.py
@@ -19,6 +17,7 @@
 import re
 import sys
 import threading
+import time
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 from typing import Any, Dict, List, Literal, Optional
@@ -27,7 +26,6 @@
 
 import twinkle
 from twinkle import DeviceGroup, DeviceMesh, get_device_placement, get_logger
-from twinkle.checkpoint_engine import CheckpointEngineManager
 from twinkle.data_format import SamplingParams
 from twinkle.dataloader import DataLoader
 from twinkle.loss import InfonceLoss
@@ -35,12 +33,13 @@
 from twinkle.model import TransformersModel
 from twinkle.processor import InputProcessor
 from twinkle.sampler import vLLMSampler
-from twinkle.template import Template
+from twinkle.template import Qwen3_5Template, Template
 from twinkle.utils.parallel import PosixFileLock
 from twinkle_agentic.protocol.openai import OpenAI as OpenAIClient
 
 sys.path.insert(0, str(Path(__file__).resolve().parent))
-from dataset_think import get_dataset  # noqa: E402
+from dataset_think import get_dataset as get_dataset_think  # noqa: E402
+from dataset_index import get_dataset as get_dataset_index  # noqa: E402
 
 logger = get_logger()
 
@@ -54,29 +53,33 @@
 
 # -- GPU placement (8 total) --------------------------------------------------
 MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
-CONDENSER_SAMPLER_GPUS = int(os.environ.get('CONDENSER_SAMPLER_GPUS', 2))
-CONDENSER_MODEL_GPUS = int(os.environ.get('CONDENSER_MODEL_GPUS', 2))
-NUM_GPUS = MODEL_GPUS + CONDENSER_SAMPLER_GPUS + CONDENSER_MODEL_GPUS
+CONDENSER_SAMPLER_GPUS = int(os.environ.get('CONDENSER_SAMPLER_GPUS', 4))
+NUM_GPUS = MODEL_GPUS + CONDENSER_SAMPLER_GPUS
 
 # -- Embedding training hyper-params ------------------------------------------
 EMB_MAX_LENGTH = 8192
 HARD_NEGATIVES = None
-TEMPERATURE = 0.03
+# 0.07 keeps gradient on diag pairs until cosine clears ~0.75; 0.03 saturated near 0.40.
+TEMPERATURE = 0.07
 
 BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 32))
-LEARNING_RATE = 1.5e-6
+LEARNING_RATE = 1e-5
 GRADIENT_ACCUMULATION_STEPS = 1
 LOG_INTERVAL = 2
-SAVE_INTERVAL = 4000
-NUM_EPOCHS = 2
+SAVE_INTERVAL = 2000
+NUM_EPOCHS = 1
 
 TOTAL_SAMPLES: Optional[int] = None
+# Post-build caps on each loader (None = no cap). Applied via .select() before mix.
+THINK_CAP: Optional[int] = 400_000
+INDEX_CAP: Optional[int] = 400_000
+MIX_SHUFFLE_SEED = 42
 
 # -- Resume from checkpoint ---------------------------------------------------
-RESUME_CHECKPOINT = os.environ.get(
-    'RESUME_CHECKPOINT',
-    './output/embedding_lora_transformers/step_16000')
-RESUME_STEP = int(os.environ.get('RESUME_STEP', 16000))
+# Empty by default — build_model falls back to MODEL_ID (the published emb model).
+# Set both to point at a local in-progress run only when resuming the *same* schedule.
+RESUME_CHECKPOINT = os.environ.get('RESUME_CHECKPOINT', '')
+RESUME_STEP = int(os.environ.get('RESUME_STEP', 0))
 
 # -- Online-compression knobs -------------------------------------------------
 # Below this length, condenser fabricates content for open-ended short prompts;
@@ -87,16 +90,18 @@
 COMPRESS_TOP_P = 0.5
 COMPRESS_MAX_MODEL_LEN = 32768
 
+# How many BATCH_SIZE chunks to fetch and compress in one vLLM call.
+PREFETCH_BATCH_MULTIPLIER = int(os.environ.get('PREFETCH_BATCH_MULTIPLIER', 8))
+
 # -- OpenAI API fallback for truncated compressions ---------------------------
 COMPRESS_API_KEY = os.environ.get('COMPRESS_API_KEY', '')
 COMPRESS_BASE_URL = os.environ.get('COMPRESS_BASE_URL', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
 COMPRESS_MODEL = os.environ.get('COMPRESS_MODEL', 'qwen3.7-max')
-
-# -- Condenser retraining knobs -----------------------------------------------
-CONDENSER_DATASET_ID = 'ms://twinkle-kit/condense_300K'
-CONDENSER_RETRAIN_SAMPLES = 128
-CONDENSER_RETRAIN_EPOCHS = 3
-CONDENSER_RETRAIN_LR = 1e-5
+# Minimum gap between API calls (seconds); bounds dashscope qps under provider limits.
+API_MIN_INTERVAL = float(os.environ.get('API_MIN_INTERVAL', 0.1))
+API_CONCURRENCY = int(os.environ.get('API_CONCURRENCY', 8))
+# vLLM sampler timeout (seconds); if a sample() call exceeds this, fall back to API.
+SAMPLER_TIMEOUT = float(os.environ.get('SAMPLER_TIMEOUT', 300))
 
 # -- Output paths -------------------------------------------------------------
 OUTPUT_DIR = f'./output/embedding_lora_{BACKEND}'
@@ -204,6 +209,17 @@
 _sample_counter = 0
 _sample_counter_lock = threading.Lock()
 
+_api_throttle_lock = threading.Lock()
+_api_last_call = [0.0]
+
+
+def _api_throttle():
+    with _api_throttle_lock:  # held across the sleep, so callers are paced one by one
+        wait = API_MIN_INTERVAL - (time.monotonic() - _api_last_call[0])
+        if wait > 0:
+            time.sleep(wait)
+        _api_last_call[0] = time.monotonic()
+
 
 def _next_sample_id() -> int:
     global _sample_counter
@@ -314,11 +330,40 @@ def save_checkpoint(model, name: str):
 # Compression prompt building
 # =============================================================================
 
+# Hard-templated hints: the condenser SFT prior maps `Skill` to the legacy
+# `Use when: / numbered steps / Output:` skeleton on long inputs; embedding the
+# exact 4-line body template + explicit negative constraints is the only way to
+# override it deterministically across query and cot sides.
 EMBED_QUERY_Q = (
+    'Summarize this query for retrieval. '
+    'The body of ## Summary MUST follow this EXACT 4-line template — '
+    'do NOT emit "Use when:", numbered procedure steps, or "Output:":\n'
+    'Topic: <specific pattern name — scope>\n'
+    'Problem: <what concrete problem is being asked>\n'
+    'Skill: <which specific method/technique/pattern is required to solve it>\n'
+    'Knowledge: <which domains/concepts/facts must be invoked>\n'
+    'Then emit the mandatory ## More section as usual. '
+    'Topic must name the specific pattern, never generic labels.')
+EMBED_QUERY_COT = (
+    'Summarize this reasoning trace for retrieval. '
+    'The body of ## Summary MUST follow this EXACT 4-line template — '
+    'do NOT emit "Use when:", numbered procedure steps, or "Output:":\n'
+    'Topic: <specific pattern name — scope>\n'
+    'Problem: <what concrete problem this trace tackled>\n'
+    'Skill: <which specific method/technique/pattern was applied>\n'
+    'Knowledge: <which domains/concepts/facts were used>\n'
+    'Then emit the mandatory ## More section as usual. '
+    'Topic must name the specific pattern, never generic labels.')
+
+# Legacy schema (Use when: / numbered steps / Output:) — mixed in 50/50 with the
+# new schema to expose the embedder to schema-invariant semantic alignment.
+# Both query and cot of the SAME pair always use the SAME schema; cross-schema
+# anchors and positives would re-introduce the schema asymmetry we just fixed.
+EMBED_QUERY_Q_LEGACY = (
     'What problem does this passage address, and what skill or method is needed? '
     'Topic must name the specific pattern, never generic labels. '
     'Compress into a retrieval-friendly need description.')
-EMBED_QUERY_COT = (
+EMBED_QUERY_COT_LEGACY = (
     'Extract the reusable skill: trigger conditions, key steps, and expected output. '
     'Topic names the method/pattern; format as "Use when: ...", numbered steps, '
     '"Output: ...". Compress into a standardized procedure for retrieval.')
@@ -342,45 +387,59 @@ def _extract_query_cot(row: Dict[str, Any]):
 def _build_compress_prompts(rows: List[Dict[str, Any]]) -> tuple:
     """Build prompts for compressing both query and cot per row.
 
-    Returns (prompts, valid_indices, raw_pairs, prompt_queries, passthrough) where:
+    Returns (prompts, valid_indices, raw_pairs, prompt_queries, passthrough, schemas)
+    where:
     - prompts: flat-interleaved [query_0, cot_0, query_1, cot_1, ...]; ``None`` means
       passthrough (use raw text directly, do not call sampler)
     - valid_indices: which rows passed the min-length filter
     - raw_pairs: [(query, cot), ...]
     - prompt_queries: the query string used for each prompt (for failure logging)
     - passthrough: parallel to prompts; non-None text means "use this verbatim as qc"
+    - schemas: parallel to prompts; 'new' or 'legacy', drives validator branch
     """
     prompts: List[Optional[Dict[str, Any]]] = []
     valid_indices: List[int] = []
     raw_pairs: List[tuple] = []
     prompt_queries: List[str] = []
     passthrough: List[Optional[str]] = []
+    schemas: List[str] = []
+    # Conservative char budget: 32768 max_length - 8192 gen - ~2k prompt overhead = ~22k tokens.
+    # 30k cap bounds vLLM batch latency (vLLM batches by max prompt length).
+    _MAX_COT_CHARS = 30_000
     for i, row in enumerate(rows):
         query, cot = _extract_query_cot(row)
         if not query or len(cot) < MIN_TEXT_CHARS:
             continue
+        if len(cot) > _MAX_COT_CHARS:
+            continue
         valid_indices.append(i)
         raw_pairs.append((query, cot))
+        # 50/50 schema mix; same schema for query+cot of one pair to keep alignment.
+        schema = 'legacy' if (i % 2 == 0) else 'new'
+        q_hint = EMBED_QUERY_Q_LEGACY if schema == 'legacy' else EMBED_QUERY_Q
+        c_hint = EMBED_QUERY_COT_LEGACY if schema == 'legacy' else EMBED_QUERY_COT
         # Short query bypasses condenser to avoid skeleton-induced hallucination.
         if len(query) < MIN_TEXT_CHARS:
             prompts.append(None)
             passthrough.append(query)
         else:
-            user = COMPRESS_USER.format(query=EMBED_QUERY_Q, text=query)
+            user = COMPRESS_USER.format(query=q_hint, text=query)
             prompts.append({'messages': [
                 {'role': 'system', 'content': COMPRESS_SYSTEM},
                 {'role': 'user', 'content': user},
             ]})
             passthrough.append(None)
-        prompt_queries.append(EMBED_QUERY_Q)
-        user = COMPRESS_USER.format(query=EMBED_QUERY_COT, text=cot)
+        prompt_queries.append(q_hint)
+        schemas.append(schema)
+        user = COMPRESS_USER.format(query=c_hint, text=cot)
         prompts.append({'messages': [
             {'role': 'system', 'content': COMPRESS_SYSTEM},
             {'role': 'user', 'content': user},
         ]})
-        prompt_queries.append(EMBED_QUERY_COT)
+        prompt_queries.append(c_hint)
         passthrough.append(None)
-    return prompts, valid_indices, raw_pairs, prompt_queries, passthrough
+        schemas.append(schema)
+    return prompts, valid_indices, raw_pairs, prompt_queries, passthrough, schemas
 
 
 def _get_first_feature(decoded_text: str, template: Template, role: str) -> Optional[Dict[str, Any]]:
@@ -405,13 +464,24 @@ def _get_first_feature(decoded_text: str, template: Template, role: str) -> Opti
 # OpenAI API fallback
 # =============================================================================
 
-def _is_truncated_compression(text: str) -> bool:
-    """Detect structurally incomplete output that vLLM may report as stop_reason='stop'.
+_LEGACY_USE_WHEN_RE = re.compile(r'(?im)^\s*Use when\s*:')
+_SCHEMA_MARKERS = ('Problem:', 'Skill:', 'Knowledge:')
+
+
+def _is_truncated_compression(text: str, schema: str = 'new') -> bool:
+    """Reject structurally incomplete OR schema-regressed condenser output.
 
-    The condenser sometimes emits a chat-template token mid-skeleton (which we then
-    strip), so the visible text ends mid-sentence even though stop_reason!='length'.
-    The COMPRESS_SYSTEM skeleton mandates a `## More` section ending in a bullet list;
-    its absence is an unambiguous truncation signal.
+    Triggers API fallback when the vLLM output:
+      * lacks ``## Summary`` / ``## More``,
+      * has an empty or unterminated ``## More`` bullet list, or
+      * (schema='new' only) regresses to the legacy ``Use when: / numbered-steps /
+        Output:`` skeleton instead of the mandated Problem/Skill/Knowledge 4-line
+        body — the dominant cot-side failure mode that drives sim < 0.45 drops on
+        the RAG index.
+
+    For schema='legacy', body markers are intentionally NOT enforced: the legacy
+    template legitimately emits ``Use when:`` and the SFT prior already produces
+    that shape natively, so only structural completeness is checked.
     """
     if not text or not text.strip():
         return True
@@ -423,11 +493,18 @@ def _is_truncated_compression(text: str) -> bool:
     last_line = after_more.splitlines()[-1].strip()
     if not (last_line.startswith('-') or last_line.endswith(')')):
         return True
+    if schema == 'new':
+        summary_body = text.split('## Summary', 1)[1].split('## More', 1)[0]
+        if _LEGACY_USE_WHEN_RE.search(summary_body):
+            return True
+        if not all(marker in summary_body for marker in _SCHEMA_MARKERS):
+            return True
     return False
 
 
 def _api_compress(api_client: OpenAIClient, prompt: Dict[str, Any]) -> Optional[str]:
     """Call external API to compress when vLLM truncates."""
+    _api_throttle()
     trajectory = {'messages': prompt['messages']}
     # Cap max_tokens to leave ample prompt headroom inside the API model context.
     sp = SamplingParams(temperature=0.2, max_tokens=8192)
@@ -446,61 +523,12 @@ def _api_compress(api_client: OpenAIClient, prompt: Dict[str, Any]) -> Optional[
     return content
 
 
-# =============================================================================
-# Condenser Retrainer (background thread)
-# =============================================================================
-
-class CondenserRetrainer:
-    """Async condenser self-improvement: retrains from failures, syncs to sampler."""
-
-    def __init__(self, condenser_model, ckpt_manager: CheckpointEngineManager,
-                 condenser_sampler):
-        self._model = condenser_model
-        self._ckpt_manager = ckpt_manager
-        self._sampler = condenser_sampler
-        self._signal = threading.Event()
-        self._stop = threading.Event()
-        self._thread = threading.Thread(target=self._loop, daemon=True)
-        self._condense_300k_cache = None
-        self._retrain_count = 0
-        # Prevents sample() and sync_weights() from running concurrently
-        self.sampler_lock = threading.Lock()
-
-    def start(self):
-        self._thread.start()
-
-    def stop(self):
-        self._stop.set()
-        self._signal.set()
-        self._thread.join(timeout=10)
-
-    def notify_failure(self):
-        self._signal.set()
-
-    def _loop(self):
-        while not self._stop.is_set():
-            self._signal.wait(timeout=60)
-            if self._stop.is_set():
-                break
-            if not self._signal.is_set():
-                continue
-            self._signal.clear()
-            try:
-                self._retrain_and_sync()
-            except Exception as exc:
-                logger.error(f'[condenser_retrain] crashed: {exc}')
-
-    def _retrain_and_sync(self):
-        # Retrain + sync temporarily disabled; failures.jsonl is written directly by _log_failure.
-        pass
-
-
 # =============================================================================
 # Main training
 # =============================================================================
 
 def train():
-    # -------- Device groups (3 groups) ----------------------------------------
+    # -------- Device groups (2 groups) ----------------------------------------
     device_groups = [
         DeviceGroup(name='model',
                     ranks=list(range(MODEL_GPUS)),
@@ -508,22 +536,31 @@ def train():
         DeviceGroup(name='condenser_sampler',
                     ranks=list(range(MODEL_GPUS, MODEL_GPUS + CONDENSER_SAMPLER_GPUS)),
                     device_type='GPU'),
-        DeviceGroup(name='condenser_model',
-                    ranks=list(range(MODEL_GPUS + CONDENSER_SAMPLER_GPUS, NUM_GPUS)),
-                    device_type='GPU'),
     ]
     model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS)
     condenser_sampler_mesh = DeviceMesh.from_sizes(
         world_size=CONDENSER_SAMPLER_GPUS, dp_size=CONDENSER_SAMPLER_GPUS)
-    condenser_model_mesh = DeviceMesh.from_sizes(
-        world_size=CONDENSER_MODEL_GPUS, dp_size=1, fsdp_size=CONDENSER_MODEL_GPUS)
 
     twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS, groups=device_groups)
 
     # -------- Data -----------------------------------------------------------
-    dataset = get_dataset(total=TOTAL_SAMPLES, load_from_cache_file=True)
-    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)
-    total_forward_steps = len(dataloader) * NUM_EPOCHS
+    dataset = get_dataset_think(total=TOTAL_SAMPLES, load_from_cache_file=True)
+    if THINK_CAP and len(dataset.dataset) > THINK_CAP:
+        dataset.dataset = dataset.dataset.select(range(THINK_CAP))
+    if INDEX_CAP != 0:
+        from datasets import concatenate_datasets
+        ds_index = get_dataset_index(total=None, load_from_cache_file=True)
+        if INDEX_CAP and len(ds_index.dataset) > INDEX_CAP:
+            ds_index.dataset = ds_index.dataset.select(range(INDEX_CAP))
+        n_think = len(dataset.dataset)
+        n_index = len(ds_index.dataset)
+        # Both loaders emit identical {id, source, messages} schema post-QP.
+        dataset.dataset = concatenate_datasets(
+            [dataset.dataset, ds_index.dataset]).shuffle(seed=MIX_SHUFFLE_SEED)
+        logger.info(f'[mix] think={n_think} + index={n_index} → total={len(dataset.dataset)}')
+    _mega_batch_size = BATCH_SIZE * PREFETCH_BATCH_MULTIPLIER
+    dataloader = DataLoader(dataset=dataset, batch_size=_mega_batch_size, shuffle=True)
+    total_forward_steps = len(dataloader) * PREFETCH_BATCH_MULTIPLIER * NUM_EPOCHS
     optimizer_steps = total_forward_steps // GRADIENT_ACCUMULATION_STEPS
 
     # -------- Embedding model (4 GPU) ----------------------------------------
@@ -534,10 +571,10 @@ def train():
     setup_optimizer(model, optimizer_steps)
     model.add_metric(EmbeddingMetric, is_training=True)
 
-    # -------- Condenser sampler (2 GPU, vLLM) --------------------------------
-    emb_template = Template(model_id=MODEL_ID, max_length=EMB_MAX_LENGTH, enable_thinking=False)
+    # -------- Condenser sampler (4 GPU, vLLM) --------------------------------
+    emb_template = Qwen3_5Template(model_id=MODEL_ID, max_length=EMB_MAX_LENGTH, enable_thinking=False)
     # Special tokens come from the condenser tokenizer because the leak we strip is in its decoded output.
-    condenser_template = Template(model_id=CONDENSE_MODEL_ID, max_length=DATASET_MAX_TOKENS,
+    condenser_template = Qwen3_5Template(model_id=CONDENSE_MODEL_ID, max_length=DATASET_MAX_TOKENS,
                                   enable_thinking=False)
     _special_tokens = set(condenser_template.tokenizer.all_special_tokens)
     condenser_sampler = vLLMSampler(
@@ -559,23 +596,32 @@ def train():
         num_samples=1,
     )
 
-    # -------- Condenser model (2 GPU, trainable full-param) -------------------
-    condenser_model = TransformersModel(
-        model_id=CONDENSE_MODEL_ID,
-        device_mesh=condenser_model_mesh,
-        remote_group='condenser_model',
-    )
-    condenser_model.set_optimizer(optimizer_cls='AdamW', lr=CONDENSER_RETRAIN_LR)
-
-    # -------- CheckpointEngineManager: condenser_model → condenser_sampler ---
-    condenser_ckpt_manager = CheckpointEngineManager(
-        model=condenser_model, sampler=condenser_sampler)
-    condenser_ckpt_manager.sync_weights()
-
-    # -------- Background retrainer -------------------------------------------
-    retrainer = CondenserRetrainer(condenser_model, condenser_ckpt_manager,
-                                   condenser_sampler)
-    retrainer.start()
+    condenser_sampler._ray_get_timeout = SAMPLER_TIMEOUT
+    _sampler_epoch = 0
+    
+    def _rebuild_sampler():
+        """Best-effort kill of the current sampler's Ray actors, then rebind ``condenser_sampler`` to a fresh vLLMSampler."""
+        nonlocal condenser_sampler, _sampler_epoch  # rebinding makes later _sample_batch calls use the new sampler
+        import ray
+        for actor in getattr(condenser_sampler, '_actors', []):
+            try:
+                ray.kill(actor, no_restart=True)
+            except Exception as kill_exc:  # keep going: one unkillable actor must not block the rebuild
+                logger.warning(f'[sampler] ray.kill failed for {actor!r}: {kill_exc}')
+        logger.warning('[sampler] killed stuck actors, recreating sampler \u2026')
+        new = vLLMSampler(
+            model_id=CONDENSE_MODEL_ID,
+            engine_args={'gpu_memory_utilization': 0.8, 'max_model_len': COMPRESS_MAX_MODEL_LEN},
+            device_mesh=condenser_sampler_mesh,
+            remote_group='condenser_sampler',
+        )
+        new.set_template(
+            TEMPLATE_NAME, model_id=CONDENSE_MODEL_ID, enable_thinking=False,
+            truncation_strategy='delete', max_length=DATASET_MAX_TOKENS)
+        new._ray_get_timeout = SAMPLER_TIMEOUT
+        condenser_sampler = new
+        _sampler_epoch += 1
+        logger.warning('[sampler] sampler rebuilt successfully')
 
     # -------- OpenAI API client for fallback ---------------------------------
     api_client = OpenAIClient(
@@ -606,28 +652,41 @@ def train():
     # -------- Train loop -----------------------------------------------------
     def _sample_batch(raw_batch):
         """Compress via vLLM sampler; fall back to API on truncation."""
-        compress_prompts, valid_indices, raw_pairs, prompt_queries, passthrough = \
+        _t_enter = time.monotonic()
+        compress_prompts, valid_indices, raw_pairs, prompt_queries, passthrough, schemas = \
             _build_compress_prompts(raw_batch)
-        if not compress_prompts:
+        _t_build = time.monotonic()
+        if len(compress_prompts) < 4:
             return None
 
         # Only submit non-passthrough prompts to the sampler.
         sampler_input = [p for p in compress_prompts if p is not None]
         sampler_pos = [ri for ri, p in enumerate(compress_prompts) if p is not None]
         if sampler_input:
-            with retrainer.sampler_lock:
+            try:
                 sampler_responses = condenser_sampler.sample(sampler_input, compress_params)
+            except Exception as exc:
+                logger.warning(f'[sampler] error \u2192 API fallback: {exc}')
+                sampler_responses = [None] * len(sampler_input)
+                if 'Timeout' in type(exc).__name__:
+                    try:
+                        _rebuild_sampler()
+                    except Exception as re_exc:
+                        logger.error(f'[sampler] rebuild failed: {re_exc}')
         else:
             sampler_responses = []
+        _t_sample = time.monotonic()
+
         responses = [None] * len(compress_prompts)
         for resp, pos in zip(sampler_responses, sampler_pos):
             responses[pos] = resp
 
         # Extract decoded texts; detect truncations and fall back to API
-        decoded_texts: List[str] = []
+        decoded_texts: List[Optional[str]] = [None] * len(compress_prompts)
+        fallback_indices: List[int] = []
         for ri in range(len(compress_prompts)):
             if passthrough[ri] is not None:
-                decoded_texts.append(passthrough[ri])
+                decoded_texts[ri] = passthrough[ri]
                 continue
             resp = responses[ri]
             seq = resp.sequences[0] if resp and resp.sequences else None
@@ -638,27 +697,33 @@ def _sample_batch(raw_batch):
                     text = text.replace(tok, '')
                 text = text.rstrip()
 
-            # Premature-EOS: model emits chat-template token mid-skeleton, vLLM reports
-            # stop_reason='stop' but the stripped text is structurally incomplete.
             needs_fallback = (not seq or seq.stop_reason == 'length'
-                              or _is_truncated_compression(text))
+                              or _is_truncated_compression(text, schemas[ri]))
             if not needs_fallback:
-                decoded_texts.append(text)
-                continue
-
-            api_result = _api_compress(api_client, compress_prompts[ri])
-            # Skip logging when the API itself produced truncated output: an incomplete
-            # gold answer would teach the condenser to imitate broken outputs.
-            if api_result and not _is_truncated_compression(api_result):
-                decoded_texts.append(api_result)
-                pair_idx = ri // 2
-                q_raw, c_raw = raw_pairs[pair_idx]
-                source_text = q_raw if ri % 2 == 0 else c_raw
-                _log_failure(source_text, prompt_queries[ri], api_result,
-                             valid_indices[pair_idx])
-                retrainer.notify_failure()
+                decoded_texts[ri] = text
             else:
-                decoded_texts.append('')
+                fallback_indices.append(ri)
+
+        _api_calls = len(fallback_indices)
+        if fallback_indices:
+            from concurrent.futures import as_completed
+            api_futures = {}
+            with ThreadPoolExecutor(max_workers=API_CONCURRENCY) as api_pool:
+                for ri in fallback_indices:
+                    api_futures[api_pool.submit(_api_compress, api_client, compress_prompts[ri])] = ri
+                for fut in as_completed(api_futures):
+                    ri = api_futures[fut]
+                    api_result = fut.result()
+                    if api_result and not _is_truncated_compression(api_result, schemas[ri]):
+                        decoded_texts[ri] = api_result
+                        pair_idx = ri // 2
+                        q_raw, c_raw = raw_pairs[pair_idx]
+                        source_text = q_raw if ri % 2 == 0 else c_raw
+                        _log_failure(source_text, prompt_queries[ri], api_result,
+                                     valid_indices[pair_idx])
+                    else:
+                        decoded_texts[ri] = ''
+        _t_api = time.monotonic()
 
         # Build embedding features from decoded texts
         emb_features: List[Dict[str, Any]] = []
@@ -673,69 +738,96 @@ def _sample_batch(raw_batch):
             if feat_q and feat_c:
                 emb_features.append(feat_q)
                 emb_features.append(feat_c)
+        _t_feat = time.monotonic()
 
-        if len(emb_features) < 4:
-            return None
-        return emb_features
+        logger.info(
+            f'[prefetch] prompts={len(sampler_input)} api={_api_calls} feats={len(emb_features)} | '
+            f'build={_t_build - _t_enter:.1f}s '
+            f'vllm={_t_sample - _t_build:.1f}s '
+            f'api={_t_api - _t_sample:.1f}s feat={_t_feat - _t_api:.1f}s '
+            f'total={_t_feat - _t_enter:.1f}s')
+
+        _target = BATCH_SIZE * 2
+        minibatches = [emb_features[i:i + _target] for i in range(0, len(emb_features), _target)]
+        minibatches = [mb for mb in minibatches if len(mb) >= 4]
+        return minibatches if minibatches else None
 
     cur_step = RESUME_STEP
-    # Compute which epoch and how many batches to skip within that epoch
     _batches_per_epoch = len(dataloader)
-    _start_epoch = cur_step // _batches_per_epoch if cur_step > 0 else 0
-    _skip_batches_in_epoch = cur_step - _start_epoch * _batches_per_epoch if cur_step > 0 else 0
+    _steps_per_mega = PREFETCH_BATCH_MULTIPLIER
+    _start_epoch = cur_step // (_batches_per_epoch * _steps_per_mega) if cur_step > 0 else 0
+    _skip_batches_in_epoch = max(0, cur_step // _steps_per_mega - _start_epoch * _batches_per_epoch)
+
+    _ema_prefetch = 0.0
+    _ema_train = 0.0
+    _ema_alpha = 0.1
 
     prefetch_executor = ThreadPoolExecutor(max_workers=1)
     for epoch in range(_start_epoch, NUM_EPOCHS):
-        # Skip consumed samples for the resume epoch (shuffle order won't match
-        # exactly, but the correct number of samples is skipped).
         if _skip_batches_in_epoch > 0:
-            dataloader.skip_consumed_samples(_skip_batches_in_epoch * BATCH_SIZE)
+            dataloader.skip_consumed_samples(_skip_batches_in_epoch * _mega_batch_size)
         batch_iter = iter(dataloader)
-        # Reset skip after first resumed epoch
         _skip_batches_in_epoch = 0
-        prefetch_future = None
-        first_batch = next(batch_iter, None)
-        if first_batch is not None:
-            prefetch_future = prefetch_executor.submit(_sample_batch, first_batch)
 
-        for raw_batch in batch_iter:
-            emb_features = prefetch_future.result() if prefetch_future else None
-            prefetch_future = prefetch_executor.submit(_sample_batch, raw_batch)
+        first = next(batch_iter, None)
+        future = prefetch_executor.submit(_sample_batch, first) if first else None
 
-            if emb_features is None:
+        for raw_mega_batch in batch_iter:
+            t0 = time.monotonic()
+            minibatches = future.result() if future else None
+            t_prefetch = time.monotonic() - t0
+            future = prefetch_executor.submit(_sample_batch, raw_mega_batch)
+
+            if not minibatches:
                 continue
 
-            model.forward_backward(inputs=emb_features, task='embedding')
-            model.clip_grad_and_step(gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
-            cur_step += 1
-
-            if cur_step % LOG_INTERVAL == 0:
-                metric = model.calculate_metric(is_training=True)
-                logger.info(
-                    f'Epoch {epoch} Step {cur_step}/{total_forward_steps}, metric: {metric}')
-                log_dict = {}
-                for k, v in metric.items():
-                    if not v:
-                        continue
-                    try:
-                        log_dict[k] = float(v)
-                    except (ValueError, TypeError):
-                        pass
-                log_dict['epoch'] = epoch
-                swanlab.log(log_dict, step=cur_step)
-            if cur_step % SAVE_INTERVAL == 0:
-                save_checkpoint(model, f'step_{cur_step}')
-
-        # # Drain last prefetched batch
-        # if prefetch_future is not None:
-        #     emb_features = prefetch_future.result()
-        #     if emb_features is not None:
-        #         model.forward_backward(inputs=emb_features, task='embedding')
-        #         model.clip_grad_and_step()
-        #         cur_step += 1
+            for mb in minibatches:
+                t1 = time.monotonic()
+                model.forward_backward(inputs=mb, task='embedding')
+                model.clip_grad_and_step(gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+                t_train = time.monotonic() - t1
+                cur_step += 1
+
+                _ema_prefetch = _ema_alpha * t_prefetch + (1 - _ema_alpha) * _ema_prefetch if cur_step > RESUME_STEP + 1 else t_prefetch
+                _ema_train = _ema_alpha * t_train + (1 - _ema_alpha) * _ema_train if cur_step > RESUME_STEP + 1 else t_train
+
+                if cur_step % LOG_INTERVAL == 0:
+                    metric = model.calculate_metric(is_training=True)
+                    _bottleneck = 'PREFETCH' if _ema_prefetch > _ema_train else 'TRAIN'
+                    logger.info(
+                        f'Epoch {epoch} Step {cur_step}/{total_forward_steps}, metric: {metric} | '
+                        f'prefetch={t_prefetch:.1f}s(ema {_ema_prefetch:.1f}) '
+                        f'train={t_train:.1f}s(ema {_ema_train:.1f}) '
+                        f'bottleneck={_bottleneck}')
+                    log_dict = {}
+                    for k, v in metric.items():
+                        if not v:
+                            continue
+                        try:
+                            log_dict[k] = float(v)
+                        except (ValueError, TypeError):
+                            pass
+                    log_dict['epoch'] = epoch
+                    log_dict['prefetch_sec'] = round(t_prefetch, 2)
+                    log_dict['train_sec'] = round(t_train, 2)
+                    swanlab.log(log_dict, step=cur_step)
+                if cur_step % SAVE_INTERVAL == 0:
+                    save_checkpoint(model, f'step_{cur_step}')
+                t_prefetch = 0.0
+
+        # Drain final mega-batch
+        if future:
+            minibatches = future.result()
+            future = None
+            if minibatches:
+                for mb in minibatches:
+                    model.forward_backward(inputs=mb, task='embedding')
+                    model.clip_grad_and_step(gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+                    cur_step += 1
+                    if cur_step % SAVE_INTERVAL == 0:
+                        save_checkpoint(model, f'step_{cur_step}')
 
     prefetch_executor.shutdown(wait=False)
-    retrainer.stop()
     save_checkpoint(model, 'last-checkpoint')
 
 
diff --git a/cookbook/megatron/tp.py b/cookbook/megatron/tp.py
index 650cf67b6..13c5ccfb1 100644
--- a/cookbook/megatron/tp.py
+++ b/cookbook/megatron/tp.py
@@ -5,42 +5,26 @@
 
 import twinkle
 from twinkle import DeviceMesh, get_device_placement, get_logger
+from twinkle.cli import CLI
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.model import MegatronModel
 from twinkle.preprocessor import SelfCognitionProcessor
 
 logger = get_logger()
+args = CLI.from_args()
 
-MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
-DATASET_ID = 'ms://swift/self-cognition'
-TEMPLATE_NAME = 'Qwen3_5Template'
-MODEL_NAME = 'twinkle大模型'
-MODEL_AUTHOR = 'ModelScope社区'
-DP_SIZE = 2
-TP_SIZE = 2
-PP_SIZE = 2
-BATCH_SIZE = 16
-LEARNING_RATE = 1e-4
-LOG_INTERVAL = 5
-EVAL_INTERVAL = 20
-EVAL_SAMPLES = 100
-TRAIN_SAMPLES = 1000
-
-OUTPUT_DIR = './output/megatron_tp'
-RESUME_FROM_CHECKPOINT = None
-RESUME_ONLY_MODEL = False
-IGNORE_DATA_SKIP = False
-ADAPTER_NAME = 'default'
-
-device_mesh = DeviceMesh.from_sizes(dp_size=DP_SIZE, tp_size=TP_SIZE, pp_size=PP_SIZE)
-twinkle.initialize(mode='local', global_device_mesh=device_mesh)
+device_mesh = DeviceMesh.from_sizes(dp_size=args.infra.dp_size, tp_size=args.infra.tp_size, pp_size=args.infra.pp_size)
+twinkle.initialize(mode=args.infra.mode, global_device_mesh=device_mesh)
 
 
 def build_dataset(num_samples: int) -> Dataset:
-    dataset = Dataset(dataset_meta=DatasetMeta(DATASET_ID, data_slice=range(num_samples)))
-    dataset.set_template(TEMPLATE_NAME, model_id=MODEL_ID)
-    dataset.map(SelfCognitionProcessor(MODEL_NAME, MODEL_AUTHOR))
+    dataset = Dataset(dataset_meta=DatasetMeta(args.dataset.dataset_id, data_slice=range(num_samples)))
+    dataset.set_template(args.template.template_cls, model_id=args.model.model_id)
+    dataset.map(SelfCognitionProcessor(
+        args.extra.get('model_name', 'twinkle大模型'),
+        args.extra.get('model_author', 'ModelScope社区'),
+    ))
     dataset.encode()
     return dataset
 
@@ -48,42 +32,45 @@ def build_dataset(num_samples: int) -> Dataset:
 def save_checkpoint(model: MegatronModel, checkpoint_name: str, dataloader: DataLoader):
     model.save(
         checkpoint_name,
-        output_dir=OUTPUT_DIR,
-        adapter_name=ADAPTER_NAME,
-        save_optimizer=True,
+        output_dir=args.training.output_dir,
+        adapter_name=args.lora.adapter_name,
+        save_optimizer=args.checkpoint.save_optimizer,
         consumed_train_samples=dataloader.get_state()['consumed_train_samples'],
     )
 
 
 def evaluate(model):
-    dataloader = DataLoader(dataset=build_dataset(EVAL_SAMPLES), batch_size=BATCH_SIZE)
+    eval_samples = args.training.eval_samples or 100
+    dataloader = DataLoader(dataset=build_dataset(eval_samples), batch_size=args.training.batch_size)
     for batch in tqdm(dataloader):
         model.forward_only(inputs=batch)
     return model.calculate_metric(is_training=False)
 
 
 def train():
-    dataset = build_dataset(TRAIN_SAMPLES)
-    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE)
+    train_samples = args.training.train_samples or 1000
+    dataset = build_dataset(train_samples)
+    dataloader = DataLoader(dataset=dataset, batch_size=args.training.batch_size)
 
-    model = MegatronModel(model_id=MODEL_ID)
+    model = MegatronModel(model_id=args.model.model_id)
 
-    lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear')
+    lora_config = LoraConfig(**args.get_lora_args())
 
     # Comment this to use full-parameter training
-    model.add_adapter_to_model(ADAPTER_NAME, lora_config)
-    model.set_optimizer(optimizer_cls='default', lr=LEARNING_RATE)
-    model.set_lr_scheduler(scheduler_cls='default', lr_warmup_steps=5, lr_decay_steps=len(dataloader))
+    model.add_adapter_to_model(args.lora.adapter_name, lora_config)
+    model.set_optimizer(optimizer_cls='default', lr=args.optimizer.learning_rate)
+    model.set_lr_scheduler(scheduler_cls='default', lr_warmup_steps=args.scheduler.num_warmup_steps,
+                           lr_decay_steps=len(dataloader))
 
     start_step = 0
-    if RESUME_FROM_CHECKPOINT:
-        checkpoint_path = Path(RESUME_FROM_CHECKPOINT).expanduser().resolve()
+    if args.training.resume_from_checkpoint:
+        checkpoint_path = Path(args.training.resume_from_checkpoint).expanduser().resolve()
         kwargs = {}
-        if ADAPTER_NAME:
-            kwargs['adapter_name'] = ADAPTER_NAME
+        if args.lora.adapter_name:
+            kwargs['adapter_name'] = args.lora.adapter_name
         progress = model.resume_from_checkpoint(
-            str(checkpoint_path), resume_only_model=RESUME_ONLY_MODEL, **kwargs)
-        if not IGNORE_DATA_SKIP:
+            str(checkpoint_path), resume_only_model=args.training.resume_only_model, **kwargs)
+        if not args.training.ignore_data_skip:
             dataloader.resume_from_checkpoint(progress['consumed_train_samples'])
             start_step = progress['cur_step']
 
@@ -92,14 +79,15 @@ def train():
     logger.info(f'Total steps: {len(dataloader)}')
 
     best_loss = float('inf')
+    eval_interval = args.training.eval_interval or 20
 
     for step, batch in enumerate(dataloader, start=start_step):
         model.forward_backward(inputs=batch)
         model.clip_grad_and_step()
-        if step % LOG_INTERVAL == 0:
+        if step % args.training.log_interval == 0:
             metric = model.calculate_metric(is_training=True)
             logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}')
-        if step > 0 and step % EVAL_INTERVAL == 0:
+        if step > 0 and step % eval_interval == 0:
             metrics = evaluate(model)
             logger.info(f'Eval metric: {metrics}')
             metrics['step'] = step
diff --git a/cookbook/megatron/tp.sh b/cookbook/megatron/tp.sh
index 5516130e3..789c54379 100644
--- a/cookbook/megatron/tp.sh
+++ b/cookbook/megatron/tp.sh
@@ -1 +1,23 @@
-CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 tp.py
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Megatron TP + LoRA training.
+# All training config passed as CLI flags. Override at invocation, e.g.:
+#   bash tp.sh --model-id ms://Qwen/Qwen3.5-4B --tp-size 4
+
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+  torchrun --nproc_per_node=8 tp.py \
+    --model-id ms://Qwen/Qwen3.5-4B \
+    --dataset-id ms://swift/self-cognition \
+    --template-cls Qwen3_5Template \
+    --dp-size 4 \
+    --tp-size 2 \
+    --batch-size 8 \
+    --lr 1e-4 \
+    --train-samples 1000 \
+    --log-interval 10 \
+    --eval-interval 20 \
+    --output-dir ./output/megatron_tp \
+    --model-name twinkle大模型 \
+    --model-author ModelScope社区 \
+    "$@"
diff --git a/cookbook/megatron/tp_moe.py b/cookbook/megatron/tp_moe.py
index a13b0e58a..11e2c7d84 100644
--- a/cookbook/megatron/tp_moe.py
+++ b/cookbook/megatron/tp_moe.py
@@ -1,29 +1,38 @@
-import os
 from peft import LoraConfig
 from tqdm import tqdm
 
 import twinkle
 from twinkle import DeviceMesh, Platform, get_device_placement, get_logger
+from twinkle.cli import CLI
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.model import MegatronModel
 from twinkle.preprocessor import SelfCognitionProcessor
 
+logger = get_logger()
+args = CLI.from_args()
+
 # Construct a device_mesh, tp=pp=ep=dp=2
-device_mesh = DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2, ep_size=2, sequence_parallel=True)
+device_mesh = DeviceMesh.from_sizes(
+    dp_size=args.infra.dp_size, tp_size=args.infra.tp_size,
+    pp_size=args.infra.pp_size, ep_size=args.infra.ep_size,
+    sequence_parallel=args.infra.sequence_parallel,
+)
 # use torchrun mode
-twinkle.initialize(mode='local', global_device_mesh=device_mesh)
-
-logger = get_logger()
+twinkle.initialize(mode=args.infra.mode, global_device_mesh=device_mesh)
 
 
 def eval(model):
-    # 100 Samples
-    dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100)))
-    dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-35B-A3B')
-    dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
+    # Eval samples
+    eval_samples = args.training.eval_samples or 100
+    dataset = Dataset(dataset_meta=DatasetMeta(args.dataset.dataset_id, data_slice=range(eval_samples)))
+    dataset.set_template(args.template.template_cls, model_id=args.model.model_id)
+    dataset.map(SelfCognitionProcessor(
+        args.extra.get('model_name', 'twinkle大模型'),
+        args.extra.get('model_author', 'ModelScope社区'),
+    ))
     dataset.encode()
-    dataloader = DataLoader(dataset=dataset, batch_size=16)
+    dataloader = DataLoader(dataset=dataset, batch_size=args.training.batch_size)
     for step, batch in tqdm(enumerate(dataloader)):
         model.forward_only(inputs=batch)
     metrics = model.calculate_metric(is_training=False)
@@ -31,44 +40,49 @@ def eval(model):
 
 
 def train():
-    # 1000 samples
-    dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000)))
+    # Training samples
+    train_samples = args.training.train_samples or 1000
+    dataset = Dataset(dataset_meta=DatasetMeta(args.dataset.dataset_id, data_slice=range(train_samples)))
     # Set template to prepare encoding
-    dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-35B-A3B')
+    dataset.set_template(args.template.template_cls, model_id=args.model.model_id)
     # Preprocess the dataset to standard format
-    dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
+    dataset.map(SelfCognitionProcessor(
+        args.extra.get('model_name', 'twinkle大模型'),
+        args.extra.get('model_author', 'ModelScope社区'),
+    ))
     # Encode dataset
     dataset.encode()
-    # Global batch size = 1, dp_size = 1
-    dataloader = DataLoader(dataset=dataset, batch_size=16)
+    # Global batch size
+    dataloader = DataLoader(dataset=dataset, batch_size=args.training.batch_size)
     # Use a MegatronModel
-    model = MegatronModel(model_id='ms://Qwen/Qwen3.5-35B-A3B')
+    model = MegatronModel(model_id=args.model.model_id)
 
-    lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear')
+    lora_config = LoraConfig(**args.get_lora_args())
 
-    # Add a lora to model, with name `default`
+    # Add a lora to model, with name from args
     # Comment this to use full-parameter training
-    model.add_adapter_to_model('default', lora_config)
-    # Add Optimizer for lora `default`
-    model.set_optimizer(optimizer_cls='default', lr=1e-4)
-    # Add LRScheduler for lora `default`
-    model.set_lr_scheduler(scheduler_cls='default', lr_warmup_steps=5, lr_decay_steps=len(dataloader))
+    model.add_adapter_to_model(args.lora.adapter_name, lora_config)
+    # Add Optimizer
+    model.set_optimizer(optimizer_cls='default', lr=args.optimizer.learning_rate)
+    # Add LRScheduler
+    model.set_lr_scheduler(scheduler_cls='default', lr_warmup_steps=args.scheduler.num_warmup_steps,
+                           lr_decay_steps=len(dataloader))
     logger.info(get_device_placement())
     # Print the training config
     logger.info(model.get_train_configs())
     logger.info(f'Total steps: {len(dataloader)}')
     loss_metric = 99.0
-    # lora: 23G * 8
+    eval_interval = args.training.eval_interval or 20
     for step, batch in enumerate(dataloader):
         # Do forward and backward
         model.forward_backward(inputs=batch)
         # Step
         model.clip_grad_and_step()
-        if step % 5 == 0:
+        if step % args.training.log_interval == 0:
             # Print metric
             metric = model.calculate_metric(is_training=True)
             logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}')
-        if step > 0 and step % 20 == 0:
+        if step > 0 and step % eval_interval == 0:
             metrics = eval(model)
             logger.info(f'Eval metric: {metrics}')
             metrics['step'] = step
diff --git a/cookbook/megatron/tp_moe.sh b/cookbook/megatron/tp_moe.sh
index 58e586464..7f6a2d06b 100644
--- a/cookbook/megatron/tp_moe.sh
+++ b/cookbook/megatron/tp_moe.sh
@@ -1 +1,25 @@
-CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 tp_moe.py
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Megatron TP + MoE + LoRA training on 8 GPUs (dp 2 x tp 2 x pp 2, ep 2).
+# All training config passed as CLI flags. Override at invocation, e.g.:
+#   bash tp_moe.sh --lr 5e-5 --train-samples 2000
+# tp_moe.py is located via this script's own directory, not the caller's cwd.
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+  torchrun --nproc_per_node=8 "$(dirname -- "${BASH_SOURCE[0]}")/tp_moe.py" \
+    --model-id ms://Qwen/Qwen3.5-30B-A3B \
+    --dataset-id ms://swift/self-cognition \
+    --template-cls Qwen3_5Template \
+    --dp-size 2 \
+    --tp-size 2 \
+    --pp-size 2 \
+    --ep-size 2 \
+    --sequence-parallel \
+    --batch-size 8 \
+    --lr 1e-4 \
+    --train-samples 1000 \
+    --log-interval 10 \
+    --eval-interval 20 \
+    --model-name twinkle大模型 \
+    --model-author ModelScope社区 \
+    "$@"
diff --git a/cookbook/mm/fsdp2.py b/cookbook/mm/fsdp2.py
index 4dc508506..2edaf54e9 100644
--- a/cookbook/mm/fsdp2.py
+++ b/cookbook/mm/fsdp2.py
@@ -3,18 +3,20 @@
 
 import twinkle
 from twinkle import DeviceMesh, get_device_placement, get_logger
+from twinkle.cli import CLI
 from twinkle.data_format import Trajectory, Message
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import LazyDataset, DatasetMeta
 from twinkle.model import TransformersModel
 from twinkle.preprocessor import Preprocessor
 
-# Construct a device_mesh, fsdp=2
-device_mesh = DeviceMesh.from_sizes(fsdp_size=2)
-# use torchrun mode
-twinkle.initialize(mode='local', global_device_mesh=device_mesh)
-
 logger = get_logger()
+args = CLI.from_args()
+
+# Construct a device_mesh
+device_mesh = DeviceMesh.from_sizes(fsdp_size=args.infra.fsdp_size, dp_size=args.infra.dp_size)
+# use torchrun mode
+twinkle.initialize(mode=args.infra.mode, global_device_mesh=device_mesh)
 
 
 class LatexOCRProcessor(Preprocessor):
@@ -35,12 +37,13 @@ def preprocess(self, row) -> Trajectory:
 
 
 def eval(model):
-    # 100 Samples
-    dataset = LazyDataset(dataset_meta=DatasetMeta('ms://AI-ModelScope/LaTeX_OCR', data_slice=range(100)))
-    dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
+    # Eval samples
+    eval_samples = args.training.eval_samples or 100
+    dataset = LazyDataset(dataset_meta=DatasetMeta(args.dataset.dataset_id, data_slice=range(eval_samples)))
+    dataset.set_template(args.template.template_cls, model_id=args.model.model_id)
     dataset.map(LatexOCRProcessor)
     dataset.encode()
-    dataloader = DataLoader(dataset=dataset, batch_size=8)
+    dataloader = DataLoader(dataset=dataset, batch_size=args.training.batch_size)
     for step, batch in tqdm(enumerate(dataloader)):
         model.forward_only(inputs=batch)
         model.calculate_loss()
@@ -49,54 +52,56 @@ def eval(model):
 
 
 def train():
-    # 2000 samples
-    dataset = LazyDataset(dataset_meta=DatasetMeta('ms://AI-ModelScope/LaTeX_OCR', data_slice=range(2000)))
+    # Training samples
+    train_samples = args.training.train_samples or 2000
+    dataset = LazyDataset(dataset_meta=DatasetMeta(args.dataset.dataset_id, data_slice=range(train_samples)))
     # Set template to prepare encoding
-    dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=1024)
+    dataset.set_template(args.template.template_cls, model_id=args.model.model_id, max_length=args.template.max_length)
     # Preprocess the dataset to standard format
     dataset.map(LatexOCRProcessor)
     # Encode dataset
     dataset.encode()
-    # Global batch size = 4, for GPUs, so 2 sample per GPU
-    dataloader = DataLoader(dataset=dataset, batch_size=4)
+    # Global batch size
+    dataloader = DataLoader(dataset=dataset, batch_size=args.training.batch_size)
     # Use a TransformersModel
-    from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5ForConditionalGeneration
-    model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B', model_cls=Qwen3_5ForConditionalGeneration)
+    model = TransformersModel(model_id=args.model.model_id, model_cls=args.model.model_cls)
     model.model._no_split_modules = {'Qwen3_5DecoderLayer'}
 
-    lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear')
+    lora_config = LoraConfig(**args.get_lora_args())
 
-    # Add a lora to model, with name `default`
-    # Comment this to use full-parameter training
-    model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2)
-    # Add Optimizer for lora `default`
-    model.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
-    model.set_optimizer(optimizer_cls='AdamW', lr=1e-4)
-    # Add LRScheduler for lora `default`
+    # Add a lora to model
+    model.add_adapter_to_model(args.lora.adapter_name, lora_config,
+                               gradient_accumulation_steps=args.training.gradient_accumulation_steps)
+    # Add Optimizer
+    model.set_template(args.template.template_cls, model_id=args.model.model_id)
+    model.set_optimizer(optimizer_cls=args.optimizer.optimizer_cls, lr=args.optimizer.learning_rate)
+    # Add LRScheduler
     model.set_lr_scheduler(
-        scheduler_cls='CosineWarmupScheduler', num_warmup_steps=5, num_training_steps=len(dataloader))
+        scheduler_cls=args.scheduler.scheduler_cls, num_warmup_steps=args.scheduler.num_warmup_steps,
+        num_training_steps=len(dataloader))
     logger.info(get_device_placement())
     # Print the training config
     logger.info(model.get_train_configs())
     logger.info(f'Total steps: {len(dataloader)}')
     loss_metric = 99.0
+    eval_interval = args.training.eval_interval or 200
     for step, batch in enumerate(dataloader):
         # Do forward and backward
         model.forward_backward(inputs=batch)
         # Step
         model.clip_grad_and_step()
-        if step % 20 == 0:
+        if step % args.training.log_interval == 0:
             # Print metric
             metric = model.calculate_metric(is_training=True)
             logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}')
-        if step > 0 and step % 200 == 0:
+        if step > 0 and step % eval_interval == 0:
             metrics = eval(model)
             logger.info(f'Eval metric: {metrics}')
             metrics['step'] = step
             if loss_metric > float(metrics['loss']):
                 model.save(f'checkpoint-{step}')
                 loss_metric = float(metrics['loss'])
-    model.save(f'last-checkpoint')
+    model.save('last-checkpoint')
 
 
 if __name__ == '__main__':
diff --git a/cookbook/mm/fsdp2.sh b/cookbook/mm/fsdp2.sh
index 46e9f27f6..2e0bed3d5 100644
--- a/cookbook/mm/fsdp2.sh
+++ b/cookbook/mm/fsdp2.sh
@@ -1 +1,21 @@
-CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 fsdp2.py
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Multi-modal FSDP2 + LoRA training (LaTeX OCR).
+# Config is passed as CLI flags; override at invocation, e.g.: bash fsdp2.sh --batch-size 4
+# Default model is Qwen3.5: fsdp2.py pins _no_split_modules to Qwen3_5DecoderLayer.
+# fsdp2.py is located via this script's own directory, not the caller's cwd.
+CUDA_VISIBLE_DEVICES=0,1 \
+  torchrun --nproc_per_node=2 "$(dirname -- "${BASH_SOURCE[0]}")/fsdp2.py" \
+    --model-id ms://Qwen/Qwen3.5-4B \
+    --dataset-id ms://AI-ModelScope/LaTeX_OCR \
+    --template-cls Qwen3_5Template \
+    --dp-size 2 \
+    --batch-size 2 \
+    --lr 1e-4 \
+    --gradient-accumulation-steps 4 \
+    --train-samples 2000 \
+    --eval-samples 100 \
+    --eval-interval 200 \
+    --log-interval 10 \
+    "$@"
diff --git a/cookbook/mm/fsdp2_gemma4_12b_mm.py b/cookbook/mm/fsdp2_gemma4_12b_mm.py
index c21932b33..62e26d776 100644
--- a/cookbook/mm/fsdp2_gemma4_12b_mm.py
+++ b/cookbook/mm/fsdp2_gemma4_12b_mm.py
@@ -1,4 +1,3 @@
-import os
 from peft import LoraConfig
 from tqdm import tqdm
 from transformers import AutoConfig
@@ -8,35 +7,29 @@
 
 import twinkle
 from twinkle import DeviceMesh, Platform, get_device_placement, get_logger
+from twinkle.cli import CLI
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.model import TransformersModel
-# from twinkle.preprocessor import SelfCognitionProcessor, LatexOCRProcessor
 
 logger = get_logger()
+args = CLI.from_args()
 
 ########## Construct a device_mesh ##########
 device_mesh = DeviceMesh.from_sizes(
-    # fsdp_size=2,
-    # dp_size=1,
-    # ep_size=2,
+    fsdp_size=args.infra.fsdp_size,
+    dp_size=args.infra.dp_size,
+    ep_size=args.infra.ep_size,
     device_type=Platform.get_platform().device_prefix(),
 )
 # use torchrun mode
-twinkle.initialize(mode='local', global_device_mesh=device_mesh)
+twinkle.initialize(mode=args.infra.mode, global_device_mesh=device_mesh)
 
 ########## hyperparameters ##########
-IGNORE_MISMATCHED_SIZES = True
-# MODEL_PATH = 'ms://google/gemma-4-26b-a4b'
-MODEL_PATH = 'ms://google/gemma-4-12b'
-DATASET_PATH = 'ms://AI-ModelScope/LaTeX_OCR'
-TRAIN_LEN = 2000
-BATCH_SIZE = 4
-METRIC_STEP = 10
-SAVE_STEP = 10
+IGNORE_MISMATCHED_SIZES = args.extra.get('ignore_mismatched_sizes', True)
 
 ### reduce model layers for debug
-TEXT_NUM_LAYERS = 8     # gemma-4-12b text_config.num_hidden_layers=48
+TEXT_NUM_LAYERS = args.extra.get('text_num_layers', None)
 
 from twinkle.preprocessor import Preprocessor
 from twinkle.data_format import Message, Trajectory
@@ -79,24 +72,21 @@ def train():
         'messages': List(sub_msg_feat)
     })
     ### prepare dataset and dataloader
-    dataset = Dataset(features=writer_features, dataset_meta=DatasetMeta(DATASET_PATH, subset_name='default', data_slice=range(TRAIN_LEN)))
+    train_samples = args.training.train_samples or 2000
+    dataset = Dataset(features=writer_features, dataset_meta=DatasetMeta(
+        args.dataset.dataset_id, subset_name=args.dataset.subset_name, data_slice=range(train_samples)))
     # Set template to prepare encoding
-    dataset.set_template('Template', model_id=MODEL_PATH)
+    dataset.set_template(args.template.template_cls, model_id=args.model.model_id)
     # Preprocess the dataset to standard format
-    # dataset.map(preprocess_func=SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
     dataset.map(preprocess_func=LatexOCRProcessor)
     # Encode dataset
     dataset.encode()
-    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE)
+    dataloader = DataLoader(dataset=dataset, batch_size=args.training.batch_size)
 
     config, kwargs = AutoConfig.from_pretrained(
-        MODEL_PATH,
+        args.model.model_id,
         trust_remote_code=True,
         return_unused_kwargs=True,
-        # code_revision=code_revision,
-        # _commit_hash=commit_hash,
-        # **hub_kwargs,
-        # **kwargs,
     )
 
     if isinstance(config, Gemma4UnifiedConfig):    # 减层
@@ -111,10 +101,10 @@ def train():
     from transformers import AutoModelForMultimodalLM
     model = TransformersModel(
         model_cls=AutoModelForMultimodalLM,
-        model_id=MODEL_PATH,
+        model_id=args.model.model_id,
         config=config,
         device_mesh=device_mesh,
-        strategy='accelerate', # native_fsdp、 accelerate
+        strategy=args.model.strategy,
         ignore_mismatched_sizes=IGNORE_MISMATCHED_SIZES,
         fsdp_config={
             'reshard_after_forward': True,
@@ -126,46 +116,46 @@ def train():
         },
     )
 
-    lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear')
+    lora_config = LoraConfig(**args.get_lora_args())
 
-    # Add a lora to model, with name `default`
-    # Comment this to use full-parameter training
-    model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2)
-    # Add Optimizer for lora `default`
-    model.set_optimizer(optimizer_cls='AdamW', lr=1e-4)
+    # Add a lora to model
+    model.add_adapter_to_model(args.lora.adapter_name, lora_config,
+                               gradient_accumulation_steps=args.training.gradient_accumulation_steps)
+    # Add Optimizer
+    model.set_optimizer(optimizer_cls=args.optimizer.optimizer_cls, lr=args.optimizer.learning_rate)
 
-    # Add LRScheduler for lora `default`
+    # Add LRScheduler
     model.set_lr_scheduler(
-        scheduler_cls='CosineWarmupScheduler', num_warmup_steps=5, num_training_steps=len(dataloader))
+        scheduler_cls=args.scheduler.scheduler_cls, num_warmup_steps=args.scheduler.num_warmup_steps,
+        num_training_steps=len(dataloader))
 
     logger.info(get_device_placement())
     # Print the training config
     logger.info(model.get_train_configs())
     logger.info(f'Total steps: {len(dataloader)}')
     best_eval_loss = float('inf')
-    # lora: 8G * 8
-    # full: 18G * 8
 
     ### eval dataset and dataloader
-    EVAL_LENGTH = 100
-    eval_dataset = Dataset(features=writer_features, dataset_meta=DatasetMeta(DATASET_PATH, subset_name='default', data_slice=range(EVAL_LENGTH)))
-    eval_dataset.set_template('Template', model_id=MODEL_PATH)
-    # eval_dataset.map(preprocess_func=SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
+    eval_samples = args.training.eval_samples or 100
+    eval_dataset = Dataset(features=writer_features, dataset_meta=DatasetMeta(
+        args.dataset.dataset_id, subset_name=args.dataset.subset_name, data_slice=range(eval_samples)))
+    eval_dataset.set_template(args.template.template_cls, model_id=args.model.model_id)
     eval_dataset.map(preprocess_func=LatexOCRProcessor)
     eval_dataset.encode()
-    eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=8)
+    eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=args.training.batch_size)
+    save_step = args.training.save_steps or 10  # old SAVE_STEP default; keeps `step % save_step` valid if unset/0
     for step, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
         # Do forward and backward
         model.forward_backward(inputs=batch)
         # Step
         model.clip_grad_and_step()
 
-        if step % METRIC_STEP == 0:
+        if step % args.training.log_interval == 0:
             # Print metric
             metric = model.calculate_metric(is_training=True)
             logger.info(f'Current is step {step} of {len(dataloader)}, Train metric: {metric}')
 
-        if step % SAVE_STEP == 0:
+        if step % save_step == 0:
             metrics = evaluate(model, eval_dataloader)
             metrics['step'] = step
             if float(metrics['loss']) < best_eval_loss:
diff --git a/cookbook/mm/fsdp2_gemma4_mm.py b/cookbook/mm/fsdp2_gemma4_mm.py
index 778051874..5c756cfbe 100644
--- a/cookbook/mm/fsdp2_gemma4_mm.py
+++ b/cookbook/mm/fsdp2_gemma4_mm.py
@@ -1,4 +1,3 @@
-import os
 from peft import LoraConfig
 from tqdm import tqdm
 from transformers import AutoConfig
@@ -8,35 +7,30 @@
 
 import twinkle
 from twinkle import DeviceMesh, Platform, get_device_placement, get_logger
+from twinkle.cli import CLI
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.model import TransformersModel
-# from twinkle.preprocessor import SelfCognitionProcessor, LatexOCRProcessor
 
 logger = get_logger()
+args = CLI.from_args()
 
 ########## Construct a device_mesh ##########
 device_mesh = DeviceMesh.from_sizes(
-    # fsdp_size=2,
-    # dp_size=1,
-    # ep_size=2,
+    fsdp_size=args.infra.fsdp_size,
+    dp_size=args.infra.dp_size,
+    ep_size=args.infra.ep_size,
     device_type=Platform.get_platform().device_prefix(),
 )
 # use torchrun mode
-twinkle.initialize(mode='local', global_device_mesh=device_mesh)
+twinkle.initialize(mode=args.infra.mode, global_device_mesh=device_mesh)
 
 ########## hyperparameters ##########
-IGNORE_MISMATCHED_SIZES = True
-MODEL_PATH = 'ms://google/gemma-4-26b-a4b'
-DATASET_PATH = 'ms://AI-ModelScope/LaTeX_OCR'
-TRAIN_LEN = 2000
-BATCH_SIZE = 4
-METRIC_STEP = 10
-SAVE_STEP = 10
+IGNORE_MISMATCHED_SIZES = args.extra.get('ignore_mismatched_sizes', True)
 
 ### reduce model layers for debug
-TEXT_NUM_LAYERS = 3
-VISION_NUM_LAYERS = 3
+TEXT_NUM_LAYERS = args.extra.get('text_num_layers', None)
+VISION_NUM_LAYERS = args.extra.get('vision_num_layers', None)
 
 
 from twinkle.preprocessor import Preprocessor
@@ -67,24 +61,20 @@ def eval(model, eval_dataloader):
 def train():
 
     ### prepare dataset and dataloader
-    dataset = Dataset(dataset_meta=DatasetMeta(DATASET_PATH, data_slice=range(TRAIN_LEN)))
+    train_samples = args.training.train_samples or 2000
+    dataset = Dataset(dataset_meta=DatasetMeta(args.dataset.dataset_id, data_slice=range(train_samples)))
     # Set template to prepare encoding
-    dataset.set_template('Template', model_id=MODEL_PATH)
+    dataset.set_template(args.template.template_cls, model_id=args.model.model_id)
     # Preprocess the dataset to standard format
-    # dataset.map(preprocess_func=SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
     dataset.map(preprocess_func=LatexOCRProcessor)
     # Encode dataset
     dataset.encode()
-    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE)
+    dataloader = DataLoader(dataset=dataset, batch_size=args.training.batch_size)
 
     config, kwargs = AutoConfig.from_pretrained(
-        MODEL_PATH,
+        args.model.model_id,
         trust_remote_code=True,
         return_unused_kwargs=True,
-        # code_revision=code_revision,
-        # _commit_hash=commit_hash,
-        # **hub_kwargs,
-        # **kwargs,
     )
 
     if isinstance(config, Gemma4Config):    # 减层
@@ -101,10 +91,10 @@ def train():
 
     # Use a TransformersModel
     model = TransformersModel(
-        model_id=MODEL_PATH,
+        model_id=args.model.model_id,
         config=config,
         device_mesh=device_mesh,
-        strategy='accelerate', # native_fsdp、 accelerate
+        strategy=args.model.strategy,
         ignore_mismatched_sizes=IGNORE_MISMATCHED_SIZES,
         fsdp_config={
             'reshard_after_forward': True,
@@ -116,46 +106,45 @@ def train():
         },
     )
 
-    lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear')
+    lora_config = LoraConfig(**args.get_lora_args())
 
-    # Add a lora to model, with name `default`
-    # Comment this to use full-parameter training
-    model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2)
-    # Add Optimizer for lora `default`
-    model.set_optimizer(optimizer_cls='AdamW', lr=1e-4)
+    # Add a lora to model
+    model.add_adapter_to_model(args.lora.adapter_name, lora_config,
+                               gradient_accumulation_steps=args.training.gradient_accumulation_steps)
+    # Add Optimizer
+    model.set_optimizer(optimizer_cls=args.optimizer.optimizer_cls, lr=args.optimizer.learning_rate)
 
-    # Add LRScheduler for lora `default`
+    # Add LRScheduler
     model.set_lr_scheduler(
-        scheduler_cls='CosineWarmupScheduler', num_warmup_steps=5, num_training_steps=len(dataloader))
+        scheduler_cls=args.scheduler.scheduler_cls, num_warmup_steps=args.scheduler.num_warmup_steps,
+        num_training_steps=len(dataloader))
 
     logger.info(get_device_placement())
     # Print the training config
     logger.info(model.get_train_configs())
     logger.info(f'Total steps: {len(dataloader)}')
     best_eval_loss = float('inf')
-    # lora: 8G * 8
-    # full: 18G * 8
 
     ### eval dataset and dataloader
-    EVAL_LENGTH = 100
-    eval_dataset = Dataset(dataset_meta=DatasetMeta(DATASET_PATH, data_slice=range(EVAL_LENGTH)))
-    eval_dataset.set_template('Template', model_id=MODEL_PATH)
-    # eval_dataset.map(preprocess_func=SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
+    eval_samples = args.training.eval_samples or 100
+    eval_dataset = Dataset(dataset_meta=DatasetMeta(args.dataset.dataset_id, data_slice=range(eval_samples)))
+    eval_dataset.set_template(args.template.template_cls, model_id=args.model.model_id)
     eval_dataset.map(preprocess_func=LatexOCRProcessor)
     eval_dataset.encode()
-    eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=8)
+    eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=args.training.batch_size)
+    save_step = args.training.save_steps or 10  # old SAVE_STEP default; keeps `step % save_step` valid if unset/0
     for step, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
         # Do forward and backward
         model.forward_backward(inputs=batch)
         # Step
         model.clip_grad_and_step()
 
-        if step % METRIC_STEP == 0:
+        if step % args.training.log_interval == 0:
             # Print metric
             metric = model.calculate_metric(is_training=True)
             logger.info(f'Current is step {step} of {len(dataloader)}, Train metric: {metric}')
 
-        if step % SAVE_STEP == 0:
+        if step % save_step == 0:
             metrics = eval(model, eval_dataloader)
             metrics['step'] = step
             if float(metrics['loss']) < best_eval_loss:
diff --git a/cookbook/mm/fsdp2_gemma4_mm.sh b/cookbook/mm/fsdp2_gemma4_mm.sh
index c67113d8f..82d9ef1d5 100644
--- a/cookbook/mm/fsdp2_gemma4_mm.sh
+++ b/cookbook/mm/fsdp2_gemma4_mm.sh
@@ -1,3 +1,21 @@
-export CUDA_VISIBLE_DEVICES=0,1
+#!/usr/bin/env bash
+set -euo pipefail
 
-torchrun --nnodes=1 --nproc_per_node=2 fsdp2_gemma4_mm.py
+# Multi-modal FSDP2 + LoRA training for Gemma4 (LaTeX OCR).
+# All training config passed as CLI flags. Override at invocation, e.g.:
+#   bash fsdp2_gemma4_mm.sh --model-id ms://google/gemma-4-4b-it --batch-size 4
+# fsdp2_gemma4_mm.py is located via this script's own directory, not the caller's cwd.
+CUDA_VISIBLE_DEVICES=0,1 \
+  torchrun --nnodes=1 --nproc_per_node=2 "$(dirname -- "${BASH_SOURCE[0]}")/fsdp2_gemma4_mm.py" \
+    --model-id ms://google/gemma-4-12b-it \
+    --dataset-id ms://AI-ModelScope/LaTeX_OCR \
+    --template-cls Gemma4Template \
+    --dp-size 2 \
+    --batch-size 2 \
+    --lr 1e-4 \
+    --gradient-accumulation-steps 4 \
+    --train-samples 2000 \
+    --eval-samples 100 \
+    --log-interval 10 \
+    --save-steps 200 \
+    "$@"
diff --git a/cookbook/rl/dpo_full.py b/cookbook/rl/dpo/dpo_full.py
similarity index 92%
rename from cookbook/rl/dpo_full.py
rename to cookbook/rl/dpo/dpo_full.py
index 8610b986f..afb3f6155 100644
--- a/cookbook/rl/dpo_full.py
+++ b/cookbook/rl/dpo/dpo_full.py
@@ -49,6 +49,7 @@
 
 import twinkle
 from twinkle import DeviceGroup, DeviceMesh, get_device_placement, get_logger
+from twinkle.cli import CLI
 from twinkle.data_format import Trajectory
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
@@ -58,25 +59,26 @@
 from twinkle.processor import InputProcessor
 
 logger = get_logger()
+args = CLI.from_args()
 
 # ── Configuration ─────────────────────────────────────────────────────────────
-USE_MEGATRON = int(os.environ.get('USE_MEGATRON', 0))
-MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3-4B')
-DATASET_ID = os.environ.get('DATASET_ID', 'ms://hjh0119/shareAI-Llama3-DPO-zh-en-emoji')
+USE_MEGATRON = args.model.strategy != 'native_fsdp'
+MODEL_ID = args.model.model_id or 'ms://Qwen/Qwen3-4B'
+DATASET_ID = args.dataset.dataset_id or 'ms://hjh0119/shareAI-Llama3-DPO-zh-en-emoji'
 
-MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
-REF_MODEL_GPUS = int(os.environ.get('REF_MODEL_GPUS', 4))
+MODEL_GPUS = args.infra.model_gpus or 4
+REF_MODEL_GPUS = args.infra.ref_model_gpus or 4
 NUM_GPUS = MODEL_GPUS + REF_MODEL_GPUS
 
-BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))  # Number of preference pairs
-GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 2))
-LEARNING_RATE = float(os.environ.get('LR', 1e-5))
-DPO_BETA = float(os.environ.get('DPO_BETA', 0.1))
-SFT_WEIGHT = float(os.environ.get('SFT_WEIGHT', 1.0))  # SFT loss weight for regularization
-LOSS_TYPE = os.environ.get('LOSS_TYPE', 'sigmoid')  # sigmoid, hinge, ipo, simpo, orpo, cpo
-SAVE_STEPS = int(os.environ.get('SAVE_STEPS', 100))
-MAX_LENGTH = int(os.environ.get('MAX_LENGTH', 2048))
-SYSTEM_PROMPT = os.environ.get('SYSTEM_PROMPT', 'You are a helpful assistant.')
+BATCH_SIZE = args.training.batch_size or 8
+GRADIENT_ACCUMULATION_STEPS = args.training.gradient_accumulation_steps or 2
+LEARNING_RATE = args.optimizer.learning_rate or 1e-5
+DPO_BETA = args.loss.beta
+SFT_WEIGHT = args.loss.sft_weight
+LOSS_TYPE = args.loss.loss_type
+SAVE_STEPS = args.training.save_steps or 100
+MAX_LENGTH = args.template.max_length
+SYSTEM_PROMPT = args.template.default_system or 'You are a helpful assistant.'
 
 
 def create_dpo_dataset():
diff --git a/cookbook/rl/dpo/dpo_full.sh b/cookbook/rl/dpo/dpo_full.sh
new file mode 100644
index 000000000..cffba898b
--- /dev/null
+++ b/cookbook/rl/dpo/dpo_full.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# DPO Full-Parameter Training via Ray.
+# Uses separate policy and reference model GPU groups.
+# All training config passed as CLI flags. Override at invocation, e.g.:
+#   bash dpo_full.sh --model-id ms://Qwen/Qwen3-8B --beta 0.05
+# dpo_full.py is located via this script's own directory, not the caller's cwd.
+python "$(dirname -- "${BASH_SOURCE[0]}")/dpo_full.py" \
+    --model-id ms://Qwen/Qwen3-4B \
+    --dataset-id ms://hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
+    --model-gpus 4 \
+    --ref-model-gpus 4 \
+    --batch-size 8 \
+    --gradient-accumulation-steps 2 \
+    --lr 1e-5 \
+    --beta 0.1 \
+    --sft-weight 1.0 \
+    --loss-type sigmoid \
+    --max-length 2048 \
+    --save-steps 100 \
+    "$@"
diff --git a/cookbook/rl/dpo_lora.py b/cookbook/rl/dpo/dpo_lora.py
similarity index 91%
rename from cookbook/rl/dpo_lora.py
rename to cookbook/rl/dpo/dpo_lora.py
index c7ec3147c..868de1521 100644
--- a/cookbook/rl/dpo_lora.py
+++ b/cookbook/rl/dpo/dpo_lora.py
@@ -48,6 +48,7 @@
 
 import twinkle
 from twinkle import DeviceGroup, DeviceMesh, get_device_placement, get_logger
+from twinkle.cli import CLI
 from twinkle.data_format import Trajectory
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
@@ -57,24 +58,25 @@
 from twinkle.processor import InputProcessor
 
 logger = get_logger()
+args = CLI.from_args()
 
 # ── Configuration ─────────────────────────────────────────────────────────────
-USE_MEGATRON = int(os.environ.get('USE_MEGATRON', 0))
-MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3-4B')
-DATASET_ID = os.environ.get('DATASET_ID', 'ms://hjh0119/shareAI-Llama3-DPO-zh-en-emoji')
-
-MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 8))
-
-BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))    # Number of preference pairs
-GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 2))
-LEARNING_RATE = float(os.environ.get('LR', 1e-4))  # LoRA DPO requires higher LR (1e-4 to 3e-4)
-DPO_BETA = float(os.environ.get('DPO_BETA', 0.1))
-SFT_WEIGHT = float(os.environ.get('SFT_WEIGHT', 1.0))  # SFT loss weight for regularization
-LOSS_TYPE = os.environ.get('LOSS_TYPE', 'sigmoid')  # sigmoid, hinge, ipo
-SAVE_STEPS = int(os.environ.get('SAVE_STEPS', 100))
-MAX_LENGTH = int(os.environ.get('MAX_LENGTH', 2048))
-ADAPTER_NAME = 'default'
-SYSTEM_PROMPT = os.environ.get('SYSTEM_PROMPT', 'You are a helpful assistant.')
+USE_MEGATRON = args.model.strategy != 'native_fsdp'
+MODEL_ID = args.model.model_id or 'ms://Qwen/Qwen3-4B'
+DATASET_ID = args.dataset.dataset_id or 'ms://hjh0119/shareAI-Llama3-DPO-zh-en-emoji'
+
+MODEL_GPUS = args.infra.model_gpus or 8
+
+BATCH_SIZE = args.training.batch_size or 8
+GRADIENT_ACCUMULATION_STEPS = args.training.gradient_accumulation_steps or 2
+LEARNING_RATE = args.optimizer.learning_rate or 1e-4
+DPO_BETA = args.loss.beta
+SFT_WEIGHT = args.loss.sft_weight
+LOSS_TYPE = args.loss.loss_type
+SAVE_STEPS = args.training.save_steps or 100
+MAX_LENGTH = args.template.max_length
+ADAPTER_NAME = args.lora.adapter_name or 'default'
+SYSTEM_PROMPT = args.template.default_system or 'You are a helpful assistant.'
 
 
 def create_dpo_dataset():
diff --git a/cookbook/rl/dpo/dpo_lora.sh b/cookbook/rl/dpo/dpo_lora.sh
new file mode 100644
index 000000000..7af42b6dc
--- /dev/null
+++ b/cookbook/rl/dpo/dpo_lora.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# DPO LoRA Training via Ray (single GPU group).
+# Uses base model (disable_lora=True) as reference model.
+# All training config passed as CLI flags. Override at invocation, e.g.:
+#   bash dpo_lora.sh --model-id ms://Qwen/Qwen3-8B --lr 5e-5
+# dpo_lora.py is located via this script's own directory, not the caller's cwd.
+python "$(dirname -- "${BASH_SOURCE[0]}")/dpo_lora.py" \
+    --model-id ms://Qwen/Qwen3-4B \
+    --dataset-id ms://hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
+    --model-gpus 8 \
+    --batch-size 8 \
+    --gradient-accumulation-steps 2 \
+    --lr 1e-4 \
+    --beta 0.1 \
+    --sft-weight 1.0 \
+    --loss-type sigmoid \
+    --max-length 2048 \
+    --save-steps 100 \
+    --adapter-name default \
+    "$@"
diff --git a/cookbook/rl/dpo_multi_lora.py b/cookbook/rl/dpo/dpo_multi_lora.py
similarity index 91%
rename from cookbook/rl/dpo_multi_lora.py
rename to cookbook/rl/dpo/dpo_multi_lora.py
index 7c09bf61f..0a43322fc 100644
--- a/cookbook/rl/dpo_multi_lora.py
+++ b/cookbook/rl/dpo/dpo_multi_lora.py
@@ -48,6 +48,7 @@
 
 import twinkle
 from twinkle import DeviceGroup, DeviceMesh, get_device_placement, get_logger
+from twinkle.cli import CLI
 from twinkle.data_format import Trajectory
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
@@ -57,23 +58,24 @@
 from twinkle.processor import InputProcessor
 
 logger = get_logger()
+args = CLI.from_args()
 
 # ── Configuration ─────────────────────────────────────────────────────────────
-MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.5-4B')
-DATASET_ID = os.environ.get('DATASET_ID', 'ms://hjh0119/shareAI-Llama3-DPO-zh-en-emoji')
-
-MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 2))
-
-BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))  # Number of preference pairs
-GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 2))
-LEARNING_RATE = float(os.environ.get('LR', 1e-4))  # LoRA DPO requires higher LR (1e-4 to 3e-4)
-DPO_BETA = float(os.environ.get('DPO_BETA', 0.1))
-SFT_WEIGHT = float(os.environ.get('SFT_WEIGHT', 1.0))  # SFT loss weight for regularization
-LOSS_TYPE = os.environ.get('LOSS_TYPE', 'sigmoid')  # sigmoid, hinge, ipo
-SAVE_STEPS = int(os.environ.get('SAVE_STEPS', 100))
-MAX_LENGTH = int(os.environ.get('MAX_LENGTH', 2048))
-ADAPTER_NAME = 'default_0'
-SYSTEM_PROMPT = os.environ.get('SYSTEM_PROMPT', 'You are a helpful assistant.')
+MODEL_ID = args.model.model_id or 'ms://Qwen/Qwen3.5-4B'
+DATASET_ID = args.dataset.dataset_id or 'ms://hjh0119/shareAI-Llama3-DPO-zh-en-emoji'
+
+MODEL_GPUS = args.infra.model_gpus or 2
+
+BATCH_SIZE = args.training.batch_size or 8
+GRADIENT_ACCUMULATION_STEPS = args.training.gradient_accumulation_steps or 2
+LEARNING_RATE = args.optimizer.learning_rate or 1e-4
+DPO_BETA = args.loss.beta
+SFT_WEIGHT = args.loss.sft_weight
+LOSS_TYPE = args.loss.loss_type
+SAVE_STEPS = args.training.save_steps or 100
+MAX_LENGTH = args.template.max_length
+ADAPTER_NAME = args.lora.adapter_name or 'default_0'
+SYSTEM_PROMPT = args.template.default_system or 'You are a helpful assistant.'
 
 
 def create_dpo_dataset():
diff --git a/cookbook/rl/dpo/dpo_multi_lora.sh b/cookbook/rl/dpo/dpo_multi_lora.sh
new file mode 100644
index 000000000..0652b95f2
--- /dev/null
+++ b/cookbook/rl/dpo/dpo_multi_lora.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# DPO MultiLoRA Training via Ray (Megatron backend).
+# Uses base model (disable_lora=True) as reference model.
+# All training config passed as CLI flags. Override at invocation, e.g.:
+#   bash dpo_multi_lora.sh --lr 5e-5 --save-steps 50
+# dpo_multi_lora.py is located via this script's own directory, not the caller's cwd.
+python "$(dirname -- "${BASH_SOURCE[0]}")/dpo_multi_lora.py" \
+    --model-id ms://Qwen/Qwen3.5-4B \
+    --dataset-id ms://hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
+    --model-gpus 2 \
+    --batch-size 8 \
+    --gradient-accumulation-steps 2 \
+    --lr 1e-4 \
+    --beta 0.1 \
+    --sft-weight 1.0 \
+    --loss-type sigmoid \
+    --max-length 2048 \
+    --save-steps 100 \
+    --adapter-name default_0 \
+    "$@"
diff --git a/cookbook/rl/gkd_off_policy.py b/cookbook/rl/gkd/gkd_off_policy.py
similarity index 94%
rename from cookbook/rl/gkd_off_policy.py
rename to cookbook/rl/gkd/gkd_off_policy.py
index 204e90f92..bdf992463 100644
--- a/cookbook/rl/gkd_off_policy.py
+++ b/cookbook/rl/gkd/gkd_off_policy.py
@@ -45,6 +45,7 @@
 
 import twinkle
 from twinkle import DeviceMesh, DeviceGroup, get_device_placement, get_logger
+from twinkle.cli import CLI
 from twinkle.data_format import SamplingParams
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
@@ -55,23 +56,24 @@
 from twinkle.template import Template
 
 logger = get_logger()
+args = CLI.from_args()
 
 # ── Configuration ─────────────────────────────────────────────────────────────
-STUDENT_MODEL_ID = os.environ.get('STUDENT_MODEL_ID', 'ms://Qwen/Qwen3-0.6B')
-TEACHER_MODEL_ID = os.environ.get('TEACHER_MODEL_ID', 'ms://Qwen/Qwen3-8B')
+STUDENT_MODEL_ID = args.rl.student_model_id or 'ms://Qwen/Qwen3-0.6B'  # falsy CLI value -> same default as the old env lookup
+TEACHER_MODEL_ID = args.rl.teacher_model_id or 'ms://Qwen/Qwen3-8B'
 
-MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
-SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 4))
+MODEL_GPUS = args.infra.model_gpus or 4
+SAMPLER_GPUS = args.infra.sampler_gpus or 4
 NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS
 
-BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 16))
-MAX_STEPS = int(os.environ.get('MAX_STEPS', 1000))
-LEARNING_RATE = float(os.environ.get('LR', 5e-5))
+BATCH_SIZE = args.training.batch_size or 16
+MAX_STEPS = args.training.max_steps or 1000
+LEARNING_RATE = args.optimizer.learning_rate or 5e-5
 
-GKD_BETA = float(os.environ.get('GKD_BETA', 0.5))
-GKD_TEMPERATURE = float(os.environ.get('GKD_TEMPERATURE', 1.0))
-GKD_TOPK = int(os.environ.get('GKD_TOPK', 64))
-ADAPTER_NAME = 'default'
+GKD_BETA = args.rl.gkd_beta  # no `or` fallback; env default was 0.5 — NOTE(review): confirm the CLI default matches
+GKD_TEMPERATURE = args.rl.gkd_temperature  # no `or` fallback; env default was 1.0 — confirm the CLI default
+GKD_TOPK = args.rl.gkd_topk  # no `or` fallback; env default was 64 — confirm the CLI default
+ADAPTER_NAME = args.lora.adapter_name or 'default'  # previously hard-coded to 'default'
 SYSTEM_PROMPT = ('You are a helpful math assistant. Solve the problem step by step and put '
                  'your final answer within #### <number>')
 
diff --git a/cookbook/rl/gkd/gkd_off_policy.sh b/cookbook/rl/gkd/gkd_off_policy.sh
new file mode 100644
index 000000000..262542062
--- /dev/null
+++ b/cookbook/rl/gkd/gkd_off_policy.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# GKD Off-Policy Distillation via Ray: teacher vLLM computes prompt logprobs on existing
+# dataset responses; student Megatron model learns to match teacher's token distribution.
+# gkd_off_policy.py is resolved next to this script, so the launcher works from any working directory.
+# All training config is passed as CLI flags. Override at invocation, e.g.:
+#   bash gkd_off_policy.sh --student-model-id ms://Qwen/Qwen3-1.7B --gkd-beta 0.3
+
+python "$(dirname -- "${BASH_SOURCE[0]:-$0}")/gkd_off_policy.py" \
+    --student-model-id ms://Qwen/Qwen3-0.6B \
+    --teacher-model-id ms://Qwen/Qwen3-8B \
+    --model-gpus 4 \
+    --sampler-gpus 4 \
+    --batch-size 16 \
+    --max-steps 1000 \
+    --lr 5e-5 \
+    --gkd-beta 0.5 \
+    --gkd-temperature 1.0 \
+    --gkd-topk 64 \
+    --adapter-name default \
+    "$@"
diff --git a/cookbook/rl/gkd_on_policy.py b/cookbook/rl/gkd/gkd_on_policy.py
similarity index 94%
rename from cookbook/rl/gkd_on_policy.py
rename to cookbook/rl/gkd/gkd_on_policy.py
index 2675d0358..1ddc9d89e 100644
--- a/cookbook/rl/gkd_on_policy.py
+++ b/cookbook/rl/gkd/gkd_on_policy.py
@@ -51,6 +51,7 @@
 import twinkle
 from twinkle import DeviceMesh, DeviceGroup, get_device_placement, get_logger
 from twinkle.checkpoint_engine import CheckpointEngineManager
+from twinkle.cli import CLI
 from twinkle.data_format import SamplingParams
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import DatasetMeta, LazyDataset
@@ -60,26 +61,27 @@
 from twinkle.sampler import vLLMSampler
 
 logger = get_logger()
+args = CLI.from_args()
 
 # ── Configuration ─────────────────────────────────────────────────────────────
-STUDENT_MODEL_ID = os.environ.get('STUDENT_MODEL_ID', 'ms://Qwen/Qwen3.5-4B')
-TEACHER_MODEL_ID = os.environ.get('TEACHER_MODEL_ID', 'ms://Qwen/Qwen3.5-9B')
-USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '1')))
+STUDENT_MODEL_ID = args.rl.student_model_id or 'ms://Qwen/Qwen3.5-4B'
+TEACHER_MODEL_ID = args.rl.teacher_model_id or 'ms://Qwen/Qwen3.5-9B'
+USE_MEGATRON = args.model.strategy != 'native_fsdp'  # True for every strategy value except 'native_fsdp' — NOTE(review): confirm the CLI's default strategy
 
-MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
-SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 2))
+MODEL_GPUS = args.infra.model_gpus or 4
+SAMPLER_GPUS = args.infra.sampler_gpus or 2
 NUM_GPUS = MODEL_GPUS + 2*SAMPLER_GPUS
 
-MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 2048))
-BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 4))
-MAX_STEPS = int(os.environ.get('MAX_STEPS', 1000))
-LEARNING_RATE = float(os.environ.get('LR', 5e-5))
-N_SAMPLES = int(os.environ.get('N_SAMPLES', 1))
+MAX_NEW_TOKENS = args.sampling.max_tokens or 2048
+BATCH_SIZE = args.training.batch_size or 4
+MAX_STEPS = args.training.max_steps or 1000
+LEARNING_RATE = args.optimizer.learning_rate or 5e-5
+N_SAMPLES = args.sampling.num_samples  # no `or` fallback; env default was 1 — NOTE(review): confirm the CLI default matches
 
-GKD_BETA = float(os.environ.get('GKD_BETA', 0.5))
-GKD_TEMPERATURE = float(os.environ.get('GKD_TEMPERATURE', 1.0))
-GKD_TOPK = int(os.environ.get('GKD_TOPK', 64))
-ADAPTER_NAME = 'default'
+GKD_BETA = args.rl.gkd_beta  # no `or` fallback; env default was 0.5 — confirm the CLI default
+GKD_TEMPERATURE = args.rl.gkd_temperature  # no `or` fallback; env default was 1.0 — confirm the CLI default
+GKD_TOPK = args.rl.gkd_topk  # no `or` fallback; env default was 64 — confirm the CLI default
+ADAPTER_NAME = args.lora.adapter_name or 'default'  # previously hard-coded to 'default'
 
 # OlympiadBench subsets
 SUBSETS = [
diff --git a/cookbook/rl/gkd/gkd_on_policy.sh b/cookbook/rl/gkd/gkd_on_policy.sh
new file mode 100644
index 000000000..ed37e5fb5
--- /dev/null
+++ b/cookbook/rl/gkd/gkd_on_policy.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# GKD On-Policy Multimodal Distillation via Ray: student generates on-policy,
+# teacher provides top-k prompt logprobs, student trains to match teacher's distribution.
+# gkd_on_policy.py is resolved next to this script, so the launcher works from any working directory.
+# All training config is passed as CLI flags. Override at invocation, e.g.:
+#   bash gkd_on_policy.sh --student-model-id ms://Qwen/Qwen3.5-4B --max-steps 500
+
+python "$(dirname -- "${BASH_SOURCE[0]:-$0}")/gkd_on_policy.py" \
+    --student-model-id ms://Qwen/Qwen3.5-4B \
+    --teacher-model-id ms://Qwen/Qwen3.5-9B \
+    --model-gpus 4 \
+    --sampler-gpus 2 \
+    --batch-size 4 \
+    --max-steps 1000 \
+    --max-tokens 2048 \
+    --lr 5e-5 \
+    --num-samples 1 \
+    --gkd-beta 0.5 \
+    --gkd-temperature 1.0 \
+    --gkd-topk 64 \
+    --adapter-name default \
+    "$@"
diff --git a/cookbook/rl/grpo.py b/cookbook/rl/grpo/grpo.py
similarity index 88%
rename from cookbook/rl/grpo.py
rename to cookbook/rl/grpo/grpo.py
index dd16e7f07..e50402383 100644
--- a/cookbook/rl/grpo.py
+++ b/cookbook/rl/grpo/grpo.py
@@ -7,6 +7,7 @@
 from twinkle import DeviceMesh, DeviceGroup, get_device_placement, get_logger
 from twinkle.advantage import GRPOAdvantage
 from twinkle.checkpoint_engine import CheckpointEngineManager
+from twinkle.cli import CLI
 from twinkle.data_format import SamplingParams
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
@@ -19,24 +20,25 @@
 from twinkle.preprocessor.llm import GSM8KProcessor
 
 logger = get_logger()
+args = CLI.from_args()
 
-MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.5-4B')
-USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '0')))
+MODEL_ID = args.model.model_id or 'ms://Qwen/Qwen3.5-4B'
+USE_MEGATRON = args.model.strategy != 'native_fsdp'  # NOTE(review): env default was off ('0'); now on for any strategy other than 'native_fsdp' — confirm the CLI's default strategy
 
-MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
-SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS',4))
+MODEL_GPUS = args.infra.model_gpus or 4
+SAMPLER_GPUS = args.infra.sampler_gpus or 4
 NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS
 
-NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8))
-MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096))
-LEARNING_RATE = float(os.environ.get('LR', 1e-5))
-MAX_STEPS = int(os.environ.get('MAX_STEPS', 200))
-BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8)) # global prompt-level, global completion-level batch size = BATCH_SIZE * num_generations * dp_size
-MINI_BATCH_SIZE = int(os.environ.get('MINI_BATCH_SIZE', 8)) # global completion-level mini-batch-size
-MICRO_BATCH_SIZE = int(os.environ.get('MICRO_BATCH_SIZE', 2)) # per-device-micro-batch-size (completion-level), batch_size in forward_backward
-GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 1))
-ADAPTER_NAME = 'default'
-SAVE_STEPS = int(os.environ.get('SAVE_STEPS', 50))
+NUM_GENERATIONS = args.rl.num_generations or 8
+MAX_NEW_TOKENS = args.sampling.max_tokens or 4096
+LEARNING_RATE = args.optimizer.learning_rate or 1e-5
+MAX_STEPS = args.training.max_steps or 200
+BATCH_SIZE = args.training.batch_size or 8  # global prompt-level, global completion-level batch size = BATCH_SIZE * num_generations * dp_size
+MINI_BATCH_SIZE = args.training.mini_batch_size or 8  # global completion-level mini-batch-size
+MICRO_BATCH_SIZE = args.training.micro_batch_size or 2  # per-device-micro-batch-size (completion-level), batch_size in forward_backward
+GRADIENT_ACCUMULATION_STEPS = args.training.gradient_accumulation_steps or 1
+ADAPTER_NAME = args.lora.adapter_name or 'default'  # previously hard-coded to 'default'
+SAVE_STEPS = args.training.save_steps or 50
 
 def create_gsm8k_dataset():
     dataset = Dataset(DatasetMeta('ms://modelscope/gsm8k', subset_name='main', split='train'))
diff --git a/cookbook/rl/grpo/grpo.sh b/cookbook/rl/grpo/grpo.sh
new file mode 100644
index 000000000..b6feb02fa
--- /dev/null
+++ b/cookbook/rl/grpo/grpo.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# GRPO training on GSM8K via Ray; model + vLLM sampler on separate GPU groups.
+# grpo.py is resolved next to this script, so the launcher works from any working directory.
+# All training config is passed as CLI flags. Override at invocation, e.g.:
+#   bash grpo.sh --model-id ms://Qwen/Qwen3.5-4B --max-steps 500
+
+python "$(dirname -- "${BASH_SOURCE[0]:-$0}")/grpo.py" \
+    --model-id ms://Qwen/Qwen3.5-4B \
+    --model-gpus 4 \
+    --sampler-gpus 4 \
+    --num-generations 8 \
+    --max-tokens 4096 \
+    --batch-size 8 \
+    --mini-batch-size 8 \
+    --micro-batch-size 2 \
+    --max-steps 200 \
+    --lr 1e-5 \
+    --save-steps 50 \
+    --adapter-name default \
+    "$@"
diff --git a/cookbook/rl/grpo_mm.py b/cookbook/rl/grpo/grpo_mm.py
similarity index 92%
rename from cookbook/rl/grpo_mm.py
rename to cookbook/rl/grpo/grpo_mm.py
index 1f89c7a91..f7de43ca5 100644
--- a/cookbook/rl/grpo_mm.py
+++ b/cookbook/rl/grpo/grpo_mm.py
@@ -14,6 +14,7 @@
 from twinkle import DeviceMesh, DeviceGroup, get_device_placement, get_logger
 from twinkle.advantage import GRPOAdvantage
 from twinkle.checkpoint_engine import CheckpointEngineManager
+from twinkle.cli import CLI
 from twinkle.data_format import SamplingParams
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import DatasetMeta, LazyDataset
@@ -28,27 +29,28 @@
 from twinkle.sampler import vLLMSampler
 
 logger = get_logger()
+args = CLI.from_args()
 
 # Model configuration
-MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.5-4B')
-USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '1')))
+MODEL_ID = args.model.model_id or 'ms://Qwen/Qwen3.5-4B'
+USE_MEGATRON = args.model.strategy != 'native_fsdp'  # True for every strategy value except 'native_fsdp' — NOTE(review): confirm the CLI's default strategy
 
 # GPU configuration
-MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
-SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 4))
+MODEL_GPUS = args.infra.model_gpus or 4
+SAMPLER_GPUS = args.infra.sampler_gpus or 4
 NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS
 
 # Training hyperparameters
-NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8))
-MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096))
-LEARNING_RATE = float(os.environ.get('LR', 1e-5))
-MAX_STEPS = int(os.environ.get('MAX_STEPS', 1000))
-BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 4))
-MINI_BATCH_SIZE = int(os.environ.get('MINI_BATCH_SIZE', 4))
-MICRO_BATCH_SIZE = int(os.environ.get('MICRO_BATCH_SIZE', 1))
-GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 1))
-ADAPTER_NAME = 'default'
-SAVE_STEPS = int(os.environ.get('SAVE_STEPS', 50))
+NUM_GENERATIONS = args.rl.num_generations or 8  # `or`: any falsy CLI value (None, 0) falls back to the old env default
+MAX_NEW_TOKENS = args.sampling.max_tokens or 4096
+LEARNING_RATE = args.optimizer.learning_rate or 1e-5
+MAX_STEPS = args.training.max_steps or 1000
+BATCH_SIZE = args.training.batch_size or 4
+MINI_BATCH_SIZE = args.training.mini_batch_size or 4
+MICRO_BATCH_SIZE = args.training.micro_batch_size or 1
+GRADIENT_ACCUMULATION_STEPS = args.training.gradient_accumulation_steps or 1
+ADAPTER_NAME = args.lora.adapter_name or 'default'  # previously hard-coded to 'default'
+SAVE_STEPS = args.training.save_steps or 50
 
 # Dataset configuration
 SUBSETS = [
diff --git a/cookbook/rl/grpo/grpo_mm.sh b/cookbook/rl/grpo/grpo_mm.sh
new file mode 100644
index 000000000..b5ca2fda3
--- /dev/null
+++ b/cookbook/rl/grpo/grpo_mm.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# GRPO Multimodal training on OlympiadBench via Ray; supports multimodal math/physics problems (Chinese CEE).
+# grpo_mm.py is resolved next to this script, so the launcher works from any working directory.
+# All training config is passed as CLI flags. Override at invocation, e.g.:
+#   bash grpo_mm.sh --model-id ms://Qwen/Qwen3.5-4B --max-steps 500
+
+python "$(dirname -- "${BASH_SOURCE[0]:-$0}")/grpo_mm.py" \
+    --model-id ms://Qwen/Qwen3.5-4B \
+    --model-gpus 4 \
+    --sampler-gpus 4 \
+    --num-generations 8 \
+    --max-tokens 4096 \
+    --batch-size 4 \
+    --mini-batch-size 4 \
+    --micro-batch-size 1 \
+    --max-steps 1000 \
+    --lr 1e-5 \
+    --save-steps 50 \
+    --adapter-name default \
+    "$@"
diff --git a/cookbook/rl/short_math_grpo.py b/cookbook/rl/grpo/short_math_grpo.py
similarity index 91%
rename from cookbook/rl/short_math_grpo.py
rename to cookbook/rl/grpo/short_math_grpo.py
index 5e107b0ae..91fcd7669 100644
--- a/cookbook/rl/short_math_grpo.py
+++ b/cookbook/rl/grpo/short_math_grpo.py
@@ -14,6 +14,7 @@
 from twinkle import DeviceMesh, DeviceGroup, get_device_placement, get_logger
 from twinkle.advantage import GRPOAdvantage
 from twinkle.checkpoint_engine import CheckpointEngineManager
+from twinkle.cli import CLI
 from twinkle.data_format import SamplingParams
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
@@ -26,26 +27,27 @@
 from twinkle.preprocessor.llm import GSM8KProcessor
 
 logger = get_logger()
+args = CLI.from_args()
 
 # ========== Configuration ==========
-MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.5-4B')
-USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '1')))
+MODEL_ID = args.model.model_id or 'ms://Qwen/Qwen3.5-4B'
+USE_MEGATRON = args.model.strategy != 'native_fsdp'  # True for every strategy value except 'native_fsdp' — NOTE(review): confirm the CLI's default strategy
 
-MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
-SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 4))
+MODEL_GPUS = args.infra.model_gpus or 4
+SAMPLER_GPUS = args.infra.sampler_gpus or 4
 NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS
 
-NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8))
-MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096))
-LEARNING_RATE = float(os.environ.get('LR', 1e-5))
-MAX_STEPS = int(os.environ.get('MAX_STEPS', 1000))
-BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 8))
-MINI_BATCH_SIZE = int(os.environ.get('MINI_BATCH_SIZE', 8))
-MICRO_BATCH_SIZE = int(os.environ.get('MICRO_BATCH_SIZE', 2))
-GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 1))
-ADAPTER_NAME = 'default'
-SAVE_STEPS = int(os.environ.get('SAVE_STEPS', 1000))
-LORA_RANK = int(os.environ.get('LORA_RANK', 16))
+NUM_GENERATIONS = args.rl.num_generations or 8  # `or`: any falsy CLI value (None, 0) falls back to the old env default
+MAX_NEW_TOKENS = args.sampling.max_tokens or 4096
+LEARNING_RATE = args.optimizer.learning_rate or 1e-5
+MAX_STEPS = args.training.max_steps or 1000
+BATCH_SIZE = args.training.batch_size or 8
+MINI_BATCH_SIZE = args.training.mini_batch_size or 8
+MICRO_BATCH_SIZE = args.training.micro_batch_size or 2
+GRADIENT_ACCUMULATION_STEPS = args.training.gradient_accumulation_steps or 1
+ADAPTER_NAME = args.lora.adapter_name or 'default'  # previously hard-coded to 'default'
+SAVE_STEPS = args.training.save_steps or 1000
+LORA_RANK = args.lora.lora_r or 16
 
 SYSTEM_PROMPT = ('You are a helpful math assistant. Solve the problem with minimal but correct reasoning '
                  'and put your final answer within \\boxed{}.')
diff --git a/cookbook/rl/grpo/short_math_grpo.sh b/cookbook/rl/grpo/short_math_grpo.sh
new file mode 100644
index 000000000..033507dc8
--- /dev/null
+++ b/cookbook/rl/grpo/short_math_grpo.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# GRPO Short Math Reasoning on GSM8K via Ray; short reasoning format: shorter thinking gets higher brevity reward.
+# short_math_grpo.py is resolved next to this script, so the launcher works from any working directory.
+# All training config is passed as CLI flags. Override at invocation, e.g.:
+#   bash short_math_grpo.sh --model-id ms://Qwen/Qwen3.5-4B --max-steps 500
+
+python "$(dirname -- "${BASH_SOURCE[0]:-$0}")/short_math_grpo.py" \
+    --model-id ms://Qwen/Qwen3.5-4B \
+    --model-gpus 4 \
+    --sampler-gpus 4 \
+    --num-generations 8 \
+    --max-tokens 4096 \
+    --batch-size 8 \
+    --mini-batch-size 8 \
+    --micro-batch-size 2 \
+    --max-steps 1000 \
+    --lr 1e-5 \
+    --lora-r 16 \
+    --save-steps 1000 \
+    --adapter-name default \
+    "$@"
diff --git a/cookbook/rl/short_math_grpo_moe.py b/cookbook/rl/grpo/short_math_grpo_moe.py
similarity index 90%
rename from cookbook/rl/short_math_grpo_moe.py
rename to cookbook/rl/grpo/short_math_grpo_moe.py
index 9d870eacb..f19747282 100644
--- a/cookbook/rl/short_math_grpo_moe.py
+++ b/cookbook/rl/grpo/short_math_grpo_moe.py
@@ -14,6 +14,7 @@
 from twinkle import DeviceMesh, DeviceGroup, get_device_placement, get_logger
 from twinkle.advantage import GRPOAdvantage
 from twinkle.checkpoint_engine import CheckpointEngineManager
+from twinkle.cli import CLI
 from twinkle.data_format import SamplingParams
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
@@ -26,31 +27,32 @@
 from twinkle.preprocessor.llm import GSM8KProcessor
 
 logger = get_logger()
+args = CLI.from_args()
 
 # ========== Configuration ==========
-MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.6-35B-A3B')
-USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '1')))
+MODEL_ID = args.model.model_id or 'ms://Qwen/Qwen3.6-35B-A3B'
+USE_MEGATRON = args.model.strategy != 'native_fsdp'  # True for every strategy value except 'native_fsdp' — NOTE(review): confirm the CLI's default strategy
 
-MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
-MODEL_EP = int(os.environ.get('MODEL_EP', 2))
-MODEL_TP = int(os.environ.get('MODEL_TP', 2))
-MODEL_PP = int(os.environ.get('MODEL_PP', 2))
+MODEL_GPUS = args.infra.model_gpus or 4
+MODEL_EP = args.infra.ep_size or 2
+MODEL_TP = args.infra.tp_size or 2  # model-side TP lives under args.infra ...
+MODEL_PP = args.infra.pp_size or 2
 
-SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 2))
-SAMPLER_TP = int(os.environ.get('SAMPLER_TP', 2))
+SAMPLER_GPUS = args.infra.sampler_gpus or 2
+SAMPLER_TP = args.sampler.tensor_parallel_size or 2  # ... while sampler-side TP is read from args.sampler
 NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS
 
-NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8))
-MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096))
-LEARNING_RATE = float(os.environ.get('LR', 5e-5))
-MAX_STEPS = int(os.environ.get('MAX_STEPS', 1000))
-BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 4))
-MINI_BATCH_SIZE = int(os.environ.get('MINI_BATCH_SIZE', 4))
-MICRO_BATCH_SIZE = int(os.environ.get('MICRO_BATCH_SIZE', 1))
-GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 1))
-ADAPTER_NAME = 'default'
-SAVE_STEPS = int(os.environ.get('SAVE_STEPS', 1000))
-LORA_RANK = int(os.environ.get('LORA_RANK', 16))
+NUM_GENERATIONS = args.rl.num_generations or 8  # `or`: any falsy CLI value (None, 0) falls back to the old env default
+MAX_NEW_TOKENS = args.sampling.max_tokens or 4096
+LEARNING_RATE = args.optimizer.learning_rate or 5e-5
+MAX_STEPS = args.training.max_steps or 1000
+BATCH_SIZE = args.training.batch_size or 4
+MINI_BATCH_SIZE = args.training.mini_batch_size or 4
+MICRO_BATCH_SIZE = args.training.micro_batch_size or 1
+GRADIENT_ACCUMULATION_STEPS = args.training.gradient_accumulation_steps or 1
+ADAPTER_NAME = args.lora.adapter_name or 'default'  # previously hard-coded to 'default'
+SAVE_STEPS = args.training.save_steps or 1000
+LORA_RANK = args.lora.lora_r or 16
 
 SYSTEM_PROMPT = ('You are a helpful math assistant. Solve the problem with minimal but correct reasoning '
                  'and put your final answer within \\boxed{}.')
diff --git a/cookbook/rl/grpo/short_math_grpo_moe.sh b/cookbook/rl/grpo/short_math_grpo_moe.sh
new file mode 100644
index 000000000..00369610c
--- /dev/null
+++ b/cookbook/rl/grpo/short_math_grpo_moe.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# GRPO Short Math MoE on GSM8K via Ray; Megatron MoE model with TP+EP+PP parallelism.
+# short_math_grpo_moe.py is resolved next to this script, so the launcher works from any working directory.
+# All training config is passed as CLI flags. Override at invocation, e.g.:
+#   bash short_math_grpo_moe.sh --model-id ms://Qwen/Qwen3.6-35B-A3B --max-steps 500
+
+python "$(dirname -- "${BASH_SOURCE[0]:-$0}")/short_math_grpo_moe.py" \
+    --model-id ms://Qwen/Qwen3.6-35B-A3B \
+    --model-gpus 4 \
+    --sampler-gpus 2 \
+    --ep-size 2 \
+    --tp-size 2 \
+    --pp-size 2 \
+    --tensor-parallel-size 2 \
+    --num-generations 8 \
+    --max-tokens 4096 \
+    --batch-size 4 \
+    --mini-batch-size 4 \
+    --micro-batch-size 1 \
+    --max-steps 1000 \
+    --lr 5e-5 \
+    --lora-r 16 \
+    --save-steps 1000 \
+    --adapter-name default \
+    "$@"
diff --git a/cookbook/rl/short_math_grpo_multi_lora.py b/cookbook/rl/grpo/short_math_grpo_multi_lora.py
similarity index 92%
rename from cookbook/rl/short_math_grpo_multi_lora.py
rename to cookbook/rl/grpo/short_math_grpo_multi_lora.py
index 9dad8df30..cff4bb4b9 100644
--- a/cookbook/rl/short_math_grpo_multi_lora.py
+++ b/cookbook/rl/grpo/short_math_grpo_multi_lora.py
@@ -21,6 +21,7 @@
 import twinkle
 from twinkle import DeviceMesh, DeviceGroup, get_device_placement, get_logger
 from twinkle.advantage import GRPOAdvantage
+from twinkle.cli import CLI
 from twinkle.data_format import SamplingParams
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
@@ -33,28 +34,29 @@
 from twinkle.preprocessor.llm import GSM8KProcessor
 
 logger = get_logger()
+args = CLI.from_args()
 
 # ========== Configuration ==========
-MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.6-35B-A3B')
+MODEL_ID = args.model.model_id or 'ms://Qwen/Qwen3.6-35B-A3B'  # `or`: any falsy CLI value falls back to the old env default
 
-MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4))
-SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 2))
-SAMPLER_TP = int(os.environ.get('SAMPLER_TP', 2))
+MODEL_GPUS = args.infra.model_gpus or 4
+SAMPLER_GPUS = args.infra.sampler_gpus or 2
+SAMPLER_TP = args.sampler.tensor_parallel_size or 2  # sampler TP is read from args.sampler, not args.infra
 
 NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS
 
-NUM_GENERATIONS = int(os.environ.get('NUM_GENERATIONS', 8))
-MAX_NEW_TOKENS = int(os.environ.get('MAX_NEW_TOKENS', 4096))
-LEARNING_RATE = float(os.environ.get('LR', 5e-5))
-MAX_STEPS = int(os.environ.get('MAX_STEPS', 1000))
-BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 4))
-MINI_BATCH_SIZE = int(os.environ.get('MINI_BATCH_SIZE', 4))
-MICRO_BATCH_SIZE = int(os.environ.get('MICRO_BATCH_SIZE', 1))
-GRADIENT_ACCUMULATION_STEPS = int(os.environ.get('GRADIENT_ACCUMULATION_STEPS', 1))
-ADAPTER_NAME = 'default_0'
-SAVE_STEPS = int(os.environ.get('SAVE_STEPS', 1000))
-LORA_RANK = int(os.environ.get('LORA_RANK', 16))
-LORA_SYNC_DIR = os.environ.get('LORA_SYNC_DIR', 'output/lora_sync')
+NUM_GENERATIONS = args.rl.num_generations or 8
+MAX_NEW_TOKENS = args.sampling.max_tokens or 4096
+LEARNING_RATE = args.optimizer.learning_rate or 5e-5
+MAX_STEPS = args.training.max_steps or 1000
+BATCH_SIZE = args.training.batch_size or 4
+MINI_BATCH_SIZE = args.training.mini_batch_size or 4
+MICRO_BATCH_SIZE = args.training.micro_batch_size or 1
+GRADIENT_ACCUMULATION_STEPS = args.training.gradient_accumulation_steps or 1
+ADAPTER_NAME = args.lora.adapter_name or 'default_0'  # previously hard-coded to 'default_0'
+SAVE_STEPS = args.training.save_steps or 1000
+LORA_RANK = args.lora.lora_r or 16
+LORA_SYNC_DIR = args.checkpoint.lora_sync_dir or 'output/lora_sync'  # relative default: resolved against the process working directory
 
 SYSTEM_PROMPT = ('You are a helpful math assistant. Solve the problem with minimal but correct reasoning '
                  'and put your final answer within \\boxed{}.')
diff --git a/cookbook/rl/grpo/short_math_grpo_multi_lora.sh b/cookbook/rl/grpo/short_math_grpo_multi_lora.sh
new file mode 100644
index 000000000..a465250c8
--- /dev/null
+++ b/cookbook/rl/grpo/short_math_grpo_multi_lora.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# GRPO Short Math MultiLoRA on GSM8K via Ray; MultiLoraMegatronModel with filesystem-based LoRA sync to vLLM.
+# Model: Qwen3.6-35B-A3B (MoE) with tp=2, ep=2, pp=2.
+# short_math_grpo_multi_lora.py is resolved next to this script, so the launcher works from any working directory.
+# All training config is passed as CLI flags. Override at invocation, e.g.:
+#   bash short_math_grpo_multi_lora.sh --model-id ms://Qwen/Qwen3.6-35B-A3B --max-steps 500
+
+python "$(dirname -- "${BASH_SOURCE[0]:-$0}")/short_math_grpo_multi_lora.py" \
+    --model-id ms://Qwen/Qwen3.6-35B-A3B \
+    --model-gpus 4 \
+    --sampler-gpus 2 \
+    --tensor-parallel-size 2 \
+    --num-generations 8 \
+    --max-tokens 4096 \
+    --batch-size 4 \
+    --mini-batch-size 4 \
+    --micro-batch-size 1 \
+    --max-steps 1000 \
+    --lr 5e-5 \
+    --lora-r 16 \
+    --save-steps 1000 \
+    --adapter-name default_0 \
+    --lora-sync-dir output/lora_sync \
+    "$@"
diff --git a/cookbook/rl/multi_turn/multi_turn_grpo.py b/cookbook/rl/multi_turn/multi_turn_grpo.py
new file mode 100644
index 000000000..6661bfcdc
--- /dev/null
+++ b/cookbook/rl/multi_turn/multi_turn_grpo.py
@@ -0,0 +1,457 @@
+"""Multi-turn GRPO training with EnvPool (integrated environment pool).
+
+Demonstrates how to train an LLM agent via GRPO on interactive environments
+(e.g. Blackjack) using EnvPool and Twinkle's MultiTurnRollout.
+
+EnvPool is deployed as a @remote_class component — either:
+  - With remote_group='env': runs on a dedicated CPU DeviceGroup (isolated)
+  - Without remote_group: runs locally in the driver (zero RPC overhead)
+
+The agent interacts with environments through tool calls:
+  1. EnvPool manages N env instances; each trajectory maps to one slot.
+  2. MultiTurnRollout drives the multi-turn loop: model generates tool calls,
+     EnvTool dispatches them to env.step(), observations are fed back.
+  3. Episode reward is extracted after rollout completes.
+  4. GRPO advantages are computed across the batch and used for policy update.
+
+Usage:
+  # No need to start a separate server — environments are instantiated
+  # directly inside the EnvPool worker:
+  #   python multi_turn_grpo.py
+  #
+  # To run envs on a dedicated CPU worker (isolated):
+  #   ENV_REMOTE=1 python multi_turn_grpo.py
+
+References:
+  - OpenEnv GRPO Blackjack: https://github.com/huggingface/OpenEnv/tree/main/examples/grpo_blackjack
+  - cookbook/rl/grpo/short_math_grpo.py (single-turn GRPO template)
+"""
+import os
+from typing import Any, Dict, List, Tuple
+
+from peft import LoraConfig
+
+import twinkle
+from twinkle import DeviceMesh, DeviceGroup, get_device_placement, get_logger
+from twinkle.advantage import GRPOAdvantage
+from twinkle.checkpoint_engine import CheckpointEngineManager
+from twinkle.cli import CLI
+from twinkle.data_format import SamplingParams
+from twinkle.metric import CompletionRewardMetric
+from twinkle.model import TransformersModel
+from twinkle.processor import InputProcessor
+from twinkle.sampler import vLLMSampler
+from twinkle.template import Qwen3_5Template
+from twinkle_agentic.envs import EnvPool, EnvPoolAdapter, EnvTool
+from twinkle_agentic.rollout.multi_turn import MultiTurnRollout
+from twinkle_agentic.tools.tool_manager import ToolManager
+
+logger = get_logger()
+args = CLI.from_args()
+
+# ========== Configuration ==========
+MODEL_ID = args.model.model_id or 'ms://Qwen/Qwen3.5-4B'
+USE_MEGATRON = False  # fixed in this example: main() builds a TransformersModel unless this line is edited
+
+MODEL_GPUS = args.infra.model_gpus or 4
+SAMPLER_GPUS = args.infra.sampler_gpus or 4
+NUM_GPUS = MODEL_GPUS + SAMPLER_GPUS
+
+NUM_GENERATIONS = args.rl.num_generations or 8  # `or`: any falsy CLI value (None, 0) falls back to the literal
+MAX_NEW_TOKENS = args.sampling.max_tokens or 2048
+LEARNING_RATE = args.optimizer.learning_rate or 1e-5
+MAX_STEPS = args.training.max_steps or 1000
+BATCH_SIZE = args.training.batch_size or 4  # main() requests BATCH_SIZE * NUM_GENERATIONS trajectories per batch
+MINI_BATCH_SIZE = args.training.mini_batch_size or 8
+MICRO_BATCH_SIZE = args.training.micro_batch_size or 2
+GRADIENT_ACCUMULATION_STEPS = args.training.gradient_accumulation_steps or 1
+ADAPTER_NAME = args.lora.adapter_name or 'default'
+SAVE_STEPS = args.training.save_steps or 500
+LORA_RANK = args.lora.lora_r or 16  # main() also uses 2 * LORA_RANK as lora_alpha
+MAX_TURNS = int(os.environ.get('MAX_TURNS', '6'))  # read from the environment, not from the CLI args
+
+# Environment configuration
+# ENV_CLS: import path to the environment class (no server needed)
+ENV_CLS = os.environ.get('ENV_CLS', 'blackjack_env:BlackjackEnv')
+# ENV_REMOTE: set to '1' to deploy envs on a dedicated CPU DeviceGroup
+ENV_REMOTE = os.environ.get('ENV_REMOTE', '0') == '1'
+# Pool size = total trajectories per batch
+ENV_POOL_SIZE = int(os.environ.get('ENV_POOL_SIZE', '0'))  # 0 = auto
+
+# ========== Tool Schema (Blackjack example) ==========
+# Define tools the model can use in the environment.
+# For blackjack: a single "play" tool with hit/stand actions.
+# Override TOOL_SCHEMA for different environments.
+BLACKJACK_TOOL_SCHEMA = [
+    {
+        'type': 'function',
+        'function': {
+            'name': 'play',
+            'description': 'Take an action in the blackjack game.',
+            'parameters': {
+                'type': 'object',
+                'properties': {
+                    'action': {
+                        'type': 'string',
+                        'enum': ['hit', 'stand'],
+                        'description': 'The action to take: "hit" to draw a card, "stand" to keep current hand.',
+                    }
+                },
+                'required': ['action'],
+            },
+        },
+    }
+]
+
+TOOL_SCHEMA = BLACKJACK_TOOL_SCHEMA
+
+# Action name → OpenSpiel action_id mapping for blackjack.
+# OpenSpiel blackjack: 0 = HIT, 1 = STAND
+BLACKJACK_ACTION_MAP = {'hit': 0, 'stand': 1}
+
+
+def blackjack_action_mapper(tool_name: str, arguments: Dict[str, Any]) -> Dict[str, Any]:
+    """Translate a `play` tool call into an OpenSpiel-style action dict (tool_name is not inspected).
+
+    play(action='hit') -> {'action_id': 0, 'game_name': 'blackjack'}; missing or unknown actions map to STAND (1).
+    """
+    requested = str(arguments.get('action', 'stand')).lower().strip()
+    stand_id = 1  # fallback for anything that is not a key of BLACKJACK_ACTION_MAP
+    return {'action_id': BLACKJACK_ACTION_MAP.get(requested, stand_id), 'game_name': 'blackjack'}
+
+
+SYSTEM_PROMPT = """You are a skilled blackjack player. You will be told your current hand and the dealer's visible card.
+
+Your goal is to win the game by getting as close to 21 as possible without going over.
+
+Strategy guidelines:
+- Hit if your hand total is below 12
+- Consider the dealer's visible card when deciding
+- Stand if you have 17 or higher
+- Be cautious with hard hands (no ace counted as 11)
+
+Use the `play` tool to take actions. Always reason briefly before acting."""
+
+
+# ========== Environment Setup ==========
+def prepare_trajectories(
+    env_pool: EnvPool,
+    n_trajectories: int,
+    tool_schema: List[Dict],
+    system_prompt: str,
+    action_mapper=None,
+) -> Tuple[List[Dict[str, Any]], List[ToolManager], List[List[EnvTool]]]:
+    """Start one fresh episode per trajectory and assemble the rollout inputs.
+
+    Requests ``n_trajectories`` adapters from the pool, then for each adapter:
+      - calls reset() and keeps the returned observation,
+      - wraps the adapter via EnvTool.from_env() and a dedicated ToolManager,
+      - builds a trajectory dict holding a system message, the observation as
+        the user message, and the tool schema under 'tools'.
+    The pool is queried once; adapters are processed in the order returned.
+
+    Args:
+        env_pool: Pool that hands out the per-trajectory adapters.
+        n_trajectories: Number of adapters (and trajectories) to request.
+        tool_schema: Tool definitions; forwarded to the pool and attached to
+            every trajectory (the same list object is shared, not copied).
+        system_prompt: Content of each trajectory's system message.
+        action_mapper: Optional callable forwarded to the pool unchanged.
+
+    Returns:
+        Three index-aligned lists: (trajectories, tool_managers, env_tools_list).
+    """
+    trajectories: List[Dict[str, Any]] = []
+    tool_managers: List[ToolManager] = []
+    env_tools_list: List[List[EnvTool]] = []
+
+    pool_adapters = env_pool.get_adapters(
+        n=n_trajectories,
+        tool_schema=tool_schema,
+        action_mapper=action_mapper,
+    )
+
+    for env_adapter in pool_adapters:
+        # A new episode starts here; its first observation seeds the dialogue.
+        first_observation = env_adapter.reset().observation
+
+        # Per-trajectory tools plus a ToolManager built over them.
+        tools_for_env = EnvTool.from_env(env_adapter)
+        manager = ToolManager(tools_for_env)
+
+        opening_messages = [
+            {'role': 'system', 'content': system_prompt},
+            {'role': 'user', 'content': first_observation},
+        ]
+        # 'tools' references tool_schema itself; no per-trajectory copy is made.
+        trajectory = {'messages': opening_messages, 'tools': tool_schema}
+
+        # Append in lockstep so index i refers to the same episode in all lists.
+        trajectories.append(trajectory)
+        tool_managers.append(manager)
+        env_tools_list.append(tools_for_env)
+
+    return trajectories, tool_managers, env_tools_list
+
+
+def extract_rewards(env_tools_list: List[List[EnvTool]]) -> List[float]:
+    """Collect one episode reward per trajectory once the rollout has finished.
+
+    Args:
+        env_tools_list: Per-trajectory EnvTool lists, as built by prepare_trajectories.
+
+    Returns:
+        ``episode_reward`` of each trajectory's first EnvTool, or 0.0 when that
+        trajectory's tool list is empty; order follows ``env_tools_list``.
+    """
+    return [
+        tools[0].episode_reward if tools else 0.0 for tools in env_tools_list
+    ]
+
+
+# ========== Main ==========
+def main():
+    """Train a LoRA adapter with multi-turn GRPO on episodes served by an EnvPool."""
+    # Determine pool size
+    n_trajectories = BATCH_SIZE * NUM_GENERATIONS
+    pool_size = ENV_POOL_SIZE if ENV_POOL_SIZE > 0 else n_trajectories
+
+    # Device groups: model + sampler + (optionally) env
+    device_groups = [
+        DeviceGroup(name='model', ranks=list(range(MODEL_GPUS)), device_type='GPU'),
+        DeviceGroup(name='sampler', ranks=list(range(MODEL_GPUS, NUM_GPUS)), device_type='GPU'),
+    ]
+
+    if ENV_REMOTE:
+        # Add a CPU-only DeviceGroup for env pool (1 CPU process, colocated on same node)
+        device_groups.append(
+            DeviceGroup(name='env', ranks=1, device_type='CPU'),
+        )
+
+    model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS)
+    sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, dp_size=SAMPLER_GPUS)
+    twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS, groups=device_groups, lazy_collect=False)
+
+    lora_config = LoraConfig(
+        target_modules='all-linear',
+        r=LORA_RANK,
+        lora_alpha=LORA_RANK * 2,
+        lora_dropout=0.05,
+    )
+
+    if USE_MEGATRON:
+        from twinkle.model.megatron import MegatronModel
+        model = MegatronModel(
+            model_id=MODEL_ID,
+            device_mesh=model_mesh,
+            remote_group='model',
+            mixed_precision='bf16',
+            variable_seq_lengths=True,
+        )
+    else:
+        model = TransformersModel(
+            model_id=MODEL_ID,
+            device_mesh=model_mesh,
+            remote_group='model',
+        )
+
+    model.add_adapter_to_model(ADAPTER_NAME, lora_config, gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
+    if USE_MEGATRON:
+        model.set_optimizer('default', lr=LEARNING_RATE)
+        model.set_lr_scheduler('default', lr_decay_steps=MAX_STEPS, max_lr=LEARNING_RATE)
+    else:
+        model.set_optimizer('AdamW', lr=LEARNING_RATE)
+        model.set_lr_scheduler('CosineAnnealingLR', T_max=MAX_STEPS, eta_min=0)
+
+    model.set_loss('GRPOLoss', epsilon=0.2)
+    model.set_processor(InputProcessor, padding_free=True)
+    model.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False)
+
+    sampler = vLLMSampler(
+        model_id=MODEL_ID,
+        engine_args={
+            'gpu_memory_utilization': 0.8,
+            'max_model_len': 8192,
+            'max_lora_rank': 32,
+            'enable_lora': True,
+            'enable_tower_connector_lora': True,
+        },
+        device_mesh=sampler_mesh,
+        remote_group='sampler',
+    )
+    sampler.set_template('Qwen3_5Template', model_id=MODEL_ID, enable_thinking=False)
+
+    # ========== EnvPool: environment instances managed by Twinkle ==========
+    env_pool_kwargs = dict(
+        env_cls=ENV_CLS,
+        pool_size=pool_size,
+    )
+    if ENV_REMOTE:
+        # Deploy on dedicated CPU DeviceGroup
+        env_mesh = DeviceMesh.from_sizes(world_size=1, dp_size=1)
+        env_pool_kwargs['remote_group'] = 'env'
+        env_pool_kwargs['device_mesh'] = env_mesh
+    # else: runs locally in driver (zero RPC overhead)
+
+    env_pool = EnvPool(**env_pool_kwargs)
+    logger.info(f'EnvPool created: env_cls={ENV_CLS}, pool_size={pool_size}, '
+                f'remote={ENV_REMOTE}')
+
+    # Local template for MultiTurnRollout bridge computation
+    rollout_template = Qwen3_5Template(MODEL_ID, max_length=8192, enable_thinking=False)
+    rollout_template.truncation_strategy = 'delete'
+
+    ckpt_manager = CheckpointEngineManager(model=model, sampler=sampler)
+
+    # MultiTurnRollout: tool_manager is optional at construction time;
+    # the actual per-trajectory ToolManagers are provided at call time.
+    sampling_params = SamplingParams(
+        max_tokens=MAX_NEW_TOKENS, num_samples=1, logprobs=1,
+        temperature=1.0, top_p=0.95,
+    )
+    rollout = MultiTurnRollout(
+        sampler=sampler,
+        template=rollout_template,
+        sampling_params=sampling_params,
+        max_turns=MAX_TURNS,
+    )
+
+    advantage_fn = GRPOAdvantage()
+    metrics = CompletionRewardMetric()
+
+    optim_step = 0
+    logger.info('Starting multi-turn GRPO training with EnvPool')
+    logger.info(f'ENV_CLS={ENV_CLS}, MAX_TURNS={MAX_TURNS}, NUM_GENERATIONS={NUM_GENERATIONS}')
+    logger.info(get_device_placement())
+
+    while optim_step < MAX_STEPS:
+        metrics.reset()
+
+        # Total trajectories per batch: BATCH_SIZE * NUM_GENERATIONS
+        # Each trajectory is an independent game episode.
+        n_traj = BATCH_SIZE * NUM_GENERATIONS
+
+        # 1. Prepare environments and initial trajectories
+        logger.info(f'[Step {optim_step}] Resetting {n_traj} environments...')
+        expand_prompts, tool_managers, env_tools_list = prepare_trajectories(
+            env_pool=env_pool,
+            n_trajectories=n_traj,
+            tool_schema=TOOL_SCHEMA,
+            system_prompt=SYSTEM_PROMPT,
+            action_mapper=blackjack_action_mapper,
+        )
+
+        # 2. Sync model weights to sampler
+        ckpt_manager.sync_weights(merge_and_sync=False)
+        sampler.reset_prefix_cache()
+
+        # 3. Run multi-turn rollout with per-trajectory ToolManagers
+        all_trajectories: List[Dict[str, Any]] = rollout(
+            expand_prompts,
+            tool_manager=tool_managers,
+        )
+
+        # 4. Extract rewards and logprobs
+        env_rewards = extract_rewards(env_tools_list)
+
+        all_old_logps: List[List[float]] = []
+        all_completion_lengths: List[int] = []
+        n_turns_per_rollout: List[int] = []
+
+        for traj in all_trajectories:
+            logprobs = traj.get('logprobs') or []
+            old_logps = [lp[0][1] for lp in logprobs] if logprobs else []
+            all_old_logps.append(old_logps)
+            # Completion length = number of trainable tokens (labels != -100)
+            labels = traj.get('labels') or []
+            comp_len = sum(1 for l in labels if l != -100)
+            all_completion_lengths.append(comp_len)
+            n_turns_per_rollout.append(int(traj.get('turns') or 0))
+
+        # 5. Compute advantages (group-relative within NUM_GENERATIONS)
+        total_rewards = env_rewards
+        advantages = advantage_fn(
+            total_rewards, num_generations=NUM_GENERATIONS, scale='group',
+        ).tolist()
+
+        # 6. Log metrics
+        metrics.accumulate(
+            completion_lengths=all_completion_lengths,
+            rewards={'total': total_rewards},
+        )
+
+        avg_reward = sum(total_rewards) / len(total_rewards) if total_rewards else 0.0
+        avg_turns = sum(n_turns_per_rollout) / len(n_turns_per_rollout) if n_turns_per_rollout else 0.0
+        logger.info(f'[Step {optim_step}] avg_reward={avg_reward:.3f}, avg_turns={avg_turns:.1f}')
+
+        # 7. Forward-backward with mini-batches
+        # Filter out oversized/truncated trajectories (strategy='delete'),
+        # keep only those with valid completions and ensure >= MODEL_GPUS inputs.
+        all_input_data: List[Dict[str, Any]] = []
+        filtered_old_logps: List[List[float]] = []
+        filtered_advantages: List[float] = []
+        max_len = rollout_template.max_length or float('inf')
+        for i, traj in enumerate(all_trajectories):
+            traj_len = len(traj.get('input_ids') or traj.get('labels') or [])
+            comp_len = all_completion_lengths[i]
+            if traj_len > max_len or comp_len == 0:
+                continue
+            all_input_data.append(traj)
+            filtered_old_logps.append(all_old_logps[i])
+            filtered_advantages.append(advantages[i])
+
+        if len(all_input_data) < MODEL_GPUS:
+            logger.warning(f'[Step {optim_step}] Only {len(all_input_data)} valid trajectories '
+                           f'after filtering (need >= {MODEL_GPUS}), skipping this batch.')
+            # NOTE: optim_step does not advance here; the while-loop retries with new rollouts.
+            continue
+
+        all_old_logps = filtered_old_logps
+        advantages = filtered_advantages
+        total_completions = len(all_input_data)
+        logger.info(f'[Step {optim_step}] {total_completions}/{n_traj} trajectories '
+                    f'passed length filter (max_len={max_len})')
+
+        for mb_start in range(0, total_completions, MINI_BATCH_SIZE):
+            mb_end = min(mb_start + MINI_BATCH_SIZE, total_completions)
+            mb_inputs = all_input_data[mb_start:mb_end]
+            mb_old_logps = all_old_logps[mb_start:mb_end]
+            mb_advantages = advantages[mb_start:mb_end]
+
+            # Log trajectory lengths before forward_backward
+            traj_lengths = [len(t.get('labels') or t.get('input_ids') or []) for t in mb_inputs]
+            logger.info(f'[Step {optim_step}] mini-batch [{mb_start}:{mb_end}] '
+                        f'n_inputs={len(mb_inputs)}, dp_world={MODEL_GPUS}, '
+                        f'traj_lengths={traj_lengths}')
+
+            model.forward_backward(
+                inputs=mb_inputs,
+                old_logps=mb_old_logps,
+                advantages=mb_advantages,
+                micro_batch_size=MICRO_BATCH_SIZE,
+            )
+            model.clip_grad_and_step()
+            optim_step += 1
+
+            # Stop before the periodic save; the final weights are saved after the loop.
+            if optim_step >= MAX_STEPS:
+                break
+            if optim_step % SAVE_STEPS == 0:
+                model.save(f'multi-turn-grpo-checkpoint-{optim_step}')
+
+        # 8. Log step summary
+        log_dict = metrics.calculate()
+        log_dict.update(model.calculate_metric(is_training=True))
+        log_dict['avg_turns'] = avg_turns
+        log_dict['avg_reward'] = avg_reward
+        metrics.reset()
+        logger.info(f'[Step {optim_step}/{MAX_STEPS}] {log_dict}')
+
+    # Cleanup
+    env_pool.close()
+    logger.info(f'Training completed. optim_steps={optim_step}')
+    model.save('multi-turn-grpo-final')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/cookbook/sample/emb_sample.py b/cookbook/sample/emb_sample.py
index 6d7e4c599..da27a8155 100644
--- a/cookbook/sample/emb_sample.py
+++ b/cookbook/sample/emb_sample.py
@@ -12,15 +12,15 @@
     python cookbook/sample/emb_sample.py
     EMB_MODEL=./output/embedding_lora_transformers/step_16000 python cookbook/sample/emb_sample.py
 """
-import os
 import re
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List
 
 import torch
 import torch.nn.functional as F
 
 import twinkle
 from twinkle import DeviceGroup, DeviceMesh, get_device_placement, get_logger
+from twinkle.cli import CLI
 from twinkle.data_format import SamplingParams
 from twinkle.loss import InfonceLoss
 from twinkle.model import TransformersModel
@@ -29,12 +29,13 @@
 from twinkle.template import Template
 
 logger = get_logger()
+args = CLI.from_args()
 
 # -- Config -------------------------------------------------------------------
-CONDENSE_MODEL_ID = os.environ.get('CONDENSE_MODEL_ID', 'ms://twinkle-kit/Qwen3.5-4B-CM-v2')
-EMB_MODEL_ID = os.environ.get('EMB_MODEL', 'ms://twinkle-kit/Qwen3.5-4B-QA-emb')
-SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 1))
-EMB_GPUS = int(os.environ.get('EMB_GPUS', 1))
+CONDENSE_MODEL_ID = args.extra.get('condense_model_id', 'ms://twinkle-kit/Qwen3.5-4B-CM-v2')
+EMB_MODEL_ID = args.extra.get('emb_model_id', 'ms://twinkle-kit/Qwen3.5-4B-QA-emb')
+SAMPLER_GPUS = args.infra.sampler_gpus or 1
+EMB_GPUS = int(args.extra.get('emb_gpus', 1))
 EMB_MAX_LENGTH = 8192
 
 # -- Prompts (aligned with train_embedding_full_ddp.py) -----------------------
diff --git a/cookbook/sample/sample.py b/cookbook/sample/sample.py
index b56460ea1..8cd452b8f 100644
--- a/cookbook/sample/sample.py
+++ b/cookbook/sample/sample.py
@@ -18,19 +18,19 @@
     MODEL_ID=/path/to/model LORA_PATH=/path/to/adapter SAMPLER_GPUS=1 python sample.py
 """
 
-import os
 from typing import List, Dict, Any
 
 import twinkle
 from twinkle import DeviceMesh, DeviceGroup, get_device_placement, get_logger
+from twinkle.cli import CLI
 from twinkle.data_format import SamplingParams
 from twinkle.sampler import vLLMSampler
 
 logger = get_logger()
+args = CLI.from_args()
 
-MODEL_ID = os.environ.get('MODEL_ID', 'Qwen/Qwen3.5-4B')
-LORA_PATH = os.environ.get('LORA_PATH', '/path/to/lora')
-SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 1))
+SAMPLER_GPUS = args.infra.sampler_gpus or 1
+LORA_PATH = args.lora.lora_path or '/path/to/lora'
 
 
 def build_prompts() -> List[Dict[str, Any]]:
@@ -67,7 +67,7 @@ def main():
 
     # ── 2. Create vLLMSampler with LoRA enabled ────────────────────────
     sampler = vLLMSampler(
-        model_id=MODEL_ID,
+        model_id=args.model.model_id,
         engine_args={
             'gpu_memory_utilization': 0.7,
             'max_model_len': 4096,
@@ -79,7 +79,7 @@ def main():
         device_mesh=sampler_mesh,
         remote_group='sampler',
     )
-    sampler.set_template('Qwen3_5Template', model_id=MODEL_ID)
+    sampler.set_template('Qwen3_5Template', model_id=args.model.model_id)
     logger.info(get_device_placement())
 
     # ── 3. Configure sampling parameters ────────────────────────────────
@@ -92,7 +92,7 @@ def main():
 
     # ── 4. Run inference ────────────────────────────────────────────────
     prompts = build_prompts()
-    logger.info(f'Sampling {len(prompts)} prompts with model {MODEL_ID} ...')
+    logger.info(f'Sampling {len(prompts)} prompts with model {args.model.model_id} ...')
 
     responses = sampler.sample(prompts, sampling_params, adapter_path=LORA_PATH)
 
diff --git a/cookbook/transformers/ep_fsdp2_lora_deepseek_v4.py b/cookbook/transformers/ep_fsdp2_lora_deepseek_v4.py
index af72efa10..79daafd38 100644
--- a/cookbook/transformers/ep_fsdp2_lora_deepseek_v4.py
+++ b/cookbook/transformers/ep_fsdp2_lora_deepseek_v4.py
@@ -4,7 +4,6 @@
 Run on 8 GPUs:
     torchrun --nproc-per-node=8 cookbook/transformers/ep_fsdp2_lora_deepseek_v4.py
 """
-import os
 from pathlib import Path
 
 from peft import LoraConfig
@@ -12,45 +11,31 @@
 
 import twinkle
 from twinkle import DeviceMesh, Platform, get_device_placement, get_logger
+from twinkle.cli import CLI
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.model import TransformersModel
 from twinkle.preprocessor import SelfCognitionProcessor
 
 logger = get_logger()
+args = CLI.from_args()
 
-MODEL_ID = os.environ.get('DSV4_MODEL_ID', 'ms://deepseek-ai/DeepSeek-V4-Flash')
-DATASET_ID = os.environ.get('DATASET_ID', 'ms://swift/self-cognition')
-TEMPLATE_ID = os.environ.get('TEMPLATE_ID', 'DeepseekV4Template')
-BATCH_SIZE = int(os.environ.get('BATCH_SIZE', '4'))
-GRAD_ACCUM_STEPS = int(os.environ.get('GRAD_ACCUM_STEPS', '4'))
-LOG_INTERVAL = GRAD_ACCUM_STEPS
-LR = float(os.environ.get('LR', '1e-4'))
-MAX_GRAD_NORM = float(os.environ.get('MAX_GRAD_NORM', '1.0'))
-LORA_R = int(os.environ.get('LORA_R', '8'))
-LORA_ALPHA = int(os.environ.get('LORA_ALPHA', '32'))
-ENABLE_EP = os.environ.get('ENABLE_EP', '1') == '1'
-OUTPUT_DIR = os.environ.get('OUTPUT_DIR', './output_dsv4')
-RESUME_FROM_CHECKPOINT = os.environ.get('RESUME_FROM_CHECKPOINT') or None
-RESUME_ONLY_MODEL = os.environ.get('RESUME_ONLY_MODEL', '0') == '1'
-IGNORE_DATA_SKIP = os.environ.get('IGNORE_DATA_SKIP', '0') == '1'
-ADAPTER_NAME = os.environ.get('ADAPTER_NAME', 'default')
-NUM_GPUS = int(os.environ.get('NUM_GPUS', '8'))
+ENABLE_EP = args.extra.get('enable_ep', True)
 
 device_mesh = DeviceMesh.from_sizes(
-    fsdp_size=NUM_GPUS,
-    dp_size=1,
-    ep_size=NUM_GPUS,
+    fsdp_size=args.infra.fsdp_size,
+    dp_size=args.infra.dp_size,
+    ep_size=args.infra.ep_size,
     device_type=Platform.get_platform().device_prefix(),
 )
-twinkle.initialize(mode='local', global_device_mesh=device_mesh)
+twinkle.initialize(mode=args.infra.mode, global_device_mesh=device_mesh)
 
 
 def _build_lora_config(enable_ep: bool):
     if enable_ep:
         return LoraConfig(
-            r=LORA_R,
-            lora_alpha=LORA_ALPHA,
+            r=args.lora.lora_r,
+            lora_alpha=args.lora.lora_alpha,
             target_modules='all-linear',
             exclude_modules=['o_a_proj'],
             target_parameters=['mlp.experts.gate_up_proj', 'mlp.experts.down_proj'],
@@ -60,8 +45,8 @@ def _build_lora_config(enable_ep: bool):
     # during forward. That is not stable with plain FSDP2, so non-EP mode uses
     # regular module LoRA and does not train expert parameters.
     return LoraConfig(
-        r=LORA_R,
-        lora_alpha=LORA_ALPHA,
+        r=args.lora.lora_r,
+        lora_alpha=args.lora.lora_alpha,
         exclude_modules=['o_a_proj'],
         target_modules='all-linear',
     )
@@ -70,31 +55,34 @@ def _build_lora_config(enable_ep: bool):
 def save_checkpoint(model: TransformersModel, checkpoint_name: str, dataloader: DataLoader):
     return model.save(
         name=checkpoint_name,
-        output_dir=OUTPUT_DIR,
-        adapter_name=ADAPTER_NAME,
-        save_optimizer=True,
+        output_dir=args.training.output_dir,
+        adapter_name=args.lora.adapter_name,
+        save_optimizer=args.checkpoint.save_optimizer,
         consumed_train_samples=dataloader.get_state()['consumed_train_samples'],
     )
 
 
 def train():
-    config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
+    config = AutoConfig.from_pretrained(args.model.model_id, trust_remote_code=True)
     text_config = getattr(config, 'text_config', config)
     if hasattr(text_config, 'use_cache'):
         text_config.use_cache = False
 
-    dataset = Dataset(dataset_meta=DatasetMeta(DATASET_ID))
-    dataset.set_template(TEMPLATE_ID, model_id=MODEL_ID)
-    dataset.map(SelfCognitionProcessor('twinkle', 'ModelScope'))
+    dataset = Dataset(dataset_meta=DatasetMeta(args.dataset.dataset_id))
+    dataset.set_template(args.template.template_cls, model_id=args.model.model_id)
+    dataset.map(SelfCognitionProcessor(
+        args.extra.get('model_name', 'twinkle'),
+        args.extra.get('model_author', 'ModelScope'),
+    ))
     dataset.encode(batched=True)
-    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, device_mesh=device_mesh)
+    dataloader = DataLoader(dataset=dataset, batch_size=args.training.batch_size, device_mesh=device_mesh)
 
     model = TransformersModel(
-        model_id=MODEL_ID,
+        model_id=args.model.model_id,
         config=config,
         device_mesh=device_mesh,
         strategy='native_fsdp',
-        memory_efficient_init=True,
+        memory_efficient_init=args.model.memory_efficient_init,
         fsdp_config={
             'expert_parallel': {
                 'enabled': ENABLE_EP,
@@ -104,38 +92,41 @@ def train():
         },
     )
     lora_cfg = _build_lora_config(ENABLE_EP)
-    model.add_adapter_to_model(ADAPTER_NAME, lora_cfg, gradient_accumulation_steps=GRAD_ACCUM_STEPS)
-    model.set_optimizer('AdamW', lr=LR, foreach=False)
+    model.add_adapter_to_model(args.lora.adapter_name, lora_cfg,
+                               gradient_accumulation_steps=args.training.gradient_accumulation_steps)
+    model.set_optimizer(args.optimizer.optimizer_cls, lr=args.optimizer.learning_rate, foreach=False)
     model.set_lr_scheduler(
-        scheduler_cls='CosineWarmupScheduler',
-        num_warmup_steps=5,
+        scheduler_cls=args.scheduler.scheduler_cls,
+        num_warmup_steps=args.scheduler.num_warmup_steps,
         num_training_steps=len(dataloader),
     )
 
-    if RESUME_FROM_CHECKPOINT:
-        checkpoint_path = Path(RESUME_FROM_CHECKPOINT).expanduser().resolve()
+    if args.training.resume_from_checkpoint:
+        checkpoint_path = Path(args.training.resume_from_checkpoint).expanduser().resolve()
         kwargs = {}
-        if ADAPTER_NAME:
-            kwargs['adapter_name'] = ADAPTER_NAME
+        if args.lora.adapter_name:
+            kwargs['adapter_name'] = args.lora.adapter_name
         progress = model.resume_from_checkpoint(
-            str(checkpoint_path), resume_only_model=RESUME_ONLY_MODEL, **kwargs)
-        if not IGNORE_DATA_SKIP:
+            str(checkpoint_path), resume_only_model=args.training.resume_only_model, **kwargs)
+        if not args.training.ignore_data_skip:
             dataloader.resume_from_checkpoint(progress['consumed_train_samples'])
 
     logger.info(get_device_placement())
     logger.info(model.get_train_configs())
     logger.info(
-        f'Total steps: {len(dataloader)}, batch_size={BATCH_SIZE}, grad_accum={GRAD_ACCUM_STEPS}, '
-        f'enable_ep={ENABLE_EP}, output_dir={OUTPUT_DIR}')
+        f'Total steps: {len(dataloader)}, batch_size={args.training.batch_size}, '
+        f'grad_accum={args.training.gradient_accumulation_steps}, '
+        f'enable_ep={ENABLE_EP}, output_dir={args.training.output_dir}')
 
-    optimizer_group = model.optimizer_group[ADAPTER_NAME]
+    optimizer_group = model.optimizer_group[args.lora.adapter_name]
     for batch in dataloader:
         if callable(batch):
             batch = batch()
         model.forward_backward(inputs=batch)
-        model.clip_grad_and_step(max_grad_norm=MAX_GRAD_NORM, gradient_accumulation_steps=GRAD_ACCUM_STEPS)
+        model.clip_grad_and_step(max_grad_norm=args.optimizer.max_grad_norm,
+                                gradient_accumulation_steps=args.training.gradient_accumulation_steps)
         cur_step = optimizer_group.cur_step
-        if cur_step > 0 and cur_step % LOG_INTERVAL == 0:
+        if cur_step > 0 and cur_step % args.training.log_interval == 0:
             metric = model.calculate_metric(is_training=True)
             if callable(metric):
                 metric = metric()
diff --git a/cookbook/transformers/ep_fsdp2_lora_deepseek_v4.sh b/cookbook/transformers/ep_fsdp2_lora_deepseek_v4.sh
index b4e3d9ffb..f2b01ff6e 100644
--- a/cookbook/transformers/ep_fsdp2_lora_deepseek_v4.sh
+++ b/cookbook/transformers/ep_fsdp2_lora_deepseek_v4.sh
@@ -4,13 +4,18 @@ set -euo pipefail
 # EP + FSDP2 + LoRA training for DeepSeek-V4.
 # ENABLE_EP=1 trains expert LoRA with target_parameters.
 # ENABLE_EP=0 runs plain FSDP2 LoRA and does not train expert parameters.
+# All training config passed as CLI flags. Override at invocation, e.g.:
+#   bash ep_fsdp2_lora_deepseek_v4.sh --batch-size 8 --lr 5e-5
 
-export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
-export NPROC_PER_NODE="${NPROC_PER_NODE:-8}"
-export ENABLE_EP="${ENABLE_EP:-1}"
-export BATCH_SIZE="${BATCH_SIZE:-4}"
-export GRAD_ACCUM_STEPS="${GRAD_ACCUM_STEPS:-4}"
-export OUTPUT_DIR="${OUTPUT_DIR:-./output_dsv4}"
-
-torchrun --nproc-per-node="${NPROC_PER_NODE}" \
-  cookbook/transformers/ep_fsdp2_lora_deepseek_v4.py
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+  torchrun --nproc-per-node=8 \
+  cookbook/transformers/ep_fsdp2_lora_deepseek_v4.py \
+    --model-id ms://deepseek-ai/DeepSeek-V4-Flash \
+    --dataset-id ms://swift/self-cognition \
+    --dp-size 4 \
+    --ep-size 2 \
+    --batch-size 4 \
+    --gradient-accumulation-steps 4 \
+    --output-dir ./output_dsv4 \
+    --enable-ep 1 \
+    "$@"
diff --git a/cookbook/transformers/ep_fsdp2_lora_deepseek_v4_multinode.sh b/cookbook/transformers/ep_fsdp2_lora_deepseek_v4_multinode.sh
index 7344474e5..47a7ebfc4 100644
--- a/cookbook/transformers/ep_fsdp2_lora_deepseek_v4_multinode.sh
+++ b/cookbook/transformers/ep_fsdp2_lora_deepseek_v4_multinode.sh
@@ -1,26 +1,39 @@
+#!/usr/bin/env bash
+set -euo pipefail
 
 # `deepseek-ai/DeepSeek-V4-Flash` uses mixed FP4/FP8 weights.
 # Convert the checkpoint before training by following:
 # https://gitcode.com/cann/cann-recipes-train/blob/master/llm_pretrain/deepseekv4/README.md#%E6%A8%A1%E5%9E%8B%E6%9D%83%E9%87%8D%E5%87%86%E5%A4%87
 # Install `transformers==5.8.0` before running this cookbook.
+# All training config passed as CLI flags. Override at invocation.
 
-export DSV4_MODEL_ID="ms://deepseek-ai/DeepSeek-V4-Flash-bf16"
-export DATASET_ID="ms://swift/self-cognition"
-# The following environment variables are required for multi-node training. Adjust the values according to your cluster setup.
-export GLOO_SOCKET_IFNAME="eth0" # Use ifconfig to check the network interface name
+# Multi-node networking config — adjust to your cluster setup.
+export GLOO_SOCKET_IFNAME="eth0"
 export HCCL_SOCKET_IFNAME="eth0"
 export HCCL_EXEC_TIMEOUT=1200
 export HCCL_CONNECT_TIMEOUT=1200
-export NNODES=4
-export NUM_GPUS=64
-export MASTER_ADDR="node0" # Replace with the IP address or hostname of the master node
-export MASTER_PORT=29500 # Replace with an open port on the master node
 export HCCL_IF_BASE_PORT=20000
 
-torchrun --nnodes=$NNODES --node_rank=$NODE_RANK --nproc_per_node=16 \
-  --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT ep_fsdp2_lora_deepseek_v4.py
+NNODES=4
+MASTER_ADDR=node0
+MASTER_PORT=29500
+NPROC_PER_NODE=16
 
-#  NODE_RANK=0  OUTPUT_DIR=./output sh ep_fsdp2_lora_deepseek_v4_multinode.sh
-#  NODE_RANK=1  OUTPUT_DIR=./output sh ep_fsdp2_lora_deepseek_v4_multinode.sh
-#  NODE_RANK=2  OUTPUT_DIR=./output sh ep_fsdp2_lora_deepseek_v4_multinode.sh
-#  NODE_RANK=3 OUTPUT_DIR=./output sh ep_fsdp2_lora_deepseek_v4_multinode.sh
+torchrun --nnodes="$NNODES" --node_rank="${NODE_RANK:?NODE_RANK must be set}" \
+  --nproc_per_node="$NPROC_PER_NODE" \
+  --master_addr="$MASTER_ADDR" --master_port="$MASTER_PORT" \
+  ep_fsdp2_lora_deepseek_v4.py \
+    --model-id ms://deepseek-ai/DeepSeek-V4-Flash-bf16 \
+    --dataset-id ms://swift/self-cognition \
+    --dp-size 4 \
+    --ep-size 2 \
+    --batch-size 4 \
+    --gradient-accumulation-steps 4 \
+    --output-dir ./output_dsv4_multinode \
+    --enable-ep 1 \
+    "$@"
+
+#  NODE_RANK=0 bash ep_fsdp2_lora_deepseek_v4_multinode.sh
+#  NODE_RANK=1 bash ep_fsdp2_lora_deepseek_v4_multinode.sh
+#  NODE_RANK=2 bash ep_fsdp2_lora_deepseek_v4_multinode.sh
+#  NODE_RANK=3 bash ep_fsdp2_lora_deepseek_v4_multinode.sh
diff --git a/cookbook/transformers/ep_fsdp2_lora_qwen3_5_moe.py b/cookbook/transformers/ep_fsdp2_lora_qwen3_5_moe.py
index a9f90111d..5d5088182 100644
--- a/cookbook/transformers/ep_fsdp2_lora_qwen3_5_moe.py
+++ b/cookbook/transformers/ep_fsdp2_lora_qwen3_5_moe.py
@@ -4,7 +4,6 @@
 Run on 8 GPUs:
     torchrun --nproc-per-node=8 cookbook/transformers/ep_fsdp2_lora_qwen3_5_moe.py
 """
-import os
 from pathlib import Path
 
 from peft import LoraConfig
@@ -12,6 +11,7 @@
 
 import twinkle
 from twinkle import DeviceMesh, Platform, get_device_placement, get_logger
+from twinkle.cli import CLI
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.model import TransformersModel
@@ -20,38 +20,24 @@
 from twinkle.kernel import kernelize_model
 
 logger = get_logger()
+args = CLI.from_args()
 
-MODEL_ID = os.environ.get('QWEN3_MODEL_ID', 'ms://Qwen/Qwen3.6-35B-A3B')
-DATASET_ID = os.environ.get('DATASET_ID', 'ms://swift/self-cognition')
-TEMPLATE_ID = os.environ.get('TEMPLATE_ID', 'Qwen3_5Template')
-BATCH_SIZE = int(os.environ.get('BATCH_SIZE', '4'))
-GRAD_ACCUM_STEPS = int(os.environ.get('GRAD_ACCUM_STEPS', '4'))
-LOG_INTERVAL = GRAD_ACCUM_STEPS
-LR = float(os.environ.get('LR', '1e-4'))
-MAX_GRAD_NORM = float(os.environ.get('MAX_GRAD_NORM', '1.0'))
-LORA_R = int(os.environ.get('LORA_R', '8'))
-LORA_ALPHA = int(os.environ.get('LORA_ALPHA', '32'))
-ENABLE_EP = os.environ.get('ENABLE_EP', '1') == '1'
-OUTPUT_DIR = os.environ.get('OUTPUT_DIR', './output')
-RESUME_FROM_CHECKPOINT = os.environ.get('RESUME_FROM_CHECKPOINT') or None
-RESUME_ONLY_MODEL = os.environ.get('RESUME_ONLY_MODEL', '0') == '1'
-IGNORE_DATA_SKIP = os.environ.get('IGNORE_DATA_SKIP', '0') == '1'
-ADAPTER_NAME = os.environ.get('ADAPTER_NAME', 'default')
+ENABLE_EP = args.extra.get('enable_ep', True)
 
 device_mesh = DeviceMesh.from_sizes(
-    fsdp_size=8,
-    dp_size=1,
-    ep_size=8,
+    fsdp_size=args.infra.fsdp_size,
+    dp_size=args.infra.dp_size,
+    ep_size=args.infra.ep_size,
     device_type=Platform.get_platform().device_prefix(),
 )
-twinkle.initialize(mode='local', global_device_mesh=device_mesh)
+twinkle.initialize(mode=args.infra.mode, global_device_mesh=device_mesh)
 
 
 def _build_lora_config(enable_ep: bool):
     if enable_ep:
         return LoraConfig(
-            r=LORA_R,
-            lora_alpha=LORA_ALPHA,
+            r=args.lora.lora_r,
+            lora_alpha=args.lora.lora_alpha,
             target_modules='all-linear',
             target_parameters=['mlp.experts.gate_up_proj', 'mlp.experts.down_proj'],
         )
@@ -60,8 +46,8 @@ def _build_lora_config(enable_ep: bool):
     # during forward. That is not stable with plain FSDP2, so non-EP mode uses
     # regular module LoRA and does not train expert parameters.
     return LoraConfig(
-        r=LORA_R,
-        lora_alpha=LORA_ALPHA,
+        r=args.lora.lora_r,
+        lora_alpha=args.lora.lora_alpha,
         target_modules='all-linear',
     )
 
@@ -69,30 +55,33 @@ def _build_lora_config(enable_ep: bool):
 def save_checkpoint(model: TransformersModel, checkpoint_name: str, dataloader: DataLoader):
     return model.save(
         name=checkpoint_name,
-        output_dir=OUTPUT_DIR,
-        adapter_name=ADAPTER_NAME,
-        save_optimizer=True,
+        output_dir=args.training.output_dir,
+        adapter_name=args.lora.adapter_name,
+        save_optimizer=args.checkpoint.save_optimizer,
         consumed_train_samples=dataloader.get_state()['consumed_train_samples'],
     )
 
 
 def train():
-    config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
+    config = AutoConfig.from_pretrained(args.model.model_id, trust_remote_code=True)
     text_config = getattr(config, 'text_config', config)
     if hasattr(text_config, 'use_cache'):
         text_config.use_cache = False
 
-    dataset = Dataset(dataset_meta=DatasetMeta(DATASET_ID))
+    dataset = Dataset(dataset_meta=DatasetMeta(args.dataset.dataset_id))
     try:
-        dataset.set_template(TEMPLATE_ID, model_id=MODEL_ID)
+        dataset.set_template(args.template.template_cls, model_id=args.model.model_id)
     except ValueError:
-        dataset.set_template('Qwen3_5Template', model_id=MODEL_ID)
-    dataset.map(SelfCognitionProcessor('twinkle', 'ModelScope'))
+        dataset.set_template('Qwen3_5Template', model_id=args.model.model_id)
+    dataset.map(SelfCognitionProcessor(
+        args.extra.get('model_name', 'twinkle'),
+        args.extra.get('model_author', 'ModelScope'),
+    ))
     dataset.encode(batched=True)
-    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, device_mesh=device_mesh)
+    dataloader = DataLoader(dataset=dataset, batch_size=args.training.batch_size, device_mesh=device_mesh)
 
     model = TransformersModel(
-        model_id=MODEL_ID,
+        model_id=args.model.model_id,
         config=config,
         device_mesh=device_mesh,
         strategy='native_fsdp',
@@ -108,38 +97,41 @@ def train():
     if Torch.is_npu_available():
         model = kernelize_model(model, mode='train', device='npu')
     lora_cfg = _build_lora_config(ENABLE_EP)
-    model.add_adapter_to_model(ADAPTER_NAME, lora_cfg, gradient_accumulation_steps=GRAD_ACCUM_STEPS)
-    model.set_optimizer('AdamW', lr=LR, foreach=False)
+    model.add_adapter_to_model(args.lora.adapter_name, lora_cfg,
+                               gradient_accumulation_steps=args.training.gradient_accumulation_steps)
+    model.set_optimizer(args.optimizer.optimizer_cls, lr=args.optimizer.learning_rate, foreach=False)
     model.set_lr_scheduler(
-        scheduler_cls='CosineWarmupScheduler',
-        num_warmup_steps=5,
+        scheduler_cls=args.scheduler.scheduler_cls,
+        num_warmup_steps=args.scheduler.num_warmup_steps,
         num_training_steps=len(dataloader),
     )
 
-    if RESUME_FROM_CHECKPOINT:
-        checkpoint_path = Path(RESUME_FROM_CHECKPOINT).expanduser().resolve()
+    if args.training.resume_from_checkpoint:
+        checkpoint_path = Path(args.training.resume_from_checkpoint).expanduser().resolve()
         kwargs = {}
-        if ADAPTER_NAME:
-            kwargs['adapter_name'] = ADAPTER_NAME
+        if args.lora.adapter_name:
+            kwargs['adapter_name'] = args.lora.adapter_name
         progress = model.resume_from_checkpoint(
-            str(checkpoint_path), resume_only_model=RESUME_ONLY_MODEL, **kwargs)
-        if not IGNORE_DATA_SKIP:
+            str(checkpoint_path), resume_only_model=args.training.resume_only_model, **kwargs)
+        if not args.training.ignore_data_skip:
             dataloader.resume_from_checkpoint(progress['consumed_train_samples'])
 
     logger.info(get_device_placement())
     logger.info(model.get_train_configs())
     logger.info(
-        f'Total steps: {len(dataloader)}, batch_size={BATCH_SIZE}, grad_accum={GRAD_ACCUM_STEPS}, '
-        f'enable_ep={ENABLE_EP}, output_dir={OUTPUT_DIR}')
+        f'Total steps: {len(dataloader)}, batch_size={args.training.batch_size}, '
+        f'grad_accum={args.training.gradient_accumulation_steps}, '
+        f'enable_ep={ENABLE_EP}, output_dir={args.training.output_dir}')
 
-    optimizer_group = model.optimizer_group[ADAPTER_NAME]
+    optimizer_group = model.optimizer_group[args.lora.adapter_name]
     for batch in dataloader:
         if callable(batch):
             batch = batch()
         model.forward_backward(inputs=batch)
-        model.clip_grad_and_step(max_grad_norm=MAX_GRAD_NORM, gradient_accumulation_steps=GRAD_ACCUM_STEPS)
+        model.clip_grad_and_step(max_grad_norm=args.optimizer.max_grad_norm,
+                                gradient_accumulation_steps=args.training.gradient_accumulation_steps)
         cur_step = optimizer_group.cur_step
-        if cur_step > 0 and cur_step % LOG_INTERVAL == 0:
+        if cur_step > 0 and cur_step % args.training.log_interval == 0:
             metric = model.calculate_metric(is_training=True)
             if callable(metric):
                 metric = metric()
diff --git a/cookbook/transformers/ep_fsdp2_lora_qwen3_5_moe.sh b/cookbook/transformers/ep_fsdp2_lora_qwen3_5_moe.sh
index 8f1813e4f..5132d9d0b 100644
--- a/cookbook/transformers/ep_fsdp2_lora_qwen3_5_moe.sh
+++ b/cookbook/transformers/ep_fsdp2_lora_qwen3_5_moe.sh
@@ -2,15 +2,19 @@
 set -euo pipefail
 
 # EP + FSDP2 + LoRA training for Qwen3.5-MoE.
-# ENABLE_EP=1 trains expert LoRA with target_parameters.
-# ENABLE_EP=0 runs plain FSDP2 LoRA and does not train expert parameters.
+# All training config passed as CLI flags. Override at invocation, e.g.:
+#   bash ep_fsdp2_lora_qwen3_5_moe.sh --batch-size 8 --lr 5e-5
 
-export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
-export NPROC_PER_NODE="${NPROC_PER_NODE:-8}"
-export ENABLE_EP="${ENABLE_EP:-1}"
-export BATCH_SIZE="${BATCH_SIZE:-4}"
-export GRAD_ACCUM_STEPS="${GRAD_ACCUM_STEPS:-4}"
-export OUTPUT_DIR="${OUTPUT_DIR:-./output_qwen3_5_moe}"
-
-torchrun --nproc-per-node="${NPROC_PER_NODE}" \
-  cookbook/transformers/ep_fsdp2_lora_qwen3_5_moe.py
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+  torchrun --nproc-per-node=8 \
+  cookbook/transformers/ep_fsdp2_lora_qwen3_5_moe.py \
+    --model-id ms://Qwen/Qwen3.5-30B-A3B \
+    --dataset-id ms://swift/self-cognition \
+    --template-cls Qwen3_5Template \
+    --dp-size 4 \
+    --ep-size 2 \
+    --batch-size 4 \
+    --gradient-accumulation-steps 4 \
+    --output-dir ./output_qwen3_5_moe \
+    --enable-ep 1 \
+    "$@"
diff --git a/cookbook/transformers/fsdp2.py b/cookbook/transformers/fsdp2.py
index ad4c917f9..a3b4da645 100644
--- a/cookbook/transformers/fsdp2.py
+++ b/cookbook/transformers/fsdp2.py
@@ -2,6 +2,7 @@
 
 from peft import LoraConfig
 from tqdm import tqdm
+from torch.optim import Muon  # PyTorch 2.9+; matrix-orthogonalized momentum optimizer.
 
 import twinkle
 from twinkle import DeviceMesh, get_device_placement, get_logger
@@ -51,7 +52,7 @@ def evaluate(model):
 
 
 def train():
-    train_samples = int(args.extra.get('train_samples', 1000))
+    train_samples = args.training.train_samples or 1000
     dataset = build_dataset(train_samples)
     dataloader = DataLoader(dataset=dataset, batch_size=args.training.batch_size)
     model = TransformersModel(model_id=args.model.model_id)
@@ -64,7 +65,16 @@ def train():
     model.add_adapter_to_model(
         args.lora.adapter_name, lora_config,
         gradient_accumulation_steps=args.training.gradient_accumulation_steps)
-    model.set_optimizer(optimizer_cls=args.optimizer.optimizer_cls, lr=args.optimizer.learning_rate)
+    # Muon optimizes 2D hidden-layer weight matrices via Newton-Schulz orthogonalization.
+    # In LoRA training the trainable params are exclusively lora_A / lora_B (both 2D),
+    # so Muon applies cleanly without an AdamW fallback for 1D params.
+    # ``adjust_lr_fn='match_rms_adamw'`` rescales the orthogonalized update so the same
+    # lr / weight_decay tuned for AdamW can be reused directly (Moonshot Muon recipe).
+    model.set_optimizer(
+        optimizer_cls=Muon,
+        lr=args.optimizer.learning_rate,
+        adjust_lr_fn='match_rms_adamw',
+    )
 
     # Add LRScheduler for lora `default`
     model.set_lr_scheduler(
diff --git a/cookbook/transformers/fsdp2.sh b/cookbook/transformers/fsdp2.sh
index bbe269629..c372fbf2e 100644
--- a/cookbook/transformers/fsdp2.sh
+++ b/cookbook/transformers/fsdp2.sh
@@ -2,7 +2,7 @@
 # All training config passed as CLI flags. Override at invocation, e.g.:
 #   bash fsdp2.sh --batch-size 16 --lr 5e-5
 
-CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7} \
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
   torchrun --nproc_per_node=8 fsdp2.py \
     --model-id ms://Qwen/Qwen3.5-4B \
     --dataset-id ms://swift/self-cognition \
diff --git a/cookbook/transformers/sp_fsdp_dense.py b/cookbook/transformers/sp_fsdp_dense.py
index a6fd0bdcb..56f22c801 100644
--- a/cookbook/transformers/sp_fsdp_dense.py
+++ b/cookbook/transformers/sp_fsdp_dense.py
@@ -1,9 +1,9 @@
-import numpy as np
 from functools import partial
 from peft import LoraConfig
 
 import twinkle
-from twinkle import DeviceGroup, DeviceMesh, Platform, get_logger
+from twinkle import DeviceMesh, get_logger
+from twinkle.cli import CLI
 from twinkle.dataloader import DataLoader
 from twinkle.dataset import Dataset, DatasetMeta
 from twinkle.model import TransformersModel
@@ -12,48 +12,44 @@
 from twinkle.kernel import kernelize_model
 
 logger = get_logger()
-MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
-DATASETS = 'ms://swift/self-cognition'
-
-device_group = [DeviceGroup(
-    name='default',
-    ranks=[0, 1, 2, 3],
-    device_type=Platform.get_platform().device_prefix(),
-)]
+args = CLI.from_args()
 
 # FSDP + sequence-parallel validation over 4 GPUs: dp=2, fsdp=2.
 # In Transformers route, ulysses_size is the total sequence-parallel degree.
-device_mesh = DeviceMesh(
-    device_type=Platform.get_platform().device_prefix(),
-    mesh=np.arange(4).reshape(2, 2),
-    mesh_dim_names=('dp', 'fsdp'),
-    ulysses_size=2,
+device_mesh = DeviceMesh.from_sizes(
+    dp_size=args.infra.dp_size,
+    fsdp_size=args.infra.fsdp_size,
+    ulysses_size=args.infra.ulysses_size,
 )
 
 twinkle.initialize(
-    mode='local',
-    nproc_per_node=4,
+    mode=args.infra.mode,
     global_device_mesh=device_mesh,
-    lazy_collect=False,
+    lazy_collect=args.infra.lazy_collect,
 )
 
 
 def eval(model):
+    eval_samples = args.training.eval_samples or 100
     dataloader = DataLoader(
-        dataset=partial(create_dataset, data_slice=range(100)),
-        batch_size=4,
+        dataset=partial(create_dataset, data_slice=range(eval_samples)),
+        batch_size=args.training.batch_size,
         device_mesh=device_mesh,
     )
     for _, batch in enumerate(dataloader):
-        model.forward_only(inputs=batch, adapter_name='default')
-        model.calculate_loss(adapter_name='default')
-    return model.calculate_metric(is_training=False, adapter_name='default')
+        model.forward_only(inputs=batch, adapter_name=args.lora.adapter_name)
+        model.calculate_loss(adapter_name=args.lora.adapter_name)
+    return model.calculate_metric(is_training=False, adapter_name=args.lora.adapter_name)
 
 
 def create_dataset(data_slice=None):
-    dataset = Dataset(dataset_meta=DatasetMeta(DATASETS, data_slice=range(500)))
-    dataset.set_template('Qwen3_5Template', model_id=MODEL_ID)
-    dataset.map(SelfCognitionProcessor('twinkle模型', 'twinkle团队'))
+    train_samples = args.training.train_samples or 500
+    dataset = Dataset(dataset_meta=DatasetMeta(args.dataset.dataset_id, data_slice=data_slice or range(train_samples)))
+    dataset.set_template(args.template.template_cls, model_id=args.model.model_id)
+    dataset.map(SelfCognitionProcessor(
+        args.extra.get('model_name', 'twinkle模型'),
+        args.extra.get('model_author', 'twinkle团队'),
+    ))
     dataset.encode(batched=True)
     return dataset
 
@@ -61,36 +57,38 @@ def create_dataset(data_slice=None):
 def train():
     dataloader = DataLoader(
         dataset=partial(create_dataset, data_slice=None),
-        batch_size=8,
+        batch_size=args.training.batch_size,
         device_mesh=device_mesh,
     )
 
     model = TransformersModel(
-        model_id=MODEL_ID,
+        model_id=args.model.model_id,
         device_mesh=device_mesh,
-        strategy='native_fsdp',
+        strategy=args.model.strategy,
     )
     # npu patch
     if Torch.is_npu_available():
         model = kernelize_model(model, mode='train', device='npu')
-    lora_config = LoraConfig(target_modules='all-linear')
-    model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=1)
-    model.set_optimizer('AdamW', lr=1e-4, adapter_name='default')
+    lora_config = LoraConfig(**args.get_lora_args())
+    model.add_adapter_to_model(args.lora.adapter_name, lora_config,
+                               gradient_accumulation_steps=args.training.gradient_accumulation_steps)
+    model.set_optimizer(args.optimizer.optimizer_cls, lr=args.optimizer.learning_rate,
+                        adapter_name=args.lora.adapter_name)
     model.set_lr_scheduler(
-        scheduler_cls='CosineWarmupScheduler',
-        num_warmup_steps=5,
+        scheduler_cls=args.scheduler.scheduler_cls,
+        num_warmup_steps=args.scheduler.num_warmup_steps,
         num_training_steps=len(dataloader),
-        adapter_name='default',
+        adapter_name=args.lora.adapter_name,
     )
 
-    logger.info(model.get_train_configs(adapter_name='default'))
+    logger.info(model.get_train_configs(adapter_name=args.lora.adapter_name))
     logger.info(f'Total steps: {len(dataloader)}')
 
     for step, batch in enumerate(dataloader):
-        model.forward_backward(inputs=batch, adapter_name='default')
-        model.clip_grad_and_step(adapter_name='default')
-        if step % 20 == 0:
-            metric = model.calculate_metric(is_training=True, adapter_name='default')
+        model.forward_backward(inputs=batch, adapter_name=args.lora.adapter_name)
+        model.clip_grad_and_step(adapter_name=args.lora.adapter_name)
+        if step % args.training.log_interval == 0:
+            metric = model.calculate_metric(is_training=True, adapter_name=args.lora.adapter_name)
             logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}')
     model.save('last-checkpoint', interval=1)
 
diff --git a/cookbook/transformers/sp_fsdp_dense.sh b/cookbook/transformers/sp_fsdp_dense.sh
index 2a8bcf08b..841561fe4 100644
--- a/cookbook/transformers/sp_fsdp_dense.sh
+++ b/cookbook/transformers/sp_fsdp_dense.sh
@@ -1,11 +1,24 @@
-#!/bin/bash
-# To enable Transformers sequence parallelism, please set ulysses_size > 1.
-# ulysses_size is interpreted as the total sequence-parallel degree.
-# device_mesh = DeviceMesh(
-#     device_type="cuda",
-#     mesh=np.arange(4).reshape(2, 2),
-#     mesh_dim_names=("dp", "fsdp"),
-#     ulysses_size=2,
-# )
-#
-CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 sp_fsdp_dense.py
+#!/usr/bin/env bash
+set -euo pipefail
+
+# FSDP + Sequence Parallelism training.
+# To enable Transformers sequence parallelism, set ulysses-size > 1.
+# All training config passed as CLI flags. Override at invocation, e.g.:
+#   bash sp_fsdp_dense.sh --model-id ms://Qwen/Qwen3.5-4B --ulysses-size 4
+
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+  torchrun --nproc_per_node=4 sp_fsdp_dense.py \
+    --model-id ms://Qwen/Qwen3.5-4B \
+    --dataset-id ms://swift/self-cognition \
+    --template-cls Qwen3_5Template \
+    --dp-size 2 \
+    --fsdp-size 2 \
+    --ulysses-size 2 \
+    --batch-size 4 \
+    --lr 1e-4 \
+    --gradient-accumulation-steps 2 \
+    --train-samples 500 \
+    --log-interval 10 \
+    --model-name twinkle模型 \
+    --model-author twinkle团队 \
+    "$@"
diff --git a/docs/source_en/Components/Agentic/Envs.md b/docs/source_en/Components/Agentic/Envs.md
new file mode 100644
index 000000000..3e2e90b3a
--- /dev/null
+++ b/docs/source_en/Components/Agentic/Envs.md
@@ -0,0 +1,183 @@
+# Environments (Envs)
+
+The Envs module provides an RL execution environment abstraction for agentic training. Environments can participate in multi-turn rollouts interactively or evaluate completed trajectories in batch.
+
+## Env Base Class
+
+```python
+from twinkle_agentic.envs.base import Env, StepResult
+
+class Env(ABC):
+
+    def reset(self, trajectory=None) -> StepResult:
+        """Reset for a new episode."""
+
+    @abstractmethod
+    def step(self, tool_name: str, arguments: dict) -> StepResult:
+        """Execute a single action, return observation + reward + done."""
+
+    def tools(self) -> List[ToolInfo]:
+        """Return tool definitions available in this environment."""
+
+    def evaluate(self, trajectories, **kwargs) -> List[float]:
+        """Batch-evaluate completed trajectories, return rewards."""
+
+    def close(self) -> None:
+        """Release resources."""
+```
+
+### StepResult
+
+```python
+@dataclass
+class StepResult:
+    observation: str = ''    # Environment observation after the action
+    reward: float = 0.0      # Scalar reward for this step
+    done: bool = False        # Whether the episode is terminated
+    info: Dict[str, Any] = field(default_factory=dict)  # Extra metadata
+```
+
+### Two Usage Modes
+
+1. **Interactive mode** (multi-turn rollout) — step-by-step execution:
+
+```python
+env = MyEnv()
+env.reset(trajectory)
+result = env.step('search', {'query': 'Python'})
+# ... repeat until result.done
+```
+
+2. **Batch evaluation mode** — evaluate completed trajectories:
+
+```python
+rewards = env.evaluate(completed_trajectories)
+```
+
+## EnvTool
+
+`EnvTool` wraps an `Env` as a `Tool`, bridging the environment with `ToolManager` and `MultiTurnRollout`.
+
+```python
+from twinkle_agentic.envs.env_tool import EnvTool
+from twinkle_agentic.tools.tool_manager import ToolManager
+
+env = MyEnv()
+
+# Create one EnvTool per tool defined in the environment
+env_tools = EnvTool.from_env(env)
+
+# Register into ToolManager
+manager = ToolManager(env_tools)
+```
+
+### Key Features
+
+| Feature | Description |
+|---------|-------------|
+| `from_env(env)` | Factory: creates one `EnvTool` per tool in `env.tools()`. |
+| `last_result` | Stores the most recent `StepResult` for inspection. |
+| `done` | Property: whether the last step terminated the episode. |
+| `episode_reward` | Property: cumulative reward from `info['episode_reward']`. |
+
+### Manual Construction
+
+```python
+env_tool = EnvTool(
+    env=my_env,
+    tool_name='execute_code',
+    description='Execute Python code in a sandbox.',
+    parameters={
+        'type': 'object',
+        'properties': {
+            'code': {'type': 'string', 'description': 'Python code to execute.'},
+        },
+        'required': ['code'],
+    },
+)
+```
+
+## OpenEnv
+
+`OpenEnv` adapts an [OpenEnv](https://github.com/meta-pytorch/OpenEnv) WebSocket-based environment server as a synchronous Twinkle `Env`.
+
+```python
+from twinkle_agentic.envs.openenv import OpenEnv
+
+env = OpenEnv(
+    base_url='http://localhost:8000',
+    env_cls='coding_env.CodingEnv',      # Optional typed client
+    env_kwargs={'message_timeout_s': 30},
+    tool_schema=[...],                    # Optional tool definitions
+)
+```
+
+### Parameters
+
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `base_url` | `str` | URL of the running OpenEnv server. |
+| `env_cls` | `str` or class | Dotted import path or class for a typed client. `None` uses `GenericEnvClient`. |
+| `env_kwargs` | `Dict` | Extra kwargs for the client constructor. |
+| `tool_schema` | `List[ToolInfo]` | Tool definitions exposed via `tools()`. |
+| `action_mapper` | `Callable` | Custom function to map `(tool_name, args)` to the action dict sent to the server. |
+
+### Usage with Rollout
+
+```python
+from twinkle_agentic.envs.openenv import OpenEnv
+from twinkle_agentic.envs.env_tool import EnvTool
+from twinkle_agentic.tools.tool_manager import ToolManager
+from twinkle_agentic.rollout.api_multi_turn import APIMultiTurnRollout
+
+# Set up environment
+env = OpenEnv(base_url='http://localhost:8000', tool_schema=[...])
+env.reset()
+
+# Bridge to ToolManager
+env_tools = EnvTool.from_env(env)
+manager = ToolManager(env_tools)
+
+# Use in rollout
+rollout = APIMultiTurnRollout(api=api, tool_manager=manager, max_turns=10)
+results = rollout(trajectories)
+```
+
+### Implementing a Custom Environment
+
+```python
+from twinkle_agentic.envs.base import Env, StepResult
+
+class CodeExecutionEnv(Env):
+
+    def reset(self, trajectory=None):
+        self._sandbox = create_sandbox()
+        return StepResult(observation='Sandbox ready.')
+
+    def step(self, tool_name, arguments):
+        code = arguments.get('code', '')
+        output = self._sandbox.run(code)
+        return StepResult(
+            observation=output,
+            reward=1.0 if 'error' not in output.lower() else 0.0,
+            done=False,
+        )
+
+    def tools(self):
+        return [{
+            'type': 'function',
+            'function': {
+                'name': 'execute_code',
+                'description': 'Run Python code.',
+                'parameters': {
+                    'type': 'object',
+                    'properties': {
+                        'code': {'type': 'string'},
+                    },
+                },
+            },
+        }]
+
+    def close(self):
+        self._sandbox.cleanup()
+```
diff --git a/docs/source_en/Components/Agentic/Multi-Turn-Tool-Usage.md b/docs/source_en/Components/Agentic/Multi-Turn-Tool-Usage.md
new file mode 100644
index 000000000..06a962a33
--- /dev/null
+++ b/docs/source_en/Components/Agentic/Multi-Turn-Tool-Usage.md
@@ -0,0 +1,205 @@
+# Multi-Turn Tool Usage Guide
+
+This guide shows how to set up and run multi-turn agentic rollouts with tool use in Twinkle.
+
+## Architecture Overview
+
+The agentic rollout pipeline consists of four key components:
+
+- **Tool** — implements a specific capability (search, code execution, etc.)
+- **ToolManager** — registers tools and dispatches LLM tool calls
+- **Env** (optional) — RL environment that exposes tools via `EnvTool`
+- **Rollout** — drives the multi-turn conversation loop
+
+## Quick Start: API-based Rollout
+
+The simplest way to run a multi-turn tool-use rollout is through an OpenAI-compatible API:
+
+```python
+from twinkle_agentic.protocol.openai import OpenAI
+from twinkle_agentic.tools.base import Tool
+from twinkle_agentic.tools.tool_manager import ToolManager
+from twinkle_agentic.rollout.api_multi_turn import APIMultiTurnRollout
+from twinkle.data_format.sampling import SamplingParams
+
+# 1. Define tools
+class WeatherTool(Tool):
+    def __call__(self, tool_name, arguments):
+        city = arguments.get('city', 'unknown')
+        return f'The weather in {city} is sunny, 25°C.'
+
+    def tool_info(self):
+        return {
+            'type': 'function',
+            'function': {
+                'name': 'get_weather',
+                'description': 'Get the current weather for a city.',
+                'parameters': {
+                    'type': 'object',
+                    'properties': {
+                        'city': {'type': 'string', 'description': 'City name.'},
+                    },
+                    'required': ['city'],
+                },
+            },
+        }
+
+# 2. Set up ToolManager
+manager = ToolManager([WeatherTool()])
+
+# 3. Create API client
+api = OpenAI(model='qwen3.5-32b', base_url='http://localhost:8000/v1')
+
+# 4. Create rollout
+rollout = APIMultiTurnRollout(
+    api=api,
+    tool_manager=manager,
+    sampling_params=SamplingParams(temperature=0.7, max_tokens=2048),
+    max_turns=6,
+    concurrency=8,
+)
+
+# 5. Prepare trajectories
+trajectories = [
+    {
+        'messages': [
+            {'role': 'user', 'content': "What's the weather like in Beijing?"},
+        ],
+    },
+]
+
+# 6. Run rollout
+results = rollout(trajectories)
+for r in results:
+    print(f"Turns: {r['turns']}, Stop: {r['stop_reason']}")
+    for msg in r['messages']:
+        print(f"  [{msg['role']}] {msg.get('content', '')[:100]}")
+```
+
+## Training Integration: vLLM-based Rollout
+
+For RLHF training, use `MultiTurnRollout`, which produces `input_ids` and `labels`:
+
+```python
+from twinkle_agentic.rollout.multi_turn import MultiTurnRollout
+from twinkle.data_format.sampling import SamplingParams
+
+rollout = MultiTurnRollout(
+    sampler=vllm_sampler,           # vLLMSampler instance
+    template=template,               # Chat template
+    tool_manager=manager,
+    sampling_params=SamplingParams(temperature=0.7, max_tokens=4096),
+    max_turns=6,
+    max_trajectory_tokens=8192,
+    trace_dir='rollout_traces/',
+)
+
+# In GRPO training loop
+results = rollout(batch_trajectories)
+# results contain input_ids, labels, logprobs for training
+```
+
+## Using Environments as Tools
+
+Bridge an RL environment into the tool pipeline:
+
+```python
+from twinkle_agentic.envs.base import Env, StepResult
+from twinkle_agentic.envs.env_tool import EnvTool
+from twinkle_agentic.tools.tool_manager import ToolManager
+
+# Define environment
+class CodeEnv(Env):
+    def step(self, tool_name, arguments):
+        code = arguments.get('code', '')
+        # Execute code in sandbox
+        result = execute_in_sandbox(code)
+        return StepResult(observation=result, reward=1.0, done=False)
+
+    def tools(self):
+        return [{
+            'type': 'function',
+            'function': {
+                'name': 'run_python',
+                'description': 'Execute Python code.',
+                'parameters': {
+                    'type': 'object',
+                    'properties': {
+                        'code': {'type': 'string'},
+                    },
+                    'required': ['code'],
+                },
+            },
+        }]
+
+# Bridge Env -> Tool -> ToolManager
+env = CodeEnv()
+env_tools = EnvTool.from_env(env)
+manager = ToolManager(env_tools)
+
+# Use manager in rollout as usual
+rollout = APIMultiTurnRollout(api=api, tool_manager=manager, max_turns=10)
+```
+
+## Using OpenEnv Environments
+
+Connect to a remote OpenEnv WebSocket server:
+
+```python
+from twinkle_agentic.envs.openenv import OpenEnv
+from twinkle_agentic.envs.env_tool import EnvTool
+
+env = OpenEnv(
+    base_url='http://localhost:8000',
+    env_cls='coding_env.CodingEnv',
+    tool_schema=[{
+        'type': 'function',
+        'function': {
+            'name': 'submit',
+            'description': 'Submit code solution.',
+            'parameters': {
+                'type': 'object',
+                'properties': {
+                    'code': {'type': 'string'},
+                },
+            },
+        },
+    }],
+)
+
+env.reset()
+env_tools = EnvTool.from_env(env)
+manager = ToolManager(env_tools)
+```
+
+## Per-Trajectory Tool Managers
+
+For scenarios where each trajectory needs its own tool set (e.g., trajectory-bound state):
+
+```python
+# Create per-trajectory managers
+managers = []
+for traj in trajectories:
+    env = create_env_for(traj)
+    env_tools = EnvTool.from_env(env)
+    managers.append(ToolManager(env_tools))
+
+# Pass as a list (aligned 1:1 with trajectories)
+results = rollout(trajectories, tool_manager=managers)
+```
+
+## Trace Debugging
+
+Both rollout implementations support trace dumps for debugging:
+
+```python
+rollout = APIMultiTurnRollout(
+    api=api,
+    tool_manager=manager,
+    trace_dir='traces/',
+    trace_callback=lambda t: t['turns'] > 1,    # Only store multi-turn
+    success_callback=lambda t: t.get('stop_reason') == 'stop',
+)
+```
+
+Trace files are saved as `{step}-{ok|fail}-{id}.json` with the full conversation and metadata.
diff --git a/docs/source_en/Components/Agentic/Preprocessor.md b/docs/source_en/Components/Agentic/Preprocessor.md
new file mode 100644
index 000000000..b646739c0
--- /dev/null
+++ b/docs/source_en/Components/Agentic/Preprocessor.md
@@ -0,0 +1,189 @@
+# Agentic Preprocessor
+
+The agentic preprocessor module provides a pipeline-based data quality filtering framework for multi-turn conversation datasets. It is designed for cleaning and filtering training data before RLHF / agentic fine-tuning.
+
+## QualityPreprocessor
+
+`QualityPreprocessor` is a thin pipeline runner that accepts a list of filter callables and runs them in sequence. Each step receives a list of rows, returns `(kept, dropped)`, and the pipeline logs per-step statistics.
+
+```python
+from twinkle_agentic.preprocessor import QualityPreprocessor, HardFilter, DeadLoopFilter
+
+pipeline = [
+    HardFilter(min_user_chars=10),
+    DeadLoopFilter(),
+]
+preprocessor = QualityPreprocessor(pipeline, dropped_log_path='dropped.jsonl')
+
+# rows is a dict of columns (Dataset.map format)
+cleaned = preprocessor(rows)
+```
+
+### Parameters
+
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `pipeline` | `List[Callable]` | Ordered list of filter steps. Each step takes `List[Dict]` and returns `(kept, dropped)`. |
+| `dropped_log_path` | `str` | Optional JSONL file path for logging dropped rows with step name and reason. |
+
+## Built-in Filters
+
+### HardFilter
+
+Rule-based filter that removes trivially bad rows using deterministic rules. Supports multi-language detection (EN/ZH/JA/KO).
+
+```python
+from twinkle_agentic.preprocessor import HardFilter
+
+f = HardFilter(
+    min_user_chars=10,           # Min chars for non-CJK user query
+    min_user_chars_cjk=6,        # Min chars for CJK user query
+    min_assistant_chars_2turn=80, # Min assistant reply length (2-turn)
+    min_thinking_chars=200,      # Min thinking chain length to exempt
+    system_deny_keywords=['hack', 'exploit'],
+    max_chars_per_round=50000,
+    max_total_chars=200000,
+    max_rounds=50,
+)
+```
+
+**Drop reasons:** `trivial_single_turn`, `shallow_reply`, `all_empty_assistant`, `system_deny_keyword`, `round_too_long`, `total_too_long`, `too_many_rounds`
+
+### DeadLoopFilter
+
+Detects assistant messages exhibiting hesitation or dead-loop patterns — repetitive self-corrections, cascading corrections, and high n-gram repetition.
+
+```python
+from twinkle_agentic.preprocessor import DeadLoopFilter
+
+f = DeadLoopFilter(
+    hesitation_density_threshold=7.0,   # Markers per 1000 chars (response)
+    cascade_threshold=5,                 # Cascade markers in window
+    cascade_window=800,                  # Window size in chars
+    repetition_threshold=0.45,           # N-gram repetition ratio
+    think_hesitation_density_threshold=15.0,  # Laxer for <think> blocks
+    think_repetition_threshold=0.65,
+)
+```
+
+Uses separate threshold profiles for `<think>` reasoning blocks (laxer, free to ramble) and the visible response (stricter).
+
+### DedupFilter
+
+Global longest-wins deduplication. The signature is derived from the first real user turn (head+tail) and the first assistant reply.
+
+```python
+from twinkle_agentic.preprocessor import DedupFilter
+
+f = DedupFilter(prefix_chars=100, asst_chars=100)
+kept, dropped = f(all_rows)  # Must see entire dataset in one call
+```
+
+> **Note:** `DedupFilter` requires the full dataset in a single call. Do **not** place it inside `QualityPreprocessor` (which processes per-batch). Run it separately before or after the pipeline.
+
+### RefuseFilter
+
+Detects self-referential refusals in the first assistant reply (e.g., "I cannot help with that"). Multi-language pattern matching (EN/ZH/JA/KO).
+
+```python
+from twinkle_agentic.preprocessor import RefuseFilter
+
+f = RefuseFilter(check_window=600)  # Only check first N chars
+```
+
+### TokenSoupFilter
+
+Detects garbled / token-soup output by checking for replacement characters, control characters, private-use Unicode, leaked special tokens, single-character repetition, and script chaos.
+
+```python
+from twinkle_agentic.preprocessor import TokenSoupFilter
+
+f = TokenSoupFilter(
+    replacement_char_ratio=0.02,
+    special_token_count=20,
+    script_chaos_threshold=0.55,
+)
+```
+
+### PIIPresidioFilter
+
+Multi-language PII detection and rewriting using Microsoft Presidio + spaCy NER + Faker. Detects and replaces personally identifiable information (names, emails, phone numbers, addresses, etc.).
+
+```python
+from twinkle_agentic.preprocessor import PIIPresidioFilter
+
+f = PIIPresidioFilter(languages=['en', 'zh'])
+```
+
+### IntentClassifier
+
+Heuristic intent classifier that tags each row with detected intents. Pluggable detector pipeline.
+
+```python
+from twinkle_agentic.preprocessor import IntentClassifier
+
+classifier = IntentClassifier()
+```
+
+**Intent categories:** `tool_call`, `code`, `math`, `complex_logic`, `reasoning`, `user_dissatisfaction`, `other`
+
+### ScoreFilter
+
+Pluggable scorer-based filter with built-in scorers for character-level metrics, semantic similarity, and code execution.
+
+```python
+from twinkle_agentic.preprocessor import ScoreFilter
+
+f = ScoreFilter()
+```
+
+**Built-in scorers:** `ChrMinScorer`, `SIFDScorer`, `PassNScorer`, `ParaphraseScorer`
+
+### ModelFilter
+
+Filters rows by model ID whitelist.
+
+```python
+from twinkle_agentic.preprocessor import ModelFilter
+
+f = ModelFilter(allowed_models=['qwen3.5-4b', 'qwen3.5-32b'])
+```
+
+### MessageNormalizer
+
+Three-pass message normalization: heartbeat stripping, tool-call rewriting, and consecutive same-role message merging.
+
+```python
+from twinkle_agentic.preprocessor import MessageNormalizer
+
+normalizer = MessageNormalizer()
+```
+
+## Complete Pipeline Example
+
+```python
+from twinkle_agentic.preprocessor import (
+    QualityPreprocessor,
+    HardFilter,
+    DeadLoopFilter,
+    RefuseFilter,
+    TokenSoupFilter,
+    MessageNormalizer,
+    DedupFilter,
+)
+
+# Step 1: Global dedup (must run on full dataset)
+dedup = DedupFilter()
+rows, _ = dedup(all_rows)
+
+# Step 2: Per-batch pipeline
+pipeline = [
+    HardFilter(min_user_chars=10, max_rounds=30),
+    DeadLoopFilter(),
+    RefuseFilter(),
+    TokenSoupFilter(),
+    MessageNormalizer(),
+]
+preprocessor = QualityPreprocessor(pipeline, dropped_log_path='dropped.jsonl')
+cleaned = preprocessor(rows)
+```
diff --git a/docs/source_en/Components/Agentic/Protocol.md b/docs/source_en/Components/Agentic/Protocol.md
new file mode 100644
index 000000000..a44c5c3a2
--- /dev/null
+++ b/docs/source_en/Components/Agentic/Protocol.md
@@ -0,0 +1,91 @@
+# Protocol
+
+The Protocol module provides an abstract LLM API client interface and its OpenAI-compatible implementation. It bridges Twinkle's `Trajectory` / `SamplingParams` data types with external LLM inference services.
+
+## API Base Class
+
+```python
+from abc import ABC, abstractmethod
+from twinkle.data_format import Trajectory
+from twinkle.data_format.message import Message
+from twinkle.data_format.sampling import SamplingParams
+
+class API(ABC):
+    """Abstract LLM API client: Trajectory + SamplingParams -> assistant Message(s)."""
+
+    @abstractmethod
+    def __call__(
+        self,
+        trajectory: Trajectory,
+        sampling_params: SamplingParams,
+        **kwargs,
+    ) -> Union[Message, List[Message]]:
+        raise NotImplementedError()
+```
+
+The `API` class defines a simple contract: given a conversation trajectory and sampling parameters, return one or more assistant messages.
+
+## OpenAI
+
+`OpenAI` is the built-in implementation that works with any endpoint speaking the `/v1/chat/completions` protocol (OpenAI, Azure OpenAI, vLLM, SGLang, Ollama, etc.).
+
+```python
+from twinkle_agentic.protocol.openai import OpenAI
+
+api = OpenAI(
+    model='qwen3.5-32b',
+    base_url='http://localhost:8000/v1',
+    api_key='EMPTY',
+)
+```
+
+### Parameters
+
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `model` | `str` | Model name to pass in the API request. |
+| `api_key` | `str` | API key. Defaults to the `OPENAI_API_KEY` environment variable. |
+| `base_url` | `str` | Base URL of the API endpoint (e.g. `http://localhost:8000/v1`). |
+| `client_kwargs` | `Dict` | Extra keyword arguments forwarded to the `openai.OpenAI` client constructor. |
+
+### Usage
+
+```python
+from twinkle.data_format import Trajectory
+from twinkle.data_format.sampling import SamplingParams
+
+trajectory = {
+    'messages': [
+        {'role': 'user', 'content': 'What is the capital of France?'},
+    ]
+}
+
+sp = SamplingParams(temperature=0.7, max_tokens=512)
+reply = api(trajectory, sp)
+# reply is a Message dict: {'role': 'assistant', 'content': '...'}
+```
+
+### Features
+
+- **Tool calls**: Automatically maps `trajectory['tools']` to the API request and parses structured `tool_calls` from the response.
+- **Reasoning content**: Preserves `reasoning_content` from models that support it (e.g., o1-style reasoning).
+- **Finish reason**: Surfaces `finish_reason` on the returned message so multi-turn drivers can detect length-cap truncation.
+- **Multi-sample**: When `sampling_params.num_samples > 1`, returns a list of messages (one per choice).
+
+### Custom API Client
+
+To integrate a non-OpenAI API, subclass `API`:
+
+```python
+from twinkle_agentic.protocol.base import API
+
+class MyCustomAPI(API):
+
+    def __call__(self, trajectory, sampling_params, **kwargs):
+        # Call your custom endpoint
+        response = my_llm_client.chat(
+            messages=trajectory['messages'],
+            temperature=sampling_params.temperature,
+        )
+        return {'role': 'assistant', 'content': response.text}
+```
diff --git a/docs/source_en/Components/Agentic/Rollout.md b/docs/source_en/Components/Agentic/Rollout.md
new file mode 100644
index 000000000..94b143454
--- /dev/null
+++ b/docs/source_en/Components/Agentic/Rollout.md
@@ -0,0 +1,140 @@
+# Multi-Turn Rollout
+
+The Rollout module provides multi-turn conversation rollout engines for agentic RLHF training. Two implementations are available: `MultiTurnRollout` for batched vLLM sampling and `APIMultiTurnRollout` for OpenAI-compatible API endpoints.
+
+## Rollout Base Class
+
+```python
+from abc import ABC, abstractmethod
+from twinkle.data_format import Trajectory
+
+class Rollout(ABC):
+
+    @abstractmethod
+    def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]:
+        raise NotImplementedError()
+```
+
+All rollouts accept a list of trajectories and return the same number of trajectories with additional fields (`messages`, `turns`, `stop_reason`, `truncated`).
+
+## MultiTurnRollout
+
+Batched multi-turn rollout engine that uses a vLLM sampler for generation. All active trajectories are sampled in a single batched call per turn for maximum throughput.
+
+### Per-turn Loop
+
+1. Encode each trajectory into an `InputFeature` with a generation prompt
+2. Batch `sampler.sample(active_pifs)` — all live trajectories in parallel
+3. Check termination: `stop_reason == 'length'`, no tool calls, or max turns reached
+4. Dispatch tools via `ToolManager`, append tool responses
+5. Compute bridge tokens (tool turns + generation prompt) with `labels = -100`
+6. Repeat until all trajectories are done
+
+```python
+from twinkle_agentic.rollout.multi_turn import MultiTurnRollout
+from twinkle_agentic.tools.tool_manager import ToolManager
+from twinkle.data_format.sampling import SamplingParams
+
+rollout = MultiTurnRollout(
+    sampler=vllm_sampler,
+    template=template,
+    tool_manager=tool_manager,
+    sampling_params=SamplingParams(temperature=0.7, max_tokens=4096),
+    max_turns=6,
+    max_trajectory_tokens=8192,
+    trace_dir='rollout_traces/',
+)
+
+# Run rollout
+results = rollout(trajectories)
+```
+
+### Parameters
+
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `sampler` | Sampler | vLLM sampler instance for batched generation. |
+| `template` | `Template` | Chat template for encoding/decoding. |
+| `tool_manager` | `ToolManager` | Tool dispatcher. Can also be passed per-call. |
+| `sampling_params` | `SamplingParams` | Default sampling parameters. |
+| `max_turns` | `int` | Maximum number of turns per trajectory (default: 6). |
+| `max_trajectory_tokens` | `int` | Maximum total token length; a trajectory that exceeds it is truncated. |
+| `trace_dir` | `str` | Directory for per-trajectory JSON trace dumps. |
+| `trace_callback` | `Callable` | Decides whether to store a trajectory trace. |
+| `success_callback` | `Callable` | Decides filename prefix (`ok-` vs `fail-`). |
+
+### Output Fields
+
+Each output trajectory dict includes:
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `messages` | `List[Dict]` | Full conversation including tool turns. |
+| `input_ids` | `List[int]` | Token IDs of the full sequence. |
+| `labels` | `List[int]` | Training labels (`-100` for non-trainable tokens). |
+| `turns` | `int` | Number of turns performed. |
+| `stop_reason` | `str` | `'stop'` / `'length'` |
+| `truncated` | `bool` | Whether the trajectory was truncated. |
+| `logprobs` | `List` | Per-token log probabilities (if available). |
+
+### Ray Remote Support
+
+`MultiTurnRollout` is decorated with `@remote_class()`, enabling transparent deployment as a Ray actor:
+
+```python
+# The rollout can run as a Ray remote actor
+rollout_actor = MultiTurnRollout.remote(sampler=sampler, template=template, ...)
+results = ray.get(rollout_actor.__call__.remote(trajectories))
+```
+
+## APIMultiTurnRollout
+
+Multi-turn rollout over an OpenAI-compatible chat-completions API. Each trajectory runs independently in a thread pool for network concurrency.
+
+```python
+from twinkle_agentic.rollout.api_multi_turn import APIMultiTurnRollout
+from twinkle_agentic.protocol.openai import OpenAI
+
+api = OpenAI(model='qwen3.5-32b', base_url='http://localhost:8000/v1')
+
+rollout = APIMultiTurnRollout(
+    api=api,
+    tool_manager=tool_manager,
+    sampling_params=SamplingParams(temperature=0.7),
+    max_turns=6,
+    concurrency=8,
+    trace_dir='api_traces/',
+)
+
+results = rollout(trajectories)
+```
+
+### Parameters
+
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `api` | `OpenAI` | OpenAI-compatible API client. |
+| `tool_manager` | `ToolManager` | Tool dispatcher (single or per-trajectory list). |
+| `sampling_params` | `SamplingParams` | Default sampling parameters. |
+| `max_turns` | `int` | Maximum turns per trajectory (default: 6). |
+| `concurrency` | `int` | Thread pool size for parallel API calls (default: 8). |
+| `extra_body` | `Dict` | Extra fields to include in API requests. |
+| `trace_dir` | `str` | Directory for trace dumps. |
+
+### Stop Reasons
+
+| Reason | Description |
+|--------|-------------|
+| `stop` | Assistant responded without tool calls (natural end). |
+| `length` | API returned `finish_reason='length'` (token limit). |
+| `max_turns` | Reached `max_turns` limit. |
+| `api_error` | API call or tool execution raised an exception. |
+
+## Choosing Between Rollouts
+
+| Feature | MultiTurnRollout | APIMultiTurnRollout |
+|---------|-----------------|---------------------|
+| **Backend** | vLLM sampler (local GPU) | OpenAI-compatible API |
+| **Training integration** | Produces `input_ids` / `labels` for GRPO | Messages only (for data collection) |
+| **Batching** | GPU-level batch parallelism | Network-level thread concurrency |
+| **Use case** | Online RLHF training loop | Offline data generation / evaluation |
diff --git a/docs/source_en/Components/Agentic/Tools.md b/docs/source_en/Components/Agentic/Tools.md
new file mode 100644
index 000000000..e53c7371f
--- /dev/null
+++ b/docs/source_en/Components/Agentic/Tools.md
@@ -0,0 +1,119 @@
+# Tools & ToolManager
+
+The Tools module provides an abstract tool interface and a central tool dispatcher (`ToolManager`) for agentic multi-turn rollouts. Tools follow the OpenAI function-calling schema for seamless integration with LLM tool-use capabilities.
+
+## Tool Base Class
+
+```python
+from abc import ABC, abstractmethod
+from twinkle.data_format import Tool as ToolInfo
+
+class Tool(ABC):
+
+    @abstractmethod
+    def __call__(self, tool_name: str, arguments: Dict[str, Any]) -> str:
+        """Execute the tool and return a string result."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def tool_info(self) -> ToolInfo:
+        """Return OpenAI-compatible tool schema."""
+        raise NotImplementedError
+```
+
+### Implementing a Custom Tool
+
+```python
+from twinkle_agentic.tools.base import Tool
+
+class SearchTool(Tool):
+
+    def __call__(self, tool_name: str, arguments: dict) -> str:
+        query = arguments.get('query', '')
+        # Perform search logic
+        return f'Search results for: {query}'
+
+    def tool_info(self):
+        return {
+            'type': 'function',
+            'function': {
+                'name': 'search',
+                'description': 'Search the web for information.',
+                'parameters': {
+                    'type': 'object',
+                    'properties': {
+                        'query': {
+                            'type': 'string',
+                            'description': 'The search query.',
+                        },
+                    },
+                    'required': ['query'],
+                },
+            },
+        }
+```
+
+## ToolManager
+
+`ToolManager` is a registry and dispatcher for tools. It resolves tool calls from the LLM's structured output and routes them to the correct tool implementation.
+
+```python
+from twinkle_agentic.tools.tool_manager import ToolManager
+
+# Initialize with a list of Tool instances
+manager = ToolManager([search_tool, calculator_tool])
+
+# Or with a dict
+manager = ToolManager({'search': search_tool, 'calc': calculator_tool})
+
+# Or register dynamically
+manager = ToolManager()
+manager.register(search_tool)
+manager.register(calculator_tool)
+```
+
+### Key Methods
+
+| Method | Description |
+|--------|-------------|
+| `register(tool)` | Register a tool (name extracted from `tool_info()`). |
+| `unregister(name)` | Remove a tool by name. |
+| `names()` | List all registered tool names. |
+| `copy()` | Create a shallow copy of the manager. |
+| `tool_infos()` | Return a list of all tool schemas (for API requests). |
+| `__call__(tool_call)` | Dispatch a tool call and return the result string. |
+
+### Dispatching Tool Calls
+
+`ToolManager` accepts OpenAI-shaped tool call dicts:
+
+```python
+tool_call = {
+    'id': 'call_1',
+    'type': 'function',
+    'function': {
+        'name': 'search',
+        'arguments': '{"query": "Python tutorials"}',
+    },
+}
+
+result = manager(tool_call)
+# result: 'Search results for: Python tutorials'
+```
+
+**Error handling:** If the tool name is unknown, arguments are invalid JSON, or the tool raises an exception, `ToolManager` returns a descriptive error string instead of raising — this keeps the rollout loop running.
+
+### Integration with Rollout
+
+```python
+from twinkle_agentic.rollout.multi_turn import MultiTurnRollout
+
+rollout = MultiTurnRollout(
+    sampler=sampler,
+    template=template,
+    tool_manager=manager,  # Pass manager to rollout
+    max_turns=6,
+)
+```
+
+The rollout engine calls `manager(tool_call)` for each tool call generated by the model, and appends the result as a `{'role': 'tool', 'content': result}` message.
diff --git a/docs/source_en/Components/Agentic/index.rst b/docs/source_en/Components/Agentic/index.rst
new file mode 100644
index 000000000..802034366
--- /dev/null
+++ b/docs/source_en/Components/Agentic/index.rst
@@ -0,0 +1,11 @@
+Agentic
+===============
+.. toctree::
+   :maxdepth: 1
+
+   Preprocessor.md
+   Protocol.md
+   Rollout.md
+   Tools.md
+   Envs.md
+   Multi-Turn-Tool-Usage.md
diff --git a/docs/source_en/Components/CLI/CLI.md b/docs/source_en/Components/CLI/CLI.md
new file mode 100644
index 000000000..9efe3c894
--- /dev/null
+++ b/docs/source_en/Components/CLI/CLI.md
@@ -0,0 +1,134 @@
+# CLI
+
+The CLI module provides a unified configuration system for Twinkle training scripts. It merges multiple configuration sources (environment variables, `.env` files, YAML configs, and command-line arguments) into a single `Args` dataclass with typed argument groups.
+
+## Resolution Order
+
+Configuration sources are applied in order (later wins):
+
+1. **Dataclass defaults** — sensible out-of-the-box values
+2. **`.env` file** — project-local overrides
+3. **Environment variables** — `TWINKLE_` prefix or bare keys
+4. **YAML config file** — `--config path/to/config.yaml`
+5. **CLI overrides** — `--key value` (highest priority)
+
+All keys are case-insensitive, and dashes and underscores are treated as equivalent.
+
+## Quick Start
+
+```python
+from twinkle.cli import CLI
+
+args = CLI.from_args()
+
+# Access typed groups
+print(args.model.model_id)
+print(args.training.max_steps)
+print(args.optimizer.learning_rate)
+
+# Or get dictionaries for component construction
+model_kwargs = args.get_model_args()
+optimizer_kwargs = args.get_optimizer_args()
+```
+
+## Argument Groups
+
+| Group | Class | Key Parameters |
+|:------|:------|:---------------|
+| model | `ModelArgs` | `model_id`, `mixed_precision`, `strategy`, `gradient_checkpointing` |
+| lora | `LoraArgs` | `use_lora`, `lora_r`, `lora_alpha`, `lora_target_modules` |
+| dataset | `DatasetArgs` | `dataset_id`, `subset_name`, `split`, `streaming` |
+| template | `TemplateArgs` | `template_cls`, `max_length`, `truncation_strategy`, `enable_thinking` |
+| training | `TrainingArgs` | `max_steps`, `batch_size`, `micro_batch_size`, `output_dir`, `save_steps` |
+| optimizer | `OptimizerArgs` | `optimizer_cls`, `learning_rate`, `weight_decay`, `max_grad_norm` |
+| scheduler | `SchedulerArgs` | `scheduler_cls`, `num_warmup_steps`, `t_max` |
+| loss | `LossArgs` | `loss_cls`, `epsilon`, `beta`, `sft_weight` |
+| sampler | `SamplerArgs` | `sampler_type`, `gpu_memory_utilization`, `tensor_parallel_size` |
+| sampling | `SamplingArgs` | `max_tokens`, `temperature`, `top_k`, `top_p`, `num_samples` |
+| infra | `InfraArgs` | `mode`, `nproc_per_node`, `model_gpus`, `sampler_gpus`, `dp_size` |
+| server | `ServerArgs` | `config`, `host`, `port`, `ray_namespace` |
+| rl | `RLArgs` | `num_generations`, `advantage_type`, `reward_fns` |
+| checkpoint | `CheckpointArgs` | `save_optimizer`, `merge_and_sync`, `platform` |
+
+## YAML Configuration
+
+```yaml
+# config.yaml
+model_id: ms://Qwen/Qwen3.5-4B
+mixed_precision: bf16
+strategy: accelerate
+
+use_lora: true
+lora_r: 16
+lora_alpha: 32
+
+dataset_id: ms://swift/self-cognition
+max_length: 4096
+
+batch_size: 8
+micro_batch_size: 2
+max_steps: 200
+learning_rate: 1e-5
+
+mode: ray
+nproc_per_node: 8
+model_gpus: 4
+sampler_gpus: 4
+```
+
+## Command-Line Usage
+
+```bash
+# Use with YAML config
+python train.py --config config.yaml
+
+# Override specific values
+python train.py --config config.yaml --learning_rate 5e-6 --max_steps 500
+
+# Boolean flags
+python train.py --use_lora --no_gradient_checkpointing
+
+# Without config file (all from CLI)
+python train.py --model_id ms://Qwen/Qwen3.5-4B --batch_size 4
+```
+
+## Environment Variables
+
+```bash
+# TWINKLE_ prefix
+export TWINKLE_MODEL_ID=ms://Qwen/Qwen3.5-4B
+export TWINKLE_LEARNING_RATE=1e-5
+
+# Or bare keys (when recognized)
+export MODEL_ID=ms://Qwen/Qwen3.5-4B
+```
+
+## Field Aliases
+
+Some fields support aliases for convenience:
+
+- `learning_rate` ↔ `lr`
+- `nproc_per_node` ↔ `num_gpus`
+- `max_tokens` ↔ `max_new_tokens`
+- `use_megatron=true` → `strategy=native_fsdp`
+
+## Custom Config Sources
+
+You can extend the CLI with custom configuration sources:
+
+```python
+from twinkle.cli.cli import ConfigSource, Args, ConfigResolver
+
+class RemoteConfigSource(ConfigSource):
+    def __init__(self, url: str):
+        self.url = url
+
+    def load(self) -> dict:
+        import requests
+        return requests.get(self.url).json()
+
+# Apply custom source
+args = Args()
+resolver = ConfigResolver(args)
+resolver.apply(RemoteConfigSource('http://config-server/my-config').load())
+```
diff --git "a/docs/source_zh/\347\273\204\344\273\266/Gym/index.rst" b/docs/source_en/Components/CLI/index.rst
similarity index 76%
rename from "docs/source_zh/\347\273\204\344\273\266/Gym/index.rst"
rename to docs/source_en/Components/CLI/index.rst
index 85d941b97..cf59fa766 100644
--- "a/docs/source_zh/\347\273\204\344\273\266/Gym/index.rst"
+++ b/docs/source_en/Components/CLI/index.rst
@@ -1,6 +1,6 @@
-Gym
+CLI
 ===============
 .. toctree::
    :maxdepth: 1
 
-   Gym.md
+   CLI.md
diff --git a/docs/source_en/Components/Gym/Gym.md b/docs/source_en/Components/Gym/Gym.md
deleted file mode 100644
index 4db355b8a..000000000
--- a/docs/source_en/Components/Gym/Gym.md
+++ /dev/null
@@ -1,26 +0,0 @@
-# Gym
-
-The Gym component provides an interface for reinforcement learning environments in Twinkle.
-
-```python
-from twinkle.gym import Gym
-
-class CustomGym(Gym):
-
-    def step(self, trajectories, **kwargs):
-        """
-        Execute one RL step: evaluate trajectories and return rewards.
-
-        Args:
-            trajectories: Model-generated trajectories to evaluate
-            **kwargs: Additional arguments
-
-        Returns:
-            Reward values for each trajectory
-        """
-        ...
-```
-
-The Gym abstraction allows you to plug in custom RL environments that interact with the training loop. It decouples reward computation and environment interaction from the core training logic.
-
-> Gym is typically used in on-policy RL training where the environment needs to provide feedback on model-generated outputs.
diff --git a/docs/source_en/Components/Loss/InfoNCELoss.md b/docs/source_en/Components/Loss/InfoNCELoss.md
new file mode 100644
index 000000000..1c6cada5d
--- /dev/null
+++ b/docs/source_en/Components/Loss/InfoNCELoss.md
@@ -0,0 +1,68 @@
+# InfoNCE Loss
+
+The `InfonceLoss` implements contrastive learning with in-batch negatives and optional cross-rank gathering. It is designed for embedding/retrieval model training.
+
+## Usage
+
+```python
+from twinkle.loss import InfonceLoss
+
+loss_fn = InfonceLoss(
+    temperature=0.1,
+    use_batch=True,           # Enable in-batch negatives
+    hard_negatives=7,         # Fix negative count per sample
+    mask_fake_negative=True,  # Mask false negatives
+    fake_neg_margin=0.1,      # Margin for false negative detection
+)
+
+model.set_loss(loss_fn)
+```
+
+## Input Format
+
+Each sample is laid out as `anchor(1) + positive(1) + negatives(n)` in a flat embedding tensor. `inputs['labels']` is a 1-D mask in which `1` marks the start of each group.
+
+```
+embeddings: [a0, p0, n0_1, n0_2, a1, p1, n1_1, n1_2, ...]
+labels:     [ 1,  0,    0,    0,  1,  0,    0,    0, ...]
+```
+
+## Parameters
+
+| Parameter | Type | Default | Description |
+|:----------|:-----|:--------|:------------|
+| `temperature` | float | 0.1 | Logit scaling factor |
+| `use_batch` | bool | True | Use cross-sample in-batch negatives |
+| `hard_negatives` | int | None | Fix per-sample negative count (truncate/upsample) |
+| `mask_fake_negative` | bool | False | Mask logits > positive + margin |
+| `fake_neg_margin` | float | 0.1 | Threshold for false negative masking |
+| `include_qq` | bool | False | Add query-query similarity block |
+| `include_dd` | bool | False | Add doc-doc similarity block |
+
+## Cross-Rank Gathering
+
+When `use_batch=True` and distributed training is active, embeddings are gathered from all DP ranks to maximize in-batch negative diversity. Only the local shard retains gradients.
+
+## Similarity Blocks
+
+The loss supports three similarity blocks for comprehensive contrastive learning:
+
+- **Q→D (default)**: Query to all documents — primary contrastive signal
+- **Q→Q** (`include_qq=True`): Query to all other queries — prevents query collapse
+- **D→D** (`include_dd=True`): Document to all other documents — Qwen3-Embedding style
+
+## Example: Embedding Training
+
+```python
+from twinkle.loss import InfonceLoss
+from twinkle.metric import EmbeddingMetric
+
+# Configure model for embedding
+model.set_loss(InfonceLoss(temperature=0.05, use_batch=True, include_qq=True))
+model.set_metric(EmbeddingMetric(device_mesh=mesh, process_group=pg))
+
+# Training loop
+for batch in dataloader:
+    model.forward_backward(batch)
+    model.clip_grad_and_step()
+```
diff --git a/docs/source_en/Components/Loss/index.rst b/docs/source_en/Components/Loss/index.rst
index dceaf20f0..c6f3cb2c9 100644
--- a/docs/source_en/Components/Loss/index.rst
+++ b/docs/source_en/Components/Loss/index.rst
@@ -3,6 +3,19 @@ Loss
 .. toctree::
    :maxdepth: 1
 
+   CrossEntropy.md
+   ChunkedCrossEntropy.md
+   DPOLoss.md
+   GKDLoss.md
+   GRPOLoss.md
+   InfoNCELoss.md
+   MSELoss.md
+   Building-Loss.md
+Loss
+===============
+.. toctree::
+   :maxdepth: 1
+
    CrossEntropy.md
    ChunkedCrossEntropy.md
    DPOLoss.md
diff --git a/docs/source_en/Components/Metrics/EmbeddingMetric.md b/docs/source_en/Components/Metrics/EmbeddingMetric.md
new file mode 100644
index 000000000..e8cea06ba
--- /dev/null
+++ b/docs/source_en/Components/Metrics/EmbeddingMetric.md
@@ -0,0 +1,42 @@
+# EmbeddingMetric
+
+The `EmbeddingMetric` tracks embedding quality during contrastive (InfoNCE) training. It reports anchor-positive cosine similarity statistics and in-batch negative similarity.
+
+## Usage
+
+```python
+from twinkle.metric import EmbeddingMetric
+
+metric = EmbeddingMetric(device_mesh=device_mesh, process_group=process_group)
+
+# During training
+metric.accumulate(inputs, outputs)
+
+# At log interval
+results = metric.calculate()
+# results: {
+#   'pos_sim': '0.8523',     # Mean anchor-positive cosine similarity
+#   'pos_sim_min': '0.7102', # Min across batch
+#   'pos_sim_max': '0.9451', # Max across batch
+#   'neg_sim': '0.2134',     # Mean anchor-negative similarity
+#   'loss': '0.3412',        # Average InfoNCE loss
+#   'grad_norm': '1.234567', # Gradient norm
+# }
+```
+
+## Reported Metrics
+
+| Metric | Description |
+|:-------|:------------|
+| `pos_sim` | Mean cosine similarity between anchors and their positives |
+| `pos_sim_min` | Minimum anchor-positive similarity in the batch |
+| `pos_sim_max` | Maximum anchor-positive similarity in the batch |
+| `neg_sim` | Mean similarity between anchors and other positives (in-batch negatives) |
+| `loss` | Average contrastive loss value |
+| `grad_norm` | Gradient norm (passed via kwargs) |
+
+## Cross-Rank Gathering
+
+`EmbeddingMetric` performs an `all_gather` to compute similarity statistics across all DP ranks, providing a global view of embedding quality even under data-parallel training.
+
+> This metric pairs with `InfonceLoss` for embedding/retrieval training tasks.
diff --git a/docs/source_en/Components/Metrics/GRPOMetric.md b/docs/source_en/Components/Metrics/GRPOMetric.md
new file mode 100644
index 000000000..cd4f11551
--- /dev/null
+++ b/docs/source_en/Components/Metrics/GRPOMetric.md
@@ -0,0 +1,66 @@
+# GRPOMetric
+
+The `GRPOMetric` tracks policy optimization diagnostics during GRPO training, including KL divergence, clipping rates, entropy, and log-probability statistics.
+
+## Usage
+
+```python
+from twinkle.metric import GRPOMetric
+
+metric = GRPOMetric(
+    device_mesh=device_mesh,
+    process_group=process_group,
+    epsilon=0.2,          # PPO clip range
+    temperature=1.0,      # Sampling temperature for logp rescaling
+    top_k_kl=10,          # Track top-K high-KL tokens per step
+)
+
+# During training loop
+metric.accumulate(inputs, outputs, old_logps=old_logps, advantages=advantages)
+
+# At log interval
+results = metric.calculate()
+# results: {
+#   'train/policy_confidence': 0.85,
+#   'train/mean_new_logp': -1.23,
+#   'train/mean_old_logp': -1.30,
+#   'train/logp_diff_mean': 0.07,
+#   'train/approx_kl': 0.003,
+#   'train/token_kl_max': 0.15,
+#   'train/entropy': 2.1,
+#   'train/clip_ratio': 0.02,
+#   'train/clip_ratio_low': 0.01,
+#   'train/clip_ratio_high': 0.01,
+# }
+```
+
+## Reported Metrics
+
+| Metric | Description |
+|:-------|:------------|
+| `train/policy_confidence` | exp(mean_new_logp) — higher means model is more confident |
+| `train/mean_new_logp` | Average log-probability of generated tokens under current policy |
+| `train/mean_old_logp` | Average log-probability of generated tokens under the old policy (the `old_logps` passed to `accumulate`) |
+| `train/logp_diff_mean` | Mean (new - old) log-probability difference |
+| `train/approx_kl` | Schulman K3 estimator of KL(old \|\| new) |
+| `train/token_kl_max` | Maximum per-token KL across all ranks |
+| `train/token_ratio_max` | Maximum importance weight across all ranks |
+| `train/entropy` | Average token-level entropy |
+| `train/clip_ratio` | Fraction of tokens clipped (low + high) |
+| `train/clip_ratio_low` | Fraction clipped below (ratio < 1-ε, negative advantage) |
+| `train/clip_ratio_high` | Fraction clipped above (ratio > 1+ε, positive advantage) |
+
+## Variants
+
+- **`GSPOMetric`** — Computes clip rate at sequence level (geometric-mean ratio per sequence)
+- **`CISPOMetric`** — Unconditional clip rate (not gated by advantage sign)
+
+## Parameters
+
+| Parameter | Type | Default | Description |
+|:----------|:-----|:--------|:------------|
+| `epsilon` | float | 0.2 | Lower clip boundary |
+| `epsilon_high` | float | None | Upper clip boundary (defaults to epsilon) |
+| `temperature` | float | 1.0 | Rescale logps to T=1 before computing KL |
+| `top_k_kl` | int | 0 | If > 0, record top-K high-KL token details |
+| `ignore_index` | int | -100 | Label value to mask out |
diff --git a/docs/source_en/Components/Metrics/index.rst b/docs/source_en/Components/Metrics/index.rst
index 5d50e183b..68215482d 100644
--- a/docs/source_en/Components/Metrics/index.rst
+++ b/docs/source_en/Components/Metrics/index.rst
@@ -8,4 +8,6 @@ Metrics
    Accuracy.md
    CompletionRewardMetric.md
    DPOMetric.md
+   GRPOMetric.md
+   EmbeddingMetric.md
    Building-Metrics.md
diff --git a/docs/source_en/Components/Model/MultiLoraTransformersModel.md b/docs/source_en/Components/Model/MultiLoraTransformersModel.md
index c196f900b..0c78a6886 100644
--- a/docs/source_en/Components/Model/MultiLoraTransformersModel.md
+++ b/docs/source_en/Components/Model/MultiLoraTransformersModel.md
@@ -30,3 +30,48 @@ The reason for the existence of max_loras and max_r parameters is that Twinkle's
 Because of this, the user's r must be less than or equal to the max_r configuration. During actual training, only part of the lora's rank will be used in the calculation.
 
 MultiLoraTransformersModel supports the `@remote_class` annotation and supports device_mesh, which means it can run in Ray workers.
+
+## Tenant Lifecycle
+
+Under the hood, `MultiLoraTransformersModel` uses the `MultiLora` manager to handle tenant LoRA slots. The key APIs:
+
+### acquire_lora
+
+Claim an available LoRA slot for a tenant:
+
+```python
+adapter_name = model.multi_lora.acquire_lora('tenant_a', LoraConfig(r=16, lora_alpha=32))
+```
+
+- Raises `RuntimeError` if all slots are in use or `config.r > max_r`
+
+### release_lora
+
+Release a tenant's LoRA slot, resetting weights to initial state:
+
+```python
+model.multi_lora.release_lora('tenant_a')
+```
+
+### Context Manager
+
+Use `adapter()` for scoped activation:
+
+```python
+with model.multi_lora.adapter('tenant_a') as name:
+    output = model.forward(inputs)
+```
+
+### LoraTenant
+
+Each slot is tracked as a `LoraTenant` dataclass:
+
+```python
+@dataclass
+class LoraTenant:
+    index: int                    # Slot index (0..max_loras-1)
+    adapter_name: str             # Internal name (e.g. "lora_0")
+    config: LoraConfig            # Pre-allocated config (max_r)
+    tenant_adapter_name: str      # User-facing tenant name (None if free)
+    tenant_config: LoraConfig     # Tenant's actual config (None if free)
+```
diff --git a/docs/source_en/Components/Model/SupportedModels.md b/docs/source_en/Components/Model/SupportedModels.md
new file mode 100644
index 000000000..7cd9e8b4d
--- /dev/null
+++ b/docs/source_en/Components/Model/SupportedModels.md
@@ -0,0 +1,79 @@
+# Supported Models
+
+Twinkle supports any model compatible with HuggingFace Transformers or Megatron-LM. Below is a curated list of models tested with Twinkle.
+
+## Language Models
+
+| Model Family | Model IDs | Parameters | Features |
+|:-------------|:----------|:-----------|:---------|
+| Qwen 3.5 | `Qwen/Qwen3.5-0.6B` ~ `Qwen/Qwen3.5-235B-A22B` | 0.6B–235B | MoE, Thinking mode |
+| Qwen 2.5 | `Qwen/Qwen2.5-0.5B` ~ `Qwen/Qwen2.5-72B` | 0.5B–72B | Dense |
+| DeepSeek V4 | `deepseek-ai/DeepSeek-V4` | 685B MoE | Custom DSML encoding |
+| DeepSeek R1 | `deepseek-ai/DeepSeek-R1` | 685B MoE | Reasoning |
+| LLaMA 3 | `meta-llama/Llama-3.3-70B-Instruct` | 8B–70B | Dense |
+| Mistral | `mistralai/Mistral-7B-v0.3` | 7B | Dense |
+| Yi | `01-ai/Yi-1.5-34B` | 6B–34B | Dense |
+| GLM-4 | `THUDM/glm-4-9b-chat` | 9B | Dense |
+| InternLM 2.5 | `internlm/internlm2_5-7b-chat` | 7B–20B | Dense |
+
+## Vision-Language Models
+
+| Model Family | Model IDs | Features |
+|:-------------|:----------|:---------|
+| Qwen 3.5 VL | `Qwen/Qwen3.5-VL-3B` ~ `Qwen/Qwen3.5-VL-72B` | Image, Video |
+| Qwen 2.5 VL | `Qwen/Qwen2.5-VL-7B-Instruct` | Image, Video |
+| InternVL 2.5 | `OpenGVLab/InternVL2_5-8B` | Image |
+
+## Embedding Models
+
+| Model Family | Model IDs | Training Method |
+|:-------------|:----------|:----------------|
+| Qwen3 Embedding | `Qwen/Qwen3-Embedding-0.6B` | InfoNCE contrastive |
+| GTE | `thenlper/gte-large-zh` | InfoNCE contrastive |
+
+## Model Loading
+
+Models can be loaded from ModelScope or HuggingFace:
+
+```python
+from twinkle.model import TransformersModel
+
+# From ModelScope (ms:// prefix)
+model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B')
+
+# From HuggingFace (hf:// prefix)
+model = TransformersModel(model_id='hf://meta-llama/Llama-3.3-70B-Instruct')
+
+# Local path
+model = TransformersModel(model_id='/path/to/model')
+```
+
+## Framework Support
+
+| Framework | Class | Use Case |
+|:----------|:------|:---------|
+| Transformers | `TransformersModel` | General training (SFT, RLHF, DPO) |
+| Transformers + Multi-LoRA | `MultiLoraTransformersModel` | Multi-tenant training |
+| Megatron-LM | `MegatronModel` | Large-scale distributed pre-training |
+| Megatron + Multi-LoRA | `MultiLoraMegatronModel` | Large-scale multi-tenant |
+
+## Precision Support
+
+| Mode | Description |
+|:-----|:------------|
+| `bf16` | BFloat16 mixed precision (recommended for A100/H100) |
+| `fp16` | Float16 mixed precision (for older GPUs) |
+| `fp8` | FP8 precision (H100 with Transformer Engine) |
+| `no` | Full precision (debugging only) |
+
+## Parallelism Strategies
+
+| Strategy | Config Key | Description |
+|:---------|:-----------|:------------|
+| FSDP | `strategy=accelerate` | Accelerate-managed FSDP (default) |
+| Native FSDP | `strategy=native_fsdp` | PyTorch native FSDP |
+| Tensor Parallel | `tp_size` | Split layers across GPUs |
+| Pipeline Parallel | `pp_size` | Split model stages |
+| Data Parallel | `dp_size` | Replicate model, split data |
+| Sequence Parallel | `sequence_parallel` | Split long sequences |
+| Expert Parallel | `ep_size` | MoE expert distribution |
diff --git a/docs/source_en/Components/Model/index.rst b/docs/source_en/Components/Model/index.rst
index e0648f00f..4802cd0d3 100644
--- a/docs/source_en/Components/Model/index.rst
+++ b/docs/source_en/Components/Model/index.rst
@@ -8,3 +8,14 @@ Model
    MultiLoraTransformersModel.md
    MegatronModel.md
    MultiLoraMegatronModel.md
+   SupportedModels.md
+Model
+===============
+.. toctree::
+   :maxdepth: 1
+
+   TwinkleModel.md
+   TransformersModel.md
+   MultiLoraTransformersModel.md
+   MegatronModel.md
+   MultiLoraMegatronModel.md
diff --git a/docs/source_en/Components/Notifier/Notifier.md b/docs/source_en/Components/Notifier/Notifier.md
new file mode 100644
index 000000000..c4e14b511
--- /dev/null
+++ b/docs/source_en/Components/Notifier/Notifier.md
@@ -0,0 +1,93 @@
+# Notifier
+
+The Notifier component provides a pluggable notification system for sending alerts during training. When exceptions occur or training events need attention, notifiers deliver messages to external channels (e.g., DingTalk webhooks).
+
+## Base Interface
+
+```python
+from twinkle.notifier import Notifier
+
+class Notifier:
+    def __call__(self, message: str):
+        """Send a notification message."""
+        ...
+
+    def to_dict(self) -> dict:
+        """Serialize for checkpoint/restore."""
+        ...
+
+    @classmethod
+    def from_dict(cls, data: dict) -> Notifier:
+        """Restore from serialized form."""
+        ...
+```
+
+## DingNotifier
+
+Sends notifications to DingTalk (钉钉) custom robot webhooks.
+
+```python
+from twinkle.notifier import DingNotifier
+
+notifier = DingNotifier(
+    ding_url='https://oapi.dingtalk.com/robot/send?access_token=xxx',
+    secret='SECxxxxxxx',  # Optional: for signed robots
+    timeout=5.0,
+)
+
+# Send a message
+notifier("### Training Complete\n\n- Steps: 1000\n- Loss: 0.25")
+```
+
+**Parameters:**
+- `ding_url`: Full DingTalk webhook URL with access token
+- `secret`: Optional signing secret for signed-robot mode
+- `timeout`: HTTP request timeout in seconds (default: 5.0)
+
+Messages are sent in DingTalk **Markdown** format. The first heading line is extracted as the chat preview title.
+
+## Exception Notifications
+
+Twinkle provides automatic exception notification with deduplication:
+
+```python
+from twinkle.notifier.base import notify_exception
+
+# Automatically sends formatted exception info
+# Only one rank sends per unique exception (prevents flooding)
+try:
+    model.forward_backward(batch)
+except Exception as e:
+    notify_exception(notifier, context='forward_backward', exc=e, name='sft_train')
+```
+
+The notification includes:
+- Exception type and message
+- Full traceback
+- Runtime metadata (rank, PID, hostname)
+- Deduplication: only one notification per unique exception across all ranks
+
+## Custom Notifier
+
+Create custom notifiers by subclassing `Notifier`:
+
+```python
+from twinkle.notifier import Notifier
+
+class SlackNotifier(Notifier):
+    def __init__(self, webhook_url: str):
+        self.webhook_url = webhook_url
+
+    def __call__(self, message: str):
+        import requests
+        requests.post(self.webhook_url, json={'text': message})
+
+    def to_dict(self):
+        return {'class': 'SlackNotifier', 'webhook_url': self.webhook_url}
+
+    @classmethod
+    def _from_dict_impl(cls, data):
+        return cls(webhook_url=data['webhook_url'])
+```
+
+> Notifiers are registered automatically via `__init_subclass__`, so `Notifier.from_dict()` can restore any subclass by name.
diff --git a/docs/source_en/Components/Notifier/index.rst b/docs/source_en/Components/Notifier/index.rst
new file mode 100644
index 000000000..ff82117d4
--- /dev/null
+++ b/docs/source_en/Components/Notifier/index.rst
@@ -0,0 +1,6 @@
+Notifier
+===============
+.. toctree::
+   :maxdepth: 1
+
+   Notifier.md
diff --git a/docs/source_en/Components/TUI/Auto-Research.md b/docs/source_en/Components/TUI/Auto-Research.md
new file mode 100644
index 000000000..bd27a9705
--- /dev/null
+++ b/docs/source_en/Components/TUI/Auto-Research.md
@@ -0,0 +1,313 @@
+# Auto-Research (TUI)
+
+Twinkle TUI is a terminal-based intelligent training assistant that lets you **control, monitor, and debug ML training through natural language**. It combines a chat-driven AI agent with real-time metrics visualization, log streaming, and an automated health monitor that can detect and fix training failures autonomously.
+
+## Architecture Overview
+
+```
+┌──────────────────────────────────────────────────────────┐
+│ TwinkleTUI (Textual App)                                 │
+│ ┌──────────────────────────────────────────────────────┐ │
+│ │ StatusBar: state / run_id / model / step / progress  │ │
+│ ├──────────────────────┬───────────────────────────────┤ │
+│ │ MetricsPanel         │ LogPanel                      │ │
+│ │ (ASCII chart)        │ (scrolling logs)              │ │
+│ ├──────────────────────┤                               │ │
+│ │ ChatPanel            │                               │ │
+│ │ (user <-> agent)     │                               │ │
+│ └──────────────────────┴───────────────────────────────┘ │
+│                                                          │
+│ Background Services:                                     │
+│   AgentLoop  ─── LLM tool-calling loop                   │
+│   TrainingMonitor ─── periodic health check & auto-fix   │
+│   MetricsPoller ─── incremental metrics reading          │
+│   LogsPoller ─── incremental log tailing                 │
+│   SkillsLoader ─── async plugin loading                  │
+└──────────────────────────────────────────────────────────┘
+```
+
+## Installation & Launch
+
+TUI is part of the `twinkle-client` package:
+
+```bash
+pip install twinkle-client
+```
+
+### Command-Line Usage
+
+```bash
+# Basic launch (uses default local Ollama endpoint)
+twinkle-tui
+
+# Specify LLM backend
+twinkle-tui --llm-base-url http://localhost:11434/v1 --llm-model qwen3.5
+
+# Attach to an existing training run
+twinkle-tui --run-id my-grpo-run
+
+# Use a remote API (e.g., OpenAI-compatible)
+twinkle-tui --llm-base-url https://api.example.com/v1 --llm-api-key sk-xxx --llm-model gpt-4o
+
+# Enable debug logging
+twinkle-tui --verbose
+```
+
+Or run as a Python module:
+
+```bash
+python -m twinkle_client.tui
+```
+
+### CLI Options
+
+| Option | Env Var | Default | Description |
+|--------|---------|---------|-------------|
+| `--run-id`, `-r` | `TWINKLE_TUI_RUN_ID` | None | Attach to an existing training run |
+| `--llm-base-url` | `TWINKLE_LLM_BASE_URL` | `http://localhost:11434/v1` | LLM API base URL |
+| `--llm-model` | `TWINKLE_LLM_MODEL` | `qwen3.5` | LLM model name |
+| `--llm-api-key` | `TWINKLE_LLM_API_KEY` | `not-needed` | LLM API key |
+| `--verbose`, `-v` | `TWINKLE_TUI_VERBOSE` | `False` | Enable DEBUG logging |
+| `--version`, `-V` | — | — | Show version and exit |
+
+### Keyboard Shortcuts
+
+| Key | Action |
+|-----|--------|
+| `q` | Quit |
+| `Ctrl+P` | Toggle metrics panel |
+| `Ctrl+L` | Clear logs |
+
+## Chat Agent
+
+The core of TUI is an **LLM-powered tool-calling agent** (`AgentLoop`) that processes natural language commands through an OpenAI-compatible API. The agent maintains conversation history with automatic pruning (last 50 messages) and supports up to 10 tool-calling rounds per interaction.
+
+### What You Can Say
+
+**Training lifecycle:**
+- *"List my training runs"*
+- *"Start a new GRPO training with Qwen3.5-4B on gsm8k"*
+- *"Pause the current run"*
+- *"Resume training"*
+- *"Stop training"*
+
+**Server management:**
+- *"Start the server with Qwen3.5-4B and a Qwen3.5-72B sampler on 2 GPUs"*
+- *"Shut down the server"*
+- *"How many GPUs are available?"*
+
+**Monitoring & analysis:**
+- *"How is the training going?"*
+- *"Show me the reward-related metrics"*
+- *"Zoom into steps 100-200"*
+- *"Reset the chart view"*
+
+**Search:**
+- *"Search for math datasets"*
+- *"Find Qwen models on ModelScope"*
+
+### Available Tools
+
+The agent has access to 16 built-in tools:
+
+| Tool | Description |
+|------|-------------|
+| `list_training_runs` | List all training runs |
+| `get_training_status` | Get detailed status and recent metrics |
+| `start_server` | Start Ray cluster + Twinkle Server (idempotent) |
+| `shutdown_server` | Shut down server and release GPU resources |
+| `start_training` | Create and launch a new training run |
+| `select_run` | Switch monitoring to a different run |
+| `pause_training` | Pause training (SIGKILL, server retains state) |
+| `resume_training` | Resume by re-launching the client script |
+| `stop_training` | Stop training (SIGTERM, saves checkpoint) |
+| `update_script` | Update training script with version archiving |
+| `list_supported_models` | Query server for available models |
+| `search_datasets` | Search ModelScope for datasets |
+| `search_models` | Search ModelScope for models |
+| `zoom_metrics` | Adjust metrics chart view range |
+| `select_metrics` | Choose which metrics to display (max 4) |
+| `get_cluster_info` | Get GPU/cluster resource info |
+
+### Server Startup
+
+The `start_server` tool automates a multi-step pipeline:
+
+1. **GPU detection** — `nvidia-smi` hardware scan
+2. **GPU allocation** — partition GPUs between training model and samplers
+3. **Config generation** — auto-create `server_config.yaml`
+4. **Ray cluster startup** — multi-node GPU partitioning with isolated `CUDA_VISIBLE_DEVICES`
+5. **Server launch** — start Twinkle Server as background process
+6. **Health check** — poll `/api/v1/healthz` until ready
+
+Multi-model topology is supported: 1 training model + N sampler/teacher models.
+
+### Skills System
+
+TUI supports extensible skill plugins loaded from three sources:
+
+1. **Bundled skills** — shipped inside `twinkle_client/skills/bundled/`
+2. **User-local skills** — `~/.cache/twinkle/tui/skills/local/`
+3. **Community skills** — fetched from ModelScope (best-effort, 10s timeout)
+
+Skills are loaded asynchronously after startup and injected into the agent's system prompt. The agent is usable immediately even before skills finish loading.
+
+## Training Monitor (Auto-Fix)
+
+The `TrainingMonitor` is a background service that runs every **30 seconds**, collecting all available signals about the current training run and feeding them to the LLM for analysis.
+
+### Collected Signals
+
+- **Process status**: alive / dead / unknown
+- **output.log tail**: last 1500 chars (prioritizes tracebacks)
+- **Metrics**: recent entries + first-half vs second-half trend analysis
+- **Stall duration**: seconds since last metric was produced
+- **Current train.py**: full script source (for accurate fixes)
+
+### Decision Framework
+
+The LLM classifies each check into one of three actions:
+
+| Decision | When | Action |
+|----------|------|--------|
+| **LGTM** | Training progressing normally | No action |
+| **WARNING** | Loss plateau, reward hacking, KL explosion, etc. | Relay observation to user |
+| **FIX** | Script crashed, process dead with traceback | Auto-fix and restart |
+
+### Auto-Fix Pipeline
+
+When a FIX is needed:
+
+1. LLM outputs diagnosis + complete fixed script
+2. Monitor archives the old `train.py` as `train_v{N}.py`
+3. Writes the fixed script as the new `train.py`
+4. Re-launches training via `resume_training`
+5. Resets stall tracking for the new attempt
+
+Safety guardrails:
+- Max **3 auto-fix attempts** per run (prevents infinite retry loops)
+- Fix attempts are tracked per `run_id`
+- Snapshot deduplication avoids re-analyzing unchanged states
+
+## File-Based Connection
+
+TUI communicates with training processes through the local filesystem:
+
+```
+~/.cache/twinkle/{run_id}/
+├── meta.json       — run metadata (model_id, config, status, pid)
+├── metrics.jsonl   — one JSON object per step (incremental)
+├── output.log      — combined stdout+stderr from training
+├── train.py        — current active training script
+└── train_v{N}.py   — archived previous script versions
+```
+
+### Training Control Model
+
+In Server Mode, the Twinkle Server retains all model/optimizer state in GPU memory:
+
+- **Pause** = kill client process (SIGKILL) — server state preserved
+- **Resume** = re-launch client script — seamlessly continues training
+- **Stop** = SIGTERM — triggers checkpoint saving then exits
+- **Shut down server** = releases GPU resources, **destroys** model state
+
+## TrainingRuntime (Script Integration)
+
+Training scripts use `TrainingRuntime` to integrate with TUI:
+
+```python
+from twinkle_client.tui.runtime import TrainingRuntime
+
+rt = TrainingRuntime(run_id='my-grpo-run')
+rt.start(model_id='Qwen/Qwen3.5-4B', config={'lr': 1e-5})
+rt.register_graceful_shutdown(model, dataloader)
+
+for step, batch in enumerate(dataloader):
+    # ... training logic ...
+    rt.log_metrics(step=step, loss=loss, reward=reward, grad_norm=gn, lr=lr)
+    rt.log(f'Completed step {step}, loss={loss:.4f}')
+
+rt.finish()
+```
+
+### Key Methods
+
+| Method | Description |
+|--------|-------------|
+| `start(model_id, config, script_path)` | Initialize run directory and metadata |
+| `log_metrics(**kwargs)` | Write metrics entry to `metrics.jsonl` |
+| `log(message)` | Print log message (captured in `output.log`) |
+| `get_resume_info()` | Get `last_step` for resuming from checkpoint |
+| `finish(status)` | Mark training as finished, close files |
+| `register_graceful_shutdown(model, dataloader)` | Register SIGTERM handler that saves checkpoint |
+
+### Resume Support
+
+`TrainingRuntime` automatically saves training progress to `meta.json` (throttled to at most once every 5 seconds). Scripts can use `get_resume_info()` to resume from the last saved step:
+
+```python
+rt = TrainingRuntime(run_id='my-run')
+resume = rt.get_resume_info()
+global_step = resume['last_step']
+
+if global_step > 0:
+    dataloader.skip_consumed_samples(global_step * BATCH_SIZE)
+    print(f'Resuming from step {global_step}')
+```
+
+### Graceful Shutdown
+
+When `register_graceful_shutdown()` is called, a SIGTERM handler is installed that:
+
+1. Saves model checkpoint (LoRA weights + optimizer state)
+2. Saves dataloader position (`consumed_train_samples`)
+3. Logs the checkpoint path
+4. Marks training as `stopped` and exits
+
+## UI Panels
+
+### StatusBar
+
+Displays current training state at the top of the screen:
+
+- Training state icon (🚀 Training / ⏸ Paused / ✅ Done / ❌ Error)
+- Run ID
+- Model name
+- Current step
+- Progress bar with percentage
+
+### MetricsPanel
+
+Real-time ASCII chart rendered with `plotext`:
+
+- Plots up to 4 metrics simultaneously
+- Supports zoom (by step range and y-axis range)
+- Auto-selects first 3 available metrics if no selection
+- Hint bar shows hidden metrics that can be switched via agent
+- Retains up to 2000 data points
+
+### LogPanel
+
+Scrolling log viewer:
+
+- Strips ANSI escape sequences for clean display
+- Hard-wraps long lines to prevent overflow
+- Handles `\r` carriage returns from progress bars
+- Retains last 500 lines
+
+### ChatPanel
+
+Interactive chat interface:
+
+- User input with streaming agent responses
+- Throttled token flushing (80ms) for smooth display
+- Stream reset on tool-call detection
+- Supports Rich markup formatting
+
+## Logging
+
+All TUI logs are written to `./tui.log` (current working directory):
+
+- Rotated at 5MB with 3 backups
+- **No console output** — avoids corrupting Textual's alt-screen buffer
+- Use `--verbose` for DEBUG level logging
diff --git a/docs/source_en/Components/TUI/SkillProvider.md b/docs/source_en/Components/TUI/SkillProvider.md
new file mode 100644
index 000000000..d008cf978
--- /dev/null
+++ b/docs/source_en/Components/TUI/SkillProvider.md
@@ -0,0 +1,71 @@
+# SkillProvider
+
+The skill system allows Twinkle's TUI agent to dynamically load specialized knowledge from external sources (Git repos, APIs, local files) and inject it into the LLM's system prompt.
+
+## Architecture
+
+| Class | Role |
+|-------|------|
+| **Skill** | Dataclass holding a single skill's name, content, and source |
+| **SkillProvider** | Abstract base class for fetching skills from a source |
+| **SkillManager** | Orchestrates multiple providers, aggregates skills for prompt injection |
+
+## Skill Dataclass
+
+```python
+@dataclasses.dataclass
+class Skill:
+    name: str       # Short identifier (typically filename without extension)
+    content: str    # Full markdown content
+    source: str     # Provider name + relative path for traceability
+```
+
+## Creating a Custom Provider
+
+Subclass `SkillProvider` and implement `name` and `fetch()`:
+
+```python
+from twinkle_client.skills.base import SkillProvider
+
+class MySkillProvider(SkillProvider):
+
+    @property
+    def name(self) -> str:
+        return 'my-skills'
+
+    async def fetch(self) -> None:
+        # Download/clone skill files to self.cache_dir
+        # e.g., git clone, API download, file copy
+        ...
+```
+
+The default `load_skills()` scans `self.cache_dir` for `.md` files (skipping README, LICENSE, etc.) and returns `Skill` objects.
+
+## SkillManager
+
+```python
+from twinkle_client.skills.manager import SkillManager
+
+manager = SkillManager()
+manager.register(my_provider)
+manager.register(another_provider)
+
+# Fetch and load all skills
+skills = await manager.load_all()
+
+# Format for LLM system prompt injection
+prompt_section = manager.format_for_prompt()
+```
+
+### Key Methods
+
+| Method | Description |
+|--------|-------------|
+| `register(provider)` | Add a skill provider |
+| `load_all()` | Fetch + load from all providers |
+| `format_for_prompt()` | Render skills as formatted text for system prompt |
+| `get_skill_names()` | List names of loaded skills |
+
+## Cache Directory
+
+By default, skills are cached at `~/.cache/twinkle/tui/skills/<provider_name>/`. Override by passing `cache_dir` to the provider constructor.
diff --git a/docs/source_en/Components/TUI/index.rst b/docs/source_en/Components/TUI/index.rst
new file mode 100644
index 000000000..29cdad073
--- /dev/null
+++ b/docs/source_en/Components/TUI/index.rst
@@ -0,0 +1,7 @@
+TUI
+===============
+.. toctree::
+   :maxdepth: 1
+
+   Auto-Research.md
+   SkillProvider.md
diff --git a/docs/source_en/Components/Task Processor/GRPOProcessor.md b/docs/source_en/Components/Task Processor/GRPOProcessor.md
deleted file mode 100644
index adff73c45..000000000
--- a/docs/source_en/Components/Task Processor/GRPOProcessor.md	
+++ /dev/null
@@ -1,19 +0,0 @@
-# GRPOLossProcessor
-
-GRPOLossProcessor is a task processor wrapper designed for GRPO reinforcement learning training. It extends InputProcessor with GRPO-specific data preparation.
-
-```python
-from twinkle.processor import GRPOLossProcessor
-
-processor = GRPOLossProcessor(
-    device_mesh=...,
-    padding_free=False,
-    framework='transformers',
-)
-
-model.set_processor(processor)
-```
-
-GRPOLossProcessor wraps the base `InputProcessor` and adds handling for GRPO-specific fields such as advantages, old log-probabilities, and reference log-probabilities that are required by the GRPO loss function.
-
-> For standard SFT tasks, use `InputProcessor` directly. Use `GRPOLossProcessor` when your training loop involves GRPO or its variants.
diff --git a/docs/source_en/Components/Task Processor/index.rst b/docs/source_en/Components/Task Processor/index.rst
index 1f20fdbca..1e9d600a4 100644
--- a/docs/source_en/Components/Task Processor/index.rst	
+++ b/docs/source_en/Components/Task Processor/index.rst	
@@ -4,4 +4,3 @@ Task Processor
    :maxdepth: 1
 
    InputProcessor.md
-   GRPOProcessor.md
diff --git a/docs/source_en/Components/Template/DeepSeekV4Template.md b/docs/source_en/Components/Template/DeepSeekV4Template.md
new file mode 100644
index 000000000..bbd74928e
--- /dev/null
+++ b/docs/source_en/Components/Template/DeepSeekV4Template.md
@@ -0,0 +1,56 @@
+# DeepSeek-V4 Template
+
+The `DeepseekV4Template` provides native support for DeepSeek V4's custom chat template encoding, including its unique thinking mode, tool-call protocol, and multi-token special tokens.
+
+## Usage
+
+```python
+from twinkle.template import DeepseekV4Template
+
+template = DeepseekV4Template(
+    model_id='deepseek-ai/DeepSeek-V4',
+    enable_thinking=True,
+)
+```
+
+## Features
+
+- **Custom tokenizer wrapper**: Overrides `apply_chat_template` with DeepSeek V4's encoding protocol
+- **Thinking mode**: Supports `thinking` / `chat` modes with configurable reasoning effort
+- **Tool calls**: Native DSML (DeepSeek Markup Language) tool-call encoding
+- **Multi-token EOS**: Handles DeepSeek V4's multi-character special tokens
+
+## Thinking Modes
+
+```python
+# Enable deep thinking (reasoning mode)
+template = DeepseekV4Template(model_id='...', enable_thinking=True)
+
+# Control reasoning effort
+# 'max' or 'high' enables extended reasoning budget
+template.encode(messages, reasoning_effort='max')
+```
+
+## Tool Call Support
+
+DeepSeek V4 uses its own DSML protocol for structured function calling:
+
+```python
+messages = [
+    {'role': 'user', 'content': 'What is the weather in Shanghai?'},
+]
+tools = [
+    {'type': 'function', 'function': {'name': 'get_weather', 'parameters': {...}}}
+]
+
+features = template.encode(messages, tools=tools)
+```
+
+## Key Differences from Base Template
+
+| Feature | Base Template | DeepseekV4Template |
+|:--------|:-------------|:-------------------|
+| Chat template | HuggingFace native | Custom DSML encoding |
+| Thinking | `<think>` tags | Native thinking mode toggle |
+| Tool calls | Hermes/Qwen format | DSML tool blocks |
+| EOS handling | Single token | Multi-token special markers |
diff --git a/docs/source_en/Components/Template/Template.md b/docs/source_en/Components/Template/Template.md
index 60962a33a..b9124412a 100644
--- a/docs/source_en/Components/Template/Template.md
+++ b/docs/source_en/Components/Template/Template.md
@@ -2,6 +2,64 @@
 
 The template is a key component for converting Trajectory to InputFeature.
 
+```python
+class Template:
+
+    def __init__(self,
+                 model_id: str,
+                 use_chat_template: bool = True,
+                 max_length: Optional[int] = 8192,
+                 truncation_strategy: Literal['raise', 'left', 'right', 'split', 'delete'] = 'raise',
+                 default_system: Optional[str] = None):
+        ...
+
+    def batch_encode(self, trajectories: Union[Dict[str, Any], List[Trajectory]]) -> List[InputFeature]:
+        # Batch encode samples
+        ...
+
+    def check(self, trajectory: Trajectory) -> Optional[Trajectory]:
+        # Encode one sample and return the original sample
+        # Generally used to check data reasonableness in RL algorithms like GRPO
+        ...
+
+    def batch_check(self, trajectories: List[Trajectory]) -> List[Optional[Trajectory]]:
+        # Batch check samples
+        ...
+
+    def decode(self, token_ids: List[int], **kwargs) -> str:
+        # Decode sample
+        ...
+
+    def batch_decode(self, token_ids: List[List[int]], **kwargs) -> List[str]:
+        # Batch decode samples
+        ...
+```
+
+- model_id: Model id containing tokenizer or processor
+- use_chat_template: Whether to use chat_template. If not used, it is generally a pre-training scenario
+- max_length: Maximum length of a single sample
+- truncation_strategy: How to handle the sample if it exceeds the maximum length
+  - raise: Throw an exception. Generally used for very precise dataset scenarios
+  - left: Remove tokens on the left to conform to max_length
+  - right: Remove tokens on the right to conform to max_length
+  - split: Split the oversized sample into multiple max_length chunks (not supported for multimodal, LazyDataset, or IterablePackingDataset)
+  - delete: Drop the entire sample if it exceeds max_length
+- default_system: The default system message to use when the dataset does not provide one
+
+> A Template cannot be replaced by a plain function, because it has to provide many methods internally. If you need to write a new Template, please inherit from the `Template` class.
+> Generally speaking, the Template base class is sufficient for pure-text models. The base class uses `tokenizer.apply_chat_template` to encode the messages, which works for general pure-text models.
+
+# Template mapping
+
+Currently, the model-template mapping is simple:
+
+- Template class: Supports all pure-text LLMs.
+- DeepseekV4Template class: For DeepSeek V4; rewrites the chat template encoding logic, with `encode_messages` built into twinkle.
+- Qwen3_5Template class: For Qwen3.5 MLLMs.
+# Template
+
+The template is a key component for converting Trajectory to InputFeature.
+
 ```python
 class Template:
 
diff --git a/docs/source_en/Components/Template/ToolCallParsers.md b/docs/source_en/Components/Template/ToolCallParsers.md
new file mode 100644
index 000000000..8d4e3f988
--- /dev/null
+++ b/docs/source_en/Components/Template/ToolCallParsers.md
@@ -0,0 +1,98 @@
+# Tool Call Parsers
+
+Twinkle's template system includes a modular tool-call parsing framework for training models with function calling capabilities.
+
+## Architecture
+
+```
+ToolCallRegistry
+├── HermesQwenParser  — Hermes/Qwen style <tool_call>...</tool_call>
+├── ReActParser       — ReAct Thought/Action/Observation
+├── ClineParser       — Cline XML-based tool calls
+└── VCPParser         — VCP protocol
+```
+
+## ToolCallParser Interface
+
+```python
+from twinkle.template.tools import ToolCallParser
+
+class ToolCallParser(ABC):
+    name: str = ''
+    open_marker: str | None = None
+    close_marker: str | None = None
+
+    def detect(self, text: str) -> bool:
+        """Check if text contains this format's markup."""
+        ...
+
+    def parse(self, text: str) -> List[Dict[str, Any]]:
+        """Extract tool calls in OpenAI format."""
+        ...
+
+    def clean(self, text: str) -> str:
+        """Strip markup, return plain content."""
+        ...
+```
+
+## ToolCallRegistry
+
+The registry auto-discovers parsers and routes detection:
+
+```python
+from twinkle.template.tools import ToolCallRegistry
+
+# Detect which format a completion uses
+parser = ToolCallRegistry.detect_first(completion_text)
+if parser:
+    tool_calls = parser.parse(completion_text)
+    clean_text = parser.clean(completion_text)
+```
+
+## Built-in Parsers
+
+### HermesQwenParser
+
+Parses Hermes/Qwen-style function calls:
+
+```xml
+<tool_call>
+{"name": "get_weather", "arguments": {"city": "Shanghai"}}
+</tool_call>
+```
+
+### ReActParser
+
+Parses ReAct-style reasoning traces:
+
+```
+Thought: I need to check the weather
+Action: get_weather
+Action Input: {"city": "Shanghai"}
+Observation: ...
+```
+
+### ClineParser
+
+Parses Cline XML-based tool invocations with structured parameters.
+
+### VCPParser
+
+Parses VCP (Visual Code Protocol) tool calls.
+
+## Usage in Training
+
+Tool call parsers integrate with the Template during preprocessing:
+
+```python
+from twinkle.template import Template
+
+template = Template(
+    model_id='ms://Qwen/Qwen3.5-4B',
+    enable_thinking=True,
+)
+
+# Template automatically uses ToolCallRegistry for
+# tool-call aware tokenization during encoding
+features = template.encode(messages, tools=tool_definitions)
+```
diff --git a/docs/source_en/Components/Template/index.rst b/docs/source_en/Components/Template/index.rst
index cd5fddb42..c3d125f04 100644
--- a/docs/source_en/Components/Template/index.rst
+++ b/docs/source_en/Components/Template/index.rst
@@ -4,3 +4,11 @@ Template
    :maxdepth: 1
 
    Template.md
+   DeepSeekV4Template.md
+   ToolCallParsers.md
+Template
+===============
+.. toctree::
+   :maxdepth: 1
+
+   Template.md
diff --git a/docs/source_en/Components/Training Middleware/DeviceMesh-and-DeviceGroup.md b/docs/source_en/Components/Training Middleware/DeviceMesh-and-DeviceGroup.md
index 169adb86e..b5f55a575 100644
--- a/docs/source_en/Components/Training Middleware/DeviceMesh-and-DeviceGroup.md	
+++ b/docs/source_en/Components/Training Middleware/DeviceMesh-and-DeviceGroup.md	
@@ -40,6 +40,24 @@ class DeviceMesh:
 
 It is recommended to use `from_sizes` to construct it.
 
+### Parameter Reference
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `world_size` | Total number of processes | 1 |
+| `dp_size` | Data parallel degree | 1 |
+| `fsdp_size` | Fully Sharded Data Parallel degree | None |
+| `tp_size` | Tensor parallel degree | None |
+| `pp_size` | Pipeline parallel degree | None |
+| `ulysses_size` | Ulysses sequence parallel degree | None |
+| `cp_size` | Context parallel degree | None |
+| `ep_size` | Expert parallel degree (for MoE models) | None |
+| `etp_size` | Expert tensor parallel degree | None |
+| `ep_fsdp_size` | FSDP degree within each EP group | None |
+| `vpp_size` | Virtual pipeline parallel degree | None |
+| `device_type` | Device type (`cuda`, `npu`, etc.) | `cuda` |
+| `sequence_parallel` | Enable Megatron-style sequence parallel | False |
+
 Let's give an example:
 
 ```python
diff --git a/docs/source_en/Components/Training Middleware/Expert-Parallel.md b/docs/source_en/Components/Training Middleware/Expert-Parallel.md
new file mode 100644
index 000000000..edd0ec03d
--- /dev/null
+++ b/docs/source_en/Components/Training Middleware/Expert-Parallel.md	
@@ -0,0 +1,74 @@
+# Expert Parallel (EP)
+
+Expert Parallel distributes Mixture-of-Experts (MoE) model experts across multiple GPUs, allowing each rank to hold a subset of experts. This reduces per-GPU memory and enables training of large MoE models.
+
+## Overview
+
+| Concept | Description |
+|---------|-------------|
+| **ExpertParallelConfig** | Configuration dataclass controlling EP behavior |
+| **apply_expert_parallel()** | Entry point that shards experts and patches forward |
+| **shard_experts()** | Evenly splits experts across EP ranks |
+| **patch_forward()** | Replaces MoE block forward with EP-aware all-to-all communication |
+
+## Configuration
+
+```python
+from twinkle.model.transformers.moe.expert_parallel import ExpertParallelConfig
+
+config = ExpertParallelConfig(
+    enabled=True,              # Enable expert parallel
+    router_dtype='fp32',       # Router computation dtype: 'fp32', 'bf16', 'fp16'
+    keep_router_logits=True,   # Return router logits alongside hidden states
+    ignore_shared_experts=False,  # Skip shared expert computation (e.g. DeepSeek)
+    ep_size=None,              # EP world size (consumed by TransformersModel)
+)
+```
+
+## Usage with DeviceMesh
+
+EP is activated by setting `ep_size` in `DeviceMesh.from_sizes()`. The framework automatically calls `apply_expert_parallel()` during model initialization.
+
+```python
+from twinkle.utils import DeviceMesh
+
+# 8 GPUs: 2-way EP × 4-way data parallel
+device_mesh = DeviceMesh.from_sizes(
+    world_size=8,
+    dp_size=4,
+    ep_size=2,
+)
+```
+
+For combined EP + FSDP sharding on the expert parameters:
+
+```python
+# 8 GPUs: 2-way EP with FSDP within each EP group
+device_mesh = DeviceMesh.from_sizes(
+    world_size=8,
+    dp_size=2,
+    ep_size=2,
+    ep_fsdp_size=2,
+)
+```
+
+## Communication Pattern
+
+The EP forward pass follows a 4-stage pipeline:
+
+1. **Preprocess** — compute per-expert token counts and split sizes
+2. **Token Pre-All2All** — permute tokens by expert assignment, then all-to-all exchange across EP ranks
+3. **Expert Compute** — each rank runs its local experts on received tokens
+4. **Token Post-All2All** — all-to-all exchange results back, unpermute and apply routing weights
+
+```
+Input tokens → Router → [preprocess] → [pre_all2all] → [local experts] → [post_all2all] → Output
+```
+
+## Requirements
+
+- `num_experts` must be divisible by `ep_size`
+- `torch.distributed` must be initialized
+- MoE blocks must define a `gate`/`router` module and `experts` (either `nn.ModuleList` or tensor-style `gate_up_proj`/`down_proj`)
+- Both ModuleList-style and tensor-style (fused) experts are supported
+- Shared experts (e.g. DeepSeek MoE) are handled automatically unless `ignore_shared_experts=True`
diff --git a/docs/source_en/Components/Training Middleware/Padding-Free.md b/docs/source_en/Components/Training Middleware/Padding-Free.md
new file mode 100644
index 000000000..44dd7ba14
--- /dev/null
+++ b/docs/source_en/Components/Training Middleware/Padding-Free.md	
@@ -0,0 +1,52 @@
+# Padding-Free Training
+
+Padding-free (also called "packing") training eliminates wasted computation on padding tokens by concatenating multiple sequences into a single packed batch. Twinkle supports padding-free training for both standard attention and Qwen3.5's GatedDeltaNet linear attention.
+
+## How It Works
+
+Instead of padding all sequences to `max_length`, padding-free packs multiple sequences into one row and uses `position_ids` to track sequence boundaries. This avoids wasted FLOPs on padding tokens.
+
+```
+Standard:   [tok tok tok PAD PAD PAD]  [tok tok PAD PAD PAD PAD]
+Packed:     [tok tok tok tok tok ...]   ← no padding waste
+```
+
+## Usage
+
+Padding-free is enabled via `PackingDataset` or `IterablePackingDataset`:
+
+```python
+from twinkle.dataset import PackingDataset
+
+dataset = PackingDataset(
+    dataset=base_dataset,
+    max_length=8192,
+)
+```
+
+The dataset automatically packs sequences and generates correct `position_ids` with resets at sequence boundaries.
+
+## GatedDeltaNet Patch (Qwen3.5)
+
+Qwen3.5 uses a hybrid architecture mixing standard attention with GatedDeltaNet linear attention. The native GatedDeltaNet implementation does not reset its linear-attention state at packed sequence boundaries.
+
+`GatedDeltaNetPaddingFreePatch` fixes this by:
+
+1. Patching `Qwen3_5DecoderLayer.forward` to pass `cu_seq_lens_q` (cumulative sequence lengths) to linear attention layers
+2. Patching `Qwen3_5GatedDeltaNet.forward` to use flash-linear-attention kernels (`causal_conv1d`, `chunk_gated_delta_rule`) with `cu_seqlens` support
+
+The patch is applied automatically when padding-free is detected on Qwen3.5 models.
+
+### Requirements
+
+- `flash-linear-attention` package must be installed
+- Only needed for Qwen3.5 models with GatedDeltaNet layers
+- When sequence parallel is enabled, a separate `Qwen3_5GatedDeltaNetUlyssesPatch` is used instead
+
+## Attention Backend Requirements
+
+| Attention Backend | Padding-Free Support |
+|-------------------|---------------------|
+| FlashAttention2 | Fully supported |
+| SDPA | Supported (but cannot be combined with sequence parallel) |
+| Eager | Not supported |
diff --git a/docs/source_en/Components/Training Middleware/Sequence-Parallel.md b/docs/source_en/Components/Training Middleware/Sequence-Parallel.md
new file mode 100644
index 000000000..d08b01d67
--- /dev/null
+++ b/docs/source_en/Components/Training Middleware/Sequence-Parallel.md	
@@ -0,0 +1,68 @@
+# Sequence Parallel (SP)
+
+Sequence Parallel splits long sequences across multiple GPUs along the sequence dimension, enabling training with sequence lengths that exceed single-GPU memory. Twinkle implements Ulysses-style sequence parallel with optional derived ring attention.
+
+## Overview
+
+| Concept | Description |
+|---------|-------------|
+| **SequenceParallelConfig** | Configuration dataclass for SP |
+| **SequenceParallelStrategy** | Strategy class that wraps SP lifecycle |
+| **SequenceParallel** | Core implementation handling pad/split/gather |
+
+## Configuration
+
+```python
+from twinkle.model.transformers.strategy.sequence_parallel import SequenceParallelConfig
+
+config = SequenceParallelConfig(
+    enabled=True,           # Enable sequence parallel
+    ulysses_size=None,      # Ulysses SP degree (auto-derived from DeviceMesh if None)
+    gather_logits=True,     # Gather logits after forward for loss computation
+)
+```
+
+## Usage with DeviceMesh
+
+SP is activated by setting `ulysses_size` in `DeviceMesh.from_sizes()`:
+
+```python
+from twinkle.utils import DeviceMesh
+
+# 8 GPUs: 4-way Ulysses SP × 2-way data parallel
+device_mesh = DeviceMesh.from_sizes(
+    world_size=8,
+    dp_size=2,
+    ulysses_size=4,
+)
+```
+
+## How It Works
+
+1. **Pad** — input sequences are padded to a length divisible by SP world size
+2. **Split** — padded inputs are evenly split across SP ranks along the sequence dimension
+3. **Distributed Attention** — FlashAttention2 is patched to perform Ulysses all-to-all communication before/after attention computation
+4. **Gather** — after forward, logits are gathered back to full sequence length for loss computation
+
+## Supported Attention Backends
+
+| Backend | Status |
+|---------|--------|
+| FlashAttention2 | Fully supported (including packed/padding-free sequences) |
+| SDPA | Supported (non-packed batches only) |
+| Derived Ring Attention | Supported with FlashAttention2 only (`rp_world_size > 1`) |
+
+## Qwen3.5 Linear Attention
+
+SP automatically detects Qwen3.5 GatedDeltaNet linear attention layers and applies the `Qwen3_5GatedDeltaNetUlyssesPatch` for correct sequence-parallel behavior on hybrid attention architectures.
+
+## MoE Auxiliary Loss
+
+For MoE models, SP automatically installs a forward hook that gathers router logits across SP ranks before auxiliary loss computation, ensuring correct load-balancing signals.
+
+## Key Constraints
+
+- `num_key_value_heads` must be divisible by `ulysses_size` for Ulysses attention; otherwise, use the derived ring attention fallback
+- Packed/padding-free batches require FlashAttention2
+- Derived ring attention requires `batch_size == 1` (packed format)
+- `torch.distributed` must be initialized
diff --git a/docs/source_en/Components/Training Middleware/TwinkleClient.md b/docs/source_en/Components/Training Middleware/TwinkleClient.md
new file mode 100644
index 000000000..18a1437db
--- /dev/null
+++ b/docs/source_en/Components/Training Middleware/TwinkleClient.md	
@@ -0,0 +1,81 @@
+# TwinkleClient
+
+`TwinkleClient` is the Python client for interacting with the Twinkle REST API. It manages sessions, training runs, and checkpoints.
+
+## Initialization
+
+```python
+from twinkle_client.manager import TwinkleClient
+
+client = TwinkleClient(
+    base_url='http://localhost:8000',   # Or TWINKLE_SERVER_URL env var
+    api_key='your-api-key',             # Or TWINKLE_SERVER_TOKEN env var
+    route_prefix='/twinkle',            # API route prefix
+    session_heartbeat_interval=10,      # Heartbeat interval in seconds
+    session_metadata={'user': 'alice'}, # Optional session metadata
+)
+```
+
+On init, the client:
+1. Sets `base_url` and `api_key` into shared context (used by all client objects)
+2. Creates a server-side session
+3. Starts a background heartbeat thread to keep the session alive
+
+## Health Check
+
+```python
+is_healthy = client.health_check()  # Returns True/False
+capabilities = client.get_server_capabilities()  # Supported models
+```
+
+## Training Runs
+
+```python
+# List runs
+runs = client.list_training_runs(limit=20, offset=0)
+
+# List with pagination cursor
+runs, cursor = client.list_training_runs_with_cursor(limit=20)
+
+# Get specific run
+run = client.get_training_run(run_id='run_abc123')
+
+# Find by base model
+qwen_runs = client.find_training_run_by_model('Qwen/Qwen3.5-4B')
+```
+
+## Checkpoints
+
+```python
+# List checkpoints for a run
+checkpoints = client.list_checkpoints(run_id='run_abc123')
+
+# Get checkpoint path
+parsed = client.get_checkpoint_path(run_id, checkpoint_id)
+# parsed.path         → filesystem path
+# parsed.twinkle_path → twinkle:// URI
+
+# Get latest checkpoint (useful for resume training)
+latest_path = client.get_latest_checkpoint_path(run_id)
+
+# Delete checkpoint
+client.delete_checkpoint(run_id, checkpoint_id)
+```
+
+## Capacity & Weights Info
+
+```python
+# LoRA capacity
+capacity = client.get_capacity_info()
+# capacity.max_loras, capacity.used_loras, capacity.free_loras
+
+# Weights metadata
+info = client.get_weights_info('twinkle://run_id/weights/checkpoint')
+# info.base_model, info.is_lora, info.lora_rank
+```
+
+## Cleanup
+
+```python
+client.close()  # Stops heartbeat thread (also registered via atexit)
+```
diff --git a/docs/source_en/Components/Training Middleware/index.rst b/docs/source_en/Components/Training Middleware/index.rst
index 014dfdc66..b2dc3acee 100644
--- a/docs/source_en/Components/Training Middleware/index.rst	
+++ b/docs/source_en/Components/Training Middleware/index.rst	
@@ -4,4 +4,8 @@ Training Middleware
    :maxdepth: 1
 
    DeviceMesh-and-DeviceGroup.md
+   Expert-Parallel.md
+   Sequence-Parallel.md
+   Padding-Free.md
    RemoteClass.md
+   TwinkleClient.md
diff --git a/docs/source_en/Usage Guide/Embedding-Training.md b/docs/source_en/Usage Guide/Embedding-Training.md
new file mode 100644
index 000000000..d86769c38
--- /dev/null
+++ b/docs/source_en/Usage Guide/Embedding-Training.md	
@@ -0,0 +1,120 @@
+# Embedding Training
+
+Twinkle supports contrastive embedding model training with InfoNCE loss, in-batch negatives, and cross-rank gathering. This guide demonstrates how to train embedding models using Twinkle.
+
+---
+
+## Overview
+
+Embedding training in Twinkle uses the following core components:
+
+| Component | Role |
+|:----------|:-----|
+| `InfonceLoss` | Contrastive loss with in-batch negatives |
+| `EmbeddingMetric` | Tracks pos/neg similarity and loss |
+| `TransformersModel` | Trainable embedding model (with LoRA or full) |
+| `InputProcessor` | Processes anchor/positive pairs into features |
+
+### Data Format
+
+Each training sample consists of an **(anchor, positive)** pair. In the embedding feature tensor:
+
+```
+embeddings: [anchor_0, positive_0, anchor_1, positive_1, ...]
+labels:     [       1,         0,        1,          0, ...]
+```
+
+- `labels=1` marks the start of a new group (anchor)
+- `labels=0` marks positives/negatives within the group
+
+---
+
+## Basic Embedding Training
+
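+A minimal embedding training script with DDP (dataset and dataloader construction is omitted; `dataloader` and `total_steps` must be defined before they are used):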
+
+```python
+import twinkle
+from twinkle import DeviceGroup, DeviceMesh, get_logger
+from twinkle.dataloader import DataLoader
+from twinkle.loss import InfonceLoss
+from twinkle.metric import EmbeddingMetric
+from twinkle.model import TransformersModel
+from twinkle.processor import InputProcessor
+from twinkle.template import Qwen3_5Template
+
+logger = get_logger()
+
+# --- Configuration ---
+MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
+MODEL_GPUS = 4
+BATCH_SIZE = 32
+LEARNING_RATE = 1e-5
+TEMPERATURE = 0.07
+EMB_MAX_LENGTH = 8192
+
+# --- Initialize ---
+device_groups = [
+    DeviceGroup(name='model', ranks=list(range(MODEL_GPUS)), device_type='GPU'),
+]
+model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS)
+twinkle.initialize(mode='ray', nproc_per_node=MODEL_GPUS, groups=device_groups)
+
+# --- Model ---
+model = TransformersModel(
+    model_id=MODEL_ID,
+    device_mesh=model_mesh,
+    remote_group='model',
+    ddp_config={'find_unused_parameters': True},
+)
+model.set_processor(InputProcessor)
+model.set_loss(InfonceLoss, temperature=TEMPERATURE, use_batch=True)
+model.set_optimizer(optimizer_cls='AdamW', lr=LEARNING_RATE)
+model.set_lr_scheduler(
+    scheduler_cls='CosineWarmupScheduler',
+    num_warmup_steps=200,
+    num_training_steps=total_steps,
+)
+model.add_metric(EmbeddingMetric, is_training=True)
+
+# --- Template ---
+template = Qwen3_5Template(
+    model_id=MODEL_ID,
+    max_length=EMB_MAX_LENGTH,
+    enable_thinking=False,
+)
+
+# --- Training Loop ---
+for step, batch in enumerate(dataloader):
+    # batch: list of features with anchor/positive pairs
+    model.forward_backward(inputs=batch, task='embedding')
+    model.clip_grad_and_step(gradient_accumulation_steps=1)
+
+    if step % 10 == 0:
+        metric = model.calculate_metric(is_training=True)
+        logger.info(f'Step {step}: {metric}')
+```
+
+### Key Parameters
+
+| Parameter | Recommended | Description |
+|:----------|:------------|:------------|
+| `temperature` | 0.05–0.1 | Lower = sharper contrast. 0.07 keeps gradients flowing until cosine > 0.75 |
+| `use_batch` | True | Enables cross-sample in-batch negatives for better efficiency |
+| `hard_negatives` | None or 7 | Fix negative count per sample; None uses all in-batch |
+| `find_unused_parameters` | True | Required for embedding models (only last hidden state contributes gradients) |
+
+---
+
+## Monitoring
+
+The `EmbeddingMetric` reports key training signals:
+
+| Metric | What it means |
+|:-------|:--------------|
+| `pos_sim` | Average anchor-positive cosine similarity (target: > 0.8) |
+| `neg_sim` | Average anchor-negative similarity (target: < 0.3) |
+| `loss` | InfoNCE loss value |
+| `grad_norm` | Gradient magnitude |
+
+Healthy training shows `pos_sim` rising and `neg_sim` stable or falling. If `pos_sim` saturates near 1.0 too early, lower the temperature.
diff --git a/docs/source_en/Usage Guide/Quick-Start.md b/docs/source_en/Usage Guide/Quick-Start.md
index 1c1a70fb6..707473910 100644
--- a/docs/source_en/Usage Guide/Quick-Start.md	
+++ b/docs/source_en/Usage Guide/Quick-Start.md	
@@ -156,6 +156,8 @@ if __name__ == '__main__':
 
 In this training code, we constructed a dataset and loaded the Qwen/Qwen3.5-4B model, used LoRA with the all-linear approach, and completed one training run. In the logs, you can observe the process of loss gradually converging.
 
+> **Tip — Full-Parameter Training**: The example above uses LoRA for efficiency. To switch to full-parameter training, simply remove the `add_adapter_to_model` call (and the `from peft import LoraConfig` import). Everything else stays the same.
+
 ### torchrun
 
 Twinkle supports running training in torchrun mode. In this scenario, Ray-related dependencies do not need to be installed.
@@ -471,7 +473,7 @@ python train.py
 
 A major feature of Twinkle is support for multi-tenant mixed training. Specifically, multiple users can use a single base model for LoRA training, which can greatly reduce server-side deployment costs.
 
-Checkpoint resumption is also supported in client-server training. The recommended flow is to call `model.resume_from_checkpoint(resume_path)` to restore weights and optimizer state, then call `dataloader.resume_from_checkpoint(progress['consumed_train_samples'])` to skip consumed data. See [Twinkle-Client](./Server%20and%20Client/Twinkle-Client.md) and [self_cognition.py](../../../cookbook/client/twinkle/self_host/self_cognition.py).
+Checkpoint resumption is also supported in client-server training. The recommended flow is to call `model.resume_from_checkpoint(resume_path)` to restore weights and optimizer state, then call `dataloader.resume_from_checkpoint(progress['consumed_train_samples'])` to skip consumed data. See [Twinkle-Client](./Server%20and%20Client/Twinkle-Client.md) and [self_cognition.py](../../../cookbook/server_mode/twinkle/self_host/self_cognition.py).
 
 Suppose we start a service using eight GPUs. First, we need to start the Ray cluster:
 
@@ -493,6 +495,8 @@ Next, start the server:
 twinkle-server launch -c cookbook/client/server/transformer/server_config.yaml
 ```
 
+> For details on how to write `server_config.yaml`, see [Server Configuration](./Server%20and%20Client/Server.md).
+
 The server will start three services: a sampler cluster, a model cluster, and a utility cluster.
 
 Now you can perform client-side training:
diff --git a/docs/source_en/index.rst b/docs/source_en/index.rst
index ef477f7fc..6128079c8 100644
--- a/docs/source_en/index.rst
+++ b/docs/source_en/index.rst
@@ -15,6 +15,7 @@ Twinkle DOCUMENTATION
    Usage Guide/NPU-Support.md
    Usage Guide/Train-as-a-Service.md
    Usage Guide/Introduction-with-Qwen3.5.md
+   Usage Guide/Embedding-Training.md
 
 .. toctree::
    :maxdepth: 2
@@ -30,7 +31,6 @@ Twinkle DOCUMENTATION
    Components/Sampler/index.rst
    Components/Reward/index.rst
    Components/Advantage/index.rst
-   Components/Gym/index.rst
    Components/Hub/index.rst
    Components/Checkpoint Engine/index.rst
    Components/Metrics/index.rst
@@ -41,6 +41,10 @@ Twinkle DOCUMENTATION
    Components/Plugin/index.rst
    Components/Kernel/index.rst
    Components/Training Middleware/index.rst
+   Components/CLI/index.rst
+   Components/Notifier/index.rst
+   Components/Agentic/index.rst
+   Components/TUI/index.rst
 
 Indices and tables
 ==================
diff --git a/docs/source_zh/index.rst b/docs/source_zh/index.rst
index 3d07d4b2a..6a7be7b5b 100644
--- a/docs/source_zh/index.rst
+++ b/docs/source_zh/index.rst
@@ -15,6 +15,7 @@ Twinkle DOCUMENTATION
    使用指引/NPU的支持.md
    使用指引/训练服务.md
    使用指引/Qwen3.5最佳实践.md
+   使用指引/Embedding训练.md
 
 .. toctree::
    :maxdepth: 2
@@ -30,7 +31,6 @@ Twinkle DOCUMENTATION
    组件/采样器/index.rst
    组件/奖励/index.rst
    组件/优势/index.rst
-   组件/Gym/index.rst
    组件/Hub/index.rst
    组件/检查点引擎/index.rst
    组件/指标/index.rst
@@ -41,6 +41,10 @@ Twinkle DOCUMENTATION
    组件/组件化/index.rst
    组件/Kernel/index.rst
    组件/训练中间件/index.rst
+   组件/CLI/index.rst
+   组件/通知器/index.rst
+   组件/Agentic/index.rst
+   组件/TUI/index.rst
 
 Indices and tables
 ==================
diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/Embedding\350\256\255\347\273\203.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/Embedding\350\256\255\347\273\203.md"
new file mode 100644
index 000000000..94ea86ebe
--- /dev/null
+++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/Embedding\350\256\255\347\273\203.md"
@@ -0,0 +1,120 @@
+# Embedding 模型训练
+
+Twinkle 支持基于 InfoNCE 损失的对比学习 Embedding 模型训练，内置 in-batch negatives 和跨 rank 聚合。本文介绍如何使用 Twinkle 训练 Embedding 模型。
+
+---
+
+## 概述
+
+Embedding 训练使用以下核心组件：
+
+| 组件 | 职责 |
+|:-----|:-----|
+| `InfonceLoss` | 对比损失，支持 in-batch negatives |
+| `EmbeddingMetric` | 追踪正/负对相似度和损失 |
+| `TransformersModel` | 可训练的 Embedding 模型（LoRA 或全参） |
+| `InputProcessor` | 将 anchor/positive 对处理为特征 |
+
+### 数据格式
+
+每个训练样本由 **(anchor, positive)** 对组成。在 Embedding 特征张量中：
+
+```
+embeddings: [anchor_0, positive_0, anchor_1, positive_1, ...]
+labels:     [       1,         0,        1,          0, ...]
+```
+
+- `labels=1` 标记新分组的起始位置（anchor）
+- `labels=0` 标记组内的 positive/negative
+
+---
+
+## 基础 Embedding 训练
+
+使用 DDP 的最小化 Embedding 训练脚本（省略了数据集与 dataloader 的构建；`dataloader` 和 `total_steps` 需在使用前自行定义）：
+
+```python
+import twinkle
+from twinkle import DeviceGroup, DeviceMesh, get_logger
+from twinkle.dataloader import DataLoader
+from twinkle.loss import InfonceLoss
+from twinkle.metric import EmbeddingMetric
+from twinkle.model import TransformersModel
+from twinkle.processor import InputProcessor
+from twinkle.template import Qwen3_5Template
+
+logger = get_logger()
+
+# --- 配置 ---
+MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
+MODEL_GPUS = 4
+BATCH_SIZE = 32
+LEARNING_RATE = 1e-5
+TEMPERATURE = 0.07
+EMB_MAX_LENGTH = 8192
+
+# --- 初始化 ---
+device_groups = [
+    DeviceGroup(name='model', ranks=list(range(MODEL_GPUS)), device_type='GPU'),
+]
+model_mesh = DeviceMesh.from_sizes(world_size=MODEL_GPUS, dp_size=MODEL_GPUS)
+twinkle.initialize(mode='ray', nproc_per_node=MODEL_GPUS, groups=device_groups)
+
+# --- 模型 ---
+model = TransformersModel(
+    model_id=MODEL_ID,
+    device_mesh=model_mesh,
+    remote_group='model',
+    ddp_config={'find_unused_parameters': True},
+)
+model.set_processor(InputProcessor)
+model.set_loss(InfonceLoss, temperature=TEMPERATURE, use_batch=True)
+model.set_optimizer(optimizer_cls='AdamW', lr=LEARNING_RATE)
+model.set_lr_scheduler(
+    scheduler_cls='CosineWarmupScheduler',
+    num_warmup_steps=200,
+    num_training_steps=total_steps,
+)
+model.add_metric(EmbeddingMetric, is_training=True)
+
+# --- 模板 ---
+template = Qwen3_5Template(
+    model_id=MODEL_ID,
+    max_length=EMB_MAX_LENGTH,
+    enable_thinking=False,
+)
+
+# --- 训练循环 ---
+for step, batch in enumerate(dataloader):
+    # batch: 包含 anchor/positive 对的特征列表
+    model.forward_backward(inputs=batch, task='embedding')
+    model.clip_grad_and_step(gradient_accumulation_steps=1)
+
+    if step % 10 == 0:
+        metric = model.calculate_metric(is_training=True)
+        logger.info(f'Step {step}: {metric}')
+```
+
+### 关键参数
+
+| 参数 | 推荐值 | 说明 |
+|:----|:------|:-----|
+| `temperature` | 0.05–0.1 | 越低对比越尖锐；0.07 保持梯度流动直至 cosine > 0.75 |
+| `use_batch` | True | 启用跨样本 in-batch negatives 提升效率 |
+| `hard_negatives` | None 或 7 | 固定每样本负例数量；None 使用全部 in-batch |
+| `find_unused_parameters` | True | Embedding 模型必需（仅最后隐藏状态产生梯度） |
+
+---
+
+## 监控指标
+
+`EmbeddingMetric` 报告关键训练信号：
+
+| 指标 | 含义 |
+|:----|:-----|
+| `pos_sim` | anchor-positive 平均余弦相似度（目标 > 0.8） |
+| `neg_sim` | anchor-negative 平均相似度（目标 < 0.3） |
+| `loss` | InfoNCE 损失值 |
+| `grad_norm` | 梯度范数 |
+
+健康的训练表现为 `pos_sim` 持续上升、`neg_sim` 稳定或下降。如果 `pos_sim` 过早饱和至 1.0 附近，应降低 temperature。
diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/NPU\347\232\204\346\224\257\346\214\201.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/NPU\347\232\204\346\224\257\346\214\201.md"
index 39f6fe182..feaf7f4e8 100644
--- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/NPU\347\232\204\346\224\257\346\214\201.md"
+++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/NPU\347\232\204\346\224\257\346\214\201.md"
@@ -257,6 +257,19 @@ pip install torch_npu-2.7.1-cp311-cp311-linux_aarch64.whl
 2. “待验证”功能可以尝试，但可能遇到兼容性问题
 3. 遇到问题时，参考对应的示例代码进行配置
 
+## 示例代码
+
+Twinkle 在 NPU 上已验证的示例目前聚焦 Megatron smoke 路径；SFT 和 GRPO cookbook 示例暂无对应文件。
+
+### 远程训练（Tinker 协议）
+- **服务端配置**：[cookbook/remote/tinker/ascend/](https://github.com/modelscope/twinkle/tree/main/cookbook/remote/tinker/ascend)
+  - 提供 HTTP API 接口
+  - 支持远程训练和推理
+  - 适用于生产环境部署
+
+**运行示例**：
+暂无对应命令示例。
+
 
 ## 参考资源
 
diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md"
index 19189948f..3bc5c4ba4 100644
--- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md"
+++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md"
@@ -157,6 +157,8 @@ if __name__ == '__main__':
 
 在这个训练代码中，我们构造了一个数据集并拉起了Qwen/Qwen3.5-4B模型，使用all-linear方式加载了lora，并完成了一次训练。在日志中，可以看到loss逐步收敛的过程。
 
+> **提示 — 全参数训练**：上面的示例使用 LoRA 以提高效率。若要切换为全参数训练，只需移除 `add_adapter_to_model` 调用（以及 `from peft import LoraConfig` 导入），其余代码完全不变。
+
 ### torchrun
 
 Twinkle 支持以 torchrun 模式运行训练。在这种场景下，不需要安装 Ray 相关的依赖。
@@ -470,7 +472,7 @@ python train.py
 ```
 
 ### 远程训练
-client-server 训练场景同样支持断点续训。推荐流程是调用 `model.resume_from_checkpoint(resume_path)` 恢复权重和优化器状态，再调用 `dataloader.resume_from_checkpoint(progress['consumed_train_samples'])` 跳过已消费数据。详细示例可参考 [Twinkle客户端](./服务端和客户端/Twinkle客户端.md) 和 [self_cognition.py](../../../cookbook/client/twinkle/self_host/self_cognition.py)。
+client-server 训练场景同样支持断点续训。推荐流程是调用 `model.resume_from_checkpoint(resume_path)` 恢复权重和优化器状态，再调用 `dataloader.resume_from_checkpoint(progress['consumed_train_samples'])` 跳过已消费数据。详细示例可参考 [Twinkle客户端](./服务端和客户端/Twinkle客户端.md) 和 [self_cognition.py](../../../cookbook/server_mode/twinkle/self_host/self_cognition.py)。
 
 Twinkle 的一大特色是支持多租户用户混合训练。具体来说，多个用户可以使用一个基模进行 LoRA 训练，这样可以极大减小服务端部署成本。
 
@@ -494,6 +496,8 @@ CUDA_VISIBLE_DEVICES="" ray start --address=127.0.0.1:6379 --num-gpus=0
 twinkle-server launch -c cookbook/client/server/transformer/server_config.yaml
 ```
 
+> `server_config.yaml` 的编写方式详见 [服务端配置](./服务端和客户端/服务端.md)。
+
 服务端会启动一个包含 Sampler 集群、模型集群、工具集群的三个服务。
 
 下面可以进行client端训练：
diff --git "a/docs/source_zh/\347\273\204\344\273\266/Agentic/Envs.md" "b/docs/source_zh/\347\273\204\344\273\266/Agentic/Envs.md"
new file mode 100644
index 000000000..675f05baf
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/Agentic/Envs.md"
@@ -0,0 +1,183 @@
+# 执行环境（Envs）
+
+Envs 模块提供了用于 Agentic 训练的 RL 执行环境抽象。环境可以在多轮 rollout 中交互式参与，也可以批量评估已完成的轨迹。
+
+## Env 基类
+
+```python
+from twinkle_agentic.envs.base import Env, StepResult
+
+class Env(ABC):
+
+    def reset(self, trajectory=None) -> StepResult:
+        """重置环境，开始新一轮。"""
+
+    @abstractmethod
+    def step(self, tool_name: str, arguments: dict) -> StepResult:
+        """执行单个动作，返回观测 + 奖励 + 完成标志。"""
+
+    def tools(self) -> List[ToolInfo]:
+        """返回此环境中可用的工具定义。"""
+
+    def evaluate(self, trajectories, **kwargs) -> List[float]:
+        """批量评估已完成的轨迹，返回奖励列表。"""
+
+    def close(self) -> None:
+        """释放资源。"""
+```
+
+### StepResult
+
+```python
+@dataclass
+class StepResult:
+    observation: str = ''    # 动作执行后的环境观测
+    reward: float = 0.0      # 此步骤的标量奖励
+    done: bool = False       # 是否终止
+    info: Dict[str, Any] = field(default_factory=dict)  # 额外元数据
+```
+
+### 两种使用模式
+
+1. **交互模式**（多轮 rollout）—— 逐步执行：
+
+```python
+env = MyEnv()
+env.reset(trajectory)
+result = env.step('search', {'query': 'Python'})
+# ... 重复直到 result.done
+```
+
+2. **批量评估模式** —— 评估已完成的轨迹：
+
+```python
+rewards = env.evaluate(completed_trajectories)
+```
+
+## EnvTool
+
+`EnvTool` 将 `Env` 包装为 `Tool`，连接环境与 `ToolManager` 和 `MultiTurnRollout`。
+
+```python
+from twinkle_agentic.envs.env_tool import EnvTool
+from twinkle_agentic.tools.tool_manager import ToolManager
+
+env = MyEnv()
+
+# 为环境中定义的每个工具创建一个 EnvTool
+env_tools = EnvTool.from_env(env)
+
+# 注册到 ToolManager
+manager = ToolManager(env_tools)
+```
+
+### 核心特性
+
+| 特性 | 说明 |
+|------|------|
+| `from_env(env)` | 工厂方法：为 `env.tools()` 中的每个工具创建一个 `EnvTool`。 |
+| `last_result` | 存储最近一次 `StepResult` 供调用方检查。 |
+| `done` | 属性：最后一步是否终止了回合。 |
+| `episode_reward` | 属性：来自 `info['episode_reward']` 的累计奖励。 |
+
+### 手动构造
+
+```python
+env_tool = EnvTool(
+    env=my_env,
+    tool_name='execute_code',
+    description='在沙箱中执行 Python 代码。',
+    parameters={
+        'type': 'object',
+        'properties': {
+            'code': {'type': 'string', 'description': '要执行的 Python 代码。'},
+        },
+        'required': ['code'],
+    },
+)
+```
+
+## OpenEnv
+
+`OpenEnv` 将基于 WebSocket 的 [OpenEnv](https://github.com/meta-pytorch/OpenEnv) 环境服务器适配为同步的 Twinkle `Env`。
+
+```python
+from twinkle_agentic.envs.openenv import OpenEnv
+
+env = OpenEnv(
+    base_url='http://localhost:8000',
+    env_cls='coding_env.CodingEnv',      # 可选的类型化客户端
+    env_kwargs={'message_timeout_s': 30},
+    tool_schema=[...],                    # 可选的工具定义
+)
+```
+
+### 参数
+
+| 参数 | 类型 | 说明 |
+|------|------|------|
+| `base_url` | `str` | 运行中的 OpenEnv 服务器 URL。 |
+| `env_cls` | `str` 或 class | 类型化客户端的点分导入路径或类。`None` 使用 `GenericEnvClient`。 |
+| `env_kwargs` | `Dict` | 传递给客户端构造函数的额外参数。 |
+| `tool_schema` | `List[ToolInfo]` | 通过 `tools()` 暴露的工具定义。 |
+| `action_mapper` | `Callable` | 自定义函数，将 `(tool_name, args)` 映射为发送给服务器的动作字典。 |
+
+### 与 Rollout 集成使用
+
+```python
+from twinkle_agentic.envs.openenv import OpenEnv
+from twinkle_agentic.envs.env_tool import EnvTool
+from twinkle_agentic.tools.tool_manager import ToolManager
+from twinkle_agentic.rollout.api_multi_turn import APIMultiTurnRollout
+
+# 设置环境
+env = OpenEnv(base_url='http://localhost:8000', tool_schema=[...])
+env.reset()
+
+# 桥接到 ToolManager
+env_tools = EnvTool.from_env(env)
+manager = ToolManager(env_tools)
+
+# 在 rollout 中使用
+rollout = APIMultiTurnRollout(api=api, tool_manager=manager, max_turns=10)
+results = rollout(trajectories)
+```
+
+### 实现自定义环境
+
+```python
+from twinkle_agentic.envs.base import Env, StepResult
+
+class CodeExecutionEnv(Env):
+
+    def reset(self, trajectory=None):
+        self._sandbox = create_sandbox()
+        return StepResult(observation='沙箱已就绪。')
+
+    def step(self, tool_name, arguments):
+        code = arguments.get('code', '')
+        output = self._sandbox.run(code)
+        return StepResult(
+            observation=output,
+            reward=1.0 if 'error' not in output.lower() else 0.0,
+            done=False,
+        )
+
+    def tools(self):
+        return [{
+            'type': 'function',
+            'function': {
+                'name': 'execute_code',
+                'description': '运行 Python 代码。',
+                'parameters': {
+                    'type': 'object',
+                    'properties': {
+                        'code': {'type': 'string'},
+                    },
+                },
+            },
+        }]
+
+    def close(self):
+        self._sandbox.cleanup()
+```
diff --git "a/docs/source_zh/\347\273\204\344\273\266/Agentic/Multi-Turn-Tool-Usage.md" "b/docs/source_zh/\347\273\204\344\273\266/Agentic/Multi-Turn-Tool-Usage.md"
new file mode 100644
index 000000000..1c13bfbe0
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/Agentic/Multi-Turn-Tool-Usage.md"
@@ -0,0 +1,205 @@
+# 多轮工具使用指南
+
+本指南介绍如何在 Twinkle 中设置和运行带工具调用的多轮 Agentic rollout。
+
+## 架构概览
+
+Agentic rollout 管线由四个核心组件组成：
+
+- **Tool** —— 实现特定能力（搜索、代码执行等）
+- **ToolManager** —— 注册工具并分发 LLM 工具调用
+- **Env**（可选）—— RL 环境，通过 `EnvTool` 暴露工具
+- **Rollout** —— 驱动多轮对话循环
+
+## 快速开始：基于 API 的 Rollout
+
+使用 OpenAI 兼容 API 运行多轮工具使用 rollout 的最简方式：
+
+```python
+from twinkle_agentic.protocol.openai import OpenAI
+from twinkle_agentic.tools.base import Tool
+from twinkle_agentic.tools.tool_manager import ToolManager
+from twinkle_agentic.rollout.api_multi_turn import APIMultiTurnRollout
+from twinkle.data_format.sampling import SamplingParams
+
+# 1. 定义工具
+class WeatherTool(Tool):
+    def __call__(self, tool_name, arguments):
+        city = arguments.get('city', '未知')
+        return f'{city}的天气：晴，25°C。'
+
+    def tool_info(self):
+        return {
+            'type': 'function',
+            'function': {
+                'name': 'get_weather',
+                'description': '获取城市的当前天气。',
+                'parameters': {
+                    'type': 'object',
+                    'properties': {
+                        'city': {'type': 'string', 'description': '城市名称。'},
+                    },
+                    'required': ['city'],
+                },
+            },
+        }
+
+# 2. 设置 ToolManager
+manager = ToolManager([WeatherTool()])
+
+# 3. 创建 API 客户端
+api = OpenAI(model='qwen3.5-32b', base_url='http://localhost:8000/v1')
+
+# 4. 创建 rollout
+rollout = APIMultiTurnRollout(
+    api=api,
+    tool_manager=manager,
+    sampling_params=SamplingParams(temperature=0.7, max_tokens=2048),
+    max_turns=6,
+    concurrency=8,
+)
+
+# 5. 准备轨迹
+trajectories = [
+    {
+        'messages': [
+            {'role': 'user', 'content': '北京今天天气怎么样？'},
+        ],
+    },
+]
+
+# 6. 运行 rollout
+results = rollout(trajectories)
+for r in results:
+    print(f"轮次: {r['turns']}, 停止原因: {r['stop_reason']}")
+    for msg in r['messages']:
+        print(f"  [{msg['role']}] {(msg.get('content') or '')[:100]}")
+```
+
+## 训练集成：基于 vLLM 的 Rollout
+
+用于 RLHF 训练时，使用 `MultiTurnRollout`，它会生成 `input_ids` 和 `labels`：
+
+```python
+from twinkle_agentic.rollout.multi_turn import MultiTurnRollout
+from twinkle.data_format.sampling import SamplingParams
+
+rollout = MultiTurnRollout(
+    sampler=vllm_sampler,           # vLLMSampler 实例
+    template=template,               # 聊天模板
+    tool_manager=manager,
+    sampling_params=SamplingParams(temperature=0.7, max_tokens=4096),
+    max_turns=6,
+    max_trajectory_tokens=8192,
+    trace_dir='rollout_traces/',
+)
+
+# 在 GRPO 训练循环中
+results = rollout(batch_trajectories)
+# results 包含 input_ids、labels、logprobs 用于训练
+```
+
+## 将环境用作工具
+
+将 RL 环境桥接到工具管线中：
+
+```python
+from twinkle_agentic.envs.base import Env, StepResult
+from twinkle_agentic.envs.env_tool import EnvTool
+from twinkle_agentic.tools.tool_manager import ToolManager
+
+# 定义环境
+class CodeEnv(Env):
+    def step(self, tool_name, arguments):
+        code = arguments.get('code', '')
+        # 在沙箱中执行代码
+        result = execute_in_sandbox(code)
+        return StepResult(observation=result, reward=1.0, done=False)
+
+    def tools(self):
+        return [{
+            'type': 'function',
+            'function': {
+                'name': 'run_python',
+                'description': '执行 Python 代码。',
+                'parameters': {
+                    'type': 'object',
+                    'properties': {
+                        'code': {'type': 'string'},
+                    },
+                    'required': ['code'],
+                },
+            },
+        }]
+
+# 桥接 Env -> Tool -> ToolManager
+env = CodeEnv()
+env_tools = EnvTool.from_env(env)
+manager = ToolManager(env_tools)
+
+# 照常在 rollout 中使用 manager
+rollout = APIMultiTurnRollout(api=api, tool_manager=manager, max_turns=10)
+```
+
+## 使用 OpenEnv 环境
+
+连接远程 OpenEnv WebSocket 服务器：
+
+```python
+from twinkle_agentic.envs.openenv import OpenEnv
+from twinkle_agentic.envs.env_tool import EnvTool
+
+env = OpenEnv(
+    base_url='http://localhost:8000',
+    env_cls='coding_env.CodingEnv',
+    tool_schema=[{
+        'type': 'function',
+        'function': {
+            'name': 'submit',
+            'description': '提交代码解决方案。',
+            'parameters': {
+                'type': 'object',
+                'properties': {
+                    'code': {'type': 'string'},
+                },
+            },
+        },
+    }],
+)
+
+env.reset()
+env_tools = EnvTool.from_env(env)
+manager = ToolManager(env_tools)
+```
+
+## 每轨迹独立 ToolManager
+
+当每个轨迹需要独立工具集时（例如，轨迹绑定的状态）：
+
+```python
+# 创建每轨迹的 manager
+managers = []
+for traj in trajectories:
+    env = create_env_for(traj)
+    env_tools = EnvTool.from_env(env)
+    managers.append(ToolManager(env_tools))
+
+# 传入列表（与轨迹 1:1 对齐）
+results = rollout(trajectories, tool_manager=managers)
+```
+
+## 跟踪调试
+
+两种 rollout 实现都支持跟踪文件输出用于调试：
+
+```python
+rollout = APIMultiTurnRollout(
+    api=api,
+    tool_manager=manager,
+    trace_dir='traces/',
+    trace_callback=lambda t: t['turns'] > 1,    # 仅存储多轮对话
+    success_callback=lambda t: t.get('stop_reason') == 'stop',
+)
+```
+
+跟踪文件以 `{step}-{ok|fail}-{id}.json` 格式保存，包含完整对话和元数据。
diff --git "a/docs/source_zh/\347\273\204\344\273\266/Agentic/Preprocessor.md" "b/docs/source_zh/\347\273\204\344\273\266/Agentic/Preprocessor.md"
new file mode 100644
index 000000000..a5730abc8
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/Agentic/Preprocessor.md"
@@ -0,0 +1,189 @@
+# Agentic 预处理器
+
+Agentic 预处理器模块提供了基于流水线的多轮对话数据质量过滤框架，用于 RLHF / Agentic 微调之前的训练数据清洗和过滤。
+
+## QualityPreprocessor
+
+`QualityPreprocessor` 是一个轻量级流水线运行器，接受过滤器列表并按顺序执行。每个步骤接收行列表，返回 `(kept, dropped)`，流水线会记录每步统计信息。
+
+```python
+from twinkle_agentic.preprocessor import QualityPreprocessor, HardFilter, DeadLoopFilter
+
+pipeline = [
+    HardFilter(min_user_chars=10),
+    DeadLoopFilter(),
+]
+preprocessor = QualityPreprocessor(pipeline, dropped_log_path='dropped.jsonl')
+
+# rows 是列格式的字典（Dataset.map 格式）
+cleaned = preprocessor(rows)
+```
+
+### 参数
+
+| 参数 | 类型 | 说明 |
+|------|------|------|
+| `pipeline` | `List[Callable]` | 有序的过滤步骤列表。每个步骤接收 `List[Dict]`，返回 `(kept, dropped)`。 |
+| `dropped_log_path` | `str` | 可选的 JSONL 文件路径，用于记录被丢弃的行及步骤名称和原因。 |
+
+## 内置过滤器
+
+### HardFilter
+
+基于硬规则的过滤器，使用确定性规则移除质量差的行。支持多语言检测（EN/ZH/JA/KO）。
+
+```python
+from twinkle_agentic.preprocessor import HardFilter
+
+f = HardFilter(
+    min_user_chars=10,           # 非 CJK 用户查询最小字符数
+    min_user_chars_cjk=6,        # CJK 用户查询最小字符数
+    min_assistant_chars_2turn=80, # 两轮对话中助手回复最小长度
+    min_thinking_chars=200,      # 思考链最小长度（可豁免过滤）
+    system_deny_keywords=['hack', 'exploit'],
+    max_chars_per_round=50000,
+    max_total_chars=200000,
+    max_rounds=50,
+)
+```
+
+**丢弃原因：** `trivial_single_turn`（平凡单轮）、`shallow_reply`（浅回复）、`all_empty_assistant`（全空助手）、`system_deny_keyword`（系统拒绝关键词）、`round_too_long`（单轮过长）、`total_too_long`（总长过长）、`too_many_rounds`（轮次过多）
+
+### DeadLoopFilter
+
+检测助手消息中的犹豫/死循环模式——重复自我纠正、级联纠正和高 n-gram 重复。
+
+```python
+from twinkle_agentic.preprocessor import DeadLoopFilter
+
+f = DeadLoopFilter(
+    hesitation_density_threshold=7.0,   # 每 1000 字符犹豫标记数（响应）
+    cascade_threshold=5,                 # 窗口内级联标记数
+    cascade_window=800,                  # 窗口大小（字符）
+    repetition_threshold=0.45,           # N-gram 重复率
+    think_hesitation_density_threshold=15.0,  # <think> 块更宽松
+    think_repetition_threshold=0.65,
+)
+```
+
+对 `<think>` 推理块使用更宽松的阈值（允许自由发散），对可见响应使用更严格的阈值。
+
+### DedupFilter
+
+全局最长优先去重。签名由第一个真实用户轮次的首尾片段和第一条助手回复推导。
+
+```python
+from twinkle_agentic.preprocessor import DedupFilter
+
+f = DedupFilter(prefix_chars=100, asst_chars=100)
+kept, dropped = f(all_rows)  # 必须在一次调用中传入整个数据集
+```
+
+> **注意：** `DedupFilter` 需要在单次调用中接收完整数据集。**不要**将它放入 `QualityPreprocessor` 中（后者按批处理）。请在流水线之前或之后单独运行。
+
+### RefuseFilter
+
+检测第一条助手回复中的自我引用式拒绝（如"我无法帮助您"）。多语言模式匹配（EN/ZH/JA/KO）。
+
+```python
+from twinkle_agentic.preprocessor import RefuseFilter
+
+f = RefuseFilter(check_window=600)  # 仅检查前 N 个字符
+```
+
+### TokenSoupFilter
+
+检测乱码/token-soup 输出，检查替换字符、控制字符、私用区 Unicode、泄漏的特殊 token、单字符重复和脚本混乱。
+
+```python
+from twinkle_agentic.preprocessor import TokenSoupFilter
+
+f = TokenSoupFilter(
+    replacement_char_ratio=0.02,
+    special_token_count=20,
+    script_chaos_threshold=0.55,
+)
+```
+
+### PIIPresidioFilter
+
+基于 Microsoft Presidio + spaCy NER + Faker 的多语言 PII 检测和重写。检测并替换个人身份信息（姓名、邮箱、电话号码、地址等）。
+
+```python
+from twinkle_agentic.preprocessor import PIIPresidioFilter
+
+f = PIIPresidioFilter(languages=['en', 'zh'])
+```
+
+### IntentClassifier
+
+启发式意图分类器，为每行标注检测到的意图，支持可插拔的检测器流水线。
+
+```python
+from twinkle_agentic.preprocessor import IntentClassifier
+
+classifier = IntentClassifier()
+```
+
+**意图类别：** `tool_call`（工具调用）、`code`（代码）、`math`（数学）、`complex_logic`（复杂逻辑）、`reasoning`（推理）、`user_dissatisfaction`（用户不满）、`other`（其他）
+
+### ScoreFilter
+
+可插拔评分器过滤器，内置字符级指标、语义相似度和代码执行评分器。
+
+```python
+from twinkle_agentic.preprocessor import ScoreFilter
+
+f = ScoreFilter()
+```
+
+**内置评分器：** `ChrMinScorer`、`SIFDScorer`、`PassNScorer`、`ParaphraseScorer`
+
+### ModelFilter
+
+按模型 ID 白名单过滤行。
+
+```python
+from twinkle_agentic.preprocessor import ModelFilter
+
+f = ModelFilter(allowed_models=['qwen3.5-4b', 'qwen3.5-32b'])
+```
+
+### MessageNormalizer
+
+三遍消息规范化：心跳剥离、工具调用重写、连续同角色消息合并。
+
+```python
+from twinkle_agentic.preprocessor import MessageNormalizer
+
+normalizer = MessageNormalizer()
+```
+
+## 完整流水线示例
+
+```python
+from twinkle_agentic.preprocessor import (
+    QualityPreprocessor,
+    HardFilter,
+    DeadLoopFilter,
+    RefuseFilter,
+    TokenSoupFilter,
+    MessageNormalizer,
+    DedupFilter,
+)
+
+# 第一步：全局去重（必须在完整数据集上运行）
+dedup = DedupFilter()
+rows, _ = dedup(all_rows)
+
+# 第二步：按批流水线
+pipeline = [
+    HardFilter(min_user_chars=10, max_rounds=30),
+    DeadLoopFilter(),
+    RefuseFilter(),
+    TokenSoupFilter(),
+    MessageNormalizer(),
+]
+preprocessor = QualityPreprocessor(pipeline, dropped_log_path='dropped.jsonl')
+cleaned = preprocessor(rows)
+```
diff --git "a/docs/source_zh/\347\273\204\344\273\266/Agentic/Protocol.md" "b/docs/source_zh/\347\273\204\344\273\266/Agentic/Protocol.md"
new file mode 100644
index 000000000..1e03092d6
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/Agentic/Protocol.md"
@@ -0,0 +1,91 @@
+# 协议（Protocol）
+
+Protocol 模块提供了抽象的 LLM API 客户端接口及其 OpenAI 兼容实现。它将 Twinkle 的 `Trajectory` / `SamplingParams` 数据类型与外部 LLM 推理服务连接起来。
+
+## API 基类
+
+```python
+from abc import ABC, abstractmethod
+from twinkle.data_format import Trajectory
+from twinkle.data_format.message import Message
+from twinkle.data_format.sampling import SamplingParams
+
+class API(ABC):
+    """抽象 LLM API 客户端：Trajectory + SamplingParams -> 助手 Message"""
+
+    @abstractmethod
+    def __call__(
+        self,
+        trajectory: Trajectory,
+        sampling_params: SamplingParams,
+        **kwargs,
+    ) -> Union[Message, List[Message]]:
+        raise NotImplementedError()
+```
+
+`API` 类定义了一个简单的契约：给定对话轨迹和采样参数，返回一条或多条助手消息。
+
+## OpenAI
+
+`OpenAI` 是内置实现，兼容任何支持 `/v1/chat/completions` 协议的端点（OpenAI、Azure OpenAI、vLLM、SGLang、Ollama 等）。
+
+```python
+from twinkle_agentic.protocol.openai import OpenAI
+
+api = OpenAI(
+    model='qwen3.5-32b',
+    base_url='http://localhost:8000/v1',
+    api_key='EMPTY',
+)
+```
+
+### 参数
+
+| 参数 | 类型 | 说明 |
+|------|------|------|
+| `model` | `str` | API 请求中传递的模型名称。 |
+| `api_key` | `str` | API 密钥。默认使用 `OPENAI_API_KEY` 环境变量。 |
+| `base_url` | `str` | API 端点的基础 URL（如 `http://localhost:8000/v1`）。 |
+| `client_kwargs` | `Dict` | 转发给 `openai.OpenAI` 客户端构造函数的额外关键字参数。 |
+
+### 使用方法
+
+```python
+from twinkle.data_format import Trajectory
+from twinkle.data_format.sampling import SamplingParams
+
+trajectory = {
+    'messages': [
+        {'role': 'user', 'content': '法国的首都是什么？'},
+    ]
+}
+
+sp = SamplingParams(temperature=0.7, max_tokens=512)
+reply = api(trajectory, sp)
+# reply 是一个 Message 字典：{'role': 'assistant', 'content': '...'}
+```
+
+### 特性
+
+- **工具调用**：自动将 `trajectory['tools']` 映射到 API 请求，并解析响应中的结构化 `tool_calls`。
+- **推理内容**：保留支持推理的模型返回的 `reasoning_content`（如 o1 风格推理）。
+- **完成原因**：在返回消息中暴露 `finish_reason`，供多轮驱动器检测长度截断。
+- **多样本**：当 `sampling_params.num_samples > 1` 时，返回消息列表（每个 choice 一条）。
+
+### 自定义 API 客户端
+
+要集成非 OpenAI API，请继承 `API`：
+
+```python
+from twinkle_agentic.protocol.base import API
+
+class MyCustomAPI(API):
+
+    def __call__(self, trajectory, sampling_params, **kwargs):
+        # 调用自定义端点
+        response = my_llm_client.chat(
+            messages=trajectory['messages'],
+            temperature=sampling_params.temperature,
+        )
+        return {'role': 'assistant', 'content': response.text}
+```
diff --git "a/docs/source_zh/\347\273\204\344\273\266/Agentic/Rollout.md" "b/docs/source_zh/\347\273\204\344\273\266/Agentic/Rollout.md"
new file mode 100644
index 000000000..b74c1e791
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/Agentic/Rollout.md"
@@ -0,0 +1,140 @@
+# 多轮 Rollout
+
+Rollout 模块提供了用于 Agentic RLHF 训练的多轮对话 rollout 引擎。包含两种实现：用于批量 vLLM 采样的 `MultiTurnRollout` 和用于 OpenAI 兼容 API 端点的 `APIMultiTurnRollout`。
+
+## Rollout 基类
+
+```python
+from abc import ABC, abstractmethod
+from twinkle.data_format import Trajectory
+
+class Rollout(ABC):
+
+    @abstractmethod
+    def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]:
+        raise NotImplementedError()
+```
+
+所有 rollout 接受轨迹列表并返回相同数量的轨迹，附带额外字段（`messages`、`turns`、`stop_reason`、`truncated`）。
+
+## MultiTurnRollout
+
+批量多轮 rollout 引擎，使用 vLLM 采样器进行生成。每轮中所有活跃轨迹通过单次批量采样调用并行处理，最大化吞吐量。
+
+### 每轮循环
+
+1. 将每个轨迹编码为带生成提示的 `InputFeature`
+2. 批量调用 `sampler.sample(active_pifs)` —— 所有活跃轨迹并行
+3. 检查终止条件：`stop_reason == 'length'`、无工具调用、或达到最大轮次
+4. 通过 `ToolManager` 分发工具调用，追加工具响应
+5. 计算桥接 token（工具轮次 + 生成提示），设置 `labels = -100`
+6. 重复直到所有轨迹完成
+
+```python
+from twinkle_agentic.rollout.multi_turn import MultiTurnRollout
+from twinkle_agentic.tools.tool_manager import ToolManager
+from twinkle.data_format.sampling import SamplingParams
+
+rollout = MultiTurnRollout(
+    sampler=vllm_sampler,
+    template=template,
+    tool_manager=tool_manager,
+    sampling_params=SamplingParams(temperature=0.7, max_tokens=4096),
+    max_turns=6,
+    max_trajectory_tokens=8192,
+    trace_dir='rollout_traces/',
+)
+
+# 运行 rollout
+results = rollout(trajectories)
+```
+
+### 参数
+
+| 参数 | 类型 | 说明 |
+|------|------|------|
+| `sampler` | Sampler | 用于批量生成的 vLLM 采样器实例。 |
+| `template` | `Template` | 用于编码/解码的聊天模板。 |
+| `tool_manager` | `ToolManager` | 工具分发器。也可以按调用传入。 |
+| `sampling_params` | `SamplingParams` | 默认采样参数。 |
+| `max_turns` | `int` | 每个轨迹的最大轮次（默认：6）。 |
+| `max_trajectory_tokens` | `int` | 最大总 token 长度；超出则截断轨迹。 |
+| `trace_dir` | `str` | 每轨迹 JSON 跟踪文件的目录。 |
+| `trace_callback` | `Callable` | 决定是否存储轨迹跟踪。 |
+| `success_callback` | `Callable` | 决定跟踪文件名中的状态标记（`ok` 或 `fail`）。 |
+
+### 输出字段
+
+每个输出轨迹字典包含：
+
+| 字段 | 类型 | 说明 |
+|------|------|------|
+| `messages` | `List[Dict]` | 包含工具轮次的完整对话。 |
+| `input_ids` | `List[int]` | 完整序列的 token ID。 |
+| `labels` | `List[int]` | 训练标签（非可训练 token 为 `-100`）。 |
+| `turns` | `int` | 执行的轮次数。 |
+| `stop_reason` | `str` | `'stop'` / `'length'` |
+| `truncated` | `bool` | 轨迹是否被截断。 |
+| `logprobs` | `List` | 每 token 的对数概率（如有）。 |
+
+### Ray 远程支持
+
+`MultiTurnRollout` 使用 `@remote_class()` 装饰器，支持作为 Ray actor 透明部署：
+
+```python
+# rollout 可以作为 Ray 远程 actor 运行
+rollout_actor = MultiTurnRollout.remote(sampler=sampler, template=template, ...)
+results = ray.get(rollout_actor.__call__.remote(trajectories))
+```
+
+## APIMultiTurnRollout
+
+通过 OpenAI 兼容 chat-completions API 进行多轮 rollout。每个轨迹在线程池中独立运行，实现网络并发。
+
+```python
+from twinkle_agentic.rollout.api_multi_turn import APIMultiTurnRollout
+from twinkle_agentic.protocol.openai import OpenAI
+
+api = OpenAI(model='qwen3.5-32b', base_url='http://localhost:8000/v1')
+
+rollout = APIMultiTurnRollout(
+    api=api,
+    tool_manager=tool_manager,
+    sampling_params=SamplingParams(temperature=0.7),
+    max_turns=6,
+    concurrency=8,
+    trace_dir='api_traces/',
+)
+
+results = rollout(trajectories)
+```
+
+### 参数
+
+| 参数 | 类型 | 说明 |
+|------|------|------|
+| `api` | `OpenAI` | OpenAI 兼容 API 客户端。 |
+| `tool_manager` | `ToolManager` | 工具分发器（单个或按轨迹的列表）。 |
+| `sampling_params` | `SamplingParams` | 默认采样参数。 |
+| `max_turns` | `int` | 每轨迹最大轮次（默认：6）。 |
+| `concurrency` | `int` | 并行 API 调用的线程池大小（默认：8）。 |
+| `extra_body` | `Dict` | API 请求中附加的额外字段。 |
+| `trace_dir` | `str` | 跟踪文件目录。 |
+
+### 停止原因
+
+| 原因 | 说明 |
+|------|------|
+| `stop` | 助手回复未包含工具调用（自然结束）。 |
+| `length` | API 返回 `finish_reason='length'`（token 限制）。 |
+| `max_turns` | 达到 `max_turns` 限制。 |
+| `api_error` | API 调用或工具执行抛出异常。 |
+
+## 选择建议
+
+| 特性 | MultiTurnRollout | APIMultiTurnRollout |
+|------|-----------------|---------------------|
+| **后端** | vLLM 采样器（本地 GPU） | OpenAI 兼容 API |
+| **训练集成** | 生成 `input_ids` / `labels` 用于 GRPO | 仅消息（用于数据收集） |
+| **批处理** | GPU 级别批量并行 | 网络级别线程并发 |
+| **用例** | 在线 RLHF 训练循环 | 离线数据生成 / 评估 |
diff --git "a/docs/source_zh/\347\273\204\344\273\266/Agentic/Tools.md" "b/docs/source_zh/\347\273\204\344\273\266/Agentic/Tools.md"
new file mode 100644
index 000000000..122b75a14
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/Agentic/Tools.md"
@@ -0,0 +1,119 @@
+# 工具与 ToolManager
+
+Tools 模块提供了抽象工具接口和中央工具分发器（`ToolManager`），用于 Agentic 多轮 rollout。工具遵循 OpenAI function-calling schema，与 LLM 工具调用能力无缝集成。
+
+## Tool 基类
+
+```python
+from abc import ABC, abstractmethod
+from twinkle.data_format import Tool as ToolInfo
+
+class Tool(ABC):
+
+    @abstractmethod
+    def __call__(self, tool_name: str, arguments: Dict[str, Any]) -> str:
+        """执行工具并返回字符串结果。"""
+        raise NotImplementedError
+
+    @abstractmethod
+    def tool_info(self) -> ToolInfo:
+        """返回 OpenAI 兼容的工具 schema。"""
+        raise NotImplementedError
+```
+
+### 实现自定义工具
+
+```python
+from twinkle_agentic.tools.base import Tool
+
+class SearchTool(Tool):
+
+    def __call__(self, tool_name: str, arguments: dict) -> str:
+        query = arguments.get('query', '')
+        # 执行搜索逻辑
+        return f'搜索结果：{query}'
+
+    def tool_info(self):
+        return {
+            'type': 'function',
+            'function': {
+                'name': 'search',
+                'description': '搜索网络信息。',
+                'parameters': {
+                    'type': 'object',
+                    'properties': {
+                        'query': {
+                            'type': 'string',
+                            'description': '搜索查询。',
+                        },
+                    },
+                    'required': ['query'],
+                },
+            },
+        }
+```
+
+## ToolManager
+
+`ToolManager` 是工具的注册中心和分发器。它解析 LLM 结构化输出中的工具调用，并路由到正确的工具实现。
+
+```python
+from twinkle_agentic.tools.tool_manager import ToolManager
+
+# 通过 Tool 实例列表初始化
+manager = ToolManager([search_tool, calculator_tool])
+
+# 或通过字典初始化
+manager = ToolManager({'search': search_tool, 'calc': calculator_tool})
+
+# 或动态注册
+manager = ToolManager()
+manager.register(search_tool)
+manager.register(calculator_tool)
+```
+
+### 核心方法
+
+| 方法 | 说明 |
+|------|------|
+| `register(tool)` | 注册工具（名称从 `tool_info()` 提取）。 |
+| `unregister(name)` | 按名称移除工具。 |
+| `names()` | 列出所有已注册的工具名称。 |
+| `copy()` | 创建管理器的浅拷贝。 |
+| `tool_infos()` | 返回所有工具 schema 列表（用于 API 请求）。 |
+| `__call__(tool_call)` | 分发工具调用并返回结果字符串。 |
+
+### 分发工具调用
+
+`ToolManager` 接受 OpenAI 格式的工具调用字典：
+
+```python
+tool_call = {
+    'id': 'call_1',
+    'type': 'function',
+    'function': {
+        'name': 'search',
+        'arguments': '{"query": "Python 教程"}',
+    },
+}
+
+result = manager(tool_call)
+# result: '搜索结果：Python 教程'
+```
+
+**错误处理：** 如果工具名未知、参数是无效 JSON 或工具抛出异常，`ToolManager` 返回描述性错误字符串而不是抛出异常——这保证了 rollout 循环的持续运行。
+
+### 与 Rollout 集成
+
+```python
+from twinkle_agentic.rollout.multi_turn import MultiTurnRollout
+
+rollout = MultiTurnRollout(
+    sampler=sampler,
+    template=template,
+    tool_manager=manager,  # 传入工具管理器
+    max_turns=6,
+)
+```
+
+Rollout 引擎对模型生成的每个工具调用执行 `manager(tool_call)`，并将结果作为 `{'role': 'tool', 'content': result}` 消息追加。
diff --git "a/docs/source_zh/\347\273\204\344\273\266/Agentic/index.rst" "b/docs/source_zh/\347\273\204\344\273\266/Agentic/index.rst"
new file mode 100644
index 000000000..802034366
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/Agentic/index.rst"
@@ -0,0 +1,11 @@
+Agentic
+===============
+.. toctree::
+   :maxdepth: 1
+
+   Preprocessor.md
+   Protocol.md
+   Rollout.md
+   Tools.md
+   Envs.md
+   Multi-Turn-Tool-Usage.md
diff --git "a/docs/source_zh/\347\273\204\344\273\266/CLI/CLI.md" "b/docs/source_zh/\347\273\204\344\273\266/CLI/CLI.md"
new file mode 100644
index 000000000..e21c5ea44
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/CLI/CLI.md"
@@ -0,0 +1,134 @@
+# CLI 命令行配置
+
+CLI 模块为 Twinkle 训练脚本提供统一的配置系统。它将多种配置来源（环境变量、`.env` 文件、YAML 配置、命令行参数）合并到一个带类型的 `Args` 数据类中。
+
+## 配置优先级
+
+配置按以下顺序应用（后者覆盖前者）：
+
+1. **数据类默认值** — 开箱即用
+2. **`.env` 文件** — 项目本地配置
+3. **环境变量** — `TWINKLE_` 前缀或裸键名
+4. **YAML 配置文件** — `--config path/to/config.yaml`
+5. **命令行参数** — `--key value`（最高优先级）
+
+所有键名不区分大小写，横杠和下划线等价。
+
+## 快速开始
+
+```python
+from twinkle.cli import CLI
+
+args = CLI.from_args()
+
+# 访问类型化的参数组
+print(args.model.model_id)
+print(args.training.max_steps)
+print(args.optimizer.learning_rate)
+
+# 或获取字典用于组件构造
+model_kwargs = args.get_model_args()
+optimizer_kwargs = args.get_optimizer_args()
+```
+
+## 参数组
+
+| 分组 | 类名 | 关键参数 |
+|:-----|:-----|:---------|
+| model | `ModelArgs` | `model_id`, `mixed_precision`, `strategy`, `gradient_checkpointing` |
+| lora | `LoraArgs` | `use_lora`, `lora_r`, `lora_alpha`, `lora_target_modules` |
+| dataset | `DatasetArgs` | `dataset_id`, `subset_name`, `split`, `streaming` |
+| template | `TemplateArgs` | `template_cls`, `max_length`, `truncation_strategy`, `enable_thinking` |
+| training | `TrainingArgs` | `max_steps`, `batch_size`, `micro_batch_size`, `output_dir`, `save_steps` |
+| optimizer | `OptimizerArgs` | `optimizer_cls`, `learning_rate`, `weight_decay`, `max_grad_norm` |
+| scheduler | `SchedulerArgs` | `scheduler_cls`, `num_warmup_steps`, `t_max` |
+| loss | `LossArgs` | `loss_cls`, `epsilon`, `beta`, `sft_weight` |
+| sampler | `SamplerArgs` | `sampler_type`, `gpu_memory_utilization`, `tensor_parallel_size` |
+| sampling | `SamplingArgs` | `max_tokens`, `temperature`, `top_k`, `top_p`, `num_samples` |
+| infra | `InfraArgs` | `mode`, `nproc_per_node`, `model_gpus`, `sampler_gpus`, `dp_size` |
+| server | `ServerArgs` | `config`, `host`, `port`, `ray_namespace` |
+| rl | `RLArgs` | `num_generations`, `advantage_type`, `reward_fns` |
+| checkpoint | `CheckpointArgs` | `save_optimizer`, `merge_and_sync`, `platform` |
+
+## YAML 配置示例
+
+```yaml
+# config.yaml
+model_id: ms://Qwen/Qwen3.5-4B
+mixed_precision: bf16
+strategy: accelerate
+
+use_lora: true
+lora_r: 16
+lora_alpha: 32
+
+dataset_id: ms://swift/self-cognition
+max_length: 4096
+
+batch_size: 8
+micro_batch_size: 2
+max_steps: 200
+learning_rate: 1e-5
+
+mode: ray
+nproc_per_node: 8
+model_gpus: 4
+sampler_gpus: 4
+```
+
+## 命令行用法
+
+```bash
+# 使用 YAML 配置
+python train.py --config config.yaml
+
+# 覆盖特定值
+python train.py --config config.yaml --learning_rate 5e-6 --max_steps 500
+
+# 布尔标志
+python train.py --use_lora --no_gradient_checkpointing
+
+# 无配置文件（全部从命令行指定）
+python train.py --model_id ms://Qwen/Qwen3.5-4B --batch_size 4
+```
+
+## 环境变量
+
+```bash
+# TWINKLE_ 前缀
+export TWINKLE_MODEL_ID=ms://Qwen/Qwen3.5-4B
+export TWINKLE_LEARNING_RATE=1e-5
+
+# 或裸键名（当能识别时）
+export MODEL_ID=ms://Qwen/Qwen3.5-4B
+```
+
+## 字段别名
+
+部分字段支持别名：
+
+- `learning_rate` ↔ `lr`
+- `nproc_per_node` ↔ `num_gpus`
+- `max_tokens` ↔ `max_new_tokens`
+- `use_megatron=true` → `strategy=native_fsdp`
+
+## 自定义配置源
+
+你可以通过自定义配置源扩展 CLI：
+
+```python
+from twinkle.cli.cli import ConfigSource, Args, ConfigResolver
+
+class RemoteConfigSource(ConfigSource):
+    def __init__(self, url: str):
+        self.url = url
+
+    def load(self) -> dict:
+        import requests
+        return requests.get(self.url).json()
+
+# 应用自定义配置源
+args = Args()
+resolver = ConfigResolver(args)
+resolver.apply(RemoteConfigSource('http://config-server/my-config').load())
+```
diff --git a/docs/source_en/Components/Gym/index.rst "b/docs/source_zh/\347\273\204\344\273\266/CLI/index.rst"
similarity index 76%
rename from docs/source_en/Components/Gym/index.rst
rename to "docs/source_zh/\347\273\204\344\273\266/CLI/index.rst"
index 85d941b97..cf59fa766 100644
--- a/docs/source_en/Components/Gym/index.rst
+++ "b/docs/source_zh/\347\273\204\344\273\266/CLI/index.rst"
@@ -1,6 +1,6 @@
-Gym
+CLI
 ===============
 .. toctree::
    :maxdepth: 1
 
-   Gym.md
+   CLI.md
diff --git "a/docs/source_zh/\347\273\204\344\273\266/Gym/Gym.md" "b/docs/source_zh/\347\273\204\344\273\266/Gym/Gym.md"
deleted file mode 100644
index 63dc87aa7..000000000
--- "a/docs/source_zh/\347\273\204\344\273\266/Gym/Gym.md"
+++ /dev/null
@@ -1,26 +0,0 @@
-# Gym
-
-Gym 组件为 Twinkle 中的强化学习环境提供接口。
-
-```python
-from twinkle.gym import Gym
-
-class CustomGym(Gym):
-
-    def step(self, trajectories, **kwargs):
-        """
-        执行一个 RL 步骤：评估轨迹并返回奖励。
-
-        Args:
-            trajectories: 模型生成的待评估轨迹
-            **kwargs: 额外参数
-
-        Returns:
-            每个轨迹的奖励值
-        """
-        ...
-```
-
-Gym 抽象允许你插入自定义 RL 环境与训练循环交互。它将奖励计算和环境交互与核心训练逻辑解耦。
-
-> Gym 通常用于在线策略 RL 训练中，环境需要对模型生成的输出提供反馈。
diff --git "a/docs/source_zh/\347\273\204\344\273\266/TUI/Auto-Research.md" "b/docs/source_zh/\347\273\204\344\273\266/TUI/Auto-Research.md"
new file mode 100644
index 000000000..9624f5ae7
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/TUI/Auto-Research.md"
@@ -0,0 +1,313 @@
+# Auto-Research (TUI)
+
+Twinkle TUI 是一个基于终端的智能训练助手，支持通过**自然语言控制、监控和调试 ML 训练**。它将聊天驱动的 AI 代理与实时指标可视化、日志流以及自动化健康监控器相结合，能够自主检测并修复训练故障。
+
+## 架构概览
+
+```
+┌──────────────────────────────────────────────────────────┐
+│ TwinkleTUI (Textual 应用)                                │
+│ ┌──────────────────────────────────────────────────────┐ │
+│ │ StatusBar: 状态 / run_id / 模型 / step / 进度条     │ │
+│ ├──────────────────────┬───────────────────────────────┤ │
+│ │ MetricsPanel         │ LogPanel                      │ │
+│ │ (ASCII 图表)         │ (滚动日志)                    │ │
+│ ├──────────────────────┤                               │ │
+│ │ ChatPanel            │                               │ │
+│ │ (用户 <-> 代理)      │                               │ │
+│ └──────────────────────┴───────────────────────────────┘ │
+│                                                          │
+│ 后台服务:                                                 │
+│   AgentLoop  ─── LLM 工具调用循环                         │
+│   TrainingMonitor ─── 定期健康检查与自动修复               │
+│   MetricsPoller ─── 增量指标读取                          │
+│   LogsPoller ─── 增量日志尾读                             │
+│   SkillsLoader ─── 异步插件加载                           │
+└──────────────────────────────────────────────────────────┘
+```
+
+## 安装与启动
+
+TUI 是 `twinkle-client` 包的一部分：
+
+```bash
+pip install twinkle-client
+```
+
+### 命令行用法
+
+```bash
+# 基本启动（使用默认本地 Ollama 端点）
+twinkle-tui
+
+# 指定 LLM 后端
+twinkle-tui --llm-base-url http://localhost:11434/v1 --llm-model qwen3.5
+
+# 连接到已有训练运行
+twinkle-tui --run-id my-grpo-run
+
+# 使用远程 API（如 OpenAI 兼容接口）
+twinkle-tui --llm-base-url https://api.example.com/v1 --llm-api-key sk-xxx --llm-model gpt-4o
+
+# 启用调试日志
+twinkle-tui --verbose
+```
+
+也可作为 Python 模块运行：
+
+```bash
+python -m twinkle_client.tui
+```
+
+### CLI 参数
+
+| 参数 | 环境变量 | 默认值 | 说明 |
+|------|---------|--------|------|
+| `--run-id`, `-r` | `TWINKLE_TUI_RUN_ID` | None | 连接到已有训练运行 |
+| `--llm-base-url` | `TWINKLE_LLM_BASE_URL` | `http://localhost:11434/v1` | LLM API 基础 URL |
+| `--llm-model` | `TWINKLE_LLM_MODEL` | `qwen3.5` | LLM 模型名称 |
+| `--llm-api-key` | `TWINKLE_LLM_API_KEY` | `not-needed` | LLM API 密钥 |
+| `--verbose`, `-v` | `TWINKLE_TUI_VERBOSE` | `False` | 启用 DEBUG 日志 |
+| `--version`, `-V` | — | — | 显示版本并退出 |
+
+### 快捷键
+
+| 按键 | 操作 |
+|------|------|
+| `q` | 退出 |
+| `Ctrl+P` | 切换指标面板 |
+| `Ctrl+L` | 清空日志 |
+
+## 聊天代理
+
+TUI 的核心是一个 **LLM 驱动的工具调用代理**（`AgentLoop`），通过 OpenAI 兼容 API 处理自然语言命令。代理维护对话历史并自动修剪（保留最近 50 条消息），每次交互最多支持 10 轮工具调用。
+
+### 你可以这样说
+
+**训练生命周期：**
+- *"列出我的训练运行"*
+- *"用 Qwen3.5-4B 在 gsm8k 上启动一个新的 GRPO 训练"*
+- *"暂停当前运行"*
+- *"恢复训练"*
+- *"停止训练"*
+
+**服务器管理：**
+- *"启动服务器，使用 Qwen3.5-4B 和一个 2 卡的 Qwen3.5-72B 采样器"*
+- *"关闭服务器"*
+- *"有多少 GPU 可用？"*
+
+**监控与分析：**
+- *"训练进展如何？"*
+- *"显示 reward 相关的指标"*
+- *"放大到 step 100-200"*
+- *"重置图表视图"*
+
+**搜索：**
+- *"搜索数学数据集"*
+- *"在 ModelScope 上查找 Qwen 模型"*
+
+### 可用工具
+
+代理内置 16 个工具：
+
+| 工具 | 说明 |
+|------|------|
+| `list_training_runs` | 列出所有训练运行 |
+| `get_training_status` | 获取详细状态和最近指标 |
+| `start_server` | 启动 Ray 集群 + Twinkle Server（幂等） |
+| `shutdown_server` | 关闭服务器并释放 GPU 资源 |
+| `start_training` | 创建并启动新的训练运行 |
+| `select_run` | 切换监控到另一个运行 |
+| `pause_training` | 暂停训练（SIGKILL，服务器保留状态） |
+| `resume_training` | 通过重新启动客户端脚本恢复训练 |
+| `stop_training` | 停止训练（SIGTERM，保存检查点） |
+| `update_script` | 更新训练脚本（带版本归档） |
+| `list_supported_models` | 查询服务器支持的模型 |
+| `search_datasets` | 在 ModelScope 搜索数据集 |
+| `search_models` | 在 ModelScope 搜索模型 |
+| `zoom_metrics` | 调整指标图表视图范围 |
+| `select_metrics` | 选择显示哪些指标（最多 4 个） |
+| `get_cluster_info` | 获取 GPU/集群资源信息 |
+
+### 服务器启动
+
+`start_server` 工具自动化一个多步骤流程：
+
+1. **GPU 检测** — `nvidia-smi` 硬件扫描
+2. **GPU 分配** — 在训练模型和采样器之间分配 GPU
+3. **配置生成** — 自动创建 `server_config.yaml`
+4. **Ray 集群启动** — 多节点 GPU 分区，隔离 `CUDA_VISIBLE_DEVICES`
+5. **服务器启动** — 作为后台进程启动 Twinkle Server
+6. **健康检查** — 轮询 `/api/v1/healthz` 直到就绪
+
+支持多模型拓扑：1 个训练模型 + N 个采样器/教师模型。
+
+### Skills 系统
+
+TUI 支持从三个来源加载可扩展的技能插件：
+
+1. **内置技能** — 包含在 `twinkle_client/skills/bundled/` 中
+2. **用户本地技能** — `~/.cache/twinkle/tui/skills/local/`
+3. **社区技能** — 从 ModelScope 获取（尽力而为，10 秒超时）
+
+技能在启动后异步加载并注入代理的系统提示词中。代理在技能加载完成前即可使用。
+
+## 训练监控器（自动修复）
+
+`TrainingMonitor` 是一个后台服务，每 **30 秒**运行一次，收集当前训练运行的所有可用信号，并提交给 LLM 进行分析。
+
+### 收集的信号
+
+- **进程状态**：alive / dead / unknown
+- **output.log 尾部**：最后 1500 个字符（优先提取 traceback）
+- **指标**：最近条目 + 前半段 vs 后半段趋势分析
+- **停滞时长**：自最后一次产生指标以来的秒数
+- **当前 train.py**：完整脚本源码（用于精确修复）
+
+### 决策框架
+
+LLM 将每次检查分类为三种操作之一：
+
+| 决策 | 触发条件 | 执行动作 |
+|------|---------|---------|
+| **LGTM** | 训练正常推进 | 无操作 |
+| **WARNING** | Loss 平台期、reward hacking、KL 爆炸等 | 向用户报告观察结果 |
+| **FIX** | 脚本崩溃、进程死亡并有 traceback | 自动修复并重启 |
+
+### 自动修复流程
+
+当需要 FIX 时：
+
+1. LLM 输出诊断 + 完整修复脚本
+2. 监控器将旧 `train.py` 归档为 `train_v{N}.py`
+3. 将修复脚本写为新的 `train.py`
+4. 通过 `resume_training` 重新启动训练
+5. 重置停滞追踪
+
+安全保障：
+- 每个运行最多 **3 次自动修复尝试**（防止无限重试循环）
+- 修复尝试按 `run_id` 追踪
+- 快照去重避免对未变化状态的重复分析
+
+## 基于文件的连接层
+
+TUI 通过本地文件系统与训练进程通信：
+
+```
+~/.cache/twinkle/{run_id}/
+├── meta.json       — 运行元数据（model_id、config、status、pid）
+├── metrics.jsonl   — 每步一个 JSON 对象（增量）
+├── output.log      — 训练的 stdout+stderr 合并输出
+├── train.py        — 当前活动训练脚本
+└── train_v{N}.py   — 归档的历史脚本版本
+```
+
+### 训练控制模型
+
+在 Server 模式下，Twinkle Server 将所有模型/优化器状态保留在 GPU 内存中：
+
+- **暂停** = 杀死客户端进程 (SIGKILL) — 服务器状态保留
+- **恢复** = 重新启动客户端脚本 — 无缝继续训练
+- **停止** = SIGTERM — 触发检查点保存后退出
+- **关闭服务器** = 释放 GPU 资源，**销毁**模型状态
+
+## TrainingRuntime（脚本集成）
+
+训练脚本使用 `TrainingRuntime` 与 TUI 集成：
+
+```python
+from twinkle_client.tui.runtime import TrainingRuntime
+
+rt = TrainingRuntime(run_id='my-grpo-run')
+rt.start(model_id='Qwen/Qwen3.5-4B', config={'lr': 1e-5})
+rt.register_graceful_shutdown(model, dataloader)
+
+for step, batch in enumerate(dataloader):
+    # ... 训练逻辑 ...
+    rt.log_metrics(step=step, loss=loss, reward=reward, grad_norm=gn, lr=lr)
+    rt.log(f'Completed step {step}, loss={loss:.4f}')
+
+rt.finish()
+```
+
+### 核心方法
+
+| 方法 | 说明 |
+|------|------|
+| `start(model_id, config, script_path)` | 初始化运行目录和元数据 |
+| `log_metrics(**kwargs)` | 向 `metrics.jsonl` 写入指标条目 |
+| `log(message)` | 打印日志消息（输出会被捕获到 `output.log`） |
+| `get_resume_info()` | 获取 `last_step` 用于从检查点恢复 |
+| `finish(status)` | 标记训练完成，关闭文件 |
+| `register_graceful_shutdown(model, dataloader)` | 注册 SIGTERM 处理器以保存检查点 |
+
+### 断点续训支持
+
+`TrainingRuntime` 自动将训练进度保存到 `meta.json`（每 5 秒节流写入一次）。脚本可以使用 `get_resume_info()` 从上次保存的步数恢复：
+
+```python
+rt = TrainingRuntime(run_id='my-run')
+resume = rt.get_resume_info()
+global_step = resume['last_step']
+
+if global_step > 0:
+    dataloader.skip_consumed_samples(global_step * BATCH_SIZE)
+    print(f'从 step {global_step} 恢复训练')
+```
+
+### 优雅关停
+
+调用 `register_graceful_shutdown()` 后，会安装一个 SIGTERM 处理器：
+
+1. 保存模型检查点（LoRA 权重 + 优化器状态）
+2. 保存数据加载器位置（`consumed_train_samples`）
+3. 记录检查点路径
+4. 标记训练为 `stopped` 并退出
+
+## UI 面板
+
+### StatusBar（状态栏）
+
+显示在屏幕顶部的当前训练状态：
+
+- 训练状态图标（🚀 训练中 / ⏸ 已暂停 / ✅ 已完成 / ❌ 错误）
+- Run ID
+- 模型名称
+- 当前步数
+- 百分比进度条
+
+### MetricsPanel（指标面板）
+
+使用 `plotext` 渲染的实时 ASCII 图表：
+
+- 同时绘制最多 4 个指标
+- 支持缩放（按步数范围和 y 轴范围）
+- 未选择时自动显示前 3 个可用指标
+- 提示栏显示可通过代理切换的隐藏指标
+- 保留最多 2000 个数据点
+
+### LogPanel（日志面板）
+
+滚动日志查看器：
+
+- 自动剥离 ANSI 转义序列
+- 硬换行长行以防止溢出
+- 处理进度条的 `\r` 回车符
+- 保留最后 500 行
+
+### ChatPanel（聊天面板）
+
+交互式聊天界面：
+
+- 用户输入，流式代理响应
+- 节流 token 刷新（80ms），确保显示平滑
+- 检测到工具调用时重置流式输出
+- 支持 Rich 标记格式
+
+## 日志记录
+
+所有 TUI 日志写入 `./tui.log`（当前工作目录）：
+
+- 5MB 时轮转，保留 3 个备份
+- **无控制台输出** — 避免破坏 Textual 的 alt-screen 缓冲区
+- 使用 `--verbose` 启用 DEBUG 级别日志
diff --git "a/docs/source_zh/\347\273\204\344\273\266/TUI/SkillProvider\346\212\200\350\203\275\347\263\273\347\273\237.md" "b/docs/source_zh/\347\273\204\344\273\266/TUI/SkillProvider\346\212\200\350\203\275\347\263\273\347\273\237.md"
new file mode 100644
index 000000000..11637331e
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/TUI/SkillProvider\346\212\200\350\203\275\347\263\273\347\273\237.md"
@@ -0,0 +1,71 @@
+# SkillProvider 技能系统
+
+技能系统允许 Twinkle 的 TUI 智能体从外部来源（Git 仓库、API、本地文件）动态加载专业知识，并注入到 LLM 的系统提示词中。
+
+## 架构
+
+| 类 | 角色 |
+|----|------|
+| **Skill** | 持有单个技能名称、内容和来源的数据类 |
+| **SkillProvider** | 从数据源获取技能的抽象基类 |
+| **SkillManager** | 编排多个 Provider，聚合技能用于提示词注入 |
+
+## Skill 数据类
+
+```python
+@dataclasses.dataclass
+class Skill:
+    name: str       # 简短标识符（通常为文件名去除扩展名）
+    content: str    # 完整的 Markdown 内容
+    source: str     # Provider 名称 + 相对路径，用于可追溯性
+```
+
+## 创建自定义 Provider
+
+继承 `SkillProvider` 并实现 `name` 和 `fetch()`：
+
+```python
+from twinkle_client.skills.base import SkillProvider
+
+class MySkillProvider(SkillProvider):
+
+    @property
+    def name(self) -> str:
+        return 'my-skills'
+
+    async def fetch(self) -> None:
+        # 将技能文件下载/克隆到 self.cache_dir
+        # 例如：git clone、API 下载、文件拷贝
+        ...
+```
+
+默认的 `load_skills()` 会扫描 `self.cache_dir` 中的 `.md` 文件（跳过 README、LICENSE 等），返回 `Skill` 对象。
+
+## SkillManager
+
+```python
+from twinkle_client.skills.manager import SkillManager
+
+manager = SkillManager()
+manager.register(my_provider)
+manager.register(another_provider)
+
+# 拉取并加载所有技能
+skills = await manager.load_all()
+
+# 格式化为 LLM 系统提示词注入内容
+prompt_section = manager.format_for_prompt()
+```
+
+### 关键方法
+
+| 方法 | 说明 |
+|------|------|
+| `register(provider)` | 添加技能 Provider |
+| `load_all()` | 从所有 Provider 拉取并加载 |
+| `format_for_prompt()` | 将技能渲染为系统提示词格式 |
+| `get_skill_names()` | 列出已加载技能名称 |
+
+## 缓存目录
+
+默认缓存在 `~/.cache/twinkle/tui/skills/<provider_name>/`。可通过向 Provider 构造函数传入 `cache_dir` 参数覆盖。
diff --git "a/docs/source_zh/\347\273\204\344\273\266/TUI/index.rst" "b/docs/source_zh/\347\273\204\344\273\266/TUI/index.rst"
new file mode 100644
index 000000000..32ec8dc40
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/TUI/index.rst"
@@ -0,0 +1,7 @@
+TUI
+===============
+.. toctree::
+   :maxdepth: 1
+
+   Auto-Research.md
+   SkillProvider技能系统.md
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\344\273\273\345\212\241\345\244\204\347\220\206\345\231\250/GRPOProcessor.md" "b/docs/source_zh/\347\273\204\344\273\266/\344\273\273\345\212\241\345\244\204\347\220\206\345\231\250/GRPOProcessor.md"
deleted file mode 100644
index afb8f0948..000000000
--- "a/docs/source_zh/\347\273\204\344\273\266/\344\273\273\345\212\241\345\244\204\347\220\206\345\231\250/GRPOProcessor.md"
+++ /dev/null
@@ -1,19 +0,0 @@
-# GRPOLossProcessor
-
-GRPOLossProcessor 是专为 GRPO 强化学习训练设计的任务处理器包装器。它在 InputProcessor 基础上扩展了 GRPO 特有的数据准备功能。
-
-```python
-from twinkle.processor import GRPOLossProcessor
-
-processor = GRPOLossProcessor(
-    device_mesh=...,
-    padding_free=False,
-    framework='transformers',
-)
-
-model.set_processor(processor)
-```
-
-GRPOLossProcessor 包装了基础 `InputProcessor`，并添加了 GRPO 特有字段的处理，如优势值、旧对数概率和参考对数概率，这些是 GRPO 损失函数所需要的。
-
-> 对于标准 SFT 任务，直接使用 `InputProcessor`。当训练循环涉及 GRPO 或其变体时，使用 `GRPOLossProcessor`。
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\344\273\273\345\212\241\345\244\204\347\220\206\345\231\250/index.rst" "b/docs/source_zh/\347\273\204\344\273\266/\344\273\273\345\212\241\345\244\204\347\220\206\345\231\250/index.rst"
index 1eb839f0e..a2c88eaf4 100644
--- "a/docs/source_zh/\347\273\204\344\273\266/\344\273\273\345\212\241\345\244\204\347\220\206\345\231\250/index.rst"
+++ "b/docs/source_zh/\347\273\204\344\273\266/\344\273\273\345\212\241\345\244\204\347\220\206\345\231\250/index.rst"
@@ -4,4 +4,3 @@
    :maxdepth: 1
 
    InputProcessor.md
-   GRPOProcessor.md
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/EmbeddingMetric.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/EmbeddingMetric.md"
new file mode 100644
index 000000000..ab770498c
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/EmbeddingMetric.md"
@@ -0,0 +1,31 @@
+# EmbeddingMetric
+
+`EmbeddingMetric` 跟踪对比学习（InfoNCE）训练中的嵌入质量，报告锚点-正样本余弦相似度和批内负样本相似度。
+
+## 使用方法
+
+```python
+from twinkle.metric import EmbeddingMetric
+
+metric = EmbeddingMetric(device_mesh=device_mesh, process_group=process_group)
+
+# 训练中
+metric.accumulate(inputs, outputs)
+
+# 日志间隔时
+results = metric.calculate()
+# results: {'pos_sim': '0.8523', 'neg_sim': '0.2134', 'loss': '0.3412', ...}
+```
+
+## 输出指标
+
+| 指标 | 说明 |
+|:-----|:-----|
+| `pos_sim` | 锚点与正样本的平均余弦相似度 |
+| `pos_sim_min` | 批内最小正样本相似度 |
+| `pos_sim_max` | 批内最大正样本相似度 |
+| `neg_sim` | 锚点与批内其他样本的正样本（即批内负样本）的平均相似度 |
+| `loss` | 平均对比损失值 |
+| `grad_norm` | 梯度范数 |
+
+> 此指标与 `InfonceLoss` 配合使用，适用于嵌入/检索模型训练。
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/GRPOMetric.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/GRPOMetric.md"
new file mode 100644
index 000000000..434dc17c2
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/GRPOMetric.md"
@@ -0,0 +1,39 @@
+# GRPOMetric
+
+`GRPOMetric` 跟踪 GRPO 训练中的策略优化诊断指标，包括 KL 散度、裁剪率、熵和对数概率统计。
+
+## 使用方法
+
+```python
+from twinkle.metric import GRPOMetric
+
+metric = GRPOMetric(
+    device_mesh=device_mesh,
+    process_group=process_group,
+    epsilon=0.2,          # PPO 裁剪范围
+    temperature=1.0,      # 用于 logp 重缩放的采样温度
+    top_k_kl=10,          # 每步记录 KL 最高的 top-K 个 token
+)
+
+# 训练循环中
+metric.accumulate(inputs, outputs, old_logps=old_logps, advantages=advantages)
+
+# 日志间隔时
+results = metric.calculate()
+```
+
+## 输出指标
+
+| 指标 | 说明 |
+|:-----|:-----|
+| `train/policy_confidence` | exp(mean_new_logp) — 越高表示模型越自信 |
+| `train/mean_new_logp` | 当前策略下生成 token 的平均对数概率 |
+| `train/mean_old_logp` | 旧策略（采样时的策略，对应传入的 `old_logps`）下的平均对数概率 |
+| `train/approx_kl` | Schulman K3 KL 估计器 |
+| `train/entropy` | 平均 token 级熵 |
+| `train/clip_ratio` | 被裁剪的 token 比例 |
+
+## 变体
+
+- **`GSPOMetric`** — 序列级裁剪率（几何平均比率）
+- **`CISPOMetric`** — 无条件裁剪率（不按优势符号门控）
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/index.rst" "b/docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/index.rst"
index 6e03f97cf..d5ba804be 100644
--- "a/docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/index.rst"
+++ "b/docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/index.rst"
@@ -3,6 +3,19 @@
 .. toctree::
    :maxdepth: 1
 
+   TrainMetric.md
+   LossMetric.md
+   Accuracy.md
+   CompletionRewardMetric.md
+   DPOMetric.md
+   GRPOMetric.md
+   EmbeddingMetric.md
+   构建指标.md
+指标
+===============
+.. toctree::
+   :maxdepth: 1
+
    TrainMetric.md
    LossMetric.md
    Accuracy.md
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/InfoNCELoss.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/InfoNCELoss.md"
new file mode 100644
index 000000000..f8fbaa1be
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/InfoNCELoss.md"
@@ -0,0 +1,68 @@
+# InfoNCE 损失
+
+`InfonceLoss` 实现带批内负样本和可选跨 rank 聚合的对比学习损失，用于嵌入/检索模型训练。
+
+## 使用方法
+
+```python
+from twinkle.loss import InfonceLoss
+
+loss_fn = InfonceLoss(
+    temperature=0.1,
+    use_batch=True,           # 启用批内负样本
+    hard_negatives=7,         # 固定每样本负样本数
+    mask_fake_negative=True,  # 遮蔽假负样本
+    fake_neg_margin=0.1,      # 假负样本检测阈值
+)
+
+model.set_loss(loss_fn)
+```
+
+## 输入格式
+
+每个样本按 `锚点(1) + 正样本(1) + 负样本(n)` 排列。`inputs['labels']` 是一维掩码，`1` 标记每组的起始位置。
+
+```
+embeddings: [a0, p0, n0_1, n0_2, a1, p1, n1_1, n1_2, ...]
+labels:     [ 1,  0,    0,    0,  1,  0,    0,    0, ...]
+```
+
+## 参数
+
+| 参数 | 类型 | 默认值 | 说明 |
+|:-----|:-----|:-------|:-----|
+| `temperature` | float | 0.1 | 相似度缩放因子 |
+| `use_batch` | bool | True | 使用跨样本批内负样本 |
+| `hard_negatives` | int | None | 固定每样本负样本数（截断/上采样）|
+| `mask_fake_negative` | bool | False | 遮蔽高于 positive + margin 的 logit |
+| `fake_neg_margin` | float | 0.1 | 假负样本遮蔽阈值 |
+| `include_qq` | bool | False | 添加 query-query 相似度块 |
+| `include_dd` | bool | False | 添加 doc-doc 相似度块 |
+
+## 跨 Rank 聚合
+
+当 `use_batch=True` 且分布式训练激活时，嵌入会从所有 DP rank 聚合以最大化批内负样本多样性。仅本地分片保留梯度。
+
+## 相似度块
+
+该损失支持三种相似度块，提供全面的对比学习信号：
+
+- **Q→D（默认）**：Query 到所有 Document — 主要对比信号
+- **Q→Q**（`include_qq=True`）：Query 到其他所有 Query — 防止 query 坍缩
+- **D→D**（`include_dd=True`）：Document 到其他所有 Document — Qwen3-Embedding 风格
+
+## 示例：Embedding 训练
+
+```python
+from twinkle.loss import InfonceLoss
+from twinkle.metric import EmbeddingMetric
+
+# 配置 Embedding 模型
+model.set_loss(InfonceLoss(temperature=0.05, use_batch=True, include_qq=True))
+model.set_metric(EmbeddingMetric(device_mesh=mesh, process_group=pg))
+
+# 训练循环
+for batch in dataloader:
+    model.forward_backward(batch)
+    model.clip_grad_and_step()
+```
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/index.rst" "b/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/index.rst"
index ea813f56f..0a2a890cf 100644
--- "a/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/index.rst"
+++ "b/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/index.rst"
@@ -3,6 +3,19 @@
 .. toctree::
    :maxdepth: 1
 
+   CrossEntropy.md
+   ChunkedCrossEntropy.md
+   DPOLoss.md
+   GKDLoss.md
+   GRPOLoss.md
+   InfoNCELoss.md
+   MSELoss.md
+   构建损失.md
+损失
+===============
+.. toctree::
+   :maxdepth: 1
+
    CrossEntropy.md
    ChunkedCrossEntropy.md
    DPOLoss.md
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/MultiLoraTransformersModel.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/MultiLoraTransformersModel.md"
index 4017aea7e..5ae13f739 100644
--- "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/MultiLoraTransformersModel.md"
+++ "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/MultiLoraTransformersModel.md"
@@ -30,3 +30,48 @@ class MultiLoraTransformersModel:
 正因如此，用户的r必须要小于等于max_r的配置，在实际训练时仅会使用lora的部分rank参与计算。
 
 MultiLoraTransformersModel支持`@remote_class`注解，并且支持device_mesh，这意味着它可以运行在ray的worker中。
+
+## 租户生命周期
+
+底层使用 `MultiLora` 管理器来处理租户 LoRA 槽位。关键 API：
+
+### acquire_lora
+
+为租户获取一个可用的 LoRA 槽位：
+
+```python
+adapter_name = model.multi_lora.acquire_lora('tenant_a', LoraConfig(r=16, lora_alpha=32))
+```
+
+- 如果所有槽位已被占用或 `config.r > max_r`，则抛出 `RuntimeError`
+
+### release_lora
+
+释放租户的 LoRA 槽位，权重重置为初始状态：
+
+```python
+model.multi_lora.release_lora('tenant_a')
+```
+
+### 上下文管理器
+
+使用 `adapter()` 进行作用域激活：
+
+```python
+with model.multi_lora.adapter('tenant_a') as name:
+    output = model.forward(inputs)
+```
+
+### LoraTenant
+
+每个槽位以 `LoraTenant` 数据类追踪：
+
+```python
+@dataclass
+class LoraTenant:
+    index: int                    # 槽位索引 (0..max_loras-1)
+    adapter_name: str             # 内部名称（如 "lora_0"）
+    config: LoraConfig            # 预分配配置（max_r）
+    tenant_adapter_name: str      # 面向用户的租户名（空闲时为 None）
+    tenant_config: LoraConfig     # 租户实际配置（空闲时为 None）
+```
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/SupportedModels.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/SupportedModels.md"
new file mode 100644
index 000000000..bfbb03ea0
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/SupportedModels.md"
@@ -0,0 +1,77 @@
+# 支持的模型
+
+Twinkle 支持任何兼容 HuggingFace Transformers 或 Megatron-LM 的模型。以下是经过测试的模型列表。
+
+## 语言模型
+
+| 模型系列 | 模型 ID | 参数量 | 特性 |
+|:---------|:--------|:-------|:-----|
+| Qwen 3.5 | `Qwen/Qwen3.5-0.6B` ~ `Qwen/Qwen3.5-235B-A22B` | 0.6B–235B | MoE、思考模式 |
+| Qwen 2.5 | `Qwen/Qwen2.5-0.5B` ~ `Qwen/Qwen2.5-72B` | 0.5B–72B | Dense |
+| DeepSeek V4 | `deepseek-ai/DeepSeek-V4` | 685B MoE | 自定义 DSML 编码 |
+| DeepSeek R1 | `deepseek-ai/DeepSeek-R1` | 685B MoE | 推理 |
+| LLaMA 3 | `meta-llama/Llama-3.3-70B-Instruct` | 8B–70B | Dense |
+| Mistral | `mistralai/Mistral-7B-v0.3` | 7B | Dense |
+| Yi | `01-ai/Yi-1.5-34B` | 6B–34B | Dense |
+| GLM-4 | `THUDM/glm-4-9b-chat` | 9B | Dense |
+| InternLM 2.5 | `internlm/internlm2_5-7b-chat` | 7B–20B | Dense |
+
+## 视觉语言模型
+
+| 模型系列 | 模型 ID | 特性 |
+|:---------|:--------|:-----|
+| Qwen 3.5 VL | `Qwen/Qwen3.5-VL-3B` ~ `Qwen/Qwen3.5-VL-72B` | 图片、视频 |
+| Qwen 2.5 VL | `Qwen/Qwen2.5-VL-7B-Instruct` | 图片、视频 |
+| InternVL 2.5 | `OpenGVLab/InternVL2_5-8B` | 图片 |
+
+## 嵌入模型
+
+| 模型系列 | 模型 ID | 训练方法 |
+|:---------|:--------|:---------|
+| Qwen3 Embedding | `Qwen/Qwen3-Embedding-0.6B` | InfoNCE 对比学习 |
+| GTE | `thenlper/gte-large-zh` | InfoNCE 对比学习 |
+
+## 模型加载
+
+```python
+from twinkle.model import TransformersModel
+
+# 从 ModelScope 加载（ms:// 前缀）
+model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B')
+
+# 从 HuggingFace 加载（hf:// 前缀）
+model = TransformersModel(model_id='hf://meta-llama/Llama-3.3-70B-Instruct')
+
+# 本地路径
+model = TransformersModel(model_id='/path/to/model')
+```
+
+## 框架支持
+
+| 框架 | 类名 | 适用场景 |
+|:-----|:-----|:---------|
+| Transformers | `TransformersModel` | 通用训练（SFT、RLHF、DPO）|
+| Transformers + Multi-LoRA | `MultiLoraTransformersModel` | 多租户训练 |
+| Megatron-LM | `MegatronModel` | 大规模分布式预训练 |
+| Megatron + Multi-LoRA | `MultiLoraMegatronModel` | 大规模多租户 |
+
+## 精度支持
+
+| 模式 | 说明 |
+|:-----|:-----|
+| `bf16` | BFloat16 混合精度（推荐 A100/H100）|
+| `fp16` | Float16 混合精度（适用于旧 GPU）|
+| `fp8` | FP8 精度（H100 + Transformer Engine）|
+| `no` | 全精度（仅用于调试）|
+
+## 并行策略
+
+| 策略 | 配置键 | 说明 |
+|:-----|:-------|:-----|
+| FSDP | `strategy=accelerate` | Accelerate 管理的 FSDP（默认）|
+| 原生 FSDP | `strategy=native_fsdp` | PyTorch 原生 FSDP |
+| 张量并行 | `tp_size` | 跨 GPU 切分层 |
+| 流水线并行 | `pp_size` | 切分模型阶段 |
+| 数据并行 | `dp_size` | 复制模型，切分数据 |
+| 序列并行 | `sequence_parallel` | 切分长序列 |
+| 专家并行 | `ep_size` | MoE 专家分布 |
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/index.rst" "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/index.rst"
index 713ea35c6..d20155bd7 100644
--- "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/index.rst"
+++ "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\345\236\213/index.rst"
@@ -8,3 +8,14 @@
    MultiLoraTransformersModel.md
    MegatronModel.md
    MultiLoraMegatronModel.md
+   SupportedModels.md
+模型
+===============
+.. toctree::
+   :maxdepth: 1
+
+   TwinkleModel.md
+   TransformersModel.md
+   MultiLoraTransformersModel.md
+   MegatronModel.md
+   MultiLoraMegatronModel.md
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/DeepSeekV4Template.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/DeepSeekV4Template.md"
new file mode 100644
index 000000000..053b51051
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/DeepSeekV4Template.md"
@@ -0,0 +1,30 @@
+# DeepSeek-V4 模板
+
+`DeepseekV4Template` 为 DeepSeek V4 提供原生支持，包括其独特的思考模式、工具调用协议和多 token 特殊标记。
+
+## 使用方法
+
+```python
+from twinkle.template import DeepseekV4Template
+
+template = DeepseekV4Template(
+    model_id='deepseek-ai/DeepSeek-V4',
+    enable_thinking=True,
+)
+```
+
+## 特性
+
+- **自定义 tokenizer 包装**：用 DeepSeek V4 的编码协议覆盖 `apply_chat_template`
+- **思考模式**：支持 `thinking` / `chat` 模式切换
+- **工具调用**：原生 DSML 工具调用编码
+- **多 token EOS**：处理 DeepSeek V4 由多个 token 组成的特殊标记
+
+## 与基础模板的区别
+
+| 特性 | 基础模板 | DeepseekV4Template |
+|:-----|:---------|:-------------------|
+| Chat 模板 | HuggingFace 原生 | 自定义 DSML 编码 |
+| 思考模式 | `<think>` 标签 | 原生思考模式开关 |
+| 工具调用 | Hermes/Qwen 格式 | DSML 工具块 |
+| EOS 处理 | 单 token | 多 token 特殊标记 |
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/Template.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/Template.md"
index 364275b64..c3f5918e1 100644
--- "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/Template.md"
+++ "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/Template.md"
@@ -9,7 +9,7 @@ class Template:
                  model_id: str,
                  use_chat_template: bool = True,
                  max_length: Optional[int] = 8192,
-                 truncation_strategy: Literal['raise', 'left', 'right', 'split'] = 'raise',
+                 truncation_strategy: Literal['raise', 'left', 'right', 'split', 'delete'] = 'raise',
                  default_system: Optional[str] = None):
         ...
 
@@ -42,7 +42,9 @@ class Template:
   - raise: 抛出异常。一般用于非常精确的数据集场景
   - left: 移除左边的 token，使其符合 max_length
   - right: 移除右边的 token，使其符合 max_length
-  - default_system: 如果数据集没有 system，则使用默认 system
+  - split: 将超长样本切分为多个长度不超过 max_length 的片段（不支持多模态、LazyDataset、IterablePackingDataset）
+  - delete: 直接丢弃超长样本
+- default_system: 如果数据集没有 system，则使用默认 system
 
 > Template 不支持使用函数来代替，因为其内部要支持的功能较多。如果需要编写新的 Template，请继承 `Template` 类。
 > 一般来说，纯文本模型使用 Template 基类就足够了，在基类中我们使用了 tokenizer.apply_chat_template 来编码模型，对一般的纯文本模型是通用的。
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/ToolCallParsers.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/ToolCallParsers.md"
new file mode 100644
index 000000000..9d52be1c7
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/ToolCallParsers.md"
@@ -0,0 +1,53 @@
+# 工具调用解析器
+
+Twinkle 的模板系统包含模块化的工具调用解析框架，用于训练具有函数调用能力的模型。
+
+## 架构
+
+```
+ToolCallRegistry
+├── HermesQwenParser  — Hermes/Qwen 风格 <tool_call>...</tool_call>
+├── ReActParser       — ReAct Thought/Action/Observation
+├── ClineParser       — Cline XML 工具调用
+└── VCPParser         — VCP 协议
+```
+
+## ToolCallParser 接口
+
+```python
+from twinkle.template.tools import ToolCallParser
+
+class ToolCallParser(ABC):
+    name: str = ''
+
+    def detect(self, text: str) -> bool:
+        """检查文本是否包含此格式的标记"""
+
+    def parse(self, text: str) -> List[Dict[str, Any]]:
+        """提取 OpenAI 格式的工具调用"""
+
+    def clean(self, text: str) -> str:
+        """去除标记，返回纯内容"""
+```
+
+## ToolCallRegistry
+
+注册表自动发现解析器并路由检测：
+
+```python
+from twinkle.template.tools import ToolCallRegistry
+
+# 检测补全使用了哪种格式
+parser = ToolCallRegistry.detect_first(completion_text)
+if parser:
+    tool_calls = parser.parse(completion_text)
+```
+
+## 内置解析器
+
+| 解析器 | 格式说明 |
+|:-------|:---------|
+| HermesQwenParser | `<tool_call>{"name": "...", "arguments": {...}}</tool_call>` |
+| ReActParser | Thought/Action/Action Input/Observation |
+| ClineParser | Cline XML 结构化参数 |
+| VCPParser | Visual Code Protocol |
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/index.rst" "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/index.rst"
index 9ab4c887b..840adf497 100644
--- "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/index.rst"
+++ "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/index.rst"
@@ -4,3 +4,11 @@
    :maxdepth: 1
 
    Template.md
+   DeepSeekV4Template.md
+   ToolCallParsers.md
+模板
+===============
+.. toctree::
+   :maxdepth: 1
+
+   Template.md
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\350\256\255\347\273\203\344\270\255\351\227\264\344\273\266/DeviceMesh\345\222\214DeviceGroup.md" "b/docs/source_zh/\347\273\204\344\273\266/\350\256\255\347\273\203\344\270\255\351\227\264\344\273\266/DeviceMesh\345\222\214DeviceGroup.md"
index 00ec1f308..5842d51dd 100644
--- "a/docs/source_zh/\347\273\204\344\273\266/\350\256\255\347\273\203\344\270\255\351\227\264\344\273\266/DeviceMesh\345\222\214DeviceGroup.md"
+++ "b/docs/source_zh/\347\273\204\344\273\266/\350\256\255\347\273\203\344\270\255\351\227\264\344\273\266/DeviceMesh\345\222\214DeviceGroup.md"
@@ -40,6 +40,96 @@ class DeviceMesh:
 
 推荐使用 `from_sizes` 来构造它。
 
+### 参数参考
+
+| 参数 | 说明 | 默认值 |
+|------|------|--------|
+| `world_size` | 总进程数 | 1 |
+| `dp_size` | 数据并行度 | 1 |
+| `fsdp_size` | 全分片数据并行度 | None |
+| `tp_size` | 张量并行度 | None |
+| `pp_size` | 流水线并行度 | None |
+| `ulysses_size` | Ulysses 序列并行度 | None |
+| `cp_size` | 上下文并行度 | None |
+| `ep_size` | 专家并行度（MoE 模型）| None |
+| `etp_size` | 专家张量并行度 | None |
+| `ep_fsdp_size` | 每个 EP 组内的 FSDP 度 | None |
+| `vpp_size` | 虚拟流水线并行度 | None |
+| `device_type` | 设备类型（`cuda`、`npu` 等）| `cuda` |
+| `sequence_parallel` | 启用 Megatron 风格序列并行 | False |
+
+我们举一个例子：
+
+```python
+sampler_device_mesh = DeviceMesh.from_sizes(dp_size=4)
+actor_device_mesh = DeviceMesh.from_sizes(dp_size=2, pp_size=2, tp_size=2)
+
+dataloader = DataLoader(...)
+sampler = vLLMSampler(..., device_mesh=sampler_device_mesh, remote_group=...)
+actor = MegatronModel(..., device_mesh=actor_device_mesh, remote_group=...)
+
+for data in dataloader:
+    sampler_output = sampler.sample(data)
+    input_data = [seq.new_input_feature for response in sampler_output for seq in response.sequences]
+    ...
+    model_output = actor.forward(input_data)
+```
+
+我们以上面的伪代码来分析数据传递情况。
+
+dataloader 取出数据 -> 按照 dp_size=4 分发给 sampler -> 按照 dp_size=4 收集数据 -> 按照 dp_size=2 分发给模型 -> 按照 dp_size=2 收集输出
+
+通过 DeviceMesh，数据流可以在各个 group 和组件之间平顺地流转。
+
+数据的分发判断由 DeviceMesh 的 `get_slice` 方法执行：
+
+```python
+batch[device_mesh.get_slice(len(batch))]
+```
+
+get_slice 会根据当前 rank，计算出当前 worker 属于哪个 dp 组，并获取对应的数据。该过程发生在 DataLoader 的 DeviceMeshSampler 中，同样发生在 remote_class 的 dispatch 和 collect 中。
+# DeviceMesh/DeviceGroup
+
+这两个类用于表达硬件资源分配和网络拓扑，Twinkle 的数据分发和收集也依赖它们。
+
+## DeviceGroup
+
+```python
+@dataclass
+class DeviceGroup:
+    name: str
+    ranks: Union[List[int], int]
+    device_type: str
+    visible_devices: Optional[str] = None  # Optional: explicitly set visible devices (e.g., "8,9")
+    gpus_per_worker: int = 1
+```
+
+- name: 资源组名
+- ranks: 占用的硬件列表；如果是 CPU 资源，仅支持 int 类型
+- device_type: 硬件类型，例如 GPU/CPU/NPU 等
+- visible_devices: 可见资源列表，用于希望仅使用部分 rank 的硬件的情况
+- gpus_per_worker: 每个 worker 占用多少硬件
+
+如果训练 RL，开发者可以构造多个这样的组，并将对应的模型、采样器分配进入其中。
+
+## DeviceMesh
+
+DeviceMesh 承载了组件拓扑、分布式并行信息，这个类会在组件内传递，用于数据分发和数据收集。
+
+```python
+@dataclass
+class DeviceMesh:
+    ...
+
+    @staticmethod
+    def from_sizes(*, world_size: int = 1, dp_size: int = 1, fsdp_size: int = None, tp_size: int = None,
+                   pp_size: int = None, ulysses_size: int = None, cp_size: int = None, ep_size: int = None,
+                   etp_size: int = None, vpp_size: int = None, device_type: str = 'cuda', sequence_parallel: bool = False) -> "DeviceMesh":
+        ...
+```
+
+推荐使用 `from_sizes` 来构造它。
+
 我们举一个例子：
 
 ```python
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\350\256\255\347\273\203\344\270\255\351\227\264\344\273\266/Padding-Free\350\256\255\347\273\203.md" "b/docs/source_zh/\347\273\204\344\273\266/\350\256\255\347\273\203\344\270\255\351\227\264\344\273\266/Padding-Free\350\256\255\347\273\203.md"
new file mode 100644
index 000000000..8bd783cff
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/\350\256\255\347\273\203\344\270\255\351\227\264\344\273\266/Padding-Free\350\256\255\347\273\203.md"
@@ -0,0 +1,52 @@
+# Padding-Free 训练
+
+Padding-free（也称为"打包"）训练通过将多个序列拼接到一个打包批次中，消除了对 padding token 的无效计算。Twinkle 支持标准注意力和 Qwen3.5 GatedDeltaNet 线性注意力的 padding-free 训练。
+
+## 工作原理
+
+不同于将所有序列填充到 `max_length`，padding-free 将多个序列打包到一行中，并使用 `position_ids` 跟踪序列边界，从而避免在 padding token 上浪费算力。
+
+```
+标准方式:   [tok tok tok PAD PAD PAD]  [tok tok PAD PAD PAD PAD]
+打包方式:   [tok tok tok tok tok ...]   ← 无 padding 浪费
+```
+
+## 使用方式
+
+通过 `PackingDataset` 或 `IterablePackingDataset` 启用：
+
+```python
+from twinkle.dataset import PackingDataset
+
+dataset = PackingDataset(
+    dataset=base_dataset,
+    max_length=8192,
+)
+```
+
+数据集会自动打包序列并生成正确的 `position_ids`，在序列边界处重置。
+
+## GatedDeltaNet 补丁（Qwen3.5）
+
+Qwen3.5 使用混合架构，融合了标准注意力和 GatedDeltaNet 线性注意力。原生 GatedDeltaNet 实现不会在打包序列边界处重置线性注意力状态。
+
+`GatedDeltaNetPaddingFreePatch` 通过以下方式修复：
+
+1. Patch `Qwen3_5DecoderLayer.forward`，将 `cu_seq_lens_q`（累积序列长度）传递给线性注意力层
+2. Patch `Qwen3_5GatedDeltaNet.forward`，使用支持 `cu_seqlens` 的 flash-linear-attention 内核（`causal_conv1d`、`chunk_gated_delta_rule`）
+
+在 Qwen3.5 模型上检测到 padding-free 时，补丁会自动应用。
+
+### 要求
+
+- 需安装 `flash-linear-attention` 包
+- 仅适用于含 GatedDeltaNet 层的 Qwen3.5 模型
+- 启用序列并行时，会使用 `Qwen3_5GatedDeltaNetUlyssesPatch` 替代
+
+## 注意力后端要求
+
+| 注意力后端 | Padding-Free 支持 |
+|-----------|-------------------|
+| FlashAttention2 | 完全支持 |
+| SDPA | 支持（不兼容序列并行） |
+| Eager | 不支持 |
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\350\256\255\347\273\203\344\270\255\351\227\264\344\273\266/TwinkleClient\345\256\242\346\210\267\347\253\257.md" "b/docs/source_zh/\347\273\204\344\273\266/\350\256\255\347\273\203\344\270\255\351\227\264\344\273\266/TwinkleClient\345\256\242\346\210\267\347\253\257.md"
new file mode 100644
index 000000000..327e2e40c
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/\350\256\255\347\273\203\344\270\255\351\227\264\344\273\266/TwinkleClient\345\256\242\346\210\267\347\253\257.md"
@@ -0,0 +1,81 @@
+# TwinkleClient 客户端
+
+`TwinkleClient` 是与 Twinkle REST API 交互的 Python 客户端，管理会话、训练任务和检查点。
+
+## 初始化
+
+```python
+from twinkle_client.manager import TwinkleClient
+
+client = TwinkleClient(
+    base_url='http://localhost:8000',   # 或 TWINKLE_SERVER_URL 环境变量
+    api_key='your-api-key',             # 或 TWINKLE_SERVER_TOKEN 环境变量
+    route_prefix='/twinkle',            # API 路由前缀
+    session_heartbeat_interval=10,      # 心跳间隔（秒）
+    session_metadata={'user': 'alice'}, # 可选的会话元数据
+)
+```
+
+初始化时客户端会：
+1. 将 `base_url` 和 `api_key` 设置到共享上下文（所有客户端对象自动使用）
+2. 创建服务端会话
+3. 启动后台心跳线程保持会话活跃
+
+## 健康检查
+
+```python
+is_healthy = client.health_check()  # 返回 True/False
+capabilities = client.get_server_capabilities()  # 支持的模型
+```
+
+## 训练任务
+
+```python
+# 列出训练任务
+runs = client.list_training_runs(limit=20, offset=0)
+
+# 带分页游标列出
+runs, cursor = client.list_training_runs_with_cursor(limit=20)
+
+# 获取特定任务
+run = client.get_training_run(run_id='run_abc123')
+
+# 按基础模型查找
+qwen_runs = client.find_training_run_by_model('Qwen/Qwen3.5-4B')
+```
+
+## 检查点
+
+```python
+# 列出训练任务的检查点
+checkpoints = client.list_checkpoints(run_id='run_abc123')
+
+# 获取检查点路径
+parsed = client.get_checkpoint_path(run_id, checkpoint_id)
+# parsed.path         → 文件系统路径
+# parsed.twinkle_path → twinkle:// URI
+
+# 获取最新检查点（用于恢复训练）
+latest_path = client.get_latest_checkpoint_path(run_id)
+
+# 删除检查点
+client.delete_checkpoint(run_id, checkpoint_id)
+```
+
+## 容量与权重信息
+
+```python
+# LoRA 容量
+capacity = client.get_capacity_info()
+# capacity.max_loras, capacity.used_loras, capacity.free_loras
+
+# 权重元数据
+info = client.get_weights_info('twinkle://run_id/weights/checkpoint')
+# info.base_model, info.is_lora, info.lora_rank
+```
+
+## 清理
+
+```python
+client.close()  # 停止心跳线程（也通过 atexit 自动注册）
+```
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\350\256\255\347\273\203\344\270\255\351\227\264\344\273\266/index.rst" "b/docs/source_zh/\347\273\204\344\273\266/\350\256\255\347\273\203\344\270\255\351\227\264\344\273\266/index.rst"
index 7174ce690..377098988 100644
--- "a/docs/source_zh/\347\273\204\344\273\266/\350\256\255\347\273\203\344\270\255\351\227\264\344\273\266/index.rst"
+++ "b/docs/source_zh/\347\273\204\344\273\266/\350\256\255\347\273\203\344\270\255\351\227\264\344\273\266/index.rst"
@@ -4,4 +4,8 @@
    :maxdepth: 1
 
    DeviceMesh和DeviceGroup.md
+   专家并行.md
+   序列并行.md
+   Padding-Free训练.md
    RemoteClass.md
+   TwinkleClient客户端.md
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\350\256\255\347\273\203\344\270\255\351\227\264\344\273\266/\344\270\223\345\256\266\345\271\266\350\241\214.md" "b/docs/source_zh/\347\273\204\344\273\266/\350\256\255\347\273\203\344\270\255\351\227\264\344\273\266/\344\270\223\345\256\266\345\271\266\350\241\214.md"
new file mode 100644
index 000000000..a0112249b
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/\350\256\255\347\273\203\344\270\255\351\227\264\344\273\266/\344\270\223\345\256\266\345\271\266\350\241\214.md"
@@ -0,0 +1,73 @@
+# 专家并行 (EP)
+
+专家并行将混合专家模型（MoE）的专家分布到多个 GPU 上，每个 rank 只持有部分专家。这降低了单卡显存占用，使大规模 MoE 模型的训练成为可能。
+
+## 概览
+
+| 概念 | 说明 |
+|------|------|
+| **ExpertParallelConfig** | 控制 EP 行为的配置数据类 |
+| **apply_expert_parallel()** | 入口函数，负责分片专家并替换前向传播 |
+| **shard_experts()** | 将专家均匀分配到各 EP rank |
+| **patch_forward()** | 将 MoE block 的 forward 替换为带 all-to-all 通信的 EP 版本 |
+
+## 配置
+
+```python
+from twinkle.model.transformers.moe.expert_parallel import ExpertParallelConfig
+
+config = ExpertParallelConfig(
+    enabled=True,              # 启用专家并行
+    router_dtype='fp32',       # 路由计算精度：'fp32', 'bf16', 'fp16'
+    keep_router_logits=True,   # 在输出中保留路由 logits
+    ignore_shared_experts=False,  # 跳过共享专家计算（如 DeepSeek）
+    ep_size=None,              # EP 并行度（由 TransformersModel 使用）
+)
+```
+
+## 配合 DeviceMesh 使用
+
+在 `DeviceMesh.from_sizes()` 中设置 `ep_size` 即可激活 EP。框架会在模型初始化时自动调用 `apply_expert_parallel()`。
+
+```python
+from twinkle.utils import DeviceMesh
+
+# 8 卡：2 路 EP × 4 路数据并行
+device_mesh = DeviceMesh.from_sizes(
+    world_size=8,
+    dp_size=4,
+    ep_size=2,
+)
+```
+
+EP + FSDP 组合分片：
+
+```python
+# 8 卡：2 路 EP，每个 EP 组内 2 路 FSDP
+device_mesh = DeviceMesh.from_sizes(
+    world_size=8,
+    dp_size=2,
+    ep_size=2,
+    ep_fsdp_size=2,
+)
+```
+
+## 通信模式
+
+EP 前向传播遵循 4 阶段流水线：
+
+1. **预处理** — 计算每个专家的 token 数量和分割大小
+2. **Token Pre-All2All** — 按专家分配排列 token，然后在 EP rank 间执行 all-to-all 交换
+3. **专家计算** — 每个 rank 在接收到的 token 上运行本地专家
+4. **Token Post-All2All** — all-to-all 交换结果，反排列并应用路由权重
+
+```
+输入 token → 路由器 → [预处理] → [pre_all2all] → [本地专家] → [post_all2all] → 输出
+```
+
+## 要求
+
+- `num_experts` 必须能被 `ep_size` 整除
+- `torch.distributed` 必须已初始化
+- MoE block 必须定义 `gate`/`router` 模块和 `experts`（支持 `nn.ModuleList` 或张量形式的 `gate_up_proj`/`down_proj`）
+- 共享专家（如 DeepSeek MoE）会自动处理，除非设置 `ignore_shared_experts=True`
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\350\256\255\347\273\203\344\270\255\351\227\264\344\273\266/\345\272\217\345\210\227\345\271\266\350\241\214.md" "b/docs/source_zh/\347\273\204\344\273\266/\350\256\255\347\273\203\344\270\255\351\227\264\344\273\266/\345\272\217\345\210\227\345\271\266\350\241\214.md"
new file mode 100644
index 000000000..e1d997188
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/\350\256\255\347\273\203\344\270\255\351\227\264\344\273\266/\345\272\217\345\210\227\345\271\266\350\241\214.md"
@@ -0,0 +1,68 @@
+# 序列并行 (SP)
+
+序列并行沿序列维度将长序列分割到多个 GPU 上，使训练能处理超出单卡显存的序列长度。Twinkle 实现了 Ulysses 风格的序列并行，并可选地支持派生环形注意力。
+
+## 概览
+
+| 概念 | 说明 |
+|------|------|
+| **SequenceParallelConfig** | SP 配置数据类 |
+| **SequenceParallelStrategy** | 封装 SP 生命周期的策略类 |
+| **SequenceParallel** | 核心实现，处理填充/分割/聚合 |
+
+## 配置
+
+```python
+from twinkle.model.transformers.strategy.sequence_parallel import SequenceParallelConfig
+
+config = SequenceParallelConfig(
+    enabled=True,           # 启用序列并行
+    ulysses_size=None,      # Ulysses SP 并行度（若为 None 则从 DeviceMesh 自动推导）
+    gather_logits=True,     # 前向后聚合 logits 用于损失计算
+)
+```
+
+## 配合 DeviceMesh 使用
+
+在 `DeviceMesh.from_sizes()` 中设置 `ulysses_size` 即可激活 SP：
+
+```python
+from twinkle.utils import DeviceMesh
+
+# 8 卡：4 路 Ulysses SP × 2 路数据并行
+device_mesh = DeviceMesh.from_sizes(
+    world_size=8,
+    dp_size=2,
+    ulysses_size=4,
+)
+```
+
+## 工作原理
+
+1. **填充** — 输入序列被填充到可被 SP 并行度整除的长度
+2. **分割** — 填充后的输入沿序列维度均匀分配到各 SP rank
+3. **分布式注意力** — FlashAttention2 被 patch 为在注意力计算前后执行 Ulysses all-to-all 通信
+4. **聚合** — 前向传播后，logits 被聚合回完整序列长度用于损失计算
+
+## 支持的注意力后端
+
+| 后端 | 状态 |
+|------|------|
+| FlashAttention2 | 完全支持（包括打包/padding-free 序列）|
+| SDPA | 支持（仅非打包批次）|
+| 派生环形注意力 | 仅支持 FlashAttention2（`rp_world_size > 1`）|
+
+## Qwen3.5 线性注意力
+
+SP 自动检测 Qwen3.5 GatedDeltaNet 线性注意力层，并应用 `Qwen3_5GatedDeltaNetUlyssesPatch`，确保混合注意力架构下序列并行的正确性。
+
+## MoE 辅助损失
+
+对于 MoE 模型，SP 自动安装前向 hook，在计算辅助损失前跨 SP rank 聚合路由 logits，确保负载均衡信号的正确性。
+
+## 关键约束
+
+- `num_key_value_heads` 必须能被 `ulysses_size` 整除（Ulysses 模式），否则回退到环形注意力
+- 打包/padding-free 批次需要 FlashAttention2
+- 派生环形注意力要求 `batch_size == 1`（打包格式）
+- `torch.distributed` 必须已初始化
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\351\200\232\347\237\245\345\231\250/index.rst" "b/docs/source_zh/\347\273\204\344\273\266/\351\200\232\347\237\245\345\231\250/index.rst"
new file mode 100644
index 000000000..8b3692d51
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/\351\200\232\347\237\245\345\231\250/index.rst"
@@ -0,0 +1,6 @@
+通知器
+===============
+.. toctree::
+   :maxdepth: 1
+
+   通知器.md
diff --git "a/docs/source_zh/\347\273\204\344\273\266/\351\200\232\347\237\245\345\231\250/\351\200\232\347\237\245\345\231\250.md" "b/docs/source_zh/\347\273\204\344\273\266/\351\200\232\347\237\245\345\231\250/\351\200\232\347\237\245\345\231\250.md"
new file mode 100644
index 000000000..823e0d459
--- /dev/null
+++ "b/docs/source_zh/\347\273\204\344\273\266/\351\200\232\347\237\245\345\231\250/\351\200\232\347\237\245\345\231\250.md"
@@ -0,0 +1,93 @@
+# 通知器
+
+通知器组件提供可插拔的通知系统，用于在训练过程中发送告警。当异常发生或训练事件需要关注时，通知器将消息投递到外部渠道（如钉钉 Webhook）。
+
+## 基础接口
+
+```python
+from twinkle.notifier import Notifier
+
+class Notifier:
+    def __call__(self, message: str):
+        """发送通知消息"""
+        ...
+
+    def to_dict(self) -> dict:
+        """序列化（用于 checkpoint 保存/恢复）"""
+        ...
+
+    @classmethod
+    def from_dict(cls, data: dict) -> Notifier:
+        """从序列化数据恢复"""
+        ...
+```
+
+## DingNotifier（钉钉通知）
+
+向钉钉自定义机器人 Webhook 发送通知。
+
+```python
+from twinkle.notifier import DingNotifier
+
+notifier = DingNotifier(
+    ding_url='https://oapi.dingtalk.com/robot/send?access_token=xxx',
+    secret='SECxxxxxxx',  # 可选：签名模式
+    timeout=5.0,
+)
+
+# 发送消息
+notifier("### 训练完成\n\n- Steps: 1000\n- Loss: 0.25")
+```
+
+**参数：**
+- `ding_url`：完整的钉钉 Webhook URL（含 access_token）
+- `secret`：可选签名密钥（签名模式机器人）
+- `timeout`：HTTP 请求超时时间，单位秒（默认 5.0）
+
+消息以钉钉 **Markdown** 格式发送。第一个标题行自动提取为聊天预览标题。
+
+## 异常通知
+
+Twinkle 提供带去重的自动异常通知：
+
+```python
+from twinkle.notifier.base import notify_exception
+
+# 自动发送格式化的异常信息
+# 每个唯一异常只有一个 rank 发送（防止消息洪泛）
+try:
+    model.forward_backward(batch)
+except Exception as e:
+    notify_exception(notifier, context='forward_backward', exc=e, name='sft_train')
+```
+
+通知包含：
+- 异常类型和消息
+- 完整堆栈跟踪
+- 运行时元数据（rank、PID、主机名）
+- 去重：所有 rank 中每个唯一异常只发一条通知
+
+## 自定义通知器
+
+继承 `Notifier` 创建自定义通知器：
+
+```python
+from twinkle.notifier import Notifier
+
+class SlackNotifier(Notifier):
+    def __init__(self, webhook_url: str):
+        self.webhook_url = webhook_url
+
+    def __call__(self, message: str):
+        import requests
+        requests.post(self.webhook_url, json={'text': message})
+
+    def to_dict(self):
+        return {'class': 'SlackNotifier', 'webhook_url': self.webhook_url}
+
+    @classmethod
+    def _from_dict_impl(cls, data):
+        return cls(webhook_url=data['webhook_url'])
+```
+
+> 通知器通过 `__init_subclass__` 自动注册，因此 `Notifier.from_dict()` 可以按类名恢复任何子类。
diff --git a/new_feature.txt b/new_feature.txt
new file mode 100644
index 000000000..39ce0d763
--- /dev/null
+++ b/new_feature.txt
@@ -0,0 +1,204 @@
+# Native ML LLM Control
+
+基于 Twinkle Server Mode 架构，实现 LLM Agent 驱动的 TUI 训练控制系统。面向零基础开发者，通过自然语言对话完成模型训练的全生命周期管理。
+
+
+## 一、整体架构
+
+采用"无状态客户端 + 有状态服务端"的 Server Mode 架构：
+
+- **Server 端（Ray 集群）**：模型权重、LoRA adapter、optimizer 状态、LR scheduler 全部驻留在 GPU 内存
+- **Client 端（TUI + 训练脚本）**：完全无状态，仅负责数据加载、训练循环逻辑、指标上报
+- **核心特性**：杀死 client = 暂停（server 保留全部状态），重启 client = 恢复（零成本继续训练）
+
+支持两种运行模式：
+1. **本地自建**：启动本地 Ray 集群 + Twinkle Server，需评估 GPU 资源和 DeviceMesh
+2. **线上云服务**：连接 ModelScope 托管服务（`http://www.modelscope.cn/twinkle`），无需本地 GPU
+
+
+## 二、TUI 界面（基于 Textual 框架）
+
+TUI 采用 Grid 布局，包含四个核心面板：
+
+1. **状态栏（StatusBar）**：顶部横跨，显示当前训练 run_id、状态、步数进度
+2. **指标面板（MetricsPanel）**：左上区域，绘制 loss / reward / grad_norm 等指标曲线，支持自然语言控制放大缩小和还原
+3. **对话面板（ChatPanel）**：左下区域，用户与 LLM agent 的对话界面，支持 UTF-8 中文输入输出
+4. **日志面板（LogPanel）**：右侧纵向，滚动显示训练运行日志
+
+快捷键：`q` 退出、`Ctrl+P` 切换指标面板、`Ctrl+L` 清空日志。
+
+关闭 TUI 后训练不中断（Server 端状态不受影响），重新打开 TUI 可继续监控。
+
+
+## 三、LLM Agent 系统
+
+### 3.1 对话式 Agent（AgentLoop）
+
+用户通过 Chat 面板与 Agent 对话，Agent 通过 tool_call 执行训练管理操作：
+
+| 工具 | 功能 |
+|------|------|
+| `list_training_runs` | 列出所有活跃和历史训练任务 |
+| `get_training_status` | 获取指定 run 的状态和近期指标 |
+| `pause_training` | 暂停训练（SIGKILL client，server 保留状态） |
+| `resume_training` | 恢复训练（重启 client 脚本） |
+| `stop_training` | 优雅停止（SIGTERM，脚本自动保存 checkpoint + dataloader 位置后退出） |
+| `list_supported_models` | 查询 Server 支持的模型列表（本地/云端） |
+| `search_datasets` | 在 ModelScope 搜索数据集 |
+| `search_models` | 在 ModelScope 搜索模型 |
+| `zoom_metrics` | 自然语言控制指标图表缩放 |
+
+### 3.2 自动监控（TrainingMonitor）
+
+后台 LLM 定期（默认 30 秒）读取 metrics 和 logs，进行趋势分析：
+
+- 检测异常：loss 突增/NaN、reward 停滞、gradient 爆炸/消失、KL 散度过大、entropy 坍塌
+- 主动建议：调整学习率、更换 reward 组合、增加 num_generations
+- 通过 Chat 面板推送诊断报告（`[Monitor] ...`）
+- 无硬编码规则，所有分析由 LLM 推理完成
+
+### 3.3 Skills 可扩展框架
+
+通过 `SkillManager` + `ModelScopeSkillProvider` 加载技能文档，为 Agent 提供领域知识：
+
+- `skills/twinkle-training.md`：训练脚本编写指导（1260+ 行）
+- `skills/autoresearch.md`：自动化研究实验设计（256 行）
+- 支持从 ModelScope 远程加载社区共享 Skills
+
+
+## 四、训练控制机制
+
+### 4.1 训练进程管理
+
+| 操作 | 信号 | 行为 | 恢复方式 |
+|------|------|------|----------|
+| 暂停 | SIGKILL | 立即杀死 client | 重启同一脚本（adapter_name 相同即可继续） |
+| 停止 | SIGTERM | 脚本保存 checkpoint + dataloader 位置后退出 | `model.resume_from_checkpoint()` + `dataloader.resume_from_checkpoint()` |
+| 修改超参 | SIGKILL → 编辑 → 重启 | 新配置生效，optimizer 状态保留 | 使用相同 adapter_name |
+| 重置训练 | 使用新 adapter_name | 全新开始 | 旧 adapter 按 `adapter_timeout` 自动清理 |
+
+### 4.2 优雅退出（SIGTERM）
+
+每个训练脚本必须注册 graceful shutdown handler：
+
+```python
+rt = TrainingRuntime(run_id='my-exp')
+rt.register_graceful_shutdown(model, dataloader)
+```
+
+收到 SIGTERM 后自动执行：
+1. 保存模型 checkpoint（含 optimizer 状态）
+2. 记录 dataloader 已消费的样本数（`consumed_train_samples`）
+3. 调用 `rt.finish(status='stopped')` 记录停止状态
+4. 安全退出
+
+
+## 五、数据通信（TUI ↔ 训练脚本）
+
+训练脚本通过 `TrainingRuntime` 写入本地 JSONL 文件，TUI 通过 `LocalConnection` 读取：
+
+```
+~/.cache/twinkle/{run_id}/
+├── meta.json       # 运行元信息（model_id、config、status、pid、script_path、script_version）
+├── train.py        # 当前活跃版本（始终是最新的）
+├── train_v1.py     # 归档：第1版脚本（出错的原始版本）
+├── train_v2.py     # 归档：第2版脚本（如果也有问题）
+├── metrics.jsonl   # 每步一行 JSON（step, loss, reward, grad_norm, lr, ...）
+└── logs.jsonl      # 事件日志（timestamp + message）
+```
+
+**脚本命名与版本管理规则：**
+- `train.py` 始终是当前活跃版本，`resume_training` 只执行它
+- 当 Agent 修复脚本时，旧版自动归档为 `train_v{N}.py`（保留完整修改历史）
+- `meta.json` 中 `script_version` 字段记录当前版本号
+- `run_id` 由用户定义（如 `'grpo-gsm8k'`、`'sft-self-cognition'`）
+- 同一 `run_id` 下可多次修改脚本：server 端 adapter 状态不变，只有 client 逻辑更新
+
+**脚本更新流程（Agent 自动执行）：**
+1. 脚本出错停止 → Agent 读取 logs/metrics 诊断问题
+2. Agent 调用 `update_script(run_id, new_code)` → 旧 `train.py` 归档为 `train_v{N}.py`，新代码写入 `train.py`
+3. Agent 调用 `resume_training(run_id)` → 重新执行最新的 `train.py`
+
+**TUI 通过 PID 进行进程控制：**
+- `meta.json` 记录 `pid`（进程 ID）
+- 暂停 = `os.kill(pid, SIGKILL)`
+- 停止 = `os.kill(pid, SIGTERM)` → 脚本优雅保存 checkpoint
+- 恢复 = `subprocess.Popen(['python', script_path])` → 新 PID 写回 meta
+
+TUI 支持增量读取（tail 模式），避免大文件全量加载。
+
+
+## 六、训练前规划（Pre-Training Planning）
+
+Agent 在编写训练脚本前，必须完成以下评估（本地模式）：
+
+1. **集群资源评估**：GPU 数量、型号、显存（`nvidia-smi` / `ray status`）
+2. **模型显存估算**：LoRA training ≈ model weights + 20% overhead
+3. **DeviceMesh 设计**：根据 GPU 数决定 model vs sampler 分配（决策树）
+4. **训练时间预估**：`total_steps × time_per_step`
+5. **数据集搜索**：通过 ModelScope API 或 `search_datasets` 工具
+6. **模型选择**：根据任务类型和资源约束推荐
+
+云服务模式下可跳过 1-4，直接进入数据集和模型选择。
+
+`list_supported_models` 工具用于查询 Server 实际支持的模型列表，避免选择不可用模型。
+
+
+## 七、Skills 文档内容
+
+### twinkle-training.md（训练脚本编写指导）
+
+覆盖以下内容：
+- Pre-Training Planning 完整流程
+- Ray 集群配置（DeviceGroup / DeviceMesh / initialize）
+- 模型后端（Transformers / Megatron）初始化
+- Dataset 加载、Template 编码、Preprocessor 使用
+- 所有训练方式示例：SFT、GRPO、DPO、GKD、PT
+- Server Mode 完整说明（本地自建 + 云服务两种模式）
+- Cloud Service Mode（ModelScope 托管，两种客户端 API 对比）
+- Sampler 配置与权重同步
+- MultiTurnRollout 多轮对话采样
+- TUI 集成：TrainingRuntime、指标上报、优雅退出
+- 实验管理文件夹规范
+
+### autoresearch.md（自动化研究实验设计）
+
+指导 Agent 如何：
+- 分析用户需求，选取合适的训练方法
+- 根据资源约束选择模型规模
+- 配置超参数（SFT / GRPO / DPO 默认值和调优建议）
+- 设计多阶段 Pipeline（数据清洗 → SFT → GRPO/DPO）
+- 编写数据清洗和转换流程
+- 组织实验输出文件夹
+
+
+## 八、训练脚本规范
+
+所有训练脚本必须：
+
+1. 使用 **Server Mode 语法**：`twinkle_client`（模型操作）+ `twinkle`（数据处理）
+2. 连接到运行中的 Twinkle Server（本地或云端）
+3. 注册 SIGTERM graceful shutdown handler
+4. 通过 `TrainingRuntime` 上报所有可用指标（loss, reward, grad_norm, lr, ...）
+5. 每个实验独立文件夹，包含 `plan.md`、`config.yaml`、`train.py`、`train.sh`
+
+两种客户端 API：
+- **Twinkle 原生**：`init_twinkle_client()` → `MultiLoraTransformersModel` → `forward_backward()` → `clip_grad_and_step()`
+- **Tinker 兼容**：`init_tinker_client()` → `ServiceClient` → `create_lora_training_client()` → `forward_backward()` → `optim_step()`
+
+
+## 九、插件化架构与组件约束
+
+Twinkle 是一个插件化框架，所有核心组件（loss、preprocessor、metric、sampler 等）均以插件形式注册，本地模式下支持用户编写新组件。
+
+### 本地自建模式
+- 完全可扩展：可编写自定义 loss、preprocessor、metric、reward function
+- 通过装饰器注册（如 `@register_loss('MyLoss')`），然后在训练脚本中以字符串名引用
+
+### 线上云服务模式（ModelScope 托管）
+- **安全限制**：不支持传入类、函数对象或 pickle 序列化
+- **只能使用已注册的内置组件**，以字符串名引用
+- 内置 Loss：`CrossEntropyLoss`、`GRPOLoss`、`DPOLoss`、`GKDLoss`
+- 内置 Preprocessor：`SFTPreprocessor`、`RLPreprocessor`、`DPOPreprocessor`
+
+Agent 编写脚本时必须判断目标环境：本地可用自定义组件，云端只能用内置名称。
diff --git a/pyproject.toml b/pyproject.toml
index 5daa14fe9..484dbc9e8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,24 +16,28 @@ dependencies = [
   "transformers",
   "typer>=0.9.0",
   "pyzmq",
+  "accelerate",
+  "torch>=2.6.0,<3.0.0",
 ]
 
 [project.scripts]
 twinkle-server = "twinkle.server.cli:main"
+twinkle-tui = "twinkle_client.tui:main"
 
 [project.optional-dependencies]
-transformers = [
-  "accelerate",
-  "torch>=2.6.0,<3.0.0",
-  "torchvision",
-]
-kernels = ["kernels"]
 megatron = ["megatron-core>=0.12.0", "transformer-engine[pytorch]", "mcore_bridge"]
-vllm = ["vllm>=0.11"]
-ray = ["ray[serve]"]
-datajuicer = ["py-data-juicer"]
-tinker = ["tinker==0.14.0"]
-test = ["hypothesis>=6.0", "pytest", "pytest-asyncio"]
+data = ["py-data-juicer"]
+rl = [
+  "vllm>=0.11",
+  "ray[serve]"
+]
+client = [
+  "textual>=1.0.0",
+  "plotext>=5.2.0",
+  "openai>=1.0.0",
+  "httpx>=0.25.0",
+  "tinker==0.16.1",
+]
 server = [
   "redis>=5.0",
   "psutil>=5.9.0",
@@ -43,6 +47,11 @@ server = [
   "opentelemetry-exporter-otlp",
   "opentelemetry-instrumentation-logging",
 ]
+test = [
+  "hypothesis>=6.0", 
+  "pytest", 
+  "pytest-asyncio"
+]
 docs = [
   "sphinx>=5.3.0,<6.0.0",
   "docutils>=0.16.0,<0.17.0",
@@ -68,3 +77,6 @@ build-backend = "setuptools.build_meta"
 
 [tool.setuptools.packages.find]
 where = ["src"]
+
+[tool.setuptools.package-data]
+"twinkle_client.skills.bundled" = ["*.md"]
diff --git a/src/twinkle/checkpoint_engine/manager.py b/src/twinkle/checkpoint_engine/manager.py
index cde5c519d..3860d2840 100644
--- a/src/twinkle/checkpoint_engine/manager.py
+++ b/src/twinkle/checkpoint_engine/manager.py
@@ -1,6 +1,5 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 # Adapted from https://github.com/volcengine/verl/blob/main/verl/checkpoint_engine/base.py
-import time
 from typing import List, Optional
 
 from twinkle import Platform, get_logger
diff --git a/src/twinkle/checkpoint_engine/mixin.py b/src/twinkle/checkpoint_engine/mixin.py
index e2e5d94d5..8dc15c926 100644
--- a/src/twinkle/checkpoint_engine/mixin.py
+++ b/src/twinkle/checkpoint_engine/mixin.py
@@ -1,5 +1,4 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
-import os
 
 from twinkle import Platform, remote_function
 from twinkle.checkpoint_engine.base import CheckpointEngine
diff --git a/src/twinkle/cli/cli.py b/src/twinkle/cli/cli.py
index 1730887f2..085ad7ec6 100644
--- a/src/twinkle/cli/cli.py
+++ b/src/twinkle/cli/cli.py
@@ -1,12 +1,11 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
-from __future__ import annotations
-
 import os
 import sys
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field, fields
 from pathlib import Path
-from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Type, Union
+from typing import Any, Iterator, Literal
+
 
 # ────────────────────────────────────────────────────────────────────────────────
 # Arg group dataclasses
@@ -46,6 +45,7 @@ class LoraArgs:
     lora_dropout: float = 0.05
     lora_target_modules: list[str] | None = None
     adapter_name: str = 'default'
+    lora_path: str | None = None
 
 
 @dataclass
@@ -84,6 +84,7 @@ class TrainingArgs:
     log_interval: int = 10
     eval_interval: int | None = None
     eval_samples: int | None = None
+    train_samples: int | None = None
     resume_from_checkpoint: str | None = None
     resume_only_model: bool = False
     ignore_data_skip: bool = False
@@ -117,9 +118,11 @@ class SchedulerArgs:
 @dataclass
 class LossArgs:
     loss_cls: str = 'CrossEntropyLoss'
+    loss_type: str = 'sigmoid'
     epsilon: float = 0.2
     epsilon_high: float | None = None
-    beta: float = 0.0
+    beta: float = 0.1
+    sft_weight: float = 1.0
     entropy_coef: float = 0.0
     ignore_index: int = -100
 
@@ -155,6 +158,7 @@ class InfraArgs:
     ncpu_proc_per_node: int = 8
     model_gpus: int | None = None
     sampler_gpus: int | None = None
+    ref_model_gpus: int | None = None
     world_size: int | None = None
     dp_size: int | None = None
     fsdp_size: int | None = None
@@ -185,6 +189,11 @@ class RLArgs:
     advantage_type: str = 'GRPOAdvantage'
     advantage_scale: Literal['group', 'batch', 'none'] = 'group'
     reward_fns: list[str] | None = None
+    student_model_id: str | None = None
+    teacher_model_id: str | None = None
+    gkd_beta: float = 0.5
+    gkd_temperature: float = 1.0
+    gkd_topk: int = 64
 
 
 @dataclass
@@ -192,6 +201,7 @@ class CheckpointArgs:
     save_optimizer: bool = True
     merge_and_sync: bool = True
     platform: str = 'GPU'
+    lora_sync_dir: str | None = None
 
 
 # ────────────────────────────────────────────────────────────────────────────────
@@ -243,7 +253,7 @@ def _resolve_path(self) -> Path | None:
 class EnvVarSource(ConfigSource):
     """Reads os.environ; recognizes TWINKLE_ prefix and any key known to the registry."""
 
-    def __init__(self, registry: ConfigRegistry):
+    def __init__(self, registry: 'ConfigRegistry'):
         self._registry = registry
 
     def load(self) -> dict[str, str]:
diff --git a/src/twinkle/data_format/output.py b/src/twinkle/data_format/output.py
index 763ef246f..596252fb6 100644
--- a/src/twinkle/data_format/output.py
+++ b/src/twinkle/data_format/output.py
@@ -20,11 +20,13 @@ class ModelOutput(TypedDict, total=False):
         loss: The loss calculated by the model.
         logps: The log-probabilities of correct tokens by the model.
         num_tokens: The token denominator associated with ``loss``.
+        embeddings: The embeddings output by the model, used by embedding tasks.
     """
     logits: Optional[OutputType]
     loss: Optional[OutputType]
     logps: Optional[OutputType]
     num_tokens: Optional[OutputType]
+    embeddings: Optional[OutputType]
 
 
 class LossOutput(TypedDict, total=False):
diff --git a/src/twinkle/data_format/sampling.py b/src/twinkle/data_format/sampling.py
index 1d5fe07c6..01ff0377d 100644
--- a/src/twinkle/data_format/sampling.py
+++ b/src/twinkle/data_format/sampling.py
@@ -1,5 +1,4 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
-import numpy as np
 from dataclasses import dataclass
 from typing import Any, Dict, List, Literal, Optional, Sequence, Tuple, Union
 
diff --git a/src/twinkle/dataloader/dataloader.py b/src/twinkle/dataloader/dataloader.py
index c392d56cf..408c8d4b4 100644
--- a/src/twinkle/dataloader/dataloader.py
+++ b/src/twinkle/dataloader/dataloader.py
@@ -146,7 +146,7 @@ def _tracking_iter(self, inner):
     def skip_consumed_samples(self, consumed_train_samples: int) -> None:
         from torch.utils.data import IterableDataset
 
-        if isinstance(self.dataset, IterableDataset):
+        if isinstance(self.dataset, IterableDataset) or consumed_train_samples is None or consumed_train_samples <= 0:
             warnings.warn('IterableDataset does not support consumed-data skipping; continuing without skipping.')
             self._skip_samples = 0
             return
@@ -164,6 +164,7 @@ def resume_from_checkpoint(self, consumed_train_samples, **kwargs):
 
     @remote_function()
     def get_state(self) -> dict:
+        """The dataloader state for saving."""
         return {'consumed_train_samples': self._consumed_train_samples}
 
     def _rebuild_sampler_stack(self):
diff --git a/src/twinkle/dataset/iterable_packing_dataset.py b/src/twinkle/dataset/iterable_packing_dataset.py
index ca7c6fbd8..ab1d3a982 100644
--- a/src/twinkle/dataset/iterable_packing_dataset.py
+++ b/src/twinkle/dataset/iterable_packing_dataset.py
@@ -88,10 +88,27 @@ def _fetch_data_out_queue(self, last_res, num_samples):
         last_res += res
         return last_res
 
-    @staticmethod
-    def _cyclic_iter(iterable):
-        while True:
-            yield from iterable
+    def _write_through_iter(self, iterable):
+        """Yield rows from ``iterable`` while also saving them to disk if needed.
+        Saving is needed when several datasets are used at the same time.
+        """
+        if not self.cyclic:
+            for row in iterable:
+                self._write_through(row)
+                yield row
+            return
+        else:
+            first_pass = True
+            while True:
+                empty = True
+                for row in iterable:
+                    empty = False
+                    if first_pass:
+                        self._write_through(row)
+                    yield row
+                if empty:
+                    return
+                first_pass = False
 
     @remote_function()
     def __iter__(self):
@@ -102,10 +119,7 @@ def __iter__(self):
         except StopIteration:
             return
 
-        if self.cyclic:
-            iterator = self._cyclic_iter(self.dataset)
-        else:
-            iterator = iter(self.dataset)
+        iterator = self._write_through_iter(self.dataset)
         data = []
         max_length = self.template.max_length or 2048
         while True:
diff --git a/src/twinkle/dataset/packing_dataset.py b/src/twinkle/dataset/packing_dataset.py
index fa4acbd57..ada9498b8 100644
--- a/src/twinkle/dataset/packing_dataset.py
+++ b/src/twinkle/dataset/packing_dataset.py
@@ -114,6 +114,8 @@ def __getitem__(self, index):
         assert self._packed_called, 'Call `pack_dataset()` first before index the sample.'
         sequence = self.packed_idx[index]
         rows = [self.dataset[i] for i in sequence]
+        for row in rows:
+            self._write_through(row)
         output = {}
         for key in rows[0]:
             output[key] = [r[key] for r in rows]
diff --git a/src/twinkle/gym/__init__.py b/src/twinkle/gym/__init__.py
deleted file mode 100644
index 44b0771bb..000000000
--- a/src/twinkle/gym/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-# Copyright (c) ModelScope Contributors. All rights reserved.
-from .base import Gym
diff --git a/src/twinkle/gym/base.py b/src/twinkle/gym/base.py
deleted file mode 100644
index aca798093..000000000
--- a/src/twinkle/gym/base.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# Copyright (c) ModelScope Contributors. All rights reserved.
-
-
-class Gym:
-
-    def __init__(self):
-        pass
-
-    def step(self):
-        pass
diff --git a/src/twinkle/infra/__init__.py b/src/twinkle/infra/__init__.py
index 83e10d132..a2760c900 100644
--- a/src/twinkle/infra/__init__.py
+++ b/src/twinkle/infra/__init__.py
@@ -1,11 +1,9 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 import functools
 import inspect
-import itertools
 import json
 import numpy as np
 import os
-import random
 import sys
 from typing import Any, Callable, List, Literal, Optional, TypeVar, Union
 
@@ -59,7 +57,7 @@ def _tag_exc(exc: BaseException, caller: Optional[str]) -> None:
             prefix = f'[twinkle driver caller: {caller}] '
             exc.args = (prefix + str(exc.args[0]), *exc.args[1:]) if exc.args else (prefix.rstrip(), )
             exc._twinkle_caller_augmented = True
-    except Exception:  # noqa: BLE001
+    except Exception:  # noqa
         pass
 
 
@@ -404,6 +402,7 @@ def dispatch_func(arg, n):
 
         return result
     elif dispatch == 'slice_dp':
+        assert device_mesh is not None
         # split by dp. each worker in one ep will receive the same argument
         result = []
         # if device_mesh is not None:
@@ -420,14 +419,6 @@ def dispatch_func(arg, n):
             import torch
             if isinstance(arg, list) or isinstance(arg, torch.Tensor):
                 _args = []
-                if device_mesh is None:
-                    total = len(arg)
-                    chunk = max(1, (total + n - 1) // n)
-                    for i in range(n):
-                        start = i * chunk
-                        end = min(total, start + chunk)
-                        _args.append(arg[start:end])
-                    return _args
                 for i in range(n):
                     _args.append(arg[device_mesh.get_slice(
                         len(arg), device_mesh.get_data_rank_from_global_rank(i * _rank_stride))])
@@ -696,11 +687,12 @@ def __next__(_self):
     return decorator
 
 
-def remote_function(dispatch: Union[Literal['slice', 'all', 'slice_dp'], Callable] = 'slice',
+def remote_function(dispatch: Union[Literal['slice', 'all', 'slice_dp', 'last_pp_first'], Callable] = 'slice',
                     execute: Literal['first', 'peer', 'all'] = 'all',
                     collect: Union[Literal['none', 'flatten', 'mean', 'sum', 'first', 'last_pp'], Callable] = 'none',
                     sync: bool = False,
-                    lazy_collect: Optional[bool] = None):
+                    lazy_collect: Optional[bool] = None,
+                    timeout: Optional[float] = None):
     """Patch each method called from remote(which class should be decorated with `remote_class`) with this decorator.
 
     Args:
@@ -726,6 +718,7 @@ def remote_function(dispatch: Union[Literal['slice', 'all', 'slice_dp'], Callabl
         sync: If True, use synchronous execution (execute_all_sync) instead of async.
             Required for methods with NCCL collective operations (e.g., Megatron forward_backward).
         lazy_collect: Do lazy collect, this boolean value decides whether this function needs lazy collect. If setting to None, it will follow the global setting.
+        timeout: Timeout in seconds for ray.get() when collecting results. Instance attribute ``_ray_get_timeout`` overrides this.
     """ # noqa
 
     def decorator(func: Callable[..., T1]) -> Callable[..., T1]:
@@ -773,7 +766,9 @@ def wrapper(self, *args, **kwargs) -> T1:
 
                         result = execute_method(func.__name__, _workers_and_args)
                         # This is a result future, call it to get the actual result
-                        result_func = RayHelper.do_get_and_collect_func(_collect_func, collect, result, device_mesh)
+                        _rgt = getattr(self, '_ray_get_timeout', None) or timeout
+                        result_func = RayHelper.do_get_and_collect_func(
+                            _collect_func, collect, result, device_mesh, timeout=_rgt)
                         _local_lazy_collect = _lazy_collect
                         if func.__name__ == '__iter__':
                             # return self
@@ -803,18 +798,13 @@ def wrapper(self, *args, **kwargs) -> T1:
                             # And this is user independent, only decided by the code.
                             _local_lazy_collect = self._lazy_collect
                         if _local_lazy_collect:
-                            # Wrap the deferred collector so that exceptions
-                            # raised when the caller later materializes the
-                            # result also trigger the notifier. Attributes
-                            # (``_futures`` etc.) on the original collector
-                            # are preserved for downstream code paths.
                             _orig_result_func = result_func
 
                             @functools.wraps(_orig_result_func)
                             def _notifying_result_func(*rargs, **rkwargs):
                                 try:
                                     return _orig_result_func(*rargs, **rkwargs)
-                                except Exception as _e:  # noqa: BLE001
+                                except Exception as _e:  # noqa
                                     _tag_exc(_e, _caller)
                                     notify_exception(_notifier, _ctx, _e, _name)
                                     raise
diff --git a/src/twinkle/infra/_ray/ray_helper.py b/src/twinkle/infra/_ray/ray_helper.py
index 0d8908a35..5cd792c3a 100644
--- a/src/twinkle/infra/_ray/ray_helper.py
+++ b/src/twinkle/infra/_ray/ray_helper.py
@@ -161,18 +161,20 @@ def get_node_address():
         return ip, port
 
     @staticmethod
-    def do_get_and_collect_func(collect_func: Callable, method: Union[str, Callable], futures, device_mesh):
+    def do_get_and_collect_func(collect_func: Callable, method: Union[str, Callable], futures, device_mesh,
+                                timeout=None):
         """Return a callable to collect results in the workers."""
 
         class LazyCollect:
 
-            def __init__(self, futures, method, collect_func, device_mesh):
+            def __init__(self, futures, method, collect_func, device_mesh, timeout=None):
                 self._futures = futures
                 self._method = method
                 self._collect_func = collect_func
                 self._is_lazy_collect = True
                 self.device_mesh = device_mesh
                 self._result = None  # Cache collected results
+                self._timeout = timeout
 
             def _get_result(self):
                 """Internal method to lazily collect and cache results"""
@@ -181,7 +183,7 @@ def _get_result(self):
                     result = []
                     for future in self._futures:
                         if isinstance(future, ray.ObjectRef):
-                            result.append(ray.get(future))
+                            result.append(ray.get(future, timeout=self._timeout))
                         else:
                             result.append(future)
                     self._result = self._collect_func(self._method, result, device_mesh=self.device_mesh)
@@ -199,7 +201,7 @@ def __len__(self):
                 """Support len() function"""
                 return len(self._get_result())
 
-        return LazyCollect(futures, method, collect_func, device_mesh)
+        return LazyCollect(futures, method, collect_func, device_mesh, timeout=timeout)
 
     @staticmethod
     def do_get_and_collect(args, kwargs):
diff --git a/src/twinkle/loss/chunked_cross_entropy.py b/src/twinkle/loss/chunked_cross_entropy.py
index 22d3d4077..061ca2168 100644
--- a/src/twinkle/loss/chunked_cross_entropy.py
+++ b/src/twinkle/loss/chunked_cross_entropy.py
@@ -1,63 +1,159 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
-import math
-from typing import Any
-
-from ..data_format import LossOutput
+from twinkle.data_format import LossOutput
 from .base import Loss
 
+# Lazily-built singleton autograd.Function, so we neither pay the
+# class-construction cost on every forward nor force a top-level torch import.
+_CHUNKED_CE_FUNC = None
+
+
+def _get_chunked_ce_func():
+    global _CHUNKED_CE_FUNC
+    if _CHUNKED_CE_FUNC is not None:
+        return _CHUNKED_CE_FUNC
+
+    import torch
+    import torch.nn.functional as F
+
+    class _ChunkedCrossEntropyFunc(torch.autograd.Function):
+        """Chunked CE that materialises log_softmax(B, V) only one chunk at a time.
+
+        Forward returns a scalar loss; backward writes per-token gradients into
+        a freshly allocated `grad_logits` tensor (the input `logits` is never
+        mutated). Mathematically equivalent to ``CrossEntropyLoss`` in the same
+        package; ``chunk_size`` only controls the memory/throughput trade-off.
+        """
+
+        @staticmethod
+        def forward(ctx, logits, labels, chunk_size, ignore_index, reduction, dft):
+            ctx.save_for_backward(logits, labels)
+            ctx.chunk_size = chunk_size
+            ctx.ignore_index = ignore_index
+            ctx.reduction = reduction
+            ctx.dft = dft
+
+            n = logits.shape[0]
+            # Use fp32 accumulators so we don't lose precision when summing
+            # over many tokens under fp16/bf16 autocast (matches cross_entropy.py).
+            total_loss = logits.new_zeros((), dtype=torch.float32)
+            total_count = logits.new_zeros((), dtype=torch.float32)
+
+            for start in range(0, n, chunk_size):
+                end = min(start + chunk_size, n)
+                logits_chunk = logits[start:end]
+                labels_chunk = labels[start:end]
+                mask = (labels_chunk != ignore_index).float()
+
+                logps = F.log_softmax(logits_chunk, dim=-1).gather(
+                    -1, labels_chunk.clamp(min=0).unsqueeze(-1)).squeeze(-1)
+                per_token = -logps * logps.exp() if dft else -logps
+
+                total_loss = total_loss + (per_token * mask).sum()
+                total_count = total_count + mask.sum()
+
+            ctx.num_tokens = total_count.detach()
+            if reduction == 'mean':
+                return total_loss / total_count.clamp(min=1)
+            return total_loss
+
+        @staticmethod
+        def backward(ctx, grad_output):
+            logits, labels = ctx.saved_tensors
+            chunk_size = ctx.chunk_size
+            ignore_index = ctx.ignore_index
+            reduction = ctx.reduction
+            dft = ctx.dft
+
+            if reduction == 'mean':
+                scale = grad_output / ctx.num_tokens.clamp(min=1)
+            else:
+                scale = grad_output
+
+            grad_logits = torch.empty_like(logits)
+            n = logits.shape[0]
+
+            for start in range(0, n, chunk_size):
+                end = min(start + chunk_size, n)
+                logits_chunk = logits[start:end].detach().requires_grad_(True)
+                labels_chunk = labels[start:end]
+                mask = (labels_chunk != ignore_index).float()
+
+                with torch.enable_grad():
+                    logps = F.log_softmax(logits_chunk, dim=-1).gather(
+                        -1, labels_chunk.clamp(min=0).unsqueeze(-1)).squeeze(-1)
+                    per_token = -logps * logps.exp() if dft else -logps
+                    loss_chunk = (per_token * mask).sum()
+
+                grad_chunk = torch.autograd.grad(loss_chunk, logits_chunk, retain_graph=False)[0]
+                grad_logits[start:end] = grad_chunk * scale
+
+            # logits, labels, chunk_size, ignore_index, reduction, dft
+            return grad_logits, None, None, None, None, None
+
+    _CHUNKED_CE_FUNC = _ChunkedCrossEntropyFunc
+    return _CHUNKED_CE_FUNC
+
 
 class ChunkedCrossEntropyLoss(Loss):
-    """TODO untested code"""
+    """CE loss that chunks the (B, V) softmax to bound peak memory.
+
+    Drop-in replacement for :class:`CrossEntropyLoss` when ``outputs['logits']``
+    is large (e.g. long sequence x big vocab). The result is mathematically
+    equivalent to that loss; ``chunk_size`` only affects memory/throughput.
+
+    Args:
+        chunk_size: How many rows of ``logits`` to process per chunk.
+        ignore_index: Label id treated as padding (excluded from loss).
+        reduction: ``'mean'`` or ``'sum'``; matches ``CrossEntropyLoss``.
+        dft: If True, use DFT weighting ``-p*log(p)`` (arxiv 2508.05629).
+    """
 
-    def __init__(self, chunk_size):
+    require_logits = True
+    # We chunk the (B, V) softmax ourselves; tell upstream not to materialise
+    # `logps` (which would already pay the full memory cost we're trying to
+    # avoid). The `_loss_from_logps` fast path is kept only for the rare case
+    # where someone explicitly hands us pre-computed logps.
+    require_logps = False
+
+    def __init__(self,
+                 chunk_size: int,
+                 ignore_index: int = -100,
+                 reduction: str = 'mean',
+                 dft: bool = False,
+                 **kwargs):
+        super().__init__()
+        assert chunk_size > 0, 'chunk_size must be positive'
+        assert reduction in ('mean', 'sum'), f"reduction must be 'mean' or 'sum', got {reduction!r}"
         self.chunk_size = chunk_size
+        self.ignore_index = ignore_index
+        self.reduction = reduction
+        self.dft = dft
 
     def __call__(self, inputs, outputs, **kwargs):
-        import torch
-
-        class ChunkedCrossEntropyLossFunc(torch.autograd.Function):
-
-            @staticmethod
-            def forward(ctx, logits, labels, chunk_size):
-                import torch
-                ctx.save_for_backward(logits, labels)
-                ctx.chunk_size = chunk_size
-
-                losses = []
-                for i in range(math.ceil(logits.shape[0] / chunk_size)):
-                    l_start = i * chunk_size
-                    l_end = min((i + 1) * chunk_size, logits.shape[0])
-                    logits_chunk = logits[l_start:l_end]
-                    labels_chunk = labels[l_start:l_end]
-                    loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
-                    loss_chunk = loss_fct(logits_chunk, labels_chunk)
-                    losses.append(loss_chunk)
-                    del logits_chunk
-                    del labels_chunk
-                all_losses = torch.cat(losses)
-                return all_losses
-
-            @staticmethod
-            def backward(ctx: Any, *grad_outputs: Any):
-                import torch
-                logits, labels = ctx.saved_tensors
-                chunk_size = ctx.chunk_size
-
-                for i in range(math.ceil(logits.shape[0] / chunk_size)):
-                    l_start = i * chunk_size
-                    l_end = min((i + 1) * chunk_size, logits.shape[0])
-                    logits_chunk = logits[l_start:l_end].detach().requires_grad_(True)
-                    labels_chunk = labels[l_start:l_end]
-                    loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
-                    with torch.enable_grad():
-                        loss_chunk = loss_fct(logits_chunk, labels_chunk)
-                        grad_output_chunk = grad_outputs[0][l_start:l_end]
-                        _loss_chunk = (loss_chunk * grad_output_chunk).sum()
-                        grad_chunk = torch.autograd.grad(_loss_chunk, logits_chunk, retain_graph=False)[0]
-                        logits[l_start:l_end] = grad_chunk
-
-                return logits, None, None
+        labels = inputs['labels']
+        logps = outputs.get('logps')
+
+        # Fast path: if logps is already gathered upstream, chunking the
+        # softmax is moot — fall back to the same scalar formula as
+        # CrossEntropyLoss to keep behaviour identical.
+        if logps is not None:
+            return self._loss_from_logps(labels, logps)
 
         logits = outputs['logits']
-        labels = inputs['labels']
-        return LossOutput(loss=ChunkedCrossEntropyLossFunc.apply(logits, labels, self.chunk_size), num_tokens=0)
+        labels = labels.view(-1)
+        logits = logits.view(-1, logits.shape[-1])
+
+        func = _get_chunked_ce_func()
+        loss = func.apply(logits, labels, self.chunk_size, self.ignore_index, self.reduction, self.dft)
+
+        if self.reduction == 'mean':
+            return LossOutput(loss=loss, num_tokens=0)
+        num_tokens = (labels != self.ignore_index).float().sum().clamp(min=1)
+        return LossOutput(loss=loss, num_tokens=num_tokens)
+
+    def _loss_from_logps(self, labels, logps):
+        mask = (labels != self.ignore_index).float()
+        per_token = -logps * logps.exp() if self.dft else -logps
+        if self.reduction == 'mean':
+            return LossOutput(loss=(per_token * mask).sum() / mask.sum().clamp(min=1), num_tokens=0)
+        return LossOutput(loss=(per_token * mask).sum(), num_tokens=mask.sum().clamp(min=1))
diff --git a/src/twinkle/loss/dpo.py b/src/twinkle/loss/dpo.py
index fe526ab46..d53019513 100644
--- a/src/twinkle/loss/dpo.py
+++ b/src/twinkle/loss/dpo.py
@@ -7,7 +7,7 @@
     (https://arxiv.org/abs/2305.18290)
 """
 from typing import TYPE_CHECKING, Dict, List, Optional, Union
-
+import math
 from twinkle.data_format import LossOutput
 from twinkle.loss.base import Loss
 from twinkle.utils.torch_utils import selective_log_softmax
@@ -132,6 +132,12 @@ def __init__(
         **kwargs,
     ):
         super().__init__(ignore_index=ignore_index)
+        if loss_type not in ('sigmoid', 'hinge', 'ipo', 'kto_pair'):
+            raise ValueError(f'Unknown loss_type: {loss_type}')
+        if label_smoothing > 0 and loss_type != 'sigmoid':
+            raise ValueError(
+                f'label_smoothing > 0 is only defined for loss_type="sigmoid", '
+                f'got loss_type="{loss_type}". Set label_smoothing=0.0 or switch to sigmoid.')
         self.beta = beta
         self.label_smoothing = label_smoothing
         self.loss_type = loss_type
@@ -217,6 +223,11 @@ def _compute_dpo_loss(
         if self.loss_type == 'sigmoid':
             # Standard DPO loss: -log(sigmoid(beta * margin))
             losses = -F.logsigmoid(logits)
+            # Apply label smoothing (only meaningful here: Bradley-Terry soft labels).
+            if self.label_smoothing > 0:
+                # Soft labels: (1 - eps) * loss_chosen + eps * loss_rejected
+                smooth_losses = -F.logsigmoid(-logits)  # Loss for flipped preference
+                losses = (1 - self.label_smoothing) * losses + self.label_smoothing * smooth_losses
         elif self.loss_type == 'hinge':
             # Hinge loss variant
             losses = torch.relu(1 - logits)
@@ -234,12 +245,6 @@ def _compute_dpo_loss(
         else:
             raise ValueError(f'Unknown loss_type: {self.loss_type}')
 
-        # Apply label smoothing if specified
-        if self.label_smoothing > 0:
-            # Soft labels: (1 - eps) * loss_chosen + eps * loss_rejected
-            smooth_losses = -F.logsigmoid(-logits)  # Loss for flipped preference
-            losses = (1 - self.label_smoothing) * losses + self.label_smoothing * smooth_losses
-
         return losses.mean()
 
     def __call__(
@@ -321,7 +326,8 @@ def __call__(
             reference_chosen_logps = torch.zeros_like(policy_chosen_logps)
             reference_rejected_logps = torch.zeros_like(policy_rejected_logps)
         else:
-            return LossOutput(loss=torch.tensor(0.0, device=chosen_logps.device), num_tokens=0)
+            zero = (policy_chosen_logps.sum() + policy_rejected_logps.sum()) * 0.0
+            return LossOutput(loss=zero, num_tokens=0)
 
         # Compute DPO loss
         dpo_loss = self._compute_dpo_loss(
@@ -535,11 +541,23 @@ def __call__(
 
         # Odds ratio: log(odds_chosen / odds_rejected)
         # log_odds = log(p/(1-p)) = log(p) - log(1-p)
-        # Compute entirely in log-space to avoid exp() underflow:
-        #   log(p)   = avg_logps  (already in log-space)
-        #   log(1-p) = log1p(-exp(avg_logps))  (numerically stable via log1p)
-        log_odds_chosen = chosen_avg_logps - torch.log1p(-torch.exp(chosen_avg_logps))
-        log_odds_rejected = rejected_avg_logps - torch.log1p(-torch.exp(rejected_avg_logps))
+        # Compute log(1-p) = log(1 - exp(avg_logp)) numerically stably:
+        #   - For x > -log(2):  log(-expm1(x))  (avoids log(0) when p → 1)
+        #   - For x ≤ -log(2): log1p(-exp(x))  (avoids cancellation when p → 0)
+        # ``avg_logp ∈ (-∞, 0]`` so the threshold partitions the safe regime.
+        log_two = math.log(2.0)
+
+        def _log1mexp(x: 'torch.Tensor') -> 'torch.Tensor':
+            # Clamp at a tiny negative to keep both branches well-defined when p≈1.
+            x_safe = torch.clamp(x, max=-1e-7)
+            return torch.where(
+                x_safe > -log_two,
+                torch.log(-torch.expm1(x_safe)),
+                torch.log1p(-torch.exp(x_safe)),
+            )
+
+        log_odds_chosen = chosen_avg_logps - _log1mexp(chosen_avg_logps)
+        log_odds_rejected = rejected_avg_logps - _log1mexp(rejected_avg_logps)
 
         # ORPO odds ratio loss
         odds_ratio = log_odds_chosen - log_odds_rejected
diff --git a/src/twinkle/loss/gkd.py b/src/twinkle/loss/gkd.py
index 3f7db4bfb..7c198ad02 100644
--- a/src/twinkle/loss/gkd.py
+++ b/src/twinkle/loss/gkd.py
@@ -41,6 +41,10 @@ def __init__(
         chunk_size: int = 512,
         **kwargs,
     ):
+        if not (0.0 <= beta <= 1.0):
+            raise ValueError(f'beta must be in [0, 1], got {beta}')
+        if temperature <= 0:
+            raise ValueError(f'temperature must be > 0, got {temperature}')
         self.beta = beta
         self.temperature = temperature
         self.ignore_index = ignore_index
@@ -94,6 +98,7 @@ def __call__(
             labels=labels,
             beta=self.beta,
             temperature=self.temperature,
+            ignore_index=self.ignore_index,
             chunk_size=self.chunk_size,
             topk=topk,
             teacher_topk_logprobs=teacher_topk_logprobs,
@@ -108,6 +113,7 @@ def _generalized_jsd_loss(
         labels=None,
         beta: float = 0.5,
         temperature: float = 1.0,
+        ignore_index: int = -100,
         chunk_size: int = 512,
         topk: Optional[int] = None,
         teacher_topk_logprobs=None,
@@ -164,7 +170,7 @@ def _generalized_jsd_loss(
 
         # ── Mask valid (response) tokens ──────────────────────────────────────
         if labels is not None:
-            mask = labels != -100  # ignore_index is always -100 per convention
+            mask = labels != ignore_index
             # Vocab-size mismatch (e.g. Qwen2.5-VL-3B vs 7B): pad the smaller side
             # so both distributions are defined over the same token set.
             stu_dim = student_logits.shape[-1]
@@ -178,12 +184,15 @@ def _generalized_jsd_loss(
             student_logits = student_logits[mask]  # [num_valid, vocab/topk]
             teacher_logits = teacher_logits[mask]
             num_valid = mask.sum()
+            # ``[mask]`` already created fresh storage, so in-place divide is safe
+            # and avoids an extra [num_valid, V] allocation.
+            student_logits.div_(temperature)
+            teacher_logits.div_(temperature)
         else:
-            student_logits = student_logits.view(-1, student_logits.size(-1))
-            teacher_logits = teacher_logits.view(-1, teacher_logits.size(-1))
+            # No mask (possibly inference): divide out-of-place so the caller's logits are not mutated.
+            student_logits = student_logits.reshape(-1, student_logits.size(-1)) / temperature
+            teacher_logits = teacher_logits.reshape(-1, teacher_logits.size(-1)) / temperature
             num_valid = student_logits.size(0)
-        student_logits.div_(temperature)
-        teacher_logits.div_(temperature)
 
         if num_valid == 0:
             return student_logits.new_zeros(())
diff --git a/src/twinkle/loss/grpo.py b/src/twinkle/loss/grpo.py
index 4bb71216c..781b22060 100644
--- a/src/twinkle/loss/grpo.py
+++ b/src/twinkle/loss/grpo.py
@@ -42,18 +42,6 @@ def __init__(
         self.require_entropy = entropy_coef > 0.0
         self.ignore_index = ignore_index
 
-    def _compute_loss_mask(self, labels: 'torch.Tensor') -> 'torch.Tensor':
-        """
-        Compute loss mask from labels.
-
-        Args:
-            labels: [batch, seq_len] target token ids, -100 for ignored positions
-
-        Returns:
-            mask: [batch, seq_len] float tensor, 1.0 for valid positions, 0.0 for ignored
-        """
-        return (labels != self.ignore_index).float()
-
     def _compute_log_importance_weights(
         self,
         per_token_logps: 'torch.Tensor',
@@ -165,10 +153,13 @@ def _pad_and_align_to_batch(
                 return data  # Already aligned
             if data.dim() == 1:
                 data = data.unsqueeze(1)
-            if data.shape[1] == 1:  # Scalars
-                result = torch.full((batch_size, seq_len), fill_value, dtype=dtype, device=device)
-                result[mask] = data[mask.any(dim=1).nonzero(as_tuple=True)[0].repeat_interleave(mask.sum(dim=1)), 0]
-                return result
+            if data.shape[1] == 1:
+                assert data.shape[0] == batch_size, (
+                    f'scalar broadcast expects data.shape[0]==batch_size, '
+                    f'got data.shape={tuple(data.shape)} mask.shape={(batch_size, seq_len)}')
+                fill = torch.full((batch_size, seq_len), fill_value, dtype=dtype, device=device)
+                expanded = data.expand(batch_size, seq_len)
+                return torch.where(mask, expanded, fill)
             data = [data[i] for i in range(batch_size)]  # To list
 
         # Handle list (scalars or sequences)
@@ -276,10 +267,12 @@ def __call__(
             )
 
         # GRPO loss is ill-defined without advantages (e.g. ref-logps-only forward,
-        # or eval/validation forwards). Return a zero loss so the forward still
-        # flows through cleanly and callers can harvest outputs['logps'] freely.
+        # or eval/validation forwards). Return a zero loss that still flows through
+        # autograd so DDP/FSDP do not see unused params, and callers can harvest
+        # outputs['logps'] freely.
         if advantages is None:
-            return LossOutput(loss=torch.zeros((), device=device, dtype=logps.dtype), num_tokens=0)
+            zero = logps.sum() * 0.0
+            return LossOutput(loss=zero, num_tokens=0)
 
         advantages = self._pad_and_align_to_batch(
             advantages,
diff --git a/src/twinkle/loss/infonce.py b/src/twinkle/loss/infonce.py
index 68d14840c..c356bd64c 100644
--- a/src/twinkle/loss/infonce.py
+++ b/src/twinkle/loss/infonce.py
@@ -13,8 +13,6 @@
 import numpy as np
 import torch
 import torch.distributed as dist
-import torch.nn.functional as F
-from enum import Enum
 from torch import nn
 from typing import Optional
 
@@ -22,15 +20,6 @@
 from .base import Loss
 
 
-# Borrowed from sentence_transformers.
-class SiameseDistanceMetric(Enum):
-    """Distance metrics available to the pairwise contrastive losses."""
-
-    EUCLIDEAN = lambda x, y: F.pairwise_distance(x, y, p=2)  # noqa
-    MANHATTAN = lambda x, y: F.pairwise_distance(x, y, p=1)  # noqa
-    COSINE_DISTANCE = lambda x, y: 1 - F.cosine_similarity(x, y)  # noqa
-
-
 def _extract_sentences(outputs) -> torch.Tensor:
     """Return [B, D] sentence embeddings from postprocess_tensor_sp output.
 
@@ -119,6 +108,11 @@ def __init__(
         process_group=None,
         **kwargs,
     ):
+        if mask_fake_negative and fake_neg_margin <= 0:
+            raise ValueError(
+                f'fake_neg_margin must be > 0 when mask_fake_negative=True, got {fake_neg_margin}. '
+                'A non-positive margin would mask out the positive itself or every above-positive '
+                'logit indiscriminately, collapsing the contrastive signal.')
         self.temperature = temperature
         self.use_batch = use_batch
         self.hard_negatives = hard_negatives
@@ -129,7 +123,13 @@ def __init__(
         self.process_group = process_group
 
     def _gather_across_dp(self, sentences: torch.Tensor, labels: torch.Tensor):
-        """All-gather embeddings & labels across DP ranks; only local shard keeps grad."""
+        """All-gather embeddings & labels across DP ranks; only local shard keeps grad.
+
+        NCCL ``all_gather`` requires every rank to send the *same* tensor size. Under
+        ``slice_dp`` dispatch the per-rank batch is uneven (``divmod`` splits), so we
+        pad each rank to the global max along dim-0, do an equal-sized all_gather,
+        then strip padding back. Only the local shard retains gradients.
+        """
         if not (dist.is_available() and dist.is_initialized()):
             return sentences, labels
         world_size = dist.get_world_size(group=self.process_group)
@@ -137,24 +137,40 @@ def _gather_across_dp(self, sentences: torch.Tensor, labels: torch.Tensor):
             return sentences, labels
         rank = dist.get_rank(group=self.process_group)
 
-        # variable per-rank shapes require communicating shape first
-        local_shape = sentences.new_tensor(sentences.shape, dtype=torch.long)
-        shapes = [torch.empty_like(local_shape) for _ in range(world_size)]
-        dist.all_gather(shapes, local_shape, group=self.process_group)
-        all_sentences = [sentences.new_empty(shape.tolist()) for shape in shapes]
-        dist.all_gather(all_sentences, sentences.contiguous(), group=self.process_group)
-
-        local_label_shape = labels.new_tensor(labels.shape, dtype=torch.long)
-        label_shapes = [torch.empty_like(local_label_shape) for _ in range(world_size)]
-        dist.all_gather(label_shapes, local_label_shape, group=self.process_group)
-        all_labels = [labels.new_empty(shape.tolist()) for shape in label_shapes]
-        dist.all_gather(all_labels, labels.contiguous(), group=self.process_group)
-
-        # keep the local shard differentiable; detach others
-        all_sentences[rank] = sentences
+        # ``labels`` is a 1-D mask aligned to ``sentences`` along dim-0, so they
+        # share the same per-rank size. Gather sizes once and reuse for both.
+        assert sentences.shape[0] == labels.shape[0], (
+            f'sentences/labels dim-0 mismatch: {sentences.shape[0]} vs {labels.shape[0]}')
+        local_n = torch.tensor([sentences.shape[0]], device=sentences.device, dtype=torch.long)
+        sizes = [torch.empty_like(local_n) for _ in range(world_size)]
+        dist.all_gather(sizes, local_n, group=self.process_group)
+        sizes_int = [int(s.item()) for s in sizes]
+        max_n = max(sizes_int)
+
+        def _pad_gather(tensor: torch.Tensor):
+            if tensor.shape[0] < max_n:
+                pad_shape = (max_n - tensor.shape[0],) + tuple(tensor.shape[1:])
+                padded = torch.cat([tensor, tensor.new_zeros(pad_shape)], dim=0)
+            else:
+                padded = tensor
+            buffers = [torch.empty_like(padded) for _ in range(world_size)]
+            dist.all_gather(buffers, padded.contiguous(), group=self.process_group)
+            return buffers
+
+        sent_buffers = _pad_gather(sentences)
+        label_buffers = _pad_gather(labels)
+
+        # Strip padding; keep local shard differentiable, detach others.
+        all_sentences = []
+        all_labels = []
         for idx in range(world_size):
-            if idx != rank:
-                all_sentences[idx] = all_sentences[idx].detach()
+            n = sizes_int[idx]
+            if idx == rank:
+                all_sentences.append(sentences)
+                all_labels.append(labels)
+            else:
+                all_sentences.append(sent_buffers[idx][:n].detach())
+                all_labels.append(label_buffers[idx][:n])
         return torch.cat(all_sentences, dim=0), torch.cat(all_labels, dim=0)
 
     def __call__(self, inputs, outputs, **kwargs) -> LossOutput:
diff --git a/src/twinkle/metric/accuracy.py b/src/twinkle/metric/accuracy.py
index b3034c57a..4dfb01198 100644
--- a/src/twinkle/metric/accuracy.py
+++ b/src/twinkle/metric/accuracy.py
@@ -1,5 +1,4 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
-import numpy as np
 from typing import List, Union
 
 from ..data_format import InputFeature, ModelOutput
diff --git a/src/twinkle/metric/dpo.py b/src/twinkle/metric/dpo.py
index b203d255e..024cb0473 100644
--- a/src/twinkle/metric/dpo.py
+++ b/src/twinkle/metric/dpo.py
@@ -131,6 +131,12 @@ def accumulate(self, inputs: Union[InputFeature, List[InputFeature]], outputs: M
         ref_outputs = kwargs.get('ref_outputs')
         if ref_outputs is not None:
             ref_logps = ref_outputs.get('logps')
+            if ref_logps is not None:
+                if isinstance(ref_logps, list):
+                    if len(ref_logps) == 0:
+                        ref_logps = None
+                    else:
+                        ref_logps = pad_and_stack_tensors(ref_logps)
             if ref_logps is not None:
                 # Align ref_logps to match labels shape (handles different seq lengths)
                 ref_logps = self._align_logps(ref_logps, labels.shape, labels.device, logps.dtype)
diff --git a/src/twinkle/metric/embedding.py b/src/twinkle/metric/embedding.py
index 9fb3aed8c..8b3681031 100644
--- a/src/twinkle/metric/embedding.py
+++ b/src/twinkle/metric/embedding.py
@@ -1,7 +1,4 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
-import torch
-import torch.distributed as dist
-import torch.nn.functional as F
 from typing import List, Union
 
 from twinkle.data_format import InputFeature, ModelOutput
@@ -32,6 +29,9 @@ def reset(self):
         self.grad_norm = 0.0
 
     def accumulate(self, inputs: Union[InputFeature, List[InputFeature]], outputs: ModelOutput, **kwargs):
+        import torch
+        import torch.distributed as dist
+        import torch.nn.functional as F
         sentences = outputs.get('embeddings')
         if sentences is None:
             sentences = outputs.get('logits')
@@ -44,22 +44,34 @@ def accumulate(self, inputs: Union[InputFeature, List[InputFeature]], outputs: M
             inputs = [inputs]
         labels = torch.cat([inp['labels'].view(-1) for inp in inputs], dim=0)
 
-        # Gather embeddings and labels across DP for in-batch stats
+        # Gather embeddings and labels across DP for in-batch stats.
+        # NCCL ``all_gather`` requires every rank to send the same tensor size,
+        # but ``slice_dp`` dispatch (``divmod`` split) can leave per-rank dim-0
+        # uneven. Pad to the global max along dim-0, gather, then strip padding.
         if dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1:
             world_size = dist.get_world_size()
-            local_shape = sentences.new_tensor(sentences.shape, dtype=torch.long)
-            shapes = [torch.empty_like(local_shape) for _ in range(world_size)]
-            dist.all_gather(shapes, local_shape)
-            all_sentences = [sentences.new_empty(s.tolist()) for s in shapes]
-            dist.all_gather(all_sentences, sentences.contiguous())
-            sentences = torch.cat(all_sentences, dim=0)
-
-            local_lshape = labels.new_tensor(labels.shape, dtype=torch.long)
-            lshapes = [torch.empty_like(local_lshape) for _ in range(world_size)]
-            dist.all_gather(lshapes, local_lshape)
-            all_labels = [labels.new_empty(s.tolist()) for s in lshapes]
-            dist.all_gather(all_labels, labels.contiguous())
-            labels = torch.cat(all_labels, dim=0)
+            assert sentences.shape[0] == labels.shape[0], (
+                f'sentences/labels dim-0 mismatch: {sentences.shape[0]} vs {labels.shape[0]}')
+            local_n = torch.tensor([sentences.shape[0]], device=sentences.device, dtype=torch.long)
+            sizes = [torch.empty_like(local_n) for _ in range(world_size)]
+            dist.all_gather(sizes, local_n)
+            sizes_int = [int(s.item()) for s in sizes]
+            max_n = max(sizes_int)
+
+            def _pad_gather(tensor: 'torch.Tensor') -> 'List[torch.Tensor]':
+                if tensor.shape[0] < max_n:
+                    pad_shape = (max_n - tensor.shape[0],) + tuple(tensor.shape[1:])
+                    padded = torch.cat([tensor, tensor.new_zeros(pad_shape)], dim=0)
+                else:
+                    padded = tensor
+                buffers = [torch.empty_like(padded) for _ in range(world_size)]
+                dist.all_gather(buffers, padded.contiguous())
+                return buffers
+
+            sent_buffers = _pad_gather(sentences)
+            label_buffers = _pad_gather(labels)
+            sentences = torch.cat([sent_buffers[i][:sizes_int[i]] for i in range(world_size)], dim=0)
+            labels = torch.cat([label_buffers[i][:sizes_int[i]] for i in range(world_size)], dim=0)
 
         anchor_idx = torch.nonzero(labels, as_tuple=False).squeeze(-1)
         if anchor_idx.numel() == 0:
diff --git a/src/twinkle/metric/grpo.py b/src/twinkle/metric/grpo.py
index 06e082eeb..e2797b1ec 100644
--- a/src/twinkle/metric/grpo.py
+++ b/src/twinkle/metric/grpo.py
@@ -3,9 +3,12 @@
 from typing import Any, Dict, List, Optional, Union
 
 from twinkle.data_format import InputFeature, ModelOutput
+from twinkle.utils import get_logger
 from twinkle.utils.transformers_utils import align_logps_to_mask
 from .base import Metric
 
+logger = get_logger()
+
 
 class GRPOMetric(Metric):
 
@@ -254,6 +257,11 @@ def accumulate(
             if len(seq_lens) == 1:
                 merged = torch.cat(label_tensors, dim=0)
                 inputs_list = [{'labels': merged}]
+            else:
+                logger.warning(
+                    f'GRPOMetric: logps is a single tensor but inputs_list has '
+                    f'{len(inputs_list)} mb with mismatched seq_lens={sorted(seq_lens)}. '
+                    f'Only mb[0] will be accumulated; check the model forward path.')
 
         flat_old: Optional[List] = None
         if old_logps is not None and isinstance(old_logps, (list, tuple)):
@@ -284,7 +292,17 @@ def accumulate(
                 # Uncommon: aligned global tensor. Only honour when it
                 # exactly matches the single-mb shape; otherwise drop.
                 import torch as _torch  # noqa: F811
-                old_slice = old_logps if (_torch.is_tensor(old_logps) and old_logps.shape == logps_mb.shape) else None
+                if _torch.is_tensor(old_logps) and old_logps.shape == logps_mb.shape:
+                    old_slice = old_logps
+                else:
+                    if mb_idx == 0:
+                        # Warn once per accumulate call (not per mb) to avoid log spam.
+                        old_shape = tuple(old_logps.shape) if _torch.is_tensor(old_logps) else 'unknown'
+                        logger.warning(
+                            f'GRPOMetric: old_logps shape {old_shape} does not match '
+                            f'logps_mb shape {tuple(logps_mb.shape)}; ratio/kl metrics will '
+                            f'be skipped for this step.')
+                    old_slice = None
             else:
                 old_slice = None
 
diff --git a/src/twinkle/metric/train_metric.py b/src/twinkle/metric/train_metric.py
index da82a8783..8d785c38b 100644
--- a/src/twinkle/metric/train_metric.py
+++ b/src/twinkle/metric/train_metric.py
@@ -2,7 +2,7 @@
 import time
 from typing import List, Union
 
-from ..data_format import InputFeature, ModelOutput
+from twinkle.data_format import InputFeature, ModelOutput
 from .base import Metric
 
 
diff --git a/src/twinkle/model/base.py b/src/twinkle/model/base.py
index a4d4ea064..8ea00d696 100644
--- a/src/twinkle/model/base.py
+++ b/src/twinkle/model/base.py
@@ -1,8 +1,7 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 import os
 from abc import ABC, abstractmethod
-from datetime import timedelta
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Type, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Type, Union
 
 from twinkle import Platform, torch_util
 from twinkle.data_format import InputFeature, ModelOutput
diff --git a/src/twinkle/model/megatron/megatron.py b/src/twinkle/model/megatron/megatron.py
index a5ea3fc56..60b45f774 100644
--- a/src/twinkle/model/megatron/megatron.py
+++ b/src/twinkle/model/megatron/megatron.py
@@ -420,7 +420,7 @@ def forward_step_func(data_iterator, model):
                     embeddings = output_tensor
             elif labels is not None and is_last_pp:
                 _loss_require_logps = getattr(_loss_instance, 'require_logps', True)
-                _loss_require_entropy = (hasattr(_loss_instance, 'require_entropy') and _loss_instance.require_entropy)
+                _loss_require_entropy = getattr(_loss_instance, 'require_entropy', False)  # opt-in: absent => off
                 _packed = batch.get('packed_seq_params')
                 cu_seqlens_q = getattr(_packed, 'cu_seqlens_q', None) if _packed is not None else None
                 if _loss_require_logps:
@@ -446,7 +446,7 @@ def forward_step_func(data_iterator, model):
                 _outputs = {'logps': logps}
                 if entropies is not None:
                     _outputs['entropies'] = entropies
-                if hasattr(_loss_instance, 'require_logits') and _loss_instance.require_logits:
+                if getattr(_loss_instance, 'require_logits', False):
                     _outputs['logits'] = output_tensor
                 batch, _outputs = processor.unpack_packed_sequences(batch, _outputs)
                 logps = _outputs['logps']
@@ -990,7 +990,9 @@ def _get_rng_state() -> 'ShardedObject':
             'random_rng_state': random.getstate(),
             'np_rng_state': np.random.get_state(),
             'torch_rng_state': torch.get_rng_state(),
-            'cuda_rng_state': torch.cuda.get_rng_state(),
+            # Backend-agnostic device RNG (CUDA / NPU / MPS); key kept as
+            # 'cuda_rng_state' for backward compatibility with existing checkpoints.
+            'cuda_rng_state': Platform.get_device_rng_state(),
             'rng_tracker_states': tensor_parallel.get_cuda_rng_tracker().get_states(),
         }
         rng_state_list = [rng_state]
@@ -1112,7 +1114,7 @@ def _save_mcore_optimizer(
             with open(tracker_path, 'w') as f:
                 f.write(str(iteration))
 
-        logging.getLogger(__name__).info(f'Saved mcore optimizer state at iteration {iteration} '
+        logger.info(f'Saved mcore optimizer state at iteration {iteration} '
                                          f'to {checkpoint_dir}')
 
     def _load_mcore_optimizer(
@@ -1139,7 +1141,7 @@ def _load_mcore_optimizer(
         )
         iteration = self._read_iteration(tracker_path)
         if iteration == 0:
-            logging.getLogger(__name__).warning(f'No checkpoint found in {checkpoint_dir}')
+            logger.warning(f'No checkpoint found in {checkpoint_dir}')
             return
 
         iter_dir = os.path.join(checkpoint_dir, f'iter_{iteration:07d}')
@@ -1201,7 +1203,9 @@ def _load_mcore_optimizer(
             random.setstate(rng['random_rng_state'])
             np.random.set_state(rng['np_rng_state'])
             torch.set_rng_state(rng['torch_rng_state'])
-            torch.cuda.set_rng_state(rng['cuda_rng_state'])
+            # Backend-agnostic restore: tolerates ckpt produced on different backend
+            # (returns None) and avoids hard-coded torch.cuda which crashes on NPU.
+            Platform.set_device_rng_state(rng.get('cuda_rng_state'))
             tensor_parallel.get_cuda_rng_tracker().set_states(rng['rng_tracker_states'], )
 
         # Restore iteration counter.
@@ -1211,26 +1215,26 @@ def _load_mcore_optimizer(
         if dist.is_initialized():
             dist.barrier()
 
-        logging.getLogger(__name__).info(f'Resumed from mcore checkpoint at iteration {iteration} '
+        logger.info(f'Resumed from mcore checkpoint at iteration {iteration} '
                                          f'from {checkpoint_dir}')
 
     @staticmethod
     def _read_iteration(tracker_path: str) -> int:
-        if not os.path.exists(tracker_path):
-            return 0
-        with open(tracker_path) as f:
-            iteration = int(f.read().strip())
+        # All ranks must enter the all_reduce together; missing tracker on some
+        # ranks (e.g. NFS lag, partial mount) must NOT short-circuit, otherwise
+        # the remaining ranks hang at the collective. Treat missing as 0 and
+        # let MAX reduction recover the canonical iteration from any rank that
+        # successfully read the file.
+        iteration = 0
+        if os.path.exists(tracker_path):
+            with open(tracker_path) as f:
+                iteration = int(f.read().strip())
         if torch.distributed.is_initialized():
-            iters_cuda = torch.tensor(
-                [iteration],
-                dtype=torch.long,
-                device='cuda',
-            )
-            torch.distributed.all_reduce(
-                iters_cuda,
-                op=torch.distributed.ReduceOp.MAX,
-            )
-            iteration = iters_cuda[0].item()
+            # Use Platform.get_local_device() to stay backend-agnostic
+            # (CUDA / NPU / MPS); 'cuda' would crash on NPU.
+            iters_dev = torch.tensor([iteration], dtype=torch.long, device=Platform.get_local_device())
+            torch.distributed.all_reduce(iters_dev, op=torch.distributed.ReduceOp.MAX)
+            iteration = int(iters_dev[0].item())
         return iteration
 
     def _merge_lora_adapters(self, adapter_name: str = 'default'):
@@ -1256,7 +1260,7 @@ def _save_hf_format(self, output_dir: str, adapter_name: str, lora_converter=Non
 
         For distributed training:
         - All PP ranks participate in export (each has different layers)
-        - Only DP rank 0 actually writes to disk
+        - Only global rank 0 actually writes shared config files
         - Uses barrier for synchronization
 
         For LoRA training:
@@ -1264,12 +1268,9 @@ def _save_hf_format(self, output_dir: str, adapter_name: str, lora_converter=Non
         """
         # Check if this is LoRA training
         is_peft_format = (adapter_name != _default_adapter_name)
+        is_global_zero = (not dist.is_initialized()) or dist.get_rank() == 0
 
-        # Create output directory on rank 0 only
-        from megatron.core import parallel_state as mpu
-        dp_rank = mpu.get_data_parallel_rank() if mpu.is_initialized() else 0
-
-        if dp_rank == 0:
+        if is_global_zero:
             os.makedirs(output_dir, exist_ok=True)
 
         # Synchronize before saving
@@ -1281,8 +1282,8 @@ def _save_hf_format(self, output_dir: str, adapter_name: str, lora_converter=Non
         self.strategy.bridge.save_weights(
             model, output_dir, peft_format=is_peft_format, adapter_name=adapter_name, converter=lora_converter)
 
-        # Save config on rank 0 only
-        if dp_rank == 0:
+        # Save config on global rank 0 only (avoid concurrent writers).
+        if is_global_zero:
             self.hf_config.save_pretrained(output_dir)
             if isinstance(model[0], PeftModel):
                 config = model[0].peft_config[adapter_name]
@@ -1291,11 +1292,13 @@ def _save_hf_format(self, output_dir: str, adapter_name: str, lora_converter=Non
                 model[0].peft_config[adapter_name].save_pretrained(output_dir)
                 config.target_modules = target_modules
 
+        if dist.is_initialized():
+            dist.barrier()
+
     def _save_megatron_format(self, output_dir: str, adapter_name: str, lora_converter=None):
         """Save in Megatron checkpoint format."""
+        is_global_zero = (not dist.is_initialized()) or dist.get_rank() == 0
         os.makedirs(output_dir, exist_ok=True)
-        from megatron.core import parallel_state as mpu
-        dp_rank = mpu.get_data_parallel_rank() if mpu.is_initialized() else 0
         state_dict = self._get_trainable_parameters(adapter_name)
         cpu_state_dict = {}
         for k, v in state_dict.items():
@@ -1311,13 +1314,18 @@ def _save_megatron_format(self, output_dir: str, adapter_name: str, lora_convert
         rank = dist.get_rank() if dist.is_initialized() else 0
         checkpoint_path = os.path.join(output_dir, f'model_rank{rank}.pt')
         torch.save(cpu_state_dict, checkpoint_path)
-        # Save config on rank 0 only
+        # Save shared config on global rank 0 only (avoid concurrent writers).
         model = self.strategy.unwrap_model(self.model)
-        if dp_rank == 0:
+        if is_global_zero:
             self.hf_config.save_pretrained(output_dir)
             if isinstance(model[0], PeftModel):
                 model[0].peft_config[adapter_name].save_pretrained(output_dir)
 
+        # Finalize barrier: ensure all ranks finish writing model_rank*.pt
+        # before the caller proceeds (e.g. uploading / loading the ckpt).
+        if dist.is_initialized():
+            dist.barrier()
+
     def _save_tokenizer(self, output_dir: str, **kwargs):
         from twinkle.utils import is_last_rank
         if not is_last_rank():
diff --git a/src/twinkle/model/megatron/multi_lora_megatron.py b/src/twinkle/model/megatron/multi_lora_megatron.py
index 2dd6b7a53..78981b888 100644
--- a/src/twinkle/model/megatron/multi_lora_megatron.py
+++ b/src/twinkle/model/megatron/multi_lora_megatron.py
@@ -15,7 +15,7 @@
 from transformers import AutoConfig, PretrainedConfig
 from typing import Any, Callable, Dict, List, Literal, Optional, Type, Union
 
-from twinkle import DeviceMesh, remote_class, remote_function, requires, template, torch_util
+from twinkle import DeviceMesh, Platform, remote_class, remote_function, requires, template, torch_util
 from twinkle.data_format import InputFeature, Trajectory
 from twinkle.hub import HubOperation
 from twinkle.infra import collect_tensor_dict
@@ -26,6 +26,9 @@
 from ._mindspeed_runtime import ensure_mindspeed_adaptor_patched
 from .megatron import MegatronModel
 from .strategy import MegatronStrategy
+from twinkle.utils import get_logger
+
+logger = get_logger()
 
 
 @remote_class(execute='all')
@@ -221,8 +224,11 @@ def _save_local_training_rng_state():
             'np_rng_state': np.random.get_state(),
             'torch_rng_state': torch.get_rng_state(),
         }
-        if torch.cuda.is_available():
-            rng_state['cuda_rng_state'] = torch.cuda.get_rng_state()
+        # Backend-agnostic device RNG capture (CUDA / NPU / MPS). Key is kept as
+        # 'cuda_rng_state' for backward compatibility with existing checkpoints.
+        device_rng = Platform.get_device_rng_state()
+        if device_rng is not None:
+            rng_state['cuda_rng_state'] = device_rng
         rng_state['rng_tracker_states'] = tensor_parallel.get_cuda_rng_tracker().get_states()
         return rng_state
 
@@ -233,8 +239,10 @@ def _load_local_training_rng_state(rng_state):
         random.setstate(rng_state['random_rng_state'])
         np.random.set_state(rng_state['np_rng_state'])
         torch.set_rng_state(rng_state['torch_rng_state'])
-        if 'cuda_rng_state' in rng_state and torch.cuda.is_available():
-            torch.cuda.set_rng_state(rng_state['cuda_rng_state'])
+        # Backend-agnostic device RNG restore: tolerates ckpt produced on different
+        # backend (key absent or None) and avoids hard-coded torch.cuda on NPU.
+        if 'cuda_rng_state' in rng_state:
+            Platform.set_device_rng_state(rng_state['cuda_rng_state'])
         tensor_parallel.get_cuda_rng_tracker().set_states(rng_state['rng_tracker_states'])
 
     def _save_multi_lora_optimizer(self, checkpoint_dir: str, optimizer_config, **kwargs):
@@ -251,19 +259,35 @@ def _save_multi_lora_optimizer(self, checkpoint_dir: str, optimizer_config, **kw
 
         torch.save(state_dict, self._rank_local_optimizer_path(checkpoint_dir))
 
+        if dist.is_initialized():
+            dist.barrier()
+
     def _load_multi_lora_optimizer(self, checkpoint_dir: str, adapter_name: str = '', **kwargs):
         no_load_optim = kwargs.pop('no_load_optim', False)
-        no_load_rng = kwargs.pop('no_load_rng', False)
+        no_load_rng = kwargs.pop('no_load_rng', True)
         optimizer_config = self.optimizer_group.get(adapter_name)
         state_dict = torch.load(self._rank_local_optimizer_path(checkpoint_dir), map_location='cpu', weights_only=False)
 
         if not no_load_optim and optimizer_config is not None:
             if optimizer_config.optimizer is not None and 'optimizer' in state_dict:
                 optimizer_config.optimizer.load_state_dict(state_dict['optimizer'])
+                device = Platform.get_local_device()
+                for group_state in optimizer_config.optimizer.state.values():
+                    if not isinstance(group_state, dict):
+                        continue
+                    for k, v in group_state.items():
+                        if isinstance(v, torch.Tensor):
+                            group_state[k] = v.to(device)
             if optimizer_config.lr_scheduler is not None and 'opt_param_scheduler' in state_dict:
                 optimizer_config.lr_scheduler.load_state_dict(state_dict['opt_param_scheduler'])
+        # RNG state is intentionally not restored in multi-tenant mode:
+        # restoring the global RNG would silently affect other active tenants'
+        # dropout / initialization behaviour.
         if not no_load_rng and 'rng_state' in state_dict:
-            self._load_local_training_rng_state(state_dict['rng_state'])
+            logger.warning(
+                'Skipping RNG state restoration in multi-tenant mode. '
+                'Global RNG is shared across tenants; restoring it would '
+                'affect other active adapters.')
         if optimizer_config is not None and 'iteration' in state_dict:
             optimizer_config.cur_step = state_dict['iteration']
 
@@ -354,6 +378,11 @@ def resume_from_checkpoint(self, checkpoint_dir, *, resume_only_model=False, **k
         self._check_adapter_valid(adapter_name)
 
         trainer_state_path = os.path.join(checkpoint_dir, 'trainer_state.json')
+        if not os.path.isfile(trainer_state_path):
+            raise FileNotFoundError(
+                f'trainer_state.json not found in {checkpoint_dir}. '
+                'Ensure the checkpoint was saved with save_optimizer=True.')
+
         with open(trainer_state_path) as f:
             trainer_state = json.load(f)
 
diff --git a/src/twinkle/model/megatron/strategy/megatron.py b/src/twinkle/model/megatron/strategy/megatron.py
index 819014eb8..1bd809025 100644
--- a/src/twinkle/model/megatron/strategy/megatron.py
+++ b/src/twinkle/model/megatron/strategy/megatron.py
@@ -48,7 +48,6 @@ def __init__(
         ddp_config: Dict[str, Any] = None,
         **kwargs,
     ):
-        import torch.distributed as dist
         from megatron.core import mpu
         self.device_mesh = device_mesh
         self.use_distributed_optimizer = use_distributed_optimizer
diff --git a/src/twinkle/model/transformers/strategy/accelerate.py b/src/twinkle/model/transformers/strategy/accelerate.py
index d0434991f..018f4c494 100644
--- a/src/twinkle/model/transformers/strategy/accelerate.py
+++ b/src/twinkle/model/transformers/strategy/accelerate.py
@@ -34,10 +34,8 @@ def __init__(
         parallelism_config = self._parallelism_config_from_device_mesh(device_mesh)
         fsdp_plugin = self._fsdp_config_from_device_mesh(device_mesh, fsdp_config, memory_efficient_init)
 
-        kwargs_handlers = []
-        kwargs_handlers.append(
-            InitProcessGroupKwargs(
-                timeout=timedelta(seconds=int(os.environ.get('TWINKLE_DIST_TIMEOUT_SECONDS', '7200')))))
+        kwargs_handlers = [InitProcessGroupKwargs(
+            timeout=timedelta(seconds=int(os.environ.get('TWINKLE_DIST_TIMEOUT_SECONDS', '7200'))))]
         if ddp_config is not None:
             from accelerate import DistributedDataParallelKwargs
             ddp_config = DistributedDataParallelKwargs(**ddp_config)
@@ -131,8 +129,7 @@ def _fsdp_config_from_device_mesh(self, device_mesh: DeviceMesh, fsdp_config: Di
         return fsdp_plugin
 
     def wrap_model(self, model, *args):
-        result = self.accelerator.prepare(model, *args)
-        return result
+        return self.accelerator.prepare(model, *args)
 
     def unwrap_model(self, model):
         return self.accelerator.unwrap_model(model, keep_torch_compile=False)
diff --git a/src/twinkle/model/transformers/transformers.py b/src/twinkle/model/transformers/transformers.py
index 61733d7dc..cac375263 100644
--- a/src/twinkle/model/transformers/transformers.py
+++ b/src/twinkle/model/transformers/transformers.py
@@ -414,8 +414,8 @@ def forward(self, *, inputs: Union[InputFeature, List[InputFeature], List[Trajec
             inputs = optimizer_config.template.batch_encode(inputs)  # noqa
         processor: InputProcessor = optimizer_config.processor
         loss_instance = optimizer_config.loss_instance
-        loss_require_logits = (hasattr(loss_instance, 'require_logits') and loss_instance.require_logits)
-        loss_require_entropy = (hasattr(loss_instance, 'require_entropy') and loss_instance.require_entropy)
+        loss_require_logits = getattr(loss_instance, 'require_logits', False)
+        loss_require_entropy = getattr(loss_instance, 'require_entropy', False)
         loss_require_logps = getattr(loss_instance, 'require_logps', True)
         assert isinstance(processor, InputProcessor), 'Set a correct `InputProcessor` before forwarding'
         inputs: Dict[str, Any] = processor(
@@ -490,8 +490,8 @@ def forward_only(self, *, inputs: Union[InputFeature, List[InputFeature], List[T
             processor: InputProcessor = optimizer_config.processor
             assert isinstance(processor, InputProcessor), 'Set InputProcessor correctly before forwarding'
             loss_instance = optimizer_config.loss_instance
-            loss_require_logits = (hasattr(loss_instance, 'require_logits') and loss_instance.require_logits)
-            loss_require_entropy = (hasattr(loss_instance, 'require_entropy') and loss_instance.require_entropy)
+            loss_require_logits = getattr(loss_instance, 'require_logits', False)
+            loss_require_entropy = getattr(loss_instance, 'require_entropy', False)
             loss_require_logps = getattr(loss_instance, 'require_logps', True)
             inputs: Dict[str, Any] = processor(
                 inputs,
@@ -929,7 +929,6 @@ def save(self, name: Optional[str] = None, output_dir: Optional[str] = None, int
         if optimizer_config.cur_step % interval != 0:
             return
         model = self.strategy.unwrap_model(self.model)
-        processed_state_dict = {}
         save_kwargs = {}
         if adapter_name == _default_adapter_name:
             # Full model save
diff --git a/src/twinkle/notifier/__init__.py b/src/twinkle/notifier/__init__.py
index 329cb6f1d..067db71a1 100644
--- a/src/twinkle/notifier/__init__.py
+++ b/src/twinkle/notifier/__init__.py
@@ -1,2 +1,3 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
 from .base import Notifier, notify_exception
 from .ding_notifier import DingNotifier
diff --git a/src/twinkle/notifier/base.py b/src/twinkle/notifier/base.py
index a83903b53..6f50ca659 100644
--- a/src/twinkle/notifier/base.py
+++ b/src/twinkle/notifier/base.py
@@ -1,3 +1,4 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
 import os
 from typing import Dict, Optional
 
@@ -66,7 +67,7 @@ def notify_exception(notifier: Notifier, context: str, exc: BaseException, name:
     if not _try_claim_notify_slot(exc, context, name):
         try:
             setattr(exc, '_twinkle_notified', True)
-        except Exception:  # noqa: BLE001
+        except Exception:  # noqa
             pass
         return
 
diff --git a/src/twinkle/notifier/ding_notifier.py b/src/twinkle/notifier/ding_notifier.py
index fe102d8a5..fc535edd7 100644
--- a/src/twinkle/notifier/ding_notifier.py
+++ b/src/twinkle/notifier/ding_notifier.py
@@ -1,3 +1,4 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
 import base64
 import hashlib
 import hmac
diff --git a/src/twinkle/preprocessor/llm.py b/src/twinkle/preprocessor/llm.py
index 97065fba1..39d3257be 100644
--- a/src/twinkle/preprocessor/llm.py
+++ b/src/twinkle/preprocessor/llm.py
@@ -48,9 +48,9 @@ def preprocess(self, row) -> Trajectory:
 
 class SelfCognitionProcessor(Preprocessor):
 
-    def __init__(self, model_name, model_author):
-        self.model_name = model_name
-        self.model_author = model_author
+    def __init__(self, model_name=None, model_author=None):
+        self.model_name = model_name or 'twinkle robot'
+        self.model_author = model_author or 'twinkle lab'
 
     def __call__(self, rows: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
         rows = self.map_col_to_row(rows)
diff --git a/src/twinkle/processor/base.py b/src/twinkle/processor/base.py
index 8709d98ab..14a0e206b 100644
--- a/src/twinkle/processor/base.py
+++ b/src/twinkle/processor/base.py
@@ -42,7 +42,6 @@ class InputProcessor:
         'video_grid_thw': 0,
         'input_features': 0.0,
         'feature_attention_mask': 0,
-        'mm_token_type_ids': 0,
     }
 
     # VLM fields to concatenate (not pad) in batch
@@ -108,8 +107,12 @@ def to_tensor(_input):
                 # so tensor ops like labels != ignore_index or .to(device) would fail without this.
                 if isinstance(value, np.ndarray):
                     value = torch.from_numpy(value)
-                elif (isinstance(value, list) and isinstance(value[0],
-                                                             (int, float, np.number))) or key == 'position_ids':
+                elif isinstance(value, list) and len(value) > 0 and isinstance(
+                        value[0], (int, float, np.number)):
+                    value = torch.tensor(value)
+                elif key == 'position_ids' and not isinstance(value, torch.Tensor):
+                    if value is None:
+                        continue
                     value = torch.tensor(value)
                 elif (isinstance(value, list)) and key in ('completion_mask', 'mm_token_type_ids'):
                     value = torch.tensor(value)
@@ -284,7 +287,9 @@ def pad_cp_inputs(input_tensor: torch.Tensor, padding_value: int) -> torch.Tenso
                 return input_tensor
 
             if cp_size > 1:
-                position_ids_f = position_ids.flatten()
+                pos_for_cu = position_ids[:1] if position_ids.dim() >= 2 and position_ids.shape[0] > 1 \
+                    else position_ids
+                position_ids_f = pos_for_cu.flatten()
                 indices_q = torch.arange(position_ids_f.shape[0], device=position_ids_f.device, dtype=torch.int32)
                 cu_seqlens = torch.cat([
                     indices_q[position_ids_f == 0],
@@ -354,8 +359,11 @@ def split_cp_inputs(inputs: torch.Tensor, cu_seqlens: Optional[torch.Tensor], di
                     view_shape = (*inputs.shape[:dim], 2 * cp_size, val.shape[dim] //
                                   (2 * cp_size), *inputs.shape[dim + 1:])
                     val = val.view(view_shape)
-                    index = torch.tensor([cp_rank, (2 * cp_size - cp_rank - 1)], device='cpu',
-                                         pin_memory=True).cuda(non_blocking=True)
+                    index = torch.tensor(
+                        [cp_rank, (2 * cp_size - cp_rank - 1)],
+                        device=inputs.device,
+                        dtype=torch.long,
+                    )
                     val = val.index_select(dim, index)
                     view_shape = (*inputs.shape[:dim], -1, *inputs.shape[dim + 1:])
                     new_inputs.append(val.view(view_shape))
@@ -402,17 +410,18 @@ def prepare_transformers_padding_free_patch(self, inputs: List[InputFeature], **
         if not padding_free or bool(kwargs.get('enable_sp', False)):
             return inputs
 
-        from twinkle.patch import apply_patch
-        from twinkle.patch.gdn_padding_free import GatedDeltaNetPaddingFreePatch
-
-        apply_patch(
-            model,
-            GatedDeltaNetPaddingFreePatch,
-            hf_config=kwargs.get('hf_config'),
-            enable_sp=False,
-        )
         if not getattr(model, '_twinkle_gdn_padding_free_patched', False):
-            return inputs
+            from twinkle.patch import apply_patch
+            from twinkle.patch.gdn_padding_free import GatedDeltaNetPaddingFreePatch
+
+            apply_patch(
+                model,
+                GatedDeltaNetPaddingFreePatch,
+                hf_config=kwargs.get('hf_config'),
+                enable_sp=False,
+            )
+            if not getattr(model, '_twinkle_gdn_padding_free_patched', False):
+                return inputs
 
         for _inp in inputs:
             position_ids = _inp.get('position_ids')
@@ -631,15 +640,27 @@ def to_transformers_dict(inputs: List[InputFeature], **kwargs) -> List[InputFeat
             output = {}
             _keys = [
                 'input_ids',
-                'input_embeddings',
+                'inputs_embeds',
                 'attention_mask',
                 'position_ids',
                 'labels',
                 'completion_mask',
+                'cu_seq_lens_q',
+                'cu_seq_lens_k',
+                'cu_seqlens_q',
+                'cu_seqlens_kv',
+                'max_length_q',
+                'max_length_k',
+                'packed_seq_params',
             ] + list(InputProcessor.VLM_CONCAT_FIELDS)
             for key in list(_input.keys()):
-                if key in _keys:
-                    output[key] = np.array(_input[key]) if not isinstance(_input[key], torch.Tensor) else _input[key]
+                if key not in _keys:
+                    continue
+                value = _input[key]
+                if isinstance(value, torch.Tensor) or not isinstance(value, (list, np.ndarray)):
+                    output[key] = value
+                else:
+                    output[key] = np.array(value)
             results.append(InputFeature(**output))
         return results
 
@@ -694,7 +715,8 @@ def is_mm_position_ids(position_ids):
                     result[key] = self._create_4d_attention_mask(values)
                 elif key == 'position_ids' and is_mm_position_ids(values[0]):
                     result[key] = InputProcessor._pad_sequence(values, self.padding_map[key], self.padding_side)
-                    result[key] = result[key].reshape(values[0].shape[0], len(values), -1)
+                    num_axes = values[0].shape[0]
+                    result[key] = result[key].reshape(len(values), num_axes, -1).permute(1, 0, 2).contiguous()
                 elif isinstance(values[0], torch.Tensor):
                     result[key] = InputProcessor._pad_sequence(values, self.padding_map[key], self.padding_side)
                     if result[key].dim() == 1:
@@ -776,6 +798,5 @@ def postprocess_tensor_cp(self, tensor, cu_seqlens=None):
         if self.device_mesh.cp_world_size <= 1:
             return tensor
         from megatron.core import parallel_state as mpu
-
         from twinkle.utils.torch_utils import gather_cp_load_balanced
         return gather_cp_load_balanced(tensor, mpu.get_context_parallel_group(), seq_dim=1, cu_seqlens=cu_seqlens)
diff --git a/src/twinkle/sampler/vllm_sampler/vllm_sampler.py b/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
index 32dc1ca50..0433ef5a8 100644
--- a/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
+++ b/src/twinkle/sampler/vllm_sampler/vllm_sampler.py
@@ -1,24 +1,4 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
-"""vLLM-based sampler using VLLMEngine (AsyncLLM).
-
-Device Configuration:
-    vLLMSampler automatically detects the number of available GPUs from
-    CUDA_VISIBLE_DEVICES environment variable (set by twinkle's ResourceManager)
-    and configures vLLM's tensor_parallel_size accordingly.
-
-    To use tensor parallelism, configure DeviceGroup with gpus_per_worker > 1:
-
-        # DP2 with TP2 (4 GPUs total, 2 workers, each with 2 GPUs)
-        DeviceGroup(name='sampler', ranks=[0,1,2,3], gpus_per_worker=2)
-
-        # TP4 (4 GPUs, 1 worker with all 4 GPUs)
-        DeviceGroup(name='sampler', ranks=[0,1,2,3], gpus_per_worker=4)
-
-Data Flow:
-    When multiple vLLMSampler workers exist (DP > 1):
-    - Data is dispatched via dispatch='slice_dp' (each worker gets a slice)
-    - Results are collected via collect='flatten' (merged into single list)
-"""
 import asyncio
 import atexit
 import numpy as np
diff --git a/src/twinkle/server/state/backend/factory.py b/src/twinkle/server/state/backend/factory.py
index 326a6916f..be24c7824 100644
--- a/src/twinkle/server/state/backend/factory.py
+++ b/src/twinkle/server/state/backend/factory.py
@@ -1,12 +1,11 @@
 """Backend factory for creating StateBackend instances based on configuration."""
 from __future__ import annotations
 
-import logging
-
 from twinkle.server.config.persistence import PersistenceConfig
+from twinkle.utils import get_logger
 from .base import StateBackend
 
-logger = logging.getLogger(__name__)
+logger = get_logger()
 
 
 def create_backend(config: PersistenceConfig | None = None) -> StateBackend:
diff --git a/src/twinkle/server/state/base.py b/src/twinkle/server/state/base.py
index d931ce336..8cb055ae4 100644
--- a/src/twinkle/server/state/base.py
+++ b/src/twinkle/server/state/base.py
@@ -1,7 +1,6 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 from __future__ import annotations
 
-import logging
 import time
 from abc import ABC, abstractmethod
 from datetime import datetime, timezone
@@ -9,9 +8,10 @@
 from typing import Generic, TypeVar
 
 from twinkle.server.state.backend.base import StateBackend
+from twinkle.utils import get_logger
 
 T = TypeVar('T', bound=BaseModel)
-logger = logging.getLogger(__name__)
+logger = get_logger()
 
 
 class BaseManager(ABC, Generic[T]):
diff --git a/src/twinkle/server/state/session_manager.py b/src/twinkle/server/state/session_manager.py
index 442019ea4..1bc901b0c 100644
--- a/src/twinkle/server/state/session_manager.py
+++ b/src/twinkle/server/state/session_manager.py
@@ -2,14 +2,14 @@
 from __future__ import annotations
 
 import functools
-import logging
 import time
 
+from twinkle.utils import get_logger
 from .backend.base import ConcurrencyError, StateBackend
 from .base import BaseManager
 from .models import SessionRecord
 
-logger = logging.getLogger(__name__)
+logger = get_logger()
 
 
 def _session_touch_transform(existing: dict | None, *, now: float) -> dict | None:
diff --git a/src/twinkle/server/telemetry/provider.py b/src/twinkle/server/telemetry/provider.py
index 77212c757..059f301fb 100644
--- a/src/twinkle/server/telemetry/provider.py
+++ b/src/twinkle/server/telemetry/provider.py
@@ -16,8 +16,9 @@
 from typing import Any
 
 from twinkle.server.config.telemetry import TelemetryConfig
+from twinkle.utils import get_logger
 
-logger = logging.getLogger(__name__)
+logger = get_logger()
 
 # Loggers belonging to the OTLP transport stack. Their own records must never
 # be routed back through the OTLP LoggingHandler: an exporter error logged
diff --git a/src/twinkle/server/telemetry/worker_init.py b/src/twinkle/server/telemetry/worker_init.py
index 997f2e140..40edc6628 100644
--- a/src/twinkle/server/telemetry/worker_init.py
+++ b/src/twinkle/server/telemetry/worker_init.py
@@ -7,10 +7,11 @@
 """
 from __future__ import annotations
 
-import logging
 import os
 
-logger = logging.getLogger(__name__)
+from twinkle.utils import get_logger
+
+logger = get_logger()
 
 _worker_initialized = False
 
diff --git a/src/twinkle/template/__init__.py b/src/twinkle/template/__init__.py
index 6c4bdddd2..168456eab 100644
--- a/src/twinkle/template/__init__.py
+++ b/src/twinkle/template/__init__.py
@@ -2,3 +2,4 @@
 from .base import Template
 from .deepseek_v4 import DeepseekV4Template
 from .qwen3_5_vl import Qwen3_5Template
+from .tools import ToolCallParser, ToolCallRegistry, ClineParser, HermesQwenParser, ReActParser, VCPParser
diff --git a/src/twinkle/template/base.py b/src/twinkle/template/base.py
index 26c2e4f26..c1e8f069f 100644
--- a/src/twinkle/template/base.py
+++ b/src/twinkle/template/base.py
@@ -32,6 +32,19 @@ class Template:
     video_placeholder: str = '<video>'
     audio_placeholder: str = '<audio>'
 
+    # Encode pipeline stages — class-level so deepcopy/dill fingerprints stay stable. These lists are shared
+    # by all instances: subclasses should rebind them at class scope, never mutate them in place.
+    pre_pipeline_names: List[str] = [
+        '_add_default_system',
+        '_to_standard_reasoning_content',
+        '_build_standard_messages',
+    ]
+    post_pipeline_names: List[str] = [
+        '_check_max_length',
+        '_add_attention_fields',
+        '_roll_labels',
+    ]
+
     def __init__(self,
                  model_id: str,
                  use_chat_template: bool = True,
@@ -59,18 +72,6 @@ def __init__(self,
         self.default_system = default_system
         self._test_support_assistant_tokens_mask()
 
-        self.pre_pipeline_names: List[str] = [
-            '_add_default_system',
-            '_to_standard_reasoning_content',
-            '_build_standard_messages',
-        ]
-
-        self.post_pipeline_names: List[str] = [
-            '_check_max_length',
-            '_add_attention_fields',
-            '_roll_labels',
-        ]
-
     def parse_tool_call(self, decoded: str) -> List[Dict[str, Any]]:
         """Parse tool calls from the assistant's decoded output.
 
@@ -161,9 +162,14 @@ def preprocess_audios(self, audios: List[AudioInput]) -> List[np.ndarray]:
         """Preprocess a list of audio clips."""
         return [self.preprocess_audio(audio) for audio in audios]
 
-    def _invoke_pre_pipeline(self, trajectories: List[Trajectory]) -> List[Trajectory]:
+    def _invoke_pre_pipeline(self,
+                             trajectories: List[Trajectory],
+                             skip_stages: Optional[Set[str]] = None) -> List[Trajectory]:
+        skip_stages = skip_stages or set()
         current = trajectories
         for pipeline_name in self.pre_pipeline_names:
+            if pipeline_name in skip_stages:
+                continue
             pipeline: Callable[[Trajectory], List[Trajectory]] = getattr(self, pipeline_name)
             next_batch = []
             for trajectory in current:
@@ -198,10 +204,14 @@ def concat_input_feature(self, prompt_input_feature: InputFeature, new_tokens: L
             mm_token_type_ids = result['mm_token_type_ids']
             if not isinstance(mm_token_type_ids, torch.Tensor):
                 mm_token_type_ids = torch.as_tensor(mm_token_type_ids)
-            token_ids_shape = mm_token_type_ids.shape
-            device = mm_token_type_ids.device
-            padded_tokens = torch.zeros((token_ids_shape[0], len(new_tokens))).to(device)
-            result['mm_token_type_ids'] = torch.cat((mm_token_type_ids, padded_tokens), dim=1)
+            # Pad along the last (sequence) dim — handles 1D [T] and 2D [1, T] (Qwen-VL) uniformly.
+            leading_shape = mm_token_type_ids.shape[:-1]
+            padded_tokens = torch.zeros(
+                (*leading_shape, len(new_tokens)),
+                dtype=mm_token_type_ids.dtype,
+                device=mm_token_type_ids.device,
+            )
+            result['mm_token_type_ids'] = torch.cat((mm_token_type_ids, padded_tokens), dim=-1)
         new_input_feature = self._invoke_post_pipeline([result])[0]
         result.update(new_input_feature)
         messages: List[Message] = result.get('messages')
@@ -288,6 +298,12 @@ def _check_max_length(self, input_feature: InputFeature) -> List[InputFeature]:
 
         # Split strategy
         if strategy == 'split':
+            if self.is_mm:
+                raise ValueError(
+                    "truncation_strategy='split' is unsafe for multimodal templates: "
+                    'splitting input_ids across chunks breaks alignment with image tokens, '
+                    'and multimodal fields (pixel_values, image_grid_thw, ...) are not partitioned. '
+                    "Use 'left' / 'right' / 'delete' / 'raise' instead.")
             results = []
             for start in range(0, len(input_feature['input_ids']), self.max_length):
                 end = min(start + self.max_length, len(input_feature['input_ids']))
@@ -584,7 +600,7 @@ def _apply_chat_template(self, trajectory: Trajectory, add_generation_prompt: bo
 
     @staticmethod
     def _get_train_indices(trajectory: Trajectory) -> Optional[Set[int]]:
-        """Extract key-round assistant indices from trajectory's packed ``user_data``."""
+        """Return the rounds selected for training via ``key_rounds`` in ``user_data``, if any."""
         kr = user_data_get(trajectory.get('user_data'), 'key_rounds')
         if isinstance(kr, list) and kr:
             return set(kr)
@@ -618,6 +634,12 @@ def _encode_messages(self, trajectory: Trajectory, add_generation_prompt: bool =
                 if 'input_ids' in encoded:
                     input_ids = encoded.pop('input_ids')
                     assistant_masks = encoded.pop('assistant_masks')
+                    # _apply_chat_template may return batched arrays ([1, T]); strip the batch dim so
+                    # downstream `len(input_ids)` is the sequence length. Skip inputs that are already 1-D.
+                    if hasattr(input_ids, 'squeeze') and getattr(input_ids, 'ndim', 0) > 1:
+                        input_ids = input_ids.squeeze(0)
+                    if hasattr(assistant_masks, 'squeeze') and getattr(assistant_masks, 'ndim', 0) > 1:
+                        assistant_masks = assistant_masks.squeeze(0)
                     labels = np.where(assistant_masks, input_ids, -100)
             else:
                 if kwargs.get('tokenize', True):
@@ -757,16 +779,8 @@ def batch_encode(
         return output
 
     def format_trajectory(self, trajectory: Trajectory, add_default_system: bool = False) -> Trajectory:
-        current = [trajectory]
-        for pipeline_name in self.pre_pipeline_names:
-            if not add_default_system and pipeline_name == '_add_default_system':
-                continue
-            pipeline: Callable[[Trajectory], List[Trajectory]] = getattr(self, pipeline_name)
-            next_batch = []
-            for traj in current:
-                next_batch.extend(pipeline(traj))
-            current = next_batch
-        return current[0]
+        skip = set() if add_default_system else {'_add_default_system'}
+        return self._invoke_pre_pipeline([trajectory], skip_stages=skip)[0]
 
     def check(self, trajectory: Trajectory) -> Optional[Trajectory]:
         encoded = None
@@ -815,7 +829,10 @@ def pre_forward_hook(self, model: 'torch.nn.Module', args, kwargs):
         for k, v in old_kwargs.items():
             if k in {
                     'input_ids', 'attention_mask', 'labels', 'position_ids', 'output_hidden_states', 'logits_to_keep',
-                    'max_length_q', 'max_length_k', 'cu_seq_lens_q', 'cu_seq_lens_k'
+                    'max_length_q', 'max_length_k',
+                    'cu_seq_lens_q', 'cu_seq_lens_k',
+                    'cu_seqlens_q', 'cu_seqlens_kv',
+                    'packed_seq_params',
             } and k not in kwargs:
                 kwargs[k] = v
         if 'inputs_embeds' in kwargs:
diff --git a/src/twinkle/template/deepseek_v4.py b/src/twinkle/template/deepseek_v4.py
index 7a396f0dc..6d3d973e8 100644
--- a/src/twinkle/template/deepseek_v4.py
+++ b/src/twinkle/template/deepseek_v4.py
@@ -127,17 +127,7 @@ def __init__(
         self.truncation_strategy = truncation_strategy
         self.default_system = default_system
         self._test_support_assistant_tokens_mask()
-
-        self.pre_pipeline_names = [
-            '_add_default_system',
-            '_to_standard_reasoning_content',
-            '_build_standard_messages',
-        ]
-        self.post_pipeline_names = [
-            '_check_max_length',
-            '_add_attention_fields',
-            '_roll_labels',
-        ]
+        # pre_pipeline_names / post_pipeline_names are inherited from Template (class-level).
 
     def parse(self, decoded: str) -> List[Dict[str, Any]]:
         text = decoded or ''
@@ -190,7 +180,17 @@ def clean(self, decoded: str) -> str:
             text = text[:start] + text[end:]
 
     def parse_tool_call(self, decoded: str) -> List[Dict[str, Any]]:
-        return self.parse(decoded)
+        """Prefer DeepSeek's DSML tool-call format; fall back to ToolCallRegistry parsers."""
+        text = decoded or ''
+        if DeepseekV4Template._TOOL_CALLS_START in text:
+            result = self.parse(text)
+            if result:
+                return result
+        return super().parse_tool_call(decoded)
 
     def clean_tool_call(self, decoded: str) -> str:
-        return self.clean(decoded)
+        """Prefer DeepSeek's DSML tool-call format; fall back to ToolCallRegistry parsers."""
+        text = decoded or ''
+        if DeepseekV4Template._TOOL_CALLS_START in text:
+            return self.clean(text)
+        return super().clean_tool_call(decoded)
diff --git a/src/twinkle/template/qwen3_5_vl.py b/src/twinkle/template/qwen3_5_vl.py
index b61265f5e..2655a78ef 100644
--- a/src/twinkle/template/qwen3_5_vl.py
+++ b/src/twinkle/template/qwen3_5_vl.py
@@ -52,6 +52,9 @@ def __init__(self, *args, **kwargs):
         if self.processor is not self.tokenizer:
             apply_patch(self.processor, Qwen3ChatTemplate)
             apply_patch(self.processor, Qwen3AllowToolTailTemplate)
+        # Patches above may alter chat_template behavior — re-probe assistant_masks
+        # support so `_template_support_assistant_tokens_mask` reflects the patched template.
+        self._test_support_assistant_tokens_mask()
         self._patch_size: Optional[int] = None
         self._merge_size: Optional[int] = None
         self._init_vision_config()
diff --git a/src/twinkle/template/tools/base.py b/src/twinkle/template/tools/base.py
index a6d7040e2..35b63dc82 100644
--- a/src/twinkle/template/tools/base.py
+++ b/src/twinkle/template/tools/base.py
@@ -10,14 +10,6 @@ class ToolCallParser(ABC):
     open_marker: Optional[str] = None
     close_marker: Optional[str] = None
 
-    def matches_model(self, model_id: str) -> bool:
-        """Return True if this parser is the canonical choice for ``model_id``.
-
-        Used for streaming where we must commit to a parser before any text
-        has arrived. Default False — parser is text-detection-only.
-        """
-        return False
-
     @abstractmethod
     def detect(self, text: str) -> bool:
         """Cheap pre-check: does ``text`` carry this format's markup?"""
@@ -30,13 +22,14 @@ def parse(self, text: str) -> List[Dict[str, Any]]:
     def clean(self, text: str) -> str:
         """Strip parser-specific markup; return plain content text."""
 
-    def detect_result(self, text: str) -> bool:
-        """Does ``text`` look like a tool-result message for this protocol?"""
-        return False
+    def extract_tool_result(self, text: str) -> Optional[str]:
+        """If ``text`` is a tool-result message of this protocol, return the
+        body with the protocol-specific prefix stripped; otherwise return ``None``.
 
-    def parse_result(self, text: str) -> str:
-        """Strip protocol-specific result prefix; return the raw tool output body."""
-        return text
+        Default returns ``None`` — only protocols carrying their own tool-result
+        framing (e.g. Cline) need to override this.
+        """
+        return None
 
 
 class ToolCallRegistry:
@@ -56,14 +49,6 @@ def register(cls, parser: ToolCallParser) -> ToolCallParser:
     def parsers(cls) -> List[ToolCallParser]:
         return list(cls._parsers)
 
-    @classmethod
-    def select_for_model(cls, model_id: Optional[str]) -> Optional[ToolCallParser]:
-        mid = (model_id or '').lower()
-        for p in cls._parsers:
-            if p.matches_model(mid):
-                return p
-        return None
-
     @classmethod
     def detect_first(cls, text: str) -> Optional[ToolCallParser]:
         if not text:
diff --git a/src/twinkle/template/tools/cline.py b/src/twinkle/template/tools/cline.py
index 2673e82ef..8f36324ce 100644
--- a/src/twinkle/template/tools/cline.py
+++ b/src/twinkle/template/tools/cline.py
@@ -21,7 +21,7 @@
 from __future__ import annotations
 
 import re
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 from .base import ToolCallParser
 
@@ -99,10 +99,6 @@ class ClineParser(ToolCallParser):
     open_marker = None
     close_marker = None
 
-    def matches_model(self, model_id: str) -> bool:
-        # Cline is an app-level prompt protocol, not bound to any model family.
-        return False
-
     def detect(self, text: str) -> bool:
         if not text or '<' not in text:
             return False
@@ -153,9 +149,6 @@ def clean(self, text: str) -> str:
         out.append(text[last:])
         return ''.join(out).rstrip()
 
-    def detect_result(self, text: str) -> bool:
-        return bool(_RESULT_RE.match(text or ''))
-
-    def parse_result(self, text: str) -> str:
+    def extract_tool_result(self, text: str) -> Optional[str]:
         m = _RESULT_RE.match(text or '')
-        return text[m.end():] if m else text
+        return text[m.end():] if m else None
diff --git a/src/twinkle/template/tools/qwen.py b/src/twinkle/template/tools/qwen.py
index 12361b737..6713d570a 100644
--- a/src/twinkle/template/tools/qwen.py
+++ b/src/twinkle/template/tools/qwen.py
@@ -16,9 +16,6 @@ class HermesQwenParser(ToolCallParser):
     _PARAMETER_RE = re.compile(r'<parameter=([^>]+)>\s*([\s\S]*?)\s*</parameter>')
     _STRIP_RE = re.compile(r'<tool_call>[\s\S]*?(?:</tool_call>|\Z)')
 
-    def matches_model(self, model_id: str) -> bool:
-        return 'qwen' in model_id
-
     def detect(self, text: str) -> bool:
         return self.open_marker in text
 
diff --git a/src/twinkle/template/utils.py b/src/twinkle/template/utils.py
index 5648a4770..dc5a5fb40 100644
--- a/src/twinkle/template/utils.py
+++ b/src/twinkle/template/utils.py
@@ -4,14 +4,28 @@
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, TypeVar
 
 from twinkle.data_format import Message, Trajectory
-from twinkle.utils import to_device
+from twinkle.utils import get_logger, to_device
 
 if TYPE_CHECKING:
     from transformers import PreTrainedTokenizer
 
+logger = get_logger()
 _T = TypeVar('_T')
 
 
+def _coerce_ids_to_list(ids: Any) -> List[int]:  # single-sample id tensor -> plain Python list
+    import torch
+    if isinstance(ids, torch.Tensor):
+        while ids.dim() > 1:  # strip leading dimensions one at a time until the tensor is at most 1-D
+            if ids.shape[0] != 1:  # several samples along this dimension: refuse to silently pick one
+                raise ValueError(
+                    '_coerce_ids_to_list expects a single-sample tensor (leading dims of size 1); '
+                    f'got shape {tuple(ids.shape)}. Pass one trajectory at a time.')
+            ids = ids[0]
+        return ids.tolist()
+    return ids  # non-tensor input is returned as-is (neither copied nor validated)
+
+
 def _convert_to_vlm_format(messages: List[Dict]) -> List[Dict]:
     converted = []
     for msg in messages:
@@ -43,6 +57,7 @@ def _load_image(img: Any) -> Optional[Any]:
         if img.startswith(('http://', 'https://')):
             import requests
             resp = requests.get(img, timeout=30)
+            resp.raise_for_status()
             return Image.open(io.BytesIO(resp.content))
         else:
             return Image.open(img)
@@ -79,7 +94,15 @@ def _transfer_single_message(
         start = 0
         media_idx = 0
         while (pos := content.find(placeholder, start)) != -1:
-            url = media_list[media_idx] if media_idx < len(media_list) else None
+            if media_idx >= len(media_list):
+                # More placeholders than provided media entries: stop scanning so the extra
+                # placeholder text is preserved verbatim instead of being silently consumed
+                # (which would make user-supplied media references vanish without warning).
+                logger.warning(
+                    f'placeholder {placeholder!r} appears more times than provided '
+                    f'{media_type} entries ({len(media_list)}); extra occurrences are kept as literal text.')
+                break
+            url = media_list[media_idx]
             placeholders.append((pos, len(placeholder), media_type, url))
             media_idx += 1
             start = pos + len(placeholder)
@@ -212,14 +235,12 @@ def tokenize_with_assistant_labels(tokenizer: 'PreTrainedTokenizer',
             Labels are -100 for non-assistant tokens, original token id for assistant content tokens.
             Assistant prefix tokens (e.g., '<|im_start|>assistant\n') are excluded from training.
         """
-        import torch
+        kwargs.pop('add_generation_prompt', None)
         messages = trajectory['messages']
 
         # Encode full trajectory
         encoded = encode_func(trajectory, **kwargs)
-        full_ids = encoded.pop('input_ids')
-        if isinstance(full_ids, torch.Tensor):
-            full_ids = full_ids.tolist()[0]
+        full_ids = _coerce_ids_to_list(encoded.pop('input_ids'))
 
         # Initialize labels: all -100 (not trained)
         labels = [-100] * len(full_ids)
@@ -237,17 +258,14 @@ def tokenize_with_assistant_labels(tokenizer: 'PreTrainedTokenizer',
             # encode(messages[:i], add_generation_prompt=True) includes the prefix
             partial_trajectory = copy(trajectory)
             partial_trajectory['messages'] = list(messages[:i])
-            partial_ids = encode_func(partial_trajectory, add_generation_prompt=True, **kwargs)['input_ids']
-            if isinstance(partial_ids, torch.Tensor):
-                partial_ids = partial_ids.tolist()[0]
+            partial_ids = _coerce_ids_to_list(
+                encode_func(partial_trajectory, add_generation_prompt=True, **kwargs)['input_ids'])
             start_pos = len(partial_ids)
 
             # Get end position: encode(messages[:i+1]) includes full assistant turn
             partial_trajectory = copy(trajectory)
             partial_trajectory['messages'] = list(messages[:i + 1])
-            partial_ids = encode_func(partial_trajectory, **kwargs)['input_ids']
-            if isinstance(partial_ids, torch.Tensor):
-                partial_ids = partial_ids.tolist()[0]
+            partial_ids = _coerce_ids_to_list(encode_func(partial_trajectory, **kwargs)['input_ids'])
             end_pos = len(partial_ids)
 
             # Mark assistant CONTENT tokens as trainable (excluding prefix)
@@ -330,21 +348,22 @@ def tokenize_with_assistant_labels(
                 if isinstance(msg['content'], str):
                     msg['content'] = placeholder
                 else:
-                    msg['content'][0]['text'] = placeholder
+                    text_items = [c for c in msg['content']
+                                  if isinstance(c, dict) and c.get('type') == 'text']
+                    if len(text_items) != 1:
+                        raise ValueError(
+                            'TokenizeByPlaceHolder requires exactly one text item in assistant '
+                            f'content, got {len(text_items)} (content={msg["content"]!r}).')
+                    text_items[0]['text'] = placeholder
                 assistant_count += 1
             _dummy_messages.append(msg)
 
         encoded = encode_func(trajectory)
-        full_ids = encoded.pop('input_ids')
-        if isinstance(full_ids, torch.Tensor):
-            full_ids = full_ids.tolist()[0]
+        full_ids = _coerce_ids_to_list(encoded.pop('input_ids'))
 
         _dummy_trajectory = copy(trajectory)
         _dummy_trajectory['messages'] = _dummy_messages
-        template_ids = encode_func(_dummy_trajectory)
-        template_ids = template_ids['input_ids']
-        if isinstance(template_ids, torch.Tensor):
-            template_ids = template_ids.tolist()[0]
+        template_ids = _coerce_ids_to_list(encode_func(_dummy_trajectory)['input_ids'])
 
         extra_kwargs = {}
         if 'add_special_tokens' in inspect.signature(tokenizer.encode).parameters:
@@ -365,7 +384,8 @@ def tokenize_with_assistant_labels(
                 labels = TokenizeByPlaceHolder.build_labels(full_ids, template_parts)
             else:
                 raise e
-        if labels and labels[-1] == -100:
+        last_role = messages[-1]['role'] if messages else None
+        if last_role == 'assistant' and labels and labels[-1] == -100:
             end_idx = len(labels)
             start_idx = end_idx - 1
             while start_idx > 0 and labels[start_idx - 1] == -100:
diff --git a/src/twinkle/utils/grad_clip.py b/src/twinkle/utils/grad_clip.py
index 671aedd2f..bf950cdfb 100644
--- a/src/twinkle/utils/grad_clip.py
+++ b/src/twinkle/utils/grad_clip.py
@@ -1,7 +1,21 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
+"""Gradient normalization and clipping utilities.
+
+Architecture:
+    normalize_and_clip_grad_norm (public entry point)
+    ├── _normalize_grads           — divide all grads by num_tokens in-place
+    ├── _ep_aware_clip_grad_norm   — EP two-phase reduce + unified clip
+    │   ├── _local_norm_stat       — per-rank norm statistic via _collect_local_grads (foreach-accelerated)
+    │   └── _reduce_norm_stat      — all-reduce a statistic over one process group (no-op if None)
+    └── _standard_clip_grad_norm   — mixed DTensor/local tensor clip
+        ├── _detect_grad_topology  — classify grads into DTensor/local/mixed
+        ├── _resolve_reduce_device — pick device for all-reduce tensor
+        ├── _compute_total_norm    — reduce local norms across ranks
+        └── _apply_clip            — scale grads by clip coefficient
+"""
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Iterable
+from typing import TYPE_CHECKING, Iterable, List, Set, Tuple
 
 from twinkle import Platform
 from twinkle.utils import torch_util
@@ -9,6 +23,10 @@
 if TYPE_CHECKING:
     import torch
 
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
 
 def normalize_and_clip_grad_norm(parameters: Iterable[torch.nn.Parameter],
                                  *,
@@ -19,29 +37,28 @@ def normalize_and_clip_grad_norm(parameters: Iterable[torch.nn.Parameter],
                                  ep_param_groups=None,
                                  ep_group=None,
                                  ep_fsdp_group=None) -> float:
-    """Normalize gradients by num_tokens, then clip by max_grad_norm.
-
-    If ep_param_groups is provided, uses EP-aware two-phase reduction:
-    - non-EP params: all-reduce over group (fsdp_group)
-    - EP params: all-reduce over ep_fsdp_group, then ep_group
+    """Normalize gradients by *num_tokens*, then clip to *max_grad_norm*.
+
+    Args:
+        parameters: Trainable parameters whose grads will be normalized and clipped.
+        num_tokens: Token count for gradient normalization (summed across DP ranks).
+        max_grad_norm: Maximum allowed gradient norm after normalization.
+        norm_type: Type of the norm (2.0 for L2, inf for max-norm).
+        group: Process group for all-reduce (standard path, e.g. dp_group).
+        ep_param_groups: If provided, ``{'ep': [...], 'non_ep': [...]}`` triggers
+            EP-aware two-phase reduction.
+        ep_group: Process group spanning EP ranks.
+        ep_fsdp_group: Process group for FSDP shards within an EP partition.
+
+    Returns:
+        The total gradient norm (after normalization, before clipping).
     """
-    import torch
-    import torch.distributed as dist
     parameters = list(parameters)
-    if num_tokens <= 0:
-        num_tokens = 1
-
-    grads = []
-    for param in parameters:
-        if param.grad is None:
-            continue
-        param.grad.div_(num_tokens)
-        grads.append(param.grad)
-
+    grads = _normalize_grads(parameters, num_tokens)
     if not grads:
         return 0.0
 
-    # EP-aware path
+    # EP-aware path: separate reduce for expert / non-expert params.
     if ep_param_groups is not None:
         return _ep_aware_clip_grad_norm(
             ep_param_groups=ep_param_groups,
@@ -52,89 +69,172 @@ def normalize_and_clip_grad_norm(parameters: Iterable[torch.nn.Parameter],
             ep_fsdp_group=ep_fsdp_group,
         )
 
-    # Standard path (backward compatible)
-    has_dtensor_grad = any(hasattr(grad, 'to_local') for grad in grads)
-    has_local_tensor_grad = any(not hasattr(grad, 'to_local') for grad in grads)
-    dtensor_mesh_keys = set()
-    for grad in grads:
-        if not hasattr(grad, 'to_local'):
-            continue
-        mesh = getattr(grad, 'device_mesh', None)
-        if mesh is None:
-            dtensor_mesh_keys.add('dtensor:unknown')
+    # Standard path: handles pure DTensor, pure local, and mixed cases.
+    return _standard_clip_grad_norm(parameters, grads, max_grad_norm, norm_type, group)
+
+
+# ---------------------------------------------------------------------------
+# Gradient normalization
+# ---------------------------------------------------------------------------
+
+
+def _normalize_grads(parameters: List[torch.nn.Parameter], num_tokens: int) -> List[torch.Tensor]:
+    """Divide every non-None gradient by *num_tokens* in-place and return the grad list."""
+    if num_tokens <= 0:
+        num_tokens = 1
+    grads = []
+    for param in parameters:
+        if param.grad is None:
             continue
-        try:
-            mesh_key = (tuple(mesh.mesh.flatten().tolist()), tuple(mesh.mesh_dim_names or ()))
-        except Exception:
-            mesh_key = repr(mesh)
-        dtensor_mesh_keys.add(mesh_key)
-
-    has_mixed_dtensor_mesh = len(dtensor_mesh_keys) > 1
-
-    if not (has_dtensor_grad and has_local_tensor_grad) and not has_mixed_dtensor_mesh:
-        grad_norm = torch.nn.utils.clip_grad_norm_(
-            parameters,
-            max_grad_norm,
-            norm_type=norm_type,
-        )
+        param.grad.div_(num_tokens)
+        grads.append(param.grad)
+    return grads
+
+
+# ---------------------------------------------------------------------------
+# Standard (non-EP) clip path
+# ---------------------------------------------------------------------------
+
+
+def _standard_clip_grad_norm(
+    parameters: List[torch.nn.Parameter],
+    grads: List[torch.Tensor],
+    max_grad_norm: float,
+    norm_type: float,
+    group,
+) -> float:
+    """Clip grads that may be a mix of DTensor and local Tensor."""
+    import torch
+
+    topology = _detect_grad_topology(grads)
+    can_use_builtin = (
+        not topology.has_mixed_mesh
+        and (topology.all_dtensor or (topology.all_local and group is None))
+    )
+
+    if can_use_builtin:
+        # PyTorch built-in handles DTensor reduce via mesh ops and works for
+        # single-rank / DDP (grads pre-synced in backward).
+        grad_norm = torch.nn.utils.clip_grad_norm_(parameters, max_grad_norm, norm_type=norm_type)
         grad_norm = torch_util.to_local_tensor(grad_norm)
         return float(grad_norm.item())
 
+    # Manual path for mixed DTensor+local or explicit group reduce.
     norm_type = float(norm_type)
     if norm_type not in (2.0, float('inf')):
         raise ValueError('Mixed DTensor/Tensor clip_grad_norm only supports norm_type=2 or inf.')
 
-    def _local_grad(grad: torch.Tensor) -> torch.Tensor:
-        if hasattr(grad, 'to_local'):
-            return grad.to_local()
-        return grad
+    reduce_device = _resolve_reduce_device(grads)
+    # Mixed meshes cannot be reduced via DTensor propagation; fall back to world group.
+    reduce_group = None if topology.has_mixed_mesh else group
+    total_norm = _compute_total_norm(grads, norm_type, reduce_device, reduce_group)
+    _apply_clip(grads, max_grad_norm, total_norm)
+    return total_norm
+
+
+class _GradTopology:
+    """Lightweight container describing the DTensor/local composition of grads."""
+    __slots__ = ('all_dtensor', 'all_local', 'has_mixed_mesh')  # fixed attribute set, no per-instance __dict__
+
+    def __init__(self, all_dtensor: bool, all_local: bool, has_mixed_mesh: bool):
+        self.all_dtensor = all_dtensor  # at least one grad, and every grad exposes ``to_local``
+        self.all_local = all_local  # at least one grad, and none exposes ``to_local``
+        self.has_mixed_mesh = has_mixed_mesh  # DTensor grads were seen on more than one distinct mesh
+
+
+def _detect_grad_topology(grads: List[torch.Tensor]) -> _GradTopology:
+    """Classify gradients as pure-DTensor, pure-local, or mixed, and flag DTensors on differing meshes."""
+    has_dtensor = False
+    has_local = False
+    mesh_keys: Set = set()
 
-    reduce_device = None
     for grad in grads:
-        local_grad = _local_grad(grad)
-        if local_grad.is_cuda or getattr(local_grad, 'is_npu', False):
-            reduce_device = local_grad.device
-            break
-    if reduce_device is None:
-        backend = dist.get_backend() if dist.is_initialized() else None
-        if backend in ('nccl', 'hccl'):
-            reduce_device = torch.device(Platform.get_local_device())
+        if hasattr(grad, 'to_local'):
+            has_dtensor = True
+            mesh = getattr(grad, 'device_mesh', None)
+            if mesh is None:
+                mesh_keys.add('dtensor:unknown')
+            else:
+                try:
+                    key = (tuple(mesh.mesh.flatten().tolist()), tuple(mesh.mesh_dim_names or ()))
+                except Exception:
+                    key = repr(mesh)
+                mesh_keys.add(key)
         else:
-            reduce_device = torch.device('cpu')
-    reduce_group = group
-    if has_mixed_dtensor_mesh:
-        # Different DTensor meshes cannot be reduced by DTensor op propagation (e.g. aten.stack).
-        # Fall back to world reduction over local shards.
-        reduce_group = None
+            has_local = True
+
+    return _GradTopology(
+        all_dtensor=has_dtensor and not has_local,
+        all_local=has_local and not has_dtensor,
+        has_mixed_mesh=len(mesh_keys) > 1,
+    )
+
+
+def _resolve_reduce_device(grads: List[torch.Tensor]):
+    """Pick the device to host the all-reduce scalar (prefer accelerator)."""
+    import torch
+    import torch.distributed as dist
+
     for grad in grads:
+        local = grad.to_local() if hasattr(grad, 'to_local') else grad  # inspect the local shard of DTensor grads
+        if local.is_cuda or getattr(local, 'is_npu', False):
+            return local.device  # the first CUDA/NPU-resident grad decides
+
+    backend = dist.get_backend() if dist.is_initialized() else None  # no accelerator grad found: go by backend
+    if backend in ('nccl', 'hccl'):
+        return torch.device(Platform.get_local_device())  # NCCL/HCCL collectives operate on accelerator tensors
+    return torch.device('cpu')  # other backends, or torch.distributed not initialized
+
+
+def _compute_total_norm(
+    grads: List[torch.Tensor],
+    norm_type: float,
+    device,
+    group,
+) -> float:
+    """Compute the total gradient norm with cross-rank all-reduce."""
+    import torch
+    import torch.distributed as dist
+
+    def _to_local(g):
+        return g.to_local() if hasattr(g, 'to_local') else g
 
     if norm_type == float('inf'):
-        local_norm = 0.0
+        local_val = 0.0
         for grad in grads:
-            local_grad = _local_grad(grad)
+            local_grad = _to_local(grad)
             if local_grad.numel() == 0:
                 continue
-            local_norm = max(local_norm, local_grad.detach().abs().max().item())
-        total_norm_tensor = torch.tensor(local_norm, device=reduce_device, dtype=torch.float32)
+            local_val = max(local_val, local_grad.detach().abs().max().item())
+        total_tensor = torch.tensor(local_val, device=device, dtype=torch.float32)
         if dist.is_initialized():
-            dist.all_reduce(total_norm_tensor, op=dist.ReduceOp.MAX, group=reduce_group)
-        total_norm = float(total_norm_tensor.item())
+            dist.all_reduce(total_tensor, op=dist.ReduceOp.MAX, group=group)
     else:
         local_sq = 0.0
         for grad in grads:
-            local_grad = _local_grad(grad)
+            local_grad = _to_local(grad)
             if local_grad.numel() == 0:
                 continue
             local_sq += local_grad.detach().float().pow(2).sum().item()
-        total_sq_tensor = torch.tensor(local_sq, device=reduce_device, dtype=torch.float32)
+        total_tensor = torch.tensor(local_sq, device=device, dtype=torch.float32)
         if dist.is_initialized():
-            dist.all_reduce(total_sq_tensor, op=dist.ReduceOp.SUM, group=reduce_group)
-        total_norm = float(total_sq_tensor.sqrt().item())
+            dist.all_reduce(total_tensor, op=dist.ReduceOp.SUM, group=group)
+        total_tensor = total_tensor.sqrt()
+
+    return float(total_tensor.item())
+
 
-    clip_coef = float(max_grad_norm) / (total_norm + 1e-6)
+def _apply_clip(grads: List[torch.Tensor], max_grad_norm: float, total_norm: float) -> None:
+    """Scale every grad in-place by max_grad_norm / (total_norm + 1e-6) when that ratio is below 1."""
+    clip_coef = max_grad_norm / (total_norm + 1e-6)  # 1e-6 keeps the division finite when total_norm is 0
     if clip_coef < 1.0:
         for grad in grads:
             grad.mul_(clip_coef)
-    return total_norm
+
+
+# ---------------------------------------------------------------------------
+# EP-aware clip path
+# ---------------------------------------------------------------------------
 
 
 def _ep_aware_clip_grad_norm(
@@ -146,59 +246,72 @@ def _ep_aware_clip_grad_norm(
     ep_group=None,
     ep_fsdp_group=None,
 ) -> float:
-    """EP-aware gradient clipping.
+    """EP-aware gradient clipping with two-phase reduction.
 
-    - non-EP params: all-reduce over fsdp_group
-    - EP params: all-reduce over ep_fsdp_group, then ep_group
-    - Unified clip coefficient applied to both groups via clip_grads_with_norm_
+    Reduction strategy:
+        - non-EP params: all-reduce over *fsdp_group*
+        - EP params: all-reduce over *ep_fsdp_group*, then *ep_group*
+    After obtaining the unified total norm, applies clipping to both groups.
     """
     import math
     import torch
-    import torch.distributed as dist
-
-    ep_params = [p for p in ep_param_groups.get('ep', []) if p.grad is not None]
-    non_ep_params = [p for p in ep_param_groups.get('non_ep', []) if p.grad is not None]
 
+    ep_params = [p for p in ep_param_groups.get('ep', []) if p.grad is not None and p.requires_grad]
+    non_ep_params = [p for p in ep_param_groups.get('non_ep', []) if p.grad is not None and p.requires_grad]
     norm_type = float(norm_type)
 
     with torch.no_grad():
-        # non-EP: reduce over fsdp_group
         non_ep_val = _local_norm_stat(non_ep_params, norm_type)
-        if fsdp_group is not None:
-            op = dist.ReduceOp.MAX if math.isinf(norm_type) else dist.ReduceOp.SUM
-            dist.all_reduce(non_ep_val, op=op, group=fsdp_group)
-
-        # EP: reduce over ep_fsdp_group, then ep_group
         ep_val = _local_norm_stat(ep_params, norm_type)
-        if ep_fsdp_group is not None:
-            op = dist.ReduceOp.MAX if math.isinf(norm_type) else dist.ReduceOp.SUM
-            dist.all_reduce(ep_val, op=op, group=ep_fsdp_group)
-        if ep_group is not None:
-            op = dist.ReduceOp.MAX if math.isinf(norm_type) else dist.ReduceOp.SUM
-            dist.all_reduce(ep_val, op=op, group=ep_group)
-
-        # Combine into total_norm tensor
+        _reduce_norm_stat(non_ep_val, norm_type, fsdp_group)
+        _reduce_norm_stat(ep_val, norm_type, ep_fsdp_group)
+        _reduce_norm_stat(ep_val, norm_type, ep_group)
+
+        # Combine
         if math.isinf(norm_type):
             total_norm = torch.maximum(non_ep_val, ep_val)
         else:
-            total_norm = (non_ep_val + ep_val)**(1.0 / norm_type)
+            total_norm = (non_ep_val + ep_val) ** (1.0 / norm_type)
 
     torch.nn.utils.clip_grads_with_norm_(ep_params, max_grad_norm, total_norm, foreach=True)
     torch.nn.utils.clip_grads_with_norm_(non_ep_params, max_grad_norm, total_norm, foreach=True)
-
     return float(total_norm.item())
 
 
+def _reduce_norm_stat(val, norm_type: float, group) -> None:
+    """All-reduce the norm statistic ``val`` in place over ``group``; does nothing when ``group`` is None."""
+    if group is None:
+        return
+    import math
+    import torch.distributed as dist
+    op = dist.ReduceOp.MAX if math.isinf(norm_type) else dist.ReduceOp.SUM  # inf-norm: max; finite p: sum
+    dist.all_reduce(val, op=op, group=group)  # the reduced result is written back into ``val``
+
+
+# ---------------------------------------------------------------------------
+# Local norm computation (foreach-accelerated)
+# ---------------------------------------------------------------------------
+
+
 def _local_norm_stat(params, norm_type: float):
-    """Compute local norm statistic: sum of p-th powers (finite p) or max (inf).
+    """Compute the local (single-rank) norm statistic.
 
-    Uses torch._foreach_* batch kernels for finite p to reduce kernel launch overhead.
+    Returns:
+        Scalar tensor on the ``_collect_local_grads`` device: sum-of-p-th-powers (finite p) or max-abs (inf).
     """
     import math
+
+    grads_local, default_device = _collect_local_grads(params)
+
+    if math.isinf(norm_type):
+        return _local_inf_norm(grads_local, default_device)
+    return _local_p_norm_stat(grads_local, norm_type, default_device)
+
+
+def _collect_local_grads(params) -> Tuple[List, 'torch.device']:
+    """Extract local fp32 grad tensors and determine the compute device."""
     import torch
     from torch.distributed._tensor import DTensor
-    from torch.utils._foreach_utils import (_device_has_foreach_support, _group_tensors_by_device_and_dtype,
-                                            _has_foreach_support)
 
     grads_local = []
     default_device = None
@@ -213,28 +326,58 @@ def _local_norm_stat(params, norm_type: float):
     if default_device is None:
         default_device = torch.device(Platform.get_local_device())
 
-    if math.isinf(norm_type):
-        val = torch.tensor(0.0, device=default_device, dtype=torch.float32)
-        for g in grads_local:
-            if g.numel() == 0:
-                continue
-            val = torch.maximum(val, g.abs().max())
-        return val
+    return grads_local, default_device
+
+
+def _local_inf_norm(grads_local: List, device) -> 'torch.Tensor':
+    """Return the largest absolute entry across ``grads_local`` as a scalar tensor (0.0 if none)."""
+    import torch
+    val = torch.tensor(0.0, device=device, dtype=torch.float32)  # running maximum, created on ``device``
+    for g in grads_local:
+        if g.numel() == 0:  # ``max()`` has no value for an empty tensor, so skip it
+            continue
+        val = torch.maximum(val, g.abs().max())
+    return val
+
+
+def _local_p_norm_stat(grads_local: List, norm_type: float, device) -> 'torch.Tensor':
+    """Compute sum of p-th powers of per-grad norms (foreach-accelerated)."""
+    import torch
 
     p = float(norm_type)
-    val = torch.tensor(0.0, device=default_device, dtype=torch.float32)
-    if not grads_local:
-        return val
+    val = torch.tensor(0.0, device=device, dtype=torch.float32)
     non_empty = [g for g in grads_local if g.numel() > 0]
     if not non_empty:
         return val
+
+    # Try vectorized foreach path (private PyTorch util, may be absent in future).
+    try:
+        from torch.utils._foreach_utils import (
+            _device_has_foreach_support,
+            _group_tensors_by_device_and_dtype,
+            _has_foreach_support,
+        )
+    except ImportError:
+        return _local_p_norm_stat_scalar(non_empty, p, val)
+
     grouped = _group_tensors_by_device_and_dtype([non_empty])
-    for (device, _), ([device_grads], _) in grouped.items():
-        if _has_foreach_support(device_grads, device) or _device_has_foreach_support(device):
-            # Batch: compute ||g||_p for each grad, raise to p-th power, then sum
-            out = torch._foreach_pow_(torch._foreach_norm(device_grads, p), p)
-            val += torch.sum(torch.stack(out)).to(default_device)
+    for (dev, _), ([device_grads], _) in grouped.items():
+        if _has_foreach_support(device_grads, dev) or _device_has_foreach_support(dev):
+            # NOTE: _foreach_pow_ is in-place and returns None (PyTorch convention);
+            # we must keep the intermediate list reference.
+            norms = torch._foreach_norm(device_grads, p)
+            torch._foreach_pow_(norms, p)
+            val += torch.sum(torch.stack(norms)).to(device)
         else:
             for g in device_grads:
-                val += (torch.norm(g, p=p)**p).to(default_device)
+                val += torch.norm(g, p=p).pow(p).to(device)
+    return val
+
+
+def _local_p_norm_stat_scalar(grads: List, p: float, val) -> 'torch.Tensor':
+    """Per-tensor fallback: add ``||g||_p ** p`` for each grad into ``val`` (mutated in place) and return it."""
+    import torch
+    device = val.device  # each term is moved to the accumulator's device before being added
+    for g in grads:
+        val += torch.norm(g, p=p).pow(p).to(device)
     return val
diff --git a/src/twinkle/utils/platforms/base.py b/src/twinkle/utils/platforms/base.py
index 71c2bd18f..483c725a3 100644
--- a/src/twinkle/utils/platforms/base.py
+++ b/src/twinkle/utils/platforms/base.py
@@ -136,3 +136,24 @@ def device_backend(platform: str = None):
     def get_vllm_device_uuid(device_id: int = 0, platform=None) -> str:
         platform = Platform.get_platform(platform)
         return platform.get_vllm_device_uuid(device_id)
+
+    @staticmethod
+    def get_device_rng_state(platform: str = None):
+        """Fetch the accelerator RNG state from the resolved platform backend.
+
+        Delegates to the platform's own ``get_device_rng_state``; a ``None``
+        result is passed through so callers can skip persisting it.
+        """
+        backend = Platform.get_platform(platform)
+        return backend.get_device_rng_state()
+
+    @staticmethod
+    def set_device_rng_state(state, *, platform: str = None) -> None:
+        """Hand a previously captured RNG state back to the platform backend.
+
+        A ``None`` state is ignored; anything else is forwarded to the resolved
+        platform's ``set_device_rng_state``.
+        """
+        if state is not None:
+            backend = Platform.get_platform(platform)
+            backend.set_device_rng_state(state)
diff --git a/src/twinkle/utils/platforms/gpu.py b/src/twinkle/utils/platforms/gpu.py
index 0b99f8855..0f213448e 100644
--- a/src/twinkle/utils/platforms/gpu.py
+++ b/src/twinkle/utils/platforms/gpu.py
@@ -24,3 +24,16 @@ def device_backend(platform: str = None):
     def get_vllm_device_uuid(device_id: int = 0) -> str:
         from vllm.platforms import current_platform
         return current_platform.get_device_uuid(device_id)
+
+    @staticmethod
+    def get_device_rng_state():
+        """Snapshot the CUDA RNG state, or ``None`` when CUDA is unavailable."""
+        import torch
+        cuda_ready = torch.cuda.is_available()
+        return torch.cuda.get_rng_state() if cuda_ready else None
+
+    @staticmethod
+    def set_device_rng_state(state) -> None:
+        from torch import cuda  # restore only when a CUDA device is usable
+        if cuda.is_available():
+            cuda.set_rng_state(state)
diff --git a/src/twinkle/utils/platforms/mps.py b/src/twinkle/utils/platforms/mps.py
index e99abb0e0..86ecf751e 100644
--- a/src/twinkle/utils/platforms/mps.py
+++ b/src/twinkle/utils/platforms/mps.py
@@ -40,3 +40,22 @@ def device_backend(platform: str = None):
     @staticmethod
     def get_vllm_device_uuid(device_id: int = 0) -> str:
         raise NotImplementedError
+
+    @staticmethod
+    def get_device_rng_state():
+        """Best-effort MPS RNG snapshot; ``None`` if unsupported or on any error."""
+        import torch
+        mps = getattr(torch, 'mps', None)
+        try:
+            return mps.get_rng_state() if hasattr(mps, 'get_rng_state') else None
+        except Exception:  # noqa: BLE001
+            return None
+
+    @staticmethod
+    def set_device_rng_state(state) -> None:
+        import torch
+        if hasattr(torch, 'mps') and hasattr(torch.mps, 'set_rng_state'):
+            try:
+                torch.mps.set_rng_state(state)
+            except Exception:  # noqa: BLE001
+                pass  # best-effort restore: any backend failure is deliberately ignored
diff --git a/src/twinkle/utils/platforms/npu.py b/src/twinkle/utils/platforms/npu.py
index 89066b280..de15707f6 100644
--- a/src/twinkle/utils/platforms/npu.py
+++ b/src/twinkle/utils/platforms/npu.py
@@ -133,3 +133,24 @@ def get_vllm_device_uuid(device_id: int = 0) -> str:
             visible = os.environ.get(Platform.visible_device_env())
             raw = f'{socket.gethostname()}:{visible}:{device_id}'
             return hashlib.sha1(raw.encode('utf-8')).hexdigest()[:16]
+
+    @staticmethod
+    def get_device_rng_state():
+        """Snapshot the NPU RNG state; ``None`` without torch_npu or a usable NPU."""
+        try:
+            import torch_npu  # noqa: F401
+        except ImportError:
+            return None
+        import torch
+        npu_ready = hasattr(torch, 'npu') and torch.npu.is_available()
+        return torch.npu.get_rng_state() if npu_ready else None
+
+    @staticmethod
+    def set_device_rng_state(state) -> None:
+        import torch
+        try:
+            import torch_npu  # noqa: F401
+        except ImportError:
+            return  # torch_npu is not installed: nothing to restore
+        if hasattr(torch, 'npu') and torch.npu.is_available():
+            torch.npu.set_rng_state(state)
diff --git a/src/twinkle/utils/torch_utils.py b/src/twinkle/utils/torch_utils.py
index 793cc26cd..eee46bf89 100644
--- a/src/twinkle/utils/torch_utils.py
+++ b/src/twinkle/utils/torch_utils.py
@@ -96,7 +96,7 @@ def selective_log_softmax(logits, index, return_entropy: bool = False):
                                           'under vocab tensor parallelism (TP>1).')
             # clone to avoid modifying the original logits
             return _vocab_parallel_selective_log_softmax(logits.clone(), index)
-    except (ImportError, AssertionError, OSError):
+    except (ImportError, AssertionError, OSError, RuntimeError):
         pass
 
     if logits.dtype in [torch.float32, torch.float64]:
@@ -352,8 +352,8 @@ def split_cp_inputs(inputs: 'torch.Tensor', cu_seqlens: Optional['torch.Tensor']
             val = inputs[tuple(slices)]
         view_shape = (*inputs.shape[:dim], 2 * cp_size, val.shape[dim] // (2 * cp_size), *inputs.shape[dim + 1:])
         val = val.view(view_shape)
-        index = torch.tensor([cp_rank, (2 * cp_size - cp_rank - 1)], device='cpu',
-                             pin_memory=True).cuda(non_blocking=True)
+        index = torch.tensor([cp_rank, (2 * cp_size - cp_rank - 1)],
+                             device=inputs.device, dtype=torch.long)
         val = val.index_select(dim, index)
         view_shape = (*inputs.shape[:dim], -1, *inputs.shape[dim + 1:])
         new_inputs.append(val.view(view_shape))
diff --git a/src/twinkle_agentic/envs/__init__.py b/src/twinkle_agentic/envs/__init__.py
new file mode 100644
index 000000000..4ce59e6cc
--- /dev/null
+++ b/src/twinkle_agentic/envs/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+from .base import Env, StepResult
+from .env_tool import EnvTool
+from .openenv import OpenEnv
+from .env_pool import EnvPool, EnvPoolAdapter
diff --git a/src/twinkle_agentic/envs/base.py b/src/twinkle_agentic/envs/base.py
new file mode 100644
index 000000000..a9e4b535f
--- /dev/null
+++ b/src/twinkle_agentic/envs/base.py
@@ -0,0 +1,97 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+from twinkle.data_format import Trajectory
+from twinkle.data_format.message import Tool as ToolInfo
+
+
+@dataclass
+class StepResult:
+    """Result returned by :meth:`Env.reset` and :meth:`Env.step`.
+
+    Attributes:
+        observation: Environment observation after the action, typically a
+            string describing the tool execution result.
+        reward: Scalar reward for this step (0.0 if unavailable until episode end).
+        done: Whether the episode is terminated after this step.
+        info: Arbitrary metadata for debugging / logging.
+    """
+    observation: str = ''
+    reward: float = 0.0
+    done: bool = False
+    info: Dict[str, Any] = field(default_factory=dict)
+
+
+class Env(ABC):
+    """Base class for RL execution environments.
+
+    Two usage modes:
+
+    1. **Interactive mode** (multi-turn rollout) — the environment participates
+       in the rollout by executing actions step-by-step::
+
+           env.reset(trajectory)
+           result = env.step(tool_name, arguments)
+           # ... repeat until result.done
+
+    2. **Batch evaluation mode** — evaluate a batch of completed trajectories
+       and return rewards::
+
+           rewards = env.evaluate(trajectories)
+
+    To bridge with the existing :class:`ToolManager` / :class:`MultiTurnRollout`,
+    wrap an ``Env`` instance with :class:`EnvTool` so it can be registered as a
+    regular tool.
+    """
+
+    def reset(self, trajectory: Optional[Trajectory] = None) -> StepResult:
+        """Reset environment for a new episode.
+
+        Args:
+            trajectory: Optional initial trajectory (user prompt / context).
+
+        Returns:
+            Initial observation after reset; the base class returns an empty ``StepResult``.
+        """
+        return StepResult()
+
+    @abstractmethod
+    def step(self, tool_name: str, arguments: Dict[str, Any]) -> StepResult:
+        """Execute a single action in the environment.
+
+        Args:
+            tool_name: Name of the tool / action to invoke.
+            arguments: Tool arguments as a JSON-serializable dict.
+
+        Returns:
+            StepResult with observation, reward, and done flag.
+        """
+        raise NotImplementedError
+
+    def tools(self) -> List[ToolInfo]:
+        """Return tool definitions available in this environment (``[]`` by default).
+
+        Used by :class:`EnvTool` to expose the environment's capabilities to
+        the LLM via the standard OpenAI-tools schema.
+        """
+        return []
+
+    def evaluate(self, trajectories: List[Trajectory], **kwargs) -> List[float]:
+        """Batch-evaluate completed trajectories and return rewards.
+
+        Default implementation returns 0.0 for all trajectories. Override for
+        custom reward computation (e.g. answer F1, code execution result).
+        """
+        return [0.0] * len(trajectories)
+
+    def close(self) -> None:
+        """Release environment resources (no-op by default; also called by ``__exit__``)."""
+        pass
+
+    def __enter__(self) -> 'Env':
+        return self
+
+    def __exit__(self, *args) -> None:
+        self.close()
diff --git a/src/twinkle_agentic/envs/env_pool.py b/src/twinkle_agentic/envs/env_pool.py
new file mode 100644
index 000000000..d0c6c4c66
--- /dev/null
+++ b/src/twinkle_agentic/envs/env_pool.py
@@ -0,0 +1,356 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""EnvPool: environment pool with @remote_class integration.
+
+Follows the same pattern as :class:`twinkle.dataloader.DataLoader`:
+- Decorated with ``@remote_class(execute='first')``
+- When instantiated **without** ``remote_group``, runs locally in the
+  current process (driver or worker) with zero RPC overhead.
+- When instantiated **with** ``remote_group='env'``, gets deployed to a
+  dedicated Ray Worker for process-level isolation.
+
+The pool manages N environment instances internally. Each slot is accessed
+by index. :class:`EnvPoolAdapter` wraps a single slot as a standard
+:class:`Env` so it can be used with :class:`EnvTool` / :class:`ToolManager`.
+
+Usage (local, inside MultiTurnRollout worker)::
+
+    pool = EnvPool(env_cls='blackjack_env:BlackjackEnv', pool_size=32)
+    adapters = pool.get_adapters(tool_schema=TOOL_SCHEMA)
+    # adapters[i] is a standard Env
+
+Usage (remote, on a dedicated DeviceGroup)::
+
+    pool = EnvPool(
+        env_cls='coding_env:CodingEnv',
+        pool_size=8,
+        remote_group='env',
+        device_mesh=env_mesh,
+    )
+"""
+import importlib
+import json
+from typing import Any, Callable, Dict, List, Optional, Type, Union
+
+from twinkle import DeviceMesh, remote_class, remote_function
+from twinkle.utils import get_logger
+from .base import Env, StepResult
+
+logger = get_logger()
+
+
+def _import_env_class(path: str):
+    """Resolve ``path`` to an environment class object.
+
+    ``'module:ClassName'`` is tried first, then ``'module.ClassName'``; the
+    split happens at the last separator. Raises ``ValueError`` if neither
+    separator is present and ``ImportError`` if the attribute is missing/None.
+    """
+    # Colon form takes precedence over the dotted form.
+    separator = next((sep for sep in (':', '.') if sep in path), None)
+    if separator is None:
+        raise ValueError(
+            f"env_cls must be 'module.ClassName' or 'module:ClassName', got {path!r}"
+        )
+    module_path, _, class_name = path.rpartition(separator)
+    # import_module errors (e.g. ModuleNotFoundError) propagate unchanged.
+    module = importlib.import_module(module_path)
+    env_class = getattr(module, class_name, None)
+    if env_class is None:
+        raise ImportError(f"Cannot find class {class_name!r} in module {module_path!r}")
+    return env_class
+
+
+def _format_observation(obs) -> str:
+    """Coerce an observation to text ('' for None, str passthrough, dict unwrapped)."""
+    if obs is None:
+        return ''
+    if isinstance(obs, str):
+        return obs
+    if not isinstance(obs, dict):
+        return str(obs)
+    hit = next((k for k in ('result', 'output', 'content', 'text', 'message') if k in obs), None)
+    if hit is not None:
+        return str(obs[hit])
+    try:
+        return json.dumps(obs, ensure_ascii=False, default=str)
+    except (TypeError, ValueError):
+        return str(obs)
+
+
+def _normalize_result(result) -> Dict[str, Any]:
+    """Normalize an environment result to a dict: observation (str), reward (float), done (bool)."""
+    # Already a dict
+    if isinstance(result, dict):
+        return {
+            'observation': _format_observation(result.get('observation', '')),
+            # A present-but-None reward counts as 0.0, same as the attribute branch below.
+            'reward': float(result.get('reward') or 0.0),
+            'done': bool(result.get('done', False)),
+        }
+    # Has .observation attribute (StepResult, OpenEnv result, etc.)
+    if hasattr(result, 'observation'):
+        return {
+            'observation': _format_observation(result.observation),
+            'reward': float(getattr(result, 'reward', 0.0) or 0.0),
+            'done': bool(getattr(result, 'done', False)),
+        }
+    # Fallback
+    return {
+        'observation': str(result) if result is not None else '',
+        'reward': 0.0,
+        'done': False,
+    }
+
+
+def _accepts_two_positional(method) -> bool:
+    """Return True if ``method`` declares at least two named positional parameters.
+
+    A parameter literally named ``self`` is not counted, ``*args`` is not
+    counted, and a callable whose signature cannot be inspected yields False.
+    """
+    import inspect
+    positional_kinds = (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD)
+    try:
+        parameters = inspect.signature(method).parameters.values()
+    except (ValueError, TypeError):
+        return False
+    named_positional = sum(
+        1 for param in parameters if param.name != 'self' and param.kind in positional_kinds)
+    return named_positional >= 2
+
+
+@remote_class(execute='first')
+class EnvPool:
+    """Pool of environment instances managed as a Twinkle remote_class.
+
+    Args:
+        env_cls: Import path to the environment class (e.g.
+            ``'blackjack_env:BlackjackEnv'``), or the class itself.
+        pool_size: Number of environment instances to create.
+        device_mesh: Optional DeviceMesh for distributed deployment.
+        env_kwargs: Extra keyword arguments for environment construction.
+    """
+
+    def __init__(
+        self,
+        env_cls: Union[str, Type],
+        pool_size: int = 32,
+        device_mesh: Optional[DeviceMesh] = None,
+        env_kwargs: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ):
+        # Resolve env class
+        if isinstance(env_cls, str):
+            self._env_cls = _import_env_class(env_cls)
+        else:
+            self._env_cls = env_cls
+
+        self._pool_size = pool_size
+        self._env_kwargs = env_kwargs or {}
+        self._episode_rewards: List[float] = [0.0] * pool_size
+
+        # Instantiate all environments
+        self._envs: List[Any] = []
+        for _ in range(pool_size):
+            self._envs.append(self._env_cls(**self._env_kwargs))
+
+        logger.info(f'EnvPool initialized: env_cls={env_cls}, pool_size={pool_size}')
+
+    @remote_function()
+    def reset(self, idx: int) -> Dict[str, Any]:
+        """Reset environment instance at slot ``idx`` and zero its episode reward.
+
+        Returns:
+            Dict with keys: observation, reward, done, episode_reward.
+        """
+        env = self._envs[idx]
+        self._episode_rewards[idx] = 0.0
+        result = env.reset()
+        normalized = _normalize_result(result)
+        normalized['episode_reward'] = 0.0
+        return normalized
+
+    @remote_function()
+    def step(self, idx: int, action: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute one step on environment at slot ``idx``.
+
+        Args:
+            idx: Environment slot index.
+            action: Action dict. If it contains 'tool_name' and 'arguments',
+                dispatches as ``env.step(tool_name, arguments)`` for Twinkle
+                Env protocol. Otherwise passes the action through unchanged.
+
+        Returns:
+            Dict with keys: observation, reward, done, episode_reward.
+        """
+        env = self._envs[idx]
+
+        # Dispatch based on env interface; non-dict actions (e.g. typed objects) pass through as-is.
+        if isinstance(action, dict) and 'tool_name' in action and 'arguments' in action:
+            if hasattr(env, 'step') and _accepts_two_positional(env.step):
+                result = env.step(action['tool_name'], action['arguments'])
+            else:
+                result = env.step(action)
+        else:
+            result = env.step(action)
+
+        normalized = _normalize_result(result)
+        self._episode_rewards[idx] += normalized['reward']
+        normalized['episode_reward'] = self._episode_rewards[idx]
+        return normalized
+
+    @remote_function()
+    def reset_batch(self, indices: List[int]) -> List[Dict[str, Any]]:
+        """Batch reset multiple environments.
+
+        Args:
+            indices: List of slot indices to reset.
+
+        Returns:
+            List of result dicts, one per index.
+        """
+        return [self.reset(i) for i in indices]
+
+    @remote_function()
+    def step_batch(self, indices: List[int], actions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Batch step multiple environments.
+
+        Args:
+            indices: List of slot indices.
+            actions: List of action dicts, aligned with indices.
+
+        Returns:
+            List of result dicts, one per index.
+        """
+        assert len(indices) == len(actions)
+        return [self.step(i, a) for i, a in zip(indices, actions)]
+
+    @remote_function()
+    def close(self) -> None:
+        """Release all environment resources; a failing env is logged and the rest still close."""
+        for env in self._envs:
+            if hasattr(env, 'close'):
+                try:
+                    env.close()
+                except Exception as e:
+                    logger.warning(f'EnvPool: failed to close env: {e}')
+        self._envs.clear()
+        logger.info('EnvPool closed.')
+
+    @property
+    def pool_size(self) -> int:
+        return self._pool_size
+
+    def get_adapters(
+        self,
+        n: Optional[int] = None,
+        tool_schema: Optional[List] = None,
+        action_mapper: Optional[Callable] = None,
+    ) -> List['EnvPoolAdapter']:
+        """Create EnvPoolAdapter instances for use with EnvTool/ToolManager.
+
+        Args:
+            n: Number of adapters to create (default: pool_size).
+            tool_schema: Tool definitions for the LLM.
+            action_mapper: Optional callable to transform (tool_name, arguments)
+                before passing to the environment.
+
+        Returns:
+            List of EnvPoolAdapter instances (indices 0..n-1).
+        """
+        if n is None:
+            n = self._pool_size
+        if n > self._pool_size:
+            raise ValueError(
+                f'Requested {n} adapters but pool only has {self._pool_size} slots.'
+            )
+        return [
+            EnvPoolAdapter(pool=self, idx=i, tool_schema=tool_schema, action_mapper=action_mapper)
+            for i in range(n)
+        ]
+
+
+class EnvPoolAdapter(Env):
+    """Wraps a single slot in an :class:`EnvPool` as a standard :class:`Env`.
+
+    This adapter allows a pool slot to be used transparently with
+    :class:`EnvTool`, :class:`ToolManager`, and :class:`MultiTurnRollout`.
+
+    Args:
+        pool: The EnvPool instance.
+        idx: Slot index in the pool.
+        tool_schema: Tool definitions for the environment.
+        action_mapper: Optional callable to transform (tool_name, arguments).
+    """
+
+    def __init__(
+        self,
+        pool: EnvPool,
+        idx: int,
+        tool_schema: Optional[List] = None,
+        action_mapper: Optional[Callable] = None,
+    ):
+        self._pool = pool
+        self._idx = idx
+        self._tool_schema = tool_schema
+        self._action_mapper = action_mapper
+        self._episode_reward: float = 0.0
+
+    def reset(self, trajectory=None) -> StepResult:
+        """Reset the environment slot; ``trajectory`` is not forwarded to the pool."""
+        self._episode_reward = 0.0
+        result = self._pool.reset(self._idx)
+        return StepResult(
+            observation=result.get('observation', ''),
+            reward=0.0,
+            done=False,
+            info=result,
+        )
+
+    def step(self, tool_name: str, arguments: Dict[str, Any]) -> StepResult:
+        """Execute a tool call on the slot; any exception ends the episode with an error result."""
+        try:
+            if self._action_mapper is not None:
+                action = self._action_mapper(tool_name, arguments)
+            else:
+                action = {'tool_name': tool_name, 'arguments': arguments}
+
+            result = self._pool.step(self._idx, action)
+            obs = result.get('observation', '')
+            reward = float(result.get('reward', 0.0))
+            done = bool(result.get('done', False))
+            self._episode_reward = float(result.get('episode_reward', 0.0))
+
+            return StepResult(
+                observation=obs,
+                reward=reward,
+                done=done,
+                info={'raw_result': result, 'episode_reward': self._episode_reward},
+            )
+        except Exception as e:
+            logger.warning(f'EnvPoolAdapter step error (idx={self._idx}): {e}')
+            return StepResult(
+                observation=f'Error: {e}',
+                reward=0.0,
+                done=True,
+                info={'error': str(e)},
+            )
+
+    def tools(self) -> List:
+        """Return the tool definitions given at construction (``[]`` if none)."""
+        if self._tool_schema is not None:
+            return self._tool_schema
+        return []
+
+    def evaluate(self, trajectories, **kwargs) -> List[float]:
+        """No-op returning 0.0 per trajectory: rewards are accumulated per-step."""
+        return [0.0] * len(trajectories)
+
+    @property
+    def episode_reward(self) -> float:
+        """Cumulative reward for the current episode, as last reported by the pool."""
+        return self._episode_reward
+
+    def close(self) -> None:
+        """No-op: lifecycle managed by the pool."""
+        pass
diff --git a/src/twinkle_agentic/envs/env_tool.py b/src/twinkle_agentic/envs/env_tool.py
new file mode 100644
index 000000000..0712b4f8d
--- /dev/null
+++ b/src/twinkle_agentic/envs/env_tool.py
@@ -0,0 +1,103 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+from typing import Any, Dict, List, Optional
+
+from twinkle.data_format.message import Tool as ToolInfo
+from twinkle_agentic.tools.base import Tool
+from .base import Env, StepResult
+
+
+class EnvTool(Tool):
+    """Wraps a :class:`Env` environment as a :class:`Tool` for ToolManager.
+
+    Each ``EnvTool`` instance maps to one tool name. When the LLM generates a
+    tool call with that name, it is dispatched to ``env.step(tool_name, args)``.
+
+    The observation string is returned as the tool response content. The reward
+    and done flag are stored on the instance for the caller to inspect.
+
+    Args:
+        env: The Env environment instance.
+        tool_name: Name of the tool this adapter represents.
+        description: Human-readable description for the LLM.
+        parameters: JSON Schema dict describing the tool's parameters.
+            Default to an empty object schema (accepts any arguments).
+    """
+
+    def __init__(
+        self,
+        env: Env,
+        tool_name: str = 'env_action',
+        description: str = 'Execute an action in the environment.',
+        parameters: Optional[Dict[str, Any]] = None,
+    ):
+        self._env = env
+        self._tool_name = tool_name
+        self._description = description
+        self._parameters = parameters or {'type': 'object', 'properties': {}}
+        # Last step result for inspection by callers
+        self.last_result: Optional[StepResult] = None
+
+    def __call__(self, tool_name: str, arguments: Dict[str, Any]) -> str:
+        """Dispatch tool call to the underlying Env environment.
+
+        Args:
+            tool_name: Tool name (should match ``self._tool_name``; not checked here).
+            arguments: Tool call arguments from the LLM.
+
+        Returns:
+            Observation string from the environment.
+        """
+        result = self._env.step(tool_name, arguments)
+        self.last_result = result
+        return result.observation
+
+    def tool_info(self) -> ToolInfo:
+        """Return OpenAI-compatible tool schema."""
+        return {
+            'type': 'function',
+            'function': {
+                'name': self._tool_name,
+                'description': self._description,
+                'parameters': self._parameters,
+            },
+        }
+
+    @property
+    def done(self) -> bool:
+        """Whether the last step terminated the episode (False before any call)."""
+        return self.last_result.done if self.last_result else False
+
+    @property
+    def episode_reward(self) -> float:
+        """Episode reward from the last result's info, else that step's reward (0.0 before any call)."""
+        if self.last_result and 'episode_reward' in self.last_result.info:
+            return self.last_result.info['episode_reward']
+        return self.last_result.reward if self.last_result else 0.0
+
+    @classmethod
+    def from_env(cls, env: Env) -> List['EnvTool']:
+        """Create one EnvTool per tool defined in the Env.
+
+        If the env exposes multiple tools via :meth:`Env.tools`, this creates
+        a list of ``EnvTool`` instances (one per tool) that can all be
+        registered into the same :class:`ToolManager`.
+
+        If the env has no tool definitions, returns a single generic adapter.
+        """
+        tool_infos = env.tools()
+        if not tool_infos:
+            return [cls(env)]
+
+        tools = []
+        for info in tool_infos:
+            fn = info.get('function', {}) if isinstance(info, dict) else {}
+            name = fn.get('name', 'env_action')
+            desc = fn.get('description', '')
+            params = fn.get('parameters', {'type': 'object', 'properties': {}})
+            tools.append(cls(
+                env=env,
+                tool_name=name,
+                description=desc,
+                parameters=params,
+            ))
+        return tools
diff --git a/src/twinkle_agentic/envs/openenv.py b/src/twinkle_agentic/envs/openenv.py
new file mode 100644
index 000000000..1d749289e
--- /dev/null
+++ b/src/twinkle_agentic/envs/openenv.py
@@ -0,0 +1,245 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import importlib
+import json
+import importlib.util
+import os
+from typing import Any, Callable, Dict, List, Optional, Type
+from twinkle.utils import get_logger
+from twinkle.data_format import Trajectory
+from twinkle.data_format.message import Tool as ToolInfo
+from .base import Env, StepResult
+
+logger = get_logger()
+
+
+def _import_class(dotted_path: str):
+    """Dynamically import a class from ``'module.ClassName'`` or ``'module:ClassName'``.
+
+    Example: ``'coding_env.CodingEnv'`` imports ``CodingEnv`` from ``coding_env``.
+    The colon form wins, so ``'pkg.mod:Cls'`` splits at the colon, not the last dot.
+    """
+    sep = ':' if ':' in dotted_path else '.'
+    module_path, found, class_name = dotted_path.rpartition(sep)
+    if not found:
+        raise ValueError(
+            f'env_cls must be "module.ClassName" or "module:ClassName", got {dotted_path!r}'
+        )
+    module = importlib.import_module(module_path)
+    cls = getattr(module, class_name, None)
+    if cls is None:
+        raise ImportError(
+            f'Cannot find class {class_name!r} in module {module_path!r}'
+        )
+    return cls
+
+
+def _get_generic_env_client():
+    """Return openenv's ``GenericEnvClient`` class; raise ImportError if both import paths fail."""
+    try:
+        from openenv.core.generic_client import GenericEnvClient
+        return GenericEnvClient
+    except ImportError:
+        pass
+    # Fallback: load core/generic_client.py straight from its file path, bypassing core __init__
+    try:
+        import openenv
+        pkg_dir = os.path.dirname(openenv.__file__)
+        spec = importlib.util.spec_from_file_location(
+            'openenv.core.generic_client',
+            os.path.join(pkg_dir, 'core', 'generic_client.py'),
+        )
+        if spec and spec.loader:
+            mod = importlib.util.module_from_spec(spec)
+            # Pre-register client_types so generic_client can import it (presumably it does; confirm)
+            ct_spec = importlib.util.spec_from_file_location(
+                'openenv.core.client_types',
+                os.path.join(pkg_dir, 'core', 'client_types.py'),
+            )
+            if ct_spec and ct_spec.loader:
+                import sys
+                ct_mod = importlib.util.module_from_spec(ct_spec)
+                # NOTE(review): this entry stays in sys.modules even if exec_module below fails.
+                sys.modules['openenv.core.client_types'] = ct_mod
+                ct_spec.loader.exec_module(ct_mod)
+            spec.loader.exec_module(mod)
+            return mod.GenericEnvClient
+    except Exception:  # best-effort fallback: any failure ends in the ImportError below
+        pass
+    raise ImportError(
+        'Cannot import GenericEnvClient from openenv. '
+        'Please install openenv: pip install openenv'
+    )
+
+
+class OpenEnv(Env):
+    """Adapter that wraps an OpenEnv ``EnvClient`` as a Twinkle :class:`Env`.
+
+    OpenEnv environments communicate via WebSocket (async). This adapter
+    provides a synchronous interface via the ``.sync()`` wrapper, making it
+    compatible with Twinkle's synchronous :class:`MultiTurnRollout`.
+
+    Args:
+        base_url: URL of the running OpenEnv environment server
+            (e.g. ``'http://localhost:8000'``).
+        env_cls: Optional dotted import path for a typed OpenEnv client class,
+            e.g. ``'coding_env.CodingEnv'`` or ``'echo_env.EchoEnv'``.
+            Alternatively pass the class object directly.
+            If *None*, uses ``GenericEnvClient`` (works with any environment
+            using plain dict actions/observations).
+        env_kwargs: Extra keyword arguments forwarded to the EnvClient
+            constructor (e.g. ``connect_timeout_s``, ``message_timeout_s``).
+        tool_schema: Optional list of tool definitions (OpenAI function-call
+            schema). If provided, :meth:`tools` will return them.
+        action_mapper: Optional callable ``(tool_name, arguments) -> action`` whose
+            result is sent to the client instead of the default action dict.
+    """
+
+    def __init__(
+        self,
+        base_url: str,
+        env_cls: Any = None,
+        env_kwargs: Optional[Dict[str, Any]] = None,
+        tool_schema: Optional[List[ToolInfo]] = None,
+        action_mapper: Optional[Callable[[str, Dict[str, Any]], Dict[str, Any]]] = None,
+    ):
+        # Resolve env_cls
+        if env_cls is None:
+            self._env_cls: Type = _get_generic_env_client()
+        elif isinstance(env_cls, str):
+            self._env_cls = _import_class(env_cls)
+        else:
+            self._env_cls = env_cls
+
+        self._base_url = base_url
+        self._env_kwargs = env_kwargs or {}
+        self._tool_schema = tool_schema
+        self._action_mapper = action_mapper
+        self._sync_client = None
+        self._episode_reward: float = 0.0
+
+    def _ensure_client(self):
+        """Lazily create and connect the synchronous client."""
+        if self._sync_client is not None:
+            return
+        client = self._env_cls(base_url=self._base_url, **self._env_kwargs)
+        # .sync() returns a SyncEnvClient with __enter__/__exit__
+        sync_client = client.sync()
+        # Publish the client only after __enter__ succeeds, so a failed
+        # connection attempt is retried instead of leaving a dead handle.
+        sync_client.__enter__()
+        self._sync_client = sync_client
+
+    def reset(self, trajectory: Optional[Trajectory] = None) -> StepResult:
+        """Reset the OpenEnv environment for a new episode.
+
+        Args:
+            trajectory: Ignored for OpenEnv (state is server-managed).
+
+        Returns:
+            StepResult with the initial observation.
+        """
+        self._ensure_client()
+        self._episode_reward = 0.0
+        result = self._sync_client.reset()
+        obs = self._format_observation(result)
+        return StepResult(
+            observation=obs,
+            reward=0.0,
+            done=False,
+            info={'raw_result': result},
+        )
+
+    def step(self, tool_name: str, arguments: Dict[str, Any]) -> StepResult:
+        """Execute a tool call in the OpenEnv environment.
+
+        The action is sent as a dict ``{tool_name: ..., arguments: ...}``
+        which is the standard format accepted by ``GenericEnvClient`` and
+        typed clients via ``_step_payload``.
+
+        Args:
+            tool_name: Name of the tool to invoke.
+            arguments: Tool arguments dict.
+
+        Returns:
+            StepResult with observation, step reward, and done flag.
+        """
+        self._ensure_client()
+        try:
+            if self._action_mapper is not None:
+                action = self._action_mapper(tool_name, arguments)
+            else:
+                action = {'tool_name': tool_name, 'arguments': arguments}
+            result = self._sync_client.step(action)
+
+            obs = self._format_observation(result)
+            reward = getattr(result, 'reward', 0.0) or 0.0
+            done = getattr(result, 'done', False) or False
+            self._episode_reward += reward
+
+            return StepResult(
+                observation=obs,
+                reward=reward,
+                done=done,
+                info={'raw_result': result, 'episode_reward': self._episode_reward},
+            )
+        except Exception as e:
+            logger.warning(f'OpenEnv step error: {e}')
+            return StepResult(
+                observation=f'Error: {e}',
+                reward=0.0,
+                done=True,
+                info={'error': str(e)},
+            )
+
+    def tools(self) -> List[ToolInfo]:
+        """Return the ``tool_schema`` given at construction, or ``[]`` if none was given."""
+        return self._tool_schema if self._tool_schema is not None else []
+
+    def evaluate(self, trajectories: List[Trajectory], **kwargs) -> List[float]:
+        """OpenEnv environments provide per-step rewards; episode reward is
+        accumulated in ``info['episode_reward']``. This method is a no-op."""
+        return [0.0] * len(trajectories)
+
+    def close(self) -> None:
+        """Disconnect from the OpenEnv server; a failing disconnect is logged, not raised."""
+        if self._sync_client is not None:
+            try:
+                self._sync_client.__exit__(None, None, None)
+            except Exception as e:
+                logger.warning(f'OpenEnv close error: {e}')
+            self._sync_client = None
+
+    # ------------------------------------------------------------------
+    # Private helpers
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _format_observation(result) -> str:
+        """Extract a string observation from an OpenEnv StepResult.
+
+        OpenEnv's ``StepResult.observation`` may be a dict (GenericEnvClient),
+        a typed object, or a string depending on the client class.
+        """
+        obs = getattr(result, 'observation', None)
+        if obs is None:
+            return ''
+        if isinstance(obs, str):
+            return obs
+        # Dict observations (GenericEnvClient)
+        if isinstance(obs, dict):
+            # Common patterns in tool-based envs
+            for key in ('result', 'output', 'content', 'text', 'message'):
+                if key in obs:
+                    return str(obs[key])
+            # Return full dict as JSON
+            try:
+                return json.dumps(obs, ensure_ascii=False, default=str)
+            except (TypeError, ValueError):
+                return str(obs)
+        # Typed observation objects
+        for attr in ('result', 'content', 'output', 'text'):
+            if hasattr(obs, attr):
+                return str(getattr(obs, attr))
+        try:
+            return json.dumps(obs, ensure_ascii=False, default=str)
+        except (TypeError, ValueError):
+            return str(obs)
diff --git a/src/twinkle_agentic/preprocessor/message_normalizer.py b/src/twinkle_agentic/preprocessor/message_normalizer.py
index a8606d8f1..d3074a565 100644
--- a/src/twinkle_agentic/preprocessor/message_normalizer.py
+++ b/src/twinkle_agentic/preprocessor/message_normalizer.py
@@ -105,12 +105,12 @@ def _normalize_tool_calls(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]
             nxt_text = msg_content_text(messages[j])
             if not nxt_text:
                 break
-            if parser.detect_result(nxt_text):
-                body = parser.parse_result(nxt_text)
-            elif tc_idx == 0 and len(tc_list) == 1:
-                body = nxt_text
-            else:
-                break
+            body = parser.extract_tool_result(nxt_text)
+            if body is None:
+                if tc_idx == 0 and len(tc_list) == 1:
+                    body = nxt_text
+                else:
+                    break
             out.append({
                 'role': 'tool',
                 'content': body,
diff --git a/src/twinkle_agentic/rollout/api_multi_turn.py b/src/twinkle_agentic/rollout/api_multi_turn.py
index 322b4a0cd..7521a454f 100644
--- a/src/twinkle_agentic/rollout/api_multi_turn.py
+++ b/src/twinkle_agentic/rollout/api_multi_turn.py
@@ -1,7 +1,7 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 import os
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional
 
 from twinkle.data_format import Trajectory
 from twinkle.data_format.sampling import SamplingParams
@@ -174,13 +174,27 @@ def _run_one(
             if not tool_calls:
                 stop_reason = _STOP_NO_TOOL
                 break
-            for tc in tool_calls:
-                response = tool_manager(tc)
-                messages.append({
-                    'role': 'tool',
-                    'tool_call_id': tc.get('id'),
-                    'content': str(response),
-                })
+
+            # Skip tool execution at the last turn — results would never be
+            # consumed by a subsequent API call (consistent with multi_turn.py).
+            if turn >= self.max_turns:
+                truncated = True
+                stop_reason = _STOP_MAX_TURNS
+                break
+
+            try:
+                for tc in tool_calls:
+                    response = tool_manager(tc)
+                    messages.append({
+                        'role': 'tool',
+                        'tool_call_id': tc.get('id'),
+                        'content': str(response),
+                    })
+            except Exception as exc:
+                stop_reason = _STOP_API_ERROR
+                error = f'ToolExecution {type(exc).__name__}: {exc}'
+                truncated = True
+                break
         else:
             # Loop exited normally => max_turns reached.
             truncated = True
diff --git a/src/twinkle_agentic/rollout/base.py b/src/twinkle_agentic/rollout/base.py
index 0d36bbcd6..64d9f922b 100644
--- a/src/twinkle_agentic/rollout/base.py
+++ b/src/twinkle_agentic/rollout/base.py
@@ -1,6 +1,6 @@
 # Copyright (c) ModelScope Contributors. All rights reserved.
 from abc import ABC, abstractmethod
-from typing import List, Optional
+from typing import List
 
 from twinkle.data_format import Trajectory
 
diff --git a/src/twinkle_agentic/rollout/multi_turn.py b/src/twinkle_agentic/rollout/multi_turn.py
index e25e89de9..c76f93354 100644
--- a/src/twinkle_agentic/rollout/multi_turn.py
+++ b/src/twinkle_agentic/rollout/multi_turn.py
@@ -77,7 +77,7 @@ def __init__(
         self,
         sampler,
         template: Template,
-        tool_manager: ToolManager,
+        tool_manager: Optional[ToolManager] = None,
         sampling_params: Optional[SamplingParams] = None,
         max_turns: int = 6,
         max_trajectory_tokens: Optional[int] = None,
@@ -88,8 +88,6 @@ def __init__(
         super().__init__()
         if template is None:
             raise ValueError('MultiTurnRollout requires a local Template instance')
-        if tool_manager is None:
-            raise ValueError('MultiTurnRollout requires a ToolManager')
         if max_turns < 1:
             raise ValueError(f'max_turns must be >= 1, got {max_turns}')
         if max_trajectory_tokens is not None and max_trajectory_tokens < 1:
@@ -112,7 +110,7 @@ def __init__(
                              f'got {self.sampling_params.num_samples}')
         assert self.template.truncation_strategy != 'split', (
             "MultiTurnRollout does not support truncation_strategy='split'; "
-            'use left/right/raise on the template.')
+            'use left/right/delete/raise on the template.')
 
     @remote_function()
     def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]:
@@ -216,7 +214,13 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
             # outstanding tool turns. Done serially: bridge computation is
             # a cheap decode-diff-encode on python strings / token lists.
             for global_idx, tool_messages in pending_bridges:
-                pifs[global_idx] = self._extend_with_bridge(pifs[global_idx], tool_messages)
+                extended = self._extend_with_bridge(pifs[global_idx], tool_messages)
+                if extended is None:
+                    # Trajectory exceeded max_length, mark as done (deleted)
+                    truncated[global_idx] = True
+                    done[global_idx] = True
+                else:
+                    pifs[global_idx] = extended
 
         for i in range(n):
             if not all_logprobs[i]:
@@ -257,6 +261,10 @@ def __call__(self, trajectories: List[Trajectory], **kwargs) -> List[Trajectory]
     @staticmethod
     def _resolve_tool_managers(arg, n: int) -> List[ToolManager]:
         """Broadcast a single ``ToolManager`` or validate a per-trajectory list."""
+        if arg is None:
+            raise ValueError(
+                'tool_manager is required but was not provided. '
+                'Pass it at construction time or as a per-call kwarg.')
         if isinstance(arg, list):
             if len(arg) != n:
                 raise ValueError(f'per-call tool_manager list length ({len(arg)}) does '
@@ -446,6 +454,9 @@ def _extend_with_bridge(
             raise RuntimeError(f'Bridge text tokenised to empty id list: {bridge_text!r}')
 
         new_pif = self._append_bridge_tokens(pif, bridge_ids)
+        if new_pif is None:
+            # Trajectory exceeds max_length and strategy is 'delete'
+            return None
         new_pif['messages'] = messages_after
         return new_pif
 
@@ -491,11 +502,16 @@ def _append_bridge_tokens(
             mm = result['mm_token_type_ids']
             if not isinstance(mm, torch.Tensor):
                 mm = torch.as_tensor(mm)
-            pad = torch.zeros((mm.shape[0], len(bridge_ids)), dtype=mm.dtype, device=mm.device)
-            result['mm_token_type_ids'] = torch.cat([mm, pad], dim=1)
+            # Pad along the last (sequence) dim — handles 1D [T] and 2D [1, T] uniformly.
+            leading_shape = mm.shape[:-1]
+            pad = torch.zeros((*leading_shape, len(bridge_ids)), dtype=mm.dtype, device=mm.device)
+            result['mm_token_type_ids'] = torch.cat([mm, pad], dim=-1)
 
         # Replay the post pipeline: refresh attention_mask / position_ids /
         # length and re-roll labels back into output/shifted order.
-        refreshed = self.template._invoke_post_pipeline([result])[0]
-        result.update(refreshed)
+        refreshed_list = self.template._invoke_post_pipeline([result])
+        if not refreshed_list:
+            # truncation_strategy='delete': trajectory exceeds max_length
+            return None
+        result.update(refreshed_list[0])
         return _to_plain(result)
diff --git a/src/twinkle_agentic/tools/tool_manager.py b/src/twinkle_agentic/tools/tool_manager.py
index eb60cbae8..46cbc907d 100644
--- a/src/twinkle_agentic/tools/tool_manager.py
+++ b/src/twinkle_agentic/tools/tool_manager.py
@@ -45,7 +45,7 @@ def __init__(
                         f'got {type(tools).__name__}')
 
     def register(self, tool: Tool):
-        info = tool.tool_info()
+        info = tool.tool_info() if hasattr(tool, 'tool_info') else None
         name = _extract_name(info)
         if not name:
             raise ValueError(f'tool {type(tool).__name__} must expose a non-empty '
diff --git a/src/twinkle_client/skills/__init__.py b/src/twinkle_client/skills/__init__.py
new file mode 100644
index 000000000..3b82813bb
--- /dev/null
+++ b/src/twinkle_client/skills/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Twinkle Contributors. All rights reserved.
+"""Skills loading framework - extensible providers for agent skill injection."""
+
+from twinkle_client.skills.base import SkillProvider
+from twinkle_client.skills.local_provider import LocalSkillProvider
+from twinkle_client.skills.manager import SkillManager
+from twinkle_client.skills.modelscope_provider import ModelScopeSkillProvider
+
+__all__ = ['SkillProvider', 'SkillManager', 'LocalSkillProvider', 'ModelScopeSkillProvider']
diff --git a/src/twinkle_client/skills/base.py b/src/twinkle_client/skills/base.py
new file mode 100644
index 000000000..bf929f49a
--- /dev/null
+++ b/src/twinkle_client/skills/base.py
@@ -0,0 +1,121 @@
+# Copyright (c) Twinkle Contributors. All rights reserved.
+"""Base class for skill providers.
+
+A SkillProvider is responsible for fetching skill files (typically markdown)
+from a remote source (Git repo, API, local directory, etc.) and making their
+content available for injection into the agent's system prompt.
+
+To create a new provider, subclass SkillProvider and implement:
+- `name` property: human-readable provider name
+- `fetch()`: download/update skill files to local cache
+- optionally `load_skills()` / `_skills_root()`: the default scans the cache directory for `.md` files
+"""
+
+from __future__ import annotations
+
+import dataclasses
+from twinkle.utils.logger import get_logger
+from abc import ABC, abstractmethod
+from pathlib import Path
+
+logger = get_logger()
+
+# File stems to skip when scanning for skill markdown files
+_SKIP_STEMS = frozenset({'license', 'readme', 'contributing', 'changelog'})
+
+
+@dataclasses.dataclass
+class Skill:
+    """One skill document produced by a provider.
+
+    Attributes:
+        name: Short identifier for the skill (the file stem for scanned markdown files).
+        content: Full markdown content of the skill.
+        source: ``'<provider name>/<relative path>'`` for traceability.
+    """
+
+    name: str
+    content: str
+    source: str
+
+
+class SkillProvider(ABC):
+    """Abstract base class for skill providers.
+
+    Subclasses supply ``name`` and ``fetch()``; ``load_skills()`` has a default.
+    """
+
+    def __init__(self, cache_dir: Path | str | None = None):
+        """Initialize the provider.
+
+        Args:
+            cache_dir: Local directory (``Path`` or ``str``) to cache fetched skill files.
+                       If None, uses ~/.cache/twinkle/tui/skills/<provider_name>
+        """
+        if cache_dir is None:
+            cache_dir = Path.home() / '.cache' / 'twinkle' / 'tui' / 'skills' / self.name
+        self.cache_dir = Path(cache_dir)  # normalise so str paths work with .exists()/.rglob()
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Human-readable name of this provider (used as cache subdirectory)."""
+        ...
+
+    @abstractmethod
+    async def fetch(self) -> None:
+        """Fetch or update skill files from the remote source.
+
+        Should clone/pull a git repo, download files, etc.
+        Results should be stored in self.cache_dir.
+        """
+        ...
+
+    async def load_skills(self) -> list[Skill]:
+        """Load all .md skill files from the provider's root directory.
+
+        Override `_skills_root` if the scan directory differs from cache_dir.
+        """
+        root = self._skills_root()
+        if not root.exists():
+            logger.warning(f'[{self.name}] Skills directory not found: {root}')
+            return []
+        return self._scan_markdown_files(root)
+
+    def _skills_root(self) -> Path:
+        """Return the root directory to scan for .md files. Override as needed."""
+        return self.cache_dir
+
+    def _scan_markdown_files(self, root: Path) -> list[Skill]:
+        """Scan a directory tree for markdown skill files, in sorted path order.
+
+        Skips dot-prefixed path components and common non-skill files (README, LICENSE, etc.).
+        """
+        skills: list[Skill] = []
+        for md_file in sorted(root.rglob('*.md')):
+            rel = md_file.relative_to(root)
+            # Skip hidden entries: rel.parts covers the file name as well as its directories
+            if any(part.startswith('.') for part in rel.parts):
+                continue
+            # Skip common non-skill files
+            if md_file.stem.lower() in _SKIP_STEMS:
+                continue
+            try:
+                content = md_file.read_text(encoding='utf-8')
+                skills.append(Skill(
+                    name=md_file.stem,
+                    content=content,
+                    source=f'{self.name}/{rel}',
+                ))
+            except Exception as e:  # best effort: one unreadable file must not stop the scan
+                logger.warning(f'[{self.name}] Failed to read {md_file}: {e}')
+        logger.info(f'[{self.name}] Loaded {len(skills)} skills from {root}')
+        return skills
+
+    async def get_skills(self) -> list[Skill]:
+        """Convenience method: fetch then load.
+
+        Can be overridden if a provider wants custom logic.
+        """
+        await self.fetch()
+        return await self.load_skills()
diff --git a/src/twinkle_client/skills/bundled/__init__.py b/src/twinkle_client/skills/bundled/__init__.py
new file mode 100644
index 000000000..e588626bc
--- /dev/null
+++ b/src/twinkle_client/skills/bundled/__init__.py
@@ -0,0 +1,2 @@
+# Bundled skill documents for the TUI agent.
+# These .md files are included as package data and loaded at runtime.
diff --git a/src/twinkle_client/skills/bundled/autoresearch.md b/src/twinkle_client/skills/bundled/autoresearch.md
new file mode 100644
index 000000000..0af187617
--- /dev/null
+++ b/src/twinkle_client/skills/bundled/autoresearch.md
@@ -0,0 +1,345 @@
+# AutoResearch Skill
+
+You are an expert ML research assistant. Guide users through designing and executing training experiments using Twinkle.
+
+## Workflow
+
+1. **Requirements & Resources** → 2. **Model Selection** → 3. **Training Method** → 4. **Dataset** → 5. **Hyperparameters** → 6. **Multi-Stage Pipelines** → 7. **Data Preparation**
+
+## Step 1: Requirements & Resources
+
+Gather before any training:
+- **Goal**: Reasoning / Alignment / Domain specialization / Multimodal
+- **Hardware**: GPU count, type (A100/H100/L40), VRAM (40/80GB)
+- **Baseline**: Fresh model or resume from checkpoint?
+- **Success criteria**: Benchmark target (GSM8K, MMLU, HumanEval) or reward convergence
+
+## Step 2: Model Selection
+
+| GPU × VRAM | Max Model Size | Examples |
+|------------|---------------|----------|
+| 1-2 × 80GB | 7B | Qwen/Qwen3.5-4B, Qwen/Qwen3.5-7B |
+| 4 × 80GB | 14B | Qwen/Qwen3.5-14B |
+| 8 × 80GB | 32B | Qwen/Qwen3.5-32B |
+| 16+ × 80GB | 72B | Qwen/Qwen3.5-72B (TP+DP) |
+
+**LoRA** (default): rank 8-64, limited GPU, fast iteration, preserve base capability.
+**Full FT**: 8+ GPUs, fundamental capability change, pre-training.
+
+## Step 3: Training Method
+
+```
+Has labeled input-output pairs? → SFT
+Has preference pairs (chosen/rejected)? → DPO
+Has verifiable reward signal? → GRPO
+Has teacher model? → GKD
+Large unlabeled corpus? → PT (pre-training)
+```
+
+| Method | Data | Compute | Best For |
+|--------|------|---------|----------|
+| SFT | Labeled pairs | Low | Initial capability |
+| GRPO | Prompts + reward fn | High | Reasoning, code |
+| DPO | Preference pairs | Medium | Alignment |
+| GKD | Teacher model | Medium | Distillation |
+
+## Step 4: Dataset
+
+| Task | Datasets | Size |
+|------|----------|------|
+| Math | `ms://modelscope/gsm8k`, competition_math | 1K-8K |
+| Code | humaneval, mbpp | 0.5K-10K |
+| Chat | sharegpt, ultrachat | 50K-500K |
+| DPO | shareAI-Llama3-DPO-zh-en-emoji | 10K-100K |
+| Self-cognition | swift/self-cognition | ~500 |
+
+**Volume guidelines:** SFT 10K-100K, GRPO 5K-50K prompts, DPO 10K-100K pairs.
+Quality > Quantity for all methods.
+
+## Step 5: Hyperparameters
+
+**SFT**: lr=1e-5~5e-5, batch=4-16, epochs=2-5, lora_rank=8-32
+**GRPO**: lr=1e-6~2e-5, batch=4-8 prompts, num_generations=4-16, epsilon=0.1-0.3, max_steps=200-2000
+**DPO**: lr=5e-7~5e-6, beta=0.1, max_steps=500-3000
+
+**Troubleshooting:**
+- NaN loss → reduce lr 10x, gradient clipping max_grad_norm=1.0
+- Reward plateau → increase num_generations, try different reward
+- OOM → reduce micro_batch_size, enable gradient_checkpointing
+- Too slow → increase batch_size, reduce num_generations
+
+## Step 6: Multi-Stage Pipelines
+
+**Reasoning Enhancement**: Data cleaning → SFT warm-up (1-2 epochs) → GRPO
+**General Alignment**: (Optional PT) → SFT → DPO/SimPO
+**Distillation**: GKD from teacher → Self-play GRPO
+
+Between stages: save checkpoint, evaluate, resume from best.
+
+## Step 7: Data Preparation
+
+Standard format for Twinkle:
+```python
+Trajectory(messages=[
+    Message(role='user', content='...'),
+    Message(role='assistant', content='...'),
+])
+# DPO: Trajectory(messages=chosen, extend_message=[('rejected_messages', rejected)])
+```
+
+Quality filters: drop samples shorter than 10 tokens, samples with encoding errors, and samples in the wrong language; remove near-duplicates (MinHash, Jaccard > 0.8).
+# AutoResearch Skill — Detailed Guide
+
+You are an expert ML research assistant. Guide users through the complete workflow of designing and executing training experiments using Twinkle.
+
+## Step 1: Requirements Analysis
+
+Before any training, systematically gather:
+
+1. **Training Objective**: What capability to improve?
+   - Reasoning (math, logic, code)
+   - Alignment (helpfulness, harmlessness)
+   - Domain specialization (medical, legal, finance)
+   - Multimodal understanding
+   - Instruction following
+
+2. **Hardware Resources**:
+   - GPU count and type (A100/H100/L40/NPU)
+   - Memory per GPU (40GB/80GB)
+   - Available storage for checkpoints
+
+3. **Baseline**: Starting point?
+   - Fresh base model or continue from a checkpoint?
+   - Previous experiment results to compare against?
+
+4. **Success Criteria**: How to measure?
+   - Benchmark scores (GSM8K accuracy, MMLU, HumanEval)
+   - Reward model scores
+   - Human evaluation criteria
+   - Loss/reward convergence targets
+
+## Step 2: Dataset Selection
+
+### By Task Type
+
+| Task | Recommended Datasets | Source |
+|------|---------------------|--------|
+| Math reasoning | GSM8K, MATH, Competition Math | `ms://modelscope/gsm8k` |
+| Code generation | CodeAlpaca, CodeFeedback | HuggingFace/ModelScope |
+| General alignment | UltraChat, ShareGPT | `ms://` prefixed |
+| Preference (DPO) | UltraFeedback, HH-RLHF | `ms://` prefixed |
+| Self-cognition | Built-in (SelfCognitionProcessor) | N/A |
+| Domain-specific | Search ModelScope/HuggingFace | Use `modelscope` SDK |
+
+### Data Volume Guidelines
+
+| Method | Minimum | Recommended | Notes |
+|--------|---------|-------------|-------|
+| SFT | 1K | 10K-100K | Quality > Quantity |
+| GRPO | 2K prompts | 5K-50K prompts | x num_generations per prompt |
+| DPO | 5K pairs | 10K-100K pairs | Need clear quality gap |
+| PT | 100M tokens | 1B+ tokens | Use streaming mode |
+
+### Search Strategy
+
+When user's domain has no obvious dataset:
+1. Search ModelScope: `modelscope hub search --type dataset --query "{domain}"`
+2. Search HuggingFace: look for `{domain}-instruct` or `{domain}-qa` datasets
+3. Consider synthetic generation: use a strong model to generate training data
+4. Consider data mixing: combine domain data with general instruction data (80/20 ratio)
+
+## Step 3: Model Selection
+
+### Scale-Resource Matching
+
+| GPU Count | GPU Memory | Recommended Model Size | Examples |
+|-----------|-----------|----------------------|----------|
+| 1-2 | 40-80GB | 1.5B-7B | Qwen3.5-4B, Qwen3.5-7B |
+| 4 | 80GB | 7B-14B | Qwen3.5-14B |
+| 8 | 80GB | 14B-32B | Qwen3.5-32B |
+| 16+ | 80GB | 32B-72B | Qwen3.5-72B (TP+DP) |
+
+### Model Family Recommendations
+
+- **General purpose**: Qwen3.5 series (best balance of quality and efficiency)
+- **Reasoning-focused**: DeepSeek-V4, Qwen3.5 with thinking enabled
+- **Multimodal**: Qwen3.5-VL, Gemma4
+- **Code**: DeepSeek-Coder-V4, Qwen3.5-Coder
+
+### LoRA vs Full Fine-Tuning
+
+- **LoRA** (default): rank 8-64, all-linear or selective modules. Use when:
+  - Limited GPU memory
+  - Want to preserve base capabilities
+  - Quick iteration needed
+- **Full FT**: Use when:
+  - Sufficient GPU resources (8+ GPUs)
+  - Fundamental capability change needed
+  - Pre-training continuation
+
+## Step 4: Training Method Selection
+
+### Decision Tree
+
+```
+Has labeled input-output pairs?
+├── YES → SFT
+│         └── Want to further improve? → Add GRPO/DPO stage
+├── Has preference pairs (chosen/rejected)?
+│   └── YES → DPO (offline) or SimPO (margin-based)
+├── Has reward signal (verifiable)?
+│   ├── Single-turn → GRPO
+│   └── Multi-turn with environment → MultiTurn GRPO
+├── Has teacher model?
+│   └── YES → GKD (on-policy or off-policy)
+└── Large unlabeled corpus?
+    └── YES → PT (pre-training continuation)
+```
+
+### Method Comparison
+
+| Method | Data Needs | Compute Cost | Stability | Best For |
+|--------|-----------|--------------|-----------|----------|
+| SFT | Labeled pairs | Low | High | Initial capability |
+| GRPO | Prompts + reward fn | High (sampling) | Medium | Reasoning, code |
+| DPO | Preference pairs | Medium | High | Alignment |
+| GKD | Teacher model | Medium | High | Distillation |
+| PT | Raw text | Very High | High | Domain adaptation |
+
+## Step 5: Hyperparameter Configuration
+
+### SFT Defaults
+
+```yaml
+learning_rate: 1e-5 ~ 5e-5
+batch_size: 4-16 (per GPU)
+gradient_accumulation: 1-4
+epochs: 2-5 (or max_steps: 500-5000)
+lora_rank: 8-32
+max_length: 2048-8192
+warmup_steps: 10% of total
+scheduler: CosineAnnealingLR
+```
+
+### GRPO Defaults
+
+```yaml
+learning_rate: 1e-6 ~ 2e-5
+batch_size: 4-8 (prompts per step)
+num_generations: 4-16 (per prompt)
+epsilon: 0.1-0.3
+max_steps: 200-2000
+mini_batch_size: 8-32
+micro_batch_size: 2-4
+advantage_scale: 'group'
+max_new_tokens: 1024-4096
+temperature: 1.0
+```
+
+### DPO Defaults
+
+```yaml
+learning_rate: 5e-7 ~ 5e-6
+beta: 0.1 (KL penalty weight)
+batch_size: 4-8 (pairs per step)
+max_steps: 500-3000
+```
+
+### Tuning Tips
+
+- If loss is NaN: reduce lr by 10x, enable gradient clipping (max_grad_norm=1.0)
+- If reward plateaus: increase num_generations, try different reward combination
+- If OOM: reduce micro_batch_size, enable gradient_checkpointing, reduce max_length
+- If training too slow: increase batch_size, reduce num_generations
+
+## Step 6: Multi-Stage Pipeline Design
+
+### Common Pipelines
+
+**Pipeline A: Reasoning Enhancement**
+1. Data cleaning (filter low-quality, deduplicate)
+2. SFT warm-up (1-2 epochs on curated data)
+3. GRPO training (with verifiable reward)
+
+**Pipeline B: General Alignment**
+1. PT continuation (optional, domain corpus)
+2. SFT instruction tuning (diverse instructions)
+3. DPO/SimPO preference optimization
+
+**Pipeline C: Distillation + Self-Improvement**
+1. GKD from teacher model (on-policy)
+2. Self-play GRPO (student generates, reward judges)
+
+### Stage Transitions
+
+- Between stages: always save checkpoint, evaluate on held-out set
+- If performance drops after a stage: reduce lr for next stage, shorter training
+- Resume from best checkpoint of previous stage
+
+## Step 7: Data Cleaning & Transformation
+
+### Standard Pipeline
+
+1. **Format Normalization**:
+   ```python
+   # Convert to Twinkle messages format
+   Trajectory(messages=[
+       Message(role='system', content='...'),
+       Message(role='user', content='...'),
+       Message(role='assistant', content='...'),
+   ])
+   ```
+
+2. **Quality Filtering**:
+   - Remove samples shorter than 10 tokens or longer than max_length
+   - Remove samples with encoding errors
+   - Remove samples in wrong language (if monolingual training)
+
+3. **Deduplication**:
+   - Exact dedup on content hash
+   - Near-dedup with MinHash (Jaccard > 0.8 = duplicate)
+
+4. **Difficulty Grading** (optional):
+   - By response length (proxy for complexity)
+   - By model perplexity
+   - Curriculum: train easy-to-hard
+
+5. **Output Format**:
+   ```python
+   # Final format for Twinkle Dataset
+   [
+       {"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]},
+       ...
+   ]
+   # Save as JSONL for loading via DatasetMeta(dataset_id='path/to/data.jsonl')
+   ```
+
+### Using twinkle_agentic Preprocessors
+
+For advanced filtering, use the built-in preprocessors:
+
+```python
+from twinkle_agentic.preprocessor import (
+    MessageNormalizer,       # Standardize message format
+    DeadLoopFilter,         # Remove repetitive/stuck conversations
+    DedupFilter,            # Deduplication
+    HardFilter,             # Length/format hard constraints
+    RefuseFilter,           # Remove refusal-heavy samples
+    ScoreFilter,            # Quality scoring with LLM
+)
+```
+
+## Output: Experiment Folder
+
+After analysis, generate the experiment folder structure:
+
+```
+experiments/{exp_name}/
+├── plan.md              # This analysis documented
+├── config.yaml          # All hyperparameters
+├── train.py             # Twinkle training script
+├── train.sh             # Launch command
+├── data_prep.py         # Data cleaning script (if needed)
+├── eval.py              # Evaluation script
+└── README.md            # Quick summary for collaborators
+```
diff --git a/src/twinkle_client/skills/bundled/twinkle-training.md b/src/twinkle_client/skills/bundled/twinkle-training.md
new file mode 100644
index 000000000..a9a1ac1e7
--- /dev/null
+++ b/src/twinkle_client/skills/bundled/twinkle-training.md
@@ -0,0 +1,966 @@
+# Twinkle Training Script Skill
+
+You are an expert at writing training scripts for the Twinkle framework.
+
+## CRITICAL RULES
+
+1. **Model/Dataset names MUST use full org/name format**: `Qwen/Qwen3.5-4B`, NOT `Qwen3.5-4B`
+2. **Name resolution workflow** (MUST follow when user gives a model or dataset name):
+   - If user says "Qwen3.5-4B" or any short/ambiguous name → call `search_models(query='Qwen3.5-4B')` to get full ID
+   - If user says "gsm8k" → call `search_datasets(query='gsm8k')` to get full ID like `modelscope/gsm8k`
+   - If server is running → also call `list_supported_models()` to verify the model is deployed
+   - **Never guess model/dataset full names** — always search to confirm
+3. **Scripts MUST use Server Mode** (`twinkle_client` for model + `twinkle` for data)
+4. **DO NOT modify the Twinkle SDK** (`src/twinkle/` or `src/twinkle_client/`)
+5. **Every script MUST register graceful shutdown** via `rt.register_graceful_shutdown(model, dataloader)`
+6. **All imports MUST be explicit** — never use a class/function without importing it first
+7. **`batch_size` constraints** — **Always set `drop_last=True`** in DataLoader.
+   - **All modes**: `batch_size >= model_dp` (number of model data-parallel GPUs). Call `list_supported_models()` to get GPU count.
+   - **GRPO**: `batch_size >= sampler_dp` (sampler's data-parallel worker count = `sampler_gpus / tp`). The sampler dispatches input batch across `dp` workers, each worker must get at least 1 item. This is often the tighter constraint (e.g., 6 sampler GPUs with tp=1 → need batch_size >= 6).
+8. **`rt.start()` MUST be called BEFORE `model.add_adapter_to_model()`** — `add_adapter_to_model` triggers NCCL init (60-120s) and no logs appear until `rt.start()` runs
+9. **`metric.result` values are auto-converted** inside `rt.log_metrics()` — no manual `float()` needed
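+10. **NEVER use float format specifiers** (like `:.4f`, `:.2e`) on metric values in `print()` — raw `metric.result` values may be strings (the auto-conversion in rule 9 happens only inside `rt.log_metrics()`). Print them unformatted, e.g. `{loss}`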
+11. **NEVER access internal fields** of model/optimizer/scheduler objects (e.g. `model.optimizer.param_groups[0]['lr']`). Training runs on a remote Ray cluster — only public API methods are available. Use `model.calculate_metric()` to get metrics like loss/lr
+12. **Log ALL available metrics** via `rt.log_metrics(step=step, total_steps=MAX_STEPS, **metric.result)`. NEVER cherry-pick only `loss` — always pass the full `metric.result` dict. Different training types produce different metrics:
+    - **SFT**: loss, grad_norm, lr
+    - **GRPO**: loss, reward, reward_std, kl, entropy, grad_norm, lr
+    - **DPO**: loss, chosen_reward, rejected_reward, reward_margin, grad_norm, lr
+    - Use `**metric.result` to capture all of them automatically
+13. **Every script MUST include resume logic** after DataLoader creation. This enables seamless continuation when the script is auto-fixed and restarted:
+    ```python
+    resume = rt.get_resume_info()
+    global_step = resume['last_step']
+    if global_step > 0:
+        dataloader.skip_consumed_samples(global_step * BATCH_SIZE)
+        print(f'[twinkle] Resuming from step {global_step}')
+    ```
+
+## Pre-Training Planning
+
+> **Cloud shortcut:** If using `base_url='http://www.modelscope.cn/twinkle'`, skip hardware planning — cloud handles it.
+
+### Resource Assessment
+
+```bash
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+ray status  # if Ray running
+```
+
+### VRAM Quick Rules
+
+- **LoRA training**: model_weights_bf16 + ~20% overhead (7B→~17GB)
+- **Full FT**: model_weights × 4 (7B→~56GB)
+- **vLLM sampler**: model_weights + KV cache
+
+| Model | bf16 VRAM | LoRA (1 GPU) | Min GPU |
+|-------|-----------|-------------|---------|
+| Qwen3.5-4B | 8 GB | ~10 GB | 1× A10 |
+| Qwen3.5-7B | 14 GB | ~17 GB | 1× A10 |
+| Qwen3.5-14B | 28 GB | ~34 GB | 1× A100 |
+| Qwen3.5-32B | 64 GB | ~77 GB | 1× A100 |
+
+### GPU Split (Server Mode)
+
+```
+1 GPU  → model only, SFT/DPO
+2 GPUs → 1 model + 1 sampler (GRPO)
+4 GPUs → 1-2 model + 2-3 sampler
+8 GPUs → 2 model + 6 sampler (or 8 dp for SFT)
+Large models: 2× TP for 32B, 4× TP for 72B
+```
+
+---
+
+## Core API Reference
+
+### 1. Initialization
+
+```python
+from twinkle import init_twinkle_client
+
+# Server Mode (primary — self-hosted)
+client = init_twinkle_client(base_url='http://localhost:8000', api_key='EMPTY_API_KEY')
+
+# Cloud Mode (ModelScope hosted)
+import os
+client = init_twinkle_client(
+    base_url='http://www.modelscope.cn/twinkle',
+    api_key=os.environ['MODELSCOPE_TOKEN']
+)
+
+# Check available models
+caps = client.get_server_capabilities()
+for m in caps.supported_models:
+    print(f'- {m.model_name}')
+```
+
+**Parameters:**
+- `base_url`: Server URL (fallback: `TWINKLE_SERVER_URL` env var)
+- `api_key`: Auth token (fallback: `TWINKLE_SERVER_TOKEN` env var)
+- `session_heartbeat_interval`: Seconds between heartbeats (default: 10)
+
+### 2. Dataset & DatasetMeta
+
+```python
+from twinkle.dataset import Dataset, DatasetMeta, LazyDataset
+```
+
+**DatasetMeta** — describes a data source:
+```python
+DatasetMeta(
+    dataset_id='ms://modelscope/gsm8k',  # ModelScope/HF ID or local path
+    subset_name='main',                    # subset (default: 'default')
+    split='train',                         # split (default: 'train')
+    data_slice=range(5000),                # pick first N samples (optional)
+)
+```
+
+**In-memory data** (no external dataset):
+```python
+DatasetMeta(data=[
+    {'messages': [{'role': 'user', 'content': 'Hi'}, {'role': 'assistant', 'content': 'Hello!'}]},
+    ...
+])
+```
+
+**Dataset** — load, preprocess, encode:
+```python
+dataset = Dataset(DatasetMeta('ms://modelscope/gsm8k', subset_name='main', split='train', data_slice=range(5000)))
+dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=8192)
+dataset.map(GSM8KProcessor(system='Solve the math problem.'))
+dataset.encode(add_generation_prompt=True)  # True=for sampling, False=for training labels
+```
+
+**LazyDataset** — defers map/encode to `__getitem__` (for multimodal / large datasets):
+```python
+dataset = LazyDataset(DatasetMeta('ms://AI-ModelScope/LaTeX_OCR', data_slice=range(500)))
+dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512)
+dataset.map(LatexOCRProcessor)
+dataset.encode(batched=True)
+```
+
+**Key Dataset methods:**
+| Method | Description |
+|--------|-------------|
+| `set_template(name, model_id=..., max_length=...)` | Set chat template for encoding |
+| `map(processor, init_args={...})` | Apply preprocessor (class, instance, or string name) |
+| `encode(add_generation_prompt=False)` | Tokenize into InputFeature |
+| `filter(filter_func)` | Filter rows |
+| `add_dataset(DatasetMeta(...))` | Add another dataset |
+| `mix_dataset(interleave=True)` | Combine added datasets |
+
+### 3. DataLoader
+
+```python
+from twinkle.dataloader import DataLoader
+
+dataloader = DataLoader(dataset=dataset, batch_size=8, num_workers=0, drop_last=True)
+```
+
+**Parameters:**
+- `dataset`: Dataset or LazyDataset instance
+- `batch_size`: Samples per batch
+- `min_batch_size`: Minimum batch size (optional)
+- `num_workers`: DataLoader workers (default: 2; use 0 for debugging)
+
+**Checkpoint/Resume:**
+```python
+# Resume
+dataloader.resume_from_checkpoint(consumed_train_samples=progress['consumed_train_samples'])
+
+# Save state
+state = dataloader.get_state()  # → {'consumed_train_samples': int}
+```
+
+### 4. MultiLoraTransformersModel
+
+```python
+from twinkle_client.model import MultiLoraTransformersModel
+from peft import LoraConfig
+
+model = MultiLoraTransformersModel(model_id='ms://Qwen/Qwen3.5-4B')
+```
+
+**Setup methods (call in order):**
+```python
+# 1. Add LoRA adapter
+lora_config = LoraConfig(
+    target_modules='all-linear',
+    r=8,
+    lora_alpha=32,
+    lora_dropout=0.05,
+)
+model.add_adapter_to_model(
+    'default',                          # adapter_name (unique per experiment)
+    lora_config,
+    gradient_accumulation_steps=2,      # effective_batch = batch_size × grad_accum
+    # NOTE: Do NOT pass save_dir — the server manages checkpoint paths automatically
+)
+
+# 2. Set template (same as dataset)
+model.set_template('Qwen3_5Template')
+
+# 3. Set input processor
+model.set_processor('InputProcessor', padding_side='right')
+
+# 4. Set loss function
+model.set_loss('CrossEntropyLoss')  # or 'GRPOLoss', 'DPOLoss', 'GKDLoss'
+
+# 5. Set optimizer (only Adam supported for Megatron backend)
+model.set_optimizer('Adam', lr=1e-4)
+
+# 6. Set LR scheduler (optional, NOT supported for Megatron backend)
+model.set_lr_scheduler('CosineAnnealingLR', T_max=100, eta_min=0)
+# model.set_lr_scheduler('CosineWarmupScheduler', num_warmup_steps=50, num_training_steps=1000)
+```
+
+**Training loop methods:**
+| Method | Description |
+|--------|-------------|
+| `forward_backward(inputs, **kwargs)` | Forward + backward in one call |
+| `forward_only(inputs, disable_lora=False)` | Forward without grad (for ref model in DPO) |
+| `clip_grad_and_step()` | Clip grad → optimizer step → zero_grad → lr_step (all-in-one) |
+| `clip_grad_norm(max_grad_norm=1.0)` | Only clip gradients |
+| `step()` | Only optimizer step |
+| `zero_grad()` | Only zero gradients |
+| `lr_step()` | Only LR scheduler step |
+| `calculate_metric(is_training=True)` | Get metrics (the returned object exposes them as a `.result` dict) |
+| `add_metric('DPOMetric', beta=0.1)` | Register additional metric |
+
+**Save/Load/Upload:**
+```python
+# Save checkpoint (returns SaveResponse with .twinkle_path)
+result = model.save(
+    name='my-checkpoint',
+    save_optimizer=True,
+    consumed_train_samples=dataloader.get_state()['consumed_train_samples'],
+    is_sampler=False,  # True = sampler-only checkpoint (deletes old sampler saves)
+)
+
+# Resume from checkpoint
+progress = model.resume_from_checkpoint(result.twinkle_path)
+# progress → {'cur_step': int, 'consumed_train_samples': int}
+dataloader.resume_from_checkpoint(progress['consumed_train_samples'])
+start_step = progress['cur_step']
+
+# Upload to ModelScope Hub
+model.upload_to_hub(
+    checkpoint_dir=result.twinkle_path,
+    hub_model_id='your_username/model-name',
+    hub_token=None,  # uses server default if None
+)
+```
+
+### 5. vLLMSampler
+
+```python
+from twinkle_client.sampler import vLLMSampler
+
+sampler = vLLMSampler(model_id='ms://Qwen/Qwen3.5-4B')
+sampler.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
+```
+
+**Sampling:**
+```python
+sampling_params = {
+    'max_tokens': 1024,
+    'temperature': 1.0,
+    'top_p': 0.95,
+    'num_samples': 4,    # completions per prompt
+    'logprobs': 1,       # return log probabilities
+}
+
+# Sync weights from training model
+result = model.save(name='sampler-weights', save_optimizer=False, is_sampler=True)
+
+# Sample
+responses = sampler.sample(
+    inputs=batch,                        # List[Trajectory] or List[InputFeature]
+    sampling_params=sampling_params,
+    adapter_uri=result.twinkle_path,     # use latest trained weights
+)
+
+# Parse responses
+for response in responses:
+    for seq in response.sequences:
+        seq.new_input_feature  # Dict: full trajectory as InputFeature (for training)
+        seq.tokens             # List[int]: generated token ids
+        seq.logprobs           # List[List[Tuple[int, float]]]: [(token_id, logp), ...]
+        seq.stop_reason        # str: 'stop' or 'length'
+```
+
+### 6. Preprocessors
+
+```python
+from twinkle.preprocessor import GSM8KProcessor, SelfCognitionProcessor, EmojiDPOProcessor
+from twinkle.preprocessor import Preprocessor  # base class for custom
+```
+
+| Preprocessor | Usage | Init Args |
+|---|---|---|
+| `GSM8KProcessor` | Math QA → Trajectory | `system=None, add_assistant=False` |
+| `SelfCognitionProcessor` | Self-cognition SFT | `model_name='twinkle robot', model_author='twinkle lab'` |
+| `EmojiDPOProcessor` | DPO preference pairs | `system=None, chosen_key='answer_zh', rejected_key='answer_en', prompt_key='prompt'` |
+
+**Using preprocessors:**
+```python
+# By instance (with args)
+dataset.map(GSM8KProcessor(system='Solve step by step.'))
+dataset.map(SelfCognitionProcessor(model_name='My Bot', model_author='Me'))
+
+# By string name + init_args (for cloud mode / serialization)
+dataset.map('SelfCognitionProcessor', init_args={'model_name': 'My Bot', 'model_author': 'Me'})
+
+# By class reference
+dataset.map(EmojiDPOProcessor, init_args={'system': 'You are helpful.'})
+```
+
+**Custom preprocessor:**
+```python
+from twinkle.preprocessor import Preprocessor
+from twinkle.data_format import Trajectory, Message
+
+class MyProcessor(Preprocessor):
+    def __call__(self, rows):
+        rows = self.map_col_to_row(rows)
+        rows = [self.preprocess(row) for row in rows]
+        rows = self.map_row_to_col(rows)
+        return rows
+
+    def preprocess(self, row) -> Trajectory:
+        return Trajectory(messages=[
+            Message(role='user', content=row['question']),
+            Message(role='assistant', content=row['answer']),
+        ])
+```
+
+### 7. Loss Functions
+
+```python
+model.set_loss('CrossEntropyLoss')
+model.set_loss('GRPOLoss', epsilon=0.2, beta=0.0)
+model.set_loss('DPOLoss', beta=0.1, loss_type='sigmoid', reference_free=False, sft_weight=1.0)
+model.set_loss('GKDLoss', beta=0.5, temperature=1.0)
+```
+
+| Loss | Use Case | Key Params |
+|------|----------|------------|
+| `CrossEntropyLoss` | SFT | `ignore_index=-100, dft=False` |
+| `GRPOLoss` | GRPO/PPO RL | `epsilon=0.2, beta=0.0 (KL), entropy_coef=0.0` |
+| `DPOLoss` | DPO preference | `beta=0.1, loss_type='sigmoid'/'hinge'/'ipo'/'kto_pair', sft_weight=0.0` |
+| `GKDLoss` | Knowledge distillation | `beta=0.5 (JSD mix), temperature=1.0, chunk_size=512` |
+
+### 8. Rewards & Advantages
+
+```python
+from twinkle.reward import GSM8KAccuracyReward
+from twinkle.reward.base import Reward
+from twinkle.advantage import GRPOAdvantage
+```
+
+**Built-in rewards:**
+```python
+reward_fn = GSM8KAccuracyReward()
+rewards = reward_fn(trajectories)  # → List[float] (1.0=correct, 0.0=wrong)
+```
+
+**Custom reward (MUST subclass Reward):**
+```python
+class MyReward(Reward):
+    def __call__(self, trajectories, **kwargs) -> List[float]:
+        rewards = []
+        for traj in trajectories:
+            messages = traj.get('messages', [])
+            completion = ''
+            for msg in reversed(messages):
+                if msg.get('role') == 'assistant':
+                    completion = msg.get('content', '')
+                    break
+            # Your scoring logic here — it must assign a float to `score`
+            rewards.append(score)
+        return rewards
+```
+
+**Advantage computation:**
+```python
+advantage_fn = GRPOAdvantage()
+advantages = advantage_fn(
+    rewards,                    # List[float] or Tensor
+    num_generations=4,          # samples per prompt
+    scale='group',              # 'group'=per-prompt, 'batch'=global, 'none'=no normalization
+).tolist()
+```
+
+### 9. Metrics
+
+```python
+from twinkle.metric import CompletionRewardMetric, DPOMetric
+```
+
+**CompletionRewardMetric** (for GRPO):
+```python
+metrics = CompletionRewardMetric()
+metrics.accumulate(
+    completion_lengths=all_completion_lengths,
+    rewards={'total': total_rewards, 'accuracy': acc_rewards},
+)
+log_dict = metrics.calculate()  # → {'train/total_reward': ..., 'train/completion_length': ...}
+metrics.reset()
+```
+
+**DPOMetric** (for DPO — added to model):
+```python
+model.add_metric('DPOMetric', beta=0.1)
+# Then after forward_backward:
+metric = model.calculate_metric(is_training=True)
+# metric.result → {'logps/chosen': ..., 'rewards/margins': ..., 'rewards/accuracies': ...}
+```
+
+### 10. TrainingRuntime (Observability)
+
+```python
+from twinkle_client.tui.runtime import TrainingRuntime
+
+rt = TrainingRuntime()  # auto-reads TWINKLE_RUN_ID env var (set by TUI launcher)
+# IMPORTANT: call rt.start() BEFORE model.add_adapter_to_model() so TUI can show logs immediately.
+# add_adapter_to_model triggers NCCL init across all GPUs which can take 60-120s.
+rt.start(model_id='Qwen/Qwen3.5-4B', config={'lr': 1e-4}, script_path=__file__)
+rt.register_graceful_shutdown(model, dataloader)  # MUST register — call once model and dataloader exist
+
+# Resume logic — MUST be after dataloader creation, before training loop:
+resume = rt.get_resume_info()
+global_step = resume['last_step']
+if global_step > 0:
+    dataloader.skip_consumed_samples(global_step * BATCH_SIZE)
+    print(f'[twinkle] Resuming from step {global_step}')
+
+# In training loop — use print() for logs (stdout goes to output.log, shown in TUI):
+metric = model.calculate_metric(is_training=True)
+rt.log_metrics(step=step, total_steps=MAX_STEPS, **metric.result)
+print(f'[Step {step}/{MAX_STEPS}] {metric.result}')
+
+# When done:
+rt.finish(status='completed')
+```
+
+### 11. Data Types
+
+```python
+from twinkle.data_format import Trajectory, Message, InputFeature
+```
+
+**Trajectory** (conversation format — used as input to dataset/sampler):
+```python
+Trajectory(
+    messages=[
+        Message(role='system', content='You are helpful.'),
+        Message(role='user', content='What is 2+2?'),
+        Message(role='assistant', content='4'),
+    ],
+    images=[...],   # optional: for multimodal
+    videos=[...],   # optional
+)
+```
+
+**Message fields:** `role` ('system'/'user'/'assistant'/'tool'), `content` (str), `tool_calls`, `reasoning_content`
+
+**InputFeature** (tokenized — output of encode):
+```python
+InputFeature(
+    input_ids=[...],        # token ids
+    attention_mask=[...],   # 0/1 mask
+    labels=[...],           # -100 for ignored positions
+    completion_mask=[...],  # for RL: which tokens to optimize
+    length=512,
+)
+```
+
+---
+
+## Complete Training Examples
+
+### Example 1: SFT (Self-Cognition Fine-Tuning)
+
+```python
+import os
+from peft import LoraConfig
+from twinkle import init_twinkle_client
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.dataloader import DataLoader
+from twinkle.preprocessor import SelfCognitionProcessor
+from twinkle_client.model import MultiLoraTransformersModel
+from twinkle_client.tui.runtime import TrainingRuntime
+
+MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
+MAX_STEPS = 50
+
+# 1. Init client
+client = init_twinkle_client(base_url='http://localhost:8000', api_key='EMPTY_API_KEY')
+
+# 2. Runtime (MUST be before model setup — add_adapter_to_model takes 60-120s for NCCL init)
+rt = TrainingRuntime()
+rt.start(model_id='Qwen/Qwen3.5-4B', config={'lr': 1e-4, 'batch_size': 8}, script_path=__file__)
+
+# 3. Prepare dataset
+dataset = Dataset(DatasetMeta('ms://swift/self-cognition', data_slice=range(500)))
+dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=512)
+dataset.map(SelfCognitionProcessor(model_name='Twinkle助手', model_author='ModelScope'))
+dataset.encode(batched=True)
+dataloader = DataLoader(dataset=dataset, batch_size=8, num_workers=0, drop_last=True)
+
+# 4. Configure model
+model = MultiLoraTransformersModel(model_id=MODEL_ID)
+model.add_adapter_to_model('default', LoraConfig(target_modules='all-linear'), gradient_accumulation_steps=2)
+model.set_template('Qwen3_5Template')
+model.set_processor('InputProcessor', padding_side='right')
+model.set_loss('CrossEntropyLoss')
+model.set_optimizer('Adam', lr=1e-4)
+rt.register_graceful_shutdown(model, dataloader)
+
+# 5. Resume logic (enables seamless restart after auto-fix)
+resume = rt.get_resume_info()
+global_step = resume['last_step']
+if global_step > 0:
+    dataloader.skip_consumed_samples(global_step * 8)
+    print(f'[twinkle] Resuming from step {global_step}')
+
+# 6. Training loop
+for epoch in range(3):
+    for batch in dataloader:
+        model.forward_backward(inputs=batch)
+        model.clip_grad_and_step()
+        global_step += 1
+
+        if global_step % 2 == 0:
+            metric = model.calculate_metric(is_training=True)
+            rt.log_metrics(step=global_step, total_steps=MAX_STEPS, **metric.result)
+            print(f'[Step {global_step}/{MAX_STEPS}] {metric.result}')
+
+        if global_step >= MAX_STEPS:
+            break
+
+    if global_step >= MAX_STEPS:
+        break
+
+    # Save per epoch (not reached for the epoch in which MAX_STEPS triggers the break above)
+    result = model.save(
+        name=f'sft-epoch-{epoch}',
+        save_optimizer=True,
+        consumed_train_samples=dataloader.get_state()['consumed_train_samples'],
+    )
+    print(f'Saved checkpoint: {result.twinkle_path}')
+
+rt.finish(status='completed')
+```
+
+### Example 2: GRPO (Reinforcement Learning)
+
+```python
+import gc
+from typing import List, Dict, Any
+from peft import LoraConfig
+from twinkle import init_twinkle_client
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.dataloader import DataLoader
+from twinkle.preprocessor import GSM8KProcessor
+from twinkle.reward import GSM8KAccuracyReward
+from twinkle.advantage import GRPOAdvantage
+from twinkle.metric import CompletionRewardMetric
+from twinkle_client.model import MultiLoraTransformersModel
+from twinkle_client.sampler import vLLMSampler
+from twinkle_client.tui.runtime import TrainingRuntime
+
+MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
+NUM_GENERATIONS = 4
+MAX_STEPS = 100
+BATCH_SIZE = 8   # MUST be >= sampler_dp (sampler workers) AND >= model_dp
+LEARNING_RATE = 2e-5
+
+# 1. Init client
+client = init_twinkle_client(base_url='http://127.0.0.1:8000', api_key='EMPTY_API_KEY')
+
+# 2. Runtime (before model setup)
+rt = TrainingRuntime()
+rt.start(model_id='Qwen/Qwen3.5-4B', config={'lr': LEARNING_RATE, 'method': 'GRPO'}, script_path=__file__)
+
+# 3. Prepare dataset (encode with generation prompt for sampling)
+dataset = Dataset(DatasetMeta('ms://modelscope/gsm8k', subset_name='main', split='train', data_slice=range(2000)))
+dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=2048, enable_thinking=False)
+dataset.map(GSM8KProcessor(system='Solve the math problem and put answer in \\boxed{}.'))
+dataset.encode(add_generation_prompt=True)
+dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, num_workers=0, drop_last=True)
+
+# 4. Configure model with GRPOLoss
+model = MultiLoraTransformersModel(model_id=MODEL_ID)
+model.add_adapter_to_model('default', LoraConfig(target_modules='all-linear', r=8, lora_alpha=32, lora_dropout=0.05),
+                           gradient_accumulation_steps=1)
+model.set_loss('GRPOLoss', epsilon=0.2, beta=0.0)
+model.set_optimizer('Adam', lr=LEARNING_RATE)
+model.set_processor('InputProcessor')
+model.set_template('Qwen3_5Template', model_id=MODEL_ID)
+rt.register_graceful_shutdown(model, dataloader)
+
+# 5. Configure sampler
+sampler = vLLMSampler(model_id=MODEL_ID)
+sampler.set_template('Qwen3_5Template', model_id=MODEL_ID)
+
+# 6. Setup
+advantage_fn = GRPOAdvantage()
+reward_fn = GSM8KAccuracyReward()
+metrics = CompletionRewardMetric()
+sampling_params = {'max_tokens': 1024, 'temperature': 1.0, 'top_p': 0.95, 'num_samples': NUM_GENERATIONS, 'logprobs': 1}
+current_adapter_uri = None
+
+# 7. Training loop
+step = 0
+for batch in dataloader:
+    if step >= MAX_STEPS:
+        break
+    metrics.reset()
+
+    # 7a. Sync weights to sampler
+    result = model.save(name='grpo-sampler-weights', save_optimizer=False, is_sampler=True)
+    current_adapter_uri = result.twinkle_path
+
+    # 7b. Sample completions
+    responses = sampler.sample(inputs=batch, sampling_params=sampling_params, adapter_uri=current_adapter_uri)
+
+    all_inputs: List[Dict[str, Any]] = []
+    all_old_logps: List[List[float]] = []
+    all_completion_lengths: List[int] = []
+
+    for response in responses:
+        for seq in response.sequences:
+            all_inputs.append(seq.new_input_feature)
+            all_old_logps.append([lp[0][1] for lp in seq.logprobs])
+            all_completion_lengths.append(len(seq.tokens))
+
+    # 7c. Compute rewards
+    rewards = reward_fn(all_inputs)
+    metrics.accumulate(completion_lengths=all_completion_lengths, rewards={'accuracy': rewards})
+
+    # 7d. Compute advantages
+    advantages = advantage_fn(rewards, num_generations=NUM_GENERATIONS, scale='group').tolist()
+
+    # Skip if all advantages are zero (no learning signal)
+    if all(abs(a) < 1e-8 for a in advantages):
+        step += 1
+        continue
+
+    # 7e. Train
+    model.forward_backward(inputs=all_inputs, advantages=advantages, old_logps=all_old_logps)
+    model.clip_grad_and_step()
+    gc.collect()
+
+    # 7f. Log
+    log_dict = metrics.calculate()
+    log_dict.update(model.calculate_metric(is_training=True).result)
+    rt.log_metrics(step=step, total_steps=MAX_STEPS, **log_dict)
+    print(f'[Step {step}/{MAX_STEPS}] {log_dict}')
+    step += 1
+
+# Save final
+model.save(name='grpo-final', save_optimizer=True)
+rt.finish(status='completed')
+```
+
+### Example 3: DPO (Preference Optimization)
+
+```python
+import numpy as np
+import torch
+from typing import Any, Dict, List
+from peft import LoraConfig
+from twinkle import init_twinkle_client
+from twinkle.dataset import Dataset, DatasetMeta
+from twinkle.dataloader import DataLoader
+from twinkle.preprocessor import EmojiDPOProcessor
+from twinkle_client.model import MultiLoraTransformersModel
+from twinkle_client.tui.runtime import TrainingRuntime
+
+MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
+DPO_BETA = 0.1
+LEARNING_RATE = 1e-4
+
+# 1. Init
+client = init_twinkle_client(base_url='http://localhost:8000', api_key='EMPTY_API_KEY')
+
+# 2. Runtime (before model setup)
+rt = TrainingRuntime()
+rt.start(model_id='Qwen/Qwen3.5-4B', config={'method': 'DPO', 'beta': DPO_BETA}, script_path=__file__)
+
+# 3. Prepare DPO dataset
+dataset = Dataset(DatasetMeta('ms://hjh0119/shareAI-Llama3-DPO-zh-en-emoji', data_slice=range(100)))
+dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=2048)
+dataset.map(EmojiDPOProcessor, init_args={'system': 'You are a helpful assistant.'})
+dataset.encode()  # DPO: no add_generation_prompt
+dataloader = DataLoader(dataset=dataset, batch_size=8, num_workers=0, drop_last=True)
+
+# 4. Configure model with DPO loss
+model = MultiLoraTransformersModel(model_id=MODEL_ID)
+model.add_adapter_to_model('default', LoraConfig(target_modules='all-linear', r=8, lora_alpha=32, lora_dropout=0.05),
+                           gradient_accumulation_steps=2)
+model.set_template('Qwen3_5Template')
+model.set_processor('InputProcessor', padding_side='right')
+model.set_loss('DPOLoss', beta=DPO_BETA, loss_type='sigmoid', reference_free=False, sft_weight=1.0)
+model.add_metric('DPOMetric', beta=DPO_BETA)
+model.set_optimizer('Adam', lr=LEARNING_RATE)
+rt.register_graceful_shutdown(model, dataloader)
+
+
+def prepare_dpo_batch(batch: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Interleave positive/negative for DP-safe training: [pos1, neg1, pos2, neg2, ...]"""
+    result = []
+    for row in batch:
+        base_fields = {k: v for k, v in row.items() if k not in ('positive', 'negative')}
+        result.append({**base_fields, **row['positive']})
+        result.append({**base_fields, **row['negative']})
+    return result
+
+
+# 5. Training loop
+max_steps = len(dataloader)
+for step, batch in enumerate(dataloader):
+    # Convert numpy/torch tensors for serialization
+    for row in batch:
+        for key in row:
+            if isinstance(row[key], np.ndarray):
+                row[key] = row[key].tolist()
+            elif isinstance(row[key], torch.Tensor):
+                row[key] = row[key].cpu().numpy().tolist()
+
+    dpo_batch = prepare_dpo_batch(batch)
+
+    # Get reference logps from base model (disable LoRA)
+    ref_outputs = model.forward_only(inputs=dpo_batch, disable_lora=True)
+
+    # Train with DPO loss
+    model.forward_backward(inputs=dpo_batch, ref_outputs=ref_outputs.result)
+    model.clip_grad_and_step()
+
+    if step % 2 == 0:
+        metric = model.calculate_metric(is_training=True)
+        rt.log_metrics(step=step, total_steps=max_steps, **metric.result)
+        print(f'[Step {step}/{max_steps}] {metric.result}')
+
+result = model.save(name='dpo-final', save_optimizer=True)
+rt.finish(status='completed')
+```
+
+### Example 4: Multimodal SFT (Image Understanding)
+
+```python
+import numpy as np
+import torch
+from peft import LoraConfig
+from twinkle import init_twinkle_client
+from twinkle.dataset import LazyDataset, DatasetMeta
+from twinkle.dataloader import DataLoader
+from twinkle.preprocessor import Preprocessor
+from twinkle.data_format import Trajectory, Message
+from twinkle_client.model import MultiLoraTransformersModel
+from twinkle_client.tui.runtime import TrainingRuntime
+
+MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
+
+
+class LatexOCRProcessor(Preprocessor):
+    """Custom preprocessor for LaTeX OCR dataset."""
+    def __call__(self, rows):
+        rows = self.map_col_to_row(rows)
+        rows = [self.preprocess(row) for row in rows]
+        rows = self.map_row_to_col(rows)
+        return rows
+
+    def preprocess(self, row) -> Trajectory:
+        return Trajectory(messages=[
+            Message(role='user', content='<image>Using LaTeX to perform OCR on the image.', images=[row['image']]),
+            Message(role='assistant', content=row['text']),
+        ])
+
+
+# 1. Init
+client = init_twinkle_client(base_url='http://localhost:8000', api_key='EMPTY_API_KEY')
+
+# 2. Runtime (before model setup)
+rt = TrainingRuntime()
+rt.start(model_id='Qwen/Qwen3.5-4B', config={'task': 'multimodal-sft'}, script_path=__file__)
+
+# 3. LazyDataset for multimodal (defers processing to avoid OOM)
+dataset = LazyDataset(DatasetMeta('ms://AI-ModelScope/LaTeX_OCR', data_slice=range(500)))
+dataset.set_template('Qwen3_5Template', model_id=MODEL_ID, max_length=512)
+dataset.map(LatexOCRProcessor)
+dataset.encode(batched=True)
+dataloader = DataLoader(dataset=dataset, batch_size=8, num_workers=0, drop_last=True)
+
+# 4. Model setup
+model = MultiLoraTransformersModel(model_id=MODEL_ID)
+model.add_adapter_to_model('default', LoraConfig(target_modules='all-linear'), gradient_accumulation_steps=2)
+model.set_template('Qwen3_5Template')
+model.set_processor('InputProcessor', padding_side='right')
+model.set_loss('CrossEntropyLoss')
+model.set_optimizer('Adam', lr=1e-4)
+rt.register_graceful_shutdown(model, dataloader)
+
+# 5. Train
+for epoch in range(3):
+    for step, batch in enumerate(dataloader):
+        # Important: convert numpy/torch for serialization
+        for sample in batch:
+            for key in sample:
+                if isinstance(sample[key], np.ndarray):
+                    sample[key] = sample[key].tolist()
+                elif isinstance(sample[key], torch.Tensor):
+                    sample[key] = sample[key].cpu().numpy().tolist()
+
+        model.forward_backward(inputs=batch)
+        model.clip_grad_and_step()
+
+        if step % 2 == 0:
+            metric = model.calculate_metric(is_training=True)
+            rt.log_metrics(step=step, total_steps=len(dataloader), **metric.result)
+
+    model.save(name=f'multimodal-epoch-{epoch}', save_optimizer=True)
+
+rt.finish(status='completed')
+```
+
+### Example 5: Sampling / Inference Only
+
+```python
+from twinkle_client import init_twinkle_client
+from twinkle_client.sampler import vLLMSampler
+
+# 1. Init
+client = init_twinkle_client(base_url='http://127.0.0.1:8000', api_key='EMPTY_API_KEY')
+
+# 2. Create sampler
+sampler = vLLMSampler(model_id='Qwen/Qwen3.5-4B')
+sampler.set_template('Qwen3_5Template', model_id='Qwen/Qwen3.5-4B')
+
+# 3. Prepare input as Trajectory
+trajectory = {
+    'messages': [
+        {'role': 'system', 'content': 'You are a helpful assistant.'},
+        {'role': 'user', 'content': 'Who are you?'},
+    ]
+}
+
+# 4. Sample (with optional LoRA adapter)
+responses = sampler.sample(
+    inputs=[trajectory] * 4,  # the same prompt repeated 4 times
+    sampling_params={'max_tokens': 128, 'temperature': 1.0, 'num_samples': 2},
+    adapter_uri='twinkle://...',  # optional: from a model.save() result
+)
+
+# 5. Decode
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3.5-4B', trust_remote_code=True)
+for response in responses:
+    for seq in response.sequences:
+        text = tokenizer.decode(seq.tokens, skip_special_tokens=True)
+        print(text)
+```
+
+---
+
+## Server Mode Architecture
+
+```
+┌─ Twinkle Server (Ray + GPU) ─────────────────────────────┐
+│  Base Model → adapter 'exp-01' (weights + optimizer)      │
+│            → adapter 'exp-02' (weights + optimizer)      │
+│  vLLM Sampler → shared inference engine                   │
+└───────────────────────────────────────────────────────────┘
+         ↑ HTTP (forward_backward, clip_grad_and_step, save, sample)
+┌─ Client Script (CPU only, stateless) ────────────────────┐
+│  Data loading + Training loop + Reward computation        │
+└───────────────────────────────────────────────────────────┘
+```
+
+**Key implications:**
+- "Pause" = kill client (SIGKILL) → server retains all state
+- "Stop" = SIGTERM → saves checkpoint + dataloader state → exits
+- "Resume" = restart with same adapter_name → continues seamlessly
+- "Reset" = use new adapter_name → fresh start
+
+### Starting Local Server
+
+```bash
+# 1. Start Ray
+CUDA_VISIBLE_DEVICES=0,1,2,3 ray start --head --port=6379 --num-gpus=4 --disable-usage-stats
+CUDA_VISIBLE_DEVICES="" ray start --address=127.0.0.1:6379 --num-gpus=0  # CPU worker
+
+# 2. Start server
+python server.py  # reads server_config.yaml, blocks
+```
+
+The TUI agent's `start_server` tool handles all of this automatically: it generates the config, starts Ray, and launches the server.
+
+---
+
+## Built-in Components Summary
+
+| Type | Available | Import Path |
+|------|-----------|-------------|
+| **Loss** | `CrossEntropyLoss`, `GRPOLoss`, `DPOLoss`, `GKDLoss` | `twinkle.loss` |
+| **Preprocessor** | `GSM8KProcessor`, `SelfCognitionProcessor`, `EmojiDPOProcessor` | `twinkle.preprocessor` |
+| **Reward** | `GSM8KAccuracyReward`, `GSM8KFormatReward` | `twinkle.reward` |
+| **Advantage** | `GRPOAdvantage` | `twinkle.advantage` |
+| **Metric** | `CompletionRewardMetric`, `DPOMetric` | `twinkle.metric` |
+| **Template** | `Qwen3_5Template` | (string name to `set_template`) |
+| **Processor** | `InputProcessor` | (string name to `set_processor`) |
+
+**Cloud mode restriction:** Only built-in components (referenced by their name strings) are supported, because custom classes cannot be serialized.
+
+---
+
+## Tinker-Compatible API (Alternative)
+
+For GRPO with the Tinker-compatible API:
+```python
+from twinkle import init_tinker_client
+init_tinker_client()
+from tinker import ServiceClient, types
+
+service_client = ServiceClient(base_url=BASE_URL, api_key=API_KEY)
+training_client = service_client.create_lora_training_client(base_model='Qwen/Qwen3.5-4B', rank=16)
+training_client.forward_backward(datums, 'importance_sampling').result()
+training_client.optim_step(types.AdamParams(learning_rate=2e-5)).result()
+sampling_client = training_client.save_weights_and_get_sampling_client(name='step-N')
+```
+
+---
+
+## File Layout
+
+```
+~/.cache/twinkle/{run_id}/
+├── meta.json       # Run metadata (model_id, config, status, pid, script_version)
+├── train.py        # Current active script
+├── train_v1.py     # Archived versions
+├── metrics.jsonl   # One JSON line per step
+├── logs.jsonl      # One JSON line per event
+└── stderr.log      # Script stderr output
+```
+
+---
+
+## OpenAI-Compatible Endpoint
+
+The server also exposes an OpenAI-compatible `/v1/chat/completions` endpoint (under the `/api` prefix in the example below):
+
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url='http://127.0.0.1:8000/api/v1', api_key='EMPTY_API_KEY')
+resp = client.chat.completions.create(
+    model='Qwen/Qwen3.5-4B',
+    messages=[{'role': 'user', 'content': 'Hello!'}],
+    max_tokens=128,
+    temperature=0.7,
+    stream=True,  # streaming supported
+)
+for chunk in resp:
+    print(chunk.choices[0].delta.content or '', end='')  # content is None on role/final chunks
+```
diff --git a/src/twinkle_client/skills/local_provider.py b/src/twinkle_client/skills/local_provider.py
new file mode 100644
index 000000000..35c79c8f3
--- /dev/null
+++ b/src/twinkle_client/skills/local_provider.py
@@ -0,0 +1,35 @@
+# Copyright (c) Twinkle Contributors. All rights reserved.
+"""Local skill provider - loads skill markdown files from user's local skills directory."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from twinkle_client.skills.base import SkillProvider
+
+# Default: user-local skills directory (user drops .md files here)
+_DEFAULT_SKILLS_DIR = Path.home() / '.cache' / 'twinkle' / 'tui' / 'skills' / 'local'
+
+
+class LocalSkillProvider(SkillProvider):
+    """Skill provider backed by a local directory (default: ~/.cache/twinkle/tui/skills/local/).
+
+    Users drop .md skill files there to extend the agent's domain knowledge without code changes.
+    """
+
+    def __init__(self, skills_dir: Path | str | None = None):
+        # Falsy skills_dir selects the default; expanduser() lets callers pass '~/my-skills'.
+        self._skills_dir = Path(skills_dir).expanduser() if skills_dir else _DEFAULT_SKILLS_DIR
+        super().__init__(cache_dir=self._skills_dir)
+
+    @property
+    def name(self) -> str:
+        """Provider identifier."""
+        return 'local'
+
+    async def fetch(self) -> None:
+        """Ensure the local skills directory exists."""
+        self._skills_dir.mkdir(parents=True, exist_ok=True)
+
+    def _skills_root(self) -> Path:
+        return self._skills_dir
diff --git a/src/twinkle_client/skills/manager.py b/src/twinkle_client/skills/manager.py
new file mode 100644
index 000000000..34ceb9c56
--- /dev/null
+++ b/src/twinkle_client/skills/manager.py
@@ -0,0 +1,94 @@
+# Copyright (c) Twinkle Contributors. All rights reserved.
+"""Skill manager - orchestrates multiple skill providers and formats for LLM injection."""
+
+from __future__ import annotations
+
+from twinkle.utils.logger import get_logger
+
+from twinkle_client.skills.base import Skill, SkillProvider
+
+logger = get_logger()
+
+
+class SkillManager:
+    """Aggregates skills from any number of registered SkillProviders.
+
+    This is the agent's single entry point for skills: register providers
+    (ModelScope, HuggingFace, local, ...), load them all, then render one
+    combined prompt section for injection into the LLM system prompt.
+
+    Usage:
+        manager = SkillManager()
+        manager.register(ModelScopeSkillProvider())
+        manager.register(HuggingFaceSkillProvider())  # future
+        await manager.load_all()
+        prompt_section = manager.format_for_prompt()
+    """
+
+    def __init__(self):
+        self._skills: list[Skill] = []
+        self._providers: list[SkillProvider] = []
+
+    def register(self, provider: SkillProvider) -> None:
+        """Add a provider whose skills will be picked up by the next load_all().
+
+        Args:
+            provider: An instance of a SkillProvider subclass.
+        """
+        self._providers.append(provider)
+        logger.info(f'Registered skill provider: {provider.name}')
+
+    async def load_all(self) -> list[Skill]:
+        """Reload skills from every registered provider, in registration order.
+
+        A provider that raises is logged and skipped; the others still load.
+        Returns the combined skill list (the same list kept on the manager).
+        """
+        self._skills = []
+        for source in self._providers:
+            try:
+                fetched = await source.get_skills()
+                self._skills.extend(fetched)
+                logger.info(f'Provider [{source.name}] loaded {len(fetched)} skills')
+            except Exception as err:
+                logger.error(f'Provider [{source.name}] failed: {err}')
+        return self._skills
+
+    @property
+    def skills(self) -> list[Skill]:
+        """Skills gathered by the most recent load_all() (empty before the first load)."""
+        return self._skills
+
+    def format_for_prompt(self) -> str:
+        """Render every loaded skill as one text block for the LLM system prompt.
+
+        Layout: an '# Available Skills' heading and a short intro, then for each
+        skill a '## Skill: NAME' heading, its source, its content and a '---' rule.
+
+        Returns:
+            The formatted text, or an empty string when no skills are loaded.
+        """
+        if not self._skills:
+            return ''
+
+        intro = (
+            'The following skills provide you with specialized knowledge and capabilities. '
+            'Use them to better assist the user.'
+        )
+
+        # Every chunk ends with '\n', so joining on '\n' leaves one blank line between chunks.
+        chunks: list[str] = [f'# Available Skills\n\n{intro}\n']
+        for skill in self._skills:
+            chunks.append(
+                f'## Skill: {skill.name}\n'
+                f'(source: {skill.source})\n'
+                '\n'
+                + skill.content
+                + '\n\n---\n'
+            )
+
+        return '\n'.join(chunks)
+
+    def get_skill_names(self) -> list[str]:
+        """List the name of each loaded skill, in load order (handy for logging/debugging)."""
+        return [skill.name for skill in self._skills]
diff --git a/src/twinkle_client/skills/modelscope_provider.py b/src/twinkle_client/skills/modelscope_provider.py
new file mode 100644
index 000000000..9fe27f9c4
--- /dev/null
+++ b/src/twinkle_client/skills/modelscope_provider.py
@@ -0,0 +1,65 @@
+# Copyright (c) Twinkle Contributors. All rights reserved.
+"""ModelScope skill provider - fetches skills from modelscope-skills GitHub repo."""
+
+from __future__ import annotations
+
+import asyncio
+from twinkle.utils.logger import get_logger
+from pathlib import Path
+
+from twinkle_client.skills.base import SkillProvider
+
+logger = get_logger()
+
+_DEFAULT_REPO_URL = 'https://github.com/modelscope/modelscope-skills.git'
+_DEFAULT_BRANCH = 'main'
+
+
+class ModelScopeSkillProvider(SkillProvider):
+    """Fetches skill markdown files from the modelscope-skills GitHub repository.
+
+    The repo is shallow-cloned into the cache dir's ``repo`` folder on first use and
+    pulled (fast-forward only) on later calls. Non-zero git exits are logged, not raised.
+    """
+
+    def __init__(
+        self,
+        repo_url: str = _DEFAULT_REPO_URL,
+        branch: str = _DEFAULT_BRANCH,
+        cache_dir: Path | None = None,
+    ):
+        self._repo_url = repo_url
+        self._branch = branch
+        super().__init__(cache_dir=cache_dir)
+
+    @property
+    def name(self) -> str:
+        """Provider identifier."""
+        return 'modelscope'
+
+    async def fetch(self) -> None:
+        """Clone or pull the modelscope-skills repository."""
+        repo_dir = self.cache_dir / 'repo'
+        if (repo_dir / '.git').exists():
+            code, err = await self._run_git('-C', str(repo_dir), 'pull', '--ff-only')
+            if code != 0:
+                logger.warning(f'git pull failed: {err}')
+            return
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+        code, err = await self._run_git(
+            'clone', '--depth', '1', '--branch', self._branch, self._repo_url, str(repo_dir))
+        if code != 0:
+            logger.error(f'git clone failed: {err}')
+
+    @staticmethod
+    async def _run_git(*args: str) -> tuple[int | None, str]:
+        """Run git with the given args, capturing output; return (exit code, stripped stderr)."""
+        proc = await asyncio.create_subprocess_exec(
+            'git', *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
+        _, stderr = await proc.communicate()
+        # errors='replace': localized git messages may not be valid UTF-8.
+        return proc.returncode, stderr.decode(errors='replace').strip()
+
+    def _skills_root(self) -> Path:
+        """Directory of the local checkout."""
+        return self.cache_dir / 'repo'
diff --git a/src/twinkle_client/tui/__init__.py b/src/twinkle_client/tui/__init__.py
new file mode 100644
index 000000000..26bf6d557
--- /dev/null
+++ b/src/twinkle_client/tui/__init__.py
@@ -0,0 +1,133 @@
+# Copyright (c) Twinkle Contributors. All rights reserved.
+"""Twinkle TUI - Terminal User Interface for training control."""
+
+import logging
+from pathlib import Path
+
+from twinkle.utils.logger import get_logger
+
+# ── Log file: ./tui.log (current working directory) ──
+_LOG_FILE = Path.cwd() / 'tui.log'
+
+
+def _configure_logging(verbose: bool = False) -> None:
+    """Configure file-only logging for the TUI (DEBUG if ``verbose``, else INFO).
+
+    The 'twinkle' logger and captured Python warnings are written to ./tui.log
+    (rotated at 5MB, 3 backups). NO console output — that would corrupt Textual's
+    alt-screen buffer. Handlers left by an earlier call are closed and replaced.
+    """
+    from logging.handlers import RotatingFileHandler
+
+    handler = RotatingFileHandler(
+        _LOG_FILE,
+        maxBytes=5 * 1024 * 1024,  # 5MB
+        backupCount=3,
+        encoding='utf-8',
+    )
+    handler.setFormatter(logging.Formatter(
+        '%(asctime)s [%(levelname)s] %(name)s: %(message)s',
+        datefmt='%Y-%m-%d %H:%M:%S',
+    ))
+
+    # 'twinkle' is the logger returned by get_logger(); 'py.warnings' is where
+    # logging.captureWarnings() routes warnings (it is not a child of 'twinkle').
+    for logger_name in ('twinkle', 'py.warnings'):
+        target = logging.getLogger(logger_name)
+        # Remove existing handlers (especially terminal StreamHandlers) and close
+        # them, so a repeated call does not leak the previous tui.log file handle.
+        for stale in list(target.handlers):
+            target.removeHandler(stale)
+            stale.close()
+        target.addHandler(handler)
+        target.propagate = False
+    logging.getLogger('twinkle').setLevel(logging.DEBUG if verbose else logging.INFO)
+
+    # Mark as initialized so get_logger() won't re-add a StreamHandler
+    from twinkle.utils.logger import init_loggers
+    init_loggers['twinkle'] = True
+
+    # Capture warnings from third-party libs; they reach tui.log via 'py.warnings'
+    logging.captureWarnings(True)
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Programmatic entry point for ``twinkle-tui``.
+
+    Mirrors ``twinkle.server.cli:main``: runs the Typer app in standalone mode and
+    returns its SystemExit code as an int (0 if none, 1 if it is not numeric).
+    """
+    import typer
+    from twinkle.version import __version__
+
+    app = typer.Typer(
+        add_completion=False,
+        no_args_is_help=False,
+        help='Twinkle TUI — ML Training Control via Natural Language.',
+    )
+
+    def _version_callback(value: bool) -> None:
+        if value:
+            typer.echo(f'twinkle-tui {__version__}')
+            raise typer.Exit()
+
+    @app.command()
+    def launch(
+        run_id: str | None = typer.Option(
+            None, '--run-id', '-r',
+            envvar='TWINKLE_TUI_RUN_ID',
+            help='Attach to an existing training run by ID.',
+        ),
+        llm_base_url: str = typer.Option(
+            'http://localhost:11434/v1', '--llm-base-url',
+            envvar='TWINKLE_LLM_BASE_URL',
+            help='LLM API base URL.',
+        ),
+        llm_model: str = typer.Option(
+            'qwen3.5', '--llm-model',
+            envvar='TWINKLE_LLM_MODEL',
+            help='LLM model name.',
+        ),
+        llm_api_key: str = typer.Option(
+            'not-needed', '--llm-api-key',
+            envvar='TWINKLE_LLM_API_KEY',
+            help='LLM API key.',
+        ),
+        verbose: bool = typer.Option(
+            False, '--verbose', '-v',
+            envvar='TWINKLE_TUI_VERBOSE',
+            help='Enable verbose (DEBUG) logging.',
+        ),
+        version: bool = typer.Option(
+            False, '--version', '-V',
+            callback=_version_callback, is_eager=True,
+            help='Show version and exit.',
+        ),
+    ) -> None:
+        """Launch the Twinkle TUI."""
+        _configure_logging(verbose=verbose)
+        logger = get_logger()
+        logger.info(
+            f'TUI starting — model={llm_model}, base_url={llm_base_url}, '
+            f'run_id={run_id}, log_file={_LOG_FILE}'
+        )
+
+        from twinkle_client.tui.app import TwinkleTUI
+
+        tui = TwinkleTUI(
+            run_id=run_id,
+            llm_base_url=llm_base_url,
+            llm_model=llm_model,
+            llm_api_key=llm_api_key,
+        )
+        tui.run()
+
+    try:
+        app(args=argv, standalone_mode=True)
+    except SystemExit as exc:
+        code = 0 if exc.code is None else exc.code
+        try:
+            return int(code)
+        except (TypeError, ValueError):
+            return 1  # non-numeric code (e.g. sys.exit('msg')) means failure, as in the interpreter
+    return 0
diff --git a/src/twinkle_client/tui/__main__.py b/src/twinkle_client/tui/__main__.py
new file mode 100644
index 000000000..032067683
--- /dev/null
+++ b/src/twinkle_client/tui/__main__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Twinkle Contributors. All rights reserved.
+"""Allow running as `python -m twinkle_client.tui`."""
+
+import sys
+
+from twinkle_client.tui import main
+
+if __name__ == '__main__':
+    sys.exit(main())  # use main()'s integer return value as the process exit status
diff --git a/src/twinkle_client/tui/agent/__init__.py b/src/twinkle_client/tui/agent/__init__.py
new file mode 100644
index 000000000..a011eb543
--- /dev/null
+++ b/src/twinkle_client/tui/agent/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Twinkle Contributors. All rights reserved.
+from twinkle_client.tui.agent.core import AgentLoop
+from twinkle_client.tui.agent.monitor import TrainingMonitor
+
+__all__ = ['AgentLoop', 'TrainingMonitor']
diff --git a/src/twinkle_client/tui/agent/core.py b/src/twinkle_client/tui/agent/core.py
new file mode 100644
index 000000000..ec5e0609f
--- /dev/null
+++ b/src/twinkle_client/tui/agent/core.py
@@ -0,0 +1,217 @@
+# Copyright (c) Twinkle Contributors. All rights reserved.
+"""Agent core - async LLM tool-calling loop using OpenAI-compatible API."""
+
+from __future__ import annotations
+
+import asyncio
+import json
+from twinkle.utils.logger import get_logger
+from typing import Any, Callable
+
+from openai import AsyncOpenAI
+
+from twinkle_client.tui.agent.prompts import SYSTEM_PROMPT
+from twinkle_client.tui.agent.tools import TOOL_SCHEMAS, ToolExecutor
+from twinkle_client.tui.connection import LocalConnection
+
+logger = get_logger()
+
+
+class AgentLoop:
+    """Async tool-calling agent loop using OpenAI-compatible API.
+
+    Manages conversation history, LLM calls, and tool execution.
+    Includes automatic history pruning to prevent context overflow.
+    """
+
+    MAX_TOOL_ROUNDS = 10  # prevent infinite tool loops
+    MAX_HISTORY_MESSAGES = 50  # keep last N messages (excluding system prompt)
+
+    def __init__(
+        self,
+        connection: LocalConnection,
+        llm_base_url: str,
+        llm_model: str,
+        llm_api_key: str,
+        skills_prompt: str = '',
+    ):
+        self.connection = connection
+        self.llm_model = llm_model
+        self._client = AsyncOpenAI(base_url=llm_base_url, api_key=llm_api_key)
+        self._tool_executor = ToolExecutor(connection)
+        # Build system prompt with optional skills section
+        full_prompt = SYSTEM_PROMPT
+        if skills_prompt:
+            full_prompt = f'{SYSTEM_PROMPT}\n\n{skills_prompt}'
+        self.history: list[dict[str, Any]] = [
+            {'role': 'system', 'content': full_prompt},
+        ]
+
+    async def send(
+        self,
+        user_input: str,
+        on_token: Callable[[str], None] | None = None,
+        on_stream_reset: Callable[[], None] | None = None,
+    ) -> str:
+        """Process user input through LLM with tool calling.
+
+        Runs at most MAX_TOOL_ROUNDS LLM rounds. A tool call whose arguments are not a
+        valid JSON object is not executed; the error is fed back as its tool result.
+
+        Args:
+            user_input: The user's message text.
+            on_token: Callback for each streamed text chunk.
+            on_stream_reset: Called when a tool-call is detected mid-stream,
+                signalling the UI to discard any partially-displayed tokens.
+
+        Returns the final assistant text response.
+        """
+        logger.info(f'User message: {user_input[:200]}')
+        self.history.append({'role': 'user', 'content': user_input})
+        self._prune_history()
+
+        for round_idx in range(self.MAX_TOOL_ROUNDS):
+            logger.debug(f'LLM call round {round_idx + 1}/{self.MAX_TOOL_ROUNDS}')
+            content, tool_calls = await self._call_llm_stream(on_token=on_token)
+
+            # If no tool calls, we're done — content was correctly streamed
+            if not tool_calls:
+                self.history.append({'role': 'assistant', 'content': content})
+                logger.info(f'Assistant response ({len(content)} chars): {content[:150]}...')
+                return content
+
+            # Tool calls detected — discard any leaked intermediate tokens
+            if on_stream_reset:
+                on_stream_reset()
+
+            logger.info(f'Tool calls detected: {[tc["function"]["name"] for tc in tool_calls]}')
+            self.history.append({'role': 'assistant', 'content': content, 'tool_calls': tool_calls})
+
+            for tc in tool_calls:
+                func_name = tc['function']['name']
+                raw_args = tc['function']['arguments']
+                try:
+                    args = json.loads(raw_args) if raw_args else {}
+                    if not isinstance(args, dict):
+                        raise ValueError(f'expected a JSON object, got {type(args).__name__}')
+                except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
+                    # Never run a tool with guessed/empty arguments: hand the error back
+                    # as the tool result so the LLM can retry with well-formed JSON.
+                    logger.error(f'Tool {func_name}: invalid JSON args: {e}\n  raw={raw_args[:500]}')
+                    result = f'Error: invalid arguments for tool {func_name}: {e}'
+                else:
+                    logger.info(f'Executing tool: {func_name}({", ".join(f"{k}={v!r}" for k, v in list(args.items())[:5])})')
+                    result = await self._tool_executor.execute(func_name, args)
+                logger.debug(f'Tool {func_name} result ({len(result)} chars): {result[:300]}')
+                self.history.append({'role': 'tool', 'tool_call_id': tc['id'], 'content': result})
+
+        # Exceeded max rounds
+        logger.warning(f'Exceeded max tool rounds ({self.MAX_TOOL_ROUNDS})')
+        fallback = 'I reached the maximum number of tool calls. Please try a simpler request.'
+        self.history.append({'role': 'assistant', 'content': fallback})
+        return fallback
+
+    async def _call_llm_stream(
+        self,
+        on_token: Callable[[str], None] | None = None,
+    ) -> tuple[str, list[dict[str, Any]]]:
+        """Make a streaming LLM API call. Accumulates content and tool_calls.
+
+        Streams text tokens via on_token until the first tool-call delta is seen.
+        Returns (full_content, tool_calls_list).
+        """
+        try:
+            stream = await self._client.chat.completions.create(
+                model=self.llm_model,
+                messages=self.history,
+                tools=TOOL_SCHEMAS,
+                tool_choice='auto',
+                stream=True,
+            )
+        except Exception as e:
+            logger.error(f'LLM API call failed: {type(e).__name__}: {e}')
+            raise
+
+        content_parts: list[str] = []
+        tool_calls_map: dict[int, dict[str, Any]] = {}
+        has_tool_calls = False
+        chunk_count = 0
+
+        async for chunk in stream:
+            delta = chunk.choices[0].delta if chunk.choices else None
+            if delta is None:
+                continue
+
+            # Accumulate content
+            if delta.content:
+                content_parts.append(delta.content)
+                # Only stream tokens if no tool calls detected yet
+                if not has_tool_calls and on_token:
+                    on_token(delta.content)
+
+            # Accumulate tool calls
+            if delta.tool_calls:
+                has_tool_calls = True
+                for tc_delta in delta.tool_calls:
+                    idx = tc_delta.index
+                    if idx not in tool_calls_map:
+                        tool_calls_map[idx] = {
+                            'id': '',
+                            'type': 'function',
+                            'function': {'name': '', 'arguments': ''},
+                        }
+                    tc = tool_calls_map[idx]
+                    if tc_delta.id:
+                        tc['id'] = tc_delta.id
+                    if tc_delta.function:
+                        if tc_delta.function.name:
+                            tc['function']['name'] += tc_delta.function.name
+                        if tc_delta.function.arguments:
+                            tc['function']['arguments'] += tc_delta.function.arguments
+
+            # Yield to event loop periodically to allow UI rendering
+            chunk_count += 1
+            if chunk_count % 5 == 0:
+                await asyncio.sleep(0)
+
+        content = ''.join(content_parts)
+        tool_calls = [tool_calls_map[i] for i in sorted(tool_calls_map)] if tool_calls_map else []
+        return content, tool_calls
+
+    def set_metrics_callback(self, callback: Callable) -> None:
+        """Set the callback for metrics zoom control."""
+        self._tool_executor.metrics_callback = callback
+
+    def set_select_metrics_callback(self, callback: Callable[[list[str]], dict]) -> None:
+        """Set the callback for selecting which metrics to display."""
+        self._tool_executor.select_metrics_callback = callback
+
+    def set_run_selected_callback(self, callback: Callable[[str], None]) -> None:
+        """Set the callback invoked when the agent switches to a different run."""
+        self._tool_executor.on_run_selected = callback
+
+    def inject_skills(self, skills_prompt: str) -> None:
+        """Inject skills into the system prompt after initial load.
+
+        Called asynchronously once skills finish loading, so the agent
+        is usable immediately even before skills are ready.
+        """
+        if not skills_prompt:
+            return
+        self.history[0]['content'] = f"{self.history[0]['content']}\n\n{skills_prompt}"
+
+    def _prune_history(self) -> None:
+        """Prune conversation history to prevent context overflow.
+
+        Keeps the system prompt (index 0) and the most recent messages.
+        Cuts at a 'user' message boundary to avoid splitting tool_call sequences.
+        """
+        if len(self.history) <= self.MAX_HISTORY_MESSAGES + 1:
+            return
+        # Find the nearest 'user' message at or after the ideal cut point
+        cut_idx = len(self.history) - self.MAX_HISTORY_MESSAGES
+        while cut_idx < len(self.history) and self.history[cut_idx]['role'] != 'user':
+            cut_idx += 1
+        if cut_idx >= len(self.history):
+            return  # No safe cut point found; skip pruning this round
+        self.history = [self.history[0]] + self.history[cut_idx:]
diff --git a/src/twinkle_client/tui/agent/monitor.py b/src/twinkle_client/tui/agent/monitor.py
new file mode 100644
index 000000000..a638c8e9e
--- /dev/null
+++ b/src/twinkle_client/tui/agent/monitor.py
@@ -0,0 +1,399 @@
+# Copyright (c) Twinkle Contributors. All rights reserved.
+"""Training monitor - LLM-driven periodic health check.
+
+Every poll cycle, the monitor gathers ALL available signals about the current
+training run (process status, output logs, metrics) and feeds them to the LLM.
+The LLM decides:
+- LGTM: everything normal, no action needed
+- WARNING: report an observation to the user (metrics anomaly, slow progress, etc.)
+- FIX: the script has a bug → LLM outputs a fixed script → monitor applies it and restarts
+
+This unified approach handles crashes, hangs, abnormal training, and metrics
+anomalies in a single loop without separate hard-coded detection logic.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import re
+import time
+from pathlib import Path
+from typing import Any, Callable
+
+from openai import AsyncOpenAI
+
+from twinkle.utils.logger import get_logger
+from twinkle_client.tui.connection import LocalConnection
+
+logger = get_logger()
+
+# Maximum auto-fix attempts per run (prevent infinite retry loops)
+_MAX_FIX_ATTEMPTS = 3
+
+MONITOR_SYSTEM_PROMPT = """\
+You are an automated ML training health monitor. Every ~30 seconds you receive a \
+snapshot of ALL signals from a training run. Your job is to analyze and decide \
+what action (if any) to take.
+
+## Signals you will receive
+
+- **Process status**: alive / zombie / exited / unknown
+- **output.log tail**: recent process output (stdout+stderr combined, may contain errors or warnings)
+- **Metrics**: recent training metrics (loss, reward, lr, etc.)
+- **Stall duration**: seconds since last new metric was produced
+- **Current train.py**: the full training script source (provided for accurate fixes)
+
+## Decision framework
+
+1. **LGTM** — training is progressing normally.
+   - Process alive, metrics flowing, no errors in output, loss trending down.
+   - Respond: `LGTM`
+
+2. **WARNING** — something worth noting but not script-breaking.
+   - Loss plateau, reward hacking, KL explosion, entropy collapse, stall < 5 min, etc.
+   - Respond with a BRIEF (1-3 sentence) observation + suggestion.
+
+3. **FIX** — the script has crashed or is broken and needs code changes.
+   - Process dead/zombie with error traceback in output.
+   - Server returned an error (400/500) that indicates a code bug.
+   - Process stuck > 10 minutes with no metrics AND output shows an error.
+   - Respond in this EXACT format:
+```diagnosis
+<1-2 sentence root cause>
+```
+```python
+<complete fixed training script>
+```
+
+## Rules
+- Be direct and actionable.
+- Respond in the same language as the log content (Chinese or English).
+- NEVER start with LGTM if there is any issue.
+- For FIX: output the COMPLETE fixed script based on the provided "Current train.py". Only modify the lines that cause the error — do NOT rewrite from scratch or change the overall architecture.
+- **MUST preserve resume logic** in fixed scripts: `rt.get_resume_info()` + `dataloader.skip_consumed_samples()`. Never hardcode `global_step = 0` if resume logic exists in the original.
+- Common fixes:
+  - "Batch size N must be >= data world size M" → increase batch_size to M
+  - "save_dir does not exist on the server" → remove the save_dir parameter
+  - Import errors → fix the import
+  - Connection refused → check base_url
+  - "Unknown format code 'f' for object of type 'str'" → remove float format specifiers (:.4f etc.) from print statements
+- Do NOT suggest FIX for transient issues (network blip, temporary stall < 5 min).
+- If process is alive and metrics are flowing but stale for < 3 min, say LGTM.
+"""
+
+
+class TrainingMonitor:
+    """Unified LLM-driven training health monitor.
+
+    Every poll cycle, collects all available signals and asks the LLM
+    to analyze. The LLM may respond with LGTM, a warning, or a FIX
+    (complete fixed script). The monitor applies fixes automatically.
+    """
+
+    _MAX_METRICS_FOR_LLM = 30
+
+    def __init__(
+        self,
+        connection: LocalConnection,
+        on_message: Callable[[str], None],
+        llm_base_url: str = 'http://localhost:11434/v1',
+        llm_model: str = 'qwen3.5',
+        llm_api_key: str = 'not-needed',
+        poll_interval: float = 30.0,
+    ):
+        self.connection = connection
+        self.on_message = on_message  # receives monitor notices as '[Monitor] ...' strings
+        self.llm_model = llm_model
+        self.poll_interval = poll_interval  # seconds slept between cycles in run()
+        self._client = AsyncOpenAI(base_url=llm_base_url, api_key=llm_api_key)
+        self._running = True  # cleared by stop()
+        # Per-run tracking; _check() resets the metric and snapshot fields when the run changes
+        self._last_analyzed_run_id: str | None = None
+        self._last_metric_time: float = time.time()
+        self._last_metric_step: int = -1
+        self._fix_attempts: dict[str, int] = {}  # run_id -> auto-fix attempts made so far
+        # Fingerprint of the last snapshot seen by _check(); a repeat skips the LLM call
+        self._last_snapshot_hash: str = ''
+
+    async def run(self) -> None:
+        """Call _check() every poll_interval seconds until stop() clears the running flag."""
+        while self._running:
+            try:
+                await self._check()
+            except asyncio.CancelledError:
+                break  # cancellation raised inside _check() ends the loop quietly
+            except Exception as e:
+                logger.debug(f'Monitor cycle error: {e}')  # any other failure only skips this cycle
+            await asyncio.sleep(self.poll_interval)
+
+    def stop(self) -> None:
+        self._running = False  # run() tests this flag before each cycle, so its loop ends after the current one
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Core: gather signals → ask LLM → act on response
+    # ──────────────────────────────────────────────────────────────────────
+
+    async def _check(self) -> None:
+        """Run one monitoring cycle: snapshot the current run, ask the LLM, act on the reply.
+
+        Returns early when no run is selected, when the run is completed/stopped/paused,
+        or when the snapshot fingerprint equals the last analyzed one. A snapshot only
+        counts as analyzed once the LLM has replied, so a failed call is retried.
+        """
+        run_id = self.connection.current_run_id  # read once so the whole cycle sees one run
+        if not run_id:
+            return
+
+        # Reset per-run tracking when the selected run changes
+        if run_id != self._last_analyzed_run_id:
+            self._last_analyzed_run_id = run_id
+            self._last_metric_time = time.time()
+            self._last_metric_step = -1
+            self._last_snapshot_hash = ''
+
+        # Skip if already completed/stopped by user
+        status = self.connection.get_status(run_id)
+        if status in ('completed', 'stopped', 'paused'):
+            return
+
+        # Dedup: skip the LLM call if the snapshot hasn't meaningfully changed
+        snapshot = self._gather_snapshot(run_id, status)
+        snapshot_hash = self._hash_snapshot(snapshot)
+        if snapshot_hash == self._last_snapshot_hash:
+            return
+
+        llm_response = await self._ask_llm(snapshot)
+        if llm_response is None:
+            return  # fingerprint stays unrecorded so the next cycle asks again
+        # Record before acting: _apply_fix clears the fingerprint after a restart
+        self._last_snapshot_hash = snapshot_hash
+        await self._act_on_response(run_id, llm_response, snapshot)
+
+    def _gather_snapshot(self, run_id: str, status: str) -> dict[str, Any]:
+        """Collect every health signal for ``run_id`` into one dict (see the returned keys).
+
+        Also advances the stall tracker: when the newest metric's step exceeds the
+        last one seen, the last-metric step and time are updated. File reads are
+        best-effort: a failure is logged at debug level and leaves that field empty.
+        """
+        run_dir = self.connection.base_dir / run_id
+
+        # Process status: 'unknown' when meta has no pid, otherwise 'alive' or 'dead'
+        pid = (self.connection.get_meta(run_id) or {}).get('pid')
+        process_status = 'unknown'
+        if pid:
+            process_status = 'alive' if self.connection._is_process_alive(pid) else 'dead'
+
+        # output.log tail: the last 1500 chars, cut to the last traceback that starts within them
+        output_tail = ''
+        output_file = run_dir / 'output.log'
+        if output_file.exists():
+            try:
+                # Read only the end of the file (1500 chars need at most 6000 UTF-8 bytes) so the
+                # whole log of a long run is not loaded on every poll. Assumes a UTF-8 log.
+                with output_file.open('rb') as fh:
+                    fh.seek(max(0, output_file.stat().st_size - 6003))
+                    raw = fh.read()
+                # Normalise newlines the way text-mode reading does (progress bars emit '\r')
+                text = raw.decode('utf-8', errors='replace').replace('\r\n', '\n').replace('\r', '\n')
+                tail = text[-1500:]
+                tb_idx = tail.rfind('Traceback (most recent call last)')
+                output_tail = tail[tb_idx:] if tb_idx >= 0 else tail
+            except Exception as e:
+                logger.debug(f'Monitor could not read {output_file}: {e}')
+
+        metrics = self.connection.get_metrics(run_id, last_n=self._MAX_METRICS_FOR_LLM)
+        # Update stall tracking
+        if metrics:
+            latest_step = metrics[-1].get('step', 0)
+            # A non-numeric step cannot be ordered; ignore it instead of failing the whole cycle
+            if isinstance(latest_step, (int, float)) and latest_step > self._last_metric_step:
+                self._last_metric_step = latest_step
+                self._last_metric_time = time.time()
+        stall_seconds = time.time() - self._last_metric_time
+
+        # Current train.py script content (for LLM to fix)
+        script_content = ''
+        script_file = run_dir / 'train.py'
+        if script_file.exists():
+            try:
+                script_content = script_file.read_text(errors='replace')
+            except Exception as e:
+                logger.debug(f'Monitor could not read {script_file}: {e}')
+
+        return {
+            'run_id': run_id,
+            'meta_status': status,
+            'process_status': process_status,
+            'output_tail': output_tail.strip(),
+            'metrics': metrics,
+            'stall_seconds': int(stall_seconds),
+            'fix_attempts': self._fix_attempts.get(run_id, 0),
+            'script_content': script_content,
+        }
+
+    def _hash_snapshot(self, snapshot: dict[str, Any]) -> str:
+        """Fingerprint the fields whose change means a snapshot is worth re-analyzing."""
+        # Digest the tail's content: a truncated tail has constant length, so len() misses new errors
+        parts = [
+            snapshot['process_status'],
+            str(hash(snapshot['output_tail'])),
+            str(snapshot['metrics'][-1].get('step', 0) if snapshot['metrics'] else 0),
+            str(snapshot['stall_seconds'] // 60),  # bucket by minute
+        ]
+        return '|'.join(parts)
+
+    def _format_snapshot(self, snapshot: dict[str, Any]) -> str:
+        """Render ``snapshot`` as the text block that _ask_llm sends as the user message.
+
+        Sections, in order: run header, output.log tail (if any), metrics (last 8 rows,
+        plus first-half vs second-half averages per numeric field once there are at
+        least 6 entries) and the current train.py (if any).
+        """
+        lines = [
+            f'## Run: {snapshot["run_id"]}',
+            f'- Meta status: {snapshot["meta_status"]}',
+            f'- Process: {snapshot["process_status"]}',
+            f'- Time since last metric: {snapshot["stall_seconds"]}s',
+            f'- Auto-fix attempts so far: {snapshot["fix_attempts"]}/{_MAX_FIX_ATTEMPTS}',
+            '',
+        ]
+
+        log_tail = snapshot['output_tail']
+        if log_tail:
+            lines += ['## output.log (tail)', f'```\n{log_tail}\n```', '']
+
+        history = snapshot['metrics']
+        if not history:
+            lines.append('## Metrics: NONE (no metrics produced yet)')
+        else:
+            # Field names are taken from the first entry; 'ts' is never shown
+            fields = [name for name in history[0].keys() if name != 'ts']
+            lines += [f'## Metrics ({len(history)} entries)', f'Fields: {", ".join(fields)}']
+            # Raw rows for the 8 newest entries
+            for entry in history[-8:]:
+                shown = {name: value for name, value in entry.items() if name != 'ts'}
+                lines.append(f'  {json.dumps(shown, default=str)}')
+            if len(history) >= 6:
+                # Trend: mean of the older half against mean of the newer half
+                half = len(history) // 2
+                lines += ['', 'Trend (first half → second half avg):']
+                for name in fields:
+                    if name in ('step', 'epoch', 'total_steps'):
+                        continue
+                    older = [m[name] for m in history[:half] if isinstance(m.get(name), (int, float))]
+                    newer = [m[name] for m in history[half:] if isinstance(m.get(name), (int, float))]
+                    if older and newer:
+                        older_avg = sum(older) / len(older)
+                        newer_avg = sum(newer) / len(newer)
+                        lines.append(f'  {name}: {older_avg:.6g} → {newer_avg:.6g}')
+
+        script = snapshot.get('script_content')
+        if script:
+            lines += ['', '## Current train.py', f'```python\n{script}\n```']
+
+        return '\n'.join(lines)
+
+    async def _ask_llm(self, snapshot: dict[str, Any]) -> str | None:
+        """Ask the LLM to assess ``snapshot``.
+
+        Returns the stripped reply text, or None when the reply is empty or the
+        request raises (the failure is logged at debug level).
+        """
+        user_content = self._format_snapshot(snapshot)
+
+        # Once the auto-fix budget is spent, the system prompt tells the LLM to stop proposing FIX
+        system_content = MONITOR_SYSTEM_PROMPT
+        if snapshot['fix_attempts'] >= _MAX_FIX_ATTEMPTS:
+            system_content += '\n\nNOTE: Auto-fix attempts exhausted. Do NOT suggest FIX. Only report warnings.'
+        conversation = [
+            {'role': 'system', 'content': system_content},
+            {'role': 'user', 'content': user_content},
+        ]
+        try:
+            reply = await self._client.chat.completions.create(
+                model=self.llm_model, messages=conversation, temperature=0.3, max_tokens=4096,
+            )
+            return (reply.choices[0].message.content or '').strip() or None
+        except Exception as e:
+            logger.debug(f'Monitor LLM call failed: {e}')
+            return None
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Act on LLM response
+    # ──────────────────────────────────────────────────────────────────────
+
+    async def _act_on_response(self, run_id: str, response: str, snapshot: dict[str, Any]) -> None:
+        """Route the LLM reply: LGTM is dropped, a FIX is applied, anything else is relayed.
+
+        ``snapshot`` is currently unused.
+        """
+        # Case 1: LGTM — no action. The prompt shows LGTM inside backticks, so tolerate
+        # markdown decoration (`LGTM`, **LGTM**) instead of relaying it as a warning.
+        if response.lstrip('`*_#> \t\r\n').upper().startswith('LGTM'):
+            return
+        # Case 2: FIX — contains a ```python block
+        diagnosis, fixed_script = self._parse_fix_response(response)
+        if fixed_script:
+            return await self._apply_fix(run_id, diagnosis, fixed_script)
+        self.on_message(f'[Monitor] {response}')  # Case 3: WARNING — just relay to user
+
+    async def _apply_fix(self, run_id: str, diagnosis: str, fixed_script: str) -> None:
+        """Replace the run's script with ``fixed_script`` and relaunch it, within the retry budget.
+
+        Each call that passes the budget check uses one of the _MAX_FIX_ATTEMPTS attempts,
+        whatever the outcome. A script that does not compile is rejected before update_script runs.
+        """
+        attempts = self._fix_attempts.get(run_id, 0)
+        if attempts >= _MAX_FIX_ATTEMPTS:
+            self.on_message(
+                f'[Monitor] 已达最大自动修复次数 ({_MAX_FIX_ATTEMPTS})，不再尝试。'
+                '请手动检查或输入指令。'
+            )
+            return
+        self.on_message(f'[Monitor] 检测到问题，正在自动修复 (第{attempts + 1}次)...\n诊断: {diagnosis}')
+        try:
+            # Syntax-check first: a truncated or malformed reply must not overwrite the script
+            compile(fixed_script, 'train.py', 'exec', dont_inherit=True)
+            self.connection.update_script(run_id, fixed_script)  # archives old version
+            result = self.connection.resume_training(run_id)  # re-launch
+            if result.get('status') == 'error':
+                self.on_message(f'[Monitor] 重启失败: {result.get("error", "unknown")}')
+            else:
+                self.on_message(f'[Monitor] 脚本已修复并重启 (PID: {result.get("pid", "?")})')
+                self._last_metric_time = time.time()  # reset stall tracking for the new attempt
+                self._last_metric_step = -1
+                self._last_snapshot_hash = ''
+        except Exception as e:
+            self.on_message(f'[Monitor] 自动修复失败: {e}')
+        self._fix_attempts[run_id] = attempts + 1
+
+    @staticmethod
+    def _parse_fix_response(response: str) -> tuple[str, str]:
+        """Parse LLM response for diagnosis + fixed script.
+
+        Returns (diagnosis, fixed_script). Both are empty if no ```python block
+        is found (meaning it's a WARNING, not a FIX). The diagnosis is the body of
+        the ```diagnosis block when there is one; otherwise the last non-empty,
+        non-fence line before the script, or 'Auto-fix applied' if there is none.
+        """
+        # Fence tags are matched case-insensitively so ```Python is still treated as a FIX
+        fence_flags = re.DOTALL | re.IGNORECASE
+
+        # Extract python code block
+        code_match = re.search(r'```python\s*\n(.*?)```', response, fence_flags)
+        if not code_match:
+            return '', ''
+        fixed_script = code_match.group(1).strip()
+
+        # Extract diagnosis block
+        diag_match = re.search(r'```diagnosis\s*\n(.*?)```', response, fence_flags)
+        if diag_match:
+            return diag_match.group(1).strip(), fixed_script
+
+        # Fallback: text before the python block. Lines are stripped before the fence
+        # test so an indented ``` is not mistaken for the diagnosis.
+        preamble = (line.strip() for line in response[:code_match.start()].splitlines())
+        candidates = [line for line in preamble if line and not line.startswith('```')]
+        return (candidates[-1] if candidates else 'Auto-fix applied'), fixed_script
diff --git a/src/twinkle_client/tui/agent/prompts.py b/src/twinkle_client/tui/agent/prompts.py
new file mode 100644
index 000000000..c7875acd5
--- /dev/null
+++ b/src/twinkle_client/tui/agent/prompts.py
@@ -0,0 +1,24 @@
+# Copyright (c) Twinkle Contributors. All rights reserved.
+"""System prompts for the TUI embedded agent."""
+
+SYSTEM_PROMPT = """\
+You are the Twinkle Training Assistant, an AI agent embedded in a TUI (Terminal User Interface) \
+that helps users manage ML model training.
+
+Your capabilities:
+1. **Training Control**: Start, pause, resume training. Modify hyperparameters on-the-fly.
+2. **Monitoring**: Analyze metrics (loss, reward, accuracy) and detect anomalies.
+3. **Guidance**: Help users choose datasets, models, training methods, and hyperparameters.
+4. **Search**: Search ModelScope/HuggingFace for models and datasets.
+5. **Chart Control**: Zoom, pan, and reset the metrics chart based on user's natural language requests.
+
+Rules:
+- Be concise. Users are in a terminal — avoid long paragraphs.
+- When suggesting hyperparameter changes, explain WHY briefly.
+- If you detect training anomalies (NaN loss, reward plateau), proactively suggest fixes.
+- For chart zoom requests, call the `zoom_metrics` tool with appropriate parameters.
+- Always confirm destructive actions (stopping training, changing dataset) before executing.
+- When user mentions a model or dataset by short name (e.g. "Qwen3.5-4B", "gsm8k"), \
+ALWAYS call `search_models` or `search_datasets` first to resolve the full org/name ID before using it.
+- Respond in the same language the user uses.
+"""
diff --git a/src/twinkle_client/tui/agent/tools.py b/src/twinkle_client/tui/agent/tools.py
new file mode 100644
index 000000000..5b810bc0c
--- /dev/null
+++ b/src/twinkle_client/tui/agent/tools.py
@@ -0,0 +1,1283 @@
+# Copyright (c) Twinkle Contributors. All rights reserved.
+"""Agent tool definitions for training control, search, and metrics."""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+from typing import Any, Callable
+
+from twinkle.utils.logger import get_logger
+from twinkle_client.tui.connection import LocalConnection
+
+logger = get_logger()
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Tool schemas (OpenAI function calling format)
+# ──────────────────────────────────────────────────────────────────────────────
+
+TOOL_SCHEMAS: list[dict[str, Any]] = [
+    {
+        'type': 'function',
+        'function': {
+            'name': 'list_training_runs',
+            'description': 'List all active and historical training runs.',
+            'parameters': {'type': 'object', 'properties': {}, 'required': []},
+        },
+    },
+    {
+        'type': 'function',
+        'function': {
+            'name': 'get_training_status',
+            'description': 'Get detailed status and recent metrics for a training run.',
+            'parameters': {
+                'type': 'object',
+                'properties': {
+                    'run_id': {'type': 'string', 'description': 'Training run ID.'},
+                },
+                'required': ['run_id'],
+            },
+        },
+    },
+    {
+        'type': 'function',
+        'function': {
+            'name': 'start_server',
+            'description': (
+                'Start Ray cluster and Twinkle Server. MUST be called before start_training. '
+                'Idempotent: skips if server is already reachable. '
+                'Supports multi-model deployments: one training model + N sampler/teacher models. '
+                'Automatically generates server_config.yaml from parameters.'
+            ),
+            'parameters': {
+                'type': 'object',
+                'properties': {
+                    'model_id': {
+                        'type': 'string',
+                        'description': 'Student/training model ID (e.g. "Qwen/Qwen3.5-4B").',
+                    },
+                    'train_gpus': {
+                        'type': 'integer',
+                        'description': 'GPUs for the training model. Default: auto-detect remaining GPUs.',
+                    },
+                    'backend': {
+                        'type': 'string',
+                        'enum': ['transformers', 'megatron'],
+                        'description': 'Training model backend. Default: transformers.',
+                    },
+                    'samplers': {
+                        'type': 'array',
+                        'description': (
+                            'List of sampler/teacher models for RL/OPD. Each entry deploys '
+                            'an inference service (vLLM or torch). Omit for simple SFT.'
+                        ),
+                        'items': {
+                            'type': 'object',
+                            'properties': {
+                                'model_id': {
+                                    'type': 'string',
+                                    'description': 'Teacher/reference model ID (e.g. "Qwen/Qwen3.5-72B").',
+                                },
+                                'gpus': {
+                                    'type': 'integer',
+                                    'description': 'Total number of GPUs for this sampler. Default: 1. Must equal tp * dp.',
+                                },
+                                'tp': {
+                                    'type': 'integer',
+                                    'description': (
+                                        'Tensor parallelism size (GPUs per vLLM worker process). '
+                                        'Use tp>1 for large models that do not fit on a single GPU. Default: 1.'
+                                    ),
+                                },
+                                'dp': {
+                                    'type': 'integer',
+                                    'description': (
+                                        'Data parallelism size (number of independent inference replicas). '
+                                        'If not specified, computed as gpus // tp. Default: 1.'
+                                    ),
+                                },
+                                'engine': {
+                                    'type': 'string',
+                                    'enum': ['vllm', 'torch'],
+                                    'description': 'Inference engine. Default: vllm.',
+                                },
+                                'max_model_len': {
+                                    'type': 'integer',
+                                    'description': 'Max sequence length for inference. Default: 16000.',
+                                },
+                            },
+                            'required': ['model_id'],
+                        },
+                    },
+                    'port': {
+                        'type': 'integer',
+                        'description': 'HTTP port for server. Default: 8000.',
+                    },
+                },
+                'required': ['model_id'],
+            },
+        },
+    },
+    {
+        'type': 'function',
+        'function': {
+            'name': 'shutdown_server',
+            'description': (
+                'Shut down Twinkle Server and Ray cluster. WARNING: This releases all GPU resources '
+                'and DESTROYS model state held in server memory. Only call when training is truly '
+                'finished and you no longer need the server. Model weights/optimizer state in GPU '
+                'will be LOST unless a checkpoint was explicitly saved.'
+            ),
+            'parameters': {'type': 'object', 'properties': {}, 'required': []},
+        },
+    },
+    {
+        'type': 'function',
+        'function': {
+            'name': 'start_training',
+            'description': (
+                'Create a new training run: write the client script, launch it, and start monitoring. '
+                'REQUIRES: Twinkle Server must be running (call start_server first). '
+                'The client script connects to the server — server holds model state in GPU memory. '
+                'Kill client = pause (state preserved). Re-launch client = resume.'
+            ),
+            'parameters': {
+                'type': 'object',
+                'properties': {
+                    'run_id': {'type': 'string', 'description': 'Unique run ID (e.g., "grpo-gsm8k").'},
+                    'script_content': {'type': 'string', 'description': 'Full Python source code of the training script.'},
+                    'model_id': {'type': 'string', 'description': 'Model identifier for metadata (e.g., "Qwen/Qwen3.5-4B").'},
+                },
+                'required': ['run_id', 'script_content'],
+            },
+        },
+    },
+    {
+        'type': 'function',
+        'function': {
+            'name': 'select_run',
+            'description': 'Switch the TUI to monitor a different training run. Updates metrics panel and status bar.',
+            'parameters': {
+                'type': 'object',
+                'properties': {
+                    'run_id': {'type': 'string', 'description': 'Training run ID to monitor.'},
+                },
+                'required': ['run_id'],
+            },
+        },
+    },
+    {
+        'type': 'function',
+        'function': {
+            'name': 'pause_training',
+            'description': 'Pause training by killing the client process (SIGKILL). Server retains all state — call resume_training to continue.',
+            'parameters': {
+                'type': 'object',
+                'properties': {
+                    'run_id': {'type': 'string', 'description': 'Training run ID to pause.'},
+                },
+                'required': ['run_id'],
+            },
+        },
+    },
+    {
+        'type': 'function',
+        'function': {
+            'name': 'resume_training',
+            'description': 'Resume a paused training run by re-launching the client script. Server state is preserved.',
+            'parameters': {
+                'type': 'object',
+                'properties': {
+                    'run_id': {'type': 'string', 'description': 'Training run ID to resume.'},
+                },
+                'required': ['run_id'],
+            },
+        },
+    },
+    {
+        'type': 'function',
+        'function': {
+            'name': 'stop_training',
+            'description': (
+                'Stop the client training process (SIGKILL). In server mode the model state '
+                'remains in the server GPU memory — use resume_training to continue. '
+                'This is equivalent to pause_training. To fully release GPU resources and '
+                'destroy server state, use shutdown_server instead.'
+            ),
+            'parameters': {
+                'type': 'object',
+                'properties': {
+                    'run_id': {'type': 'string', 'description': 'Training run ID to stop.'},
+                },
+                'required': ['run_id'],
+            },
+        },
+    },
+    {
+        'type': 'function',
+        'function': {
+            'name': 'update_script',
+            'description': 'Update the training script for a run. Archives the current train.py as train_v{N}.py and writes the new version. Use after diagnosing a script error, then call resume_training.',
+            'parameters': {
+                'type': 'object',
+                'properties': {
+                    'run_id': {'type': 'string', 'description': 'Training run ID.'},
+                    'script_content': {'type': 'string', 'description': 'Full Python source code of the new training script.'},
+                },
+                'required': ['run_id', 'script_content'],
+            },
+        },
+    },
+    {
+        'type': 'function',
+        'function': {
+            'name': 'list_supported_models',
+            'description': 'Query the Twinkle server for its list of supported base models. Always call this before writing a training script to verify model availability.',
+            'parameters': {
+                'type': 'object',
+                'properties': {
+                    'base_url': {
+                        'type': 'string',
+                        'description': 'Server base URL. Default: http://localhost:8000. Cloud: http://www.modelscope.cn/twinkle',
+                    },
+                },
+                'required': [],
+            },
+        },
+    },
+    {
+        'type': 'function',
+        'function': {
+            'name': 'search_datasets',
+            'description': 'Search ModelScope Hub for datasets matching a query.',
+            'parameters': {
+                'type': 'object',
+                'properties': {
+                    'query': {'type': 'string', 'description': 'Search query for datasets.'},
+                    'limit': {'type': 'integer', 'description': 'Max results (default 5).'},
+                },
+                'required': ['query'],
+            },
+        },
+    },
+    {
+        'type': 'function',
+        'function': {
+            'name': 'search_models',
+            'description': 'Search ModelScope Hub for models matching a query.',
+            'parameters': {
+                'type': 'object',
+                'properties': {
+                    'query': {'type': 'string', 'description': 'Search query for models.'},
+                    'limit': {'type': 'integer', 'description': 'Max results (default 5).'},
+                },
+                'required': ['query'],
+            },
+        },
+    },
+    {
+        'type': 'function',
+        'function': {
+            'name': 'zoom_metrics',
+            'description': 'Adjust the metrics chart view. Zoom into specific step ranges or reset to show all.',
+            'parameters': {
+                'type': 'object',
+                'properties': {
+                    'action': {
+                        'type': 'string',
+                        'enum': ['zoom', 'reset'],
+                        'description': '"zoom" to set range, "reset" to show all.',
+                    },
+                    'x_start': {'type': 'integer', 'description': 'Start step for x-axis.'},
+                    'x_end': {'type': 'integer', 'description': 'End step for x-axis.'},
+                    'y_min': {'type': 'number', 'description': 'Min value for y-axis.'},
+                    'y_max': {'type': 'number', 'description': 'Max value for y-axis.'},
+                },
+                'required': ['action'],
+            },
+        },
+    },
+    {
+        'type': 'function',
+        'function': {
+            'name': 'select_metrics',
+            'description': 'Choose which metric keys to display on the chart. The chart shows at most 4 metrics at once. Use this when the user asks to see specific metrics (e.g. "show reward-related metrics"). Pass an empty keys array to list all available metrics without changing the selection.',
+            'parameters': {
+                'type': 'object',
+                'properties': {
+                    'keys': {
+                        'type': 'array',
+                        'items': {'type': 'string'},
+                        'description': 'Metric key names to display. Match against available keys (supports partial: pick keys containing the keyword). Pass [] to query available keys only.',
+                    },
+                },
+                'required': ['keys'],
+            },
+        },
+    },
+    {
+        'type': 'function',
+        'function': {
+            'name': 'get_cluster_info',
+            'description': (
+                'Get cluster GPU resource info for planning training parallelism. '
+                'First attempts to query a running Ray cluster; if Ray is not available, '
+                'falls back to nvidia-smi for local GPU discovery. '
+                'The result indicates whether Ray is active — if not, the training script '
+                'should either start a local Ray cluster itself or the user should launch '
+                'Ray manually (see server mode run.sh).'
+            ),
+            'parameters': {'type': 'object', 'properties': {}, 'required': []},
+        },
+    },
+]
+
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Tool executor
+# ──────────────────────────────────────────────────────────────────────────────
+
+
+class ToolExecutor:
+    """Executes agent tool calls against the local connection."""
+
+    def __init__(self, connection: LocalConnection):
+        self._server_url: str | None = None  # Set after successful start_server
+        self.on_run_selected: Callable[[str], None] | None = None  # called with the run id that becomes active
+        self.select_metrics_callback: Callable[[list[str]], dict] | None = None  # hook used by select_metrics
+        self.metrics_callback: Callable | None = None  # hook used by zoom_metrics
+        self.connection = connection  # backend that the training tools delegate to
+
+    async def execute(self, name: str, arguments: dict[str, Any]) -> str:
+        """Dispatch tool ``name`` to ``_tool_<name>`` and return its result (or an ``error`` object) as a JSON string."""
+        handler = getattr(self, f'_tool_{name}', None)
+        if handler is None:
+            logger.warning(f'Unknown tool called: {name}')
+            return json.dumps({'error': f'Unknown tool: {name}'}, ensure_ascii=False)
+        try:
+            result = await handler(**(arguments or {}))  # a None/empty payload means "no arguments"
+            return json.dumps(result, ensure_ascii=False, default=str)
+        except Exception as e:
+            logger.error(f'Tool {name} raised exception: {type(e).__name__}: {e}', exc_info=True)
+            return json.dumps({'error': f'{name} failed: {e}'}, ensure_ascii=False)  # same encoding as success path
+
+    # ── Training lifecycle ──
+
+    def _resolve_server_url(self) -> str:
+        """Pick the server URL: remembered instance URL first, then $TWINKLE_SERVER_URL, then the localhost default."""
+        if self._server_url:
+            return self._server_url
+        env_url = os.environ.get('TWINKLE_SERVER_URL')
+        # An empty environment value counts as unset and falls through to the default.
+        return env_url or 'http://localhost:8000'
+
+    async def _tool_list_training_runs(self) -> list[dict]:  # reached via execute('list_training_runs', ...)
+        return self.connection.list_training_runs()  # pass-through: the connection's result is returned unchanged
+
+    async def _tool_get_training_status(self, run_id: str) -> dict:
+        """Summarize a run: status and model id from its metadata plus metrics fetched with ``last_n=10``."""
+        recent = self.connection.get_metrics(run_id, last_n=10)
+        run_meta = self.connection.get_meta(run_id) or {}  # missing metadata is treated as an empty dict
+        return dict(run_id=run_id, state=run_meta.get('status', 'unknown'), model_id=run_meta.get('model_id'), recent_metrics=recent)
+
+    async def _tool_start_training(self, run_id: str, script_content: str, model_id: str = '') -> dict:
+        """Start a run through the connection, refusing up front when the Twinkle Server health check fails."""
+        server_url = self._resolve_server_url()
+        if await self._check_server_health(server_url):
+            outcome = self.connection.start_training(run_id, script_content, model_id)
+            effective_run_id = outcome.get('run_id', run_id)  # the connection may report a different id
+            if self.on_run_selected:
+                self.on_run_selected(effective_run_id)
+            return outcome
+        return {
+            'status': 'error',
+            'run_id': run_id,
+            'error': (
+                f'Twinkle Server is not reachable at {server_url}. '
+                'Call start_server first to launch Ray cluster and Twinkle Server.'
+            ),
+        }
+
+    async def _tool_select_run(self, run_id: str) -> dict:
+        """Mark ``run_id`` as the connection's current run and notify ``on_run_selected`` when one is set."""
+        self.connection.current_run_id = run_id
+        (self.on_run_selected or (lambda _run_id: None))(run_id)  # no-op when no listener is registered
+        return dict(run_id=run_id, status='selected')
+
+    async def _tool_pause_training(self, run_id: str) -> dict:  # reached via execute('pause_training', ...)
+        return self.connection.pause_training(run_id)  # pass-through: the connection's result is returned unchanged
+
+    async def _tool_resume_training(self, run_id: str) -> dict:  # reached via execute('resume_training', ...)
+        return self.connection.resume_training(run_id)  # pass-through: the connection's result is returned unchanged
+
+    async def _tool_stop_training(self, run_id: str) -> dict:
+        """Stop a run by delegating to ``connection.stop_training``; its result is returned unchanged."""
+        # Intended behaviour: SIGTERM for graceful shutdown (a registered handler saves a checkpoint); the server keeps model state in GPU memory, so resume_training can continue.
+        return self.connection.stop_training(run_id)
+
+    async def _tool_update_script(self, run_id: str, script_content: str) -> dict:  # reached via execute('update_script', ...)
+        return self.connection.update_script(run_id, script_content)  # pass-through: the connection's result is returned unchanged
+
+    # ── Server lifecycle ──
+
+    async def _check_server_health(self, url: str) -> bool:
+        """Return True if ``{url}/api/v1/healthz`` or, failing that, ``url`` itself answers (3s timeout each, run off-loop)."""
+        import http.client
+        import urllib.error
+        import urllib.request
+
+        def _probe() -> bool:
+            # First the health endpoint, then the bare URL as a simpler connectivity check.
+            for target in (f'{url}/api/v1/healthz', url):
+                try:
+                    # Closing the response keeps repeated probes from leaking sockets.
+                    with urllib.request.urlopen(urllib.request.Request(target, method='GET'), timeout=3):
+                        return True
+                except (urllib.error.URLError, http.client.HTTPException, OSError, ValueError):
+                    # Refused/timed-out connections, HTTP error statuses, protocol errors and malformed URLs are misses.
+                    continue
+            return False
+
+        return await asyncio.get_event_loop().run_in_executor(None, _probe)
+
+    # ── Server startup pipeline ──
+
+    async def _tool_start_server(
+        self,
+        model_id: str,
+        train_gpus: int | None = None,
+        port: int = 8000,
+        backend: str = 'transformers',
+        samplers: list[dict] | None = None,
+    ) -> dict:
+        """Start the Ray cluster and a local Twinkle Server (idempotent, multi-model); returns a status dict."""
+        known_url = self._server_url or os.environ.get('TWINKLE_SERVER_URL') or f'http://localhost:{port}'
+
+        # Idempotent: skip if a server already answers at the remembered / env / default URL.
+        if await self._check_server_health(known_url):
+            self._server_url = known_url
+            return {'status': 'already_running', 'server_url': known_url}
+
+        # The new server listens on `port` of this machine; poll there, not at a stale remembered URL.
+        server_url = f'http://localhost:{port}'
+
+        def _start():
+            sampler_list = samplers or []
+
+            # Step 1: Detect hardware & compute GPU partition
+            total_hw_gpus = self._detect_gpu_count()
+            if total_hw_gpus == 0:
+                return {'status': 'error', 'error': 'No GPUs detected. Cannot start training server.'}
+            alloc = self._compute_gpu_allocation(sampler_list, train_gpus, total_hw_gpus)
+            if 'error' in alloc:
+                return {'status': 'error', 'error': alloc['error']}
+            t_gpus, sampler_gpu_total = alloc['train_gpus'], alloc['sampler_gpus']
+
+            # Step 2: Generate server_config.yaml
+            config_path = self._generate_server_config(
+                model_id=model_id, train_gpus=t_gpus, port=port, backend=backend, samplers=sampler_list,
+            )
+
+            # Step 3: Start Ray cluster (multi-node GPU partitioning)
+            ray_err = self._start_ray_cluster(t_gpus, sampler_gpu_total)
+            if ray_err:
+                return {'status': 'error', 'error': ray_err}
+
+            # Step 4: Launch Twinkle Server process
+            proc, log_path, err = self._launch_server_process(config_path)
+            if err:
+                return {'status': 'error', 'error': err}
+
+            # Step 5: Wait for readiness (healthz + sampler engine)
+            return self._wait_server_ready(
+                server_url=server_url, proc=proc, log_path=log_path, sampler_list=sampler_list,
+                model_id=model_id, t_gpus=t_gpus, backend=backend, config_path=config_path,
+            )
+
+        result = await asyncio.get_event_loop().run_in_executor(None, _start)
+        if result.get('status') in ('started', 'already_running'):
+            self._server_url = server_url
+        return result
+
+    # ── Server startup helpers ──
+
+    @staticmethod
+    def _detect_gpu_count() -> int:
+        """Count GPUs by listing their indices with nvidia-smi; 0 when it is missing, fails, or hangs past 10s."""
+        import subprocess as _sp
+        try:
+            r = _sp.run(
+                ['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'],
+                capture_output=True, text=True, timeout=10,
+            )
+            if r.returncode == 0:
+                return sum(1 for ln in r.stdout.splitlines() if ln.strip())  # one non-blank line per GPU
+        except (OSError, _sp.SubprocessError):  # SubprocessError covers TimeoutExpired, which is not an OSError
+            pass
+        return 0
+
+    @staticmethod
+    def _compute_gpu_allocation(
+        sampler_list: list[dict],
+        train_gpus: int | None,
+        total_hw_gpus: int,
+    ) -> dict:
+        """Compute GPU partition: {train_gpus, sampler_gpus}, or {error} for an invalid or oversized request."""
+        if train_gpus is not None and train_gpus < 1:
+            return {'error': f'train_gpus must be at least 1, got {train_gpus}.'}  # else: a GPU-less training node
+        sampler_gpu_total = 0
+        for s in sampler_list:
+            s_tp, s_dp, s_gpus = s.get('tp', 1), s.get('dp'), s.get('gpus')
+            if s_gpus is not None:
+                sampler_gpu_total += s_gpus  # an explicit total wins over tp/dp
+            elif s_dp is not None:
+                sampler_gpu_total += s_tp * s_dp
+            else:
+                sampler_gpu_total += s_tp  # default dp=1
+
+        # Without an explicit request, training takes whatever the samplers leave (at least 1 GPU).
+        t_gpus = train_gpus if train_gpus is not None else max(1, total_hw_gpus - sampler_gpu_total)
+        needed = t_gpus + sampler_gpu_total
+        if needed > total_hw_gpus:
+            return {'error': (
+                f'Requested {needed} GPUs (train={t_gpus}, samplers={sampler_gpu_total}) '
+                f'but only {total_hw_gpus} available.'
+            )}
+        return {'train_gpus': t_gpus, 'sampler_gpus': sampler_gpu_total}
+
+    @staticmethod
+    def _start_ray_cluster(train_gpus: int, sampler_gpus: int) -> str | None:
+        """Start Ray multi-node cluster with GPU partitioning.
+
+        Each role gets its own Ray node with dedicated CUDA_VISIBLE_DEVICES so GPUs are
+        indexed from 0 within each node, which prevents the GPU ID mapping issues of a
+        single-node setup. On a single machine, multiple raylets need separate --temp-dir
+        to avoid being detected as "already running".
+
+        Returns an error message on failure (including a missing or hung `ray` CLI), or None on success.
+        """
+        import subprocess as _sp
+        import tempfile
+        from pathlib import Path
+
+        try:
+            _sp.run(['ray', 'stop', '--force'], capture_output=True, timeout=15)  # exit status ignored on purpose
+        except (OSError, _sp.SubprocessError) as e:  # `ray` not installed, or the stop timed out
+            return f'Ray stop failed: {e}'
+
+        # Create unique temp dirs so each `ray start` spawns a separate raylet
+        ray_base = Path(tempfile.gettempdir()) / 'twinkle_ray'
+        ray_base.mkdir(parents=True, exist_ok=True)
+
+        def _ray_node(devices: str, num_gpus: int, *, head: bool = False, node_name: str = 'worker') -> str | None:
+            env = os.environ.copy()
+            env['CUDA_VISIBLE_DEVICES'] = devices
+            cmd = ['ray', 'start', f'--temp-dir={ray_base / node_name}']
+            if head:
+                cmd += ['--head', '--port=6379', '--disable-usage-stats', '--include-dashboard=false']
+            else:
+                cmd += ['--address=127.0.0.1:6379']
+            cmd.append(f'--num-gpus={num_gpus}')
+            try:
+                r = _sp.run(cmd, capture_output=True, text=True, timeout=30, env=env)
+            except (OSError, _sp.SubprocessError) as e:
+                return str(e) or type(e).__name__
+            if r.returncode != 0 and 'already' not in r.stderr.lower():
+                return r.stderr.strip() or f'exit code {r.returncode}'  # never report a failure as ''
+            return None
+
+        # Head node — training model GPUs
+        model_devices = ','.join(str(i) for i in range(train_gpus))
+        err = _ray_node(model_devices, train_gpus, head=True, node_name='head')
+        if err:
+            return f'Ray head start failed: {err}'
+
+        # GPU Worker node — sampler GPUs
+        if sampler_gpus > 0:
+            sampler_devices = ','.join(str(i) for i in range(train_gpus, train_gpus + sampler_gpus))
+            err = _ray_node(sampler_devices, sampler_gpus, node_name='gpu_worker')
+            if err:
+                return f'Ray GPU worker start failed: {err}'
+
+        # CPU Worker node — processor (no GPU); best effort, its result is deliberately ignored
+        _ray_node('', 0, node_name='cpu_worker')
+        return None
+
+    @staticmethod
+    def _launch_server_process(config_path: str) -> tuple:
+        """Launch Twinkle Server as a detached background process, logging to ~/.cache/twinkle/server.log.
+
+        Returns (proc, log_path, error). On success error is None; a failed log open or spawn gives proc=None.
+        """
+        import subprocess as _sp
+        from pathlib import Path
+
+        log_dir = Path.home() / '.cache' / 'twinkle'
+        log_dir.mkdir(parents=True, exist_ok=True)
+        log_path = str(log_dir / 'server.log')
+
+        cmd = ['python', '-m', 'twinkle.server', 'launch', '--config', config_path]
+        try:
+            # The child gets its own copy of the log descriptor, so the parent's handle is closed
+            # as soon as the spawn attempt ends instead of staying open for the process lifetime.
+            with open(log_path, 'w') as log_file:
+                proc = _sp.Popen(
+                    cmd, stdout=log_file, stderr=_sp.STDOUT, start_new_session=True,
+                )
+        except OSError as e:
+            return None, log_path, f'Failed to start Twinkle server: {e}'
+        return proc, log_path, None
+
+    @staticmethod
+    def _wait_server_ready(
+        server_url: str,
+        proc,
+        log_path: str,
+        sampler_list: list[dict],
+        model_id: str,
+        t_gpus: int,
+        backend: str,
+        config_path: str,
+    ) -> dict:
+        """Poll healthz once per second, 60 tries (120 with samplers); returns a started / error / timeout status dict."""
+        import time
+        import urllib.request
+        import urllib.error
+
+        timeout_s = 120 if sampler_list else 60
+        needed = t_gpus + sum(
+            s.get('gpus') or ((s.get('tp') or 1) * (s.get('dp') or 1)) for s in sampler_list  # None tp/dp counts as 1
+        )
+
+        for _ in range(timeout_s):
+            time.sleep(1)
+            if proc.poll() is not None:
+                return {
+                    'status': 'error',
+                    'error': f'Server exited (code={proc.returncode}). '
+                             f'Model: {model_id}, GPUs: {t_gpus}, Samplers: {len(sampler_list)}.',
+                    'log_path': log_path,
+                }
+            try:
+                urllib.request.urlopen(f'{server_url}/api/v1/healthz', timeout=2).close()  # release the socket
+            except Exception:  # not reachable / not healthy yet (OSError is already an Exception)
+                continue
+
+            # healthz OK — additionally wait for sampler vLLM engines
+            if sampler_list and not ToolExecutor._probe_sampler_ready(server_url, sampler_list, model_id):
+                return {
+                    'status': 'started',
+                    'warning': 'Server is up but sampler may still be loading.',
+                    'server_url': server_url, 'server_pid': proc.pid,
+                    'model_id': model_id, 'log_path': log_path,
+                }
+
+            return {
+                'status': 'started',
+                'server_url': server_url, 'server_pid': proc.pid,
+                'model_id': model_id, 'train_gpus': t_gpus,
+                'backend': backend,
+                'samplers': [s.get('model_id') for s in sampler_list],
+                'total_gpus_used': needed,
+                'config_path': config_path, 'log_path': log_path,
+            }
+
+        return {
+            'status': 'timeout',
+            'error': 'Health check did not pass within timeout. Models may still be loading.',
+            'server_pid': proc.pid, 'log_path': log_path,
+        }
+
+    @staticmethod
+    def _probe_sampler_ready(server_url: str, sampler_list: list[dict], fallback_model_id: str) -> bool:
+        """POST to the first sampler's route up to 90 times (5s request timeout, 1s pause after a failure); False if never ready."""
+        import time
+        import urllib.request
+        import urllib.error
+
+        s_mid = sampler_list[0].get('model_id', fallback_model_id)  # caller guarantees a non-empty list
+        probe_url = f'{server_url}/api/v1/sampler/{s_mid}/twinkle/create'
+
+        for _ in range(90):
+            try:
+                req = urllib.request.Request(
+                    probe_url, method='POST', data=b'{}',
+                    headers={'Content-Type': 'application/json'},
+                )
+                urllib.request.urlopen(req, timeout=5).close()  # only reachability matters; release the socket
+                return True  # non-error response = ready
+            except urllib.error.HTTPError as e:
+                if e.code < 500:
+                    return True  # 4xx = actor alive, just bad request
+                time.sleep(1)  # 5xx = still loading
+            except Exception:  # connection errors etc. (OSError is already an Exception)
+                time.sleep(1)
+        return False
+
+    @staticmethod
+    def _generate_server_config(
+        model_id: str,
+        train_gpus: int,
+        port: int = 8000,
+        backend: str = 'transformers',
+        samplers: list[dict] | None = None,
+    ) -> str:
+        """Build ~/.cache/twinkle/server_config.yaml and return its path.
+
+        Supports multi-model topology:
+          - 1 training model (student)
+          - N sampler/teacher models (for RL/OPD); each entry must carry 'model_id'
+          - 1 processor service
+        """
+        from pathlib import Path
+        import yaml
+
+        sampler_list = samplers or []
+
+        # Sanitize model name for use in route/names
+        def _short(mid: str) -> str:
+            return mid.split('/')[-1] if '/' in mid else mid
+
+        model_short = _short(model_id)
+
+        # Collect all model IDs for supported_models
+        all_model_ids = [model_id] + [s['model_id'] for s in sampler_list]
+
+        # === Build applications list ===
+        applications = []
+
+        # 1. API Gateway
+        applications.append({
+            'name': 'server',
+            'route_prefix': '/api/v1',
+            'import_path': 'server',
+            'args': {
+                'server_config': {'per_token_model_limit': 3},
+                'supported_models': all_model_ids,
+            },
+            'deployments': [{
+                'name': 'TinkerCompatServer',
+                'max_ongoing_requests': 50,
+                'autoscaling_config': {
+                    'min_replicas': 1,
+                    'max_replicas': 1,
+                    'target_ongoing_requests': 128,
+                },
+                'ray_actor_options': {'num_cpus': 0.1},
+            }],
+        })
+
+        # 2. Build GPU-requiring applications (model + samplers),
+        #    then sort by GPU count DESCENDING before appending.
+        #    Largest PG deploys first → it has the fewest node choices →
+        #    avoids GPU scheduling deadlock on single-machine multi-node.
+        gpu_apps: list[tuple[int, dict]] = []  # (gpu_count, app_config)
+
+        # 2a. Training model worker (student)
+        gpu_apps.append((train_gpus, {
+            'name': f'models-{model_short}',
+            'route_prefix': f'/api/v1/model/{model_id}',
+            'import_path': 'model',
+            'args': {
+                'backend': backend,
+                'model_id': f'ms://{model_id}',
+                'max_length': 500000,  # total tokens per forward pass (must match max_input_tokens)
+                'nproc_per_node': train_gpus,
+                'device_group': {
+                    'name': 'model',
+                    'ranks': train_gpus,
+                    'device_type': 'cuda',
+                },
+                'device_mesh': {
+                    'device_type': 'cuda',
+                    'dp_size': train_gpus,
+                },
+                'queue_config': {
+                    'rps_limit': 100,
+                    'tps_limit': 100000,
+                    'max_input_tokens': 500000,
+                },
+                'adapter_config': {
+                    'adapter_timeout': 600,
+                },
+            },
+            'deployments': [{
+                'name': 'ModelManagement',
+                'autoscaling_config': {
+                    'min_replicas': 1,
+                    'max_replicas': 1,
+                    'target_ongoing_requests': 16,
+                },
+                'ray_actor_options': {
+                    'num_cpus': 0.1,
+                    'runtime_env': {
+                        'env_vars': {'TWINKLE_TRUST_REMOTE_CODE': '1'},
+                    },
+                },
+            }],
+        }))
+
+        # 2b. Sampler/teacher models
+        sampler_name_count: dict[str, int] = {}
+        for sampler_cfg in sampler_list:
+            s_model_id = sampler_cfg['model_id']
+            s_short = _short(s_model_id)
+
+            # Deduplicate names when multiple samplers share the same short name
+            sampler_name_count[s_short] = sampler_name_count.get(s_short, 0) + 1
+            if sampler_name_count[s_short] > 1:
+                s_name = f'sampler-{s_short}-{sampler_name_count[s_short]}'
+            else:
+                s_name = f'sampler-{s_short}'
+
+            s_engine = sampler_cfg.get('engine', 'vllm')
+            s_max_len = sampler_cfg.get('max_model_len', 16000)
+
+            # Compute tp / dp / total GPUs:
+            #   tp = tensor parallelism (GPUs per vLLM process, for large models)
+            #   dp = data parallelism (number of independent inference replicas)
+            #   total GPUs = tp * dp
+            s_tp = max(1, sampler_cfg.get('tp') or 1)  # None / 0 / negative tp falls back to 1
+            s_dp = sampler_cfg.get('dp', None)
+            s_gpus = sampler_cfg.get('gpus', None)
+
+            if s_dp is not None and s_gpus is not None:
+                # Both specified: derive tp = gpus // dp unless tp was given explicitly;
+                # clamp so gpus < dp (or dp <= 0) cannot yield a 0-GPU worker or divide by zero.
+                s_dp = max(1, s_dp)
+                s_tp = max(1, s_gpus // s_dp) if s_tp == 1 else s_tp
+            elif s_gpus is not None:
+                # Only total GPUs specified: derive dp
+                s_dp = max(1, s_gpus // s_tp)
+            elif s_dp is not None:
+                s_dp = max(1, s_dp)  # only dp specified
+            else:
+                s_dp = 1  # nothing specified: tp (default 1) x dp=1
+
+            # NOTE(review): tp * dp can exceed an explicit `gpus`; _compute_gpu_allocation budgets by `gpus`.
+            s_total_gpus = s_tp * s_dp
+
+            # Build device_mesh: include tp_size when tp>1 so that
+            # world_size = tp*dp and slice_dp dispatch computes correct
+            # rank_stride for DP data sharding.
+            mesh_config: dict = {'device_type': 'cuda', 'dp_size': s_dp}
+            if s_tp > 1:
+                mesh_config['tp_size'] = s_tp
+
+            sampler_app: dict = {
+                'name': s_name,
+                'route_prefix': f'/api/v1/sampler/{s_model_id}',
+                'import_path': 'sampler',
+                'args': {
+                    'model_id': f'ms://{s_model_id}',
+                    'nproc_per_node': s_total_gpus,
+                    'sampler_type': s_engine,
+                    'device_group': {
+                        'name': s_name,
+                        'ranks': s_total_gpus,
+                        'device_type': 'cuda',
+                        'gpus_per_worker': s_tp,
+                    },
+                    'device_mesh': mesh_config,
+                    'queue_config': {
+                        'rps_limit': 100,
+                        'tps_limit': 100000,
+                    },
+                },
+                'deployments': [{
+                    'name': 'SamplerManagement',
+                    'autoscaling_config': {
+                        'min_replicas': 1,
+                        'max_replicas': 1,
+                        'target_ongoing_requests': 16,
+                    },
+                    'ray_actor_options': {
+                        'num_cpus': 0.1,
+                        'runtime_env': {
+                            'env_vars': {'TWINKLE_TRUST_REMOTE_CODE': '1'},
+                        },
+                    },
+                }],
+            }
+
+            # Add engine-specific args
+            if s_engine == 'vllm':
+                engine_args = {
+                    'max_model_len': s_max_len,
+                    'gpu_memory_utilization': 0.85,
+                    'enable_lora': True,
+                    'logprobs_mode': 'processed_logprobs',
+                }
+                # Set tensor_parallel_size when tp > 1
+                if s_tp > 1:
+                    engine_args['tensor_parallel_size'] = s_tp
+                sampler_app['args']['engine_args'] = engine_args
+
+            gpu_apps.append((s_total_gpus, sampler_app))
+
+        # 3. Sort GPU apps by GPU count DESCENDING, then append in order.
+        #    Largest PG deploys first → claims the largest node → avoids deadlock.
+        gpu_apps.sort(key=lambda x: x[0], reverse=True)
+        for _, app_cfg in gpu_apps:
+            applications.append(app_cfg)
+
+        # 4. Processor service
+        applications.append({
+            'name': 'processor',
+            'route_prefix': '/api/v1/processor',
+            'import_path': 'processor',
+            'args': {
+                'ncpu_proc_per_node': 2,
+                'device_group': {
+                    'name': 'processor',
+                    'ranks': 2,
+                    'device_type': 'CPU',
+                },
+                'device_mesh': {
+                    'device_type': 'CPU',
+                    'dp_size': 2,
+                },
+            },
+            'deployments': [{
+                'name': 'ProcessorManagement',
+                'autoscaling_config': {
+                    'min_replicas': 1,
+                    'max_replicas': 1,
+                    'target_ongoing_requests': 128,
+                },
+                'ray_actor_options': {'num_cpus': 0.1},
+            }],
+        })
+
+        # === Assemble final config ===
+        config = {
+            'proxy_location': 'EveryNode',
+            'http_options': {
+                'host': '0.0.0.0',
+                'port': port,
+            },
+            'applications': applications,
+        }
+
+        # Write to ~/.cache/twinkle/server_config.yaml
+        config_dir = Path.home() / '.cache' / 'twinkle'
+        config_dir.mkdir(parents=True, exist_ok=True)
+        config_path = config_dir / 'server_config.yaml'
+        with open(config_path, 'w') as f:
+            yaml.dump(config, f, default_flow_style=False, allow_unicode=True)
+
+        return str(config_path)
+
+    async def _tool_shutdown_server(self) -> dict:
+        """Shut down Twinkle Server and Ray cluster (every step is best effort). DESTROYS GPU model state."""
+        import subprocess as _sp
+
+        def _shutdown():
+            results = {}
+
+            # 1. Try `serve shutdown` to cleanly stop Ray Serve deployments
+            try:
+                r = _sp.run(['serve', 'shutdown', '-y'], capture_output=True, text=True, timeout=30)
+                results['serve_shutdown'] = 'ok' if r.returncode == 0 else r.stderr.strip()
+            except (OSError, _sp.SubprocessError) as e:  # missing CLI or a timeout must not stop the later steps
+                results['serve_shutdown'] = f'skipped: {e}'
+
+            # 2. Kill any remaining twinkle.server processes
+            try:
+                _sp.run(['pkill', '-f', 'twinkle.server'], capture_output=True, timeout=5)
+            except (OSError, _sp.SubprocessError):  # best effort: pkill may be absent or time out
+                pass
+
+            # 3. Stop Ray cluster
+            try:
+                r = _sp.run(['ray', 'stop', '--force'], capture_output=True, text=True, timeout=15)
+                results['ray_stop'] = 'ok' if r.returncode == 0 else r.stderr.strip()
+            except (OSError, _sp.SubprocessError) as e:
+                results['ray_stop'] = f'failed: {e}'
+
+            results['status'] = 'shutdown_complete'
+            results['warning'] = 'All GPU model state has been released.'
+            return results
+
+        return await asyncio.get_event_loop().run_in_executor(None, _shutdown)
+
+    # ── Server queries ──
+
+    async def _tool_list_supported_models(self, base_url: str | None = None) -> dict:
+        """Ask the Twinkle server (``base_url`` or the resolved URL) for its supported models; errors become {'error': ...}."""
+        url = base_url or self._resolve_server_url()
+
+        def _query():
+            # Use a lightweight HTTP GET instead of init_twinkle_client() which
+            # creates a session + heartbeat thread that would leak since we never
+            # call close().
+            import urllib.request
+            import urllib.error
+
+            endpoint = f'{url}/api/v1/twinkle/get_server_capabilities'
+            req = urllib.request.Request(endpoint, method='GET')
+            with urllib.request.urlopen(req, timeout=10) as resp:  # close the connection once the body is read
+                data = json.loads(resp.read().decode())
+            models = data.get('supported_models') or []  # tolerate a missing or null list
+            # Each model entry may be a dict with 'model_name' or a plain string
+            model_names = []
+            for m in models:
+                if isinstance(m, dict):
+                    model_names.append(m.get('model_name', ''))
+                else:
+                    model_names.append(str(m))
+            return {
+                'base_url': url,
+                'supported_models': model_names,
+            }
+
+        try:
+            return await asyncio.get_event_loop().run_in_executor(None, _query)
+        except Exception as e:
+            return {'error': f'Failed to query {url}: {e}'}
+
+    async def _tool_search_datasets(self, query: str, limit: int = 5) -> dict:
+        """Tool entry point for ModelScope dataset search; the work is done by ``_search_hub``."""
+        return await self._search_hub(resource_type='datasets', query=query, limit=limit)
+
+    async def _tool_search_models(self, query: str, limit: int = 5) -> dict:
+        """Tool entry point for ModelScope model search; the work is done by ``_search_hub``."""
+        return await self._search_hub(resource_type='models', query=query, limit=limit)
+
+    async def _search_hub(self, resource_type: str, query: str, limit: int) -> dict:
+        """Run a ModelScope search off the event loop: 'datasets' picks the dataset backend, anything else models."""
+        # Choose the blocking implementation now; it only runs inside the executor.
+        if resource_type == 'datasets':
+            search_impl = self._search_datasets_impl
+        else:
+            search_impl = self._search_models_impl
+
+        try:
+            loop = asyncio.get_event_loop()
+            found = await loop.run_in_executor(None, search_impl, query, limit)
+        except Exception as exc:
+            return {'error': f'{resource_type.title()} search failed: {exc}'}
+        return {'query': query, 'results': found}
+
+    @staticmethod
+    def _search_datasets_impl(query: str, limit: int) -> list[dict]:
+        """Query ModelScope's ``HubApi.list_datasets`` and reduce every hit to an ``{id, name}`` dict."""
+        from modelscope.hub.api import HubApi
+        response = HubApi().list_datasets('', search=query, page_size=limit)
+        hits = []
+        for entry in response.get('datasets', []):
+            dataset_id = entry.get('id', '')
+            # Entries without a display name are listed under their id.
+            hits.append({'id': dataset_id, 'name': entry.get('display_name', dataset_id)})
+        return hits
+
+    @staticmethod
+    def _search_models_impl(query: str, limit: int) -> list[dict]:
+        """Search models via the ModelScope HTTP API and return ``{id, name}`` dicts; raises on HTTP or API failure."""
+        import requests
+        resp = requests.put(
+            'https://modelscope.cn/api/v1/models/',
+            json={'Name': query, 'PageSize': limit, 'PageNumber': 1},
+            timeout=15,
+        )
+        resp.raise_for_status()
+        data = resp.json()
+        if not data.get('Success'):
+            raise RuntimeError(data.get('Message', 'Unknown error'))
+        models = (data.get('Data') or {}).get('Models') or []  # a null Data / Models means "no results"
+        return [
+            {
+                'id': '/'.join(part for part in (m.get('Path'), m.get('Name')) if part),  # no stray '/' if Path is absent
+                'name': m.get('ChineseName') or m.get('Name', ''),
+            }
+            for m in models
+        ]
+
+    # ── Metrics chart ──
+
+    async def _tool_zoom_metrics(
+        self,
+        action: str,
+        x_start: int | None = None,
+        x_end: int | None = None,
+        y_min: float | None = None,
+        y_max: float | None = None,
+    ) -> dict:
+        """Forward a reset/zoom request to the chart callback; any action other than 'reset' is treated as a zoom."""
+        if not self.metrics_callback:
+            return {'error': 'Metrics panel not available'}  # same reply select_metrics gives without a panel
+        zoom_range = {} if action == 'reset' else dict(x_start=x_start, x_end=x_end, y_min=y_min, y_max=y_max)
+        self.metrics_callback('reset' if action == 'reset' else 'zoom', **zoom_range)
+        return {'action': action, 'status': 'applied'}
+
+    async def _tool_select_metrics(self, keys: list[str]) -> dict:
+        """Forward ``keys`` to the select-metrics callback, or report that it is missing."""
+        if not self.select_metrics_callback:
+            return {'error': 'Metrics panel not available'}
+        return self.select_metrics_callback(keys)
+
+    # ── Cluster info ──
+
+    async def _tool_get_cluster_info(self) -> dict:
+        """Query cluster resources: try Ray first, fall back to nvidia-smi."""
+
+        def _query():
+            # 1. Try connecting to an existing Ray cluster
+            ray_info = self._try_ray_cluster()
+            if ray_info is not None:
+                ray_info['ray_active'] = True
+                return ray_info
+            # 2. Ray not available — fall back to nvidia-smi
+            nvidia_info = self._try_nvidia_smi()
+            nvidia_info['ray_active'] = False
+            nvidia_info['hint'] = (
+                'Ray cluster is not running. To use distributed training, '
+                'start Ray first: `ray start --head --num-gpus=N` or use '
+                'the server mode run.sh script.'
+            )
+            return nvidia_info
+
+        # Both probes block, so they run in the default executor instead of on the loop.
+        return await asyncio.get_running_loop().run_in_executor(None, _query)
+
+    @staticmethod
+    def _suppress_fds():
+        """Redirect stdout/stderr to os.devnull at both the OS-fd and Python level.
+
+        Ray's C++ runtime writes directly to fd 1/2, bypassing Python's
+        sys.stdout/stderr. We must redirect at the OS level to prevent
+        corrupting Textual's alt-screen buffer. Pass the result to _restore_fds().
+        """
+        import sys
+        _devnull_fd = os.open(os.devnull, os.O_WRONLY)
+        _saved_stdout_fd = os.dup(1)  # duplicates of the real fds, needed to restore them
+        _saved_stderr_fd = os.dup(2)
+        os.dup2(_devnull_fd, 1)  # from here on fd 1 and fd 2 point at os.devnull
+        os.dup2(_devnull_fd, 2)
+        # Also redirect Python-level streams
+        _old_stdout, _old_stderr = sys.stdout, sys.stderr
+        sys.stdout = open(os.devnull, 'w')
+        sys.stderr = open(os.devnull, 'w')
+        return _devnull_fd, _saved_stdout_fd, _saved_stderr_fd, _old_stdout, _old_stderr
+
+    @staticmethod
+    def _restore_fds(state):
+        """Undo _suppress_fds(); ``state`` is the 5-tuple that call returned."""
+        import sys
+        _devnull_fd, _saved_stdout_fd, _saved_stderr_fd, _old_stdout, _old_stderr = state
+        # Restore Python-level (the current sys.stdout/sys.stderr objects are closed first)
+        sys.stdout.close()
+        sys.stderr.close()
+        sys.stdout = _old_stdout
+        sys.stderr = _old_stderr
+        # Restore OS-level: point fd 1/2 back at the saved duplicates, then release them
+        os.dup2(_saved_stdout_fd, 1)
+        os.dup2(_saved_stderr_fd, 2)
+        os.close(_saved_stdout_fd)
+        os.close(_saved_stderr_fd)
+        os.close(_devnull_fd)
+
+    @staticmethod
+    def _try_ray_cluster() -> dict | None:
+        """Probe an already-running Ray cluster for its resources.
+
+        Returns a summary dict (node, GPU, CPU and memory figures), or None when
+        Ray is not installed or any step of the query raises.
+        """
+        try:
+            import ray
+        except ImportError:
+            return None
+
+        import logging as _logging
+
+        try:
+            if not ray.is_initialized():
+                # Connect to an existing cluster only, with a short (5s) timeout.
+                # fd 1/2 are silenced around ray.init() because Ray's C++ runtime
+                # writes to them directly, which corrupts Textual's alt-screen
+                # buffer (UI glitches such as duplicated Input widgets).
+                fd_state = ToolExecutor._suppress_fds()
+                try:
+                    ray.init(
+                        address='auto',
+                        ignore_reinit_error=True,
+                        _timeout_s=5,
+                        logging_level=_logging.ERROR,
+                        configure_logging=False,
+                    )
+                finally:
+                    ToolExecutor._restore_fds(fd_state)
+
+            total = ray.cluster_resources()
+            free = ray.available_resources()
+            node_list = ray.nodes()
+            # GPU models show up as 'accelerator_type:<name>' keys in node resources.
+            accelerators = {
+                key.split(':', 1)[1]
+                for node in node_list
+                for key in node.get('Resources', {})
+                if key.startswith('accelerator_type:')
+            }
+            return {
+                'num_nodes': sum(1 for node in node_list if node.get('Alive')),
+                'gpu_total': int(total.get('GPU', 0)),
+                'gpu_available': int(free.get('GPU', 0)),
+                'gpu_types': sorted(accelerators) if accelerators else ['unknown'],
+                'cpu_total': total.get('CPU', 0),
+                'memory_bytes': total.get('memory', 0),
+            }
+        except Exception:
+            # Query failed (e.g. no cluster running): disconnect if a session exists.
+            try:
+                if ray.is_initialized():
+                    ray.shutdown()
+            except Exception:
+                pass
+            return None
+
+    @staticmethod
+    def _try_nvidia_smi() -> dict:
+        """Summarise local GPUs from one ``nvidia-smi --query-gpu`` CSV call."""
+        import subprocess as _sp
+
+        command = [
+            'nvidia-smi', '--query-gpu=index,name,memory.total,memory.free,utilization.gpu',
+            '--format=csv,noheader,nounits',
+        ]
+        try:
+            proc = _sp.run(command, capture_output=True, text=True, timeout=10)
+            if proc.returncode != 0:
+                return {'error': f'nvidia-smi failed: {proc.stderr.strip()}', 'gpu_total': 0}
+
+            # One CSV row per GPU; blank or short rows are ignored.
+            gpus = []
+            for row in proc.stdout.strip().split('\n'):
+                fields = [field.strip() for field in row.split(',')]
+                if not row.strip() or len(fields) < 5:
+                    continue
+                try:
+                    gpus.append({
+                        'index': int(fields[0]),
+                        'name': fields[1],
+                        'memory_total_mb': int(fields[2]),
+                        'memory_free_mb': int(fields[3]),
+                        'utilization_pct': int(fields[4]) if fields[4].isdigit() else 0,
+                    })
+                except (ValueError, IndexError):
+                    continue  # unparseable values such as [N/A]
+
+            names = sorted({gpu['name'] for gpu in gpus})
+            idle = [gpu for gpu in gpus if gpu['utilization_pct'] < 10]
+            return {
+                'gpu_total': len(gpus),
+                'gpu_available': len(idle),
+                'gpu_types': names if names else ['none'],
+                'gpus': gpus,
+                'source': 'nvidia-smi',
+            }
+        except FileNotFoundError:
+            return {'error': 'nvidia-smi not found (no NVIDIA GPU or driver not installed)', 'gpu_total': 0}
+        except Exception as e:
+            return {'error': f'nvidia-smi query failed: {e}', 'gpu_total': 0}
diff --git a/src/twinkle_client/tui/app.py b/src/twinkle_client/tui/app.py
new file mode 100644
index 000000000..bd8c14604
--- /dev/null
+++ b/src/twinkle_client/tui/app.py
@@ -0,0 +1,304 @@
+# Copyright (c) Twinkle Contributors. All rights reserved.
+"""Twinkle TUI main application."""
+
+from __future__ import annotations
+
+import asyncio
+from twinkle.utils.logger import get_logger
+from typing import ClassVar
+
+from textual.app import App, ComposeResult
+from textual.binding import Binding
+
+from twinkle_client.tui.agent.core import AgentLoop
+from twinkle_client.tui.agent.monitor import TrainingMonitor
+from twinkle_client.tui.connection import LocalConnection
+from twinkle_client.skills import LocalSkillProvider, ModelScopeSkillProvider, SkillManager
+from twinkle_client.tui.widgets.chat import ChatPanel
+from twinkle_client.tui.widgets.logs import LogPanel
+from twinkle_client.tui.widgets.metrics import MetricsPanel
+from twinkle_client.tui.widgets.status_bar import StatusBar
+
+logger = get_logger()
+
+# Timeout for remote skills fetching (seconds)
+_SKILLS_FETCH_TIMEOUT = 10.0
+
+
+class TwinkleTUI(App):
+    """Main Textual application for Twinkle training control."""
+
+    TITLE = 'Twinkle TUI'
+    SUB_TITLE = 'ML Training Control'
+
+    CSS = """
+    Screen {
+        layout: grid;
+        grid-size: 2 3;
+        grid-rows: auto 2fr 3fr;
+        grid-columns: 2fr 1fr;
+    }
+
+    #status-bar {
+        column-span: 2;
+        height: 3;
+    }
+
+    #metrics {
+        height: 100%;
+    }
+
+    #logs {
+        height: 100%;
+        row-span: 2;
+    }
+
+    #chat {
+        height: 100%;
+    }
+    """
+
+    BINDINGS: ClassVar[list[Binding]] = [
+        Binding('q', 'quit', 'Quit'),
+        Binding('ctrl+p', 'toggle_metrics', 'Toggle Metrics'),
+        Binding('ctrl+l', 'clear_logs', 'Clear Logs'),
+    ]
+
+    def __init__(
+        self,
+        run_id: str | None = None,
+        llm_base_url: str = 'http://localhost:11434/v1',
+        llm_model: str = 'qwen3.5',
+        llm_api_key: str = 'not-needed',
+        **kwargs,
+    ):
+        super().__init__(**kwargs)  # remaining keyword arguments go to textual's App
+        self.run_id = run_id  # optional run that on_mount() selects on the connection
+        self.llm_base_url = llm_base_url
+        self.llm_model = llm_model
+        self.llm_api_key = llm_api_key
+        self._connection: LocalConnection | None = None  # created in on_mount()
+        self._agent: AgentLoop | None = None  # created in on_mount()
+        self._monitor: TrainingMonitor | None = None  # created in on_mount()
+        self._bg_tasks: list[asyncio.Task] = []  # background tasks started in on_mount()
+
+    def compose(self) -> ComposeResult:  # widget ids match the selectors in the class CSS
+        yield StatusBar(id='status-bar')
+        yield MetricsPanel(id='metrics')
+        yield LogPanel(id='logs')
+        yield ChatPanel(id='chat')
+
+    async def on_mount(self) -> None:
+        """Initialize connection, agent, and background tasks.
+
+        Agent is created immediately (with empty skills) so user messages
+        are never silently dropped. Skills are loaded asynchronously and
+        injected once ready.
+        """
+        self._connection = LocalConnection()
+        if self.run_id:  # preselect the run passed to the constructor
+            self._connection.current_run_id = self.run_id
+
+        # Create agent immediately (usable before skills are loaded)
+        self._agent = AgentLoop(
+            connection=self._connection,
+            llm_base_url=self.llm_base_url,
+            llm_model=self.llm_model,
+            llm_api_key=self.llm_api_key,
+            skills_prompt='',
+        )
+        self._agent.set_run_selected_callback(self._on_run_selected)
+        self._agent.set_metrics_callback(self._handle_metrics_zoom)
+        self._agent.set_select_metrics_callback(self._handle_select_metrics)
+
+        self._monitor = TrainingMonitor(
+            connection=self._connection,
+            on_message=self._on_agent_message,
+            llm_base_url=self.llm_base_url,
+            llm_model=self.llm_model,
+            llm_api_key=self.llm_api_key,
+        )
+
+        # Start background tasks; the handles are kept so _cancel_tasks() can cancel them
+        self._bg_tasks = [
+            asyncio.create_task(self._monitor.run()),
+            asyncio.create_task(self._poll_logs()),
+            asyncio.create_task(self._poll_metrics()),
+            asyncio.create_task(self._load_skills_async()),
+        ]
+
+        # Show welcome message and initial status
+        self._show_welcome()
+        self._update_status_bar()
+
+    def _show_welcome(self) -> None:
+        """Post the opening assistant message in the chat panel.
+
+        Names the run being monitored when one is selected; otherwise shows usage hints.
+        """
+        active_run = self._connection.current_run_id if self._connection else None
+        if active_run:
+            greeting = (f'Monitoring run: [bold]{active_run}[/]. '
+                        'Ask me anything about your training.')
+        else:
+            greeting = ('Welcome! I can help you start, monitor, and control ML training. '
+                        'Try: "list my training runs" or "start a new GRPO training".')
+        self.query_one('#chat', ChatPanel).add_assistant_message(greeting)
+
+    async def _load_skills_async(self) -> None:
+        """Load skills in background and inject into agent when ready (best-effort)."""
+        from pathlib import Path as _Path
+        manager = SkillManager()
+        # 1. Bundled skills shipped inside the twinkle_client package (pip-installable)
+        _bundled_skills = _Path(__file__).resolve().parent.parent / 'skills' / 'bundled'
+        if _bundled_skills.is_dir():
+            manager.register(LocalSkillProvider(skills_dir=_bundled_skills))
+        # 2. User-local custom skills (~/.cache/twinkle/tui/skills/local/)
+        manager.register(LocalSkillProvider())
+        # 3. Community skills from ModelScope (remote, best-effort)
+        manager.register(ModelScopeSkillProvider())
+        try:
+            await asyncio.wait_for(manager.load_all(), timeout=_SKILLS_FETCH_TIMEOUT)
+        except asyncio.TimeoutError:  # str() of a timeout is empty, so spell the cause out
+            logger.warning(f'Skills loading incomplete: timed out after {_SKILLS_FETCH_TIMEOUT}s')
+        except Exception as e:
+            logger.warning(f'Skills loading incomplete: {e}')
+        skills_prompt = manager.format_for_prompt()
+        if skills_prompt and self._agent:
+            self._agent.inject_skills(skills_prompt)
+            logger.info(f'Skills injected: {manager.get_skill_names()}')
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Background polling
+    # ──────────────────────────────────────────────────────────────────────
+
+    async def _poll_metrics(self, interval: float = 3.0) -> None:
+        """Every ``interval`` seconds, push newly written metrics to the panel and status bar."""
+        while True:
+            try:
+                conn = self._connection
+                active_run = conn.current_run_id if conn else None
+                batch = conn.get_new_metrics(active_run) if active_run else None
+                if batch:
+                    self.query_one('#metrics', MetricsPanel).append_metrics(batch)
+                    self._update_status_bar(latest_metrics=batch[-1])
+            except asyncio.CancelledError:
+                break
+            except Exception as exc:
+                logger.debug(f'Metrics poll error: {exc}')
+            await asyncio.sleep(interval)
+
+    async def _poll_logs(self, interval: float = 2.0) -> None:
+        """Every ``interval`` seconds, append newly written log lines to the LogPanel."""
+        while True:
+            try:
+                active_run = self._connection.current_run_id if self._connection else None
+                fresh = self._connection.get_new_logs(active_run) if active_run else None
+                if fresh:
+                    panel = self.query_one('#logs', LogPanel)
+                    for record in fresh:
+                        panel.append_log(record.get('msg', ''))
+            except asyncio.CancelledError:
+                break
+            except Exception as exc:
+                logger.debug(f'Logs poll error: {exc}')
+            await asyncio.sleep(interval)
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Callbacks
+    # ──────────────────────────────────────────────────────────────────────
+
+    def _on_agent_message(self, message: str) -> None:
+        """Show ``message`` as an assistant chat message (TrainingMonitor's on_message callback)."""
+        self.query_one('#chat', ChatPanel).add_assistant_message(message)
+
+    def _handle_metrics_zoom(self, action: str, **kwargs) -> None:
+        """Reset the chart zoom for action 'reset'; otherwise zoom with ``kwargs``."""
+        if action == 'reset':
+            self.query_one('#metrics', MetricsPanel).reset_zoom()
+        else:
+            self.query_one('#metrics', MetricsPanel).zoom(**kwargs)
+
+    def _handle_select_metrics(self, keys: list[str]) -> dict:
+        """Agent ``select_metrics`` callback: pass ``keys`` to the metrics panel."""
+        # The panel's own result is handed straight back to the agent.
+        return self.query_one('#metrics', MetricsPanel).select_keys(keys)
+
+    def _on_run_selected(self, run_id: str) -> None:
+        """Handle run switch: reset offsets, clear metrics, update status."""
+        if self._connection:
+            self._connection.reset_offsets(run_id)  # next poll re-reads this run's files from the start
+        self.query_one('#metrics', MetricsPanel).update_metrics([])  # replace the chart data with nothing
+        self._update_status_bar()
+
+    def _update_status_bar(self, latest_metrics: dict | None = None) -> None:
+        """Refresh the status bar for the currently selected run.
+
+        Model and state come from a single meta.json read; step progress comes
+        from ``latest_metrics`` when given. Does nothing if no run is selected.
+        """
+        conn = self._connection
+        if not (conn and conn.current_run_id):
+            return
+        run_id = conn.current_run_id
+        bar = self.query_one('#status-bar', StatusBar)
+
+        # meta.json status -> state shown in the bar; anything unlisted shows as 'idle'.
+        labels = {
+            'running': 'training',
+            'paused': 'paused',
+            'stopped': 'completed',
+            'completed': 'completed',
+            'error': 'error',
+        }
+        meta = conn.get_meta(run_id)
+        if meta:
+            model_id = meta.get('model_id')
+            if model_id:
+                bar.update_status(model=model_id)
+            bar.update_status(state=labels.get(meta.get('status', 'unknown'), 'idle'))
+
+        bar.update_status(run_id=run_id)
+
+        if latest_metrics:
+            step = latest_metrics.get('step')
+            bar.update_status(step=step, total_steps=latest_metrics.get('total_steps'))
+
+    # ──────────────────────────────────────────────────────────────────────
+    # User interaction
+    # ──────────────────────────────────────────────────────────────────────
+
+    async def on_chat_panel_user_submitted(self, event: ChatPanel.UserSubmitted) -> None:
+        """Stream the agent's reply to a submitted chat message into the chat panel."""
+        agent = self._agent
+        if not agent:
+            return
+        panel = self.query_one('#chat', ChatPanel)
+        panel.start_streaming()
+        try:
+            await agent.send(
+                event.text, on_token=panel.append_stream, on_stream_reset=panel.reset_stream,
+            )
+        except Exception as exc:
+            # End the stream before posting the error text.
+            panel.finish_streaming()
+            panel.add_assistant_message(f'[Error] {exc}')
+        else:
+            panel.finish_streaming()
+            await asyncio.sleep(0)
+
+    def action_toggle_metrics(self) -> None:  # action for the 'ctrl+p' binding
+        self.query_one('#metrics', MetricsPanel).toggle_class('hidden')
+
+    def action_clear_logs(self) -> None:  # action for the 'ctrl+l' binding
+        self.query_one('#logs', LogPanel).clear()
+
+    def action_quit(self) -> None:
+        """Action for the 'q' binding: cancel the background tasks, then exit the app."""
+        self._cancel_tasks()
+        self.exit()
+
+    def _cancel_tasks(self) -> None:  # cancels every task started in on_mount()
+        for task in self._bg_tasks:
+            task.cancel()
+        self._bg_tasks.clear()  # drop the handles so a second call has nothing to cancel
diff --git a/src/twinkle_client/tui/connection.py b/src/twinkle_client/tui/connection.py
new file mode 100644
index 000000000..86d883746
--- /dev/null
+++ b/src/twinkle_client/tui/connection.py
@@ -0,0 +1,378 @@
+# Copyright (c) Twinkle Contributors. All rights reserved.
+"""Local file-based connection layer for TUI.
+
+Reads metrics from JSONL and raw logs from output.log.
+
+In Server Mode, training control is done by killing/restarting the client
+process. The server retains all model/optimizer state in GPU memory.
+- "Pause" = kill client process (SIGKILL)
+- "Resume" = start a new client with same adapter_name
+- "Stop" = graceful shutdown via SIGTERM (saves checkpoint)
+
+File layout under run_dir (~/.cache/twinkle/{run_id}/):
+    metrics.jsonl  — one JSON object per line, written after each step
+    output.log     — combined stdout+stderr (raw text, read by TUI log panel)
+    meta.json      — run metadata (model_id, config, status, pid)
+    train.py       — current active training script
+    train_v{N}.py  — archived previous versions
+"""
+
+from __future__ import annotations
+
+import json
+from twinkle.utils.logger import get_logger
+import os
+import re
+import shutil
+import signal
+import subprocess
+import time
+from pathlib import Path
+from typing import Any
+
+logger = get_logger()
+
+DEFAULT_BASE_DIR = Path.home() / '.cache' / 'twinkle'
+
+
+class LocalConnection:
+    """File-based connection between TUI and training process.
+
+    All monitoring happens through the local filesystem:
+    - Metrics and logs are read from JSONL files (tail-style incremental)
+    - Training control is via process management (kill/restart)
+    """
+
+    def __init__(self, base_dir: Path | str | None = None):
+        self.base_dir = Path(base_dir) if base_dir else DEFAULT_BASE_DIR  # root holding one directory per run
+        self.current_run_id: str | None = None  # run currently selected, if any
+        self._metrics_offsets: dict[str, int] = {}  # run_id -> next read position in metrics.jsonl
+        self._logs_offsets: dict[str, int] = {}  # run_id -> next read position in output.log
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Meta
+    # ──────────────────────────────────────────────────────────────────────
+
+    def get_meta(self, run_id: str) -> dict[str, Any] | None:
+        """Return the parsed meta.json of ``run_id``, or None if it is missing or unreadable."""
+        path = self.base_dir / run_id / 'meta.json'
+        if path.exists():
+            try:
+                return json.loads(path.read_text())
+            except Exception:
+                pass  # unreadable file or invalid JSON is reported as "no meta"
+        return None
+
+    def _write_meta(self, run_id: str, meta: dict[str, Any]) -> None:
+        """Serialise ``meta`` as indented JSON into the run's meta.json, creating the run dir."""
+        run_dir = self.base_dir / run_id
+        run_dir.mkdir(parents=True, exist_ok=True)
+        (run_dir / 'meta.json').write_text(json.dumps(meta, indent=2))
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Discovery
+    # ──────────────────────────────────────────────────────────────────────
+
+    def list_training_runs(self) -> list[dict[str, Any]]:
+        """Scan the base directory and describe every training run found.
+
+        A directory counts as a run when it holds meta.json or metrics.jsonl.
+        Runs come back in reverse-sorted path order, with meta.json merged in.
+        """
+        runs: list[dict[str, Any]] = []
+        if not self.base_dir.exists():
+            return runs
+        candidates = sorted(self.base_dir.iterdir(), reverse=True)
+        for run_dir in (path for path in candidates if path.is_dir()):
+            meta_path = run_dir / 'meta.json'
+            has_meta = meta_path.exists()
+            if not has_meta and not (run_dir / 'metrics.jsonl').exists():
+                continue
+            info = {'run_id': run_dir.name, 'dir': str(run_dir)}
+            if has_meta:
+                try:
+                    info.update(json.loads(meta_path.read_text()))
+                except Exception:
+                    pass  # unreadable meta: keep the bare run_id/dir entry
+            runs.append(info)
+        return runs
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Metrics & logs (incremental reading)
+    # ──────────────────────────────────────────────────────────────────────
+
+    def get_metrics(self, run_id: str, last_n: int = 200) -> list[dict[str, Any]]:
+        """Read the last ``last_n`` metrics entries from the JSONL file ([] when the
+        file is missing or unreadable, holds invalid JSON, or ``last_n <= 0``)."""
+        metrics_file = self.base_dir / run_id / 'metrics.jsonl'
+        if last_n <= 0 or not metrics_file.exists():  # lines[-0:] would be every line
+            return []
+        try:
+            lines = metrics_file.read_text().strip().splitlines()
+            return [json.loads(line) for line in lines[-last_n:] if line.strip()]
+        except Exception:
+            return []
+
+    def get_new_metrics(self, run_id: str) -> list[dict[str, Any]]:
+        """Read only complete new metrics lines since last read (incremental, per-run)."""
+        metrics_file = self.base_dir / run_id / 'metrics.jsonl'
+        if not metrics_file.exists():
+            return []
+        try:
+            offset = self._metrics_offsets.get(run_id, 0)
+            with open(metrics_file, 'rb') as f:
+                f.seek(offset)
+                chunk = f.read()
+            # Consume newline-terminated lines only: a half-written tail is re-read next poll.
+            chunk = chunk[:chunk.rfind(b'\n') + 1]
+            self._metrics_offsets[run_id] = offset + len(chunk)
+            return [json.loads(line) for line in chunk.splitlines() if line.strip()]
+        except Exception:
+            return []
+
+    def get_new_logs(self, run_id: str) -> list[dict[str, Any]]:
+        """Read new raw log lines from output.log (incremental, per-run).
+
+        Carriage-return progress updates are collapsed to their last segment,
+        the way a terminal would display them.
+        Returns list of dicts with 'msg' key for each new non-blank line.
+        """
+        output_file = self.base_dir / run_id / 'output.log'
+        entries: list[dict[str, Any]] = []
+
+        if not output_file.exists():
+            return entries
+
+        try:
+            offset = self._logs_offsets.get(run_id, 0)
+            # newline='' turns off universal-newline translation. Without it every
+            # lone CR is read back as a newline, so the CR handling below never runs.
+            with open(output_file, 'r', errors='replace', newline='') as f:
+                f.seek(offset)
+                new_data = f.read()
+                self._logs_offsets[run_id] = f.tell()
+            for line in new_data.split('\n'):
+                # Drop the CR of a CRLF ending first, then keep only what follows
+                # the last remaining CR (the text that overwrote the line).
+                line = line.rstrip('\r').rsplit('\r', 1)[-1]
+                if line.strip():
+                    entries.append({'msg': line})
+        except Exception:
+            pass
+
+        return entries
+
+    def reset_offsets(self, run_id: str) -> None:
+        """Forget the saved metrics and log read positions of ``run_id`` (e.g. on run switch)."""
+        for offsets in (self._metrics_offsets, self._logs_offsets):
+            offsets.pop(run_id, None)
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Process health helpers
+    # ──────────────────────────────────────────────────────────────────────
+
+    @staticmethod
+    def _is_process_alive(pid: int) -> bool:
+        """Report whether ``pid`` is a live, non-zombie process (POSIX).
+
+        Any OSError from the signal-0 probe, including "no such process" and
+        "permission denied", counts as not alive. When /proc/<pid>/status is
+        readable, a zombie (state 'Z') is also reported as not alive.
+        """
+        try:
+            os.kill(pid, 0)
+        except OSError:  # ProcessLookupError and PermissionError are OSError subclasses
+            return False
+        try:
+            proc_status = Path(f'/proc/{pid}/status')
+            if proc_status.exists():
+                state_lines = [ln for ln in proc_status.read_text().splitlines() if ln.startswith('State:')]
+                if state_lines:
+                    # e.g. "State: Z (zombie)", "State: R (running)", "State: S (sleeping)"
+                    return 'Z' not in state_lines[0]
+        except Exception:
+            pass
+        return True
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Process management
+    # ──────────────────────────────────────────────────────────────────────
+
+    def _launch_script(self, run_id: str) -> dict[str, Any]:
+        """Launch the run's train.py as a background subprocess.
+
+        stdout and stderr both go to output.log (truncated on each launch) so
+        script errors are diagnosable. Returns a dict with launch result (pid or error).
+        """
+        import sys
+        meta = self.get_meta(run_id)
+        if not meta:
+            return {'status': 'error', 'run_id': run_id, 'error': f'No meta.json for run {run_id}'}
+
+        script_path = meta.get('script_path')
+        if not script_path or not Path(script_path).exists():
+            return {'status': 'error', 'run_id': run_id, 'error': f'Script not found: {script_path}'}
+
+        run_dir = self.base_dir / run_id
+        output_file = run_dir / 'output.log'
+        try:
+            output_fh = open(output_file, 'w')
+        except OSError as e:
+            return {'status': 'error', 'run_id': run_id, 'error': f'Cannot open output log: {e}'}
+
+        env = os.environ.copy()
+        env['TWINKLE_RUN_ID'] = run_id
+        try:
+            # sys.executable runs the script with the TUI's own interpreter; a bare
+            # 'python' may be absent from PATH or resolve to a different install.
+            proc = subprocess.Popen(
+                [sys.executable or 'python', '-u', script_path],
+                cwd=str(run_dir), env=env,
+                stdout=output_fh, stderr=subprocess.STDOUT,
+                start_new_session=True,
+            )
+        except OSError as e:
+            return {'status': 'error', 'run_id': run_id, 'error': f'Failed to launch script: {e}'}
+        finally:
+            output_fh.close()
+
+        # Non-blocking check: if process already exited (e.g., syntax error)
+        retcode = proc.poll()
+        if retcode is not None:
+            error_msg = output_file.read_text().strip()[-500:] if output_file.exists() else ''
+            meta['status'] = 'error'
+            self._write_meta(run_id, meta)
+            return {'status': 'error', 'run_id': run_id, 'error': error_msg or f'Process exited immediately (code={retcode})'}
+
+        meta['pid'] = proc.pid
+        meta['status'] = 'running'
+        self._write_meta(run_id, meta)
+        return {'status': 'running', 'run_id': run_id, 'pid': proc.pid, 'script_path': script_path}
+
+    def start_training(self, run_id: str, script_content: str, model_id: str = '') -> dict[str, Any]:
+        """Create a fresh run directory, record its metadata and launch the script.
+
+        Args:
+            run_id: Base name of the run; a ``_YYYYmmdd_HHMMSS`` suffix is appended.
+            script_content: Full Python source written to the run's train.py.
+            model_id: Model identifier stored in meta.json.
+
+        Returns:
+            The result dict of ``_launch_script`` for the new run.
+        """
+        from datetime import datetime
+
+        run_id = f"{run_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+        run_dir = self.base_dir / run_id
+        run_dir.mkdir(parents=True, exist_ok=True)
+        script_file = run_dir / 'train.py'
+        script_file.write_text(script_content)
+
+        self._write_meta(run_id, {
+            'run_id': run_id,
+            'model_id': model_id,
+            'status': 'starting',
+            'script_path': str(script_file),
+            'script_version': 1,
+            'start_time': time.time(),
+        })
+        self.current_run_id = run_id  # the new run becomes the selected one
+
+        return self._launch_script(run_id)
+
+    def pause_training(self, run_id: str) -> dict[str, Any]:
+        """SIGKILL the run's recorded client process and mark the run 'paused'.
+
+        Server retains all state — restart the script to continue.
+        """
+        meta = self.get_meta(run_id)
+        if not meta:
+            # No metadata means no recorded pid: nothing to signal or update.
+            return {'status': 'paused', 'run_id': run_id, 'pid': None}
+
+        pid = meta.get('pid')
+        if pid:
+            try:
+                os.kill(pid, signal.SIGKILL)
+            except (ProcessLookupError, PermissionError):
+                pass  # the process is already gone, or is not ours to signal
+        meta['status'] = 'paused'
+        self._write_meta(run_id, meta)
+        return {'status': 'paused', 'run_id': run_id, 'pid': pid}
+
+    def resume_training(self, run_id: str) -> dict[str, Any]:
+        """Resume training by re-launching the stored training script.
+
+        Returns the _launch_script() result. NOTE(review): server-side state is assumed to persist — not shown here.
+        """
+        # Reset log/metrics offsets since output.log will be truncated on re-launch
+        self.reset_offsets(run_id)
+        return self._launch_script(run_id)
+
+    def stop_training(self, run_id: str) -> dict[str, Any]:
+        """Send SIGTERM to the run's recorded process and mark the run 'stopping'.
+
+        The training script's SIGTERM handler saves checkpoint + dataloader state,
+        then exits. Training can later be resumed from checkpoint.
+        """
+        meta = self.get_meta(run_id)
+        if not meta:
+            # No metadata means no recorded pid: nothing to signal or update.
+            return {'status': 'stopping', 'run_id': run_id, 'pid': None}
+
+        pid = meta.get('pid')
+        if pid:
+            try:
+                os.kill(pid, signal.SIGTERM)
+            except (ProcessLookupError, PermissionError):
+                pass  # the process is already gone, or is not ours to signal
+        meta['status'] = 'stopping'
+        self._write_meta(run_id, meta)
+        return {'status': 'stopping', 'run_id': run_id, 'pid': pid}
+
+    def update_script(self, run_id: str, new_script_content: str) -> dict[str, Any]:
+        """Replace the run's train.py, archiving the previous one as train_v{N}.py.
+
+        N is one more than the highest archive number already on disk. The new
+        script's version (N + 1, or 1 when there was nothing to archive) and its
+        path are stored in meta.json and returned.
+        """
+        run_dir = self.base_dir / run_id
+        run_dir.mkdir(parents=True, exist_ok=True)
+
+        script = run_dir / 'train.py'
+
+        version = 1
+        if script.exists():
+            # Archive numbers already in use, parsed from the file names.
+            taken = [
+                int(hit.group(1))
+                for hit in (re.match(r'train_v(\d+)\.py$', p.name) for p in run_dir.glob('train_v*.py'))
+                if hit
+            ]
+            archive_v = max(taken, default=0) + 1
+            shutil.copy2(script, run_dir / f'train_v{archive_v}.py')
+            version = archive_v + 1
+
+        script.write_text(new_script_content)
+
+        # Record the new version; a run without meta.json gets a minimal one.
+        meta = self.get_meta(run_id) or {'run_id': run_id}
+        meta['script_version'] = version
+        meta['script_path'] = str(script)
+        self._write_meta(run_id, meta)
+
+        return {
+            'run_id': run_id,
+            'script_version': version,
+            'script_path': str(script),
+        }
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Status queries
+    # ──────────────────────────────────────────────────────────────────────
+
+    def get_status(self, run_id: str) -> str:
+        """Return the 'status' recorded in the run's meta.json, or 'unknown'."""
+        # A missing or empty meta and a meta without 'status' both yield 'unknown'.
+        return (self.get_meta(run_id) or {}).get('status', 'unknown')
diff --git a/src/twinkle_client/tui/runtime.py b/src/twinkle_client/tui/runtime.py
new file mode 100644
index 000000000..dc2e95b35
--- /dev/null
+++ b/src/twinkle_client/tui/runtime.py
@@ -0,0 +1,294 @@
+# Copyright (c) Twinkle Contributors. All rights reserved.
+"""Training runtime utilities for TUI integration.
+
+This module provides helpers that training scripts import to:
+1. Write structured metrics to metrics.jsonl
+2. Print log messages to stdout (captured as output.log by TUI launcher)
+3. Manage run lifecycle (start/end)
+4. Register SIGTERM handler for graceful shutdown with checkpoint
+
+In Server Mode, the client is stateless - killing the client process is
+equivalent to "pause" (server retains all optimizer/model state in GPU memory).
+Restarting the script with the same adapter_name seamlessly continues training.
+
+Usage in training scripts:
+    from twinkle_client.tui.runtime import TrainingRuntime
+
+    rt = TrainingRuntime(run_id='my-grpo-run')
+    rt.start(model_id='Qwen/Qwen3.5-4B', config={...})
+    rt.register_graceful_shutdown(model, dataloader)
+
+    for step, batch in enumerate(dataloader):
+        # ... training logic ...
+        rt.log_metrics(step=step, loss=loss, reward=reward, grad_norm=gn, lr=lr)
+        rt.log(f'Completed step {step}, loss={loss:.4f}')
+
+    rt.finish()
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import re
+import signal
+import sys
+import time
+from pathlib import Path
+from typing import Any, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from twinkle_client.model import MultiLoraTransformersModel
+    from twinkle.dataloader import DataLoader
+
+
+# Root directory for per-run data when no base_dir is passed: ~/.cache/twinkle
+DEFAULT_BASE_DIR = Path.home().joinpath('.cache', 'twinkle')
+
+
+class TrainingRuntime:
+    """Runtime helper for training scripts to integrate with TUI.
+
+    Manages:
+    - Writing metrics.jsonl (structured step data)
+    - Printing logs to stdout (captured by TUI launcher as output.log)
+    - Run metadata (meta.json), including the resume progress
+    - SIGTERM graceful shutdown with checkpoint saving
+
+    meta.json is never rewritten in place: every update goes to a temp file that
+    is then moved over it with ``os.replace``, so a process polling the file does
+    not read a truncated document.
+    """
+
+    def __init__(self, run_id: str | None = None, base_dir: Path | str | None = None):
+        """Initialize the training runtime.
+
+        Args:
+            run_id: Unique identifier for this training run.
+                If None, reads from TWINKLE_RUN_ID environment variable
+                (automatically set by TUI launcher).
+            base_dir: Base directory for run data. Defaults to ~/.cache/twinkle/
+
+        Raises:
+            ValueError: If run_id is None and TWINKLE_RUN_ID is unset or empty.
+        """
+        self.base_dir = Path(base_dir) if base_dir else DEFAULT_BASE_DIR
+        if run_id is None:
+            run_id = os.environ.get('TWINKLE_RUN_ID', '')
+            if not run_id:
+                raise ValueError(
+                    'run_id must be provided or TWINKLE_RUN_ID env var must be set'
+                )
+        self.run_id = run_id
+        self.run_dir = self.base_dir / run_id
+
+        self._metrics_file: Any = None
+        self._started = False
+        self._last_progress_save: float = 0.0  # timestamp of last progress write
+        self._progress_save_interval: float = 5.0  # write at most every 5 seconds
+        # Newest progress seen by log_metrics(); finish() writes it out so the
+        # save throttle cannot leave a stale last_step behind.
+        self._latest_progress: dict[str, int] | None = None
+
+    def start(
+        self,
+        model_id: str = '',
+        config: dict[str, Any] | None = None,
+        script_path: str | Path | None = None,
+    ) -> None:
+        """Initialize the run directory and write metadata.
+
+        Call this once at the beginning of training. A ``progress`` entry left
+        in meta.json by an earlier run with the same run_id is carried over, so
+        get_resume_info() gives the same answer before and after start().
+
+        Args:
+            model_id: Model identifier (e.g. 'Qwen/Qwen3.5-4B').
+            config: Training configuration dict (hyperparameters, etc.).
+            script_path: Path to the training script. If provided, the script
+                will be copied into the run directory as ``train.py`` so that
+                resume/restart can re-execute it automatically.
+        """
+        self.run_dir.mkdir(parents=True, exist_ok=True)
+        previous_meta = self._read_meta()
+
+        # Copy training script into run directory for reproducibility & restart.
+        # If train.py already exists (e.g. from a previous failed run), archive it
+        # as train_v{N}.py before overwriting.
+        stored_script: str | None = None
+        script_version = 1
+        if script_path is not None:
+            import shutil
+            src = Path(script_path).resolve()
+            dst = self.run_dir / 'train.py'
+            if src.exists() and src != dst.resolve():
+                # Archive existing train.py if present
+                if dst.exists():
+                    # Find max existing version number (regex-based, consistent with connection.py)
+                    max_v = 0
+                    for f in self.run_dir.glob('train_v*.py'):
+                        m = re.match(r'train_v(\d+)\.py$', f.name)
+                        if m:
+                            max_v = max(max_v, int(m.group(1)))
+                    archive_v = max_v + 1
+                    shutil.copy2(dst, self.run_dir / f'train_v{archive_v}.py')
+                    script_version = archive_v + 1
+                shutil.copy2(src, dst)
+            stored_script = str(dst)
+
+        # Write run metadata
+        meta = {
+            'run_id': self.run_id,
+            'model_id': model_id,
+            'config': config or {},
+            'start_time': time.time(),
+            'status': 'running',
+            'pid': os.getpid(),
+            'script_path': stored_script,
+            'script_version': script_version,
+        }
+        if isinstance(previous_meta.get('progress'), dict):
+            # Keep the resume point; rebuilding meta from scratch used to erase it.
+            meta['progress'] = previous_meta['progress']
+        self._write_meta(meta)
+
+        # Open metrics file for append (buffering=1 means line-buffered)
+        if self._metrics_file:
+            self._metrics_file.close()  # start() called again: release the old handle
+        self._metrics_file = open(
+            self.run_dir / 'metrics.jsonl', 'a', buffering=1, encoding='utf-8'
+        )
+
+        self._started = True
+        self.log('Training started')
+
+    def log_metrics(self, **kwargs) -> None:
+        """Write a metrics entry to metrics.jsonl.
+
+        All keyword arguments are written as a single JSON line.
+        A timestamp is automatically added.
+        Values are auto-converted to float where possible; unconvertible values are dropped.
+        Does nothing until start() has opened the metrics file.
+
+        When a finite ``step`` is present, the progress (``last_step`` and, if
+        given, ``total_steps``) is remembered and saved to meta.json at most once
+        per ``_progress_save_interval`` seconds.
+
+        Example:
+            rt.log_metrics(step=10, loss=0.5, reward=1.2, grad_norm=0.8, lr=1e-5)
+        """
+        if not self._metrics_file:
+            return
+        entry = {'ts': time.time()}
+        for k, v in kwargs.items():
+            try:
+                entry[k] = float(v)
+            except (TypeError, ValueError, OverflowError):
+                pass  # skip unconvertible values (OverflowError: int too large for float)
+        self._metrics_file.write(json.dumps(entry) + '\n')
+
+        # Auto-save training progress to meta.json (for resume after crash)
+        try:
+            progress = {'last_step': int(entry['step'])}
+        except (KeyError, ValueError, OverflowError):
+            # No step, or a nan/inf step: int() raises on those and must not
+            # take the training loop down.
+            return
+        try:
+            progress['total_steps'] = int(entry['total_steps'])
+        except (KeyError, ValueError, OverflowError):
+            pass  # total_steps is optional
+
+        self._latest_progress = progress
+        now = time.time()
+        if now - self._last_progress_save >= self._progress_save_interval:
+            self._save_progress(progress)
+            self._last_progress_save = now
+
+    def log(self, message: str) -> None:
+        """Print a log message to stdout (captured as output.log by TUI).
+
+        Args:
+            message: Human-readable log message.
+        """
+        print(f'[twinkle] {message}', flush=True)
+
+    def get_resume_info(self) -> dict[str, int]:
+        """Get resume info from a previous run (if any).
+
+        Reads the 'progress' field from meta.json, which is auto-saved
+        during training by log_metrics().
+
+        Returns:
+            dict with 'last_step' (int, default 0) and optionally 'total_steps'.
+            Always returns a dict — never None. Fresh start returns {'last_step': 0}.
+
+        Usage:
+            resume = rt.get_resume_info()
+            global_step = resume['last_step']
+            if global_step > 0:
+                dataloader.skip_consumed_samples(global_step * BATCH_SIZE)
+                print(f'[twinkle] Resuming from step {global_step}')
+        """
+        progress = self._read_meta().get('progress')
+        try:
+            if isinstance(progress, dict) and progress.get('last_step', 0) > 0:
+                return progress
+        except TypeError:
+            pass  # last_step is not a number: treat as a fresh start
+        return {'last_step': 0}
+
+    def _read_meta(self) -> dict[str, Any]:
+        """Return the parsed meta.json, or {} if it is missing, unreadable or not a dict."""
+        try:
+            meta = json.loads((self.run_dir / 'meta.json').read_text(encoding='utf-8'))
+        except (OSError, ValueError):
+            # OSError covers a missing file; ValueError covers bad JSON / bad UTF-8.
+            return {}
+        return meta if isinstance(meta, dict) else {}
+
+    def _write_meta(self, meta: dict[str, Any]) -> None:
+        """Replace meta.json with ``meta`` via a temp file and ``os.replace``.
+
+        Writing in place truncates the file before the new content lands, so a
+        reader polling it could see empty or partial JSON.
+        """
+        meta_path = self.run_dir / 'meta.json'
+        tmp_path = self.run_dir / f'meta.json.{os.getpid()}.tmp'
+        tmp_path.write_text(json.dumps(meta, indent=2), encoding='utf-8')
+        os.replace(tmp_path, meta_path)
+
+    def _save_progress(self, progress: dict[str, int]) -> None:
+        """Store ``progress`` under the 'progress' key of meta.json (best-effort).
+
+        The call is synchronous; log_metrics() is what throttles it. Any failure
+        is swallowed on purpose.
+        """
+        try:
+            meta = self._read_meta()
+            meta['progress'] = progress
+            self._write_meta(meta)
+        except Exception:
+            pass  # never crash training for progress bookkeeping
+
+    def finish(self, status: str = 'completed') -> None:
+        """Mark training as finished and close files.
+
+        If meta.json exists, its status and end_time are updated and the newest
+        progress seen by log_metrics() is written too, since the save throttle
+        may have skipped it.
+
+        Args:
+            status: Final status ('completed', 'stopped', 'error').
+        """
+        self.log(f'Training finished with status: {status}')
+
+        # Update metadata
+        if (self.run_dir / 'meta.json').exists():
+            try:
+                meta = self._read_meta()
+                meta['status'] = status
+                meta['end_time'] = time.time()
+                if self._latest_progress is not None:
+                    meta['progress'] = self._latest_progress
+                self._write_meta(meta)
+            except Exception:
+                pass  # best-effort; the metrics file below is still closed
+
+        # Close files
+        if self._metrics_file:
+            self._metrics_file.close()
+            self._metrics_file = None
+
+        self._started = False
+
+    def register_graceful_shutdown(
+        self,
+        model: 'MultiLoraTransformersModel',
+        dataloader: 'DataLoader | None' = None,
+        checkpoint_name: str = 'interrupted',
+    ) -> None:
+        """Register SIGTERM handler for graceful shutdown with checkpoint.
+
+        When SIGTERM is received (e.g., from TUI stop command), the handler will:
+        1. Save model checkpoint (LoRA weights + optimizer state)
+        2. Save dataloader position (consumed_train_samples) for exact resume
+        3. Log the checkpoint path
+        4. Mark training as 'stopped' and exit
+
+        A failure while saving is logged and does not prevent steps 4.
+
+        Args:
+            model: The MultiLoraTransformersModel instance.
+            dataloader: Optional DataLoader with .get_state() support.
+            checkpoint_name: Name for the saved checkpoint.
+
+        Usage:
+            rt = TrainingRuntime(run_id='my-run')
+            rt.start(...)
+            rt.register_graceful_shutdown(model, dataloader)
+            # ... training loop ...
+        """
+        def _shutdown_handler(signum, frame):
+            self.log('SIGTERM received, saving checkpoint before exit...')
+            try:
+                save_kwargs = {
+                    'name': checkpoint_name,
+                    'save_optimizer': True,
+                }
+                if dataloader is not None:
+                    state = dataloader.get_state()
+                    consumed = state.get('consumed_train_samples', 0)
+                    save_kwargs['consumed_train_samples'] = consumed
+                    self.log(f'Dataloader state: consumed_train_samples={consumed}')
+
+                result = model.save(**save_kwargs)
+                self.log(f'Checkpoint saved: {result}')
+            except Exception as e:
+                self.log(f'Error saving checkpoint during shutdown: {e}')
+
+            self.finish(status='stopped')
+            sys.exit(0)
+
+        signal.signal(signal.SIGTERM, _shutdown_handler)
+        self.log('Graceful shutdown handler registered (SIGTERM)')
diff --git a/src/twinkle_client/tui/widgets/__init__.py b/src/twinkle_client/tui/widgets/__init__.py
new file mode 100644
index 000000000..340a61ff9
--- /dev/null
+++ b/src/twinkle_client/tui/widgets/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Twinkle Contributors. All rights reserved.
+from twinkle_client.tui.widgets.chat import ChatPanel
+from twinkle_client.tui.widgets.logs import LogPanel
+from twinkle_client.tui.widgets.metrics import MetricsPanel
+from twinkle_client.tui.widgets.status_bar import StatusBar
+
+__all__ = ['ChatPanel', 'LogPanel', 'MetricsPanel', 'StatusBar']
diff --git a/src/twinkle_client/tui/widgets/chat.py b/src/twinkle_client/tui/widgets/chat.py
new file mode 100644
index 000000000..f116e9f47
--- /dev/null
+++ b/src/twinkle_client/tui/widgets/chat.py
@@ -0,0 +1,162 @@
+# Copyright (c) Twinkle Contributors. All rights reserved.
+"""Chat panel widget - handles user/agent conversation display and input."""
+
+from __future__ import annotations
+
+import time
+
+from textual.app import ComposeResult
+from textual.message import Message as TextualMessage
+from textual.widgets import Input, RichLog, Static
+from textual.widget import Widget
+
+
+class ChatPanel(Widget):
+    """Interactive chat panel for user <-> agent conversation.
+
+    Streaming text is written directly into the main chat-log (RichLog)
+    in throttled chunks so the conversation flows naturally without
+    a separate narrow preview widget.
+
+    The chat-log is created with ``markup=True``, so every string written to it
+    is parsed as Rich console markup. Text typed by the user and text streamed
+    from the LLM is escaped before it is written; otherwise bracketed content
+    such as ``list[int]`` would be consumed as a style tag and ``[/]`` would
+    raise a markup error.
+    """
+
+    DEFAULT_CSS = """
+    ChatPanel {
+        layout: vertical;
+        border: solid $primary;
+        padding: 0;
+    }
+
+    ChatPanel > #chat-title {
+        dock: top;
+        height: 1;
+        background: $primary;
+        color: $text;
+        text-align: center;
+    }
+
+    ChatPanel > #chat-log {
+        height: 1fr;
+        padding: 0 1;
+    }
+
+    ChatPanel > #chat-input {
+        dock: bottom;
+        height: 3;
+        margin: 0 1;
+    }
+    """
+
+    class UserSubmitted(TextualMessage):
+        """Event emitted when user submits a message."""
+
+        def __init__(self, text: str) -> None:
+            super().__init__()
+            self.text = text
+
+    # Minimum interval between flushing chunks to the RichLog
+    _STREAM_THROTTLE = 0.08  # 80ms — balance between responsiveness and perf
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._streaming_buffer = ''  # un-flushed chars
+        self._full_response = ''     # entire response accumulated
+        self._is_streaming = False
+        self._last_flush_time: float = 0.0
+        self._header_written = False  # whether "Agent: " prefix is written
+
+    @staticmethod
+    def _literal(text: str) -> str:
+        """Escape Rich markup in ``text`` so the chat-log shows it verbatim."""
+        # Imported here because this module does not import rich at top level.
+        from rich.markup import escape
+        return escape(text)
+
+    def compose(self) -> ComposeResult:
+        """Yield the title bar, the chat log and the input box."""
+        yield Static('Chat', id='chat-title')
+        yield RichLog(id='chat-log', wrap=True, markup=True, max_lines=200)
+        yield Input(placeholder='Ask the agent anything...', id='chat-input')
+
+    def on_input_submitted(self, event: Input.Submitted) -> None:
+        """Handle input submission.
+
+        Blank input is ignored. Otherwise the input box is cleared, the text is
+        echoed into the chat log and a UserSubmitted message is posted.
+        """
+        text = event.value.strip()
+        if not text:
+            return
+        event.input.value = ''
+        self.add_user_message(text)
+        self.post_message(self.UserSubmitted(text))
+
+    def add_user_message(self, text: str) -> None:
+        """Add a user message to the chat log; ``text`` is shown literally."""
+        self.query_one('#chat-log', RichLog).write(
+            f'[bold green]You:[/] {self._literal(text)}'
+        )
+
+    def add_assistant_message(self, text: str) -> None:
+        """Add an assistant message to the chat log.
+
+        ``text`` is NOT escaped here: the log parses it as Rich markup, so a
+        caller passing untrusted text must escape it first.
+        """
+        self.query_one('#chat-log', RichLog).write(f'[bold cyan]Agent:[/] {text}')
+
+    # ── Streaming API ──
+
+    def start_streaming(self) -> None:
+        """Begin a streaming assistant response (clears all streaming state)."""
+        self._streaming_buffer = ''
+        self._full_response = ''
+        self._is_streaming = True
+        self._last_flush_time = 0.0
+        self._header_written = False
+
+    def reset_stream(self) -> None:
+        """Discard buffered streaming content (called when tool-calls detected).
+
+        Resets state so the next LLM round starts fresh, and writes a
+        "calling tools" marker line to the chat log.
+        """
+        self._streaming_buffer = ''
+        self._full_response = ''
+        self._header_written = False
+        log = self.query_one('#chat-log', RichLog)
+        log.write('[dim]  ↳ calling tools...[/]')
+
+    def append_stream(self, chunk: str) -> None:
+        """Append a chunk from the LLM stream.
+
+        Writes accumulated text to the chat-log in throttled batches
+        so the conversation scrolls naturally.
+        """
+        self._streaming_buffer += chunk
+        self._full_response += chunk
+        now = time.monotonic()
+        if now - self._last_flush_time >= self._STREAM_THROTTLE:
+            self._flush_stream()
+
+    def _flush_stream(self, force: bool = False) -> None:
+        """Write buffered streaming text to the RichLog.
+
+        Only flushes complete lines (up to the last newline) to avoid
+        splitting multi-line structures like tables mid-row.
+        If force=True, flushes everything (used at end of stream).
+        The flushed text is escaped so the log shows it literally.
+        """
+        if not self._streaming_buffer:
+            return
+
+        if force:
+            text_to_write = self._streaming_buffer
+            self._streaming_buffer = ''
+        else:
+            # Only flush up to the last newline — keep incomplete line buffered
+            last_nl = self._streaming_buffer.rfind('\n')
+            if last_nl == -1:
+                return  # No complete line yet, keep buffering
+            text_to_write = self._streaming_buffer[:last_nl + 1]
+            self._streaming_buffer = self._streaming_buffer[last_nl + 1:]
+
+        if not text_to_write:
+            return
+
+        safe_text = self._literal(text_to_write)
+        log = self.query_one('#chat-log', RichLog)
+        if not self._header_written:
+            # First write of this response carries the "Agent:" prefix.
+            log.write(f'[bold cyan]Agent:[/] {safe_text}', shrink=False)
+            self._header_written = True
+        else:
+            log.write(safe_text, shrink=False)
+        self._last_flush_time = time.monotonic()
+
+    def finish_streaming(self) -> str:
+        """End streaming and return the full accumulated response (unescaped)."""
+        # Force-flush any remaining buffer (including incomplete lines)
+        self._flush_stream(force=True)
+        self._is_streaming = False
+        full_text = self._full_response
+        self._full_response = ''
+        self._streaming_buffer = ''
+        return full_text
diff --git a/src/twinkle_client/tui/widgets/logs.py b/src/twinkle_client/tui/widgets/logs.py
new file mode 100644
index 000000000..bf5775389
--- /dev/null
+++ b/src/twinkle_client/tui/widgets/logs.py
@@ -0,0 +1,70 @@
+# Copyright (c) Twinkle Contributors. All rights reserved.
+"""Log panel widget - efficient scrolling log display using RichLog."""
+
+from __future__ import annotations
+
+import re
+
+from textual.app import ComposeResult
+from textual.widgets import RichLog, Static
+from textual.widget import Widget
+
+# Regex to strip terminal escape sequences (colors, cursor movement, titles, etc.):
+#   1. CSI: ESC [ + parameter bytes (0x30-0x3F) + intermediate bytes (0x20-0x2F)
+#      + one final byte (0x40-0x7E)
+#   2. OSC: ESC ] ... terminated by BEL or by ST (ESC \). The payload may not
+#      contain ESC or BEL, so an unterminated OSC cannot swallow later text.
+#   3. Any other two-character escape (ESC + one char other than '[' or ']')
+_ANSI_RE = re.compile(
+    r'\x1b\[[0-?]*[ -/]*[@-~]'
+    r'|\x1b\][^\x07\x1b]*(?:\x07|\x1b\\)'
+    r'|\x1b[^\[\]]'
+)
+
+
+class LogPanel(Widget):
+    """Bordered panel with a title bar and a RichLog showing training logs."""
+
+    DEFAULT_CSS = """
+    LogPanel {
+        layout: vertical;
+        border: solid $accent;
+        padding: 0;
+        overflow: hidden hidden;
+    }
+
+    LogPanel > #log-title {
+        dock: top;
+        height: 1;
+        background: $accent;
+        color: $text;
+        text-align: center;
+    }
+
+    LogPanel > #log-content {
+        height: 1fr;
+        width: 100%;
+        padding: 0 1;
+    }
+    """
+
+    def compose(self) -> ComposeResult:
+        """Yield the title bar followed by the log view (markup disabled)."""
+        yield Static('Logs', id='log-title')
+        yield RichLog(id='log-content', max_lines=500, wrap=True, markup=False)
+
+    def append_log(self, message: str) -> None:
+        """Write ``message`` to the log view as cleaned, width-limited lines.
+
+        Escape sequences matched by ``_ANSI_RE`` are removed and every carriage
+        return is turned into a newline. Each resulting line is right-stripped,
+        skipped if empty, and cut into slices no longer than the usable width.
+        """
+        view = self.query_one('#log-content', RichLog)
+
+        # Usable columns: reported width (60 when it is 0) minus 2 for the
+        # horizontal padding; a result below 20 is replaced by 60.
+        columns = (view.size.width or 60) - 2
+        if columns < 20:
+            columns = 60
+
+        # \r usually comes from progress bars; treating it as a line break
+        # keeps each refresh on its own row.
+        text = _ANSI_RE.sub('', message).replace('\r', '\n')
+
+        for raw_line in text.splitlines():
+            remaining = raw_line.rstrip()
+            if not remaining:
+                continue
+            # One write per slice of at most `columns` characters.
+            for start in range(0, len(remaining), columns):
+                view.write(remaining[start:start + columns])
+
+    def clear(self) -> None:
+        """Remove every entry from the log view."""
+        view = self.query_one('#log-content', RichLog)
+        view.clear()
diff --git a/src/twinkle_client/tui/widgets/metrics.py b/src/twinkle_client/tui/widgets/metrics.py
new file mode 100644
index 000000000..bdc5a7f6e
--- /dev/null
+++ b/src/twinkle_client/tui/widgets/metrics.py
@@ -0,0 +1,199 @@
+# Copyright (c) Twinkle Contributors. All rights reserved.
+"""Metrics panel widget - renders training metrics as ASCII charts."""
+
+from __future__ import annotations
+
+from typing import Any
+
+import plotext as plt
+from rich.text import Text
+
+from textual.app import ComposeResult
+from textual.widgets import Static
+from textual.widget import Widget
+
+# Maximum data points to retain in memory: update_metrics() and
+# append_metrics() keep only the most recent _MAX_HISTORY entries.
+_MAX_HISTORY = 2000
+
+
+class MetricsPanel(Widget):
+    """Renders training metrics (loss, reward, etc.) as terminal plots.
+
+    The panel keeps a bounded history of metric dicts, lets the caller choose
+    which keys to draw and which step / value range to show, and re-renders a
+    plotext chart into the ``#metrics-plot`` Static on every change.
+    """
+
+    DEFAULT_CSS = """
+    MetricsPanel {
+        layout: vertical;
+        border: solid $warning;
+        padding: 0;
+    }
+
+    MetricsPanel > #metrics-title {
+        dock: top;
+        height: 1;
+        background: $warning;
+        color: $text;
+        text-align: center;
+    }
+
+    MetricsPanel > #metrics-hint {
+        dock: bottom;
+        height: 1;
+        color: $text-muted;
+        text-style: italic;
+        padding: 0 1;
+    }
+
+    MetricsPanel > #metrics-plot {
+        height: 1fr;
+        padding: 0 1;
+    }
+    """
+
+    # Max metric lines on a single chart
+    _MAX_DISPLAY_KEYS = 4
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._metrics_history: list[dict[str, Any]] = []
+        self._selected_keys: list[str] | None = None  # None = auto (first 3)
+        # (start, end) step bounds and (min, max) value bounds; None = unbounded
+        self._x_range: tuple[int | None, int | None] = (None, None)
+        self._y_range: tuple[float | None, float | None] = (None, None)
+
+    def compose(self) -> ComposeResult:
+        """Yield the title bar, the plot area and the hint line."""
+        yield Static('Metrics', id='metrics-title')
+        yield Static('No data yet...', id='metrics-plot')
+        yield Static('', id='metrics-hint')
+
+    def update_metrics(self, metrics: list[dict[str, Any]]) -> None:
+        """Replace all metrics data (keeping the last _MAX_HISTORY) and redraw."""
+        self._metrics_history = metrics[-_MAX_HISTORY:]
+        self._redraw()
+
+    def append_metrics(self, new_metrics: list[dict[str, Any]]) -> None:
+        """Append new metrics incrementally and redraw."""
+        self._metrics_history.extend(new_metrics)
+        # Trim to max history
+        if len(self._metrics_history) > _MAX_HISTORY:
+            self._metrics_history = self._metrics_history[-_MAX_HISTORY:]
+        self._redraw()
+
+    def zoom(self, x_start: int | None = None, x_end: int | None = None,
+             y_min: float | None = None, y_max: float | None = None) -> None:
+        """Zoom into a specific range of the chart (by step value, not index).
+
+        Any bound left as None stays open; 0 is a valid bound.
+        """
+        self._x_range = (x_start, x_end)
+        self._y_range = (y_min, y_max)
+        self._redraw()
+
+    def reset_zoom(self) -> None:
+        """Reset zoom to show all data."""
+        self._x_range = (None, None)
+        self._y_range = (None, None)
+        self._redraw()
+
+    def get_available_keys(self) -> list[str]:
+        """Return all plottable metric keys from current data.
+
+        Keys are taken from the first history entry only.
+        """
+        if not self._metrics_history:
+            return []
+        sample = self._metrics_history[0]
+        # Exclude metadata fields that are not meaningful as plotted metrics
+        exclude = ('step', 'ts', 'epoch', 'total_steps')
+        return [k for k in sample.keys() if k not in exclude]
+
+    def get_selected_keys(self) -> list[str]:
+        """Return the currently displayed metric keys.
+
+        In auto mode this is the first three available keys; otherwise it is
+        the explicit selection filtered down to keys that are still available.
+        """
+        available = self.get_available_keys()
+        if self._selected_keys is not None:
+            return [k for k in self._selected_keys if k in available]
+        return available[:3]
+
+    def select_keys(self, keys: list[str]) -> dict[str, Any]:
+        """Select which metrics to display. Returns status with available keys.
+
+        Args:
+            keys: Metric key names to show (max _MAX_DISPLAY_KEYS).
+                  Pass empty list to reset to auto-select mode.
+
+        Returns:
+            Dict with 'selected' (keys now displayed) and 'available' (all
+            plottable keys). If none of ``keys`` is available the panel falls
+            back to auto-select mode.
+        """
+        available = self.get_available_keys()
+        if not keys:
+            self._selected_keys = None
+        else:
+            # Validate & cap
+            valid = [k for k in keys if k in available]
+            self._selected_keys = valid[:self._MAX_DISPLAY_KEYS] if valid else None
+        self._redraw()
+        return {
+            'selected': self.get_selected_keys(),
+            'available': available,
+        }
+
+    def _redraw(self) -> None:
+        """Redraw the metrics plot and the hint line from the current state."""
+        plot_widget = self.query_one('#metrics-plot', Static)
+        hint_widget = self.query_one('#metrics-hint', Static)
+        if not self._metrics_history:
+            plot_widget.update('No data yet...')
+            hint_widget.update('')
+            return
+
+        available = self.get_available_keys()
+        if not available:
+            plot_widget.update('No plottable metrics.')
+            hint_widget.update('')
+            return
+
+        # Determine which keys to plot
+        display_keys = self.get_selected_keys()
+        hidden_keys = [k for k in available if k not in display_keys]
+
+        # Update hint with hidden metrics
+        if hidden_keys:
+            hint_widget.update(
+                f'[dim]Also available: {" ".join(hidden_keys)} '
+                f'(ask agent to switch)[/]'
+            )
+        else:
+            hint_widget.update('')
+
+        # Get plot size from widget dimensions
+        width = max(self.size.width - 4, 40)
+        height = max(self.size.height - 6, 8)  # account for hint line
+
+        plt.clf()
+        plt.plotsize(width, height)
+        plt.theme('dark')
+
+        # An entry without 'step' falls back to its index in the history.
+        steps = [m.get('step', i) for i, m in enumerate(self._metrics_history)]
+
+        # Apply x-range filter by step VALUE (not array index)
+        x_start, x_end = self._x_range
+        if x_start is not None or x_end is not None:
+            filtered = [
+                (s, m) for s, m in zip(steps, self._metrics_history)
+                if (x_start is None or s >= x_start) and (x_end is None or s <= x_end)
+            ]
+            if filtered:
+                steps, data_slice = zip(*filtered)
+                steps = list(steps)
+                data_slice = list(data_slice)
+            else:
+                plot_widget.update('No data in selected range.')
+                return
+        else:
+            data_slice = self._metrics_history
+
+        # Plot selected metrics (a key missing from an entry is drawn as 0)
+        for key in display_keys:
+            values = [m.get(key, 0) for m in data_slice]
+            if any(v is not None for v in values):
+                plt.plot(steps, values, label=key)
+
+        # Apply y-range. The bounds are passed straight through: the previous
+        # `y_min or plt.ylim()[0]` treated a bound of 0 as unset and then
+        # subscripted the result of plt.ylim().
+        # NOTE(review): relies on plotext.ylim treating None as "automatic" —
+        # confirm against the installed plotext version.
+        y_min, y_max = self._y_range
+        if y_min is not None or y_max is not None:
+            plt.ylim(y_min, y_max)
+
+        plt.title('Training Metrics')
+        plt.xlabel('Step')
+
+        # Render to string and convert ANSI to Rich Text for Textual
+        plot_str = plt.build()
+        plot_widget.update(Text.from_ansi(plot_str))
diff --git a/src/twinkle_client/tui/widgets/status_bar.py b/src/twinkle_client/tui/widgets/status_bar.py
new file mode 100644
index 000000000..bf9436d4c
--- /dev/null
+++ b/src/twinkle_client/tui/widgets/status_bar.py
@@ -0,0 +1,93 @@
+# Copyright (c) Twinkle Contributors. All rights reserved.
+"""Status bar widget - shows training progress and status."""
+
+from __future__ import annotations
+
+from textual.app import ComposeResult
+from textual.widgets import Static
+from textual.widget import Widget
+
+
+class StatusBar(Widget):
+    """Horizontal bar with five fields: state, run id, model, step and progress."""
+
+    DEFAULT_CSS = """
+    StatusBar {
+        layout: horizontal;
+        height: 3;
+        background: $surface;
+        border-bottom: solid $primary;
+        padding: 0 2;
+    }
+
+    StatusBar > .status-item {
+        width: auto;
+        padding: 0 2;
+        content-align: center middle;
+    }
+
+    StatusBar > #status-state {
+        color: $success;
+        text-style: bold;
+    }
+
+    StatusBar > #status-run {
+        color: $text-muted;
+    }
+
+    StatusBar > #status-model {
+        color: $text;
+    }
+
+    StatusBar > #status-step {
+        color: $warning;
+    }
+
+    StatusBar > #status-progress {
+        color: $accent;
+        width: 1fr;
+        text-align: right;
+    }
+    """
+
+    def compose(self) -> ComposeResult:
+        """Yield the five status fields with their initial placeholder text."""
+        yield Static('⏸ Idle', id='status-state', classes='status-item')
+        yield Static('Run: -', id='status-run', classes='status-item')
+        yield Static('Model: -', id='status-model', classes='status-item')
+        yield Static('Step: 0', id='status-step', classes='status-item')
+        yield Static('', id='status-progress', classes='status-item')
+
+    def update_status(
+        self,
+        state: str | None = None,
+        model: str | None = None,
+        step: int | None = None,
+        total_steps: int | None = None,
+        run_id: str | None = None,
+    ) -> None:
+        """Refresh the fields for which a value is given; None leaves a field as is.
+
+        Args:
+            state: One of 'training', 'paused', 'idle', 'error', 'completed';
+                any other value is shown as ``● <state>``.
+            model: Model name shown after ``Model:``.
+            step: Current step shown after ``Step:``.
+            total_steps: Total steps. The progress bar is redrawn only when both
+                ``step`` and ``total_steps`` are given; a non-positive total
+                shows 0%.
+            run_id: Run identifier shown after ``Run:``.
+        """
+        if state is not None:
+            known_labels = {
+                'training': '🚀 Training',
+                'paused': '⏸ Paused',
+                'idle': '⏸ Idle',
+                'error': '❌ Error',
+                'completed': '✅ Done',
+            }
+            label = known_labels.get(state, f'● {state}')
+            self.query_one('#status-state', Static).update(label)
+
+        if model is not None:
+            self.query_one('#status-model', Static).update(f'Model: {model}')
+
+        if run_id is not None:
+            self.query_one('#status-run', Static).update(f'Run: {run_id}')
+
+        if step is not None:
+            self.query_one('#status-step', Static).update(f'Step: {step}')
+
+        if total_steps is None or step is None:
+            return
+
+        # Percentage is capped at 100; the bar is 20 cells wide.
+        if total_steps > 0:
+            pct = min(100, int(step / total_steps * 100))
+        else:
+            pct = 0
+        bar_len = 20
+        filled = int(bar_len * pct / 100)
+        bar = '█' * filled + '░' * (bar_len - filled)
+        progress_text = f'[{bar}] {pct}% ({step}/{total_steps})'
+        self.query_one('#status-progress', Static).update(progress_text)
diff --git a/tests/advantage/__init__.py b/tests/advantage/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/advantage/test_advantage.py b/tests/advantage/test_advantage.py
new file mode 100644
index 000000000..1f1f2b51e
--- /dev/null
+++ b/tests/advantage/test_advantage.py
@@ -0,0 +1,193 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import pytest
+import torch
+
+from twinkle.advantage import GRPOAdvantage, RLOOAdvantage
+
+
+class TestGRPOAdvantage:
+
+    def setup_method(self):
+        self.grpo = GRPOAdvantage()
+
+    # --- basic shape / dtype ---
+
+    def test_output_shape_matches_input(self):
+        rewards = torch.tensor([1.0, 2.0, 3.0, 4.0])
+        result = self.grpo(rewards, num_generations=4)
+        assert result.shape == rewards.shape
+
+    def test_output_is_float32(self):
+        rewards = [1, 2, 3, 4]
+        result = self.grpo(rewards, num_generations=4)
+        assert result.dtype == torch.float32
+
+    def test_accepts_list_input(self):
+        rewards = [0.0, 1.0, 0.0, 1.0]
+        result = self.grpo(rewards, num_generations=4)
+        assert result.shape == (4,)
+
+    # --- scale='none' ---
+
+    def test_scale_none_subtracts_group_mean(self):
+        # Group [0, 1, 0, 1]: mean=0.5, advantages = [-0.5, 0.5, -0.5, 0.5]
+        rewards = torch.tensor([0.0, 1.0, 0.0, 1.0])
+        result = self.grpo(rewards, num_generations=4, scale='none')
+        expected = torch.tensor([-0.5, 0.5, -0.5, 0.5])
+        assert torch.allclose(result, expected, atol=1e-6)
+
+    def test_scale_none_two_groups(self):
+        # 2 prompts, 4 generations each
+        rewards = torch.tensor([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0])
+        result = self.grpo(rewards, num_generations=4, scale='none')
+        # Group 1: [0,1,2,3] mean=1.5 → [-1.5, -0.5, 0.5, 1.5]
+        # Group 2: [4,5,6,7] mean=5.5 → [-1.5, -0.5, 0.5, 1.5]
+        expected = torch.tensor([-1.5, -0.5, 0.5, 1.5, -1.5, -0.5, 0.5, 1.5])
+        assert torch.allclose(result, expected, atol=1e-6)
+
+    # --- scale='group' ---
+
+    def test_scale_group_divides_by_group_std(self):
+        rewards = torch.tensor([0.0, 2.0, 0.0, 2.0])
+        result = self.grpo(rewards, num_generations=4, scale='group')
+        # Group mean=1.0 → raw advantages [-1, 1, -1, 1]; dividing by the group std
+        # (≈1.1547) may only rescale them by a single positive factor.
+        assert result.shape == (4,) and result[1] > 0
+        assert torch.allclose(result, result[1] * torch.tensor([-1.0, 1.0, -1.0, 1.0]), atol=1e-6)
+
+    def test_scale_group_zero_std_handled(self):
+        # All same reward → std=0, division by eps=1e-8
+        rewards = torch.tensor([5.0, 5.0, 5.0, 5.0])
+        result = self.grpo(rewards, num_generations=4, scale='group')
+        # advantages_raw = [0,0,0,0], still 0 after division
+        assert torch.allclose(result, torch.zeros(4), atol=1e-5)
+
+    # --- scale='batch' ---
+
+    def test_scale_batch_uses_batch_std(self):
+        result = self.grpo(torch.tensor([0.0, 2.0, 4.0, 6.0]), num_generations=4, scale='batch')
+        # Mean 3.0 → raw [-3, -1, 1, 3]; std scaling must keep that pattern up to a positive factor.
+        assert result[2] > 0 and torch.allclose(result, result[2] * torch.tensor([-3.0, -1.0, 1.0, 3.0]), atol=1e-6)
+
+    # --- num_generations=1 ---
+
+    def test_num_generations_1_scale_none(self):
+        rewards = torch.tensor([1.0, 3.0, 5.0])
+        result = self.grpo(rewards, num_generations=1, scale='none')
+        # mean=3.0, advantages = [-2, 0, 2]
+        expected = torch.tensor([-2.0, 0.0, 2.0])
+        assert torch.allclose(result, expected, atol=1e-6)
+
+    def test_num_generations_1_scale_batch(self):
+        rewards = torch.tensor([1.0, 3.0, 5.0])
+        result = self.grpo(rewards, num_generations=1, scale='batch')
+        # mean=3.0, std≈2.0 → raw [-2, 0, 2] rescaled by one positive factor
+        assert result[2] > 0 and torch.allclose(result, result[2] * torch.tensor([-1.0, 0.0, 1.0]), atol=1e-6)
+
+    def test_num_generations_1_scale_default(self):
+        """Default scale for num_generations=1 is 'group', which returns raw rewards."""
+        rewards = torch.tensor([1.0, 2.0, 3.0])
+        result = self.grpo(rewards, num_generations=1)
+        assert torch.allclose(result, rewards)
+
+    # --- multi-dim rewards ---
+
+    def test_multi_dim_rewards_summed(self):
+        rewards = torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]])
+        result = self.grpo(rewards, num_generations=4, scale='none')
+        # summed: [3, 7, 11, 15], mean=9, advantages = [-6, -2, 2, 6]
+        expected = torch.tensor([-6.0, -2.0, 2.0, 6.0])
+        assert torch.allclose(result, expected, atol=1e-6)
+
+    # --- error handling ---
+
+    def test_invalid_num_generations_zero(self):
+        with pytest.raises(ValueError):
+            self.grpo(torch.tensor([1.0, 2.0]), num_generations=0)
+
+    def test_invalid_num_generations_mismatch(self):
+        with pytest.raises(ValueError):
+            self.grpo(torch.tensor([1.0, 2.0, 3.0]), num_generations=2)
+
+    def test_single_element_with_num_gen_1(self):
+        result = self.grpo(torch.tensor([5.0]), num_generations=1)
+        assert torch.allclose(result, torch.tensor([5.0]))
+
+
+class TestRLOOAdvantage:
+
+    def setup_method(self):
+        self.rloo = RLOOAdvantage()
+
+    # --- basic shape / dtype ---
+
+    def test_output_shape_matches_input(self):
+        rewards = torch.tensor([1.0, 2.0, 3.0, 4.0])
+        result = self.rloo(rewards, num_generations=4)
+        assert result.shape == rewards.shape
+
+    def test_accepts_list_input(self):
+        rewards = [1.0, 2.0, 3.0, 4.0]
+        result = self.rloo(rewards, num_generations=4)
+        assert result.shape == (4,)
+
+    # --- RLOO formula ---
+
+    def test_rloo_leave_one_out_baseline(self):
+        # K=4, rewards=[0,0,0,1]
+        # For reward=0: baseline = 1/3 ≈ 0.333, advantage ≈ -0.333
+        # For reward=1: baseline = 0/3 = 0,     advantage = 1
+        rewards = torch.tensor([0.0, 0.0, 0.0, 1.0])
+        result = self.rloo(rewards, num_generations=4, scale='none')
+        assert torch.allclose(result[0], torch.tensor(-1.0 / 3), atol=1e-5)
+        assert torch.allclose(result[3], torch.tensor(1.0), atol=1e-5)
+
+    def test_rloo_all_same_rewards(self):
+        rewards = torch.tensor([5.0, 5.0, 5.0, 5.0])
+        result = self.rloo(rewards, num_generations=4, scale='none')
+        # All advantages should be 0
+        assert torch.allclose(result, torch.zeros(4), atol=1e-5)
+
+    def test_rloo_two_groups(self):
+        rewards = torch.tensor([1.0, 3.0, 5.0, 7.0])
+        result = self.rloo(rewards, num_generations=2, scale='none')
+        # Group1 [1,3]: baseline_1=3, adv_1=-2; baseline_2=1, adv_2=2
+        # Group2 [5,7]: baseline_1=7, adv_1=-2; baseline_2=5, adv_2=2
+        expected = torch.tensor([-2.0, 2.0, -2.0, 2.0])
+        assert torch.allclose(result, expected, atol=1e-5)
+
+    # --- scale modes ---
+
+    def test_scale_group(self):
+        rewards = torch.tensor([0.0, 1.0, 2.0, 3.0])
+        result = self.rloo(rewards, num_generations=4, scale='group')
+        assert result.shape == (4,)
+
+    def test_scale_batch(self):
+        rewards = torch.tensor([0.0, 1.0, 2.0, 3.0])
+        result = self.rloo(rewards, num_generations=4, scale='batch')
+        assert result.shape == (4,)
+
+    # --- error handling ---
+
+    def test_invalid_num_generations_one(self):
+        with pytest.raises(ValueError):
+            self.rloo(torch.tensor([1.0, 2.0]), num_generations=1)
+
+    def test_invalid_num_generations_zero(self):
+        with pytest.raises(ValueError):
+            self.rloo(torch.tensor([1.0, 2.0]), num_generations=0)
+
+    def test_invalid_num_generations_mismatch(self):
+        with pytest.raises(ValueError):
+            self.rloo(torch.tensor([1.0, 2.0, 3.0]), num_generations=2)
+
+    # --- multi-dim ---
+
+    def test_multi_dim_rewards_summed(self):
+        rewards = torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]])
+        result = self.rloo(rewards, num_generations=4, scale='none')
+        # summed: [3, 7, 11, 15], sum=36; adv_i = r_i - (36 - r_i) / 3
+        # e.g. adv_3 = 3 - 11 = -8 and adv_15 = 15 - 7 = 8
+        expected = torch.tensor([-8.0, -8.0 / 3, 8.0 / 3, 8.0])
+        assert torch.allclose(result, expected, atol=1e-5)
diff --git a/tests/loss/__init__.py b/tests/loss/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/loss/test_ce_mse.py b/tests/loss/test_ce_mse.py
new file mode 100644
index 000000000..c3c3d2db5
--- /dev/null
+++ b/tests/loss/test_ce_mse.py
@@ -0,0 +1,180 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Tests for loss functions: CrossEntropy, ChunkedCrossEntropy, MSE."""
+import pytest
+import torch
+import torch.nn.functional as F
+
+from twinkle.loss import CrossEntropyLoss, ChunkedCrossEntropyLoss, MSELoss
+
+
+class TestCrossEntropyLoss:
+
+    def test_basic_ce_loss(self):
+        """The logits path returns a dict with a non-negative scalar under 'loss'."""
+        criterion = CrossEntropyLoss()
+        logits = torch.randn(4, 10)
+        labels = torch.randint(0, 10, (4,))
+        out = criterion({'labels': labels}, {'logits': logits})
+        assert isinstance(out, dict) and 'loss' in out
+        loss = out['loss']
+        assert loss.dim() == 0  # scalar
+        assert loss.item() >= 0
+
+    def test_ce_from_logps(self):
+        """Pre-computed per-token log-probs are accepted in place of logits."""
+        criterion = CrossEntropyLoss()
+        logits = torch.randn(4, 10)
+        labels = torch.randint(0, 10, (4,))
+        # Log-probability of each row's label token.
+        token_logps = F.log_softmax(logits, dim=-1).gather(-1, labels.unsqueeze(-1)).squeeze(-1)
+        out = criterion({'labels': labels}, {'logps': token_logps})
+        assert out['loss'].item() >= 0
+
+    def test_ce_with_ignore_index(self):
+        """Labels equal to ``ignore_index`` may be mixed with valid ones."""
+        criterion = CrossEntropyLoss(ignore_index=-100)
+        logits = torch.randn(4, 10)
+        # Two of the four positions carry the ignore marker.
+        labels = torch.tensor([0, -100, 5, -100])
+        out = criterion({'labels': labels}, {'logits': logits})
+        assert out['loss'].item() >= 0
+
+    def test_ce_all_ignored(self):
+        """A batch whose labels are all ``ignore_index`` is expected to give a loss of 0."""
+        criterion = CrossEntropyLoss(ignore_index=-100)
+        logits = torch.randn(4, 10)
+        # Every position carries the ignore marker.
+        labels = torch.tensor([-100] * 4)
+        out = criterion({'labels': labels}, {'logits': logits})
+        loss_value = out['loss'].item()
+        assert loss_value == pytest.approx(0.0, abs=1e-6)
+
+    def test_ce_reduction_sum(self):
+        """With ``reduction='sum'`` the result reports a positive 'num_tokens'."""
+        criterion = CrossEntropyLoss(reduction='sum')
+        logits = torch.randn(4, 10)
+        labels = torch.randint(0, 10, (4,))
+        out = criterion({'labels': labels}, {'logits': logits})
+        token_count = out['num_tokens'].item()
+        assert token_count > 0
+
+    def test_ce_dft_weighting(self):
+        """The loss built with ``dft=True`` stays non-negative."""
+        criterion = CrossEntropyLoss(dft=True)
+        logits = torch.randn(4, 10)
+        labels = torch.randint(0, 10, (4,))
+        out = criterion({'labels': labels}, {'logits': logits})
+        # DFT: -p*log(p) should always be non-negative
+        loss_value = out['loss'].item()
+        assert loss_value >= 0
+
+    def test_ce_logps_vs_logits_match(self):
+        """The logits path and the pre-computed logps path must agree on the loss."""
+        criterion = CrossEntropyLoss()
+        logits = torch.randn(4, 10)
+        labels = torch.randint(0, 10, (4,))
+
+        # Path 1: hand the loss a copy of the raw logits.
+        from_logits = criterion({'labels': labels}, {'logits': logits.clone()})['loss']
+
+        # Path 2: gather the label log-probs ourselves and pass only those.
+        token_logps = F.log_softmax(logits, dim=-1).gather(-1, labels.unsqueeze(-1)).squeeze(-1)
+        from_logps = criterion({'labels': labels}, {'logps': token_logps})['loss']
+
+        assert torch.allclose(from_logits, from_logps, atol=1e-5)
+
+
+class TestChunkedCrossEntropyLoss:
+
+    def test_chunked_matches_standard(self):
+        """The chunked loss must agree with the plain CrossEntropyLoss on the same batch."""
+        torch.manual_seed(42)
+        logits = torch.randn(8, 20)
+        labels = torch.randint(0, 20, (8,))
+
+        standard_fn = CrossEntropyLoss()
+        # chunk_size=3 is not a divisor of the 8 rows.
+        chunked_fn = ChunkedCrossEntropyLoss(chunk_size=3)
+
+        reference = standard_fn({'labels': labels}, {'logits': logits.clone()})['loss']
+        candidate = chunked_fn({'labels': labels.clone()}, {'logits': logits.clone()})['loss']
+        assert torch.allclose(reference, candidate, atol=1e-4)
+
+    def test_chunked_with_ignore_index(self):
+        criterion = ChunkedCrossEntropyLoss(chunk_size=2, ignore_index=-100)
+        preds = {'logits': torch.randn(6, 10)}
+        batch = {'labels': torch.tensor([0, -100, 5, 3, -100, 1])}
+        loss = criterion(batch, preds)['loss']
+        assert loss.item() >= 0
+
+    def test_chunked_reduction_sum(self):
+        criterion = ChunkedCrossEntropyLoss(chunk_size=4, reduction='sum')
+        logits = torch.randn(8, 10)
+        targets = torch.randint(0, 10, (8,))
+        token_count = criterion({'labels': targets}, {'logits': logits})['num_tokens'].item()
+        assert token_count > 0
+
+    def test_chunked_from_logps(self):
+        """Given pre-computed logps, the chunked loss must equal the standard logps-path loss."""
+        torch.manual_seed(42)
+        logits = torch.randn(4, 10)
+        labels = torch.randint(0, 10, (4,))
+        # Log-probability of each row's label token.
+        token_logps = F.log_softmax(logits, dim=-1).gather(-1, labels.unsqueeze(-1)).squeeze(-1)
+
+        standard_fn = CrossEntropyLoss()
+        chunked_fn = ChunkedCrossEntropyLoss(chunk_size=2)
+
+        reference = standard_fn({'labels': labels}, {'logps': token_logps.clone()})['loss']
+        candidate = chunked_fn({'labels': labels.clone()}, {'logps': token_logps.clone()})['loss']
+        assert torch.allclose(reference, candidate, atol=1e-5)
+
+    def test_chunked_dft(self):
+        criterion = ChunkedCrossEntropyLoss(chunk_size=2, dft=True)
+        logits = torch.randn(4, 10)
+        targets = torch.randint(0, 10, (4,))
+        loss = criterion({'labels': targets}, {'logits': logits})['loss']
+        assert loss.item() >= 0
+
+    def test_chunked_invalid_chunk_size(self):
+        with pytest.raises(AssertionError):
+            ChunkedCrossEntropyLoss(chunk_size=0)
+
+    def test_chunked_invalid_reduction(self):
+        with pytest.raises(AssertionError):
+            ChunkedCrossEntropyLoss(chunk_size=1, reduction='max')
+
+    def test_chunked_gradient_flow(self):
+        """Backward through the chunked loss must populate a logits-shaped gradient."""
+        criterion = ChunkedCrossEntropyLoss(chunk_size=2)
+        logits = torch.randn(4, 10, requires_grad=True)
+        targets = torch.randint(0, 10, (4,))
+        loss = criterion({'labels': targets}, {'logits': logits})['loss']
+        loss.backward()
+        grad = logits.grad
+        assert grad is not None and grad.shape == logits.shape
+
+
+class TestMSELoss:
+
+    def test_basic_mse(self):
+        criterion = MSELoss()
+        predictions = torch.randn(4, 3)
+        targets = torch.randn(4, 3)
+        out = criterion({'labels': targets}, {'logits': predictions})
+        assert isinstance(out, dict) and 'loss' in out
+        assert out['loss'].item() >= 0
+
+    def test_mse_zero_when_equal(self):
+        criterion = MSELoss()
+        shared = torch.randn(4, 3)
+        loss = criterion({'labels': shared}, {'logits': shared})['loss']
+        assert loss.item() == pytest.approx(0.0, abs=1e-6)
+
+    def test_mse_known_value(self):
+        criterion = MSELoss()
+        predictions = torch.tensor([[1.0, 2.0]])
+        targets = torch.tensor([[3.0, 5.0]])
+        expected = ((1 - 3) ** 2 + (2 - 5) ** 2) / 2  # mean of squared errors 4 and 9 = 6.5
+        out = criterion({'labels': targets}, {'logits': predictions})
+        assert out['loss'].item() == pytest.approx(expected, abs=1e-5)
diff --git a/tests/loss/test_dpo.py b/tests/loss/test_dpo.py
new file mode 100644
index 000000000..f12d5ea3a
--- /dev/null
+++ b/tests/loss/test_dpo.py
@@ -0,0 +1,184 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Tests for DPO family losses: DPO, SimPO, CPO, ORPO."""
+import pytest
+import torch
+import torch.nn.functional as F
+
+from twinkle.loss import DPOLoss, SimPOLoss, CPOLoss, ORPOLoss
+
+
+def _make_preference_batch(batch_size=4, seq_len=8, vocab_size=20, with_ref=True):
+    """Build a seeded, synthetic preference batch for the DPO-family loss tests.
+
+    Rows are interleaved as [chosen_0, rejected_0, chosen_1, rejected_1, ...].
+    Returns ``(inputs, outputs, ref_logps)``; ``ref_logps`` is None when ``with_ref`` is False.
+    """
+    torch.manual_seed(42)
+    logits = torch.randn(batch_size, seq_len, vocab_size)
+    labels = torch.randint(0, vocab_size, (batch_size, seq_len))
+    # Second half of every row is flagged with the ignore marker.
+    labels[:, seq_len // 2:] = -100
+    # -100 cannot index the vocab axis, so ignored slots gather token 0 instead.
+    gather_idx = labels.clamp(min=0).unsqueeze(-1)
+
+    def _token_logps(raw_logits):
+        return F.log_softmax(raw_logits, dim=-1).gather(-1, gather_idx).squeeze(-1)
+
+    inputs = {'labels': labels}
+    outputs = {'logps': _token_logps(logits), 'logits': logits}
+    if with_ref:
+        ref_logps = _token_logps(torch.randn(batch_size, seq_len, vocab_size))
+    else:
+        ref_logps = None
+    return inputs, outputs, ref_logps
+
+
+class TestDPOLoss:
+
+    def test_basic_dpo_sigmoid(self):
+        loss_fn = DPOLoss(beta=0.1, loss_type='sigmoid')
+        inputs, outputs, ref_logps = _make_preference_batch()
+        result = loss_fn(inputs, outputs, ref_logps=ref_logps)
+        assert isinstance(result, dict) and 'loss' in result
+        assert result['loss'].dim() == 0 and torch.isfinite(result['loss'])
+
+    def test_dpo_hinge(self):
+        loss_fn = DPOLoss(beta=0.1, loss_type='hinge')
+        inputs, outputs, ref_logps = _make_preference_batch()
+        result = loss_fn(inputs, outputs, ref_logps=ref_logps)
+        assert result['loss'].dim() == 0 and torch.isfinite(result['loss'])
+
+    def test_dpo_ipo(self):
+        loss_fn = DPOLoss(beta=0.1, loss_type='ipo')
+        inputs, outputs, ref_logps = _make_preference_batch()
+        result = loss_fn(inputs, outputs, ref_logps=ref_logps)
+        assert result['loss'].dim() == 0 and torch.isfinite(result['loss'])
+
+    def test_dpo_kto_pair(self):
+        loss_fn = DPOLoss(beta=0.1, loss_type='kto_pair')
+        inputs, outputs, ref_logps = _make_preference_batch()
+        result = loss_fn(inputs, outputs, ref_logps=ref_logps)
+        assert result['loss'].dim() == 0 and torch.isfinite(result['loss'])
+
+    def test_dpo_reference_free(self):
+        loss_fn = DPOLoss(beta=0.1, reference_free=True)
+        inputs, outputs, _ = _make_preference_batch(with_ref=False)
+        result = loss_fn(inputs, outputs)
+        assert result['loss'].dim() == 0 and torch.isfinite(result['loss'])
+
+    def test_dpo_no_ref_returns_zero(self):
+        """Without ref_logps and not reference_free, loss should be zero."""
+        loss_fn = DPOLoss(beta=0.1)
+        inputs, outputs, _ = _make_preference_batch(with_ref=False)
+        result = loss_fn(inputs, outputs)
+        assert result['loss'].item() == pytest.approx(0.0, abs=1e-6)
+
+    def test_dpo_with_label_smoothing(self):
+        loss_fn = DPOLoss(beta=0.1, label_smoothing=0.1, loss_type='sigmoid')
+        inputs, outputs, ref_logps = _make_preference_batch()
+        result = loss_fn(inputs, outputs, ref_logps=ref_logps)
+        assert result['loss'].dim() == 0 and torch.isfinite(result['loss'])
+
+    def test_dpo_with_sft_weight(self):
+        loss_fn = DPOLoss(beta=0.1, sft_weight=0.1)
+        inputs, outputs, ref_logps = _make_preference_batch()
+        result = loss_fn(inputs, outputs, ref_logps=ref_logps)
+        assert result['loss'].dim() == 0 and torch.isfinite(result['loss'])
+
+    def test_dpo_precomputed_ref_chosen_rejected(self):
+        loss_fn = DPOLoss(beta=0.1)
+        inputs, outputs, ref_logps = _make_preference_batch()
+        # Compute sequence-level ref logps for chosen/rejected
+        labels = inputs['labels']
+        loss_mask = (labels != -100).float()
+        chosen_ref = (ref_logps[0::2] * loss_mask[0::2]).sum(dim=-1)
+        rejected_ref = (ref_logps[1::2] * loss_mask[1::2]).sum(dim=-1)
+        result = loss_fn(inputs, outputs, ref_chosen_logps=chosen_ref, ref_rejected_logps=rejected_ref)
+        assert result['loss'].dim() == 0 and torch.isfinite(result['loss'])
+
+    def test_dpo_invalid_loss_type(self):
+        with pytest.raises(ValueError, match='Unknown loss_type'):
+            DPOLoss(loss_type='invalid_type')
+
+    def test_dpo_label_smoothing_non_sigmoid_raises(self):
+        with pytest.raises(ValueError, match='label_smoothing'):
+            DPOLoss(label_smoothing=0.1, loss_type='hinge')
+
+    def test_dpo_odd_batch_raises(self):
+        loss_fn = DPOLoss(beta=0.1)
+        labels = torch.randint(0, 10, (3, 5))
+        logps = torch.randn(3, 5)
+        with pytest.raises(AssertionError, match='even'):
+            loss_fn({'labels': labels}, {'logps': logps})
+
+    def test_dpo_from_logits(self):
+        """Test with logits instead of pre-computed logps."""
+        loss_fn = DPOLoss(beta=0.1, reference_free=True)
+        inputs, outputs, _ = _make_preference_batch(with_ref=False)
+        # Remove logps, force computation from logits
+        outputs_no_logps = {'logits': outputs['logits']}
+        result = loss_fn(inputs, outputs_no_logps)
+        assert result['loss'].dim() == 0 and torch.isfinite(result['loss'])
+
+
+class TestSimPOLoss:
+
+    def test_basic_simpo(self):
+        criterion = SimPOLoss(beta=2.5, gamma=0.5)
+        inputs, outputs, _ = _make_preference_batch(with_ref=False)
+        out = criterion(inputs, outputs)
+        assert isinstance(out, dict) and 'loss' in out
+        assert out['loss'].dim() == 0
+
+    def test_simpo_loss_non_negative(self):
+        criterion = SimPOLoss()
+        inputs, outputs, _ = _make_preference_batch(with_ref=False)
+        # -log(sigmoid(x)) is always >= 0
+        loss = criterion(inputs, outputs)['loss']
+        assert loss.item() >= 0
+
+    def test_simpo_odd_batch_raises(self):
+        criterion = SimPOLoss()
+        # Three rows cannot be split into chosen/rejected pairs.
+        odd_inputs, odd_outputs = {'labels': torch.randint(0, 10, (3, 5))}, {'logps': torch.randn(3, 5)}
+        with pytest.raises(AssertionError):
+            criterion(odd_inputs, odd_outputs)
+
+
+class TestCPOLoss:
+
+    def test_basic_cpo(self):
+        loss_fn = CPOLoss(beta=0.1, bc_coef=1.0)
+        inputs, outputs, _ = _make_preference_batch(with_ref=False)
+        result = loss_fn(inputs, outputs)
+        assert isinstance(result, dict) and 'loss' in result
+        assert result['loss'].dim() == 0 and torch.isfinite(result['loss'])
+
+    def test_cpo_zero_bc_coef(self):
+        loss_fn = CPOLoss(beta=0.1, bc_coef=0.0)
+        inputs, outputs, _ = _make_preference_batch(with_ref=False)
+        result = loss_fn(inputs, outputs)
+        assert result['loss'].dim() == 0 and torch.isfinite(result['loss'])
+
+
+class TestORPOLoss:
+
+    def test_basic_orpo(self):
+        criterion = ORPOLoss(lambda_orpo=0.1)
+        inputs, outputs, _ = _make_preference_batch(with_ref=False)
+        out = criterion(inputs, outputs)
+        assert isinstance(out, dict) and 'loss' in out
+        assert out['loss'].dim() == 0
+
+    def test_orpo_loss_finite(self):
+        criterion = ORPOLoss(lambda_orpo=0.1)
+        inputs, outputs, _ = _make_preference_batch(with_ref=False)
+        loss = criterion(inputs, outputs)['loss']
+        assert torch.isfinite(loss)
+
+    def test_orpo_odd_batch_raises(self):
+        criterion = ORPOLoss()
+        # Three rows cannot be split into chosen/rejected pairs.
+        odd_inputs, odd_outputs = {'labels': torch.randint(0, 10, (3, 5))}, {'logps': torch.randn(3, 5)}
+        with pytest.raises(AssertionError):
+            criterion(odd_inputs, odd_outputs)
diff --git a/tests/loss/test_grpo_gkd.py b/tests/loss/test_grpo_gkd.py
new file mode 100644
index 000000000..541b867a0
--- /dev/null
+++ b/tests/loss/test_grpo_gkd.py
@@ -0,0 +1,248 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Tests for GRPO family losses and GKD loss."""
+import pytest
+import torch
+import torch.nn.functional as F
+
+from twinkle.loss import GRPOLoss, GSPOLoss, SAPOLoss, CISPOLoss, BNPOLoss, DRGRPOLoss, GKDLoss
+
+
+def _make_rl_batch(batch_size=4, seq_len=8, vocab_size=20):
+    """Build a seeded RL batch: returns (inputs, outputs, old_logps, ref_logps, advantages)."""
+    torch.manual_seed(42)
+    logits = torch.randn(batch_size, seq_len, vocab_size)
+    labels = torch.randint(0, vocab_size, (batch_size, seq_len))
+    # Flag the second half of every sequence with the ignore marker.
+    labels[:, seq_len // 2:] = -100
+
+    # Ignored slots gather token 0, since -100 cannot index the vocab axis.
+    gather_idx = torch.where(labels != -100, labels, torch.zeros_like(labels))
+    logps = F.log_softmax(logits, dim=-1).gather(-1, gather_idx.unsqueeze(-1)).squeeze(-1)
+    # old/ref log-probs: the detached current log-probs plus small Gaussian noise.
+    frozen = logps.detach()
+    old_logps = frozen + torch.randn_like(logps) * 0.1
+    ref_logps = frozen + torch.randn_like(logps) * 0.05
+    advantages = torch.randn(batch_size, 1)
+
+    inputs = {'labels': labels}
+    outputs = {'logps': logps, 'logits': logits}
+    return inputs, outputs, old_logps, ref_logps, advantages
+
+
+class TestGRPOLoss:
+
+    def test_basic_grpo_loss(self):
+        criterion = GRPOLoss(epsilon=0.2)
+        inputs, outputs, old_logps, ref_logps, advantages = _make_rl_batch()
+        out = criterion(inputs, outputs, old_logps=old_logps, ref_logps=ref_logps, advantages=advantages)
+        assert isinstance(out, dict) and 'loss' in out
+        loss = out['loss']
+        assert loss.dim() == 0 and torch.isfinite(loss)
+
+    def test_grpo_no_advantages_returns_zero(self):
+        """Calling the loss with ``advantages=None`` is expected to give a zero loss."""
+        criterion = GRPOLoss()
+        inputs, outputs, *_ = _make_rl_batch()
+        loss = criterion(inputs, outputs, advantages=None)['loss']
+        assert loss.item() == pytest.approx(0.0, abs=1e-6)
+
+    def test_grpo_no_old_logps_uses_current(self):
+        """``old_logps=None`` is accepted and still yields a finite loss."""
+        criterion = GRPOLoss(epsilon=0.2)
+        inputs, outputs, _, ref_logps, advantages = _make_rl_batch()
+        out = criterion(inputs, outputs, old_logps=None, ref_logps=ref_logps, advantages=advantages)
+        assert torch.isfinite(out['loss'])
+
+    def test_grpo_kl_penalty(self):
+        """``beta > 0`` combined with ``ref_logps`` yields a finite loss."""
+        criterion = GRPOLoss(epsilon=0.2, beta=0.01)
+        inputs, outputs, old_logps, ref_logps, advantages = _make_rl_batch()
+        out = criterion(inputs, outputs, old_logps=old_logps, ref_logps=ref_logps, advantages=advantages)
+        assert torch.isfinite(out['loss'])
+
+    def test_grpo_no_kl_when_no_ref(self):
+        criterion = GRPOLoss(epsilon=0.2, beta=0.01)
+        inputs, outputs, old_logps, _, advantages = _make_rl_batch()
+        out = criterion(inputs, outputs, old_logps=old_logps, ref_logps=None, advantages=advantages)
+        assert torch.isfinite(out['loss'])
+
+    def test_grpo_from_logits(self):
+        """Outputs that carry only logits (no 'logps' key) are accepted."""
+        criterion = GRPOLoss(epsilon=0.2)
+        inputs, outputs, _, _, advantages = _make_rl_batch()
+        # Strip everything but the raw logits from the model outputs.
+        out = criterion(inputs, {'logits': outputs['logits']}, advantages=advantages)
+        assert torch.isfinite(out['loss'])
+
+    def test_grpo_list_advantages(self):
+        """A plain list of floats is accepted for ``advantages``."""
+        criterion = GRPOLoss(epsilon=0.2)
+        inputs, outputs, old_logps, _, _ = _make_rl_batch()
+        # One advantage per row of the default 4-row batch.
+        out = criterion(inputs, outputs, old_logps=old_logps, advantages=[1.0, -1.0, 0.5, -0.5])
+        assert torch.isfinite(out['loss'])
+
+    def test_grpo_entropy_coef(self):
+        criterion = GRPOLoss(epsilon=0.2, entropy_coef=0.01)
+        inputs, outputs, old_logps, _, advantages = _make_rl_batch()
+        outputs_with_ent = dict(outputs, entropies=torch.rand(4, 8))
+        out = criterion(inputs, outputs_with_ent, old_logps=old_logps, advantages=advantages)
+        assert torch.isfinite(out['loss'])
+
+    def test_grpo_entropy_coef_requires_entropies(self):
+        criterion = GRPOLoss(entropy_coef=0.01)
+        inputs, outputs, old_logps, _, advantages = _make_rl_batch()
+        with pytest.raises(AssertionError, match='entropies'):
+            criterion(inputs, outputs, old_logps=old_logps, advantages=advantages)
+
+
+class TestGSPOLoss:
+
+    def test_basic_gspo(self):
+        criterion = GSPOLoss(epsilon=0.2)
+        inputs, outputs, old_logps, ref_logps, advantages = _make_rl_batch()
+        out = criterion(inputs, outputs, old_logps=old_logps, ref_logps=ref_logps, advantages=advantages)
+        assert isinstance(out, dict)
+        assert 'loss' in out and torch.isfinite(out['loss'])
+
+
+class TestSAPOLoss:
+
+    def test_basic_sapo(self):
+        criterion = SAPOLoss(epsilon=0.2, tau_pos=1.0, tau_neg=1.0)
+        inputs, outputs, old_logps, ref_logps, advantages = _make_rl_batch()
+        out = criterion(inputs, outputs, old_logps=old_logps, ref_logps=ref_logps, advantages=advantages)
+        assert isinstance(out, dict)
+        assert 'loss' in out and torch.isfinite(out['loss'])
+
+
+class TestCISPOLoss:
+
+    def test_basic_cispo(self):
+        criterion = CISPOLoss(epsilon=0.2)
+        inputs, outputs, old_logps, ref_logps, advantages = _make_rl_batch()
+        out = criterion(inputs, outputs, old_logps=old_logps, ref_logps=ref_logps, advantages=advantages)
+        assert isinstance(out, dict)
+        assert 'loss' in out and torch.isfinite(out['loss'])
+
+
+class TestBNPOLoss:
+
+    def test_basic_bnpo(self):
+        criterion = BNPOLoss(epsilon=0.2)
+        inputs, outputs, old_logps, ref_logps, advantages = _make_rl_batch()
+        out = criterion(inputs, outputs, old_logps=old_logps, ref_logps=ref_logps, advantages=advantages)
+        assert isinstance(out, dict)
+        assert 'loss' in out and torch.isfinite(out['loss'])
+
+
+class TestDRGRPOLoss:
+
+    def test_basic_dr_grpo(self):
+        criterion = DRGRPOLoss(epsilon=0.2, max_completion_length=10)
+        inputs, outputs, old_logps, ref_logps, advantages = _make_rl_batch()
+        out = criterion(inputs, outputs, old_logps=old_logps, ref_logps=ref_logps, advantages=advantages)
+        assert isinstance(out, dict)
+        assert 'loss' in out and torch.isfinite(out['loss'])
+
+
+class TestGKDLoss:
+
+    def test_basic_gkd_full_teacher(self):
+        """GKD with full-vocabulary teacher logits."""
+        loss_fn = GKDLoss(beta=0.5, temperature=2.0, chunk_size=64)
+        torch.manual_seed(42)
+        batch, seq, vocab = 2, 4, 16
+        student_logits = torch.randn(batch, seq, vocab)
+        teacher_logits = torch.randn(batch, seq, vocab)
+        labels = torch.randint(0, vocab, (batch, seq))
+        labels[:, seq // 2:] = -100
+
+        inputs = {'labels': labels}
+        outputs = {'logits': student_logits}
+        result = loss_fn(inputs, outputs, teacher_logits=teacher_logits)
+        assert isinstance(result, dict) and 'loss' in result
+        assert torch.isfinite(result['loss'])
+
+    def test_gkd_topk_teacher(self):
+        """GKD with top-k reduced teacher logits."""
+        loss_fn = GKDLoss(beta=0.5, temperature=2.0, chunk_size=64, topk=5)
+        torch.manual_seed(42)
+        batch, seq, vocab = 2, 4, 16
+        student_logits = torch.randn(batch, seq, vocab)
+        teacher_logits = torch.randn(batch, seq, vocab)
+        labels = torch.randint(0, vocab, (batch, seq))
+        labels[:, seq // 2:] = -100
+
+        inputs = {'labels': labels}
+        outputs = {'logits': student_logits}
+        result = loss_fn(inputs, outputs, teacher_logits=teacher_logits, topk=5)
+        assert torch.isfinite(result['loss'])
+
+    def test_gkd_remote_api_teacher(self):
+        """GKD with remote API teacher (topk logprobs + indices)."""
+        loss_fn = GKDLoss(beta=0.5, temperature=2.0, chunk_size=64)
+        torch.manual_seed(42)
+        batch, seq, vocab, k = 2, 4, 16, 5
+        student_logits = torch.randn(batch, seq, vocab)
+        # Take the top-k of a real distribution: indices are distinct and every log-prob is <= 0.
+        teacher_topk_logprobs, teacher_topk_indices = torch.randn(batch, seq, vocab).log_softmax(-1).topk(k, dim=-1)
+        labels = torch.randint(0, vocab, (batch, seq))
+        labels[:, seq // 2:] = -100
+
+        inputs = {'labels': labels}
+        outputs = {'logits': student_logits}
+        result = loss_fn(inputs, outputs,
+                         teacher_topk_logprobs=teacher_topk_logprobs,
+                         teacher_topk_indices=teacher_topk_indices)
+        assert torch.isfinite(result['loss'])
+
+    def test_gkd_forward_kl(self):
+        """beta=0 → forward KL(S || T)."""
+        loss_fn = GKDLoss(beta=0.0, temperature=1.0, chunk_size=64)
+        torch.manual_seed(42)
+        batch, seq, vocab = 2, 4, 16
+        student_logits = torch.randn(batch, seq, vocab)
+        teacher_logits = torch.randn(batch, seq, vocab)
+        labels = torch.randint(0, vocab, (batch, seq))
+
+        result = loss_fn({'labels': labels}, {'logits': student_logits}, teacher_logits=teacher_logits)
+        assert torch.isfinite(result['loss'])
+
+    def test_gkd_reverse_kl(self):
+        """beta=1 → reverse KL(T || S)."""
+        loss_fn = GKDLoss(beta=1.0, temperature=1.0, chunk_size=64)
+        torch.manual_seed(42)
+        batch, seq, vocab = 2, 4, 16
+        student_logits = torch.randn(batch, seq, vocab)
+        teacher_logits = torch.randn(batch, seq, vocab)
+        labels = torch.randint(0, vocab, (batch, seq))
+
+        result = loss_fn({'labels': labels}, {'logits': student_logits}, teacher_logits=teacher_logits)
+        assert torch.isfinite(result['loss'])
+
+    def test_gkd_invalid_beta(self):
+        with pytest.raises(ValueError, match='beta'):
+            GKDLoss(beta=-0.1)
+
+    def test_gkd_invalid_temperature(self):
+        with pytest.raises(ValueError, match='temperature'):
+            GKDLoss(temperature=0)
+
+    def test_gkd_no_teacher_raises(self):
+        loss_fn = GKDLoss()
+        labels = torch.randint(0, 10, (2, 4))
+        with pytest.raises(AssertionError):
+            loss_fn({'labels': labels}, {'logits': torch.randn(2, 4, 10)})
+
+    def test_gkd_no_ignored_tokens(self):
+        """When all tokens are valid."""
+        loss_fn = GKDLoss(beta=0.5, temperature=2.0, chunk_size=64)
+        torch.manual_seed(42)
+        batch, seq, vocab = 2, 4, 16
+        student_logits = torch.randn(batch, seq, vocab)
+        teacher_logits = torch.randn(batch, seq, vocab)
+        labels = torch.randint(0, vocab, (batch, seq))
+
+        result = loss_fn({'labels': labels}, {'logits': student_logits}, teacher_logits=teacher_logits)
+        assert torch.isfinite(result['loss'])
diff --git a/tests/metric/__init__.py b/tests/metric/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/metric/test_metrics.py b/tests/metric/test_metrics.py
new file mode 100644
index 000000000..e4651ee86
--- /dev/null
+++ b/tests/metric/test_metrics.py
@@ -0,0 +1,420 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Tests for metric classes: Accuracy, LossMetric, TrainMetric, CompletionRewardMetric, DPOMetric, GRPOMetric, GSPOMetric, CISPOMetric, EmbeddingMetric."""
+import time
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from twinkle.metric import (
+    Accuracy,
+    CompletionRewardMetric,
+    DPOMetric,
+    EmbeddingMetric,
+    GRPOMetric,
+    GSPOMetric,
+    CISPOMetric,
+    LossMetric,
+    TrainMetric,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _no_dist_metric(cls, **kwargs):
+    """Build ``cls`` for single-process tests, passing None for both device_mesh and process_group."""
+    return cls(**kwargs, device_mesh=None, process_group=None)
+
+
+# ---------------------------------------------------------------------------
+# Accuracy
+# ---------------------------------------------------------------------------
+
+class TestAccuracy:
+    """Accuracy metric checks on hand-built logits; 'accuracy' is compared as a two-decimal string."""
+    def test_perfect_accuracy(self):
+        m = _no_dist_metric(Accuracy)
+        labels = torch.tensor([1, 2, 3, 4])
+        logits = torch.zeros(4, 10)
+        logits[0, 1] = 10.0  # each row's largest logit sits at that row's label
+        logits[1, 2] = 10.0
+        logits[2, 3] = 10.0
+        logits[3, 4] = 10.0
+        inputs = {'labels': labels}
+        outputs = {'logits': logits}
+        m.accumulate(inputs, outputs)
+        result = m.calculate()
+        assert result['accuracy'] == '1.00'
+
+    def test_zero_accuracy(self):
+        m = _no_dist_metric(Accuracy)
+        labels = torch.tensor([0, 1, 2])
+        logits = torch.zeros(3, 10)
+        logits[0, 9] = 10.0  # wrong
+        logits[1, 8] = 10.0  # wrong
+        logits[2, 7] = 10.0  # wrong
+        inputs = {'labels': labels}
+        outputs = {'logits': logits}
+        m.accumulate(inputs, outputs)
+        result = m.calculate()
+        assert result['accuracy'] == '0.00'
+
+    def test_partial_accuracy(self):
+        m = _no_dist_metric(Accuracy)
+        labels = torch.tensor([1, 2])
+        logits = torch.zeros(2, 10)
+        logits[0, 1] = 10.0  # correct
+        logits[1, 5] = 10.0  # wrong
+        inputs = {'labels': labels}
+        outputs = {'logits': logits}
+        m.accumulate(inputs, outputs)
+        result = m.calculate()
+        assert result['accuracy'] == '0.50'
+
+    def test_accuracy_with_completion_mask(self):
+        m = _no_dist_metric(Accuracy)
+        labels = torch.tensor([1, 2, 3, 4])
+        logits = torch.zeros(4, 10)
+        logits[:, 0] = 10.0  # all predict token 0 → wrong
+        # Only count positions where mask is True
+        mask = torch.tensor([True, False, True, False])
+        inputs = {'labels': labels, 'completion_mask': mask}
+        outputs = {'logits': logits}
+        m.accumulate(inputs, outputs)
+        result = m.calculate()
+        # positions 0 and 2 are counted, both wrong → 0% (NOTE(review): all four rows are wrong, so an ignored mask also gives 0.00)
+        assert result['accuracy'] == '0.00'
+
+    def test_accuracy_no_logits_skips(self):
+        m = _no_dist_metric(Accuracy)
+        inputs = {'labels': torch.tensor([1, 2])}
+        outputs = {}  # no 'logits' key at all
+        m.accumulate(inputs, outputs)
+        result = m.calculate()
+        assert result == {}
+
+    def test_accuracy_reset(self):
+        m = _no_dist_metric(Accuracy)
+        labels = torch.tensor([1])
+        logits = torch.zeros(1, 10)
+        logits[0, 1] = 10.0
+        m.accumulate({'labels': labels}, {'logits': logits})
+        m.reset()  # both counters checked below are expected to be zero afterwards
+        assert m.total_correct == 0
+        assert m.total_count == 0
+
+    def test_accuracy_with_ignore_index(self):
+        m = _no_dist_metric(Accuracy)
+        labels = torch.tensor([1, -100, 3])  # the -100 entry is expected to be left out of the count
+        logits = torch.zeros(3, 10)
+        logits[0, 1] = 10.0  # correct
+        logits[2, 5] = 10.0  # wrong
+        inputs = {'labels': labels}
+        outputs = {'logits': logits}
+        m.accumulate(inputs, outputs)
+        result = m.calculate()
+        # 1 correct out of 2 valid tokens
+        assert result['accuracy'] == '0.50'
+
+    def test_accuracy_does_not_accept_list(self):
+        m = _no_dist_metric(Accuracy)
+        inputs = [{'labels': torch.tensor([1])}]  # a list wrapping the dict, instead of the dict itself
+        with pytest.raises(AssertionError):
+            m.accumulate(inputs, {'logits': torch.zeros(1, 10)})
+
+
+# ---------------------------------------------------------------------------
+# LossMetric
+# ---------------------------------------------------------------------------
+
+class TestLossMetric:
+    """LossMetric: which keys calculate() reports, skipping when 'loss' is absent, and reset()."""
+    def test_basic_accumulate(self):
+        m = _no_dist_metric(LossMetric)
+        m.accumulate({'labels': torch.tensor([1])}, {'loss': torch.tensor(2.5), 'num_tokens': torch.tensor(4.0)})
+        result = m.calculate()
+        assert 'loss' in result  # only the presence of the key is checked
+
+    def test_loss_mean_reduction(self):
+        m = _no_dist_metric(LossMetric)
+        m.accumulate({'labels': torch.tensor([1])}, {'loss': torch.tensor(3.0)}, loss_reduction='mean', num_tokens=3)
+        # NOTE(review): presumably avg_loss = total_loss / num_tokens; the resulting value is not asserted below
+        m.accumulate({'labels': torch.tensor([1])}, {'loss': torch.tensor(6.0)}, loss_reduction='sum', num_tokens=6)
+        result = m.calculate()
+        assert 'loss' in result
+
+    def test_loss_grad_norm(self):
+        m = _no_dist_metric(LossMetric)
+        m.accumulate({'labels': torch.tensor([1])}, {'loss': torch.tensor(1.0)}, grad_norm=0.5)
+        result = m.calculate()
+        assert 'grad_norm' in result  # expected once grad_norm was passed to accumulate()
+
+    def test_loss_no_loss_skips(self):
+        m = _no_dist_metric(LossMetric)
+        m.accumulate({'labels': torch.tensor([1])}, {})  # outputs carry no 'loss'
+        assert m.total_count == 0
+
+    def test_loss_reset(self):
+        m = _no_dist_metric(LossMetric)
+        m.accumulate({'labels': torch.tensor([1])}, {'loss': torch.tensor(1.0)})
+        m.reset()  # both accumulators checked below are expected to be zero afterwards
+        assert m.total_loss == 0
+        assert m.total_count == 0
+
+
+# ---------------------------------------------------------------------------
+# TrainMetric
+# ---------------------------------------------------------------------------
+
+class TestTrainMetric:
+    """TrainMetric: learning-rate, iteration and speed keys derived from accumulate() keyword arguments."""
+    def test_basic_train_metric(self):
+        m = _no_dist_metric(TrainMetric)
+        m.accumulate({}, {}, lr=1e-4, step=10, gradient_accumulation_steps=2)
+        result = m.calculate()
+        assert 'learning rate' in result
+        assert 'iters' in result
+        assert result['iters'] == 5  # 10 // 2
+
+    def test_train_metric_lr_list(self):
+        m = _no_dist_metric(TrainMetric)
+        m.accumulate({}, {}, lr=[1e-4, 2e-4], step=10, gradient_accumulation_steps=1)
+        result = m.calculate()
+        assert 'learning rate(param group 1)' in result  # one key per entry of the lr list
+        assert 'learning rate(param group 2)' in result
+
+    def test_train_metric_single_lr_in_list(self):
+        m = _no_dist_metric(TrainMetric)
+        m.accumulate({}, {}, lr=[1e-4], step=5, gradient_accumulation_steps=1)
+        result = m.calculate()
+        assert 'learning rate' in result
+        assert 'learning rate(param group 1)' not in result  # single-element list is flattened to a scalar
+
+    def test_train_metric_speed(self):
+        m = _no_dist_metric(TrainMetric)
+        m.accumulate({}, {}, lr=1e-4, step=10)
+        time.sleep(0.01)  # actually advance the clock so the elapsed interval is non-zero
+        result = m.calculate()
+        assert 'speed' in result
+
+    def test_train_metric_reset(self):
+        m = _no_dist_metric(TrainMetric)
+        m.accumulate({}, {}, lr=1e-4, step=10)
+        m.reset()
+        # After reset, time is updated but step info persists
+        assert m.last_step == 10
+
+
+# ---------------------------------------------------------------------------
+# CompletionRewardMetric
+# ---------------------------------------------------------------------------
+
+class TestCompletionRewardMetric:
+    """CompletionRewardMetric: 'train/...' reward keys, 'profiling/...' timing keys and the _mean/_std helpers."""
+    def test_basic_rewards(self):
+        m = _no_dist_metric(CompletionRewardMetric)
+        m.accumulate(rewards={'accuracy': [1.0, 0.0, 1.0]}, completion_lengths=[10, 20, 30])
+        result = m.calculate()
+        assert 'train/accuracy_reward' in result  # reward name 'accuracy' appears as 'train/accuracy_reward'
+        assert 'train/completion_length' in result
+
+    def test_reward_std(self):
+        m = _no_dist_metric(CompletionRewardMetric)
+        m.accumulate(rewards={'format': [1.0, 0.5, 0.0]})
+        result = m.calculate()
+        assert 'train/format_reward_std' in result
+
+    def test_generate_time(self):
+        m = _no_dist_metric(CompletionRewardMetric)
+        m.accumulate(rewards={}, generate_time=1.5, weight_sync_time=0.3)
+        result = m.calculate()
+        assert 'profiling/Time taken: generate' in result
+        assert 'profiling/Time taken: move_model_to_sampler' in result  # presumably fed by weight_sync_time
+
+    def test_empty_accumulate(self):
+        m = _no_dist_metric(CompletionRewardMetric)
+        m.accumulate()  # no arguments at all
+        result = m.calculate()
+        assert result == {}
+
+    def test_mean_empty_list(self):
+        assert CompletionRewardMetric._mean([]) == -1.0  # -1.0 is the value expected for an empty list
+
+    def test_std_single_element(self):
+        assert CompletionRewardMetric._std([1.0]) == 0.0  # called on the class, no instance needed
+
+
+# ---------------------------------------------------------------------------
+# DPOMetric
+# ---------------------------------------------------------------------------
+
+class TestDPOMetric:
+    """DPOMetric(beta=0.1): reported keys with and without reference log-probs, skip, reset and batch-size check."""
+    def test_basic_dpo_metric(self):
+        m = _no_dist_metric(DPOMetric, beta=0.1)
+        labels = torch.tensor([[1, 2, -100], [3, 4, -100]])  # two rows, i.e. an even batch size
+        logps = torch.randn(2, 3)
+        m.accumulate({'labels': labels}, {'logps': logps})
+        result = m.calculate()
+        assert 'logps/chosen' in result
+        assert 'logps/rejected' in result
+
+    def test_dpo_metric_with_ref(self):
+        m = _no_dist_metric(DPOMetric, beta=0.1)
+        labels = torch.tensor([[1, 2, -100], [3, 4, -100]])
+        logps = torch.randn(2, 3)
+        ref_logps = torch.randn(2, 3)
+        m.accumulate({'labels': labels}, {'logps': logps}, ref_outputs={'logps': ref_logps})
+        result = m.calculate()
+        assert 'rewards/chosen' in result  # 'rewards/...' keys are checked only when ref_outputs is supplied
+        assert 'rewards/accuracies' in result
+
+    def test_dpo_metric_no_logps_skips(self):
+        m = _no_dist_metric(DPOMetric, beta=0.1)
+        m.accumulate({'labels': torch.tensor([[1, 2]])}, {})  # outputs carry no 'logps'
+        result = m.calculate()
+        assert result == {}
+
+    def test_dpo_metric_reset(self):
+        m = _no_dist_metric(DPOMetric, beta=0.1)
+        labels = torch.tensor([[1, 2, -100], [3, 4, -100]])
+        m.accumulate({'labels': labels}, {'logps': torch.randn(2, 3)})
+        m.reset()
+        assert m.total_count == 0
+
+    def test_dpo_metric_odd_batch_raises(self):
+        m = _no_dist_metric(DPOMetric, beta=0.1)
+        labels = torch.tensor([[1, 2, -100]])  # a single row: odd batch size
+        logps = torch.randn(1, 3)
+        with pytest.raises(AssertionError, match='even'):
+            m.accumulate({'labels': labels}, {'logps': logps})
+
+
+# ---------------------------------------------------------------------------
+# GRPOMetric / GSPOMetric / CISPOMetric
+# ---------------------------------------------------------------------------
+
+class TestGRPOMetric:
+    """GRPOMetric: keys reported from new/old log-probs and entropies, tolerance of None outputs, and reset()."""
+    def test_basic_grpo_metric(self):
+        m = _no_dist_metric(GRPOMetric)
+        labels = torch.tensor([[1, 2, -100, -100]])
+        logps = torch.randn(1, 4)
+        m.accumulate({'labels': labels}, {'logps': logps})
+        result = m.calculate()
+        assert 'train/policy_confidence' in result
+        assert 'train/mean_new_logp' in result
+
+    def test_grpo_metric_with_old(self):
+        m = _no_dist_metric(GRPOMetric)
+        labels = torch.tensor([[1, 2, -100, -100]])
+        logps = torch.randn(1, 4)
+        old_logps = [torch.randn(1, 2)]  # only 2 valid tokens
+        m.accumulate({'labels': labels}, {'logps': logps}, old_logps=old_logps)
+        result = m.calculate()
+        assert 'train/approx_kl' in result  # these two keys are checked only when old_logps is supplied
+        assert 'train/logp_diff_mean' in result
+
+    def test_grpo_metric_empty_outputs(self):
+        m = _no_dist_metric(GRPOMetric)
+        m.accumulate({'labels': torch.tensor([[1, 2]])}, None)
+        # Should not crash
+        result = m.calculate()
+        # Might be empty if nothing accumulated
+        assert isinstance(result, dict)
+
+    def test_grpo_metric_entropy(self):
+        m = _no_dist_metric(GRPOMetric)
+        labels = torch.tensor([[1, 2, -100, -100]])
+        logps = torch.randn(1, 4)
+        entropies = torch.randn(1, 4)  # random values; only the presence of the key is asserted
+        m.accumulate({'labels': labels}, {'logps': logps, 'entropies': entropies})
+        result = m.calculate()
+        assert 'train/entropy' in result
+
+    def test_grpo_metric_reset(self):
+        m = _no_dist_metric(GRPOMetric)
+        labels = torch.tensor([[1, 2, -100, -100]])
+        logps = torch.randn(1, 4)
+        m.accumulate({'labels': labels}, {'logps': logps})
+        m.reset()  # both accumulators checked below are expected to be zero afterwards
+        assert m.n_tokens == 0
+        assert m.sum_new == 0.0
+
+
+class TestGSPOMetric:
+    """Smoke test: GSPOMetric reports 'train/policy_confidence' after a single accumulate()."""
+    def test_basic_gspo_metric(self):
+        metric = _no_dist_metric(GSPOMetric)
+        batch = {'labels': torch.tensor([[1, 2, -100, -100]])}
+        model_out = {'logps': torch.randn(1, 4)}
+        metric.accumulate(batch, model_out)
+        report = metric.calculate()
+        assert 'train/policy_confidence' in report
+
+
+class TestCISPOMetric:
+    """Smoke test: CISPOMetric reports 'train/policy_confidence' after a single accumulate()."""
+    def test_basic_cispo_metric(self):
+        cispo = _no_dist_metric(CISPOMetric)
+        target_ids = torch.tensor([[1, 2, -100, -100]])
+        token_logps = torch.randn(1, 4)
+        cispo.accumulate(dict(labels=target_ids), dict(logps=token_logps))
+        summary = cispo.calculate()
+        assert 'train/policy_confidence' in summary
+
+
+# ---------------------------------------------------------------------------
+# EmbeddingMetric
+# ---------------------------------------------------------------------------
+
+class TestEmbeddingMetric:
+    """EmbeddingMetric: 'pos_sim' / 'loss' reporting from embeddings or logits, plus skip and no-anchor cases."""
+    def test_basic_embedding_metric(self):
+        m = _no_dist_metric(EmbeddingMetric)
+        # 2 samples: anchor (label=1) + positive (label=0), then another pair
+        labels = torch.tensor([1, 0, 1, 0])  # Two anchor-positive pairs
+        embeddings = torch.randn(4, 8)
+        m.accumulate({'labels': labels}, {'embeddings': embeddings})
+        result = m.calculate()
+        assert 'pos_sim' in result
+
+    def test_embedding_metric_with_loss(self):
+        m = _no_dist_metric(EmbeddingMetric)
+        labels = torch.tensor([1, 0])
+        embeddings = torch.randn(2, 8)
+        m.accumulate({'labels': labels}, {'embeddings': embeddings, 'loss': torch.tensor(2.5)})
+        result = m.calculate()
+        assert 'loss' in result  # expected because outputs carried a 'loss' entry
+
+    def test_embedding_metric_no_embeddings_skips(self):
+        m = _no_dist_metric(EmbeddingMetric)
+        m.accumulate({'labels': torch.tensor([1, 0])}, {})  # neither 'embeddings' nor 'logits' supplied
+        result = m.calculate()
+        assert result == {}
+
+    def test_embedding_metric_fallback_to_logits(self):
+        m = _no_dist_metric(EmbeddingMetric)
+        labels = torch.tensor([1, 0])
+        logits_2d = torch.randn(2, 8)
+        m.accumulate({'labels': labels}, {'logits': logits_2d})  # only 'logits' given, no 'embeddings'
+        result = m.calculate()
+        assert 'pos_sim' in result
+
+    def test_embedding_metric_3d_logits(self):
+        m = _no_dist_metric(EmbeddingMetric)
+        labels = torch.tensor([1, 0])
+        logits_3d = torch.randn(2, 5, 8)  # 3D → CLS pooled
+        m.accumulate({'labels': labels}, {'logits': logits_3d})
+        result = m.calculate()
+        assert 'pos_sim' in result
+
+    def test_embedding_metric_no_anchors(self):
+        m = _no_dist_metric(EmbeddingMetric)
+        labels = torch.tensor([0, 0])  # No anchors
+        embeddings = torch.randn(2, 8)
+        m.accumulate({'labels': labels}, {'embeddings': embeddings})
+        # No anchors → no positive similarity
+        assert m.pos_count == 0
diff --git a/tests/template/test_tool_parsers.py b/tests/template/test_tool_parsers.py
index 41f6a3a4f..9269ed1f2 100644
--- a/tests/template/test_tool_parsers.py
+++ b/tests/template/test_tool_parsers.py
@@ -23,11 +23,6 @@ def test_detect(self):
         assert not self.p.detect('plain text')
         assert not self.p.detect('')
 
-    def test_matches_model(self):
-        assert self.p.matches_model('qwen2.5-7b')
-        assert self.p.matches_model('qwen3-32b')
-        assert not self.p.matches_model('llama-3.1-8b')
-
     def test_parse_json_variant(self):
         text = '<tool_call>{"name": "get_weather", "arguments": {"city": "Paris"}}</tool_call>'
         out = self.p.parse(text)
@@ -104,10 +99,6 @@ def test_no_block_marker(self):
         assert self.p.open_marker is None
         assert self.p.close_marker is None
 
-    def test_does_not_match_qwen_model(self):
-        assert not self.p.matches_model('qwen2.5')
-        assert not self.p.matches_model('llama-3')
-
     def test_parse_single_action(self):
         text = 'Thought: search the web.\nAction: search[hello world]'
         out = self.p.parse(text)
@@ -172,11 +163,6 @@ def test_no_marker(self):
         assert self.p.open_marker is None
         assert self.p.close_marker is None
 
-    def test_does_not_match_any_model_by_default(self):
-        # Cline is an app-level prompt protocol, not a model-family format.
-        assert not self.p.matches_model('qwen2.5')
-        assert not self.p.matches_model('claude-3')
-
     def test_parse_single_arg(self):
         text = '<read_file><path>src/foo.py</path></read_file>'
         out = self.p.parse(text)
@@ -254,14 +240,6 @@ def test_no_parser_for_plain_text(self):
         assert ToolCallRegistry.detect_first('just some plain text') is None
         assert ToolCallRegistry.detect_first('') is None
 
-    def test_select_for_qwen_picks_hermes(self):
-        parser = ToolCallRegistry.select_for_model('qwen2.5-7b')
-        assert parser is not None and parser.name == 'hermes_qwen'
-
-    def test_select_for_unknown_returns_none(self):
-        assert ToolCallRegistry.select_for_model('llama-3.1-8b') is None
-        assert ToolCallRegistry.select_for_model(None) is None
-
 
 if __name__ == '__main__':
     pytest.main([__file__, '-v'])
diff --git a/tests/twinkle_agentic/test_multi_turn_rollout.py b/tests/twinkle_agentic/test_multi_turn_rollout.py
index 5aaa85a59..4f17d8282 100644
--- a/tests/twinkle_agentic/test_multi_turn_rollout.py
+++ b/tests/twinkle_agentic/test_multi_turn_rollout.py
@@ -523,9 +523,14 @@ def test_rejects_none_template(sampler, tool_manager):
         MultiTurnRollout(sampler=sampler, template=None, tool_manager=tool_manager)
 
 
-def test_rejects_none_tool_manager(sampler, template):
-    with pytest.raises(ValueError, match='ToolManager'):
-        MultiTurnRollout(sampler=sampler, template=template, tool_manager=None)
+def test_none_tool_manager_accepted_at_construction(sampler, template):
+    """Constructing with tool_manager=None succeeds; the ValueError only surfaces when the rollout is invoked."""
+    runner = MultiTurnRollout(tool_manager=None, template=template, sampler=sampler)
+    assert runner.tool_manager is None
+    # Invoking the rollout with no tool manager available must raise.
+    sampler.queue(_tool_call_text('search', {'q': 'x'}), stop_reason='stop')
+    with pytest.raises(ValueError, match='tool_manager is required'):
+        runner([_user_traj('hello')])
 
 
 def test_rejects_bad_max_turns(sampler, template, tool_manager):
diff --git a/tests/twinkle_agentic/test_tools.py b/tests/twinkle_agentic/test_tools.py
new file mode 100644
index 000000000..87c050f6b
--- /dev/null
+++ b/tests/twinkle_agentic/test_tools.py
@@ -0,0 +1,218 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Tests for twinkle_agentic.tools: base Tool and ToolManager."""
+
+import pytest
+
+from twinkle.data_format import ToolCall
+from twinkle.data_format.message import Tool as ToolInfo
+from twinkle_agentic.tools.base import Tool
+from twinkle_agentic.tools.tool_manager import ToolManager, _extract_name
+
+
+# ---------------------------------------------------------------------------
+# Mock tools
+# ---------------------------------------------------------------------------
+
+class MockTool(Tool):
+    """Tool stub that echoes the call it received and advertises a configurable name."""
+    def __init__(self, name='mock_tool'):
+        self._name = name
+
+    def __call__(self, tool_name: str, arguments: dict) -> str:
+        return 'executed {} with {}'.format(tool_name, arguments)
+
+    def tool_info(self) -> ToolInfo:
+        return dict(function=dict(name=self._name, description='A mock tool.'))
+
+
+class CalculatorTool(Tool):
+    """Mock calculator: adds or subtracts 'a' and 'b' (both default 0) and returns the result as text."""
+    def __call__(self, tool_name: str, arguments: dict) -> str:
+        operation = arguments.get('operation', 'add')
+        lhs = arguments.get('a', 0)
+        rhs = arguments.get('b', 0)
+        if operation not in ('add', 'subtract'):
+            return 'unknown operation'
+        # Only the two recognised operations reach this point.
+        outcome = lhs + rhs if operation == 'add' else lhs - rhs
+        return str(outcome)
+
+    def tool_info(self) -> ToolInfo:
+        return {'function': {'name': 'calculator', 'description': 'Basic calculator.'}}
+
+
+class FailingTool(Tool):
+    """Tool whose call always raises RuntimeError('intentional failure')."""
+    def __call__(self, tool_name: str, arguments: dict) -> str:
+        raise RuntimeError('intentional failure')
+
+    def tool_info(self) -> ToolInfo:
+        return {'function': {'name': 'failing_tool', 'description': 'Always fails.'}}
+
+
+# ---------------------------------------------------------------------------
+# _extract_name
+# ---------------------------------------------------------------------------
+
+class TestExtractName:
+    """_extract_name is expected to return function.name for a well-formed dict and None otherwise."""
+    def test_valid_dict(self):
+        info = {'function': {'name': 'test_tool'}}
+        assert _extract_name(info) == 'test_tool'
+
+    def test_missing_function(self):
+        assert _extract_name({'name': 'test_tool'}) is None  # a top-level 'name' does not count
+
+    def test_non_dict(self):
+        assert _extract_name('not a dict') is None
+
+    def test_empty_name(self):
+        assert _extract_name({'function': {'name': ''}}) is None  # empty string is treated as no name
+
+    def test_none_name(self):
+        assert _extract_name({'function': {'name': None}}) is None
+
+
+# ---------------------------------------------------------------------------
+# ToolManager
+# ---------------------------------------------------------------------------
+
+class TestToolManager:
+    """ToolManager construction, registration, lookup and call dispatch, including the error-message paths."""
+    def test_empty_constructor(self):
+        tm = ToolManager()
+        assert tm.names() == []
+
+    def test_from_dict(self):
+        mock = MockTool()
+        tm = ToolManager({'mock_tool': mock})
+        assert tm.names() == ['mock_tool']
+
+    def test_from_list(self):
+        mock = MockTool()
+        calc = CalculatorTool()
+        tm = ToolManager([mock, calc])  # list form: names are expected to come from each tool's tool_info()
+        assert sorted(tm.names()) == ['calculator', 'mock_tool']
+
+    def test_from_none(self):
+        tm = ToolManager(None)
+        assert tm.names() == []
+
+    def test_invalid_type_raises(self):
+        with pytest.raises(TypeError, match='ToolManager expects dict'):
+            ToolManager(42)
+
+    def test_register(self):
+        tm = ToolManager()
+        mock = MockTool()
+        tm.register(mock)
+        assert 'mock_tool' in tm.names()
+
+    def test_register_missing_name_raises(self):
+        tm = ToolManager()
+        with pytest.raises(ValueError, match='non-empty'):
+            tm.register(object())  # object has no tool_info
+
+    def test_unregister(self):
+        mock = MockTool()
+        tm = ToolManager({'mock_tool': mock})
+        removed = tm.unregister('mock_tool')
+        assert removed is mock  # the very same instance is handed back
+        assert 'mock_tool' not in tm.names()
+
+    def test_unregister_missing(self):
+        tm = ToolManager()
+        assert tm.unregister('nonexistent') is None
+
+    def test_copy(self):
+        mock = MockTool()
+        tm = ToolManager({'mock_tool': mock})
+        copied = tm.copy()
+        assert copied.names() == ['mock_tool']
+        assert copied is not tm
+
+    def test_tool_infos(self):
+        mock = MockTool()
+        tm = ToolManager({'mock_tool': mock})
+        infos = tm.tool_infos()
+        assert len(infos) == 1
+        assert infos[0]['function']['name'] == 'mock_tool'
+
+    def test_call_with_dict(self):
+        calc = CalculatorTool()
+        tm = ToolManager({'calculator': calc})
+        result = tm({'function': {'name': 'calculator', 'arguments': {'a': 3, 'b': 4, 'operation': 'add'}}})
+        assert result == '7'
+
+    def test_call_with_tool_call(self):
+        calc = CalculatorTool()
+        tm = ToolManager({'calculator': calc})
+        tc = ToolCall(**{'function': {'name': 'calculator', 'arguments': {'a': 10, 'b': 2, 'operation': 'subtract'}}})
+        result = tm(tc)
+        assert result == '8'
+
+    def test_call_missing_tool(self):
+        tm = ToolManager()
+        result = tm({'function': {'name': 'missing', 'arguments': {}}})
+        assert 'unknown tool' in result  # the problem is reported in the returned string, not raised
+        assert 'Available:' in result
+
+    def test_call_missing_function(self):
+        tm = ToolManager({'mock': MockTool()})
+        result = tm({})
+        assert 'missing "function"' in result
+
+    def test_call_missing_name(self):
+        tm = ToolManager({'mock': MockTool()})
+        result = tm({'function': {}})
+        assert 'missing "function.name"' in result
+
+    def test_call_string_arguments(self):
+        calc = CalculatorTool()
+        tm = ToolManager({'calculator': calc})
+        result = tm({'function': {'name': 'calculator', 'arguments': '{"a": 5, "b": 3, "operation": "add"}'}})
+        assert result == '8'  # the JSON text was decoded before reaching the tool
+
+    def test_call_invalid_json_string(self):
+        calc = CalculatorTool()
+        tm = ToolManager({'calculator': calc})
+        result = tm({'function': {'name': 'calculator', 'arguments': 'not json'}})
+        assert 'invalid JSON' in result
+
+    def test_call_empty_json_string(self):
+        calc = CalculatorTool()
+        tm = ToolManager({'calculator': calc})
+        result = tm({'function': {'name': 'calculator', 'arguments': '   '}})
+        assert result == '0'  # default values
+
+    def test_call_invalid_argument_type(self):
+        tm = ToolManager({'mock': MockTool()})
+        result = tm({'function': {'name': 'mock', 'arguments': 42}})
+        assert 'must be a JSON string or object' in result
+
+    def test_call_tool_exception(self):
+        fail = FailingTool()
+        tm = ToolManager({'failing_tool': fail})
+        result = tm({'function': {'name': 'failing_tool', 'arguments': {}}})
+        assert 'Error' in result  # the tool's RuntimeError surfaces as text in the result
+        assert 'intentional failure' in result
+
+    def test_call_none_arguments(self):
+        mock = MockTool()
+        tm = ToolManager({'mock': mock})  # registered under 'mock', not under the tool_info() name
+        result = tm({'function': {'name': 'mock', 'arguments': None}})
+        assert 'executed mock with {}' == result
+
+    def test_call_tool_call_is_not_dict(self):
+        tm = ToolManager()
+        result = tm('not a dict')
+        assert 'tool_call must be an object' in result
+
+    def test_from_list_tool_without_tool_info(self):
+        class BadTool(Tool):  # NOTE: despite the test name, tool_info() is defined; it only lacks a name
+            def __call__(self, *args, **kwargs):
+                return ''
+            def tool_info(self):
+                return {}  # missing function.name
+        with pytest.raises(ValueError, match='non-empty'):
+            ToolManager([BadTool()])
diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py
new file mode 100644
index 000000000..641da8ad1
--- /dev/null
+++ b/tests/utils/test_utils.py
@@ -0,0 +1,413 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+"""Tests for utils key functions: torch_utils, seed, network, utils, safetensors, transformers_utils."""
+import json
+import os
+import tempfile
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from twinkle.utils import (
+    deep_getattr,
+    is_valid_ipv6_address,
+    pad_and_stack_tensors,
+    pad_sequence_to_length,
+)
+from twinkle.utils.seed import stable_seed as _stable_seed
+from twinkle.utils.torch_utils import (
+    clone_state_dict_to_cpu,
+    selective_log_softmax,
+    to_device,
+)
+from twinkle.utils.network import find_free_port
+from twinkle.utils.utils import (
+    call_with_supported_kwargs,
+    copy_files_by_pattern,
+    has_signature_parameter,
+    signature_info,
+)
+from twinkle.utils.transformers_utils import (
+    align_logps_to_mask,
+    filter_from_config_kwargs,
+)
+from twinkle.utils.safetensors import LazyTensor, StreamingSafetensorSaver
+
+
+# ===========================================================================
+# torch_utils
+# ===========================================================================
+
+
+class TestToDevice:
+
+    def test_tensor_to_cpu(self):
+        """A bare tensor comes back on the requested device."""
+        moved = to_device(torch.randn(3, 4), 'cpu')
+        assert moved.device == torch.device('cpu')
+
+    def test_dict_values_moved(self):
+        """Every value of a flat dict is checked, not only the first key."""
+        moved = to_device({'a': torch.randn(2), 'b': torch.randn(3)}, 'cpu')
+        assert isinstance(moved, dict)
+        assert all(v.device == torch.device('cpu') for v in moved.values())
+
+    def test_list_values_moved(self):
+        moved = to_device([torch.randn(2), torch.randn(3)], 'cpu')
+        assert isinstance(moved, list) and len(moved) == 2
+        assert all(item.device == torch.device('cpu') for item in moved)  # elements, not just the container
+
+    def test_non_tensor_passthrough(self):
+        for plain in (42, 'hello'):
+            assert to_device(plain, 'cpu') == plain
+
+    def test_nested_dict(self):
+        """A tensor nested one dict level down is reached as well."""
+        moved = to_device({'outer': {'inner': torch.randn(2)}}, 'cpu')
+        assert moved['outer']['inner'].device == torch.device('cpu')
+
+
+class TestCloneStateDictToCpu:
+
+    def test_clones_tensors(self):
+        original = {'w': torch.randn(3, 4), 'b': torch.randn(4)}
+        copy_w, source_w = clone_state_dict_to_cpu(original)['w'], original['w']
+        assert copy_w.device == torch.device('cpu')
+        assert copy_w.data_ptr() != source_w.data_ptr()  # a real copy, not an alias
+        assert torch.allclose(copy_w, source_w.cpu())  # holding identical values
+
+    def test_preserves_non_tensors(self):
+        """Plain Python values (int, str) come through with their values unchanged."""
+        cloned = clone_state_dict_to_cpu({'step': 100, 'name': 'model'})
+        assert cloned['step'] == 100
+        assert cloned['name'] == 'model'
+
+
+class TestPadSequenceToLength:
+
+    def test_right_pad(self):
+        """Without ``left_pad`` the zeros are appended after the original columns."""
+        padded = pad_sequence_to_length(torch.ones(2, 3), 5)
+        assert padded.shape == (2, 5)
+        assert (padded[:, :3] == 1).all()
+        assert (padded[:, 3:] == 0).all()
+
+    def test_left_pad(self):
+        """``left_pad=True`` puts the zeros in front of the original columns."""
+        padded = pad_sequence_to_length(torch.ones(2, 3), 5, left_pad=True)
+        assert padded.shape == (2, 5)
+        assert (padded[:, :2] == 0).all()
+        assert (padded[:, 2:] == 1).all()
+
+    def test_no_pad_if_longer(self):
+        # A tensor already wider than the target keeps its width (no truncation).
+        padded = pad_sequence_to_length(torch.ones(2, 10), 5)
+        assert padded.shape == (2, 10)
+
+    def test_custom_pad_value(self):
+        # pad_value replaces the default fill in the appended columns.
+        padded = pad_sequence_to_length(torch.ones(2, 3), 5, pad_value=-1.0)
+        assert (padded[:, 3:] == -1).all()
+
+
+class TestSelectiveLogSoftmax:
+
+    def test_matches_naive_implementation(self):
+        torch.manual_seed(42)
+        logits = torch.randn(4, 20)
+        index = torch.randint(0, 20, (4,))
+        # Reference: full log_softmax, then gather the selected entries.
+        expected = torch.gather(logits.log_softmax(-1), -1, index.unsqueeze(-1)).squeeze(-1)
+        result = selective_log_softmax(logits, index)
+        assert torch.allclose(result, expected, atol=1e-5)
+
+    def test_with_return_entropy(self):
+        torch.manual_seed(42)  # fixed inputs, so any failure is reproducible
+        logits = torch.randn(4, 20, dtype=torch.float32)
+        index = torch.randint(0, 20, (4,))
+        logps, entropy = selective_log_softmax(logits, index, return_entropy=True)
+        assert (entropy >= 0).all()  # entropy of a distribution is never negative
+        # Recompute the entropy of row 0 by hand and compare.
+        row_probs = torch.exp(F.log_softmax(logits[0], dim=-1))
+        manual_entropy = -(row_probs * row_probs.log()).sum()
+        assert torch.allclose(entropy[0], manual_entropy, atol=1e-5)
+
+    def test_bfloat16_fallback(self):
+        torch.manual_seed(42)  # fixed inputs, so any failure is reproducible
+        logits = torch.randn(4, 20, dtype=torch.bfloat16)
+        index = torch.randint(0, 20, (4,))
+        result = selective_log_softmax(logits, index)
+        expected = torch.gather(logits.float().log_softmax(-1), -1, index.unsqueeze(-1)).squeeze(-1)
+        assert torch.allclose(result.float(), expected, atol=1e-2, rtol=1e-2)  # bf16 keeps ~8 mantissa bits
+
+
+class TestPadAndStackTensors:
+
+    def test_same_shape(self):
+        # Two (3, 4) inputs are joined along dim 0 by default.
+        merged = pad_and_stack_tensors([torch.randn(3, 4), torch.randn(3, 4)])
+        assert merged.shape == (6, 4)
+
+    def test_different_length(self):
+        ragged = [torch.randn(3), torch.randn(5)]
+        merged = pad_and_stack_tensors(ragged, pad_value=0)
+        assert merged.shape == (10, )  # padded to max length then concat
+
+    def test_different_length_stack(self):
+        ragged = [torch.randn(3), torch.randn(5)]
+        merged = pad_and_stack_tensors(ragged, pad_value=0, concat=False)
+        assert merged.shape == (2, 5)  # stack
+
+    def test_single_tensor(self):
+        # A one-element list gives back the very same tensor object.
+        only = torch.randn(3, 4)
+        assert pad_and_stack_tensors([only]) is only
+
+    def test_empty_list_raises(self):
+        with pytest.raises(ValueError):
+            pad_and_stack_tensors([])
+
+    def test_different_ndim(self):
+        mixed_rank = [torch.randn(3), torch.randn(2, 4)]
+        merged = pad_and_stack_tensors(mixed_rank)
+        # First tensor gets unsqueezed to (1,3), second is (2,4)
+        assert merged.shape[1] == 4
+
+
+# ===========================================================================
+# seed
+# ===========================================================================
+
+
+class TestStableSeed:
+
+    def test_deterministic(self):
+        assert _stable_seed('hello', 42) == _stable_seed('hello', 42)
+
+    def test_different_inputs_differ(self):
+        assert _stable_seed('a') != _stable_seed('b')
+
+    def test_returns_uint32(self):
+        val = _stable_seed('test')
+        assert isinstance(val, int) and 0 <= val < 2**32
+
+    def test_cross_process_stable(self):
+        """A child interpreter started with another PYTHONHASHSEED must derive the same seed."""
+        import subprocess
+        import sys
+        code = "from twinkle.utils.seed import stable_seed; print(stable_seed('consistent', 'key'))"
+        out = subprocess.check_output([sys.executable, '-c', code], env=dict(os.environ, PYTHONHASHSEED='1'))
+        assert int(out.split()[-1]) == _stable_seed('consistent', 'key')  # last token: ignore import chatter
+
+
+# ===========================================================================
+# network
+# ===========================================================================
+
+
+class TestIsValidIPv6Address:
+
+    def test_valid_ipv6(self):
+        assert is_valid_ipv6_address('::1')  # compressed form of the IPv6 loopback address
+
+    def test_invalid_ipv6(self):
+        assert not is_valid_ipv6_address('not_an_ip')  # not an address of any family
+
+    def test_ipv4_is_not_ipv6(self):
+        assert not is_valid_ipv6_address('127.0.0.1')  # dotted-quad IPv4 must be rejected
+
+    def test_full_ipv6(self):
+        assert is_valid_ipv6_address('2001:0db8:85a3:0000:0000:8a2e:0370:7334')  # eight uncompressed groups
+
+
+class TestFindFreePort:
+
+    def test_finds_a_port(self):
+        candidate = find_free_port()
+        assert isinstance(candidate, int)
+        assert 1 <= candidate <= 65535
+
+    def test_port_is_available(self):
+        import socket
+        candidate = find_free_port()
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
+            probe.bind(('', candidate))  # Should not raise
+
+
+# ===========================================================================
+# utils.utils
+# ===========================================================================
+
+
+class TestDeepGetattr:
+
+    def test_simple_attr(self):
+        class Holder:
+            x = 42
+        assert deep_getattr(Holder(), 'x') == 42
+
+    def test_nested_attr(self):
+        class Leaf:
+            val = 99
+        class Root:
+            inner = Leaf()
+        assert deep_getattr(Root(), 'inner.val') == 99
+
+    def test_dict_key(self):
+        # A dotted path also walks through nested dict keys.
+        assert deep_getattr({'a': {'b': 10}}, 'a.b') == 10
+
+    def test_missing_returns_default(self):
+        assert deep_getattr(object(), 'x', default='missing') == 'missing'
+
+    def test_dict_missing_key(self):
+        # An absent dict key falls back to ``default`` as well.
+        assert deep_getattr({'a': 1}, 'b', default='nope') == 'nope'
+
+
+class TestSignatureInfo:
+
+    def test_function_with_kwargs(self):
+        def sample(a, b=2, **kw):
+            pass
+        takes_var_kwargs, names = signature_info(sample)
+        assert takes_var_kwargs is True
+        assert 'a' in names
+
+    def test_function_without_kwargs(self):
+        def sample(a, b=2):
+            pass
+        takes_var_kwargs, _ = signature_info(sample)
+        assert takes_var_kwargs is False
+
+    def test_has_signature_parameter(self):
+        def sample(a, b=2):
+            pass
+        assert has_signature_parameter(sample, 'a')
+        assert not has_signature_parameter(sample, 'c')
+
+
+class TestCallWithSupportedKwargs:
+
+    def test_filters_unsupported(self):
+        def pair(a, b=2):
+            return (a, b)
+        # ``c`` is not a parameter of ``pair``; the call must still succeed without it.
+        assert call_with_supported_kwargs(pair, a=10, b=20, c=30) == (10, 20)
+
+    def test_passes_all_with_kwargs(self):
+        def extras(a, **kw):
+            return kw
+        # With **kw in the signature every keyword reaches the callee.
+        assert call_with_supported_kwargs(extras, 1, x=2, y=3) == {'x': 2, 'y': 3}
+
+
+class TestCopyFilesByPattern:
+
+    def test_copies_matching_files(self):
+        with tempfile.TemporaryDirectory() as source_dir, tempfile.TemporaryDirectory() as target_dir:
+            # Two empty files; only one of them matches the '*.txt' pattern.
+            for name in ('a.txt', 'b.py'):
+                open(os.path.join(source_dir, name), 'w').close()
+            copy_files_by_pattern(source_dir, target_dir, '*.txt')
+            assert os.path.exists(os.path.join(target_dir, 'a.txt'))
+            assert not os.path.exists(os.path.join(target_dir, 'b.py'))
+
+    def test_excludes_pattern(self):
+        with tempfile.TemporaryDirectory() as source_dir, tempfile.TemporaryDirectory() as target_dir:
+            for name in ('a.txt', 'b.txt'):
+                open(os.path.join(source_dir, name), 'w').close()
+            copy_files_by_pattern(source_dir, target_dir, '*.txt', exclude_patterns=['b.*'])
+            assert os.path.exists(os.path.join(target_dir, 'a.txt'))
+            assert not os.path.exists(os.path.join(target_dir, 'b.txt'))
+
+
+# ===========================================================================
+# transformers_utils
+# ===========================================================================
+
+
+class TestAlignLogpsToMask:
+
+    def test_already_aligned_tensor(self):
+        mask = torch.tensor([[True, True, False, False]])
+        dense = torch.randn(1, 4)
+        # A tensor that already has the mask's shape is expected back value-for-value.
+        assert torch.equal(align_logps_to_mask(dense, mask, torch.float32), dense)
+
+    def test_ragged_list(self):
+        mask = torch.tensor([[True, True, False, False]])
+        # Only 2 valid positions
+        ragged = [torch.tensor([0.5, 0.3])]
+        aligned = align_logps_to_mask(ragged, mask, torch.float32)
+        assert aligned.shape == (1, 4)
+        # Masked-in slots 0 and 1 carry the inputs; masked-out slots 2 and 3 are zero.
+        row = aligned[0]
+        assert row[0].item() == pytest.approx(0.5) and row[1].item() == pytest.approx(0.3)
+        assert row[2].item() == 0.0
+        assert row[3].item() == 0.0
+
+    def test_scalar_broadcast(self):
+        mask = torch.tensor([[True, True, False]])
+        aligned = align_logps_to_mask([5.0], mask, torch.float32)
+        for col in (0, 1):  # both masked-in positions receive the scalar
+            assert aligned[0, col].item() == pytest.approx(5.0)
+
+    def test_returns_none_for_unsupported(self):
+        # A bare int in place of the logps argument is answered with None.
+        single = torch.tensor([[True]])
+        assert align_logps_to_mask(42, single, torch.float32) is None
+
+
+class TestFilterFromConfigKwargs:
+
+    def test_filters_load_only_keys(self):
+        """``cache_dir`` and ``trust_remote_code`` are dropped; the other entries keep their values."""
+        mixed = dict(
+            cache_dir='/tmp',
+            num_layers=12,
+            trust_remote_code=True,
+            hidden_size=768,
+        )
+        kept = filter_from_config_kwargs(mixed)
+        for dropped in ('cache_dir', 'trust_remote_code'):
+            assert dropped not in kept
+        assert (kept['num_layers'], kept['hidden_size']) == (12, 768)
+
+
+# ===========================================================================
+# safetensors (partial — StreamingSaver without real GPU)
+# ===========================================================================
+
+
+class TestLazyTensor:
+
+    def test_from_tensor(self):
+        # Wrapping an eager tensor: load() yields equal contents.
+        eager = torch.randn(3, 4)
+        assert torch.equal(LazyTensor(tensor=eager).load(), eager)
+
+    def test_from_loader(self):
+        # Wrapping a zero-argument callable: load() yields what the callable returns.
+        payload = torch.randn(3, 4)
+        assert torch.equal(LazyTensor(loader=lambda: payload).load(), payload)
+
+    def test_loader_called_each_time(self):
+        calls = []
+        def loader():
+            calls.append(None)
+            return torch.tensor(len(calls), dtype=torch.float32)
+        lazy = LazyTensor(loader=loader)
+        first = lazy.load()
+        second = lazy.load()
+        assert first.item() != second.item()  # loader called each time
+
+
+class TestStreamingSafetensorSaver:
+
+    def test_init_creates_dir(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            save_dir = os.path.join(tmpdir, 'output')
+            saver = StreamingSafetensorSaver(save_dir, max_shard_size='1GB', save_rank='master')
+            # Smoke test only: construction must not raise; save_dir itself is never checked.
+            # NOTE(review): presumably creation is gated on is_save_rank / is_master() — confirm.