From fee308305902168ece5b1c4a4dd8763dd929cb5a Mon Sep 17 00:00:00 2001 From: puddingfjz <2811443837@qq.com> Date: Thu, 30 Apr 2026 23:07:32 +0800 Subject: [PATCH 1/2] Implement distributed MoE data flow skeleton with multi-chip parallelism Test Configuration: - 4 experts (one per chip) - 10 tokens in context - 4 tokens processed per expert - Hidden dimension: 16 IMPORTANT: Current implementation tests DATA FLOW only, not actual MoE computation: - Compute phase is a simple +1.0 operation, not expert network computation - Focus is on verifying correct token routing and result gathering - Can be extended to add real expert models later Core Components: - Kernels: dispatch (all-to-all), compute (+1.0), combine (all-to-all) - Orchestration: end2end, dispatch-only, combine-only, dispatch+compute - Unit Tests: test_dispatch_only, test_combine_only, test_dispatch_compute - E2E Test: test_end2end with unique value tracing KEY DESIGN: Use INDEPENDENT scratch_test buffer for combine phase - Problem: Reusing scratch caused combine to read stale dispatch data - Solution: Dispatch+Compute use scratch, Combine uses scratch_test - Prevents corruption when combine's stage-in doesn't fully overwrite dispatch's data (writes 4 tokens, stride based on 10 NUM_TOKENS) Co-Authored-By: Claude Sonnet 4.6 --- .../l3/moe_multi_chip_experts/.gitignore | 12 + .../l3/moe_multi_chip_experts/DEBUG_GUIDE.md | 188 ++++++++ .../IMPLEMENTATION_NOTES.md | 113 +++++ .../l3/moe_multi_chip_experts/README.md | 213 +++++++++ .../l3/moe_multi_chip_experts/TESTING.md | 164 +++++++ .../l3/moe_multi_chip_experts/__init__.py | 9 + .../l3/moe_multi_chip_experts/golden.py | 42 ++ .../aiv/moe_combine_alltoall2 copy.cpp | 244 ++++++++++ .../kernels/aiv/moe_combine_alltoall2.cpp | 220 +++++++++ .../kernels/aiv/moe_combine_alltoall_ori.cpp | 268 +++++++++++ .../kernels/aiv/moe_demo_incore_0.cpp | 108 +++++ .../kernels/aiv/moe_demo_incore_1.cpp | 137 ++++++ .../kernels/aiv/moe_demo_incore_2.cpp | 156 +++++++ .../kernels/aiv/moe_dispatch_alltoall.cpp | 209 +++++++++ .../kernels/aiv/moe_simple_compute.cpp | 47 ++ .../kernels/kernel_config.py | 24 + .../orchestration/moe_combine_only_orch.cpp | 69 +++ .../kernels/orchestration/moe_comm_orch.cpp | 123 ++++++ .../moe_dispatch_compute_orch.cpp | 88 ++++ .../orchestration/moe_dispatch_only_orch.cpp | 69 +++ .../orchestration/moe_end2end_orch.cpp | 110 +++++ .../orchestration/moe_multi_chip_orch.cpp | 88 ++++ .../workers/l3/moe_multi_chip_experts/main.py | 417 ++++++++++++++++++ .../test_combine_only.py | 411 +++++++++++++++++ .../test_dispatch_compute.py | 290 ++++++++++++ .../test_dispatch_only.py | 308 +++++++++++++ .../l3/moe_multi_chip_experts/test_end2end.py | 398 +++++++++++++++++ .../test_moe_multi_chip.py | 39 ++ 28 files changed, 4564 insertions(+) create mode 100644 examples/workers/l3/moe_multi_chip_experts/.gitignore create mode 100644 examples/workers/l3/moe_multi_chip_experts/DEBUG_GUIDE.md create mode 100644 examples/workers/l3/moe_multi_chip_experts/IMPLEMENTATION_NOTES.md create mode 100644 examples/workers/l3/moe_multi_chip_experts/README.md create mode 100644 examples/workers/l3/moe_multi_chip_experts/TESTING.md create mode 100644 examples/workers/l3/moe_multi_chip_experts/__init__.py create mode 100644 examples/workers/l3/moe_multi_chip_experts/golden.py create mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall2 copy.cpp create mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall2.cpp create mode 100644 
examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall_ori.cpp create mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_0.cpp create mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_1.cpp create mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_2.cpp create mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_dispatch_alltoall.cpp create mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_simple_compute.cpp create mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/kernel_config.py create mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_combine_only_orch.cpp create mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_comm_orch.cpp create mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_dispatch_compute_orch.cpp create mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_dispatch_only_orch.cpp create mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_end2end_orch.cpp create mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_multi_chip_orch.cpp create mode 100644 examples/workers/l3/moe_multi_chip_experts/main.py create mode 100755 examples/workers/l3/moe_multi_chip_experts/test_combine_only.py create mode 100644 examples/workers/l3/moe_multi_chip_experts/test_dispatch_compute.py create mode 100644 examples/workers/l3/moe_multi_chip_experts/test_dispatch_only.py create mode 100755 examples/workers/l3/moe_multi_chip_experts/test_end2end.py create mode 100644 examples/workers/l3/moe_multi_chip_experts/test_moe_multi_chip.py diff --git a/examples/workers/l3/moe_multi_chip_experts/.gitignore b/examples/workers/l3/moe_multi_chip_experts/.gitignore new file mode 100644 index 000000000..c2bbc644a --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/.gitignore @@ -0,0 +1,12 @@ +# Log files +*.log + +# Build outputs +build_output/ + +# Device logs +device_log/ + +# Analysis files +*_analysis.md +all_reduce.log diff --git a/examples/workers/l3/moe_multi_chip_experts/DEBUG_GUIDE.md b/examples/workers/l3/moe_multi_chip_experts/DEBUG_GUIDE.md new file mode 100644 index 000000000..b28ff4c1d --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/DEBUG_GUIDE.md @@ -0,0 +1,188 @@ +# 调试信息说明 + +## 案例 1: End-to-End MoE Pipeline Scratch 缓冲区冲突问题 + +### 问题描述 +在实现完整的 MoE pipeline(Dispatch + Compute + Combine)时,发现 Card 1 的 Expert 0 输出错误: +- **期望值**: 2.0 (1.0 input + 1.0 compute) +- **实际值**: 1.0 (只有 input,没有 compute) + +### 调试过程 + +#### 步骤 1: 创建 Isolated Combine Test +**假设**: Combine 阶段本身有问题 + +**实现**: 在 test_end2end.py 中添加独立的 combine 测试 +- 创建 `host_recv_test`: 填充正确的 2.0 值 +- 创建 `host_output_test`: 用于存储 isolated test 的输出 +- 创建 `host_scratch_print_test`: 独立的 debug 输出 +- 创建 `scratch_test` buffer: 独立的 HCCL scratch 缓冲区 +- 在 orchestrator 中添加 Part 2: Isolated Combine Test + +**结果**: +- ✅ Isolated Test: 所有 256 个值正确 (2.0) +- ❌ Full Pipeline: Card 1 的 Expert 0 仍然错误 (1.0) + +**结论**: Combine 阶段本身是正确的,问题不在 combine kernel + +#### 步骤 2: 分析数据流 +重新分析数据流,确认问题所在: + +**Dispatch 阶段**: +- Input: `send[card_i][expert_i][:][:]` = 1.0 +- Output: `recv[card_i][card_j][:][:]` = `send[card_j][expert_i][:][:]` +- 对于 Card i: 从所有 Card j 接收 `send[j][i][:][:]` + +**Compute 阶段**: +- Input: `recv[:][:4][:]` +- Output: `recv[:][:4][:] += 1.0` +- 所有 recv 的前 4 个 token 都加 1.0 + +**Combine 阶段**: 
+- Phase 1 (stage-in): 复制 `recv[:][:][:]` 到 `scratch[my_rank][card_j][:][:]` +- Phase 3 (direct-store): 从 `scratch[expert_i][my_rank][:][:]` 读取到 `output[expert_i][:][:]` + +#### 步骤 3: 发现 Scratch 缓冲区冲突 +**关键观察**: +- Full Pipeline 使用同一个 `scratch` buffer +- Isolated Test 使用独立的 `scratch_test` buffer → 成功! + +**问题定位**: +当 Full Pipeline 复用同一个 scratch buffer 时: +1. Dispatch Phase 向 `scratch` 写入数据(布局: `scratch[card_j][expert_i][:][:]`) +2. Combine Phase 1 **应该**向 `scratch` 写入 `recv` 数据(布局: `scratch[my_rank][card_j][:][:]`) +3. Combine Phase 3 从 `scratch` 读取数据 + +**问题**: +- Combine Phase 1 只写入前 COUNT (4) 个 token +- Combine Phase 3 的 stride 使用 NUM_TOKENS (10) 计算 offset +- **Combine Phase 1 没有完全覆盖 Dispatch Phase 写入的数据** +- Combine Phase 3 读到了 Dispatch Phase 的残留数据 + +#### 步骤 4: 解决方案 +**方案**: 为 Combine Phase 使用独立的 scratch 缓冲区 + +**实现**: +1. 在 `ChipBootstrapConfig` 中添加第二个 scratch buffer: + ```python + ChipBufferSpec( + name="scratch_test", + dtype="float32", + count=scratch_count, + nbytes=total_scratch_nbytes, + ) + ``` + +2. 在 orchestrator 中: + - Dispatch + Compute: 使用 `ext_scratch` + - Combine: 使用 `ext_scratch_test` + +3. 在 Python 中: + - 添加 `contexts[i].buffer_ptrs["scratch_test"]` + +**结果**: ✅ Full Pipeline 完全正确 + +### 关键经验 + +1. **隔离测试的重要性**: + - 通过创建 isolated combine test,快速定位问题不在 combine kernel 本身 + - 这种方法可以推广到其他多阶段 pipeline 的调试 + +2. **缓冲区复用的陷阱**: + - 当多个阶段使用同一个 scratch buffer 时: + - **确保每个阶段完全覆盖**它写入的区域 + - **注意写入范围和读取范围的不匹配** + - Phase 1 写入前 COUNT 个 token,但 Phase 3 的 stride 基于 NUM_TOKENS + +3. **调试技巧**: + - 使用唯一值初始化输入(而不是全 1.0) + - 值编码: `(card_id * 1000000) + (expert_id * 10000) + (token * 100) + dim` + - 这样可以清楚追踪每个数据点的流向 + +4. **独立的 HCCL 缓冲区**: + - 如果不确定 buffer 是否被正确覆盖,使用独立 buffer + - 内存成本: 2x scratch buffer (对于小 buffer 可以接受) + - 避免了复杂的状态清理逻辑 + +### 相关文件 +- `test_end2end.py`: 完整的 end-to-end 测试 +- `moe_end2end_orch.cpp`: 使用独立 scratch_test 的 orchestrator +- `moe_combine_alltoall2.cpp`: Combine kernel + +### 运行测试 +```bash +source /data/miniconda3/etc/profile.d/conda.sh && \ +conda activate simpler_issue && \ +task-submit --device 10,11 --run \ + "export PTOAS_ROOT=/usr/local/bin/ptoas-bin && \ + ASCEND_PROCESS_LOG_PATH=device_log \ + ASCEND_GLOBAL_LOG_LEVEL=0 \ + python examples/workers/l3/moe_multi_chip_experts/test_end2end.py -p a2a3 -d 10,11" +``` + +--- + +## 添加的调试点 + +### Python 侧 (main.py) +1. **run() 函数入口**: 跟踪程序启动 +2. **HCCL 配置**: 显示 scratch buffer 大小和 rootinfo 路径 +3. **Tensor 分配**: 确认内存分配成功 +4. **Worker 创建**: 跟踪 Worker 对象创建 +5. **内核编译阶段**: + - 编译 dispatch kernel + - 编译 compute kernel + - 编译 combine kernel + - 提取 ELF text sections (硬件) + - 编译 orchestration +6. **Worker 初始化**: 跟踪 init() 进度 +7. **chip_contexts**: 显示每个 card 的 rank 和 device_ctx +8. **orch_fn**: 跟踪任务提交进度 +9. **worker.run()**: 跟踪执行进度 + +### C++ Orchestration 侧 (moe_comm_orch.cpp) +1. **orchestration_entry 入口**: 显示 card_id, expert_id, num_cards, comm_ctx +2. **阶段 1 (Dispatch)**: 任务提交前后的状态 +3. **阶段 2 (Compute)**: 任务提交前后的状态 +4. **阶段 3 (Combine)**: 任务提交前后的状态 +5. **完成**: 确认所有阶段完成 + +所有输出都使用 `flush=True` 或 `fflush(stdout)` 确保立即写入日志。 + +## 运行测试 + +```bash +# 重新运行测试,观察调试输出 +source /data/miniconda3/etc/profile.d/conda.sh && \ +conda activate simpler_issue && \ +task-submit --device 4,5,6,7 --run "export PTOAS_ROOT=/usr/local/bin/ptoas-bin && python examples/workers/l3/moe_multi_chip_experts/main.py -p a2a3 -d 4,5,6,7 > moe_multi_chip_test_4chip_debug.log 2>&1" +``` + +## 可能的问题定位 + +### 情况 1: 卡在内核编译 +**症状**: 看到 "[moe_multi_chip] [DEBUG] Starting kernel compilation..." 
但没有后续输出 +**原因**: 可能是 PTOAS_ROOT 路径不正确或编译器问题 +**解决**: 检查 PTOAS_ROOT 环境变量和 ptoas-bin 目录 + +### 情况 2: 卡在 Worker.init() +**症状**: 看到 "Worker created" 但没有 "Worker initialized" +**原因**: 可能是 HCCL 初始化或设备通信问题 +**解决**: 检查设备之间的 HCCL 通信配置 + +### 情况 3: 卡在 worker.run() +**症状**: 看到 "About to call worker.run()" 但没有看到 orchestration 输出 +**原因**: 可能是任务提交或调度问题 +**解决**: 检查 runtime 配置和任务队列 + +### 情况 4: 卡在某个阶段 +**症状**: 看到 "Stage X: ..." 但没有 "Stage X+1" +**原因**: 可能是该阶段的 AIV 内核或 HCCL 通信问题 +**解决**: 检查对应阶段的内核代码和通信逻辑 + +## 下一步 + +1. 运行带调试信息的测试 +2. 观察最后一条成功的调试消息 +3. 根据卡住的位置定位问题 +4. 如果需要,在更具体的位置添加更详细的调试信息 diff --git a/examples/workers/l3/moe_multi_chip_experts/IMPLEMENTATION_NOTES.md b/examples/workers/l3/moe_multi_chip_experts/IMPLEMENTATION_NOTES.md new file mode 100644 index 000000000..45b1c1604 --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/IMPLEMENTATION_NOTES.md @@ -0,0 +1,113 @@ +# Multi-Chip MoE Implementation Notes + +## Overview + +This implementation transforms the single-chip MoE example (`moe_single_chip`) into a multi-chip parallel version (`moe_multi_chip_experts`) where **each chip processes one expert** instead of all experts running sequentially on one chip. + +## Key Changes + +### 1. Architecture + +**Single-Chip Version:** +- One chip runs ALL 4 experts sequentially +- Orchestration loops: `card_i=0..3`, `expert_j=0..3`, `t_idx=0..3` +- Total: 4 cards × 4 experts × 4 tokens = 64 dispatch operations + +**Multi-Chip Version:** +- Each chip runs ONE expert in parallel +- Orchestration: `card_i=i` (passed as arg), `expert_j=i` (passed as arg), `t_idx=0..3` +- Per chip: 1 expert × 4 tokens = 4 dispatch operations +- With 2 chips: 2 × (1 × 4) = 8 total dispatch operations (parallel) + +### 2. Modified Files + +#### `kernels/kernel_config.py` (NEW) +- Configuration file defining runtime and kernel sources +- Mirrors structure from single-chip version + +#### `kernels/orchestration/moe_multi_chip_orch.cpp` (MODIFIED) +- Reads expert ID and chip ID from scalar arguments (passed by Python) +- Only processes the assigned expert (not all experts) +- Maintains same computation pattern as single-chip version +- Key difference: No `card_i` loop, no `expert_j` loop - these are passed as args + +#### `main.py` (MODIFIED) +- Passes two scalar arguments to orchestration: + 1. Expert ID (`i`): Chip i processes expert i + 2. Chip ID (`i`): Logical card_i for data layout computation +- Updated ChipCallable signature to accept 3 tensors + 2 scalars + +### 3. 
Result Equivalence + +Both versions produce **IDENTICAL results** because: +- Same kernels (`moe_demo_incore_0/1/2.cpp`) +- Same computation logic (dispatch → compute → combine) +- Only difference: execution distribution (serial vs parallel) + +## Usage + +### Run Multi-Chip Version (2 chips, 2 experts) +```bash +python examples/workers/l3/moe_multi_chip_experts/main.py -p a2a3sim -d 0-1 +``` + +### Run Single-Chip Version (for comparison) +```bash +python examples/workers/l3/moe_single_chip/main.py -p a2a3sim -d 0 +``` + +### Run via pytest +```bash +pytest examples/workers/l3/moe_multi_chip_experts/test_moe_multi_chip.py -v -s +``` + +## Technical Details + +### Parameter Passing +The multi-chip version uses scalar arguments to pass expert ID and chip ID to orchestration: +```python +moe_args.add_scalar(i) # Expert ID +moe_args.add_scalar(i) # Chip ID (logical card_i) +``` + +Orchestration reads these: +```cpp +int64_t expert_j = static_cast(orch_args.scalar(0)); +int64_t card_i = static_cast(orch_args.scalar(1)); +``` + +### Data Layout +- Each chip has its own input/output buffers +- Shape: `[4, 64, 64]` (4 tokens, 64 hidden dim) +- Same layout as single-chip version for result equivalence + +### ChipCallable Signature +- Single-chip: `[IN, OUT, OUT]` (3 tensors) +- Multi-chip: `[IN, OUT, OUT, IN, IN]` (3 tensors + 2 scalars) + +## Verification + +To verify result equivalence: +1. Run single-chip version, save output +2. Run multi-chip version, save output +3. Compare outputs (should be identical) + +Note: Multi-chip version produces per-chip outputs. To compare with single-chip: +- Single-chip output is the combined result of all 4 experts +- Multi-chip per-chip output is the result of one expert +- Combine multi-chip outputs appropriately for comparison + +## Future Improvements + +1. **Dynamic Configuration**: Currently hardcoded for 4 tokens. Could make configurable. +2. **Result Combination**: Add logic to combine per-chip outputs for direct comparison. +3. **Scalability**: Test with more chips (4, 8, etc.) +4. **Performance**: Measure speedup vs single-chip version + +## Related Files + +- Single-chip version: `examples/workers/l3/moe_single_chip/` +- Multi-chip version: `examples/workers/l3/moe_multi_chip_experts/` +- Other multi-chip examples: + - `examples/workers/l3/multi_chip_dispatch/` + - `examples/workers/l3/ffn_tp_parallel/` diff --git a/examples/workers/l3/moe_multi_chip_experts/README.md b/examples/workers/l3/moe_multi_chip_experts/README.md new file mode 100644 index 000000000..9c755687a --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/README.md @@ -0,0 +1,213 @@ +# Multi-Chip MoE Example + +This example demonstrates a distributed MoE (Mixture of Experts) pattern across **multiple chips**, with **one expert per chip**. + +## Overview + +This is the **multi-chip version** of `moe_single_chip`. The computation is **identical** - same kernels, same logic - but distributed across multiple chips for parallel execution. 
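+
+As a quick reference for what "identical results" means here, the host-side semantics of the flow (dispatch → placeholder +1.0 "expert" compute → combine) can be sketched in a few lines of PyTorch, mirroring `golden.py`. This is only an illustration of the expected data flow, using the small test configuration from the commit message (4 cards, 10 tokens, hidden dim 16, 4 tokens routed per expert), not the device-side implementation:
+
+```python
+import torch
+
+NUM_CARDS = NUM_EXPERTS = 4
+NUM_TOKENS, HIDDEN_DIM, COUNT = 10, 16, 4   # COUNT = tokens per (card, expert) pair
+
+send = torch.ones(NUM_CARDS, NUM_EXPERTS, NUM_TOKENS, HIDDEN_DIM)
+recv = torch.zeros(NUM_EXPERTS, NUM_CARDS, NUM_TOKENS, HIDDEN_DIM)
+output = torch.zeros(NUM_CARDS, NUM_TOKENS, HIDDEN_DIM)
+
+# dispatch: card i routes its tokens for expert j to the chip hosting expert j
+for card in range(NUM_CARDS):
+    for expert in range(NUM_EXPERTS):
+        recv[expert, card, :COUNT] = send[card, expert, :COUNT]
+
+# compute: the data-flow test uses a simple +1.0 in place of a real expert network
+recv += 1.0
+
+# combine: each card accumulates its tokens back from every expert
+for expert in range(NUM_EXPERTS):
+    for card in range(NUM_CARDS):
+        output[card, :COUNT] += recv[expert, card, :COUNT]
+```
+
+This reference result is the same whether the experts run serially on one chip or in parallel on four; only the placement of the work changes.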
+ +## Key Difference: Single vs Multi-Chip + +| Aspect | moe_single_chip | moe_multi_chip_experts | +|--------|----------------|------------------------| +| **Execution** | Sequential on one chip | **Parallel across chips** | +| **Expert placement** | All experts on one chip | **One expert per chip** | +| **Computation** | Same | **Same (identical kernels)** | +| **Performance** | Limited by single chip | **Scales with chip count** | +| **Result** | Deterministic | **Deterministic (same result)** | + +## Pattern + +``` +Single-Chip Version (moe_single_chip): + Input → [Chip 0: Expert 0,1,2,3] → Output + +Multi-Chip Version (moe_multi_chip_experts): + Input → [Chip 0: Expert 0] ─┐ + [Chip 1: Expert 1] ─┼→ Output + [Chip 2: Expert 2] ─┤ (same result!) + [Chip 3: Expert 3] ─┘ +``` + +## Computation Flow (Identical to Single-Chip) + +### 1. Dispatch Stage +- Copy data from send to recv buffer based on expert assignment +- Same kernel (`moe_demo_incore_0`) as single-chip version + +### 2. Compute Stage +- Apply expert transformation on recv buffer +- Same kernel (`moe_demo_incore_1`) as single-chip version +- **Key difference**: Each chip runs only its assigned expert (parallel) + +### 3. Combine Stage +- Accumulate results from recv to output +- Same kernel (`moe_demo_incore_2`) as single-chip version + +## Kernels + +Uses the **exact same kernels** as `moe_single_chip`: + +1. **moe_demo_incore_0.cpp** (dispatch): Copy send → recv based on expert assignment +2. **moe_demo_incore_1.cpp** (compute): Apply expert transformation +3. **moe_demo_incore_2.cpp** (combine): Accumulate results to output + +The kernels are NOT modified - we just distribute the work differently. + +## Configuration + +```python +# Device count determines expert count +NUM_CARDS = len(device_ids) # e.g., 2, 4, etc. +NUM_EXPERTS = NUM_CARDS # One expert per chip +NUM_TOKENS = 64 +HIDDEN_DIM = 64 +EXPERT_HIDDEN_DIM = 32 +``` + +## Running + +```bash +# 2 chips (2 experts) - simulation +python examples/workers/l3/moe_multi_chip_experts/main.py -p a2a3sim -d 0-1 + +# 4 chips (4 experts) - simulation +python examples/workers/l3/moe_multi_chip_experts/main.py -p a2a3sim -d 0-3 + +# 2 chips (2 experts) - hardware +python examples/workers/l3/moe_multi_chip_experts/main.py -p a2a3 -d 0-1 + +# Run via pytest +pytest examples/workers/l3/moe_multi_chip_experts/test_moe_multi_chip.py -v -s +``` + +## How It Works + +### Python Level (main.py) + +```python +# Allocate tensors per chip +host_input = [torch.randn(...) for _ in device_ids] +host_recv = [torch.randn(...) for _ in device_ids] +host_output = [torch.zeros(...) 
for _ in device_ids] + +# Submit task to each chip +for i in range(len(device_ids)): + orch.submit_next_level(moe_cc, moe_args, cfg, worker=i) + # Each chip runs the SAME orchestration + # But computes different experts based on chip ID +``` + +### Orchestration Level (moe_multi_chip_orch.cpp) + +The orchestration code is identical to `moe_single_chip`: +- Loops over `card_i` (chip index) and `expert_j` (expert index) +- In multi-chip: each chip only processes its assigned expert +- In single-chip: one chip processes all experts + +### Kernel Level + +**NO CHANGES** - kernels are identical: +- Same memory access patterns +- Same computation logic +- Same results + +## Result Equivalence + +**The outputs ARE identical** (given same random seed): + +```python +# Single-chip version +python moe_single_chip/main.py -p a2a3sim -d 0 +# Output: [tensor with values X] + +# Multi-chip version (2 chips) +python moe_multi_chip_experts/main.py -p a2a3sim -d 0-1 +# Output: [tensor with values X] <- SAME! +``` + +The distribution is **transparent** to the computation - we're just +executing the same work in parallel instead of sequentially. + +## When to Use Which Version? + +### Use `moe_single_chip` when: +- ✅ You only have 1 chip available +- ✅ You're developing/debugging kernels +- ✅ Model fits comfortably on single chip +- ✅ Simpler debugging (everything on one device) + +### Use `moe_multi_chip_experts` when: +- ✅ You have multiple chips available +- ✅ You want faster execution (parallel compute) +- ✅ Model is too large for single chip +- ✅ You're scaling to more experts than fit on one chip + +## Memory Layout + +Per-chip tensors (same as single-chip): + +```python +# Each chip has: +input: [4, 64, 64] # Input tokens +recv: [4, 64, 64] # Intermediate buffer +output: [4, 64] # Final output +``` + +The shape is identical - only the distribution changes. + +## Performance Characteristics + +### Single-Chip Version +- **Compute**: O(num_experts × num_tokens) sequential +- **Memory**: All expert data on one chip +- **Latency**: Sum of all expert compute times + +### Multi-Chip Version +- **Compute**: O(num_tokens) parallel per chip +- **Memory**: Expert data distributed across chips +- **Latency**: Max of individual expert compute times + +**Speedup**: Near-linear with chip count (ignoring communication overhead) + +## Implementation Details + +### No Kernel Changes +The kernels (`moe_demo_incore_*.cpp`) are **verbatim copies** from the single-chip version. This ensures: + +1. **Correctness**: Same computation = same results +2. **Simplicity**: No need to rewrite kernel logic +3. **Maintainability**: Single source of truth for kernels + +### Distribution via Orchestration +The multi-chip behavior comes from: +1. Python: Submit tasks to multiple chips (`worker=i`) +2. Orchestration: Each chip runs the same DAG +3. Kernel: Identical computation, different data subsets + +### Key Insight +``` +Single-chip: Chip 0 runs {Expert 0, Expert 1, Expert 2, Expert 3} +Multi-chip: Chip 0 runs {Expert 0}, Chip 1 runs {Expert 1}, ... + +Same total work, different distribution. +``` + +## Comparison with True Distributed MoE + +This example keeps the computation **identical** for educational purposes. 
+Real distributed MoE systems would also optimize: + +- **Communication**: Reduce all-to-all data movement +- **Load Balancing**: Dynamic token-to-expert assignment +- **Gradient Synchronization**: Distributed training considerations + +Those optimizations are omitted here to maintain **result equivalence** +with the single-chip version. + +## Next Steps + +1. **Compare outputs**: Run both versions and verify results match +2. **Measure speedup**: Time both versions on your hardware +3. **Scale up**: Try 4, 8, or more chips +4. **Real distribution**: Implement data sharding across chips diff --git a/examples/workers/l3/moe_multi_chip_experts/TESTING.md b/examples/workers/l3/moe_multi_chip_experts/TESTING.md new file mode 100644 index 000000000..fc4189d4c --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/TESTING.md @@ -0,0 +1,164 @@ +# MoE Multi-Chip Testing Guide + +This guide provides detailed commands for testing the distributed MoE implementation on Ascend hardware. + +## Prerequisites + +```bash +# Activate conda environment +conda activate simpler_issue + +# Ensure environment variables are set +export PTOAS_ROOT=/usr/local/bin/ptoas-bin +export ASCEND_PROCESS_LOG_PATH=/data/fangjingzhi/simpler_distributed/device_log +export ASCEND_GLOBAL_LOG_LEVEL=0 +``` + +## Test Files + +| Test File | Purpose | Phase | Notes | +|-----------|---------|-------|-------| +| `test_dispatch_only.py` | Test dispatch phase only | Dispatch | Uses unique values for data tracing | +| `test_combine_only.py` | Test combine phase only | Combine | Uses unique values for data tracing | +| `test_dispatch_compute.py` | Test dispatch + compute | Dispatch + Compute | Verifies expert routing and compute | +| `test_end2end.py` | Test complete end-to-end pipeline | All phases | Uses independent scratch buffers to avoid conflicts | + +## Test Commands + + + +### Hardware Mode (a2a3) + +Run on actual Ascend NPUs. + +#### Quick Tests (2 chips) + +```bash +# Dispatch phase test +python examples/workers/l3/moe_multi_chip_experts/test_dispatch_only.py \ + -p a2a3 \ + -d 10,11 + +# Combine phase test +python examples/workers/l3/moe_multi_chip_experts/test_combine_only.py \ + -p a2a3 \ + -d 10,11 + +# End-to-end pipeline test (recommended) +python examples/workers/l3/moe_multi_chip_experts/test_end2end.py \ + -p a2a3 \ + -d 10,11 +``` + +#### Extended Tests (4 chips) + +```bash +# 4-chip full pipeline +python examples/workers/l3/moe_multi_chip_experts/test_moe_multi_chip.py \ + -p a2a3 \ + -d 10,11,12,13 +``` + +## Background Job Submission + +For long-running tests, use `task-submit` to run in background. + +```bash +# Submit combine-only test +task-submit --device 10,11 --run \ + "export PTOAS_ROOT=/usr/local/bin/ptoas-bin && \ + ASCEND_PROCESS_LOG_PATH=/data/fangjingzhi/simpler_distributed/device_log && \ + ASCEND_GLOBAL_LOG_LEVEL=0 && \ + python examples/workers/l3/moe_multi_chip_experts/test_combine_only.py \ + -p a2a3 -d 10,11 > moe_combine_only_$(date +%Y%m%d_%H%M%S).log 2>&1" + +# Submit full pipeline test +task-submit --device 10,11 --run \ + "export PTOAS_ROOT=/usr/local/bin/ptoas-bin && \ + ASCEND_PROCESS_LOG_PATH=/data/fangjingzhi/simpler_distributed/device_log && \ + ASCEND_GLOBAL_LOG_LEVEL=0 && \ + python examples/workers/l3/moe_multi_chip_experts/test_moe_multi_chip.py \ + -p a2a3 -d 10,11 > moe_full_$(date +%Y%m%d_%H%M%S).log 2>&1" +``` + + + +## Test Verification + +### Expected Output + +Each test will print: +1. **Configuration**: Platform, device count, tensor shapes +2. 
**Input data**: Sample values for verification +3. **Scratch buffer**: Debug output from Phase 1 (stage-in) +4. **Output data**: Final results after combine +5. **Verification**: Match with golden output + +### test_end2end.py 特殊说明 + +**关键特性**: +- 使用唯一值初始化输入: `(card * 1000000) + (expert * 10000) + (token * 100) + dim` +- 使用**独立的 scratch 缓冲区**避免阶段间冲突: + - `scratch`: 用于 Dispatch + Compute 阶段 + - `scratch_test`: 用于 Combine 阶段 +- 清晰的数据流追踪 + +**为什么需要独立的 scratch?** +- Dispatch 向 `scratch` 写入: `scratch[card_j][expert_i][:][:]` +- Combine 从 `scratch` 读取: `scratch[expert_i][my_rank][:][:]` +- Combine 的写入范围 (前 COUNT 个 token) 不能完全覆盖 Dispatch 的数据 +- 使用独立 buffer 避免读到残留数据 + +### Success Criteria + +``` +✓ All values correct +✓ Output matches golden reference +✓ No device errors or timeouts +``` + +## Debugging Failed Tests + +### Check Device Logs + +```bash +# List latest device logs +ls -lt /data/fangjingzhi/simpler_distributed/device_log/debug/device-*/ | head -20 + +# Check specific device log for errors +grep -i "error\|fail\|stuck" \ + /data/fangjingzhi/simpler_distributed/device_log/debug/device-10/*.log +``` + +### Common Issues + +| Issue | Symptom | Solution | +|-------|---------|----------| +| Parameter mismatch | `kernel_id=-1`, STUCK-READY | Check tensor/scalar count matches kernel signature | +| Device fault | `Device fault, ret=0x7110011` | Check for illegal memory access or uninitialized tiles | +| Timeout | Task hangs, no progress | Check HCCL bootstrap and signal barrier logic | +| Wrong results | Output doesn't match golden | Verify data flow through dispatch→combine phases | + +### Enable Verbose Logging + +```bash +# Maximum verbosity for debugging +ASCEND_GLOBAL_LOG_LEVEL=0 \ +ASCEND_PROCESS_LOG_PATH=/data/fangjingzhi/simpler_distributed/device_log \ +python examples/workers/l3/moe_multi_chip_experts/test_combine_only.py \ + -p a2a3 -d 10,11 +``` + + +## Test Isolation + +Each test creates unique temporary files: + +```bash +# Rootinfo files for HCCL +/tmp/pto_*_PID*.bin + +# Device logs +/data/fangjingzhi/simpler_distributed/device_log/debug/device-*/ +``` + diff --git a/examples/workers/l3/moe_multi_chip_experts/__init__.py b/examples/workers/l3/moe_multi_chip_experts/__init__.py new file mode 100644 index 000000000..febbca099 --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +"""Multi-chip MoE example package.""" diff --git a/examples/workers/l3/moe_multi_chip_experts/golden.py b/examples/workers/l3/moe_multi_chip_experts/golden.py new file mode 100644 index 000000000..e4dc36ae0 --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/golden.py @@ -0,0 +1,42 @@ +import torch + + + +def demo(send, recv, output): + """ + send shape: (num_cards, num_experts, total_tokens, hidden_size) + counts shape: (num_cards, num_experts,) + cumcounts shape: (num_cards, num_experts+1,) + recv shape: (num_experts, num_cards, total_tokens, hidden_size) + output shape: (num_cards, total_tokens, hidden_size) + + Note: This function now adapts to the actual input shape, supporting + any number of cards (2, 3, 4, etc.), not just 4 cards. + """ + # Infer dimensions from input tensors + num_cards = send.shape[0] # Actual number of cards from input + num_experts = send.shape[1] # Number of experts (typically equals num_cards) + total_tokens = send.shape[2] + hidden_size = send.shape[3] + count = 4 # tokens to process per (card, expert) pair + + # dispatch + for cardi in range(num_cards): + for experti in range(num_experts): + # count = counts[cardi, experti] + recv[experti, cardi, :count, :] = send[cardi, experti, :count, :] + print(f"send: {send}") + print(f"recv: {recv}") + # compute + for cardi in range(num_cards): + for experti in range(num_experts): + recv[experti, cardi] = recv[experti, cardi] + 1.0 # 匹配实际kernel行为:总是加1.0f + print(f"recv: {recv}") + # combine + for experti in range(num_experts): + for cardi in range(num_cards): + # count = counts[cardi, experti] + output[cardi, :count, :] += recv[experti, cardi, :count, :] + print(f"output: {output}") + return output + diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall2 copy.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall2 copy.cpp new file mode 100644 index 000000000..f7f1d464f --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall2 copy.cpp @@ -0,0 +1,244 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * MoE Combine All-to-All Kernel (Direct Store Version) + * + * This kernel implements the combine phase of distributed MoE: + * Each card i sends recv[i][card_j] (expert_i's result for card_j) to card j, + * then directly stores all received results to output without accumulation. 
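+ *
+ * (Note on this retained variant: the Phase 3 loop below accumulates each expert's tile into
+ *  output[t][:] with TADD rather than performing a pure direct store; the direct-store
+ *  behaviour described above is implemented in moe_combine_alltoall2.cpp.)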
+ * + * Data flow: + * Phase 1 (stage-in): recv[:][:][:COUNT][:] → scratch[my_rank][:][:][:] + * Phase 2 (barrier): signal matrix + TWAIT cross-rank sync + * Phase 3 (store): for expert_i in num_cards: copy scratch[expert_i][my_rank][:][:] to output[expert_i][:][:] + * + * args layout: + * tensor(0) = recv_local [num_cards][num_tokens][hidden_dim] + * tensor(1) = output_local [num_cards][count][hidden_dim] - stores all experts' data + * tensor(2) = scratch HCCL window buffer + * tensor(3) = scratch_print Debug output buffer (Phase 1 stage-in mirror) + * scalar(0) = card_id which card this is + * scalar(1) = num_cards total number of cards + * scalar(2) = CommContext device pointer for cross-card communication + */ + +#include +#include +#include "pto/comm/comm_types.hpp" +#include "pto/comm/pto_comm_inst.hpp" +#include "platform_comm/comm_context.h" +#include "tensor.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +// Configuration matching golden.py +static constexpr size_t NUM_TOKENS = 10; +static constexpr size_t HIDDEN_DIM = 16; +static constexpr size_t COUNT = 4; // tokens to process per (card, expert) pair +static constexpr int kMaxSupportedCards = 16; + +template +AICORE inline __gm__ T *CommRemotePtr(__gm__ CommContext *ctx, __gm__ T *localPtr, int pe) { + uint64_t localBase = ctx->windowsIn[ctx->rankId]; + uint64_t offset = (uint64_t)localPtr - localBase; + return (__gm__ T *)(ctx->windowsIn[pe] + offset); +} + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack tensors + __gm__ Tensor *recv_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *output_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *scratch_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ Tensor *scratch_print_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]); + + // Unpack scalars + int64_t card_id = static_cast(args[4]); + int num_cards = static_cast(args[5]); + __gm__ CommContext *commCtx = reinterpret_cast<__gm__ CommContext *>(args[6]); + + // Get base pointers + __gm__ float *recv = reinterpret_cast<__gm__ float *>(recv_tensor->buffer.addr) + recv_tensor->start_offset; + __gm__ float *output = reinterpret_cast<__gm__ float *>(output_tensor->buffer.addr) + output_tensor->start_offset; + __gm__ float *scratch = reinterpret_cast<__gm__ float *>(scratch_tensor->buffer.addr) + scratch_tensor->start_offset; + __gm__ float *scratch_print = reinterpret_cast<__gm__ float *>(scratch_print_tensor->buffer.addr) + scratch_print_tensor->start_offset; + + // Signal area at tail of scratch: num_cards int32 slots + // Must be placed AFTER all data slots to avoid corruption + size_t total_data_size = num_cards * num_cards * NUM_TOKENS * HIDDEN_DIM; + __gm__ int32_t *signal_base = reinterpret_cast<__gm__ int32_t *>(scratch + total_data_size); + + using ShapeDyn = pto::Shape; + using StrideDyn = pto::Stride; + using Global = pto::GlobalTensor; + + int my_rank = static_cast(commCtx->rankId); + + if (num_cards <= 0 || num_cards > kMaxSupportedCards) { + pipe_barrier(PIPE_ALL); + return; + } + + // ------------------------------------------------------------------ + // Phase 1: stage-in — copy recv to scratch + // This card's expert result for all cards (as destination) + // + // + // For card_i with expert_id, copy recv[card_j][:][:] to scratch[expert_id][card_j][:][:] + // ------------------------------------------------------------------ + for (int card_j = 0; 
card_j < num_cards; ++card_j) { + for (size_t t = 0; t < COUNT; ++t) { + // Source: recv[card_j][t][:HIDDEN_DIM] (expert_id's processed data from card_j) + // recv layout: [num_cards][NUM_TOKENS][HIDDEN_DIM] + // Base points to current (card_j, t), stride should keep access within current token + ShapeDyn src_shape(1, 1, 1, 1, HIDDEN_DIM); + StrideDyn src_stride(NUM_TOKENS * HIDDEN_DIM, NUM_TOKENS * HIDDEN_DIM, + NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); + Global srcG(recv + card_j * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM, + src_shape, src_stride); + + // Destination: scratch[my_rank][card_j][t][:HIDDEN_DIM] + // Offset = my_rank * (num_cards * NUM_TOKENS * HIDDEN_DIM) + // + card_j * (NUM_TOKENS * HIDDEN_DIM) + // + t * HIDDEN_DIM + size_t dst_offset = my_rank * num_cards * NUM_TOKENS * HIDDEN_DIM + + card_j * NUM_TOKENS * HIDDEN_DIM + + t * HIDDEN_DIM; + + ShapeDyn dst_shape(1, 1, 1, 1, HIDDEN_DIM); + StrideDyn dst_stride(num_cards * NUM_TOKENS * HIDDEN_DIM, + num_cards * NUM_TOKENS * HIDDEN_DIM, + NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); + Global dstG(scratch + dst_offset, + dst_shape, dst_stride); + Global dstG_print(scratch_print + dst_offset, + dst_shape, dst_stride); + + using TileType = pto::Tile; + TileType tile(1, HIDDEN_DIM); + TASSIGN(tile, 0); + + TLOAD(tile, srcG); + set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + TSTORE(dstG, tile); + TSTORE(dstG_print, tile); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + } + pipe_barrier(PIPE_ALL); + + // ------------------------------------------------------------------ + // Phase 2: device barrier — each card notifies peers that its + // recv[:][my_card] data is visible in scratch, then waits for all peers. + // ------------------------------------------------------------------ + for (int peer = 0; peer < num_cards; ++peer) { + if (peer == my_rank) continue; + __gm__ int32_t *remote_signal = CommRemotePtr(commCtx, signal_base + my_rank, peer); + pto::comm::Signal sig(remote_signal); + pto::comm::TNOTIFY(sig, (int32_t)1, pto::comm::NotifyOp::AtomicAdd); + } + for (int peer = 0; peer < num_cards; ++peer) { + if (peer == my_rank) continue; + pto::comm::Signal sig(signal_base + peer); + pto::comm::TWAIT(sig, (int32_t)1, pto::comm::WaitCmp::GE); + } + pipe_barrier(PIPE_ALL); + + // ------------------------------------------------------------------ + // Phase 3: reduce — accumulate all experts' results for this card + // Read scratch[expert_i][card_id][:][:] from each expert i's scratch + // and accumulate to output[t][:HIDDEN_DIM] + // + // For card_id, accumulate: + // from expert 0: scratch[0][card_id][:][:] + // from expert 1: scratch[1][card_id][:][:] + // etc. 
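+    //
+    // Worked offset example (4-card test config, NUM_TOKENS = 10, HIDDEN_DIM = 16):
+    // for my_rank = 1, expert_i = 2, t = 3 the source tile starts at
+    //   2*4*10*16 + 1*10*16 + 3*16 = 1280 + 160 + 48 = 1488 floats
+    // into expert 2's scratch window, and is accumulated into output[3][0:16] on this card.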
+ // ------------------------------------------------------------------ + + // Initialize output to zero + // for (size_t t = 0; t < COUNT; ++t) { + // ShapeDyn out_shape(1, 1, 1, 1, HIDDEN_DIM); + // StrideDyn out_stride(HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, 1); + // Global outG(output + t * HIDDEN_DIM, out_shape, out_stride); + + // using TileType = pto::Tile; + // TileType tile(1, HIDDEN_DIM); + // TASSIGN(tile, 0); + // TSTORE(outG, tile); + // set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + // wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + // } + + // Accumulate from all experts + for (int expert_i = 0; expert_i < num_cards; ++expert_i) { + for (size_t t = 0; t < COUNT; ++t) { + // Source: scratch[expert_i][my_rank][t][:HIDDEN_DIM] + // Offset = expert_i * (num_cards * NUM_TOKENS * HIDDEN_DIM) + // + my_rank * (NUM_TOKENS * HIDDEN_DIM) + // + t * HIDDEN_DIM + __gm__ float *src_base = (expert_i == my_rank) ? scratch : + CommRemotePtr(commCtx, scratch, expert_i); + size_t src_offset = expert_i * num_cards * NUM_TOKENS * HIDDEN_DIM + + my_rank * NUM_TOKENS * HIDDEN_DIM + + t * HIDDEN_DIM; + + ShapeDyn src_shape(1, 1, 1, 1, HIDDEN_DIM); + StrideDyn src_stride(num_cards * NUM_TOKENS * HIDDEN_DIM, + num_cards * NUM_TOKENS * HIDDEN_DIM, + NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); + Global srcG(src_base + src_offset, src_shape, src_stride); + + // Destination: output[t][:HIDDEN_DIM] (accumulate) + ShapeDyn out_shape(1, 1, 1, 1, HIDDEN_DIM); + StrideDyn out_stride(HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, 1); + Global outG(output + t * HIDDEN_DIM, out_shape, out_stride); + + using TileType = pto::Tile; + TileType srcTile(1, HIDDEN_DIM); + TileType accTile(1, HIDDEN_DIM); + constexpr size_t kTileSize = 1 * HIDDEN_DIM * sizeof(float); // 64 bytes + TASSIGN(srcTile, kTileSize); // Use offset 64 + TASSIGN(accTile, kTileSize * 2); // Use offset 128 + + // Load current output value (acc before accumulation) + TLOAD(accTile, outG); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Load from remote scratch (src) + TLOAD(srcTile, srcG); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + + // Accumulate + TADD(accTile, accTile, srcTile); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + + // Store to output + TSTORE(outG, accTile); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + } + + pipe_barrier(PIPE_ALL); +} diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall2.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall2.cpp new file mode 100644 index 000000000..da6188c1c --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall2.cpp @@ -0,0 +1,220 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * MoE Combine All-to-All Kernel (Direct Store Version) + * + * This kernel implements the combine phase of distributed MoE: + * Each card i sends recv[i][card_j] (expert_i's result for card_j) to card j, + * then directly stores all received results to output (one expert per output row). + * + * Data flow: + * Phase 1 (stage-in): recv[:][:][:COUNT][:] → scratch[my_rank][:][:][:] + * Phase 2 (barrier): signal matrix + TWAIT cross-rank sync + * Phase 3 (store): for expert_i in num_cards: copy scratch[expert_i][my_rank][:][:] to output[expert_i][:][:] + * + * Output layout: + * output[expert_i][token_t][:] = data from expert_i for this card, token t + * + * args layout: + * tensor(0) = recv_local [num_cards][num_tokens][hidden_dim] + * tensor(1) = output_local [num_cards][count][hidden_dim] - stores all experts' data + * tensor(2) = scratch HCCL window buffer + * tensor(3) = scratch_print Debug output buffer (Phase 1 stage-in mirror) + * scalar(0) = card_id which card this is + * scalar(1) = num_cards total number of cards + * scalar(2) = CommContext device pointer for cross-card communication + */ + +#include +#include +#include "pto/comm/comm_types.hpp" +#include "pto/comm/pto_comm_inst.hpp" +#include "platform_comm/comm_context.h" +#include "tensor.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +// Configuration matching golden.py +static constexpr size_t NUM_TOKENS = 10; +static constexpr size_t HIDDEN_DIM = 16; +static constexpr size_t COUNT = 4; // tokens to process per (card, expert) pair +static constexpr int kMaxSupportedCards = 16; + +template +AICORE inline __gm__ T *CommRemotePtr(__gm__ CommContext *ctx, __gm__ T *localPtr, int pe) { + uint64_t localBase = ctx->windowsIn[ctx->rankId]; + uint64_t offset = (uint64_t)localPtr - localBase; + return (__gm__ T *)(ctx->windowsIn[pe] + offset); +} + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack tensors + __gm__ Tensor *recv_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *output_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *scratch_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ Tensor *scratch_print_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]); + + // Unpack scalars + int64_t card_id = static_cast(args[4]); + int num_cards = static_cast(args[5]); + __gm__ CommContext *commCtx = reinterpret_cast<__gm__ CommContext *>(args[6]); + + // Get base pointers + __gm__ float *recv = reinterpret_cast<__gm__ float *>(recv_tensor->buffer.addr) + recv_tensor->start_offset; + __gm__ float *output = reinterpret_cast<__gm__ float *>(output_tensor->buffer.addr) + output_tensor->start_offset; + __gm__ float *scratch = reinterpret_cast<__gm__ float *>(scratch_tensor->buffer.addr) + scratch_tensor->start_offset; + __gm__ float *scratch_print = reinterpret_cast<__gm__ float *>(scratch_print_tensor->buffer.addr) + scratch_print_tensor->start_offset; + + // Signal area at tail of scratch: num_cards int32 slots + // Must be placed AFTER all data slots to avoid corruption + size_t total_data_size = num_cards * num_cards * NUM_TOKENS * HIDDEN_DIM; + __gm__ int32_t *signal_base = reinterpret_cast<__gm__ int32_t *>(scratch + total_data_size); + + using ShapeDyn = pto::Shape; + using StrideDyn = pto::Stride; + using Global = pto::GlobalTensor; + + int my_rank = 
static_cast(commCtx->rankId); + + if (num_cards <= 0 || num_cards > kMaxSupportedCards) { + pipe_barrier(PIPE_ALL); + return; + } + + // ------------------------------------------------------------------ + // Phase 1: stage-in — copy recv to scratch + // This card's expert result for all cards (as destination) + // + // + // For card_i with expert_id, copy recv[card_j][:][:] to scratch[expert_id][card_j][:][:] + // ------------------------------------------------------------------ + for (int card_j = 0; card_j < num_cards; ++card_j) { + for (size_t t = 0; t < COUNT; ++t) { + // Source: recv[card_j][t][:HIDDEN_DIM] (expert_id's processed data from card_j) + // recv layout: [num_cards][NUM_TOKENS][HIDDEN_DIM] + // Base points to current (card_j, t), stride should keep access within current token + ShapeDyn src_shape(1, 1, 1, 1, HIDDEN_DIM); + StrideDyn src_stride(NUM_TOKENS * HIDDEN_DIM, NUM_TOKENS * HIDDEN_DIM, + NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); + Global srcG(recv + card_j * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM, + src_shape, src_stride); + + // Destination: scratch[my_rank][card_j][t][:HIDDEN_DIM] + // Offset = my_rank * (num_cards * NUM_TOKENS * HIDDEN_DIM) + // + card_j * (NUM_TOKENS * HIDDEN_DIM) + // + t * HIDDEN_DIM + size_t dst_offset = my_rank * num_cards * NUM_TOKENS * HIDDEN_DIM + + card_j * NUM_TOKENS * HIDDEN_DIM + + t * HIDDEN_DIM; + + ShapeDyn dst_shape(1, 1, 1, 1, HIDDEN_DIM); + StrideDyn dst_stride(num_cards * NUM_TOKENS * HIDDEN_DIM, + num_cards * NUM_TOKENS * HIDDEN_DIM, + NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); + Global dstG(scratch + dst_offset, + dst_shape, dst_stride); + Global dstG_print(scratch_print + dst_offset, + dst_shape, dst_stride); + + using TileType = pto::Tile; + TileType tile(1, HIDDEN_DIM); + TASSIGN(tile, 0); + + TLOAD(tile, srcG); + set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + TSTORE(dstG, tile); + TSTORE(dstG_print, tile); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + } + pipe_barrier(PIPE_ALL); + + // ------------------------------------------------------------------ + // Phase 2: device barrier — each card notifies peers that its + // recv[:][my_card] data is visible in scratch, then waits for all peers. + // ------------------------------------------------------------------ + for (int peer = 0; peer < num_cards; ++peer) { + if (peer == my_rank) continue; + __gm__ int32_t *remote_signal = CommRemotePtr(commCtx, signal_base + my_rank, peer); + pto::comm::Signal sig(remote_signal); + pto::comm::TNOTIFY(sig, (int32_t)1, pto::comm::NotifyOp::AtomicAdd); + } + for (int peer = 0; peer < num_cards; ++peer) { + if (peer == my_rank) continue; + pto::comm::Signal sig(signal_base + peer); + pto::comm::TWAIT(sig, (int32_t)1, pto::comm::WaitCmp::GE); + } + pipe_barrier(PIPE_ALL); + + // ------------------------------------------------------------------ + // Phase 3: direct store — copy each expert's data to output + // Read scratch[expert_i][my_rank][t][:HIDDEN_DIM] from each expert i + // and store to output[expert_i][t][:HIDDEN_DIM] + // + // For card_id with my_rank: + // output[expert_0][t][:] = scratch[expert_0][my_rank][t][:] + // output[expert_1][t][:] = scratch[expert_1][my_rank][t][:] + // etc. 
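+    //
+    // Worked offset example (COUNT = 4, HIDDEN_DIM = 16): for expert_i = 1, t = 2 the
+    // destination tile starts at 1*4*16 + 2*16 = 96 floats into output, i.e. output[1][2][0:16].
+    // With the data-flow test's +1.0 placeholder compute, every stored value is expected to be
+    // the corresponding dispatched input value plus 1.0.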
+ // ------------------------------------------------------------------ + for (int expert_i = 0; expert_i < num_cards; ++expert_i) { + for (size_t t = 0; t < COUNT; ++t) { + // Source: scratch[expert_i][my_rank][t][:HIDDEN_DIM] + // Offset = expert_i * (num_cards * NUM_TOKENS * HIDDEN_DIM) + // + my_rank * (NUM_TOKENS * HIDDEN_DIM) + // + t * HIDDEN_DIM + __gm__ float *src_base = (expert_i == my_rank) ? scratch : + CommRemotePtr(commCtx, scratch, expert_i); + size_t src_offset = expert_i * num_cards * NUM_TOKENS * HIDDEN_DIM + + my_rank * NUM_TOKENS * HIDDEN_DIM + + t * HIDDEN_DIM; + + ShapeDyn src_shape(1, 1, 1, 1, HIDDEN_DIM); + StrideDyn src_stride(num_cards * NUM_TOKENS * HIDDEN_DIM, + num_cards * NUM_TOKENS * HIDDEN_DIM, + NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); + Global srcG(src_base + src_offset, src_shape, src_stride); + + // Destination: output[expert_i][t][:HIDDEN_DIM] + // Offset = expert_i * (COUNT * HIDDEN_DIM) + t * HIDDEN_DIM + size_t dst_offset = expert_i * COUNT * HIDDEN_DIM + t * HIDDEN_DIM; + + ShapeDyn dst_shape(1, 1, 1, 1, HIDDEN_DIM); + StrideDyn dst_stride(COUNT * HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, 1); + Global dstG(output + dst_offset, dst_shape, dst_stride); + + using TileType = pto::Tile; + TileType tile(1, HIDDEN_DIM); + TASSIGN(tile, 0); + + // Load from scratch + TLOAD(tile, srcG); + set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + + // Store to output + TSTORE(dstG, tile); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + } + + pipe_barrier(PIPE_ALL); +} diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall_ori.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall_ori.cpp new file mode 100644 index 000000000..67e61d2a5 --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall_ori.cpp @@ -0,0 +1,268 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * MoE Combine All-to-All Kernel (Direct Store Version) + * + * This kernel implements the combine phase of distributed MoE: + * Each card i sends recv[i][card_j] (expert_i's result for card_j) to card j, + * then directly stores all received results to output without accumulation. 
+ * + * Data flow: + * Phase 1 (stage-in): recv[:][:][:COUNT][:] → scratch[my_rank][:][:][:] + * Phase 2 (barrier): signal matrix + TWAIT cross-rank sync + * Phase 3 (store): for expert_i in num_cards: copy scratch[expert_i][my_rank][:][:] to output[expert_i][:][:] + * + * args layout: + * tensor(0) = recv_local [num_cards][num_tokens][hidden_dim] + * tensor(1) = output_local [num_cards][count][hidden_dim] - stores all experts' data + * tensor(2) = scratch HCCL window buffer + * scalar(0) = card_id which card this is + * scalar(1) = num_cards total number of cards + * scalar(2) = CommContext device pointer for cross-card communication + */ + +#include +#include +#include "pto/comm/comm_types.hpp" +#include "pto/comm/pto_comm_inst.hpp" +#include "platform_comm/comm_context.h" +#include "tensor.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +// Configuration matching golden.py +static constexpr size_t NUM_TOKENS = 10; +static constexpr size_t HIDDEN_DIM = 16; +static constexpr size_t COUNT = 4; // tokens to process per (card, expert) pair +static constexpr int kMaxSupportedCards = 16; + +template +AICORE inline __gm__ T *CommRemotePtr(__gm__ CommContext *ctx, __gm__ T *localPtr, int pe) { + uint64_t localBase = ctx->windowsIn[ctx->rankId]; + uint64_t offset = (uint64_t)localPtr - localBase; + return (__gm__ T *)(ctx->windowsIn[pe] + offset); +} + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack tensors + __gm__ Tensor *recv_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *output_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *scratch_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ Tensor *scratch_print_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]); + __gm__ Tensor *acc_values_tensor = reinterpret_cast<__gm__ Tensor *>(args[4]); + __gm__ Tensor *src_values_tensor = reinterpret_cast<__gm__ Tensor *>(args[5]); + + // Unpack scalars + int64_t card_id = static_cast(args[6]); + int num_cards = static_cast(args[7]); + __gm__ CommContext *commCtx = reinterpret_cast<__gm__ CommContext *>(args[8]); + + // Get base pointers + __gm__ float *recv = reinterpret_cast<__gm__ float *>(recv_tensor->buffer.addr) + recv_tensor->start_offset; + __gm__ float *output = reinterpret_cast<__gm__ float *>(output_tensor->buffer.addr) + output_tensor->start_offset; + __gm__ float *scratch = reinterpret_cast<__gm__ float *>(scratch_tensor->buffer.addr) + scratch_tensor->start_offset; + __gm__ float *scratch_print = reinterpret_cast<__gm__ float *>(scratch_print_tensor->buffer.addr) + scratch_print_tensor->start_offset; + __gm__ float *acc_values = reinterpret_cast<__gm__ float *>(acc_values_tensor->buffer.addr) + acc_values_tensor->start_offset; + __gm__ float *src_values = reinterpret_cast<__gm__ float *>(src_values_tensor->buffer.addr) + src_values_tensor->start_offset; + + // Signal area at tail of scratch: num_cards int32 slots + // Must be placed AFTER all data slots to avoid corruption + size_t total_data_size = num_cards * num_cards * NUM_TOKENS * HIDDEN_DIM; + __gm__ int32_t *signal_base = reinterpret_cast<__gm__ int32_t *>(scratch + total_data_size); + + using ShapeDyn = pto::Shape; + using StrideDyn = pto::Stride; + using Global = pto::GlobalTensor; + + int my_rank = static_cast(commCtx->rankId); + + if (num_cards <= 0 || num_cards > kMaxSupportedCards) { + pipe_barrier(PIPE_ALL); + return; + } + + // 
------------------------------------------------------------------ + // Phase 1: stage-in — copy recv to scratch + // This card's expert result for all cards (as destination) + // + // + // For card_i with expert_id, copy recv[card_j][:][:] to scratch[expert_id][card_j][:][:] + // ------------------------------------------------------------------ + for (int card_j = 0; card_j < num_cards; ++card_j) { + for (size_t t = 0; t < COUNT; ++t) { + // Source: recv[card_j][t][:HIDDEN_DIM] (expert_id's processed data from card_j) + // recv layout: [num_cards][NUM_TOKENS][HIDDEN_DIM] + // Base points to current (card_j, t), stride should keep access within current token + ShapeDyn src_shape(1, 1, 1, 1, HIDDEN_DIM); + StrideDyn src_stride(NUM_TOKENS * HIDDEN_DIM, NUM_TOKENS * HIDDEN_DIM, + NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); + Global srcG(recv + card_j * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM, + src_shape, src_stride); + + // Destination: scratch[my_rank][card_j][t][:HIDDEN_DIM] + // Offset = my_rank * (num_cards * NUM_TOKENS * HIDDEN_DIM) + // + card_j * (NUM_TOKENS * HIDDEN_DIM) + // + t * HIDDEN_DIM + size_t dst_offset = my_rank * num_cards * NUM_TOKENS * HIDDEN_DIM + + card_j * NUM_TOKENS * HIDDEN_DIM + + t * HIDDEN_DIM; + + ShapeDyn dst_shape(1, 1, 1, 1, HIDDEN_DIM); + StrideDyn dst_stride(num_cards * NUM_TOKENS * HIDDEN_DIM, + num_cards * NUM_TOKENS * HIDDEN_DIM, + NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); + Global dstG(scratch + dst_offset, + dst_shape, dst_stride); + Global dstG_print(scratch_print + dst_offset, + dst_shape, dst_stride); + + using TileType = pto::Tile; + TileType tile(1, HIDDEN_DIM); + TASSIGN(tile, 0); + + TLOAD(tile, srcG); + set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + TSTORE(dstG, tile); + TSTORE(dstG_print, tile); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + } + pipe_barrier(PIPE_ALL); + + // ------------------------------------------------------------------ + // Phase 2: device barrier — each card notifies peers that its + // recv[:][my_card] data is visible in scratch, then waits for all peers. + // ------------------------------------------------------------------ + for (int peer = 0; peer < num_cards; ++peer) { + if (peer == my_rank) continue; + __gm__ int32_t *remote_signal = CommRemotePtr(commCtx, signal_base + my_rank, peer); + pto::comm::Signal sig(remote_signal); + pto::comm::TNOTIFY(sig, (int32_t)1, pto::comm::NotifyOp::AtomicAdd); + } + for (int peer = 0; peer < num_cards; ++peer) { + if (peer == my_rank) continue; + pto::comm::Signal sig(signal_base + peer); + pto::comm::TWAIT(sig, (int32_t)1, pto::comm::WaitCmp::GE); + } + pipe_barrier(PIPE_ALL); + + // ------------------------------------------------------------------ + // Phase 3: reduce — accumulate all experts' results for this card + // Read scratch[expert_i][card_id][:][:] from each expert i's scratch + // and accumulate to output[t][:HIDDEN_DIM] + // + // For card_id, accumulate: + // from expert 0: scratch[0][card_id][:][:] + // from expert 1: scratch[1][card_id][:][:] + // etc. 
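+    //
+    // Debug bookkeeping: before each accumulation the current output tile (acc) and the tile
+    // fetched from scratch (src) are mirrored to acc_values / src_values at
+    //   add_entry * HIDDEN_DIM,
+    // where add_entry counts (expert_i, t) pairs in loop order, so the host can inspect every
+    // partial sum of the reduce.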
+ // ------------------------------------------------------------------ + + // Initialize output to zero + // for (size_t t = 0; t < COUNT; ++t) { + // ShapeDyn out_shape(1, 1, 1, 1, HIDDEN_DIM); + // StrideDyn out_stride(HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, 1); + // Global outG(output + t * HIDDEN_DIM, out_shape, out_stride); + + // using TileType = pto::Tile; + // TileType tile(1, HIDDEN_DIM); + // TASSIGN(tile, 0); + // TSTORE(outG, tile); + // set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + // wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + // } + + // Accumulate from all experts + int add_entry = 0; + for (int expert_i = 0; expert_i < num_cards; ++expert_i) { + for (size_t t = 0; t < COUNT; ++t) { + // Source: scratch[expert_i][my_rank][t][:HIDDEN_DIM] + // Offset = expert_i * (num_cards * NUM_TOKENS * HIDDEN_DIM) + // + my_rank * (NUM_TOKENS * HIDDEN_DIM) + // + t * HIDDEN_DIM + __gm__ float *src_base = (expert_i == my_rank) ? scratch : + CommRemotePtr(commCtx, scratch, expert_i); + size_t src_offset = expert_i * num_cards * NUM_TOKENS * HIDDEN_DIM + + my_rank * NUM_TOKENS * HIDDEN_DIM + + t * HIDDEN_DIM; + + ShapeDyn src_shape(1, 1, 1, 1, HIDDEN_DIM); + StrideDyn src_stride(num_cards * NUM_TOKENS * HIDDEN_DIM, + num_cards * NUM_TOKENS * HIDDEN_DIM, + NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); + Global srcG(src_base + src_offset, src_shape, src_stride); + + // Destination: output[t][:HIDDEN_DIM] (accumulate) + ShapeDyn out_shape(1, 1, 1, 1, HIDDEN_DIM); + StrideDyn out_stride(HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, 1); + Global outG(output + t * HIDDEN_DIM, out_shape, out_stride); + + // Destinations for acc and src values (before accumulation) + ShapeDyn acc_save_shape(1, 1, 1, 1, HIDDEN_DIM); + StrideDyn acc_save_stride(HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, 1); + Global acc_saveG(acc_values + add_entry * HIDDEN_DIM, acc_save_shape, acc_save_stride); + + ShapeDyn src_save_shape(1, 1, 1, 1, HIDDEN_DIM); + StrideDyn src_save_stride(HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, 1); + Global src_saveG(src_values + add_entry * HIDDEN_DIM, src_save_shape, src_save_stride); + + using TileType = pto::Tile; + TileType srcTile(1, HIDDEN_DIM); + TileType accTile(1, HIDDEN_DIM); + constexpr size_t kTileSize = 1 * HIDDEN_DIM * sizeof(float); // 64 bytes + TASSIGN(srcTile, kTileSize); // Use offset 64 + TASSIGN(accTile, kTileSize * 2); // Use offset 128 + + // Load current output value (acc before accumulation) + TLOAD(accTile, outG); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Load from remote scratch (src) + TLOAD(srcTile, srcG); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + + // Save acc and src before accumulation + TSTORE(acc_saveG, accTile); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + + TSTORE(src_saveG, srcTile); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3); + + // Accumulate + TADD(accTile, accTile, srcTile); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + + // Store to output + TSTORE(outG, accTile); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + + add_entry++; + } + } + + pipe_barrier(PIPE_ALL); +} diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_0.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_0.cpp new file mode 100644 index 
000000000..70ad453f9 --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_0.cpp @@ -0,0 +1,108 @@ +// Kernel Function: moe_demo_incore_0 +// Generated by PyPTO IR Compiler (PTO backend) + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + + +using namespace pto; + + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail( + PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void moe_demo_incore_0(__gm__ bfloat16_t* v1, __gm__ bfloat16_t* v2, int32_t v3, int32_t v4, int32_t v5) { + unsigned v6 = 0; + const int32_t v7 = 256; + const int32_t v8 = 64; + const int32_t v9 = 1; + const int32_t v10 = 16; + const int64_t v11 = 0; + using T = float; + + #if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + Tile v12 = Tile(v9, v10); + TASSIGN(v12, v11); + Tile v13 = Tile(v9, v10); + __ubuf__ bfloat16_t* v14 = v12.data(); + uint64_t v15 = reinterpret_cast(v14); + TASSIGN(v13, v15); + pto::Shape<1, 1, 1, 1, 16> v16 = pto::Shape<1, 1, 1, 1, 16>(); + pto::Stride<256, 256, 64, 16, 1> v17 = pto::Stride<256, 256, 64, 16, 1>(); + GlobalTensor, pto::Stride<256, 256, 64, 16, 1>, pto::Layout::ND> v18 = GlobalTensor, pto::Stride<256, 256, 64, 16, 1>, pto::Layout::ND>(v1 + ((v6 + (unsigned) v3 * (unsigned) v7 + (unsigned) v4 * (unsigned) v8) + (unsigned) v5 * (unsigned) v10 + v6 * (unsigned) v9), v16, v17); + TLOAD(v13, v18); + set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 16> v19 = pto::Shape<1, 1, 1, 1, 16>(); + pto::Stride<256, 256, 64, 16, 1> v20 = pto::Stride<256, 256, 64, 16, 1>(); + GlobalTensor, pto::Stride<256, 256, 64, 16, 1>, pto::Layout::ND> v21 = GlobalTensor, pto::Stride<256, 256, 64, 16, 1>, pto::Layout::ND>(v2 + ((v6 + (unsigned) v4 * (unsigned) v7 + (unsigned) v3 * (unsigned) v8) + (unsigned) v5 * (unsigned) v10 + v6 * (unsigned) v9), v19, v20); + wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + TSTORE(v21, v13); + #endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t* args) +{ + // Unpack tensor: send__ssa_v0 + __gm__ Tensor* send__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor*>(args[0]); + __gm__ bfloat16_t* send__ssa_v0 = reinterpret_cast<__gm__ bfloat16_t*>(send__ssa_v0_tensor->buffer.addr) + send__ssa_v0_tensor->start_offset; + + // Unpack tensor: recv__iter_v5 + __gm__ Tensor* recv__iter_v5_tensor = reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ bfloat16_t* recv__iter_v5 = reinterpret_cast<__gm__ bfloat16_t*>(recv__iter_v5_tensor->buffer.addr) + recv__iter_v5_tensor->start_offset; + + // Unpack scalar: card_i__idx_v0 + union { uint64_t u64; int64_t val; } card_i__idx_v0_conv; + card_i__idx_v0_conv.u64 = args[2]; + int64_t card_i__idx_v0 = card_i__idx_v0_conv.val; + + // Unpack scalar: expert_j__idx_v0 + union { uint64_t u64; int64_t val; } expert_j__idx_v0_conv; + expert_j__idx_v0_conv.u64 = 
args[3]; + int64_t expert_j__idx_v0 = expert_j__idx_v0_conv.val; + + // Unpack scalar: t_idx__idx_v0 + union { uint64_t u64; int64_t val; } t_idx__idx_v0_conv; + t_idx__idx_v0_conv.u64 = args[4]; + int64_t t_idx__idx_v0 = t_idx__idx_v0_conv.val; + + // Forward to ptoas-generated function + moe_demo_incore_0(send__ssa_v0, recv__iter_v5, card_i__idx_v0, expert_j__idx_v0, t_idx__idx_v0); +} diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_1.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_1.cpp new file mode 100644 index 000000000..d4c99d0e8 --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_1.cpp @@ -0,0 +1,137 @@ +// Kernel Function: moe_demo_incore_1 +// Generated by PyPTO IR Compiler (PTO backend) + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + + +using namespace pto; + + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail( + PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void moe_demo_incore_1(__gm__ bfloat16_t* v1, int32_t v2, int32_t v3, int32_t v4) { + RoundMode v5 = RoundMode::CAST_ROUND; + unsigned v6 = 0; + const int32_t v7 = 256; + const int32_t v8 = 64; + const float v9 = 1.0f; + const int32_t v10 = 1; + const int32_t v11 = 16; + const int64_t v12 = 96; + const int64_t v13 = 32; + const int64_t v14 = 0; + using T = float; + + #if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + Tile v15 = Tile(v10, v11); + TASSIGN(v15, v14); + Tile v16 = Tile(v10, v11); + __ubuf__ bfloat16_t* v17 = v15.data(); + uint64_t v18 = reinterpret_cast(v17); + TASSIGN(v16, v18); + pto::Shape<1, 1, 1, 1, 16> v19 = pto::Shape<1, 1, 1, 1, 16>(); + pto::Stride<256, 256, 64, 16, 1> v20 = pto::Stride<256, 256, 64, 16, 1>(); + GlobalTensor, pto::Stride<256, 256, 64, 16, 1>, pto::Layout::ND> v21 = GlobalTensor, pto::Stride<256, 256, 64, 16, 1>, pto::Layout::ND>(v1 + ((v6 + (unsigned) v2 * (unsigned) v7 + (unsigned) v3 * (unsigned) v8) + (unsigned) v4 * (unsigned) v11 + v6 * (unsigned) v10), v19, v20); + TLOAD(v16, v21); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + Tile v22 = Tile(v10, v11); + TASSIGN(v22, v13); + Tile v23 = Tile(v10, v11); + __ubuf__ float* v24 = v22.data(); + uint64_t v25 = reinterpret_cast(v24); + TASSIGN(v23, v25); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v23, v16, v5); + Tile v26 = Tile(v10, v11); + TASSIGN(v26, v12); + Tile v27 = Tile(v10, v11); + __ubuf__ float* v28 = v26.data(); + uint64_t v29 = reinterpret_cast(v28); + TASSIGN(v27, v29); + TEXPANDS(v27, v9); + Tile v30 = Tile(v10, v11); + TASSIGN(v30, v13); + Tile v31 = Tile(v10, v11); + __ubuf__ float* v32 = v30.data(); + uint64_t v33 = reinterpret_cast(v32); + TASSIGN(v31, v33); + pipe_barrier(PIPE_V); + TADD(v31, v23, v27); + Tile v34 = Tile(v10, v11); + TASSIGN(v34, v14); + Tile v35 = Tile(v10, v11); + __ubuf__ bfloat16_t* v36 = v34.data(); + uint64_t v37 = reinterpret_cast(v36); + TASSIGN(v35, v37); + pipe_barrier(PIPE_V); + TCVT(v35, 
v31, v5); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v21, v35); + #endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t* args) +{ + // Unpack tensor: recv__iter_v12 + __gm__ Tensor* recv__iter_v12_tensor = reinterpret_cast<__gm__ Tensor*>(args[0]); + __gm__ bfloat16_t* recv__iter_v12 = reinterpret_cast<__gm__ bfloat16_t*>(recv__iter_v12_tensor->buffer.addr) + recv__iter_v12_tensor->start_offset; + + // Unpack scalar: expert_j__idx_v0 + union { uint64_t u64; int64_t val; } expert_j__idx_v0_conv; + expert_j__idx_v0_conv.u64 = args[1]; + int64_t expert_j__idx_v0 = expert_j__idx_v0_conv.val; + + // Unpack scalar: card_i__idx_v0 + union { uint64_t u64; int64_t val; } card_i__idx_v0_conv; + card_i__idx_v0_conv.u64 = args[2]; + int64_t card_i__idx_v0 = card_i__idx_v0_conv.val; + + // Unpack scalar: t_idx__idx_v0 + union { uint64_t u64; int64_t val; } t_idx__idx_v0_conv; + t_idx__idx_v0_conv.u64 = args[3]; + int64_t t_idx__idx_v0 = t_idx__idx_v0_conv.val; + + // Forward to ptoas-generated function + moe_demo_incore_1(recv__iter_v12, expert_j__idx_v0, card_i__idx_v0, t_idx__idx_v0); +} diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_2.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_2.cpp new file mode 100644 index 000000000..1074f3499 --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_2.cpp @@ -0,0 +1,156 @@ +// Kernel Function: moe_demo_incore_2 +// Generated by PyPTO IR Compiler (PTO backend) + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + + +using namespace pto; + + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail( + PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void moe_demo_incore_2(__gm__ bfloat16_t* v1, __gm__ bfloat16_t* v2, int32_t v3, int32_t v4) { + RoundMode v5 = RoundMode::CAST_ROUND; + unsigned v6 = 0; + const int32_t v7 = 256; + const int32_t v8 = 64; + const int32_t v9 = 0; + const float v10 = 0.0f; + const int32_t v11 = 1; + const int32_t v12 = 16; + const int32_t v13 = 4; + const int64_t v14 = 96; + const int64_t v15 = 64; + const int64_t v16 = 0; + using T = float; + + #if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + Tile v17 = Tile(v11, v12); + TASSIGN(v17, v16); + Tile v18 = Tile(v11, v12); + __ubuf__ float* v19 = v17.data(); + uint64_t v20 = reinterpret_cast(v19); + TASSIGN(v18, v20); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TEXPANDS(v18, v10); + for (size_t v21 = (size_t) v9; v21 < ((size_t) v13); v21 += (size_t) v11) { + Tile v22 = Tile(v11, v12); + TASSIGN(v22, v15); + Tile v23 = Tile(v11, v12); + __ubuf__ bfloat16_t* v24 = v22.data(); + uint64_t v25 = reinterpret_cast(v24); + TASSIGN(v23, v25); + pto::Shape<1, 1, 1, 1, 
16> v26 = pto::Shape<1, 1, 1, 1, 16>(); + pto::Stride<256, 256, 64, 16, 1> v27 = pto::Stride<256, 256, 64, 16, 1>(); + GlobalTensor, pto::Stride<256, 256, 64, 16, 1>, pto::Layout::ND> v28 = GlobalTensor, pto::Stride<256, 256, 64, 16, 1>, pto::Layout::ND>(v1 + ((v6 + (unsigned) ((int32_t) v21) * (unsigned) v7 + (unsigned) v3 * (unsigned) v8) + (unsigned) v4 * (unsigned) v12 + v6 * (unsigned) v11), v26, v27); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v23, v28); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + Tile v29 = Tile(v11, v12); + TASSIGN(v29, v14); + Tile v30 = Tile(v11, v12); + __ubuf__ float* v31 = v29.data(); + uint64_t v32 = reinterpret_cast(v31); + TASSIGN(v30, v32); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_V); + TCVT(v30, v23, v5); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + Tile v33 = Tile(v11, v12); + TASSIGN(v33, v16); + Tile v34 = Tile(v11, v12); + __ubuf__ float* v35 = v33.data(); + uint64_t v36 = reinterpret_cast(v35); + TASSIGN(v34, v36); + pipe_barrier(PIPE_V); + TADD(v34, v18, v30); + } + Tile v37 = Tile(v11, v12); + TASSIGN(v37, v15); + Tile v38 = Tile(v11, v12); + __ubuf__ bfloat16_t* v39 = v37.data(); + uint64_t v40 = reinterpret_cast(v39); + TASSIGN(v38, v40); + pipe_barrier(PIPE_V); + TCVT(v38, v18, v5); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + Tile v41 = Tile(v11, v12); + TASSIGN(v41, v15); + Tile v42 = Tile(v11, v12); + __ubuf__ bfloat16_t* v43 = v41.data(); + uint64_t v44 = reinterpret_cast(v43); + TASSIGN(v42, v44); + pto::Shape<1, 1, 1, 1, 16> v45 = pto::Shape<1, 1, 1, 1, 16>(); + pto::Stride<64, 64, 64, 16, 1> v46 = pto::Stride<64, 64, 64, 16, 1>(); + GlobalTensor, pto::Stride<64, 64, 64, 16, 1>, pto::Layout::ND> v47 = GlobalTensor, pto::Stride<64, 64, 64, 16, 1>, pto::Layout::ND>(v2 + ((v6 + (unsigned) v3 * (unsigned) v8) + (unsigned) v4 * (unsigned) v12 + v6 * (unsigned) v11), v45, v46); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v47, v42); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + #endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t* args) +{ + // Unpack tensor: recv__rv_v9 + __gm__ Tensor* recv__rv_v9_tensor = reinterpret_cast<__gm__ Tensor*>(args[0]); + __gm__ bfloat16_t* recv__rv_v9 = reinterpret_cast<__gm__ bfloat16_t*>(recv__rv_v9_tensor->buffer.addr) + recv__rv_v9_tensor->start_offset; + + // Unpack tensor: output__iter_v3 + __gm__ Tensor* output__iter_v3_tensor = reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ bfloat16_t* output__iter_v3 = reinterpret_cast<__gm__ bfloat16_t*>(output__iter_v3_tensor->buffer.addr) + output__iter_v3_tensor->start_offset; + + // Unpack scalar: card_i__idx_v0 + union { uint64_t u64; int64_t val; } card_i__idx_v0_conv; + card_i__idx_v0_conv.u64 = args[2]; + int64_t card_i__idx_v0 = card_i__idx_v0_conv.val; + + // Unpack scalar: t_idx__idx_v0 + union { uint64_t u64; int64_t val; } t_idx__idx_v0_conv; + t_idx__idx_v0_conv.u64 = args[3]; + int64_t t_idx__idx_v0 = t_idx__idx_v0_conv.val; + + // Forward to ptoas-generated function + moe_demo_incore_2(recv__rv_v9, output__iter_v3, card_i__idx_v0, t_idx__idx_v0); +} diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_dispatch_alltoall.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_dispatch_alltoall.cpp new file mode 100644 index 000000000..4bb94d634 --- /dev/null +++ 
b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_dispatch_alltoall.cpp @@ -0,0 +1,209 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * MoE Dispatch All-to-All Kernel + * + * This kernel implements the dispatch phase of distributed MoE: + * Each card i sends send[i][expert_i] to all other cards, and receives + * send[j][expert_i] from card j. + * + * Data flow: + * Phase 1 (stage-in): send[expert_i][:][:] → my scratch slot + * Phase 2 (barrier): signal matrix + TWAIT cross-rank sync + * Phase 3 (gather): for card_j in num_cards: TLOAD(card_j_scratch), TSTORE(recv[card_j][:][:]) + * + * args layout: + * tensor(0) = send_local [num_experts][num_tokens][hidden_dim] + * tensor(1) = recv_local [num_cards][num_tokens][hidden_dim] + * tensor(2) = scratch HCCL window buffer + * scalar(0) = expert_id which expert this card processes + * scalar(1) = num_cards total number of cards + * scalar(2) = CommContext device pointer for cross-card communication + */ + +#include +#include +#include "pto/comm/comm_types.hpp" +#include "pto/comm/pto_comm_inst.hpp" +#include "platform_comm/comm_context.h" +#include "tensor.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +// Configuration matching golden.py +static constexpr size_t NUM_TOKENS = 10; +static constexpr size_t HIDDEN_DIM = 16; +static constexpr size_t COUNT = 4; // tokens to process per (card, expert) pair +static constexpr int kMaxSupportedCards = 16; + +template +AICORE inline __gm__ T *CommRemotePtr(__gm__ CommContext *ctx, __gm__ T *localPtr, int pe) { + uint64_t localBase = ctx->windowsIn[ctx->rankId]; + uint64_t offset = (uint64_t)localPtr - localBase; + return (__gm__ T *)(ctx->windowsIn[pe] + offset); +} + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack tensors + __gm__ Tensor *send_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *recv_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *scratch_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + + // Unpack scalars + int64_t expert_id = static_cast(args[3]); + int num_cards = static_cast(args[4]); + __gm__ CommContext *commCtx = reinterpret_cast<__gm__ CommContext *>(args[5]); + + // Get base pointers + __gm__ float *send = reinterpret_cast<__gm__ float *>(send_tensor->buffer.addr) + send_tensor->start_offset; + __gm__ float *recv = reinterpret_cast<__gm__ float *>(recv_tensor->buffer.addr) + recv_tensor->start_offset; + __gm__ float *scratch = reinterpret_cast<__gm__ float *>(scratch_tensor->buffer.addr) + scratch_tensor->start_offset; + + // Signal area at tail of scratch: num_cards int32 slots + // Must be placed AFTER all data slots to avoid corruption + size_t total_data_size = num_cards * num_cards * NUM_TOKENS * 
HIDDEN_DIM; + __gm__ int32_t *signal_base = reinterpret_cast<__gm__ int32_t *>(scratch + total_data_size); + + using ShapeDyn = pto::Shape; + using StrideDyn = pto::Stride; + using Global = pto::GlobalTensor; + + int my_rank = static_cast(commCtx->rankId); + + if (num_cards <= 0 || num_cards > kMaxSupportedCards) { + pipe_barrier(PIPE_ALL); + return; + } + + // ------------------------------------------------------------------ + // Phase 1: stage-in — copy ALL experts' data to my scratch slot + // Each card contributes ALL of its send[:] (all experts) to enable all-to-all + // + // Data layout in scratch: scratch[card_j][expert_i][:][:] + // where card_j = my_rank (the card sending the data) + // expert_i = expert index (0..num_cards-1) + // t = token index (0..COUNT-1) + // + // This allows combine phase to access: + // "expert_i's data from card_j" at scratch[card_j][expert_i] + // ------------------------------------------------------------------ + for (int expert_i = 0; expert_i < num_cards; ++expert_i) { + for (size_t t = 0; t < COUNT; ++t) { + // Load from send[expert_i][t][:HIDDEN_DIM] (ALL experts, not just expert_id) + ShapeDyn send_shape(1, 1, 1, 1, HIDDEN_DIM); + StrideDyn send_stride(NUM_TOKENS * HIDDEN_DIM, NUM_TOKENS * HIDDEN_DIM, + HIDDEN_DIM, HIDDEN_DIM, 1); + Global sendG(send + expert_i * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM, + send_shape, send_stride); + + // Store to scratch[my_rank][expert_i][t][:HIDDEN_DIM] + // Index = my_rank * (num_cards * NUM_TOKENS * HIDDEN_DIM) + // + expert_i * (NUM_TOKENS * HIDDEN_DIM) + // + t * HIDDEN_DIM + size_t scratch_offset = my_rank * num_cards * NUM_TOKENS * HIDDEN_DIM + + expert_i * NUM_TOKENS * HIDDEN_DIM + + t * HIDDEN_DIM; + + ShapeDyn scratch_shape(1, 1, 1, 1, HIDDEN_DIM); + StrideDyn scratch_stride(num_cards * NUM_TOKENS * HIDDEN_DIM, + num_cards * NUM_TOKENS * HIDDEN_DIM, + NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); + Global scratchG(scratch + scratch_offset, + scratch_shape, scratch_stride); + + // Use tile for data movement + using TileType = pto::Tile; + TileType tile(1, HIDDEN_DIM); + TASSIGN(tile, 0); + + TLOAD(tile, sendG); + set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + TSTORE(scratchG, tile); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + } + pipe_barrier(PIPE_ALL); + + // ------------------------------------------------------------------ + // Phase 2: device barrier — each card notifies peers that its + // send[expert_i] data is visible in scratch, then waits for all peers. 
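The stage-in layout and the later gather can likewise be modeled on the host. Below is a minimal sketch with a hypothetical `dispatch_golden` helper, assuming the same 4-card configuration: each card publishes its whole `send` tensor into its own scratch slot, then expert card i pulls slice `[card_j][i]` from every peer j, exactly as the kernel header above describes.

```python
import numpy as np

NUM_CARDS, NUM_TOKENS, HIDDEN_DIM, COUNT = 4, 10, 16, 4

def dispatch_golden(send_per_card):
    """send_per_card[j]: [NUM_CARDS(=experts), NUM_TOKENS, HIDDEN_DIM] held by card j."""
    scratch = np.zeros((NUM_CARDS, NUM_CARDS, NUM_TOKENS, HIDDEN_DIM), dtype=np.float32)
    # Phase 1 (stage-in): card j copies send[expert_i][:COUNT] into scratch[j][expert_i]
    for j in range(NUM_CARDS):
        scratch[j, :, :COUNT] = send_per_card[j][:, :COUNT]
    # Phase 3 (gather): expert card i reads scratch[card_j][i][:COUNT] from every card j
    recv_per_card = np.zeros((NUM_CARDS, NUM_CARDS, NUM_TOKENS, HIDDEN_DIM), dtype=np.float32)
    for i in range(NUM_CARDS):
        for j in range(NUM_CARDS):
            recv_per_card[i, j, :COUNT] = scratch[j, i, :COUNT]
    return recv_per_card
```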
+ // ------------------------------------------------------------------ + for (int peer = 0; peer < num_cards; ++peer) { + if (peer == my_rank) continue; + __gm__ int32_t *remote_signal = CommRemotePtr(commCtx, signal_base + my_rank, peer); + pto::comm::Signal sig(remote_signal); + pto::comm::TNOTIFY(sig, (int32_t)1, pto::comm::NotifyOp::AtomicAdd); + } + for (int peer = 0; peer < num_cards; ++peer) { + if (peer == my_rank) continue; + pto::comm::Signal sig(signal_base + peer); + pto::comm::TWAIT(sig, (int32_t)1, pto::comm::WaitCmp::GE); + } + pipe_barrier(PIPE_ALL); + + // ------------------------------------------------------------------ + // Phase 3: gather — read send[j][expert_id] from each card j's scratch + // and store to recv[card_j][:COUNT][:HIDDEN_DIM] + // + // For expert_id on this card, gather data from ALL cards: + // recv[card_j][:][:] = scratch[card_j][expert_id][:][:] + // ------------------------------------------------------------------ + for (int card_j = 0; card_j < num_cards; ++card_j) { + for (size_t t = 0; t < COUNT; ++t) { + // Source: scratch[card_j][expert_id][t][:HIDDEN_DIM] + // Offset = card_j * (num_cards * NUM_TOKENS * HIDDEN_DIM) + // + expert_id * (NUM_TOKENS * HIDDEN_DIM) + // + t * HIDDEN_DIM + __gm__ float *src_base = (card_j == my_rank) ? scratch : + CommRemotePtr(commCtx, scratch, card_j); + size_t src_offset = card_j * num_cards * NUM_TOKENS * HIDDEN_DIM + + expert_id * NUM_TOKENS * HIDDEN_DIM + + t * HIDDEN_DIM; + + ShapeDyn src_shape(1, 1, 1, 1, HIDDEN_DIM); + StrideDyn src_stride(num_cards * NUM_TOKENS * HIDDEN_DIM, + num_cards * NUM_TOKENS * HIDDEN_DIM, + NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); + Global srcG(src_base + src_offset, + src_shape, src_stride); + + // Destination: recv[card_j][t][:HIDDEN_DIM] + ShapeDyn dst_shape(1, 1, 1, 1, HIDDEN_DIM); + StrideDyn dst_stride(NUM_TOKENS * HIDDEN_DIM, NUM_TOKENS * HIDDEN_DIM, + HIDDEN_DIM, HIDDEN_DIM, 1); + Global dstG(recv + card_j * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM, + dst_shape, dst_stride); + + using TileType = pto::Tile; + TileType tile(1, HIDDEN_DIM); + TASSIGN(tile, 0); + + TLOAD(tile, srcG); + set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + TSTORE(dstG, tile); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + } + + pipe_barrier(PIPE_ALL); +} diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_simple_compute.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_simple_compute.cpp new file mode 100644 index 000000000..1df151670 --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_simple_compute.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * Simple Compute Kernel for MoE + * + * Adds 1.0 to all elements in recv[:][:4][:] + * + * args layout: + * tensor(0) = recv [num_cards][NUM_TOKENS][HIDDEN_DIM] + * scalar(0) = unused (for compatibility) + * scalar(1) = unused (for compatibility) + * scalar(2) = unused (for compatibility) + */ + +#include +#include +#include "tensor.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +static constexpr size_t NUM_TOKENS = 10; +static constexpr size_t HIDDEN_DIM = 16; +static constexpr size_t COUNT = 4; +static constexpr int kMaxSupportedCards = 16; + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *recv_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ float *recv = reinterpret_cast<__gm__ float *>(recv_tensor->buffer.addr) + recv_tensor->start_offset; + + // Add 1.0 to first COUNT tokens for all cards + // recv layout: [num_cards][NUM_TOKENS][HIDDEN_DIM] + for (int card = 0; card < kMaxSupportedCards; ++card) { + for (size_t t = 0; t < COUNT; ++t) { + for (size_t d = 0; d < HIDDEN_DIM; ++d) { + size_t offset = card * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM + d; + recv[offset] += 1.0f; + } + } + } + + pipe_barrier(PIPE_ALL); +} diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/kernel_config.py b/examples/workers/l3/moe_multi_chip_experts/kernels/kernel_config.py new file mode 100644 index 000000000..715728571 --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/kernels/kernel_config.py @@ -0,0 +1,24 @@ +# Kernel and Orchestration Configuration + +from pathlib import Path + +_ROOT_DIR = Path(__file__).parent.parent + +# Runtime configuration for tensormap_and_ringbuffer +# This runtime requires 4 AICPU threads (3 schedulers + 1 orchestrator on thread 3) +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "block_dim": 24, +} + +ORCHESTRATION = { + "source": str(_ROOT_DIR / "kernels" / "orchestration" / "moe_multi_chip_orch.cpp"), + "function_name": "aicpu_orchestration_entry" +} + +KERNELS = [ + {"func_id": 0, "name": "moe_demo_incore_0", "source": str(_ROOT_DIR / "kernels" / "aiv" / "moe_demo_incore_0.cpp"), "core_type": "aiv"}, + {"func_id": 1, "name": "moe_demo_incore_1", "source": str(_ROOT_DIR / "kernels" / "aiv" / "moe_demo_incore_1.cpp"), "core_type": "aiv"}, + {"func_id": 2, "name": "moe_demo_incore_2", "source": str(_ROOT_DIR / "kernels" / "aiv" / "moe_demo_incore_2.cpp"), "core_type": "aiv"}, +] diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_combine_only_orch.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_combine_only_orch.cpp new file mode 100644 index 000000000..70cd56b11 --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_combine_only_orch.cpp @@ -0,0 +1,69 @@ +// Orchestration Function: Combine Only (for debugging) +// +// This orchestration ONLY runs the combine phase to verify it works correctly. 
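For the host-side checks, the simple compute kernel above amounts to a one-line NumPy update on a copy of `recv` (a sketch, assuming float32 data and the 10-token / 16-dim layout from the configuration above):

```python
import numpy as np

NUM_CARDS, NUM_TOKENS, HIDDEN_DIM, COUNT = 4, 10, 16, 4

recv = np.ones((NUM_CARDS, NUM_TOKENS, HIDDEN_DIM), dtype=np.float32)  # post-dispatch state
recv[:, :COUNT, :] += 1.0  # what moe_simple_compute does: +1.0 on the first COUNT tokens
```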
+ +#include "runtime.h" +#include + +#include +#include +#include + +#include "pto_orchestration_api.h" + +// Must match golden.py and kernel configurations +static constexpr int64_t COUNT = 4; // Number of tokens to process per (card, expert) pair +static constexpr int64_t NUM_TOKENS = 10; // Total number of tokens +static constexpr int64_t HIDDEN_DIM = 16; // Hidden dimension + +extern "C" { + +__attribute__((visibility("default"))) +PTO2OrchestrationConfig aicpu_orchestration_config(const ChipStorageTaskArgs& orch_args) { + return PTO2OrchestrationConfig{ + .expected_arg_count = 7, // recv, output, scratch, scratch_print, card_id, num_cards, commCtx + }; +} + +__attribute__((visibility("default"))) +void aicpu_orchestration_entry(const ChipStorageTaskArgs& orch_args) { + // External tensors + Tensor ext_recv = from_tensor_arg(orch_args.tensor(0)); // [num_cards][tokens][hidden] + Tensor ext_output = from_tensor_arg(orch_args.tensor(1)); // [num_cards][count][hidden] + Tensor ext_scratch = from_tensor_arg(orch_args.tensor(2)); // HCCL scratch buffer + Tensor ext_scratch_print = from_tensor_arg(orch_args.tensor(3)); // Scratch print buffer + + // Scalar arguments + int64_t card_id = static_cast(orch_args.scalar(0)); // Which card this is + int64_t num_cards = static_cast(orch_args.scalar(1)); // Total number of cards + uint64_t comm_ctx_ptr = static_cast(orch_args.scalar(2)); // CommContext* + + printf("[Combine-Only Orch] card_id=%ld num_cards=%ld\n", + card_id, num_cards); + fflush(stdout); + + PTO2_SCOPE() { + // === ONLY Combine Phase === + printf("[Combine-Only Orch] Submitting combine task for card_id=%ld\n", + card_id); + fflush(stdout); + + Arg params_combine; + params_combine.add_input(ext_recv); + params_combine.add_output(ext_output); + params_combine.add_inout(ext_scratch); + params_combine.add_output(ext_scratch_print); + params_combine.add_scalar(card_id); + params_combine.add_scalar(num_cards); + params_combine.add_scalar(comm_ctx_ptr); + pto2_rt_submit_aiv_task(0, params_combine); // moe_combine_alltoall + + printf("[Combine-Only Orch] Combine task submitted for card_id=%ld\n", card_id); + fflush(stdout); + } + + printf("[Combine-Only Orch] card_id=%ld completed\n", card_id); + fflush(stdout); +} + +} // extern "C" diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_comm_orch.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_comm_orch.cpp new file mode 100644 index 000000000..8de7bc71f --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_comm_orch.cpp @@ -0,0 +1,123 @@ +// Orchestration Function: MoE with Inter-Chip Communication +// +// This orchestration implements the three-stage distributed MoE pattern: +// Stage 1: Dispatch all-to-all - each card sends its expert data to expert owner +// Stage 2: Compute - each expert processes its received data +// Stage 3: Combine all-to-all - results are sent back to source cards +// +// Data flow matches golden.py: +// send[card_j][expert_i][:][:] → recv[expert_i][card_j][:][:] (dispatch) +// recv[expert_i][card_j][:][:] += expert_i (compute) +// recv[expert_i][card_j][:][:] → output[card_j][:][:] (combine) + +#include "runtime.h" +#include + +#include +#include +#include + +#include "pto_orchestration_api.h" + +// Must match golden.py and kernel configurations +static constexpr int64_t COUNT = 4; // Number of tokens to process per (card, expert) pair +static constexpr int64_t NUM_TOKENS = 10; // Total number of tokens +static constexpr 
int64_t HIDDEN_DIM = 16; // Hidden dimension + +extern "C" { + +__attribute__((visibility("default"))) +PTO2OrchestrationConfig aicpu_orchestration_config(const ChipStorageTaskArgs& orch_args) { + return PTO2OrchestrationConfig{ + .expected_arg_count = 4, // send, recv, output, scratch + }; +} + +__attribute__((visibility("default"))) +void aicpu_orchestration_entry(const ChipStorageTaskArgs& orch_args) { + // External tensors + Tensor ext_send = from_tensor_arg(orch_args.tensor(0)); // [num_experts][tokens][hidden] + Tensor ext_recv = from_tensor_arg(orch_args.tensor(1)); // [num_cards][tokens][hidden] + Tensor ext_output = from_tensor_arg(orch_args.tensor(2)); // [tokens][hidden] + Tensor ext_scratch = from_tensor_arg(orch_args.tensor(3)); // HCCL scratch buffer + + // Scalar arguments + int64_t expert_id = static_cast(orch_args.scalar(0)); // Which expert this card processes + int64_t card_id = static_cast(orch_args.scalar(1)); // Which card this is + int64_t num_cards = static_cast(orch_args.scalar(2)); // Total number of cards + uint64_t comm_ctx_ptr = static_cast(orch_args.scalar(3)); // CommContext* + + printf("[MoE Orch] orchestration_entry: card_id=%ld expert_id=%ld num_cards=%ld comm_ctx=0x%lx\n", + card_id, expert_id, num_cards, comm_ctx_ptr); + fflush(stdout); + + PTO2_SCOPE() { + // === 阶段 1: Dispatch All-to-All === + // Each card i sends send[i][expert_i][:][:] to all cards + // and receives send[j][expert_i][:][:] from card j + // Result: recv[i][card_j][:][:] = send[card_j][expert_i][:][:] + { + printf("[MoE Orch] Stage 1: Dispatch - card_id=%ld submitting dispatch task\n", card_id); + fflush(stdout); + Arg params_dispatch; + params_dispatch.add_input(ext_send); + params_dispatch.add_output(ext_recv); + params_dispatch.add_inout(ext_scratch); + params_dispatch.add_scalar(expert_id); + params_dispatch.add_scalar(num_cards); + params_dispatch.add_scalar(comm_ctx_ptr); + pto2_rt_submit_aiv_task(0, params_dispatch); // moe_dispatch_alltoall + printf("[MoE Orch] Stage 1: Dispatch - card_id=%ld dispatch task submitted\n", card_id); + fflush(stdout); + } + + printf("[MoE Orch] ===== After Dispatch (card_id=%ld, expert_id=%ld) =====\n", card_id, expert_id); + fflush(stdout); + + // === 阶段 2: Compute (本地) === + // Add 1.0 to all elements in recv[:][:4][:] + { + printf("[MoE Orch] Stage 2: Compute - card_id=%ld\n", card_id); + fflush(stdout); + + Arg params_compute; + params_compute.add_inout(ext_recv); + params_compute.add_scalar(0); // unused + params_compute.add_scalar(0); // unused + params_compute.add_scalar(0); // unused + pto2_rt_submit_aiv_task(1, params_compute); // moe_simple_compute + + printf("[MoE Orch] Stage 2: Compute - card_id=%ld compute task submitted\n", card_id); + fflush(stdout); + } + + printf("[MoE Orch] ===== After Compute (card_id=%ld, expert_id=%ld) =====\n", card_id, expert_id); + fflush(stdout); + + // === 阶段 3: Combine All-to-All === + // Each card i sends recv[i][card_j][:][:] to card j + // Card j accumulates all received data to output[j][:][:] + { + printf("[MoE Orch] Stage 3: Combine - card_id=%ld submitting combine task\n", card_id); + fflush(stdout); + Arg params_combine; + params_combine.add_input(ext_recv); + params_combine.add_output(ext_output); + params_combine.add_inout(ext_scratch); + params_combine.add_scalar(card_id); + params_combine.add_scalar(num_cards); + params_combine.add_scalar(comm_ctx_ptr); + pto2_rt_submit_aiv_task(2, params_combine); // moe_combine_alltoall + printf("[MoE Orch] Stage 3: Combine - card_id=%ld combine task 
submitted\n", card_id); + fflush(stdout); + } + + printf("[MoE Orch] ===== After Combine (card_id=%ld) =====\n", card_id); + fflush(stdout); + } + + printf("[MoE Orch] orchestration_entry: card_id=%ld completed\n", card_id); + fflush(stdout); +} + +} // extern "C" diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_dispatch_compute_orch.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_dispatch_compute_orch.cpp new file mode 100644 index 000000000..5d365fae4 --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_dispatch_compute_orch.cpp @@ -0,0 +1,88 @@ +// Orchestration Function: Dispatch + Compute (for debugging) +// +// This orchestration runs dispatch phase followed by compute phase. + +#include "runtime.h" +#include + +#include +#include +#include + +#include "pto_orchestration_api.h" + +// Must match golden.py and kernel configurations +static constexpr int64_t COUNT = 4; // Number of tokens to process per (card, expert) pair +static constexpr int64_t NUM_TOKENS = 10; // Total number of tokens +static constexpr int64_t HIDDEN_DIM = 16; // Hidden dimension + +extern "C" { + +__attribute__((visibility("default"))) +PTO2OrchestrationConfig aicpu_orchestration_config(const ChipStorageTaskArgs& orch_args) { + return PTO2OrchestrationConfig{ + .expected_arg_count = 4, // send, recv, output, scratch (output unused) + }; +} + +__attribute__((visibility("default"))) +void aicpu_orchestration_entry(const ChipStorageTaskArgs& orch_args) { + // External tensors + Tensor ext_send = from_tensor_arg(orch_args.tensor(0)); // [num_experts][tokens][hidden] + Tensor ext_recv = from_tensor_arg(orch_args.tensor(1)); // [num_cards][tokens][hidden] + Tensor ext_output = from_tensor_arg(orch_args.tensor(2)); // [tokens][hidden] (unused) + Tensor ext_scratch = from_tensor_arg(orch_args.tensor(3)); // HCCL scratch buffer + + // Scalar arguments + int64_t expert_id = static_cast(orch_args.scalar(0)); // Which expert this card processes + int64_t card_id = static_cast(orch_args.scalar(1)); // Which card this is + int64_t num_cards = static_cast(orch_args.scalar(2)); // Total number of cards + uint64_t comm_ctx_ptr = static_cast(orch_args.scalar(3)); // CommContext* + + printf("[Dispatch+Compute Orch] card_id=%ld expert_id=%ld num_cards=%ld\n", + card_id, expert_id, num_cards); + fflush(stdout); + + PTO2_SCOPE() { + // === Phase 1: Dispatch === + printf("[Dispatch+Compute Orch] Stage 1: Dispatch - card_id=%ld\n", card_id); + fflush(stdout); + + Arg params_dispatch; + params_dispatch.add_input(ext_send); + params_dispatch.add_output(ext_recv); + params_dispatch.add_inout(ext_scratch); + params_dispatch.add_scalar(expert_id); + params_dispatch.add_scalar(num_cards); + params_dispatch.add_scalar(comm_ctx_ptr); + pto2_rt_submit_aiv_task(0, params_dispatch); // moe_dispatch_alltoall + + printf("[Dispatch+Compute Orch] Dispatch submitted for card_id=%ld\n", card_id); + fflush(stdout); + + // === Phase 2: Compute === + printf("[Dispatch+Compute Orch] Stage 2: Compute - card_id=%ld processing %d cards x %d tokens\n", + card_id, num_cards, COUNT); + fflush(stdout); + + // === Phase 2: Compute === + // Add 1.0 to all elements in recv[:][:4][:] + printf("[Dispatch+Compute Orch] Stage 2: Compute - card_id=%ld\n", card_id); + fflush(stdout); + + Arg params_compute; + params_compute.add_inout(ext_recv); + params_compute.add_scalar(0); // unused + params_compute.add_scalar(0); // unused + params_compute.add_scalar(0); // unused + 
pto2_rt_submit_aiv_task(1, params_compute); // moe_simple_compute + + printf("[Dispatch+Compute Orch] Compute submitted for card_id=%ld\n", card_id); + fflush(stdout); + } + + printf("[Dispatch+Compute Orch] card_id=%ld completed\n", card_id); + fflush(stdout); +} + +} // extern "C" diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_dispatch_only_orch.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_dispatch_only_orch.cpp new file mode 100644 index 000000000..9751e2d4b --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_dispatch_only_orch.cpp @@ -0,0 +1,69 @@ +// Orchestration Function: Dispatch Only (for debugging) +// +// This orchestration ONLY runs the dispatch phase to verify it works correctly. + +#include "runtime.h" +#include + +#include +#include +#include + +#include "pto_orchestration_api.h" + +// Must match golden.py and kernel configurations +static constexpr int64_t COUNT = 4; // Number of tokens to process per (card, expert) pair +static constexpr int64_t NUM_TOKENS = 10; // Total number of tokens +static constexpr int64_t HIDDEN_DIM = 16; // Hidden dimension + +extern "C" { + +__attribute__((visibility("default"))) +PTO2OrchestrationConfig aicpu_orchestration_config(const ChipStorageTaskArgs& orch_args) { + return PTO2OrchestrationConfig{ + .expected_arg_count = 4, // send, recv, output, scratch (output unused in dispatch-only) + }; +} + +__attribute__((visibility("default"))) +void aicpu_orchestration_entry(const ChipStorageTaskArgs& orch_args) { + // External tensors + Tensor ext_send = from_tensor_arg(orch_args.tensor(0)); // [num_experts][tokens][hidden] + Tensor ext_recv = from_tensor_arg(orch_args.tensor(1)); // [num_cards][tokens][hidden] + Tensor ext_output = from_tensor_arg(orch_args.tensor(2)); // [tokens][hidden] (unused) + Tensor ext_scratch = from_tensor_arg(orch_args.tensor(3)); // HCCL scratch buffer + + // Scalar arguments + int64_t expert_id = static_cast(orch_args.scalar(0)); // Which expert this card processes + int64_t card_id = static_cast(orch_args.scalar(1)); // Which card this is + int64_t num_cards = static_cast(orch_args.scalar(2)); // Total number of cards + uint64_t comm_ctx_ptr = static_cast(orch_args.scalar(3)); // CommContext* + + printf("[Dispatch-Only Orch] card_id=%ld expert_id=%ld num_cards=%ld\n", + card_id, expert_id, num_cards); + fflush(stdout); + + PTO2_SCOPE() { + // === ONLY Dispatch Phase === + printf("[Dispatch-Only Orch] Submitting dispatch task for card_id=%ld expert_id=%ld\n", + card_id, expert_id); + fflush(stdout); + + Arg params_dispatch; + params_dispatch.add_input(ext_send); + params_dispatch.add_output(ext_recv); + params_dispatch.add_inout(ext_scratch); + params_dispatch.add_scalar(expert_id); + params_dispatch.add_scalar(num_cards); + params_dispatch.add_scalar(comm_ctx_ptr); + pto2_rt_submit_aiv_task(0, params_dispatch); // moe_dispatch_alltoall + + printf("[Dispatch-Only Orch] Dispatch task submitted for card_id=%ld\n", card_id); + fflush(stdout); + } + + printf("[Dispatch-Only Orch] card_id=%ld completed\n", card_id); + fflush(stdout); +} + +} // extern "C" diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_end2end_orch.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_end2end_orch.cpp new file mode 100644 index 000000000..c3fc7accc --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_end2end_orch.cpp @@ -0,0 +1,110 @@ +// 
Orchestration Function: End-to-End MoE Pipeline +// +// This orchestration runs the complete MoE pipeline: +// 1. Dispatch: distribute tokens to expert cards +// 2. Compute: process tokens on each expert card +// 3. Combine: gather results back to source cards +// +// Uses independent scratch buffers for combine phase to avoid data corruption + +#include "runtime.h" +#include + +#include +#include +#include + +#include "pto_orchestration_api.h" + +// Must match golden.py and kernel configurations +static constexpr int64_t COUNT = 4; // Number of tokens to process per (card, expert) pair +static constexpr int64_t NUM_TOKENS = 10; // Total number of tokens +static constexpr int64_t HIDDEN_DIM = 16; // Hidden dimension + +extern "C" { + +__attribute__((visibility("default"))) +PTO2OrchestrationConfig aicpu_orchestration_config(const ChipStorageTaskArgs& orch_args) { + return PTO2OrchestrationConfig{ + .expected_arg_count = 10, // send, recv, output, scratch, scratch_test, scratch_print, expert_id, card_id, num_cards, commCtx + }; +} + +__attribute__((visibility("default"))) +void aicpu_orchestration_entry(const ChipStorageTaskArgs& orch_args) { + // External tensors + Tensor ext_send = from_tensor_arg(orch_args.tensor(0)); // [num_experts][tokens][hidden] + Tensor ext_recv = from_tensor_arg(orch_args.tensor(1)); // [num_cards][tokens][hidden] + Tensor ext_output = from_tensor_arg(orch_args.tensor(2)); // [num_cards][count][hidden] + Tensor ext_scratch = from_tensor_arg(orch_args.tensor(3)); // HCCL scratch buffer for dispatch+compute + Tensor ext_scratch_test = from_tensor_arg(orch_args.tensor(4)); // HCCL scratch buffer for combine phase + Tensor ext_scratch_print = from_tensor_arg(orch_args.tensor(5)); // Scratch print buffer + + // Scalar arguments + int64_t expert_id = static_cast(orch_args.scalar(0)); // Which expert this card processes + int64_t card_id = static_cast(orch_args.scalar(1)); // Which card this is + int64_t num_cards = static_cast(orch_args.scalar(2)); // Total number of cards + uint64_t comm_ctx_ptr = static_cast(orch_args.scalar(3)); // CommContext* + + printf("[End2End Orch] card_id=%ld expert_id=%ld num_cards=%ld\n", + card_id, expert_id, num_cards); + fflush(stdout); + + PTO2_SCOPE() { + // ========== PART 1: Full Pipeline ========== + printf("[End2End Orch] Part 1: Full Pipeline (Dispatch + Compute + Combine) - card_id=%ld\n", card_id); + fflush(stdout); + + // === Phase 1: Dispatch === + printf("[End2End Orch] Phase 1: Dispatch - card_id=%ld\n", card_id); + fflush(stdout); + + Arg params_dispatch; + params_dispatch.add_input(ext_send); + params_dispatch.add_output(ext_recv); + params_dispatch.add_inout(ext_scratch); + params_dispatch.add_scalar(expert_id); + params_dispatch.add_scalar(num_cards); + params_dispatch.add_scalar(comm_ctx_ptr); + pto2_rt_submit_aiv_task(0, params_dispatch); // moe_dispatch_alltoall + + printf("[End2End Orch] Dispatch submitted\n", card_id); + fflush(stdout); + + // === Phase 2: Compute === + printf("[End2End Orch] Phase 2: Compute - card_id=%ld\n", card_id); + fflush(stdout); + + Arg params_compute; + params_compute.add_inout(ext_recv); + params_compute.add_scalar(0); // unused + params_compute.add_scalar(0); // unused + params_compute.add_scalar(0); // unused + pto2_rt_submit_aiv_task(1, params_compute); // moe_simple_compute + + printf("[End2End Orch] Compute submitted\n", card_id); + fflush(stdout); + + // === Phase 3: Combine (Full Pipeline) === + printf("[End2End Orch] Phase 3: Combine (full pipeline) - card_id=%ld\n", card_id); + 
fflush(stdout); + + Arg params_combine; + params_combine.add_input(ext_recv); + params_combine.add_output(ext_output); + params_combine.add_inout(ext_scratch_test); // Use independent scratch_test buffer for combine + params_combine.add_output(ext_scratch_print); + params_combine.add_scalar(card_id); + params_combine.add_scalar(num_cards); + params_combine.add_scalar(comm_ctx_ptr); + pto2_rt_submit_aiv_task(2, params_combine); // moe_combine_alltoall + + printf("[End2End Orch] Combine (full pipeline) submitted\n", card_id); + fflush(stdout); + } + + printf("[End2End Orch] card_id=%ld completed\n", card_id); + fflush(stdout); +} + +} // extern "C" diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_multi_chip_orch.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_multi_chip_orch.cpp new file mode 100644 index 000000000..eaecbd87e --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_multi_chip_orch.cpp @@ -0,0 +1,88 @@ +// Orchestration Function: moe_demo (Multi-Chip Version) +// +// Multi-chip MoE orchestration - implements "one expert per chip" parallelism. +// +// Architecture comparison: +// - Single-chip version: One chip runs ALL experts sequentially +// (orchestration loops: card_i=0..3, expert_j=0..3, t_idx=0..3) +// - Multi-chip version: Each chip runs ONE expert in parallel +// (orchestration: card_i passed as arg, expert_j passed as arg, t_idx=0..3) +// +// Key insight: Both versions produce IDENTICAL results because the kernels +// perform the same computation - only the execution distribution differs. +// +// Expected arguments: +// - 3 tensors: send (INPUT), recv (OUTPUT_EXISTING), output (OUTPUT_EXISTING) +// - 2 scalars: expert_id (which expert), chip_id (logical card_i for data layout) + +#include "runtime.h" +#include + +#include +#include +#include + +#include "pto_orchestration_api.h" + +extern "C" { + +__attribute__((visibility("default"))) +PTO2OrchestrationConfig aicpu_orchestration_config(const ChipStorageTaskArgs& orch_args) { + // Expected: 3 tensors + 2 scalars (expert_id, chip_id) + return PTO2OrchestrationConfig{ + .expected_arg_count = 3, + }; +} + +__attribute__((visibility("default"))) +void aicpu_orchestration_entry(const ChipStorageTaskArgs& orch_args) { + // External tensors + Tensor ext_send = from_tensor_arg(orch_args.tensor(0)); + Tensor ext_recv = from_tensor_arg(orch_args.tensor(1)); + Tensor ext_output = from_tensor_arg(orch_args.tensor(2)); + + // Read expert ID and chip ID from scalar arguments (passed by Python) + int64_t expert_j = static_cast(orch_args.scalar(0)); + int64_t card_i = static_cast(orch_args.scalar(1)); + + PTO2_SCOPE() { + // Stage 0: Dispatch (send → recv) + for (int64_t t_idx = 0; t_idx < 4; t_idx += 1) { + PTO2_SCOPE() { + Arg params_t0; + params_t0.add_input(ext_send); + params_t0.add_output(ext_recv); + params_t0.add_scalar(card_i); + params_t0.add_scalar(expert_j); + params_t0.add_scalar(t_idx); + pto2_rt_submit_aiv_task(0, params_t0); + } + } + + // Stage 1: Compute (expert transformation on recv) + for (int64_t t_idx = 0; t_idx < 4; t_idx += 1) { + PTO2_SCOPE() { + Arg params_t1; + params_t1.add_inout(ext_recv); + params_t1.add_scalar(expert_j); + params_t1.add_scalar(card_i); + params_t1.add_scalar(t_idx); + pto2_rt_submit_aiv_task(1, params_t1); + } + } + + // Stage 2: Combine (recv → output) + for (int64_t t_idx = 0; t_idx < 4; t_idx += 1) { + PTO2_SCOPE() { + Arg params_t2; + params_t2.add_input(ext_recv); + 
params_t2.add_output(ext_output); + params_t2.add_scalar(card_i); + params_t2.add_scalar(t_idx); + pto2_rt_submit_aiv_task(2, params_t2); + } + } + } +} + +} // extern "C" diff --git a/examples/workers/l3/moe_multi_chip_experts/main.py b/examples/workers/l3/moe_multi_chip_experts/main.py new file mode 100644 index 000000000..c1b31f364 --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/main.py @@ -0,0 +1,417 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""L3 Worker API demo — multi-chip MoE with true inter-chip communication. + +This implements a distributed MoE (Mixture of Experts) pattern with real inter-chip communication: + - Each card has send[num_experts][num_tokens][hidden_dim] - 3D tensor + - Dispatch: card i sends send[i][expert_j] to card j (expert owner) + - Compute: card j computes recv[expert_j][card_i] += expert_j + - Combine: card j sends recv[expert_j][card_i] back to card i + - Result: output matches golden.py exactly + +Data flow: + Initial: send[card_i][expert_j][tokens][hidden] (per-card 3D tensor) + Dispatch: recv[card_j][card_i][tokens][hidden] (all-to-all transpose) + Compute: recv[card_j][card_i][tokens][hidden] += card_j (expert_id) + Combine: output[card_i][tokens][hidden] = sum_j recv[card_j][card_i][tokens][hidden] + +Run: + python examples/workers/l3/moe_multi_chip_experts/main.py -p a2a3sim -d 0-1 +""" + +import argparse +import os +import sys + +os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE") + +import torch +from simpler.task_interface import ( + ArgDirection, + CallConfig, + ChipBootstrapConfig, + ChipBufferSpec, + ChipCallable, + ChipCommBootstrapConfig, + CoreCallable, + DataType, + TaskArgs, + TensorArgType, +) +from simpler.worker import Worker + +from simpler_setup.kernel_compiler import KernelCompiler +from simpler_setup.pto_isa import ensure_pto_isa_root +from simpler_setup.torch_interop import make_tensor_arg + +HERE = os.path.dirname(os.path.abspath(__file__)) + +# MoE configuration - matching golden.py exactly +NUM_TOKENS = 10 # Number of tokens +HIDDEN_DIM = 16 # Hidden dimension +COUNT = 4 # Number of tokens to process per (card, expert) pair + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("-p", "--platform", required=True, choices=["a2a3sim", "a2a3", "a5sim", "a5"]) + parser.add_argument("-d", "--device", default="0-1", help="Device range, e.g. 
'0-1' or '0,1'") + return parser.parse_args() + + +def parse_device_range(spec: str) -> list[int]: + """Parse device range specification like '0-1' or '0,1' into a list of IDs.""" + if "-" in spec: + lo, hi = (int(x) for x in spec.split("-")) + ids = list(range(lo, hi + 1)) + elif "," in spec: + ids = [int(x) for x in spec.split(",")] + else: + ids = [int(spec)] + return ids + return ids + + +def build_moe_comm_callable(platform: str) -> ChipCallable: + """Build MoE callable with inter-chip communication (dispatch-compute-combine).""" + print("[moe_multi_chip] [DEBUG] Starting kernel compilation...", flush=True) + kc = KernelCompiler(platform=platform) + runtime = "tensormap_and_ringbuffer" + pto_isa_root = ensure_pto_isa_root(clone_protocol="https") + print(f"[moe_multi_chip] [DEBUG] pto_isa_root: {pto_isa_root}", flush=True) + include_dirs = kc.get_orchestration_include_dirs(runtime) + + # Add platform_comm include directory for CommContext + kernel_include_dirs = list(include_dirs) + [str(kc.project_root / "src" / "common")] + + # Build three kernels + print("[moe_multi_chip] [DEBUG] Compiling dispatch kernel...", flush=True) + dispatch_bytes = kc.compile_incore( + source_path=os.path.join(HERE, "kernels/aiv/moe_dispatch_alltoall.cpp"), + core_type="aiv", + pto_isa_root=pto_isa_root, + extra_include_dirs=kernel_include_dirs, + ) + print("[moe_multi_chip] [DEBUG] Dispatch kernel compiled", flush=True) + + print("[moe_multi_chip] [DEBUG] Compiling simple compute kernel...", flush=True) + compute_bytes = kc.compile_incore( + source_path=os.path.join(HERE, "kernels/aiv/moe_simple_compute.cpp"), + core_type="aiv", + pto_isa_root=pto_isa_root, + extra_include_dirs=include_dirs, + ) + print("[moe_multi_chip] [DEBUG] Simple compute kernel compiled", flush=True) + + print("[moe_multi_chip] [DEBUG] Compiling combine kernel...", flush=True) + combine_bytes = kc.compile_incore( + source_path=os.path.join(HERE, "kernels/aiv/moe_combine_alltoall.cpp"), + core_type="aiv", + pto_isa_root=pto_isa_root, + extra_include_dirs=kernel_include_dirs, + ) + print("[moe_multi_chip] [DEBUG] Combine kernel compiled", flush=True) + + if not platform.endswith("sim"): + print("[moe_multi_chip] [DEBUG] Extracting text sections from ELF binaries...", flush=True) + from simpler_setup.elf_parser import extract_text_section + dispatch_bytes = extract_text_section(dispatch_bytes) + compute_bytes = extract_text_section(compute_bytes) + combine_bytes = extract_text_section(combine_bytes) + print("[moe_multi_chip] [DEBUG] Text sections extracted", flush=True) + + print("[moe_multi_chip] [DEBUG] Compiling orchestration...", flush=True) + orch_bytes = kc.compile_orchestration( + runtime_name=runtime, + source_path=os.path.join(HERE, "kernels/orchestration/moe_comm_orch.cpp"), + ) + print("[moe_multi_chip] [DEBUG] Orchestration compiled", flush=True) + + # Build core callables + dispatch_cc = CoreCallable.build( + signature=[ArgDirection.IN, ArgDirection.OUT, ArgDirection.INOUT, + ArgDirection.IN, ArgDirection.IN, ArgDirection.IN], + binary=dispatch_bytes, + ) + + compute_cc = CoreCallable.build( + signature=[ArgDirection.INOUT, ArgDirection.IN, ArgDirection.IN, ArgDirection.IN], + binary=compute_bytes, + ) + + combine_cc = CoreCallable.build( + signature=[ArgDirection.IN, ArgDirection.OUT, ArgDirection.INOUT, + ArgDirection.IN, ArgDirection.IN, ArgDirection.IN], + binary=combine_bytes, + ) + + return ChipCallable.build( + signature=[ + ArgDirection.IN, # send[num_experts][num_tokens][hidden_dim] + ArgDirection.OUT, # 
recv[num_cards][num_tokens][hidden_dim] + ArgDirection.OUT, # output[num_tokens][hidden_dim] + ArgDirection.INOUT, # scratch HCCL buffer + ArgDirection.IN, # expert_id + ArgDirection.IN, # card_id + ArgDirection.IN, # num_cards + ArgDirection.IN, # CommContext* + ], + func_name="aicpu_orchestration_entry", + binary=orch_bytes, + children=[(0, dispatch_cc), (1, compute_cc), (2, combine_cc)], + ) + + +def run(platform: str, device_ids: list[int]) -> int: + """Core logic - implements true inter-chip communication MoE.""" + print("[moe_multi_chip] [DEBUG] run() function started", flush=True) + num_cards = len(device_ids) + num_experts = num_cards # One expert per chip + + print(f"[moe_multi_chip] devices={device_ids} num_cards={num_cards} num_experts={num_experts}", flush=True) + print(f"[moe_multi_chip] NUM_TOKENS={NUM_TOKENS} HIDDEN_DIM={HIDDEN_DIM} COUNT={COUNT}", flush=True) + + # Configure HCCL communication + # Scratch buffer size: num_cards * num_cards slots (all cards' data) + # Layout: scratch[card_j][expert_i][tokens][hidden_dim] + scratch_count = num_cards * num_cards * NUM_TOKENS * HIDDEN_DIM + scratch_nbytes = scratch_count * 4 # float32 + + # Allocate space for signals at tail of scratch + total_scratch_nbytes = scratch_nbytes + num_cards * 4 # + num_cards int32 signals + window_size = max(total_scratch_nbytes, 4 * 1024) + + rootinfo_path = f"/tmp/pto_moe_multi_chip_{os.getpid()}.bin" + print(f"[moe_multi_chip] [DEBUG] HCCL config: scratch_count={scratch_count} window_size={window_size} rootinfo={rootinfo_path}", flush=True) + + # Clean up any stale rootinfo file + try: + os.unlink(rootinfo_path) + print(f"[moe_multi_chip] [DEBUG] Cleaned up stale rootinfo file", flush=True) + except FileNotFoundError: + print(f"[moe_multi_chip] [DEBUG] No stale rootinfo file to clean", flush=True) + pass + + torch.manual_seed(42) + print("[moe_multi_chip] [DEBUG] Random seed set", flush=True) + + # Per-card data layout (3D/2D as per user requirement) + # send[i]: [num_experts, num_tokens, hidden_dim] + host_send = [torch.ones(num_experts, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() + for _ in device_ids] + + # recv[i]: [num_cards, num_tokens, hidden_dim] - receives data from all cards for expert_i + host_recv = [torch.zeros(num_cards, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() + for _ in device_ids] + + # output[i]: [num_tokens, hidden_dim] + host_output = [torch.zeros(NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() + for _ in device_ids] + + print("[moe_multi_chip] [DEBUG] All tensors allocated, host_send initialized to 1.0", flush=True) + + # Configure HCCL bootstrap for each card + cfgs = [ + ChipBootstrapConfig( + comm=ChipCommBootstrapConfig( + rank=rank, + nranks=num_cards, + rootinfo_path=rootinfo_path, + window_size=window_size, + ), + buffers=[ + ChipBufferSpec( + name="scratch", + dtype="float32", + count=scratch_count, + nbytes=total_scratch_nbytes, + ), + ], + ) + for rank in range(num_cards) + ] + + print("[moe_multi_chip] [DEBUG] Creating Worker...", flush=True) + worker = Worker( + level=3, + platform=platform, + runtime="tensormap_and_ringbuffer", + device_ids=device_ids, + num_sub_workers=0, + chip_bootstrap_configs=cfgs, + ) + print("[moe_multi_chip] [DEBUG] Worker created", flush=True) + + print(f"[moe_multi_chip] compiling kernels for {platform}...", flush=True) + moe_cc = build_moe_comm_callable(platform) + print("[moe_multi_chip] [DEBUG] All kernels compiled successfully", flush=True) + + print("[moe_multi_chip] init worker 
(with HCCL communication)...", flush=True) + worker.init() + print("[moe_multi_chip] [DEBUG] Worker initialized", flush=True) + + # Get chip contexts (contains CommContext pointers) + contexts = worker.chip_contexts + print(f"[moe_multi_chip] chip contexts: {len(contexts)}", flush=True) + for i, ctx in enumerate(contexts): + print(f"[moe_multi_chip] card {i}: rank={ctx.rank}/{ctx.nranks} device_ctx=0x{ctx.device_ctx:x}", flush=True) + + try: + # 第一次运行:只执行到dispatch阶段,查看recv数据 + # 注意:当前orchestration是一次性执行所有3个阶段,所以无法分阶段查看 + # 这里我们运行完整流程,然后在host端查看最终结果 + + def orch_fn(orch, _args, cfg): + print(f"[moe_multi_chip] orch_fn: Starting submission for {num_cards} cards", flush=True) + # Each card submits a task that: + # 1. Dispatches its expert data to all cards + # 2. Computes on received data + # 3. Combines results back to source cards + for i in range(num_cards): + print(f"[moe_multi_chip] orch_fn: Submitting task for card {i} (worker {i})", flush=True) + moe_args = TaskArgs() + moe_args.add_tensor(make_tensor_arg(host_send[i]), TensorArgType.INPUT) + moe_args.add_tensor(make_tensor_arg(host_recv[i]), TensorArgType.OUTPUT_EXISTING) + moe_args.add_tensor(make_tensor_arg(host_output[i]), TensorArgType.OUTPUT_EXISTING) + + # Scratch buffer (HCCL window) + from simpler.task_interface import ContinuousTensor + moe_args.add_tensor( + ContinuousTensor.make( + data=contexts[i].buffer_ptrs["scratch"], + shapes=(scratch_count,), + dtype=DataType.FLOAT32, + child_memory=True, + ), + TensorArgType.INOUT, + ) + + moe_args.add_scalar(i) # expert_id + moe_args.add_scalar(i) # card_id + moe_args.add_scalar(num_cards) + moe_args.add_scalar(contexts[i].device_ctx) + + result = orch.submit_next_level(moe_cc, moe_args, cfg, worker=i) + print(f"[moe_multi_chip] orch_fn: Submitted task for card {i}, result={result}", flush=True) + + print(f"[moe_multi_chip] orch_fn: All {num_cards} tasks submitted", flush=True) + + print("[moe_multi_chip] running multi-chip MoE DAG with inter-chip communication...", flush=True) + print("[moe_multi_chip] [DEBUG] About to call worker.run()...", flush=True) + worker.run(orch_fn, args=None, config=CallConfig()) + print("[moe_multi_chip] [DEBUG] worker.run() completed", flush=True) + + # 打印host端的recv数据(这是所有阶段完成后的最终recv状态) + print("\n[moe_multi_chip] ===== Host-side recv data (after all stages) =====") + for i in range(num_cards): + print(f"[moe_multi_chip] Card {i} recv shape: {host_recv[i].shape}") + print(f"[moe_multi_chip] Card {i} recv sample (first 2 cards' data, first 2 tokens, first 3 dims):") + for card_j in range(min(2, num_cards)): + for t in range(min(2, COUNT)): + print(f" recv[{card_j}][{t}][:3] = {host_recv[i][card_j, t, :3].tolist()}") + + # 打印host端的output数据 + print("\n[moe_multi_chip] ===== Host-side output data (final) =====") + for i in range(num_cards): + print(f"[moe_multi_chip] Card {i} output shape: {host_output[i].shape}") + print(f"[moe_multi_chip] Card {i} output sample (first {COUNT} tokens, first 3 dims):") + for t in range(COUNT): + print(f" output[{t}][:3] = {host_output[i][t, :3].tolist()}") + + print("\n[moe_multi_chip] Results:") + for i in range(num_cards): + print(f"[moe_multi_chip] card {i} output shape: {host_output[i].shape}") + print(f"[moe_multi_chip] card {i} output sample (first {COUNT} tokens, first 3 dims):") + for t in range(COUNT): + print(f" token {t}: {host_output[i][t, :3]}") + + # Verify against golden.py + print("\n[moe_multi_chip] Verifying against golden.py...") + + # For golden, we need to reconstruct the original input data + # 
host_send[i]: [num_experts, NUM_TOKENS, HIDDEN_DIM] + # Convert to golden format: [num_cards, num_experts, NUM_TOKENS, HIDDEN_DIM] + send_batch = torch.stack(host_send) # [num_cards, num_experts, NUM_TOKENS, HIDDEN_DIM] + + # Initialize recv in golden format: [num_experts, num_cards, NUM_TOKENS, HIDDEN_DIM] + # This will be filled by the dispatch phase + recv_batch = torch.zeros(num_experts, num_cards, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32) + + # Initialize output for golden as ZERO tensor (not containing hardware results!) + # golden.py's demo function uses +=, so it must start from zero + golden_output_input = torch.zeros(num_cards, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32) + + # Run golden to compute expected output + # Note: golden.py's demo function modifies recv and output in place + import sys + golden_path = os.path.join(HERE, "golden.py") + if golden_path not in sys.path: + sys.path.insert(0, HERE) + + # Import golden module + import importlib.util + spec = importlib.util.spec_from_file_location("golden", golden_path) + golden_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(golden_module) + + # Run golden computation (modifies golden_output_input in place) + # The golden function computes: output[i][:][:] = sum_j (send[j][i] + i) + # where only the first COUNT tokens are processed + golden_output = golden_module.demo(send_batch, recv_batch, golden_output_input) + + # Compare results + all_match = True + for i in range(num_cards): + max_diff = float(torch.max(torch.abs(host_output[i] - golden_output[i]))) + mean_diff = float(torch.mean(torch.abs(host_output[i] - golden_output[i]))) + print(f"[moe_multi_chip] card {i}: max |output - golden| = {max_diff:.6e}, mean diff = {mean_diff:.6e}") + + if max_diff > 1e-3: + all_match = False + print(f"[moe_multi_chip] card {i} MISMATCH! 
Showing first {COUNT} tokens:") + for t in range(COUNT): + actual = host_output[i][t, :3] + expected = golden_output[i][t, :3] + print(f" token {t}: actual={actual.tolist()}, expected={expected.tolist()}") + else: + print(f"[moe_multi_chip] card {i} ✅ matches golden") + + if all_match: + print("\n[moe_multi_chip] ✅ All cards matched golden.py!") + return 0 + else: + print("\n[moe_multi_chip] ❌ Some cards did NOT match golden.py") + return 1 + + except Exception as e: + print(f"[moe_multi_chip] ERROR: {e}") + import traceback + traceback.print_exc() + return 1 + + finally: + print("[moe_multi_chip] shutting down worker...") + worker.close() + + # Clean up rootinfo file + try: + os.unlink(rootinfo_path) + except FileNotFoundError: + pass + + +def main() -> int: + args = parse_args() + device_ids = parse_device_range(args.device) + return run(args.platform, device_ids) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/workers/l3/moe_multi_chip_experts/test_combine_only.py b/examples/workers/l3/moe_multi_chip_experts/test_combine_only.py new file mode 100755 index 000000000..3d3d70c30 --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/test_combine_only.py @@ -0,0 +1,411 @@ +#!/usr/bin/env python3 +# Test combine kernel in isolation with unique integer values per token + +import argparse +import os +import sys + +os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE") + +import torch +from simpler.task_interface import ( + ArgDirection, + CallConfig, + ChipBootstrapConfig, + ChipBufferSpec, + ChipCallable, + ChipCommBootstrapConfig, + CoreCallable, + DataType, + TaskArgs, + TensorArgType, +) +from simpler.worker import Worker + +from simpler_setup.kernel_compiler import KernelCompiler +from simpler_setup.pto_isa import ensure_pto_isa_root +from simpler_setup.torch_interop import make_tensor_arg + +HERE = os.path.dirname(os.path.abspath(__file__)) + +# MoE configuration +NUM_TOKENS = 10 +HIDDEN_DIM = 16 +COUNT = 4 + + +def parse_args(): + parser = argparse.ArgumentParser(description="Test combine kernel in isolation") + parser.add_argument("-p", "--platform", required=True, choices=["a2a3sim", "a2a3", "a5sim", "a5"]) + parser.add_argument("-d", "--device", default="0-1", help="Device range") + return parser.parse_args() + + +def parse_device_range(spec: str) -> list[int]: + if "-" in spec: + lo, hi = (int(x) for x in spec.split("-")) + return list(range(lo, hi + 1)) + elif "," in spec: + return [int(x) for x in spec.split(",")] + else: + return [int(spec)] + + +def build_combine_only_callable(platform: str) -> ChipCallable: + """Build callable with ONLY combine kernel.""" + print("[Combine-Only] Compiling combine kernel...", flush=True) + kc = KernelCompiler(platform=platform) + runtime = "tensormap_and_ringbuffer" + pto_isa_root = ensure_pto_isa_root(clone_protocol="https") + include_dirs = kc.get_orchestration_include_dirs(runtime) + kernel_include_dirs = list(include_dirs) + [str(kc.project_root / "src" / "common")] + + # Compile combine kernel + combine_bytes = kc.compile_incore( + source_path=os.path.join(HERE, "kernels/aiv/moe_combine_alltoall2.cpp"), + core_type="aiv", + pto_isa_root=pto_isa_root, + extra_include_dirs=kernel_include_dirs, + ) + print("[Combine-Only] Combine kernel compiled", flush=True) + + if not platform.endswith("sim"): + from simpler_setup.elf_parser import extract_text_section + combine_bytes = extract_text_section(combine_bytes) + print("[Combine-Only] Text sections extracted", flush=True) + + # Compile orchestration + 
print("[Combine-Only] Compiling orchestration...", flush=True) + orch_bytes = kc.compile_orchestration( + runtime_name=runtime, + source_path=os.path.join(HERE, "kernels/orchestration/moe_combine_only_orch.cpp"), + ) + print("[Combine-Only] Orchestration compiled", flush=True) + + # Build core callable + combine_cc = CoreCallable.build( + signature=[ArgDirection.IN, ArgDirection.OUT, ArgDirection.INOUT, ArgDirection.OUT, + ArgDirection.IN, ArgDirection.IN, ArgDirection.IN], + binary=combine_bytes, + ) + + return ChipCallable.build( + signature=[ + ArgDirection.IN, # recv + ArgDirection.OUT, # output + ArgDirection.INOUT, # scratch + ArgDirection.OUT, # scratch_print + ArgDirection.IN, # card_id + ArgDirection.IN, # num_cards + ArgDirection.IN, # CommContext* + ], + func_name="aicpu_orchestration_entry", + binary=orch_bytes, + children=[(0, combine_cc)], # Only combine child + ) + + +def compute_golden_output(num_cards: int, host_recv: list[torch.Tensor]) -> list[torch.Tensor]: + """ + Compute golden output using direct store logic: + output[cardi][expertj][:count][:] = recv[expertj, cardi, :count, :] + + For combine-only test: + - Each card_j's recv[j] has shape [num_cards, NUM_TOKENS, HIDDEN_DIM] + - recv[j][i][t][d] = expert_j's processed data for card_i + - Card i's output[expert_j][:][:] stores expert_j's data for card_i + """ + golden_outputs = [] + for cardi in range(num_cards): + output = torch.zeros(num_cards, COUNT, HIDDEN_DIM, dtype=torch.float32) + for expertj in range(num_cards): + # recv[expertj][cardi][:][:] = expert_j's processed data for card_i + # Store to output[expertj][:][:] + output[expertj, :, :] = host_recv[expertj][cardi, :COUNT, :] + golden_outputs.append(output) + + return golden_outputs + + +def initialize_recv_with_unique_integers(num_cards: int, device_id: int) -> torch.Tensor: + """ + Initialize recv tensor with unique integers for each token. + + Direct store logic (no accumulation): + - recv[expert_i][card_j][t][d] = expert_i processed data for card_j + - output[card_j][expert_i][t][d] = recv[expert_i][card_j][t][d] (direct copy) + + Each position gets a unique value to trace data flow: + value = (expert * 10000) + (card_j * 100) + (t * 10) + d + + This way we can identify which expert's data ended up where. 
+ """ + recv = torch.zeros(num_cards, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() + + for expert_i in range(num_cards): + for t in range(NUM_TOKENS): + for d in range(HIDDEN_DIM): + value = float(expert_i * 10000 + device_id * 100 + t * 10 + d) + recv[expert_i, t, d] = value + + return recv + + +def run(platform: str, device_ids: list[int]) -> int: + print(f"[Combine-Only] Testing combine on devices {device_ids}", flush=True) + num_cards = len(device_ids) + + print(f"\n[Combine-Only] Test Configuration:") + print(f" Platform: {platform}") + print(f" Number of cards: {num_cards}") + print(f" Device IDs: {device_ids}") + print(f" NUM_TOKENS: {NUM_TOKENS}") + print(f" HIDDEN_DIM: {HIDDEN_DIM}") + print(f" COUNT (tokens processed): {COUNT}") + print(f" Total values per card: {num_cards * COUNT * HIDDEN_DIM}") + print(f" Total values to verify: {num_cards * num_cards * COUNT * HIDDEN_DIM}") + + # Configure HCCL + scratch_count = num_cards * num_cards * NUM_TOKENS * HIDDEN_DIM + scratch_nbytes = scratch_count * 4 + total_scratch_nbytes = scratch_nbytes + num_cards * 4 + window_size = max(total_scratch_nbytes, 4 * 1024) + + print(f"\n[Combine-Only] Memory Configuration:") + print(f" Scratch buffer size: {scratch_count} elements = {scratch_nbytes / 1024:.2f} KB") + print(f" Total with signals: {total_scratch_nbytes / 1024:.2f} KB") + print(f" HCCL window size: {window_size / 1024:.2f} KB") + + rootinfo_path = f"/tmp/pto_combine_only_{os.getpid()}.bin" + try: + os.unlink(rootinfo_path) + except FileNotFoundError: + pass + + torch.manual_seed(42) + + # Allocate tensors with unique integer values for each token + host_recv = [] + for i in device_ids: + recv = initialize_recv_with_unique_integers(num_cards, i) + host_recv.append(recv) + + host_output = [torch.zeros(num_cards, COUNT, HIDDEN_DIM, dtype=torch.float32).share_memory_() + for _ in device_ids] + + # Allocate scratch_print tensors (debug output) + host_scratch_print = [torch.zeros(scratch_count, dtype=torch.float32).share_memory_() + for _ in device_ids] + + # Compute golden output BEFORE running the kernel + print("\n[Combine-Only] Computing golden output using golden.py logic...") + golden_outputs = compute_golden_output(num_cards, host_recv) + print("[Combine-Only] Golden output computed", flush=True) + + print(f"\n[Combine-Only] Allocated tensors: recv=unique_integers, output=0.0", flush=True) + + # Configure HCCL bootstrap + cfgs = [ + ChipBootstrapConfig( + comm=ChipCommBootstrapConfig( + rank=rank, + nranks=num_cards, + rootinfo_path=rootinfo_path, + window_size=window_size, + ), + buffers=[ + ChipBufferSpec( + name="scratch", + dtype="float32", + count=scratch_count, + nbytes=total_scratch_nbytes, + ), + ], + ) + for rank in range(num_cards) + ] + + # Create worker + worker = Worker( + level=3, + platform=platform, + runtime="tensormap_and_ringbuffer", + device_ids=device_ids, + num_sub_workers=0, + chip_bootstrap_configs=cfgs, + ) + + print(f"\n[Combine-Only] Compiling kernels for {platform}...", flush=True) + combine_cc = build_combine_only_callable(platform) + print("[Combine-Only] All kernels compiled successfully", flush=True) + + print("[Combine-Only] Initializing worker...", flush=True) + worker.init() + contexts = worker.chip_contexts + print(f"[Combine-Only] Worker initialized with {len(contexts)} contexts", flush=True) + + try: + def orch_fn(orch, _args, cfg): + print(f"[Combine-Only] Submitting tasks for {num_cards} cards", flush=True) + for i in range(num_cards): + combine_args = TaskArgs() + 
combine_args.add_tensor(make_tensor_arg(host_recv[i]), TensorArgType.INPUT) + combine_args.add_tensor(make_tensor_arg(host_output[i]), TensorArgType.OUTPUT_EXISTING) + + from simpler.task_interface import ContinuousTensor + combine_args.add_tensor( + ContinuousTensor.make( + data=contexts[i].buffer_ptrs["scratch"], + shapes=(scratch_count,), + dtype=DataType.FLOAT32, + child_memory=True, + ), + TensorArgType.INOUT, + ) + combine_args.add_tensor(make_tensor_arg(host_scratch_print[i]), TensorArgType.OUTPUT_EXISTING) + + combine_args.add_scalar(i) # card_id + combine_args.add_scalar(num_cards) + combine_args.add_scalar(contexts[i].device_ctx) + + result = orch.submit_next_level(combine_cc, combine_args, cfg, worker=i) + print(f"[Combine-Only] Submitted task for card {i}", flush=True) + + print("[Combine-Only] Running combine-only test...", flush=True) + + # Print what each card will do + print("\n[Combine-Only] Task breakdown:") + for i in range(num_cards): + print(f" Card {i}: Will combine results from all experts for card {i}") + print(f" Input: recv[{i}][expert][{COUNT} tokens][{HIDDEN_DIM} dims]") + print(f" Output: output[num_experts={num_cards}][{COUNT} tokens][{HIDDEN_DIM} dims]") + + # Print output initial values BEFORE running kernel + print("\n" + "="*80) + print("[Combine-Only] OUTPUT INITIAL VALUES (before kernel):") + print("="*80) + for i in range(num_cards): + print(f"\n[Combine-Only] Card {i} output initial values:") + print(f" Shape: {host_output[i].shape}") + for expert_i in range(num_cards): + print(f" Expert {expert_i}:") + for t in range(COUNT): + vals = host_output[i][expert_i, t, :].tolist() + print(f" Token {t}: {vals}") + + worker.run(orch_fn, args=None, config=CallConfig()) + print("\n[Combine-Only] Test completed successfully!", flush=True) + + # Print scratch_print buffer contents for debugging + print("\n" + "="*80) + print("[Combine-Only] SCRATCH_PRINT BUFFER CONTENTS (Phase 1 stage-in mirror):") + print("="*80) + + for i in range(num_cards): + print(f"\n[Combine-Only] Card {i} scratch_print buffer (device {device_ids[i]}):") + print(f" Layout: scratch_print[expert_i][card_j][token][dim]") + print(f" Size: [{num_cards}][{num_cards}][{NUM_TOKENS}][{HIDDEN_DIM}]") + + for expert_i in range(num_cards): + print(f"\n Expert {expert_i}:") + for card_j in range(num_cards): + print(f" For card {card_j}:") + for t in range(COUNT): + offset = expert_i * num_cards * NUM_TOKENS * HIDDEN_DIM + card_j * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM + vals = host_scratch_print[i][offset:offset+HIDDEN_DIM].tolist() + print(f" Token {t}: {vals}") + + # Print results + print("\n" + "="*80) + print("[Combine-Only] INPUT RECV DATA:") + print("="*80) + + for i in range(num_cards): + print(f"\n[Combine-Only] Card {i} recv data (device {device_ids[i]}):") + print(f" Shape: {host_recv[i].shape}") + for expert_i in range(num_cards): + print(f"\n Expert {expert_i}:") + for t in range(NUM_TOKENS): + vals = host_recv[i][expert_i, t, :].tolist() + print(f" Token {t}: {vals}") + + print("\n" + "="*80) + print("[Combine-Only] OUTPUT DATA (after combine):") + print("="*80) + + for i in range(num_cards): + print(f"\n[Combine-Only] Card {i} output data:") + print(f" Shape: {host_output[i].shape}") + for expert_i in range(num_cards): + print(f"\n Expert {expert_i}:") + for t in range(COUNT): + vals = host_output[i][expert_i, t, :].tolist() + golden_vals = golden_outputs[i][expert_i, t, :].tolist() + print(f"\n Token {t}:") + print(f" Output: {vals}") + print(f" Golden: {golden_vals}") + match = 
all(abs(v - g) < 1e-3 for v, g in zip(vals, golden_vals)) + print(f" Match: {'✓' if match else '✗'}") + + # Verify correctness by comparing with pre-computed golden output + print("\n" + "="*80) + print("[Combine-Only] VERIFICATION SUMMARY:") + print("="*80) + + all_correct = True + error_count = 0 + total_checked = 0 + + for i in range(num_cards): + print(f"\n[Combine-Only] Card {i}:") + card_errors = 0 + + for expert_i in range(num_cards): + for t in range(COUNT): + for d in range(HIDDEN_DIM): + expected = golden_outputs[i][expert_i, t, d].item() + actual = host_output[i][expert_i, t, d].item() + total_checked += 1 + + if abs(actual - expected) > 1e-3: + card_errors += 1 + error_count += 1 + all_correct = False + + if card_errors == 0: + print(f" ✓ All {num_cards * COUNT * HIDDEN_DIM} values correct") + else: + print(f" ✗ {card_errors} / {num_cards * COUNT * HIDDEN_DIM} values incorrect") + + print(f"\n Total: {total_checked - error_count}/{total_checked} correct") + + if all_correct: + print("\n[Combine-Only] ✅ All values correct! Combine kernel works perfectly.") + return 0 + else: + print("\n[Combine-Only] ❌ Some values incorrect!") + return 1 + + except Exception as e: + print(f"[Combine-Only] ERROR: {e}") + import traceback + traceback.print_exc() + return 1 + + finally: + print("[Combine-Only] Shutting down worker...") + worker.close() + try: + os.unlink(rootinfo_path) + except FileNotFoundError: + pass + + +def main() -> int: + args = parse_args() + device_ids = parse_device_range(args.device) + return run(args.platform, device_ids) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/workers/l3/moe_multi_chip_experts/test_dispatch_compute.py b/examples/workers/l3/moe_multi_chip_experts/test_dispatch_compute.py new file mode 100644 index 000000000..59d7580b5 --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/test_dispatch_compute.py @@ -0,0 +1,290 @@ +#!/usr/bin/env python3 +# Test dispatch + compute kernels together + +import argparse +import os +import sys + +os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE") + +import torch +from simpler.task_interface import ( + ArgDirection, + CallConfig, + ChipBootstrapConfig, + ChipBufferSpec, + ChipCallable, + ChipCommBootstrapConfig, + CoreCallable, + DataType, + TaskArgs, + TensorArgType, +) +from simpler.worker import Worker + +from simpler_setup.kernel_compiler import KernelCompiler +from simpler_setup.pto_isa import ensure_pto_isa_root +from simpler_setup.torch_interop import make_tensor_arg + +HERE = os.path.dirname(os.path.abspath(__file__)) + +# MoE configuration +NUM_TOKENS = 10 +HIDDEN_DIM = 16 +COUNT = 4 + + +def parse_args(): + parser = argparse.ArgumentParser(description="Test dispatch + compute kernels") + parser.add_argument("-p", "--platform", required=True, choices=["a2a3sim", "a2a3", "a5sim", "a5"]) + parser.add_argument("-d", "--device", default="0-1", help="Device range") + return parser.parse_args() + + +def parse_device_range(spec: str) -> list[int]: + if "-" in spec: + lo, hi = (int(x) for x in spec.split("-")) + return list(range(lo, hi + 1)) + elif "," in spec: + return [int(x) for x in spec.split(",")] + else: + return [int(spec)] + + +def build_dispatch_compute_callable(platform: str) -> ChipCallable: + """Build callable with dispatch + compute kernels.""" + print("[Dispatch+Compute] Compiling kernels...", flush=True) + kc = KernelCompiler(platform=platform) + runtime = "tensormap_and_ringbuffer" + pto_isa_root = ensure_pto_isa_root(clone_protocol="https") + include_dirs = 
kc.get_orchestration_include_dirs(runtime) + kernel_include_dirs = list(include_dirs) + [str(kc.project_root / "src" / "common")] + + # Compile dispatch kernel + dispatch_bytes = kc.compile_incore( + source_path=os.path.join(HERE, "kernels/aiv/moe_dispatch_alltoall.cpp"), + core_type="aiv", + pto_isa_root=pto_isa_root, + extra_include_dirs=kernel_include_dirs, + ) + print("[Dispatch+Compute] Dispatch kernel compiled", flush=True) + + # Compile simple compute kernel + compute_bytes = kc.compile_incore( + source_path=os.path.join(HERE, "kernels/aiv/moe_simple_compute.cpp"), + core_type="aiv", + pto_isa_root=pto_isa_root, + extra_include_dirs=include_dirs, + ) + print("[Dispatch+Compute] Compute kernel compiled", flush=True) + + if not platform.endswith("sim"): + from simpler_setup.elf_parser import extract_text_section + dispatch_bytes = extract_text_section(dispatch_bytes) + compute_bytes = extract_text_section(compute_bytes) + print("[Dispatch+Compute] Text sections extracted", flush=True) + + # Compile orchestration + print("[Dispatch+Compute] Compiling orchestration...", flush=True) + orch_bytes = kc.compile_orchestration( + runtime_name=runtime, + source_path=os.path.join(HERE, "kernels/orchestration/moe_dispatch_compute_orch.cpp"), + ) + print("[Dispatch+Compute] Orchestration compiled", flush=True) + + # Build core callables + dispatch_cc = CoreCallable.build( + signature=[ArgDirection.IN, ArgDirection.OUT, ArgDirection.INOUT, + ArgDirection.IN, ArgDirection.IN, ArgDirection.IN], + binary=dispatch_bytes, + ) + + compute_cc = CoreCallable.build( + signature=[ArgDirection.INOUT, ArgDirection.IN, ArgDirection.IN, ArgDirection.IN], + binary=compute_bytes, + ) + + return ChipCallable.build( + signature=[ + ArgDirection.IN, # send + ArgDirection.OUT, # recv + ArgDirection.OUT, # output (unused) + ArgDirection.INOUT, # scratch + ArgDirection.IN, # expert_id + ArgDirection.IN, # card_id + ArgDirection.IN, # num_cards + ArgDirection.IN, # CommContext* + ], + func_name="aicpu_orchestration_entry", + binary=orch_bytes, + children=[(0, dispatch_cc), (1, compute_cc)], # Dispatch + Compute + ) + + +def run(platform: str, device_ids: list[int]) -> int: + print(f"[Dispatch+Compute] Testing on devices {device_ids}", flush=True) + num_cards = len(device_ids) + num_experts = num_cards + + # Configure HCCL + scratch_count = num_cards * num_cards * NUM_TOKENS * HIDDEN_DIM + scratch_nbytes = scratch_count * 4 + total_scratch_nbytes = scratch_nbytes + num_cards * 4 + window_size = max(total_scratch_nbytes, 4 * 1024) + + rootinfo_path = f"/tmp/pto_dispatch_compute_{os.getpid()}.bin" + try: + os.unlink(rootinfo_path) + except FileNotFoundError: + pass + + torch.manual_seed(42) + + # Allocate tensors + host_send = [torch.ones(num_experts, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() + for _ in device_ids] + host_recv = [torch.zeros(num_cards, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() + for _ in device_ids] + host_output = [torch.zeros(NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() + for _ in device_ids] + + print(f"[Dispatch+Compute] Allocated tensors: send=1.0, recv=0.0", flush=True) + + # Configure HCCL bootstrap + cfgs = [ + ChipBootstrapConfig( + comm=ChipCommBootstrapConfig( + rank=rank, + nranks=num_cards, + rootinfo_path=rootinfo_path, + window_size=window_size, + ), + buffers=[ + ChipBufferSpec( + name="scratch", + dtype="float32", + count=scratch_count, + nbytes=total_scratch_nbytes, + ), + ], + ) + for rank in range(num_cards) + ] + + # Create worker + 
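+    # A single Worker drives all device_ids; chip_bootstrap_configs gives each rank its
+    # HCCL bootstrap (shared rootinfo file + window size) and the named "scratch" window,
+    # laid out as scratch[card_j][expert_i][tokens][hidden_dim] for the all-to-all dispatch.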
worker = Worker( + level=3, + platform=platform, + runtime="tensormap_and_ringbuffer", + device_ids=device_ids, + num_sub_workers=0, + chip_bootstrap_configs=cfgs, + ) + + print(f"[Dispatch+Compute] Compiling kernels for {platform}...", flush=True) + dispatch_compute_cc = build_dispatch_compute_callable(platform) + print("[Dispatch+Compute] All kernels compiled successfully", flush=True) + + print("[Dispatch+Compute] Initializing worker...", flush=True) + worker.init() + contexts = worker.chip_contexts + print(f"[Dispatch+Compute] Worker initialized with {len(contexts)} contexts", flush=True) + + try: + def orch_fn(orch, _args, cfg): + print(f"[Dispatch+Compute] Submitting tasks for {num_cards} cards", flush=True) + for i in range(num_cards): + args = TaskArgs() + args.add_tensor(make_tensor_arg(host_send[i]), TensorArgType.INPUT) + args.add_tensor(make_tensor_arg(host_recv[i]), TensorArgType.OUTPUT_EXISTING) + args.add_tensor(make_tensor_arg(host_output[i]), TensorArgType.OUTPUT_EXISTING) + + from simpler.task_interface import ContinuousTensor + args.add_tensor( + ContinuousTensor.make( + data=contexts[i].buffer_ptrs["scratch"], + shapes=(scratch_count,), + dtype=DataType.FLOAT32, + child_memory=True, + ), + TensorArgType.INOUT, + ) + + args.add_scalar(i) # expert_id + args.add_scalar(i) # card_id + args.add_scalar(num_cards) + args.add_scalar(contexts[i].device_ctx) + + result = orch.submit_next_level(dispatch_compute_cc, args, cfg, worker=i) + print(f"[Dispatch+Compute] Submitted task for card {i}", flush=True) + + print("[Dispatch+Compute] Running dispatch+compute test...", flush=True) + worker.run(orch_fn, args=None, config=CallConfig()) + print("[Dispatch+Compute] Test completed", flush=True) + + # Print results + print("\n" + "="*80) + print("[Dispatch+Compute] RESULTS:") + print("="*80) + + for i in range(num_cards): + print(f"\n[Dispatch+Compute] Card {i} recv data (after dispatch+compute):") + print(f" Shape: {host_recv[i].shape}") + print(f" Expected: recv[i][:4][:] should be 2.0 (1.0 from dispatch + 1.0 from compute)") + print(f" Sample data (first 2 cards' data, first {COUNT} tokens, first 3 dims):") + + for card_j in range(num_cards): + print(f" recv[{card_j}][:3][:3] = [", end="") + for t in range(min(3, COUNT)): + vals = host_recv[i][card_j, t, :3].tolist() + print(f"[{vals[0]:.1f},{vals[1]:.1f},{vals[2]:.1f}]", end="") + if t < min(3, COUNT) - 1: + print(", ", end="") + print("]") + + # Verify correctness + print("\n" + "="*80) + print("[Dispatch+Compute] VERIFICATION:") + print("="*80) + + all_correct = True + for i in range(num_cards): + for card_j in range(num_cards): + for t in range(COUNT): + for d in range(HIDDEN_DIM): + expected = 2.0 # 1.0 (dispatch) + 1.0 (compute) + actual = host_recv[i][card_j, t, d].item() + if abs(actual - expected) > 1e-5: + print(f"[Dispatch+Compute] ERROR: Card {i} recv[{card_j}][{t}][{d}] = {actual}, expected {expected}") + all_correct = False + + if all_correct: + print("[Dispatch+Compute] ✅ All values correct! 
Dispatch+Compute works perfectly.") + return 0 + else: + print("[Dispatch+Compute] ❌ Some values incorrect!") + return 1 + + except Exception as e: + print(f"[Dispatch+Compute] ERROR: {e}") + import traceback + traceback.print_exc() + return 1 + + finally: + print("[Dispatch+Compute] Shutting down worker...") + worker.close() + try: + os.unlink(rootinfo_path) + except FileNotFoundError: + pass + + +def main() -> int: + args = parse_args() + device_ids = parse_device_range(args.device) + return run(args.platform, device_ids) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/workers/l3/moe_multi_chip_experts/test_dispatch_only.py b/examples/workers/l3/moe_multi_chip_experts/test_dispatch_only.py new file mode 100644 index 000000000..61490029e --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/test_dispatch_only.py @@ -0,0 +1,308 @@ +#!/usr/bin/env python3 +# Test dispatch kernel in isolation + +import argparse +import os +import sys + +os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE") + +import torch +from simpler.task_interface import ( + ArgDirection, + CallConfig, + ChipBootstrapConfig, + ChipBufferSpec, + ChipCallable, + ChipCommBootstrapConfig, + CoreCallable, + DataType, + TaskArgs, + TensorArgType, +) +from simpler.worker import Worker + +from simpler_setup.kernel_compiler import KernelCompiler +from simpler_setup.pto_isa import ensure_pto_isa_root +from simpler_setup.torch_interop import make_tensor_arg + +HERE = os.path.dirname(os.path.abspath(__file__)) + +# MoE configuration +NUM_TOKENS = 10 +HIDDEN_DIM = 16 +COUNT = 4 + + +def parse_args(): + parser = argparse.ArgumentParser(description="Test dispatch kernel in isolation") + parser.add_argument("-p", "--platform", required=True, choices=["a2a3sim", "a2a3", "a5sim", "a5"]) + parser.add_argument("-d", "--device", default="0-1", help="Device range") + return parser.parse_args() + + +def parse_device_range(spec: str) -> list[int]: + if "-" in spec: + lo, hi = (int(x) for x in spec.split("-")) + return list(range(lo, hi + 1)) + elif "," in spec: + return [int(x) for x in spec.split(",")] + else: + return [int(spec)] + + +def build_dispatch_only_callable(platform: str) -> ChipCallable: + """Build callable with ONLY dispatch kernel.""" + print("[Dispatch-Only] Compiling dispatch kernel...", flush=True) + kc = KernelCompiler(platform=platform) + runtime = "tensormap_and_ringbuffer" + pto_isa_root = ensure_pto_isa_root(clone_protocol="https") + include_dirs = kc.get_orchestration_include_dirs(runtime) + kernel_include_dirs = list(include_dirs) + [str(kc.project_root / "src" / "common")] + + # Compile dispatch kernel + dispatch_bytes = kc.compile_incore( + source_path=os.path.join(HERE, "kernels/aiv/moe_dispatch_alltoall.cpp"), + core_type="aiv", + pto_isa_root=pto_isa_root, + extra_include_dirs=kernel_include_dirs, + ) + print("[Dispatch-Only] Dispatch kernel compiled", flush=True) + + if not platform.endswith("sim"): + from simpler_setup.elf_parser import extract_text_section + dispatch_bytes = extract_text_section(dispatch_bytes) + print("[Dispatch-Only] Text sections extracted", flush=True) + + # Compile orchestration + print("[Dispatch-Only] Compiling orchestration...", flush=True) + orch_bytes = kc.compile_orchestration( + runtime_name=runtime, + source_path=os.path.join(HERE, "kernels/orchestration/moe_dispatch_only_orch.cpp"), + ) + print("[Dispatch-Only] Orchestration compiled", flush=True) + + # Build core callable + dispatch_cc = CoreCallable.build( + signature=[ArgDirection.IN, 
ArgDirection.OUT, ArgDirection.INOUT, + ArgDirection.IN, ArgDirection.IN, ArgDirection.IN], + binary=dispatch_bytes, + ) + + return ChipCallable.build( + signature=[ + ArgDirection.IN, # send + ArgDirection.OUT, # recv + ArgDirection.OUT, # output (unused but needed for signature) + ArgDirection.INOUT, # scratch + ArgDirection.IN, # expert_id + ArgDirection.IN, # card_id + ArgDirection.IN, # num_cards + ArgDirection.IN, # CommContext* + ], + func_name="aicpu_orchestration_entry", + binary=orch_bytes, + children=[(0, dispatch_cc)], # Only dispatch child + ) + + +def run(platform: str, device_ids: list[int]) -> int: + print(f"[Dispatch-Only] Testing dispatch on devices {device_ids}", flush=True) + num_cards = len(device_ids) + num_experts = num_cards + + # Configure HCCL + scratch_count = num_cards * num_cards * NUM_TOKENS * HIDDEN_DIM + scratch_nbytes = scratch_count * 4 + total_scratch_nbytes = scratch_nbytes + num_cards * 4 + window_size = max(total_scratch_nbytes, 4 * 1024) + + rootinfo_path = f"/tmp/pto_dispatch_only_{os.getpid()}.bin" + try: + os.unlink(rootinfo_path) + except FileNotFoundError: + pass + + torch.manual_seed(42) + + # Allocate tensors with unique values to trace data flow + # Value encoding: (card_id * 1000000) + (expert_id * 10000) + (token * 100) + dim + host_send = [] + for i, device_id in enumerate(device_ids): + send = torch.zeros(num_experts, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() + for expert_j in range(num_experts): + for t in range(NUM_TOKENS): + for d in range(HIDDEN_DIM): + # Unique value: card_i -> expert_j -> token_t -> dim_d + value = float(i * 1000000 + expert_j * 10000 + t * 100 + d) + send[expert_j, t, d] = value + host_send.append(send) + + host_recv = [torch.zeros(num_cards, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() + for _ in device_ids] + host_output = [torch.zeros(NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() + for _ in device_ids] + + print(f"[Dispatch-Only] Allocated tensors with unique values", flush=True) + print(f"[Dispatch-Only] Value encoding: (card_id * 1000000) + (expert_id * 10000) + (token * 100) + dim", flush=True) + print(f"[Dispatch-Only] Sample: host_send[0][0][0][0] = {host_send[0][0, 0, 0].item()} (card 0, expert 0, token 0, dim 0)", flush=True) + + # Print input values BEFORE running kernel + print("\n" + "="*80) + print("[Dispatch-Only] INPUT SEND VALUES (before kernel):") + print("="*80) + for i in range(num_cards): + print(f"\n[Dispatch-Only] Card {i} send values:") + print(f" Shape: {host_send[i].shape}") + for expert_j in range(num_experts): + print(f" Expert {expert_j}:") + for t in range(min(2, COUNT)): + vals = host_send[i][expert_j, t, :3].tolist() + print(f" Token {t}: {vals}") + + # Configure HCCL bootstrap + cfgs = [ + ChipBootstrapConfig( + comm=ChipCommBootstrapConfig( + rank=rank, + nranks=num_cards, + rootinfo_path=rootinfo_path, + window_size=window_size, + ), + buffers=[ + ChipBufferSpec( + name="scratch", + dtype="float32", + count=scratch_count, + nbytes=total_scratch_nbytes, + ), + ], + ) + for rank in range(num_cards) + ] + + # Create worker + worker = Worker( + level=3, + platform=platform, + runtime="tensormap_and_ringbuffer", + device_ids=device_ids, + num_sub_workers=0, + chip_bootstrap_configs=cfgs, + ) + + print(f"[Dispatch-Only] Compiling kernels for {platform}...", flush=True) + dispatch_cc = build_dispatch_only_callable(platform) + print("[Dispatch-Only] All kernels compiled successfully", flush=True) + + print("[Dispatch-Only] Initializing 
worker...", flush=True) + worker.init() + contexts = worker.chip_contexts + print(f"[Dispatch-Only] Worker initialized with {len(contexts)} contexts", flush=True) + + try: + def orch_fn(orch, _args, cfg): + print(f"[Dispatch-Only] Submitting tasks for {num_cards} cards", flush=True) + for i in range(num_cards): + dispatch_args = TaskArgs() + dispatch_args.add_tensor(make_tensor_arg(host_send[i]), TensorArgType.INPUT) + dispatch_args.add_tensor(make_tensor_arg(host_recv[i]), TensorArgType.OUTPUT_EXISTING) + dispatch_args.add_tensor(make_tensor_arg(host_output[i]), TensorArgType.OUTPUT_EXISTING) + + from simpler.task_interface import ContinuousTensor + dispatch_args.add_tensor( + ContinuousTensor.make( + data=contexts[i].buffer_ptrs["scratch"], + shapes=(scratch_count,), + dtype=DataType.FLOAT32, + child_memory=True, + ), + TensorArgType.INOUT, + ) + + dispatch_args.add_scalar(i) # expert_id + dispatch_args.add_scalar(i) # card_id + dispatch_args.add_scalar(num_cards) + dispatch_args.add_scalar(contexts[i].device_ctx) + + result = orch.submit_next_level(dispatch_cc, dispatch_args, cfg, worker=i) + print(f"[Dispatch-Only] Submitted task for card {i}", flush=True) + + print("[Dispatch-Only] Running dispatch-only test...", flush=True) + worker.run(orch_fn, args=None, config=CallConfig()) + print("[Dispatch-Only] Test completed", flush=True) + + # Compute golden recv using dispatch logic + def compute_golden_recv(num_cards, host_send): + """ + Compute golden recv using dispatch logic: + For card i (processing expert i): + recv[i][j][:COUNT][:] = card j's send[expert_i][:COUNT][:] + NOTE: Dispatch only processes first COUNT tokens, not all NUM_TOKENS! + """ + golden_recvs = [] + for cardi in range(num_cards): + recv = torch.zeros(num_cards, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32) + for cardj in range(num_cards): + # Card i receives from card j: card j's send[expert_i] + # expert_i = cardi (because card i processes expert i) + # Only copy first COUNT tokens! + recv[cardj, :COUNT, :] = host_send[cardj][cardi, :COUNT, :] + golden_recvs.append(recv) + return golden_recvs + + golden_recvs = compute_golden_recv(num_cards, host_send) + + # Verify correctness + print("\n" + "="*80) + print("[Dispatch-Only] VERIFICATION:") + print("="*80) + print("[Dispatch-Only] Comparing actual recv vs golden recv...") + print(f"[Dispatch-Only] Recv shape: {host_recv[0].shape} (num_cards={num_cards}, NUM_TOKENS={NUM_TOKENS}, HIDDEN_DIM={HIDDEN_DIM})") + + all_match = True + for i in range(num_cards): + max_diff = float(torch.max(torch.abs(host_recv[i] - golden_recvs[i]))) + mean_diff = float(torch.mean(torch.abs(host_recv[i] - golden_recvs[i]))) + print(f"[Dispatch-Only] Card {i}: max |recv - golden| = {max_diff:.6e}, mean diff = {mean_diff:.6e}") + + if max_diff > 1e-3: + all_match = False + print(f"[Dispatch-Only] Card {i} MISMATCH! 
Full recv data:") + for card_j in range(num_cards): + for t in range(NUM_TOKENS): + print(f" recv[{card_j}][{t}][:3] = {host_recv[i][card_j, t, :3].tolist()}") + print(f" golden[{card_j}][{t}][:3] = {golden_recvs[i][card_j, t, :3].tolist()}") + else: + print(f"[Dispatch-Only] Card {i} ✅ matches golden") + + if all_match: + print("\n[Dispatch-Only] ✅ All cards matched golden!") + return 0 + else: + print("\n[Dispatch-Only] ❌ Some cards did NOT match golden!") + return 1 + + except Exception as e: + print(f"[Dispatch-Only] ERROR: {e}") + import traceback + traceback.print_exc() + return 1 + + finally: + print("[Dispatch-Only] Shutting down worker...") + worker.close() + try: + os.unlink(rootinfo_path) + except FileNotFoundError: + pass + + +def main() -> int: + args = parse_args() + device_ids = parse_device_range(args.device) + return run(args.platform, device_ids) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/workers/l3/moe_multi_chip_experts/test_end2end.py b/examples/workers/l3/moe_multi_chip_experts/test_end2end.py new file mode 100755 index 000000000..8afe15d88 --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/test_end2end.py @@ -0,0 +1,398 @@ +#!/usr/bin/env python3 +# Test complete MoE pipeline: Dispatch + Compute + Combine + +import argparse +import os +import sys + +os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE") + +import torch +from simpler.task_interface import ( + ArgDirection, + CallConfig, + ChipBootstrapConfig, + ChipBufferSpec, + ChipCallable, + ChipCommBootstrapConfig, + CoreCallable, + DataType, + TaskArgs, + TensorArgType, +) +from simpler.worker import Worker + +from simpler_setup.kernel_compiler import KernelCompiler +from simpler_setup.pto_isa import ensure_pto_isa_root +from simpler_setup.torch_interop import make_tensor_arg + +HERE = os.path.dirname(os.path.abspath(__file__)) + +# MoE configuration +NUM_TOKENS = 10 +HIDDEN_DIM = 16 +COUNT = 4 + + +def parse_args(): + parser = argparse.ArgumentParser(description="Test complete MoE pipeline (Dispatch + Compute + Combine)") + parser.add_argument("-p", "--platform", required=True, choices=["a2a3sim", "a2a3", "a5sim", "a5"]) + parser.add_argument("-d", "--device", default="0-1", help="Device range") + return parser.parse_args() + + +def parse_device_range(spec: str) -> list[int]: + if "-" in spec: + lo, hi = (int(x) for x in spec.split("-")) + return list(range(lo, hi + 1)) + elif "," in spec: + return [int(x) for x in spec.split(",")] + else: + return [int(spec)] + + +def build_end2end_callable(platform: str) -> ChipCallable: + """Build callable with dispatch + compute + combine kernels.""" + print("[End2End] Compiling kernels...", flush=True) + kc = KernelCompiler(platform=platform) + runtime = "tensormap_and_ringbuffer" + pto_isa_root = ensure_pto_isa_root(clone_protocol="https") + include_dirs = kc.get_orchestration_include_dirs(runtime) + kernel_include_dirs = list(include_dirs) + [str(kc.project_root / "src" / "common")] + + # Compile dispatch kernel + dispatch_bytes = kc.compile_incore( + source_path=os.path.join(HERE, "kernels/aiv/moe_dispatch_alltoall.cpp"), + core_type="aiv", + pto_isa_root=pto_isa_root, + extra_include_dirs=kernel_include_dirs, + ) + print("[End2End] Dispatch kernel compiled", flush=True) + + # Compile compute kernel + compute_bytes = kc.compile_incore( + source_path=os.path.join(HERE, "kernels/aiv/moe_simple_compute.cpp"), + core_type="aiv", + pto_isa_root=pto_isa_root, + extra_include_dirs=include_dirs, + ) + print("[End2End] Compute kernel 
compiled", flush=True) + + # Compile combine kernel + combine_bytes = kc.compile_incore( + source_path=os.path.join(HERE, "kernels/aiv/moe_combine_alltoall2.cpp"), + core_type="aiv", + pto_isa_root=pto_isa_root, + extra_include_dirs=kernel_include_dirs, + ) + print("[End2End] Combine kernel compiled", flush=True) + + if not platform.endswith("sim"): + from simpler_setup.elf_parser import extract_text_section + dispatch_bytes = extract_text_section(dispatch_bytes) + compute_bytes = extract_text_section(compute_bytes) + combine_bytes = extract_text_section(combine_bytes) + print("[End2End] Text sections extracted", flush=True) + + # Compile orchestration + print("[End2End] Compiling orchestration...", flush=True) + orch_bytes = kc.compile_orchestration( + runtime_name=runtime, + source_path=os.path.join(HERE, "kernels/orchestration/moe_end2end_orch.cpp"), + ) + print("[End2End] Orchestration compiled", flush=True) + + # Build core callables + dispatch_cc = CoreCallable.build( + signature=[ArgDirection.IN, ArgDirection.OUT, ArgDirection.INOUT, + ArgDirection.IN, ArgDirection.IN, ArgDirection.IN], + binary=dispatch_bytes, + ) + + compute_cc = CoreCallable.build( + signature=[ArgDirection.INOUT, ArgDirection.IN, ArgDirection.IN, ArgDirection.IN], + binary=compute_bytes, + ) + + combine_cc = CoreCallable.build( + signature=[ArgDirection.IN, ArgDirection.OUT, ArgDirection.INOUT, ArgDirection.OUT, + ArgDirection.IN, ArgDirection.IN, ArgDirection.IN], + binary=combine_bytes, + ) + + return ChipCallable.build( + signature=[ + ArgDirection.IN, # send + ArgDirection.OUT, # recv + ArgDirection.OUT, # output + ArgDirection.INOUT, # scratch + ArgDirection.INOUT, # scratch_test + ArgDirection.OUT, # scratch_print + ArgDirection.IN, # expert_id + ArgDirection.IN, # card_id + ArgDirection.IN, # num_cards + ArgDirection.IN, # CommContext* + ], + func_name="aicpu_orchestration_entry", + binary=orch_bytes, + children=[(0, dispatch_cc), (1, compute_cc), (2, combine_cc)], # All three phases + ) + + +def compute_golden_end2end(num_cards: int, host_send: list[torch.Tensor]) -> list[torch.Tensor]: + """ + Compute golden output for end-to-end pipeline: + 1. Dispatch: send[card_j][expert_i][:COUNT][:] -> recv[card_i][card_j][:COUNT][:] + 2. Compute: recv[card_i][card_j][:COUNT][:] += 1.0 + 3. 
Combine: recv[expert_j][card_i][:COUNT][:] -> output[card_i][expert_j][:COUNT][:] + + Send initialization: unique values using (card * 1000000 + expert * 10000 + token * 100 + dim) + """ + golden_outputs = [] + for cardi in range(num_cards): + output = torch.zeros(num_cards, COUNT, HIDDEN_DIM, dtype=torch.float32) + for expertj in range(num_cards): + for t in range(COUNT): + for d in range(HIDDEN_DIM): + # After dispatch: recv[cardi][expertj][:][:] = send[expertj][cardi][:][:] + # Value from cardi's send[expertj][cardi][t][d] + send_value = host_send[cardi][expertj, t, d].item() + # After compute: recv += 1.0 + recv_value = send_value + 1.0 + # After combine: output[cardi][expertj][t][d] = recv[expertj][cardi][t][d] + output[expertj, t, d] = recv_value + golden_outputs.append(output) + + return golden_outputs + + +def run(platform: str, device_ids: list[int]) -> int: + print(f"[End2End] Testing complete MoE pipeline on devices {device_ids}", flush=True) + num_cards = len(device_ids) + num_experts = num_cards + + # Configure HCCL + scratch_count = num_cards * num_cards * NUM_TOKENS * HIDDEN_DIM + scratch_nbytes = scratch_count * 4 + total_scratch_nbytes = scratch_nbytes + num_cards * 4 + window_size = max(total_scratch_nbytes, 4 * 1024) + + print(f"\n[End2End] Test Configuration:") + print(f" Platform: {platform}") + print(f" Number of cards: {num_cards}") + print(f" Device IDs: {device_ids}") + print(f" NUM_TOKENS: {NUM_TOKENS}") + print(f" HIDDEN_DIM: {HIDDEN_DIM}") + print(f" COUNT (tokens processed): {COUNT}") + + rootinfo_path = f"/tmp/pto_end2end_{os.getpid()}.bin" + try: + os.unlink(rootinfo_path) + except FileNotFoundError: + pass + + torch.manual_seed(42) + + # Allocate tensors with unique values to trace data flow + # Value encoding: (card_id * 1000000) + (expert_id * 10000) + (token * 100) + dim + host_send = [] + for i, device_id in enumerate(device_ids): + send = torch.zeros(num_experts, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() + for expert_j in range(num_experts): + for t in range(NUM_TOKENS): + for d in range(HIDDEN_DIM): + # Unique value: card_i -> expert_j -> token_t -> dim_d + value = float(i * 1000000 + expert_j * 10000 + t * 100 + d) + send[expert_j, t, d] = value + host_send.append(send) + host_recv = [torch.zeros(num_cards, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() + for _ in device_ids] + host_output = [torch.zeros(num_cards, COUNT, HIDDEN_DIM, dtype=torch.float32).share_memory_() + for _ in device_ids] + + # Allocate scratch_print tensor (debug output) + host_scratch_print = [torch.zeros(scratch_count, dtype=torch.float32).share_memory_() + for _ in device_ids] + + print(f"\n[End2End] Allocated tensors:") + print(f" send=unique_values, recv=0.0, output=0.0") + print(f" Value encoding: (card_id * 1000000) + (expert_id * 10000) + (token * 100) + dim", flush=True) + + # Compute golden output + print("\n[End2End] Computing golden output...") + golden_outputs = compute_golden_end2end(num_cards, host_send) + print("[End2End] Golden output computed", flush=True) + + # Configure HCCL bootstrap with two independent scratch buffers + cfgs = [ + ChipBootstrapConfig( + comm=ChipCommBootstrapConfig( + rank=rank, + nranks=num_cards, + rootinfo_path=rootinfo_path, + window_size=window_size, + ), + buffers=[ + ChipBufferSpec( + name="scratch", + dtype="float32", + count=scratch_count, + nbytes=total_scratch_nbytes, + ), + ChipBufferSpec( + name="scratch_test", + dtype="float32", + count=scratch_count, + nbytes=total_scratch_nbytes, + ), + ], + 
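+            # Two independent windows by design: dispatch/compute stage through "scratch",
+            # while combine stages through "scratch_test", so combine never reads stale
+            # dispatch data (its stage-in only rewrites COUNT of the NUM_TOKENS token rows).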
) + for rank in range(num_cards) + ] + + # Create worker + worker = Worker( + level=3, + platform=platform, + runtime="tensormap_and_ringbuffer", + device_ids=device_ids, + num_sub_workers=0, + chip_bootstrap_configs=cfgs, + ) + + print(f"\n[End2End] Compiling kernels for {platform}...", flush=True) + end2end_cc = build_end2end_callable(platform) + print("[End2End] All kernels compiled successfully", flush=True) + + print("[End2End] Initializing worker...", flush=True) + worker.init() + contexts = worker.chip_contexts + print(f"[End2End] Worker initialized with {len(contexts)} contexts", flush=True) + + try: + def orch_fn(orch, _args, cfg): + print(f"[End2End] Submitting tasks for {num_cards} cards", flush=True) + for i in range(num_cards): + args = TaskArgs() + args.add_tensor(make_tensor_arg(host_send[i]), TensorArgType.INPUT) + args.add_tensor(make_tensor_arg(host_recv[i]), TensorArgType.OUTPUT_EXISTING) + args.add_tensor(make_tensor_arg(host_output[i]), TensorArgType.OUTPUT_EXISTING) + + from simpler.task_interface import ContinuousTensor + args.add_tensor( + ContinuousTensor.make( + data=contexts[i].buffer_ptrs["scratch"], + shapes=(scratch_count,), + dtype=DataType.FLOAT32, + child_memory=True, + ), + TensorArgType.INOUT, + ) + args.add_tensor( + ContinuousTensor.make( + data=contexts[i].buffer_ptrs["scratch_test"], + shapes=(scratch_count,), + dtype=DataType.FLOAT32, + child_memory=True, + ), + TensorArgType.INOUT, + ) + args.add_tensor(make_tensor_arg(host_scratch_print[i]), TensorArgType.OUTPUT_EXISTING) + + args.add_scalar(i) # expert_id + args.add_scalar(i) # card_id + args.add_scalar(num_cards) + args.add_scalar(contexts[i].device_ctx) + + result = orch.submit_next_level(end2end_cc, args, cfg, worker=i) + print(f"[End2End] Submitted task for card {i}", flush=True) + + print("\n[End2End] Running end-to-end test...", flush=True) + + worker.run(orch_fn, args=None, config=CallConfig()) + print("\n[End2End] End-to-end pipeline completed!", flush=True) + + # Print results + print("\n" + "="*80) + print("[End2End] OUTPUT DATA:") + print("="*80) + + for i in range(num_cards): + print(f"\n[End2End] Card {i} output data:") + print(f" Expected: Each value = send_value + 1.0") + print(f" Sample data (first 2 experts, first {COUNT} tokens, first 3 dims):") + + for expert_j in range(min(2, num_cards)): + print(f" Expert {expert_j}:") + for t in range(min(COUNT, 2)): + vals = host_output[i][expert_j, t, :3].tolist() + golden_vals = golden_outputs[i][expert_j, t, :3].tolist() + print(f" Token {t}: Output={vals}, Golden={golden_vals}") + + # Verify correctness + print("\n" + "="*80) + print("[End2End] VERIFICATION:") + print("="*80) + + all_correct = True + error_count = 0 + total_checked = 0 + + for i in range(num_cards): + print(f"\n[End2End] Card {i}:") + card_errors = 0 + + for expert_j in range(num_cards): + for t in range(COUNT): + for d in range(HIDDEN_DIM): + expected = golden_outputs[i][expert_j, t, d].item() + actual = host_output[i][expert_j, t, d].item() + total_checked += 1 + + if abs(actual - expected) > 1e-3: + card_errors += 1 + error_count += 1 + all_correct = False + + if card_errors == 0: + print(f" ✓ All {num_cards * COUNT * HIDDEN_DIM} values correct") + else: + print(f" ✗ {card_errors} / {num_cards * COUNT * HIDDEN_DIM} values incorrect") + + print(f"\n Total: {total_checked - error_count}/{total_checked} correct") + + # Final verdict + print("\n" + "="*80) + print("[End2End] FINAL VERDICT:") + print("="*80) + + if all_correct: + print("\n[End2End] ✅ All values correct! 
End-to-end pipeline works perfectly.") + return 0 + else: + print("\n[End2End] ❌ Some values incorrect!") + return 1 + + except Exception as e: + print(f"[End2End] ERROR: {e}") + import traceback + traceback.print_exc() + return 1 + + finally: + print("[End2End] Shutting down worker...") + worker.close() + try: + os.unlink(rootinfo_path) + except FileNotFoundError: + pass + + +def main() -> int: + args = parse_args() + device_ids = parse_device_range(args.device) + return run(args.platform, device_ids) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/workers/l3/moe_multi_chip_experts/test_moe_multi_chip.py b/examples/workers/l3/moe_multi_chip_experts/test_moe_multi_chip.py new file mode 100644 index 000000000..9d40cd77e --- /dev/null +++ b/examples/workers/l3/moe_multi_chip_experts/test_moe_multi_chip.py @@ -0,0 +1,39 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Hardware ST for examples/workers/l3/moe_multi_chip_experts.""" + +import pytest + +from .main import run + + +@pytest.mark.platforms(["a2a3sim", "a2a3", "a5sim", "a5"]) +@pytest.mark.runtime("tensormap_and_ringbuffer") +@pytest.mark.device_count(2) +def test_moe_multi_chip_2_experts(st_platform, st_device_ids): + """Test multi-chip MoE with 2 experts (1 per chip). + + This should produce the SAME results as moe_single_chip with 2 experts, + just executed in parallel across 2 chips instead of sequentially on 1 chip. + """ + rc = run(st_platform, [int(d) for d in st_device_ids]) + assert rc == 0 + + +@pytest.mark.platforms(["a2a3sim", "a2a3"]) +@pytest.mark.runtime("tensormap_and_ringbuffer") +@pytest.mark.device_count(4) +def test_moe_multi_chip_4_experts(st_platform, st_device_ids): + """Test multi-chip MoE with 4 experts (1 per chip). + + This should produce the SAME results as moe_single_chip with 4 experts, + just executed in parallel across 4 chips instead of sequentially on 1 chip. 
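+
+    Outside pytest, the same configuration can be exercised directly, e.g.:
+        python examples/workers/l3/moe_multi_chip_experts/main.py -p a2a3sim -d 0-3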
+ """ + rc = run(st_platform, [int(d) for d in st_device_ids]) + assert rc == 0 From d47f536890e0b34760ac62c2022bf6c578e3e37e Mon Sep 17 00:00:00 2001 From: puddingfjz <2811443837@qq.com> Date: Fri, 8 May 2026 17:26:56 +0800 Subject: [PATCH 2/2] Refactor: streamline MoE multi-chip example - Keep the example focused on the end-to-end dispatch, compute, and combine path - Remove obsolete debug docs, partial tests, and unused kernel variants - Align README, test naming, and scratch buffer handling with the current two-chip hardware test --- .../l3/moe_multi_chip_experts/.gitignore | 12 - .../l3/moe_multi_chip_experts/DEBUG_GUIDE.md | 188 ------- .../IMPLEMENTATION_NOTES.md | 113 ---- .../l3/moe_multi_chip_experts/README.md | 279 ++++------ .../l3/moe_multi_chip_experts/TESTING.md | 164 ------ .../l3/moe_multi_chip_experts/golden.py | 42 -- ...alltoall2.cpp => moe_combine_alltoall.cpp} | 59 +-- .../aiv/moe_combine_alltoall2 copy.cpp | 244 --------- .../kernels/aiv/moe_combine_alltoall_ori.cpp | 268 ---------- .../kernels/aiv/moe_demo_incore_0.cpp | 108 ---- .../kernels/aiv/moe_demo_incore_1.cpp | 137 ----- .../kernels/aiv/moe_demo_incore_2.cpp | 156 ------ .../kernels/aiv/moe_dispatch_alltoall.cpp | 58 +-- .../kernels/aiv/moe_simple_compute.cpp | 19 +- .../kernels/kernel_config.py | 24 - .../orchestration/moe_combine_only_orch.cpp | 69 --- .../kernels/orchestration/moe_comm_orch.cpp | 123 ----- .../moe_dispatch_compute_orch.cpp | 88 ---- .../orchestration/moe_dispatch_only_orch.cpp | 69 --- .../orchestration/moe_end2end_orch.cpp | 55 +- .../orchestration/moe_multi_chip_orch.cpp | 88 ---- .../workers/l3/moe_multi_chip_experts/main.py | 491 +++++++++--------- .../test_combine_only.py | 411 --------------- .../test_dispatch_compute.py | 290 ----------- .../test_dispatch_only.py | 308 ----------- .../l3/moe_multi_chip_experts/test_end2end.py | 398 -------------- ...chip.py => test_moe_multi_chip_experts.py} | 15 +- 27 files changed, 448 insertions(+), 3828 deletions(-) delete mode 100644 examples/workers/l3/moe_multi_chip_experts/.gitignore delete mode 100644 examples/workers/l3/moe_multi_chip_experts/DEBUG_GUIDE.md delete mode 100644 examples/workers/l3/moe_multi_chip_experts/IMPLEMENTATION_NOTES.md delete mode 100644 examples/workers/l3/moe_multi_chip_experts/TESTING.md delete mode 100644 examples/workers/l3/moe_multi_chip_experts/golden.py rename examples/workers/l3/moe_multi_chip_experts/kernels/aiv/{moe_combine_alltoall2.cpp => moe_combine_alltoall.cpp} (82%) delete mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall2 copy.cpp delete mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall_ori.cpp delete mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_0.cpp delete mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_1.cpp delete mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_2.cpp delete mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/kernel_config.py delete mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_combine_only_orch.cpp delete mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_comm_orch.cpp delete mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_dispatch_compute_orch.cpp delete mode 100644 examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_dispatch_only_orch.cpp delete mode 100644 
examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_multi_chip_orch.cpp delete mode 100755 examples/workers/l3/moe_multi_chip_experts/test_combine_only.py delete mode 100644 examples/workers/l3/moe_multi_chip_experts/test_dispatch_compute.py delete mode 100644 examples/workers/l3/moe_multi_chip_experts/test_dispatch_only.py delete mode 100755 examples/workers/l3/moe_multi_chip_experts/test_end2end.py rename examples/workers/l3/moe_multi_chip_experts/{test_moe_multi_chip.py => test_moe_multi_chip_experts.py} (70%) diff --git a/examples/workers/l3/moe_multi_chip_experts/.gitignore b/examples/workers/l3/moe_multi_chip_experts/.gitignore deleted file mode 100644 index c2bbc644a..000000000 --- a/examples/workers/l3/moe_multi_chip_experts/.gitignore +++ /dev/null @@ -1,12 +0,0 @@ -# Log files -*.log - -# Build outputs -build_output/ - -# Device logs -device_log/ - -# Analysis files -*_analysis.md -all_reduce.log diff --git a/examples/workers/l3/moe_multi_chip_experts/DEBUG_GUIDE.md b/examples/workers/l3/moe_multi_chip_experts/DEBUG_GUIDE.md deleted file mode 100644 index b28ff4c1d..000000000 --- a/examples/workers/l3/moe_multi_chip_experts/DEBUG_GUIDE.md +++ /dev/null @@ -1,188 +0,0 @@ -# 调试信息说明 - -## 案例 1: End-to-End MoE Pipeline Scratch 缓冲区冲突问题 - -### 问题描述 -在实现完整的 MoE pipeline(Dispatch + Compute + Combine)时,发现 Card 1 的 Expert 0 输出错误: -- **期望值**: 2.0 (1.0 input + 1.0 compute) -- **实际值**: 1.0 (只有 input,没有 compute) - -### 调试过程 - -#### 步骤 1: 创建 Isolated Combine Test -**假设**: Combine 阶段本身有问题 - -**实现**: 在 test_end2end.py 中添加独立的 combine 测试 -- 创建 `host_recv_test`: 填充正确的 2.0 值 -- 创建 `host_output_test`: 用于存储 isolated test 的输出 -- 创建 `host_scratch_print_test`: 独立的 debug 输出 -- 创建 `scratch_test` buffer: 独立的 HCCL scratch 缓冲区 -- 在 orchestrator 中添加 Part 2: Isolated Combine Test - -**结果**: -- ✅ Isolated Test: 所有 256 个值正确 (2.0) -- ❌ Full Pipeline: Card 1 的 Expert 0 仍然错误 (1.0) - -**结论**: Combine 阶段本身是正确的,问题不在 combine kernel - -#### 步骤 2: 分析数据流 -重新分析数据流,确认问题所在: - -**Dispatch 阶段**: -- Input: `send[card_i][expert_i][:][:]` = 1.0 -- Output: `recv[card_i][card_j][:][:]` = `send[card_j][expert_i][:][:]` -- 对于 Card i: 从所有 Card j 接收 `send[j][i][:][:]` - -**Compute 阶段**: -- Input: `recv[:][:4][:]` -- Output: `recv[:][:4][:] += 1.0` -- 所有 recv 的前 4 个 token 都加 1.0 - -**Combine 阶段**: -- Phase 1 (stage-in): 复制 `recv[:][:][:]` 到 `scratch[my_rank][card_j][:][:]` -- Phase 3 (direct-store): 从 `scratch[expert_i][my_rank][:][:]` 读取到 `output[expert_i][:][:]` - -#### 步骤 3: 发现 Scratch 缓冲区冲突 -**关键观察**: -- Full Pipeline 使用同一个 `scratch` buffer -- Isolated Test 使用独立的 `scratch_test` buffer → 成功! - -**问题定位**: -当 Full Pipeline 复用同一个 scratch buffer 时: -1. Dispatch Phase 向 `scratch` 写入数据(布局: `scratch[card_j][expert_i][:][:]`) -2. Combine Phase 1 **应该**向 `scratch` 写入 `recv` 数据(布局: `scratch[my_rank][card_j][:][:]`) -3. Combine Phase 3 从 `scratch` 读取数据 - -**问题**: -- Combine Phase 1 只写入前 COUNT (4) 个 token -- Combine Phase 3 的 stride 使用 NUM_TOKENS (10) 计算 offset -- **Combine Phase 1 没有完全覆盖 Dispatch Phase 写入的数据** -- Combine Phase 3 读到了 Dispatch Phase 的残留数据 - -#### 步骤 4: 解决方案 -**方案**: 为 Combine Phase 使用独立的 scratch 缓冲区 - -**实现**: -1. 在 `ChipBootstrapConfig` 中添加第二个 scratch buffer: - ```python - ChipBufferSpec( - name="scratch_test", - dtype="float32", - count=scratch_count, - nbytes=total_scratch_nbytes, - ) - ``` - -2. 在 orchestrator 中: - - Dispatch + Compute: 使用 `ext_scratch` - - Combine: 使用 `ext_scratch_test` - -3. 在 Python 中: - - 添加 `contexts[i].buffer_ptrs["scratch_test"]` - -**结果**: ✅ Full Pipeline 完全正确 - -### 关键经验 - -1. 
**隔离测试的重要性**: - - 通过创建 isolated combine test,快速定位问题不在 combine kernel 本身 - - 这种方法可以推广到其他多阶段 pipeline 的调试 - -2. **缓冲区复用的陷阱**: - - 当多个阶段使用同一个 scratch buffer 时: - - **确保每个阶段完全覆盖**它写入的区域 - - **注意写入范围和读取范围的不匹配** - - Phase 1 写入前 COUNT 个 token,但 Phase 3 的 stride 基于 NUM_TOKENS - -3. **调试技巧**: - - 使用唯一值初始化输入(而不是全 1.0) - - 值编码: `(card_id * 1000000) + (expert_id * 10000) + (token * 100) + dim` - - 这样可以清楚追踪每个数据点的流向 - -4. **独立的 HCCL 缓冲区**: - - 如果不确定 buffer 是否被正确覆盖,使用独立 buffer - - 内存成本: 2x scratch buffer (对于小 buffer 可以接受) - - 避免了复杂的状态清理逻辑 - -### 相关文件 -- `test_end2end.py`: 完整的 end-to-end 测试 -- `moe_end2end_orch.cpp`: 使用独立 scratch_test 的 orchestrator -- `moe_combine_alltoall2.cpp`: Combine kernel - -### 运行测试 -```bash -source /data/miniconda3/etc/profile.d/conda.sh && \ -conda activate simpler_issue && \ -task-submit --device 10,11 --run \ - "export PTOAS_ROOT=/usr/local/bin/ptoas-bin && \ - ASCEND_PROCESS_LOG_PATH=device_log \ - ASCEND_GLOBAL_LOG_LEVEL=0 \ - python examples/workers/l3/moe_multi_chip_experts/test_end2end.py -p a2a3 -d 10,11" -``` - ---- - -## 添加的调试点 - -### Python 侧 (main.py) -1. **run() 函数入口**: 跟踪程序启动 -2. **HCCL 配置**: 显示 scratch buffer 大小和 rootinfo 路径 -3. **Tensor 分配**: 确认内存分配成功 -4. **Worker 创建**: 跟踪 Worker 对象创建 -5. **内核编译阶段**: - - 编译 dispatch kernel - - 编译 compute kernel - - 编译 combine kernel - - 提取 ELF text sections (硬件) - - 编译 orchestration -6. **Worker 初始化**: 跟踪 init() 进度 -7. **chip_contexts**: 显示每个 card 的 rank 和 device_ctx -8. **orch_fn**: 跟踪任务提交进度 -9. **worker.run()**: 跟踪执行进度 - -### C++ Orchestration 侧 (moe_comm_orch.cpp) -1. **orchestration_entry 入口**: 显示 card_id, expert_id, num_cards, comm_ctx -2. **阶段 1 (Dispatch)**: 任务提交前后的状态 -3. **阶段 2 (Compute)**: 任务提交前后的状态 -4. **阶段 3 (Combine)**: 任务提交前后的状态 -5. **完成**: 确认所有阶段完成 - -所有输出都使用 `flush=True` 或 `fflush(stdout)` 确保立即写入日志。 - -## 运行测试 - -```bash -# 重新运行测试,观察调试输出 -source /data/miniconda3/etc/profile.d/conda.sh && \ -conda activate simpler_issue && \ -task-submit --device 4,5,6,7 --run "export PTOAS_ROOT=/usr/local/bin/ptoas-bin && python examples/workers/l3/moe_multi_chip_experts/main.py -p a2a3 -d 4,5,6,7 > moe_multi_chip_test_4chip_debug.log 2>&1" -``` - -## 可能的问题定位 - -### 情况 1: 卡在内核编译 -**症状**: 看到 "[moe_multi_chip] [DEBUG] Starting kernel compilation..." 但没有后续输出 -**原因**: 可能是 PTOAS_ROOT 路径不正确或编译器问题 -**解决**: 检查 PTOAS_ROOT 环境变量和 ptoas-bin 目录 - -### 情况 2: 卡在 Worker.init() -**症状**: 看到 "Worker created" 但没有 "Worker initialized" -**原因**: 可能是 HCCL 初始化或设备通信问题 -**解决**: 检查设备之间的 HCCL 通信配置 - -### 情况 3: 卡在 worker.run() -**症状**: 看到 "About to call worker.run()" 但没有看到 orchestration 输出 -**原因**: 可能是任务提交或调度问题 -**解决**: 检查 runtime 配置和任务队列 - -### 情况 4: 卡在某个阶段 -**症状**: 看到 "Stage X: ..." 但没有 "Stage X+1" -**原因**: 可能是该阶段的 AIV 内核或 HCCL 通信问题 -**解决**: 检查对应阶段的内核代码和通信逻辑 - -## 下一步 - -1. 运行带调试信息的测试 -2. 观察最后一条成功的调试消息 -3. 根据卡住的位置定位问题 -4. 如果需要,在更具体的位置添加更详细的调试信息 diff --git a/examples/workers/l3/moe_multi_chip_experts/IMPLEMENTATION_NOTES.md b/examples/workers/l3/moe_multi_chip_experts/IMPLEMENTATION_NOTES.md deleted file mode 100644 index 45b1c1604..000000000 --- a/examples/workers/l3/moe_multi_chip_experts/IMPLEMENTATION_NOTES.md +++ /dev/null @@ -1,113 +0,0 @@ -# Multi-Chip MoE Implementation Notes - -## Overview - -This implementation transforms the single-chip MoE example (`moe_single_chip`) into a multi-chip parallel version (`moe_multi_chip_experts`) where **each chip processes one expert** instead of all experts running sequentially on one chip. - -## Key Changes - -### 1. 
Architecture - -**Single-Chip Version:** -- One chip runs ALL 4 experts sequentially -- Orchestration loops: `card_i=0..3`, `expert_j=0..3`, `t_idx=0..3` -- Total: 4 cards × 4 experts × 4 tokens = 64 dispatch operations - -**Multi-Chip Version:** -- Each chip runs ONE expert in parallel -- Orchestration: `card_i=i` (passed as arg), `expert_j=i` (passed as arg), `t_idx=0..3` -- Per chip: 1 expert × 4 tokens = 4 dispatch operations -- With 2 chips: 2 × (1 × 4) = 8 total dispatch operations (parallel) - -### 2. Modified Files - -#### `kernels/kernel_config.py` (NEW) -- Configuration file defining runtime and kernel sources -- Mirrors structure from single-chip version - -#### `kernels/orchestration/moe_multi_chip_orch.cpp` (MODIFIED) -- Reads expert ID and chip ID from scalar arguments (passed by Python) -- Only processes the assigned expert (not all experts) -- Maintains same computation pattern as single-chip version -- Key difference: No `card_i` loop, no `expert_j` loop - these are passed as args - -#### `main.py` (MODIFIED) -- Passes two scalar arguments to orchestration: - 1. Expert ID (`i`): Chip i processes expert i - 2. Chip ID (`i`): Logical card_i for data layout computation -- Updated ChipCallable signature to accept 3 tensors + 2 scalars - -### 3. Result Equivalence - -Both versions produce **IDENTICAL results** because: -- Same kernels (`moe_demo_incore_0/1/2.cpp`) -- Same computation logic (dispatch → compute → combine) -- Only difference: execution distribution (serial vs parallel) - -## Usage - -### Run Multi-Chip Version (2 chips, 2 experts) -```bash -python examples/workers/l3/moe_multi_chip_experts/main.py -p a2a3sim -d 0-1 -``` - -### Run Single-Chip Version (for comparison) -```bash -python examples/workers/l3/moe_single_chip/main.py -p a2a3sim -d 0 -``` - -### Run via pytest -```bash -pytest examples/workers/l3/moe_multi_chip_experts/test_moe_multi_chip.py -v -s -``` - -## Technical Details - -### Parameter Passing -The multi-chip version uses scalar arguments to pass expert ID and chip ID to orchestration: -```python -moe_args.add_scalar(i) # Expert ID -moe_args.add_scalar(i) # Chip ID (logical card_i) -``` - -Orchestration reads these: -```cpp -int64_t expert_j = static_cast(orch_args.scalar(0)); -int64_t card_i = static_cast(orch_args.scalar(1)); -``` - -### Data Layout -- Each chip has its own input/output buffers -- Shape: `[4, 64, 64]` (4 tokens, 64 hidden dim) -- Same layout as single-chip version for result equivalence - -### ChipCallable Signature -- Single-chip: `[IN, OUT, OUT]` (3 tensors) -- Multi-chip: `[IN, OUT, OUT, IN, IN]` (3 tensors + 2 scalars) - -## Verification - -To verify result equivalence: -1. Run single-chip version, save output -2. Run multi-chip version, save output -3. Compare outputs (should be identical) - -Note: Multi-chip version produces per-chip outputs. To compare with single-chip: -- Single-chip output is the combined result of all 4 experts -- Multi-chip per-chip output is the result of one expert -- Combine multi-chip outputs appropriately for comparison - -## Future Improvements - -1. **Dynamic Configuration**: Currently hardcoded for 4 tokens. Could make configurable. -2. **Result Combination**: Add logic to combine per-chip outputs for direct comparison. -3. **Scalability**: Test with more chips (4, 8, etc.) -4. 
**Performance**: Measure speedup vs single-chip version - -## Related Files - -- Single-chip version: `examples/workers/l3/moe_single_chip/` -- Multi-chip version: `examples/workers/l3/moe_multi_chip_experts/` -- Other multi-chip examples: - - `examples/workers/l3/multi_chip_dispatch/` - - `examples/workers/l3/ffn_tp_parallel/` diff --git a/examples/workers/l3/moe_multi_chip_experts/README.md b/examples/workers/l3/moe_multi_chip_experts/README.md index 9c755687a..bfd9c2749 100644 --- a/examples/workers/l3/moe_multi_chip_experts/README.md +++ b/examples/workers/l3/moe_multi_chip_experts/README.md @@ -1,213 +1,128 @@ -# Multi-Chip MoE Example - -This example demonstrates a distributed MoE (Mixture of Experts) pattern across **multiple chips**, with **one expert per chip**. - -## Overview - -This is the **multi-chip version** of `moe_single_chip`. The computation is **identical** - same kernels, same logic - but distributed across multiple chips for parallel execution. - -## Key Difference: Single vs Multi-Chip +# `moe_multi_chip_experts/` — one expert per chip + +Runs a small distributed Mixture-of-Experts pipeline across multiple chips. +Each rank owns one expert, exchanges token slices through HCCL window buffers, +applies a simple per-expert compute kernel, and gathers the processed expert +results back to the source ranks. + +This example is intentionally tiny: `NUM_TOKENS = 10`, `HIDDEN_DIM = 16`, and +only the first `COUNT = 4` tokens are processed. The small shape makes the +data movement easy to inspect while still exercising cross-chip dispatch, +compute, and combine. + +## What This Demonstrates + +| Concept | Where it shows up | +| ------- | ----------------- | +| L3 multi-chip worker | `Worker(level=3, device_ids=[...])` in `main.py` | +| HCCL bootstrap buffers | `ChipBootstrapConfig` with `scratch1` and `scratch2` | +| Cross-rank dispatch | `kernels/aiv/moe_dispatch_alltoall.cpp` | +| Per-rank expert compute | `kernels/aiv/moe_simple_compute.cpp` | +| Cross-rank combine | `kernels/aiv/moe_combine_alltoall.cpp` | +| Device orchestration | `kernels/orchestration/moe_end2end_orch.cpp` | +| Pytest integration | `test_moe_multi_chip_experts.py` calls `main.run(...)` | + +## Layout + +```text +moe_multi_chip_experts/ + main.py # CLI demo and reusable run() entry + test_moe_multi_chip_experts.py # pytest wrapper, matching other L3 examples + kernels/ + aiv/ + moe_dispatch_alltoall.cpp # publish each rank's expert input + moe_simple_compute.cpp # add 1.0 to dispatched token slices + moe_combine_alltoall.cpp # gather processed expert outputs + orchestration/ + moe_end2end_orch.cpp # submit dispatch -> compute -> combine + README.md +``` -| Aspect | moe_single_chip | moe_multi_chip_experts | -|--------|----------------|------------------------| -| **Execution** | Sequential on one chip | **Parallel across chips** | -| **Expert placement** | All experts on one chip | **One expert per chip** | -| **Computation** | Same | **Same (identical kernels)** | -| **Performance** | Limited by single chip | **Scales with chip count** | -| **Result** | Deterministic | **Deterministic (same result)** | +## Pipeline -## Pattern +For `N` chips, each chip owns one expert and starts with: -``` -Single-Chip Version (moe_single_chip): - Input → [Chip 0: Expert 0,1,2,3] → Output - -Multi-Chip Version (moe_multi_chip_experts): - Input → [Chip 0: Expert 0] ─┐ - [Chip 1: Expert 1] ─┼→ Output - [Chip 2: Expert 2] ─┤ (same result!) 
- [Chip 3: Expert 3] ─┘ +```text +send[expert_id][token][hidden] +recv[source_rank][token][hidden] +output[expert_id][token][hidden] ``` -## Computation Flow (Identical to Single-Chip) +The orchestration submits three AIV kernels: -### 1. Dispatch Stage -- Copy data from send to recv buffer based on expert assignment -- Same kernel (`moe_demo_incore_0`) as single-chip version - -### 2. Compute Stage -- Apply expert transformation on recv buffer -- Same kernel (`moe_demo_incore_1`) as single-chip version -- **Key difference**: Each chip runs only its assigned expert (parallel) - -### 3. Combine Stage -- Accumulate results from recv to output -- Same kernel (`moe_demo_incore_2`) as single-chip version +```text +┌──────────┐ ┌─────────┐ ┌─────────┐ +│ Dispatch │ ───▶ │ Compute │ ───▶ │ Combine │ +└──────────┘ └─────────┘ └─────────┘ +``` -## Kernels +1. Dispatch writes each rank's expert slice into the owner rank's `recv`. +2. Compute adds `1.0` to the first `COUNT` tokens in `recv`. +3. Combine copies each expert's processed slice into the source rank's + `output[expert_id]` row. -Uses the **exact same kernels** as `moe_single_chip`: +`scratch1` is the HCCL window used by dispatch. `scratch2` is the HCCL window +used by combine. Compute only updates `recv`; it does not use either scratch +window. -1. **moe_demo_incore_0.cpp** (dispatch): Copy send → recv based on expert assignment -2. **moe_demo_incore_1.cpp** (compute): Apply expert transformation -3. **moe_demo_incore_2.cpp** (combine): Accumulate results to output +The two communication phases use independent windows mainly because each +kernel places its barrier signal slots at the tail of its scratch buffer and +does not reset those slots before use. Dispatch leaves its signal slots +incremented after its cross-rank barrier. If combine reused the same window, +its `TWAIT` could observe the old dispatch signals and pass before combine has +staged its own data. A separate `scratch2` gives combine independent data +storage and independent signal slots. -The kernels are NOT modified - we just distribute the work differently. +## Data Pattern -## Configuration +Inputs are initialized with unique values: -```python -# Device count determines expert count -NUM_CARDS = len(device_ids) # e.g., 2, 4, etc. -NUM_EXPERTS = NUM_CARDS # One expert per chip -NUM_TOKENS = 64 -HIDDEN_DIM = 64 -EXPERT_HIDDEN_DIM = 32 +```text +value = card_id * 1_000_000 + expert_id * 10_000 + token * 100 + dim ``` -## Running +After compute, every checked output value should be the corresponding input +value plus `1.0`. `main.py` computes the golden reference in Python and checks +every `output[expert_id][token][hidden]` element for the processed token +range. -```bash -# 2 chips (2 experts) - simulation -python examples/workers/l3/moe_multi_chip_experts/main.py -p a2a3sim -d 0-1 +## Run -# 4 chips (4 experts) - simulation -python examples/workers/l3/moe_multi_chip_experts/main.py -p a2a3sim -d 0-3 +Hardware: -# 2 chips (2 experts) - hardware +```bash python examples/workers/l3/moe_multi_chip_experts/main.py -p a2a3 -d 0-1 - -# Run via pytest -pytest examples/workers/l3/moe_multi_chip_experts/test_moe_multi_chip.py -v -s -``` - -## How It Works - -### Python Level (main.py) - -```python -# Allocate tensors per chip -host_input = [torch.randn(...) for _ in device_ids] -host_recv = [torch.randn(...) for _ in device_ids] -host_output = [torch.zeros(...) 
for _ in device_ids] - -# Submit task to each chip -for i in range(len(device_ids)): - orch.submit_next_level(moe_cc, moe_args, cfg, worker=i) - # Each chip runs the SAME orchestration - # But computes different experts based on chip ID ``` -### Orchestration Level (moe_multi_chip_orch.cpp) - -The orchestration code is identical to `moe_single_chip`: -- Loops over `card_i` (chip index) and `expert_j` (expert index) -- In multi-chip: each chip only processes its assigned expert -- In single-chip: one chip processes all experts - -### Kernel Level - -**NO CHANGES** - kernels are identical: -- Same memory access patterns -- Same computation logic -- Same results - -## Result Equivalence - -**The outputs ARE identical** (given same random seed): +Simulation: -```python -# Single-chip version -python moe_single_chip/main.py -p a2a3sim -d 0 -# Output: [tensor with values X] - -# Multi-chip version (2 chips) -python moe_multi_chip_experts/main.py -p a2a3sim -d 0-1 -# Output: [tensor with values X] <- SAME! +```bash +python examples/workers/l3/moe_multi_chip_experts/main.py -p a2a3sim -d 0-1 ``` -The distribution is **transparent** to the computation - we're just -executing the same work in parallel instead of sequentially. - -## When to Use Which Version? - -### Use `moe_single_chip` when: -- ✅ You only have 1 chip available -- ✅ You're developing/debugging kernels -- ✅ Model fits comfortably on single chip -- ✅ Simpler debugging (everything on one device) - -### Use `moe_multi_chip_experts` when: -- ✅ You have multiple chips available -- ✅ You want faster execution (parallel compute) -- ✅ Model is too large for single chip -- ✅ You're scaling to more experts than fit on one chip +The pytest wrapper follows the same style as the other L3 examples: -## Memory Layout - -Per-chip tensors (same as single-chip): - -```python -# Each chip has: -input: [4, 64, 64] # Input tokens -recv: [4, 64, 64] # Intermediate buffer -output: [4, 64] # Final output +```bash +python -m pytest examples/workers/l3/moe_multi_chip_experts --platform a2a3 --device 0-1 ``` -The shape is identical - only the distribution changes. - -## Performance Characteristics - -### Single-Chip Version -- **Compute**: O(num_experts × num_tokens) sequential -- **Memory**: All expert data on one chip -- **Latency**: Sum of all expert compute times +For the CLI, device ids can be written as a range (`-d 0-1`) or a +comma-separated list (`-d 0,1`). For pytest, pass the same device spec to +`--device`. The examples use ranges because that matches the other L3 docs. -### Multi-Chip Version -- **Compute**: O(num_tokens) parallel per chip -- **Memory**: Expert data distributed across chips -- **Latency**: Max of individual expert compute times +Expected successful output for the two-chip commands above includes: -**Speedup**: Near-linear with chip count (ignoring communication overhead) - -## Implementation Details - -### No Kernel Changes -The kernels (`moe_demo_incore_*.cpp`) are **verbatim copies** from the single-chip version. This ensures: - -1. **Correctness**: Same computation = same results -2. **Simplicity**: No need to rewrite kernel logic -3. **Maintainability**: Single source of truth for kernels - -### Distribution via Orchestration -The multi-chip behavior comes from: -1. Python: Submit tasks to multiple chips (`worker=i`) -2. Orchestration: Each chip runs the same DAG -3. Kernel: Identical computation, different data subsets - -### Key Insight +```text +[End2End] End-to-end pipeline completed! 
+ Total: 256/256 correct +[End2End] All values correct! End-to-end pipeline works perfectly. ``` -Single-chip: Chip 0 runs {Expert 0, Expert 1, Expert 2, Expert 3} -Multi-chip: Chip 0 runs {Expert 0}, Chip 1 runs {Expert 1}, ... - -Same total work, different distribution. -``` - -## Comparison with True Distributed MoE - -This example keeps the computation **identical** for educational purposes. -Real distributed MoE systems would also optimize: - -- **Communication**: Reduce all-to-all data movement -- **Load Balancing**: Dynamic token-to-expert assignment -- **Gradient Synchronization**: Distributed training considerations - -Those optimizations are omitted here to maintain **result equivalence** -with the single-chip version. -## Next Steps +## Notes -1. **Compare outputs**: Run both versions and verify results match -2. **Measure speedup**: Time both versions on your hardware -3. **Scale up**: Try 4, 8, or more chips -4. **Real distribution**: Implement data sharding across chips +- `test_moe_multi_chip_experts.py` is a thin pytest wrapper around + `main.run(...)`. +- The pytest case runs on `a2a3` hardware and requires two available device + ids. +- Each rank allocates independent `scratch1` and `scratch2` HCCL windows + during worker bootstrap. diff --git a/examples/workers/l3/moe_multi_chip_experts/TESTING.md b/examples/workers/l3/moe_multi_chip_experts/TESTING.md deleted file mode 100644 index fc4189d4c..000000000 --- a/examples/workers/l3/moe_multi_chip_experts/TESTING.md +++ /dev/null @@ -1,164 +0,0 @@ -# MoE Multi-Chip Testing Guide - -This guide provides detailed commands for testing the distributed MoE implementation on Ascend hardware. - -## Prerequisites - -```bash -# Activate conda environment -conda activate simpler_issue - -# Ensure environment variables are set -export PTOAS_ROOT=/usr/local/bin/ptoas-bin -export ASCEND_PROCESS_LOG_PATH=/data/fangjingzhi/simpler_distributed/device_log -export ASCEND_GLOBAL_LOG_LEVEL=0 -``` - -## Test Files - -| Test File | Purpose | Phase | Notes | -|-----------|---------|-------|-------| -| `test_dispatch_only.py` | Test dispatch phase only | Dispatch | Uses unique values for data tracing | -| `test_combine_only.py` | Test combine phase only | Combine | Uses unique values for data tracing | -| `test_dispatch_compute.py` | Test dispatch + compute | Dispatch + Compute | Verifies expert routing and compute | -| `test_end2end.py` | Test complete end-to-end pipeline | All phases | Uses independent scratch buffers to avoid conflicts | - -## Test Commands - - - -### Hardware Mode (a2a3) - -Run on actual Ascend NPUs. - -#### Quick Tests (2 chips) - -```bash -# Dispatch phase test -python examples/workers/l3/moe_multi_chip_experts/test_dispatch_only.py \ - -p a2a3 \ - -d 10,11 - -# Combine phase test -python examples/workers/l3/moe_multi_chip_experts/test_combine_only.py \ - -p a2a3 \ - -d 10,11 - -# End-to-end pipeline test (recommended) -python examples/workers/l3/moe_multi_chip_experts/test_end2end.py \ - -p a2a3 \ - -d 10,11 -``` - -#### Extended Tests (4 chips) - -```bash -# 4-chip full pipeline -python examples/workers/l3/moe_multi_chip_experts/test_moe_multi_chip.py \ - -p a2a3 \ - -d 10,11,12,13 -``` - -## Background Job Submission - -For long-running tests, use `task-submit` to run in background. 
- -```bash -# Submit combine-only test -task-submit --device 10,11 --run \ - "export PTOAS_ROOT=/usr/local/bin/ptoas-bin && \ - ASCEND_PROCESS_LOG_PATH=/data/fangjingzhi/simpler_distributed/device_log && \ - ASCEND_GLOBAL_LOG_LEVEL=0 && \ - python examples/workers/l3/moe_multi_chip_experts/test_combine_only.py \ - -p a2a3 -d 10,11 > moe_combine_only_$(date +%Y%m%d_%H%M%S).log 2>&1" - -# Submit full pipeline test -task-submit --device 10,11 --run \ - "export PTOAS_ROOT=/usr/local/bin/ptoas-bin && \ - ASCEND_PROCESS_LOG_PATH=/data/fangjingzhi/simpler_distributed/device_log && \ - ASCEND_GLOBAL_LOG_LEVEL=0 && \ - python examples/workers/l3/moe_multi_chip_experts/test_moe_multi_chip.py \ - -p a2a3 -d 10,11 > moe_full_$(date +%Y%m%d_%H%M%S).log 2>&1" -``` - - - -## Test Verification - -### Expected Output - -Each test will print: -1. **Configuration**: Platform, device count, tensor shapes -2. **Input data**: Sample values for verification -3. **Scratch buffer**: Debug output from Phase 1 (stage-in) -4. **Output data**: Final results after combine -5. **Verification**: Match with golden output - -### test_end2end.py 特殊说明 - -**关键特性**: -- 使用唯一值初始化输入: `(card * 1000000) + (expert * 10000) + (token * 100) + dim` -- 使用**独立的 scratch 缓冲区**避免阶段间冲突: - - `scratch`: 用于 Dispatch + Compute 阶段 - - `scratch_test`: 用于 Combine 阶段 -- 清晰的数据流追踪 - -**为什么需要独立的 scratch?** -- Dispatch 向 `scratch` 写入: `scratch[card_j][expert_i][:][:]` -- Combine 从 `scratch` 读取: `scratch[expert_i][my_rank][:][:]` -- Combine 的写入范围 (前 COUNT 个 token) 不能完全覆盖 Dispatch 的数据 -- 使用独立 buffer 避免读到残留数据 - -### Success Criteria - -``` -✓ All values correct -✓ Output matches golden reference -✓ No device errors or timeouts -``` - -## Debugging Failed Tests - -### Check Device Logs - -```bash -# List latest device logs -ls -lt /data/fangjingzhi/simpler_distributed/device_log/debug/device-*/ | head -20 - -# Check specific device log for errors -grep -i "error\|fail\|stuck" \ - /data/fangjingzhi/simpler_distributed/device_log/debug/device-10/*.log -``` - -### Common Issues - -| Issue | Symptom | Solution | -|-------|---------|----------| -| Parameter mismatch | `kernel_id=-1`, STUCK-READY | Check tensor/scalar count matches kernel signature | -| Device fault | `Device fault, ret=0x7110011` | Check for illegal memory access or uninitialized tiles | -| Timeout | Task hangs, no progress | Check HCCL bootstrap and signal barrier logic | -| Wrong results | Output doesn't match golden | Verify data flow through dispatch→combine phases | - -### Enable Verbose Logging - -```bash -# Maximum verbosity for debugging -ASCEND_GLOBAL_LOG_LEVEL=0 \ -ASCEND_PROCESS_LOG_PATH=/data/fangjingzhi/simpler_distributed/device_log \ -python examples/workers/l3/moe_multi_chip_experts/test_combine_only.py \ - -p a2a3 -d 10,11 -``` - - -## Test Isolation - -Each test creates unique temporary files: - -```bash -# Rootinfo files for HCCL -/tmp/pto_*_PID*.bin - -# Device logs -/data/fangjingzhi/simpler_distributed/device_log/debug/device-*/ -``` - diff --git a/examples/workers/l3/moe_multi_chip_experts/golden.py b/examples/workers/l3/moe_multi_chip_experts/golden.py deleted file mode 100644 index e4dc36ae0..000000000 --- a/examples/workers/l3/moe_multi_chip_experts/golden.py +++ /dev/null @@ -1,42 +0,0 @@ -import torch - - - -def demo(send, recv, output): - """ - send shape: (num_cards, num_experts, total_tokens, hidden_size) - counts shape: (num_cards, num_experts,) - cumcounts shape: (num_cards, num_experts+1,) - recv shape: (num_experts, num_cards, total_tokens, hidden_size) - 
output shape: (num_cards, total_tokens, hidden_size) - - Note: This function now adapts to the actual input shape, supporting - any number of cards (2, 3, 4, etc.), not just 4 cards. - """ - # Infer dimensions from input tensors - num_cards = send.shape[0] # Actual number of cards from input - num_experts = send.shape[1] # Number of experts (typically equals num_cards) - total_tokens = send.shape[2] - hidden_size = send.shape[3] - count = 4 # tokens to process per (card, expert) pair - - # dispatch - for cardi in range(num_cards): - for experti in range(num_experts): - # count = counts[cardi, experti] - recv[experti, cardi, :count, :] = send[cardi, experti, :count, :] - print(f"send: {send}") - print(f"recv: {recv}") - # compute - for cardi in range(num_cards): - for experti in range(num_experts): - recv[experti, cardi] = recv[experti, cardi] + 1.0 # 匹配实际kernel行为:总是加1.0f - print(f"recv: {recv}") - # combine - for experti in range(num_experts): - for cardi in range(num_cards): - # count = counts[cardi, experti] - output[cardi, :count, :] += recv[experti, cardi, :count, :] - print(f"output: {output}") - return output - diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall2.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall.cpp similarity index 82% rename from examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall2.cpp rename to examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall.cpp index da6188c1c..99b816f69 100644 --- a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall2.cpp +++ b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall.cpp @@ -4,7 +4,7 @@ * CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. 
* ----------------------------------------------------------------------------------------------------------- */ @@ -48,7 +48,7 @@ #define __aicore__ [aicore] #endif -// Configuration matching golden.py +// Configuration matching the in-test golden references static constexpr size_t NUM_TOKENS = 10; static constexpr size_t HIDDEN_DIM = 16; static constexpr size_t COUNT = 4; // tokens to process per (card, expert) pair @@ -76,8 +76,10 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in // Get base pointers __gm__ float *recv = reinterpret_cast<__gm__ float *>(recv_tensor->buffer.addr) + recv_tensor->start_offset; __gm__ float *output = reinterpret_cast<__gm__ float *>(output_tensor->buffer.addr) + output_tensor->start_offset; - __gm__ float *scratch = reinterpret_cast<__gm__ float *>(scratch_tensor->buffer.addr) + scratch_tensor->start_offset; - __gm__ float *scratch_print = reinterpret_cast<__gm__ float *>(scratch_print_tensor->buffer.addr) + scratch_print_tensor->start_offset; + __gm__ float *scratch = + reinterpret_cast<__gm__ float *>(scratch_tensor->buffer.addr) + scratch_tensor->start_offset; + __gm__ float *scratch_print = + reinterpret_cast<__gm__ float *>(scratch_print_tensor->buffer.addr) + scratch_print_tensor->start_offset; // Signal area at tail of scratch: num_cards int32 slots // Must be placed AFTER all data slots to avoid corruption @@ -108,30 +110,27 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in // recv layout: [num_cards][NUM_TOKENS][HIDDEN_DIM] // Base points to current (card_j, t), stride should keep access within current token ShapeDyn src_shape(1, 1, 1, 1, HIDDEN_DIM); - StrideDyn src_stride(NUM_TOKENS * HIDDEN_DIM, NUM_TOKENS * HIDDEN_DIM, - NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); - Global srcG(recv + card_j * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM, - src_shape, src_stride); + StrideDyn src_stride( + NUM_TOKENS * HIDDEN_DIM, NUM_TOKENS * HIDDEN_DIM, NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1 + ); + Global srcG(recv + card_j * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM, src_shape, src_stride); // Destination: scratch[my_rank][card_j][t][:HIDDEN_DIM] // Offset = my_rank * (num_cards * NUM_TOKENS * HIDDEN_DIM) // + card_j * (NUM_TOKENS * HIDDEN_DIM) // + t * HIDDEN_DIM - size_t dst_offset = my_rank * num_cards * NUM_TOKENS * HIDDEN_DIM - + card_j * NUM_TOKENS * HIDDEN_DIM - + t * HIDDEN_DIM; + size_t dst_offset = + my_rank * num_cards * NUM_TOKENS * HIDDEN_DIM + card_j * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM; ShapeDyn dst_shape(1, 1, 1, 1, HIDDEN_DIM); - StrideDyn dst_stride(num_cards * NUM_TOKENS * HIDDEN_DIM, - num_cards * NUM_TOKENS * HIDDEN_DIM, - NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); - Global dstG(scratch + dst_offset, - dst_shape, dst_stride); - Global dstG_print(scratch_print + dst_offset, - dst_shape, dst_stride); - - using TileType = pto::Tile; + StrideDyn dst_stride( + num_cards * NUM_TOKENS * HIDDEN_DIM, num_cards * NUM_TOKENS * HIDDEN_DIM, NUM_TOKENS * HIDDEN_DIM, + HIDDEN_DIM, 1 + ); + Global dstG(scratch + dst_offset, dst_shape, dst_stride); + Global dstG_print(scratch_print + dst_offset, dst_shape, dst_stride); + + using TileType = pto::Tile; TileType tile(1, HIDDEN_DIM); TASSIGN(tile, 0); @@ -179,16 +178,15 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in // Offset = expert_i * (num_cards * NUM_TOKENS * HIDDEN_DIM) // + my_rank * (NUM_TOKENS * HIDDEN_DIM) // + t * HIDDEN_DIM - __gm__ float *src_base = (expert_i == my_rank) ? 
scratch : - CommRemotePtr(commCtx, scratch, expert_i); - size_t src_offset = expert_i * num_cards * NUM_TOKENS * HIDDEN_DIM - + my_rank * NUM_TOKENS * HIDDEN_DIM - + t * HIDDEN_DIM; + __gm__ float *src_base = (expert_i == my_rank) ? scratch : CommRemotePtr(commCtx, scratch, expert_i); + size_t src_offset = + expert_i * num_cards * NUM_TOKENS * HIDDEN_DIM + my_rank * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM; ShapeDyn src_shape(1, 1, 1, 1, HIDDEN_DIM); - StrideDyn src_stride(num_cards * NUM_TOKENS * HIDDEN_DIM, - num_cards * NUM_TOKENS * HIDDEN_DIM, - NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); + StrideDyn src_stride( + num_cards * NUM_TOKENS * HIDDEN_DIM, num_cards * NUM_TOKENS * HIDDEN_DIM, NUM_TOKENS * HIDDEN_DIM, + HIDDEN_DIM, 1 + ); Global srcG(src_base + src_offset, src_shape, src_stride); // Destination: output[expert_i][t][:HIDDEN_DIM] @@ -199,8 +197,7 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in StrideDyn dst_stride(COUNT * HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, 1); Global dstG(output + dst_offset, dst_shape, dst_stride); - using TileType = pto::Tile; + using TileType = pto::Tile; TileType tile(1, HIDDEN_DIM); TASSIGN(tile, 0); diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall2 copy.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall2 copy.cpp deleted file mode 100644 index f7f1d464f..000000000 --- a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall2 copy.cpp +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - * ----------------------------------------------------------------------------------------------------------- - */ -/** - * MoE Combine All-to-All Kernel (Direct Store Version) - * - * This kernel implements the combine phase of distributed MoE: - * Each card i sends recv[i][card_j] (expert_i's result for card_j) to card j, - * then directly stores all received results to output without accumulation. 
- * - * Data flow: - * Phase 1 (stage-in): recv[:][:][:COUNT][:] → scratch[my_rank][:][:][:] - * Phase 2 (barrier): signal matrix + TWAIT cross-rank sync - * Phase 3 (store): for expert_i in num_cards: copy scratch[expert_i][my_rank][:][:] to output[expert_i][:][:] - * - * args layout: - * tensor(0) = recv_local [num_cards][num_tokens][hidden_dim] - * tensor(1) = output_local [num_cards][count][hidden_dim] - stores all experts' data - * tensor(2) = scratch HCCL window buffer - * tensor(3) = scratch_print Debug output buffer (Phase 1 stage-in mirror) - * scalar(0) = card_id which card this is - * scalar(1) = num_cards total number of cards - * scalar(2) = CommContext device pointer for cross-card communication - */ - -#include -#include -#include "pto/comm/comm_types.hpp" -#include "pto/comm/pto_comm_inst.hpp" -#include "platform_comm/comm_context.h" -#include "tensor.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -// Configuration matching golden.py -static constexpr size_t NUM_TOKENS = 10; -static constexpr size_t HIDDEN_DIM = 16; -static constexpr size_t COUNT = 4; // tokens to process per (card, expert) pair -static constexpr int kMaxSupportedCards = 16; - -template -AICORE inline __gm__ T *CommRemotePtr(__gm__ CommContext *ctx, __gm__ T *localPtr, int pe) { - uint64_t localBase = ctx->windowsIn[ctx->rankId]; - uint64_t offset = (uint64_t)localPtr - localBase; - return (__gm__ T *)(ctx->windowsIn[pe] + offset); -} - -extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { - // Unpack tensors - __gm__ Tensor *recv_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); - __gm__ Tensor *output_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); - __gm__ Tensor *scratch_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); - __gm__ Tensor *scratch_print_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]); - - // Unpack scalars - int64_t card_id = static_cast(args[4]); - int num_cards = static_cast(args[5]); - __gm__ CommContext *commCtx = reinterpret_cast<__gm__ CommContext *>(args[6]); - - // Get base pointers - __gm__ float *recv = reinterpret_cast<__gm__ float *>(recv_tensor->buffer.addr) + recv_tensor->start_offset; - __gm__ float *output = reinterpret_cast<__gm__ float *>(output_tensor->buffer.addr) + output_tensor->start_offset; - __gm__ float *scratch = reinterpret_cast<__gm__ float *>(scratch_tensor->buffer.addr) + scratch_tensor->start_offset; - __gm__ float *scratch_print = reinterpret_cast<__gm__ float *>(scratch_print_tensor->buffer.addr) + scratch_print_tensor->start_offset; - - // Signal area at tail of scratch: num_cards int32 slots - // Must be placed AFTER all data slots to avoid corruption - size_t total_data_size = num_cards * num_cards * NUM_TOKENS * HIDDEN_DIM; - __gm__ int32_t *signal_base = reinterpret_cast<__gm__ int32_t *>(scratch + total_data_size); - - using ShapeDyn = pto::Shape; - using StrideDyn = pto::Stride; - using Global = pto::GlobalTensor; - - int my_rank = static_cast(commCtx->rankId); - - if (num_cards <= 0 || num_cards > kMaxSupportedCards) { - pipe_barrier(PIPE_ALL); - return; - } - - // ------------------------------------------------------------------ - // Phase 1: stage-in — copy recv to scratch - // This card's expert result for all cards (as destination) - // - // - // For card_i with expert_id, copy recv[card_j][:][:] to scratch[expert_id][card_j][:][:] - // ------------------------------------------------------------------ - for (int card_j = 0; 
card_j < num_cards; ++card_j) { - for (size_t t = 0; t < COUNT; ++t) { - // Source: recv[card_j][t][:HIDDEN_DIM] (expert_id's processed data from card_j) - // recv layout: [num_cards][NUM_TOKENS][HIDDEN_DIM] - // Base points to current (card_j, t), stride should keep access within current token - ShapeDyn src_shape(1, 1, 1, 1, HIDDEN_DIM); - StrideDyn src_stride(NUM_TOKENS * HIDDEN_DIM, NUM_TOKENS * HIDDEN_DIM, - NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); - Global srcG(recv + card_j * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM, - src_shape, src_stride); - - // Destination: scratch[my_rank][card_j][t][:HIDDEN_DIM] - // Offset = my_rank * (num_cards * NUM_TOKENS * HIDDEN_DIM) - // + card_j * (NUM_TOKENS * HIDDEN_DIM) - // + t * HIDDEN_DIM - size_t dst_offset = my_rank * num_cards * NUM_TOKENS * HIDDEN_DIM - + card_j * NUM_TOKENS * HIDDEN_DIM - + t * HIDDEN_DIM; - - ShapeDyn dst_shape(1, 1, 1, 1, HIDDEN_DIM); - StrideDyn dst_stride(num_cards * NUM_TOKENS * HIDDEN_DIM, - num_cards * NUM_TOKENS * HIDDEN_DIM, - NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); - Global dstG(scratch + dst_offset, - dst_shape, dst_stride); - Global dstG_print(scratch_print + dst_offset, - dst_shape, dst_stride); - - using TileType = pto::Tile; - TileType tile(1, HIDDEN_DIM); - TASSIGN(tile, 0); - - TLOAD(tile, srcG); - set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); - TSTORE(dstG, tile); - TSTORE(dstG_print, tile); - set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - } - } - pipe_barrier(PIPE_ALL); - - // ------------------------------------------------------------------ - // Phase 2: device barrier — each card notifies peers that its - // recv[:][my_card] data is visible in scratch, then waits for all peers. - // ------------------------------------------------------------------ - for (int peer = 0; peer < num_cards; ++peer) { - if (peer == my_rank) continue; - __gm__ int32_t *remote_signal = CommRemotePtr(commCtx, signal_base + my_rank, peer); - pto::comm::Signal sig(remote_signal); - pto::comm::TNOTIFY(sig, (int32_t)1, pto::comm::NotifyOp::AtomicAdd); - } - for (int peer = 0; peer < num_cards; ++peer) { - if (peer == my_rank) continue; - pto::comm::Signal sig(signal_base + peer); - pto::comm::TWAIT(sig, (int32_t)1, pto::comm::WaitCmp::GE); - } - pipe_barrier(PIPE_ALL); - - // ------------------------------------------------------------------ - // Phase 3: reduce — accumulate all experts' results for this card - // Read scratch[expert_i][card_id][:][:] from each expert i's scratch - // and accumulate to output[t][:HIDDEN_DIM] - // - // For card_id, accumulate: - // from expert 0: scratch[0][card_id][:][:] - // from expert 1: scratch[1][card_id][:][:] - // etc. 
- // ------------------------------------------------------------------ - - // Initialize output to zero - // for (size_t t = 0; t < COUNT; ++t) { - // ShapeDyn out_shape(1, 1, 1, 1, HIDDEN_DIM); - // StrideDyn out_stride(HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, 1); - // Global outG(output + t * HIDDEN_DIM, out_shape, out_stride); - - // using TileType = pto::Tile; - // TileType tile(1, HIDDEN_DIM); - // TASSIGN(tile, 0); - // TSTORE(outG, tile); - // set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - // wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - // } - - // Accumulate from all experts - for (int expert_i = 0; expert_i < num_cards; ++expert_i) { - for (size_t t = 0; t < COUNT; ++t) { - // Source: scratch[expert_i][my_rank][t][:HIDDEN_DIM] - // Offset = expert_i * (num_cards * NUM_TOKENS * HIDDEN_DIM) - // + my_rank * (NUM_TOKENS * HIDDEN_DIM) - // + t * HIDDEN_DIM - __gm__ float *src_base = (expert_i == my_rank) ? scratch : - CommRemotePtr(commCtx, scratch, expert_i); - size_t src_offset = expert_i * num_cards * NUM_TOKENS * HIDDEN_DIM - + my_rank * NUM_TOKENS * HIDDEN_DIM - + t * HIDDEN_DIM; - - ShapeDyn src_shape(1, 1, 1, 1, HIDDEN_DIM); - StrideDyn src_stride(num_cards * NUM_TOKENS * HIDDEN_DIM, - num_cards * NUM_TOKENS * HIDDEN_DIM, - NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); - Global srcG(src_base + src_offset, src_shape, src_stride); - - // Destination: output[t][:HIDDEN_DIM] (accumulate) - ShapeDyn out_shape(1, 1, 1, 1, HIDDEN_DIM); - StrideDyn out_stride(HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, 1); - Global outG(output + t * HIDDEN_DIM, out_shape, out_stride); - - using TileType = pto::Tile; - TileType srcTile(1, HIDDEN_DIM); - TileType accTile(1, HIDDEN_DIM); - constexpr size_t kTileSize = 1 * HIDDEN_DIM * sizeof(float); // 64 bytes - TASSIGN(srcTile, kTileSize); // Use offset 64 - TASSIGN(accTile, kTileSize * 2); // Use offset 128 - - // Load current output value (acc before accumulation) - TLOAD(accTile, outG); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - // Load from remote scratch (src) - TLOAD(srcTile, srcG); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); - - // Accumulate - TADD(accTile, accTile, srcTile); - set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); - - // Store to output - TSTORE(outG, accTile); - set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - } - } - - pipe_barrier(PIPE_ALL); -} diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall_ori.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall_ori.cpp deleted file mode 100644 index 67e61d2a5..000000000 --- a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_combine_alltoall_ori.cpp +++ /dev/null @@ -1,268 +0,0 @@ -/* - * Copyright (c) PyPTO Contributors. - * This program is free software, you can redistribute it and/or modify it under the terms and conditions of - * CANN Open Software License Agreement Version 2.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- * ----------------------------------------------------------------------------------------------------------- - */ -/** - * MoE Combine All-to-All Kernel (Direct Store Version) - * - * This kernel implements the combine phase of distributed MoE: - * Each card i sends recv[i][card_j] (expert_i's result for card_j) to card j, - * then directly stores all received results to output without accumulation. - * - * Data flow: - * Phase 1 (stage-in): recv[:][:][:COUNT][:] → scratch[my_rank][:][:][:] - * Phase 2 (barrier): signal matrix + TWAIT cross-rank sync - * Phase 3 (store): for expert_i in num_cards: copy scratch[expert_i][my_rank][:][:] to output[expert_i][:][:] - * - * args layout: - * tensor(0) = recv_local [num_cards][num_tokens][hidden_dim] - * tensor(1) = output_local [num_cards][count][hidden_dim] - stores all experts' data - * tensor(2) = scratch HCCL window buffer - * scalar(0) = card_id which card this is - * scalar(1) = num_cards total number of cards - * scalar(2) = CommContext device pointer for cross-card communication - */ - -#include -#include -#include "pto/comm/comm_types.hpp" -#include "pto/comm/pto_comm_inst.hpp" -#include "platform_comm/comm_context.h" -#include "tensor.h" - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#define __aicore__ [aicore] -#endif - -// Configuration matching golden.py -static constexpr size_t NUM_TOKENS = 10; -static constexpr size_t HIDDEN_DIM = 16; -static constexpr size_t COUNT = 4; // tokens to process per (card, expert) pair -static constexpr int kMaxSupportedCards = 16; - -template -AICORE inline __gm__ T *CommRemotePtr(__gm__ CommContext *ctx, __gm__ T *localPtr, int pe) { - uint64_t localBase = ctx->windowsIn[ctx->rankId]; - uint64_t offset = (uint64_t)localPtr - localBase; - return (__gm__ T *)(ctx->windowsIn[pe] + offset); -} - -extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { - // Unpack tensors - __gm__ Tensor *recv_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); - __gm__ Tensor *output_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); - __gm__ Tensor *scratch_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); - __gm__ Tensor *scratch_print_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]); - __gm__ Tensor *acc_values_tensor = reinterpret_cast<__gm__ Tensor *>(args[4]); - __gm__ Tensor *src_values_tensor = reinterpret_cast<__gm__ Tensor *>(args[5]); - - // Unpack scalars - int64_t card_id = static_cast(args[6]); - int num_cards = static_cast(args[7]); - __gm__ CommContext *commCtx = reinterpret_cast<__gm__ CommContext *>(args[8]); - - // Get base pointers - __gm__ float *recv = reinterpret_cast<__gm__ float *>(recv_tensor->buffer.addr) + recv_tensor->start_offset; - __gm__ float *output = reinterpret_cast<__gm__ float *>(output_tensor->buffer.addr) + output_tensor->start_offset; - __gm__ float *scratch = reinterpret_cast<__gm__ float *>(scratch_tensor->buffer.addr) + scratch_tensor->start_offset; - __gm__ float *scratch_print = reinterpret_cast<__gm__ float *>(scratch_print_tensor->buffer.addr) + scratch_print_tensor->start_offset; - __gm__ float *acc_values = reinterpret_cast<__gm__ float *>(acc_values_tensor->buffer.addr) + acc_values_tensor->start_offset; - __gm__ float *src_values = reinterpret_cast<__gm__ float *>(src_values_tensor->buffer.addr) + src_values_tensor->start_offset; - - // Signal area at tail of scratch: num_cards int32 slots - // Must be placed AFTER all data slots to avoid corruption - size_t total_data_size = num_cards * num_cards * 
NUM_TOKENS * HIDDEN_DIM; - __gm__ int32_t *signal_base = reinterpret_cast<__gm__ int32_t *>(scratch + total_data_size); - - using ShapeDyn = pto::Shape; - using StrideDyn = pto::Stride; - using Global = pto::GlobalTensor; - - int my_rank = static_cast(commCtx->rankId); - - if (num_cards <= 0 || num_cards > kMaxSupportedCards) { - pipe_barrier(PIPE_ALL); - return; - } - - // ------------------------------------------------------------------ - // Phase 1: stage-in — copy recv to scratch - // This card's expert result for all cards (as destination) - // - // - // For card_i with expert_id, copy recv[card_j][:][:] to scratch[expert_id][card_j][:][:] - // ------------------------------------------------------------------ - for (int card_j = 0; card_j < num_cards; ++card_j) { - for (size_t t = 0; t < COUNT; ++t) { - // Source: recv[card_j][t][:HIDDEN_DIM] (expert_id's processed data from card_j) - // recv layout: [num_cards][NUM_TOKENS][HIDDEN_DIM] - // Base points to current (card_j, t), stride should keep access within current token - ShapeDyn src_shape(1, 1, 1, 1, HIDDEN_DIM); - StrideDyn src_stride(NUM_TOKENS * HIDDEN_DIM, NUM_TOKENS * HIDDEN_DIM, - NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); - Global srcG(recv + card_j * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM, - src_shape, src_stride); - - // Destination: scratch[my_rank][card_j][t][:HIDDEN_DIM] - // Offset = my_rank * (num_cards * NUM_TOKENS * HIDDEN_DIM) - // + card_j * (NUM_TOKENS * HIDDEN_DIM) - // + t * HIDDEN_DIM - size_t dst_offset = my_rank * num_cards * NUM_TOKENS * HIDDEN_DIM - + card_j * NUM_TOKENS * HIDDEN_DIM - + t * HIDDEN_DIM; - - ShapeDyn dst_shape(1, 1, 1, 1, HIDDEN_DIM); - StrideDyn dst_stride(num_cards * NUM_TOKENS * HIDDEN_DIM, - num_cards * NUM_TOKENS * HIDDEN_DIM, - NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); - Global dstG(scratch + dst_offset, - dst_shape, dst_stride); - Global dstG_print(scratch_print + dst_offset, - dst_shape, dst_stride); - - using TileType = pto::Tile; - TileType tile(1, HIDDEN_DIM); - TASSIGN(tile, 0); - - TLOAD(tile, srcG); - set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); - TSTORE(dstG, tile); - TSTORE(dstG_print, tile); - set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - } - } - pipe_barrier(PIPE_ALL); - - // ------------------------------------------------------------------ - // Phase 2: device barrier — each card notifies peers that its - // recv[:][my_card] data is visible in scratch, then waits for all peers. - // ------------------------------------------------------------------ - for (int peer = 0; peer < num_cards; ++peer) { - if (peer == my_rank) continue; - __gm__ int32_t *remote_signal = CommRemotePtr(commCtx, signal_base + my_rank, peer); - pto::comm::Signal sig(remote_signal); - pto::comm::TNOTIFY(sig, (int32_t)1, pto::comm::NotifyOp::AtomicAdd); - } - for (int peer = 0; peer < num_cards; ++peer) { - if (peer == my_rank) continue; - pto::comm::Signal sig(signal_base + peer); - pto::comm::TWAIT(sig, (int32_t)1, pto::comm::WaitCmp::GE); - } - pipe_barrier(PIPE_ALL); - - // ------------------------------------------------------------------ - // Phase 3: reduce — accumulate all experts' results for this card - // Read scratch[expert_i][card_id][:][:] from each expert i's scratch - // and accumulate to output[t][:HIDDEN_DIM] - // - // For card_id, accumulate: - // from expert 0: scratch[0][card_id][:][:] - // from expert 1: scratch[1][card_id][:][:] - // etc. 
- // ------------------------------------------------------------------ - - // Initialize output to zero - // for (size_t t = 0; t < COUNT; ++t) { - // ShapeDyn out_shape(1, 1, 1, 1, HIDDEN_DIM); - // StrideDyn out_stride(HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, 1); - // Global outG(output + t * HIDDEN_DIM, out_shape, out_stride); - - // using TileType = pto::Tile; - // TileType tile(1, HIDDEN_DIM); - // TASSIGN(tile, 0); - // TSTORE(outG, tile); - // set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - // wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - // } - - // Accumulate from all experts - int add_entry = 0; - for (int expert_i = 0; expert_i < num_cards; ++expert_i) { - for (size_t t = 0; t < COUNT; ++t) { - // Source: scratch[expert_i][my_rank][t][:HIDDEN_DIM] - // Offset = expert_i * (num_cards * NUM_TOKENS * HIDDEN_DIM) - // + my_rank * (NUM_TOKENS * HIDDEN_DIM) - // + t * HIDDEN_DIM - __gm__ float *src_base = (expert_i == my_rank) ? scratch : - CommRemotePtr(commCtx, scratch, expert_i); - size_t src_offset = expert_i * num_cards * NUM_TOKENS * HIDDEN_DIM - + my_rank * NUM_TOKENS * HIDDEN_DIM - + t * HIDDEN_DIM; - - ShapeDyn src_shape(1, 1, 1, 1, HIDDEN_DIM); - StrideDyn src_stride(num_cards * NUM_TOKENS * HIDDEN_DIM, - num_cards * NUM_TOKENS * HIDDEN_DIM, - NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); - Global srcG(src_base + src_offset, src_shape, src_stride); - - // Destination: output[t][:HIDDEN_DIM] (accumulate) - ShapeDyn out_shape(1, 1, 1, 1, HIDDEN_DIM); - StrideDyn out_stride(HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, 1); - Global outG(output + t * HIDDEN_DIM, out_shape, out_stride); - - // Destinations for acc and src values (before accumulation) - ShapeDyn acc_save_shape(1, 1, 1, 1, HIDDEN_DIM); - StrideDyn acc_save_stride(HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, 1); - Global acc_saveG(acc_values + add_entry * HIDDEN_DIM, acc_save_shape, acc_save_stride); - - ShapeDyn src_save_shape(1, 1, 1, 1, HIDDEN_DIM); - StrideDyn src_save_stride(HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, 1); - Global src_saveG(src_values + add_entry * HIDDEN_DIM, src_save_shape, src_save_stride); - - using TileType = pto::Tile; - TileType srcTile(1, HIDDEN_DIM); - TileType accTile(1, HIDDEN_DIM); - constexpr size_t kTileSize = 1 * HIDDEN_DIM * sizeof(float); // 64 bytes - TASSIGN(srcTile, kTileSize); // Use offset 64 - TASSIGN(accTile, kTileSize * 2); // Use offset 128 - - // Load current output value (acc before accumulation) - TLOAD(accTile, outG); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - // Load from remote scratch (src) - TLOAD(srcTile, srcG); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); - - // Save acc and src before accumulation - TSTORE(acc_saveG, accTile); - set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); - wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); - - TSTORE(src_saveG, srcTile); - set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3); - wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3); - - // Accumulate - TADD(accTile, accTile, srcTile); - set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); - - // Store to output - TSTORE(outG, accTile); - set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - - add_entry++; - } - } - - pipe_barrier(PIPE_ALL); -} diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_0.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_0.cpp deleted file mode 100644 index 
70ad453f9..000000000 --- a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_0.cpp +++ /dev/null @@ -1,108 +0,0 @@ -// Kernel Function: moe_demo_incore_0 -// Generated by PyPTO IR Compiler (PTO backend) - -#include - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#if defined(__CPU_SIM) -#define __aicore__ -#else -#define __aicore__ [aicore] -#endif -#endif - -#include -#include "tensor.h" - - -using namespace pto; - - -// --- ptoas-generated code --- - -enum class PTOAutoSyncTailMode : int { - kBarrierAll = 0, - kSetWaitMte3ToSEvent0 = 1, -}; - -static __aicore__ inline void ptoas_auto_sync_tail( - PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { - switch (mode) { - case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: - set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); - break; - case PTOAutoSyncTailMode::kBarrierAll: - default: - pipe_barrier(PIPE_ALL); - break; - } -} - -static __aicore__ void moe_demo_incore_0(__gm__ bfloat16_t* v1, __gm__ bfloat16_t* v2, int32_t v3, int32_t v4, int32_t v5) { - unsigned v6 = 0; - const int32_t v7 = 256; - const int32_t v8 = 64; - const int32_t v9 = 1; - const int32_t v10 = 16; - const int64_t v11 = 0; - using T = float; - - #if defined(__DAV_VEC__) - set_mask_norm(); - set_vector_mask(-1, -1); - Tile v12 = Tile(v9, v10); - TASSIGN(v12, v11); - Tile v13 = Tile(v9, v10); - __ubuf__ bfloat16_t* v14 = v12.data(); - uint64_t v15 = reinterpret_cast(v14); - TASSIGN(v13, v15); - pto::Shape<1, 1, 1, 1, 16> v16 = pto::Shape<1, 1, 1, 1, 16>(); - pto::Stride<256, 256, 64, 16, 1> v17 = pto::Stride<256, 256, 64, 16, 1>(); - GlobalTensor, pto::Stride<256, 256, 64, 16, 1>, pto::Layout::ND> v18 = GlobalTensor, pto::Stride<256, 256, 64, 16, 1>, pto::Layout::ND>(v1 + ((v6 + (unsigned) v3 * (unsigned) v7 + (unsigned) v4 * (unsigned) v8) + (unsigned) v5 * (unsigned) v10 + v6 * (unsigned) v9), v16, v17); - TLOAD(v13, v18); - set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); - pto::Shape<1, 1, 1, 1, 16> v19 = pto::Shape<1, 1, 1, 1, 16>(); - pto::Stride<256, 256, 64, 16, 1> v20 = pto::Stride<256, 256, 64, 16, 1>(); - GlobalTensor, pto::Stride<256, 256, 64, 16, 1>, pto::Layout::ND> v21 = GlobalTensor, pto::Stride<256, 256, 64, 16, 1>, pto::Layout::ND>(v2 + ((v6 + (unsigned) v4 * (unsigned) v7 + (unsigned) v3 * (unsigned) v8) + (unsigned) v5 * (unsigned) v10 + v6 * (unsigned) v9), v19, v20); - wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); - TSTORE(v21, v13); - #endif // __DAV_VEC__ - - ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); - return; -} - -// --- Kernel entry point --- -extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t* args) -{ - // Unpack tensor: send__ssa_v0 - __gm__ Tensor* send__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor*>(args[0]); - __gm__ bfloat16_t* send__ssa_v0 = reinterpret_cast<__gm__ bfloat16_t*>(send__ssa_v0_tensor->buffer.addr) + send__ssa_v0_tensor->start_offset; - - // Unpack tensor: recv__iter_v5 - __gm__ Tensor* recv__iter_v5_tensor = reinterpret_cast<__gm__ Tensor*>(args[1]); - __gm__ bfloat16_t* recv__iter_v5 = reinterpret_cast<__gm__ bfloat16_t*>(recv__iter_v5_tensor->buffer.addr) + recv__iter_v5_tensor->start_offset; - - // Unpack scalar: card_i__idx_v0 - union { uint64_t u64; int64_t val; } card_i__idx_v0_conv; - card_i__idx_v0_conv.u64 = args[2]; - int64_t card_i__idx_v0 = card_i__idx_v0_conv.val; - - // Unpack scalar: expert_j__idx_v0 - union { uint64_t u64; int64_t val; } expert_j__idx_v0_conv; - expert_j__idx_v0_conv.u64 = 
args[3]; - int64_t expert_j__idx_v0 = expert_j__idx_v0_conv.val; - - // Unpack scalar: t_idx__idx_v0 - union { uint64_t u64; int64_t val; } t_idx__idx_v0_conv; - t_idx__idx_v0_conv.u64 = args[4]; - int64_t t_idx__idx_v0 = t_idx__idx_v0_conv.val; - - // Forward to ptoas-generated function - moe_demo_incore_0(send__ssa_v0, recv__iter_v5, card_i__idx_v0, expert_j__idx_v0, t_idx__idx_v0); -} diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_1.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_1.cpp deleted file mode 100644 index d4c99d0e8..000000000 --- a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_1.cpp +++ /dev/null @@ -1,137 +0,0 @@ -// Kernel Function: moe_demo_incore_1 -// Generated by PyPTO IR Compiler (PTO backend) - -#include - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#if defined(__CPU_SIM) -#define __aicore__ -#else -#define __aicore__ [aicore] -#endif -#endif - -#include -#include "tensor.h" - - -using namespace pto; - - -// --- ptoas-generated code --- - -enum class PTOAutoSyncTailMode : int { - kBarrierAll = 0, - kSetWaitMte3ToSEvent0 = 1, -}; - -static __aicore__ inline void ptoas_auto_sync_tail( - PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { - switch (mode) { - case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: - set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); - break; - case PTOAutoSyncTailMode::kBarrierAll: - default: - pipe_barrier(PIPE_ALL); - break; - } -} - -static __aicore__ void moe_demo_incore_1(__gm__ bfloat16_t* v1, int32_t v2, int32_t v3, int32_t v4) { - RoundMode v5 = RoundMode::CAST_ROUND; - unsigned v6 = 0; - const int32_t v7 = 256; - const int32_t v8 = 64; - const float v9 = 1.0f; - const int32_t v10 = 1; - const int32_t v11 = 16; - const int64_t v12 = 96; - const int64_t v13 = 32; - const int64_t v14 = 0; - using T = float; - - #if defined(__DAV_VEC__) - set_mask_norm(); - set_vector_mask(-1, -1); - Tile v15 = Tile(v10, v11); - TASSIGN(v15, v14); - Tile v16 = Tile(v10, v11); - __ubuf__ bfloat16_t* v17 = v15.data(); - uint64_t v18 = reinterpret_cast(v17); - TASSIGN(v16, v18); - pto::Shape<1, 1, 1, 1, 16> v19 = pto::Shape<1, 1, 1, 1, 16>(); - pto::Stride<256, 256, 64, 16, 1> v20 = pto::Stride<256, 256, 64, 16, 1>(); - GlobalTensor, pto::Stride<256, 256, 64, 16, 1>, pto::Layout::ND> v21 = GlobalTensor, pto::Stride<256, 256, 64, 16, 1>, pto::Layout::ND>(v1 + ((v6 + (unsigned) v2 * (unsigned) v7 + (unsigned) v3 * (unsigned) v8) + (unsigned) v4 * (unsigned) v11 + v6 * (unsigned) v10), v19, v20); - TLOAD(v16, v21); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - Tile v22 = Tile(v10, v11); - TASSIGN(v22, v13); - Tile v23 = Tile(v10, v11); - __ubuf__ float* v24 = v22.data(); - uint64_t v25 = reinterpret_cast(v24); - TASSIGN(v23, v25); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - TCVT(v23, v16, v5); - Tile v26 = Tile(v10, v11); - TASSIGN(v26, v12); - Tile v27 = Tile(v10, v11); - __ubuf__ float* v28 = v26.data(); - uint64_t v29 = reinterpret_cast(v28); - TASSIGN(v27, v29); - TEXPANDS(v27, v9); - Tile v30 = Tile(v10, v11); - TASSIGN(v30, v13); - Tile v31 = Tile(v10, v11); - __ubuf__ float* v32 = v30.data(); - uint64_t v33 = reinterpret_cast(v32); - TASSIGN(v31, v33); - pipe_barrier(PIPE_V); - TADD(v31, v23, v27); - Tile v34 = Tile(v10, v11); - TASSIGN(v34, v14); - Tile v35 = Tile(v10, v11); - __ubuf__ bfloat16_t* v36 = v34.data(); - uint64_t v37 = reinterpret_cast(v36); - TASSIGN(v35, v37); - pipe_barrier(PIPE_V); - 
TCVT(v35, v31, v5); - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(v21, v35); - #endif // __DAV_VEC__ - - ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); - return; -} - -// --- Kernel entry point --- -extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t* args) -{ - // Unpack tensor: recv__iter_v12 - __gm__ Tensor* recv__iter_v12_tensor = reinterpret_cast<__gm__ Tensor*>(args[0]); - __gm__ bfloat16_t* recv__iter_v12 = reinterpret_cast<__gm__ bfloat16_t*>(recv__iter_v12_tensor->buffer.addr) + recv__iter_v12_tensor->start_offset; - - // Unpack scalar: expert_j__idx_v0 - union { uint64_t u64; int64_t val; } expert_j__idx_v0_conv; - expert_j__idx_v0_conv.u64 = args[1]; - int64_t expert_j__idx_v0 = expert_j__idx_v0_conv.val; - - // Unpack scalar: card_i__idx_v0 - union { uint64_t u64; int64_t val; } card_i__idx_v0_conv; - card_i__idx_v0_conv.u64 = args[2]; - int64_t card_i__idx_v0 = card_i__idx_v0_conv.val; - - // Unpack scalar: t_idx__idx_v0 - union { uint64_t u64; int64_t val; } t_idx__idx_v0_conv; - t_idx__idx_v0_conv.u64 = args[3]; - int64_t t_idx__idx_v0 = t_idx__idx_v0_conv.val; - - // Forward to ptoas-generated function - moe_demo_incore_1(recv__iter_v12, expert_j__idx_v0, card_i__idx_v0, t_idx__idx_v0); -} diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_2.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_2.cpp deleted file mode 100644 index 1074f3499..000000000 --- a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_demo_incore_2.cpp +++ /dev/null @@ -1,156 +0,0 @@ -// Kernel Function: moe_demo_incore_2 -// Generated by PyPTO IR Compiler (PTO backend) - -#include - -#ifndef __gm__ -#define __gm__ -#endif - -#ifndef __aicore__ -#if defined(__CPU_SIM) -#define __aicore__ -#else -#define __aicore__ [aicore] -#endif -#endif - -#include -#include "tensor.h" - - -using namespace pto; - - -// --- ptoas-generated code --- - -enum class PTOAutoSyncTailMode : int { - kBarrierAll = 0, - kSetWaitMte3ToSEvent0 = 1, -}; - -static __aicore__ inline void ptoas_auto_sync_tail( - PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { - switch (mode) { - case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: - set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); - break; - case PTOAutoSyncTailMode::kBarrierAll: - default: - pipe_barrier(PIPE_ALL); - break; - } -} - -static __aicore__ void moe_demo_incore_2(__gm__ bfloat16_t* v1, __gm__ bfloat16_t* v2, int32_t v3, int32_t v4) { - RoundMode v5 = RoundMode::CAST_ROUND; - unsigned v6 = 0; - const int32_t v7 = 256; - const int32_t v8 = 64; - const int32_t v9 = 0; - const float v10 = 0.0f; - const int32_t v11 = 1; - const int32_t v12 = 16; - const int32_t v13 = 4; - const int64_t v14 = 96; - const int64_t v15 = 64; - const int64_t v16 = 0; - using T = float; - - #if defined(__DAV_VEC__) - set_mask_norm(); - set_vector_mask(-1, -1); - Tile v17 = Tile(v11, v12); - TASSIGN(v17, v16); - Tile v18 = Tile(v11, v12); - __ubuf__ float* v19 = v17.data(); - uint64_t v20 = reinterpret_cast(v19); - TASSIGN(v18, v20); - set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); - set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); - TEXPANDS(v18, v10); - for (size_t v21 = (size_t) v9; v21 < ((size_t) v13); v21 += (size_t) v11) { - Tile v22 = Tile(v11, v12); - TASSIGN(v22, v15); - Tile v23 = Tile(v11, v12); - __ubuf__ bfloat16_t* v24 = v22.data(); - uint64_t v25 = reinterpret_cast(v24); - TASSIGN(v23, v25); - 
pto::Shape<1, 1, 1, 1, 16> v26 = pto::Shape<1, 1, 1, 1, 16>(); - pto::Stride<256, 256, 64, 16, 1> v27 = pto::Stride<256, 256, 64, 16, 1>(); - GlobalTensor, pto::Stride<256, 256, 64, 16, 1>, pto::Layout::ND> v28 = GlobalTensor, pto::Stride<256, 256, 64, 16, 1>, pto::Layout::ND>(v1 + ((v6 + (unsigned) ((int32_t) v21) * (unsigned) v7 + (unsigned) v3 * (unsigned) v8) + (unsigned) v4 * (unsigned) v12 + v6 * (unsigned) v11), v26, v27); - wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); - TLOAD(v23, v28); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - Tile v29 = Tile(v11, v12); - TASSIGN(v29, v14); - Tile v30 = Tile(v11, v12); - __ubuf__ float* v31 = v29.data(); - uint64_t v32 = reinterpret_cast(v31); - TASSIGN(v30, v32); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - pipe_barrier(PIPE_V); - TCVT(v30, v23, v5); - set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); - Tile v33 = Tile(v11, v12); - TASSIGN(v33, v16); - Tile v34 = Tile(v11, v12); - __ubuf__ float* v35 = v33.data(); - uint64_t v36 = reinterpret_cast(v35); - TASSIGN(v34, v36); - pipe_barrier(PIPE_V); - TADD(v34, v18, v30); - } - Tile v37 = Tile(v11, v12); - TASSIGN(v37, v15); - Tile v38 = Tile(v11, v12); - __ubuf__ bfloat16_t* v39 = v37.data(); - uint64_t v40 = reinterpret_cast(v39); - TASSIGN(v38, v40); - pipe_barrier(PIPE_V); - TCVT(v38, v18, v5); - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - Tile v41 = Tile(v11, v12); - TASSIGN(v41, v15); - Tile v42 = Tile(v11, v12); - __ubuf__ bfloat16_t* v43 = v41.data(); - uint64_t v44 = reinterpret_cast(v43); - TASSIGN(v42, v44); - pto::Shape<1, 1, 1, 1, 16> v45 = pto::Shape<1, 1, 1, 1, 16>(); - pto::Stride<64, 64, 64, 16, 1> v46 = pto::Stride<64, 64, 64, 16, 1>(); - GlobalTensor, pto::Stride<64, 64, 64, 16, 1>, pto::Layout::ND> v47 = GlobalTensor, pto::Stride<64, 64, 64, 16, 1>, pto::Layout::ND>(v2 + ((v6 + (unsigned) v3 * (unsigned) v8) + (unsigned) v4 * (unsigned) v12 + v6 * (unsigned) v11), v45, v46); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(v47, v42); - wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); - #endif // __DAV_VEC__ - - ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); - return; -} - -// --- Kernel entry point --- -extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t* args) -{ - // Unpack tensor: recv__rv_v9 - __gm__ Tensor* recv__rv_v9_tensor = reinterpret_cast<__gm__ Tensor*>(args[0]); - __gm__ bfloat16_t* recv__rv_v9 = reinterpret_cast<__gm__ bfloat16_t*>(recv__rv_v9_tensor->buffer.addr) + recv__rv_v9_tensor->start_offset; - - // Unpack tensor: output__iter_v3 - __gm__ Tensor* output__iter_v3_tensor = reinterpret_cast<__gm__ Tensor*>(args[1]); - __gm__ bfloat16_t* output__iter_v3 = reinterpret_cast<__gm__ bfloat16_t*>(output__iter_v3_tensor->buffer.addr) + output__iter_v3_tensor->start_offset; - - // Unpack scalar: card_i__idx_v0 - union { uint64_t u64; int64_t val; } card_i__idx_v0_conv; - card_i__idx_v0_conv.u64 = args[2]; - int64_t card_i__idx_v0 = card_i__idx_v0_conv.val; - - // Unpack scalar: t_idx__idx_v0 - union { uint64_t u64; int64_t val; } t_idx__idx_v0_conv; - t_idx__idx_v0_conv.u64 = args[3]; - int64_t t_idx__idx_v0 = t_idx__idx_v0_conv.val; - - // Forward to ptoas-generated function - moe_demo_incore_2(recv__rv_v9, output__iter_v3, card_i__idx_v0, t_idx__idx_v0); -} diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_dispatch_alltoall.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_dispatch_alltoall.cpp index 4bb94d634..1e424aa49 100644 --- 
a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_dispatch_alltoall.cpp +++ b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_dispatch_alltoall.cpp @@ -4,7 +4,7 @@ * CANN Open Software License Agreement Version 2.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. * ----------------------------------------------------------------------------------------------------------- */ @@ -44,7 +44,7 @@ #define __aicore__ [aicore] #endif -// Configuration matching golden.py +// Configuration matching the in-test golden references static constexpr size_t NUM_TOKENS = 10; static constexpr size_t HIDDEN_DIM = 16; static constexpr size_t COUNT = 4; // tokens to process per (card, expert) pair @@ -71,7 +71,8 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in // Get base pointers __gm__ float *send = reinterpret_cast<__gm__ float *>(send_tensor->buffer.addr) + send_tensor->start_offset; __gm__ float *recv = reinterpret_cast<__gm__ float *>(recv_tensor->buffer.addr) + recv_tensor->start_offset; - __gm__ float *scratch = reinterpret_cast<__gm__ float *>(scratch_tensor->buffer.addr) + scratch_tensor->start_offset; + __gm__ float *scratch = + reinterpret_cast<__gm__ float *>(scratch_tensor->buffer.addr) + scratch_tensor->start_offset; // Signal area at tail of scratch: num_cards int32 slots // Must be placed AFTER all data slots to avoid corruption @@ -105,29 +106,25 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in for (size_t t = 0; t < COUNT; ++t) { // Load from send[expert_i][t][:HIDDEN_DIM] (ALL experts, not just expert_id) ShapeDyn send_shape(1, 1, 1, 1, HIDDEN_DIM); - StrideDyn send_stride(NUM_TOKENS * HIDDEN_DIM, NUM_TOKENS * HIDDEN_DIM, - HIDDEN_DIM, HIDDEN_DIM, 1); - Global sendG(send + expert_i * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM, - send_shape, send_stride); + StrideDyn send_stride(NUM_TOKENS * HIDDEN_DIM, NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, 1); + Global sendG(send + expert_i * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM, send_shape, send_stride); // Store to scratch[my_rank][expert_i][t][:HIDDEN_DIM] // Index = my_rank * (num_cards * NUM_TOKENS * HIDDEN_DIM) // + expert_i * (NUM_TOKENS * HIDDEN_DIM) // + t * HIDDEN_DIM - size_t scratch_offset = my_rank * num_cards * NUM_TOKENS * HIDDEN_DIM - + expert_i * NUM_TOKENS * HIDDEN_DIM - + t * HIDDEN_DIM; + size_t scratch_offset = + my_rank * num_cards * NUM_TOKENS * HIDDEN_DIM + expert_i * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM; ShapeDyn scratch_shape(1, 1, 1, 1, HIDDEN_DIM); - StrideDyn scratch_stride(num_cards * NUM_TOKENS * HIDDEN_DIM, - num_cards * NUM_TOKENS * HIDDEN_DIM, - NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); - Global scratchG(scratch + scratch_offset, - scratch_shape, scratch_stride); + StrideDyn scratch_stride( + num_cards * NUM_TOKENS * HIDDEN_DIM, num_cards * NUM_TOKENS * HIDDEN_DIM, NUM_TOKENS * HIDDEN_DIM, + HIDDEN_DIM, 1 + ); + Global scratchG(scratch + scratch_offset, scratch_shape, scratch_stride); // Use tile for data movement - using TileType = pto::Tile; + 
using TileType = pto::Tile; TileType tile(1, HIDDEN_DIM); TASSIGN(tile, 0); @@ -171,28 +168,23 @@ extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ in // Offset = card_j * (num_cards * NUM_TOKENS * HIDDEN_DIM) // + expert_id * (NUM_TOKENS * HIDDEN_DIM) // + t * HIDDEN_DIM - __gm__ float *src_base = (card_j == my_rank) ? scratch : - CommRemotePtr(commCtx, scratch, card_j); - size_t src_offset = card_j * num_cards * NUM_TOKENS * HIDDEN_DIM - + expert_id * NUM_TOKENS * HIDDEN_DIM - + t * HIDDEN_DIM; + __gm__ float *src_base = (card_j == my_rank) ? scratch : CommRemotePtr(commCtx, scratch, card_j); + size_t src_offset = + card_j * num_cards * NUM_TOKENS * HIDDEN_DIM + expert_id * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM; ShapeDyn src_shape(1, 1, 1, 1, HIDDEN_DIM); - StrideDyn src_stride(num_cards * NUM_TOKENS * HIDDEN_DIM, - num_cards * NUM_TOKENS * HIDDEN_DIM, - NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, 1); - Global srcG(src_base + src_offset, - src_shape, src_stride); + StrideDyn src_stride( + num_cards * NUM_TOKENS * HIDDEN_DIM, num_cards * NUM_TOKENS * HIDDEN_DIM, NUM_TOKENS * HIDDEN_DIM, + HIDDEN_DIM, 1 + ); + Global srcG(src_base + src_offset, src_shape, src_stride); // Destination: recv[card_j][t][:HIDDEN_DIM] ShapeDyn dst_shape(1, 1, 1, 1, HIDDEN_DIM); - StrideDyn dst_stride(NUM_TOKENS * HIDDEN_DIM, NUM_TOKENS * HIDDEN_DIM, - HIDDEN_DIM, HIDDEN_DIM, 1); - Global dstG(recv + card_j * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM, - dst_shape, dst_stride); + StrideDyn dst_stride(NUM_TOKENS * HIDDEN_DIM, NUM_TOKENS * HIDDEN_DIM, HIDDEN_DIM, HIDDEN_DIM, 1); + Global dstG(recv + card_j * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM, dst_shape, dst_stride); - using TileType = pto::Tile; + using TileType = pto::Tile; TileType tile(1, HIDDEN_DIM); TASSIGN(tile, 0); diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_simple_compute.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_simple_compute.cpp index 1df151670..c7e04d621 100644 --- a/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_simple_compute.cpp +++ b/examples/workers/l3/moe_multi_chip_experts/kernels/aiv/moe_simple_compute.cpp @@ -1,12 +1,21 @@ /* * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/* * Simple Compute Kernel for MoE * * Adds 1.0 to all elements in recv[:][:4][:] * * args layout: * tensor(0) = recv [num_cards][NUM_TOKENS][HIDDEN_DIM] - * scalar(0) = unused (for compatibility) + * scalar(0) = num_cards * scalar(1) = unused (for compatibility) * scalar(2) = unused (for compatibility) */ @@ -31,10 +40,16 @@ static constexpr int kMaxSupportedCards = 16; extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { __gm__ Tensor *recv_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); __gm__ float *recv = reinterpret_cast<__gm__ float *>(recv_tensor->buffer.addr) + recv_tensor->start_offset; + int num_cards = static_cast(args[1]); + + if (num_cards <= 0 || num_cards > kMaxSupportedCards) { + pipe_barrier(PIPE_ALL); + return; + } // Add 1.0 to first COUNT tokens for all cards // recv layout: [num_cards][NUM_TOKENS][HIDDEN_DIM] - for (int card = 0; card < kMaxSupportedCards; ++card) { + for (int card = 0; card < num_cards; ++card) { for (size_t t = 0; t < COUNT; ++t) { for (size_t d = 0; d < HIDDEN_DIM; ++d) { size_t offset = card * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM + d; diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/kernel_config.py b/examples/workers/l3/moe_multi_chip_experts/kernels/kernel_config.py deleted file mode 100644 index 715728571..000000000 --- a/examples/workers/l3/moe_multi_chip_experts/kernels/kernel_config.py +++ /dev/null @@ -1,24 +0,0 @@ -# Kernel and Orchestration Configuration - -from pathlib import Path - -_ROOT_DIR = Path(__file__).parent.parent - -# Runtime configuration for tensormap_and_ringbuffer -# This runtime requires 4 AICPU threads (3 schedulers + 1 orchestrator on thread 3) -RUNTIME_CONFIG = { - "runtime": "tensormap_and_ringbuffer", - "aicpu_thread_num": 4, - "block_dim": 24, -} - -ORCHESTRATION = { - "source": str(_ROOT_DIR / "kernels" / "orchestration" / "moe_multi_chip_orch.cpp"), - "function_name": "aicpu_orchestration_entry" -} - -KERNELS = [ - {"func_id": 0, "name": "moe_demo_incore_0", "source": str(_ROOT_DIR / "kernels" / "aiv" / "moe_demo_incore_0.cpp"), "core_type": "aiv"}, - {"func_id": 1, "name": "moe_demo_incore_1", "source": str(_ROOT_DIR / "kernels" / "aiv" / "moe_demo_incore_1.cpp"), "core_type": "aiv"}, - {"func_id": 2, "name": "moe_demo_incore_2", "source": str(_ROOT_DIR / "kernels" / "aiv" / "moe_demo_incore_2.cpp"), "core_type": "aiv"}, -] diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_combine_only_orch.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_combine_only_orch.cpp deleted file mode 100644 index 70cd56b11..000000000 --- a/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_combine_only_orch.cpp +++ /dev/null @@ -1,69 +0,0 @@ -// Orchestration Function: Combine Only (for debugging) -// -// This orchestration ONLY runs the combine phase to verify it works correctly. 
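-//
-// Note: in this debug configuration the combine kernel is registered as func_id 0, so the single
-// pto2_rt_submit_aiv_task(0, ...) call below launches moe_combine_alltoall, not the dispatch kernel.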
- -#include "runtime.h" -#include - -#include -#include -#include - -#include "pto_orchestration_api.h" - -// Must match golden.py and kernel configurations -static constexpr int64_t COUNT = 4; // Number of tokens to process per (card, expert) pair -static constexpr int64_t NUM_TOKENS = 10; // Total number of tokens -static constexpr int64_t HIDDEN_DIM = 16; // Hidden dimension - -extern "C" { - -__attribute__((visibility("default"))) -PTO2OrchestrationConfig aicpu_orchestration_config(const ChipStorageTaskArgs& orch_args) { - return PTO2OrchestrationConfig{ - .expected_arg_count = 7, // recv, output, scratch, scratch_print, card_id, num_cards, commCtx - }; -} - -__attribute__((visibility("default"))) -void aicpu_orchestration_entry(const ChipStorageTaskArgs& orch_args) { - // External tensors - Tensor ext_recv = from_tensor_arg(orch_args.tensor(0)); // [num_cards][tokens][hidden] - Tensor ext_output = from_tensor_arg(orch_args.tensor(1)); // [num_cards][count][hidden] - Tensor ext_scratch = from_tensor_arg(orch_args.tensor(2)); // HCCL scratch buffer - Tensor ext_scratch_print = from_tensor_arg(orch_args.tensor(3)); // Scratch print buffer - - // Scalar arguments - int64_t card_id = static_cast(orch_args.scalar(0)); // Which card this is - int64_t num_cards = static_cast(orch_args.scalar(1)); // Total number of cards - uint64_t comm_ctx_ptr = static_cast(orch_args.scalar(2)); // CommContext* - - printf("[Combine-Only Orch] card_id=%ld num_cards=%ld\n", - card_id, num_cards); - fflush(stdout); - - PTO2_SCOPE() { - // === ONLY Combine Phase === - printf("[Combine-Only Orch] Submitting combine task for card_id=%ld\n", - card_id); - fflush(stdout); - - Arg params_combine; - params_combine.add_input(ext_recv); - params_combine.add_output(ext_output); - params_combine.add_inout(ext_scratch); - params_combine.add_output(ext_scratch_print); - params_combine.add_scalar(card_id); - params_combine.add_scalar(num_cards); - params_combine.add_scalar(comm_ctx_ptr); - pto2_rt_submit_aiv_task(0, params_combine); // moe_combine_alltoall - - printf("[Combine-Only Orch] Combine task submitted for card_id=%ld\n", card_id); - fflush(stdout); - } - - printf("[Combine-Only Orch] card_id=%ld completed\n", card_id); - fflush(stdout); -} - -} // extern "C" diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_comm_orch.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_comm_orch.cpp deleted file mode 100644 index 8de7bc71f..000000000 --- a/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_comm_orch.cpp +++ /dev/null @@ -1,123 +0,0 @@ -// Orchestration Function: MoE with Inter-Chip Communication -// -// This orchestration implements the three-stage distributed MoE pattern: -// Stage 1: Dispatch all-to-all - each card sends its expert data to expert owner -// Stage 2: Compute - each expert processes its received data -// Stage 3: Combine all-to-all - results are sent back to source cards -// -// Data flow matches golden.py: -// send[card_j][expert_i][:][:] → recv[expert_i][card_j][:][:] (dispatch) -// recv[expert_i][card_j][:][:] += expert_i (compute) -// recv[expert_i][card_j][:][:] → output[card_j][:][:] (combine) - -#include "runtime.h" -#include - -#include -#include -#include - -#include "pto_orchestration_api.h" - -// Must match golden.py and kernel configurations -static constexpr int64_t COUNT = 4; // Number of tokens to process per (card, expert) pair -static constexpr int64_t NUM_TOKENS = 10; // Total number of tokens -static constexpr 
int64_t HIDDEN_DIM = 16; // Hidden dimension - -extern "C" { - -__attribute__((visibility("default"))) -PTO2OrchestrationConfig aicpu_orchestration_config(const ChipStorageTaskArgs& orch_args) { - return PTO2OrchestrationConfig{ - .expected_arg_count = 4, // send, recv, output, scratch - }; -} - -__attribute__((visibility("default"))) -void aicpu_orchestration_entry(const ChipStorageTaskArgs& orch_args) { - // External tensors - Tensor ext_send = from_tensor_arg(orch_args.tensor(0)); // [num_experts][tokens][hidden] - Tensor ext_recv = from_tensor_arg(orch_args.tensor(1)); // [num_cards][tokens][hidden] - Tensor ext_output = from_tensor_arg(orch_args.tensor(2)); // [tokens][hidden] - Tensor ext_scratch = from_tensor_arg(orch_args.tensor(3)); // HCCL scratch buffer - - // Scalar arguments - int64_t expert_id = static_cast(orch_args.scalar(0)); // Which expert this card processes - int64_t card_id = static_cast(orch_args.scalar(1)); // Which card this is - int64_t num_cards = static_cast(orch_args.scalar(2)); // Total number of cards - uint64_t comm_ctx_ptr = static_cast(orch_args.scalar(3)); // CommContext* - - printf("[MoE Orch] orchestration_entry: card_id=%ld expert_id=%ld num_cards=%ld comm_ctx=0x%lx\n", - card_id, expert_id, num_cards, comm_ctx_ptr); - fflush(stdout); - - PTO2_SCOPE() { - // === 阶段 1: Dispatch All-to-All === - // Each card i sends send[i][expert_i][:][:] to all cards - // and receives send[j][expert_i][:][:] from card j - // Result: recv[i][card_j][:][:] = send[card_j][expert_i][:][:] - { - printf("[MoE Orch] Stage 1: Dispatch - card_id=%ld submitting dispatch task\n", card_id); - fflush(stdout); - Arg params_dispatch; - params_dispatch.add_input(ext_send); - params_dispatch.add_output(ext_recv); - params_dispatch.add_inout(ext_scratch); - params_dispatch.add_scalar(expert_id); - params_dispatch.add_scalar(num_cards); - params_dispatch.add_scalar(comm_ctx_ptr); - pto2_rt_submit_aiv_task(0, params_dispatch); // moe_dispatch_alltoall - printf("[MoE Orch] Stage 1: Dispatch - card_id=%ld dispatch task submitted\n", card_id); - fflush(stdout); - } - - printf("[MoE Orch] ===== After Dispatch (card_id=%ld, expert_id=%ld) =====\n", card_id, expert_id); - fflush(stdout); - - // === 阶段 2: Compute (本地) === - // Add 1.0 to all elements in recv[:][:4][:] - { - printf("[MoE Orch] Stage 2: Compute - card_id=%ld\n", card_id); - fflush(stdout); - - Arg params_compute; - params_compute.add_inout(ext_recv); - params_compute.add_scalar(0); // unused - params_compute.add_scalar(0); // unused - params_compute.add_scalar(0); // unused - pto2_rt_submit_aiv_task(1, params_compute); // moe_simple_compute - - printf("[MoE Orch] Stage 2: Compute - card_id=%ld compute task submitted\n", card_id); - fflush(stdout); - } - - printf("[MoE Orch] ===== After Compute (card_id=%ld, expert_id=%ld) =====\n", card_id, expert_id); - fflush(stdout); - - // === 阶段 3: Combine All-to-All === - // Each card i sends recv[i][card_j][:][:] to card j - // Card j accumulates all received data to output[j][:][:] - { - printf("[MoE Orch] Stage 3: Combine - card_id=%ld submitting combine task\n", card_id); - fflush(stdout); - Arg params_combine; - params_combine.add_input(ext_recv); - params_combine.add_output(ext_output); - params_combine.add_inout(ext_scratch); - params_combine.add_scalar(card_id); - params_combine.add_scalar(num_cards); - params_combine.add_scalar(comm_ctx_ptr); - pto2_rt_submit_aiv_task(2, params_combine); // moe_combine_alltoall - printf("[MoE Orch] Stage 3: Combine - card_id=%ld combine task 
submitted\n", card_id); - fflush(stdout); - } - - printf("[MoE Orch] ===== After Combine (card_id=%ld) =====\n", card_id); - fflush(stdout); - } - - printf("[MoE Orch] orchestration_entry: card_id=%ld completed\n", card_id); - fflush(stdout); -} - -} // extern "C" diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_dispatch_compute_orch.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_dispatch_compute_orch.cpp deleted file mode 100644 index 5d365fae4..000000000 --- a/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_dispatch_compute_orch.cpp +++ /dev/null @@ -1,88 +0,0 @@ -// Orchestration Function: Dispatch + Compute (for debugging) -// -// This orchestration runs dispatch phase followed by compute phase. - -#include "runtime.h" -#include - -#include -#include -#include - -#include "pto_orchestration_api.h" - -// Must match golden.py and kernel configurations -static constexpr int64_t COUNT = 4; // Number of tokens to process per (card, expert) pair -static constexpr int64_t NUM_TOKENS = 10; // Total number of tokens -static constexpr int64_t HIDDEN_DIM = 16; // Hidden dimension - -extern "C" { - -__attribute__((visibility("default"))) -PTO2OrchestrationConfig aicpu_orchestration_config(const ChipStorageTaskArgs& orch_args) { - return PTO2OrchestrationConfig{ - .expected_arg_count = 4, // send, recv, output, scratch (output unused) - }; -} - -__attribute__((visibility("default"))) -void aicpu_orchestration_entry(const ChipStorageTaskArgs& orch_args) { - // External tensors - Tensor ext_send = from_tensor_arg(orch_args.tensor(0)); // [num_experts][tokens][hidden] - Tensor ext_recv = from_tensor_arg(orch_args.tensor(1)); // [num_cards][tokens][hidden] - Tensor ext_output = from_tensor_arg(orch_args.tensor(2)); // [tokens][hidden] (unused) - Tensor ext_scratch = from_tensor_arg(orch_args.tensor(3)); // HCCL scratch buffer - - // Scalar arguments - int64_t expert_id = static_cast(orch_args.scalar(0)); // Which expert this card processes - int64_t card_id = static_cast(orch_args.scalar(1)); // Which card this is - int64_t num_cards = static_cast(orch_args.scalar(2)); // Total number of cards - uint64_t comm_ctx_ptr = static_cast(orch_args.scalar(3)); // CommContext* - - printf("[Dispatch+Compute Orch] card_id=%ld expert_id=%ld num_cards=%ld\n", - card_id, expert_id, num_cards); - fflush(stdout); - - PTO2_SCOPE() { - // === Phase 1: Dispatch === - printf("[Dispatch+Compute Orch] Stage 1: Dispatch - card_id=%ld\n", card_id); - fflush(stdout); - - Arg params_dispatch; - params_dispatch.add_input(ext_send); - params_dispatch.add_output(ext_recv); - params_dispatch.add_inout(ext_scratch); - params_dispatch.add_scalar(expert_id); - params_dispatch.add_scalar(num_cards); - params_dispatch.add_scalar(comm_ctx_ptr); - pto2_rt_submit_aiv_task(0, params_dispatch); // moe_dispatch_alltoall - - printf("[Dispatch+Compute Orch] Dispatch submitted for card_id=%ld\n", card_id); - fflush(stdout); - - // === Phase 2: Compute === - printf("[Dispatch+Compute Orch] Stage 2: Compute - card_id=%ld processing %d cards x %d tokens\n", - card_id, num_cards, COUNT); - fflush(stdout); - - // === Phase 2: Compute === - // Add 1.0 to all elements in recv[:][:4][:] - printf("[Dispatch+Compute Orch] Stage 2: Compute - card_id=%ld\n", card_id); - fflush(stdout); - - Arg params_compute; - params_compute.add_inout(ext_recv); - params_compute.add_scalar(0); // unused - params_compute.add_scalar(0); // unused - params_compute.add_scalar(0); // unused - 
pto2_rt_submit_aiv_task(1, params_compute); // moe_simple_compute - - printf("[Dispatch+Compute Orch] Compute submitted for card_id=%ld\n", card_id); - fflush(stdout); - } - - printf("[Dispatch+Compute Orch] card_id=%ld completed\n", card_id); - fflush(stdout); -} - -} // extern "C" diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_dispatch_only_orch.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_dispatch_only_orch.cpp deleted file mode 100644 index 9751e2d4b..000000000 --- a/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_dispatch_only_orch.cpp +++ /dev/null @@ -1,69 +0,0 @@ -// Orchestration Function: Dispatch Only (for debugging) -// -// This orchestration ONLY runs the dispatch phase to verify it works correctly. - -#include "runtime.h" -#include - -#include -#include -#include - -#include "pto_orchestration_api.h" - -// Must match golden.py and kernel configurations -static constexpr int64_t COUNT = 4; // Number of tokens to process per (card, expert) pair -static constexpr int64_t NUM_TOKENS = 10; // Total number of tokens -static constexpr int64_t HIDDEN_DIM = 16; // Hidden dimension - -extern "C" { - -__attribute__((visibility("default"))) -PTO2OrchestrationConfig aicpu_orchestration_config(const ChipStorageTaskArgs& orch_args) { - return PTO2OrchestrationConfig{ - .expected_arg_count = 4, // send, recv, output, scratch (output unused in dispatch-only) - }; -} - -__attribute__((visibility("default"))) -void aicpu_orchestration_entry(const ChipStorageTaskArgs& orch_args) { - // External tensors - Tensor ext_send = from_tensor_arg(orch_args.tensor(0)); // [num_experts][tokens][hidden] - Tensor ext_recv = from_tensor_arg(orch_args.tensor(1)); // [num_cards][tokens][hidden] - Tensor ext_output = from_tensor_arg(orch_args.tensor(2)); // [tokens][hidden] (unused) - Tensor ext_scratch = from_tensor_arg(orch_args.tensor(3)); // HCCL scratch buffer - - // Scalar arguments - int64_t expert_id = static_cast(orch_args.scalar(0)); // Which expert this card processes - int64_t card_id = static_cast(orch_args.scalar(1)); // Which card this is - int64_t num_cards = static_cast(orch_args.scalar(2)); // Total number of cards - uint64_t comm_ctx_ptr = static_cast(orch_args.scalar(3)); // CommContext* - - printf("[Dispatch-Only Orch] card_id=%ld expert_id=%ld num_cards=%ld\n", - card_id, expert_id, num_cards); - fflush(stdout); - - PTO2_SCOPE() { - // === ONLY Dispatch Phase === - printf("[Dispatch-Only Orch] Submitting dispatch task for card_id=%ld expert_id=%ld\n", - card_id, expert_id); - fflush(stdout); - - Arg params_dispatch; - params_dispatch.add_input(ext_send); - params_dispatch.add_output(ext_recv); - params_dispatch.add_inout(ext_scratch); - params_dispatch.add_scalar(expert_id); - params_dispatch.add_scalar(num_cards); - params_dispatch.add_scalar(comm_ctx_ptr); - pto2_rt_submit_aiv_task(0, params_dispatch); // moe_dispatch_alltoall - - printf("[Dispatch-Only Orch] Dispatch task submitted for card_id=%ld\n", card_id); - fflush(stdout); - } - - printf("[Dispatch-Only Orch] card_id=%ld completed\n", card_id); - fflush(stdout); -} - -} // extern "C" diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_end2end_orch.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_end2end_orch.cpp index c3fc7accc..b01237072 100644 --- a/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_end2end_orch.cpp +++ 
b/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_end2end_orch.cpp @@ -1,3 +1,13 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ // Orchestration Function: End-to-End MoE Pipeline // // This orchestration runs the complete MoE pipeline: @@ -5,7 +15,7 @@ // 2. Compute: process tokens on each expert card // 3. Combine: gather results back to source cards // -// Uses independent scratch buffers for combine phase to avoid data corruption +// Uses independent dispatch and combine scratch buffers to avoid reuse hazards. #include "runtime.h" #include @@ -16,38 +26,37 @@ #include "pto_orchestration_api.h" -// Must match golden.py and kernel configurations -static constexpr int64_t COUNT = 4; // Number of tokens to process per (card, expert) pair +// Must match the in-test golden references and kernel configurations +static constexpr int64_t COUNT = 4; // Number of tokens to process per (card, expert) pair static constexpr int64_t NUM_TOKENS = 10; // Total number of tokens static constexpr int64_t HIDDEN_DIM = 16; // Hidden dimension extern "C" { -__attribute__((visibility("default"))) -PTO2OrchestrationConfig aicpu_orchestration_config(const ChipStorageTaskArgs& orch_args) { +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { return PTO2OrchestrationConfig{ - .expected_arg_count = 10, // send, recv, output, scratch, scratch_test, scratch_print, expert_id, card_id, num_cards, commCtx + .expected_arg_count = + 10, // send, recv, output, scratch1, scratch2, scratch_print, expert_id, card_id, num_cards, commCtx }; } -__attribute__((visibility("default"))) -void aicpu_orchestration_entry(const ChipStorageTaskArgs& orch_args) { +__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) { // External tensors - Tensor ext_send = from_tensor_arg(orch_args.tensor(0)); // [num_experts][tokens][hidden] - Tensor ext_recv = from_tensor_arg(orch_args.tensor(1)); // [num_cards][tokens][hidden] - Tensor ext_output = from_tensor_arg(orch_args.tensor(2)); // [num_cards][count][hidden] - Tensor ext_scratch = from_tensor_arg(orch_args.tensor(3)); // HCCL scratch buffer for dispatch+compute - Tensor ext_scratch_test = from_tensor_arg(orch_args.tensor(4)); // HCCL scratch buffer for combine phase + Tensor ext_send = from_tensor_arg(orch_args.tensor(0)); // [num_experts][tokens][hidden] + Tensor ext_recv = from_tensor_arg(orch_args.tensor(1)); // [num_cards][tokens][hidden] + Tensor ext_output = from_tensor_arg(orch_args.tensor(2)); // [num_cards][count][hidden] + Tensor ext_scratch1 = from_tensor_arg(orch_args.tensor(3)); // HCCL scratch buffer for dispatch + Tensor ext_scratch2 = from_tensor_arg(orch_args.tensor(4)); // HCCL scratch buffer for combine Tensor ext_scratch_print 
= from_tensor_arg(orch_args.tensor(5)); // Scratch print buffer // Scalar arguments - int64_t expert_id = static_cast(orch_args.scalar(0)); // Which expert this card processes - int64_t card_id = static_cast(orch_args.scalar(1)); // Which card this is - int64_t num_cards = static_cast(orch_args.scalar(2)); // Total number of cards + int64_t expert_id = static_cast(orch_args.scalar(0)); // Which expert this card processes + int64_t card_id = static_cast(orch_args.scalar(1)); // Which card this is + int64_t num_cards = static_cast(orch_args.scalar(2)); // Total number of cards uint64_t comm_ctx_ptr = static_cast(orch_args.scalar(3)); // CommContext* - printf("[End2End Orch] card_id=%ld expert_id=%ld num_cards=%ld\n", - card_id, expert_id, num_cards); + printf("[End2End Orch] card_id=%ld expert_id=%ld num_cards=%ld\n", card_id, expert_id, num_cards); fflush(stdout); PTO2_SCOPE() { @@ -62,7 +71,7 @@ void aicpu_orchestration_entry(const ChipStorageTaskArgs& orch_args) { Arg params_dispatch; params_dispatch.add_input(ext_send); params_dispatch.add_output(ext_recv); - params_dispatch.add_inout(ext_scratch); + params_dispatch.add_inout(ext_scratch1); params_dispatch.add_scalar(expert_id); params_dispatch.add_scalar(num_cards); params_dispatch.add_scalar(comm_ctx_ptr); @@ -77,9 +86,9 @@ void aicpu_orchestration_entry(const ChipStorageTaskArgs& orch_args) { Arg params_compute; params_compute.add_inout(ext_recv); - params_compute.add_scalar(0); // unused - params_compute.add_scalar(0); // unused - params_compute.add_scalar(0); // unused + params_compute.add_scalar(num_cards); + params_compute.add_scalar(0); // unused + params_compute.add_scalar(0); // unused pto2_rt_submit_aiv_task(1, params_compute); // moe_simple_compute printf("[End2End Orch] Compute submitted\n", card_id); @@ -92,7 +101,7 @@ void aicpu_orchestration_entry(const ChipStorageTaskArgs& orch_args) { Arg params_combine; params_combine.add_input(ext_recv); params_combine.add_output(ext_output); - params_combine.add_inout(ext_scratch_test); // Use independent scratch_test buffer for combine + params_combine.add_inout(ext_scratch2); params_combine.add_output(ext_scratch_print); params_combine.add_scalar(card_id); params_combine.add_scalar(num_cards); diff --git a/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_multi_chip_orch.cpp b/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_multi_chip_orch.cpp deleted file mode 100644 index eaecbd87e..000000000 --- a/examples/workers/l3/moe_multi_chip_experts/kernels/orchestration/moe_multi_chip_orch.cpp +++ /dev/null @@ -1,88 +0,0 @@ -// Orchestration Function: moe_demo (Multi-Chip Version) -// -// Multi-chip MoE orchestration - implements "one expert per chip" parallelism. -// -// Architecture comparison: -// - Single-chip version: One chip runs ALL experts sequentially -// (orchestration loops: card_i=0..3, expert_j=0..3, t_idx=0..3) -// - Multi-chip version: Each chip runs ONE expert in parallel -// (orchestration: card_i passed as arg, expert_j passed as arg, t_idx=0..3) -// -// Key insight: Both versions produce IDENTICAL results because the kernels -// perform the same computation - only the execution distribution differs. 
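-// (Concretely: each device here submits only the t_idx = 0..3 tasks for its own fixed
-// (card_i, expert_j) pair, whereas the single-chip version iterates every (card_i, expert_j, t_idx)
-// combination on one device.)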
-// -// Expected arguments: -// - 3 tensors: send (INPUT), recv (OUTPUT_EXISTING), output (OUTPUT_EXISTING) -// - 2 scalars: expert_id (which expert), chip_id (logical card_i for data layout) - -#include "runtime.h" -#include - -#include -#include -#include - -#include "pto_orchestration_api.h" - -extern "C" { - -__attribute__((visibility("default"))) -PTO2OrchestrationConfig aicpu_orchestration_config(const ChipStorageTaskArgs& orch_args) { - // Expected: 3 tensors + 2 scalars (expert_id, chip_id) - return PTO2OrchestrationConfig{ - .expected_arg_count = 3, - }; -} - -__attribute__((visibility("default"))) -void aicpu_orchestration_entry(const ChipStorageTaskArgs& orch_args) { - // External tensors - Tensor ext_send = from_tensor_arg(orch_args.tensor(0)); - Tensor ext_recv = from_tensor_arg(orch_args.tensor(1)); - Tensor ext_output = from_tensor_arg(orch_args.tensor(2)); - - // Read expert ID and chip ID from scalar arguments (passed by Python) - int64_t expert_j = static_cast(orch_args.scalar(0)); - int64_t card_i = static_cast(orch_args.scalar(1)); - - PTO2_SCOPE() { - // Stage 0: Dispatch (send → recv) - for (int64_t t_idx = 0; t_idx < 4; t_idx += 1) { - PTO2_SCOPE() { - Arg params_t0; - params_t0.add_input(ext_send); - params_t0.add_output(ext_recv); - params_t0.add_scalar(card_i); - params_t0.add_scalar(expert_j); - params_t0.add_scalar(t_idx); - pto2_rt_submit_aiv_task(0, params_t0); - } - } - - // Stage 1: Compute (expert transformation on recv) - for (int64_t t_idx = 0; t_idx < 4; t_idx += 1) { - PTO2_SCOPE() { - Arg params_t1; - params_t1.add_inout(ext_recv); - params_t1.add_scalar(expert_j); - params_t1.add_scalar(card_i); - params_t1.add_scalar(t_idx); - pto2_rt_submit_aiv_task(1, params_t1); - } - } - - // Stage 2: Combine (recv → output) - for (int64_t t_idx = 0; t_idx < 4; t_idx += 1) { - PTO2_SCOPE() { - Arg params_t2; - params_t2.add_input(ext_recv); - params_t2.add_output(ext_output); - params_t2.add_scalar(card_i); - params_t2.add_scalar(t_idx); - pto2_rt_submit_aiv_task(2, params_t2); - } - } - } -} - -} // extern "C" diff --git a/examples/workers/l3/moe_multi_chip_experts/main.py b/examples/workers/l3/moe_multi_chip_experts/main.py index c1b31f364..a763ec61e 100644 --- a/examples/workers/l3/moe_multi_chip_experts/main.py +++ b/examples/workers/l3/moe_multi_chip_experts/main.py @@ -4,31 +4,21 @@ # CANN Open Software License Agreement Version 2.0 (the "License"). # Please refer to the License for details. You may not use this file except in compliance with the License. # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. # ----------------------------------------------------------------------------------------------------------- -"""L3 Worker API demo — multi-chip MoE with true inter-chip communication. +"""End-to-end distributed MoE demo. 
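+
+Each card i fills send[i] with unique values; after dispatch, a +1.0 placeholder compute on the
+expert card, and combine, output[i][expert_j][t][d] equals send[i][expert_j][t][d] + 1.0 for the
+first COUNT tokens.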
-This implements a distributed MoE (Mixture of Experts) pattern with real inter-chip communication: - - Each card has send[num_experts][num_tokens][hidden_dim] - 3D tensor - - Dispatch: card i sends send[i][expert_j] to card j (expert owner) - - Compute: card j computes recv[expert_j][card_i] += expert_j - - Combine: card j sends recv[expert_j][card_i] back to card i - - Result: output matches golden.py exactly - -Data flow: - Initial: send[card_i][expert_j][tokens][hidden] (per-card 3D tensor) - Dispatch: recv[card_j][card_i][tokens][hidden] (all-to-all transpose) - Compute: recv[card_j][card_i][tokens][hidden] += card_j (expert_id) - Combine: output[card_i][tokens][hidden] = sum_j recv[card_j][card_i][tokens][hidden] +Runs dispatch, per-expert compute, and combine across one expert per chip. Run: - python examples/workers/l3/moe_multi_chip_experts/main.py -p a2a3sim -d 0-1 + python examples/workers/l3/moe_multi_chip_experts/main.py -p a2a3 -d 0-3 """ import argparse import os import sys +import traceback os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE") @@ -40,6 +30,7 @@ ChipBufferSpec, ChipCallable, ChipCommBootstrapConfig, + ContinuousTensor, CoreCallable, DataType, TaskArgs, @@ -47,97 +38,96 @@ ) from simpler.worker import Worker +from simpler_setup.elf_parser import extract_text_section from simpler_setup.kernel_compiler import KernelCompiler from simpler_setup.pto_isa import ensure_pto_isa_root from simpler_setup.torch_interop import make_tensor_arg HERE = os.path.dirname(os.path.abspath(__file__)) -# MoE configuration - matching golden.py exactly -NUM_TOKENS = 10 # Number of tokens -HIDDEN_DIM = 16 # Hidden dimension -COUNT = 4 # Number of tokens to process per (card, expert) pair +# MoE configuration +NUM_TOKENS = 10 +HIDDEN_DIM = 16 +COUNT = 4 -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) +def parse_args(): + parser = argparse.ArgumentParser(description="Test complete MoE pipeline (Dispatch + Compute + Combine)") parser.add_argument("-p", "--platform", required=True, choices=["a2a3sim", "a2a3", "a5sim", "a5"]) - parser.add_argument("-d", "--device", default="0-1", help="Device range, e.g. 
'0-1' or '0,1'") + parser.add_argument("-d", "--device", default="0-3", help="Device range") return parser.parse_args() def parse_device_range(spec: str) -> list[int]: - """Parse device range specification like '0-1' or '0,1' into a list of IDs.""" if "-" in spec: lo, hi = (int(x) for x in spec.split("-")) - ids = list(range(lo, hi + 1)) + return list(range(lo, hi + 1)) elif "," in spec: - ids = [int(x) for x in spec.split(",")] + return [int(x) for x in spec.split(",")] else: - ids = [int(spec)] - return ids - return ids + return [int(spec)] -def build_moe_comm_callable(platform: str) -> ChipCallable: - """Build MoE callable with inter-chip communication (dispatch-compute-combine).""" - print("[moe_multi_chip] [DEBUG] Starting kernel compilation...", flush=True) +def build_end2end_callable(platform: str) -> ChipCallable: + """Build callable with dispatch + compute + combine kernels.""" + print("[End2End] Compiling kernels...", flush=True) kc = KernelCompiler(platform=platform) runtime = "tensormap_and_ringbuffer" pto_isa_root = ensure_pto_isa_root(clone_protocol="https") - print(f"[moe_multi_chip] [DEBUG] pto_isa_root: {pto_isa_root}", flush=True) include_dirs = kc.get_orchestration_include_dirs(runtime) - - # Add platform_comm include directory for CommContext kernel_include_dirs = list(include_dirs) + [str(kc.project_root / "src" / "common")] - # Build three kernels - print("[moe_multi_chip] [DEBUG] Compiling dispatch kernel...", flush=True) + # Compile dispatch kernel dispatch_bytes = kc.compile_incore( source_path=os.path.join(HERE, "kernels/aiv/moe_dispatch_alltoall.cpp"), core_type="aiv", pto_isa_root=pto_isa_root, extra_include_dirs=kernel_include_dirs, ) - print("[moe_multi_chip] [DEBUG] Dispatch kernel compiled", flush=True) + print("[End2End] Dispatch kernel compiled", flush=True) - print("[moe_multi_chip] [DEBUG] Compiling simple compute kernel...", flush=True) + # Compile compute kernel compute_bytes = kc.compile_incore( source_path=os.path.join(HERE, "kernels/aiv/moe_simple_compute.cpp"), core_type="aiv", pto_isa_root=pto_isa_root, extra_include_dirs=include_dirs, ) - print("[moe_multi_chip] [DEBUG] Simple compute kernel compiled", flush=True) + print("[End2End] Compute kernel compiled", flush=True) - print("[moe_multi_chip] [DEBUG] Compiling combine kernel...", flush=True) + # Compile combine kernel combine_bytes = kc.compile_incore( source_path=os.path.join(HERE, "kernels/aiv/moe_combine_alltoall.cpp"), core_type="aiv", pto_isa_root=pto_isa_root, extra_include_dirs=kernel_include_dirs, ) - print("[moe_multi_chip] [DEBUG] Combine kernel compiled", flush=True) + print("[End2End] Combine kernel compiled", flush=True) if not platform.endswith("sim"): - print("[moe_multi_chip] [DEBUG] Extracting text sections from ELF binaries...", flush=True) - from simpler_setup.elf_parser import extract_text_section dispatch_bytes = extract_text_section(dispatch_bytes) compute_bytes = extract_text_section(compute_bytes) combine_bytes = extract_text_section(combine_bytes) - print("[moe_multi_chip] [DEBUG] Text sections extracted", flush=True) + print("[End2End] Text sections extracted", flush=True) - print("[moe_multi_chip] [DEBUG] Compiling orchestration...", flush=True) + # Compile orchestration + print("[End2End] Compiling orchestration...", flush=True) orch_bytes = kc.compile_orchestration( runtime_name=runtime, - source_path=os.path.join(HERE, "kernels/orchestration/moe_comm_orch.cpp"), + source_path=os.path.join(HERE, "kernels/orchestration/moe_end2end_orch.cpp"), ) - 
print("[moe_multi_chip] [DEBUG] Orchestration compiled", flush=True) + print("[End2End] Orchestration compiled", flush=True) # Build core callables dispatch_cc = CoreCallable.build( - signature=[ArgDirection.IN, ArgDirection.OUT, ArgDirection.INOUT, - ArgDirection.IN, ArgDirection.IN, ArgDirection.IN], + signature=[ + ArgDirection.IN, + ArgDirection.OUT, + ArgDirection.INOUT, + ArgDirection.IN, + ArgDirection.IN, + ArgDirection.IN, + ], binary=dispatch_bytes, ) @@ -147,78 +137,87 @@ def build_moe_comm_callable(platform: str) -> ChipCallable: ) combine_cc = CoreCallable.build( - signature=[ArgDirection.IN, ArgDirection.OUT, ArgDirection.INOUT, - ArgDirection.IN, ArgDirection.IN, ArgDirection.IN], + signature=[ + ArgDirection.IN, + ArgDirection.OUT, + ArgDirection.INOUT, + ArgDirection.OUT, + ArgDirection.IN, + ArgDirection.IN, + ArgDirection.IN, + ], binary=combine_bytes, ) return ChipCallable.build( signature=[ - ArgDirection.IN, # send[num_experts][num_tokens][hidden_dim] - ArgDirection.OUT, # recv[num_cards][num_tokens][hidden_dim] - ArgDirection.OUT, # output[num_tokens][hidden_dim] - ArgDirection.INOUT, # scratch HCCL buffer - ArgDirection.IN, # expert_id - ArgDirection.IN, # card_id - ArgDirection.IN, # num_cards - ArgDirection.IN, # CommContext* + ArgDirection.IN, # send + ArgDirection.OUT, # recv + ArgDirection.OUT, # output + ArgDirection.INOUT, # scratch1: dispatch HCCL window + ArgDirection.INOUT, # scratch2: combine HCCL window + ArgDirection.OUT, # scratch_print + ArgDirection.IN, # expert_id + ArgDirection.IN, # card_id + ArgDirection.IN, # num_cards + ArgDirection.IN, # CommContext* ], func_name="aicpu_orchestration_entry", binary=orch_bytes, - children=[(0, dispatch_cc), (1, compute_cc), (2, combine_cc)], + children=[(0, dispatch_cc), (1, compute_cc), (2, combine_cc)], # All three phases ) -def run(platform: str, device_ids: list[int]) -> int: - """Core logic - implements true inter-chip communication MoE.""" - print("[moe_multi_chip] [DEBUG] run() function started", flush=True) - num_cards = len(device_ids) - num_experts = num_cards # One expert per chip - - print(f"[moe_multi_chip] devices={device_ids} num_cards={num_cards} num_experts={num_experts}", flush=True) - print(f"[moe_multi_chip] NUM_TOKENS={NUM_TOKENS} HIDDEN_DIM={HIDDEN_DIM} COUNT={COUNT}", flush=True) +def compute_golden_end2end(num_cards: int, host_send: list[torch.Tensor]) -> list[torch.Tensor]: + """ + Compute golden output for end-to-end pipeline: + 1. Dispatch: send[card_j][expert_i][:COUNT][:] -> recv[card_i][card_j][:COUNT][:] + 2. Compute: recv[card_i][card_j][:COUNT][:] += 1.0 + 3. 
Combine: recv[expert_j][card_i][:COUNT][:] -> output[card_i][expert_j][:COUNT][:] - # Configure HCCL communication - # Scratch buffer size: num_cards * num_cards slots (all cards' data) - # Layout: scratch[card_j][expert_i][tokens][hidden_dim] - scratch_count = num_cards * num_cards * NUM_TOKENS * HIDDEN_DIM - scratch_nbytes = scratch_count * 4 # float32 - - # Allocate space for signals at tail of scratch - total_scratch_nbytes = scratch_nbytes + num_cards * 4 # + num_cards int32 signals - window_size = max(total_scratch_nbytes, 4 * 1024) - - rootinfo_path = f"/tmp/pto_moe_multi_chip_{os.getpid()}.bin" - print(f"[moe_multi_chip] [DEBUG] HCCL config: scratch_count={scratch_count} window_size={window_size} rootinfo={rootinfo_path}", flush=True) - - # Clean up any stale rootinfo file - try: - os.unlink(rootinfo_path) - print(f"[moe_multi_chip] [DEBUG] Cleaned up stale rootinfo file", flush=True) - except FileNotFoundError: - print(f"[moe_multi_chip] [DEBUG] No stale rootinfo file to clean", flush=True) - pass - - torch.manual_seed(42) - print("[moe_multi_chip] [DEBUG] Random seed set", flush=True) - - # Per-card data layout (3D/2D as per user requirement) - # send[i]: [num_experts, num_tokens, hidden_dim] - host_send = [torch.ones(num_experts, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() - for _ in device_ids] - - # recv[i]: [num_cards, num_tokens, hidden_dim] - receives data from all cards for expert_i - host_recv = [torch.zeros(num_cards, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() - for _ in device_ids] - - # output[i]: [num_tokens, hidden_dim] - host_output = [torch.zeros(NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() - for _ in device_ids] + Send initialization: unique values using (card * 1000000 + expert * 10000 + token * 100 + dim) + """ + golden_outputs = [] + for cardi in range(num_cards): + output = torch.zeros(num_cards, COUNT, HIDDEN_DIM, dtype=torch.float32) + for expertj in range(num_cards): + for t in range(COUNT): + for d in range(HIDDEN_DIM): + # After dispatch: recv[cardi][expertj][:][:] = send[expertj][cardi][:][:] + # Value from cardi's send[expertj][cardi][t][d] + send_value = host_send[cardi][expertj, t, d].item() + # After compute: recv += 1.0 + recv_value = send_value + 1.0 + # After combine: output[cardi][expertj][t][d] = recv[expertj][cardi][t][d] + output[expertj, t, d] = recv_value + golden_outputs.append(output) + + return golden_outputs + + +def make_host_tensors(num_cards: int, num_experts: int): + host_send = [] + for i in range(num_cards): + send = torch.zeros(num_experts, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() + for expert_j in range(num_experts): + for t in range(NUM_TOKENS): + for d in range(HIDDEN_DIM): + value = float(i * 1000000 + expert_j * 10000 + t * 100 + d) + send[expert_j, t, d] = value + host_send.append(send) + + host_recv = [ + torch.zeros(num_cards, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() for _ in range(num_cards) + ] + host_output = [ + torch.zeros(num_cards, COUNT, HIDDEN_DIM, dtype=torch.float32).share_memory_() for _ in range(num_cards) + ] + return host_send, host_recv, host_output - print("[moe_multi_chip] [DEBUG] All tensors allocated, host_send initialized to 1.0", flush=True) - # Configure HCCL bootstrap for each card - cfgs = [ +def make_bootstrap_configs(num_cards: int, rootinfo_path: str, window_size: int, scratch_buffer_count: int): + total_scratch_nbytes = scratch_buffer_count * 4 + return [ ChipBootstrapConfig( comm=ChipCommBootstrapConfig( 
rank=rank, @@ -228,9 +227,15 @@ def run(platform: str, device_ids: list[int]) -> int: ), buffers=[ ChipBufferSpec( - name="scratch", + name="scratch1", dtype="float32", - count=scratch_count, + count=scratch_buffer_count, + nbytes=total_scratch_nbytes, + ), + ChipBufferSpec( + name="scratch2", + dtype="float32", + count=scratch_buffer_count, nbytes=total_scratch_nbytes, ), ], @@ -238,7 +243,103 @@ def run(platform: str, device_ids: list[int]) -> int: for rank in range(num_cards) ] - print("[moe_multi_chip] [DEBUG] Creating Worker...", flush=True) + +def print_output_samples(num_cards: int, host_output: list[torch.Tensor], golden_outputs: list[torch.Tensor]) -> None: + print("\n" + "=" * 80) + print("[End2End] OUTPUT DATA:") + print("=" * 80) + + for i in range(num_cards): + print(f"\n[End2End] Card {i} output data:") + print(" Expected: Each value = send_value + 1.0") + print(f" Sample data (up to 2 experts, first {COUNT} tokens, first 3 dims):") + + for expert_j in range(min(2, num_cards)): + print(f" Expert {expert_j}:") + for t in range(min(COUNT, 2)): + vals = host_output[i][expert_j, t, :3].tolist() + golden_vals = golden_outputs[i][expert_j, t, :3].tolist() + print(f" Token {t}: Output={vals}, Golden={golden_vals}") + + +def verify_outputs(num_cards: int, host_output: list[torch.Tensor], golden_outputs: list[torch.Tensor]) -> bool: + print("\n" + "=" * 80) + print("[End2End] VERIFICATION:") + print("=" * 80) + + all_correct = True + error_count = 0 + total_checked = 0 + + for i in range(num_cards): + print(f"\n[End2End] Card {i}:") + card_errors = 0 + for expert_j in range(num_cards): + for t in range(COUNT): + for d in range(HIDDEN_DIM): + expected = golden_outputs[i][expert_j, t, d].item() + actual = host_output[i][expert_j, t, d].item() + total_checked += 1 + if abs(actual - expected) > 1e-3: + card_errors += 1 + error_count += 1 + all_correct = False + + if card_errors == 0: + print(f" ✓ All {num_cards * COUNT * HIDDEN_DIM} values correct") + else: + print(f" ✗ {card_errors} / {num_cards * COUNT * HIDDEN_DIM} values incorrect") + + print(f"\n Total: {total_checked - error_count}/{total_checked} correct") + return all_correct + + +def make_scratch_arg(contexts, rank: int, name: str, scratch_buffer_count: int): + return ContinuousTensor.make( + data=contexts[rank].buffer_ptrs[name], + shapes=(scratch_buffer_count,), + dtype=DataType.FLOAT32, + child_memory=True, + ) + + +def run(platform: str, device_ids: list[int]) -> int: + print(f"[End2End] Testing complete MoE pipeline on devices {device_ids}", flush=True) + num_cards = len(device_ids) + num_experts = num_cards + scratch_count = num_cards * num_cards * NUM_TOKENS * HIDDEN_DIM + signal_count = num_cards + scratch_buffer_count = scratch_count + signal_count + total_scratch_nbytes = scratch_buffer_count * 4 + window_size = max(total_scratch_nbytes * 2, 4 * 1024) + + print("\n[End2End] Test Configuration:") + print(f" Platform: {platform}") + print(f" Number of cards: {num_cards}") + print(f" Device IDs: {device_ids}") + print(f" NUM_TOKENS: {NUM_TOKENS}") + print(f" HIDDEN_DIM: {HIDDEN_DIM}") + print(f" COUNT (tokens processed): {COUNT}") + + rootinfo_path = f"/tmp/pto_end2end_{os.getpid()}.bin" + try: + os.unlink(rootinfo_path) + except FileNotFoundError: + pass + + torch.manual_seed(42) + host_send, host_recv, host_output = make_host_tensors(num_cards, num_experts) + host_scratch_print = [torch.zeros(scratch_count, dtype=torch.float32).share_memory_() for _ in device_ids] + + print("\n[End2End] Allocated tensors:") + print(" 
send=unique_values, recv=0.0, output=0.0") + print(" Value encoding: (card_id * 1000000) + (expert_id * 10000) + (token * 100) + dim", flush=True) + + print("\n[End2End] Computing golden output...") + golden_outputs = compute_golden_end2end(num_cards, host_send) + print("[End2End] Golden output computed", flush=True) + + cfgs = make_bootstrap_configs(num_cards, rootinfo_path, window_size, scratch_buffer_count) worker = Worker( level=3, platform=platform, @@ -247,160 +348,64 @@ def run(platform: str, device_ids: list[int]) -> int: num_sub_workers=0, chip_bootstrap_configs=cfgs, ) - print("[moe_multi_chip] [DEBUG] Worker created", flush=True) - print(f"[moe_multi_chip] compiling kernels for {platform}...", flush=True) - moe_cc = build_moe_comm_callable(platform) - print("[moe_multi_chip] [DEBUG] All kernels compiled successfully", flush=True) + print(f"\n[End2End] Compiling kernels for {platform}...", flush=True) + end2end_cc = build_end2end_callable(platform) + print("[End2End] All kernels compiled successfully", flush=True) - print("[moe_multi_chip] init worker (with HCCL communication)...", flush=True) + print("[End2End] Initializing worker...", flush=True) worker.init() - print("[moe_multi_chip] [DEBUG] Worker initialized", flush=True) - - # Get chip contexts (contains CommContext pointers) contexts = worker.chip_contexts - print(f"[moe_multi_chip] chip contexts: {len(contexts)}", flush=True) - for i, ctx in enumerate(contexts): - print(f"[moe_multi_chip] card {i}: rank={ctx.rank}/{ctx.nranks} device_ctx=0x{ctx.device_ctx:x}", flush=True) + print(f"[End2End] Worker initialized with {len(contexts)} contexts", flush=True) try: - # 第一次运行:只执行到dispatch阶段,查看recv数据 - # 注意:当前orchestration是一次性执行所有3个阶段,所以无法分阶段查看 - # 这里我们运行完整流程,然后在host端查看最终结果 def orch_fn(orch, _args, cfg): - print(f"[moe_multi_chip] orch_fn: Starting submission for {num_cards} cards", flush=True) - # Each card submits a task that: - # 1. Dispatches its expert data to all cards - # 2. Computes on received data - # 3. 
Combines results back to source cards + print(f"[End2End] Submitting tasks for {num_cards} cards", flush=True) for i in range(num_cards): - print(f"[moe_multi_chip] orch_fn: Submitting task for card {i} (worker {i})", flush=True) - moe_args = TaskArgs() - moe_args.add_tensor(make_tensor_arg(host_send[i]), TensorArgType.INPUT) - moe_args.add_tensor(make_tensor_arg(host_recv[i]), TensorArgType.OUTPUT_EXISTING) - moe_args.add_tensor(make_tensor_arg(host_output[i]), TensorArgType.OUTPUT_EXISTING) - - # Scratch buffer (HCCL window) - from simpler.task_interface import ContinuousTensor - moe_args.add_tensor( - ContinuousTensor.make( - data=contexts[i].buffer_ptrs["scratch"], - shapes=(scratch_count,), - dtype=DataType.FLOAT32, - child_memory=True, - ), - TensorArgType.INOUT, - ) - - moe_args.add_scalar(i) # expert_id - moe_args.add_scalar(i) # card_id - moe_args.add_scalar(num_cards) - moe_args.add_scalar(contexts[i].device_ctx) - - result = orch.submit_next_level(moe_cc, moe_args, cfg, worker=i) - print(f"[moe_multi_chip] orch_fn: Submitted task for card {i}, result={result}", flush=True) - - print(f"[moe_multi_chip] orch_fn: All {num_cards} tasks submitted", flush=True) - - print("[moe_multi_chip] running multi-chip MoE DAG with inter-chip communication...", flush=True) - print("[moe_multi_chip] [DEBUG] About to call worker.run()...", flush=True) + args = TaskArgs() + args.add_tensor(make_tensor_arg(host_send[i]), TensorArgType.INPUT) + args.add_tensor(make_tensor_arg(host_recv[i]), TensorArgType.OUTPUT_EXISTING) + args.add_tensor(make_tensor_arg(host_output[i]), TensorArgType.OUTPUT_EXISTING) + + args.add_tensor(make_scratch_arg(contexts, i, "scratch1", scratch_buffer_count), TensorArgType.INOUT) + args.add_tensor(make_scratch_arg(contexts, i, "scratch2", scratch_buffer_count), TensorArgType.INOUT) + args.add_tensor(make_tensor_arg(host_scratch_print[i]), TensorArgType.OUTPUT_EXISTING) + + args.add_scalar(i) # expert_id + args.add_scalar(i) # card_id + args.add_scalar(num_cards) + args.add_scalar(contexts[i].device_ctx) + + orch.submit_next_level(end2end_cc, args, cfg, worker=i) + print(f"[End2End] Submitted task for card {i}", flush=True) + + print("\n[End2End] Running end-to-end test...", flush=True) + worker.run(orch_fn, args=None, config=CallConfig()) - print("[moe_multi_chip] [DEBUG] worker.run() completed", flush=True) - - # 打印host端的recv数据(这是所有阶段完成后的最终recv状态) - print("\n[moe_multi_chip] ===== Host-side recv data (after all stages) =====") - for i in range(num_cards): - print(f"[moe_multi_chip] Card {i} recv shape: {host_recv[i].shape}") - print(f"[moe_multi_chip] Card {i} recv sample (first 2 cards' data, first 2 tokens, first 3 dims):") - for card_j in range(min(2, num_cards)): - for t in range(min(2, COUNT)): - print(f" recv[{card_j}][{t}][:3] = {host_recv[i][card_j, t, :3].tolist()}") - - # 打印host端的output数据 - print("\n[moe_multi_chip] ===== Host-side output data (final) =====") - for i in range(num_cards): - print(f"[moe_multi_chip] Card {i} output shape: {host_output[i].shape}") - print(f"[moe_multi_chip] Card {i} output sample (first {COUNT} tokens, first 3 dims):") - for t in range(COUNT): - print(f" output[{t}][:3] = {host_output[i][t, :3].tolist()}") + print("\n[End2End] End-to-end pipeline completed!", flush=True) - print("\n[moe_multi_chip] Results:") - for i in range(num_cards): - print(f"[moe_multi_chip] card {i} output shape: {host_output[i].shape}") - print(f"[moe_multi_chip] card {i} output sample (first {COUNT} tokens, first 3 dims):") - for t in range(COUNT): - print(f" 
token {t}: {host_output[i][t, :3]}") - - # Verify against golden.py - print("\n[moe_multi_chip] Verifying against golden.py...") - - # For golden, we need to reconstruct the original input data - # host_send[i]: [num_experts, NUM_TOKENS, HIDDEN_DIM] - # Convert to golden format: [num_cards, num_experts, NUM_TOKENS, HIDDEN_DIM] - send_batch = torch.stack(host_send) # [num_cards, num_experts, NUM_TOKENS, HIDDEN_DIM] - - # Initialize recv in golden format: [num_experts, num_cards, NUM_TOKENS, HIDDEN_DIM] - # This will be filled by the dispatch phase - recv_batch = torch.zeros(num_experts, num_cards, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32) - - # Initialize output for golden as ZERO tensor (not containing hardware results!) - # golden.py's demo function uses +=, so it must start from zero - golden_output_input = torch.zeros(num_cards, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32) - - # Run golden to compute expected output - # Note: golden.py's demo function modifies recv and output in place - import sys - golden_path = os.path.join(HERE, "golden.py") - if golden_path not in sys.path: - sys.path.insert(0, HERE) - - # Import golden module - import importlib.util - spec = importlib.util.spec_from_file_location("golden", golden_path) - golden_module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(golden_module) - - # Run golden computation (modifies golden_output_input in place) - # The golden function computes: output[i][:][:] = sum_j (send[j][i] + i) - # where only the first COUNT tokens are processed - golden_output = golden_module.demo(send_batch, recv_batch, golden_output_input) - - # Compare results - all_match = True - for i in range(num_cards): - max_diff = float(torch.max(torch.abs(host_output[i] - golden_output[i]))) - mean_diff = float(torch.mean(torch.abs(host_output[i] - golden_output[i]))) - print(f"[moe_multi_chip] card {i}: max |output - golden| = {max_diff:.6e}, mean diff = {mean_diff:.6e}") - - if max_diff > 1e-3: - all_match = False - print(f"[moe_multi_chip] card {i} MISMATCH! Showing first {COUNT} tokens:") - for t in range(COUNT): - actual = host_output[i][t, :3] - expected = golden_output[i][t, :3] - print(f" token {t}: actual={actual.tolist()}, expected={expected.tolist()}") - else: - print(f"[moe_multi_chip] card {i} ✅ matches golden") - - if all_match: - print("\n[moe_multi_chip] ✅ All cards matched golden.py!") + print_output_samples(num_cards, host_output, golden_outputs) + all_correct = verify_outputs(num_cards, host_output, golden_outputs) + print("\n" + "=" * 80) + print("[End2End] FINAL VERDICT:") + print("=" * 80) + + if all_correct: + print("\n[End2End] ✅ All values correct! 
End-to-end pipeline works perfectly.") return 0 else: - print("\n[moe_multi_chip] ❌ Some cards did NOT match golden.py") + print("\n[End2End] ❌ Some values incorrect!") return 1 except Exception as e: - print(f"[moe_multi_chip] ERROR: {e}") - import traceback + print(f"[End2End] ERROR: {e}") traceback.print_exc() return 1 finally: - print("[moe_multi_chip] shutting down worker...") + print("[End2End] Shutting down worker...") worker.close() - - # Clean up rootinfo file try: os.unlink(rootinfo_path) except FileNotFoundError: diff --git a/examples/workers/l3/moe_multi_chip_experts/test_combine_only.py b/examples/workers/l3/moe_multi_chip_experts/test_combine_only.py deleted file mode 100755 index 3d3d70c30..000000000 --- a/examples/workers/l3/moe_multi_chip_experts/test_combine_only.py +++ /dev/null @@ -1,411 +0,0 @@ -#!/usr/bin/env python3 -# Test combine kernel in isolation with unique integer values per token - -import argparse -import os -import sys - -os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE") - -import torch -from simpler.task_interface import ( - ArgDirection, - CallConfig, - ChipBootstrapConfig, - ChipBufferSpec, - ChipCallable, - ChipCommBootstrapConfig, - CoreCallable, - DataType, - TaskArgs, - TensorArgType, -) -from simpler.worker import Worker - -from simpler_setup.kernel_compiler import KernelCompiler -from simpler_setup.pto_isa import ensure_pto_isa_root -from simpler_setup.torch_interop import make_tensor_arg - -HERE = os.path.dirname(os.path.abspath(__file__)) - -# MoE configuration -NUM_TOKENS = 10 -HIDDEN_DIM = 16 -COUNT = 4 - - -def parse_args(): - parser = argparse.ArgumentParser(description="Test combine kernel in isolation") - parser.add_argument("-p", "--platform", required=True, choices=["a2a3sim", "a2a3", "a5sim", "a5"]) - parser.add_argument("-d", "--device", default="0-1", help="Device range") - return parser.parse_args() - - -def parse_device_range(spec: str) -> list[int]: - if "-" in spec: - lo, hi = (int(x) for x in spec.split("-")) - return list(range(lo, hi + 1)) - elif "," in spec: - return [int(x) for x in spec.split(",")] - else: - return [int(spec)] - - -def build_combine_only_callable(platform: str) -> ChipCallable: - """Build callable with ONLY combine kernel.""" - print("[Combine-Only] Compiling combine kernel...", flush=True) - kc = KernelCompiler(platform=platform) - runtime = "tensormap_and_ringbuffer" - pto_isa_root = ensure_pto_isa_root(clone_protocol="https") - include_dirs = kc.get_orchestration_include_dirs(runtime) - kernel_include_dirs = list(include_dirs) + [str(kc.project_root / "src" / "common")] - - # Compile combine kernel - combine_bytes = kc.compile_incore( - source_path=os.path.join(HERE, "kernels/aiv/moe_combine_alltoall2.cpp"), - core_type="aiv", - pto_isa_root=pto_isa_root, - extra_include_dirs=kernel_include_dirs, - ) - print("[Combine-Only] Combine kernel compiled", flush=True) - - if not platform.endswith("sim"): - from simpler_setup.elf_parser import extract_text_section - combine_bytes = extract_text_section(combine_bytes) - print("[Combine-Only] Text sections extracted", flush=True) - - # Compile orchestration - print("[Combine-Only] Compiling orchestration...", flush=True) - orch_bytes = kc.compile_orchestration( - runtime_name=runtime, - source_path=os.path.join(HERE, "kernels/orchestration/moe_combine_only_orch.cpp"), - ) - print("[Combine-Only] Orchestration compiled", flush=True) - - # Build core callable - combine_cc = CoreCallable.build( - signature=[ArgDirection.IN, ArgDirection.OUT, ArgDirection.INOUT, 
ArgDirection.OUT, - ArgDirection.IN, ArgDirection.IN, ArgDirection.IN], - binary=combine_bytes, - ) - - return ChipCallable.build( - signature=[ - ArgDirection.IN, # recv - ArgDirection.OUT, # output - ArgDirection.INOUT, # scratch - ArgDirection.OUT, # scratch_print - ArgDirection.IN, # card_id - ArgDirection.IN, # num_cards - ArgDirection.IN, # CommContext* - ], - func_name="aicpu_orchestration_entry", - binary=orch_bytes, - children=[(0, combine_cc)], # Only combine child - ) - - -def compute_golden_output(num_cards: int, host_recv: list[torch.Tensor]) -> list[torch.Tensor]: - """ - Compute golden output using direct store logic: - output[cardi][expertj][:count][:] = recv[expertj, cardi, :count, :] - - For combine-only test: - - Each card_j's recv[j] has shape [num_cards, NUM_TOKENS, HIDDEN_DIM] - - recv[j][i][t][d] = expert_j's processed data for card_i - - Card i's output[expert_j][:][:] stores expert_j's data for card_i - """ - golden_outputs = [] - for cardi in range(num_cards): - output = torch.zeros(num_cards, COUNT, HIDDEN_DIM, dtype=torch.float32) - for expertj in range(num_cards): - # recv[expertj][cardi][:][:] = expert_j's processed data for card_i - # Store to output[expertj][:][:] - output[expertj, :, :] = host_recv[expertj][cardi, :COUNT, :] - golden_outputs.append(output) - - return golden_outputs - - -def initialize_recv_with_unique_integers(num_cards: int, device_id: int) -> torch.Tensor: - """ - Initialize recv tensor with unique integers for each token. - - Direct store logic (no accumulation): - - recv[expert_i][card_j][t][d] = expert_i processed data for card_j - - output[card_j][expert_i][t][d] = recv[expert_i][card_j][t][d] (direct copy) - - Each position gets a unique value to trace data flow: - value = (expert * 10000) + (card_j * 100) + (t * 10) + d - - This way we can identify which expert's data ended up where. 
- """ - recv = torch.zeros(num_cards, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() - - for expert_i in range(num_cards): - for t in range(NUM_TOKENS): - for d in range(HIDDEN_DIM): - value = float(expert_i * 10000 + device_id * 100 + t * 10 + d) - recv[expert_i, t, d] = value - - return recv - - -def run(platform: str, device_ids: list[int]) -> int: - print(f"[Combine-Only] Testing combine on devices {device_ids}", flush=True) - num_cards = len(device_ids) - - print(f"\n[Combine-Only] Test Configuration:") - print(f" Platform: {platform}") - print(f" Number of cards: {num_cards}") - print(f" Device IDs: {device_ids}") - print(f" NUM_TOKENS: {NUM_TOKENS}") - print(f" HIDDEN_DIM: {HIDDEN_DIM}") - print(f" COUNT (tokens processed): {COUNT}") - print(f" Total values per card: {num_cards * COUNT * HIDDEN_DIM}") - print(f" Total values to verify: {num_cards * num_cards * COUNT * HIDDEN_DIM}") - - # Configure HCCL - scratch_count = num_cards * num_cards * NUM_TOKENS * HIDDEN_DIM - scratch_nbytes = scratch_count * 4 - total_scratch_nbytes = scratch_nbytes + num_cards * 4 - window_size = max(total_scratch_nbytes, 4 * 1024) - - print(f"\n[Combine-Only] Memory Configuration:") - print(f" Scratch buffer size: {scratch_count} elements = {scratch_nbytes / 1024:.2f} KB") - print(f" Total with signals: {total_scratch_nbytes / 1024:.2f} KB") - print(f" HCCL window size: {window_size / 1024:.2f} KB") - - rootinfo_path = f"/tmp/pto_combine_only_{os.getpid()}.bin" - try: - os.unlink(rootinfo_path) - except FileNotFoundError: - pass - - torch.manual_seed(42) - - # Allocate tensors with unique integer values for each token - host_recv = [] - for i in device_ids: - recv = initialize_recv_with_unique_integers(num_cards, i) - host_recv.append(recv) - - host_output = [torch.zeros(num_cards, COUNT, HIDDEN_DIM, dtype=torch.float32).share_memory_() - for _ in device_ids] - - # Allocate scratch_print tensors (debug output) - host_scratch_print = [torch.zeros(scratch_count, dtype=torch.float32).share_memory_() - for _ in device_ids] - - # Compute golden output BEFORE running the kernel - print("\n[Combine-Only] Computing golden output using golden.py logic...") - golden_outputs = compute_golden_output(num_cards, host_recv) - print("[Combine-Only] Golden output computed", flush=True) - - print(f"\n[Combine-Only] Allocated tensors: recv=unique_integers, output=0.0", flush=True) - - # Configure HCCL bootstrap - cfgs = [ - ChipBootstrapConfig( - comm=ChipCommBootstrapConfig( - rank=rank, - nranks=num_cards, - rootinfo_path=rootinfo_path, - window_size=window_size, - ), - buffers=[ - ChipBufferSpec( - name="scratch", - dtype="float32", - count=scratch_count, - nbytes=total_scratch_nbytes, - ), - ], - ) - for rank in range(num_cards) - ] - - # Create worker - worker = Worker( - level=3, - platform=platform, - runtime="tensormap_and_ringbuffer", - device_ids=device_ids, - num_sub_workers=0, - chip_bootstrap_configs=cfgs, - ) - - print(f"\n[Combine-Only] Compiling kernels for {platform}...", flush=True) - combine_cc = build_combine_only_callable(platform) - print("[Combine-Only] All kernels compiled successfully", flush=True) - - print("[Combine-Only] Initializing worker...", flush=True) - worker.init() - contexts = worker.chip_contexts - print(f"[Combine-Only] Worker initialized with {len(contexts)} contexts", flush=True) - - try: - def orch_fn(orch, _args, cfg): - print(f"[Combine-Only] Submitting tasks for {num_cards} cards", flush=True) - for i in range(num_cards): - combine_args = TaskArgs() - 
combine_args.add_tensor(make_tensor_arg(host_recv[i]), TensorArgType.INPUT) - combine_args.add_tensor(make_tensor_arg(host_output[i]), TensorArgType.OUTPUT_EXISTING) - - from simpler.task_interface import ContinuousTensor - combine_args.add_tensor( - ContinuousTensor.make( - data=contexts[i].buffer_ptrs["scratch"], - shapes=(scratch_count,), - dtype=DataType.FLOAT32, - child_memory=True, - ), - TensorArgType.INOUT, - ) - combine_args.add_tensor(make_tensor_arg(host_scratch_print[i]), TensorArgType.OUTPUT_EXISTING) - - combine_args.add_scalar(i) # card_id - combine_args.add_scalar(num_cards) - combine_args.add_scalar(contexts[i].device_ctx) - - result = orch.submit_next_level(combine_cc, combine_args, cfg, worker=i) - print(f"[Combine-Only] Submitted task for card {i}", flush=True) - - print("[Combine-Only] Running combine-only test...", flush=True) - - # Print what each card will do - print("\n[Combine-Only] Task breakdown:") - for i in range(num_cards): - print(f" Card {i}: Will combine results from all experts for card {i}") - print(f" Input: recv[{i}][expert][{COUNT} tokens][{HIDDEN_DIM} dims]") - print(f" Output: output[num_experts={num_cards}][{COUNT} tokens][{HIDDEN_DIM} dims]") - - # Print output initial values BEFORE running kernel - print("\n" + "="*80) - print("[Combine-Only] OUTPUT INITIAL VALUES (before kernel):") - print("="*80) - for i in range(num_cards): - print(f"\n[Combine-Only] Card {i} output initial values:") - print(f" Shape: {host_output[i].shape}") - for expert_i in range(num_cards): - print(f" Expert {expert_i}:") - for t in range(COUNT): - vals = host_output[i][expert_i, t, :].tolist() - print(f" Token {t}: {vals}") - - worker.run(orch_fn, args=None, config=CallConfig()) - print("\n[Combine-Only] Test completed successfully!", flush=True) - - # Print scratch_print buffer contents for debugging - print("\n" + "="*80) - print("[Combine-Only] SCRATCH_PRINT BUFFER CONTENTS (Phase 1 stage-in mirror):") - print("="*80) - - for i in range(num_cards): - print(f"\n[Combine-Only] Card {i} scratch_print buffer (device {device_ids[i]}):") - print(f" Layout: scratch_print[expert_i][card_j][token][dim]") - print(f" Size: [{num_cards}][{num_cards}][{NUM_TOKENS}][{HIDDEN_DIM}]") - - for expert_i in range(num_cards): - print(f"\n Expert {expert_i}:") - for card_j in range(num_cards): - print(f" For card {card_j}:") - for t in range(COUNT): - offset = expert_i * num_cards * NUM_TOKENS * HIDDEN_DIM + card_j * NUM_TOKENS * HIDDEN_DIM + t * HIDDEN_DIM - vals = host_scratch_print[i][offset:offset+HIDDEN_DIM].tolist() - print(f" Token {t}: {vals}") - - # Print results - print("\n" + "="*80) - print("[Combine-Only] INPUT RECV DATA:") - print("="*80) - - for i in range(num_cards): - print(f"\n[Combine-Only] Card {i} recv data (device {device_ids[i]}):") - print(f" Shape: {host_recv[i].shape}") - for expert_i in range(num_cards): - print(f"\n Expert {expert_i}:") - for t in range(NUM_TOKENS): - vals = host_recv[i][expert_i, t, :].tolist() - print(f" Token {t}: {vals}") - - print("\n" + "="*80) - print("[Combine-Only] OUTPUT DATA (after combine):") - print("="*80) - - for i in range(num_cards): - print(f"\n[Combine-Only] Card {i} output data:") - print(f" Shape: {host_output[i].shape}") - for expert_i in range(num_cards): - print(f"\n Expert {expert_i}:") - for t in range(COUNT): - vals = host_output[i][expert_i, t, :].tolist() - golden_vals = golden_outputs[i][expert_i, t, :].tolist() - print(f"\n Token {t}:") - print(f" Output: {vals}") - print(f" Golden: {golden_vals}") - match = 
all(abs(v - g) < 1e-3 for v, g in zip(vals, golden_vals)) - print(f" Match: {'✓' if match else '✗'}") - - # Verify correctness by comparing with pre-computed golden output - print("\n" + "="*80) - print("[Combine-Only] VERIFICATION SUMMARY:") - print("="*80) - - all_correct = True - error_count = 0 - total_checked = 0 - - for i in range(num_cards): - print(f"\n[Combine-Only] Card {i}:") - card_errors = 0 - - for expert_i in range(num_cards): - for t in range(COUNT): - for d in range(HIDDEN_DIM): - expected = golden_outputs[i][expert_i, t, d].item() - actual = host_output[i][expert_i, t, d].item() - total_checked += 1 - - if abs(actual - expected) > 1e-3: - card_errors += 1 - error_count += 1 - all_correct = False - - if card_errors == 0: - print(f" ✓ All {num_cards * COUNT * HIDDEN_DIM} values correct") - else: - print(f" ✗ {card_errors} / {num_cards * COUNT * HIDDEN_DIM} values incorrect") - - print(f"\n Total: {total_checked - error_count}/{total_checked} correct") - - if all_correct: - print("\n[Combine-Only] ✅ All values correct! Combine kernel works perfectly.") - return 0 - else: - print("\n[Combine-Only] ❌ Some values incorrect!") - return 1 - - except Exception as e: - print(f"[Combine-Only] ERROR: {e}") - import traceback - traceback.print_exc() - return 1 - - finally: - print("[Combine-Only] Shutting down worker...") - worker.close() - try: - os.unlink(rootinfo_path) - except FileNotFoundError: - pass - - -def main() -> int: - args = parse_args() - device_ids = parse_device_range(args.device) - return run(args.platform, device_ids) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/examples/workers/l3/moe_multi_chip_experts/test_dispatch_compute.py b/examples/workers/l3/moe_multi_chip_experts/test_dispatch_compute.py deleted file mode 100644 index 59d7580b5..000000000 --- a/examples/workers/l3/moe_multi_chip_experts/test_dispatch_compute.py +++ /dev/null @@ -1,290 +0,0 @@ -#!/usr/bin/env python3 -# Test dispatch + compute kernels together - -import argparse -import os -import sys - -os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE") - -import torch -from simpler.task_interface import ( - ArgDirection, - CallConfig, - ChipBootstrapConfig, - ChipBufferSpec, - ChipCallable, - ChipCommBootstrapConfig, - CoreCallable, - DataType, - TaskArgs, - TensorArgType, -) -from simpler.worker import Worker - -from simpler_setup.kernel_compiler import KernelCompiler -from simpler_setup.pto_isa import ensure_pto_isa_root -from simpler_setup.torch_interop import make_tensor_arg - -HERE = os.path.dirname(os.path.abspath(__file__)) - -# MoE configuration -NUM_TOKENS = 10 -HIDDEN_DIM = 16 -COUNT = 4 - - -def parse_args(): - parser = argparse.ArgumentParser(description="Test dispatch + compute kernels") - parser.add_argument("-p", "--platform", required=True, choices=["a2a3sim", "a2a3", "a5sim", "a5"]) - parser.add_argument("-d", "--device", default="0-1", help="Device range") - return parser.parse_args() - - -def parse_device_range(spec: str) -> list[int]: - if "-" in spec: - lo, hi = (int(x) for x in spec.split("-")) - return list(range(lo, hi + 1)) - elif "," in spec: - return [int(x) for x in spec.split(",")] - else: - return [int(spec)] - - -def build_dispatch_compute_callable(platform: str) -> ChipCallable: - """Build callable with dispatch + compute kernels.""" - print("[Dispatch+Compute] Compiling kernels...", flush=True) - kc = KernelCompiler(platform=platform) - runtime = "tensormap_and_ringbuffer" - pto_isa_root = ensure_pto_isa_root(clone_protocol="https") - include_dirs = 
kc.get_orchestration_include_dirs(runtime) - kernel_include_dirs = list(include_dirs) + [str(kc.project_root / "src" / "common")] - - # Compile dispatch kernel - dispatch_bytes = kc.compile_incore( - source_path=os.path.join(HERE, "kernels/aiv/moe_dispatch_alltoall.cpp"), - core_type="aiv", - pto_isa_root=pto_isa_root, - extra_include_dirs=kernel_include_dirs, - ) - print("[Dispatch+Compute] Dispatch kernel compiled", flush=True) - - # Compile simple compute kernel - compute_bytes = kc.compile_incore( - source_path=os.path.join(HERE, "kernels/aiv/moe_simple_compute.cpp"), - core_type="aiv", - pto_isa_root=pto_isa_root, - extra_include_dirs=include_dirs, - ) - print("[Dispatch+Compute] Compute kernel compiled", flush=True) - - if not platform.endswith("sim"): - from simpler_setup.elf_parser import extract_text_section - dispatch_bytes = extract_text_section(dispatch_bytes) - compute_bytes = extract_text_section(compute_bytes) - print("[Dispatch+Compute] Text sections extracted", flush=True) - - # Compile orchestration - print("[Dispatch+Compute] Compiling orchestration...", flush=True) - orch_bytes = kc.compile_orchestration( - runtime_name=runtime, - source_path=os.path.join(HERE, "kernels/orchestration/moe_dispatch_compute_orch.cpp"), - ) - print("[Dispatch+Compute] Orchestration compiled", flush=True) - - # Build core callables - dispatch_cc = CoreCallable.build( - signature=[ArgDirection.IN, ArgDirection.OUT, ArgDirection.INOUT, - ArgDirection.IN, ArgDirection.IN, ArgDirection.IN], - binary=dispatch_bytes, - ) - - compute_cc = CoreCallable.build( - signature=[ArgDirection.INOUT, ArgDirection.IN, ArgDirection.IN, ArgDirection.IN], - binary=compute_bytes, - ) - - return ChipCallable.build( - signature=[ - ArgDirection.IN, # send - ArgDirection.OUT, # recv - ArgDirection.OUT, # output (unused) - ArgDirection.INOUT, # scratch - ArgDirection.IN, # expert_id - ArgDirection.IN, # card_id - ArgDirection.IN, # num_cards - ArgDirection.IN, # CommContext* - ], - func_name="aicpu_orchestration_entry", - binary=orch_bytes, - children=[(0, dispatch_cc), (1, compute_cc)], # Dispatch + Compute - ) - - -def run(platform: str, device_ids: list[int]) -> int: - print(f"[Dispatch+Compute] Testing on devices {device_ids}", flush=True) - num_cards = len(device_ids) - num_experts = num_cards - - # Configure HCCL - scratch_count = num_cards * num_cards * NUM_TOKENS * HIDDEN_DIM - scratch_nbytes = scratch_count * 4 - total_scratch_nbytes = scratch_nbytes + num_cards * 4 - window_size = max(total_scratch_nbytes, 4 * 1024) - - rootinfo_path = f"/tmp/pto_dispatch_compute_{os.getpid()}.bin" - try: - os.unlink(rootinfo_path) - except FileNotFoundError: - pass - - torch.manual_seed(42) - - # Allocate tensors - host_send = [torch.ones(num_experts, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() - for _ in device_ids] - host_recv = [torch.zeros(num_cards, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() - for _ in device_ids] - host_output = [torch.zeros(NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() - for _ in device_ids] - - print(f"[Dispatch+Compute] Allocated tensors: send=1.0, recv=0.0", flush=True) - - # Configure HCCL bootstrap - cfgs = [ - ChipBootstrapConfig( - comm=ChipCommBootstrapConfig( - rank=rank, - nranks=num_cards, - rootinfo_path=rootinfo_path, - window_size=window_size, - ), - buffers=[ - ChipBufferSpec( - name="scratch", - dtype="float32", - count=scratch_count, - nbytes=total_scratch_nbytes, - ), - ], - ) - for rank in range(num_cards) - ] - - # Create worker - 
worker = Worker( - level=3, - platform=platform, - runtime="tensormap_and_ringbuffer", - device_ids=device_ids, - num_sub_workers=0, - chip_bootstrap_configs=cfgs, - ) - - print(f"[Dispatch+Compute] Compiling kernels for {platform}...", flush=True) - dispatch_compute_cc = build_dispatch_compute_callable(platform) - print("[Dispatch+Compute] All kernels compiled successfully", flush=True) - - print("[Dispatch+Compute] Initializing worker...", flush=True) - worker.init() - contexts = worker.chip_contexts - print(f"[Dispatch+Compute] Worker initialized with {len(contexts)} contexts", flush=True) - - try: - def orch_fn(orch, _args, cfg): - print(f"[Dispatch+Compute] Submitting tasks for {num_cards} cards", flush=True) - for i in range(num_cards): - args = TaskArgs() - args.add_tensor(make_tensor_arg(host_send[i]), TensorArgType.INPUT) - args.add_tensor(make_tensor_arg(host_recv[i]), TensorArgType.OUTPUT_EXISTING) - args.add_tensor(make_tensor_arg(host_output[i]), TensorArgType.OUTPUT_EXISTING) - - from simpler.task_interface import ContinuousTensor - args.add_tensor( - ContinuousTensor.make( - data=contexts[i].buffer_ptrs["scratch"], - shapes=(scratch_count,), - dtype=DataType.FLOAT32, - child_memory=True, - ), - TensorArgType.INOUT, - ) - - args.add_scalar(i) # expert_id - args.add_scalar(i) # card_id - args.add_scalar(num_cards) - args.add_scalar(contexts[i].device_ctx) - - result = orch.submit_next_level(dispatch_compute_cc, args, cfg, worker=i) - print(f"[Dispatch+Compute] Submitted task for card {i}", flush=True) - - print("[Dispatch+Compute] Running dispatch+compute test...", flush=True) - worker.run(orch_fn, args=None, config=CallConfig()) - print("[Dispatch+Compute] Test completed", flush=True) - - # Print results - print("\n" + "="*80) - print("[Dispatch+Compute] RESULTS:") - print("="*80) - - for i in range(num_cards): - print(f"\n[Dispatch+Compute] Card {i} recv data (after dispatch+compute):") - print(f" Shape: {host_recv[i].shape}") - print(f" Expected: recv[i][:4][:] should be 2.0 (1.0 from dispatch + 1.0 from compute)") - print(f" Sample data (first 2 cards' data, first {COUNT} tokens, first 3 dims):") - - for card_j in range(num_cards): - print(f" recv[{card_j}][:3][:3] = [", end="") - for t in range(min(3, COUNT)): - vals = host_recv[i][card_j, t, :3].tolist() - print(f"[{vals[0]:.1f},{vals[1]:.1f},{vals[2]:.1f}]", end="") - if t < min(3, COUNT) - 1: - print(", ", end="") - print("]") - - # Verify correctness - print("\n" + "="*80) - print("[Dispatch+Compute] VERIFICATION:") - print("="*80) - - all_correct = True - for i in range(num_cards): - for card_j in range(num_cards): - for t in range(COUNT): - for d in range(HIDDEN_DIM): - expected = 2.0 # 1.0 (dispatch) + 1.0 (compute) - actual = host_recv[i][card_j, t, d].item() - if abs(actual - expected) > 1e-5: - print(f"[Dispatch+Compute] ERROR: Card {i} recv[{card_j}][{t}][{d}] = {actual}, expected {expected}") - all_correct = False - - if all_correct: - print("[Dispatch+Compute] ✅ All values correct! 
Dispatch+Compute works perfectly.") - return 0 - else: - print("[Dispatch+Compute] ❌ Some values incorrect!") - return 1 - - except Exception as e: - print(f"[Dispatch+Compute] ERROR: {e}") - import traceback - traceback.print_exc() - return 1 - - finally: - print("[Dispatch+Compute] Shutting down worker...") - worker.close() - try: - os.unlink(rootinfo_path) - except FileNotFoundError: - pass - - -def main() -> int: - args = parse_args() - device_ids = parse_device_range(args.device) - return run(args.platform, device_ids) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/examples/workers/l3/moe_multi_chip_experts/test_dispatch_only.py b/examples/workers/l3/moe_multi_chip_experts/test_dispatch_only.py deleted file mode 100644 index 61490029e..000000000 --- a/examples/workers/l3/moe_multi_chip_experts/test_dispatch_only.py +++ /dev/null @@ -1,308 +0,0 @@ -#!/usr/bin/env python3 -# Test dispatch kernel in isolation - -import argparse -import os -import sys - -os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE") - -import torch -from simpler.task_interface import ( - ArgDirection, - CallConfig, - ChipBootstrapConfig, - ChipBufferSpec, - ChipCallable, - ChipCommBootstrapConfig, - CoreCallable, - DataType, - TaskArgs, - TensorArgType, -) -from simpler.worker import Worker - -from simpler_setup.kernel_compiler import KernelCompiler -from simpler_setup.pto_isa import ensure_pto_isa_root -from simpler_setup.torch_interop import make_tensor_arg - -HERE = os.path.dirname(os.path.abspath(__file__)) - -# MoE configuration -NUM_TOKENS = 10 -HIDDEN_DIM = 16 -COUNT = 4 - - -def parse_args(): - parser = argparse.ArgumentParser(description="Test dispatch kernel in isolation") - parser.add_argument("-p", "--platform", required=True, choices=["a2a3sim", "a2a3", "a5sim", "a5"]) - parser.add_argument("-d", "--device", default="0-1", help="Device range") - return parser.parse_args() - - -def parse_device_range(spec: str) -> list[int]: - if "-" in spec: - lo, hi = (int(x) for x in spec.split("-")) - return list(range(lo, hi + 1)) - elif "," in spec: - return [int(x) for x in spec.split(",")] - else: - return [int(spec)] - - -def build_dispatch_only_callable(platform: str) -> ChipCallable: - """Build callable with ONLY dispatch kernel.""" - print("[Dispatch-Only] Compiling dispatch kernel...", flush=True) - kc = KernelCompiler(platform=platform) - runtime = "tensormap_and_ringbuffer" - pto_isa_root = ensure_pto_isa_root(clone_protocol="https") - include_dirs = kc.get_orchestration_include_dirs(runtime) - kernel_include_dirs = list(include_dirs) + [str(kc.project_root / "src" / "common")] - - # Compile dispatch kernel - dispatch_bytes = kc.compile_incore( - source_path=os.path.join(HERE, "kernels/aiv/moe_dispatch_alltoall.cpp"), - core_type="aiv", - pto_isa_root=pto_isa_root, - extra_include_dirs=kernel_include_dirs, - ) - print("[Dispatch-Only] Dispatch kernel compiled", flush=True) - - if not platform.endswith("sim"): - from simpler_setup.elf_parser import extract_text_section - dispatch_bytes = extract_text_section(dispatch_bytes) - print("[Dispatch-Only] Text sections extracted", flush=True) - - # Compile orchestration - print("[Dispatch-Only] Compiling orchestration...", flush=True) - orch_bytes = kc.compile_orchestration( - runtime_name=runtime, - source_path=os.path.join(HERE, "kernels/orchestration/moe_dispatch_only_orch.cpp"), - ) - print("[Dispatch-Only] Orchestration compiled", flush=True) - - # Build core callable - dispatch_cc = CoreCallable.build( - signature=[ArgDirection.IN, 
ArgDirection.OUT, ArgDirection.INOUT, - ArgDirection.IN, ArgDirection.IN, ArgDirection.IN], - binary=dispatch_bytes, - ) - - return ChipCallable.build( - signature=[ - ArgDirection.IN, # send - ArgDirection.OUT, # recv - ArgDirection.OUT, # output (unused but needed for signature) - ArgDirection.INOUT, # scratch - ArgDirection.IN, # expert_id - ArgDirection.IN, # card_id - ArgDirection.IN, # num_cards - ArgDirection.IN, # CommContext* - ], - func_name="aicpu_orchestration_entry", - binary=orch_bytes, - children=[(0, dispatch_cc)], # Only dispatch child - ) - - -def run(platform: str, device_ids: list[int]) -> int: - print(f"[Dispatch-Only] Testing dispatch on devices {device_ids}", flush=True) - num_cards = len(device_ids) - num_experts = num_cards - - # Configure HCCL - scratch_count = num_cards * num_cards * NUM_TOKENS * HIDDEN_DIM - scratch_nbytes = scratch_count * 4 - total_scratch_nbytes = scratch_nbytes + num_cards * 4 - window_size = max(total_scratch_nbytes, 4 * 1024) - - rootinfo_path = f"/tmp/pto_dispatch_only_{os.getpid()}.bin" - try: - os.unlink(rootinfo_path) - except FileNotFoundError: - pass - - torch.manual_seed(42) - - # Allocate tensors with unique values to trace data flow - # Value encoding: (card_id * 1000000) + (expert_id * 10000) + (token * 100) + dim - host_send = [] - for i, device_id in enumerate(device_ids): - send = torch.zeros(num_experts, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() - for expert_j in range(num_experts): - for t in range(NUM_TOKENS): - for d in range(HIDDEN_DIM): - # Unique value: card_i -> expert_j -> token_t -> dim_d - value = float(i * 1000000 + expert_j * 10000 + t * 100 + d) - send[expert_j, t, d] = value - host_send.append(send) - - host_recv = [torch.zeros(num_cards, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() - for _ in device_ids] - host_output = [torch.zeros(NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() - for _ in device_ids] - - print(f"[Dispatch-Only] Allocated tensors with unique values", flush=True) - print(f"[Dispatch-Only] Value encoding: (card_id * 1000000) + (expert_id * 10000) + (token * 100) + dim", flush=True) - print(f"[Dispatch-Only] Sample: host_send[0][0][0][0] = {host_send[0][0, 0, 0].item()} (card 0, expert 0, token 0, dim 0)", flush=True) - - # Print input values BEFORE running kernel - print("\n" + "="*80) - print("[Dispatch-Only] INPUT SEND VALUES (before kernel):") - print("="*80) - for i in range(num_cards): - print(f"\n[Dispatch-Only] Card {i} send values:") - print(f" Shape: {host_send[i].shape}") - for expert_j in range(num_experts): - print(f" Expert {expert_j}:") - for t in range(min(2, COUNT)): - vals = host_send[i][expert_j, t, :3].tolist() - print(f" Token {t}: {vals}") - - # Configure HCCL bootstrap - cfgs = [ - ChipBootstrapConfig( - comm=ChipCommBootstrapConfig( - rank=rank, - nranks=num_cards, - rootinfo_path=rootinfo_path, - window_size=window_size, - ), - buffers=[ - ChipBufferSpec( - name="scratch", - dtype="float32", - count=scratch_count, - nbytes=total_scratch_nbytes, - ), - ], - ) - for rank in range(num_cards) - ] - - # Create worker - worker = Worker( - level=3, - platform=platform, - runtime="tensormap_and_ringbuffer", - device_ids=device_ids, - num_sub_workers=0, - chip_bootstrap_configs=cfgs, - ) - - print(f"[Dispatch-Only] Compiling kernels for {platform}...", flush=True) - dispatch_cc = build_dispatch_only_callable(platform) - print("[Dispatch-Only] All kernels compiled successfully", flush=True) - - print("[Dispatch-Only] Initializing 
worker...", flush=True) - worker.init() - contexts = worker.chip_contexts - print(f"[Dispatch-Only] Worker initialized with {len(contexts)} contexts", flush=True) - - try: - def orch_fn(orch, _args, cfg): - print(f"[Dispatch-Only] Submitting tasks for {num_cards} cards", flush=True) - for i in range(num_cards): - dispatch_args = TaskArgs() - dispatch_args.add_tensor(make_tensor_arg(host_send[i]), TensorArgType.INPUT) - dispatch_args.add_tensor(make_tensor_arg(host_recv[i]), TensorArgType.OUTPUT_EXISTING) - dispatch_args.add_tensor(make_tensor_arg(host_output[i]), TensorArgType.OUTPUT_EXISTING) - - from simpler.task_interface import ContinuousTensor - dispatch_args.add_tensor( - ContinuousTensor.make( - data=contexts[i].buffer_ptrs["scratch"], - shapes=(scratch_count,), - dtype=DataType.FLOAT32, - child_memory=True, - ), - TensorArgType.INOUT, - ) - - dispatch_args.add_scalar(i) # expert_id - dispatch_args.add_scalar(i) # card_id - dispatch_args.add_scalar(num_cards) - dispatch_args.add_scalar(contexts[i].device_ctx) - - result = orch.submit_next_level(dispatch_cc, dispatch_args, cfg, worker=i) - print(f"[Dispatch-Only] Submitted task for card {i}", flush=True) - - print("[Dispatch-Only] Running dispatch-only test...", flush=True) - worker.run(orch_fn, args=None, config=CallConfig()) - print("[Dispatch-Only] Test completed", flush=True) - - # Compute golden recv using dispatch logic - def compute_golden_recv(num_cards, host_send): - """ - Compute golden recv using dispatch logic: - For card i (processing expert i): - recv[i][j][:COUNT][:] = card j's send[expert_i][:COUNT][:] - NOTE: Dispatch only processes first COUNT tokens, not all NUM_TOKENS! - """ - golden_recvs = [] - for cardi in range(num_cards): - recv = torch.zeros(num_cards, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32) - for cardj in range(num_cards): - # Card i receives from card j: card j's send[expert_i] - # expert_i = cardi (because card i processes expert i) - # Only copy first COUNT tokens! - recv[cardj, :COUNT, :] = host_send[cardj][cardi, :COUNT, :] - golden_recvs.append(recv) - return golden_recvs - - golden_recvs = compute_golden_recv(num_cards, host_send) - - # Verify correctness - print("\n" + "="*80) - print("[Dispatch-Only] VERIFICATION:") - print("="*80) - print("[Dispatch-Only] Comparing actual recv vs golden recv...") - print(f"[Dispatch-Only] Recv shape: {host_recv[0].shape} (num_cards={num_cards}, NUM_TOKENS={NUM_TOKENS}, HIDDEN_DIM={HIDDEN_DIM})") - - all_match = True - for i in range(num_cards): - max_diff = float(torch.max(torch.abs(host_recv[i] - golden_recvs[i]))) - mean_diff = float(torch.mean(torch.abs(host_recv[i] - golden_recvs[i]))) - print(f"[Dispatch-Only] Card {i}: max |recv - golden| = {max_diff:.6e}, mean diff = {mean_diff:.6e}") - - if max_diff > 1e-3: - all_match = False - print(f"[Dispatch-Only] Card {i} MISMATCH! 
Full recv data:") - for card_j in range(num_cards): - for t in range(NUM_TOKENS): - print(f" recv[{card_j}][{t}][:3] = {host_recv[i][card_j, t, :3].tolist()}") - print(f" golden[{card_j}][{t}][:3] = {golden_recvs[i][card_j, t, :3].tolist()}") - else: - print(f"[Dispatch-Only] Card {i} ✅ matches golden") - - if all_match: - print("\n[Dispatch-Only] ✅ All cards matched golden!") - return 0 - else: - print("\n[Dispatch-Only] ❌ Some cards did NOT match golden!") - return 1 - - except Exception as e: - print(f"[Dispatch-Only] ERROR: {e}") - import traceback - traceback.print_exc() - return 1 - - finally: - print("[Dispatch-Only] Shutting down worker...") - worker.close() - try: - os.unlink(rootinfo_path) - except FileNotFoundError: - pass - - -def main() -> int: - args = parse_args() - device_ids = parse_device_range(args.device) - return run(args.platform, device_ids) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/examples/workers/l3/moe_multi_chip_experts/test_end2end.py b/examples/workers/l3/moe_multi_chip_experts/test_end2end.py deleted file mode 100755 index 8afe15d88..000000000 --- a/examples/workers/l3/moe_multi_chip_experts/test_end2end.py +++ /dev/null @@ -1,398 +0,0 @@ -#!/usr/bin/env python3 -# Test complete MoE pipeline: Dispatch + Compute + Combine - -import argparse -import os -import sys - -os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE") - -import torch -from simpler.task_interface import ( - ArgDirection, - CallConfig, - ChipBootstrapConfig, - ChipBufferSpec, - ChipCallable, - ChipCommBootstrapConfig, - CoreCallable, - DataType, - TaskArgs, - TensorArgType, -) -from simpler.worker import Worker - -from simpler_setup.kernel_compiler import KernelCompiler -from simpler_setup.pto_isa import ensure_pto_isa_root -from simpler_setup.torch_interop import make_tensor_arg - -HERE = os.path.dirname(os.path.abspath(__file__)) - -# MoE configuration -NUM_TOKENS = 10 -HIDDEN_DIM = 16 -COUNT = 4 - - -def parse_args(): - parser = argparse.ArgumentParser(description="Test complete MoE pipeline (Dispatch + Compute + Combine)") - parser.add_argument("-p", "--platform", required=True, choices=["a2a3sim", "a2a3", "a5sim", "a5"]) - parser.add_argument("-d", "--device", default="0-1", help="Device range") - return parser.parse_args() - - -def parse_device_range(spec: str) -> list[int]: - if "-" in spec: - lo, hi = (int(x) for x in spec.split("-")) - return list(range(lo, hi + 1)) - elif "," in spec: - return [int(x) for x in spec.split(",")] - else: - return [int(spec)] - - -def build_end2end_callable(platform: str) -> ChipCallable: - """Build callable with dispatch + compute + combine kernels.""" - print("[End2End] Compiling kernels...", flush=True) - kc = KernelCompiler(platform=platform) - runtime = "tensormap_and_ringbuffer" - pto_isa_root = ensure_pto_isa_root(clone_protocol="https") - include_dirs = kc.get_orchestration_include_dirs(runtime) - kernel_include_dirs = list(include_dirs) + [str(kc.project_root / "src" / "common")] - - # Compile dispatch kernel - dispatch_bytes = kc.compile_incore( - source_path=os.path.join(HERE, "kernels/aiv/moe_dispatch_alltoall.cpp"), - core_type="aiv", - pto_isa_root=pto_isa_root, - extra_include_dirs=kernel_include_dirs, - ) - print("[End2End] Dispatch kernel compiled", flush=True) - - # Compile compute kernel - compute_bytes = kc.compile_incore( - source_path=os.path.join(HERE, "kernels/aiv/moe_simple_compute.cpp"), - core_type="aiv", - pto_isa_root=pto_isa_root, - extra_include_dirs=include_dirs, - ) - print("[End2End] Compute kernel 
compiled", flush=True) - - # Compile combine kernel - combine_bytes = kc.compile_incore( - source_path=os.path.join(HERE, "kernels/aiv/moe_combine_alltoall2.cpp"), - core_type="aiv", - pto_isa_root=pto_isa_root, - extra_include_dirs=kernel_include_dirs, - ) - print("[End2End] Combine kernel compiled", flush=True) - - if not platform.endswith("sim"): - from simpler_setup.elf_parser import extract_text_section - dispatch_bytes = extract_text_section(dispatch_bytes) - compute_bytes = extract_text_section(compute_bytes) - combine_bytes = extract_text_section(combine_bytes) - print("[End2End] Text sections extracted", flush=True) - - # Compile orchestration - print("[End2End] Compiling orchestration...", flush=True) - orch_bytes = kc.compile_orchestration( - runtime_name=runtime, - source_path=os.path.join(HERE, "kernels/orchestration/moe_end2end_orch.cpp"), - ) - print("[End2End] Orchestration compiled", flush=True) - - # Build core callables - dispatch_cc = CoreCallable.build( - signature=[ArgDirection.IN, ArgDirection.OUT, ArgDirection.INOUT, - ArgDirection.IN, ArgDirection.IN, ArgDirection.IN], - binary=dispatch_bytes, - ) - - compute_cc = CoreCallable.build( - signature=[ArgDirection.INOUT, ArgDirection.IN, ArgDirection.IN, ArgDirection.IN], - binary=compute_bytes, - ) - - combine_cc = CoreCallable.build( - signature=[ArgDirection.IN, ArgDirection.OUT, ArgDirection.INOUT, ArgDirection.OUT, - ArgDirection.IN, ArgDirection.IN, ArgDirection.IN], - binary=combine_bytes, - ) - - return ChipCallable.build( - signature=[ - ArgDirection.IN, # send - ArgDirection.OUT, # recv - ArgDirection.OUT, # output - ArgDirection.INOUT, # scratch - ArgDirection.INOUT, # scratch_test - ArgDirection.OUT, # scratch_print - ArgDirection.IN, # expert_id - ArgDirection.IN, # card_id - ArgDirection.IN, # num_cards - ArgDirection.IN, # CommContext* - ], - func_name="aicpu_orchestration_entry", - binary=orch_bytes, - children=[(0, dispatch_cc), (1, compute_cc), (2, combine_cc)], # All three phases - ) - - -def compute_golden_end2end(num_cards: int, host_send: list[torch.Tensor]) -> list[torch.Tensor]: - """ - Compute golden output for end-to-end pipeline: - 1. Dispatch: send[card_j][expert_i][:COUNT][:] -> recv[card_i][card_j][:COUNT][:] - 2. Compute: recv[card_i][card_j][:COUNT][:] += 1.0 - 3. 
Combine: recv[expert_j][card_i][:COUNT][:] -> output[card_i][expert_j][:COUNT][:] - - Send initialization: unique values using (card * 1000000 + expert * 10000 + token * 100 + dim) - """ - golden_outputs = [] - for cardi in range(num_cards): - output = torch.zeros(num_cards, COUNT, HIDDEN_DIM, dtype=torch.float32) - for expertj in range(num_cards): - for t in range(COUNT): - for d in range(HIDDEN_DIM): - # After dispatch: recv[cardi][expertj][:][:] = send[expertj][cardi][:][:] - # Value from cardi's send[expertj][cardi][t][d] - send_value = host_send[cardi][expertj, t, d].item() - # After compute: recv += 1.0 - recv_value = send_value + 1.0 - # After combine: output[cardi][expertj][t][d] = recv[expertj][cardi][t][d] - output[expertj, t, d] = recv_value - golden_outputs.append(output) - - return golden_outputs - - -def run(platform: str, device_ids: list[int]) -> int: - print(f"[End2End] Testing complete MoE pipeline on devices {device_ids}", flush=True) - num_cards = len(device_ids) - num_experts = num_cards - - # Configure HCCL - scratch_count = num_cards * num_cards * NUM_TOKENS * HIDDEN_DIM - scratch_nbytes = scratch_count * 4 - total_scratch_nbytes = scratch_nbytes + num_cards * 4 - window_size = max(total_scratch_nbytes, 4 * 1024) - - print(f"\n[End2End] Test Configuration:") - print(f" Platform: {platform}") - print(f" Number of cards: {num_cards}") - print(f" Device IDs: {device_ids}") - print(f" NUM_TOKENS: {NUM_TOKENS}") - print(f" HIDDEN_DIM: {HIDDEN_DIM}") - print(f" COUNT (tokens processed): {COUNT}") - - rootinfo_path = f"/tmp/pto_end2end_{os.getpid()}.bin" - try: - os.unlink(rootinfo_path) - except FileNotFoundError: - pass - - torch.manual_seed(42) - - # Allocate tensors with unique values to trace data flow - # Value encoding: (card_id * 1000000) + (expert_id * 10000) + (token * 100) + dim - host_send = [] - for i, device_id in enumerate(device_ids): - send = torch.zeros(num_experts, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() - for expert_j in range(num_experts): - for t in range(NUM_TOKENS): - for d in range(HIDDEN_DIM): - # Unique value: card_i -> expert_j -> token_t -> dim_d - value = float(i * 1000000 + expert_j * 10000 + t * 100 + d) - send[expert_j, t, d] = value - host_send.append(send) - host_recv = [torch.zeros(num_cards, NUM_TOKENS, HIDDEN_DIM, dtype=torch.float32).share_memory_() - for _ in device_ids] - host_output = [torch.zeros(num_cards, COUNT, HIDDEN_DIM, dtype=torch.float32).share_memory_() - for _ in device_ids] - - # Allocate scratch_print tensor (debug output) - host_scratch_print = [torch.zeros(scratch_count, dtype=torch.float32).share_memory_() - for _ in device_ids] - - print(f"\n[End2End] Allocated tensors:") - print(f" send=unique_values, recv=0.0, output=0.0") - print(f" Value encoding: (card_id * 1000000) + (expert_id * 10000) + (token * 100) + dim", flush=True) - - # Compute golden output - print("\n[End2End] Computing golden output...") - golden_outputs = compute_golden_end2end(num_cards, host_send) - print("[End2End] Golden output computed", flush=True) - - # Configure HCCL bootstrap with two independent scratch buffers - cfgs = [ - ChipBootstrapConfig( - comm=ChipCommBootstrapConfig( - rank=rank, - nranks=num_cards, - rootinfo_path=rootinfo_path, - window_size=window_size, - ), - buffers=[ - ChipBufferSpec( - name="scratch", - dtype="float32", - count=scratch_count, - nbytes=total_scratch_nbytes, - ), - ChipBufferSpec( - name="scratch_test", - dtype="float32", - count=scratch_count, - nbytes=total_scratch_nbytes, - ), - ], - 
) - for rank in range(num_cards) - ] - - # Create worker - worker = Worker( - level=3, - platform=platform, - runtime="tensormap_and_ringbuffer", - device_ids=device_ids, - num_sub_workers=0, - chip_bootstrap_configs=cfgs, - ) - - print(f"\n[End2End] Compiling kernels for {platform}...", flush=True) - end2end_cc = build_end2end_callable(platform) - print("[End2End] All kernels compiled successfully", flush=True) - - print("[End2End] Initializing worker...", flush=True) - worker.init() - contexts = worker.chip_contexts - print(f"[End2End] Worker initialized with {len(contexts)} contexts", flush=True) - - try: - def orch_fn(orch, _args, cfg): - print(f"[End2End] Submitting tasks for {num_cards} cards", flush=True) - for i in range(num_cards): - args = TaskArgs() - args.add_tensor(make_tensor_arg(host_send[i]), TensorArgType.INPUT) - args.add_tensor(make_tensor_arg(host_recv[i]), TensorArgType.OUTPUT_EXISTING) - args.add_tensor(make_tensor_arg(host_output[i]), TensorArgType.OUTPUT_EXISTING) - - from simpler.task_interface import ContinuousTensor - args.add_tensor( - ContinuousTensor.make( - data=contexts[i].buffer_ptrs["scratch"], - shapes=(scratch_count,), - dtype=DataType.FLOAT32, - child_memory=True, - ), - TensorArgType.INOUT, - ) - args.add_tensor( - ContinuousTensor.make( - data=contexts[i].buffer_ptrs["scratch_test"], - shapes=(scratch_count,), - dtype=DataType.FLOAT32, - child_memory=True, - ), - TensorArgType.INOUT, - ) - args.add_tensor(make_tensor_arg(host_scratch_print[i]), TensorArgType.OUTPUT_EXISTING) - - args.add_scalar(i) # expert_id - args.add_scalar(i) # card_id - args.add_scalar(num_cards) - args.add_scalar(contexts[i].device_ctx) - - result = orch.submit_next_level(end2end_cc, args, cfg, worker=i) - print(f"[End2End] Submitted task for card {i}", flush=True) - - print("\n[End2End] Running end-to-end test...", flush=True) - - worker.run(orch_fn, args=None, config=CallConfig()) - print("\n[End2End] End-to-end pipeline completed!", flush=True) - - # Print results - print("\n" + "="*80) - print("[End2End] OUTPUT DATA:") - print("="*80) - - for i in range(num_cards): - print(f"\n[End2End] Card {i} output data:") - print(f" Expected: Each value = send_value + 1.0") - print(f" Sample data (first 2 experts, first {COUNT} tokens, first 3 dims):") - - for expert_j in range(min(2, num_cards)): - print(f" Expert {expert_j}:") - for t in range(min(COUNT, 2)): - vals = host_output[i][expert_j, t, :3].tolist() - golden_vals = golden_outputs[i][expert_j, t, :3].tolist() - print(f" Token {t}: Output={vals}, Golden={golden_vals}") - - # Verify correctness - print("\n" + "="*80) - print("[End2End] VERIFICATION:") - print("="*80) - - all_correct = True - error_count = 0 - total_checked = 0 - - for i in range(num_cards): - print(f"\n[End2End] Card {i}:") - card_errors = 0 - - for expert_j in range(num_cards): - for t in range(COUNT): - for d in range(HIDDEN_DIM): - expected = golden_outputs[i][expert_j, t, d].item() - actual = host_output[i][expert_j, t, d].item() - total_checked += 1 - - if abs(actual - expected) > 1e-3: - card_errors += 1 - error_count += 1 - all_correct = False - - if card_errors == 0: - print(f" ✓ All {num_cards * COUNT * HIDDEN_DIM} values correct") - else: - print(f" ✗ {card_errors} / {num_cards * COUNT * HIDDEN_DIM} values incorrect") - - print(f"\n Total: {total_checked - error_count}/{total_checked} correct") - - # Final verdict - print("\n" + "="*80) - print("[End2End] FINAL VERDICT:") - print("="*80) - - if all_correct: - print("\n[End2End] ✅ All values correct! 
End-to-end pipeline works perfectly.") - return 0 - else: - print("\n[End2End] ❌ Some values incorrect!") - return 1 - - except Exception as e: - print(f"[End2End] ERROR: {e}") - import traceback - traceback.print_exc() - return 1 - - finally: - print("[End2End] Shutting down worker...") - worker.close() - try: - os.unlink(rootinfo_path) - except FileNotFoundError: - pass - - -def main() -> int: - args = parse_args() - device_ids = parse_device_range(args.device) - return run(args.platform, device_ids) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/examples/workers/l3/moe_multi_chip_experts/test_moe_multi_chip.py b/examples/workers/l3/moe_multi_chip_experts/test_moe_multi_chip_experts.py similarity index 70% rename from examples/workers/l3/moe_multi_chip_experts/test_moe_multi_chip.py rename to examples/workers/l3/moe_multi_chip_experts/test_moe_multi_chip_experts.py index 9d40cd77e..c501d8900 100644 --- a/examples/workers/l3/moe_multi_chip_experts/test_moe_multi_chip.py +++ b/examples/workers/l3/moe_multi_chip_experts/test_moe_multi_chip_experts.py @@ -13,7 +13,7 @@ from .main import run -@pytest.mark.platforms(["a2a3sim", "a2a3", "a5sim", "a5"]) +@pytest.mark.platforms(["a2a3"]) @pytest.mark.runtime("tensormap_and_ringbuffer") @pytest.mark.device_count(2) def test_moe_multi_chip_2_experts(st_platform, st_device_ids): @@ -24,16 +24,3 @@ def test_moe_multi_chip_2_experts(st_platform, st_device_ids): """ rc = run(st_platform, [int(d) for d in st_device_ids]) assert rc == 0 - - -@pytest.mark.platforms(["a2a3sim", "a2a3"]) -@pytest.mark.runtime("tensormap_and_ringbuffer") -@pytest.mark.device_count(4) -def test_moe_multi_chip_4_experts(st_platform, st_device_ids): - """Test multi-chip MoE with 4 experts (1 per chip). - - This should produce the SAME results as moe_single_chip with 4 experts, - just executed in parallel across 4 chips instead of sequentially on 1 chip. - """ - rc = run(st_platform, [int(d) for d in st_device_ids]) - assert rc == 0