Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
108 commits
Select commit Hold shift + click to select a range
d7cd7ae
feat(multipipeline): isolate namespaces and ports
taoluo Feb 13, 2026
f679c43
feat(multipipeline): progress + shrink-to-zero
taoluo Feb 13, 2026
a74e4a2
feat(schedrl): add ROLL adapter entrypoint
taoluo Feb 13, 2026
0c47a7d
fix(config): avoid eval in worker config
taoluo Feb 13, 2026
3957699
feat(roll): implement resize_infer and multi-pipeline support
taoluo Feb 14, 2026
4b6a414
refactor(schedrl_adapter): simplify adapter API and add static cluste…
taoluo Feb 15, 2026
7263b3a
feat(roll): propagate SchedRL env vars via runtime_env for Ray actors
taoluo Feb 16, 2026
81923dd
fix(roll): resource manager GPU placement and CPU platform compatibility
taoluo Feb 16, 2026
8e22b44
feat(roll): improve bucket cache for multiprocess-safe selective sync
taoluo Feb 16, 2026
f644288
feat(roll): major SchedRL concurrent_pipeline refactor and vLLM compa…
taoluo Feb 16, 2026
7c002f8
chore(roll): update example configs and requirements for smoke testing
taoluo Feb 16, 2026
c398b68
feat(roll): adapt to simplified SchedRL API with state verification
taoluo Feb 17, 2026
1b23c84
feat(collective): add timeout_s, fail-fast KeyError, and teardown helper
taoluo Feb 18, 2026
e1957e3
feat(model_update): comm_plan-based selective sync with NCCL teardown…
taoluo Feb 18, 2026
2cb08bd
feat(cluster): add resolve_topology flag to skip blocking ray.get in …
taoluo Feb 18, 2026
2a1074c
feat(scheduler): non-blocking init, local PG allocation, SchedRL expa…
taoluo Feb 18, 2026
1db761b
fix(pipeline): re-offload actor_train after checkpoint to prevent GPU…
taoluo Feb 18, 2026
e43f9b2
fix(misc): sync resize_infer, asyncio fixes, request tracing logs, co…
taoluo Feb 18, 2026
5796a53
refactor(schedrl): move notify_ready_to_release to end of pipeline loop
taoluo Feb 19, 2026
c54fa9d
feat(lora): add lora_routing utility for multi-LoRA microbatch dispatch
taoluo Feb 19, 2026
044ab18
feat(config): add adapters field for multi-LoRA configuration
taoluo Feb 19, 2026
9ba3721
feat(megatron): add per-adapter multi-LoRA training support
taoluo Feb 19, 2026
a570280
feat(sft): add train_step_lora and LoRA weight management methods
taoluo Feb 19, 2026
d505be1
test(integration): add per-adapter single LoRA step equivalence test
taoluo Feb 19, 2026
3c8f0a4
feat(multi-lora): add adapters_to_update parameter to model_update
taoluo Feb 20, 2026
cf70df8
feat(multi-lora): add per-adapter checkpoint promotion and selective …
taoluo Feb 20, 2026
a8939b6
feat(multi-lora): add model_update_lora_subset helper method
taoluo Feb 20, 2026
c6184ac
feat(multi-lora): add train_step_lora RPC to ActorWorker
taoluo Feb 20, 2026
3d1ae29
feat(multi-lora): add SchedRLMultiLoraPipeline implementation
taoluo Feb 20, 2026
cfff6e9
feat(multi-lora): add pipeline registration and shared RequestSchedul…
taoluo Feb 20, 2026
bf5c5d5
feat(multi-lora): add per-adapter cache, RNG state, and selective syn…
taoluo Feb 20, 2026
cb137ab
feat(multi-lora): add _op_lock and notify_adapter_updated for selecti…
taoluo Feb 20, 2026
f5a63bf
feat(multi-lora): add adapters_to_sync support in model update service
taoluo Feb 20, 2026
d049a7a
fix(multi-lora): PP support and per-adapter optimizer fixes
taoluo Feb 21, 2026
b7effa5
feat(multi-lora): add setup_lora_training_from_adapters for multi-ada…
taoluo Feb 21, 2026
8414eaa
fix: misc robustness improvements for PP and distributed setup
taoluo Feb 21, 2026
16cf323
test(multi-lora): add TC5 for PP=2 and improve test robustness
taoluo Feb 21, 2026
ed9b165
(multi-lora): passed the tp2 pp2 test case for multi lora
taoluo Feb 21, 2026
8790783
test(multi-lora): add TC6 tp2pp2 and TC7 dp2pp2
taoluo Feb 21, 2026
1fe3999
feat(lora): add multi-LoRA routing utilities and adapter config norma…
taoluo Feb 22, 2026
9eb9ce2
feat(vllm): add multi-LoRA routing support to vLLM strategy
taoluo Feb 22, 2026
722fcf6
feat(env): inject lora_name in env managers for multi-LoRA routing
taoluo Feb 22, 2026
8130534
feat(pipeline): add multi-LoRA integration to workers and schedulers
taoluo Feb 22, 2026
a9e5da2
feat(examples): add multi-LoRA pipeline and smoke test configs
taoluo Feb 22, 2026
4caf875
chore(utils): add lora_name support to collective utilities
taoluo Feb 22, 2026
7c4d3dd
fix(sft): ensure lora_name broadcast before validation in train_step_…
taoluo Feb 22, 2026
9c15a5f
chore(examples): replace sokoban_grpo configs with full_finetune and …
taoluo Feb 23, 2026
179b85f
feat(multi-lora): update strategy, workers, and scheduler for multi-L…
taoluo Feb 23, 2026
4faa27c
fix(multi-pipeline): thread limits and barrier_mode removal
taoluo Feb 24, 2026
04a630c
fix(examples): use HuggingFace and set actor_infer lora_rank to 8
taoluo Feb 24, 2026
b5352aa
feat(adapter): pass lora_name to scheduler for GPU trace labels
taoluo Feb 24, 2026
244ba45
fix(vllm): stream base weights one-at-a-time and free sender GPU bucket
taoluo Feb 25, 2026
fe8634a
fix(adapter): validate offload_nccl and scope LoRA verify to expanded…
taoluo Feb 25, 2026
0801971
fix(examples): reduce sequence_length and enable dynamic batching in …
taoluo Feb 25, 2026
51e37e4
fix(adapter): close HEAD gaps in concurrent_pipeline run()
taoluo Feb 27, 2026
64570ba
feat(multi-lora): per-adapter run loop, adapter sync, and load_states…
taoluo Feb 27, 2026
a7a6327
fix(pipeline): offload GPU states after checkpoint to prevent OOM on …
taoluo Feb 28, 2026
6333f46
add prefix for tracker
taoluo Feb 28, 2026
fe5a4cd
rename lora names
taoluo Feb 28, 2026
8b25f39
fix(multi-lora): use deque for fair FIFO wait order in get_batch loop
taoluo Mar 1, 2026
982c8c1
refactor(schedrl): replace raw namespace/actor name strings with type…
taoluo Mar 2, 2026
149a5a4
refactor(schedrl): remove unused dead code
taoluo Mar 2, 2026
99cb31b
refactor(schedrl): migrate schedrl_adapter and examples to main sched…
taoluo Mar 2, 2026
3d83318
refactor(rlix): rename schedrl to rlix across codebase
taoluo Mar 2, 2026
29b1aea
refactor: rename _is_library_mode to do_time_sharing
taoluo Mar 3, 2026
4536cd2
docs(review): add explanatory comments to Step 1 config & foundation …
taoluo Mar 3, 2026
e93bfa0
docs(review): add explanatory comments to Step 2 utils & collective c…
taoluo Mar 3, 2026
6c17962
fix(functionals): fail fast when prompts.batch is None in postprocess…
taoluo Mar 3, 2026
c0af0eb
docs(functionals): explain np.repeat ordering for non_tensor_batch ex…
taoluo Mar 3, 2026
f949463
refactor: replace RLIX_CONTROL_PLANE checks with DO_TIME_SHARING cons…
taoluo Mar 3, 2026
067b79a
perf(send_recv): restore CUDA IPC for vLLM with lazy-probed fallback
taoluo Mar 3, 2026
4b35aa7
refactor(mcore_adapter): simplify LoRA layer and remove untested abst…
taoluo Mar 4, 2026
8ae1532
refactor(shrink-expand): extract GPU→dp_rank translation utils, adopt…
taoluo Mar 4, 2026
cdeeedb
chore: remove rlix_request_id from env managers and log paths
taoluo Mar 4, 2026
7f348eb
fix(scheduler): simplify LoadBalancer acquisition and fix decorator p…
taoluo Mar 4, 2026
8f4b044
refactor(constants): expand rlix_env_vars with additional thread-limi…
taoluo Mar 5, 2026
4120025
docs(scheduler): add docstrings and clarify time-sharing comments
taoluo Mar 5, 2026
9018006
refactor(rollout_scheduler): switch to pipeline-namespace coordinator…
taoluo Mar 5, 2026
0bda8c7
refactor(resource-manager): inline actor creation into RollResourceMa…
taoluo Mar 5, 2026
ae5e1d7
refactor(cache): simplify bucket cache key to checkpoint_version only
taoluo Mar 5, 2026
00b6e29
refactor(distributed): simplify Cluster topology and unify collective…
taoluo Mar 5, 2026
77ee8c4
refactor(vllm): remove dead LoRA verification methods
taoluo Mar 5, 2026
e505828
fix(lora): restore offload guard, defer registration, evict at all sl…
taoluo Mar 5, 2026
9dd1540
refactor(send_recv): remove bucket_bytes CPU fallback path
taoluo Mar 6, 2026
2a8aa47
refactor(strategy): dedup load/offload_states, fix IPC torch reductio…
taoluo Mar 8, 2026
8472e2f
refactor(lora): align setup_lora_training_from_adapters to upstream a…
taoluo Mar 9, 2026
082be54
refactor(env_manager): extract duplicated LoRA injection into _resolv…
taoluo Mar 9, 2026
160a341
refactor(pipeline): code review fixes across pipeline and worker modules
taoluo Mar 9, 2026
50587c0
refactor(pipeline): fix offload_states arg forwarding, multi-LoRA ful…
taoluo Mar 10, 2026
3fb2700
chore(pipeline): add TODO for fine-granular rollout interruption per-…
taoluo Mar 10, 2026
389b3ac
fix
taoluo Mar 10, 2026
e9348e5
fix: multi-pipeline InferWorker deadlocks, multi-LoRA OOM, and expand…
taoluo Mar 11, 2026
b767577
feat(transport): add cpu_pickle transport for colocated model weight …
taoluo Mar 11, 2026
9170beb
feat: add post-sync weight verification for base model and LoRA adapters
taoluo Mar 12, 2026
d309e61
feat(config): add verify_model_after_sync flag (disabled by default)
taoluo Mar 12, 2026
aec2ed2
chore(config): enable verify_model_after_sync in test configs
taoluo Mar 12, 2026
09c7244
perf: compute sender stats on GPU before CPU copy in cache builder
taoluo Mar 12, 2026
959c981
fix: resolve multi-lora RequestScheduler name collision and tied weig…
taoluo Mar 12, 2026
c4d51eb
chore: disable wandb tracking in test config
taoluo Mar 12, 2026
4ef56bc
fix(vllm): exclude add_lora_count from adapter hash to prevent LRU ev…
taoluo Mar 12, 2026
2718922
feat: make rlix dependency optional for standalone ROLL usage
taoluo Mar 12, 2026
f684386
fix(megatron): require multiple adapters for multi-adapter mode
taoluo Mar 12, 2026
650615e
feat: optimize cpu_serialize transport with torch.save + pinned memory
taoluo Mar 13, 2026
8000112
chore: remove .claude/plans and design_docs
taoluo Mar 14, 2026
e36027f
fix(rollout): explicit progress batch lifecycle with begin/end
taoluo Mar 17, 2026
6e87932
refactor(rollout): remove unused ProgressReport constructor fields
taoluo Mar 17, 2026
3507b19
refactor(rollout): emit raw collected instead of remaining in heartbeat
taoluo Mar 17, 2026
4989ec4
fix: rename ROLL_rlix references to ROLL
taoluo Mar 21, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 27 additions & 26 deletions examples/qwen2.5-0.5B-agentic/agentic_val_sokoban_lora.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,26 +29,27 @@ system_envs:
# - roll
# - baseline

track_with: tensorboard
track_with: stdout # Disable TensorBoard in smoke run to bypass SummaryWriter type constraints.
tracker_kwargs:
log_dir: /data/oss_bucket_0/yali/llm/tensorboard/roll_exp/agentic_sokoban
log_dir: ./output/tensorboard/agentic_sokoban_lora_smoke # Use local path so smoke test does not depend on external mount.


checkpoint_config:
type: file_system
output_dir: /data/cpfs_0/rl_examples/models/${exp_name}
output_dir: /tmp/roll_output/agentic_sokoban_lora_smoke # Keep checkpoint path local for a portable smoke run.

num_gpus_per_node: 8
num_gpus_per_node: 4 # Fit smoke test to a 4-GPU node.

max_steps: 1024
max_steps: 3 # Minimal training smoke: one training step is enough to verify end-to-end path.
save_steps: 10000
logging_steps: 1
eval_steps: 10
eval_steps: 0 # Disable eval loop for faster smoke validation.
resume_from_checkpoint: false
async_generation_ratio: 1 # Required by partial_gpu_mode validation in agentic_pipeline.

rollout_batch_size: 1024
val_batch_size: 1024
sequence_length: 8192
rollout_batch_size: 4 # Keep rollout tiny to reduce runtime/memory.
val_batch_size: 4
sequence_length: 2048 # Reduce memory pressure while preserving normal train path.

advantage_clip: 0.2
ppo_epochs: 1
Expand All @@ -75,9 +76,9 @@ actor_train:
training_args:
learning_rate: 2.0e-5
weight_decay: 0
per_device_train_batch_size: 2
gradient_accumulation_steps: 64
warmup_steps: 10
per_device_train_batch_size: 1 # Minimal micro-batch for smoke stability.
gradient_accumulation_steps: 2
warmup_steps: 1
lr_scheduler_type: cosine
data_args:
template: qwen2_5
Expand All @@ -91,7 +92,7 @@ actor_train:
expert_model_parallel_size: 1
use_distributed_optimizer: true
recompute_granularity: full
device_mapping: list(range(0,8))
device_mapping: list(range(0,2)) # Constrain actor_train to 2 GPUs for this smoke profile.
infer_batch_size: 2

actor_infer:
Expand All @@ -102,11 +103,11 @@ actor_infer:
lora_rank: 32
lora_alpha: 32
generating_args:
max_new_tokens: 128 # single-turn response length
top_p: 0.99
top_k: 100
max_new_tokens: 64 # Shorter generation keeps smoke test fast.
top_p: 1
top_k: 3
num_beams: 1
temperature: 0.99
temperature: 0.0
num_return_sequences: 1
data_args:
template: qwen2_5
Expand All @@ -116,7 +117,7 @@ actor_infer:
gpu_memory_utilization: 0.8
block_size: 16
load_format: auto
device_mapping: list(range(0,8))
device_mapping: list(range(0,4)) # Constrain actor_infer to same 4-GPU pool.

reference:
model_args:
Expand All @@ -129,7 +130,7 @@ reference:
strategy_args:
strategy_name: hf_infer
strategy_config: ~
device_mapping: list(range(0,8))
device_mapping: list(range(0,2)) # Keep reference on 2 GPUs, consistent with the reduced smoke topology.
infer_batch_size: 2

reward_normalization:
Expand All @@ -138,19 +139,19 @@ reward_normalization:

train_env_manager:
format_penalty: -0.15 # sokoban env penalty_for_step=-0.1
max_env_num_per_worker: 16
num_env_groups: 128
max_env_num_per_worker: 4 # Smaller env fanout for quick smoke startup.
num_env_groups: 2
# under the same group, the env config and env seed are ensured to be equal
group_size: 8
group_size: 2
tags: [SimpleSokoban]
num_groups_partition: [128] # If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation
num_groups_partition: [2] # Match reduced group count for smoke.

val_env_manager:
max_env_num_per_worker: 32
num_env_groups: 1024
max_env_num_per_worker: 4 # Keep validation manager light even though eval is disabled.
num_env_groups: 4
group_size: 1 # should be set to 1 because val temperature is set to 0 and same prompt leads to same output
tags: [SimpleSokoban, LargerSokoban, SokobanDifferentGridVocab, FrozenLake]
num_groups_partition: [256, 256, 256, 256] # TODO: If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation
num_groups_partition: [1, 1, 1, 1] # Minimal partitioning for smoke.


# Here, you can override variables defined in the imported envs. max_tokens_per_step: 128 in custom_env.SimpleSokoban, here replaced by 64
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
defaults:
- ../config/traj_envs@_here_
- ../config/deepspeed_zero@_here_
- ../config/deepspeed_zero2@_here_
- ../config/deepspeed_zero3@_here_
- ../config/deepspeed_zero3_cpuoffload@_here_

hydra:
run:
dir: .
output_subdir: null

pipeline_cls: roll.pipeline.agentic.agentic_multi_lora_pipeline.AgenticMultiLoraPipeline



exp_name: "agent_train_sokoban_multi_lora1"
seed: 42
logging_dir: ./output/lora_pipeline1/logs
output_dir: ./output/lora_pipeline1
render_save_dir: /tmp/roll_output/lora_pipeline1/render

# track_with: wandb
# tracker_kwargs:
# entity: "khd6t7hdhn-university-of-pennsylvania"
# project: "rlix"
# api_key: "${oc.env:WANDB_API_KEY}"


system_envs:
USE_MODELSCOPE: "0"
NCCL_SHM_DISABLE: "1"
RAY_PROFILING: "1"
RAY_DEDUP_LOGS: "0"
RAY_TMPDIR: "${oc.env:RAY_TMPDIR,/tmp}"
OMP_NUM_THREADS: "1"
MKL_NUM_THREADS: "1"
OPENBLAS_NUM_THREADS: "1"
RAY_grpc_server_thread_pool_size: "4"
TORCHINDUCTOR_COMPILE_THREADS: "1"
TORCHINDUCTOR_MAX_AUTOTUNE: "0"
# Container lacks SYS_PTRACE capability; disable vLLM custom all-reduce IPC and use NCCL fallback
VLLM_DISABLE_CUSTOM_ALL_REDUCE: "1"

checkpoint_config:
type: file_system
output_dir: /tmp/roll_output/multi_lora2/checkpoints

num_gpus_per_node: 2
model_download_type: HUGGINGFACE_HUB
offload_nccl: true
max_steps: 3
model_update_buffer_size_mb: 100 # Limit broadcast bucket to 100 MB to avoid OOM with co-located infer workers
model_update_transport: cpu_serialize # CPU byte serialization; avoids pidfd_getfd error in restricted containers
verify_model_after_sync: true
save_steps: 10000
logging_steps: 1
eval_steps: 20
resume_from_checkpoint: false

async_generation_ratio: 1

rollout_batch_size: 4
val_batch_size: 4
sequence_length: 1024 # Reduced from 2048: Sokoban max_new_tokens=64 needs ~500 tokens max, halves peak activation memory
max_actions_per_traj: 5

advantage_clip: 0.2
ppo_epochs: 1
adv_estimator: "grpo"
init_kl_coef: 0.0
whiten_advantages: true
entropy_loss_coef: 0
max_grad_norm: 1.0

pretrain: Qwen/Qwen2.5-0.5B-Instruct
reward_pretrain: Qwen/Qwen2.5-0.5B-Instruct

actor_train:
offload_nccl: ${offload_nccl}
model_args:
attn_implementation: fa2
disable_gradient_checkpointing: false
dtype: bf16
model_type: ~
adapters:
Sokoban1:
lora_target: all-linear
lora_rank: 8
lora_alpha: 8
Sokoban2:
lora_target: all-linear
lora_rank: 8
lora_alpha: 8
training_args:
learning_rate: 1.0e-6
weight_decay: 0
per_device_train_batch_size: 1
gradient_accumulation_steps: 2
warmup_steps: 1
lr_scheduler_type: cosine
data_args:
template: qwen2_5
strategy_args:
strategy_name: megatron_train
strategy_config:
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
expert_model_parallel_size: 1
use_distributed_optimizer: false
is_lora_optimizer_isolated: true
recompute_granularity: full
sequence_parallel: true
overlap_grad_reduce: false # Isolated LoRA mode requires overlap_grad_reduce disabled to avoid grad-sync hang.
# Note: use_sequence_packing is NOT enabled here — sequence packing mixes sequences from different
# LoRA adapters into one microbatch, violating the adapter-homogeneity constraint in inner_forward_step.
# Note: use_dynamic_batching_in_train is also NOT enabled — incompatible with is_lora_optimizer_isolated=true.
device_mapping: "[0, ]"
infer_batch_size: 1

actor_infer:
offload_nccl: ${offload_nccl}
model_args:
disable_gradient_checkpointing: true
dtype: bf16
adapters:
Sokoban1:
lora_target: q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj
lora_rank: 8
lora_alpha: 8
Sokoban2:
lora_target: q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj
lora_rank: 8
lora_alpha: 8
generating_args:
max_new_tokens: 64
top_p: 1
top_k: 3
num_beams: 1
temperature: 0.0
num_return_sequences: 1
data_args:
template: qwen2_5
strategy_args:
strategy_name: vllm
strategy_config:
VLLM_USE_V1: 1
gpu_memory_utilization: 0.8 # Raise cache budget so vLLM has non-zero KV blocks during two-worker startup.
block_size: 16
load_format: auto
tensor_parallel_size: 1
max_num_batched_tokens: 1024 # Match reduced sequence_length=1024
max_num_seqs: 2
enforce_eager: true
sleep_level: 1
device_mapping: "[0, 1, ]"

reference:
offload_nccl: ${offload_nccl}
model_args:
attn_implementation: fa2
disable_gradient_checkpointing: true
dtype: bf16
model_type: ~
data_args:
template: qwen2_5
strategy_args:
strategy_name: megatron_infer
strategy_config:
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
expert_model_parallel_size: 1
# Dynamic batching on reference (megatron_infer): trims padding per-microbatch to actual token length
# (rounded to sequence_length_round_in_infer). Reduces peak memory during log_prob computation.
use_dynamic_batching_in_infer: true
max_tokens_per_microbatch_in_infer: 1024 # Match reduced sequence_length=1024
sequence_length_round_in_infer: 8
device_mapping: "[0, ]"
infer_batch_size: 1

reward_normalization:
grouping: traj_group_id
method: mean_std

train_env_manager:
format_penalty: -0.15
max_env_num_per_worker: 4
num_env_groups: 2
group_size: 2
tags: [Sokoban1, Sokoban2]
num_groups_partition: [1, 1]

val_env_manager:
max_env_num_per_worker: 4
num_env_groups: 2
group_size: 2
tags: [Sokoban1, Sokoban2]
num_groups_partition: [1, 1]

max_tokens_per_step: 64

custom_envs:
Sokoban1:
${custom_env.SimpleSokoban}
Sokoban2:
${custom_env.SimpleSokoban}
1 change: 1 addition & 0 deletions examples/start_agentic_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def main():
pipeline = pipeline_cls(pipeline_config=ppo_config)

pipeline.run()
print("Pipeline finished.")


if __name__ == "__main__":
Expand Down
14 changes: 14 additions & 0 deletions mcore_adapter/src/mcore_adapter/adapters/lora_layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,9 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any):

if self.sequence_parallel and self.base_layer.parallel_mode == "row":
lora_result = scatter_to_sequence_parallel_region(lora_result)
# Cast per-adapter result before accumulating; each adapter may compute in its own weight dtype.
if lora_result.dtype != previous_dtype:
lora_result = lora_result.to(previous_dtype)
result = result + lora_result

result = result.to(previous_dtype)
Expand Down Expand Up @@ -413,6 +416,8 @@ def _create_lora_layers(self, r, lora_bias, **kwargs):
in_features = self.in_features * self.tp_size

if self.is_grouped:
if not isinstance(TEGroupedLinear, type):
raise RuntimeError("Grouped LoRA layers require Transformer Engine grouped linear support.")
r = r // self.config.moe_router_topk
lora_a = TERowParallelGroupedLinear(
num_gemms=self.base_layer.num_gemms,
Expand Down Expand Up @@ -457,6 +462,8 @@ def _create_lora_layers(self, r, lora_bias, **kwargs):
out_features = self.out_features * self.tp_size

if self.is_grouped:
if not isinstance(TEGroupedLinear, type):
raise RuntimeError("Grouped LoRA layers require Transformer Engine grouped linear support.")
r = r // self.config.moe_router_topk
lora_a = TEGroupedLinear(
num_gemms=self.base_layer.num_gemms,
Expand Down Expand Up @@ -518,6 +525,13 @@ def dispatch_megatron(
elif isinstance(target_base_layer, (TELinear, TEGroupedLinear)):
# default to column parallel linear for non-parallel linear layers
new_module = LoraColumnParallelLinear(base_layer=target, adapter_name=adapter_name, **kwargs)
else:
# Fail fast: non-TE layers are not supported for LoRA. This prevents silent skip
# where peft would leave the module unchanged (no LoRA applied) with no error.
raise RuntimeError(
f"LoRA on {type(target_base_layer).__name__} is not supported. "
"Use transformer_impl=transformer_engine."
)

return new_module

Expand Down
3 changes: 3 additions & 0 deletions mcore_adapter/src/mcore_adapter/initialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ def _initialize_distributed(args: "TrainingArguments"):
rank=int(os.getenv("RANK", "0")),
world_size=int(os.getenv("WORLD_SIZE", "1")),
timeout=args.ddp_timeout_delta,
# Explicitly bind NCCL to this GPU from the start; avoids ambiguous
# device selection when multiple GPUs are visible to the process.
device_id=torch.device(args.device),
)
# Set the tensor model-parallel, pipeline model-parallel, and
# data-parallel communicators.
Expand Down
3 changes: 2 additions & 1 deletion mcore_adapter/src/mcore_adapter/models/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ def forward(self, hidden_states):
class _McaLoraLogitsHelper(torch.autograd.Function):
@staticmethod
def forward(ctx, logits: "torch.Tensor"):
return logits
# Return a fresh tensor so downstream inplace ops do not invalidate this custom backward.
return logits.clone()

@staticmethod
def backward(ctx, grad_output: "torch.Tensor"):
Expand Down
Loading