Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
108 commits
Select commit Hold shift + click to select a range
d7cd7ae
feat(multipipeline): isolate namespaces and ports
taoluo Feb 13, 2026
f679c43
feat(multipipeline): progress + shrink-to-zero
taoluo Feb 13, 2026
a74e4a2
feat(schedrl): add ROLL adapter entrypoint
taoluo Feb 13, 2026
0c47a7d
fix(config): avoid eval in worker config
taoluo Feb 13, 2026
3957699
feat(roll): implement resize_infer and multi-pipeline support
taoluo Feb 14, 2026
4b6a414
refactor(schedrl_adapter): simplify adapter API and add static cluste…
taoluo Feb 15, 2026
7263b3a
feat(roll): propagate SchedRL env vars via runtime_env for Ray actors
taoluo Feb 16, 2026
81923dd
fix(roll): resource manager GPU placement and CPU platform compatibility
taoluo Feb 16, 2026
8e22b44
feat(roll): improve bucket cache for multiprocess-safe selective sync
taoluo Feb 16, 2026
f644288
feat(roll): major SchedRL concurrent_pipeline refactor and vLLM compa…
taoluo Feb 16, 2026
7c002f8
chore(roll): update example configs and requirements for smoke testing
taoluo Feb 16, 2026
c398b68
feat(roll): adapt to simplified SchedRL API with state verification
taoluo Feb 17, 2026
1b23c84
feat(collective): add timeout_s, fail-fast KeyError, and teardown helper
taoluo Feb 18, 2026
e1957e3
feat(model_update): comm_plan-based selective sync with NCCL teardown…
taoluo Feb 18, 2026
2cb08bd
feat(cluster): add resolve_topology flag to skip blocking ray.get in …
taoluo Feb 18, 2026
2a1074c
feat(scheduler): non-blocking init, local PG allocation, SchedRL expa…
taoluo Feb 18, 2026
1db761b
fix(pipeline): re-offload actor_train after checkpoint to prevent GPU…
taoluo Feb 18, 2026
e43f9b2
fix(misc): sync resize_infer, asyncio fixes, request tracing logs, co…
taoluo Feb 18, 2026
5796a53
refactor(schedrl): move notify_ready_to_release to end of pipeline loop
taoluo Feb 19, 2026
c54fa9d
feat(lora): add lora_routing utility for multi-LoRA microbatch dispatch
taoluo Feb 19, 2026
044ab18
feat(config): add adapters field for multi-LoRA configuration
taoluo Feb 19, 2026
9ba3721
feat(megatron): add per-adapter multi-LoRA training support
taoluo Feb 19, 2026
a570280
feat(sft): add train_step_lora and LoRA weight management methods
taoluo Feb 19, 2026
d505be1
test(integration): add per-adapter single LoRA step equivalence test
taoluo Feb 19, 2026
3c8f0a4
feat(multi-lora): add adapters_to_update parameter to model_update
taoluo Feb 20, 2026
cf70df8
feat(multi-lora): add per-adapter checkpoint promotion and selective …
taoluo Feb 20, 2026
a8939b6
feat(multi-lora): add model_update_lora_subset helper method
taoluo Feb 20, 2026
c6184ac
feat(multi-lora): add train_step_lora RPC to ActorWorker
taoluo Feb 20, 2026
3d1ae29
feat(multi-lora): add SchedRLMultiLoraPipeline implementation
taoluo Feb 20, 2026
cfff6e9
feat(multi-lora): add pipeline registration and shared RequestSchedul…
taoluo Feb 20, 2026
bf5c5d5
feat(multi-lora): add per-adapter cache, RNG state, and selective syn…
taoluo Feb 20, 2026
cb137ab
feat(multi-lora): add _op_lock and notify_adapter_updated for selecti…
taoluo Feb 20, 2026
f5a63bf
feat(multi-lora): add adapters_to_sync support in model update service
taoluo Feb 20, 2026
d049a7a
fix(multi-lora): PP support and per-adapter optimizer fixes
taoluo Feb 21, 2026
b7effa5
feat(multi-lora): add setup_lora_training_from_adapters for multi-ada…
taoluo Feb 21, 2026
8414eaa
fix: misc robustness improvements for PP and distributed setup
taoluo Feb 21, 2026
16cf323
test(multi-lora): add TC5 for PP=2 and improve test robustness
taoluo Feb 21, 2026
ed9b165
(multi-lora): passed the tp2 pp2 test case for multi lora
taoluo Feb 21, 2026
8790783
test(multi-lora): add TC6 tp2pp2 and TC7 dp2pp2
taoluo Feb 21, 2026
1fe3999
feat(lora): add multi-LoRA routing utilities and adapter config norma…
taoluo Feb 22, 2026
9eb9ce2
feat(vllm): add multi-LoRA routing support to vLLM strategy
taoluo Feb 22, 2026
722fcf6
feat(env): inject lora_name in env managers for multi-LoRA routing
taoluo Feb 22, 2026
8130534
feat(pipeline): add multi-LoRA integration to workers and schedulers
taoluo Feb 22, 2026
a9e5da2
feat(examples): add multi-LoRA pipeline and smoke test configs
taoluo Feb 22, 2026
4caf875
chore(utils): add lora_name support to collective utilities
taoluo Feb 22, 2026
7c4d3dd
fix(sft): ensure lora_name broadcast before validation in train_step_…
taoluo Feb 22, 2026
9c15a5f
chore(examples): replace sokoban_grpo configs with full_finetune and …
taoluo Feb 23, 2026
179b85f
feat(multi-lora): update strategy, workers, and scheduler for multi-L…
taoluo Feb 23, 2026
4faa27c
fix(multi-pipeline): thread limits and barrier_mode removal
taoluo Feb 24, 2026
04a630c
fix(examples): use HuggingFace and set actor_infer lora_rank to 8
taoluo Feb 24, 2026
b5352aa
feat(adapter): pass lora_name to scheduler for GPU trace labels
taoluo Feb 24, 2026
244ba45
fix(vllm): stream base weights one-at-a-time and free sender GPU bucket
taoluo Feb 25, 2026
fe8634a
fix(adapter): validate offload_nccl and scope LoRA verify to expanded…
taoluo Feb 25, 2026
0801971
fix(examples): reduce sequence_length and enable dynamic batching in …
taoluo Feb 25, 2026
51e37e4
fix(adapter): close HEAD gaps in concurrent_pipeline run()
taoluo Feb 27, 2026
64570ba
feat(multi-lora): per-adapter run loop, adapter sync, and load_states…
taoluo Feb 27, 2026
a7a6327
fix(pipeline): offload GPU states after checkpoint to prevent OOM on …
taoluo Feb 28, 2026
6333f46
add prefix for tracker
taoluo Feb 28, 2026
fe5a4cd
rename lora names
taoluo Feb 28, 2026
8b25f39
fix(multi-lora): use deque for fair FIFO wait order in get_batch loop
taoluo Mar 1, 2026
982c8c1
refactor(schedrl): replace raw namespace/actor name strings with type…
taoluo Mar 2, 2026
149a5a4
refactor(schedrl): remove unused dead code
taoluo Mar 2, 2026
99cb31b
refactor(schedrl): migrate schedrl_adapter and examples to main sched…
taoluo Mar 2, 2026
3d83318
refactor(rlix): rename schedrl to rlix across codebase
taoluo Mar 2, 2026
29b1aea
refactor: rename _is_library_mode to do_time_sharing
taoluo Mar 3, 2026
4536cd2
docs(review): add explanatory comments to Step 1 config & foundation …
taoluo Mar 3, 2026
e93bfa0
docs(review): add explanatory comments to Step 2 utils & collective c…
taoluo Mar 3, 2026
6c17962
fix(functionals): fail fast when prompts.batch is None in postprocess…
taoluo Mar 3, 2026
c0af0eb
docs(functionals): explain np.repeat ordering for non_tensor_batch ex…
taoluo Mar 3, 2026
f949463
refactor: replace RLIX_CONTROL_PLANE checks with DO_TIME_SHARING cons…
taoluo Mar 3, 2026
067b79a
perf(send_recv): restore CUDA IPC for vLLM with lazy-probed fallback
taoluo Mar 3, 2026
4b35aa7
refactor(mcore_adapter): simplify LoRA layer and remove untested abst…
taoluo Mar 4, 2026
8ae1532
refactor(shrink-expand): extract GPU→dp_rank translation utils, adopt…
taoluo Mar 4, 2026
cdeeedb
chore: remove rlix_request_id from env managers and log paths
taoluo Mar 4, 2026
7f348eb
fix(scheduler): simplify LoadBalancer acquisition and fix decorator p…
taoluo Mar 4, 2026
8f4b044
refactor(constants): expand rlix_env_vars with additional thread-limi…
taoluo Mar 5, 2026
4120025
docs(scheduler): add docstrings and clarify time-sharing comments
taoluo Mar 5, 2026
9018006
refactor(rollout_scheduler): switch to pipeline-namespace coordinator…
taoluo Mar 5, 2026
0bda8c7
refactor(resource-manager): inline actor creation into RollResourceMa…
taoluo Mar 5, 2026
ae5e1d7
refactor(cache): simplify bucket cache key to checkpoint_version only
taoluo Mar 5, 2026
00b6e29
refactor(distributed): simplify Cluster topology and unify collective…
taoluo Mar 5, 2026
77ee8c4
refactor(vllm): remove dead LoRA verification methods
taoluo Mar 5, 2026
e505828
fix(lora): restore offload guard, defer registration, evict at all sl…
taoluo Mar 5, 2026
9dd1540
refactor(send_recv): remove bucket_bytes CPU fallback path
taoluo Mar 6, 2026
2a8aa47
refactor(strategy): dedup load/offload_states, fix IPC torch reductio…
taoluo Mar 8, 2026
8472e2f
refactor(lora): align setup_lora_training_from_adapters to upstream a…
taoluo Mar 9, 2026
082be54
refactor(env_manager): extract duplicated LoRA injection into _resolv…
taoluo Mar 9, 2026
160a341
refactor(pipeline): code review fixes across pipeline and worker modules
taoluo Mar 9, 2026
50587c0
refactor(pipeline): fix offload_states arg forwarding, multi-LoRA ful…
taoluo Mar 10, 2026
3fb2700
chore(pipeline): add TODO for fine-granular rollout interruption per-…
taoluo Mar 10, 2026
389b3ac
fix
taoluo Mar 10, 2026
e9348e5
fix: multi-pipeline InferWorker deadlocks, multi-LoRA OOM, and expand…
taoluo Mar 11, 2026
b767577
feat(transport): add cpu_pickle transport for colocated model weight …
taoluo Mar 11, 2026
9170beb
feat: add post-sync weight verification for base model and LoRA adapters
taoluo Mar 12, 2026
d309e61
feat(config): add verify_model_after_sync flag (disabled by default)
taoluo Mar 12, 2026
aec2ed2
chore(config): enable verify_model_after_sync in test configs
taoluo Mar 12, 2026
09c7244
perf: compute sender stats on GPU before CPU copy in cache builder
taoluo Mar 12, 2026
959c981
fix: resolve multi-lora RequestScheduler name collision and tied weig…
taoluo Mar 12, 2026
c4d51eb
chore: disable wandb tracking in test config
taoluo Mar 12, 2026
4ef56bc
fix(vllm): exclude add_lora_count from adapter hash to prevent LRU ev…
taoluo Mar 12, 2026
2718922
feat: make rlix dependency optional for standalone ROLL usage
taoluo Mar 12, 2026
f684386
fix(megatron): require multiple adapters for multi-adapter mode
taoluo Mar 12, 2026
650615e
feat: optimize cpu_serialize transport with torch.save + pinned memory
taoluo Mar 13, 2026
8000112
chore: remove .claude/plans and design_docs
taoluo Mar 14, 2026
e36027f
fix(rollout): explicit progress batch lifecycle with begin/end
taoluo Mar 17, 2026
6e87932
refactor(rollout): remove unused ProgressReport constructor fields
taoluo Mar 17, 2026
3507b19
refactor(rollout): emit raw collected instead of remaining in heartbeat
taoluo Mar 17, 2026
4989ec4
fix: rename ROLL_rlix references to ROLL
taoluo Mar 21, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 27 additions & 26 deletions examples/qwen2.5-0.5B-agentic/agentic_val_sokoban_lora.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,26 +29,27 @@ system_envs:
# - roll
# - baseline

track_with: tensorboard
track_with: stdout # Disable TensorBoard in smoke run to bypass SummaryWriter type constraints.
tracker_kwargs:
log_dir: /data/oss_bucket_0/yali/llm/tensorboard/roll_exp/agentic_sokoban
log_dir: ./output/tensorboard/agentic_sokoban_lora_smoke # Use local path so smoke test does not depend on external mount.


checkpoint_config:
type: file_system
output_dir: /data/cpfs_0/rl_examples/models/${exp_name}
output_dir: /tmp/roll_output/agentic_sokoban_lora_smoke # Keep checkpoint path local for a portable smoke run.

num_gpus_per_node: 8
num_gpus_per_node: 4 # Fit smoke test to a 4-GPU node.

max_steps: 1024
max_steps: 3 # Minimal training smoke: one training step is enough to verify end-to-end path.
save_steps: 10000
logging_steps: 1
eval_steps: 10
eval_steps: 0 # Disable eval loop for faster smoke validation.
resume_from_checkpoint: false
async_generation_ratio: 1 # Required by partial_gpu_mode validation in agentic_pipeline.

rollout_batch_size: 1024
val_batch_size: 1024
sequence_length: 8192
rollout_batch_size: 4 # Keep rollout tiny to reduce runtime/memory.
val_batch_size: 4
sequence_length: 2048 # Reduce memory pressure while preserving normal train path.

advantage_clip: 0.2
ppo_epochs: 1
Expand All @@ -75,9 +76,9 @@ actor_train:
training_args:
learning_rate: 2.0e-5
weight_decay: 0
per_device_train_batch_size: 2
gradient_accumulation_steps: 64
warmup_steps: 10
per_device_train_batch_size: 1 # Minimal micro-batch for smoke stability.
gradient_accumulation_steps: 2
warmup_steps: 1
lr_scheduler_type: cosine
data_args:
template: qwen2_5
Expand All @@ -91,7 +92,7 @@ actor_train:
expert_model_parallel_size: 1
use_distributed_optimizer: true
recompute_granularity: full
device_mapping: list(range(0,8))
device_mapping: list(range(0,2)) # Constrain actor_train to 2 GPUs for this smoke profile.
infer_batch_size: 2

actor_infer:
Expand All @@ -102,11 +103,11 @@ actor_infer:
lora_rank: 32
lora_alpha: 32
generating_args:
max_new_tokens: 128 # single-turn response length
top_p: 0.99
top_k: 100
max_new_tokens: 64 # Shorter generation keeps smoke test fast.
top_p: 1
top_k: 3
num_beams: 1
temperature: 0.99
temperature: 0.0
num_return_sequences: 1
data_args:
template: qwen2_5
Expand All @@ -116,7 +117,7 @@ actor_infer:
gpu_memory_utilization: 0.8
block_size: 16
load_format: auto
device_mapping: list(range(0,8))
device_mapping: list(range(0,4)) # Constrain actor_infer to same 4-GPU pool.

reference:
model_args:
Expand All @@ -129,7 +130,7 @@ reference:
strategy_args:
strategy_name: hf_infer
strategy_config: ~
device_mapping: list(range(0,8))
device_mapping: list(range(0,2)) # Keep reference on 2 GPUs, consistent with the reduced smoke topology.
infer_batch_size: 2

reward_normalization:
Expand All @@ -138,19 +139,19 @@ reward_normalization:

train_env_manager:
format_penalty: -0.15 # sokoban env penalty_for_step=-0.1
max_env_num_per_worker: 16
num_env_groups: 128
max_env_num_per_worker: 4 # Smaller env fanout for quick smoke startup.
num_env_groups: 2
# under the same group, the env config and env seed are ensured to be equal
group_size: 8
group_size: 2
tags: [SimpleSokoban]
num_groups_partition: [128] # If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation
num_groups_partition: [2] # Match reduced group count for smoke.

val_env_manager:
max_env_num_per_worker: 32
num_env_groups: 1024
max_env_num_per_worker: 4 # Keep validation manager light even though eval is disabled.
num_env_groups: 4
group_size: 1 # should be set to 1 because val temperature is set to 0 and same prompt leads to same output
tags: [SimpleSokoban, LargerSokoban, SokobanDifferentGridVocab, FrozenLake]
num_groups_partition: [256, 256, 256, 256] # TODO: If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation
num_groups_partition: [1, 1, 1, 1] # Minimal partitioning for smoke.


# Here, you can override variables defined in the imported envs. max_tokens_per_step: 128 in custom_env.SimpleSokoban, here replaced by 64
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
defaults:
- ../config/traj_envs@_here_
- ../config/deepspeed_zero@_here_
- ../config/deepspeed_zero2@_here_
- ../config/deepspeed_zero3@_here_
- ../config/deepspeed_zero3_cpuoffload@_here_

hydra:
run:
dir: .
output_subdir: null

pipeline_cls: roll.pipeline.agentic.agentic_multi_lora_pipeline.AgenticMultiLoraPipeline



exp_name: "agent_train_sokoban_multi_lora1"
seed: 42
logging_dir: ./output/lora_pipeline1/logs
output_dir: ./output/lora_pipeline1
render_save_dir: /tmp/roll_output/lora_pipeline1/render

# track_with: wandb
# tracker_kwargs:
# entity: "khd6t7hdhn-university-of-pennsylvania"
# project: "rlix"
# api_key: "${oc.env:WANDB_API_KEY}"


system_envs:
USE_MODELSCOPE: "0"
NCCL_SHM_DISABLE: "1"
RAY_PROFILING: "1"
RAY_DEDUP_LOGS: "0"
RAY_TMPDIR: "${oc.env:RAY_TMPDIR,/tmp}"
OMP_NUM_THREADS: "1"
MKL_NUM_THREADS: "1"
OPENBLAS_NUM_THREADS: "1"
RAY_grpc_server_thread_pool_size: "4"
TORCHINDUCTOR_COMPILE_THREADS: "1"
TORCHINDUCTOR_MAX_AUTOTUNE: "0"
# Container lacks SYS_PTRACE capability; disable vLLM custom all-reduce IPC and use NCCL fallback
VLLM_DISABLE_CUSTOM_ALL_REDUCE: "1"

checkpoint_config:
type: file_system
output_dir: /tmp/roll_output/multi_lora2/checkpoints

num_gpus_per_node: 2
model_download_type: HUGGINGFACE_HUB
offload_nccl: true
max_steps: 3
model_update_buffer_size_mb: 100 # Limit broadcast bucket to 100 MB to avoid OOM with co-located infer workers
model_update_transport: cpu_serialize # CPU byte serialization; avoids pidfd_getfd error in restricted containers
verify_model_after_sync: true
save_steps: 10000
logging_steps: 1
eval_steps: 20
resume_from_checkpoint: false

async_generation_ratio: 1

rollout_batch_size: 4
val_batch_size: 4
sequence_length: 1024 # Reduced from 2048: Sokoban max_new_tokens=64 needs ~500 tokens max, halves peak activation memory
max_actions_per_traj: 5

advantage_clip: 0.2
ppo_epochs: 1
adv_estimator: "grpo"
init_kl_coef: 0.0
whiten_advantages: true
entropy_loss_coef: 0
max_grad_norm: 1.0

pretrain: Qwen/Qwen2.5-0.5B-Instruct
reward_pretrain: Qwen/Qwen2.5-0.5B-Instruct

actor_train:
offload_nccl: ${offload_nccl}
model_args:
attn_implementation: fa2
disable_gradient_checkpointing: false
dtype: bf16
model_type: ~
adapters:
Sokoban1:
lora_target: all-linear
lora_rank: 8
lora_alpha: 8
Sokoban2:
lora_target: all-linear
lora_rank: 8
lora_alpha: 8
training_args:
learning_rate: 1.0e-6
weight_decay: 0
per_device_train_batch_size: 1
gradient_accumulation_steps: 2
warmup_steps: 1
lr_scheduler_type: cosine
data_args:
template: qwen2_5
strategy_args:
strategy_name: megatron_train
strategy_config:
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
expert_model_parallel_size: 1
use_distributed_optimizer: false
is_lora_optimizer_isolated: true
recompute_granularity: full
sequence_parallel: true
overlap_grad_reduce: false # Isolated LoRA mode requires overlap_grad_reduce disabled to avoid grad-sync hang.
# Note: use_sequence_packing is NOT enabled here — sequence packing mixes sequences from different
# LoRA adapters into one microbatch, violating the adapter-homogeneity constraint in inner_forward_step.
# Note: use_dynamic_batching_in_train is also NOT enabled — incompatible with is_lora_optimizer_isolated=true.
device_mapping: "[0, ]"
infer_batch_size: 1

actor_infer:
offload_nccl: ${offload_nccl}
model_args:
disable_gradient_checkpointing: true
dtype: bf16
adapters:
Sokoban1:
lora_target: q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj
lora_rank: 8
lora_alpha: 8
Sokoban2:
lora_target: q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj
lora_rank: 8
lora_alpha: 8
generating_args:
max_new_tokens: 64
top_p: 1
top_k: 3
num_beams: 1
temperature: 0.0
num_return_sequences: 1
data_args:
template: qwen2_5
strategy_args:
strategy_name: vllm
strategy_config:
VLLM_USE_V1: 1
gpu_memory_utilization: 0.8 # Raise cache budget so vLLM has non-zero KV blocks during two-worker startup.
block_size: 16
load_format: auto
tensor_parallel_size: 1
max_num_batched_tokens: 1024 # Match reduced sequence_length=1024
max_num_seqs: 2
enforce_eager: true
sleep_level: 1
device_mapping: "[0, 1, ]"

reference:
offload_nccl: ${offload_nccl}
model_args:
attn_implementation: fa2
disable_gradient_checkpointing: true
dtype: bf16
model_type: ~
data_args:
template: qwen2_5
strategy_args:
strategy_name: megatron_infer
strategy_config:
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
expert_model_parallel_size: 1
# Dynamic batching on reference (megatron_infer): trims padding per-microbatch to actual token length
# (rounded to sequence_length_round_in_infer). Reduces peak memory during log_prob computation.
use_dynamic_batching_in_infer: true
max_tokens_per_microbatch_in_infer: 1024 # Match reduced sequence_length=1024
sequence_length_round_in_infer: 8
device_mapping: "[0, ]"
infer_batch_size: 1

reward_normalization:
grouping: traj_group_id
method: mean_std

train_env_manager:
format_penalty: -0.15
max_env_num_per_worker: 4
num_env_groups: 2
group_size: 2
tags: [Sokoban1, Sokoban2]
num_groups_partition: [1, 1]

val_env_manager:
max_env_num_per_worker: 4
num_env_groups: 2
group_size: 2
tags: [Sokoban1, Sokoban2]
num_groups_partition: [1, 1]

max_tokens_per_step: 64

custom_envs:
Sokoban1:
${custom_env.SimpleSokoban}
Sokoban2:
${custom_env.SimpleSokoban}
1 change: 1 addition & 0 deletions examples/start_agentic_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def main():
pipeline = pipeline_cls(pipeline_config=ppo_config)

pipeline.run()
print("Pipeline finished.")


if __name__ == "__main__":
Expand Down
14 changes: 14 additions & 0 deletions mcore_adapter/src/mcore_adapter/adapters/lora_layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,9 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any):

if self.sequence_parallel and self.base_layer.parallel_mode == "row":
lora_result = scatter_to_sequence_parallel_region(lora_result)
# Cast per-adapter result before accumulating; each adapter may compute in its own weight dtype.
if lora_result.dtype != previous_dtype:
lora_result = lora_result.to(previous_dtype)
result = result + lora_result

result = result.to(previous_dtype)
Expand Down Expand Up @@ -413,6 +416,8 @@ def _create_lora_layers(self, r, lora_bias, **kwargs):
in_features = self.in_features * self.tp_size

if self.is_grouped:
if not isinstance(TEGroupedLinear, type):
raise RuntimeError("Grouped LoRA layers require Transformer Engine grouped linear support.")
r = r // self.config.moe_router_topk
lora_a = TERowParallelGroupedLinear(
num_gemms=self.base_layer.num_gemms,
Expand Down Expand Up @@ -457,6 +462,8 @@ def _create_lora_layers(self, r, lora_bias, **kwargs):
out_features = self.out_features * self.tp_size

if self.is_grouped:
if not isinstance(TEGroupedLinear, type):
raise RuntimeError("Grouped LoRA layers require Transformer Engine grouped linear support.")
r = r // self.config.moe_router_topk
lora_a = TEGroupedLinear(
num_gemms=self.base_layer.num_gemms,
Expand Down Expand Up @@ -518,6 +525,13 @@ def dispatch_megatron(
elif isinstance(target_base_layer, (TELinear, TEGroupedLinear)):
# default to column parallel linear for non-parallel linear layers
new_module = LoraColumnParallelLinear(base_layer=target, adapter_name=adapter_name, **kwargs)
else:
# Fail fast: non-TE layers are not supported for LoRA. This prevents silent skip
# where peft would leave the module unchanged (no LoRA applied) with no error.
raise RuntimeError(
f"LoRA on {type(target_base_layer).__name__} is not supported. "
"Use transformer_impl=transformer_engine."
)

return new_module

Expand Down
3 changes: 3 additions & 0 deletions mcore_adapter/src/mcore_adapter/initialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ def _initialize_distributed(args: "TrainingArguments"):
rank=int(os.getenv("RANK", "0")),
world_size=int(os.getenv("WORLD_SIZE", "1")),
timeout=args.ddp_timeout_delta,
# Explicitly bind NCCL to this GPU from the start; avoids ambiguous
# device selection when multiple GPUs are visible to the process.
device_id=torch.device(args.device),
)
# Set the tensor model-parallel, pipeline model-parallel, and
# data-parallel communicators.
Expand Down
3 changes: 2 additions & 1 deletion mcore_adapter/src/mcore_adapter/models/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ def forward(self, hidden_states):
class _McaLoraLogitsHelper(torch.autograd.Function):
@staticmethod
def forward(ctx, logits: "torch.Tensor"):
return logits
# Return a fresh tensor so downstream inplace ops do not invalidate this custom backward.
return logits.clone()

@staticmethod
def backward(ctx, grad_output: "torch.Tensor"):
Expand Down
Loading