From 6847f5b31b352519af97fcf8fcb1c11728c2dbd8 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 16 Jun 2026 11:01:03 +0800 Subject: [PATCH 1/5] support minimax_m3 --- src/mcore_bridge/config/parser.py | 22 +++++ src/mcore_bridge/model/constant.py | 1 + src/mcore_bridge/model/gpts/__init__.py | 2 +- src/mcore_bridge/model/gpts/minimax_m3.py | 70 ++++++++++++++++ .../model/mm_gpts/qwen3_moe_next.py | 80 +++++++++++++++++++ 5 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 src/mcore_bridge/model/gpts/minimax_m3.py create mode 100644 src/mcore_bridge/model/mm_gpts/qwen3_moe_next.py diff --git a/src/mcore_bridge/config/parser.py b/src/mcore_bridge/config/parser.py index a97ba70..e52c2a8 100644 --- a/src/mcore_bridge/config/parser.py +++ b/src/mcore_bridge/config/parser.py @@ -229,6 +229,28 @@ def hf_to_mcore_config(hf_config: PretrainedConfig) -> Dict[str, Any]: res.setdefault('linear_attention_freq', 4) elif llm_model_type == 'minimax_m2': res['add_qkv_bias'] = False + elif hf_model_type == 'minimax_m3_vl': + text_cfg = getattr(hf_config, 'text_config', hf_config) + res['add_qkv_bias'] = False + # Fix intermediate sizes: intermediate_size is MoE expert size, dense_intermediate_size is for dense MLP + res['moe_ffn_hidden_size'] = res.get('ffn_hidden_size', 3072) + dense_intermediate_size = getattr(text_cfg, 'dense_intermediate_size', None) + if dense_intermediate_size: + res['ffn_hidden_size'] = dense_intermediate_size + # Shared expert intermediate size + shared_intermediate_size = getattr(text_cfg, 'shared_intermediate_size', None) + if shared_intermediate_size: + res['moe_shared_expert_intermediate_size'] = shared_intermediate_size + # moe_layer_freq from list + moe_layer_freq_list = getattr(text_cfg, 'moe_layer_freq', None) + if isinstance(moe_layer_freq_list, list): + res['moe_layer_freq'] = f"[{','.join(str(x) for x in moe_layer_freq_list)}]" + # Activation: swigluoai = quick_geglu + glu_linear_offset + res['swiglu'] = False + res['quick_geglu'] = True + res['glu_linear_offset'] = 1 + # Gemma-style RMSNorm: weight is (1 + w) + res['layernorm_zero_centered_gamma'] = True elif llm_model_type == 'olmoe': res['qk_layernorm'] = True elif hf_model_type == 'llama4': diff --git a/src/mcore_bridge/model/constant.py b/src/mcore_bridge/model/constant.py index 1c3ae97..37c794a 100644 --- a/src/mcore_bridge/model/constant.py +++ b/src/mcore_bridge/model/constant.py @@ -7,6 +7,7 @@ class LLMModelType: olmoe = 'olmoe' glm4 = 'glm4' minimax_m2 = 'minimax_m2' + minimax_m3 = 'minimax_m3' hy_v3 = 'hy_v3' bailing_moe = 'bailing_moe' bailing_hybrid = 'bailing_hybrid' diff --git a/src/mcore_bridge/model/gpts/__init__.py b/src/mcore_bridge/model/gpts/__init__.py index 52025b8..ffef760 100644 --- a/src/mcore_bridge/model/gpts/__init__.py +++ b/src/mcore_bridge/model/gpts/__init__.py @@ -1,2 +1,2 @@ # Copyright (c) ModelScope Contributors. All rights reserved. -from . import bailing_hybrid, bailing_moe, deepseek_v4, glm4, hunyuan, llm, minimax_m2, olmoe, qwen3_emb, qwen3_next +from . import bailing_hybrid, bailing_moe, deepseek_v4, glm4, hunyuan, llm, minimax_m2, minimax_m3, olmoe, qwen3_emb, qwen3_next diff --git a/src/mcore_bridge/model/gpts/minimax_m3.py b/src/mcore_bridge/model/gpts/minimax_m3.py new file mode 100644 index 0000000..3d72037 --- /dev/null +++ b/src/mcore_bridge/model/gpts/minimax_m3.py @@ -0,0 +1,70 @@ +# Copyright (c) ModelScope Contributors. All rights reserved. +import torch +import torch.distributed as dist + +from mcore_bridge.bridge import GPTBridge + +from ..constant import ModelType +from ..register import ModelMeta, register_model + + +class MinimaxM3Bridge(GPTBridge): + hf_layers_prefix = 'language_model.model.layers' + hf_mtp_prefix = 'language_model.model.layers' + hf_embed_key = 'language_model.model.embed_tokens.weight' + hf_final_layernorm_key = 'language_model.model.norm.weight' + hf_lm_head_key = 'language_model.lm_head.weight' + hf_expert_bias_key = 'e_score_correction_bias' + + def _set_layer_mlp(self, mg_layer, hf_state_dict, layer_idx: int, to_mcore: bool, is_mtp: bool = False): + mg_mlp = None if mg_layer is None else mg_layer.mlp + is_moe = True if hasattr(mg_mlp, 'experts') else False + if not to_mcore: + is_moe = torch.tensor([is_moe], dtype=torch.bool, device='cuda') + if self.pp_size > 1: + dist.all_reduce(is_moe, group=self.pp_group) + if is_moe: + hf_state_dict.update( + self._set_moe_state(mg_mlp, hf_state_dict, 'block_sparse_moe.', layer_idx, to_mcore, is_mtp=is_mtp)) + self._set_state_dict(mg_layer, 'pre_mlp_layernorm.weight', hf_state_dict, + self.hf_post_attention_layernorm_key, to_mcore) + else: + hf_state_dict.update(self._set_mlp_state(mg_mlp, hf_state_dict, 'mlp.', layer_idx, to_mcore)) + self._set_state_dict(mg_layer, 'mlp.linear_fc1.layer_norm_weight', hf_state_dict, + self.hf_post_attention_layernorm_key, to_mcore) + return hf_state_dict + + def _set_moe_state( + self, + mg_mlp, + hf_state_dict, + hf_prefix: str, + layer_idx: int, + to_mcore: bool, + is_mtp: bool = False, + ): + if to_mcore: + # Rename routed experts: w1->gate_proj, w3->up_proj, w2->down_proj + # Shared experts already use standard naming (gate_proj/up_proj/down_proj) + hf_state_dict = { + k.replace('.w1.', '.gate_proj.').replace('.w3.', '.up_proj.').replace('.w2.', '.down_proj.') + if 'shared_expert' not in k else k: v + for k, v in hf_state_dict.items() + } + hf_state_dict = super()._set_moe_state(mg_mlp, hf_state_dict, hf_prefix, layer_idx, to_mcore, is_mtp) + if not to_mcore: + # Rename back for routed experts only + hf_state_dict = { + k.replace('.gate_proj.', '.w1.').replace('.up_proj.', '.w3.').replace('.down_proj.', '.w2.') + if 'shared_expert' not in k else k: v + for k, v in hf_state_dict.items() + } + return hf_state_dict + + +register_model( + ModelMeta( + ModelType.minimax_m3, + ['minimax_m3_vl'], + bridge_cls=MinimaxM3Bridge, + )) diff --git a/src/mcore_bridge/model/mm_gpts/qwen3_moe_next.py b/src/mcore_bridge/model/mm_gpts/qwen3_moe_next.py new file mode 100644 index 0000000..6585f17 --- /dev/null +++ b/src/mcore_bridge/model/mm_gpts/qwen3_moe_next.py @@ -0,0 +1,80 @@ +# Copyright (c) ModelScope Contributors. All rights reserved. +import torch + +from mcore_bridge.bridge import GPTBridge + +from ..constant import ModelType +from ..gpts.qwen3_next_gdn import Qwen3NextGDNBridge, Qwen3NextLoader +from ..register import ModelMeta, register_model +from .qwen3_vl import Qwen3VL_Vit, Qwen3VLLoader +from .utils import HuggingFaceVit + + +class Qwen3OmniNextBridge(Qwen3NextGDNBridge): + hf_layers_prefix = 'thinker.model.layers' + hf_embed_key = 'thinker.model.embed_tokens.weight' + hf_final_layernorm_key = 'thinker.model.norm.weight' + hf_lm_head_key = 'thinker.lm_head.weight' + hf_score_key = 'thinker.score.weight' + + +class Qwen3OmniNext_Vit(HuggingFaceVit): + module_mapping = {'thinker.audio_tower': 'audio_tower', 'thinker.visual': 'visual'} + _vision_tower = ['audio_tower', 'visual'] + _aligner = ['audio_tower.proj1', 'audio_tower.proj2', 'visual.merger', 'visual.deepstack_merger_list'] + _generator = ['talker'] + + def prepare_model(self, hf_config): + from transformers.models.qwen3_omni_next.modeling_qwen3_omni_next import ( + Qwen3OmniNextAudioEncoder, Qwen3OmniNextThinkerForConditionalGeneration, Qwen3OmniNextVisionEncoder) + self.model_cls = Qwen3OmniNextThinkerForConditionalGeneration + self.audio_tower = Qwen3OmniNextAudioEncoder._from_config(hf_config.thinker_config.audio_config) + self.visual = Qwen3OmniNextVisionEncoder._from_config(hf_config.thinker_config.vision_config) + + def get_inputs_embeds(self, inputs_embeds, **kwargs): + input_ids = kwargs['input_ids'] + visual = self.visual + hf_config = self.hf_config.thinker_config + res = Qwen3VL_Vit._get_inputs_embeds(self, inputs_embeds, kwargs, visual, hf_config) + inputs_embeds = res['inputs_embeds'] + input_features = kwargs.get('input_features') + feature_attention_mask = kwargs.get('feature_attention_mask') + + if input_features is None: + input_features = input_ids.new_zeros([1, 128, 128], dtype=self.audio_tower.dtype) + feature_attention_mask = input_ids.new_ones([1, 128], dtype=torch.bool) + audio_res = self.get_audio_features(input_features, feature_attention_mask) + if hasattr(audio_res, 'last_hidden_state'): + audio_embeds = audio_res.last_hidden_state + else: + audio_embeds = audio_res + inputs_embeds = inputs_embeds + audio_embeds.mean() * 0. + else: + audio_res = self.get_audio_features(input_features, feature_attention_mask) + if hasattr(audio_res, 'last_hidden_state'): + audio_embeds = audio_res.last_hidden_state + else: + audio_embeds = audio_res + audio_mask = (input_ids == hf_config.audio_token_id).unsqueeze(-1).expand_as(inputs_embeds) + audio_embeds = audio_embeds.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(audio_mask, audio_embeds) + res['inputs_embeds'] = inputs_embeds + return res + + def get_audio_features(self, *args, **kwargs): + with self.patch_hf_config(): + return self.model_cls.get_audio_features(self, *args, **kwargs) + + +class Qwen3OmniNextLoader(Qwen3VLLoader, Qwen3NextLoader): + pass + + +register_model( + ModelMeta( + ModelType.qwen3_omni_next, + ['qwen3_omni_next'], + bridge_cls=Qwen3OmniNextBridge, + visual_cls=Qwen3OmniNext_Vit, + loader=Qwen3OmniNextLoader, + )) From a2137b1344ab2478e8996b4d69bb30830931a890 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 16 Jun 2026 11:09:58 +0800 Subject: [PATCH 2/5] update --- .../model/mm_gpts/qwen3_moe_next.py | 80 ------------------- 1 file changed, 80 deletions(-) delete mode 100644 src/mcore_bridge/model/mm_gpts/qwen3_moe_next.py diff --git a/src/mcore_bridge/model/mm_gpts/qwen3_moe_next.py b/src/mcore_bridge/model/mm_gpts/qwen3_moe_next.py deleted file mode 100644 index 6585f17..0000000 --- a/src/mcore_bridge/model/mm_gpts/qwen3_moe_next.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) ModelScope Contributors. All rights reserved. -import torch - -from mcore_bridge.bridge import GPTBridge - -from ..constant import ModelType -from ..gpts.qwen3_next_gdn import Qwen3NextGDNBridge, Qwen3NextLoader -from ..register import ModelMeta, register_model -from .qwen3_vl import Qwen3VL_Vit, Qwen3VLLoader -from .utils import HuggingFaceVit - - -class Qwen3OmniNextBridge(Qwen3NextGDNBridge): - hf_layers_prefix = 'thinker.model.layers' - hf_embed_key = 'thinker.model.embed_tokens.weight' - hf_final_layernorm_key = 'thinker.model.norm.weight' - hf_lm_head_key = 'thinker.lm_head.weight' - hf_score_key = 'thinker.score.weight' - - -class Qwen3OmniNext_Vit(HuggingFaceVit): - module_mapping = {'thinker.audio_tower': 'audio_tower', 'thinker.visual': 'visual'} - _vision_tower = ['audio_tower', 'visual'] - _aligner = ['audio_tower.proj1', 'audio_tower.proj2', 'visual.merger', 'visual.deepstack_merger_list'] - _generator = ['talker'] - - def prepare_model(self, hf_config): - from transformers.models.qwen3_omni_next.modeling_qwen3_omni_next import ( - Qwen3OmniNextAudioEncoder, Qwen3OmniNextThinkerForConditionalGeneration, Qwen3OmniNextVisionEncoder) - self.model_cls = Qwen3OmniNextThinkerForConditionalGeneration - self.audio_tower = Qwen3OmniNextAudioEncoder._from_config(hf_config.thinker_config.audio_config) - self.visual = Qwen3OmniNextVisionEncoder._from_config(hf_config.thinker_config.vision_config) - - def get_inputs_embeds(self, inputs_embeds, **kwargs): - input_ids = kwargs['input_ids'] - visual = self.visual - hf_config = self.hf_config.thinker_config - res = Qwen3VL_Vit._get_inputs_embeds(self, inputs_embeds, kwargs, visual, hf_config) - inputs_embeds = res['inputs_embeds'] - input_features = kwargs.get('input_features') - feature_attention_mask = kwargs.get('feature_attention_mask') - - if input_features is None: - input_features = input_ids.new_zeros([1, 128, 128], dtype=self.audio_tower.dtype) - feature_attention_mask = input_ids.new_ones([1, 128], dtype=torch.bool) - audio_res = self.get_audio_features(input_features, feature_attention_mask) - if hasattr(audio_res, 'last_hidden_state'): - audio_embeds = audio_res.last_hidden_state - else: - audio_embeds = audio_res - inputs_embeds = inputs_embeds + audio_embeds.mean() * 0. - else: - audio_res = self.get_audio_features(input_features, feature_attention_mask) - if hasattr(audio_res, 'last_hidden_state'): - audio_embeds = audio_res.last_hidden_state - else: - audio_embeds = audio_res - audio_mask = (input_ids == hf_config.audio_token_id).unsqueeze(-1).expand_as(inputs_embeds) - audio_embeds = audio_embeds.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(audio_mask, audio_embeds) - res['inputs_embeds'] = inputs_embeds - return res - - def get_audio_features(self, *args, **kwargs): - with self.patch_hf_config(): - return self.model_cls.get_audio_features(self, *args, **kwargs) - - -class Qwen3OmniNextLoader(Qwen3VLLoader, Qwen3NextLoader): - pass - - -register_model( - ModelMeta( - ModelType.qwen3_omni_next, - ['qwen3_omni_next'], - bridge_cls=Qwen3OmniNextBridge, - visual_cls=Qwen3OmniNext_Vit, - loader=Qwen3OmniNextLoader, - )) From b5152b37ea6a76016e21a89c041dc9ab6a6625d3 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 16 Jun 2026 11:18:17 +0800 Subject: [PATCH 3/5] update --- README.md | 1 + README_zh.md | 1 + src/mcore_bridge/model/constant.py | 2 +- src/mcore_bridge/model/gpts/__init__.py | 3 ++- .../model/gpts/{minimax_m3.py => minimax_m3_vl.py} | 2 +- 5 files changed, 6 insertions(+), 3 deletions(-) rename src/mcore_bridge/model/gpts/{minimax_m3.py => minimax_m3_vl.py} (98%) diff --git a/README.md b/README.md index 6e46573..85fb643 100644 --- a/README.md +++ b/README.md @@ -147,6 +147,7 @@ The following is the list of models supported by MCore-Bridge: | Qwen | qwen2_vl, qwen2_5_vl, qwen2_5_omni
qwen3_vl, qwen3_vl_moe, qwen3_omni_moe, qwen3_asr
qwen3_5, qwen3_5_moe | | Gemma | gemma4, gemma4_unified | | GLM | glm4v, glm4v_moe | +| MiniMax | minimax_m3_vl | | Kimi | kimi_vl, kimi_k25 | | InternVL | internvl_chat, internvl | | Ovis | ovis2_5 | diff --git a/README_zh.md b/README_zh.md index 7e03aa2..56980c1 100644 --- a/README_zh.md +++ b/README_zh.md @@ -144,6 +144,7 @@ uv pip install -e . --torch-backend=auto | Qwen | qwen2_vl, qwen2_5_vl, qwen2_5_omni
qwen3_vl, qwen3_vl_moe, qwen3_omni_moe, qwen3_asr
qwen3_5, qwen3_5_moe | | Gemma | gemma4, gemma4_unified | | GLM | glm4v, glm4v_moe | +| MiniMax | minimax_m3_vl | | Kimi | kimi_vl, kimi_k25 | | InternVL | internvl_chat, internvl | | Ovis | ovis2_5 | diff --git a/src/mcore_bridge/model/constant.py b/src/mcore_bridge/model/constant.py index 37c794a..7de7672 100644 --- a/src/mcore_bridge/model/constant.py +++ b/src/mcore_bridge/model/constant.py @@ -7,7 +7,7 @@ class LLMModelType: olmoe = 'olmoe' glm4 = 'glm4' minimax_m2 = 'minimax_m2' - minimax_m3 = 'minimax_m3' + minimax_m3_vl = 'minimax_m3_vl' hy_v3 = 'hy_v3' bailing_moe = 'bailing_moe' bailing_hybrid = 'bailing_hybrid' diff --git a/src/mcore_bridge/model/gpts/__init__.py b/src/mcore_bridge/model/gpts/__init__.py index ffef760..b62f701 100644 --- a/src/mcore_bridge/model/gpts/__init__.py +++ b/src/mcore_bridge/model/gpts/__init__.py @@ -1,2 +1,3 @@ # Copyright (c) ModelScope Contributors. All rights reserved. -from . import bailing_hybrid, bailing_moe, deepseek_v4, glm4, hunyuan, llm, minimax_m2, minimax_m3, olmoe, qwen3_emb, qwen3_next +from . import (bailing_hybrid, bailing_moe, deepseek_v4, glm4, hunyuan, llm, minimax_m2, minimax_m3_vl, olmoe, + qwen3_emb, qwen3_next) diff --git a/src/mcore_bridge/model/gpts/minimax_m3.py b/src/mcore_bridge/model/gpts/minimax_m3_vl.py similarity index 98% rename from src/mcore_bridge/model/gpts/minimax_m3.py rename to src/mcore_bridge/model/gpts/minimax_m3_vl.py index 3d72037..3f62e28 100644 --- a/src/mcore_bridge/model/gpts/minimax_m3.py +++ b/src/mcore_bridge/model/gpts/minimax_m3_vl.py @@ -64,7 +64,7 @@ def _set_moe_state( register_model( ModelMeta( - ModelType.minimax_m3, + ModelType.minimax_m3_vl, ['minimax_m3_vl'], bridge_cls=MinimaxM3Bridge, )) From 77ca78d2523f8a2ab806461aa8dbe5dbb9b3e555 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 16 Jun 2026 14:13:18 +0800 Subject: [PATCH 4/5] update --- src/mcore_bridge/model/gpts/__init__.py | 3 +- src/mcore_bridge/model/mm_gpts/__init__.py | 3 +- .../model/{gpts => mm_gpts}/minimax_m3_vl.py | 36 +++++++++++++------ 3 files changed, 29 insertions(+), 13 deletions(-) rename src/mcore_bridge/model/{gpts => mm_gpts}/minimax_m3_vl.py (69%) diff --git a/src/mcore_bridge/model/gpts/__init__.py b/src/mcore_bridge/model/gpts/__init__.py index b62f701..52025b8 100644 --- a/src/mcore_bridge/model/gpts/__init__.py +++ b/src/mcore_bridge/model/gpts/__init__.py @@ -1,3 +1,2 @@ # Copyright (c) ModelScope Contributors. All rights reserved. -from . import (bailing_hybrid, bailing_moe, deepseek_v4, glm4, hunyuan, llm, minimax_m2, minimax_m3_vl, olmoe, - qwen3_emb, qwen3_next) +from . import bailing_hybrid, bailing_moe, deepseek_v4, glm4, hunyuan, llm, minimax_m2, olmoe, qwen3_emb, qwen3_next diff --git a/src/mcore_bridge/model/mm_gpts/__init__.py b/src/mcore_bridge/model/mm_gpts/__init__.py index b862ec6..67298dc 100644 --- a/src/mcore_bridge/model/mm_gpts/__init__.py +++ b/src/mcore_bridge/model/mm_gpts/__init__.py @@ -1,2 +1,3 @@ # Copyright (c) ModelScope Contributors. All rights reserved. -from . import gemma4, glm, internvl, kimi_vl, llama4, llava, qwen, qwen3_5, qwen3_5_gdn, qwen3_asr, qwen3_omni, qwen3_vl +from . import (gemma4, glm, internvl, kimi_vl, llama4, llava, minimax_m3_vl, qwen, qwen3_5, qwen3_5_gdn, qwen3_asr, + qwen3_omni, qwen3_vl) diff --git a/src/mcore_bridge/model/gpts/minimax_m3_vl.py b/src/mcore_bridge/model/mm_gpts/minimax_m3_vl.py similarity index 69% rename from src/mcore_bridge/model/gpts/minimax_m3_vl.py rename to src/mcore_bridge/model/mm_gpts/minimax_m3_vl.py index 3f62e28..d88ae44 100644 --- a/src/mcore_bridge/model/gpts/minimax_m3_vl.py +++ b/src/mcore_bridge/model/mm_gpts/minimax_m3_vl.py @@ -6,6 +6,23 @@ from ..constant import ModelType from ..register import ModelMeta, register_model +from .utils import HuggingFaceVit + + +class MinimaxM3Vit(HuggingFaceVit): + module_mapping = {'model.vision_tower': 'vision_tower', 'model.multi_modal_projector': 'multi_modal_projector'} + _vision_tower = ['vision_tower'] + _aligner = ['multi_modal_projector'] + + def prepare_model(self, hf_config: PretrainedConfig): + from transformers.models.internvl.modeling_internvl import InternVLModel, InternVLMultiModalProjector + self.vision_tower = AutoModel.from_config(hf_config.vision_config) + self.multi_modal_projector = InternVLMultiModalProjector(hf_config).to(self.vision_tower.dtype) + self.model_cls = InternVLModel + self.dtype = self.vision_tower.dtype + + def get_inputs_embeds(self, inputs_embeds, **kwargs): + return self._hf_get_inputs_embeds(inputs_embeds, kwargs, self.visual, self.hf_config) class MinimaxM3Bridge(GPTBridge): @@ -47,24 +64,23 @@ def _set_moe_state( # Rename routed experts: w1->gate_proj, w3->up_proj, w2->down_proj # Shared experts already use standard naming (gate_proj/up_proj/down_proj) hf_state_dict = { - k.replace('.w1.', '.gate_proj.').replace('.w3.', '.up_proj.').replace('.w2.', '.down_proj.') - if 'shared_expert' not in k else k: v + k.replace('.w1.', '.gate_proj.').replace('.w3.', '.up_proj.').replace('.w2.', '.down_proj.') if 'shared_expert' not in k else k: + v for k, v in hf_state_dict.items() } hf_state_dict = super()._set_moe_state(mg_mlp, hf_state_dict, hf_prefix, layer_idx, to_mcore, is_mtp) if not to_mcore: # Rename back for routed experts only hf_state_dict = { - k.replace('.gate_proj.', '.w1.').replace('.up_proj.', '.w3.').replace('.down_proj.', '.w2.') - if 'shared_expert' not in k else k: v + k.replace('.gate_proj.', '.w1.').replace('.up_proj.', '.w3.').replace('.down_proj.', '.w2.') if 'shared_expert' not in k else k: + v for k, v in hf_state_dict.items() } return hf_state_dict -register_model( - ModelMeta( - ModelType.minimax_m3_vl, - ['minimax_m3_vl'], - bridge_cls=MinimaxM3Bridge, - )) +register_model(ModelMeta( + ModelType.minimax_m3_vl, + ['minimax_m3_vl'], + bridge_cls=MinimaxM3Bridge, +)) From 39947d2ffec1a09bdebfd7533e915134b1a22cb5 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 16 Jun 2026 15:29:17 +0800 Subject: [PATCH 5/5] update --- src/mcore_bridge/config/parser.py | 22 +++------ .../model/mm_gpts/minimax_m3_vl.py | 49 ++++++------------- 2 files changed, 21 insertions(+), 50 deletions(-) diff --git a/src/mcore_bridge/config/parser.py b/src/mcore_bridge/config/parser.py index e52c2a8..56f8143 100644 --- a/src/mcore_bridge/config/parser.py +++ b/src/mcore_bridge/config/parser.py @@ -26,7 +26,7 @@ 'hf_model_type': ['model_type'], # moe 'moe_ffn_hidden_size': ['moe_intermediate_size'], - 'moe_shared_expert_intermediate_size': ['shared_expert_intermediate_size'], + 'moe_shared_expert_intermediate_size': ['shared_expert_intermediate_size', 'shared_intermediate_size'], 'moe_router_topk': ['num_experts_per_tok', 'moe_topk', 'moe_k', 'top_k_experts'], 'moe_router_num_groups': ['n_group'], 'moe_router_group_topk': ['topk_group'], @@ -230,26 +230,18 @@ def hf_to_mcore_config(hf_config: PretrainedConfig) -> Dict[str, Any]: elif llm_model_type == 'minimax_m2': res['add_qkv_bias'] = False elif hf_model_type == 'minimax_m3_vl': - text_cfg = getattr(hf_config, 'text_config', hf_config) + text_config = hf_config.text_config res['add_qkv_bias'] = False # Fix intermediate sizes: intermediate_size is MoE expert size, dense_intermediate_size is for dense MLP - res['moe_ffn_hidden_size'] = res.get('ffn_hidden_size', 3072) - dense_intermediate_size = getattr(text_cfg, 'dense_intermediate_size', None) - if dense_intermediate_size: - res['ffn_hidden_size'] = dense_intermediate_size - # Shared expert intermediate size - shared_intermediate_size = getattr(text_cfg, 'shared_intermediate_size', None) - if shared_intermediate_size: - res['moe_shared_expert_intermediate_size'] = shared_intermediate_size - # moe_layer_freq from list - moe_layer_freq_list = getattr(text_cfg, 'moe_layer_freq', None) + res['moe_ffn_hidden_size'] = res['ffn_hidden_size'] + res['ffn_hidden_size'] = text_config.dense_intermediate_size + moe_layer_freq_list = text_config.mlp_layer_types if isinstance(moe_layer_freq_list, list): - res['moe_layer_freq'] = f"[{','.join(str(x) for x in moe_layer_freq_list)}]" - # Activation: swigluoai = quick_geglu + glu_linear_offset + res['moe_layer_freq'] = f"[{','.join('0' if x == 'dense' else '1' for x in moe_layer_freq_list)}]" res['swiglu'] = False res['quick_geglu'] = True + res['activation_func_clamp_value'] = 7 res['glu_linear_offset'] = 1 - # Gemma-style RMSNorm: weight is (1 + w) res['layernorm_zero_centered_gamma'] = True elif llm_model_type == 'olmoe': res['qk_layernorm'] = True diff --git a/src/mcore_bridge/model/mm_gpts/minimax_m3_vl.py b/src/mcore_bridge/model/mm_gpts/minimax_m3_vl.py index d88ae44..69814dc 100644 --- a/src/mcore_bridge/model/mm_gpts/minimax_m3_vl.py +++ b/src/mcore_bridge/model/mm_gpts/minimax_m3_vl.py @@ -1,6 +1,7 @@ # Copyright (c) ModelScope Contributors. All rights reserved. import torch import torch.distributed as dist +from transformers import PretrainedConfig from mcore_bridge.bridge import GPTBridge @@ -15,11 +16,10 @@ class MinimaxM3Vit(HuggingFaceVit): _aligner = ['multi_modal_projector'] def prepare_model(self, hf_config: PretrainedConfig): - from transformers.models.internvl.modeling_internvl import InternVLModel, InternVLMultiModalProjector - self.vision_tower = AutoModel.from_config(hf_config.vision_config) - self.multi_modal_projector = InternVLMultiModalProjector(hf_config).to(self.vision_tower.dtype) - self.model_cls = InternVLModel - self.dtype = self.vision_tower.dtype + from transformers.models.minimax_m3_vl.modeling_minimax_m3_vl import (MiniMaxM3VLMultiModalProjector, + MiniMaxM3VLVisionModel) + self.vision_tower = MiniMaxM3VLVisionModel(hf_config.vision_config).to(hf_config.dtype) + self.multi_modal_projector = MiniMaxM3VLMultiModalProjector(hf_config).to(hf_config.dtype) def get_inputs_embeds(self, inputs_embeds, **kwargs): return self._hf_get_inputs_embeds(inputs_embeds, kwargs, self.visual, self.hf_config) @@ -33,24 +33,6 @@ class MinimaxM3Bridge(GPTBridge): hf_lm_head_key = 'language_model.lm_head.weight' hf_expert_bias_key = 'e_score_correction_bias' - def _set_layer_mlp(self, mg_layer, hf_state_dict, layer_idx: int, to_mcore: bool, is_mtp: bool = False): - mg_mlp = None if mg_layer is None else mg_layer.mlp - is_moe = True if hasattr(mg_mlp, 'experts') else False - if not to_mcore: - is_moe = torch.tensor([is_moe], dtype=torch.bool, device='cuda') - if self.pp_size > 1: - dist.all_reduce(is_moe, group=self.pp_group) - if is_moe: - hf_state_dict.update( - self._set_moe_state(mg_mlp, hf_state_dict, 'block_sparse_moe.', layer_idx, to_mcore, is_mtp=is_mtp)) - self._set_state_dict(mg_layer, 'pre_mlp_layernorm.weight', hf_state_dict, - self.hf_post_attention_layernorm_key, to_mcore) - else: - hf_state_dict.update(self._set_mlp_state(mg_mlp, hf_state_dict, 'mlp.', layer_idx, to_mcore)) - self._set_state_dict(mg_layer, 'mlp.linear_fc1.layer_norm_weight', hf_state_dict, - self.hf_post_attention_layernorm_key, to_mcore) - return hf_state_dict - def _set_moe_state( self, mg_mlp, @@ -61,26 +43,23 @@ def _set_moe_state( is_mtp: bool = False, ): if to_mcore: - # Rename routed experts: w1->gate_proj, w3->up_proj, w2->down_proj - # Shared experts already use standard naming (gate_proj/up_proj/down_proj) hf_state_dict = { - k.replace('.w1.', '.gate_proj.').replace('.w3.', '.up_proj.').replace('.w2.', '.down_proj.') if 'shared_expert' not in k else k: - v + k.replace('.w1.', '.gate_proj.').replace('.w3.', '.up_proj.').replace('.w2.', '.down_proj.'): v for k, v in hf_state_dict.items() } hf_state_dict = super()._set_moe_state(mg_mlp, hf_state_dict, hf_prefix, layer_idx, to_mcore, is_mtp) if not to_mcore: - # Rename back for routed experts only hf_state_dict = { - k.replace('.gate_proj.', '.w1.').replace('.up_proj.', '.w3.').replace('.down_proj.', '.w2.') if 'shared_expert' not in k else k: - v + k.replace('.gate_proj.', '.w1.').replace('.up_proj.', '.w3.').replace('.down_proj.', '.w2.'): v for k, v in hf_state_dict.items() } return hf_state_dict -register_model(ModelMeta( - ModelType.minimax_m3_vl, - ['minimax_m3_vl'], - bridge_cls=MinimaxM3Bridge, -)) +register_model( + ModelMeta( + ModelType.minimax_m3_vl, + ['minimax_m3_vl'], + bridge_cls=MinimaxM3Bridge, + visual_cls=MinimaxM3Vit, + ))