diff --git a/README.md b/README.md index 6e46573..85fb643 100644 --- a/README.md +++ b/README.md @@ -147,6 +147,7 @@ The following is the list of models supported by MCore-Bridge: | Qwen | qwen2_vl, qwen2_5_vl, qwen2_5_omni
qwen3_vl, qwen3_vl_moe, qwen3_omni_moe, qwen3_asr
qwen3_5, qwen3_5_moe | | Gemma | gemma4, gemma4_unified | | GLM | glm4v, glm4v_moe | +| MiniMax | minimax_m3_vl | | Kimi | kimi_vl, kimi_k25 | | InternVL | internvl_chat, internvl | | Ovis | ovis2_5 | diff --git a/README_zh.md b/README_zh.md index 7e03aa2..56980c1 100644 --- a/README_zh.md +++ b/README_zh.md @@ -144,6 +144,7 @@ uv pip install -e . --torch-backend=auto | Qwen | qwen2_vl, qwen2_5_vl, qwen2_5_omni
qwen3_vl, qwen3_vl_moe, qwen3_omni_moe, qwen3_asr
qwen3_5, qwen3_5_moe | | Gemma | gemma4, gemma4_unified | | GLM | glm4v, glm4v_moe | +| MiniMax | minimax_m3_vl | | Kimi | kimi_vl, kimi_k25 | | InternVL | internvl_chat, internvl | | Ovis | ovis2_5 | diff --git a/src/mcore_bridge/config/parser.py b/src/mcore_bridge/config/parser.py index a97ba70..56f8143 100644 --- a/src/mcore_bridge/config/parser.py +++ b/src/mcore_bridge/config/parser.py @@ -26,7 +26,7 @@ 'hf_model_type': ['model_type'], # moe 'moe_ffn_hidden_size': ['moe_intermediate_size'], - 'moe_shared_expert_intermediate_size': ['shared_expert_intermediate_size'], + 'moe_shared_expert_intermediate_size': ['shared_expert_intermediate_size', 'shared_intermediate_size'], 'moe_router_topk': ['num_experts_per_tok', 'moe_topk', 'moe_k', 'top_k_experts'], 'moe_router_num_groups': ['n_group'], 'moe_router_group_topk': ['topk_group'], @@ -229,6 +229,20 @@ def hf_to_mcore_config(hf_config: PretrainedConfig) -> Dict[str, Any]: res.setdefault('linear_attention_freq', 4) elif llm_model_type == 'minimax_m2': res['add_qkv_bias'] = False + elif hf_model_type == 'minimax_m3_vl': + text_config = hf_config.text_config + res['add_qkv_bias'] = False + # Fix intermediate sizes: intermediate_size is MoE expert size, dense_intermediate_size is for dense MLP + res['moe_ffn_hidden_size'] = res['ffn_hidden_size'] + res['ffn_hidden_size'] = text_config.dense_intermediate_size + moe_layer_freq_list = text_config.mlp_layer_types + if isinstance(moe_layer_freq_list, list): + res['moe_layer_freq'] = f"[{','.join('0' if x == 'dense' else '1' for x in moe_layer_freq_list)}]" + res['swiglu'] = False + res['quick_geglu'] = True + res['activation_func_clamp_value'] = 7 + res['glu_linear_offset'] = 1 + res['layernorm_zero_centered_gamma'] = True elif llm_model_type == 'olmoe': res['qk_layernorm'] = True elif hf_model_type == 'llama4': diff --git a/src/mcore_bridge/model/constant.py b/src/mcore_bridge/model/constant.py index 1c3ae97..7de7672 100644 --- a/src/mcore_bridge/model/constant.py +++ b/src/mcore_bridge/model/constant.py @@ -7,6 +7,7 @@ class LLMModelType: olmoe = 'olmoe' glm4 = 'glm4' minimax_m2 = 'minimax_m2' + minimax_m3_vl = 'minimax_m3_vl' hy_v3 = 'hy_v3' bailing_moe = 'bailing_moe' bailing_hybrid = 'bailing_hybrid' diff --git a/src/mcore_bridge/model/mm_gpts/__init__.py b/src/mcore_bridge/model/mm_gpts/__init__.py index b862ec6..67298dc 100644 --- a/src/mcore_bridge/model/mm_gpts/__init__.py +++ b/src/mcore_bridge/model/mm_gpts/__init__.py @@ -1,2 +1,3 @@ # Copyright (c) ModelScope Contributors. All rights reserved. -from . import gemma4, glm, internvl, kimi_vl, llama4, llava, qwen, qwen3_5, qwen3_5_gdn, qwen3_asr, qwen3_omni, qwen3_vl +from . import (gemma4, glm, internvl, kimi_vl, llama4, llava, minimax_m3_vl, qwen, qwen3_5, qwen3_5_gdn, qwen3_asr, + qwen3_omni, qwen3_vl) diff --git a/src/mcore_bridge/model/mm_gpts/minimax_m3_vl.py b/src/mcore_bridge/model/mm_gpts/minimax_m3_vl.py new file mode 100644 index 0000000..69814dc --- /dev/null +++ b/src/mcore_bridge/model/mm_gpts/minimax_m3_vl.py @@ -0,0 +1,65 @@ +# Copyright (c) ModelScope Contributors. All rights reserved. +import torch +import torch.distributed as dist +from transformers import PretrainedConfig + +from mcore_bridge.bridge import GPTBridge + +from ..constant import ModelType +from ..register import ModelMeta, register_model +from .utils import HuggingFaceVit + + +class MinimaxM3Vit(HuggingFaceVit): + module_mapping = {'model.vision_tower': 'vision_tower', 'model.multi_modal_projector': 'multi_modal_projector'} + _vision_tower = ['vision_tower'] + _aligner = ['multi_modal_projector'] + + def prepare_model(self, hf_config: PretrainedConfig): + from transformers.models.minimax_m3_vl.modeling_minimax_m3_vl import (MiniMaxM3VLMultiModalProjector, + MiniMaxM3VLVisionModel) + self.vision_tower = MiniMaxM3VLVisionModel(hf_config.vision_config).to(hf_config.dtype) + self.multi_modal_projector = MiniMaxM3VLMultiModalProjector(hf_config).to(hf_config.dtype) + + def get_inputs_embeds(self, inputs_embeds, **kwargs): + return self._hf_get_inputs_embeds(inputs_embeds, kwargs, self.visual, self.hf_config) + + +class MinimaxM3Bridge(GPTBridge): + hf_layers_prefix = 'language_model.model.layers' + hf_mtp_prefix = 'language_model.model.layers' + hf_embed_key = 'language_model.model.embed_tokens.weight' + hf_final_layernorm_key = 'language_model.model.norm.weight' + hf_lm_head_key = 'language_model.lm_head.weight' + hf_expert_bias_key = 'e_score_correction_bias' + + def _set_moe_state( + self, + mg_mlp, + hf_state_dict, + hf_prefix: str, + layer_idx: int, + to_mcore: bool, + is_mtp: bool = False, + ): + if to_mcore: + hf_state_dict = { + k.replace('.w1.', '.gate_proj.').replace('.w3.', '.up_proj.').replace('.w2.', '.down_proj.'): v + for k, v in hf_state_dict.items() + } + hf_state_dict = super()._set_moe_state(mg_mlp, hf_state_dict, hf_prefix, layer_idx, to_mcore, is_mtp) + if not to_mcore: + hf_state_dict = { + k.replace('.gate_proj.', '.w1.').replace('.up_proj.', '.w3.').replace('.down_proj.', '.w2.'): v + for k, v in hf_state_dict.items() + } + return hf_state_dict + + +register_model( + ModelMeta( + ModelType.minimax_m3_vl, + ['minimax_m3_vl'], + bridge_cls=MinimaxM3Bridge, + visual_cls=MinimaxM3Vit, + ))