diff --git a/diffsynth_engine/args.py b/diffsynth_engine/args.py
index 38d9cf7..cfdc8f9 100644
--- a/diffsynth_engine/args.py
+++ b/diffsynth_engine/args.py
@@ -17,8 +17,10 @@ def _parse_tuple(value: str) -> Tuple[int, int] | int:
     raise ValueError(f"Cannot parse tuple: {value}, format should be '256,256' or '256'")
 
 
-def _parse_attention_type(attn_type_str: str) -> AttentionType:
+def _parse_attention_type(attn_type_str: str | None) -> AttentionType | None:
     """Convert string to AttentionType enum"""
+    if attn_type_str is None:
+        return None
     return AttentionType[attn_type_str.upper()]
 
 
@@ -106,9 +108,9 @@ def parse_cli_args() -> Dict[str, Any]:
     attn_group.add_argument(
         "--attn-type",
         type=str,
-        default="sdpa",
+        default=None,
         choices=attn_type_choices,
-        help="Attention type (default: sdpa)",
+        help="Attention type (default: auto, SDPA on GPU, MINDIE on NPU)",
     )
     attn_group.add_argument(
         "--sparge-topk",
diff --git a/diffsynth_engine/configs/base.py b/diffsynth_engine/configs/base.py
index c05eb0b..ce5198b 100644
--- a/diffsynth_engine/configs/base.py
+++ b/diffsynth_engine/configs/base.py
@@ -36,7 +36,7 @@ class PipelineConfig:
     vae_tile_stride: int | Tuple[int, int] = (192, 192)
 
     # attention
-    attn_type: AttentionType = AttentionType.SDPA
+    attn_type: AttentionType | None = None  # None = auto-detect
     attn_params: Optional[AttentionParams] = None
 
     # parallelism
diff --git a/diffsynth_engine/layers/attention/backends/abstract.py b/diffsynth_engine/layers/attention/backends/abstract.py
index 949400e..9a70de1 100644
--- a/diffsynth_engine/layers/attention/backends/abstract.py
+++ b/diffsynth_engine/layers/attention/backends/abstract.py
@@ -22,6 +22,7 @@ class AttentionType(enum.Enum):
     SAGE2 = enum.auto()
     SAGE3 = enum.auto()
     SPARGE = enum.auto()
+    MINDIE = enum.auto()
 
     def __str__(self) -> str:
         return self.name.lower()
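With the CLI default changed to None, the string-to-enum conversion simply passes the unset case through, and the device-dependent choice is deferred to get_attn_backend. A minimal sketch of the expected mapping (a behavioural illustration, not code from this patch):

    from diffsynth_engine.args import _parse_attention_type
    from diffsynth_engine.layers.attention.backends.abstract import AttentionType

    assert _parse_attention_type("sdpa") is AttentionType.SDPA
    assert _parse_attention_type("mindie") is AttentionType.MINDIE
    assert _parse_attention_type(None) is None  # resolved later, on the target device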
diff --git a/diffsynth_engine/layers/attention/backends/mindie_attn.py b/diffsynth_engine/layers/attention/backends/mindie_attn.py
new file mode 100644
index 0000000..70c5103
--- /dev/null
+++ b/diffsynth_engine/layers/attention/backends/mindie_attn.py
@@ -0,0 +1,79 @@
+import torch
+from diffsynth_engine.layers.attention.backends.abstract import (
+    AttentionBackend,
+    AttentionImpl,
+    AttentionMetadata,
+    AttentionType,
+)
+from diffsynth_engine.utils.import_utils import is_npu_available
+
+
+class MindieAttentionBackend(AttentionBackend):
+    @staticmethod
+    def check_availability() -> None:
+        if not is_npu_available():
+            raise RuntimeError("NPU is not available, cannot use MINDIE attention backend")
+
+    @staticmethod
+    def get_type() -> AttentionType:
+        return AttentionType.MINDIE
+
+    @staticmethod
+    def get_impl_cls() -> type["AttentionImpl"]:
+        return MindieAttentionImpl
+
+    @staticmethod
+    def get_metadata_cls() -> type["AttentionMetadata"]:
+        return AttentionMetadata
+
+    @staticmethod
+    def get_builder_cls() -> type:
+        return None
+
+    @staticmethod
+    def get_supported_head_sizes() -> list[int]:
+        return []
+
+
+class MindieAttentionImpl(AttentionImpl):
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        softmax_scale: float | None = None,
+        causal: bool = False,
+        num_kv_heads: int | None = None,
+        **extra_impl_args,
+    ) -> None:
+        if num_kv_heads is None:
+            num_kv_heads = num_heads
+        self.num_kv_groups = num_heads // num_kv_heads
+        self.causal = causal
+        self.softmax_scale = softmax_scale
+        self.num_heads = num_heads
+        self.head_size = head_size
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attn_mask: torch.Tensor | None = None,
+        attn_metadata=None,
+    ) -> torch.Tensor:
+        from mindiesd.layers.flash_attn.attention_forward import attention_forward
+
+        scale = self.softmax_scale
+        if scale is None:
+            scale = self.head_size ** -0.5
+
+        out = attention_forward(
+            query=query,
+            key=key,
+            value=value,
+            attn_mask=attn_mask,
+            scale=scale,
+            fused=True,
+            head_first=False,
+        )
+        return out
\ No newline at end of file
diff --git a/diffsynth_engine/layers/attention/selector.py b/diffsynth_engine/layers/attention/selector.py
index a151382..1cc4ed4 100644
--- a/diffsynth_engine/layers/attention/selector.py
+++ b/diffsynth_engine/layers/attention/selector.py
@@ -1,7 +1,7 @@
 from functools import cache
 
 from diffsynth_engine.layers.attention.backends.abstract import AttentionBackend, AttentionType
-from diffsynth_engine.utils.import_utils import LazyImport
+from diffsynth_engine.utils.import_utils import LazyImport, is_npu_available
 
 AiterBackend = LazyImport("diffsynth_engine.layers.attention.backends.aiter", "AiterBackend")
 AiterFP8Backend = LazyImport("diffsynth_engine.layers.attention.backends.aiter", "AiterFP8Backend")
@@ -15,6 +15,7 @@
 SageAttention3Backend = LazyImport("diffsynth_engine.layers.attention.backends.sage_attn_3", "SageAttention3Backend")
 SDPABackend = LazyImport("diffsynth_engine.layers.attention.backends.sdpa", "SDPABackend")
 SpargeAttentionBackend = LazyImport("diffsynth_engine.layers.attention.backends.sparge_attn", "SpargeAttentionBackend")
+MindieAttentionBackend = LazyImport("diffsynth_engine.layers.attention.backends.mindie_attn", "MindieAttentionBackend")
 
 _attention_backends = {
     AttentionType.AITER: AiterBackend,
@@ -27,14 +28,16 @@
     AttentionType.SAGE3: SageAttention3Backend,
     AttentionType.SDPA: SDPABackend,
     AttentionType.SPARGE: SpargeAttentionBackend,
+    AttentionType.MINDIE: MindieAttentionBackend,
 }
 
 
 @cache
 def get_attn_backend(head_size: int, attn_type: AttentionType | None = None) -> type["AttentionBackend"]:
-    # use SDPA as default
     if attn_type is None:
-        attn_type = AttentionType.SDPA
+        # Auto-detect: NPU → MINDIE, otherwise → SDPA
+        attn_type = AttentionType.MINDIE if is_npu_available() else AttentionType.SDPA
+
     selected_backend = _attention_backends[attn_type]
     selected_backend.check_availability()
     if not selected_backend.supports_head_size(head_size):
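For orientation, the MindIE kernel above consumes tensors in the head_first=False layout ([B, S, H, D]) and, with softmax_scale left as None, the usual 1/sqrt(head_size) scaling. A rough CPU reference of what that call is expected to compute, together with the auto-selecting entry point (a sketch for illustration; scaled_dot_product_attention here is only a stand-in, not part of the patch):

    import torch
    import torch.nn.functional as F

    from diffsynth_engine.layers.attention.selector import get_attn_backend

    # With attn_type=None this should resolve to SDPABackend on GPU/CPU hosts
    # and to MindieAttentionBackend on NPU hosts.
    backend_cls = get_attn_backend(head_size=64)

    def reference_attention(query, key, value, attn_mask=None):
        # query/key/value: [B, S, H, D], i.e. the head_first=False layout above
        q, k, v = (t.transpose(1, 2) for t in (query, key, value))  # -> [B, H, S, D]
        out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
        return out.transpose(1, 2)  # back to [B, S, H, D]

    q = k = v = torch.randn(1, 16, 8, 64)
    ref = reference_attention(q, k, v)  # what MindieAttentionImpl.forward should match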
diff --git a/diffsynth_engine/layers/mlp.py b/diffsynth_engine/layers/mlp.py
new file mode 100644
index 0000000..de00521
--- /dev/null
+++ b/diffsynth_engine/layers/mlp.py
@@ -0,0 +1,72 @@
+import torch.nn as nn
+import torch.nn.functional as F
+from diffsynth_engine.utils.import_utils import is_npu_available
+
+try:
+    import torch_npu
+except ImportError:
+    torch_npu = None
+
+
+class _GELUProj(nn.Module):
+    """Wrapper to match diffusers FeedForward GELU structure with internal proj.
+
+    This wrapper holds the first Linear layer as .proj to match checkpoint keys; FastGELUMLP.forward calls .proj directly.
+    """
+
+    def __init__(self, dim, inner_dim):
+        super().__init__()
+        self.proj = nn.Linear(dim, inner_dim)
+
+    def forward(self, x):
+        return F.gelu(x, approximate="tanh")
+
+
+class FastGELUMLP(nn.Module):
+    """MLP with npu_fast_gelu on NPU, fallback to F.gelu on other devices.
+
+    Functionally equivalent to diffusers.models.attention.FeedForward(
+        dim=dim, dim_out=dim, activation_fn="gelu-approximate"
+    )
+    """
+
+    def __init__(self, dim, dim_out=None, mult=4):
+        """Initialize MLP.
+
+        Args:
+            dim: Input and output dimension
+            dim_out: Output dimension, defaults to dim
+            mult: inner_dim = dim * mult, defaults to 4
+        """
+        super().__init__()
+        inner_dim = int(dim * mult)
+        dim_out = dim_out or dim
+
+        # Match diffusers FeedForward structure: net[0]=GELU(proj), net[2]=output
+        # net[1] is a Dropout(0.0) placeholder that forward() skips
+        self.net = nn.ModuleList([
+            _GELUProj(dim, inner_dim),
+            nn.Dropout(0.0),
+            nn.Linear(inner_dim, dim_out),
+        ])
+
+    def forward(self, hidden_states):
+        """Forward pass.
+
+        Args:
+            hidden_states: Input tensor, shape [B, S, dim]
+
+        Returns:
+            Output tensor, shape [B, S, dim_out]
+        """
+        # net[0] = _GELUProj with internal proj (dim → inner_dim)
+        hidden_states = self.net[0].proj(hidden_states)
+
+        if is_npu_available() and torch_npu is not None:
+            hidden_states = torch_npu.npu_fast_gelu(hidden_states)
+        else:
+            hidden_states = F.gelu(hidden_states, approximate="tanh")
+
+        # net[2] = output Linear (inner_dim → dim_out)
+        hidden_states = self.net[2](hidden_states)
+        return hidden_states
\ No newline at end of file
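FastGELUMLP is meant to be checkpoint- and output-compatible with the diffusers module it replaces, so a parity check along these lines can be run on the CPU fallback path (a sketch; it assumes diffusers keeps the net.0.proj / net.2 layout for activation_fn="gelu-approximate"):

    import torch
    from diffusers.models.attention import FeedForward
    from diffsynth_engine.layers.mlp import FastGELUMLP

    dim = 64
    ref = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
    mlp = FastGELUMLP(dim=dim, dim_out=dim)
    mlp.load_state_dict(ref.state_dict())  # net.0.proj.* and net.2.* keys line up

    x = torch.randn(2, 16, dim)
    with torch.no_grad():
        assert torch.allclose(mlp(x), ref(x), atol=1e-6)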
diff --git a/diffsynth_engine/layers/norm.py b/diffsynth_engine/layers/norm.py
new file mode 100644
index 0000000..b846aaa
--- /dev/null
+++ b/diffsynth_engine/layers/norm.py
@@ -0,0 +1,78 @@
+import torch
+import torch.nn as nn
+from diffusers.models.normalization import RMSNorm as DiffusersRMSNorm
+from diffsynth_engine.utils.import_utils import is_npu_available
+
+try:
+    import torch_npu
+except ImportError:
+    torch_npu = None
+
+try:
+    from mindiesd.layers import layernorm_scale_shift
+except ImportError:
+    layernorm_scale_shift = None
+
+
+class RMSNorm(nn.Module):
+    """NPU-optimized RMSNorm wrapper with fallback to diffusers implementation."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.eps = eps
+        # Cache the fallback instance so forward() reuses the same weight
+        # tensor. register_parameter is reference assignment (no copy), so
+        # self.weight and self._fallback.weight share the same storage.
+        # When a checkpoint writes to "weight", both paths see the update.
+        self._fallback = DiffusersRMSNorm(hidden_size, eps)
+        self.register_parameter("weight", self._fallback.weight)
+
+    def forward(self, hidden_states):
+        if is_npu_available() and torch_npu is not None:
+            return torch_npu.npu_rms_norm(hidden_states, self.weight, epsilon=self.eps)[0]
+        else:
+            return self._fallback(hidden_states)
+
+
+class AdaLayerNorm(nn.Module):
+    """NPU-optimized AdaLayerNorm with fallback to original implementation.
+
+    Performs: output = layernorm(x) * (1 + scale) + shift
+
+    Args:
+        layernorm: The underlying nn.LayerNorm module (elementwise_affine=False)
+    """
+
+    def __init__(self, layernorm: nn.LayerNorm):
+        super().__init__()
+        self.layernorm = layernorm
+
+    def forward(self, hidden_states: torch.Tensor, scale: torch.Tensor, shift: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            hidden_states: Input tensor, shape [B, S, H]
+            scale: Scale parameter, shape [B, H] or [B, 1, H]
+            shift: Shift parameter, shape [B, H] or [B, 1, H]
+
+        Returns:
+            layernorm(x) * (1 + scale) + shift
+        """
+        if is_npu_available() and layernorm_scale_shift is not None:
+            # NPU path: use MindIE-SD fused operator
+            return layernorm_scale_shift(
+                layernorm=self.layernorm,
+                x=hidden_states,
+                scale=scale,
+                shift=shift,
+                fused=True
+            )
+        else:
+            # Fallback: original Python implementation
+            # Expand [B, H] -> [B, 1, H] so scale/shift broadcast over the sequence dim
+            normed = self.layernorm(hidden_states)
+            if scale.dim() == 2:
+                scale = scale.unsqueeze(1)
+            if shift.dim() == 2:
+                shift = shift.unsqueeze(1)
+            return normed * (1 + scale) + shift
\ No newline at end of file
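The register_parameter call above makes the wrapper and its cached diffusers fallback share one Parameter, so a checkpoint loaded through the "weight" key updates both code paths. A quick CPU-only check of that invariant (a sketch, assuming no NPU is present so the fallback runs):

    import torch
    from diffsynth_engine.layers.norm import RMSNorm

    norm = RMSNorm(64)
    assert norm.weight is norm._fallback.weight  # one Parameter, two names

    norm.load_state_dict({"weight": torch.full((64,), 2.0)}, strict=False)
    assert torch.equal(norm._fallback.weight, torch.full((64,), 2.0))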
diff --git a/diffsynth_engine/models/qwen_image/transformer_qwenimage.py b/diffsynth_engine/models/qwen_image/transformer_qwenimage.py
index 699f108..900ca95 100644
--- a/diffsynth_engine/models/qwen_image/transformer_qwenimage.py
+++ b/diffsynth_engine/models/qwen_image/transformer_qwenimage.py
@@ -22,16 +22,18 @@
 import torch
 import torch.nn as nn
 from diffusers.configuration_utils import register_to_config
-from diffusers.models.attention import FeedForward
+from diffsynth_engine.layers.mlp import FastGELUMLP
 from diffusers.models.embeddings import TimestepEmbedding, Timesteps
 from diffusers.models.modeling_outputs import Transformer2DModelOutput
-from diffusers.models.normalization import AdaLayerNormContinuous, RMSNorm
+from diffusers.models.normalization import AdaLayerNormContinuous
+from diffsynth_engine.layers.norm import RMSNorm, AdaLayerNorm
 
 from diffsynth_engine.distributed.utils import sequence_parallel_shard, sequence_parallel_unshard
 from diffsynth_engine.forward_context import get_forward_context
 from diffsynth_engine.layers.attention import USPAttention
 from diffsynth_engine.models.base import DiffusionModel
 from diffsynth_engine.utils import logging
+from diffsynth_engine.utils.import_utils import is_npu_available
 
 logger = logging.get_logger(__name__)
 
@@ -58,25 +60,45 @@
     """
     if use_real:
         cos, sin = freqs_cis  # [S, D]
-        cos = cos[None, None]
-        sin = sin[None, None]
+        # Broadcast to [1, S, 1, D] to match x: [B, S, H, D]
+        cos = cos[None, :, None, :]
+        sin = sin[None, :, None, :]
         cos, sin = cos.to(x.device), sin.to(x.device)
 
+        # rotated_mode mapping
         if use_real_unbind_dim == -1:
-            # Used for flux, cogvideox, hunyuan-dit
-            x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
-            x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
+            rotated_mode = "rotated_half"
         elif use_real_unbind_dim == -2:
-            # Used for Stable Audio, OmniGen, CogView4 and Cosmos
-            x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)  # [B, S, H, D//2]
-            x_rotated = torch.cat([-x_imag, x_real], dim=-1)
+            rotated_mode = "rotated_interleaved"
         else:
-            raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
-
-        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
+            raise ValueError(f"use_real_unbind_dim must be -1 or -2, got {use_real_unbind_dim}")
+
+        if is_npu_available():
+            from mindiesd.layers.rope import rotary_position_embedding
+
+            x_out = rotary_position_embedding(
+                x=x,
+                cos=cos,
+                sin=sin,
+                rotated_mode=rotated_mode,
+                head_first=False,
+                fused=True,
+            )
+        else:
+            # Fallback to original implementation
+            if use_real_unbind_dim == -1:
+                x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
+                x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
+            else:
+                x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)
+                x_rotated = torch.cat([-x_imag, x_real], dim=-1)
+            x_out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
 
-        return out
+        return x_out
     else:
+        # Complex path: freqs_cis is [S, D//2] complex
+        # x is [B, S, H, D] where D = 2 * freq_dim
+        # Use original complex multiplication approach
         x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
         freqs_cis = freqs_cis.unsqueeze(1)
         x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
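Both the fused rotary_position_embedding call and the fallback branch above implement the standard rotate-half identity. A self-contained check of that identity for the use_real_unbind_dim == -1 case (the per-pair cos/sin layout built with repeat_interleave is an assumption for illustration; this hunk does not show how the pipeline constructs freqs_cis):

    import torch

    B, S, H, D = 1, 4, 2, 8
    x = torch.randn(B, S, H, D)
    theta = torch.rand(S, D // 2)
    cos = theta.cos().repeat_interleave(2, dim=-1)  # [S, D], one angle per (x0, x1) pair
    sin = theta.sin().repeat_interleave(2, dim=-1)

    # Rotate-half form used in the fallback branch
    x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
    x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
    out = x * cos[None, :, None, :] + x_rotated * sin[None, :, None, :]

    # The same rotation written as complex multiplication
    x_complex = torch.view_as_complex(x.reshape(*x.shape[:-1], -1, 2).contiguous())
    angle = torch.polar(torch.ones_like(theta), theta)[None, :, None, :]
    assert torch.allclose(out, torch.view_as_real(x_complex * angle).flatten(3), atol=1e-6)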
@@ -544,7 +566,7 @@ def __init__(
             nn.SiLU(),
             nn.Linear(dim, 6 * dim, bias=True),  # For scale, shift, gate for norm1 and norm2
         )
-        self.img_norm1 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
+        self.img_norm1 = AdaLayerNorm(nn.LayerNorm(dim, elementwise_affine=False, eps=eps))
         self.attn = QwenDoubleStreamAttention(
             dim=dim,
             num_attention_heads=num_attention_heads,
@@ -552,23 +574,31 @@
             qk_norm=qk_norm,
             eps=eps,
         )
-        self.img_norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
-        self.img_mlp = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
+        self.img_norm2 = AdaLayerNorm(nn.LayerNorm(dim, elementwise_affine=False, eps=eps))
+        self.img_mlp = FastGELUMLP(dim=dim, dim_out=dim)
 
         # Text processing modules
         self.txt_mod = nn.Sequential(
             nn.SiLU(),
             nn.Linear(dim, 6 * dim, bias=True),  # For scale, shift, gate for norm1 and norm2
         )
-        self.txt_norm1 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
+        self.txt_norm1 = AdaLayerNorm(nn.LayerNorm(dim, elementwise_affine=False, eps=eps))
         # Text doesn't need separate attention - it's handled by img_attn joint computation
-        self.txt_norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
-        self.txt_mlp = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
+        self.txt_norm2 = AdaLayerNorm(nn.LayerNorm(dim, elementwise_affine=False, eps=eps))
+        self.txt_mlp = FastGELUMLP(dim=dim, dim_out=dim)
 
         self.zero_cond_t = zero_cond_t
 
     def _modulate(self, x, mod_params, index=None):
-        """Apply modulation to input tensor"""
+        """Apply modulation to input tensor.
+
+        NOTE: Currently unused in the normal forward path, which uses
+        AdaLayerNorm (NPU-optimized) instead. This method is preserved for
+        the zero_cond_t=True path, where modulate_index drives per-token
+        conditional selection of scale/shift/gate. AdaLayerNorm does not
+        support this per-token logic, so when zero_cond_t=True is enabled,
+        forward() should fall back to _modulate when modulate_index is not None.
+        """
         # x: b l d, shift: b d, scale: b d, gate: b d
         shift, scale, gate = mod_params.chunk(3, dim=-1)
 
@@ -613,24 +643,31 @@
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
         modulate_index: Optional[List[int]] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
-        # Get modulation parameters for both streams
-        img_mod_params = self.img_mod(temb)  # [B, 6*dim]
-
+        # When zero_cond_t is enabled, temb has 2*B batch (cond + uncond CFG).
+        # Chunk it first so both img and txt mod_params use the same B-sized temb.
+        # NOTE: per-token conditional modulation (modulate_index) is unsupported
+        # with AdaLayerNorm; _modulate is preserved for future CFG support.
         if self.zero_cond_t:
             temb = torch.chunk(temb, 2, dim=0)[0]
+
+        img_mod_params = self.img_mod(temb)  # [B, 6*dim]
         txt_mod_params = self.txt_mod(temb)  # [B, 6*dim]
 
         # Split modulation parameters for norm1 and norm2
         img_mod1, img_mod2 = img_mod_params.chunk(2, dim=-1)  # Each [B, 3*dim]
         txt_mod1, txt_mod2 = txt_mod_params.chunk(2, dim=-1)  # Each [B, 3*dim]
 
-        # Process image stream - norm1 + modulation
-        img_normed = self.img_norm1(hidden_states)
-        img_modulated, img_gate1 = self._modulate(img_normed, img_mod1, modulate_index)
+        # Split shift/scale/gate for AdaLayerNorm
+        img_shift1, img_scale1, img_gate1 = img_mod1.chunk(3, dim=-1)
+        img_shift2, img_scale2, img_gate2 = img_mod2.chunk(3, dim=-1)
+        txt_shift1, txt_scale1, txt_gate1 = txt_mod1.chunk(3, dim=-1)
+        txt_shift2, txt_scale2, txt_gate2 = txt_mod2.chunk(3, dim=-1)
+
+        # Process image stream - norm1 + modulation (AdaLayerNorm)
+        img_modulated = self.img_norm1(hidden_states, img_scale1, img_shift1)
 
-        # Process text stream - norm1 + modulation
-        txt_normed = self.txt_norm1(encoder_hidden_states)
-        txt_modulated, txt_gate1 = self._modulate(txt_normed, txt_mod1)
+        # Process text stream - norm1 + modulation (AdaLayerNorm)
+        txt_modulated = self.txt_norm1(encoder_hidden_states, txt_scale1, txt_shift1)
 
         # Use QwenDoubleStreamAttention for joint attention computation
         # This directly implements the DoubleStreamLayerMegatron logic:
@@ -649,20 +686,19 @@
         )
 
         # Apply attention gates and add residual (like in Megatron)
-        hidden_states = hidden_states + img_gate1 * img_attn_output
-        encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn_output
+        # .unsqueeze(1): gates are [B, dim] from chunk, need [B, 1, dim] to broadcast with [B, S, dim]
+        hidden_states = hidden_states + img_gate1.unsqueeze(1) * img_attn_output
+        encoder_hidden_states = encoder_hidden_states + txt_gate1.unsqueeze(1) * txt_attn_output
 
-        # Process image stream - norm2 + MLP
-        img_normed2 = self.img_norm2(hidden_states)
-        img_modulated2, img_gate2 = self._modulate(img_normed2, img_mod2, modulate_index)
+        # Process image stream - norm2 + MLP (AdaLayerNorm)
+        img_modulated2 = self.img_norm2(hidden_states, img_scale2, img_shift2)
         img_mlp_output = self.img_mlp(img_modulated2)
-        hidden_states = hidden_states + img_gate2 * img_mlp_output
+        hidden_states = hidden_states + img_gate2.unsqueeze(1) * img_mlp_output
 
-        # Process text stream - norm2 + MLP
-        txt_normed2 = self.txt_norm2(encoder_hidden_states)
-        txt_modulated2, txt_gate2 = self._modulate(txt_normed2, txt_mod2)
+        # Process text stream - norm2 + MLP (AdaLayerNorm)
+        txt_modulated2 = self.txt_norm2(encoder_hidden_states, txt_scale2, txt_shift2)
         txt_mlp_output = self.txt_mlp(txt_modulated2)
-        encoder_hidden_states = encoder_hidden_states + txt_gate2 * txt_mlp_output
+        encoder_hidden_states = encoder_hidden_states + txt_gate2.unsqueeze(1) * txt_mlp_output
 
         # Clip to prevent overflow for fp16
         if encoder_hidden_states.dtype == torch.float16:
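The rewritten block forward above relies on one shape convention throughout: modulation parameters come out of a [B, 6*dim] projection, are chunked down to [B, dim] pieces, and are broadcast over the sequence with unsqueeze(1). A toy walk-through of those steps (standalone sizes; random tensors stand in for self.img_mod(temb) and for the attention branch output):

    import torch
    import torch.nn.functional as F

    B, S, dim = 2, 16, 8
    x = torch.randn(B, S, dim)
    img_mod_params = torch.randn(B, 6 * dim)              # stand-in for self.img_mod(temb)

    img_mod1, img_mod2 = img_mod_params.chunk(2, dim=-1)  # each [B, 3*dim]
    shift1, scale1, gate1 = img_mod1.chunk(3, dim=-1)     # each [B, dim]

    # AdaLayerNorm: layernorm(x) * (1 + scale) + shift, broadcast over the sequence dim
    normed = F.layer_norm(x, (dim,), eps=1e-6)
    modulated = normed * (1 + scale1.unsqueeze(1)) + shift1.unsqueeze(1)

    # Gated residual: the gate is [B, dim], so it needs unsqueeze(1) to meet [B, S, dim]
    attn_out = torch.randn_like(x)                        # stand-in for the attention output
    x = x + gate1.unsqueeze(1) * attn_out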
diff --git a/diffsynth_engine/utils/import_utils.py b/diffsynth_engine/utils/import_utils.py
index 8c7ce7e..0492489 100644
--- a/diffsynth_engine/utils/import_utils.py
+++ b/diffsynth_engine/utils/import_utils.py
@@ -3,6 +3,34 @@
 import importlib
 
 
+def is_npu_available():
+    """Detect if NPU is available using mindiesd.utils.is_npu_available.
+
+    Falls back to manual detection if mindiesd is not available.
+    """
+    mindiesd_spec = importlib.util.find_spec("mindiesd")
+    if mindiesd_spec is not None:
+        try:
+            from mindiesd.utils import is_npu_available as mindiesd_is_npu_available
+
+            return mindiesd_is_npu_available()
+        except (ImportError, AttributeError):
+            pass
+
+    # Fallback to manual detection
+    if importlib.util.find_spec("torch_npu") is None:
+        return False
+    try:
+        import torch
+
+        import torch_npu
+
+        _ = torch.npu.device_count()
+        return torch.npu.is_available()
+    except RuntimeError:
+        return False
+
+
 class LazyImport:
     def __init__(self, module_name: str, class_name: str):
         self.module_name = module_name
diff --git a/tests/test_layers/test_adalayernorm.py b/tests/test_layers/test_adalayernorm.py
new file mode 100644
index 0000000..afb92ab
--- /dev/null
+++ b/tests/test_layers/test_adalayernorm.py
@@ -0,0 +1,91 @@
+import unittest
+
+import torch
+import torch.nn as nn
+
+from diffsynth_engine.layers.norm import AdaLayerNorm
+
+
+class TestAdaLayerNorm(unittest.TestCase):
+    """Test AdaLayerNorm wrapper class"""
+
+    def test_forward_with_2d_scale_shift(self):
+        """Test AdaLayerNorm with [B, H] scale and shift"""
+        layernorm = nn.LayerNorm(64, elementwise_affine=False, eps=1e-6)
+        adaln = AdaLayerNorm(layernorm)
+
+        hidden_states = torch.randn(2, 16, 64)
+        scale = torch.randn(2, 64)
+        shift = torch.randn(2, 64)
+
+        output = adaln(hidden_states, scale, shift)
+
+        self.assertEqual(output.shape, hidden_states.shape)
+        self.assertFalse(torch.isnan(output).any())
+
+    def test_forward_with_3d_scale_shift(self):
+        """Test AdaLayerNorm with [B, 1, H] scale and shift"""
+        layernorm = nn.LayerNorm(64, elementwise_affine=False, eps=1e-6)
+        adaln = AdaLayerNorm(layernorm)
+
+        hidden_states = torch.randn(2, 16, 64)
+        scale = torch.randn(2, 1, 64)  # 3D
+        shift = torch.randn(2, 1, 64)  # 3D
+
+        output = adaln(hidden_states, scale, shift)
+
+        self.assertEqual(output.shape, hidden_states.shape)
+        self.assertFalse(torch.isnan(output).any())
+
+    def test_forward_mixed_scale_shift(self):
+        """Test AdaLayerNorm with [B, H] scale and [B, 1, H] shift"""
+        layernorm = nn.LayerNorm(64, elementwise_affine=False, eps=1e-6)
+        adaln = AdaLayerNorm(layernorm)
+
+        hidden_states = torch.randn(2, 16, 64)
+        scale = torch.randn(2, 64)  # 2D
+        shift = torch.randn(2, 1, 64)  # 3D
+
+        output = adaln(hidden_states, scale, shift)
+
+        self.assertEqual(output.shape, hidden_states.shape)
+        self.assertFalse(torch.isnan(output).any())
+
+    def test_adalayernorm_vs_manual(self):
+        """Test that AdaLayerNorm output matches manual implementation"""
+        layernorm = nn.LayerNorm(64, elementwise_affine=False, eps=1e-6)
+        adaln = AdaLayerNorm(layernorm)
+
+        hidden_states = torch.randn(2, 16, 64)
+        scale = torch.randn(2, 64)
+        shift = torch.randn(2, 64)
+
+        # Get output from AdaLayerNorm
+        output = adaln(hidden_states, scale, shift)
+
+        # Manual implementation for comparison
+        normed = layernorm(hidden_states)
+        scale_expanded = scale.unsqueeze(1)  # [B, H] -> [B, 1, H]
+        shift_expanded = shift.unsqueeze(1)
+        expected = normed * (1 + scale_expanded) + shift_expanded
+
+        self.assertTrue(torch.allclose(output, expected, atol=1e-5))
+
+    def test_different_batch_size(self):
+        """Test AdaLayerNorm with different batch sizes"""
+        layernorm = nn.LayerNorm(128, elementwise_affine=False, eps=1e-6)
+        adaln = AdaLayerNorm(layernorm)
+
+        for batch_size in [1, 4, 8]:
+            hidden_states = torch.randn(batch_size, 32, 128)
+            scale = torch.randn(batch_size, 128)
+            shift = torch.randn(batch_size, 128)
+
+            output = adaln(hidden_states, scale, shift)
+
+            self.assertEqual(output.shape, hidden_states.shape)
+            self.assertFalse(torch.isnan(output).any())
+
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file
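The new tests only exercise the CPU/GPU fallback path of AdaLayerNorm. A companion test for the RMSNorm wrapper could follow the same pattern; this is a sketch, assuming the host has no NPU so the diffusers fallback is what runs:

    import unittest

    import torch
    from diffusers.models.normalization import RMSNorm as DiffusersRMSNorm

    from diffsynth_engine.layers.norm import RMSNorm


    class TestRMSNormFallback(unittest.TestCase):
        def test_matches_diffusers_on_cpu(self):
            norm = RMSNorm(64)
            ref = DiffusersRMSNorm(64, 1e-6)
            ref.load_state_dict({"weight": norm.weight.detach().clone()})

            x = torch.randn(2, 16, 64)
            self.assertTrue(torch.allclose(norm(x), ref(x), atol=1e-6))


    if __name__ == "__main__":
        unittest.main()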