hao-ai-lab · Edenzzzz · Jul 15, 2025 · Jun 24, 2025 · Jun 24, 2025 · Jun 30, 2025
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -65,6 +65,21 @@ steps:
                   - TEST_TYPE=ssim
                 agents:
                   queue: "default"
+            - path:
+                - "fastvideo/v1/tests/lora/**"
+                - "fastvideo/v1/models/loader/**"
+                - "fastvideo/v1/tests/transformers/**"
+                - "fastvideo/v1/pipelines/**"
+                - "fastvideo/v1/layers/lora/**"
+                - "pyproject.toml"
+                - "docker/Dockerfile.python3.12"
+              config:
+                command: "timeout 15m .buildkite/scripts/pr_test.sh"
+                label: "LoRA Inference Tests"
+                env:
+                  - TEST_TYPE=inference_lora
+                agents:
+                  queue: "default"
             - path:
                 - "fastvideo/v1/**"
                 - "pyproject.toml"

diff --git a/.buildkite/scripts/pr_test.sh b/.buildkite/scripts/pr_test.sh
@@ -97,6 +97,10 @@ case "$TEST_TYPE" in
         log "Running precision VSA tests..."
         MODAL_COMMAND="$MODAL_ENV python3 -m modal run $MODAL_TEST_FILE::run_precision_tests_VSA"
         ;;
+    "inference_lora")
+        log "Running LoRA tests..."
+        MODAL_COMMAND="$MODAL_ENV python3 -m modal run $MODAL_TEST_FILE::run_inference_lora_tests"
+        ;;
     *)
         log "Error: Unknown test type: $TEST_TYPE"
         exit 1

diff --git a/.github/workflows/matchers/mypy.json b/.github/workflows/matchers/mypy.json
@@ -13,4 +13,4 @@
       ]
     }
   ]
-}
+}
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
@@ -372,4 +372,4 @@ jobs:
           JOB_IDS: '["encoder-test", "vae-test", "transformer-test", "ssim-test-py3.10", "ssim-test-py3.11", "ssim-test-py3.12", "training-test", "training-test-VSA", "inference-test-STA", "precision-test-STA", "precision-test-VSA"]'
           RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
           GITHUB_RUN_ID: ${{ github.run_id }}
-        run: python .github/scripts/runpod_cleanup.py
+        run: python .github/scripts/runpod_cleanup.py
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -60,7 +60,7 @@ repos:
   rev: v1.15.0
   hooks:
   - id: mypy
-    args: [--python-version, '3.10', --follow-imports, "skip", ]
+    args: [--python-version, '3.10', --follow-imports, "skip" ]
     additional_dependencies: [types-cachetools, types-setuptools, types-PyYAML, types-requests]
 - repo: local
   hooks:
@@ -69,7 +69,7 @@ repos:
     entry: bash
     args:
       - -c
-      - 'git ls-files | grep -v "^fastvideo/v1/tests/ssim/" | grep " " && echo "Filenames should not contain spaces!" && exit 1 || exit 0'
+      - 'git ls-files | grep -v "^fastvideo/v1/tests/ssim/" | grep -v "^fastvideo/v1/tests/inference/lora/L40S_reference_videos/" | grep " " && echo "Filenames should not contain spaces!" && exit 1 || exit 0'
     language: system
     always_run: true
     pass_filenames: false

diff --git a/examples/inference/lora/wan_lora_inference.py b/examples/inference/lora/wan_lora_inference.py
@@ -6,7 +6,7 @@ def main():
     # Initialize VideoGenerator with the Wan model
     generator = VideoGenerator.from_pretrained(
         "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
-        num_gpus=2,
+        num_gpus=1,
         lora_path="benjamin-paine/steamboat-willie-1.3b",
         lora_nickname="steamboat"
     )
@@ -16,6 +16,7 @@ def main():
         "num_frames": 81,
         "guidance_scale": 5.0,
         "num_inference_steps": 32,
+        "seed": 42,
     }
     # Generate video with LoRA style
     prompt = "steamboat willie style, golden era animation, close-up of a short fluffy monster  kneeling beside a melting red candle. the mood is one of wonder and curiosity,  as the monster gazes at the flame with wide eyes and open mouth. Its pose and expression  convey a sense of innocence and playfulness, as if it is exploring the world around it for the first time.  The use of warm colors and dramatic lighting further enhances the cozy atmosphere of the image."
@@ -29,8 +30,17 @@ def main():
         negative_prompt=negative_prompt,
         **kwargs
     )
-
-    generator.set_lora_adapter(lora_nickname="flat_color", lora_path="motimalu/wan-flat-color-1.3b-v2")
+    del generator
+
+    # Until FSDP resharding bug is fixed, multi-lora requires reloading the model
+    # see https://github.com/pytorch/pytorch/issues/157209
+    generator = VideoGenerator.from_pretrained(
+        "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
+        num_gpus=1,
+        lora_path="motimalu/wan-flat-color-1.3b-v2",
+        lora_nickname="flat_color"
+    )
+    # generator.set_lora_adapter(lora_nickname="flat_color", lora_path="motimalu/wan-flat-color-1.3b-v2")
     prompt = "flat color, no lineart, blending, negative space, artist:[john kafka|ponsuke kaikai|hara id 21|yoneyama mai|fuzichoco],  1girl, sakura miko, pink hair, cowboy shot, white shirt, floral print, off shoulder, outdoors, cherry blossom, tree shade, wariza, looking up, falling petals, half-closed eyes, white sky, clouds,  live2d animation, upper body, high quality cinematic video of a woman sitting under a sakura tree. Dreamy and lonely, the camera close-ups on the face of the woman as she turns towards the viewer. The Camera is steady, This is a cowboy shot. The animation is smooth and fluid."
     negative_prompt = "bad quality video,色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走"
     video = generator.generate_video(

diff --git a/fastvideo/utils/collect_env.py b/fastvideo/utils/collect_env.py
@@ -62,6 +62,7 @@
 DEFAULT_CONDA_PATTERNS = {
     "torch",
     "numpy",
+    "mypy"
     "cudatoolkit",
     "soumith",
     "mkl",
@@ -80,7 +81,6 @@
 DEFAULT_PIP_PATTERNS = {
     "torch",
     "numpy",
-    "mypy",
     "flake8",
     "triton",
     "optree",

diff --git a/fastvideo/v1/configs/fasthunyuan_t2v.json b/fastvideo/v1/configs/fasthunyuan_t2v.json
@@ -4,7 +4,7 @@
   "use_cpu_offload": false,
   "disable_autocast": false,
   "precision": "bf16",
-  "vae_precision": "fp16",
+  "vae_precision": "fp32",
   "vae_tiling": true,
   "vae_sp": true,
   "vae_config": {

diff --git a/fastvideo/v1/configs/models/dits/base.py b/fastvideo/v1/configs/models/dits/base.py
@@ -11,9 +11,9 @@
 class DiTArchConfig(ArchConfig):
     _fsdp_shard_conditions: list = field(default_factory=list)
     _compile_conditions: list = field(default_factory=list)
-    _param_names_mapping: dict = field(default_factory=dict)
-    _reverse_param_names_mapping: dict = field(default_factory=dict)
-    _lora_param_names_mapping: dict = field(default_factory=dict)
+    param_names_mapping: dict = field(default_factory=dict)
+    reverse_param_names_mapping: dict = field(default_factory=dict)
+    lora_param_names_mapping: dict = field(default_factory=dict)
     _supported_attention_backends: tuple[AttentionBackendEnum, ...] = (
         AttentionBackendEnum.SLIDING_TILE_ATTN, AttentionBackendEnum.SAGE_ATTN,
         AttentionBackendEnum.FLASH_ATTN, AttentionBackendEnum.TORCH_SDPA,

diff --git a/fastvideo/v1/configs/models/dits/hunyuanvideo.py b/fastvideo/v1/configs/models/dits/hunyuanvideo.py
@@ -31,7 +31,7 @@ class HunyuanVideoArchConfig(DiTArchConfig):
     _compile_conditions: list = field(
         default_factory=lambda: [is_double_block, is_single_block, is_txt_in])
 
-    _param_names_mapping: dict = field(
+    param_names_mapping: dict = field(
         default_factory=lambda: {
             # 1. context_embedder.time_text_embed submodules (specific rules, applied first):
             r"^context_embedder\.time_text_embed\.timestep_embedder\.linear_1\.(.*)$":
@@ -146,8 +146,8 @@ class HunyuanVideoArchConfig(DiTArchConfig):
             r"final_layer.linear.\1",
         })
 
-    # Reverse mapping for saving checkpoints: training -> diffusers
-    _reverse_param_names_mapping: dict = field(default_factory=lambda: {})
+    # Reverse mapping for saving checkpoints: custom -> hf
+    reverse_param_names_mapping: dict = field(default_factory=lambda: {})
 
     patch_size: int = 2
     patch_size_t: int = 1

diff --git a/fastvideo/v1/configs/models/dits/stepvideo.py b/fastvideo/v1/configs/models/dits/stepvideo.py
@@ -10,7 +10,7 @@ class StepVideoArchConfig(DiTArchConfig):
         default_factory=lambda:
         [lambda n, m: "transformer_blocks" in n and n.split(".")[-1].isdigit()])
 
-    _param_names_mapping: dict = field(
+    param_names_mapping: dict = field(
         default_factory=lambda: {
             # transformer block
             r"^transformer_blocks\.(\d+)\.norm1\.(weight|bias)$":

diff --git a/fastvideo/v1/configs/models/dits/wanvideo.py b/fastvideo/v1/configs/models/dits/wanvideo.py
@@ -12,7 +12,7 @@ def is_blocks(n: str, m) -> bool:
 class WanVideoArchConfig(DiTArchConfig):
     _fsdp_shard_conditions: list = field(default_factory=lambda: [is_blocks])
 
-    _param_names_mapping: dict = field(
+    param_names_mapping: dict = field(
         default_factory=lambda: {
             r"^patch_embedding\.(.*)$":
             r"patch_embedding.proj.\1",
@@ -52,12 +52,12 @@ class WanVideoArchConfig(DiTArchConfig):
             r"blocks.\1.self_attn_residual_norm.norm.\2",
         })
 
-    # Reverse mapping for saving checkpoints: training -> diffusers
-    _reverse_param_names_mapping: dict = field(default_factory=lambda: {})
+    # Reverse mapping for saving checkpoints: custom -> hf
+    reverse_param_names_mapping: dict = field(default_factory=lambda: {})
 
     # Some LoRA adapters use the original official layer names instead of hf layer names,
     # so apply this before the param_names_mapping
-    _lora_param_names_mapping: dict = field(
+    lora_param_names_mapping: dict = field(
         default_factory=lambda: {
             r"^blocks\.(\d+)\.self_attn\.q\.(.*)$": r"blocks.\1.attn1.to_q.\2",
             r"^blocks\.(\d+)\.self_attn\.k\.(.*)$": r"blocks.\1.attn1.to_k.\2",

diff --git a/fastvideo/v1/configs/pipelines/base.py b/fastvideo/v1/configs/pipelines/base.py
@@ -62,11 +62,11 @@ class PipelineConfig:
     image_encoder_precision: str = "fp32"
 
     # Text encoder configuration
-    DEFAULT_TEXT_ENCODER_PRECISIONS = ("fp16", )
+    DEFAULT_TEXT_ENCODER_PRECISIONS = ("fp32", )
     text_encoder_configs: tuple[EncoderConfig, ...] = field(
         default_factory=lambda: (EncoderConfig(), ))
     text_encoder_precisions: tuple[str, ...] = field(
-        default_factory=lambda: ("fp16", ))
+        default_factory=lambda: ("fp32", ))
     preprocess_text_funcs: tuple[Callable[[str], str], ...] = field(
         default_factory=lambda: (preprocess_text, ))
     postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], torch.tensor],

diff --git a/fastvideo/v1/entrypoints/video_generator.py b/fastvideo/v1/entrypoints/video_generator.py
@@ -70,7 +70,7 @@ def from_pretrained(cls,
         """
         # If users also provide some kwargs, it will override the FastVideoArgs and PipelineConfig.
         kwargs['model_path'] = model_path
-        fastvideo_args = FastVideoArgs.from_kwargs(kwargs)
+        fastvideo_args = FastVideoArgs.from_kwargs(**kwargs)
 
         return cls.from_fastvideo_args(fastvideo_args)
 
@@ -109,6 +109,7 @@ def generate_video(
             prompt: The prompt to use for generation
             negative_prompt: The negative prompt to use (overrides the one in fastvideo_args)
             output_path: Path to save the video (overrides the one in fastvideo_args)
+            output_video_name: Name of the saved video file, without the ".mp4" extension (defaults to the first 100 characters of the prompt).
             save_video: Whether to save the video to disk
             return_frames: Whether to return the raw frames
             num_inference_steps: Number of denoising steps (overrides fastvideo_args)
@@ -228,6 +229,7 @@ def generate_video(
             n_tokens=n_tokens,
             VSA_sparsity=fastvideo_args.VSA_sparsity,
             extra={},
+            output_video_name=kwargs.get("output_video_name", prompt[:100]),
         )
 
         # Run inference
@@ -251,7 +253,8 @@ def generate_video(
             output_path = batch.output_path
             if output_path:
                 os.makedirs(output_path, exist_ok=True)
-                video_path = os.path.join(output_path, f"{prompt[:100]}.mp4")
+                video_path = os.path.join(output_path,
+                                          f"{batch.output_video_name}.mp4")
                 imageio.mimsave(video_path, frames, fps=batch.fps, format="mp4")
                 logger.info("Saved video to %s", video_path)
             else:
@@ -267,7 +270,9 @@ def generate_video(
                 "generation_time": gen_time
             }
 
-    def set_lora_adapter(self, lora_nickname: str, lora_path: str) -> None:
+    def set_lora_adapter(self,
+                         lora_nickname: str,
+                         lora_path: str | None = None) -> None:
         self.executor.set_lora_adapter(lora_nickname, lora_path)
 
     def shutdown(self):

diff --git a/fastvideo/v1/fastvideo_args.py b/fastvideo/v1/fastvideo_args.py
@@ -280,7 +280,7 @@ def from_cli_args(cls, args: argparse.Namespace) -> "FastVideoArgs":
         return cls(**kwargs)  # type: ignore
 
     @classmethod
-    def from_kwargs(cls, kwargs: dict[str, Any]) -> "FastVideoArgs":
+    def from_kwargs(cls, **kwargs: Any) -> "FastVideoArgs":
         kwargs['pipeline_config'] = PipelineConfig.from_kwargs(kwargs)
         return cls(**kwargs)
-Original file line number
+Diff line change
@@ Expand Up / @@ -13,4 +13,4 @@ @@
           ]
         }
       ]
-    }
+    }