16 changes: 8 additions & 8 deletions install_dev.py
@@ -5,22 +5,22 @@
 
 def install_torch_nightly_deps():
     """Install torch related dependencies from pinned nightly"""
-    EXECUTORCH_NIGHTLY_VERSION = "dev20260104"
-    TORCHAO_NIGHTLY_VERSION = "dev20251222"
+    EXECUTORCH_NIGHTLY_VERSION = "dev20260317"
+    TORCHAO_NIGHTLY_VERSION = "dev20260317"
     # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/torch_pin.py#L2
-    TORCH_NIGHTLY_VERSION = "dev20251222"
+    TORCH_NIGHTLY_VERSION = "dev20260317"
     subprocess.check_call(
         [
             sys.executable,
             "-m",
             "pip",
             "install",
             "--no-cache-dir",  # Prevent cached CUDA packages
-            f"executorch==1.1.0.{EXECUTORCH_NIGHTLY_VERSION}",
-            f"torch==2.11.0.{TORCH_NIGHTLY_VERSION}",
-            f"torchvision==0.25.0.{TORCH_NIGHTLY_VERSION}",
-            f"torchaudio==2.10.0.{TORCH_NIGHTLY_VERSION}",
-            f"torchao==0.16.0.{TORCHAO_NIGHTLY_VERSION}",
+            f"executorch==1.3.0.{EXECUTORCH_NIGHTLY_VERSION}",
+            f"torch==2.12.0.{TORCH_NIGHTLY_VERSION}",
+            f"torchvision==0.26.0.{TORCH_NIGHTLY_VERSION}",
+            f"torchaudio==2.11.0.{TORCH_NIGHTLY_VERSION}",
+            f"torchao==0.17.0.{TORCHAO_NIGHTLY_VERSION}",
             "--extra-index-url",
             "https://download.pytorch.org/whl/nightly/cpu",
         ]
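To sanity-check that the new pins actually resolved after running install_dev.py, here is a minimal sketch. It assumes only the standard __version__ attributes on these packages; the local build suffix (e.g. +cpu) varies by platform.

import torch
import torchao

# Nightly wheels embed the pinned date in the version string,
# e.g. "2.12.0.dev20260317+cpu".
assert torch.__version__.startswith("2.12.0.dev20260317"), torch.__version__
assert torchao.__version__.startswith("0.17.0.dev20260317"), torchao.__version__
print("Pinned nightlies resolved:", torch.__version__, torchao.__version__)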
12 changes: 12 additions & 0 deletions optimum/commands/export/executorch.py
@@ -35,6 +35,16 @@ def parse_args_executorch(parser):
         required=True,
         help="Model ID on huggingface.co or path on disk to load model from.",
     )
+    required_group.add_argument(
+        "--gguf_file",
+        type=str,
+        required=False,
+        help="GGUF filename (in a Hub repo) or local path to a GGUF file. "
+        "When set, the model weights and config are loaded from this GGUF file "
+        "instead of safetensors/bin checkpoints. Note: GGUF weights are "
+        "dequantized to float32 before export, which increases peak memory. "
+        "Use --dtype float16 to halve memory usage. Requires: pip install gguf",
+    )
     required_group.add_argument(
         "-o",
         "--output_dir",
@@ -271,6 +281,8 @@ def run(self):
             kwargs["device"] = self.args.device
         if hasattr(self.args, "image_size") and self.args.image_size:
             kwargs["image_size"] = self.args.image_size
+        if hasattr(self.args, "gguf_file") and self.args.gguf_file:
+            kwargs["gguf_file"] = self.args.gguf_file
 
         main_export(
             model_name_or_path=self.args.model,
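For reviewers, a minimal sketch of what the new flag amounts to programmatically: run() simply forwards gguf_file into main_export. The repo ID and GGUF filename below are hypothetical placeholders, and the task/recipe values are assumed from typical optimum-executorch usage, not taken from this diff.

from optimum.exporters.executorch import main_export

main_export(
    model_name_or_path="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",  # hypothetical repo
    task="text-generation",
    recipe="xnnpack",
    output_dir="exported_model",
    gguf_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",  # hypothetical filename
)

On the CLI side, the same export would pair --gguf_file with --dtype float16 to keep peak memory down, per the new help text.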
12 changes: 11 additions & 1 deletion optimum/exporters/executorch/tasks/causal_lm.py
@@ -61,7 +61,13 @@ def load_causal_lm_model(model_name_or_path: str, **kwargs) -> CausalLMExportabl
     use_custom_sdpa = use_custom_sdpa or attn_implementation == "custom_sdpa"
     max_seq_len = kwargs.get("max_seq_len", None)
     max_length = max_seq_len if max_seq_len is not None else kwargs.get("max_length", 2048)
-    config = kwargs.get("config") or AutoConfig.from_pretrained(model_name_or_path)
+    gguf_file = kwargs.get("gguf_file", None)
+    if gguf_file:
+        logging.warning(
+            "GGUF weights are dequantized to float32 before export, which increases "
+            "peak memory. Use --dtype float16 to halve memory usage."
+        )
+    config = kwargs.get("config") or AutoConfig.from_pretrained(model_name_or_path, gguf_file=gguf_file)
 
     if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
         # NOTE: To make the model exportable we need to set the rope scaling to default to avoid hitting
@@ -81,13 +87,15 @@ def _load_eager_pretrained(
     cache_implementation,
     batch_size,
     max_length,
+    gguf_file=None,
 ):
     eager_model = AutoModelForCausalLM.from_pretrained(
         model_name_or_path,
         device_map=device,
         dtype=dtype,
         config=config,
         attn_implementation=attn_implementation,
+        gguf_file=gguf_file,
         generation_config=GenerationConfig(
             use_cache=True,
             cache_implementation=cache_implementation,
@@ -110,6 +118,7 @@
             cache_implementation,
             batch_size,
             max_length,
+            gguf_file,
         )
     except ValueError as e:
         if "torch.nn.functional.scaled_dot_product_attention" in str(e):
@@ -124,6 +133,7 @@
                 cache_implementation,
                 batch_size,
                 max_length,
+                gguf_file,
             )
         else:
             raise e
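The gguf_file plumbing above bottoms out in transformers' from_pretrained, which dequantizes GGUF tensors to float32 before any dtype downcast. A standalone sketch of that load path, with a hypothetical repo and filename (needs pip install gguf):

import torch
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"  # hypothetical repo
gguf_file = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"  # hypothetical filename

# Config is derived from the GGUF metadata, mirroring the diff's AutoConfig call.
config = AutoConfig.from_pretrained(repo_id, gguf_file=gguf_file)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    gguf_file=gguf_file,
    config=config,
    dtype=torch.float16,  # downcast after the float32 dequant to halve peak memory
)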