Merged
91 commits
57fe183
First pass of reduced vLLM embedder
charlesbluca Mar 5, 2026
81b711e
Merge remote-tracking branch 'upstream/main' into retriever-vllm-for-…
charlesbluca Mar 5, 2026
2f19148
Merge remote-tracking branch 'upstream/main' into retriever-vllm-for-…
charlesbluca Mar 10, 2026
ec5a3e3
Pull vLLM CUDA 13 wheel
charlesbluca Mar 17, 2026
c9d4c42
Merge remote-tracking branch 'upstream/main' into retriever-vllm-for-…
charlesbluca Mar 17, 2026
a4fb012
Merge remote-tracking branch 'upstream/main' into retriever-vllm-for-…
charlesbluca Mar 20, 2026
840a765
feat(beir): plumb use_vllm through BeirConfig to Retriever
charlesbluca Mar 20, 2026
11afd40
Plumb vLLM CLI option into inprocess example
charlesbluca Mar 20, 2026
e5db4ad
Merge remote-tracking branch 'upstream/main' into retriever-vllm-for-…
charlesbluca Mar 23, 2026
804bde6
Linting
charlesbluca Mar 23, 2026
d00b4b2
Allow vLLM to be toggled separately for recall
charlesbluca Mar 23, 2026
65c224b
Apply ingest/recall vLLM split to fused example
charlesbluca Mar 23, 2026
9915ac1
Bump vLLM / torch to allow vLLM-backed VL embedder
charlesbluca Mar 24, 2026
c5fbfa5
Merge remote-tracking branch 'upstream/main' into retriever-vllm-for-…
charlesbluca Apr 7, 2026
1a803b9
Port vLLM embedding support to new _BatchEmbedGPUActor
charlesbluca Apr 7, 2026
715ecc7
Merge branch 'main' into retriever-vllm-for-embeddings-1
charlesbluca Apr 7, 2026
da47bb0
Revert vLLM/torch bump; add >=0.17.0 guard for VLM embedder
charlesbluca Apr 7, 2026
e8ffd24
Add --embed-use-vllm flag to graph_pipeline example
charlesbluca Apr 7, 2026
2c74315
Move vLLM >=0.17.0 guard to VLM embedder only
charlesbluca Apr 7, 2026
f4293d0
Honour device and hf_cache_dir in vLLM embedding path
charlesbluca Apr 7, 2026
aa172f7
Refactor: split vLLM-only embedder classes, matching parse/captioner …
charlesbluca Apr 7, 2026
8ed5558
Remove use_vllm toggle: always use vLLM for non-VL embedding, HF for VL
charlesbluca Apr 7, 2026
b32ae31
Use HF for recall queries; fix vLLM normalization to match HF output …
charlesbluca Apr 7, 2026
d57e066
Merge branch 'main' into retriever-vllm-for-embeddings-1
charlesbluca Apr 8, 2026
f1fd386
Fix tests broken by use_vllm removal and method rename
charlesbluca Apr 8, 2026
60c4cf2
Fix dead code, double-prefix bug, and swallowed PoolerConfig errors
charlesbluca Apr 8, 2026
e9a7b8e
Remove stale embed_use_vllm tests from test_embed_params.py
charlesbluca Apr 8, 2026
48d5f1a
Address code review: vLLM params, dead try/except, and missing tests
charlesbluca Apr 8, 2026
00ae3fd
Merge upstream/main: guard vLLM/flashinfer deps to linux, adopt arche…
charlesbluca Apr 8, 2026
28145b9
Address code review: use factory in Retriever, fix ragged tensor, fil…
charlesbluca Apr 8, 2026
be8de73
Merge branch 'main' into retriever-vllm-for-embeddings-1
charlesbluca Apr 8, 2026
9ead948
Fix stale README vllm flag docs and add return type to create_local_e…
charlesbluca Apr 8, 2026
69bb31e
Fix ragged tensor crash and document CUDA_VISIBLE_DEVICES side effect…
charlesbluca Apr 8, 2026
b7c3fa3
Forward dimensions param through factory/processor; add revision to e…
charlesbluca Apr 8, 2026
f2366c8
Fix silent count mismatch when vLLM returns None embeddings
charlesbluca Apr 8, 2026
512736f
Merge remote-tracking branch 'upstream/main' into retriever-vllm-for-…
charlesbluca Apr 8, 2026
6bdc13a
Fix row drop and missing L2 norm in VL vLLM embedder
charlesbluca Apr 8, 2026
b0c7b8b
Fix double prefix, remove stale embed-vllm extra, document global sid…
charlesbluca Apr 8, 2026
8a1899a
Remove CUDA_VISIBLE_DEVICES global mutation from VLLMEmbedder
charlesbluca Apr 9, 2026
de44543
Avoid overwriting compile cache env vars if already set
charlesbluca Apr 9, 2026
2baa908
Merge remote-tracking branch 'upstream/main' into retriever-vllm-for-…
charlesbluca Apr 9, 2026
14d5e1b
Fix TypeError when calling VL embedder with prefix kwarg in text_embe…
charlesbluca Apr 9, 2026
fbb433f
text-embed: remove custom compile cache override and fix PoolerConfig…
charlesbluca Apr 13, 2026
572b8b7
text-embed: remove unused embed_via_vllm helper
charlesbluca Apr 13, 2026
8c6ec7f
Unify local text embedder on vLLM in LlamaNemotronEmbed1BV2Embedder
charlesbluca Apr 13, 2026
f705de7
Drop unused device from local text vLLM embedder
charlesbluca Apr 13, 2026
d959685
Add selectable local query embed backend (HF vs vLLM)
charlesbluca Apr 13, 2026
fc5b10b
Merge branch 'main' into retriever-vllm-for-embeddings-1
charlesbluca Apr 13, 2026
988ba2f
Deprecate device on LlamaNemotronEmbed1BV2Embedder (vLLM path)
charlesbluca Apr 13, 2026
cb89300
Merge branch 'main' into retriever-vllm-for-embeddings-1
charlesbluca Apr 14, 2026
61d893f
Merge upstream/main into retriever-vllm-for-embeddings-1
charlesbluca Apr 15, 2026
f52034e
fix(harness): recursive glob for subdirectory datasets; add jp20 swee…
charlesbluca Apr 20, 2026
eed2630
Merge upstream/main into retriever-vllm-for-embeddings-1
charlesbluca Apr 21, 2026
32addf1
Implement full multimodal support for LlamaNemotronEmbedVL1BV2VLLMEmb…
charlesbluca Apr 21, 2026
65ff1d9
Merge upstream/main into retriever-vllm-for-embeddings-1
charlesbluca Apr 21, 2026
c727185
Merge upstream/main into retriever-vllm-for-embeddings-1
charlesbluca Apr 21, 2026
172b1b0
style: apply black formatting to vLLM embedder and tests
charlesbluca Apr 21, 2026
da0d022
fix: correct vLLM install instruction to include [local] extra
charlesbluca Apr 21, 2026
b6a6a52
feat(embed): add HF ingest backend selector and fix image batch size
charlesbluca Apr 21, 2026
0b2f511
feat(model): route VL query and ingest through vLLM by default
charlesbluca Apr 21, 2026
1d3f360
fix(embed): add missing --embed-local-ingest-backend CLI arg and forw…
charlesbluca Apr 22, 2026
38ebcb8
feat(harness): add 18-run embedder × reranker sweep suite
charlesbluca Apr 21, 2026
447d52d
fix(harness): remove --rerank-modality flag from graph_pipeline invoc…
charlesbluca Apr 23, 2026
371a6d3
fix(ray): propagate HF_HUB_OFFLINE to os.environ before ray.init()
charlesbluca Apr 23, 2026
363d20d
chore(harness): revert harness changes for follow-up PR
charlesbluca Apr 23, 2026
a277139
Merge remote-tracking branch 'upstream/main' into retriever-vllm-for-…
charlesbluca Apr 23, 2026
ca17fd8
chore(harness): drop sweep YAML and test_configs additions
charlesbluca Apr 23, 2026
a66df02
feat(embed): explicit backend selection, HF ingest support, and query…
charlesbluca Apr 23, 2026
9db3c35
fix(embed): defer HF model load to first use and harden bool/cache-cl…
charlesbluca Apr 23, 2026
3fe5209
fix(embed): map "auto" backend to "hf" in _get_local_embedder
charlesbluca Apr 23, 2026
982c4a0
Merge branch 'main' into retriever-vllm-for-embeddings-1
charlesbluca Apr 23, 2026
e9f86a6
refactor(embed): drop 'auto' backend — query defaults hf, ingest defa…
charlesbluca Apr 24, 2026
2933433
fix(embed): drop stale VL comment; fix DeprecationWarning stacklevel
charlesbluca Apr 24, 2026
53f352a
test(beir): drop stale 'auto' backend assertion
charlesbluca Apr 24, 2026
6967a5d
fix(embed): restore normalize/max_length in gpu_operator; guard vllm …
charlesbluca Apr 24, 2026
ed415a4
feat: VL embedder always uses HF backend; text embedder defaults to v…
charlesbluca Apr 24, 2026
64f29bc
Merge branch 'main' into retriever-vllm-for-embeddings-1
charlesbluca Apr 24, 2026
246f6e0
fix(embed): VL create_local_embedder respects backend param; default …
charlesbluca Apr 24, 2026
556436b
fix(embed): forward gpu_memory_utilization/enforce_eager to VL vLLM e…
charlesbluca Apr 24, 2026
c79c17c
fix(embed): respect local_ingest_backend config in text-embed CLI path
charlesbluca Apr 24, 2026
cb73da9
fix: address code review comments on local_ingest_backend comment and…
charlesbluca Apr 24, 2026
6618eb4
Merge branch 'main' into retriever-vllm-for-embeddings-1
charlesbluca Apr 24, 2026
b4eb2b3
Add queries comparison (#1928)
tomer-levin-nv Apr 27, 2026
235baab
fix(embed): skip prefix kwarg for HF ingest backend in _embed closure
charlesbluca Apr 27, 2026
aa863dc
Merge branch 'main' into retriever-vllm-for-embeddings-1
charlesbluca Apr 27, 2026
c6c4a47
fix(text_embed): strip whitespace in _to_bool; prefer embed_model_nam…
charlesbluca Apr 27, 2026
7f2c248
fix(review): address three code-review comments
charlesbluca Apr 27, 2026
0cf1f8c
fix(embed): add unload() to all four embedder classes; pass device kw…
charlesbluca Apr 27, 2026
a6d3505
fix(review): add field validator for local_ingest_backend; add LlamaN…
charlesbluca Apr 27, 2026
b28115a
fix(embed): defer vLLM GPU allocation to first use via _ensure_loaded()
charlesbluca Apr 27, 2026
9b0d15c
fix(embed): defer HF VL embedder GPU load via _ensure_loaded()
charlesbluca Apr 27, 2026
18 changes: 18 additions & 0 deletions nemo_retriever/README.md
@@ -44,13 +44,16 @@ uv pip install "nemo-retriever[local]==26.3.0" nv-ingest-client==26.3.0 nv-inges
For **remote NIM inference only** (no local GPU required), the base package is sufficient:

```bash
uv python install 3.12
uv venv retriever --python 3.12
source retriever/bin/activate
uv pip install nemo-retriever==26.3.0 nv-ingest-client==26.3.0 nv-ingest==26.3.0 nv-ingest-api==26.3.0
```

This creates a dedicated Python environment and installs the `nemo-retriever` PyPI package, the canonical distribution for the NeMo Retriever Library.

> **Note:** `uv python install 3.12` installs a uv-managed Python that includes development headers (`Python.h`). These headers are required by vLLM, which compiles CUDA kernels at runtime using torch inductor. If you skip this step and use a system Python without headers, vLLM actor initialization will fail with `InductorError: fatal error: Python.h: No such file or directory`.
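A quick way to verify that your interpreter ships the headers (a sketch; the exact include path varies by platform and install):

```bash
# Prints the include dir and whether Python.h exists there
python -c "import sysconfig, os; inc = sysconfig.get_paths()['include']; print(inc, os.path.exists(os.path.join(inc, 'Python.h')))"
```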

2. Override Torch and Torchvision with CUDA 13 builds (local GPU only)

The `[local]` extra pulls PyTorch from PyPI, which defaults to a CPU build on Linux. Reinstall from the CUDA 13.0 wheel index to match the CUDA runtime required by the Nemotron model packages:
@@ -91,7 +94,22 @@ ingestor = (
.embed()
.vdb_upload()
)
```

### Optional extras

- **`asr`** — Local ASR (Parakeet). Has a different `transformers` requirement than the core package; install only if you need local ASR:
```bash
uv pip install -e './nemo_retriever[asr]'
```

Run the batch pipeline script, pointing it at the directory that contains your PDFs:

```bash
uv run python nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py /path/to/pdfs
```

```python
# ingestor.ingest() actually executes the pipeline
# results are returned as a Ray dataset and can be inspected as chunks
ray_dataset = ingestor.ingest()
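# A sketch of inspecting the result; assumes the Ray Data API, where
# method availability (e.g. .show(), .take()) depends on your Ray version:
# ray_dataset.show(3)
# chunks = ray_dataset.take(3)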
6 changes: 6 additions & 0 deletions nemo_retriever/pyproject.toml
@@ -88,7 +88,13 @@ local = [
"nvidia-ml-py",
"apscheduler>=3.10",
"psutil>=5.9.0",
# vLLM compiles CUDA kernels at runtime via torch inductor (requires Python.h).
# Use a uv-managed Python (`uv python install 3.12`) so headers are available;
# system Python installs typically omit them and will fail with InductorError.
"vllm==0.17.0; sys_platform == 'linux'",
# flashinfer and cubin versions must match
"flashinfer-cubin==0.6.4; sys_platform == 'linux'",
"flashinfer-python==0.6.4; sys_platform == 'linux'",
]

# ── Multimedia — audio/ASR and SVG rendering ────────────────────────────────
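Assuming a source checkout, the pinned stack above is pulled in through the `local` extra (mirroring the `asr` example in the README):

```bash
uv pip install -e './nemo_retriever[local]'
```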
1 change: 1 addition & 0 deletions nemo_retriever/src/nemo_retriever/graph/executor.py
@@ -249,6 +249,7 @@ def ingest(self, data: Any, **kwargs: Any) -> Any:
if os.environ.get(_fwd_key):
ray_env_vars[_fwd_key] = os.environ[_fwd_key]
ray_env_vars["HF_HUB_OFFLINE"] = os.environ.get("HF_HUB_OFFLINE", "1")
os.environ["HF_HUB_OFFLINE"] = ray_env_vars["HF_HUB_OFFLINE"]
runtime_env = {"env_vars": ray_env_vars}
ray.init(
address=self._ray_address,
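The added line encodes a Ray subtlety: `runtime_env` env vars reach worker processes, but the driver only sees values set in its own `os.environ` before `ray.init()`. A minimal sketch of the same pattern, assuming only that `ray` is installed:

```python
import os

import ray

# Resolve the value once, then apply it to both scopes.
ray_env_vars = {"HF_HUB_OFFLINE": os.environ.get("HF_HUB_OFFLINE", "1")}
# Workers inherit it via runtime_env; the driver needs os.environ set explicitly.
os.environ["HF_HUB_OFFLINE"] = ray_env_vars["HF_HUB_OFFLINE"]
ray.init(runtime_env={"env_vars": ray_env_vars})
```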
2 changes: 2 additions & 0 deletions nemo_retriever/src/nemo_retriever/graph_ingestor.py
@@ -297,6 +297,8 @@ def ingest(self, params: Any = None, **kwargs: Any) -> Any:
for _fwd_key in ("HF_TOKEN", "HF_HOME", "HUGGING_FACE_HUB_TOKEN", "NVIDIA_API_KEY"):
if os.environ.get(_fwd_key):
ray_env_vars[_fwd_key] = os.environ[_fwd_key]
ray_env_vars["HF_HUB_OFFLINE"] = os.environ.get("HF_HUB_OFFLINE", "1")
os.environ["HF_HUB_OFFLINE"] = ray_env_vars["HF_HUB_OFFLINE"]
runtime_env = {"env_vars": ray_env_vars}
ray.init(
address=self._ray_address,
2 changes: 1 addition & 1 deletion nemo_retriever/src/nemo_retriever/ingest-config.yaml
@@ -250,7 +250,7 @@ embedding:
api_key: "" # e.g. $NGC_API_KEY or $NVIDIA_API_KEY

# Embedding service settings
# If set to null/empty, `retriever local stage5` will fall back to local HF embeddings
# If set to null/empty, `retriever local stage5` will fall back to local vLLM embeddings
# via `nemo_retriever.model.local.llama_nemotron_embed_1b_v2_embedder`.
embedding_nim_endpoint: null
# embedding_nim_endpoint: "http://localhost:8012/v1"
108 changes: 99 additions & 9 deletions nemo_retriever/src/nemo_retriever/model/__init__.py
@@ -4,7 +4,7 @@

from __future__ import annotations

from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
from nemo_retriever.model.model import BaseModel
@@ -58,27 +58,75 @@ def is_vl_rerank_model(model_name: str | None) -> bool:
def create_local_embedder(
model_name: str | None = None,
*,
backend: str = "vllm",
device: str | None = None,
hf_cache_dir: str | None = None,
gpu_memory_utilization: float = 0.45,
enforce_eager: bool = False,
dimensions: int | None = None,
normalize: bool = True,
max_length: int = 8192,
):
) -> Any:
"""Create the appropriate local embedding model (VL or non-VL).

Centralises the resolve -> branch -> construct pattern that was previously
duplicated across batch, inprocess, fused, gpu_pool, recall, retriever,
and text_embed code paths.
*backend* must be ``"vllm"`` or ``"hf"``.

For non-VL models:

- ``backend="vllm"`` (default): vLLM via ``LlamaNemotronEmbed1BV2Embedder``.
- ``backend="hf"``: HuggingFace via ``LlamaNemotronEmbed1BV2HFEmbedder``.

For VL models:

- ``backend="vllm"`` (default): vLLM via ``LlamaNemotronEmbedVL1BV2VLLMEmbedder``.
- ``backend="hf"``: HuggingFace via ``LlamaNemotronEmbedVL1BV2Embedder``.

``device`` applies only to HuggingFace paths. For vLLM paths, ``device`` is
forwarded for compatibility but deprecated and ignored (vLLM placement is
process-level); passing it emits a ``DeprecationWarning``.

Note: ``gpu_memory_utilization`` and ``enforce_eager`` apply to vLLM paths only;
``dimensions`` applies to the non-VL vLLM path; ``normalize`` and ``max_length``
apply to the non-VL paths (vLLM and HF). The HF VL path ignores all of them.
"""
b = (backend or "vllm").strip().lower()
if b not in ("vllm", "hf"):
raise ValueError(f"backend must be 'vllm' or 'hf', got {backend!r}")
model_id = resolve_embed_model(model_name)

if is_vl_embed_model(model_name):
if b == "hf":
from nemo_retriever.model.local.llama_nemotron_embed_vl_1b_v2_embedder import (
LlamaNemotronEmbedVL1BV2Embedder,
)

return LlamaNemotronEmbedVL1BV2Embedder(
device=device,
hf_cache_dir=hf_cache_dir,
model_id=model_id,
)

from nemo_retriever.model.local.llama_nemotron_embed_vl_1b_v2_embedder import (
LlamaNemotronEmbedVL1BV2VLLMEmbedder,
)

return LlamaNemotronEmbedVL1BV2VLLMEmbedder(
model_id=model_id,
device=device,
hf_cache_dir=hf_cache_dir,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
)

if b == "hf":
from nemo_retriever.model.local.llama_nemotron_embed_1b_v2_hf_embedder import (
LlamaNemotronEmbed1BV2HFEmbedder,
)

return LlamaNemotronEmbed1BV2HFEmbedder(
device=device,
hf_cache_dir=hf_cache_dir,
normalize=normalize,
max_length=int(max_length),
model_id=model_id,
)

@@ -87,11 +135,53 @@ def create_local_embedder(
from nemo_retriever.model.local.llama_nemotron_embed_1b_v2_embedder import (
LlamaNemotronEmbed1BV2Embedder,
)

return LlamaNemotronEmbed1BV2Embedder(
model_id=model_id,
hf_cache_dir=hf_cache_dir,
device=device,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
dimensions=dimensions,
normalize=normalize,
max_length=int(max_length),
)
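A usage sketch for the factory (the model name here is a hypothetical placeholder; anything `resolve_embed_model` accepts works):

```python
from nemo_retriever.model import create_local_embedder

# "llama-nemotron-embed-1b-v2" is a hypothetical model name for illustration.
embedder = create_local_embedder(
    "llama-nemotron-embed-1b-v2",
    backend="vllm",               # or "hf" for the HuggingFace path
    gpu_memory_utilization=0.45,  # vLLM-only knob; ignored on HF paths
    normalize=True,
)
```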


_LOCAL_QUERY_BACKENDS = frozenset({"hf", "vllm"})


def create_local_query_embedder(
model_name: str | None = None,
*,
backend: str = "hf",
device: str | None = None,
hf_cache_dir: str | None = None,
gpu_memory_utilization: float = 0.45,
enforce_eager: bool = False,
dimensions: int | None = None,
normalize: bool = True,
max_length: int = 8192,
) -> Any:
"""Create a local embedder for *query* vectors in retrieval (Retriever / recall).

*backend* must be ``"hf"`` (default) or ``"vllm"``.

- ``backend="hf"``: HuggingFace for both VL and non-VL models.
- ``backend="vllm"``: vLLM for both VL and non-VL models.
"""
b = (backend or "hf").strip().lower()
if b not in _LOCAL_QUERY_BACKENDS:
raise ValueError(f"backend must be one of {sorted(_LOCAL_QUERY_BACKENDS)}, got {backend!r}")

return create_local_embedder(
model_name,
backend=b,
device=device,
hf_cache_dir=hf_cache_dir,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
dimensions=dimensions,
normalize=normalize,
max_length=max_length,
)
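And the query-side counterpart, which defaults to HF (same hypothetical model name):

```python
from nemo_retriever.model import create_local_query_embedder

query_embedder = create_local_query_embedder(
    "llama-nemotron-embed-1b-v2",  # hypothetical
    backend="hf",  # default for queries; pass "vllm" to route them through vLLM
)
```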

