Merged
91 commits
57fe183
First pass of reduced vLLM embedder
charlesbluca Mar 5, 2026
81b711e
Merge remote-tracking branch 'upstream/main' into retriever-vllm-for-…
charlesbluca Mar 5, 2026
2f19148
Merge remote-tracking branch 'upstream/main' into retriever-vllm-for-…
charlesbluca Mar 10, 2026
ec5a3e3
Pull vLLM CUDA 13 wheel
charlesbluca Mar 17, 2026
c9d4c42
Merge remote-tracking branch 'upstream/main' into retriever-vllm-for-…
charlesbluca Mar 17, 2026
a4fb012
Merge remote-tracking branch 'upstream/main' into retriever-vllm-for-…
charlesbluca Mar 20, 2026
840a765
feat(beir): plumb use_vllm through BeirConfig to Retriever
charlesbluca Mar 20, 2026
11afd40
Plumb vLLM CLI option into inprocess example
charlesbluca Mar 20, 2026
e5db4ad
Merge remote-tracking branch 'upstream/main' into retriever-vllm-for-…
charlesbluca Mar 23, 2026
804bde6
Linting
charlesbluca Mar 23, 2026
d00b4b2
Allow vLLM to be toggled separately for recall
charlesbluca Mar 23, 2026
65c224b
Apply ingest/recall vLLM split to fused example
charlesbluca Mar 23, 2026
9915ac1
Bump vLLM / torch to allow vLLM-backed VL embedder
charlesbluca Mar 24, 2026
c5fbfa5
Merge remote-tracking branch 'upstream/main' into retriever-vllm-for-…
charlesbluca Apr 7, 2026
1a803b9
Port vLLM embedding support to new _BatchEmbedGPUActor
charlesbluca Apr 7, 2026
715ecc7
Merge branch 'main' into retriever-vllm-for-embeddings-1
charlesbluca Apr 7, 2026
da47bb0
Revert vLLM/torch bump; add >=0.17.0 guard for VLM embedder
charlesbluca Apr 7, 2026
e8ffd24
Add --embed-use-vllm flag to graph_pipeline example
charlesbluca Apr 7, 2026
2c74315
Move vLLM >=0.17.0 guard to VLM embedder only
charlesbluca Apr 7, 2026
f4293d0
Honour device and hf_cache_dir in vLLM embedding path
charlesbluca Apr 7, 2026
aa172f7
Refactor: split vLLM-only embedder classes, matching parse/captioner …
charlesbluca Apr 7, 2026
8ed5558
Remove use_vllm toggle: always use vLLM for non-VL embedding, HF for VL
charlesbluca Apr 7, 2026
b32ae31
Use HF for recall queries; fix vLLM normalization to match HF output …
charlesbluca Apr 7, 2026
d57e066
Merge branch 'main' into retriever-vllm-for-embeddings-1
charlesbluca Apr 8, 2026
f1fd386
Fix tests broken by use_vllm removal and method rename
charlesbluca Apr 8, 2026
60c4cf2
Fix dead code, double-prefix bug, and swallowed PoolerConfig errors
charlesbluca Apr 8, 2026
e9a7b8e
Remove stale embed_use_vllm tests from test_embed_params.py
charlesbluca Apr 8, 2026
48d5f1a
Address code review: vLLM params, dead try/except, and missing tests
charlesbluca Apr 8, 2026
00ae3fd
Merge upstream/main: guard vLLM/flashinfer deps to linux, adopt arche…
charlesbluca Apr 8, 2026
28145b9
Address code review: use factory in Retriever, fix ragged tensor, fil…
charlesbluca Apr 8, 2026
be8de73
Merge branch 'main' into retriever-vllm-for-embeddings-1
charlesbluca Apr 8, 2026
9ead948
Fix stale README vllm flag docs and add return type to create_local_e…
charlesbluca Apr 8, 2026
69bb31e
Fix ragged tensor crash and document CUDA_VISIBLE_DEVICES side effect…
charlesbluca Apr 8, 2026
b7c3fa3
Forward dimensions param through factory/processor; add revision to e…
charlesbluca Apr 8, 2026
f2366c8
Fix silent count mismatch when vLLM returns None embeddings
charlesbluca Apr 8, 2026
512736f
Merge remote-tracking branch 'upstream/main' into retriever-vllm-for-…
charlesbluca Apr 8, 2026
6bdc13a
Fix row drop and missing L2 norm in VL vLLM embedder
charlesbluca Apr 8, 2026
b0c7b8b
Fix double prefix, remove stale embed-vllm extra, document global sid…
charlesbluca Apr 8, 2026
8a1899a
Remove CUDA_VISIBLE_DEVICES global mutation from VLLMEmbedder
charlesbluca Apr 9, 2026
de44543
Avoid overwriting compile cache env vars if already set
charlesbluca Apr 9, 2026
2baa908
Merge remote-tracking branch 'upstream/main' into retriever-vllm-for-…
charlesbluca Apr 9, 2026
14d5e1b
Fix TypeError when calling VL embedder with prefix kwarg in text_embe…
charlesbluca Apr 9, 2026
fbb433f
text-embed: remove custom compile cache override and fix PoolerConfig…
charlesbluca Apr 13, 2026
572b8b7
text-embed: remove unused embed_via_vllm helper
charlesbluca Apr 13, 2026
8c6ec7f
Unify local text embedder on vLLM in LlamaNemotronEmbed1BV2Embedder
charlesbluca Apr 13, 2026
f705de7
Drop unused device from local text vLLM embedder
charlesbluca Apr 13, 2026
d959685
Add selectable local query embed backend (HF vs vLLM)
charlesbluca Apr 13, 2026
fc5b10b
Merge branch 'main' into retriever-vllm-for-embeddings-1
charlesbluca Apr 13, 2026
988ba2f
Deprecate device on LlamaNemotronEmbed1BV2Embedder (vLLM path)
charlesbluca Apr 13, 2026
cb89300
Merge branch 'main' into retriever-vllm-for-embeddings-1
charlesbluca Apr 14, 2026
61d893f
Merge upstream/main into retriever-vllm-for-embeddings-1
charlesbluca Apr 15, 2026
f52034e
fix(harness): recursive glob for subdirectory datasets; add jp20 swee…
charlesbluca Apr 20, 2026
eed2630
Merge upstream/main into retriever-vllm-for-embeddings-1
charlesbluca Apr 21, 2026
32addf1
Implement full multimodal support for LlamaNemotronEmbedVL1BV2VLLMEmb…
charlesbluca Apr 21, 2026
65ff1d9
Merge upstream/main into retriever-vllm-for-embeddings-1
charlesbluca Apr 21, 2026
c727185
Merge upstream/main into retriever-vllm-for-embeddings-1
charlesbluca Apr 21, 2026
172b1b0
style: apply black formatting to vLLM embedder and tests
charlesbluca Apr 21, 2026
da0d022
fix: correct vLLM install instruction to include [local] extra
charlesbluca Apr 21, 2026
b6a6a52
feat(embed): add HF ingest backend selector and fix image batch size
charlesbluca Apr 21, 2026
0b2f511
feat(model): route VL query and ingest through vLLM by default
charlesbluca Apr 21, 2026
1d3f360
fix(embed): add missing --embed-local-ingest-backend CLI arg and forw…
charlesbluca Apr 22, 2026
38ebcb8
feat(harness): add 18-run embedder × reranker sweep suite
charlesbluca Apr 21, 2026
447d52d
fix(harness): remove --rerank-modality flag from graph_pipeline invoc…
charlesbluca Apr 23, 2026
371a6d3
fix(ray): propagate HF_HUB_OFFLINE to os.environ before ray.init()
charlesbluca Apr 23, 2026
363d20d
chore(harness): revert harness changes for follow-up PR
charlesbluca Apr 23, 2026
a277139
Merge remote-tracking branch 'upstream/main' into retriever-vllm-for-…
charlesbluca Apr 23, 2026
ca17fd8
chore(harness): drop sweep YAML and test_configs additions
charlesbluca Apr 23, 2026
a66df02
feat(embed): explicit backend selection, HF ingest support, and query…
charlesbluca Apr 23, 2026
9db3c35
fix(embed): defer HF model load to first use and harden bool/cache-cl…
charlesbluca Apr 23, 2026
3fe5209
fix(embed): map "auto" backend to "hf" in _get_local_embedder
charlesbluca Apr 23, 2026
982c4a0
Merge branch 'main' into retriever-vllm-for-embeddings-1
charlesbluca Apr 23, 2026
e9f86a6
refactor(embed): drop 'auto' backend — query defaults hf, ingest defa…
charlesbluca Apr 24, 2026
2933433
fix(embed): drop stale VL comment; fix DeprecationWarning stacklevel
charlesbluca Apr 24, 2026
53f352a
test(beir): drop stale 'auto' backend assertion
charlesbluca Apr 24, 2026
6967a5d
fix(embed): restore normalize/max_length in gpu_operator; guard vllm …
charlesbluca Apr 24, 2026
ed415a4
feat: VL embedder always uses HF backend; text embedder defaults to v…
charlesbluca Apr 24, 2026
64f29bc
Merge branch 'main' into retriever-vllm-for-embeddings-1
charlesbluca Apr 24, 2026
246f6e0
fix(embed): VL create_local_embedder respects backend param; default …
charlesbluca Apr 24, 2026
556436b
fix(embed): forward gpu_memory_utilization/enforce_eager to VL vLLM e…
charlesbluca Apr 24, 2026
c79c17c
fix(embed): respect local_ingest_backend config in text-embed CLI path
charlesbluca Apr 24, 2026
cb73da9
fix: address code review comments on local_ingest_backend comment and…
charlesbluca Apr 24, 2026
6618eb4
Merge branch 'main' into retriever-vllm-for-embeddings-1
charlesbluca Apr 24, 2026
b4eb2b3
Add queries comparison (#1928)
tomer-levin-nv Apr 27, 2026
235baab
fix(embed): skip prefix kwarg for HF ingest backend in _embed closure
charlesbluca Apr 27, 2026
aa863dc
Merge branch 'main' into retriever-vllm-for-embeddings-1
charlesbluca Apr 27, 2026
c6c4a47
fix(text_embed): strip whitespace in _to_bool; prefer embed_model_nam…
charlesbluca Apr 27, 2026
7f2c248
fix(review): address three code-review comments
charlesbluca Apr 27, 2026
0cf1f8c
fix(embed): add unload() to all four embedder classes; pass device kw…
charlesbluca Apr 27, 2026
a6d3505
fix(review): add field validator for local_ingest_backend; add LlamaN…
charlesbluca Apr 27, 2026
b28115a
fix(embed): defer vLLM GPU allocation to first use via _ensure_loaded()
charlesbluca Apr 27, 2026
9b0d15c
fix(embed): defer HF VL embedder GPU load via _ensure_loaded()
charlesbluca Apr 27, 2026
18 changes: 18 additions & 0 deletions nemo_retriever/README.md
@@ -44,13 +44,16 @@ uv pip install "nemo-retriever[local]==26.3.0" nv-ingest-client==26.3.0 nv-inges
For **remote NIM inference only** (no local GPU required), the base package is sufficient:

```bash
uv python install 3.12
uv venv retriever --python 3.12
source retriever/bin/activate
uv pip install nemo-retriever==26.3.0 nv-ingest-client==26.3.0 nv-ingest==26.3.0 nv-ingest-api==26.3.0
```

This creates a dedicated Python environment and installs the `nemo-retriever` PyPI package, the canonical distribution for the NeMo Retriever Library.

> **Note:** `uv python install 3.12` installs a uv-managed Python that includes development headers (`Python.h`). These headers are required by vLLM, which compiles CUDA kernels at runtime using torch inductor. If you skip this step and use a system Python without headers, vLLM actor initialization will fail with `InductorError: fatal error: Python.h: No such file or directory`.
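A quick way to verify that your interpreter ships the headers (a sketch; the exact include path varies by platform and install):

```bash
# Prints the include dir and whether Python.h exists there
python -c "import sysconfig, os; inc = sysconfig.get_paths()['include']; print(inc, os.path.exists(os.path.join(inc, 'Python.h')))"
```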

2. Override Torch and Torchvision with CUDA 13 builds (local GPU only)

The `[local]` extra pulls PyTorch from PyPI, which defaults to a CPU build on Linux. Reinstall from the CUDA 13.0 wheel index to match the CUDA runtime required by the Nemotron model packages:
@@ -91,7 +94,22 @@ ingestor = (
.embed()
.vdb_upload()
)
```

### Optional extras

- **`asr`** — Local ASR (Parakeet). Has a different `transformers` requirement than the core package; install only if you need local ASR:
```bash
uv pip install -e './nemo_retriever[asr]'
```

Run the batch pipeline script, pointing it at the directory that contains your PDFs:

```bash
uv run python nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py /path/to/pdfs
```

```python
# ingestor.ingest() actually executes the pipeline
# results are returned as a Ray dataset and can be inspected as chunks
ray_dataset = ingestor.ingest()
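# A sketch of inspecting the result; assumes the Ray Data API, where
# method availability (e.g. .show(), .take()) depends on your Ray version:
# ray_dataset.show(3)
# chunks = ray_dataset.take(3)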
6 changes: 6 additions & 0 deletions nemo_retriever/pyproject.toml
@@ -88,7 +88,13 @@ local = [
"nvidia-ml-py",
"apscheduler>=3.10",
"psutil>=5.9.0",
# vLLM compiles CUDA kernels at runtime via torch inductor (requires Python.h).
# Use a uv-managed Python (`uv python install 3.12`) so headers are available;
# system Python installs typically omit them and will fail with InductorError.
"vllm==0.17.0; sys_platform == 'linux'",
# flashinfer and cubin versions must match
"flashinfer-cubin==0.6.4; sys_platform == 'linux'",
"flashinfer-python==0.6.4; sys_platform == 'linux'",
]

# ── Multimedia — audio/ASR and SVG rendering ────────────────────────────────
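Assuming a source checkout, the pinned stack above is pulled in through the `local` extra (mirroring the `asr` example in the README):

```bash
uv pip install -e './nemo_retriever[local]'
```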
1 change: 1 addition & 0 deletions nemo_retriever/src/nemo_retriever/graph/executor.py
@@ -249,6 +249,7 @@ def ingest(self, data: Any, **kwargs: Any) -> Any:
if os.environ.get(_fwd_key):
ray_env_vars[_fwd_key] = os.environ[_fwd_key]
ray_env_vars["HF_HUB_OFFLINE"] = os.environ.get("HF_HUB_OFFLINE", "1")
os.environ["HF_HUB_OFFLINE"] = ray_env_vars["HF_HUB_OFFLINE"]
runtime_env = {"env_vars": ray_env_vars}
ray.init(
address=self._ray_address,
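The added line encodes a Ray subtlety: `runtime_env` env vars reach worker processes, but the driver only sees values set in its own `os.environ` before `ray.init()`. A minimal sketch of the same pattern, assuming only that `ray` is installed:

```python
import os

import ray

# Resolve the value once, then apply it to both scopes.
ray_env_vars = {"HF_HUB_OFFLINE": os.environ.get("HF_HUB_OFFLINE", "1")}
# Workers inherit it via runtime_env; the driver needs os.environ set explicitly.
os.environ["HF_HUB_OFFLINE"] = ray_env_vars["HF_HUB_OFFLINE"]
ray.init(runtime_env={"env_vars": ray_env_vars})
```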
2 changes: 2 additions & 0 deletions nemo_retriever/src/nemo_retriever/graph_ingestor.py
@@ -297,6 +297,8 @@ def ingest(self, params: Any = None, **kwargs: Any) -> Any:
for _fwd_key in ("HF_TOKEN", "HF_HOME", "HUGGING_FACE_HUB_TOKEN", "NVIDIA_API_KEY"):
if os.environ.get(_fwd_key):
ray_env_vars[_fwd_key] = os.environ[_fwd_key]
ray_env_vars["HF_HUB_OFFLINE"] = os.environ.get("HF_HUB_OFFLINE", "1")
os.environ["HF_HUB_OFFLINE"] = ray_env_vars["HF_HUB_OFFLINE"]
runtime_env = {"env_vars": ray_env_vars}
ray.init(
address=self._ray_address,
2 changes: 1 addition & 1 deletion nemo_retriever/src/nemo_retriever/ingest-config.yaml
@@ -250,7 +250,7 @@ embedding:
api_key: "" # e.g. $NGC_API_KEY or $NVIDIA_API_KEY

# Embedding service settings
# If set to null/empty, `retriever local stage5` will fall back to local HF embeddings
# If set to null/empty, `retriever local stage5` will fall back to local vLLM embeddings
# via `nemo_retriever.model.local.llama_nemotron_embed_1b_v2_embedder`.
embedding_nim_endpoint: null
# embedding_nim_endpoint: "http://localhost:8012/v1"
108 changes: 99 additions & 9 deletions nemo_retriever/src/nemo_retriever/model/__init__.py
@@ -4,7 +4,7 @@

from __future__ import annotations

from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
from nemo_retriever.model.model import BaseModel
@@ -58,27 +58,75 @@ def is_vl_rerank_model(model_name: str | None) -> bool:
def create_local_embedder(
model_name: str | None = None,
*,
backend: str = "vllm",
device: str | None = None,
hf_cache_dir: str | None = None,
gpu_memory_utilization: float = 0.45,
enforce_eager: bool = False,
dimensions: int | None = None,
normalize: bool = True,
max_length: int = 8192,
):
) -> Any:
"""Create the appropriate local embedding model (VL or non-VL).

Centralises the resolve -> branch -> construct pattern that was previously
duplicated across batch, inprocess, fused, gpu_pool, recall, retriever,
and text_embed code paths.
*backend* must be ``"vllm"`` or ``"hf"``.

For non-VL models:

- ``backend="vllm"`` (default): vLLM via ``LlamaNemotronEmbed1BV2Embedder``.
- ``backend="hf"``: HuggingFace via ``LlamaNemotronEmbed1BV2HFEmbedder``.

For VL models:

- ``backend="vllm"`` (default): vLLM via ``LlamaNemotronEmbedVL1BV2VLLMEmbedder``.
- ``backend="hf"``: HuggingFace via ``LlamaNemotronEmbedVL1BV2Embedder``.

``device`` applies only to HuggingFace paths. For vLLM paths, ``device`` is
forwarded for compatibility but deprecated and ignored (vLLM placement is
process-level); passing it emits a ``DeprecationWarning``.

Note: ``gpu_memory_utilization`` and ``enforce_eager`` apply to vLLM paths only;
``dimensions`` applies to the non-VL vLLM path; ``normalize`` and ``max_length``
apply to the non-VL paths (vLLM and HF). The HF VL path ignores all of them.
"""
b = (backend or "vllm").strip().lower()
if b not in ("vllm", "hf"):
raise ValueError(f"backend must be 'vllm' or 'hf', got {backend!r}")
model_id = resolve_embed_model(model_name)

if is_vl_embed_model(model_name):
if b == "hf":
from nemo_retriever.model.local.llama_nemotron_embed_vl_1b_v2_embedder import (
LlamaNemotronEmbedVL1BV2Embedder,
)

return LlamaNemotronEmbedVL1BV2Embedder(
device=device,
hf_cache_dir=hf_cache_dir,
model_id=model_id,
)

from nemo_retriever.model.local.llama_nemotron_embed_vl_1b_v2_embedder import (
LlamaNemotronEmbedVL1BV2VLLMEmbedder,
)

return LlamaNemotronEmbedVL1BV2VLLMEmbedder(
model_id=model_id,
device=device,
hf_cache_dir=hf_cache_dir,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
)

if b == "hf":
from nemo_retriever.model.local.llama_nemotron_embed_1b_v2_hf_embedder import (
LlamaNemotronEmbed1BV2HFEmbedder,
)

return LlamaNemotronEmbed1BV2HFEmbedder(
device=device,
hf_cache_dir=hf_cache_dir,
normalize=normalize,
max_length=int(max_length),
model_id=model_id,
)

@@ -87,11 +135,53 @@ def create_local_embedder(
from nemo_retriever.model.local.llama_nemotron_embed_1b_v2_embedder import (
LlamaNemotronEmbed1BV2Embedder,
)

return LlamaNemotronEmbed1BV2Embedder(
model_id=model_id,
hf_cache_dir=hf_cache_dir,
device=device,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
dimensions=dimensions,
normalize=normalize,
max_length=int(max_length),
)
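A usage sketch for the factory (the model name here is a hypothetical placeholder; anything `resolve_embed_model` accepts works):

```python
from nemo_retriever.model import create_local_embedder

# "llama-nemotron-embed-1b-v2" is a hypothetical model name for illustration.
embedder = create_local_embedder(
    "llama-nemotron-embed-1b-v2",
    backend="vllm",               # or "hf" for the HuggingFace path
    gpu_memory_utilization=0.45,  # vLLM-only knob; ignored on HF paths
    normalize=True,
)
```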


_LOCAL_QUERY_BACKENDS = frozenset({"hf", "vllm"})


def create_local_query_embedder(
model_name: str | None = None,
*,
backend: str = "hf",
device: str | None = None,
hf_cache_dir: str | None = None,
gpu_memory_utilization: float = 0.45,
enforce_eager: bool = False,
dimensions: int | None = None,
normalize: bool = True,
max_length: int = 8192,
) -> Any:
"""Create a local embedder for *query* vectors in retrieval (Retriever / recall).

*backend* must be ``"hf"`` (default) or ``"vllm"``.

- ``backend="hf"``: HuggingFace for both VL and non-VL models.
- ``backend="vllm"``: vLLM for both VL and non-VL models.
"""
b = (backend or "hf").strip().lower()
if b not in _LOCAL_QUERY_BACKENDS:
raise ValueError(f"backend must be one of {sorted(_LOCAL_QUERY_BACKENDS)}, got {backend!r}")

return create_local_embedder(
model_name,
backend=b,
device=device,
hf_cache_dir=hf_cache_dir,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
dimensions=dimensions,
normalize=normalize,
max_length=max_length,
)
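And the query-side counterpart, which defaults to HF (same hypothetical model name):

```python
from nemo_retriever.model import create_local_query_embedder

query_embedder = create_local_query_embedder(
    "llama-nemotron-embed-1b-v2",  # hypothetical
    backend="hf",  # default for queries; pass "vllm" to route them through vLLM
)
```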

