From 132f3136a393bfa0688a7c54d0865388bc1b2e77 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 6 May 2026 11:56:57 +0200 Subject: [PATCH 1/4] add vllm support --- pyproject.toml | 3 + src/autointent/_wrappers/embedder/__init__.py | 2 + src/autointent/_wrappers/embedder/embedder.py | 8 +- src/autointent/_wrappers/embedder/vllm.py | 184 ++++++++++++++++++ src/autointent/configs/__init__.py | 2 + src/autointent/configs/_embedder.py | 37 +++- tests/embedder/conftest.py | 32 +++ 7 files changed, 265 insertions(+), 3 deletions(-) create mode 100644 src/autointent/_wrappers/embedder/vllm.py diff --git a/pyproject.toml b/pyproject.toml index 23a64f33c..816709e82 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,6 +77,9 @@ opensearch = [ openai = [ "openai (>=2,<3)", ] +vllm = [ + "vllm>=0.20.0", +] [tool.uv] conflicts = [ diff --git a/src/autointent/_wrappers/embedder/__init__.py b/src/autointent/_wrappers/embedder/__init__.py index 23d37f489..9c32a88e2 100644 --- a/src/autointent/_wrappers/embedder/__init__.py +++ b/src/autointent/_wrappers/embedder/__init__.py @@ -5,6 +5,7 @@ from .hashing_vectorizer import HashingVectorizerEmbeddingBackend from .openai import OpenaiEmbeddingBackend from .sentence_transformers import SentenceTransformerEmbeddingBackend +from .vllm import VllmEmbeddingBackend __all__ = [ "BaseEmbeddingBackend", @@ -12,4 +13,5 @@ "HashingVectorizerEmbeddingBackend", "OpenaiEmbeddingBackend", "SentenceTransformerEmbeddingBackend", + "VllmEmbeddingBackend", ] diff --git a/src/autointent/_wrappers/embedder/embedder.py b/src/autointent/_wrappers/embedder/embedder.py index 70c513bb9..1884f53d2 100644 --- a/src/autointent/_wrappers/embedder/embedder.py +++ b/src/autointent/_wrappers/embedder/embedder.py @@ -16,11 +16,13 @@ HashingVectorizerEmbeddingConfig, OpenaiEmbeddingConfig, SentenceTransformerEmbeddingConfig, + VllmEmbeddingConfig, ) from .hashing_vectorizer import HashingVectorizerEmbeddingBackend from .openai import OpenaiEmbeddingBackend from .sentence_transformers import SentenceTransformerEmbeddingBackend +from .vllm import VllmEmbeddingBackend if TYPE_CHECKING: import numpy as np @@ -64,8 +66,8 @@ def _init_backend(self) -> BaseEmbeddingBackend: return OpenaiEmbeddingBackend(self.config) if isinstance(self.config, HashingVectorizerEmbeddingConfig): return HashingVectorizerEmbeddingBackend(self.config) - # Check if it's exactly the abstract base config (not a subclass) - + if isinstance(self.config, VllmEmbeddingConfig): + return VllmEmbeddingBackend(self.config) msg = f"Cannot instantiate abstract EmbedderConfig: {self.config.__repr__()}" raise TypeError(msg) @@ -161,6 +163,8 @@ def load(cls, path: Path | str, override_config: EmbedderConfig | None = None) - instance._backend = OpenaiEmbeddingBackend.load(backend_path) elif isinstance(config, HashingVectorizerEmbeddingConfig): instance._backend = HashingVectorizerEmbeddingBackend.load(backend_path) + elif isinstance(config, VllmEmbeddingConfig): + instance._backend = VllmEmbeddingBackend.load(backend_path) else: msg = f"Cannot load abstract EmbedderConfig: {config.__repr__()}" raise TypeError(msg) diff --git a/src/autointent/_wrappers/embedder/vllm.py b/src/autointent/_wrappers/embedder/vllm.py new file mode 100644 index 000000000..8f4054a73 --- /dev/null +++ b/src/autointent/_wrappers/embedder/vllm.py @@ -0,0 +1,184 @@ +"""vLLM-based embedding backend for GPU-accelerated inference.""" + +from __future__ import annotations + +import json +import logging +from typing 
import TYPE_CHECKING, Literal, cast, overload + +import numpy as np +import torch + +from autointent._hash import Hasher +from autointent._utils import require +from autointent.configs._embedder import VllmEmbeddingConfig + +from .base import BaseEmbeddingBackend +from .utils import get_embeddings_path + +if TYPE_CHECKING: + from pathlib import Path + + import numpy.typing as npt + from vllm import LLM + + from autointent.configs import TaskTypeEnum + +logger = logging.getLogger(__name__) + + +class VllmEmbeddingBackend(BaseEmbeddingBackend): + """vLLM-based embedding backend implementation.""" + + supports_training: bool = False + + def __init__(self, config: VllmEmbeddingConfig) -> None: + """Initialize the vLLM backend. + + Args: + config: Configuration for vLLM embeddings. + """ + self.config = config + self._model = None + + def _load_model(self) -> LLM: + """Lazy-load the vLLM LLM engine on first use.""" + if self._model is None: + require("vllm", extra="vllm") + from vllm import LLM + + kwargs = { + "model": self.config.model_name, + "task": "embed", + "gpu_memory_utilization": self.config.gpu_memory_utilization, + "dtype": self.config.dtype, + "trust_remote_code": self.config.trust_remote_code, + **self.config.extra_init_kwargs, + } + if self.config.max_model_len is not None: + kwargs["max_model_len"] = self.config.max_model_len + + logger.debug("Loading vLLM embedding model %s", self.config.model_name) + self._model = LLM(**kwargs) + return self._model + + def clear_ram(self) -> None: + """Release GPU memory held by the vLLM engine.""" + if self._model is not None: + logger.debug("Clearing vLLM embedder %s from GPU memory", self.config.model_name) + del self._model + self._model = None + torch.cuda.empty_cache() + + def get_hash(self) -> int: + """Compute a hash value for identifying the embedding model.""" + hasher = Hasher() + hasher.update(self.config.model_name) + hasher.update(str(self.config.max_model_len)) + return hasher.intdigest() + + def embed( + self, + utterances: list[str], + task_type: TaskTypeEnum | None = None, + return_tensors: bool = False, + ) -> npt.NDArray[np.float32] | torch.Tensor: + """Calculate embeddings for a list of utterances. + + Args: + utterances: List of input texts to calculate embeddings for. + task_type: Type of task for which embeddings are calculated. + return_tensors: If True, return a PyTorch tensor; otherwise, return a numpy array. + + Returns: + A numpy array or PyTorch tensor of embeddings. 
+ """ + if len(utterances) == 0: + msg = "Empty input" + logger.error(msg) + raise ValueError(msg) + + prompt = self.config.get_prompt(task_type) + if prompt: + utterances = [f"{prompt} {utterance}" for utterance in utterances] + + if self.config.use_cache: + hasher = Hasher() + hasher.update(self.get_hash()) + hasher.update(utterances) + if prompt: + hasher.update(prompt) + + embeddings_path = get_embeddings_path(hasher.hexdigest()) + if embeddings_path.exists(): + logger.debug("Loading cached vLLM embeddings from %s", embeddings_path) + embeddings_np = cast("npt.NDArray[np.float32]", np.load(embeddings_path)) + if return_tensors: + return torch.from_numpy(embeddings_np) + return embeddings_np + + model = self._load_model() + + logger.debug( + "Calculating embeddings with vLLM model %s, batch_size=%d", + self.config.model_name, + self.config.batch_size, + ) + + outputs = model.encode(utterances, pooling_task="embed", **self.config.extra_encode_kwargs) + all_embeddings = [output.outputs.embedding for output in outputs] + + embeddings_np = np.array(all_embeddings, dtype=np.float32) + + if self.config.use_cache: + embeddings_path.parent.mkdir(parents=True, exist_ok=True) + np.save(embeddings_path, embeddings_np) + + if return_tensors: + return torch.from_numpy(embeddings_np) + return embeddings_np + + def similarity( + self, embeddings1: npt.NDArray[np.float32], embeddings2: npt.NDArray[np.float32] + ) -> npt.NDArray[np.float32]: + """Calculate cosine similarity between two sets of embeddings. + + Args: + embeddings1: First set of embeddings (size n). + embeddings2: Second set of embeddings (size m). + + Returns: + A numpy array of similarities (size n x m). + """ + norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True) + norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True) + normalized1 = embeddings1 / norm1 + normalized2 = embeddings2 / norm2 + return cast("npt.NDArray[np.float32]", np.dot(normalized1, normalized2.T)) + + def dump(self, path: Path) -> None: + """Save the backend config to disk (stateless — no model weights to save). + + Args: + path: Path to the directory where the backend will be saved. + """ + path.mkdir(parents=True, exist_ok=True) + config_path = path / "config.json" + with config_path.open("w", encoding="utf-8") as file: + json.dump(self.config.model_dump(mode="json"), file, indent=4, ensure_ascii=False) + + @classmethod + def load(cls, path: Path) -> VllmEmbeddingBackend: + """Load the backend from saved config. + + Args: + path: Path to the directory where the backend is stored. + + Returns: + Loaded backend instance. 
+ """ + config_path = path / "config.json" + with config_path.open("r", encoding="utf-8") as file: + config_data = json.load(file) + config = VllmEmbeddingConfig.model_validate(config_data) + return cls(config) diff --git a/src/autointent/configs/__init__.py b/src/autointent/configs/__init__.py index efa41f592..45531ea2c 100644 --- a/src/autointent/configs/__init__.py +++ b/src/autointent/configs/__init__.py @@ -6,6 +6,7 @@ OpenaiEmbeddingConfig, SentenceTransformerEmbeddingConfig, TaskTypeEnum, + VllmEmbeddingConfig, get_default_embedder_config, initialize_embedder_config, ) @@ -41,6 +42,7 @@ "TokenizerConfig", "TorchTrainingConfig", "VectorIndexConfig", + "VllmEmbeddingConfig", "VocabConfig", "get_default_embedder_config", "get_default_hfmodel_config", diff --git a/src/autointent/configs/_embedder.py b/src/autointent/configs/_embedder.py index 7b8252d52..f11784217 100644 --- a/src/autointent/configs/_embedder.py +++ b/src/autointent/configs/_embedder.py @@ -122,8 +122,41 @@ class HashingVectorizerEmbeddingConfig(BaseEmbedderConfig): dtype: str = Field("float32", description="Type of the matrix returned by fit_transform() or transform().") +class VllmEmbeddingConfig(BaseEmbedderConfig): + """Configuration for vLLM-based embeddings.""" + + model_name: str = Field("BAAI/bge-base-en-v1.5", description="Name of the HuggingFace model to load via vLLM.") + batch_size: int = Field(32, description="Number of texts to encode per vLLM encode() call.") + max_model_len: int | None = Field( + None, description="Maximum sequence length. Reduces VRAM usage for long-context models." + ) + gpu_memory_utilization: float = Field( + 0.9, + ge=0.0, + le=1.0, + description="Fraction of GPU memory vLLM is allowed to use (0.0 to 1.0).", + ) + dtype: str = Field( + "auto", + description="Data type for model weights: 'auto', 'float16', 'bfloat16', 'float32'.", + ) + trust_remote_code: bool = Field(False, description="Whether to trust remote code when loading the model.") + extra_init_kwargs: dict[str, Any] = Field( + default_factory=dict, + description="Extra keyword arguments passed to the vLLM LLM() constructor.", + ) + extra_encode_kwargs: dict[str, Any] = Field( + default_factory=dict, + description="Extra keyword arguments passed to llm.encode() at inference time (e.g. 
custom SamplingParams).", + ) + + EmbedderConfig: TypeAlias = ( - SentenceTransformerEmbeddingConfig | OpenaiEmbeddingConfig | HashingVectorizerEmbeddingConfig | BaseEmbedderConfig + SentenceTransformerEmbeddingConfig + | OpenaiEmbeddingConfig + | HashingVectorizerEmbeddingConfig + | VllmEmbeddingConfig + | BaseEmbedderConfig ) @@ -140,4 +173,6 @@ def initialize_embedder_config(values: dict[str, Any] | str | BaseEmbedderConfig return get_default_embedder_config(model_name=values) if isinstance(values, dict) and "n_features" in values: return HashingVectorizerEmbeddingConfig(**values) + if isinstance(values, dict) and "gpu_memory_utilization" in values: + return VllmEmbeddingConfig(**values) return get_default_embedder_config(**values) diff --git a/tests/embedder/conftest.py b/tests/embedder/conftest.py index 81dd189f0..ac78b41d0 100644 --- a/tests/embedder/conftest.py +++ b/tests/embedder/conftest.py @@ -1,17 +1,23 @@ +import importlib.util import os import platform import pytest +import torch from autointent.configs import ( HashingVectorizerEmbeddingConfig, OpenaiEmbeddingConfig, SentenceTransformerEmbeddingConfig, + VllmEmbeddingConfig, ) # Check if OpenAI API key is available for testing openai_available = os.getenv("OPENAI_API_KEY") is not None +# Check if vLLM is installed and CUDA is available +vllm_available = importlib.util.find_spec("vllm") is not None and torch.cuda.is_available() + @pytest.fixture def on_windows() -> bool: @@ -50,6 +56,19 @@ def on_windows() -> bool: ), id="openai", ), + pytest.param( + VllmEmbeddingConfig( + model_name="sergeyzh/rubert-tiny-turbo", + batch_size=4, + use_cache=False, + max_model_len=512, + ), + marks=pytest.mark.skipif( + not vllm_available, + reason="vLLM not installed or CUDA not available (pip install autointent[vllm])", + ), + id="vllm", + ), ] # Only SentenceTransformer backend supports training @@ -90,3 +109,16 @@ def create_openai_config(**kwargs) -> OpenaiEmbeddingConfig: } defaults.update(kwargs) return OpenaiEmbeddingConfig(**defaults) + + +def create_vllm_config(**kwargs) -> VllmEmbeddingConfig: + """Helper function to create VllmEmbeddingConfig with test-friendly defaults.""" + defaults = { + "model_name": "BAAI/bge-base-en-v1.5", + "batch_size": 4, + "use_cache": False, + "gpu_memory_utilization": 0.5, + "max_model_len": 512, + } + defaults.update(kwargs) + return VllmEmbeddingConfig(**defaults) From 2bf098c9fddff6110c4add919f55ddf22b3b7d8b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 6 May 2026 09:59:23 +0000 Subject: [PATCH 2/4] Update optimizer_config.schema.json --- docs/optimizer_config.schema.json | 152 ++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/docs/optimizer_config.schema.json b/docs/optimizer_config.schema.json index 9057800b1..97532300d 100644 --- a/docs/optimizer_config.schema.json +++ b/docs/optimizer_config.schema.json @@ -972,6 +972,155 @@ }, "title": "TokenizerConfig", "type": "object" + }, + "VllmEmbeddingConfig": { + "additionalProperties": false, + "description": "Configuration for vLLM-based embeddings.", + "properties": { + "default_prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Default prompt for the model. 
This is used when no task-specific prompt is provided.",
+          "title": "Default Prompt"
+        },
+        "classification_prompt": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Prompt for classifier.",
+          "title": "Classification Prompt"
+        },
+        "cluster_prompt": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Prompt for clustering.",
+          "title": "Cluster Prompt"
+        },
+        "sts_prompt": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Prompt for finding most similar sentences.",
+          "title": "Sts Prompt"
+        },
+        "query_prompt": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Prompt for query.",
+          "title": "Query Prompt"
+        },
+        "passage_prompt": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Prompt for passage.",
+          "title": "Passage Prompt"
+        },
+        "use_cache": {
+          "default": true,
+          "description": "Whether to use embeddings caching.",
+          "title": "Use Cache",
+          "type": "boolean"
+        },
+        "model_name": {
+          "default": "BAAI/bge-base-en-v1.5",
+          "description": "Name of the HuggingFace model to load via vLLM.",
+          "title": "Model Name",
+          "type": "string"
+        },
+        "batch_size": {
+          "default": 32,
+          "description": "Number of texts to encode per vLLM encode() call.",
+          "title": "Batch Size",
+          "type": "integer"
+        },
+        "max_model_len": {
+          "anyOf": [
+            {
+              "type": "integer"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Maximum sequence length. Reduces VRAM usage for long-context models.",
+          "title": "Max Model Len"
+        },
+        "gpu_memory_utilization": {
+          "default": 0.9,
+          "description": "Fraction of GPU memory vLLM is allowed to use (0.0 to 1.0).",
+          "maximum": 1.0,
+          "minimum": 0.0,
+          "title": "Gpu Memory Utilization",
+          "type": "number"
+        },
+        "dtype": {
+          "default": "auto",
+          "description": "Data type for model weights: 'auto', 'float16', 'bfloat16', 'float32'.",
+          "title": "Dtype",
+          "type": "string"
+        },
+        "trust_remote_code": {
+          "default": false,
+          "description": "Whether to trust remote code when loading the model.",
+          "title": "Trust Remote Code",
+          "type": "boolean"
+        },
+        "extra_init_kwargs": {
+          "additionalProperties": true,
+          "description": "Extra keyword arguments passed to the vLLM LLM() constructor.",
+          "title": "Extra Init Kwargs",
+          "type": "object"
+        },
+        "extra_encode_kwargs": {
+          "additionalProperties": true,
+          "description": "Extra keyword arguments passed to llm.encode() at inference time (e.g. 
custom SamplingParams).", + "title": "Extra Encode Kwargs", + "type": "object" + } + }, + "title": "VllmEmbeddingConfig", + "type": "object" } }, "description": "Configuration for the optimization process.\n\nOne can use it to customize optimization beyond choosing different preset.\nInstantiate it and pass to :py:meth:`autointent.Pipeline.from_optimization_config`.", @@ -1019,6 +1168,9 @@ { "$ref": "#/$defs/HashingVectorizerEmbeddingConfig" }, + { + "$ref": "#/$defs/VllmEmbeddingConfig" + }, { "$ref": "#/$defs/BaseEmbedderConfig" } From 481f38bf6edcbc250134b95e28c78c50a7ead635 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 6 May 2026 12:00:22 +0200 Subject: [PATCH 3/4] lint again --- src/autointent/_wrappers/embedder/vllm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/autointent/_wrappers/embedder/vllm.py b/src/autointent/_wrappers/embedder/vllm.py index 8f4054a73..cc5d9f8c8 100644 --- a/src/autointent/_wrappers/embedder/vllm.py +++ b/src/autointent/_wrappers/embedder/vllm.py @@ -4,7 +4,7 @@ import json import logging -from typing import TYPE_CHECKING, Literal, cast, overload +from typing import TYPE_CHECKING, cast import numpy as np import torch From e90532477e7acc4c7a32f13c03cc2fdd7d1fcd41 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 6 May 2026 12:02:25 +0200 Subject: [PATCH 4/4] fix typeing --- src/autointent/_wrappers/embedder/vllm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/autointent/_wrappers/embedder/vllm.py b/src/autointent/_wrappers/embedder/vllm.py index cc5d9f8c8..170c73aa5 100644 --- a/src/autointent/_wrappers/embedder/vllm.py +++ b/src/autointent/_wrappers/embedder/vllm.py @@ -20,7 +20,7 @@ from pathlib import Path import numpy.typing as npt - from vllm import LLM + from vllm import LLM # type: ignore[import-not-found] from autointent.configs import TaskTypeEnum @@ -65,7 +65,7 @@ def _load_model(self) -> LLM: def clear_ram(self) -> None: """Release GPU memory held by the vLLM engine.""" if self._model is not None: - logger.debug("Clearing vLLM embedder %s from GPU memory", self.config.model_name) + logger.debug("Clearing vLLM embedder %s from GPU memory", self.config.model_name) # type: ignore[unreachable] del self._model self._model = None torch.cuda.empty_cache()