From 132f3136a393bfa0688a7c54d0865388bc1b2e77 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 6 May 2026 11:56:57 +0200 Subject: [PATCH 1/4] add vllm support --- pyproject.toml | 3 + src/autointent/_wrappers/embedder/__init__.py | 2 + src/autointent/_wrappers/embedder/embedder.py | 8 +- src/autointent/_wrappers/embedder/vllm.py | 184 ++++++++++++++++++ src/autointent/configs/__init__.py | 2 + src/autointent/configs/_embedder.py | 37 +++- tests/embedder/conftest.py | 32 +++ 7 files changed, 265 insertions(+), 3 deletions(-) create mode 100644 src/autointent/_wrappers/embedder/vllm.py diff --git a/pyproject.toml b/pyproject.toml index 23a64f33c..816709e82 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,6 +77,9 @@ opensearch = [ openai = [ "openai (>=2,<3)", ] +vllm = [ + "vllm>=0.20.0", +] [tool.uv] conflicts = [ diff --git a/src/autointent/_wrappers/embedder/__init__.py b/src/autointent/_wrappers/embedder/__init__.py index 23d37f489..9c32a88e2 100644 --- a/src/autointent/_wrappers/embedder/__init__.py +++ b/src/autointent/_wrappers/embedder/__init__.py @@ -5,6 +5,7 @@ from .hashing_vectorizer import HashingVectorizerEmbeddingBackend from .openai import OpenaiEmbeddingBackend from .sentence_transformers import SentenceTransformerEmbeddingBackend +from .vllm import VllmEmbeddingBackend __all__ = [ "BaseEmbeddingBackend", @@ -12,4 +13,5 @@ "HashingVectorizerEmbeddingBackend", "OpenaiEmbeddingBackend", "SentenceTransformerEmbeddingBackend", + "VllmEmbeddingBackend", ] diff --git a/src/autointent/_wrappers/embedder/embedder.py b/src/autointent/_wrappers/embedder/embedder.py index 70c513bb9..1884f53d2 100644 --- a/src/autointent/_wrappers/embedder/embedder.py +++ b/src/autointent/_wrappers/embedder/embedder.py @@ -16,11 +16,13 @@ HashingVectorizerEmbeddingConfig, OpenaiEmbeddingConfig, SentenceTransformerEmbeddingConfig, + VllmEmbeddingConfig, ) from .hashing_vectorizer import HashingVectorizerEmbeddingBackend from .openai import OpenaiEmbeddingBackend from .sentence_transformers import SentenceTransformerEmbeddingBackend +from .vllm import VllmEmbeddingBackend if TYPE_CHECKING: import numpy as np @@ -64,8 +66,8 @@ def _init_backend(self) -> BaseEmbeddingBackend: return OpenaiEmbeddingBackend(self.config) if isinstance(self.config, HashingVectorizerEmbeddingConfig): return HashingVectorizerEmbeddingBackend(self.config) - # Check if it's exactly the abstract base config (not a subclass) - + if isinstance(self.config, VllmEmbeddingConfig): + return VllmEmbeddingBackend(self.config) msg = f"Cannot instantiate abstract EmbedderConfig: {self.config.__repr__()}" raise TypeError(msg) @@ -161,6 +163,8 @@ def load(cls, path: Path | str, override_config: EmbedderConfig | None = None) - instance._backend = OpenaiEmbeddingBackend.load(backend_path) elif isinstance(config, HashingVectorizerEmbeddingConfig): instance._backend = HashingVectorizerEmbeddingBackend.load(backend_path) + elif isinstance(config, VllmEmbeddingConfig): + instance._backend = VllmEmbeddingBackend.load(backend_path) else: msg = f"Cannot load abstract EmbedderConfig: {config.__repr__()}" raise TypeError(msg) diff --git a/src/autointent/_wrappers/embedder/vllm.py b/src/autointent/_wrappers/embedder/vllm.py new file mode 100644 index 000000000..8f4054a73 --- /dev/null +++ b/src/autointent/_wrappers/embedder/vllm.py @@ -0,0 +1,184 @@ +"""vLLM-based embedding backend for GPU-accelerated inference.""" + +from __future__ import annotations + +import json +import logging +from typing 
import TYPE_CHECKING, Literal, cast, overload + +import numpy as np +import torch + +from autointent._hash import Hasher +from autointent._utils import require +from autointent.configs._embedder import VllmEmbeddingConfig + +from .base import BaseEmbeddingBackend +from .utils import get_embeddings_path + +if TYPE_CHECKING: + from pathlib import Path + + import numpy.typing as npt + from vllm import LLM + + from autointent.configs import TaskTypeEnum + +logger = logging.getLogger(__name__) + + +class VllmEmbeddingBackend(BaseEmbeddingBackend): + """vLLM-based embedding backend implementation.""" + + supports_training: bool = False + + def __init__(self, config: VllmEmbeddingConfig) -> None: + """Initialize the vLLM backend. + + Args: + config: Configuration for vLLM embeddings. + """ + self.config = config + self._model = None + + def _load_model(self) -> LLM: + """Lazy-load the vLLM LLM engine on first use.""" + if self._model is None: + require("vllm", extra="vllm") + from vllm import LLM + + kwargs = { + "model": self.config.model_name, + "task": "embed", + "gpu_memory_utilization": self.config.gpu_memory_utilization, + "dtype": self.config.dtype, + "trust_remote_code": self.config.trust_remote_code, + **self.config.extra_init_kwargs, + } + if self.config.max_model_len is not None: + kwargs["max_model_len"] = self.config.max_model_len + + logger.debug("Loading vLLM embedding model %s", self.config.model_name) + self._model = LLM(**kwargs) + return self._model + + def clear_ram(self) -> None: + """Release GPU memory held by the vLLM engine.""" + if self._model is not None: + logger.debug("Clearing vLLM embedder %s from GPU memory", self.config.model_name) + del self._model + self._model = None + torch.cuda.empty_cache() + + def get_hash(self) -> int: + """Compute a hash value for identifying the embedding model.""" + hasher = Hasher() + hasher.update(self.config.model_name) + hasher.update(str(self.config.max_model_len)) + return hasher.intdigest() + + def embed( + self, + utterances: list[str], + task_type: TaskTypeEnum | None = None, + return_tensors: bool = False, + ) -> npt.NDArray[np.float32] | torch.Tensor: + """Calculate embeddings for a list of utterances. + + Args: + utterances: List of input texts to calculate embeddings for. + task_type: Type of task for which embeddings are calculated. + return_tensors: If True, return a PyTorch tensor; otherwise, return a numpy array. + + Returns: + A numpy array or PyTorch tensor of embeddings. 
+ """ + if len(utterances) == 0: + msg = "Empty input" + logger.error(msg) + raise ValueError(msg) + + prompt = self.config.get_prompt(task_type) + if prompt: + utterances = [f"{prompt} {utterance}" for utterance in utterances] + + if self.config.use_cache: + hasher = Hasher() + hasher.update(self.get_hash()) + hasher.update(utterances) + if prompt: + hasher.update(prompt) + + embeddings_path = get_embeddings_path(hasher.hexdigest()) + if embeddings_path.exists(): + logger.debug("Loading cached vLLM embeddings from %s", embeddings_path) + embeddings_np = cast("npt.NDArray[np.float32]", np.load(embeddings_path)) + if return_tensors: + return torch.from_numpy(embeddings_np) + return embeddings_np + + model = self._load_model() + + logger.debug( + "Calculating embeddings with vLLM model %s, batch_size=%d", + self.config.model_name, + self.config.batch_size, + ) + + outputs = model.encode(utterances, pooling_task="embed", **self.config.extra_encode_kwargs) + all_embeddings = [output.outputs.embedding for output in outputs] + + embeddings_np = np.array(all_embeddings, dtype=np.float32) + + if self.config.use_cache: + embeddings_path.parent.mkdir(parents=True, exist_ok=True) + np.save(embeddings_path, embeddings_np) + + if return_tensors: + return torch.from_numpy(embeddings_np) + return embeddings_np + + def similarity( + self, embeddings1: npt.NDArray[np.float32], embeddings2: npt.NDArray[np.float32] + ) -> npt.NDArray[np.float32]: + """Calculate cosine similarity between two sets of embeddings. + + Args: + embeddings1: First set of embeddings (size n). + embeddings2: Second set of embeddings (size m). + + Returns: + A numpy array of similarities (size n x m). + """ + norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True) + norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True) + normalized1 = embeddings1 / norm1 + normalized2 = embeddings2 / norm2 + return cast("npt.NDArray[np.float32]", np.dot(normalized1, normalized2.T)) + + def dump(self, path: Path) -> None: + """Save the backend config to disk (stateless — no model weights to save). + + Args: + path: Path to the directory where the backend will be saved. + """ + path.mkdir(parents=True, exist_ok=True) + config_path = path / "config.json" + with config_path.open("w", encoding="utf-8") as file: + json.dump(self.config.model_dump(mode="json"), file, indent=4, ensure_ascii=False) + + @classmethod + def load(cls, path: Path) -> VllmEmbeddingBackend: + """Load the backend from saved config. + + Args: + path: Path to the directory where the backend is stored. + + Returns: + Loaded backend instance. 
+ """ + config_path = path / "config.json" + with config_path.open("r", encoding="utf-8") as file: + config_data = json.load(file) + config = VllmEmbeddingConfig.model_validate(config_data) + return cls(config) diff --git a/src/autointent/configs/__init__.py b/src/autointent/configs/__init__.py index efa41f592..45531ea2c 100644 --- a/src/autointent/configs/__init__.py +++ b/src/autointent/configs/__init__.py @@ -6,6 +6,7 @@ OpenaiEmbeddingConfig, SentenceTransformerEmbeddingConfig, TaskTypeEnum, + VllmEmbeddingConfig, get_default_embedder_config, initialize_embedder_config, ) @@ -41,6 +42,7 @@ "TokenizerConfig", "TorchTrainingConfig", "VectorIndexConfig", + "VllmEmbeddingConfig", "VocabConfig", "get_default_embedder_config", "get_default_hfmodel_config", diff --git a/src/autointent/configs/_embedder.py b/src/autointent/configs/_embedder.py index 7b8252d52..f11784217 100644 --- a/src/autointent/configs/_embedder.py +++ b/src/autointent/configs/_embedder.py @@ -122,8 +122,41 @@ class HashingVectorizerEmbeddingConfig(BaseEmbedderConfig): dtype: str = Field("float32", description="Type of the matrix returned by fit_transform() or transform().") +class VllmEmbeddingConfig(BaseEmbedderConfig): + """Configuration for vLLM-based embeddings.""" + + model_name: str = Field("BAAI/bge-base-en-v1.5", description="Name of the HuggingFace model to load via vLLM.") + batch_size: int = Field(32, description="Number of texts to encode per vLLM encode() call.") + max_model_len: int | None = Field( + None, description="Maximum sequence length. Reduces VRAM usage for long-context models." + ) + gpu_memory_utilization: float = Field( + 0.9, + ge=0.0, + le=1.0, + description="Fraction of GPU memory vLLM is allowed to use (0.0 to 1.0).", + ) + dtype: str = Field( + "auto", + description="Data type for model weights: 'auto', 'float16', 'bfloat16', 'float32'.", + ) + trust_remote_code: bool = Field(False, description="Whether to trust remote code when loading the model.") + extra_init_kwargs: dict[str, Any] = Field( + default_factory=dict, + description="Extra keyword arguments passed to the vLLM LLM() constructor.", + ) + extra_encode_kwargs: dict[str, Any] = Field( + default_factory=dict, + description="Extra keyword arguments passed to llm.encode() at inference time (e.g. 
custom SamplingParams).", + ) + + EmbedderConfig: TypeAlias = ( - SentenceTransformerEmbeddingConfig | OpenaiEmbeddingConfig | HashingVectorizerEmbeddingConfig | BaseEmbedderConfig + SentenceTransformerEmbeddingConfig + | OpenaiEmbeddingConfig + | HashingVectorizerEmbeddingConfig + | VllmEmbeddingConfig + | BaseEmbedderConfig ) @@ -140,4 +173,6 @@ def initialize_embedder_config(values: dict[str, Any] | str | BaseEmbedderConfig return get_default_embedder_config(model_name=values) if isinstance(values, dict) and "n_features" in values: return HashingVectorizerEmbeddingConfig(**values) + if isinstance(values, dict) and "gpu_memory_utilization" in values: + return VllmEmbeddingConfig(**values) return get_default_embedder_config(**values) diff --git a/tests/embedder/conftest.py b/tests/embedder/conftest.py index 81dd189f0..ac78b41d0 100644 --- a/tests/embedder/conftest.py +++ b/tests/embedder/conftest.py @@ -1,17 +1,23 @@ +import importlib.util import os import platform import pytest +import torch from autointent.configs import ( HashingVectorizerEmbeddingConfig, OpenaiEmbeddingConfig, SentenceTransformerEmbeddingConfig, + VllmEmbeddingConfig, ) # Check if OpenAI API key is available for testing openai_available = os.getenv("OPENAI_API_KEY") is not None +# Check if vLLM is installed and CUDA is available +vllm_available = importlib.util.find_spec("vllm") is not None and torch.cuda.is_available() + @pytest.fixture def on_windows() -> bool: @@ -50,6 +56,19 @@ def on_windows() -> bool: ), id="openai", ), + pytest.param( + VllmEmbeddingConfig( + model_name="sergeyzh/rubert-tiny-turbo", + batch_size=4, + use_cache=False, + max_model_len=512, + ), + marks=pytest.mark.skipif( + not vllm_available, + reason="vLLM not installed or CUDA not available (pip install autointent[vllm])", + ), + id="vllm", + ), ] # Only SentenceTransformer backend supports training @@ -90,3 +109,16 @@ def create_openai_config(**kwargs) -> OpenaiEmbeddingConfig: } defaults.update(kwargs) return OpenaiEmbeddingConfig(**defaults) + + +def create_vllm_config(**kwargs) -> VllmEmbeddingConfig: + """Helper function to create VllmEmbeddingConfig with test-friendly defaults.""" + defaults = { + "model_name": "BAAI/bge-base-en-v1.5", + "batch_size": 4, + "use_cache": False, + "gpu_memory_utilization": 0.5, + "max_model_len": 512, + } + defaults.update(kwargs) + return VllmEmbeddingConfig(**defaults) From 2bf098c9fddff6110c4add919f55ddf22b3b7d8b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 6 May 2026 09:59:23 +0000 Subject: [PATCH 2/4] Update optimizer_config.schema.json --- docs/optimizer_config.schema.json | 152 ++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/docs/optimizer_config.schema.json b/docs/optimizer_config.schema.json index 9057800b1..97532300d 100644 --- a/docs/optimizer_config.schema.json +++ b/docs/optimizer_config.schema.json @@ -972,6 +972,155 @@ }, "title": "TokenizerConfig", "type": "object" + }, + "VllmEmbeddingConfig": { + "additionalProperties": false, + "description": "Configuration for vLLM-based embeddings.", + "properties": { + "default_prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Default prompt for the model. 
This is used when no task-specific prompt is provided.",
+          "title": "Default Prompt"
+        },
+        "classification_prompt": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Prompt for classifier.",
+          "title": "Classification Prompt"
+        },
+        "cluster_prompt": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Prompt for clustering.",
+          "title": "Cluster Prompt"
+        },
+        "sts_prompt": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Prompt for finding most similar sentences.",
+          "title": "Sts Prompt"
+        },
+        "query_prompt": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Prompt for query.",
+          "title": "Query Prompt"
+        },
+        "passage_prompt": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Prompt for passage.",
+          "title": "Passage Prompt"
+        },
+        "use_cache": {
+          "default": true,
+          "description": "Whether to use embeddings caching.",
+          "title": "Use Cache",
+          "type": "boolean"
+        },
+        "model_name": {
+          "default": "BAAI/bge-base-en-v1.5",
+          "description": "Name of the HuggingFace model to load via vLLM.",
+          "title": "Model Name",
+          "type": "string"
+        },
+        "batch_size": {
+          "default": 32,
+          "description": "Number of texts to encode per vLLM encode() call.",
+          "title": "Batch Size",
+          "type": "integer"
+        },
+        "max_model_len": {
+          "anyOf": [
+            {
+              "type": "integer"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Maximum sequence length. Reduces VRAM usage for long-context models.",
+          "title": "Max Model Len"
+        },
+        "gpu_memory_utilization": {
+          "default": 0.9,
+          "description": "Fraction of GPU memory vLLM is allowed to use (0.0 to 1.0).",
+          "maximum": 1.0,
+          "minimum": 0.0,
+          "title": "Gpu Memory Utilization",
+          "type": "number"
+        },
+        "dtype": {
+          "default": "auto",
+          "description": "Data type for model weights: 'auto', 'float16', 'bfloat16', 'float32'.",
+          "title": "Dtype",
+          "type": "string"
+        },
+        "trust_remote_code": {
+          "default": false,
+          "description": "Whether to trust remote code when loading the model.",
+          "title": "Trust Remote Code",
+          "type": "boolean"
+        },
+        "extra_init_kwargs": {
+          "additionalProperties": true,
+          "description": "Extra keyword arguments passed to the vLLM LLM() constructor.",
+          "title": "Extra Init Kwargs",
+          "type": "object"
+        },
+        "extra_encode_kwargs": {
+          "additionalProperties": true,
+          "description": "Extra keyword arguments passed to llm.encode() at inference time (e.g. 
custom SamplingParams).", + "title": "Extra Encode Kwargs", + "type": "object" + } + }, + "title": "VllmEmbeddingConfig", + "type": "object" } }, "description": "Configuration for the optimization process.\n\nOne can use it to customize optimization beyond choosing different preset.\nInstantiate it and pass to :py:meth:`autointent.Pipeline.from_optimization_config`.", @@ -1019,6 +1168,9 @@ { "$ref": "#/$defs/HashingVectorizerEmbeddingConfig" }, + { + "$ref": "#/$defs/VllmEmbeddingConfig" + }, { "$ref": "#/$defs/BaseEmbedderConfig" } From 481f38bf6edcbc250134b95e28c78c50a7ead635 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 6 May 2026 12:00:22 +0200 Subject: [PATCH 3/4] lint again --- src/autointent/_wrappers/embedder/vllm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/autointent/_wrappers/embedder/vllm.py b/src/autointent/_wrappers/embedder/vllm.py index 8f4054a73..cc5d9f8c8 100644 --- a/src/autointent/_wrappers/embedder/vllm.py +++ b/src/autointent/_wrappers/embedder/vllm.py @@ -4,7 +4,7 @@ import json import logging -from typing import TYPE_CHECKING, Literal, cast, overload +from typing import TYPE_CHECKING, cast import numpy as np import torch From e90532477e7acc4c7a32f13c03cc2fdd7d1fcd41 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 6 May 2026 12:02:25 +0200 Subject: [PATCH 4/4] fix typeing --- src/autointent/_wrappers/embedder/vllm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/autointent/_wrappers/embedder/vllm.py b/src/autointent/_wrappers/embedder/vllm.py index cc5d9f8c8..170c73aa5 100644 --- a/src/autointent/_wrappers/embedder/vllm.py +++ b/src/autointent/_wrappers/embedder/vllm.py @@ -20,7 +20,7 @@ from pathlib import Path import numpy.typing as npt - from vllm import LLM + from vllm import LLM # type: ignore[import-not-found] from autointent.configs import TaskTypeEnum @@ -65,7 +65,7 @@ def _load_model(self) -> LLM: def clear_ram(self) -> None: """Release GPU memory held by the vLLM engine.""" if self._model is not None: - logger.debug("Clearing vLLM embedder %s from GPU memory", self.config.model_name) + logger.debug("Clearing vLLM embedder %s from GPU memory", self.config.model_name) # type: ignore[unreachable] del self._model self._model = None torch.cuda.empty_cache()