Skip to content

Commit 57bfdd8

Browse files
committed
feat(LoRA): implement JIT dynamic LoRA routing and Control Vector injection
- This commit introduces a paradigm shift in how adapters are managed by exposing dynamic, per-request LoRA and Control Vector routing through the high-level `Llama` API and OpenAI-compatible endpoints (`create_chat_completion`, `create_completion`, etc.). - Previously, LoRA adapters were statically bound to the model during initialization (`lora_path` inside `__init__`), strictly limiting a loaded model instance to a single persona/task (Single-Tenant). Switching personas required reloading the entire model or duplicating it in VRAM. - With this update: - Users can preload multiple LoRAs into VRAM via `load_lora()`. - Adapters can be dynamically mounted and unmounted at runtime using the `active_loras` and `control_vector` arguments. - The `eval` method now performs Just-In-Time (JIT) weight mounting right before the compute graph executes, followed by a guaranteed state reset (any adapters or control vectors not requested for the current call are cleared, preventing weight contamination between requests). - This unlocks zero-latency "Multi-Tenant" serving, allowing a single base model instance to concurrently serve multiple users with entirely different LoRA personas (e.g., Role A vs. Role B) without VRAM duplication or model reloading overhead. Signed-off-by: JamePeng <jame_peng@sina.com>
1 parent 82ef995 commit 57bfdd8

1 file changed

Lines changed: 114 additions & 6 deletions

File tree

llama_cpp/llama.py

Lines changed: 114 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -688,6 +688,29 @@ def eval_logits(self) -> Deque[List[float]]:
688688
maxlen=self._n_ctx if self._logits_all else 1,
689689
)
690690

691+
# LoRA / Adapter Management API
692+
693+
def load_lora(self, name: str, path: str):
    """Register the LoRA adapter file at *path* under *name* and upload it to VRAM.

    The adapter is only preloaded here; it is not applied to the compute
    graph until explicitly requested (e.g. via an ``active_loras`` entry).
    """
    model = self._model
    model.load_lora(name, path)
696+
697+
def unload_lora(self, name: str):
    """Evict the LoRA adapter registered under *name*, releasing its VRAM."""
    model = self._model
    model.unload_lora(name)
700+
701+
@property
def loaded_lora_count(self) -> int:
    """Number of LoRA adapters currently resident in VRAM."""
    count = self._model.loaded_lora_count
    return count
705+
706+
def list_loras(self) -> List[str]:
707+
"""Returns a list of all registered LoRA names."""
708+
return self._model.list_loras()
709+
710+
def unload_all_loras(self):
    """Force VRAM release for every LoRA adapter in the registry."""
    model = self._model
    model.unload_all_loras()
713+
691714
def tokenize(
692715
self, text: bytes, add_bos: bool = True, special: bool = False
693716
) -> List[int]:
@@ -746,14 +769,17 @@ def reset(self):
746769
"""Reset the model state."""
747770
self.n_tokens = 0
748771

749-
def eval(self, tokens: Sequence[int]):
772+
def eval(
773+
self,
774+
tokens: Sequence[int],
775+
active_loras: Optional[List[Dict[str, Union[str, float]]]] = None,
776+
control_vector: Optional[Dict[str, Any]] = None,
777+
):
750778
"""Evaluate a list of tokens.
751779
752780
Args:
753781
tokens: The list of tokens to evaluate.
754782
"""
755-
if len(tokens) == 0:
756-
return
757783
n_eval = len(tokens)
758784
if n_eval == 0:
759785
return
@@ -852,6 +878,38 @@ def eval(self, tokens: Sequence[int]):
852878
logits_array=logits_array
853879
)
854880

881+
# JIT Dynamic LoRAs Weights Mounting
882+
883+
# Dynamic LoRA Routing
884+
if active_loras is not None:
885+
adapters_to_apply = []
886+
for lora in active_loras:
887+
name = lora.get("name")
888+
scale = float(lora.get("scale", 1.0))
889+
adapter_obj = getattr(self._model, "_lora_registry", {}).get(name)
890+
if adapter_obj:
891+
adapters_to_apply.append((adapter_obj, scale))
892+
elif self.verbose:
893+
print(f"Llama.eval: Warning! LoRA '{name}' not found in registry. Skipping.", file=sys.stderr)
894+
895+
self._ctx.apply_loras(adapters_to_apply)
896+
else:
897+
# Crucial Fallback: Wipe the graph clean if no LoRAs are requested.
898+
# This guarantees zero weight contamination between different users/slots in a multiplexed environment.
899+
self._ctx.clear_loras()
900+
901+
# Dynamic Control Vector (CVec) Injection
902+
if control_vector is not None:
903+
data = control_vector.get("data", [])
904+
il_start = control_vector.get("layer_start", 1)
905+
il_end = control_vector.get("layer_end", self.n_layer())
906+
n_embd = self.n_embd()
907+
908+
self._ctx.apply_cvec(data, n_embd, il_start, il_end)
909+
else:
910+
# Ensure the control vector is cleared for a clean state
911+
self._ctx.clear_cvec()
912+
855913
# Dynamic Batch Downgrade: Attempt to decode, reduce batch size if KV cache is fragmented
856914
current_batch_size = n_chunk
857915
success = False
@@ -1119,6 +1177,8 @@ def generate(
11191177
grammar: Optional[LlamaGrammar] = None,
11201178
grammar_lazy: bool = False,
11211179
seed: Optional[int] = None,
1180+
active_loras: Optional[List[Dict[str, Union[str, float]]]] = None,
1181+
control_vector: Optional[Dict[str, Any]] = None,
11221182
) -> Generator[int, Optional[Sequence[int]], None]:
11231183
"""Create a generator of tokens from a prompt.
11241184
@@ -1164,6 +1224,14 @@ def generate(
11641224
grammar: Optional BNF-like grammar (GBNF) to constrain sampling syntax.
11651225
grammar_lazy: If True, activates grammar constraints only on specific trigger tokens.
11661226
seed: RNG seed for sampling. Overrides the instance seed.
1227+
active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation.
1228+
Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`)
1229+
and an optional "scale" key (float, defaults to 1.0).
1230+
Example: `[{"name": "role_A", "scale": 0.85}, {"name": "role_B", "scale": 0.5}]`.
1231+
control_vector: A dictionary containing Control Vector (CVec) data for representation engineering.
1232+
Must contain a "data" key with a flattened 1D list of floats.
1233+
Optionally accepts "layer_start" (int, defaults to 1) and "layer_end" (int, defaults to the model's total layer count).
1234+
Note: The length of the "data" list MUST be at least `n_embd * layer_end`, with zero-padding for any skipped early layers.
11671235
11681236
Yields:
11691237
The generated tokens.
@@ -1357,7 +1425,7 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array):
13571425
last_token = [tokens[-1]]
13581426

13591427
# 1. Evaluate up to N-1
1360-
self.eval(body_tokens)
1428+
self.eval(body_tokens, active_loras=active_loras, control_vector=control_vector)
13611429

13621430
# 2. Save the N-1 state snapshot
13631431
current_history = self._input_ids[:self.n_tokens].tolist()
@@ -1367,10 +1435,10 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array):
13671435
seq_id=0
13681436
)
13691437
# 3. Evaluate the final token to refresh logits
1370-
self.eval(last_token)
1438+
self.eval(last_token, active_loras=active_loras, control_vector=control_vector)
13711439
else:
13721440
# Standard evaluation or single-token generation step
1373-
self.eval(tokens)
1441+
self.eval(tokens, active_loras=active_loras, control_vector=control_vector)
13741442

13751443
# Sample loop
13761444
while sample_idx < self.n_tokens:
@@ -1672,6 +1740,8 @@ def _create_completion(
16721740
grammar: Optional[LlamaGrammar] = None,
16731741
grammar_lazy: bool = False,
16741742
seed: Optional[int] = None,
1743+
active_loras: Optional[List[Dict[str, Union[str, float]]]] = None,
1744+
control_vector: Optional[Dict[str, Any]] = None,
16751745
) -> Union[
16761746
Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse]
16771747
]:
@@ -1851,6 +1921,8 @@ def _create_completion(
18511921
grammar=grammar,
18521922
grammar_lazy=grammar_lazy,
18531923
seed=seed if seed is not None else self._seed,
1924+
active_loras=active_loras,
1925+
control_vector=control_vector,
18541926
):
18551927
if llama_cpp.llama_token_is_eog(self._model.vocab, token):
18561928
text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens)
@@ -2303,6 +2375,8 @@ def create_completion(
23032375
logits_processor: Optional[LogitsProcessorList] = None,
23042376
grammar: Optional[LlamaGrammar] = None,
23052377
grammar_lazy: bool = False,
2378+
active_loras: Optional[List[Dict[str, Union[str, float]]]] = None,
2379+
control_vector: Optional[Dict[str, Any]] = None,
23062380
) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
23072381
"""Generate text from a prompt.
23082382
@@ -2347,6 +2421,14 @@ def create_completion(
23472421
logits_processor: A list of logits processors to use.
23482422
grammar: A grammar to use for constrained sampling.
23492423
grammar_lazy: If True, enables lazy evaluation.
2424+
active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation.
2425+
Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`)
2426+
and an optional "scale" key (float, defaults to 1.0).
2427+
Example: `[{"name": "role_A", "scale": 0.85}, {"name": "role_B", "scale": 0.5}]`.
2428+
control_vector: A dictionary containing Control Vector (CVec) data for representation engineering.
2429+
Must contain a "data" key with a flattened 1D list of floats.
2430+
Optionally accepts "layer_start" (int, defaults to 1) and "layer_end" (int, defaults to the model's total layer count).
2431+
Note: The length of the "data" list MUST be at least `n_embd * layer_end`, with zero-padding for any skipped early layers.
23502432
23512433
Raises:
23522434
ValueError: If the requested tokens exceed the context window.
@@ -2396,6 +2478,8 @@ def create_completion(
23962478
logits_processor=logits_processor,
23972479
grammar=grammar,
23982480
grammar_lazy=grammar_lazy,
2481+
active_loras=active_loras,
2482+
control_vector=control_vector,
23992483
)
24002484
if stream:
24012485
chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks
@@ -2445,6 +2529,8 @@ def __call__(
24452529
logits_processor: Optional[LogitsProcessorList] = None,
24462530
grammar: Optional[LlamaGrammar] = None,
24472531
grammar_lazy: bool = False,
2532+
active_loras: Optional[List[Dict[str, Union[str, float]]]] = None,
2533+
control_vector: Optional[Dict[str, Any]] = None,
24482534
) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
24492535
"""Generate text from a prompt.
24502536
@@ -2489,6 +2575,14 @@ def __call__(
24892575
logits_processor: A list of logits processors to use.
24902576
grammar: A grammar to use for constrained sampling.
24912577
grammar_lazy: If True, enables lazy evaluation.
2578+
active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation.
2579+
Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`)
2580+
and an optional "scale" key (float, defaults to 1.0).
2581+
Example: `[{"name": "role_A", "scale": 0.85}, {"name": "role_B", "scale": 0.5}]`.
2582+
control_vector: A dictionary containing Control Vector (CVec) data for representation engineering.
2583+
Must contain a "data" key with a flattened 1D list of floats.
2584+
Optionally accepts "layer_start" (int, defaults to 1) and "layer_end" (int, defaults to the model's total layer count).
2585+
Note: The length of the "data" list MUST be at least `n_embd * layer_end`, with zero-padding for any skipped early layers.
24922586
24932587
Raises:
24942588
ValueError: If the requested tokens exceed the context window.
@@ -2538,6 +2632,8 @@ def __call__(
25382632
logits_processor=logits_processor,
25392633
grammar=grammar,
25402634
grammar_lazy=grammar_lazy,
2635+
active_loras=active_loras,
2636+
control_vector=control_vector,
25412637
)
25422638

25432639
def create_chat_completion(
@@ -2583,6 +2679,8 @@ def create_chat_completion(
25832679
logits_processor: Optional[LogitsProcessorList] = None,
25842680
grammar: Optional[LlamaGrammar] = None,
25852681
grammar_lazy: bool = False,
2682+
active_loras: Optional[List[Dict[str, Union[str, float]]]] = None,
2683+
control_vector: Optional[Dict[str, Any]] = None,
25862684
logprobs: Optional[bool] = None,
25872685
top_logprobs: Optional[int] = None,
25882686
) -> Union[
@@ -2632,6 +2730,14 @@ def create_chat_completion(
26322730
logits_processor: A list of logits processors to use.
26332731
grammar: A grammar to use.
26342732
grammar_lazy: If True, enables lazy evaluation.
2733+
active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation.
2734+
Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`)
2735+
and an optional "scale" key (float, defaults to 1.0).
2736+
Example: `[{"name": "role_A", "scale": 0.85}, {"name": "role_B", "scale": 0.5}]`.
2737+
control_vector: A dictionary containing Control Vector (CVec) data for representation engineering.
2738+
Must contain a "data" key with a flattened 1D list of floats.
2739+
Optionally accepts "layer_start" (int, defaults to 1) and "layer_end" (int, defaults to the model's total layer count).
2740+
Note: The length of the "data" list MUST be at least `n_embd * layer_end`, with zero-padding for any skipped early layers.
26352741
26362742
Returns:
26372743
Generated chat completion or a stream of chat completion chunks.
@@ -2686,6 +2792,8 @@ def create_chat_completion(
26862792
logits_processor=logits_processor,
26872793
grammar=grammar,
26882794
grammar_lazy=grammar_lazy,
2795+
active_loras=active_loras,
2796+
control_vector=control_vector,
26892797
)
26902798

26912799
def create_chat_completion_openai_v1(

0 commit comments

Comments (0)