@@ -688,6 +688,29 @@ def eval_logits(self) -> Deque[List[float]]:
688688 maxlen = self ._n_ctx if self ._logits_all else 1 ,
689689 )
690690
691+ # LoRA / Adapter Management API
692+
def load_lora(self, name: str, path: str):
    """Register a LoRA adapter and load its weights into VRAM.

    The adapter is only loaded and registered under ``name``; it is not
    applied to the computation graph yet.

    Args:
        name: Identifier under which the adapter is registered.
        path: Filesystem path to the LoRA adapter file.
    """
    model = self._model
    model.load_lora(name, path)
696+
def unload_lora(self, name: str):
    """Unload a single LoRA adapter, freeing the VRAM it occupies.

    Args:
        name: Identifier of a previously loaded adapter.
    """
    self._model.unload_lora(name)
700+
@property
def loaded_lora_count(self) -> int:
    """Number of LoRA adapters currently resident in VRAM."""
    return self._model.loaded_lora_count
705+
def list_loras(self) -> List[str]:
    """Return the names of all registered LoRA adapters."""
    names = self._model.list_loras()
    return names
709+
def unload_all_loras(self):
    """Unload every registered LoRA adapter, forcing VRAM release for each."""
    self._model.unload_all_loras()
713+
691714 def tokenize (
692715 self , text : bytes , add_bos : bool = True , special : bool = False
693716 ) -> List [int ]:
@@ -746,14 +769,17 @@ def reset(self):
746769 """Reset the model state."""
747770 self .n_tokens = 0
748771
749- def eval (self , tokens : Sequence [int ]):
772+ def eval (
773+ self ,
774+ tokens : Sequence [int ],
775+ active_loras : Optional [List [Dict [str , Union [str , float ]]]] = None ,
776+ control_vector : Optional [Dict [str , Any ]] = None ,
777+ ):
750778 """Evaluate a list of tokens.
751779
752780 Args:
753781 tokens: The list of tokens to evaluate.
754782 """
755- if len (tokens ) == 0 :
756- return
757783 n_eval = len (tokens )
758784 if n_eval == 0 :
759785 return
@@ -852,6 +878,38 @@ def eval(self, tokens: Sequence[int]):
852878 logits_array = logits_array
853879 )
854880
881+ # JIT dynamic LoRA weight mounting
882+
883+ # Dynamic LoRA Routing
884+ if active_loras is not None :
885+ adapters_to_apply = []
886+ for lora in active_loras :
887+ name = lora .get ("name" )
888+ scale = float (lora .get ("scale" , 1.0 ))
889+ adapter_obj = getattr (self ._model , "_lora_registry" , {}).get (name )
890+ if adapter_obj :
891+ adapters_to_apply .append ((adapter_obj , scale ))
892+ elif self .verbose :
893+ print (f"Llama.eval: Warning! LoRA '{ name } ' not found in registry. Skipping." , file = sys .stderr )
894+
895+ self ._ctx .apply_loras (adapters_to_apply )
896+ else :
897+ # Crucial Fallback: Wipe the graph clean if no LoRAs are requested.
898+ # This guarantees zero weight contamination between different users/slots in a multiplexed environment.
899+ self ._ctx .clear_loras ()
900+
901+ # Dynamic Control Vector (CVec) Injection
902+ if control_vector is not None :
903+ data = control_vector .get ("data" , [])
904+ il_start = control_vector .get ("layer_start" , 1 )
905+ il_end = control_vector .get ("layer_end" , self .n_layer ())
906+ n_embd = self .n_embd ()
907+
908+ self ._ctx .apply_cvec (data , n_embd , il_start , il_end )
909+ else :
910+ # Ensure the control vector is cleared for a clean state
911+ self ._ctx .clear_cvec ()
912+
855913 # Dynamic Batch Downgrade: Attempt to decode, reduce batch size if KV cache is fragmented
856914 current_batch_size = n_chunk
857915 success = False
@@ -1119,6 +1177,8 @@ def generate(
11191177 grammar : Optional [LlamaGrammar ] = None ,
11201178 grammar_lazy : bool = False ,
11211179 seed : Optional [int ] = None ,
1180+ active_loras : Optional [List [Dict [str , Union [str , float ]]]] = None ,
1181+ control_vector : Optional [Dict [str , Any ]] = None ,
11221182 ) -> Generator [int , Optional [Sequence [int ]], None ]:
11231183 """Create a generator of tokens from a prompt.
11241184
@@ -1164,6 +1224,14 @@ def generate(
11641224 grammar: Optional BNF-like grammar (GBNF) to constrain sampling syntax.
11651225 grammar_lazy: If True, activates grammar constraints only on specific trigger tokens.
11661226 seed: RNG seed for sampling. Overrides the instance seed.
1227+ active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation.
1228+ Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`)
1229+ and an optional "scale" key (float, defaults to 1.0).
1230+ Example: `[{"name": "role_A", "scale": 0.85}, {"name": "role_B", "scale": 0.5}]`.
1231+ control_vector: A dictionary containing Control Vector (CVec) data for representation engineering.
1232+ Must contain a "data" key with a flattened 1D list of floats.
1233+ Optionally accepts "layer_start" (int, defaults to 1) and "layer_end" (int, defaults to the model's total layer count).
1234+ Note: The length of the "data" list MUST be at least `n_embd * layer_end`, with zero-padding for any skipped early layers.
11671235
11681236 Yields:
11691237 The generated tokens.
@@ -1357,7 +1425,7 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array):
13571425 last_token = [tokens [- 1 ]]
13581426
13591427 # 1. Evaluate up to N-1
1360- self .eval (body_tokens )
1428+ self .eval (body_tokens , active_loras = active_loras , control_vector = control_vector )
13611429
13621430 # 2. Save the N-1 state snapshot
13631431 current_history = self ._input_ids [:self .n_tokens ].tolist ()
@@ -1367,10 +1435,10 @@ def adapter(token_data_array: llama_cpp.llama_token_data_array):
13671435 seq_id = 0
13681436 )
13691437 # 3. Evaluate the final token to refresh logits
1370- self .eval (last_token )
1438+ self .eval (last_token , active_loras = active_loras , control_vector = control_vector )
13711439 else :
13721440 # Standard evaluation or single-token generation step
1373- self .eval (tokens )
1441+ self .eval (tokens , active_loras = active_loras , control_vector = control_vector )
13741442
13751443 # Sample loop
13761444 while sample_idx < self .n_tokens :
@@ -1672,6 +1740,8 @@ def _create_completion(
16721740 grammar : Optional [LlamaGrammar ] = None ,
16731741 grammar_lazy : bool = False ,
16741742 seed : Optional [int ] = None ,
1743+ active_loras : Optional [List [Dict [str , Union [str , float ]]]] = None ,
1744+ control_vector : Optional [Dict [str , Any ]] = None ,
16751745 ) -> Union [
16761746 Iterator [CreateCompletionResponse ], Iterator [CreateCompletionStreamResponse ]
16771747 ]:
@@ -1851,6 +1921,8 @@ def _create_completion(
18511921 grammar = grammar ,
18521922 grammar_lazy = grammar_lazy ,
18531923 seed = seed if seed is not None else self ._seed ,
1924+ active_loras = active_loras ,
1925+ control_vector = control_vector ,
18541926 ):
18551927 if llama_cpp .llama_token_is_eog (self ._model .vocab , token ):
18561928 text = self .detokenize (completion_tokens , prev_tokens = prompt_tokens )
@@ -2303,6 +2375,8 @@ def create_completion(
23032375 logits_processor : Optional [LogitsProcessorList ] = None ,
23042376 grammar : Optional [LlamaGrammar ] = None ,
23052377 grammar_lazy : bool = False ,
2378+ active_loras : Optional [List [Dict [str , Union [str , float ]]]] = None ,
2379+ control_vector : Optional [Dict [str , Any ]] = None ,
23062380 ) -> Union [CreateCompletionResponse , Iterator [CreateCompletionStreamResponse ]]:
23072381 """Generate text from a prompt.
23082382
@@ -2347,6 +2421,14 @@ def create_completion(
23472421 logits_processor: A list of logits processors to use.
23482422 grammar: A grammar to use for constrained sampling.
23492423 grammar_lazy: If True, enables lazy evaluation.
2424+ active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation.
2425+ Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`)
2426+ and an optional "scale" key (float, defaults to 1.0).
2427+ Example: `[{"name": "role_A", "scale": 0.85}, {"name": "role_B", "scale": 0.5}]`.
2428+ control_vector: A dictionary containing Control Vector (CVec) data for representation engineering.
2429+ Must contain a "data" key with a flattened 1D list of floats.
2430+ Optionally accepts "layer_start" (int, defaults to 1) and "layer_end" (int, defaults to the model's total layer count).
2431+ Note: The length of the "data" list MUST be at least `n_embd * layer_end`, with zero-padding for any skipped early layers.
23502432
23512433 Raises:
23522434 ValueError: If the requested tokens exceed the context window.
@@ -2396,6 +2478,8 @@ def create_completion(
23962478 logits_processor = logits_processor ,
23972479 grammar = grammar ,
23982480 grammar_lazy = grammar_lazy ,
2481+ active_loras = active_loras ,
2482+ control_vector = control_vector ,
23992483 )
24002484 if stream :
24012485 chunks : Iterator [CreateCompletionStreamResponse ] = completion_or_chunks
@@ -2445,6 +2529,8 @@ def __call__(
24452529 logits_processor : Optional [LogitsProcessorList ] = None ,
24462530 grammar : Optional [LlamaGrammar ] = None ,
24472531 grammar_lazy : bool = False ,
2532+ active_loras : Optional [List [Dict [str , Union [str , float ]]]] = None ,
2533+ control_vector : Optional [Dict [str , Any ]] = None ,
24482534 ) -> Union [CreateCompletionResponse , Iterator [CreateCompletionStreamResponse ]]:
24492535 """Generate text from a prompt.
24502536
@@ -2489,6 +2575,14 @@ def __call__(
24892575 logits_processor: A list of logits processors to use.
24902576 grammar: A grammar to use for constrained sampling.
24912577 grammar_lazy: If True, enables lazy evaluation.
2578+ active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation.
2579+ Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`)
2580+ and an optional "scale" key (float, defaults to 1.0).
2581+ Example: `[{"name": "role_A", "scale": 0.85}, {"name": "role_B", "scale": 0.5}]`.
2582+ control_vector: A dictionary containing Control Vector (CVec) data for representation engineering.
2583+ Must contain a "data" key with a flattened 1D list of floats.
2584+ Optionally accepts "layer_start" (int, defaults to 1) and "layer_end" (int, defaults to the model's total layer count).
2585+ Note: The length of the "data" list MUST be at least `n_embd * layer_end`, with zero-padding for any skipped early layers.
24922586
24932587 Raises:
24942588 ValueError: If the requested tokens exceed the context window.
@@ -2538,6 +2632,8 @@ def __call__(
25382632 logits_processor = logits_processor ,
25392633 grammar = grammar ,
25402634 grammar_lazy = grammar_lazy ,
2635+ active_loras = active_loras ,
2636+ control_vector = control_vector ,
25412637 )
25422638
25432639 def create_chat_completion (
@@ -2583,6 +2679,8 @@ def create_chat_completion(
25832679 logits_processor : Optional [LogitsProcessorList ] = None ,
25842680 grammar : Optional [LlamaGrammar ] = None ,
25852681 grammar_lazy : bool = False ,
2682+ active_loras : Optional [List [Dict [str , Union [str , float ]]]] = None ,
2683+ control_vector : Optional [Dict [str , Any ]] = None ,
25862684 logprobs : Optional [bool ] = None ,
25872685 top_logprobs : Optional [int ] = None ,
25882686 ) -> Union [
@@ -2632,6 +2730,14 @@ def create_chat_completion(
26322730 logits_processor: A list of logits processors to use.
26332731 grammar: A grammar to use.
26342732 grammar_lazy: If True, enables lazy evaluation.
2733+ active_loras: A list of dictionaries specifying the LoRA adapters to dynamically apply during generation.
2734+ Each dictionary must contain a "name" key (matching a LoRA previously loaded into VRAM via `load_lora()`)
2735+ and an optional "scale" key (float, defaults to 1.0).
2736+ Example: `[{"name": "role_A", "scale": 0.85}, {"name": "role_B", "scale": 0.5}]`.
2737+ control_vector: A dictionary containing Control Vector (CVec) data for representation engineering.
2738+ Must contain a "data" key with a flattened 1D list of floats.
2739+ Optionally accepts "layer_start" (int, defaults to 1) and "layer_end" (int, defaults to the model's total layer count).
2740+ Note: The length of the "data" list MUST be at least `n_embd * layer_end`, with zero-padding for any skipped early layers.
26352741
26362742 Returns:
26372743 Generated chat completion or a stream of chat completion chunks.
@@ -2686,6 +2792,8 @@ def create_chat_completion(
26862792 logits_processor = logits_processor ,
26872793 grammar = grammar ,
26882794 grammar_lazy = grammar_lazy ,
2795+ active_loras = active_loras ,
2796+ control_vector = control_vector ,
26892797 )
26902798
26912799 def create_chat_completion_openai_v1 (
0 commit comments