
Commit c7e3efc

Merge pull request #12 from LLukas22/feat/streaming
Add streaming support
2 parents 19a4788 + 2735dc4, commit c7e3efc

10 files changed: 411 additions & 39 deletions

Cargo.toml

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 [package]
 name = "llm-rs"
-version = "0.2.5"
+version = "0.2.6"
 edition = "2021"

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

README.md

Lines changed: 15 additions & 1 deletion

@@ -23,12 +23,26 @@ model = AutoModel.from_pretrained("path/to/model.bin",model_type=KnownModels.Lla
 #generate
 print(model.generate("The meaning of life is"))
 ```
+
+### Streaming Text
+Text can be yielded from a generator via the `stream` function:
+```python
+from llm_rs import AutoModel, KnownModels
+
+#load the model
+model = AutoModel.from_pretrained("path/to/model.bin",model_type=KnownModels.Llama)
+
+#generate
+for token in model.stream("The meaning of life is"):
+    print(token)
+```
+
 ### Running GGML models from the Hugging Face Hub
 GGML converted models can be directly downloaded and run from the hub.
 ```python
 from llm_rs import AutoModel

-model = AutoModel.from_pretrained("LLukas22/mpt-7b-ggml",model_file="mpt-7b-q4_0-ggjt.bin")
+model = AutoModel.from_pretrained("rustformers/mpt-7b-ggml",model_file="mpt-7b-q4_0-ggjt.bin")
 ```
 If there are multiple models in a repo the `model_file` has to be specified.
 If you want to load repositories which were not created through this library, you have to specify the `model_type` parameter, as the metadata files needed to infer the architecture are missing.
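That last README sentence is the subtle case: a repo created outside this library has no metadata files, so the architecture cannot be inferred and must be passed explicitly. A minimal sketch of such a load; the repo id and file name are hypothetical placeholders:

```python
from llm_rs import AutoModel, KnownModels

model = AutoModel.from_pretrained(
    "someuser/some-ggml-repo",     # hypothetical repo without llm-rs metadata
    model_file="model-q4_0.bin",   # hypothetical file name within that repo
    model_type=KnownModels.Llama,  # required here, since it cannot be inferred
)
```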

llm_rs/base_model.py

Lines changed: 9 additions & 1 deletion

@@ -1,4 +1,4 @@
-from typing import Optional, Callable, List, Union
+from typing import Optional, Callable, List, Union, Generator
 from abc import ABC
 import os

@@ -36,6 +36,14 @@ def generate(self,prompt:str,
         Generates text from a prompt.
         """
         ...
+
+    def stream(self,prompt:str,
+               generation_config:Optional[GenerationConfig]=None,
+               ) -> Generator[str,None,None]:
+        """
+        Streams text from a prompt.
+        """
+        ...

     def tokenize(self,text:str) -> List[int]:
         """

llm_rs/langchain/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+from .langchain import RustformerLLM

llm_rs/langchain/langchain.py

Lines changed: 97 additions & 0 deletions

@@ -0,0 +1,97 @@
+try:
+    from langchain.llms.base import LLM
+except ImportError:
+    raise ImportError(
+        'To use the llm_rs.langchain module, please install llm-rs with the additional "langchain" dependencies via: pip install llm-rs[langchain]')
+
+from typing import Any, Dict, Optional, Sequence, Union, List
+import os
+
+from pydantic import root_validator
+from langchain.callbacks.manager import CallbackManagerForLLMRun
+
+from ..auto import AutoModel, KnownModels
+from ..config import GenerationConfig, SessionConfig
+from ..base_model import Model
+
+class RustformerLLM(LLM):
+    """
+    Langchain-Wrapper around a Rustformers model.
+    """
+
+    model: Optional[Model] = None  #: :meta private:
+
+    model_path_or_repo_id: Union[str,os.PathLike]
+    """The path to the model file or directory or the name of a Hugging Face Hub
+    model repo."""
+
+    model_type: Optional[KnownModels] = None
+    """The model type."""
+
+    model_file: Optional[str] = None
+    """The name of the model file in repo or directory."""
+
+    # session_config:SessionConfig=SessionConfig()
+    # """Session config for the model."""
+
+    # generation_config:GenerationConfig=GenerationConfig()
+    # """Generation config for the model."""
+
+    lora_paths:Optional[List[Union[str,os.PathLike]]]=None
+    """Paths to the lora files."""
+
+
+    @property
+    def _identifying_params(self) -> Dict[str, Any]:
+        """Get the identifying parameters."""
+        return {
+            'model_path_or_repo_id': self.model_path_or_repo_id,
+            'model_type': self.model_type,
+            'model_file': self.model_file,
+            # 'session_config': self.session_config,
+            # 'generation_config': self.generation_config,
+            'lora_paths': self.lora_paths,
+        }
+
+    @property
+    def _llm_type(self) -> str:
+        """Return type of llm."""
+        return 'rustformer'
+
+    @root_validator()
+    def validate_environment(cls, values: Dict) -> Dict:
+        """Validate and load model from a local file or remote repo."""
+        values['model'] = AutoModel.from_pretrained(
+            model_path_or_repo_id=values['model_path_or_repo_id'],
+            model_type=values['model_type'],
+            model_file=values['model_file'],
+            # session_config=values['session_config'],
+            lora_paths=values['lora_paths'],
+        )
+        return values
+
+    def _call(
+        self,
+        prompt: str,
+        stop: Optional[Sequence[str]] = None,
+        run_manager: Optional[CallbackManagerForLLMRun] = None,
+    ) -> str:
+        """Generate text from a prompt.
+
+        Args:
+            prompt: The prompt to generate text from.
+            stop: A list of sequences to stop generation when encountered.
+
+        Returns:
+            The generated text.
+        """
+        text = []
+        generation_config = GenerationConfig()
+
+        if stop:
+            generation_config.stop_words = list(stop)
+        for chunk in self.model.stream(prompt, generation_config=generation_config):
+            text.append(chunk)
+            if run_manager:
+                run_manager.on_llm_new_token(chunk, verbose=self.verbose)
+        return ''.join(text)
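`_call` shows the intended pattern: LangChain's `stop` sequences are copied into `GenerationConfig.stop_words`, and every streamed chunk is forwarded to the callback manager before the chunks are joined into the final string. A hedged usage sketch; `LLMChain` and `PromptTemplate` are assumed from the classic (2023-era) langchain API, and the repo id and model file are taken from the README example above:

```python
from langchain import LLMChain, PromptTemplate  # assumes the classic langchain import paths
from llm_rs.langchain import RustformerLLM

llm = RustformerLLM(
    model_path_or_repo_id="rustformers/mpt-7b-ggml",
    model_file="mpt-7b-q4_0-ggjt.bin",
)

template = PromptTemplate(input_variables=["question"], template="Q: {question}\nA:")
chain = LLMChain(llm=llm, prompt=template)
print(chain.run("What is the meaning of life?"))
```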

pyproject.toml

Lines changed: 4 additions & 0 deletions

@@ -37,5 +37,9 @@ convert=[
     "einops >= 0.6.1"
 ]

+langchain=[
+    "langchain"
+]
+
 [tool.maturin]
 features = ["pyo3/extension-module"]

src/configs.rs

Lines changed: 16 additions & 0 deletions

@@ -1,7 +1,9 @@
+use crate::stopwords::StopWordHandler;
 use llm::{InferenceParameters, InferenceSessionConfig, ModelKVMemoryType, TokenBias};
 use pyo3::prelude::*;

 #[pyclass]
+#[derive(Clone)]
 pub struct GenerationConfig {
     #[pyo3(get, set)]
     pub top_k: usize,
@@ -19,6 +21,7 @@ pub struct GenerationConfig {
     pub max_new_tokens: Option<usize>,
     #[pyo3(get, set)]
     pub stop_words: Option<Vec<String>>,
+    pub stop_word_handler: Option<StopWordHandler>,
 }

 impl Default for GenerationConfig {
@@ -32,6 +35,18 @@ impl Default for GenerationConfig {
             seed: 42,
             max_new_tokens: None,
             stop_words: None,
+            stop_word_handler: None,
+        }
+    }
+}
+
+impl GenerationConfig {
+    pub fn init_stop_words(&mut self, model: &dyn llm::Model) {
+        if self.stop_words.is_some() {
+            let stopwords = self.stop_words.clone().unwrap();
+            self.stop_word_handler = Some(StopWordHandler::new(model, &stopwords));
+        } else {
+            self.stop_word_handler = None;
         }
     }
 }
@@ -59,6 +74,7 @@ impl GenerationConfig {
             seed: seed.unwrap_or(42),
             max_new_tokens,
             stop_words,
+            stop_word_handler: None,
         }
     }
 }
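On the Rust side, `init_stop_words` compiles the user-supplied `stop_words` into a `StopWordHandler` bound to the loaded model (presumably tokenizing them against its vocabulary), while only the plain `stop_words` field stays visible from Python. A sketch of setting it directly; the path and stop strings are illustrative, and it is assumed that `generate` accepts a `generation_config` keyword the same way `stream` does:

```python
from llm_rs import AutoModel, KnownModels
from llm_rs.config import GenerationConfig

model = AutoModel.from_pretrained("path/to/model.bin", model_type=KnownModels.Llama)

config = GenerationConfig()
config.stop_words = ["\n\n", "User:"]  # generation halts when either sequence appears

print(model.generate("User: Hi!\nAssistant:", generation_config=config))
```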

src/lib.rs

Lines changed: 1 addition & 0 deletions

@@ -5,6 +5,7 @@ mod model_base;
 mod models;
 mod quantize;
 mod results;
+mod stopwords;

 #[pymodule]
 fn llm_rs(_py: Python, m: &PyModule) -> PyResult<()> {
