EricApgar · EricApgar · Mar 15, 2026 · Mar 7, 2026 · Mar 8, 2026 · Mar 8, 2026
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -0,0 +1,23 @@
+name: Tests
+
+on:
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+
+jobs:
+  test:
+    runs-on: self-hosted  # Requires a self-hosted runner with a GPU and model weights.
+                          # Register one at: Settings -> Actions -> Runners -> New self-hosted runner.
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install package
+        run: uv sync --extra dev --extra openai --extra microsoft
+
+      - name: Run tests
+        run: uv run pytest -v
+        env:
+          LLM_MODEL_CACHE: ${{ secrets.LLM_MODEL_CACHE }}
diff --git a/README.md b/README.md
@@ -47,7 +47,7 @@ uv sync
 
 ...with optional libraries:
 ```
-uv sync --extra <tag-1> <tag-2>
+uv sync --extra <tag-1> --extra <tag-2>
 ```
 
 ### ...with pip:

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "llm"  # The pip install <name>.
-version = "0.3.0"
+version = "0.4.0"
 description = "Library for easy use of LLMs."
 readme = "README.md"
 authors = [
@@ -15,6 +15,7 @@ dependencies = [
     "torchvision",
     "sentence-transformers>=5.2.2",
     "llm-conversation",
+    "pytest>=8.0",
 ]
 
 [project.optional-dependencies]
@@ -49,6 +50,9 @@ torch = { index = "pytorch-cu130" }
 torchvision = { index = "pytorch-cu130" }
 llm-conversation = { git = "https://github.com/EricApgar/llm-conversation", rev = "v0.2.0" }
 
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+
 [build-system]
 requires = ["uv_build>=0.9.7,<0.10.0"]
 build-backend = "uv_build"

diff --git a/src/llm/models/gpt_oss_20b.py b/src/llm/models/gpt_oss_20b.py
@@ -1,15 +1,6 @@
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from openai_harmony import (
-    Conversation as HarmonyConversation,
-    RenderConversationConfig,
-    load_harmony_encoding,
-    HarmonyEncodingName,
-    DeveloperContent,
-    ReasoningEffort,
-    SystemContent,
-    Message,
-    Role)
+import os
+
+from transformers import pipeline
 
 from llm.models.template import Template
 from llm_conversation import Conversation
@@ -21,7 +12,7 @@ def __init__(self, hf_token: str=None):
         super().__init__(hf_token=hf_token)
 
         self.name = 'openai/gpt-oss-20b'
-        self.tokenizer = None
+        self.model: pipeline = None
 
 
     def load(self,
@@ -31,168 +22,97 @@ def load(self,
         quantization: str=None,
         device: str=None):
 
+        if (not remote) and (not os.path.isdir(location)):
+            raise ValueError(f'Nonexistant location ({location}) - fix or set remote=True.')
+
         self.location = location
         self.remote = remote
         self.commit = commit
         self.quantization = quantization
 
         self._set_device(device=device)
 
-        self.model = AutoModelForCausalLM.from_pretrained(
-            pretrained_model_name_or_path=self.name,
+        model_kwargs = {
+            'cache_dir': self.location,
+            'local_files_only': not self.remote}
+
+        self.model = pipeline(
+            task='text-generation',
+            model=self.name,
+            dtype='auto',
+            device_map=self.device,
             token=self.hf_token,
-            cache_dir=self.location,
-            local_files_only=not self.remote,
             revision=self.commit,
-            low_cpu_mem_usage=True,
-            # quantization_config=quantization_config,
-            device_map=self.device,
-            trust_remote_code=True,  # self.remote, TODO
-            _attn_implementation='eager',
-            torch_dtype='auto')  # Might be obsolete. Change to "dtype"?
-
-        self.tokenizer = AutoTokenizer.from_pretrained(self.name)
-
-        if self.tokenizer.pad_token_id is None or self.tokenizer.pad_token_id == self.tokenizer.eos_token_id:
-            self.tokenizer.pad_token_id = 0  # use a dedicated ID that isn't EOS
+            # trust_remote_code=self.remote,
+            model_kwargs=model_kwargs)
 
         return
-
+    
 
     def ask(self,
         prompt: str | Conversation,
-        max_tokens: int=1024,
-        temperature: float=0.5,
-        reasoning_level: str='low',
-        repetition_penalty: float=1.12,
-        top_p: float=0.95):
-        '''
-        Call an LLM with a prompt and generate a response.
+        max_tokens: int=512,
+        temperature: float=0.9,
+        reasoning_level: str=None):
 
-        This model works best when the input is formatted into an
-        openai-harmony conversation structure, so all inputs are converted
-        into a generic Conversation structure (if not already one) and then
-        converted into the harmony structure.
-        '''
-
-        if not self.model:
-            raise ValueError('Must load model before using! (see model.load())')
-
-        encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
+        formatted_messages = self._format_prompt(prompt=prompt, reasoning_level=reasoning_level)
 
-        if isinstance(prompt, str):  # Create a structured conversation from 
-            convo = Conversation()
-            convo.add_response(role='user', text=prompt)
+        kwargs = {}
+        if temperature == 0:
+            kwargs['do_sample'] = False
         else:
-            convo = prompt
-
-        convo_harmony = self._to_harmony(conversation=convo, reasoning_level=reasoning_level)
-
-        render_cfg = RenderConversationConfig(auto_drop_analysis=True)
-        prefill_ids = encoding.render_conversation_for_completion(
-            convo_harmony,
-            Role.ASSISTANT,
-            config=render_cfg)
-        stop_token_ids = encoding.stop_tokens_for_assistant_actions()
+            kwargs['temperature'] = temperature
 
-        input_ids = torch.tensor([prefill_ids], device=self.model.device)
-        attention_mask = torch.ones_like(input_ids)
-
-        out = self.model.generate(
-            input_ids=input_ids,
+        model_output = self.model(
+            formatted_messages,
             max_new_tokens=max_tokens,
-            do_sample=True,
-            temperature=temperature,
-            top_p=top_p,
-            # min_p=min_p,
-            repetition_penalty=repetition_penalty,
-            eos_token_id=stop_token_ids,
-            # pad_token_id=self.tokenizer.eos_token_id,
-            attention_mask=attention_mask,
-            pad_token_id=self.tokenizer.pad_token_id)
-
-        generated_tokens = out[0, input_ids.shape[-1]:].tolist()
-
-        # NOTE: Translate tokens directly to output (Debugging Only)
-        text_tokens = self.tokenizer.batch_decode(
-            generated_tokens,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=False)
-
-        # I can patch the harmony token problem by assuming that the message is in the generated tokens, and just
-        # wiping out everything up until the first occurrence of ['', 'analysis', ].
-        generated_tokens = generated_tokens[get_good_token_start(token_list=text_tokens):]
-
-        # Transform tokens into the text equivalent (contains model reasoning and thinking).
-        full_response = encoding.parse_messages_from_completion_tokens(generated_tokens, role=Role.ASSISTANT)
+            **kwargs)
+
+        full_text_response = model_output[0]['generated_text'][-1]['content']
 
-        # Extract the actual response from the full set of generated text.
-        final_response = next(m for m in full_response if m.channel == "final")
-        text_response = final_response.content[0].text
+        if 'assistantfinal' in full_text_response:
+            text = full_text_response.split("assistantfinal", 1)[1].strip()
+        else:
+            raise ValueError(f'Mangled LLM output. Could not find expected end marker "assistantfinal" in generated text: {full_text_response}')
 
-        return text_response
+        return text
 
 
     @staticmethod
-    def _to_harmony(conversation: Conversation, reasoning_level: str) -> HarmonyConversation:
+    def _format_prompt(prompt: str | Conversation, reasoning_level: str=None) -> list[dict]:
         '''
-        Build a Harmony-Conversation object from a Generic Conversation object.
+        Convert a prompt string or Conversation into the list of role/content
+        message dicts passed to the pipeline. Reasoning level, overall prompt,
+        and context are merged into a single leading system message.
         '''
 
-        if reasoning_level == 'low':
-            reasoning_level = ReasoningEffort.LOW
-        elif reasoning_level == 'medium':
-            reasoning_level = ReasoningEffort.MEDIUM
-        elif reasoning_level == 'high':
-            reasoning_level = ReasoningEffort.HIGH
-
-        # System Details about the Overall Conversation.
-        system_msg = Message.from_role_and_content(
-            Role.SYSTEM,
-            SystemContent.new().with_reasoning_effort(reasoning_level))
-
-        developer_msg = Message.from_role_and_content(
-            Role.DEVELOPER,
-            DeveloperContent.new().with_instructions(conversation.overall_prompt))
-
-        msgs = [system_msg, developer_msg]
-
-        # Background Context Information.
-        if conversation.context:
-            context_block = '\n'.join([
-                "BACKGROUND CONTEXT (not part of the dialogue):",
-                '\n'.join(conversation.context),
-                "END BACKGROUND CONTEXT"])
-
-            msgs.append(Message.from_role_and_content(Role.USER, context_block))
-
-        # Conversation history between user and AI.
-        for turn in conversation.history:
-            if turn.role == "user":
-                msgs.append(Message.from_role_and_content(Role.USER, turn.text))
-            else:
-                msgs.append(Message.from_role_and_content(Role.ASSISTANT, turn.text))
+        if isinstance(prompt, str):
+            convo = Conversation()
+            convo.add_response(role='user', text=prompt)
+        else:
+            convo = prompt
 
-        harmony_convo = HarmonyConversation.from_messages(msgs)
+        system_pieces = []
+        formatted_messages = []
 
-        return harmony_convo
+        if reasoning_level:
+            system_pieces.append(f'Reasoning level: {reasoning_level}.')
 
+        if convo.overall_prompt:
+            system_pieces.append(convo.overall_prompt)
 
-def get_good_token_start(token_list: list[str]):
-    '''
-    This is a patch for handling bad generated tokens which break
-    the openai-harmony prompt formatter.
+        if convo.context:
+            for context in convo.context:
+                system_pieces.append(context)
 
-    It finds the start of rational thought in the generated output, skipping
-    over the nonsense content that's generated.
-    '''
-    for i in range(len(token_list) - 1):
-        if token_list[i] == "" and token_list[i + 1] == "analysis":
-            return i
+        if system_pieces:  # Merge background context pieces.
+            formatted_messages.append({'role': 'system', 'content': ' '.join(system_pieces)})
 
-    raise ValueError("Could not find ['', 'analysis'] in the list")
+        if convo.history:
+            for response in convo.history:
+                formatted_messages.append({'role': response.role, 'content': response.text})
 
-    return
+        return formatted_messages
 
 
 if __name__ == '__main__':