Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
name: Tests

on:
pull_request:
branches: [main]
push:
branches: [main]

jobs:
test:
runs-on: self-hosted # Requires a self-hosted runner with a GPU and model weights.
# Register one at: Settings -> Actions -> Runners -> New self-hosted runner.

steps:
- uses: actions/checkout@v4

- name: Install package
run: uv sync --extra dev --extra openai --extra microsoft

- name: Run tests
run: uv run pytest -v
env:
LLM_MODEL_CACHE: ${{ secrets.LLM_MODEL_CACHE }}
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ uv sync

...with optional libraries:
```
uv sync --extra <tag-1> <tag-2>
uv sync --extra <tag-1> --extra <tag-2>
```

### ...with pip:
Expand Down
6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "llm" # The pip install <name>.
version = "0.3.0"
version = "0.4.0"
description = "Library for easy use of LLMs."
readme = "README.md"
authors = [
Expand All @@ -15,6 +15,7 @@ dependencies = [
"torchvision",
"sentence-transformers>=5.2.2",
"llm-conversation",
"pytest>=8.0",
]

[project.optional-dependencies]
Expand Down Expand Up @@ -49,6 +50,9 @@ torch = { index = "pytorch-cu130" }
torchvision = { index = "pytorch-cu130" }
llm-conversation = { git = "https://github.com/EricApgar/llm-conversation", rev = "v0.2.0" }

[tool.pytest.ini_options]
testpaths = ["tests"]

[build-system]
requires = ["uv_build>=0.9.7,<0.10.0"]
build-backend = "uv_build"
Expand Down
202 changes: 61 additions & 141 deletions src/llm/models/gpt_oss_20b.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,6 @@
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from openai_harmony import (
Conversation as HarmonyConversation,
RenderConversationConfig,
load_harmony_encoding,
HarmonyEncodingName,
DeveloperContent,
ReasoningEffort,
SystemContent,
Message,
Role)
import os

from transformers import pipeline

from llm.models.template import Template
from llm_conversation import Conversation
Expand All @@ -21,7 +12,7 @@ def __init__(self, hf_token: str=None):
super().__init__(hf_token=hf_token)

self.name = 'openai/gpt-oss-20b'
self.tokenizer = None
self.model: pipeline = None


def load(self,
Expand All @@ -31,168 +22,97 @@ def load(self,
quantization: str=None,
device: str=None):

if (not remote) and (not os.path.isdir(location)):
raise ValueError(f'Nonexistant location ({location}) - fix or set remote=True.')

self.location = location
self.remote = remote
self.commit = commit
self.quantization = quantization

self._set_device(device=device)

self.model = AutoModelForCausalLM.from_pretrained(
pretrained_model_name_or_path=self.name,
model_kwargs = {
'cache_dir': self.location,
'local_files_only': not self.remote}

self.model = pipeline(
task='text-generation',
model=self.name,
dtype='auto',
device_map=self.device,
token=self.hf_token,
cache_dir=self.location,
local_files_only=not self.remote,
revision=self.commit,
low_cpu_mem_usage=True,
# quantization_config=quantization_config,
device_map=self.device,
trust_remote_code=True, # self.remote, TODO
_attn_implementation='eager',
torch_dtype='auto') # Might be obsolete. Change to "dtype"?

self.tokenizer = AutoTokenizer.from_pretrained(self.name)

if self.tokenizer.pad_token_id is None or self.tokenizer.pad_token_id == self.tokenizer.eos_token_id:
self.tokenizer.pad_token_id = 0 # use a dedicated ID that isn't EOS
# trust_remote_code=self.remote,
model_kwargs=model_kwargs)

return


def ask(self,
prompt: str | Conversation,
max_tokens: int=1024,
temperature: float=0.5,
reasoning_level: str='low',
repetition_penalty: float=1.12,
top_p: float=0.95):
'''
Call an LLM with a prompt and generate a response.
max_tokens: int=512,
temperature: float=0.9,
reasoning_level: str=None):

This model works best when the input is formatted into an
openai-harmony conversation structure, so all inputs are converted
into a generic Conversation structure (if not already one) and then
converted into the harmony structure.
'''

if not self.model:
raise ValueError('Must load model before using! (see model.load())')

encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
formatted_messages = self._format_prompt(prompt=prompt, reasoning_level=reasoning_level)

if isinstance(prompt, str): # Create a structured conversation from
convo = Conversation()
convo.add_response(role='user', text=prompt)
kwargs = {}
if temperature == 0:
kwargs['do_sample'] = False
else:
convo = prompt

convo_harmony = self._to_harmony(conversation=convo, reasoning_level=reasoning_level)

render_cfg = RenderConversationConfig(auto_drop_analysis=True)
prefill_ids = encoding.render_conversation_for_completion(
convo_harmony,
Role.ASSISTANT,
config=render_cfg)
stop_token_ids = encoding.stop_tokens_for_assistant_actions()
kwargs['temperature'] = temperature

input_ids = torch.tensor([prefill_ids], device=self.model.device)
attention_mask = torch.ones_like(input_ids)

out = self.model.generate(
input_ids=input_ids,
model_output = self.model(
formatted_messages,
max_new_tokens=max_tokens,
do_sample=True,
temperature=temperature,
top_p=top_p,
# min_p=min_p,
repetition_penalty=repetition_penalty,
eos_token_id=stop_token_ids,
# pad_token_id=self.tokenizer.eos_token_id,
attention_mask=attention_mask,
pad_token_id=self.tokenizer.pad_token_id)

generated_tokens = out[0, input_ids.shape[-1]:].tolist()

# NOTE: Translate tokens directly to output (Debugging Only)
text_tokens = self.tokenizer.batch_decode(
generated_tokens,
skip_special_tokens=True,
clean_up_tokenization_spaces=False)

# I can patch the harmony token problem by assuming that the message is in the generated tokens, and just
# wiping out everything up until the first occurrence of ['', 'analysis', ].
generated_tokens = generated_tokens[get_good_token_start(token_list=text_tokens):]

# Transform tokens into the text equivalent (contains model reasoning and thinking).
full_response = encoding.parse_messages_from_completion_tokens(generated_tokens, role=Role.ASSISTANT)
**kwargs)

full_text_response = model_output[0]['generated_text'][-1]['content']

# Extract the actual response from the full set of generated text.
final_response = next(m for m in full_response if m.channel == "final")
text_response = final_response.content[0].text
if 'assistantfinal' in full_text_response:
text = full_text_response.split("assistantfinal", 1)[1].strip()
else:
raise ValueError(f'Mangled LLM output. Could not find expected end marker "assistantfinal" in generated text: {full_text_response}')

return text_response
return text


@staticmethod
def _to_harmony(conversation: Conversation, reasoning_level: str) -> HarmonyConversation:
def _format_prompt(prompt: str | Conversation, reasoning_level: str=None) -> list[dict]:
'''
Build a Harmony-Conversation object from a Generic Conversation object.
Structure the input convo and images into the expected format
to get a good clean LLM response. Embed it and prepare for LLM
token generation.
'''

if reasoning_level == 'low':
reasoning_level = ReasoningEffort.LOW
elif reasoning_level == 'medium':
reasoning_level = ReasoningEffort.MEDIUM
elif reasoning_level == 'high':
reasoning_level = ReasoningEffort.HIGH

# System Details about the Overall Conversation.
system_msg = Message.from_role_and_content(
Role.SYSTEM,
SystemContent.new().with_reasoning_effort(reasoning_level))

developer_msg = Message.from_role_and_content(
Role.DEVELOPER,
DeveloperContent.new().with_instructions(conversation.overall_prompt))

msgs = [system_msg, developer_msg]

# Background Context Information.
if conversation.context:
context_block = '\n'.join([
"BACKGROUND CONTEXT (not part of the dialogue):",
'\n'.join(conversation.context),
"END BACKGROUND CONTEXT"])

msgs.append(Message.from_role_and_content(Role.USER, context_block))

# Conversation history between user and AI.
for turn in conversation.history:
if turn.role == "user":
msgs.append(Message.from_role_and_content(Role.USER, turn.text))
else:
msgs.append(Message.from_role_and_content(Role.ASSISTANT, turn.text))
if isinstance(prompt, str):
convo = Conversation()
convo.add_response(role='user', text=prompt)
else:
convo = prompt

harmony_convo = HarmonyConversation.from_messages(msgs)
system_pieces = []
formatted_messages = []

return harmony_convo
if reasoning_level:
system_pieces.append(f'Reasoning level: {reasoning_level}.')

if convo.overall_prompt:
system_pieces.append(convo.overall_prompt)

def get_good_token_start(token_list: list[str]):
'''
This is a patch for handling bad generated tokens which break
the openai-harmony prompt formatter.
if convo.context:
for context in convo.context:
system_pieces.append(context)

It finds the start of rational thought in the generated output, skipping
over the nonsense content that's generated.
'''
for i in range(len(token_list) - 1):
if token_list[i] == "" and token_list[i + 1] == "analysis":
return i
if system_pieces: # Merge background context pieces.
formatted_messages.append({'role': 'system', 'content': ' '.join(system_pieces)})

raise ValueError("Could not find ['', 'analysis'] in the list")
if convo.history:
for response in convo.history:
formatted_messages.append({'role': response.role, 'content': response.text})

return
return formatted_messages


if __name__ == '__main__':
Expand Down
Loading
Loading