From ff3202968fe8db51b690d421dcd4abc941eab1d0 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Mon, 8 Jun 2026 20:52:09 +0000 Subject: [PATCH 01/16] Runner: Moved model invoking logic to runner.py --- benchtools/runner.py | 136 +++++++++++++++++++++++++++++++++++++++---- benchtools/task.py | 122 +------------------------------------- 2 files changed, 128 insertions(+), 130 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index c8a3c64..e42260e 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -1,8 +1,12 @@ # module to create and run benchmarks -import yaml import os +import json +import yaml +import boto3 import pandas as pd from pathlib import Path +from ollama import chat, ChatResponse, Client + # possibly resurected for batch runs? class BenchRunner(): @@ -27,7 +31,8 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None): self.model = model api_default = {'ollama_api': "http://localhost:11434", 'openai':"https://api.openai.com/v1", - 'ollama':""} + 'ollama':"", + 'bedrock': ""} if api: self.api = api else: @@ -35,6 +40,125 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None): def __str__(self): return f'{self.model} via {self.runner_type}' + + def run(self, prompt, format): + ''' + Run method of a runner takes a prompt and a format and then finds the correct api call that matches the runner requested by the user. Runs the LLM call and returns the LLM response + ''' + error = None + response = '' + try: + match self.runner_type: + case "ollama": + completion: ChatResponse = chat( + model=self.model, + format = format, + messages=[ + { + 'role': 'user', + 'content':prompt, + }, + ]) + response = completion.message.content + + + case "ollama_api": + client = Client( + host=self.api , + ) + completion = client.chat( + self.model, + format = format, + messages=[ + { + "role": "user", + "content": prompt, + }, + ], + ) + response = completion["message"]["content"] + + + case "openai": + client = OpenAI( + base_url=self.api, + ) + chat_completion = client.chat.completions.create( + model=self.model, + messages=[ + { + "role": "user", + "content": prompt, + } + ], + ) + response = chat_completion.choices[0].message.content + + case "bedrock": + bedrock_client = boto3.client('bedrock-runtime') + # Bedrock has multiple foundational models that will each differ in request parameters and response fields we included cases for a couple of them + # for available foundational models and their inferance parameters follow + # https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html + # Catch the model family first + model_fam = None + if self.model.startswith("meta"): model_fam = "llama" + elif self.model.startswith("google"): model_fam = "gemma" + match model_fam: + case "llama": + # Embed the prompt in Llama 3's instruction format. + formatted_prompt = f""" +<|begin_of_text|><|start_header_id|>user<|end_header_id|> +{prompt} +<|eot_id|> +<|start_header_id|>assistant<|end_header_id|> +""" + # Format the request payload using the model's native structure. + request = { + "prompt": formatted_prompt, + # "max_gen_len": 512, + # "temperature": 0.5, + } + # Convert the native request to JSON. + request = json.dumps(request) + completeion = bedrock_client.invoke_model( + modelId = self.model, + body = request, + accept="application/json" # ??? + ) + # Decode the response body. + response = json.loads(completeion["body"].read()) + response = response["generation"] + case "gemma": + # Format the request payload using the model's native structure. + request = { + 'messages': [ + { + 'role': 'user', + 'content': prompt + } + ] + } + # Convert the native request to JSON. + request = json.dumps(request) + completeion = bedrock_client.invoke_model( + modelId = self.model, + body = request, + accept="application/json" # ??? + ) + # Decode the response body. + response = json.loads(completeion['body'].read()) + response = response['choices'][0]['message']['content'] + case _: + raise NotImplementedError + + case _: + print(f"Runner type {self.runner_type} not supported") + return None + except Exception as e: + error = e + return (json.dump(response), error) + + class BenchRunnerList(): @@ -87,11 +211,3 @@ def from_file(cls,file_path): runner_list = [BenchRunner(**runner_info)] return cls(runner_list) - - - - - - - - diff --git a/benchtools/task.py b/benchtools/task.py index e420b2d..4f0f0cb 100644 --- a/benchtools/task.py +++ b/benchtools/task.py @@ -3,10 +3,8 @@ import os import yaml import json -import boto3 import pandas as pd import itertools -from ollama import chat, ChatResponse, Client from .logger import init_log_folder, log_interaction from pathlib import PurePath from datasets import load_dataset @@ -22,16 +20,6 @@ prompt_id_fx = {'concatenator_id_generator':concatenator_id_generator, 'selector_id_generator':selector_id_generator} -class UnMatchedModel(Exception): - """ - Exception raised for a bedrock model that isn't accounted for in the match statement - Follow https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html for a list of available models on bedrock and their inferance parameters - """ - def __init__(self, model): - self.model = model - message = f"Cannot call the model ${attempted_withdrawal} using aws Bedrock. Please fetch the correct inferance parameters for it and add it in a PR to BenchTools." - super().__init__(message) # Call the base class constructor - class Task: """ @@ -496,114 +484,8 @@ def run(self, runner=BenchRunner(), log_dir='logs', for (prompt_id, prompt),values in zip(id_prompt_list,self.variant_values): - error = None - response = '' - try: - match runner.runner_type: - case "ollama": - completion: ChatResponse = chat( - model=runner.model, - format = self.FormatClass.model_json_schema(), - messages=[ - { - 'role': 'user', - 'content':prompt, - }, - ]) - # print("response: " + response.message.content) - response = completion.message.content - - - case "ollama_api": - client = Client( - host=runner.api , - ) - completion = client.chat( - runner.model, - format = self.FormatClass.model_json_schema(), - messages=[ - { - "role": "user", - "content": prompt, - }, - ], - ) - response = completion["message"]["content"] - - - case "openai": - client = OpenAI( - base_url=runner.api, - ) - chat_completion = client.chat.completions.create( - model=runner.model, - messages=[ - { - "role": "user", - "content": prompt, - } - ], - ) - response = chat_completion.choices[0].message.content - - case "bedrock": - bedrock_client = boto3.client('bedrock-runtime') - # Bedrock has multiple foundational models that will each differ in request parameters and response fields we included cases for a couple of them - # for available foundational models and their inferance parameters follow - # https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html - # Catch the model family first - model_fam = None - if runner.model.startswith("meta"): model_fam = "llama" - elif runner.model.startswith("google"): model_fam = "gemma" - match model_fam: - case "llama": - # Embed the prompt in Llama 3's instruction format. - formatted_prompt = f""" -<|begin_of_text|><|start_header_id|>user<|end_header_id|> -{prompt} -<|eot_id|> -<|start_header_id|>assistant<|end_header_id|> -""" - # Format the request payload using the model's native structure. - request = { - "prompt": formatted_prompt, - # "max_gen_len": 512, - # "temperature": 0.5, - } - # Convert the native request to JSON. - request = json.dumps(request) - completeion = bedrock_client.invoke_model( - modelId = runner.model, - body = request - ) - # Decode the response body. - response = json.loads(completeion["body"].read()) - response = response["generation"] - case "gemma": - completeion = bedrock_client.invoke_model( - modelId = runner.model, - body = json.dumps( - { - 'messages': [ - { - 'role': 'user', - 'content': prompt - } - ] - } - ) - ) - # Decode the response body. - response = json.loads(completeion['body'].read()) - response = response['choices'][0]['message']['content'] - case _: - raise UnMatchedModel(runner.model) - - case _: - print(f"Runner type {runner.runner_type} not supported") - return None - except Exception as e: - error = e + response, error = runner.run(prompt, self.FormatClass.model_json_schema()) + if score: score_val = self.scoring_function(response, self.reference[prompt_id]) From 01b8962b7486f874e8badf80c33482992955b363 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Mon, 8 Jun 2026 21:33:22 +0000 Subject: [PATCH 02/16] fix --- benchtools/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index e42260e..7fb5208 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -156,7 +156,7 @@ def run(self, prompt, format): return None except Exception as e: error = e - return (json.dump(response), error) + return response, error From d74acad1221525d61af2b3eb81709277784f7d82 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Tue, 9 Jun 2026 01:35:41 +0000 Subject: [PATCH 03/16] Task: don't score if there was an error --- benchtools/task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchtools/task.py b/benchtools/task.py index 4f0f0cb..0e96d5a 100644 --- a/benchtools/task.py +++ b/benchtools/task.py @@ -486,7 +486,7 @@ def run(self, runner=BenchRunner(), log_dir='logs', response, error = runner.run(prompt, self.FormatClass.model_json_schema()) - if score: + if not error and score: score_val = self.scoring_function(response, self.reference[prompt_id]) else: From a3d71760022315633750d6ec154901de86c1ff60 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Tue, 9 Jun 2026 01:38:35 +0000 Subject: [PATCH 04/16] Runner: Switching bedrock API from invoke which is low-level to converse which is high-level and has more potential --- benchtools/runner.py | 78 ++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 54 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index 7fb5208..b849edc 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -95,61 +95,31 @@ def run(self, prompt, format): response = chat_completion.choices[0].message.content case "bedrock": - bedrock_client = boto3.client('bedrock-runtime') - # Bedrock has multiple foundational models that will each differ in request parameters and response fields we included cases for a couple of them - # for available foundational models and their inferance parameters follow - # https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html - # Catch the model family first - model_fam = None - if self.model.startswith("meta"): model_fam = "llama" - elif self.model.startswith("google"): model_fam = "gemma" - match model_fam: - case "llama": - # Embed the prompt in Llama 3's instruction format. - formatted_prompt = f""" -<|begin_of_text|><|start_header_id|>user<|end_header_id|> -{prompt} -<|eot_id|> -<|start_header_id|>assistant<|end_header_id|> -""" - # Format the request payload using the model's native structure. - request = { - "prompt": formatted_prompt, - # "max_gen_len": 512, - # "temperature": 0.5, - } - # Convert the native request to JSON. - request = json.dumps(request) - completeion = bedrock_client.invoke_model( - modelId = self.model, - body = request, - accept="application/json" # ??? - ) - # Decode the response body. - response = json.loads(completeion["body"].read()) - response = response["generation"] - case "gemma": - # Format the request payload using the model's native structure. - request = { - 'messages': [ - { + client = boto3.client('bedrock-runtime', region_name='us-east-1') + try: + response = client.converse( + modelId=self.model, + messages=[ + { 'role': 'user', - 'content': prompt - } - ] - } - # Convert the native request to JSON. - request = json.dumps(request) - completeion = bedrock_client.invoke_model( - modelId = self.model, - body = request, - accept="application/json" # ??? - ) - # Decode the response body. - response = json.loads(completeion['body'].read()) - response = response['choices'][0]['message']['content'] - case _: - raise NotImplementedError + 'content': [{'text': prompt}] + } + ] + ) + # Catch the model family + model_fam = None + if self.model.startswith("meta") or self.model.startswith("us.meta"): model_fam = "llama" + elif self.model.startswith("google"): model_fam = "gemma" + elif self.model.startswith("nova") or self.model.startswith("us.nova"): model_fam = "nova" + match model_fam: + case "llama" |"nova": + response = response['output']['message']['content'][0]['text'] + case "gemma" | "_": + response = response['output']['message']['content']['text'] + + except Exception as e: + error = e + print(f"bedrock converse API failed with model {self.model}.\n{e}") case _: print(f"Runner type {self.runner_type} not supported") From 4acd89139bba065326eec86acc86fdded8e670b8 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Tue, 9 Jun 2026 17:43:57 +0000 Subject: [PATCH 05/16] runner: rename model_fam --- benchtools/runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index b849edc..6a4eb4d 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -108,11 +108,11 @@ def run(self, prompt, format): ) # Catch the model family model_fam = None - if self.model.startswith("meta") or self.model.startswith("us.meta"): model_fam = "llama" + if self.model.startswith("meta") or self.model.startswith("us.meta"): model_fam = "meta" elif self.model.startswith("google"): model_fam = "gemma" elif self.model.startswith("nova") or self.model.startswith("us.nova"): model_fam = "nova" match model_fam: - case "llama" |"nova": + case "meta" |"nova": response = response['output']['message']['content'][0]['text'] case "gemma" | "_": response = response['output']['message']['content']['text'] From c1a81a1ff792bcd959c1723714feee57a99ffc77 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 10 Jun 2026 06:26:45 +0000 Subject: [PATCH 06/16] Runner: adding inferance parameters to runner's attributes --- benchtools/runner.py | 42 +++++++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index 6a4eb4d..25d0910 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -14,7 +14,7 @@ class BenchRunner(): A BenchRunner holds information about how a task is going to be run. ''' - def __init__(self, runner_type='ollama', model='gemma3:1b', api=None): + def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, temperature=None, max_tokens=None, top_p=None, stop_sequence=None): ''' The constructor for BenchRunner will have default values for all attributes to have a full default runner ready to be used for running any task. P.S. Requires Ollama to be installed and running on your machine. @@ -25,6 +25,14 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None): The name of the LLM to use for running the tasks. Default is 'gemma3'. P.S. Will need to have the model downloaded locally if using ollama api: str The URL of the API to use for accessing an LLM. If None, the default API will be http://localhost:11434 as this is used by ollama by default + temperature: float + Controls randomness in generation (higher = more random) + max_tokens: int + Maximum number of tokens to generate + top_p: float + Cumulative probability threshold for nucleus sampling + stop_sequence: list + Stop sequences that will halt generation ''' self.runner_type = runner_type @@ -38,6 +46,13 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None): else: self.api = api_default[runner_type] + self.inference_parameters={} + if temperature: self.inference_parameters.update({"tempetature": temperature}) + if top_p: self.inference_parameters.update({"top_p": top_p}) + if max_tokens: self.inference_parameters.update({"max_tokens": max_tokens}) + if stop_sequence: self.inference_parameters.update({"stop": temperatstop_sequenceure}) + + def __str__(self): return f'{self.model} via {self.runner_type}' @@ -54,11 +69,13 @@ def run(self, prompt, format): model=self.model, format = format, messages=[ - { - 'role': 'user', - 'content':prompt, - }, - ]) + { + 'role': 'user', + 'content':prompt, + }, + ], + options=self.inference_parameters + ) response = completion.message.content @@ -75,6 +92,7 @@ def run(self, prompt, format): "content": prompt, }, ], + options=self.inference_parameters ) response = completion["message"]["content"] @@ -95,6 +113,13 @@ def run(self, prompt, format): response = chat_completion.choices[0].message.content case "bedrock": + config={} + if self.inference_parameters: + if "tempetature" in self.inference_parameters: config.update({"temperature": self.inference_parameters["tempetature"]}) + if "top_p" in self.inference_parameters: config.update({"topP": self.inference_parameters["top_p"]}) + if "max_tokens" in self.inference_parameters: config.update({"maxTokens": self.inference_parameters["max_tokens"]}) + if "stop" in self.inference_parameters: config.update({"stopSequences": self.inference_parameters["stop"]}) + client = boto3.client('bedrock-runtime', region_name='us-east-1') try: response = client.converse( @@ -104,7 +129,10 @@ def run(self, prompt, format): 'role': 'user', 'content': [{'text': prompt}] } - ] + ], + inferenceConfig=config, + # additionalModelRequestFields{}, # For model-specific inference params + # additionalModelResponseFieldPaths[], # For model-specific return fields ) # Catch the model family model_fam = None From a1ca3a094d6ceb2f534c4e8364556f692c22b12f Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 10 Jun 2026 15:46:04 +0000 Subject: [PATCH 07/16] typo --- benchtools/runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index 25d0910..e0a002e 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -47,7 +47,7 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, temperatur self.api = api_default[runner_type] self.inference_parameters={} - if temperature: self.inference_parameters.update({"tempetature": temperature}) + if temperature: self.inference_parameters.update({"temperature": temperature}) if top_p: self.inference_parameters.update({"top_p": top_p}) if max_tokens: self.inference_parameters.update({"max_tokens": max_tokens}) if stop_sequence: self.inference_parameters.update({"stop": temperatstop_sequenceure}) @@ -115,7 +115,7 @@ def run(self, prompt, format): case "bedrock": config={} if self.inference_parameters: - if "tempetature" in self.inference_parameters: config.update({"temperature": self.inference_parameters["tempetature"]}) + if "temperature" in self.inference_parameters: config.update({"temperature": self.inference_parameters["temperature"]}) if "top_p" in self.inference_parameters: config.update({"topP": self.inference_parameters["top_p"]}) if "max_tokens" in self.inference_parameters: config.update({"maxTokens": self.inference_parameters["max_tokens"]}) if "stop" in self.inference_parameters: config.update({"stopSequences": self.inference_parameters["stop"]}) From 8f9a7bdb8fc0f865bcf4ce2cd413bd4e851016cf Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 10 Jun 2026 16:00:49 +0000 Subject: [PATCH 08/16] ollama uses num_predict not max_tokens --- benchtools/runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index e0a002e..f14f9ab 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -49,7 +49,7 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, temperatur self.inference_parameters={} if temperature: self.inference_parameters.update({"temperature": temperature}) if top_p: self.inference_parameters.update({"top_p": top_p}) - if max_tokens: self.inference_parameters.update({"max_tokens": max_tokens}) + if max_tokens: self.inference_parameters.update({"num_predict": max_tokens}) if stop_sequence: self.inference_parameters.update({"stop": temperatstop_sequenceure}) @@ -117,7 +117,7 @@ def run(self, prompt, format): if self.inference_parameters: if "temperature" in self.inference_parameters: config.update({"temperature": self.inference_parameters["temperature"]}) if "top_p" in self.inference_parameters: config.update({"topP": self.inference_parameters["top_p"]}) - if "max_tokens" in self.inference_parameters: config.update({"maxTokens": self.inference_parameters["max_tokens"]}) + if "num_predict" in self.inference_parameters: config.update({"maxTokens": self.inference_parameters["num_predict"]}) if "stop" in self.inference_parameters: config.update({"stopSequences": self.inference_parameters["stop"]}) client = boto3.client('bedrock-runtime', region_name='us-east-1') From 8d6bbb4dcdf838e1755de7c53c4522cef08a6b28 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 10 Jun 2026 16:09:13 +0000 Subject: [PATCH 09/16] typo --- benchtools/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index f14f9ab..0b7ff83 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -50,7 +50,7 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, temperatur if temperature: self.inference_parameters.update({"temperature": temperature}) if top_p: self.inference_parameters.update({"top_p": top_p}) if max_tokens: self.inference_parameters.update({"num_predict": max_tokens}) - if stop_sequence: self.inference_parameters.update({"stop": temperatstop_sequenceure}) + if stop_sequence: self.inference_parameters.update({"stop": stop_sequence}) def __str__(self): From 0ff8408b813635a5c337069f7b62e05e1f107a98 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 10 Jun 2026 18:32:57 +0000 Subject: [PATCH 10/16] Runner: model_params in dictionary --- benchtools/runner.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/benchtools/runner.py b/benchtools/runner.py index 0b7ff83..35d8fdc 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -14,7 +14,7 @@ class BenchRunner(): A BenchRunner holds information about how a task is going to be run. ''' - def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, temperature=None, max_tokens=None, top_p=None, stop_sequence=None): + def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, model_params=None): ''' The constructor for BenchRunner will have default values for all attributes to have a full default runner ready to be used for running any task. P.S. Requires Ollama to be installed and running on your machine. @@ -25,14 +25,16 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, temperatur The name of the LLM to use for running the tasks. Default is 'gemma3'. P.S. Will need to have the model downloaded locally if using ollama api: str The URL of the API to use for accessing an LLM. If None, the default API will be http://localhost:11434 as this is used by ollama by default - temperature: float - Controls randomness in generation (higher = more random) - max_tokens: int - Maximum number of tokens to generate - top_p: float - Cumulative probability threshold for nucleus sampling - stop_sequence: list - Stop sequences that will halt generation + model_params: dict + A dictionary with inference parameters to be used for the model generation: + temperature: float + Controls randomness in generation (higher = more random) + max_tokens: int + Maximum number of tokens to generate + top_p: float + Cumulative probability threshold for nucleus sampling + stop_sequence: list + Stop sequences that will halt generation ''' self.runner_type = runner_type @@ -47,10 +49,11 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, temperatur self.api = api_default[runner_type] self.inference_parameters={} - if temperature: self.inference_parameters.update({"temperature": temperature}) - if top_p: self.inference_parameters.update({"top_p": top_p}) - if max_tokens: self.inference_parameters.update({"num_predict": max_tokens}) - if stop_sequence: self.inference_parameters.update({"stop": stop_sequence}) + if model_params: + if 'temperature' in model_params: self.inference_parameters.update({"temperature": model_params["temperature"]}) + if 'top_p' in model_params: self.inference_parameters.update({"top_p": model_params["top_p"]}) + if 'max_tokens' in model_params: self.inference_parameters.update({"num_predict": model_params["max_tokens"]}) + if 'stop_sequence' in model_params: self.inference_parameters.update({"stop": model_params["stop_sequence"]}) def __str__(self): From baafa670ac4667fde6bcec3c9435090c87165932 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 10 Jun 2026 18:35:50 +0000 Subject: [PATCH 11/16] bnechmark: Bench object has a list of runners as attribute. List of runners loaded from yml files --- benchtools/benchmark.py | 46 +++++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/benchtools/benchmark.py b/benchtools/benchmark.py index e0b6870..53e92dc 100644 --- a/benchtools/benchmark.py +++ b/benchtools/benchmark.py @@ -46,7 +46,7 @@ class Bench(): run() Run one task or all tasks of the benchmark. ''' - def __init__(self, name, base_path='.', benchmark_path=None, concept=None, tasks=[]): + def __init__(self, name, base_path='.', benchmark_path=None, concept=None, tasks=[], runners=[BenchRunner()]): ''' Initialize the benchmark object with the name and path to the benchmark folder. @@ -58,6 +58,8 @@ def __init__(self, name, base_path='.', benchmark_path=None, concept=None, tasks path where the benchmark will be stored tasks: list of Task objects list of tasks to be included in the benchmark. Each task should be an instance of the Task class + runners: list[BenchRunner] + Specification of the model/s and API that will be used to run the benchmark ''' # set up the object attributes @@ -81,6 +83,8 @@ def __init__(self, name, base_path='.', benchmark_path=None, concept=None, tasks else: self.tasks = {} + self.runners=runners + # Written if the benchmark directory has been initialized self.written = os.path.exists(self.benchmark_path) @@ -128,10 +132,12 @@ def from_folders(cls, benchmark_path): else: tasks = [] + runners = Bench.load_runners(benchmark_path) + return cls(name = info['bench_name'], benchmark_path = benchmark_path, - concept = info['concept'], tasks=tasks) + concept = info['concept'], tasks=tasks, runners=runners) @classmethod def from_yaml(cls, benchmark_path): @@ -159,9 +165,11 @@ def from_yaml(cls, benchmark_path): for task_dict in task_list: tasks.append(Task.from_dict(task_dict,source_path=benchmark_path)) + runners = Bench.load_runners(benchmark_path) + return cls(name = info['bench_name'], benchmark_path =benchmark_path, - concept= info['concept'], tasks=tasks) + concept= info['concept'], tasks=tasks, runners=runners) @classmethod def load(cls, benchmark_path): @@ -199,6 +207,25 @@ def load_info(benchmark_path): info = yaml.safe_load(f) return info + + @staticmethod + def load_runners(benchmark_path): + runners = [] + model_params = {} + content = os.listdir(benchmark_path) + if 'model_param.yml' in content: + with open(os.path.join(benchmark_path, 'model_param.yml'), 'r') as f: + model_params = yaml.safe_load(f) + + if 'runner.yml' in content: + with open(os.path.join(benchmark_path, 'runner.yml'), 'r') as f: + run_info = yaml.safe_load(f) + api= run_info['api'] if 'api' in run_info else None + for model in run_info['models']: + runners.append(BenchRunner(run_info['runner_type'], model, api, model_params)) + else: runners.append(BenchRunner(model_param=model_params)) + + return runners def initialize_dir(self, no_git=False): @@ -307,7 +334,7 @@ def add_task(self, task_object:Task): task_object.write(self.benchmark_path) - def run(self, runner=BenchRunner(), log_dir=None, score=False): + def run(self, log_dir=None, score=False): ''' Run the benchmark by running each task in the benchmark and logging the interactions. Parameters: @@ -324,7 +351,7 @@ def run(self, runner=BenchRunner(), log_dir=None, score=False): # Run each task for name, task in self.tasks.items(): - self.run_task(task, runner, log_dir,score) + self.run_task(task, log_dir,score) @@ -425,8 +452,7 @@ def score(self, model=None,task=None, run ='last',collate=False): - def run_task(self, target_task=None, runner=BenchRunner(), - log_dir=None, score=False): + def run_task(self, target_task=None, log_dir=None, score=False): ''' run a specific task ''' @@ -449,7 +475,9 @@ def run_task(self, target_task=None, runner=BenchRunner(), raise ValueError("target_task should be either a string (task name) or a Task object.") # TODO: Add log_dir to attributes? + run_responses = [] + for runner in self.runners: + run_responses.append(task_object.run(runner, log_dir, self.bench_name, self.benchmark_path,score)) - return task_object.run(runner, log_dir, self.bench_name, self.benchmark_path,score) - + return run_responses From 43cfc22e297891d403067865a25e56a43932aac2 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 10 Jun 2026 18:37:10 +0000 Subject: [PATCH 12/16] Demo: Adding demo files for runner info --- benchtools/assets/demos/listbench/model_param.yml | 3 +++ .../{folderbench/multiple_models.yml => listbench/runner.yml} | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 benchtools/assets/demos/listbench/model_param.yml rename benchtools/assets/demos/{folderbench/multiple_models.yml => listbench/runner.yml} (62%) diff --git a/benchtools/assets/demos/listbench/model_param.yml b/benchtools/assets/demos/listbench/model_param.yml new file mode 100644 index 0000000..b6d7e67 --- /dev/null +++ b/benchtools/assets/demos/listbench/model_param.yml @@ -0,0 +1,3 @@ +temperature: 0.5 +max_tokens: 17 +top_p: 0.150 \ No newline at end of file diff --git a/benchtools/assets/demos/folderbench/multiple_models.yml b/benchtools/assets/demos/listbench/runner.yml similarity index 62% rename from benchtools/assets/demos/folderbench/multiple_models.yml rename to benchtools/assets/demos/listbench/runner.yml index fbb7d54..7d072df 100644 --- a/benchtools/assets/demos/folderbench/multiple_models.yml +++ b/benchtools/assets/demos/listbench/runner.yml @@ -1,4 +1,4 @@ runner_type: ollama -model: +models: - 'llama3.2' - - 'gemma3' + - 'gemma3' \ No newline at end of file From 4fc52908b5922f3afca23017031479e449e75089 Mon Sep 17 00:00:00 2001 From: Ayman Sandouk Date: Wed, 17 Jun 2026 13:22:39 -0400 Subject: [PATCH 13/16] Benchmark: removing runner from attributes --- benchtools/benchmark.py | 47 +++++++---------------------------------- 1 file changed, 8 insertions(+), 39 deletions(-) diff --git a/benchtools/benchmark.py b/benchtools/benchmark.py index 53e92dc..96aface 100644 --- a/benchtools/benchmark.py +++ b/benchtools/benchmark.py @@ -46,7 +46,7 @@ class Bench(): run() Run one task or all tasks of the benchmark. ''' - def __init__(self, name, base_path='.', benchmark_path=None, concept=None, tasks=[], runners=[BenchRunner()]): + def __init__(self, name, base_path='.', benchmark_path=None, concept=None, tasks=[]): ''' Initialize the benchmark object with the name and path to the benchmark folder. @@ -58,8 +58,6 @@ def __init__(self, name, base_path='.', benchmark_path=None, concept=None, tasks path where the benchmark will be stored tasks: list of Task objects list of tasks to be included in the benchmark. Each task should be an instance of the Task class - runners: list[BenchRunner] - Specification of the model/s and API that will be used to run the benchmark ''' # set up the object attributes @@ -83,8 +81,6 @@ def __init__(self, name, base_path='.', benchmark_path=None, concept=None, tasks else: self.tasks = {} - self.runners=runners - # Written if the benchmark directory has been initialized self.written = os.path.exists(self.benchmark_path) @@ -132,12 +128,10 @@ def from_folders(cls, benchmark_path): else: tasks = [] - runners = Bench.load_runners(benchmark_path) - return cls(name = info['bench_name'], benchmark_path = benchmark_path, - concept = info['concept'], tasks=tasks, runners=runners) + concept = info['concept'], tasks=tasks) @classmethod def from_yaml(cls, benchmark_path): @@ -165,11 +159,9 @@ def from_yaml(cls, benchmark_path): for task_dict in task_list: tasks.append(Task.from_dict(task_dict,source_path=benchmark_path)) - runners = Bench.load_runners(benchmark_path) - return cls(name = info['bench_name'], benchmark_path =benchmark_path, - concept= info['concept'], tasks=tasks, runners=runners) + concept= info['concept'], tasks=tasks) @classmethod def load(cls, benchmark_path): @@ -207,25 +199,6 @@ def load_info(benchmark_path): info = yaml.safe_load(f) return info - - @staticmethod - def load_runners(benchmark_path): - runners = [] - model_params = {} - content = os.listdir(benchmark_path) - if 'model_param.yml' in content: - with open(os.path.join(benchmark_path, 'model_param.yml'), 'r') as f: - model_params = yaml.safe_load(f) - - if 'runner.yml' in content: - with open(os.path.join(benchmark_path, 'runner.yml'), 'r') as f: - run_info = yaml.safe_load(f) - api= run_info['api'] if 'api' in run_info else None - for model in run_info['models']: - runners.append(BenchRunner(run_info['runner_type'], model, api, model_params)) - else: runners.append(BenchRunner(model_param=model_params)) - - return runners def initialize_dir(self, no_git=False): @@ -334,7 +307,7 @@ def add_task(self, task_object:Task): task_object.write(self.benchmark_path) - def run(self, log_dir=None, score=False): + def run(self, runner=BenchRunner(), log_dir=None, score=False): ''' Run the benchmark by running each task in the benchmark and logging the interactions. Parameters: @@ -351,7 +324,7 @@ def run(self, log_dir=None, score=False): # Run each task for name, task in self.tasks.items(): - self.run_task(task, log_dir,score) + self.run_task(task, runner, log_dir,score) @@ -452,7 +425,7 @@ def score(self, model=None,task=None, run ='last',collate=False): - def run_task(self, target_task=None, log_dir=None, score=False): + def run_task(self, runner=BenchRunner(), target_task=None, log_dir=None, score=False): ''' run a specific task ''' @@ -474,10 +447,6 @@ def run_task(self, target_task=None, log_dir=None, score=False): else: raise ValueError("target_task should be either a string (task name) or a Task object.") - # TODO: Add log_dir to attributes? - run_responses = [] - for runner in self.runners: - run_responses.append(task_object.run(runner, log_dir, self.bench_name, self.benchmark_path,score)) - - return run_responses + # TODO: Add log_dir to attributes? + return task_object.run(runner, log_dir, self.bench_name, self.benchmark_path,score) From 8764397f2406007287f15cc861ffa74002266f17 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 17 Jun 2026 14:46:46 -0400 Subject: [PATCH 14/16] Demo: fixing runners to match changes --- benchtools/assets/demos/folderbench/multiple_models.yml | 4 ++++ benchtools/assets/demos/folderbench/runner.yml | 3 +++ benchtools/assets/demos/listbench/model_param.yml | 3 --- benchtools/assets/demos/listbench/runner.yml | 2 +- 4 files changed, 8 insertions(+), 4 deletions(-) create mode 100644 benchtools/assets/demos/folderbench/multiple_models.yml delete mode 100644 benchtools/assets/demos/listbench/model_param.yml diff --git a/benchtools/assets/demos/folderbench/multiple_models.yml b/benchtools/assets/demos/folderbench/multiple_models.yml new file mode 100644 index 0000000..5716aab --- /dev/null +++ b/benchtools/assets/demos/folderbench/multiple_models.yml @@ -0,0 +1,4 @@ +runner_type: ollama +model: + - 'llama3.2' + - 'gemma3' \ No newline at end of file diff --git a/benchtools/assets/demos/folderbench/runner.yml b/benchtools/assets/demos/folderbench/runner.yml index e4d4032..16d0627 100644 --- a/benchtools/assets/demos/folderbench/runner.yml +++ b/benchtools/assets/demos/folderbench/runner.yml @@ -1,2 +1,5 @@ runner_type: ollama model: 'llama3.2' +temperature: 0.5 +max_tokens: 17 +top_p: 0.150 \ No newline at end of file diff --git a/benchtools/assets/demos/listbench/model_param.yml b/benchtools/assets/demos/listbench/model_param.yml deleted file mode 100644 index b6d7e67..0000000 --- a/benchtools/assets/demos/listbench/model_param.yml +++ /dev/null @@ -1,3 +0,0 @@ -temperature: 0.5 -max_tokens: 17 -top_p: 0.150 \ No newline at end of file diff --git a/benchtools/assets/demos/listbench/runner.yml b/benchtools/assets/demos/listbench/runner.yml index 7d072df..5716aab 100644 --- a/benchtools/assets/demos/listbench/runner.yml +++ b/benchtools/assets/demos/listbench/runner.yml @@ -1,4 +1,4 @@ runner_type: ollama -models: +model: - 'llama3.2' - 'gemma3' \ No newline at end of file From 9920dfa6940f4c86cb1a3e59071b91f7407a6501 Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 17 Jun 2026 14:51:13 -0400 Subject: [PATCH 15/16] Coounting tokens and stop reason, passing run_info dict to task then to logger --- benchtools/logger.py | 18 +++++++------ benchtools/runner.py | 63 +++++++++++++++++++++++++++++++++++++------- benchtools/task.py | 10 +++---- 3 files changed, 68 insertions(+), 23 deletions(-) diff --git a/benchtools/logger.py b/benchtools/logger.py index cc86041..27064c6 100644 --- a/benchtools/logger.py +++ b/benchtools/logger.py @@ -79,7 +79,7 @@ def init_log_folder(log_path, model, task_info: dict, id_prompt_list: list, benc return run_dir -def log_interaction(run_log_dir, prompt_id, prompt, response, error,values,score=None): +def log_interaction(run_log_dir, prompt_id, prompt,values, run_info,score=None): """ Logs the event to the log folder specified by the user @@ -89,12 +89,10 @@ def log_interaction(run_log_dir, prompt_id, prompt, response, error,values,score Path to a run-specific directory in a log directory specified in a call to the run method prompt_id: str Index of the sub-task being logged - prompt: str - The input provided to the model. - response: str - The output generated by the model. error: str Any error from the runner + run_info: dict + A dictionary containing all the info from the runner (response, tokens, stop reason, etc...) """ # Making this into a directory in case more files (possibly steps) were to be held in here @@ -105,7 +103,7 @@ def log_interaction(run_log_dir, prompt_id, prompt, response, error,values,score f.write("------ prompt ------\n") f.write(f"{prompt}\n\n") f.write("------ response ------\n") - f.write(f"{response}\n\n") + f.write(f"{run_info['response']}\n\n") # Gather run_info info with open(os.path.join(run_log_dir, "run_info.yml"), 'r') as f: @@ -116,12 +114,16 @@ def log_interaction(run_log_dir, prompt_id, prompt, response, error,values,score 'task_name': run_info['name'], 'template': run_info['template'], 'prompt_id': prompt_id, - 'error': error, + 'error': run_info.get('error', None), 'values':values, 'steps':{ 0: { # In case a subtask had more than one step we can always make the 0 dynamic 'prompt': prompt, - 'response': response, + 'response': run_info['response'], + 'prompt_tokens': run_info.get('prompt_tokens', None), + 'response_tokens': run_info.get('response_tokens', None), + 'total_tokens': run_info.get('total_tokens', None), + 'stop_reason': run_info.get('stop_reason', None), }, }, } diff --git a/benchtools/runner.py b/benchtools/runner.py index 35d8fdc..e32d60b 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -55,6 +55,24 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, model_para if 'max_tokens' in model_params: self.inference_parameters.update({"num_predict": model_params["max_tokens"]}) if 'stop_sequence' in model_params: self.inference_parameters.update({"stop": model_params["stop_sequence"]}) + + @staticmethod + def from_file(cls, file_path): + runners = [] + model_params = {} + if not os.path.exists(file_path): + raise FileNotFoundError(f"File {file_path} does not exist.") + + with open(os.path.join(file_path), 'r') as f: + run_info = yaml.safe_load(f) + type= run_info.pop('runner_type', 'ollama') + model= run_info.pop('model', 'gemma3:1b') + api= run_info.pop('api', None) + + # Any remaining keys are considered model parameters + model_params = run_info if run_info else None + + return cls(type, model, api, model_params) def __str__(self): return f'{self.model} via {self.runner_type}' @@ -63,8 +81,21 @@ def run(self, prompt, format): ''' Run method of a runner takes a prompt and a format and then finds the correct api call that matches the runner requested by the user. Runs the LLM call and returns the LLM response ''' - error = None - response = '' + run_info = { + 'runner_type': self.runner_type, + 'model': self.model, + 'api': self.api, + 'inference_parameters': self.inference_parameters, + 'prompt': prompt, + 'format': format, + 'response': '', + 'error': None, + 'prompt_tokens': 0, + 'response_tokens': 0, + 'total_tokens': 0, + 'stop_reason': None, + } + try: match self.runner_type: case "ollama": @@ -79,7 +110,11 @@ def run(self, prompt, format): ], options=self.inference_parameters ) - response = completion.message.content + run_info['response'] = completion.message.content + run_info['prompt_tokens'] = completion.prompt_eval_count + run_info['response_tokens'] = completion.eval_count + run_info['total_tokens'] = completion.eval_count + completion.prompt_eval_count + run_info['stop_reason'] = completion.done_reason case "ollama_api": @@ -97,7 +132,11 @@ def run(self, prompt, format): ], options=self.inference_parameters ) - response = completion["message"]["content"] + run_info['response'] = completion["message"]["content"] + run_info['prompt_tokens'] = completion["prompt_eval_count"] + run_info['response_tokens'] = completion["eval_count"] + run_info['total_tokens'] = completion["eval_count"] + completion["prompt_eval_count"] + run_info['stop_reason'] = completion["done_reason"] case "openai": @@ -144,9 +183,13 @@ def run(self, prompt, format): elif self.model.startswith("nova") or self.model.startswith("us.nova"): model_fam = "nova" match model_fam: case "meta" |"nova": - response = response['output']['message']['content'][0]['text'] + run_info['response'] = response['output']['message']['content'][0]['text'] case "gemma" | "_": - response = response['output']['message']['content']['text'] + run_info['response'] = response['output']['message']['content']['text'] + run_info['prompt_tokens'] = response['usage']['inputTokens'] + run_info['response_tokens'] = response['usage']['outputTokens'] + run_info['total_tokens'] = response['usage']['totalTokens'] + run_info['stop_reason'] = response['stopReason'] except Exception as e: error = e @@ -156,17 +199,17 @@ def run(self, prompt, format): print(f"Runner type {self.runner_type} not supported") return None except Exception as e: - error = e - return response, error + run_info['error'] = e + return run_info class BenchRunnerList(): ''' - a set of runners + a set of runner objects that can be used to run a benchmark on multiple models and/or runner types. ''' - def __init__(self, runners: list[BenchRunner]): + def __init__(self, runners: list[BenchRunner]=[BenchRunner()]): ''' Parameters diff --git a/benchtools/task.py b/benchtools/task.py index 0e96d5a..759790e 100644 --- a/benchtools/task.py +++ b/benchtools/task.py @@ -484,16 +484,16 @@ def run(self, runner=BenchRunner(), log_dir='logs', for (prompt_id, prompt),values in zip(id_prompt_list,self.variant_values): - response, error = runner.run(prompt, self.FormatClass.model_json_schema()) + run_info = runner.run(prompt, self.FormatClass.model_json_schema()) - if not error and score: - score_val = self.scoring_function(response, self.reference[prompt_id]) + if not 'error' in run_info and score: + score_val = self.scoring_function(run_info['response'], self.reference[prompt_id]) else: score_val = None - log_interaction(run_log, prompt_id, prompt, response, str(error),values,score_val) - responses.append(response) + log_interaction(run_log, prompt_id, prompt,values, run_info, score_val) + responses.append(run_info['response']) self.responses = responses From df1438990a183bf5cd143eb1a542f0b8c96fb26b Mon Sep 17 00:00:00 2001 From: AymanBx Date: Wed, 24 Jun 2026 18:02:17 +0000 Subject: [PATCH 16/16] Refactor: Logger Class. Scoring isn't logged --- benchtools/benchmark.py | 72 +++++++------ benchtools/logger.py | 223 ++++++++++++++++++++-------------------- benchtools/runner.py | 98 +++++++++--------- benchtools/task.py | 34 +++--- 4 files changed, 212 insertions(+), 215 deletions(-) diff --git a/benchtools/benchmark.py b/benchtools/benchmark.py index 96aface..70d9f4f 100644 --- a/benchtools/benchmark.py +++ b/benchtools/benchmark.py @@ -6,9 +6,10 @@ import yaml import json # from pathlib import Path # ??? -from benchtools.task import Task from pathlib import PurePath -from benchtools.runner import BenchRunner +from .task import Task +from .logger import Logger +from .runner import BenchRunner from .utils import load_asset @@ -319,12 +320,47 @@ def run(self, runner=BenchRunner(), log_dir=None, score=False): score : bool to run scoring now or not ''' + # If user doesn't specify a log_dir, default to logs folder inside bench folder if not log_dir and not self.written: raise ValueError("Benchmark has not been written to disk yet, need to write in order to log.") + elif not log_dir: + log_dir = os.path.join(self.benchmark_path, 'logs') + + # Initiaize a logger object that will handle the logging of the info and interactions + logger = Logger(log_dir) + logger.log_bench_info(bench_info={'bench_name': self.bench_name, 'bench_path': self.benchmark_path, 'concept': self.concept}) # Run each task for name, task in self.tasks.items(): - self.run_task(task, runner, log_dir,score) + self.run_task(task, runner, logger,score) + + + + def run_task(self, target_task=None, runner=BenchRunner(), log_dir=None, logger=None, score=False): + ''' + run a specific task + ''' + + # If user doesn't specify a log_dir, default to logs folder inside bench folder + if not log_dir and not self.written: + raise ValueError("Benchmark has not been written to disk yet, need to write in order to log.") + elif not log_dir: + log_dir = os.path.join(self.benchmark_path, 'logs') + + if not(target_task): + # TODO: use a generator and make this have a state + target_task = list[self.tasks.keys()][0] + + if isinstance(target_task, str): + task_object = self.tasks[target_task] + elif isinstance(target_task, Task): + task_object = target_task + else: + raise ValueError("target_task should be either a string (task name) or a Task object.") + + + return task_object.run(runner, log_dir, logger, score) + @@ -421,32 +457,4 @@ def score(self, model=None,task=None, run ='last',collate=False): - return score_list - - - - def run_task(self, runner=BenchRunner(), target_task=None, log_dir=None, score=False): - ''' - run a specific task - ''' - if not log_dir and not self.written: - raise ValueError("Benchmark has not been written to disk yet, need to write in order to log.") - - # If user doesn't specify a log_dir, default to logs folder inside bench folder - if not log_dir: - log_dir = os.path.join(self.benchmark_path, 'logs') - - if not(target_task): - # TODO: use a generator and make this have a state - target_task = list[self.tasks.keys()][0] - - if isinstance(target_task, str): - task_object = self.tasks[target_task] - elif isinstance(target_task, Task): - task_object = target_task - else: - raise ValueError("target_task should be either a string (task name) or a Task object.") - - # TODO: Add log_dir to attributes? - return task_object.run(runner, log_dir, self.bench_name, self.benchmark_path,score) - + return score_list \ No newline at end of file diff --git a/benchtools/logger.py b/benchtools/logger.py index 27064c6..2421029 100644 --- a/benchtools/logger.py +++ b/benchtools/logger.py @@ -16,129 +16,130 @@ def default(self, o): return super().default(o) -def init_log_folder(log_path, model, task_info: dict, id_prompt_list: list, benchmark=None, benchmark_path=None): - '''' - Creates the log directories and sub-directories for a specific task. - - Parameters: - ------------- - log_path: str - The path to the log dir where the log file will be created. - model: - The name of the model running the task - task_info: dict - A dictionary with all the task's info for which the logger is being initialized. +class Logger: + ''' + A class that holds all information and methods related to logging the interactions between the runner and the model. The logger will create the logging structure for each run of a task, and will log the prompt, response, and any other relevant information such as tokens used, stop reason, errors, etc... ''' - # Get timestamp without fractions of seconds - timestamp = int(datetime.datetime.now().timestamp()) - model_dir = os.path.join(log_path, model) - if not os.path.exists(model_dir): - os.mkdir(model_dir) + def __init__(self, log_path): + ''' + Initializes the logger by creating the log directory if it doesn't exist. - task_dir = os.path.join(model_dir, task_info['name']) - if not os.path.exists(task_dir): - os.mkdir(task_dir) + Parameters: + ------------- + log_path: str + The path to the log dir where the log file will be created. + ''' + self.log_path = log_path + # self.init_log_directory() # Create the log folder structure for the task + os.makedirs(self.log_path, exist_ok=True) - run_dir = os.path.join(task_dir, str(timestamp)) - os.mkdir(run_dir) + self.bench_info = {} - # Create run_info.yml with all the metadata - run_info = task_info - if benchmark: - run_info['bench_name'] = benchmark - run_info['benchmark_path'] = benchmark_path - run_info['run_id'] = str(timestamp) - run_info['log_path'] = str(run_dir) - # Add prompt_id of each value set to values - for idx, (prompt_id, _) in enumerate(id_prompt_list): - run_info['values'][idx].update({'prompt_id': prompt_id}) - - with open(os.path.join(run_dir,'run_info.yml'), 'w') as f: - yaml.dump(run_info, f) + def log_bench_info(self, bench_info): + # Get timestamp without fractions of seconds + timestamp = int(datetime.datetime.now().timestamp()) + bench_info[f'bench_run_id'] = str(timestamp) + self.bench_info = bench_info + self.log_path = os.path.join(self.log_path, f"bench_{bench_info['bench_name']}") + os.makedirs(self.log_path, exist_ok=True) - { - # TODO: What can we benifit from the logger? - # log_file = os.path.join(log_path, f'{task_name}_log.txt') - # print(f"\nLOGPATH: {log_file}\n") # Debugging - - # formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') - # handler = logging.FileHandler(log_file) - # handler.setFormatter(formatter) - - # logger = logging.getLogger(task_name) - # logger.setLevel(logging.INFO) # TODO add as an argument to the init functuion to use more options - # logger.addHandler(handler) - - # print(logger) # Debugging - # return logger - } - - return run_dir - -def log_interaction(run_log_dir, prompt_id, prompt,values, run_info,score=None): - """ - Logs the event to the log folder specified by the user - - Parameters: - ------------- - run_log_dir: str - Path to a run-specific directory in a log directory specified in a call to the run method - prompt_id: str - Index of the sub-task being logged - error: str - Any error from the runner - run_info: dict - A dictionary containing all the info from the runner (response, tokens, stop reason, etc...) - """ - - # Making this into a directory in case more files (possibly steps) were to be held in here - prompt_dir = os.path.join(run_log_dir, prompt_id) - os.mkdir(prompt_dir) - - with open(os.path.join(prompt_dir, "log.txt"), 'w') as f: - f.write("------ prompt ------\n") - f.write(f"{prompt}\n\n") - f.write("------ response ------\n") - f.write(f"{run_info['response']}\n\n") - - # Gather run_info info - with open(os.path.join(run_log_dir, "run_info.yml"), 'r') as f: - run_info = yaml.safe_load(f) - - - step_trace = { - 'task_name': run_info['name'], - 'template': run_info['template'], - 'prompt_id': prompt_id, - 'error': run_info.get('error', None), - 'values':values, - 'steps':{ - 0: { # In case a subtask had more than one step we can always make the 0 dynamic - 'prompt': prompt, - 'response': run_info['response'], - 'prompt_tokens': run_info.get('prompt_tokens', None), - 'response_tokens': run_info.get('response_tokens', None), - 'total_tokens': run_info.get('total_tokens', None), - 'stop_reason': run_info.get('stop_reason', None), - }, - }, - } - if not(score is None): - step_trace['steps'][0]['score'] = score + def log_task_info(self, task_info, id_prompt_list: list): + ''' + Logs the task info to the log folder specified by the user + + Parameters: + ------------- + task_info: dict + A dictionary with all the task's info for which the logger is being initialized. + ''' + # Get timestamp without fractions of seconds + timestamp = int(datetime.datetime.now().timestamp()) + + task_info['task_run_id'] = str(timestamp) + self.task_info = task_info + + self.task_log_path = os.path.join(self.log_path, f"task_{task_info['name']}") + os.makedirs(self.task_log_path, exist_ok=True) + + # with open(os.path.join(run_log_dir, "task_info.yml"), 'w') as f: + # yaml.dump(task_info, f) + + # Add prompt_id of each value set to values + for idx, (prompt_id, _) in enumerate(id_prompt_list): + task_info['values'][idx].update({'prompt_id': prompt_id}) + + + def log_runner_info(self, runner_info): + '''' + Creates the log directories and sub-directories for a specific task. + + Parameters: + ------------- + runner_info: dict + Dictionary that contains information about the runner of a task + ''' - with open(os.path.join(prompt_dir, "log.json"), 'w') as f: - # yaml.dump(step_trace, f) - json.dump(step_trace, f, indent=4, cls=EnhancedJSONEncoder) + self.model_dir = os.path.join(self.task_log_path, runner_info['model']) + os.makedirs(self.model_dir, exist_ok=True) - # TODO: What can we benifit from the logger? - # logger.info(f'Input: {prompt}') - # logger.info(f'Output: {response}') + self.run_dir = os.path.join(self.model_dir, self.task_info['task_run_id']) + os.makedirs(self.run_dir, exist_ok=True) + + self.runner_info = runner_info + # Create run_info.yml with all the metadata + self.run_info = self.bench_info | self.task_info | self.runner_info + self.run_info['log_path'] = str(self.task_log_path) + with open(os.path.join(self.run_dir,'run_info.yml'), 'w') as f: + yaml.dump(self.run_info, f) - + def log_interaction(self, response_info): + """ + Logs the event to the log folder specified by the user + + Parameters: + ------------- + response_info: dict + A dictionary of logged information from the interaction with the LLM + """ + + # Making this into a directory in case more files (possibly steps) were to be held in here + self.prompt_dir = os.path.join(self.run_dir, response_info['prompt_id']) + os.mkdir(self.prompt_dir) + + with open(os.path.join(self.prompt_dir, "log.txt"), 'w') as f: + f.write("------ prompt ------\n") + f.write(f"{response_info['prompt']}\n\n") + f.write("------ response ------\n") + f.write(f"{response_info['response']}\n\n") + + + step_trace = { + 'task_name': self.run_info['name'], + 'template': self.run_info['template'], + 'steps':{ + 0: response_info, + }, + } + + + with open(os.path.join(self.prompt_dir, "log.json"), 'w') as f: + json.dump(step_trace, f, indent=4, cls=EnhancedJSONEncoder) + + # TODO: What can we benifit from the logger? + # logger.info(f'Input: {prompt}') + # logger.info(f'Output: {response}') + + + + # def log_score(score): + # with open(os.path.join(run_log_dir, "run_info.yml"), 'r') as f: + # run_info = yaml.safe_load(f) + + # step_trace['steps'][0]['score'] = score \ No newline at end of file diff --git a/benchtools/runner.py b/benchtools/runner.py index e32d60b..60a2ee9 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -5,6 +5,7 @@ import boto3 import pandas as pd from pathlib import Path +from .logger import Logger from ollama import chat, ChatResponse, Client @@ -26,15 +27,7 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, model_para api: str The URL of the API to use for accessing an LLM. If None, the default API will be http://localhost:11434 as this is used by ollama by default model_params: dict - A dictionary with inference parameters to be used for the model generation: - temperature: float - Controls randomness in generation (higher = more random) - max_tokens: int - Maximum number of tokens to generate - top_p: float - Cumulative probability threshold for nucleus sampling - stop_sequence: list - Stop sequences that will halt generation + A dictionary with inference parameters to be used for the model generation such as temperature, max_tokens, top_p, stop_sequence, etc. ''' self.runner_type = runner_type @@ -48,13 +41,7 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, model_para else: self.api = api_default[runner_type] - self.inference_parameters={} - if model_params: - if 'temperature' in model_params: self.inference_parameters.update({"temperature": model_params["temperature"]}) - if 'top_p' in model_params: self.inference_parameters.update({"top_p": model_params["top_p"]}) - if 'max_tokens' in model_params: self.inference_parameters.update({"num_predict": model_params["max_tokens"]}) - if 'stop_sequence' in model_params: self.inference_parameters.update({"stop": model_params["stop_sequence"]}) - + self.inference_parameters= model_params @staticmethod def from_file(cls, file_path): @@ -77,16 +64,22 @@ def from_file(cls, file_path): def __str__(self): return f'{self.model} via {self.runner_type}' - def run(self, prompt, format): + def run(self, prompt_id, prompt, values, format, logger): ''' Run method of a runner takes a prompt and a format and then finds the correct api call that matches the runner requested by the user. Runs the LLM call and returns the LLM response ''' - run_info = { + runner_info = { 'runner_type': self.runner_type, 'model': self.model, 'api': self.api, - 'inference_parameters': self.inference_parameters, + 'inference_parameters': self.inference_parameters + } + logger.log_runner_info(runner_info) + + response_info = { + 'prompt_id': prompt_id, 'prompt': prompt, + 'values': values, 'format': format, 'response': '', 'error': None, @@ -110,11 +103,11 @@ def run(self, prompt, format): ], options=self.inference_parameters ) - run_info['response'] = completion.message.content - run_info['prompt_tokens'] = completion.prompt_eval_count - run_info['response_tokens'] = completion.eval_count - run_info['total_tokens'] = completion.eval_count + completion.prompt_eval_count - run_info['stop_reason'] = completion.done_reason + response_info['response'] = completion.message.content + response_info['prompt_tokens'] = completion.prompt_eval_count + response_info['response_tokens'] = completion.eval_count + response_info['total_tokens'] = completion.eval_count + completion.prompt_eval_count + response_info['stop_reason'] = completion.done_reason case "ollama_api": @@ -132,11 +125,11 @@ def run(self, prompt, format): ], options=self.inference_parameters ) - run_info['response'] = completion["message"]["content"] - run_info['prompt_tokens'] = completion["prompt_eval_count"] - run_info['response_tokens'] = completion["eval_count"] - run_info['total_tokens'] = completion["eval_count"] + completion["prompt_eval_count"] - run_info['stop_reason'] = completion["done_reason"] + response_info['response'] = completion["message"]["content"] + response_info['prompt_tokens'] = completion["prompt_eval_count"] + response_info['response_tokens'] = completion["eval_count"] + response_info['total_tokens'] = completion["eval_count"] + completion["prompt_eval_count"] + response_info['stop_reason'] = completion["done_reason"] case "openai": @@ -156,11 +149,13 @@ def run(self, prompt, format): case "bedrock": config={} + # bedrock has some shared inference parameters but also some model specific ones. + # We pop the shared ones and then send the rest as additionalModelRequestFields for the model to handle as needed. if self.inference_parameters: - if "temperature" in self.inference_parameters: config.update({"temperature": self.inference_parameters["temperature"]}) - if "top_p" in self.inference_parameters: config.update({"topP": self.inference_parameters["top_p"]}) - if "num_predict" in self.inference_parameters: config.update({"maxTokens": self.inference_parameters["num_predict"]}) - if "stop" in self.inference_parameters: config.update({"stopSequences": self.inference_parameters["stop"]}) + if "temperature" in self.inference_parameters: config.update({"temperature": self.inference_parameters.pop("temperature", None)}) + if "topP" in self.inference_parameters: config.update({"topP": self.inference_parameters.pop("topP", None)}) + if "maxTokens" in self.inference_parameters: config.update({"maxTokens": self.inference_parameters.pop("maxTokens", None)}) + if "stopSequences" in self.inference_parameters: config.update({"stopSequences": self.inference_parameters.pop("stopSequences", None)}) client = boto3.client('bedrock-runtime', region_name='us-east-1') try: @@ -173,34 +168,37 @@ def run(self, prompt, format): } ], inferenceConfig=config, - # additionalModelRequestFields{}, # For model-specific inference params + additionalModelRequestFields = self.inference_parameters, # For model-specific inference params # additionalModelResponseFieldPaths[], # For model-specific return fields ) # Catch the model family - model_fam = None - if self.model.startswith("meta") or self.model.startswith("us.meta"): model_fam = "meta" - elif self.model.startswith("google"): model_fam = "gemma" - elif self.model.startswith("nova") or self.model.startswith("us.nova"): model_fam = "nova" - match model_fam: - case "meta" |"nova": - run_info['response'] = response['output']['message']['content'][0]['text'] - case "gemma" | "_": - run_info['response'] = response['output']['message']['content']['text'] - run_info['prompt_tokens'] = response['usage']['inputTokens'] - run_info['response_tokens'] = response['usage']['outputTokens'] - run_info['total_tokens'] = response['usage']['totalTokens'] - run_info['stop_reason'] = response['stopReason'] + # model_fam = None + # if self.model.startswith("meta") or self.model.startswith("us.meta"): model_fam = "meta" + # elif self.model.startswith("google"): model_fam = "gemma" + # elif self.model.startswith("nova") or self.model.startswith("us.nova"): model_fam = "nova" + # match model_fam: + # case "meta" |"nova": + # response_info['response'] = response['output']['message']['content'][0]['text'] + # case "gemma" | "_": + # response_info['response'] = response['output']['message']['content']['text'] + response_info['response'] = response['output']['message']['content'][0]['text'] + response_info['prompt_tokens'] = response['usage']['inputTokens'] + response_info['response_tokens'] = response['usage']['outputTokens'] + response_info['total_tokens'] = response['usage']['totalTokens'] + response_info['stop_reason'] = response['stopReason'] except Exception as e: - error = e + response_info['error'] = e print(f"bedrock converse API failed with model {self.model}.\n{e}") case _: print(f"Runner type {self.runner_type} not supported") return None except Exception as e: - run_info['error'] = e - return run_info + response_info['error'] = e + + logger.log_interaction(response_info) + return response_info['response'], response_info['error'] diff --git a/benchtools/task.py b/benchtools/task.py index 759790e..cb8c467 100644 --- a/benchtools/task.py +++ b/benchtools/task.py @@ -5,7 +5,7 @@ import json import pandas as pd import itertools -from .logger import init_log_folder, log_interaction +from .logger import Logger from pathlib import PurePath from datasets import load_dataset from .runner import BenchRunner @@ -446,9 +446,7 @@ def write_csv(self, target_folder): - def run(self, runner=BenchRunner(), log_dir='logs', - benchmark=None, benchmark_path=None, - score = False): + def run(self, runner=BenchRunner(), logger= None, log_dir='logs', score = False): """ run the task on the stated model and log the interactions. @@ -469,31 +467,23 @@ def run(self, runner=BenchRunner(), log_dir='logs', # Gerenate all the prompts of the task id_prompt_list = self.generate_prompts() - # Create log directory if it doesn't exist - if not os.path.exists(log_dir): - os.mkdir(log_dir) - - run_log="" - # Create logging structure for a task within a log directory - try: - run_log = init_log_folder(log_dir, runner.model, self.get_dict(), - id_prompt_list, benchmark, benchmark_path) - except Exception as e: - print(f"Couldn't create log directory in {log_dir}...\n{e}") - + if not logger: + logger = Logger(log_dir) + + logger.log_task_info(self.get_dict(), id_prompt_list) for (prompt_id, prompt),values in zip(id_prompt_list,self.variant_values): - run_info = runner.run(prompt, self.FormatClass.model_json_schema()) + response, error = runner.run(prompt_id, prompt, values, self.FormatClass.model_json_schema(), logger) - if not 'error' in run_info and score: - score_val = self.scoring_function(run_info['response'], self.reference[prompt_id]) - + if not error and score: + score_val = self.scoring_function(response, self.reference[prompt_id]) else: score_val = None + - log_interaction(run_log, prompt_id, prompt,values, run_info, score_val) - responses.append(run_info['response']) + # log_interaction(run_log, prompt_id, prompt,values, run_info, score_val) + responses.append(response) self.responses = responses