From ff3202968fe8db51b690d421dcd4abc941eab1d0 Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Mon, 8 Jun 2026 20:52:09 +0000
Subject: [PATCH 01/16] Runner: Moved model invoking logic to runner.py

---
 benchtools/runner.py | 136 +++++++++++++++++++++++++++++++++++++++----
 benchtools/task.py   | 122 +-------------------------------------
 2 files changed, 128 insertions(+), 130 deletions(-)

diff --git a/benchtools/runner.py b/benchtools/runner.py
index c8a3c64..e42260e 100644
--- a/benchtools/runner.py
+++ b/benchtools/runner.py
@@ -1,8 +1,12 @@
 # module to create and run benchmarks
-import yaml
 import os
+import json
+import yaml
+import boto3
 import pandas as pd
 from pathlib import Path
+from ollama import chat, ChatResponse, Client
+
 
 # possibly resurected for batch runs? 
 class BenchRunner():
@@ -27,7 +31,8 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None):
         self.model = model
         api_default = {'ollama_api': "http://localhost:11434",
                            'openai':"https://api.openai.com/v1",
-                           'ollama':""}
+                           'ollama':"",
+                           'bedrock': ""}
         if api:
             self.api = api 
         else:
@@ -35,6 +40,125 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None):
 
     def __str__(self):
         return f'{self.model} via {self.runner_type}'
+
+    def run(self, prompt, format):
+        '''
+        Run method of a runner takes a prompt and a format and then finds the correct api call that matches the runner requested by the user. Runs the LLM call and returns the LLM response
+        '''
+        error = None
+        response = ''
+        try:
+            match self.runner_type:
+                case "ollama":
+                    completion: ChatResponse = chat(
+                        model=self.model,
+                        format = format,
+                        messages=[
+                        {
+                        'role': 'user',
+                        'content':prompt,
+                        },
+                    ])
+                    response = completion.message.content
+
+
+                case "ollama_api":
+                    client = Client(
+                        host=self.api ,
+                    )
+                    completion = client.chat(
+                        self.model,
+                        format = format,
+                        messages=[
+                            {
+                                "role": "user",
+                                "content": prompt,
+                            },
+                        ],
+                    )
+                    response = completion["message"]["content"]
+
+
+                case "openai":
+                    client = OpenAI(
+                        base_url=self.api,
+                    )
+                    chat_completion = client.chat.completions.create(
+                        model=self.model,
+                        messages=[
+                            {
+                                "role": "user",
+                                "content": prompt,
+                            }
+                        ],
+                    )
+                    response = chat_completion.choices[0].message.content
+
+                case "bedrock":
+                    bedrock_client = boto3.client('bedrock-runtime')
+                    # Bedrock has multiple foundational models that will each differ in request parameters and response fields we included cases for a couple of them
+                    # for available foundational models and their inferance parameters follow 
+                    # https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html
+                    # Catch the model family first
+                    model_fam = None
+                    if self.model.startswith("meta"): model_fam = "llama"
+                    elif self.model.startswith("google"): model_fam = "gemma"
+                    match model_fam:
+                        case "llama":
+                            # Embed the prompt in Llama 3's instruction format.
+                            formatted_prompt = f"""
+<|begin_of_text|><|start_header_id|>user<|end_header_id|>
+{prompt}
+<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>
+"""
+                            # Format the request payload using the model's native structure.
+                            request = {
+                                "prompt": formatted_prompt,
+                                # "max_gen_len": 512,
+                                # "temperature": 0.5,
+                            }
+                            # Convert the native request to JSON.
+                            request = json.dumps(request)
+                            completeion = bedrock_client.invoke_model(
+                                modelId = self.model,
+                                body = request,
+                                accept="application/json" # ???
+                            )
+                            # Decode the response body.
+                            response = json.loads(completeion["body"].read())
+                            response = response["generation"]
+                        case "gemma":
+                            # Format the request payload using the model's native structure.
+                            request = {
+                                'messages': [
+                                    {
+                                    'role': 'user',
+                                    'content': prompt
+                                    }
+                                ]
+                            }
+                            # Convert the native request to JSON.
+                            request = json.dumps(request)
+                            completeion = bedrock_client.invoke_model(
+                                modelId = self.model,
+                                body = request,
+                                accept="application/json" # ???
+                            )
+                            # Decode the response body.
+                            response = json.loads(completeion['body'].read())
+                            response = response['choices'][0]['message']['content']
+                        case _:
+                            raise NotImplementedError
+
+                case _:
+                    print(f"Runner type {self.runner_type} not supported")
+                    return None
+        except Exception as e:
+            error = e
+        return (json.dump(response), error)
+
+
     
 
 class BenchRunnerList():
@@ -87,11 +211,3 @@ def from_file(cls,file_path):
                 runner_list = [BenchRunner(**runner_info)]
         
         return cls(runner_list)
-    
-    
-
-        
-
-    
-
-        
diff --git a/benchtools/task.py b/benchtools/task.py
index e420b2d..4f0f0cb 100644
--- a/benchtools/task.py
+++ b/benchtools/task.py
@@ -3,10 +3,8 @@
 import os
 import yaml
 import json
-import boto3
 import pandas as pd
 import itertools
-from ollama import chat, ChatResponse, Client
 from .logger import init_log_folder, log_interaction
 from pathlib import PurePath
 from datasets import load_dataset
@@ -22,16 +20,6 @@
 prompt_id_fx = {'concatenator_id_generator':concatenator_id_generator,
                 'selector_id_generator':selector_id_generator}
 
-class UnMatchedModel(Exception):
-    """
-    Exception raised for a bedrock model that isn't accounted for in the match statement
-    Follow https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html for a list of available models on bedrock and their inferance parameters
-    """
-    def __init__(self, model):
-        self.model = model
-        message = f"Cannot call the model ${attempted_withdrawal} using aws Bedrock. Please fetch the correct inferance parameters for it and add it in a PR to BenchTools."
-        super().__init__(message) # Call the base class constructor
-
 
 class Task:
     """
@@ -496,114 +484,8 @@ def run(self, runner=BenchRunner(), log_dir='logs',
 
         for (prompt_id, prompt),values in zip(id_prompt_list,self.variant_values):
             
-            error = None
-            response = ''
-            try:
-                match runner.runner_type:
-                    case "ollama":
-                        completion: ChatResponse = chat(
-                            model=runner.model, 
-                            format = self.FormatClass.model_json_schema(),
-                            messages=[
-                            {
-                            'role': 'user',
-                            'content':prompt,
-                            },
-                        ])
-                        # print("response: " + response.message.content)
-                        response = completion.message.content
-                        
-
-                    case "ollama_api":
-                        client = Client(
-                            host=runner.api ,
-                        )
-                        completion = client.chat(
-                            runner.model,
-                            format = self.FormatClass.model_json_schema(),
-                            messages=[
-                                {
-                                    "role": "user",
-                                    "content": prompt,
-                                },
-                            ],
-                        )
-                        response = completion["message"]["content"]
-                        
-
-                    case "openai":
-                        client = OpenAI(
-                            base_url=runner.api,
-                        )
-                        chat_completion = client.chat.completions.create(
-                            model=runner.model,
-                            messages=[
-                                {
-                                    "role": "user",
-                                    "content": prompt,
-                                }
-                            ],
-                        )
-                        response = chat_completion.choices[0].message.content
-                        
-                    case "bedrock":
-                        bedrock_client = boto3.client('bedrock-runtime')
-                        # Bedrock has multiple foundational models that will each differ in request parameters and response fields we included cases for a couple of them
-                        # for available foundational models and their inferance parameters follow 
-                        # https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html
-                        # Catch the model family first
-                        model_fam = None
-                        if runner.model.startswith("meta"): model_fam = "llama"
-                        elif runner.model.startswith("google"): model_fam = "gemma"
-                        match model_fam:
-                            case "llama":
-                                # Embed the prompt in Llama 3's instruction format.
-                                formatted_prompt = f"""
-<|begin_of_text|><|start_header_id|>user<|end_header_id|>
-{prompt}
-<|eot_id|>
-<|start_header_id|>assistant<|end_header_id|>
-"""
-                                # Format the request payload using the model's native structure.
-                                request = {
-                                    "prompt": formatted_prompt,
-                                    # "max_gen_len": 512,
-                                    # "temperature": 0.5,
-                                }
-                                # Convert the native request to JSON.
-                                request = json.dumps(request)
-                                completeion = bedrock_client.invoke_model(
-                                    modelId = runner.model,
-                                    body = request
-                                )
-                                # Decode the response body.
-                                response = json.loads(completeion["body"].read())
-                                response = response["generation"]
-                            case "gemma":
-                                completeion = bedrock_client.invoke_model(
-                                    modelId = runner.model,
-                                    body = json.dumps(
-                                        {
-                                            'messages': [
-                                                {
-                                                'role': 'user',
-                                                'content': prompt
-                                                }
-                                            ]
-                                        }
-                                    )
-                                )
-                                # Decode the response body.
-                                response = json.loads(completeion['body'].read())
-                                response = response['choices'][0]['message']['content']
-                            case _:
-                                raise UnMatchedModel(runner.model)
-                        
-                    case _:
-                        print(f"Runner type {runner.runner_type} not supported")
-                        return None
-            except Exception as e:
-                error = e
+            response, error = runner.run(prompt, self.FormatClass.model_json_schema())
+            
             if score:
                 score_val = self.scoring_function(response, self.reference[prompt_id])
                 

From 01b8962b7486f874e8badf80c33482992955b363 Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Mon, 8 Jun 2026 21:33:22 +0000
Subject: [PATCH 02/16] Runner: return raw response from run() instead of json.dump(response)

---
 benchtools/runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchtools/runner.py b/benchtools/runner.py
index e42260e..7fb5208 100644
--- a/benchtools/runner.py
+++ b/benchtools/runner.py
@@ -156,7 +156,7 @@ def run(self, prompt, format):
                     return None
         except Exception as e:
             error = e
-        return (json.dump(response), error)
+        return response, error
 
 
     

From d74acad1221525d61af2b3eb81709277784f7d82 Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Tue, 9 Jun 2026 01:35:41 +0000
Subject: [PATCH 03/16] Task: don't score if there was an error

---
 benchtools/task.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchtools/task.py b/benchtools/task.py
index 4f0f0cb..0e96d5a 100644
--- a/benchtools/task.py
+++ b/benchtools/task.py
@@ -486,7 +486,7 @@ def run(self, runner=BenchRunner(), log_dir='logs',
             
             response, error = runner.run(prompt, self.FormatClass.model_json_schema())
             
-            if score:
+            if not error and score:
                 score_val = self.scoring_function(response, self.reference[prompt_id])
                 
             else: 

From a3d71760022315633750d6ec154901de86c1ff60 Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Tue, 9 Jun 2026 01:38:35 +0000
Subject: [PATCH 04/16] Runner: Switching bedrock API from invoke which is
 low-level to converse which is high-level and has more potential

---
 benchtools/runner.py | 78 ++++++++++++++------------------------------
 1 file changed, 24 insertions(+), 54 deletions(-)

diff --git a/benchtools/runner.py b/benchtools/runner.py
index 7fb5208..b849edc 100644
--- a/benchtools/runner.py
+++ b/benchtools/runner.py
@@ -95,61 +95,31 @@ def run(self, prompt, format):
                     response = chat_completion.choices[0].message.content
 
                 case "bedrock":
-                    bedrock_client = boto3.client('bedrock-runtime')
-                    # Bedrock has multiple foundational models that will each differ in request parameters and response fields we included cases for a couple of them
-                    # for available foundational models and their inferance parameters follow 
-                    # https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html
-                    # Catch the model family first
-                    model_fam = None
-                    if self.model.startswith("meta"): model_fam = "llama"
-                    elif self.model.startswith("google"): model_fam = "gemma"
-                    match model_fam:
-                        case "llama":
-                            # Embed the prompt in Llama 3's instruction format.
-                            formatted_prompt = f"""
-<|begin_of_text|><|start_header_id|>user<|end_header_id|>
-{prompt}
-<|eot_id|>
-<|start_header_id|>assistant<|end_header_id|>
-"""
-                            # Format the request payload using the model's native structure.
-                            request = {
-                                "prompt": formatted_prompt,
-                                # "max_gen_len": 512,
-                                # "temperature": 0.5,
-                            }
-                            # Convert the native request to JSON.
-                            request = json.dumps(request)
-                            completeion = bedrock_client.invoke_model(
-                                modelId = self.model,
-                                body = request,
-                                accept="application/json" # ???
-                            )
-                            # Decode the response body.
-                            response = json.loads(completeion["body"].read())
-                            response = response["generation"]
-                        case "gemma":
-                            # Format the request payload using the model's native structure.
-                            request = {
-                                'messages': [
-                                    {
+                    client = boto3.client('bedrock-runtime', region_name='us-east-1')
+                    try:
+                        response = client.converse(
+                            modelId=self.model,
+                            messages=[
+                                {
                                     'role': 'user',
-                                    'content': prompt
-                                    }
-                                ]
-                            }
-                            # Convert the native request to JSON.
-                            request = json.dumps(request)
-                            completeion = bedrock_client.invoke_model(
-                                modelId = self.model,
-                                body = request,
-                                accept="application/json" # ???
-                            )
-                            # Decode the response body.
-                            response = json.loads(completeion['body'].read())
-                            response = response['choices'][0]['message']['content']
-                        case _:
-                            raise NotImplementedError
+                                    'content': [{'text': prompt}]
+                                }
+                            ]
+                        )
+                        # Catch the model family
+                        model_fam = None
+                        if self.model.startswith("meta") or self.model.startswith("us.meta"): model_fam = "llama"
+                        elif self.model.startswith("google"): model_fam = "gemma"
+                        elif self.model.startswith("nova") or self.model.startswith("us.nova"): model_fam = "nova"
+                        match model_fam:
+                            case "llama" |"nova":
+                                response = response['output']['message']['content'][0]['text']
+                            case "gemma" | "_":
+                                response = response['output']['message']['content']['text']
+
+                    except Exception as e:
+                        error = e
+                        print(f"bedrock converse API failed with model {self.model}.\n{e}")
 
                 case _:
                     print(f"Runner type {self.runner_type} not supported")

From 4acd89139bba065326eec86acc86fdded8e670b8 Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Tue, 9 Jun 2026 17:43:57 +0000
Subject: [PATCH 05/16] runner: rename model_fam

---
 benchtools/runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchtools/runner.py b/benchtools/runner.py
index b849edc..6a4eb4d 100644
--- a/benchtools/runner.py
+++ b/benchtools/runner.py
@@ -108,11 +108,11 @@ def run(self, prompt, format):
                         )
                         # Catch the model family
                         model_fam = None
-                        if self.model.startswith("meta") or self.model.startswith("us.meta"): model_fam = "llama"
+                        if self.model.startswith("meta") or self.model.startswith("us.meta"): model_fam = "meta"
                         elif self.model.startswith("google"): model_fam = "gemma"
                         elif self.model.startswith("nova") or self.model.startswith("us.nova"): model_fam = "nova"
                         match model_fam:
-                            case "llama" |"nova":
+                            case "meta" |"nova":
                                 response = response['output']['message']['content'][0]['text']
                             case "gemma" | "_":
                                 response = response['output']['message']['content']['text']

From c1a81a1ff792bcd959c1723714feee57a99ffc77 Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Wed, 10 Jun 2026 06:26:45 +0000
Subject: [PATCH 06/16] Runner: adding inference parameters to runner's
 attributes

---
 benchtools/runner.py | 42 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 7 deletions(-)

diff --git a/benchtools/runner.py b/benchtools/runner.py
index 6a4eb4d..25d0910 100644
--- a/benchtools/runner.py
+++ b/benchtools/runner.py
@@ -14,7 +14,7 @@ class BenchRunner():
     A BenchRunner holds information about how a task is going to be run. 
     '''
 
-    def __init__(self, runner_type='ollama', model='gemma3:1b', api=None):
+    def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, temperature=None, max_tokens=None, top_p=None, stop_sequence=None):
         '''
         The constructor for BenchRunner will have default values for all attributes to have a full default runner ready to be used for running any task.
         P.S. Requires Ollama to be installed and running on your machine.
@@ -25,6 +25,14 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None):
             The name of the LLM to use for running the tasks. Default is 'gemma3'. P.S. Will need to have the model downloaded locally if using ollama
         api: str
             The URL of the API to use for accessing an LLM. If None, the default API will be http://localhost:11434 as this is used by ollama by default
+        temperature: float
+            Controls randomness in generation (higher = more random)
+        max_tokens: int
+            Maximum number of tokens to generate
+        top_p: float
+            Cumulative probability threshold for nucleus sampling
+        stop_sequence: list
+            Stop sequences that will halt generation
         '''
 
         self.runner_type = runner_type
@@ -38,6 +46,13 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None):
         else:
             self.api = api_default[runner_type]
 
+        self.inference_parameters={}
+        if temperature: self.inference_parameters.update({"tempetature": temperature})
+        if top_p: self.inference_parameters.update({"top_p": top_p})
+        if max_tokens: self.inference_parameters.update({"max_tokens": max_tokens})
+        if stop_sequence: self.inference_parameters.update({"stop": temperatstop_sequenceure})
+
+
     def __str__(self):
         return f'{self.model} via {self.runner_type}'
 
@@ -54,11 +69,13 @@ def run(self, prompt, format):
                         model=self.model,
                         format = format,
                         messages=[
-                        {
-                        'role': 'user',
-                        'content':prompt,
-                        },
-                    ])
+                            {
+                            'role': 'user',
+                            'content':prompt,
+                            },
+                        ],
+                        options=self.inference_parameters
+                    )
                     response = completion.message.content
 
 
@@ -75,6 +92,7 @@ def run(self, prompt, format):
                                 "content": prompt,
                             },
                         ],
+                        options=self.inference_parameters
                     )
                     response = completion["message"]["content"]
 
@@ -95,6 +113,13 @@ def run(self, prompt, format):
                     response = chat_completion.choices[0].message.content
 
                 case "bedrock":
+                    config={}
+                    if self.inference_parameters:
+                        if "tempetature" in self.inference_parameters: config.update({"temperature": self.inference_parameters["tempetature"]})
+                        if "top_p" in self.inference_parameters: config.update({"topP": self.inference_parameters["top_p"]})
+                        if "max_tokens" in self.inference_parameters: config.update({"maxTokens": self.inference_parameters["max_tokens"]})
+                        if "stop" in self.inference_parameters: config.update({"stopSequences": self.inference_parameters["stop"]})
+
                     client = boto3.client('bedrock-runtime', region_name='us-east-1')
                     try:
                         response = client.converse(
@@ -104,7 +129,10 @@ def run(self, prompt, format):
                                     'role': 'user',
                                     'content': [{'text': prompt}]
                                 }
-                            ]
+                            ],
+                            inferenceConfig=config,
+                            # additionalModelRequestFields{}, # For model-specific inference params
+                            # additionalModelResponseFieldPaths[], # For model-specific return fields
                         )
                         # Catch the model family
                         model_fam = None

From a1ca3a094d6ceb2f534c4e8364556f692c22b12f Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Wed, 10 Jun 2026 15:46:04 +0000
Subject: [PATCH 07/16] typo

---
 benchtools/runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchtools/runner.py b/benchtools/runner.py
index 25d0910..e0a002e 100644
--- a/benchtools/runner.py
+++ b/benchtools/runner.py
@@ -47,7 +47,7 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, temperatur
             self.api = api_default[runner_type]
 
         self.inference_parameters={}
-        if temperature: self.inference_parameters.update({"tempetature": temperature})
+        if temperature: self.inference_parameters.update({"temperature": temperature})
         if top_p: self.inference_parameters.update({"top_p": top_p})
         if max_tokens: self.inference_parameters.update({"max_tokens": max_tokens})
         if stop_sequence: self.inference_parameters.update({"stop": temperatstop_sequenceure})
@@ -115,7 +115,7 @@ def run(self, prompt, format):
                 case "bedrock":
                     config={}
                     if self.inference_parameters:
-                        if "tempetature" in self.inference_parameters: config.update({"temperature": self.inference_parameters["tempetature"]})
+                        if "temperature" in self.inference_parameters: config.update({"temperature": self.inference_parameters["temperature"]})
                         if "top_p" in self.inference_parameters: config.update({"topP": self.inference_parameters["top_p"]})
                         if "max_tokens" in self.inference_parameters: config.update({"maxTokens": self.inference_parameters["max_tokens"]})
                         if "stop" in self.inference_parameters: config.update({"stopSequences": self.inference_parameters["stop"]})

From 8f9a7bdb8fc0f865bcf4ce2cd413bd4e851016cf Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Wed, 10 Jun 2026 16:00:49 +0000
Subject: [PATCH 08/16] ollama uses num_predict not max_tokens

---
 benchtools/runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchtools/runner.py b/benchtools/runner.py
index e0a002e..f14f9ab 100644
--- a/benchtools/runner.py
+++ b/benchtools/runner.py
@@ -49,7 +49,7 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, temperatur
         self.inference_parameters={}
         if temperature: self.inference_parameters.update({"temperature": temperature})
         if top_p: self.inference_parameters.update({"top_p": top_p})
-        if max_tokens: self.inference_parameters.update({"max_tokens": max_tokens})
+        if max_tokens: self.inference_parameters.update({"num_predict": max_tokens})
         if stop_sequence: self.inference_parameters.update({"stop": temperatstop_sequenceure})
 
 
@@ -117,7 +117,7 @@ def run(self, prompt, format):
                     if self.inference_parameters:
                         if "temperature" in self.inference_parameters: config.update({"temperature": self.inference_parameters["temperature"]})
                         if "top_p" in self.inference_parameters: config.update({"topP": self.inference_parameters["top_p"]})
-                        if "max_tokens" in self.inference_parameters: config.update({"maxTokens": self.inference_parameters["max_tokens"]})
+                        if "num_predict" in self.inference_parameters: config.update({"maxTokens": self.inference_parameters["num_predict"]})
                         if "stop" in self.inference_parameters: config.update({"stopSequences": self.inference_parameters["stop"]})
 
                     client = boto3.client('bedrock-runtime', region_name='us-east-1')

From 8d6bbb4dcdf838e1755de7c53c4522cef08a6b28 Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Wed, 10 Jun 2026 16:09:13 +0000
Subject: [PATCH 09/16] typo

---
 benchtools/runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchtools/runner.py b/benchtools/runner.py
index f14f9ab..0b7ff83 100644
--- a/benchtools/runner.py
+++ b/benchtools/runner.py
@@ -50,7 +50,7 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, temperatur
         if temperature: self.inference_parameters.update({"temperature": temperature})
         if top_p: self.inference_parameters.update({"top_p": top_p})
         if max_tokens: self.inference_parameters.update({"num_predict": max_tokens})
-        if stop_sequence: self.inference_parameters.update({"stop": temperatstop_sequenceure})
+        if stop_sequence: self.inference_parameters.update({"stop": stop_sequence})
 
 
     def __str__(self):

From 0ff8408b813635a5c337069f7b62e05e1f107a98 Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Wed, 10 Jun 2026 18:32:57 +0000
Subject: [PATCH 10/16] Runner: model_params in dictionary

---
 benchtools/runner.py | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/benchtools/runner.py b/benchtools/runner.py
index 0b7ff83..35d8fdc 100644
--- a/benchtools/runner.py
+++ b/benchtools/runner.py
@@ -14,7 +14,7 @@ class BenchRunner():
     A BenchRunner holds information about how a task is going to be run. 
     '''
 
-    def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, temperature=None, max_tokens=None, top_p=None, stop_sequence=None):
+    def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, model_params=None):
         '''
         The constructor for BenchRunner will have default values for all attributes to have a full default runner ready to be used for running any task.
         P.S. Requires Ollama to be installed and running on your machine.
@@ -25,14 +25,16 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, temperatur
             The name of the LLM to use for running the tasks. Default is 'gemma3'. P.S. Will need to have the model downloaded locally if using ollama
         api: str
             The URL of the API to use for accessing an LLM. If None, the default API will be http://localhost:11434 as this is used by ollama by default
-        temperature: float
-            Controls randomness in generation (higher = more random)
-        max_tokens: int
-            Maximum number of tokens to generate
-        top_p: float
-            Cumulative probability threshold for nucleus sampling
-        stop_sequence: list
-            Stop sequences that will halt generation
+        model_params: dict
+            A dictionary with inference parameters to be used for the model generation:
+                temperature: float
+                    Controls randomness in generation (higher = more random)
+                max_tokens: int
+                    Maximum number of tokens to generate
+                top_p: float
+                    Cumulative probability threshold for nucleus sampling
+                stop_sequence: list
+                    Stop sequences that will halt generation
         '''
 
         self.runner_type = runner_type
@@ -47,10 +49,11 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, temperatur
             self.api = api_default[runner_type]
 
         self.inference_parameters={}
-        if temperature: self.inference_parameters.update({"temperature": temperature})
-        if top_p: self.inference_parameters.update({"top_p": top_p})
-        if max_tokens: self.inference_parameters.update({"num_predict": max_tokens})
-        if stop_sequence: self.inference_parameters.update({"stop": stop_sequence})
+        if model_params:
+            if 'temperature' in model_params: self.inference_parameters.update({"temperature": model_params["temperature"]})
+            if 'top_p' in model_params: self.inference_parameters.update({"top_p": model_params["top_p"]})
+            if 'max_tokens' in model_params: self.inference_parameters.update({"num_predict": model_params["max_tokens"]})
+            if 'stop_sequence' in model_params: self.inference_parameters.update({"stop": model_params["stop_sequence"]})
 
 
     def __str__(self):

From baafa670ac4667fde6bcec3c9435090c87165932 Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Wed, 10 Jun 2026 18:35:50 +0000
Subject: [PATCH 11/16] benchmark: Bench object has a list of runners as
 attribute. List of runners loaded from yml files

---
 benchtools/benchmark.py | 46 +++++++++++++++++++++++++++++++++--------
 1 file changed, 37 insertions(+), 9 deletions(-)

diff --git a/benchtools/benchmark.py b/benchtools/benchmark.py
index e0b6870..53e92dc 100644
--- a/benchtools/benchmark.py
+++ b/benchtools/benchmark.py
@@ -46,7 +46,7 @@ class Bench():
     run()
         Run one task or all tasks of the benchmark.
     '''
-    def __init__(self, name, base_path='.', benchmark_path=None, concept=None, tasks=[]):
+    def __init__(self, name, base_path='.', benchmark_path=None, concept=None, tasks=[], runners=[BenchRunner()]):
         '''
         Initialize the benchmark object with the name and path to the benchmark folder.
 
@@ -58,6 +58,8 @@ def __init__(self, name, base_path='.', benchmark_path=None, concept=None, tasks
             path where the benchmark will be stored 
         tasks: list of Task objects
             list of tasks to be included in the benchmark. Each task should be an instance of the Task class
+        runners: list[BenchRunner]
+            Specification of the model/s and API that will be used to run the benchmark
         '''
 
         # set up the object attributes
@@ -81,6 +83,8 @@ def __init__(self, name, base_path='.', benchmark_path=None, concept=None, tasks
         else:
             self.tasks = {}
         
+        self.runners=runners
+
         # Written if the benchmark directory has been initialized
         self.written = os.path.exists(self.benchmark_path)
 
@@ -128,10 +132,12 @@ def from_folders(cls, benchmark_path):
         else:
             tasks = []
 
+        runners = Bench.load_runners(benchmark_path)
+
         
         return cls(name = info['bench_name'], 
                     benchmark_path = benchmark_path,
-                    concept = info['concept'], tasks=tasks)
+                    concept = info['concept'], tasks=tasks, runners=runners)
     
     @classmethod
     def from_yaml(cls, benchmark_path):
@@ -159,9 +165,11 @@ def from_yaml(cls, benchmark_path):
         for task_dict in task_list:
             tasks.append(Task.from_dict(task_dict,source_path=benchmark_path))
 
+        runners = Bench.load_runners(benchmark_path)
+
 
         return cls(name = info['bench_name'], benchmark_path =benchmark_path,
-                   concept= info['concept'], tasks=tasks)
+                   concept= info['concept'], tasks=tasks, runners=runners)
 
     @classmethod
     def load(cls, benchmark_path):
@@ -199,6 +207,25 @@ def load_info(benchmark_path):
             info = yaml.safe_load(f)
         
         return info
+
+    @staticmethod
+    def load_runners(benchmark_path):
+        runners = []
+        model_params = {}
+        content = os.listdir(benchmark_path)
+        if 'model_param.yml' in content:
+            with open(os.path.join(benchmark_path, 'model_param.yml'), 'r') as f:
+                model_params = yaml.safe_load(f)
+
+        if 'runner.yml' in content:
+            with open(os.path.join(benchmark_path, 'runner.yml'), 'r') as f:
+                run_info = yaml.safe_load(f)
+            api= run_info['api'] if 'api' in run_info else None
+            for model in run_info['models']:
+                runners.append(BenchRunner(run_info['runner_type'], model, api, model_params))
+        else: runners.append(BenchRunner(model_param=model_params))
+
+        return runners
         
 
     def initialize_dir(self, no_git=False):
@@ -307,7 +334,7 @@ def add_task(self, task_object:Task):
             task_object.write(self.benchmark_path)
 
 
-    def run(self, runner=BenchRunner(), log_dir=None, score=False):
+    def run(self, log_dir=None, score=False):
         '''
         Run the benchmark by running each task in the benchmark and logging the interactions.
         Parameters:
@@ -324,7 +351,7 @@ def run(self, runner=BenchRunner(), log_dir=None, score=False):
         
         # Run each task
         for name, task in self.tasks.items():
-            self.run_task(task, runner, log_dir,score)
+            self.run_task(task, log_dir,score)
 
     
 
@@ -425,8 +452,7 @@ def score(self, model=None,task=None, run ='last',collate=False):
 
 
 
-    def run_task(self, target_task=None, runner=BenchRunner(), 
-                 log_dir=None, score=False): 
+    def run_task(self, target_task=None, log_dir=None, score=False):
         '''
         run a specific task
         '''
@@ -449,7 +475,9 @@ def run_task(self, target_task=None, runner=BenchRunner(),
             raise ValueError("target_task should be either a string (task name) or a Task object.")
 
         # TODO: Add log_dir to attributes?
+        run_responses = []
+        for runner in self.runners:
+            run_responses.append(task_object.run(runner, log_dir, self.bench_name, self.benchmark_path,score))
         
-        return task_object.run(runner, log_dir, self.bench_name, self.benchmark_path,score)
-
+        return run_responses
 

From 43cfc22e297891d403067865a25e56a43932aac2 Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Wed, 10 Jun 2026 18:37:10 +0000
Subject: [PATCH 12/16] Demo: Adding demo files for runner info

---
 benchtools/assets/demos/listbench/model_param.yml             | 3 +++
 .../{folderbench/multiple_models.yml => listbench/runner.yml} | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)
 create mode 100644 benchtools/assets/demos/listbench/model_param.yml
 rename benchtools/assets/demos/{folderbench/multiple_models.yml => listbench/runner.yml} (62%)

diff --git a/benchtools/assets/demos/listbench/model_param.yml b/benchtools/assets/demos/listbench/model_param.yml
new file mode 100644
index 0000000..b6d7e67
--- /dev/null
+++ b/benchtools/assets/demos/listbench/model_param.yml
@@ -0,0 +1,3 @@
+temperature: 0.5
+max_tokens: 17
+top_p: 0.150
\ No newline at end of file
diff --git a/benchtools/assets/demos/folderbench/multiple_models.yml b/benchtools/assets/demos/listbench/runner.yml
similarity index 62%
rename from benchtools/assets/demos/folderbench/multiple_models.yml
rename to benchtools/assets/demos/listbench/runner.yml
index fbb7d54..7d072df 100644
--- a/benchtools/assets/demos/folderbench/multiple_models.yml
+++ b/benchtools/assets/demos/listbench/runner.yml
@@ -1,4 +1,4 @@
 runner_type: ollama
-model: 
+models: 
  - 'llama3.2'
- - 'gemma3'
+ - 'gemma3'
\ No newline at end of file

From 4fc52908b5922f3afca23017031479e449e75089 Mon Sep 17 00:00:00 2001
From: Ayman Sandouk <ayman_sandouk@uri.edu>
Date: Wed, 17 Jun 2026 13:22:39 -0400
Subject: [PATCH 13/16] Benchmark: removing runner from attributes

---
 benchtools/benchmark.py | 47 +++++++----------------------------------
 1 file changed, 8 insertions(+), 39 deletions(-)

diff --git a/benchtools/benchmark.py b/benchtools/benchmark.py
index 53e92dc..96aface 100644
--- a/benchtools/benchmark.py
+++ b/benchtools/benchmark.py
@@ -46,7 +46,7 @@ class Bench():
     run()
         Run one task or all tasks of the benchmark.
     '''
-    def __init__(self, name, base_path='.', benchmark_path=None, concept=None, tasks=[], runners=[BenchRunner()]):
+    def __init__(self, name, base_path='.', benchmark_path=None, concept=None, tasks=[]):
         '''
         Initialize the benchmark object with the name and path to the benchmark folder.
 
@@ -58,8 +58,6 @@ def __init__(self, name, base_path='.', benchmark_path=None, concept=None, tasks
             path where the benchmark will be stored 
         tasks: list of Task objects
             list of tasks to be included in the benchmark. Each task should be an instance of the Task class
-        runners: list[BenchRunner]
-            Specification of the model/s and API that will be used to run the benchmark
         '''
 
         # set up the object attributes
@@ -83,8 +81,6 @@ def __init__(self, name, base_path='.', benchmark_path=None, concept=None, tasks
         else:
             self.tasks = {}
         
-        self.runners=runners
-
         # Written if the benchmark directory has been initialized
         self.written = os.path.exists(self.benchmark_path)
 
@@ -132,12 +128,10 @@ def from_folders(cls, benchmark_path):
         else:
             tasks = []
 
-        runners = Bench.load_runners(benchmark_path)
-
         
         return cls(name = info['bench_name'], 
                     benchmark_path = benchmark_path,
-                    concept = info['concept'], tasks=tasks, runners=runners)
+                    concept = info['concept'], tasks=tasks)
     
     @classmethod
     def from_yaml(cls, benchmark_path):
@@ -165,11 +159,9 @@ def from_yaml(cls, benchmark_path):
         for task_dict in task_list:
             tasks.append(Task.from_dict(task_dict,source_path=benchmark_path))
 
-        runners = Bench.load_runners(benchmark_path)
-
 
         return cls(name = info['bench_name'], benchmark_path =benchmark_path,
-                   concept= info['concept'], tasks=tasks, runners=runners)
+                   concept= info['concept'], tasks=tasks)
 
     @classmethod
     def load(cls, benchmark_path):
@@ -207,25 +199,6 @@ def load_info(benchmark_path):
             info = yaml.safe_load(f)
         
         return info
-
-    @staticmethod
-    def load_runners(benchmark_path):
-        runners = []
-        model_params = {}
-        content = os.listdir(benchmark_path)
-        if 'model_param.yml' in content:
-            with open(os.path.join(benchmark_path, 'model_param.yml'), 'r') as f:
-                model_params = yaml.safe_load(f)
-
-        if 'runner.yml' in content:
-            with open(os.path.join(benchmark_path, 'runner.yml'), 'r') as f:
-                run_info = yaml.safe_load(f)
-            api= run_info['api'] if 'api' in run_info else None
-            for model in run_info['models']:
-                runners.append(BenchRunner(run_info['runner_type'], model, api, model_params))
-        else: runners.append(BenchRunner(model_param=model_params))
-
-        return runners
         
 
     def initialize_dir(self, no_git=False):
@@ -334,7 +307,7 @@ def add_task(self, task_object:Task):
             task_object.write(self.benchmark_path)
 
 
-    def run(self, log_dir=None, score=False):
+    def run(self, runner=BenchRunner(), log_dir=None, score=False):
         '''
         Run the benchmark by running each task in the benchmark and logging the interactions.
         Parameters:
@@ -351,7 +324,7 @@ def run(self, log_dir=None, score=False):
         
         # Run each task
         for name, task in self.tasks.items():
-            self.run_task(task, log_dir,score)
+            self.run_task(task, runner, log_dir,score)
 
     
 
@@ -452,7 +425,7 @@ def score(self, model=None,task=None, run ='last',collate=False):
 
 
 
-    def run_task(self, target_task=None, log_dir=None, score=False):
+    def run_task(self, runner=BenchRunner(), target_task=None, log_dir=None, score=False):
         '''
         run a specific task
         '''
@@ -474,10 +447,6 @@ def run_task(self, target_task=None, log_dir=None, score=False):
         else:
             raise ValueError("target_task should be either a string (task name) or a Task object.")
 
-        # TODO: Add log_dir to attributes?
-        run_responses = []
-        for runner in self.runners:
-            run_responses.append(task_object.run(runner, log_dir, self.bench_name, self.benchmark_path,score))
-        
-        return run_responses
+        # TODO: Add log_dir to attributes?        
+        return task_object.run(runner, log_dir, self.bench_name, self.benchmark_path,score)
 

From 8764397f2406007287f15cc861ffa74002266f17 Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Wed, 17 Jun 2026 14:46:46 -0400
Subject: [PATCH 14/16] Demo: fixing runners to match changes

---
 benchtools/assets/demos/folderbench/multiple_models.yml | 4 ++++
 benchtools/assets/demos/folderbench/runner.yml          | 3 +++
 benchtools/assets/demos/listbench/model_param.yml       | 3 ---
 benchtools/assets/demos/listbench/runner.yml            | 2 +-
 4 files changed, 8 insertions(+), 4 deletions(-)
 create mode 100644 benchtools/assets/demos/folderbench/multiple_models.yml
 delete mode 100644 benchtools/assets/demos/listbench/model_param.yml

diff --git a/benchtools/assets/demos/folderbench/multiple_models.yml b/benchtools/assets/demos/folderbench/multiple_models.yml
new file mode 100644
index 0000000..5716aab
--- /dev/null
+++ b/benchtools/assets/demos/folderbench/multiple_models.yml
@@ -0,0 +1,4 @@
+runner_type: ollama
+model: 
+ - 'llama3.2'
+ - 'gemma3'
\ No newline at end of file
diff --git a/benchtools/assets/demos/folderbench/runner.yml b/benchtools/assets/demos/folderbench/runner.yml
index e4d4032..16d0627 100644
--- a/benchtools/assets/demos/folderbench/runner.yml
+++ b/benchtools/assets/demos/folderbench/runner.yml
@@ -1,2 +1,5 @@
 runner_type: ollama
 model: 'llama3.2'
+temperature: 0.5
+max_tokens: 17
+top_p: 0.150
\ No newline at end of file
diff --git a/benchtools/assets/demos/listbench/model_param.yml b/benchtools/assets/demos/listbench/model_param.yml
deleted file mode 100644
index b6d7e67..0000000
--- a/benchtools/assets/demos/listbench/model_param.yml
+++ /dev/null
@@ -1,3 +0,0 @@
-temperature: 0.5
-max_tokens: 17
-top_p: 0.150
\ No newline at end of file
diff --git a/benchtools/assets/demos/listbench/runner.yml b/benchtools/assets/demos/listbench/runner.yml
index 7d072df..5716aab 100644
--- a/benchtools/assets/demos/listbench/runner.yml
+++ b/benchtools/assets/demos/listbench/runner.yml
@@ -1,4 +1,4 @@
 runner_type: ollama
-models: 
+model: 
  - 'llama3.2'
  - 'gemma3'
\ No newline at end of file

From 9920dfa6940f4c86cb1a3e59071b91f7407a6501 Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Wed, 17 Jun 2026 14:51:13 -0400
Subject: [PATCH 15/16] Counting tokens and stop reason, passing run_info dict
 to task then to logger

---
 benchtools/logger.py | 18 +++++++------
 benchtools/runner.py | 63 +++++++++++++++++++++++++++++++++++++-------
 benchtools/task.py   | 10 +++----
 3 files changed, 68 insertions(+), 23 deletions(-)

diff --git a/benchtools/logger.py b/benchtools/logger.py
index cc86041..27064c6 100644
--- a/benchtools/logger.py
+++ b/benchtools/logger.py
@@ -79,7 +79,7 @@ def init_log_folder(log_path, model, task_info: dict, id_prompt_list: list, benc
 
     return run_dir
 
-def log_interaction(run_log_dir, prompt_id, prompt, response, error,values,score=None):
+def log_interaction(run_log_dir, prompt_id, prompt,values, run_info,score=None):
     """
     Logs the event to the log folder specified by the user
 
@@ -89,12 +89,10 @@ def log_interaction(run_log_dir, prompt_id, prompt, response, error,values,score
         Path to a run-specific directory in a log directory specified in a call to the run method
     prompt_id: str
         Index of the sub-task being logged
-    prompt: str
-        The input provided to the model.
-    response: str
-        The output generated by the model.
     error: str
         Any error from the runner
+    run_info: dict
+        A dictionary containing all the info from the runner (response, tokens, stop reason, etc...)
     """
 
     # Making this into a directory in case more files (possibly steps) were to be held in here
@@ -105,7 +103,7 @@ def log_interaction(run_log_dir, prompt_id, prompt, response, error,values,score
         f.write("------ prompt ------\n")
         f.write(f"{prompt}\n\n")
         f.write("------ response ------\n")
-        f.write(f"{response}\n\n")
+        f.write(f"{run_info['response']}\n\n")
     
     # Gather run_info info
     with open(os.path.join(run_log_dir, "run_info.yml"), 'r') as f:
@@ -116,12 +114,16 @@ def log_interaction(run_log_dir, prompt_id, prompt, response, error,values,score
         'task_name': run_info['name'],
         'template': run_info['template'],
         'prompt_id': prompt_id,
-        'error': error,
+        'error': run_info.get('error', None),
         'values':values,
         'steps':{ 
             0: { # In case a subtask had more than one step we can always make the 0 dynamic
                 'prompt': prompt,
-                'response': response,
+                'response': run_info['response'],
+                'prompt_tokens': run_info.get('prompt_tokens', None),
+                'response_tokens': run_info.get('response_tokens', None),
+                'total_tokens': run_info.get('total_tokens', None),
+                'stop_reason': run_info.get('stop_reason', None),
             },
         },
     }
diff --git a/benchtools/runner.py b/benchtools/runner.py
index 35d8fdc..e32d60b 100644
--- a/benchtools/runner.py
+++ b/benchtools/runner.py
@@ -55,6 +55,24 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, model_para
             if 'max_tokens' in model_params: self.inference_parameters.update({"num_predict": model_params["max_tokens"]})
             if 'stop_sequence' in model_params: self.inference_parameters.update({"stop": model_params["stop_sequence"]})
 
+    
+    @staticmethod
+    def from_file(cls, file_path):
+        runners = []
+        model_params = {}
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(f"File {file_path} does not exist.")
+        
+        with open(os.path.join(file_path), 'r') as f:
+            run_info = yaml.safe_load(f)
+        type= run_info.pop('runner_type', 'ollama')
+        model= run_info.pop('model', 'gemma3:1b')
+        api= run_info.pop('api', None)
+
+        # Any remaining keys are considered model parameters
+        model_params = run_info if run_info else None
+
+        return cls(type, model, api, model_params)
 
     def __str__(self):
         return f'{self.model} via {self.runner_type}'
@@ -63,8 +81,21 @@ def run(self, prompt, format):
         '''
         Run method of a runner takes a prompt and a format and then finds the correct api call that matches the runner requested by the user. Runs the LLM call and returns the LLM response
         '''
-        error = None
-        response = ''
+        run_info = {
+            'runner_type': self.runner_type,
+            'model': self.model,
+            'api': self.api,
+            'inference_parameters': self.inference_parameters,
+            'prompt': prompt,
+            'format': format,
+            'response': '',
+            'error': None,
+            'prompt_tokens': 0,
+            'response_tokens': 0,
+            'total_tokens': 0,
+            'stop_reason': None,
+        }
+
         try:
             match self.runner_type:
                 case "ollama":
@@ -79,7 +110,11 @@ def run(self, prompt, format):
                         ],
                         options=self.inference_parameters
                     )
-                    response = completion.message.content
+                    run_info['response'] = completion.message.content
+                    run_info['prompt_tokens'] = completion.prompt_eval_count
+                    run_info['response_tokens'] = completion.eval_count
+                    run_info['total_tokens'] = completion.eval_count + completion.prompt_eval_count
+                    run_info['stop_reason'] = completion.done_reason
 
 
                 case "ollama_api":
@@ -97,7 +132,11 @@ def run(self, prompt, format):
                         ],
                         options=self.inference_parameters
                     )
-                    response = completion["message"]["content"]
+                    run_info['response'] = completion["message"]["content"]
+                    run_info['prompt_tokens'] = completion["prompt_eval_count"]
+                    run_info['response_tokens'] = completion["eval_count"]
+                    run_info['total_tokens'] = completion["eval_count"] + completion["prompt_eval_count"]
+                    run_info['stop_reason'] = completion["done_reason"]
 
 
                 case "openai":
@@ -144,9 +183,13 @@ def run(self, prompt, format):
                         elif self.model.startswith("nova") or self.model.startswith("us.nova"): model_fam = "nova"
                         match model_fam:
                             case "meta" |"nova":
-                                response = response['output']['message']['content'][0]['text']
+                                run_info['response'] = response['output']['message']['content'][0]['text']
                             case "gemma" | "_":
-                                response = response['output']['message']['content']['text']
+                                run_info['response'] = response['output']['message']['content']['text']
+                        run_info['prompt_tokens'] = response['usage']['inputTokens']
+                        run_info['response_tokens'] = response['usage']['outputTokens']
+                        run_info['total_tokens'] = response['usage']['totalTokens']
+                        run_info['stop_reason'] = response['stopReason']
 
                     except Exception as e:
                         error = e
@@ -156,17 +199,17 @@ def run(self, prompt, format):
                     print(f"Runner type {self.runner_type} not supported")
                     return None
         except Exception as e:
-            error = e
-        return response, error
+            run_info['error'] = e
+        return run_info
 
 
     
 
 class BenchRunnerList():
     '''
-    a set of runners
+    a set of runner objects that can be used to run a benchmark on multiple models and/or runner types.
     '''
-    def __init__(self, runners: list[BenchRunner]):
+    def __init__(self, runners: list[BenchRunner]=[BenchRunner()]):
         '''
 
         Parameters
diff --git a/benchtools/task.py b/benchtools/task.py
index 0e96d5a..759790e 100644
--- a/benchtools/task.py
+++ b/benchtools/task.py
@@ -484,16 +484,16 @@ def run(self, runner=BenchRunner(), log_dir='logs',
 
         for (prompt_id, prompt),values in zip(id_prompt_list,self.variant_values):
             
-            response, error = runner.run(prompt, self.FormatClass.model_json_schema())
+            run_info = runner.run(prompt, self.FormatClass.model_json_schema())
             
-            if not error and score:
-                score_val = self.scoring_function(response, self.reference[prompt_id])
+            if not 'error' in run_info and score:
+                score_val = self.scoring_function(run_info['response'], self.reference[prompt_id])
                 
             else: 
                 score_val = None
 
-            log_interaction(run_log, prompt_id, prompt, response, str(error),values,score_val)
-            responses.append(response)
+            log_interaction(run_log, prompt_id, prompt,values, run_info, score_val)
+            responses.append(run_info['response'])
 
         
         self.responses = responses 

From df1438990a183bf5cd143eb1a542f0b8c96fb26b Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Wed, 24 Jun 2026 18:02:17 +0000
Subject: [PATCH 16/16] Refactor: Logger Class. Scoring isn't logged

---
 benchtools/benchmark.py |  72 +++++++------
 benchtools/logger.py    | 223 ++++++++++++++++++++--------------------
 benchtools/runner.py    |  98 +++++++++---------
 benchtools/task.py      |  34 +++---
 4 files changed, 212 insertions(+), 215 deletions(-)

diff --git a/benchtools/benchmark.py b/benchtools/benchmark.py
index 96aface..70d9f4f 100644
--- a/benchtools/benchmark.py
+++ b/benchtools/benchmark.py
@@ -6,9 +6,10 @@
 import yaml
 import json
 # from pathlib import Path # ???
-from benchtools.task import Task
 from pathlib import PurePath
-from benchtools.runner import BenchRunner
+from .task import Task
+from .logger import Logger
+from .runner import BenchRunner
 from .utils import load_asset
 
 
@@ -319,12 +320,47 @@ def run(self, runner=BenchRunner(), log_dir=None, score=False):
         score : bool
             to run scoring now or not
         '''
+        # If user doesn't specify a log_dir, default to logs folder inside bench folder
         if not log_dir and not self.written:
             raise ValueError("Benchmark has not been written to disk yet, need to write in order to log.")
+        elif not log_dir:
+            log_dir = os.path.join(self.benchmark_path, 'logs')
+        
+        # Initialize a logger object that will handle the logging of the info and interactions
+        logger = Logger(log_dir)
+        logger.log_bench_info(bench_info={'bench_name': self.bench_name, 'bench_path': self.benchmark_path, 'concept': self.concept})
         
         # Run each task
         for name, task in self.tasks.items():
-            self.run_task(task, runner, log_dir,score)
+            self.run_task(task, runner, logger,score)
+
+
+
+    def run_task(self, target_task=None, runner=BenchRunner(), log_dir=None, logger=None, score=False):
+        '''
+        Run one task, given by name or as a Task object (defaults to the first key in self.tasks), and return the result of its run().
+        Raises ValueError when no log_dir is given and the benchmark is not written, or when target_task is neither str nor Task.
+        '''
+        # If user doesn't specify a log_dir, default to logs folder inside bench folder
+        if not log_dir and not self.written:
+            raise ValueError("Benchmark has not been written to disk yet, need to write in order to log.")
+        elif not log_dir:
+            log_dir = os.path.join(self.benchmark_path, 'logs')
+
+        if not(target_task):
+            # TODO: use a generator and make this have a state
+            # Call list(...) rather than subscripting the list type, so the first task name is selected
+            target_task = list(self.tasks.keys())[0]
+
+        if isinstance(target_task, str):
+            task_object = self.tasks[target_task]
+        elif isinstance(target_task, Task):
+            task_object = target_task
+        else:
+            raise ValueError("target_task should be either a string (task name) or a Task object.")
+
+        return task_object.run(runner, log_dir, logger, score)
+
 
     
 
@@ -421,32 +457,4 @@ def score(self, model=None,task=None, run ='last',collate=False):
         
         
 
-        return score_list
-
-
-
-    def run_task(self, runner=BenchRunner(), target_task=None, log_dir=None, score=False):
-        '''
-        run a specific task
-        '''
-        if not log_dir and not self.written:
-            raise ValueError("Benchmark has not been written to disk yet, need to write in order to log.")
-
-        # If user doesn't specify a log_dir, default to logs folder inside bench folder
-        if not log_dir:
-            log_dir = os.path.join(self.benchmark_path, 'logs')
-
-        if not(target_task):
-            # TODO: use a generator and make this have a state
-            target_task = list[self.tasks.keys()][0] 
-
-        if isinstance(target_task, str):
-            task_object = self.tasks[target_task]
-        elif isinstance(target_task, Task):
-            task_object = target_task
-        else:
-            raise ValueError("target_task should be either a string (task name) or a Task object.")
-
-        # TODO: Add log_dir to attributes?        
-        return task_object.run(runner, log_dir, self.bench_name, self.benchmark_path,score)
-
+        return score_list
\ No newline at end of file
diff --git a/benchtools/logger.py b/benchtools/logger.py
index 27064c6..2421029 100644
--- a/benchtools/logger.py
+++ b/benchtools/logger.py
@@ -16,129 +16,130 @@ def default(self, o):
         return super().default(o)
 
 
-def init_log_folder(log_path, model, task_info: dict, id_prompt_list: list, benchmark=None, benchmark_path=None):
-    ''''
-    Creates the log directories and sub-directories for a specific task.
-    
-    Parameters:
-    -------------
-    log_path: str
-        The path to the log dir where the log file will be created.
-    model: 
-        The name of the model running the task
-    task_info: dict
-        A dictionary with all the task's info for which the logger is being initialized.
+class Logger:
+    ''' 
+    A class that holds all information and methods related to logging the interactions between the runner and the model. The logger will create the logging structure for each run of a task, and will log the prompt, response, and any other relevant information such as tokens used, stop reason, errors, etc...
     '''
-    # Get timestamp without fractions of seconds
-    timestamp = int(datetime.datetime.now().timestamp())
 
-    model_dir = os.path.join(log_path, model)
-    if not os.path.exists(model_dir):
-        os.mkdir(model_dir)
+    def __init__(self, log_path):
+        '''
+        Initializes the logger and creates the log directory if it doesn't exist.
 
-    task_dir = os.path.join(model_dir, task_info['name'])
-    if not os.path.exists(task_dir):
-        os.mkdir(task_dir)
+        Parameters:
+        -------------
+        log_path: str
+            The path to the root log directory; missing parent directories are created as well.
+        '''
+        self.log_path = log_path
+        # exist_ok=True makes this a no-op when the directory is already there
+        os.makedirs(self.log_path, exist_ok=True)
 
-    run_dir = os.path.join(task_dir, str(timestamp))
-    os.mkdir(run_dir)
+        self.bench_info = {}  # replaced by log_bench_info(); merged into run_info by log_runner_info()
 
-    # Create run_info.yml with all the metadata
-    run_info =  task_info
-    if benchmark:
-        run_info['bench_name'] = benchmark
-        run_info['benchmark_path'] = benchmark_path
-    run_info['run_id'] = str(timestamp)
-    run_info['log_path'] = str(run_dir)
 
-    # Add prompt_id of each value set to values
-    for idx, (prompt_id, _) in enumerate(id_prompt_list):
-        run_info['values'][idx].update({'prompt_id': prompt_id})
-    
-    with open(os.path.join(run_dir,'run_info.yml'), 'w') as f:
-        yaml.dump(run_info, f)
+    def log_bench_info(self, bench_info):
+        '''Stamps bench_info (in place) with a 'bench_run_id' and moves log_path into a bench_<bench_name> subfolder.'''
+        timestamp = int(datetime.datetime.now().timestamp())  # whole seconds, no fractions
 
+        bench_info[f'bench_run_id'] = str(timestamp)
+        self.bench_info = bench_info
+        self.log_path = os.path.join(self.log_path, f"bench_{bench_info['bench_name']}")
+        os.makedirs(self.log_path, exist_ok=True)
 
 
-    {
-        # TODO: What can we benifit from the logger?
-        # log_file = os.path.join(log_path, f'{task_name}_log.txt')
-        # print(f"\nLOGPATH: {log_file}\n") # Debugging
-
-        # formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
-        # handler = logging.FileHandler(log_file)
-        # handler.setFormatter(formatter)
-
-        # logger = logging.getLogger(task_name)
-        # logger.setLevel(logging.INFO) # TODO add as an argument to the init functuion to use more options 
-        # logger.addHandler(handler)
-
-        # print(logger) # Debugging
-        # return logger
-    }
-
-    return run_dir
-
-def log_interaction(run_log_dir, prompt_id, prompt,values, run_info,score=None):
-    """
-    Logs the event to the log folder specified by the user
-
-    Parameters:
-    -------------
-    run_log_dir: str
-        Path to a run-specific directory in a log directory specified in a call to the run method
-    prompt_id: str
-        Index of the sub-task being logged
-    error: str
-        Any error from the runner
-    run_info: dict
-        A dictionary containing all the info from the runner (response, tokens, stop reason, etc...)
-    """
-
-    # Making this into a directory in case more files (possibly steps) were to be held in here
-    prompt_dir = os.path.join(run_log_dir, prompt_id)
-    os.mkdir(prompt_dir)
-
-    with open(os.path.join(prompt_dir, "log.txt"), 'w') as f:
-        f.write("------ prompt ------\n")
-        f.write(f"{prompt}\n\n")
-        f.write("------ response ------\n")
-        f.write(f"{run_info['response']}\n\n")
-    
-    # Gather run_info info
-    with open(os.path.join(run_log_dir, "run_info.yml"), 'r') as f:
-            run_info = yaml.safe_load(f) 
-
-    
-    step_trace = {
-        'task_name': run_info['name'],
-        'template': run_info['template'],
-        'prompt_id': prompt_id,
-        'error': run_info.get('error', None),
-        'values':values,
-        'steps':{ 
-            0: { # In case a subtask had more than one step we can always make the 0 dynamic
-                'prompt': prompt,
-                'response': run_info['response'],
-                'prompt_tokens': run_info.get('prompt_tokens', None),
-                'response_tokens': run_info.get('response_tokens', None),
-                'total_tokens': run_info.get('total_tokens', None),
-                'stop_reason': run_info.get('stop_reason', None),
-            },
-        },
-    }
-    if not(score is None):
-        step_trace['steps'][0]['score'] = score
+    def log_task_info(self, task_info, id_prompt_list: list):
+        '''
+        Stores the task info on the logger and creates the task's log folder.
+
+        Parameters:
+        -------------
+        task_info: dict
+            A dictionary with all the task's info. It is updated in place with a
+            'task_run_id' and with a 'prompt_id' in each entry of its 'values'.
+        id_prompt_list: list
+            (prompt_id, prompt) pairs, expected in the same order as task_info['values'].
+        '''
+        # Get timestamp without fractions of seconds
+        timestamp = int(datetime.datetime.now().timestamp())
+
+        task_info['task_run_id'] = str(timestamp)
+        self.task_info = task_info
+
+        self.task_log_path = os.path.join(self.log_path, f"task_{task_info['name']}")
+        os.makedirs(self.task_log_path, exist_ok=True)
+
+        # Add prompt_id of each value set to values
+        for idx, (prompt_id, _) in enumerate(id_prompt_list):
+            task_info['values'][idx].update({'prompt_id': prompt_id})
+
+
+    def log_runner_info(self, runner_info):
+        '''
+        Creates the <model>/<task_run_id> run folder inside the task's log folder and writes run_info.yml in it.
+
+        Parameters:
+        -------------
+        runner_info: dict
+            Dictionary that contains information about the runner of a task; its 'model' entry names the model folder.
+        '''
 
-    with open(os.path.join(prompt_dir, "log.json"), 'w') as f:
-        # yaml.dump(step_trace, f)
-        json.dump(step_trace, f, indent=4, cls=EnhancedJSONEncoder)
+        self.model_dir = os.path.join(self.task_log_path, runner_info['model'])  # task_log_path is set by log_task_info()
+        os.makedirs(self.model_dir, exist_ok=True)
 
-    # TODO: What can we benifit from the logger?
-    # logger.info(f'Input: {prompt}')
-    # logger.info(f'Output: {response}')
+        self.run_dir = os.path.join(self.model_dir, self.task_info['task_run_id'])
+        os.makedirs(self.run_dir, exist_ok=True)
+ 
+        self.runner_info = runner_info
 
+        # Create run_info.yml with all the metadata (on duplicate keys the right-most dictionary wins)
+        self.run_info =  self.bench_info | self.task_info | self.runner_info
+        self.run_info['log_path'] = str(self.task_log_path)
 
+        with open(os.path.join(self.run_dir,'run_info.yml'), 'w') as f:
+            yaml.dump(self.run_info, f)
 
-    
 
+    def log_interaction(self, response_info):
+        """
+        Logs one prompt/response exchange as log.txt and log.json inside its own folder of the run directory.
+
+        Parameters:
+        -------------
+        response_info: dict
+            A dictionary of logged information from the interaction with the LLM. Reads 'prompt_id', 'prompt', 'response'.
+        """
+        # Making this into a directory in case more files (possibly steps) were to be held in here
+        self.prompt_dir = os.path.join(self.run_dir, response_info['prompt_id'])
+        os.mkdir(self.prompt_dir)
+
+        with open(os.path.join(self.prompt_dir, "log.txt"), 'w') as f:
+            f.write("------ prompt ------\n")
+            f.write(f"{response_info['prompt']}\n\n")
+            f.write("------ response ------\n")
+            f.write(f"{response_info['response']}\n\n")
+
+        # The runner stores caught exceptions in 'error'; log them as text so json.dump cannot fail on them (the copy keeps the caller's dict intact)
+        step_info = dict(response_info)
+        if isinstance(step_info.get('error'), BaseException):
+            step_info['error'] = str(step_info['error'])
+
+        step_trace = {
+            'task_name': self.run_info['name'],
+            'template': self.run_info['template'],
+            'steps': {
+                0: step_info,
+            },
+        }
+
+        with open(os.path.join(self.prompt_dir, "log.json"), 'w') as f:
+            json.dump(step_trace, f, indent=4, cls=EnhancedJSONEncoder)
+
+        # TODO: What can we benefit from the logger?
+        # logger.info(f'Input: {prompt}'); logger.info(f'Output: {response}')
+        
+        
+    # def log_score(score):
+    #     with open(os.path.join(run_log_dir, "run_info.yml"), 'r') as f:
+    #             run_info = yaml.safe_load(f) 
+        
+    #     step_trace['steps'][0]['score'] = score
\ No newline at end of file
diff --git a/benchtools/runner.py b/benchtools/runner.py
index e32d60b..60a2ee9 100644
--- a/benchtools/runner.py
+++ b/benchtools/runner.py
@@ -5,6 +5,7 @@
 import boto3
 import pandas as pd
 from pathlib import Path
+from .logger import Logger
 from ollama import chat, ChatResponse, Client
 
 
@@ -26,15 +27,7 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, model_para
         api: str
             The URL of the API to use for accessing an LLM. If None, the default API will be http://localhost:11434 as this is used by ollama by default
         model_params: dict
-            A dictionary with inference parameters to be used for the model generation:
-                temperature: float
-                    Controls randomness in generation (higher = more random)
-                max_tokens: int
-                    Maximum number of tokens to generate
-                top_p: float
-                    Cumulative probability threshold for nucleus sampling
-                stop_sequence: list
-                    Stop sequences that will halt generation
+            A dictionary with inference parameters to be used for the model generation such as temperature, max_tokens, top_p, stop_sequence, etc.
         '''
 
         self.runner_type = runner_type
@@ -48,13 +41,7 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, model_para
         else:
             self.api = api_default[runner_type]
 
-        self.inference_parameters={}
-        if model_params:
-            if 'temperature' in model_params: self.inference_parameters.update({"temperature": model_params["temperature"]})
-            if 'top_p' in model_params: self.inference_parameters.update({"top_p": model_params["top_p"]})
-            if 'max_tokens' in model_params: self.inference_parameters.update({"num_predict": model_params["max_tokens"]})
-            if 'stop_sequence' in model_params: self.inference_parameters.update({"stop": model_params["stop_sequence"]})
-
+        self.inference_parameters= model_params
     
     @staticmethod
     def from_file(cls, file_path):
@@ -77,16 +64,22 @@ def from_file(cls, file_path):
     def __str__(self):
         return f'{self.model} via {self.runner_type}'
 
-    def run(self, prompt, format):
+    def run(self, prompt_id, prompt, values,  format, logger):
         '''
         Run method of a runner takes a prompt and a format and then finds the correct api call that matches the runner requested by the user. Runs the LLM call and returns the LLM response
         '''
-        run_info = {
+        runner_info = {
             'runner_type': self.runner_type,
             'model': self.model,
             'api': self.api,
-            'inference_parameters': self.inference_parameters,
+            'inference_parameters': self.inference_parameters
+        }
+        logger.log_runner_info(runner_info)
+
+        response_info = {
+            'prompt_id': prompt_id,
             'prompt': prompt,
+            'values': values,
             'format': format,
             'response': '',
             'error': None,
@@ -110,11 +103,11 @@ def run(self, prompt, format):
                         ],
                         options=self.inference_parameters
                     )
-                    run_info['response'] = completion.message.content
-                    run_info['prompt_tokens'] = completion.prompt_eval_count
-                    run_info['response_tokens'] = completion.eval_count
-                    run_info['total_tokens'] = completion.eval_count + completion.prompt_eval_count
-                    run_info['stop_reason'] = completion.done_reason
+                    response_info['response'] = completion.message.content
+                    response_info['prompt_tokens'] = completion.prompt_eval_count
+                    response_info['response_tokens'] = completion.eval_count
+                    response_info['total_tokens'] = completion.eval_count + completion.prompt_eval_count
+                    response_info['stop_reason'] = completion.done_reason
 
 
                 case "ollama_api":
@@ -132,11 +125,11 @@ def run(self, prompt, format):
                         ],
                         options=self.inference_parameters
                     )
-                    run_info['response'] = completion["message"]["content"]
-                    run_info['prompt_tokens'] = completion["prompt_eval_count"]
-                    run_info['response_tokens'] = completion["eval_count"]
-                    run_info['total_tokens'] = completion["eval_count"] + completion["prompt_eval_count"]
-                    run_info['stop_reason'] = completion["done_reason"]
+                    response_info['response'] = completion["message"]["content"]
+                    response_info['prompt_tokens'] = completion["prompt_eval_count"]
+                    response_info['response_tokens'] = completion["eval_count"]
+                    response_info['total_tokens'] = completion["eval_count"] + completion["prompt_eval_count"]
+                    response_info['stop_reason'] = completion["done_reason"]
 
 
                 case "openai":
@@ -156,11 +149,13 @@ def run(self, prompt, format):
 
                 case "bedrock":
                     config={}
+                    # bedrock has some shared inference parameters but also some model specific ones.
+                    # We pop the shared ones and then send the rest as additionalModelRequestFields for the model to handle as needed.
                     if self.inference_parameters:
-                        if "temperature" in self.inference_parameters: config.update({"temperature": self.inference_parameters["temperature"]})
-                        if "top_p" in self.inference_parameters: config.update({"topP": self.inference_parameters["top_p"]})
-                        if "num_predict" in self.inference_parameters: config.update({"maxTokens": self.inference_parameters["num_predict"]})
-                        if "stop" in self.inference_parameters: config.update({"stopSequences": self.inference_parameters["stop"]})
+                        if "temperature" in self.inference_parameters: config.update({"temperature": self.inference_parameters.pop("temperature", None)})
+                        if "topP" in self.inference_parameters: config.update({"topP": self.inference_parameters.pop("topP", None)})
+                        if "maxTokens" in self.inference_parameters: config.update({"maxTokens": self.inference_parameters.pop("maxTokens", None)})
+                        if "stopSequences" in self.inference_parameters: config.update({"stopSequences": self.inference_parameters.pop("stopSequences", None)})
 
                     client = boto3.client('bedrock-runtime', region_name='us-east-1')
                     try:
@@ -173,34 +168,37 @@ def run(self, prompt, format):
                                 }
                             ],
                             inferenceConfig=config,
-                            # additionalModelRequestFields{}, # For model-specific inference params
+                            additionalModelRequestFields = self.inference_parameters, # For model-specific inference params
                             # additionalModelResponseFieldPaths[], # For model-specific return fields
                         )
                         # Catch the model family
-                        model_fam = None
-                        if self.model.startswith("meta") or self.model.startswith("us.meta"): model_fam = "meta"
-                        elif self.model.startswith("google"): model_fam = "gemma"
-                        elif self.model.startswith("nova") or self.model.startswith("us.nova"): model_fam = "nova"
-                        match model_fam:
-                            case "meta" |"nova":
-                                run_info['response'] = response['output']['message']['content'][0]['text']
-                            case "gemma" | "_":
-                                run_info['response'] = response['output']['message']['content']['text']
-                        run_info['prompt_tokens'] = response['usage']['inputTokens']
-                        run_info['response_tokens'] = response['usage']['outputTokens']
-                        run_info['total_tokens'] = response['usage']['totalTokens']
-                        run_info['stop_reason'] = response['stopReason']
+                        # model_fam = None
+                        # if self.model.startswith("meta") or self.model.startswith("us.meta"): model_fam = "meta"
+                        # elif self.model.startswith("google"): model_fam = "gemma"
+                        # elif self.model.startswith("nova") or self.model.startswith("us.nova"): model_fam = "nova"
+                        # match model_fam:
+                        #     case "meta" |"nova":
+                        #         response_info['response'] = response['output']['message']['content'][0]['text']
+                        #     case "gemma" | "_":
+                        #         response_info['response'] = response['output']['message']['content']['text']
+                        response_info['response'] = response['output']['message']['content'][0]['text']
+                        response_info['prompt_tokens'] = response['usage']['inputTokens']
+                        response_info['response_tokens'] = response['usage']['outputTokens']
+                        response_info['total_tokens'] = response['usage']['totalTokens']
+                        response_info['stop_reason'] = response['stopReason']
 
                     except Exception as e:
-                        error = e
+                        response_info['error'] = e
                         print(f"bedrock converse API failed with model {self.model}.\n{e}")
 
                 case _:
                     print(f"Runner type {self.runner_type} not supported")
                     return None
         except Exception as e:
-            run_info['error'] = e
-        return run_info
+            response_info['error'] = e
+
+        logger.log_interaction(response_info)
+        return response_info['response'], response_info['error']
 
 
     
diff --git a/benchtools/task.py b/benchtools/task.py
index 759790e..cb8c467 100644
--- a/benchtools/task.py
+++ b/benchtools/task.py
@@ -5,7 +5,7 @@
 import json
 import pandas as pd
 import itertools
-from .logger import init_log_folder, log_interaction
+from .logger import Logger
 from pathlib import PurePath
 from datasets import load_dataset
 from .runner import BenchRunner
@@ -446,9 +446,7 @@ def write_csv(self, target_folder):
 
 
     
-    def run(self, runner=BenchRunner(), log_dir='logs', 
-            benchmark=None, benchmark_path=None,
-            score = False):
+    def run(self, runner=BenchRunner(), logger= None, log_dir='logs', score = False):
         """
         run the task on the stated model and log the interactions.
 
@@ -469,31 +467,23 @@ def run(self, runner=BenchRunner(), log_dir='logs',
         # Gerenate all the prompts of the task
         id_prompt_list = self.generate_prompts()
 
-        # Create log directory if it doesn't exist
-        if not os.path.exists(log_dir):
-            os.mkdir(log_dir)
-
-        run_log=""
-        # Create logging structure for a task within a log directory
-        try:
-            run_log = init_log_folder(log_dir, runner.model, self.get_dict(), 
-                                        id_prompt_list, benchmark, benchmark_path)
-        except Exception as e:
-            print(f"Couldn't create log directory in {log_dir}...\n{e}")
-
+        if not logger:
+            logger = Logger(log_dir)
+            
+        logger.log_task_info(self.get_dict(), id_prompt_list)
 
         for (prompt_id, prompt),values in zip(id_prompt_list,self.variant_values):
             
-            run_info = runner.run(prompt, self.FormatClass.model_json_schema())
+            response, error = runner.run(prompt_id, prompt, values, self.FormatClass.model_json_schema(), logger)
             
-            if not 'error' in run_info and score:
-                score_val = self.scoring_function(run_info['response'], self.reference[prompt_id])
-                
+            if not error and score:
+                score_val = self.scoring_function(response, self.reference[prompt_id])
             else: 
                 score_val = None
+        
 
-            log_interaction(run_log, prompt_id, prompt,values, run_info, score_val)
-            responses.append(run_info['response'])
+            # log_interaction(run_log, prompt_id, prompt,values, run_info, score_val)
+            responses.append(response)
 
         
         self.responses = responses