diff --git a/benchtools/assets/demos/folderbench/multiple_models.yml b/benchtools/assets/demos/folderbench/multiple_models.yml index fbb7d54..5716aab 100644 --- a/benchtools/assets/demos/folderbench/multiple_models.yml +++ b/benchtools/assets/demos/folderbench/multiple_models.yml @@ -1,4 +1,4 @@ runner_type: ollama model: - 'llama3.2' - - 'gemma3' + - 'gemma3' \ No newline at end of file diff --git a/benchtools/assets/demos/folderbench/runner.yml b/benchtools/assets/demos/folderbench/runner.yml index e4d4032..16d0627 100644 --- a/benchtools/assets/demos/folderbench/runner.yml +++ b/benchtools/assets/demos/folderbench/runner.yml @@ -1,2 +1,5 @@ runner_type: ollama model: 'llama3.2' +temperature: 0.5 +max_tokens: 17 +top_p: 0.150 \ No newline at end of file diff --git a/benchtools/assets/demos/listbench/runner.yml b/benchtools/assets/demos/listbench/runner.yml new file mode 100644 index 0000000..5716aab --- /dev/null +++ b/benchtools/assets/demos/listbench/runner.yml @@ -0,0 +1,4 @@ +runner_type: ollama +model: + - 'llama3.2' + - 'gemma3' \ No newline at end of file diff --git a/benchtools/benchmark.py b/benchtools/benchmark.py index e0b6870..70d9f4f 100644 --- a/benchtools/benchmark.py +++ b/benchtools/benchmark.py @@ -6,9 +6,10 @@ import yaml import json # from pathlib import Path # ??? -from benchtools.task import Task from pathlib import PurePath -from benchtools.runner import BenchRunner +from .task import Task +from .logger import Logger +from .runner import BenchRunner from .utils import load_asset @@ -319,12 +320,47 @@ def run(self, runner=BenchRunner(), log_dir=None, score=False): score : bool to run scoring now or not ''' + # If user doesn't specify a log_dir, default to logs folder inside bench folder if not log_dir and not self.written: raise ValueError("Benchmark has not been written to disk yet, need to write in order to log.") + elif not log_dir: + log_dir = os.path.join(self.benchmark_path, 'logs') + + # Initiaize a logger object that will handle the logging of the info and interactions + logger = Logger(log_dir) + logger.log_bench_info(bench_info={'bench_name': self.bench_name, 'bench_path': self.benchmark_path, 'concept': self.concept}) # Run each task for name, task in self.tasks.items(): - self.run_task(task, runner, log_dir,score) + self.run_task(task, runner, logger,score) + + + + def run_task(self, target_task=None, runner=BenchRunner(), log_dir=None, logger=None, score=False): + ''' + run a specific task + ''' + + # If user doesn't specify a log_dir, default to logs folder inside bench folder + if not log_dir and not self.written: + raise ValueError("Benchmark has not been written to disk yet, need to write in order to log.") + elif not log_dir: + log_dir = os.path.join(self.benchmark_path, 'logs') + + if not(target_task): + # TODO: use a generator and make this have a state + target_task = list[self.tasks.keys()][0] + + if isinstance(target_task, str): + task_object = self.tasks[target_task] + elif isinstance(target_task, Task): + task_object = target_task + else: + raise ValueError("target_task should be either a string (task name) or a Task object.") + + + return task_object.run(runner, log_dir, logger, score) + @@ -421,35 +457,4 @@ def score(self, model=None,task=None, run ='last',collate=False): - return score_list - - - - def run_task(self, target_task=None, runner=BenchRunner(), - log_dir=None, score=False): - ''' - run a specific task - ''' - if not log_dir and not self.written: - raise ValueError("Benchmark has not been written to disk yet, need to write in order to log.") - - # If user doesn't specify a log_dir, default to logs folder inside bench folder - if not log_dir: - log_dir = os.path.join(self.benchmark_path, 'logs') - - if not(target_task): - # TODO: use a generator and make this have a state - target_task = list[self.tasks.keys()][0] - - if isinstance(target_task, str): - task_object = self.tasks[target_task] - elif isinstance(target_task, Task): - task_object = target_task - else: - raise ValueError("target_task should be either a string (task name) or a Task object.") - - # TODO: Add log_dir to attributes? - - return task_object.run(runner, log_dir, self.bench_name, self.benchmark_path,score) - - + return score_list \ No newline at end of file diff --git a/benchtools/logger.py b/benchtools/logger.py index cc86041..2421029 100644 --- a/benchtools/logger.py +++ b/benchtools/logger.py @@ -16,127 +16,130 @@ def default(self, o): return super().default(o) -def init_log_folder(log_path, model, task_info: dict, id_prompt_list: list, benchmark=None, benchmark_path=None): - '''' - Creates the log directories and sub-directories for a specific task. - - Parameters: - ------------- - log_path: str - The path to the log dir where the log file will be created. - model: - The name of the model running the task - task_info: dict - A dictionary with all the task's info for which the logger is being initialized. +class Logger: + ''' + A class that holds all information and methods related to logging the interactions between the runner and the model. The logger will create the logging structure for each run of a task, and will log the prompt, response, and any other relevant information such as tokens used, stop reason, errors, etc... ''' - # Get timestamp without fractions of seconds - timestamp = int(datetime.datetime.now().timestamp()) - model_dir = os.path.join(log_path, model) - if not os.path.exists(model_dir): - os.mkdir(model_dir) + def __init__(self, log_path): + ''' + Initializes the logger by creating the log directory if it doesn't exist. - task_dir = os.path.join(model_dir, task_info['name']) - if not os.path.exists(task_dir): - os.mkdir(task_dir) + Parameters: + ------------- + log_path: str + The path to the log dir where the log file will be created. + ''' + self.log_path = log_path + # self.init_log_directory() # Create the log folder structure for the task + os.makedirs(self.log_path, exist_ok=True) - run_dir = os.path.join(task_dir, str(timestamp)) - os.mkdir(run_dir) + self.bench_info = {} - # Create run_info.yml with all the metadata - run_info = task_info - if benchmark: - run_info['bench_name'] = benchmark - run_info['benchmark_path'] = benchmark_path - run_info['run_id'] = str(timestamp) - run_info['log_path'] = str(run_dir) - # Add prompt_id of each value set to values - for idx, (prompt_id, _) in enumerate(id_prompt_list): - run_info['values'][idx].update({'prompt_id': prompt_id}) - - with open(os.path.join(run_dir,'run_info.yml'), 'w') as f: - yaml.dump(run_info, f) + def log_bench_info(self, bench_info): + # Get timestamp without fractions of seconds + timestamp = int(datetime.datetime.now().timestamp()) + bench_info[f'bench_run_id'] = str(timestamp) + self.bench_info = bench_info + self.log_path = os.path.join(self.log_path, f"bench_{bench_info['bench_name']}") + os.makedirs(self.log_path, exist_ok=True) - { - # TODO: What can we benifit from the logger? - # log_file = os.path.join(log_path, f'{task_name}_log.txt') - # print(f"\nLOGPATH: {log_file}\n") # Debugging - - # formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') - # handler = logging.FileHandler(log_file) - # handler.setFormatter(formatter) - - # logger = logging.getLogger(task_name) - # logger.setLevel(logging.INFO) # TODO add as an argument to the init functuion to use more options - # logger.addHandler(handler) - - # print(logger) # Debugging - # return logger - } - - return run_dir - -def log_interaction(run_log_dir, prompt_id, prompt, response, error,values,score=None): - """ - Logs the event to the log folder specified by the user - - Parameters: - ------------- - run_log_dir: str - Path to a run-specific directory in a log directory specified in a call to the run method - prompt_id: str - Index of the sub-task being logged - prompt: str - The input provided to the model. - response: str - The output generated by the model. - error: str - Any error from the runner - """ - - # Making this into a directory in case more files (possibly steps) were to be held in here - prompt_dir = os.path.join(run_log_dir, prompt_id) - os.mkdir(prompt_dir) - - with open(os.path.join(prompt_dir, "log.txt"), 'w') as f: - f.write("------ prompt ------\n") - f.write(f"{prompt}\n\n") - f.write("------ response ------\n") - f.write(f"{response}\n\n") - - # Gather run_info info - with open(os.path.join(run_log_dir, "run_info.yml"), 'r') as f: - run_info = yaml.safe_load(f) - - - step_trace = { - 'task_name': run_info['name'], - 'template': run_info['template'], - 'prompt_id': prompt_id, - 'error': error, - 'values':values, - 'steps':{ - 0: { # In case a subtask had more than one step we can always make the 0 dynamic - 'prompt': prompt, - 'response': response, - }, - }, - } - if not(score is None): - step_trace['steps'][0]['score'] = score + def log_task_info(self, task_info, id_prompt_list: list): + ''' + Logs the task info to the log folder specified by the user + + Parameters: + ------------- + task_info: dict + A dictionary with all the task's info for which the logger is being initialized. + ''' + # Get timestamp without fractions of seconds + timestamp = int(datetime.datetime.now().timestamp()) + + task_info['task_run_id'] = str(timestamp) + self.task_info = task_info + + self.task_log_path = os.path.join(self.log_path, f"task_{task_info['name']}") + os.makedirs(self.task_log_path, exist_ok=True) + + # with open(os.path.join(run_log_dir, "task_info.yml"), 'w') as f: + # yaml.dump(task_info, f) + + # Add prompt_id of each value set to values + for idx, (prompt_id, _) in enumerate(id_prompt_list): + task_info['values'][idx].update({'prompt_id': prompt_id}) + + + def log_runner_info(self, runner_info): + '''' + Creates the log directories and sub-directories for a specific task. + + Parameters: + ------------- + runner_info: dict + Dictionary that contains information about the runner of a task + ''' - with open(os.path.join(prompt_dir, "log.json"), 'w') as f: - # yaml.dump(step_trace, f) - json.dump(step_trace, f, indent=4, cls=EnhancedJSONEncoder) + self.model_dir = os.path.join(self.task_log_path, runner_info['model']) + os.makedirs(self.model_dir, exist_ok=True) - # TODO: What can we benifit from the logger? - # logger.info(f'Input: {prompt}') - # logger.info(f'Output: {response}') + self.run_dir = os.path.join(self.model_dir, self.task_info['task_run_id']) + os.makedirs(self.run_dir, exist_ok=True) + + self.runner_info = runner_info + # Create run_info.yml with all the metadata + self.run_info = self.bench_info | self.task_info | self.runner_info + self.run_info['log_path'] = str(self.task_log_path) + with open(os.path.join(self.run_dir,'run_info.yml'), 'w') as f: + yaml.dump(self.run_info, f) - + def log_interaction(self, response_info): + """ + Logs the event to the log folder specified by the user + + Parameters: + ------------- + response_info: dict + A dictionary of logged information from the interaction with the LLM + """ + + # Making this into a directory in case more files (possibly steps) were to be held in here + self.prompt_dir = os.path.join(self.run_dir, response_info['prompt_id']) + os.mkdir(self.prompt_dir) + + with open(os.path.join(self.prompt_dir, "log.txt"), 'w') as f: + f.write("------ prompt ------\n") + f.write(f"{response_info['prompt']}\n\n") + f.write("------ response ------\n") + f.write(f"{response_info['response']}\n\n") + + + step_trace = { + 'task_name': self.run_info['name'], + 'template': self.run_info['template'], + 'steps':{ + 0: response_info, + }, + } + + + with open(os.path.join(self.prompt_dir, "log.json"), 'w') as f: + json.dump(step_trace, f, indent=4, cls=EnhancedJSONEncoder) + + # TODO: What can we benifit from the logger? + # logger.info(f'Input: {prompt}') + # logger.info(f'Output: {response}') + + + + # def log_score(score): + # with open(os.path.join(run_log_dir, "run_info.yml"), 'r') as f: + # run_info = yaml.safe_load(f) + + # step_trace['steps'][0]['score'] = score \ No newline at end of file diff --git a/benchtools/runner.py b/benchtools/runner.py index c8a3c64..60a2ee9 100644 --- a/benchtools/runner.py +++ b/benchtools/runner.py @@ -1,8 +1,13 @@ # module to create and run benchmarks -import yaml import os +import json +import yaml +import boto3 import pandas as pd from pathlib import Path +from .logger import Logger +from ollama import chat, ChatResponse, Client + # possibly resurected for batch runs? class BenchRunner(): @@ -10,7 +15,7 @@ class BenchRunner(): A BenchRunner holds information about how a task is going to be run. ''' - def __init__(self, runner_type='ollama', model='gemma3:1b', api=None): + def __init__(self, runner_type='ollama', model='gemma3:1b', api=None, model_params=None): ''' The constructor for BenchRunner will have default values for all attributes to have a full default runner ready to be used for running any task. P.S. Requires Ollama to be installed and running on your machine. @@ -21,27 +26,188 @@ def __init__(self, runner_type='ollama', model='gemma3:1b', api=None): The name of the LLM to use for running the tasks. Default is 'gemma3'. P.S. Will need to have the model downloaded locally if using ollama api: str The URL of the API to use for accessing an LLM. If None, the default API will be http://localhost:11434 as this is used by ollama by default + model_params: dict + A dictionary with inference parameters to be used for the model generation such as temperature, max_tokens, top_p, stop_sequence, etc. ''' self.runner_type = runner_type self.model = model api_default = {'ollama_api': "http://localhost:11434", 'openai':"https://api.openai.com/v1", - 'ollama':""} + 'ollama':"", + 'bedrock': ""} if api: self.api = api else: self.api = api_default[runner_type] + self.inference_parameters= model_params + + @staticmethod + def from_file(cls, file_path): + runners = [] + model_params = {} + if not os.path.exists(file_path): + raise FileNotFoundError(f"File {file_path} does not exist.") + + with open(os.path.join(file_path), 'r') as f: + run_info = yaml.safe_load(f) + type= run_info.pop('runner_type', 'ollama') + model= run_info.pop('model', 'gemma3:1b') + api= run_info.pop('api', None) + + # Any remaining keys are considered model parameters + model_params = run_info if run_info else None + + return cls(type, model, api, model_params) + def __str__(self): return f'{self.model} via {self.runner_type}' + + def run(self, prompt_id, prompt, values, format, logger): + ''' + Run method of a runner takes a prompt and a format and then finds the correct api call that matches the runner requested by the user. Runs the LLM call and returns the LLM response + ''' + runner_info = { + 'runner_type': self.runner_type, + 'model': self.model, + 'api': self.api, + 'inference_parameters': self.inference_parameters + } + logger.log_runner_info(runner_info) + + response_info = { + 'prompt_id': prompt_id, + 'prompt': prompt, + 'values': values, + 'format': format, + 'response': '', + 'error': None, + 'prompt_tokens': 0, + 'response_tokens': 0, + 'total_tokens': 0, + 'stop_reason': None, + } + + try: + match self.runner_type: + case "ollama": + completion: ChatResponse = chat( + model=self.model, + format = format, + messages=[ + { + 'role': 'user', + 'content':prompt, + }, + ], + options=self.inference_parameters + ) + response_info['response'] = completion.message.content + response_info['prompt_tokens'] = completion.prompt_eval_count + response_info['response_tokens'] = completion.eval_count + response_info['total_tokens'] = completion.eval_count + completion.prompt_eval_count + response_info['stop_reason'] = completion.done_reason + + + case "ollama_api": + client = Client( + host=self.api , + ) + completion = client.chat( + self.model, + format = format, + messages=[ + { + "role": "user", + "content": prompt, + }, + ], + options=self.inference_parameters + ) + response_info['response'] = completion["message"]["content"] + response_info['prompt_tokens'] = completion["prompt_eval_count"] + response_info['response_tokens'] = completion["eval_count"] + response_info['total_tokens'] = completion["eval_count"] + completion["prompt_eval_count"] + response_info['stop_reason'] = completion["done_reason"] + + + case "openai": + client = OpenAI( + base_url=self.api, + ) + chat_completion = client.chat.completions.create( + model=self.model, + messages=[ + { + "role": "user", + "content": prompt, + } + ], + ) + response = chat_completion.choices[0].message.content + + case "bedrock": + config={} + # bedrock has some shared inference parameters but also some model specific ones. + # We pop the shared ones and then send the rest as additionalModelRequestFields for the model to handle as needed. + if self.inference_parameters: + if "temperature" in self.inference_parameters: config.update({"temperature": self.inference_parameters.pop("temperature", None)}) + if "topP" in self.inference_parameters: config.update({"topP": self.inference_parameters.pop("topP", None)}) + if "maxTokens" in self.inference_parameters: config.update({"maxTokens": self.inference_parameters.pop("maxTokens", None)}) + if "stopSequences" in self.inference_parameters: config.update({"stopSequences": self.inference_parameters.pop("stopSequences", None)}) + + client = boto3.client('bedrock-runtime', region_name='us-east-1') + try: + response = client.converse( + modelId=self.model, + messages=[ + { + 'role': 'user', + 'content': [{'text': prompt}] + } + ], + inferenceConfig=config, + additionalModelRequestFields = self.inference_parameters, # For model-specific inference params + # additionalModelResponseFieldPaths[], # For model-specific return fields + ) + # Catch the model family + # model_fam = None + # if self.model.startswith("meta") or self.model.startswith("us.meta"): model_fam = "meta" + # elif self.model.startswith("google"): model_fam = "gemma" + # elif self.model.startswith("nova") or self.model.startswith("us.nova"): model_fam = "nova" + # match model_fam: + # case "meta" |"nova": + # response_info['response'] = response['output']['message']['content'][0]['text'] + # case "gemma" | "_": + # response_info['response'] = response['output']['message']['content']['text'] + response_info['response'] = response['output']['message']['content'][0]['text'] + response_info['prompt_tokens'] = response['usage']['inputTokens'] + response_info['response_tokens'] = response['usage']['outputTokens'] + response_info['total_tokens'] = response['usage']['totalTokens'] + response_info['stop_reason'] = response['stopReason'] + + except Exception as e: + response_info['error'] = e + print(f"bedrock converse API failed with model {self.model}.\n{e}") + + case _: + print(f"Runner type {self.runner_type} not supported") + return None + except Exception as e: + response_info['error'] = e + + logger.log_interaction(response_info) + return response_info['response'], response_info['error'] + + class BenchRunnerList(): ''' - a set of runners + a set of runner objects that can be used to run a benchmark on multiple models and/or runner types. ''' - def __init__(self, runners: list[BenchRunner]): + def __init__(self, runners: list[BenchRunner]=[BenchRunner()]): ''' Parameters @@ -87,11 +253,3 @@ def from_file(cls,file_path): runner_list = [BenchRunner(**runner_info)] return cls(runner_list) - - - - - - - - diff --git a/benchtools/task.py b/benchtools/task.py index e420b2d..cb8c467 100644 --- a/benchtools/task.py +++ b/benchtools/task.py @@ -3,11 +3,9 @@ import os import yaml import json -import boto3 import pandas as pd import itertools -from ollama import chat, ChatResponse, Client -from .logger import init_log_folder, log_interaction +from .logger import Logger from pathlib import PurePath from datasets import load_dataset from .runner import BenchRunner @@ -22,16 +20,6 @@ prompt_id_fx = {'concatenator_id_generator':concatenator_id_generator, 'selector_id_generator':selector_id_generator} -class UnMatchedModel(Exception): - """ - Exception raised for a bedrock model that isn't accounted for in the match statement - Follow https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html for a list of available models on bedrock and their inferance parameters - """ - def __init__(self, model): - self.model = model - message = f"Cannot call the model ${attempted_withdrawal} using aws Bedrock. Please fetch the correct inferance parameters for it and add it in a PR to BenchTools." - super().__init__(message) # Call the base class constructor - class Task: """ @@ -458,9 +446,7 @@ def write_csv(self, target_folder): - def run(self, runner=BenchRunner(), log_dir='logs', - benchmark=None, benchmark_path=None, - score = False): + def run(self, runner=BenchRunner(), logger= None, log_dir='logs', score = False): """ run the task on the stated model and log the interactions. @@ -481,136 +467,22 @@ def run(self, runner=BenchRunner(), log_dir='logs', # Gerenate all the prompts of the task id_prompt_list = self.generate_prompts() - # Create log directory if it doesn't exist - if not os.path.exists(log_dir): - os.mkdir(log_dir) - - run_log="" - # Create logging structure for a task within a log directory - try: - run_log = init_log_folder(log_dir, runner.model, self.get_dict(), - id_prompt_list, benchmark, benchmark_path) - except Exception as e: - print(f"Couldn't create log directory in {log_dir}...\n{e}") - + if not logger: + logger = Logger(log_dir) + + logger.log_task_info(self.get_dict(), id_prompt_list) for (prompt_id, prompt),values in zip(id_prompt_list,self.variant_values): - error = None - response = '' - try: - match runner.runner_type: - case "ollama": - completion: ChatResponse = chat( - model=runner.model, - format = self.FormatClass.model_json_schema(), - messages=[ - { - 'role': 'user', - 'content':prompt, - }, - ]) - # print("response: " + response.message.content) - response = completion.message.content - - - case "ollama_api": - client = Client( - host=runner.api , - ) - completion = client.chat( - runner.model, - format = self.FormatClass.model_json_schema(), - messages=[ - { - "role": "user", - "content": prompt, - }, - ], - ) - response = completion["message"]["content"] - - - case "openai": - client = OpenAI( - base_url=runner.api, - ) - chat_completion = client.chat.completions.create( - model=runner.model, - messages=[ - { - "role": "user", - "content": prompt, - } - ], - ) - response = chat_completion.choices[0].message.content - - case "bedrock": - bedrock_client = boto3.client('bedrock-runtime') - # Bedrock has multiple foundational models that will each differ in request parameters and response fields we included cases for a couple of them - # for available foundational models and their inferance parameters follow - # https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html - # Catch the model family first - model_fam = None - if runner.model.startswith("meta"): model_fam = "llama" - elif runner.model.startswith("google"): model_fam = "gemma" - match model_fam: - case "llama": - # Embed the prompt in Llama 3's instruction format. - formatted_prompt = f""" -<|begin_of_text|><|start_header_id|>user<|end_header_id|> -{prompt} -<|eot_id|> -<|start_header_id|>assistant<|end_header_id|> -""" - # Format the request payload using the model's native structure. - request = { - "prompt": formatted_prompt, - # "max_gen_len": 512, - # "temperature": 0.5, - } - # Convert the native request to JSON. - request = json.dumps(request) - completeion = bedrock_client.invoke_model( - modelId = runner.model, - body = request - ) - # Decode the response body. - response = json.loads(completeion["body"].read()) - response = response["generation"] - case "gemma": - completeion = bedrock_client.invoke_model( - modelId = runner.model, - body = json.dumps( - { - 'messages': [ - { - 'role': 'user', - 'content': prompt - } - ] - } - ) - ) - # Decode the response body. - response = json.loads(completeion['body'].read()) - response = response['choices'][0]['message']['content'] - case _: - raise UnMatchedModel(runner.model) - - case _: - print(f"Runner type {runner.runner_type} not supported") - return None - except Exception as e: - error = e - if score: + response, error = runner.run(prompt_id, prompt, values, self.FormatClass.model_json_schema(), logger) + + if not error and score: score_val = self.scoring_function(response, self.reference[prompt_id]) - else: score_val = None + - log_interaction(run_log, prompt_id, prompt, response, str(error),values,score_val) + # log_interaction(run_log, prompt_id, prompt,values, run_info, score_val) responses.append(response)