Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchtools/assets/demos/folderbench/multiple_models.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
runner_type: ollama
model:
- 'llama3.2'
- 'gemma3'
- 'gemma3'
3 changes: 3 additions & 0 deletions benchtools/assets/demos/folderbench/runner.yml
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
runner_type: ollama
model: 'llama3.2'
temperature: 0.5
max_tokens: 17
top_p: 0.150
4 changes: 4 additions & 0 deletions benchtools/assets/demos/listbench/runner.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
runner_type: ollama
model:
- 'llama3.2'
- 'gemma3'
75 changes: 40 additions & 35 deletions benchtools/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
import yaml
import json
# from pathlib import Path # ???
from benchtools.task import Task
from pathlib import PurePath
from benchtools.runner import BenchRunner
from .task import Task
from .logger import Logger
from .runner import BenchRunner
from .utils import load_asset


Expand Down Expand Up @@ -319,12 +320,47 @@ def run(self, runner=BenchRunner(), log_dir=None, score=False):
score : bool
to run scoring now or not
'''
# If user doesn't specify a log_dir, default to logs folder inside bench folder
if not log_dir and not self.written:
raise ValueError("Benchmark has not been written to disk yet, need to write in order to log.")
elif not log_dir:
log_dir = os.path.join(self.benchmark_path, 'logs')

# Initialize a logger object that will handle the logging of the info and interactions
logger = Logger(log_dir)
logger.log_bench_info(bench_info={'bench_name': self.bench_name, 'bench_path': self.benchmark_path, 'concept': self.concept})

# Run each task
for name, task in self.tasks.items():
self.run_task(task, runner, log_dir,score)
self.run_task(task, runner, logger,score)



def run_task(self, target_task=None, runner=None, log_dir=None, logger=None, score=False):
    '''
    Run a specific task of the benchmark.

    Parameters
    ----------
    target_task : str or Task, optional
        Name of a task in ``self.tasks`` or a Task object. When omitted, the
        first task in ``self.tasks`` is used.
    runner : BenchRunner, optional
        Runner handed to the task. A new ``BenchRunner()`` is created when
        omitted.
    log_dir : str, optional
        Directory for logs. Defaults to the ``logs`` folder inside the
        benchmark folder, which requires the benchmark to be written to disk.
    logger : Logger, optional
        Logger object forwarded to the task.
    score : bool
        Forwarded to the task's ``run`` method.

    Returns
    -------
    Whatever the task's ``run`` method returns.

    Raises
    ------
    ValueError
        If no ``log_dir`` is given and the benchmark has not been written,
        if there is no task to fall back to, or if ``target_task`` is neither
        a string nor a Task.

    Notes
    -----
    The third positional argument is ``log_dir``; pass ``logger`` and
    ``score`` by keyword to avoid binding them to the wrong parameter.
    '''
    # If user doesn't specify a log_dir, default to logs folder inside bench folder
    if not log_dir:
        if not self.written:
            raise ValueError("Benchmark has not been written to disk yet, need to write in order to log.")
        log_dir = os.path.join(self.benchmark_path, 'logs')

    # Build the default runner per call instead of once at definition time
    if runner is None:
        runner = BenchRunner()

    if not target_task:
        # TODO: use a generator and make this have a state
        if not self.tasks:
            raise ValueError("Benchmark has no tasks to run.")
        # First task name in insertion order
        target_task = next(iter(self.tasks))

    if isinstance(target_task, str):
        task_object = self.tasks[target_task]
    elif isinstance(target_task, Task):
        task_object = target_task
    else:
        raise ValueError("target_task should be either a string (task name) or a Task object.")

    return task_object.run(runner, log_dir, logger, score)




Expand Down Expand Up @@ -421,35 +457,4 @@ def score(self, model=None,task=None, run ='last',collate=False):



return score_list



def run_task(self, target_task=None, runner=None,
             log_dir=None, score=False):
    '''
    Run a specific task of the benchmark.

    Parameters
    ----------
    target_task : str or Task, optional
        Name of a task in ``self.tasks`` or a Task object. When omitted, the
        first task in ``self.tasks`` is used.
    runner : BenchRunner, optional
        Runner handed to the task. A new ``BenchRunner()`` is created when
        omitted.
    log_dir : str, optional
        Directory for logs. Defaults to the ``logs`` folder inside the
        benchmark folder, which requires the benchmark to be written to disk.
    score : bool
        Forwarded to the task's ``run`` method.

    Returns
    -------
    Whatever the task's ``run`` method returns.

    Raises
    ------
    ValueError
        If no ``log_dir`` is given and the benchmark has not been written,
        if there is no task to fall back to, or if ``target_task`` is neither
        a string nor a Task.
    '''
    # If user doesn't specify a log_dir, default to logs folder inside bench folder
    if not log_dir:
        if not self.written:
            raise ValueError("Benchmark has not been written to disk yet, need to write in order to log.")
        log_dir = os.path.join(self.benchmark_path, 'logs')

    # Build the default runner per call instead of once at definition time
    if runner is None:
        runner = BenchRunner()

    if not target_task:
        # TODO: use a generator and make this have a state
        if not self.tasks:
            raise ValueError("Benchmark has no tasks to run.")
        # First task name in insertion order
        target_task = next(iter(self.tasks))

    if isinstance(target_task, str):
        task_object = self.tasks[target_task]
    elif isinstance(target_task, Task):
        task_object = target_task
    else:
        raise ValueError("target_task should be either a string (task name) or a Task object.")

    # TODO: Add log_dir to attributes?

    return task_object.run(runner, log_dir, self.bench_name, self.benchmark_path, score)


return score_list
221 changes: 112 additions & 109 deletions benchtools/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,127 +16,130 @@ def default(self, o):
return super().default(o)


def init_log_folder(log_path, model, task_info: dict, id_prompt_list: list, benchmark=None, benchmark_path=None):
''''
Creates the log directories and sub-directories for a specific task.

Parameters:
-------------
log_path: str
The path to the log dir where the log file will be created.
model:
The name of the model running the task
task_info: dict
A dictionary with all the task's info for which the logger is being initialized.
class Logger:
'''
A class that holds all information and methods related to logging the interactions between the runner and the model. The logger will create the logging structure for each run of a task, and will log the prompt, response, and any other relevant information such as tokens used, stop reason, errors, etc...
'''
# Get timestamp without fractions of seconds
timestamp = int(datetime.datetime.now().timestamp())

model_dir = os.path.join(log_path, model)
if not os.path.exists(model_dir):
os.mkdir(model_dir)
def __init__(self, log_path):
'''
Initializes the logger by creating the log directory if it doesn't exist.

task_dir = os.path.join(model_dir, task_info['name'])
if not os.path.exists(task_dir):
os.mkdir(task_dir)
Parameters:
-------------
log_path: str
The path to the log dir where the log file will be created.
'''
self.log_path = log_path
# self.init_log_directory() # Create the log folder structure for the task
os.makedirs(self.log_path, exist_ok=True)

run_dir = os.path.join(task_dir, str(timestamp))
os.mkdir(run_dir)
self.bench_info = {}

# Create run_info.yml with all the metadata
run_info = task_info
if benchmark:
run_info['bench_name'] = benchmark
run_info['benchmark_path'] = benchmark_path
run_info['run_id'] = str(timestamp)
run_info['log_path'] = str(run_dir)

# Add prompt_id of each value set to values
for idx, (prompt_id, _) in enumerate(id_prompt_list):
run_info['values'][idx].update({'prompt_id': prompt_id})

with open(os.path.join(run_dir,'run_info.yml'), 'w') as f:
yaml.dump(run_info, f)
def log_bench_info(self, bench_info):
    '''
    Records the benchmark's info and moves the log root into a benchmark-specific folder.

    Parameters:
    -------------
    bench_info: dict
        Information about the benchmark being run. Must contain 'bench_name'.
        The dictionary is modified in place: a 'bench_run_id' entry is added.

    Note: every call appends "bench_<bench_name>" to the current log path,
    so calling this more than once on the same logger nests the folders.
    '''
    # Get timestamp without fractions of seconds
    timestamp = int(datetime.datetime.now().timestamp())

    bench_info['bench_run_id'] = str(timestamp)
    self.bench_info = bench_info

    # All later task logs are created under this benchmark folder
    self.log_path = os.path.join(self.log_path, f"bench_{bench_info['bench_name']}")
    os.makedirs(self.log_path, exist_ok=True)


{
# TODO: What can we benefit from the logger?
# log_file = os.path.join(log_path, f'{task_name}_log.txt')
# print(f"\nLOGPATH: {log_file}\n") # Debugging

# formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
# handler = logging.FileHandler(log_file)
# handler.setFormatter(formatter)

# logger = logging.getLogger(task_name)
# logger.setLevel(logging.INFO) # TODO add as an argument to the init function to use more options
# logger.addHandler(handler)

# print(logger) # Debugging
# return logger
}

return run_dir

def log_interaction(run_log_dir, prompt_id, prompt, response, error,values,score=None):
"""
Logs the event to the log folder specified by the user

Parameters:
-------------
run_log_dir: str
Path to a run-specific directory in a log directory specified in a call to the run method
prompt_id: str
Index of the sub-task being logged
prompt: str
The input provided to the model.
response: str
The output generated by the model.
error: str
Any error from the runner
"""

# Making this into a directory in case more files (possibly steps) were to be held in here
prompt_dir = os.path.join(run_log_dir, prompt_id)
os.mkdir(prompt_dir)

with open(os.path.join(prompt_dir, "log.txt"), 'w') as f:
f.write("------ prompt ------\n")
f.write(f"{prompt}\n\n")
f.write("------ response ------\n")
f.write(f"{response}\n\n")

# Gather run_info info
with open(os.path.join(run_log_dir, "run_info.yml"), 'r') as f:
run_info = yaml.safe_load(f)


step_trace = {
'task_name': run_info['name'],
'template': run_info['template'],
'prompt_id': prompt_id,
'error': error,
'values':values,
'steps':{
0: { # In case a subtask had more than one step we can always make the 0 dynamic
'prompt': prompt,
'response': response,
},
},
}
if not(score is None):
step_trace['steps'][0]['score'] = score
def log_task_info(self, task_info, id_prompt_list: list):
    '''
    Registers a task with the logger and creates its log folder.

    Parameters:
    -------------
    task_info: dict
        A dictionary with all the task's info. Must contain 'name' and, when
        id_prompt_list is not empty, 'values'. It is modified in place: a
        'task_run_id' entry is added and every value set receives a 'prompt_id'.
    id_prompt_list: list
        Pairs whose first element is a prompt id; the pair at position i
        belongs to the value set at position i of task_info['values'].
    '''
    # Whole-second epoch timestamp identifies this run of the task
    now = datetime.datetime.now()
    task_info['task_run_id'] = str(int(now.timestamp()))
    self.task_info = task_info

    task_folder = f"task_{task_info['name']}"
    self.task_log_path = os.path.join(self.log_path, task_folder)
    os.makedirs(self.task_log_path, exist_ok=True)

    # with open(os.path.join(run_log_dir, "task_info.yml"), 'w') as f:
    #     yaml.dump(task_info, f)

    # Tag each value set with the id of its prompt
    for position, pair in enumerate(id_prompt_list):
        prompt_id, _ = pair
        task_info['values'][position]['prompt_id'] = prompt_id


def log_runner_info(self, runner_info):
'''
Creates the log directories and sub-directories for a specific task.

Parameters:
-------------
runner_info: dict
Dictionary that contains information about the runner of a task
'''

with open(os.path.join(prompt_dir, "log.json"), 'w') as f:
# yaml.dump(step_trace, f)
json.dump(step_trace, f, indent=4, cls=EnhancedJSONEncoder)
self.model_dir = os.path.join(self.task_log_path, runner_info['model'])
os.makedirs(self.model_dir, exist_ok=True)

# TODO: What can we benefit from the logger?
# logger.info(f'Input: {prompt}')
# logger.info(f'Output: {response}')
self.run_dir = os.path.join(self.model_dir, self.task_info['task_run_id'])
os.makedirs(self.run_dir, exist_ok=True)

self.runner_info = runner_info

# Create run_info.yml with all the metadata
self.run_info = self.bench_info | self.task_info | self.runner_info
self.run_info['log_path'] = str(self.task_log_path)

with open(os.path.join(self.run_dir,'run_info.yml'), 'w') as f:
yaml.dump(self.run_info, f)



def log_interaction(self, response_info):
    """
    Logs one interaction with the model into its own folder inside the run directory.

    Writes two files into <run_dir>/<prompt_id>/:
    log.txt with the prompt and the response as plain text, and
    log.json with the task name, the template and the full response_info.

    Requires self.run_dir and self.run_info to be set (log_runner_info sets both).

    Parameters:
    -------------
    response_info: dict
        A dictionary of logged information from the interaction with the LLM.
        Must contain 'prompt_id', 'prompt' and 'response'.

    Raises:
    -------------
    FileExistsError
        If a folder for this prompt_id already exists in the run directory.
    """

    # Making this into a directory in case more files (possibly steps) were to be held in here
    self.prompt_dir = os.path.join(self.run_dir, response_info['prompt_id'])
    os.mkdir(self.prompt_dir)

    # Explicit UTF-8: model output may contain characters the platform's
    # default encoding cannot represent
    with open(os.path.join(self.prompt_dir, "log.txt"), 'w', encoding='utf-8') as f:
        f.write("------ prompt ------\n")
        f.write(f"{response_info['prompt']}\n\n")
        f.write("------ response ------\n")
        f.write(f"{response_info['response']}\n\n")

    step_trace = {
        'task_name': self.run_info['name'],
        'template': self.run_info['template'],
        'steps': {
            # In case a subtask had more than one step we can always make the 0 dynamic
            0: response_info,
        },
    }

    with open(os.path.join(self.prompt_dir, "log.json"), 'w', encoding='utf-8') as f:
        json.dump(step_trace, f, indent=4, cls=EnhancedJSONEncoder)

    # TODO: What can we benefit from the logger?
    # logger.info(f'Input: {prompt}')
    # logger.info(f'Output: {response}')



# def log_score(score):
# with open(os.path.join(run_log_dir, "run_info.yml"), 'r') as f:
# run_info = yaml.safe_load(f)

# step_trace['steps'][0]['score'] = score
Loading