From 679501c5519d0c8db3084c4db9ca7ae9cfe44637 Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Fri, 29 May 2026 18:44:46 +0000
Subject: [PATCH 1/4] Docs: Small fixes and all code-cells  weren't showing

---
 docs/source/demo.md  | 147 +++++++++++++++++++++++++++++++------------
 docs/source/index.md |   9 ---
 docs/source/pylib.md |  43 ++++++-------
 3 files changed, 125 insertions(+), 74 deletions(-)

diff --git a/docs/source/demo.md b/docs/source/demo.md
index 5880bf0..949850d 100644
--- a/docs/source/demo.md
+++ b/docs/source/demo.md
@@ -9,7 +9,7 @@ Installing from source means you can pull to update.
 
 First, clone the repo: 
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 git clone https://github.com/ml4sts/benchtools.git
 ```
@@ -27,7 +27,7 @@ Resolving deltas: 100% (513/513), done.
 
 
 See it creates a folder
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 ls
 ```
@@ -39,11 +39,11 @@ benchtools
 
 Then install: 
 ::::::{important}
-this needs to be `benchtools/` for it to be the path; `benchtools` will try to pull from pypi. Alternatively, `cd benchtools` then `pip install .`
+the following needs to be `benchtools/` for it to be the path; `benchtools` will try to pull from pypi. Alternatively, `cd benchtools` then `pip install .`
 :::::::
 
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 pip install benchtools/
 ```
@@ -72,7 +72,7 @@ the above is truncated, but the last few lines are the most important
 
 Benchrools is packaged with two demos, you can install them to create a copy and explore
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 benchtool demo list
 ```
@@ -89,13 +89,13 @@ listbench
 
 Let's examine the folder-based example first: 
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 benchtool demo install -n folderbench
 cd folderbench
 ```
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 ls
 ```
@@ -105,7 +105,7 @@ README.md	tasks
 ```
 
 The tasks folder is the main content: 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 cd tasks/
 :tags: ["skip-execution"]
@@ -118,7 +118,7 @@ add	symbols
 ```
 
 We can look inside one: 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 cd add/
 ls
@@ -145,13 +145,13 @@ a,b,reference
 ```
 
 :::::{important}
-The columns in the csv match the variables in `{}` in the template, plus a `reference` column for the answer (this can be empty, but the heading should be there), and optionally and `id` if you have an alternative naming scheme for the subtasks(each row is a subtask)
+The columns in the csv match the variables in `{}` in the template, plus a `reference` column for the answer (this can be empty, but the heading should be there), and optionally an `id` if you have an alternative naming scheme for the subtasks (each row is a subtask)
 :::::::
 
 we can look at the other task too:
 
 
-```{code-cell} console
+```{code-block} console
 :filename: sybmols/template.txt 
 what is the name for the following symbol? {symb}
 ```
@@ -169,17 +169,16 @@ symb, reference
 ## Running a benchmark
 
 let's install the other benchmark to run it
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 benchtool demo install -n listbench
-
 ```
 
 
 We can see the help for the command
 
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 benchtool run --help
 ```
@@ -207,7 +206,12 @@ this will be filled in later
 
 
 We can run a benchmark by name
-```{code-cell} bash
+
+::::{note}
+The default runner is ollama. Make sure ollama is running by calling `ollama serve` on another terminal window
+:::::::
+
+```{code-block} bash
 :tags: ["skip-execution"]
 benchtool run listbench/
 ```
@@ -217,7 +221,7 @@ Running list_bench now
 
 ```
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 cd listbench/
 :tags: ["skip-execution"]
@@ -233,7 +237,7 @@ it creates a `logs` folder if one does not already exist
 
 
 ### Exploring a yaml benchmark
-```{code-cell} console
+```{code-block} console
 :filename tasks.yml 
 - name: product
   template: "find the product of {a} and {b}"
@@ -250,7 +254,7 @@ it creates a `logs` folder if one does not already exist
   scorer: "contains"
 ```
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 ls logs/
 ```
@@ -262,18 +266,18 @@ gemma3
 
 there will be a folder per log
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 ls logs/gemma3/
 ```
 
 ```{code-block} console
-product	symbol
+product	product_combination symbol
 
 ```
 then per task
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 ls logs/gemma3/product/
 ```
@@ -284,7 +288,7 @@ ls logs/gemma3/product/
 ```
 then per run, named by the timestamp of the run start
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 ls logs/gemma3/product/1771533769/
 ```
@@ -294,7 +298,7 @@ product_2-3	product_3-4	product_5-5	run_info.yml
 
 ```
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 cat logs/gemma3/product/1771533769/run_info.yml 
 ```
@@ -324,7 +328,7 @@ values:
 ```
 it stored overall information for the run
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 ls logs/gemma3/product/1771533769/product_2-3/
 ```
@@ -335,7 +339,7 @@ log.json	log.txt
 ```
 
 and a log for each prompt in both text and json format
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 cat logs/gemma3/product/1771533769/product_2-3/log.txt 
 ```
@@ -352,7 +356,7 @@ So the answer is $\boxed{6}$.
 
 ```
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 cat logs/gemma3/product/1771533769/product_2-3/log.json 
 ```
@@ -374,7 +378,7 @@ cat logs/gemma3/product/1771533769/product_2-3/log.json
 
 ## Initializing a new benchmark
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 benchtool
 ```
@@ -389,13 +393,15 @@ Options:
 
 Commands:
   add-task  Set up a new task.
+  demo      demo benchmarks package with benchtools
   init      Initializes a new benchmark.
-  run       Running the benchmark and generating logs , help="The path to...
+  run       Run the benchmark, generate logs, and optionally score
   run-task  Running the tasks and generating logs
+  score     Running the benchmark and generating logs Parameters:...
 
 ```
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 benchtool init --help
 ```
@@ -419,7 +425,7 @@ Options:
 ```
 
 it asks questions interactively
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 benchtool init example --about 'in class example benchmark'
 Do you want to add any tasks now? [y/N]: y
@@ -434,7 +440,7 @@ Do you want to run the benchmark now? [Y/n]: n
 
 ```
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 ls
 ```
@@ -444,7 +450,7 @@ benchtools	example
 
 ```
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 cd example/
 ```
@@ -453,7 +459,7 @@ cd example/
 
 ```
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 ls
 ```
@@ -463,7 +469,7 @@ about.md	info.yml	tasks
 
 ```
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 cat info.yml 
 ```
@@ -478,7 +484,7 @@ tasks:
 
 ```
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 cat tasks.yml 
 ```
@@ -501,7 +507,8 @@ cat tasks.yml
 
 ```
 
-```{code-cell} bash
+Let's manually edit the task to make a good example.
+```{code-block} bash
 :tags: ["skip-execution"]
 nano tasks.yml 
 
@@ -509,10 +516,11 @@ nano tasks.yml
 
 ```{code-block} console
 - description: 'animal identifcaiton '
+  format: StringAnswer
   id_generator: concatenator_id_generator
   name: animal
   reference: ['zebra', 'tiger', "cheetah"]
-  scorer: exact_match
+  scorer: contains
   template: an animal has a {pattern}, {feet}, and {skin}. what kind of animal is it?
   values:
     pattern:
@@ -530,7 +538,7 @@ nano tasks.yml
 ```
 
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 ls
 about.md	info.yml	tasks.yml
@@ -539,12 +547,69 @@ about.md	info.yml	tasks.yml
 
 
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 benchtool run .
 ```
 
 
+```{code-block} bash
+:tags: ["skip-execution"]
+ls
+about.md	info.yml	logs tasks.yml
+
+```
+
+```{code-block} bash
+:tags: ["skip-execution"]
+ls logs/
+gemma3
+
+```
+
+```{code-block} bash
+:tags: ["skip-execution"]
+ls logs/gemma3/
+animal
+
+```
+
+```{code-block} bash
+:tags: ["skip-execution"]
+ls logs/gemma3/animal
+1780063281
+
+```
+
+```{code-block} bash
+:tags: ["skip-execution"]
+ls logs/gemma3/animal/1780063281
+animal_spots-hairy-paws  animal_stripes-hairy-hooves  animal_stripes-hairy-paws  run_info.yml
+
+```
+
+Let's run the scorer to score the LLM on the task at hand.
+
+```{code-block} bash
+:tags: ["skip-execution"]
+benchtool score .
+```
+
+```{code-block} console
+Saved Eval: ./eval_1780069448
+```
+
+```{code-block} bash
+:tags: ["skip-execution"]
+cat ./eval_1780069448.json
+```
+
+```{code-block} console
+[{"task_name": "animal", "template": "an animal has a {pattern}, {feet}, and {skin}. what kind of animal is it?", "prompt_id": "animal_spots-hairy-paws", "error": "None", "values": {"pattern": "spots", "skin": "hairy", "feet": "paws", "prompt_id": "animal_spots-hairy-paws"}, "steps": {"0": {"prompt": "an animal has a spots, paws, and hairy. what kind of animal is it?", "response": "{\n    \"answer\": \"A dog\"\n}\n", "score": 0}}, "model": "gemma3", "task": "animal", "run": "1780063281"}, {"task_name": "animal", "template": "an animal has a {pattern}, {feet}, and {skin}. what kind of animal is it?", "prompt_id": "animal_stripes-hairy-hooves", "error": "None", "values": {"pattern": "stripes", "skin": "hairy", "feet": "hooves", "prompt_id": "animal_stripes-hairy-hooves"}, "steps": {"0": {"prompt": "an animal has a stripes, hooves, and hairy. what kind of animal is it?", "response": "{\n  \"answer\": \"A zebra!\"\n}", "score": 1}}, "model": "gemma3", "task": "animal", "run": "1780063281"}, {"task_name": "animal", "template": "an animal has a {pattern}, {feet}, and {skin}. what kind of animal is it?", "prompt_id": "animal_stripes-hairy-paws", "error": "None", "values": {"pattern": "stripes", "skin": "hairy", "feet": "paws", "prompt_id": "animal_stripes-hairy-paws"}, "steps": {"0": {"prompt": "an animal has a stripes, paws, and hairy. what kind of animal is it?", "response": "{\n  \"answer\": \"A zebra!\"\n}\n", "score": 0}}, "model": "gemma3", "task": "animal", "run": "1780063281"}]
+```
+
+
+
 ## Get updates
 
 ::::{tip}
@@ -553,7 +618,7 @@ Watch the repo to get notifications for important updates
 
 Then update by pulling 
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 cd benchtools/
 git pull
@@ -562,7 +627,7 @@ git pull
 and re-installing: 
 
 
-```{code-cell} bash
+```{code-block} bash
 :tags: ["skip-execution"]
 pip install .
 ```
diff --git a/docs/source/index.md b/docs/source/index.md
index 9679341..dd44aac 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -71,12 +71,3 @@ pylib.md
 concept.md
 demo.md
 ```
-
-
-```{eval-rst}
-.. click:: benchtools.cli:benchtool
-   :prog: benchtools 
-   :nested: full
-   :commands:
-
-```
\ No newline at end of file
diff --git a/docs/source/pylib.md b/docs/source/pylib.md
index 42df05e..79b5c0c 100644
--- a/docs/source/pylib.md
+++ b/docs/source/pylib.md
@@ -30,7 +30,13 @@ from benchtools import Task
 
 tt = Task('greeting','Hello there','hi', 'contains')
 ```
-<!-- there -->
+
+<!-- Doesn't really run anything 
+File "/work/pi_brownsarahm_uri_edu/ayman_uri/BenchTools/benchtools/benchtools/task.py", line 497, in run
+    for (prompt_id, prompt),values in zip(id_prompt_list,self.variant_values):
+                                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+TypeError: 'NoneType' object is not iterable
+ -->
 
 ```{code-cell}
 response = tt.run()
@@ -46,16 +52,17 @@ tiny_bench.add_task(tt)
 
 There are multiple ways to creating a Task object
 ```
-add_task = Task.from_txt_csv('../../demos/folderbench/tasks/add')
+add_task = Task.from_txt_csv('benchtools/assets/demos/folderbench/tasks/add')
 tiny_bench.add_task(add_task)
 ```
 
 For demo purposes we delete the folder, if it exists, before running. 
-```{code-cell}
-%%bash
+```{code-cell} bash
 rm  -rf tiniest_demo
 ```
 
+<!-- Same problem with run -->
+
 We create a new folder for a benchmark to store it in the file system
 ```{code-cell}
 tiny_bench.initialize_dir()
@@ -64,7 +71,7 @@ tiny_bench.run()
 
 
 ```{code-cell}
-pre_built_yml = Bench.from_yaml('../../demos/listbench')
+pre_built_yml = Bench.from_yaml('benchtools/assets/demos/listbench')
 pre_built_yml.written
 ```
 
@@ -74,36 +81,26 @@ we can access individual tasks:
 pre_built_yml.tasks['product'].variant_values
 ```
 
+```
+[{'a': 2, 'b': 3}, {'a': 3, 'b': 4}, {'a': 5, 'b': 5}]
+```
 
+Make sure you have `ollama serve` running to run the benchmark
 
 ```{code-cell}
 pre_built_yml.run()
 ```
 
-```{code-cell}
-demo_bench = Bench.from_yaml('../../demos/listbench')
-```
-
-
-<!-- ```{code-cell}
-demo_bench = Bench.load('../../demobench')
-``` 
--->
-
-
+Logs will be found in `benchtools/assets/demos/listbench/logs`
 
 
-
-## Creating a Benchmark object
-<!-- Testing which is better -->
-
+## Runner class
 ```{eval-rst}
 .. automodule:: benchtools.runner
     :members:
 ```
 
 ## Benchmark class
-<!-- Testing which is better -->
 ```{eval-rst}
 .. autoclass:: benchtools.benchmark.Bench
     :members:
@@ -111,7 +108,6 @@ demo_bench = Bench.load('../../demobench')
 
 
 ## Task class
-<!-- Testing which is better -->
 ```{eval-rst}
 .. autoclass:: benchtools.task.Task
     :members:
@@ -120,8 +116,7 @@ demo_bench = Bench.load('../../demobench')
 
 
 ## BetterBench
-<!-- Testing which is better -->
 ```{eval-rst}
-.. autoclass:: benchtools.task.Task
+.. autoclass:: benchtools.betterbench.BetterCheckList
     :members:
 ```
\ No newline at end of file

From a2cd27b7bd7f95689d8938d55b42874fa9335607 Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Fri, 29 May 2026 18:45:21 +0000
Subject: [PATCH 2/4] CLI: Doc string fix

---
 benchtools/cli.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/benchtools/cli.py b/benchtools/cli.py
index 77455d4..3930b94 100644
--- a/benchtools/cli.py
+++ b/benchtools/cli.py
@@ -217,8 +217,10 @@ def run(benchmark_path: str, runner_type: str,
 def score(benchmark_path: str, result_id,csv,collate):
     """
     Running the benchmark and generating logs
+    -----------
     Parameters:
-        benchmark-path: The path to the benchmark repository where all the task reside.
+        benchmark-path: str
+            The path to the benchmark repository where all the task reside.
     """
     # if not provided do the last one for each model-task combination
     

From 15c73bfb72634a93e1f43dabc32de4f5184993c6 Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Wed, 10 Jun 2026 18:48:23 +0000
Subject: [PATCH 3/4] docs: removing mention of ollama serve

---
 docs/source/demo.md  | 2 +-
 docs/source/pylib.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/demo.md b/docs/source/demo.md
index 949850d..64c2787 100644
--- a/docs/source/demo.md
+++ b/docs/source/demo.md
@@ -208,7 +208,7 @@ this will be filled in later
 We can run a benchmark by name
 
 ::::{note}
-The default runner is ollama. Make sure ollama is running by calling `ollama serve` on another terminal window
+The default runner is ollama. Make sure `ollama` is running in advance.
 :::::::
 
 ```{code-block} bash
diff --git a/docs/source/pylib.md b/docs/source/pylib.md
index 79b5c0c..9acecbc 100644
--- a/docs/source/pylib.md
+++ b/docs/source/pylib.md
@@ -85,7 +85,7 @@ pre_built_yml.tasks['product'].variant_values
 [{'a': 2, 'b': 3}, {'a': 3, 'b': 4}, {'a': 5, 'b': 5}]
 ```
 
-Make sure you have `ollama serve` running to run the benchmark
+Make sure `ollama` is running in advance on your system to run the benchmark
 
 ```{code-cell}
 pre_built_yml.run()

From f559e0428317db4a25c2a96e7ca82ccb9b17c720 Mon Sep 17 00:00:00 2001
From: AymanBx <ayman_sandouk@uri.edu>
Date: Wed, 10 Jun 2026 18:52:29 +0000
Subject: [PATCH 4/4] docs: explaining steps in demo.md

---
 docs/source/demo.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/source/demo.md b/docs/source/demo.md
index 64c2787..efc0198 100644
--- a/docs/source/demo.md
+++ b/docs/source/demo.md
@@ -552,6 +552,7 @@ about.md	info.yml	tasks.yml
 benchtool run .
 ```
 
+Let's see what happened after running the benchmark
 
 ```{code-block} bash
 :tags: ["skip-execution"]
@@ -560,6 +561,7 @@ about.md	info.yml	logs tasks.yml
 
 ```
 
+Now we have a new `logs/` folder. Let's explore its contents:
 ```{code-block} bash
 :tags: ["skip-execution"]
 ls logs/
@@ -580,6 +582,8 @@ ls logs/gemma3/animal
 1780063281
 
 ```
+`1780063281` is the timestamp of when the run started; each timestamped folder represents a single run of the benchmark.
+
 
 ```{code-block} bash
 :tags: ["skip-execution"]
@@ -587,6 +591,8 @@ ls logs/gemma3/animal/1780063281
 animal_spots-hairy-paws  animal_stripes-hairy-hooves  animal_stripes-hairy-paws  run_info.yml
 
 ```
+Each run has one folder per subtask, alongside a `run_info.yml` file.
+
 
 Let's run the scorer to score the LLM on the task at hand.