From fcdb39c9b0b1cc545003a3ff77b52e5796546b80 Mon Sep 17 00:00:00 2001
From: Abhinavexist <abhinav@interfaze.ai>
Date: Wed, 20 May 2026 22:24:24 +0530
Subject: [PATCH] feat(results): add gemini-3.5-flash

---
 .../response_gemini-3.5-flash_audio.jsonl     |   3 +
 .../audio/gemini-3.5-flash/eval_records.jsonl |   3 +
 .../audio/gemini-3.5-flash/eval_summary.json  | 264 +++++++++++
 .../image/gemini-3.5-flash/eval_records.jsonl |   3 +
 .../image/gemini-3.5-flash/eval_summary.json  | 430 ++++++++++++++++++
 .../text/gemini-3.5-flash/eval_records.jsonl  |   3 +
 .../text/gemini-3.5-flash/eval_summary.json   | 264 +++++++++++
 .../response_gemini-3.5-flash_image.jsonl     |   3 +
 .../response_gemini-3.5-flash.jsonl           |   3 +
 9 files changed, 976 insertions(+)
 create mode 100644 data/audio_responses/response_gemini-3.5-flash_audio.jsonl
 create mode 100644 data/evaluation/audio/gemini-3.5-flash/eval_records.jsonl
 create mode 100644 data/evaluation/audio/gemini-3.5-flash/eval_summary.json
 create mode 100644 data/evaluation/image/gemini-3.5-flash/eval_records.jsonl
 create mode 100644 data/evaluation/image/gemini-3.5-flash/eval_summary.json
 create mode 100644 data/evaluation/text/gemini-3.5-flash/eval_records.jsonl
 create mode 100644 data/evaluation/text/gemini-3.5-flash/eval_summary.json
 create mode 100644 data/images_responses/response_gemini-3.5-flash_image.jsonl
 create mode 100644 data/text_responses/response_gemini-3.5-flash.jsonl

diff --git a/data/audio_responses/response_gemini-3.5-flash_audio.jsonl b/data/audio_responses/response_gemini-3.5-flash_audio.jsonl
new file mode 100644
index 0000000..f8b5517
--- /dev/null
+++ b/data/audio_responses/response_gemini-3.5-flash_audio.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:721a0c1e246230245609882be6a26c1a0f8e9396c11951308b556749d0ca2607
+size 5678744
diff --git a/data/evaluation/audio/gemini-3.5-flash/eval_records.jsonl b/data/evaluation/audio/gemini-3.5-flash/eval_records.jsonl
new file mode 100644
index 0000000..ddc5812
--- /dev/null
+++ b/data/evaluation/audio/gemini-3.5-flash/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:152402dfe306ff56b139f28e259419dacd578b6d43513c9e851f259f7b7b8f12
+size 69026
diff --git a/data/evaluation/audio/gemini-3.5-flash/eval_summary.json b/data/evaluation/audio/gemini-3.5-flash/eval_summary.json
new file mode 100644
index 0000000..5a800ea
--- /dev/null
+++ b/data/evaluation/audio/gemini-3.5-flash/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/audio_responses/response_gemini-3.5-flash_audio.jsonl",
+  "num_records": 115,
+  "model_ids": [
+    "gemini-3.5-flash"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 7,
+    "json_non_structured_root_count": 7,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9391304347826087,
+          "ci95_low": 0.8956521739130435,
+          "ci95_high": 0.9739130434782609,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9391304347826087,
+          "ci95_low": 0.8956521739130435,
+          "ci95_high": 0.9739130434782609,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.808695652173913,
+          "ci95_low": 0.7304347826086957,
+          "ci95_high": 0.8782608695652174,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.18993894462698693,
+          "ci95_low": 0.15206000100577555,
+          "ci95_high": 0.22716980671040382,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.40324767322861155,
+          "ci95_low": 0.35727945846336234,
+          "ci95_high": 0.45624885267548737,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.7107172807141823,
+          "ci95_low": 0.6470438576428221,
+          "ci95_high": 0.7725613577146161,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.7382157663542203,
+          "ci95_low": 0.6587174942858072,
+          "ci95_high": 0.803243944413393,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.808695652173913,
+          "ci95_low": 0.7391304347826086,
+          "ci95_high": 0.8782608695652174,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.43463463285659354,
+          "ci95_low": 0.387603928399322,
+          "ci95_high": 0.47748502460229886,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.7852023569006821,
+          "ci95_low": 0.7095056773386922,
+          "ci95_high": 0.8531871629788458,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.2965933089277993,
+          "ci95_low": 0.2565178442680253,
+          "ci95_high": 0.3383158264807672,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.8521739130434782,
+          "ci95_low": 0.7971014492753623,
+          "ci95_high": 0.9072463768115941,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9387755102040817,
+          "ci95_low": 0.8944281524926686,
+          "ci95_high": 0.9824561403508771,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9387755102040817,
+          "ci95_low": 0.8950437317784257,
+          "ci95_high": 0.9739130434782609,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8075801749271136,
+          "ci95_low": 0.7376093294460642,
+          "ci95_high": 0.8856304985337243,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.19104646033909764,
+          "ci95_low": 0.156018248465694,
+          "ci95_high": 0.22572981009198756,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.4021751481319986,
+          "ci95_low": 0.35097915092806337,
+          "ci95_high": 0.44979258860329846,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.7094489869278406,
+          "ci95_low": 0.6397157902811849,
+          "ci95_high": 0.7691149337349868,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.7369069547138544,
+          "ci95_low": 0.6727107601491489,
+          "ci95_high": 0.799727568544498,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8075801749271136,
+          "ci95_low": 0.7296511627906976,
+          "ci95_high": 0.8691860465116279,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.4342235317996457,
+          "ci95_low": 0.38758707762790273,
+          "ci95_high": 0.4808381927276586,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.7840224348560273,
+          "ci95_low": 0.714558882179947,
+          "ci95_high": 0.8546304987035785,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.29661080423554814,
+          "ci95_low": 0.2589885152565194,
+          "ci95_high": 0.3383396937927052,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.8513119533527697,
+          "ci95_low": 0.7877906976744186,
+          "ci95_high": 0.9072463768115943,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/evaluation/image/gemini-3.5-flash/eval_records.jsonl b/data/evaluation/image/gemini-3.5-flash/eval_records.jsonl
new file mode 100644
index 0000000..d09c48b
--- /dev/null
+++ b/data/evaluation/image/gemini-3.5-flash/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5dbcaaeb03b32dd4c45a4884cac2da0bbad9ddd1d7acc49f097441c28034fc7
+size 172802
diff --git a/data/evaluation/image/gemini-3.5-flash/eval_summary.json b/data/evaluation/image/gemini-3.5-flash/eval_summary.json
new file mode 100644
index 0000000..b7fc1f1
--- /dev/null
+++ b/data/evaluation/image/gemini-3.5-flash/eval_summary.json
@@ -0,0 +1,430 @@
+{
+  "response_file": "data/images_responses/response_gemini-3.5-flash_image.jsonl",
+  "num_records": 209,
+  "model_ids": [
+    "gemini-3.5-flash"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 6,
+    "json_non_structured_root_count": 6,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9712918660287081,
+          "ci95_low": 0.9473684210526315,
+          "ci95_high": 0.9904306220095693,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9712918660287081,
+          "ci95_low": 0.9473684210526315,
+          "ci95_high": 0.9904306220095693,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8564593301435407,
+          "ci95_low": 0.8038277511961722,
+          "ci95_high": 0.9043062200956937,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.6014651317790386,
+          "ci95_low": 0.5540684739454232,
+          "ci95_high": 0.6487739292148866,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.741961845694227,
+          "ci95_low": 0.6921627473674945,
+          "ci95_high": 0.7871633633815889,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8354730711371612,
+          "ci95_low": 0.7853333688015341,
+          "ci95_high": 0.8840432995362576,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8353013262977563,
+          "ci95_low": 0.7886467625159835,
+          "ci95_high": 0.878037253509644,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8564593301435407,
+          "ci95_low": 0.8038277511961722,
+          "ci95_high": 0.8995215311004785,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.11483253588516747,
+          "ci95_low": 0.07177033492822966,
+          "ci95_high": 0.15789473684210525,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.7263000162034755,
+          "ci95_low": 0.6846808359237468,
+          "ci95_high": 0.7686229073363774,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8494066621949459,
+          "ci95_low": 0.8012498541109946,
+          "ci95_high": 0.896855753596863,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6717134887366328,
+          "ci95_low": 0.622768431170637,
+          "ci95_high": 0.7150223490753279,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.8947368421052632,
+          "ci95_low": 0.8564593301435407,
+          "ci95_high": 0.9282296650717703,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.11483253588516747,
+          "ci95_low": 0.07177033492822966,
+          "ci95_high": 0.15789473684210525,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9717607973421927,
+          "ci95_low": 0.9456342668863262,
+          "ci95_high": 0.9916387959866221,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9717607973421927,
+          "ci95_low": 0.9450915141430949,
+          "ci95_high": 0.9917218543046358,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8538205980066446,
+          "ci95_low": 0.802653399668325,
+          "ci95_high": 0.9001663893510815,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.6051763391228808,
+          "ci95_low": 0.5549673432523973,
+          "ci95_high": 0.6531048353319622,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.7431579029519458,
+          "ci95_low": 0.6958644254954833,
+          "ci95_high": 0.7936413480713294,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8330161781679796,
+          "ci95_low": 0.7845761059331688,
+          "ci95_high": 0.8826989939630487,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8328610077177904,
+          "ci95_low": 0.7800757447429821,
+          "ci95_high": 0.8781871964934298,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8538205980066446,
+          "ci95_low": 0.8056478405315615,
+          "ci95_high": 0.9046822742474916,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.11461794019933555,
+          "ci95_low": 0.07154742096505824,
+          "ci95_high": 0.15841584158415842,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.7271168067476022,
+          "ci95_low": 0.6776239464179038,
+          "ci95_high": 0.7701048739758077,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8468340679103598,
+          "ci95_low": 0.7958212107324952,
+          "ci95_high": 0.8917200475632132,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6741671210374133,
+          "ci95_low": 0.6253454419115949,
+          "ci95_high": 0.7167788555599995,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.8931339977851606,
+          "ci95_low": 0.8525852585258525,
+          "ci95_high": 0.9294444444444444,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.11461794019933555,
+          "ci95_low": 0.07190635451505016,
+          "ci95_high": 0.15728476821192053,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  },
+  "error_analysis": {
+    "top_missing_gt_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].name",
+        "count": 1
+      }
+    ],
+    "top_missing_required_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "year",
+        "count": 1
+      },
+      {
+        "path": "schedule",
+        "count": 1
+      },
+      {
+        "path": "schedule[].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "benefits_of_previews",
+        "count": 1
+      },
+      {
+        "path": "drawbacks_of_previews",
+        "count": 1
+      },
+      {
+        "path": "judicial_idiosyncrasy_definition",
+        "count": 1
+      },
+      {
+        "path": "judicial_idiosyncrasy_definition.term",
+        "count": 1
+      },
+      {
+        "path": "judicial_idiosyncrasy_definition.meaning",
+        "count": 1
+      },
+      {
+        "path": "core_tradeoff_question",
+        "count": 1
+      },
+      {
+        "path": "other_laser_types",
+        "count": 1
+      },
+      {
+        "path": "other_laser_types[].name",
+        "count": 1
+      },
+      {
+        "path": "conclusions",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].id",
+        "count": 1
+      }
+    ]
+  }
+}
diff --git a/data/evaluation/text/gemini-3.5-flash/eval_records.jsonl b/data/evaluation/text/gemini-3.5-flash/eval_records.jsonl
new file mode 100644
index 0000000..7b0fe9f
--- /dev/null
+++ b/data/evaluation/text/gemini-3.5-flash/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:435fc33e0010fd72516482b9304205b3dd488c47f370e3386f02da77778407c5
+size 2855136
diff --git a/data/evaluation/text/gemini-3.5-flash/eval_summary.json b/data/evaluation/text/gemini-3.5-flash/eval_summary.json
new file mode 100644
index 0000000..f021983
--- /dev/null
+++ b/data/evaluation/text/gemini-3.5-flash/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/text_responses/response_gemini-3.5-flash.jsonl",
+  "num_records": 5000,
+  "model_ids": [
+    "gemini-3.5-flash"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 17,
+    "json_non_structured_root_count": 17,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9966,
+          "ci95_low": 0.995,
+          "ci95_high": 0.9982,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9966,
+          "ci95_low": 0.9948,
+          "ci95_high": 0.998,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9732,
+          "ci95_low": 0.9686,
+          "ci95_high": 0.9776,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.8324566979883,
+          "ci95_low": 0.825042455877456,
+          "ci95_high": 0.8403134896059169,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.886469431440733,
+          "ci95_low": 0.8794422950905413,
+          "ci95_high": 0.8935675914105855,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9693818561788756,
+          "ci95_low": 0.9645540825844983,
+          "ci95_high": 0.9738200277759402,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.967588264123479,
+          "ci95_low": 0.9631046583062611,
+          "ci95_high": 0.9718631868659059,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9732,
+          "ci95_low": 0.9688,
+          "ci95_high": 0.9778,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.5268,
+          "ci95_low": 0.5128,
+          "ci95_high": 0.5414,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.896102661869303,
+          "ci95_low": 0.8899852390935324,
+          "ci95_high": 0.9024184617079747,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.971329421374493,
+          "ci95_low": 0.9668691425858724,
+          "ci95_high": 0.9758260217521556,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.8594630647145164,
+          "ci95_low": 0.8518997488919624,
+          "ci95_high": 0.8667605495992109,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.981,
+          "ci95_low": 0.9774666666666666,
+          "ci95_high": 0.984,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.5268,
+          "ci95_low": 0.5128,
+          "ci95_high": 0.541,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9963995710127164,
+          "ci95_low": 0.9945784972510691,
+          "ci95_high": 0.9980012300123001,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9963995710127164,
+          "ci95_low": 0.9944645191050973,
+          "ci95_high": 0.998003838771593,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9707369388693121,
+          "ci95_low": 0.9652406417112299,
+          "ci95_high": 0.9759036144578314,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.8274195361236903,
+          "ci95_low": 0.819589853286067,
+          "ci95_high": 0.8354320486173253,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.881461459164986,
+          "ci95_low": 0.8734278984392413,
+          "ci95_high": 0.8888203085519446,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9667965123902225,
+          "ci95_low": 0.9613697495130022,
+          "ci95_high": 0.9714925816045125,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9649084377802197,
+          "ci95_low": 0.95986301229426,
+          "ci95_high": 0.9698458511672041,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9707369388693121,
+          "ci95_low": 0.9661121380160197,
+          "ci95_high": 0.9756679756679757,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.5147081354374138,
+          "ci95_low": 0.500878063678705,
+          "ci95_high": 0.5284086564196682,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.8918925025596328,
+          "ci95_low": 0.8854723280902367,
+          "ci95_high": 0.8981861958256669,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.968794105172948,
+          "ci95_low": 0.9638577430191485,
+          "ci95_high": 0.9737512649515937,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.8544404976443382,
+          "ci95_low": 0.8471741313520348,
+          "ci95_high": 0.8626272042893021,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9792911495837802,
+          "ci95_low": 0.9757780162501917,
+          "ci95_high": 0.9826979023120502,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.5147081354374138,
+          "ci95_low": 0.5004213590745422,
+          "ci95_high": 0.528395627914087,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/images_responses/response_gemini-3.5-flash_image.jsonl b/data/images_responses/response_gemini-3.5-flash_image.jsonl
new file mode 100644
index 0000000..44a760b
--- /dev/null
+++ b/data/images_responses/response_gemini-3.5-flash_image.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49caf1414b9548ba63297689556f496acb631aeed29e59c8def3870b8aa93075
+size 1935885
diff --git a/data/text_responses/response_gemini-3.5-flash.jsonl b/data/text_responses/response_gemini-3.5-flash.jsonl
new file mode 100644
index 0000000..f57e1e4
--- /dev/null
+++ b/data/text_responses/response_gemini-3.5-flash.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0289226325645944fd3d35495f11b898938a2e9254e784387e6f1dd956930830
+size 39086464