From fcdb39c9b0b1cc545003a3ff77b52e5796546b80 Mon Sep 17 00:00:00 2001 From: Abhinavexist Date: Wed, 20 May 2026 22:24:24 +0530 Subject: [PATCH] feat(results): add gemini-3.5-flash --- .../response_gemini-3.5-flash_audio.jsonl | 3 + .../audio/gemini-3.5-flash/eval_records.jsonl | 3 + .../audio/gemini-3.5-flash/eval_summary.json | 264 +++++++++++ .../image/gemini-3.5-flash/eval_records.jsonl | 3 + .../image/gemini-3.5-flash/eval_summary.json | 430 ++++++++++++++++++ .../text/gemini-3.5-flash/eval_records.jsonl | 3 + .../text/gemini-3.5-flash/eval_summary.json | 264 +++++++++++ .../response_gemini-3.5-flash_image.jsonl | 3 + .../response_gemini-3.5-flash.jsonl | 3 + 9 files changed, 976 insertions(+) create mode 100644 data/audio_responses/response_gemini-3.5-flash_audio.jsonl create mode 100644 data/evaluation/audio/gemini-3.5-flash/eval_records.jsonl create mode 100644 data/evaluation/audio/gemini-3.5-flash/eval_summary.json create mode 100644 data/evaluation/image/gemini-3.5-flash/eval_records.jsonl create mode 100644 data/evaluation/image/gemini-3.5-flash/eval_summary.json create mode 100644 data/evaluation/text/gemini-3.5-flash/eval_records.jsonl create mode 100644 data/evaluation/text/gemini-3.5-flash/eval_summary.json create mode 100644 data/images_responses/response_gemini-3.5-flash_image.jsonl create mode 100644 data/text_responses/response_gemini-3.5-flash.jsonl diff --git a/data/audio_responses/response_gemini-3.5-flash_audio.jsonl b/data/audio_responses/response_gemini-3.5-flash_audio.jsonl new file mode 100644 index 0000000..f8b5517 --- /dev/null +++ b/data/audio_responses/response_gemini-3.5-flash_audio.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:721a0c1e246230245609882be6a26c1a0f8e9396c11951308b556749d0ca2607 +size 5678744 diff --git a/data/evaluation/audio/gemini-3.5-flash/eval_records.jsonl b/data/evaluation/audio/gemini-3.5-flash/eval_records.jsonl new file mode 100644 index 0000000..ddc5812 --- /dev/null +++ b/data/evaluation/audio/gemini-3.5-flash/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:152402dfe306ff56b139f28e259419dacd578b6d43513c9e851f259f7b7b8f12 +size 69026 diff --git a/data/evaluation/audio/gemini-3.5-flash/eval_summary.json b/data/evaluation/audio/gemini-3.5-flash/eval_summary.json new file mode 100644 index 0000000..5a800ea --- /dev/null +++ b/data/evaluation/audio/gemini-3.5-flash/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/audio_responses/response_gemini-3.5-flash_audio.jsonl", + "num_records": 115, + "model_ids": [ + "gemini-3.5-flash" + ], + "data_quality": { + "json_parse_fail_count": 7, + "json_non_structured_root_count": 7, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 0.9391304347826087, + "ci95_low": 0.8956521739130435, + "ci95_high": 0.9739130434782609, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9391304347826087, + "ci95_low": 0.8956521739130435, + "ci95_high": 0.9739130434782609, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.808695652173913, + "ci95_low": 0.7304347826086957, + "ci95_high": 0.8782608695652174, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.18993894462698693, + "ci95_low": 0.15206000100577555, + "ci95_high": 0.22716980671040382, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.40324767322861155, + "ci95_low": 0.35727945846336234, + "ci95_high": 0.45624885267548737, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.7107172807141823, + "ci95_low": 0.6470438576428221, + "ci95_high": 0.7725613577146161, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.7382157663542203, + "ci95_low": 0.6587174942858072, + "ci95_high": 0.803243944413393, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.808695652173913, + "ci95_low": 0.7391304347826086, + "ci95_high": 0.8782608695652174, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.43463463285659354, + "ci95_low": 0.387603928399322, + "ci95_high": 0.47748502460229886, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.7852023569006821, + "ci95_low": 0.7095056773386922, + "ci95_high": 0.8531871629788458, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.2965933089277993, + "ci95_low": 0.2565178442680253, + "ci95_high": 0.3383158264807672, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.8521739130434782, + "ci95_low": 0.7971014492753623, + "ci95_high": 0.9072463768115941, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 0.9387755102040817, + "ci95_low": 0.8944281524926686, + "ci95_high": 0.9824561403508771, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9387755102040817, + "ci95_low": 0.8950437317784257, + "ci95_high": 0.9739130434782609, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8075801749271136, + "ci95_low": 0.7376093294460642, + "ci95_high": 0.8856304985337243, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.19104646033909764, + "ci95_low": 0.156018248465694, + "ci95_high": 0.22572981009198756, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.4021751481319986, + "ci95_low": 0.35097915092806337, + "ci95_high": 0.44979258860329846, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.7094489869278406, + "ci95_low": 0.6397157902811849, + "ci95_high": 0.7691149337349868, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.7369069547138544, + "ci95_low": 0.6727107601491489, + "ci95_high": 0.799727568544498, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8075801749271136, + "ci95_low": 0.7296511627906976, + "ci95_high": 0.8691860465116279, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.4342235317996457, + "ci95_low": 0.38758707762790273, + "ci95_high": 0.4808381927276586, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.7840224348560273, + "ci95_low": 0.714558882179947, + "ci95_high": 0.8546304987035785, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.29661080423554814, + "ci95_low": 0.2589885152565194, + "ci95_high": 0.3383396937927052, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.8513119533527697, + "ci95_low": 0.7877906976744186, + "ci95_high": 0.9072463768115943, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/evaluation/image/gemini-3.5-flash/eval_records.jsonl b/data/evaluation/image/gemini-3.5-flash/eval_records.jsonl new file mode 100644 index 0000000..d09c48b --- /dev/null +++ b/data/evaluation/image/gemini-3.5-flash/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5dbcaaeb03b32dd4c45a4884cac2da0bbad9ddd1d7acc49f097441c28034fc7 +size 172802 diff --git a/data/evaluation/image/gemini-3.5-flash/eval_summary.json b/data/evaluation/image/gemini-3.5-flash/eval_summary.json new file mode 100644 index 0000000..b7fc1f1 --- /dev/null +++ b/data/evaluation/image/gemini-3.5-flash/eval_summary.json @@ -0,0 +1,430 @@ +{ + "response_file": "data/images_responses/response_gemini-3.5-flash_image.jsonl", + "num_records": 209, + "model_ids": [ + "gemini-3.5-flash" + ], + "data_quality": { + "json_parse_fail_count": 6, + "json_non_structured_root_count": 6, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9712918660287081, + "ci95_low": 0.9473684210526315, + "ci95_high": 0.9904306220095693, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9712918660287081, + "ci95_low": 0.9473684210526315, + "ci95_high": 0.9904306220095693, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8564593301435407, + "ci95_low": 0.8038277511961722, + "ci95_high": 0.9043062200956937, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.6014651317790386, + "ci95_low": 0.5540684739454232, + "ci95_high": 0.6487739292148866, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.741961845694227, + "ci95_low": 0.6921627473674945, + "ci95_high": 0.7871633633815889, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8354730711371612, + "ci95_low": 0.7853333688015341, + "ci95_high": 0.8840432995362576, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8353013262977563, + "ci95_low": 0.7886467625159835, + "ci95_high": 0.878037253509644, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8564593301435407, + "ci95_low": 0.8038277511961722, + "ci95_high": 0.8995215311004785, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.11483253588516747, + "ci95_low": 0.07177033492822966, + "ci95_high": 0.15789473684210525, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.7263000162034755, + "ci95_low": 0.6846808359237468, + "ci95_high": 0.7686229073363774, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8494066621949459, + "ci95_low": 0.8012498541109946, + "ci95_high": 0.896855753596863, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6717134887366328, + "ci95_low": 0.622768431170637, + "ci95_high": 0.7150223490753279, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.8947368421052632, + "ci95_low": 0.8564593301435407, + "ci95_high": 0.9282296650717703, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.11483253588516747, + "ci95_low": 0.07177033492822966, + "ci95_high": 0.15789473684210525, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9717607973421927, + "ci95_low": 0.9456342668863262, + "ci95_high": 0.9916387959866221, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9717607973421927, + "ci95_low": 0.9450915141430949, + "ci95_high": 0.9917218543046358, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8538205980066446, + "ci95_low": 0.802653399668325, + "ci95_high": 0.9001663893510815, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.6051763391228808, + "ci95_low": 0.5549673432523973, + "ci95_high": 0.6531048353319622, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.7431579029519458, + "ci95_low": 0.6958644254954833, + "ci95_high": 0.7936413480713294, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8330161781679796, + "ci95_low": 0.7845761059331688, + "ci95_high": 0.8826989939630487, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8328610077177904, + "ci95_low": 0.7800757447429821, + "ci95_high": 0.8781871964934298, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8538205980066446, + "ci95_low": 0.8056478405315615, + "ci95_high": 0.9046822742474916, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.11461794019933555, + "ci95_low": 0.07154742096505824, + "ci95_high": 0.15841584158415842, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.7271168067476022, + "ci95_low": 0.6776239464179038, + "ci95_high": 0.7701048739758077, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8468340679103598, + "ci95_low": 0.7958212107324952, + "ci95_high": 0.8917200475632132, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6741671210374133, + "ci95_low": 0.6253454419115949, + "ci95_high": 0.7167788555599995, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.8931339977851606, + "ci95_low": 0.8525852585258525, + "ci95_high": 0.9294444444444444, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.11461794019933555, + "ci95_low": 0.07190635451505016, + "ci95_high": 0.15728476821192053, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + }, + "error_analysis": { + "top_missing_gt_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "schedule[0].date", + "count": 1 + }, + { + "path": "schedule[0].day_of_week", + "count": 1 + }, + { + "path": "schedule[0].events[0].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[0].name", + "count": 1 + }, + { + "path": "schedule[0].events[0].time", + "count": 1 + }, + { + "path": "schedule[0].events[1].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[1].name", + "count": 1 + }, + { + "path": "schedule[0].events[1].time", + "count": 1 + }, + { + "path": "schedule[0].events[2].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[2].name", + "count": 1 + }, + { + "path": "schedule[0].events[2].time", + "count": 1 + }, + { + "path": "schedule[0].events[3].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[3].name", + "count": 1 + }, + { + "path": "schedule[0].events[3].time", + "count": 1 + }, + { + "path": "schedule[0].events[4].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[4].name", + "count": 1 + }, + { + "path": "schedule[0].events[4].time", + "count": 1 + }, + { + "path": "schedule[0].events[5].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[5].name", + "count": 1 + } + ], + "top_missing_required_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "year", + "count": 1 + }, + { + "path": "schedule", + "count": 1 + }, + { + "path": "schedule[].day_of_week", + "count": 1 + }, + { + "path": "schedule[].date", + "count": 1 + }, + { + "path": "schedule[].events", + "count": 1 + }, + { + "path": "schedule[].events[].time", + "count": 1 + }, + { + "path": "schedule[].events[].name", + "count": 1 + }, + { + "path": "schedule[].events[].is_meeting", + "count": 1 + }, + { + "path": "benefits_of_previews", + "count": 1 + }, + { + "path": "drawbacks_of_previews", + "count": 1 + }, + { + "path": "judicial_idiosyncrasy_definition", + "count": 1 + }, + { + "path": "judicial_idiosyncrasy_definition.term", + "count": 1 + }, + { + "path": "judicial_idiosyncrasy_definition.meaning", + "count": 1 + }, + { + "path": "core_tradeoff_question", + "count": 1 + }, + { + "path": "other_laser_types", + "count": 1 + }, + { + "path": "other_laser_types[].name", + "count": 1 + }, + { + "path": "conclusions", + "count": 1 + }, + { + "path": "recommended_bibliography", + "count": 1 + }, + { + "path": "recommended_bibliography[].id", + "count": 1 + } + ] + } +} diff --git a/data/evaluation/text/gemini-3.5-flash/eval_records.jsonl b/data/evaluation/text/gemini-3.5-flash/eval_records.jsonl new file mode 100644 index 0000000..7b0fe9f --- /dev/null +++ b/data/evaluation/text/gemini-3.5-flash/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:435fc33e0010fd72516482b9304205b3dd488c47f370e3386f02da77778407c5 +size 2855136 diff --git a/data/evaluation/text/gemini-3.5-flash/eval_summary.json b/data/evaluation/text/gemini-3.5-flash/eval_summary.json new file mode 100644 index 0000000..f021983 --- /dev/null +++ b/data/evaluation/text/gemini-3.5-flash/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/text_responses/response_gemini-3.5-flash.jsonl", + "num_records": 5000, + "model_ids": [ + "gemini-3.5-flash" + ], + "data_quality": { + "json_parse_fail_count": 17, + "json_non_structured_root_count": 17, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 0.9966, + "ci95_low": 0.995, + "ci95_high": 0.9982, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9966, + "ci95_low": 0.9948, + "ci95_high": 0.998, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9732, + "ci95_low": 0.9686, + "ci95_high": 0.9776, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.8324566979883, + "ci95_low": 0.825042455877456, + "ci95_high": 0.8403134896059169, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.886469431440733, + "ci95_low": 0.8794422950905413, + "ci95_high": 0.8935675914105855, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9693818561788756, + "ci95_low": 0.9645540825844983, + "ci95_high": 0.9738200277759402, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.967588264123479, + "ci95_low": 0.9631046583062611, + "ci95_high": 0.9718631868659059, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9732, + "ci95_low": 0.9688, + "ci95_high": 0.9778, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.5268, + "ci95_low": 0.5128, + "ci95_high": 0.5414, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.896102661869303, + "ci95_low": 0.8899852390935324, + "ci95_high": 0.9024184617079747, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.971329421374493, + "ci95_low": 0.9668691425858724, + "ci95_high": 0.9758260217521556, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.8594630647145164, + "ci95_low": 0.8518997488919624, + "ci95_high": 0.8667605495992109, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.981, + "ci95_low": 0.9774666666666666, + "ci95_high": 0.984, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.5268, + "ci95_low": 0.5128, + "ci95_high": 0.541, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 0.9963995710127164, + "ci95_low": 0.9945784972510691, + "ci95_high": 0.9980012300123001, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9963995710127164, + "ci95_low": 0.9944645191050973, + "ci95_high": 0.998003838771593, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9707369388693121, + "ci95_low": 0.9652406417112299, + "ci95_high": 0.9759036144578314, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.8274195361236903, + "ci95_low": 0.819589853286067, + "ci95_high": 0.8354320486173253, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.881461459164986, + "ci95_low": 0.8734278984392413, + "ci95_high": 0.8888203085519446, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9667965123902225, + "ci95_low": 0.9613697495130022, + "ci95_high": 0.9714925816045125, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9649084377802197, + "ci95_low": 0.95986301229426, + "ci95_high": 0.9698458511672041, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9707369388693121, + "ci95_low": 0.9661121380160197, + "ci95_high": 0.9756679756679757, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.5147081354374138, + "ci95_low": 0.500878063678705, + "ci95_high": 0.5284086564196682, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.8918925025596328, + "ci95_low": 0.8854723280902367, + "ci95_high": 0.8981861958256669, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.968794105172948, + "ci95_low": 0.9638577430191485, + "ci95_high": 0.9737512649515937, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.8544404976443382, + "ci95_low": 0.8471741313520348, + "ci95_high": 0.8626272042893021, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9792911495837802, + "ci95_low": 0.9757780162501917, + "ci95_high": 0.9826979023120502, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.5147081354374138, + "ci95_low": 0.5004213590745422, + "ci95_high": 0.528395627914087, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/images_responses/response_gemini-3.5-flash_image.jsonl b/data/images_responses/response_gemini-3.5-flash_image.jsonl new file mode 100644 index 0000000..44a760b --- /dev/null +++ b/data/images_responses/response_gemini-3.5-flash_image.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49caf1414b9548ba63297689556f496acb631aeed29e59c8def3870b8aa93075 +size 1935885 diff --git a/data/text_responses/response_gemini-3.5-flash.jsonl b/data/text_responses/response_gemini-3.5-flash.jsonl new file mode 100644 index 0000000..f57e1e4 --- /dev/null +++ b/data/text_responses/response_gemini-3.5-flash.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0289226325645944fd3d35495f11b898938a2e9254e784387e6f1dd956930830 +size 39086464