This repository was archived by the owner on Mar 26, 2026. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathevaluate.py
More file actions
85 lines (74 loc) · 2.89 KB
/
evaluate.py
File metadata and controls
85 lines (74 loc) · 2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""
Demonstrate Post-Fine-Tuning Evaluation with these metrics:
1. JSON Syntactic Validity
2. Pydantic Schema Conformance
"""
import json
from vertex import generate, CustomModel
from schemas.canonical import CanonicalRecord
# User prompts extracted from the evaluation set; filled by the loader below.
prompts = []

# Per-model tallies. The keys are derived from CustomModel itself (instead of
# a hand-maintained list of members) because every loop in this script
# iterates `for model in CustomModel`; a member missing from these dicts
# would raise KeyError on its first counted output.
json_validity_count = dict.fromkeys(CustomModel, 0)
pydantic_validity_count = dict.fromkeys(CustomModel, 0)
# Collect the text of every user-role part from the evaluation JSONL file.
# Each non-blank line is parsed as one JSON object; only messages whose
# "role" is "user" contribute prompts.
with open("data/evaluation/evaluation.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. an extra newline at the end of the
        # file), which json.loads would otherwise reject.
        if not line.strip():
            continue
        obj = json.loads(line)
        for message in obj.get("contents", []):
            if message.get("role") != "user":
                continue
            prompts.extend(
                part["text"]
                for part in message.get("parts", [])
                if "text" in part
            )
# --- Generation and scoring ---
# Every (prompt, model) pair is generated exactly ONCE and that single output
# is scored against both metrics. Generating separately per metric would
# score schema conformance on a different sample than the one counted as
# valid JSON (so the conformance rate could exceed 100%), and would double
# the number of model calls.
#
# JSON Syntactic Validity
#   HOW: parse the generated string with json.loads().
#   METRIC: percentage of generated outputs that are valid JSON.
#   IMPORTANCE: fundamental. If it's not valid JSON, it's useless.
#
# Pydantic Schema Conformance (CanonicalRecord Validation Rate)
#   HOW: if the output is valid JSON, try to instantiate CanonicalRecord with
#        the parsed dictionary: CanonicalRecord(**parsed).
#   METRIC: percentage of syntactically valid JSON outputs that also conform
#           to the CanonicalRecord schema.
#   IMPORTANCE: crucial for ensuring the output is usable by downstream
#               systems. The printed validation error explains each failure.
for prompt in prompts:
    for model in CustomModel:
        result = generate(model, prompt)

        try:
            parsed = json.loads(result)
        except json.JSONDecodeError:
            # Not valid JSON: counts toward neither metric.
            continue
        json_validity_count[model] += 1

        # Valid JSON need not be an object (it may be a list, number, string,
        # ...). Such a value cannot be unpacked with ** and can never conform
        # to the schema, so report it instead of crashing with TypeError.
        if not isinstance(parsed, dict):
            print(
                f"{model}: expected a JSON object, got {type(parsed).__name__}"
            )
            continue

        try:
            CanonicalRecord(**parsed)
        except ValueError as e:
            # The original handler caught ValueError for schema failures;
            # print the details and leave the conformance count unchanged.
            print(e)
        else:
            pydantic_validity_count[model] += 1
# --- Print Results ---
def _percent(numerator, denominator):
    """Format numerator/denominator as a percentage with two decimals.

    Returns "n/a" when the denominator is zero (no prompts were loaded, or a
    model produced no valid JSON at all) instead of raising
    ZeroDivisionError.
    """
    if not denominator:
        return "n/a"
    return f"{numerator / denominator * 100:.2f}%"


# Share of all prompts for which the model's output parsed as JSON.
print("JSON Syntactic Validity:")
for model in CustomModel:
    print(f"{model}: {_percent(json_validity_count[model], len(prompts))}")

# Share of the valid-JSON outputs that also passed CanonicalRecord validation.
print("Pydantic Schema Conformance (CanonicalRecord Validation Rate):")
for model in CustomModel:
    print(
        f"{model}: "
        f"{_percent(pydantic_validity_count[model], json_validity_count[model])}"
    )
# --- Save results ---
def _json_key(model):
    """Return a dictionary key that json.dump accepts for *model*.

    json.dump only allows str, int, float, bool and None as keys. Keys that
    already satisfy this (including str/int-based enum members) are returned
    unchanged so the written file is the same as before. Anything else is
    replaced by the string form of its ``value`` attribute when it has one,
    otherwise by ``str(model)``.
    NOTE(review): CustomModel is defined outside this file; confirm which of
    the two cases applies.
    """
    if model is None or isinstance(model, (str, int, float, bool)):
        return model
    return str(getattr(model, "value", model))


# Persist the raw per-model counts (not the percentages) as JSON.
with open("evaluation_results.json", "w", encoding="utf-8") as f:
    json.dump(
        {
            "json_validity_count": {
                _json_key(model): count
                for model, count in json_validity_count.items()
            },
            "pydantic_validity_count": {
                _json_key(model): count
                for model, count in pydantic_validity_count.items()
            },
        },
        f,
        indent=4,
    )