data-mapping-model/evaluate.py at main · borbann-platform/data-mapping-model · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""
Demonstrate Post-Fine-Tuning Evaluation with these metrics:
1. JSON Syntactic Validity
2. Pydantic Schema Conformance
"""

import json
from vertex import generate, CustomModel
from schemas.canonical import CanonicalRecord

prompts = []
json_validity_count = {
    CustomModel.BORBANN_PIPELINE_2: 0,
    CustomModel.BORBANN_PIPELINE_3: 0,
    CustomModel.BORBANN_PIPELINE_4: 0,
}
pydantic_validity_count = {
    CustomModel.BORBANN_PIPELINE_2: 0,
    CustomModel.BORBANN_PIPELINE_3: 0,
    CustomModel.BORBANN_PIPELINE_4: 0,
}

with open("data/evaluation/evaluation.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        obj = json.loads(line)
        for message in obj.get("contents", []):
            if message.get("role") == "user":
                for part in message.get("parts", []):
                    if "text" in part:
                        prompts.append(part["text"])

# --- JSON Syntactic Validity ---
# HOW: parse generated json string with json.loads()
# METRIC: Percentage of generated outputs that are valid JSON
# IMPORTANCE: Fundamental. If it's not valid JSON, it's useless.

for prompt in prompts:
    for model in CustomModel:
        result = generate(model, prompt)
        try:
            json.loads(result)
            json_validity_count[model] += 1
        except json.JSONDecodeError:
            pass

# ---  Pydantic Schema Conformance (CanonicalRecord Validation Rate) ---
# HOW: If the generated output is valid JSON, try to instantiate your CanonicalRecord Pydantic model with the parsed dictionary: CanonicalRecord(**parsed_generated_json).
# METRIC: Percentage of syntactically valid JSON outputs that also conform to the CanonicalRecord Pydantic schema (correct field names, data types, required fields present, enum values correct).
# IMPORTANCE: Crucial for ensuring the output is usable by downstream systems. Pydantic's ValidationError will give details on why it failed.

for prompt in prompts:
    for model in CustomModel:
        result = generate(model, prompt)
        try:
            json.loads(result)
            try:
                CanonicalRecord(**json.loads(result))
                pydantic_validity_count[model] += 1
            except ValueError as e:
                print(e)
        except json.JSONDecodeError:
            pass

# --- Print Results ---
print("JSON Syntactic Validity:")
for model in CustomModel:
    print(f"{model}: {json_validity_count[model] / len(prompts) * 100:.2f}%")

print("Pydantic Schema Conformance (CanonicalRecord Validation Rate):")
for model in CustomModel:
    print(
        f"{model}: {pydantic_validity_count[model] / json_validity_count[model] * 100:.2f}%"
    )

# --- Save results ---

with open("evaluation_results.json", "w", encoding="utf-8") as f:
    json.dump(
        {
            "json_validity_count": json_validity_count,
            "pydantic_validity_count": pydantic_validity_count,
        },
        f,
        indent=4,
    )