-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_grading_quality.py
More file actions
94 lines (74 loc) · 3.05 KB
/
test_grading_quality.py
File metadata and controls
94 lines (74 loc) · 3.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""Tests for grading and shared content-analysis heuristics."""
from AgentEnv.content_utils import count_markdown_headers, has_markdown_headers, has_markdown_lists
from AgentEnv.models import Document
from AgentEnv.tasks import (
BaseGrader,
REQUIRED_DOCUMENTS,
TASKS,
grade_task,
_check_tasks_vs_roadmap_consistency,
)
def test_shared_markdown_structure_helpers_require_real_markdown_syntax():
    """Structure helpers must recognize real Markdown syntax, not stray punctuation."""
    sample = "# Overview\n\n1. First task\n- Second task\n\nParagraph text."

    # Genuine Markdown structure is detected by all three helpers.
    assert has_markdown_headers(sample) is True
    assert count_markdown_headers(sample) == 1
    assert has_markdown_lists(sample) is True

    # Punctuation buried inside running text must not register as structure.
    assert has_markdown_headers("plain text with #inline hash") is False
    assert has_markdown_lists("asterisk*insideword") is False
def test_keyword_relevance_scores_keyword_coverage_instead_of_any_single_hit():
    """Partial keyword coverage must yield a partial score, never full credit."""
    docs = {
        "PRD": Document.create("PRD", "# PRD\n\nContains authentication only.", "elon"),
        "TRD": Document.create("TRD", "# TRD\n\nContains security only.", "jordan"),
    }
    required_keywords = ["authentication", "security", "passwordless"]

    result = BaseGrader.check_keyword_relevance(docs, required_keywords)

    # Only a subset of the required keywords appears, so the score is fractional.
    assert 0.0 < result < 1.0
def test_tasks_vs_roadmap_consistency_needs_more_than_generic_term_overlap():
    """Overlap on generic planning vocabulary alone must not score a perfect 1.0."""
    docs = {
        "TASKS": Document.create(
            "TASKS",
            "# Tasks\n\n- task item one\n- task item two\n\nstory and ticket planning",
            "robert",
        ),
        "ROADMAP": Document.create(
            "ROADMAP",
            "# Roadmap\n\nphase planning with sprint setup and milestone review",
            "robert",
        ),
    }

    consistency = _check_tasks_vs_roadmap_consistency(docs)

    # Generic term overlap earns partial credit only — strictly inside (0, 1).
    assert 0.0 < consistency < 1.0
def test_grade_task_is_always_strict_open_interval():
    """grade_task must stay strictly inside (0, 1) for both empty and maximal input."""
    # An empty document set still produces a nonzero (but imperfect) score.
    no_docs: dict[str, Document] = {}
    lower_bound_score = grade_task("easy_user_authentication", no_docs, use_llm=False)
    assert 0.0 < lower_bound_score < 1.0

    # Even a keyword-stuffed, well-structured, very long corpus must not hit 1.0.
    body = (
        "# Overview\n\n"
        "## Features\n"
        "authentication login password user chat real-time websocket message "
        "online notification saas multi-tenant data-isolation subscription "
        "billing analytics white-label scalability implementation architecture "
        "security scaling\n\n"
        "## Implementation\n" + "detailed plan\n" * 200
    )
    full_docs = {
        doc_type: Document.create(doc_type, body, "sam")
        for doc_type in REQUIRED_DOCUMENTS
    }
    for task_id in TASKS:
        assert 0.0 < grade_task(task_id, full_docs, use_llm=False) < 1.0