-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_grading_quality.py
More file actions
94 lines (74 loc) · 3.05 KB
/
test_grading_quality.py
File metadata and controls
94 lines (74 loc) · 3.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""Tests for grading and shared content-analysis heuristics."""
from AgentEnv.content_utils import count_markdown_headers, has_markdown_headers, has_markdown_lists
from AgentEnv.models import Document
from AgentEnv.tasks import (
BaseGrader,
REQUIRED_DOCUMENTS,
TASKS,
grade_task,
_check_tasks_vs_roadmap_consistency,
)
def test_shared_markdown_structure_helpers_require_real_markdown_syntax():
    """Structure helpers must recognize real Markdown syntax, not stray punctuation."""
    sample = "# Overview\n\n1. First task\n- Second task\n\nParagraph text."

    # Genuine Markdown structure is detected by all three helpers.
    assert has_markdown_headers(sample) is True
    assert count_markdown_headers(sample) == 1
    assert has_markdown_lists(sample) is True

    # Punctuation buried inside running text must not register as structure.
    assert has_markdown_headers("plain text with #inline hash") is False
    assert has_markdown_lists("asterisk*insideword") is False
def test_keyword_relevance_scores_keyword_coverage_instead_of_any_single_hit():
    """Partial keyword coverage must yield a partial score, never full credit."""
    docs = {
        "PRD": Document.create("PRD", "# PRD\n\nContains authentication only.", "elon"),
        "TRD": Document.create("TRD", "# TRD\n\nContains security only.", "jordan"),
    }
    required_keywords = ["authentication", "security", "passwordless"]

    result = BaseGrader.check_keyword_relevance(docs, required_keywords)

    # Only a subset of the required keywords appears, so the score is fractional.
    assert 0.0 < result < 1.0
def test_tasks_vs_roadmap_consistency_needs_more_than_generic_term_overlap():
    """Overlap on generic planning vocabulary alone must not score a perfect 1.0."""
    docs = {
        "TASKS": Document.create(
            "TASKS",
            "# Tasks\n\n- task item one\n- task item two\n\nstory and ticket planning",
            "robert",
        ),
        "ROADMAP": Document.create(
            "ROADMAP",
            "# Roadmap\n\nphase planning with sprint setup and milestone review",
            "robert",
        ),
    }

    consistency = _check_tasks_vs_roadmap_consistency(docs)

    # Generic term overlap earns partial credit only — strictly inside (0, 1).
    assert 0.0 < consistency < 1.0
def test_grade_task_is_always_strict_open_interval():
    """grade_task must stay strictly inside (0, 1) for both empty and maximal input."""
    # An empty document set still produces a nonzero (but imperfect) score.
    no_docs: dict[str, Document] = {}
    lower_bound_score = grade_task("easy_user_authentication", no_docs, use_llm=False)
    assert 0.0 < lower_bound_score < 1.0

    # Even a keyword-stuffed, well-structured, very long corpus must not hit 1.0.
    body = (
        "# Overview\n\n"
        "## Features\n"
        "authentication login password user chat real-time websocket message "
        "online notification saas multi-tenant data-isolation subscription "
        "billing analytics white-label scalability implementation architecture "
        "security scaling\n\n"
        "## Implementation\n" + "detailed plan\n" * 200
    )
    full_docs = {
        doc_type: Document.create(doc_type, body, "sam")
        for doc_type in REQUIRED_DOCUMENTS
    }
    for task_id in TASKS:
        assert 0.0 < grade_task(task_id, full_docs, use_llm=False) < 1.0