# Source: tocs/models.py (che-shr-cat/tocs, main branch)
"""Core data models for Theory of Code Space.

All structured data flows through these Pydantic models:
- CodebaseGroundTruth: the "true map" of a generated codebase
- CognitiveMap: the agent's externalized belief state
- Mutation: a change applied during the REVISE phase
- EvalResult: complete evaluation result for one model on one codebase
"""

from __future__ import annotations

from enum import Enum
from typing import Optional

from pydantic import BaseModel, Field


# ============================================================================
# Ground Truth (generated by generator/)
# ============================================================================


class InvariantType(str, Enum):
    # Categories of planted invariants. The str mixin makes every member
    # compare equal to its own literal value.

    # A type or function signature shared between modules.
    INTERFACE = "INTERFACE"
    # A transformation/validation chain that is required.
    DATAFLOW = "DATAFLOW"
    # A dependency that is forbidden (architectural firewall).
    BOUNDARY = "BOUNDARY"
    # A consistency constraint spanning several modules.
    INVARIANT = "INVARIANT"
    # The reason a module exists / a design decision.
    PURPOSE = "PURPOSE"


class ArchPattern(str, Enum):
    # Architecture patterns a codebase can be tagged with. Each value is the
    # lowercase spelling of the member name.

    MVC = "mvc"
    PIPELINE = "pipeline"
    EVENT_DRIVEN = "event_driven"

    LAYERED = "layered"
    MICROKERNEL = "microkernel"
    REPOSITORY = "repository"


class ComplexityTier(str, Enum):
    # Size tiers for a codebase; the file counts below are the ranges noted
    # by the original author.

    # 10-15 files
    SMALL = "small"
    # 25-40 files
    MEDIUM = "medium"
    # 50-80 files
    LARGE = "large"


class ConstraintEvidenceType(str, Enum):
    # Ways in which a constraint can be evidenced.

    # The constraint is asserted by a test file.
    TEST = "test"
    # The constraint can be inferred from code structure (interfaces, cycles).
    STRUCTURAL = "structural"
    # The constraint is stated in a comment/docstring.
    DOCUMENTATION = "documentation"


class ModuleGroundTruth(BaseModel):
    """Ground truth for a single component (file/module)."""

    # `filepath` and `purpose` are required; the two list fields fall back to
    # a fresh empty list when omitted.
    filepath: str
    purpose: str = Field(
        description="Why this module exists in the architecture",
    )
    exports: list[str] = Field(
        description="Public symbols",
        default_factory=list,
    )
    # Edges are plain dicts here; their keys are not validated by this model.
    edges: list[dict] = Field(
        description=(
            "Typed edges: [{target, type "
            "(IMPORTS|CALLS_API|DATA_FLOWS_TO|REGISTRY_WIRES)}]"
        ),
        default_factory=list,
    )


class InvariantGroundTruth(BaseModel):
    """A planted cross-module invariant."""

    # Every field is required except `evidence_locations`, which falls back
    # to a fresh empty list.
    id: str
    type: InvariantType
    description: str

    # Plain dict at this layer; the keys listed in the description are not
    # validated by this model.
    structured: dict = Field(
        description=(
            "Machine-comparable canonical form: "
            "{type, src?, dst?, via?, pattern?}"
        ),
    )
    involved_modules: list[str]
    rationale: str = Field(
        description=(
            "The design reason behind this invariant "
            "(for constraint discovery probing)"
        ),
    )
    evidence_types: list[ConstraintEvidenceType] = Field(
        description="How this constraint is discoverable in the repo",
    )
    evidence_locations: list[str] = Field(
        description="Filepaths where evidence of this constraint can be found",
        default_factory=list,
    )


class DesignRationale(BaseModel):
    """A planted design decision for architectural constraint discovery."""

    # All five fields are required: none of them declares a default.
    id: str
    question: str = Field(
        description='Probe question, e.g. "Why does A not import C directly?"',
    )
    answer: str = Field(
        description="Ground-truth rationale",
    )
    affected_modules: list[str]
    downstream_effects: list[str] = Field(
        description="What would break if this decision were violated",
    )


class ParameterSpec(BaseModel):
    """A single parameter in a function signature — structured for forgiving comparison."""

    # Only `name` is required.
    name: str
    # Stored as a plain string; this model performs no normalization itself.
    type_hint: str = Field(
        description=(
            "Normalized type hint. "
            "Equivalence classes applied during scoring: "
            "dict≈Dict[str,Any], list≈List[Any], Optional[X]≈X|None, etc."
        ),
        default="",
    )
    has_default: bool = Field(
        description="Whether this parameter has a default value",
        default=False,
    )


class FunctionSignature(BaseModel):
    """Structured function signature — avoids string-match pain (whitespace, naming, type aliases)."""

    # Both fields have defaults, so the model can be built with no arguments:
    # an empty parameter list and an empty return type.
    params: list[ParameterSpec] = Field(
        default_factory=list,
    )
    return_type: str = Field(
        description=(
            "Normalized return type hint "
            "(same equivalence classes as params)"
        ),
        default="",
    )


class ExportedAPI(BaseModel):
    """A public API with its signature and callers — for contract-level scoring."""

    # `name`, `module` and `signature` are required; `callers` falls back to
    # a fresh empty list.
    name: str = Field(
        description="Function/class name",
    )
    module: str = Field(
        description="Filepath of the module exporting this API",
    )
    signature: FunctionSignature = Field(
        description="Structured signature for forgiving comparison",
    )
    callers: list[str] = Field(
        description="Filepaths of modules that call this API (impact set)",
        default_factory=list,
    )


class CodebaseGroundTruth(BaseModel):
    """Complete ground truth for a generated codebase."""

    # Identification and classification of the codebase.
    codebase_id: str
    pattern: ArchPattern
    complexity: ComplexityTier
    language: str = "python"

    # Per-module truth plus the planted invariants and rationales; all three
    # are required.
    modules: dict[str, ModuleGroundTruth]
    invariants: list[InvariantGroundTruth]
    design_rationales: list[DesignRationale]

    # Required edge list, held as plain dicts (keys are not validated here).
    dependency_edges: list[dict] = Field(
        description=(
            "Typed edges: [{source, target, type "
            "(IMPORTS|CALLS_API|DATA_FLOWS_TO|REGISTRY_WIRES)}]"
        ),
    )
    # Optional; falls back to a fresh empty list.
    contracts: list[ExportedAPI] = Field(
        description=(
            "Contract layer: exported APIs with signatures and callers. "
            "Used for REVISE scoring."
        ),
        default_factory=list,
    )


# ============================================================================
# Canonical Edge Types (v0.1 — exhaustive)
# ============================================================================


class EdgeType(str, Enum):
    # Edge kinds between components. Each value is identical to the member
    # name.

    # A Python import statement exists.
    IMPORTS = "IMPORTS"
    # A public function of another component is called.
    CALLS_API = "CALLS_API"
    # Output is consumed by another component.
    DATA_FLOWS_TO = "DATA_FLOWS_TO"
    # Connected through config/registry rather than an import.
    REGISTRY_WIRES = "REGISTRY_WIRES"


# ============================================================================
# Cognitive Map (externalized by agent during probing)
# Probing does NOT consume the action budget.
# ============================================================================


class ModuleStatus(str, Enum):
    # Status values a component belief can carry. Each value is the lowercase
    # spelling of the member name.

    OBSERVED = "observed"

    INFERRED = "inferred"

    UNKNOWN = "unknown"


class BeliefEdge(BaseModel):
    """A typed, directional edge in the agent's belief graph."""

    # `target` and `type` are required.
    target: str = Field(
        description="Target component filepath",
    )
    type: EdgeType
    # Constrained to the closed interval [0.0, 1.0]; 0.5 when omitted.
    confidence: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
    )


class BeliefExport(BaseModel):
    """Agent's belief about an exported API — contract layer."""

    # Only `name` is required.
    name: str
    # When omitted, a FunctionSignature built with no arguments is used.
    signature: FunctionSignature = Field(
        description=(
            "Agent's belief about the function signature "
            "(structured for forgiving comparison)"
        ),
        default_factory=FunctionSignature,
    )
    callers: list[str] = Field(
        description="Agent's belief about which components call this API",
        default_factory=list,
    )
    # Constrained to the closed interval [0.0, 1.0]; 0.5 when omitted.
    confidence: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
    )


class ComponentBelief(BaseModel):
    """Agent's belief about a single component (file/module)."""

    # `filepath` and `status` are required; everything else has a default.
    filepath: str
    status: ModuleStatus
    purpose: str = ""
    exports: list[BeliefExport] = Field(
        description="Exported APIs with signatures (contract layer)",
        default_factory=list,
    )
    edges: list[BeliefEdge] = Field(
        description="Typed outgoing edges",
        default_factory=list,
    )
    # Constrained to the closed interval [0.0, 1.0]; 0.5 when omitted.
    confidence: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
    )


class StructuredConstraint(BaseModel):
    """Machine-comparable canonical form for an invariant."""

    # `type` is a free-form string here: the values named in its description
    # are not enforced by this model.
    type: str = Field(
        description=(
            "FORBIDDEN_EDGE | INTERFACE_ONLY | VALIDATION_CHAIN | "
            "NAMING_CONVENTION | INVARIANT | PURPOSE"
        ),
    )
    # The remaining four fields are optional and default to None.
    src: Optional[str] = Field(
        description="Source component (for edge constraints)",
        default=None,
    )
    dst: Optional[str] = Field(
        description="Target component (for edge constraints)",
        default=None,
    )
    via: Optional[str] = Field(
        description="Required intermediary (for VALIDATION_CHAIN / INTERFACE_ONLY)",
        default=None,
    )
    pattern: Optional[str] = Field(
        description="Regex pattern (for NAMING_CONVENTION)",
        default=None,
    )


class InvariantBelief(BaseModel):
    """Agent's belief about a cross-module invariant."""

    # `type` and `description` are required plain strings.
    type: str
    description: str
    # Optional canonical form; None when the agent supplies none.
    structured: Optional[StructuredConstraint] = Field(
        description="Machine-comparable canonical form (for exact scoring)",
        default=None,
    )
    evidence: list[str] = Field(
        default_factory=list,
    )
    # Constrained to the closed interval [0.0, 1.0]; 0.5 when omitted.
    confidence: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
    )


class CognitiveMap(BaseModel):
    """The agent's externalized architectural belief state."""

    # `step` is the only required field.
    step: int = Field(
        description="Action step at which this map was produced",
    )
    # Each container below falls back to a fresh empty instance.
    components: dict[str, ComponentBelief] = Field(
        default_factory=dict,
    )
    invariants: list[InvariantBelief] = Field(
        default_factory=list,
    )
    unexplored: list[str] = Field(
        default_factory=list,
    )
    uncertainty_summary: str = ""


# ============================================================================
# Mutations (for REVISE phase)
# ============================================================================


class MutationType(str, Enum):
    # Kinds of mutation. Each value is the lowercase spelling of the member
    # name.

    # A function signature is changed.
    INTERFACE_BREAK = "interface_break"
    # Functionality is moved between modules.
    DEPENDENCY_SHIFT = "dependency_shift"
    # A subtle contract break is introduced.
    INVARIANT_VIOLATION = "invariant_violation"
    # A forbidden dependency is added.
    BOUNDARY_BREACH = "boundary_breach"


class Mutation(BaseModel):
    """A change applied to the codebase during REVISE phase."""

    # Every field is required except `is_sham`, which defaults to False.
    id: str
    type: MutationType
    target_module: str
    description: str
    is_sham: bool = Field(
        description=(
            "If True, evidence is presented but no actual change was made "
            "(no-change control)"
        ),
        default=False,
    )
    # Both lists must be passed explicitly, even when empty. The model does
    # not itself check that they are empty for a sham mutation.
    affected_modules: list[str] = Field(
        description=(
            "All modules whose belief should change "
            "(empty if is_sham=True)"
        ),
    )
    affected_invariants: list[str] = Field(
        description=(
            "IDs of invariants affected by this mutation "
            "(empty if is_sham=True)"
        ),
    )


# ============================================================================
# Agent Actions
# ============================================================================


class ActionType(str, Enum):
    # Action kinds an agent can take. Each value is the lowercase spelling of
    # the member name.

    OPEN = "open"
    LIST = "list"

    SEARCH = "search"
    INSPECT = "inspect"

    DONE = "done"


class AgentAction(BaseModel):
    """A single action taken by the agent."""

    # Only `type` is required; both arguments default to None.
    type: ActionType

    # Primary argument: a filepath, directory, query, or symbol.
    argument: Optional[str] = None

    # Secondary argument; for INSPECT this is the symbol name.
    secondary_argument: Optional[str] = None


class ActionResult(BaseModel):
    """Result of an agent action."""

    # All three fields are required: the action itself, its textual output,
    # and an integer step.
    action: AgentAction

    output: str

    step: int


# ============================================================================
# Evaluation Results
# ============================================================================


class ExplorationMetrics(BaseModel):
    """Exploration efficiency metrics."""

    # This curve is documented as optional and deferred to v1.0, so it now
    # defaults to an empty list like the other two curves. Callers that pass
    # it explicitly are unaffected.
    information_gain_curve: list[float] = Field(
        default_factory=list,
        description="E(t) at each step (optional, deferred to v1.0)",
    )
    action_efficiency_curve: list[float] = Field(
        default_factory=list,
        description="Edge F1 at each action step (all actions count: LIST/SEARCH/INSPECT/OPEN)",
    )
    observation_efficiency_curve: list[float] = Field(
        default_factory=list,
        description="Edge F1 at each OPEN action (diagnostic: isolates map-building from search overhead)",
    )

    # Area-under-curve summaries; both default to 0.0.
    action_auc: float = Field(
        default=0.0,
        description="AUC(F1 vs total actions) — headline metric",
    )
    observation_auc: float = Field(
        default=0.0,
        description="AUC(F1 vs OPEN count) — diagnostic metric",
    )

    # Recall milestones default to None when not supplied.
    steps_to_50_recall: Optional[int] = Field(
        default=None, description="Actions to reach 50% edge recall"
    )
    steps_to_80_recall: Optional[int] = Field(
        default=None, description="Actions to reach 80% edge recall"
    )

    # The four fields below are required.
    final_efficiency: float
    steps_taken: int
    files_opened: int
    unique_files: int


class MapAccuracyMetrics(BaseModel):
    """Cognitive map accuracy metrics."""

    # Dependency-edge precision / recall / F1 (all required).
    dependency_precision: float
    dependency_recall: float
    dependency_f1: float

    # Invariant precision / recall / F1 (all required).
    invariant_precision: float
    invariant_recall: float
    invariant_f1: float

    # Relaxed-matching variants; each defaults to 0.0.
    invariant_precision_relaxed: float = Field(
        description=(
            "Invariant precision under relaxed matching "
            "(type+src+dst, ignoring via/pattern)"
        ),
        default=0.0,
    )
    invariant_recall_relaxed: float = Field(
        description="Invariant recall under relaxed matching",
        default=0.0,
    )
    invariant_f1_relaxed: float = Field(
        description="Invariant F1 under relaxed matching",
        default=0.0,
    )

    # Required; no default is declared.
    confidence_ece: float = Field(
        description="Expected Calibration Error",
    )


class BeliefRevisionMetrics(BaseModel):
    """Belief revision metrics after mutation (or sham evidence)."""

    # Only `revision_score` and `mutation_type` are required; every other
    # field declares a default.

    # Free-form string; the two values named in the description are not
    # enforced by this model.
    evidence_condition: str = Field(
        description="'real' (actual mutation) or 'sham' (no-change control)",
        default="real",
    )
    revision_score: float = Field(
        description="BRS: fraction correctly updated (all affected elements)",
    )
    inertia_proper: float = Field(
        description=(
            "Among affected elements correctly believed pre-mutation: "
            "fraction NOT updated post-evidence. "
            "Isolates 'sticky priors' from 'never knew it'."
        ),
        default=0.0,
    )
    impact_discovery: float = Field(
        description=(
            "Among affected elements missing/wrong pre-mutation: "
            "fraction newly discovered post-evidence. "
            "Measures whether evidence triggers useful exploration."
        ),
        default=0.0,
    )
    gullibility_rate: float = Field(
        description=(
            "Fraction of elements incorrectly updated when no change occurred "
            "(sham condition only)"
        ),
        default=0.0,
    )
    revision_latency: int = Field(
        description="Actions between evidence encounter and belief update",
        default=0,
    )
    mutation_type: MutationType

    # Counts described as the denominators of the two rates above.
    pre_mutation_known_count: int = Field(
        description=(
            "Number of affected elements correctly represented before mutation "
            "(denominator for inertia_proper)"
        ),
        default=0,
    )
    pre_mutation_unknown_count: int = Field(
        description=(
            "Number of affected elements missing/wrong before mutation "
            "(denominator for impact_discovery)"
        ),
        default=0,
    )


class ConstraintDiscoveryMetrics(BaseModel):
    """Architectural constraint discovery metrics."""

    # The probe accuracy is the only required field.
    counterfactual_probe_accuracy: float = Field(
        description="Accuracy on multiple-choice constraint violation probes",
    )
    # The two fields below are optional and default to None.
    behavioral_compliance: Optional[float] = Field(
        description=(
            "1 - (violations / applicable_constraints) in generated code. "
            "Requires EXPLOIT phase (v1.0). "
            "None in v0.1."
        ),
        default=None,
    )
    rationale_match_rate: Optional[float] = Field(
        description="Exact match rate against generator metadata tags (optional)",
        default=None,
    )


class EvalResult(BaseModel):
    """Complete evaluation result for one model on one codebase."""

    # Run identification (both required).
    model_name: str
    codebase_id: str

    # Run configuration. These are plain strings: the values listed in the
    # descriptions are not enforced by this model.
    mode: str = Field(
        description="'active' or 'passive'",
    )
    passive_condition: Optional[str] = Field(
        description="If mode='passive': 'full' | 'oracle' | 'replay'. None if active.",
        default=None,
    )
    track: str = Field(
        description=(
            "'no_probe' (final map only) | "
            "'probe_only' (JSON stripped from context) | "
            "'probe_as_scratchpad' (JSON retained)"
        ),
        default="probe_as_scratchpad",
    )

    # Metric groups: the first two are required, the next two default to None.
    exploration: ExplorationMetrics
    map_accuracy: MapAccuracyMetrics
    belief_revision: Optional[BeliefRevisionMetrics] = None
    constraint_discovery: Optional[ConstraintDiscoveryMetrics] = None
    exploit_task_accuracy: Optional[float] = Field(
        description=(
            "Correctness on EXPLOIT downstream task. "
            "Requires write interface (v1.0). None in v0.1."
        ),
        default=None,
    )

    # Required; no default is declared.
    cognitive_maps: list[CognitiveMap] = Field(
        description="Sequence of externalized maps during exploration",
    )