-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmodels.py
More file actions
453 lines (358 loc) · 15.7 KB
/
models.py
File metadata and controls
453 lines (358 loc) · 15.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
"""Core data models for Theory of Code Space.
All structured data flows through these Pydantic models:
- GroundTruth: the "true map" of a generated codebase
- CognitiveMap: the agent's externalized belief state
- Mutation: a change applied during the REVISE phase
- ProbeResult: scored output of a cognitive map probe
"""
from __future__ import annotations
from enum import Enum
from typing import Optional
from pydantic import BaseModel, Field
# ============================================================================
# Ground Truth (generated by generator/)
# ============================================================================
class InvariantType(str, Enum):
    # Categories of planted invariants. Members are also ``str`` instances,
    # so each compares equal to its upper-case value.

    # Shared type/function signature
    INTERFACE = "INTERFACE"
    # Required transformation/validation chain
    DATAFLOW = "DATAFLOW"
    # Forbidden dependency (architectural firewall)
    BOUNDARY = "BOUNDARY"
    # Cross-module consistency constraint
    INVARIANT = "INVARIANT"
    # Why a module exists / design decision
    PURPOSE = "PURPOSE"
class ArchPattern(str, Enum):
    # Architectural patterns a codebase can be tagged with (see
    # ``CodebaseGroundTruth.pattern``). Values are lower-case snake_case.

    MVC = "mvc"
    PIPELINE = "pipeline"
    EVENT_DRIVEN = "event_driven"
    LAYERED = "layered"
    MICROKERNEL = "microkernel"
    REPOSITORY = "repository"
class ComplexityTier(str, Enum):
    # Size buckets for a codebase; the file-count range for each tier is
    # noted above the member.

    # 10-15 files
    SMALL = "small"
    # 25-40 files
    MEDIUM = "medium"
    # 50-80 files
    LARGE = "large"
class ConstraintEvidenceType(str, Enum):
    # Ways a constraint can be discoverable in the repo (used by
    # ``InvariantGroundTruth.evidence_types``).

    # A test file asserts the constraint
    TEST = "test"
    # Inferable from code structure (interfaces, cycles)
    STRUCTURAL = "structural"
    # Stated in comment/docstring
    DOCUMENTATION = "documentation"
class ModuleGroundTruth(BaseModel):
    """Ground truth for a single component (file/module)."""

    # ``filepath`` and ``purpose`` are required; both list fields fall back
    # to a fresh empty list when omitted.
    filepath: str
    purpose: str = Field(description="Why this module exists in the architecture")
    exports: list[str] = Field(description="Public symbols", default_factory=list)
    # Edges are plain dicts here; the expected keys are spelled out in the
    # field description.
    edges: list[dict] = Field(
        description="Typed edges: [{target, type (IMPORTS|CALLS_API|DATA_FLOWS_TO|REGISTRY_WIRES)}]",
        default_factory=list,
    )
class InvariantGroundTruth(BaseModel):
    """A planted cross-module invariant."""

    # Every field except ``evidence_locations`` is required.
    id: str
    type: InvariantType
    description: str
    # Free-form dict; the expected keys are listed in the description.
    structured: dict = Field(
        description="Machine-comparable canonical form: {type, src?, dst?, via?, pattern?}",
    )
    involved_modules: list[str]
    rationale: str = Field(
        description="The design reason behind this invariant (for constraint discovery probing)",
    )
    evidence_types: list[ConstraintEvidenceType] = Field(
        description="How this constraint is discoverable in the repo",
    )
    evidence_locations: list[str] = Field(
        description="Filepaths where evidence of this constraint can be found",
        default_factory=list,
    )
class DesignRationale(BaseModel):
    """A planted design decision for architectural constraint discovery."""

    # All five fields are required: none declares a default.
    id: str
    question: str = Field(
        description='Probe question, e.g. "Why does A not import C directly?"',
    )
    answer: str = Field(description="Ground-truth rationale")
    affected_modules: list[str]
    downstream_effects: list[str] = Field(
        description="What would break if this decision were violated",
    )
class ParameterSpec(BaseModel):
    """A single parameter in a function signature — structured for forgiving comparison."""

    # Only ``name`` is required.
    name: str
    # Empty string when no hint is supplied.
    type_hint: str = Field(
        description="Normalized type hint. Equivalence classes applied during scoring: "
        "dict≈Dict[str,Any], list≈List[Any], Optional[X]≈X|None, etc.",
        default="",
    )
    has_default: bool = Field(
        description="Whether this parameter has a default value",
        default=False,
    )
class FunctionSignature(BaseModel):
    """Structured function signature — avoids string-match pain (whitespace, naming, type aliases)."""

    # Both fields are optional, so ``FunctionSignature()`` is a valid
    # (empty) signature: no params and an empty return type.
    params: list[ParameterSpec] = Field(default_factory=list)
    return_type: str = Field(
        description="Normalized return type hint (same equivalence classes as params)",
        default="",
    )
class ExportedAPI(BaseModel):
    """A public API with its signature and callers — for contract-level scoring."""

    # ``name``, ``module`` and ``signature`` are required; ``callers``
    # defaults to an empty list.
    name: str = Field(description="Function/class name")
    module: str = Field(description="Filepath of the module exporting this API")
    signature: FunctionSignature = Field(
        description="Structured signature for forgiving comparison",
    )
    callers: list[str] = Field(
        description="Filepaths of modules that call this API (impact set)",
        default_factory=list,
    )
class CodebaseGroundTruth(BaseModel):
    """Complete ground truth for a generated codebase."""

    # Identity and classification of the codebase.
    codebase_id: str
    pattern: ArchPattern
    complexity: ComplexityTier
    language: str = "python"

    # Planted structure. NOTE(review): ``modules`` is presumably keyed by
    # module filepath — confirm against the generator.
    modules: dict[str, ModuleGroundTruth]
    invariants: list[InvariantGroundTruth]
    design_rationales: list[DesignRationale]
    # Required, unlike ``contracts`` below, which defaults to empty.
    dependency_edges: list[dict] = Field(
        description="Typed edges: [{source, target, type (IMPORTS|CALLS_API|DATA_FLOWS_TO|REGISTRY_WIRES)}]",
    )
    contracts: list[ExportedAPI] = Field(
        description="Contract layer: exported APIs with signatures and callers. Used for REVISE scoring.",
        default_factory=list,
    )
# ============================================================================
# Canonical Edge Types (v0.1 — exhaustive)
# ============================================================================
class EdgeType(str, Enum):
    # Edge kinds used in belief graphs (``BeliefEdge.type``). Each member's
    # value is identical to its name.

    # Python import statement exists
    IMPORTS = "IMPORTS"
    # Calls a public function of another component
    CALLS_API = "CALLS_API"
    # Output consumed by another component
    DATA_FLOWS_TO = "DATA_FLOWS_TO"
    # Connected via config/registry, not import
    REGISTRY_WIRES = "REGISTRY_WIRES"
# ============================================================================
# Cognitive Map (externalized by agent during probing)
# Probing does NOT consume the action budget.
# ============================================================================
class ModuleStatus(str, Enum):
    # Status values for a ``ComponentBelief``. Values are the lower-case
    # member names.

    OBSERVED = "observed"
    INFERRED = "inferred"
    UNKNOWN = "unknown"
class BeliefEdge(BaseModel):
    """A typed, directional edge in the agent's belief graph."""

    target: str = Field(description="Target component filepath")
    type: EdgeType
    # Constrained to the closed interval [0.0, 1.0]; 0.5 when not given.
    confidence: float = Field(default=0.5, ge=0.0, le=1.0)
class BeliefExport(BaseModel):
    """Agent's belief about an exported API — contract layer."""

    # Only ``name`` is required.
    name: str
    # Defaults to an empty ``FunctionSignature()`` when not supplied.
    signature: FunctionSignature = Field(
        description="Agent's belief about the function signature (structured for forgiving comparison)",
        default_factory=FunctionSignature,
    )
    callers: list[str] = Field(
        description="Agent's belief about which components call this API",
        default_factory=list,
    )
    # Constrained to the closed interval [0.0, 1.0]; 0.5 when not given.
    confidence: float = Field(default=0.5, ge=0.0, le=1.0)
class ComponentBelief(BaseModel):
    """Agent's belief about a single component (file/module)."""

    # ``filepath`` and ``status`` are required; everything else has a default.
    filepath: str
    status: ModuleStatus
    purpose: str = ""
    exports: list[BeliefExport] = Field(
        description="Exported APIs with signatures (contract layer)",
        default_factory=list,
    )
    edges: list[BeliefEdge] = Field(
        description="Typed outgoing edges",
        default_factory=list,
    )
    # Constrained to the closed interval [0.0, 1.0]; 0.5 when not given.
    confidence: float = Field(default=0.5, ge=0.0, le=1.0)
class StructuredConstraint(BaseModel):
    """Machine-comparable canonical form for an invariant."""

    # ``type`` is a plain string (not an enum); the accepted spellings are
    # listed in its description but are not validated here.
    type: str = Field(
        description="FORBIDDEN_EDGE | INTERFACE_ONLY | VALIDATION_CHAIN | NAMING_CONVENTION | INVARIANT | PURPOSE",
    )
    # The remaining fields are optional and default to None; which ones
    # apply depends on the constraint type, per each description.
    src: Optional[str] = Field(
        description="Source component (for edge constraints)",
        default=None,
    )
    dst: Optional[str] = Field(
        description="Target component (for edge constraints)",
        default=None,
    )
    via: Optional[str] = Field(
        description="Required intermediary (for VALIDATION_CHAIN / INTERFACE_ONLY)",
        default=None,
    )
    pattern: Optional[str] = Field(
        description="Regex pattern (for NAMING_CONVENTION)",
        default=None,
    )
class InvariantBelief(BaseModel):
    """Agent's belief about a cross-module invariant."""

    # ``type`` and ``description`` are required; ``type`` is a free string
    # here rather than an ``InvariantType`` member.
    type: str
    description: str
    # None when the agent supplies no canonical form.
    structured: Optional[StructuredConstraint] = Field(
        description="Machine-comparable canonical form (for exact scoring)",
        default=None,
    )
    evidence: list[str] = Field(default_factory=list)
    # Constrained to the closed interval [0.0, 1.0]; 0.5 when not given.
    confidence: float = Field(default=0.5, ge=0.0, le=1.0)
class CognitiveMap(BaseModel):
    """The agent's externalized architectural belief state."""

    # ``step`` is the only required field; a map with nothing else set is
    # valid and empty.
    step: int = Field(description="Action step at which this map was produced")
    # NOTE(review): keys are presumably component filepaths — confirm with
    # the code that builds/scores maps.
    components: dict[str, ComponentBelief] = Field(default_factory=dict)
    invariants: list[InvariantBelief] = Field(default_factory=list)
    unexplored: list[str] = Field(default_factory=list)
    uncertainty_summary: str = ""
# ============================================================================
# Mutations (for REVISE phase)
# ============================================================================
class MutationType(str, Enum):
    # Kinds of change applied during the REVISE phase. Values are the
    # lower-case member names.

    # Change a function signature
    INTERFACE_BREAK = "interface_break"
    # Move functionality between modules
    DEPENDENCY_SHIFT = "dependency_shift"
    # Introduce a subtle contract break
    INVARIANT_VIOLATION = "invariant_violation"
    # Add a forbidden dependency
    BOUNDARY_BREACH = "boundary_breach"
class Mutation(BaseModel):
    """A change applied to the codebase during REVISE phase."""

    id: str
    type: MutationType
    target_module: str
    description: str
    is_sham: bool = Field(
        description="If True, evidence is presented but no actual change was made (no-change control)",
        default=False,
    )
    # Both lists are required even for sham mutations, where the
    # descriptions say they are empty; the model itself does not enforce
    # that relationship.
    affected_modules: list[str] = Field(
        description="All modules whose belief should change (empty if is_sham=True)",
    )
    affected_invariants: list[str] = Field(
        description="IDs of invariants affected by this mutation (empty if is_sham=True)",
    )
# ============================================================================
# Agent Actions
# ============================================================================
class ActionType(str, Enum):
    # Verbs available to the agent (``AgentAction.type``). Values are the
    # lower-case member names.

    OPEN = "open"
    LIST = "list"
    SEARCH = "search"
    INSPECT = "inspect"
    DONE = "done"
class AgentAction(BaseModel):
    """A single action taken by the agent."""

    type: ActionType
    # filepath, directory, query, or symbol
    argument: Optional[str] = None
    # for INSPECT: symbol name
    secondary_argument: Optional[str] = None
class ActionResult(BaseModel):
    """Result of an agent action."""

    # All three fields are required: the action, its textual output, and
    # an integer step.
    action: AgentAction
    output: str
    step: int
# ============================================================================
# Evaluation Results
# ============================================================================
class ExplorationMetrics(BaseModel):
    """Exploration efficiency metrics."""

    # Described as optional and deferred to v1.0, so it defaults to an
    # empty list like the other curve fields instead of being a required
    # input. Callers that already pass it are unaffected.
    information_gain_curve: list[float] = Field(
        default_factory=list,
        description="E(t) at each step (optional, deferred to v1.0)",
    )
    action_efficiency_curve: list[float] = Field(
        default_factory=list,
        description="Edge F1 at each action step (all actions count: LIST/SEARCH/INSPECT/OPEN)",
    )
    observation_efficiency_curve: list[float] = Field(
        default_factory=list,
        description="Edge F1 at each OPEN action (diagnostic: isolates map-building from search overhead)",
    )
    action_auc: float = Field(
        default=0.0,
        description="AUC(F1 vs total actions) — headline metric",
    )
    observation_auc: float = Field(
        default=0.0,
        description="AUC(F1 vs OPEN count) — diagnostic metric",
    )
    # None when the recall threshold is not recorded as reached.
    steps_to_50_recall: Optional[int] = Field(
        default=None, description="Actions to reach 50% edge recall"
    )
    steps_to_80_recall: Optional[int] = Field(
        default=None, description="Actions to reach 80% edge recall"
    )
    # The four summary values below have no defaults and must be supplied.
    final_efficiency: float
    steps_taken: int
    files_opened: int
    unique_files: int
class MapAccuracyMetrics(BaseModel):
    """Cognitive map accuracy metrics."""

    # Dependency-edge scores (required).
    dependency_precision: float
    dependency_recall: float
    dependency_f1: float

    # Invariant scores (required).
    invariant_precision: float
    invariant_recall: float
    invariant_f1: float

    # Relaxed-matching variants; each defaults to 0.0 when not provided.
    invariant_precision_relaxed: float = Field(
        description="Invariant precision under relaxed matching (type+src+dst, ignoring via/pattern)",
        default=0.0,
    )
    invariant_recall_relaxed: float = Field(
        description="Invariant recall under relaxed matching",
        default=0.0,
    )
    invariant_f1_relaxed: float = Field(
        description="Invariant F1 under relaxed matching",
        default=0.0,
    )

    # Required: no default is declared.
    confidence_ece: float = Field(description="Expected Calibration Error")
class BeliefRevisionMetrics(BaseModel):
    """Belief revision metrics after mutation (or sham evidence)."""

    # Free string, not validated against the two spellings in the
    # description.
    evidence_condition: str = Field(
        description="'real' (actual mutation) or 'sham' (no-change control)",
        default="real",
    )
    # Required, along with ``mutation_type`` further down.
    revision_score: float = Field(
        description="BRS: fraction correctly updated (all affected elements)",
    )
    inertia_proper: float = Field(
        description="Among affected elements correctly believed pre-mutation: fraction NOT updated post-evidence. "
        "Isolates 'sticky priors' from 'never knew it'.",
        default=0.0,
    )
    impact_discovery: float = Field(
        description="Among affected elements missing/wrong pre-mutation: fraction newly discovered post-evidence. "
        "Measures whether evidence triggers useful exploration.",
        default=0.0,
    )
    gullibility_rate: float = Field(
        description="Fraction of elements incorrectly updated when no change occurred (sham condition only)",
        default=0.0,
    )
    revision_latency: int = Field(
        description="Actions between evidence encounter and belief update",
        default=0,
    )
    mutation_type: MutationType
    # The two counts are the denominators named in their descriptions.
    pre_mutation_known_count: int = Field(
        description="Number of affected elements correctly represented before mutation (denominator for inertia_proper)",
        default=0,
    )
    pre_mutation_unknown_count: int = Field(
        description="Number of affected elements missing/wrong before mutation (denominator for impact_discovery)",
        default=0,
    )
class ConstraintDiscoveryMetrics(BaseModel):
    """Architectural constraint discovery metrics."""

    # The only required field.
    counterfactual_probe_accuracy: float = Field(
        description="Accuracy on multiple-choice constraint violation probes",
    )
    # Both remaining metrics default to None.
    behavioral_compliance: Optional[float] = Field(
        description="1 - (violations / applicable_constraints) in generated code. "
        "Requires EXPLOIT phase (v1.0). None in v0.1.",
        default=None,
    )
    rationale_match_rate: Optional[float] = Field(
        description="Exact match rate against generator metadata tags (optional)",
        default=None,
    )
class EvalResult(BaseModel):
    """Complete evaluation result for one model on one codebase."""

    # Run identification.
    model_name: str
    codebase_id: str
    # ``mode``, ``passive_condition`` and ``track`` are plain strings; the
    # spellings in their descriptions are not validated by this model.
    mode: str = Field(description="'active' or 'passive'")
    passive_condition: Optional[str] = Field(
        description="If mode='passive': 'full' | 'oracle' | 'replay'. None if active.",
        default=None,
    )
    track: str = Field(
        description="'no_probe' (final map only) | 'probe_only' (JSON stripped from context) | 'probe_as_scratchpad' (JSON retained)",
        default="probe_as_scratchpad",
    )

    # Metric groups: the first two are required, the rest default to None.
    exploration: ExplorationMetrics
    map_accuracy: MapAccuracyMetrics
    belief_revision: Optional[BeliefRevisionMetrics] = None
    constraint_discovery: Optional[ConstraintDiscoveryMetrics] = None
    exploit_task_accuracy: Optional[float] = Field(
        description="Correctness on EXPLOIT downstream task. Requires write interface (v1.0). None in v0.1.",
        default=None,
    )

    # Required: no default is declared.
    cognitive_maps: list[CognitiveMap] = Field(
        description="Sequence of externalized maps during exploration",
    )