diff --git a/dhee/adapters/base.py b/dhee/adapters/base.py index a1f7644..c695095 100644 --- a/dhee/adapters/base.py +++ b/dhee/adapters/base.py @@ -195,8 +195,13 @@ def context( self, task_description: Optional[str] = None, user_id: Optional[str] = None, + operational: bool = False, ) -> Dict[str, Any]: - """HyperAgent session bootstrap. Returns everything the agent needs.""" + """HyperAgent session bootstrap. Returns everything the agent needs. + + Args: + operational: If True, return compact actionable-only format. + """ uid = user_id or self._user_id self._tracker.on_context(task_description) hyper_ctx = self._buddhi.get_hyper_context( @@ -204,6 +209,8 @@ def context( task_description=task_description, memory=self._engram._memory, ) + if operational: + return hyper_ctx.to_operational_dict() return hyper_ctx.to_dict() # ------------------------------------------------------------------ diff --git a/dhee/core/buddhi.py b/dhee/core/buddhi.py index 02ca3c3..cef2380 100644 --- a/dhee/core/buddhi.py +++ b/dhee/core/buddhi.py @@ -30,6 +30,7 @@ import json import logging +import math import os import re import time @@ -66,12 +67,45 @@ class Insight: invalidation_count: int # how many times contradicted tags: List[str] + # Utility tracking (EMA of measured performance deltas) + utility: float = 0.0 + apply_count: int = 0 + _UTILITY_ALPHA: float = 0.3 + + def record_outcome( + self, + success: bool, + baseline_score: Optional[float] = None, + actual_score: Optional[float] = None, + ) -> float: + """Record outcome with optional measured delta.""" + self.apply_count += 1 + if success: + self.validation_count += 1 + self.confidence = min(1.0, self.confidence + 0.05) + else: + self.invalidation_count += 1 + self.confidence = max(0.0, self.confidence - 0.1) + self.last_validated = datetime.now(timezone.utc).isoformat() + + delta = 0.0 + if baseline_score is not None and actual_score is not None: + delta = actual_score - baseline_score + self.utility = ( + self._UTILITY_ALPHA * delta + + (1 - self._UTILITY_ALPHA) * self.utility + ) + return delta + def strength(self) -> float: - """Net strength: validated - invalidated, normalized.""" + """Net strength: validated - invalidated, normalized, utility-weighted.""" total = self.validation_count + self.invalidation_count if total == 0: return self.confidence - return self.confidence * (self.validation_count / total) + base = self.confidence * (self.validation_count / total) + # Incorporate utility via sigmoid + utility_factor = 0.5 + 0.5 * (1.0 / (1.0 + math.exp(-3.0 * self.utility))) + return base * utility_factor def to_dict(self) -> Dict[str, Any]: return { @@ -83,6 +117,8 @@ def to_dict(self) -> Dict[str, Any]: "source_task_types": self.source_task_types, "validations": self.validation_count, "tags": self.tags, + "utility": round(self.utility, 3), + "apply_count": self.apply_count, } @@ -152,6 +188,13 @@ class HyperContext: policies: List[Dict[str, Any]] = field(default_factory=list) beliefs: List[Dict[str, Any]] = field(default_factory=list) + # Phase 4: operational cognition packet + active_step: Optional[Dict[str, Any]] = None + step_policies: List[Dict[str, Any]] = field(default_factory=list) + critical_blockers: List[str] = field(default_factory=list) + contradictions: List[Dict[str, Any]] = field(default_factory=list) + action_items: List[str] = field(default_factory=list) + def to_dict(self) -> Dict[str, Any]: return { "user_id": self.user_id, @@ -173,6 +216,11 @@ def to_dict(self) -> Dict[str, Any]: "strength": m.get("strength", 1.0)} for m in self.memories[:10] ], + "active_step": self.active_step, + "step_policies": self.step_policies[:5], + "critical_blockers": self.critical_blockers[:5], + "contradictions": self.contradictions[:5], + "action_items": self.action_items[:10], "meta": { "n_insights": len(self.insights), "n_active_intentions": len(self.intentions), @@ -183,10 +231,34 @@ def to_dict(self) -> Dict[str, Any]: "n_task_states": len(self.task_states), "n_policies": len(self.policies), "n_beliefs": len(self.beliefs), + "has_active_step": self.active_step is not None, + "n_action_items": len(self.action_items), + "n_critical_blockers": len(self.critical_blockers), "performance_tracked": len(self.performance) > 0, }, } + def to_operational_dict(self) -> Dict[str, Any]: + """Compact operational format for per-turn agent consumption. + + Contains ONLY actionable items -- not full history. + Use to_dict() for deep context. + """ + result: Dict[str, Any] = {} + if self.active_step: + result["current_step"] = self.active_step + if self.step_policies: + result["step_policies"] = self.step_policies[:3] + if self.action_items: + result["action_items"] = self.action_items[:5] + if self.critical_blockers: + result["critical_blockers"] = self.critical_blockers[:3] + if self.warnings: + result["warnings"] = self.warnings[:3] + if self.contradictions: + result["contradictions"] = self.contradictions[:3] + return result + # --------------------------------------------------------------------------- # Buddhi — the proactive cognition layer @@ -379,6 +451,49 @@ def get_hyper_context( beliefs = cog_state.get("beliefs", []) warnings.extend(cog_state.get("belief_warnings", [])) + # 15. Operational cognition packet (Phase 4) + active_step_desc = cog_state.get("active_step") + active_step = {"description": active_step_desc} if active_step_desc else None + step_policies_list = cog_state.get("step_policies", []) + + # Critical blockers from active task states + critical_blockers = [] + for ts in task_states: + if ts.get("status") in ("in_progress", "blocked") and ts.get("blockers"): + critical_blockers.extend(ts["blockers"]) + + # Contradictions from belief store + contradictions_list = [] + try: + contradiction_pairs = self._kernel.beliefs.get_contradictions(user_id) + for b1, b2 in contradiction_pairs[:5]: + severity = abs(b1.confidence - b2.confidence) + contradictions_list.append({ + "belief_a": b1.claim[:100], + "belief_b": b2.claim[:100], + "confidence_a": round(b1.confidence, 2), + "confidence_b": round(b2.confidence, 2), + "severity": round(1.0 - severity, 2), + }) + except Exception: + pass + + # Build prioritized action items + action_items = [] + for intention in triggered: + action_items.append(f"[INTENTION] {intention.action_payload}") + if active_step_desc: + action_items.append(f"[NEXT STEP] {active_step_desc}") + for sp in step_policies_list[:3]: + action_items.append(f"[CORRECTION] {sp.get('do', '')[:100]}") + avoid = sp.get("avoid") + if avoid: + avoids = avoid if isinstance(avoid, list) else [avoid] + for a in avoids[:2]: + action_items.append(f"[AVOID] {a[:100]}") + for blocker in critical_blockers[:3]: + action_items.append(f"[BLOCKER] Resolve: {blocker}") + return HyperContext( user_id=user_id, session_id=str(uuid.uuid4()), @@ -395,6 +510,11 @@ def get_hyper_context( task_states=task_states, policies=policies, beliefs=beliefs, + active_step=active_step, + step_policies=step_policies_list, + critical_blockers=critical_blockers, + contradictions=contradictions_list, + action_items=action_items, ) # ------------------------------------------------------------------ @@ -871,6 +991,19 @@ def reflect( except Exception: pass + # Compute baseline from moving average for utility scoring (D2Skill) + # Moved before heuristic/policy blocks so all can use it + baseline_score = None + if outcome_score is not None: + try: + key = f"{user_id}:{task_type}" + records = self._performance.get(key, []) + if len(records) >= 2: + recent = records[-min(10, len(records)):] + baseline_score = sum(r["score"] for r in recent) / len(recent) + except Exception: + pass + # Phase 2: Distill heuristic from what_worked if what_worked: try: @@ -882,22 +1015,12 @@ def reflect( what_failed=what_failed, user_id=user_id, ) - # Close the heuristic validation loop: validate any previously - # retrieved heuristics that were used for this task type - self._validate_used_heuristics(user_id, task_type, what_worked is not None) - except Exception: - pass - - # Phase 3: Extract policy from task outcomes, with utility deltas - # Compute baseline from moving average for utility scoring (D2Skill) - baseline_score = None - if outcome_score is not None: - try: - key = f"{user_id}:{task_type}" - records = self._performance.get(key, []) - if len(records) >= 2: - recent = records[-min(10, len(records)):] - baseline_score = sum(r["score"] for r in recent) / len(recent) + # Close the heuristic validation loop with measured deltas + self._validate_used_heuristics( + user_id, task_type, what_worked is not None, + baseline_score=baseline_score, + actual_score=outcome_score, + ) except Exception: pass @@ -922,6 +1045,10 @@ def reflect( self._kernel.policies.extract_from_tasks( user_id, task_dicts, task_type, ) + # Step-level policy extraction from failure patterns + self._kernel.policies.extract_step_policies( + user_id, task_dicts, task_type, + ) except Exception: pass @@ -940,6 +1067,19 @@ def reflect( except Exception: pass + # Extract step policies from failure patterns + try: + completed = self._kernel.tasks.get_tasks_by_type( + user_id, task_type, limit=10, + ) + if len(completed) >= 3: + task_dicts = [t.to_dict() for t in completed] + self._kernel.policies.extract_step_policies( + user_id, task_dicts, task_type, + ) + except Exception: + pass + # Update beliefs based on outcomes (via kernel) if what_worked: try: @@ -965,16 +1105,122 @@ def reflect( except Exception: pass + # Cross-structure: when beliefs are challenged, check dependent policies + try: + for belief in relevant: + if belief.confidence < 0.3: + claim_words = set(belief.claim.lower().split()[:5]) + for policy in self._kernel.policies._policies.values(): + if policy.user_id != user_id: + continue + approach_words = set(policy.action.approach.lower().split()) + if len(claim_words & approach_words) >= 2: + policy.utility *= 0.8 + policy.updated_at = time.time() + except Exception: + pass + + # Record outcomes on matched insights (utility tracking) + if what_worked: + try: + matched_insights = self._get_relevant_insights( + user_id, f"{task_type} task", + ) + for insight in matched_insights[:5]: + insight.record_outcome( + success=True, + baseline_score=baseline_score, + actual_score=outcome_score, + ) + self._save_insights() + except Exception: + pass + + if what_failed: + try: + matched_insights = self._get_relevant_insights( + user_id, f"{task_type} task", + ) + for insight in matched_insights[:5]: + insight.record_outcome( + success=False, + baseline_score=baseline_score, + actual_score=outcome_score, + ) + self._save_insights() + except Exception: + pass + + # Record outcomes on matched contrastive pairs + try: + store = self._get_contrastive() + matched_pairs = store.retrieve_contrasts( + f"{task_type} task", user_id=user_id, limit=5, + ) + task_succeeded = what_worked is not None + for pair in matched_pairs: + pair.record_outcome( + success=task_succeeded, + baseline_score=baseline_score, + actual_score=outcome_score, + ) + store._save_all() + except Exception: + pass + + # Cross-structure: positive policy delta -> reinforce related heuristics + if what_worked: + try: + matched_policies = self._kernel.policies.match_policies( + user_id, task_type, f"{task_type} task", + ) + for policy in matched_policies: + if policy.last_delta > 0: + distiller = self._get_heuristic_distiller() + related = distiller.retrieve_relevant( + policy.action.approach, user_id=user_id, limit=3, + ) + for h in related: + h.record_outcome( + success=True, + baseline_score=baseline_score, + actual_score=outcome_score, + ) + distiller._save_all() + except Exception: + pass + + # Record outcomes on recently triggered intentions + try: + triggered = [ + i for i in self._kernel.intentions._intentions.values() + if i.user_id == user_id + and i.status == "triggered" + and i.was_useful is None + ] + task_succeeded = what_worked is not None and ( + outcome_score is None or outcome_score >= 0.5 + ) + for intention in triggered: + self._kernel.intentions.record_outcome( + intention.id, + useful=task_succeeded, + outcome_score=outcome_score, + ) + except Exception: + pass + return new_insights def _validate_used_heuristics( self, user_id: str, task_type: str, success: bool, + baseline_score: Optional[float] = None, + actual_score: Optional[float] = None, ) -> None: - """Close the heuristic validation loop. + """Close the heuristic validation loop with measured utility deltas. - When a task completes, validate heuristics that were retrieved for - this task type. This is the missing feedback loop that turns - scaffolding into real self-improvement. + When a task completes, update heuristics that were retrieved for + this task type with EMA utility tracking (not just boolean validate). """ try: distiller = self._get_heuristic_distiller() @@ -984,7 +1230,12 @@ def _validate_used_heuristics( limit=5, ) for h in relevant: - distiller.validate(h.id, validated=success) + h.record_outcome( + success=success, + baseline_score=baseline_score, + actual_score=actual_score, + ) + distiller._save_all() except Exception: pass @@ -1009,6 +1260,8 @@ def _save_insights(self) -> None: "validation_count": insight.validation_count, "invalidation_count": insight.invalidation_count, "tags": insight.tags, + "utility": insight.utility, + "apply_count": insight.apply_count, } f.write(json.dumps(row, ensure_ascii=False) + "\n") except OSError as e: @@ -1050,6 +1303,8 @@ def _load_state(self) -> None: validation_count=row.get("validation_count", 0), invalidation_count=row.get("invalidation_count", 0), tags=row.get("tags", []), + utility=row.get("utility", 0.0), + apply_count=row.get("apply_count", 0), ) self._insights[insight.id] = insight except (OSError, json.JSONDecodeError) as e: diff --git a/dhee/core/cognition_kernel.py b/dhee/core/cognition_kernel.py index aea8821..b99a7d8 100644 --- a/dhee/core/cognition_kernel.py +++ b/dhee/core/cognition_kernel.py @@ -88,12 +88,18 @@ def get_cognitive_state( except Exception: result["episodes"] = [] - # Task states + # Task states + active step context + step_context = "" + active_task_type = task_description or "general" try: task_states = [] active = self.tasks.get_active_task(user_id) if active: task_states.append(active.to_compact()) + current = active.current_step + if current: + step_context = current.description + active_task_type = active.task_type or active_task_type recent_tasks = self.tasks.get_recent_tasks(user_id, limit=3) for t in recent_tasks: c = t.to_compact() @@ -103,18 +109,37 @@ def get_cognitive_state( except Exception: result["task_states"] = [] - # Policies + result["active_step"] = step_context if step_context else None + + # Policies (pass step_context for better matching) try: matched = self.policies.match_policies( user_id=user_id, - task_type=task_description or "general", + task_type=active_task_type, task_description=task_description or "", + step_context=step_context, limit=3, ) result["policies"] = [p.to_compact() for p in matched] except Exception: result["policies"] = [] + # Step policies (separate, for operational context) + try: + if step_context: + step_matched = self.policies.match_step_policies( + user_id=user_id, + task_type=active_task_type, + task_description=task_description or "", + step_context=step_context, + limit=3, + ) + result["step_policies"] = [p.to_compact() for p in step_matched] + else: + result["step_policies"] = [] + except Exception: + result["step_policies"] = [] + # Beliefs belief_warnings: List[str] = [] try: @@ -169,6 +194,25 @@ def record_checkpoint_event( content=summary[:500], metadata={"status": status, "outcome_score": outcome_score}, ) + + # Wire episode.connection_count for cross-primitive links + try: + open_eps = getattr(self.episodes, '_open_episodes', {}) + ep_id = open_eps.get(user_id) + if ep_id: + eps_dict = getattr(self.episodes, '_episodes', {}) + ep = eps_dict.get(ep_id) + if ep: + active_task = self.tasks.get_active_task(user_id) + if active_task: + ep.connection_count += 1 + matched_policies = self.policies.match_policies( + user_id, summary[:50], summary[:200], limit=3, + ) + ep.connection_count += len(matched_policies) + except Exception: + pass + if status == "completed": episode = self.episodes.end_episode( user_id, outcome_score, summary @@ -231,10 +275,56 @@ def update_task_on_checkpoint( result["task_completed"] = active_task.id self.tasks.update_task(active_task) + + # Record outcomes on STEP policies for completed/failed steps + if status == "completed" and active_task.plan: + for step in active_task.plan: + if step.status.value == "completed": + self.record_step_outcome( + user_id, task_type, step.description, + success=True, actual_score=outcome_score, + ) + elif step.status.value == "failed": + self.record_step_outcome( + user_id, task_type, step.description, + success=False, actual_score=outcome_score, + ) except Exception: pass return result + def record_step_outcome( + self, + user_id: str, + task_type: str, + step_description: str, + success: bool, + baseline_score: Optional[float] = None, + actual_score: Optional[float] = None, + ) -> None: + """Record outcome on STEP policies matching a completed/failed step. + + Finds matching STEP policies and records their outcomes. + Zero LLM calls. + """ + try: + matched = self.policies.match_step_policies( + user_id=user_id, + task_type=task_type, + task_description=f"{task_type} task", + step_context=step_description, + limit=5, + ) + for policy in matched: + self.policies.record_outcome( + policy.id, + success=success, + baseline_score=baseline_score, + actual_score=actual_score, + ) + except Exception: + pass + def selective_forget( self, user_id: str, diff --git a/dhee/core/contrastive.py b/dhee/core/contrastive.py index ec443dd..f257047 100644 --- a/dhee/core/contrastive.py +++ b/dhee/core/contrastive.py @@ -43,6 +43,30 @@ class ContrastivePair: tags: List[str] = field(default_factory=list) validation_count: int = 0 # times this contrast proved useful + # Utility tracking (EMA of measured performance deltas) + utility: float = 0.0 + apply_count: int = 0 + _UTILITY_ALPHA: float = 0.3 + + def record_outcome( + self, + success: bool, + baseline_score: Optional[float] = None, + actual_score: Optional[float] = None, + ) -> float: + """Record outcome with optional measured delta.""" + self.apply_count += 1 + if success: + self.validation_count += 1 + delta = 0.0 + if baseline_score is not None and actual_score is not None: + delta = actual_score - baseline_score + self.utility = ( + self._UTILITY_ALPHA * delta + + (1 - self._UTILITY_ALPHA) * self.utility + ) + return delta + def to_dict(self) -> Dict[str, Any]: return { "id": self.id, @@ -55,6 +79,8 @@ def to_dict(self) -> Dict[str, Any]: "user_id": self.user_id, "tags": self.tags, "validation_count": self.validation_count, + "utility": self.utility, + "apply_count": self.apply_count, } @classmethod @@ -70,6 +96,8 @@ def from_dict(cls, d: Dict[str, Any]) -> ContrastivePair: user_id=d.get("user_id", "default"), tags=d.get("tags", []), validation_count=d.get("validation_count", 0), + utility=d.get("utility", 0.0), + apply_count=d.get("apply_count", 0), ) def to_compact(self) -> Dict[str, str]: @@ -79,6 +107,7 @@ def to_compact(self) -> Dict[str, str]: "do": self.success_approach[:300], "avoid": self.failure_approach[:300], "confidence": round(min(1.0, 0.5 + 0.1 * self.validation_count), 2), + "utility": round(self.utility, 3), } diff --git a/dhee/core/heuristic.py b/dhee/core/heuristic.py index 89767e6..188caa4 100644 --- a/dhee/core/heuristic.py +++ b/dhee/core/heuristic.py @@ -22,6 +22,7 @@ import json import logging +import math import os import time import uuid @@ -46,11 +47,47 @@ class Heuristic: invalidation_count: int = 0 tags: List[str] = field(default_factory=list) + # Utility tracking (EMA of measured performance deltas) + utility: float = 0.0 + last_delta: float = 0.0 + apply_count: int = 0 + + # EMA smoothing factor (class constant, not serialized) + _UTILITY_ALPHA: float = 0.3 + + def record_outcome( + self, + success: bool, + baseline_score: Optional[float] = None, + actual_score: Optional[float] = None, + ) -> float: + """Record outcome with optional measured delta. Mirrors PolicyCase.""" + self.apply_count += 1 + if success: + self.validation_count += 1 + self.confidence = min(1.0, self.confidence + 0.05) + else: + self.invalidation_count += 1 + self.confidence = max(0.0, self.confidence - 0.1) + + delta = 0.0 + if baseline_score is not None and actual_score is not None: + delta = actual_score - baseline_score + self.last_delta = delta + self.utility = ( + self._UTILITY_ALPHA * delta + + (1 - self._UTILITY_ALPHA) * self.utility + ) + return delta + def strength(self) -> float: total = self.validation_count + self.invalidation_count if total == 0: return self.confidence - return self.confidence * (self.validation_count / total) + base = self.confidence * (self.validation_count / total) + # Incorporate utility via sigmoid: maps utility to [0.5, 1.0] range + utility_factor = 0.5 + 0.5 * (1.0 / (1.0 + math.exp(-3.0 * self.utility))) + return base * utility_factor def to_dict(self) -> Dict[str, Any]: return { @@ -65,6 +102,9 @@ def to_dict(self) -> Dict[str, Any]: "validation_count": self.validation_count, "invalidation_count": self.invalidation_count, "tags": self.tags, + "utility": self.utility, + "last_delta": self.last_delta, + "apply_count": self.apply_count, } @classmethod @@ -80,6 +120,9 @@ def from_dict(cls, d: Dict[str, Any]) -> Heuristic: validation_count=d.get("validation_count", 0), invalidation_count=d.get("invalidation_count", 0), tags=d.get("tags", []), + utility=d.get("utility", 0.0), + last_delta=d.get("last_delta", 0.0), + apply_count=d.get("apply_count", 0), ) def to_compact(self) -> Dict[str, Any]: @@ -88,6 +131,7 @@ def to_compact(self) -> Dict[str, Any]: "heuristic": self.content[:300], "level": self.abstraction_level, "confidence": round(self.strength(), 2), + "utility": round(self.utility, 3), "applies_to": self.source_task_types[:3], } diff --git a/dhee/core/intention.py b/dhee/core/intention.py index cc0773b..82a1f98 100644 --- a/dhee/core/intention.py +++ b/dhee/core/intention.py @@ -44,6 +44,10 @@ class Intention: created_at: str triggered_at: Optional[str] + # Outcome tracking + outcome_score: Optional[float] = None + was_useful: Optional[bool] = None + def to_dict(self) -> Dict[str, Any]: return { "id": self.id, @@ -53,6 +57,8 @@ def to_dict(self) -> Dict[str, Any]: "status": self.status, "trigger_keywords": self.trigger_keywords, "trigger_after": self.trigger_after, + "outcome_score": self.outcome_score, + "was_useful": self.was_useful, } @@ -198,6 +204,20 @@ def get_active(self, user_id: str) -> List[Intention]: if i.user_id == user_id and i.status == "active" ] + def record_outcome( + self, + intention_id: str, + useful: bool, + outcome_score: Optional[float] = None, + ) -> None: + """Record whether a triggered intention was useful.""" + intention = self._intentions.get(intention_id) + if not intention: + return + intention.was_useful = useful + intention.outcome_score = outcome_score + self._save() + def get_stats(self, user_id: Optional[str] = None) -> Dict[str, Any]: """Stats for health checks.""" intentions = list(self._intentions.values()) @@ -229,6 +249,8 @@ def _save(self) -> None: "status": intention.status, "created_at": intention.created_at, "triggered_at": intention.triggered_at, + "outcome_score": intention.outcome_score, + "was_useful": intention.was_useful, } f.write(json.dumps(row, ensure_ascii=False) + "\n") except OSError as e: @@ -256,6 +278,8 @@ def _load(self) -> None: status=row.get("status", "active"), created_at=row.get("created_at", ""), triggered_at=row.get("triggered_at"), + outcome_score=row.get("outcome_score"), + was_useful=row.get("was_useful"), ) self._intentions[intention.id] = intention except (OSError, json.JSONDecodeError) as e: diff --git a/dhee/core/policy.py b/dhee/core/policy.py index d52c8d4..a4a3327 100644 --- a/dhee/core/policy.py +++ b/dhee/core/policy.py @@ -535,6 +535,99 @@ def extract_from_tasks( source_task_ids=[t.get("id", "") for t in successful[:5]], ) + def extract_step_policies( + self, + user_id: str, + completed_tasks: List[Dict[str, Any]], + task_type: str, + ) -> List[PolicyCase]: + """Extract STEP-granularity policies from repeated failure patterns. + + Analyzes failed steps across completed tasks of the same type. + When the same step fails >=2 times at the same position AND a + different approach succeeded at that position in other tasks, + creates a STEP correction policy. + + Pure structural analysis, zero LLM calls. + """ + # Collect failed steps grouped by (position, normalized description) + failure_counts: Dict[tuple, List[str]] = {} # (pos, desc) -> [task_ids] + success_at_pos: Dict[int, List[str]] = {} # pos -> [successful step descriptions] + + for task in completed_tasks: + plan = task.get("plan", []) + task_id = task.get("id", "") + task_score = task.get("outcome_score", 0) + + for idx, step in enumerate(plan): + status = step.get("status", "pending") + desc = step.get("description", "").lower().strip() + if not desc: + continue + + if status == "failed": + key = (idx, desc) + if key not in failure_counts: + failure_counts[key] = [] + failure_counts[key].append(task_id) + elif status == "completed" and task_score >= 0.6: + if idx not in success_at_pos: + success_at_pos[idx] = [] + success_at_pos[idx].append(desc) + + # Filter to steps that fail >=2 times + new_policies: List[PolicyCase] = [] + for (pos, failed_desc), task_ids in failure_counts.items(): + if len(task_ids) < 2: + continue + + # Find a successful alternative at the same position + alternatives = success_at_pos.get(pos, []) + if not alternatives: + continue + + # Pick the most common successful alternative + alt_freq: Dict[str, int] = {} + for alt in alternatives: + if alt != failed_desc: # Must be different from the failing step + alt_freq[alt] = alt_freq.get(alt, 0) + 1 + if not alt_freq: + continue + best_alt = max(alt_freq, key=alt_freq.get) + + # Extract keywords from failed step as step_patterns + stop = {"the", "a", "an", "to", "of", "in", "for", "on", "and", "or", "is", "it", "with"} + step_patterns = [ + w for w in failed_desc.split() + if len(w) > 2 and w not in stop + ][:5] + + if not step_patterns: + continue + + # Deduplicate against existing STEP policies + existing = self._find_similar_step_policy(user_id, task_type, step_patterns) + if existing: + existing.success_count += 1 + existing.apply_count += 1 + existing.updated_at = time.time() + self._save_policy(existing) + new_policies.append(existing) + continue + + policy = self.create_step_policy( + user_id=user_id, + name=f"{task_type}_step_fix_v{len(self._policies) + 1}", + task_types=[task_type], + step_patterns=step_patterns, + approach=best_alt, + avoid=[failed_desc], + source_task_ids=task_ids[:5], + ) + new_policies.append(policy) + + return new_policies + def match_policies( self, user_id: str, @@ -741,6 +834,31 @@ def _find_similar_policy( return None + def _find_similar_step_policy( + self, user_id: str, task_type: str, step_patterns: List[str], + ) -> Optional[PolicyCase]: + """Find an existing STEP policy with similar step_patterns.""" + pattern_words = set(w.lower() for w in step_patterns) + if not pattern_words: + return None + + for policy in self._policies.values(): + if policy.user_id != user_id: + continue + if policy.granularity != PolicyGranularity.STEP: + continue + if task_type not in policy.condition.task_types: + continue + + existing_words = set(w.lower() for w in policy.condition.step_patterns) + if not existing_words: + continue + overlap = len(pattern_words & existing_words) / len(pattern_words | existing_words) + if overlap > self.SIMILARITY_THRESHOLD: + return policy + + return None + def _delete_policy(self, policy_id: str) -> None: self._policies.pop(policy_id, None) path = os.path.join(self._dir, f"{policy_id}.json") diff --git a/dhee/simple.py b/dhee/simple.py index 26c29ec..2142acc 100644 --- a/dhee/simple.py +++ b/dhee/simple.py @@ -528,6 +528,7 @@ def context( self, task_description: Optional[str] = None, user_id: Optional[str] = None, + operational: bool = False, ) -> Dict[str, Any]: """HyperAgent session bootstrap. Call once at conversation start. @@ -538,10 +539,11 @@ def context( Args: task_description: What you're about to work on. user_id: Override default user_id. + operational: If True, return compact actionable-only format + for per-turn consumption instead of full context. Returns: - HyperContext dict with keys: warnings, insights, intentions, - performance, memories, last_session, meta. + HyperContext dict (full or operational). """ uid = user_id or self._user_id self._tracker.on_context(task_description) @@ -550,6 +552,8 @@ def context( task_description=task_description, memory=self._engram._memory, ) + if operational: + return hyper_ctx.to_operational_dict() return hyper_ctx.to_dict() # ------------------------------------------------------------------ diff --git a/tests/test_cognition_kernel.py b/tests/test_cognition_kernel.py index eebc5a9..6002f85 100644 --- a/tests/test_cognition_kernel.py +++ b/tests/test_cognition_kernel.py @@ -227,3 +227,317 @@ def test_kernel_stores_accessible(self, tmp_path): assert d.kernel.episodes is not None assert d.kernel.policies is not None assert d.kernel.intentions is not None + + +# ── Step-Level Policy Extraction (Phase 2) ──────────────────────── + + +class TestStepPolicyExtraction: + @pytest.fixture + def store(self, tmp_path): + from dhee.core.policy import PolicyStore + return PolicyStore(data_dir=str(tmp_path / "policies")) + + def _make_task(self, task_id, task_type, plan_steps, outcome_score): + """Helper: build a task dict with plan steps.""" + plan = [] + for desc, status in plan_steps: + plan.append({ + "id": f"step-{desc[:8]}", + "description": desc, + "status": status, + }) + return { + "id": task_id, + "task_type": task_type, + "outcome_score": outcome_score, + "plan": plan, + } + + def test_extract_step_policies_from_failures(self, store): + """When the same step fails >=2 times and another succeeds, extract STEP policy.""" + tasks = [ + self._make_task("t1", "bug_fix", [ + ("reproduce bug", "completed"), + ("check imports", "failed"), + ("write test", "completed"), + ], 0.3), + self._make_task("t2", "bug_fix", [ + ("reproduce bug", "completed"), + ("check imports", "failed"), + ("write test", "completed"), + ], 0.4), + self._make_task("t3", "bug_fix", [ + ("reproduce bug", "completed"), + ("trace call stack", "completed"), + ("write test", "completed"), + ], 0.9), + ] + + policies = store.extract_step_policies("u", tasks, "bug_fix") + assert len(policies) >= 1 + step_policy = policies[0] + assert step_policy.granularity.value == "step" + assert "trace call stack" in step_policy.action.approach + assert "check imports" in step_policy.action.avoid + + def test_no_step_policy_below_threshold(self, store): + """Need >=2 failures of same step to extract.""" + tasks = [ + self._make_task("t1", "bug_fix", [ + ("reproduce bug", "completed"), + ("check imports", "failed"), + ], 0.3), + self._make_task("t2", "bug_fix", [ + ("reproduce bug", "completed"), + ("trace call stack", "completed"), + ], 0.9), + ] + policies = store.extract_step_policies("u", tasks, "bug_fix") + assert len(policies) == 0 + + def test_dedup_step_policies(self, store): + """Similar step policies boost existing rather than duplicate.""" + tasks = [ + self._make_task("t1", "bug_fix", [("check imports", "failed")], 0.3), + self._make_task("t2", "bug_fix", [("check imports", "failed")], 0.3), + self._make_task("t3", "bug_fix", [("trace call stack", "completed")], 0.9), + ] + p1 = store.extract_step_policies("u", tasks, "bug_fix") + assert len(p1) == 1 + count_after_first = p1[0].apply_count + + # Extract again — should boost, not duplicate + p2 = store.extract_step_policies("u", tasks, "bug_fix") + assert len(p2) == 1 + assert p2[0].id == p1[0].id + assert p2[0].apply_count > count_after_first + + def test_step_context_in_cognitive_state(self, tmp_path): + """get_cognitive_state includes active_step and step_policies.""" + from dhee.core.cognition_kernel import CognitionKernel + kernel = CognitionKernel(data_dir=str(tmp_path / "kernel")) + # Create active task with current step + task = kernel.tasks.create_task("u", "Fix auth", "bug_fix", plan=["step 1", "step 2"]) + task.start() # starts first step + kernel.tasks.update_task(task) + + state = kernel.get_cognitive_state("u", "bug_fix") + assert "active_step" in state + assert state["active_step"] == "step 1" + assert "step_policies" in state + + def test_record_step_outcome(self, tmp_path): + """record_step_outcome finds and updates matching STEP policies.""" + from dhee.core.cognition_kernel import CognitionKernel + kernel = CognitionKernel(data_dir=str(tmp_path / "kernel")) + + # Create a STEP policy + kernel.policies.create_step_policy( + user_id="u", + name="check_imports_fix", + task_types=["bug_fix"], + step_patterns=["check", "imports"], + approach="trace call stack instead", + ) + + # Record step outcome + kernel.record_step_outcome( + "u", "bug_fix", "check imports first", + success=True, actual_score=0.8, + ) + + # Verify policy was updated + policies = list(kernel.policies._policies.values()) + step_policy = [p for p in policies if p.granularity.value == "step"][0] + assert step_policy.apply_count == 1 + assert step_policy.success_count == 1 + + +# ── Utility Tracking (Phase 3) ─────────────────────────────────── + + +class TestUtilityTracking: + def test_heuristic_record_outcome(self, tmp_path): + """Heuristic.record_outcome updates utility via EMA.""" + from dhee.core.heuristic import Heuristic + h = Heuristic( + id="h1", content="test heuristic", abstraction_level="domain", + source_task_types=["bug_fix"], confidence=0.6, created_at=0.0, + ) + delta = h.record_outcome(success=True, baseline_score=0.5, actual_score=0.8) + assert delta == pytest.approx(0.3) + assert h.utility > 0 + assert h.apply_count == 1 + assert h.validation_count == 1 + + def test_heuristic_strength_includes_utility(self, tmp_path): + """strength() incorporates utility factor.""" + from dhee.core.heuristic import Heuristic + h1 = Heuristic( + id="h1", content="test", abstraction_level="domain", + source_task_types=["t"], confidence=0.8, created_at=0.0, + validation_count=3, invalidation_count=1, + ) + base_strength = h1.strength() + + h2 = Heuristic( + id="h2", content="test", abstraction_level="domain", + source_task_types=["t"], confidence=0.8, created_at=0.0, + validation_count=3, invalidation_count=1, + utility=0.5, # positive utility + ) + boosted_strength = h2.strength() + assert boosted_strength > base_strength + + def test_insight_record_outcome(self): + """Insight.record_outcome updates utility via EMA.""" + from dhee.core.buddhi import Insight + i = Insight( + id="i1", user_id="u", content="test insight", + insight_type="strategy", source_task_types=["bug_fix"], + confidence=0.7, created_at="2026-01-01", last_validated="2026-01-01", + validation_count=0, invalidation_count=0, tags=["test"], + ) + delta = i.record_outcome(success=True, baseline_score=0.4, actual_score=0.9) + assert delta == pytest.approx(0.5) + assert i.utility > 0 + assert i.apply_count == 1 + + def test_intention_record_outcome(self, tmp_path): + """IntentionStore.record_outcome marks intention usefulness.""" + from dhee.core.intention import IntentionStore + store = IntentionStore(data_dir=str(tmp_path / "intentions")) + i = store.store("u", "run tests after deploy", trigger_keywords=["deploy"]) + # Simulate trigger + i.status = "triggered" + store.record_outcome(i.id, useful=True, outcome_score=0.9) + assert i.was_useful is True + assert i.outcome_score == 0.9 + + def test_contrastive_record_outcome(self): + """ContrastivePair.record_outcome updates utility.""" + from dhee.core.contrastive import ContrastivePair + pair = ContrastivePair( + id="c1", task_description="test task", task_type="bug_fix", + success_approach="do X", failure_approach="do Y", + outcome_delta=0.5, created_at=0.0, + ) + delta = pair.record_outcome(success=True, baseline_score=0.5, actual_score=0.8) + assert delta == pytest.approx(0.3) + assert pair.utility > 0 + assert pair.apply_count == 1 + assert pair.validation_count == 1 + + def test_episode_connection_count_incremented(self, tmp_path): + """connection_count increases on checkpoint when task exists.""" + from dhee.core.cognition_kernel import CognitionKernel + kernel = CognitionKernel(data_dir=str(tmp_path / "kernel")) + # Create a task and start an episode + kernel.tasks.create_task("u", "fix auth", "bug_fix") + kernel.episodes.begin_episode("u", "working on auth", "bug_fix") + + kernel.record_checkpoint_event("u", "progress on auth", "paused") + + # Check episode connection_count was incremented + open_eps = getattr(kernel.episodes, '_open_episodes', {}) + ep_id = open_eps.get("u") + if ep_id: + ep = kernel.episodes._episodes.get(ep_id) + assert ep.connection_count >= 1 + + +# ── Operational Context (Phase 4) ──────────────────────────────── + + +class TestOperationalContext: + def test_hyper_context_has_operational_fields(self, tmp_path): + """HyperContext includes active_step, step_policies, action_items.""" + from dhee.core.buddhi import Buddhi + from dhee.core.cognition_kernel import CognitionKernel + kernel = CognitionKernel(data_dir=str(tmp_path / "buddhi")) + buddhi = Buddhi(data_dir=str(tmp_path / "buddhi"), kernel=kernel) + + # Create active task with current step + task = kernel.tasks.create_task("u", "Fix auth", "bug_fix", plan=["step 1", "step 2"]) + task.start() + kernel.tasks.update_task(task) + + ctx = buddhi.get_hyper_context(user_id="u", task_description="bug_fix") + d = ctx.to_dict() + assert "active_step" in d + assert d["active_step"] == {"description": "step 1"} + assert "step_policies" in d + assert "action_items" in d + assert "critical_blockers" in d + assert "contradictions" in d + assert any("[NEXT STEP]" in item for item in d["action_items"]) + + def test_to_operational_dict_compact(self, tmp_path): + """to_operational_dict() returns only actionable items.""" + from dhee.core.buddhi import Buddhi + from dhee.core.cognition_kernel import CognitionKernel + kernel = CognitionKernel(data_dir=str(tmp_path / "buddhi")) + buddhi = Buddhi(data_dir=str(tmp_path / "buddhi"), kernel=kernel) + + task = kernel.tasks.create_task("u", "Fix auth", "bug_fix", plan=["step 1"]) + task.start() + kernel.tasks.update_task(task) + + ctx = buddhi.get_hyper_context(user_id="u", task_description="bug_fix") + op = ctx.to_operational_dict() + + # Should have current_step and action_items + assert "current_step" in op + assert "action_items" in op + # Should NOT have full history fields + assert "insights" not in op + assert "performance" not in op + assert "memories" not in op + + def test_action_items_priority_order(self, tmp_path): + """Intentions come before steps in action_items.""" + from dhee.core.buddhi import Buddhi + from dhee.core.cognition_kernel import CognitionKernel + kernel = CognitionKernel(data_dir=str(tmp_path / "buddhi")) + buddhi = Buddhi(data_dir=str(tmp_path / "buddhi"), kernel=kernel) + + # Create intention + active step + kernel.intentions.store("u", "run tests after deploy", trigger_keywords=["bug_fix"]) + task = kernel.tasks.create_task("u", "Fix auth", "bug_fix", plan=["debug code"]) + task.start() + kernel.tasks.update_task(task) + + ctx = buddhi.get_hyper_context(user_id="u", task_description="bug_fix") + items = ctx.action_items + + intention_idx = next((i for i, x in enumerate(items) if "[INTENTION]" in x), None) + step_idx = next((i for i, x in enumerate(items) if "[NEXT STEP]" in x), None) + if intention_idx is not None and step_idx is not None: + assert intention_idx < step_idx + + def test_context_operational_flag(self, tmp_path): + """context(operational=True) returns compact format.""" + from dhee.simple import Dhee + d = Dhee(in_memory=True, data_dir=str(tmp_path)) + full = d.context("test task") + op = d.context("test task", operational=True) + # Full has many keys, operational is a subset + assert "user_id" in full + assert "user_id" not in op + + def test_critical_blockers_surfaced(self, tmp_path): + """Blockers from active task appear in critical_blockers.""" + from dhee.core.buddhi import Buddhi + from dhee.core.cognition_kernel import CognitionKernel + kernel = CognitionKernel(data_dir=str(tmp_path / "buddhi")) + buddhi = Buddhi(data_dir=str(tmp_path / "buddhi"), kernel=kernel) + + task = kernel.tasks.create_task("u", "Fix auth", "bug_fix", plan=["step 1"]) + task.start() + task.add_blocker("missing API key", severity="hard") + kernel.tasks.update_task(task) + + ctx = buddhi.get_hyper_context(user_id="u", task_description="bug_fix") + assert len(ctx.critical_blockers) >= 1 + assert any("API key" in b for b in ctx.critical_blockers)