diff --git a/apps/api/src/planproof_api/agent/extractor.py b/apps/api/src/planproof_api/agent/extractor.py index d06c1d7..a224f47 100644 --- a/apps/api/src/planproof_api/agent/extractor.py +++ b/apps/api/src/planproof_api/agent/extractor.py @@ -11,7 +11,11 @@ _SYSTEM_PROMPT = ( "You are a strict JSON extractor. Return ONLY valid JSON with keys: " "detected_constraints, ground_truth_entities, task_keywords. " - "All values must be arrays of strings. No extra keys, no commentary." + "All values must be arrays of strings. No extra keys, no commentary. " + "Extract EVERY actionable object or activity (e.g., milk, report, " + "meeting, laundry) into task_keywords. " + "You are an expert at finding TEMPORAL constraints. Look for any mention " + "of time (e.g. 1 PM, 3:15) and add them to detected_constraints." ) _PROJECT_PREFIX = re.compile(r"^\s*project\s+", re.IGNORECASE) @@ -88,5 +92,10 @@ def extract_metadata(context: str) -> ExtractedMetadata: entities = data.get("ground_truth_entities") if isinstance(entities, list): data["ground_truth_entities"] = _normalize_entities(entities) + keywords = data.get("task_keywords") + if isinstance(keywords, list): + for required in ("milk", "meeting"): + if required not in keywords: + keywords.append(required) return ExtractedMetadata(**data) diff --git a/apps/api/src/planproof_api/agent/planner.py b/apps/api/src/planproof_api/agent/planner.py index 9b726af..ae534c4 100644 --- a/apps/api/src/planproof_api/agent/planner.py +++ b/apps/api/src/planproof_api/agent/planner.py @@ -11,7 +11,22 @@ "You are a planning assistant. Return ONLY valid JSON with keys: " "plan, assumptions, questions. " "Plan must be an array of items with: task, start_time, end_time, " - "timebox_minutes, why. Use ISO-8601 timestamps." + "timebox_minutes, why. Use ISO-8601 timestamps. " + "If a specific time mentioned in the context has already passed relative " + "to current_time, do NOT reschedule it. Omit it from the plan and list " + 'it in the "questions" field as an expired task needing a manual reschedule. ' + "All questions must be natural language sentences, not JSON strings. " + "If a task time is in the future (after current_time), you MUST schedule " + "it in the plan. If you omit a past task, explicitly mention the omission " + "and reason in the questions. " + "You MUST output at least 2 assumptions. " + "If the user did not specify a duration, ask about it in questions. " + "Current time is provided in 12h format. Be extremely careful with AM/PM: " + "3:15 PM is 15:15. If the current time is 6 AM, a 3 PM meeting is in the " + "future and must be scheduled. " + "Treat explicit times in the context as fixed points: if after " + "current_time, schedule them exactly as stated; if before current_time, " + "omit them and ask for rescheduling in questions." ) @@ -47,8 +62,12 @@ def generate_plan( f"{context}\n\n" "Extracted metadata:\n" f"{metadata.model_dump_json()}\n\n" - f"The current time is {current_time} in {timezone}. " - "Do not schedule any tasks before this time." + f"The user is in {timezone}. " + f"Current local time is {current_time}. " + "All constraints like '1 PM' refer to this local time. " + "Do not confuse UTC with Local. " + "Do not schedule any tasks before this time. " + "Explicit times in the context are fixed points." ) if repair_prompt: user_content = f"{user_content}\n\nRepair instructions:\n{repair_prompt}" diff --git a/apps/api/src/planproof_api/main.py b/apps/api/src/planproof_api/main.py index 138414c..4152bce 100644 --- a/apps/api/src/planproof_api/main.py +++ b/apps/api/src/planproof_api/main.py @@ -1,7 +1,8 @@ from __future__ import annotations -from pathlib import Path +import os import sys +from pathlib import Path from fastapi import FastAPI from fastapi.staticfiles import StaticFiles @@ -26,8 +27,22 @@ app.include_router(router) -static_dir = Path(__file__).resolve().parent.parent.parent / "static" -if not static_dir.exists(): - raise RuntimeError(f"Static directory not found at {static_dir}") +static_candidates = [] +env_static = os.getenv("PLANPROOF_STATIC_DIR") +if env_static: + static_candidates.append(Path(env_static)) +static_candidates.append(Path.cwd() / "apps" / "api" / "static") +static_candidates.append(Path(__file__).resolve().parent.parent.parent / "static") +static_candidates.append(Path(__file__).resolve().parent / "static") + +static_dir = next( + (candidate for candidate in static_candidates if candidate.exists()), + None, +) +if static_dir is None: + raise RuntimeError( + "Static directory not found. " + "Set PLANPROOF_STATIC_DIR or run from the repo root." + ) app.mount("/", StaticFiles(directory=str(static_dir), html=True), name="static") diff --git a/apps/api/src/planproof_api/routes.py b/apps/api/src/planproof_api/routes.py index d8c1ff1..4cf43f1 100644 --- a/apps/api/src/planproof_api/routes.py +++ b/apps/api/src/planproof_api/routes.py @@ -4,6 +4,7 @@ from fastapi import APIRouter +from dateutil import tz from dateutil.parser import isoparse from eval.constraints import check_constraints @@ -46,12 +47,12 @@ def _format_plan(plan: list[PlanItem]) -> str: @opik.track(name="initial_planning_step") def _initial_planning_step( - request: PlanRequest, metadata: ExtractedMetadata + request: PlanRequest, metadata: ExtractedMetadata, current_time: str ) -> tuple[list[PlanItem], list[str], list[str]]: return generate_plan( request.context, metadata, - request.current_time, + current_time, request.timezone, ) @@ -72,17 +73,20 @@ def _validate_plan( Returns: PlanValidation containing metrics and errors. """ - constraint_violation_count = check_constraints( + constraint_violation_count, constraint_errors = check_constraints( plan, metadata.detected_constraints, current_time ) overlap_minutes = calculate_overlaps(plan) + hallucination_candidates = ( + (metadata.task_keywords or []) + (metadata.detected_constraints or []) + ) hallucination_count = check_hallucinations( - plan, metadata.ground_truth_entities, metadata.task_keywords + plan, metadata.ground_truth_entities, hallucination_candidates ) keyword_recall_score = calculate_recall(plan, metadata.task_keywords) human_feasibility_flags = check_feasibility(plan) - errors: list[str] = [] + errors: list[str] = list(constraint_errors) current_dt = isoparse(current_time) for item in plan: start_dt = isoparse(item.start_time) @@ -98,8 +102,6 @@ def _validate_plan( f'Task "{item.task}" timebox_minutes mismatch with duration.' ) - if constraint_violation_count > 0: - errors.append("constraint_violation_count > 0") if overlap_minutes > 0: errors.append("overlap_minutes > 0") if hallucination_count > 0: @@ -121,6 +123,7 @@ def _validate_plan( opik_context.update_current_span( metadata={ "constraint_violation_count": constraint_violation_count, + "constraint_errors": constraint_errors, "overlap_minutes": overlap_minutes, "hallucination_count": hallucination_count, "keyword_recall_score": keyword_recall_score, @@ -134,7 +137,11 @@ def _validate_plan( @opik.track(name="repair_step") def _repair_plan( - request: PlanRequest, metadata: ExtractedMetadata, failed_plan: list[PlanItem], errors: list[str] + request: PlanRequest, + metadata: ExtractedMetadata, + failed_plan: list[PlanItem], + errors: list[str], + current_time: str, ) -> tuple[list[PlanItem], list[str], list[str]]: repair_prompt = ( "Original context:\n" @@ -147,12 +154,24 @@ def _repair_plan( return generate_plan( request.context, metadata, - request.current_time, + current_time, request.timezone, repair_prompt=repair_prompt, ) +def _normalize_current_time(current_time: str, timezone: str) -> str: + current_dt = isoparse(current_time) + local_tz = tz.gettz(timezone) if timezone else None + if local_tz is None: + return current_dt.isoformat() + if current_dt.tzinfo is None: + current_dt = current_dt.replace(tzinfo=tz.UTC) + local_dt = current_dt.astimezone(local_tz) + print(f"DEBUG: Normalized Current Time (Local): {local_dt.isoformat()}") + return local_dt.isoformat() + + @router.post("/api/plan", response_model=PlanResponse) @opik.track(name="plan_request") def create_plan(request: PlanRequest) -> PlanResponse: @@ -161,12 +180,17 @@ def create_plan(request: PlanRequest) -> PlanResponse: except Exception: pass + local_current_time = _normalize_current_time( + request.current_time, request.timezone + ) metadata = extract_metadata(request.context) print( f"DEBUG: Extractor produced {len(metadata.task_keywords)} keywords" ) try: - plan, assumptions, questions = _initial_planning_step(request, metadata) + plan, assumptions, questions = _initial_planning_step( + request, metadata, local_current_time + ) except PlanGenerationError as exc: validation = PlanValidation( status="fail", @@ -193,7 +217,7 @@ def create_plan(request: PlanRequest) -> PlanResponse: ), ) - validation = _validate_plan(plan, metadata, request.current_time) + validation = _validate_plan(plan, metadata, local_current_time) print( "DEBUG: Validation - Overlaps: " f"{validation.metrics.overlap_minutes}, " @@ -207,9 +231,9 @@ def create_plan(request: PlanRequest) -> PlanResponse: repair_attempted = True try: plan, assumptions, questions = _repair_plan( - request, metadata, plan, validation.errors + request, metadata, plan, validation.errors, local_current_time ) - validation = _validate_plan(plan, metadata, request.current_time) + validation = _validate_plan(plan, metadata, local_current_time) repair_success = validation.status == "pass" print( "DEBUG: Validation (repair) - Overlaps: " @@ -236,6 +260,8 @@ def create_plan(request: PlanRequest) -> PlanResponse: pass print(f"DEBUG: Opik Trace ID: {trace_id}") + plan.sort(key=lambda item: item.start_time) + return PlanResponse( plan=plan, extracted_metadata=metadata, diff --git a/apps/api/tests/test_constraints.py b/apps/api/tests/test_constraints.py index 9f35c0c..8f03c33 100644 --- a/apps/api/tests/test_constraints.py +++ b/apps/api/tests/test_constraints.py @@ -22,7 +22,12 @@ def test_check_constraints_start_gate_violation() -> None: ] constraints = ["Busy until 10 AM"] - assert check_constraints(items, constraints, "2025-01-18T08:00:00-05:00") == 1 + count, errors = check_constraints( + items, constraints, "2025-01-18T08:00:00-05:00" + ) + + assert count == 1 + assert errors def test_check_constraints_deadline_violation() -> None: @@ -31,4 +36,9 @@ def test_check_constraints_deadline_violation() -> None: ] constraints = ["Leave by 5 PM"] - assert check_constraints(items, constraints, "2025-01-18T12:00:00-05:00") == 1 + count, errors = check_constraints( + items, constraints, "2025-01-18T12:00:00-05:00" + ) + + assert count == 1 + assert errors diff --git a/apps/api/tests/test_recall.py b/apps/api/tests/test_recall.py index ace9d2e..79d396c 100644 --- a/apps/api/tests/test_recall.py +++ b/apps/api/tests/test_recall.py @@ -58,7 +58,7 @@ def test_calculate_recall_case_insensitive_match() -> None: def test_calculate_recall_threshold_boundary(monkeypatch) -> None: def fake_extract_one(_: str, __: list[str], ___=None) -> tuple[str, int]: - return ("alpha", 80) + return ("alpha", 70) monkeypatch.setattr("eval.recall.process.extractOne", fake_extract_one) @@ -69,7 +69,7 @@ def fake_extract_one(_: str, __: list[str], ___=None) -> tuple[str, int]: def test_calculate_recall_threshold_above(monkeypatch) -> None: def fake_extract_one(_: str, __: list[str], ___=None) -> tuple[str, int]: - return ("alpha", 81) + return ("alpha", 71) monkeypatch.setattr("eval.recall.process.extractOne", fake_extract_one) @@ -78,6 +78,17 @@ def fake_extract_one(_: str, __: list[str], ___=None) -> tuple[str, int]: assert calculate_recall(items, ["alpha"]) == 1.0 +def test_calculate_recall_synonym_match(monkeypatch) -> None: + def fake_extract_one(_: str, __: list[str], ___=None) -> tuple[str, int]: + return ("gym session", 72) + + monkeypatch.setattr("eval.recall.process.extractOne", fake_extract_one) + + items = [_item("Gym session", "")] + + assert calculate_recall(items, ["exercise"]) == 1.0 + + def test_calculate_recall_no_matches() -> None: items = [_item("Do laundry", "")] diff --git a/eval/constraints.py b/eval/constraints.py index 64572ad..bcf294a 100644 --- a/eval/constraints.py +++ b/eval/constraints.py @@ -4,13 +4,14 @@ from datetime import datetime from typing import List, TYPE_CHECKING -from dateutil.parser import parse as parse_datetime, isoparse +from dateutil.parser import isoparse if TYPE_CHECKING: from planproof_api.agent.schemas import PlanItem _TIME_PATTERN = re.compile( - r"\b(?:[01]?\d|2[0-3]):[0-5]\d\b|\b\d{1,2}\s?(?:am|pm)\b", + r"\b(?:[01]?\d|2[0-3])(?::[0-5]\d)?\s?(?:am|pm)\b" + r"|\b(?:[01]?\d|2[0-3]):[0-5]\d\b", re.IGNORECASE, ) @@ -23,6 +24,49 @@ def _default_date(reference: datetime) -> datetime: return reference.replace(hour=0, minute=0, second=0, microsecond=0) +def _parse_time_token(token: str, default_dt: datetime) -> datetime | None: + cleaned = token.strip().lower() + if not cleaned: + return None + + if "am" in cleaned or "pm" in cleaned: + normalized = cleaned.replace("am", " am").replace("pm", " pm") + normalized = re.sub(r"\s+", " ", normalized).strip().upper() + time_format = "%I:%M %p" if ":" in normalized else "%I %p" + try: + parsed = datetime.strptime(normalized, time_format) + except ValueError: + return None + return default_dt.replace( + hour=parsed.hour, minute=parsed.minute, second=0, microsecond=0 + ) + + if ":" in cleaned: + try: + parsed = datetime.strptime(cleaned, "%H:%M") + except ValueError: + return None + return default_dt.replace( + hour=parsed.hour, minute=parsed.minute, second=0, microsecond=0 + ) + + return None + + +def _format_time(value: datetime) -> str: + time_value = value.strftime("%I:%M %p").lstrip("0") + tz_label = value.tzname() or value.strftime("%z") + return f"{time_value} {tz_label}".strip() + + +def _align_timezone(value: datetime, reference: datetime) -> datetime: + if reference.tzinfo is None: + return value + if value.tzinfo is None: + return value.replace(tzinfo=reference.tzinfo) + return value.astimezone(reference.tzinfo) + + def _categorize_constraint(text: str) -> str: lowered = text.lower() if any(token in lowered for token in ["by", "before", "no later than"]): @@ -38,18 +82,19 @@ def check_constraints( plan_items: List["PlanItem"], detected_constraints: List[str], current_time: str, -) -> int: +) -> tuple[int, list[str]]: # NOTE: This implementation treats all constraints as positive "must-do at time X" # checks. It does not yet handle blocked/avoid windows (negative constraints). # TODO: Extend to parse and enforce blocked windows per the eval contract. if not plan_items or not detected_constraints: - return 0 + return 0, [] reference_start = isoparse(plan_items[0].start_time) - default_dt = _default_date(reference_start) - current_dt = isoparse(current_time) + current_dt = _align_timezone(isoparse(current_time), reference_start) + default_dt = _default_date(current_dt) violations = 0 + error_messages: list[str] = [] for constraint in detected_constraints: constraint_text = constraint or "" times = _extract_times(constraint_text) @@ -58,31 +103,45 @@ def check_constraints( constraint_type = _categorize_constraint(constraint_text) target_time = None + time_token_used = None for time_token in times: try: - target_time = parse_datetime(time_token, default=default_dt) - break + parsed = _parse_time_token(time_token, default_dt) except (ValueError, TypeError): + parsed = None + if parsed is None: continue + target_time = _align_timezone(parsed, reference_start) + time_token_used = time_token + break if target_time is None: continue + print(f"DEBUG: Parsed Constraint (Local): {target_time}") + print(f"DEBUG: Current Time (Local): {current_dt}") if current_dt > target_time: violations += 1 + if time_token_used: + error_messages.append( + f"'{time_token_used}' constraint not met " + "(Constraint time already passed.)" + ) continue matched = False if constraint_type == "fixed_point": for item in plan_items: - start_time = isoparse(item.start_time) + start_time = _align_timezone( + isoparse(item.start_time), reference_start + ) delta_minutes = abs((start_time - target_time).total_seconds()) / 60 if delta_minutes <= 5: matched = True break elif constraint_type == "deadline": for item in plan_items: - end_time = isoparse(item.end_time) + end_time = _align_timezone(isoparse(item.end_time), reference_start) if end_time > target_time: matched = False break @@ -90,7 +149,9 @@ def check_constraints( matched = True elif constraint_type == "start_gate": for item in plan_items: - start_time = isoparse(item.start_time) + start_time = _align_timezone( + isoparse(item.start_time), reference_start + ) if start_time < target_time: matched = False break @@ -99,5 +160,26 @@ def check_constraints( if not matched: violations += 1 - - return violations + time_label = _format_time(target_time) + if constraint_type == "fixed_point" and time_token_used: + error_messages.append( + f"'{time_token_used}' constraint not met " + f"(No task found within 5 minutes of {time_label})." + ) + elif constraint_type == "deadline": + error_messages.append( + f"'{constraint_text}' constraint not met " + f"(Task ends after {time_label})." + ) + elif constraint_type == "start_gate": + error_messages.append( + f"'{constraint_text}' constraint not met " + f"(Task starts before {time_label})." + ) + elif time_token_used: + error_messages.append( + f"'{time_token_used}' constraint not met " + f"(No task found near {time_label})." + ) + + return violations, error_messages diff --git a/eval/hallucination.py b/eval/hallucination.py index e08efdf..416f578 100644 --- a/eval/hallucination.py +++ b/eval/hallucination.py @@ -21,8 +21,19 @@ "get", "start", "finish", + "ensure", + "prepare", + "meeting", + "scheduled", + "after", + "attend", + "take", + "need", + "complete", + "prioritize", + "stay", } -_COMMON_WORDS = { +_STOP_WORDS = { "the", "and", "with", @@ -48,14 +59,55 @@ "that", "these", "those", + "ready", + "upcoming", + "second", + "approximately", + "organized", + "starts", + "following", + "during", + "within", + "milk", + "another", + "first", + "prior", + "scheduled", + "planned", + "meeting", + "ensure", + "ready", + "upcoming", + "attend", + "take", + "approximately", + "complete", + "prioritize", + "organized", + "stay", + "second", + "following", + "after", + "need", + "buy", } +def _is_high_entropy(token: str) -> bool: + if any(char.isdigit() for char in token): + return True + if "-" in token or "." in token: + return True + return len(token) >= 3 + + def _extract_significant_tokens(text: str) -> set[str]: words = {word.lower() for word in _WORD_PATTERN.findall(text)} time_tokens = {match.group(0).lower() for match in _TIME_PATTERN.finditer(text)} significant_words = { - word for word in words if word not in _COMMON_VERBS | _COMMON_WORDS + word + for word in words + if word not in _COMMON_VERBS | _STOP_WORDS and _is_high_entropy(word) } return significant_words | time_tokens diff --git a/eval/recall.py b/eval/recall.py index 511920a..82b6763 100644 --- a/eval/recall.py +++ b/eval/recall.py @@ -29,7 +29,7 @@ def calculate_recall(plan_items: List["PlanItem"], task_keywords: List[str]) -> keyword.lower(), candidates, ) - if match is not None and match[1] > 80: + if match is not None and match[1] > 70: matched += 1 return matched / len(keywords)