diff --git a/.claude/skills/conductor/SKILL.md b/.claude/skills/conductor/SKILL.md index c54094f..1ebfe55 100644 --- a/.claude/skills/conductor/SKILL.md +++ b/.claude/skills/conductor/SKILL.md @@ -100,7 +100,7 @@ For runtime config, context modes, limits, and cost tracking, see [references/au | `context.mode` | How agents share data (accumulate, last_only, explicit) | | `limits` | Safety bounds (max_iterations up to 500, timeout_seconds) | | `cost` | Token usage and cost tracking configuration | -| `runtime` | Provider, model, temperature, max_tokens, MCP servers | +| `runtime` | Provider, model, temperature, max_tokens, reasoning effort, MCP servers | | `--web` | Real-time web dashboard with DAG graph, live streaming, in-browser human gates | | `checkpoint` | Auto-saved on failure; resume with `conductor resume` | | `registry` | Named workflow sources (GitHub repo or local dir) for sharing workflows | diff --git a/.claude/skills/conductor/references/authoring.md b/.claude/skills/conductor/references/authoring.md index 533b50b..f696cee 100644 --- a/.claude/skills/conductor/references/authoring.md +++ b/.claude/skills/conductor/references/authoring.md @@ -19,6 +19,7 @@ workflow: timeout: 600 # Per-request timeout in seconds (optional) max_agent_iterations: 50 # Max tool-use roundtrips per agent (1-500, optional) max_session_seconds: 120 # Wall-clock timeout per agent session (optional) + default_reasoning_effort: medium # Workflow-wide reasoning effort: low, medium, high, xhigh (optional) input: # Define workflow inputs param_name: @@ -76,10 +77,41 @@ agents: max_agent_iterations: 100 # Override workflow default for this agent (optional) max_session_seconds: 60 # Wall-clock timeout for this agent (optional) + reasoning: # Override runtime.default_reasoning_effort (optional) + effort: high # low, medium, high, or xhigh + routes: # Where to go next - to: next_agent ``` +### Reasoning Effort + +`reasoning.effort` (per-agent) and `runtime.default_reasoning_effort` (workflow-wide) accept `low`, `medium`, `high`, or `xhigh`. Per-agent overrides the runtime default. The provider translates the unified value to its native API: + +- **Copilot**: forwarded as `reasoning_effort` on the session. Validated against the model's advertised `supported_reasoning_efforts`; raises `ValidationError` for unsupported combinations (skipped in mock-handler mode or when capability metadata is absent). +- **Claude**: enables extended thinking via `thinking={"type": "enabled", "budget_tokens": N}` with mapping `low=2048`, `medium=8192`, `high=16384`, `xhigh=32768`. Auto-coerces `temperature` to `1.0` (logged at INFO) and bumps `max_tokens` to fit `budget + 4096` (capped at 64000, logged at INFO when clamped). Only valid on thinking-capable models (`claude-3-7-*`, `claude-opus-4*`, `claude-sonnet-4*`, `claude-haiku-4*`); raises `ValidationError` otherwise. + +Both providers surface reasoning content via `agent_reasoning` events visible in the dashboard, JSONL logs, and the console at `-vv`. Not allowed on `script`, `human_gate`, or `workflow` agent types. + +```yaml +runtime: + provider: claude + default_model: claude-opus-4-20250514 + default_reasoning_effort: medium # workflow-wide default + +agents: + - name: explainer + prompt: "Explain this algorithm." + # inherits 'medium' + + - name: architect + reasoning: + effort: high # override + prompt: "Design the system architecture." +``` + +See `examples/reasoning-effort.yaml` for a complete example. 
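For orientation, here is the precedence and translation logic from this change in one condensed Python sketch. It mirrors the `resolve_reasoning_effort` and `EFFORT_TO_BUDGET_TOKENS` helpers added in `src/conductor/providers/reasoning.py`; the shortened names (`resolve_effort`, `EFFORT_TO_BUDGET`) are illustrative, not part of the API.

```python
from typing import Literal

ReasoningEffort = Literal["low", "medium", "high", "xhigh"]

# Claude-side translation: effort level -> extended-thinking budget_tokens.
# Copilot forwards the effort string unchanged as `reasoning_effort`.
EFFORT_TO_BUDGET: dict[ReasoningEffort, int] = {
    "low": 2048,
    "medium": 8192,
    "high": 16384,
    "xhigh": 32768,
}


def resolve_effort(
    agent_effort: ReasoningEffort | None,
    runtime_default: ReasoningEffort | None,
) -> ReasoningEffort | None:
    """Per-agent reasoning.effort wins; otherwise fall back to the runtime default."""
    return agent_effort if agent_effort is not None else runtime_default
```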
+ ## Routing Patterns ### Linear diff --git a/.claude/skills/conductor/references/yaml-schema.md b/.claude/skills/conductor/references/yaml-schema.md index aca9cfa..ccd7d54 100644 --- a/.claude/skills/conductor/references/yaml-schema.md +++ b/.claude/skills/conductor/references/yaml-schema.md @@ -34,6 +34,7 @@ workflow: timeout: float # Per-request timeout in seconds (optional, default: 600) max_agent_iterations: integer # Max tool-use roundtrips per agent (1-500, optional) max_session_seconds: float # Wall-clock timeout per agent session in seconds (optional) + default_reasoning_effort: string # Workflow-wide reasoning/thinking effort: low, medium, high, xhigh (optional) mcp_servers: # MCP server configurations : type: string # "stdio" (default), "http", or "sse" @@ -130,6 +131,11 @@ agents: max_agent_iterations: integer # Max tool-use roundtrips for this agent (1-500, optional) max_session_seconds: float # Wall-clock timeout for this agent session (optional) + # Per-agent reasoning effort (overrides runtime.default_reasoning_effort) + # Not allowed for script, human_gate, or workflow agent types. + reasoning: + effort: string # low, medium, high, or xhigh + # Per-agent retry policy (optional, not allowed for script agents) retry: max_attempts: integer # Max attempts including first (1-10, default: 1 = no retry) @@ -146,7 +152,16 @@ agents: timeout: integer # Per-script timeout in seconds ``` -**Script agent restrictions:** Cannot have `prompt`, `provider`, `model`, `tools`, `output`, `system_prompt`, `options`, `retry`. Output is always `{stdout, stderr, exit_code}`. +**Script agent restrictions:** Cannot have `prompt`, `provider`, `model`, `tools`, `output`, `system_prompt`, `options`, `retry`, `reasoning`. Output is always `{stdout, stderr, exit_code}`. + +**Reasoning effort:** `reasoning.effort` (and `runtime.default_reasoning_effort`) accepts `low`, `medium`, `high`, or `xhigh`. Per-agent value overrides the runtime default. Each provider translates the unified value to its native API: + +- **Copilot**: forwards `reasoning_effort` to the session. Validated against the model's advertised `supported_reasoning_efforts` (when available); raises `ValidationError` for unsupported combinations. +- **Claude**: enables extended thinking via `thinking={"type":"enabled","budget_tokens":N}` with mapping low=2048, medium=8192, high=16384, xhigh=32768. Auto-coerces `temperature=1.0` (Anthropic API requirement) and bumps `max_tokens` to fit `budget+4096` (capped at 64000). Only valid on thinking-capable models (Claude 3.7+, Opus/Sonnet/Haiku 4.x); raises `ValidationError` otherwise. + +Both providers continue to surface reasoning content via `agent_reasoning` events visible in the dashboard, JSONL logs, and console at `-vv`. + +Forbidden on agent types: `script`, `human_gate`, `workflow`. ## Script Agent Schema diff --git a/AGENTS.md b/AGENTS.md index d9618ec..7520f17 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -121,6 +121,7 @@ make validate-examples # validate all examples - **Failure modes** for parallel/for-each: `fail_fast`, `continue_on_error`, `all_or_nothing` - **Route evaluation**: First matching `when` condition wins; no `when` = always matches - **Tool resolution**: `null` = all workflow tools, `[]` = none, `[list]` = subset +- **Reasoning effort**: `runtime.default_reasoning_effort` sets a workflow-wide default; per-agent `reasoning.effort` overrides it. Allowed values: `low`, `medium`, `high`, `xhigh`. 
Each provider translates the unified value to its native API (Copilot: `reasoning_effort` on the session, validated against the model's `supported_reasoning_efforts`; Claude: extended thinking with budget mapping low=2048, medium=8192, high=16384, xhigh=32768 tokens, with `temperature` coerced to 1.0 and `max_tokens` bumped to fit the budget). See `examples/reasoning-effort.yaml`. ## Tests Structure @@ -158,5 +159,6 @@ All providers (`copilot.py`, `claude.py`) must maintain feature parity. Any chan - **Output contract**: Same `AgentOutput` structure with consistent field population (model, tokens, input_tokens, output_tokens, content) - **Tool execution**: Same MCP tool calling interface and result handling - **Session management**: Same lifecycle (`validate_connection()`, `execute()`, `close()`) +- **Reasoning effort**: All providers must accept the unified `reasoning.effort` field (`low` | `medium` | `high` | `xhigh`), translate it to the native API (Copilot `reasoning_effort` on the session; Claude extended `thinking` budget), validate that the selected model supports the requested effort, and raise `ValidationError` with a clear message when it does not. Any reasoning/thinking content the model returns must be surfaced via `agent_reasoning` events so the dashboard, JSONL logger, and console subscriber render it consistently. When modifying any provider, check all other providers for the same change. The dashboard, JSONL logger, console subscriber, and workflow engine all depend on consistent behavior across providers. diff --git a/CHANGELOG.md b/CHANGELOG.md index 7902871..fe9c2c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased](https://github.com/microsoft/conductor/compare/v0.1.11...HEAD) +### Added +- Unified `reasoning.effort` configuration for per-agent and workflow-wide + control of model reasoning / extended-thinking effort. Set + `runtime.default_reasoning_effort` (`low` | `medium` | `high` | `xhigh`) for a + workflow-wide default, or override per agent with a `reasoning.effort` block. + Translates to `reasoning_effort` on the Copilot session and to extended + `thinking` budget on Claude (low=2048, medium=8192, high=16384, xhigh=32768 + tokens, with `temperature` coerced to 1.0 and `max_tokens` bumped to fit). + Validates against each model's supported efforts/capabilities and surfaces + thinking content via `agent_reasoning` events. See + [`examples/reasoning-effort.yaml`](examples/reasoning-effort.yaml). 
+ ## [0.1.11](https://github.com/microsoft/conductor/compare/v0.1.10...v0.1.11) - 2026-05-04 ### Added diff --git a/README.md b/README.md index 8f089fc..1e6524f 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ Conductor provides the patterns that work: evaluator-optimizer loops for iterati - **Sub-workflow composition** - Reusable sub-workflows with templated `input_mapping`, usable inside `for_each` groups for dynamic fan-out - **Script steps** - Run shell commands and route on exit code or parsed JSON stdout - **Dialog mode** - Agents can pause for multi-turn conversation when uncertain +- **Reasoning effort** - Unified `reasoning.effort` (low/medium/high/xhigh) per agent or workflow-wide, translated to each provider's native API - **Workspace instructions** - Auto-discover and inject `AGENTS.md` / `CLAUDE.md` / `.github/copilot-instructions.md` into every agent's prompt - **Conditional routing** - Route between agents based on output conditions - **Human-in-the-loop** - Pause for human decisions with Markdown-rendered prompts and clickable file links @@ -231,8 +232,9 @@ conductor registry add official myorg/conductor-workflows --default conductor registry list official # Run a workflow from the registry -conductor run qa-bot # latest from default registry -conductor run qa-bot@official@1.2.3 # specific version +conductor run qa-bot # latest from default registry +conductor run 'qa-bot@official#v1.2.3' # specific tag (quote the #) +conductor run 'qa-bot@official#main' # branch HEAD (re-resolved on fetch) ``` See [docs/design/registry.md](docs/design/registry.md) for the full design. diff --git a/docs/configuration.md b/docs/configuration.md index 42cddf3..679469b 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -13,9 +13,18 @@ workflow: runtime: provider: copilot # or 'claude' default_model: gpt-5.2 + temperature: 0.7 + max_tokens: 4096 + default_reasoning_effort: medium # low | medium | high | xhigh (optional) # Provider-specific settings... ``` +The `default_reasoning_effort` field sets a workflow-wide default for model +reasoning / extended-thinking effort that every provider-backed agent inherits +unless it declares its own `reasoning.effort` override. See +[Reasoning Effort](#reasoning-effort) for the per-provider translation and +constraints. + ## Provider Selection ### Copilot Provider @@ -122,6 +131,67 @@ workflow: **Note**: This is output tokens, not context window (200K separate limit) +## Reasoning Effort + +Conductor exposes a single, unified `reasoning.effort` knob that controls how +much "thinking" budget the underlying model uses, and translates it to each +provider's native API. Allowed values: `low`, `medium`, `high`, `xhigh`. + +Set a workflow-wide default and/or override per agent: + +```yaml +workflow: + runtime: + provider: copilot + default_model: gpt-5.2 + default_reasoning_effort: medium # workflow-wide default + +agents: + - name: explainer + # No reasoning block — inherits `medium` from the runtime default. + prompt: "Explain {{ workflow.input.topic }}" + + - name: architect + reasoning: + effort: high # per-agent override wins + prompt: "Design a system for {{ workflow.input.topic }}" +``` + +Per-agent overrides always win over the workflow-wide default. The +`reasoning.effort` field is **only** valid on standard `agent`-type agents; it +is rejected on `script`, `human_gate`, and `workflow` agents (which do not call +a model). 
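The agent-type restriction above is enforced at workflow load time by the Pydantic schema, before any provider is contacted. A minimal repro sketch, adapted from the `AgentDef` validators and tests added later in this diff (the agent name is a placeholder):

```python
from pydantic import ValidationError

from conductor.config.schema import AgentDef

# Script agents never call a model, so declaring `reasoning` is rejected
# by AgentDef.validate_agent_type when the workflow YAML is loaded.
try:
    AgentDef(
        name="lint",
        type="script",
        command="echo hello",
        reasoning={"effort": "low"},
    )
except ValidationError as exc:
    print(exc)  # ...script agents cannot have 'reasoning'...
```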
+
+### Per-provider translation
+
+- **Copilot** — Forwards the chosen effort as `reasoning_effort` to
+  `CopilotClient.create_session`. The value is validated against the model's
+  advertised `supported_reasoning_efforts` capability metadata; a
+  `ValidationError` is raised before the session is created if the model does
+  not support the requested effort. Validation is skipped in mock mode or when
+  capability metadata is unavailable.
+- **Claude** — Enables Anthropic's extended thinking via
+  `messages.create(thinking={"type": "enabled", "budget_tokens": N})` with the
+  following effort → budget mapping:
+
+  | Effort   | Budget tokens |
+  |----------|---------------|
+  | `low`    | 2 048         |
+  | `medium` | 8 192         |
+  | `high`   | 16 384        |
+  | `xhigh`  | 32 768        |
+
+  Extended thinking is only valid on thinking-capable models
+  (`claude-3-7-*`, `claude-opus-4*`, `claude-sonnet-4*`, `claude-haiku-4*`); a
+  `ValidationError` is raised otherwise. The provider also auto-coerces
+  `temperature` to `1.0` (required by the Anthropic API for extended thinking,
+  logged at INFO) and bumps `max_tokens` to fit `budget + 4096`, capped at
+  `64000` (logged at INFO when clamped).
+
+Reasoning / thinking content emitted by the model is surfaced via
+`agent_reasoning` events and rendered in the dashboard, JSONL logs, and
+`-vv` console output for both providers.
+
 ## MCP Servers

 Configure [Model Context Protocol (MCP)](https://modelcontextprotocol.io/) servers for tool access. Both the Copilot and Claude providers support MCP tools.
diff --git a/docs/providers/claude.md b/docs/providers/claude.md
index 43d443f..1e06a96 100644
--- a/docs/providers/claude.md
+++ b/docs/providers/claude.md
@@ -9,6 +9,7 @@ The Claude provider enables Conductor workflows to use Anthropic's Claude models
 - [Model Selection](#model-selection)
 - [Runtime Configuration](#runtime-configuration)
 - [Streaming Limitations](#streaming-limitations)
+- [Extended Thinking](#extended-thinking)
 - [Troubleshooting](#troubleshooting)
 - [Cost Optimization](#cost-optimization)

@@ -287,6 +288,77 @@ Streaming support is planned for Phase 2 (estimated 2-3 weeks):

 Track progress in the project roadmap or GitHub issues.

+## Extended Thinking
+
+The Claude provider supports Anthropic's extended thinking via the unified
+[`reasoning.effort`](../configuration.md#reasoning-effort) field. Set a
+workflow-wide default with `runtime.default_reasoning_effort` and/or override
+per agent with a `reasoning.effort` block:
+
+```yaml
+workflow:
+  runtime:
+    provider: claude
+    default_model: claude-sonnet-4.5
+    default_reasoning_effort: medium
+
+agents:
+  - name: planner
+    reasoning:
+      effort: high  # per-agent override
+    prompt: "Plan a deployment for {{ workflow.input.service }}"
+```
+
+### Effort → thinking budget
+
+The unified effort level is translated into Anthropic's
+`messages.create(thinking={"type": "enabled", "budget_tokens": N})` parameter:
+
+| Effort   | Budget tokens |
+|----------|---------------|
+| `low`    | 2 048         |
+| `medium` | 8 192         |
+| `high`   | 16 384        |
+| `xhigh`  | 32 768        |
+
+### Supported models
+
+Extended thinking is only valid on thinking-capable models. The provider
+accepts any model whose name starts with one of:
+
+- `claude-3-7-*`
+- `claude-opus-4*`
+- `claude-sonnet-4*`
+- `claude-haiku-4*`
+
+Requesting `reasoning.effort` on any other model raises a `ValidationError`
+before any request is sent, so you fail fast instead of silently dropping the
+budget.
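The model gate described under "Supported models" is a plain prefix match. A sketch equivalent to the `is_claude_thinking_model()` helper this change adds in `src/conductor/providers/reasoning.py` (`supports_extended_thinking` is an illustrative name):

```python
# Prefix matching handles dated ids (claude-opus-4-20250514) and -latest aliases.
_THINKING_PREFIXES = (
    "claude-3-7-",
    "claude-opus-4",
    "claude-sonnet-4",
    "claude-haiku-4",
)


def supports_extended_thinking(model_id: str) -> bool:
    return bool(model_id) and model_id.lower().startswith(_THINKING_PREFIXES)
```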
+ +### Auto-coercion of `temperature` and `max_tokens` + +When extended thinking is enabled, the Anthropic API requires `temperature=1.0` +and a `max_tokens` value large enough to contain both the thinking budget and +the visible response. The provider handles this for you: + +- **`temperature`**: coerced to `1.0` (logged at INFO if you configured a + different value). +- **`max_tokens`**: bumped to `budget + 4096`, capped at `64000` (logged at INFO + when clamped). + +This means you don't need to hand-tune `max_tokens` when raising the effort — +the provider will widen the output budget to fit. If you've explicitly set a +`max_tokens` higher than `budget + 4096`, your value is preserved. + +### Reasoning content in events + +Any thinking content the model returns is surfaced as `agent_reasoning` events +alongside the regular `agent_message` stream, and shows up in the dashboard +detail panel, the JSONL log, and the `-vv` console output. The Copilot provider +emits the same event shape so workflows that mix providers render consistently. + +See [`examples/reasoning-effort.yaml`](../../examples/reasoning-effort.yaml) for +a runnable end-to-end example. + ## Troubleshooting ### Common Errors and Solutions diff --git a/docs/providers/comparison.md b/docs/providers/comparison.md index a75f5e0..f335c15 100644 --- a/docs/providers/comparison.md +++ b/docs/providers/comparison.md @@ -12,6 +12,7 @@ This guide helps you choose between GitHub Copilot and Anthropic Claude provider | **Model Selection** | GPT-5.2, o1 | Haiku, Sonnet, Opus | Tie | | **Streaming** | Yes | No (Phase 1) | Copilot | | **Tool Support** | Yes (MCP, all types) | Yes (MCP, stdio only) | Copilot | +| **Reasoning / Extended Thinking** | Yes (`reasoning_effort` on session) | Yes (extended `thinking` budget) | Tie | | **Speed** | Fast | Fast | Tie | | **Output Quality** | Excellent | Excellent | Tie | | **Cost Predictability** | High (flat rate) | Variable (usage-based) | Copilot | @@ -242,6 +243,31 @@ agents: See the [MCP Tools guide](../mcp-tools.md) for details. +### Reasoning / Extended Thinking + +Both providers expose a unified [`reasoning.effort`](../configuration.md#reasoning-effort) +field (`low` | `medium` | `high` | `xhigh`) at workflow scope +(`runtime.default_reasoning_effort`) or per agent (`reasoning.effort`). +Conductor translates the value to each provider's native API: + +**Copilot**: +- Forwarded as `reasoning_effort` on `CopilotClient.create_session` +- Validated against the model's advertised `supported_reasoning_efforts` + +**Claude**: +- Translated to `messages.create(thinking={"type": "enabled", "budget_tokens": N})` +- Effort → budget: low=2048, medium=8192, high=16384, xhigh=32768 tokens +- Restricted to thinking-capable models (`claude-3-7-*`, `claude-opus-4*`, + `claude-sonnet-4*`, `claude-haiku-4*`) +- Auto-coerces `temperature=1.0` and bumps `max_tokens` to fit the budget + +Reasoning content from either provider surfaces as `agent_reasoning` events +in the dashboard, JSONL log, and `-vv` console output. + +**Winner**: Tie (both support it; pick the provider on other grounds) + +See [`examples/reasoning-effort.yaml`](../../examples/reasoning-effort.yaml). 
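To make the coercion arithmetic concrete, a worked example following `_coerce_for_thinking()` in `src/conductor/providers/claude.py` (the starting `max_tokens` of 4096 is illustrative):

```python
# reasoning.effort: high on Claude, with a user-configured max_tokens of 4096.
budget = 16_384                       # "high" maps to 16384 thinking tokens
required = budget + 4096              # headroom for the visible response
max_tokens = max(4096, required)      # widened from 4096 to 20480
max_tokens = min(max_tokens, 64_000)  # per-model cap with thinking enabled
temperature = 1.0                     # Anthropic requires 1.0 with extended thinking
assert (temperature, max_tokens) == (1.0, 20_480)
```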
+
 ## Migration Path

 ### From Copilot to Claude
diff --git a/docs/workflow-syntax.md b/docs/workflow-syntax.md
index 9b3ad87..2567487 100644
--- a/docs/workflow-syntax.md
+++ b/docs/workflow-syntax.md
@@ -43,6 +43,18 @@ workflow:
     on_error: "{{ template }}"     # Optional: Expression evaluated on error

   context_mode: accumulate         # accumulate | snapshot | minimal (default: accumulate)
+
+  runtime:
+    provider: copilot              # copilot | claude
+    default_model: gpt-5.2
+    temperature: 0.7
+    max_tokens: 4096
+    default_reasoning_effort: medium  # Optional: low | medium | high | xhigh
+                                      # Workflow-wide default for reasoning /
+                                      # extended-thinking effort. Inherited by
+                                      # every provider-backed agent unless it
+                                      # declares its own `reasoning.effort`.
+                                      # See docs/configuration.md#reasoning-effort.
 ```

 **Workflow metadata** is included verbatim in the `workflow_started` event and lets downstream consumers (dashboards, queue runners, observability tools) adapt without parsing the YAML. CLI `--metadata key=value` flags merge on top of YAML metadata (CLI wins on conflicts).
@@ -84,7 +96,14 @@ agents:
     tools:                         # Optional: Agent-specific tools
       - tool_name
-
+
+    reasoning:                     # Optional: per-agent reasoning override
+      effort: high                 # low | medium | high | xhigh
+                                   # Overrides runtime.default_reasoning_effort.
+                                   # Only valid on type=agent (rejected on
+                                   # script, human_gate, workflow).
+                                   # See docs/configuration.md#reasoning-effort.
+
     routes:                        # Optional: Routing logic
       - to: next_agent             # Agent name or $end
         when: "{{ condition }}"    # Optional: Route condition
diff --git a/examples/README.md b/examples/README.md
index 7cbec82..254a9cc 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -72,6 +72,23 @@ conductor run examples/design-review.yaml --input requirement="Build a REST API"
 conductor run examples/design-review.yaml --input requirement="Build a REST API" --skip-gates
 ```

+## Reasoning Effort
+
+### reasoning-effort.yaml
+
+A two-stage workflow that configures model reasoning / extended-thinking effort. It demonstrates:
+- A workflow-wide default via `runtime.default_reasoning_effort`
+- A per-agent override via `reasoning.effort` (wins over the default)
+- Conditional routing on a structured boolean output
+- Translation of the unified field to each provider's native API: `reasoning_effort` on the Copilot session, or extended `thinking` budget on Claude
+
+```bash
+conductor run examples/reasoning-effort.yaml \
+  --input topic="how the Raft consensus algorithm handles leader election"
+```
+
+See [Reasoning Effort](../docs/configuration.md#reasoning-effort) for the per-provider translation, supported models, and validation rules.
+
 ## Multi-Agent Workflows

 ### research-assistant.yaml
diff --git a/examples/reasoning-effort.yaml b/examples/reasoning-effort.yaml
new file mode 100644
index 0000000..d4f3203
--- /dev/null
+++ b/examples/reasoning-effort.yaml
@@ -0,0 +1,110 @@
+# Reasoning Effort Workflow
+#
+# This example demonstrates configuring model reasoning / extended-thinking
+# effort for provider-backed agents.
It shows: +# - A workflow-wide default via `runtime.default_reasoning_effort` +# - A per-agent override via `reasoning.effort` (wins over the default) +# - A simple route based on the previous agent's structured output +# +# Reasoning effort is mapped natively per provider: +# - Copilot SDK: passes `reasoning_effort` to `create_session` +# - Anthropic SDK: enables extended thinking with a budget mapped from +# the effort level (low=2k, medium=8k, high=16k, xhigh=32k tokens) +# +# Validation is strict — execution fails if the chosen model does not +# support the requested effort. +# +# Usage: +# conductor run examples/reasoning-effort.yaml \ +# --input topic="how the Raft consensus algorithm handles leader election" + +workflow: + name: reasoning-effort + description: Demonstrates workflow-wide and per-agent reasoning effort configuration + version: "1.0.0" + entry_point: explainer + + runtime: + provider: copilot + temperature: 0.7 + # Workflow-wide default applied to every provider-backed agent + # unless the agent declares its own `reasoning.effort`. + default_reasoning_effort: medium + + input: + topic: + type: string + required: true + description: A tricky technical topic to first explain, then design a system around + +agents: + - name: explainer + description: Explains a tricky technical topic clearly (inherits the runtime default effort) + model: gpt-5.2 + # No `reasoning:` block here — inherits `runtime.default_reasoning_effort: medium`. + prompt: | + Explain the following topic clearly and accurately for a senior engineer + who is encountering it for the first time: + + Topic: {{ workflow.input.topic }} + + Cover the key intuitions, the failure modes the design avoids, and one + concrete example that makes the mechanism click. + + Then assess whether this topic warrants a follow-up system-design pass + (true if it implies non-trivial distributed-systems trade-offs). + output: + explanation: + type: string + description: A clear, self-contained explanation of the topic + needs_design: + type: boolean + description: Whether a follow-up system-design pass is warranted + routes: + - to: architect + when: "{{ output.needs_design }}" + - to: $end + + - name: architect + description: Designs a small system around the topic (uses high reasoning effort) + model: gpt-5.2 + # Per-agent override: this agent always runs at `high` effort, + # regardless of the workflow-wide default. + reasoning: + effort: high + input: + - workflow.input.topic + - explainer.output.explanation + prompt: | + Using the explanation below as grounding, design a minimal but realistic + system that applies the ideas from this topic in production. + + **Topic:** {{ workflow.input.topic }} + + **Background explanation:** + {{ explainer.output.explanation }} + + Produce: + 1. A short architecture sketch (components and their responsibilities). + 2. The two or three most consequential trade-offs and how you resolved them. + 3. The biggest failure mode and how the design contains it. 
+ output: + architecture: + type: string + description: Architecture sketch with components and responsibilities + tradeoffs: + type: string + description: Key trade-offs and how they were resolved + failure_mode: + type: string + description: Biggest failure mode and the design's containment strategy + routes: + - to: $end + +output: + topic: "{{ workflow.input.topic }}" + explanation: "{{ explainer.output.explanation }}" + needs_design: "{{ explainer.output.needs_design }}" + architecture: "{% if architect is defined %}{{ architect.output.architecture }}{% else %}(skipped — no design pass needed){% endif %}" + tradeoffs: "{% if architect is defined %}{{ architect.output.tradeoffs }}{% endif %}" + failure_mode: "{% if architect is defined %}{{ architect.output.failure_mode }}{% endif %}" diff --git a/src/conductor/config/schema.py b/src/conductor/config/schema.py index 318ce89..def5bf5 100644 --- a/src/conductor/config/schema.py +++ b/src/conductor/config/schema.py @@ -10,6 +10,8 @@ from pydantic import BaseModel, Field, field_validator, model_validator +from conductor.providers.reasoning import ReasoningEffort + class InputDef(BaseModel): """Definition for a workflow input parameter.""" @@ -414,6 +416,31 @@ class DialogConfig(BaseModel): """ +class ReasoningConfig(BaseModel): + """Configuration for model reasoning / extended thinking effort. + + When present on an agent (or as a runtime default), enables the + provider's reasoning capability: + + - **Copilot SDK** sets ``reasoning_effort`` on the session. + - **Anthropic SDK** enables extended thinking with a budget mapped from + the effort level (low=2k, medium=8k, high=16k, xhigh=32k tokens). + + Validation happens at execute time. Claude rejects models that don't + match the supported prefix list; Copilot consults the SDK's advertised + ``supported_reasoning_efforts`` (when available) and otherwise allows + the request through to the SDK. + + Example YAML:: + + reasoning: + effort: high + """ + + effort: ReasoningEffort + """Reasoning effort level applied to the agent's model calls.""" + + class AgentDef(BaseModel): """Definition for a single agent in the workflow.""" @@ -588,6 +615,25 @@ class AgentDef(BaseModel): intent or needs clarification on ambiguous requirements. """ + reasoning: ReasoningConfig | None = None + """Optional reasoning / extended-thinking effort for this agent. + + When set, the provider configures its reasoning capability: + + - Copilot: passes ``reasoning_effort`` to ``create_session``. + - Claude: enables ``thinking`` with a budget mapped from the effort + level (low=2k, medium=8k, high=16k, xhigh=32k tokens). + + Falls back to ``runtime.default_reasoning_effort`` when unset. + + Only applies to provider-backed agents (type='agent' or None). 
+ + Example YAML:: + + reasoning: + effort: high + """ + @field_validator("timeout") @classmethod def validate_timeout(cls, v: int | None) -> int | None: @@ -610,6 +656,8 @@ def validate_agent_type(self) -> AgentDef: raise ValueError("human_gate agents cannot have 'dialog'") if self.max_depth is not None: raise ValueError("human_gate agents cannot have 'max_depth'") + if self.reasoning is not None: + raise ValueError("human_gate agents cannot have 'reasoning'") elif self.type == "script": if not self.command: raise ValueError("script agents require 'command'") @@ -642,6 +690,8 @@ def validate_agent_type(self) -> AgentDef: raise ValueError("script agents cannot have 'dialog'") if self.max_depth is not None: raise ValueError("script agents cannot have 'max_depth'") + if self.reasoning is not None: + raise ValueError("script agents cannot have 'reasoning'") elif self.type == "workflow": if not self.workflow: raise ValueError("workflow agents require 'workflow' path") @@ -679,6 +729,8 @@ def validate_agent_type(self) -> AgentDef: f"'{self.type or 'agent'}' agents cannot have 'max_depth' " "(only workflow agents support max_depth)" ) + if self.type == "workflow" and self.reasoning is not None: + raise ValueError("workflow agents cannot have 'reasoning'") return self @@ -805,6 +857,21 @@ class RuntimeConfig(BaseModel): (Claude: 50, Copilot: unlimited). """ + default_reasoning_effort: ReasoningEffort | None = None + """Workflow-wide default reasoning effort applied to provider-backed agents. + + Each agent may override with its own ``reasoning.effort``. Providers + translate this into their native parameter: + + - Copilot: ``reasoning_effort`` on ``create_session`` + - Claude: ``thinking`` with budget mapped from effort level + + Validation happens at execute time. Claude rejects models that don't + match the supported prefix list; Copilot consults the SDK's advertised + ``supported_reasoning_efforts`` (when available) and otherwise allows + the request through to the SDK. + """ + class WorkflowDef(BaseModel): """Top-level workflow configuration.""" diff --git a/src/conductor/providers/__init__.py b/src/conductor/providers/__init__.py index 4912e0a..9794e9d 100644 --- a/src/conductor/providers/__init__.py +++ b/src/conductor/providers/__init__.py @@ -4,10 +4,16 @@ for different LLM providers (Copilot SDK, Claude SDK, etc.). 
""" +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + from conductor.providers.base import AgentOutput, AgentProvider -from conductor.providers.claude import ClaudeProvider -from conductor.providers.copilot import CopilotProvider -from conductor.providers.factory import create_provider + +if TYPE_CHECKING: + from conductor.providers.claude import ClaudeProvider + from conductor.providers.copilot import CopilotProvider + from conductor.providers.factory import create_provider __all__ = [ "AgentOutput", @@ -16,3 +22,19 @@ "CopilotProvider", "create_provider", ] + + +def __getattr__(name: str) -> Any: + if name == "ClaudeProvider": + from conductor.providers.claude import ClaudeProvider + + return ClaudeProvider + if name == "CopilotProvider": + from conductor.providers.copilot import CopilotProvider + + return CopilotProvider + if name == "create_provider": + from conductor.providers.factory import create_provider + + return create_provider + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/src/conductor/providers/claude.py b/src/conductor/providers/claude.py index 143d9b5..cd4307c 100644 --- a/src/conductor/providers/claude.py +++ b/src/conductor/providers/claude.py @@ -31,6 +31,12 @@ from conductor.exceptions import ProviderError, ValidationError from conductor.executor.output import validate_output from conductor.providers.base import AgentOutput, AgentProvider, EventCallback, match_model_id +from conductor.providers.reasoning import ( + ReasoningEffort, + effort_to_budget_tokens, + is_claude_thinking_model, + resolve_reasoning_effort, +) if TYPE_CHECKING: from conductor.config.schema import AgentDef, OutputField @@ -60,6 +66,9 @@ class ClaudeContentBlock(Protocol): id: str # for tool_use blocks name: str # for tool_use blocks input: dict[str, Any] # for tool_use blocks + thinking: str # for thinking blocks + signature: str # for thinking blocks (optional) + data: str # for redacted_thinking blocks class ClaudeResponse(Protocol): @@ -121,6 +130,7 @@ def __init__( mcp_servers: dict[str, Any] | None = None, max_agent_iterations: int | None = None, max_session_seconds: float | None = None, + default_reasoning_effort: ReasoningEffort | None = None, ) -> None: """Initialize the Claude provider. @@ -140,6 +150,12 @@ def __init__( Defaults to 50 if not specified. max_session_seconds: Maximum wall-clock duration for agent sessions. Defaults to None (unlimited). + default_reasoning_effort: Workflow-wide default reasoning effort + applied when an agent does not declare its own ``reasoning`` + config. Mapped to a Claude extended-thinking ``budget_tokens`` + value. Only valid on extended-thinking models — a per-agent + model that does not support thinking will raise + ``ValidationError`` at execute time. Raises: ProviderError: If SDK is not installed. @@ -174,6 +190,7 @@ def __init__( max_agent_iterations if max_agent_iterations is not None else 50 ) self._default_max_session_seconds = max_session_seconds + self._default_reasoning_effort: ReasoningEffort | None = default_reasoning_effort # MCP server configuration for tool support self._mcp_servers_config = mcp_servers @@ -262,6 +279,41 @@ def _validate_max_tokens(self, max_tokens: int) -> None: suggestion="Adjust max_tokens to be within the valid range", ) + def _resolve_thinking_for_agent(self, agent: AgentDef, model: str) -> dict[str, Any] | None: + """Resolve effective extended-thinking kwargs for an agent. 
+ + Combines the per-agent ``reasoning`` config with the workflow-wide + ``default_reasoning_effort`` and validates that the chosen model + supports Anthropic extended thinking. + + Args: + agent: Agent definition (may declare ``reasoning.effort``). + model: Resolved model id for this execution. + + Returns: + ``{"type": "enabled", "budget_tokens": N}`` when reasoning is + requested, or ``None`` when neither agent nor runtime default + sets it. + + Raises: + ValidationError: If reasoning effort is requested for a model + that does not support extended thinking. + """ + effort = resolve_reasoning_effort(agent, self._default_reasoning_effort) + if effort is None: + return None + if not is_claude_thinking_model(model): + raise ValidationError( + f"Model {model!r} does not support extended thinking, but " + f"reasoning.effort={effort!r} was requested for agent " + f"{agent.name!r}.", + suggestion=( + "Use a Claude 3.7+ or 4.x model (e.g. claude-opus-4-20250514, " + "claude-sonnet-4-20250514) or remove the reasoning config." + ), + ) + return {"type": "enabled", "budget_tokens": effort_to_budget_tokens(effort)} + def get_retry_history(self) -> list[dict[str, Any]]: """Get the retry history for debugging purposes. @@ -514,21 +566,49 @@ async def execute_dialog_turn( messages.append({"role": "user", "content": user_message}) try: - response = await self._client.messages.create( - model=model or self._default_model, - max_tokens=4096, - system=system_prompt, - messages=messages, - ) + kwargs: dict[str, Any] = { + "model": model or self._default_model, + "max_tokens": 4096, + "system": system_prompt, + "messages": messages, + } + + # Apply workflow-wide default reasoning effort if configured. + # Per-agent reasoning is not available here (no AgentDef in scope). + # Mirrors _resolve_thinking_for_agent: raise ValidationError when + # the resolved model does not support extended thinking, rather + # than silently dropping the reasoning request. + if self._default_reasoning_effort is not None: + resolved_model = kwargs["model"] + if not is_claude_thinking_model(resolved_model): + raise ValidationError( + f"Model {resolved_model!r} does not support extended thinking, " + f"but default_reasoning_effort={self._default_reasoning_effort!r} " + "was configured.", + suggestion=( + "Use a Claude 3.7+ or 4.x model (e.g. claude-opus-4-20250514, " + "claude-sonnet-4-20250514) or remove the reasoning config." + ), + ) + budget = effort_to_budget_tokens(self._default_reasoning_effort) + kwargs["thinking"] = {"type": "enabled", "budget_tokens": budget} + # Thinking requires temperature=1.0 and max_tokens > budget. + kwargs["max_tokens"] = max(kwargs["max_tokens"], budget + 4096) + + response = await self._client.messages.create(**kwargs) - # Extract text from response + # Extract text from response (skip thinking blocks) text_parts = [] for block in response.content: + if hasattr(block, "type") and block.type == "thinking": + continue if hasattr(block, "text"): text_parts.append(block.text) return "\n".join(text_parts) if text_parts else "" + except ValidationError: + raise except Exception as exc: raise ProviderError( f"Dialog turn failed: {exc}", @@ -808,18 +888,28 @@ async def _execute_with_retry( else self._default_max_session_seconds ) - # Validate max_tokens against model-specific limits - if "haiku" in model.lower(): - if max_tokens > 4096: + # Resolve extended-thinking kwarg (validates model compatibility). 
+ # Done before the per-model max_tokens warning so the warning logic + # accounts for thinking-aware caps. + thinking = self._resolve_thinking_for_agent(agent, model) + + # Validate max_tokens against model-specific limits. + # Skip the warning when extended thinking is enabled — the per-call + # cap is bumped to at least ``budget_tokens + 4096`` (capped at + # 64000) by _coerce_for_thinking() to satisfy the + # ``max_tokens > budget_tokens`` constraint. + if thinking is None: + if "haiku" in model.lower(): + if max_tokens > 4096: + logger.warning( + f"max_tokens={max_tokens} exceeds Haiku model limit of 4096. " + "API may reject request." + ) + elif max_tokens > 8192: logger.warning( - f"max_tokens={max_tokens} exceeds Haiku model limit of 4096. " + f"max_tokens={max_tokens} exceeds Sonnet/Opus model limit of 8192. " "API may reject request." ) - elif max_tokens > 8192: - logger.warning( - f"max_tokens={max_tokens} exceeds Sonnet/Opus model limit of 8192. " - "API may reject request." - ) # Build tools list: emit_output (for structured output) + MCP tools all_tools: list[dict[str, Any]] = [] @@ -861,6 +951,7 @@ async def _execute_with_retry( max_session_seconds=max_session_seconds, interrupt_signal=interrupt_signal, event_callback=event_callback, + thinking=thinking, ) # Handle partial output from mid-agent interrupt @@ -1057,6 +1148,76 @@ def _extract_status_code(self, exception: Exception) -> int | None: return None + def _coerce_for_thinking( + self, + temperature: float | None, + max_tokens: int, + model: str, + thinking: dict[str, Any] | None, + ) -> tuple[float | None, int]: + """Adjust temperature and max_tokens to satisfy thinking constraints. + + When extended thinking is enabled the Anthropic API requires: + + - ``temperature == 1.0`` (or omitted) + - ``max_tokens > budget_tokens`` + + We force temperature to 1.0 (logging an info note if the caller + configured a different non-1.0 value) and bump ``max_tokens`` to + at least ``budget_tokens + 4096``, clamped to a per-model cap. + Extended-thinking models accept up to 64000 output tokens, which + is what we use here. + + When ``thinking`` is ``None`` the inputs are returned unchanged. + + Args: + temperature: User-configured temperature (may be ``None``). + max_tokens: User-configured max output tokens. + model: Resolved model identifier. + thinking: Resolved thinking kwarg or ``None``. + + Returns: + Tuple of ``(effective_temperature, effective_max_tokens)``. + """ + if thinking is None: + return temperature, max_tokens + + budget = int(thinking.get("budget_tokens", 0)) + # Per-model cap when thinking is enabled. Extended-thinking models + # accept up to 64000 output tokens. + per_model_cap = 64_000 + required = budget + 4096 + effective_max_tokens = max(max_tokens, required) + if effective_max_tokens > per_model_cap: + logger.info( + "Clamping max_tokens %s to %s for extended thinking on model %s " + "(Anthropic API per-model cap)", + effective_max_tokens, + per_model_cap, + model, + ) + effective_max_tokens = per_model_cap + if effective_max_tokens <= budget: + # Defensive: if cap collapses below budget+1, this would still + # violate the API constraint. Raise rather than silently send a + # request the API will reject. 
+ raise ValidationError( + f"Cannot satisfy thinking budget_tokens={budget} on model " + f"{model!r}: per-model cap {per_model_cap} is not greater " + f"than the requested budget.", + suggestion="Lower reasoning.effort or use a model with a higher cap.", + ) + + if temperature is not None and temperature != 1.0: + logger.info( + "Coercing temperature %s to 1.0 for extended thinking on model %s " + "(Anthropic API requirement)", + temperature, + model, + ) + + return 1.0, effective_max_tokens + async def _execute_api_call( self, messages: list[dict[str, str]], @@ -1064,6 +1225,7 @@ async def _execute_api_call( temperature: float | None, max_tokens: int, tools: list[dict[str, Any]] | None = None, + thinking: dict[str, Any] | None = None, ) -> ClaudeResponse: """Execute non-streaming Claude API call using AsyncAnthropic. @@ -1076,6 +1238,10 @@ async def _execute_api_call( temperature: Temperature setting (0.0-1.0, enforced by SDK). max_tokens: Maximum output tokens. tools: Optional tool definitions for structured output. + thinking: Optional extended-thinking kwarg for the SDK. When + supplied, ``temperature`` is forced to 1.0 and ``max_tokens`` + is bumped to satisfy the API constraint + ``max_tokens > budget_tokens``. Returns: Claude API response object with content blocks and usage metadata. @@ -1090,23 +1256,31 @@ async def _execute_api_call( if self._client is None: raise ProviderError("Claude client not initialized") + effective_temperature, effective_max_tokens = self._coerce_for_thinking( + temperature, max_tokens, model, thinking + ) + # Build API call kwargs kwargs: dict[str, Any] = { "model": model, "messages": messages, - "max_tokens": max_tokens, + "max_tokens": effective_max_tokens, } - if temperature is not None: - kwargs["temperature"] = temperature + if effective_temperature is not None: + kwargs["temperature"] = effective_temperature if tools: kwargs["tools"] = tools + if thinking is not None: + kwargs["thinking"] = thinking + # Execute non-streaming API call (async) logger.debug( f"Executing non-streaming Claude API call: model={model}, " - f"max_tokens={max_tokens}, timeout={self._timeout}s" + f"max_tokens={effective_max_tokens}, timeout={self._timeout}s, " + f"thinking={'enabled' if thinking else 'disabled'}" ) response = await self._client.messages.create(**kwargs) @@ -1125,6 +1299,7 @@ async def _execute_agentic_loop( max_session_seconds: float | None = None, interrupt_signal: asyncio.Event | None = None, event_callback: EventCallback | None = None, + thinking: dict[str, Any] | None = None, ) -> tuple[ClaudeResponse, int | None, bool]: """Execute an agentic loop that handles MCP tool calls. 
@@ -1198,6 +1373,7 @@ async def _execute_agentic_loop( max_tokens=max_tokens, tools=tools, has_output_schema=has_output_schema, + thinking=thinking, ) total_tokens += interrupt_tokens return interrupt_response, total_tokens, True @@ -1223,6 +1399,7 @@ async def _execute_agentic_loop( max_tokens=max_tokens, tools=tools, output_schema=output_schema, + thinking=thinking, ) ) else: @@ -1233,6 +1410,7 @@ async def _execute_agentic_loop( temperature=temperature, max_tokens=max_tokens, tools=tools, + thinking=thinking, ) ) interrupt_task = asyncio.create_task(interrupt_signal.wait()) @@ -1263,6 +1441,7 @@ async def _execute_agentic_loop( max_tokens=max_tokens, tools=tools, has_output_schema=has_output_schema, + thinking=thinking, ) total_tokens += partial_tokens return partial_resp, total_tokens, True @@ -1276,6 +1455,7 @@ async def _execute_agentic_loop( max_tokens=max_tokens, tools=tools, output_schema=output_schema, + thinking=thinking, ) else: response = await self._execute_api_call( @@ -1284,6 +1464,7 @@ async def _execute_agentic_loop( temperature=temperature, max_tokens=max_tokens, tools=tools, + thinking=thinking, ) # Accumulate token usage @@ -1300,6 +1481,18 @@ async def _execute_agentic_loop( event_callback("agent_message", {"content": block.text}) except Exception: logger.debug("Error in event_callback for agent_message", exc_info=True) + elif hasattr(block, "type") and block.type == "thinking": + thinking_text = getattr(block, "thinking", None) or getattr( + block, "text", None + ) + if thinking_text: + try: + event_callback("agent_reasoning", {"content": thinking_text}) + except Exception: + logger.debug( + "Error in event_callback for agent_reasoning", + exc_info=True, + ) # Check for tool_use blocks tool_uses = [ @@ -1434,6 +1627,21 @@ async def _execute_agentic_loop( "input": dict(block.input) if hasattr(block, "input") else {}, } ) + elif block.type == "thinking": + # Extended thinking requires the unmodified thinking + # blocks (with signature) to be echoed back before + # any tool_use blocks they preceded — otherwise the + # API rejects the next request with a 400. + block_dict: dict[str, Any] = { + "type": "thinking", + "thinking": block.thinking, + } + sig = getattr(block, "signature", None) + if sig is not None: + block_dict["signature"] = sig + assistant_content.append(block_dict) + elif block.type == "redacted_thinking": + assistant_content.append({"type": "redacted_thinking", "data": block.data}) # Add assistant response and tool results to message history working_messages.append( @@ -1463,6 +1671,7 @@ async def _request_partial_output( max_tokens: int, tools: list[dict[str, Any]] | None, has_output_schema: bool, + thinking: dict[str, Any] | None = None, ) -> tuple[Any, int]: """Send a final API call requesting partial output after interrupt. @@ -1506,6 +1715,7 @@ async def _request_partial_output( temperature=temperature, max_tokens=max_tokens, tools=tools, + thinking=thinking, ) call_tokens = 0 @@ -1524,6 +1734,7 @@ async def _execute_with_parse_recovery( max_tokens: int, tools: list[dict[str, Any]] | None, output_schema: dict[str, OutputField] | None, + thinking: dict[str, Any] | None = None, ) -> ClaudeResponse: """Execute API call with parse recovery for malformed JSON responses. 
@@ -1555,6 +1766,7 @@ async def _execute_with_parse_recovery( temperature=temperature, max_tokens=max_tokens, tools=tools, + thinking=thinking, ) # If no output schema, return immediately (no recovery needed) @@ -1619,6 +1831,7 @@ async def _execute_with_parse_recovery( temperature=temperature, max_tokens=max_tokens, tools=tools, + thinking=thinking, ) # Check if recovery succeeded (tool_use) diff --git a/src/conductor/providers/copilot.py b/src/conductor/providers/copilot.py index 5becd26..0a624a5 100644 --- a/src/conductor/providers/copilot.py +++ b/src/conductor/providers/copilot.py @@ -19,6 +19,7 @@ from conductor.exceptions import ProviderError, ValidationError from conductor.providers.base import AgentOutput, AgentProvider, EventCallback, match_model_id +from conductor.providers.reasoning import ReasoningEffort, resolve_reasoning_effort if TYPE_CHECKING: from conductor.config.schema import AgentDef, OutputField @@ -157,6 +158,7 @@ def __init__( idle_recovery_config: IdleRecoveryConfig | None = None, temperature: float | None = None, max_agent_iterations: int | None = None, + default_reasoning_effort: ReasoningEffort | None = None, ) -> None: """Initialize the Copilot provider. @@ -174,6 +176,10 @@ def __init__( temperature: Default temperature for generation (0.0-1.0). Optional. max_agent_iterations: Maximum tool-use iterations per agent execution. None means no iteration limit (only wall-clock timeout applies). + default_reasoning_effort: Workflow-wide default ``reasoning_effort`` + applied to ``create_session`` when an agent does not specify + its own ``reasoning.effort``. One of ``low``, ``medium``, + ``high``, ``xhigh``, or ``None`` to send no value. """ self._client: Any = None # Will hold Copilot SDK client self._mock_handler = mock_handler @@ -187,6 +193,7 @@ def __init__( self._idle_recovery_config = idle_recovery_config or IdleRecoveryConfig() self._temperature = temperature self._default_max_agent_iterations = max_agent_iterations + self._default_reasoning_effort = default_reasoning_effort self._max_schema_depth = 10 # Max nesting depth for recursive schema building self._session_ids: dict[str, str] = {} self._resume_session_ids: dict[str, str] = {} @@ -412,6 +419,11 @@ async def _execute_with_retry( await asyncio.sleep(delay) + except ValidationError: + # Configuration / capability errors are deterministic and + # never recoverable by retrying. Surface them unwrapped so + # the workflow engine can present the original message. + raise except Exception as e: # Wrap unexpected errors as retryable last_error = e @@ -549,6 +561,20 @@ async def _execute_sdk_call( if self._mcp_servers: session_kwargs["mcp_servers"] = self._mcp_servers + # Resolve reasoning effort: per-agent override wins over runtime default. + # When set, validate against the model's advertised capabilities + # before forwarding to the SDK. + effort = resolve_reasoning_effort(agent, self._default_reasoning_effort) + if effort is not None: + await self._validate_reasoning_effort_for_model(model, effort) + session_kwargs["reasoning_effort"] = effort + logger.debug( + "Setting reasoning_effort=%s for agent %r (model=%s)", + effort, + agent.name, + model, + ) + # Attempt to resume a previous session if one exists for this agent session: Any = None resume_sid = self._resume_session_ids.get(agent.name) @@ -716,6 +742,10 @@ async def _execute_sdk_call( except ProviderError: raise + except ValidationError: + # Configuration errors (e.g. 
unsupported reasoning_effort) are + # deterministic; surface unwrapped so retries don't mask them. + raise except Exception as e: raise ProviderError( f"Copilot SDK call failed: {e}", @@ -1814,11 +1844,25 @@ async def execute_dialog_turn( session = None try: - session = await self._client.create_session( - model=model or self._default_model, - on_permission_request=self._default_permission_handler, - system_message={"mode": "replace", "content": system_prompt}, - ) + dialog_kwargs: dict[str, Any] = { + "model": model or self._default_model, + "on_permission_request": self._default_permission_handler, + "system_message": {"mode": "replace", "content": system_prompt}, + } + + # Dialog turns honor the workflow-wide default reasoning effort + # only — there's no agent-scoped override at this layer. + effort = self._default_reasoning_effort + if effort is not None: + await self._validate_reasoning_effort_for_model(dialog_kwargs["model"], effort) + dialog_kwargs["reasoning_effort"] = effort + logger.debug( + "Setting reasoning_effort=%s for dialog turn (model=%s)", + effort, + dialog_kwargs["model"], + ) + + session = await self._client.create_session(**dialog_kwargs) response_content = "" done = asyncio.Event() @@ -1856,6 +1900,8 @@ def on_event(event: Any) -> None: except ProviderError: raise + except ValidationError: + raise except Exception as exc: raise ProviderError( f"Dialog turn failed: {exc}", @@ -1907,6 +1953,54 @@ async def get_max_prompt_tokens(self, model: str) -> int | None: limits = getattr(info.capabilities, "limits", None) return getattr(limits, "max_prompt_tokens", None) + async def _validate_reasoning_effort_for_model( + self, model: str, effort: ReasoningEffort + ) -> None: + """Validate ``effort`` against the model's advertised capabilities. + + Looks up the model via ``client.list_models()`` (resolving aliases via + :func:`match_model_id`) and inspects + ``capabilities.supported_reasoning_efforts``. When that list is + present and ``effort`` is not in it, raises :class:`ValidationError`. + + When the field is missing/``None`` (capability unknown), or when the + model can't be matched, or when listing fails, validation is skipped + — capability metadata must never block a workflow that the SDK might + otherwise accept. + + Skipped entirely in mock-handler mode and when the SDK is not + installed. + """ + if self._mock_handler is not None or not COPILOT_SDK_AVAILABLE: + return + try: + await self._ensure_client_started() + models = await self._client.list_models() + except (TimeoutError, ProviderError, OSError, RuntimeError) as e: + logger.debug( + "Failed to list Copilot models for reasoning_effort validation of %r: %s", + model, + e, + ) + return + by_id = {info.id: info for info in models} + matched_id = match_model_id(model, by_id.keys()) + if matched_id is None: + return + info = by_id[matched_id] + supported = getattr(info.capabilities, "supported_reasoning_efforts", None) + if supported is None: + return + if effort not in supported: + raise ValidationError( + f"Model {model!r} does not support reasoning_effort={effort!r}; " + f"supported values: {sorted(supported)}", + suggestion=( + "Choose an effort listed in the model's capabilities, " + "or pick a different model." + ), + ) + def get_session_ids(self) -> dict[str, str]: """Get tracked session IDs for all executed agents. 
diff --git a/src/conductor/providers/factory.py b/src/conductor/providers/factory.py index 324c1c7..09f504e 100644 --- a/src/conductor/providers/factory.py +++ b/src/conductor/providers/factory.py @@ -12,6 +12,7 @@ from conductor.providers.base import AgentProvider from conductor.providers.claude import ANTHROPIC_SDK_AVAILABLE, ClaudeProvider from conductor.providers.copilot import CopilotProvider, IdleRecoveryConfig +from conductor.providers.reasoning import ReasoningEffort async def create_provider( @@ -24,6 +25,7 @@ async def create_provider( timeout: float | None = None, max_session_seconds: float | None = None, max_agent_iterations: int | None = None, + default_reasoning_effort: ReasoningEffort | None = None, ) -> AgentProvider: """Factory function to create the appropriate provider. @@ -44,6 +46,9 @@ async def create_provider( timeout: Request timeout in seconds. max_session_seconds: Maximum wall-clock duration for agent sessions. max_agent_iterations: Maximum tool-use iterations per agent execution. + default_reasoning_effort: Workflow-wide default reasoning effort + (``low`` / ``medium`` / ``high`` / ``xhigh``) applied when an agent + does not specify its own ``reasoning.effort``. Returns: Configured AgentProvider instance. @@ -69,6 +74,7 @@ async def create_provider( temperature=temperature, idle_recovery_config=idle_recovery_config, max_agent_iterations=max_agent_iterations, + default_reasoning_effort=default_reasoning_effort, ) case "openai-agents": raise ProviderError( @@ -89,6 +95,7 @@ async def create_provider( mcp_servers=mcp_servers, max_agent_iterations=max_agent_iterations, max_session_seconds=max_session_seconds, + default_reasoning_effort=default_reasoning_effort, ) case _: raise ProviderError( @@ -140,6 +147,7 @@ async def create_provider( timeout = getattr(runtime_config, "timeout", None) max_session_seconds = getattr(runtime_config, "max_session_seconds", None) max_agent_iterations = getattr(runtime_config, "max_agent_iterations", None) + default_reasoning_effort = getattr(runtime_config, "default_reasoning_effort", None) return await create_provider( provider_type=provider_type, @@ -150,4 +158,5 @@ async def create_provider( timeout=timeout, max_session_seconds=max_session_seconds, max_agent_iterations=max_agent_iterations, + default_reasoning_effort=default_reasoning_effort, ) diff --git a/src/conductor/providers/reasoning.py b/src/conductor/providers/reasoning.py new file mode 100644 index 0000000..e216332 --- /dev/null +++ b/src/conductor/providers/reasoning.py @@ -0,0 +1,91 @@ +"""Shared reasoning / extended-thinking helpers for providers. + +This module centralizes the provider-agnostic mapping between Conductor's +discrete ``reasoning.effort`` levels and each SDK's native parameter shape: + +- Copilot SDK uses a discrete ``reasoning_effort`` literal. +- Anthropic SDK uses a token budget passed via ``thinking={"type":"enabled", + "budget_tokens": N}`` and is only valid on extended-thinking-capable models. +""" + +from __future__ import annotations + +from collections.abc import Mapping +from typing import TYPE_CHECKING, Final, Literal + +if TYPE_CHECKING: + from conductor.config.schema import AgentDef + +ReasoningEffort = Literal["low", "medium", "high", "xhigh"] + +EFFORT_TO_BUDGET_TOKENS: Final[Mapping[ReasoningEffort, int]] = { + "low": 2048, + "medium": 8192, + "high": 16384, + "xhigh": 32768, +} +"""Mapping from Conductor effort level to Claude ``budget_tokens`` value. 
+ +The minimum supported by the Anthropic API is 1024; all values above sit +comfortably above that floor. +""" + +_CLAUDE_THINKING_MODEL_PREFIXES: tuple[str, ...] = ( + "claude-3-7-", + "claude-opus-4", + "claude-sonnet-4", + "claude-haiku-4", +) +"""Model id prefixes that support extended thinking. + +Anthropic introduced extended thinking with Claude 3.7 and continued it on +the Claude 4.x family. Older 3.5 / 3 / instant models are not supported. +""" + + +def effort_to_budget_tokens(effort: ReasoningEffort) -> int: + """Translate a Conductor effort level into a Claude ``budget_tokens`` value. + + Args: + effort: One of ``low``, ``medium``, ``high``, ``xhigh``. + + Returns: + The number of thinking-budget tokens to allocate. + + Raises: + ValueError: If ``effort`` is not a recognized level. + """ + try: + return EFFORT_TO_BUDGET_TOKENS[effort] + except KeyError as exc: + raise ValueError( + f"Unknown reasoning effort {effort!r}; expected one of " + f"{sorted(EFFORT_TO_BUDGET_TOKENS)}" + ) from exc + + +def is_claude_thinking_model(model_id: str) -> bool: + """Return ``True`` when ``model_id`` supports Anthropic extended thinking. + + Matching is prefix-based to handle dated suffixes (e.g. + ``claude-opus-4-20250514``) and ``-latest`` aliases. + """ + if not model_id: + return False + lowered = model_id.lower() + return any(lowered.startswith(prefix) for prefix in _CLAUDE_THINKING_MODEL_PREFIXES) + + +def resolve_reasoning_effort( + agent: AgentDef, + runtime_default: ReasoningEffort | None, +) -> ReasoningEffort | None: + """Resolve the effective reasoning effort for an agent. + + Per-agent ``reasoning.effort`` wins over the workflow-wide + ``runtime.default_reasoning_effort``. Returns ``None`` when neither is + set, signalling that no reasoning parameter should be sent to the SDK. 
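+
+    Example (illustrative; ``planner`` and ``writer`` are hypothetical
+    ``AgentDef`` instances, the first with ``reasoning.effort == "high"``,
+    the second with no ``reasoning`` block)::
+
+        resolve_reasoning_effort(planner, "medium")  # -> "high" (agent wins)
+        resolve_reasoning_effort(writer, "medium")   # -> "medium" (default)
+        resolve_reasoning_effort(writer, None)       # -> None (send nothing)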
+ """ + if agent.reasoning is not None: + return agent.reasoning.effort + return runtime_default diff --git a/tests/test_config/test_schema.py b/tests/test_config/test_schema.py index e2cbba8..6c33119 100644 --- a/tests/test_config/test_schema.py +++ b/tests/test_config/test_schema.py @@ -14,6 +14,7 @@ InputDef, LimitsConfig, OutputField, + ReasoningConfig, RouteDef, RuntimeConfig, WorkflowConfig, @@ -1273,3 +1274,125 @@ def test_nested_for_each_prohibited(self) -> None: ], ) assert "nested for-each groups are not allowed" in str(exc_info.value).lower() + + +class TestAgentDefReasoning: + """Tests for the reasoning field on AgentDef.""" + + @pytest.mark.parametrize("effort", ["low", "medium", "high", "xhigh"]) + def test_accepts_valid_effort(self, effort: str) -> None: + """Test that AgentDef accepts each valid effort level.""" + agent = AgentDef(name="a", model="gpt-4", prompt="test", reasoning={"effort": effort}) + assert agent.reasoning is not None + assert agent.reasoning.effort == effort + + @pytest.mark.parametrize("effort", ["none", "max", 42]) + def test_rejects_invalid_effort(self, effort: object) -> None: + """Test that invalid effort values raise ValidationError.""" + with pytest.raises(ValidationError): + AgentDef( + name="a", + model="gpt-4", + prompt="test", + reasoning={"effort": effort}, # type: ignore[arg-type] + ) + + def test_reasoning_defaults_to_none(self) -> None: + """Test that reasoning defaults to None when omitted.""" + agent = AgentDef(name="x", model="gpt-4", prompt="test") + assert agent.reasoning is None + + def test_explicit_reasoning_none_is_valid(self) -> None: + """Test that explicitly passing reasoning=None is valid.""" + agent = AgentDef(name="x", model="gpt-4", prompt="test", reasoning=None) + assert agent.reasoning is None + + def test_reasoning_accepts_reasoning_config_instance(self) -> None: + """Test that a ReasoningConfig instance is accepted.""" + agent = AgentDef( + name="a", + model="gpt-4", + prompt="test", + reasoning=ReasoningConfig(effort="high"), + ) + assert agent.reasoning is not None + assert agent.reasoning.effort == "high" + + def test_default_agent_type_accepts_reasoning(self) -> None: + """Test that default (None) agent type accepts reasoning.""" + agent = AgentDef(name="a", model="gpt-4", prompt="test", reasoning={"effort": "medium"}) + assert agent.type is None + assert agent.reasoning is not None + assert agent.reasoning.effort == "medium" + + def test_explicit_agent_type_accepts_reasoning(self) -> None: + """Test that type='agent' accepts reasoning.""" + agent = AgentDef( + name="a", + type="agent", + model="gpt-4", + prompt="test", + reasoning={"effort": "low"}, + ) + assert agent.reasoning is not None + assert agent.reasoning.effort == "low" + + def test_human_gate_with_reasoning_raises(self) -> None: + """Test that human_gate agents cannot have reasoning.""" + with pytest.raises(ValidationError) as exc_info: + AgentDef( + name="gate1", + type="human_gate", + prompt="Choose:", + options=[GateOption(label="Ok", value="ok", route="next")], + reasoning={"effort": "low"}, + ) + assert "human_gate agents cannot have 'reasoning'" in str(exc_info.value) + + def test_script_with_reasoning_raises(self) -> None: + """Test that script agents cannot have reasoning.""" + with pytest.raises(ValidationError) as exc_info: + AgentDef( + name="s", + type="script", + command="echo hello", + reasoning={"effort": "high"}, + ) + assert "script agents cannot have 'reasoning'" in str(exc_info.value) + + def test_workflow_with_reasoning_raises(self) -> 
None:
+        """Test that workflow agents cannot have reasoning."""
+        with pytest.raises(ValidationError) as exc_info:
+            AgentDef(
+                name="w",
+                type="workflow",
+                workflow="./sub.yaml",
+                reasoning={"effort": "medium"},
+            )
+        assert "workflow agents cannot have 'reasoning'" in str(exc_info.value)
+
+
+class TestRuntimeConfigDefaultReasoningEffort:
+    """Tests for default_reasoning_effort on RuntimeConfig."""
+
+    def test_default_is_none(self) -> None:
+        """Test that default_reasoning_effort defaults to None."""
+        config = RuntimeConfig()
+        assert config.default_reasoning_effort is None
+
+    def test_explicit_none_is_valid(self) -> None:
+        """Test that explicitly passing None is valid."""
+        config = RuntimeConfig(default_reasoning_effort=None)
+        assert config.default_reasoning_effort is None
+
+    @pytest.mark.parametrize("effort", ["low", "medium", "high", "xhigh"])
+    def test_accepts_valid_effort(self, effort: str) -> None:
+        """Test that each valid effort level is accepted."""
+        config = RuntimeConfig(default_reasoning_effort=effort)  # type: ignore[arg-type]
+        assert config.default_reasoning_effort == effort
+
+    @pytest.mark.parametrize("effort", ["none", "max", "extreme", 42, ""])
+    def test_rejects_invalid_effort(self, effort: object) -> None:
+        """Test that invalid effort values raise ValidationError."""
+        with pytest.raises(ValidationError):
+            RuntimeConfig(default_reasoning_effort=effort)  # type: ignore[arg-type]
diff --git a/tests/test_engine/test_workflow.py b/tests/test_engine/test_workflow.py
index 89e169a..b466e68 100644
--- a/tests/test_engine/test_workflow.py
+++ b/tests/test_engine/test_workflow.py
@@ -2330,3 +2330,174 @@ def test_extract_key_fallback_returns_string(self, workflow_engine: WorkflowEngi
         key = workflow_engine._extract_key_from_item(item, "missing", fallback_index=42)
         assert key == "42"
         assert isinstance(key, str)
+
+
+class _RecordingReasoningProvider:
+    """Minimal AgentProvider that records the resolved reasoning effort per agent.
+
+    Mirrors what real providers (Copilot/Claude) do: it stores
+    ``default_reasoning_effort`` on init and calls
+    :func:`conductor.providers.reasoning.resolve_reasoning_effort` on each
+    ``execute()`` so we can verify the full plumbing path end-to-end.
+    """
+
+    def __init__(self, default_reasoning_effort=None):
+        self._default_reasoning_effort = default_reasoning_effort
+        self.resolved_efforts: dict[str, str | None] = {}
+
+    async def execute(
+        self,
+        agent,
+        context,
+        rendered_prompt,
+        tools=None,
+        interrupt_signal=None,
+        event_callback=None,
+    ):
+        from conductor.providers.base import AgentOutput
+        from conductor.providers.reasoning import resolve_reasoning_effort
+
+        effort = resolve_reasoning_effort(agent, self._default_reasoning_effort)
+        self.resolved_efforts[agent.name] = effort
+
+        content: dict[str, object] = {}
+        if agent.output:
+            for field_name in agent.output:
+                content[field_name] = f"{agent.name}:{effort}"
+        return AgentOutput(content=content, raw_response=None, model=agent.model)
+
+    async def validate_connection(self) -> bool:
+        return True
+
+    async def close(self) -> None:
+        return None
+
+    async def get_max_prompt_tokens(self, model: str):
+        return None
+
+
+class TestReasoningEffortPlumbing:
+    """End-to-end plumbing for ``runtime.default_reasoning_effort`` and
+    per-agent ``reasoning.effort`` overrides.
+ """ + + @pytest.mark.asyncio + async def test_runtime_default_and_per_agent_override_reach_provider(self) -> None: + """Agents inherit the runtime default; per-agent ``reasoning`` overrides it.""" + from conductor.config.schema import ReasoningConfig + + config = WorkflowConfig( + workflow=WorkflowDef( + name="reasoning-effort-plumbing", + entry_point="inheritor", + runtime=RuntimeConfig(provider="copilot", default_reasoning_effort="medium"), + context=ContextConfig(mode="accumulate"), + limits=LimitsConfig(max_iterations=10), + ), + agents=[ + AgentDef( + name="inheritor", + model="gpt-4", + prompt="Inherit the runtime default", + output={"answer": OutputField(type="string")}, + routes=[RouteDef(to="overrider")], + ), + AgentDef( + name="overrider", + model="gpt-4", + prompt="Override with high", + reasoning=ReasoningConfig(effort="high"), + output={"answer": OutputField(type="string")}, + routes=[RouteDef(to="$end")], + ), + ], + output={ + "inheritor": "{{ inheritor.output.answer }}", + "overrider": "{{ overrider.output.answer }}", + }, + ) + + provider = _RecordingReasoningProvider(default_reasoning_effort="medium") + engine = WorkflowEngine(config, provider) + + result = await engine.run({}) + + assert provider.resolved_efforts == { + "inheritor": "medium", + "overrider": "high", + } + # The recording provider encodes the effort into the output, confirming + # the engine actually consumed the value the provider produced. + assert result["inheritor"] == "inheritor:medium" + assert result["overrider"] == "overrider:high" + + @pytest.mark.asyncio + async def test_no_runtime_default_and_no_agent_reasoning_resolves_to_none(self) -> None: + """When neither side sets reasoning, the resolver returns ``None``.""" + config = WorkflowConfig( + workflow=WorkflowDef( + name="reasoning-effort-unset", + entry_point="solo", + runtime=RuntimeConfig(provider="copilot"), + context=ContextConfig(mode="accumulate"), + limits=LimitsConfig(max_iterations=10), + ), + agents=[ + AgentDef( + name="solo", + model="gpt-4", + prompt="No reasoning configured", + output={"answer": OutputField(type="string")}, + routes=[RouteDef(to="$end")], + ), + ], + output={"answer": "{{ solo.output.answer }}"}, + ) + + provider = _RecordingReasoningProvider(default_reasoning_effort=None) + engine = WorkflowEngine(config, provider) + + await engine.run({}) + + assert provider.resolved_efforts == {"solo": None} + + +class TestProviderFactoryReasoningEffortWiring: + """``ProviderFactory.create_provider`` must forward + ``default_reasoning_effort`` from the ``RuntimeConfig`` to the concrete + provider's ``_default_reasoning_effort`` attribute. 
+ """ + + @pytest.mark.asyncio + async def test_factory_forwards_to_copilot(self) -> None: + from conductor.providers.copilot import CopilotProvider as _CopilotProvider + from conductor.providers.factory import ProviderFactory + + runtime = RuntimeConfig(provider="copilot", default_reasoning_effort="high") + provider = await ProviderFactory.create_provider(runtime, validate=False) + try: + assert isinstance(provider, _CopilotProvider) + assert provider._default_reasoning_effort == "high" + finally: + await provider.close() + + @pytest.mark.asyncio + async def test_factory_forwards_to_claude(self) -> None: + from conductor.providers.claude import ( + ANTHROPIC_SDK_AVAILABLE, + ) + from conductor.providers.claude import ( + ClaudeProvider as _ClaudeProvider, + ) + from conductor.providers.factory import ProviderFactory + + if not ANTHROPIC_SDK_AVAILABLE: + pytest.skip("anthropic SDK not installed") + + runtime = RuntimeConfig(provider="claude", default_reasoning_effort="high") + provider = await ProviderFactory.create_provider(runtime, validate=False) + try: + assert isinstance(provider, _ClaudeProvider) + assert provider._default_reasoning_effort == "high" + finally: + await provider.close() diff --git a/tests/test_integration/test_claude_mcp_tool_filter.py b/tests/test_integration/test_claude_mcp_tool_filter.py index 814ef1b..83729a5 100644 --- a/tests/test_integration/test_claude_mcp_tool_filter.py +++ b/tests/test_integration/test_claude_mcp_tool_filter.py @@ -84,6 +84,7 @@ def _make_provider_with_mcp() -> ClaudeProvider: provider._max_schema_depth = 10 provider._default_max_agent_iterations = 50 provider._default_max_session_seconds = None + provider._default_reasoning_effort = None # Pre-wire a mock MCP manager so _ensure_mcp_connected is a no-op mock_mcp = MagicMock() diff --git a/tests/test_providers/test_claude.py b/tests/test_providers/test_claude.py index a7465b6..fd60e21 100644 --- a/tests/test_providers/test_claude.py +++ b/tests/test_providers/test_claude.py @@ -2555,3 +2555,611 @@ async def test_returns_none_when_sdk_unavailable(self, mock_anthropic_class: Moc with patch("conductor.providers.claude.ANTHROPIC_SDK_AVAILABLE", False): assert await provider.get_max_prompt_tokens("claude-sonnet-4-5") is None + + +class TestClaudeReasoningEffort: + """Tests for extended-thinking / reasoning effort plumbing.""" + + @staticmethod + def _build_provider( + mock_anthropic_module: Mock, + mock_anthropic_class: Mock, + *, + default_reasoning_effort: str | None = None, + temperature: float | None = None, + ) -> tuple[ClaudeProvider, Mock]: + mock_anthropic_module.__version__ = "0.77.0" + mock_client = Mock() + mock_client.models.list = AsyncMock(return_value=Mock(data=[])) + + text_block = Mock() + text_block.type = "text" + text_block.text = "ok" + + response = Mock() + response.content = [text_block] + response.usage = Mock(input_tokens=1, output_tokens=1) + + mock_client.messages.create = AsyncMock(return_value=response) + mock_anthropic_class.return_value = mock_client + + provider = ClaudeProvider( + temperature=temperature, + default_reasoning_effort=default_reasoning_effort, # type: ignore[arg-type] + ) + return provider, mock_client + + @patch("conductor.providers.claude.ANTHROPIC_SDK_AVAILABLE", True) + @patch("conductor.providers.claude.AsyncAnthropic") + @patch("conductor.providers.claude.anthropic") + @pytest.mark.asyncio + async def test_thinking_kwarg_forwarded_with_correct_shape( + self, mock_anthropic_module: Mock, mock_anthropic_class: Mock + ) -> None: + from 
conductor.config.schema import ReasoningConfig + + provider, mock_client = self._build_provider(mock_anthropic_module, mock_anthropic_class) + agent = AgentDef( + name="t", + prompt="p", + model="claude-opus-4-20250514", + reasoning=ReasoningConfig(effort="medium"), + ) + await provider.execute(agent=agent, context={}, rendered_prompt="p") + + kwargs = mock_client.messages.create.call_args[1] + assert kwargs["thinking"] == {"type": "enabled", "budget_tokens": 8192} + + @pytest.mark.parametrize( + "effort,expected", + [("low", 2048), ("medium", 8192), ("high", 16384), ("xhigh", 32768)], + ) + @patch("conductor.providers.claude.ANTHROPIC_SDK_AVAILABLE", True) + @patch("conductor.providers.claude.AsyncAnthropic") + @patch("conductor.providers.claude.anthropic") + @pytest.mark.asyncio + async def test_effort_to_budget_mapping( + self, + mock_anthropic_module: Mock, + mock_anthropic_class: Mock, + effort: str, + expected: int, + ) -> None: + from conductor.config.schema import ReasoningConfig + + provider, mock_client = self._build_provider(mock_anthropic_module, mock_anthropic_class) + agent = AgentDef( + name="t", + prompt="p", + model="claude-sonnet-4-20250514", + reasoning=ReasoningConfig(effort=effort), # type: ignore[arg-type] + ) + await provider.execute(agent=agent, context={}, rendered_prompt="p") + + kwargs = mock_client.messages.create.call_args[1] + assert kwargs["thinking"]["budget_tokens"] == expected + assert kwargs["thinking"]["type"] == "enabled" + + @patch("conductor.providers.claude.ANTHROPIC_SDK_AVAILABLE", True) + @patch("conductor.providers.claude.AsyncAnthropic") + @patch("conductor.providers.claude.anthropic") + @pytest.mark.asyncio + async def test_temperature_coerced_to_one_when_thinking_enabled( + self, mock_anthropic_module: Mock, mock_anthropic_class: Mock + ) -> None: + from conductor.config.schema import ReasoningConfig + + provider, mock_client = self._build_provider( + mock_anthropic_module, mock_anthropic_class, temperature=0.3 + ) + agent = AgentDef( + name="t", + prompt="p", + model="claude-opus-4-20250514", + reasoning=ReasoningConfig(effort="low"), + ) + await provider.execute(agent=agent, context={}, rendered_prompt="p") + + kwargs = mock_client.messages.create.call_args[1] + assert kwargs["temperature"] == 1.0 + # User-configured temperature is preserved on the provider itself. + assert provider._default_temperature == 0.3 + + @patch("conductor.providers.claude.ANTHROPIC_SDK_AVAILABLE", True) + @patch("conductor.providers.claude.AsyncAnthropic") + @patch("conductor.providers.claude.anthropic") + @pytest.mark.asyncio + async def test_max_tokens_bumped_above_budget( + self, mock_anthropic_module: Mock, mock_anthropic_class: Mock + ) -> None: + from conductor.config.schema import ReasoningConfig + + provider, mock_client = self._build_provider(mock_anthropic_module, mock_anthropic_class) + # Default max_tokens=8192. xhigh budget=32768. Effective must be + # >= 32768 + 4096 = 36864. 
+ agent = AgentDef( + name="t", + prompt="p", + model="claude-opus-4-20250514", + reasoning=ReasoningConfig(effort="xhigh"), + ) + await provider.execute(agent=agent, context={}, rendered_prompt="p") + + kwargs = mock_client.messages.create.call_args[1] + assert kwargs["max_tokens"] >= 32768 + 4096 + assert kwargs["max_tokens"] > kwargs["thinking"]["budget_tokens"] + + @patch("conductor.providers.claude.ANTHROPIC_SDK_AVAILABLE", True) + @patch("conductor.providers.claude.AsyncAnthropic") + @patch("conductor.providers.claude.anthropic") + @pytest.mark.asyncio + async def test_validation_error_on_non_thinking_model( + self, mock_anthropic_module: Mock, mock_anthropic_class: Mock + ) -> None: + from conductor.config.schema import ReasoningConfig + + provider, _ = self._build_provider(mock_anthropic_module, mock_anthropic_class) + agent = AgentDef( + name="t", + prompt="p", + model="claude-3-5-sonnet-latest", + reasoning=ReasoningConfig(effort="medium"), + ) + with pytest.raises(ValidationError, match="extended thinking"): + await provider.execute(agent=agent, context={}, rendered_prompt="p") + + @patch("conductor.providers.claude.ANTHROPIC_SDK_AVAILABLE", True) + @patch("conductor.providers.claude.AsyncAnthropic") + @patch("conductor.providers.claude.anthropic") + @pytest.mark.asyncio + async def test_no_validation_error_on_thinking_model( + self, mock_anthropic_module: Mock, mock_anthropic_class: Mock + ) -> None: + from conductor.config.schema import ReasoningConfig + + provider, mock_client = self._build_provider(mock_anthropic_module, mock_anthropic_class) + agent = AgentDef( + name="t", + prompt="p", + model="claude-opus-4-20250514", + reasoning=ReasoningConfig(effort="high"), + ) + # Should not raise. + await provider.execute(agent=agent, context={}, rendered_prompt="p") + assert "thinking" in mock_client.messages.create.call_args[1] + + @patch("conductor.providers.claude.ANTHROPIC_SDK_AVAILABLE", True) + @patch("conductor.providers.claude.AsyncAnthropic") + @patch("conductor.providers.claude.anthropic") + @pytest.mark.asyncio + async def test_runtime_default_used_when_agent_unset( + self, mock_anthropic_module: Mock, mock_anthropic_class: Mock + ) -> None: + provider, mock_client = self._build_provider( + mock_anthropic_module, + mock_anthropic_class, + default_reasoning_effort="low", + ) + agent = AgentDef( + name="t", + prompt="p", + model="claude-sonnet-4-20250514", + # No per-agent reasoning configured. 
+ ) + await provider.execute(agent=agent, context={}, rendered_prompt="p") + + kwargs = mock_client.messages.create.call_args[1] + assert kwargs["thinking"]["budget_tokens"] == 2048 + + @patch("conductor.providers.claude.ANTHROPIC_SDK_AVAILABLE", True) + @patch("conductor.providers.claude.AsyncAnthropic") + @patch("conductor.providers.claude.anthropic") + @pytest.mark.asyncio + async def test_per_agent_reasoning_overrides_runtime_default( + self, mock_anthropic_module: Mock, mock_anthropic_class: Mock + ) -> None: + from conductor.config.schema import ReasoningConfig + + provider, mock_client = self._build_provider( + mock_anthropic_module, + mock_anthropic_class, + default_reasoning_effort="low", + ) + agent = AgentDef( + name="t", + prompt="p", + model="claude-sonnet-4-20250514", + reasoning=ReasoningConfig(effort="high"), + ) + await provider.execute(agent=agent, context={}, rendered_prompt="p") + + kwargs = mock_client.messages.create.call_args[1] + assert kwargs["thinking"]["budget_tokens"] == 16384 + + @patch("conductor.providers.claude.ANTHROPIC_SDK_AVAILABLE", True) + @patch("conductor.providers.claude.AsyncAnthropic") + @patch("conductor.providers.claude.anthropic") + @pytest.mark.asyncio + async def test_thinking_blocks_emit_agent_reasoning_event( + self, mock_anthropic_module: Mock, mock_anthropic_class: Mock + ) -> None: + from conductor.config.schema import ReasoningConfig + + mock_anthropic_module.__version__ = "0.77.0" + mock_client = Mock() + mock_client.models.list = AsyncMock(return_value=Mock(data=[])) + + thinking_block = Mock(spec=["type", "thinking"]) + thinking_block.type = "thinking" + thinking_block.thinking = "Let me reason step by step..." + + text_block = Mock(spec=["type", "text"]) + text_block.type = "text" + text_block.text = "Final answer" + + response = Mock() + response.content = [thinking_block, text_block] + response.usage = Mock(input_tokens=10, output_tokens=20) + + mock_client.messages.create = AsyncMock(return_value=response) + mock_anthropic_class.return_value = mock_client + + provider = ClaudeProvider() + agent = AgentDef( + name="t", + prompt="p", + model="claude-opus-4-20250514", + reasoning=ReasoningConfig(effort="medium"), + ) + + events: list[tuple[str, dict]] = [] + + def cb(event_type: str, data: dict) -> None: + events.append((event_type, data)) + + await provider.execute(agent=agent, context={}, rendered_prompt="p", event_callback=cb) + + reasoning_events = [e for e in events if e[0] == "agent_reasoning"] + assert len(reasoning_events) == 1 + assert reasoning_events[0][1]["content"] == "Let me reason step by step..." + + message_events = [e for e in events if e[0] == "agent_message"] + assert len(message_events) == 1 + assert message_events[0][1]["content"] == "Final answer" + + @patch("conductor.providers.claude.ANTHROPIC_SDK_AVAILABLE", True) + @patch("conductor.providers.claude.AsyncAnthropic") + @patch("conductor.providers.claude.anthropic") + @pytest.mark.asyncio + async def test_no_thinking_kwarg_when_reasoning_unset( + self, mock_anthropic_module: Mock, mock_anthropic_class: Mock + ) -> None: + provider, mock_client = self._build_provider(mock_anthropic_module, mock_anthropic_class) + agent = AgentDef( + name="t", + prompt="p", + model="claude-opus-4-20250514", + # No reasoning config and no runtime default. 
+ ) + await provider.execute(agent=agent, context={}, rendered_prompt="p") + + kwargs = mock_client.messages.create.call_args[1] + assert "thinking" not in kwargs + + +class TestClaudeReasoningEffortRegressions: + """Regression tests for fixes applied on feat/reasoning-effort. + + Covers: + - Fix #1: thinking / redacted_thinking blocks must be echoed back in the + assistant message replay across agentic-loop iterations, otherwise the + Anthropic API rejects iteration 2+ with a 400 when reasoning + tool_use + are combined. + - Fix #3: ``execute_dialog_turn`` raises ``ValidationError`` (not + ``ProviderError``) for non-thinking models when + ``default_reasoning_effort`` is configured. + - Fix #5 / coverage: ``thinking`` kwarg is forwarded through the + parse-recovery path (not just the bare agentic-loop path). + """ + + @staticmethod + def _build_provider_with_responses( + mock_anthropic_module: Mock, + mock_anthropic_class: Mock, + responses: list[Mock], + ) -> tuple[ClaudeProvider, Mock]: + """Build a ClaudeProvider whose messages.create returns ``responses`` in order.""" + mock_anthropic_module.__version__ = "0.77.0" + mock_client = Mock() + mock_client.models.list = AsyncMock(return_value=Mock(data=[])) + mock_client.messages.create = AsyncMock(side_effect=responses) + mock_anthropic_class.return_value = mock_client + + provider = ClaudeProvider() + return provider, mock_client + + @patch("conductor.providers.claude.ANTHROPIC_SDK_AVAILABLE", True) + @patch("conductor.providers.claude.AsyncAnthropic") + @patch("conductor.providers.claude.anthropic") + @pytest.mark.asyncio + async def test_thinking_block_preserved_in_agentic_loop_replay( + self, mock_anthropic_module: Mock, mock_anthropic_class: Mock + ) -> None: + """Regression for Fix #1 (CRITICAL). + + When a Claude response combines a ``thinking`` block with a ``tool_use`` + block, the next iteration of the agentic loop must replay BOTH blocks + in the assistant message, in original order, with the thinking block + serialized as ``{"type": "thinking", "thinking": ..., "signature": ...}``. + Dropping the thinking block (or its signature) causes the Anthropic API + to reject the next request with a 400. + + Also asserts ``thinking`` kwarg is forwarded on EVERY API call within + the loop — not only the first. + """ + from conductor.config.schema import ReasoningConfig + + # Iteration 1: thinking + tool_use → triggers MCP tool execution. + thinking_block = Mock(spec=["type", "thinking", "signature"]) + thinking_block.type = "thinking" + thinking_block.thinking = "Let me consider what tool to call..." + thinking_block.signature = "sig-abc123" + + tool_use_block = Mock(spec=["type", "id", "name", "input"]) + tool_use_block.type = "tool_use" + tool_use_block.id = "toolu_01" + tool_use_block.name = "search" + tool_use_block.input = {"query": "weather"} + + response_iter1 = Mock() + response_iter1.content = [thinking_block, tool_use_block] + response_iter1.usage = Mock(input_tokens=10, output_tokens=20) + + # Iteration 2: text only → loop terminates. + text_block = Mock(spec=["type", "text"]) + text_block.type = "text" + text_block.text = "It is sunny." 
+ + response_iter2 = Mock() + response_iter2.content = [text_block] + response_iter2.usage = Mock(input_tokens=15, output_tokens=5) + + provider, mock_client = self._build_provider_with_responses( + mock_anthropic_module, mock_anthropic_class, [response_iter1, response_iter2] + ) + + # Stub MCP manager so the tool_use is actually executed and the loop + # advances to iteration 2 (without an MCP manager the loop bails out). + mock_mcp = Mock() + mock_mcp.has_servers = Mock(return_value=False) # don't add tools to the request + mock_mcp.call_tool = AsyncMock(return_value="sunny") + provider._mcp_manager = mock_mcp + + agent = AgentDef( + name="t", + prompt="p", + model="claude-opus-4-20250514", + reasoning=ReasoningConfig(effort="medium"), + ) + await provider.execute(agent=agent, context={}, rendered_prompt="p") + + # Both API calls must have happened. + assert mock_client.messages.create.await_count == 2 + + # Thinking kwarg present on BOTH calls (not just the first). + first_kwargs = mock_client.messages.create.call_args_list[0].kwargs + second_kwargs = mock_client.messages.create.call_args_list[1].kwargs + expected_thinking = {"type": "enabled", "budget_tokens": 8192} + assert first_kwargs["thinking"] == expected_thinking + assert second_kwargs["thinking"] == expected_thinking + + # The second call's messages must echo the assistant turn with both + # blocks serialized in original order. + replayed_messages = second_kwargs["messages"] + assistant_turns = [m for m in replayed_messages if m["role"] == "assistant"] + assert len(assistant_turns) == 1, "Exactly one assistant replay expected" + assistant_content = assistant_turns[0]["content"] + + assert assistant_content == [ + { + "type": "thinking", + "thinking": "Let me consider what tool to call...", + "signature": "sig-abc123", + }, + { + "type": "tool_use", + "id": "toolu_01", + "name": "search", + "input": {"query": "weather"}, + }, + ] + + @patch("conductor.providers.claude.ANTHROPIC_SDK_AVAILABLE", True) + @patch("conductor.providers.claude.AsyncAnthropic") + @patch("conductor.providers.claude.anthropic") + @pytest.mark.asyncio + async def test_redacted_thinking_block_preserved_in_agentic_loop_replay( + self, mock_anthropic_module: Mock, mock_anthropic_class: Mock + ) -> None: + """Regression for Fix #1: ``redacted_thinking`` blocks (no signature, has ``data``) + must also be echoed in the assistant replay as + ``{"type": "redacted_thinking", "data": ...}``. 
+ """ + from conductor.config.schema import ReasoningConfig + + redacted_block = Mock(spec=["type", "data"]) + redacted_block.type = "redacted_thinking" + redacted_block.data = "REDACTED_PAYLOAD" + + tool_use_block = Mock(spec=["type", "id", "name", "input"]) + tool_use_block.type = "tool_use" + tool_use_block.id = "toolu_02" + tool_use_block.name = "search" + tool_use_block.input = {"q": "x"} + + response_iter1 = Mock() + response_iter1.content = [redacted_block, tool_use_block] + response_iter1.usage = Mock(input_tokens=5, output_tokens=5) + + text_block = Mock(spec=["type", "text"]) + text_block.type = "text" + text_block.text = "done" + response_iter2 = Mock() + response_iter2.content = [text_block] + response_iter2.usage = Mock(input_tokens=1, output_tokens=1) + + provider, mock_client = self._build_provider_with_responses( + mock_anthropic_module, mock_anthropic_class, [response_iter1, response_iter2] + ) + + mock_mcp = Mock() + mock_mcp.has_servers = Mock(return_value=False) + mock_mcp.call_tool = AsyncMock(return_value="ok") + provider._mcp_manager = mock_mcp + + agent = AgentDef( + name="t", + prompt="p", + model="claude-opus-4-20250514", + reasoning=ReasoningConfig(effort="low"), + ) + await provider.execute(agent=agent, context={}, rendered_prompt="p") + + assert mock_client.messages.create.await_count == 2 + replayed_messages = mock_client.messages.create.call_args_list[1].kwargs["messages"] + assistant_turns = [m for m in replayed_messages if m["role"] == "assistant"] + assert assistant_turns[0]["content"] == [ + {"type": "redacted_thinking", "data": "REDACTED_PAYLOAD"}, + { + "type": "tool_use", + "id": "toolu_02", + "name": "search", + "input": {"q": "x"}, + }, + ] + + @patch("conductor.providers.claude.ANTHROPIC_SDK_AVAILABLE", True) + @patch("conductor.providers.claude.AsyncAnthropic") + @patch("conductor.providers.claude.anthropic") + @pytest.mark.asyncio + async def test_dialog_turn_raises_validation_error_for_non_thinking_model( + self, mock_anthropic_module: Mock, mock_anthropic_class: Mock + ) -> None: + """Regression for Fix #3. + + ``execute_dialog_turn`` must raise ``ValidationError`` (not silently + drop the reasoning request, and not wrap it as ``ProviderError``) + when ``default_reasoning_effort`` is set but the resolved model does + not support extended thinking. Mirrors ``_resolve_thinking_for_agent``. + """ + mock_anthropic_module.__version__ = "0.77.0" + mock_client = Mock() + mock_client.models.list = AsyncMock(return_value=Mock(data=[])) + # messages.create should NOT be called — the validation must trip + # first. Setting it as AsyncMock guards against silent fallthrough. + mock_client.messages.create = AsyncMock(return_value=Mock(content=[])) + mock_anthropic_class.return_value = mock_client + + provider = ClaudeProvider(default_reasoning_effort="high") # type: ignore[arg-type] + + with pytest.raises(ValidationError, match="extended thinking"): + await provider.execute_dialog_turn( + system_prompt="sys", + user_message="hi", + model="claude-3-5-sonnet-latest", + ) + + # And ensure no API call was made (no silent dropping of reasoning). 
+ mock_client.messages.create.assert_not_awaited() + + @patch("conductor.providers.claude.ANTHROPIC_SDK_AVAILABLE", True) + @patch("conductor.providers.claude.AsyncAnthropic") + @patch("conductor.providers.claude.anthropic") + @pytest.mark.asyncio + async def test_dialog_turn_succeeds_for_thinking_model_with_default_effort( + self, mock_anthropic_module: Mock, mock_anthropic_class: Mock + ) -> None: + """Companion to Fix #3: the same ``default_reasoning_effort`` setting + must work on a thinking-capable model and forward the ``thinking`` + kwarg with the correct budget. + """ + mock_anthropic_module.__version__ = "0.77.0" + mock_client = Mock() + mock_client.models.list = AsyncMock(return_value=Mock(data=[])) + + text_block = Mock(spec=["type", "text"]) + text_block.type = "text" + text_block.text = "hello" + response = Mock() + response.content = [text_block] + mock_client.messages.create = AsyncMock(return_value=response) + mock_anthropic_class.return_value = mock_client + + provider = ClaudeProvider(default_reasoning_effort="medium") # type: ignore[arg-type] + + result = await provider.execute_dialog_turn( + system_prompt="sys", + user_message="hi", + model="claude-opus-4-20250514", + ) + + assert result == "hello" + kwargs = mock_client.messages.create.call_args.kwargs + assert kwargs["thinking"] == {"type": "enabled", "budget_tokens": 8192} + # max_tokens must accommodate the thinking budget. + assert kwargs["max_tokens"] >= 8192 + 4096 + + @patch("conductor.providers.claude.ANTHROPIC_SDK_AVAILABLE", True) + @patch("conductor.providers.claude.AsyncAnthropic") + @patch("conductor.providers.claude.anthropic") + @pytest.mark.asyncio + async def test_thinking_kwarg_forwarded_through_parse_recovery_path( + self, mock_anthropic_module: Mock, mock_anthropic_class: Mock + ) -> None: + """Regression for Fix #5 / coverage gap. + + Agents with an ``output:`` schema route through + ``_execute_with_parse_recovery`` rather than the bare ``_execute_api_call`` + path. Verify that ``thinking`` is forwarded on that path so reasoning + is not silently dropped for structured-output agents. + """ + from conductor.config.schema import ReasoningConfig + + # emit_output tool_use → parse-recovery happy path (no recovery needed, + # but still goes through _execute_with_parse_recovery). + emit_block = Mock(spec=["type", "name", "input", "id"]) + emit_block.type = "tool_use" + emit_block.name = "emit_output" + emit_block.id = "toolu_emit" + emit_block.input = {"answer": "42"} + + response = Mock() + response.content = [emit_block] + response.usage = Mock(input_tokens=5, output_tokens=5) + + provider, mock_client = self._build_provider_with_responses( + mock_anthropic_module, mock_anthropic_class, [response] + ) + + agent = AgentDef( + name="t", + prompt="p", + model="claude-opus-4-20250514", + reasoning=ReasoningConfig(effort="high"), + output={"answer": OutputField(type="string")}, + ) + result = await provider.execute(agent=agent, context={}, rendered_prompt="p") + + assert result.content == {"answer": "42"} + kwargs = mock_client.messages.create.call_args.kwargs + assert kwargs["thinking"] == {"type": "enabled", "budget_tokens": 16384} + # temperature must be coerced to 1.0 when thinking is enabled. + assert kwargs["temperature"] == 1.0 + + # TODO: cover _execute_api_call interrupt-race branch (interrupt_signal set + # mid-call) — requires racing asyncio.Event with the mocked API call and is + # exercised indirectly today via the agentic-loop tests above. 
+ # TODO: cover _request_partial_output path with thinking forwarded — this + # is a fourth messages.create site reachable only via mid-agent interrupt + # and partial-output flow; mocking complexity is prohibitive for a unit + # test (would need a full asyncio interrupt fixture). diff --git a/tests/test_providers/test_claude_event_callback.py b/tests/test_providers/test_claude_event_callback.py index af47f0d..fb4c764 100644 --- a/tests/test_providers/test_claude_event_callback.py +++ b/tests/test_providers/test_claude_event_callback.py @@ -63,6 +63,7 @@ def _make_provider_with_mcp() -> ClaudeProvider: provider._max_schema_depth = 10 provider._default_max_agent_iterations = 50 provider._default_max_session_seconds = None + provider._default_reasoning_effort = None mock_mcp_manager = MagicMock() mock_mcp_manager.has_servers.return_value = True @@ -89,6 +90,7 @@ def _make_bare_provider() -> ClaudeProvider: provider._max_schema_depth = 10 provider._default_max_agent_iterations = 50 provider._default_max_session_seconds = None + provider._default_reasoning_effort = None return provider @@ -547,6 +549,7 @@ async def test_callback_reaches_agentic_loop(self) -> None: agent.model = None agent.max_agent_iterations = None agent.max_session_seconds = None + agent.reasoning = None await provider._execute_with_retry( agent=agent, diff --git a/tests/test_providers/test_claude_interrupt.py b/tests/test_providers/test_claude_interrupt.py index 1063ce0..10d5955 100644 --- a/tests/test_providers/test_claude_interrupt.py +++ b/tests/test_providers/test_claude_interrupt.py @@ -41,6 +41,7 @@ def _make_provider() -> ClaudeProvider: provider._max_schema_depth = 10 provider._default_max_agent_iterations = 50 provider._default_max_session_seconds = None + provider._default_reasoning_effort = None return provider diff --git a/tests/test_providers/test_claude_parameter_passing.py b/tests/test_providers/test_claude_parameter_passing.py index 8147800..8b17aae 100644 --- a/tests/test_providers/test_claude_parameter_passing.py +++ b/tests/test_providers/test_claude_parameter_passing.py @@ -47,6 +47,7 @@ async def test_common_parameters_passed_from_factory(self, mock_claude_class: Mo mcp_servers=None, max_agent_iterations=None, max_session_seconds=None, + default_reasoning_effort=None, ) @patch("conductor.providers.claude.ANTHROPIC_SDK_AVAILABLE", True) diff --git a/tests/test_providers/test_copilot.py b/tests/test_providers/test_copilot.py index 10fb6ce..7985aa2 100644 --- a/tests/test_providers/test_copilot.py +++ b/tests/test_providers/test_copilot.py @@ -937,3 +937,305 @@ async def list_models() -> list[Any]: provider = self._provider_with_list_models(list_models) assert await provider.get_max_prompt_tokens("claude-3-5-sonnet-latest") == 200_000 assert await provider.get_max_prompt_tokens("claude-3-5-sonnet-20241022") == 200_000 + + +class TestReasoningEffort: + """Tests for reasoning_effort plumbing into create_session.""" + + @staticmethod + def _make_model(model_id: str, supported: list[str] | None) -> Any: + from types import SimpleNamespace + + return SimpleNamespace( + id=model_id, + capabilities=SimpleNamespace( + limits=SimpleNamespace(max_prompt_tokens=128_000), + supported_reasoning_efforts=supported, + ), + ) + + @staticmethod + async def _build_provider( + captured: dict[str, Any], + monkeypatch: pytest.MonkeyPatch, + *, + default_reasoning_effort: str | None = None, + list_models_impl: Any = None, + ) -> CopilotProvider: + """Build a provider with a fake client that captures create_session kwargs. 
+ + The provider is in real-SDK mode (``_mock_handler`` set to ``None``) + so the validation + plumbing path is exercised end to end. + """ + + class _FakeSession: + session_id = "session-xyz" + + async def disconnect(self) -> None: + return None + + class _FakeClient: + async def create_session(self, **kwargs: Any) -> _FakeSession: + captured["create_session_kwargs"] = kwargs + return _FakeSession() + + async def list_models(self) -> list[Any]: + if list_models_impl is None: + return [] + return await list_models_impl() + + provider = CopilotProvider( + mock_handler=stub_handler, + default_reasoning_effort=default_reasoning_effort, + ) + provider._mock_handler = None + provider._client = _FakeClient() + provider._started = True + + async def _noop() -> None: + return None + + async def _fake_send_and_wait(*args: Any, **kwargs: Any) -> SDKResponse: + return SDKResponse(content='{"ok":true}') + + monkeypatch.setattr(provider, "_ensure_client_started", _noop) + monkeypatch.setattr(provider, "_send_and_wait", _fake_send_and_wait) + return provider + + @pytest.mark.asyncio + async def test_per_agent_effort_forwarded_to_create_session( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + from conductor.config.schema import ReasoningConfig + + captured: dict[str, Any] = {} + provider = await self._build_provider(captured, monkeypatch) + agent = AgentDef( + name="planner", + model="gpt-4o", + prompt="Plan", + reasoning=ReasoningConfig(effort="high"), + ) + await provider.execute(agent=agent, context={}, rendered_prompt="Plan") + assert captured["create_session_kwargs"]["reasoning_effort"] == "high" + + @pytest.mark.asyncio + async def test_runtime_default_used_when_agent_has_none( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + captured: dict[str, Any] = {} + provider = await self._build_provider( + captured, monkeypatch, default_reasoning_effort="medium" + ) + agent = AgentDef(name="planner", model="gpt-4o", prompt="Plan") + await provider.execute(agent=agent, context={}, rendered_prompt="Plan") + assert captured["create_session_kwargs"]["reasoning_effort"] == "medium" + + @pytest.mark.asyncio + async def test_per_agent_effort_overrides_runtime_default( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + from conductor.config.schema import ReasoningConfig + + captured: dict[str, Any] = {} + provider = await self._build_provider(captured, monkeypatch, default_reasoning_effort="low") + agent = AgentDef( + name="planner", + model="gpt-4o", + prompt="Plan", + reasoning=ReasoningConfig(effort="xhigh"), + ) + await provider.execute(agent=agent, context={}, rendered_prompt="Plan") + assert captured["create_session_kwargs"]["reasoning_effort"] == "xhigh" + + @pytest.mark.asyncio + async def test_no_effort_set_means_key_absent(self, monkeypatch: pytest.MonkeyPatch) -> None: + captured: dict[str, Any] = {} + provider = await self._build_provider(captured, monkeypatch) + agent = AgentDef(name="planner", model="gpt-4o", prompt="Plan") + await provider.execute(agent=agent, context={}, rendered_prompt="Plan") + assert "reasoning_effort" not in captured["create_session_kwargs"] + + @pytest.mark.asyncio + async def test_validation_error_when_model_does_not_support_effort( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + from conductor.config.schema import ReasoningConfig + from conductor.exceptions import ValidationError + + async def list_models() -> list[Any]: + return [self._make_model("gpt-4o", supported=["low", "medium"])] + + captured: dict[str, Any] = {} + provider = await 
self._build_provider(captured, monkeypatch, list_models_impl=list_models) + agent = AgentDef( + name="planner", + model="gpt-4o", + prompt="Plan", + reasoning=ReasoningConfig(effort="xhigh"), + ) + with pytest.raises(ValidationError, match="does not support reasoning_effort"): + await provider.execute(agent=agent, context={}, rendered_prompt="Plan") + + @pytest.mark.asyncio + async def test_validation_skipped_in_mock_handler_mode(self) -> None: + """Mock-handler mode must skip capability validation entirely.""" + provider = CopilotProvider(mock_handler=stub_handler) + # Even an obviously bogus effort value is accepted because the SDK + # path is short-circuited by the mock handler. + await provider._validate_reasoning_effort_for_model("gpt-4o", "xhigh") + + @pytest.mark.asyncio + async def test_supported_efforts_none_allows_any_effort( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + """When the SDK reports no capability metadata, validation is permissive.""" + from conductor.config.schema import ReasoningConfig + + async def list_models() -> list[Any]: + return [self._make_model("gpt-4o", supported=None)] + + captured: dict[str, Any] = {} + provider = await self._build_provider(captured, monkeypatch, list_models_impl=list_models) + agent = AgentDef( + name="planner", + model="gpt-4o", + prompt="Plan", + reasoning=ReasoningConfig(effort="xhigh"), + ) + await provider.execute(agent=agent, context={}, rendered_prompt="Plan") + assert captured["create_session_kwargs"]["reasoning_effort"] == "xhigh" + + @pytest.mark.asyncio + async def test_validation_error_not_retried_in_execute_with_retry( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Regression: ValidationError from capability check must escape unwrapped + from _execute_with_retry after a single attempt — no retry, no sleep, + and not re-wrapped as ProviderError. + """ + from conductor.config.schema import ReasoningConfig + from conductor.exceptions import ValidationError + + list_models_calls = 0 + + async def list_models() -> list[Any]: + nonlocal list_models_calls + list_models_calls += 1 + return [self._make_model("gpt-4o", supported=["low", "medium"])] + + sleep_calls: list[float] = [] + + async def fake_sleep(delay: float) -> None: + sleep_calls.append(delay) + + monkeypatch.setattr("conductor.providers.copilot.asyncio.sleep", fake_sleep) + + captured: dict[str, Any] = {} + provider = await self._build_provider(captured, monkeypatch, list_models_impl=list_models) + # Force a multi-attempt retry config so a successful retry-suppression + # is unambiguous (a ProviderError-wrapped path would loop 3 times). + provider._retry_config = RetryConfig(max_attempts=3, base_delay=0.0, max_delay=0.0) + + agent = AgentDef( + name="planner", + model="gpt-4o", + prompt="Plan", + reasoning=ReasoningConfig(effort="high"), + ) + + with pytest.raises(ValidationError, match="does not support reasoning_effort"): + await provider.execute(agent=agent, context={}, rendered_prompt="Plan") + + # Capability check ran exactly once — no retry of the SDK call. + assert list_models_calls == 1 + # _retry_history is only appended on the ProviderError/Exception + # branches; the ValidationError branch must skip it entirely. + assert provider.get_retry_history() == [] + # No backoff sleep was scheduled. 
+ assert sleep_calls == [] + + @pytest.mark.asyncio + async def test_validation_error_from_dialog_turn_escapes_unwrapped( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Regression: ValidationError from execute_dialog_turn must propagate + unwrapped (not re-wrapped as ProviderError by the broad except clause). + """ + from unittest.mock import AsyncMock as _AsyncMock + + from conductor.exceptions import ValidationError + + provider = CopilotProvider( + mock_handler=stub_handler, + default_reasoning_effort="xhigh", + ) + provider._mock_handler = None + provider._started = True + + async def list_models() -> list[Any]: + return [self._make_model("gpt-4o", supported=["low", "medium"])] + + # create_session must NOT be reached when validation fails. + create_session_called = False + + async def create_session(**kwargs: Any) -> Any: + nonlocal create_session_called + create_session_called = True + raise AssertionError("create_session should not be called when validation fails") + + client = _AsyncMock() + client.create_session = create_session + client.list_models = list_models + provider._client = client + + async def _noop() -> None: + return None + + monkeypatch.setattr(provider, "_ensure_client_started", _noop) + + with pytest.raises(ValidationError) as exc_info: + await provider.execute_dialog_turn( + system_prompt="be helpful", + user_message="hi", + history=[], + model="gpt-4o", + ) + + # Original typed error preserved (not stringified into ProviderError). + assert "does not support reasoning_effort" in str(exc_info.value) + assert exc_info.value.suggestion is not None + assert not create_session_called + + @pytest.mark.asyncio + async def test_retryable_provider_error_is_still_retried( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Guard against an over-broad fix: non-validation ProviderError must + still trigger the retry loop up to max_attempts. + """ + call_count = 0 + + def mock_handler(agent: AgentDef, prompt: str, context: dict[str, Any]) -> dict[str, Any]: + nonlocal call_count + call_count += 1 + raise ProviderError("transient backend error", status_code=500) + + sleep_calls: list[float] = [] + + async def fake_sleep(delay: float) -> None: + sleep_calls.append(delay) + + monkeypatch.setattr("conductor.providers.copilot.asyncio.sleep", fake_sleep) + + retry_config = RetryConfig(max_attempts=3, base_delay=0.0, max_delay=0.0, jitter=0.0) + provider = CopilotProvider(mock_handler=mock_handler, retry_config=retry_config) + agent = AgentDef(name="planner", model="gpt-4o", prompt="Plan") + + with pytest.raises(ProviderError): + await provider.execute(agent=agent, context={}, rendered_prompt="Plan") + + assert call_count == 3 + assert len(provider.get_retry_history()) == 3 + # Two backoff sleeps between three attempts. + assert len(sleep_calls) == 2
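Taken together, the Claude-side tests above pin down a small request-assembly contract: enable `thinking` with the mapped budget, coerce `temperature` to `1.0`, and raise `max_tokens` to at least `budget + 4096` (with the 64000 ceiling noted in the authoring docs). A standalone sketch of that contract, reusing the helpers this diff adds in `src/conductor/providers/reasoning.py`; `build_claude_thinking_kwargs` is a hypothetical name (the provider's real assembly is private), and a plain `ValueError` stands in for the project's `ValidationError`:

```python
from conductor.providers.reasoning import (
    effort_to_budget_tokens,
    is_claude_thinking_model,
)

MAX_TOKENS_CAP = 64_000  # ceiling applied after the bump
ANSWER_HEADROOM = 4_096  # answer tokens reserved above the thinking budget


def build_claude_thinking_kwargs(
    model: str, effort: str | None, temperature: float | None, max_tokens: int
) -> dict:
    """Sketch of the per-request adjustments the tests above assert."""
    if effort is None:
        # Reasoning unset everywhere: send no thinking kwarg at all.
        return {"max_tokens": max_tokens, "temperature": temperature}
    if not is_claude_thinking_model(model):
        raise ValueError(f"{model!r} does not support extended thinking")
    budget = effort_to_budget_tokens(effort)
    return {
        "thinking": {"type": "enabled", "budget_tokens": budget},
        "temperature": 1.0,  # Anthropic requires 1.0 with extended thinking
        "max_tokens": min(max(max_tokens, budget + ANSWER_HEADROOM), MAX_TOKENS_CAP),
    }


kwargs = build_claude_thinking_kwargs("claude-opus-4-20250514", "xhigh", 0.3, 8_192)
assert kwargs["thinking"]["budget_tokens"] == 32_768
assert kwargs["max_tokens"] == 36_864 and kwargs["temperature"] == 1.0
```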