diff --git a/.claude/skills/nasde-benchmark-creator/SKILL.md b/.claude/skills/nasde-benchmark-creator/SKILL.md index 55a3c23..c3d0987 100644 --- a/.claude/skills/nasde-benchmark-creator/SKILL.md +++ b/.claude/skills/nasde-benchmark-creator/SKILL.md @@ -288,6 +288,21 @@ model = "google/gemini-3-flash-preview" # Required format: google/ - `google/gemini-3-flash-preview` — best quality/speed ratio, daily coding tasks - `google/gemini-3.1-flash-lite-preview` — fastest, simple and repetitive tasks +### Scoping a variant to specific tasks (optional) + +If a variant only makes sense for certain tasks — e.g. a skill whose examples are +tuned to one repo's conventions — declare a `tasks` list. It restricts the variant +to those tasks so `--all-variants` never runs it against the wrong codebase: + +```toml +agent = "claude" +model = "claude-sonnet-4-6" +tasks = ["my-benchmark/task-a"] # only runs against these tasks +``` + +Omit `tasks` for a general-purpose variant (the default — runs against all tasks). +The scope wins even over an explicit `--tasks` filter. + ### Claude Code variant ``` diff --git a/.claude/skills/nasde-dev/SKILL.md b/.claude/skills/nasde-dev/SKILL.md index 53e8ca5..bf183fa 100644 --- a/.claude/skills/nasde-dev/SKILL.md +++ b/.claude/skills/nasde-dev/SKILL.md @@ -58,6 +58,7 @@ After any change to the evaluation pipeline, CLI flags, configuration schema, ag **Think DX-first:** For every new option or feature, ask "where will the user be when they need this?" and put the documentation there. A feature that exists only in CLAUDE.md is invisible to most users. Check every touchpoint: **Documentation:** +- `CHANGELOG.md` — **add an entry under `## [Unreleased]`** (Added / Changed / Fixed) for any user-visible change: new CLI flag, config field, behavior change, dependency bump. Add the `[#NN]` PR link-reference at the bottom. Easy to forget — do it as part of the change, not at release time. - `README.md` — user-facing documentation (CLI options table, nasde.toml config reference, explanatory text). This is where most users look first. - `CLAUDE.md` — agent instructions (CLI reference, nasde.toml example, architecture decisions) - `ARCHITECTURE.md` — system architecture with mermaid diagrams (end-to-end flow, trial lifecycle, cloud sandbox providers, assessment evaluation) diff --git a/.gitignore b/.gitignore index 2972cf7..0c9b043 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .DS_Store +.idea/ __pycache__/ *.pyc .venv/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 1834578..f7aaa26 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,24 @@ See [docs/RELEASING.md](docs/RELEASING.md) for the release procedure. ## [Unreleased] +### Added +- **Variant task-scoping: `tasks` array in `variant.toml`.** A variant may declare + `tasks = [...]` to restrict it to specific tasks — use it for a repo-specific + variant (e.g. a skill tuned to one repo's conventions) so it never runs against + the wrong codebase. With `--all-variants` a scoped variant runs only against its + declared tasks (others SKIPPED); with a single `--variant`, requesting a task + outside its scope aborts with a clear error. The scope wins over an explicit + `--tasks` filter. Absent/empty → unscoped (the default). ([#54]) + +### Changed +- **Evaluator default `max_turns` raised 30 → 60.** Avoids `error_max_turns` on + large/complex workspaces (e.g. DDD refactors). Override via `[evaluation] + max_turns` in `nasde.toml`. ([#54]) + +### Fixed +- **Bump `starlette` 1.0.0 → 1.1.0** (PYSEC-2026-161; transitive via + harbor/fastapi/mcp). ([#54]) + ## [0.4.0] — 2026-05-21 ### Added @@ -390,4 +408,5 @@ Initial release under the **nasde-toolkit** name (rebrand from [#50]: https://github.com/NoesisVision/nasde-toolkit/pull/50 [#51]: https://github.com/NoesisVision/nasde-toolkit/pull/51 [#52]: https://github.com/NoesisVision/nasde-toolkit/pull/52 +[#54]: https://github.com/NoesisVision/nasde-toolkit/pull/54 [gh-litellm-2026-04]: https://github.com/BerriAI/litellm/security/advisories/GHSA-xqmj-j6mv-4862 diff --git a/CLAUDE.md b/CLAUDE.md index b1e3089..f567324 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -166,7 +166,7 @@ build_commands = [] backend = "claude" # "claude" (default) | "codex" model = "claude-opus-4-7" dimensions_file = "assessment_dimensions.json" -# max_turns = 30 # Max evaluator conversation turns +# max_turns = 60 # Max evaluator conversation turns (default 60) # allowed_tools = ["Read", "Glob", "Grep"] # Override default tool whitelist # mcp_config = "./evaluator_mcp.json" # MCP server config for evaluator # skills_dir = "./evaluator_skills" # Skills directory for evaluator @@ -255,6 +255,8 @@ Declares the agent type and the model the variant runs against. Every variant MU agent = "claude" # "claude" | "codex" | "gemini" model = "claude-sonnet-4-6" # REQUIRED. Model appropriate for the agent family. +tasks = ["my-benchmark/task-a"] # Optional: task-scope. Restrict this variant to specific tasks. + [[skill]] # Optional (ADR-009): skill-by-reference, Claude only. path = "../../../src/plugins/my-plugin/skills/my-skill" # source skill dir (required) ref = "abc1234" # optional git ref, same semantics as [nasde.source] @@ -262,6 +264,16 @@ ref = "abc1234" # optional git ref, same semantics as [nasde **Model priority**: `--model` CLI flag > `variant.toml [model]`. Missing model in both places → SystemExit with a clear error. +**`tasks` (variant task-scope)**: optional list of task names this variant is +meant to run against. Use it for a *repo-specific* variant — e.g. a skill whose +examples reference one repo's conventions — so it never runs against the wrong +codebase. The scope is enforced in **both** run modes: with `--all-variants` a +scoped variant runs only against its declared tasks (others are skipped with a +SKIPPED status); with a single `--variant`, if none of the requested tasks fall in +the variant's scope the run aborts with a clear error. Either way the scope wins +over an explicit `--tasks` filter. Absent or empty → unscoped (runs against every +task, the default). + **`[[skill]]` (skill-by-reference)**: each entry stages the **whole** skill dir (incl. `references/`) into `/app/.claude/skills//` from a source path — no copy under `variants//skills/`. Optional `ref` reads from a temp git diff --git a/README.md b/README.md index 3c09622..992af0b 100644 --- a/README.md +++ b/README.md @@ -178,6 +178,12 @@ The criteria spell out what each score means for each dimension. Here is the ful **The insight:** the same "DDD guidance" skill helps Claude a little (+3.5) and *badly* hurts Codex (-22). The per-dimension breakdown pinpoints *where* Codex regresses — domain modeling, encapsulation, extensibility — which would be invisible without this assessment. Skill optimization is agent-specific. +### Deep dive — does a public skill, and tuning it, actually help? + +A separate study took a public DDD skill (the `tactical-ddd` skill from `ntcoding/claude-skillz`) and its repo-tuned version across four configurations on two deliberately different tasks — a feature on a clean DDD codebase and a legacy anemic→rich refactor. The headline: **the effect is task-dependent** — tuning the skill to the repo adds **+0.13** quality on the clean-feature task but only **+0.06** on the legacy refactor (increment over the bare model; absolute scores across tasks aren't comparable). Two lessons that generalize: judge *per dimension*, not one aggregate; and a skill present on disk is not a skill used — verify it activated. + +→ Full tables, per-dimension radars, and token/time charts in **[Benchmark Results](docs/benchmark-results.md#deep-dive--tactical-ddd-skill-public-vs-repo-tuned-claude-code)**. + ### More benchmarks in the repo - **Refactoring katas (Java + Python)** — four classic refactorings scored on behavior preservation, clarity, technique, scope discipline. *Takeaway:* a candidate "refactoring skill" didn't move the score — shipping it would have been based on vibes. @@ -402,7 +408,7 @@ append_system_prompt = "Pay special attention to SOLID principles when scoring." | `backend` | `claude` | Subprocess backend: `claude` or `codex` | | `model` | `claude-opus-4-7` | Evaluator model | | `dimensions_file` | `assessment_dimensions.json` | Scoring dimensions file | -| `max_turns` | `30` | Max conversation turns | +| `max_turns` | `60` | Max evaluator conversation turns (raise for DDD-rich workspaces with many small files) | | `allowed_tools` | `["Read", "Glob", "Grep"]` | Tool whitelist | | `mcp_config` | — | Path to MCP server config JSON | | `skills_dir` | — | Path to evaluator skills directory | @@ -473,6 +479,28 @@ sandbox — no copy under `variants/`. The legacy `variants//skills//` copy path still works unchanged (and now also carries `references/`, which it previously dropped). +## Scoping a variant to specific tasks (`tasks`) + +Some variants only make sense for one task — for example, a skill whose code +examples are *tuned to a particular repo's conventions*. Running such a variant +against a different codebase produces misleading results. Declare a `tasks` +scope in the variant's `variant.toml`: + +```toml +agent = "claude" +model = "claude-sonnet-4-6" + +# This variant's skill references this repo's value objects, so it should only +# run against that task. +tasks = ["csharp-anemic-to-rich-domain"] +``` + +The scope is enforced either way you run: with `--all-variants` a scoped variant +runs **only** against its declared tasks (others show as `SKIPPED`); with a single +`--variant`, asking for a task outside its scope aborts with a clear error rather +than running against the wrong repo. Omit `tasks` (the default) for a +general-purpose variant that runs everywhere. + ## Commands ### Core @@ -581,7 +609,7 @@ build_commands = [] backend = "claude" # "claude" (default) | "codex" model = "claude-opus-4-7" dimensions_file = "assessment_dimensions.json" -# max_turns = 30 # Max evaluator conversation turns +# max_turns = 60 # Max evaluator conversation turns (default 60) # allowed_tools = ["Read", "Glob", "Grep"] # Override default tool whitelist # mcp_config = "./evaluator_mcp.json" # MCP server config for evaluator # skills_dir = "./evaluator_skills" # Skills directory for evaluator diff --git a/docs/benchmark-results.md b/docs/benchmark-results.md index 2867d90..a79c830 100644 --- a/docs/benchmark-results.md +++ b/docs/benchmark-results.md @@ -30,6 +30,45 @@ Results from the three example benchmarks included in `examples/`. All scores ar **Takeaway:** Architectural guidance helps Claude (+3.5) but dramatically hurts Codex (-22.0). The same skill applied to different agents can have opposite effects — this is exactly the kind of insight NASDE is designed to surface. +### Deep dive — tactical-ddd skill: public vs repo-tuned (Claude Code) + +A focused follow-up on the same benchmark family: we took a public DDD skill ([`tactical-ddd` from `ntcoding/claude-skillz`](https://github.com/NTCoding/claude-skillz)) and a repo-tuned version, and measured four configurations of Claude Code on two deliberately different tasks — a **feature on a clean DDD codebase** (`ddd-weather-discount`) and a **legacy anemic→rich refactor** (`csharp-movie-rental-anemic`). Each configuration was run repeatedly and each run scored repeatedly; the numbers below are medians (normalized 0–1). Skill activation was verified per run — a mounted skill the agent never invokes scores like no skill at all. + +| Configuration | Weather (feature) | Movie (legacy) | +|---|:---:|:---:| +| vanilla (no skill) | 0.79 | 0.56 | +| guided (manual DDD hints, no skill) | 0.84 | 0.58 | +| public skill | 0.85 | 0.60 | +| repo-tuned skill | **0.92** | **0.62** | + +**The effect is task-dependent.** Absolute scores across tasks aren't comparable — task difficulty sets the baseline (movie starts lower because it's harder). What *is* comparable is the **increment over vanilla** on each task: how much each step lifts quality above the bare model. + +

+ Quality gain over vanilla +

+ +The same repo-tuned skill adds **+0.13** on the clean-feature task but only **+0.06** on the legacy refactor — twice the payoff where the design space is open. + +Per-dimension radars show *where* the gains land (test quality stays flat everywhere — the skill teaches modeling, not testing): + +

+ Weather radar + Movie radar +

+ +What does the gain cost? Token usage and run time per configuration — and the answer is **not** the simple "better costs more" you might expect: + +

+ Weather tokens + Movie tokens + Weather time + Movie time +

+ +Cost doesn't track quality. On weather the top-scoring repo-tuned skill spends *fewer* tokens than guided or public; on movie the public skill is the cheapest of all four. The real overhead is run time on the messy refactor, where the skill arms take roughly twice as long as the bare model. + +**Two lessons that generalize:** (1) judge one aggregate number and you miss the story — a real per-dimension gain hides inside an averaged score, which is why we show radars, not a single bar; (2) a skill present on disk is not a skill used — always verify it activated before trusting the result. + ## UC1: Project-Specific Setup — NASDE Dev Skill 1 task: Add multi-attempt support to the nasde-toolkit itself. Claude only (project-specific skill, cross-agent comparison not applicable). diff --git a/examples/ddd-architectural-challenges/.gitignore b/examples/ddd-architectural-challenges/.gitignore index 1c18760..6064dec 100644 --- a/examples/ddd-architectural-challenges/.gitignore +++ b/examples/ddd-architectural-challenges/.gitignore @@ -1 +1,2 @@ jobs/ +_focus_report.md diff --git a/examples/ddd-architectural-challenges/assets/increment_vs_vanilla.png b/examples/ddd-architectural-challenges/assets/increment_vs_vanilla.png new file mode 100644 index 0000000..3e0a73b Binary files /dev/null and b/examples/ddd-architectural-challenges/assets/increment_vs_vanilla.png differ diff --git a/examples/ddd-architectural-challenges/assets/ops_time_movie.png b/examples/ddd-architectural-challenges/assets/ops_time_movie.png new file mode 100644 index 0000000..9f73a8e Binary files /dev/null and b/examples/ddd-architectural-challenges/assets/ops_time_movie.png differ diff --git a/examples/ddd-architectural-challenges/assets/ops_time_weather.png b/examples/ddd-architectural-challenges/assets/ops_time_weather.png new file mode 100644 index 0000000..0c6cf10 Binary files /dev/null and b/examples/ddd-architectural-challenges/assets/ops_time_weather.png differ diff --git a/examples/ddd-architectural-challenges/assets/ops_tokens_movie.png b/examples/ddd-architectural-challenges/assets/ops_tokens_movie.png new file mode 100644 index 0000000..d89e504 Binary files /dev/null and b/examples/ddd-architectural-challenges/assets/ops_tokens_movie.png differ diff --git a/examples/ddd-architectural-challenges/assets/ops_tokens_weather.png b/examples/ddd-architectural-challenges/assets/ops_tokens_weather.png new file mode 100644 index 0000000..f9112bb Binary files /dev/null and b/examples/ddd-architectural-challenges/assets/ops_tokens_weather.png differ diff --git a/examples/ddd-architectural-challenges/assets/radar_movie.png b/examples/ddd-architectural-challenges/assets/radar_movie.png new file mode 100644 index 0000000..0f462e3 Binary files /dev/null and b/examples/ddd-architectural-challenges/assets/radar_movie.png differ diff --git a/examples/ddd-architectural-challenges/assets/radar_weather.png b/examples/ddd-architectural-challenges/assets/radar_weather.png new file mode 100644 index 0000000..f915246 Binary files /dev/null and b/examples/ddd-architectural-challenges/assets/radar_weather.png differ diff --git a/examples/ddd-architectural-challenges/nasde.toml b/examples/ddd-architectural-challenges/nasde.toml index da6f4a7..15040e9 100644 --- a/examples/ddd-architectural-challenges/nasde.toml +++ b/examples/ddd-architectural-challenges/nasde.toml @@ -13,6 +13,7 @@ build_commands = [] model = "claude-opus-4-7" dimensions_file = "assessment_dimensions.json" skills_dir = "./evaluator_skills" +max_turns = 60 # = current default; explicit here as a reminder DDD-rich workspaces need headroom [reporting] platform = "opik" diff --git a/examples/ddd-architectural-challenges/tasks/csharp-anemic-to-rich-domain/assessment_criteria.md b/examples/ddd-architectural-challenges/tasks/csharp-anemic-to-rich-domain/assessment_criteria.md index 5a6f677..2c8f036 100644 --- a/examples/ddd-architectural-challenges/tasks/csharp-anemic-to-rich-domain/assessment_criteria.md +++ b/examples/ddd-architectural-challenges/tasks/csharp-anemic-to-rich-domain/assessment_criteria.md @@ -2,6 +2,15 @@ Evaluate the agent's solution across five dimensions. +**Modern .NET expectation (applies throughout):** the project targets .NET 8. A +strong solution reads like a modern .NET 8 codebase, not a port of a .NET Core 2.x +app. Reward idiomatic modern C# where it improves the domain model — `record` / +`readonly record struct` for value objects (free value equality), `required` / +`init`-only properties, file-scoped namespaces, nullable reference types, pattern +matching. Penalize value objects hand-rolled with manual `Equals`/`GetHashCode` +boilerplate and dated `netcoreapp`-era style when a `record` would express the +concept more clearly. + ## 1. Domain Modeling (0–25) | Score | Criteria | @@ -18,6 +27,7 @@ Evaluate the agent's solution across five dimensions. - Does `Company` protect its invariants (contact employee, source)? - Is the employee-company relationship modeled with proper invariant enforcement? - Are constructors/factory methods used instead of public setters? +- **Modern idioms**: are value objects expressed as `record` / `readonly record struct` (value equality for free) rather than classes with hand-written `Equals`/`GetHashCode`? Are file-scoped namespaces, `init`/`required`, and nullable reference types used? Code that achieves rich DDD but in dated .NET Core 2.x style caps this dimension at 20 (not exemplary). ## 2. Encapsulation (0–20) diff --git a/examples/ddd-architectural-challenges/tasks/csharp-anemic-to-rich-domain/environment/Dockerfile b/examples/ddd-architectural-challenges/tasks/csharp-anemic-to-rich-domain/environment/Dockerfile index 42159a1..cc164c5 100644 --- a/examples/ddd-architectural-challenges/tasks/csharp-anemic-to-rich-domain/environment/Dockerfile +++ b/examples/ddd-architectural-challenges/tasks/csharp-anemic-to-rich-domain/environment/Dockerfile @@ -1,4 +1,4 @@ -FROM mcr.microsoft.com/dotnet/sdk:6.0 +FROM mcr.microsoft.com/dotnet/sdk:8.0 RUN apt-get update && apt-get install -y \ git \ @@ -10,14 +10,14 @@ RUN apt-get update && apt-get install -y \ WORKDIR /app RUN git clone https://github.com/kgrzybek/refactoring-from-anemic-to-rich-domain-model-example.git . && git checkout anemic -# Upgrade project from netcoreapp2.2 to net6.0 -RUN sed -i 's/netcoreapp2.2/net6.0/' DotNetConfPl.Refactoring/DotNetConfPl.Refactoring.csproj && \ +# Upgrade project from netcoreapp2.2 to net8.0 +RUN sed -i 's/netcoreapp2.2/net8.0/' DotNetConfPl.Refactoring/DotNetConfPl.Refactoring.csproj && \ sed -i 's///' DotNetConfPl.Refactoring/DotNetConfPl.Refactoring.csproj && \ sed -i 's///' DotNetConfPl.Refactoring/DotNetConfPl.Refactoring.csproj && \ - sed -i 's/Version="2.2.6"/Version="6.0.0"/g' DotNetConfPl.Refactoring/DotNetConfPl.Refactoring.csproj && \ + sed -i 's/Version="2.2.6"/Version="8.0.0"/g' DotNetConfPl.Refactoring/DotNetConfPl.Refactoring.csproj && \ sed -i 's/Version="4.0.1"/Version="6.5.0"/g' DotNetConfPl.Refactoring/DotNetConfPl.Refactoring.csproj -# Fix Swagger for .NET 6 (DescribeAllEnumsAsStrings removed, Info -> OpenApiInfo) +# Fix Swagger for modern .NET (DescribeAllEnumsAsStrings removed, Info -> OpenApiInfo) RUN sed -i '/DescribeAllEnumsAsStrings/d' DotNetConfPl.Refactoring/Startup.cs && \ sed -i 's/Swashbuckle.AspNetCore.Swagger.Info/Microsoft.OpenApi.Models.OpenApiInfo/g' DotNetConfPl.Refactoring/Startup.cs && \ sed -i '1s/^/using Microsoft.OpenApi.Models;\n/' DotNetConfPl.Refactoring/Startup.cs diff --git a/examples/ddd-architectural-challenges/tasks/csharp-anemic-to-rich-domain/instruction.md b/examples/ddd-architectural-challenges/tasks/csharp-anemic-to-rich-domain/instruction.md index f4fa895..e304920 100644 --- a/examples/ddd-architectural-challenges/tasks/csharp-anemic-to-rich-domain/instruction.md +++ b/examples/ddd-architectural-challenges/tasks/csharp-anemic-to-rich-domain/instruction.md @@ -1,7 +1,7 @@ # Task: Refactor Anemic Domain Model to Rich Domain Model ## Context -You are working in `/app`, a C# ASP.NET Core 2.2 application that manages companies, persons, and employees. The codebase follows a classic anemic domain model anti-pattern: domain classes (`Person`, `Company`, `Employee`) are plain data containers with only public getters and setters, while all business logic lives in application service classes (`PersonService`, `CompanyService`, `EmployeeService`). +You are working in `/app`, a C# ASP.NET Core application (targeting **.NET 8**) that manages companies, persons, and employees. The codebase originated on an older .NET version and still carries its dated style: domain classes (`Person`, `Company`, `Employee`) are plain data containers with only public getters and setters, while all business logic lives in application service classes (`PersonService`, `CompanyService`, `EmployeeService`) — a classic anemic domain model anti-pattern. Bring the domain model up to modern standards as you enrich it. The project structure: ``` @@ -34,10 +34,11 @@ Specifically: - Domain objects enforce their own invariants — constructors and methods should reject invalid state - Services contain only orchestration: load, call domain method, save - Value objects are immutable with equality based on attributes -- Follow existing C# conventions in the project +- **Use modern C# / .NET idioms** appropriate to .NET 8 — e.g. `record` / `readonly record struct` for value objects, `required` / `init`-only properties, file-scoped namespaces, nullable reference types, and pattern matching where they make the code clearer. Do not preserve the legacy `netcoreapp`-era style. ## Success Criteria -1. The project compiles successfully +1. The project compiles successfully on .NET 8 2. Domain entities contain behavior methods (not just getters/setters) 3. Service classes are thin orchestrators — no business logic in if/else blocks 4. At least one value object has been introduced +5. The code reads like a modern .NET 8 codebase, not a port of a .NET Core 2.x app diff --git a/examples/ddd-architectural-challenges/tasks/csharp-anemic-to-rich-domain/task.toml b/examples/ddd-architectural-challenges/tasks/csharp-anemic-to-rich-domain/task.toml index 589523d..0d7baec 100644 --- a/examples/ddd-architectural-challenges/tasks/csharp-anemic-to-rich-domain/task.toml +++ b/examples/ddd-architectural-challenges/tasks/csharp-anemic-to-rich-domain/task.toml @@ -11,7 +11,7 @@ framework = ".NET" timeout_sec = 1800 [environment] -memory_mb = 4096 +memory_mb = 6144 [verifier] timeout_sec = 300 diff --git a/examples/ddd-architectural-challenges/tasks/csharp-anemic-to-rich-domain/tests/test.sh b/examples/ddd-architectural-challenges/tasks/csharp-anemic-to-rich-domain/tests/test.sh index 4a9b916..511a14f 100644 --- a/examples/ddd-architectural-challenges/tasks/csharp-anemic-to-rich-domain/tests/test.sh +++ b/examples/ddd-architectural-challenges/tasks/csharp-anemic-to-rich-domain/tests/test.sh @@ -1,8 +1,14 @@ #!/bin/bash + +# C# Anemic → Rich Domain Model - Verifier +# Reward semantics: compiles on modern .NET (net8.0) + tests pass (if any). +# Design quality (rich model, modern idioms) is graded by the LLM-as-judge +# against assessment_criteria.md. + cd /app echo "==========================================" -echo "C# Anemic to Rich Domain Model - Evaluation" +echo "C# Anemic to Rich Domain Model - Verifier" echo "==========================================" echo "" @@ -17,57 +23,25 @@ else exit 1 fi -echo "Step 2: Verifying domain entities have behavior methods..." +echo "Step 2: Verifying the project targets modern .NET (net8.0)..." echo "--------------------------------------" -DOMAIN_FILES=$(find DotNetConfPl.Refactoring/Domain -name "*.cs" -type f) -if echo "$DOMAIN_FILES" | xargs grep -lE "(void (Change|Set|Add|Remove|Assign|Create)|bool (Is|Can|Has))" 2>/dev/null | grep -q .; then - echo "✓ SUCCESS: Domain entities contain behavior methods" +if grep -qE "net8\.0" DotNetConfPl.Refactoring/DotNetConfPl.Refactoring.csproj; then + echo "✓ SUCCESS: Targets net8.0" echo "" else - echo "✗ FAILURE: Domain entities lack behavior methods — still anemic" + echo "✗ FAILURE: Project does not target net8.0 — the solution must run on modern .NET" + echo " Found:" + grep -E " /logs/verifier/reward.txt exit 1 fi -echo "Step 3: Verifying services are thin orchestrators..." -echo "--------------------------------------" -SERVICE_FILES=$(find DotNetConfPl.Refactoring/Application -name "*Service.cs" -type f) -if [ -n "$SERVICE_FILES" ]; then - BIZ_LOGIC_COUNT=$(echo "$SERVICE_FILES" | xargs grep -cE "^\s+if\s*\(" 2>/dev/null | awk -F: '{sum+=$2} END {print sum}') - echo "Business logic if-statements in services: ${BIZ_LOGIC_COUNT:-0}" - if [ "${BIZ_LOGIC_COUNT:-0}" -le 2 ]; then - echo "✓ SUCCESS: Services are thin (≤2 if-statements)" - echo "" - else - echo "✗ FAILURE: Services still contain too much business logic (>2 if-statements)" - echo 0 > /logs/verifier/reward.txt - exit 1 - fi -else - echo "✓ INFO: No service files found — logic may be fully in domain" - echo "" -fi - -echo "Step 4: Checking for value objects..." -echo "--------------------------------------" -ALL_CS=$(find DotNetConfPl.Refactoring -name "*.cs" -type f) -if echo "$ALL_CS" | xargs grep -lE "(readonly struct|sealed class.*: (ValueObject|IEquatable)|record struct|record class)" 2>/dev/null | grep -q .; then - echo "✓ SUCCESS: Value object(s) found" - echo "" -else - if echo "$ALL_CS" | xargs grep -lE "(readonly|immutable|Equals\(|GetHashCode\(\))" 2>/dev/null | grep -q .; then - echo "✓ SUCCESS: Immutable types found (likely value objects)" - echo "" - else - echo "✗ FAILURE: No value objects detected" - echo 0 > /logs/verifier/reward.txt - exit 1 - fi -fi - echo "==========================================" -echo "EVALUATION PASSED ✓" +echo "VERIFIER PASSED ✓" echo "==========================================" +echo "" +echo "Note: this verifier only checks that the feature compiles on modern .NET." +echo "DDD design quality and idiom modernity are evaluated by the LLM-as-judge." echo 1 > /logs/verifier/reward.txt exit 0 diff --git a/examples/ddd-architectural-challenges/tasks/csharp-movie-rental-anemic/assessment_criteria.md b/examples/ddd-architectural-challenges/tasks/csharp-movie-rental-anemic/assessment_criteria.md new file mode 100644 index 0000000..8c3c51b --- /dev/null +++ b/examples/ddd-architectural-challenges/tasks/csharp-movie-rental-anemic/assessment_criteria.md @@ -0,0 +1,161 @@ +# Assessment Criteria: Movie-Rental Anemic to Rich Domain Model + +Evaluate the agent's solution across five dimensions. The codebase is an +"Online Theater" movie-rental backend (`Customer`, `Movie`, +`PurchasedMovie`, `CustomerService`, `MovieService`), persisted with +NHibernate / FluentNHibernate. + +**Modern .NET expectation (applies throughout):** the project targets .NET 8. A +strong solution reads like a modern .NET 8 codebase, not a port of a legacy +.NET Framework / NHibernate app. Reward idiomatic modern C# where it improves the +domain model — `record` / `readonly record struct` for value objects (free value +equality), `required` / `init`-only properties, file-scoped namespaces, nullable +reference types, pattern matching. Penalize value objects hand-rolled with manual +`Equals`/`GetHashCode` boilerplate and dated style when a `record` would express +the concept more clearly. Note the legitimate persistence constraint: NHibernate +needs mapped members `virtual` and a parameterless (possibly `protected`) +constructor — keeping `virtual` members and a parameterless ctor on **entities** +(`Customer`, `Movie`, `PurchasedMovie`) is required for mapping and is correct, not +a defect. That constraint does **not** apply to value objects: `Email`, +`Dollars`/money, expiration date, and `CustomerName` should still be immutable +`record` / `readonly record struct` types. + +**No functional-extensions / `Result` library:** this task forbids introducing a +functional-extensions library (e.g. CSharpFunctionalExtensions) or any `Result` / +`Maybe` monad abstraction. Invariants must be enforced with validating factory +methods that throw a domain-specific exception in plain modern C#. Do **not** award +extra credit for pulling in such a library, and treat adding one as a regression +against the task constraints — it is not a substitute for, nor an improvement over, +factory validation plus a domain exception. + +## 1. Domain Modeling (0–25) + +| Score | Criteria | +|-------|----------| +| 0 | No changes to domain entities — still pure data bags | +| 5 | Some methods added to entities but logic is superficial | +| 10 | Business rules moved from services to entities, basic invariant enforcement | +| 15 | Rich domain model with proper aggregate boundaries, entity behavior, and factory methods | +| 20 | Strong DDD: aggregates protect invariants, state transitions are explicit, ubiquitous language used | +| 25 | Exemplary DDD: aggregates protect invariants, state transitions are explicit, ubiquitous language used throughout, proper use of value objects and domain events where appropriate | + +**Key checks:** +- Does `Customer` own `PurchaseMovie` (rejecting a duplicate active purchase and + keeping `MoneySpent` consistent) and the promotion rules (≥2 active movies in + 30 days **and** ≥$100 in the last year), rather than the service/controller? +- Does `Movie` own its expiration-date and price-by-licensing-model logic + (previously `MovieService.GetExpirationDate` + `CustomerService.CalculatePrice`)? +- Is `PurchasedMovie` created through the domain (e.g. `Customer.PurchaseMovie`) + instead of being assembled field-by-field by a service? Are its + active-vs-expired semantics modeled explicitly on the entity (e.g. an + `ExpirationDate` value object plus an `IsExpired`/`IsActive` query, or an + `Expire()` transition) rather than re-derived ad hoc in a service? +- Are invariants enforced by validating factory methods / constructors that throw a + **domain-specific exception** in plain modern C#? (See the global note: introducing + a functional-extensions / `Result` library is forbidden and must not raise the + score — it is neither required nor a better answer than a validating factory plus a + domain exception.) +- Are constructors / factory methods used instead of public setters? Is the + `Customer` ↔ `PurchasedMovie` collection encapsulated (no public `IList` to + mutate from outside)? +- **Modern idioms**: are the value-object candidates (`Email`, money/`Dollars`, + `ExpirationDate`, `CustomerName`) expressed as `record` / `readonly record struct` + (value equality for free) rather than classes with hand-written + `Equals`/`GetHashCode`? Are file-scoped namespaces, `init`/`required`, and nullable + reference types used? Code that achieves rich DDD but in dated NHibernate-era style + (hand-rolled value-object equality, `netcoreapp`-era boilerplate) caps this + dimension at 20 (not exemplary). Note that pulling in a `Result` / + functional-extensions library does **not** count as a "modern idiom" here and must + not lift the score — see the global note above. + +## 2. Encapsulation (0–20) + +| Score | Criteria | +|-------|----------| +| 0 | All properties still have public setters — no encapsulation | +| 5 | Some setters made private but logic still in services | +| 10 | Most business logic in domain objects, services are mostly orchestration | +| 15 | Services are thin orchestrators: load → call domain method → save | +| 20 | Full encapsulation: no way to put domain objects in invalid state from outside | + +**Key checks:** +- Are public setters removed or made private/protected on the **entities** + (`Customer`, `Movie`, `PurchasedMovie`) — within NHibernate's `virtual` + + parameterless-ctor constraint, which is acceptable and should not be penalized? +- Are the **value objects** (`Email`, `Dollars`/money, `ExpirationDate`, + `CustomerName`) genuinely immutable (`record` with `init`-only members, no public + setters) so they cannot be mutated after construction? +- Is the `PurchasedMovies` collection exposed read-only, with mutation only via a + domain method (e.g. `Customer.PurchaseMovie`)? +- Is `Customer.MoneySpent` kept consistent internally (recomputed/incremented by the + domain on purchase) rather than settable from outside? +- Do `CustomerService` / `MovieService` still contain `if`/`switch` business + logic, or has it moved into the domain (services possibly removed)? +- Can external code (the controller) bypass domain invariants — e.g. set + `Status`, `MoneySpent`, or add a `PurchasedMovie` directly? +- Are invariants guarded inside the domain via factory validation throwing a + domain-specific exception — **not** by threading a `Result` / + functional-extensions type through the model (which is forbidden and earns no + credit)? + +## 3. Architecture Compliance (0–20) + +| Score | Criteria | +|-------|----------| +| 0 | No structural changes | +| 5 | Some refactoring but mixed concerns remain | +| 10 | Clear separation: domain has no infrastructure dependencies | +| 15 | Proper layer separation, domain objects don't reference NHibernate or controllers | +| 20 | Clean architecture: domain is isolated, application orchestrates, infrastructure adapts, dependency direction is correct | + +**Key checks:** +- Does the domain layer leak persistence concerns (NHibernate attributes/usings) + or web concerns into the entities? +- Are the FluentNHibernate mappings kept in the mapping layer (not pushed into the + entities) and updated correctly for new value objects / private setters? +- Is the HTTP contract of `CustomersController` preserved while the controller + becomes a thin delegate to domain methods? +- Is the dependency direction correct (outer layers depend on inner)? + +## 4. Extensibility (0–15) + +| Score | Criteria | +|-------|----------| +| 0 | Refactoring makes future changes harder | +| 3 | No improvement to extensibility | +| 6 | Slightly easier to extend but business rules still scattered | +| 9 | Domain objects are cohesive; adding new business rules has a clear home | +| 12 | Good: aggregate boundaries are clear, new domain rules go in the right place naturally | +| 15 | Excellent: domain model is well-factored, new features require adding behavior to existing aggregates or creating new value objects, no shotgun surgery needed | + +**Key checks:** +- Are aggregate boundaries clear (Customer as the aggregate root over its + PurchasedMovies)? +- Would adding a new licensing model or a new pricing/promotion rule be a localized + change rather than edits across services, controllers, and entities? +- Are value objects (email, money, expiration date) reusable and the obvious home + for related validation? + +## 5. Test Quality (0–20) + +| Score | Criteria | +|-------|----------| +| 0 | Tests broken or removed | +| 4 | Existing tests pass but no new tests for domain behavior | +| 8 | Some tests for new domain methods but poor coverage | +| 12 | Good coverage of domain behavior, invariant enforcement tested | +| 16 | Comprehensive: domain invariants, factory methods, state transitions all tested | +| 20 | Excellent: all domain behavior tested, invalid states tested and rejected, test names express domain concepts, follows project test conventions | + +**Key checks:** +- Note: the repository ships with no test project. Reward the agent for adding one + (e.g. an xUnit/NUnit project referencing `Logic`) and not breaking the build. +- Are domain invariants tested (duplicate active purchase rejected, invalid `Email` + rejected, negative `Dollars`/money rejected, the promotion rule of ≥2 active + movies in 30 days **and** ≥$100 in the last year, price-by-licensing-model, + expired-vs-active `PurchasedMovie`)? +- Are value-object factory methods (`Email`, `Dollars`, `ExpirationDate`, + `CustomerName`) tested with valid and invalid inputs, asserting that invalid input + throws the domain-specific exception (not a `Result`-style failure object)? +- Do test names express domain concepts (e.g. + `Cannot_purchase_the_same_active_movie_twice`)? diff --git a/examples/ddd-architectural-challenges/tasks/csharp-movie-rental-anemic/environment/Dockerfile b/examples/ddd-architectural-challenges/tasks/csharp-movie-rental-anemic/environment/Dockerfile new file mode 100644 index 0000000..a8d518e --- /dev/null +++ b/examples/ddd-architectural-challenges/tasks/csharp-movie-rental-anemic/environment/Dockerfile @@ -0,0 +1,34 @@ +FROM mcr.microsoft.com/dotnet/sdk:8.0 + +RUN apt-get update && apt-get install -y \ + git \ + curl \ + wget \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Use the "dotnet6" branch: it is the SDK-style, modern-hosting port of the +# "Online Theater" sample. The default `master` branch is legacy .NET Framework +# 4.6.2 with old-style csproj + net40 NHibernate binaries, which does NOT build on +# the Linux dotnet/sdk image. The dotnet6 branch uses NuGet FluentNHibernate 3.1.0 / +# NHibernate 5.3.12 / System.Data.SqlClient — all build on Linux with no database. +RUN git clone --depth 1 --branch dotnet6 https://github.com/vkhorikov/AnemicDomainModel.git . + +# The agent works on the anemic "Before" solution. +WORKDIR /app/Before + +# Upgrade the target framework from net6.0 to net8.0. The dotnet/sdk:8.0 image +# ships only the net8.0 targeting pack, so the original net6.0 TFM would fail to +# restore. NHibernate / FluentNHibernate / Newtonsoft.Json are netstandard2.0 and +# work unchanged on net8.0. +RUN find . -name '*.csproj' -exec sed -i 's/net6\.0/net8.0/g' {} + + +# NOTE: NHibernate connects to SQL Server only at RUNTIME (SessionFactory builds a +# session lazily). Compiling and unit-testing the domain model requires NO running +# database, so the image is build-only — no SQL Server sidecar. +RUN dotnet restore OnlineTheaterBefore.sln +RUN dotnet build OnlineTheaterBefore.sln --no-restore + +CMD ["/bin/bash"] diff --git a/examples/ddd-architectural-challenges/tasks/csharp-movie-rental-anemic/instruction.md b/examples/ddd-architectural-challenges/tasks/csharp-movie-rental-anemic/instruction.md new file mode 100644 index 0000000..f613929 --- /dev/null +++ b/examples/ddd-architectural-challenges/tasks/csharp-movie-rental-anemic/instruction.md @@ -0,0 +1,66 @@ +# Task: Refactor Anemic Domain Model to Rich Domain Model + +## Context +You are working in `/app/Before`, an "Online Theater" C# movie-rental backend +(targeting **.NET 8**). It is a *Refactoring from Anemic Domain Model Towards a +Rich One* sample: customers purchase movies, accumulate spend, and can be promoted +to an Advanced status. The domain classes (`Customer`, `Movie`, `PurchasedMovie`) +are plain data containers — public getters and setters with no behavior — while +all business logic lives in application service classes (`CustomerService`, +`MovieService`). This is the classic anemic domain model anti-pattern. The codebase +originated on an older .NET version; bring the domain model up to modern standards +as you enrich it. + +Persistence is handled by NHibernate / FluentNHibernate against SQL Server; you do +**not** need a running database to complete or build the task. + +The project structure: +``` +Before/src/ + Logic/ + Entities/ # Anemic entities — just properties, no behavior + Services/ # Services containing ALL business logic + Repositories/ # NHibernate-backed repositories + Mappings/ # FluentNHibernate ClassMaps + Api/ + Controllers/ # HTTP controllers +``` + +## Requirement +Refactor the domain model from anemic to rich. Business rules currently in the +service classes (`CustomerService`, `MovieService`) must move into the domain +objects where they belong, and each entity must protect its own invariants. + +## Scope +- Modify files under `Before/src/Logic/Entities/` to add behavior and invariants + (and add new value-object types under `Logic/` where appropriate). +- Modify files under `Before/src/Logic/Services/` to remove business logic + (services become thin orchestrators, or disappear if all their logic moves into + the domain). +- Update the FluentNHibernate mappings under `Before/src/Logic/Mappings/` only as + needed to keep the model persistable. +- Do NOT change the public HTTP API (route shape, request/response JSON contract). +- Do NOT add a functional-extensions / `Result` library (e.g. + CSharpFunctionalExtensions); the starting code does not reference one. Enforce + invariants with validating factories that throw a domain-specific exception, in + plain modern C#. + +## Quality Expectations +- Domain objects enforce their own invariants — constructors and methods reject + invalid state; an entity cannot be left inconsistent from outside. +- Services contain only orchestration: load, call a domain method, save. +- Value objects are immutable with equality based on attributes. +- **Use modern C# / .NET idioms** appropriate to .NET 8 — `record` / + `readonly record struct` for value objects, `required` / `init`-only properties, + file-scoped namespaces, nullable reference types, pattern matching. Do not + preserve the legacy NHibernate-era boilerplate. Note that NHibernate requires + mapped *entity* members to be `virtual` with a (possibly protected) parameterless + constructor — honor that on the entities while still modernizing the model. + +## Success Criteria +1. The `Before` solution compiles successfully on .NET 8. +2. Domain entities contain behavior methods (not just getters/setters). +3. Service classes are thin orchestrators — no business logic in `if`/`switch` + blocks (or removed entirely). +4. At least one value object has been introduced. +5. The code reads like a modern .NET 8 codebase, not a port of a legacy app. diff --git a/examples/ddd-architectural-challenges/tasks/csharp-movie-rental-anemic/task.toml b/examples/ddd-architectural-challenges/tasks/csharp-movie-rental-anemic/task.toml new file mode 100644 index 0000000..06f3173 --- /dev/null +++ b/examples/ddd-architectural-challenges/tasks/csharp-movie-rental-anemic/task.toml @@ -0,0 +1,21 @@ +version = "1.0" + +[metadata] +name = "csharp-movie-rental-anemic" +description = "Refactor an anemic movie-rental ('Online Theater') model to a rich domain model in a C# NHibernate application" +difficulty = "intermediate" +language = "C#" +framework = ".NET" + +[agent] +timeout_sec = 1800 + +[environment] +memory_mb = 6144 + +[verifier] +timeout_sec = 300 + +[nasde.source] +git = "https://github.com/vkhorikov/AnemicDomainModel.git" +ref = "dotnet6" diff --git a/examples/ddd-architectural-challenges/tasks/csharp-movie-rental-anemic/tests/test.sh b/examples/ddd-architectural-challenges/tasks/csharp-movie-rental-anemic/tests/test.sh new file mode 100644 index 0000000..3655707 --- /dev/null +++ b/examples/ddd-architectural-challenges/tasks/csharp-movie-rental-anemic/tests/test.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# Movie-Rental Anemic → Rich Domain Model - Verifier +# Reward semantics: compiles on modern .NET (net8.0) + tests pass (if any). +# Design quality (rich model, modern idioms) is graded by the LLM-as-judge +# against assessment_criteria.md. + +cd /app/Before + +echo "==========================================" +echo "Movie-Rental Anemic to Rich Domain Model - Verifier" +echo "==========================================" +echo "" + +echo "Step 1: Verifying solution compiles..." +echo "--------------------------------------" +if dotnet build OnlineTheaterBefore.sln --no-restore -c Release -v q 2>&1; then + echo "✓ SUCCESS: Solution compiles" + echo "" +else + echo "✗ FAILURE: Solution does not compile" + echo 0 > /logs/verifier/reward.txt + exit 1 +fi + +echo "Step 2: Verifying the Logic project targets modern .NET (net8.0)..." +echo "--------------------------------------" +if grep -qE "net8\.0" src/Logic/Logic.csproj; then + echo "✓ SUCCESS: Targets net8.0" + echo "" +else + echo "✗ FAILURE: Logic project does not target net8.0 — the solution must run on modern .NET" + echo " Found:" + grep -E " /logs/verifier/reward.txt + exit 1 +fi + +echo "==========================================" +echo "VERIFIER PASSED ✓" +echo "==========================================" +echo "" +echo "Note: this verifier only checks that the solution compiles on modern .NET." +echo "DDD design quality and idiom modernity are evaluated by the LLM-as-judge." + +echo 1 > /logs/verifier/reward.txt +exit 0 diff --git a/examples/ddd-architectural-challenges/tasks/ddd-threshold-discount/assessment_criteria.md b/examples/ddd-architectural-challenges/tasks/ddd-threshold-discount/assessment_criteria.md index dfec997..ed35282 100644 --- a/examples/ddd-architectural-challenges/tasks/ddd-threshold-discount/assessment_criteria.md +++ b/examples/ddd-architectural-challenges/tasks/ddd-threshold-discount/assessment_criteria.md @@ -4,7 +4,7 @@ Evaluate the AI-generated code across five dimensions. ## 1. Domain Modeling (0–25) -Evaluate how well the ThresholdDiscount follows DDD building blocks established by PercentageDiscount and ValueDiscount. +Evaluate how well the ThresholdDiscount follows DDD building blocks established by PercentageDiscount and ValueDiscount. Focus on whether the implementation **makes the implicit explicit** (rich domain language) — do the names of types and operations communicate intent, or are they generic primitives? | Score | Criteria | |-------|----------| @@ -22,6 +22,9 @@ Evaluate how well the ThresholdDiscount follows DDD building blocks established - Are fields `readonly` / init-only? - Does it validate percentage (0–100) and threshold (> 0)? - Is it properly integrated into the Discount discriminated union (`Apply`/`Match` methods)? +- **The menu test (Nick Tunes' tactical-ddd, principle #3)**: reading this code, would a sales-domain expert recognize the names (`ThresholdDiscount`, `Threshold`, `Apply`, `ApplyOn`) as things from their world? Or do you see programmer jargon (`Calculate`, `Process`, `Handle`, `Manager`, `Helper`)? +- **Implicit-to-explicit test (principle #6)**: is the threshold modeled as a meaningful concept (e.g. a `Money`/`Price` value object, or a clearly-named field) or smuggled as a raw `decimal`? Are method names like `Apply` returning rich types (e.g. `Price`) or primitives? +- **Value object liberality (principle #8)**: are concepts like `Threshold` and `Percentage` extracted as their own immutable value objects (or reusing existing ones like `Money`), or just bag-of-decimals on `ThresholdDiscount`? ## 2. Encapsulation (0–20) @@ -39,6 +42,8 @@ Evaluate whether business rules are contained within domain objects. - Can invalid ThresholdDiscount instances be created? - Is the "apply only when price > threshold" rule inside the domain object? - Are validation rules enforced at construction time? +- **Aggregate invariant test (principle #7)**: what must be true *at all times* for a `ThresholdDiscount` to make sense? (e.g. `0 ≤ percentage ≤ 100`, `threshold > 0`.) Are those rules enforced *inside* the object, or does the caller have to remember to check them? +- **Anemic-model test (principle #4)**: does the discount object *decide* anything (e.g. `Apply(price)` returns the result), or is it a passive data carrier where some service has to do `if (price > discount.Threshold) ...` from the outside? ## 3. Architecture Compliance (0–20) @@ -76,6 +81,7 @@ Evaluate how well the discriminated union design supports adding future discount - Does `Discount` have a third case/variant for ThresholdDiscount? - Does `Apply(Money price)` return correct result (apply only when price > threshold)? - Are pattern matching methods (`Match`, `Switch`, or C# pattern match) updated? +- **Hidden-concept test (principle #6)**: is the "apply only when price > threshold" rule encapsulated *inside* `ThresholdDiscount.ApplyOn(Money)` — so the caller just writes `discount.ApplyOn(price)` and gets back the right money — or is the rule leaked to callers as `if (price > discount.Threshold) ...`? The decision and the action belong together (principle #4, "avoid anemic"): one decision-method, not a separate `IsApplicable` flag. - Would adding a 4th discount variant be straightforward following the same pattern? ## 5. Test Quality (0–20) diff --git a/examples/ddd-architectural-challenges/tasks/ddd-threshold-discount/task.toml b/examples/ddd-architectural-challenges/tasks/ddd-threshold-discount/task.toml index 189ba24..3dbbbfc 100644 --- a/examples/ddd-architectural-challenges/tasks/ddd-threshold-discount/task.toml +++ b/examples/ddd-architectural-challenges/tasks/ddd-threshold-discount/task.toml @@ -11,7 +11,7 @@ framework = ".NET 8" timeout_sec = 1800 [environment] -memory_mb = 4096 +memory_mb = 6144 [verifier] timeout_sec = 300 diff --git a/examples/ddd-architectural-challenges/tasks/ddd-threshold-discount/tests/test.sh b/examples/ddd-architectural-challenges/tasks/ddd-threshold-discount/tests/test.sh index 0482a99..d2eaf98 100755 --- a/examples/ddd-architectural-challenges/tasks/ddd-threshold-discount/tests/test.sh +++ b/examples/ddd-architectural-challenges/tasks/ddd-threshold-discount/tests/test.sh @@ -1,11 +1,11 @@ #!/bin/bash -# DDD Threshold Discount Challenge - Test Script -# Verifies that the AI agent correctly extended the Discount discriminated union -# with a new ThresholdDiscount value object. +# DDD Threshold Discount Challenge - Verifier +# Reward semantics: compile + functional tests pass. Design quality is graded +# separately by the LLM-as-judge against assessment_criteria.md. echo "==========================================" -echo "DDD Threshold Discount - Evaluation" +echo "DDD Threshold Discount - Verifier" echo "==========================================" echo "" @@ -34,70 +34,12 @@ else exit 1 fi -echo "Step 3: Verifying ThresholdDiscount value object exists..." -echo "--------------------------------------" -if find Sources/Sales -name "ThresholdDiscount.cs" | grep -q .; then - echo "✓ SUCCESS: ThresholdDiscount.cs found" - find Sources/Sales -name "ThresholdDiscount.cs" - echo "" -else - echo "✗ FAILURE: ThresholdDiscount.cs not found under Sources/Sales" - echo "Expected a new value object file implementing the threshold discount logic." - echo 0 > /logs/verifier/reward.txt - exit 1 -fi - -echo "Step 4: Verifying ThresholdDiscount implements PriceModifier..." -echo "--------------------------------------" -if grep -r "PriceModifier" --include="ThresholdDiscount.cs" Sources/Sales/ | grep -q .; then - echo "✓ SUCCESS: ThresholdDiscount implements PriceModifier" - echo "" -else - echo "✗ FAILURE: ThresholdDiscount does not implement PriceModifier" - echo "The value object must implement the PriceModifier interface, as PercentageDiscount and ValueDiscount do." - echo 0 > /logs/verifier/reward.txt - exit 1 -fi - -echo "Step 5: Verifying Discount union has a threshold variant..." -echo "--------------------------------------" -DISCOUNT_FILE="Sources/Sales/Sales.DeepModel/Pricing/Discounts/Discount.cs" -if [ -f "$DISCOUNT_FILE" ] && grep -qi "threshold" "$DISCOUNT_FILE"; then - echo "✓ SUCCESS: Discount union references threshold variant" - echo "" -else - echo "✗ FAILURE: Discount.cs does not contain a threshold variant" - echo "The Discount discriminated union must be extended with a third variant for ThresholdDiscount." - echo 0 > /logs/verifier/reward.txt - exit 1 -fi - -echo "Step 6: Verifying tests for ThresholdDiscount exist..." -echo "--------------------------------------" -if find Sources/Sales -type d -name "*Tests*" -exec grep -r -l -i "threshold" --include="*.cs" {} + 2>/dev/null | grep -q .; then - echo "✓ SUCCESS: Test file(s) for ThresholdDiscount found" - find Sources/Sales -type d -name "*Tests*" -exec grep -r -l -i "threshold" --include="*.cs" {} + 2>/dev/null - echo "" -else - echo "✗ FAILURE: No test file found containing ThresholdDiscount tests" - echo "Tests for the new discount type are required." - echo 0 > /logs/verifier/reward.txt - exit 1 -fi - echo "==========================================" -echo "EVALUATION PASSED ✓" +echo "VERIFIER PASSED ✓" echo "==========================================" echo "" -echo "Summary:" -echo " • Project compiles successfully" -echo " • All tests pass" -echo " • ThresholdDiscount value object implemented" -echo " • Discount union extended with threshold variant" -echo " • Tests for ThresholdDiscount present" -echo "" -echo "The AI agent successfully completed the challenge!" -echo "" +echo "Note: this verifier only checks that the feature works." +echo "DDD design quality is evaluated separately by the LLM-as-judge." echo 1 > /logs/verifier/reward.txt exit 0 diff --git a/examples/ddd-architectural-challenges/tasks/ddd-weather-discount/assessment_criteria.md b/examples/ddd-architectural-challenges/tasks/ddd-weather-discount/assessment_criteria.md index 82508ca..9adf6b3 100644 --- a/examples/ddd-architectural-challenges/tasks/ddd-weather-discount/assessment_criteria.md +++ b/examples/ddd-architectural-challenges/tasks/ddd-weather-discount/assessment_criteria.md @@ -4,7 +4,7 @@ Evaluate the AI-generated code across five dimensions. ## 1. Domain Modeling (0–25) -Evaluate how well weather-related concepts are modeled using DDD building blocks. +Evaluate how well weather-related concepts are modeled using DDD building blocks. Focus on whether the implementation **isolates domain logic** and uses **rich domain language** — does `Precipitation` exist as its own concept, or does the domain just pass around a `decimal`? | Score | Criteria | |-------|----------| @@ -20,6 +20,9 @@ Evaluate how well weather-related concepts are modeled using DDD building blocks - Does the port use domain types (not `HttpResponseMessage`, `JsonElement`, etc.)? - Is discount calculation logic in a domain service or policy (not in the adapter)? - Are weather conditions modeled as value objects with proper semantics? +- **The menu test (Nick Tunes' tactical-ddd, principle #3)**: would a sales/pricing domain expert recognize the names in the domain layer (e.g. `WeatherConditions`, `Precipitation`, `RainyDayDiscount.ApplyOn(price, conditions)`) as their world? Or do you see generic developer jargon (`Manager`, `Handler`, `Service`, `Data`)? +- **Implicit-to-explicit test (principle #6)**: is `Precipitation` named as its own concept (value object with semantics like `IsPresent` / `HasRainOrSnow`), or is it smuggled through method signatures as `decimal precipitation` / `double mm`? +- **Value object liberality (principle #8)**: are weather concepts (`Precipitation`, `Temperature`, `WindSpeed`, eventually `WeatherConditions`) modeled as immutable value objects, or does the domain pass around a JSON blob / DTO of primitives? ## 2. Encapsulation (0–20) @@ -37,6 +40,7 @@ Evaluate whether business rules are contained within domain objects. - Is the "precipitation > X means discount" rule inside the domain layer? - Can callers bypass domain rules by constructing objects directly? - Are failure modes (API down) handled with domain-appropriate defaults? +- **Anemic-model test (principle #4)**: does each weather-discount type own a single decision method that *both checks and acts* (e.g. `PrecipitationDiscount.ApplyOn(Money, WeatherConditions) → Money`), or is it split into `IsApplicable(WeatherConditions): bool` + `DiscountPercentage` exposed to the caller? Splitting the decision from the action is "ask, don't tell" — and it leaks the rule into the orchestrator. ## 3. Architecture Compliance (0–20) @@ -56,6 +60,8 @@ Evaluate separation of concerns, layer isolation, and adherence to project conve - Is the adapter registered in DI container? - Is HttpClient timeout configured? - Does API failure result in "no discount" (not an exception propagating up)? +- **Isolate-domain test (principle #1, Nick's own check)**: "could a domain expert read this code? Can it be unit tested without mocks or spinning up databases?" — if reading the domain layer forces you to think about HTTP, JSON, or status codes, the isolation has failed. +- **Generic-vs-domain test (principle #5)**: retry logic, HTTP caching, JSON deserialization — would that code exist in a *completely different* business domain? If yes, it must live in infra, not domain. ## 4. Extensibility (0–15) @@ -75,6 +81,8 @@ Evaluate how easy it is to add future weather-based discounts (temperature, wind - Can a new weather discount be added by only creating a new class? - Does the weather provider support fetching multiple parameters? - Would adding UV index discount require changing existing classes? +- **Repository-shape test (principle #9, Nick's own check)**: does `IWeatherProvider` (or whatever the port is called) return a *full domain object* describing weather conditions, or does it return a thin DTO / a single primitive (`decimal precipitation`)? A leaky port that returns primitives forces every caller to re-derive the same domain concepts. +- **Generic-vs-domain extensibility test**: adding `TemperatureDiscount` should be writing a *new domain object* with its own invariants — not editing a shared "WeatherDiscountCalculator" class with `if (kind == Rain) ... else if (kind == Cold) ...`. ## 5. Test Quality (0–20) diff --git a/examples/ddd-architectural-challenges/tasks/ddd-weather-discount/task.toml b/examples/ddd-architectural-challenges/tasks/ddd-weather-discount/task.toml index eedcbfc..6e3f271 100644 --- a/examples/ddd-architectural-challenges/tasks/ddd-weather-discount/task.toml +++ b/examples/ddd-architectural-challenges/tasks/ddd-weather-discount/task.toml @@ -11,7 +11,7 @@ framework = ".NET 8" timeout_sec = 1800 [environment] -memory_mb = 4096 +memory_mb = 6144 [verifier] timeout_sec = 300 diff --git a/examples/ddd-architectural-challenges/tasks/ddd-weather-discount/tests/test.sh b/examples/ddd-architectural-challenges/tasks/ddd-weather-discount/tests/test.sh index 59e7373..be74bea 100755 --- a/examples/ddd-architectural-challenges/tasks/ddd-weather-discount/tests/test.sh +++ b/examples/ddd-architectural-challenges/tasks/ddd-weather-discount/tests/test.sh @@ -1,14 +1,15 @@ #!/bin/bash -# DDD Weather Discount Challenge - Evaluation Script -# This script verifies that the AI agent successfully implemented the weather-based discount feature +# DDD Weather Discount Challenge - Verifier +# Reward semantics: compile + functional tests pass + required API used. +# Design quality is graded separately by the LLM-as-judge against +# assessment_criteria.md. echo "==========================================" -echo "DDD Weather Discount - Evaluation" +echo "DDD Weather Discount - Verifier" echo "==========================================" echo "" -# Navigate to project directory cd /app echo "Step 1: Verifying project compiles..." @@ -18,7 +19,6 @@ if dotnet build MyCompany.ECommerce.sln --configuration Debug --no-restore; then echo "" else echo "✗ FAILURE: Project does not compile" - echo "The solution must compile successfully." echo 0 > /logs/verifier/reward.txt exit 1 fi @@ -37,48 +37,23 @@ fi echo "Step 3: Verifying Open-Meteo API integration..." echo "--------------------------------------" -# Search for the Open-Meteo API URL in the codebase -# This confirms the agent actually integrated with the required external API if grep -r "api.open-meteo.com" --include="*.cs" Sources/; then echo "" echo "✓ SUCCESS: Open-Meteo API URL found in codebase" echo "" else echo "✗ FAILURE: Open-Meteo API URL not found in codebase" - echo "Expected to find 'api.open-meteo.com' in the source code." - echo "The implementation must use the specified weather API." + echo "Task constraint: implementation must use api.open-meteo.com (Do NOT change the API URL or service)." echo 0 > /logs/verifier/reward.txt exit 1 fi -echo "Step 4: Checking for weather-related implementation..." -echo "--------------------------------------" -# Check if weather-related code exists (case-insensitive search) -# This is a soft check to see if the agent implemented something weather-related -if grep -ri "weather" --include="*.cs" Sources/Sales/ | grep -v "// " | head -5; then - echo "" - echo "✓ SUCCESS: Weather-related implementation found" - echo "" -else - echo "⚠ WARNING: No obvious weather-related code found" - echo "This may indicate the implementation is incomplete, but continuing evaluation..." - echo "" -fi - echo "==========================================" -echo "EVALUATION PASSED ✓" +echo "VERIFIER PASSED ✓" echo "==========================================" echo "" -echo "Summary:" -echo " • Project compiles successfully" -echo " • All tests pass" -echo " • Open-Meteo API integration verified" -echo " • Weather-based discount feature implemented" -echo "" -echo "The AI agent successfully completed the challenge!" -echo "" +echo "Note: this verifier only checks that the feature works and uses the required API." +echo "DDD design quality is evaluated separately by the LLM-as-judge." -# Write success reward (1 = success in Harbor) echo 1 > /logs/verifier/reward.txt - exit 0 diff --git a/examples/ddd-architectural-challenges/tasks/java-order-dispatch-domain-entities/assessment_criteria.md b/examples/ddd-architectural-challenges/tasks/java-order-dispatch-domain-entities/assessment_criteria.md deleted file mode 100644 index 5178f30..0000000 --- a/examples/ddd-architectural-challenges/tasks/java-order-dispatch-domain-entities/assessment_criteria.md +++ /dev/null @@ -1,85 +0,0 @@ -# Assessment Criteria: Java Order Dispatch Domain Entities - -Evaluate the agent's solution across five dimensions. -This task is in Java (refactoring kata) — apply Java-specific standards. - -## 1. Domain Modeling (0–25) - -| Score | Criteria | -|-------|----------| -| 0 | No changes to entities, or changes break tests | -| 5 | Minor method additions but core dispatch logic still external | -| 10 | Some dispatch-related methods on entities but incomplete encapsulation | -| 15 | Entities have meaningful dispatch behavior; most decision logic moved in; but some rules still external | -| 20 | Entities fully own dispatch decisions; service only coordinates; proper method signatures | -| 25 | Exemplary: entities have intention-revealing methods (isDispatchable, canReceive), clean collaboration between entities, no external state interrogation needed | - -**Key checks:** -- Do Order/Customer entities have methods that express dispatch-related decisions? -- Is the dispatch decision made by entities rather than by external inspection? -- Are entity methods well-named and express domain concepts? - -## 2. Encapsulation (0–20) - -| Score | Criteria | -|-------|----------| -| 0 | Entities still fully exposed via getters; no behavior moved | -| 4 | Some logic moved but getters still used externally for decisions | -| 8 | Behavior methods exist but unnecessary getters remain | -| 12 | Most decision logic inside entities; some getters removed; service is thinner | -| 16 | Getters for decision-making eliminated; entities expose behavior, not state | -| 20 | Perfect: Tell, Don't Ask principle applied; entities expose only behavior methods; no feature envy in service | - -**Key checks:** -- Have getters used only for external decision-making been removed? -- Does the service call entity methods instead of reading state? -- Is the Tell, Don't Ask principle followed? - -## 3. Architecture Compliance (0–20) - -| Score | Criteria | -|-------|----------| -| 0 | Project structure broken, files moved incorrectly | -| 4 | Structure preserved but naming inconsistencies introduced | -| 8 | Good structure but some logic placed in wrong layer | -| 12 | Proper placement: behavior in entities, coordination in service; minor style deviations | -| 16 | Follows all existing conventions; entities and service in correct packages | -| 20 | Perfect: respects existing package structure, consistent naming, no unnecessary new classes or packages | - -**Key checks:** -- Are entity behavior methods in the entity classes (not in new helper classes)? -- Is the service still in its original location? -- Does the solution avoid introducing unnecessary infrastructure? - -## 4. Extensibility (0–15) - -| Score | Criteria | -|-------|----------| -| 0 | Refactoring makes future changes harder | -| 3 | No improvement to extensibility | -| 6 | Slightly easier to add new dispatch rules but still requires modifying entities | -| 9 | New dispatch rules can be added with minimal entity changes | -| 12 | Entity methods are composable; new dispatch criteria can be combined with existing ones | -| 15 | Excellent: entity behavior methods follow SRP, new dispatch rules require only adding new methods or combining existing ones, no modification of existing logic needed | - -**Key checks:** -- Are entity methods focused (single responsibility)? -- Can new dispatch criteria be added without modifying existing methods? -- Is the service thin enough to easily extend with new workflows? - -## 5. Test Quality (0–20) - -| Score | Criteria | -|-------|----------| -| 0 | Tests broken or removed | -| 4 | Existing tests still pass but no new tests for entity behavior | -| 8 | Some tests for new entity methods but poor coverage | -| 12 | Good coverage of new entity behavior; existing tests still pass | -| 16 | Comprehensive entity behavior tests; edge cases covered; follows kata test style | -| 20 | Excellent: all new entity methods tested, edge cases covered, test names express domain concepts, existing tests untouched and passing, follows kata test conventions exactly | - -**Key checks:** -- Do existing tests still pass? -- Are new entity behavior methods tested? -- Do test names express domain concepts? -- Are edge cases covered? diff --git a/examples/ddd-architectural-challenges/tasks/java-order-dispatch-domain-entities/environment/Dockerfile b/examples/ddd-architectural-challenges/tasks/java-order-dispatch-domain-entities/environment/Dockerfile deleted file mode 100644 index e92500f..0000000 --- a/examples/ddd-architectural-challenges/tasks/java-order-dispatch-domain-entities/environment/Dockerfile +++ /dev/null @@ -1,16 +0,0 @@ -FROM maven:3.9-eclipse-temurin-17 - -RUN apt-get update && apt-get install -y \ - git \ - curl \ - wget \ - ca-certificates \ - && rm -rf /var/lib/apt/lists/* - -WORKDIR /app -RUN git clone https://github.com/gregorriegler/order-dispatch-refactoring-kata.git . && git checkout master - -# Java source is in /app/java/ — build from there -RUN cd java && mvn compile -q -B - -CMD ["/bin/bash"] diff --git a/examples/ddd-architectural-challenges/tasks/java-order-dispatch-domain-entities/instruction.md b/examples/ddd-architectural-challenges/tasks/java-order-dispatch-domain-entities/instruction.md deleted file mode 100644 index 07f5913..0000000 --- a/examples/ddd-architectural-challenges/tasks/java-order-dispatch-domain-entities/instruction.md +++ /dev/null @@ -1,44 +0,0 @@ -# Task: Move Order Dispatch Logic into Domain Entities - -## Context - -You are working in `/app`, a Java project structured as a refactoring kata. The codebase models an order dispatch system where orders are dispatched to customers based on various rules. - -The relevant project structure: -``` -src/ - main/java/ - ... — Order, Customer, and dispatch logic - test/java/ - ... — Test suite -``` - -## Requirement - -The dispatch logic currently lives in a service or procedural function that reaches into Order and Customer objects to read their state and make dispatch decisions. The domain entities are data holders with getters — the classic anemic domain model. - -Refactor the codebase so that domain entities own the dispatch decision logic: - -- **Order** should know whether it is dispatchable based on its own state (status, items, validity). -- **Customer** should know whether it can receive dispatches based on its own state (active, valid address, etc.). -- The dispatch decision should emerge from collaboration between entities, not from an external service interrogating their internal state. -- Reduce or eliminate getters that only exist to let external code make decisions that the entity itself should make. - -## Scope - -- Focus on: domain entity classes and the dispatch service/function -- Do NOT modify: test assertions (tests should still pass with same semantics) -- Preserve: all existing test behavior - -## Quality Expectations - -- Follow Java idioms used in the existing codebase -- All existing tests must continue to pass -- Keep changes focused — move logic into entities without changing the external behavior - -## Success Criteria - -1. `mvn compile` (or `gradle build`) succeeds -2. All existing tests pass -3. Domain entities have behavior methods for dispatch-related decisions -4. The dispatch service/function is thinner — it delegates to entities instead of inspecting their state diff --git a/examples/ddd-architectural-challenges/tasks/java-order-dispatch-domain-entities/task.toml b/examples/ddd-architectural-challenges/tasks/java-order-dispatch-domain-entities/task.toml deleted file mode 100644 index 3778ffa..0000000 --- a/examples/ddd-architectural-challenges/tasks/java-order-dispatch-domain-entities/task.toml +++ /dev/null @@ -1,27 +0,0 @@ -version = "1.0" - -[metadata] -language = "Java" -framework = "plain Java" -source_repo = "https://github.com/gregorriegler/order-dispatch-refactoring-kata" -diversity_axes = [ - "language:java", - "difficulty:easy", - "pattern:encapsulation", -] -name = "java-order-dispatch-domain-entities" -description = "Move order dispatch logic from service layer into domain entities in a refactoring kata" -difficulty = "easy" - -[agent] -timeout_sec = 1200 - -[environment] -memory_mb = 4096 - -[verifier] -timeout_sec = 300 - -[nasde.source] -git = "https://github.com/gregorriegler/order-dispatch-refactoring-kata.git" -ref = "master" diff --git a/examples/ddd-architectural-challenges/tasks/java-order-dispatch-domain-entities/tests/test.sh b/examples/ddd-architectural-challenges/tasks/java-order-dispatch-domain-entities/tests/test.sh deleted file mode 100755 index ee0181c..0000000 --- a/examples/ddd-architectural-challenges/tasks/java-order-dispatch-domain-entities/tests/test.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash - -echo "==========================================" -echo "Java Order Dispatch Domain Entities - Evaluation" -echo "==========================================" -echo "" - -cd /app/java - -echo "Step 1: Verifying project compiles..." -echo "--------------------------------------" -if mvn compile -q -B 2>/dev/null || gradle build --no-daemon -q 2>/dev/null; then - echo "✓ SUCCESS: Project compiles" - echo "" -else - echo "✗ FAILURE: Project does not compile" - echo 0 > /logs/verifier/reward.txt - exit 1 -fi - -echo "Step 2: Running tests..." -echo "--------------------------------------" -if mvn test -q -B 2>/dev/null || gradle test --no-daemon -q 2>/dev/null; then - echo "✓ SUCCESS: All tests pass" - echo "" -else - echo "✗ FAILURE: Tests failed" - echo 0 > /logs/verifier/reward.txt - exit 1 -fi - -echo "Step 3: Verifying domain entities have behavior methods..." -echo "--------------------------------------" -ENTITY_FILES=$(find src/main -name "*.java" -type f) -if echo "$ENTITY_FILES" | xargs grep -lE "(boolean (is|can|should)|void (dispatch|accept|validate|process))" 2>/dev/null | grep -q .; then - echo "✓ SUCCESS: Domain entities contain behavior methods" - echo "" -else - echo "✗ FAILURE: Domain entities lack behavior methods" - echo "Expected behavior methods like isDispatchable(), canReceive(), etc. on domain entities" - echo 0 > /logs/verifier/reward.txt - exit 1 -fi - -echo "Step 4: Verifying dispatch service delegates to entities..." -echo "--------------------------------------" -SERVICE_FILES=$(find src/main -name "*.java" -type f | xargs grep -l -i "dispatch\|service" 2>/dev/null) -if [ -n "$SERVICE_FILES" ]; then - GETTER_COUNT=$(echo "$SERVICE_FILES" | xargs grep -c "\.get[A-Z]" 2>/dev/null | awk -F: '{sum+=$2} END {print sum}') - BEHAVIOR_COUNT=$(echo "$SERVICE_FILES" | xargs grep -cE "\.(is|can|should|dispatch|accept|validate)" 2>/dev/null | awk -F: '{sum+=$2} END {print sum}') - echo "Getter calls in service: $GETTER_COUNT" - echo "Behavior method calls in service: $BEHAVIOR_COUNT" - if [ "${BEHAVIOR_COUNT:-0}" -gt 0 ]; then - echo "✓ SUCCESS: Service calls behavior methods on entities" - echo "" - else - echo "✗ FAILURE: Service does not delegate to entity behavior methods" - echo 0 > /logs/verifier/reward.txt - exit 1 - fi -else - echo "✓ INFO: No separate service file found — logic may be fully in entities" - echo "" -fi - -echo "==========================================" -echo "EVALUATION PASSED ✓" -echo "==========================================" - -echo 1 > /logs/verifier/reward.txt -exit 0 diff --git a/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-anemic-tuned/CLAUDE.md b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-anemic-tuned/CLAUDE.md new file mode 100644 index 0000000..3cf46b5 --- /dev/null +++ b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-anemic-tuned/CLAUDE.md @@ -0,0 +1,8 @@ +# Agent Instructions + +## Approach + +1. Begin by invoking the **tactical-ddd** skill (use the Skill tool) and apply its guidance throughout this task. +2. Before writing any code, explore the existing codebase to understand its architecture and conventions. +3. Follow existing architectural patterns exactly — match naming, namespaces, directory structure, and code style. +4. Write tests that match the style of existing test files. diff --git a/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-anemic-tuned/skills/tactical-ddd/SKILL.md b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-anemic-tuned/skills/tactical-ddd/SKILL.md new file mode 100644 index 0000000..a1bcf8b --- /dev/null +++ b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-anemic-tuned/skills/tactical-ddd/SKILL.md @@ -0,0 +1,548 @@ +--- +name: tactical-ddd +description: "Design, refactor, analyze, and review code by applying the principles and patterns of tactical domain-driven design. Triggers on: domain modeling, aggregate design, 'entity', 'value object', 'repository', 'bounded context', 'domain event', 'domain service', code touching domain/ directories, rich domain model discussions." +version: 1.0.0 +--- + + + + +# Tactical DDD + +Design, refactor, analyze, and review code by applying the principles and patterns of tactical domain-driven design. + +## This codebase's conventions + +This codebase targets **.NET 8**. It originated on an old .NET version and is being modernized as it +is enriched — write idiomatic modern C#, do not copy the legacy style: + +- **File-scoped namespaces** (`namespace Foo;`), not bracketed blocks. +- **Value objects as `record` / `readonly record struct`** — you get value equality, `ToString`, and + immutability for free, so do NOT hand-roll `Equals`/`GetHashCode`. Use `init`-only / `required` + members and a static factory (or a validating primary constructor) so an instance cannot exist in + an invalid state. EF Core maps these via owned types / value converters — keep a way for EF to + materialize them, but don't let EF needs dictate a mutable, ctor-less data bag. +- **Nullable reference types enabled**; use them to make "must exist" vs "may be absent" explicit + rather than relying on runtime null checks. +- **Invariants**: throw a domain-specific exception (or return a result type) from the factory / + method that enforces them — not a generic `Exception`, and not validation scattered in services. +- **Layers**: `Domain/` (entities, value objects, invariants), `Application/` (services that + orchestrate), `Controllers/` (HTTP + DTOs), `Infrastructure/` (EF Core config). Domain holds no EF + or ASP.NET references. +- Keep the public API (controllers, DTOs, endpoints) unchanged when refactoring internals. + +## Principles + +1. **Isolate domain logic** +2. **Use rich domain language** +3. **Orchestrate with use cases** +4. **Avoid anemic domain model** +5. **Separate generic concepts** +6. **Make the implicit explicit... like your life depends on it** +7. **Design aggregates around invariants** +8. **Extract immutable value objects liberally** +9. **Repositories are for loading and saving full aggregates** + +--- + +## 1. Isolate domain logic + +**What:** Domain logic is not mixed with technical code like HTTP and database transactions. + +**Why:** Easier to understand the most important part of the code, easier to validate with domain experts, easier to test and evolve, easier to plan and implement new features. + +**Test:** Could a domain expert read the code? Can the code be unit tested without mocks or spinning up databases? + +```csharp +// ❌ WRONG - domain polluted with infrastructure +class Delivery +{ + public async Task Dispatch() + { + _logger.LogInformation("Dispatching delivery {Id}", Id); // Infrastructure! + await _db.BeginTransactionAsync(); // Infrastructure! + if (_status != "ready") throw new Exception("Not ready"); + _status = "dispatched"; + await _db.SaveAsync(this); // Infrastructure! + await _db.CommitAsync(); // Infrastructure! + await _pushNotification.NotifyDriverAsync(); // Infrastructure! + } +} + +// ✅ RIGHT - isolated domain logic +class Delivery +{ + public void Dispatch() + { + if (_status != DeliveryStatus.Ready) + throw new DeliveryNotReadyError(Id); + _status = DeliveryStatus.Dispatched; + _dispatchedAt = DateTime.UtcNow; + } +} +``` + +--- + +## 2. Use rich domain language + +**What:** Names in code match exactly what domain experts say. No programmer jargon. No generic names. + +**Why:** Translation between code-speak and business-speak causes bugs. When a domain expert says "assess a claim" and the code says "ProcessEntity", someone will misunderstand something. + +**Test:** Would a domain expert recognize this name? If you'd need to translate it for them, it's wrong. + +**Common generic terms to watch for:** +- `Manager`, `Handler`, `Processor`, `Helper`, `Util` +- `Data`, `Info`, `Item` (when domain terms exist) +- `Process`, `Handle`, `Execute` (what does it actually DO?) + +```csharp +// ❌ WRONG - programmer jargon +class ClaimHandler +{ + public ProcessingResult ProcessClaimData(ClaimDto claimData) + { + return _claimProcessor.Handle(claimData); + } +} + +// ✅ RIGHT - domain language +class ClaimAssessor +{ + public AssessmentDecision AssessClaim(InsuranceClaim claim) + { + if (claim.ExceedsCoverageLimit()) + return AssessmentDecision.Deny(DenialReason.ExceedsCoverage); + return AssessmentDecision.Approve(); + } +} +``` + +--- + +## 3. Orchestrate with use cases + +**What:** A use case is a user goal—something a user would recognize as an action they can perform in your application. + +**Why:** Use cases define the entry points to your domain. They answer "what can a user do?" If something isn't a user goal, it's supporting machinery that belongs elsewhere. + +**Test (the menu test):** If you described your application's features to a user like a menu, would this be on it? + +``` +DELIVERY APP MENU: +├── Request Delivery ← Use case: user goal +├── Track Delivery ← Use case: user goal +├── Cancel Delivery ← Use case: user goal +├── Calculate ETA ← NOT a use case: internal machinery +└── Check Delivery Radius ← NOT a use case: domain rule +``` + +```csharp +// ❌ WRONG - not a user goal, this is internal machinery +// UseCases/CalculateEtaUseCase.cs +public async Task CalculateEta(DeliveryId deliveryId) +{ + var delivery = await _deliveryRepository.Find(deliveryId); + var driver = await _driverRepository.Find(delivery.DriverId); + return _routeService.EstimateArrival(driver.Location, delivery.Destination); +} + +// ✅ RIGHT - actual user goal (appears in menu) +// UseCases/CancelDeliveryUseCase.cs +public async Task CancelDelivery(DeliveryId deliveryId, CancellationReason reason) +{ + var delivery = await _deliveryRepository.Find(deliveryId); + delivery.Cancel(reason); + await _deliveryRepository.Save(delivery); +} +``` + +--- + +## 4. Avoid anemic domain model + +**What:** Domain logic lives in domain objects, not in use cases. Use cases orchestrate; domain objects decide. + +**Why:** When business rules leak into use cases, they scatter across the codebase, duplicate, and diverge. The domain becomes a dumb data carrier. + +**Test:** Is your use case making business decisions, or just coordinating? If the use case contains if/else business logic, you likely have an anemic model. + +```csharp +// ❌ WRONG - business logic in use case (anemic domain) +public async Task ConfirmDropoff(DeliveryId deliveryId, ProofPhoto photo) +{ + var delivery = await _deliveryRepository.Find(deliveryId); + + // Business rules leaked into use case! + if (delivery.Status != "in_transit") + throw new Exception("Delivery not in transit"); + if (photo == null && delivery.RequiresSignature) + throw new Exception("Proof of delivery required"); + + delivery.Status = "delivered"; + delivery.ProofPhoto = photo; + delivery.DeliveredAt = DateTime.UtcNow; + await _deliveryRepository.Save(delivery); +} + +// ✅ RIGHT - use case orchestrates, domain decides +public async Task ConfirmDropoff(DeliveryId deliveryId, ProofPhoto photo) +{ + var delivery = await _deliveryRepository.Find(deliveryId); + + delivery.ConfirmDropoff(photo); // Domain enforces the rules + + await _deliveryRepository.Save(delivery); +} +``` + +**Signs of anemic model:** +- Use cases full of if/else business logic +- Domain objects are just data with getters/setters +- Business rules duplicated across multiple use cases +- Validation logic outside the object being validated + +--- + +## 5. Separate generic concepts + +**What:** Generic capabilities that aren't specific to your domain live separately from domain-specific logic. + +**Why:** A retry mechanism, a caching layer, a validation framework—these aren't YOUR domain. Mixing them with domain logic obscures what's actually specific to your business. + +**Test:** Would this code exist in a completely different business domain? If yes, it's generic. If it's specific to YOUR business rules, it's domain. + +```csharp +// ❌ WRONG - generic retry logic mixed with domain +// Domain/DriverLocator.cs +class DriverLocator +{ + // Generic retry logic does not belong in domain! + private async Task WithRetry(Func> fn, int attempts) + { + for (var i = 0; i < attempts; i++) + { + try { return await fn(); } + catch { if (i == attempts - 1) throw; } + } + throw new Exception("Retry failed"); + } + + public Task FindAvailableDriver(Zone zone) + => WithRetry(() => SearchDriversInZone(zone), 3); + + private Task SearchDriversInZone(Zone zone) + { + // domain logic to find nearest available driver + } +} + +// ✅ RIGHT - same behavior, properly separated +// Infrastructure/Retry.cs (generic, reusable in any project) +public static class Retry +{ + public static async Task WithRetry(Func> fn, int attempts) + { + for (var i = 0; i < attempts; i++) + { + try { return await fn(); } + catch { if (i == attempts - 1) throw; } + } + throw new Exception("Retry failed"); + } +} + +// Domain/DriverLocator.cs (pure domain, no infra imports) +class DriverLocator +{ + public Task FindAvailableDriver(Zone zone) + { + // domain logic to find nearest available driver + } +} + +// UseCases/DispatchDeliveryUseCase.cs (orchestrates domain + infra) +public async Task DispatchDelivery(DeliveryId deliveryId) +{ + var delivery = await _deliveryRepository.Find(deliveryId); + var driver = await Retry.WithRetry( + () => _driverLocator.FindAvailableDriver(delivery.Zone), 3); + delivery.AssignDriver(driver); + await _deliveryRepository.Save(delivery); +} +``` + +--- + +## 6. Make the implicit explicit... like your life depends on it + +**What:** Strive for maximum expressiveness. Go as far as possible to identify and name domain concepts in code. Don't settle for "good enough"—push until the code speaks the domain fluently. + +**Why:** Maximum alignment optimizes communication between engineers and domain experts. Easier to discuss nuances and avoid misconceptions. Easier to plan and implement features and detect when the design of code is causing unnecessary friction. + +**Test:** Could you discuss this code with a domain expert without translation? Are there concepts they use that don't exist in your code? + +```csharp +// This code looks fine - isolated, uses domain terms +class Delivery +{ + public DeliveryStatus Status { get; private set; } + public Driver? Driver { get; private set; } + public DateTime? PickupTime { get; private set; } + public DateTime? DropoffTime { get; private set; } + public Photo? ProofOfDelivery { get; private set; } + + public void AssignDriver(Driver driver) + { + if (Status != DeliveryStatus.Confirmed) throw new Exception("..."); + Driver = driver; + Status = DeliveryStatus.Assigned; + } + + public void RecordPickup() + { + if (Status != DeliveryStatus.Assigned) throw new Exception("..."); + PickupTime = DateTime.UtcNow; + Status = DeliveryStatus.InTransit; + } + + public void RecordDropoff(Photo photo) + { + if (Status != DeliveryStatus.InTransit) throw new Exception("..."); + ProofOfDelivery = photo; + DropoffTime = DateTime.UtcNow; + Status = DeliveryStatus.Delivered; + } +} + +// But the TYPES can describe the domain! Each state is a distinct concept. +// Reading the types alone tells you how deliveries work. + +abstract record Delivery; + +record RequestedDelivery( // Customer placed request + Customer Customer, + Restaurant Restaurant, + IReadOnlyList Items) : Delivery; + +record ConfirmedDelivery( // Restaurant accepted + Customer Customer, + Restaurant Restaurant, + IReadOnlyList Items, + Duration EstimatedPrepTime) : Delivery; + +record AssignedDelivery( // Driver assigned, heading to restaurant + Customer Customer, + Restaurant Restaurant, + IReadOnlyList Items, + Driver Driver, // Now guaranteed to exist + Time EstimatedPickup) : Delivery; + +record InTransitDelivery( // Driver picked up, heading to customer + Customer Customer, + Restaurant Restaurant, + IReadOnlyList Items, + Driver Driver, + Time PickupTime, // Now guaranteed to exist + Time EstimatedDropoff) : Delivery; + +record DeliveredDelivery( // Complete with proof + Customer Customer, + Restaurant Restaurant, + IReadOnlyList Items, + Driver Driver, + Time PickupTime, + Time DropoffTime, // Now guaranteed to exist + Photo ProofOfDelivery) : Delivery; + +// State transitions are explicit functions +ConfirmedDelivery ConfirmDelivery(RequestedDelivery d, Duration prepTime); +AssignedDelivery AssignDriver(ConfirmedDelivery d, Driver driver); +InTransitDelivery RecordPickup(AssignedDelivery d); +DeliveredDelivery RecordDropoff(InTransitDelivery d, Photo photo); +``` + +**Smaller improvements matter too:** + +```csharp +// Extract an if statement to a named method +if (distance.Kilometers > 10 && !driver.HasLongRangeVehicle) { ... } +if (delivery.ExceedsDriverRange(driver)) { ... } + +// Name a boolean expression +var canAssign = driver.IsAvailable && driver.IsInZone(delivery.Zone) && !driver.AtCapacity; +var canAssign = driver.CanAccept(delivery); + +// Rename to use domain language +var fee = customFee ?? standardFee; +var fee = customFee ?? defaultDeliveryFee; +``` + +**Ways to increase expressiveness:** +- Model states as distinct types (Delivery with status → RequestedDelivery, ConfirmedDelivery, etc.) +- Make optional fields guaranteed at the right state (`Driver? Driver` → `Driver Driver`) +- Extract conditionals to named methods (complex if → ExceedsDriverRange) +- Rename variables to use domain language (standardFee → defaultDeliveryFee) + +--- + +## 7. Design aggregates around invariants + +**What:** An aggregate is a cluster of objects that must be consistent together. The aggregate root enforces the rules. External code cannot violate invariants. + +**Why:** Without clear boundaries, inconsistent states creep in. One piece of code updates the delivery, another updates the route, and suddenly the ETA is wrong. + +**Test:** What must be true at all times? What rules must never be broken? The objects involved in those rules form an aggregate. + +```csharp +// ❌ WRONG - no aggregate boundary, invariants violated +class Delivery +{ + public List Stops; // Exposed! + public Distance TotalDistance; +} + +// External code can break invariants +delivery.Stops.Add(new DeliveryStop(location)); +// Oops - TotalDistance is now wrong! + +// ✅ RIGHT - aggregate protects invariants +class Delivery +{ + private readonly List _stops = new(); + private Distance _totalDistance = Distance.Zero(); + + public void AddStop(Location location) + { + if (_status != DeliveryStatus.Planning) + throw new DeliveryNotModifiableError(Id); + + var previousStop = _stops[^1]; + var stop = new DeliveryStop(location); + _stops.Add(stop); + _totalDistance = _totalDistance.Add( + previousStop.DistanceTo(location)); // Invariant maintained! + } + + public void RemoveStop(StopId stopId) + { + if (_stops.Count <= 2) + throw new MinimumStopsRequiredError(Id); + + // Recalculate total distance after removal + _stops.RemoveAll(s => s.Id.Equals(stopId)); + _totalDistance = CalculateTotalDistance(); // Invariant maintained! + } + + public Distance TotalDistance => _totalDistance; +} +``` + +**Aggregate rules:** +- One root entity per aggregate +- External code accesses only through the root +- The root enforces all invariants +- Reference other aggregates by ID, not object +- Methods should operate on the same state—if they don't, split the aggregate + +--- + +## 8. Extract immutable value objects liberally + +**What:** When something is defined by its attributes (not identity), make it an immutable value object. Do this liberally—more value objects is usually better. + +**Why:** Value objects are simple. They can't change unexpectedly. They're easy to test. They make domain concepts explicit. They're also a good way to extract logic from aggregates and entities that can easily get large—keep entities focused by pulling cohesive concepts into value objects. + +**Test:** Does this need a unique ID to track it over time? No? It's probably a value object. + +```csharp +// Entity with primitives that should be a value object +class Delivery +{ + public DeliveryId Id; + public decimal FeeAmount; + public string FeeCurrency; +} + +// Extract the value object +class Delivery +{ + public DeliveryId Id; + public Money Fee; +} + +// Value object in modern .NET 8 idiom: a readonly record struct — value equality, +// ToString and immutability for free (no hand-rolled Equals/GetHashCode), with a +// validating factory so an invalid instance cannot exist. File-scoped namespace. +namespace DotNetConfPl.Refactoring.Domain; + +public readonly record struct Money +{ + public decimal Amount { get; } + public Currency Currency { get; } + + private Money(decimal amount, Currency currency) + { + Amount = amount; + Currency = currency; + } + + public static Money Of(decimal amount, Currency currency) => + amount < 0 + ? throw new DomainException("Money amount cannot be negative") + : new Money(amount, currency); + + public Money Add(Money other) => + Currency != other.Currency + ? throw new DomainException("Cannot add money in different currencies") + : new Money(Amount + other.Amount, Currency); +} +``` + +> EF Core maps a `readonly record struct` value object via an owned type or a value +> converter — keep the model expressible to EF, but don't regress to a mutable, +> ctor-less data bag just to satisfy the mapper. + +**Good candidates for value objects:** +- Money, Currency, Percentage +- DateRange, TimeSlot, Duration +- Address, Coordinates, Distance +- EmailAddress, PhoneNumber, URL +- Quantity, Weight, Temperature +- PersonName, CompanyName + +--- + +## 9. Repositories are for loading and saving full aggregates + +The job of a repository is to load and save entire aggregates - not partial aggregates or nested entities inside an aggregate. The `load` method takes an ID and returns the full aggregate. + +A repository should not exist for a domain object that is not an aggregate. Entity that is part of an aggreate -> does not have a repository. It is loaded via the aggregate root's repository. + +The `hydrate` method is used ONLY for constructing an aggregate from it's persisted state. It should not be abused for other use cases like creating new instances. Each creation flow should have a dedicated factory method, e.g. `Order.FromExisting()`, `Order.New()`, `Order.Draft()`. + +The `save` method of a repository should take the full aggregate. + +If you just want to query information to display without modifying state and applying business rules, create a separate read model object and don't use a repository. + +--- + +## Mandatory Checklist + +When designing, refactoring, analyzing, or reviewing code: + +1. [ ] Verify domain is isolated from infrastructure (no DB/HTTP/logging in domain; generic utilities in infra; domain doesn't import infra) +2. [ ] Verify names are from YOUR domain, not generic developer jargon +3. [ ] Verify use cases are intentions of users, human or automated (apply the menu test) +4. [ ] Verify business logic lives in domain objects, use cases only orchestrate +5. [ ] Verify states are modeled as distinct types where appropriate +6. [ ] Verify hidden domain concepts are extracted and named explicitly +7. [ ] Verify aggregates are designed around invariants, not naive mapping of domain nouns +8. [ ] Verify values are extracted into value objects expressing a domain concept +9. [ ] Veirfy no abuse of hydrate methods for creation scenarios. Each creation scenario must have dedicated factory method + +Do not proceed until all checks pass. diff --git a/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-anemic-tuned/variant.toml b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-anemic-tuned/variant.toml new file mode 100644 index 0000000..b4f5e2b --- /dev/null +++ b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-anemic-tuned/variant.toml @@ -0,0 +1,6 @@ +agent = "claude" +model = "claude-sonnet-4-6" + +# Repo-specific: this skill's examples reference the anemic-refactor codebase's +# conventions, so it should ONLY run against that task. --all-variants honors this. +tasks = ["csharp-anemic-to-rich-domain"] diff --git a/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-movie-tuned/CLAUDE.md b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-movie-tuned/CLAUDE.md new file mode 100644 index 0000000..3cf46b5 --- /dev/null +++ b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-movie-tuned/CLAUDE.md @@ -0,0 +1,8 @@ +# Agent Instructions + +## Approach + +1. Begin by invoking the **tactical-ddd** skill (use the Skill tool) and apply its guidance throughout this task. +2. Before writing any code, explore the existing codebase to understand its architecture and conventions. +3. Follow existing architectural patterns exactly — match naming, namespaces, directory structure, and code style. +4. Write tests that match the style of existing test files. diff --git a/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-movie-tuned/skills/tactical-ddd/SKILL.md b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-movie-tuned/skills/tactical-ddd/SKILL.md new file mode 100644 index 0000000..4bf91fd --- /dev/null +++ b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-movie-tuned/skills/tactical-ddd/SKILL.md @@ -0,0 +1,561 @@ +--- +name: tactical-ddd +description: "Design, refactor, analyze, and review code by applying the principles and patterns of tactical domain-driven design. Triggers on: domain modeling, aggregate design, 'entity', 'value object', 'repository', 'bounded context', 'domain event', 'domain service', code touching domain/ directories, rich domain model discussions." +version: 1.0.0 +--- + + + + +# Tactical DDD + +Design, refactor, analyze, and review code by applying the principles and patterns of tactical domain-driven design. + +## This codebase's conventions + +This codebase is a movie-rental backend targeting **.NET 8**, persisted with NHibernate. It originated +on an old .NET version and is being modernized as it is enriched — write idiomatic modern C#, do not +copy the legacy style: + +- **File-scoped namespaces** (`namespace Logic.Entities;`), not bracketed blocks. +- **Value objects as `record` / `readonly record struct`** — you get value equality, `ToString`, and + immutability for free, so do NOT hand-roll `Equals`/`GetHashCode`. Use `init`-only / `required` + members and a static factory (or a validating primary constructor) so an instance cannot exist in + an invalid state. +- **Do NOT introduce a functional-extensions / `Result` library** (e.g. CSharpFunctionalExtensions). + It is not referenced by the code you start from. Enforce invariants with a validating factory that + throws a domain-specific exception. Keep the dependency set as-is — adding a value-object base-class + library is exactly the kind of detour that breaks the build; stay with plain modern C#. +- **Nullable reference types enabled**; use them to make "must exist" vs "may be absent" explicit + rather than relying on runtime null checks. +- **NHibernate constraints**: entities are mapped by NHibernate, which needs `virtual` members and a + (protected/private) parameterless constructor to materialize them — add **only** those. NHibernate + does **not** require the legacy MVC/serialization attributes that the starting code carries; do not + treat them as something to be "preserved". Extract cohesive concepts into immutable `record` value + objects around the entities. +- **Strip web/serialization/validation attributes off the domain entities**: the starting `Customer` / + `Movie` / `PurchasedMovie` carry `[Required]`, `[RegularExpression]`, `[MaxLength]`, + `[JsonConverter]`, `[JsonIgnore]`, `Newtonsoft.Json`, and `System.ComponentModel.DataAnnotations` — + these are presentation/validation leaks, not domain concerns. Move validation **into** the domain + (a factory or value object that throws a domain exception); JSON shaping belongs in the API/DTO + layer, not on the entity. The domain layer must import **no** web or serialization namespace. +- **Layout**: `Logic/Entities` (domain entities), `Logic/Services` (services that orchestrate), + `Logic/Repositories` + `Logic/Mappings` (NHibernate persistence), `Api/Controllers` (HTTP + DTOs). + Keep domain logic out of services and out of the persistence/API layers. +- Keep the public API (controllers, DTOs, endpoints) unchanged when refactoring internals. + +## Principles + +1. **Isolate domain logic** +2. **Use rich domain language** +3. **Orchestrate with use cases** +4. **Avoid anemic domain model** +5. **Separate generic concepts** +6. **Make the implicit explicit... like your life depends on it** +7. **Design aggregates around invariants** +8. **Extract immutable value objects liberally** +9. **Repositories are for loading and saving full aggregates** + +--- + +## 1. Isolate domain logic + +**What:** Domain logic is not mixed with technical code like HTTP and database transactions. + +**Why:** Easier to understand the most important part of the code, easier to validate with domain experts, easier to test and evolve, easier to plan and implement new features. + +**Test:** Could a domain expert read the code? Can the code be unit tested without mocks or spinning up databases? + +```csharp +// ❌ WRONG - domain polluted with infrastructure +class Delivery +{ + public async Task Dispatch() + { + _logger.LogInformation("Dispatching delivery {Id}", Id); // Infrastructure! + await _db.BeginTransactionAsync(); // Infrastructure! + if (_status != "ready") throw new Exception("Not ready"); + _status = "dispatched"; + await _db.SaveAsync(this); // Infrastructure! + await _db.CommitAsync(); // Infrastructure! + await _pushNotification.NotifyDriverAsync(); // Infrastructure! + } +} + +// ✅ RIGHT - isolated domain logic +class Delivery +{ + public void Dispatch() + { + if (_status != DeliveryStatus.Ready) + throw new DeliveryNotReadyError(Id); + _status = DeliveryStatus.Dispatched; + _dispatchedAt = DateTime.UtcNow; + } +} +``` + +--- + +## 2. Use rich domain language + +**What:** Names in code match exactly what domain experts say. No programmer jargon. No generic names. + +**Why:** Translation between code-speak and business-speak causes bugs. When a domain expert says "assess a claim" and the code says "ProcessEntity", someone will misunderstand something. + +**Test:** Would a domain expert recognize this name? If you'd need to translate it for them, it's wrong. + +**Common generic terms to watch for:** +- `Manager`, `Handler`, `Processor`, `Helper`, `Util` +- `Data`, `Info`, `Item` (when domain terms exist) +- `Process`, `Handle`, `Execute` (what does it actually DO?) + +```csharp +// ❌ WRONG - programmer jargon +class ClaimHandler +{ + public ProcessingResult ProcessClaimData(ClaimDto claimData) + { + return _claimProcessor.Handle(claimData); + } +} + +// ✅ RIGHT - domain language +class ClaimAssessor +{ + public AssessmentDecision AssessClaim(InsuranceClaim claim) + { + if (claim.ExceedsCoverageLimit()) + return AssessmentDecision.Deny(DenialReason.ExceedsCoverage); + return AssessmentDecision.Approve(); + } +} +``` + +--- + +## 3. Orchestrate with use cases + +**What:** A use case is a user goal—something a user would recognize as an action they can perform in your application. + +**Why:** Use cases define the entry points to your domain. They answer "what can a user do?" If something isn't a user goal, it's supporting machinery that belongs elsewhere. + +**Test (the menu test):** If you described your application's features to a user like a menu, would this be on it? + +``` +DELIVERY APP MENU: +├── Request Delivery ← Use case: user goal +├── Track Delivery ← Use case: user goal +├── Cancel Delivery ← Use case: user goal +├── Calculate ETA ← NOT a use case: internal machinery +└── Check Delivery Radius ← NOT a use case: domain rule +``` + +```csharp +// ❌ WRONG - not a user goal, this is internal machinery +// UseCases/CalculateEtaUseCase.cs +public async Task CalculateEta(DeliveryId deliveryId) +{ + var delivery = await _deliveryRepository.Find(deliveryId); + var driver = await _driverRepository.Find(delivery.DriverId); + return _routeService.EstimateArrival(driver.Location, delivery.Destination); +} + +// ✅ RIGHT - actual user goal (appears in menu) +// UseCases/CancelDeliveryUseCase.cs +public async Task CancelDelivery(DeliveryId deliveryId, CancellationReason reason) +{ + var delivery = await _deliveryRepository.Find(deliveryId); + delivery.Cancel(reason); + await _deliveryRepository.Save(delivery); +} +``` + +--- + +## 4. Avoid anemic domain model + +**What:** Domain logic lives in domain objects, not in use cases. Use cases orchestrate; domain objects decide. + +**Why:** When business rules leak into use cases, they scatter across the codebase, duplicate, and diverge. The domain becomes a dumb data carrier. + +**Test:** Is your use case making business decisions, or just coordinating? If the use case contains if/else business logic, you likely have an anemic model. + +```csharp +// ❌ WRONG - business logic in use case (anemic domain) +public async Task ConfirmDropoff(DeliveryId deliveryId, ProofPhoto photo) +{ + var delivery = await _deliveryRepository.Find(deliveryId); + + // Business rules leaked into use case! + if (delivery.Status != "in_transit") + throw new Exception("Delivery not in transit"); + if (photo == null && delivery.RequiresSignature) + throw new Exception("Proof of delivery required"); + + delivery.Status = "delivered"; + delivery.ProofPhoto = photo; + delivery.DeliveredAt = DateTime.UtcNow; + await _deliveryRepository.Save(delivery); +} + +// ✅ RIGHT - use case orchestrates, domain decides +public async Task ConfirmDropoff(DeliveryId deliveryId, ProofPhoto photo) +{ + var delivery = await _deliveryRepository.Find(deliveryId); + + delivery.ConfirmDropoff(photo); // Domain enforces the rules + + await _deliveryRepository.Save(delivery); +} +``` + +**Signs of anemic model:** +- Use cases full of if/else business logic +- Domain objects are just data with getters/setters +- Business rules duplicated across multiple use cases +- Validation logic outside the object being validated + +--- + +## 5. Separate generic concepts + +**What:** Generic capabilities that aren't specific to your domain live separately from domain-specific logic. + +**Why:** A retry mechanism, a caching layer, a validation framework—these aren't YOUR domain. Mixing them with domain logic obscures what's actually specific to your business. + +**Test:** Would this code exist in a completely different business domain? If yes, it's generic. If it's specific to YOUR business rules, it's domain. + +```csharp +// ❌ WRONG - generic retry logic mixed with domain +// Domain/DriverLocator.cs +class DriverLocator +{ + // Generic retry logic does not belong in domain! + private async Task WithRetry(Func> fn, int attempts) + { + for (var i = 0; i < attempts; i++) + { + try { return await fn(); } + catch { if (i == attempts - 1) throw; } + } + throw new Exception("Retry failed"); + } + + public Task FindAvailableDriver(Zone zone) + => WithRetry(() => SearchDriversInZone(zone), 3); + + private Task SearchDriversInZone(Zone zone) + { + // domain logic to find nearest available driver + } +} + +// ✅ RIGHT - same behavior, properly separated +// Infrastructure/Retry.cs (generic, reusable in any project) +public static class Retry +{ + public static async Task WithRetry(Func> fn, int attempts) + { + for (var i = 0; i < attempts; i++) + { + try { return await fn(); } + catch { if (i == attempts - 1) throw; } + } + throw new Exception("Retry failed"); + } +} + +// Domain/DriverLocator.cs (pure domain, no infra imports) +class DriverLocator +{ + public Task FindAvailableDriver(Zone zone) + { + // domain logic to find nearest available driver + } +} + +// UseCases/DispatchDeliveryUseCase.cs (orchestrates domain + infra) +public async Task DispatchDelivery(DeliveryId deliveryId) +{ + var delivery = await _deliveryRepository.Find(deliveryId); + var driver = await Retry.WithRetry( + () => _driverLocator.FindAvailableDriver(delivery.Zone), 3); + delivery.AssignDriver(driver); + await _deliveryRepository.Save(delivery); +} +``` + +--- + +## 6. Make the implicit explicit... like your life depends on it + +**What:** Strive for maximum expressiveness. Go as far as possible to identify and name domain concepts in code. Don't settle for "good enough"—push until the code speaks the domain fluently. + +**Why:** Maximum alignment optimizes communication between engineers and domain experts. Easier to discuss nuances and avoid misconceptions. Easier to plan and implement features and detect when the design of code is causing unnecessary friction. + +**Test:** Could you discuss this code with a domain expert without translation? Are there concepts they use that don't exist in your code? + +```csharp +// This code looks fine - isolated, uses domain terms +class Delivery +{ + public DeliveryStatus Status { get; private set; } + public Driver? Driver { get; private set; } + public DateTime? PickupTime { get; private set; } + public DateTime? DropoffTime { get; private set; } + public Photo? ProofOfDelivery { get; private set; } + + public void AssignDriver(Driver driver) + { + if (Status != DeliveryStatus.Confirmed) throw new Exception("..."); + Driver = driver; + Status = DeliveryStatus.Assigned; + } + + public void RecordPickup() + { + if (Status != DeliveryStatus.Assigned) throw new Exception("..."); + PickupTime = DateTime.UtcNow; + Status = DeliveryStatus.InTransit; + } + + public void RecordDropoff(Photo photo) + { + if (Status != DeliveryStatus.InTransit) throw new Exception("..."); + ProofOfDelivery = photo; + DropoffTime = DateTime.UtcNow; + Status = DeliveryStatus.Delivered; + } +} + +// But the TYPES can describe the domain! Each state is a distinct concept. +// Reading the types alone tells you how deliveries work. + +abstract record Delivery; + +record RequestedDelivery( // Customer placed request + Customer Customer, + Restaurant Restaurant, + IReadOnlyList Items) : Delivery; + +record ConfirmedDelivery( // Restaurant accepted + Customer Customer, + Restaurant Restaurant, + IReadOnlyList Items, + Duration EstimatedPrepTime) : Delivery; + +record AssignedDelivery( // Driver assigned, heading to restaurant + Customer Customer, + Restaurant Restaurant, + IReadOnlyList Items, + Driver Driver, // Now guaranteed to exist + Time EstimatedPickup) : Delivery; + +record InTransitDelivery( // Driver picked up, heading to customer + Customer Customer, + Restaurant Restaurant, + IReadOnlyList Items, + Driver Driver, + Time PickupTime, // Now guaranteed to exist + Time EstimatedDropoff) : Delivery; + +record DeliveredDelivery( // Complete with proof + Customer Customer, + Restaurant Restaurant, + IReadOnlyList Items, + Driver Driver, + Time PickupTime, + Time DropoffTime, // Now guaranteed to exist + Photo ProofOfDelivery) : Delivery; + +// State transitions are explicit functions +ConfirmedDelivery ConfirmDelivery(RequestedDelivery d, Duration prepTime); +AssignedDelivery AssignDriver(ConfirmedDelivery d, Driver driver); +InTransitDelivery RecordPickup(AssignedDelivery d); +DeliveredDelivery RecordDropoff(InTransitDelivery d, Photo photo); +``` + +**Smaller improvements matter too:** + +```csharp +// Extract an if statement to a named method +if (distance.Kilometers > 10 && !driver.HasLongRangeVehicle) { ... } +if (delivery.ExceedsDriverRange(driver)) { ... } + +// Name a boolean expression +var canAssign = driver.IsAvailable && driver.IsInZone(delivery.Zone) && !driver.AtCapacity; +var canAssign = driver.CanAccept(delivery); + +// Rename to use domain language +var fee = customFee ?? standardFee; +var fee = customFee ?? defaultDeliveryFee; +``` + +**Ways to increase expressiveness:** +- Model states as distinct types (Delivery with status → RequestedDelivery, ConfirmedDelivery, etc.) +- Make optional fields guaranteed at the right state (`Driver? Driver` → `Driver Driver`) +- Extract conditionals to named methods (complex if → ExceedsDriverRange) +- Rename variables to use domain language (standardFee → defaultDeliveryFee) + +--- + +## 7. Design aggregates around invariants + +**What:** An aggregate is a cluster of objects that must be consistent together. The aggregate root enforces the rules. External code cannot violate invariants. + +**Why:** Without clear boundaries, inconsistent states creep in. One piece of code updates the delivery, another updates the route, and suddenly the ETA is wrong. + +**Test:** What must be true at all times? What rules must never be broken? The objects involved in those rules form an aggregate. + +```csharp +// ❌ WRONG - no aggregate boundary, invariants violated +class Delivery +{ + public List Stops; // Exposed! + public Distance TotalDistance; +} + +// External code can break invariants +delivery.Stops.Add(new DeliveryStop(location)); +// Oops - TotalDistance is now wrong! + +// ✅ RIGHT - aggregate protects invariants +class Delivery +{ + private readonly List _stops = new(); + private Distance _totalDistance = Distance.Zero(); + + public void AddStop(Location location) + { + if (_status != DeliveryStatus.Planning) + throw new DeliveryNotModifiableError(Id); + + var previousStop = _stops[^1]; + var stop = new DeliveryStop(location); + _stops.Add(stop); + _totalDistance = _totalDistance.Add( + previousStop.DistanceTo(location)); // Invariant maintained! + } + + public void RemoveStop(StopId stopId) + { + if (_stops.Count <= 2) + throw new MinimumStopsRequiredError(Id); + + // Recalculate total distance after removal + _stops.RemoveAll(s => s.Id.Equals(stopId)); + _totalDistance = CalculateTotalDistance(); // Invariant maintained! + } + + public Distance TotalDistance => _totalDistance; +} +``` + +**Aggregate rules:** +- One root entity per aggregate +- External code accesses only through the root +- The root enforces all invariants +- Reference other aggregates by ID, not object +- Methods should operate on the same state—if they don't, split the aggregate + +--- + +## 8. Extract immutable value objects liberally + +**What:** When something is defined by its attributes (not identity), make it an immutable value object. Do this liberally—more value objects is usually better. + +**Why:** Value objects are simple. They can't change unexpectedly. They're easy to test. They make domain concepts explicit. They're also a good way to extract logic from aggregates and entities that can easily get large—keep entities focused by pulling cohesive concepts into value objects. + +**Test:** Does this need a unique ID to track it over time? No? It's probably a value object. + +```csharp +// Entity with primitives that should be a value object +class Delivery +{ + public DeliveryId Id; + public decimal FeeAmount; + public string FeeCurrency; +} + +// Extract the value object +class Delivery +{ + public DeliveryId Id; + public Money Fee; +} + +// Value object in modern .NET 8 idiom: a readonly record struct — value equality, +// ToString and immutability for free (no hand-rolled Equals/GetHashCode), with a +// validating factory so an invalid instance cannot exist. File-scoped namespace. +namespace DotNetConfPl.Refactoring.Domain; + +public readonly record struct Money +{ + public decimal Amount { get; } + public Currency Currency { get; } + + private Money(decimal amount, Currency currency) + { + Amount = amount; + Currency = currency; + } + + public static Money Of(decimal amount, Currency currency) => + amount < 0 + ? throw new DomainException("Money amount cannot be negative") + : new Money(amount, currency); + + public Money Add(Money other) => + Currency != other.Currency + ? throw new DomainException("Cannot add money in different currencies") + : new Money(Amount + other.Amount, Currency); +} +``` + +> EF Core maps a `readonly record struct` value object via an owned type or a value +> converter — keep the model expressible to EF, but don't regress to a mutable, +> ctor-less data bag just to satisfy the mapper. + +**Good candidates for value objects:** +- Money, Currency, Percentage +- DateRange, TimeSlot, Duration +- Address, Coordinates, Distance +- EmailAddress, PhoneNumber, URL +- Quantity, Weight, Temperature +- PersonName, CompanyName + +--- + +## 9. Repositories are for loading and saving full aggregates + +The job of a repository is to load and save entire aggregates - not partial aggregates or nested entities inside an aggregate. The `load` method takes an ID and returns the full aggregate. + +A repository should not exist for a domain object that is not an aggregate. Entity that is part of an aggreate -> does not have a repository. It is loaded via the aggregate root's repository. + +The `hydrate` method is used ONLY for constructing an aggregate from it's persisted state. It should not be abused for other use cases like creating new instances. Each creation flow should have a dedicated factory method, e.g. `Order.FromExisting()`, `Order.New()`, `Order.Draft()`. + +The `save` method of a repository should take the full aggregate. + +If you just want to query information to display without modifying state and applying business rules, create a separate read model object and don't use a repository. + +--- + +## Mandatory Checklist + +When designing, refactoring, analyzing, or reviewing code: + +1. [ ] Verify domain is isolated from infrastructure (no DB/HTTP/logging in domain; generic utilities in infra; domain doesn't import infra) +2. [ ] Verify names are from YOUR domain, not generic developer jargon +3. [ ] Verify use cases are intentions of users, human or automated (apply the menu test) +4. [ ] Verify business logic lives in domain objects, use cases only orchestrate +5. [ ] Verify states are modeled as distinct types where appropriate +6. [ ] Verify hidden domain concepts are extracted and named explicitly +7. [ ] Verify aggregates are designed around invariants, not naive mapping of domain nouns +8. [ ] Verify values are extracted into value objects expressing a domain concept +9. [ ] Veirfy no abuse of hydrate methods for creation scenarios. Each creation scenario must have dedicated factory method + +Do not proceed until all checks pass. diff --git a/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-movie-tuned/variant.toml b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-movie-tuned/variant.toml new file mode 100644 index 0000000..bc813c4 --- /dev/null +++ b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-movie-tuned/variant.toml @@ -0,0 +1,6 @@ +agent = "claude" +model = "claude-sonnet-4-6" + +# Repo-specific: this skill's conventions are tuned to the movie-rental codebase +# (modern .NET 8 idioms). Runs ONLY against that task. +tasks = ["csharp-movie-rental-anemic"] diff --git a/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-weather-tuned/CLAUDE.md b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-weather-tuned/CLAUDE.md new file mode 100644 index 0000000..3cf46b5 --- /dev/null +++ b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-weather-tuned/CLAUDE.md @@ -0,0 +1,8 @@ +# Agent Instructions + +## Approach + +1. Begin by invoking the **tactical-ddd** skill (use the Skill tool) and apply its guidance throughout this task. +2. Before writing any code, explore the existing codebase to understand its architecture and conventions. +3. Follow existing architectural patterns exactly — match naming, namespaces, directory structure, and code style. +4. Write tests that match the style of existing test files. diff --git a/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-weather-tuned/skills/tactical-ddd/SKILL.md b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-weather-tuned/skills/tactical-ddd/SKILL.md new file mode 100644 index 0000000..e34b38f --- /dev/null +++ b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-weather-tuned/skills/tactical-ddd/SKILL.md @@ -0,0 +1,569 @@ +--- +name: tactical-ddd +description: "Design, refactor, analyze, and review code by applying the principles and patterns of tactical domain-driven design. Triggers on: domain modeling, aggregate design, 'entity', 'value object', 'repository', 'bounded context', 'domain event', 'domain service', code touching domain/ directories, rich domain model discussions." +version: 1.0.0 +--- + + + + + +# Tactical DDD + +Design, refactor, analyze, and review code by applying the principles and patterns of tactical domain-driven design. + +## Principles + +1. **Isolate domain logic** +2. **Use rich domain language** +3. **Orchestrate with use cases** +4. **Avoid anemic domain model** +5. **Separate generic concepts** +6. **Make the implicit explicit... like your life depends on it** +7. **Design aggregates around invariants** +8. **Extract immutable value objects liberally** +9. **Repositories are for loading and saving full aggregates** + +--- + +## 1. Isolate domain logic + +**What:** Domain logic is not mixed with technical code like HTTP and database transactions. + +**Why:** Easier to understand the most important part of the code, easier to validate with domain experts, easier to test and evolve, easier to plan and implement new features. + +**Test:** Could a domain expert read the code? Can the code be unit tested without mocks or spinning up databases? + +```csharp +// ❌ WRONG - domain polluted with infrastructure +public class Delivery +{ + public async Task Dispatch() + { + _logger.LogInformation("Dispatching delivery {Id}", _id); // Infrastructure! + await _db.BeginTransactionAsync(); // Infrastructure! + if (_status != DeliveryStatus.Ready) throw new InvalidOperationException("Not ready"); + _status = DeliveryStatus.Dispatched; + await _db.SaveAsync(this); // Infrastructure! + await _db.CommitAsync(); // Infrastructure! + await _pushNotification.NotifyDriver(); // Infrastructure! + } +} + +// ✅ RIGHT - isolated domain logic +public class Delivery +{ + public void Dispatch() + { + if (_status != DeliveryStatus.Ready) + throw new DeliveryNotReadyError(_id); + _status = DeliveryStatus.Dispatched; + _dispatchedAt = DateTime.UtcNow; + } +} +``` + +--- + +## 2. Use rich domain language + +**What:** Names in code match exactly what domain experts say. No programmer jargon. No generic names. + +**Why:** Translation between code-speak and business-speak causes bugs. When a domain expert says "assess a claim" and the code says "ProcessEntity", someone will misunderstand something. + +**Test:** Would a domain expert recognize this name? If you'd need to translate it for them, it's wrong. + +**Common generic terms to watch for:** +- `Manager`, `Handler`, `Processor`, `Helper`, `Util` +- `Data`, `Info`, `Item` (when domain terms exist) +- `Process`, `Handle`, `Execute` (what does it actually DO?) + +```csharp +// ❌ WRONG - programmer jargon +public class ClaimHandler +{ + public ProcessingResult ProcessClaimData(ClaimDto claimData) + => _claimProcessor.Handle(claimData); +} + +// ✅ RIGHT - domain language +[DddDomainService] +public class ClaimAssessor +{ + public AssessmentDecision AssessClaim(InsuranceClaim claim) + { + if (claim.ExceedsCoverageLimit()) + return AssessmentDecision.Deny(DenialReason.ExceedsCoverage); + return AssessmentDecision.Approve(); + } +} +``` + +--- + +## 3. Orchestrate with use cases + +**What:** A use case is a user goal—something a user would recognize as an action they can perform in your application. + +**Why:** Use cases define the entry points to your domain. They answer "what can a user do?" If something isn't a user goal, it's supporting machinery that belongs elsewhere. + +**Test (the menu test):** If you described your application's features to a user like a menu, would this be on it? + +``` +DELIVERY APP MENU: +├── Request Delivery ← Use case: user goal +├── Track Delivery ← Use case: user goal +├── Cancel Delivery ← Use case: user goal +├── Calculate ETA ← NOT a use case: internal machinery +└── Check Delivery Radius ← NOT a use case: domain rule +``` + +```csharp +// ❌ WRONG - not a user goal, this is internal machinery +// UseCases/CalculateEta.cs +public async Task CalculateEta(DeliveryId deliveryId) +{ + var delivery = await _deliveryRepository.Load(deliveryId); + var driver = await _driverRepository.Load(delivery.DriverId); + return _routeService.EstimateArrival(driver.Location, delivery.Destination); +} + +// ✅ RIGHT - actual user goal (appears in menu) +// UseCases/CancelDelivery.cs +public async Task CancelDelivery(DeliveryId deliveryId, CancellationReason reason) +{ + var delivery = await _deliveryRepository.Load(deliveryId); + delivery.Cancel(reason); + await _deliveryRepository.Save(delivery); +} +``` + +--- + +## 4. Avoid anemic domain model + +**What:** Domain logic lives in domain objects, not in use cases. Use cases orchestrate; domain objects decide. + +**Why:** When business rules leak into use cases, they scatter across the codebase, duplicate, and diverge. The domain becomes a dumb data carrier. + +**Test:** Is your use case making business decisions, or just coordinating? If the use case contains if/else business logic, you likely have an anemic model. + +```csharp +// ❌ WRONG - business logic in use case (anemic domain) +public async Task ConfirmDropoff(DeliveryId deliveryId, ProofPhoto photo) +{ + var delivery = await _deliveryRepository.Load(deliveryId); + + // Business rules leaked into use case! + if (delivery.Status != DeliveryStatus.InTransit) + throw new InvalidOperationException("Delivery not in transit"); + if (photo is null && delivery.RequiresSignature) + throw new InvalidOperationException("Proof of delivery required"); + + delivery.Status = DeliveryStatus.Delivered; + delivery.ProofPhoto = photo; + delivery.DeliveredAt = DateTime.UtcNow; + await _deliveryRepository.Save(delivery); +} + +// ✅ RIGHT - use case orchestrates, domain decides +public async Task ConfirmDropoff(DeliveryId deliveryId, ProofPhoto photo) +{ + var delivery = await _deliveryRepository.Load(deliveryId); + + delivery.ConfirmDropoff(photo); // Domain enforces the rules + + await _deliveryRepository.Save(delivery); +} +``` + +**Signs of anemic model:** +- Use cases full of if/else business logic +- Domain objects are just data with `{ get; set; }` properties +- Business rules duplicated across multiple use cases +- Validation logic outside the object being validated +- A domain object exposes `IsX()` + `DoX()` ("ask, don't tell") instead of a single decision method that both checks and acts + +--- + +## 5. Separate generic concepts + +**What:** Generic capabilities that aren't specific to your domain live separately from domain-specific logic. + +**Why:** A retry mechanism, a caching layer, a validation framework—these aren't YOUR domain. Mixing them with domain logic obscures what's actually specific to your business. + +**Test:** Would this code exist in a completely different business domain? If yes, it's generic. If it's specific to YOUR business rules, it's domain. + +```csharp +// ❌ WRONG - generic retry logic mixed with domain +// Sales.DeepModel/Delivery/DriverLocator.cs +[DddDomainService] +public class DriverLocator +{ + // Generic retry logic does not belong in domain! + private async Task WithRetry(Func> fn, int attempts) + { + for (var i = 0; i < attempts; i++) + { + try { return await fn(); } + catch { if (i == attempts - 1) throw; } + } + throw new InvalidOperationException("Retry failed"); + } + + public Task FindAvailableDriver(Zone zone) + => WithRetry(() => SearchDriversInZone(zone), 3); + + private Task SearchDriversInZone(Zone zone) { /* domain logic */ } +} + +// ✅ RIGHT - same behavior, properly separated +// Sales.Adapters/Retry/Retry.cs (generic, reusable anywhere) +public static class Retry +{ + public static async Task WithAttempts(Func> fn, int attempts) + { + for (var i = 0; i < attempts; i++) + { + try { return await fn(); } + catch { if (i == attempts - 1) throw; } + } + throw new InvalidOperationException("Retry failed"); + } +} + +// Sales.DeepModel/Delivery/DriverLocator.cs (pure domain, no infra references) +[DddDomainService] +public class DriverLocator +{ + public Task FindAvailableDriver(Zone zone) { /* domain logic */ } +} + +// UseCases/DispatchDelivery.cs (orchestrates domain + infra) +public async Task DispatchDelivery(DeliveryId deliveryId) +{ + var delivery = await _deliveryRepository.Load(deliveryId); + var driver = await Retry.WithAttempts( + () => _driverLocator.FindAvailableDriver(delivery.Zone), attempts: 3); + delivery.AssignDriver(driver); + await _deliveryRepository.Save(delivery); +} +``` + +--- + +## 6. Make the implicit explicit... like your life depends on it + +**What:** Strive for maximum expressiveness. Go as far as possible to identify and name domain concepts in code. Don't settle for "good enough"—push until the code speaks the domain fluently. + +**Why:** Maximum alignment optimizes communication between engineers and domain experts. Easier to discuss nuances and avoid misconceptions. Easier to plan and implement features and detect when the design of code is causing unnecessary friction. + +**Test:** Could you discuss this code with a domain expert without translation? Are there concepts they use that don't exist in your code? + +```csharp +// This code looks fine - isolated, uses domain terms +public class Delivery +{ + public DeliveryStatus Status { get; private set; } + public Driver? Driver { get; private set; } + public DateTime? PickupTime { get; private set; } + public DateTime? DropoffTime { get; private set; } + public Photo? ProofOfDelivery { get; private set; } + + public void AssignDriver(Driver driver) + { + if (Status != DeliveryStatus.Confirmed) throw new InvalidOperationException("..."); + Driver = driver; + Status = DeliveryStatus.Assigned; + } + + public void RecordPickup() + { + if (Status != DeliveryStatus.Assigned) throw new InvalidOperationException("..."); + PickupTime = DateTime.UtcNow; + Status = DeliveryStatus.InTransit; + } + + public void RecordDropoff(Photo photo) + { + if (Status != DeliveryStatus.InTransit) throw new InvalidOperationException("..."); + ProofOfDelivery = photo; + DropoffTime = DateTime.UtcNow; + Status = DeliveryStatus.Delivered; + } +} + +// But the TYPES can describe the domain! Each state is a distinct concept. +// Reading the types alone tells you how deliveries work. + +// Modelled as a discriminated union via readonly struct + Kind enum +// (the same pattern as Discount in this codebase: see Sources/Sales/Sales.DeepModel/Pricing/Discounts/Discount.cs) +[DddValueObject] +public readonly struct Delivery : IEquatable +{ + private readonly DeliveryKind _kind; + private readonly RequestedDelivery _requested; + private readonly ConfirmedDelivery _confirmed; + private readonly AssignedDelivery _assigned; + private readonly InTransitDelivery _inTransit; + private readonly DeliveredDelivery _delivered; + + public static Delivery Requested(Customer customer, Restaurant restaurant, IReadOnlyList items) => + new(DeliveryKind.Requested, RequestedDelivery.Of(customer, restaurant, items), + default, default, default, default); + + public Delivery Confirm(Duration estimatedPrepTime) => _kind switch + { + DeliveryKind.Requested => new(DeliveryKind.Confirmed, default, + ConfirmedDelivery.From(_requested, estimatedPrepTime), default, default, default), + _ => throw new DomainError($"Cannot confirm delivery in state {_kind}") + }; + + public Delivery AssignDriver(Driver driver) => _kind switch + { + DeliveryKind.Confirmed => new(DeliveryKind.Assigned, default, default, + AssignedDelivery.From(_confirmed, driver), default, default), + _ => throw new DomainError($"Cannot assign driver in state {_kind}") + }; + + // ... RecordPickup, RecordDropoff follow the same pattern + + private enum DeliveryKind { Requested, Confirmed, Assigned, InTransit, Delivered } +} + +// Each state is a value object holding ONLY the fields guaranteed at that state: + +[DddValueObject] +public readonly record struct RequestedDelivery( + Customer Customer, Restaurant Restaurant, IReadOnlyList Items) +{ + public static RequestedDelivery Of(Customer c, Restaurant r, IReadOnlyList i) => new(c, r, i); +} + +[DddValueObject] +public readonly record struct AssignedDelivery( + Customer Customer, Restaurant Restaurant, IReadOnlyList Items, + Driver Driver, // Now guaranteed non-null + DateTime EstimatedPickup) +{ + public static AssignedDelivery From(ConfirmedDelivery prev, Driver driver) => new( + prev.Customer, prev.Restaurant, prev.Items, driver, DateTime.UtcNow.AddMinutes(15)); +} + +[DddValueObject] +public readonly record struct DeliveredDelivery( + Customer Customer, Restaurant Restaurant, IReadOnlyList Items, + Driver Driver, + DateTime PickupTime, + DateTime DropoffTime, // Now guaranteed non-null + Photo ProofOfDelivery); // Now guaranteed non-null +``` + +**Smaller improvements matter too:** + +```csharp +// Extract an if statement to a named method +if (distance.Kilometers > 10 && !driver.HasLongRangeVehicle) { ... } +if (delivery.ExceedsDriverRange(driver)) { ... } + +// Name a boolean expression +var canAssign = driver.IsAvailable && driver.IsInZone(delivery.Zone) && !driver.AtCapacity; +var canAssign = driver.CanAccept(delivery); + +// Rename to use domain language +var fee = customFee ?? standardFee; +var fee = customFee ?? defaultDeliveryFee; +``` + +**Ways to increase expressiveness:** +- Model states as distinct types (Delivery with `Status` enum → RequestedDelivery, ConfirmedDelivery, etc. via the readonly-struct discriminated-union pattern this codebase already uses for `Discount`) +- Make optional fields guaranteed at the right state (`Driver?` → `Driver` non-null in `AssignedDelivery`) +- Extract conditionals to named methods (complex `if` → `ExceedsDriverRange`) +- Rename variables to use domain language (`standardFee` → `defaultDeliveryFee`) + +--- + +## 7. Design aggregates around invariants + +**What:** An aggregate is a cluster of objects that must be consistent together. The aggregate root enforces the rules. External code cannot violate invariants. + +**Why:** Without clear boundaries, inconsistent states creep in. One piece of code updates the delivery, another updates the route, and suddenly the ETA is wrong. + +**Test:** What must be true at all times? What rules must never be broken? The objects involved in those rules form an aggregate. + +```csharp +// ❌ WRONG - no aggregate boundary, invariants violated +public class Delivery +{ + public List Stops { get; set; } // Exposed! + public Distance TotalDistance { get; set; } +} + +// External code can break invariants +delivery.Stops.Add(new DeliveryStop(location)); +// Oops - TotalDistance is now wrong! + +// ✅ RIGHT - aggregate protects invariants +[DddAggregateRoot] +public class Delivery +{ + private readonly List _stops = new(); + private Distance _totalDistance = Distance.Zero(); + public DeliveryStatus Status { get; private set; } + + public IReadOnlyList Stops => _stops; + public Distance TotalDistance => _totalDistance; + + public void AddStop(Location location) + { + if (Status != DeliveryStatus.Planning) + throw new DeliveryNotModifiableError(_id); + + var previousStop = _stops[^1]; + var stop = new DeliveryStop(location); + _stops.Add(stop); + _totalDistance = _totalDistance.Add( + previousStop.DistanceTo(location)); // Invariant maintained! + } + + public void RemoveStop(StopId stopId) + { + if (_stops.Count <= 2) + throw new MinimumStopsRequiredError(_id); + + _stops.RemoveAll(s => s.Id.Equals(stopId)); + _totalDistance = CalculateTotalDistance(); // Invariant maintained! + } +} +``` + +**Aggregate rules:** +- One root entity per aggregate +- External code accesses only through the root +- The root enforces all invariants +- Reference other aggregates by ID, not object +- Methods should operate on the same state—if they don't, split the aggregate + +--- + +## 8. Extract immutable value objects liberally + +**What:** When something is defined by its attributes (not identity), make it an immutable value object. Do this liberally—more value objects is usually better. + +**Why:** Value objects are simple. They can't change unexpectedly. They're easy to test. They make domain concepts explicit. They're also a good way to extract logic from aggregates and entities that can easily get large—keep entities focused by pulling cohesive concepts into value objects. + +**Test:** Does this need a unique ID to track it over time? No? It's probably a value object. + +```csharp +// Entity with primitives that should be a value object +public class Delivery +{ + public DeliveryId Id { get; } + public decimal FeeAmount { get; private set; } + public string FeeCurrency { get; private set; } +} + +// Extract the value object +public class Delivery +{ + public DeliveryId Id { get; } + public Money Fee { get; private set; } +} + +// Idiomatic C# value object in this codebase: +// readonly struct, IEquatable, static factory, validating ctor, override Equals/GetHashCode. +// (See Sources/Sales/Sales.Commons/Money.cs and PercentageDiscount.cs for the canonical shape.) +[DddValueObject] +public readonly struct Money : IEquatable +{ + public decimal Amount { get; } + public Currency Currency { get; } + + public static Money Of(decimal amount, Currency currency) => new(amount, currency); + + private Money(decimal amount, Currency currency) + { + if (amount < 0) throw new DomainError("Money amount cannot be negative"); + Amount = amount; + Currency = currency; + } + + public Money Add(Money other) + { + if (Currency != other.Currency) + throw new CurrencyMismatchError(Currency, other.Currency); + return new Money(Amount + other.Amount, Currency); + } + + public bool Equals(Money other) => Amount == other.Amount && Currency == other.Currency; + public override bool Equals(object? obj) => obj is Money other && Equals(other); + public override int GetHashCode() => HashCode.Combine(Amount, Currency); + public override string ToString() => $"{Amount} {Currency}"; +} +``` + +**Good candidates for value objects:** +- `Money`, `Currency`, `Percentage` +- `DateRange`, `TimeSlot`, `Duration` +- `Address`, `Coordinates`, `Distance` +- `EmailAddress`, `PhoneNumber`, `Url` +- `Quantity`, `Weight`, `Temperature`, `Precipitation` +- `PersonName`, `CompanyName` + +--- + +## 9. Repositories are for loading and saving full aggregates + +The job of a repository is to load and save entire aggregates - not partial aggregates or nested entities inside an aggregate. The `Load` method takes an ID and returns the full aggregate. + +A repository should not exist for a domain object that is not an aggregate. An entity that is part of an aggregate → does not have a repository. It is loaded via the aggregate root's repository. + +The `Hydrate` method is used ONLY for constructing an aggregate from its persisted state. It should not be abused for other use cases like creating new instances. Each creation flow should have a dedicated factory method, e.g. `Order.FromExisting()`, `Order.New()`, `Order.Draft()`. + +The `Save` method of a repository should take the full aggregate. + +If you just want to query information to display without modifying state and applying business rules, create a separate read model object and don't use a repository. + +```csharp +// ✅ RIGHT - repository for an aggregate root, factory methods for each creation scenario +public interface IDeliveryRepository +{ + Task Load(DeliveryId id); + Task Save(Delivery delivery); +} + +public class Delivery +{ + // Hydration: reconstruct from persistence — DO NOT use for new instances + public static Delivery Hydrate(DeliveryId id, DeliveryStatus status, IReadOnlyList stops, + Distance totalDistance) => new(id, status, stops, totalDistance); + + // Creation factories — one per use case: + public static Delivery Draft(Customer customer) => new(DeliveryId.New(), DeliveryStatus.Draft, + new List(), Distance.Zero()); + + public static Delivery FromQuote(Quote quote) => new(DeliveryId.New(), DeliveryStatus.Planning, + quote.Stops, quote.TotalDistance); +} +``` + +--- + +## Mandatory Checklist + +When designing, refactoring, analyzing, or reviewing code: + +1. [ ] Verify domain is isolated from infrastructure (no DB/HTTP/logging in domain; generic utilities in infra; domain doesn't `using` infra) +2. [ ] Verify names are from YOUR domain, not generic developer jargon (`Manager`, `Handler`, `Data`, `Process`) +3. [ ] Verify use cases are intentions of users, human or automated (apply the menu test) +4. [ ] Verify business logic lives in domain objects, use cases only orchestrate. **No `IsApplicable()` + `Apply()` split** — give the domain object one decision method that both checks and acts +5. [ ] Verify states are modeled as distinct types where appropriate (readonly-struct discriminated unions; see `Discount` in this codebase) +6. [ ] Verify hidden domain concepts are extracted and named explicitly (a `decimal precipitation` should usually become a `Precipitation` value object) +7. [ ] Verify aggregates are designed around invariants, not naive mapping of domain nouns +8. [ ] Verify values are extracted into value objects expressing a domain concept (`readonly struct`, `IEquatable`, static `Of(...)` factory, validating private ctor, override `Equals`/`GetHashCode`/`ToString`) +9. [ ] Verify no abuse of hydrate methods for creation scenarios. Each creation scenario must have a dedicated factory method (`Of`, `New`, `Draft`, `FromExisting`, ...) + +Do not proceed until all checks pass. diff --git a/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-weather-tuned/variant.toml b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-weather-tuned/variant.toml new file mode 100644 index 0000000..a252d59 --- /dev/null +++ b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd-weather-tuned/variant.toml @@ -0,0 +1,6 @@ +agent = "claude" +model = "claude-sonnet-4-6" + +# Repo-specific: this skill's examples reference the DDD-starter codebase's value +# objects, so it should ONLY run against the DDD-starter tasks. --all-variants honors this. +tasks = ["ddd-weather-discount", "ddd-threshold-discount"] diff --git a/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd/CLAUDE.md b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd/CLAUDE.md index 9348af0..3cf46b5 100644 --- a/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd/CLAUDE.md +++ b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd/CLAUDE.md @@ -2,6 +2,7 @@ ## Approach -1. Before writing any code, explore the existing codebase to understand its architecture and conventions. -2. Follow existing architectural patterns exactly — match naming, namespaces, directory structure, and code style. -3. Write tests that match the style of existing test files. +1. Begin by invoking the **tactical-ddd** skill (use the Skill tool) and apply its guidance throughout this task. +2. Before writing any code, explore the existing codebase to understand its architecture and conventions. +3. Follow existing architectural patterns exactly — match naming, namespaces, directory structure, and code style. +4. Write tests that match the style of existing test files. diff --git a/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd/skills/tactical-ddd/SKILL.md b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd/skills/tactical-ddd/SKILL.md index 5a206c5..359f7c7 100644 --- a/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd/skills/tactical-ddd/SKILL.md +++ b/examples/ddd-architectural-challenges/variants/claude-ntcoding-tactical-ddd/skills/tactical-ddd/SKILL.md @@ -1,11 +1,12 @@ - - --- name: tactical-ddd description: "Design, refactor, analyze, and review code by applying the principles and patterns of tactical domain-driven design. Triggers on: domain modeling, aggregate design, 'entity', 'value object', 'repository', 'bounded context', 'domain event', 'domain service', code touching domain/ directories, rich domain model discussions." version: 1.0.0 --- + + + # Tactical DDD Design, refactor, analyze, and review code by applying the principles and patterns of tactical domain-driven design. @@ -32,29 +33,32 @@ Design, refactor, analyze, and review code by applying the principles and patter **Test:** Could a domain expert read the code? Can the code be unit tested without mocks or spinning up databases? -```typescript +```csharp // ❌ WRONG - domain polluted with infrastructure -class Delivery { - async dispatch() { - this.logger.info('Dispatching delivery', { id: this.id }) // Infrastructure! - await this.db.beginTransaction() // Infrastructure! - if (this.status !== 'ready') throw new Error('Not ready') - this.status = 'dispatched' - await this.db.save(this) // Infrastructure! - await this.db.commit() // Infrastructure! - await this.pushNotification.notifyDriver() // Infrastructure! - } +class Delivery +{ + public async Task Dispatch() + { + _logger.LogInformation("Dispatching delivery {Id}", Id); // Infrastructure! + await _db.BeginTransactionAsync(); // Infrastructure! + if (_status != "ready") throw new Exception("Not ready"); + _status = "dispatched"; + await _db.SaveAsync(this); // Infrastructure! + await _db.CommitAsync(); // Infrastructure! + await _pushNotification.NotifyDriverAsync(); // Infrastructure! + } } // ✅ RIGHT - isolated domain logic -class Delivery { - dispatch(): void { - if (this.status !== DeliveryStatus.Ready) { - throw new DeliveryNotReadyError(this.id) +class Delivery +{ + public void Dispatch() + { + if (_status != DeliveryStatus.Ready) + throw new DeliveryNotReadyError(Id); + _status = DeliveryStatus.Dispatched; + _dispatchedAt = DateTime.UtcNow; } - this.status = DeliveryStatus.Dispatched - this.dispatchedAt = new Date() - } } ``` @@ -64,31 +68,34 @@ class Delivery { **What:** Names in code match exactly what domain experts say. No programmer jargon. No generic names. -**Why:** Translation between code-speak and business-speak causes bugs. When a domain expert says "assess a claim" and the code says "processEntity", someone will misunderstand something. +**Why:** Translation between code-speak and business-speak causes bugs. When a domain expert says "assess a claim" and the code says "ProcessEntity", someone will misunderstand something. **Test:** Would a domain expert recognize this name? If you'd need to translate it for them, it's wrong. **Common generic terms to watch for:** - `Manager`, `Handler`, `Processor`, `Helper`, `Util` - `Data`, `Info`, `Item` (when domain terms exist) -- `process`, `handle`, `execute` (what does it actually DO?) +- `Process`, `Handle`, `Execute` (what does it actually DO?) -```typescript +```csharp // ❌ WRONG - programmer jargon -class ClaimHandler { - processClaimData(claimData: ClaimDTO): ProcessingResult { - return this.claimProcessor.handle(claimData) - } +class ClaimHandler +{ + public ProcessingResult ProcessClaimData(ClaimDto claimData) + { + return _claimProcessor.Handle(claimData); + } } // ✅ RIGHT - domain language -class ClaimAssessor { - assessClaim(claim: InsuranceClaim): AssessmentDecision { - if (claim.exceedsCoverageLimit()) { - return AssessmentDecision.deny(DenialReason.ExceedsCoverage) +class ClaimAssessor +{ + public AssessmentDecision AssessClaim(InsuranceClaim claim) + { + if (claim.ExceedsCoverageLimit()) + return AssessmentDecision.Deny(DenialReason.ExceedsCoverage); + return AssessmentDecision.Approve(); } - return AssessmentDecision.approve() - } } ``` @@ -111,21 +118,23 @@ DELIVERY APP MENU: └── Check Delivery Radius ← NOT a use case: domain rule ``` -```typescript +```csharp // ❌ WRONG - not a user goal, this is internal machinery -// use-cases/calculate-eta.use-case.ts -async function calculateETA(deliveryId: DeliveryId) { - const delivery = await deliveryRepository.find(deliveryId) - const driver = await driverRepository.find(delivery.driverId) - return routeService.estimateArrival(driver.location, delivery.destination) +// UseCases/CalculateEtaUseCase.cs +public async Task CalculateEta(DeliveryId deliveryId) +{ + var delivery = await _deliveryRepository.Find(deliveryId); + var driver = await _driverRepository.Find(delivery.DriverId); + return _routeService.EstimateArrival(driver.Location, delivery.Destination); } // ✅ RIGHT - actual user goal (appears in menu) -// use-cases/cancel-delivery.use-case.ts -async function cancelDelivery(deliveryId: DeliveryId, reason: CancellationReason) { - const delivery = await deliveryRepository.find(deliveryId) - delivery.cancel(reason) - await deliveryRepository.save(delivery) +// UseCases/CancelDeliveryUseCase.cs +public async Task CancelDelivery(DeliveryId deliveryId, CancellationReason reason) +{ + var delivery = await _deliveryRepository.Find(deliveryId); + delivery.Cancel(reason); + await _deliveryRepository.Save(delivery); } ``` @@ -139,32 +148,32 @@ async function cancelDelivery(deliveryId: DeliveryId, reason: CancellationReason **Test:** Is your use case making business decisions, or just coordinating? If the use case contains if/else business logic, you likely have an anemic model. -```typescript +```csharp // ❌ WRONG - business logic in use case (anemic domain) -async function confirmDropoff(deliveryId: DeliveryId, photo: ProofPhoto) { - const delivery = await deliveryRepository.find(deliveryId) - - // Business rules leaked into use case! - if (delivery.status !== 'in_transit') { - throw new Error('Delivery not in transit') - } - if (!photo && delivery.requiresSignature) { - throw new Error('Proof of delivery required') - } - - delivery.status = 'delivered' - delivery.proofPhoto = photo - delivery.deliveredAt = new Date() - await deliveryRepository.save(delivery) +public async Task ConfirmDropoff(DeliveryId deliveryId, ProofPhoto photo) +{ + var delivery = await _deliveryRepository.Find(deliveryId); + + // Business rules leaked into use case! + if (delivery.Status != "in_transit") + throw new Exception("Delivery not in transit"); + if (photo == null && delivery.RequiresSignature) + throw new Exception("Proof of delivery required"); + + delivery.Status = "delivered"; + delivery.ProofPhoto = photo; + delivery.DeliveredAt = DateTime.UtcNow; + await _deliveryRepository.Save(delivery); } // ✅ RIGHT - use case orchestrates, domain decides -async function confirmDropoff(deliveryId: DeliveryId, photo: ProofPhoto) { - const delivery = await deliveryRepository.find(deliveryId) +public async Task ConfirmDropoff(DeliveryId deliveryId, ProofPhoto photo) +{ + var delivery = await _deliveryRepository.Find(deliveryId); - delivery.confirmDropoff(photo) // Domain enforces the rules + delivery.ConfirmDropoff(photo); // Domain enforces the rules - await deliveryRepository.save(delivery) + await _deliveryRepository.Save(delivery); } ``` @@ -184,53 +193,63 @@ async function confirmDropoff(deliveryId: DeliveryId, photo: ProofPhoto) { **Test:** Would this code exist in a completely different business domain? If yes, it's generic. If it's specific to YOUR business rules, it's domain. -```typescript +```csharp // ❌ WRONG - generic retry logic mixed with domain -// domain/driver-locator.ts -class DriverLocator { - // Generic retry logic does not belong in domain! - private async withRetry(fn: () => Promise, attempts: number): Promise { - for (let i = 0; i < attempts; i++) { - try { return await fn() } - catch (e) { if (i === attempts - 1) throw e } +// Domain/DriverLocator.cs +class DriverLocator +{ + // Generic retry logic does not belong in domain! + private async Task WithRetry(Func> fn, int attempts) + { + for (var i = 0; i < attempts; i++) + { + try { return await fn(); } + catch { if (i == attempts - 1) throw; } + } + throw new Exception("Retry failed"); } - throw new Error('Retry failed') - } - async findAvailableDriver(zone: Zone): Promise { - return this.withRetry(() => this.searchDriversInZone(zone), 3) - } + public Task FindAvailableDriver(Zone zone) + => WithRetry(() => SearchDriversInZone(zone), 3); - private async searchDriversInZone(zone: Zone): Promise { - // domain logic to find nearest available driver - } + private Task SearchDriversInZone(Zone zone) + { + // domain logic to find nearest available driver + } } // ✅ RIGHT - same behavior, properly separated -// infra/retry.ts (generic, reusable in any project) -export async function withRetry(fn: () => Promise, attempts: number): Promise { - for (let i = 0; i < attempts; i++) { - try { return await fn() } - catch (e) { if (i === attempts - 1) throw e } - } - throw new Error('Retry failed') +// Infrastructure/Retry.cs (generic, reusable in any project) +public static class Retry +{ + public static async Task WithRetry(Func> fn, int attempts) + { + for (var i = 0; i < attempts; i++) + { + try { return await fn(); } + catch { if (i == attempts - 1) throw; } + } + throw new Exception("Retry failed"); + } } -// domain/driver-locator.ts (pure domain, no infra imports) -class DriverLocator { - async findAvailableDriver(zone: Zone): Promise { - // domain logic to find nearest available driver - } +// Domain/DriverLocator.cs (pure domain, no infra imports) +class DriverLocator +{ + public Task FindAvailableDriver(Zone zone) + { + // domain logic to find nearest available driver + } } -// use-cases/dispatch-delivery.ts (orchestrates domain + infra) -async function dispatchDelivery(deliveryId: DeliveryId) { - const delivery = await deliveryRepository.find(deliveryId) - const driver = await withRetry( - () => driverLocator.findAvailableDriver(delivery.zone), 3 - ) - delivery.assignDriver(driver) - await deliveryRepository.save(delivery) +// UseCases/DispatchDeliveryUseCase.cs (orchestrates domain + infra) +public async Task DispatchDelivery(DeliveryId deliveryId) +{ + var delivery = await _deliveryRepository.Find(deliveryId); + var driver = await Retry.WithRetry( + () => _driverLocator.FindAvailableDriver(delivery.Zone), 3); + delivery.AssignDriver(driver); + await _deliveryRepository.Save(delivery); } ``` @@ -244,117 +263,106 @@ async function dispatchDelivery(deliveryId: DeliveryId) { **Test:** Could you discuss this code with a domain expert without translation? Are there concepts they use that don't exist in your code? -```typescript +```csharp // This code looks fine - isolated, uses domain terms -class Delivery { - status: DeliveryStatus - driver: Driver | null - pickupTime: Date | null - dropoffTime: Date | null - proofOfDelivery: Photo | null - - assignDriver(driver: Driver): void { - if (this.status !== DeliveryStatus.Confirmed) throw new Error('...') - this.driver = driver - this.status = DeliveryStatus.Assigned - } - - recordPickup(): void { - if (this.status !== DeliveryStatus.Assigned) throw new Error('...') - this.pickupTime = new Date() - this.status = DeliveryStatus.InTransit - } - - recordDropoff(photo: Photo): void { - if (this.status !== DeliveryStatus.InTransit) throw new Error('...') - this.proofOfDelivery = photo - this.dropoffTime = new Date() - this.status = DeliveryStatus.Delivered - } -} - -// But the TYPES can describe the domain! Each state is a distinct concept. -// Reading the types alone tells you how deliveries work. - -type Delivery = - | RequestedDelivery // Customer placed request - | ConfirmedDelivery // Restaurant accepted - | AssignedDelivery // Driver assigned, heading to restaurant - | InTransitDelivery // Driver picked up, heading to customer - | DeliveredDelivery // Complete with proof - -interface RequestedDelivery { - kind: 'requested' - customer: Customer - restaurant: Restaurant - items: MenuItem[] -} +class Delivery +{ + public DeliveryStatus Status { get; private set; } + public Driver? Driver { get; private set; } + public DateTime? PickupTime { get; private set; } + public DateTime? DropoffTime { get; private set; } + public Photo? ProofOfDelivery { get; private set; } + + public void AssignDriver(Driver driver) + { + if (Status != DeliveryStatus.Confirmed) throw new Exception("..."); + Driver = driver; + Status = DeliveryStatus.Assigned; + } -interface ConfirmedDelivery { - kind: 'confirmed' - customer: Customer - restaurant: Restaurant - items: MenuItem[] - estimatedPrepTime: Duration -} + public void RecordPickup() + { + if (Status != DeliveryStatus.Assigned) throw new Exception("..."); + PickupTime = DateTime.UtcNow; + Status = DeliveryStatus.InTransit; + } -interface AssignedDelivery { - kind: 'assigned' - customer: Customer - restaurant: Restaurant - items: MenuItem[] - driver: Driver // Now guaranteed to exist - estimatedPickup: Time + public void RecordDropoff(Photo photo) + { + if (Status != DeliveryStatus.InTransit) throw new Exception("..."); + ProofOfDelivery = photo; + DropoffTime = DateTime.UtcNow; + Status = DeliveryStatus.Delivered; + } } -interface InTransitDelivery { - kind: 'in_transit' - customer: Customer - restaurant: Restaurant - items: MenuItem[] - driver: Driver - pickupTime: Time // Now guaranteed to exist - estimatedDropoff: Time -} +// But the TYPES can describe the domain! Each state is a distinct concept. +// Reading the types alone tells you how deliveries work. -interface DeliveredDelivery { - kind: 'delivered' - customer: Customer - restaurant: Restaurant - items: MenuItem[] - driver: Driver - pickupTime: Time - dropoffTime: Time // Now guaranteed to exist - proofOfDelivery: Photo // Now guaranteed to exist -} +abstract record Delivery; + +record RequestedDelivery( // Customer placed request + Customer Customer, + Restaurant Restaurant, + IReadOnlyList Items) : Delivery; + +record ConfirmedDelivery( // Restaurant accepted + Customer Customer, + Restaurant Restaurant, + IReadOnlyList Items, + Duration EstimatedPrepTime) : Delivery; + +record AssignedDelivery( // Driver assigned, heading to restaurant + Customer Customer, + Restaurant Restaurant, + IReadOnlyList Items, + Driver Driver, // Now guaranteed to exist + Time EstimatedPickup) : Delivery; + +record InTransitDelivery( // Driver picked up, heading to customer + Customer Customer, + Restaurant Restaurant, + IReadOnlyList Items, + Driver Driver, + Time PickupTime, // Now guaranteed to exist + Time EstimatedDropoff) : Delivery; + +record DeliveredDelivery( // Complete with proof + Customer Customer, + Restaurant Restaurant, + IReadOnlyList Items, + Driver Driver, + Time PickupTime, + Time DropoffTime, // Now guaranteed to exist + Photo ProofOfDelivery) : Delivery; // State transitions are explicit functions -function confirmDelivery(d: RequestedDelivery, prepTime: Duration): ConfirmedDelivery -function assignDriver(d: ConfirmedDelivery, driver: Driver): AssignedDelivery -function recordPickup(d: AssignedDelivery): InTransitDelivery -function recordDropoff(d: InTransitDelivery, photo: Photo): DeliveredDelivery +ConfirmedDelivery ConfirmDelivery(RequestedDelivery d, Duration prepTime); +AssignedDelivery AssignDriver(ConfirmedDelivery d, Driver driver); +InTransitDelivery RecordPickup(AssignedDelivery d); +DeliveredDelivery RecordDropoff(InTransitDelivery d, Photo photo); ``` **Smaller improvements matter too:** -```typescript +```csharp // Extract an if statement to a named method -if (distance.kilometers > 10 && !driver.hasLongRangeVehicle) { ... } -if (delivery.exceedsDriverRange(driver)) { ... } +if (distance.Kilometers > 10 && !driver.HasLongRangeVehicle) { ... } +if (delivery.ExceedsDriverRange(driver)) { ... } // Name a boolean expression -const canAssign = driver.isAvailable && driver.isInZone(delivery.zone) && !driver.atCapacity -const canAssign = driver.canAccept(delivery) +var canAssign = driver.IsAvailable && driver.IsInZone(delivery.Zone) && !driver.AtCapacity; +var canAssign = driver.CanAccept(delivery); // Rename to use domain language -const fee = customFee ?? standardFee -const fee = customFee ?? defaultDeliveryFee +var fee = customFee ?? standardFee; +var fee = customFee ?? defaultDeliveryFee; ``` **Ways to increase expressiveness:** - Model states as distinct types (Delivery with status → RequestedDelivery, ConfirmedDelivery, etc.) -- Make optional fields guaranteed at the right state (driver: Driver | null → driver: Driver) -- Extract conditionals to named methods (complex if → exceedsDriverRange) +- Make optional fields guaranteed at the right state (`Driver? Driver` → `Driver Driver`) +- Extract conditionals to named methods (complex if → ExceedsDriverRange) - Rename variables to use domain language (standardFee → defaultDeliveryFee) --- @@ -367,46 +375,47 @@ const fee = customFee ?? defaultDeliveryFee **Test:** What must be true at all times? What rules must never be broken? The objects involved in those rules form an aggregate. -```typescript +```csharp // ❌ WRONG - no aggregate boundary, invariants violated -class Delivery { - stops: DeliveryStop[] // Exposed! - totalDistance: Distance +class Delivery +{ + public List Stops; // Exposed! + public Distance TotalDistance; } // External code can break invariants -delivery.stops.push(new DeliveryStop(location)) -// Oops - totalDistance is now wrong! +delivery.Stops.Add(new DeliveryStop(location)); +// Oops - TotalDistance is now wrong! // ✅ RIGHT - aggregate protects invariants -class Delivery { - private stops: DeliveryStop[] = [] - private _totalDistance: Distance = Distance.zero() - - addStop(location: Location): void { - if (this.status !== DeliveryStatus.Planning) { - throw new DeliveryNotModifiableError(this.id) +class Delivery +{ + private readonly List _stops = new(); + private Distance _totalDistance = Distance.Zero(); + + public void AddStop(Location location) + { + if (_status != DeliveryStatus.Planning) + throw new DeliveryNotModifiableError(Id); + + var previousStop = _stops[^1]; + var stop = new DeliveryStop(location); + _stops.Add(stop); + _totalDistance = _totalDistance.Add( + previousStop.DistanceTo(location)); // Invariant maintained! } - const previousStop = this.stops[this.stops.length - 1] - const stop = new DeliveryStop(location) - this.stops.push(stop) - this._totalDistance = this._totalDistance.add( - previousStop.distanceTo(location) // Invariant maintained! - ) - } - - removeStop(stopId: StopId): void { - if (this.stops.length <= 2) { - throw new MinimumStopsRequiredError(this.id) + + public void RemoveStop(StopId stopId) + { + if (_stops.Count <= 2) + throw new MinimumStopsRequiredError(Id); + + // Recalculate total distance after removal + _stops.RemoveAll(s => s.Id.Equals(stopId)); + _totalDistance = CalculateTotalDistance(); // Invariant maintained! } - // Recalculate total distance after removal - this.stops = this.stops.filter(s => !s.id.equals(stopId)) - this._totalDistance = this.calculateTotalDistance() // Invariant maintained! - } - - get totalDistance(): Distance { - return this._totalDistance - } + + public Distance TotalDistance => _totalDistance; } ``` @@ -427,36 +436,42 @@ class Delivery { **Test:** Does this need a unique ID to track it over time? No? It's probably a value object. -```typescript +```csharp // Entity with primitives that should be a value object -class Delivery { - id: DeliveryId - feeAmount: number - feeCurrency: string +class Delivery +{ + public DeliveryId Id; + public decimal FeeAmount; + public string FeeCurrency; } // Extract the value object -class Delivery { - id: DeliveryId - fee: Money +class Delivery +{ + public DeliveryId Id; + public Money Fee; } -class Money { - constructor( - readonly amount: number, - readonly currency: Currency - ) {} +class Money +{ + public decimal Amount { get; } + public Currency Currency { get; } + + public Money(decimal amount, Currency currency) + { + Amount = amount; + Currency = currency; + } - add(other: Money): Money { - if (this.currency !== other.currency) { - throw new CurrencyMismatchError(this.currency, other.currency) + public Money Add(Money other) + { + if (Currency != other.Currency) + throw new CurrencyMismatchError(Currency, other.Currency); + return new Money(Amount + other.Amount, Currency); } - return new Money(this.amount + other.amount, this.currency) - } - equals(other: Money): boolean { - return this.amount === other.amount && this.currency === other.currency - } + public bool Equals(Money other) + => Amount == other.Amount && Currency == other.Currency; } ``` @@ -476,7 +491,7 @@ The job of a repository is to load and save entire aggregates - not partial aggr A repository should not exist for a domain object that is not an aggregate. Entity that is part of an aggreate -> does not have a repository. It is loaded via the aggregate root's repository. -The `hydrate` method is used ONLY for constructing an aggregate from it's persisted state. It should not be abused for other use cases like creating new instances. Each creation flow should have a dedicated factory method, e.g. `Order.fromExisting()`, `Order.new()`, `Order.draft()`. +The `hydrate` method is used ONLY for constructing an aggregate from it's persisted state. It should not be abused for other use cases like creating new instances. Each creation flow should have a dedicated factory method, e.g. `Order.FromExisting()`, `Order.New()`, `Order.Draft()`. The `save` method of a repository should take the full aggregate. diff --git a/examples/ddd-architectural-challenges/variants/codex-ntcoding-tactical-ddd/AGENTS.md b/examples/ddd-architectural-challenges/variants/codex-ntcoding-tactical-ddd/AGENTS.md index 9348af0..306feff 100644 --- a/examples/ddd-architectural-challenges/variants/codex-ntcoding-tactical-ddd/AGENTS.md +++ b/examples/ddd-architectural-challenges/variants/codex-ntcoding-tactical-ddd/AGENTS.md @@ -2,6 +2,7 @@ ## Approach -1. Before writing any code, explore the existing codebase to understand its architecture and conventions. -2. Follow existing architectural patterns exactly — match naming, namespaces, directory structure, and code style. -3. Write tests that match the style of existing test files. +1. Begin by consulting the **tactical-ddd** skill and apply its guidance throughout this task. +2. Before writing any code, explore the existing codebase to understand its architecture and conventions. +3. Follow existing architectural patterns exactly — match naming, namespaces, directory structure, and code style. +4. Write tests that match the style of existing test files. diff --git a/examples/ddd-architectural-challenges/variants/gemini-ntcoding-tactical-ddd/GEMINI.md b/examples/ddd-architectural-challenges/variants/gemini-ntcoding-tactical-ddd/GEMINI.md index 9348af0..306feff 100644 --- a/examples/ddd-architectural-challenges/variants/gemini-ntcoding-tactical-ddd/GEMINI.md +++ b/examples/ddd-architectural-challenges/variants/gemini-ntcoding-tactical-ddd/GEMINI.md @@ -2,6 +2,7 @@ ## Approach -1. Before writing any code, explore the existing codebase to understand its architecture and conventions. -2. Follow existing architectural patterns exactly — match naming, namespaces, directory structure, and code style. -3. Write tests that match the style of existing test files. +1. Begin by consulting the **tactical-ddd** skill and apply its guidance throughout this task. +2. Before writing any code, explore the existing codebase to understand its architecture and conventions. +3. Follow existing architectural patterns exactly — match naming, namespaces, directory structure, and code style. +4. Write tests that match the style of existing test files. diff --git a/src/nasde_toolkit/cli.py b/src/nasde_toolkit/cli.py index 2007dd8..8c6545d 100644 --- a/src/nasde_toolkit/cli.py +++ b/src/nasde_toolkit/cli.py @@ -261,17 +261,33 @@ def run( else: assert variant is not None resolved_variant = variant - from nasde_toolkit.runner import _resolve_model, load_variant_agent_type, resolve_variant_dir + from nasde_toolkit.runner import ( + _resolve_model, + load_variant_agent_type, + resolve_variant_dir, + scope_tasks_for_variant, + ) variant_dir = resolve_variant_dir(config.project_dir, resolved_variant) agent_type = load_variant_agent_type(variant_dir) display_model = _resolve_model(resolved_model, variant_dir, config) + base_task_names = [t.name for t in config.tasks] + if tasks_filter: + base_task_names = [n for n in base_task_names if n in set(tasks_filter)] + scoped_tasks = scope_tasks_for_variant(variant_dir, base_task_names, tasks_filter) + if not scoped_tasks: + console.print( + f"[red]ERROR: variant '{resolved_variant}' is task-scoped (variant.toml `tasks`) " + f"and none of the requested tasks fall within its scope.[/red]" + ) + raise typer.Exit(1) + _print_run_header( variant=resolved_variant, model=display_model, timeout=resolved_timeout, - tasks_filter=tasks_filter, + tasks_filter=scoped_tasks, with_opik=with_opik, with_eval=not without_eval, harbor_env=resolved_harbor_env, @@ -285,7 +301,7 @@ def run( variant=resolved_variant, model=resolved_model, timeout_sec=resolved_timeout, - tasks_filter=tasks_filter, + tasks_filter=scoped_tasks, with_opik=with_opik, with_eval=not without_eval, harbor_env=resolved_harbor_env, @@ -428,29 +444,32 @@ def _confirm_multi_variant_run( print_banner(console) + from nasde_toolkit.runner import resolve_variant_dir, scope_tasks_for_variant + task_names = [t.name for t in config.tasks] if tasks_filter: task_names = [n for n in task_names if n in set(tasks_filter)] - n_variants = len(variants) - n_tasks = len(task_names) - total_trials = n_variants * n_tasks * attempts - table = Table(title="Multi-Variant Run") - table.add_column("Variants", style="cyan") + table.add_column("Variant", style="cyan") table.add_column("Tasks", style="green") - table.add_column("Attempts", justify="right") - table.add_column("Total trials", justify="right", style="bold") - table.add_row( - ", ".join(variants), - ", ".join(task_names) if task_names else "(none)", - str(attempts), - str(total_trials), - ) + table.add_column("Trials", justify="right", style="bold") + + total_trials = 0 + for variant in variants: + variant_dir = resolve_variant_dir(config.project_dir, variant) + scoped = scope_tasks_for_variant(variant_dir, task_names, tasks_filter) + n = len(scoped) * attempts + total_trials += n + table.add_row( + variant, + ", ".join(scoped) if scoped else "[yellow](skipped — task-scoped)[/yellow]", + str(n), + ) console.print(table) typer.confirm( - f"Run {total_trials} trials ({n_variants} variants x {n_tasks} tasks x {attempts} attempts)?", + f"Run {total_trials} trials across {len(variants)} variants (x {attempts} attempts each)?", abort=True, ) @@ -469,19 +488,29 @@ async def _run_all_variants( max_concurrent_eval: int = 10, ) -> None: - from nasde_toolkit.runner import run_benchmark + from nasde_toolkit.runner import resolve_variant_dir, run_benchmark, scope_tasks_for_variant results: list[tuple[str, str, str]] = [] + all_task_names = [t.name for t in config.tasks] + base_tasks = [t for t in all_task_names if t in set(tasks_filter)] if tasks_filter else all_task_names for i, variant_name in enumerate(variants, 1): console.print(f"\n[bold]=== Variant {i}/{len(variants)}: {variant_name} ===[/bold]\n") + variant_dir = resolve_variant_dir(config.project_dir, variant_name) + scoped_tasks = scope_tasks_for_variant(variant_dir, base_tasks, tasks_filter) + if not scoped_tasks: + console.print( + f"[yellow]SKIP: variant '{variant_name}' is task-scoped and none of its tasks are in this run.[/yellow]" + ) + results.append((variant_name, "[yellow]SKIPPED[/yellow]", "task-scoped, no matching task")) + continue try: await run_benchmark( config=config, variant=variant_name, model=model, timeout_sec=timeout_sec, - tasks_filter=tasks_filter, + tasks_filter=scoped_tasks, with_opik=with_opik, with_eval=with_eval, harbor_env=harbor_env, diff --git a/src/nasde_toolkit/config.py b/src/nasde_toolkit/config.py index 699cbd6..86da043 100644 --- a/src/nasde_toolkit/config.py +++ b/src/nasde_toolkit/config.py @@ -56,7 +56,7 @@ class EvaluationConfig: backend: str = "claude" model: str = "claude-opus-4-7" dimensions_file: str = "assessment_dimensions.json" - max_turns: int = 30 + max_turns: int = 60 allowed_tools: list[str] | None = None mcp_config: str | None = None skills_dir: str | None = None @@ -149,7 +149,7 @@ def _parse_toml(raw: dict, project_dir: Path) -> ProjectConfig: backend=eval_raw.get("backend", "claude"), model=eval_raw.get("model", "claude-opus-4-7"), dimensions_file=eval_raw.get("dimensions_file", "assessment_dimensions.json"), - max_turns=eval_raw.get("max_turns", 30), + max_turns=eval_raw.get("max_turns", 60), allowed_tools=eval_raw.get("allowed_tools"), mcp_config=eval_raw.get("mcp_config"), skills_dir=eval_raw.get("skills_dir"), diff --git a/src/nasde_toolkit/runner.py b/src/nasde_toolkit/runner.py index f015572..a239751 100644 --- a/src/nasde_toolkit/runner.py +++ b/src/nasde_toolkit/runner.py @@ -386,6 +386,45 @@ def load_variant_agent_type(variant_dir: Path) -> str: return agent_type +def load_variant_task_scope(variant_dir: Path) -> list[str] | None: + """Read the optional ``tasks`` task-scope list from variant.toml. + + A repo-specific variant (e.g. a skill whose examples reference one repo's + conventions) declares the tasks it is meant to run against: + + tasks = ["csharp-anemic-to-rich-domain"] + + Returns the declared task names, or ``None`` when the variant is unscoped + (applies to every task). An empty list is treated as unscoped. + """ + scope = load_variant_config(variant_dir).get("tasks") + if not scope: + return None + if not isinstance(scope, list) or not all(isinstance(t, str) for t in scope): + console.print(f"[red]ERROR: variant.toml 'tasks' must be a list of task names, got: {scope!r}[/red]") + raise SystemExit(1) + return scope + + +def scope_tasks_for_variant( + variant_dir: Path, + task_names: list[str], + explicit_tasks_filter: list[str] | None, +) -> list[str]: + """Intersect a variant's task-scope with the run's task list. + + ``task_names`` are the tasks that would otherwise run (already narrowed by + any ``--tasks`` filter). If the variant declares a ``tasks`` scope, keep + only the tasks in that scope. An explicit ``--tasks`` filter that names a + task outside the variant's scope is dropped (the scope wins) so that + ``--all-variants`` never runs a repo-specific variant against the wrong repo. + """ + scope = load_variant_task_scope(variant_dir) + if scope is None: + return task_names + return [t for t in task_names if t in set(scope)] + + def _agent_import_path(agent_type: str) -> str: if agent_type == "codex": return "nasde_toolkit.agents.configurable_codex:ConfigurableCodex" diff --git a/src/nasde_toolkit/scaffold/__init__.py b/src/nasde_toolkit/scaffold/__init__.py index 6edcdea..c08c722 100644 --- a/src/nasde_toolkit/scaffold/__init__.py +++ b/src/nasde_toolkit/scaffold/__init__.py @@ -95,6 +95,10 @@ VARIANT_TOML_TEMPLATE = """\ agent = "claude" model = "claude-sonnet-4-6" + +# Optional: restrict this variant to specific tasks (e.g. a repo-specific skill). +# Omit for a general-purpose variant that runs against every task. +# tasks = ["my-benchmark/task-name"] """ GITIGNORE_TEMPLATE = """\ diff --git a/tests/test_cli.py b/tests/test_cli.py index 32bd89b..5059b85 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -98,6 +98,40 @@ def test_all_variants_continues_on_failure(mock_run: AsyncMock, benchmark_projec assert "OK" in result.output +@patch("nasde_toolkit.runner.run_benchmark", new_callable=AsyncMock) +def test_single_variant_task_scope_aborts_when_no_task_in_scope(mock_run: AsyncMock, benchmark_project: Path) -> None: + scoped = benchmark_project / "variants" / "scoped" + scoped.mkdir(parents=True) + (scoped / "CLAUDE.md").write_text("# scoped") + (scoped / "variant.toml").write_text( + 'agent = "claude"\nmodel = "claude-sonnet-4-6"\ntasks = ["nasde/other-task"]\n' + ) + result = runner.invoke( + app, + ["run", "--variant", "scoped", "-C", str(benchmark_project)], + ) + assert result.exit_code != 0 + assert "task-scoped" in result.output.lower() + mock_run.assert_not_awaited() + + +@patch("nasde_toolkit.runner.run_benchmark", new_callable=AsyncMock) +def test_single_variant_task_scope_passes_only_scoped_tasks(mock_run: AsyncMock, benchmark_project: Path) -> None: + scoped = benchmark_project / "variants" / "scoped" + scoped.mkdir(parents=True) + (scoped / "CLAUDE.md").write_text("# scoped") + # task names are stored stripped of Harbor's "org/" prefix (nasde/sample -> sample), + # so the scope must reference the stripped name — same as the --tasks filter does. + (scoped / "variant.toml").write_text('agent = "claude"\nmodel = "claude-sonnet-4-6"\ntasks = ["sample"]\n') + result = runner.invoke( + app, + ["run", "--variant", "scoped", "-C", str(benchmark_project)], + ) + assert result.exit_code == 0, result.output + assert mock_run.await_count == 1 + assert mock_run.call_args.kwargs["tasks_filter"] == ["sample"] + + @patch("nasde_toolkit.evaluator.evaluate_job", new_callable=AsyncMock) def test_eval_command_forwards_evaluation_config(mock_evaluate_job: AsyncMock, tmp_path: Path) -> None: (tmp_path / "nasde.toml").write_text( diff --git a/tests/test_config.py b/tests/test_config.py index 0b72ecf..6cf7279 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -68,7 +68,7 @@ def test_evaluation_defaults_when_section_absent(tmp_path: Path) -> None: config = load_project_config(tmp_path) assert config.evaluation.model == "claude-opus-4-7" assert config.evaluation.dimensions_file == "assessment_dimensions.json" - assert config.evaluation.max_turns == 30 + assert config.evaluation.max_turns == 60 assert config.evaluation.allowed_tools is None assert config.evaluation.mcp_config is None assert config.evaluation.skills_dir is None diff --git a/tests/test_runner.py b/tests/test_runner.py index 8ed4c16..8f29764 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -23,6 +23,8 @@ _is_gemini_agent, collect_available_variants, load_variant_agent_type, + load_variant_task_scope, + scope_tasks_for_variant, ) @@ -199,6 +201,49 @@ def test_load_variant_agent_type_missing_field_raises(tmp_path: Path) -> None: load_variant_agent_type(tmp_path) +# --------------------------------------------------------------------------- +# variant task-scope +# --------------------------------------------------------------------------- + + +def test_load_variant_task_scope_absent_is_none(tmp_path: Path) -> None: + (tmp_path / "variant.toml").write_text('agent = "claude"') + assert load_variant_task_scope(tmp_path) is None + + +def test_load_variant_task_scope_empty_list_is_none(tmp_path: Path) -> None: + (tmp_path / "variant.toml").write_text('agent = "claude"\ntasks = []') + assert load_variant_task_scope(tmp_path) is None + + +def test_load_variant_task_scope_returns_list(tmp_path: Path) -> None: + (tmp_path / "variant.toml").write_text('agent = "claude"\ntasks = ["a", "b"]') + assert load_variant_task_scope(tmp_path) == ["a", "b"] + + +def test_load_variant_task_scope_non_string_entries_raise(tmp_path: Path) -> None: + (tmp_path / "variant.toml").write_text('agent = "claude"\ntasks = [1, 2]') + with pytest.raises(SystemExit): + load_variant_task_scope(tmp_path) + + +def test_scope_tasks_unscoped_variant_keeps_all(tmp_path: Path) -> None: + (tmp_path / "variant.toml").write_text('agent = "claude"') + assert scope_tasks_for_variant(tmp_path, ["a", "b", "c"], None) == ["a", "b", "c"] + + +def test_scope_tasks_scoped_variant_intersects(tmp_path: Path) -> None: + (tmp_path / "variant.toml").write_text('agent = "claude"\ntasks = ["b"]') + assert scope_tasks_for_variant(tmp_path, ["a", "b", "c"], None) == ["b"] + + +def test_scope_tasks_scope_wins_over_explicit_filter(tmp_path: Path) -> None: + # Even if the user asked for task "a" via --tasks, a variant scoped to "b" + # only runs "b" — and here "b" was filtered out, so nothing runs. + (tmp_path / "variant.toml").write_text('agent = "claude"\ntasks = ["b"]') + assert scope_tasks_for_variant(tmp_path, ["a"], ["a"]) == [] + + # --------------------------------------------------------------------------- # _is_codex_agent # --------------------------------------------------------------------------- diff --git a/uv.lock b/uv.lock index c2d7506..79367be 100644 --- a/uv.lock +++ b/uv.lock @@ -2513,15 +2513,15 @@ wheels = [ [[package]] name = "starlette" -version = "1.0.0" +version = "1.1.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/81/69/17425771797c36cded50b7fe44e850315d039f28b15901ab44839e70b593/starlette-1.0.0.tar.gz", hash = "sha256:6a4beaf1f81bb472fd19ea9b918b50dc3a77a6f2e190a12954b25e6ed5eea149", size = 2655289, upload-time = "2026-03-22T18:29:46.779Z" } +sdist = { url = "https://files.pythonhosted.org/packages/95/66/4d20cdf39a8d6a51e663b7038e3b828ff211d3891a43a713fe7e4643f3a8/starlette-1.1.0.tar.gz", hash = "sha256:e83c7fe0ddecd8719c5b840080325aec0260acec86e9832899e377b91d65e90f", size = 2660060, upload-time = "2026-05-23T16:55:41.376Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/0b/c9/584bc9651441b4ba60cc4d557d8a547b5aff901af35bda3a4ee30c819b82/starlette-1.0.0-py3-none-any.whl", hash = "sha256:d3ec55e0bb321692d275455ddfd3df75fff145d009685eb40dc91fc66b03d38b", size = 72651, upload-time = "2026-03-22T18:29:45.111Z" }, + { url = "https://files.pythonhosted.org/packages/93/79/920b8e0a8b20f793e8d64855095cb8febabf6175b8550b6f7a547d813891/starlette-1.1.0-py3-none-any.whl", hash = "sha256:7f0dfd38e428aad5cb6f9f667f0ca1d2d8ca3f3385dccac8305f79ec98458382", size = 72899, upload-time = "2026-05-23T16:55:39.201Z" }, ] [[package]]