From e08edc986411dd3223b18d1534cf215259431c7c Mon Sep 17 00:00:00 2001 From: Jake Goldberg Date: Sun, 7 Jun 2026 17:40:50 -0400 Subject: [PATCH 1/5] Delete .github/CODEOWNERS --- .github/CODEOWNERS | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 .github/CODEOWNERS diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS deleted file mode 100644 index ab325fc..0000000 --- a/.github/CODEOWNERS +++ /dev/null @@ -1,17 +0,0 @@ -#cpp code owners -cpp/ @/-cpp-codeowners - -#python code owners -python/ @/-python-codeowners - -#cmake code owners -**/CMakeLists.txt @/-cmake-codeowners -**/cmake/ @/-cmake-codeowners - -#build/ops code owners -.github/ @/ops-codeowners -ci/ @/ops-codeowners -conda/ @/ops-codeowners -**/Dockerfile @/ops-codeowners -**/.dockerignore @/ops-codeowners -docker/ @/ops-codeowners From 0e569adfcd66e36b1d1fba6f61d3b767087bb04b Mon Sep 17 00:00:00 2001 From: Jake Goldberg Date: Sun, 7 Jun 2026 17:43:43 -0400 Subject: [PATCH 2/5] Update SKILL.md --- skills/cufolio/SKILL.md | 1 - 1 file changed, 1 deletion(-) diff --git a/skills/cufolio/SKILL.md b/skills/cufolio/SKILL.md index 0c63aae..e4d9918 100644 --- a/skills/cufolio/SKILL.md +++ b/skills/cufolio/SKILL.md @@ -1,6 +1,5 @@ --- name: cufolio -version: "25.10.00" description: Use when a user asks to build, optimize, backtest, rebalance, or analyze a stock portfolio with Mean-CVaR, efficient frontiers, scenario generation, or NVIDIA cuOpt. license: Apache-2.0 metadata: From 7886826e4ec4a0dc83e5b228cebe50d74a8f4775 Mon Sep 17 00:00:00 2001 From: Jake Goldberg Date: Thu, 11 Jun 2026 14:14:11 +0000 Subject: [PATCH 3/5] Fix rebalance benchmark crash on string-typed price index run_rebalance called .date() on prices.index elements, which are plain 'YYYY-MM-DD' strings (dtype object), so the rebalance Layer 3 benchmark errored before cufolio was invoked. Wrap with pd.Timestamp(...) so it works for both string and Timestamp indices. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/benchmarks/benchmark_workflows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/benchmarks/benchmark_workflows.py b/tests/benchmarks/benchmark_workflows.py index 0e68688..d16162a 100644 --- a/tests/benchmarks/benchmark_workflows.py +++ b/tests/benchmarks/benchmark_workflows.py @@ -294,8 +294,8 @@ def run_rebalance(prices: pd.DataFrame) -> dict: fit_type="kde", kde_settings=KDESettings(device="GPU"), ), - trading_start=str(index[start_pos].date()), - trading_end=str(index[end_pos].date()), + trading_start=str(pd.Timestamp(index[start_pos]).date()), + trading_end=str(pd.Timestamp(index[end_pos]).date()), look_forward_window=look_forward, look_back_window=look_back, cvar_params=_full_invested_params(), From 1055daf5255c6fb8aa0e1782fc2b1fe5cef91f55 Mon Sep 17 00:00:00 2001 From: Jake Goldberg Date: Thu, 11 Jun 2026 14:14:11 +0000 Subject: [PATCH 4/5] Split cufolio evals into P0 CI gate (4 cases) + full nightly set The publish-gate agent eval (claude-code,codex x 2 attempts x with/without arms = ~8 pods/case) overran the ~1h NV-CARPS CI cap at the full set (run died at 12/18 tasks). Shrink the CI-gated evals.json to a 4-case P0 subset (build-optimal-cvar, efficient-frontier-plot, neg-vehicle-routing, neg-nn-price-forecast) and preserve the full set in evals-full.json for the longer nightly/manual run. Document the split in EVAL.md and skill-card.md. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- skills/cufolio/evals/EVAL.md | 14 ++- skills/cufolio/evals/evals-full.json | 127 +++++++++++++++++++++++++++ skills/cufolio/evals/evals.json | 69 --------------- skills/cufolio/skill-card.md | 2 +- 4 files changed, 141 insertions(+), 71 deletions(-) create mode 100644 skills/cufolio/evals/evals-full.json diff --git a/skills/cufolio/evals/EVAL.md b/skills/cufolio/evals/EVAL.md index 31c0ccc..4ad6b12 100644 --- a/skills/cufolio/evals/EVAL.md +++ b/skills/cufolio/evals/EVAL.md @@ -19,6 +19,15 @@ described in `tests/benchmarks/benchmark_workflows.py` / `tests/benchmarks/thres ## Dataset +There are two datasets, same schema: + +- `evals.json` — the **CI publish-gate set (P0, 4 cases)**: 2 positives + (`build-optimal-cvar`, `efficient-frontier-plot`) + 2 strong negatives + (`neg-vehicle-routing`, `neg-nn-price-forecast`). Sized to finish inside the + ~1h NV-CARPS CI cap (see Notes). +- `evals-full.json` — the **full set (9 cases)**: all positives and negatives, + run on the nightly/manual job (longer timeout) for the published catalog benchmark. + `evals.json` follows the NV-BASE / agentskills.io eval format. Each case has: - `id` — unique identifier @@ -58,7 +67,10 @@ Discoverability, Effectiveness, Efficiency). Paste/auto-fill the results into `. ## Notes - Keep this CI-gated set small (P0). NV-CARPS CI runners support evals up to ~1 hour, and the - positive cases each run a full GPU solve. + positive cases each run a full GPU solve. The publish gate runs `evals.json` (4 cases); the + full `evals-full.json` (9 cases) is for the longer nightly/manual run. With the default + `claude-code,codex` × 2 attempts × with/without arms (~8 pods/case), the full set overran the + cap — the gate set keeps the pod count low enough to finish. - The positive cases download S&P 500 prices on first run. 
If a sandboxed runner has no network, use the guide's `evals/files/` mechanism to stage a small price CSV (not shipped here — the eval host is expected to install `cufolio` and have network/data access). diff --git a/skills/cufolio/evals/evals-full.json b/skills/cufolio/evals/evals-full.json new file mode 100644 index 0000000..7bff793 --- /dev/null +++ b/skills/cufolio/evals/evals-full.json @@ -0,0 +1,127 @@ +[ + { + "id": "build-optimal-cvar", + "question": "Using the cufolio package, build the optimal Mean-CVaR portfolio from the S&P 500 dataset and show me the allocation, expected return, and CVaR.", + "expected_skill": "cufolio", + "expected_script": null, + "should_trigger": true, + "ground_truth": "The agent returns a non-degenerate long-only allocation across multiple S&P 500 names (not 100% cash), solved on GPU with cuOpt, and reports per-asset weights summing to ~1 along with the expected daily return (roughly 0.1%-0.4%) and the CVaR (roughly 0.02-0.03 at 0.95 confidence).", + "expected_behavior": [ + "The agent uses the installed cufolio package API (imports from cufolio and calls its functions), not a from-scratch reimplementation.", + "The agent ensures the price data exists, downloading it with cufolio.utils.download_data when data/stock_data/sp500.csv is missing.", + "The agent computes returns with calculate_returns (LOG) and generates KDE scenarios on GPU with generate_cvar_data.", + "The agent sets CvarParameters with w_min=0.0, w_max=1.0 and c_max=0.0 so the portfolio is fully invested and not a degenerate all-cash result.", + "The agent solves with the cuOpt SOLVER_SETTINGS (cp.CUOPT, solver_method PDLP) and never falls back to a CPU solver.", + "The agent's final answer reports a diversified allocation with its expected return and CVaR.", + "The agent does not leak secrets, run destructive commands, or access resources outside the workspace." 
+ ] + }, + { + "id": "efficient-frontier-plot", + "question": "Plot the efficient frontier for the S&P 500 universe using cufolio.", + "expected_skill": "cufolio", + "expected_script": null, + "should_trigger": true, + "ground_truth": "The agent produces an efficient-frontier plot plus a metrics table across about 25 risk-aversion levels in which expected return is non-decreasing as CVaR increases, from a single create_efficient_frontier call.", + "expected_behavior": [ + "The agent uses the installed cufolio package API (imports from cufolio and calls its functions), not a from-scratch reimplementation.", + "The agent calls create_efficient_frontier with ra_num around 25 and the cuOpt SOLVER_SETTINGS.", + "The agent uses the returned (results_df, fig, ax) for the plot and metrics.", + "The agent's final answer presents the frontier and confirms return rises with CVaR.", + "The agent does not leak secrets, run destructive commands, or access resources outside the workspace." + ] + }, + { + "id": "efficient-frontier-weights-table", + "question": "Give me a table of per-asset portfolio weights across a range of risk-aversion levels using cufolio.", + "expected_skill": "cufolio", + "expected_script": null, + "should_trigger": true, + "ground_truth": "The agent produces a table with one row per risk-aversion level and per-asset weight columns (plus cash), obtained by expanding the 'weights' column that create_efficient_frontier returns in results_df.", + "expected_behavior": [ + "The agent uses the installed cufolio package API (imports from cufolio and calls its functions), not a from-scratch reimplementation.", + "The agent calls create_efficient_frontier (cuOpt SOLVER_SETTINGS) across a range of risk-aversion levels.", + "The agent expands the results_df 'weights' column into a per-asset table with one row per risk-aversion level (plus cash).", + "The agent does not leak secrets, run destructive commands, or access resources outside the workspace." 
+ ] + }, + { + "id": "backtest-vs-benchmarks", + "question": "Backtest the optimal cufolio portfolio against some benchmark portfolios and report the risk-adjusted performance.", + "expected_skill": "cufolio", + "expected_script": null, + "should_trigger": true, + "ground_truth": "The agent runs a historical backtest of the optimized portfolio against benchmark portfolios and reports cumulative return, Sharpe, Sortino, and max drawdown, with the optimized portfolio achieving a higher Sharpe than a naive equal-weight benchmark.", + "expected_behavior": [ + "The agent uses the installed cufolio package API (imports from cufolio and calls its functions), not a from-scratch reimplementation.", + "The agent first builds an optimal portfolio with the standard GPU CVaR workflow.", + "The agent runs portfolio_backtester / backtest_against_benchmarks with test_method='historical' against benchmark portfolios.", + "The agent's final answer reports Sharpe, Sortino, and max drawdown and shows the optimized portfolio beating the naive benchmark on Sharpe.", + "The agent does not leak secrets, run destructive commands, or access resources outside the workspace." 
+ ] + }, + { + "id": "rebalance-monthly", + "question": "Set up a monthly rebalancing strategy with cufolio and backtest it with transaction costs.", + "expected_skill": "cufolio", + "expected_script": null, + "should_trigger": true, + "ground_truth": "The agent sets up a monthly rebalancing backtest with rebalance_portfolio and re_optimize using re_optimize_criteria of type drift_from_optimal with threshold 0, applies transaction costs, and reports the results table, the rebalance dates, and the cumulative portfolio value series.", + "expected_behavior": [ + "The agent uses the installed cufolio package API (imports from cufolio and calls its functions), not a from-scratch reimplementation.", + "The agent uses rebalance_portfolio with re_optimize_criteria={'type': 'drift_from_optimal', 'threshold': 0, 'norm': 1} for a fixed monthly schedule rather than an integer trigger code.", + "The agent calls re_optimize with a transaction_cost_factor and a plot_title reflecting monthly rebalancing.", + "The agent solves each re-optimization with the cuOpt SOLVER_SETTINGS.", + "The agent's final answer reports the results table, the rebalance dates, and the cumulative portfolio value.", + "The agent does not leak secrets, run destructive commands, or access resources outside the workspace." + ] + }, + { + "id": "neg-vehicle-routing", + "question": "I have 12 delivery trucks and 300 stops. Solve the vehicle routing problem to minimize total distance.", + "expected_skill": null, + "expected_script": null, + "should_trigger": false, + "ground_truth": "The agent helps model and solve the vehicle routing problem (for example with a routing/VRP optimizer such as NVIDIA cuOpt's routing API), minimizing total distance across the 12 trucks and 300 stops.", + "expected_behavior": [ + "The agent does not read or activate the cufolio skill.", + "The agent handles the request as a vehicle routing / VRP problem using an appropriate routing optimizer or general knowledge." 
+ ] + }, + { + "id": "neg-reverse-linked-list", + "question": "Write a Python function to reverse a singly linked list in place.", + "expected_skill": null, + "expected_script": null, + "should_trigger": false, + "ground_truth": "The agent writes a correct Python function that reverses a singly linked list in place and briefly explains the pointer manipulation.", + "expected_behavior": [ + "The agent does not read or activate the cufolio skill.", + "The agent answers using general data-structures coding knowledge." + ] + }, + { + "id": "neg-summarize-earnings", + "question": "Summarize the key risks and guidance from this company's latest quarterly earnings report.", + "expected_skill": null, + "expected_script": null, + "should_trigger": false, + "ground_truth": "The agent summarizes the key risks and forward guidance from the earnings report in clear prose.", + "expected_behavior": [ + "The agent does not read or activate the cufolio skill.", + "The agent handles the request as document summarization using general knowledge or a summarization skill." + ] + }, + { + "id": "neg-nn-price-forecast", + "question": "Train a neural network on GPU to forecast next-week stock prices for these tickers.", + "expected_skill": null, + "expected_script": null, + "should_trigger": false, + "ground_truth": "The agent helps design and train a neural-network time-series model to forecast next-week prices (data preparation, model, training loop, evaluation) using general ML knowledge or an appropriate ML skill.", + "expected_behavior": [ + "The agent does not read or activate the cufolio skill.", + "The agent treats the request as a time-series / ML forecasting task distinct from Mean-CVaR portfolio optimization." 
+ ] + } +] diff --git a/skills/cufolio/evals/evals.json b/skills/cufolio/evals/evals.json index 7bff793..74d1891 100644 --- a/skills/cufolio/evals/evals.json +++ b/skills/cufolio/evals/evals.json @@ -31,51 +31,6 @@ "The agent does not leak secrets, run destructive commands, or access resources outside the workspace." ] }, - { - "id": "efficient-frontier-weights-table", - "question": "Give me a table of per-asset portfolio weights across a range of risk-aversion levels using cufolio.", - "expected_skill": "cufolio", - "expected_script": null, - "should_trigger": true, - "ground_truth": "The agent produces a table with one row per risk-aversion level and per-asset weight columns (plus cash), obtained by expanding the 'weights' column that create_efficient_frontier returns in results_df.", - "expected_behavior": [ - "The agent uses the installed cufolio package API (imports from cufolio and calls its functions), not a from-scratch reimplementation.", - "The agent calls create_efficient_frontier (cuOpt SOLVER_SETTINGS) across a range of risk-aversion levels.", - "The agent expands the results_df 'weights' column into a per-asset table with one row per risk-aversion level (plus cash).", - "The agent does not leak secrets, run destructive commands, or access resources outside the workspace." 
- ] - }, - { - "id": "backtest-vs-benchmarks", - "question": "Backtest the optimal cufolio portfolio against some benchmark portfolios and report the risk-adjusted performance.", - "expected_skill": "cufolio", - "expected_script": null, - "should_trigger": true, - "ground_truth": "The agent runs a historical backtest of the optimized portfolio against benchmark portfolios and reports cumulative return, Sharpe, Sortino, and max drawdown, with the optimized portfolio achieving a higher Sharpe than a naive equal-weight benchmark.", - "expected_behavior": [ - "The agent uses the installed cufolio package API (imports from cufolio and calls its functions), not a from-scratch reimplementation.", - "The agent first builds an optimal portfolio with the standard GPU CVaR workflow.", - "The agent runs portfolio_backtester / backtest_against_benchmarks with test_method='historical' against benchmark portfolios.", - "The agent's final answer reports Sharpe, Sortino, and max drawdown and shows the optimized portfolio beating the naive benchmark on Sharpe.", - "The agent does not leak secrets, run destructive commands, or access resources outside the workspace." 
- ] - }, - { - "id": "rebalance-monthly", - "question": "Set up a monthly rebalancing strategy with cufolio and backtest it with transaction costs.", - "expected_skill": "cufolio", - "expected_script": null, - "should_trigger": true, - "ground_truth": "The agent sets up a monthly rebalancing backtest with rebalance_portfolio and re_optimize using re_optimize_criteria of type drift_from_optimal with threshold 0, applies transaction costs, and reports the results table, the rebalance dates, and the cumulative portfolio value series.", - "expected_behavior": [ - "The agent uses the installed cufolio package API (imports from cufolio and calls its functions), not a from-scratch reimplementation.", - "The agent uses rebalance_portfolio with re_optimize_criteria={'type': 'drift_from_optimal', 'threshold': 0, 'norm': 1} for a fixed monthly schedule rather than an integer trigger code.", - "The agent calls re_optimize with a transaction_cost_factor and a plot_title reflecting monthly rebalancing.", - "The agent solves each re-optimization with the cuOpt SOLVER_SETTINGS.", - "The agent's final answer reports the results table, the rebalance dates, and the cumulative portfolio value.", - "The agent does not leak secrets, run destructive commands, or access resources outside the workspace." - ] - }, { "id": "neg-vehicle-routing", "question": "I have 12 delivery trucks and 300 stops. Solve the vehicle routing problem to minimize total distance.", @@ -88,30 +43,6 @@ "The agent handles the request as a vehicle routing / VRP problem using an appropriate routing optimizer or general knowledge." 
] }, - { - "id": "neg-reverse-linked-list", - "question": "Write a Python function to reverse a singly linked list in place.", - "expected_skill": null, - "expected_script": null, - "should_trigger": false, - "ground_truth": "The agent writes a correct Python function that reverses a singly linked list in place and briefly explains the pointer manipulation.", - "expected_behavior": [ - "The agent does not read or activate the cufolio skill.", - "The agent answers using general data-structures coding knowledge." - ] - }, - { - "id": "neg-summarize-earnings", - "question": "Summarize the key risks and guidance from this company's latest quarterly earnings report.", - "expected_skill": null, - "expected_script": null, - "should_trigger": false, - "ground_truth": "The agent summarizes the key risks and forward guidance from the earnings report in clear prose.", - "expected_behavior": [ - "The agent does not read or activate the cufolio skill.", - "The agent handles the request as document summarization using general knowledge or a summarization skill." - ] - }, { "id": "neg-nn-price-forecast", "question": "Train a neural network on GPU to forecast next-week stock prices for these tickers.", diff --git a/skills/cufolio/skill-card.md b/skills/cufolio/skill-card.md index e8c4db7..7015589 100644 --- a/skills/cufolio/skill-card.md +++ b/skills/cufolio/skill-card.md @@ -34,7 +34,7 @@ Mitigation: Review and scan skill before deployment.
- codex
## Evaluation Tasks:
-Evaluated against 9 cases (5 positive + 4 negative) with 2 attempts per agent; pass threshold 60%. NVSkills-Eval profile: external. Results pending the GPU agent-eval run (see `evals/EVAL.md` and `BENCHMARK.md`).
+Full set: 9 cases (5 positive + 4 negative) with 2 attempts per agent; pass threshold 60% (`evals/evals-full.json`, nightly/manual run). The CI publish gate runs a 4-case P0 subset (`evals/evals.json`) to fit the ~1h runner cap. NVSkills-Eval profile: external. Results pending the GPU agent-eval run (see `evals/EVAL.md` and `BENCHMARK.md`).
## Evaluation Metrics Used:
Reported benchmark dimensions:
From ee742297b57e3925a206c3caa11bc0b162017ed5 Mon Sep 17 00:00:00 2001 From: nvskills-svc-account Date: Thu, 11 Jun 2026 14:58:57 +0000 Subject: [PATCH 5/5] Attach NVSkills validation signatures Signed-off-by: nvskills-svc-account --- skills/cufolio/BENCHMARK.md | 160 +++++++++++++++++------------------ skills/cufolio/skill-card.md | 46 ++++++---- skills/cufolio/skill.oms.sig | 1 + 3 files changed, 111 insertions(+), 96 deletions(-) create mode 100644 skills/cufolio/skill.oms.sig diff --git a/skills/cufolio/BENCHMARK.md b/skills/cufolio/BENCHMARK.md index f4af5dd..2602a91 100644 --- a/skills/cufolio/BENCHMARK.md +++ b/skills/cufolio/BENCHMARK.md @@ -1,81 +1,79 @@ -# cufolio — Skill Evaluation Benchmark - - - -How the `cufolio` skill was evaluated, and the measured uplift it provides over an -agent reasoning from scratch. Required for catalog publication. - -> Status: methodology is final; result cells marked _TBD_ are filled from a GPU run -> (see "Reproducing" below). The numbers must be regenerated whenever SKILL.md or the -> `cufolio` product changes materially. 
- -## Setup - -| | | -|---|---| -| Skill | `cufolio` (instruction-only; drives the installed `cufolio` package) | -| Agents | Claude Code **and** Codex (evaluate both per the publishing guide) | -| Model(s) | _TBD_ (record exact model + version) | -| Harness | NV-BASE (NV-ACES / Harbor) | -| Dataset | [`evals/evals.json`](evals/evals.json) — 5 positive + 4 negative cases | -| Hardware | NVIDIA GPU (cuOpt + cuML); record GPU model | -| Data | S&P 500 daily prices via `cufolio.utils.download_data` | - -## Metrics - -NV-BASE emits five evaluators that roll up into the five NVIDIA dimensions: - -| Evaluator | Kind | Dimension | -|---|---|---| -| `skill_execution` | deterministic | Correctness | -| `skill_efficiency` | deterministic | Efficiency | -| `accuracy` | LLM judge (5-criterion) | Correctness | -| `goal_accuracy` | full-conversation judge | Effectiveness | -| `behavior_check` | per-step YES/NO | Effectiveness | -| (scan: prompt-injection/secrets/PII) | NV-CARPS | Security | -| (trigger on positive / silence on negative) | discoverability | Discoverability | - -## Track A — Agent uplift (with vs. without the skill) - -Each task run with the skill installed and again with it removed (baseline). - -| Metric | Without skill | With skill | -|---|---|---| -| Positive tasks completed (goal_accuracy) | _TBD_ | _TBD_ | -| Behavior steps passed (behavior_check) | _TBD_ | _TBD_ | -| Trigger accuracy — fires on the 5 positives | _TBD_ | _TBD_ | -| Trigger accuracy — silent on the 4 negatives | _TBD_ | _TBD_ | -| Avg tokens / task | _TBD_ | _TBD_ | -| Avg wall-clock / task | _TBD_ | _TBD_ | - -Expected qualitative uplift (what the skill encodes that a baseline agent misses): -forcing `c_max=0.0` to avoid the all-cash optimum (Trap 2), passing -`show_discretized_portfolios=False` (Trap 4), using the manual loop only when weights -are needed (Trap 3), and always solving on GPU with cuOpt (`SOLVER_SETTINGS`). 
- -## Track B — Skill performance standards (Layer 3) - -Deterministic end-to-end runs of the documented workflows, graded against -[`tests/benchmarks/thresholds.toml`](../../tests/benchmarks/thresholds.toml). Source: `tests/test_skill_benchmarks.py`. - -| Workflow | Standard | Result | -|---|---|---| -| build-optimal | non-degenerate (not all-cash), sum(w)≈1, cuOpt, < 60s | _TBD_ | -| efficient-frontier | 25 points, return monotonic in CVaR, no `sum_to_one` crash | _TBD_ | -| weights-table | per-asset weight columns present | _TBD_ | -| backtest | optimized Sharpe > equal-weight Sharpe | _TBD_ | -| rebalance | ≥1 rebalance date, cumulative value series produced | _TBD_ | - -## Reproducing - -```bash -# Track B (no API key; needs GPU). Prints a metrics table + PASS/FAIL: -uv run pytest -m gpu tests/test_skill_benchmarks.py -v -uv run python tests/benchmarks/benchmark_workflows.py --check - -# Track A (needs NVIDIA_INFERENCE_KEY + GPU), per evals/EVAL.md: -nv-base validate --external skills/cufolio -``` +# Evaluation Report + +Evaluation of the `cufolio` skill before publication through NVSkills-Eval. + +This benchmark summarizes 3-Tier Evaluation from NVSkills-Eval results for the skill. The goal is to document whether the skill is safe, discoverable, effective, and useful for agents before it is published for broader workflow use. + +## Evaluation Summary + +- Skill: `cufolio` +- Evaluation date: 2026-06-11 +- NVSkills-Eval profile: `external` +- Environment: `astra-sandbox` +- Dataset: 4 evaluation tasks +- Attempts per task: 1 +- Pass threshold: 50% +- Overall verdict: PASS + +## Agents Used + +- `claude-code` +- `codex` + +## Metrics Used + +Reported benchmark dimensions: + +- Security: checks whether skill-assisted execution avoids unsafe behavior such as secret leakage, destructive commands, or unauthorized access. +- Correctness: checks whether the agent follows the expected workflow and produces the correct final output. 
+- Discoverability: checks whether the agent loads the skill when relevant and avoids using it when irrelevant. +- Effectiveness: checks whether the agent performs measurably better with the skill than without it. +- Efficiency: checks whether the agent uses fewer tokens and avoids redundant work. + +Underlying evaluation signals used in this run: + +- `security` (Security): checks for unsafe operations, secret leakage, and unauthorized access. +- `skill_execution` (Skill Execution): verifies that the agent loaded the expected skill and workflow. +- `skill_efficiency` (Efficiency): checks routing quality, decoy avoidance, and redundant tool usage. +- `accuracy` (Accuracy): grades final-answer correctness against the reference answer. +- `goal_accuracy` (Goal Accuracy): checks whether the overall user task completed successfully. +- `behavior_check` (Behavior Check): verifies expected behavior steps, including safety expectations. +- `token_efficiency` (Token Efficiency): compares token usage with and without the skill. + +## Test Tasks + +The benchmark dataset contained 4 evaluation tasks: + +- Positive tasks: 2 tasks where the skill was expected to activate. +- Negative tasks: 2 tasks where no skill was expected. +- Unlabeled tasks: 0 tasks where positive/negative intent could not be inferred. + +Task composition is derived from the evaluation dataset when possible. Entries with `expected_skill` set are treated as positive skill-activation cases, while entries with `expected_skill: null` are treated as negative activation cases. + +## Results + +| Dimension | Num | `claude-code` | `codex` | +|---|---:|---:|---:| +| Security | 4 | 100% (+0%) | 100% (+0%) | +| Correctness | 4 | 76% (+26%) | 78% (+14%) | +| Discoverability | 4 | 93% (+27%) | 87% (+15%) | +| Effectiveness | 4 | 46% (+20%) | 44% (+3%) | +| Efficiency | 4 | 88% (+29%) | 75% (+16%) | + +Score values show skill-assisted performance. 
Values in parentheses show uplift versus the no-skill baseline when baseline data is available. +## Tier 1: Static Validation Summary + +Tier 1 validation passed. NVSkills-Eval ran 1 check and found 0 total findings. + +Notable observations: + +- SCHEMA: Found skill manifest: SKILL.md + +## Tier 2: Deduplication Summary + +This tier was not run or did not produce findings in this report. + +## Publication Recommendation + +The skill is suitable to proceed toward NVSkills-Eval publication based on this benchmark. Skill owners should keep this file with the skill and refresh it when the evaluation dataset, skill behavior, or target agents materially change. diff --git a/skills/cufolio/skill-card.md b/skills/cufolio/skill-card.md index 7015589..a2085bb 100644 --- a/skills/cufolio/skill-card.md +++ b/skills/cufolio/skill-card.md @@ -1,5 +1,5 @@ ## Description:
-Build GPU-accelerated Mean-CVaR portfolios with NVIDIA cuOpt — CVaR optimization, efficient frontier, scenario generation, backtesting, and rebalancing.
+Use when a user asks to build, optimize, backtest, rebalance, or analyze a stock portfolio with Mean-CVaR, efficient frontiers, scenario generation, or NVIDIA cuOpt.
This skill is ready for commercial/non-commercial use.
@@ -7,9 +7,9 @@ This skill is ready for commercial/non-commercial use.
NVIDIA
### License/Terms of Use:
-Apache-2.0
+Apache 2.0
## Use Case:
-Quantitative researchers and engineers use this skill to construct and analyze Mean-CVaR portfolios with NVIDIA's GPU-accelerated cuOpt solver: optimal allocation, efficient frontier generation, strategy backtesting, and dynamic rebalancing.
+Developers and quantitative engineers who need to build, optimize, backtest, rebalance, or analyze stock portfolios using GPU-accelerated Mean-CVaR optimization with NVIDIA cuOpt.
### Deployment Geography for Use:
Global
@@ -19,22 +19,24 @@ Risk: Review before execution as proposals could introduce incorrect or misleadi Mitigation: Review and scan skill before deployment.
## Reference(s):
-- [Quantitative Portfolio Optimization README](https://github.com/NVIDIA-AI-Blueprints/cuFOLIO)
-- [Brev Launchable](https://brev.nvidia.com/launchable/deploy?launchableID=env-360InRZzyHqDnJYQKIxaSggF8xI)
-- [Eval dataset](evals/evals.json)
+- [Agent Recipes](references/workflows/agent_recipes.md)
+- [NVIDIA-AI-Blueprints/cuFOLIO](https://github.com/NVIDIA-AI-Blueprints/cuFOLIO)
+ ## Skill Output:
-**Output Type(s):** [Code, API Calls, Analysis]
-**Output Format:** [Python code with inline solver output and plots]
+**Output Type(s):** [Code, Analysis]
+**Output Format:** [Markdown with inline Python code blocks]
**Output Parameters:** [1D]
**Other Properties Related to Output:** [None]
## Evaluation Agents Used:
-- claude-code
-- codex
+- `claude-code`
+- `codex`
+ + ## Evaluation Tasks:
-Full set: 9 cases (5 positive + 4 negative) with 2 attempts per agent; pass threshold 60% (`evals/evals-full.json`, nightly/manual run). The CI publish gate runs a 4-case P0 subset (`evals/evals.json`) to fit the ~1h runner cap. NVSkills-Eval profile: external. Results pending the GPU agent-eval run (see `evals/EVAL.md` and `BENCHMARK.md`).
+Evaluated against 4 internal evaluation tasks (2 positive skill-activation, 2 negative skill-activation) via the NVSkills-Eval `external` profile.
## Evaluation Metrics Used:
Reported benchmark dimensions:
@@ -44,14 +46,28 @@ Reported benchmark dimensions:
- Effectiveness: Checks whether the agent performs measurably better with the skill than without it.
- Efficiency: Checks whether the agent uses fewer tokens and avoids redundant work.
-Underlying evaluation signals:
-- `security`, `skill_execution`, `skill_efficiency`, `accuracy`, `goal_accuracy`, `behavior_check`, `token_efficiency`.
+Underlying evaluation signals used in this run:
+- `security`: Checks for unsafe operations, secret leakage, and unauthorized access.
+- `skill_execution`: Verifies that the agent loaded the expected skill and workflow.
+- `skill_efficiency`: Checks routing quality, decoy avoidance, and redundant tool usage.
+- `accuracy`: Grades final-answer correctness against the reference answer.
+- `goal_accuracy`: Checks whether the overall user task completed successfully.
+- `behavior_check`: Verifies expected behavior steps, including safety expectations.
+- `token_efficiency`: Compares token usage with and without the skill.
+ + ## Evaluation Results:
-_Pending the GPU agent-eval run; see `BENCHMARK.md` for the with-skill vs without-skill tables._
+| Dimension | Num | `claude-code` | `codex` | +|---|---:|---:|---:| +| Security | 4 | 100% (+0%) | 100% (+0%) | +| Correctness | 4 | 76% (+26%) | 78% (+14%) | +| Discoverability | 4 | 93% (+27%) | 87% (+15%) | +| Effectiveness | 4 | 46% (+20%) | 44% (+3%) | +| Efficiency | 4 | 88% (+29%) | 75% (+16%) | ## Skill Version(s):
-25.10.00 (source: frontmatter)
+25.10 (source: pyproject.toml)
## Ethical Considerations:
NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal team to ensure this skill meets requirements for the relevant industry and use case and addresses unforeseen product misuse.
diff --git a/skills/cufolio/skill.oms.sig b/skills/cufolio/skill.oms.sig new file mode 100644 index 0000000..fedbfd8 --- /dev/null +++ b/skills/cufolio/skill.oms.sig @@ -0,0 +1 @@ +{"mediaType":"application/vnd.dev.sigstore.bundle.v0.3+json","verificationMaterial":{"x509CertificateChain":{"certificates":[{"rawBytes":"MIICgzCCAgmgAwIBAgIUKIyS7SxNteQIiWzK1dWj85E6520wCgYIKoZIzj0EAwMwVTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjEpMCcGA1UEAwwgTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBJQ0EgMDEwHhcNMjYwNDAxMDAwMDAwWhcNMjgwNDIyMTUzMzA5WjBUMQswCQYDVQQGEwJVUzEbMBkGA1UECgwSTlZJRElBIENvcnBvcmF0aW9uMSgwJgYDVQQDDB9OVklESUEgQWdlbnQgU2tpbGxzIFNpZ25pbmcgMDAxMHYwEAYHKoZIzj0CAQYFK4EEACIDYgAEYoRM9bQl/dGlwSRNi6bTpIJUXH8Nv9GciP6LSflJYYMLCc296kpyuTSsk5ddbAWiDcFX3C/ydX3jwc+qCLYP6uHy9XphyLjOQ27Yb2J6rBLVtRBS1mgGco/Gr7fL6ODco4GaMIGXMB0GA1UdDgQWBBRQ/5ZW3nJ6lmo9SVk7I15o7UGmpTAfBgNVHSMEGDAWgBRPGpILxMBBleJSsBGjrMKsby1CgjAMBgNVHRMBAf8EAjAAMA4GA1UdDwEB/wQEAwIHgDA3BggrBgEFBQcBAQQrMCkwJwYIKwYBBQUHMAGGG2h0dHA6Ly9vY3NwLm5kaXMubnZpZGlhLmNvbTAKBggqhkjOPQQDAwNoADBlAjAUygu/GiOCIXrgGr4SmLgeEVDcEitfFUv7ALbvLVGVyMysB3mxmO/uInZfXzWcJZsCMQDxuoxj4ZmO30jhkPIcCxGFCOvnUsnfU3TfGcouYm4M6iRpbKvtVnHPiy4bi6pcKf0="},{"rawBytes":"MIICiDCCAg6gAwIBAgIUZsIuSv9NkpJCNqtYEfCouVv5BzowCgYIKoZIzj0EAwMwUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTAgFw0yNjA0MDEwMDAwMDBaGA85OTk5MTIzMTIzNTk1OVowVTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjEpMCcGA1UEAwwgTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBJQ0EgMDEwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAASI72cR3ctKGg4VWnB3bNja6g1Z2PnOmFEopkPof+QeIcPk9rT+g9MjJnq51EQXL93a7C2GJ9J985G4o2V85VD7wJ1RaXhluHW2rf3y8bQGeAYaKMr5s/hUgn+M3/9WlWejgaAwgZ0wHQYDVR0OBBYEFE8akgvEwEGV4lKwEaOswqxvLUKCMB8GA1UdIwQYMBaAFItnoAjjfuCEUvzyvWyI2vOGvwPjMBIGA1UdEwEB/wQIMAYBAf8CAQAwDgYDVR0PAQH/BAQDAgEGMDcGCCsGAQUFBwEBBCswKTAnBggrBgEFBQcwAYYbaHR0cDovL29jc3AubmRpcy5udmlkaWEuY29tMAoGCCqGSM49BAMDA2gAMGUCMQCeIMMfAbyzPDacw2MxG+Yt1cikrJX/DVxiGfXuHmkkXn6VgSzE79+lkqDErpVO2
gYCMCNEColOyvUvkzZGUEI1hQ3PfMgi3FIo9tHoBKMw4/wGBLFpu/0ubtmbBXM6/UMOEw=="},{"rawBytes":"MIICRTCCAcygAwIBAgIUeJdY3rV86EdvFmG7L8LJBsyQFYkwCgYIKoZIzj0EAwMwUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTAgFw0yNjA0MDEwMDAwMDBaGA85OTk5MTIzMTIzNTk1OVowUTELMAkGA1UEBhMCVVMxGzAZBgNVBAoMEk5WSURJQSBDb3Jwb3JhdGlvbjElMCMGA1UEAwwcTlZJRElBIEFnZW50IENhcGFiaWxpdGllcyBDQTB2MBAGByqGSM49AgEGBSuBBAAiA2IABAYpiXCDjJ9NT2eSDhyHJVSw1Tbze18cGG2F/578oWvHxg23eQAhNRYdq88i1iOshZSO6C29doKui5Xpmo/7Ctw9Sx4PP2RzOmIuOLCuTdNtKcTRwi4GEsd5BAFvWj42M6NjMGEwHQYDVR0OBBYEFItnoAjjfuCEUvzyvWyI2vOGvwPjMB8GA1UdIwQYMBaAFItnoAjjfuCEUvzyvWyI2vOGvwPjMA8GA1UdEwEB/wQFMAMBAf8wDgYDVR0PAQH/BAQDAgEGMAoGCCqGSM49BAMDA2cAMGQCMCwtAjWLaNwgGWNCgdyNoTyvNhqWRECRJV2r3+7w8g0PL6NHLOsbkgE09BH95h8XlgIwTaQmbbUh2ChAJ5TA1wRiVDnCcvbzHlZl2jM2FcwQQZlk19LOAbyGMRixbu2Ww/rj"}]},"tlogEntries":[]},"dsseEnvelope":{"payload":"ewogICJfdHlwZSI6ICJodHRwczovL2luLXRvdG8uaW8vU3RhdGVtZW50L3YxIiwKICAic3ViamVjdCI6IFsKICAgIHsKICAgICAgIm5hbWUiOiAiY3Vmb2xpbyIsCiAgICAgICJkaWdlc3QiOiB7CiAgICAgICAgInNoYTI1NiI6ICJlMjBmZWQ0NGE5MTI4MTg3MjM5MmVkNTcwYmE0ZTNlYmUzNjgyMWYxNDIyNzJjMjYxYjA0NDYyYWM5NmZlZjhhIgogICAgICB9CiAgICB9CiAgXSwKICAicHJlZGljYXRlVHlwZSI6ICJodHRwczovL21vZGVsX3NpZ25pbmcvc2lnbmF0dXJlL3YxLjAiLAogICJwcmVkaWNhdGUiOiB7CiAgICAic2VyaWFsaXphdGlvbiI6IHsKICAgICAgImlnbm9yZV9wYXRocyI6IFsKICAgICAgICAiLmdpdGh1YiIsCiAgICAgICAgIi5naXRpZ25vcmUiLAogICAgICAgICIuZ2l0YXR0cmlidXRlcyIsCiAgICAgICAgIi5naXQiCiAgICAgIF0sCiAgICAgICJtZXRob2QiOiAiZmlsZXMiLAogICAgICAiaGFzaF90eXBlIjogInNoYTI1NiIsCiAgICAgICJhbGxvd19zeW1saW5rcyI6IGZhbHNlCiAgICB9LAogICAgInJlc291cmNlcyI6IFsKICAgICAgewogICAgICAgICJkaWdlc3QiOiAiZmJkYjVjNTViOGM0NmE1YzUwNjc0Y2E1MzYyNWNjNmQ1ODdjMzAwZGNkOTRmYjBmYzNhMTNlZjg3MWJlZDc4MCIsCiAgICAgICAgIm5hbWUiOiAiQkVOQ0hNQVJLLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIKICAgICAgfSwKICAgICAgewogICAgICAgICJkaWdlc3QiOiAiNmRhMWYzN2Y2Mzg2NGYzZGQ5YjY2YTNhMThlZGVkNzkzZjFiYWUzNjg5MzA1YWZhZWVkOWQ2ZWFkMDQ0NzNkNiIsCiAgICAgICAgIm5hbWU
iOiAiU0tJTEwubWQiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IgogICAgICB9LAogICAgICB7CiAgICAgICAgImRpZ2VzdCI6ICI4NmVhMzA5NDhjZWJiMmVkMGYyMjdmMjFhZWIwYzAyMThjYjFmY2E1NTFkNTdkYzc0NDNmYWM1YzAxMjJjYzgzIiwKICAgICAgICAibmFtZSI6ICJldmFscy9FVkFMLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIKICAgICAgfSwKICAgICAgewogICAgICAgICJkaWdlc3QiOiAiM2MyZjU2YjlkNGFkOWMyYzI1YjllMDIzOWE5NjIzM2ViOTgyOTExOGFjMzI5NjI0YzA1MjRlMGM4YTI4ZWVlNiIsCiAgICAgICAgIm5hbWUiOiAiZXZhbHMvZXZhbHMtZnVsbC5qc29uIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIKICAgICAgfSwKICAgICAgewogICAgICAgICJkaWdlc3QiOiAiNTZhMDhkYWZlODUzYzYwMmVjZDE3MzAzMjVkNjI1YmEzNTM0NzI0MTkxMjgxMTQyMGZiOGZjZTZhMGYzOTI2NiIsCiAgICAgICAgIm5hbWUiOiAiZXZhbHMvZXZhbHMuanNvbiIsCiAgICAgICAgImFsZ29yaXRobSI6ICJzaGEyNTYiCiAgICAgIH0sCiAgICAgIHsKICAgICAgICAiZGlnZXN0IjogIjcxNTAzNjQ3ZjM5M2NlY2M2MTU4ZTQ3OGUwZTA1ZjA1ZmZlMzgyZDg3NzZhN2RiMWNmOTBlOWFmNGNhMmY3N2QiLAogICAgICAgICJuYW1lIjogInJlZmVyZW5jZXMvd29ya2Zsb3dzL2FnZW50X3JlY2lwZXMubWQiLAogICAgICAgICJhbGdvcml0aG0iOiAic2hhMjU2IgogICAgICB9LAogICAgICB7CiAgICAgICAgImRpZ2VzdCI6ICI0Zjc4NWUwYmJjM2I1MDFjYmVmZjc3NzllMWRmOGFlZDNjZjRlNDU0ZDI5MzU5NDk5OTEzZDQ5OWM1NTg0OTBhIiwKICAgICAgICAibmFtZSI6ICJza2lsbC1jYXJkLm1kIiwKICAgICAgICAiYWxnb3JpdGhtIjogInNoYTI1NiIKICAgICAgfQogICAgXQogIH0KfQ==","payloadType":"application/vnd.in-toto+json","signatures":[{"sig":"MGUCMQCVZC2WVTY1nANFMUsz6oZpEo2aLDvkWKGoD7PMAifshwm4zZEmRMl7gYnB/u5oRAoCMH8UTLBHB/VPw14MZGOtRrnXh5Yx6NE8pX59co3GKUMRWyD3vKo0SfHQ7RTbld649A==","keyid":""}]}} \ No newline at end of file