From 700fe548d2cb40033cbea30768472f61ae556e16 Mon Sep 17 00:00:00 2001 From: Haksung Jang Date: Sun, 14 Jun 2026 01:30:42 +0900 Subject: [PATCH 1/6] feat(scan): external CycloneDX SBOM ingest endpoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add POST /v1/projects/{id}/sbom-ingest so external tools (CI, cdxgen-based scanners) can upload an already-generated CycloneDX SBOM; TRUSCA runs the back half of the scan pipeline against it — persist components → trivy sbom matching → findings — reusing the Scan model so ingested scans get ref-keyed retention, the per-project active-scan guard, and the existing Components/Vulnerabilities/Licenses UI and build gate for free. This is NOT a Dependency-Track compatible surface: it is a TRUSCA-native endpoint (Authorization: Bearer, field `sbom`, no autoCreate), not DT's /api/v1/bom + X-Api-Key. Endpoint / service (services/sbom_ingest_service.py, api/v1/sbom.py): - multipart sbom + ref + release; 202 ScanPublic (kind="sbom"). - require_role_or_api_key("developer"); project-scoped key must match. - Reuses trigger_scan's guards via an extracted prepare_scan_target (existence/team 404/403 before archived 409 / cap 429 — authz before state). - Synchronous adversarial validation of untrusted input: bounded read (SBOM_INGEST_MAX_BYTES, 32 MiB → 413), content-type/filename allow-list (415), JSON + CycloneDX structure whitelist (422), component cap (SBOM_INGEST_MAX_COMPONENTS, 50k → 422), and an O(n) string-aware byte nesting-depth pre-check so a deeply nested document is a clean 422 instead of a RecursionError → 500 from json.loads. RFC 7807 throughout. - Atomic: flush wins the active-scan race before the file is written; a 409 loser writes no file; commit-race deletes the file; enqueue failure → 503. 
Celery task (tasks/ingest_sbom.py, enqueue branch + include): - ingest_sbom_task reuses persist_sbom_components → run_trivy_sbom → persist_trivy_findings → mark_succeeded (ref-keyed supersede). Preserves the uploaded SBOM as a durable sbom_cyclonedx ScanArtifact for the signature surface; containment-guards the path under workspace_root(). Security (Producer-Reviewer findings addressed): - bind_audit_team before the scan INSERT so the audit row carries team_id. - disk-write failure → 503 SbomIngestStorageError (retryable), not 422. - release / original_filename length-capped + control-byte stripped. Tests: pure adversarial validator unit suite (incl. depth-bomb regression), endpoint permission×state matrix + new existence-hide-state 409 rows, realistic multi-CVE fixture pipeline test. Docs: EN/KO ci-integration/sbom-upload. --- apps/backend/api/v1/sbom.py | 266 ++++++- apps/backend/core/config.py | 31 + apps/backend/services/sbom_ingest_service.py | 574 +++++++++++++++ apps/backend/services/scan_service.py | 106 ++- apps/backend/tasks/__init__.py | 14 +- apps/backend/tasks/celery_app.py | 4 + apps/backend/tasks/ingest_sbom.py | 358 ++++++++++ .../sbom_ingest/realistic-trivy-sbom.json | 71 ++ .../fixtures/sbom_ingest/realistic.cdx.json | 85 +++ .../scan/test_ingest_sbom_pipeline.py | 357 ++++++++++ .../test_existence_hide_state_matrix.py | 87 +++ .../tests/integration/test_sbom_ingest_api.py | 662 ++++++++++++++++++ .../services/test_sbom_ingest_validation.py | 403 +++++++++++ docs-site/docs/ci-integration/sbom-upload.md | 167 +++++ .../current/ci-integration/sbom-upload.md | 167 +++++ docs-site/sidebars.ts | 1 + 16 files changed, 3320 insertions(+), 33 deletions(-) create mode 100644 apps/backend/services/sbom_ingest_service.py create mode 100644 apps/backend/tasks/ingest_sbom.py create mode 100644 apps/backend/tests/fixtures/sbom_ingest/realistic-trivy-sbom.json create mode 100644 apps/backend/tests/fixtures/sbom_ingest/realistic.cdx.json create mode 100644 
apps/backend/tests/integration/scan/test_ingest_sbom_pipeline.py create mode 100644 apps/backend/tests/integration/test_sbom_ingest_api.py create mode 100644 apps/backend/tests/unit/services/test_sbom_ingest_validation.py create mode 100644 docs-site/docs/ci-integration/sbom-upload.md create mode 100644 docs-site/i18n/ko/docusaurus-plugin-content-docs/current/ci-integration/sbom-upload.md diff --git a/apps/backend/api/v1/sbom.py b/apps/backend/api/v1/sbom.py index d99a1cef..543559d7 100644 --- a/apps/backend/api/v1/sbom.py +++ b/apps/backend/api/v1/sbom.py @@ -21,14 +21,28 @@ from typing import Literal import structlog -from fastapi import APIRouter, Depends, Query, Request, Response, status +from fastapi import ( + APIRouter, + Depends, + File, + Form, + Query, + Request, + Response, + UploadFile, + status, +) from sqlalchemy.ext.asyncio import AsyncSession +from core.api_key_auth import require_role_or_api_key from core.authz import assert_team_access +from core.config import sbom_ingest_max_bytes, scan_trigger_rate_limit from core.db import get_db from core.errors import problem_response +from core.ratelimit import _authenticated_user_key, limiter from core.security import CurrentUser, require_role from models import Project +from schemas.scan import ScanPublic from services.project_service import ( ProjectError, ProjectForbidden, @@ -41,6 +55,11 @@ SBOMUnsupportedFormat, export_sbom, ) +from services.sbom_ingest_service import ( + SbomIngestError, + SbomIngestTooLarge, + ingest_sbom, +) from services.sbom_signature import ( KIND_ATTEST_CERT, KIND_ATTESTATION, @@ -53,6 +72,11 @@ get_signature_artifact, ) from services.scan_resolution import SnapshotScanNotFound, latest_succeeded_scan_id +from services.scan_service import ( + ConcurrentScanLimitExceeded, + ScanError, + ScanInProgressConflict, +) router = APIRouter(prefix="/v1", tags=["sbom"]) log = structlog.get_logger("sbom.api") @@ -96,6 +120,80 @@ def _problem_for_sbom_error(request: Request, exc: 
SBOMExportError) -> Response: ) +def _problem_for_sbom_ingest_error(request: Request, exc: SbomIngestError) -> Response: + """Translate an SBOM-ingest validation error into an RFC 7807 envelope. + + Each ``SbomIngestError`` subclass carries its own ``status_code`` (413 / 415 / + 422) and a stable ``type_uri`` problem URI. + """ + return problem_response( + status_code=exc.status_code, + title=exc.title, + detail=str(exc) or exc.title, + instance=request.url.path, + type_=exc.type_uri, + ) + + +def _problem_for_scan_error(request: Request, exc: ScanError) -> Response: + """Translate a scan-domain error (raised by the shared scan guards) to 7807. + + The SBOM-ingest path reuses ``services.scan_service`` guards, so it can raise + ``ScanForbidden`` (403), ``ProjectMissingForScan`` (404), + ``ScanArchivedConflict`` (409), ``ConcurrentScanLimitExceeded`` (429), + ``ScanInProgressConflict`` (409), or ``ScanEnqueueFailed`` (503). The mapping + here mirrors ``api/v1/projects.py::_problem_for_scan_error`` exactly so the + two scan-creating surfaces return identical envelopes. + """ + # B1: the per-team concurrency cap carries the `limit` extension, a domain + # `type` URI, and a Retry-After header. M1 (security-reviewer): the live + # running_scans count is deliberately NOT exposed (intra-team side-channel). + if isinstance(exc, ConcurrentScanLimitExceeded): + response = problem_response( + status_code=exc.status_code, + title=exc.title, + detail=str(exc) or exc.title, + instance=request.url.path, + type_=exc.type_uri, + limit=exc.limit, + ) + response.headers["Retry-After"] = str(exc.retry_after_seconds) + return response + # P1 #10 — machine-checkable extension on the per-project active-scan conflict. 
+ if isinstance(exc, ScanInProgressConflict): + return problem_response( + status_code=exc.status_code, + title=exc.title, + detail=str(exc) or exc.title, + instance=request.url.path, + scan_already_in_progress=True, + ) + return problem_response( + status_code=exc.status_code, + title=exc.title, + detail=str(exc) or exc.title, + instance=request.url.path, + ) + + +def _declared_content_length(request: Request) -> int | None: + """Parse the request's ``Content-Length`` header, or ``None`` if absent/bad. + + Mirrors ``api/v1/projects.py``: a multipart upload's Content-Length covers the + whole envelope, so it is a safe over-estimate for an early-reject ceiling. A + malformed value is treated as absent (the streamed-bytes cap in the service is + the authoritative guard). + """ + raw = request.headers.get("content-length") + if raw is None: + return None + try: + value = int(raw) + except ValueError: + return None + return value if value >= 0 else None + + # --------------------------------------------------------------------------- # GET /v1/projects/{project_id}/sbom # --------------------------------------------------------------------------- @@ -576,3 +674,169 @@ async def download_sbom_signature_bundle_endpoint( ), ) return _download_response(artifact) + + +# --------------------------------------------------------------------------- +# POST /v1/projects/{project_id}/sbom-ingest — external CycloneDX SBOM ingest +# --------------------------------------------------------------------------- + + +@router.post( + "/projects/{project_id}/sbom-ingest", + response_model=ScanPublic, + status_code=status.HTTP_202_ACCEPTED, + summary="Ingest an external CycloneDX SBOM (queues a Celery task; returns 202 Accepted)", + responses={ + 202: { + "description": "SBOM accepted; a queued scan row is returned.", + "content": {"application/json": {}}, + }, + 401: {"description": "Authentication required"}, + 403: { + "description": ( + "Caller is not a member of the project's owning 
team, or a " + "project-scoped API key targets a different project. RFC 7807." + ), + "content": {"application/problem+json": {}}, + }, + 404: { + "description": "Project not found (existence-hidden). RFC 7807.", + "content": {"application/problem+json": {}}, + }, + 409: { + "description": ( + "A scan is already queued/running for this project, or the project " + "is archived. RFC 7807." + ), + "content": {"application/problem+json": {}}, + }, + 413: { + "description": "SBOM exceeds the ingest size cap. RFC 7807.", + "content": {"application/problem+json": {}}, + }, + 415: { + "description": "Upload is not a CycloneDX JSON media type. RFC 7807.", + "content": {"application/problem+json": {}}, + }, + 422: { + "description": ( + "Upload is not a valid / supported CycloneDX document (not JSON, " + "wrong bomFormat, unsupported specVersion, too many components). " + "RFC 7807." + ), + "content": {"application/problem+json": {}}, + }, + 429: { + "description": ( + "Rate limited (too many scan creations from this user) or the " + "team's concurrent-scan cap is reached. RFC 7807 + Retry-After." + ), + "content": {"application/problem+json": {}}, + }, + }, +) +# B1: share the SAME rate-limit bucket as the scan-trigger endpoint +# (``scope="scan_trigger"``, keyed by the authenticated user) — ingesting an SBOM +# is a scan-creating action, so it draws from the same per-user budget rather than +# opening a parallel lane. See ``api/v1/projects.py::trigger_scan_endpoint`` for +# why ``shared_limit`` (route-path-independent bucket key) is required over plain +# ``@limit`` (which would bucket per {project_id} and let a user spray uploads +# across projects to bypass the cap). 
+@limiter.shared_limit( + scan_trigger_rate_limit, + scope="scan_trigger", + key_func=_authenticated_user_key, +) +async def ingest_sbom_endpoint( + request: Request, + project_id: uuid.UUID, + sbom: UploadFile = File( + ..., + description="A CycloneDX JSON SBOM document (.json / .cdx.json).", + ), + ref: str | None = Form( + default=None, + description=( + "Optional git ref this SBOM was produced from (e.g. refs/heads/main, " + "a tag, or a bare branch name). Normalized into a retention key." + ), + ), + release: str | None = Form( + default=None, + description="Optional release/version label for the resulting snapshot.", + ), + session: AsyncSession = Depends(get_db), + # Accept a JWT or a tos_ API key — CI pipelines push SBOMs with the key, the + # SPA with a JWT. A project-scoped key targeting a different project is 403'd + # inside the shared scan guard (existence-hide is not applied to the key-scope + # mismatch, matching the scan-trigger endpoint). + actor: CurrentUser = Depends(require_role_or_api_key("developer")), +) -> Response: + # NOTE: this is NOT the Dependency-Track ``/api/v1/bom`` + ``X-Api-Key`` BOM + # upload surface — it is a first-party, RBAC-scoped portal endpoint that + # returns a queued scan row (202) rather than a DT token. + + # M2-style fast-fail when the declared Content-Length already exceeds the + # ingest cap. NOTE(review): ``sbom`` is resolved before this body runs, so the + # upload has presumably been received already — confirm. The service still caps + # the ACTUAL bytes (Content-Length can lie / be absent); this is not the guard. 
+ declared = _declared_content_length(request) + if declared is not None and declared > sbom_ingest_max_bytes(): + return _problem_for_sbom_ingest_error( + request, + SbomIngestTooLarge( + f"declared content-length {declared} exceeds the " + f"{sbom_ingest_max_bytes()}-byte SBOM ingest limit" + ), + ) + + # Guard order is enforced inside ``ingest_sbom``: authz/existence (404/403) + + # api-key scope (403) FIRST, then request validation (413/415/422), then the + # 409 active-scan conflict at flush — so a non-member never learns a project's + # state. Scan-domain guard failures map via ``_problem_for_scan_error``; SBOM + # validation failures via ``_problem_for_sbom_ingest_error``. + try: + scan = await ingest_sbom( + session, + project_id=project_id, + upload=sbom, + actor=actor, + ref=ref, + release=release, + ) + except SbomIngestError as exc: + return _problem_for_sbom_ingest_error(request, exc) + except ScanError as exc: + return _problem_for_scan_error(request, exc) + + body = ScanPublic.model_validate(scan) + return Response( + # ``by_alias=True`` so the response carries ``metadata`` (the API field + # name) rather than ``scan_metadata`` (the ORM attribute name) — matches + # the scan-trigger endpoint's ScanPublic serialization. + content=body.model_dump_json(by_alias=True), + status_code=status.HTTP_202_ACCEPTED, + media_type="application/json", + ) + + +# slowapi's ``@limiter.shared_limit`` wraps the endpoint with functools.wraps, +# whose ``__globals__`` is slowapi's module. Under ``from __future__ import +# annotations`` FastAPI's ``get_type_hints()`` on the wrapper cannot resolve our +# string annotations, so it misclassifies the body / dependency params. Mirror the +# fix used in projects.py / auth.py: copy the names the wrapper needs into its +# ``__globals__`` (the dict is mutable even though the attribute is read-only). 
+for _name in ( + "uuid", + "UploadFile", + "AsyncSession", + "CurrentUser", + "Request", + "Response", + "Depends", + "File", + "Form", +): + if _name in globals(): + ingest_sbom_endpoint.__globals__.setdefault(_name, globals()[_name]) +del _name diff --git a/apps/backend/core/config.py b/apps/backend/core/config.py index 0b088e3e..6f8ab9a5 100644 --- a/apps/backend/core/config.py +++ b/apps/backend/core/config.py @@ -815,6 +815,37 @@ def sbom_download_max_bytes() -> int: return int(os.getenv("SBOM_DOWNLOAD_MAX_BYTES", str(64 * 1024 * 1024))) +def sbom_ingest_max_bytes() -> int: + """Hard ceiling on an externally-ingested CycloneDX SBOM upload. + + The SBOM-ingest endpoint (``POST /v1/projects/{id}/sbom-ingest``) accepts an + attacker-controllable JSON document. We read the upload through a bounded, + chunked loop and abort the instant the running total crosses this cap — the + body is NEVER buffered in full first, so an oversized upload cannot exhaust + memory before the size check fires. Over-cap surfaces as a 413 (RFC 7807). + + Default 32 MiB comfortably covers a large real-world CycloneDX SBOM (tens of + thousands of components) while still bounding the request. Read at call time + (CLAUDE.md core rule #11) so an operator can retune without a rebuild. + """ + return int(os.getenv("SBOM_INGEST_MAX_BYTES", str(32 * 1024 * 1024))) + + +def sbom_ingest_max_components() -> int: + """Max number of ``components`` entries an ingested CycloneDX SBOM may carry. + + A second, structural DoS guard layered on top of ``sbom_ingest_max_bytes``: + even a within-size document could declare a pathological component count that + the downstream Celery persister would loop over. The synchronous validation + only checks ``len(components)`` (it never deep-traverses the elements), so the + check is O(1) on the already-parsed list. Over-cap surfaces as a 422. + + Default 50,000 mirrors the source-archive member ceiling. Read at call time + (CLAUDE.md core rule #11). 
+ """ + return int(os.getenv("SBOM_INGEST_MAX_COMPONENTS", "50000")) + + def workspace_root() -> str: """Root directory under which per-scan workspaces live.""" return os.getenv("WORKSPACE_HOST_PATH", "/tmp/trustedoss") # noqa: S108 diff --git a/apps/backend/services/sbom_ingest_service.py b/apps/backend/services/sbom_ingest_service.py new file mode 100644 index 00000000..a3d5219f --- /dev/null +++ b/apps/backend/services/sbom_ingest_service.py @@ -0,0 +1,574 @@ +""" +External CycloneDX SBOM ingest — synchronous validation + scan-row creation. + +This is the synchronous *front half* of the SBOM-ingest feature: it accepts an +uploaded CycloneDX JSON document, validates it adversarially, persists a +``kind="sbom"`` :class:`~models.scan.Scan` row, writes the validated SBOM to a +durable on-disk location, and enqueues the Celery task that does the heavy work +(``tasks.ingest_sbom.ingest_sbom_task``). The endpoint returns ``202 Accepted`` +with the queued scan row — never the result (CLAUDE.md core rule #3). + +This endpoint is NOT the Dependency-Track ``/api/v1/bom`` + ``X-Api-Key`` BOM +upload surface — it is a first-party, RBAC-scoped portal endpoint. + +Security posture (the upload is untrusted input — this is the PR's core attack +surface; recorded for the security reviewer): + + - **Unbounded buffering is forbidden.** The upload is read through a bounded, + chunked loop that aborts the instant the running total crosses + ``sbom_ingest_max_bytes()`` (default 32 MiB) — the whole body is never + materialised first. Over-cap surfaces as 413. The endpoint additionally + fast-fails on a declared ``Content-Length`` over the cap before reading a + single body byte (mirrors the source-archive endpoint). + + - **Content-Type / filename allow-list.** Accepted when the part media type is + allow-listed OR the filename ends in ``.json`` (415 only if both fail). Both + are advisory; the JSON parse + CycloneDX structure check are authoritative. 
+ + - **Structural whitelist, NO deep traversal.** We parse the JSON and check + only the TOP-LEVEL keys: ``bomFormat == "CycloneDX"``, ``specVersion`` in a + known set, and (when present) ``components`` is a list whose ``len`` is + within ``sbom_ingest_max_components()`` (default 50,000). We deliberately do + NOT recurse into the component elements here — a deeply-nested hostile + document cannot drive our validation into a recursion / CPU blow-up. The + authoritative deep parse happens later, inside the Celery worker + (``persist_sbom_components``), off the request path. + +CLAUDE.md compliance: + - Core rule #11: every limit is read via ``os.getenv`` at call time (through + the ``core.config`` accessors) — no module-level env caching. + - §4: failures raise typed domain exceptions carrying an HTTP status; the + router maps them to RFC 7807 ``application/problem+json``. + - Core rule #2: no schema change — the validated SBOM rides on disk and its + path is carried in ``scan_metadata['sbom_path']`` (JSONB), so no migration. +""" + +from __future__ import annotations + +import json +import uuid +from pathlib import Path +from typing import Any + +import structlog +from fastapi import UploadFile +from sqlalchemy.exc import IntegrityError +from sqlalchemy.ext.asyncio import AsyncSession + +from core.audit import bind_audit_team +from core.config import ( + sbom_ingest_max_bytes, + sbom_ingest_max_components, + workspace_root, +) +from core.security import CurrentUser +from models import Scan +from services.scan_service import ( + ScanEnqueueFailed, + ScanInProgressConflict, + normalize_ref, + prepare_scan_target, +) +from tasks import enqueue_scan + +log = structlog.get_logger("sbom_ingest.service") + +# Streaming chunk size for the bounded inbound read. +_CHUNK_SIZE = 1024 * 1024 # 1 MiB + +# Content types a browser / CLI realistically sets for a CycloneDX JSON upload. 
+# The header is advisory (the JSON + structure checks are authoritative) but an +# obviously-wrong declaration fails fast with a clear 415 message. +_ALLOWED_CONTENT_TYPES = frozenset( + { + "application/json", + "application/vnd.cyclonedx+json", + "application/octet-stream", + "", # some CLIs omit the part content-type + } +) + +# CycloneDX spec versions we accept. The Celery persister handles the structural +# differences; here we only gate the declared version so a wildly-mismatched +# document is rejected up front. +_ALLOWED_SPEC_VERSIONS = frozenset({"1.2", "1.3", "1.4", "1.5", "1.6"}) + +# Reject a document whose structural nesting exceeds this before it ever reaches +# ``json.loads`` — the stdlib decoder recurses per nesting level and overflows +# the interpreter recursion limit (RecursionError → unhandled 500) on a +# maliciously deep document. A real CycloneDX SBOM nests only a handful of +# levels (metadata.component, nested components[], dependencies[]); 64 is far +# above any legitimate document yet well under CPython's default recursion +# ceiling, so the cheap O(n) byte scan below rejects the abuse case as a clean +# 422 instead of crashing the worker thread's stack. +_MAX_NESTING_DEPTH = 64 + + +# --------------------------------------------------------------------------- +# Domain exceptions +# --------------------------------------------------------------------------- + + +class SbomIngestError(Exception): + """Base class for SBOM-ingest errors. 
Each carries an HTTP status.""" + + status_code: int = 400 + title: str = "SBOM Ingest Error" + type_uri: str = "https://docs.trustedoss.io/errors/sbom-ingest" + + +class SbomIngestTooLarge(SbomIngestError): + status_code = 413 + title = "SBOM Too Large" + type_uri = "https://docs.trustedoss.io/errors/sbom-ingest-too-large" + + +class SbomIngestUnsupportedType(SbomIngestError): + status_code = 415 + title = "Unsupported SBOM Type" + type_uri = "https://docs.trustedoss.io/errors/sbom-ingest-unsupported-type" + + +class SbomIngestInvalid(SbomIngestError): + """The upload is not a valid / supported CycloneDX JSON document. + + Maps to 422 — the request was well-formed (right size, right media type) but + the *content* is not an ingestible CycloneDX SBOM (not JSON, not an object, + wrong ``bomFormat``, unsupported ``specVersion``, malformed ``components``, + or too many components). + """ + + status_code = 422 + title = "Invalid SBOM Document" + type_uri = "https://docs.trustedoss.io/errors/sbom-ingest-invalid" + + +class SbomIngestStorageError(SbomIngestError): + """The validated SBOM could not be persisted to disk (transient infra fault). + + Maps to 503 — a server-side I/O failure (full/read-only volume, etc.), NOT a + client error. Distinct from the 422 ``SbomIngestInvalid`` so a CI client + treats it as retryable rather than surfacing a misleading "invalid SBOM". + The client-facing message is generic (no path / errno leak). + """ + + status_code = 503 + title = "SBOM Storage Unavailable" + type_uri = "https://docs.trustedoss.io/errors/sbom-ingest-storage" + + +# --------------------------------------------------------------------------- +# Path helpers +# --------------------------------------------------------------------------- + + +def sbom_ingest_dir_for_project(project_id: uuid.UUID) -> Path: + """Directory holding a project's durably-stored ingested SBOMs. 
+ + Lives under ``workspace_root()`` but OUTSIDE any per-scan workspace + (``{workspace_root()}/{scan_id}``), which the Celery task rmtrees on + completion. The ingest task points its ``sbom_cyclonedx`` ScanArtifact at + this durable copy so the SBOM signature/bundle download surface keeps working. + """ + return Path(workspace_root()) / "sbom-ingest" / str(project_id) + + +def sbom_ingest_path(project_id: uuid.UUID, scan_id: uuid.UUID) -> Path: + """Resolve the on-disk path for one ingested SBOM (keyed by scan id).""" + return sbom_ingest_dir_for_project(project_id) / f"{scan_id}.cdx.json" + + +# --------------------------------------------------------------------------- +# Bounded read +# --------------------------------------------------------------------------- + + +async def _read_bounded(upload: UploadFile, *, max_bytes: int) -> bytes: + """Read the whole upload into memory, but never more than ``max_bytes``. + + Reads in 1 MiB chunks and raises :class:`SbomIngestTooLarge` the instant the + running total would exceed ``max_bytes`` — the body is NEVER buffered in full + before the size is known, so an oversized upload cannot exhaust memory. We + intentionally read up to ``max_bytes`` into memory (not stream-to-disk like + the zip path) because the validated document must be JSON-parsed in full + anyway; the cap keeps that bounded. 
+ """ + buf = bytearray() + while True: + chunk = await upload.read(_CHUNK_SIZE) + if not chunk: + break + buf.extend(chunk) + if len(buf) > max_bytes: + log.warning( + "sbom_ingest.reject_too_large", + limit_bytes=max_bytes, + ) + raise SbomIngestTooLarge( + f"SBOM upload exceeds the {max_bytes}-byte ingest limit" + ) + return bytes(buf) + + +# --------------------------------------------------------------------------- +# Validation (pure — unit-testable without DB / Redis) +# --------------------------------------------------------------------------- + + +def _validate_content_type(*, content_type: str | None, filename: str | None) -> None: + """Reject media types / filenames that are not CycloneDX JSON (415). + + The check is advisory-but-fast-failing: a part content-type in the allow-list + OR a ``.json`` / ``.cdx.json`` filename is accepted. A request that is wrong + on BOTH axes is rejected with 415 before we even parse the body. + """ + normalized_ct = (content_type or "").lower().split(";", 1)[0].strip() + name = (filename or "").strip().lower() + name_ok = name.endswith(".json") or name.endswith(".cdx.json") + if normalized_ct in _ALLOWED_CONTENT_TYPES: + return + if name_ok: + return + log.warning( + "sbom_ingest.reject_content_type", + content_type=normalized_ct or "", + filename=name or "", + ) + raise SbomIngestUnsupportedType( + f"content-type {normalized_ct!r} (filename {name!r}) is not an accepted " + "CycloneDX JSON media type" + ) + + +def _max_nesting_depth(raw: bytes) -> int: + """Return the maximum ``{``/``[`` nesting depth of ``raw``, string-aware. + + A single O(n) byte scan that tracks structural nesting while skipping the + contents of JSON strings (so a license text or description containing ``{`` + never inflates the count) and honouring backslash escapes inside strings. + Used as a pre-check so a pathologically deep document is rejected before the + recursive ``json.loads`` decoder is invoked. No recursion here. 
+ """ + depth = 0 + max_depth = 0 + in_string = False + escaped = False + for byte in raw: + if in_string: + if escaped: + escaped = False + elif byte == 0x5C: # backslash + escaped = True + elif byte == 0x22: # closing quote + in_string = False + continue + if byte == 0x22: # opening quote + in_string = True + elif byte == 0x7B or byte == 0x5B: # '{' or '[' + depth += 1 + if depth > max_depth: + max_depth = depth + elif byte == 0x7D or byte == 0x5D: # '}' or ']' + if depth > 0: + depth -= 1 + return max_depth + + +def validate_cyclonedx_document(raw: bytes) -> dict[str, Any]: + """Parse + structurally validate an uploaded CycloneDX JSON document. + + Returns the parsed top-level dict on success. Raises :class:`SbomIngestInvalid` + (422) on any of: non-JSON, non-object top level, ``bomFormat != "CycloneDX"``, + unsupported ``specVersion``, ``components`` present but not a list, or more + than ``sbom_ingest_max_components()`` components. + + Adversarial-input contract: we inspect ONLY top-level keys and ``len()`` of + ``components`` — we never recurse into the component elements, so a deeply + nested document cannot drive THIS function's own logic into a recursion / CPU + blow-up. The authoritative deep parse runs later in the Celery worker, off + the request path. + + ``json.loads`` itself, however, recurses one stack frame per nesting level + and raises ``RecursionError`` on a maliciously deep document — which is NOT a + ``ValueError`` and would otherwise escape as an unhandled 500. We guard that + two ways: a cheap O(n) byte-level depth pre-check (rejecting before the + decoder runs), and a defensive ``RecursionError`` catch around ``json.loads`` + for any deep input that slips under the byte-depth heuristic. Both surface as + a clean 422. (Total size is already bounded by the cap applied upstream.) 
+ """ + depth = _max_nesting_depth(raw) + if depth > _MAX_NESTING_DEPTH: + raise SbomIngestInvalid( + f"SBOM nesting depth {depth} exceeds the maximum {_MAX_NESTING_DEPTH}" + ) + + try: + parsed = json.loads(raw) + except ValueError as exc: + raise SbomIngestInvalid(f"upload is not valid JSON: {exc}") from exc + except RecursionError as exc: # pragma: no cover — byte pre-check catches first + raise SbomIngestInvalid("SBOM document is too deeply nested") from exc + + if not isinstance(parsed, dict): + raise SbomIngestInvalid("SBOM document top level is not a JSON object") + + if parsed.get("bomFormat") != "CycloneDX": + raise SbomIngestInvalid( + "document is not a CycloneDX SBOM (bomFormat != 'CycloneDX')" + ) + + spec_version = parsed.get("specVersion") + if not isinstance(spec_version, str) or spec_version not in _ALLOWED_SPEC_VERSIONS: + raise SbomIngestInvalid( + f"unsupported CycloneDX specVersion {spec_version!r}; " + f"supported versions are {sorted(_ALLOWED_SPEC_VERSIONS)}" + ) + + components = parsed.get("components") + if components is not None: + if not isinstance(components, list): + raise SbomIngestInvalid("'components' must be a JSON array when present") + max_components = sbom_ingest_max_components() + if len(components) > max_components: + raise SbomIngestInvalid( + f"SBOM declares {len(components)} components; the maximum is " + f"{max_components}" + ) + + return parsed + + +# --------------------------------------------------------------------------- +# Ingest — validate + persist scan row + write file + enqueue +# --------------------------------------------------------------------------- + + +def _unlink_quietly(path: Path) -> None: + try: + path.unlink(missing_ok=True) + except OSError: # pragma: no cover — best-effort cleanup + pass + + +# Operator-facing free-text metadata fields (release label, original filename) +# are bounded + control-byte-stripped before they land in the scan_metadata +# JSONB. 
``ref`` is already sanitized by ``normalize_ref``; this keeps the other +# two from drifting on hardening (parity with trigger_scan's mask_pii pass) and +# stops an oversized/embedded-newline value from polluting audit diffs and the +# scan list UI. Returns None for an empty/whitespace value. +_META_TEXT_MAX_LEN = 255 + + +def _clean_meta_text(value: str | None) -> str | None: + if not value: + return None + # Keep only str.isprintable() chars: drops C0/C1 controls (NUL, CR, LF, tab) and + # other non-printables (e.g. NBSP) — none belong in a label/filename or log line. + cleaned = "".join(ch for ch in value if ch.isprintable()).strip() + if not cleaned: + return None + return cleaned[:_META_TEXT_MAX_LEN] + + +async def ingest_sbom( + session: AsyncSession, + *, + project_id: uuid.UUID, + upload: UploadFile, + actor: CurrentUser, + ref: str | None = None, + release: str | None = None, +) -> Scan: + """Validate the uploaded SBOM, persist a queued scan row, and enqueue it. + + Guard order (CLAUDE.md §2 rule 1 — authz / existence ALWAYS before state): + + 1. ``prepare_scan_target`` — existence/team-access (404/403), project-scoped + API-key boundary (403), archived (409), per-team concurrency cap (429). + Reuses ``trigger_scan``'s exact guard sequence + exceptions. + 2. Request validation — Content-Type/filename (415), size cap (413), JSON + + CycloneDX structure (422). Runs AFTER the authz/state guards so a + non-member learns nothing about a project from a malformed body. + 3. INSERT the scan row, flush. The partial unique index + ``ix_scans_project_active`` makes this the atomic concurrency check: a + second in-flight scan for the project raises :class:`ScanInProgressConflict` + (409) at flush — AFTER the 404/403 above. + 4. Write the validated SBOM to its durable on-disk path (keyed by scan id), + stamp ``scan_metadata['sbom_path']``, set ``project.latest_scan_id``, + commit. + 5. Enqueue the Celery task; store the returned task id and commit. 
An enqueue + failure marks the row ``failed`` and raises :class:`ScanEnqueueFailed` + (503) — identical to ``trigger_scan``. + + Atomicity / cleanup rationale (why scan_id → file → enqueue is safe): + + * The scan id only exists after ``flush()``, and the file path is keyed by + it, so the file can only be written for a row that already won the + per-project active-scan race. A loser (409) never writes a file. + * If the post-flush commit fails (e.g. the commit-time re-check of the + unique index), we delete the just-written file before re-raising the 409, + so no orphan SBOM is left for a scan row that does not exist. + * If enqueue fails, the row is flipped to ``failed`` (the durable SBOM is + left in place — it is small, keyed by the failed scan id, and the orphan + workspace/retention sweep reclaims it; deleting it here would race a + retry that re-dispatches the same row). This matches ``trigger_scan``'s + "mark failed, surface 503" behaviour. + """ + # ---- 1. authz / existence / state guards (shared with trigger_scan) ------- + project = await prepare_scan_target(session, project_id=project_id, actor=actor) + + # Bind the owning team into the audit context BEFORE the scan-row INSERT so + # the before_flush audit listener stamps audit_logs.team_id (mirrors + # trigger_scan). Without this the ingest mutation is audited with a NULL / + # stale team_id and drops out of team-scoped audit views — exactly the + # attribution an incident responder needs for an internet-facing surface. + bind_audit_team(project.team_id) + + # ---- 2. request validation (untrusted input) ----------------------------- + _validate_content_type(content_type=upload.content_type, filename=upload.filename) + raw = await _read_bounded(upload, max_bytes=sbom_ingest_max_bytes()) + # Structural whitelist; never deep-traverses component elements. 
+ validate_cyclonedx_document(raw) + + original_filename = _clean_meta_text(upload.filename) + normalized_release = _clean_meta_text(release) + + # Capture identifiers BEFORE any commit so the except branches never touch an + # expired ORM attribute (which would trigger a sync lazy-load on the async + # engine). Plain locals are safe across rollback. + project_id_value = project.id + project_team_id = project.team_id + + # ---- 3. INSERT the scan row + flush to win the active-scan race ----------- + # sbom_path is filled in step 4 (it needs scan.id); seed it absent so the row + # is well-formed even if the flush fails. + scan = Scan( + project_id=project_id_value, + kind="sbom", + status="queued", + progress_percent=0, + current_step=None, + celery_task_id=None, # set after enqueue + requested_by_user_id=actor.id, + scan_metadata={ + "source_type": "sbom", + "release": normalized_release, + "original_filename": original_filename, + }, + # scan-retention: stamp the normalized ref so the ref-keyed retire query + # (run when this scan later succeeds) is index-driven. NULL for ad-hoc. + ref=normalize_ref(ref), + ) + session.add(scan) + try: + await session.flush() + except IntegrityError as exc: + # Partial unique index ix_scans_project_active (project_id WHERE status IN + # ('queued','running')) — a scan is already in flight for this project. + await session.rollback() + raise ScanInProgressConflict( + f"a scan is already queued or running for project {project_id_value}", + ) from exc + + scan_id = scan.id + + # ---- 4. write the durable SBOM, stamp the path, commit -------------------- + dest = sbom_ingest_path(project_id_value, scan_id) + dest.parent.mkdir(parents=True, exist_ok=True) + try: + dest.write_bytes(raw) + except OSError as exc: + # The row was flushed but the SBOM could not be persisted; roll back so we + # do not leave a queued row whose task will immediately abort on a missing + # file. Surface as 503 (transient infra error) via SbomIngestStorageError. 
+ _unlink_quietly(dest) + await session.rollback() + log.error( + "sbom_ingest.write_failed", + project_id=str(project_id_value), + scan_id=str(scan_id), + error=type(exc).__name__, + ) + # 503, not 422: a disk/IO fault is a transient server error, so a CI + # client should retry rather than treat its valid SBOM as rejected. + raise SbomIngestStorageError("failed to persist the uploaded SBOM") from exc + + # Record the durable path so the Celery task can load it. We rewrite the whole + # metadata dict (rather than mutating in place) so the ORM change-tracking sees + # the JSONB column as dirty. + scan.scan_metadata = { + "source_type": "sbom", + "release": normalized_release, + "original_filename": original_filename, + "sbom_path": str(dest), + } + + # Keep the denormalized pointer in sync so list pages show the queued scan + # immediately (mirrors trigger_scan). + project.latest_scan_id = scan_id + + try: + await session.commit() + except IntegrityError as exc: + # Commit-time re-check of the unique index can still fire under a race. + # Delete the just-written SBOM so no orphan file outlives the rolled-back + # row, then surface the same 409 as the flush path. + await session.rollback() + _unlink_quietly(dest) + raise ScanInProgressConflict( + f"a scan is already queued or running for project {project_id_value}", + ) from exc + + await session.refresh(scan) + + # ---- 5. enqueue the Celery task ------------------------------------------- + try: + celery_task_id = enqueue_scan(scan) + except Exception as exc: + # The row exists in 'queued' but no worker will pick it up. Flip it to + # 'failed' with the deterministic prefix trigger_scan uses so callers can + # distinguish enqueue failures from pipeline failures. The durable SBOM is + # left in place (see docstring). 
+ log.error( + "sbom_ingest.enqueue_failed", + scan_id=str(scan_id), + project_id=str(project_id_value), + error=str(exc), + exc_info=True, + ) + scan.status = "failed" + scan.error_message = f"enqueue_failed: {exc}" + try: + await session.commit() + except Exception: # noqa: BLE001 + await session.rollback() + raise ScanEnqueueFailed( + f"failed to enqueue SBOM ingest for project {project_id_value}: {exc}", + ) from exc + + scan.celery_task_id = celery_task_id + await session.commit() + await session.refresh(scan) + + log.info( + "sbom_ingest_queued", + scan_id=str(scan_id), + project_id=str(project_id_value), + team_id=str(project_team_id), + celery_task_id=celery_task_id, + sbom_bytes=len(raw), + ) + return scan + + +__all__ = [ + "SbomIngestError", + "SbomIngestInvalid", + "SbomIngestStorageError", + "SbomIngestTooLarge", + "SbomIngestUnsupportedType", + "ingest_sbom", + "sbom_ingest_dir_for_project", + "sbom_ingest_path", + "validate_cyclonedx_document", +] diff --git a/apps/backend/services/scan_service.py b/apps/backend/services/scan_service.py index 41eb310b..89446487 100644 --- a/apps/backend/services/scan_service.py +++ b/apps/backend/services/scan_service.py @@ -442,6 +442,74 @@ def normalize_ref(raw: str | None) -> str | None: return s +# --------------------------------------------------------------------------- +# Shared pre-flight guards (trigger_scan + sbom-ingest) +# --------------------------------------------------------------------------- + + +async def prepare_scan_target( + session: AsyncSession, + *, + project_id: uuid.UUID, + actor: CurrentUser, +) -> Project: + """Run the create-a-scan pre-flight guards and return the target project. + + Extracted verbatim from ``trigger_scan`` so the SBOM-ingest path reuses the + SAME guard sequence (and the SAME typed exceptions / status codes) instead of + re-implementing them. ``trigger_scan`` keeps calling this so its behaviour is + unchanged — the only refactor is moving these lines behind a name. 
+ + Guard order (CLAUDE.md §2 rule 1 — authz / existence ALWAYS before state): + + 1. existence + team access — :class:`ProjectMissingForScan` (404) for an + unknown id; :class:`ScanForbidden` (403) for a project in another team. + 2. project-scoped API-key boundary — a single-project CI key targeting a + different project raises :class:`ScanForbidden` (403). (M-2) + 3. archived project — :class:`ScanArchivedConflict` (409). (H-7) + 4. per-team concurrency cap — :class:`ConcurrentScanLimitExceeded` (429). + + The 409 archived state deliberately sits AFTER the 403/404 authz checks so a + non-member can never distinguish "archived" from "does not exist". + + The workspace disk guard (:class:`ScanDiskFull`, 503) is deliberately NOT run + here: ``trigger_scan`` runs some kind-specific guards (upload-archive resolve, + the BUG-008 source-unavailable check) BETWEEN the concurrency cap and the disk + guard, so each caller invokes ``_check_disk_guard()`` itself at the right point + to keep its established ordering byte-for-byte. + """ + project = await _load_project(session, project_id) + if not _can_access_team(actor, project.team_id): + raise ScanForbidden( + f"actor is not a member of team {project.team_id}", + ) + + # M-2 — a project-scoped API key is bounded to ITS project, not the whole + # owning team. Without this gate a single-project CI key could trigger + # scans on every other project of the same team (least-privilege breach). + if ( + actor.api_key_project_id is not None + and actor.api_key_project_id != project_id + ): + raise ScanForbidden( + "API key is scoped to a different project", + ) + + # H-7 — archiving disables new scans. Reject before reserving any worker + # slot so a retired project cannot keep consuming capacity. + if project.archived_at is not None: + raise ScanArchivedConflict( + f"project {project_id} is archived; unarchive it to run new scans", + ) + + # B1 — per-team concurrency cap. 
Reject the trigger up front when the + # team already has the maximum number of queued+running scans, protecting + # the shared Celery worker pool from a single team's burst. + await _enforce_team_concurrency_cap(session, project.team_id) + + return project + + # --------------------------------------------------------------------------- # Trigger scan (skeleton — Celery enqueue lands in PR #8) # --------------------------------------------------------------------------- @@ -479,37 +547,12 @@ async def trigger_scan( concurrent caller hits :class:`ScanInProgressConflict` (409) without ever reaching the Celery dispatcher. """ - project = await _load_project(session, project_id) - if not _can_access_team(actor, project.team_id): - raise ScanForbidden( - f"actor is not a member of team {project.team_id}", - ) - - # M-2 — a project-scoped API key is bounded to ITS project, not the whole - # owning team. Without this gate a single-project CI key could trigger - # scans on every other project of the same team (least-privilege breach). - if ( - actor.api_key_project_id is not None - and actor.api_key_project_id != project_id - ): - raise ScanForbidden( - "API key is scoped to a different project", - ) - - # H-7 — archiving disables new scans. Reject before reserving any worker - # slot so a retired project cannot keep consuming capacity. - if project.archived_at is not None: - raise ScanArchivedConflict( - f"project {project_id} is archived; unarchive it to run new scans", - ) - - # B1 — per-team concurrency cap. Reject the trigger up front when the - # team already has the maximum number of queued+running scans, protecting - # the shared Celery worker pool from a single team's burst. This is a - # soft stability cap (see _enforce_team_concurrency_cap docstring on the - # SELECT-then-INSERT race), distinct from the hard per-project unique - # index enforced below at flush time. 
- await _enforce_team_concurrency_cap(session, project.team_id) + # Shared pre-flight guards (existence/access/scope/archived/concurrency cap). + # Extracted to ``prepare_scan_target`` so the SBOM-ingest path reuses the EXACT + # same sequence + exceptions; trigger_scan's behaviour is unchanged. The disk + # guard is NOT in the helper — it runs below, after the source-specific guards, + # to keep this path's established ordering byte-for-byte. + project = await prepare_scan_target(session, project_id=project_id, actor=actor) # feat/zip-upload: when the scan asks for an uploaded source archive, # verify the file exists on disk *before* we enqueue. Otherwise the worker @@ -974,5 +1017,6 @@ async def list_scans_for_actor( "list_scans_for_actor", "list_scans_for_project", "normalize_ref", + "prepare_scan_target", "trigger_scan", ] diff --git a/apps/backend/tasks/__init__.py b/apps/backend/tasks/__init__.py index 8e92c960..790e7a74 100644 --- a/apps/backend/tasks/__init__.py +++ b/apps/backend/tasks/__init__.py @@ -11,8 +11,9 @@ The dispatcher branches on ``scan.kind``: - ``"source"`` → :func:`tasks.scan_source.scan_source_task` - ``"container"`` → :func:`tasks.scan_container.scan_container_task` + - ``"sbom"`` → :func:`tasks.ingest_sbom.ingest_sbom_task` -Both tasks accept ``scan_id`` as a UUID string (Celery serialization is JSON; +All tasks accept ``scan_id`` as a UUID string (Celery serialization is JSON; see ``tasks/celery_app.py``). """ @@ -66,6 +67,7 @@ def enqueue_scan(scan: Scan) -> str: scan_hard_time_limit_seconds, scan_soft_time_limit_seconds, ) + from tasks.ingest_sbom import ingest_sbom_task from tasks.scan_container import scan_container_task from tasks.scan_source import scan_source_task @@ -85,6 +87,16 @@ def enqueue_scan(scan: Scan) -> str: soft_time_limit=soft_limit, time_limit=hard_limit, ) + elif scan.kind == "sbom": + # External CycloneDX SBOM ingest reuses the source pipeline's back half + # (components → Trivy → findings → finalize). 
Same per-dispatch + # time-boxing as source/container so a hung Trivy step cannot pin a + # worker slot. + async_result = ingest_sbom_task.apply_async( + args=(scan_id,), + soft_time_limit=soft_limit, + time_limit=hard_limit, + ) else: raise ValueError(f"unknown scan.kind={scan.kind!r}") return str(async_result.id) diff --git a/apps/backend/tasks/celery_app.py b/apps/backend/tasks/celery_app.py index 6f9e8d70..71612f8a 100644 --- a/apps/backend/tasks/celery_app.py +++ b/apps/backend/tasks/celery_app.py @@ -33,6 +33,10 @@ _TASK_INCLUDES = [ "tasks.scan_source", "tasks.scan_container", + # External CycloneDX SBOM ingest — reuses the source pipeline's back half + # (components → Trivy SBOM matching → findings → finalize) against an + # uploaded SBOM (no clone / cdxgen / scancode / signing). + "tasks.ingest_sbom", # v2.3 r1 — Go govulncheck call-graph reachability enrichment, dispatched as # a follow-up after a source scan succeeds (best-effort, never blocks a scan). "tasks.scan_reachability", diff --git a/apps/backend/tasks/ingest_sbom.py b/apps/backend/tasks/ingest_sbom.py new file mode 100644 index 00000000..ac4cc8d8 --- /dev/null +++ b/apps/backend/tasks/ingest_sbom.py @@ -0,0 +1,358 @@ +""" +External CycloneDX SBOM ingest Celery task. + +A user uploads a CycloneDX JSON SBOM (no first-party source, no clone, no +cdxgen, no scancode, no signing). This task reuses the *back half* of the +source-scan pipeline — component persistence → Trivy SBOM matching → +findings persistence → finalize — against the uploaded document. + +CLAUDE.md core rule #3: like the source/container scans, this runs +asynchronously inside a Celery worker. The FastAPI request handler that +accepted the upload only persisted a ``Scan`` row in state ``queued`` with +``kind="sbom"`` and wrote the validated CycloneDX file to disk under +``{workspace_root()}/sbom-ingest/{project_id}/{scan_id}.cdx.json``; the path +is carried in ``scan_metadata["sbom_path"]``. 
+ +CLAUDE.md core rule #4 (post-W6): Trivy is the single vulnerability-matching +engine. ``persist_sbom_components`` runs BEFORE ``run_trivy_sbom`` because the +Trivy persister matches findings to ``ComponentVersion`` rows by PURL — the +component graph must exist first. + +Idempotency: + Keyed off ``scan_id`` (Celery ``task_acks_late=True`` + a worker restart + can re-enter on the same id). We: + 1. Skip immediately if the scan already reached ``succeeded``. + 2. Otherwise treat the run as a fresh start: ``_reset_scan_for_rerun`` + wipes prior ScanComponent / VulnerabilityFinding / LicenseFinding / + ScanArtifact / edge rows for this scan, then every stage re-runs. + The DB partial-unique index ("at most one in-flight scan per project") + already prevents a parallel collision. + +Workspace: + A fresh ``{workspace_root()}/{scan_id}/`` holds the transient Trivy output + and is removed in ``finally`` (``shutil.rmtree(ignore_errors=True)`` — the + orphan workspace cleaner reclaims anything a SIGKILL leaves behind). The + uploaded SBOM lives OUTSIDE this per-scan tree (under + ``{workspace_root()}/sbom-ingest/...``) so the ``finally`` cleanup never + deletes it — that durable copy backs the ``sbom_cyclonedx`` ScanArtifact + so the SBOM signature/bundle download endpoints keep working for an + ingested scan. 
+""" + +from __future__ import annotations + +import json +import shutil +import uuid +from pathlib import Path +from typing import Any + +import structlog +from celery.exceptions import SoftTimeLimitExceeded + +from core.config import scan_soft_time_limit_seconds, workspace_root +from core.db import sync_session_scope +from integrations.trivy import ( + TrivyError, + TrivyFailed, + TrivyNotInstalled, + TrivyTimeout, + run_trivy_sbom, +) +from models import Project, Scan +from services.vulnerability_matching import persist_trivy_findings +from tasks._progress import ( + close_log_file, + make_line_callback, + reset_log_counter, +) +from tasks._scan_pipeline import ( + mark_failed, + mark_succeeded, + record_terminal_failure, + set_stage, +) +from tasks.celery_app import celery_app +from tasks.scan_source import ( + _mark_running, + _persist_artifact, + _reset_scan_for_rerun, + persist_sbom_components, +) + +log = structlog.get_logger("tasks.ingest_sbom") + + +# --------------------------------------------------------------------------- +# Stage progress mapping +# --------------------------------------------------------------------------- +# +# A condensed view of the source pipeline's stages — the ingest path skips +# fetch / prep / cdxgen / sign / scancode / approvals / preserve. Percentages +# stay monotonic so the WS progress frame contract holds for clients that also +# render source scans. +_STAGE_PROGRESS: dict[str, int] = { + "bootstrap": 0, + "components": 40, + "trivy": 80, + "finalize": 100, +} + + +def _set_stage(scan_uuid: uuid.UUID, stage: str) -> None: + """Advance to ``stage`` using this task's percent mapping.""" + set_stage(scan_uuid, stage, _STAGE_PROGRESS.get(stage)) + + +# ScanArtifact.kind for the ingested CycloneDX document. 
We reuse the SAME kind +# the source pipeline writes for the cdxgen SBOM so the signature/bundle +# download surface (``services.sbom_signature.KIND_SBOM == "sbom_cyclonedx"``) +# resolves it uniformly; the SBOM *export* endpoint rebuilds from DB rows and +# does not depend on this artifact. +_SBOM_ARTIFACT_KIND = "sbom_cyclonedx" + + +# --------------------------------------------------------------------------- +# Public Celery task +# --------------------------------------------------------------------------- + + +# Time limits are passed per dispatch by ``tasks.enqueue_scan`` via +# ``apply_async(soft_time_limit=..., time_limit=...)`` (read from env at +# dispatch time — CLAUDE.md rule #11), NOT pinned on the decorator. Mirrors +# ``scan_source_task``. +@celery_app.task( # type: ignore[misc] + name="trustedoss.ingest_sbom", + bind=True, +) +def ingest_sbom_task(self: Any, scan_id: str) -> None: + """ + Ingest an uploaded CycloneDX SBOM to completion. + + Args: + scan_id: UUID **string** (Celery JSON serialization compatibility). + """ + structlog.contextvars.bind_contextvars( + scan_id=scan_id, task_id=self.request.id, task_kind="sbom" + ) + try: + scan_uuid = uuid.UUID(scan_id) + except ValueError: + log.error("ingest_sbom_invalid_scan_id", scan_id=scan_id) + return + + # Drop any per-scan log-line budget left from a previous run (acks_late + + # worker restart can re-enter on the same scan_id). Symmetric with the + # prior-rows wipe in _reset_scan_for_rerun. Idempotent on a first run. 
+ reset_log_counter(scan_uuid) + + workspace = Path(workspace_root()) / str(scan_uuid) + + try: + with sync_session_scope() as session: + scan = session.get(Scan, scan_uuid) + if scan is None: + log.warning("ingest_sbom_missing_scan_row") + return + if scan.status == "succeeded": + log.info("ingest_sbom_already_succeeded") + return + + project = session.get(Project, scan.project_id) + if project is None: + mark_failed(session, scan, "project no longer exists") + return + + _reset_scan_for_rerun(session, scan) + _mark_running(session, scan) + project_id = project.id + # Snapshot the metadata blob while the row is session-attached; after + # the `with` block the ORM attribute is expired and touching it would + # trigger a sync lazy-load on the async engine. A plain dict copy is + # safe to carry into the pipeline. + scan_metadata = dict(scan.scan_metadata or {}) + + _run_pipeline( + scan_uuid=scan_uuid, + project_id=project_id, + workspace=workspace, + scan_metadata=scan_metadata, + ) + except _IngestAborted as exc: + # The uploaded SBOM is missing / outside the workspace / not JSON. + # Terminal — the synchronous service Pass does the authoritative + # validation; this is a minimal defensive backstop. + log.warning("ingest_sbom_aborted", error=str(exc)) + record_terminal_failure(scan_uuid, f"SBOM ingest aborted: {exc}") + except TrivyNotInstalled as exc: + log.error("ingest_sbom_trivy_not_installed", error=str(exc)) + record_terminal_failure(scan_uuid, f"Trivy binary missing: {exc}") + except TrivyTimeout as exc: + log.warning("ingest_sbom_trivy_timeout", error=str(exc)) + record_terminal_failure(scan_uuid, f"Trivy scan timed out: {exc}") + except TrivyFailed as exc: + log.error("ingest_sbom_trivy_failed", error=str(exc)) + record_terminal_failure(scan_uuid, f"Trivy scan failed: {exc}") + except TrivyError as exc: + # Catch-all for any other Trivy adapter subclass added later. 
+ log.error("ingest_sbom_trivy_error", error=str(exc)) + record_terminal_failure(scan_uuid, f"Trivy error: {exc}") + except SoftTimeLimitExceeded: + # Mirrors scan_source: a timed-out ingest is terminal, not retryable. + # Caught BEFORE the bare Exception handler so the message stays + # specific. + soft_limit = scan_soft_time_limit_seconds() + log.warning( + "ingest_sbom_timed_out", + scan_id=str(scan_uuid), + soft_limit_seconds=soft_limit, + ) + record_terminal_failure( + scan_uuid, f"SBOM ingest exceeded the time limit ({soft_limit}s)" + ) + except Exception as exc: + # Fail-loud over retry-forever: any unhandled exception terminates the + # scan with status='failed' and a visible error message. + log.exception("ingest_sbom_unhandled_error") + record_terminal_failure(scan_uuid, f"unexpected error: {exc}") + finally: + # Release the per-scan disk-log handle BEFORE rmtree so the FD does not + # race with the directory removal. Idempotent: no handle was ever opened + # for a scan that emitted no log lines. The rmtree removes ONLY the + # transient per-scan workspace — the uploaded SBOM (and its durable + # ScanArtifact path) live under {workspace_root()}/sbom-ingest/... and + # survive. 
+ close_log_file(scan_uuid) + shutil.rmtree(workspace, ignore_errors=True) + structlog.contextvars.unbind_contextvars("scan_id", "task_id", "task_kind") + + +# --------------------------------------------------------------------------- +# Pipeline +# --------------------------------------------------------------------------- + + +class _IngestAborted(Exception): + """Raised when the uploaded SBOM cannot be loaded — caught by the task body.""" + + +def _run_pipeline( + *, + scan_uuid: uuid.UUID, + project_id: uuid.UUID, + workspace: Path, + scan_metadata: dict[str, Any], +) -> None: + """Execute the ingest stages, each committing its own progress update.""" + # Scan-log verbosity parity with the source pipeline: a per-scan + # ``metadata.verbosity == "verbose"`` flips Trivy into its verbose mode. + verbose = str(scan_metadata.get("verbosity", "normal")) == "verbose" + + # Stage 1 — bootstrap workspace. + _set_stage(scan_uuid, "bootstrap") + workspace.mkdir(parents=True, exist_ok=True) + + # Load + minimally validate the uploaded SBOM. The synchronous service Pass + # does the authoritative CycloneDX validation; here we only guard against an + # absent file, a path that escaped the workspace root, or non-JSON content. + sbom_path, sbom_dict = _load_uploaded_sbom(scan_metadata) + + # Stage 2 — persist components + declared licenses. MUST run before Trivy: + # ``persist_trivy_findings`` matches each finding to a ``ComponentVersion`` + # by PURL, so the component graph has to exist first. ``source_dir=None`` + # because an ingested SBOM has no first-party source tree (no npm-lockfile + # enrichment, no scancode detections). 
+ _set_stage(scan_uuid, "components") + with sync_session_scope() as session: + persist_sbom_components( + session, + scan_uuid=scan_uuid, + sbom=sbom_dict, + source_dir=None, + ) + session.commit() + + # Preserve the uploaded SBOM as a ScanArtifact so the signature/bundle + # download surface resolves it (same ``kind`` the source pipeline writes for + # the cdxgen SBOM). We point the artifact at the DURABLE upload path under + # {workspace_root()}/sbom-ingest/... (NOT the per-scan workspace, which the + # `finally` rmtree deletes). _persist_artifact no-ops if the path is gone. + _persist_artifact(scan_uuid, kind=_SBOM_ARTIFACT_KIND, path=sbom_path) + + # Stage 3 — Trivy SBOM matching. ``run_trivy_sbom`` re-validates that + # ``sbom_path`` resolves inside WORKSPACE_HOST_PATH (it does — the ingest + # path is under workspace_root()), then writes its report into the transient + # per-scan workspace. Trivy errors propagate to the task body's typed except + # blocks; the component graph above is already committed, so a matching + # failure still leaves the user a populated component view (degraded, not + # empty) — same philosophy as the source pipeline. + _set_stage(scan_uuid, "trivy") + trivy_result = run_trivy_sbom( + sbom_path=sbom_path, + output_dir=workspace / "trivy", + line_callback=make_line_callback(scan_uuid, stage="trivy"), + verbose=verbose, + ) + # Persist the Trivy report alongside (transient) so admin/debug can diff what + # Trivy consumed against what we matched — mirrors the source pipeline. + _persist_artifact( + scan_uuid, kind="trivy_sbom_report", path=trivy_result.report_path + ) + with sync_session_scope() as session: + inserted = persist_trivy_findings( + session, + scan_uuid=scan_uuid, + trivy_report=trivy_result.report, + ) + session.commit() + log.info( + "ingest_sbom_trivy_done", + scan_id=str(scan_uuid), + findings_persisted=inserted, + ) + + # Stage 4 — finalize. 
``mark_succeeded`` itself sets current_step="finalize", + # progress_percent=100, completed_at, supersedes prior ref-keyed scans, and + # publishes the final frame — so a separate set_stage("finalize") would be a + # redundant frame. We rely on it directly (matching the documented contract). + mark_succeeded(scan_uuid) + + +def _load_uploaded_sbom(scan_metadata: dict[str, Any]) -> tuple[Path, dict[str, Any]]: + """Resolve, containment-check, and parse the uploaded CycloneDX SBOM. + + Returns ``(sbom_path, parsed_dict)``. Raises :class:`_IngestAborted` on a + missing path key, a path that resolves outside ``workspace_root()``, an + absent file, or invalid JSON. This is a minimal defensive backstop — the + synchronous service Pass owns the authoritative CycloneDX schema validation. + """ + raw_path = scan_metadata.get("sbom_path") + if not raw_path or not isinstance(raw_path, str): + raise _IngestAborted("scan_metadata.sbom_path is missing") + + try: + root = Path(workspace_root()).resolve() + candidate = Path(raw_path).resolve() + except OSError as exc: + raise _IngestAborted(f"sbom_path could not be resolved: {exc}") from exc + + # Containment guard (defense-in-depth): the path is operator/worker-written + # but a tampered/garbled metadata row must never let this task read an + # arbitrary file. ``run_trivy_sbom`` re-checks this too, but failing here is + # clearer (and avoids spawning a Trivy process on a bad input). 
+ if not candidate.is_relative_to(root): + raise _IngestAborted("sbom_path resolves outside the workspace root") + + if not candidate.is_file(): + raise _IngestAborted(f"SBOM file not found: {candidate}") + + try: + with candidate.open("rb") as fh: + parsed = json.loads(fh.read()) + except (OSError, ValueError) as exc: + raise _IngestAborted(f"SBOM file is not valid JSON: {exc}") from exc + + if not isinstance(parsed, dict): + raise _IngestAborted("SBOM document is not a JSON object") + + return candidate, parsed diff --git a/apps/backend/tests/fixtures/sbom_ingest/realistic-trivy-sbom.json b/apps/backend/tests/fixtures/sbom_ingest/realistic-trivy-sbom.json new file mode 100644 index 00000000..7f022109 --- /dev/null +++ b/apps/backend/tests/fixtures/sbom_ingest/realistic-trivy-sbom.json @@ -0,0 +1,71 @@ +{ + "SchemaVersion": 2, + "ArtifactName": "realistic.cdx.json", + "ArtifactType": "cyclonedx", + "Results": [ + { + "Target": "Node.js", + "Class": "lang-pkgs", + "Type": "npm", + "Vulnerabilities": [ + { + "VulnerabilityID": "CVE-2020-8203", + "PkgName": "lodash", + "InstalledVersion": "4.17.19", + "FixedVersion": "4.17.20", + "Severity": "HIGH", + "Title": "Prototype pollution in lodash", + "Description": "Prototype pollution attack via zipObjectDeep.", + "References": ["https://nvd.nist.gov/vuln/detail/CVE-2020-8203"] + }, + { + "VulnerabilityID": "CVE-2021-23337", + "PkgName": "lodash", + "InstalledVersion": "4.17.19", + "FixedVersion": "4.17.21", + "Severity": "HIGH", + "Title": "Command injection in lodash template", + "Description": "Command injection via the template function.", + "References": ["https://nvd.nist.gov/vuln/detail/CVE-2021-23337"] + }, + { + "VulnerabilityID": "CVE-2020-28500", + "PkgName": "lodash", + "InstalledVersion": "4.17.19", + "FixedVersion": "4.17.21", + "Severity": "MEDIUM", + "Title": "ReDoS in lodash toNumber/trim", + "Description": "Regular expression denial of service.", + "References": 
["https://nvd.nist.gov/vuln/detail/CVE-2020-28500"] + }, + { + "VulnerabilityID": "CVE-2021-44906", + "PkgName": "minimist", + "InstalledVersion": "1.2.5", + "FixedVersion": "1.2.6", + "Severity": "CRITICAL", + "Title": "Prototype pollution in minimist", + "Description": "Prototype pollution via crafted arguments.", + "References": ["https://nvd.nist.gov/vuln/detail/CVE-2021-44906"] + } + ] + }, + { + "Target": "Python", + "Class": "lang-pkgs", + "Type": "pip", + "Vulnerabilities": [ + { + "VulnerabilityID": "CVE-2020-28493", + "PkgName": "jinja2", + "InstalledVersion": "2.11.2", + "FixedVersion": "2.11.3", + "Severity": "MEDIUM", + "Title": "ReDoS in Jinja2 urlize filter", + "Description": "Regular expression denial of service in the urlize filter.", + "References": ["https://nvd.nist.gov/vuln/detail/CVE-2020-28493"] + } + ] + } + ] +} diff --git a/apps/backend/tests/fixtures/sbom_ingest/realistic.cdx.json b/apps/backend/tests/fixtures/sbom_ingest/realistic.cdx.json new file mode 100644 index 00000000..a1a99acb --- /dev/null +++ b/apps/backend/tests/fixtures/sbom_ingest/realistic.cdx.json @@ -0,0 +1,85 @@ +{ + "bomFormat": "CycloneDX", + "specVersion": "1.5", + "serialNumber": "urn:uuid:7f1d8e9a-2b3c-4d5e-9f01-2a3b4c5d6e7f", + "version": 1, + "metadata": { + "timestamp": "2026-06-10T12:00:00Z", + "tools": [ + { + "vendor": "cyclonedx", + "name": "cdxgen", + "version": "11.0.0" + } + ], + "component": { + "type": "application", + "bom-ref": "pkg:npm/ingest-fixture-app@0.0.0", + "name": "ingest-fixture-app", + "version": "0.0.0", + "purl": "pkg:npm/ingest-fixture-app@0.0.0" + } + }, + "components": [ + { + "type": "library", + "bom-ref": "pkg:npm/lodash@4.17.19", + "name": "lodash", + "version": "4.17.19", + "purl": "pkg:npm/lodash@4.17.19", + "licenses": [{ "license": { "id": "MIT" } }], + "externalReferences": [ + { "type": "website", "url": "https://lodash.com/" } + ] + }, + { + "type": "library", + "bom-ref": "pkg:npm/minimist@1.2.5", + "name": "minimist", + 
"version": "1.2.5", + "purl": "pkg:npm/minimist@1.2.5", + "licenses": [{ "license": { "id": "MIT" } }] + }, + { + "type": "library", + "bom-ref": "pkg:npm/conditional-lib@2.0.0", + "name": "conditional-lib", + "version": "2.0.0", + "purl": "pkg:npm/conditional-lib@2.0.0", + "licenses": [{ "license": { "id": "MPL-2.0" } }], + "components": [ + { + "type": "library", + "bom-ref": "pkg:npm/nested-transitive@0.1.0", + "name": "nested-transitive", + "version": "0.1.0", + "purl": "pkg:npm/nested-transitive@0.1.0", + "licenses": [{ "license": { "id": "ISC" } }] + } + ] + }, + { + "type": "library", + "bom-ref": "pkg:pypi/jinja2@2.11.2", + "name": "jinja2", + "version": "2.11.2", + "purl": "pkg:pypi/jinja2@2.11.2", + "licenses": [{ "license": { "name": "BSD-3-Clause" } }] + } + ], + "dependencies": [ + { + "ref": "pkg:npm/ingest-fixture-app@0.0.0", + "dependsOn": [ + "pkg:npm/lodash@4.17.19", + "pkg:npm/minimist@1.2.5", + "pkg:npm/conditional-lib@2.0.0", + "pkg:pypi/jinja2@2.11.2" + ] + }, + { + "ref": "pkg:npm/conditional-lib@2.0.0", + "dependsOn": ["pkg:npm/nested-transitive@0.1.0"] + } + ] +} diff --git a/apps/backend/tests/integration/scan/test_ingest_sbom_pipeline.py b/apps/backend/tests/integration/scan/test_ingest_sbom_pipeline.py new file mode 100644 index 00000000..28f57b9d --- /dev/null +++ b/apps/backend/tests/integration/scan/test_ingest_sbom_pipeline.py @@ -0,0 +1,357 @@ +""" +End-to-end SBOM-ingest Celery task pipeline — realistic-density fixture + stub Trivy. + +We drive ``tasks.ingest_sbom.ingest_sbom_task`` directly (NOT through Celery's +broker) against a queued ``kind="sbom"`` scan whose ``scan_metadata["sbom_path"]`` +points at a REAL-density CycloneDX document on disk under WORKSPACE_HOST_PATH. +``run_trivy_sbom`` is monkeypatched to return a hand-recorded Trivy ``sbom`` +report whose Results mirror the fixture's packages with MULTIPLE CVEs per +package (CLAUDE.md §2 rule 3: realistic density — not a synthetic 1-CVE blob). 
+ +What we pin (the back-half of the source pipeline, reused for ingest): + - components stage: ``persist_sbom_components`` populates Component / + ComponentVersion rows + ScanComponent edges + declared LicenseFinding rows + from the uploaded SBOM (multiple ecosystems, nested + dependencies). + - trivy stage: ``persist_trivy_findings`` matches the dense report to the + persisted ComponentVersions by PURL and writes VulnerabilityFinding rows — + multiple findings against a single component version (lodash → 3 CVEs). + - finalize: ``mark_succeeded`` flips status='succeeded', progress=100, + current_step='finalize', completed_at set; the durable SBOM ScanArtifact + (kind 'sbom_cyclonedx') is preserved. + - ref-keyed supersede: an older succeeded scan on the same ref is superseded. + +These mirror the source-pipeline integration tests +(``test_scan_source_pipeline_mock.py``) but exercise the ingest task's +condensed stage set (no fetch / cdxgen / scancode / sign). +""" + +from __future__ import annotations + +import json +import os +import subprocess +import uuid +from collections.abc import Iterator +from pathlib import Path + +import pytest +from sqlalchemy import create_engine, func, select +from sqlalchemy.orm import Session, sessionmaker + +from integrations.trivy import TrivyResult +from models import ( + Component, + ComponentVersion, + LicenseFinding, + Scan, + ScanArtifact, + ScanComponent, + VulnerabilityFinding, +) +from tests._helpers import ( + make_membership, + make_organization, + make_project, + make_team, + make_user, +) + +BACKEND_ROOT = Path(__file__).resolve().parent.parent.parent.parent +FIXTURES = BACKEND_ROOT / "tests" / "fixtures" / "sbom_ingest" + +pytestmark = pytest.mark.integration + + +def _require_database_url() -> str: + url = os.getenv("DATABASE_URL") + if not url: + pytest.skip("DATABASE_URL not set — skip ingest_sbom pipeline integration") + return url + + +@pytest.fixture(scope="module", autouse=True) +def _migrate_once() -> None: + 
_require_database_url() + result = subprocess.run( + ["alembic", "upgrade", "head"], + cwd=BACKEND_ROOT, + capture_output=True, + text=True, + timeout=120, + ) + if result.returncode != 0: + pytest.skip( + f"alembic upgrade head failed; ingest pipeline integration cannot run\n" + f"stdout:\n{result.stdout}\nstderr:\n{result.stderr}" + ) + + +@pytest.fixture +def sync_session() -> Iterator[Session]: + from core.config import database_url_sync + + engine = create_engine(database_url_sync(), pool_pre_ping=True, future=True) + factory = sessionmaker(bind=engine, expire_on_commit=False, future=True) + session = factory() + try: + yield session + finally: + session.close() + engine.dispose() + + +def _stub_trivy_from_fixture(monkeypatch: pytest.MonkeyPatch) -> None: + """Replace ``run_trivy_sbom`` with a stub that emits the recorded dense + Trivy ``sbom`` report fixture (multiple CVEs per package).""" + report = json.loads((FIXTURES / "realistic-trivy-sbom.json").read_text()) + + def _fake_run( + sbom_path: Path, + output_dir: Path, + *, + timeout_seconds: int = 0, # noqa: ARG001 + backend: str | None = None, # noqa: ARG001 + **_kwargs: object, # noqa: ARG001 + ) -> TrivyResult: + output_dir.mkdir(parents=True, exist_ok=True) + report_path = output_dir / "trivy-sbom.json" + report["ArtifactName"] = str(sbom_path) + report_path.write_text(json.dumps(report), encoding="utf-8") + return TrivyResult(report_path=report_path, report=report) + + monkeypatch.setattr("tasks.ingest_sbom.run_trivy_sbom", _fake_run) + + +def _seed_queued_sbom_scan( + workspace: Path, *, ref: str | None = None +) -> tuple[uuid.UUID, uuid.UUID]: + """Seed project + queued sbom scan and write the realistic SBOM to its + durable on-disk ingest path. 
Returns (scan_id, project_id).""" + import asyncio + + from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine + + from core.config import database_url + + sbom_bytes = (FIXTURES / "realistic.cdx.json").read_bytes() + + async def _build() -> tuple[uuid.UUID, uuid.UUID]: + engine = create_async_engine(database_url(), pool_pre_ping=True, future=True) + factory = async_sessionmaker(engine, expire_on_commit=False, class_=AsyncSession) + async with factory() as s: + org = await make_organization(s) + team = await make_team(s, organization=org) + user = await make_user(s) + await make_membership(s, user=user, team=team, role="developer") + project = await make_project(s, team=team, git_url=None) + scan = Scan( + project_id=project.id, + kind="sbom", + status="queued", + progress_percent=0, + requested_by_user_id=user.id, + ref=ref, + scan_metadata={"source_type": "sbom"}, + ) + s.add(scan) + await s.commit() + await s.refresh(scan) + scan_id = scan.id + project_id = project.id + # Write the durable SBOM at the path the service stamps, then store + # that path in scan_metadata (the task reads it from there). 
+ ingest_dir = workspace / "sbom-ingest" / str(project_id) + ingest_dir.mkdir(parents=True, exist_ok=True) + dest = ingest_dir / f"{scan_id}.cdx.json" + dest.write_bytes(sbom_bytes) + scan.scan_metadata = {"source_type": "sbom", "sbom_path": str(dest)} + await s.commit() + await engine.dispose() + return scan_id, project_id + + return asyncio.run(_build()) + + +def _findings_for_scan(session: Session, scan_id: uuid.UUID) -> list[VulnerabilityFinding]: + return list( + session.execute( + select(VulnerabilityFinding).where(VulnerabilityFinding.scan_id == scan_id) + ).scalars() + ) + + +# --------------------------------------------------------------------------- +# Happy path — full ingest pipeline with realistic density +# --------------------------------------------------------------------------- + + +def test_ingest_pipeline_persists_components_and_dense_findings( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path, sync_session: Session +) -> None: + monkeypatch.setenv("WORKSPACE_HOST_PATH", str(tmp_path)) + _stub_trivy_from_fixture(monkeypatch) + + scan_id, _project_id = _seed_queued_sbom_scan(tmp_path) + + from tasks.ingest_sbom import ingest_sbom_task + + result = ingest_sbom_task.apply(args=[str(scan_id)]) + assert result.successful(), f"task failed: {result.traceback}" + + sync_session.expire_all() + scan = sync_session.execute(select(Scan).where(Scan.id == scan_id)).scalar_one() + assert scan.status == "succeeded" + assert scan.progress_percent == 100 + assert scan.current_step == "finalize" + assert scan.completed_at is not None + assert scan.error_message is None + + # Components: the fixture declares 4 top-level + 1 nested = 5 component + # purls (lodash, minimist, conditional-lib, nested-transitive, jinja2). The + # ingest persister records ScanComponent edges for the uploaded graph. 
+ component_rows = list( + sync_session.execute( + select(ScanComponent).where(ScanComponent.scan_id == scan_id) + ).scalars() + ) + assert len(component_rows) >= 4, ( + f"expected the multi-ecosystem fixture's components persisted; " + f"got {len(component_rows)}" + ) + + # Declared licenses: at least the MPL-2.0 conditional + MIT permissive + + # BSD jinja2 declared findings exist (declared kind). + declared = list( + sync_session.execute( + select(LicenseFinding).where( + LicenseFinding.scan_id == scan_id, LicenseFinding.kind == "declared" + ) + ).scalars() + ) + assert declared, "declared license findings must be persisted from the SBOM" + + # Vulnerabilities: the dense report carries 3 lodash CVEs + 1 minimist + 1 + # jinja2 = 5 findings, ALL matched by PURL to persisted ComponentVersions. + findings = _findings_for_scan(sync_session, scan_id) + assert len(findings) == 5, ( + f"realistic density: 5 findings (lodash×3, minimist×1, jinja2×1); " + f"got {len(findings)}" + ) + + # Multiple CVEs against ONE component version — the density rule 3 case that + # a synthetic 1-CVE fixture would miss. + lodash = sync_session.execute( + select(ComponentVersion) + .join(Component, Component.id == ComponentVersion.component_id) + .where(Component.purl == "pkg:npm/lodash") + ).scalar_one_or_none() + assert lodash is not None, "lodash component version must be persisted" + lodash_findings = sync_session.execute( + select(func.count()) + .select_from(VulnerabilityFinding) + .where( + VulnerabilityFinding.scan_id == scan_id, + VulnerabilityFinding.component_version_id == lodash.id, + ) + ).scalar_one() + assert lodash_findings == 3, f"lodash must carry 3 CVEs; got {lodash_findings}" + + # The durable SBOM artifact is preserved (download surface). 
+ kinds = { + a.kind + for a in sync_session.execute( + select(ScanArtifact).where(ScanArtifact.scan_id == scan_id) + ).scalars() + } + assert "sbom_cyclonedx" in kinds + + +# --------------------------------------------------------------------------- +# Idempotency — re-running a succeeded ingest is a no-op +# --------------------------------------------------------------------------- + + +def test_ingest_pipeline_succeeded_rerun_is_noop( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path, sync_session: Session +) -> None: + monkeypatch.setenv("WORKSPACE_HOST_PATH", str(tmp_path)) + _stub_trivy_from_fixture(monkeypatch) + + scan_id, _ = _seed_queued_sbom_scan(tmp_path) + + from tasks.ingest_sbom import ingest_sbom_task + + ingest_sbom_task.apply(args=[str(scan_id)]) + sync_session.expire_all() + scan = sync_session.execute(select(Scan).where(Scan.id == scan_id)).scalar_one() + assert scan.status == "succeeded" + completed_first = scan.completed_at + + ingest_sbom_task.apply(args=[str(scan_id)]) + sync_session.expire_all() + again = sync_session.execute(select(Scan).where(Scan.id == scan_id)).scalar_one() + assert again.completed_at == completed_first + assert again.status == "succeeded" + + +# --------------------------------------------------------------------------- +# Lifecycle — ref-keyed supersede (create→succeed→supersede prior) +# --------------------------------------------------------------------------- + + +def test_ingest_supersedes_prior_succeeded_scan_on_same_ref( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path, sync_session: Session +) -> None: + """When the ingest succeeds on a ref that already has a succeeded scan, the + older scan is superseded (scan-retention ref-keyed latest contract).""" + monkeypatch.setenv("WORKSPACE_HOST_PATH", str(tmp_path)) + _stub_trivy_from_fixture(monkeypatch) + + ref = "refs/heads/main" + scan_id, project_id = _seed_queued_sbom_scan(tmp_path, ref=ref) + + # Seed a PRIOR succeeded scan on the same ref (must end up
superseded). It + # cannot be active (the partial unique index forbids two in-flight), so it + # is already succeeded. + import asyncio + + from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine + + from core.config import database_url + + async def _seed_prior() -> uuid.UUID: + engine = create_async_engine(database_url(), pool_pre_ping=True, future=True) + factory = async_sessionmaker(engine, expire_on_commit=False, class_=AsyncSession) + async with factory() as s: + from datetime import UTC, datetime + + from models import Project + + project = ( + await s.execute(select(Project).where(Project.id == project_id)) + ).scalar_one() + from tests._helpers import make_scan + + prior = await make_scan( + s, project=project, kind="sbom", status="succeeded", ref=ref + ) + prior.completed_at = datetime.now(tz=UTC) + await s.commit() + await s.refresh(prior) + pid = prior.id + await engine.dispose() + return pid + + prior_id = asyncio.run(_seed_prior()) + + from tasks.ingest_sbom import ingest_sbom_task + + result = ingest_sbom_task.apply(args=[str(scan_id)]) + assert result.successful(), f"task failed: {result.traceback}" + + sync_session.expire_all() + new_scan = sync_session.execute(select(Scan).where(Scan.id == scan_id)).scalar_one() + prior = sync_session.execute(select(Scan).where(Scan.id == prior_id)).scalar_one() + assert new_scan.status == "succeeded" + assert new_scan.superseded_at is None, "the newest succeeded scan is live" + assert prior.superseded_at is not None, "the prior same-ref scan was superseded" diff --git a/apps/backend/tests/integration/test_existence_hide_state_matrix.py b/apps/backend/tests/integration/test_existence_hide_state_matrix.py index 45a57aa2..52e4b376 100644 --- a/apps/backend/tests/integration/test_existence_hide_state_matrix.py +++ b/apps/backend/tests/integration/test_existence_hide_state_matrix.py @@ -14,10 +14,20 @@ - scan delete × active scan → ScanNotFound (not ScanDeleteConflict) - scan trigger × 
scan-in-progress → ScanForbidden (not ScanInProgressConflict) + - sbom-ingest × scan-in-progress → ScanForbidden (not ScanInProgressConflict) + - sbom-ingest × archived project → ScanForbidden (not ScanArchivedConflict) - vuln status × stale if_match → VulnerabilityNotFound (not VulnerabilityConflict) - approval × terminal state → ApprovalNotFound (not ApprovalTerminalState / ApprovalInvalidTransition) +The two sbom-ingest rows pin the NEW 409 surfaces this feature introduces +(``POST /v1/projects/{id}/sbom-ingest``): the endpoint reuses +``prepare_scan_target`` + the partial-unique-index flush, so an active scan +(409 ScanInProgressConflict) and an archived project (409 ScanArchivedConflict) +are both state-derived 409s that MUST sit behind the ScanForbidden (403) +permission gate for a non-member — exactly the intersection the campaign found +unguarded for cancel. + scan cancel × terminal is covered where it was fixed: ``tests/unit/services/test_user_cancel_scan_service.py:: test_other_team_terminal_scan_is_404_not_409`` (#370). @@ -153,6 +163,83 @@ async def test_trigger_on_other_team_busy_project_is_permission_denial_not_409( ) +# --------------------------------------------------------------------------- +# sbom-ingest × scan already in progress / archived (NEW 409 surfaces) +# +# POST /v1/projects/{id}/sbom-ingest reuses prepare_scan_target + the +# partial-unique-index flush, so it introduces TWO new state-derived 409 +# surfaces (active-scan conflict, archived project). Both must sit behind the +# permission gate: a non-member uploading to a busy/archived project must hit +# ScanForbidden (403, this domain's contract — NOT existence-hiding 404, mirror +# of scan-trigger) BEFORE any 409 that would confirm the project + its state. +# +# The guard order is verified at the SERVICE layer: prepare_scan_target raises +# ScanForbidden before ingest_sbom ever reads the upload body, so we can pass a +# throwaway (never-read) UploadFile. 
+# --------------------------------------------------------------------------- + + +def _throwaway_upload() -> object: + """A minimal CycloneDX UploadFile that the permission gate rejects BEFORE + the body is ever read (prepare_scan_target runs first).""" + import io + + from starlette.datastructures import Headers, UploadFile + + return UploadFile( + file=io.BytesIO(b'{"bomFormat":"CycloneDX","specVersion":"1.5"}'), + filename="bom.cdx.json", + headers=Headers({"content-type": "application/json"}), + ) + + +@pytest.mark.parametrize("active_status", ["queued", "running"]) +async def test_sbom_ingest_other_team_busy_project_is_permission_denial_not_409( + db_session: AsyncSession, active_status: str +) -> None: + """An active scan would 409 (ScanInProgressConflict) for a member ingesting + a duplicate — an outsider must hit ScanForbidden (403) before any + in-progress probe.""" + from services.sbom_ingest_service import ingest_sbom + from services.scan_service import ScanForbidden + + actor, owning_team = await _outsider_and_resource_team(db_session) + project = await make_project(db_session, team=owning_team) + await make_scan(db_session, project=project, status=active_status) + + with pytest.raises(ScanForbidden): + await ingest_sbom( + db_session, + project_id=project.id, + upload=_throwaway_upload(), # type: ignore[arg-type] + actor=actor, + ) + + +async def test_sbom_ingest_other_team_archived_project_is_permission_denial_not_409( + db_session: AsyncSession, +) -> None: + """An archived project would 409 (ScanArchivedConflict) for a member — an + outsider must hit ScanForbidden (403) before the archived-state check.""" + from datetime import UTC, datetime + + from services.sbom_ingest_service import ingest_sbom + from services.scan_service import ScanForbidden + + actor, owning_team = await _outsider_and_resource_team(db_session) + project = await make_project(db_session, team=owning_team) + project.archived_at = datetime.now(tz=UTC) + await db_session.commit() 
+ + with pytest.raises(ScanForbidden): + await ingest_sbom( + db_session, + project_id=project.id, + upload=_throwaway_upload(), # type: ignore[arg-type] + actor=actor, + ) + + # --------------------------------------------------------------------------- # vulnerability status × stale if_match # --------------------------------------------------------------------------- diff --git a/apps/backend/tests/integration/test_sbom_ingest_api.py b/apps/backend/tests/integration/test_sbom_ingest_api.py new file mode 100644 index 00000000..4024d0d6 --- /dev/null +++ b/apps/backend/tests/integration/test_sbom_ingest_api.py @@ -0,0 +1,662 @@ +""" +Integration tests for POST /v1/projects/{project_id}/sbom-ingest — feat/sbom-ingest-endpoint. + +Pins the HTTP contract over the real ASGI app + Postgres. This is the +synchronous front-half of the SBOM-ingest feature: it validates an uploaded +CycloneDX JSON document, persists a ``kind="sbom"`` queued scan row, writes the +SBOM to a durable on-disk path, and enqueues the Celery task (which we +short-circuit here via a monkeypatched ``enqueue_scan``). + +Guard-order contract (CLAUDE.md §2 rule 1 — authz/existence ALWAYS before +state): a cross-team caller hits the permission gate (403, matching the +scan-trigger endpoint's existing contract — see note below) BEFORE any +state-derived 409, even when the target project has an active scan. The +existence/state cross-product for the new 409 surfaces lives in +``test_existence_hide_state_matrix.py``. + +NOTE on 403-vs-404: the sbom-ingest path reuses ``prepare_scan_target`` and +maps ``ScanForbidden`` → 403 (NOT existence-hiding 404), identical to +``POST /v1/projects/{id}/scans`` (``test_scans_api.py:: +test_trigger_scan_other_team_returns_403``). The feature spec's "non-member → +404" wording does not match this domain's actual contract; we assert 403 here +and the matrix file asserts the ``ScanForbidden`` permission-beats-state +ordering at the service layer. 
+ +The autouse ``_stub_enqueue_scan`` conftest fixture patches +``services.scan_service.enqueue_scan`` — but ``sbom_ingest_service`` imports +``enqueue_scan`` from ``tasks`` directly, so we patch +``services.sbom_ingest_service.enqueue_scan`` ourselves in an autouse fixture +below to keep these tests off the real broker. +""" + +from __future__ import annotations + +import json +import os +import subprocess +import uuid +from collections.abc import AsyncIterator +from pathlib import Path + +import pytest +from httpx import ASGITransport, AsyncClient + +from core.security import create_access_token +from models import User +from tests._helpers import ( + make_membership, + make_organization, + make_project, + make_scan, + make_team, + make_user, +) + +BACKEND_ROOT = Path(__file__).resolve().parent.parent.parent +PROBLEM_JSON = "application/problem+json" +FIXTURES = BACKEND_ROOT / "tests" / "fixtures" / "sbom_ingest" + +pytestmark = pytest.mark.integration + +# A static, valid CycloneDX document the happy-path tests upload. Realistic +# density (multiple components, multiple ecosystems, nested + dependencies) +# lives in tests/fixtures/sbom_ingest/realistic.cdx.json; this inline minimal +# doc keeps the HTTP-contract tests fast where density is irrelevant. 
+_VALID_SBOM = json.dumps( + { + "bomFormat": "CycloneDX", + "specVersion": "1.5", + "components": [ + { + "type": "library", + "name": "lodash", + "version": "4.17.19", + "purl": "pkg:npm/lodash@4.17.19", + } + ], + } +).encode("utf-8") + +_STUB_TASK_ID = "11111111-2222-3333-4444-555555555555" + + +def _require_database_url() -> str: + url = os.getenv("DATABASE_URL") + if not url: + pytest.skip("DATABASE_URL not set — skip sbom-ingest API tests") + return url + + +@pytest.fixture(scope="module", autouse=True) +def _migrate_once() -> None: + _require_database_url() + result = subprocess.run( + ["alembic", "upgrade", "head"], + cwd=BACKEND_ROOT, + capture_output=True, + text=True, + timeout=120, + ) + if result.returncode != 0: + pytest.skip( + f"alembic upgrade head failed; sbom-ingest API tests cannot run\n" + f"stdout:\n{result.stdout}\nstderr:\n{result.stderr}" + ) + + +@pytest.fixture(autouse=True) +def _workspace(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + """Point WORKSPACE_HOST_PATH at a per-test tmp dir so the durable SBOM file + write lands somewhere isolated and inspectable.""" + monkeypatch.setenv("WORKSPACE_HOST_PATH", str(tmp_path)) + return tmp_path + + +@pytest.fixture(autouse=True) +def _stub_ingest_enqueue(monkeypatch: pytest.MonkeyPatch) -> None: + """Short-circuit the Celery dispatch on the SBOM-ingest path. + + ``sbom_ingest_service`` does ``from tasks import enqueue_scan``, so the + conftest stub (which patches ``services.scan_service.enqueue_scan``) does + NOT cover it. Patch the name bound in the ingest service module. 
+ """ + import services.sbom_ingest_service as svc + + monkeypatch.setattr(svc, "enqueue_scan", lambda scan: _STUB_TASK_ID) + + +@pytest.fixture +def app(): + from main import app as fastapi_app + + return fastapi_app + + +@pytest.fixture +async def client(app) -> AsyncIterator[AsyncClient]: + transport = ASGITransport(app=app, raise_app_exceptions=False) + async with AsyncClient(transport=transport, base_url="http://testserver") as ac: + yield ac + + +def _bearer_for(user: User) -> dict[str, str]: + role = "super_admin" if user.is_superuser else None + token = create_access_token(subject=str(user.id), role=role) + return {"Authorization": f"Bearer {token}"} + + +async def _factory(client: AsyncClient): + app = client._transport.app # type: ignore[attr-defined] + factory = getattr(app.state, "session_factory", None) + if factory is None: + from core.db import _ensure_state + + factory = _ensure_state(app) + return factory + + +async def _seed(client: AsyncClient, *, role: str = "developer", is_superuser: bool = False): + factory = await _factory(client) + async with factory() as session: + org = await make_organization(session) + team = await make_team(session, organization=org) + user = await make_user(session, is_superuser=is_superuser) + if not is_superuser: + await make_membership(session, user=user, team=team, role=role) + project = await make_project(session, team=team) + return team, user, project + + +async def _seed_extra_project(client: AsyncClient, *, team_id: uuid.UUID): + factory = await _factory(client) + async with factory() as session: + from sqlalchemy import select + + from models import Team + + team = ( + await session.execute(select(Team).where(Team.id == team_id)) + ).scalar_one() + return await make_project(session, team=team) + + +async def _seed_active_scan(client: AsyncClient, *, project_id: uuid.UUID, status: str): + factory = await _factory(client) + async with factory() as session: + from sqlalchemy import select + + from models import 
Project + + project = ( + await session.execute(select(Project).where(Project.id == project_id)) + ).scalar_one() + scan = await make_scan(session, project=project, status=status) + return scan.id + + +def _sbom_part( + body: bytes = _VALID_SBOM, + *, + name: str = "bom.cdx.json", + ctype: str = "application/json", +): + return {"sbom": (name, body, ctype)} + + +async def _issue_project_api_key( + client: AsyncClient, *, user: User, project_id: uuid.UUID +) -> str: + resp = await client.post( + "/v1/api-keys", + json={"name": "ci-ingest", "scope": "project", "project_id": str(project_id)}, + headers=_bearer_for(user), + ) + assert resp.status_code == 201, resp.text + return str(resp.json()["raw_key"]) + + +def _ingest_dir(workspace: Path, project_id: uuid.UUID) -> Path: + return workspace / "sbom-ingest" / str(project_id) + + +# --------------------------------------------------------------------------- +# Happy path — 202 + queued sbom scan row, durable file, metadata stamped +# --------------------------------------------------------------------------- + + +async def test_developer_ingest_returns_202_queued_sbom_scan( + client, _workspace: Path +) -> None: + team, user, project = await _seed(client, role="developer") + + resp = await client.post( + f"/v1/projects/{project.id}/sbom-ingest", + headers=_bearer_for(user), + files=_sbom_part(name="my-bom.cdx.json"), + data={"ref": "refs/heads/main", "release": "v1.2.3"}, + ) + assert resp.status_code == 202, resp.text + body = resp.json() + assert body["project_id"] == str(project.id) + assert body["kind"] == "sbom" + assert body["status"] == "queued" + assert body["progress_percent"] == 0 + # enqueue stub returned the deterministic task id. + assert body["celery_task_id"] == _STUB_TASK_ID + # ScanPublic surfaces `metadata` (alias), not `scan_metadata`. 
+ assert "scan_metadata" not in body + meta = body["metadata"] + assert meta["source_type"] == "sbom" + assert meta["release"] == "v1.2.3" + assert meta["original_filename"] == "my-bom.cdx.json" + scan_id = body["id"] + assert meta["sbom_path"] == str( + _ingest_dir(_workspace, project.id) / f"{scan_id}.cdx.json" + ) + + # The durable SBOM file was written at the stamped path with the uploaded bytes. + durable = _ingest_dir(_workspace, project.id) / f"{scan_id}.cdx.json" + assert durable.is_file() + assert durable.read_bytes() == _VALID_SBOM + + +async def test_ingest_without_ref_or_release_stamps_nulls( + client, _workspace: Path +) -> None: + _team, user, project = await _seed(client, role="developer") + resp = await client.post( + f"/v1/projects/{project.id}/sbom-ingest", + headers=_bearer_for(user), + files=_sbom_part(), + ) + assert resp.status_code == 202, resp.text + meta = resp.json()["metadata"] + assert meta["release"] is None + assert meta["source_type"] == "sbom" + + +async def test_ingest_accepts_realistic_density_fixture( + client, _workspace: Path +) -> None: + """Upload the real-density CycloneDX fixture (multiple ecosystems, nested + components, dependencies). 
The synchronous validator accepts it (the deep + parse is the worker's job) and returns 202.""" + _team, user, project = await _seed(client, role="developer") + body = (FIXTURES / "realistic.cdx.json").read_bytes() + resp = await client.post( + f"/v1/projects/{project.id}/sbom-ingest", + headers=_bearer_for(user), + files=_sbom_part(body, name="realistic.cdx.json"), + ) + assert resp.status_code == 202, resp.text + assert resp.json()["status"] == "queued" + + +# --------------------------------------------------------------------------- +# API-key auth (CI pushes SBOMs with a tos_ key) +# --------------------------------------------------------------------------- + + +async def test_ingest_accepts_project_scoped_api_key(client) -> None: + _team, user, project = await _seed(client, role="developer") + raw_key = await _issue_project_api_key(client, user=user, project_id=project.id) + resp = await client.post( + f"/v1/projects/{project.id}/sbom-ingest", + headers={"Authorization": f"Bearer {raw_key}"}, + files=_sbom_part(), + ) + assert resp.status_code == 202, resp.text + + +async def test_ingest_rejects_anonymous(client) -> None: + _team, _user, project = await _seed(client, role="developer") + resp = await client.post( + f"/v1/projects/{project.id}/sbom-ingest", files=_sbom_part() + ) + assert resp.status_code == 401 + + +async def test_api_key_cannot_ingest_other_teams_project(client) -> None: + _teamA, userA, projectA = await _seed(client, role="developer") + _teamB, _userB, projectB = await _seed(client, role="developer") + raw_key = await _issue_project_api_key(client, user=userA, project_id=projectA.id) + resp = await client.post( + f"/v1/projects/{projectB.id}/sbom-ingest", + headers={"Authorization": f"Bearer {raw_key}"}, + files=_sbom_part(), + ) + # Project-scope boundary → 403 (ScanForbidden), like the scan-trigger key test. 
+ assert resp.status_code == 403, resp.text + assert resp.headers["content-type"].startswith(PROBLEM_JSON) + + +# --------------------------------------------------------------------------- +# Authz / existence / state — guard ordering (CLAUDE.md §2 rule 1) +# --------------------------------------------------------------------------- + + +async def test_ingest_other_team_returns_403(client) -> None: + _team, _owner, target_project = await _seed(client, role="developer") + _team2, outsider, _p2 = await _seed(client, role="developer") + resp = await client.post( + f"/v1/projects/{target_project.id}/sbom-ingest", + headers=_bearer_for(outsider), + files=_sbom_part(), + ) + # Domain contract mirrors scan-trigger: cross-team is 403, not existence-hide. + assert resp.status_code == 403, resp.text + assert resp.headers["content-type"].startswith(PROBLEM_JSON) + + +async def test_ingest_other_team_with_active_scan_is_403_not_409(client) -> None: + """Permission BEATS state: a non-member uploading to a project that already + has an active scan must get the 403 permission denial, NEVER the 409 + scan-already-in-progress (which would confirm the project + its busy state). 
+ """ + _team, _owner, target_project = await _seed(client, role="developer") + await _seed_active_scan(client, project_id=target_project.id, status="running") + _team2, outsider, _p2 = await _seed(client, role="developer") + + resp = await client.post( + f"/v1/projects/{target_project.id}/sbom-ingest", + headers=_bearer_for(outsider), + files=_sbom_part(), + ) + assert resp.status_code == 403, resp.text + body = resp.json() + assert "scan_already_in_progress" not in body + + +async def test_ingest_unknown_project_returns_404(client) -> None: + _team, admin, _project = await _seed( + client, role="developer", is_superuser=True + ) + resp = await client.post( + f"/v1/projects/{uuid.uuid4()}/sbom-ingest", + headers=_bearer_for(admin), + files=_sbom_part(), + ) + assert resp.status_code == 404, resp.text + assert resp.headers["content-type"].startswith(PROBLEM_JSON) + + +async def test_ingest_on_archived_project_returns_409(client) -> None: + from datetime import UTC, datetime + + from sqlalchemy import update as sa_update + + from models import Project as ProjectModel + + _team, user, project = await _seed(client, role="developer") + factory = await _factory(client) + async with factory() as session: + await session.execute( + sa_update(ProjectModel) + .where(ProjectModel.id == project.id) + .values(archived_at=datetime.now(tz=UTC)) + ) + await session.commit() + + resp = await client.post( + f"/v1/projects/{project.id}/sbom-ingest", + headers=_bearer_for(user), + files=_sbom_part(), + ) + assert resp.status_code == 409, resp.text + assert resp.headers["content-type"].startswith(PROBLEM_JSON) + assert resp.json()["title"] == "Project Archived" + + +async def test_ingest_with_active_scan_returns_409_in_progress( + client, _workspace: Path +) -> None: + """A member uploading while a scan is already queued/running for the project + gets the 409 scan_already_in_progress contract — AND no loser SBOM file is + written for the rejected upload (atomicity).""" + _team, user, 
project = await _seed(client, role="developer") + await _seed_active_scan(client, project_id=project.id, status="running") + + resp = await client.post( + f"/v1/projects/{project.id}/sbom-ingest", + headers=_bearer_for(user), + files=_sbom_part(), + ) + assert resp.status_code == 409, resp.text + assert resp.headers["content-type"].startswith(PROBLEM_JSON) + body = resp.json() + assert body["title"] == "Scan Already In Progress" + assert body.get("scan_already_in_progress") is True + + # Atomicity: the 409 loser never reached the file-write step (the scan_id is + # only minted after winning the active-scan race), so the project's + # sbom-ingest dir holds no file for this rejected attempt. The dir may not + # even exist; if it does, it must be empty. + ingest_dir = _ingest_dir(_workspace, project.id) + if ingest_dir.exists(): + assert list(ingest_dir.iterdir()) == [] + + +async def test_ingest_concurrency_cap_returns_429(client, monkeypatch) -> None: + """B1 per-team concurrency cap applies to SBOM ingest too: 429 + Retry-After + + machine-checkable `limit` extension.""" + monkeypatch.setenv("SCAN_CONCURRENCY_CAP_PER_TEAM", "1") + monkeypatch.setenv("SCAN_TRIGGER_RATE_LIMIT", "100/minute") + + team, user, project1 = await _seed(client, role="developer") + project2 = await _seed_extra_project(client, team_id=team.id) + headers = _bearer_for(user) + + r1 = await client.post( + f"/v1/projects/{project1.id}/sbom-ingest", headers=headers, files=_sbom_part() + ) + assert r1.status_code == 202, r1.text + + r2 = await client.post( + f"/v1/projects/{project2.id}/sbom-ingest", headers=headers, files=_sbom_part() + ) + assert r2.status_code == 429, r2.text + assert r2.headers["content-type"].startswith(PROBLEM_JSON) + assert "Retry-After" in r2.headers + body = r2.json() + assert body["type"] == "urn:trustedoss:problem:concurrent_scan_limit" + assert body["limit"] == 1 + assert "running_scans" not in body + + +# 
--------------------------------------------------------------------------- +# Rate limit — SHARED scan_trigger bucket (project spray cannot bypass) +# --------------------------------------------------------------------------- + + +async def test_ingest_shares_scan_trigger_rate_limit_bucket( + client, monkeypatch +) -> None: + """The ingest endpoint draws from the SAME per-user `scan_trigger` bucket as + POST /scans. Spreading uploads across DISTINCT projects must NOT bypass the + cap (the limiter keys by user, not by {project_id}).""" + monkeypatch.setenv("SCAN_TRIGGER_RATE_LIMIT", "2/minute") + monkeypatch.setenv("SCAN_CONCURRENCY_CAP_PER_TEAM", "0") # isolate the limiter + + team, user, project1 = await _seed(client, role="developer") + project2 = await _seed_extra_project(client, team_id=team.id) + project3 = await _seed_extra_project(client, team_id=team.id) + headers = _bearer_for(user) + + r1 = await client.post( + f"/v1/projects/{project1.id}/sbom-ingest", headers=headers, files=_sbom_part() + ) + r2 = await client.post( + f"/v1/projects/{project2.id}/sbom-ingest", headers=headers, files=_sbom_part() + ) + assert r1.status_code == 202, r1.text + assert r2.status_code == 202, r2.text + + # Third upload (distinct project) within the same minute exceeds 2/minute. + r3 = await client.post( + f"/v1/projects/{project3.id}/sbom-ingest", headers=headers, files=_sbom_part() + ) + assert r3.status_code == 429, r3.text + assert "Retry-After" in r3.headers + + +async def test_ingest_and_scan_trigger_share_one_bucket(client, monkeypatch) -> None: + """Cross-surface: one POST /scans + one POST /sbom-ingest by the same user + exhausts a 2/minute shared budget; a third creation on either surface is + 429. 
Proves the two scan-creating endpoints draw from ONE bucket, not two.""" + monkeypatch.setenv("SCAN_TRIGGER_RATE_LIMIT", "2/minute") + monkeypatch.setenv("SCAN_CONCURRENCY_CAP_PER_TEAM", "0") + + team, user, project1 = await _seed(client, role="developer") + project2 = await _seed_extra_project(client, team_id=team.id) + project3 = await _seed_extra_project(client, team_id=team.id) + headers = _bearer_for(user) + + a = await client.post( + f"/v1/projects/{project1.id}/scans", headers=headers, json={"kind": "source"} + ) + b = await client.post( + f"/v1/projects/{project2.id}/sbom-ingest", headers=headers, files=_sbom_part() + ) + assert a.status_code == 202, a.text + assert b.status_code == 202, b.text + + c = await client.post( + f"/v1/projects/{project3.id}/sbom-ingest", headers=headers, files=_sbom_part() + ) + assert c.status_code == 429, c.text + + +# --------------------------------------------------------------------------- +# Request validation — 413 / 415 / 422 at the HTTP layer +# --------------------------------------------------------------------------- + + +async def test_ingest_oversized_returns_413(client, monkeypatch) -> None: + monkeypatch.setenv("SBOM_INGEST_MAX_BYTES", "256") + _team, user, project = await _seed(client, role="developer") + big = json.dumps( + { + "bomFormat": "CycloneDX", + "specVersion": "1.5", + "components": [{"type": "library", "name": "x" * 4096}], + } + ).encode("utf-8") + assert len(big) > 256 + resp = await client.post( + f"/v1/projects/{project.id}/sbom-ingest", + headers=_bearer_for(user), + files=_sbom_part(big), + ) + assert resp.status_code == 413, resp.text + assert resp.headers["content-type"].startswith(PROBLEM_JSON) + body = resp.json() + assert body["status"] == 413 + assert body["type"].endswith("sbom-ingest-too-large") + + +async def test_ingest_wrong_type_and_extension_returns_415(client) -> None: + _team, user, project = await _seed(client, role="developer") + resp = await client.post( + 
f"/v1/projects/{project.id}/sbom-ingest", + headers=_bearer_for(user), + files=_sbom_part(_VALID_SBOM, name="evil.html", ctype="text/html"), + ) + assert resp.status_code == 415, resp.text + assert resp.headers["content-type"].startswith(PROBLEM_JSON) + body = resp.json() + assert body["status"] == 415 + assert body["type"].endswith("sbom-ingest-unsupported-type") + + +@pytest.mark.parametrize( + ("payload", "label"), + [ + (b"this is not json", "non-json"), + (b"[]", "top-level-array"), + (json.dumps({"bomFormat": "SPDX", "specVersion": "1.5"}).encode(), "wrong-format"), + (json.dumps({"bomFormat": "CycloneDX", "specVersion": "2.0"}).encode(), "bad-version"), + ( + json.dumps( + {"bomFormat": "CycloneDX", "specVersion": "1.5", "components": {}} + ).encode(), + "components-not-list", + ), + ], +) +async def test_ingest_invalid_document_returns_422( + client, payload: bytes, label: str +) -> None: + _team, user, project = await _seed(client, role="developer") + resp = await client.post( + f"/v1/projects/{project.id}/sbom-ingest", + headers=_bearer_for(user), + files=_sbom_part(payload, name="bom.json"), + ) + assert resp.status_code == 422, f"{label}: {resp.text}" + assert resp.headers["content-type"].startswith(PROBLEM_JSON) + body = resp.json() + assert body["status"] == 422 + assert body["type"].endswith("sbom-ingest-invalid") + + +async def test_ingest_too_many_components_returns_422(client, monkeypatch) -> None: + monkeypatch.setenv("SBOM_INGEST_MAX_COMPONENTS", "2") + _team, user, project = await _seed(client, role="developer") + payload = json.dumps( + { + "bomFormat": "CycloneDX", + "specVersion": "1.5", + "components": [{"type": "library", "name": f"c{i}"} for i in range(3)], + } + ).encode("utf-8") + resp = await client.post( + f"/v1/projects/{project.id}/sbom-ingest", + headers=_bearer_for(user), + files=_sbom_part(payload, name="bom.json"), + ) + assert resp.status_code == 422, resp.text + assert resp.json()["type"].endswith("sbom-ingest-invalid") + + +# 
--------------------------------------------------------------------------- +# Lifecycle / atomicity — enqueue failure marks scan failed + 503 +# --------------------------------------------------------------------------- + + +async def test_ingest_enqueue_failure_marks_scan_failed_and_returns_503( + client, monkeypatch, _workspace: Path +) -> None: + """If the Celery dispatch raises, the row is flipped to failed with the + deterministic `enqueue_failed:` prefix and the endpoint surfaces 503 + (RFC 7807). The durable SBOM file is left in place (see service docstring).""" + import services.sbom_ingest_service as svc + + def _boom(scan): # type: ignore[no-untyped-def] + raise RuntimeError("broker unreachable") + + monkeypatch.setattr(svc, "enqueue_scan", _boom) + + _team, user, project = await _seed(client, role="developer") + resp = await client.post( + f"/v1/projects/{project.id}/sbom-ingest", + headers=_bearer_for(user), + files=_sbom_part(), + ) + assert resp.status_code == 503, resp.text + assert resp.headers["content-type"].startswith(PROBLEM_JSON) + + # The persisted scan row is failed with the enqueue_failed prefix. + factory = await _factory(client) + async with factory() as session: + from sqlalchemy import select + + from models import Scan + + scan = ( + await session.execute( + select(Scan).where(Scan.project_id == project.id) + ) + ).scalar_one() + assert scan.status == "failed" + assert scan.error_message is not None + assert scan.error_message.startswith("enqueue_failed:") diff --git a/apps/backend/tests/unit/services/test_sbom_ingest_validation.py b/apps/backend/tests/unit/services/test_sbom_ingest_validation.py new file mode 100644 index 00000000..5638e349 --- /dev/null +++ b/apps/backend/tests/unit/services/test_sbom_ingest_validation.py @@ -0,0 +1,403 @@ +""" +Unit tests for the synchronous SBOM-ingest validator + media-type guard. 
+ +These exercise the PURE front-half of the SBOM-ingest feature +(``services.sbom_ingest_service``) — no DB, no Redis, no Celery. The functions +under test are: + + - ``validate_cyclonedx_document(raw: bytes) -> dict`` — parse + structural + whitelist (top-level keys + ``len(components)`` only; NEVER deep traversal). + - ``_validate_content_type(...)`` — Content-Type / filename allow-list (415). + - ``_read_bounded(...)`` — chunked, capped inbound read (413). + +Adversarial-input contract (CLAUDE.md memory: untrusted-input parsers must be +parametrized over hostile inputs). The validator must: + * accept well-formed CycloneDX 1.2–1.6 documents with or without components, + * reject non-JSON / non-object / wrong bomFormat / unsupported specVersion / + non-list components / over-cap component count with ``SbomIngestInvalid`` (422), + * NOT recurse into component elements — it counts only ``len(components)`` at + the top level, deferring the deep parse to the Celery worker — yet still + bound structural nesting via a cheap O(n) byte-depth pre-check, so a + pathologically deep document is a clean 422, never a RecursionError → 500. + +This file is import-pure: it does not import ``main`` / ``core.ratelimit`` / +``core.db`` (so the autouse redis-backed rate-limiter fixture in conftest is +never exercised), which lets the validator cases run standalone with plain +``python`` as well as under pytest. 
+""" + +from __future__ import annotations + +import json + +import pytest + +from services.sbom_ingest_service import ( + _META_TEXT_MAX_LEN, + SbomIngestInvalid, + SbomIngestTooLarge, + SbomIngestUnsupportedType, + _clean_meta_text, + _read_bounded, + _validate_content_type, + validate_cyclonedx_document, +) + + +def _doc(spec_version: str = "1.5", **extra: object) -> bytes: + base: dict[str, object] = {"bomFormat": "CycloneDX", "specVersion": spec_version} + base.update(extra) + return json.dumps(base).encode("utf-8") + + +# --------------------------------------------------------------------------- +# Happy path — supported versions, components present / absent +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("spec_version", ["1.2", "1.3", "1.4", "1.5", "1.6"]) +def test_accepts_supported_spec_versions_without_components(spec_version: str) -> None: + parsed = validate_cyclonedx_document(_doc(spec_version)) + assert parsed["bomFormat"] == "CycloneDX" + assert parsed["specVersion"] == spec_version + # No 'components' key is valid (a metadata-only BOM). 
+ assert "components" not in parsed + + +@pytest.mark.parametrize("spec_version", ["1.4", "1.5", "1.6"]) +def test_accepts_documents_with_components(spec_version: str) -> None: + components = [ + { + "type": "library", + "name": "lodash", + "version": "4.17.19", + "purl": "pkg:npm/lodash@4.17.19", + }, + { + "type": "library", + "name": "jinja2", + "version": "2.11.2", + "purl": "pkg:pypi/jinja2@2.11.2", + }, + ] + parsed = validate_cyclonedx_document(_doc(spec_version, components=components)) + assert isinstance(parsed["components"], list) + assert len(parsed["components"]) == 2 + + +def test_accepts_empty_components_list() -> None: + parsed = validate_cyclonedx_document(_doc(components=[])) + assert parsed["components"] == [] + + +# --------------------------------------------------------------------------- +# Rejection matrix — every structural failure → SbomIngestInvalid (422) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("raw", "why"), + [ + (b"this is not json", "non-JSON garbage"), + (b"", "empty body"), + (b'{"bomFormat": "CycloneDX", "specVersion": "1.5"', "truncated JSON object"), + (b"[]", "top-level JSON array"), + (b'["CycloneDX"]', "top-level array with a string"), + (b'"CycloneDX"', "top-level JSON scalar (string)"), + (b"42", "top-level JSON scalar (number)"), + (b"true", "top-level JSON scalar (bool)"), + (b"null", "top-level JSON null"), + ], +) +def test_rejects_non_object_or_non_json(raw: bytes, why: str) -> None: + with pytest.raises(SbomIngestInvalid): + validate_cyclonedx_document(raw) + + +@pytest.mark.parametrize( + "bom_format", + ["SPDX", "cyclonedx", "CYCLONEDX", "", "CycloneDX ", None, 1, ["CycloneDX"]], +) +def test_rejects_wrong_bom_format(bom_format: object) -> None: + raw = json.dumps({"bomFormat": bom_format, "specVersion": "1.5"}).encode("utf-8") + with pytest.raises(SbomIngestInvalid): + validate_cyclonedx_document(raw) + + +def test_rejects_missing_bom_format() 
-> None: + raw = json.dumps({"specVersion": "1.5"}).encode("utf-8") + with pytest.raises(SbomIngestInvalid): + validate_cyclonedx_document(raw) + + +@pytest.mark.parametrize( + "spec_version", + ["1.1", "2.0", "1.0", "1.7", "v1.5", "1.5.0", "", "latest"], +) +def test_rejects_unsupported_spec_version_strings(spec_version: str) -> None: + with pytest.raises(SbomIngestInvalid): + validate_cyclonedx_document(_doc(spec_version)) + + +@pytest.mark.parametrize("spec_version", [1.5, 15, None, True, ["1.5"], {"v": "1.5"}]) +def test_rejects_non_string_spec_version(spec_version: object) -> None: + raw = json.dumps( + {"bomFormat": "CycloneDX", "specVersion": spec_version} + ).encode("utf-8") + with pytest.raises(SbomIngestInvalid): + validate_cyclonedx_document(raw) + + +def test_rejects_missing_spec_version() -> None: + raw = json.dumps({"bomFormat": "CycloneDX"}).encode("utf-8") + with pytest.raises(SbomIngestInvalid): + validate_cyclonedx_document(raw) + + +@pytest.mark.parametrize( + "components", + [ + {"not": "a list"}, + "pkg:npm/lodash@4.17.19", + 42, + True, + ], +) +def test_rejects_components_that_are_not_a_list(components: object) -> None: + raw = json.dumps( + {"bomFormat": "CycloneDX", "specVersion": "1.5", "components": components} + ).encode("utf-8") + with pytest.raises(SbomIngestInvalid): + validate_cyclonedx_document(raw) + + +def test_rejects_too_many_components(monkeypatch: pytest.MonkeyPatch) -> None: + """len(components) > cap → SbomIngestInvalid. 
The cap is read at call time + (CLAUDE.md rule #11), so a tiny env override drives the rejection cheaply + without building 50k elements.""" + monkeypatch.setenv("SBOM_INGEST_MAX_COMPONENTS", "3") + components = [{"type": "library", "name": f"c{i}"} for i in range(4)] + with pytest.raises(SbomIngestInvalid): + validate_cyclonedx_document(_doc(components=components)) + + +def test_accepts_components_exactly_at_cap(monkeypatch: pytest.MonkeyPatch) -> None: + """Boundary: len == cap passes; only len > cap is rejected.""" + monkeypatch.setenv("SBOM_INGEST_MAX_COMPONENTS", "3") + components = [{"type": "library", "name": f"c{i}"} for i in range(3)] + parsed = validate_cyclonedx_document(_doc(components=components)) + assert len(parsed["components"]) == 3 + + +# --------------------------------------------------------------------------- +# Deep nesting — within caps must PASS (no recursion / CPU blow-up) +# --------------------------------------------------------------------------- + + +def _nested_components_doc(depth: int) -> bytes: + """Build {components:[{components:[{components:[...]}]}]} ``depth`` deep as a + raw JSON STRING by concatenation. + + We deliberately do NOT round-trip a ``depth``-deep Python object through + ``json.dumps``: CPython's json *encoder* is itself recursive and would + overflow while building the fixture (a different limit from the one under + test). The assembled string is still valid JSON. + """ + opening = '{"type":"library","name":"n","components":[' + leaf = '{"type":"library","name":"leaf"}' + closing = "]}" + return ( + '{"bomFormat":"CycloneDX","specVersion":"1.5","components":[' + + opening * depth + + leaf + + closing * depth + + "]}" + ).encode("utf-8") + + +def test_moderately_nested_document_within_caps_passes() -> None: + """A legitimately nested-component chain (well under the depth cap) with only + ONE top-level component validates successfully. 
+ + Two properties at once: (a) the validator inspects ONLY the top-level keys + and ``len(components)`` — it never recurses into the element graph to count + or sanitise nested components (it sees ``len == 1`` regardless of the chain + below); (b) a normal SBOM that nests a handful of assembly levels stays well + under ``_MAX_NESTING_DEPTH`` and is not falsely rejected. ``_nested_components_doc(d)`` + produces a structural byte-depth of roughly ``2 + 2*d``, so depth 20 (~42) + sits comfortably under the 64 cap. The authoritative deep parse is deferred + to the Celery worker; the abuse boundary is pinned in + ``test_extremely_deep_document_is_rejected_422``. + """ + parsed = validate_cyclonedx_document(_nested_components_doc(20)) + # Exactly one TOP-LEVEL component — the validator counted len() == 1 and did + # not descend into the deep chain. + assert len(parsed["components"]) == 1 + + +def test_extremely_deep_document_is_rejected_422() -> None: + """A pathologically deep document is rejected as a clean 422 by the O(n) + byte-depth pre-check, BEFORE the recursive ``json.loads`` decoder runs. + + Regression guard for the RecursionError → unhandled 500 bug: ``json.loads`` + recurses one frame per nesting level, so a ~10k-deep CycloneDX document would + overflow CPython's recursion limit and raise ``RecursionError`` (a + ``RuntimeError`` subclass the old ``except ValueError`` did not catch), + escaping as a 500. ``validate_cyclonedx_document`` now rejects it as + ``SbomIngestInvalid`` via ``_max_nesting_depth`` (and defensively catches + ``RecursionError`` as belt-and-braces). 
+ """ + with pytest.raises(SbomIngestInvalid): + validate_cyclonedx_document(_nested_components_doc(10_000)) + + +# --------------------------------------------------------------------------- +# Pass-through inputs — validator does NOT reject these (deferred to persist) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("purl", "label"), + [ + ("pkg:npm/evil@1.0.0\x00injected", "null byte in purl"), + ("pkg:npm/evil@1.0.0\r\nX-Injected: 1", "CRLF in purl"), + ("pkg:npm/" + "a" * 100_000 + "@1.0.0", "oversized purl"), + ("javascript:alert(1)", "scheme-injection purl"), + ], +) +def test_hostile_component_fields_pass_validation(purl: str, label: str) -> None: + """Null bytes / CRLF / oversized / scheme-injection INSIDE a component are + NOT rejected by the synchronous validator — it never traverses element + fields. The Celery persist stage owns sanitising these (covered by the + vulnerability_matching scrubber tests). Here we pin that the structural + validator passes such a document through, so the contract boundary is + explicit.""" + components = [{"type": "library", "name": "evil", "purl": purl}] + parsed = validate_cyclonedx_document(_doc(components=components)) + assert len(parsed["components"]) == 1 + + +# --------------------------------------------------------------------------- +# Content-Type / filename allow-list (415) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("content_type", "filename"), + [ + ("application/json", "sbom.json"), + ("application/vnd.cyclonedx+json", "sbom.cdx.json"), + ("application/octet-stream", "bom.json"), + ("", "sbom.cdx.json"), # CLI omits the part content-type → filename saves it + ("application/json; charset=utf-8", "weird.bin"), # param stripped, ct ok + ("text/plain", "good.json"), # bad ct but .json filename rescues it + ("APPLICATION/JSON", "x"), # case-insensitive content-type + (None, 
"bom.cdx.json"), # no ct header at all but valid filename + ], +) +def test_content_type_guard_accepts(content_type: str | None, filename: str) -> None: + # The guard returns None on success and raises SbomIngestUnsupportedType + # otherwise; "accepts" == "does not raise". + _validate_content_type(content_type=content_type, filename=filename) + + +@pytest.mark.parametrize( + ("content_type", "filename"), + [ + ("text/html", "index.html"), + ("application/zip", "source.zip"), + ("application/xml", "sbom.xml"), # CycloneDX XML not accepted by THIS endpoint + ("text/plain", "notes.txt"), + ("image/png", "logo.png"), + ], +) +def test_content_type_guard_rejects_on_both_axes( + content_type: str, filename: str +) -> None: + """415 only when BOTH the media type AND the filename are wrong — either one + in the allow-list is enough to pass.""" + with pytest.raises(SbomIngestUnsupportedType): + _validate_content_type(content_type=content_type, filename=filename) + + +# --------------------------------------------------------------------------- +# Bounded read (413) — the chunked cap guard +# --------------------------------------------------------------------------- + + +class _FakeUpload: + """Minimal UploadFile stand-in: serves ``data`` in ``chunk`` byte slices.""" + + def __init__(self, data: bytes, *, chunk: int) -> None: + self._data = data + self._chunk = chunk + self._pos = 0 + + async def read(self, size: int = -1) -> bytes: + if self._pos >= len(self._data): + return b"" + n = self._chunk if size < 0 else min(size, self._chunk) + out = self._data[self._pos : self._pos + n] + self._pos += len(out) + return out + + +@pytest.mark.asyncio +async def test_read_bounded_returns_full_body_under_cap() -> None: + payload = b"A" * 4096 + upload = _FakeUpload(payload, chunk=512) + out = await _read_bounded(upload, max_bytes=8192) # type: ignore[arg-type] + assert out == payload + + +@pytest.mark.asyncio +async def test_read_bounded_raises_too_large_over_cap() -> None: + 
payload = b"B" * 9000 + upload = _FakeUpload(payload, chunk=512) + with pytest.raises(SbomIngestTooLarge): + await _read_bounded(upload, max_bytes=4096) # type: ignore[arg-type] + + +@pytest.mark.asyncio +async def test_read_bounded_aborts_before_buffering_whole_body() -> None: + """The cap fires on the running total, not after materialising everything — + a 64 MiB stream against a 1 MiB cap must abort having read only a bounded + prefix (a couple of chunks past the cap), never the whole body.""" + big = b"C" * (64 * 1024 * 1024) + upload = _FakeUpload(big, chunk=1024 * 1024) + with pytest.raises(SbomIngestTooLarge): + await _read_bounded(upload, max_bytes=1024 * 1024) # type: ignore[arg-type] + # The loop stopped right after the running total crossed the cap. + assert upload._pos <= 2 * 1024 * 1024 + + +# --------------------------------------------------------------------------- +# Metadata text cleaning — release / original_filename defense-in-depth +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("raw", "expected"), + [ + (None, None), + ("", None), + (" ", None), + ("v1.2.3", "v1.2.3"), + (" v1.2.3 ", "v1.2.3"), + # Control bytes (NUL, CR, LF, tab) are stripped — they would corrupt + # audit/log lines and have no place in a release label or filename. + ("v1\x00.2\n.3\r", "v1.2.3"), + ("a\tb", "ab"), + # A value that is only control bytes collapses to None. 
+ ("\x00\n\r", None), + ], +) +def test_clean_meta_text(raw: str | None, expected: str | None) -> None: + assert _clean_meta_text(raw) == expected + + +def test_clean_meta_text_caps_length() -> None: + """An oversized release/filename is truncated to the cap (no unbounded JSONB).""" + cleaned = _clean_meta_text("x" * (_META_TEXT_MAX_LEN + 500)) + assert cleaned is not None + assert len(cleaned) == _META_TEXT_MAX_LEN diff --git a/docs-site/docs/ci-integration/sbom-upload.md b/docs-site/docs/ci-integration/sbom-upload.md new file mode 100644 index 00000000..712a15a8 --- /dev/null +++ b/docs-site/docs/ci-integration/sbom-upload.md @@ -0,0 +1,167 @@ +--- +id: sbom-upload +title: Upload an SBOM +description: Upload a CycloneDX SBOM that an external tool already produced — TRUSCA queues a scan that matches CVEs, classifies declared licenses, and runs the build gate. +sidebar_label: Upload an SBOM +sidebar_position: 5 +--- + +# Upload an SBOM + +Already have a CycloneDX SBOM (software bill of materials) from another tool? Upload it to an existing TRUSCA project and TRUSCA matches its components against vulnerability data, classifies declared licenses, builds the dependency graph, and runs the build gate — without cloning or scanning your source. + +The endpoint is `POST /v1/projects/{project_id}/sbom-ingest`. It is asynchronous: a successful request returns `202 Accepted` with a queued scan row, and you poll the scan to read the result. + +:::note Audience +Engineers and CI pipelines that produce a CycloneDX JSON SBOM with a tool of their own (for example a build that runs cdxgen) and want TRUSCA to analyze it. You need a TRUSCA API key — see [API keys](../admin-guide/api-keys.md). +::: + +:::caution Not a Dependency-Track endpoint +TRUSCA is **not** Dependency-Track API compatible. The Dependency-Track flow — `POST /api/v1/bom` with an `X-Api-Key` header, an `autoCreate` form field, and a base64 `bom` field — does not work here. 
Use the TRUSCA endpoint, the `Authorization: Bearer` header, and the multipart fields documented below. The project must already exist; there is no auto-create. +::: + +## Prerequisites + +- A TRUSCA API key in the `tos_<prefix>_<secret>` format. Create one at **/integrations → API keys → New API key**; see [API keys](../admin-guide/api-keys.md) for the scope model. +- The target **project already exists**. Copy its UUID from **Project Settings → CI/CD**. Uploading an SBOM does not create a project. +- The API key's scope covers that project — a `project`-scoped key bound to it, or a `team`-scoped key for a project the team owns. +- A CycloneDX JSON document. Supported `specVersion` values are `1.2` through `1.6`. SPDX is not accepted on this endpoint. +- No scan is currently queued or running for the project (one in-flight scan per project; a second returns `409`). + +## Upload an SBOM + +Send the document as `multipart/form-data`: + +| Field | Required | Example | Description | +|---|---|---|---| +| `sbom` | yes | `@bom.cdx.json` | The CycloneDX JSON SBOM file. | +| `ref` | no | `main` | The git ref the SBOM was produced from (branch name, tag, or full ref). TRUSCA normalizes it into a retention key. | +| `release` | no | `v1.2.3` | A release or version label for the resulting snapshot. | + +Authenticate with the API key as a bearer token. The header is `Authorization: Bearer <api-key>` — **not** `X-Api-Key`. + +```bash +curl -X POST \ + https://trustedoss.example.com/v1/projects/<project-id>/sbom-ingest \ + -H "Authorization: Bearer $TRUSTEDOSS_API_KEY" \ + -F "sbom=@bom.cdx.json" \ + -F "ref=main" \ + -F "release=v1.2.3" +``` + +Substitute `<project-id>` with the project UUID and set `TRUSTEDOSS_API_KEY` in your environment. A cdxgen-based pipeline can produce `bom.cdx.json` in a build step and upload it with the command above. 
+ +On success the response is `202 Accepted` with the queued scan row: + +```json +{ + "id": "3f9a2c10-7b4e-4d2a-9c11-0e8f5d6a1b22", + "project_id": "<project-id>", + "kind": "sbom", + "status": "queued", + "ref": "main", + "release": "v1.2.3" +} +``` + +`kind` is always `sbom` for an uploaded SBOM, and `status` starts at `queued`. Keep the `id` — that is the scan id you poll next. + +## Watch the scan finish + +Poll the scan with the same bearer token until it reaches a terminal state (`succeeded`, `failed`, or `cancelled`). This is the same polling pattern the [GitHub Actions](./github-actions.md) integration uses. + +```bash +curl https://trustedoss.example.com/v1/scans/<scan-id> \ + -H "Authorization: Bearer $TRUSTEDOSS_API_KEY" +``` + +`status` moves `queued → running → succeeded`. A reasonable cadence is one poll every 30 seconds. Once `status` is `succeeded`, open the project in the portal to read components, vulnerabilities, and licenses. + +## Verify it worked + +After the scan reaches `succeeded`: + +- The project's **Components** tab lists the packages from the SBOM, and the component count is greater than zero. +- The **Vulnerabilities** tab shows CVE (Common Vulnerabilities and Exposures) findings that Trivy matched against the components. +- The **Licenses** tab shows the declared licenses carried in the SBOM. +- The **Overview** tab shows the dependency graph and the project risk score. + +If the project has a build gate policy, the gate runs on the uploaded SBOM exactly as it does for a source scan. + +## What an uploaded SBOM fills in + +An uploaded SBOM carries only what the producing tool wrote into it, so TRUSCA can enrich some surfaces and not others. + +**Filled in:** + +- Component list — every component in the SBOM. +- Vulnerabilities — CVE findings, matched by Trivy against the components by PURL. +- Declared licenses — the license each component declares in the SBOM. +- Dependency graph — built from the SBOM's `dependencies`. 
+- Build gate — Critical CVEs and forbidden-classification licenses trip the gate, so a CI step that calls this endpoint and then checks the gate can block a build the same way a source scan does. + +**Not filled in (these come only from a source or repository scan):** + +- Detected licenses — the license texts a source scan finds in the files themselves (scancode). An uploaded SBOM is never cloned or scanned, so there is nothing to detect. +- Registry-concluded licenses — the reconciled license a source scan derives from registry metadata. +- SBOM signature and attestation — an uploaded SBOM is not signed (cosign), so the signature, certificate, and attestation download endpoints have nothing to serve for it. +- Source preservation — no source is fetched or kept. + +If you need detected licenses, signing, or source preservation, run a source scan against the repository instead — see [Scans](../user-guide/scans.md). + +## Limits + +| Limit | Default | Environment variable | Exceeded | +|---|---|---|---| +| Upload size | 32 MiB | `SBOM_INGEST_MAX_BYTES` | `413` | +| Component count | 50,000 | `SBOM_INGEST_MAX_COMPONENTS` | `422` | + +An operator can raise or lower either limit per deployment; see [Environment variables](../reference/env-variables.md). + +## Errors + +All errors are RFC 7807 (Problem Details for HTTP APIs) responses with the `application/problem+json` content type. + +| Status | When | +|---|---| +| `403` | The caller is not a member of the project's owning team, or a project-scoped API key targets a different project. | +| `404` | The project does not exist, or it is hidden from the caller (existence-hide). | +| `409` | A scan is already queued or running for this project, or the project is archived. | +| `413` | The upload exceeds the size cap (`SBOM_INGEST_MAX_BYTES`). | +| `415` | The upload is not a CycloneDX JSON media type — the content type and filename are both wrong. 
Use `application/json` or `application/vnd.cyclonedx+json`, with a `.json` or `.cdx.json` filename. | +| `422` | The upload is not a valid CycloneDX document — not JSON, `bomFormat` is not `CycloneDX`, an unsupported `specVersion`, malformed `components`, or more components than `SBOM_INGEST_MAX_COMPONENTS`. | +| `429` | Rate limited, or the team's concurrent-scan cap is reached. The response carries a `Retry-After` header. | + +## Troubleshooting + +### `401 Unauthorized` + +The bearer token is missing, malformed, or expired. Confirm the header is `Authorization: Bearer <api-key>` — TRUSCA does not read an `X-Api-Key` header. Re-paste the key from the API key modal; it is exactly `tos_` + 8 characters + `_` + 32 characters. + +### `403 Forbidden` + +The API key's scope does not cover the project. Re-issue the key with scope `project` bound to that project, or scope `team` for a project the team owns. See [API keys](../admin-guide/api-keys.md). + +### `409 Conflict` + +A scan is already queued or running for this project — TRUSCA allows one in-flight scan per project. Wait for it to finish (poll `GET /v1/scans/{scan_id}`), then retry. A `409` also fires when the project is archived; restore it first. + +### `415 Unsupported Media Type` + +TRUSCA accepts only CycloneDX JSON. Confirm the file is JSON and the upload sets a JSON media type or a `.json` / `.cdx.json` filename. SPDX and CycloneDX XML are not accepted here. + +### `422 Unprocessable Entity` + +The document is JSON but not an ingestible CycloneDX SBOM. Check that `bomFormat` is `CycloneDX`, that `specVersion` is between `1.2` and `1.6`, and that the component count is within `SBOM_INGEST_MAX_COMPONENTS`. The `detail` field names the specific reason. + +### `429 Too Many Requests` + +You hit the per-user scan-creation rate limit, or the team reached its concurrent-scan cap. Honor the `Retry-After` header and retry after the stated delay. 
+ +## See also + +- [GitHub Actions](./github-actions.md) — trigger a source scan and gate the build from a workflow +- [API keys](../admin-guide/api-keys.md) — the `tos_` key format and scope model +- [Scans](../user-guide/scans.md) — source and container scans, and what each one fills in +- [Scan retention](../admin-guide/scan-retention.md) — how `ref` and `release` group and keep scans +- [Environment variables](../reference/env-variables.md) — the ingest size and component limits diff --git a/docs-site/i18n/ko/docusaurus-plugin-content-docs/current/ci-integration/sbom-upload.md b/docs-site/i18n/ko/docusaurus-plugin-content-docs/current/ci-integration/sbom-upload.md new file mode 100644 index 00000000..532c69fe --- /dev/null +++ b/docs-site/i18n/ko/docusaurus-plugin-content-docs/current/ci-integration/sbom-upload.md @@ -0,0 +1,167 @@ +--- +id: sbom-upload +title: SBOM 업로드 +description: 외부 도구가 이미 생성한 CycloneDX SBOM을 업로드하면 TRUSCA가 CVE를 매칭하고 선언 라이선스를 분류하며 빌드 게이트를 실행하는 스캔을 큐에 넣습니다. +sidebar_label: SBOM 업로드 +sidebar_position: 5 +--- + +# SBOM 업로드 + +다른 도구로 만든 CycloneDX SBOM(software bill of materials, 소프트웨어 구성 명세)이 이미 있습니까? 기존 TRUSCA 프로젝트에 업로드하면 TRUSCA가 소스를 복제하거나 스캔하지 않고도 그 컴포넌트를 취약점 데이터와 매칭하고, 선언 라이선스를 분류하고, 의존성 그래프를 구성하고, 빌드 게이트를 실행합니다. + +엔드포인트는 `POST /v1/projects/{project_id}/sbom-ingest` 입니다. 비동기로 동작합니다. 요청이 성공하면 큐에 들어간 스캔 행과 함께 `202 Accepted`를 반환하므로, 스캔을 폴링해 결과를 확인합니다. + +:::note 대상 독자 +자체 도구(예: 빌드에서 실행하는 cdxgen)로 CycloneDX JSON SBOM을 생성하고 TRUSCA로 분석하려는 엔지니어와 CI 파이프라인. TRUSCA API Key가 필요합니다 — [API keys](../admin-guide/api-keys.md) 참고. +::: + +:::caution Dependency-Track 엔드포인트 아님 +TRUSCA는 Dependency-Track API 호환이 **아닙니다**. Dependency-Track 방식 — `X-Api-Key` 헤더와 `autoCreate` 폼 필드, base64 `bom` 필드를 쓰는 `POST /api/v1/bom` — 은 여기서 통하지 않습니다. 아래에 정리한 TRUSCA 엔드포인트와 `Authorization: Bearer` 헤더, multipart 필드를 사용하세요. 프로젝트는 사전에 존재해야 하며 자동 생성은 없습니다. +::: + +## 사전 조건 + +- `tos__` 형식의 TRUSCA API Key. 
**/integrations → API keys → New API key**에서 생성하며, 스코프 모델은 [API keys](../admin-guide/api-keys.md) 참고. +- 대상 **프로젝트가 이미 존재**. UUID는 **Project Settings → CI/CD**에서 복사합니다. SBOM 업로드는 프로젝트를 생성하지 않습니다. +- API Key의 스코프가 그 프로젝트를 커버 — 프로젝트에 바인딩된 `project` 스코프 키이거나, 팀이 소유한 프로젝트라면 `team` 스코프 키. +- CycloneDX JSON 문서. 지원하는 `specVersion`은 `1.2`부터 `1.6`까지입니다. 이 엔드포인트는 SPDX를 받지 않습니다. +- 프로젝트에 큐 대기 중이거나 실행 중인 스캔이 없음(프로젝트당 진행 스캔 1개, 두 번째는 `409` 반환). + +## SBOM 업로드 방법 + +문서를 `multipart/form-data`로 보냅니다. + +| 필드 | 필수 | 예 | 설명 | +|---|---|---|---| +| `sbom` | 예 | `@bom.cdx.json` | CycloneDX JSON SBOM 파일. | +| `ref` | 아니오 | `main` | SBOM을 생성한 git ref(브랜치명·태그·전체 ref). TRUSCA가 보존 키로 정규화합니다. | +| `release` | 아니오 | `v1.2.3` | 결과 스냅샷에 붙일 릴리스/버전 레이블. | + +API Key를 베어러 토큰으로 인증합니다. 헤더는 `Authorization: Bearer <api-key>` 이며, `X-Api-Key`가 **아닙니다**. + +```bash +curl -X POST \ + https://trustedoss.example.com/v1/projects/<project-uuid>/sbom-ingest \ + -H "Authorization: Bearer $TRUSTEDOSS_API_KEY" \ + -F "sbom=@bom.cdx.json" \ + -F "ref=main" \ + -F "release=v1.2.3" +``` + +`<project-uuid>`는 프로젝트 UUID로 바꾸고 `TRUSTEDOSS_API_KEY`는 환경에 설정합니다. cdxgen 기반 파이프라인은 빌드 단계에서 `bom.cdx.json`을 생성한 다음 위 명령으로 업로드할 수 있습니다. + +성공하면 응답은 큐에 들어간 스캔 행과 함께 `202 Accepted` 입니다. + +```json +{ + "id": "3f9a2c10-7b4e-4d2a-9c11-0e8f5d6a1b22", + "project_id": "<project-uuid>", + "kind": "sbom", + "status": "queued", + "ref": "main", + "release": "v1.2.3" +} +``` + +업로드된 SBOM의 `kind`는 항상 `sbom`이고 `status`는 `queued`로 시작합니다. `id`를 보관하세요 — 다음에 폴링할 스캔 id입니다. + +## 스캔 완료 확인 + +같은 베어러 토큰으로 스캔이 최종 상태(`succeeded`·`failed`·`cancelled`)에 도달할 때까지 폴링합니다. [GitHub Actions](./github-actions.md) 연동이 쓰는 폴링 패턴과 동일합니다. + +```bash +curl https://trustedoss.example.com/v1/scans/<scan-id> \ + -H "Authorization: Bearer $TRUSTEDOSS_API_KEY" +``` + +`status`는 `queued → running → succeeded`로 이동합니다. 30초에 한 번 폴링하는 주기가 적당합니다. `status`가 `succeeded`가 되면 포털에서 프로젝트를 열어 컴포넌트·취약점·라이선스를 확인합니다. 
+ +## 동작 확인 + +스캔이 `succeeded`에 도달한 다음: + +- 프로젝트의 **Components** 탭에 SBOM의 패키지가 나열되고 컴포넌트 개수가 0보다 큽니다. 
+- **Vulnerabilities** 탭에 Trivy가 컴포넌트와 매칭한 CVE(Common Vulnerabilities and Exposures, 공통 취약점·노출) 발견 항목이 표시됩니다. +- **Licenses** 탭에 SBOM이 담은 선언 라이선스가 표시됩니다. +- **Overview** 탭에 의존성 그래프와 프로젝트 리스크 점수가 표시됩니다. + +프로젝트에 빌드 게이트 정책이 있으면, 소스 스캔과 똑같이 업로드된 SBOM에도 게이트가 실행됩니다. + +## 업로드된 SBOM이 채우는 것 + +업로드된 SBOM은 생성한 도구가 안에 기록한 내용만 담으므로, TRUSCA가 보강하는 영역과 그렇지 않은 영역이 나뉩니다. + +**채워지는 것:** + +- 컴포넌트 목록 — SBOM의 모든 컴포넌트. +- 취약점 — Trivy가 PURL로 컴포넌트와 매칭한 CVE 발견 항목. +- 선언 라이선스 — 각 컴포넌트가 SBOM에 선언한 라이선스. +- 의존성 그래프 — SBOM의 `dependencies`로 구성. +- 빌드 게이트 — Critical CVE와 금지 분류 라이선스가 게이트를 발동하므로, 이 엔드포인트를 호출한 다음 게이트를 확인하는 CI 단계는 소스 스캔과 동일하게 빌드를 차단할 수 있습니다. + +**채워지지 않는 것(소스/저장소 스캔에서만 나옵니다):** + +- 검출 라이선스 — 소스 스캔이 파일 안에서 직접 찾는 라이선스 텍스트(scancode). 업로드된 SBOM은 복제도 스캔도 하지 않으므로 검출할 대상이 없습니다. +- 레지스트리 concluded 라이선스 — 소스 스캔이 레지스트리 메타데이터에서 도출하는 정리된 라이선스. +- SBOM 서명·증명 — 업로드된 SBOM은 서명(cosign)되지 않으므로 서명·인증서·증명 다운로드 엔드포인트가 제공할 대상이 없습니다. +- 소스 보존 — 소스를 가져오거나 보관하지 않습니다. + +검출 라이선스·서명·소스 보존이 필요하면 저장소를 대상으로 소스 스캔을 실행하세요 — [Scans](../user-guide/scans.md) 참고. + +## 제한 + +| 제한 | 기본값 | 환경 변수 | 초과 시 | +|---|---|---|---| +| 업로드 용량 | 32 MiB | `SBOM_INGEST_MAX_BYTES` | `413` | +| 컴포넌트 개수 | 50,000 | `SBOM_INGEST_MAX_COMPONENTS` | `422` | + +운영자는 배포마다 두 제한을 올리거나 내릴 수 있습니다 — [환경 변수](../reference/env-variables.md) 참고. + +## 오류 + +모든 오류는 RFC 7807(Problem Details for HTTP APIs) 응답이며 `application/problem+json` 콘텐츠 타입을 씁니다. + +| 상태 | 발생 시점 | +|---|---| +| `403` | 호출자가 프로젝트 소유 팀의 멤버가 아니거나, project 스코프 API Key가 다른 프로젝트를 가리킴. | +| `404` | 프로젝트가 없거나, 호출자에게 숨겨짐(존재 은닉). | +| `409` | 이 프로젝트에 스캔이 이미 큐 대기 중이거나 실행 중이거나, 프로젝트가 archived 상태. | +| `413` | 업로드가 용량 상한(`SBOM_INGEST_MAX_BYTES`)을 초과. | +| `415` | 업로드가 CycloneDX JSON 미디어 타입이 아님 — 콘텐츠 타입과 파일명이 모두 잘못됨. `application/json` 또는 `application/vnd.cyclonedx+json`을 쓰고 `.json` 또는 `.cdx.json` 파일명을 쓰세요. | +| `422` | 업로드가 유효한 CycloneDX 문서가 아님 — JSON이 아니거나, `bomFormat`이 `CycloneDX`가 아니거나, 지원하지 않는 `specVersion`이거나, `components`가 잘못됐거나, 컴포넌트가 `SBOM_INGEST_MAX_COMPONENTS`보다 많음. 
| +| `429` | 레이트 리밋에 걸렸거나 팀의 동시 스캔 상한에 도달. 응답에 `Retry-After` 헤더가 실립니다. | + +## 문제 해결 + +### `401 Unauthorized` + +베어러 토큰이 없거나 형식이 잘못됐거나 만료됐습니다. 헤더가 `Authorization: Bearer <api-key>`인지 확인하세요 — TRUSCA는 `X-Api-Key` 헤더를 읽지 않습니다. API Key 모달에서 키를 다시 붙여 넣으세요. 키는 정확히 `tos_` + 8자 + `_` + 32자입니다. + +### `403 Forbidden` + +API Key의 스코프가 프로젝트를 커버하지 않습니다. 그 프로젝트에 바인딩된 `project` 스코프, 또는 팀이 소유한 프로젝트라면 `team` 스코프로 키를 다시 발급하세요. [API keys](../admin-guide/api-keys.md) 참고. + +### `409 Conflict` + +이 프로젝트에 스캔이 이미 큐 대기 중이거나 실행 중입니다 — TRUSCA는 프로젝트당 진행 스캔 1개만 허용합니다. 끝날 때까지 기다린 다음(`GET /v1/scans/{scan_id}` 폴링) 다시 시도하세요. 프로젝트가 archived 상태일 때도 `409`가 발생합니다. 먼저 복원하세요. + +### `415 Unsupported Media Type` + +TRUSCA는 CycloneDX JSON만 받습니다. 파일이 JSON인지, 업로드가 JSON 미디어 타입이나 `.json` / `.cdx.json` 파일명을 설정하는지 확인하세요. SPDX와 CycloneDX XML은 여기서 받지 않습니다. + +### `422 Unprocessable Entity` + +문서는 JSON이지만 처리할 수 있는 CycloneDX SBOM이 아닙니다. `bomFormat`이 `CycloneDX`인지, `specVersion`이 `1.2`에서 `1.6` 사이인지, 컴포넌트 개수가 `SBOM_INGEST_MAX_COMPONENTS` 이내인지 확인하세요. `detail` 필드가 구체적 사유를 알려 줍니다. + +### `429 Too Many Requests` + +사용자별 스캔 생성 레이트 리밋에 걸렸거나 팀이 동시 스캔 상한에 도달했습니다. `Retry-After` 헤더를 따라 명시된 지연 후 다시 시도하세요. 
+ +## 더 보기 + +- [GitHub Actions](./github-actions.md) — 워크플로에서 소스 스캔을 트리거하고 빌드를 게이트 +- [API keys](../admin-guide/api-keys.md) — `tos_` 키 형식과 스코프 모델 +- [Scans](../user-guide/scans.md) — 소스·컨테이너 스캔, 각각이 채우는 것 +- [Scan retention](../admin-guide/scan-retention.md) — `ref`와 `release`로 스캔을 묶고 보존하는 방식 +- [환경 변수](../reference/env-variables.md) — 업로드 용량·컴포넌트 제한 diff --git a/docs-site/sidebars.ts b/docs-site/sidebars.ts index 44492837..4745043f 100644 --- a/docs-site/sidebars.ts +++ b/docs-site/sidebars.ts @@ -74,6 +74,7 @@ const sidebars: SidebarsConfig = { "ci-integration/gitlab-ci", "ci-integration/jenkins", "ci-integration/webhooks", + "ci-integration/sbom-upload", ], }, { From 07d8289a049546ee6ed73c8608885b67356b5cb6 Mon Sep 17 00:00:00 2001 From: Haksung Jang Date: Sun, 14 Jun 2026 01:51:36 +0900 Subject: [PATCH 2/6] test(scan): regenerate OpenAPI snapshot for sbom-ingest endpoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The OpenAPI contract snapshot test (test_openapi_no_drift) flagged the new POST /v1/projects/{project_id}/sbom-ingest path. Add it to the committed snapshot — path param project_id only (sbom/ref/release are requestBody). 
--- apps/backend/tests/unit/openapi_endpoints.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/apps/backend/tests/unit/openapi_endpoints.json b/apps/backend/tests/unit/openapi_endpoints.json index 60d7d9e0..5093530d 100644 --- a/apps/backend/tests/unit/openapi_endpoints.json +++ b/apps/backend/tests/unit/openapi_endpoints.json @@ -416,6 +416,9 @@ "POST /v1/projects/{project_id}/remediation/npm/pull-request": [ "project_id" ], + "POST /v1/projects/{project_id}/sbom-ingest": [ + "project_id" + ], "POST /v1/projects/{project_id}/scans": [ "project_id" ], From 20a3040c020336bb8b93202f40b38f8098a8b9f7 Mon Sep 17 00:00:00 2001 From: Haksung Jang Date: Sun, 14 Jun 2026 02:14:11 +0900 Subject: [PATCH 3/6] =?UTF-8?q?fix(worker):=20bump=20cdxgen=2012.3.3=20?= =?UTF-8?q?=E2=86=92=2012.5.1=20to=20bust=20stale=20image-scan=20node-pkg?= =?UTF-8?q?=20layer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit image-scan (worker) HARD-failed on 3 node-pkg findings — lodash 4.17.19 (CVE-2021-23337, CVE-2026-4800) and minimist 1.2.5 (CVE-2021-44906) — that live under @cyclonedx/cdxgen/node_modules. Reproduction in node:20-bookworm shows cdxgen 11.x bundles both, while 12.3.3 AND 12.5.1 ship neither: a clean build already lacks them, so the failure was a stale type=gha scope=worker cache layer serving the pre-12.x install tree (same class as the earlier php-symfony image-scan incident). Bumping the version interpolated into the global npm install changes that layer's cache key, forcing a fresh (clean) install — root-cause removal, not a .trivyignore suppression (suppressing a package absent from a clean build would wrongly mute a future regression). cdxgen invocation is unchanged across 12.3.3→12.5.1 and engines.node still allows ^20, so no scan regression. Fixes main too (shared cache) once merged. 
--- .trivyignore | 4 ++-- apps/backend/Dockerfile.worker | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/.trivyignore b/.trivyignore index b60f79f1..e1c6b478 100644 --- a/.trivyignore +++ b/.trivyignore @@ -44,7 +44,7 @@ # crypto are all outside this code path. # # ---------------------------------------------------------------------------- -# cdxgen 12.3.3 reach surface (single source of truth — keep in sync with +# cdxgen 12.5.1 reach surface (single source of truth — keep in sync with # apps/backend/integrations/cdxgen.py:107-115): # cmd = [ # "cdxgen", @@ -330,7 +330,7 @@ CVE-2026-26171 CVE-2026-33116 # --------------------------------------------------------------------------- -# Bundled Go-binary toolchain reach surface — cdxgen 12.3.3 + ORT 85.1.1 +# Bundled Go-binary toolchain reach surface — cdxgen 12.5.1 + ORT 85.1.1 # vendored Go executables (single source of truth — keep in sync with # apps/backend/integrations/cdxgen.py:107-115 and apps/backend/integrations/ # ort.py:128-139): diff --git a/apps/backend/Dockerfile.worker b/apps/backend/Dockerfile.worker index 09c3c970..199d4b67 100644 --- a/apps/backend/Dockerfile.worker +++ b/apps/backend/Dockerfile.worker @@ -231,7 +231,7 @@ RUN curl -fsSL https://deb.nodesource.com/setup_${NODE_MAJOR}.x | bash - \ && node --version \ && npm --version -# ---- cdxgen 12.3.3 (Apache-2.0). ------------------------------------------- +# ---- cdxgen 12.5.1 (Apache-2.0). ------------------------------------------- # CycloneDX SBOM generator, 30+ ecosystems. The 12.x line trims the 11.x # transitive deps tree (drops cacache / pacote / @npmcli/run-script and # the cross-spawn 7.0.3 chain underneath) — that's the ~23 HIGH npm CVEs @@ -239,6 +239,35 @@ RUN curl -fsSL https://deb.nodesource.com/setup_${NODE_MAJOR}.x | bash - \ # 12.x still supports Node 20 LTS via `engines.node: ^20 || ^22 || ^24 || ^25`. # We install globally so it lands on PATH for the celery worker. 
# +# PR #406 (image-scan node-pkg fix): bumped 12.3.3 → 12.5.1 (latest 12.x as +# of 2026-06). This is a ROOT-CAUSE removal, not a suppression, of three +# Trivy node-pkg findings the image-scan gate started failing on: +# lodash 4.17.19 — CVE-2021-23337 (HIGH, fixed 4.17.21), +# CVE-2026-4800 (HIGH, fixed 4.18.0) +# minimist 1.2.5 — CVE-2021-44906 (CRITICAL, fixed 1.2.6 / 0.2.4) +# Those packages were transitive deps of cdxgen's OLDER (11.x) tree, bundled +# under usr/lib/node_modules/@cyclonedx/cdxgen/node_modules/{lodash,minimist}. +# Reproduced in a node:20.18.1-bookworm-slim container running this exact +# `npm install -g --omit=dev --omit=optional @cyclonedx/cdxgen@` command: +# - cdxgen 11.11.0 → ships node_modules/lodash + node_modules/minimist +# - cdxgen 12.3.3 → drops both packages entirely (clean tree) +# - cdxgen 12.5.1 → also clean (verified empty) +# So a CLEAN 12.3.3 build already lacks them. The CI failure was a STALE +# `type=gha,scope=worker` build-cache layer (cache-from/to in ci.yml +# image-scan) still serving the pre-12.x cdxgen install layer that DID hold +# lodash 4.17.19 / minimist 1.2.5. Bumping CDXGEN_VERSION changes this RUN's +# layer cache key (the version is interpolated into the command), forcing a +# fresh `npm install` that lands the lodash/minimist-free 12.5.1 tree — which +# busts the stale layer AND keeps us on a cdxgen line that never re-introduces +# those packages. No `.trivyignore` entry is added: suppressing a package that +# is not present in a clean build would be wrong (the gate must keep failing +# if a future cdxgen re-introduces a vulnerable lodash/minimist). +# +# Invocation compatibility: our only cdxgen call (apps/backend/integrations/ +# cdxgen.py — `cdxgen -r -o --spec-version 1.5 `) is unchanged +# across 12.3.3 → 12.5.1, and `engines.node` still lists `^20` so Node 20 LTS +# in this image remains supported. No SBOM-emit regression risk. 
+# # Install flags: # --omit=dev drops devDependencies (biome / poku / sinon / typescript) # so the image stays lean. Effective. @@ -269,7 +298,7 @@ RUN curl -fsSL https://deb.nodesource.com/setup_${NODE_MAJOR}.x | bash - \ # pipeline), drop --omit=optional and add corresponding # .trivyignore entries with reach analysis — do not # rollback the gate. -ENV CDXGEN_VERSION=12.3.3 +ENV CDXGEN_VERSION=12.5.1 RUN npm install -g --omit=dev --omit=optional "@cyclonedx/cdxgen@${CDXGEN_VERSION}" \ && cdxgen --version From a17e5fa545ce06f1b740458c6968f7f52ac2ab6a Mon Sep 17 00:00:00 2001 From: Haksung Jang Date: Sun, 14 Jun 2026 02:39:25 +0900 Subject: [PATCH 4/6] fix(ci): bump worker image-scan GHA cache scope to force a clean rebuild MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit image-scan kept HARD-failing on lodash 4.17.19 (CVE-2021-23337, CVE-2026-4800) and minimist 1.2.5 (CVE-2021-44906) even after the cdxgen 12.3.3→12.5.1 bump, which only rebuilt the cdxgen layer. A fresh local install of cdxgen 12.5.1 and of npm 11.14.1 — the image's only two npm-package installers — pulls neither package, and these CVEs were never in .trivyignore, yet image-scan passed on #404/#405. The vulnerable copies therefore live in a stale, earlier `scope=worker` cache layer (a non-deterministic npm-install resolution cached long ago), not in anything the current Dockerfile produces. Bumping the buildx GHA cache scope (worker → worker-v2) abandons the poisoned cache and forces a single clean rebuild; the new namespace caches the clean tree. Keeps the cdxgen 12.5.1 bump (latest 12.x, verified lodash/minimist-free). 
--- .github/workflows/ci.yml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cd96c067..d8d52f89 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -391,8 +391,16 @@ jobs: file: apps/backend/Dockerfile.worker tags: trustedoss/backend-worker:dev load: true - cache-from: type=gha,scope=worker - cache-to: type=gha,scope=worker,mode=max + # Cache scope bumped worker → worker-v2 (PR #406): the prior scope's + # cached npm-install layer carried a non-deterministically-resolved + # transitive tree (lodash 4.17.19 / minimist 1.2.5) that image-scan + # HARD-failed on. A fresh local install of the current cdxgen 12.5.1 + # AND npm 11.14.1 pulls neither package, so the vulnerable copies came + # from a stale cached layer, not the current Dockerfile. Bumping the + # scope abandons that poisoned cache and forces one clean rebuild; the + # new namespace then caches the clean tree. Bump again if it recurs. + cache-from: type=gha,scope=worker-v2 + cache-to: type=gha,scope=worker-v2,mode=max - name: Trivy scan (HARD FAIL on CRITICAL,HIGH) # Chore P (Phase 8 worker-image refresh): single combined gate. From 38894f62562efa35861a5a3eae10a945666f37a5 Mon Sep 17 00:00:00 2001 From: Haksung Jang Date: Sun, 14 Jun 2026 08:51:33 +0900 Subject: [PATCH 5/6] Revert "fix(ci): bump worker image-scan GHA cache scope to force a clean rebuild" This reverts commit a17e5fa545ce06f1b740458c6968f7f52ac2ab6a. 
--- .github/workflows/ci.yml | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d8d52f89..cd96c067 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -391,16 +391,8 @@ jobs: file: apps/backend/Dockerfile.worker tags: trustedoss/backend-worker:dev load: true - # Cache scope bumped worker → worker-v2 (PR #406): the prior scope's - # cached npm-install layer carried a non-deterministically-resolved - # transitive tree (lodash 4.17.19 / minimist 1.2.5) that image-scan - # HARD-failed on. A fresh local install of the current cdxgen 12.5.1 - # AND npm 11.14.1 pulls neither package, so the vulnerable copies came - # from a stale cached layer, not the current Dockerfile. Bumping the - # scope abandons that poisoned cache and forces one clean rebuild; the - # new namespace then caches the clean tree. Bump again if it recurs. - cache-from: type=gha,scope=worker-v2 - cache-to: type=gha,scope=worker-v2,mode=max + cache-from: type=gha,scope=worker + cache-to: type=gha,scope=worker,mode=max - name: Trivy scan (HARD FAIL on CRITICAL,HIGH) # Chore P (Phase 8 worker-image refresh): single combined gate. From dd9e8fc1975676ebb990c35ce2f775566e7f8342 Mon Sep 17 00:00:00 2001 From: Haksung Jang Date: Sun, 14 Jun 2026 08:51:33 +0900 Subject: [PATCH 6/6] =?UTF-8?q?Revert=20"fix(worker):=20bump=20cdxgen=2012?= =?UTF-8?q?.3.3=20=E2=86=92=2012.5.1=20to=20bust=20stale=20image-scan=20no?= =?UTF-8?q?de-pkg=20layer"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 20a3040c020336bb8b93202f40b38f8098a8b9f7. --- .trivyignore | 4 ++-- apps/backend/Dockerfile.worker | 33 ++------------------------------- 2 files changed, 4 insertions(+), 33 deletions(-) diff --git a/.trivyignore b/.trivyignore index e1c6b478..b60f79f1 100644 --- a/.trivyignore +++ b/.trivyignore @@ -44,7 +44,7 @@ # crypto are all outside this code path. 
# # ---------------------------------------------------------------------------- -# cdxgen 12.5.1 reach surface (single source of truth — keep in sync with +# cdxgen 12.3.3 reach surface (single source of truth — keep in sync with # apps/backend/integrations/cdxgen.py:107-115): # cmd = [ # "cdxgen", @@ -330,7 +330,7 @@ CVE-2026-26171 CVE-2026-33116 # --------------------------------------------------------------------------- -# Bundled Go-binary toolchain reach surface — cdxgen 12.5.1 + ORT 85.1.1 +# Bundled Go-binary toolchain reach surface — cdxgen 12.3.3 + ORT 85.1.1 # vendored Go executables (single source of truth — keep in sync with # apps/backend/integrations/cdxgen.py:107-115 and apps/backend/integrations/ # ort.py:128-139): diff --git a/apps/backend/Dockerfile.worker b/apps/backend/Dockerfile.worker index 199d4b67..09c3c970 100644 --- a/apps/backend/Dockerfile.worker +++ b/apps/backend/Dockerfile.worker @@ -231,7 +231,7 @@ RUN curl -fsSL https://deb.nodesource.com/setup_${NODE_MAJOR}.x | bash - \ && node --version \ && npm --version -# ---- cdxgen 12.5.1 (Apache-2.0). ------------------------------------------- +# ---- cdxgen 12.3.3 (Apache-2.0). ------------------------------------------- # CycloneDX SBOM generator, 30+ ecosystems. The 12.x line trims the 11.x # transitive deps tree (drops cacache / pacote / @npmcli/run-script and # the cross-spawn 7.0.3 chain underneath) — that's the ~23 HIGH npm CVEs @@ -239,35 +239,6 @@ RUN curl -fsSL https://deb.nodesource.com/setup_${NODE_MAJOR}.x | bash - \ # 12.x still supports Node 20 LTS via `engines.node: ^20 || ^22 || ^24 || ^25`. # We install globally so it lands on PATH for the celery worker. # -# PR #406 (image-scan node-pkg fix): bumped 12.3.3 → 12.5.1 (latest 12.x as -# of 2026-06). 
This is a ROOT-CAUSE removal, not a suppression, of three -# Trivy node-pkg findings the image-scan gate started failing on: -# lodash 4.17.19 — CVE-2021-23337 (HIGH, fixed 4.17.21), -# CVE-2026-4800 (HIGH, fixed 4.18.0) -# minimist 1.2.5 — CVE-2021-44906 (CRITICAL, fixed 1.2.6 / 0.2.4) -# Those packages were transitive deps of cdxgen's OLDER (11.x) tree, bundled -# under usr/lib/node_modules/@cyclonedx/cdxgen/node_modules/{lodash,minimist}. -# Reproduced in a node:20.18.1-bookworm-slim container running this exact -# `npm install -g --omit=dev --omit=optional @cyclonedx/cdxgen@` command: -# - cdxgen 11.11.0 → ships node_modules/lodash + node_modules/minimist -# - cdxgen 12.3.3 → drops both packages entirely (clean tree) -# - cdxgen 12.5.1 → also clean (verified empty) -# So a CLEAN 12.3.3 build already lacks them. The CI failure was a STALE -# `type=gha,scope=worker` build-cache layer (cache-from/to in ci.yml -# image-scan) still serving the pre-12.x cdxgen install layer that DID hold -# lodash 4.17.19 / minimist 1.2.5. Bumping CDXGEN_VERSION changes this RUN's -# layer cache key (the version is interpolated into the command), forcing a -# fresh `npm install` that lands the lodash/minimist-free 12.5.1 tree — which -# busts the stale layer AND keeps us on a cdxgen line that never re-introduces -# those packages. No `.trivyignore` entry is added: suppressing a package that -# is not present in a clean build would be wrong (the gate must keep failing -# if a future cdxgen re-introduces a vulnerable lodash/minimist). -# -# Invocation compatibility: our only cdxgen call (apps/backend/integrations/ -# cdxgen.py — `cdxgen -r -o --spec-version 1.5 `) is unchanged -# across 12.3.3 → 12.5.1, and `engines.node` still lists `^20` so Node 20 LTS -# in this image remains supported. No SBOM-emit regression risk. -# # Install flags: # --omit=dev drops devDependencies (biome / poku / sinon / typescript) # so the image stays lean. Effective. 
@@ -298,7 +269,7 @@ RUN curl -fsSL https://deb.nodesource.com/setup_${NODE_MAJOR}.x | bash - \ # pipeline), drop --omit=optional and add corresponding # .trivyignore entries with reach analysis — do not # rollback the gate. -ENV CDXGEN_VERSION=12.5.1 +ENV CDXGEN_VERSION=12.3.3 RUN npm install -g --omit=dev --omit=optional "@cyclonedx/cdxgen@${CDXGEN_VERSION}" \ && cdxgen --version