Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions apps/backend/api/v1/sbom.py
Original file line number Diff line number Diff line change
Expand Up @@ -687,7 +687,7 @@ async def download_sbom_signature_bundle_endpoint(
"/projects/{project_id}/sbom-ingest",
response_model=ScanPublic,
status_code=status.HTTP_202_ACCEPTED,
summary="Ingest an external CycloneDX SBOM (queues a Celery task; returns 202 Accepted)",
summary="Ingest an external CycloneDX or SPDX SBOM (queues a Celery task; returns 202)",
responses={
202: {
"description": "SBOM accepted; a queued scan row is returned.",
Expand Down Expand Up @@ -717,14 +717,14 @@ async def download_sbom_signature_bundle_endpoint(
"content": {"application/problem+json": {}},
},
415: {
"description": "Upload is not a CycloneDX JSON media type. RFC 7807.",
"description": "Upload is not a CycloneDX or SPDX media type. RFC 7807.",
"content": {"application/problem+json": {}},
},
422: {
"description": (
"Upload is not a valid / supported CycloneDX document (not JSON, "
"wrong bomFormat, unsupported specVersion, too many components). "
"RFC 7807."
"Upload is not a valid / supported SBOM document not CycloneDX-JSON "
"or SPDX (JSON / Tag-Value), wrong bomFormat, unsupported specVersion, "
"too many components/packages, or too deeply nested. RFC 7807."
),
"content": {"application/problem+json": {}},
},
Expand Down Expand Up @@ -754,7 +754,11 @@ async def ingest_sbom_endpoint(
project_id: uuid.UUID,
sbom: UploadFile = File(
...,
description="A CycloneDX JSON SBOM document (.json / .cdx.json).",
description=(
"A CycloneDX-JSON (.json / .cdx.json) or SPDX (.spdx / .spdx.json / "
".tag) SBOM document. Trivy auto-detects the format for CVE matching; "
"SPDX is mapped to CycloneDX for component persistence."
),
),
ref: str | None = Form(
default=None,
Expand Down
138 changes: 115 additions & 23 deletions apps/backend/services/sbom_ingest_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
External SBOM (CycloneDX / SPDX) ingest — synchronous validation + scan-row creation.

This is the synchronous *front half* of the SBOM-ingest feature: it accepts an
uploaded CycloneDX JSON document, validates it adversarially, persists a
``kind="sbom"`` :class:`~models.scan.Scan` row, writes the validated SBOM to a
durable on-disk location, and enqueues the Celery task that does the heavy work
(``tasks.ingest_sbom.ingest_sbom_task``). The endpoint returns ``202 Accepted``
uploaded CycloneDX-JSON or SPDX (JSON / Tag-Value) document, validates it
adversarially, persists a ``kind="sbom"`` :class:`~models.scan.Scan` row, writes
the original SBOM bytes to a durable on-disk location, and enqueues the Celery
task that does the heavy work (``tasks.ingest_sbom.ingest_sbom_task`` — which
maps SPDX → CycloneDX for component persistence and hands the original file to
Trivy, which auto-detects the format). The endpoint returns ``202 Accepted``
with the queued scan row — never the result (CLAUDE.md core rule #3).

This endpoint is NOT the Dependency-Track ``/api/v1/bom`` + ``X-Api-Key`` BOM
Expand All @@ -21,18 +23,20 @@
fast-fails on a declared ``Content-Length`` over the cap before reading a
single body byte (mirrors the source-archive endpoint).

- **Content-Type / filename allow-list.** Only JSON-ish media types and a
``.json`` / ``.cdx.json`` filename are accepted (415 otherwise). The header
is advisory; the JSON parse + CycloneDX structure check are authoritative.

- **Structural whitelist, NO deep traversal.** We parse the JSON and check
only the TOP-LEVEL keys: ``bomFormat == "CycloneDX"``, ``specVersion`` in a
known set, and (when present) ``components`` is a list whose ``len`` is
within ``sbom_ingest_max_components()`` (default 50,000). We deliberately do
NOT recurse into the component elements here — a deeply-nested hostile
document cannot drive our validation into a recursion / CPU blow-up. The
authoritative deep parse happens later, inside the Celery worker
(``persist_sbom_components``), off the request path.
- **Content-Type / filename allow-list.** Only JSON-ish / SPDX media types and
a ``.json`` / ``.cdx.json`` / ``.spdx`` / ``.tag`` filename are accepted (415
otherwise). The header is advisory; the content sniff + structure check are
authoritative.

- **Structural whitelist, NO deep traversal.** ``validate_uploaded_sbom``
detects the format (CycloneDX-JSON or SPDX JSON/Tag-Value) and checks only
TOP-LEVEL shape: for CycloneDX, ``bomFormat``, ``specVersion`` in a known
set, and a bounded ``components`` array; for SPDX-JSON, a bounded
``packages`` array. A byte-level depth pre-check runs BEFORE any
``json.loads`` so a deeply-nested hostile document is a clean 422, never a
decoder ``RecursionError`` (500). We deliberately do NOT recurse into element
bodies here — the authoritative deep parse / SPDX→CycloneDX mapping happens
later, inside the Celery worker, off the request path.

CLAUDE.md compliance:
- Core rule #11: every limit is read via ``os.getenv`` at call time (through
Expand All @@ -46,6 +50,7 @@
from __future__ import annotations

import json
import re
import uuid
from pathlib import Path
from typing import Any
Expand All @@ -63,6 +68,12 @@
)
from core.security import CurrentUser
from models import Scan
from services.sbom_conformance import (
FORMAT_CYCLONEDX,
FORMAT_SPDX_JSON,
FORMAT_SPDX_TV,
detect_format,
)
from services.scan_service import (
ScanEnqueueFailed,
ScanInProgressConflict,
Expand All @@ -83,6 +94,13 @@
{
"application/json",
"application/vnd.cyclonedx+json",
# SPDX media types (model 3 SPDX-input support). SPDX-JSON carriers use
# application/spdx+json or application/json; SPDX Tag-Value uploads use
# text/spdx (or omit the part content-type, or carry a .spdx/.tag
# filename — see the filename allow-list). We deliberately do NOT add the
# over-broad text/plain: a .txt with text/plain stays a 415.
"application/spdx+json",
"text/spdx",
"application/octet-stream",
"", # some CLIs omit the part content-type
}
Expand Down Expand Up @@ -224,7 +242,12 @@ def _validate_content_type(*, content_type: str | None, filename: str | None) ->
"""
normalized_ct = (content_type or "").lower().split(";", 1)[0].strip()
name = (filename or "").strip().lower()
name_ok = name.endswith(".json") or name.endswith(".cdx.json")
# CycloneDX JSON (.json / .cdx.json) or SPDX (.spdx / .spdx.json / .tag /
# .spdx.tag). The authoritative format gate is the content sniff in
# validate_uploaded_sbom; this is the advisory fast-fail.
name_ok = name.endswith(
(".json", ".cdx.json", ".spdx", ".spdx.json", ".tag", ".spdx.tag")
)
if normalized_ct in _ALLOWED_CONTENT_TYPES:
return
if name_ok:
Expand All @@ -236,7 +259,7 @@ def _validate_content_type(*, content_type: str | None, filename: str | None) ->
)
raise SbomIngestUnsupportedType(
f"content-type {normalized_ct!r} (filename {name!r}) is not an accepted "
"CycloneDX JSON media type"
"CycloneDX or SPDX media type"
)


Expand Down Expand Up @@ -338,6 +361,71 @@ def validate_cyclonedx_document(raw: bytes) -> dict[str, Any]:
return parsed


def _validate_spdx_json_packages(doc: dict[str, Any]) -> None:
"""Bound the SPDX-JSON ``packages`` array (the SPDX analogue of the
CycloneDX ``components`` cap) so a huge document is rejected up front."""
packages = doc.get("packages")
if packages is None:
return
if not isinstance(packages, list):
raise SbomIngestInvalid("SPDX 'packages' must be a JSON array when present")
max_packages = sbom_ingest_max_components()
if len(packages) > max_packages:
raise SbomIngestInvalid(
f"SBOM declares {len(packages)} packages; the maximum is {max_packages}"
)


def validate_uploaded_sbom(raw: bytes) -> str:
    """Validate an uploaded SBOM of any supported format; return the format tag.

    Accepts CycloneDX-JSON and SPDX (JSON / Tag-Value). Raises
    :class:`SbomIngestInvalid` (422) for anything else, mirroring the per-format
    handling of :mod:`services.sbom_conformance` / :mod:`services.sbom_convert`.

    Adversarial-input contract: the O(n) byte-depth pre-check runs FIRST, before
    any ``json.loads`` — including the one inside ``detect_format`` — so a
    maliciously deep JSON document is rejected as a clean 422 and can never drive
    the stdlib decoder into a ``RecursionError`` (which would escape as a 500).
    Total size is already bounded by the read cap applied upstream; deep parsing
    of the (now format-confirmed) document still happens later in the worker, off
    the request path.

    Args:
        raw: The complete uploaded document bytes (already size-capped upstream).

    Returns:
        The format tag reported by ``detect_format`` (one of
        ``FORMAT_CYCLONEDX``, ``FORMAT_SPDX_JSON``, ``FORMAT_SPDX_TV``).

    Raises:
        SbomIngestInvalid: The document nests too deeply, fails the per-format
            structural check, exceeds the component/package ceiling, or is not
            a recognised SBOM format at all.
    """
    depth = _max_nesting_depth(raw)
    if depth > _MAX_NESTING_DEPTH:
        raise SbomIngestInvalid(
            f"SBOM nesting depth {depth} exceeds the maximum {_MAX_NESTING_DEPTH}"
        )

    fmt, doc = detect_format(raw)
    if fmt == FORMAT_CYCLONEDX:
        validate_cyclonedx_document(raw)
        return fmt
    if fmt == FORMAT_SPDX_JSON:
        # ``doc`` is the parsed SPDX-JSON object (detect_format already decoded
        # it under the depth guard above).
        _validate_spdx_json_packages(doc or {})
        return fmt
    if fmt == FORMAT_SPDX_TV:
        # Tag-Value is line-oriented (no recursion surface), but the read cap
        # bounds BYTES, not package COUNT — a 32 MiB file of 14-byte
        # ``PackageName:`` lines is ~2.4M packages, far past the JSON cap. Bound
        # the count here so every format enforces the same component ceiling
        # (the worker would otherwise drive millions of upserts). The anchored
        # regex is linear (security review: no ReDoS).
        #
        # Count lazily over the raw bytes: no decoded copy of the upload and no
        # list of millions of match strings is built on the request path. The
        # tag and the newline are plain ASCII, which UTF-8 decoding (even with
        # errors="replace") neither creates nor removes, so the byte-level count
        # equals the count on the decoded text.
        package_count = sum(1 for _ in re.finditer(rb"(?m)^PackageName:", raw))
        max_packages = sbom_ingest_max_components()
        if package_count > max_packages:
            raise SbomIngestInvalid(
                f"SBOM declares {package_count} packages; the maximum is {max_packages}"
            )
        return fmt
    raise SbomIngestInvalid(
        "upload is not a CycloneDX or SPDX (JSON / Tag-Value) document"
    )


# ---------------------------------------------------------------------------
# Ingest — validate + persist scan row + write file + enqueue
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -386,9 +474,10 @@ async def ingest_sbom(
1. ``prepare_scan_target`` — existence/team-access (404/403), project-scoped
API-key boundary (403), archived (409), per-team concurrency cap (429).
Reuses ``trigger_scan``'s exact guard sequence + exceptions.
2. Request validation — Content-Type/filename (415), size cap (413), JSON +
CycloneDX structure (422). Runs AFTER the authz/state guards so a
non-member learns nothing about a project from a malformed body.
2. Request validation — Content-Type/filename (415), size cap (413),
CycloneDX-JSON or SPDX (JSON/Tag-Value) structure (422). Runs AFTER the
authz/state guards so a non-member learns nothing about a project from a
malformed body.
3. INSERT the scan row, flush. The partial unique index
``ix_scans_project_active`` makes this the atomic concurrency check: a
second in-flight scan for the project raises :class:`ScanInProgressConflict`
Expand Down Expand Up @@ -427,8 +516,11 @@ async def ingest_sbom(
# ---- 2. request validation (untrusted input) -----------------------------
_validate_content_type(content_type=upload.content_type, filename=upload.filename)
raw = await _read_bounded(upload, max_bytes=sbom_ingest_max_bytes())
# Structural whitelist; never deep-traverses component elements.
validate_cyclonedx_document(raw)
# Format-dispatched structural whitelist (CycloneDX-JSON or SPDX JSON/TV).
# Depth-guarded; never deep-traverses element bodies (that runs in the
# worker). The original bytes are stored as-is and Trivy auto-detects the
# format; the worker maps SPDX → CycloneDX for component persistence.
validate_uploaded_sbom(raw)

original_filename = _clean_meta_text(upload.filename)
normalized_release = _clean_meta_text(release)
Expand Down
40 changes: 26 additions & 14 deletions apps/backend/tasks/ingest_sbom.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@

from __future__ import annotations

import json
import shutil
import uuid
from pathlib import Path
Expand All @@ -63,6 +62,7 @@
)
from models import Project, SbomConformance, Scan
from services import sbom_conformance
from services.sbom_convert import UnsupportedSbomFormat, to_cyclonedx
from services.vulnerability_matching import persist_trivy_findings
from tasks._progress import (
close_log_file,
Expand Down Expand Up @@ -384,12 +384,19 @@ def _persist_conformance(


def _load_uploaded_sbom(scan_metadata: dict[str, Any]) -> tuple[Path, dict[str, Any]]:
"""Resolve, containment-check, and parse the uploaded CycloneDX SBOM.

Returns ``(sbom_path, parsed_dict)``. Raises :class:`_IngestAborted` on a
missing path key, a path that resolves outside ``workspace_root()``, an
absent file, or invalid JSON. This is a minimal defensive backstop — the
synchronous service Pass owns the authoritative CycloneDX schema validation.
"""Resolve, containment-check, and NORMALISE the uploaded SBOM to CycloneDX.

Returns ``(sbom_path, cyclonedx_dict)``. ``sbom_path`` is the ORIGINAL
uploaded file (CycloneDX **or** SPDX) — Trivy reads it directly (``trivy
sbom`` auto-detects both formats by content), so no converted file is ever
written to disk. The returned dict is always CycloneDX-shaped: an SPDX
upload is mapped by :func:`services.sbom_convert.to_cyclonedx` so
``persist_sbom_components`` consumes one shape for every input format.

Raises :class:`_IngestAborted` on a missing path key, a path that resolves
outside ``workspace_root()``, an absent/unreadable file, or content that is
neither CycloneDX-JSON nor SPDX (JSON / Tag-Value). The synchronous service
Pass owns the authoritative up-front validation; this is a defensive backstop.
"""
raw_path = scan_metadata.get("sbom_path")
if not raw_path or not isinstance(raw_path, str):
Expand All @@ -412,12 +419,17 @@ def _load_uploaded_sbom(scan_metadata: dict[str, Any]) -> tuple[Path, dict[str,
raise _IngestAborted(f"SBOM file not found: {candidate}")

try:
with candidate.open("rb") as fh:
parsed = json.loads(fh.read())
except (OSError, ValueError) as exc:
raise _IngestAborted(f"SBOM file is not valid JSON: {exc}") from exc
raw = candidate.read_bytes()
except OSError as exc:
raise _IngestAborted(f"SBOM file could not be read: {exc}") from exc

if not isinstance(parsed, dict):
raise _IngestAborted("SBOM document is not a JSON object")
# Normalise to a CycloneDX dict for component persistence. CycloneDX passes
# through; SPDX (JSON / Tag-Value) is mapped. RDF/XML/junk raises
# UnsupportedSbomFormat → terminal abort (the sync service already rejected
# these up front, so reaching here means a tampered/garbled stored file).
try:
cyclonedx = to_cyclonedx(raw)
except UnsupportedSbomFormat as exc:
raise _IngestAborted(f"SBOM is not CycloneDX or SPDX: {exc}") from exc

return candidate, parsed
return candidate, cyclonedx
65 changes: 61 additions & 4 deletions apps/backend/tests/integration/scan/test_ingest_sbom_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,17 +124,18 @@ def _fake_run(


def _seed_queued_sbom_scan(
workspace: Path, *, ref: str | None = None
workspace: Path, *, ref: str | None = None, sbom_src: Path | None = None
) -> tuple[uuid.UUID, uuid.UUID]:
"""Seed project + queued sbom scan and write the realistic SBOM to its
durable on-disk ingest path. Returns (scan_id, project_id)."""
"""Seed project + queued sbom scan and write the SBOM to its durable on-disk
ingest path. Defaults to the realistic CycloneDX fixture; ``sbom_src`` points
at a different fixture (e.g. a real SPDX document). Returns (scan_id, project_id)."""
import asyncio

from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine

from core.config import database_url

sbom_bytes = (FIXTURES / "realistic.cdx.json").read_bytes()
sbom_bytes = (sbom_src or (FIXTURES / "realistic.cdx.json")).read_bytes()

async def _build() -> tuple[uuid.UUID, uuid.UUID]:
engine = create_async_engine(database_url(), pool_pre_ping=True, future=True)
Expand Down Expand Up @@ -426,3 +427,59 @@ def test_ingest_rerun_replaces_conformance_verdict(
assert len(rows) == 1, "re-entry must REPLACE the verdict, not duplicate it"
assert rows[0].id != first_id, "the verdict row was re-created (delete-then-insert)"
assert rows[0].result == "warn"


# ---------------------------------------------------------------------------
# SPDX input — a real syft SPDX-JSON document ingests (SPDX→CycloneDX mapping)
# ---------------------------------------------------------------------------

# Directory of SBOM fixtures under BACKEND_ROOT (tests/fixtures/sbom/); holds
# the real syft SPDX documents recorded in PR1 that the SPDX test below reads.
_SBOM_FIXTURES = BACKEND_ROOT / "tests" / "fixtures" / "sbom"


def test_ingest_spdx_json_persists_components_and_conformance(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path, sync_session: Session
) -> None:
    """End-to-end ingest of a real SPDX-JSON upload through the Celery task.

    Verifies three outcomes: the scan succeeds with ScanComponent rows mapped
    out of the SPDX packages, the conformance verdict is tagged
    ``source_format='spdx-json'``, and the stored artifact carries the shared
    ``sbom_cyclonedx`` kind. Trivy itself is stubbed from a fixture report, so
    this exercises the SPDX→component path rather than CVE matching.
    """
    monkeypatch.setenv("WORKSPACE_HOST_PATH", str(tmp_path))
    _stub_trivy_from_fixture(monkeypatch)

    spdx_fixture = _SBOM_FIXTURES / "real_spdx.json"
    scan_id, _project_id = _seed_queued_sbom_scan(tmp_path, sbom_src=spdx_fixture)

    # Imported late, after the workspace env var and the Trivy stub are in place.
    from tasks.ingest_sbom import ingest_sbom_task

    outcome = ingest_sbom_task.apply(args=[str(scan_id)])
    assert outcome.successful(), f"task failed: {outcome.traceback}"

    # Drop cached ORM state so the assertions read what the task committed.
    sync_session.expire_all()
    scan_query = select(Scan).where(Scan.id == scan_id)
    persisted_scan = sync_session.execute(scan_query).scalar_one()
    assert persisted_scan.status == "succeeded"

    # SPDX packages must have been mapped to persisted component rows.
    component_query = select(ScanComponent).where(ScanComponent.scan_id == scan_id)
    persisted_components = list(sync_session.execute(component_query).scalars())
    assert persisted_components, "SPDX packages must map to persisted ScanComponent rows"

    # The verdict is scored on the original document, so it is tagged SPDX-JSON.
    conformance_query = select(SbomConformance).where(SbomConformance.scan_id == scan_id)
    conformance = sync_session.execute(conformance_query).scalar_one()
    assert conformance.source_format == "spdx-json"

    # The durable original SBOM is preserved; its bytes are SPDX, but the
    # download artifact kind is the shared 'sbom_cyclonedx' label.
    artifact_query = select(ScanArtifact).where(ScanArtifact.scan_id == scan_id)
    artifact_kinds = set()
    for artifact in sync_session.execute(artifact_query).scalars():
        artifact_kinds.add(artifact.kind)
    assert "sbom_cyclonedx" in artifact_kinds
Loading
Loading