From 2ec4d47c48be706e4410cbc1547c58a7ae2b9617 Mon Sep 17 00:00:00 2001 From: shaiananvari8 <228813044+shaiananvari8@users.noreply.github.com> Date: Wed, 3 Jun 2026 21:49:47 -0500 Subject: [PATCH] Add data code hosting manifest guard --- data-code-hosting-manifest-guard/package.json | 11 ++ data-code-hosting-manifest-guard/readme.md | 26 +++ .../reports/manifest-guard-packet.json | 123 ++++++++++++ .../reports/manifest-guard-report.md | 33 ++++ .../reports/summary.svg | 11 ++ .../scripts/demo.js | 92 +++++++++ data-code-hosting-manifest-guard/src/index.js | 183 ++++++++++++++++++ .../test/index.test.js | 84 ++++++++ 8 files changed, 563 insertions(+) create mode 100644 data-code-hosting-manifest-guard/package.json create mode 100644 data-code-hosting-manifest-guard/readme.md create mode 100644 data-code-hosting-manifest-guard/reports/manifest-guard-packet.json create mode 100644 data-code-hosting-manifest-guard/reports/manifest-guard-report.md create mode 100644 data-code-hosting-manifest-guard/reports/summary.svg create mode 100644 data-code-hosting-manifest-guard/scripts/demo.js create mode 100644 data-code-hosting-manifest-guard/src/index.js create mode 100644 data-code-hosting-manifest-guard/test/index.test.js diff --git a/data-code-hosting-manifest-guard/package.json b/data-code-hosting-manifest-guard/package.json new file mode 100644 index 00000000..22c04385 --- /dev/null +++ b/data-code-hosting-manifest-guard/package.json @@ -0,0 +1,11 @@ +{ + "name": "data-code-hosting-manifest-guard", + "version": "1.0.0", + "private": true, + "type": "module", + "scripts": { + "check": "node --check src/index.js && node --check scripts/demo.js && node --check test/index.test.js", + "test": "node --test test/index.test.js", + "demo": "node scripts/demo.js" + } +} diff --git a/data-code-hosting-manifest-guard/readme.md b/data-code-hosting-manifest-guard/readme.md new file mode 100644 index 00000000..c2fa9062 --- /dev/null +++ b/data-code-hosting-manifest-guard/readme.md @@ 
-0,0 +1,26 @@ +# Data and Code Hosting Manifest Guard + +This module is a focused implementation slice for SCIBASE issue #14, Scientific/Engineering Data & Code Hosting. + +It evaluates a scientific project repository manifest before a data/code release is published. The guard checks: + +- FAIR-style metadata presence, including DOI, license, authors, keywords, and schema version. +- Dataset and executable artifact registration. +- SHA-256 artifact hash locks. +- Dataset metadata schemas and code runtime declarations. +- Visibility, embargo release dates, and access policy shape. +- Executable environment definitions. +- Reproducibility commands and output digests. +- Versioning strategy, release tag, and diffable review paths. + +The implementation is dependency-free and uses synthetic manifests only. It does not call external services, read private projects, mutate repositories, issue DOIs, upload files, or contact storage providers. + +## Commands + +```bash +npm run check +npm test +npm run demo +``` + +`npm run demo` writes a JSON packet, Markdown report, and SVG summary under `reports/`. 
diff --git a/data-code-hosting-manifest-guard/reports/manifest-guard-packet.json b/data-code-hosting-manifest-guard/reports/manifest-guard-packet.json new file mode 100644 index 00000000..9829b697 --- /dev/null +++ b/data-code-hosting-manifest-guard/reports/manifest-guard-packet.json @@ -0,0 +1,123 @@ +{ + "generatedAt": "2026-06-04T00:00:00.000Z", + "readyResult": { + "status": "ready_for_repository_release", + "summary": { + "artifactCount": 2, + "datasetCount": 1, + "codeCount": 1, + "blockerCount": 0, + "warningCount": 0 + }, + "blockers": [], + "warnings": [], + "auditDigest": "b00f635cbe73f436a63d6338a7b6fe87dad6a923c75943df5d0e2b33c704e44a" + }, + "blockedResult": { + "status": "block_publication", + "summary": { + "artifactCount": 1, + "datasetCount": 1, + "codeCount": 0, + "blockerCount": 10, + "warningCount": 5 + }, + "blockers": [ + { + "code": "missing_metadata", + "message": "Required metadata field 'authors' is missing.", + "evidence": "authors" + }, + { + "code": "missing_metadata", + "message": "Required metadata field 'license' is missing.", + "evidence": "license" + }, + { + "code": "missing_metadata", + "message": "Required metadata field 'doi' is missing.", + "evidence": "doi" + }, + { + "code": "missing_metadata", + "message": "Required metadata field 'keywords' is missing.", + "evidence": "keywords" + }, + { + "code": "missing_metadata", + "message": "Required metadata field 'schemaVersion' is missing.", + "evidence": "schemaVersion" + }, + { + "code": "invalid_artifact_hash", + "message": "Artifact 'unhashed-export' does not contain a valid SHA-256 hash lock.", + "evidence": "missing" + }, + { + "code": "embargo_without_release_date", + "message": "Embargoed repositories need an explicit release date.", + "evidence": { + "visibility": "embargoed" + } + }, + { + "code": "missing_execution_environment", + "message": "Executable repository hosting needs a Dockerfile, environment.yml, notebook kernel, or equivalent runtime definition.", + 
"evidence": { + "kind": "docker", + "definition": "" + } + }, + { + "code": "missing_reproducibility_commands", + "message": "Repository needs at least one command that reproduces or verifies the hosted outputs.", + "evidence": { + "commands": [] + } + }, + { + "code": "missing_versioning_policy", + "message": "Repository needs explicit versioning strategy and current tag metadata.", + "evidence": { + "strategy": "", + "currentTag": "" + } + } + ], + "warnings": [ + { + "code": "missing_code_artifact", + "message": "No executable code, notebook, package, or model artifact is registered.", + "evidence": [ + "dataset" + ] + }, + { + "code": "dataset_schema_missing", + "message": "Dataset 'unhashed-export' does not declare a metadata schema.", + "evidence": "unhashed-export" + }, + { + "code": "docker_definition_unclear", + "message": "Docker runtime is declared without a Dockerfile-style definition pointer.", + "evidence": "" + }, + { + "code": "missing_output_digest", + "message": "Reproducibility command has no expected output digest.", + "evidence": { + "commands": [] + } + }, + { + "code": "missing_diffable_paths", + "message": "No diffable paths are declared for dataset/code review.", + "evidence": { + "strategy": "", + "currentTag": "" + } + } + ], + "auditDigest": "777f3022d220fed8225ca1f9586d09274e35b149e2cd764a753769c72c8dd951" + } +} diff --git a/data-code-hosting-manifest-guard/reports/manifest-guard-report.md b/data-code-hosting-manifest-guard/reports/manifest-guard-report.md new file mode 100644 index 00000000..d9f7ee24 --- /dev/null +++ b/data-code-hosting-manifest-guard/reports/manifest-guard-report.md @@ -0,0 +1,33 @@ +# Data and Code Hosting Manifest Guard + +Status: block_publication +Audit digest: 777f3022d220fed8225ca1f9586d09274e35b149e2cd764a753769c72c8dd951 + +## Summary + +- Artifacts: 1 +- Dataset artifacts: 1 +- Code artifacts: 0 +- Blockers: 10 +- Warnings: 5 + +## Blockers + +- missing_metadata: Required metadata field 'authors' is 
missing. +- missing_metadata: Required metadata field 'license' is missing. +- missing_metadata: Required metadata field 'doi' is missing. +- missing_metadata: Required metadata field 'keywords' is missing. +- missing_metadata: Required metadata field 'schemaVersion' is missing. +- invalid_artifact_hash: Artifact 'unhashed-export' does not contain a valid SHA-256 hash lock. +- embargo_without_release_date: Embargoed repositories need an explicit release date. +- missing_execution_environment: Executable repository hosting needs a Dockerfile, environment.yml, notebook kernel, or equivalent runtime definition. +- missing_reproducibility_commands: Repository needs at least one command that reproduces or verifies the hosted outputs. +- missing_versioning_policy: Repository needs explicit versioning strategy and current tag metadata. + +## Warnings + +- missing_code_artifact: No executable code, notebook, package, or model artifact is registered. +- dataset_schema_missing: Dataset 'unhashed-export' does not declare a metadata schema. +- docker_definition_unclear: Docker runtime is declared without a Dockerfile-style definition pointer. +- missing_output_digest: Reproducibility command has no expected output digest. +- missing_diffable_paths: No diffable paths are declared for dataset/code review. diff --git a/data-code-hosting-manifest-guard/reports/summary.svg b/data-code-hosting-manifest-guard/reports/summary.svg new file mode 100644 index 00000000..6120056c --- /dev/null +++ b/data-code-hosting-manifest-guard/reports/summary.svg @@ -0,0 +1,11 @@ + + + + Data and Code Hosting Manifest Guard + Status + + block_publication + Artifacts 1 | Datasets 1 | Code 0 + Blockers 10 | Warnings 5 + Audit 777f3022d220fed8225ca1f9... 
+ diff --git a/data-code-hosting-manifest-guard/scripts/demo.js b/data-code-hosting-manifest-guard/scripts/demo.js new file mode 100644 index 00000000..b41c46fd --- /dev/null +++ b/data-code-hosting-manifest-guard/scripts/demo.js @@ -0,0 +1,92 @@ +import { mkdir, writeFile } from "node:fs/promises"; +import { createSampleManifest, evaluateRepositoryManifest } from "../src/index.js"; + +const reportDir = new URL("../reports/", import.meta.url); + +function escapeXml(value) { + return String(value) + .replaceAll("&", "&") + .replaceAll("<", "<") + .replaceAll(">", ">") + .replaceAll('"', """); +} + +function renderMarkdown(result) { + const lines = [ + "# Data and Code Hosting Manifest Guard", + "", + `Status: ${result.status}`, + `Audit digest: ${result.auditDigest}`, + "", + "## Summary", + "", + `- Artifacts: ${result.summary.artifactCount}`, + `- Dataset artifacts: ${result.summary.datasetCount}`, + `- Code artifacts: ${result.summary.codeCount}`, + `- Blockers: ${result.summary.blockerCount}`, + `- Warnings: ${result.summary.warningCount}`, + "", + "## Blockers", + "" + ]; + + lines.push(...(result.blockers.length ? result.blockers.map((finding) => `- ${finding.code}: ${finding.message}`) : ["- None"])); + lines.push("", "## Warnings", ""); + lines.push(...(result.warnings.length ? result.warnings.map((finding) => `- ${finding.code}: ${finding.message}`) : ["- None"])); + lines.push(""); + return lines.join("\n"); +} + +function renderSvg(result) { + const statusColor = result.status === "ready_for_repository_release" ? "#15803d" : result.status === "needs_metadata_review" ? "#a16207" : "#b91c1c"; + return ` + + + Data and Code Hosting Manifest Guard + Status + + ${escapeXml(result.status)} + Artifacts ${result.summary.artifactCount} | Datasets ${result.summary.datasetCount} | Code ${result.summary.codeCount} + Blockers ${result.summary.blockerCount} | Warnings ${result.summary.warningCount} + Audit ${escapeXml(result.auditDigest.slice(0, 24))}... 
+ +`; +} + +await mkdir(reportDir, { recursive: true }); + +const readyResult = evaluateRepositoryManifest(createSampleManifest()); +const blockedResult = evaluateRepositoryManifest({ + metadata: { title: "Private lab dump" }, + artifacts: [ + { + id: "unhashed-export", + type: "dataset", + format: "xlsx", + sha256: "missing", + version: "draft", + license: "custom-lab-license" + } + ], + access: { visibility: "embargoed" }, + environment: { kind: "docker", definition: "" }, + reproducibility: { commands: [] }, + versioning: { strategy: "", currentTag: "" } +}); + +const packet = { + generatedAt: new Date("2026-06-04T00:00:00.000Z").toISOString(), + readyResult, + blockedResult +}; + +await writeFile(new URL("manifest-guard-packet.json", reportDir), `${JSON.stringify(packet, null, 2)}\n`); +await writeFile(new URL("manifest-guard-report.md", reportDir), renderMarkdown(blockedResult)); +await writeFile(new URL("summary.svg", reportDir), renderSvg(blockedResult)); + +console.log(JSON.stringify({ + status: blockedResult.status, + blockers: blockedResult.summary.blockerCount, + warnings: blockedResult.summary.warningCount, + auditDigest: blockedResult.auditDigest +}, null, 2)); diff --git a/data-code-hosting-manifest-guard/src/index.js b/data-code-hosting-manifest-guard/src/index.js new file mode 100644 index 00000000..1a03274d --- /dev/null +++ b/data-code-hosting-manifest-guard/src/index.js @@ -0,0 +1,183 @@ +import { createHash } from "node:crypto"; + +const REQUIRED_METADATA = ["title", "authors", "license", "doi", "keywords", "schemaVersion"]; +const REQUIRED_ARTIFACT_FIELDS = ["id", "type", "format", "sha256", "version", "license"]; +const DATASET_TYPES = new Set(["dataset", "supplement", "instrument-output"]); +const CODE_TYPES = new Set(["script", "notebook", "package", "model"]); +const ALLOWED_VISIBILITY = new Set(["public", "private", "institutional", "embargoed"]); +const REUSABLE_LICENSES = new Set(["CC-BY-4.0", "CC0-1.0", "MIT", "Apache-2.0", 
"BSD-3-Clause"]); + +function hasText(value) { + return typeof value === "string" && value.trim().length > 0; +} + +function hasArray(value) { + return Array.isArray(value) && value.length > 0; +} + +function addFinding(collection, code, message, evidence) { + collection.push({ code, message, evidence }); +} + +function normalizeArtifacts(manifest) { /* returns manifest.artifacts when it is an array, otherwise [] */ + return Array.isArray(manifest?.artifacts) ? manifest.artifacts : []; /* ?. so a null/undefined manifest yields [] instead of throwing, matching the manifest?.x reads in evaluateRepositoryManifest */ +} + +function digestPayload(payload) { + return createHash("sha256").update(JSON.stringify(payload)).digest("hex"); +} + +export function evaluateRepositoryManifest(manifest, options = {}) { + const blockers = []; + const warnings = []; + const metadata = manifest?.metadata ?? {}; + const artifacts = normalizeArtifacts(manifest); + const datasetCount = artifacts.filter((artifact) => DATASET_TYPES.has(artifact.type)).length; + const codeCount = artifacts.filter((artifact) => CODE_TYPES.has(artifact.type)).length; + + for (const field of REQUIRED_METADATA) { + const value = metadata[field]; + const present = Array.isArray(value) ? 
hasArray(value) : hasText(value); + if (!present) { + addFinding(blockers, "missing_metadata", `Required metadata field '${field}' is missing.`, field); + } + } + + if (metadata.license && !REUSABLE_LICENSES.has(metadata.license)) { + addFinding(warnings, "weak_reuse_license", "Repository license is not in the preferred reusable-license set.", metadata.license); + } + + if (!hasArray(artifacts)) { + addFinding(blockers, "missing_artifacts", "At least one data, code, notebook, model, or supplementary artifact must be registered.", "artifacts"); + } + + if (datasetCount === 0) { + addFinding(blockers, "missing_dataset_artifact", "Scientific data hosting requires at least one registered dataset-like artifact.", artifacts.map((artifact) => artifact.type)); + } + + if (codeCount === 0) { + addFinding(warnings, "missing_code_artifact", "No executable code, notebook, package, or model artifact is registered.", artifacts.map((artifact) => artifact.type)); + } + + for (const artifact of artifacts) { + for (const field of REQUIRED_ARTIFACT_FIELDS) { + if (!hasText(artifact[field])) { + addFinding(blockers, "incomplete_artifact_manifest", `Artifact '${artifact.id ?? "unknown"}' is missing '${field}'.`, artifact); + } + } + + if (artifact.sha256 && !/^[a-f0-9]{64}$/i.test(artifact.sha256)) { + addFinding(blockers, "invalid_artifact_hash", `Artifact '${artifact.id}' does not contain a valid SHA-256 hash lock.`, artifact.sha256); + } + + if (artifact.type === "dataset" && !hasText(artifact.metadataSchema)) { + addFinding(warnings, "dataset_schema_missing", `Dataset '${artifact.id}' does not declare a metadata schema.`, artifact.id); + } + + if (CODE_TYPES.has(artifact.type) && !hasText(artifact.runtime)) { + addFinding(warnings, "runtime_missing", `Executable artifact '${artifact.id}' does not declare its runtime.`, artifact.id); + } + } + + const access = manifest?.access ?? 
{}; + if (!ALLOWED_VISIBILITY.has(access.visibility)) { + addFinding(blockers, "invalid_visibility", "Repository visibility must be public, private, institutional, or embargoed.", access.visibility); + } + + if (access.visibility === "embargoed" && !hasText(access.embargoEndsAt)) { + addFinding(blockers, "embargo_without_release_date", "Embargoed repositories need an explicit release date.", access); + } + + const environment = manifest?.environment ?? {}; + if (!hasText(environment.kind) || !hasText(environment.definition)) { + addFinding(blockers, "missing_execution_environment", "Executable repository hosting needs a Dockerfile, environment.yml, notebook kernel, or equivalent runtime definition.", environment); + } + + if (environment.kind === "docker" && !(hasText(environment.definition) && environment.definition.toLowerCase().includes("dockerfile"))) { /* hasText guard: definition may be absent or non-string, and calling toLowerCase() on it threw a TypeError */ + addFinding(warnings, "docker_definition_unclear", "Docker runtime is declared without a Dockerfile-style definition pointer.", environment.definition); + } + + const reproducibility = manifest?.reproducibility ?? {}; + if (!hasArray(reproducibility.commands)) { + addFinding(blockers, "missing_reproducibility_commands", "Repository needs at least one command that reproduces or verifies the hosted outputs.", reproducibility); + } + + if (!hasText(reproducibility.expectedOutputHash)) { + addFinding(warnings, "missing_output_digest", "Reproducibility command has no expected output digest.", reproducibility); + } + + const versioning = manifest?.versioning ?? {}; + if (!hasText(versioning.strategy) || !hasText(versioning.currentTag)) { + addFinding(blockers, "missing_versioning_policy", "Repository needs explicit versioning strategy and current tag metadata.", versioning); + } + + if (!hasArray(versioning.diffablePaths)) { + addFinding(warnings, "missing_diffable_paths", "No diffable paths are declared for dataset/code review.", versioning); + } + + const status = blockers.length > 0 ? "block_publication" : warnings.length > (options.warningBudget ?? 
3) ? "needs_metadata_review" : "ready_for_repository_release"; + const result = { + status, + summary: { + artifactCount: artifacts.length, + datasetCount, + codeCount, + blockerCount: blockers.length, + warningCount: warnings.length + }, + blockers, + warnings + }; + + return { ...result, auditDigest: digestPayload(result) }; +} + +export function createSampleManifest(overrides = {}) { + return { + metadata: { + title: "Open river sensor calibration study", + authors: ["SCIBASE Synthetic Lab"], + license: "CC-BY-4.0", + doi: "10.5555/scibase.synthetic.001", + keywords: ["hydrology", "sensor-calibration", "open-data"], + schemaVersion: "scibase-repository-manifest/v1" + }, + artifacts: [ + { + id: "river-readings-2026", + type: "dataset", + format: "csv", + sha256: "a".repeat(64), + version: "v1.0.0", + license: "CC-BY-4.0", + metadataSchema: "DataCite 4.5" + }, + { + id: "calibration-notebook", + type: "notebook", + format: "ipynb", + sha256: "b".repeat(64), + version: "v1.0.0", + license: "MIT", + runtime: "python-3.12" + } + ], + access: { + visibility: "public" + }, + environment: { + kind: "docker", + definition: "Dockerfile" + }, + reproducibility: { + commands: ["python notebooks/reproduce.py"], + expectedOutputHash: "c".repeat(64) + }, + versioning: { + strategy: "semantic-versioning", + currentTag: "v1.0.0", + diffablePaths: ["data/", "notebooks/", "metadata.json"] + }, + ...overrides + }; +} diff --git a/data-code-hosting-manifest-guard/test/index.test.js b/data-code-hosting-manifest-guard/test/index.test.js new file mode 100644 index 00000000..1c49e553 --- /dev/null +++ b/data-code-hosting-manifest-guard/test/index.test.js @@ -0,0 +1,84 @@ +import test from "node:test"; +import assert from "node:assert/strict"; +import { createSampleManifest, evaluateRepositoryManifest } from "../src/index.js"; + +test("ready repository manifests pass with stable audit evidence", () => { + const result = evaluateRepositoryManifest(createSampleManifest()); + + 
assert.equal(result.status, "ready_for_repository_release"); + assert.equal(result.summary.artifactCount, 2); + assert.equal(result.summary.datasetCount, 1); + assert.equal(result.summary.codeCount, 1); + assert.equal(result.summary.blockerCount, 0); + assert.match(result.auditDigest, /^[a-f0-9]{64}$/); +}); + +test("missing FAIR metadata and artifacts block publication", () => { + const result = evaluateRepositoryManifest({ + metadata: { title: "Incomplete repository" }, + artifacts: [], + access: { visibility: "public" } + }); + + assert.equal(result.status, "block_publication"); + assert.ok(result.blockers.some((finding) => finding.code === "missing_metadata" && finding.evidence === "doi")); + assert.ok(result.blockers.some((finding) => finding.code === "missing_dataset_artifact")); + assert.ok(result.blockers.some((finding) => finding.code === "missing_execution_environment")); + assert.ok(result.blockers.some((finding) => finding.code === "missing_reproducibility_commands")); +}); + +test("invalid hash locks and embargo metadata are caught", () => { + const manifest = createSampleManifest({ + artifacts: [ + { + id: "supplement", + type: "dataset", + format: "json", + sha256: "not-a-real-hash", + version: "v1.0.0", + license: "CC0-1.0", + metadataSchema: "schema.org/Dataset" + } + ], + access: { visibility: "embargoed" } + }); + + const result = evaluateRepositoryManifest(manifest); + + assert.equal(result.status, "block_publication"); + assert.ok(result.blockers.some((finding) => finding.code === "invalid_artifact_hash")); + assert.ok(result.blockers.some((finding) => finding.code === "embargo_without_release_date")); +}); + +test("warning budget moves risky manifests into metadata review", () => { + const manifest = createSampleManifest({ + metadata: { + ...createSampleManifest().metadata, + license: "custom-lab-license" + }, + artifacts: [ + { + id: "river-readings-2026", + type: "dataset", + format: "csv", + sha256: "a".repeat(64), + version: "v1.0.0", + 
license: "custom-lab-license" + } + ], + reproducibility: { + commands: ["python notebooks/reproduce.py"] + }, + versioning: { + strategy: "semantic-versioning", + currentTag: "v1.0.0" + } + }); + + const result = evaluateRepositoryManifest(manifest, { warningBudget: 2 }); + + assert.equal(result.status, "needs_metadata_review"); + assert.ok(result.warnings.some((finding) => finding.code === "weak_reuse_license")); + assert.ok(result.warnings.some((finding) => finding.code === "missing_code_artifact")); + assert.ok(result.warnings.some((finding) => finding.code === "missing_output_digest")); +});