BAL-DMU · jagh · May 27, 2026 · May 27, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,82 @@
+# Changelog
+
+All notable changes to **datorcloud** are documented in this file. The
+format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/)
+and the version numbers follow [Semantic Versioning](https://semver.org).
+
+## [0.2.0] - 2026-05-27
+
+Phase 1 of the DORIS integration plan landed: the layered L1-L4 catalog,
+the formal `(I, C, Q, F)` operators, and the L4 snapshot freeze.
+
+### Added
+
+- **L1-L4 catalog DDL** (`datorcloud/schemas/l1_l4.sql`,
+  `schema_version: 1.0.0`). Idempotent. L1 unique key includes
+  `study_id` so DICOM rows with multiple studies per subject do not
+  collide. L2 is keyed on `(record_uid, modality, sequence)` so
+  compound CVPR modality strings split losslessly. New ENUMs:
+  `privacy_class`, `annotation_kind`, `instance_label`,
+  `processing_stage`. New L1 companion tables: `l1_citations`,
+  `l1_processing` (CVPR ingest provenance, `cvpr_folder` column on
+  `l1_experiment`). New L4 tables: `l4_cohort_snapshot` (with
+  `l13_payload` Parquet blob + `catalog_sha256` + `hf_publication_log`
+  reserved for Phase 3) and `l4_eval_set` (annotator columns, target
+  labels, inter-observer quantiles; per design invariant I3 multiple
+  eval sets may reference one snapshot).
+- **`datorcloud.schemas.Migration`** runner with a stable
+  `schema_sha` digest computed from the canonical DDL text. Apply is
+  idempotent (`schema_sha` stable across two runs).
+- **`ParquetCatalogComponent`** (`datorcloud/components/parquet_catalog_component.py`),
+  replacing `metadata_storage_component.py`. Hive-partitioned by
+  `dataset_id` + `dataset_version` for L1-L3 layers under
+  `<base>/<layer>/dataset_id=<id>/dataset_version=<v>/part.parquet`,
+  flat layout for L4 tables. Canonical views `v_doris` and
+  `v_doris_egress` (license / privacy filtered) materialised at
+  construction.
+- **Formal `(I, C, Q, F)` operators on `DatorCloudOrchestrator`** -
+  `ingest(layer, df)`, `query(sql=...)` or
+  `query(view=..., filters=...)`, `snapshot_cohort(...)`,
+  `create_eval_set(...)`, `fetch(snapshot_id, dest)`. `from_env()`
+  factory and `.env` contract unchanged; new optional
+  `DATORCLOUD_CATALOG_URI` env var picks the catalog root.
+- **L4 snapshot freeze** (`datorcloud/snapshots.py`). At
+  `snapshot_cohort()` time, the matched L1-L3 rows are deep-copied into
+  a single Parquet blob, hashed over a deterministic canonical
+  serialisation into `catalog_sha256`, and persisted. The hash is stable
+  across reruns even after `l2_sensor.converted_uri` is updated between
+  two consecutive snapshots (the gate that integration-test
+  `doris-it-01-catalog` assertion (c) checks).
+- **CLI verb** `datorcloud query --sql "SELECT ... FROM v_doris"` runs
+  end-to-end against the new L1-L4 catalog without MinIO credentials.
+- **Integration test** `tests/integration/test_01_catalog.py`
+  (`doris-it-01-catalog`) covering assertions (a)-(f) of
+  STEP_BY_STEP_PLAN.md §3.
+
+### Changed
+
+- `DatorCloudOrchestrator.__init__` now accepts an optional
+  `parquet_catalog=` and `catalog_base_uri=` argument. Existing
+  callers continue to work unchanged; the new path is opt-in.
+- `pyproject.toml` declares `pyarrow>=14` as a runtime dependency
+  (used by the snapshot freeze) and adds package-data for
+  `datorcloud.schemas/*.sql`.
+
+### Documentation
+
+- `docs/snapshots.md` describes the snapshot-freeze semantics, the
+  canonical-serialisation hashing rule, and the L4 eval-set join
+  surface that the T2D inter-observer pipeline reads in Phase 5.
+
+### Migration notes (downstream)
+
+- Downstream pins to `datorcloud>=0.2.0`. `msk-ai-trust-to-deploy`
+  bumped its requirement in `pyproject.toml`.
+- The legacy `metadata_storage_component` remains importable but is
+  superseded by `ParquetCatalogComponent` for any new code path. It
+  will be removed no earlier than `0.3.0`.
+
+## [0.1.0] - 2026-05-15
+
+- Initial public release: component-oriented framework for MinIO
+  object storage, DuckDB-backed CSV queries, and Dagster assets.
diff --git a/datorcloud/__init__.py b/datorcloud/__init__.py
@@ -5,18 +5,34 @@
     MetadataStorageComponent,
     MinioObjectComponent,
     ObjectRetrievalComponent,
+    ParquetCatalogComponent,
     QueryComponent,
 )
 from .core import DatorCloudOrchestrator
+from .schemas import SCHEMA_VERSION as L1_L4_SCHEMA_VERSION
+from .snapshots import (
+    EvalSet,
+    Snapshot,
+    create_eval_set,
+    load_snapshot_payload,
+    snapshot_cohort,
+)
 
-__version__ = "0.1.0"
+__version__ = "0.2.0"
 
 __all__ = [
     "DatorCloudOrchestrator",
     "MetadataGeneratorComponent",
     "MetadataStorageComponent",
     "MinioObjectComponent",
     "ObjectRetrievalComponent",
+    "ParquetCatalogComponent",
     "QueryComponent",
+    "Snapshot",
+    "EvalSet",
+    "snapshot_cohort",
+    "create_eval_set",
+    "load_snapshot_payload",
+    "L1_L4_SCHEMA_VERSION",
     "__version__",
 ]
diff --git a/datorcloud/cli.py b/datorcloud/cli.py
@@ -65,20 +65,26 @@ def _parse_filters(values: Optional[Sequence[str]]) -> Dict[str, Any]:
     return out
 
 
-def _build_orchestrator(args: argparse.Namespace) -> DatorCloudOrchestrator:
+def _build_orchestrator(
+    args: argparse.Namespace, *, require_minio: bool = True
+) -> DatorCloudOrchestrator:
     """Construct the orchestrator from CLI args.
 
     CLI defaults pull from the environment (which has been populated by
     ``load_dotenv()`` above) without hard-coding any credentials. Missing
     credentials surface as a clear ``ValueError`` from the underlying
     components.
+
+    The catalog-only verbs (``query --sql``, ``snapshot``, etc.) pass
+    ``require_minio=False`` so callers can drive the L1-L4 catalog
+    against a local DuckDB / Parquet root without configuring MinIO.
     """
-    if not args.minio_access_key or not args.minio_secret_key:
+    if require_minio and (not args.minio_access_key or not args.minio_secret_key):
         raise SystemExit(
             "MinIO credentials are missing. Set S3_ACCESS_KEY and S3_SECRET_KEY "
             "in your .env file, or pass --minio-access-key / --minio-secret-key."
         )
-    return DatorCloudOrchestrator(
+    kwargs: Dict[str, Any] = dict(
         minio_endpoint=args.minio_endpoint,
         minio_access_key=args.minio_access_key,
         minio_secret_key=args.minio_secret_key,
@@ -87,7 +93,17 @@ def _build_orchestrator(args: argparse.Namespace) -> DatorCloudOrchestrator:
         metadata_bucket=args.metadata_bucket,
         local_download_dir=args.local_download_dir,
         duckdb_extension_path=args.duckdb_extension_path,
+        catalog_base_uri=getattr(args, "catalog_base_uri", None)
+        or os.environ.get("DATORCLOUD_CATALOG_URI"),
     )
+    if not require_minio and (
+        not args.minio_access_key or not args.minio_secret_key
+    ):
+        # Catalog-only mode: stub out credentials so the minio component
+        # is constructed but never used.
+        kwargs["minio_access_key"] = args.minio_access_key or "_catalog_only_"
+        kwargs["minio_secret_key"] = args.minio_secret_key or "_catalog_only_"
+    return DatorCloudOrchestrator(**kwargs)
 
 
 def _add_common_args(parser: argparse.ArgumentParser) -> None:
@@ -123,6 +139,16 @@ def _add_common_args(parser: argparse.ArgumentParser) -> None:
         "--duckdb-extension-path",
         default=os.environ.get("DUCKDB_HTTPFS_EXTENSION_PATH"),
     )
+    parser.add_argument(
+        "--catalog-base-uri",
+        dest="catalog_base_uri",
+        default=os.environ.get("DATORCLOUD_CATALOG_URI"),
+        help=(
+            "Root URI for the L1-L4 Parquet catalog "
+            "(file:// or s3:// or bare path). "
+            "Defaults to $DATORCLOUD_CATALOG_URI."
+        ),
+    )
     parser.add_argument(
         "-v", "--verbose", action="count", default=0, help="Increase log verbosity."
     )
@@ -158,6 +184,14 @@ def _cmd_metadata(args: argparse.Namespace) -> int:
 
 
 def _cmd_query(args: argparse.Namespace) -> int:
+    # New Phase-1 path: --sql goes directly through the formal (Q)
+    # operator on the L1-L4 catalog. The legacy --metadata-file path
+    # routes to the original CSV-backed query_metadata for back-compat.
+    if args.sql is not None:
+        orchestrator = _build_orchestrator(args, require_minio=False)
+        df = orchestrator.query(sql=args.sql)
+        print(df.to_csv(index=False))
+        return 0
     orchestrator = _build_orchestrator(args)
     filters = _parse_filters(args.filter)
     df = orchestrator.query_metadata(
@@ -220,7 +254,21 @@ def build_parser() -> argparse.ArgumentParser:
     _add_common_args(p_meta)
     p_meta.set_defaults(func=_cmd_metadata)
 
-    p_query = sub.add_parser("query", help="Run a filtered query against the metadata CSV.")
+    p_query = sub.add_parser(
+        "query",
+        help=(
+            "Query the catalog. Use --sql for the Phase-1 L1-L4 path or "
+            "--metadata-file for the legacy CSV path."
+        ),
+    )
+    p_query.add_argument(
+        "--sql",
+        default=None,
+        help=(
+            "Raw DuckDB SQL evaluated against the Phase-1 L1-L4 catalog "
+            "views (v_doris, v_doris_egress)."
+        ),
+    )
     p_query.add_argument("--metadata-file", default=None)
     p_query.add_argument("--filter", action="append", default=[])
     p_query.add_argument("--limit", type=int, default=None)

diff --git a/datorcloud/components/__init__.py b/datorcloud/components/__init__.py
@@ -1,13 +1,15 @@
 from .minio_component import MinioObjectComponent
 from .metadata_generator_component import MetadataGeneratorComponent
 from .metadata_storage_component import MetadataStorageComponent
+from .parquet_catalog_component import ParquetCatalogComponent
 from .query_component import QueryComponent
 from .retrieval_component import ObjectRetrievalComponent
 
 __all__ = [
     "MinioObjectComponent",
     "MetadataGeneratorComponent",
     "MetadataStorageComponent",
+    "ParquetCatalogComponent",
     "QueryComponent",
-    "ObjectRetrievalComponent"
-] 
+    "ObjectRetrievalComponent",
+]