Skip to content

Commit 2d4d8ff

Browse files
committed
feat(migrations): make NMA_SurfaceWaterData.thing_id nullable
1 parent cfb576e commit 2d4d8ff

5 files changed

Lines changed: 143 additions & 148 deletions

File tree

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""Make NMA_SurfaceWaterData.thing_id nullable.
2+
3+
Revision ID: i2c3d4e5f6a7
4+
Revises: f1a2b3c4d5e6
5+
Create Date: 2026-02-20 17:40:00.000000
6+
"""
7+
8+
from typing import Sequence, Union
9+
10+
import sqlalchemy as sa
11+
from alembic import op
12+
from sqlalchemy import inspect
13+
14+
# revision identifiers, used by Alembic.
15+
revision: str = "i2c3d4e5f6a7"
16+
down_revision: Union[str, Sequence[str], None] = "f1a2b3c4d5e6"
17+
branch_labels: Union[str, Sequence[str], None] = None
18+
depends_on: Union[str, Sequence[str], None] = None
19+
20+
21+
def upgrade() -> None:
    """Drop the NOT NULL constraint on NMA_SurfaceWaterData.thing_id.

    Legacy rows that cannot be mapped to a Thing are kept as orphans
    instead of being rejected.  The migration is a no-op when either
    the table or the column is absent (e.g. on a fresh database).
    """
    bind = op.get_bind()
    inspector = inspect(bind)

    # Nothing to do when the legacy table was never created.
    if not inspector.has_table("NMA_SurfaceWaterData"):
        return

    # Guard against schema drift: skip when the column is missing.
    column_names = {
        column["name"]
        for column in inspector.get_columns("NMA_SurfaceWaterData")
    }
    if "thing_id" not in column_names:
        return

    op.alter_column(
        "NMA_SurfaceWaterData",
        "thing_id",
        existing_type=sa.Integer(),
        nullable=True,
    )
39+
40+
def downgrade() -> None:
    """Restore the NOT NULL constraint on NMA_SurfaceWaterData.thing_id.

    WARNING: destructive.  Rows with a NULL ``thing_id`` cannot satisfy
    the restored constraint, so they are DELETED before the column is
    tightened.  (The previous docstring claimed the revert happened
    "only when no null thing_id values exist", which contradicted the
    unconditional DELETE below.)  The migration is a no-op when either
    the table or the column is absent.
    """
    bind = op.get_bind()
    inspector = inspect(bind)

    # Nothing to do when the legacy table was never created.
    if not inspector.has_table("NMA_SurfaceWaterData"):
        return

    # Guard against schema drift: skip when the column is missing.
    columns = {col["name"] for col in inspector.get_columns("NMA_SurfaceWaterData")}
    if "thing_id" not in columns:
        return

    # Orphan rows would violate the NOT NULL constraint; remove them first.
    op.execute('DELETE FROM "NMA_SurfaceWaterData" WHERE thing_id IS NULL')
    op.alter_column(
        "NMA_SurfaceWaterData",
        "thing_id",
        existing_type=sa.Integer(),
        nullable=False,
    )

db/nma_legacy.py

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -578,9 +578,9 @@ class NMA_SurfaceWaterData(Base):
578578
object_id: Mapped[int] = mapped_column("OBJECTID", Integer, primary_key=True)
579579

580580
# FK
581-
# FK to Thing - required for all SurfaceWaterData records
582-
thing_id: Mapped[int] = mapped_column(
583-
Integer, ForeignKey("thing.id", ondelete="CASCADE"), nullable=False
581+
# FK to Thing - optional when legacy rows cannot be mapped to a Thing.
582+
thing_id: Mapped[Optional[int]] = mapped_column(
583+
Integer, ForeignKey("thing.id", ondelete="CASCADE"), nullable=True
584584
)
585585

586586
# Legacy PK (for audit)
@@ -615,16 +615,9 @@ class NMA_SurfaceWaterData(Base):
615615
data_source: Mapped[Optional[str]] = mapped_column("DataSource", String(255))
616616

617617
# Relationships
618-
thing: Mapped["Thing"] = relationship("Thing", back_populates="surface_water_data")
619-
620-
@validates("thing_id")
621-
def validate_thing_id(self, key, value):
622-
"""Prevent orphan NMA_SurfaceWaterData - must have a parent Thing."""
623-
if value is None:
624-
raise ValueError(
625-
"NMA_SurfaceWaterData requires a parent Thing (thing_id cannot be None)"
626-
)
627-
return value
618+
thing: Mapped[Optional["Thing"]] = relationship(
619+
"Thing", back_populates="surface_water_data"
620+
)
628621

629622

630623
class NMA_SurfaceWaterPhotos(Base):

transfers/surface_water_data.py

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -62,22 +62,12 @@ def _get_dfs(self) -> tuple[pd.DataFrame, pd.DataFrame]:
6262

6363
def _transfer_hook(self, session: Session) -> None:
6464
rows: list[dict[str, Any]] = []
65-
skipped_missing_thing = 0
6665
for raw in self.cleaned_df.to_dict("records"):
6766
record = self._row_dict(raw)
68-
if record is None:
69-
skipped_missing_thing += 1
70-
continue
7167
rows.append(record)
7268

7369
rows = self._dedupe_rows(rows, key="OBJECTID", include_missing=True)
7470

75-
if skipped_missing_thing:
76-
logger.warning(
77-
"Skipped %s SurfaceWaterData rows without matching Thing",
78-
skipped_missing_thing,
79-
)
80-
8171
insert_stmt = insert(NMA_SurfaceWaterData)
8272
excluded = insert_stmt.excluded
8373

@@ -111,7 +101,7 @@ def _transfer_hook(self, session: Session) -> None:
111101
session.commit()
112102
session.expunge_all()
113103

114-
def _row_dict(self, row: dict[str, Any]) -> Optional[dict[str, Any]]:
104+
def _row_dict(self, row: dict[str, Any]) -> dict[str, Any]:
115105
def val(key: str) -> Optional[Any]:
116106
v = row.get(key)
117107
if pd.isna(v):
@@ -133,12 +123,6 @@ def to_uuid(v: Any) -> Optional[uuid.UUID]:
133123

134124
location_id = to_uuid(val("LocationId"))
135125
thing_id = self._resolve_thing_id(location_id)
136-
if thing_id is None:
137-
logger.warning(
138-
"Skipping SurfaceWaterData LocationId=%s - Thing not found",
139-
location_id,
140-
)
141-
return None
142126

143127
return {
144128
"LocationId": location_id,

transfers/transfer_results_builder.py

Lines changed: 46 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from sqlalchemy import select, func
88

99
from db.engine import session_ctx
10+
from transfers.transfer import load_transfer_options
1011
from transfers.transfer_results_specs import (
1112
TRANSFER_COMPARISON_SPECS,
1213
TransferComparisonSpec,
@@ -15,7 +16,12 @@
1516
TransferComparisonResults,
1617
TransferResult,
1718
)
18-
from transfers.util import read_csv
19+
from transfers.util import (
20+
read_csv,
21+
replace_nans,
22+
get_transferable_wells,
23+
)
24+
import os
1925

2026

2127
def _normalize_key(value: Any) -> str | None:
@@ -56,6 +62,8 @@ class TransferResultsBuilder:
5662

5763
def __init__(self, sample_limit: int = 25):
5864
self.sample_limit = sample_limit
65+
self.transfer_options = load_transfer_options()
66+
self.transfer_limit = int(os.getenv("TRANSFER_LIMIT", "1000"))
5967

6068
def build(self) -> TransferComparisonResults:
6169
results: dict[str, TransferResult] = {}
@@ -70,16 +78,18 @@ def _build_one(self, spec: TransferComparisonSpec) -> TransferResult:
7078
source_df = read_csv(spec.source_csv)
7179
if spec.source_filter:
7280
source_df = spec.source_filter(source_df)
73-
source_series = _normalized_series(source_df, spec.source_key_column)
81+
comparison_df = source_df
82+
enabled = self._is_enabled(spec)
83+
if not enabled:
84+
comparison_df = source_df.iloc[0:0]
85+
elif spec.transfer_name == "WellData":
86+
comparison_df = self._agreed_welldata_df()
87+
88+
source_series = _normalized_series(comparison_df, spec.source_key_column)
7489
source_keys = set(source_series.unique().tolist())
7590
source_keyed_row_count = int(source_series.shape[0])
7691
source_duplicate_key_row_count = source_keyed_row_count - len(source_keys)
77-
agreed_transfer_row_count = int(len(source_df))
78-
if spec.agreed_row_counter is not None:
79-
try:
80-
agreed_transfer_row_count = int(spec.agreed_row_counter())
81-
except Exception:
82-
agreed_transfer_row_count = int(len(source_df))
92+
agreed_transfer_row_count = int(len(comparison_df))
8393

8494
model = spec.destination_model
8595
key_col = getattr(model, spec.destination_key_column)
@@ -134,20 +144,44 @@ def _build_one(self, spec: TransferComparisonSpec) -> TransferResult:
134144
extra_in_destination_sample=extra[: self.sample_limit],
135145
)
136146

147+
def _is_enabled(self, spec: TransferComparisonSpec) -> bool:
    """Return True when the transfer described by *spec* is switched on.

    Specs without an ``option_field`` are always considered enabled;
    otherwise the flag is read from the loaded transfer options,
    defaulting to True when the attribute is absent.
    """
    option_name = spec.option_field
    if not option_name:
        return True
    return bool(getattr(self.transfer_options, option_name, True))
151+
152+
def _agreed_welldata_df(self) -> pd.DataFrame:
    """Build the cleaned WellData frame the transfer actually agrees to move.

    Mirrors the WellData transfer pipeline: join location attributes onto
    the well rows, keep groundwater sites that have coordinates, drop
    PointIDs that appear more than once, and apply the configured limit.
    """
    well_df = read_csv("WellData", dtype={"OSEWelltagID": str})

    location_df = read_csv("Location")
    # PointID/SSMA_TimeStamp would collide with well columns on join.
    location_df = location_df.drop(
        ["PointID", "SSMA_TimeStamp"], axis=1, errors="ignore"
    )

    well_df = well_df.join(location_df.set_index("LocationId"), on="LocationId")

    # Groundwater sites only, and only those with usable coordinates.
    well_df = well_df[well_df["SiteType"] == "GW"]
    well_df = well_df[well_df["Easting"].notna() & well_df["Northing"].notna()]
    well_df = replace_nans(well_df)

    cleaned = get_transferable_wells(well_df)

    # Ambiguous PointIDs are excluded entirely: every duplicated row goes.
    duplicated_mask = cleaned["PointID"].duplicated(keep=False)
    if duplicated_mask.any():
        duplicate_ids = set(cleaned.loc[duplicated_mask, "PointID"])
        cleaned = cleaned[~cleaned["PointID"].isin(duplicate_ids)]

    if self.transfer_limit > 0:
        cleaned = cleaned.head(self.transfer_limit)
    return cleaned
171+
137172
@staticmethod
def write_summary(path: Path, comparison: TransferComparisonResults) -> None:
    """Write a markdown summary table of the comparison results to *path*.

    Emits one row per transfer, sorted by transfer name, including a
    shortfall column (``Missing Agreed`` = agreed rows minus rows that
    actually landed in the destination).
    """
    lines = [
        f"generated_at={comparison.generated_at}",
        "",
        "| Transfer | Source CSV | Source Rows | Agreed Rows | Dest Model | Dest Rows | Missing Agreed |",
        "|---|---|---:|---:|---|---:|---:|",
    ]
    for transfer_name in sorted(comparison.results.keys()):
        result = comparison.results[transfer_name]
        shortfall = result.agreed_transfer_row_count - result.destination_row_count
        lines.append(
            f"| {transfer_name} | {result.source_csv} | {result.source_row_count} | "
            f"{result.agreed_transfer_row_count} | "
            f"{result.destination_model} | {result.destination_row_count} | {shortfall} |"
        )
    path.write_text("\n".join(lines) + "\n")

0 commit comments

Comments (0)