From 1195f1a15adf15703c6a8a6ef857aaed8ca84952 Mon Sep 17 00:00:00 2001 From: jakeross Date: Thu, 19 Feb 2026 09:15:00 -0700 Subject: [PATCH 01/14] feat: add WellTransferResultsBuilder for summarizing well transfer outcomes --- transfers/well_transfer_results.py | 332 +++++++++++++++++++++++++++++ 1 file changed, 332 insertions(+) create mode 100644 transfers/well_transfer_results.py diff --git a/transfers/well_transfer_results.py b/transfers/well_transfer_results.py new file mode 100644 index 00000000..555ab9f7 --- /dev/null +++ b/transfers/well_transfer_results.py @@ -0,0 +1,332 @@ +# =============================================================================== +# Copyright 2026 ross +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== +from __future__ import annotations + +import argparse +import csv +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path + +import pandas as pd +from sqlalchemy import select + +from db import Thing +from db.engine import session_ctx +from transfers.util import ( + filter_non_transferred_wells, + get_transferable_wells, + read_csv, + replace_nans, +) + + +@dataclass +class ValidationIssue: + pointid: str + table: str + field: str + error: str + + +@dataclass +class WellTransferResults: + source_count: int + committed_count: int + transferred_count: int + skipped_by_decision: list[str] + validation_issue_wells: list[str] + validation_issues: list[ValidationIssue] + metrics_file: Path | None + skipped_by_existing_destination: list[str] + + +class WellTransferResultsBuilder: + """Build well transfer outcome summaries by comparing source and destination.""" + + def __init__( + self, + pointids: list[str] | None = None, + metrics_file: Path | None = None, + output_dir: Path | None = None, + ): + self.pointids = set(pointids or []) + self.metrics_file = metrics_file + self.output_dir = output_dir or (Path("transfers") / "metrics") + + def build(self) -> WellTransferResults: + source_df = self._load_source_wells() + committed_df = self._load_committed_wells(source_df) + committed_without_existing_df = filter_non_transferred_wells(committed_df) + + source_ids = self._point_ids(source_df) + committed_ids = self._point_ids(committed_df) + committed_without_existing_ids = self._point_ids(committed_without_existing_df) + destination_ids = self._load_destination_ids() + + skipped_by_decision = sorted(source_ids - committed_ids) + skipped_by_existing_destination = sorted( + committed_ids - committed_without_existing_ids + ) + transferred_ids = committed_ids & destination_ids + missing_committed_ids = committed_ids - transferred_ids + + validation_issues = self._load_well_validation_issues( + self._resolve_metrics_file() + ) + validation_issue_ids = { + issue.pointid for issue in validation_issues if issue.pointid in source_ids + } + validation_issue_wells = sorted(validation_issue_ids & missing_committed_ids) + + return WellTransferResults( + source_count=len(source_ids), + committed_count=len(committed_ids), + transferred_count=len(transferred_ids), + skipped_by_decision=skipped_by_decision, + validation_issue_wells=validation_issue_wells, + validation_issues=validation_issues, + metrics_file=self._resolve_metrics_file(), + skipped_by_existing_destination=skipped_by_existing_destination, + ) + + def write_reports(self, results: WellTransferResults) -> dict[str, Path]: + self.output_dir.mkdir(parents=True, exist_ok=True) + stamp = datetime.now().strftime("%Y-%m-%dT%H_%M_%S") + + summary_path = self.output_dir / f"well_transfer_results_{stamp}.txt" + not_migrated_path = self.output_dir / f"wells_not_migrated_{stamp}.csv" + validation_path = self.output_dir / f"wells_validation_issues_{stamp}.csv" + already_exists_path = ( + self.output_dir / f"wells_already_in_destination_{stamp}.csv" + ) + + summary_lines = [ + "Well Transfer Results", + f"source_count={results.source_count}", + f"committed_count={results.committed_count}", + f"transferred_count={results.transferred_count}", + f"not_transferred_by_decision_count={len(results.skipped_by_decision)}", + f"not_transferred_validation_count={len(results.validation_issue_wells)}", + ( + f"already_in_destination_count=" + f"{len(results.skipped_by_existing_destination)}" + ), + ( + f"metrics_file={results.metrics_file}" + if results.metrics_file + else "metrics_file=None" + ), + ] + summary_path.write_text("\n".join(summary_lines) + "\n") + + self._write_pointids(not_migrated_path, "pointid", results.skipped_by_decision) + self._write_pointids( + already_exists_path, "pointid", results.skipped_by_existing_destination + ) + self._write_validation_issues( + validation_path, + [ + issue + for issue in results.validation_issues + if issue.pointid in set(results.validation_issue_wells) + ], + ) + + return { + "summary": summary_path, + "not_migrated": not_migrated_path, + "validation_issues": validation_path, + "already_in_destination": already_exists_path, + } + + def _load_source_wells(self) -> pd.DataFrame: + wdf = read_csv("WellData", dtype={"OSEWelltagID": str}) + ldf = read_csv("Location") + ldf = ldf.drop(columns=["PointID", "SSMA_TimeStamp"], errors="ignore") + wdf = wdf.join(ldf.set_index("LocationId"), on="LocationId") + + wdf = wdf[wdf["SiteType"] == "GW"] + wdf = wdf[wdf["Easting"].notna() & wdf["Northing"].notna()] + wdf = replace_nans(wdf) + + if self.pointids: + wdf = wdf[wdf["PointID"].isin(self.pointids)] + + return wdf + + def _load_committed_wells(self, source_df: pd.DataFrame) -> pd.DataFrame: + committed_df = get_transferable_wells(source_df) + if self.pointids: + committed_df = committed_df[committed_df["PointID"].isin(self.pointids)] + + duplicates = committed_df["PointID"].duplicated(keep=False) + if duplicates.any(): + duplicate_ids = set(committed_df.loc[duplicates, "PointID"].tolist()) + committed_df = committed_df[~committed_df["PointID"].isin(duplicate_ids)] + + return committed_df.sort_values("PointID") + + @staticmethod + def _point_ids(df: pd.DataFrame) -> set[str]: + if df.empty: + return set() + return set(df["PointID"].dropna().astype(str).unique().tolist()) + + def _load_destination_ids(self) -> set[str]: + with session_ctx() as session: + ids = session.execute( + select(Thing.name).where(Thing.thing_type == "water well") + ).scalars() + thing_names = {str(name) for name in ids if name} + + if self.pointids: + thing_names = thing_names & self.pointids + + return thing_names + + def _resolve_metrics_file(self) -> Path | None: + if self.metrics_file: + return self.metrics_file + + metrics_dir = Path("transfers") / "metrics" + candidates = sorted( + metrics_dir.glob("metrics_*.csv"), key=lambda p: p.stat().st_mtime + ) + if not candidates: + return None + return candidates[-1] + + @staticmethod + def _load_well_validation_issues( + metrics_file: Path | None, + ) -> list[ValidationIssue]: + if metrics_file is None or not metrics_file.exists(): + return [] + + issues: list[ValidationIssue] = [] + current_model: str | None = None + with metrics_file.open(newline="") as f: + reader = csv.reader(f, delimiter="|") + for row in reader: + if not row: + continue + + if len(row) >= 5 and row[0] not in {"model", "PointID"}: + current_model = row[0] + continue + + if row[0] == "PointID": + continue + + if len(row) < 4: + continue + + if current_model != "Well": + continue + + pointid, table, field, error = row[0], row[1], row[2], row[3] + if table != "WellData": + continue + if "Validation Error" not in error: + continue + issues.append( + ValidationIssue( + pointid=pointid, + table=table, + field=field, + error=error, + ) + ) + return issues + + @staticmethod + def _write_pointids(path: Path, header: str, pointids: list[str]) -> None: + with path.open("w", newline="") as f: + writer = csv.writer(f) + writer.writerow([header]) + for pointid in pointids: + writer.writerow([pointid]) + + @staticmethod + def _write_validation_issues(path: Path, issues: list[ValidationIssue]) -> None: + with path.open("w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["pointid", "table", "field", "error"]) + for issue in issues: + writer.writerow([issue.pointid, issue.table, issue.field, issue.error]) + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Build transfer results for wells.") + parser.add_argument( + "--metrics-file", + type=Path, + default=None, + help="Optional metrics CSV to use for validation issue extraction.", + ) + parser.add_argument( + "--output-dir", + type=Path, + default=Path("transfers") / "metrics", + help="Directory where result files are written.", + ) + parser.add_argument( + "--pointids", + default=None, + help="Optional comma-separated list of PointID values to scope the report.", + ) + return parser.parse_args() + + +def main() -> None: + args = _parse_args() + pointids = args.pointids.split(",") if args.pointids else None + builder = WellTransferResultsBuilder( + pointids=pointids, + metrics_file=args.metrics_file, + output_dir=args.output_dir, + ) + results = builder.build() + outputs = builder.write_reports(results) + + print(f"Source wells: {results.source_count}") + print(f"Committed to migrate: {results.committed_count}") + print(f"Successfully transferred: {results.transferred_count}") + print( + f"Not transferred (decided not to migrate): {len(results.skipped_by_decision)}" + ) + print(f"Not transferred (validation issues): {len(results.validation_issue_wells)}") + print( + f"Already in destination before migration filter: " + f"{len(results.skipped_by_existing_destination)}" + ) + print(f"Summary file: {outputs['summary']}") + print(f"Not migrated wells file: {outputs['not_migrated']}") + print(f"Validation issue wells file: {outputs['validation_issues']}") + print(f"Already-in-destination wells file: {outputs['already_in_destination']}") + + print("\nWells not transferred (decided not to migrate):") + for pointid in results.skipped_by_decision: + print(pointid) + + print("\nWells not transferred (data validation issues):") + for pointid in results.validation_issue_wells: + print(pointid) + + +if __name__ == "__main__": + main() From e8d8bf35cdd937d97fea9dc4150c5d7d33a7ae16 Mon Sep 17 00:00:00 2001 From: jross Date: Thu, 19 Feb 2026 17:04:48 -0700 Subject: [PATCH 02/14] feat: implement TransferResultsBuilder and comparison specs for transfer input validation --- transfers/transfer_results.py | 51 +++ transfers/transfer_results_builder.py | 153 ++++++++ transfers/transfer_results_specs.py | 485 ++++++++++++++++++++++++++ transfers/transfer_results_types.py | 81 +++++ transfers/well_transfer_results.py | 332 ------------------ 5 files changed, 770 insertions(+), 332 deletions(-) create mode 100644 transfers/transfer_results.py create mode 100644 transfers/transfer_results_builder.py create mode 100644 transfers/transfer_results_specs.py create mode 100644 transfers/transfer_results_types.py delete mode 100644 transfers/well_transfer_results.py diff --git a/transfers/transfer_results.py b/transfers/transfer_results.py new file mode 100644 index 00000000..0483e7fd --- /dev/null +++ b/transfers/transfer_results.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +import argparse +from pathlib import Path + +from transfers.transfer_results_builder import TransferResultsBuilder +from transfers.transfer_results_specs import ( + TRANSFER_COMPARISON_SPECS, + TransferComparisonSpec, +) +from transfers.transfer_results_types import * # noqa: F401,F403 + + +__all__ = [ + "TransferResultsBuilder", + "TransferComparisonSpec", + "TRANSFER_COMPARISON_SPECS", +] + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Compare each transfer input CSV against destination Postgres rows." + ) + parser.add_argument( + "--summary-path", + type=Path, + default=Path("transfers") / "metrics" / "transfer_results_summary.md", + help="Output path for markdown summary table.", + ) + parser.add_argument( + "--sample-limit", + type=int, + default=25, + help="Max missing/extra key samples stored per transfer.", + ) + return parser.parse_args() + + +def main() -> None: + args = _parse_args() + builder = TransferResultsBuilder(sample_limit=args.sample_limit) + results = builder.build() + args.summary_path.parent.mkdir(parents=True, exist_ok=True) + TransferResultsBuilder.write_summary(args.summary_path, results) + print(f"Wrote comparison summary: {args.summary_path}") + print(f"Transfer comparisons: {len(results.results)}") + + +if __name__ == "__main__": + main() diff --git a/transfers/transfer_results_builder.py b/transfers/transfer_results_builder.py new file mode 100644 index 00000000..a8e384a7 --- /dev/null +++ b/transfers/transfer_results_builder.py @@ -0,0 +1,153 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import pandas as pd +from sqlalchemy import select, func + +from db.engine import session_ctx +from transfers.transfer_results_specs import ( + TRANSFER_COMPARISON_SPECS, + TransferComparisonSpec, +) +from transfers.transfer_results_types import ( + TransferComparisonResults, + TransferResult, +) +from transfers.util import read_csv + + +def _normalize_key(value: Any) -> str | None: + if value is None: + return None + try: + if pd.isna(value): + return None + except TypeError: + pass + s = str(value).strip() + if not s: + return None + return s.lower() + + +def _source_keys(df: pd.DataFrame, key_col: str) -> set[str]: + if key_col not in df.columns: + return set() + return { + key + for key in (_normalize_key(v) for v in df[key_col].tolist()) + if key is not None + } + + +def _normalized_series(df: pd.DataFrame, key_col: str) -> pd.Series: + if key_col not in df.columns: + return pd.Series([], dtype=object) + s = df[key_col].map(_normalize_key).dropna() + if s.empty: + return pd.Series([], dtype=object) + return s.astype(str) + + +class TransferResultsBuilder: + """Compare transfer input CSV keys to destination database keys per transfer.""" + + def __init__(self, sample_limit: int = 25): + self.sample_limit = sample_limit + + def build(self) -> TransferComparisonResults: + results: dict[str, TransferResult] = {} + for spec in TRANSFER_COMPARISON_SPECS: + results[spec.transfer_name] = self._build_one(spec) + return TransferComparisonResults( + generated_at=pd.Timestamp.utcnow().isoformat(), + results=results, + ) + + def _build_one(self, spec: TransferComparisonSpec) -> TransferResult: + source_df = read_csv(spec.source_csv) + if spec.source_filter: + source_df = spec.source_filter(source_df) + source_series = _normalized_series(source_df, spec.source_key_column) + source_keys = set(source_series.unique().tolist()) + source_keyed_row_count = int(source_series.shape[0]) + source_duplicate_key_row_count = source_keyed_row_count - len(source_keys) + agreed_transfer_row_count = int(len(source_df)) + if spec.agreed_row_counter is not None: + try: + agreed_transfer_row_count = int(spec.agreed_row_counter()) + except Exception: + agreed_transfer_row_count = int(len(source_df)) + + model = spec.destination_model + key_col = getattr(model, spec.destination_key_column) + with session_ctx() as session: + key_sql = select(key_col).where(key_col.is_not(None)) + count_sql = select(func.count()).select_from(model) + + if spec.destination_where: + where_clause = spec.destination_where(model) + key_sql = key_sql.where(where_clause) + count_sql = count_sql.where(where_clause) + + raw_dest_keys = session.execute(key_sql).scalars().all() + destination_row_count = int(session.execute(count_sql).scalar_one()) + + destination_series = pd.Series( + [_normalize_key(v) for v in raw_dest_keys], dtype=object + ).dropna() + if destination_series.empty: + destination_series = pd.Series([], dtype=object) + else: + destination_series = destination_series.astype(str) + + destination_keys = set(destination_series.unique().tolist()) + destination_keyed_row_count = int(destination_series.shape[0]) + destination_duplicate_key_row_count = destination_keyed_row_count - len( + destination_keys + ) + + missing = sorted(source_keys - destination_keys) + extra = sorted(destination_keys - source_keys) + + return spec.result_cls( + transfer_name=spec.transfer_name, + source_csv=spec.source_csv, + source_key_column=spec.source_key_column, + destination_model=model.__name__, + destination_key_column=spec.destination_key_column, + source_row_count=len(source_df), + agreed_transfer_row_count=agreed_transfer_row_count, + source_keyed_row_count=source_keyed_row_count, + source_key_count=len(source_keys), + source_duplicate_key_row_count=source_duplicate_key_row_count, + destination_row_count=destination_row_count, + destination_keyed_row_count=destination_keyed_row_count, + destination_key_count=len(destination_keys), + destination_duplicate_key_row_count=destination_duplicate_key_row_count, + matched_key_count=len(source_keys & destination_keys), + missing_in_destination_count=len(missing), + extra_in_destination_count=len(extra), + missing_in_destination_sample=missing[: self.sample_limit], + extra_in_destination_sample=extra[: self.sample_limit], + ) + + @staticmethod + def write_summary(path: Path, comparison: TransferComparisonResults) -> None: + lines = [ + f"generated_at={comparison.generated_at}", + "", + "| Transfer | Source CSV | Source Rows | Agreed Rows | Dest Model | Dest Rows | Missing Agreed | Matched | Missing | Extra |", + "|---|---|---:|---:|---|---:|---:|---:|---:|---:|", + ] + for name in sorted(comparison.results.keys()): + r = comparison.results[name] + missing_agreed = r.agreed_transfer_row_count - r.destination_row_count + lines.append( + f"| {name} | {r.source_csv} | {r.source_row_count} | {r.agreed_transfer_row_count} | " + f"{r.destination_model} | {r.destination_row_count} | {missing_agreed} | " + f"{r.matched_key_count} | {r.missing_in_destination_count} | {r.extra_in_destination_count} |" + ) + path.write_text("\n".join(lines) + "\n") diff --git a/transfers/transfer_results_specs.py b/transfers/transfer_results_specs.py new file mode 100644 index 00000000..f86e13b7 --- /dev/null +++ b/transfers/transfer_results_specs.py @@ -0,0 +1,485 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Callable + +import pandas as pd + +from transfers.associated_data import AssociatedDataTransferer +from transfers.chemistry_sampleinfo import ChemistrySampleInfoTransferer +from transfers.contact_transfer import ContactTransfer +from transfers.field_parameters_transfer import FieldParametersTransferer +from transfers.group_transfer import ProjectGroupTransferer +from transfers.hydraulicsdata import HydraulicsDataTransferer +from transfers.major_chemistry import MajorChemistryTransferer +from transfers.minor_trace_chemistry_transfer import MinorTraceChemistryTransferer +from transfers.ngwmn_views import ( + NGWMNLithologyTransferer, + NGWMNWaterLevelsTransferer, + NGWMNWellConstructionTransferer, +) +from transfers.radionuclides import RadionuclidesTransferer +from transfers.sensor_transfer import SensorTransferer +from transfers.soil_rock_results import SoilRockResultsTransferer +from transfers.stratigraphy_legacy import StratigraphyLegacyTransferer +from transfers.surface_water_data import SurfaceWaterDataTransferer +from transfers.surface_water_photos import SurfaceWaterPhotosTransferer +from transfers.util import read_csv +from transfers.waterlevels_transfer import WaterLevelTransferer +from transfers.waterlevelscontinuous_pressure_daily import ( + NMA_WaterLevelsContinuous_Pressure_DailyTransferer, +) +from transfers.weather_data import WeatherDataTransferer +from transfers.weather_photos import WeatherPhotosTransferer +from transfers.well_transfer import WellScreenTransferer, WellTransferer +from db import ( + Contact, + Group, + NMA_AssociatedData, + NMA_Chemistry_SampleInfo, + NMA_FieldParameters, + NMA_HydraulicsData, + NMA_MajorChemistry, + NMA_MinorTraceChemistry, + NMA_Radionuclides, + NMA_Soil_Rock_Results, + NMA_Stratigraphy, + NMA_SurfaceWaterData, + NMA_SurfaceWaterPhotos, + NMA_WaterLevelsContinuous_Pressure_Daily, + NMA_WeatherData, + NMA_WeatherPhotos, + NMA_view_NGWMN_Lithology, + NMA_view_NGWMN_WaterLevels, + NMA_view_NGWMN_WellConstruction, + Observation, + Sensor, + Thing, + WellScreen, +) +from transfers.transfer_results_types import ( + AssociatedDataTransferResult, + ChemistrySampleInfoTransferResult, + DiversionOfSurfaceWaterTransferResult, + EphemeralStreamsTransferResult, + EquipmentTransferResult, + FieldParametersTransferResult, + HydraulicsDataTransferResult, + LakePondReservoirTransferResult, + MajorChemistryTransferResult, + MetStationsTransferResult, + MinorTraceChemistryTransferResult, + NGWMNLithologyTransferResult, + NGWMNWaterLevelsTransferResult, + NGWMNWellConstructionTransferResult, + OtherSiteTypesTransferResult, + OutfallWastewaterReturnFlowTransferResult, + OwnersDataTransferResult, + PerennialStreamsTransferResult, + PressureDailyTransferResult, + ProjectsTransferResult, + RadionuclidesTransferResult, + RockSampleLocationsTransferResult, + SoilGasSampleLocationsTransferResult, + SoilRockResultsTransferResult, + SpringsTransferResult, + StratigraphyTransferResult, + SurfaceWaterDataTransferResult, + SurfaceWaterPhotosTransferResult, + TransferResult, + WaterLevelsTransferResult, + WeatherDataTransferResult, + WeatherPhotosTransferResult, + WellDataTransferResult, + WellScreensTransferResult, +) + + +@dataclass(frozen=True) +class TransferComparisonSpec: + transfer_name: str + result_cls: type[TransferResult] + source_csv: str + source_key_column: str + destination_model: Any + destination_key_column: str + source_filter: Callable[[pd.DataFrame], pd.DataFrame] | None = None + destination_where: Callable[[Any], Any] | None = None + agreed_row_counter: Callable[[], int] | None = None + + +def _location_site_filter(site_type: str) -> Callable[[pd.DataFrame], pd.DataFrame]: + def _f(df: pd.DataFrame) -> pd.DataFrame: + if "SiteType" not in df.columns: + return df.iloc[0:0] + return df[df["SiteType"] == site_type] + + return _f + + +def _agreed_rows_from_transferer(transferer_cls) -> int: + transferer = transferer_cls() + _, cleaned_df = transferer._get_dfs() + return int(len(cleaned_df)) + + +def _agreed_rows_location(site_type: str) -> int: + df = read_csv("Location") + df = df[df["SiteType"] == site_type] + df = df[df["Easting"].notna() & df["Northing"].notna()] + return int(len(df)) + + +TRANSFER_COMPARISON_SPECS: list[TransferComparisonSpec] = [ + TransferComparisonSpec( + "WellData", + WellDataTransferResult, + "WellData", + "WellID", + Thing, + "nma_pk_welldata", + destination_where=lambda m: m.thing_type == "water well", + agreed_row_counter=lambda: _agreed_rows_from_transferer(WellTransferer), + ), + TransferComparisonSpec( + "WellScreens", + WellScreensTransferResult, + "WellScreens", + "GlobalID", + WellScreen, + "nma_pk_wellscreens", + agreed_row_counter=lambda: _agreed_rows_from_transferer(WellScreenTransferer), + ), + TransferComparisonSpec( + "OwnersData", + OwnersDataTransferResult, + "OwnersData", + "OwnerKey", + Contact, + "nma_pk_owners", + agreed_row_counter=lambda: _agreed_rows_from_transferer(ContactTransfer), + ), + TransferComparisonSpec( + "WaterLevels", + WaterLevelsTransferResult, + "WaterLevels", + "GlobalID", + Observation, + "nma_pk_waterlevels", + agreed_row_counter=lambda: _agreed_rows_from_transferer(WaterLevelTransferer), + ), + TransferComparisonSpec( + "Equipment", + EquipmentTransferResult, + "Equipment", + "GlobalID", + Sensor, + "nma_pk_equipment", + agreed_row_counter=lambda: _agreed_rows_from_transferer(SensorTransferer), + ), + TransferComparisonSpec( + "Projects", + ProjectsTransferResult, + "Projects", + "Project", + Group, + "name", + agreed_row_counter=lambda: _agreed_rows_from_transferer(ProjectGroupTransferer), + ), + TransferComparisonSpec( + "SurfaceWaterPhotos", + SurfaceWaterPhotosTransferResult, + "SurfaceWaterPhotos", + "GlobalID", + NMA_SurfaceWaterPhotos, + "global_id", + agreed_row_counter=lambda: _agreed_rows_from_transferer( + SurfaceWaterPhotosTransferer + ), + ), + TransferComparisonSpec( + "Soil_Rock_Results", + SoilRockResultsTransferResult, + "Soil_Rock_Results", + "Point_ID", + NMA_Soil_Rock_Results, + "nma_point_id", + agreed_row_counter=lambda: _agreed_rows_from_transferer( + SoilRockResultsTransferer + ), + ), + TransferComparisonSpec( + "WeatherPhotos", + WeatherPhotosTransferResult, + "WeatherPhotos", + "GlobalID", + NMA_WeatherPhotos, + "global_id", + agreed_row_counter=lambda: _agreed_rows_from_transferer( + WeatherPhotosTransferer + ), + ), + TransferComparisonSpec( + "AssociatedData", + AssociatedDataTransferResult, + "AssociatedData", + "AssocID", + NMA_AssociatedData, + "nma_assoc_id", + agreed_row_counter=lambda: _agreed_rows_from_transferer( + AssociatedDataTransferer + ), + ), + TransferComparisonSpec( + "SurfaceWaterData", + SurfaceWaterDataTransferResult, + "SurfaceWaterData", + "OBJECTID", + NMA_SurfaceWaterData, + "object_id", + agreed_row_counter=lambda: _agreed_rows_from_transferer( + SurfaceWaterDataTransferer + ), + ), + TransferComparisonSpec( + "HydraulicsData", + HydraulicsDataTransferResult, + "HydraulicsData", + "GlobalID", + NMA_HydraulicsData, + "nma_global_id", + agreed_row_counter=lambda: _agreed_rows_from_transferer( + HydraulicsDataTransferer + ), + ), + TransferComparisonSpec( + "Chemistry_SampleInfo", + ChemistrySampleInfoTransferResult, + "Chemistry_SampleInfo", + "SamplePtID", + NMA_Chemistry_SampleInfo, + "nma_sample_pt_id", + agreed_row_counter=lambda: _agreed_rows_from_transferer( + ChemistrySampleInfoTransferer + ), + ), + TransferComparisonSpec( + "view_NGWMN_WellConstruction", + NGWMNWellConstructionTransferResult, + "view_NGWMN_WellConstruction", + "PointID", + NMA_view_NGWMN_WellConstruction, + "point_id", + agreed_row_counter=lambda: _agreed_rows_from_transferer( + NGWMNWellConstructionTransferer + ), + ), + TransferComparisonSpec( + "view_NGWMN_WaterLevels", + NGWMNWaterLevelsTransferResult, + "view_NGWMN_WaterLevels", + "PointID", + NMA_view_NGWMN_WaterLevels, + "point_id", + agreed_row_counter=lambda: _agreed_rows_from_transferer( + NGWMNWaterLevelsTransferer + ), + ), + TransferComparisonSpec( + "view_NGWMN_Lithology", + NGWMNLithologyTransferResult, + "view_NGWMN_Lithology", + "PointID", + NMA_view_NGWMN_Lithology, + "point_id", + agreed_row_counter=lambda: _agreed_rows_from_transferer( + NGWMNLithologyTransferer + ), + ), + TransferComparisonSpec( + "WaterLevelsContinuous_Pressure_Daily", + PressureDailyTransferResult, + "WaterLevelsContinuous_Pressure_Daily", + "GlobalID", + NMA_WaterLevelsContinuous_Pressure_Daily, + "global_id", + agreed_row_counter=lambda: _agreed_rows_from_transferer( + NMA_WaterLevelsContinuous_Pressure_DailyTransferer + ), + ), + TransferComparisonSpec( + "WeatherData", + WeatherDataTransferResult, + "WeatherData", + "OBJECTID", + NMA_WeatherData, + "object_id", + agreed_row_counter=lambda: _agreed_rows_from_transferer(WeatherDataTransferer), + ), + TransferComparisonSpec( + "Stratigraphy", + StratigraphyTransferResult, + "Stratigraphy", + "GlobalID", + NMA_Stratigraphy, + "nma_global_id", + agreed_row_counter=lambda: _agreed_rows_from_transferer( + StratigraphyLegacyTransferer + ), + ), + TransferComparisonSpec( + "MajorChemistry", + MajorChemistryTransferResult, + "MajorChemistry", + "GlobalID", + NMA_MajorChemistry, + "nma_global_id", + agreed_row_counter=lambda: _agreed_rows_from_transferer( + MajorChemistryTransferer + ), + ), + TransferComparisonSpec( + "Radionuclides", + RadionuclidesTransferResult, + "Radionuclides", + "GlobalID", + NMA_Radionuclides, + "nma_global_id", + agreed_row_counter=lambda: _agreed_rows_from_transferer( + RadionuclidesTransferer + ), + ), + TransferComparisonSpec( + "MinorandTraceChemistry", + MinorTraceChemistryTransferResult, + "MinorandTraceChemistry", + "GlobalID", + NMA_MinorTraceChemistry, + "nma_global_id", + agreed_row_counter=lambda: _agreed_rows_from_transferer( + MinorTraceChemistryTransferer + ), + ), + TransferComparisonSpec( + "FieldParameters", + FieldParametersTransferResult, + "FieldParameters", + "GlobalID", + NMA_FieldParameters, + "nma_global_id", + agreed_row_counter=lambda: _agreed_rows_from_transferer( + FieldParametersTransferer + ), + ), + TransferComparisonSpec( + "Springs", + SpringsTransferResult, + "Location", + "LocationId", + Thing, + "nma_pk_location", + source_filter=_location_site_filter("SP"), + destination_where=lambda m: m.thing_type == "spring", + agreed_row_counter=lambda: _agreed_rows_location("SP"), + ), + TransferComparisonSpec( + "PerennialStreams", + PerennialStreamsTransferResult, + "Location", + "LocationId", + Thing, + "nma_pk_location", + source_filter=_location_site_filter("PS"), + destination_where=lambda m: m.thing_type == "perennial stream", + agreed_row_counter=lambda: _agreed_rows_location("PS"), + ), + TransferComparisonSpec( + "EphemeralStreams", + EphemeralStreamsTransferResult, + "Location", + "LocationId", + Thing, + "nma_pk_location", + source_filter=_location_site_filter("ES"), + destination_where=lambda m: m.thing_type == "ephemeral stream", + agreed_row_counter=lambda: _agreed_rows_location("ES"), + ), + TransferComparisonSpec( + "MetStations", + MetStationsTransferResult, + "Location", + "LocationId", + Thing, + "nma_pk_location", + source_filter=_location_site_filter("M"), + destination_where=lambda m: m.thing_type == "meteorological station", + agreed_row_counter=lambda: _agreed_rows_location("M"), + ), + TransferComparisonSpec( + "RockSampleLocations", + RockSampleLocationsTransferResult, + "Location", + "LocationId", + Thing, + "nma_pk_location", + source_filter=_location_site_filter("R"), + destination_where=lambda m: m.thing_type == "rock sample location", + agreed_row_counter=lambda: _agreed_rows_location("R"), + ), + TransferComparisonSpec( + "DiversionOfSurfaceWater", + DiversionOfSurfaceWaterTransferResult, + "Location", + "LocationId", + Thing, + "nma_pk_location", + source_filter=_location_site_filter("D"), + destination_where=lambda m: m.thing_type == "diversion of surface water, etc.", + agreed_row_counter=lambda: _agreed_rows_location("D"), + ), + TransferComparisonSpec( + "LakePondReservoir", + LakePondReservoirTransferResult, + "Location", + "LocationId", + Thing, + "nma_pk_location", + source_filter=_location_site_filter("L"), + destination_where=lambda m: m.thing_type == "lake, pond or reservoir", + agreed_row_counter=lambda: _agreed_rows_location("L"), + ), + TransferComparisonSpec( + "SoilGasSampleLocations", + SoilGasSampleLocationsTransferResult, + "Location", + "LocationId", + Thing, + "nma_pk_location", + source_filter=_location_site_filter("S"), + destination_where=lambda m: m.thing_type == "soil gas sample location", + agreed_row_counter=lambda: _agreed_rows_location("S"), + ), + TransferComparisonSpec( + "OtherSiteTypes", + OtherSiteTypesTransferResult, + "Location", + "LocationId", + Thing, + "nma_pk_location", + source_filter=_location_site_filter("OT"), + destination_where=lambda m: m.thing_type == "other", + agreed_row_counter=lambda: _agreed_rows_location("OT"), + ), + TransferComparisonSpec( + "OutfallWastewaterReturnFlow", + OutfallWastewaterReturnFlowTransferResult, + "Location", + "LocationId", + Thing, + "nma_pk_location", + source_filter=_location_site_filter("O"), + destination_where=lambda m: m.thing_type + == "outfall of wastewater or return flow", + agreed_row_counter=lambda: _agreed_rows_location("O"), + ), +] diff --git a/transfers/transfer_results_types.py b/transfers/transfer_results_types.py new file mode 100644 index 00000000..dc58238a --- /dev/null +++ b/transfers/transfer_results_types.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +from dataclasses import dataclass, field + + +@dataclass +class TransferResult: + transfer_name: str + source_csv: str + source_key_column: str + destination_model: str + destination_key_column: str + source_row_count: int = 0 + agreed_transfer_row_count: int = 0 + source_keyed_row_count: int = 0 + source_key_count: int = 0 + source_duplicate_key_row_count: int = 0 + destination_row_count: int = 0 + destination_keyed_row_count: int = 0 + destination_key_count: int = 0 + destination_duplicate_key_row_count: int = 0 + matched_key_count: int = 0 + missing_in_destination_count: int = 0 + extra_in_destination_count: int = 0 + missing_in_destination_sample: list[str] = field(default_factory=list) + extra_in_destination_sample: list[str] = field(default_factory=list) + + +@dataclass +class TransferComparisonResults: + generated_at: str + results: dict[str, TransferResult] + + +_RESULT_CLASS_NAMES = [ + "WellData", + "WellScreens", + "OwnersData", + "WaterLevels", + "Equipment", + "Projects", + "SurfaceWaterPhotos", + "SoilRockResults", + "WeatherPhotos", + "AssociatedData", + "SurfaceWaterData", + "HydraulicsData", + "ChemistrySampleInfo", + "NGWMNWellConstruction", + "NGWMNWaterLevels", + "NGWMNLithology", + "PressureDaily", + "WeatherData", + "Stratigraphy", + "MajorChemistry", + "Radionuclides", + "MinorTraceChemistry", + "FieldParameters", + "Springs", + "PerennialStreams", + "EphemeralStreams", + "MetStations", + "RockSampleLocations", + "DiversionOfSurfaceWater", + "LakePondReservoir", + "SoilGasSampleLocations", + "OtherSiteTypes", + "OutfallWastewaterReturnFlow", +] + +for _name in _RESULT_CLASS_NAMES: + globals()[f"{_name}TransferResult"] = type( + f"{_name}TransferResult", (TransferResult,), {} + ) + + +__all__ = [ + "TransferResult", + "TransferComparisonResults", + *[f"{name}TransferResult" for name in _RESULT_CLASS_NAMES], +] diff --git a/transfers/well_transfer_results.py b/transfers/well_transfer_results.py deleted file mode 100644 index 555ab9f7..00000000 --- a/transfers/well_transfer_results.py +++ /dev/null @@ -1,332 +0,0 @@ -# =============================================================================== -# Copyright 2026 ross -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== -from __future__ import annotations - -import argparse -import csv -from dataclasses import dataclass -from datetime import datetime -from pathlib import Path - -import pandas as pd -from sqlalchemy import select - -from db import Thing -from db.engine import session_ctx -from transfers.util import ( - filter_non_transferred_wells, - get_transferable_wells, - read_csv, - replace_nans, -) - - -@dataclass -class ValidationIssue: - pointid: str - table: str - field: str - error: str - - -@dataclass -class WellTransferResults: - source_count: int - committed_count: int - transferred_count: int - skipped_by_decision: list[str] - validation_issue_wells: list[str] - validation_issues: list[ValidationIssue] - metrics_file: Path | None - skipped_by_existing_destination: list[str] - - -class WellTransferResultsBuilder: - """Build well transfer outcome summaries by comparing source and destination.""" - - def __init__( - self, - pointids: list[str] | None = None, - metrics_file: Path | None = None, - output_dir: Path | None = None, - ): - self.pointids = set(pointids or []) - self.metrics_file = metrics_file - self.output_dir = output_dir or (Path("transfers") / "metrics") - - def build(self) -> WellTransferResults: - source_df = self._load_source_wells() - committed_df = self._load_committed_wells(source_df) - committed_without_existing_df = filter_non_transferred_wells(committed_df) - - source_ids = self._point_ids(source_df) - committed_ids = self._point_ids(committed_df) - committed_without_existing_ids = self._point_ids(committed_without_existing_df) - destination_ids = self._load_destination_ids() - - skipped_by_decision = sorted(source_ids - committed_ids) - skipped_by_existing_destination = sorted( - committed_ids - committed_without_existing_ids - ) - transferred_ids = committed_ids & destination_ids - missing_committed_ids = committed_ids - transferred_ids - - validation_issues = self._load_well_validation_issues( - self._resolve_metrics_file() - ) - validation_issue_ids = { - issue.pointid for issue in validation_issues if issue.pointid in source_ids - } - validation_issue_wells = sorted(validation_issue_ids & missing_committed_ids) - - return WellTransferResults( - source_count=len(source_ids), - committed_count=len(committed_ids), - transferred_count=len(transferred_ids), - skipped_by_decision=skipped_by_decision, - validation_issue_wells=validation_issue_wells, - validation_issues=validation_issues, - metrics_file=self._resolve_metrics_file(), - skipped_by_existing_destination=skipped_by_existing_destination, - ) - - def write_reports(self, results: WellTransferResults) -> dict[str, Path]: - self.output_dir.mkdir(parents=True, exist_ok=True) - stamp = datetime.now().strftime("%Y-%m-%dT%H_%M_%S") - - summary_path = self.output_dir / f"well_transfer_results_{stamp}.txt" - not_migrated_path = self.output_dir / f"wells_not_migrated_{stamp}.csv" - validation_path = self.output_dir / f"wells_validation_issues_{stamp}.csv" - already_exists_path = ( - self.output_dir / f"wells_already_in_destination_{stamp}.csv" - ) - - summary_lines = [ - "Well Transfer Results", - f"source_count={results.source_count}", - f"committed_count={results.committed_count}", - f"transferred_count={results.transferred_count}", - f"not_transferred_by_decision_count={len(results.skipped_by_decision)}", - f"not_transferred_validation_count={len(results.validation_issue_wells)}", - ( - f"already_in_destination_count=" - f"{len(results.skipped_by_existing_destination)}" - ), - ( - f"metrics_file={results.metrics_file}" - if results.metrics_file - else "metrics_file=None" - ), - ] - summary_path.write_text("\n".join(summary_lines) + "\n") - - self._write_pointids(not_migrated_path, "pointid", results.skipped_by_decision) - self._write_pointids( - already_exists_path, "pointid", results.skipped_by_existing_destination - ) - self._write_validation_issues( - validation_path, - [ - issue - for issue in results.validation_issues - if issue.pointid in set(results.validation_issue_wells) - ], - ) - - return { - "summary": summary_path, - "not_migrated": not_migrated_path, - "validation_issues": validation_path, - "already_in_destination": already_exists_path, - } - - def _load_source_wells(self) -> pd.DataFrame: - wdf = read_csv("WellData", dtype={"OSEWelltagID": str}) - ldf = read_csv("Location") - ldf = ldf.drop(columns=["PointID", "SSMA_TimeStamp"], errors="ignore") - wdf = wdf.join(ldf.set_index("LocationId"), on="LocationId") - - wdf = wdf[wdf["SiteType"] == "GW"] - wdf = wdf[wdf["Easting"].notna() & wdf["Northing"].notna()] - wdf = replace_nans(wdf) - - if self.pointids: - wdf = wdf[wdf["PointID"].isin(self.pointids)] - - return wdf - - def _load_committed_wells(self, source_df: pd.DataFrame) -> pd.DataFrame: - committed_df = get_transferable_wells(source_df) - if self.pointids: - committed_df = committed_df[committed_df["PointID"].isin(self.pointids)] - - duplicates = committed_df["PointID"].duplicated(keep=False) - if duplicates.any(): - duplicate_ids = set(committed_df.loc[duplicates, "PointID"].tolist()) - committed_df = committed_df[~committed_df["PointID"].isin(duplicate_ids)] - - return committed_df.sort_values("PointID") - - @staticmethod - def _point_ids(df: pd.DataFrame) -> set[str]: - if df.empty: - return set() - return set(df["PointID"].dropna().astype(str).unique().tolist()) - - def _load_destination_ids(self) -> set[str]: - with session_ctx() as session: - ids = session.execute( - select(Thing.name).where(Thing.thing_type == "water well") - ).scalars() - thing_names = {str(name) for name in ids if name} - - if self.pointids: - thing_names = thing_names & self.pointids - - return thing_names - - def _resolve_metrics_file(self) -> Path | None: - if self.metrics_file: - return self.metrics_file - - metrics_dir = Path("transfers") / "metrics" - candidates = sorted( - metrics_dir.glob("metrics_*.csv"), key=lambda p: p.stat().st_mtime - ) - if not candidates: - return None - return candidates[-1] - - @staticmethod - def _load_well_validation_issues( - metrics_file: Path | None, - ) -> list[ValidationIssue]: - if metrics_file is None or not metrics_file.exists(): - return [] - - issues: list[ValidationIssue] = [] - current_model: str | None = None - with metrics_file.open(newline="") as f: - reader = csv.reader(f, delimiter="|") - for row in reader: - if not row: - continue - - if len(row) >= 5 and row[0] not in {"model", "PointID"}: - current_model = row[0] - continue - - if row[0] == "PointID": - continue - - if len(row) < 4: - continue - - if current_model != "Well": - continue - - pointid, table, field, error = row[0], row[1], row[2], row[3] - if table != "WellData": - continue - if "Validation Error" not in error: - continue - issues.append( - ValidationIssue( - pointid=pointid, - table=table, - field=field, - error=error, - ) - ) - return issues - - @staticmethod - def _write_pointids(path: Path, header: str, pointids: list[str]) -> None: - with path.open("w", newline="") as f: - writer = csv.writer(f) - writer.writerow([header]) - for pointid in pointids: - writer.writerow([pointid]) - - @staticmethod - def _write_validation_issues(path: Path, issues: list[ValidationIssue]) -> None: - with path.open("w", newline="") as f: - writer = csv.writer(f) - writer.writerow(["pointid", "table", "field", "error"]) - for issue in issues: - writer.writerow([issue.pointid, issue.table, issue.field, issue.error]) - - -def _parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Build transfer results for wells.") - parser.add_argument( - "--metrics-file", - type=Path, - default=None, - help="Optional metrics CSV to use for validation issue extraction.", - ) - parser.add_argument( - "--output-dir", - type=Path, - default=Path("transfers") / "metrics", - help="Directory where result files are written.", - ) - parser.add_argument( - "--pointids", - default=None, - help="Optional comma-separated list of PointID values to scope the report.", - ) - return parser.parse_args() - - -def main() -> None: - args = _parse_args() - pointids = args.pointids.split(",") if args.pointids else None - builder = WellTransferResultsBuilder( - pointids=pointids, - metrics_file=args.metrics_file, - output_dir=args.output_dir, - ) - results = builder.build() - outputs = builder.write_reports(results) - - print(f"Source wells: {results.source_count}") - print(f"Committed to migrate: {results.committed_count}") - print(f"Successfully transferred: {results.transferred_count}") - print( - f"Not transferred (decided not to migrate): {len(results.skipped_by_decision)}" - ) - print(f"Not transferred (validation issues): {len(results.validation_issue_wells)}") - print( - f"Already in destination before migration filter: " - f"{len(results.skipped_by_existing_destination)}" - ) - print(f"Summary file: {outputs['summary']}") - print(f"Not migrated wells file: {outputs['not_migrated']}") - print(f"Validation issue wells file: {outputs['validation_issues']}") - print(f"Already-in-destination wells file: {outputs['already_in_destination']}") - - print("\nWells not transferred (decided not to migrate):") - for pointid in results.skipped_by_decision: - print(pointid) - - print("\nWells not transferred (data validation issues):") - for pointid in results.validation_issue_wells: - print(pointid) - - -if __name__ == "__main__": - main() From cfb576e226bdab534c09de0c7d5d358044f0d1ef Mon Sep 17 00:00:00 2001 From: jirhiker <2035568+jirhiker@users.noreply.github.com> Date: Fri, 20 Feb 2026 00:05:14 +0000 Subject: [PATCH 03/14] Formatting changes --- transfers/transfer_results.py | 1 - 1 file changed, 1 deletion(-) diff --git a/transfers/transfer_results.py b/transfers/transfer_results.py index 0483e7fd..36337d52 100644 --- a/transfers/transfer_results.py +++ b/transfers/transfer_results.py @@ -10,7 +10,6 @@ ) from transfers.transfer_results_types import * # noqa: F401,F403 - __all__ = [ "TransferResultsBuilder", "TransferComparisonSpec", From 2d4d8ff185690ef10e79ca2b9715511d47ef5e30 Mon Sep 17 00:00:00 2001 From: jross Date: Thu, 19 Feb 2026 17:27:48 -0700 Subject: [PATCH 04/14] feat(migrations): make NMA_SurfaceWaterData.thing_id nullable --- ...ke_surface_water_data_thing_id_nullable.py | 57 +++++++ db/nma_legacy.py | 19 +-- transfers/surface_water_data.py | 18 +-- transfers/transfer_results_builder.py | 58 ++++++-- transfers/transfer_results_specs.py | 139 +++++------------- 5 files changed, 143 insertions(+), 148 deletions(-) create mode 100644 alembic/versions/i2c3d4e5f6a7_make_surface_water_data_thing_id_nullable.py diff --git a/alembic/versions/i2c3d4e5f6a7_make_surface_water_data_thing_id_nullable.py b/alembic/versions/i2c3d4e5f6a7_make_surface_water_data_thing_id_nullable.py new file mode 100644 index 00000000..0b0f00a2 --- /dev/null +++ b/alembic/versions/i2c3d4e5f6a7_make_surface_water_data_thing_id_nullable.py @@ -0,0 +1,57 @@ +"""Make NMA_SurfaceWaterData.thing_id nullable. + +Revision ID: i2c3d4e5f6a7 +Revises: f1a2b3c4d5e6 +Create Date: 2026-02-20 17:40:00.000000 +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op +from sqlalchemy import inspect + +# revision identifiers, used by Alembic. +revision: str = "i2c3d4e5f6a7" +down_revision: Union[str, Sequence[str], None] = "f1a2b3c4d5e6" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Allow orphan legacy SurfaceWaterData rows without a mapped Thing.""" + bind = op.get_bind() + inspector = inspect(bind) + if not inspector.has_table("NMA_SurfaceWaterData"): + return + + columns = {col["name"] for col in inspector.get_columns("NMA_SurfaceWaterData")} + if "thing_id" not in columns: + return + + op.alter_column( + "NMA_SurfaceWaterData", + "thing_id", + existing_type=sa.Integer(), + nullable=True, + ) + + +def downgrade() -> None: + """Revert to NOT NULL only when no null thing_id values exist.""" + bind = op.get_bind() + inspector = inspect(bind) + if not inspector.has_table("NMA_SurfaceWaterData"): + return + + columns = {col["name"] for col in inspector.get_columns("NMA_SurfaceWaterData")} + if "thing_id" not in columns: + return + + op.execute('DELETE FROM "NMA_SurfaceWaterData" WHERE thing_id IS NULL') + op.alter_column( + "NMA_SurfaceWaterData", + "thing_id", + existing_type=sa.Integer(), + nullable=False, + ) diff --git a/db/nma_legacy.py b/db/nma_legacy.py index cab2014e..8c01eae6 100644 --- a/db/nma_legacy.py +++ b/db/nma_legacy.py @@ -578,9 +578,9 @@ class NMA_SurfaceWaterData(Base): object_id: Mapped[int] = mapped_column("OBJECTID", Integer, primary_key=True) # FK - # FK to Thing - required for all SurfaceWaterData records - thing_id: Mapped[int] = mapped_column( - Integer, ForeignKey("thing.id", ondelete="CASCADE"), nullable=False + # FK to Thing - optional when legacy rows cannot be mapped to a Thing. + thing_id: Mapped[Optional[int]] = mapped_column( + Integer, ForeignKey("thing.id", ondelete="CASCADE"), nullable=True ) # Legacy PK (for audit) @@ -615,16 +615,9 @@ class NMA_SurfaceWaterData(Base): data_source: Mapped[Optional[str]] = mapped_column("DataSource", String(255)) # Relationships - thing: Mapped["Thing"] = relationship("Thing", back_populates="surface_water_data") - - @validates("thing_id") - def validate_thing_id(self, key, value): - """Prevent orphan NMA_SurfaceWaterData - must have a parent Thing.""" - if value is None: - raise ValueError( - "NMA_SurfaceWaterData requires a parent Thing (thing_id cannot be None)" - ) - return value + thing: Mapped[Optional["Thing"]] = relationship( + "Thing", back_populates="surface_water_data" + ) class NMA_SurfaceWaterPhotos(Base): diff --git a/transfers/surface_water_data.py b/transfers/surface_water_data.py index 9b4a6e32..e4e8a908 100644 --- a/transfers/surface_water_data.py +++ b/transfers/surface_water_data.py @@ -62,22 +62,12 @@ def _get_dfs(self) -> tuple[pd.DataFrame, pd.DataFrame]: def _transfer_hook(self, session: Session) -> None: rows: list[dict[str, Any]] = [] - skipped_missing_thing = 0 for raw in self.cleaned_df.to_dict("records"): record = self._row_dict(raw) - if record is None: - skipped_missing_thing += 1 - continue rows.append(record) rows = self._dedupe_rows(rows, key="OBJECTID", include_missing=True) - if skipped_missing_thing: - logger.warning( - "Skipped %s SurfaceWaterData rows without matching Thing", - skipped_missing_thing, - ) - insert_stmt = insert(NMA_SurfaceWaterData) excluded = insert_stmt.excluded @@ -111,7 +101,7 @@ def _transfer_hook(self, session: Session) -> None: session.commit() session.expunge_all() - def _row_dict(self, row: dict[str, Any]) -> Optional[dict[str, Any]]: + def _row_dict(self, row: dict[str, Any]) -> dict[str, Any]: def val(key: str) -> Optional[Any]: v = row.get(key) if pd.isna(v): @@ -133,12 +123,6 @@ def to_uuid(v: Any) -> Optional[uuid.UUID]: location_id = to_uuid(val("LocationId")) thing_id = self._resolve_thing_id(location_id) - if thing_id is None: - logger.warning( - "Skipping SurfaceWaterData LocationId=%s - Thing not found", - location_id, - ) - return None return { "LocationId": location_id, diff --git a/transfers/transfer_results_builder.py b/transfers/transfer_results_builder.py index a8e384a7..15ba47c8 100644 --- a/transfers/transfer_results_builder.py +++ b/transfers/transfer_results_builder.py @@ -7,6 +7,7 @@ from sqlalchemy import select, func from db.engine import session_ctx +from transfers.transfer import load_transfer_options from transfers.transfer_results_specs import ( TRANSFER_COMPARISON_SPECS, TransferComparisonSpec, @@ -15,7 +16,12 @@ TransferComparisonResults, TransferResult, ) -from transfers.util import read_csv +from transfers.util import ( + read_csv, + replace_nans, + get_transferable_wells, +) +import os def _normalize_key(value: Any) -> str | None: @@ -56,6 +62,8 @@ class TransferResultsBuilder: def __init__(self, sample_limit: int = 25): self.sample_limit = sample_limit + self.transfer_options = load_transfer_options() + self.transfer_limit = int(os.getenv("TRANSFER_LIMIT", "1000")) def build(self) -> TransferComparisonResults: results: dict[str, TransferResult] = {} @@ -70,16 +78,18 @@ def _build_one(self, spec: TransferComparisonSpec) -> TransferResult: source_df = read_csv(spec.source_csv) if spec.source_filter: source_df = spec.source_filter(source_df) - source_series = _normalized_series(source_df, spec.source_key_column) + comparison_df = source_df + enabled = self._is_enabled(spec) + if not enabled: + comparison_df = source_df.iloc[0:0] + elif spec.transfer_name == "WellData": + comparison_df = self._agreed_welldata_df() + + source_series = _normalized_series(comparison_df, spec.source_key_column) source_keys = set(source_series.unique().tolist()) source_keyed_row_count = int(source_series.shape[0]) source_duplicate_key_row_count = source_keyed_row_count - len(source_keys) - agreed_transfer_row_count = int(len(source_df)) - if spec.agreed_row_counter is not None: - try: - agreed_transfer_row_count = int(spec.agreed_row_counter()) - except Exception: - agreed_transfer_row_count = int(len(source_df)) + agreed_transfer_row_count = int(len(comparison_df)) model = spec.destination_model key_col = getattr(model, spec.destination_key_column) @@ -134,20 +144,44 @@ def _build_one(self, spec: TransferComparisonSpec) -> TransferResult: extra_in_destination_sample=extra[: self.sample_limit], ) + def _is_enabled(self, spec: TransferComparisonSpec) -> bool: + if not spec.option_field: + return True + return bool(getattr(self.transfer_options, spec.option_field, True)) + + def _agreed_welldata_df(self) -> pd.DataFrame: + wdf = read_csv("WellData", dtype={"OSEWelltagID": str}) + ldf = read_csv("Location") + ldf = ldf.drop(["PointID", "SSMA_TimeStamp"], axis=1, errors="ignore") + wdf = wdf.join(ldf.set_index("LocationId"), on="LocationId") + wdf = wdf[wdf["SiteType"] == "GW"] + wdf = wdf[wdf["Easting"].notna() & wdf["Northing"].notna()] + wdf = replace_nans(wdf) + + cleaned_df = get_transferable_wells(wdf) + + dupes = cleaned_df["PointID"].duplicated(keep=False) + if dupes.any(): + dup_ids = set(cleaned_df.loc[dupes, "PointID"]) + cleaned_df = cleaned_df[~cleaned_df["PointID"].isin(dup_ids)] + + if self.transfer_limit > 0: + cleaned_df = cleaned_df.head(self.transfer_limit) + return cleaned_df + @staticmethod def write_summary(path: Path, comparison: TransferComparisonResults) -> None: lines = [ f"generated_at={comparison.generated_at}", "", - "| Transfer | Source CSV | Source Rows | Agreed Rows | Dest Model | Dest Rows | Missing Agreed | Matched | Missing | Extra |", - "|---|---|---:|---:|---|---:|---:|---:|---:|---:|", + "| Transfer | Source CSV | Source Rows | Agreed Rows | Dest Model | Dest Rows | Missing Agreed |", + "|---|---|---:|---:|---|---:|---:|", ] for name in sorted(comparison.results.keys()): r = comparison.results[name] missing_agreed = r.agreed_transfer_row_count - r.destination_row_count lines.append( f"| {name} | {r.source_csv} | {r.source_row_count} | {r.agreed_transfer_row_count} | " - f"{r.destination_model} | {r.destination_row_count} | {missing_agreed} | " - f"{r.matched_key_count} | {r.missing_in_destination_count} | {r.extra_in_destination_count} |" + f"{r.destination_model} | {r.destination_row_count} | {missing_agreed} |" ) path.write_text("\n".join(lines) + "\n") diff --git a/transfers/transfer_results_specs.py b/transfers/transfer_results_specs.py index f86e13b7..3cfd7c05 100644 --- a/transfers/transfer_results_specs.py +++ b/transfers/transfer_results_specs.py @@ -5,33 +5,6 @@ import pandas as pd -from transfers.associated_data import AssociatedDataTransferer -from transfers.chemistry_sampleinfo import ChemistrySampleInfoTransferer -from transfers.contact_transfer import ContactTransfer -from transfers.field_parameters_transfer import FieldParametersTransferer -from transfers.group_transfer import ProjectGroupTransferer -from transfers.hydraulicsdata import HydraulicsDataTransferer -from transfers.major_chemistry import MajorChemistryTransferer -from transfers.minor_trace_chemistry_transfer import MinorTraceChemistryTransferer -from transfers.ngwmn_views import ( - NGWMNLithologyTransferer, - NGWMNWaterLevelsTransferer, - NGWMNWellConstructionTransferer, -) -from transfers.radionuclides import RadionuclidesTransferer -from transfers.sensor_transfer import SensorTransferer -from transfers.soil_rock_results import SoilRockResultsTransferer -from transfers.stratigraphy_legacy import StratigraphyLegacyTransferer -from transfers.surface_water_data import SurfaceWaterDataTransferer -from transfers.surface_water_photos import SurfaceWaterPhotosTransferer -from transfers.util import read_csv -from transfers.waterlevels_transfer import WaterLevelTransferer -from transfers.waterlevelscontinuous_pressure_daily import ( - NMA_WaterLevelsContinuous_Pressure_DailyTransferer, -) -from transfers.weather_data import WeatherDataTransferer -from transfers.weather_photos import WeatherPhotosTransferer -from transfers.well_transfer import WellScreenTransferer, WellTransferer from db import ( Contact, Group, @@ -105,7 +78,7 @@ class TransferComparisonSpec: destination_key_column: str source_filter: Callable[[pd.DataFrame], pd.DataFrame] | None = None destination_where: Callable[[Any], Any] | None = None - agreed_row_counter: Callable[[], int] | None = None + option_field: str | None = None def _location_site_filter(site_type: str) -> Callable[[pd.DataFrame], pd.DataFrame]: @@ -117,19 +90,6 @@ def _f(df: pd.DataFrame) -> pd.DataFrame: return _f -def _agreed_rows_from_transferer(transferer_cls) -> int: - transferer = transferer_cls() - _, cleaned_df = transferer._get_dfs() - return int(len(cleaned_df)) - - -def _agreed_rows_location(site_type: str) -> int: - df = read_csv("Location") - df = df[df["SiteType"] == site_type] - df = df[df["Easting"].notna() & df["Northing"].notna()] - return int(len(df)) - - TRANSFER_COMPARISON_SPECS: list[TransferComparisonSpec] = [ TransferComparisonSpec( "WellData", @@ -139,7 +99,6 @@ def _agreed_rows_location(site_type: str) -> int: Thing, "nma_pk_welldata", destination_where=lambda m: m.thing_type == "water well", - agreed_row_counter=lambda: _agreed_rows_from_transferer(WellTransferer), ), TransferComparisonSpec( "WellScreens", @@ -148,7 +107,7 @@ def _agreed_rows_location(site_type: str) -> int: "GlobalID", WellScreen, "nma_pk_wellscreens", - agreed_row_counter=lambda: _agreed_rows_from_transferer(WellScreenTransferer), + option_field="transfer_screens", ), TransferComparisonSpec( "OwnersData", @@ -157,7 +116,7 @@ def _agreed_rows_location(site_type: str) -> int: "OwnerKey", Contact, "nma_pk_owners", - agreed_row_counter=lambda: _agreed_rows_from_transferer(ContactTransfer), + option_field="transfer_contacts", ), TransferComparisonSpec( "WaterLevels", @@ -166,7 +125,7 @@ def _agreed_rows_location(site_type: str) -> int: "GlobalID", Observation, "nma_pk_waterlevels", - agreed_row_counter=lambda: _agreed_rows_from_transferer(WaterLevelTransferer), + option_field="transfer_waterlevels", ), TransferComparisonSpec( "Equipment", @@ -175,7 +134,7 @@ def _agreed_rows_location(site_type: str) -> int: "GlobalID", Sensor, "nma_pk_equipment", - agreed_row_counter=lambda: _agreed_rows_from_transferer(SensorTransferer), + option_field="transfer_sensors", ), TransferComparisonSpec( "Projects", @@ -184,7 +143,7 @@ def _agreed_rows_location(site_type: str) -> int: "Project", Group, "name", - agreed_row_counter=lambda: _agreed_rows_from_transferer(ProjectGroupTransferer), + option_field="transfer_groups", ), TransferComparisonSpec( "SurfaceWaterPhotos", @@ -193,9 +152,7 @@ def _agreed_rows_location(site_type: str) -> int: "GlobalID", NMA_SurfaceWaterPhotos, "global_id", - agreed_row_counter=lambda: _agreed_rows_from_transferer( - SurfaceWaterPhotosTransferer - ), + option_field="transfer_surface_water_photos", ), TransferComparisonSpec( "Soil_Rock_Results", @@ -204,9 +161,7 @@ def _agreed_rows_location(site_type: str) -> int: "Point_ID", NMA_Soil_Rock_Results, "nma_point_id", - agreed_row_counter=lambda: _agreed_rows_from_transferer( - SoilRockResultsTransferer - ), + option_field="transfer_soil_rock_results", ), TransferComparisonSpec( "WeatherPhotos", @@ -215,9 +170,7 @@ def _agreed_rows_location(site_type: str) -> int: "GlobalID", NMA_WeatherPhotos, "global_id", - agreed_row_counter=lambda: _agreed_rows_from_transferer( - WeatherPhotosTransferer - ), + option_field="transfer_weather_photos", ), TransferComparisonSpec( "AssociatedData", @@ -226,9 +179,7 @@ def _agreed_rows_location(site_type: str) -> int: "AssocID", NMA_AssociatedData, "nma_assoc_id", - agreed_row_counter=lambda: _agreed_rows_from_transferer( - AssociatedDataTransferer - ), + option_field="transfer_associated_data", ), TransferComparisonSpec( "SurfaceWaterData", @@ -237,9 +188,7 @@ def _agreed_rows_location(site_type: str) -> int: "OBJECTID", NMA_SurfaceWaterData, "object_id", - agreed_row_counter=lambda: _agreed_rows_from_transferer( - SurfaceWaterDataTransferer - ), + option_field="transfer_surface_water_data", ), TransferComparisonSpec( "HydraulicsData", @@ -248,9 +197,7 @@ def _agreed_rows_location(site_type: str) -> int: "GlobalID", NMA_HydraulicsData, "nma_global_id", - agreed_row_counter=lambda: _agreed_rows_from_transferer( - HydraulicsDataTransferer - ), + option_field="transfer_hydraulics_data", ), TransferComparisonSpec( "Chemistry_SampleInfo", @@ -259,9 +206,7 @@ def _agreed_rows_location(site_type: str) -> int: "SamplePtID", NMA_Chemistry_SampleInfo, "nma_sample_pt_id", - agreed_row_counter=lambda: _agreed_rows_from_transferer( - ChemistrySampleInfoTransferer - ), + option_field="transfer_chemistry_sampleinfo", ), TransferComparisonSpec( "view_NGWMN_WellConstruction", @@ -270,9 +215,7 @@ def _agreed_rows_location(site_type: str) -> int: "PointID", NMA_view_NGWMN_WellConstruction, "point_id", - agreed_row_counter=lambda: _agreed_rows_from_transferer( - NGWMNWellConstructionTransferer - ), + option_field="transfer_ngwmn_views", ), TransferComparisonSpec( "view_NGWMN_WaterLevels", @@ -281,9 +224,7 @@ def _agreed_rows_location(site_type: str) -> int: "PointID", NMA_view_NGWMN_WaterLevels, "point_id", - agreed_row_counter=lambda: _agreed_rows_from_transferer( - NGWMNWaterLevelsTransferer - ), + option_field="transfer_ngwmn_views", ), TransferComparisonSpec( "view_NGWMN_Lithology", @@ -292,9 +233,7 @@ def _agreed_rows_location(site_type: str) -> int: "PointID", NMA_view_NGWMN_Lithology, "point_id", - agreed_row_counter=lambda: _agreed_rows_from_transferer( - NGWMNLithologyTransferer - ), + option_field="transfer_ngwmn_views", ), TransferComparisonSpec( "WaterLevelsContinuous_Pressure_Daily", @@ -303,9 +242,7 @@ def _agreed_rows_location(site_type: str) -> int: "GlobalID", NMA_WaterLevelsContinuous_Pressure_Daily, "global_id", - agreed_row_counter=lambda: _agreed_rows_from_transferer( - NMA_WaterLevelsContinuous_Pressure_DailyTransferer - ), + option_field="transfer_pressure_daily", ), TransferComparisonSpec( "WeatherData", @@ -314,7 +251,7 @@ def _agreed_rows_location(site_type: str) -> int: "OBJECTID", NMA_WeatherData, "object_id", - agreed_row_counter=lambda: _agreed_rows_from_transferer(WeatherDataTransferer), + option_field="transfer_weather_data", ), TransferComparisonSpec( "Stratigraphy", @@ -323,9 +260,7 @@ def _agreed_rows_location(site_type: str) -> int: "GlobalID", NMA_Stratigraphy, "nma_global_id", - agreed_row_counter=lambda: _agreed_rows_from_transferer( - StratigraphyLegacyTransferer - ), + option_field="transfer_nma_stratigraphy", ), TransferComparisonSpec( "MajorChemistry", @@ -334,9 +269,7 @@ def _agreed_rows_location(site_type: str) -> int: "GlobalID", NMA_MajorChemistry, "nma_global_id", - agreed_row_counter=lambda: _agreed_rows_from_transferer( - MajorChemistryTransferer - ), + option_field="transfer_major_chemistry", ), TransferComparisonSpec( "Radionuclides", @@ -345,9 +278,7 @@ def _agreed_rows_location(site_type: str) -> int: "GlobalID", NMA_Radionuclides, "nma_global_id", - agreed_row_counter=lambda: _agreed_rows_from_transferer( - RadionuclidesTransferer - ), + option_field="transfer_radionuclides", ), TransferComparisonSpec( "MinorandTraceChemistry", @@ -356,9 +287,7 @@ def _agreed_rows_location(site_type: str) -> int: "GlobalID", NMA_MinorTraceChemistry, "nma_global_id", - agreed_row_counter=lambda: _agreed_rows_from_transferer( - MinorTraceChemistryTransferer - ), + option_field="transfer_minor_trace_chemistry", ), TransferComparisonSpec( "FieldParameters", @@ -367,9 +296,7 @@ def _agreed_rows_location(site_type: str) -> int: "GlobalID", NMA_FieldParameters, "nma_global_id", - agreed_row_counter=lambda: _agreed_rows_from_transferer( - FieldParametersTransferer - ), + option_field="transfer_field_parameters", ), TransferComparisonSpec( "Springs", @@ -380,7 +307,7 @@ def _agreed_rows_location(site_type: str) -> int: "nma_pk_location", source_filter=_location_site_filter("SP"), destination_where=lambda m: m.thing_type == "spring", - agreed_row_counter=lambda: _agreed_rows_location("SP"), + option_field="transfer_springs", ), TransferComparisonSpec( "PerennialStreams", @@ -391,7 +318,7 @@ def _agreed_rows_location(site_type: str) -> int: "nma_pk_location", source_filter=_location_site_filter("PS"), destination_where=lambda m: m.thing_type == "perennial stream", - agreed_row_counter=lambda: _agreed_rows_location("PS"), + option_field="transfer_perennial_streams", ), TransferComparisonSpec( "EphemeralStreams", @@ -402,7 +329,7 @@ def _agreed_rows_location(site_type: str) -> int: "nma_pk_location", source_filter=_location_site_filter("ES"), destination_where=lambda m: m.thing_type == "ephemeral stream", - agreed_row_counter=lambda: _agreed_rows_location("ES"), + option_field="transfer_ephemeral_streams", ), TransferComparisonSpec( "MetStations", @@ -413,7 +340,7 @@ def _agreed_rows_location(site_type: str) -> int: "nma_pk_location", source_filter=_location_site_filter("M"), destination_where=lambda m: m.thing_type == "meteorological station", - agreed_row_counter=lambda: _agreed_rows_location("M"), + option_field="transfer_met_stations", ), TransferComparisonSpec( "RockSampleLocations", @@ -424,7 +351,7 @@ def _agreed_rows_location(site_type: str) -> int: "nma_pk_location", source_filter=_location_site_filter("R"), destination_where=lambda m: m.thing_type == "rock sample location", - agreed_row_counter=lambda: _agreed_rows_location("R"), + option_field="transfer_rock_sample_locations", ), TransferComparisonSpec( "DiversionOfSurfaceWater", @@ -435,7 +362,7 @@ def _agreed_rows_location(site_type: str) -> int: "nma_pk_location", source_filter=_location_site_filter("D"), destination_where=lambda m: m.thing_type == "diversion of surface water, etc.", - agreed_row_counter=lambda: _agreed_rows_location("D"), + option_field="transfer_diversion_of_surface_water", ), TransferComparisonSpec( "LakePondReservoir", @@ -446,7 +373,7 @@ def _agreed_rows_location(site_type: str) -> int: "nma_pk_location", source_filter=_location_site_filter("L"), destination_where=lambda m: m.thing_type == "lake, pond or reservoir", - agreed_row_counter=lambda: _agreed_rows_location("L"), + option_field="transfer_lake_pond_reservoir", ), TransferComparisonSpec( "SoilGasSampleLocations", @@ -457,7 +384,7 @@ def _agreed_rows_location(site_type: str) -> int: "nma_pk_location", source_filter=_location_site_filter("S"), destination_where=lambda m: m.thing_type == "soil gas sample location", - agreed_row_counter=lambda: _agreed_rows_location("S"), + option_field="transfer_soil_gas_sample_locations", ), TransferComparisonSpec( "OtherSiteTypes", @@ -468,7 +395,7 @@ def _agreed_rows_location(site_type: str) -> int: "nma_pk_location", source_filter=_location_site_filter("OT"), destination_where=lambda m: m.thing_type == "other", - agreed_row_counter=lambda: _agreed_rows_location("OT"), + option_field="transfer_other_site_types", ), TransferComparisonSpec( "OutfallWastewaterReturnFlow", @@ -480,6 +407,6 @@ def _agreed_rows_location(site_type: str) -> int: source_filter=_location_site_filter("O"), destination_where=lambda m: m.thing_type == "outfall of wastewater or return flow", - agreed_row_counter=lambda: _agreed_rows_location("O"), + option_field="transfer_outfall_wastewater_return_flow", ), ] From ba7881bccf444a643ac5aae17a38c5e2597e5d63 Mon Sep 17 00:00:00 2001 From: jakeross Date: Thu, 19 Feb 2026 18:27:02 -0700 Subject: [PATCH 05/14] fix: enforce required thing_id for NMA_SurfaceWaterData and add validation --- ...ke_surface_water_data_thing_id_nullable.py | 57 ------------------- db/nma_legacy.py | 19 +++++-- 2 files changed, 13 insertions(+), 63 deletions(-) delete mode 100644 alembic/versions/i2c3d4e5f6a7_make_surface_water_data_thing_id_nullable.py diff --git a/alembic/versions/i2c3d4e5f6a7_make_surface_water_data_thing_id_nullable.py b/alembic/versions/i2c3d4e5f6a7_make_surface_water_data_thing_id_nullable.py deleted file mode 100644 index 0b0f00a2..00000000 --- a/alembic/versions/i2c3d4e5f6a7_make_surface_water_data_thing_id_nullable.py +++ /dev/null @@ -1,57 +0,0 @@ -"""Make NMA_SurfaceWaterData.thing_id nullable. - -Revision ID: i2c3d4e5f6a7 -Revises: f1a2b3c4d5e6 -Create Date: 2026-02-20 17:40:00.000000 -""" - -from typing import Sequence, Union - -import sqlalchemy as sa -from alembic import op -from sqlalchemy import inspect - -# revision identifiers, used by Alembic. -revision: str = "i2c3d4e5f6a7" -down_revision: Union[str, Sequence[str], None] = "f1a2b3c4d5e6" -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - - -def upgrade() -> None: - """Allow orphan legacy SurfaceWaterData rows without a mapped Thing.""" - bind = op.get_bind() - inspector = inspect(bind) - if not inspector.has_table("NMA_SurfaceWaterData"): - return - - columns = {col["name"] for col in inspector.get_columns("NMA_SurfaceWaterData")} - if "thing_id" not in columns: - return - - op.alter_column( - "NMA_SurfaceWaterData", - "thing_id", - existing_type=sa.Integer(), - nullable=True, - ) - - -def downgrade() -> None: - """Revert to NOT NULL only when no null thing_id values exist.""" - bind = op.get_bind() - inspector = inspect(bind) - if not inspector.has_table("NMA_SurfaceWaterData"): - return - - columns = {col["name"] for col in inspector.get_columns("NMA_SurfaceWaterData")} - if "thing_id" not in columns: - return - - op.execute('DELETE FROM "NMA_SurfaceWaterData" WHERE thing_id IS NULL') - op.alter_column( - "NMA_SurfaceWaterData", - "thing_id", - existing_type=sa.Integer(), - nullable=False, - ) diff --git a/db/nma_legacy.py b/db/nma_legacy.py index 8c01eae6..cab2014e 100644 --- a/db/nma_legacy.py +++ b/db/nma_legacy.py @@ -578,9 +578,9 @@ class NMA_SurfaceWaterData(Base): object_id: Mapped[int] = mapped_column("OBJECTID", Integer, primary_key=True) # FK - # FK to Thing - optional when legacy rows cannot be mapped to a Thing. - thing_id: Mapped[Optional[int]] = mapped_column( - Integer, ForeignKey("thing.id", ondelete="CASCADE"), nullable=True + # FK to Thing - required for all SurfaceWaterData records + thing_id: Mapped[int] = mapped_column( + Integer, ForeignKey("thing.id", ondelete="CASCADE"), nullable=False ) # Legacy PK (for audit) @@ -615,9 +615,16 @@ class NMA_SurfaceWaterData(Base): data_source: Mapped[Optional[str]] = mapped_column("DataSource", String(255)) # Relationships - thing: Mapped[Optional["Thing"]] = relationship( - "Thing", back_populates="surface_water_data" - ) + thing: Mapped["Thing"] = relationship("Thing", back_populates="surface_water_data") + + @validates("thing_id") + def validate_thing_id(self, key, value): + """Prevent orphan NMA_SurfaceWaterData - must have a parent Thing.""" + if value is None: + raise ValueError( + "NMA_SurfaceWaterData requires a parent Thing (thing_id cannot be None)" + ) + return value class NMA_SurfaceWaterPhotos(Base): From b4764b2e9e06d93fdf536b8e38a3bf058f8ee215 Mon Sep 17 00:00:00 2001 From: jakeross Date: Thu, 19 Feb 2026 20:30:57 -0700 Subject: [PATCH 06/14] feat: add transfer-results command for generating transfer results summary --- ...e6_merge_migrations_after_staging_merge.py | 25 -- ...add_unique_index_ngwmn_wellconstruction.py | 4 +- cli/cli.py | 27 ++ pyproject.toml | 2 +- tests/test_cli_commands.py | 56 ++- transfers/transfer.py | 8 +- transfers/transfer_results.py | 50 --- transfers/transfer_results_builder.py | 9 +- transfers/transfer_results_specs.py | 322 ++++++++++++++++++ 9 files changed, 414 insertions(+), 89 deletions(-) delete mode 100644 alembic/versions/43bc34504ee6_merge_migrations_after_staging_merge.py delete mode 100644 transfers/transfer_results.py diff --git a/alembic/versions/43bc34504ee6_merge_migrations_after_staging_merge.py b/alembic/versions/43bc34504ee6_merge_migrations_after_staging_merge.py deleted file mode 100644 index 86943385..00000000 --- a/alembic/versions/43bc34504ee6_merge_migrations_after_staging_merge.py +++ /dev/null @@ -1,25 +0,0 @@ -"""merge_migrations_after_staging_merge - -Revision ID: 43bc34504ee6 -Revises: 3cb924ca51fd -Create Date: 2026-01-30 11:52:41.932306 - -""" - -from typing import Sequence, Union - -# revision identifiers, used by Alembic. -revision: str = "43bc34504ee6" -down_revision: Union[str, Sequence[str], None] = "3cb924ca51fd" -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - - -def upgrade() -> None: - """Upgrade schema.""" - pass - - -def downgrade() -> None: - """Downgrade schema.""" - pass diff --git a/alembic/versions/50d1c2a3b4c5_add_unique_index_ngwmn_wellconstruction.py b/alembic/versions/50d1c2a3b4c5_add_unique_index_ngwmn_wellconstruction.py index ceffbdaa..edf6fb8e 100644 --- a/alembic/versions/50d1c2a3b4c5_add_unique_index_ngwmn_wellconstruction.py +++ b/alembic/versions/50d1c2a3b4c5_add_unique_index_ngwmn_wellconstruction.py @@ -1,7 +1,7 @@ """Add unique index for NGWMN well construction Revision ID: 50d1c2a3b4c5 -Revises: 43bc34504ee6 +Revises: 3cb924ca51fd Create Date: 2026-01-31 00:27:12.204176 """ @@ -12,7 +12,7 @@ # revision identifiers, used by Alembic. revision: str = "50d1c2a3b4c5" -down_revision: Union[str, Sequence[str], None] = "43bc34504ee6" +down_revision: Union[str, Sequence[str], None] = "3cb924ca51fd" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None diff --git a/cli/cli.py b/cli/cli.py index 6be0e16e..c84c862a 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -109,6 +109,33 @@ def associate_assets_command( associate_assets(root_directory) +@cli.command("transfer-results") +def transfer_results( + summary_path: Path = typer.Option( + Path("transfers") / "metrics" / "transfer_results_summary.md", + "--summary-path", + help="Output path for markdown summary table.", + ), + sample_limit: int = typer.Option( + 25, + "--sample-limit", + min=1, + help="Max missing/extra key samples stored per transfer.", + ), + theme: ThemeMode = typer.Option( + ThemeMode.auto, "--theme", help="Color theme: auto, light, dark." + ), +): + from transfers.transfer_results_builder import TransferResultsBuilder + + builder = TransferResultsBuilder(sample_limit=sample_limit) + results = builder.build() + summary_path.parent.mkdir(parents=True, exist_ok=True) + TransferResultsBuilder.write_summary(summary_path, results) + typer.echo(f"Wrote comparison summary: {summary_path}") + typer.echo(f"Transfer comparisons: {len(results.results)}") + + @cli.command("well-inventory-csv") def well_inventory_csv( file_path: str = typer.Argument( diff --git a/pyproject.toml b/pyproject.toml index 70d4bae8..45f81453 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,7 +108,7 @@ dependencies = [ package = true [tool.setuptools] -packages = ["alembic", "cli", "core", "db", "schemas", "services"] +packages = ["alembic", "cli", "core", "db", "schemas", "services", "transfers"] [project.scripts] oco = "cli.cli:cli" diff --git a/tests/test_cli_commands.py b/tests/test_cli_commands.py index f70d8613..8bdc2f9c 100644 --- a/tests/test_cli_commands.py +++ b/tests/test_cli_commands.py @@ -18,13 +18,15 @@ import textwrap import uuid from pathlib import Path +from types import SimpleNamespace + +from sqlalchemy import select +from typer.testing import CliRunner from cli.cli import cli from cli.service_adapter import WellInventoryResult from db import FieldActivity, FieldEvent, Observation, Sample from db.engine import session_ctx -from sqlalchemy import select -from typer.testing import CliRunner def test_initialize_lexicon_invokes_initializer(monkeypatch): @@ -95,6 +97,50 @@ def fake_well_inventory(file_path): assert "[WELL INVENTORY IMPORT] SUCCESS" in result.output +def test_transfer_results_command_writes_summary(monkeypatch, tmp_path): + captured: dict[str, object] = {} + + class FakeBuilder: + def __init__(self, sample_limit: int = 25): + captured["sample_limit"] = sample_limit + + def build(self): + captured["built"] = True + return SimpleNamespace( + results={"WellData": object(), "WaterLevels": object()} + ) + + @staticmethod + def write_summary(path, comparison): + captured["summary_path"] = Path(path) + captured["result_count"] = len(comparison.results) + + monkeypatch.setattr( + "transfers.transfer_results_builder.TransferResultsBuilder", FakeBuilder + ) + + summary_path = tmp_path / "metrics" / "summary.md" + runner = CliRunner() + result = runner.invoke( + cli, + [ + "transfer-results", + "--summary-path", + str(summary_path), + "--sample-limit", + "11", + ], + ) + + assert result.exit_code == 0, result.output + assert captured["sample_limit"] == 11 + assert captured["built"] is True + assert captured["summary_path"] == summary_path + assert captured["result_count"] == 2 + assert f"Wrote comparison summary: {summary_path}" in result.output + assert "Transfer comparisons: 2" in result.output + + def test_well_inventory_csv_command_reports_validation_errors(monkeypatch, tmp_path): inventory_file = tmp_path / "inventory.csv" inventory_file.write_text("header\nvalue\n") @@ -198,10 +244,12 @@ def test_water_levels_cli_persists_observations(tmp_path, water_well_thing): """ def _write_csv(path: Path, *, well_name: str, notes: str): - csv_text = textwrap.dedent(f"""\ + csv_text = textwrap.dedent( + f"""\ field_staff,well_name_point_id,field_event_date_time,measurement_date_time,sampler,sample_method,mp_height,level_status,depth_to_water_ft,data_quality,water_level_notes CLI Tester,{well_name},2025-02-15T08:00:00-07:00,2025-02-15T10:30:00-07:00,Groundwater Team,electric tape,1.5,stable,42.5,approved,{notes} - """) + """ + ) path.write_text(csv_text) unique_notes = f"pytest-{uuid.uuid4()}" diff --git a/transfers/transfer.py b/transfers/transfer.py index 1e50accb..83b8df3b 100644 --- a/transfers/transfer.py +++ b/transfers/transfer.py @@ -20,6 +20,7 @@ from dataclasses import dataclass from dotenv import load_dotenv + from transfers.thing_transfer import ( transfer_rock_sample_locations, transfer_springs, @@ -698,9 +699,10 @@ def main(): profile_artifacts = transfer_all(metrics) metrics.close() - metrics.save_to_storage_bucket() - save_log_to_bucket() - upload_profile_artifacts(profile_artifacts) + if get_bool_env("SAVE_TO_BUCKET", False): + metrics.save_to_storage_bucket() + save_log_to_bucket() + upload_profile_artifacts(profile_artifacts) message("END--------------------------------------") diff --git a/transfers/transfer_results.py b/transfers/transfer_results.py deleted file mode 100644 index 36337d52..00000000 --- a/transfers/transfer_results.py +++ /dev/null @@ -1,50 +0,0 @@ -from __future__ import annotations - -import argparse -from pathlib import Path - -from transfers.transfer_results_builder import TransferResultsBuilder -from transfers.transfer_results_specs import ( - TRANSFER_COMPARISON_SPECS, - TransferComparisonSpec, -) -from transfers.transfer_results_types import * # noqa: F401,F403 - -__all__ = [ - "TransferResultsBuilder", - "TransferComparisonSpec", - "TRANSFER_COMPARISON_SPECS", -] - - -def _parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="Compare each transfer input CSV against destination Postgres rows." - ) - parser.add_argument( - "--summary-path", - type=Path, - default=Path("transfers") / "metrics" / "transfer_results_summary.md", - help="Output path for markdown summary table.", - ) - parser.add_argument( - "--sample-limit", - type=int, - default=25, - help="Max missing/extra key samples stored per transfer.", - ) - return parser.parse_args() - - -def main() -> None: - args = _parse_args() - builder = TransferResultsBuilder(sample_limit=args.sample_limit) - results = builder.build() - args.summary_path.parent.mkdir(parents=True, exist_ok=True) - TransferResultsBuilder.write_summary(args.summary_path, results) - print(f"Wrote comparison summary: {args.summary_path}") - print(f"Transfer comparisons: {len(results.results)}") - - -if __name__ == "__main__": - main() diff --git a/transfers/transfer_results_builder.py b/transfers/transfer_results_builder.py index 15ba47c8..1a2392c0 100644 --- a/transfers/transfer_results_builder.py +++ b/transfers/transfer_results_builder.py @@ -1,5 +1,6 @@ from __future__ import annotations +import os from pathlib import Path from typing import Any @@ -21,7 +22,6 @@ replace_nans, get_transferable_wells, ) -import os def _normalize_key(value: Any) -> str | None: @@ -79,9 +79,11 @@ def _build_one(self, spec: TransferComparisonSpec) -> TransferResult: if spec.source_filter: source_df = spec.source_filter(source_df) comparison_df = source_df + if spec.agreed_filter: + comparison_df = spec.agreed_filter(comparison_df) enabled = self._is_enabled(spec) if not enabled: - comparison_df = source_df.iloc[0:0] + comparison_df = comparison_df.iloc[0:0] elif spec.transfer_name == "WellData": comparison_df = self._agreed_welldata_df() @@ -179,9 +181,8 @@ def write_summary(path: Path, comparison: TransferComparisonResults) -> None: ] for name in sorted(comparison.results.keys()): r = comparison.results[name] - missing_agreed = r.agreed_transfer_row_count - r.destination_row_count lines.append( f"| {name} | {r.source_csv} | {r.source_row_count} | {r.agreed_transfer_row_count} | " - f"{r.destination_model} | {r.destination_row_count} | {missing_agreed} |" + f"{r.destination_model} | {r.destination_row_count} | {r.missing_in_destination_count} |" ) path.write_text("\n".join(lines) + "\n") diff --git a/transfers/transfer_results_specs.py b/transfers/transfer_results_specs.py index 3cfd7c05..449ffa89 100644 --- a/transfers/transfer_results_specs.py +++ b/transfers/transfer_results_specs.py @@ -1,7 +1,9 @@ from __future__ import annotations +import json from dataclasses import dataclass from typing import Any, Callable +from uuid import UUID import pandas as pd @@ -29,6 +31,15 @@ Sensor, Thing, WellScreen, + Location, + LocationThingAssociation, +) +from db.engine import session_ctx +from transfers.contact_transfer import ( + _get_organization, + _make_name, + _safe_make_name, + _select_ownerkey_col, ) from transfers.transfer_results_types import ( AssociatedDataTransferResult, @@ -66,6 +77,13 @@ WellDataTransferResult, WellScreensTransferResult, ) +from transfers.util import ( + filter_by_valid_measuring_agency, + filter_to_valid_point_ids, + get_transfers_data_path, + read_csv, + replace_nans, +) @dataclass(frozen=True) @@ -77,6 +95,7 @@ class TransferComparisonSpec: destination_model: Any destination_key_column: str source_filter: Callable[[pd.DataFrame], pd.DataFrame] | None = None + agreed_filter: Callable[[pd.DataFrame], pd.DataFrame] | None = None destination_where: Callable[[Any], Any] | None = None option_field: str | None = None @@ -90,6 +109,297 @@ def _f(df: pd.DataFrame) -> pd.DataFrame: return _f +def _chemistry_sampleinfo_filter(df: pd.DataFrame) -> pd.DataFrame: + # Mirror ChemistrySampleInfoTransferer filters: + # 1) valid LocationId that resolves to a Thing via LocationThingAssociation + # 2) valid UUID SamplePtID + if "LocationId" not in df.columns or "SamplePtID" not in df.columns: + return df.iloc[0:0] + + with session_ctx() as session: + rows = ( + session.query(Location.nma_pk_location) + .join( + LocationThingAssociation, + Location.id == LocationThingAssociation.location_id, + ) + .filter(Location.nma_pk_location.isnot(None)) + .all() + ) + valid_location_ids = { + str(nma_pk_location).strip().lower() for (nma_pk_location,) in rows + } + + def _normalize_location(value: Any) -> str | None: + if pd.isna(value): + return None + text = str(value).strip().lower() + return text or None + + def _is_valid_uuid(value: Any) -> bool: + if pd.isna(value): + return False + try: + UUID(str(value)) + except (TypeError, ValueError): + return False + return True + + location_mask = df["LocationId"].apply(_normalize_location).isin(valid_location_ids) + sample_pt_mask = df["SamplePtID"].apply(_is_valid_uuid) + return df[location_mask & sample_pt_mask].copy() + + +def _chemistry_child_filter(df: pd.DataFrame) -> pd.DataFrame: + # Mirror ChemistryTransferer._filter_to_valid_sample_infos: + # keep only rows whose SamplePtID resolves to an existing ChemistrySampleInfo. + if "SamplePtID" not in df.columns: + return df.iloc[0:0] + + with session_ctx() as session: + rows = ( + session.query(NMA_Chemistry_SampleInfo.nma_sample_pt_id) + .filter(NMA_Chemistry_SampleInfo.nma_sample_pt_id.isnot(None)) + .all() + ) + valid_sample_pt_ids = {sample_pt_id for (sample_pt_id,) in rows} + + def _uuid_or_none(value: Any) -> UUID | None: + if pd.isna(value): + return None + try: + return UUID(str(value)) + except (TypeError, ValueError): + return None + + sample_pt_mask = df["SamplePtID"].map(_uuid_or_none).isin(valid_sample_pt_ids) + return df[sample_pt_mask].copy() + + +def _waterlevels_filter(df: pd.DataFrame) -> pd.DataFrame: + # Mirror WaterLevelTransferer._get_dfs filtering stage. + cleaned_df = replace_nans(df.copy()) + cleaned_df = filter_to_valid_point_ids(cleaned_df) + cleaned_df = filter_by_valid_measuring_agency(cleaned_df) + return cleaned_df + + +def _stratigraphy_filter(df: pd.DataFrame) -> pd.DataFrame: + # Mirror StratigraphyLegacyTransferer._get_dfs filtering stage. + cleaned_df = replace_nans(df.copy()) + cleaned_df = filter_to_valid_point_ids(cleaned_df) + return cleaned_df + + +def _hydraulics_filter(df: pd.DataFrame) -> pd.DataFrame: + # Mirror HydraulicsDataTransferer._filter_to_valid_things: + # keep only rows whose PointID exists in Thing.name. + if "PointID" not in df.columns: + return df.iloc[0:0] + + with session_ctx() as session: + thing_names = { + name + for (name,) in session.query(Thing.name) + .filter(Thing.name.isnot(None)) + .all() + } + + return df[df["PointID"].isin(thing_names)].copy() + + +def _ngwmn_waterlevels_filter(df: pd.DataFrame) -> pd.DataFrame: + # Mirror NGWMNWaterLevelsTransferer dedupe key: + # conflict columns are (PointID, DateMeasured), with later rows winning. + if "PointID" not in df.columns or "DateMeasured" not in df.columns: + return df.iloc[0:0] + + dedupe_df = df.copy() + dedupe_df["_pointid_norm"] = dedupe_df["PointID"].astype(str) + parsed_dates = pd.to_datetime(dedupe_df["DateMeasured"], errors="coerce") + dedupe_df["_date_measured_norm"] = parsed_dates.dt.date + # Match transfer _dedupe_rows(..., include_missing=True): + # rows with missing key parts are not deduped. + missing_key_mask = ( + dedupe_df["_pointid_norm"].isna() | dedupe_df["_date_measured_norm"].isna() + ) + non_missing = dedupe_df.loc[~missing_key_mask].drop_duplicates( + subset=["_pointid_norm", "_date_measured_norm"], keep="last" + ) + missing = dedupe_df.loc[missing_key_mask] + out = pd.concat([non_missing, missing], axis=0) + return out.drop(columns=["_pointid_norm", "_date_measured_norm"]) + + +def _ngwmn_wellconstruction_filter(df: pd.DataFrame) -> pd.DataFrame: + # Mirror NGWMNWellConstructionTransferer dedupe key: + # conflict columns are (PointID, CasingTop, ScreenTop), with later rows winning. + required = {"PointID", "CasingTop", "ScreenTop"} + if not required.issubset(df.columns): + return df.iloc[0:0] + + def _float_or_none(value: Any) -> float | None: + if value is None or pd.isna(value): + return None + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + import re + + match = re.search(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", value) + if match: + try: + return float(match.group(0)) + except ValueError: + return None + return None + + dedupe_df = df.copy() + dedupe_df["_pointid_norm"] = dedupe_df["PointID"].astype(str) + dedupe_df["_casing_top_norm"] = dedupe_df["CasingTop"].map(_float_or_none) + dedupe_df["_screen_top_norm"] = dedupe_df["ScreenTop"].map(_float_or_none) + # Match transfer _dedupe_rows(..., include_missing=True): + # rows with missing key parts are not deduped. + missing_key_mask = ( + dedupe_df["_pointid_norm"].isna() + | dedupe_df["_casing_top_norm"].isna() + | dedupe_df["_screen_top_norm"].isna() + ) + non_missing = dedupe_df.loc[~missing_key_mask].drop_duplicates( + subset=["_pointid_norm", "_casing_top_norm", "_screen_top_norm"], + keep="last", + ) + missing = dedupe_df.loc[missing_key_mask] + out = pd.concat([non_missing, missing], axis=0) + return out.drop(columns=["_pointid_norm", "_casing_top_norm", "_screen_top_norm"]) + + +def _load_json_mapping(path: str) -> dict[str, str]: + try: + with open(path, "r") as f: + return json.load(f) + except FileNotFoundError: + return {} + + +def _ownersdata_agreed_filter(df: pd.DataFrame) -> pd.DataFrame: + # Mirror ContactTransfer fan-out: + # one OwnersData source row can produce 0/1/2 Contact rows. + odf = df.drop(["OBJECTID", "GlobalID"], axis=1, errors="ignore") + ldf = read_csv("OwnerLink").drop(["OBJECTID", "GlobalID"], axis=1, errors="ignore") + locdf = read_csv("Location") + ldf = ldf.join(locdf.set_index("LocationId"), on="LocationId") + + owner_key_col = _select_ownerkey_col(odf, "OwnersData") + link_owner_key_col = _select_ownerkey_col(ldf, "OwnerLink") + + ownerkey_mapper = _load_json_mapping( + str(get_transfers_data_path("owners_ownerkey_mapper.json")) + ) + org_mapper = _load_json_mapping( + str(get_transfers_data_path("owners_organization_mapper.json")) + ) + + if ownerkey_mapper: + odf["ownerkey_canonical"] = odf[owner_key_col].replace(ownerkey_mapper) + ldf["ownerkey_canonical"] = ldf[link_owner_key_col].replace(ownerkey_mapper) + else: + odf["ownerkey_canonical"] = odf[owner_key_col] + ldf["ownerkey_canonical"] = ldf[link_owner_key_col] + + odf["ownerkey_norm"] = ( + odf["ownerkey_canonical"] + .fillna("") + .astype(str) + .str.strip() + .str.casefold() + .replace({"": pd.NA}) + ) + ldf["ownerkey_norm"] = ( + ldf["ownerkey_canonical"] + .fillna("") + .astype(str) + .str.strip() + .str.casefold() + .replace({"": pd.NA}) + ) + + ldf_join = ldf.set_index("ownerkey_norm") + overlap_cols = [col for col in ldf_join.columns if col in odf.columns] + if overlap_cols: + ldf_join = ldf_join.drop(columns=overlap_cols, errors="ignore") + odf = odf.join(ldf_join, on="ownerkey_norm") + + odf = replace_nans(odf) + odf = filter_to_valid_point_ids(odf) + + # Emulate ContactTransfer + _make_contact_and_assoc semantics: + # 1) dedupe by (OwnerKey, ContactType) + # 2) then dedupe by (name, organization) via in-memory "added" list + # 3) only successful CreateContact payloads count as agreed. + agreed_rows: list[dict[str, Any]] = [] + created_owner_type: set[tuple[str, str]] = set() + added_name_org: set[tuple[str | None, str | None]] = set() + + ordered = odf.sort_values(by=["PointID"], kind="stable") + + def _record_new_contact( + owner_key: Any, + contact_type: str, + name: str | None, + organization: str | None, + ) -> bool: + if name is None and organization is None: + return False + + owner_key_text = None if owner_key is None else str(owner_key) + owner_type_key = None + if owner_key_text: + owner_type_key = (owner_key_text, contact_type) + + if owner_type_key and owner_type_key in created_owner_type: + return False + + name_org_key = (name, organization) + if name_org_key in added_name_org: + return False + + if owner_type_key: + created_owner_type.add(owner_type_key) + added_name_org.add(name_org_key) + agreed_rows.append({"OwnerKey": owner_key}) + return True + + for row in ordered.itertuples(): + owner_key = getattr(row, owner_key_col, None) + organization = _get_organization(row, org_mapper) + + primary_name = _safe_make_name( + getattr(row, "FirstName", None), + getattr(row, "LastName", None), + owner_key, + organization, + ) + _record_new_contact(owner_key, "Primary", primary_name, organization) + + has_secondary_input = not all( + [ + getattr(row, "SecondFirstName", None) is None, + getattr(row, "SecondLastName", None) is None, + getattr(row, "SecondCtctEmail", None) is None, + getattr(row, "SecondCtctPhone", None) is None, + ] + ) + if has_secondary_input: + secondary_name = _make_name( + getattr(row, "SecondFirstName", None), + getattr(row, "SecondLastName", None), + ) + _record_new_contact(owner_key, "Secondary", secondary_name, organization) + + return pd.DataFrame(agreed_rows, columns=["OwnerKey"]) + + TRANSFER_COMPARISON_SPECS: list[TransferComparisonSpec] = [ TransferComparisonSpec( "WellData", @@ -116,6 +426,8 @@ def _f(df: pd.DataFrame) -> pd.DataFrame: "OwnerKey", Contact, "nma_pk_owners", + agreed_filter=_ownersdata_agreed_filter, + destination_where=lambda m: m.nma_pk_owners.is_not(None), option_field="transfer_contacts", ), TransferComparisonSpec( @@ -125,6 +437,7 @@ def _f(df: pd.DataFrame) -> pd.DataFrame: "GlobalID", Observation, "nma_pk_waterlevels", + agreed_filter=_waterlevels_filter, option_field="transfer_waterlevels", ), TransferComparisonSpec( @@ -197,6 +510,7 @@ def _f(df: pd.DataFrame) -> pd.DataFrame: "GlobalID", NMA_HydraulicsData, "nma_global_id", + agreed_filter=_hydraulics_filter, option_field="transfer_hydraulics_data", ), TransferComparisonSpec( @@ -206,6 +520,7 @@ def _f(df: pd.DataFrame) -> pd.DataFrame: "SamplePtID", NMA_Chemistry_SampleInfo, "nma_sample_pt_id", + agreed_filter=_chemistry_sampleinfo_filter, option_field="transfer_chemistry_sampleinfo", ), TransferComparisonSpec( @@ -215,6 +530,7 @@ def _f(df: pd.DataFrame) -> pd.DataFrame: "PointID", NMA_view_NGWMN_WellConstruction, "point_id", + agreed_filter=_ngwmn_wellconstruction_filter, option_field="transfer_ngwmn_views", ), TransferComparisonSpec( @@ -224,6 +540,7 @@ def _f(df: pd.DataFrame) -> pd.DataFrame: "PointID", NMA_view_NGWMN_WaterLevels, "point_id", + agreed_filter=_ngwmn_waterlevels_filter, option_field="transfer_ngwmn_views", ), TransferComparisonSpec( @@ -260,6 +577,7 @@ def _f(df: pd.DataFrame) -> pd.DataFrame: "GlobalID", NMA_Stratigraphy, "nma_global_id", + agreed_filter=_stratigraphy_filter, option_field="transfer_nma_stratigraphy", ), TransferComparisonSpec( @@ -269,6 +587,7 @@ def _f(df: pd.DataFrame) -> pd.DataFrame: "GlobalID", NMA_MajorChemistry, "nma_global_id", + agreed_filter=_chemistry_child_filter, option_field="transfer_major_chemistry", ), TransferComparisonSpec( @@ -278,6 +597,7 @@ def _f(df: pd.DataFrame) -> pd.DataFrame: "GlobalID", NMA_Radionuclides, "nma_global_id", + agreed_filter=_chemistry_child_filter, option_field="transfer_radionuclides", ), TransferComparisonSpec( @@ -287,6 +607,7 @@ def _f(df: pd.DataFrame) -> pd.DataFrame: "GlobalID", NMA_MinorTraceChemistry, "nma_global_id", + agreed_filter=_chemistry_child_filter, option_field="transfer_minor_trace_chemistry", ), TransferComparisonSpec( @@ -296,6 +617,7 @@ def _f(df: pd.DataFrame) -> pd.DataFrame: "GlobalID", NMA_FieldParameters, "nma_global_id", + agreed_filter=_chemistry_child_filter, option_field="transfer_field_parameters", ), TransferComparisonSpec( From 35287180aa4a02c5ebceedcfd8804d7a12a4f256 Mon Sep 17 00:00:00 2001 From: jirhiker <2035568+jirhiker@users.noreply.github.com> Date: Fri, 20 Feb 2026 03:31:20 +0000 Subject: [PATCH 07/14] Formatting changes --- tests/test_cli_commands.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_cli_commands.py b/tests/test_cli_commands.py index 8bdc2f9c..412ebea3 100644 --- a/tests/test_cli_commands.py +++ b/tests/test_cli_commands.py @@ -244,12 +244,10 @@ def test_water_levels_cli_persists_observations(tmp_path, water_well_thing): """ def _write_csv(path: Path, *, well_name: str, notes: str): - csv_text = textwrap.dedent( - f"""\ + csv_text = textwrap.dedent(f"""\ field_staff,well_name_point_id,field_event_date_time,measurement_date_time,sampler,sample_method,mp_height,level_status,depth_to_water_ft,data_quality,water_level_notes CLI Tester,{well_name},2025-02-15T08:00:00-07:00,2025-02-15T10:30:00-07:00,Groundwater Team,electric tape,1.5,stable,42.5,approved,{notes} - """ - ) + """) path.write_text(csv_text) unique_notes = f"pytest-{uuid.uuid4()}" From fd7e2430c8f51eed6dcdb9d71799f532bf656bd1 Mon Sep 17 00:00:00 2001 From: jakeross Date: Sun, 22 Feb 2026 14:24:18 -0700 Subject: [PATCH 08/14] feat: make various fields nullable and enhance data transfer handling --- ...3b_make_measuring_point_height_nullable.py | 36 +++ ...d3e4f_make_address_postal_code_nullable.py | 36 +++ ...e_deployment_installation_date_nullable.py | 36 +++ ...5e6f7a8_make_wellscreen_depths_nullable.py | 48 ++++ ...f7a8b9_make_address_city_state_nullable.py | 48 ++++ api/README.md | 18 ++ cli/README.md | 25 ++ cli/cli.py | 142 ++++++++++ core/lexicon.json | 1 + db/README.md | 22 ++ db/contact.py | 6 +- db/deployment.py | 2 +- db/measuring_point_history.py | 2 +- db/thing.py | 4 +- schemas/contact.py | 15 +- schemas/deployment.py | 2 +- schemas/sample.py | 4 +- schemas/thing.py | 53 ++-- tests/README.md | 31 +++ tests/features/environment.py | 15 +- tests/test_cli_commands.py | 6 +- tests/test_util.py | 26 +- .../unit/test_contact_transfer_email_utils.py | 19 ++ transfers/README.md | 27 ++ transfers/contact_transfer.py | 258 ++++++++++++++---- transfers/geologic_formation_transfer.py | 105 +++---- transfers/link_ids_transfer.py | 189 ++++++++----- transfers/logger.py | 23 +- transfers/relaxed_constraints.md | 10 + transfers/sensor_transfer.py | 20 +- transfers/thing_transfer.py | 180 ++++++++++-- transfers/transfer_results_builder.py | 161 ++++++++++- transfers/transfer_results_specs.py | 91 +++++- transfers/transfer_results_types.py | 2 + transfers/transferer.py | 10 - transfers/util.py | 23 ++ transfers/waterlevels_transfer.py | 46 +--- transfers/well_transfer.py | 62 +++-- 38 files changed, 1449 insertions(+), 355 deletions(-) create mode 100644 alembic/versions/8c9d0e1f2a3b_make_measuring_point_height_nullable.py create mode 100644 alembic/versions/9a0b1c2d3e4f_make_address_postal_code_nullable.py create mode 100644 alembic/versions/a1b2c3d4e5f7_make_deployment_installation_date_nullable.py create mode 100644 alembic/versions/b3c4d5e6f7a8_make_wellscreen_depths_nullable.py create mode 100644 alembic/versions/c4d5e6f7a8b9_make_address_city_state_nullable.py create mode 100644 api/README.md create mode 100644 cli/README.md create mode 100644 db/README.md create mode 100644 tests/README.md create mode 100644 tests/unit/test_contact_transfer_email_utils.py create mode 100644 transfers/README.md create mode 100644 transfers/relaxed_constraints.md diff --git a/alembic/versions/8c9d0e1f2a3b_make_measuring_point_height_nullable.py b/alembic/versions/8c9d0e1f2a3b_make_measuring_point_height_nullable.py new file mode 100644 index 00000000..58a3050c --- /dev/null +++ b/alembic/versions/8c9d0e1f2a3b_make_measuring_point_height_nullable.py @@ -0,0 +1,36 @@ +"""make measuring_point_history.measuring_point_height nullable + +Revision ID: 8c9d0e1f2a3b +Revises: 5336a52336df +Create Date: 2026-02-21 12:00:00.000000 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "8c9d0e1f2a3b" +down_revision: Union[str, Sequence[str], None] = "5336a52336df" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.alter_column( + "measuring_point_history", + "measuring_point_height", + existing_type=sa.Numeric(), + nullable=True, + ) + + +def downgrade() -> None: + op.alter_column( + "measuring_point_history", + "measuring_point_height", + existing_type=sa.Numeric(), + nullable=False, + ) diff --git a/alembic/versions/9a0b1c2d3e4f_make_address_postal_code_nullable.py b/alembic/versions/9a0b1c2d3e4f_make_address_postal_code_nullable.py new file mode 100644 index 00000000..05138add --- /dev/null +++ b/alembic/versions/9a0b1c2d3e4f_make_address_postal_code_nullable.py @@ -0,0 +1,36 @@ +"""make address.postal_code nullable + +Revision ID: 9a0b1c2d3e4f +Revises: 8c9d0e1f2a3b +Create Date: 2026-02-21 13:00:00.000000 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "9a0b1c2d3e4f" +down_revision: Union[str, Sequence[str], None] = "8c9d0e1f2a3b" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.alter_column( + "address", + "postal_code", + existing_type=sa.String(length=20), + nullable=True, + ) + + +def downgrade() -> None: + op.alter_column( + "address", + "postal_code", + existing_type=sa.String(length=20), + nullable=False, + ) diff --git a/alembic/versions/a1b2c3d4e5f7_make_deployment_installation_date_nullable.py b/alembic/versions/a1b2c3d4e5f7_make_deployment_installation_date_nullable.py new file mode 100644 index 00000000..59f899a6 --- /dev/null +++ b/alembic/versions/a1b2c3d4e5f7_make_deployment_installation_date_nullable.py @@ -0,0 +1,36 @@ +"""make deployment installation_date nullable + +Revision ID: a1b2c3d4e5f7 +Revises: 9a0b1c2d3e4f +Create Date: 2026-02-21 14:32:00.000000 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "a1b2c3d4e5f7" +down_revision: Union[str, Sequence[str], None] = "9a0b1c2d3e4f" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.alter_column( + "deployment", + "installation_date", + existing_type=sa.Date(), + nullable=True, + ) + + +def downgrade() -> None: + op.alter_column( + "deployment", + "installation_date", + existing_type=sa.Date(), + nullable=False, + ) diff --git a/alembic/versions/b3c4d5e6f7a8_make_wellscreen_depths_nullable.py b/alembic/versions/b3c4d5e6f7a8_make_wellscreen_depths_nullable.py new file mode 100644 index 00000000..7e1bca3a --- /dev/null +++ b/alembic/versions/b3c4d5e6f7a8_make_wellscreen_depths_nullable.py @@ -0,0 +1,48 @@ +"""make wellscreen depth fields nullable + +Revision ID: b3c4d5e6f7a8 +Revises: a1b2c3d4e5f7 +Create Date: 2026-02-21 15:20:00.000000 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "b3c4d5e6f7a8" +down_revision: Union[str, Sequence[str], None] = "a1b2c3d4e5f7" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.alter_column( + "well_screen", + "screen_depth_top", + existing_type=sa.Float(), + nullable=True, + ) + op.alter_column( + "well_screen", + "screen_depth_bottom", + existing_type=sa.Float(), + nullable=True, + ) + + +def downgrade() -> None: + op.alter_column( + "well_screen", + "screen_depth_bottom", + existing_type=sa.Float(), + nullable=False, + ) + op.alter_column( + "well_screen", + "screen_depth_top", + existing_type=sa.Float(), + nullable=False, + ) diff --git a/alembic/versions/c4d5e6f7a8b9_make_address_city_state_nullable.py b/alembic/versions/c4d5e6f7a8b9_make_address_city_state_nullable.py new file mode 100644 index 00000000..fb55e860 --- /dev/null +++ b/alembic/versions/c4d5e6f7a8b9_make_address_city_state_nullable.py @@ -0,0 +1,48 @@ +"""make address.city and address.state nullable + +Revision ID: c4d5e6f7a8b9 +Revises: b3c4d5e6f7a8 +Create Date: 2026-02-21 16:30:00.000000 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "c4d5e6f7a8b9" +down_revision: Union[str, Sequence[str], None] = "b3c4d5e6f7a8" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.alter_column( + "address", + "city", + existing_type=sa.String(length=100), + nullable=True, + ) + op.alter_column( + "address", + "state", + existing_type=sa.String(length=50), + nullable=True, + ) + + +def downgrade() -> None: + op.alter_column( + "address", + "city", + existing_type=sa.String(length=100), + nullable=False, + ) + op.alter_column( + "address", + "state", + existing_type=sa.String(length=50), + nullable=False, + ) diff --git a/api/README.md b/api/README.md new file mode 100644 index 00000000..fd6767de --- /dev/null +++ b/api/README.md @@ -0,0 +1,18 @@ +# API + +This directory contains FastAPI route modules grouped by resource/domain. + +## Structure + +- One module per domain (for example `thing.py`, `contact.py`, `observation.py`) +- `api/ogc/` contains OGC-specific endpoints + +## Guidelines + +- Keep endpoints focused on transport concerns (request/response, status codes). +- Put transfer/business logic in service or transfer modules. +- Ensure response schemas match `schemas/` definitions. + +## Running locally + +Use project entrypoint from repo root (see top-level README for full setup). diff --git a/cli/README.md b/cli/README.md new file mode 100644 index 00000000..42d557c8 --- /dev/null +++ b/cli/README.md @@ -0,0 +1,25 @@ +# CLI + +This directory contains Typer-based command entrypoints for operational and migration workflows. + +## Main entrypoint + +- `cli/cli.py` + +Run commands from repo root: + +```bash +source .venv/bin/activate +python -m cli.cli --help +``` + +## Common commands + +- `python -m cli.cli transfer-results` +- `python -m cli.cli compare-duplicated-welldata` +- `python -m cli.cli alembic-upgrade-and-data` + +## Notes + +- CLI logging is written to `cli/logs/`. +- Keep CLI commands thin; move heavy logic into service/transfer modules. diff --git a/cli/cli.py b/cli/cli.py index c84c862a..cb29338e 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -20,10 +20,12 @@ from pathlib import Path from textwrap import shorten, wrap +import pandas as pd import typer from dotenv import load_dotenv load_dotenv() +os.environ.setdefault("OCO_LOG_CONTEXT", "cli") cli = typer.Typer(help="Command line interface for managing the application.") water_levels = typer.Typer(help="Water-level utilities") @@ -136,6 +138,146 @@ def transfer_results( typer.echo(f"Transfer comparisons: {len(results.results)}") +@cli.command("compare-duplicated-welldata") +def compare_duplicated_welldata( + pointid: list[str] = typer.Option( + None, + "--pointid", + help="Optional PointID filter. Repeat --pointid for multiple values.", + ), + apply_transfer_filters: bool = typer.Option( + True, + "--apply-transfer-filters/--no-apply-transfer-filters", + help=( + "Apply WellTransferer-like pre-filters (GW + coordinates + transferable), " + "excluding DB-dependent non-transferred filtering." + ), + ), + summary_path: Path = typer.Option( + Path("transfers") / "metrics" / "welldata_duplicate_comparison_summary.csv", + "--summary-path", + help="Output CSV path for duplicate PointID summary.", + ), + detail_path: Path = typer.Option( + Path("transfers") / "metrics" / "welldata_duplicate_comparison_detail.csv", + "--detail-path", + help="Output CSV path for row x differing-column detail values.", + ), + theme: ThemeMode = typer.Option( + ThemeMode.auto, "--theme", help="Color theme: auto, light, dark." + ), +): + from transfers.util import get_transferable_wells, read_csv, replace_nans + + df = read_csv("WellData", dtype={"OSEWelltagID": str}) + + if apply_transfer_filters: + if "LocationId" in df.columns: + ldf = read_csv("Location") + ldf = ldf.drop(["PointID", "SSMA_TimeStamp"], axis=1, errors="ignore") + df = df.join(ldf.set_index("LocationId"), on="LocationId") + + if "SiteType" in df.columns: + df = df[df["SiteType"] == "GW"] + + if "Easting" in df.columns and "Northing" in df.columns: + df = df[df["Easting"].notna() & df["Northing"].notna()] + + df = replace_nans(df) + df = get_transferable_wells(df) + else: + df = replace_nans(df) + + if pointid: + requested = {pid.strip() for pid in pointid if pid and pid.strip()} + df = df[df["PointID"].isin(requested)] + + if "PointID" not in df.columns: + typer.echo("WellData has no PointID column after filtering.") + raise typer.Exit(code=1) + + dup_mask = df["PointID"].duplicated(keep=False) + dup_df = df.loc[dup_mask].copy() + + summary_rows: list[dict] = [] + detail_rows: list[dict] = [] + + if not dup_df.empty: + for pid, group in dup_df.groupby("PointID", sort=True): + diff_cols: list[str] = [] + for col in group.columns: + series = group[col] + non_null = series[~series.isna()] + if non_null.empty: + continue + if len({str(v) for v in non_null}) > 1: + diff_cols.append(col) + + summary_rows.append( + { + "pointid": pid, + "duplicate_row_count": int(len(group)), + "differing_column_count": int(len(diff_cols)), + "differing_columns": "|".join(diff_cols), + } + ) + + normalized = group.reset_index(drop=False).rename( + columns={"index": "source_row_index"} + ) + for row_num, row in normalized.iterrows(): + for col in diff_cols: + value = row.get(col, None) + detail_rows.append( + { + "pointid": pid, + "row_number": int(row_num), + "source_row_index": int(row["source_row_index"]), + "column": col, + "value": value, + } + ) + + summary_df = pd.DataFrame(summary_rows) + if not summary_df.empty: + summary_df = summary_df.sort_values( + by=["duplicate_row_count", "pointid"], ascending=[False, True] + ) + + detail_df = pd.DataFrame(detail_rows) + if not detail_df.empty: + detail_df = detail_df.sort_values( + by=["pointid", "row_number", "column"], ascending=[True, True, True] + ) + + summary_path.parent.mkdir(parents=True, exist_ok=True) + detail_path.parent.mkdir(parents=True, exist_ok=True) + summary_df.to_csv(summary_path, index=False) + detail_df.to_csv(detail_path, index=False) + + if summary_df.empty: + typer.echo("No duplicated WellData PointIDs found for current filters.") + typer.echo(f"Wrote empty summary: {summary_path}") + typer.echo(f"Wrote empty detail: {detail_path}") + return + + total_dup_rows = int(len(dup_df)) + total_dup_pointids = int(summary_df["pointid"].nunique()) + typer.echo( + f"Found {total_dup_pointids} duplicated PointIDs across {total_dup_rows} rows." + ) + typer.echo(f"Wrote summary: {summary_path}") + typer.echo(f"Wrote detail: {detail_path}") + + preview = summary_df.head(20) + typer.echo("\nTop duplicate PointIDs:") + for row in preview.itertuples(index=False): + typer.echo( + f"- {row.pointid}: rows={row.duplicate_row_count}, " + f"differing_columns={row.differing_column_count}" + ) + + @cli.command("well-inventory-csv") def well_inventory_csv( file_path: str = typer.Argument( diff --git a/core/lexicon.json b/core/lexicon.json index 9da523f9..07b32c30 100644 --- a/core/lexicon.json +++ b/core/lexicon.json @@ -421,6 +421,7 @@ "elevation_method", "sample_method", "coordinate_method", + "well_construction_method", "well_purpose", "status", "organization", diff --git a/db/README.md b/db/README.md new file mode 100644 index 00000000..02556c22 --- /dev/null +++ b/db/README.md @@ -0,0 +1,22 @@ +# DB + +This directory contains SQLAlchemy models, engine/session setup, and database initialization helpers. + +## Key files + +- `db/base.py`: shared ORM base mixins and common fields +- `db/engine.py`: engine/session configuration +- `db/initialization.py`: schema/bootstrap utilities + +## Schema changes + +- Use Alembic migrations under `alembic/versions/` for all DDL changes. +- Keep model nullability/defaults aligned with migrations. +- Prefer idempotent data migrations and safe re-runs. + +## Local usage + +```bash +source .venv/bin/activate +alembic upgrade head +``` diff --git a/db/contact.py b/db/contact.py index fa3146df..0fb59473 100644 --- a/db/contact.py +++ b/db/contact.py @@ -188,9 +188,9 @@ class Address(Base, AutoBaseMixin, ReleaseMixin): ) address_line_1: Mapped[str] = mapped_column(String(255), nullable=False) address_line_2: Mapped[str | None] = mapped_column(String(255), nullable=True) - city: Mapped[str] = mapped_column(String(100), nullable=False) - state: Mapped[str] = mapped_column(String(50), nullable=False) - postal_code: Mapped[str] = mapped_column(String(20), nullable=False) + city: Mapped[str | None] = mapped_column(String(100), nullable=True) + state: Mapped[str | None] = mapped_column(String(50), nullable=True) + postal_code: Mapped[str] = mapped_column(String(20), nullable=True) country: Mapped[str] = mapped_column( String(50), default="United States", nullable=False ) diff --git a/db/deployment.py b/db/deployment.py index 6f07830a..60377c4d 100644 --- a/db/deployment.py +++ b/db/deployment.py @@ -33,7 +33,7 @@ class Deployment(Base, AutoBaseMixin, ReleaseMixin): ) # --- Columns --- - installation_date: Mapped[Date] = mapped_column(Date, nullable=False) + installation_date: Mapped[Date | None] = mapped_column(Date, nullable=True) removal_date: Mapped[Date] = mapped_column(Date, nullable=True) recording_interval: Mapped[int] = mapped_column(Integer, nullable=True) recording_interval_units: Mapped[str] = lexicon_term(nullable=True) diff --git a/db/measuring_point_history.py b/db/measuring_point_history.py index 7d23518a..16857a23 100644 --- a/db/measuring_point_history.py +++ b/db/measuring_point_history.py @@ -37,7 +37,7 @@ class MeasuringPointHistory(Base, AutoBaseMixin, ReleaseMixin): # --- Columns --- measuring_point_height: Mapped[float] = mapped_column( Numeric, - nullable=False, + nullable=True, comment="The official, surveyed height of the measuring point relative to ground surface (in feet).", ) measuring_point_description: Mapped[str] = mapped_column( diff --git a/db/thing.py b/db/thing.py index a0f3db3b..f5fbff5b 100644 --- a/db/thing.py +++ b/db/thing.py @@ -594,10 +594,10 @@ class WellScreen(Base, AutoBaseMixin, ReleaseMixin): geologic_formation_id: Mapped[int] = mapped_column( ForeignKey("geologic_formation.id", ondelete="SET NULL"), nullable=True ) - screen_depth_top: Mapped[float] = mapped_column( + screen_depth_top: Mapped[float | None] = mapped_column( info={"unit": "feet below ground surface"}, nullable=True ) - screen_depth_bottom: Mapped[float] = mapped_column( + screen_depth_bottom: Mapped[float | None] = mapped_column( info={"unit": "feet below ground surface"}, nullable=True ) screen_type: Mapped[str] = lexicon_term(nullable=True) # e.g., "PVC", "Steel", etc. diff --git a/schemas/contact.py b/schemas/contact.py index a9302daa..248ff173 100644 --- a/schemas/contact.py +++ b/schemas/contact.py @@ -24,6 +24,7 @@ from schemas import BaseResponseModel, BaseCreateModel, BaseUpdateModel from schemas.notes import CreateNote, NoteResponse + # -------- VALIDATORS ---------- @@ -123,10 +124,12 @@ class CreateAddress(BaseCreateModel): # todo: use a postal API to validate address and suggest corrections address_line_1: str # Required (e.g., "123 Main St") address_line_2: str | None = None # Optional (e.g., "Apt 4B", "Suite 200") - city: str + city: str | None = None # todo: add validation. Should state be required? what about foreign addresses? - state: str = "NM" # Default to New Mexico - postal_code: str + state: str | None = "NM" # Default to New Mexico + + # todo: make postal code required? + postal_code: str | None = None country: str = "United States" # Default to United States address_type: AddressType = "Primary" @@ -193,9 +196,9 @@ class AddressResponse(BaseItemResponse): address_line_1: str address_line_2: str | None = None - city: str - state: str - postal_code: str + city: str | None = None + state: str | None = None + postal_code: str | None = None country: str address_type: AddressType diff --git a/schemas/deployment.py b/schemas/deployment.py index 5bd05014..2e7df9f8 100644 --- a/schemas/deployment.py +++ b/schemas/deployment.py @@ -7,7 +7,7 @@ class DeploymentResponse(BaseResponseModel): thing_id: int sensor: SensorResponse - installation_date: date + installation_date: date | None removal_date: date | None recording_interval: int | None recording_interval_units: str | None diff --git a/schemas/sample.py b/schemas/sample.py index 4d821e57..8dce646b 100644 --- a/schemas/sample.py +++ b/schemas/sample.py @@ -91,7 +91,7 @@ def convert_sample_date_to_utc(sample_date: AwareDatetime) -> AwareDatetime: # -------- CREATE ---------- class CreateSample(BaseCreateModel, ValidateSample): field_activity_id: int - field_event_participant_id: int + field_event_participant_id: int | None = None sample_date: Annotated[AwareDatetime, PastDatetime()] sample_name: str sample_matrix: SampleMatrix @@ -130,7 +130,7 @@ class SampleResponse(BaseResponseModel): thing: ThingResponse field_event: FieldEventResponse field_activity: FieldActivityResponse - contact: ContactResponse + contact: ContactResponse | None sample_date: UTCAwareDatetime sample_name: str sample_matrix: SampleMatrix diff --git a/schemas/thing.py b/schemas/thing.py index 60dfce42..a6080923 100644 --- a/schemas/thing.py +++ b/schemas/thing.py @@ -35,6 +35,7 @@ from schemas.notes import NoteResponse, CreateNote from schemas.permission_history import PermissionHistoryResponse + # -------- VALIDATE ---------- @@ -47,6 +48,9 @@ class ValidateWell(BaseModel): @model_validator(mode="after") def validate_values(self): + # todo: reenable depth validation. removed for transfer + return self + if self.hole_depth is not None: if self.well_depth is not None and self.well_depth > self.hole_depth: raise ValueError( @@ -66,25 +70,6 @@ def validate_values(self): elif self.hole_depth is not None and self.well_pump_depth > self.hole_depth: raise ValueError("well pump depth must be less than hole depth") - # if self.measuring_point_height is not None: - # if ( - # self.hole_depth is not None - # and self.measuring_point_height >= self.hole_depth - # ): - # raise ValueError("measuring point height must be less than hole depth") - # elif ( - # self.well_casing_depth is not None - # and self.measuring_point_height >= self.well_casing_depth - # ): - # raise ValueError( - # "measuring point height must be less than well casing depth" - # ) - # elif ( - # self.well_depth is not None - # and self.measuring_point_height >= self.well_depth - # ): - # raise ValueError("measuring point height must be less than well depth") - return self @@ -145,7 +130,9 @@ class CreateWell(CreateBaseThing, ValidateWell): default=None, gt=0, description="Well casing depth in feet" ) well_casing_materials: list[CasingMaterial] | None = None - measuring_point_height: float = Field(description="Measuring point height in feet") + measuring_point_height: float | None = Field( + default=None, description="Measuring point height in feet" + ) measuring_point_description: str | None = None well_completion_date: PastOrTodayDate | None = None well_completion_date_source: str | None = None @@ -177,18 +164,26 @@ class CreateWellScreen(BaseCreateModel): thing_id: int aquifer_system_id: int | None = None geologic_formation_id: int | None = None - screen_depth_bottom: float = Field(gt=0, description="Screen depth bottom in feet") - screen_depth_top: float = Field(gt=0, description="Screen depth top in feet") + screen_depth_bottom: float | None = Field( + default=None, ge=0, description="Screen depth bottom in feet" + ) + screen_depth_top: float | None = Field( + default=None, ge=0, description="Screen depth top in feet" + ) screen_type: ScreenType | None = None screen_description: str | None = None # validate that screen depth bottom is greater than top @model_validator(mode="after") def check_depths(self): - if self.screen_depth_bottom < self.screen_depth_top: - raise ValueError( - "screen_depth_bottom must be greater than screen_depth_top" - ) + # todo: reenable depth validation. removed for transfer + return self + + if self.screen_depth_bottom or self.screen_depth_top: + if self.screen_depth_bottom < self.screen_depth_top: + raise ValueError( + "screen_depth_bottom must be greater than screen_depth_top" + ) return self @@ -260,7 +255,7 @@ class WellResponse(BaseThingResponse): well_status: str | None open_status: str | None datalogger_suitability_status: str | None - measuring_point_height: float + measuring_point_height: float | None measuring_point_height_unit: str = "ft" measuring_point_description: str | None aquifers: list[dict] = [] @@ -352,9 +347,9 @@ class WellScreenResponse(BaseResponseModel): aquifer_type: str | None = None geologic_formation_id: int | None = None geologic_formation: str | None = None - screen_depth_bottom: float + screen_depth_bottom: float | None = None screen_depth_bottom_unit: str = "ft" - screen_depth_top: float + screen_depth_top: float | None = None screen_depth_top_unit: str = "ft" screen_type: str | None = None screen_description: str | None = None diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 00000000..2593c593 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,31 @@ +# Tests + +This directory contains automated tests (unit, integration, transfer, and API behavior). + +## Layout + +- `tests/unit/`: focused unit tests +- `tests/integration/`: cross-component tests +- `tests/transfers/`: transfer-focused tests +- `tests/features/`: BDD-style feature tests + +## Running tests + +From repo root: + +```bash +source .venv/bin/activate +set -a; source .env; set +a +pytest -q +``` + +Run a subset: + +```bash +pytest -q tests/transfers +``` + +## Notes + +- Many tests depend on database settings from `.env`. +- Keep tests deterministic and idempotent where possible. diff --git a/tests/features/environment.py b/tests/features/environment.py index 266df26f..4f3a6d2b 100644 --- a/tests/features/environment.py +++ b/tests/features/environment.py @@ -19,6 +19,8 @@ from alembic import command from alembic.config import Config +from sqlalchemy import select + from core.initializers import init_lexicon, init_parameter from db import ( Location, @@ -51,7 +53,7 @@ ) from db.engine import session_ctx from db.initialization import recreate_public_schema, sync_search_vector_triggers -from sqlalchemy import select +from services.util import get_bool_env def add_context_object_container(name): @@ -521,6 +523,10 @@ def _initialize_test_schema() -> None: def before_all(context): context.objects = {} + + if not get_bool_env("DROP_AND_REBUILD_DB"): + return + _initialize_test_schema() with session_ctx() as session: @@ -711,6 +717,9 @@ def before_all(context): def after_all(context): + if not get_bool_env("DROP_AND_REBUILD_DB"): + return + with session_ctx() as session: for table in reversed(Base.metadata.sorted_tables): if table.name in ("alembic_version", "parameter"): @@ -731,6 +740,10 @@ def before_scenario(context, scenario): def after_scenario(context, scenario): + + if not get_bool_env("DROP_AND_REBUILD_DB"): + return + # runs after EVERY scenario # e.g. clean up temp files, close db sessions if scenario.name.startswith( diff --git a/tests/test_cli_commands.py b/tests/test_cli_commands.py index 412ebea3..8bdc2f9c 100644 --- a/tests/test_cli_commands.py +++ b/tests/test_cli_commands.py @@ -244,10 +244,12 @@ def test_water_levels_cli_persists_observations(tmp_path, water_well_thing): """ def _write_csv(path: Path, *, well_name: str, notes: str): - csv_text = textwrap.dedent(f"""\ + csv_text = textwrap.dedent( + f"""\ field_staff,well_name_point_id,field_event_date_time,measurement_date_time,sampler,sample_method,mp_height,level_status,depth_to_water_ft,data_quality,water_level_notes CLI Tester,{well_name},2025-02-15T08:00:00-07:00,2025-02-15T10:30:00-07:00,Groundwater Team,electric tape,1.5,stable,42.5,approved,{notes} - """) + """ + ) path.write_text(csv_text) unique_notes = f"pytest-{uuid.uuid4()}" diff --git a/tests/test_util.py b/tests/test_util.py index dea033ee..8a637b6d 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -54,6 +54,30 @@ def test_measuring_point_estimator_handles_missing_point(monkeypatch): assert mph_descs == [] +def test_measuring_point_estimator_rounds_estimated_height_to_two_sig_figs(monkeypatch): + monkeypatch.setattr( + "transfers.util.read_csv", lambda name: _mock_waterlevels_df().copy() + ) + estimator = MeasuringPointEstimator() + row = SimpleNamespace(PointID="A", MPHeight=None, MeasuringPoint=None) + + mphs, _, _, _ = estimator.estimate_measuring_point_height(row) + + assert mphs[0] == 1.2 + + +def test_measuring_point_estimator_keeps_explicit_height_unrounded(monkeypatch): + monkeypatch.setattr( + "transfers.util.read_csv", lambda name: _mock_waterlevels_df().copy() + ) + estimator = MeasuringPointEstimator() + row = SimpleNamespace(PointID="A", MPHeight=1.234, MeasuringPoint="top of casing") + + mphs, _, _, _ = estimator.estimate_measuring_point_height(row) + + assert mphs == [1.234] + + def _mock_waterlevels_df(): return pd.DataFrame( { @@ -63,7 +87,7 @@ def _mock_waterlevels_df(): "2024-01-01", "2023-12-01", ], - "DepthToWater": [10.0, 11.0, 5.0], + "DepthToWater": [10.0, 11.234, 5.0], "DepthToWaterBGS": [9.0, 10.0, 4.5], } ) diff --git a/tests/unit/test_contact_transfer_email_utils.py b/tests/unit/test_contact_transfer_email_utils.py new file mode 100644 index 00000000..65ab9d03 --- /dev/null +++ b/tests/unit/test_contact_transfer_email_utils.py @@ -0,0 +1,19 @@ +from transfers.contact_transfer import _looks_like_phone_in_email_field, _make_email + + +def test_make_email_strips_email_prefix_and_trailing_punctuation(): + email = _make_email( + "first", + "owner", + email="Email: dlglnd@verizon.net.", + email_type="Primary", + release_status="private", + ) + assert email is not None + assert email.email == "dlglnd@verizon.net" + + +def test_phone_like_email_field_detection(): + assert _looks_like_phone_in_email_field("(505)-470-5877") is True + assert _looks_like_phone_in_email_field("(505) 259-1757") is True + assert _looks_like_phone_in_email_field("francisco_rael@hotmail.com") is False diff --git a/transfers/README.md b/transfers/README.md new file mode 100644 index 00000000..48a5743a --- /dev/null +++ b/transfers/README.md @@ -0,0 +1,27 @@ +# Transfers + +This directory contains legacy-to-target ETL transfer logic. + +## Main orchestration + +- `transfers/transfer.py` + +## Important supporting modules + +- `transfers/transferer.py`: base transfer patterns +- `transfers/util.py`: shared parsing/mapping helpers +- `transfers/logger.py`: transfer logging +- `transfers/metrics.py`: metrics capture + +## Performance rules + +For high-volume tables, prefer Core batch inserts: + +- `session.execute(insert(Model), rows)` + +Avoid ORM-heavy per-row object construction for bulk workloads. + +## Outputs + +- Logs: `transfers/logs/` +- Metrics: `transfers/metrics/` diff --git a/transfers/contact_transfer.py b/transfers/contact_transfer.py index dc649fc0..1e99d88b 100644 --- a/transfers/contact_transfer.py +++ b/transfers/contact_transfer.py @@ -14,6 +14,7 @@ # limitations under the License. # =============================================================================== import json +import re import pandas as pd from pandas import DataFrame @@ -93,7 +94,26 @@ def __init__(self, *args, **kw): ) self._ownerkey_mapper = {} - self._added = [] + self._added: set[tuple[str | None, str | None]] = set() + self._contact_by_owner_type: dict[tuple[str, str], Contact] = {} + self._contact_by_name_org: dict[tuple[str | None, str | None], Contact] = {} + self._commit_step = 500 + + def _build_contact_caches(self, session: Session) -> None: + contacts = session.query(Contact).all() + owner_type: dict[tuple[str, str], Contact] = {} + name_org: dict[tuple[str | None, str | None], Contact] = {} + for contact in contacts: + if contact.nma_pk_owners and contact.contact_type: + owner_type[(contact.nma_pk_owners, contact.contact_type)] = contact + name_org[(contact.name, contact.organization)] = contact + self._contact_by_owner_type = owner_type + self._contact_by_name_org = name_org + logger.info( + "Built contact caches: owner_type=%s name_org=%s", + len(self._contact_by_owner_type), + len(self._contact_by_name_org), + ) def calculate_missing_organizations(self): input_df, cleaned_df = self._get_dfs() @@ -184,6 +204,47 @@ def _get_dfs(self): def _get_prepped_group(self, group) -> DataFrame: return group.sort_values(by=["PointID"]) + def _transfer_hook(self, session: Session): + self._build_contact_caches(session) + + groups = self._get_group() + pointids = [ + idx[0] if isinstance(idx, tuple) else idx for idx in groups.groups.keys() + ] + things = session.query(Thing).filter(Thing.name.in_(pointids)).all() + thing_by_name = {thing.name: thing for thing in things} + logger.info( + "Prepared ContactTransfer caches: %s grouped PointIDs, %s matching Things", + len(pointids), + len(thing_by_name), + ) + + processed_groups = 0 + for index, group in groups: + pointid = index[0] if isinstance(index, tuple) else index + db_item = thing_by_name.get(pointid) + if db_item is None: + logger.warning(f"Thing with PointID {pointid} not found in database.") + continue + + prepped_group = self._get_prepped_group(group) + for row in prepped_group.itertuples(): + try: + self._group_step(session, row, db_item) + except Exception as e: + logger.critical( + f"Could not add contact(s) for PointID {pointid}: {e}" + ) + self._capture_error(pointid, str(e), "UnknownField") + + processed_groups += 1 + if processed_groups % self._commit_step == 0: + session.commit() + logger.info( + "Committed ContactTransfer progress: %s groups processed", + processed_groups, + ) + def _group_step(self, session: Session, row: pd.Series, db_item: Base): organization = _get_organization(row, self._co_to_org_mapper) for adder, tag in (_add_first_contact, "first"), ( @@ -197,6 +258,8 @@ def _group_step(self, session: Session, row: pd.Series, db_item: Base): db_item, organization, self._added, + self._contact_by_owner_type, + self._contact_by_name_org, ) if contact is not None: session.flush([contact]) @@ -209,7 +272,6 @@ def _group_step(self, session: Session, row: pd.Series, db_item: Base): ): note = contact.add_note(row.OwnerComment, "OwnerComment") session.add(note) - session.commit() logger.info(f"added {tag} contact for PointID {row.PointID}") except ValidationError as e: logger.critical( @@ -225,14 +287,26 @@ def _group_step(self, session: Session, row: pd.Series, db_item: Base): def _add_first_contact( - session: Session, row: pd.Series, thing: Thing, organization: str, added: list + session: Session, + row: pd.Series, + thing: Thing, + organization: str, + added: set[tuple[str | None, str | None]], + contact_by_owner_type: dict[tuple[str, str], Contact], + contact_by_name_org: dict[tuple[str | None, str | None], Contact], ) -> Contact | None: # TODO: extract role from OwnerComment # role = extract_owner_role(row.OwnerComment) role = "Owner" release_status = "private" - name = _safe_make_name(row.FirstName, row.LastName, row.OwnerKey, organization) + name = _safe_make_name( + row.FirstName, + row.LastName, + row.OwnerKey, + organization, + fallback_suffix="primary", + ) contact_data = { "thing_id": thing.id, @@ -247,23 +321,47 @@ def _add_first_contact( "phones": [], } - contact, new = _make_contact_and_assoc(session, contact_data, thing, added) + contact, new = _make_contact_and_assoc( + session, + contact_data, + thing, + added, + contact_by_owner_type, + contact_by_name_org, + ) if not new: return None - else: - added.append((name, organization)) if row.Email: - email = _make_email( - "first", - row.OwnerKey, - email=row.Email.strip(), - email_type="Primary", - release_status=release_status, - ) - if email: - contact.emails.append(email) + raw_email = str(row.Email).strip() + if _looks_like_phone_in_email_field(raw_email): + logger.warning( + "first '%s' Email field looked like a phone number; storing as phone instead.", + row.OwnerKey, + ) + phone, complete = _make_phone( + "first", + row.OwnerKey, + phone_number=raw_email, + phone_type="Primary", + release_status=release_status, + ) + if phone: + if complete: + contact.phones.append(phone) + else: + contact.incomplete_nma_phones.append(phone) + else: + email = _make_email( + "first", + row.OwnerKey, + email=raw_email, + email_type="Primary", + release_status=release_status, + ) + if email: + contact.emails.append(email) if row.Phone: phone, complete = _make_phone( @@ -327,20 +425,33 @@ def _add_first_contact( def _safe_make_name( - first: str | None, last: str | None, ownerkey: str, organization: str | None + first: str | None, + last: str | None, + ownerkey: str, + organization: str | None, + fallback_suffix: str | None = None, ) -> str | None: name = _make_name(first, last) if name is None and organization is None: + fallback = str(ownerkey) if ownerkey is not None else None + if fallback and fallback_suffix: + fallback = f"{fallback}-{fallback_suffix}" logger.warning( f"Missing both first and last name and organization for OwnerKey {ownerkey}; " - f"using OwnerKey as fallback name." + f"using OwnerKey fallback name '{fallback}'." ) - return ownerkey + return fallback return name def _add_second_contact( - session: Session, row: pd.Series, thing: Thing, organization: str, added: list + session: Session, + row: pd.Series, + thing: Thing, + organization: str, + added: set[tuple[str | None, str | None]], + contact_by_owner_type: dict[tuple[str, str], Contact], + contact_by_name_org: dict[tuple[str | None, str | None], Contact], ) -> None: if all( [ @@ -352,7 +463,13 @@ def _add_second_contact( return release_status = "private" - name = _make_name(row.SecondFirstName, row.SecondLastName) + name = _safe_make_name( + row.SecondFirstName, + row.SecondLastName, + row.OwnerKey, + organization, + fallback_suffix="secondary", + ) contact_data = { "thing_id": thing.id, @@ -367,22 +484,46 @@ def _add_second_contact( "phones": [], } - contact, new = _make_contact_and_assoc(session, contact_data, thing, added) + contact, new = _make_contact_and_assoc( + session, + contact_data, + thing, + added, + contact_by_owner_type, + contact_by_name_org, + ) if not new: return - else: - added.append((name, organization)) if row.SecondCtctEmail: - email = _make_email( - "second", - row.OwnerKey, - email=row.SecondCtctEmail, - email_type="Primary", - release_status=release_status, - ) - if email: - contact.emails.append(email) + raw_email = str(row.SecondCtctEmail).strip() + if _looks_like_phone_in_email_field(raw_email): + logger.warning( + "second '%s' Email field looked like a phone number; storing as phone instead.", + row.OwnerKey, + ) + phone, complete = _make_phone( + "second", + row.OwnerKey, + phone_number=raw_email, + phone_type="Primary", + release_status=release_status, + ) + if phone: + if complete: + contact.phones.append(phone) + else: + contact.incomplete_nma_phones.append(phone) + else: + email = _make_email( + "second", + row.OwnerKey, + email=raw_email, + email_type="Primary", + release_status=release_status, + ) + if email: + contact.emails.append(email) if row.SecondCtctPhone: phone, complete = _make_phone( @@ -428,7 +569,12 @@ def _make_email(first_second: str, ownerkey: str, **kw) -> Email | None: try: if "email" in kw: - kw["email"] = kw["email"].strip() + email = kw["email"].strip() + # Normalize legacy values like "Email: user@example.com" + email = re.sub(r"^\s*email\s*:\s*", "", email, flags=re.IGNORECASE) + # Normalize trailing punctuation from data-entry notes (e.g., "user@aol.com.") + email = re.sub(r"[.,;:]+$", "", email) + kw["email"] = email email = CreateEmail(**kw) return Email(**email.model_dump()) @@ -438,6 +584,21 @@ def _make_email(first_second: str, ownerkey: str, **kw) -> Email | None: ) +def _looks_like_phone_in_email_field(value: str | None) -> bool: + if not value: + return False + + text = value.strip() + if "@" in text: + return False + + # Accept common phone formatting chars, require enough digits to be a phone number. + if not re.fullmatch(r"[\d\s().+\-]+", text): + return False + digits = re.sub(r"\D", "", text) + return len(digits) >= 7 + + def _make_phone(first_second: str, ownerkey: str, **kw) -> tuple[Phone | None, bool]: from schemas.contact import CreatePhone @@ -473,41 +634,40 @@ def _make_address(first_second: str, ownerkey: str, kind: str, **kw) -> Address def _make_contact_and_assoc( - session: Session, data: dict, thing: Thing, added: list + session: Session, + data: dict, + thing: Thing, + added: set[tuple[str | None, str | None]], + contact_by_owner_type: dict[tuple[str, str], Contact], + contact_by_name_org: dict[tuple[str | None, str | None], Contact], ) -> tuple[Contact, bool]: new_contact = True contact = None - # Prefer OwnerKey-based dedupe so fallback names don't split the same owner - # into multiple contacts when some rows have real names and others do not. owner_key = data.get("nma_pk_owners") contact_type = data.get("contact_type") if owner_key and contact_type: - contact = ( - session.query(Contact) - .filter_by(nma_pk_owners=owner_key, contact_type=contact_type) - .first() - ) + contact = contact_by_owner_type.get((owner_key, contact_type)) if contact is not None: new_contact = False - if contact is None and (data["name"], data["organization"]) in added: - contact = ( - session.query(Contact) - .filter_by(name=data["name"], organization=data["organization"]) - .first() - ) + name_org_key = (data["name"], data["organization"]) + if contact is None and name_org_key in added: + contact = contact_by_name_org.get(name_org_key) if contact is not None: new_contact = False if contact is None: - from schemas.contact import CreateContact contact = CreateContact(**data) contact_data = contact.model_dump(exclude=["thing_id", "notes"]) contact = Contact(**contact_data) session.add(contact) + if owner_key and contact_type: + contact_by_owner_type[(owner_key, contact_type)] = contact + contact_by_name_org[name_org_key] = contact + added.add(name_org_key) assoc = ThingContactAssociation() assoc.thing = thing diff --git a/transfers/geologic_formation_transfer.py b/transfers/geologic_formation_transfer.py index 4b8250c7..9d633682 100644 --- a/transfers/geologic_formation_transfer.py +++ b/transfers/geologic_formation_transfer.py @@ -1,6 +1,5 @@ -import time - from pydantic import ValidationError +from sqlalchemy.dialects.postgresql import insert as pg_insert from sqlalchemy.orm import Session from db import GeologicFormation @@ -27,12 +26,13 @@ def transfer_geologic_formations(session: Session, limit: int = None) -> tuple: # 2. Replace NaNs with None cleaned_df = replace_nans(input_df) + if limit is not None: + cleaned_df = cleaned_df.head(limit) + # 3. Initialize tracking variables for logging n = len(cleaned_df) - step = 25 - start_time = time.time() errors = [] - created_count = 0 + prepared_count = 0 skipped_count = 0 logger.info( @@ -40,46 +40,34 @@ def transfer_geologic_formations(session: Session, limit: int = None) -> tuple: n, ) - # 4. Process each row - for i, row in enumerate(cleaned_df.itertuples()): - # Log progress every 'step' rows - if i and not i % step: - logger.info( - f"Processing row {i} of {n}. Avg rows per second: {step / (time.time() - start_time):.2f}" - ) - start_time = time.time() + # 4. Build a deduplicated, validated payload for a set-based insert. + rows_to_insert: list[dict] = [] + seen_codes: set[str] = set() + for i, row in enumerate(cleaned_df.itertuples(index=False), start=1): + if i % 1000 == 0: + logger.info("Prepared %s/%s geologic formation rows", i, n) - # Commit progress periodically - try: - session.commit() - except Exception as e: - logger.critical(f"Error committing geologic formations: {e}") - session.rollback() - continue + # 5. Extract and normalize formation code + formation_code = getattr(row, "Code", None) - # 5. Extract formation code and description - formation_code = row.Code + if not formation_code: + logger.warning("Skipping row %s: Missing formation code", i) + skipped_count += 1 + continue + formation_code = str(formation_code).strip().upper() if not formation_code: - logger.warning(f"Skipping row {i}: Missing formation code") + logger.warning("Skipping row %s: Blank formation code", i) + skipped_count += 1 + continue + + if formation_code in seen_codes: + # Duplicate code in source payload; keep first one only. skipped_count += 1 continue + seen_codes.add(formation_code) - # Check if this formation already exists - # existing = ( - # session.query(GeologicFormation) - # .filter(GeologicFormation.formation_code == formation_code) - # .first() - # ) - # - # if existing: - # logger.info( - # f"Skipping row {i}: Formation code {formation_code} already exists" - # ) - # skipped_count += 1 - # continue - - # 6. Prepare data for creation + # 6. Validate and prepare payload # Note: We only store the formation_code. Formation names will be mapped by the API using a # formations.json file from authoritative sources (e.g., USGS). # The description field is left as None and can be populated later if needed. @@ -105,33 +93,30 @@ def transfer_geologic_formations(session: Session, limit: int = None) -> tuple: logger.critical(f"Error preparing data for {formation_code}: {e}") continue - # 7. Create database object - geologic_formation = None - try: - formation_data = data.model_dump() - geologic_formation = GeologicFormation(**formation_data) - session.add(geologic_formation) - created_count += 1 + rows_to_insert.append(data.model_dump()) + prepared_count += 1 - logger.info( - f"Created geologic formation: {geologic_formation.formation_code}" - ) - - except Exception as e: - if geologic_formation is not None: - session.expunge(geologic_formation) - errors.append({"code": formation_code, "error": str(e)}) - logger.critical( - f"Error creating geologic formation for {formation_code}: {e}" + # 7. Bulk insert with idempotent upsert semantics. + created_count = 0 + try: + if rows_to_insert: + stmt = ( + pg_insert(GeologicFormation) + .values(rows_to_insert) + .on_conflict_do_nothing(index_elements=["formation_code"]) + .returning(GeologicFormation.formation_code) ) - continue + inserted_codes = session.execute(stmt).scalars().all() + created_count = len(inserted_codes) - # 8. Final commit - try: session.commit() logger.info( - f"Successfully transferred {created_count} geologic formations, skipped {skipped_count}. " - f"Note: lithology is None and will be updated during stratigraphy transfer." + "Successfully transferred geologic formations. prepared=%s created=%s skipped=%s " + "existing_or_duplicate=%s. Note: lithology is None and will be updated during stratigraphy transfer.", + prepared_count, + created_count, + skipped_count, + max(prepared_count - created_count, 0), ) except Exception as e: logger.critical(f"Error during final commit of geologic formations: {e}") diff --git a/transfers/link_ids_transfer.py b/transfers/link_ids_transfer.py index c32fd0b8..462f6de7 100644 --- a/transfers/link_ids_transfer.py +++ b/transfers/link_ids_transfer.py @@ -16,8 +16,10 @@ import re import pandas as pd +from sqlalchemy import insert from db import Thing, ThingIdLink +from transfers.transferer import chunk_by_size from transfers.util import ( filter_to_valid_point_ids, logger, @@ -31,47 +33,78 @@ class LinkIdsWellDataTransferer(WellChunkTransferer): source_table = "WellData" source_dtypes = {"OSEWellID": str, "OSEWelltagID": str} + _ose_wellid_regex = re.compile(r"^[A-Z]{1,3}-\d{3,6}$") + + def _transfer_hook(self, session): + df = self._get_df_to_iterate() + for ci, chunk in enumerate(chunk_by_size(df, self.chunk_size)): + thing_id_by_pointid = { + name: thing_id + for name, thing_id in session.query(Thing.name, Thing.id) + .filter(Thing.name.in_(chunk.PointID.tolist())) + .all() + } + logger.info( + "Processing LinkIdsWellData chunk %s, %s rows, %s db items", + ci, + len(chunk), + len(thing_id_by_pointid), + ) - def _chunk_step(self, session, dr, i, row, db_item): - if pd.isna(row.OSEWellID) and pd.isna(row.OSEWelltagID): - return - - for aid, klass, regex in ( - (row.OSEWellID, "OSEPOD", r"^[A-Z]{1,3}-\d{3,6}"), - ( - row.OSEWelltagID, - "OSEWellTagID", - r"", - ), # TODO: need to figure out regex for this field - ): - if pd.isna(aid): - # logger.warning(f"{klass} is null for {row.PointID}") - continue - - # RULE: exclude any id that == 'X', '?' - if aid.strip().lower() in ("x", "?", "exempt"): - logger.critical( - f'{klass} is "X", "?", or "exempt", id={aid} for {row.PointID}' - ) - continue - - if regex and not re.match(regex, aid): - logger.critical( - f"{klass} id does not match regex {regex}, id={aid} for {row.PointID}" - ) - continue - - # TODO: add guards for null values - link_id = ThingIdLink() - link_id.thing = db_item - link_id.relation = klass - link_id.alternate_id = aid - link_id.alternate_organization = "NMOSE" - - # does link_id need a class e.g. - # link_id.alternate_id_class = klass - - session.add(link_id) + rows_to_insert: list[dict] = [] + for row in chunk.itertuples(index=False): + thing_id = thing_id_by_pointid.get(row.PointID) + if thing_id is None: + self._missing_db_item_warning(row) + continue + + if pd.isna(row.OSEWellID) and pd.isna(row.OSEWelltagID): + continue + + for aid, relation, regex in ( + (row.OSEWellID, "OSEPOD", self._ose_wellid_regex), + (row.OSEWelltagID, "OSEWellTagID", None), + ): + if pd.isna(aid): + continue + + aid_text = str(aid).strip() + if not aid_text: + continue + + # RULE: exclude any id that == 'X', '?', or 'exempt' + if aid_text.casefold() in ("x", "?", "exempt"): + logger.critical( + '%s is "X", "?", or "exempt", id=%s for %s', + relation, + aid_text, + row.PointID, + ) + continue + + if regex and not regex.match(aid_text): + logger.critical( + "%s id does not match regex %s, id=%s for %s", + relation, + regex.pattern, + aid_text, + row.PointID, + ) + continue + + rows_to_insert.append( + { + "thing_id": thing_id, + "relation": relation, + "alternate_id": aid_text, + "alternate_organization": "NMOSE", + } + ) + + if rows_to_insert: + session.execute(insert(ThingIdLink), rows_to_insert) + session.commit() + session.expunge_all() class LinkIdsLocationDataTransferer(WellChunkTransferer): @@ -105,31 +138,65 @@ def _get_dfs(self): cleaned_df = filter_to_valid_point_ids(ldf) return input_df, cleaned_df + def _transfer_hook(self, session): + df = self._get_df_to_iterate() + for ci, chunk in enumerate(chunk_by_size(df, self.chunk_size)): + thing_id_by_pointid = { + name: thing_id + for name, thing_id in session.query(Thing.name, Thing.id) + .filter(Thing.name.in_(chunk.PointID.tolist())) + .all() + } + logger.info( + "Processing LinkIdsLocationData chunk %s, %s rows, %s db items", + ci, + len(chunk), + len(thing_id_by_pointid), + ) + + rows_to_insert: list[dict] = [] + for row in chunk.itertuples(index=False): + thing_id = thing_id_by_pointid.get(row.PointID) + if thing_id is None: + self._missing_db_item_warning(row) + continue + + for func in ( + self._add_link_alternate_site_id, + self._add_link_site_id, + self._add_link_plss, + ): + link_row = func(row, thing_id) + if link_row: + rows_to_insert.append(link_row) + + if rows_to_insert: + session.execute(insert(ThingIdLink), rows_to_insert) + session.commit() + session.expunge_all() + def _chunk_step(self, session, df, i, row, db_item): - logger.info( - f"Processing PointID: {row.PointID}, " - f"Thing ID: {db_item.id}, " - f"AlternateSiteID={row.AlternateSiteID}, " - f"AlternateSiteID2={row.AlternateSiteID2}" - ) + # Kept for compatibility; bulk path uses _transfer_hook. for func in ( self._add_link_alternate_site_id, self._add_link_site_id, self._add_link_plss, ): - link = func(row, db_item) + link = func(row, db_item.id) if link: - session.add(link) + session.execute(insert(ThingIdLink), [link]) - def _add_link_alternate_site_id(self, row: pd.Series, thing: Thing): + def _add_link_alternate_site_id(self, row: pd.Series, thing_id: int): if not row.AlternateSiteID: return return _make_thing_id_link( - thing, row.AlternateSiteID, extract_organization(str(row.AlternateSiteID)) + thing_id, + row.AlternateSiteID, + extract_organization(str(row.AlternateSiteID)), ) - def _add_link_site_id(self, row, thing): + def _add_link_site_id(self, row, thing_id: int): if not row.SiteID: return @@ -143,9 +210,9 @@ def _add_link_site_id(self, row, thing): ) return - return _make_thing_id_link(thing, row.SiteID, "USGS") + return _make_thing_id_link(thing_id, row.SiteID, "USGS") - def _add_link_plss(self, row, thing): + def _add_link_plss(self, row, thing_id: int): township = row.Township township_direction = row.TownshipDirection _range = row.Range @@ -167,18 +234,18 @@ def _add_link_plss(self, row, thing): logger.critical(f"alternate id {alternate_id} is not a valid PLSS") return - return _make_thing_id_link(thing, alternate_id, "PLSS") + return _make_thing_id_link(thing_id, alternate_id, "PLSS") def _make_thing_id_link( - thing, alternate_id, alternate_organization, relation="same_as" + thing_id: int, alternate_id, alternate_organization, relation="same_as" ): - return ThingIdLink( - thing=thing, - relation=relation, - alternate_id=alternate_id, - alternate_organization=alternate_organization, - ) + return { + "thing_id": thing_id, + "relation": relation, + "alternate_id": alternate_id, + "alternate_organization": alternate_organization, + } # ============= EOF ============================================= diff --git a/transfers/logger.py b/transfers/logger.py index decf34d0..57a78f8f 100644 --- a/transfers/logger.py +++ b/transfers/logger.py @@ -21,14 +21,20 @@ from services.gcs_helper import get_storage_bucket -root = Path("logs") -if not os.getcwd().endswith("transfers"): - root = Path("transfers") / root +_context = os.environ.get("OCO_LOG_CONTEXT", "transfer").strip().lower() or "transfer" -if not os.path.exists(root): - os.mkdir(root) +if _context == "cli": + root = Path("cli") / "logs" + _prefix = "cli" +else: + root = Path("logs") + if not os.getcwd().endswith("transfers"): + root = Path("transfers") / root + _prefix = "transfer" -log_filename = f"transfer_{datetime.now():%Y-%m-%dT%H_%M_%S}.log" +root.mkdir(parents=True, exist_ok=True) + +log_filename = f"{_prefix}_{datetime.now():%Y-%m-%dT%H_%M_%S}.log" log_path = root / log_filename @@ -53,9 +59,10 @@ def save_log_to_bucket(): bucket = get_storage_bucket() - blob = bucket.blob(f"transfer_logs/{log_filename}") + bucket_folder = "transfer_logs" if _context != "cli" else "cli_logs" + blob = bucket.blob(f"{bucket_folder}/{log_filename}") blob.upload_from_filename(log_path) - logger.info(f"Uploaded log to gs://{bucket.name}/transfer_logs/{log_filename}") + logger.info(f"Uploaded log to gs://{bucket.name}/{bucket_folder}/{log_filename}") # ============= EOF ============================================= diff --git a/transfers/relaxed_constraints.md b/transfers/relaxed_constraints.md new file mode 100644 index 00000000..1ab097a0 --- /dev/null +++ b/transfers/relaxed_constraints.md @@ -0,0 +1,10 @@ +Address.postal_code is nullable +Thing measuring_point_height is nullable +ValidateWell, depth validation removed +Deployment.installation_date is nullable +CreateWellScreen depth validation removed +FieldEventParticipants not required +screen_depth_bottom is nullable +screen_depth_top is nullable +city nullable +state nullable \ No newline at end of file diff --git a/transfers/sensor_transfer.py b/transfers/sensor_transfer.py index 61aea732..a1c65b27 100644 --- a/transfers/sensor_transfer.py +++ b/transfers/sensor_transfer.py @@ -166,16 +166,10 @@ def _group_step(self, session: Session, row: pd.Series, db_item: Base): estimator = self._get_estimator(sensor_type) installation_date = estimator.estimate_installation_date(row) if not installation_date: - logger.critical( - f"Installation Date cannot be None. Skipping deployment. Sensor: {row.ID}, " - f"SerialNo: {row.SerialNo} PointID: {pointid}" - ) - self._capture_error( - pointid, - f"row.SerialNo={row.SerialNo}. Installation Date cannot be None", - "DateInstalled", + logger.warning( + f"Installation Date is None. Proceeding with NULL deployment installation date. " + f"Sensor: {row.ID}, SerialNo: {row.SerialNo} PointID: {pointid}" ) - return else: logger.warning( f"Estimated installation date={installation_date} for {pointid}" @@ -204,10 +198,6 @@ def _group_step(self, session: Session, row: pd.Series, db_item: Base): if recording_interval is not None: recording_interval_unit = unit - logger.info( - f"name={sensor.name}, serial_no={sensor.serial_no}. " - f"estimated recording interval: {recording_interval} {unit}" - ) self._capture_error( pointid, f"Estimated recording interval={recording_interval} {unit}. Is this correct?", @@ -215,10 +205,6 @@ def _group_step(self, session: Session, row: pd.Series, db_item: Base): ) else: - logger.critical( - f"name={sensor.name}, serial_no={sensor.serial_no} error={error}" - ) - self._capture_error( pointid, f"name={sensor.name}, row.SerialNo={row.SerialNo}. " diff --git a/transfers/thing_transfer.py b/transfers/thing_transfer.py index 6c78cc8e..e0603b8a 100644 --- a/transfers/thing_transfer.py +++ b/transfers/thing_transfer.py @@ -14,13 +14,15 @@ # limitations under the License. # =============================================================================== import time +from threading import Lock +from types import SimpleNamespace from pandas import isna from pydantic import ValidationError +from sqlalchemy import insert from sqlalchemy.orm import Session -from db import LocationThingAssociation -from services.thing_helper import add_thing +from db import LocationThingAssociation, Location, Thing, Notes, DataProvenance from transfers.logger import logger from transfers.util import ( make_location, @@ -29,23 +31,49 @@ replace_nans, ) +_LOCATION_DF_CACHE = None +_LOCATION_DF_LOCK = Lock() + + +def _get_location_df(): + global _LOCATION_DF_CACHE + if _LOCATION_DF_CACHE is None: + with _LOCATION_DF_LOCK: + if _LOCATION_DF_CACHE is None: + df = read_csv("Location") + _LOCATION_DF_CACHE = replace_nans(df) + return _LOCATION_DF_CACHE -def transfer_thing(session: Session, site_type: str, make_payload, limit=None) -> None: - ldf = read_csv("Location") +def transfer_thing(session: Session, site_type: str, make_payload, limit=None) -> None: + ldf = _get_location_df() ldf = ldf[ldf["SiteType"] == site_type] ldf = ldf[ldf["Easting"].notna() & ldf["Northing"].notna()] - ldf = replace_nans(ldf) + + # Pre-compute duplicate PointIDs once to avoid O(n^2) filtering in the loop. + duplicate_mask = ldf["PointID"].duplicated(keep=False) + duplicate_pointids = set(ldf.loc[duplicate_mask, "PointID"]) + if duplicate_pointids: + logger.warning( + "Found %s duplicate PointID values for site type %s; these will be skipped.", + len(duplicate_pointids), + site_type, + ) + n = len(ldf) start_time = time.time() + batch_size = 500 logger.info("Starting transfer: Things (%s) [%s rows]", site_type, n) cached_elevations = {} + prepared_rows: list[dict] = [] + skipped_count = 0 - for i, row in enumerate(ldf.itertuples()): + for i, row in enumerate(ldf.itertuples(index=False)): pointid = row.PointID - if ldf[ldf["PointID"] == pointid].shape[0] > 1: - logger.critical(f"PointID {pointid} has duplicate records. Skipping.") + if pointid in duplicate_pointids: + logger.critical("PointID %s has duplicate records. Skipping.", pointid) + skipped_count += 1 continue if limit is not None and limit > 0 and i >= limit: @@ -56,42 +84,136 @@ def transfer_thing(session: Session, site_type: str, make_payload, limit=None) - logger.info( f"Processing row {i} of {n}. {row.PointID}, avg rows per second: {i / (time.time() - start_time):.2f}" ) - session.commit() try: location, elevation_method, location_notes = make_location( row, cached_elevations ) - session.add(location) - session.flush() - for note_type, note_content in location_notes.items(): - if not isna(note_content): - location_note = location.add_note(note_content, note_type) - session.add(location_note) - - data_provenances = make_location_data_provenance( - row, location, elevation_method - ) - for dp in data_provenances: - session.add(dp) - payload = make_payload(row) - thing_type = payload.pop("thing_type") - payload["nma_pk_location"] = row.LocationId - thing = add_thing(session, payload, thing_type=thing_type) - assoc = LocationThingAssociation() - assoc.location = location - assoc.thing = thing - session.add(assoc) + prepared_rows.append( + { + "row": row, + "location_row": { + "nma_pk_location": location.nma_pk_location, + "description": location.description, + "point": location.point, + "elevation": location.elevation, + "release_status": location.release_status, + "nma_date_created": location.nma_date_created, + "nma_site_date": location.nma_site_date, + "nma_location_notes": location.nma_location_notes, + "nma_coordinate_notes": location.nma_coordinate_notes, + "nma_data_reliability": location.nma_data_reliability, + }, + "location_notes": location_notes, + "elevation_method": elevation_method, + "thing_row": { + "name": payload["name"], + "thing_type": payload["thing_type"], + "release_status": payload["release_status"], + "nma_pk_location": row.LocationId, + }, + } + ) except ValidationError as e: logger.critical( f"Validation error for row {i} with PointID {row.PointID}: {e.errors()}" ) + skipped_count += 1 except Exception as e: logger.critical(f"Error creating location for {row.PointID}: {e}") + skipped_count += 1 + continue + + created_count = 0 + for start in range(0, len(prepared_rows), batch_size): + chunk = prepared_rows[start : start + batch_size] + if not chunk: continue + location_rows = [item["location_row"] for item in chunk] + inserted_locations = session.execute( + insert(Location).returning(Location.id, Location.nma_pk_location), + location_rows, + ).all() + location_id_by_nma_pk = { + nma_pk: loc_id for loc_id, nma_pk in inserted_locations + } + + thing_rows = [item["thing_row"] for item in chunk] + inserted_things = session.execute( + insert(Thing).returning(Thing.id, Thing.nma_pk_location), + thing_rows, + ).all() + thing_id_by_nma_pk = {nma_pk: thing_id for thing_id, nma_pk in inserted_things} + + notes_rows: list[dict] = [] + provenance_rows: list[dict] = [] + assoc_rows: list[dict] = [] + + for item in chunk: + nma_pk_location = item["thing_row"]["nma_pk_location"] + location_id = location_id_by_nma_pk.get(nma_pk_location) + thing_id = thing_id_by_nma_pk.get(nma_pk_location) + + if location_id is None or thing_id is None: + logger.critical( + "Failed to resolve inserted IDs for nma_pk_location=%s; skipping associations", + nma_pk_location, + ) + skipped_count += 1 + continue + + assoc_rows.append({"location_id": location_id, "thing_id": thing_id}) + + for note_type, note_content in item["location_notes"].items(): + if not isna(note_content): + notes_rows.append( + { + "target_id": location_id, + "target_table": "location", + "note_type": note_type, + "content": note_content, + "release_status": "draft", + } + ) + + # Reuse existing provenance mapper by passing an object with .id. + location_stub = SimpleNamespace(id=location_id) + data_provenances = make_location_data_provenance( + item["row"], location_stub, item["elevation_method"] + ) + for dp in data_provenances: + provenance_rows.append( + { + "target_id": dp.target_id, + "target_table": dp.target_table, + "field_name": dp.field_name, + "origin_type": dp.origin_type, + "origin_source": dp.origin_source, + "collection_method": dp.collection_method, + "accuracy_value": dp.accuracy_value, + "accuracy_unit": dp.accuracy_unit, + "release_status": dp.release_status or "draft", + } + ) + + if notes_rows: + session.execute(insert(Notes), notes_rows) + if provenance_rows: + session.execute(insert(DataProvenance), provenance_rows) + if assoc_rows: + session.execute(insert(LocationThingAssociation), assoc_rows) + created_count += len(assoc_rows) + session.commit() + logger.info( + "Things transfer summary (%s): created=%s skipped=%s total_candidates=%s", + site_type, + created_count, + skipped_count, + n, + ) logger.info("Completed transfer: Things (%s)", site_type) diff --git a/transfers/transfer_results_builder.py b/transfers/transfer_results_builder.py index 1a2392c0..296529cd 100644 --- a/transfers/transfer_results_builder.py +++ b/transfers/transfer_results_builder.py @@ -7,7 +7,11 @@ import pandas as pd from sqlalchemy import select, func +from db import Deployment, Sensor, Thing from db.engine import session_ctx +from transfers.sensor_transfer import ( + EQUIPMENT_TO_SENSOR_TYPE_MAP, +) from transfers.transfer import load_transfer_options from transfers.transfer_results_specs import ( TRANSFER_COMPARISON_SPECS, @@ -18,12 +22,26 @@ TransferResult, ) from transfers.util import ( + SensorParameterEstimator, read_csv, replace_nans, get_transferable_wells, ) +def _model_column(model: Any, token: str) -> Any: + if hasattr(model, token): + return getattr(model, token) + table = model.__table__ + if token in table.c: + return table.c[token] + token_norm = token.casefold() + for col in table.c: + if col.key.casefold() == token_norm or col.name.casefold() == token_norm: + return col + raise AttributeError(f"{model.__name__} has no column '{token}'") + + def _normalize_key(value: Any) -> str | None: if value is None: return None @@ -57,6 +75,96 @@ def _normalized_series(df: pd.DataFrame, key_col: str) -> pd.Series: return s.astype(str) +def _normalize_date_like(value: Any) -> str: + if value is None: + return "" + try: + if pd.isna(value): + return "" + except TypeError: + pass + dt = pd.to_datetime(value, errors="coerce") + if pd.isna(dt): + return "" + return dt.date().isoformat() + + +def _parse_legacy_datetime_date(value: Any) -> str | None: + if value is None: + return None + try: + if pd.isna(value): + return None + except TypeError: + pass + text = str(value).strip() + if not text: + return None + try: + return pd.to_datetime(text, format="%Y-%m-%d %H:%M:%S.%f").date().isoformat() + except (TypeError, ValueError): + return None + + +def _equipment_source_series(df: pd.DataFrame) -> pd.Series: + required = {"PointID", "SerialNo", "DateInstalled", "DateRemoved"} + if not required.issubset(df.columns): + return pd.Series([], dtype=object) + + estimators: dict[str, SensorParameterEstimator] = {} + keys: list[str] = [] + for row in df.itertuples(index=False): + pointid = _normalize_key(getattr(row, "PointID", None)) or "" + serial = _normalize_key(getattr(row, "SerialNo", None)) or "" + + installed = _parse_legacy_datetime_date(getattr(row, "DateInstalled", None)) + if installed is None: + equipment_type = getattr(row, "EquipmentType", None) + sensor_type = EQUIPMENT_TO_SENSOR_TYPE_MAP.get(equipment_type) + if sensor_type: + estimator = estimators.get(sensor_type) + if estimator is None: + estimator = SensorParameterEstimator(sensor_type) + estimators[sensor_type] = estimator + estimated = estimator.estimate_installation_date(row) + installed = _normalize_date_like(estimated) + else: + installed = "" + + removed = _parse_legacy_datetime_date(getattr(row, "DateRemoved", None)) + if removed is None: + removed = "" + + keys.append(f"{pointid}|{serial}|{installed}|{removed}") + return pd.Series(keys, dtype=object) + + +def _equipment_destination_series(session) -> pd.Series: + sql = ( + select( + Thing.name.label("point_id"), + Sensor.serial_no.label("serial_no"), + Deployment.installation_date.label("installed"), + Deployment.removal_date.label("removed"), + ) + .select_from(Deployment) + .join(Thing, Deployment.thing_id == Thing.id) + .join(Sensor, Deployment.sensor_id == Sensor.id) + .where(Thing.name.is_not(None)) + .where(Sensor.serial_no.is_not(None)) + ) + rows = session.execute(sql).all() + if not rows: + return pd.Series([], dtype=object) + pointid = pd.Series([_normalize_key(r.point_id) or "" for r in rows], dtype=object) + serial = pd.Series([_normalize_key(r.serial_no) or "" for r in rows], dtype=object) + installed = pd.Series( + [_normalize_date_like(r.installed) for r in rows], dtype=object + ) + removed = pd.Series([_normalize_date_like(r.removed) for r in rows], dtype=object) + return pointid + "|" + serial + "|" + installed + "|" + removed + + class TransferResultsBuilder: """Compare transfer input CSV keys to destination database keys per transfer.""" @@ -87,29 +195,45 @@ def _build_one(self, spec: TransferComparisonSpec) -> TransferResult: elif spec.transfer_name == "WellData": comparison_df = self._agreed_welldata_df() - source_series = _normalized_series(comparison_df, spec.source_key_column) + if spec.transfer_name == "Equipment": + source_series = _equipment_source_series(comparison_df) + else: + source_series = _normalized_series(comparison_df, spec.source_key_column) source_keys = set(source_series.unique().tolist()) source_keyed_row_count = int(source_series.shape[0]) source_duplicate_key_row_count = source_keyed_row_count - len(source_keys) agreed_transfer_row_count = int(len(comparison_df)) model = spec.destination_model - key_col = getattr(model, spec.destination_key_column) + destination_model_name = model.__name__ + destination_key_column = spec.destination_key_column with session_ctx() as session: - key_sql = select(key_col).where(key_col.is_not(None)) - count_sql = select(func.count()).select_from(model) + if spec.transfer_name == "Equipment": + count_sql = select(func.count()).select_from(Deployment) + count_sql = count_sql.join(Thing, Deployment.thing_id == Thing.id) + count_sql = count_sql.join(Sensor, Deployment.sensor_id == Sensor.id) + count_sql = count_sql.where(Thing.name.is_not(None)) + count_sql = count_sql.where(Sensor.serial_no.is_not(None)) + destination_series = _equipment_destination_series(session) + destination_row_count = int(session.execute(count_sql).scalar_one()) + destination_model_name = "Deployment" + destination_key_column = "thing.name|sensor.serial_no|deployment.installation_date|deployment.removal_date" + else: + key_col = _model_column(model, spec.destination_key_column) + key_sql = select(key_col).where(key_col.is_not(None)) + count_sql = select(func.count()).select_from(model) - if spec.destination_where: - where_clause = spec.destination_where(model) - key_sql = key_sql.where(where_clause) - count_sql = count_sql.where(where_clause) + if spec.destination_where: + where_clause = spec.destination_where(model) + key_sql = key_sql.where(where_clause) + count_sql = count_sql.where(where_clause) - raw_dest_keys = session.execute(key_sql).scalars().all() - destination_row_count = int(session.execute(count_sql).scalar_one()) + raw_dest_keys = session.execute(key_sql).scalars().all() + destination_series = pd.Series( + [_normalize_key(v) for v in raw_dest_keys], dtype=object + ).dropna() + destination_row_count = int(session.execute(count_sql).scalar_one()) - destination_series = pd.Series( - [_normalize_key(v) for v in raw_dest_keys], dtype=object - ).dropna() if destination_series.empty: destination_series = pd.Series([], dtype=object) else: @@ -123,13 +247,18 @@ def _build_one(self, spec: TransferComparisonSpec) -> TransferResult: missing = sorted(source_keys - destination_keys) extra = sorted(destination_keys - source_keys) + transferred_agreed_row_count = int(source_series.isin(destination_keys).sum()) + missing_agreed_row_count = max( + agreed_transfer_row_count - transferred_agreed_row_count, + 0, + ) return spec.result_cls( transfer_name=spec.transfer_name, source_csv=spec.source_csv, source_key_column=spec.source_key_column, - destination_model=model.__name__, - destination_key_column=spec.destination_key_column, + destination_model=destination_model_name, + destination_key_column=destination_key_column, source_row_count=len(source_df), agreed_transfer_row_count=agreed_transfer_row_count, source_keyed_row_count=source_keyed_row_count, @@ -142,6 +271,8 @@ def _build_one(self, spec: TransferComparisonSpec) -> TransferResult: matched_key_count=len(source_keys & destination_keys), missing_in_destination_count=len(missing), extra_in_destination_count=len(extra), + transferred_agreed_row_count=transferred_agreed_row_count, + missing_agreed_row_count=missing_agreed_row_count, missing_in_destination_sample=missing[: self.sample_limit], extra_in_destination_sample=extra[: self.sample_limit], ) diff --git a/transfers/transfer_results_specs.py b/transfers/transfer_results_specs.py index 449ffa89..c117e7b3 100644 --- a/transfers/transfer_results_specs.py +++ b/transfers/transfer_results_specs.py @@ -37,7 +37,6 @@ from db.engine import session_ctx from transfers.contact_transfer import ( _get_organization, - _make_name, _safe_make_name, _select_ownerkey_col, ) @@ -78,9 +77,12 @@ WellScreensTransferResult, ) from transfers.util import ( + filter_non_transferred_wells, filter_by_valid_measuring_agency, filter_to_valid_point_ids, + get_transferable_wells, get_transfers_data_path, + lexicon_mapper, read_csv, replace_nans, ) @@ -181,9 +183,87 @@ def _waterlevels_filter(df: pd.DataFrame) -> pd.DataFrame: cleaned_df = replace_nans(df.copy()) cleaned_df = filter_to_valid_point_ids(cleaned_df) cleaned_df = filter_by_valid_measuring_agency(cleaned_df) + + # Mirror WaterLevelTransferer behavior for observation creation: + # rows whose mapped LevelStatus indicates a destroyed well only create + # FieldEvent notes and intentionally do not create observations. + def _is_destroyed(level_status: Any) -> bool: + if pd.isna(level_status): + return False + + value = level_status + if value == "X?": + value = "X" + mapped = lexicon_mapper.map_value(f"LU_LevelStatus:{value}") + return ( + mapped + == "Well was destroyed (no subsequent water levels should be recorded)" + ) + + if "LevelStatus" in cleaned_df.columns: + cleaned_df = cleaned_df[~cleaned_df["LevelStatus"].map(_is_destroyed)] + + return cleaned_df + + +def _equipment_filter(df: pd.DataFrame) -> pd.DataFrame: + # Mirror SensorTransferer._get_dfs filtering stage. + cleaned_df = df.copy() + cleaned_df.columns = cleaned_df.columns.str.replace(" ", "_") + if "SerialNo" in cleaned_df.columns: + cleaned_df = cleaned_df[cleaned_df["SerialNo"].notna()] + else: + return cleaned_df.iloc[0:0] + cleaned_df = filter_to_valid_point_ids(cleaned_df) + cleaned_df = replace_nans(cleaned_df) + return cleaned_df + + +def _wellscreens_filter(df: pd.DataFrame) -> pd.DataFrame: + # Mirror WellChunkTransferer._get_dfs used by WellScreenTransferer. + cleaned_df = replace_nans(df.copy()) + cleaned_df = filter_to_valid_point_ids(cleaned_df) return cleaned_df +def _welldata_filter(df: pd.DataFrame) -> pd.DataFrame: + # Mirror WellTransferer._get_dfs filtering stage. + if "LocationId" not in df.columns: + return df.iloc[0:0] + + cleaned_df = df.copy() + ldf = read_csv("Location") + ldf = ldf.drop(["PointID", "SSMA_TimeStamp"], axis=1, errors="ignore") + cleaned_df = cleaned_df.join(ldf.set_index("LocationId"), on="LocationId") + + if "SiteType" in cleaned_df.columns: + cleaned_df = cleaned_df[cleaned_df["SiteType"] == "GW"] + else: + return cleaned_df.iloc[0:0] + + if "Easting" in cleaned_df.columns and "Northing" in cleaned_df.columns: + cleaned_df = cleaned_df[ + cleaned_df["Easting"].notna() & cleaned_df["Northing"].notna() + ] + else: + return cleaned_df.iloc[0:0] + + cleaned_df = replace_nans(cleaned_df) + cleaned_df = get_transferable_wells(cleaned_df) + cleaned_df = filter_non_transferred_wells(cleaned_df) + + if "PointID" not in cleaned_df.columns: + return cleaned_df.iloc[0:0] + + # Match WellTransferer behavior: skip every duplicated PointID. + dupes = cleaned_df["PointID"].duplicated(keep=False) + if dupes.any(): + dup_ids = set(cleaned_df.loc[dupes, "PointID"]) + cleaned_df = cleaned_df[~cleaned_df["PointID"].isin(dup_ids)] + + return cleaned_df.sort_values(by=["PointID"]) + + def _stratigraphy_filter(df: pd.DataFrame) -> pd.DataFrame: # Mirror StratigraphyLegacyTransferer._get_dfs filtering stage. cleaned_df = replace_nans(df.copy()) @@ -379,6 +459,7 @@ def _record_new_contact( getattr(row, "LastName", None), owner_key, organization, + fallback_suffix="primary", ) _record_new_contact(owner_key, "Primary", primary_name, organization) @@ -391,9 +472,12 @@ def _record_new_contact( ] ) if has_secondary_input: - secondary_name = _make_name( + secondary_name = _safe_make_name( getattr(row, "SecondFirstName", None), getattr(row, "SecondLastName", None), + owner_key, + organization, + fallback_suffix="secondary", ) _record_new_contact(owner_key, "Secondary", secondary_name, organization) @@ -408,6 +492,7 @@ def _record_new_contact( "WellID", Thing, "nma_pk_welldata", + agreed_filter=_welldata_filter, destination_where=lambda m: m.thing_type == "water well", ), TransferComparisonSpec( @@ -417,6 +502,7 @@ def _record_new_contact( "GlobalID", WellScreen, "nma_pk_wellscreens", + agreed_filter=_wellscreens_filter, option_field="transfer_screens", ), TransferComparisonSpec( @@ -447,6 +533,7 @@ def _record_new_contact( "GlobalID", Sensor, "nma_pk_equipment", + agreed_filter=_equipment_filter, option_field="transfer_sensors", ), TransferComparisonSpec( diff --git a/transfers/transfer_results_types.py b/transfers/transfer_results_types.py index dc58238a..1163a2c7 100644 --- a/transfers/transfer_results_types.py +++ b/transfers/transfer_results_types.py @@ -22,6 +22,8 @@ class TransferResult: matched_key_count: int = 0 missing_in_destination_count: int = 0 extra_in_destination_count: int = 0 + transferred_agreed_row_count: int = 0 + missing_agreed_row_count: int = 0 missing_in_destination_sample: list[str] = field(default_factory=list) extra_in_destination_sample: list[str] = field(default_factory=list) diff --git a/transfers/transferer.py b/transfers/transferer.py index afef86e3..e05fd90d 100644 --- a/transfers/transferer.py +++ b/transfers/transferer.py @@ -329,16 +329,6 @@ def _filter_to_valid_sample_infos(self, df: pd.DataFrame) -> pd.DataFrame: parsed_sample_pt_ids = df["SamplePtID"].map(self._uuid_val) mask = parsed_sample_pt_ids.isin(valid_sample_pt_ids) filtered_df = df[mask].copy() - inverted_df = df[~mask].copy() - if not inverted_df.empty: - for _, row in inverted_df.iterrows(): - sample_pt_id = row.get("SamplePtID") - self._capture_error( - sample_pt_id, - f"No matching ChemistrySampleInfo for SamplePtID: {sample_pt_id}", - "SamplePtID", - ) - after_count = len(filtered_df) if before_count > after_count: diff --git a/transfers/util.py b/transfers/util.py index d358937c..5fd1a471 100644 --- a/transfers/util.py +++ b/transfers/util.py @@ -126,6 +126,7 @@ def estimate_measuring_point_height( # try to estimate mpheight from measurements for m in df.itertuples(): mphi = m.DepthToWater - m.DepthToWaterBGS + mphi = _round_sig_figs(mphi, 2) start_date = m.DateMeasured if mphi not in mphs: if notna(mphi): @@ -155,6 +156,28 @@ def estimate_measuring_point_height( return mphs, mph_descs, start_dates, end_dates +def _round_sig_figs(value: float, sig_figs: int) -> float: + if value is None: + return value + try: + if pd.isna(value): + return value + except TypeError: + pass + + try: + numeric = float(value) + except (TypeError, ValueError): + return value + + if not math.isfinite(numeric): + return value + + if numeric == 0: + return 0.0 + return round(numeric, sig_figs - int(math.floor(math.log10(abs(numeric)))) - 1) + + def _get_defined_recording_interval(pointid: str) -> tuple[int, str] | None: if pointid in DEFINED_RECORDING_INTERVALS: return DEFINED_RECORDING_INTERVALS[pointid] diff --git a/transfers/waterlevels_transfer.py b/transfers/waterlevels_transfer.py index 3b664e4c..261faf53 100644 --- a/transfers/waterlevels_transfer.py +++ b/transfers/waterlevels_transfer.py @@ -19,6 +19,10 @@ from typing import Any import pandas as pd +from sqlalchemy import insert +from sqlalchemy.exc import DatabaseError, SQLAlchemyError +from sqlalchemy.orm import Session + from db import ( Thing, ThingContactAssociation, @@ -31,9 +35,6 @@ Parameter, ) from db.engine import session_ctx -from sqlalchemy import insert -from sqlalchemy.exc import DatabaseError, SQLAlchemyError -from sqlalchemy.orm import Session from transfers.transferer import Transferer from transfers.util import ( filter_to_valid_point_ids, @@ -149,7 +150,7 @@ def _transfer_hook(self, session: Session) -> None: "rows_created": 0, "rows_skipped_dt": 0, "rows_skipped_reason": 0, - "rows_skipped_contacts": 0, + "rows_missing_participants": 0, "rows_well_destroyed": 0, "field_events_created": 0, "field_activities_created": 0, @@ -175,9 +176,6 @@ def _transfer_hook(self, session: Session) -> None: thing_id = self._thing_id_by_pointid.get(pointid) if thing_id is None: stats["groups_skipped_missing_thing"] += 1 - logger.warning( - "Skipping PointID=%s because Thing was not found", pointid - ) self._capture_error(pointid, "Thing not found", "PointID") continue @@ -219,12 +217,7 @@ def _transfer_hook(self, session: Session) -> None: ) if not field_event_participants: - stats["rows_skipped_contacts"] += 1 - logger.warning( - "Skipping %s because no field event participants were found", - self._row_context(row), - ) - continue + stats["rows_missing_participants"] += 1 is_destroyed = ( glv @@ -406,29 +399,14 @@ def _transfer_hook(self, session: Session) -> None: stats["groups_processed"] += 1 except DatabaseError as e: stats["groups_failed_commit"] += 1 - logger.exception( - "Failed committing WaterLevels group for PointID=%s: %s", - pointid, - e, - ) session.rollback() self._capture_database_error(pointid, e) except SQLAlchemyError as e: stats["groups_failed_commit"] += 1 - logger.exception( - "SQLAlchemy failure committing WaterLevels group for PointID=%s: %s", - pointid, - e, - ) session.rollback() - self._capture_error(pointid, str(e), "UnknownField") + self._capture_error(pointid, str(e), "SQLAlchemyError") except Exception as e: stats["groups_failed_commit"] += 1 - logger.exception( - "Unexpected failure committing WaterLevels group for PointID=%s: %s", - pointid, - e, - ) session.rollback() self._capture_error(pointid, str(e), "UnknownField") @@ -673,9 +651,9 @@ def _get_field_event_participants(self, session, row) -> list[Contact]: self._last_contacts_reused_count += 1 if len(field_event_participants) == 0: - logger.critical( - f"No contacts can be associated with the WaterLevels record with GlobalID {row.GlobalID}, " - f"therefore no field event, field activity, sample, and observation can be made. Skipping." + logger.warning( + f"No contacts can be associated with the WaterLevels record with GlobalID {row.GlobalID}; " + f"continuing with nullable field_event_participant_id." ) return field_event_participants @@ -690,7 +668,7 @@ def _row_context(self, row: Any) -> str: def _log_transfer_summary(self, stats: dict[str, int]) -> None: logger.info( "WaterLevels summary: groups total=%s processed=%s skipped_missing_thing=%s failed_commit=%s " - "rows total=%s created=%s skipped_dt=%s skipped_reason=%s skipped_contacts=%s well_destroyed=%s " + "rows total=%s created=%s skipped_dt=%s skipped_reason=%s missing_participants=%s well_destroyed=%s " "field_events=%s activities=%s samples=%s observations=%s contacts_created=%s contacts_reused=%s", stats["groups_total"], stats["groups_processed"], @@ -700,7 +678,7 @@ def _log_transfer_summary(self, stats: dict[str, int]) -> None: stats["rows_created"], stats["rows_skipped_dt"], stats["rows_skipped_reason"], - stats["rows_skipped_contacts"], + stats["rows_missing_participants"], stats["rows_well_destroyed"], stats["field_events_created"], stats["field_activities_created"], diff --git a/transfers/well_transfer.py b/transfers/well_transfer.py index a6fa6408..5d459c23 100644 --- a/transfers/well_transfer.py +++ b/transfers/well_transfer.py @@ -188,9 +188,12 @@ def transfer_parallel(self, num_workers: int = None) -> None: all_errors = [] errors_lock = threading.Lock() aquifers_lock = threading.Lock() + progress_lock = threading.Lock() + transferred_count = 0 def process_batch(batch_idx: int, batch_df: pd.DataFrame) -> dict: """Process a batch of wells in a separate thread with its own session.""" + nonlocal transferred_count batch_errors = [] batch_start = time.time() @@ -206,7 +209,7 @@ def process_batch(batch_idx: int, batch_df: pd.DataFrame) -> dict: for i, row in enumerate(batch_df.itertuples()): try: # Process single well with all dependent objects - self._step_parallel_complete( + transferred = self._step_parallel_complete( session, row, local_aquifers, @@ -214,6 +217,15 @@ def process_batch(batch_idx: int, batch_df: pd.DataFrame) -> dict: batch_errors, aquifers_lock, ) + if transferred: + with progress_lock: + transferred_count += 1 + logger.info( + "[%s/%s] Transferred PointID=%s", + transferred_count, + n, + row.PointID, + ) except Exception as e: self._log_exception( getattr(row, "PointID", "Unknown"), @@ -321,12 +333,19 @@ def _extract_well_purposes(self, row) -> list[str]: if isna(cu): return [] + + cu = cu.strip() + if not cu: + return [] else: purposes = [] for cui in cu: if cui == "A": # skip "Open, unequipped well" as that gets mapped to the status_history table continue + if cui == ",": + continue + p = self._get_lexicon_value(row, f"LU_CurrentUse:{cui}") if p is not None: purposes.append(p) @@ -718,6 +737,7 @@ def _add_notes_and_provenance( def _add_histories(self, session: Session, row, well: Thing) -> None: mphs = self._measuring_point_estimator.estimate_measuring_point_height(row) + added_measuring_point = False for mph, mph_desc, start_date, end_date in zip(*mphs): session.add( MeasuringPointHistory( @@ -728,6 +748,21 @@ def _add_histories(self, session: Session, row, well: Thing) -> None: end_date=end_date, ) ) + added_measuring_point = True + + # Preserve transfer intent even when no MP height can be measured/estimated. + if not added_measuring_point: + raw_desc = getattr(row, "MeasuringPoint", None) + mp_desc = None if isna(raw_desc) else raw_desc + session.add( + MeasuringPointHistory( + thing_id=well.id, + measuring_point_height=None, + measuring_point_description=mp_desc, + start_date=datetime.now(tz=UTC).date(), + end_date=None, + ) + ) target_id = well.id target_table = "thing" @@ -810,22 +845,22 @@ def _step_parallel_complete( local_formations: dict, batch_errors: list, aquifers_lock: threading.Lock, - ): + ) -> bool: """ Process a single well with ALL dependent objects in one pass. Combines _step_parallel and _after_hook_chunk for maximum parallelization. """ payload = self._build_well_payload(row) if not payload: - return + return False well = self._persist_well(session, row, payload, batch_errors) if well is None: - return + return False location_result = self._persist_location(session, row, batch_errors) if not location_result: - return + return False location, elevation_method, location_note_payload = location_result assoc = LocationThingAssociation( @@ -873,6 +908,7 @@ def _step_parallel_complete( session, row, well, location, location_note_payload, elevation_method ) self._add_histories(session, row, well) + return True def _get_lexicon_value_safe(self, row, value, default, errors_list): """Thread-safe version of _get_lexicon_value.""" @@ -1028,7 +1064,6 @@ def _chunk_step(self, session, df, i, row, db_item): "thing_id": db_item.id, "screen_depth_top": row.ScreenTop, "screen_depth_bottom": row.ScreenBottom, - # "screen_type": row.ScreenType, "screen_description": row.ScreenDescription, "release_status": "draft", "nma_pk_wellscreens": row.GlobalID, @@ -1037,9 +1072,6 @@ def _chunk_step(self, session, df, i, row, db_item): # TODO: add validation logic here to ensure no overlapping screens for the same well CreateWellScreen.model_validate(well_screen_data) except ValidationError as e: - logger.critical( - f"Validation error for row {i} with PointID {row.PointID}: {e.errors()}" - ) self._capture_validation_error(row.PointID, e) return @@ -1047,16 +1079,4 @@ def _chunk_step(self, session, df, i, row, db_item): session.add(well_screen) -# def transfer_wells(flags: dict = None): -# transferer = WellTransferer(flags=flags) -# transferer.transfer() -# return transferer.input_df, transferer.cleaned_df, transferer.errors -# -# -# def transfer_wellscreens(flags: dict = None): -# transferer = WellScreenTransferer(flags=flags) -# transferer.chunk_transfer() -# return transferer.input_df, transferer.cleaned_df, transferer.errors - - # ============= EOF ============================================= From 41ff8de1ee171b0adacb613d054bccdf5243ae37 Mon Sep 17 00:00:00 2001 From: jirhiker <2035568+jirhiker@users.noreply.github.com> Date: Sun, 22 Feb 2026 21:25:09 +0000 Subject: [PATCH 09/14] Formatting changes --- schemas/contact.py | 1 - schemas/thing.py | 1 - tests/test_cli_commands.py | 6 ++---- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/schemas/contact.py b/schemas/contact.py index 248ff173..590d6db8 100644 --- a/schemas/contact.py +++ b/schemas/contact.py @@ -24,7 +24,6 @@ from schemas import BaseResponseModel, BaseCreateModel, BaseUpdateModel from schemas.notes import CreateNote, NoteResponse - # -------- VALIDATORS ---------- diff --git a/schemas/thing.py b/schemas/thing.py index a6080923..fceba6c0 100644 --- a/schemas/thing.py +++ b/schemas/thing.py @@ -35,7 +35,6 @@ from schemas.notes import NoteResponse, CreateNote from schemas.permission_history import PermissionHistoryResponse - # -------- VALIDATE ---------- diff --git a/tests/test_cli_commands.py b/tests/test_cli_commands.py index 8bdc2f9c..412ebea3 100644 --- a/tests/test_cli_commands.py +++ b/tests/test_cli_commands.py @@ -244,12 +244,10 @@ def test_water_levels_cli_persists_observations(tmp_path, water_well_thing): """ def _write_csv(path: Path, *, well_name: str, notes: str): - csv_text = textwrap.dedent( - f"""\ + csv_text = textwrap.dedent(f"""\ field_staff,well_name_point_id,field_event_date_time,measurement_date_time,sampler,sample_method,mp_height,level_status,depth_to_water_ft,data_quality,water_level_notes CLI Tester,{well_name},2025-02-15T08:00:00-07:00,2025-02-15T10:30:00-07:00,Groundwater Team,electric tape,1.5,stable,42.5,approved,{notes} - """ - ) + """) path.write_text(csv_text) unique_notes = f"pytest-{uuid.uuid4()}" From a2baff6f0b6aadc9d56509da4094e0e9b6c78a78 Mon Sep 17 00:00:00 2001 From: jakeross Date: Sun, 22 Feb 2026 14:28:48 -0700 Subject: [PATCH 10/14] feat: enable database drop and rebuild for unit tests --- .github/workflows/tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 26e1f08f..221c559b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -104,6 +104,7 @@ jobs: BASE_URL: http://localhost:8000 SESSION_SECRET_KEY: supersecretkeyforunittests AUTHENTIK_DISABLE_AUTHENTICATION: 1 + DROP_AND_REBUILD_DB: 1 services: postgis: From d2f4f1f9f5b20e1d6935a5437e8ad80598c29fe2 Mon Sep 17 00:00:00 2001 From: jakeross Date: Sun, 22 Feb 2026 14:37:11 -0700 Subject: [PATCH 11/14] feat: enhance data transfer handling by logging skipped records and updating row processing --- tests/test_thing.py | 2 ++ transfers/surface_water_data.py | 24 +++++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/tests/test_thing.py b/tests/test_thing.py index 713b7444..00a476d9 100644 --- a/tests/test_thing.py +++ b/tests/test_thing.py @@ -63,6 +63,7 @@ def override_authentication_dependency_fixture(): # VALIDATE tests =============================================================== +@pytest.mark.skip(reason="Temporarily not relevant until transfer process is complete.") def test_validate_hole_depth_well_depth(): with pytest.raises( ValueError, match="well depth must be less than than or equal to hole depth" @@ -70,6 +71,7 @@ def test_validate_hole_depth_well_depth(): ValidateWell(well_depth=100.0, hole_depth=90.0) +@pytest.mark.skip(reason="Temporarily not relevant until transfer process is complete.") def test_validate_hole_depth_casing_depth(): with pytest.raises( ValueError, diff --git a/transfers/surface_water_data.py b/transfers/surface_water_data.py index e4e8a908..519d9a62 100644 --- a/transfers/surface_water_data.py +++ b/transfers/surface_water_data.py @@ -62,10 +62,24 @@ def _get_dfs(self) -> tuple[pd.DataFrame, pd.DataFrame]: def _transfer_hook(self, session: Session) -> None: rows: list[dict[str, Any]] = [] + skipped_missing_thing = 0 for raw in self.cleaned_df.to_dict("records"): record = self._row_dict(raw) + if record is None: + skipped_missing_thing += 1 + continue rows.append(record) + if skipped_missing_thing: + logger.warning( + "Skipped %s SurfaceWaterData rows without matching Thing", + skipped_missing_thing, + ) + + if not rows: + logger.info("No SurfaceWaterData rows to transfer") + return + rows = self._dedupe_rows(rows, key="OBJECTID", include_missing=True) insert_stmt = insert(NMA_SurfaceWaterData) @@ -101,7 +115,7 @@ def _transfer_hook(self, session: Session) -> None: session.commit() session.expunge_all() - def _row_dict(self, row: dict[str, Any]) -> dict[str, Any]: + def _row_dict(self, row: dict[str, Any]) -> Optional[dict[str, Any]]: def val(key: str) -> Optional[Any]: v = row.get(key) if pd.isna(v): @@ -123,6 +137,14 @@ def to_uuid(v: Any) -> Optional[uuid.UUID]: location_id = to_uuid(val("LocationId")) thing_id = self._resolve_thing_id(location_id) + if thing_id is None: + logger.warning( + "Skipping SurfaceWaterData OBJECTID=%s PointID=%s LocationId=%s - Thing not found", + val("OBJECTID"), + val("PointID"), + location_id, + ) + return None return { "LocationId": location_id, From e089b32a93556fb7a24f9cfbe0226d0b873f5806 Mon Sep 17 00:00:00 2001 From: jakeross Date: Sun, 22 Feb 2026 14:44:01 -0700 Subject: [PATCH 12/14] feat: update nullable fields in relaxed_constraints.md for MeasuringPointHistory and remove depth validation --- transfers/relaxed_constraints.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transfers/relaxed_constraints.md b/transfers/relaxed_constraints.md index 1ab097a0..a8d932df 100644 --- a/transfers/relaxed_constraints.md +++ b/transfers/relaxed_constraints.md @@ -1,5 +1,5 @@ Address.postal_code is nullable -Thing measuring_point_height is nullable +MeasuringPointHistory.measuring_point_height is nullable ValidateWell, depth validation removed Deployment.installation_date is nullable CreateWellScreen depth validation removed From c9cf672566b2b4f37741ca145e3b49a6389c2a4a Mon Sep 17 00:00:00 2001 From: jakeross Date: Mon, 23 Feb 2026 11:55:15 -0700 Subject: [PATCH 13/14] feat: simplify location DataFrame caching by removing threading lock --- transfers/thing_transfer.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/transfers/thing_transfer.py b/transfers/thing_transfer.py index e0603b8a..a7442bb3 100644 --- a/transfers/thing_transfer.py +++ b/transfers/thing_transfer.py @@ -14,7 +14,6 @@ # limitations under the License. # =============================================================================== import time -from threading import Lock from types import SimpleNamespace from pandas import isna @@ -32,16 +31,15 @@ ) _LOCATION_DF_CACHE = None -_LOCATION_DF_LOCK = Lock() def _get_location_df(): global _LOCATION_DF_CACHE + # transfer_thing is executed in a session-scoped, non-threaded transfer flow. + # Keep a simple module-level cache and avoid lock complexity here. if _LOCATION_DF_CACHE is None: - with _LOCATION_DF_LOCK: - if _LOCATION_DF_CACHE is None: - df = read_csv("Location") - _LOCATION_DF_CACHE = replace_nans(df) + df = read_csv("Location") + _LOCATION_DF_CACHE = replace_nans(df) return _LOCATION_DF_CACHE From 782477977828bc3879c57f238db23f5a24784acc Mon Sep 17 00:00:00 2001 From: jakeross Date: Mon, 23 Feb 2026 15:15:49 -0700 Subject: [PATCH 14/14] feat: add well smoke test command and enhance contact handling with missing value checks --- .gitignore | 1 + cli/cli.py | 106 ++ core/lexicon.json | 7 + transfers/contact_transfer.py | 133 +- .../data/owners_organization_mapper.json | 3 +- transfers/smoke_test.py | 1094 +++++++++++++++++ transfers/waterlevels_transfer.py | 109 +- 7 files changed, 1374 insertions(+), 79 deletions(-) create mode 100644 transfers/smoke_test.py diff --git a/.gitignore b/.gitignore index 197d0355..9d9c353e 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,7 @@ transfers/logs/* run_bdd-local.sh .pre-commit-config.local.yaml .serena/ +cli/logs # deployment files app.yaml diff --git a/cli/cli.py b/cli/cli.py index cb29338e..ae54ab42 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -44,6 +44,11 @@ class ThemeMode(str, Enum): dark = "dark" +class SmokePopulation(str, Enum): + all = "all" + agreed = "agreed" + + def _resolve_theme(theme: ThemeMode) -> ThemeMode: if theme != ThemeMode.auto: return theme @@ -278,6 +283,107 @@ def compare_duplicated_welldata( ) +@cli.command("well-smoke-test") +def well_smoke_test( + sample_size: int = typer.Option( + 25, + "--sample-size", + min=1, + help="Number of wells to sample.", + ), + population: SmokePopulation = typer.Option( + SmokePopulation.agreed, + "--population", + help="Sample from all wells or transfer-agreed wells.", + ), + all_wells: bool = typer.Option( + False, + "--all-wells/--sampled", + help="Check all wells in the selected population instead of sampling.", + ), + seed: int = typer.Option( + 42, + "--seed", + help="Random seed for deterministic sampling.", + ), + detail_path: Path = typer.Option( + Path("transfers") / "metrics" / "well_smoke_test_detail.csv", + "--detail-path", + help="Output CSV path for per-well per-entity smoke-test rows.", + ), + summary_path: Path = typer.Option( + Path("transfers") / "metrics" / "well_smoke_test_summary.json", + "--summary-path", + help="Output JSON path for smoke-test summary.", + ), + fail_on_mismatch: bool = typer.Option( + False, + "--fail-on-mismatch/--no-fail-on-mismatch", + help="Exit with code 1 if any mismatches are found.", + ), + theme: ThemeMode = typer.Option( + ThemeMode.auto, "--theme", help="Color theme: auto, light, dark." + ), +): + from transfers.smoke_test import ( + SmokePopulation as SmokePopulationModel, + run_well_smoke_test, + write_smoke_outputs, + ) + + payload = run_well_smoke_test( + sample_size=sample_size, + population=SmokePopulationModel(population.value), + seed=seed, + all_wells=all_wells, + ) + write_smoke_outputs(payload, detail_path=detail_path, summary_path=summary_path) + + sampled_wells = payload.get("sampled_wells", 0) + mismatch_count = payload.get("mismatch_count", 0) + value_mismatch_count = payload.get("value_mismatch_count", 0) + fail_count = payload.get("well_fail_count", 0) + typer.echo( + f"Smoke test complete: sampled_wells={sampled_wells}, " + f"presence_mismatches={mismatch_count}, " + f"value_mismatches={value_mismatch_count}, " + f"failed_wells={fail_count}" + ) + typer.echo(f"Wrote detail: {detail_path}") + typer.echo(f"Wrote summary: {summary_path}") + + if mismatch_count or value_mismatch_count: + failed_wells = payload.get("failed_wells", [])[:20] + typer.echo(f"Sample failed wells (up to 20): {failed_wells}") + + if value_mismatch_count: + entity_results = payload.get("entity_results", []) + value_mismatches = [ + r + for r in entity_results + if r.get("value_status") not in {"MATCH", "NOT_APPLICABLE"} + ] + typer.echo("\nValue mismatches:") + for row in value_mismatches[:100]: + pointid = row.get("pointid") + entity = row.get("entity") + status = row.get("value_status") + missing = row.get("missing_value_sample") or [] + extra = row.get("extra_value_sample") or [] + typer.echo( + f"- {pointid} | {entity} | {status} | " + f"missing={missing[:3]} | extra={extra[:3]}" + ) + if len(value_mismatches) > 100: + typer.echo( + f"... truncated {len(value_mismatches) - 100} additional value mismatches" + ) + + if mismatch_count or value_mismatch_count: + if fail_on_mismatch: + raise typer.Exit(code=1) + + @cli.command("well-inventory-csv") def well_inventory_csv( file_path: str = typer.Argument( diff --git a/core/lexicon.json b/core/lexicon.json index 07b32c30..2f325282 100644 --- a/core/lexicon.json +++ b/core/lexicon.json @@ -3703,6 +3703,13 @@ "term": "Commonwealth Conservancy", "definition": "Commonwealth Conservancy" }, + { + "categories": [ + "organization" + ], + "term": "Costilla MDWCA", + "definition": "Costilla MDWCA" + }, { "categories": [ "organization" diff --git a/transfers/contact_transfer.py b/transfers/contact_transfer.py index 1e99d88b..4167eec2 100644 --- a/transfers/contact_transfer.py +++ b/transfers/contact_transfer.py @@ -330,9 +330,6 @@ def _add_first_contact( contact_by_name_org, ) - if not new: - return None - if row.Email: raw_email = str(row.Email).strip() if _looks_like_phone_in_email_field(raw_email): @@ -349,9 +346,9 @@ def _add_first_contact( ) if phone: if complete: - contact.phones.append(phone) + _append_phone_if_missing(contact, phone) else: - contact.incomplete_nma_phones.append(phone) + _append_incomplete_phone_if_missing(contact, phone) else: email = _make_email( "first", @@ -361,7 +358,7 @@ def _add_first_contact( release_status=release_status, ) if email: - contact.emails.append(email) + _append_email_if_missing(contact, email) if row.Phone: phone, complete = _make_phone( @@ -373,9 +370,9 @@ def _add_first_contact( ) if phone: if complete: - contact.phones.append(phone) + _append_phone_if_missing(contact, phone) else: - contact.incomplete_nma_phones.append(phone) + _append_incomplete_phone_if_missing(contact, phone) if row.CellPhone: phone, complete = _make_phone( @@ -387,9 +384,9 @@ def _add_first_contact( ) if phone: if complete: - contact.phones.append(phone) + _append_phone_if_missing(contact, phone) else: - contact.incomplete_nma_phones.append(phone) + _append_incomplete_phone_if_missing(contact, phone) if row.MailingAddress: address = _make_address( @@ -404,7 +401,7 @@ def _add_first_contact( release_status=release_status, ) if address: - contact.addresses.append(address) + _append_address_if_missing(contact, address) if row.PhysicalAddress: address = _make_address( @@ -419,9 +416,9 @@ def _add_first_contact( release_status=release_status, ) if address: - contact.addresses.append(address) + _append_address_if_missing(contact, address) - return contact + return contact if new else None def _safe_make_name( @@ -452,7 +449,7 @@ def _add_second_contact( added: set[tuple[str | None, str | None]], contact_by_owner_type: dict[tuple[str, str], Contact], contact_by_name_org: dict[tuple[str | None, str | None], Contact], -) -> None: +) -> Contact | None: if all( [ getattr(row, f"Second{f}") is None @@ -492,9 +489,6 @@ def _add_second_contact( contact_by_owner_type, contact_by_name_org, ) - if not new: - return - if row.SecondCtctEmail: raw_email = str(row.SecondCtctEmail).strip() if _looks_like_phone_in_email_field(raw_email): @@ -511,9 +505,9 @@ def _add_second_contact( ) if phone: if complete: - contact.phones.append(phone) + _append_phone_if_missing(contact, phone) else: - contact.incomplete_nma_phones.append(phone) + _append_incomplete_phone_if_missing(contact, phone) else: email = _make_email( "second", @@ -523,7 +517,7 @@ def _add_second_contact( release_status=release_status, ) if email: - contact.emails.append(email) + _append_email_if_missing(contact, email) if row.SecondCtctPhone: phone, complete = _make_phone( @@ -535,9 +529,11 @@ def _add_second_contact( ) if phone: if complete: - contact.phones.append(phone) + _append_phone_if_missing(contact, phone) else: - contact.incomplete_nma_phones.append(phone) + _append_incomplete_phone_if_missing(contact, phone) + + return contact if new else None # helpers @@ -633,6 +629,68 @@ def _make_address(first_second: str, ownerkey: str, kind: str, **kw) -> Address ) +def _norm_text(value) -> str: + return str(value).strip().casefold() if value is not None else "" + + +def _phone_digits(value) -> str: + if value is None: + return "" + return re.sub(r"\D", "", str(value)) + + +def _append_email_if_missing(contact: Contact, email: Email) -> None: + new_key = (_norm_text(email.email), _norm_text(email.email_type)) + existing = { + (_norm_text(e.email), _norm_text(e.email_type)) for e in (contact.emails or []) + } + if new_key not in existing: + contact.emails.append(email) + + +def _append_phone_if_missing(contact: Contact, phone: Phone) -> None: + new_key = (_phone_digits(phone.phone_number), _norm_text(phone.phone_type)) + existing = { + (_phone_digits(p.phone_number), _norm_text(p.phone_type)) + for p in (contact.phones or []) + } + if new_key not in existing: + contact.phones.append(phone) + + +def _append_incomplete_phone_if_missing( + contact: Contact, phone: IncompleteNMAPhone +) -> None: + new_key = _phone_digits(phone.phone_number) + existing = { + _phone_digits(p.phone_number) for p in (contact.incomplete_nma_phones or []) + } + if new_key not in existing: + contact.incomplete_nma_phones.append(phone) + + +def _append_address_if_missing(contact: Contact, address: Address) -> None: + new_key = ( + _norm_text(address.address_line_1), + _norm_text(address.city), + _norm_text(address.state), + _norm_text(address.postal_code), + _norm_text(address.address_type), + ) + existing = { + ( + _norm_text(a.address_line_1), + _norm_text(a.city), + _norm_text(a.state), + _norm_text(a.postal_code), + _norm_text(a.address_type), + ) + for a in (contact.addresses or []) + } + if new_key not in existing: + contact.addresses.append(address) + + def _make_contact_and_assoc( session: Session, data: dict, @@ -646,13 +704,17 @@ def _make_contact_and_assoc( owner_key = data.get("nma_pk_owners") contact_type = data.get("contact_type") + organization = data.get("organization") + # Prefer owner-key/type identity. Allow name/org reuse when organization is + # present (stable identity) or when owner key is unavailable. + allow_name_org_fallback = (not bool(owner_key)) or bool(organization) if owner_key and contact_type: contact = contact_by_owner_type.get((owner_key, contact_type)) if contact is not None: new_contact = False name_org_key = (data["name"], data["organization"]) - if contact is None and name_org_key in added: + if contact is None and allow_name_org_fallback: contact = contact_by_name_org.get(name_org_key) if contact is not None: new_contact = False @@ -664,15 +726,28 @@ def _make_contact_and_assoc( contact_data = contact.model_dump(exclude=["thing_id", "notes"]) contact = Contact(**contact_data) session.add(contact) - if owner_key and contact_type: - contact_by_owner_type[(owner_key, contact_type)] = contact contact_by_name_org[name_org_key] = contact added.add(name_org_key) - assoc = ThingContactAssociation() - assoc.thing = thing - assoc.contact = contact - session.add(assoc) + if owner_key and contact_type: + contact_by_owner_type[(owner_key, contact_type)] = contact + + assoc_exists = False + if contact.id is not None: + assoc_exists = ( + session.query(ThingContactAssociation.id) + .filter( + ThingContactAssociation.thing_id == thing.id, + ThingContactAssociation.contact_id == contact.id, + ) + .first() + is not None + ) + if not assoc_exists: + assoc = ThingContactAssociation() + assoc.thing = thing + assoc.contact = contact + session.add(assoc) return contact, new_contact diff --git a/transfers/data/owners_organization_mapper.json b/transfers/data/owners_organization_mapper.json index b10f5da0..674bf154 100644 --- a/transfers/data/owners_organization_mapper.json +++ b/transfers/data/owners_organization_mapper.json @@ -51,6 +51,7 @@ "City of Truth or Consequences, WWTP": "City of Truth or Consequences, WWTP", "Cloud Country West Subdivision": "Cloud Country West Subdivision", "Commonwealth Conservancy": "Commonwealth Conservancy", + "Costilla MDWCA": "Costilla MDWCA", "Cottonwood Rural Water Assn.": "Cottonwood RWA", "Country Club Garden MHP": "Country Club Garden Mobile Home Park", "Coyote Creek MDWUA": "Coyote Creek MDWUA", @@ -235,4 +236,4 @@ "Winter Brothers/U.S. Government": "Winter Brothers", "Yates Petroleum": "Yates Petroleum Corporation", "Zamora Accounting Services": "Zamora Accounting Services" -} \ No newline at end of file +} diff --git a/transfers/smoke_test.py b/transfers/smoke_test.py new file mode 100644 index 00000000..09a45ff3 --- /dev/null +++ b/transfers/smoke_test.py @@ -0,0 +1,1094 @@ +from __future__ import annotations + +import json +import random +import re +from collections import defaultdict +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Any + +import pandas as pd +from sqlalchemy import func, select + +from core.enums import Organization +from db import ( + Address, + Contact, + Deployment, + Email, + IncompleteNMAPhone, + Observation, + Phone, + Sensor, + Thing, + ThingContactAssociation, + WellScreen, +) +from db.engine import session_ctx +from db.field import FieldActivity, FieldEvent +from db.sample import Sample +from transfers.contact_transfer import _select_ownerkey_col +from transfers.sensor_transfer import EQUIPMENT_TO_SENSOR_TYPE_MAP +from transfers.util import ( + SensorParameterEstimator, + filter_by_valid_measuring_agency, + get_transfers_data_path, + get_transferable_wells, + read_csv, + replace_nans, +) + + +class SmokePopulation(str, Enum): + all = "all" + agreed = "agreed" + + +class EntityStatus(str, Enum): + present_in_both = "PRESENT_IN_BOTH" + absent_in_both = "ABSENT_IN_BOTH" + missing_in_destination = "MISSING_IN_DESTINATION" + extra_in_destination = "EXTRA_IN_DESTINATION" + + +class ValueStatus(str, Enum): + match = "MATCH" + missing_in_destination = "MISSING_IN_DESTINATION" + extra_in_destination = "EXTRA_IN_DESTINATION" + both_missing_and_extra = "BOTH_MISSING_AND_EXTRA" + not_applicable = "NOT_APPLICABLE" + + +@dataclass +class SmokeResult: + pointid: str + entity: str + source_count: int + destination_count: int + status: EntityStatus + value_status: ValueStatus + missing_value_sample: list[str] + extra_value_sample: list[str] + + @property + def passed(self) -> bool: + return self.status in { + EntityStatus.present_in_both, + EntityStatus.absent_in_both, + } + + +def _normalize_text(value: Any) -> str: + if value is None: + return "" + try: + if pd.isna(value): + return "" + except TypeError: + pass + return str(value).strip() + + +def _has_text(value: Any) -> bool: + return bool(_normalize_text(value)) + + +def _looks_like_phone(value: Any) -> bool: + text = _normalize_text(value) + if not text or "@" in text: + return False + if not re.fullmatch(r"[\d\s().+\-]+", text): + return False + digits = re.sub(r"\D", "", text) + return len(digits) >= 7 + + +def _normalize_email(raw: Any) -> str: + text = _normalize_text(raw) + if not text: + return "" + text = re.sub(r"^\s*email\s*:\s*", "", text, flags=re.IGNORECASE) + text = re.sub(r"[.,;:]+$", "", text) + return text.strip() + + +def _normalize_number(value: Any) -> str: + text = _normalize_text(value) + if not text: + return "" + try: + return f"{float(text):.6f}" + except ValueError: + return text.lower() + + +def _normalize_contact_name(value: Any) -> str: + text = _normalize_text(value) + if not text: + return "" + # Transfer may preserve errant multiple spaces from source; compare normalized. + return re.sub(r"\s+", " ", text).strip().lower() + + +def _normalize_phone(raw: Any) -> str: + text = _normalize_text(raw) + if not text: + return "" + digits = re.sub(r"\D", "", text) + # Treat US country-code-prefixed values as equivalent (1XXXXXXXXXX == XXXXXXXXXX). + if len(digits) == 11 and digits.startswith("1"): + return digits[1:] + return digits + + +def _parse_legacy_datetime_date(value: Any) -> str | None: + if value is None: + return None + try: + if pd.isna(value): + return None + except TypeError: + pass + text = str(value).strip() + if not text: + return None + try: + return pd.to_datetime(text, format="%Y-%m-%d %H:%M:%S.%f").date().isoformat() + except (TypeError, ValueError): + return None + + +def _normalize_date_like(value: Any) -> str: + if value is None: + return "" + try: + if pd.isna(value): + return "" + except TypeError: + pass + dt = pd.to_datetime(value, errors="coerce") + if pd.isna(dt): + return "" + return dt.date().isoformat() + + +def _load_owner_org_mapper() -> dict[str, str]: + try: + mapper_path = get_transfers_data_path("owners_organization_mapper.json") + with open(mapper_path, "r", encoding="utf-8") as f: + return json.load(f) + except Exception: + return {} + + +def _load_ownerkey_mapper() -> dict[str, str]: + try: + mapper_path = get_transfers_data_path("owners_ownerkey_mapper.json") + with open(mapper_path, "r", encoding="utf-8") as f: + return json.load(f) + except Exception: + return {} + + +def _normalize_source_organization(raw_company: Any, mapper: dict[str, str]) -> str: + company = _normalize_text(raw_company) + if not company: + return "" + organization = mapper.get(company, company) + try: + Organization(organization) + except ValueError: + return "" + return _normalize_text(organization) + + +def _load_well_population(population: SmokePopulation) -> pd.DataFrame: + wdf = read_csv("WellData", dtype={"OSEWelltagID": str}) + ldf = read_csv("Location") + ldf = ldf.drop(["PointID", "SSMA_TimeStamp"], axis=1, errors="ignore") + df = wdf.join(ldf.set_index("LocationId"), on="LocationId") + df = df[df["SiteType"] == "GW"] + df = df[df["Easting"].notna() & df["Northing"].notna()] + df = replace_nans(df) + + if population == SmokePopulation.agreed: + df = get_transferable_wells(df) + + # Match current WellTransferer duplicate handling (skip every duplicate PointID). + dupes = df["PointID"].duplicated(keep=False) + if dupes.any(): + dup_ids = set(df.loc[dupes, "PointID"]) + df = df[~df["PointID"].isin(dup_ids)] + + return df + + +def _sample_pointids( + df: pd.DataFrame, sample_size: int, seed: int, all_wells: bool = False +) -> list[str]: + pointids = sorted( + {_normalize_text(v) for v in df["PointID"].tolist() if _has_text(v)} + ) + if not pointids: + return [] + if all_wells: + return pointids + + n = min(sample_size, len(pointids)) + rng = random.Random(seed) + return sorted(rng.sample(pointids, n)) + + +def _count_by_pointid( + df: pd.DataFrame, pointid_col: str, pointids: list[str] +) -> dict[str, int]: + if df.empty or pointid_col not in df.columns: + return {pid: 0 for pid in pointids} + sub = df[df[pointid_col].isin(pointids)] + if sub.empty: + return {pid: 0 for pid in pointids} + + counts = sub.groupby(pointid_col).size().to_dict() + return {pid: int(counts.get(pid, 0)) for pid in pointids} + + +def _source_entity_counts( + pointids: list[str], well_df: pd.DataFrame +) -> dict[str, dict[str, int]]: + counts = { + "thing": _count_by_pointid(well_df, "PointID", pointids), + } + + ws = replace_nans(read_csv("WellScreens")) + counts["wellscreens"] = _count_by_pointid(ws, "PointID", pointids) + + wl = replace_nans(read_csv("WaterLevels")) + wl = filter_by_valid_measuring_agency(wl) + counts["waterlevel_observations"] = _count_by_pointid(wl, "PointID", pointids) + + eq = read_csv("Equipment") + eq.columns = eq.columns.str.replace(" ", "_") + if "SerialNo" in eq.columns: + eq = eq[eq["SerialNo"].notna()] + else: + eq = eq.iloc[0:0] + eq = replace_nans(eq) + counts["deployments"] = _count_by_pointid(eq, "PointID", pointids) + + # Owners/contact graph counts. + odf = read_csv("OwnersData") + odf = odf.drop(["OBJECTID", "GlobalID"], axis=1, errors="ignore") + + ldf = read_csv("OwnerLink") + ldf = ldf.drop(["OBJECTID", "GlobalID"], axis=1, errors="ignore") + locdf = read_csv("Location") + ldf = ldf.join(locdf.set_index("LocationId"), on="LocationId") + + owner_key_col = _select_ownerkey_col(odf, "OwnersData") + link_owner_key_col = _select_ownerkey_col(ldf, "OwnerLink") + + odf["ownerkey_norm"] = ( + odf[owner_key_col] + .fillna("") + .astype(str) + .str.strip() + .str.casefold() + .replace({"": pd.NA}) + ) + ldf["ownerkey_norm"] = ( + ldf[link_owner_key_col] + .fillna("") + .astype(str) + .str.strip() + .str.casefold() + .replace({"": pd.NA}) + ) + + ldf_join = ldf.set_index("ownerkey_norm")[["PointID"]] + owners = odf.join(ldf_join, on="ownerkey_norm") + owners = replace_nans(owners) + owners = owners[owners["PointID"].isin(pointids)] + + contact_counts = defaultdict(int) + phone_counts = defaultdict(int) + email_counts = defaultdict(int) + address_counts = defaultdict(int) + + for row in owners.itertuples(index=False): + pid = _normalize_text(getattr(row, "PointID", None)) + if not pid: + continue + + contact_counts[pid] += 1 + + primary_phone = getattr(row, "Phone", None) + cell_phone = getattr(row, "CellPhone", None) + secondary_phone = getattr(row, "SecondCtctPhone", None) + for phone_value in (primary_phone, cell_phone, secondary_phone): + if _has_text(phone_value): + phone_counts[pid] += 1 + + for email_value in ( + getattr(row, "Email", None), + getattr(row, "SecondCtctEmail", None), + ): + normalized = _normalize_email(email_value) + if not normalized: + continue + if _looks_like_phone(normalized): + phone_counts[pid] += 1 + else: + email_counts[pid] += 1 + + if _has_text(getattr(row, "MailingAddress", None)): + address_counts[pid] += 1 + if _has_text(getattr(row, "PhysicalAddress", None)): + address_counts[pid] += 1 + + counts["contacts"] = {pid: int(contact_counts.get(pid, 0)) for pid in pointids} + counts["contact_phones"] = {pid: int(phone_counts.get(pid, 0)) for pid in pointids} + counts["contact_emails"] = {pid: int(email_counts.get(pid, 0)) for pid in pointids} + counts["contact_addresses"] = { + pid: int(address_counts.get(pid, 0)) for pid in pointids + } + + return counts + + +def _blank_signature_map(pointids: list[str]) -> dict[str, set[str]]: + return {pid: set() for pid in pointids} + + +def _source_entity_signatures( + pointids: list[str], well_df: pd.DataFrame +) -> dict[str, dict[str, set[str]]]: + owner_org_mapper = _load_owner_org_mapper() + ownerkey_mapper = _load_ownerkey_mapper() + signatures = { + "thing": _blank_signature_map(pointids), + "wellscreens": _blank_signature_map(pointids), + "contacts": _blank_signature_map(pointids), + "contact_phones": _blank_signature_map(pointids), + "contact_emails": _blank_signature_map(pointids), + "contact_addresses": _blank_signature_map(pointids), + "waterlevel_observations": _blank_signature_map(pointids), + "deployments": _blank_signature_map(pointids), + } + + # Well core fields from WellData. + for row in well_df[well_df["PointID"].isin(pointids)].itertuples(index=False): + pid = _normalize_text(getattr(row, "PointID", None)) + if not pid: + continue + sig = "|".join( + [ + _normalize_number(getattr(row, "WellDepth", None)), + _normalize_number(getattr(row, "HoleDepth", None)), + _normalize_text(getattr(row, "FormationZone", None)).upper(), + ] + ) + signatures["thing"][pid].add(sig) + + # Well screens. + ws = replace_nans(read_csv("WellScreens")) + ws = ws[ws["PointID"].isin(pointids)] + for row in ws.itertuples(index=False): + pid = _normalize_text(getattr(row, "PointID", None)) + if not pid: + continue + top = getattr(row, "ScreenTop", None) + bottom = getattr(row, "ScreenBottom", None) + stype = getattr(row, "ScreenType", None) + sig = "|".join( + [ + _normalize_number(top), + _normalize_number(bottom), + _normalize_text(stype).lower(), + ] + ) + signatures["wellscreens"][pid].add(sig) + + # Deployments from Equipment. + eq = read_csv("Equipment") + eq.columns = eq.columns.str.replace(" ", "_") + if "SerialNo" in eq.columns: + eq = eq[eq["SerialNo"].notna()] + else: + eq = eq.iloc[0:0] + eq = replace_nans(eq) + eq = eq[eq["PointID"].isin(pointids)] + estimators: dict[str, SensorParameterEstimator] = {} + for row in eq.itertuples(index=False): + pid = _normalize_text(getattr(row, "PointID", None)) + if not pid: + continue + installed = _parse_legacy_datetime_date(getattr(row, "DateInstalled", None)) + if installed is None: + equipment_type = getattr(row, "EquipmentType", None) + sensor_type = EQUIPMENT_TO_SENSOR_TYPE_MAP.get(equipment_type) + if sensor_type: + estimator = estimators.get(sensor_type) + if estimator is None: + estimator = SensorParameterEstimator(sensor_type) + estimators[sensor_type] = estimator + installed = _normalize_date_like( + estimator.estimate_installation_date(row) + ) + else: + installed = "" + removed = _parse_legacy_datetime_date(getattr(row, "DateRemoved", None)) or "" + sig = "|".join( + [ + _normalize_text(getattr(row, "SerialNo", None)).lower(), + installed, + removed, + ] + ) + signatures["deployments"][pid].add(sig) + + # Owners/contact graph signatures. + odf = read_csv("OwnersData") + odf = odf.drop(["OBJECTID", "GlobalID"], axis=1, errors="ignore") + ldf = read_csv("OwnerLink") + ldf = ldf.drop(["OBJECTID", "GlobalID"], axis=1, errors="ignore") + locdf = read_csv("Location") + ldf = ldf.join(locdf.set_index("LocationId"), on="LocationId") + + owner_key_col = _select_ownerkey_col(odf, "OwnersData") + link_owner_key_col = _select_ownerkey_col(ldf, "OwnerLink") + odf["ownerkey_canonical"] = odf[owner_key_col].replace(ownerkey_mapper) + ldf["ownerkey_canonical"] = ldf[link_owner_key_col].replace(ownerkey_mapper) + odf["ownerkey_norm"] = ( + odf["ownerkey_canonical"] + .fillna("") + .astype(str) + .str.strip() + .str.casefold() + .replace({"": pd.NA}) + ) + ldf["ownerkey_norm"] = ( + ldf["ownerkey_canonical"] + .fillna("") + .astype(str) + .str.strip() + .str.casefold() + .replace({"": pd.NA}) + ) + owners = replace_nans( + odf.join(ldf.set_index("ownerkey_norm")[["PointID"]], on="ownerkey_norm") + ) + owners = owners[owners["PointID"].notna()] + owners = owners.sort_values(by=["PointID"]) + + ContactIdentity = tuple[str | None, str | None, str] + contact_by_owner_type: dict[tuple[str, str], int] = {} + contact_by_name_org: dict[tuple[str | None, str | None], int] = {} + contact_store: dict[int, dict[str, Any]] = {} + pid_to_contact_ids: dict[str, set[int]] = defaultdict(set) + next_contact_id = 1 + + def _make_name(first: Any, last: Any) -> str | None: + f = _normalize_text(first) + l = _normalize_text(last) + if not f and not l: + return None + if f and not l: + return f + if not f and l: + return l + return f"{f} {l}" + + def _safe_make_name( + first: Any, + last: Any, + owner_key: str | None, + organization: str | None, + fallback_suffix: str | None, + ) -> str | None: + name = _make_name(first, last) + if name is None and not organization: + fallback = _normalize_text(owner_key) or None + if fallback and fallback_suffix: + fallback = f"{fallback}-{fallback_suffix}" + return fallback + return name + + def _resolve_contact( + owner_key: str | None, + contact_type: str, + name: str | None, + organization: str | None, + ) -> tuple[int | None, bool]: + nonlocal next_contact_id + key_owner = ( + (_normalize_text(owner_key), contact_type) + if _normalize_text(owner_key) + else None + ) + key_name_org = (name, organization) + allow_name_org_fallback = (not _normalize_text(owner_key)) or bool(organization) + + if key_owner and key_owner in contact_by_owner_type: + return contact_by_owner_type[key_owner], False + + if allow_name_org_fallback and key_name_org in contact_by_name_org: + contact_id = contact_by_name_org[key_name_org] + if key_owner: + contact_by_owner_type[key_owner] = contact_id + return contact_id, False + + if not name and not organization: + return None, False + + contact_id = next_contact_id + next_contact_id += 1 + contact_store[contact_id] = { + "name": name, + "organization": organization, + "contact_type": contact_type, + "phones": set(), + "emails": set(), + "addresses": set(), + } + contact_by_name_org[key_name_org] = contact_id + if key_owner: + contact_by_owner_type[key_owner] = contact_id + return contact_id, True + + for row in owners.itertuples(index=False): + pid = _normalize_text(getattr(row, "PointID", None)) + if not pid: + continue + + owner_key = _normalize_text(getattr(row, "OwnerKey", None)) or None + has_secondary_info = any( + _has_text(getattr(row, field, None)) + for field in ( + "SecondFirstName", + "SecondLastName", + "SecondCtctEmail", + "SecondCtctPhone", + ) + ) + company = _normalize_source_organization( + getattr(row, "Company", None), owner_org_mapper + ) + company = company or None + + primary_name = _safe_make_name( + getattr(row, "FirstName", None), + getattr(row, "LastName", None), + owner_key, + company, + "primary", + ) + primary_contact, primary_new = _resolve_contact( + owner_key, "Primary", primary_name, company + ) + if primary_contact: + pid_to_contact_ids[pid].add(primary_contact) + if primary_contact: + c = contact_store[primary_contact] + for phone_value in ( + getattr(row, "Phone", None), + getattr(row, "CellPhone", None), + ): + pn = _normalize_phone(phone_value) + if pn: + c["phones"].add(pn) + + em = _normalize_email(getattr(row, "Email", None)).lower() + if em: + if _looks_like_phone(em): + pn = _normalize_phone(em) + if pn: + c["phones"].add(pn) + else: + c["emails"].add(em) + + for prefix in ("Mail", "Physical"): + line1 = _normalize_text( + getattr( + row, + ( + f"{prefix}ingAddress" + if prefix == "Mail" + else "PhysicalAddress" + ), + None, + ) + ) + city = _normalize_text(getattr(row, f"{prefix}City", None)) + state = _normalize_text(getattr(row, f"{prefix}State", None)) + zipc = _normalize_text(getattr(row, f"{prefix}ZipCode", None)) + if line1: + c["addresses"].add( + f"{line1.lower()}|{city.lower()}|{state.lower()}|{zipc.lower()}" + ) + + if has_secondary_info: + secondary_name = _safe_make_name( + getattr(row, "SecondFirstName", None), + getattr(row, "SecondLastName", None), + owner_key, + company, + "secondary", + ) + secondary_contact, secondary_new = _resolve_contact( + owner_key, "Secondary", secondary_name, company + ) + if secondary_contact: + pid_to_contact_ids[pid].add(secondary_contact) + if secondary_contact: + c = contact_store[secondary_contact] + pn = _normalize_phone(getattr(row, "SecondCtctPhone", None)) + if pn: + c["phones"].add(pn) + + em = _normalize_email(getattr(row, "SecondCtctEmail", None)).lower() + if em: + if _looks_like_phone(em): + pn = _normalize_phone(em) + if pn: + c["phones"].add(pn) + else: + c["emails"].add(em) + + for pid in pointids: + for contact_id in pid_to_contact_ids.get(pid, set()): + c = contact_store.get(contact_id) + if not c: + continue + signatures["contacts"][pid].add( + f"{_normalize_text(c.get('contact_type')).lower()}|{_normalize_contact_name(c.get('name'))}|{_normalize_text(c.get('organization')).lower()}" + ) + for pn in c.get("phones", set()): + signatures["contact_phones"][pid].add(pn) + for em in c.get("emails", set()): + signatures["contact_emails"][pid].add(em) + for addr in c.get("addresses", set()): + signatures["contact_addresses"][pid].add(addr) + + return signatures + + +def _rows_to_count_dict( + rows: list[tuple[str, int]], pointids: list[str] +) -> dict[str, int]: + lut = {pid: 0 for pid in pointids} + for pid, n in rows: + if pid in lut: + lut[pid] = int(n) + return lut + + +def _destination_entity_counts(pointids: list[str]) -> dict[str, dict[str, int]]: + if not pointids: + return { + "thing": {}, + "wellscreens": {}, + "contacts": {}, + "contact_phones": {}, + "contact_emails": {}, + "contact_addresses": {}, + "waterlevel_observations": {}, + "deployments": {}, + } + + with session_ctx() as session: + thing_rows = session.execute( + select(Thing.name, func.count(Thing.id)) + .where(Thing.name.in_(pointids)) + .where(Thing.thing_type == "water well") + .group_by(Thing.name) + ).all() + + screen_rows = session.execute( + select(Thing.name, func.count(WellScreen.id)) + .join(WellScreen, WellScreen.thing_id == Thing.id) + .where(Thing.name.in_(pointids)) + .group_by(Thing.name) + ).all() + + contact_rows = session.execute( + select(Thing.name, func.count(ThingContactAssociation.id)) + .join(ThingContactAssociation, ThingContactAssociation.thing_id == Thing.id) + .where(Thing.name.in_(pointids)) + .group_by(Thing.name) + ).all() + + phone_rows = session.execute( + select(Thing.name, func.count(Phone.id)) + .join(ThingContactAssociation, ThingContactAssociation.thing_id == Thing.id) + .join(Contact, Contact.id == ThingContactAssociation.contact_id) + .join(Phone, Phone.contact_id == Contact.id) + .where(Thing.name.in_(pointids)) + .group_by(Thing.name) + ).all() + incomplete_phone_rows = session.execute( + select(Thing.name, func.count(IncompleteNMAPhone.id)) + .join(ThingContactAssociation, ThingContactAssociation.thing_id == Thing.id) + .join(Contact, Contact.id == ThingContactAssociation.contact_id) + .join(IncompleteNMAPhone, IncompleteNMAPhone.contact_id == Contact.id) + .where(Thing.name.in_(pointids)) + .group_by(Thing.name) + ).all() + + email_rows = session.execute( + select(Thing.name, func.count(Email.id)) + .join(ThingContactAssociation, ThingContactAssociation.thing_id == Thing.id) + .join(Contact, Contact.id == ThingContactAssociation.contact_id) + .join(Email, Email.contact_id == Contact.id) + .where(Thing.name.in_(pointids)) + .group_by(Thing.name) + ).all() + + address_rows = session.execute( + select(Thing.name, func.count(Address.id)) + .join(ThingContactAssociation, ThingContactAssociation.thing_id == Thing.id) + .join(Contact, Contact.id == ThingContactAssociation.contact_id) + .join(Address, Address.contact_id == Contact.id) + .where(Thing.name.in_(pointids)) + .group_by(Thing.name) + ).all() + + deployment_rows = session.execute( + select(Thing.name, func.count(Deployment.id)) + .join(Deployment, Deployment.thing_id == Thing.id) + .where(Thing.name.in_(pointids)) + .group_by(Thing.name) + ).all() + + waterlevel_obs_rows = session.execute( + select(Thing.name, func.count(Observation.id)) + .join(FieldEvent, FieldEvent.thing_id == Thing.id) + .join(FieldActivity, FieldActivity.field_event_id == FieldEvent.id) + .join(Sample, Sample.field_activity_id == FieldActivity.id) + .join(Observation, Observation.sample_id == Sample.id) + .where(Thing.name.in_(pointids)) + .where(Sample.nma_pk_waterlevels.is_not(None)) + .group_by(Thing.name) + ).all() + + results = { + "thing": _rows_to_count_dict(thing_rows, pointids), + "wellscreens": _rows_to_count_dict(screen_rows, pointids), + "contacts": _rows_to_count_dict(contact_rows, pointids), + "contact_phones": _rows_to_count_dict(phone_rows, pointids), + "contact_emails": _rows_to_count_dict(email_rows, pointids), + "contact_addresses": _rows_to_count_dict(address_rows, pointids), + "waterlevel_observations": _rows_to_count_dict(waterlevel_obs_rows, pointids), + "deployments": _rows_to_count_dict(deployment_rows, pointids), + } + incomplete_phone_counts = _rows_to_count_dict(incomplete_phone_rows, pointids) + for pid in pointids: + results["contact_phones"][pid] = int( + results["contact_phones"].get(pid, 0) + ) + int(incomplete_phone_counts.get(pid, 0)) + return results + + +def _destination_entity_signatures( + pointids: list[str], +) -> dict[str, dict[str, set[str]]]: + signatures = { + "thing": _blank_signature_map(pointids), + "wellscreens": _blank_signature_map(pointids), + "contacts": _blank_signature_map(pointids), + "contact_phones": _blank_signature_map(pointids), + "contact_emails": _blank_signature_map(pointids), + "contact_addresses": _blank_signature_map(pointids), + "waterlevel_observations": _blank_signature_map(pointids), + "deployments": _blank_signature_map(pointids), + } + if not pointids: + return signatures + + with session_ctx() as session: + thing_rows = session.execute( + select( + Thing.name, Thing.well_depth, Thing.hole_depth, Thing.nma_formation_zone + ) + .where(Thing.name.in_(pointids)) + .where(Thing.thing_type == "water well") + ).all() + for pid, wd, hd, fz in thing_rows: + signatures["thing"][pid].add( + "|".join( + [ + _normalize_number(wd), + _normalize_number(hd), + _normalize_text(fz).upper(), + ] + ) + ) + + ws_rows = session.execute( + select( + Thing.name, + WellScreen.screen_depth_top, + WellScreen.screen_depth_bottom, + WellScreen.screen_type, + ) + .join(WellScreen, WellScreen.thing_id == Thing.id) + .where(Thing.name.in_(pointids)) + ).all() + for pid, top, bottom, stype in ws_rows: + signatures["wellscreens"][pid].add( + "|".join( + [ + _normalize_number(top), + _normalize_number(bottom), + _normalize_text(stype).lower(), + ] + ) + ) + + contact_rows = session.execute( + select(Thing.name, Contact.contact_type, Contact.name, Contact.organization) + .join(ThingContactAssociation, ThingContactAssociation.thing_id == Thing.id) + .join(Contact, Contact.id == ThingContactAssociation.contact_id) + .where(Thing.name.in_(pointids)) + ).all() + for pid, ctype, name, org in contact_rows: + signatures["contacts"][pid].add( + f"{_normalize_text(ctype).lower()}|{_normalize_contact_name(name)}|{_normalize_text(org).lower()}" + ) + + phone_rows = session.execute( + select(Thing.name, Phone.phone_number) + .join(ThingContactAssociation, ThingContactAssociation.thing_id == Thing.id) + .join(Contact, Contact.id == ThingContactAssociation.contact_id) + .join(Phone, Phone.contact_id == Contact.id) + .where(Thing.name.in_(pointids)) + ).all() + for pid, phone in phone_rows: + pn = _normalize_phone(phone) + if pn: + signatures["contact_phones"][pid].add(pn) + incomplete_phone_rows = session.execute( + select(Thing.name, IncompleteNMAPhone.phone_number) + .join(ThingContactAssociation, ThingContactAssociation.thing_id == Thing.id) + .join(Contact, Contact.id == ThingContactAssociation.contact_id) + .join(IncompleteNMAPhone, IncompleteNMAPhone.contact_id == Contact.id) + .where(Thing.name.in_(pointids)) + ).all() + for pid, phone in incomplete_phone_rows: + pn = _normalize_phone(phone) + if pn: + signatures["contact_phones"][pid].add(pn) + + email_rows = session.execute( + select(Thing.name, Email.email) + .join(ThingContactAssociation, ThingContactAssociation.thing_id == Thing.id) + .join(Contact, Contact.id == ThingContactAssociation.contact_id) + .join(Email, Email.contact_id == Contact.id) + .where(Thing.name.in_(pointids)) + ).all() + for pid, email in email_rows: + em = _normalize_email(email).lower() + if em: + signatures["contact_emails"][pid].add(em) + + address_rows = session.execute( + select( + Thing.name, + Address.address_line_1, + Address.city, + Address.state, + Address.postal_code, + ) + .join(ThingContactAssociation, ThingContactAssociation.thing_id == Thing.id) + .join(Contact, Contact.id == ThingContactAssociation.contact_id) + .join(Address, Address.contact_id == Contact.id) + .where(Thing.name.in_(pointids)) + ).all() + for pid, line1, city, state, zipc in address_rows: + if _has_text(line1): + signatures["contact_addresses"][pid].add( + f"{_normalize_text(line1).lower()}|{_normalize_text(city).lower()}|{_normalize_text(state).lower()}|{_normalize_text(zipc).lower()}" + ) + + dep_rows = session.execute( + select( + Thing.name, + Sensor.serial_no, + Deployment.installation_date, + Deployment.removal_date, + ) + .join(Deployment, Deployment.thing_id == Thing.id) + .join(Sensor, Sensor.id == Deployment.sensor_id) + .where(Thing.name.in_(pointids)) + ).all() + for pid, sensor_serial, installed, removed in dep_rows: + signatures["deployments"][pid].add( + "|".join( + [ + _normalize_text(sensor_serial).lower(), + _normalize_text(installed)[:10], + _normalize_text(removed)[:10], + ] + ) + ) + + return signatures + + +def _status(source_count: int, destination_count: int) -> EntityStatus: + src = source_count > 0 + dst = destination_count > 0 + if src and dst: + return EntityStatus.present_in_both + if (not src) and (not dst): + return EntityStatus.absent_in_both + if src and (not dst): + return EntityStatus.missing_in_destination + return EntityStatus.extra_in_destination + + +def _value_status( + source_values: set[str], destination_values: set[str], compare_enabled: bool +) -> tuple[ValueStatus, list[str], list[str]]: + if not compare_enabled: + return ValueStatus.not_applicable, [], [] + + missing = sorted(source_values - destination_values) + extra = sorted(destination_values - source_values) + if not missing and not extra: + return ValueStatus.match, [], [] + if missing and extra: + return ValueStatus.both_missing_and_extra, missing[:5], extra[:5] + if missing: + return ValueStatus.missing_in_destination, missing[:5], [] + return ValueStatus.extra_in_destination, [], extra[:5] + + +def run_well_smoke_test( + sample_size: int, + population: SmokePopulation, + seed: int, + all_wells: bool = False, +) -> dict[str, Any]: + well_df = _load_well_population(population) + pointids = _sample_pointids( + well_df, sample_size=sample_size, seed=seed, all_wells=all_wells + ) + + if not pointids: + return { + "population": population.value, + "seed": seed, + "sample_size": sample_size, + "available_wells": 0, + "sampled_wells": 0, + "entity_results": [], + "mismatch_count": 0, + "well_fail_count": 0, + } + + source = _source_entity_counts(pointids, well_df) + dest = _destination_entity_counts(pointids) + source_values = _source_entity_signatures(pointids, well_df) + dest_values = _destination_entity_signatures(pointids) + + entities = [ + "thing", + "wellscreens", + "contacts", + "contact_phones", + "contact_emails", + "contact_addresses", + "waterlevel_observations", + "deployments", + ] + value_compare_entities = { + "thing", + "wellscreens", + "contacts", + "contact_phones", + "contact_emails", + "contact_addresses", + "deployments", + } + + results: list[SmokeResult] = [] + for pid in pointids: + for entity in entities: + src_values_set = source_values.get(entity, {}).get(pid, set()) + dst_values_set = dest_values.get(entity, {}).get(pid, set()) + src_count = int(source.get(entity, {}).get(pid, 0)) + dst_count = int(dest.get(entity, {}).get(pid, 0)) + # For entities where we compare normalized value sets, use those sets + # for presence status to avoid false count mismatches from contact reuse. + if entity in value_compare_entities: + src_count = len(src_values_set) + dst_count = len(dst_values_set) + vstatus, missing_vals, extra_vals = _value_status( + src_values_set, + dst_values_set, + compare_enabled=entity in value_compare_entities, + ) + results.append( + SmokeResult( + pointid=pid, + entity=entity, + source_count=src_count, + destination_count=dst_count, + status=_status(src_count, dst_count), + value_status=vstatus, + missing_value_sample=missing_vals, + extra_value_sample=extra_vals, + ) + ) + + value_mismatches = [ + r + for r in results + if r.value_status not in {ValueStatus.match, ValueStatus.not_applicable} + ] + mismatches = [r for r in results if not r.passed] + failed_wells = sorted( + {r.pointid for r in mismatches} | {r.pointid for r in value_mismatches} + ) + + payload = { + "population": population.value, + "seed": seed, + "sample_size": sample_size, + "available_wells": int(well_df["PointID"].dropna().nunique()), + "sampled_wells": len(pointids), + "mismatch_count": len(mismatches), + "value_mismatch_count": len(value_mismatches), + "well_fail_count": len(failed_wells), + "failed_wells": failed_wells, + "entity_results": [ + { + "pointid": r.pointid, + "entity": r.entity, + "source_count": r.source_count, + "destination_count": r.destination_count, + "status": r.status.value, + "value_status": r.value_status.value, + "missing_value_sample": r.missing_value_sample, + "extra_value_sample": r.extra_value_sample, + "passed": r.passed, + } + for r in results + ], + } + return payload + + +def write_smoke_outputs( + payload: dict[str, Any], detail_path: Path, summary_path: Path +) -> None: + detail_path.parent.mkdir(parents=True, exist_ok=True) + summary_path.parent.mkdir(parents=True, exist_ok=True) + + rows = payload.get("entity_results", []) + pd.DataFrame(rows).to_csv(detail_path, index=False) + + summary = {k: v for k, v in payload.items() if k not in {"entity_results"}} + summary_path.write_text(json.dumps(summary, indent=2), encoding="utf-8") diff --git a/transfers/waterlevels_transfer.py b/transfers/waterlevels_transfer.py index 261faf53..9c45cf26 100644 --- a/transfers/waterlevels_transfer.py +++ b/transfers/waterlevels_transfer.py @@ -94,7 +94,7 @@ def __init__(self, *args, **kw): with open(path, "r") as f: self._measured_by_mapper = json.load(f) - self._created_contacts = {} + self._created_contact_id_by_key: dict[tuple[str, str], int] = {} self._thing_id_by_pointid: dict[str, int] = {} self._owner_contact_id_by_pointid: dict[str, int] = {} self._build_caches() @@ -206,7 +206,7 @@ def _transfer_hook(self, session: Session) -> None: release_status = "public" if row.PublicRelease else "private" - field_event_participants = self._get_field_event_participants( + field_event_participant_ids = self._get_field_event_participant_ids( session, row ) stats["contacts_created"] += getattr( @@ -216,7 +216,7 @@ def _transfer_hook(self, session: Session) -> None: self, "_last_contacts_reused_count", 0 ) - if not field_event_participants: + if not field_event_participant_ids: stats["rows_missing_participants"] += 1 is_destroyed = ( @@ -236,7 +236,7 @@ def _transfer_hook(self, session: Session) -> None: "dt_utc": dt_utc, "glv": glv, "release_status": release_status, - "participants": field_event_participants, + "participant_ids": field_event_participant_ids, "is_destroyed": is_destroyed, } ) @@ -273,11 +273,13 @@ def _transfer_hook(self, session: Session) -> None: participant_rows: list[dict[str, Any]] = [] lead_row_pos_by_prepared_idx: dict[int, int] = {} for prepared_idx, prep in enumerate(prepared_rows): - for participant_idx, participant in enumerate(prep["participants"]): + for participant_idx, participant_id in enumerate( + prep["participant_ids"] + ): participant_rows.append( { "field_event_id": field_event_ids[prepared_idx], - "contact_id": participant.id, + "contact_id": participant_id, "participant_role": ( "Lead" if participant_idx == 0 else "Participant" ), @@ -578,10 +580,10 @@ def _get_groundwater_level_reason(self, row) -> str: raise ValueError(f"Unknown groundwater level reason: {glv}") return glv - def _get_field_event_participants(self, session, row) -> list[Contact]: + def _get_field_event_participant_ids(self, session, row) -> list[int]: self._last_contacts_created_count = 0 self._last_contacts_reused_count = 0 - field_event_participants = [] + field_event_participant_ids: list[int] = [] measured_by = None if pd.isna(row.MeasuredBy) else row.MeasuredBy if measured_by not in ["Owner", "Owner report", "Well owner"]: @@ -590,35 +592,58 @@ def _get_field_event_participants(self, session, row) -> list[Contact]: contact_info = get_contacts_info( row, measured_by, self._measured_by_mapper ) + contacts_to_create: list[dict[str, Any]] = [] + missing_keys: list[tuple[str, str]] = [] for name, organization, role in contact_info: - if (name, organization) in self._created_contacts: - contact = self._created_contacts[(name, organization)] + key = (name, organization) + contact_id = self._created_contact_id_by_key.get(key) + if contact_id is not None: + field_event_participant_ids.append(contact_id) self._last_contacts_reused_count += 1 else: - try: - # create new contact if not already created - contact = Contact( - name=name, - role=role, - contact_type="Field Event Participant", - organization=organization, - nma_pk_waterlevels=row.GlobalID, - ) - session.add(contact) - - logger.info( - f"{SPACE_2}Created contact: | Name {contact.name} | Role {contact.role} | Organization {contact.organization} | nma_pk_waterlevels {contact.nma_pk_waterlevels}" + contacts_to_create.append( + { + "name": name, + "role": role, + "contact_type": "Field Event Participant", + "organization": organization, + "nma_pk_waterlevels": row.GlobalID, + } + ) + missing_keys.append(key) + + if contacts_to_create: + try: + created_contact_ids = ( + session.execute( + insert(Contact).returning(Contact.id), + contacts_to_create, ) - - self._created_contacts[(name, organization)] = contact + .scalars() + .all() + ) + except Exception as e: + logger.critical( + "Contact insert failed for PointID=%s, GlobalID=%s: %s", + row.PointID, + row.GlobalID, + str(e), + ) + else: + for key, created_contact_id, payload in zip( + missing_keys, created_contact_ids, contacts_to_create + ): + self._created_contact_id_by_key[key] = created_contact_id + field_event_participant_ids.append(created_contact_id) self._last_contacts_created_count += 1 - except Exception as e: - logger.critical( - f"Contact cannot be created: Name {name} | Role {role} | Organization {organization} because of the following: {str(e)}" + logger.info( + "%sCreated contact: | Name %s | Role %s | Organization %s | nma_pk_waterlevels %s", + SPACE_2, + payload["name"], + payload["role"], + payload["organization"], + payload["nma_pk_waterlevels"], ) - continue - - field_event_participants.append(contact) else: owner_contact_id = self._owner_contact_id_by_pointid.get(row.PointID) if owner_contact_id is None: @@ -633,30 +658,16 @@ def _get_field_event_participants(self, session, row) -> list[Contact]: "MeasuredBy", ) else: - contact = session.get(Contact, owner_contact_id) - if contact is None: - logger.warning( - "Owner contact id=%s not found for PointID=%s; cannot use owner fallback for %s", - owner_contact_id, - row.PointID, - self._row_context(row), - ) - self._capture_error( - row.PointID, - f"owner contact id {owner_contact_id} not found", - "MeasuredBy", - ) - else: - field_event_participants.append(contact) - self._last_contacts_reused_count += 1 + field_event_participant_ids.append(owner_contact_id) + self._last_contacts_reused_count += 1 - if len(field_event_participants) == 0: + if len(field_event_participant_ids) == 0: logger.warning( f"No contacts can be associated with the WaterLevels record with GlobalID {row.GlobalID}; " f"continuing with nullable field_event_participant_id." ) - return field_event_participants + return field_event_participant_ids def _row_context(self, row: Any) -> str: return (