Skip to content

Commit ddb4c24

Browse files
authored
Merge pull request #223 from DataIntegrationGroup/transfer
transfer
2 parents c56360d + 26e4fbf commit ddb4c24

9 files changed

Lines changed: 355 additions & 140 deletions

core/lexicon.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,7 @@
357357
{"categories": ["analysis_method_type"], "term": "Laboratory", "definition": "A procedure performed on a physical sample in a controlled, off-site laboratory environment. These methods typically involve complex instrumentation, standardized reagents, and formal quality control protocols."},
358358
{"categories": ["analysis_method_type"], "term": "Field Procedure", "definition": "A standardized procedure performed on-site at the time of sample collection. This can involve direct measurement of the environmental medium using a calibrated field instrument or a specific, documented technique for collecting a sample."},
359359
{"categories": ["analysis_method_type"], "term": "Calculation", "definition": "A mathematical procedure used to derive a new data point from one or more directly measured values. This type is used to document the provenance of calculated data, providing an auditable trail."},
360+
{"categories": ["organization"], "term": "NMSU", "definition": "New Mexico State University"},
360361
{"categories": ["organization"], "term": "USGS", "definition": "US Geological Survey"},
361362
{"categories": ["organization"], "term": "TWDB", "definition": "Texas Water Development Board"},
362363
{"categories": ["organization"], "term": "NMED", "definition": "New Mexico Environment Department"},

transfers/contact_transfer.py

Lines changed: 51 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from transfers.logger import logger
2222
from transfers.util import (
2323
get_transfers_data_path,
24+
chunk_by_size,
2425
)
2526
from transfers.util import read_csv, filter_to_valid_point_ids, replace_nans
2627

@@ -66,50 +67,56 @@ def transfer_contacts(session):
6667
odf = filter_to_valid_point_ids(session, odf)
6768
cleaned_df = odf
6869
errors = []
69-
for i, row in odf.iterrows():
70-
thing = session.query(Thing).where(Thing.name == row.PointID).first()
71-
logger.info(f"Processing PointID: {i} {row.PointID}")
72-
if thing is None:
73-
logger.critical(
74-
f"Thing with PointID {row.PointID} not found. Skipping owner."
75-
)
76-
continue
77-
78-
# TODO: use contact_helper.add_contact
79-
try:
80-
_add_first_contact(session, row, thing, co_to_org_mapper)
81-
session.commit()
82-
session.flush()
83-
logger.info(f"added first contact for PointID {row.PointID}")
84-
except ValidationError as e:
85-
logger.critical(
86-
f"Skipping first contact for PointID {row.PointID} due to validation error: {e.errors()}"
87-
)
88-
session.rollback()
89-
errors.append({"pointid": row.PointID, "error": e.errors()})
90-
except Exception as e:
91-
logger.critical(
92-
f"Skipping first contact for PointID {row.PointID} due to error: {e}"
93-
)
94-
session.rollback()
95-
errors.append({"pointid": row.PointID, "error": e})
96-
try:
97-
_add_second_contact(session, row, thing, co_to_org_mapper)
98-
session.commit()
99-
session.flush()
100-
logger.info(f"added second contact for PointID {row.PointID}")
101-
except ValidationError as e:
102-
logger.critical(
103-
f"Skipping second contact for PointID {row.PointID} due to validation error: {e.errors()}"
104-
)
105-
session.rollback()
106-
errors.append({"pointid": row.PointID, "error": e.errors()})
107-
except Exception as e:
108-
logger.critical(
109-
f"Skipping second contact for PointID {row.PointID} due to error: {e}"
110-
)
111-
session.rollback()
112-
errors.append({"pointid": row.PointID, "error": e})
70+
# for i, row in odf.iterrows():
71+
for chunk in chunk_by_size(odf, 500):
72+
things = (
73+
session.query(Thing).filter(Thing.name.in_(chunk.PointID.tolist())).all()
74+
)
75+
for i, row in chunk.iterrows():
76+
thing = next((thing for thing in things if thing.name == row.PointID), None)
77+
logger.info(f"Processing PointID: {i} {row.PointID}")
78+
if thing is None:
79+
logger.critical(
80+
f"Thing with PointID {row.PointID} not found. Skipping owner."
81+
)
82+
continue
83+
84+
# TODO: use contact_helper.add_contact
85+
try:
86+
_add_first_contact(session, row, thing, co_to_org_mapper)
87+
session.commit()
88+
# session.flush()
89+
logger.info(f"added first contact for PointID {row.PointID}")
90+
except ValidationError as e:
91+
logger.critical(
92+
f"Skipping first contact for PointID {row.PointID} due to validation error: {e.errors()}"
93+
)
94+
session.rollback()
95+
errors.append({"pointid": row.PointID, "error": e.errors()})
96+
except Exception as e:
97+
logger.critical(
98+
f"Skipping first contact for PointID {row.PointID} due to error: {e}"
99+
)
100+
session.rollback()
101+
errors.append({"pointid": row.PointID, "error": e})
102+
103+
try:
104+
_add_second_contact(session, row, thing, co_to_org_mapper)
105+
session.commit()
106+
# session.flush()
107+
logger.info(f"added second contact for PointID {row.PointID}")
108+
except ValidationError as e:
109+
logger.critical(
110+
f"Skipping second contact for PointID {row.PointID} due to validation error: {e.errors()}"
111+
)
112+
session.rollback()
113+
errors.append({"pointid": row.PointID, "error": e.errors()})
114+
except Exception as e:
115+
logger.critical(
116+
f"Skipping second contact for PointID {row.PointID} due to error: {e}"
117+
)
118+
session.rollback()
119+
errors.append({"pointid": row.PointID, "error": e})
113120

114121
return input_df, cleaned_df, errors
115122

transfers/link_ids_transfer.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ def transfer_link_ids_welldata(session):
3333

3434
ldf = filter_to_valid_point_ids(session, ldf)
3535

36-
for chunk in chunk_by_size(ldf, 25):
37-
locations = (
36+
for chunk in chunk_by_size(ldf, 100):
37+
things = (
3838
session.query(Thing).filter(Thing.name.in_(chunk.PointID.tolist())).all()
3939
)
4040
for row in chunk.itertuples():
@@ -45,7 +45,7 @@ def transfer_link_ids_welldata(session):
4545
# )
4646
continue
4747

48-
thing = next((l for l in locations if l.name == row.PointID), None)
48+
thing = next((l for l in things if l.name == row.PointID), None)
4949
if thing is None:
5050
logger.warning(
5151
f"Thing not found forPointID {row.PointID}. Skipping link ids."
@@ -162,7 +162,7 @@ def transfer_link_ids(session, site_type="GW"):
162162
ldf = replace_nans(ldf)
163163

164164
ldf = filter_to_valid_point_ids(session, ldf)
165-
for chunk in chunk_by_size(ldf, 25):
165+
for chunk in chunk_by_size(ldf, 100):
166166
locations = (
167167
session.query(Thing).filter(Thing.name.in_(chunk.PointID.tolist())).all()
168168
)

transfers/metrics.py

Lines changed: 68 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,21 @@
2222
from sqlalchemy import select, func
2323
from sqlalchemy.orm import Session
2424

25-
from db import Thing, WellScreen, Sensor, Contact, Observation, Parameter
25+
from db import (
26+
Thing,
27+
WellScreen,
28+
Sensor,
29+
Contact,
30+
Observation,
31+
Parameter,
32+
Deployment,
33+
TransducerObservation,
34+
)
2635

2736

2837
class Metrics:
38+
include_errors = False
39+
2940
def __init__(self):
3041
# create a new path for the metrics
3142
root = Path("metrics")
@@ -36,12 +47,16 @@ def __init__(self):
3647
os.mkdir(root)
3748

3849
self.path = root / f"metrics_{datetime.now()}.csv"
39-
40-
self._writer = csv.writer(self.path.open("a"), delimiter="|")
41-
self._writer.writerow(["model", "transferred", "input_count", "cleaned_count"])
50+
delimiter = "|" if self.include_errors else ","
51+
self._writer = csv.writer(self.path.open("a"), delimiter=delimiter)
52+
self._writer.writerow(
53+
["model", "input_count", "cleaned_count", "transferred", "issue_percentage"]
54+
)
4255

4356
def well_metrics(self, *args, **kw) -> None:
44-
self._handle_metrics(Thing, where=Thing.thing_type == "water well", *args, **kw)
57+
self._handle_metrics(
58+
Thing, where=Thing.thing_type == "water well", name="Well", *args, **kw
59+
)
4560

4661
def sensor_metrics(self, *args, **kw) -> None:
4762
self._handle_metrics(Sensor, *args, **kw)
@@ -56,7 +71,9 @@ def contact_metrics(self, sess, input_df, cleaned_df, errors) -> None:
5671
)
5772

5873
# since each contact in nma contacts a primary and a secondary contact multiply the count by 2
59-
metrics = [Contact.__name__, len(input_df) * 2, len(cleaned_df) * 2, count]
74+
metrics = self._make_metrics(
75+
Contact.__name__, len(input_df) * 2, len(cleaned_df) * 2, count
76+
)
6077
self._writer.writerow(metrics)
6178
self._write_errors(errors)
6279

@@ -69,33 +86,65 @@ def water_level_metrics(self, sess, input_df, cleaned_df, errors) -> None:
6986
)
7087
count = sess.execute(sql).scalar_one()
7188

72-
metrics = ["Manual Water Levels", len(input_df), len(cleaned_df), count]
89+
metrics = self._make_metrics(
90+
"Manual Water Levels", len(input_df), len(cleaned_df), count
91+
)
7392
self._writer.writerow(metrics)
7493
self._write_errors(errors)
7594

95+
def acoustic_metrics(self, *args, **kw) -> None:
96+
self._transducer_metrics("Acoustic Sounder", *args, **kw)
97+
98+
def pressure_metrics(self, *args, **kw) -> None:
99+
self._transducer_metrics("Pressure Transducer", *args, **kw)
100+
101+
def _transducer_metrics(
102+
self, sensor_type, sess, input_df, cleaned_df, errors
103+
) -> None:
104+
sql = (
105+
select(func.count())
106+
.select_from(TransducerObservation)
107+
.join(Deployment)
108+
.join(Sensor)
109+
.join(Parameter)
110+
.where(Sensor.sensor_type == sensor_type)
111+
.where(Parameter.parameter_name == "groundwater level")
112+
)
113+
count = sess.execute(sql).scalar_one()
114+
metrics = self._make_metrics(sensor_type, len(input_df), len(cleaned_df), count)
115+
self._writer.writerow(metrics)
116+
self._write_errors(errors)
117+
118+
def _make_metrics(self, name, input_n, cleaned_n, count):
    """Build one metrics CSV row: [model name, input rows, cleaned rows, transferred count, issue %].

    ``percent_issue`` is the share of cleaned rows that did NOT make it into
    the database, as a percentage.  Guard against an empty cleaned set: when
    ``cleaned_n`` is zero there is nothing to measure, so report 0 instead of
    dividing by zero.  (The original condition was inverted — it divided
    exactly when ``cleaned_n == 0`` and returned 0 otherwise.)
    """
    percent_issue = (cleaned_n - count) / cleaned_n * 100 if cleaned_n != 0 else 0
    return [name, input_n, cleaned_n, count, percent_issue]
121+
76122
def _handle_metrics(
77-
self, model, sess, input_df, cleaned_df, errors, where=None
123+
self, model, sess, input_df, cleaned_df, errors, where=None, name=None
78124
) -> None:
79125
count = self._get_count(sess, model, where=where)
80-
self._write_metrics(model.__name__, count, input_df, cleaned_df)
126+
127+
if name is None:
128+
name = model.__name__
129+
self._write_metrics(name, count, input_df, cleaned_df)
81130
self._write_errors(errors)
82131

83132
def _write_errors(self, errors: list) -> None:
84-
self._writer.writerow(["PointID", "Error"])
85-
for e in errors:
86-
error = e["error"]
87-
if not isinstance(error, (list, tuple)):
88-
error = [error]
133+
if self.include_errors:
134+
self._writer.writerow(["PointID", "Error"])
135+
for e in errors:
136+
error = e["error"]
137+
if not isinstance(error, (list, tuple)):
138+
error = [error]
89139

90-
for ee in error:
91-
self._writer.writerow([e["pointid"], ee])
92-
self._writer.writerow([])
140+
for ee in error:
141+
self._writer.writerow([e["pointid"], ee])
142+
self._writer.writerow([])
93143

94144
def _write_metrics(
95145
self, name: str, count: int, input_df: DataFrame, cleaned_df: DataFrame
96146
) -> None:
97-
metrics = [name, len(input_df), len(cleaned_df), count]
98-
147+
metrics = self._make_metrics(name, len(input_df), len(cleaned_df), count)
99148
self._writer.writerow(metrics)
100149

101150
def _get_count(self, sess: Session, model, where=None) -> int:

transfers/sensor_transfer.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
# ===============================================================================
1616
from datetime import datetime
1717

18+
from sqlalchemy import select
19+
1820
from db import Sensor, Deployment, Thing
1921
from transfers.util import read_csv, logger, filter_to_valid_point_ids, replace_nans
2022

@@ -119,6 +121,20 @@ def transfer_sensors(session):
119121
f"not an integer",
120122
}
121123
)
124+
sql = (
125+
select(Deployment)
126+
.join(Thing)
127+
.join(Sensor)
128+
.where(Thing.name == pointid)
129+
.where(Sensor.serial_no == sensor.serial_no)
130+
.where(Deployment.installation_date == installation_date)
131+
.where(Deployment.removal_date == removal_date)
132+
)
133+
134+
existing_deployment = session.execute(sql).scalars().one_or_none()
135+
if existing_deployment:
136+
logger.info("existing deployment")
137+
continue
122138

123139
# TODO: add validation
124140
deployment = Deployment(

0 commit comments

Comments
 (0)