diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index 77ce402..05fe71e 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -18,10 +18,10 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python 3.10 - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: python-version: "3.10" cache: "pip" diff --git a/CHANGELOG.md b/CHANGELOG.md index eff71cd..d6457f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,15 +4,41 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## Unreleased: 0.8.0 +## 0.9.2 + +### Added +- `--sites-only` flag to only retrieve site data +- `--output-format` flag to write out sites/summary tables as csv or geojson. + - options are `csv` or `geojson` + - timeseries data is always written to a csv +- NM OSE POD data for sites. + - can be removed from output with `--no-nmose-pod` +- `--output-dir` to change the output directory to a location other than `.` (the current working directory) + +### Changed +- `output` to `output-type` for CLI + +### Fixed +- a bug with `--site-limit`. 
it now exports the number of sites requested by the user + +## 0.8.0 ### Added - water level for WQP +- `earliest_date`, `earliest_time`, `earliest_value`, and `earliest_units` to the summary table +- `die wells` to get all wells for which the DIE reports observations +- `die source {parameter}` to list sources that report a particular parameter +- NM OSE PODs, though its information is only currently available for the invocation of `die wells` ### Changed - NM OSE Roswell data is now pulled from ST2 and not CKAN +- renamed the column `location` to `name` in the summary table to match the format of the `sites` table when timeseries data are exported +- renamed the columns `most_recent_date`, `most_recent_time`, `most_recent_value`, and `most_recent_units` to `latest_date`, `latest_time`, `latest_value`, and `latest_units` respectively for succinctness and juxtaposition with the newly added `earliest` columns. + - This naming schema also enables the development of datetime filters as the descriptor will apply to the latest datetime within the provided time frame filter, whereas most recent indicates no filters. +- removed sites that are not in New Mexico ### Fixed +- removed records from USGS where the value is "-999999" ## 0.7.0 diff --git a/README.md b/README.md index d07b1dd..0cb15ac 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,8 @@ Data comes from the following sources.
We are continuously adding new sources as - Available data: `water levels`, `water quality` - [New Mexico Environment Department Drinking Water Bureau (NMED DWB)](https://nmenv.newmexicowaterdata.org/FROST-Server/v1.1/) - Available data: `water quality` +- [New Mexico Office of the State Engineer Points of Diversions (NMOSEPODs)](https://services2.arcgis.com/qXZbWTdPDbTjl7Dy/ArcGIS/rest/services/OSE_PODs/FeatureServer/0) + - Available data: `None` - [New Mexico Office of the State Engineer ISC Seven Rivers (NMOSE ISC Seven Rivers)](https://nmisc-wf.gladata.com/api/getMonitoringPoints.ashx) - Available data: `water levels`, `water quality` - [New Mexico Office of the State Engineer Roswell District Office (NMOSE Roswell)](https://catalog.newmexicowaterdata.org/dataset/pecos_region_manual_groundwater_levels) @@ -63,27 +65,28 @@ where `{parameter}` is the name of the parameter whose data is to be retrieved, | **nmbgmr-amp** | X | X | X | X | X | X | X | X | X | X | X | X | X | X | X | X | | **nmed-dwb** | - | X | X | X | - | X | X | X | X | X | X | X | X | X | X | X | | **nmose-isc-seven-rivers** | X | - | X | X | - | X | X | X | X | X | X | X | X | X | X | - | +| **nmose-pod** | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | | **nmose-roswell** | X | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | | **nwis** | X | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | | **pvacd** | X | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | | **wqp** | X | X | X | X | X | X | X | X | X | X | X | X | X | X | X | X | -### Output -The `--output` option is required and used to set the output type: +### Output Type +The `--output-type` option is required and used to set the output type: ``` ---output summary +--output-type summary ``` - A summary table consisting of location information as well as summary statistics for the parameter of interest for every location that has observations. 
``` ---output timeseries_unified +--output-type timeseries_unified ``` - A single table consisting of time series data for all locations for the parameter of interest. - A single table of site data that contains information such as latitude, longitude, and elevation ``` ---output timeseries_separated +--output-type timeseries_separated ``` - Separate time series tables for all locations for the parameter of interest. - A single table of site data that contains information such as latitude, longitude, and elevation @@ -98,7 +101,7 @@ A log of the inputs and processes, called `die.log`, is also saved to the output | :----------- | :---------- | :-------- | :------------- | | source | the organization/source for the site | string | Y | | id | the id of the site. The id is used as the key to join the site and timeseries tables | string | Y | -| location | the colloquial name for the site | string | Y | +| name | the colloquial name for the site | string | Y | | usgs_site_id | USGS site id | string | N | | alternate_site_id | alternate site id | string | N | | latitude | latitude in decimal degrees | float | Y | @@ -114,10 +117,14 @@ A log of the inputs and processes, called `die.log`, is also saved to the output | min | the minimum observation | float | Y | | max | the maximum observation | float | Y | | mean | the mean value of the observations | float | Y | -| most_recent_date| date of most recent record in YYYY-MM-DD | string | Y | -| most_recent_time | time of most recent record in HH:MM:SS or HH:MM:SS.mmm | string | N | -| most_recent_value | value of the most recent record | float | Y | -| most_recent_units | units of the most recent record | string | Y | +| earliest_date| date of the earliest record in YYYY-MM-DD | string | Y | +| earliest_time | time of the earliest record in HH:MM:SS or HH:MM:SS.mmm | string | N | +| earliest_value | value of the earliest record | float | Y | +| earliest_units | units of the earliest record | string | Y | +| 
latest_date| date of the latest record in YYYY-MM-DD | string | Y | +| latest_time | time of the latest record in HH:MM:SS or HH:MM:SS.mmm | string | N | +| latest_value | value of the latest record | float | Y | +| latest_units | units of the latest record | string | Y | *CABQ elevation is calculated as [elevation at top of casing] - [stickup height]; if stickup height < 0 the measuring point is assumed to be beneath the ground surface @@ -139,6 +146,7 @@ A log of the inputs and processes, called `die.log`, is also saved to the output | formation | geologic formation in which the well terminates | string | N | | aquifer | aquifer from which the well draws water | string | N | | well_depth | depth of well | float | N | +| well_depth_units | units of well depth. Defaults to ft | string | N | **CABQ elevation is calculated as [elevation at top of casing] - [stickup height]; if stickup height < 0 the measuring point is assumed to be beneath the ground surface @@ -167,12 +175,13 @@ The Data Integration Engine enables the user to obtain groundwater level and gro - `--no-nmbgmr-amp` to exclude New Mexico Bureau of Geology and Mineral Resources (NMBGMR) Aquifer Mapping Program (AMP) data - `--no-nmed-dwb` to exclude New Mexico Environment Department (NMED) Drinking Water Bureau (DWB) data - `--no-nmose-isc-seven-rivers` to exclude New Mexico Office of State Engineer (NMOSE) Interstate Stream Commission (ISC) Seven Rivers data +- `--no-nmose-pod` to exclude New Mexico Office of State Engineer (NMOSE) Point of Diversion (POD) data (though none except for well information is currently available) - `--no-nmose-roswell` to exclude New Mexico Office of State Engineer (NMOSE) Roswell data - `--no-nwis` to exclude USGS NWIS data - `--no-pvacd` to exclude Pecos Valley Artesian Conservancy District (PVACD) data - `--no-wqp` to exclude Water Quality Portal (WQP) data -### Geographic Filters +### Geographic Filters [In Development] The following flags can be used to
geographically filter data: @@ -184,7 +193,11 @@ The following flags can be used to geographically filter data: -- bbox 'x1 y1, x2 y2' ``` -### Date Filters +``` +-- wkt {wkt polygon or multipolygon} +``` + +### Date Filters [In Development] The following flags can be used to filter by dates: @@ -206,12 +219,12 @@ die sources {parameter} to print the sources that report that parameter to the terminal. -### Wells [In Development] +### Sites Use ``` -die wells +die sites ``` -to print wells to the terminal. +to export site information only \ No newline at end of file diff --git a/auto_worker_requirements.txt b/auto_worker_requirements.txt new file mode 100644 index 0000000..cff7193 --- /dev/null +++ b/auto_worker_requirements.txt @@ -0,0 +1,11 @@ +flask +gunicorn +httpx +pandas +geopandas +frost_sta_client +google-cloud-storage +pytest +urllib3>=2.2.0,<3.0.0 +Geoalchemy2 +sqlalchemy \ No newline at end of file diff --git a/backend/__init__.py b/backend/__init__.py index e69de29..804491c 100644 --- a/backend/__init__.py +++ b/backend/__init__.py @@ -0,0 +1,16 @@ +from enum import Enum +from os import environ + + +class OutputFormat(str, Enum): + GEOJSON = "geojson" + CSV = "csv" + GEOSERVER = "geoserver" + + +def get_bool_env_variable(var: str) -> bool: + env_var = environ.get(var, None) + if env_var is None or env_var.strip().lower() not in ["true", "1", "yes"]: + return False + else: + return True diff --git a/backend/bounding_polygons.py b/backend/bounding_polygons.py index d9cd100..8a9ccd6 100644 --- a/backend/bounding_polygons.py +++ b/backend/bounding_polygons.py @@ -15,12 +15,15 @@ # =============================================================================== import json import os +from pprint import pprint import click import httpx from shapely import Polygon, box from shapely.geometry import shape +from backend.geo_utils import transform_srid, SRID_WGS84, SRID_UTM_ZONE_13N + # polygon retrivial functions # multiple polygons @@ -159,7 +162,7 @@ def 
get_county_polygon(name, as_wkt=True): _warning(f"Invalid state. {state}") -def get_state_polygon(state): +def get_state_polygon(state: str, buffer: int | None = None): statefp = _statelookup(state) if statefp: obj = _get_cached_object( @@ -167,13 +170,20 @@ def get_state_polygon(state): f"{state} state", f"https://reference.geoconnex.us/collections/states/items/{statefp}?&f=json", ) + geom_gcs = shape(obj["features"][0]["geometry"]) + + if buffer: + geom_utm = transform_srid(geom_gcs, SRID_WGS84, SRID_UTM_ZONE_13N) + geom_utm = geom_utm.buffer(buffer) + geom_gcs = transform_srid(geom_utm, SRID_UTM_ZONE_13N, SRID_WGS84) - return shape(obj["geometry"]) + return geom_gcs # private helpers ============================ def _make_shape(obj, as_wkt): poly = shape(obj["geometry"]) + poly = poly.simplify(0.1) if as_wkt: return poly.wkt return poly @@ -231,8 +241,9 @@ def _get_cached_object(name, msg, url): return obj +NM_BOUNDARY_BUFFERED = get_state_polygon("NM", 25000) + + if __name__ == "__main__": - # w = get_huc_polygon('0101000201') - # print(w) - print(get_state_hucs_boundaries(state="CO", level=4)) + print(get_state_polygon("NM")) # ============= EOF ============================================= diff --git a/backend/config.py b/backend/config.py index ec9be97..86ef36a 100644 --- a/backend/config.py +++ b/backend/config.py @@ -15,13 +15,12 @@ # =============================================================================== import os import sys -import time from datetime import datetime, timedelta - +from enum import Enum import shapely.wkt +import yaml -from backend.logging import Loggable - +from . 
import OutputFormat from .bounding_polygons import get_county_polygon from .connectors.nmbgmr.source import ( NMBGMRSiteSource, @@ -29,24 +28,35 @@ NMBGMRAnalyteSource, ) from .connectors.bor.source import BORSiteSource, BORAnalyteSource -from .connectors.ckan import ( - HONDO_RESOURCE_ID, - FORT_SUMNER_RESOURCE_ID, - ROSWELL_RESOURCE_ID, -) -from .connectors.ckan.source import ( - OSERoswellSiteSource, - OSERoswellWaterLevelSource, -) from .connectors.nmenv.source import DWBSiteSource, DWBAnalyteSource -from .constants import MILLIGRAMS_PER_LITER, WGS84, FEET +from .connectors.nmose.source import NMOSEPODSiteSource +from .constants import ( + MILLIGRAMS_PER_LITER, + WGS84, + FEET, + WATERLEVELS, + ARSENIC, + BICARBONATE, + CALCIUM, + CARBONATE, + CHLORIDE, + FLUORIDE, + MAGNESIUM, + NITRATE, + PH, + POTASSIUM, + SILICA, + SODIUM, + SULFATE, + TDS, + URANIUM, +) from .connectors.isc_seven_rivers.source import ( ISCSevenRiversSiteSource, ISCSevenRiversWaterLevelSource, ISCSevenRiversAnalyteSource, ) from .connectors.st2.source import ( - ST2SiteSource, PVACDSiteSource, PVACDWaterLevelSource, EBIDSiteSource, @@ -60,47 +70,35 @@ ) from .connectors.usgs.source import NWISSiteSource, NWISWaterLevelSource from .connectors.wqp.source import WQPSiteSource, WQPAnalyteSource, WQPWaterLevelSource +from backend.logger import Loggable -SOURCE_KEYS = ( - "bernco", - "bor", - "cabq", - "ebid", - "nmbgmr_amp", - "nmed_dwb", - "nmose_isc_seven_rivers", - "nmose_roswell", - "nwis", - "pvacd", - "wqp", -) + +SOURCE_DICT = { + "bernco": BernCoSiteSource, + "bor": BORSiteSource, + "cabq": CABQSiteSource, + "ebid": EBIDSiteSource, + "nmbgmr_amp": NMBGMRSiteSource, + "nmed_dwb": DWBSiteSource, + "nmose_isc_seven_rivers": ISCSevenRiversSiteSource, + "nmose_pod": NMOSEPODSiteSource, + "nmose_roswell": NMOSERoswellSiteSource, + "nwis": NWISSiteSource, + "pvacd": PVACDSiteSource, + "wqp": WQPSiteSource, +} + +SOURCE_KEYS = sorted(list(SOURCE_DICT.keys())) def get_source(source): - if source 
== "bernco": - return BernCoSiteSource() - elif source == "bor": - return BORSiteSource() - elif source == "cabq": - return CABQSiteSource() - elif source == "ebid": - return EBIDSiteSource() - elif source == "nmbgmr_amp": - return NMBGMRSiteSource() - elif source == "nmed_dwb": - return DWBSiteSource() - elif source == "nmose_isc_seven_rivers": - return ISCSevenRiversSiteSource() - elif source == "nmose_roswell": - return NMOSERoswellSiteSource() - elif source == "nwis": - return NWISSiteSource() - elif source == "pvacd": - return PVACDSiteSource() - elif source == "wqp": - return WQPSiteSource() - - return None + try: + klass = SOURCE_DICT[source] + except KeyError: + raise ValueError(f"Unknown source {source}") + + if klass: + return klass() class Config(Loggable): @@ -112,10 +110,12 @@ class Config(Loggable): end_date: str = "" # spatial - bbox: dict # dict or str + bbox: str = "" county: str = "" wkt: str = "" + sites_only = False + # sources use_source_bernco: bool = True use_source_bor: bool = True @@ -124,6 +124,7 @@ class Config(Loggable): use_source_nmbgmr_amp: bool = True use_source_nmed_dwb: bool = True use_source_nmose_isc_seven_rivers: bool = True + use_source_nmose_pod: bool = True use_source_nmose_roswell: bool = True use_source_nwis: bool = True use_source_pvacd: bool = True @@ -148,14 +149,19 @@ class Config(Loggable): analyte_output_units: str = MILLIGRAMS_PER_LITER waterlevel_output_units: str = FEET - use_csv: bool = True - use_geojson: bool = False + output_format: str = OutputFormat.CSV + + yes: bool = False - def __init__(self, model=None, payload=None): + def __init__(self, model=None, payload=None, path=None): # need to initialize logger super().__init__() - self.bbox = {} + if path: + payload = self._load_from_yaml(path) + + self._payload = payload + if model: if model.wkt: self.wkt = model.wkt @@ -169,22 +175,135 @@ def __init__(self, model=None, payload=None): for s in SOURCE_KEYS: setattr(self, f"use_source_{s}", s in model.sources) 
elif payload: - self.wkt = payload.get("wkt", "") - self.county = payload.get("county", "") - self.output_summary = payload.get("output_summary", False) - self.output_timeseries_unified = payload.get( - "output_timeseries_unified", False - ) - self.output_timeseries_separated = payload.get( - "output_timeseries_separated", False - ) - self.output_name = payload.get("output_name", "output") - self.start_date = payload.get("start_date", "") - self.end_date = payload.get("end_date", "") - self.parameter = payload.get("parameter", "") + sources = payload.get("sources", []) + if sources: + for sk in SOURCE_KEYS: + value = sources.get(sk) + if value is not None: + setattr(self, f"use_source_{sk}", value) + + for attr in ( + "wkt", + "county", + "bbox", + "output_summary", + "output_timeseries_unified", + "output_timeseries_separated", + "start_date", + "end_date", + "parameter", + "output_name", + "dry", + "latest_water_level_only", + "output_format", + "use_cloud_storage", + "yes", + ): + if attr in payload: + setattr(self, attr, payload[attr]) + + def _load_from_yaml(self, path): + path = os.path.abspath(path) + if os.path.exists(path): + self.log(f"Loading config from {path}") + with open(path, "r") as f: + data = yaml.safe_load(f) + return data + else: + self.warn(f"Config file {path} not found") + + def get_config_and_false_agencies(self): + if self.parameter == WATERLEVELS: + config_agencies = [ + "bernco", + "cabq", + "ebid", + "nmbgmr_amp", + "nmose_isc_seven_rivers", + "nmose_roswell", + "nwis", + "pvacd", + "wqp", + ] + false_agencies = ["bor", "nmose_pod", "nmed_dwb"] + elif self.parameter == CARBONATE: + config_agencies = ["nmbgmr_amp", "wqp"] + false_agencies = [ + "bor", + "bernco", + "cabq", + "ebid", + "nmed_dwb", + "nmose_isc_seven_rivers", + "nmose_pod", + "nmose_roswell", + "nwis", + "pvacd", + ] + elif self.parameter in [ARSENIC, URANIUM]: + config_agencies = ["bor", "nmbgmr_amp", "nmed_dwb", "wqp"] + false_agencies = [ + "bernco", + "cabq", + "ebid", 
+ "nmose_isc_seven_rivers", + "nmose_roswell", + "nmose_pod", + "nwis", + "pvacd", + ] + elif self.parameter in [ + BICARBONATE, + CALCIUM, + CHLORIDE, + FLUORIDE, + MAGNESIUM, + NITRATE, + PH, + POTASSIUM, + SILICA, + SODIUM, + SULFATE, + TDS, + ]: + config_agencies = [ + "bor", + "nmbgmr_amp", + "nmed_dwb", + "nmose_isc_seven_rivers", + "wqp", + ] + false_agencies = [ + "bernco", + "cabq", + "ebid", + "nmose_roswell", + "nmose_pod", + "nwis", + "pvacd", + ] + return config_agencies, false_agencies - for s in SOURCE_KEYS: - setattr(self, f"use_source_{s}", s in payload.get("sources", [])) + def finalize(self): + self._update_output_units() + if self.output_format != OutputFormat.GEOSERVER: + self.update_output_name() + + self.make_output_directory() + self.make_output_path() + + def all_site_sources(self): + sources = [] + for s in SOURCE_KEYS: + if getattr(self, f"use_source_{s}"): + source = get_source(s) + source.set_config(self) + sources.append((source, None)) + + # pods = NMOSEPODSiteSource() + # pods.set_config(self) + # sources.append((pods, None)) + return sources def analyte_sources(self): sources = [] @@ -328,6 +447,8 @@ def _report_attributes(title, attrs): "output_timeseries_separated", "output_horizontal_datum", "output_elevation_units", + "use_cloud_storage", + "output_format", ), ) @@ -384,7 +505,14 @@ def _validate_county(self): return True - def _update_output_name(self): + def make_output_directory(self): + """ + Create the output directory if it doesn't exist. + """ + if not os.path.exists(self.output_dir): + os.mkdir(self.output_dir) + + def update_output_name(self): """ Generate a unique output name based on existing directories in the output directory. 
@@ -419,7 +547,7 @@ def _update_output_name(self): self.output_name = output_name - def _make_output_path(self): + def make_output_path(self): if not os.path.exists(self.output_path): os.mkdir(self.output_path) @@ -440,5 +568,9 @@ def end_dt(self): def output_path(self): return os.path.join(self.output_dir, f"{self.output_name}") + def get(self, attr): + if self._payload: + return self._payload.get(attr) + # ============= EOF ============================================= diff --git a/backend/connectors/bor/source.py b/backend/connectors/bor/source.py index eac6fb3..5ad03e1 100644 --- a/backend/connectors/bor/source.py +++ b/backend/connectors/bor/source.py @@ -27,13 +27,15 @@ SOURCE_PARAMETER_NAME, SOURCE_PARAMETER_UNITS, DT_MEASURED, + EARLIEST, + LATEST, ) from backend.source import ( BaseSource, BaseSiteSource, BaseAnalyteSource, - get_most_recent, + get_terminal_record, get_analyte_search_param, ) @@ -93,9 +95,8 @@ def _extract_parameter_dates(self, records): def _extract_source_parameter_names(self, records): return [self._source_parameter_name for ri in records] - def _extract_most_recent(self, rs): - - record = get_most_recent(rs, "attributes.dateTime") + def _extract_terminal_record(self, records, bookend): + record = get_terminal_record(records, "attributes.dateTime", bookend=bookend) return { "value": record["attributes"]["result"], "datetime": parse_dt(record["attributes"]["dateTime"]), diff --git a/backend/connectors/ckan/source.py b/backend/connectors/ckan/source.py index 32bfed0..736d668 100644 --- a/backend/connectors/ckan/source.py +++ b/backend/connectors/ckan/source.py @@ -46,7 +46,7 @@ BaseSource, BaseSiteSource, BaseWaterLevelSource, - get_most_recent, + get_terminal_record, ) @@ -138,8 +138,8 @@ def _parse_response(self, site_record, resp): def _extract_source_parameter_results(self, records): return [float(r["DTWGS"]) for r in records] - def _extract_most_recent(self, records): - record = get_most_recent(records, tag="Date") + def 
_extract_terminal_record(self, records, bookend): + record = get_terminal_record(records, tag="Date", bookend=bookend) return { "value": record["DTWGS"], "datetime": record["Date"], diff --git a/backend/connectors/isc_seven_rivers/source.py b/backend/connectors/isc_seven_rivers/source.py index b791bb1..5679fad 100644 --- a/backend/connectors/isc_seven_rivers/source.py +++ b/backend/connectors/isc_seven_rivers/source.py @@ -28,6 +28,8 @@ PARAMETER_UNITS, SOURCE_PARAMETER_NAME, SOURCE_PARAMETER_UNITS, + EARLIEST, + LATEST, ) from backend.connectors.isc_seven_rivers.transformer import ( ISCSevenRiversSiteTransformer, @@ -39,7 +41,7 @@ BaseSiteSource, BaseWaterLevelSource, BaseAnalyteSource, - get_most_recent, + get_terminal_record, get_analyte_search_param, ) @@ -120,8 +122,8 @@ def _extract_parameter_record(self, record): return record - def _extract_most_recent(self, records): - record = get_most_recent(records, "dateTime") + def _extract_terminal_record(self, records, bookend): + record = get_terminal_record(records, "dateTime", bookend=bookend) return { "value": record["result"], @@ -185,7 +187,11 @@ def get_records(self, site_record): ) def _clean_records(self, records): - return [r for r in records if r["depthToWaterFeet"] is not None] + return [ + r + for r in records + if r["depthToWaterFeet"] is not None and not r["invalid"] and not r["dry"] + ] def _extract_parameter_record(self, record): record[PARAMETER_NAME] = DTW @@ -197,9 +203,7 @@ def _extract_parameter_record(self, record): return record def _extract_source_parameter_results(self, records): - return [ - r["depthToWaterFeet"] for r in records if not r["invalid"] and not r["dry"] - ] + return [r["depthToWaterFeet"] for r in records] def _extract_parameter_dates(self, records: list) -> list: return [get_datetime(r) for r in records] @@ -210,14 +214,14 @@ def _extract_source_parameter_names(self, records): def _extract_source_parameter_units(self, records): return [self._source_parameter_units for r in 
records] - def _extract_most_recent(self, records): - record = get_most_recent(records, "dateTime") + def _extract_terminal_record(self, records, bookend): + record = get_terminal_record(records, "dateTime", bookend=bookend) t = get_datetime(record) return { "value": record["depthToWaterFeet"], "datetime": t, "source_parameter_units": self._source_parameter_units, - "source_parameter_name": DTW, + "source_parameter_name": self._source_parameter_name, } diff --git a/backend/connectors/nmbgmr/source.py b/backend/connectors/nmbgmr/source.py index d75adae..d01cd11 100644 --- a/backend/connectors/nmbgmr/source.py +++ b/backend/connectors/nmbgmr/source.py @@ -15,8 +15,7 @@ # =============================================================================== import os -import httpx - +from backend import get_bool_env_variable from backend.connectors import NM_STATE_BOUNDING_POLYGON from backend.connectors.nmbgmr.transformer import ( NMBGMRSiteTransformer, @@ -33,12 +32,14 @@ PARAMETER_VALUE, SOURCE_PARAMETER_NAME, SOURCE_PARAMETER_UNITS, + EARLIEST, + LATEST, ) from backend.source import ( BaseWaterLevelSource, BaseSiteSource, BaseAnalyteSource, - get_most_recent, + get_terminal_record, get_analyte_search_param, make_site_list, ) @@ -46,13 +47,15 @@ def _make_url(endpoint): if os.getenv("DEBUG") == "1": - return f"http://localhost:8000/latest/{endpoint}" - return f"https://waterdata.nmt.edu/latest/{endpoint}" + url = f"http://localhost:8000/latest/{endpoint}" + else: + url = f"https://waterdata.nmt.edu/latest/{endpoint}" + return url class NMBGMRSiteSource(BaseSiteSource): transformer_klass = NMBGMRSiteTransformer - chunk_size = 10 + chunk_size = 100 bounding_polygon = NM_STATE_BOUNDING_POLYGON def __repr__(self): @@ -70,33 +73,38 @@ def get_records(self): if config.has_bounds(): params["wkt"] = config.bounding_wkt() - if config.site_limit: - params["limit"] = config.site_limit + if not config.sites_only: - if config.parameter.lower() != "waterlevels": - params["parameter"] = 
get_analyte_search_param( - config.parameter, NMBGMR_ANALYTE_MAPPING - ) - else: - params["parameter"] = "Manual groundwater levels" + if config.parameter.lower() != "waterlevels": + params["parameter"] = get_analyte_search_param( + config.parameter, NMBGMR_ANALYTE_MAPPING + ) + else: + params["parameter"] = "Manual groundwater levels" # tags="features" because the response object is a GeoJSON sites = self._execute_json_request( _make_url("locations"), params, tag="features", timeout=30 ) - for site in sites: - print(f"Obtaining well data for {site['properties']['point_id']}") - well_data = self._execute_json_request( - _make_url("wells"), - params={"pointid": site["properties"]["point_id"]}, - tag="", - ) - site["properties"]["formation"] = well_data["formation"] - site["properties"]["well_depth"] = well_data["well_depth_ftbgs"] - site["properties"]["well_depth_units"] = FEET - # site["properties"]["formation"] = None - # site["properties"]["well_depth"] = None - # site["properties"]["well_depth_units"] = FEET + if not config.sites_only: + for site in sites: + if get_bool_env_variable("IS_TESTING_ENV"): + print( + f"Skipping well data for {site['properties']['point_id']} for testing (until well data can be retrieved in batches)" + ) + site["properties"]["formation"] = None + site["properties"]["well_depth"] = None + site["properties"]["well_depth_units"] = FEET + else: + print(f"Obtaining well data for {site['properties']['point_id']}") + well_data = self._execute_json_request( + _make_url("wells"), + params={"pointid": site["properties"]["point_id"]}, + tag="", + ) + site["properties"]["formation"] = well_data["formation"] + site["properties"]["well_depth"] = well_data["well_depth_ftbgs"] + site["properties"]["well_depth_units"] = FEET return sites @@ -131,8 +139,8 @@ def _extract_site_records(self, records, site_record): def _extract_source_parameter_units(self, records): return [r["Units"] for r in records] - def _extract_most_recent(self, records): - record = 
get_most_recent(records, "info.CollectionDate") + def _extract_terminal_record(self, records, bookend): + record = get_terminal_record(records, "info.CollectionDate", bookend=bookend) return { "value": record["SampleValue"], "datetime": record["info"]["CollectionDate"], @@ -168,7 +176,11 @@ def __repr__(self): def _clean_records(self, records): # remove records with no depth to water value - return [r for r in records if r["DepthToWaterBGS"] is not None] + return [ + r + for r in records + if r["DepthToWaterBGS"] is not None and r["DateMeasured"] is not None + ] def _extract_parameter_record(self, record, *args, **kw): record[PARAMETER_NAME] = DTW @@ -179,8 +191,8 @@ def _extract_parameter_record(self, record, *args, **kw): record[SOURCE_PARAMETER_UNITS] = record["DepthToWaterBGSUnits"] return record - def _extract_most_recent(self, records): - record = get_most_recent(records, "DateMeasured") + def _extract_terminal_record(self, records, bookend): + record = get_terminal_record(records, "DateMeasured", bookend=bookend) return { "value": record["DepthToWaterBGS"], "datetime": (record["DateMeasured"], record["TimeMeasured"]), @@ -195,7 +207,7 @@ def _extract_source_parameter_results(self, records): return [r["DepthToWaterBGS"] for r in records] def _extract_site_records(self, records, site_record): - return [ri for ri in records if ri["Well"]["PointID"] == site_record.id] + return [ri for ri in records if ri["PointID"] == site_record.id] def _extract_source_parameter_names(self, records): return ["DepthToWaterBGS" for r in records] @@ -212,7 +224,19 @@ def get_records(self, site_record): # just use manual waterlevels temporarily url = _make_url("waterlevels/manual") - return self._execute_json_request(url, params) + paginated_records = self._execute_json_request(url, params, tag="") + items = paginated_records["items"] + page = paginated_records["page"] + pages = paginated_records["pages"] + + while page < pages: + page += 1 + params["page"] = page + new_records = 
self._execute_json_request(url, params, tag="") + items.extend(new_records["items"]) + pages = new_records["pages"] + + return items # ============= EOF ============================================= diff --git a/backend/connectors/nmbgmr/transformer.py b/backend/connectors/nmbgmr/transformer.py index dd1163e..420c7f6 100644 --- a/backend/connectors/nmbgmr/transformer.py +++ b/backend/connectors/nmbgmr/transformer.py @@ -38,9 +38,9 @@ def _transform(self, record): "vertical_datum": props["altitude_datum"], "usgs_site_id": props["site_id"], "alternate_site_id": props["alternate_site_id"], - "formation": props["formation"], - "well_depth": props["well_depth"], - "well_depth_units": props["well_depth_units"], + "formation": props.get("formation", ""), + "well_depth": props.get("well_depth", ""), + "well_depth_units": props.get("well_depth_units", ""), } return rec diff --git a/backend/connectors/nmenv/source.py b/backend/connectors/nmenv/source.py index 335fd73..08b1d68 100644 --- a/backend/connectors/nmenv/source.py +++ b/backend/connectors/nmenv/source.py @@ -27,8 +27,9 @@ DT_MEASURED, SOURCE_PARAMETER_NAME, SOURCE_PARAMETER_UNITS, + TDS, ) -from backend.source import get_analyte_search_param, get_most_recent +from backend.source import get_analyte_search_param, get_terminal_record URL = "https://nmenv.newmexicowaterdata.org/FROST-Server/v1.1/" @@ -44,32 +45,46 @@ def __repr__(self): return "DWBSiteSource" def health(self): - return self.get_records(top=10, analyte="TDS") + return self.get_records(top=10, analyte=TDS) def get_records(self, *args, **kw): + analyte = None if "analyte" in kw: analyte = kw["analyte"] elif self.config: analyte = self.config.parameter - analyte = get_analyte_search_param(analyte, DWB_ANALYTE_MAPPING) - if analyte is None: - return [] - service = self.get_service() - ds = service.datastreams() - q = ds.query() - fs = [f"ObservedProperty/id eq {analyte}"] - if self.config: + if self.config.sites_only: + ds = service.things() + q = ds.query() 
+ fs = [] if self.config.has_bounds(): fs.append( - f"st_within(Thing/Location/location, geography'{self.config.bounding_wkt()}')" + f"st_within(Locations/location, geography'{self.config.bounding_wkt()}')" ) + q = q.expand("Locations") + if fs: + q = q.filter(" and ".join(fs)) + return [thing.locations.entities[0] for thing in q.list()] + else: + analyte = get_analyte_search_param(analyte, DWB_ANALYTE_MAPPING) + if analyte is None: + return [] + + ds = service.datastreams() + q = ds.query() + fs = [f"ObservedProperty/id eq {analyte}"] + if self.config: + if self.config.has_bounds(): + fs.append( + f"st_within(Thing/Location/location, geography'{self.config.bounding_wkt()}')" + ) - q = q.filter(" and ".join(fs)) - q = q.expand("Thing/Locations") - return [ds.thing.locations.entities[0] for ds in q.list()] + q = q.filter(" and ".join(fs)) + q = q.expand("Thing/Locations") + return [di.thing.locations.entities[0] for di in q.list()] class DWBAnalyteSource(STAnalyteSource): @@ -150,10 +165,10 @@ def _extract_parameter_dates(self, records: list) -> list: def _extract_source_parameter_names(self, records: list) -> list: return [r["datastream"].observed_property.name for r in records] - def _extract_most_recent(self, records): + def _extract_terminal_record(self, records, bookend): # this is only used in summary output - record = get_most_recent( - records, tag=lambda x: x["observation"].phenomenon_time + record = get_terminal_record( + records, tag=lambda x: x["observation"].phenomenon_time, bookend=bookend ) return { diff --git a/backend/connectors/nmose/source.py b/backend/connectors/nmose/source.py index 5cb7a3e..5def1bf 100644 --- a/backend/connectors/nmose/source.py +++ b/backend/connectors/nmose/source.py @@ -1,2 +1,73 @@ -import os +from typing import List, Dict, Any + +from shapely import wkt +from backend.connectors import NM_STATE_BOUNDING_POLYGON +from backend.connectors.nmose.transformer import NMOSEPODSiteTransformer from backend.source import 
BaseSiteSource + + +def wkt_to_arcgis_json(obj): + if isinstance(obj, str): + obj = wkt.loads(obj) + coords = [[coord[0], coord[1]] for coord in obj.exterior.coords] + return {"rings": [coords], "spatialReference": {"wkid": 4326}} + + +class NMOSEPODSiteSource(BaseSiteSource): + """ + NMOSEPODSiteSource is a class that inherits from BaseSiteSource. + It is used to fetch site data from the NMOSEPOD API. + """ + + transformer_klass = NMOSEPODSiteTransformer + chunk_size: int = 5000 + bounding_polygon = NM_STATE_BOUNDING_POLYGON + + def get_records(self, *args, **kw) -> List[Dict]: + config = self.config + params: Dict[str, Any] = {} + # if config.has_bounds(): + # bbox = config.bbox_bounding_points() + # params["bBox"] = ",".join([str(b) for b in bbox]) + # else: + # params["stateCd"] = "NM" + # + # if config.start_date: + # params["startDt"] = config.start_dt.date().isoformat() + # if config.end_date: + # params["endDt"] = config.end_dt.date().isoformat() + + url: str = ( + "https://services2.arcgis.com/qXZbWTdPDbTjl7Dy/arcgis/rest/services/OSE_PODs/FeatureServer/0/query" + ) + + params["where"] = "pod_status = 'ACT' AND pod_basin NOT IN ('SP', 'SD', 'LWD')" + params["outFields"] = ( + "OBJECTID,pod_basin,pod_status,easting,northing,datum,utm_accura,status,county" + "pod_name,pod_nbr,pod_suffix,pod_file,depth_well,aquifer,elevation" + ) + + params["outSR"] = 4326 + params["f"] = "json" + params["resultRecordCount"] = self.chunk_size + params["resultOffset"] = 0 + + if config.has_bounds(): + wkt = config.bounding_wkt() + params["geometry"] = wkt_to_arcgis_json(wkt) + params["geometryType"] = "esriGeometryPolygon" + + records: List = [] + i = 1 + while 1: + rs = self._execute_json_request(url, params, tag="features") + if rs is None: + continue + else: + records.extend(rs) + params["resultOffset"] += self.chunk_size + if len(rs) < self.chunk_size: + break + i += 1 + + return records diff --git a/backend/connectors/nmose/transformer.py 
b/backend/connectors/nmose/transformer.py index e69de29..8f26ebb 100644 --- a/backend/connectors/nmose/transformer.py +++ b/backend/connectors/nmose/transformer.py @@ -0,0 +1,35 @@ +from backend.transformer import BaseTransformer, SiteTransformer + + +class NMOSEPODSiteTransformer(SiteTransformer): + def _transform(self, record) -> dict: + """ + Transform the record into a dictionary format. + + Args: + record (dict): The record to transform. + + Returns: + dict: The transformed record. + """ + + properties = record["attributes"] + geometry = record["geometry"] + + # print(properties.keys()) + # print(geometry.keys()) + rec = { + "source": "NMOSEPOD", + "id": properties["pod_file"], + # "name": record["station_nm"], + "latitude": geometry["y"], + "longitude": geometry["x"], + "elevation": properties["elevation"], + "elevation_units": "ft", + # "horizontal_datum": datum, + # "vertical_datum": record["alt_datum_cd"], + "aquifer": properties["aquifer"], + "well_depth": properties["depth_well"], + "well_depth_units": "ft", + } + return rec diff --git a/backend/connectors/st2/source.py b/backend/connectors/st2/source.py index 69739f4..181513b 100644 --- a/backend/connectors/st2/source.py +++ b/backend/connectors/st2/source.py @@ -50,7 +50,7 @@ SOURCE_PARAMETER_NAME, SOURCE_PARAMETER_UNITS, ) -from backend.source import BaseSiteSource, BaseWaterLevelSource, get_most_recent +from backend.source import BaseSiteSource, BaseWaterLevelSource, get_terminal_record URL = "https://st2.newmexicowaterdata.org/FROST-Server/v1.1" @@ -113,18 +113,6 @@ def __repr__(self): class ST2WaterLevelSource(STWaterLevelSource): url = URL - def _extract_most_recent(self, records): - record = get_most_recent( - records, tag=lambda x: x["observation"].phenomenon_time - ) - - return { - "value": record["observation"].result, - "datetime": record["observation"].phenomenon_time, - "source_parameter_units": record["datastream"].unit_of_measurement.symbol, - "source_parameter_name": 
record["datastream"].name, - } - def _extract_parameter_record(self, record): record[PARAMETER_NAME] = DTW record[PARAMETER_VALUE] = record["observation"].result diff --git a/backend/connectors/st_connector.py b/backend/connectors/st_connector.py index d6b78ea..d596fe1 100644 --- a/backend/connectors/st_connector.py +++ b/backend/connectors/st_connector.py @@ -19,11 +19,12 @@ from shapely import MultiPolygon, Polygon, unary_union from backend.bounding_polygons import get_state_polygon +from backend.constants import EARLIEST, LATEST from backend.source import ( BaseSiteSource, BaseWaterLevelSource, BaseAnalyteSource, - get_most_recent, + get_terminal_record, ) from backend.transformer import SiteTransformer @@ -56,15 +57,16 @@ def _get_things( return things.list() - def _extract_most_recent(self, records): - record = get_most_recent( - records, tag=lambda x: x["observation"].phenomenon_time + def _extract_terminal_record(self, records, bookend): + record = get_terminal_record( + records, tag=lambda x: x["observation"].phenomenon_time, bookend=bookend ) return { "value": self._parse_result(record["observation"].result), "datetime": record["observation"].phenomenon_time, - "units": record["datastream"].unit_of_measurement.symbol, + "source_parameter_units": record["datastream"].unit_of_measurement.symbol, + "source_parameter_name": record["datastream"].name, } def _parse_result(self, result): diff --git a/backend/connectors/usgs/source.py b/backend/connectors/usgs/source.py index cd0e1ad..cac4f2a 100644 --- a/backend/connectors/usgs/source.py +++ b/backend/connectors/usgs/source.py @@ -27,6 +27,8 @@ PARAMETER_UNITS, SOURCE_PARAMETER_NAME, SOURCE_PARAMETER_UNITS, + EARLIEST, + LATEST, ) from backend.connectors.usgs.transformer import ( NWISSiteTransformer, @@ -37,7 +39,7 @@ BaseWaterLevelSource, BaseSiteSource, make_site_list, - get_most_recent, + get_terminal_record, ) @@ -74,11 +76,12 @@ def parse_json(data): for location in data["timeSeries"]: site_code = 
location["sourceInfo"]["siteCode"][0]["value"] + agency = location["sourceInfo"]["siteCode"][0]["agencyCode"] source_parameter_name = location["variable"]["variableName"] source_parameter_units = location["variable"]["unit"]["unitCode"] for value in location["values"][0]["value"]: record = { - "site_code": site_code, + "site_id": f"{agency}-{site_code}", "source_parameter_name": source_parameter_name, "value": value["value"], "datetime_measured": value["dateTime"], @@ -148,12 +151,16 @@ def __repr__(self): return "NWISWaterLevelSource" def get_records(self, site_record): + # query sites with the agency, which need to be in the form of "{agency}:{site number}" + sites = make_site_list(site_record) + sites_with_colons = [s.replace("-", ":") for s in sites] + params = { "format": "json", "siteType": "GW", "siteStatus": "all", "parameterCd": "72019", - "sites": ",".join(make_site_list(site_record)), + "sites": ",".join(sites_with_colons), } config = self.config @@ -176,10 +183,14 @@ def get_records(self, site_record): return records def _extract_site_records(self, records, site_record): - return [ri for ri in records if ri["site_code"] == site_record.id] + return [ri for ri in records if ri["site_id"] == site_record.id] def _clean_records(self, records): - return [r for r in records if r["value"] is not None and r["value"].strip()] + return [ + r + for r in records + if r["value"] is not None and r["value"].strip() and r["value"] != "-999999" + ] def _extract_source_parameter_results(self, records): return [float(r["value"]) for r in records] @@ -193,8 +204,8 @@ def _extract_source_parameter_names(self, records: list) -> list: def _extract_source_parameter_units(self, records): return [r["source_parameter_units"] for r in records] - def _extract_most_recent(self, records): - record = get_most_recent(records, "datetime_measured") + def _extract_terminal_record(self, records, bookend): + record = get_terminal_record(records, "datetime_measured", bookend=bookend) return { 
"value": float(record["value"]), # "datetime": (record["date_measured"], record["time_measured"]), diff --git a/backend/connectors/usgs/transformer.py b/backend/connectors/usgs/transformer.py index 1f61cf5..379b8bd 100644 --- a/backend/connectors/usgs/transformer.py +++ b/backend/connectors/usgs/transformer.py @@ -32,9 +32,13 @@ def _transform(self, record): # if not self.contained(lng, lat): # return + agency = record["agency_cd"] + site_no = record["site_no"] + site_id = f"{agency}-{site_no}" + rec = { "source": "USGS-NWIS", - "id": record["site_no"], + "id": site_id, "name": record["station_nm"], "latitude": lat, "longitude": lng, diff --git a/backend/connectors/wqp/source.py b/backend/connectors/wqp/source.py index 4987fee..f9550ab 100644 --- a/backend/connectors/wqp/source.py +++ b/backend/connectors/wqp/source.py @@ -26,6 +26,8 @@ SOURCE_PARAMETER_NAME, SOURCE_PARAMETER_UNITS, DT_MEASURED, + EARLIEST, + LATEST, ) from backend.connectors.wqp.transformer import ( WQPSiteTransformer, @@ -38,7 +40,7 @@ BaseWaterLevelSource, BaseParameterSource, make_site_list, - get_most_recent, + get_terminal_record, get_analyte_search_param, ) @@ -87,15 +89,15 @@ def get_records(self): } if config.has_bounds(): params["bBox"] = ",".join([str(b) for b in config.bbox_bounding_points()]) - - if config.parameter.lower() != "waterlevels": - params["characteristicName"] = get_analyte_search_param( - config.parameter, WQP_ANALYTE_MAPPING - ) - else: - # every record with pCode 30210 (depth in m) has a corresponding - # record with pCode 72019 (depth in ft) but not vice versa - params["pCode"] = "30210" + if not config.sites_only: + if config.parameter.lower() != "waterlevels": + params["characteristicName"] = get_analyte_search_param( + config.parameter, WQP_ANALYTE_MAPPING + ) + else: + # every record with pCode 30210 (depth in m) has a corresponding + # record with pCode 72019 (depth in ft) but not vice versa + params["pCode"] = "30210" params.update(get_date_range(config)) @@ 
-139,13 +141,13 @@ def _extract_parameter_dates(self, records): def _extract_source_parameter_names(self, records): return [ri["CharacteristicName"] for ri in records] - def _extract_most_recent(self, records): - ri = get_most_recent(records, "ActivityStartDate") + def _extract_terminal_record(self, records, bookend): + record = get_terminal_record(records, "ActivityStartDate", bookend=bookend) return { - "value": ri["ResultMeasureValue"], - "datetime": ri["ActivityStartDate"], - "source_parameter_units": ri["ResultMeasure/MeasureUnitCode"], - "source_parameter_name": ri["CharacteristicName"], + "value": record["ResultMeasureValue"], + "datetime": record["ActivityStartDate"], + "source_parameter_units": record["ResultMeasure/MeasureUnitCode"], + "source_parameter_name": record["CharacteristicName"], } def get_records(self, site_record): diff --git a/backend/constants.py b/backend/constants.py index 2f56f9c..b7635ab 100644 --- a/backend/constants.py +++ b/backend/constants.py @@ -13,6 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # =============================================================================== +EARLIEST = "earliest" +LATEST = "latest" + WATERLEVELS = "waterlevels" ARSENIC = "arsenic" diff --git a/backend/geo_utils.py b/backend/geo_utils.py index 930f76d..4484ee9 100644 --- a/backend/geo_utils.py +++ b/backend/geo_utils.py @@ -14,12 +14,27 @@ # limitations under the License. 
# =============================================================================== import pyproj +from shapely.ops import transform PROJECTIONS = {} TRANSFORMS = {} ALLOWED_DATUMS = ["NAD27", "NAD83", "WGS84"] +# srids for NM +SRID_WGS84 = 4326 +SRID_UTM_ZONE_13N = 26913 + + +def transform_srid(geometry, source_srid, target_srid): + """ + geometry must be a shapely geometry object, like Point, Polygon, or MultiPolygon + """ + source_crs = pyproj.CRS(f"EPSG:{source_srid}") + target_crs = pyproj.CRS(f"EPSG:{target_srid}") + transformer = pyproj.Transformer.from_crs(source_crs, target_crs, always_xy=True) + return transform(transformer.transform, geometry) + def datum_transform(x, y, in_datum, out_datum): """ diff --git a/backend/logging.py b/backend/logger.py similarity index 100% rename from backend/logging.py rename to backend/logger.py diff --git a/backend/persister.py b/backend/persister.py index 38e8493..b470c2b 100644 --- a/backend/persister.py +++ b/backend/persister.py @@ -16,12 +16,12 @@ import csv import io import os -import shutil +from pprint import pprint +import json -import pandas as pd -import geopandas as gpd +from backend import OutputFormat +from backend.logger import Loggable -from backend.logging import Loggable try: from google.cloud import storage @@ -29,19 +29,75 @@ print("google cloud storage not available") +def write_memory(func, records, output_format=None): + f = io.BytesIO() + func(f, records, output_format) + return f.getvalue() + + +def dump_timeseries(path, timeseries: list[list]): + """ + Dumps timeseries records to a CSV file. The timeseries must be a list of + lists, where each inner list contains the records for a single site. In the case + of timeseries separated, the inner list will contain the records for a single site + and this function will be called multiple times, once for each site. 
+ """ + with open(path, "w", newline="") as f: + writer = csv.writer(f) + headers_have_not_been_written = True + for i, records in enumerate(timeseries): + for record in records: + if i == 0 and headers_have_not_been_written: + writer.writerow(record.keys) + headers_have_not_been_written = False + writer.writerow(record.to_row()) + + +def dump_sites_summary(path, records, output_format: OutputFormat): + if output_format == OutputFormat.CSV: + with open(path, "w", newline="") as f: + writer = csv.writer(f) + for i, site in enumerate(records): + if i == 0: + writer.writerow(site.keys) + writer.writerow(site.to_row()) + else: + features = [ + { + "type": "Feature", + "geometry": { + "type": "Point", + "coordinates": [ + getattr(record, "longitude"), + getattr(record, "latitude"), + getattr(record, "elevation"), + ], + }, + "properties": { + k: getattr(record, k) + for k in record.keys + if k not in ["latitude", "longitude", "elevation"] + }, + } + for record in records + ] + feature_collection = {"type": "FeatureCollection", "features": features} + + with open(path, "w") as f: + json.dump(feature_collection, f, indent=4) + + class BasePersister(Loggable): """ Class to persist the data to a file or cloud storage. 
If persisting to a file, the output directory is created by config._make_output_path() """ - extension: str - # output_id: str - - def __init__(self): + def __init__(self, config=None): self.records = [] self.timeseries = [] self.sites = [] + self.config = config super().__init__() # self.keys = record_klass.keys @@ -55,27 +111,27 @@ def finalize(self, output_name: str): def dump_sites(self, path: str): if self.sites: path = os.path.join(path, "sites") - path = self.add_extension(path) + path = self.add_extension(path, self.config.output_format) self.log(f"dumping sites to {os.path.abspath(path)}") - self._write(path, self.sites) + self._dump_sites_summary(path, self.sites, self.config.output_format) else: self.log("no sites to dump", fg="red") def dump_summary(self, path: str): if self.records: path = os.path.join(path, "summary") - path = self.add_extension(path) + path = self.add_extension(path, self.config.output_format) self.log(f"dumping summary to {os.path.abspath(path)}") - self._write(path, self.records) + self._dump_sites_summary(path, self.records, self.config.output_format) else: self.log("no records to dump", fg="red") def dump_timeseries_unified(self, path: str): if self.timeseries: path = os.path.join(path, "timeseries_unified") - path = self.add_extension(path) + path = self.add_extension(path, OutputFormat.CSV) self.log(f"dumping unified timeseries to {os.path.abspath(path)}") - self._dump_timeseries_unified(path, self.timeseries) + self._dump_timeseries(path, self.timeseries) else: self.log("no timeseries records to dump", fg="red") @@ -85,74 +141,45 @@ def dump_timeseries_separated(self, path: str): # the individual site timeseries will be dumped timeseries_path = os.path.join(path, "timeseries") self._make_output_directory(timeseries_path) - for site, records in self.timeseries: - path = os.path.join(timeseries_path, str(site.id).replace(" ", "_")) - path = self.add_extension(path) - self.log(f"dumping {site.id} to {os.path.abspath(path)}") - 
self._write(path, records) + for records in self.timeseries: + site_id = records[0].id + path = os.path.join(timeseries_path, str(site_id).replace(" ", "_")) + path = self.add_extension(path, OutputFormat.CSV) + self.log(f"dumping {site_id} to {os.path.abspath(path)}") + + list_of_records = [records] + self._dump_timeseries(path, list_of_records) else: self.log("no timeseries records to dump", fg="red") - def save(self, path: str): - if self.records: - path = self.add_extension(path) - self.log(f"saving to {path}") - self._write(path, self.records) - else: - self.log("no records to save", fg="red") - - def add_extension(self, path: str): - if not self.extension: + def add_extension(self, path: str, extension: OutputFormat): + if not extension: raise NotImplementedError + else: + ext = extension - if not path.endswith(self.extension): - path = f"{path}.{self.extension}" + if not path.endswith(ext): + path = f"{path}.{ext}" return path - def _write(self, path: str, records): - raise NotImplementedError + def _dump_sites_summary( + self, path: str, records: list, output_format: OutputFormat + ): + dump_sites_summary(path, records, output_format) - def _dump_timeseries_unified(self, path: str, timeseries: list): - raise NotImplementedError + def _dump_timeseries(self, path: str, timeseries: list): + dump_timeseries(path, timeseries) def _make_output_directory(self, output_directory: str): os.mkdir(output_directory) -def write_file(path, func, records): - with open(path, "w", newline="") as f: - func(csv.writer(f), records) - - -def write_memory(path, func, records): - f = io.StringIO() - func(csv.writer(f), records) - return f.getvalue() - - -def dump_timeseries_unified(writer, timeseries): - headers_have_not_been_written = True - for i, (site, records) in enumerate(timeseries): - for j, record in enumerate(records): - if i == 0 and headers_have_not_been_written: - writer.writerow(record.keys) - headers_have_not_been_written = False - writer.writerow(record.to_row()) - 
- -def dump_sites(writer, records): - for i, site in enumerate(records): - if i == 0: - writer.writerow(site.keys) - writer.writerow(site.to_row()) - - class CloudStoragePersister(BasePersister): extension = "csv" _content: list - def __init__(self): - super(CloudStoragePersister, self).__init__() + def __init__(self, *args, **kwargs): + super(CloudStoragePersister, self).__init__(*args, **kwargs) self._content = [] def finalize(self, output_name: str): @@ -177,46 +204,37 @@ def finalize(self, output_name: str): blob.upload_from_string(zip_buffer.getvalue()) else: path, cnt = self._content[0] + + # this is a hack. need a better way to specify the output path + dirname = os.path.basename(os.path.dirname(path)) + path = os.path.join(dirname, os.path.basename(path)) + blob = bucket.blob(path) - blob.upload_from_string(cnt) + blob.upload_from_string( + cnt, + content_type=( + "application/json" + if self.config.output_format == OutputFormat.GEOJSON + else "text/csv" + ), + ) def _make_output_directory(self, output_directory: str): # prevent making root directory, because we are not saving to disk pass - def _write(self, path: str, records: list): - content = write_memory(path, dump_sites, records) - self._add_content(path, content) - def _add_content(self, path: str, content: str): self._content.append((path, content)) - def _dump_timeseries_unified(self, path: str, timeseries: list): - content = write_memory(path, dump_timeseries_unified, timeseries) + def _dump_sites_summary( + self, path: str, records: list, output_format: OutputFormat + ): + content = write_memory(dump_sites_summary, records, output_format) self._add_content(path, content) - -class CSVPersister(BasePersister): - extension = "csv" - - def _write(self, path: str, records: list): - write_file(path, dump_sites, records) - def _dump_timeseries_unified(self, path: str, timeseries: list): - write_file(path, dump_timeseries_unified, timeseries) - - -class GeoJSONPersister(BasePersister): - extension = 
"geojson" - - def _write(self, path: str, records: list): - r0 = records[0] - df = pd.DataFrame([r.to_row() for r in records], columns=r0.keys) - - gdf = gpd.GeoDataFrame( - df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs="EPSG:4326" - ) - gdf.to_file(path, driver="GeoJSON") + content = write_memory(path, dump_timeseries, timeseries) + self._add_content(path, content) # class ST2Persister(BasePersister): diff --git a/backend/persisters/__init__.py b/backend/persisters/__init__.py new file mode 100644 index 0000000..28a0970 --- /dev/null +++ b/backend/persisters/__init__.py @@ -0,0 +1,10 @@ +# =============================================================================== +# Author: Jake Ross +# Copyright 2025 New Mexico Bureau of Geology & Mineral Resources +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 +# =============================================================================== + + +# ============= EOF ============================================= diff --git a/backend/persisters/geoserver.py b/backend/persisters/geoserver.py new file mode 100644 index 0000000..d8c07fc --- /dev/null +++ b/backend/persisters/geoserver.py @@ -0,0 +1,369 @@ +# =============================================================================== +# Author: Jake Ross +# Copyright 2025 New Mexico Bureau of Geology & Mineral Resources +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 +# =============================================================================== +import json +import os +import time +from itertools import groupby +from typing import Type +from shapely.geometry.multipoint import MultiPoint +from shapely.geometry.point import Point +from sqlalchemy.dialects.postgresql import JSONB, insert +from sqlalchemy.orm import declarative_base, sessionmaker, relationship, Mapped + + +from backend.persister import BasePersister + +from sqlalchemy import ( + Column, + ForeignKey, + create_engine, + UUID, + String, + Integer, + Float, + Date, + Time, +) +from geoalchemy2 import Geometry + +Base = declarative_base() + + +def session_factory(connection: dict): + user = connection.get("user", "postgres") + password = connection.get("password", "") + host = connection.get("host", "localhost") + port = connection.get("port", 5432) + database = connection.get("dbname", "gis") + + url = f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}" + engine = create_engine(url) + SessionFactory = sessionmaker(autocommit=False, autoflush=False, bind=engine) + return SessionFactory + + +class Location(Base): + __tablename__ = "tbl_location" + + id = Column(Integer, primary_key=True, index=True) + name = Column(String) + data_source_uid = Column(String, index=True) + + properties = Column(JSONB) + geometry = Column(Geometry(geometry_type="POINT", srid=4326)) + source_slug = Column(String, ForeignKey("tbl_sources.name")) + + source: Mapped["Sources"] = relationship( + "Sources", backref="locations", uselist=False + ) + + +class Summary(Base): + __tablename__ = "tbl_summary" + + id = Column(Integer, primary_key=True, index=True) + name = Column(String) + data_source_uid = Column(String, index=True) + + properties = Column(JSONB) + geometry = Column(Geometry(geometry_type="POINT", srid=4326)) + source_slug = Column(String, ForeignKey("tbl_sources.name")) + 
parameter_slug = Column(String, ForeignKey("tbl_parameters.name")) + + source: Mapped["Sources"] = relationship( + "Sources", backref="summaries", uselist=False + ) + + value = Column(Float) + nrecords = Column(Integer) + min = Column(Float) + max = Column(Float) + mean = Column(Float) + + latest_value = Column(Float) + latest_date = Column(Date) + latest_time = Column(Time) + + earliest_value = Column(Float) + earliest_date = Column(Date) + earliest_time = Column(Time) + + +class Parameters(Base): + __tablename__ = "tbl_parameters" + name = Column(String, primary_key=True, index=True) + units = Column(String) + + +class Sources(Base): + __tablename__ = "tbl_sources" + id = Column(Integer) + name = Column(String, primary_key=True, index=True) + convex_hull = Column(Geometry(geometry_type="POLYGON", srid=4326)) + + +class GeoServerPersister(BasePersister): + def __init__(self, *args, **kwargs): + super(GeoServerPersister, self).__init__(*args, **kwargs) + self._connection = None + self._connect() + + def dump_sites(self, path: str): + if self.sites: + db = self.config.get("geoserver").get("db") + dbname = db.get("db_name") + self.log(f"dumping sites to {dbname}") + self._write_to_sites(self.sites) + else: + self.log("no sites to dump", fg="red") + + def dump_summary(self, path: str): + if self.records: + db = self.config.get("geoserver").get("db") + dbname = db.get("db_name") + self.log(f"dumping summary to {dbname}") + self._write_to_summary(self.records) + else: + self.log("no records to dump", fg="red") + + def _connect(self): + """ + Connect to a PostgreSQL database on Cloud SQL. 
+ """ + sf = session_factory(self.config.get("geoserver").get("db")) + self._connection = sf() + + def _write_sources(self, records: list): + sources = {r.source for r in records} + with self._connection as conn: + sql = ( + insert(Sources) + .values([{"name": source} for source in sources]) + .on_conflict_do_nothing( + index_elements=[Sources.name], + ) + ) + conn.execute(sql) + conn.commit() + + def _write_sources_with_convex_hull(self, records: list): + # sources = {r.source for r in records} + with self._connection as conn: + + def key(r): + return str(r.source) + + records = sorted(records, key=key) + for source_name, group in groupby(records, key=key): + source_records = list(group) + # calculate convex hull for the source from the records + + # Create a MultiPoint object + points = MultiPoint( + [ + Point(record.longitude, record.latitude) + for record in source_records + ] + ) + + # Calculate the convex hull + sinsert = insert(Sources) + print("Writing source", source_name, points.convex_hull) + sql = sinsert.values( + [{"name": source_name, "convex_hull": points.convex_hull.wkt}] + ).on_conflict_do_update( + index_elements=[Sources.name], + set_={"convex_hull": sinsert.excluded.convex_hull}, + ) + # sql = insert(Sources).values([{"name": source,} for source in sources]).on_conflict_do_nothing( + # index_elements=[Sources.name],) + conn.execute(sql) + conn.commit() + + def _write_parameters(self): + with self._connection as conn: + sql = ( + insert(Parameters) + .values( + [ + { + "name": self.config.parameter, + "units": self.config.analyte_output_units, + } + ] + ) + .on_conflict_do_nothing( + index_elements=[Parameters.name], + ) + ) + print(sql) + conn.execute(sql) + conn.commit() + + def _write_to_summary(self, records: list): + self._write_sources(records) + self._write_parameters() + for r in records: + print(r, [r.to_dict()]) + keys = [ + "usgs_site_id", + "alternate_site_id", + "formation", + "aquifer", + "well_depth", + ] + + def make_stmt(chunk): 
+ values = [ + { + "name": record.location, + "data_source_uid": record.id, + "properties": record.to_dict(keys), + "geometry": f"SRID=4326;POINT({record.longitude} {record.latitude})", + "source_slug": record.source, + "parameter_slug": self.config.parameter, + "nrecords": record.nrecords, + "min": record.min, + "max": record.max, + "mean": record.mean, + "latest_value": record.latest_value, + "latest_date": record.latest_date, + "latest_time": record.latest_time if record.latest_time else None, + "earliest_value": record.earliest_value, + "earliest_date": record.earliest_date, + "earliest_time": ( + record.earliest_time if record.earliest_time else None + ), + } + for record in chunk + ] + + linsert = insert(Summary) + return linsert.values(values).on_conflict_do_update( + index_elements=[Summary.data_source_uid], + set_={"properties": linsert.excluded.properties}, + ) + + self._chunk_insert(make_stmt, records) + + def _chunk_insert(self, make_stmt, records: list, chunk_size: int = 10): + for i in range(0, len(records), chunk_size): + chunk = records[i : i + chunk_size] + print( + f"Writing chunk {i // chunk_size + 1} of {len(records) // chunk_size + 1}" + ) + st = time.time() + + stmt = make_stmt(chunk) + with self._connection as conn: + conn.execute(stmt) + conn.commit() + + print("Chunk write time:", time.time() - st) + + def _write_to_sites(self, records: list): + """ + Write records to a PostgreSQL database in optimized chunks. 
+ """ + + self._write_sources_with_convex_hull(records) + + keys = [ + "usgs_site_id", + "alternate_site_id", + "formation", + "aquifer", + "well_depth", + ] + chunk_size = 1000 # Larger chunk size for fewer commits + + def make_stmt(chunk): + values = [ + { + "name": record.location, + "data_source_uid": record.id, + "properties": record.to_dict(keys), + "geometry": f"SRID=4326;POINT({record.longitude} {record.latitude})", + "source_slug": record.source, + } + for record in chunk + ] + linsert = insert(Location) + stmt = linsert.values(values).on_conflict_do_update( + index_elements=[Location.data_source_uid], + set_={"properties": linsert.excluded.properties}, + ) + return stmt + + self._chunk_insert(make_stmt, records, chunk_size) + + # + # newrecords = [] + # records = sorted(records, key=lambda r: str(r.id)) + # for name, gs in groupby(records, lambda r: str(r.id)): + # gs = list(gs) + # n = len(gs) + # # print(f"Writing {n} records for {name}") + # if n>1: + # if n > len({r.source for r in gs}): + # print("Duplicate source name found. Skipping...", name, [(r.name, r.source) for r in gs]) + # continue + # newrecords.extend(gs) + # # break + # # pass + # # print("Duplicate source name found. 
Skipping...", name, [r.source for r in gs]) + # # break + # + # + # for i in range(0, len(newrecords), chunk_size): + # chunk = newrecords[i:i + chunk_size] + # print(f"Writing chunk {i // chunk_size + 1} of {len(records) // chunk_size + 1}") + # st = time.time() + # + # values = [ + # { + # "name": record.name, + # "data_source_uid": record.id, + # "properties": record.to_dict(keys), + # "geometry": f"SRID=4326;POINT({record.longitude} {record.latitude})", + # "source_slug": record.source, + # } + # for record in chunk + # ] + # + # # stmt = insert(Location).values(values).on_conflict_do_nothing() + # linsert = insert(Location) + # stmt = linsert.values(values).on_conflict_do_update( + # index_elements=[Location.data_source_uid], + # set_={"properties": linsert.excluded.properties} + # ) + # + # with self._connection as conn: + # conn.execute(stmt) + # conn.commit() + # + # print('Chunk write time:', time.time() - st) + + # # Pre-serialize properties to reduce processing time + # values = [ + # (record.name, json.dumps(record.to_dict(keys)), record.longitude, record.latitude, record.source) + # for record in chunk + # ] + # + # with self._connection.cursor() as cursor: + # sql = """INSERT INTO public.tbl_location (name, properties, geometry, source_slug) + # VALUES (%s, %s, public.ST_SetSRID(public.ST_MakePoint(%s, %s), 4326), %s) + # ON CONFLICT (name) DO UPDATE SET properties = EXCLUDED.properties;""" + # cursor.executemany(sql, values) + # + # self._connection.commit() # Commit once per chunk + # print('Chunk write time:', time.time() - st) + # break + + +# ============= EOF ============================================= diff --git a/backend/record.py b/backend/record.py index 8772edd..ac8a9f9 100644 --- a/backend/record.py +++ b/backend/record.py @@ -31,44 +31,51 @@ def to_csv(self): def __init__(self, payload): self._payload = payload - def to_row(self): - - def get(attr): - # v = self._payload.get(attr) - # if v is None and self.defaults: - # v = 
self.defaults.get(attr) - v = self.__getattr__(attr) - - field_sigfigs = [ - ("elevation", 2), - ("well_depth", 2), - ("latitude", 6), - ("longitude", 6), - ("min", 2), - ("max", 2), - ("mean", 2), - ] - - # both analyte and water level tables have the same fields, but the - # rounding should only occur for water level tables - if isinstance(self, WaterLevelRecord): - field_sigfigs.append((PARAMETER_VALUE, 2)) - - for field, sigfigs in field_sigfigs: - if v is not None and field == attr: - try: - v = round(v, sigfigs) - except TypeError as e: - print(field, attr) - raise e - break - return v - - return [get(k) for k in self.keys] + def to_row(self, keys=None): + if keys is None: + keys = self.keys + + return [self._get_sigfig_formatted_value(k) for k in keys] + + def to_dict(self, keys=None): + if keys is None: + keys = self.keys + return {k: self._get_sigfig_formatted_value(k) for k in keys} def update(self, **kw): self._payload.update(kw) + def _get_sigfig_formatted_value(self, attr): + # v = self._payload.get(attr) + # if v is None and self.defaults: + # v = self.defaults.get(attr) + v = self.__getattr__(attr) + + field_sigfigs = [ + ("elevation", 2), + ("well_depth", 2), + ("latitude", 6), + ("longitude", 6), + ("min", 2), + ("max", 2), + ("mean", 2), + ] + + # both analyte and water level tables have the same fields, but the + # rounding should only occur for water level tables + if isinstance(self, WaterLevelRecord): + field_sigfigs.append((PARAMETER_VALUE, 2)) + + for field, sigfigs in field_sigfigs: + if v is not None and field == attr: + try: + v = round(v, sigfigs) + except TypeError as e: + print(field, attr) + raise e + break + return v + def __getattr__(self, attr): v = self._payload.get(attr) if v is None and self.defaults: @@ -110,7 +117,7 @@ class SummaryRecord(BaseRecord): keys: tuple = ( "source", "id", - "location", + "name", "usgs_site_id", "alternate_site_id", "latitude", @@ -126,10 +133,14 @@ class SummaryRecord(BaseRecord): "min", "max", 
"mean", - "most_recent_date", - "most_recent_time", - "most_recent_value", - "most_recent_units", + "earliest_date", + "earliest_time", + "earliest_value", + "earliest_units", + "latest_date", + "latest_time", + "latest_value", + "latest_units", ) defaults: dict = {} @@ -158,6 +169,7 @@ class SiteRecord(BaseRecord): "formation", "aquifer", "well_depth", + "well_depth_units", ) defaults: dict = { @@ -175,6 +187,7 @@ class SiteRecord(BaseRecord): "formation": "", "aquifer": "", "well_depth": None, + "well_depth_units": FEET, } diff --git a/backend/source.py b/backend/source.py index 457006b..5189258 100644 --- a/backend/source.py +++ b/backend/source.py @@ -15,26 +15,21 @@ # =============================================================================== from json import JSONDecodeError -import click import httpx import shapely.wkt from shapely import MultiPoint -from typing import Union, List +from typing import Union, List, Callable, Dict from backend.constants import ( - MILLIGRAMS_PER_LITER, FEET, - METERS, - PARTS_PER_MILLION, - DTW, - DTW_UNITS, DT_MEASURED, PARAMETER_NAME, PARAMETER_UNITS, PARAMETER_VALUE, + EARLIEST, + LATEST, ) -from backend.logging import Loggable -from backend.persister import BasePersister, CSVPersister +from backend.logger import Loggable from backend.record import ( AnalyteRecord, AnalyteSummaryRecord, @@ -45,7 +40,7 @@ from backend.transformer import BaseTransformer, convert_units -def make_site_list(site_record: list | dict) -> list | str: +def make_site_list(site_record: list[SiteRecord] | SiteRecord) -> list | str: """ Returns a list of site ids, as defined by site_record @@ -65,7 +60,7 @@ def make_site_list(site_record: list | dict) -> list | str: return sites -def get_most_recent(records: list, tag: Union[str, callable]) -> dict: +def get_terminal_record(records: list, tag: Union[str, Callable], bookend: str) -> dict: """ Returns the most recent record based on the tag @@ -77,6 +72,9 @@ def get_most_recent(records: list, tag: 
Union[str, callable]) -> dict: tag: str or callable the tag to use to sort the records + bookend: str + determines if the earliest or lastest record is retrieved + Returns ------- dict @@ -97,7 +95,14 @@ def func(x): def func(x): return x[tag] - return sorted(records, key=func)[-1] + if bookend == EARLIEST: + return sorted(records, key=func)[0] + elif bookend == LATEST: + return sorted(records, key=func)[-1] + else: + raise ValueError( + f"Invalid bookend {bookend}. Must be either {EARLIEST} or {LATEST}" + ) def get_analyte_search_param(parameter: str, mapping: dict) -> str: @@ -170,11 +175,9 @@ class BaseSource(Loggable): """ transformer_klass = BaseTransformer - config = None - def __init__(self, config=None): + def __init__(self): self.transformer = self.transformer_klass() - self.set_config(config) super().__init__() @property @@ -183,7 +186,7 @@ def tag(self): def set_config(self, config): self.config = config - self.transformer.config = config + self.transformer.set_config(config) def check(self, *args, **kw): return True @@ -197,44 +200,7 @@ def discover(self, *args, **kw): # Methods Already Implemented # ========================================================================== - # def warn(self, msg): - # """ - # Prints warning messages to the console in red - # - # Parameters - # ---------- - # msg : str - # the message to print - # - # Returns - # ------- - # None - # """ - # s = self.log(msg, fg="red") - # self.config.warnings.append(s) - - # def log(self, msg, fg="yellow"): - # """ - # Prints the message to the console in yellow - # - # Parameters - # ---------- - # msg : str - # the message to print - # - # fg : str - # the color of the message, defaults to yellow - # - # Returns - # ------- - # None - # """ - # s = f"{self.__class__.__name__:25s} -- {msg}" - # click.secho(s, fg=fg) - # self.config.logs.append(s) - # return s - - def _execute_text_request(self, url: str, params=None, **kw) -> str: + def _execute_text_request(self, url: str, params: 
dict | None = None, **kw) -> str: """ Executes a get request to the provided url and returns the text response. @@ -264,8 +230,8 @@ def _execute_text_request(self, url: str, params=None, **kw) -> str: return "" def _execute_json_request( - self, url: str, params: dict = None, tag: str = None, **kw - ) -> dict: + self, url: str, params: dict | None = None, tag: str | None = None, **kw + ) -> dict | None: """ Executes a get request to the provided url and returns the json response. @@ -285,7 +251,6 @@ def _execute_json_request( dict the json response """ - # print(url) resp = httpx.get(url, params=params, **kw) if tag is None: tag = "data" @@ -298,17 +263,18 @@ def _execute_json_request( return obj except JSONDecodeError: self.warn(f"service responded but with no data. \n{resp.text}") - return [] + return None else: self.warn(f"service responded with status {resp.status_code}") self.warn(f"service responded with text {resp.text}") - return [] + self.warn(f"service at url: {resp.url}") + return None # ========================================================================== # Methods Implemented in BaseSiteSource and BaseParameterSource # ========================================================================== - def read(self, *args, **kw) -> list: + def read(self, *args, **kw) -> list | None: """ Returns the records. Implemented in BaseSiteSource and BaseAnalyteSource """ @@ -318,7 +284,7 @@ def read(self, *args, **kw) -> list: # Methods That Need to be Implemented For Each Source # ========================================================================== - def get_records(self, *args, **kw) -> dict: + def get_records(self, *args, **kw) -> List[Dict]: """ Returns records as a dictionary, where the keys are site ids and the values are site or parameter records. 
@@ -466,7 +432,7 @@ def intersects(self, wkt: str) -> bool: return True - def read(self, *args, **kw) -> List[SiteRecord]: + def read(self, *args, **kw) -> List[SiteRecord] | None: """ Returns a list of transformed site records. Calls self.get_records, which needs to be implemented for each source @@ -483,6 +449,7 @@ def read(self, *args, **kw) -> List[SiteRecord]: return self._transform_sites(records) else: self.warn("No site records returned") + return None def _transform_sites(self, records: list) -> List[SiteRecord]: """ @@ -508,7 +475,7 @@ def _transform_sites(self, records: list) -> List[SiteRecord]: self.log(f"processed nrecords={len(transformed_records)}") return transformed_records - def chunks(self, records: list, chunk_size: int = None) -> list: + def chunks(self, records: list, chunk_size: int | None = None) -> list: """ Returns a list of records split into lists of size chunk_size. If chunk_size less than 1 then the records are not split @@ -547,6 +514,14 @@ class BaseParameterSource(BaseSource): Methods With Universal Implementations (Already Implemented) ============================================================================ + _extract_earliest_record + Returns the earliest record for a particular site. Requires _extract_terminal_record + to be implemented for each source + + _extract_latest_record + Returns the most recent record for a particular site. Requires _extract_terminal_record + to be implemented for each source + read Reads the parameter records and returns the transformed records, where the transform standardizes the records so the format is the same for all sources @@ -573,8 +548,9 @@ class BaseParameterSource(BaseSource): _extract_site_records Returns all records for a single site as a list of records - _extract_most_recent - Returns the most recent record + _extract_terminal_record + Returns the terminal record for a particular site. This is only used for + summary, not time series, outputs. 
_clean_records (optional) Returns cleaned records if this function is defined for each source. @@ -600,14 +576,53 @@ class BaseParameterSource(BaseSource): # Methods Already Implemented # ========================================================================== + def _extract_earliest_record(self, records: list) -> dict: + """ + Returns the earliest record for a particular site + + Parameters + ---------- + records : list + a list of records + + Returns + ------- + dict + the earliest record + """ + return self._extract_terminal_record(records, bookend=EARLIEST) + + def _extract_latest_record(self, records: list) -> dict: + """ + Returns the most recent record for a particular site + + Parameters + ---------- + records : list + a list of records + + Returns + ------- + dict + the most recent record + """ + return self._extract_terminal_record(records, bookend=LATEST) + def read( - self, site_record: SiteRecord, use_summarize: bool, start_ind: int, end_ind: int - ) -> List[ - AnalyteRecord - | AnalyteSummaryRecord - | WaterLevelRecord - | WaterLevelSummaryRecord - ]: + self, + site_record: SiteRecord | list, + use_summarize: bool, + start_ind: int, + end_ind: int, + ) -> ( + List[ + AnalyteRecord + | AnalyteSummaryRecord + | WaterLevelRecord + | WaterLevelSummaryRecord + ] + | None + ): """ Returns a list of transformed parameter records. Transformed parameter records are standardized so that all of the records have the same format. They are @@ -651,7 +666,6 @@ def read( if not site_records: self.warn(f"{site.id}: No records found") continue - # get cleaned records if _clean_records is defined by the source. 
This usually removes Nones/Null cleaned = self._clean_records(site_records) if not cleaned: @@ -688,9 +702,6 @@ def read( else: msg = f"{warning_msg} for {site.id}" self.warn(msg) - skipped_items.append( - (site.id, source_result, source_unit) - ) except TypeError: skipped_items.append((site.id, source_result, source_unit)) except ValueError: @@ -705,20 +716,29 @@ def read( if kept_items is not None and len(kept_items): n = len(kept_items) - most_recent_result = self._extract_most_recent(cleaned) - if not most_recent_result: + earliest_result = self._extract_earliest_record(cleaned) + latest_result = self._extract_latest_record(cleaned) + if not latest_result: continue rec = { "nrecords": n, "min": min(kept_items), "max": max(kept_items), "mean": sum(kept_items) / n, - "most_recent_datetime": most_recent_result["datetime"], - "most_recent_value": most_recent_result["value"], - "most_recent_source_units": most_recent_result[ + "earliest_datetime": earliest_result["datetime"], + "earliest_value": earliest_result["value"], + "earliest_source_units": earliest_result[ + "source_parameter_units" + ], + "earliest_source_name": earliest_result[ + "source_parameter_name" + ], + "latest_datetime": latest_result["datetime"], + "latest_value": latest_result["value"], + "latest_source_units": latest_result[ "source_parameter_units" ], - "most_recent_source_name": most_recent_result[ + "latest_source_name": latest_result[ "source_parameter_name" ], } @@ -755,6 +775,7 @@ def read( name = ",".join(names) self.warn(f"{name}: No records found") + return None # ========================================================================== # Methods Implemented in BaseAnalyteSource and BaseWaterLevelSource @@ -803,7 +824,7 @@ def _get_output_units(self) -> str: # Methods That Need to be Implemented For Each Source # ========================================================================== - def _extract_site_records(self, records: dict, site_record: dict) -> list: + def 
_extract_site_records(self, records: list[dict], site_record) -> list: """ Returns all records for a single site as a list of records (which are dictionaries). @@ -845,22 +866,25 @@ def _clean_records(self, records: list) -> list: """ return records - def _extract_most_recent(self, records: list) -> dict: + def _extract_terminal_record(self, records, bookend): """ - Returns the most recent record for a particular site + Returns the terminal record for a particular site Parameters ---------- records : list a list of records + bookend : str + determines if the first or last record is retrieved + Returns ------- dict - the most recent record + the most recent record for every site """ raise NotImplementedError( - f"{self.__class__.__name__} Must implement _extract_most_recent" + f"{self.__class__.__name__} Must implement _extract_terminal_record" ) def _extract_source_parameter_units(self, records: list) -> list: diff --git a/backend/transformer.py b/backend/transformer.py index 232bd16..cb3afe5 100644 --- a/backend/transformer.py +++ b/backend/transformer.py @@ -20,6 +20,7 @@ import shapely from shapely import Point +from backend.bounding_polygons import NM_BOUNDARY_BUFFERED from backend.constants import ( MILLIGRAMS_PER_LITER, PARTS_PER_MILLION, @@ -30,9 +31,11 @@ MICROGRAMS_PER_LITER, DT_MEASURED, DTW, + EARLIEST, + LATEST, ) from backend.geo_utils import datum_transform, ALLOWED_DATUMS -from backend.logging import Loggable +from backend.logger import Loggable from backend.record import ( WaterLevelSummaryRecord, WaterLevelRecord, @@ -128,8 +131,8 @@ def convert_units( output_units: str, source_parameter_name: str, die_parameter_name: str, - dt: str = None, -) -> tuple[float, float, str]: + dt: str | None = None, +) -> tuple[float, float | None, str]: """ Converts the following units for any parameter value: @@ -195,7 +198,7 @@ def convert_units( the source_parameter_name (e.g. nitrate as n). 
""" if die_parameter_name == "ph": - conversion_factor = 1 + conversion_factor = 1.0 elif output_units == mgl: if input_units in ["mg/l caco3", "mg/l caco3**"]: if die_parameter_name == "bicarbonate": @@ -207,7 +210,7 @@ def convert_units( elif input_units == "mg/l as n": conversion_factor = 4.427 elif input_units in ["mg/l asno3", "mg/l as no3"]: - conversion_factor = 1 + conversion_factor = 1.0 elif input_units == "ug/l as n": conversion_factor = 0.004427 elif input_units == "pci/l": @@ -217,22 +220,22 @@ def convert_units( elif input_units == tpaf: conversion_factor = 735.47 elif input_units == ppm: - conversion_factor = 1 + conversion_factor = 1.0 elif input_units == output_units: if source_parameter_name in ["nitrate as n", "nitrate (as n)"]: conversion_factor = 4.427 else: - conversion_factor = 1 + conversion_factor = 1.0 elif output_units == ft: if input_units in [m, "meters"]: conversion_factor = 3.28084 elif input_units in [ft, "feet"]: - conversion_factor = 1 + conversion_factor = 1.0 elif output_units == m: if input_units in [ft, "feet"]: conversion_factor = 0.3048 elif input_units in [m, "meters"]: - conversion_factor = 1 + conversion_factor = 1.0 if conversion_factor: return input_value * conversion_factor, conversion_factor, warning @@ -328,13 +331,25 @@ class BaseTransformer(Loggable): """ _cached_polygon = None - config = None + # config = None check_contained = True # ========================================================================== # Methods Already Implemented # ========================================================================== + def set_config(self, config): + """ + Sets the config for the transformer. Called in BaseSource.set_config() + to set the config for both the source and the transformer. 
+ + Parameters + -------- + config: Config + The config to set for the transformer + """ + self.config = config + def do_transform( self, inrecord: dict, *args, **kw ) -> ( @@ -344,6 +359,7 @@ def do_transform( | AnalyteSummaryRecord | WaterLevelSummaryRecord | SummaryRecord + | None ): """ Transforms a record, site or parameter, into a standardized format. @@ -390,55 +406,59 @@ def do_transform( """ # _transform needs to be implemented by each SiteTransformer # _transform is already implemented in each ParameterTransformer - record = self._transform(inrecord, *args, **kw) - if not record: - return + transformed_record = self._transform(inrecord, *args, **kw) + if not transformed_record: + return None # ensure that a site or summary record is contained within the boundaing polygon - if "longitude" in record and "latitude" in record: - if not self.contained(record["longitude"], record["latitude"]): + if "longitude" in transformed_record and "latitude" in transformed_record: + if not self.contained( + transformed_record["longitude"], transformed_record["latitude"] + ): self.warn( - f"Skipping site {record['id']}. It is not within the defined geographic bounds" + f"Skipping site {transformed_record['id']}. 
It is not within the defined geographic bounds" ) - return + return None - self._post_transform(record, *args, **kw) + self._post_transform(transformed_record, *args, **kw) # standardize datetime - dt = record.get(DT_MEASURED) + dt = transformed_record.get(DT_MEASURED) if dt: - d, t = standardize_datetime(dt, record["id"]) - record["date_measured"] = d - record["time_measured"] = t + d, t = standardize_datetime(dt, transformed_record["id"]) + transformed_record["date_measured"] = d + transformed_record["time_measured"] = t else: - mrd = record.get("most_recent_datetime") + mrd = transformed_record.get("latest_datetime") if mrd: - d, t = standardize_datetime(mrd, record["id"]) - record["date_measured"] = d - record["time_measured"] = t + d, t = standardize_datetime(mrd, transformed_record["id"]) + transformed_record["date_measured"] = d + transformed_record["time_measured"] = t # convert to proper record type # a record klass holds the original record's data as a dictionary, and has methods to update the record's data and get the record's data klass = self._get_record_klass() - record = klass(record) + klassed_record = klass(transformed_record) # update the record's geographic information and well data if it is a SiteRecord or SummaryRecord # transforms the horizontal datum and lon/lat coordinates to WGS84 # transforms the elevation and well depth units to the output unit specified in the config # transforms the well depth and well depth units to the output unit specified in the config - if isinstance(record, (SiteRecord, SummaryRecord)): - y = float(record.latitude) - x = float(record.longitude) + if isinstance(klassed_record, (SiteRecord, SummaryRecord)): + y = float(klassed_record.latitude) + x = float(klassed_record.longitude) if x == 0 or y == 0: - self.warn(f"Skipping site {record.id}. Latitude or Longitude is 0") + self.warn( + f"Skipping site {klassed_record.id}. 
Latitude or Longitude is 0" + ) return None - input_horizontal_datum = record.horizontal_datum + input_horizontal_datum = klassed_record.horizontal_datum if input_horizontal_datum not in ALLOWED_DATUMS: self.warn( - f"Skipping site {record.id}. Datum {input_horizontal_datum} cannot be processed" + f"Skipping site {klassed_record.id}. Datum {input_horizontal_datum} cannot be processed" ) return None @@ -456,39 +476,46 @@ def do_transform( input_horizontal_datum, output_horizontal_datum, ) - record.update(latitude=lat) - record.update(longitude=lng) - record.update(horizontal_datum=datum) + + if not self.in_nm(lng, lat): + self.warn( + f"Skipping site {klassed_record.id}. Coordinates {x}, {y} with datum {input_horizontal_datum} are not within 25km of New Mexico" + ) + return None + + klassed_record.update(latitude=lat) + klassed_record.update(longitude=lng) + klassed_record.update(horizontal_datum=datum) elevation, elevation_unit = transform_length_units( - record.elevation, - record.elevation_units, + klassed_record.elevation, + klassed_record.elevation_units, output_elevation_units, ) - record.update(elevation=elevation) - record.update(elevation_units=elevation_unit) + klassed_record.update(elevation=elevation) + klassed_record.update(elevation_units=elevation_unit) well_depth, well_depth_unit = transform_length_units( - record.well_depth, - record.well_depth_units, + klassed_record.well_depth, + klassed_record.well_depth_units, well_depth_units, ) - record.update(well_depth=well_depth) - record.update(well_depth_units=well_depth_unit) + klassed_record.update(well_depth=well_depth) + klassed_record.update(well_depth_units=well_depth_unit) # update the units to the output unit for analyte records # this is done after converting the units to the output unit for the analyte records # convert the parameter value to the output unit specified in the config - elif isinstance(record, (AnalyteRecord, WaterLevelRecord)): - if isinstance(record, AnalyteRecord): + elif 
isinstance(klassed_record, (AnalyteRecord, WaterLevelRecord)): + if isinstance(klassed_record, AnalyteRecord): output_units = self.config.analyte_output_units else: output_units = self.config.waterlevel_output_units - source_result = record.parameter_value - source_unit = record.source_parameter_units - dt = record.date_measured - source_name = record.source_parameter_name + source_result = klassed_record.parameter_value + source_unit = klassed_record.source_parameter_units + dt = klassed_record.date_measured + source_name = klassed_record.source_parameter_name conversion_factor = None # conversion factor will remain None if record is kept for time series and cannot be converted, such as non-detects warning_msg = "" try: @@ -501,24 +528,47 @@ def do_transform( dt, ) if warning_msg != "": - msg = f"{warning_msg} for {record.id}" + msg = f"{warning_msg} for {klassed_record.id}" self.warn(msg) except TypeError: - msg = f"Keeping {source_result} for {record.id} on {record.date_measured} for time series data" + msg = f"Keeping {source_result} for {klassed_record.id} on {klassed_record.date_measured} for time series data" self.warn(msg) converted_result = source_result except ValueError: - msg = f"Keeping {source_result} for {record.id} on {record.date_measured} for time series data" + msg = f"Keeping {source_result} for {klassed_record.id} on {klassed_record.date_measured} for time series data" self.warn(msg) converted_result = source_result if warning_msg == "": - record.update(conversion_factor=conversion_factor) - record.update(parameter_value=converted_result) + klassed_record.update(conversion_factor=conversion_factor) + klassed_record.update(parameter_value=converted_result) else: - record = None + klassed_record = None + + return klassed_record + + def in_nm(self, lng: float | int | str, lat: float | int | str) -> bool: + """ + Returns True if the point is in New Mexico, otherwise returns False - return record + Parameters + -------- + lng: float | int | str + 
The longitude of the point + + lat: float | int | str + The latitude of the point + + Returns + -------- + bool + True if the point is in New Mexico, otherwise False + """ + point = Point(lng, lat) + if NM_BOUNDARY_BUFFERED.contains(point): + return True + else: + return False def contained( self, @@ -634,7 +684,7 @@ def _get_record_klass(self): class SiteTransformer(BaseTransformer): - def _get_record_klass(self) -> SiteRecord: + def _get_record_klass(self) -> type[SiteRecord]: """ Returns the SiteRecord class to use for the transformer for all site records @@ -663,12 +713,13 @@ def _transform(self, record, site_record): rec = {} if self.config.output_summary: - self._transform_most_recents(record, site_record.id) + self._transform_earliest_record(record, site_record.id) + self._transform_latest_record(record, site_record.id) parameter, units = self._get_parameter_name_and_units() rec.update( { - "location": site_record.name, + "name": site_record.name, "usgs_site_id": site_record.usgs_site_id, "alternate_site_id": site_record.alternate_site_id, "latitude": site_record.latitude, @@ -695,29 +746,66 @@ def _transform(self, record, site_record): rec.update(source_id) return rec - def _transform_most_recents(self, record, site_id): - # convert most_recents - dt, tt = standardize_datetime(record["most_recent_datetime"], site_id) - record["most_recent_date"] = dt - record["most_recent_time"] = tt - parameter_name, unit = self._get_parameter_name_and_units() + def _transform_terminal_record(self, record, site_id, bookend): + """ + Convert either the earliest or latest record to the standard format. + + Parameters + -------- + record: dict + The record to convert + + site_id: str + The site ID for the record - converted_most_recent_value, conversion_factor, warning_msg = convert_units( - record["most_recent_value"], - record["most_recent_source_units"], + bookend: str + The bookend of the record to convert. 
Either "earliest" or "latest" + """ + if bookend == EARLIEST: + datetime_key = "earliest_datetime" + date_key = "earliest_date" + time_key = "earliest_time" + value_key = "earliest_value" + unit_key = "earliest_units" + source_units_key = "earliest_source_units" + source_name_key = "earliest_source_name" + elif bookend == LATEST: + datetime_key = "latest_datetime" + date_key = "latest_date" + time_key = "latest_time" + value_key = "latest_value" + unit_key = "latest_units" + source_units_key = "latest_source_units" + source_name_key = "latest_source_name" + + dt, tt = standardize_datetime(record[datetime_key], site_id) + parameter_name, unit = self._get_parameter_name_and_units() + converted_value, conversion_factor, warning_msg = convert_units( + record[value_key], + record[source_units_key], unit, - record["most_recent_source_name"], + record[source_name_key], parameter_name, dt, ) # all failed conversions are skipped and handled in source.read(), so no need to duplicate here - record["most_recent_value"] = converted_most_recent_value - record["most_recent_units"] = unit + record[date_key] = dt + record[time_key] = tt + record[value_key] = converted_value + record[unit_key] = unit + + def _transform_earliest_record(self, record, site_id): + self._transform_terminal_record(record, site_id, EARLIEST) + + def _transform_latest_record(self, record, site_id): + self._transform_terminal_record(record, site_id, LATEST) class WaterLevelTransformer(ParameterTransformer): - def _get_record_klass(self) -> WaterLevelRecord | WaterLevelSummaryRecord: + def _get_record_klass( + self, + ) -> type[WaterLevelRecord] | type[WaterLevelSummaryRecord]: """ Returns the WaterLevelRecord class to use for the transformer for water level records if config.output_summary is False, otherwise @@ -746,7 +834,7 @@ def _get_parameter_name_and_units(self) -> tuple: class AnalyteTransformer(ParameterTransformer): - def _get_record_klass(self) -> AnalyteRecord | AnalyteSummaryRecord: + def 
_get_record_klass(self) -> type[AnalyteRecord] | type[AnalyteSummaryRecord]: """ Returns the AnalyteRecord class to use for the transformer for water level records if config.output_summary is False, otherwise diff --git a/backend/unifier.py b/backend/unifier.py index 9523da9..b070631 100644 --- a/backend/unifier.py +++ b/backend/unifier.py @@ -15,13 +15,15 @@ # =============================================================================== import shapely -from backend.config import Config, get_source -from backend.logging import setup_logging -from backend.persister import CSVPersister, GeoJSONPersister, CloudStoragePersister +from backend.config import Config, get_source, OutputFormat +from backend.logger import setup_logging +from backend.constants import WATERLEVELS +from backend.persister import BasePersister +from backend.persisters.geoserver import GeoServerPersister from backend.source import BaseSiteSource -def health_check(source: BaseSiteSource) -> bool: +def health_check(source: BaseSiteSource) -> bool | None: """ Determines if data can be returned from the source (if it is healthy) @@ -38,17 +40,8 @@ def health_check(source: BaseSiteSource) -> bool: source = get_source(source) if source: return bool(source.health()) - - -def unify_sites(config): - print("Unifying sites\n") - - # def func(config, persister): - # for source in config.site_sources(): - # s = source() - # persister.load(s.read(config)) - - # _unify_wrapper(config, func) + else: + return None def unify_analytes(config): @@ -74,34 +67,48 @@ def unify_waterlevels(config): return True -def _perister_factory(config): - """ - Determines the type of persister to use based on the configuration. 
The - persister types are: +def unify_sites(config): + print("Unifying sites only\n") - - CSVPersister - - CloudStoragePersister - - GeoJSONPersister + # config.report() -- report is done in cli.py, no need to do it twice + config.validate() - Parameters - ------- - config: Config - The configuration object + if not config.dry: + _unify_parameter(config, config.all_site_sources()) - Returns - ------- - Persister - The persister object to use - """ - persister_klass = CSVPersister - if config.use_cloud_storage: - persister_klass = CloudStoragePersister - elif config.use_csv: - persister_klass = CSVPersister - elif config.use_geojson: - persister_klass = GeoJSONPersister + return True + + +# def _perister_factory(config): +# """ +# Determines the type of persister to use based on the configuration. The +# persister types are: + +# - CSVPersister +# - CloudStoragePersister +# - GeoJSONPersister + +# Parameters +# ------- +# config: Config +# The configuration object + +# Returns +# ------- +# Persister +# The persister object to use +# """ +# persister_klass = CSVPersister +# if config.use_cloud_storage: +# persister_klass = CloudStoragePersister +# elif config.output_format == OutputFormat.CSV: +# persister_klass = CSVPersister +# elif config.output_format == OutputFormat.GEOJSON: +# persister_klass = GeoJSONPersister +# elif config.output_format == OutputFormat.GEOSERVER: +# persister_klass = GeoServerPersister - return persister_klass() +# return persister_klass(config) # def _unify_wrapper(config, func): @@ -133,42 +140,69 @@ def _site_wrapper(site_source, parameter_source, persister, config): return sites_with_records_count = 0 - start_ind = 1 + start_ind = 0 end_ind = 0 first_flag = True - for sites in site_source.chunks(sites): - if site_limit and sites_with_records_count == site_limit: - break - - if type(sites) == list: - if first_flag: - end_ind += len(sites) - first_flag = False + + if config.sites_only: + persister.sites.extend(sites) + else: + for 
site_records in site_source.chunks(sites): + if type(site_records) == list: + n = len(site_records) + if first_flag: + first_flag = False + else: + start_ind = end_ind + 1 + + end_ind += n + + if use_summarize: + summary_records = parameter_source.read( + site_records, use_summarize, start_ind, end_ind + ) + if summary_records: + persister.records.extend(summary_records) + sites_with_records_count += len(summary_records) + else: + continue else: - start_ind = end_ind + 1 - end_ind += len(sites) - - if use_summarize: - summary_records = parameter_source.read( - sites, use_summarize, start_ind, end_ind - ) - if summary_records: - persister.records.extend(summary_records) - else: - results = parameter_source.read( - sites, use_summarize, start_ind, end_ind - ) - # no records are returned if there is no site record for parameter - # or if the record isn't clean (doesn't have the correct fields) - # don't count these sites to apply to site_limit - if results is None or len(results) == 0: - continue - - for site, records in results: - persister.timeseries.append((site, records)) - persister.sites.append(site) - - sites_with_records_count += 1 + results = parameter_source.read( + site_records, use_summarize, start_ind, end_ind + ) + # no records are returned if there is no site record for parameter + # or if the record isn't clean (doesn't have the correct fields) + # don't count these sites to apply to site_limit + if results is None or len(results) == 0: + continue + else: + sites_with_records_count += len(results) + + for site, records in results: + persister.timeseries.append(records) + persister.sites.append(site) + + if site_limit: + if sites_with_records_count >= site_limit: + # remove any extra sites that were gathered. 
removes 0 if site_limit is not exceeded
+                        num_sites_to_remove = sites_with_records_count - site_limit
+
+                        # if sites_with_records_count == site_limit then num_sites_to_remove = 0
+                        # and calling list[:0] will return an empty list, so subtract
+                        # num_sites_to_remove from the length of the list
+                        # to remove the last num_sites_to_remove sites
+                        if use_summarize:
+                            persister.records = persister.records[
+                                : len(persister.records) - num_sites_to_remove
+                            ]
+                        else:
+                            persister.timeseries = persister.timeseries[
+                                : len(persister.timeseries) - num_sites_to_remove
+                            ]
+                            persister.sites = persister.sites[
+                                : len(persister.sites) - num_sites_to_remove
+                            ]
+                    break
     except BaseException:
         import traceback
@@ -182,15 +216,27 @@ def _unify_parameter(
     config,
     sources,
 ):
-    persister = _perister_factory(config)
+
+    if config.output_format == OutputFormat.GEOSERVER:
+        persister = GeoServerPersister(config)
+    else:
+        persister = BasePersister(config)
+
     for site_source, parameter_source in sources:
-        _site_wrapper(site_source, parameter_source, persister, config)
+        _site_wrapper(
+            site_source,
+            parameter_source,
+            persister,
+            config,
+        )
 
     if config.output_summary:
         persister.dump_summary(config.output_path)
     elif config.output_timeseries_unified:
         persister.dump_timeseries_unified(config.output_path)
         persister.dump_sites(config.output_path)
+    elif config.sites_only:
+        persister.dump_sites(config.output_path)
     else:  # config.output_timeseries_separated
         persister.dump_timeseries_separated(config.output_path)
         persister.dump_sites(config.output_path)
@@ -244,7 +290,7 @@ def get_sources(config=None):
         config = Config()
 
     sources = []
-    if config.parameter.lower() == "waterlevels":
+    if config.parameter == WATERLEVELS:
         allsources = config.water_level_sources()
     else:
         allsources = config.analyte_sources()
@@ -297,9 +343,12 @@ def waterlevel_unification_test():
    cfg.use_source_nwis = False
    cfg.use_source_nmbgmr = False
    cfg.use_source_iscsevenrivers = False
-    # cfg.use_source_pvacd = False
-    
cfg.use_source_oseroswell = False + cfg.use_source_pvacd = False + # cfg.use_source_oseroswell = False cfg.use_source_bernco = False + cfg.use_source_iscsevenrivers = False + cfg.use_source_nmose_isc_seven_rivers = False + cfg.use_source_ebid = False # cfg.site_limit = 10 unify_waterlevels(cfg) @@ -322,16 +371,17 @@ def get_datastreams(): print(si, si.id, ds["@iot.id"]) -if __name__ == "__main__": - # test_waterlevel_unification() - # root = logging.getLogger() - # root.setLevel(logging.DEBUG) - # shandler = logging.StreamHandler() - # get_sources(Config()) - setup_logging() - waterlevel_unification_test() - # analyte_unification_test() - # print(health_check("nwis")) - # generate_site_bounds() +# if __name__ == "__main__": +# test_waterlevel_unification() +# root = logging.getLogger() +# root.setLevel(logging.DEBUG) +# shandler = logging.StreamHandler() +# get_sources(Config()) +# setup_logging() +# site_unification_test() +# waterlevel_unification_test() +# analyte_unification_test() +# print(health_check("nwis")) +# generate_site_bounds() # ============= EOF ============================================= diff --git a/frontend/cli.py b/frontend/cli.py index e03ac0b..879e5d3 100644 --- a/frontend/cli.py +++ b/frontend/cli.py @@ -17,11 +17,13 @@ import click +from backend import OutputFormat from backend.config import Config from backend.constants import PARAMETER_OPTIONS from backend.unifier import unify_sites, unify_waterlevels, unify_analytes -from backend.logging import setup_logging +from backend.logger import setup_logging + # setup_logging() @@ -81,6 +83,13 @@ def cli(): show_default=True, help="Exclude NMOSE ISC Seven Rivers data. Default is to include", ), + click.option( + "--no-nmose-pod", + is_flag=True, + default=True, + show_default=True, + help="Exclude NMOSE POD data. 
Default is to include", + ), click.option( "--no-nmose-roswell", is_flag=True, @@ -122,6 +131,11 @@ def cli(): default="", help="New Mexico county name", ), + click.option( + "--wkt", + default="", + help="Well known text (WKT) representation of a geometry. For example, 'POLYGON((x1 y1, x2 y2, x3 y3, x1 y1))'", + ), ] DEBUG_OPTIONS = [ click.option( @@ -136,6 +150,12 @@ def cli(): default=False, help="Dry run. Do not execute unifier. Used by unit tests", ), + click.option( + "--yes", + is_flag=True, + default=False, + help="Do not ask for confirmation before running", + ), ] DT_OPTIONS = [ @@ -150,33 +170,42 @@ def cli(): help="End date in the form 'YYYY', 'YYYY-MM', 'YYYY-MM-DD', 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS'", ), ] - -TIMESERIES_OPTIONS = [ +OUTPUT_TYPE_OPTIONS = [ click.option( - "--separated_timeseries", - is_flag=True, - default=False, - show_default=True, - help="Output separate timeseries files for every site", + "--output-type", + type=click.Choice(["summary", "timeseries_unified", "timeseries_separated"]), + required=True, + help="Output summary file, single unified timeseries file, or separated timeseries files", ), +] + +OUTPUT_DIR_OPTIONS = [ click.option( - "--unified_timeseries", - is_flag=True, - default=False, - show_default=True, - help="Output single timeseries file, which includes all sites", - ), + "--output-dir", + default=".", + help="Output root directory. Default is current directory", + ) ] -OUTPUT_OPTIONS = [ +OUTPUT_FORMATS = sorted([value for value in OutputFormat]) +OUTPUT_FORMAT_OPTIONS = [ click.option( - "--output", - type=click.Choice(["summary", "timeseries_unified", "timeseries_separated"]), - required=True, - help="Output summary file, single unified timeseries file, or separated timeseries files", + "--output-format", + type=click.Choice(OUTPUT_FORMATS), + default="csv", + help=f"Output file format for sites: {OUTPUT_FORMATS}. 
Default is csv", ) ] +CONFIG_PATH_OPTIONS = [ + click.option( + "--config-path", + type=click.Path(exists=True), + default=None, + help="Path to config file. Default is config.yaml", + ), +] + def add_options(options): def _add_options(func): @@ -189,22 +218,28 @@ def _add_options(func): @cli.command() @click.argument( - "weave", + "parameter", type=click.Choice(PARAMETER_OPTIONS, case_sensitive=False), required=True, ) -@add_options(OUTPUT_OPTIONS) +@add_options(CONFIG_PATH_OPTIONS) +@add_options(OUTPUT_TYPE_OPTIONS) +@add_options(OUTPUT_DIR_OPTIONS) @add_options(DT_OPTIONS) @add_options(SPATIAL_OPTIONS) @add_options(ALL_SOURCE_OPTIONS) @add_options(DEBUG_OPTIONS) +@add_options(OUTPUT_FORMAT_OPTIONS) def weave( - weave, - output, + parameter, + config_path, + output_type, + output_dir, start_date, end_date, bbox, county, + wkt, no_bernco, no_bor, no_cabq, @@ -212,143 +247,154 @@ def weave( no_nmbgmr_amp, no_nmed_dwb, no_nmose_isc_seven_rivers, + no_nmose_pod, no_nmose_roswell, no_nwis, no_pvacd, no_wqp, site_limit, dry, + yes, + output_format, ): """ Get parameter timeseries or summary data """ - parameter = weave # instantiate config and set up parameter - config = setup_config(f"{parameter}", bbox, county, site_limit, dry) - config.parameter = parameter - - # make sure config.output_name is properly set - config._update_output_name() - - # make output_path now so that die.log can be written to it live - config._make_output_path() + config = setup_config( + tag=parameter, + config_path=config_path, + bbox=bbox, + county=county, + wkt=wkt, + site_limit=site_limit, + dry=dry, + output_format=output_format, + ) - # setup logging here so that the path can be set to config.output_path - setup_logging(path=config.output_path) + config.parameter = parameter # output type - if output == "summary": + if output_type == "summary": summary = True timeseries_unified = False timeseries_separated = False - elif output == "timeseries_unified": + elif output_type == 
"timeseries_unified": summary = False timeseries_unified = True timeseries_separated = False - elif output == "timeseries_separated": + elif output_type == "timeseries_separated": summary = False timeseries_unified = False timeseries_separated = True + else: + click.echo(f"Invalid output type: {output_type}") + return config.output_summary = summary config.output_timeseries_unified = timeseries_unified config.output_timeseries_separated = timeseries_separated - # sources - if parameter == "waterlevels": - config.use_source_bernco = no_bernco - config.use_source_cabq = no_cabq - config.use_source_ebid = no_ebid - config.use_source_nmbgmr_amp = no_nmbgmr_amp - config.use_source_nmose_isc_seven_rivers = no_nmose_isc_seven_rivers - config.use_source_nmose_roswell = no_nmose_roswell - config.use_source_nwis = no_nwis - config.use_source_pvacd = no_pvacd - config.use_source_wqp = no_wqp - - config.use_source_bor = False - config.use_source_nmed_dwb = False - - elif parameter == "carbonate": - config.use_source_nmbgmr_amp = no_nmbgmr_amp - config.use_source_wqp = no_wqp - - config.use_source_bor = False - config.use_source_bernco = False - config.use_source_cabq = False - config.use_source_ebid = False - config.use_source_nmed_dwb = False - config.use_source_nmose_isc_seven_rivers = False - config.use_source_nmose_roswell = False - config.use_source_nwis = False - config.use_source_pvacd = False - - elif parameter in ["arsenic", "uranium"]: - config.use_source_bor = no_bor - config.use_source_nmbgmr_amp = no_nmbgmr_amp - config.use_source_nmed_dwb = no_nmed_dwb - config.use_source_wqp = no_wqp - - config.use_source_bernco = False - config.use_source_cabq = False - config.use_source_ebid = False - config.use_source_nmose_isc_seven_rivers = False - config.use_source_nmose_roswell = False - config.use_source_nwis = False - config.use_source_pvacd = False - - elif parameter in [ - "bicarbonate", - "calcium", - "chloride", - "fluoride", - "magnesium", - "nitrate", - "ph", - 
"potassium", - "silica", - "sodium", - "sulfate", - "tds", - ]: - config.use_source_bor = no_bor - config.use_source_nmbgmr_amp = no_nmbgmr_amp - config.use_source_nmed_dwb = no_nmed_dwb - config.use_source_nmose_isc_seven_rivers = no_nmose_isc_seven_rivers - config.use_source_wqp = no_wqp - - config.use_source_bernco = False - config.use_source_cabq = False - config.use_source_ebid = False - config.use_source_nmose_roswell = False - config.use_source_nwis = False - config.use_source_pvacd = False + config_agencies, false_agencies = config.get_config_and_false_agencies() + + for agency in false_agencies: + setattr(config, f"use_source_{agency}", False) + if config_path is None: + lcs = locals() + if config_agencies: + for agency in config_agencies: + setattr(config, f"use_source_{agency}", lcs.get(f"no_{agency}", False)) # dates config.start_date = start_date config.end_date = end_date - if not dry: - config.report() - # prompt user to continue - if not click.confirm("Do you want to continue?", default=True): - return - - config._update_output_units() + config.finalize() + # setup logging here so that the path can be set to config.output_path + setup_logging(path=config.output_path) - if parameter.lower() == "waterlevels": - unify_waterlevels(config) - else: - unify_analytes(config) + config.report() + if not dry: + if not yes and not config.yes: + # prompt user to continue + if not click.confirm("Do you want to continue?", default=True): + return + + if parameter.lower() == "waterlevels": + unify_waterlevels(config) + else: + unify_analytes(config) + return config @cli.command() +@add_options(CONFIG_PATH_OPTIONS) @add_options(SPATIAL_OPTIONS) -def wells(bbox, county): +@add_options(OUTPUT_DIR_OPTIONS) +@add_options(ALL_SOURCE_OPTIONS) +@add_options(DEBUG_OPTIONS) +@add_options(OUTPUT_FORMAT_OPTIONS) +def sites( + config_path, + bbox, + county, + wkt, + output_dir, + no_bernco, + no_bor, + no_cabq, + no_ebid, + no_nmbgmr_amp, + no_nmed_dwb, + 
no_nmose_isc_seven_rivers, + no_nmose_pod, + no_nmose_roswell, + no_nwis, + no_pvacd, + no_wqp, + site_limit, + dry, + yes, + output_format, +): """ - Get locations + Get sites """ - config = setup_config("sites", bbox, county) + config = setup_config( + "sites", config_path, bbox, county, wkt, site_limit, dry, output_format + ) + config_agencies = [ + "bernco", + "bor", + "cabq", + "ebid", + "nmbgmr_amp", + "nmed_dwb", + "nmose_isc_seven_rivers", + "nmose_roswell", + "nwis", + "pvacd", + "wqp", + "nmose_pod", + ] + + if config_path is None: + lcs = locals() + for agency in config_agencies: + setattr(config, f"use_source_{agency}", lcs.get(f"no_{agency}", False)) + config.output_dir = output_dir + + config.sites_only = True + config.finalize() + # setup logging here so that the path can be set to config.output_path + setup_logging(path=config.output_path) + + config.report() + if not yes and not config.yes: + # prompt user to continue + if not click.confirm("Do you want to continue?", default=True): + return + unify_sites(config) @@ -359,7 +405,7 @@ def wells(bbox, county): required=True, ) @add_options(SPATIAL_OPTIONS) -def sources(sources, bbox, county): +def sources(sources, bbox, wkt, county): """ List available sources """ @@ -370,16 +416,33 @@ def sources(sources, bbox, county): config.county = county elif bbox: config.bbox = bbox + elif wkt: + config.wkt = wkt parameter = sources config.parameter = parameter + config_agencies, false_agencies = config.get_config_and_false_agencies() + + for agency in false_agencies: + setattr(config, f"use_source_{agency}", False) + sources = get_sources(config) for s in sources: click.echo(s) -def setup_config(tag, bbox, county, site_limit, dry): - config = Config() +def setup_config( + tag, + config_path, + bbox, + county, + wkt, + site_limit, + dry, + output_format=OutputFormat.CSV, +): + config = Config(path=config_path) + if county: click.echo(f"Getting {tag} for county {county}") config.county = county @@ -387,10 
+450,18 @@ def setup_config(tag, bbox, county, site_limit, dry):
         click.echo(f"Getting {tag} for bounding box {bbox}")
         # bbox = -105.396826 36.219290, -106.024162 35.384307
         config.bbox = bbox
+    elif wkt:
+        click.echo(f"Getting {tag} for WKT {wkt}")
+        config.wkt = wkt
 
-    config.site_limit = site_limit
+    if site_limit:
+        config.site_limit = int(site_limit)
+    else:
+        config.site_limit = None
 
     config.dry = dry
 
+    config.output_format = output_format.value
+
     return config
 
diff --git a/frontend/cronjob_worker.sh b/frontend/cronjob_worker.sh
new file mode 100644
index 0000000..46b086a
--- /dev/null
+++ b/frontend/cronjob_worker.sh
@@ -0,0 +1,3 @@
+
+
+die sites --config-path config.yaml
\ No newline at end of file
diff --git a/mypy.ini b/mypy.ini
new file mode 100644
index 0000000..4904098
--- /dev/null
+++ b/mypy.ini
@@ -0,0 +1,4 @@
+[mypy]
+ignore_missing_imports = True
+exclude = ^(venv|.github|.mypy_cache|.pytest_cache|nmuwd.egg-info|__pycache__|build|tests/archived)
+plugins = sqlalchemy.ext.mypy.plugin
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..8ea4712
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+; skip archived tests but keep for reference
+norecursedirs = tests/archived
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 4e9f7c5..50e80af 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,14 @@
 flask
+frost_sta_client
+Geoalchemy2
+geopandas
+google-cloud-storage
 gunicorn
 httpx
+mypy
 pandas
-geopandas
-frost_sta_client
-google-cloud-storage
+psycopg2
 pytest
+pyyaml
+types-pyyaml
 urllib3>=2.2.0,<3.0.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 1b99d57..d3f855e 100644
--- a/setup.py
+++ b/setup.py
@@ -19,9 +19,13 @@
 with open("README.md", "r", encoding="utf-8") as fh:
     long_description = fh.read()
 
+# Read dependencies from requirements.txt
+with open("requirements.txt", "r", encoding="utf-8") as req_file:
+    requirements = req_file.read().splitlines()
+
 setup(
     
name="nmuwd", - version="0.7.1", + version="0.9.3", author="Jake Ross", description="New Mexico Water Data Integration Engine", long_description=long_description, @@ -31,7 +35,7 @@ "Programming Language :: Python :: 3", "Operating System :: OS Independent", ], - install_requires=["click", "httpx", "geopandas", "frost_sta_client"], + install_requires=requirements, entry_points={ "console_scripts": [ "die = frontend.cli:cli", diff --git a/tests/__init__.py b/tests/__init__.py index e69de29..bcf9e80 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1,8 @@ +def recursively_clean_directory(path): + """Recursively delete all files and directories in the given path.""" + for item in path.iterdir(): + if item.is_dir(): + recursively_clean_directory(item) + else: + item.unlink() + path.rmdir() diff --git a/tests/archived/__init__.py b/tests/archived/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_cli.py b/tests/archived/test_cli.py similarity index 55% rename from tests/test_cli.py rename to tests/archived/test_cli.py index 3d65365..3e53924 100644 --- a/tests/test_cli.py +++ b/tests/archived/test_cli.py @@ -218,190 +218,189 @@ def test_waterlevels_invalid_end(): _tester(waterlevels, args, fail=True) -# -# def _tester(source, func, county, bbox, args=None): -# runner = CliRunner() -# -# nosources = [ -# f -# for f in ( -# "--no-amp", -# "--no-nwis", -# "--no-st2", -# "--no-bor", -# "--no-dwb", -# "--no-wqp", -# "--no-isc-seven-rivers", -# "--no-ckan", -# ) -# if f != f"--no-{source}" -# ] -# -# dargs = nosources + ["--site-limit", 10] -# -# if args: -# args += dargs -# else: -# args = dargs -# -# if county: -# args.extend(("--county", county)) -# elif bbox: -# args.extend(("--bbox", bbox)) -# -# print(" ".join([str(f) for f in args])) -# result = runner.invoke(func, args) -# -# return result +def _tester(source, func, county, bbox, args=None): + runner = CliRunner() + nosources = [ + f + for f in ( + "--no-amp", + "--no-nwis", + 
"--no-st2", + "--no-bor", + "--no-dwb", + "--no-wqp", + "--no-isc-seven-rivers", + "--no-ckan", + ) + if f != f"--no-{source}" + ] + + dargs = nosources + ["--site-limit", 10] + + if args: + args += dargs + else: + args = dargs + + if county: + args.extend(("--county", county)) + elif bbox: + args.extend(("--bbox", bbox)) + + print(" ".join([str(f) for f in args])) + result = runner.invoke(func, args) + + return result + + +def _summary_tester(source, func, county=None, bbox=None, args=None): + if not (county or bbox): + county = "eddy" + + runner = CliRunner() + # with runner.isolated_filesystem(): + # result = _tester(source, func, county, bbox, args) + # assert result.exit_code == 0 + # assert os.path.isfile("output.csv") + + +def _timeseries_tester( + source, + func, + combined_flag=True, + timeseries_flag=True, + county=None, + bbox=None, + args=None, +): + if args is None: + args = [] + # runner = CliRunner() + # with runner.isolated_filesystem(): + # result = _tester(source, func, county, bbox, args=args + ["--timeseries"]) + # assert result.exit_code == 0 + # print("combined", os.path.isfile("output.combined.csv"), combined_flag) + # assert os.path.isfile("output.combined.csv") == combined_flag + # print("timeseries", os.path.isdir("output_timeseries"), timeseries_flag) + # assert os.path.isdir("output_timeseries") == timeseries_flag + + +# ====== Analyte Tests ======================================================= +def _analyte_summary_tester(key): + _summary_tester(key, analytes, args=["TDS"]) + + +def _analyte_county_tester(source, **kw): + _timeseries_tester(source, analytes, args=["TDS"], county="eddy", **kw) + + +def test_unify_analytes_amp(): + _analyte_county_tester("amp", timeseries_flag=False) + + +def test_unify_analytes_wqp(): + _analyte_county_tester("wqp") + + +def test_unify_analytes_bor(): + _analyte_county_tester("bor", combined_flag=False) + + +def test_unify_analytes_isc_seven_rivers(): + _analyte_county_tester("isc-seven-rivers") + + 
+def test_unify_analytes_dwb(): + _analyte_county_tester("dwb", timeseries_flag=False) + + +def test_unify_analytes_wqp_summary(): + _analyte_summary_tester("wqp") + + +def test_unify_analytes_bor_summary(): + _analyte_summary_tester("bor") -# def _summary_tester(source, func, county=None, bbox=None, args=None): -# if not (county or bbox): -# county = "eddy" -# -# runner = CliRunner() -# # with runner.isolated_filesystem(): -# # result = _tester(source, func, county, bbox, args) -# # assert result.exit_code == 0 -# # assert os.path.isfile("output.csv") -# -# -# def _timeseries_tester( -# source, -# func, -# combined_flag=True, -# timeseries_flag=True, -# county=None, -# bbox=None, -# args=None, -# ): -# if args is None: -# args = [] -# # runner = CliRunner() -# # with runner.isolated_filesystem(): -# # result = _tester(source, func, county, bbox, args=args + ["--timeseries"]) -# # assert result.exit_code == 0 -# # print("combined", os.path.isfile("output.combined.csv"), combined_flag) -# # assert os.path.isfile("output.combined.csv") == combined_flag -# # print("timeseries", os.path.isdir("output_timeseries"), timeseries_flag) -# # assert os.path.isdir("output_timeseries") == timeseries_flag -# -# -# # ====== Analyte Tests ======================================================= -# def _analyte_summary_tester(key): -# _summary_tester(key, analytes, args=["TDS"]) -# -# -# def _analyte_county_tester(source, **kw): -# _timeseries_tester(source, analytes, args=["TDS"], county="eddy", **kw) -# -# -# def test_unify_analytes_amp(): -# _analyte_county_tester("amp", timeseries_flag=False) -# -# -# def test_unify_analytes_wqp(): -# _analyte_county_tester("wqp") -# -# -# def test_unify_analytes_bor(): -# _analyte_county_tester("bor", combined_flag=False) -# -# -# def test_unify_analytes_isc_seven_rivers(): -# _analyte_county_tester("isc-seven-rivers") -# -# -# def test_unify_analytes_dwb(): -# _analyte_county_tester("dwb", timeseries_flag=False) -# -# -# def 
test_unify_analytes_wqp_summary(): -# _analyte_summary_tester("wqp") -# -# -# def test_unify_analytes_bor_summary(): -# _analyte_summary_tester("bor") -# -# -# def test_unify_analytes_amp_summary(): -# _analyte_summary_tester("amp") -# -# -# def test_unify_analytes_dwb_summary(): -# _analyte_summary_tester("dwb") -# -# -# def test_unify_analytes_isc_seven_rivers_summary(): -# _analyte_summary_tester("isc-seven-rivers") + +def test_unify_analytes_amp_summary(): + _analyte_summary_tester("amp") + + +def test_unify_analytes_dwb_summary(): + _analyte_summary_tester("dwb") + + +def test_unify_analytes_isc_seven_rivers_summary(): + _analyte_summary_tester("isc-seven-rivers") # ====== End Analyte Tests ======================================================= # ====== Water Level Tests ======================================================= -# def _waterlevel_county_tester(source, **kw): -# _timeseries_tester(source, waterlevels, county="eddy", **kw) -# -# -# def _waterlevel_bbox_tester(source, **kw): -# _timeseries_tester(source, waterlevels, bbox="-104.5 32.5,-104 33", **kw) +def _waterlevel_county_tester(source, **kw): + _timeseries_tester(source, waterlevels, county="eddy", **kw) -# -# def test_unify_waterlevels_nwis(): -# _waterlevel_county_tester("nwis", timeseries_flag=False) -# -# -# def test_unify_waterlevels_amp(): -# _waterlevel_county_tester("amp", timeseries_flag=False) -# -# -# def test_unify_waterlevels_st2(): -# _waterlevel_county_tester("st2", combined_flag=False) -# -# -# def test_unify_waterlevels_isc_seven_rivers(): -# _waterlevel_county_tester("isc-seven-rivers") -# -# -# def test_unify_waterlevels_ckan(): -# _waterlevel_county_tester("ckan") -# -# -# def test_unify_waterlevels_nwis_summary(): -# _summary_tester("nwis", waterlevels) -# -# -# def test_unify_waterlevels_amp_summary(): -# _summary_tester("amp", waterlevels) -# -# -# def test_unify_waterlevels_st2_summary(): -# _summary_tester("st2", waterlevels) -# -# -# def 
test_unify_waterlevels_isc_seven_rivers_summary(): -# _summary_tester("isc-seven-rivers", waterlevels) -# -# -# def test_unify_waterlevels_nwis_bbox(): -# _waterlevel_bbox_tester("nwis", timeseries_flag=False) -# -# -# def test_unify_waterlevels_amp_bbox(): -# _waterlevel_bbox_tester("amp") -# -# -# def test_unify_waterlevels_st2_bbox(): -# _waterlevel_bbox_tester("st2", combined_flag=False) -# -# -# def test_unify_waterlevels_isc_seven_rivers_bbox(): -# _waterlevel_bbox_tester("isc-seven-rivers", combined_flag=False) -# -# -# def test_unify_waterlevels_ckan_bbox(): -# _waterlevel_bbox_tester("ckan") + +def _waterlevel_bbox_tester(source, **kw): + _timeseries_tester(source, waterlevels, bbox="-104.5 32.5,-104 33", **kw) + + +def test_unify_waterlevels_nwis(): + _waterlevel_county_tester("nwis", timeseries_flag=False) + + +def test_unify_waterlevels_amp(): + _waterlevel_county_tester("amp", timeseries_flag=False) + + +def test_unify_waterlevels_st2(): + _waterlevel_county_tester("st2", combined_flag=False) + + +def test_unify_waterlevels_isc_seven_rivers(): + _waterlevel_county_tester("isc-seven-rivers") + + +def test_unify_waterlevels_ckan(): + _waterlevel_county_tester("ckan") + + +def test_unify_waterlevels_nwis_summary(): + _summary_tester("nwis", waterlevels) + + +def test_unify_waterlevels_amp_summary(): + _summary_tester("amp", waterlevels) + + +def test_unify_waterlevels_st2_summary(): + _summary_tester("st2", waterlevels) + + +def test_unify_waterlevels_isc_seven_rivers_summary(): + _summary_tester("isc-seven-rivers", waterlevels) + + +def test_unify_waterlevels_nwis_bbox(): + _waterlevel_bbox_tester("nwis", timeseries_flag=False) + + +def test_unify_waterlevels_amp_bbox(): + _waterlevel_bbox_tester("amp") + + +def test_unify_waterlevels_st2_bbox(): + _waterlevel_bbox_tester("st2", combined_flag=False) + + +def test_unify_waterlevels_isc_seven_rivers_bbox(): + _waterlevel_bbox_tester("isc-seven-rivers", combined_flag=False) + + +def 
test_unify_waterlevels_ckan_bbox(): + _waterlevel_bbox_tester("ckan") # ====== End Water Level Tests ======================================================= diff --git a/tests/test_unifier.py b/tests/archived/test_unifier.py similarity index 100% rename from tests/test_unifier.py rename to tests/archived/test_unifier.py diff --git a/tests/test_cli/__init__.py b/tests/test_cli/__init__.py new file mode 100644 index 0000000..4d342ae --- /dev/null +++ b/tests/test_cli/__init__.py @@ -0,0 +1,258 @@ +from click.testing import CliRunner +from logging import shutdown as logger_shutdown +from pathlib import Path +import pytest +from typing import List + +from backend.config import SOURCE_KEYS +from backend.constants import ( + WATERLEVELS, + ARSENIC, + BICARBONATE, + CALCIUM, + CARBONATE, + CHLORIDE, + FLUORIDE, + MAGNESIUM, + NITRATE, + PH, + POTASSIUM, + SILICA, + SODIUM, + SULFATE, + TDS, + URANIUM, +) +from frontend.cli import weave +from tests import recursively_clean_directory + + +class BaseCLITestClass: + + runner: CliRunner + agency: str + agency_reports_parameter: dict + output_dir: Path + + @pytest.fixture(autouse=True) + def setup(self): + # SETUP CODE ----------------------------------------------------------- + self.runner = CliRunner() + + # RUN TESTS ------------------------------------------------------------ + yield + + # TEARDOWN CODE --------------------------------------------------------- + logger_shutdown() + recursively_clean_directory(self.output_dir) + + def _test_weave( + self, + parameter: str, + output_type: str, + output_format: str = "csv", + site_limit: int = 4, + start_date: str = "1990-08-10", + end_date: str = "1990-08-11", + bbox: str | None = None, + county: str | None = None, + wkt: str | None = None, + ): + # Arrange + # turn off all sources except for the one being tested + no_agencies = [] + for source in SOURCE_KEYS: + source_with_dash = source.replace("_", "-") + if source_with_dash == self.agency: + continue + else: + 
no_agencies.append(f"--no-{source_with_dash}") + + geographic_filter_name: str | None = None + geographic_filter_value: str | None = None + if bbox: + geographic_filter_name = "bbox" + geographic_filter_value = bbox + elif county: + geographic_filter_name = "county" + geographic_filter_value = county + elif wkt: + geographic_filter_name = "wkt" + geographic_filter_value = wkt + + arguments = [ + parameter, + "--output-type", + output_type, + "--dry", + "--site-limit", + str(site_limit), + "--start-date", + start_date, + "--end-date", + end_date, + "--output-format", + output_format, + ] + + if geographic_filter_name and geographic_filter_value: + arguments.extend([f"--{geographic_filter_name}", geographic_filter_value]) + + arguments.extend(no_agencies) + + # Act + result = self.runner.invoke(weave, arguments, standalone_mode=False) + + # Assert + assert result.exit_code == 0 + + """ + For the config, check that + + 0. (set output dir to clean up tests results even in event of failure) + 1. The parameter is set correctly + 2. The agencies are set correctly + 3. The output types are set correctly + 4. The site limit is set correctly + 5. The dry is set correctly + 6. The start date is set correctly + 7. The end date is set correctly + 8. The geographic filter is set correctly + 9. 
The site output type is set correctly + """ + config = result.return_value + + # 0 + self.output_dir = Path(config.output_path) + + # 1 + assert getattr(config, "parameter") == parameter + + # 2 + agency_with_underscore = self.agency.replace("-", "_") + if self.agency_reports_parameter[parameter]: + assert getattr(config, f"use_source_{agency_with_underscore}") is True + else: + assert getattr(config, f"use_source_{agency_with_underscore}") is False + + for no_agency in no_agencies: + no_agency_with_underscore = no_agency.replace("--no-", "").replace("-", "_") + assert getattr(config, f"use_source_{no_agency_with_underscore}") is False + + # 3 + output_types = ["summary", "timeseries_unified", "timeseries_separated"] + for ot in output_types: + if ot == output_type: + assert getattr(config, f"output_{ot}") is True + else: + assert getattr(config, f"output_{ot}") is False + + # 4 + assert getattr(config, "site_limit") == 4 + + # 5 + assert getattr(config, "dry") is True + + # 6 + assert getattr(config, "start_date") == start_date + + # 7 + assert getattr(config, "end_date") == end_date + + # 8 + if geographic_filter_name and geographic_filter_value: + for _geographic_filter_name in ["bbox", "county", "wkt"]: + if _geographic_filter_name == geographic_filter_name: + assert ( + getattr(config, _geographic_filter_name) + == geographic_filter_value + ) + else: + assert getattr(config, _geographic_filter_name) == "" + + # 9 + assert getattr(config, "output_format") == output_format + + def test_weave_summary(self): + self._test_weave(parameter=WATERLEVELS, output_type="summary") + + def test_weave_timeseries_unified(self): + self._test_weave(parameter=WATERLEVELS, output_type="timeseries_unified") + + def test_weave_timeseries_separated(self): + self._test_weave(parameter=WATERLEVELS, output_type="timeseries_separated") + + def test_weave_csv(self): + self._test_weave( + parameter=WATERLEVELS, output_type="summary", output_format="csv" + ) + + def 
test_weave_geojson(self): + self._test_weave( + parameter=WATERLEVELS, output_type="summary", output_format="geojson" + ) + + def test_weave_bbox(self): + self._test_weave( + parameter=WATERLEVELS, output_type="summary", bbox="32.0,-106.0,36.0,-102.0" + ) + + def test_weave_county(self): + self._test_weave( + parameter=WATERLEVELS, output_type="summary", county="Bernalillo" + ) + + def test_weave_wkt(self): + self._test_weave( + parameter=WATERLEVELS, + output_type="summary", + wkt="POLYGON((-106.0 32.0, -102.0 32.0, -102.0 36.0, -106.0 36.0, -106.0 32.0))", + ) + + def test_weave_waterlevels(self): + self._test_weave(parameter=WATERLEVELS, output_type="summary") + + def test_weave_arsenic(self): + self._test_weave(parameter=ARSENIC, output_type="summary") + + def test_weave_bicarbonate(self): + self._test_weave(parameter=BICARBONATE, output_type="summary") + + def test_weave_calcium(self): + self._test_weave(parameter=CALCIUM, output_type="summary") + + def test_weave_carbonate(self): + self._test_weave(parameter=CARBONATE, output_type="summary") + + def test_weave_chloride(self): + self._test_weave(parameter=CHLORIDE, output_type="summary") + + def test_weave_fluoride(self): + self._test_weave(parameter=FLUORIDE, output_type="summary") + + def test_weave_magnesium(self): + self._test_weave(parameter=MAGNESIUM, output_type="summary") + + def test_weave_nitrate(self): + self._test_weave(parameter=NITRATE, output_type="summary") + + def test_weave_ph(self): + self._test_weave(parameter=PH, output_type="summary") + + def test_weave_potassium(self): + self._test_weave(parameter=POTASSIUM, output_type="summary") + + def test_weave_silica(self): + self._test_weave(parameter=SILICA, output_type="summary") + + def test_weave_sodium(self): + self._test_weave(parameter=SODIUM, output_type="summary") + + def test_weave_sulfate(self): + self._test_weave(parameter=SULFATE, output_type="summary") + + def test_weave_tds(self): + self._test_weave(parameter=TDS, 
output_type="summary") + + def test_weave_uranium(self): + self._test_weave(parameter=URANIUM, output_type="summary") diff --git a/tests/test_cli/test_bernco.py b/tests/test_cli/test_bernco.py new file mode 100644 index 0000000..331ed26 --- /dev/null +++ b/tests/test_cli/test_bernco.py @@ -0,0 +1,42 @@ +from backend.constants import ( + WATERLEVELS, + ARSENIC, + BICARBONATE, + CALCIUM, + CARBONATE, + CHLORIDE, + FLUORIDE, + MAGNESIUM, + NITRATE, + PH, + POTASSIUM, + SILICA, + SODIUM, + SULFATE, + TDS, + URANIUM, +) +from tests.test_cli import BaseCLITestClass + + +class TestBernCoCLI(BaseCLITestClass): + + agency = "bernco" + agency_reports_parameter = { + WATERLEVELS: True, + ARSENIC: False, + BICARBONATE: False, + CALCIUM: False, + CARBONATE: False, + CHLORIDE: False, + FLUORIDE: False, + MAGNESIUM: False, + NITRATE: False, + PH: False, + POTASSIUM: False, + SILICA: False, + SODIUM: False, + SULFATE: False, + TDS: False, + URANIUM: False, + } diff --git a/tests/test_cli/test_cabq.py b/tests/test_cli/test_cabq.py new file mode 100644 index 0000000..1748975 --- /dev/null +++ b/tests/test_cli/test_cabq.py @@ -0,0 +1,42 @@ +from backend.constants import ( + WATERLEVELS, + ARSENIC, + BICARBONATE, + CALCIUM, + CARBONATE, + CHLORIDE, + FLUORIDE, + MAGNESIUM, + NITRATE, + PH, + POTASSIUM, + SILICA, + SODIUM, + SULFATE, + TDS, + URANIUM, +) +from tests.test_cli import BaseCLITestClass + + +class TestCABQCLI(BaseCLITestClass): + + agency = "cabq" + agency_reports_parameter = { + WATERLEVELS: True, + ARSENIC: False, + BICARBONATE: False, + CALCIUM: False, + CARBONATE: False, + CHLORIDE: False, + FLUORIDE: False, + MAGNESIUM: False, + NITRATE: False, + PH: False, + POTASSIUM: False, + SILICA: False, + SODIUM: False, + SULFATE: False, + TDS: False, + URANIUM: False, + } diff --git a/tests/test_cli/test_ebid.py b/tests/test_cli/test_ebid.py new file mode 100644 index 0000000..76429f1 --- /dev/null +++ b/tests/test_cli/test_ebid.py @@ -0,0 +1,42 @@ +from backend.constants 
import ( + WATERLEVELS, + ARSENIC, + BICARBONATE, + CALCIUM, + CARBONATE, + CHLORIDE, + FLUORIDE, + MAGNESIUM, + NITRATE, + PH, + POTASSIUM, + SILICA, + SODIUM, + SULFATE, + TDS, + URANIUM, +) +from tests.test_cli import BaseCLITestClass + + +class TestEBIDCLI(BaseCLITestClass): + + agency = "ebid" + agency_reports_parameter = { + WATERLEVELS: True, + ARSENIC: False, + BICARBONATE: False, + CALCIUM: False, + CARBONATE: False, + CHLORIDE: False, + FLUORIDE: False, + MAGNESIUM: False, + NITRATE: False, + PH: False, + POTASSIUM: False, + SILICA: False, + SODIUM: False, + SULFATE: False, + TDS: False, + URANIUM: False, + } diff --git a/tests/test_cli/test_nmbgmr_amp.py b/tests/test_cli/test_nmbgmr_amp.py new file mode 100644 index 0000000..df4ea49 --- /dev/null +++ b/tests/test_cli/test_nmbgmr_amp.py @@ -0,0 +1,42 @@ +from backend.constants import ( + WATERLEVELS, + ARSENIC, + BICARBONATE, + CALCIUM, + CARBONATE, + CHLORIDE, + FLUORIDE, + MAGNESIUM, + NITRATE, + PH, + POTASSIUM, + SILICA, + SODIUM, + SULFATE, + TDS, + URANIUM, +) +from tests.test_cli import BaseCLITestClass + + +class TestNMBGMRCLI(BaseCLITestClass): + + agency = "nmbgmr-amp" + agency_reports_parameter = { + WATERLEVELS: True, + ARSENIC: True, + BICARBONATE: True, + CALCIUM: True, + CARBONATE: True, + CHLORIDE: True, + FLUORIDE: True, + MAGNESIUM: True, + NITRATE: True, + PH: True, + POTASSIUM: True, + SILICA: True, + SODIUM: True, + SULFATE: True, + TDS: True, + URANIUM: True, + } diff --git a/tests/test_cli/test_nmed_dwb.py b/tests/test_cli/test_nmed_dwb.py new file mode 100644 index 0000000..edd9d68 --- /dev/null +++ b/tests/test_cli/test_nmed_dwb.py @@ -0,0 +1,42 @@ +from backend.constants import ( + WATERLEVELS, + ARSENIC, + BICARBONATE, + CALCIUM, + CARBONATE, + CHLORIDE, + FLUORIDE, + MAGNESIUM, + NITRATE, + PH, + POTASSIUM, + SILICA, + SODIUM, + SULFATE, + TDS, + URANIUM, +) +from tests.test_cli import BaseCLITestClass + + +class TestNMEDDWBCLI(BaseCLITestClass): + + agency = "nmed-dwb" + 
agency_reports_parameter = { + WATERLEVELS: False, + ARSENIC: True, + BICARBONATE: True, + CALCIUM: True, + CARBONATE: False, + CHLORIDE: True, + FLUORIDE: True, + MAGNESIUM: True, + NITRATE: True, + PH: True, + POTASSIUM: True, + SILICA: True, + SODIUM: True, + SULFATE: True, + TDS: True, + URANIUM: True, + } diff --git a/tests/test_cli/test_nmose_isc_seven_rivers.py b/tests/test_cli/test_nmose_isc_seven_rivers.py new file mode 100644 index 0000000..0f99e70 --- /dev/null +++ b/tests/test_cli/test_nmose_isc_seven_rivers.py @@ -0,0 +1,42 @@ +from backend.constants import ( + WATERLEVELS, + ARSENIC, + BICARBONATE, + CALCIUM, + CARBONATE, + CHLORIDE, + FLUORIDE, + MAGNESIUM, + NITRATE, + PH, + POTASSIUM, + SILICA, + SODIUM, + SULFATE, + TDS, + URANIUM, +) +from tests.test_cli import BaseCLITestClass + + +class TestNMOSEISCSevenRiversCLI(BaseCLITestClass): + + agency = "nmose-isc-seven-rivers" + agency_reports_parameter = { + WATERLEVELS: True, + ARSENIC: False, + BICARBONATE: True, + CALCIUM: True, + CARBONATE: False, + CHLORIDE: True, + FLUORIDE: True, + MAGNESIUM: True, + NITRATE: True, + PH: True, + POTASSIUM: True, + SILICA: True, + SODIUM: True, + SULFATE: True, + TDS: True, + URANIUM: False, + } diff --git a/tests/test_cli/test_nmose_roswell.py b/tests/test_cli/test_nmose_roswell.py new file mode 100644 index 0000000..0c2be39 --- /dev/null +++ b/tests/test_cli/test_nmose_roswell.py @@ -0,0 +1,42 @@ +from backend.constants import ( + WATERLEVELS, + ARSENIC, + BICARBONATE, + CALCIUM, + CARBONATE, + CHLORIDE, + FLUORIDE, + MAGNESIUM, + NITRATE, + PH, + POTASSIUM, + SILICA, + SODIUM, + SULFATE, + TDS, + URANIUM, +) +from tests.test_cli import BaseCLITestClass + + +class TestNMOSERoswellCLI(BaseCLITestClass): + + agency = "nmose-roswell" + agency_reports_parameter = { + WATERLEVELS: True, + ARSENIC: False, + BICARBONATE: False, + CALCIUM: False, + CARBONATE: False, + CHLORIDE: False, + FLUORIDE: False, + MAGNESIUM: False, + NITRATE: False, + PH: False, + POTASSIUM: 
False, + SILICA: False, + SODIUM: False, + SULFATE: False, + TDS: False, + URANIUM: False, + } diff --git a/tests/test_cli/test_nwis.py b/tests/test_cli/test_nwis.py new file mode 100644 index 0000000..0fd236a --- /dev/null +++ b/tests/test_cli/test_nwis.py @@ -0,0 +1,42 @@ +from backend.constants import ( + WATERLEVELS, + ARSENIC, + BICARBONATE, + CALCIUM, + CARBONATE, + CHLORIDE, + FLUORIDE, + MAGNESIUM, + NITRATE, + PH, + POTASSIUM, + SILICA, + SODIUM, + SULFATE, + TDS, + URANIUM, +) +from tests.test_cli import BaseCLITestClass + + +class TestNWISCLI(BaseCLITestClass): + + agency = "nwis" + agency_reports_parameter = { + WATERLEVELS: True, + ARSENIC: False, + BICARBONATE: False, + CALCIUM: False, + CARBONATE: False, + CHLORIDE: False, + FLUORIDE: False, + MAGNESIUM: False, + NITRATE: False, + PH: False, + POTASSIUM: False, + SILICA: False, + SODIUM: False, + SULFATE: False, + TDS: False, + URANIUM: False, + } diff --git a/tests/test_cli/test_pvacd.py b/tests/test_cli/test_pvacd.py new file mode 100644 index 0000000..041c9a9 --- /dev/null +++ b/tests/test_cli/test_pvacd.py @@ -0,0 +1,42 @@ +from backend.constants import ( + WATERLEVELS, + ARSENIC, + BICARBONATE, + CALCIUM, + CARBONATE, + CHLORIDE, + FLUORIDE, + MAGNESIUM, + NITRATE, + PH, + POTASSIUM, + SILICA, + SODIUM, + SULFATE, + TDS, + URANIUM, +) +from tests.test_cli import BaseCLITestClass + + +class TestPVACDCLI(BaseCLITestClass): + + agency = "pvacd" + agency_reports_parameter = { + WATERLEVELS: True, + ARSENIC: False, + BICARBONATE: False, + CALCIUM: False, + CARBONATE: False, + CHLORIDE: False, + FLUORIDE: False, + MAGNESIUM: False, + NITRATE: False, + PH: False, + POTASSIUM: False, + SILICA: False, + SODIUM: False, + SULFATE: False, + TDS: False, + URANIUM: False, + } diff --git a/tests/test_cli/test_wqp.py b/tests/test_cli/test_wqp.py new file mode 100644 index 0000000..f3beb7b --- /dev/null +++ b/tests/test_cli/test_wqp.py @@ -0,0 +1,42 @@ +from backend.constants import ( + WATERLEVELS, + ARSENIC, + 
BICARBONATE, + CALCIUM, + CARBONATE, + CHLORIDE, + FLUORIDE, + MAGNESIUM, + NITRATE, + PH, + POTASSIUM, + SILICA, + SODIUM, + SULFATE, + TDS, + URANIUM, +) +from tests.test_cli import BaseCLITestClass + + +class TestWQPCLI(BaseCLITestClass): + + agency = "wqp" + agency_reports_parameter = { + WATERLEVELS: True, + ARSENIC: True, + BICARBONATE: True, + CALCIUM: True, + CARBONATE: True, + CHLORIDE: True, + FLUORIDE: True, + MAGNESIUM: True, + NITRATE: True, + PH: True, + POTASSIUM: True, + SILICA: True, + SODIUM: True, + SULFATE: True, + TDS: True, + URANIUM: True, + } diff --git a/tests/test_sources/__init__.py b/tests/test_sources/__init__.py new file mode 100644 index 0000000..34d0485 --- /dev/null +++ b/tests/test_sources/__init__.py @@ -0,0 +1,270 @@ +import json +from logging import shutdown as logger_shutdown +from pathlib import Path +import pytest +from shapely import Geometry + +from backend.config import Config, SOURCE_KEYS +from backend.constants import WATERLEVELS +from backend.logger import setup_logging +from backend.record import SummaryRecord, SiteRecord, ParameterRecord +from backend.unifier import unify_analytes, unify_waterlevels +from tests import recursively_clean_directory + +EXCLUDED_GEOJSON_KEYS = ["latitude", "longitude", "elevation"] + +SUMMARY_RECORD_CSV_HEADERS = list(SummaryRecord.keys) +SUMMARY_RECORD_GEOJSON_KEYS = [ + k for k in SUMMARY_RECORD_CSV_HEADERS if k not in EXCLUDED_GEOJSON_KEYS +] + +SITE_RECORD_CSV_HEADERS = list(SiteRecord.keys) +SITE_RECORD_GEOJSON_KEYS = [ + k for k in SITE_RECORD_CSV_HEADERS if k not in EXCLUDED_GEOJSON_KEYS +] + +PARAMETER_RECORD_HEADERS = list(ParameterRecord.keys) + + +class BaseSourceTestClass: + parameter: str + units: str + agency: str + bounds: Geometry + + # set site_limit for tests + site_limit: int = 3 + + @pytest.fixture(autouse=True) + def setup(self): + # SETUP CODE ---------------------------------------------------------- + # 1: setup test/config attributes + self.config = Config() + for 
agency in SOURCE_KEYS: + setattr(self.config, f"use_source_{agency}", False) + setattr(self.config, "site_limit", self.site_limit) + setattr(self.config, "parameter", self.parameter) + setattr(self.config, "units", self.units) + setattr(self.config, f"use_source_{self.agency}", True) + self.config.finalize() + + # 2: initiate logger + setup_logging(path=self.config.output_path) + + # RUN TESTS ------------------------------------------------------------ + yield + + # UNIVERSAL ASSERTIONS ------------------------------------------------- + # 1: log file exists + log_path = Path(self.config.output_path) / "die.log" + assert log_path.exists() + + # TEARDOWN CODE -------------------------------------------------------- + # 1: close logger to delete log file + logger_shutdown() + + # 2: delete newly created dirs and files + path_to_clean = Path(self.config.output_path) + print(f"Cleaning and removing {path_to_clean}") + recursively_clean_directory(path_to_clean) + + # reset test attributes + self.dirs_to_delete = [] + self.config = None + self.unifier = None + + def _run_unifier(self): + if self.parameter == WATERLEVELS: + unify_waterlevels(self.config) + else: + unify_analytes(self.config) + + def _check_summary_file(self, extension: str): + summary_file = Path(self.config.output_path) / f"summary.{extension}" + assert summary_file.exists() + + if extension == "csv": + with open(summary_file, "r") as f: + headers = f.readline().strip().split(",") + assert headers == SUMMARY_RECORD_CSV_HEADERS + + # +1 for the header + with open(summary_file, "r") as f: + lines = f.readlines() + assert len(lines) == self.site_limit + 1 + elif extension == "geojson": + with open(summary_file, "r") as f: + summary = json.load(f) + assert len(summary["features"]) == self.site_limit + assert summary["type"] == "FeatureCollection" + for feature in summary["features"]: + assert feature["geometry"]["type"] == "Point" + assert len(feature["geometry"]["coordinates"]) == 3 + assert 
sorted(feature["properties"].keys()) == sorted( + SUMMARY_RECORD_GEOJSON_KEYS + ) + assert summary["features"][0]["type"] == "Feature" + else: + raise ValueError(f"Unsupported file extension: {extension}") + + def _check_sites_file(self, extension: str): + sites_file = Path(self.config.output_path) / f"sites.{extension}" + assert sites_file.exists() + + if extension == "csv": + with open(sites_file, "r") as f: + headers = f.readline().strip().split(",") + assert headers == SITE_RECORD_CSV_HEADERS + + # +1 for the header + with open(sites_file, "r") as f: + lines = f.readlines() + assert len(lines) == self.site_limit + 1 + elif extension == "geojson": + with open(sites_file, "r") as f: + sites = json.load(f) + assert len(sites["features"]) == self.site_limit + assert sites["type"] == "FeatureCollection" + for feature in sites["features"]: + assert feature["geometry"]["type"] == "Point" + assert len(feature["geometry"]["coordinates"]) == 3 + assert sorted(feature["properties"].keys()) == sorted( + SITE_RECORD_GEOJSON_KEYS + ) + assert sites["features"][0]["type"] == "Feature" + else: + raise ValueError(f"Unsupported file extension: {extension}") + + def _check_timeseries_file(self, timeseries_dir, timeseries_file_name): + timeseries_file = Path(timeseries_dir) / timeseries_file_name + assert timeseries_file.exists() + + with open(timeseries_file, "r") as f: + headers = f.readline().strip().split(",") + assert headers == PARAMETER_RECORD_HEADERS + + def test_health(self): + # do a health check for the agency + source = self.config.all_site_sources()[0][0] + assert source.health() + + def test_summary_csv(self): + # Arrange -------------------------------------------------------------- + self.config.output_summary = True + self.config.report() + + # Act ------------------------------------------------------------------ + self._run_unifier() + + # Assert --------------------------------------------------------------- + self._check_summary_file("csv") + + def 
test_summary_geojson(self): + # Arrange -------------------------------------------------------------- + self.config.output_summary = True + self.config.output_format = "geojson" + self.config.report() + + # Act ------------------------------------------------------------------ + self._run_unifier() + + # Assert --------------------------------------------------------------- + self._check_summary_file("geojson") + + def test_timeseries_unified_csv(self): + # Arrange -------------------------------------------------------------- + self.config.output_timeseries_unified = True + self.config.report() + + # Act ------------------------------------------------------------------ + self._run_unifier() + + # Assert --------------------------------------------------------------- + # Check the sites file + self._check_sites_file("csv") + + # Check the timeseries file + timeseries_dir = Path(self.config.output_path) + timeseries_file_name = "timeseries_unified.csv" + self._check_timeseries_file(timeseries_dir, timeseries_file_name) + + def test_timeseries_unified_geojson(self): + # Arrange -------------------------------------------------------------- + self.config.output_timeseries_unified = True + self.config.output_format = "geojson" + self.config.report() + + # Act ------------------------------------------------------------------ + self._run_unifier() + + # Assert --------------------------------------------------------------- + # Check the sites file + self._check_sites_file("geojson") + + # Check the timeseries file + timeseries_dir = Path(self.config.output_path) + timeseries_file_name = "timeseries_unified.csv" + self._check_timeseries_file(timeseries_dir, timeseries_file_name) + + def test_timeseries_separated_csv(self): + # Arrange -------------------------------------------------------------- + self.config.output_timeseries_separated = True + self.config.report() + + # Act ------------------------------------------------------------------ + self._run_unifier() + + 
# Assert --------------------------------------------------------------- + # Check the sites file + self._check_sites_file("csv") + + # Check the timeseries files + timeseries_dir = Path(self.config.output_path) / "timeseries" + assert len([f for f in timeseries_dir.iterdir()]) == self.site_limit + + for timeseries_file in timeseries_dir.iterdir(): + self._check_timeseries_file(timeseries_dir, timeseries_file.name) + + def test_timeseries_separated_geojson(self): + # Arrange -------------------------------------------------------------- + self.config.output_timeseries_separated = True + self.config.output_format = "geojson" + self.config.report() + + # Act ------------------------------------------------------------------ + self._run_unifier() + + # Assert --------------------------------------------------------------- + # Check the sites file + self._check_sites_file("geojson") + + # Check the timeseries files + timeseries_dir = Path(self.config.output_path) / "timeseries" + assert len([f for f in timeseries_dir.iterdir()]) == self.site_limit + + for timeseries_file in timeseries_dir.iterdir(): + self._check_timeseries_file(timeseries_dir, timeseries_file.name) + + @pytest.mark.skip(reason="test_date_range not implemented yet") + def test_date_range(self): + pass + + @pytest.mark.skip(reason="test_bounds not implemented yet") + def test_bounds(self): + pass + + @pytest.mark.skip(reason="test_wkt not implemented yet") + def test_wkt(self): + pass + + @pytest.mark.skip(reason="test_county not implemented yet") + def test_county(self): + pass + + @pytest.mark.skip(reason="test_huc not implemented yet") + def test_huc(self): + pass + + @pytest.mark.skip(reason="test_bbox not implemented yet") + def test_bbox(self): + pass diff --git a/tests/test_sources/test_bernco.py b/tests/test_sources/test_bernco.py new file mode 100644 index 0000000..48004a9 --- /dev/null +++ b/tests/test_sources/test_bernco.py @@ -0,0 +1,9 @@ +from backend.constants import WATERLEVELS, FEET 
+from tests.test_sources import BaseSourceTestClass + + +class TestBernCoWaterlevels(BaseSourceTestClass): + + parameter = WATERLEVELS + units = FEET + agency = "bernco" diff --git a/tests/test_sources/test_bor.py b/tests/test_sources/test_bor.py new file mode 100644 index 0000000..003391d --- /dev/null +++ b/tests/test_sources/test_bor.py @@ -0,0 +1,9 @@ +from backend.constants import CALCIUM, MILLIGRAMS_PER_LITER +from tests.test_sources import BaseSourceTestClass + + +class TestBoRAnalyte(BaseSourceTestClass): + + parameter = CALCIUM + units = MILLIGRAMS_PER_LITER + agency = "bor" diff --git a/tests/test_sources/test_cabq.py b/tests/test_sources/test_cabq.py new file mode 100644 index 0000000..9f3ff3c --- /dev/null +++ b/tests/test_sources/test_cabq.py @@ -0,0 +1,9 @@ +from backend.constants import WATERLEVELS, FEET +from tests.test_sources import BaseSourceTestClass + + +class TestCABQWaterlevels(BaseSourceTestClass): + + parameter = WATERLEVELS + units = FEET + agency = "cabq" diff --git a/tests/test_sources/test_ebid.py b/tests/test_sources/test_ebid.py new file mode 100644 index 0000000..6adfd6f --- /dev/null +++ b/tests/test_sources/test_ebid.py @@ -0,0 +1,9 @@ +from backend.constants import WATERLEVELS, FEET +from tests.test_sources import BaseSourceTestClass + + +class TestEBIDWaterlevels(BaseSourceTestClass): + + parameter = WATERLEVELS + units = FEET + agency = "ebid" diff --git a/tests/test_sources/test_nmbgmr_amp.py b/tests/test_sources/test_nmbgmr_amp.py new file mode 100644 index 0000000..b56fd5b --- /dev/null +++ b/tests/test_sources/test_nmbgmr_amp.py @@ -0,0 +1,33 @@ +import os +import pytest + +from backend.constants import WATERLEVELS, CALCIUM, MILLIGRAMS_PER_LITER, FEET +from tests.test_sources import BaseSourceTestClass + +os.environ["IS_TESTING_ENV"] = "True" + + +@pytest.fixture(autouse=True) +def setup(): + # SETUP CODE ----------------------------------------------------------- + os.environ["IS_TESTING_ENV"] = "True" + + # RUN TESTS 
------------------------------------------------------------ + yield + + # TEARDOWN CODE --------------------------------------------------------- + os.environ["IS_TESTING_ENV"] = "False" + + +class TestNMBGMRWaterlevels(BaseSourceTestClass): + + parameter = WATERLEVELS + units = FEET + agency = "nmbgmr_amp" + + +class TestNMBGMRAnalyte(BaseSourceTestClass): + + parameter = CALCIUM + units = MILLIGRAMS_PER_LITER + agency = "nmbgmr_amp" diff --git a/tests/test_sources/test_nmed_dwb.py b/tests/test_sources/test_nmed_dwb.py new file mode 100644 index 0000000..2a27be3 --- /dev/null +++ b/tests/test_sources/test_nmed_dwb.py @@ -0,0 +1,9 @@ +from backend.constants import CALCIUM, MILLIGRAMS_PER_LITER +from tests.test_sources import BaseSourceTestClass + + +class TestNMEDDWBAnalyte(BaseSourceTestClass): + + parameter = CALCIUM + units = MILLIGRAMS_PER_LITER + agency = "nmed_dwb" diff --git a/tests/test_sources/test_nmose_isc_seven_rivers.py b/tests/test_sources/test_nmose_isc_seven_rivers.py new file mode 100644 index 0000000..55b345e --- /dev/null +++ b/tests/test_sources/test_nmose_isc_seven_rivers.py @@ -0,0 +1,16 @@ +from backend.constants import WATERLEVELS, CALCIUM, FEET, MILLIGRAMS_PER_LITER +from tests.test_sources import BaseSourceTestClass + + +class TestNMOSEISCSevenRiversWaterlevels(BaseSourceTestClass): + + parameter = WATERLEVELS + units = FEET + agency = "nmose_isc_seven_rivers" + + +class TestNMOSEISCSevenRiversAnalyte(BaseSourceTestClass): + + parameter = CALCIUM + units = MILLIGRAMS_PER_LITER + agency = "nmose_isc_seven_rivers" diff --git a/tests/test_sources/test_nmose_roswell.py b/tests/test_sources/test_nmose_roswell.py new file mode 100644 index 0000000..585090f --- /dev/null +++ b/tests/test_sources/test_nmose_roswell.py @@ -0,0 +1,9 @@ +from backend.constants import WATERLEVELS, FEET +from tests.test_sources import BaseSourceTestClass + + +class TestNMOSERoswellWaterlevels(BaseSourceTestClass): + + parameter = WATERLEVELS + units = FEET + agency = 
"nmose_roswell" diff --git a/tests/test_sources/test_nwis.py b/tests/test_sources/test_nwis.py new file mode 100644 index 0000000..b7bf272 --- /dev/null +++ b/tests/test_sources/test_nwis.py @@ -0,0 +1,9 @@ +from backend.constants import WATERLEVELS, FEET +from tests.test_sources import BaseSourceTestClass + + +class TestNWISWaterlevels(BaseSourceTestClass): + + parameter = WATERLEVELS + units = FEET + agency = "nwis" diff --git a/tests/test_sources/test_pvacd.py b/tests/test_sources/test_pvacd.py new file mode 100644 index 0000000..715acf7 --- /dev/null +++ b/tests/test_sources/test_pvacd.py @@ -0,0 +1,9 @@ +from backend.constants import WATERLEVELS, FEET +from tests.test_sources import BaseSourceTestClass + + +class TestPVACDWaterlevels(BaseSourceTestClass): + + parameter = WATERLEVELS + units = FEET + agency = "pvacd" diff --git a/tests/test_sources/test_wqp.py b/tests/test_sources/test_wqp.py new file mode 100644 index 0000000..4f8437e --- /dev/null +++ b/tests/test_sources/test_wqp.py @@ -0,0 +1,16 @@ +from backend.constants import WATERLEVELS, CALCIUM, MILLIGRAMS_PER_LITER, FEET +from tests.test_sources import BaseSourceTestClass + + +class TestWQPWaterlevels(BaseSourceTestClass): + + parameter = WATERLEVELS + units = FEET + agency = "wqp" + + +class TestWQPAnalyte(BaseSourceTestClass): + + parameter = CALCIUM + units = MILLIGRAMS_PER_LITER + agency = "wqp"