Skip to content

Commit 572139e

Browse files
committed
use ActivityIdentifier to identify duplicate records
1 parent d972ba8 commit 572139e

2 files changed

Lines changed: 33 additions & 21 deletions

File tree

backend/connectors/wqp/source.py

Lines changed: 28 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,10 @@
3030
LATEST,
3131
TDS,
3232
WATERLEVELS,
33+
USGS_PCODE_30210,
34+
USGS_PCODE_70300,
35+
USGS_PCODE_70301,
36+
USGS_PCODE_70303,
3337
)
3438
from backend.connectors.wqp.transformer import (
3539
WQPSiteTransformer,
@@ -99,7 +103,7 @@ def get_records(self):
99103
else:
100104
# every record with pCode 30210 (depth in m) has a corresponding
101105
# record with pCode 72019 (depth in ft) but not vice versa
102-
params["pCode"] = "30210"
106+
params["pCode"] = USGS_PCODE_30210
103107

104108
params.update(get_date_range(config))
105109

@@ -132,40 +136,43 @@ def _extract_source_parameter_results(self, records):
132136
return [ri["ResultMeasureValue"] for ri in records]
133137

134138
def _clean_records(self, records):
135-
records_with_values = [r for r in records if r["ResultMeasureValue"]]
139+
clean_records = [r for r in records if r["ResultMeasureValue"]]
136140

137-
if self.config.parameter == TDS and len(records_with_values) > 1:
138-
site_id = records_with_values[0]["MonitoringLocationIdentifier"]
141+
if self.config.parameter == TDS and len(clean_records) > 1:
142+
site_id = clean_records[0]["MonitoringLocationIdentifier"]
139143
return_records = []
140-
dates = [record["ActivityStartDate"] for record in records]
141-
dates = list(set(dates))
142-
for date in dates:
143-
# get all records for this date
144-
date_records = {
144+
activity_identifiers = [record["ActivityIdentifier"] for record in records]
145+
activity_identifiers = list(set(activity_identifiers))
146+
for activity_identifier in activity_identifiers:
147+
# get all records for this activity identifier
148+
ai_records = {
145149
record["USGSPCode"]: record
146150
for record in records
147-
if record["ActivityStartDate"] == date
151+
if record["ActivityIdentifier"] == activity_identifier
148152
}
149-
if len(date_records.items()) > 1:
150-
if "70301" in date_records.keys():
151-
kept_record = date_records["70301"]
152-
pcode = "70301"
153-
elif "70303" in date_records.keys():
154-
kept_record = date_records["70303"]
155-
pcode = "70303"
153+
if len(ai_records.items()) > 1:
154+
if USGS_PCODE_70300 in ai_records.keys():
155+
kept_record = ai_records[USGS_PCODE_70300]
156+
pcode = USGS_PCODE_70300
157+
elif USGS_PCODE_70301 in ai_records.keys():
158+
kept_record = ai_records[USGS_PCODE_70301]
159+
pcode = USGS_PCODE_70301
160+
elif USGS_PCODE_70303 in ai_records.keys():
161+
kept_record = ai_records[USGS_PCODE_70303]
162+
pcode = USGS_PCODE_70303
156163
else:
157164
raise ValueError(
158-
f"Multiple TDS records found for {site_id} on date {date} but no 70301 or 70303 pcodes found."
165+
f"Multiple TDS records found for {site_id} with ActivityIdentifier {activity_identifier} but no 70300, 70301, or 70303 pcodes found."
159166
)
160167
self.log(
161-
f"Removing duplicates for {site_id} on date {date}. Keeping record with pcode {pcode}."
168+
f"Removing duplicates for {site_id} with ActivityIdentifier {activity_identifier}. Keeping record with pcode {pcode}."
162169
)
163170
else:
164-
kept_record = list(date_records.values())[0]
171+
kept_record = list(ai_records.values())[0]
165172
return_records.append(kept_record)
166173
return return_records
167174
else:
168-
return records_with_values
175+
return clean_records
169176

170177
def _extract_source_parameter_units(self, records):
171178
return [ri["ResultMeasure/MeasureUnitCode"] for ri in records]

backend/constants.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,11 @@
5757
SOURCE_PARAMETER_UNITS = "source_parameter_units"
5858
CONVERSION_FACTOR = "conversion_factor"
5959

60+
USGS_PCODE_30210 = "30210"
61+
USGS_PCODE_70300 = "70300"
62+
USGS_PCODE_70301 = "70301"
63+
USGS_PCODE_70303 = "70303"
64+
6065
ANALYTE_OPTIONS = sorted(
6166
[
6267
ARSENIC,

0 commit comments

Comments (0)