From b39b56327f5ed162d3aa5867e27c006fcb410ec4 Mon Sep 17 00:00:00 2001 From: Kelsey Smuczynski Date: Tue, 10 Feb 2026 11:35:48 -0700 Subject: [PATCH 1/6] fix(transfers): normalize OwnerKey joins with mapper and collision guard - add owners_ownerkey_mapper.json for canonical OwnerKey mapping - apply canonicalization + casefold normalization before OwnerLink join - fail fast on normalization collisions with actionable logging - document the mapping file in README --- README.md | 4 ++ transfers/contact_transfer.py | 69 +++++++++++++++++++++- transfers/data/owners_ownerkey_mapper.json | 4 ++ 3 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 transfers/data/owners_ownerkey_mapper.json diff --git a/README.md b/README.md index 8382b1f97..82be22219 100644 --- a/README.md +++ b/README.md @@ -262,6 +262,10 @@ python -m transfers.transfer Configure the `.env` file with the appropriate credentials before running transfers. +If contact transfers fail with `OwnerKey normalization collisions`, add or update +`transfers/data/owners_ownerkey_mapper.json` to map inconsistent `OwnerKey` values +to a single canonical spelling before re-running the transfer. + To drop the existing schema and rebuild from migrations before transferring data, set: ```bash diff --git a/transfers/contact_transfer.py b/transfers/contact_transfer.py index 9a2040774..37a518b33 100644 --- a/transfers/contact_transfer.py +++ b/transfers/contact_transfer.py @@ -57,6 +57,13 @@ def __init__(self, *args, **kw): with open(co_to_org_mapper_path, "r") as f: self._co_to_org_mapper = json.load(f) + ownerkey_mapper_path = get_transfers_data_path("owners_ownerkey_mapper.json") + try: + with open(ownerkey_mapper_path, "r") as f: + self._ownerkey_mapper = json.load(f) + except FileNotFoundError: + self._ownerkey_mapper = {} + self._added = [] def calculate_missing_organizations(self): @@ -78,7 +85,67 @@ def _get_dfs(self): locdf = read_csv("Location") ldf = ldf.join(locdf.set_index("LocationId"), on="LocationId") - odf = odf.join(ldf.set_index("OwnerKey"), on="OwnerKey") + owner_key_col = next( + col for col in odf.columns if col.lower().endswith("ownerkey") + ) + link_owner_key_col = next( + col for col in ldf.columns if col.lower().endswith("ownerkey") + ) + + if self._ownerkey_mapper: + odf["ownerkey_canonical"] = odf[owner_key_col].map( + lambda v: self._ownerkey_mapper.get(v, v) + ) + ldf["ownerkey_canonical"] = ldf[link_owner_key_col].map( + lambda v: self._ownerkey_mapper.get(v, v) + ) + else: + odf["ownerkey_canonical"] = odf[owner_key_col] + ldf["ownerkey_canonical"] = ldf[link_owner_key_col] + + odf["ownerkey_norm"] = ( + odf["ownerkey_canonical"] + .fillna("") + .astype(str) + .str.strip() + .str.casefold() + .replace({"": pd.NA}) + ) + ldf["ownerkey_norm"] = ( + ldf["ownerkey_canonical"] + .fillna("") + .astype(str) + .str.strip() + .str.casefold() + .replace({"": pd.NA}) + ) + + collisions = ( + ldf.groupby("ownerkey_norm")["ownerkey_canonical"] + .nunique(dropna=True) + .loc[lambda s: s > 1] + ) + if not collisions.empty: + examples = [] + for key in collisions.index[:10]: + variants = ( + ldf.loc[ldf["ownerkey_norm"] == key, "ownerkey_canonical"] + .dropna() + .unique() + .tolist() + ) + examples.append(f"{key} -> {sorted(variants)}") + logger.critical( + "OwnerKey normalization collision(s) detected in OwnerLink. " + "Resolve these before proceeding. Examples: %s", + "; ".join(examples), + ) + raise ValueError( + "OwnerKey normalization collisions detected in OwnerLink. " + "Fix source data or update owners_ownerkey_mapper.json." + ) + + odf = odf.join(ldf.set_index("ownerkey_norm"), on="ownerkey_norm") odf = replace_nans(odf) diff --git a/transfers/data/owners_ownerkey_mapper.json b/transfers/data/owners_ownerkey_mapper.json new file mode 100644 index 000000000..c4ca6e43d --- /dev/null +++ b/transfers/data/owners_ownerkey_mapper.json @@ -0,0 +1,4 @@ +{ + "Rio en Medio MDWCA": "Rio En Medio MDWCA", + "city of Rocks": "City of Rocks" +} From dad386ccb4571ffc025c6c31faa34c41019a9ac2 Mon Sep 17 00:00:00 2001 From: Kelsey Smuczynski Date: Tue, 10 Feb 2026 17:16:47 -0700 Subject: [PATCH 2/6] fix(transfers): avoid column collisions in contact OwnerLink join MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Drop overlapping OwnerLink columns before joining on normalized OwnerKey to prevent “columns overlap” errors during contact transfer. --- transfers/contact_transfer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/transfers/contact_transfer.py b/transfers/contact_transfer.py index 37a518b33..f81857df0 100644 --- a/transfers/contact_transfer.py +++ b/transfers/contact_transfer.py @@ -145,7 +145,11 @@ def _get_dfs(self): "Fix source data or update owners_ownerkey_mapper.json." ) - odf = odf.join(ldf.set_index("ownerkey_norm"), on="ownerkey_norm") + ldf_join = ldf.set_index("ownerkey_norm") + overlap_cols = [col for col in ldf_join.columns if col in odf.columns] + if overlap_cols: + ldf_join = ldf_join.drop(columns=overlap_cols, errors="ignore") + odf = odf.join(ldf_join, on="ownerkey_norm") odf = replace_nans(odf) From a3b6bcee1ea617b13c86ef360b57397977296a57 Mon Sep 17 00:00:00 2001 From: Kelsey Smuczynski Date: Wed, 11 Feb 2026 11:01:35 -0700 Subject: [PATCH 3/6] fix(transfers): replace ambiguous column matching with explicit validation - Replaced next(...endswith()) logic with explicit name matching and count validation for OwnerKey. - Impact: Prevents silent data corruption caused by non-deterministic column selection when multiple similar keys exist. --- transfers/contact_transfer.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/transfers/contact_transfer.py b/transfers/contact_transfer.py index f81857df0..f0990a226 100644 --- a/transfers/contact_transfer.py +++ b/transfers/contact_transfer.py @@ -39,6 +39,25 @@ from transfers.util import read_csv, filter_to_valid_point_ids, replace_nans +def _select_ownerkey_col(df: DataFrame, source_name: str) -> str: + exact = next((col for col in df.columns if col.lower() == "ownerkey"), None) + if exact: + return exact + + candidates = [col for col in df.columns if col.lower().endswith("ownerkey")] + if not candidates: + raise ValueError( + f"No owner key column found in {source_name}; expected a column named " + "'OwnerKey' (case-insensitive) or ending with 'OwnerKey'." + ) + if len(candidates) > 1: + raise ValueError( + f"Multiple owner key-like columns found in {source_name}: {candidates}. " + "Please disambiguate." + ) + return candidates[0] + + class ContactTransfer(ThingBasedTransferer): source_table = "OwnersData" @@ -85,12 +104,8 @@ def _get_dfs(self): locdf = read_csv("Location") ldf = ldf.join(locdf.set_index("LocationId"), on="LocationId") - owner_key_col = next( - col for col in odf.columns if col.lower().endswith("ownerkey") - ) - link_owner_key_col = next( - col for col in ldf.columns if col.lower().endswith("ownerkey") - ) + owner_key_col = _select_ownerkey_col(odf, "OwnersData") + link_owner_key_col = _select_ownerkey_col(ldf, "OwnerLink") if self._ownerkey_mapper: odf["ownerkey_canonical"] = odf[owner_key_col].map( From 19d80b2a8b2e6b62099e704d5730565073ad761b Mon Sep 17 00:00:00 2001 From: Kelsey Smuczynski Date: Wed, 11 Feb 2026 11:09:02 -0700 Subject: [PATCH 4/6] fix(transfers): warn when owner key mapper is missing Logs the expected path on FileNotFoundError to make missing mappings visible and easier to diagnose. Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- transfers/contact_transfer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/transfers/contact_transfer.py b/transfers/contact_transfer.py index f81857df0..0c0ba3171 100644 --- a/transfers/contact_transfer.py +++ b/transfers/contact_transfer.py @@ -62,6 +62,10 @@ def __init__(self, *args, **kw): with open(ownerkey_mapper_path, "r") as f: self._ownerkey_mapper = json.load(f) except FileNotFoundError: + logger.warning( + "Owner key mapper file not found at '%s'; proceeding with empty owner key mapping.", + ownerkey_mapper_path, + ) self._ownerkey_mapper = {} self._added = [] From 1e4b7779bd1bd6ce76cb9a9d2453da75cce9acb7 Mon Sep 17 00:00:00 2001 From: Kelsey Smuczynski Date: Wed, 11 Feb 2026 11:37:17 -0700 Subject: [PATCH 5/6] perf(transfers): speed up owner key mapping Use vectorized replacement instead of per-row lambdas. Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- transfers/contact_transfer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transfers/contact_transfer.py b/transfers/contact_transfer.py index ede668250..e013b4386 100644 --- a/transfers/contact_transfer.py +++ b/transfers/contact_transfer.py @@ -112,11 +112,11 @@ def _get_dfs(self): link_owner_key_col = _select_ownerkey_col(ldf, "OwnerLink") if self._ownerkey_mapper: - odf["ownerkey_canonical"] = odf[owner_key_col].map( - lambda v: self._ownerkey_mapper.get(v, v) + odf["ownerkey_canonical"] = odf[owner_key_col].replace( + self._ownerkey_mapper ) - ldf["ownerkey_canonical"] = ldf[link_owner_key_col].map( - lambda v: self._ownerkey_mapper.get(v, v) + ldf["ownerkey_canonical"] = ldf[link_owner_key_col].replace( + self._ownerkey_mapper ) else: odf["ownerkey_canonical"] = odf[owner_key_col] From 5b10a0576b5965a95b8ba941ac83267960df2bb3 Mon Sep 17 00:00:00 2001 From: Kelsey Smuczynski Date: Wed, 11 Feb 2026 11:52:44 -0700 Subject: [PATCH 6/6] fix(transfers): avoid unclear owner key choice Stops if multiple case-variant OwnerKey columns are present. Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- transfers/contact_transfer.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/transfers/contact_transfer.py b/transfers/contact_transfer.py index e013b4386..0acedb57f 100644 --- a/transfers/contact_transfer.py +++ b/transfers/contact_transfer.py @@ -40,9 +40,15 @@ def _select_ownerkey_col(df: DataFrame, source_name: str) -> str: - exact = next((col for col in df.columns if col.lower() == "ownerkey"), None) - if exact: - return exact + exact_matches = [col for col in df.columns if col.lower() == "ownerkey"] + if len(exact_matches) == 1: + return exact_matches[0] + if len(exact_matches) > 1: + raise ValueError( + f"Multiple 'OwnerKey' columns found in {source_name}: {exact_matches}. " + "Column names differing only by case are ambiguous; please " + "disambiguate." + ) candidates = [col for col in df.columns if col.lower().endswith("ownerkey")] if not candidates: