Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,10 @@ python -m transfers.transfer

Configure the `.env` file with the appropriate credentials before running transfers.

If contact transfers fail with `OwnerKey normalization collisions`, add or update
`transfers/data/owners_ownerkey_mapper.json` to map inconsistent `OwnerKey` values
to a single canonical spelling before re-running the transfer.

To drop the existing schema and rebuild from migrations before transferring data, set:

```bash
Expand Down
98 changes: 97 additions & 1 deletion transfers/contact_transfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,31 @@
from transfers.util import read_csv, filter_to_valid_point_ids, replace_nans


def _select_ownerkey_col(df: DataFrame, source_name: str) -> str:
exact_matches = [col for col in df.columns if col.lower() == "ownerkey"]
if len(exact_matches) == 1:
return exact_matches[0]
if len(exact_matches) > 1:
raise ValueError(
f"Multiple 'OwnerKey' columns found in {source_name}: {exact_matches}. "
"Column names differing only by case are ambiguous; please "
"disambiguate."
)

candidates = [col for col in df.columns if col.lower().endswith("ownerkey")]
if not candidates:
raise ValueError(
f"No owner key column found in {source_name}; expected a column named "
"'OwnerKey' (case-insensitive) or ending with 'OwnerKey'."
)
if len(candidates) > 1:
raise ValueError(
f"Multiple owner key-like columns found in {source_name}: {candidates}. "
"Please disambiguate."
)
return candidates[0]


class ContactTransfer(ThingBasedTransferer):
source_table = "OwnersData"

Expand All @@ -57,6 +82,17 @@ def __init__(self, *args, **kw):
with open(co_to_org_mapper_path, "r") as f:
self._co_to_org_mapper = json.load(f)

ownerkey_mapper_path = get_transfers_data_path("owners_ownerkey_mapper.json")
try:
with open(ownerkey_mapper_path, "r") as f:
self._ownerkey_mapper = json.load(f)
except FileNotFoundError:
logger.warning(
"Owner key mapper file not found at '%s'; proceeding with empty owner key mapping.",
ownerkey_mapper_path,
)
self._ownerkey_mapper = {}
Comment on lines 82 to +94
Copy link

Copilot AI Feb 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

JSON files should be opened with an explicit encoding to avoid platform-dependent defaults. Use encoding="utf-8" for both mapper reads.

Copilot uses AI. Check for mistakes.

self._added = []

def calculate_missing_organizations(self):
Expand All @@ -78,7 +114,67 @@ def _get_dfs(self):
locdf = read_csv("Location")
ldf = ldf.join(locdf.set_index("LocationId"), on="LocationId")

odf = odf.join(ldf.set_index("OwnerKey"), on="OwnerKey")
owner_key_col = _select_ownerkey_col(odf, "OwnersData")
link_owner_key_col = _select_ownerkey_col(ldf, "OwnerLink")

if self._ownerkey_mapper:
odf["ownerkey_canonical"] = odf[owner_key_col].replace(
self._ownerkey_mapper
)
ldf["ownerkey_canonical"] = ldf[link_owner_key_col].replace(
self._ownerkey_mapper
)
else:
odf["ownerkey_canonical"] = odf[owner_key_col]
ldf["ownerkey_canonical"] = ldf[link_owner_key_col]

odf["ownerkey_norm"] = (
odf["ownerkey_canonical"]
.fillna("")
.astype(str)
.str.strip()
.str.casefold()
.replace({"": pd.NA})
)
ldf["ownerkey_norm"] = (
ldf["ownerkey_canonical"]
.fillna("")
.astype(str)
.str.strip()
.str.casefold()
.replace({"": pd.NA})
)
Comment on lines +131 to +146
Copy link

Copilot AI Feb 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The helper columns ownerkey_canonical and ownerkey_norm are added to odf and can leak into subsequent processing/output. If they’re only intended for joining, consider dropping them after the join (or renaming to clearly-internal names like _ownerkey_norm) to keep the resulting dataframe schema stable.

Copilot uses AI. Check for mistakes.
Comment on lines +131 to +146
Copy link

Copilot AI Feb 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The normalization logic is duplicated for odf and ldf, which increases the chance of future drift (e.g., one side adding a transform but not the other). Consider extracting this into a small helper (e.g., _normalize_ownerkey(series)) and using it for both frames.

Copilot uses AI. Check for mistakes.

collisions = (
ldf.groupby("ownerkey_norm")["ownerkey_canonical"]
.nunique(dropna=True)
.loc[lambda s: s > 1]
)
if not collisions.empty:
examples = []
for key in collisions.index[:10]:
variants = (
ldf.loc[ldf["ownerkey_norm"] == key, "ownerkey_canonical"]
.dropna()
.unique()
.tolist()
)
examples.append(f"{key} -> {sorted(variants)}")
logger.critical(
"OwnerKey normalization collision(s) detected in OwnerLink. "
"Resolve these before proceeding. Examples: %s",
"; ".join(examples),
)
raise ValueError(
"OwnerKey normalization collisions detected in OwnerLink. "
"Fix source data or update owners_ownerkey_mapper.json."
)

ldf_join = ldf.set_index("ownerkey_norm")
overlap_cols = [col for col in ldf_join.columns if col in odf.columns]
if overlap_cols:
ldf_join = ldf_join.drop(columns=overlap_cols, errors="ignore")
odf = odf.join(ldf_join, on="ownerkey_norm")

odf = replace_nans(odf)

Expand Down
4 changes: 4 additions & 0 deletions transfers/data/owners_ownerkey_mapper.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"Rio en Medio MDWCA": "Rio En Medio MDWCA",
"city of Rocks": "City of Rocks"
}
Loading