-
Notifications
You must be signed in to change notification settings - Fork 4
fix(transfers): normalize OwnerKey joins with mapper and collision guard #482
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b39b563
dad386c
a3b6bce
19d80b2
419eddf
1e4b777
5b10a05
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,6 +39,31 @@ | |
| from transfers.util import read_csv, filter_to_valid_point_ids, replace_nans | ||
|
|
||
|
|
||
| def _select_ownerkey_col(df: DataFrame, source_name: str) -> str: | ||
| exact_matches = [col for col in df.columns if col.lower() == "ownerkey"] | ||
| if len(exact_matches) == 1: | ||
| return exact_matches[0] | ||
| if len(exact_matches) > 1: | ||
| raise ValueError( | ||
| f"Multiple 'OwnerKey' columns found in {source_name}: {exact_matches}. " | ||
| "Column names differing only by case are ambiguous; please " | ||
| "disambiguate." | ||
| ) | ||
|
|
||
| candidates = [col for col in df.columns if col.lower().endswith("ownerkey")] | ||
| if not candidates: | ||
| raise ValueError( | ||
| f"No owner key column found in {source_name}; expected a column named " | ||
| "'OwnerKey' (case-insensitive) or ending with 'OwnerKey'." | ||
| ) | ||
| if len(candidates) > 1: | ||
| raise ValueError( | ||
| f"Multiple owner key-like columns found in {source_name}: {candidates}. " | ||
| "Please disambiguate." | ||
| ) | ||
| return candidates[0] | ||
|
|
||
|
|
||
| class ContactTransfer(ThingBasedTransferer): | ||
| source_table = "OwnersData" | ||
|
|
||
|
|
@@ -57,6 +82,17 @@ def __init__(self, *args, **kw): | |
| with open(co_to_org_mapper_path, "r") as f: | ||
| self._co_to_org_mapper = json.load(f) | ||
|
|
||
| ownerkey_mapper_path = get_transfers_data_path("owners_ownerkey_mapper.json") | ||
| try: | ||
| with open(ownerkey_mapper_path, "r") as f: | ||
| self._ownerkey_mapper = json.load(f) | ||
| except FileNotFoundError: | ||
| logger.warning( | ||
| "Owner key mapper file not found at '%s'; proceeding with empty owner key mapping.", | ||
| ownerkey_mapper_path, | ||
| ) | ||
| self._ownerkey_mapper = {} | ||
ksmuczynski marked this conversation as resolved.
Show resolved
Hide resolved
Comment on lines
82
to
+94
|
||
|
|
||
| self._added = [] | ||
|
|
||
| def calculate_missing_organizations(self): | ||
|
|
@@ -78,7 +114,67 @@ def _get_dfs(self): | |
| locdf = read_csv("Location") | ||
| ldf = ldf.join(locdf.set_index("LocationId"), on="LocationId") | ||
|
|
||
| odf = odf.join(ldf.set_index("OwnerKey"), on="OwnerKey") | ||
| owner_key_col = _select_ownerkey_col(odf, "OwnersData") | ||
| link_owner_key_col = _select_ownerkey_col(ldf, "OwnerLink") | ||
|
|
||
| if self._ownerkey_mapper: | ||
| odf["ownerkey_canonical"] = odf[owner_key_col].replace( | ||
| self._ownerkey_mapper | ||
| ) | ||
| ldf["ownerkey_canonical"] = ldf[link_owner_key_col].replace( | ||
| self._ownerkey_mapper | ||
| ) | ||
| else: | ||
| odf["ownerkey_canonical"] = odf[owner_key_col] | ||
| ldf["ownerkey_canonical"] = ldf[link_owner_key_col] | ||
|
|
||
| odf["ownerkey_norm"] = ( | ||
| odf["ownerkey_canonical"] | ||
| .fillna("") | ||
| .astype(str) | ||
| .str.strip() | ||
| .str.casefold() | ||
| .replace({"": pd.NA}) | ||
| ) | ||
| ldf["ownerkey_norm"] = ( | ||
| ldf["ownerkey_canonical"] | ||
| .fillna("") | ||
| .astype(str) | ||
| .str.strip() | ||
| .str.casefold() | ||
| .replace({"": pd.NA}) | ||
| ) | ||
|
Comment on lines
+131
to
+146
|
||
|
|
||
| collisions = ( | ||
| ldf.groupby("ownerkey_norm")["ownerkey_canonical"] | ||
| .nunique(dropna=True) | ||
| .loc[lambda s: s > 1] | ||
| ) | ||
| if not collisions.empty: | ||
| examples = [] | ||
| for key in collisions.index[:10]: | ||
| variants = ( | ||
| ldf.loc[ldf["ownerkey_norm"] == key, "ownerkey_canonical"] | ||
| .dropna() | ||
| .unique() | ||
| .tolist() | ||
| ) | ||
| examples.append(f"{key} -> {sorted(variants)}") | ||
| logger.critical( | ||
| "OwnerKey normalization collision(s) detected in OwnerLink. " | ||
| "Resolve these before proceeding. Examples: %s", | ||
| "; ".join(examples), | ||
| ) | ||
| raise ValueError( | ||
| "OwnerKey normalization collisions detected in OwnerLink. " | ||
| "Fix source data or update owners_ownerkey_mapper.json." | ||
ksmuczynski marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| ) | ||
|
|
||
ksmuczynski marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| ldf_join = ldf.set_index("ownerkey_norm") | ||
| overlap_cols = [col for col in ldf_join.columns if col in odf.columns] | ||
| if overlap_cols: | ||
| ldf_join = ldf_join.drop(columns=overlap_cols, errors="ignore") | ||
| odf = odf.join(ldf_join, on="ownerkey_norm") | ||
ksmuczynski marked this conversation as resolved.
Show resolved
Hide resolved
ksmuczynski marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
ksmuczynski marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| odf = replace_nans(odf) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| { | ||
| "Rio en Medio MDWCA": "Rio En Medio MDWCA", | ||
| "city of Rocks": "City of Rocks" | ||
| } |
Uh oh!
There was an error while loading. Please reload this page.