From 5ef99a362efa7d86126a65c0d2da124e9c446fc7 Mon Sep 17 00:00:00 2001
From: Chryseis Liu <130321313+Chryseisliu@users.noreply.github.com>
Date: Mon, 22 Jun 2026 16:30:22 +0800
Subject: [PATCH] Fix stringified nulls in UID mapping

---
Review notes (not part of the commit; `git am` ignores the text between the
three-dash line and the first `diff --git`):

* Line structure was rebuilt from a whitespace-collapsed copy of this patch.
  Indentation and blank context lines are inferred from Python syntax and the
  hunk-header counts. Run `git apply --check` before relying on it.
* The collapsed copy carried three blank `+` lines in front of
  `class TestInterReactionConnectivity` (47 additions), while the hunk header
  (-146,6 +148,52) and the diffstat (48 for the test file) both call for 46.
  One blank addition was dropped so the hunk matches its header.
* `_is_missing_value` already returns True for strings that strip to "", so
  the trailing empty-string check in `_is_missing_reference_value` can never
  change the result; as written the two helpers are equivalent.
* `_get_non_null_values` now drops stringified nulls too, and after this patch
  `_build_uid_index` no longer calls it. Any remaining callers are not visible
  in this diff; confirm they want the stricter filter.
* `stoichiometry` is still guarded by `pd.isna(...)` only, so a stringified
  null in that column would reach `int()` and raise ValueError.
  TODO confirm whether cached CSV reloads can stringify that column.
* `_resolve_to_terminal_reactome_ids` is not touched here; the expected
  `{"R-HSA-1": 2, "R-HSA-2": 12}` in the new test depends on its existing
  behaviour.

 src/logic_network_generator.py        | 32 +++++++++++++++---
 tests/test_logic_network_generator.py | 48 +++++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 5 deletions(-)

diff --git a/src/logic_network_generator.py b/src/logic_network_generator.py
index 44d88cd..40b2b0e 100755
--- a/src/logic_network_generator.py
+++ b/src/logic_network_generator.py
@@ -236,7 +236,23 @@ def get_negative_regulators_for_reaction(
 
 def _get_non_null_values(df: pd.DataFrame, column: str) -> List[Any]:
     """Extract non-null values from a DataFrame column."""
-    return [value for value in df[column].tolist() if pd.notna(value)]
+    return [value for value in df[column].tolist() if not _is_missing_value(value)]
+
+
+def _is_missing_value(value: Any) -> bool:
+    """Return True for real nulls and stringified nulls from cached CSV reloads."""
+    if pd.isna(value):
+        return True
+    if isinstance(value, str) and value.strip() in {"None", "nan", "NaN", ""}:
+        return True
+    return False
+
+
+def _is_missing_reference_value(value: Any) -> bool:
+    """Return True when a UID/reference field should not become a graph ID."""
+    if _is_missing_value(value):
+        return True
+    return isinstance(value, str) and value.strip() == ""
 
 
 def _get_hash_for_reaction(reaction_id_map: pd.DataFrame, uid: str, hash_type: str) -> str:
@@ -254,15 +270,21 @@ def _build_uid_index(decomposed_uid_mapping: pd.DataFrame) -> Dict[str, tuple]:
     """
     index: Dict[str, tuple] = {}
     for uid_val, group in decomposed_uid_mapping.groupby("uid"):
-        nested_uids = _get_non_null_values(group, "input_or_output_uid")
-        terminal_ids = _get_non_null_values(group, "input_or_output_reactome_id")
+        nested_uids = [
+            value for value in group["input_or_output_uid"].tolist()
+            if not _is_missing_reference_value(value)
+        ]
+        terminal_ids = [
+            value for value in group["input_or_output_reactome_id"].tolist()
+            if not _is_missing_reference_value(value)
+        ]
         stoich_map: Dict[str, int] = {}
         for _, row in group.iterrows():
             stoich_raw = row.get("stoichiometry")
             stoich = 1 if stoich_raw is None or pd.isna(stoich_raw) else int(stoich_raw)
-            if pd.notna(row.get("input_or_output_uid")):
+            if not _is_missing_reference_value(row.get("input_or_output_uid")):
                 stoich_map[row["input_or_output_uid"]] = stoich
-            if pd.notna(row.get("input_or_output_reactome_id")):
+            if not _is_missing_reference_value(row.get("input_or_output_reactome_id")):
                 stoich_map[row["input_or_output_reactome_id"]] = stoich
         index[str(uid_val)] = (nested_uids, terminal_ids, stoich_map)
     return index
diff --git a/tests/test_logic_network_generator.py b/tests/test_logic_network_generator.py
index 1fb9eac..f05eb67 100644
--- a/tests/test_logic_network_generator.py
+++ b/tests/test_logic_network_generator.py
@@ -15,11 +15,13 @@
 with patch('py2neo.Graph'):
     from src.logic_network_generator import (
         _assign_uuids,
+        _build_uid_index,
         _build_entity_producer_count,
         _canonicalize_registry,
         _emit_boundary_decomposition_edges,
         _register_entity_uuid,
         _get_or_create_entity_uuid,
+        _resolve_to_terminal_reactome_ids,
         _resolve_vr_entities,
     )
 
@@ -146,6 +148,52 @@ def test_single_producer_returns_one(self):
         assert count["Y"] == 1
 
 
+class TestUidIndex:
+    """Tests for cached decomposition UID resolution."""
+
+    def test_stringified_nulls_are_not_treated_as_entities(self):
+        """Cached CSV reloads can contain literal 'None' strings; skip them."""
+        decomposed = pd.DataFrame(
+            [
+                {
+                    "uid": "hash-1",
+                    "input_or_output_uid": "None",
+                    "input_or_output_reactome_id": "R-HSA-1",
+                    "stoichiometry": 2,
+                },
+                {
+                    "uid": "hash-1",
+                    "input_or_output_uid": "",
+                    "input_or_output_reactome_id": "nan",
+                    "stoichiometry": 1,
+                },
+                {
+                    "uid": "hash-1",
+                    "input_or_output_uid": "hash-2",
+                    "input_or_output_reactome_id": "",
+                    "stoichiometry": 3,
+                },
+                {
+                    "uid": "hash-2",
+                    "input_or_output_uid": "NaN",
+                    "input_or_output_reactome_id": "R-HSA-2",
+                    "stoichiometry": 4,
+                },
+            ]
+        )
+
+        uid_index = _build_uid_index(decomposed)
+
+        assert uid_index["hash-1"][0] == ["hash-2"]
+        assert uid_index["hash-1"][1] == ["R-HSA-1"]
+        assert "None" not in uid_index["hash-1"][2]
+        assert "nan" not in uid_index["hash-1"][2]
+
+        resolved = _resolve_to_terminal_reactome_ids(uid_index, "hash-1")
+
+        assert resolved == {"R-HSA-1": 2, "R-HSA-2": 12}
+
+
 class TestInterReactionConnectivity:
     """Tests for inter-reaction entity UUID connectivity (3-phase approach).
 