PyPSA · MaykThewessen · Mar 25, 2026 · Mar 25, 2026
diff --git a/powerplantmatching/matching.py b/powerplantmatching/matching.py
@@ -20,6 +20,88 @@
 logger = logging.getLogger(__name__)
 
 
+def _match_by_eic(df0, df1, labels):
+    """
+    Deterministic matching of two datasets by EIC (Energy Identification Code).
+
+    Performs an exact join on EIC codes before Duke fuzzy matching, so that
+    plants with known unique identifiers are matched with certainty. This
+    prevents co-located plants with similar names but different fuels from
+    being incorrectly merged by the fuzzy matcher (e.g. Eemshavencentrale
+    coal vs Eemscentrale gas in the Netherlands).
+
+    Pairs are assigned greedily and 1-to-1. Shared codes are visited in
+    sorted order and a code only creates a pair if neither of its rows has
+    been paired by an earlier code, so the result does not depend on set
+    iteration order (which can differ between interpreter runs for strings).
+
+    Parameters
+    ----------
+    df0, df1 : pd.DataFrame
+        Source dataframes with an 'EIC' column containing sets of EIC codes
+        (as produced by ``aggregate_units``). Cells that are not ``set``
+        instances, and codes that are not non-empty strings, are ignored.
+        If a code occurs in several rows of one dataframe, the last such
+        row is the one used for that code.
+    labels : list of str
+        Two-element list of dataset names for the output columns.
+
+    Returns
+    -------
+    matches : pd.DataFrame
+        DataFrame with columns ``labels``, containing matched index pairs.
+        Empty when either dataframe lacks an 'EIC' column or no code is
+        shared.
+    matched_idx0 : set
+        Indices from df0 that were matched.
+    matched_idx1 : set
+        Indices from df1 that were matched.
+    """
+    empty = pd.DataFrame(columns=labels), set(), set()
+
+    if "EIC" not in df0.columns or "EIC" not in df1.columns:
+        return empty
+
+    def _build_eic_index(df):
+        """Map each valid EIC code to its row index (last row wins)."""
+        code_to_idx = {}
+        for row_idx, eic_set in df["EIC"].items():
+            if not isinstance(eic_set, set):
+                continue
+            for code in eic_set:
+                if isinstance(code, str) and code:
+                    code_to_idx[code] = row_idx
+        return code_to_idx
+
+    eic_to_idx0 = _build_eic_index(df0)
+    eic_to_idx1 = _build_eic_index(df1)
+
+    shared_codes = eic_to_idx0.keys() & eic_to_idx1.keys()
+    if not shared_codes:
+        return empty
+
+    # Greedy 1-to-1: first shared code claims the pair, skip already-matched.
+    # The codes are sorted because ``shared_codes`` is a set of strings whose
+    # iteration order is not stable across runs; without the sort, a row
+    # sharing codes with several rows could be paired differently each time.
+    matched_0_to_1 = {}
+    claimed_idx1 = set()
+    for code in sorted(shared_codes):
+        i0 = eic_to_idx0[code]
+        i1 = eic_to_idx1[code]
+        if i0 not in matched_0_to_1 and i1 not in claimed_idx1:
+            matched_0_to_1[i0] = i1
+            claimed_idx1.add(i1)
+
+    # ``shared_codes`` is non-empty here, so the first code visited always
+    # produced a pair and ``matched_0_to_1`` cannot be empty.
+    matches = pd.DataFrame(
+        {
+            labels[0]: list(matched_0_to_1.keys()),
+            labels[1]: list(matched_0_to_1.values()),
+        }
+    )
+    matched_idx0 = set(matched_0_to_1)
+    matched_idx1 = claimed_idx1
+
+    logger.info(
+        "EIC matching: %d deterministic matches between `%s` and `%s`",
+        len(matches),
+        labels[0],
+        labels[1],
+    )
+
+    return matches, matched_idx0, matched_idx1
+
+
 def best_matches(links):
     """
     Subsequent to duke() with singlematch=True. Returns reduced list of
@@ -77,6 +159,16 @@ def compare_two_datasets(dfs, labels, country_wise=True, config=None, **dukeargs
     if "singlematch" not in dukeargs:
         dukeargs["singlematch"] = True
 
+    # ── Deterministic EIC matching (before fuzzy) ────────────────────
+    eic_matches, matched_idx0, matched_idx1 = _match_by_eic(dfs[0], dfs[1], labels)
+
+    # Remove EIC-matched rows from the Duke input
+    remaining = [
+        dfs[0].drop(index=matched_idx0, errors="ignore"),
+        dfs[1].drop(index=matched_idx1, errors="ignore"),
+    ]
+
+    # ── Duke fuzzy matching on residual ──────────────────────────────
     def country_link(dfs, country):
         # country_selector for both dataframes
         sel_country_b = [df["Country"] == country for df in dfs]
@@ -90,20 +182,22 @@ def country_link(dfs, country):
 
     if country_wise:
         countries = config["target_countries"]
-        links = [country_link(dfs, c) for c in countries]
+        links = [country_link(remaining, c) for c in countries]
         links = [link for link in links if not link.empty]
         if links:
             links = pd.concat(links, ignore_index=True)
         else:
             links = pd.DataFrame(columns=[*labels, "scores"])
     else:
-        links = duke(dfs, labels=labels, **dukeargs)
+        links = duke(remaining, labels=labels, **dukeargs)
 
     if links.empty:
-        matches = pd.DataFrame(columns=labels)
+        duke_matches = pd.DataFrame(columns=labels)
     else:
-        matches = best_matches(links)
+        duke_matches = best_matches(links)
 
+    # ── Combine EIC + Duke matches ───────────────────────────────────
+    matches = pd.concat([eic_matches, duke_matches], ignore_index=True)
     return matches
 
 

diff --git a/test/test_matching.py b/test/test_matching.py
@@ -0,0 +1,129 @@
+# SPDX-FileCopyrightText: Contributors to powerplantmatching <https://github.com/pypsa/powerplantmatching>
+#
+# SPDX-License-Identifier: MIT
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from powerplantmatching.matching import _match_by_eic
+
+
+@pytest.fixture
+def df_entsoe():
+    """Three Dutch plants in ENTSOE-like layout, each with a set of EIC codes."""
+    names = ["Eemshavencentrale", "Eemscentrale", "Maasvlakte"]
+    fuels = ["Hard Coal", "Natural Gas", "Hard Coal"]
+    capacities = [1560.0, 2200.0, 1040.0]
+    # The second plant carries two codes; the other two carry one each.
+    eic_codes = [
+        {"49W000000000EMSA"},
+        {"49W00000000008xG", "49W00000000008xK"},
+        {"49W000000000MVSQ"},
+    ]
+    columns = {
+        "Name": names,
+        "Fueltype": fuels,
+        "Country": ["Netherlands"] * 3,
+        "Capacity": capacities,
+        "EIC": eic_codes,
+        "lat": [53.44, 53.44, 51.95],
+        "lon": [6.83, 6.84, 4.03],
+    }
+    return pd.DataFrame(columns)
+
+
+@pytest.fixture
+def df_opsd():
+    """Three Dutch plants in OPSD-like layout; the last one has no EIC code."""
+    names = ["Eemshaven coal", "Eems gas", "Rijnmond"]
+    fuels = ["Hard Coal", "Natural Gas", "Natural Gas"]
+    capacities = [1560.0, 2200.0, 800.0]
+    eic_codes = [
+        {"49W000000000EMSA"},
+        {"49W00000000008xG"},
+        set(),  # Rijnmond has no EIC
+    ]
+    columns = {
+        "Name": names,
+        "Fueltype": fuels,
+        "Country": ["Netherlands"] * 3,
+        "Capacity": capacities,
+        "EIC": eic_codes,
+        "lat": [53.44, 53.44, 51.88],
+        "lon": [6.83, 6.84, 4.50],
+    }
+    return pd.DataFrame(columns)
+
+
+def test_eic_matching_basic(df_entsoe, df_opsd):
+    """Rows that share an EIC code are paired; rows without one are left out."""
+    pairs, left_hits, right_hits = _match_by_eic(
+        df_entsoe, df_opsd, ["ENTSOE", "OPSD"]
+    )
+
+    # Expected pairs: Eemshavencentrale (0) with Eemshaven coal (0) through
+    # the EMSA code, and Eemscentrale (1) with Eems gas (1) through 008xG.
+    assert len(pairs) == 2
+    assert set(left_hits) == {0, 1}
+    assert set(right_hits) == {0, 1}
+
+    # Maasvlakte (2) and Rijnmond (2) have no code in common.
+    assert 2 not in left_hits
+    assert 2 not in right_hits
+
+
+def test_eic_matching_no_eic_column():
+    """Gracefully handles a dataset without EIC column, on either side."""
+    without_eic = pd.DataFrame({"Name": ["Plant A"], "Capacity": [100]})
+    with_eic = pd.DataFrame(
+        {"Name": ["Plant B"], "Capacity": [200], "EIC": [{"CODE1"}]}
+    )
+
+    # The column may be missing from either input; both orders must yield
+    # an empty result for all three return values.
+    for df0, df1 in [(without_eic, with_eic), (with_eic, without_eic)]:
+        matches, idx0, idx1 = _match_by_eic(df0, df1, ["A", "B"])
+        assert matches.empty
+        assert len(idx0) == 0
+        assert len(idx1) == 0
+
+
+def test_eic_matching_empty_sets():
+    """Empty EIC sets on both sides yield an empty match table."""
+    left = pd.DataFrame({"Name": ["A"], "EIC": [set()]})
+    right = pd.DataFrame({"Name": ["B"], "EIC": [set()]})
+
+    pairs, _left_hits, _right_hits = _match_by_eic(left, right, ["X", "Y"])
+
+    assert pairs.empty
+
+
+def test_eic_matching_nan_values():
+    """Float nan inside EIC sets does not produce false matches."""
+    df0 = pd.DataFrame({"Name": ["A", "B"], "EIC": [{np.nan}, {"CODE1"}]})
+    df1 = pd.DataFrame({"Name": ["X", "Y"], "EIC": [{np.nan}, {"CODE1"}]})
+
+    matches, idx0, idx1 = _match_by_eic(df0, df1, ["L", "R"])
+    # Only the CODE1 rows (index 1 on both sides) may be paired; the {nan}
+    # rows at index 0 must stay unmatched in both datasets.
+    assert len(matches) == 1
+    assert matches["L"].tolist() == [1]
+    assert matches["R"].tolist() == [1]
+    assert idx0 == {1}
+    assert idx1 == {1}
+
+
+def test_eic_matching_nan_only():
+    """EIC cells holding None instead of a set produce no matches."""
+    left = pd.DataFrame({"Name": ["A"], "EIC": [None]})
+    right = pd.DataFrame({"Name": ["B"], "EIC": [None]})
+
+    pairs, _left_hits, _right_hits = _match_by_eic(left, right, ["X", "Y"])
+
+    assert pairs.empty
+
+
+def test_eic_matching_one_to_one():
+    """A row is paired at most once even if it shares codes with two rows."""
+    # The single left row holds {C1, C2}; on the right, row 0 holds {C1}
+    # and row 1 holds {C2}.
+    left = pd.DataFrame({"Name": ["Plant A"], "EIC": [{"C1", "C2"}]})
+    right = pd.DataFrame(
+        {"Name": ["Plant X", "Plant Y"], "EIC": [{"C1"}, {"C2"}]}
+    )
+
+    pairs, _left_hits, _right_hits = _match_by_eic(left, right, ["src0", "src1"])
+
+    # Exactly one pair: Plant A with either Plant X or Plant Y.
+    assert len(pairs) == 1
+    first_pair = pairs.iloc[0]
+    assert first_pair["src0"] == 0
+    assert first_pair["src1"] in {0, 1}
+
+
+def test_eic_matching_non_set_values():
+    """EIC cells that are not sets (e.g. a bare string) take no part in matching."""
+    left = pd.DataFrame({"Name": ["A", "B"], "EIC": ["CODE1", {"CODE2"}]})
+    right = pd.DataFrame({"Name": ["X", "Y"], "EIC": [{"CODE1"}, {"CODE2"}]})
+
+    pairs, left_hits, _right_hits = _match_by_eic(left, right, ["L", "R"])
+
+    # The left CODE1 cell is a plain string rather than a set, so only the
+    # CODE2 rows can be paired.
+    assert len(pairs) == 1
+    assert 1 in left_hits