Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 98 additions & 4 deletions powerplantmatching/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,88 @@
logger = logging.getLogger(__name__)


def _match_by_eic(df0, df1, labels):
    """
    Deterministic matching of two datasets by EIC (Energy Identification Code).

    Performs an exact join on EIC codes before Duke fuzzy matching, so that
    plants with known unique identifiers are matched with certainty. This
    prevents co-located plants with similar names but different fuels from
    being incorrectly merged by the fuzzy matcher (e.g. Eemshavencentrale
    coal vs Eemscentrale gas in the Netherlands).

    Matching is greedy and 1-to-1: shared codes are visited in sorted order
    and a code only creates a pair when neither of its two rows has been
    paired yet. Sorting makes the outcome reproducible between runs; plain
    set iteration over strings would depend on per-process hash
    randomisation whenever one row shares codes with several rows.

    Parameters
    ----------
    df0, df1 : pd.DataFrame
        Source dataframes with an 'EIC' column containing sets of EIC codes
        (as produced by ``aggregate_units``). Cells that are not a ``set`` or
        ``frozenset`` are ignored, as are codes that are not non-empty
        strings (e.g. ``nan``).
    labels : list of str
        Two-element list of dataset names for the output columns.

    Returns
    -------
    matches : pd.DataFrame
        DataFrame with columns ``labels``, containing matched index pairs.
    matched_idx0 : set
        Indices from df0 that were matched.
    matched_idx1 : set
        Indices from df1 that were matched.
    """
    empty = pd.DataFrame(columns=labels), set(), set()

    if "EIC" not in df0.columns or "EIC" not in df1.columns:
        return empty

    def _build_eic_index(df):
        """Map each valid EIC code to its row index (last row wins on duplicates)."""
        code_to_idx = {}
        for row_idx, eic_set in df["EIC"].items():
            # Raw strings, None, nan, lists ... are not trusted as code sets.
            if not isinstance(eic_set, (set, frozenset)):
                continue
            for code in eic_set:
                if isinstance(code, str) and code:
                    code_to_idx[code] = row_idx
        return code_to_idx

    eic_to_idx0 = _build_eic_index(df0)
    eic_to_idx1 = _build_eic_index(df1)

    shared_codes = eic_to_idx0.keys() & eic_to_idx1.keys()
    if not shared_codes:
        return empty

    # Greedy 1-to-1: first shared code claims the pair, skip already-matched.
    # Sorted iteration keeps "first" stable across interpreter runs.
    matched_0_to_1 = {}
    claimed_idx1 = set()
    for code in sorted(shared_codes):
        i0 = eic_to_idx0[code]
        i1 = eic_to_idx1[code]
        if i0 not in matched_0_to_1 and i1 not in claimed_idx1:
            matched_0_to_1[i0] = i1
            claimed_idx1.add(i1)

    if not matched_0_to_1:
        return empty

    matches = pd.DataFrame(
        {
            labels[0]: list(matched_0_to_1.keys()),
            labels[1]: list(matched_0_to_1.values()),
        }
    )
    matched_idx0 = set(matched_0_to_1)
    matched_idx1 = claimed_idx1

    logger.info(
        "EIC matching: %d deterministic matches between `%s` and `%s`",
        len(matches),
        labels[0],
        labels[1],
    )

    return matches, matched_idx0, matched_idx1


def best_matches(links):
"""
Subsequent to duke() with singlematch=True. Returns reduced list of
Expand Down Expand Up @@ -77,6 +159,16 @@ def compare_two_datasets(dfs, labels, country_wise=True, config=None, **dukeargs
if "singlematch" not in dukeargs:
dukeargs["singlematch"] = True

# ── Deterministic EIC matching (before fuzzy) ────────────────────
eic_matches, matched_idx0, matched_idx1 = _match_by_eic(dfs[0], dfs[1], labels)

# Remove EIC-matched rows from the Duke input
remaining = [
dfs[0].drop(index=matched_idx0, errors="ignore"),
dfs[1].drop(index=matched_idx1, errors="ignore"),
]

# ── Duke fuzzy matching on residual ──────────────────────────────
def country_link(dfs, country):
# country_selector for both dataframes
sel_country_b = [df["Country"] == country for df in dfs]
Expand All @@ -90,20 +182,22 @@ def country_link(dfs, country):

if country_wise:
countries = config["target_countries"]
links = [country_link(dfs, c) for c in countries]
links = [country_link(remaining, c) for c in countries]
links = [link for link in links if not link.empty]
if links:
links = pd.concat(links, ignore_index=True)
else:
links = pd.DataFrame(columns=[*labels, "scores"])
else:
links = duke(dfs, labels=labels, **dukeargs)
links = duke(remaining, labels=labels, **dukeargs)

if links.empty:
matches = pd.DataFrame(columns=labels)
duke_matches = pd.DataFrame(columns=labels)
else:
matches = best_matches(links)
duke_matches = best_matches(links)

# ── Combine EIC + Duke matches ───────────────────────────────────
matches = pd.concat([eic_matches, duke_matches], ignore_index=True)
return matches


Expand Down
129 changes: 129 additions & 0 deletions test/test_matching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# SPDX-FileCopyrightText: Contributors to powerplantmatching <https://github.com/pypsa/powerplantmatching>
#
# SPDX-License-Identifier: MIT

import numpy as np
import pandas as pd
import pytest

from powerplantmatching.matching import _match_by_eic


@pytest.fixture
def df_entsoe():
    """Three Dutch plants in ENTSOE style; each ``EIC`` cell is a set of codes."""
    eic_codes = [
        {"49W000000000EMSA"},
        {"49W00000000008xG", "49W00000000008xK"},
        {"49W000000000MVSQ"},
    ]
    # Keyword order fixes the column order of the resulting frame.
    columns = dict(
        Name=["Eemshavencentrale", "Eemscentrale", "Maasvlakte"],
        Fueltype=["Hard Coal", "Natural Gas", "Hard Coal"],
        Country=["Netherlands"] * 3,
        Capacity=[1560.0, 2200.0, 1040.0],
        EIC=eic_codes,
        lat=[53.44, 53.44, 51.95],
        lon=[6.83, 6.84, 4.03],
    )
    return pd.DataFrame(columns)


@pytest.fixture
def df_opsd():
    """Three Dutch plants in OPSD style; each ``EIC`` cell is a set of codes."""
    eic_codes = [
        {"49W000000000EMSA"},
        {"49W00000000008xG"},
        set(),  # Rijnmond has no EIC
    ]
    # Keyword order fixes the column order of the resulting frame.
    columns = dict(
        Name=["Eemshaven coal", "Eems gas", "Rijnmond"],
        Fueltype=["Hard Coal", "Natural Gas", "Natural Gas"],
        Country=["Netherlands"] * 3,
        Capacity=[1560.0, 2200.0, 800.0],
        EIC=eic_codes,
        lat=[53.44, 53.44, 51.88],
        lon=[6.83, 6.84, 4.50],
    )
    return pd.DataFrame(columns)


def test_eic_matching_basic(df_entsoe, df_opsd):
    """EIC matching pairs exactly the plants that share an EIC code."""
    labels = ["ENTSOE", "OPSD"]
    matches, idx0, idx1 = _match_by_eic(df_entsoe, df_opsd, labels)

    # Eemshavencentrale (0) ↔ Eemshaven coal (0) via EMSA
    # Eemscentrale (1) ↔ Eems gas (1) via 008xG
    # Compare as a set of pairs: checking only the index sets would also
    # accept a crossed pairing (0 ↔ 1, 1 ↔ 0), and the row order of
    # `matches` is not something this test should depend on.
    pairs = {
        (int(i0), int(i1)) for i0, i1 in zip(matches["ENTSOE"], matches["OPSD"])
    }
    assert pairs == {(0, 0), (1, 1)}
    assert len(matches) == 2
    assert list(matches.columns) == labels

    # Maasvlakte (2) and Rijnmond (2) should NOT match (no shared EIC)
    assert idx0 == {0, 1}
    assert idx1 == {0, 1}


def test_eic_matching_no_eic_column():
    """A dataset without an EIC column yields an empty result, not an error."""
    df0 = pd.DataFrame({"Name": ["Plant A"], "Capacity": [100]})
    df1 = pd.DataFrame({"Name": ["Plant B"], "Capacity": [200], "EIC": [{"CODE1"}]})

    matches, idx0, idx1 = _match_by_eic(df0, df1, ["A", "B"])
    assert matches.empty
    # The empty frame still carries the label columns so it can be concatenated.
    assert list(matches.columns) == ["A", "B"]
    # Neither side may report matched rows.
    assert len(idx0) == 0
    assert len(idx1) == 0


def test_eic_matching_empty_sets():
    """Rows whose EIC sets are all empty cannot be matched."""
    left, right = (
        pd.DataFrame({"Name": [name], "EIC": [set()]}) for name in ("A", "B")
    )

    result = _match_by_eic(left, right, ["X", "Y"])
    # First element of the returned triple is the matches frame.
    assert result[0].empty


def test_eic_matching_nan_values():
    """Float nan inside EIC sets does not produce false matches."""
    df0 = pd.DataFrame({"Name": ["A", "B"], "EIC": [{np.nan}, {"CODE1"}]})
    df1 = pd.DataFrame({"Name": ["X", "Y"], "EIC": [{np.nan}, {"CODE1"}]})

    matches, idx0, idx1 = _match_by_eic(df0, df1, ["L", "R"])
    # Only CODE1 (row 1 on both sides) should match, not nan
    assert len(matches) == 1
    assert matches["L"].tolist() == [1]
    assert matches["R"].tolist() == [1]
    # Rows holding {nan} stay unmatched on both sides
    assert idx0 == {1}
    assert idx1 == {1}


def test_eic_matching_nan_only():
    """EIC cells holding ``None`` instead of a set produce no matches."""
    left, right = (
        pd.DataFrame({"Name": [name], "EIC": [None]}) for name in ("A", "B")
    )

    result = _match_by_eic(left, right, ["X", "Y"])
    # First element of the returned triple is the matches frame.
    assert result[0].empty


def test_eic_matching_one_to_one():
    """Enforces 1-to-1: each row matches at most once."""
    # Plant A has {C1, C2}; Plant X has {C1}, Plant Y has {C2}
    df0 = pd.DataFrame({"Name": ["Plant A"], "EIC": [{"C1", "C2"}]})
    df1 = pd.DataFrame({"Name": ["Plant X", "Plant Y"], "EIC": [{"C1"}, {"C2"}]})

    matches, idx0, idx1 = _match_by_eic(df0, df1, ["src0", "src1"])

    # Plant A should match exactly one of X or Y (1-to-1 constraint)
    assert len(matches) == 1
    assert matches["src0"].iloc[0] == 0
    partner = int(matches["src1"].iloc[0])
    assert partner in {0, 1}

    # The returned index sets must describe the same single pair, so the
    # caller removes exactly those rows from the fuzzy-matching input.
    assert idx0 == {0}
    assert idx1 == {partner}


def test_eic_matching_non_set_values():
    """Non-set EIC values (e.g. raw strings from CSV) are skipped."""
    df0 = pd.DataFrame({"Name": ["A", "B"], "EIC": ["CODE1", {"CODE2"}]})
    df1 = pd.DataFrame({"Name": ["X", "Y"], "EIC": [{"CODE1"}, {"CODE2"}]})

    matches, idx0, idx1 = _match_by_eic(df0, df1, ["L", "R"])
    # Only CODE2 matches (CODE1 in df0 is a raw string, not a set)
    assert len(matches) == 1
    assert matches["L"].tolist() == [1]
    assert matches["R"].tolist() == [1]
    # Row 0 of df0 carries the raw string and must remain unmatched, which
    # also leaves its would-be partner (row 0 of df1) unclaimed.
    assert idx0 == {1}
    assert idx1 == {1}
Loading