From 5e816a0a8d8ba53d3776c2601dd60c330b33874a Mon Sep 17 00:00:00 2001
From: Castorp <50649074+ShinDongWoon@users.noreply.github.com>
Date: Wed, 20 Aug 2025 09:07:32 +0900
Subject: [PATCH] test: ensure calendar feature reduction

---
Review notes (everything between the "---" line and the first "diff --git"
line is commentary and is not part of the commit):

- Besides adding tests, this patch changes library code: it adds the
  `cyclical`, `keep_months` and `keep_woys` parameters to
  `CalendarFeatureMaker.__init__`. All three default to the previous
  behaviour (one-hot month_*/woy_* columns for every value seen in `fit`).
- With `cyclical=True`, `transform` emits month_sin/month_cos/woy_sin/woy_cos
  instead of the one-hot columns. `keep_months` / `keep_woys` only filter the
  one-hot column lists built in `fit`, so they have no effect on the output
  in cyclical mode.
- The week-of-year encoding uses a period of 52. ISO calendars can contain a
  week 53, which then gets the same sin/cos values as week 1.
- The new annotations use `Iterable`; this patch does not touch the import
  block, so it presumably is already imported in the module -- confirm.
- The two `_feature_names` lists in `fit` differ only in their trailing
  entries; a shared base list would avoid the duplication.

 .../preprocess/preprocess_pipeline_v1_1.py | 70 ++++++++++++++-----
 tests/test_calendar_feature_maker.py       | 49 +++++++++++++
 2 files changed, 101 insertions(+), 18 deletions(-)
 create mode 100644 tests/test_calendar_feature_maker.py

diff --git a/LGHackerton/preprocess/preprocess_pipeline_v1_1.py b/LGHackerton/preprocess/preprocess_pipeline_v1_1.py
index 6a81837..bc94742 100644
--- a/LGHackerton/preprocess/preprocess_pipeline_v1_1.py
+++ b/LGHackerton/preprocess/preprocess_pipeline_v1_1.py
@@ -248,8 +248,17 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
 class CalendarFeatureMaker:
     """Adds calendar features and optional Korean holidays if provider available."""
 
-    def __init__(self, holiday_provider: Optional[HolidayProvider] = None):
+    def __init__(
+        self,
+        holiday_provider: Optional[HolidayProvider] = None,
+        cyclical: bool = False,
+        keep_months: Optional[Iterable[int]] = None,
+        keep_woys: Optional[Iterable[int]] = None,
+    ):
         self.holiday_provider = holiday_provider or HolidayProvider()
+        self.cyclical = cyclical
+        self.keep_months = set(keep_months) if keep_months is not None else None
+        self.keep_woys = set(keep_woys) if keep_woys is not None else None
         self._holiday_cache: set = set()
         self._woy_cols: List[str] = []
         self._month_cols: List[str] = []
@@ -262,6 +271,10 @@ def fit(self, df: pd.DataFrame):
         # store columns to align dummies at transform
         weeks = df[DATE_COL].dt.isocalendar().week.unique().tolist()
         months = df[DATE_COL].dt.month.unique().tolist()
+        if self.keep_woys is not None:
+            weeks = [w for w in weeks if w in self.keep_woys]
+        if self.keep_months is not None:
+            months = [m for m in months if m in self.keep_months]
         self._woy_cols = [f"woy_{w}" for w in sorted(weeks)]
         self._month_cols = [f"month_{m}" for m in sorted(months)]
         promo_candidates = [
@@ -270,18 +283,32 @@ def fit(self, df: pd.DataFrame):
             if c in df.columns
         ]
         self._promo_col = promo_candidates[0] if promo_candidates else None
-        self._feature_names = [
-            "year",
-            "day",
-            "dow",
-            "is_weekend",
-            "is_month_start",
-            "is_month_end",
-            "is_holiday",
-            "is_priority_outlet",
-            *self._month_cols,
-            *self._woy_cols,
-        ]
+        if self.cyclical:
+            cyc_cols = ["month_sin", "month_cos", "woy_sin", "woy_cos"]
+            self._feature_names = [
+                "year",
+                "day",
+                "dow",
+                "is_weekend",
+                "is_month_start",
+                "is_month_end",
+                "is_holiday",
+                "is_priority_outlet",
+                *cyc_cols,
+            ]
+        else:
+            self._feature_names = [
+                "year",
+                "day",
+                "dow",
+                "is_weekend",
+                "is_month_start",
+                "is_month_end",
+                "is_holiday",
+                "is_priority_outlet",
+                *self._month_cols,
+                *self._woy_cols,
+            ]
         if self._promo_col is not None:
             self._feature_names.append("is_promo")
         return self
@@ -298,11 +325,18 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
         # base calendar categories
         d["month"] = d[DATE_COL].dt.month
         d["weekofyear"] = d[DATE_COL].dt.isocalendar().week.astype(int)
-        month_dum = pd.get_dummies(d["month"], prefix="month", dtype=np.int8)
-        month_dum = month_dum.reindex(columns=self._month_cols, fill_value=0)
-        woy_dum = pd.get_dummies(d["weekofyear"], prefix="woy", dtype=np.int8)
-        woy_dum = woy_dum.reindex(columns=self._woy_cols, fill_value=0)
-        d = pd.concat([d.drop(columns=["month", "weekofyear"]), month_dum, woy_dum], axis=1)
+        if self.cyclical:
+            d["month_sin"] = np.sin(2 * np.pi * d["month"] / 12)
+            d["month_cos"] = np.cos(2 * np.pi * d["month"] / 12)
+            d["woy_sin"] = np.sin(2 * np.pi * d["weekofyear"] / 52)
+            d["woy_cos"] = np.cos(2 * np.pi * d["weekofyear"] / 52)
+            d = d.drop(columns=["month", "weekofyear"])
+        else:
+            month_dum = pd.get_dummies(d["month"], prefix="month", dtype=np.int8)
+            month_dum = month_dum.reindex(columns=self._month_cols, fill_value=0)
+            woy_dum = pd.get_dummies(d["weekofyear"], prefix="woy", dtype=np.int8)
+            woy_dum = woy_dum.reindex(columns=self._woy_cols, fill_value=0)
+            d = pd.concat([d.drop(columns=["month", "weekofyear"]), month_dum, woy_dum], axis=1)
 
         if self._holiday_cache:
             d["is_holiday"] = d[DATE_COL].dt.date.isin(self._holiday_cache).astype(np.int8)
diff --git a/tests/test_calendar_feature_maker.py b/tests/test_calendar_feature_maker.py
new file mode 100644
index 0000000..24c01c7
--- /dev/null
+++ b/tests/test_calendar_feature_maker.py
@@ -0,0 +1,49 @@
+import sys
+from pathlib import Path
+
+import pandas as pd
+
+sys.path.append(str(Path(__file__).resolve().parents[1]))
+
+from LGHackerton.preprocess.preprocess_pipeline_v1_1 import (  # noqa: E402
+    CalendarFeatureMaker,
+    DATE_COL,
+    SHOP_COL,
+)
+
+
+def _sample_df() -> pd.DataFrame:
+    dates = pd.to_datetime(
+        ["2020-01-01", "2020-01-08", "2020-02-05", "2020-02-12"]
+    )
+    return pd.DataFrame({DATE_COL: dates, SHOP_COL: "A"})
+
+
+def test_cyclical_reduces_columns_and_variance():
+    df = _sample_df()
+    base = CalendarFeatureMaker().fit(df).transform(df)
+    cyc = CalendarFeatureMaker(cyclical=True).fit(df).transform(df)
+
+    base_cols = [c for c in base.columns if c.startswith("month_") or c.startswith("woy_")]
+    cyc_cols = [c for c in cyc.columns if c.endswith("_sin") or c.endswith("_cos")]
+
+    assert len(cyc_cols) < len(base_cols)
+
+    base_var = base[base_cols].var().sum()
+    cyc_var = cyc[cyc_cols].var().sum()
+    assert cyc_var < base_var
+
+
+def test_keep_selected_reduces_columns_and_variance():
+    df = _sample_df()
+    base = CalendarFeatureMaker().fit(df).transform(df)
+    kept = CalendarFeatureMaker(keep_months=[1], keep_woys=[1, 2]).fit(df).transform(df)
+
+    base_cols = [c for c in base.columns if c.startswith("month_") or c.startswith("woy_")]
+    kept_cols = [c for c in kept.columns if c.startswith("month_") or c.startswith("woy_")]
+
+    assert len(kept_cols) < len(base_cols)
+
+    base_var = base[base_cols].var().sum()
+    kept_var = kept[kept_cols].var().sum()
+    assert kept_var < base_var