From 0b073f72c60f51dbd1189176463c00c14bccf876 Mon Sep 17 00:00:00 2001 From: Castorp <50649074+ShinDongWoon@users.noreply.github.com> Date: Sun, 17 Aug 2025 13:28:44 +0900 Subject: [PATCH] Add Optuna tuning for LightGBM --- LGHackerton/tune.py | 75 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 LGHackerton/tune.py diff --git a/LGHackerton/tune.py b/LGHackerton/tune.py new file mode 100644 index 0000000..3bd8a46 --- /dev/null +++ b/LGHackerton/tune.py @@ -0,0 +1,75 @@ +from __future__ import annotations +import json +from pathlib import Path +from typing import List + +import optuna +import pandas as pd + +from LGHackerton.models.lgbm_trainer import LGBMParams, LGBMTrainer +from LGHackerton.models.base_trainer import TrainConfig +from LGHackerton.utils.metrics import weighted_smape_np +from LGHackerton.config.default import ARTIFACTS_DIR + + +def tune_lgbm(df_train: pd.DataFrame, features: List[str], cfg: TrainConfig): + """Hyper-parameter tuning for LightGBM using Optuna. + + Parameters + ---------- + df_train : pandas.DataFrame + Preprocessed training dataframe ready for LGBMTrainer. + features : list[str] + List of feature column names to use for training. + cfg : TrainConfig + Training configuration. Should contain attributes for + "model_dir" and optional "n_trials" and "timeout" for optuna. + """ + + def objective(trial: optuna.trial.Trial) -> float: + sampled_params = { + "objective": "tweedie", + "tweedie_variance_power": trial.suggest_float("tweedie_variance_power", 1.1, 1.6), + "num_leaves": trial.suggest_int("num_leaves", 31, 255), + "max_depth": trial.suggest_int("max_depth", 3, 16), + "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 200), + "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True), + "subsample": trial.suggest_float("subsample", 0.5, 1.0), + "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0), + "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True), + "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True), + "n_estimators": trial.suggest_int("n_estimators", 1000, 4000), + "early_stopping_rounds": 200, + } + + params = LGBMParams(**sampled_params) + trial_dir = Path(getattr(cfg, "model_dir", ".")) / f"optuna_trial_{trial.number}" + trainer = LGBMTrainer( + params=params, + features=features, + model_dir=str(trial_dir), + device=getattr(cfg, "device", "cpu"), + ) + trainer.train(df_train, cfg) + oof = trainer.get_oof() + outlets = oof["series_id"].str.split("::").str[0] + loss = weighted_smape_np( + oof["y"].values, + oof["yhat"].values, + outlet_names=outlets, + priority_weight=getattr(cfg, "priority_weight", 3.0), + ) + return loss + + study = optuna.create_study(direction="minimize") + n_trials = int(getattr(cfg, "n_trials", 20)) + timeout = getattr(cfg, "timeout", None) + study.optimize(objective, n_trials=n_trials, timeout=timeout) + + optuna_dir = ARTIFACTS_DIR / "optuna" + optuna_dir.mkdir(parents=True, exist_ok=True) + out_path = optuna_dir / "lgbm_best.json" + with open(out_path, "w", encoding="utf-8") as f: + json.dump({"params": study.best_params, "value": study.best_value}, f, ensure_ascii=False, indent=2) + + return study