# Forecast-Sandbox-Lite / models/lgbm_modeling.py
# Initial Commit (b099c5d, verified) by PranavSharma
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
from utils.metrics import mae, bias
from pathlib import Path
# ---------------------------------------------------------
# CONFIG
# ---------------------------------------------------------
# Number of forecast steps (FreshRetailNet setup: 14-step forecast).
HORIZON: int = 14

# Inputs: CSV files read with pd.read_csv further down in this script.
TRAIN_DATA_PATH: Path = Path("data/processed/lgbm_ready/train.csv")
TARGET_DATA_PATH: Path = Path("data/processed/lgbm_ready/target.csv")
INFERENCE_PATH: Path = Path("data/processed/lgbm_ready/inference/inference_train.csv")
VALIDATION_PATH: Path = Path("data/processed/lgbm_ready/inference/inference_target.csv")

# Outputs: CSV files written at the end of this script.
PREDICTIONS_PATH: Path = Path("metrics/lgbm_predictions.csv")
METRICS_PATH: Path = Path("metrics/lgbm_metrics.csv")
# ---------------------------------------------------------
# Base single-target LGBM
# ---------------------------------------------------------
# Template regressor for a single target column. MultiOutputRegressor fits a
# separate copy of it for each column of the target frame.
base_lgbm = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,  # non-positive value: tree depth is not capped
    subsample=0.8,
    # LightGBM ignores `subsample` unless bagging is enabled through a
    # positive `subsample_freq` (its default of 0 disables bagging), so the
    # 0.8 row-sampling above had no effect before this was set.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)
model = MultiOutputRegressor(base_lgbm)
def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without its ``id`` column.

    Rows in the LGBM multi-output layout look like
    ``id | t1 | t2 | ... | tN``; the identifier is removed so that only
    the time columns remain to serve as features or targets.

    The input frame is left untouched. A ``KeyError`` is raised when
    ``df`` has no ``id`` column.
    """
    time_columns_only = df.drop("id", axis=1)
    return time_columns_only
# ---------------------------------------------------------
# Load data
# ---------------------------------------------------------
# Raw frames, read in a fixed order; each still carries its "id" column.
train_data, target_data, inference_data, validation_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        TRAIN_DATA_PATH,
        TARGET_DATA_PATH,
        INFERENCE_PATH,
        VALIDATION_PATH,
    )
)

# Sanity check: uncomment to inspect the first rows of a frame.
# print(train_data.head())

# Model-ready frames: the same data with the "id" column removed.
train_df, target_df, inference_df, validation_df = (
    build_features(raw_frame)
    for raw_frame in (train_data, target_data, inference_data, validation_data)
)
# ---------------------------------------------------------
# Fit model
# ---------------------------------------------------------
# fit() hands back the fitted estimator, so training on the train/target
# frames and forecasting the inference windows can be chained.
preds = model.fit(train_df, target_df).predict(inference_df)

# For integer-unit forecasts, round the predictions:
# preds = np.round(preds)
# ---------------------------------------------------------
# Build long-format prediction DataFrame
# ---------------------------------------------------------
# Wide frame: one row per inference window, one column per forecast step
# (expected to be HORIZON columns; this is not checked here).
preds_df = pd.DataFrame(preds)

# Long form: one row per (row_index, horizon_step)
preds_df_long = preds_df.stack().reset_index()
preds_df_long.columns = ["id_index", "h_raw", "forecast"]

# Label every forecast with the id of the inference row it was predicted
# from. The predictions are produced from inference_data, so its "id" column
# is the authoritative source; taking ids positionally from another frame
# would mislabel forecasts whenever the two files differ in row order or
# length.
source_ids = inference_data["id"].to_numpy()
preds_df_long["id"] = source_ids[preds_df_long["id_index"].to_numpy()]

# Column positions are 0-based; forecast steps are reported as 1..HORIZON.
preds_df_long["h"] = preds_df_long["h_raw"].astype(int) + 1
preds_df_long["model"] = "lightgbm"

preds_df_final = preds_df_long[["id", "model", "h", "forecast"]]
print(preds_df_final.head(HORIZON))
# ---------------------------------------------------------
# Build long-format truth for validation horizon
# ---------------------------------------------------------
# Unpivot the validation frame to one row per (id, step). The former column
# labels land in "h" and are cast to int; the cell values land in "sales".
long = validation_data.melt(
    id_vars=["id"],
    var_name="h",
    value_name="sales",
).astype({"h": int})
# ---------------------------------------------------------
# Compute metrics per id
# ---------------------------------------------------------
# Highest truth step available for each id. Ids whose truth ends before
# HORIZON are skipped below as a safety check against short series.
truth_max_h = long.groupby("id")["h"].max()

# Pair every truth value with its forecast in one join, instead of scanning
# the whole truth table again for each id inside the loop.
paired = pd.merge(
    long[["id", "h", "sales"]],
    preds_df_final[["id", "model", "h", "forecast"]],
    on=["id", "h"],
    how="inner",
)

metrics_rows = []
# Ids without any matching (id, h) pair simply produce no group here.
for (sku_id, model_name), pairs in paired.groupby(["id", "model"]):
    if truth_max_h.loc[sku_id] < HORIZON:
        # safety check - skip if somehow shorter
        continue

    y_true = pairs["sales"].values
    y_pred = pairs["forecast"].values

    m = mae(y_true, y_pred)
    b = bias(y_true, y_pred)
    # Combined score: absolute error plus the magnitude of the bias.
    s = m + abs(b)

    metrics_rows.append(
        {
            "id": sku_id,
            "model": model_name,
            "mae": float(m),
            "bias": float(b),
            "score": float(s),
        }
    )

# Fixing the columns keeps the CSV header stable even when no id was scored.
metrics_df = pd.DataFrame(
    metrics_rows,
    columns=["id", "model", "mae", "bias", "score"],
)

# Create the output directories up front so a fresh checkout does not fail
# on the first write.
for out_path in (METRICS_PATH, PREDICTIONS_PATH):
    out_path.parent.mkdir(parents=True, exist_ok=True)

metrics_df.to_csv(METRICS_PATH, index=False)
preds_df_final.to_csv(PREDICTIONS_PATH, index=False)

print("Saved:")
print(f" - forecasts → {PREDICTIONS_PATH}")
print(f" - metrics → {METRICS_PATH}")