{ "cells": [ { "cell_type": "code", "execution_count": 15, "id": "a7c40345", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "best_model\n", "lightgbm 0.70\n", "Naive 0.08\n", "HistoricAverage 0.05\n", "Holt 0.05\n", "HoltWinters 0.05\n", "SeasonalExponentialSmoothingOptimized 0.02\n", "CrostonSBA 0.02\n", "CrostonOptimized 0.02\n", "Name: proportion, dtype: float64\n" ] } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "df = pd.read_csv(\"../metrics/best_models.csv\")\n", "\n", "# global win rate\n", "print(df[\"best_model\"].value_counts(normalize=True).round(2))\n" ] }, { "cell_type": "code", "execution_count": 16, "id": "05afbf96", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "cv_bin best_model \n", "Low CrostonOptimized 0\n", " CrostonSBA 0\n", " HistoricAverage 1\n", " Holt 0\n", " HoltWinters 1\n", " Naive 1\n", " SeasonalExponentialSmoothingOptimized 1\n", " lightgbm 9\n", "Mid CrostonOptimized 1\n", " CrostonSBA 1\n", " HistoricAverage 1\n", " Holt 1\n", " HoltWinters 1\n", " Naive 0\n", " SeasonalExponentialSmoothingOptimized 0\n", " lightgbm 8\n", "High CrostonOptimized 0\n", " CrostonSBA 0\n", " HistoricAverage 0\n", " Holt 1\n", " HoltWinters 0\n", " Naive 2\n", " SeasonalExponentialSmoothingOptimized 0\n", " lightgbm 11\n", "dtype: int64\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\topra\\AppData\\Local\\Temp\\ipykernel_2576\\2721712874.py:9: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", " print(best.groupby([\"cv_bin\",\"best_model\"]).size())\n" ] } ], "source": [ "full = pd.read_csv(\"../data/processed/train.csv\")\n", "\n", "vol = full.groupby(\"id\")[\"sales\"].agg([\"mean\",\"std\"]).reset_index()\n", "vol[\"cv\"] = vol[\"std\"] / (vol[\"mean\"] + 1e-9)\n", "\n", "best = df.merge(vol[[\"id\",\"cv\"]], on=\"id\")\n", "\n", "best[\"cv_bin\"] = pd.qcut(best[\"cv\"], 3, labels=[\"Low\",\"Mid\",\"High\"])\n", "print(best.groupby([\"cv_bin\",\"best_model\"]).size())" ] }, { "cell_type": "code", "execution_count": 17, "id": "e7c767a9", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"../data/processed/train.csv\")\n", "\n", "g = df.groupby(\"id\")[\"sales\"]\n", "summary = g.agg([\"mean\",\"std\",\"count\"])\n", "summary = summary.rename(columns={\"count\":\"T\"})\n", "\n", "summary[\"N\"] = g.apply(lambda x: (x>0).sum())\n", "summary[\"ADI\"] = summary[\"T\"] / summary[\"N\"].replace(0,1)\n", "summary[\"CV2\"] = (summary[\"std\"]/summary[\"mean\"].replace(0,1))**2\n", "\n", "summary.to_csv(\"../metrics/demand_profile.csv\")\n" ] }, { "cell_type": "code", "execution_count": 18, "id": "6acec387", "metadata": {}, "outputs": [], "source": [ "summary[\"ADI_class\"] = np.where(summary[\"ADI\"] > 1.32, \"High\", \"Low\")\n", "summary[\"CV2_class\"] = np.where(summary[\"CV2\"] > 0.49, \"High\", \"Low\")\n", "\n", "summary[\"regime\"] = summary[\"ADI_class\"] + \"-\" + summary[\"CV2_class\"]" ] }, { "cell_type": "code", "execution_count": 19, "id": "04e8efeb", "metadata": {}, "outputs": [], "source": [ "best = pd.read_csv(\"../metrics/best_models.csv\")\n", "merged = best.merge(summary[[\"ADI\",\"CV2\",\"regime\"]], on=\"id\", how=\"left\")" ] }, { "cell_type": "code", "execution_count": 20, "id": "db10f026", "metadata": {}, "outputs": [], "source": [ "merged.groupby(\"regime\")[\"best_model\"].value_counts(normalize=True).to_csv(\"../metrics/regime_model_performance.csv\")" ] }, { "cell_type": "code", "execution_count": 21, "id": "53439790", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "best_model\n", "CrostonOptimized 1\n", "CrostonSBA 1\n", "HistoricAverage 2\n", "Holt 2\n", "HoltWinters 2\n", "Naive 3\n", "SeasonalExponentialSmoothingOptimized 1\n", "lightgbm 28\n", "dtype: int64" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merged.groupby('best_model').size()" ] }, { "cell_type": "code", "execution_count": 22, "id": "dccd9376", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "regime best_model \n", "High-High lightgbm 17\n", " Naive 2\n", " Holt 1\n", "Low-High lightgbm 6\n", " HoltWinters 2\n", " CrostonOptimized 1\n", " CrostonSBA 1\n", " HistoricAverage 1\n", " Holt 1\n", "Low-Low lightgbm 5\n", " HistoricAverage 1\n", " Naive 1\n", " SeasonalExponentialSmoothingOptimized 1\n", "Name: count, dtype: int64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merged.groupby('regime')['best_model'].value_counts()" ] }, { "cell_type": "code", "execution_count": 23, "id": "32eff5fe", "metadata": {}, "outputs": [], "source": [ "merged.to_csv('../metrics/best_by_sku.csv')" ] }, { "cell_type": "markdown", "id": "2c05c469", "metadata": {}, "source": [ "## Key Insight\n", "Although classical literature suggests that intermittent & highly variable demand should be handled by Croston-type methods, our empirical evaluation on SKU-level series showed an asymmetry: LightGBM generalizes extremely well even under High-High ADI/CV² regimes, implying latent autocorrelation and structure that classical smoothing does not capture" ] }, { "cell_type": "code", "execution_count": 24, "id": "e11acb96", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "best_model\n", "lightgbm 0.70\n", "Naive 0.08\n", "HistoricAverage 0.05\n", "Holt 0.05\n", "HoltWinters 0.05\n", "SeasonalExponentialSmoothingOptimized 0.02\n", "CrostonSBA 0.02\n", "CrostonOptimized 0.02\n", "Name: proportion, dtype: float64\n" ] } ], "source": [ "print(best[\"best_model\"].value_counts(normalize=True).round(2))" ] }, { "cell_type": "code", "execution_count": 25, "id": "192189b7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "model\n", "lightgbm 2.505754\n", "SeasonalExponentialSmoothingOptimized 3.206895\n", "AutoARIMA 3.584182\n", "HistoricAverage 3.584731\n", "CrostonClassic 3.644770\n", "CrostonSBA 3.656476\n", "CrostonOptimized 3.785437\n", "SimpleExponentialSmoothingOptimized 3.792451\n", "Holt 3.812089\n", "HoltWinters 3.913065\n", "DynamicOptimizedTheta 3.958835\n", "Theta 3.960178\n", "DynamicTheta 3.961312\n", "OptimizedTheta 3.961813\n", "SeasonalNaive 4.716071\n", "WindowAverage 5.234375\n", "Naive 6.369643\n", "RandomWalkWithDrift 6.437935\n", "Name: score, dtype: float64\n" ] } ], "source": [ "m = pd.read_csv(\"../metrics/combined_metrics.csv\")\n", "print(m.groupby(\"model\")[\"score\"].mean().sort_values())" ] }, { "cell_type": "code", "execution_count": 26, "id": "223d1a5c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | id | \n", "best_model | \n", "
|---|---|---|
| 0 | \n", "FOODS_1_018_CA_1_validation | \n", "lightgbm | \n", "
| 1 | \n", "FOODS_1_085_CA_1_validation | \n", "lightgbm | \n", "
| 2 | \n", "FOODS_2_013_CA_1_validation | \n", "lightgbm | \n", "
| 3 | \n", "FOODS_2_019_CA_1_validation | \n", "SeasonalExponentialSmoothingOptimized | \n", "
| 4 | \n", "FOODS_2_030_CA_1_validation | \n", "lightgbm | \n", "
| 5 | \n", "FOODS_2_181_CA_1_validation | \n", "lightgbm | \n", "
| 6 | \n", "FOODS_2_197_CA_1_validation | \n", "HistoricAverage | \n", "
| 7 | \n", "HOBBIES_1_001_CA_1_validation | \n", "lightgbm | \n", "
| 8 | \n", "HOBBIES_1_002_CA_1_validation | \n", "lightgbm | \n", "
| 9 | \n", "HOBBIES_1_003_CA_1_validation | \n", "lightgbm | \n", "
| 10 | \n", "HOBBIES_1_004_CA_1_validation | \n", "lightgbm | \n", "
| 11 | \n", "HOBBIES_1_005_CA_1_validation | \n", "lightgbm | \n", "
| 12 | \n", "HOBBIES_1_006_CA_1_validation | \n", "lightgbm | \n", "
| 13 | \n", "HOBBIES_1_007_CA_1_validation | \n", "lightgbm | \n", "
| 14 | \n", "HOBBIES_1_008_CA_1_validation | \n", "lightgbm | \n", "
| 15 | \n", "HOBBIES_1_009_CA_1_validation | \n", "lightgbm | \n", "
| 16 | \n", "HOBBIES_1_010_CA_1_validation | \n", "lightgbm | \n", "
| 17 | \n", "HOBBIES_1_011_CA_1_validation | \n", "Naive | \n", "
| 18 | \n", "HOBBIES_1_012_CA_1_validation | \n", "Holt | \n", "
| 19 | \n", "HOBBIES_1_013_CA_1_validation | \n", "lightgbm | \n", "
| 20 | \n", "HOBBIES_1_014_CA_1_validation | \n", "lightgbm | \n", "
| 21 | \n", "HOBBIES_1_015_CA_1_validation | \n", "CrostonSBA | \n", "
| 22 | \n", "HOBBIES_1_016_CA_1_validation | \n", "lightgbm | \n", "
| 23 | \n", "HOBBIES_1_017_CA_1_validation | \n", "lightgbm | \n", "
| 24 | \n", "HOBBIES_1_018_CA_1_validation | \n", "Naive | \n", "
| 25 | \n", "HOBBIES_1_019_CA_1_validation | \n", "lightgbm | \n", "
| 26 | \n", "HOBBIES_1_020_CA_1_validation | \n", "lightgbm | \n", "
| 27 | \n", "HOBBIES_1_021_CA_1_validation | \n", "lightgbm | \n", "
| 28 | \n", "HOBBIES_1_022_CA_1_validation | \n", "lightgbm | \n", "
| 29 | \n", "HOBBIES_1_103_CA_1_validation | \n", "HoltWinters | \n", "
| 30 | \n", "HOBBIES_1_134_CA_1_validation | \n", "lightgbm | \n", "
| 31 | \n", "HOBBIES_1_147_CA_1_validation | \n", "lightgbm | \n", "
| 32 | \n", "HOBBIES_1_178_CA_1_validation | \n", "Holt | \n", "
| 33 | \n", "HOBBIES_1_254_CA_1_validation | \n", "HistoricAverage | \n", "
| 34 | \n", "HOBBIES_1_256_CA_1_validation | \n", "CrostonOptimized | \n", "
| 35 | \n", "HOBBIES_1_268_CA_1_validation | \n", "lightgbm | \n", "
| 36 | \n", "HOBBIES_1_337_CA_1_validation | \n", "lightgbm | \n", "
| 37 | \n", "HOUSEHOLD_1_243_CA_1_validation | \n", "lightgbm | \n", "
| 38 | \n", "HOUSEHOLD_1_373_CA_1_validation | \n", "Naive | \n", "
| 39 | \n", "HOUSEHOLD_1_494_CA_1_validation | \n", "HoltWinters | \n", "