# data/prepare_data_freshnet.py
# %%
import pandas as pd
import numpy as np
np.random.seed(42)  # reproducibility (the id selection below is deterministic, so this is precautionary)
# 1. Load the FreshRetailNet-50K parquet splits into pandas
raw_df = pd.read_parquet("data/raw/FreshRetailNet-50K/train.parquet")
df_fresh_eval = pd.read_parquet("data/raw/FreshRetailNet-50K/eval.parquet")  # eval split; not used below
# %%
# Optional: subsample for quick iteration
# raw_df = raw_df.head(2000)
# 2. Rename for cleanliness
df = raw_df.rename(columns={
"dt": "date",
"sale_amount": "sales",
"first_category_id": "category_1",
"second_category_id": "category_2",
"third_category_id": "category_3",
"holiday_flag": "holiday",
"precpt": "precip",
"avg_temperature": "temp",
"avg_humidity": "humidity",
"avg_wind_level": "wind_level",
})
df["date"] = pd.to_datetime(df["date"])
# 3. Build a stable series id, M5-style:
#    city + store + product + management group + category hierarchy → id
df["id"] = (
    "CID" + df["city_id"].astype(str)
    + "_SID" + df["store_id"].astype(str)
    + "_PID" + df["product_id"].astype(str)
    + "_MGID" + df["management_group_id"].astype(str)
    + "_CAT1" + df["category_1"].astype(str)
    + "-CAT2" + df["category_2"].astype(str)
    + "-CAT3" + df["category_3"].astype(str)
)
# %%
def extract_daily_features(row):
    """Derive per-day availability features from the 24-hour arrays."""
    hs = np.array(row["hours_sale"], dtype=float)        # hourly sales, length 24
    st = np.array(row["hours_stock_status"], dtype=int)  # hourly stock status, 1 = out-of-stock
    sale_hours = (hs > 0).sum()
    sale_hour_ratio = sale_hours / 24.0
    stockout_hours = st.sum()
    stockout_hour_ratio = stockout_hours / 24.0
    avail_hour_ratio = 1.0 - stockout_hour_ratio
    return pd.Series({
        "sale_hours": sale_hours,
        "sale_hour_ratio": sale_hour_ratio,
        "stockout_hours": stockout_hours,
        "stockout_hour_ratio": stockout_hour_ratio,
        "avail_hour_ratio": avail_hour_ratio,
    })
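# Note: row-wise apply is simple but slow on large frames; if the full 50K set
# is processed, vectorizing (e.g. np.stack on the hour arrays) would be faster.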
daily_feats = df.apply(extract_daily_features, axis=1)
df = pd.concat([df, daily_feats], axis=1)
# %%
tidy_df = df[[
    "id", "date",
    "city_id", "store_id", "product_id",
    "management_group_id", "category_1", "category_2", "category_3",
    "sales",
    "sale_hours", "sale_hour_ratio",
    "stockout_hours", "stockout_hour_ratio", "avail_hour_ratio",
    "stock_hour6_22_cnt",
    "discount", "holiday", "activity_flag",
    "precip", "temp", "humidity", "wind_level",
]].copy()
# %%
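# Classify each series by demand pattern:
#   ADI = T / N   (total periods over periods with positive demand)
#   CV² = (std / mean)²   (squared coefficient of variation of sales)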
g = tidy_df.groupby("id")["sales"]
summary = g.agg(["mean", "std", "count"])
summary = summary.rename(columns={"count": "T"})
summary["N"] = g.apply(lambda x: (x > 0).sum())
summary["ADI"] = summary["T"] / summary["N"].replace(0, 1)
summary["CV2"] = (summary["std"] / summary["mean"].replace(0, 1)) ** 2
summary["ADI_class"] = np.where(summary["ADI"] > 1.32, "High", "Low")
summary["CV2_class"] = np.where(summary["CV2"] > 0.49, "High", "Low")
summary["regime"] = summary["ADI_class"] + "-" + summary["CV2_class"]
# %%
tidy_df = tidy_df.merge(summary, on="id", how="left")
# %%
tidy_high_high = tidy_df[tidy_df["regime"] == "High-High"]
tidy_low_high = tidy_df[tidy_df["regime"] == "Low-High"]
tidy_high_low = tidy_df[tidy_df["regime"] == "High-Low"]
tidy_low_low = tidy_df[tidy_df["regime"] == "Low-Low"]
def sample_by_regime(df_regime: pd.DataFrame, num_ids_needed: int) -> pd.DataFrame:
    """
    Take the first num_ids_needed unique IDs from df_regime (deterministic,
    not a random sample) and return their full history.
    """
    concat_df = []
    for i, sku_id in enumerate(df_regime["id"].unique()):
        if i < num_ids_needed:
            concat_df.append(df_regime[df_regime["id"] == sku_id])
        else:
            break
    if not concat_df:
        return pd.DataFrame(columns=df_regime.columns)
    return pd.concat(concat_df, ignore_index=True)
multiples = 3
df_high_high_sampled = sample_by_regime(tidy_high_high, 3 * multiples)
df_low_high_sampled = sample_by_regime(tidy_low_high, 15 * multiples)
df_high_low_sampled = sample_by_regime(tidy_high_low, 10 * multiples)
df_low_low_sampled = sample_by_regime(tidy_low_low, 81 * multiples)
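# Resulting id counts per regime: 9 / 45 / 30 / 243, 327 ids total
# (presumably chosen to roughly mirror the regime mix of the full dataset).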
tidy_subset = pd.concat(
[df_high_high_sampled, df_low_high_sampled, df_high_low_sampled, df_low_low_sampled],
ignore_index=True
)
print(tidy_subset["regime"].value_counts())
print(tidy_subset["regime"].value_counts(normalize=True))
# %%
# Sort properly
tidy_subset = tidy_subset.sort_values(["id", "date"])
# Per-SKU day index (1..T within each id)
tidy_subset["day_idx"] = (
tidy_subset
.groupby("id")["date"]
.rank(method="first")
.astype(int)
)
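# Sanity check (assumes every sampled id has the full 90-day history):
# assert tidy_subset.groupby("id")["day_idx"].max().eq(90).all()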
# %%
# For 90-day series: use the first 76 days as train, the last 14 as test
HORIZON = 14
TRAIN_HORIZON_END = 90 - HORIZON  # 90 - 14 = 76
train_df = tidy_subset[tidy_subset["day_idx"] <= TRAIN_HORIZON_END].copy()
test_df = tidy_subset[tidy_subset["day_idx"] > TRAIN_HORIZON_END].copy()
# Inference input for LGBM: a trailing context window. day_idx > 76 - 62 = 14
# selects days 15–90, i.e. the last 62 training days plus the 14-day horizon.
inference_input_df_lgbm = tidy_subset[
    tidy_subset["day_idx"] > (TRAIN_HORIZON_END - 62)
].copy()
# %%
# Rescale sales by the same factor across all splits (assumption: the raw
# sale_amount is in normalized units and downstream models expect this scale)
train_df["sales"] = train_df["sales"] * 100
test_df["sales"] = test_df["sales"] * 100
inference_input_df_lgbm["sales"] = inference_input_df_lgbm["sales"] * 100
# %%
import os
# 4. Persist the processed splits
os.makedirs("data/processed", exist_ok=True)
tidy_subset.to_csv("data/processed/freshretailnet_subset.csv", index=False)
train_df.to_csv("data/processed/train.csv", index=False)
test_df.to_csv("data/processed/test.csv", index=False)
inference_input_df_lgbm.to_csv(
"data/processed/inference_input_df_lgbm.csv",
index=False
)
# %%