# fraud-detector / preprocessing.py
# MyNameIsTatiBond's picture
# Enable shape-aware SHAP explanations
# ff34011
import pandas as pd
import numpy as np
import json
import os
from pathlib import Path
# Load metadata
# The metadata JSON sits next to this module and is read once at import time.
BASE_DIR = Path(__file__).resolve().parent
META_PATH = BASE_DIR / "preprocessing_metadata.json"
# JSON is UTF-8 by specification; pin the encoding so decoding does not depend
# on the platform's locale default.
with open(META_PATH, "r", encoding="utf-8") as f:
    META = json.load(f)
# Column names, in order, used to select and order the output DataFrame.
EXPECTED_COLS = META["expected_columns"]
# Fallback value for every field the caller does not supply.
DEFAULTS = META["defaults"]
def get_hour_bin(h):
    """Map an hour value to one of four day-part labels.

    Hours in [0, 6) are night, [6, 12) morning, [12, 18) afternoon; every
    other value (including anything outside 0-23) is labelled evening.
    """
    day_parts = (
        (0, 6, "Night (0-5)"),
        (6, 12, "Morning (6-11)"),
        (12, 18, "Afternoon (12-17)"),
    )
    for start, stop, label in day_parts:
        if start <= h < stop:
            return label
    # Fall-through bucket: no range check is applied to the evening label.
    return "Evening (18-23)"
def _missing_flag_value(flag_name: str, is_missing: bool):
    """Encode a missing-indicator in the same representation as its default.

    When the default for ``flag_name`` is numeric the flag is 1/0; otherwise
    it is the string "YES"/"NO".
    """
    if isinstance(DEFAULTS.get(flag_name), (int, float)):
        return 1 if is_missing else 0
    return "YES" if is_missing else "NO"


def _clamp_share(value):
    """Clamp a share to the closed interval [0.0, 1.0]."""
    return min(1.0, max(0.0, value))


def preprocess_input(input_data: dict) -> pd.DataFrame:
    """
    Transform raw user input (dict) into the exact DataFrame structure
    expected by the model pipeline.

    Args:
        input_data: Raw field values keyed by column name. Keys whose value
            is None are treated as "not provided" and fall back to DEFAULTS.

    Returns:
        A single-row DataFrame whose columns are exactly EXPECTED_COLS, in
        that order.

    Raises:
        KeyError: If a column in EXPECTED_COLS is present in neither
            DEFAULTS, the input, nor the derived features.
    """
    # 1. Start with Defaults (copy so the module-level dict is never mutated)
    final_data = DEFAULTS.copy()

    # 2. Update with User Input, skipping fields explicitly passed as None
    final_data.update({k: v for k, v in input_data.items() if v is not None})

    # 3. Feature Engineering
    # Hour features: only derived when an hour was actually supplied. A None
    # hour is treated like an absent one instead of crashing on `None / 24`.
    hour = input_data.get('incident_hour_of_the_day')
    if hour is not None:
        # Cyclic encoding of the hour on the 24h circle
        hour_rad = (hour / 24) * 2 * np.pi
        final_data['hour_sin'] = np.sin(hour_rad)
        final_data['hour_cos'] = np.cos(hour_rad)
        final_data['hour_bin_4'] = get_hour_bin(hour)

    # Missing flags: every flag is always written, in the representation of
    # its default, so a stale default can never contradict the input.
    col_type = input_data.get('collision_type')
    final_data['collision_type_missing'] = _missing_flag_value(
        'collision_type_missing', col_type is None or col_type == '?'
    )

    police = input_data.get('police_report_available')
    final_data['police_report_available_missing'] = _missing_flag_value(
        'police_report_available_missing', police is None or police == '?'
    )

    # authorities_contacted uses the literal string "None" as its sentinel
    auth = input_data.get('authorities_contacted')
    final_data['authorities_contacted_missing'] = _missing_flag_value(
        'authorities_contacted_missing', auth is None or auth == "None"
    )

    # 4. Share Normalization
    inj_share = input_data.get('injury_share')
    prop_share = input_data.get('property_share')
    if inj_share is not None and prop_share is None:
        # Only injury provided: clamp it, then infer property as the complement
        inj_share = _clamp_share(inj_share)
        final_data['injury_share'] = inj_share
        final_data['property_share'] = 1.0 - inj_share
    elif inj_share is None and prop_share is not None:
        # Only property provided: clamp it, then infer injury as the complement
        prop_share = _clamp_share(prop_share)
        final_data['property_share'] = prop_share
        final_data['injury_share'] = 1.0 - prop_share

    # Whatever the source (user input or defaults), rescale the pair so it
    # sums to 1 when it is off by more than the 0.01 tolerance. A zero total
    # is left untouched to avoid dividing by zero.
    curr_inj = final_data.get('injury_share', 0.0)
    curr_prop = final_data.get('property_share', 0.0)
    curr_tot = curr_inj + curr_prop
    if curr_tot > 0 and abs(curr_tot - 1.0) > 0.01:
        final_data['injury_share'] = curr_inj / curr_tot
        final_data['property_share'] = curr_prop / curr_tot

    # 5. Create DataFrame, keeping only the expected columns in strict order
    df = pd.DataFrame([final_data])
    return df[EXPECTED_COLS]