# NOTE(review): removed non-code export residue ("Spaces:" / "Running" status
# lines from the hosting UI) that made the module unparseable.
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Preprocessing metadata sits next to this module; it holds the exact column
# list and per-field defaults that the trained model pipeline expects.
BASE_DIR = Path(__file__).resolve().parent
META_PATH = BASE_DIR / "preprocessing_metadata.json"
with open(META_PATH, "r") as f:
    META = json.load(f)

EXPECTED_COLS = META["expected_columns"]
DEFAULTS = META["defaults"]
def get_hour_bin(h):
    """Map an hour of day to one of four named time-of-day bins.

    Hours outside 0-23 fall through to the final "Evening" label, matching
    the original binning contract.
    """
    bins = (
        (0, 6, "Night (0-5)"),
        (6, 12, "Morning (6-11)"),
        (12, 18, "Afternoon (12-17)"),
    )
    for lo, hi, label in bins:
        if lo <= h < hi:
            return label
    return "Evening (18-23)"
def preprocess_input(input_data: dict) -> pd.DataFrame:
    """
    Transform raw user input (dict) into the exact single-row DataFrame
    structure expected by the model pipeline (columns = EXPECTED_COLS).

    Fields absent from ``input_data`` (or explicitly ``None``) fall back to
    the metadata defaults; extra keys are dropped by the final column
    selection.

    Returns
    -------
    pd.DataFrame
        One row, with columns in the strict training order.
    """
    # 1. Start from the metadata defaults so every expected field exists.
    final_data = DEFAULTS.copy()

    # 2. Overlay user-provided values (None means "not supplied").
    for key, value in input_data.items():
        if value is not None:
            final_data[key] = value

    # 3a. Hour features: cyclical sin/cos encoding plus a categorical bin.
    # Use .get(...) is not None rather than `'key' in input_data`: an
    # explicit None value would otherwise pass the membership test and
    # crash on the division below.
    hour = input_data.get('incident_hour_of_the_day')
    if hour is not None:
        hour_rad = (hour / 24) * 2 * np.pi
        final_data['hour_sin'] = np.sin(hour_rad)
        final_data['hour_cos'] = np.cos(hour_rad)
        final_data['hour_bin_4'] = get_hour_bin(hour)

    # 3b. Missing-indicator flags.
    def _set_missing_flag(flag_col: str, is_missing: bool) -> None:
        # Write the indicator in the same dtype as the metadata default:
        # numeric defaults get 1/0, anything else gets "YES"/"NO".
        if isinstance(DEFAULTS.get(flag_col), (int, float)):
            final_data[flag_col] = 1 if is_missing else 0
        else:
            final_data[flag_col] = "YES" if is_missing else "NO"

    # The raw data uses '?' as its missing marker for these two columns; the
    # trained pipeline was fit with '?' as a category, so the original value
    # is left in place and only the flag is set.
    col_type = input_data.get('collision_type')
    _set_missing_flag('collision_type_missing',
                      col_type == '?' or col_type is None)

    police = input_data.get('police_report_available')
    _set_missing_flag('police_report_available_missing',
                      police == '?' or police is None)

    # authorities_contacted encodes "not contacted" as the string "None"
    # as well as an actual null.
    auth = input_data.get('authorities_contacted')
    _set_missing_flag('authorities_contacted_missing',
                      auth is None or auth == "None")

    # 4. Share normalization: injury_share + property_share should sum to 1.
    inj_share = input_data.get('injury_share')
    prop_share = input_data.get('property_share')
    if inj_share is not None and prop_share is not None:
        # Both provided: renormalize only when the sum is meaningfully off 1.
        total_share = inj_share + prop_share
        if abs(total_share - 1.0) > 0.01 and total_share > 0:
            final_data['injury_share'] = inj_share / total_share
            final_data['property_share'] = prop_share / total_share
    elif inj_share is not None:
        # Only injury provided: infer the complement, clipped at 0.
        final_data['property_share'] = max(0.0, 1.0 - inj_share)
    elif prop_share is not None:
        # Only property provided: infer the complement, clipped at 0.
        final_data['injury_share'] = max(0.0, 1.0 - prop_share)
    else:
        # Neither provided: renormalize whatever the defaults carry so the
        # two shares still sum to 1 (the metadata defaults may not).
        curr_inj = final_data.get('injury_share', 0.0)
        curr_prop = final_data.get('property_share', 0.0)
        curr_tot = curr_inj + curr_prop
        if abs(curr_tot - 1.0) > 0.01 and curr_tot > 0:
            final_data['injury_share'] = curr_inj / curr_tot
            final_data['property_share'] = curr_prop / curr_tot

    # 5. Single-row DataFrame in the strict training column order.
    df = pd.DataFrame([final_data])
    return df[EXPECTED_COLS]