"""Preprocessing utilities: turn raw user input into the model-ready frame.

Loads column metadata (expected column order + per-column defaults) once at
import time; ``preprocess_input`` relies on these module-level constants.
"""

import json
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Metadata lives next to this file and is read eagerly so EXPECTED_COLS /
# DEFAULTS are importable constants.
BASE_DIR = Path(__file__).resolve().parent
META_PATH = BASE_DIR / "preprocessing_metadata.json"
with open(META_PATH, "r") as f:
    META = json.load(f)

EXPECTED_COLS = META["expected_columns"]
DEFAULTS = META["defaults"]


def get_hour_bin(h):
    """Categorize an hour of day (0-23) into one of 4 labeled bins."""
    if 0 <= h < 6:
        return "Night (0-5)"
    if 6 <= h < 12:
        return "Morning (6-11)"
    if 12 <= h < 18:
        return "Afternoon (12-17)"
    return "Evening (18-23)"


def _set_missing_flag(final_data: dict, flag_key: str, is_missing: bool) -> None:
    """Set a ``*_missing`` indicator, matching the dtype of its default.

    Numeric defaults (the usual 0/1 encoding in the training data) get
    ``1``/``0``; any other default dtype gets the string ``"YES"``/``"NO"``.
    Applied uniformly so all three missing-flags behave the same way.
    """
    if isinstance(DEFAULTS.get(flag_key), (int, float)):
        final_data[flag_key] = 1 if is_missing else 0
    else:
        final_data[flag_key] = "YES" if is_missing else "NO"


def preprocess_input(input_data: dict) -> pd.DataFrame:
    """
    Transform raw user input (dict) into the exact single-row DataFrame
    structure expected by the model pipeline (columns = EXPECTED_COLS,
    in that order).

    Parameters
    ----------
    input_data : dict
        Raw user-supplied fields. ``None`` values are treated as
        "not provided" and fall back to DEFAULTS.

    Returns
    -------
    pd.DataFrame
        One row, with exactly the expected columns in the expected order.
    """
    # 1. Start from the full set of defaults (one value per expected column).
    final_data = DEFAULTS.copy()

    # 2. Overlay user input; None means "not provided", keep the default.
    for key, value in input_data.items():
        if value is not None:
            final_data[key] = value

    # 3. Feature engineering.
    # Hour: cyclical sine/cosine encoding plus a 4-way bin.  Guard against
    # an explicit None value, which would otherwise crash the arithmetic.
    h = input_data.get('incident_hour_of_the_day')
    if h is not None:
        hour_rad = (h / 24) * 2 * np.pi
        final_data['hour_sin'] = np.sin(hour_rad)
        final_data['hour_cos'] = np.cos(hour_rad)
        final_data['hour_bin_4'] = get_hour_bin(h)

    # Missing-value indicator flags.  The pipeline is assumed to handle
    # '?' as a trained category in the original columns; here we only
    # maintain the companion *_missing indicators.
    col_type = input_data.get('collision_type')
    _set_missing_flag(final_data, 'collision_type_missing',
                      col_type == '?' or col_type is None)

    police = input_data.get('police_report_available')
    _set_missing_flag(final_data, 'police_report_available_missing',
                      police == '?' or police is None)

    # authorities_contacted uses the literal string "None" as well as a
    # real None to signal "missing".
    auth = input_data.get('authorities_contacted')
    _set_missing_flag(final_data, 'authorities_contacted_missing',
                      auth is None or auth == "None")

    # 4. Share normalization: injury_share + property_share should sum to 1.
    inj_share = input_data.get('injury_share')
    prop_share = input_data.get('property_share')
    if inj_share is not None and prop_share is not None:
        # Both provided: rescale only when the sum is positive and off by
        # more than the 0.01 tolerance.
        total_share = inj_share + prop_share
        if abs(total_share - 1.0) > 0.01 and total_share > 0:
            final_data['injury_share'] = inj_share / total_share
            final_data['property_share'] = prop_share / total_share
    elif inj_share is not None:
        # Only injury provided: infer property as the clipped complement.
        final_data['property_share'] = max(0.0, 1.0 - inj_share)
    elif prop_share is not None:
        # Only property provided: infer injury as the clipped complement.
        final_data['injury_share'] = max(0.0, 1.0 - prop_share)
    else:
        # Neither provided: the defaults themselves may not sum to 1
        # (e.g. 0.125 each), so normalize whatever is currently in place
        # rather than silently inventing values.
        curr_inj = final_data.get('injury_share', 0.0)
        curr_prop = final_data.get('property_share', 0.0)
        curr_tot = curr_inj + curr_prop
        if abs(curr_tot - 1.0) > 0.01 and curr_tot > 0:
            final_data['injury_share'] = curr_inj / curr_tot
            final_data['property_share'] = curr_prop / curr_tot

    # 5. Build a single-row frame and enforce the strict column order.
    df = pd.DataFrame([final_data])
    return df[EXPECTED_COLS]