# finalhackathon/src/train_model.py
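"""Train and compare crime classifiers (Naive Bayes, Random Forest, XGBoost)
on the preprocessed crime dataset, then save the best model and its
preprocessing artifacts (label encoders, KMeans clusterer) with joblib."""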
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from data_loader import load_data
from preprocessing import preprocess_pipeline
def train_and_evaluate():
    # Load Data
    data_dir = os.path.join(os.path.dirname(__file__), '../data/crimedataset')
    train_df, _ = load_data(data_dir)

    # Preprocess
    print("Preprocessing data...")
    # Pass None for kmeans_model to trigger fitting
    df, kmeans_model = preprocess_pipeline(train_df, is_train=True, kmeans_model=None)

    # Feature Selection
    features = ['Hour', 'Day', 'Month', 'Year', 'DayOfWeek', 'IsWeekend', 'IsHoliday', 'LocationCluster', 'PdDistrict', 'Season']
    target = 'IsViolent'
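    # 'IsViolent' is expected to be a binary 0/1 label; the default binary
    # precision/recall scoring below relies on this.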
    # Encoding Categorical Variables
    print("Encoding categorical features...")
    le_dict = {}
    for col in ['PdDistrict', 'Season']:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        le_dict[col] = le

    X = df[features]
    y = df[target]

    # Split Data
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
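    # X_val/y_val form a 20% hold-out validation set used to compare the models below.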
    models = {
        'Naive Bayes': GaussianNB(),
        'Random Forest': RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1),
        'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    }
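    # Note: use_label_encoder is deprecated in recent XGBoost releases and no
    # longer needed; it is kept here for compatibility with older versions.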
    best_model = None
    best_score = 0
    results = {}

    print("Training models...")
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred)
        rec = recall_score(y_val, y_pred)
        results[name] = {'Accuracy': acc, 'Precision': prec, 'Recall': rec}
        print(f"{name} - Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}")
        if acc > best_score:
            best_score = acc
            best_model = model

    # Save Artifacts
    models_dir = os.path.join(os.path.dirname(__file__), '../models')
    os.makedirs(models_dir, exist_ok=True)
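    # Persist the label encoders and KMeans model alongside the classifier so
    # the same preprocessing can be reproduced at inference time.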
print(f"Saving best model: {best_model.__class__.__name__}")
joblib.dump(best_model, os.path.join(models_dir, 'best_model.pkl'))
joblib.dump(le_dict, os.path.join(models_dir, 'label_encoders.pkl'))
joblib.dump(kmeans_model, os.path.join(models_dir, 'kmeans.pkl'))
return results
if __name__ == "__main__":
    train_and_evaluate()