|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import joblib |
|
|
import os |
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.naive_bayes import GaussianNB |
|
|
from sklearn.ensemble import RandomForestClassifier |
|
|
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report |
|
|
from sklearn.preprocessing import LabelEncoder |
|
|
import xgboost as xgb |
|
|
from data_loader import load_data |
|
|
from preprocessing import preprocess_pipeline |
|
|
|
|
|
def train_and_evaluate() -> dict:
    """Train the candidate classifiers, score them, and persist the most accurate one.

    The pipeline loads the training frame via ``load_data``, runs it through
    ``preprocess_pipeline`` in training mode, label-encodes the ``PdDistrict``
    and ``Season`` columns, holds out 20% of the rows for validation, and fits
    Naive Bayes, Random Forest and XGBoost classifiers on the rest.

    Side effects:
        Creates ``../models`` (relative to this file) if needed and writes
        ``best_model.pkl``, ``label_encoders.pkl`` and ``kmeans.pkl`` there.

    Returns:
        A dict mapping each model name to a dict with the keys
        ``'Accuracy'``, ``'Precision'`` and ``'Recall'`` measured on the
        validation split.
    """
    base_dir = os.path.dirname(__file__)

    data_dir = os.path.join(base_dir, '../data/crimedataset')
    # load_data returns a pair; only the first element is used for training.
    train_df, _ = load_data(data_dir)

    print("Preprocessing data...")
    # In training mode no k-means model is supplied; the pipeline hands one
    # back, and it is saved below alongside the classifier.
    df, kmeans_model = preprocess_pipeline(train_df, is_train=True, kmeans_model=None)

    features = ['Hour', 'Day', 'Month', 'Year', 'DayOfWeek', 'IsWeekend', 'IsHoliday', 'LocationCluster', 'PdDistrict', 'Season']
    target = 'IsViolent'

    print("Encoding categorical features...")
    # Keep one fitted encoder per column so the same mapping can be reloaded
    # from label_encoders.pkl later.
    le_dict = {}
    for col in ['PdDistrict', 'Season']:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        le_dict[col] = le

    X = df[features]
    y = df[target]

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # NOTE(review): `use_label_encoder` is deprecated in recent XGBoost
    # releases -- confirm against the pinned version before removing it.
    models = {
        'Naive Bayes': GaussianNB(),
        'Random Forest': RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1),
        'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    }

    results = {}

    print("Training models...")
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        acc = accuracy_score(y_val, y_pred)
        # zero_division=0 keeps the 0.0 result for a model that never predicts
        # the positive class, without the UndefinedMetricWarning.
        prec = precision_score(y_val, y_pred, zero_division=0)
        rec = recall_score(y_val, y_pred, zero_division=0)

        results[name] = {'Accuracy': acc, 'Precision': prec, 'Recall': rec}
        print(f"{name} - Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}")

    # Pick the winner from the recorded accuracies. max() returns the first
    # maximum in insertion order (earliest model wins ties) and always yields
    # a fitted model, even if every accuracy is 0.0.
    best_name = max(results, key=lambda model_name: results[model_name]['Accuracy'])
    best_model = models[best_name]

    models_dir = os.path.join(base_dir, '../models')
    os.makedirs(models_dir, exist_ok=True)

    print(f"Saving best model: {best_model.__class__.__name__}")
    joblib.dump(best_model, os.path.join(models_dir, 'best_model.pkl'))
    joblib.dump(le_dict, os.path.join(models_dir, 'label_encoders.pkl'))
    joblib.dump(kmeans_model, os.path.join(models_dir, 'kmeans.pkl'))

    return results
|
|
|
|
|
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    train_and_evaluate()
|
|
|