def log_feature_importance(writer, model, feature_names, epoch, top_n=20):
    """Log the model's most important features to TensorBoard.

    Writes one scalar per top feature under ``FeatureImportance/<name>`` and a
    horizontal bar chart image under ``FeatureImportance/Chart``.

    Args:
        writer: object exposing ``add_scalar`` and ``add_image`` (called the
            same way as on a TensorBoard ``SummaryWriter``).
        model: fitted estimator; only its ``feature_importances_`` attribute
            is read.
        feature_names: sequence of names, positionally aligned with
            ``model.feature_importances_``.
        epoch: global step passed to the writer and shown in the chart title.
        top_n: maximum number of features to log (default 20).

    Returns:
        None. Nothing is logged when the model exposes no importances, when
        they are empty, when ``top_n`` is not positive, or when the number of
        names does not match the number of importances.
    """
    importance = getattr(model, 'feature_importances_', None)
    if importance is None:
        return

    importance = np.asarray(importance)
    if importance.size == 0 or top_n <= 0:
        return

    # Misaligned names would either raise IndexError or silently mislabel the
    # bars; skip logging rather than abort the caller's training run.
    if len(feature_names) != importance.shape[0]:
        logger.warning(
            f"Skipping feature importance logging: {len(feature_names)} names "
            f"for {importance.shape[0]} importances"
        )
        return

    indices = np.argsort(importance)[::-1][:top_n]
    top_features = [feature_names[i] for i in indices]
    top_importance = importance[indices]

    for feat, imp in zip(top_features, top_importance):
        writer.add_scalar(f'FeatureImportance/{feat}', float(imp), epoch)

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        positions = range(len(top_features))
        ax.barh(positions, top_importance, align='center')
        ax.set_yticks(positions)
        ax.set_yticklabels(top_features)
        ax.invert_yaxis()  # most important feature at the top
        ax.set_xlabel('Importance')
        ax.set_title(f'Top {len(top_features)} Feature Importance (Epoch {epoch})')
        ax.grid(axis='x', alpha=0.3)
        fig.tight_layout()

        # Operate on `fig` explicitly instead of pyplot's "current figure",
        # which may be a different figure if the caller has others open.
        with io.BytesIO() as buf:
            fig.savefig(buf, format='png', dpi=100)
            buf.seek(0)
            # convert() forces the lazy PIL load while the buffer is still
            # open and drops the alpha channel so the array is plain H x W x 3.
            img_arr = np.array(Image.open(buf).convert('RGB'))
    finally:
        # Always release the figure, even when rendering fails.
        plt.close(fig)

    writer.add_image('FeatureImportance/Chart', img_arr, epoch, dataformats='HWC')
logger = getLogger(__name__)

# Identifier, label and experiment-metadata columns that must never be fed to
# the model as features. 'browser_family' is excluded here because it is
# re-added in one-hot encoded form by prepare_data().
FEATURE_COLS_EXCLUDE = ['sessionId', 'experimentId', 'is_agent', 'xp_human_only', 'xp_market_mode', 'browser_family']
RUNS_DIR = Path('ml/runs')
CHECKPOINTS_DIR = Path('ml/checkpoints')


def prepare_data(df):
    """Build the feature matrix and label vector from a raw session dataframe.

    Rows whose ``is_agent`` label is missing are dropped, columns listed in
    ``FEATURE_COLS_EXCLUDE`` are removed from the feature set, and
    ``browser_family`` (when present) is one-hot encoded with the first
    category dropped. Remaining missing feature values are filled with 0.

    Args:
        df: dataframe with an ``is_agent`` column plus feature columns.

    Returns:
        ``(X, y, feature_cols)`` on success, or ``(None, None, None)`` when
        the label column is absent or no labeled rows remain.
    """
    if df is None or 'is_agent' not in df.columns:
        logger.error("Missing is_agent column")
        return None, None, None

    # drop rows with missing labels
    labeled = df[df['is_agent'].notna()].copy()
    n_dropped = len(df) - len(labeled)
    if n_dropped > 0:
        logger.warning(f"Dropped {n_dropped} sessions with missing labels")

    if len(labeled) == 0:
        logger.error("No labeled data available")
        return None, None, None

    feature_cols = [c for c in labeled.columns if c not in FEATURE_COLS_EXCLUDE]

    # handle categorical browser_family via one-hot encoding
    if 'browser_family' in labeled.columns:
        browser_dummies = pd.get_dummies(labeled['browser_family'], prefix='browser', drop_first=True)
        # Skip dummies whose name already exists (e.g. a pre-encoded frame);
        # duplicate column names would otherwise end up in the feature matrix.
        new_cols = [c for c in browser_dummies.columns if c not in labeled.columns]
        labeled = pd.concat([labeled, browser_dummies[new_cols]], axis=1)
        feature_cols.extend(new_cols)

    X = labeled[feature_cols].fillna(0)
    y = labeled['is_agent'].astype(int)

    return X, y, feature_cols


def _load_frame(data_path):
    """Read the labeled feature matrix; ``.csv`` files as CSV, anything else as parquet."""
    if Path(data_path).suffix.lower() == '.csv':
        return pd.read_csv(data_path)
    return pd.read_parquet(data_path)


def train(data_path=None, model_type='xgboost', test_size=0.2, random_state=42,
          n_estimators=200, max_depth=6, learning_rate=0.05):
    """Train an agent-detection classifier and save it as a checkpoint.

    Args:
        data_path: path to the labeled feature matrix (CSV or parquet).
        model_type: 'xgboost' or 'lightgbm'.
        test_size: fraction of rows held out for the test split.
        random_state: seed for the train/test split.
        n_estimators: forwarded to the classifier constructor.
        max_depth: forwarded to the classifier constructor.
        learning_rate: forwarded to the classifier constructor.

    Returns:
        ``(model, feature_names)`` on success. ``None`` when the inputs are
        invalid: missing ``data_path``, unknown ``model_type``, missing or
        entirely empty labels, or only one class present.
    """
    # Validate cheap arguments first so a bad call leaves nothing on disk.
    if data_path is None:
        logger.error("data_path required")
        return None
    if model_type not in ('xgboost', 'lightgbm'):
        logger.error(f"Unknown model type: {model_type}")
        return None

    run_name = f"{model_type}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    logger.info(f"Starting training run: {run_name}")

    # load data
    df = _load_frame(data_path)
    logger.info(f"Loaded {len(df)} sessions from {data_path}")

    # prepare features and labels
    if 'is_agent' not in df.columns:
        logger.error("Missing is_agent column")
        return None

    X, y, feature_cols = prepare_data(df)
    if X is None:
        return None

    # class distribution
    n_agents = int(y.sum())
    n_humans = int((y == 0).sum())
    if n_agents == 0 or n_humans == 0:
        # A single-class dataset cannot train a binary classifier and would
        # make the ratio below divide by zero.
        logger.error(f"Need both classes to train: {n_humans} humans, {n_agents} agents")
        return None
    logger.info(f"Class distribution: {n_humans} humans, {n_agents} agents (ratio {n_humans/n_agents:.2f})")

    # train/test split with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    logger.info(f"Train: {len(X_train)}, Test: {len(X_test)}")

    # init model
    if model_type == 'xgboost':
        model = XGBoostAgentClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate
        )
    else:
        model = LightGBMAgentClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate
        )

    # Output locations are created only once all inputs are known to be good.
    RUNS_DIR.mkdir(parents=True, exist_ok=True)
    CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True)

    writer = SummaryWriter(log_dir=str(RUNS_DIR / run_name))
    try:
        # train with eval set for early stopping
        # NOTE(review): the eval set is the same split the metrics below are
        # reported on; if the classifier wrappers really early-stop on it, the
        # reported test metrics are optimistically biased. Consider carving a
        # separate validation split out of the training data.
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)])
        logger.info("Training complete")

        # evaluate on test set
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]

        evaluate(y_pred, y_prob, y_test, writer, epoch=0)

        # log feature importance
        log_feature_importance(writer, model, feature_cols, epoch=0)

        # save model
        model_path = CHECKPOINTS_DIR / f"{run_name}.pkl"
        joblib.dump({'model': model, 'feature_cols': feature_cols, 'run_name': run_name}, model_path)
        logger.info(f"Model saved to {model_path}")
    finally:
        # Flush and release the event file even when training or saving fails.
        writer.close()

    return model, feature_cols


if __name__ == "__main__":
    import logging
    import sys

    if len(sys.argv) < 2:
        sys.exit("usage: train.py <data_path> [xgboost|lightgbm]")

    # Without a handler the info-level progress messages would be dropped.
    logging.basicConfig(level=logging.INFO)

    cli_model_type = sys.argv[2] if len(sys.argv) > 2 else 'xgboost'
    result = train(sys.argv[1], model_type=cli_model_type)
    sys.exit(0 if result is not None else 1)