Files
PHANTOM/experiments/ml/train.py
Daniel Alves Rösel f2271e368e 34 initial discriminator of interaction data (#38)
* feat: training pipeline + tensorboard

* tensorboard forgot

* chore: ml basic boilerplate

* feat: naive architecture as start

* eval setup

* chore: parquet exporting of data

* chore: updating requirements necessary

* feat: separating modules and adding training logs paths

* Update experiments/ml/train.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* fix: new path for runs

* fix: undoing ai slop code

* chore: modules and reqs

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-12-14 18:58:42 +01:00

138 lines
4.4 KiB
Python

from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split
from logging import getLogger
from pathlib import Path
import pandas as pd
import numpy as np
import joblib
from datetime import datetime
from ml.evals import evaluate, log_feature_importance
from ml.arch import XGBoostAgentClassifier, LightGBMAgentClassifier, LABELS
logger = getLogger(__name__)

# Columns kept out of the feature matrix: the `is_agent` label,
# `browser_family` (categorical; one-hot encoded separately in prepare_data)
# and the remaining entries, which look like identifiers / experiment
# metadata -- TODO confirm against the export schema.
FEATURE_COLS_EXCLUDE = ['sessionId', 'experimentId', 'is_agent', 'xp_human_only', 'xp_market_mode', 'browser_family']

# Output locations. Both are relative paths, so they resolve against the
# current working directory of the process.
RUNS_DIR = Path('ml/runs')
CHECKPOINTS_DIR = Path('ml/checkpoints')


def prepare_data(df):
    """
    Prepare feature matrix and labels from raw dataframe.

    Rows whose `is_agent` label is missing are dropped, every column not
    listed in FEATURE_COLS_EXCLUDE becomes a feature, `browser_family` (when
    present) is one-hot encoded into `browser_*` columns, and remaining
    missing feature values are filled with 0. The input dataframe is not
    modified.

    Args:
        df: raw dataframe containing an `is_agent` label column.

    Returns:
        (X, y, feature_cols) on success, where X is the feature dataframe,
        y the integer labels and feature_cols the list of column names in X.
        (None, None, None) when the label column is missing or no labeled
        rows remain; the reason is logged at error level.
    """
    # Fail the same way as the "no labeled data" case instead of raising a
    # bare KeyError when the label column is absent.
    if 'is_agent' not in df.columns:
        logger.error("Missing is_agent column")
        return None, None, None

    # drop rows with missing labels (work on a copy so the caller's frame
    # is left untouched)
    labeled = df[df['is_agent'].notna()].copy()
    n_dropped = len(df) - len(labeled)
    if n_dropped > 0:
        logger.warning("Dropped %d sessions with missing labels", n_dropped)
    if labeled.empty:
        logger.error("No labeled data available")
        return None, None, None

    feature_cols = [c for c in labeled.columns if c not in FEATURE_COLS_EXCLUDE]

    # handle categorical browser_family via one-hot encoding; drop_first
    # removes one category so the dummy columns are not perfectly collinear
    if 'browser_family' in labeled.columns:
        browser_dummies = pd.get_dummies(labeled['browser_family'], prefix='browser', drop_first=True)
        labeled = pd.concat([labeled, browser_dummies], axis=1)
        feature_cols.extend(browser_dummies.columns.tolist())

    X = labeled[feature_cols].fillna(0)
    y = labeled['is_agent'].astype(int)
    return X, y, feature_cols
def train(data_path=None, model_type='xgboost', test_size=0.2, random_state=42,
          n_estimators=200, max_depth=6, learning_rate=0.05):
    """
    Train agent detection classifier.

    Loads the labeled feature matrix, builds features via prepare_data(),
    fits the selected model on a stratified train split, logs evaluation
    metrics and feature importances to TensorBoard under RUNS_DIR and saves
    the fitted model to CHECKPOINTS_DIR.

    Args:
        data_path: path to labeled feature matrix; a `.csv` suffix is read
            as CSV, anything else as parquet
        model_type: 'xgboost' or 'lightgbm'
        test_size: fraction for test split
        random_state: seed for reproducibility
        n_estimators, max_depth, learning_rate: forwarded unchanged to the
            selected classifier's constructor

    Returns:
        (model, feature_names) on success, None when the inputs are invalid
        or no usable data is available (the reason is logged).
    """
    # Validate the cheap things first so a bad call does not create run
    # directories or leave an open SummaryWriter behind.
    if data_path is None:
        logger.error("data_path required")
        return None

    model_classes = {
        'xgboost': XGBoostAgentClassifier,
        'lightgbm': LightGBMAgentClassifier,
    }
    model_cls = model_classes.get(model_type)
    if model_cls is None:
        logger.error(f"Unknown model type: {model_type}")
        return None

    run_name = f"{model_type}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    logger.info(f"Starting training run: {run_name}")

    # load data
    if Path(data_path).suffix.lower() == '.csv':
        df = pd.read_csv(data_path)
    else:
        df = pd.read_parquet(data_path)
    logger.info(f"Loaded {len(df)} sessions from {data_path}")

    # prepare features and labels
    if 'is_agent' not in df.columns:
        logger.error("Missing is_agent column")
        return None
    X, y, _ = prepare_data(df)
    if X is None:
        return None
    feature_names = X.columns.tolist()

    # class distribution
    n_agents = int(y.sum())
    n_humans = int((y == 0).sum())
    logger.info(f"Class distribution: {n_humans} humans, {n_agents} agents" + (f" (ratio {n_humans / n_agents:.2f})" if n_agents > 0 else ""))

    # A stratified split needs at least two members of every class, and a
    # binary classifier needs both classes to be present at all.
    if min(n_agents, n_humans) < 2:
        logger.error("Need at least 2 sessions of each class for a stratified split")
        return None

    # train/test split with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    logger.info(f"Train: {len(X_train)}, Test: {len(X_test)}")

    # init model
    model = model_cls(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
    )

    # parents=True: the default relative paths live under 'ml/', which may
    # not exist yet in the current working directory.
    RUNS_DIR.mkdir(parents=True, exist_ok=True)
    CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True)

    writer = SummaryWriter(log_dir=RUNS_DIR / run_name)
    try:
        # train with eval set for early stopping
        # NOTE(review): the held-out test split doubles as the eval set, so
        # if the wrapper early-stops on it the test metrics below are
        # optimistically biased -- consider a separate validation split.
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)])
        logger.info("Training complete")

        # evaluate on test set
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]
        evaluate(y_pred, y_prob, y_test, writer, epoch=0)

        # log feature importance
        log_feature_importance(writer, model, feature_names, epoch=0)

        # save model together with the column order it was trained on
        model_path = CHECKPOINTS_DIR / f"{run_name}.pkl"
        joblib.dump({'model': model, 'feature_cols': feature_names, 'run_name': run_name}, model_path)
        logger.info(f"Model saved to {model_path}")
    finally:
        # flush and release the event file even if training or saving fails
        writer.close()

    return model, feature_names
if __name__ == "__main__":
    # Script entry point: python train.py <data_path> [model_type]
    import logging
    import sys

    # With no handler configured, logging only emits WARNING and above, so
    # the INFO progress messages from train() would be dropped. basicConfig
    # does nothing if the root logger already has handlers.
    logging.basicConfig(level=logging.INFO)

    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <data_path> [xgboost|lightgbm]")

    data_path = sys.argv[1]
    model_type = sys.argv[2] if len(sys.argv) > 2 else 'xgboost'

    # train() returns None after logging an error; surface that as a
    # non-zero exit status for shell callers.
    if train(data_path, model_type=model_type) is None:
        sys.exit(1)