feat: separating modules and adding training logs paths

This commit is contained in:
2025-12-12 12:45:51 +01:00
parent 0119408897
commit 48cf50db32
2 changed files with 160 additions and 4 deletions

View File

@@ -9,6 +9,38 @@ from PIL import Image
logger = getLogger(__name__) logger = getLogger(__name__)
def log_feature_importance(writer, model, feature_names, epoch, top_k=20):
    """Visualize and log feature importance to TensorBoard.

    Logs one scalar per top-ranked feature under ``FeatureImportance/<name>``
    and a horizontal bar chart image under ``FeatureImportance/Chart``.

    Args:
        writer: object exposing ``add_scalar`` and ``add_image``
            (a TensorBoard ``SummaryWriter`` in this project).
        model: fitted model; nothing is logged unless it exposes a non-None,
            non-empty ``feature_importances_`` attribute.
        feature_names: sequence of names, indexable by feature position.
        epoch: global step under which scalars and the image are recorded.
        top_k: number of highest-importance features to log (default 20).

    Returns:
        None.
    """
    importance = getattr(model, 'feature_importances_', None)
    if importance is None:
        return

    # Accept plain sequences as well as arrays; fancy indexing below needs an ndarray.
    importance = np.asarray(importance)
    if importance.size == 0:
        return

    # argsort is ascending, so reverse it to rank the most important features first.
    indices = np.argsort(importance)[::-1][:top_k]
    top_features = [feature_names[i] for i in indices]
    top_importance = importance[indices]

    for feat, imp in zip(top_features, top_importance):
        writer.add_scalar(f'FeatureImportance/{feat}', imp, epoch)

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        positions = range(len(top_features))
        ax.barh(positions, top_importance, align='center')
        ax.set_yticks(positions)
        ax.set_yticklabels(top_features)
        ax.invert_yaxis()  # highest-ranked feature at the top of the chart
        ax.set_xlabel('Importance')
        ax.set_title(f'Top {top_k} Feature Importance (Epoch {epoch})')
        ax.grid(axis='x', alpha=0.3)

        # Render to an in-memory PNG and hand the decoded pixels to the writer.
        buf = io.BytesIO()
        fig.tight_layout()
        fig.savefig(buf, format='png', dpi=100)
        buf.seek(0)
        img_arr = np.array(Image.open(buf))
        writer.add_image('FeatureImportance/Chart', img_arr, epoch, dataformats='HWC')
    finally:
        # Close this specific figure even if rendering or logging fails, so
        # repeated calls do not accumulate open figures.
        plt.close(fig)
def evaluate(perdicted_class, predicted_proba, true_class, writer: SummaryWriter, epoch: int): def evaluate(perdicted_class, predicted_proba, true_class, writer: SummaryWriter, epoch: int):
accuracy = accuracy_score(true_class, perdicted_class) accuracy = accuracy_score(true_class, perdicted_class)
precision = precision_score(true_class, perdicted_class, zero_division=0) precision = precision_score(true_class, perdicted_class, zero_division=0)

View File

@@ -1,13 +1,137 @@
from torch.utils.tensorboard import SummaryWriter from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split
from logging import getLogger from logging import getLogger
from evals import evaluate from pathlib import Path
import pandas as pd
import numpy as np
import joblib
from datetime import datetime
from ml.evals import evaluate, log_feature_importance
from ml.arch import XGBoostAgentClassifier, LightGBMAgentClassifier, LABELS
logger = getLogger(__name__) logger = getLogger(__name__)
# Columns that prepare_data leaves out of the feature matrix. 'is_agent' is the
# label column; 'browser_family' is excluded here because prepare_data adds it
# back as one-hot encoded 'browser_*' columns.
FEATURE_COLS_EXCLUDE = ['sessionId', 'experimentId', 'is_agent', 'xp_human_only', 'xp_market_mode', 'browser_family']
# Directory holding one TensorBoard log sub-directory per training run
# (relative path, resolved against the current working directory).
RUNS_DIR = Path('ml/runs')
# Directory where trained models are saved with joblib, one .pkl file per run
# (relative path, resolved against the current working directory).
CHECKPOINTS_DIR = Path('ml/checkpoints')
def train():
    """Placeholder training entry point; performs no work and returns None.

    NOTE(review): a later, parameterized definition of ``train`` in this file
    appears to supersede this stub -- confirm whether it can be removed.
    """
    return None
def prepare_data(df):
    """
    Build the feature matrix and label vector from a raw sessions dataframe.

    Rows whose 'is_agent' value is missing are discarded (a warning reports how
    many). Every column not listed in FEATURE_COLS_EXCLUDE becomes a feature;
    'browser_family', when present, is one-hot encoded into 'browser_*' columns
    with the first category dropped. Remaining missing feature values are
    filled with 0 and the label is cast to int.

    Returns: (X, y, feature_cols), or (None, None, None) when no labeled rows remain
    """
    total_rows = len(df)
    has_label = df['is_agent'].notna()
    df = df[has_label].copy()

    # report how many unlabeled sessions were discarded
    missing = total_rows - len(df)
    if missing > 0:
        logger.warning(f"Dropped {missing} sessions with missing labels")

    if not len(df):
        logger.error("No labeled data available")
        return None, None, None

    feature_cols = []
    for column in df.columns:
        if column not in FEATURE_COLS_EXCLUDE:
            feature_cols.append(column)

    # one-hot encode the categorical browser_family column
    if 'browser_family' in df.columns:
        browser_dummies = pd.get_dummies(
            df['browser_family'],
            prefix='browser',
            drop_first=True,
        )
        df = pd.concat([df, browser_dummies], axis=1)
        feature_cols += browser_dummies.columns.tolist()

    features = df[feature_cols].fillna(0)
    labels = df['is_agent'].astype(int)
    return features, labels, feature_cols
def _load_sessions(data_path):
    """Read the labeled feature matrix from a CSV ('.csv' suffix) or parquet file."""
    if Path(data_path).suffix.lower() == '.csv':
        return pd.read_csv(data_path)
    return pd.read_parquet(data_path)


def train(data_path=None, model_type='xgboost', test_size=0.2, random_state=42,
          n_estimators=200, max_depth=6, learning_rate=0.05):
    """
    Train agent detection classifier

    All inputs are validated before any run directory or TensorBoard writer is
    created, so a rejected call leaves nothing behind on disk and no open writer.

    Args:
        data_path: path to labeled feature matrix CSV or parquet
        model_type: 'xgboost' or 'lightgbm'
        test_size: fraction for test split
        random_state: seed for reproducibility
        n_estimators: forwarded to the classifier constructor
        max_depth: forwarded to the classifier constructor
        learning_rate: forwarded to the classifier constructor

    Returns:
        (model, feature_names) on success; None when data_path is missing, the
        model type is unknown, the 'is_agent' column is absent, or no labeled
        rows remain (each case is logged as an error).

    Exceptions raised while reading the file, splitting, fitting or saving are
    not caught here and propagate to the caller.
    """
    model_classes = {
        'xgboost': XGBoostAgentClassifier,
        'lightgbm': LightGBMAgentClassifier,
    }

    # validate arguments before touching the filesystem
    if data_path is None:
        logger.error("data_path required")
        return None
    model_cls = model_classes.get(model_type)
    if model_cls is None:
        logger.error(f"Unknown model type: {model_type}")
        return None

    # load data
    df = _load_sessions(data_path)
    logger.info(f"Loaded {len(df)} sessions from {data_path}")

    # prepare features and labels
    if 'is_agent' not in df.columns:
        logger.error("Missing is_agent column")
        return None
    X, y, _ = prepare_data(df)
    if X is None:
        return None
    feature_names = X.columns.tolist()

    # class distribution (guard the ratio against a dataset without agents)
    n_agents = int(y.sum())
    n_humans = int((y == 0).sum())
    ratio = n_humans / n_agents if n_agents else float('inf')
    logger.info(f"Class distribution: {n_humans} humans, {n_agents} agents (ratio {ratio:.2f})")

    # train/test split with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    logger.info(f"Train: {len(X_train)}, Test: {len(X_test)}")

    # init model
    model = model_cls(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
    )

    # parents=True so a missing 'ml' directory does not abort the run
    RUNS_DIR.mkdir(parents=True, exist_ok=True)
    CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True)

    run_name = f"{model_type}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    logger.info(f"Starting training run: {run_name}")

    writer = SummaryWriter(log_dir=RUNS_DIR / run_name)
    try:
        # train with eval set for early stopping
        # NOTE(review): the held-out test split doubles as the eval set; if the
        # model wrappers use it for early stopping, the test metrics below are
        # optimistic -- consider a separate validation split.
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)])
        logger.info("Training complete")

        # evaluate on test set
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]
        evaluate(y_pred, y_prob, y_test, writer, epoch=0)

        # log feature importance
        log_feature_importance(writer, model, feature_names, epoch=0)

        # save model together with the column order it was trained on
        model_path = CHECKPOINTS_DIR / f"{run_name}.pkl"
        joblib.dump({'model': model, 'feature_cols': feature_names, 'run_name': run_name}, model_path)
        logger.info(f"Model saved to {model_path}")
    finally:
        # always release the event file, even if training or saving fails
        writer.close()

    return model, feature_names
if __name__ == "__main__":
    import sys

    # Command line: <script> DATA_PATH [MODEL_TYPE]
    # Fail with a usage message instead of an IndexError when the path is missing.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} DATA_PATH [xgboost|lightgbm]")

    data_path = sys.argv[1]
    model_type = sys.argv[2] if len(sys.argv) > 2 else 'xgboost'

    # train() returns None after logging an error; surface that as a non-zero exit code.
    if train(data_path, model_type=model_type) is None:
        sys.exit(1)