mirror of
https://github.com/velocitatem/PHANTOM.git
synced 2026-05-31 08:33:36 +00:00
feat: separating modules and adding training logs paths
This commit is contained in:
@@ -9,6 +9,38 @@ from PIL import Image
|
|||||||
|
|
||||||
logger = getLogger(__name__)
|
logger = getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def log_feature_importance(writer, model, feature_names, epoch, top_n=20):
    """Visualize and log feature importance to TensorBoard.

    Writes one scalar per selected feature under ``FeatureImportance/<name>``
    and a horizontal bar chart image under ``FeatureImportance/Chart``.
    Does nothing when ``model`` has no ``feature_importances_`` attribute or
    when that attribute is ``None``.

    Args:
        writer: object exposing ``add_scalar`` and ``add_image``.
        model: estimator; only its ``feature_importances_`` attribute is read.
        feature_names: sequence of names, indexed by feature position.
        epoch: step value attached to every logged scalar and to the image.
        top_n: how many of the highest-importance features to log.
    """
    importance = getattr(model, 'feature_importances_', None)
    if importance is None:
        return

    # Coerce to an array so list/tuple importances support the index lookup below.
    importance = np.asarray(importance)
    order = np.argsort(importance)[::-1][:top_n]  # descending, truncated
    top_features = [feature_names[i] for i in order]
    top_importance = importance[order]

    for feat, imp in zip(top_features, top_importance):
        writer.add_scalar(f'FeatureImportance/{feat}', imp, epoch)

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        positions = range(len(top_features))
        ax.barh(positions, top_importance, align='center')
        ax.set_yticks(positions)
        ax.set_yticklabels(top_features)
        ax.invert_yaxis()  # most important feature at the top of the chart
        ax.set_xlabel('Importance')
        ax.set_title(f'Top {top_n} Feature Importance (Epoch {epoch})')
        ax.grid(axis='x', alpha=0.3)
        fig.tight_layout()

        # Render to an in-memory PNG and hand the decoded pixels to the writer.
        buf = io.BytesIO()
        fig.savefig(buf, format='png', dpi=100)
        buf.seek(0)
        img_arr = np.array(Image.open(buf))
        writer.add_image('FeatureImportance/Chart', img_arr, epoch, dataformats='HWC')
    finally:
        # Close this specific figure even if rendering or logging fails,
        # so repeated calls do not accumulate open figures.
        plt.close(fig)
|
||||||
|
|
||||||
def evaluate(perdicted_class, predicted_proba, true_class, writer: SummaryWriter, epoch: int):
|
def evaluate(perdicted_class, predicted_proba, true_class, writer: SummaryWriter, epoch: int):
|
||||||
accuracy = accuracy_score(true_class, perdicted_class)
|
accuracy = accuracy_score(true_class, perdicted_class)
|
||||||
precision = precision_score(true_class, perdicted_class, zero_division=0)
|
precision = precision_score(true_class, perdicted_class, zero_division=0)
|
||||||
|
|||||||
@@ -1,13 +1,137 @@
|
|||||||
from torch.utils.tensorboard import SummaryWriter
|
from torch.utils.tensorboard import SummaryWriter
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
from logging import getLogger
|
from logging import getLogger
|
||||||
from evals import evaluate
|
from pathlib import Path
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import joblib
|
||||||
|
from datetime import datetime
|
||||||
|
from ml.evals import evaluate, log_feature_importance
|
||||||
|
from ml.arch import XGBoostAgentClassifier, LightGBMAgentClassifier, LABELS
|
||||||
|
|
||||||
logger = getLogger(__name__)
|
logger = getLogger(__name__)
|
||||||
|
|
||||||
|
# Columns that prepare_data() never uses directly as model inputs.
# 'is_agent' is the label; 'browser_family' is one-hot encoded separately.
# The remaining names look like identifiers / experiment flags — confirm
# against the feature-extraction code.
FEATURE_COLS_EXCLUDE = [
    'sessionId',
    'experimentId',
    'is_agent',
    'xp_human_only',
    'xp_market_mode',
    'browser_family',
]

# Parent directory for per-run TensorBoard log directories.
RUNS_DIR = Path('ml') / 'runs'

# Directory where trained model bundles are written.
CHECKPOINTS_DIR = Path('ml') / 'checkpoints'
|
||||||
|
|
||||||
def train():
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
def prepare_data(df):
    """
    Build the model inputs from a raw session dataframe.

    Rows whose ``is_agent`` label is missing are discarded. Every column not
    listed in FEATURE_COLS_EXCLUDE becomes a feature, and ``browser_family``
    (when present) is expanded into ``browser_*`` indicator columns with the
    first category dropped. Remaining NaNs in the features are filled with 0.

    Returns: (X, y, feature_cols), or (None, None, None) when no labeled
    rows remain.
    """
    total_rows = len(df)
    has_label = df['is_agent'].notna()
    labeled = df[has_label].copy()

    # report how many sessions were lost to missing labels
    missing = total_rows - len(labeled)
    if missing > 0:
        logger.warning(f"Dropped {missing} sessions with missing labels")

    if len(labeled) == 0:
        logger.error("No labeled data available")
        return None, None, None

    # feature list is taken before the indicator columns are appended
    feature_cols = []
    for column in labeled.columns:
        if column not in FEATURE_COLS_EXCLUDE:
            feature_cols.append(column)

    # categorical browser_family -> one-hot indicator columns
    if 'browser_family' in labeled.columns:
        one_hot = pd.get_dummies(labeled['browser_family'], prefix='browser', drop_first=True)
        labeled = pd.concat([labeled, one_hot], axis=1)
        feature_cols += one_hot.columns.tolist()

    labels = labeled['is_agent'].astype(int)
    features = labeled[feature_cols].fillna(0)
    return features, labels, feature_cols
|
||||||
|
|
||||||
|
|
||||||
|
def _load_sessions(data_path):
    """Read the labeled feature matrix, picking the reader from the file suffix."""
    if Path(data_path).suffix.lower() == '.csv':
        return pd.read_csv(data_path)
    return pd.read_parquet(data_path)


def train(data_path=None, model_type='xgboost', test_size=0.2, random_state=42,
          n_estimators=200, max_depth=6, learning_rate=0.05):
    """
    Train agent detection classifier

    All inputs are validated before any run directory or TensorBoard writer
    is created, so a rejected call leaves nothing behind on disk. The writer
    is always closed, including when fitting, evaluation or saving fails.

    Args:
        data_path: path to labeled feature matrix CSV or parquet
            (a ``.csv`` suffix selects the CSV reader, anything else parquet)
        model_type: 'xgboost' or 'lightgbm'
        test_size: fraction for test split
        random_state: seed for reproducibility
        n_estimators: forwarded to the classifier constructor
        max_depth: forwarded to the classifier constructor
        learning_rate: forwarded to the classifier constructor

    Returns:
        (model, feature_names) on success; None when the arguments or the
        data are rejected (the reason is logged at error level).
    """
    if data_path is None:
        logger.error("data_path required")
        return

    model_classes = {
        'xgboost': XGBoostAgentClassifier,
        'lightgbm': LightGBMAgentClassifier,
    }
    model_cls = model_classes.get(model_type)
    if model_cls is None:
        logger.error(f"Unknown model type: {model_type}")
        return

    run_name = f"{model_type}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    logger.info(f"Starting training run: {run_name}")

    # load data
    df = _load_sessions(data_path)
    logger.info(f"Loaded {len(df)} sessions from {data_path}")

    # prepare features and labels
    if 'is_agent' not in df.columns:
        logger.error("Missing is_agent column")
        return

    X, y, feature_cols = prepare_data(df)
    if X is None:
        return
    feature_names = X.columns.tolist()

    # class distribution; guard the ratio so an agent-free dataset does not divide by zero
    n_agents = int(y.sum())
    n_humans = int((y == 0).sum())
    ratio = n_humans / n_agents if n_agents else float('inf')
    logger.info(f"Class distribution: {n_humans} humans, {n_agents} agents (ratio {ratio:.2f})")

    # train/test split with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    logger.info(f"Train: {len(X_train)}, Test: {len(X_test)}")

    # init model
    model = model_cls(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
    )

    # parents=True: the default paths live under 'ml/', which may not exist yet
    RUNS_DIR.mkdir(parents=True, exist_ok=True)
    CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True)

    writer = SummaryWriter(log_dir=RUNS_DIR / run_name)
    try:
        # NOTE(review): the held-out test split is also passed as eval_set. If the
        # classifier wrapper uses it for early stopping, the metrics logged below
        # are optimistic — consider a separate validation split.
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)])
        logger.info("Training complete")

        # evaluate on test set
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]
        evaluate(y_pred, y_prob, y_test, writer, epoch=0)

        # log feature importance
        log_feature_importance(writer, model, feature_names, epoch=0)

        # save model together with the column order it was trained on
        model_path = CHECKPOINTS_DIR / f"{run_name}.pkl"
        joblib.dump({'model': model, 'feature_cols': feature_names, 'run_name': run_name}, model_path)
        logger.info(f"Model saved to {model_path}")
    finally:
        writer.close()

    return model, feature_names
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    import logging
    import sys

    # CLI: <data_path> [model_type]; fail with a usage line instead of an IndexError.
    if len(sys.argv) < 2:
        raise SystemExit(f"usage: {sys.argv[0]} <data_path> [xgboost|lightgbm]")

    # Make the INFO-level progress messages from train() visible; this call
    # is a no-op if the root logger already has handlers configured.
    logging.basicConfig(level=logging.INFO)

    data_path = sys.argv[1]
    model_type = sys.argv[2] if len(sys.argv) > 2 else 'xgboost'
    train(data_path, model_type=model_type)
|
||||||
|
|||||||
Reference in New Issue
Block a user