mirror of
https://github.com/velocitatem/PHANTOM.git
synced 2026-05-31 08:33:36 +00:00
chore: syntax cleaning and code quality
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from sklearn.base import BaseEstimator, TransformerMixin
|
from sklearn.base import BaseEstimator, TransformerMixin
|
||||||
from procesing.context import PipelineContext
|
from procesing.context import PipelineContext
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
class BaseContextStep(BaseEstimator, TransformerMixin, ABC):
|
class BaseContextStep(BaseEstimator, TransformerMixin, ABC):
|
||||||
"""
|
"""
|
||||||
@@ -16,7 +17,7 @@ class BaseContextStep(BaseEstimator, TransformerMixin, ABC):
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def transform(self, X):
|
def transform(self, X) -> Any:
|
||||||
"""Transform input using context. Must be implemented by subclass."""
|
"""Transform input using context. Must be implemented by subclass."""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|||||||
@@ -29,11 +29,11 @@ class TemporalFeatureStep(BaseContextStep):
|
|||||||
self.timeout_sec = timeout_sec
|
self.timeout_sec = timeout_sec
|
||||||
self.velocity_window = velocity_window
|
self.velocity_window = velocity_window
|
||||||
|
|
||||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
df = X.copy()
|
||||||
if df.empty or 'ts' not in df.columns:
|
if df.empty or 'ts' not in df.columns:
|
||||||
return pd.DataFrame(columns=['sessionId'])
|
return pd.DataFrame(columns=pd.Series(['sessionId']))
|
||||||
|
|
||||||
df = df.copy()
|
|
||||||
df['ts_dt'] = pd.to_datetime(df['ts'])
|
df['ts_dt'] = pd.to_datetime(df['ts'])
|
||||||
df = df.sort_values(['sessionId', 'ts_dt'])
|
df = df.sort_values(['sessionId', 'ts_dt'])
|
||||||
df['time_diff'] = df.groupby('sessionId')['ts_dt'].diff().dt.total_seconds()
|
df['time_diff'] = df.groupby('sessionId')['ts_dt'].diff().dt.total_seconds()
|
||||||
@@ -53,19 +53,18 @@ class TemporalFeatureStep(BaseContextStep):
|
|||||||
(agg['total_interactions'] / agg['session_duration_sec']) * 60, 0)
|
(agg['total_interactions'] / agg['session_duration_sec']) * 60, 0)
|
||||||
|
|
||||||
vel = df.set_index('ts_dt').groupby('sessionId').resample(self.velocity_window, include_groups=False).size()
|
vel = df.set_index('ts_dt').groupby('sessionId').resample(self.velocity_window, include_groups=False).size()
|
||||||
agg = agg.merge(vel.groupby('sessionId').max().rename('max_velocity_5min'),
|
agg = agg.merge(vel.groupby('sessionId').max().rename('max_velocity_5min'),on='sessionId', how='left').fillna({'max_velocity_5min': 0}) # warns but its a series so whatevs
|
||||||
on='sessionId', how='left').fillna({'max_velocity_5min': 0})
|
|
||||||
return agg
|
return agg
|
||||||
|
|
||||||
|
|
||||||
class BehavioralFeatureStep(BaseContextStep):
|
class BehavioralFeatureStep(BaseContextStep):
|
||||||
"""Vectorized event counts and ratios per session."""
|
"""Vectorized event counts and ratios per session."""
|
||||||
|
|
||||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
df = X.copy()
|
||||||
if df.empty or 'eventName' not in df.columns:
|
if df.empty or 'eventName' not in df.columns:
|
||||||
return pd.DataFrame(columns=['sessionId'])
|
return pd.DataFrame(columns=pd.Series(['sessionId']))
|
||||||
|
|
||||||
df = df.copy()
|
|
||||||
for cat, events in EVENT_CATS.items():
|
for cat, events in EVENT_CATS.items():
|
||||||
df[f'is_{cat}'] = df['eventName'].isin(events)
|
df[f'is_{cat}'] = df['eventName'].isin(events)
|
||||||
df['is_hover'] = df['is_hover'] | df['eventName'].str.startswith('hover_over_')
|
df['is_hover'] = df['is_hover'] | df['eventName'].str.startswith('hover_over_')
|
||||||
@@ -86,11 +85,10 @@ class BehavioralFeatureStep(BaseContextStep):
|
|||||||
class ProductFeatureStep(BaseContextStep):
|
class ProductFeatureStep(BaseContextStep):
|
||||||
"""Vectorized product interaction features: diversity, depth, price sensitivity."""
|
"""Vectorized product interaction features: diversity, depth, price sensitivity."""
|
||||||
|
|
||||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
df = X.copy()
|
||||||
if df.empty:
|
if df.empty:
|
||||||
return pd.DataFrame(columns=['sessionId'])
|
return pd.DataFrame(columns=pd.Series(['sessionId']))
|
||||||
|
|
||||||
df = df.copy()
|
|
||||||
price_col = next((c for c in ['metadata_base_price', 'metadata_price', 'base_price'] if c in df.columns), None)
|
price_col = next((c for c in ['metadata_base_price', 'metadata_price', 'base_price'] if c in df.columns), None)
|
||||||
df['price_seen'] = pd.to_numeric(df[price_col], errors='coerce') if price_col else np.nan
|
df['price_seen'] = pd.to_numeric(df[price_col], errors='coerce') if price_col else np.nan
|
||||||
|
|
||||||
@@ -111,9 +109,10 @@ class ProductFeatureStep(BaseContextStep):
|
|||||||
class UserAgentFeatureStep(BaseContextStep):
|
class UserAgentFeatureStep(BaseContextStep):
|
||||||
"""Parse userAgent into bot-detection signals."""
|
"""Parse userAgent into bot-detection signals."""
|
||||||
|
|
||||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
def transform(self, X: pd.DataFrame) -> pd.DataFrame|pd.Series:
|
||||||
|
df = X.copy()
|
||||||
if df.empty or 'userAgent' not in df.columns:
|
if df.empty or 'userAgent' not in df.columns:
|
||||||
return pd.DataFrame(columns=['sessionId'])
|
return pd.DataFrame(columns=pd.Series(['sessionId']))
|
||||||
|
|
||||||
ua = df.groupby('sessionId')['userAgent'].first().reset_index()
|
ua = df.groupby('sessionId')['userAgent'].first().reset_index()
|
||||||
ua['is_headless'] = ua['userAgent'].str.contains(HEADLESS_RE, na=False)
|
ua['is_headless'] = ua['userAgent'].str.contains(HEADLESS_RE, na=False)
|
||||||
@@ -135,9 +134,10 @@ class ExtractSessionFeaturesStep(BaseContextStep):
|
|||||||
Output: session-level feature matrix
|
Output: session-level feature matrix
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
|
||||||
if df.empty:
|
if X.empty:
|
||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
|
df = X.copy()
|
||||||
|
|
||||||
# run all feature steps and merge on sessionId
|
# run all feature steps and merge on sessionId
|
||||||
temporal = TemporalFeatureStep(self.context).transform(df)
|
temporal = TemporalFeatureStep(self.context).transform(df)
|
||||||
@@ -165,7 +165,8 @@ class JoinLabelsStep(BaseContextStep):
|
|||||||
Output: labeled feature matrix with is_agent column
|
Output: labeled feature matrix with is_agent column
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def transform(self, data) -> pd.DataFrame:
|
def transform(self, X : tuple) -> pd.DataFrame:
|
||||||
|
data = X;
|
||||||
if isinstance(data, tuple):
|
if isinstance(data, tuple):
|
||||||
features_df, experiments_df = data
|
features_df, experiments_df = data
|
||||||
else:
|
else:
|
||||||
@@ -199,7 +200,8 @@ class ValidateDataStep(BaseContextStep):
|
|||||||
"""
|
"""
|
||||||
REQUIRED = ['sessionId', 'eventName', 'ts']
|
REQUIRED = ['sessionId', 'eventName', 'ts']
|
||||||
|
|
||||||
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
|
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
df = X.copy()
|
||||||
report = {'status': 'valid', 'rows': len(df), 'sessions': 0}
|
report = {'status': 'valid', 'rows': len(df), 'sessions': 0}
|
||||||
if df.empty:
|
if df.empty:
|
||||||
report['status'] = 'empty'
|
report['status'] = 'empty'
|
||||||
@@ -235,7 +237,7 @@ def _extract_features_for_session(session_df: pd.DataFrame, session_timeout_sec:
|
|||||||
session_df['sessionId'] = 'tmp'
|
session_df['sessionId'] = 'tmp'
|
||||||
|
|
||||||
# use a dummy context for the steps
|
# use a dummy context for the steps
|
||||||
class DummyCtx: config = {}
|
class DummyCtx: config = {} # should maybe inherit but whatever
|
||||||
ctx = DummyCtx()
|
ctx = DummyCtx()
|
||||||
|
|
||||||
t = TemporalFeatureStep(ctx, timeout_sec=session_timeout_sec).transform(session_df)
|
t = TemporalFeatureStep(ctx, timeout_sec=session_timeout_sec).transform(session_df)
|
||||||
|
|||||||
Reference in New Issue
Block a user