feat: introduced cumulative features step for state definition

This commit is contained in:
2025-11-29 22:28:40 +01:00
parent d654bbf4b4
commit 955102090d
6 changed files with 135 additions and 181 deletions

View File

@@ -8,23 +8,123 @@ from typing import Optional, Dict, Any
from collections import Counter
from procesing.steps.base import BaseContextStep
def _extract_features_for_session(session_df: pd.DataFrame, session_timeout_sec: float = 900) -> Dict[str, Any]:
    """Compute behavioural features for the events of a single session.

    Args:
        session_df: interaction events for this session. The columns
            ``eventName`` and ``productId`` are read unconditionally; ``ts``
            is optional and, when present, is used for the temporal features.
        session_timeout_sec: gaps between consecutive events longer than this
            many seconds are excluded from the active session duration
            (default 900s = 15min).

    Returns:
        Dict with the keys ``total_interactions``, ``page_views``,
        ``item_views``, ``searches``, ``cart_adds``, ``hovers``,
        ``unique_products_viewed``, ``product_view_depth``,
        ``session_duration_sec``, ``avg_time_between_events``,
        ``std_time_between_events``, ``interaction_velocity`` and
        ``cart_to_view_ratio``. All values are finite numbers (never NaN).
    """
    # basic counts
    event_counts = session_df['eventName'].value_counts().to_dict()
    item_views = event_counts.get('view_item_page', 0)
    cart_adds = event_counts.get('add_item_to_cart', 0)
    hover_events = ('hover_over_title', 'hover_over_paragraph', 'hover_over_link', 'hover_over_button')

    features: Dict[str, Any] = {
        'total_interactions': len(session_df),
        # an item page view also counts as a page view
        'page_views': event_counts.get('page_view', 0) + item_views,
        'item_views': item_views,
        'searches': event_counts.get('search', 0),
        'cart_adds': cart_adds,
        'hovers': sum(event_counts.get(name, 0) for name in hover_events),
    }

    # product-level signals (rows without a productId are ignored)
    product_ids = session_df['productId'].dropna()
    features['unique_products_viewed'] = int(product_ids.nunique())
    # highest number of events recorded for any single product
    features['product_view_depth'] = int(product_ids.value_counts().max()) if len(product_ids) > 0 else 0

    # temporal features with session timeout logic
    duration = 0.0
    avg_gap = 0.0
    std_gap = 0.0
    if 'ts' in session_df.columns:
        timestamps = session_df['ts'].sort_values()
        if len(timestamps) > 1:
            # NOTE(review): the .dt accessor assumes 'ts' is datetime-like - confirm upstream parsing
            gaps = timestamps.diff().dropna().dt.total_seconds()
            if len(gaps) > 0:
                # only gaps within the timeout count towards active duration
                duration = float(gaps[gaps <= session_timeout_sec].sum())
                # the mean deliberately uses all gaps, including those above the timeout
                avg_gap = float(gaps.mean())
            if len(gaps) > 1:
                # sample std is undefined (NaN) for a single gap, so keep 0.0 there
                std_gap = float(gaps.std())

    features['session_duration_sec'] = duration
    features['avg_time_between_events'] = avg_gap
    features['std_time_between_events'] = std_gap
    # events per minute of active time; 0.0 when there is no active time
    if duration > 0:
        features['interaction_velocity'] = (features['total_interactions'] / duration) * 60
    else:
        features['interaction_velocity'] = 0.0

    # cart/conversion signals
    features['cart_to_view_ratio'] = cart_adds / item_views if item_views > 0 else 0.0
    return features
def _apply_to_slice(df: pd.DataFrame) -> pd.DataFrame:
    """Attach cumulative session features to every event row.

    For the row at position ``i`` the features are computed by
    ``_extract_features_for_session`` over rows ``0..i`` (inclusive), i.e.
    over the growing prefix of ``df`` in its current row order.

    Args:
        df: events of one session, already in the desired order.

    Returns:
        The same ``df`` object, modified in place, with one float column
        added (or overwritten) per feature. NaN feature values are stored
        as 0.0.
    """
    new_cols = ["total_interactions", "page_views", "item_views", "searches",
                "cart_adds", "hovers", "unique_products_viewed", "product_view_depth",
                "session_duration_sec", "interaction_velocity",
                "avg_time_between_events", "std_time_between_events",
                "cart_to_view_ratio"]

    # NOTE: every prefix is recomputed from scratch, so the cost grows
    # quadratically with the number of events in the session.
    rows = []
    for end in range(1, len(df) + 1):
        features = _extract_features_for_session(df.iloc[:end])
        rows.append([0.0 if pd.isna(features[col]) else features[col] for col in new_cols])

    # Write whole columns by position instead of per-cell label writes:
    # with a non-unique index, a label-based write hits every row sharing
    # that label and would overwrite earlier cumulative values.
    values = np.array(rows, dtype=float).reshape(len(df), len(new_cols))
    for pos, col in enumerate(new_cols):
        df[col] = values[:, pos]
    return df
class BuildStateSpaceStep(BaseContextStep):
    """
    Pipeline step producing the state representation S_t.

    Input: DataFrame that already carries the cumulative feature columns
    Output: a copy of that DataFrame (the features are used as the state as-is)
    """

    def transform(self, rich_dataset: pd.DataFrame) -> pd.DataFrame:
        """Validate the feature columns and hand back a copy of the data.

        Raises:
            ValueError: if any expected feature column is absent.

        Returns:
            A new empty DataFrame when ``rich_dataset`` is empty, otherwise
            a copy of ``rich_dataset``.
        """
        expected = (
            "total_interactions", "page_views", "item_views", "searches",
            "cart_adds", "hovers", "unique_products_viewed", "product_view_depth",
            "session_duration_sec", "interaction_velocity",
            "avg_time_between_events", "std_time_between_events",
            "cart_to_view_ratio",
        )
        available = rich_dataset.columns
        # the column check runs before the emptiness check
        if any(name not in available for name in expected):
            raise ValueError("Missing required columns for feature extraction.")

        # no further transformation yet: the validated frame is the state space
        return pd.DataFrame() if rich_dataset.empty else rich_dataset.copy()
class ExtractSessionFeaturesStep(BaseContextStep):
"""
Extract session-level behavioral features from interaction logs.
Input: interactions_df (user-interactions from earlier pipeline step)
Output: session_features DataFrame [sessionId, feature_1, feature_2, ...]
Features computed:
- total_interactions: count of all events
- page_views, item_views, searches, cart_adds: event type counts
- hovers: hover event counts
- unique_products_viewed: distinct product IDs
- interaction_velocity: events per minute
- session_duration_sec: time span of session
- avg_time_between_events: mean inter-event time
- product_view_depth: max views for single product (attention signal)
Output: interactions_df with added session feature columns
"""
def transform(self, interactions_df: pd.DataFrame) -> pd.DataFrame:
@@ -39,66 +139,11 @@ class ExtractSessionFeaturesStep(BaseContextStep):
# group by session and compute features
session_features = []
for session_id, session_df in interactions_df.groupby('sessionId'):
features = self._extract_features_for_session(session_id, session_df)
session_features.append(features)
new_slice = _apply_to_slice(session_df.sort_values('ts'))
session_features.append(new_slice)
return pd.DataFrame(session_features)
return pd.concat(session_features, ignore_index=True)
def _extract_features_for_session(self, session_id: str, session_df: pd.DataFrame) -> Dict[str, Any]:
    """Compute one row of aggregate features for a whole session.

    Args:
        session_id: identifier stored under the ``sessionId`` key of the result.
        session_df: events of the session. The columns ``eventName`` and
            ``productId`` are read unconditionally; ``ts`` is optional.

    Returns:
        Dict with ``sessionId`` plus the event counts, product signals,
        temporal features and ``cart_to_view_ratio``. All numeric values are
        finite (never NaN).
    """
    features: Dict[str, Any] = {'sessionId': session_id}

    # basic counts
    event_counts = session_df['eventName'].value_counts().to_dict()
    item_views = event_counts.get('view_item_page', 0)
    cart_adds = event_counts.get('add_item_to_cart', 0)
    hover_events = ('hover_over_title', 'hover_over_paragraph', 'hover_over_link', 'hover_over_button')
    features['total_interactions'] = len(session_df)
    # an item page view also counts as a page view
    features['page_views'] = event_counts.get('page_view', 0) + item_views
    features['item_views'] = item_views
    features['searches'] = event_counts.get('search', 0)
    features['cart_adds'] = cart_adds
    features['hovers'] = sum(event_counts.get(name, 0) for name in hover_events)

    # product-level signals (rows without a productId are ignored)
    product_ids = session_df['productId'].dropna()
    features['unique_products_viewed'] = int(product_ids.nunique())
    features['product_view_depth'] = int(product_ids.value_counts().max()) if len(product_ids) > 0 else 0

    # temporal features
    duration = 0.0
    avg_gap = 0.0
    std_gap = 0.0
    if 'ts' in session_df.columns:
        # missing timestamps carry no timing information; dropping them up
        # front keeps the duration from turning into NaN
        timestamps = session_df['ts'].dropna().sort_values()
        if len(timestamps) > 1:
            # full span between first and last event
            duration = float((timestamps.iloc[-1] - timestamps.iloc[0]).total_seconds())
            # NOTE(review): the .dt accessor assumes 'ts' is datetime-like - confirm upstream parsing
            gaps = timestamps.diff().dropna().dt.total_seconds()
            avg_gap = float(gaps.mean())
            if len(gaps) > 1:
                # sample std is undefined (NaN) for a single gap, so keep 0.0 there
                std_gap = float(gaps.std())

    features['session_duration_sec'] = duration
    # events per minute; 0.0 when the session has no measurable duration
    if duration > 0:
        features['interaction_velocity'] = (features['total_interactions'] / duration) * 60
    else:
        features['interaction_velocity'] = 0.0
    features['avg_time_between_events'] = avg_gap
    features['std_time_between_events'] = std_gap

    # cart/conversion signals
    features['cart_to_view_ratio'] = cart_adds / item_views if item_views > 0 else 0.0
    return features
class FilterSessionInteractionsStep(BaseContextStep):