extra session feature extraction

This commit is contained in:
2025-11-29 17:42:30 +01:00
parent d0d18927cf
commit 5d5795b212

View File

@@ -0,0 +1,114 @@
"""
Session feature extraction for S_t component of state space.
Computes behavioral signals from interaction data already in pipeline.
"""
import pandas as pd
import numpy as np
from typing import Optional, Dict, Any
from collections import Counter
from procesing.steps.base import BaseContextStep
class ExtractSessionFeaturesStep(BaseContextStep):
"""
Extract session-level behavioral features from interaction logs.
Input: interactions_df (user-interactions from earlier pipeline step)
Output: session_features DataFrame [sessionId, feature_1, feature_2, ...]
Features computed:
- total_interactions: count of all events
- page_views, item_views, searches, cart_adds: event type counts
- hovers: hover event counts
- unique_products_viewed: distinct product IDs
- interaction_velocity: events per minute
- session_duration_sec: time span of session
- avg_time_between_events: mean inter-event time
- product_view_depth: max views for single product (attention signal)
"""
def transform(self, interactions_df: pd.DataFrame) -> pd.DataFrame:
if interactions_df.empty:
return pd.DataFrame()
# ensure timestamp column
if 'ts' in interactions_df.columns:
interactions_df = interactions_df.copy()
interactions_df['ts'] = pd.to_datetime(interactions_df['ts'])
# group by session and compute features
session_features = []
for session_id, session_df in interactions_df.groupby('sessionId'):
features = self._extract_features_for_session(session_id, session_df)
session_features.append(features)
return pd.DataFrame(session_features)
def _extract_features_for_session(self, session_id: str, session_df: pd.DataFrame) -> Dict[str, Any]:
"""Compute features for single session."""
features = {'sessionId': session_id}
# basic counts
features['total_interactions'] = len(session_df)
event_counts = session_df['eventName'].value_counts().to_dict()
features['page_views'] = event_counts.get('page_view', 0) + event_counts.get('view_item_page', 0)
features['item_views'] = event_counts.get('view_item_page', 0)
features['searches'] = event_counts.get('search', 0)
features['cart_adds'] = event_counts.get('add_item_to_cart', 0)
# hover events
hover_events = ['hover_over_title', 'hover_over_paragraph', 'hover_over_link', 'hover_over_button']
features['hovers'] = sum(event_counts.get(ev, 0) for ev in hover_events)
# product-level signals
product_ids = session_df['productId'].dropna()
features['unique_products_viewed'] = product_ids.nunique()
if len(product_ids) > 0:
product_view_counts = Counter(product_ids)
features['product_view_depth'] = max(product_view_counts.values())
else:
features['product_view_depth'] = 0
# temporal features
if 'ts' in session_df.columns:
timestamps = session_df['ts'].sort_values()
features['session_duration_sec'] = (timestamps.max() - timestamps.min()).total_seconds()
if features['session_duration_sec'] > 0:
features['interaction_velocity'] = (features['total_interactions'] / features['session_duration_sec']) * 60
else:
features['interaction_velocity'] = 0.0
# inter-event timing
if len(timestamps) > 1:
time_diffs = timestamps.diff().dropna().dt.total_seconds()
features['avg_time_between_events'] = time_diffs.mean()
features['std_time_between_events'] = time_diffs.std()
else:
features['avg_time_between_events'] = 0.0
features['std_time_between_events'] = 0.0
else:
features['session_duration_sec'] = 0.0
features['interaction_velocity'] = 0.0
features['avg_time_between_events'] = 0.0
features['std_time_between_events'] = 0.0
# cart/conversion signals
features['cart_to_view_ratio'] = features['cart_adds'] / features['item_views'] if features['item_views'] > 0 else 0.0
return features
class FilterSessionInteractionsStep(BaseContextStep):
"""
Filter interactions DataFrame to specific session.
Input: (interactions_df, session_id)
Output: interactions_df filtered to session_id
"""
def transform(self, data: tuple) -> pd.DataFrame:
interactions_df, session_id = data
return interactions_df[interactions_df['sessionId'] == session_id].copy()