Airflow addition (#28)

* introducing airflow to run pipeline * chore: updating dag with upload to registry * introducing complete provider (non refactored and noisy) * chore: removing old shit * generic pricing baselines * feature: super simple model registry (to be updated maybe third party OS software) * chore: refactoring the providers docker config and requirements * chore: refactored and broke down components (breaking) * exporting all * local pipeline execution working * fix: fixing import structures from nonrelativistic * chore: enables cross comm pickling with fully e2e pipeline compilation * docs: what the pipeline is like now * pipelines local running and pipeline high level definition * cleaning old pipeline and vectorization * leaked but fixing, not so important * test: started with pipeline step testing * chore: cleaning up provider of prices * test: extra tests with semantic meaning checks * migrating pricers * feature: introducing pricing predictors (pricers) * chore: e2e is done with new pipeline * extra session feature extraction * feature: experimental session pricer and metrics (vibe) * chore: redefined and connected pricers (#29)
2026-07-16 01:53:37 +00:00 · 2025-11-29 17:50:16 +01:00
parent 2a0e44ab24
commit ad9423bf59
49 changed files with 3642 additions and 619 deletions
--- a/experiments/procesing/steps/init.py
+++ b/experiments/procesing/steps/init.py
@@ -0,0 +1,27 @@
+from procesing.steps.base import BaseContextStep
+from procesing.steps.fetch import FetchInteractionsStep, FetchPriceLogsStep, FetchExperimentsStep
+from procesing.steps.join import JoinExperimentsStep
+from procesing.steps.augment import CreatePriceBucketsStep, AugmentEventNamesStep
+from procesing.steps.chunk import ChunkByTimeWindowStep
+from procesing.steps.demand import ComputeDemandStep, ComputeDemandForChunksStep
+from procesing.steps.elasticity import AggregatePriceLogsStep, ComputeElasticityStep
+from procesing.steps.pricing import StateSpace, BuildStateSpaceStep, FitPricingFunctionStep, PredictPricesStep
+
+# Public API of the steps package: each name below is re-exported from the
+# step modules imported at the top of this file.
+# NOTE(review): the steps defined in steps/session.py
+# (ExtractSessionFeaturesStep, FilterSessionInteractionsStep) are not
+# re-exported here -- confirm whether that is intentional.
+__all__ = [
+    'BaseContextStep',
+    'FetchInteractionsStep',
+    'FetchPriceLogsStep',
+    'FetchExperimentsStep',
+    'JoinExperimentsStep',
+    'CreatePriceBucketsStep',
+    'AugmentEventNamesStep',
+    'ChunkByTimeWindowStep',
+    'ComputeDemandStep',
+    'ComputeDemandForChunksStep',
+    'AggregatePriceLogsStep',
+    'ComputeElasticityStep',
+    'StateSpace',
+    'BuildStateSpaceStep',
+    'FitPricingFunctionStep',
+    'PredictPricesStep',
+]
--- a/experiments/procesing/steps/augment.py
+++ b/experiments/procesing/steps/augment.py
@@ -0,0 +1,53 @@
+import numpy as np
+import pandas as pd
+from procesing.steps.base import BaseContextStep
+
+class CreatePriceBucketsStep(BaseContextStep):
+    """Label each row with a price bucket derived from ``metadata_price``."""
+
+    def transform(self, df: pd.DataFrame):
+        """
+        Add a ``price_bucket`` column to ``df`` (in place) and return it.
+
+        Quantile buckets ``PB_1 .. PB_n`` are used when ``pd.qcut`` succeeds,
+        where ``n`` is the ``n_price_buckets`` config value (default 5).
+        If ``qcut`` raises ``ValueError`` the label falls back to
+        ``P_<integer price>``. Frames that are empty, lack the price column,
+        or hold only null prices get an empty-string bucket.
+        """
+        if df.empty or 'metadata_price' not in df.columns:
+            df['price_bucket'] = ""
+            return df
+
+        bucket_count = self.context.config.get('n_price_buckets', 5)
+        prices = df['metadata_price']
+
+        if not prices.notnull().any():
+            # no usable price at all: every row gets the empty label
+            df['price_bucket'] = pd.Series([""] * len(df), index=df.index)
+            return df
+
+        def _per_price_label(value):
+            # one label per distinct (truncated) price, empty for nulls
+            return f"P_{int(value)}" if pd.notnull(value) else ""
+
+        try:
+            labelled = pd.qcut(
+                prices,
+                q=bucket_count,
+                labels=[f"PB_{i+1}" for i in range(bucket_count)],
+                duplicates='drop'
+            )
+        except ValueError:
+            # fallback for insufficient unique values
+            labelled = prices.apply(_per_price_label)
+
+        df['price_bucket'] = labelled
+        return df
+
+
+class AugmentEventNamesStep(BaseContextStep):
+    """Augment event names with product and price bucket schema"""
+
+    def transform(self, df: pd.DataFrame):
+        """
+        Append ``_<productId>@<price_bucket>`` to ``eventName``.
+
+        The suffix is stored in a new ``metadata_schema`` column and is the
+        empty string for rows where either ``productId`` or ``price_bucket``
+        is null. When either column is absent from the frame, no row gets a
+        suffix (previously this raised ``KeyError``).
+
+        The frame is modified in place and returned.
+        """
+        if df.empty:
+            return df
+
+        if 'productId' in df.columns and 'price_bucket' in df.columns:
+            # Create schema: _productId@price_bucket
+            has_both = df['productId'].notnull() & df['price_bucket'].notnull()
+            schema = "_" + df['productId'].astype(str) + "@" + df['price_bucket'].astype(str)
+            df['metadata_schema'] = np.where(has_both, schema, "")
+        else:
+            # nothing to build the schema from: leave event names untouched
+            df['metadata_schema'] = ""
+
+        df['eventName'] = df['eventName'] + df['metadata_schema']
+        return df
--- a/experiments/procesing/steps/base.py
+++ b/experiments/procesing/steps/base.py
@@ -0,0 +1,31 @@
+from abc import ABC, abstractmethod
+from sklearn.base import BaseEstimator, TransformerMixin
+from procesing.context import PipelineContext
+
+class BaseContextStep(BaseEstimator, TransformerMixin, ABC):
+    """
+    Common parent of every pipeline step.
+
+    A step holds nothing but the shared pipeline context and exposes a single
+    ``transform``; subclasses implement exactly one transformation.
+    """
+
+    def __init__(self, context: PipelineContext):
+        self.context = context
+
+    def fit(self, X=None, y=None):
+        """No-op fit: returns the step itself unchanged."""
+        return self
+
+    @abstractmethod
+    def transform(self, X):
+        """Apply the step to ``X`` using the context. Subclasses must override."""
+
+    def get_params(self, deep=True):
+        """sklearn hook: the context is the only parameter (``deep`` is ignored)."""
+        return dict(context=self.context)
+
+    def set_params(self, **params):
+        """sklearn hook: replace the context when given; other keys are ignored."""
+        try:
+            self.context = params['context']
+        except KeyError:
+            pass
+        return self
--- a/experiments/procesing/steps/chunk.py
+++ b/experiments/procesing/steps/chunk.py
@@ -0,0 +1,34 @@
+import pandas as pd
+from procesing.steps.base import BaseContextStep
+
+class ChunkByTimeWindowStep(BaseContextStep):
+    """
+    Split a dataframe into consecutive time windows.
+
+    Each returned dict carries ``window_start``, ``window_end``,
+    ``window_idx`` and the rows of that window under ``data``.
+    """
+
+    def transform(self, df: pd.DataFrame):
+        """Return one chunk dict per non-empty window, ordered by window start."""
+        if df.empty:
+            return []
+
+        frame = df.copy()  # the caller's frame is left untouched
+        ts_col = self.context.config.get('ts_col', 'ts')
+        window_size = self.context.window_size
+
+        # timestamps must be datetimes before they can be floored
+        already_datetime = pd.api.types.is_datetime64_any_dtype(frame[ts_col])
+        if not already_datetime:
+            frame[ts_col] = pd.to_datetime(frame[ts_col])
+
+        frame = frame.sort_values(ts_col)
+        frame['_window'] = frame[ts_col].dt.floor(window_size)
+
+        return [
+            {
+                'window_start': start,
+                'window_end': start + pd.Timedelta(window_size),
+                'window_idx': position,
+                'data': rows.drop(columns=['_window'])
+            }
+            for position, (start, rows) in enumerate(frame.groupby('_window'))
+        ]
--- a/experiments/procesing/steps/demand.py
+++ b/experiments/procesing/steps/demand.py
@@ -0,0 +1,61 @@
+import pandas as pd
+from procesing.steps.base import BaseContextStep
+
+class ComputeDemandStep(BaseContextStep):
+    """
+    Compute demand vector for a single time window or dataframe.
+    Input: single chunk dict OR raw dataframe
+    Output: demand dataframe with [productId, demand_score]
+
+    ``demand_score`` is the number of interaction rows per product; every
+    product listed in ``context.products['id']`` appears exactly once, with 0
+    when it has no interactions.
+    """
+
+    def transform(self, chunk):
+        """
+        Args:
+            chunk: either a chunk dict (rows under ``'data'``, any other keys
+                are treated as window metadata) or a raw interactions frame.
+        Returns:
+            For a chunk dict with metadata: that metadata plus the demand
+            frame under ``'demand_vector'``. Otherwise the demand frame.
+        """
+        # handle both chunk dict and raw dataframe
+        if isinstance(chunk, dict):
+            interactions = chunk['data']
+            window_meta = {k: v for k, v in chunk.items() if k != 'data'}
+        else:
+            interactions = chunk
+            window_meta = {}
+
+        unique_products = self.context.products['id'].unique()
+
+        # apply filters if configured
+        session_filter = self.context.config.get('session_filter')
+        experiment_filter = self.context.config.get('experiment_filter')
+
+        if session_filter and 'sessionId' in interactions.columns:
+            interactions = interactions[interactions['sessionId'] == session_filter]
+        if experiment_filter and 'experimentId' in interactions.columns:
+            interactions = interactions[interactions['experimentId'] == experiment_filter]
+
+        if 'productId' in interactions.columns:
+            # value_counts skips null product ids
+            counts = interactions['productId'].value_counts()
+        else:
+            # e.g. a column-less empty frame: no product received any demand
+            counts = pd.Series(dtype='int64')
+
+        demand_df = pd.DataFrame({
+            'productId': unique_products,
+            'demand_score': counts.reindex(unique_products, fill_value=0).to_numpy()
+        })
+
+        # attach window metadata if present
+        if window_meta:
+            return {**window_meta, 'demand_vector': demand_df}
+        return demand_df
+
+
+class ComputeDemandForChunksStep(BaseContextStep):
+    """Run ComputeDemandStep over every chunk of a list."""
+
+    def transform(self, chunks: list):
+        """Return one ComputeDemandStep result per chunk, in input order."""
+        if not chunks:
+            return []
+
+        # one step instance, sharing this step's context, serves all chunks
+        demand_for = ComputeDemandStep(self.context).transform
+        return list(map(demand_for, chunks))
--- a/experiments/procesing/steps/elasticity.py
+++ b/experiments/procesing/steps/elasticity.py
@@ -0,0 +1,253 @@
+import numpy as np
+import pandas as pd
+from typing import Dict, List
+from procesing.steps.base import BaseContextStep
+
+class AggregatePriceLogsStep(BaseContextStep):
+    """
+    Aggregate price logs into time windows using VECTORIZED operations.
+    Input: price_logs_df
+    Output: list of price chunks with [productId, price]
+
+    Each chunk is a dict with ``window_start``, ``window_end`` and
+    ``price_vector``. The price of a product in a window is the mean of its
+    logged prices in that window, carried forward from the product's previous
+    window when the window has no log entry.
+    """
+
+    def transform(self, price_logs_df: pd.DataFrame):
+        if price_logs_df.empty:
+            return []
+
+        # work on a copy so the caller's frame is not modified
+        df = price_logs_df.copy()
+        ts_col = self.context.config.get('ts_col', 'ts')
+        window_size = self.context.window_size
+
+        # ensure datetime
+        if not pd.api.types.is_datetime64_any_dtype(df[ts_col]):
+            df[ts_col] = pd.to_datetime(df[ts_col])
+
+        # time-ordered rows: the back-fill below relies on iloc[-1] being the
+        # most recent log entry
+        df = df.sort_values([ts_col, 'productId'])
+        products = self.context.products
+        unique_products = products['id'].unique()
+
+        # VECTORIZED: group by product, resample by time window, compute mean
+        df_indexed = df.set_index(ts_col)
+
+        windowed = (
+            df_indexed
+            .groupby('productId')['price']
+            .resample(window_size)
+            .mean()
+            .reset_index()
+        )
+
+        # forward fill missing windows (carry last known price)
+        windowed = windowed.sort_values([ts_col, 'productId'])
+        windowed['price'] = windowed.groupby('productId')['price'].ffill()
+        windowed = windowed.dropna(subset=['price'])
+
+        # group into chunks by window
+        chunks = []
+        for window_start, group in windowed.groupby(ts_col):
+            price_vector = group[['productId', 'price']].copy()
+
+            # fill missing products with last known price before this window
+            # (uses the raw log entry, not a window mean; products with no
+            # earlier entry stay absent from this window)
+            # NOTE(review): this lookup scans df_indexed once per missing
+            # product per window, so this part is a Python loop rather than a
+            # vectorized operation.
+            missing_products = set(unique_products) - set(price_vector['productId'])
+            if missing_products:
+                for pid in missing_products:
+                    last_price = df_indexed[
+                        (df_indexed['productId'] == pid) &
+                        (df_indexed.index < window_start)
+                    ]['price']
+
+                    if not last_price.empty:
+                        price_vector = pd.concat([
+                            price_vector,
+                            pd.DataFrame({'productId': [pid], 'price': [last_price.iloc[-1]]})
+                        ], ignore_index=True)
+
+            if not price_vector.empty:
+                chunks.append({
+                    'window_start': window_start,
+                    'window_end': window_start + pd.Timedelta(window_size),
+                    'price_vector': price_vector
+                })
+
+        return chunks
+
+
+class ComputeElasticityStep(BaseContextStep):
+    """
+    Compute price elasticity from demand and price chunks.
+    Input: (demand_chunks, price_chunks)
+    Output: elasticity_df [productId, elasticity, std_error, n_obs]
+
+    Every product in ``context.products['id']`` appears in the output.
+    Products with fewer than ``min_observations`` aligned windows (config,
+    default 2), or with none at all, are reported with zero elasticity.
+    """
+
+    def transform(self, chunk_tuple: tuple):
+        """
+        Args:
+            chunk_tuple: ``(demand_chunks, price_chunks)``; both are lists of
+                dicts keyed by ``window_start``.
+        Returns:
+            DataFrame with columns productId, elasticity, std_error, n_obs.
+        Raises:
+            ValueError: if the configured ``elasticity_method`` is neither
+                ``'point'`` nor ``'arc'`` and at least one product has enough
+                observations to be estimated.
+        """
+        demand_chunks, price_chunks = chunk_tuple
+
+        method = self.context.config.get('elasticity_method', 'point')
+        min_obs = self.context.config.get('min_observations', 2)
+        all_product_ids = self.context.products['id'].unique()
+
+        # align chunks by window_start
+        aligned = self._align_chunks(demand_chunks, price_chunks)
+        if not aligned:
+            return self._zero_elasticity_frame(all_product_ids)
+
+        # build time series per product
+        product_series = self._build_timeseries(aligned)
+
+        # compute elasticity per product
+        elasticities = []
+        for pid, series in product_series.items():
+            if len(series) < min_obs:
+                estimate = {'value': 0.0, 'std_error': 0.0}
+            else:
+                estimate = self._compute_elasticity(series, method)
+            elasticities.append({
+                'productId': pid,
+                'elasticity': estimate['value'],
+                'std_error': estimate.get('std_error', 0.0),
+                'n_obs': len(series)
+            })
+
+        if not elasticities:
+            # aligned windows exist but demand and prices share no product;
+            # building a frame from an empty list would have no productId column
+            return self._zero_elasticity_frame(all_product_ids)
+
+        result_df = pd.DataFrame(elasticities)
+
+        # fill missing products with zero elasticity
+        observed_pids = set(result_df['productId'])
+        missing_pids = [p for p in all_product_ids if p not in observed_pids]
+        if missing_pids:
+            result_df = pd.concat(
+                [result_df, self._zero_elasticity_frame(missing_pids)],
+                ignore_index=True
+            )
+
+        return result_df
+
+    @staticmethod
+    def _zero_elasticity_frame(product_ids):
+        """Result rows for products without a usable estimate (all zeros)."""
+        return pd.DataFrame({
+            'productId': product_ids,
+            'elasticity': 0.0,
+            'std_error': 0.0,
+            'n_obs': 0
+        })
+
+    def _align_chunks(self, demand_chunks: List[Dict], price_chunks: List[Dict]):
+        """Align demand and price chunks by window_start.
+
+        Demand windows without a matching price window are dropped.
+        """
+        price_lookup = {c['window_start']: c for c in price_chunks}
+        return [
+            {
+                'window_start': dc['window_start'],
+                'window_end': dc['window_end'],
+                'demand': dc['demand_vector'],
+                'prices': price_lookup[dc['window_start']]['price_vector']
+            }
+            for dc in demand_chunks
+            if dc['window_start'] in price_lookup
+        ]
+
+    def _build_timeseries(self, aligned: List[Dict]):
+        """Build time series [timestamp, price, quantity] per product"""
+        series_by_product = {}
+
+        for chunk in aligned:
+            merged = chunk['demand'].merge(chunk['prices'], on='productId', how='inner')
+
+            # iterate the columns directly: iterrows() would upcast each row to
+            # one common dtype (e.g. turning integer product ids into floats)
+            rows = zip(merged['productId'], merged['price'], merged['demand_score'])
+            for pid, price, quantity in rows:
+                series_by_product.setdefault(pid, []).append({
+                    'timestamp': chunk['window_start'],
+                    'price': price,
+                    'quantity': quantity
+                })
+
+        return series_by_product
+
+    def _compute_elasticity(self, series: List[Dict], method: str):
+        """Compute point or arc elasticity.
+
+        Observations with a non-positive price or quantity are discarded; with
+        fewer than two remaining observations the estimate is zero.
+        """
+        prices = np.array([s['price'] for s in series])
+        quantities = np.array([s['quantity'] for s in series])
+
+        # filter out zero/negative values
+        valid = (prices > 0) & (quantities > 0)
+        if valid.sum() < 2:
+            return {'value': 0.0, 'std_error': 0.0}
+
+        prices = prices[valid]
+        quantities = quantities[valid]
+
+        if method == 'point':
+            return self._point_elasticity(prices, quantities)
+        elif method == 'arc':
+            return self._arc_elasticity(prices, quantities)
+        else:
+            raise ValueError(f"Unknown elasticity method: {method}")
+
+    def _point_elasticity(self, prices: np.ndarray, quantities: np.ndarray):
+        """Point elasticity via log-log regression: log(Q) = a + b*log(P), elasticity = b"""
+        n = len(prices)
+        if n < 2:
+            return {'value': 0.0, 'std_error': 0.0}
+
+        log_p = np.log(prices)
+        log_q = np.log(quantities)
+
+        # centred sums of squares / cross products; using the same
+        # normalisation for both keeps the slope an exact least-squares fit
+        # (np.cov divides by n-1 while np.var divides by n by default)
+        dp = log_p - log_p.mean()
+        dq = log_q - log_q.mean()
+        sxx = (dp ** 2).sum()
+        if sxx == 0:
+            # price never changed: slope is undefined
+            return {'value': 0.0, 'std_error': 0.0}
+
+        b = (dp * dq).sum() / sxx
+
+        # std error estimate (needs at least one residual degree of freedom)
+        if n > 2:
+            residuals = dq - b * dp
+            mse = (residuals ** 2).sum() / (n - 2)
+            se_b = np.sqrt(mse / sxx)
+        else:
+            se_b = 0.0
+
+        return {'value': b, 'std_error': se_b}
+
+    def _arc_elasticity(self, prices: np.ndarray, quantities: np.ndarray):
+        """Arc elasticity: average period-over-period elasticity"""
+        elasticities = []
+
+        consecutive = zip(prices[:-1], prices[1:], quantities[:-1], quantities[1:])
+        for p1, p2, q1, q2 in consecutive:
+            p_avg = (p1 + p2) / 2
+            q_avg = (q1 + q2) / 2
+            delta_p = p2 - p1
+
+            # skip periods where the midpoint formula is undefined
+            if p_avg == 0 or q_avg == 0 or delta_p == 0:
+                continue
+
+            elasticities.append(((q2 - q1) / q_avg) / (delta_p / p_avg))
+
+        if not elasticities:
+            return {'value': 0.0, 'std_error': 0.0}
+
+        return {
+            'value': np.mean(elasticities),
+            'std_error': np.std(elasticities) / np.sqrt(len(elasticities))
+        }
--- a/experiments/procesing/steps/fetch.py
+++ b/experiments/procesing/steps/fetch.py
@@ -0,0 +1,46 @@
+import pandas as pd
+from procesing.steps.base import BaseContextStep
+
+class FetchInteractionsStep(BaseContextStep):
+    """Fetch raw interaction data from Kafka topic"""
+
+    def transform(self, X=None):
+        """
+        Read the ``user-interactions`` topic through the context provider.
+
+        A ``metadata`` column, when present, is flattened into
+        ``metadata_<key>`` columns (nested keys joined with ``.``). Rows
+        without an ``eventName`` are dropped, and ``metadata_dateIndex`` is
+        copied to a nullable-integer ``dateIndex`` column when available.
+        ``X`` is ignored.
+        """
+        df = self.context.provider.fetch_kafka_topic('user-interactions')
+
+        if df.empty:
+            return df
+
+        # Explode metadata JSON column
+        if 'metadata' in df.columns:
+            raw_metadata = df.pop('metadata')
+            # rows whose metadata is not a dict (e.g. missing) contribute no keys
+            records = [m if isinstance(m, dict) else {} for m in raw_metadata]
+            flattened = pd.json_normalize(records, sep='.').add_prefix('metadata_')
+            # json_normalize numbers rows 0..n-1; realign to df's index so the
+            # join below pairs every row with its own metadata
+            flattened.index = raw_metadata.index
+            df = df.join(flattened)
+
+        df = df.dropna(subset=['eventName'])
+
+        # Remap dateIndex if present
+        if 'metadata_dateIndex' in df.columns:
+            df['dateIndex'] = df['metadata_dateIndex'].astype('Int64')
+
+        return df
+
+
+class FetchPriceLogsStep(BaseContextStep):
+    """Read the ``price-logs`` Kafka topic through the context provider."""
+
+    def transform(self, X=None):
+        """Return whatever the provider yields for ``price-logs``; ``X`` is ignored."""
+        provider = self.context.provider
+        return provider.fetch_kafka_topic('price-logs')
+
+
+class FetchExperimentsStep(BaseContextStep):
+    """Look up experiment metadata for the experiments seen in interactions."""
+
+    def transform(self, interactions_df: pd.DataFrame):
+        """
+        Fetch the experiments referenced by ``interactions_df['experimentId']``.
+
+        An empty frame is returned when there are no interactions, no
+        ``experimentId`` column, or only null experiment ids.
+        """
+        if not interactions_df.empty and 'experimentId' in interactions_df.columns:
+            experiment_ids = interactions_df['experimentId'].dropna().unique().tolist()
+            if experiment_ids:
+                return self.context.provider.fetch_experiments(experiment_ids)
+
+        return pd.DataFrame()
--- a/experiments/procesing/steps/join.py
+++ b/experiments/procesing/steps/join.py
@@ -0,0 +1,34 @@
+import pandas as pd
+from procesing.steps.base import BaseContextStep
+
+class JoinExperimentsStep(BaseContextStep):
+    """Attach experiment metadata to each interaction row."""
+
+    def transform(self, data: tuple):
+        """
+        Args:
+            data: (interactions_df, experiments_df)
+        Returns:
+            interactions left-joined with the experiments on ``experimentId``;
+            the interactions unchanged when there are no experiments
+        """
+        interactions_df, experiments_df = data
+
+        if experiments_df.empty:
+            return interactions_df
+
+        # Flatten nested task field if present
+        if 'task' in experiments_df.columns:
+            has_task = experiments_df['task'].notnull()
+            if has_task.any():
+                flattened = pd.json_normalize(experiments_df['task'].dropna())
+                # put the flattened rows back on the rows they came from
+                flattened.index = experiments_df[has_task].index
+                without_task = experiments_df.drop('task', axis=1)
+                experiments_df = without_task.join(flattened, rsuffix='_task')
+
+        # Rename for clarity
+        clearer_names = {
+            'id': 'experimentId',
+            'subject_name': 'exp_subject',
+            'xp_human_only': 'exp_human_only',
+            'xp_market_mode': 'exp_market_mode',
+            'xp_task_id': 'exp_task_id'
+        }
+        experiments_df = experiments_df.rename(columns=clearer_names)
+
+        return interactions_df.merge(experiments_df, on='experimentId', how='left')
--- a/experiments/procesing/steps/pricing.py
+++ b/experiments/procesing/steps/pricing.py
@@ -0,0 +1,149 @@
+import numpy as np
+import pandas as pd
+from typing import Optional, List, Dict, Any
+from dataclasses import dataclass, field
+from procesing.steps.base import BaseContextStep
+from procesing.pricers import ElasticityBasedPricer
+
+@dataclass
+class StateSpace:
+    """
+    State representation for pricing functions.
+
+    Components:
+        Q_t: demand ∈ R^n (current demand signal per product)
+        P_t: prices ∈ R^n (current/base prices)
+        S_t: session_features (behavioral signals, interaction data)
+        H_t: history = {Q_{t-k}, P_{t-k}, S_{t-k}} for k in [1, history_length]
+
+    Additionally stores:
+        - product_ids: product identifiers (n,)
+        - elasticity: price elasticity per product (n,)
+        - metadata: arbitrary context (experiment_id, timestamp, etc.)
+
+    Raises:
+        AssertionError: on construction, when prices, elasticity or
+            product_ids do not have the same length as demand.
+    """
+    demand: np.ndarray  # Q_t ∈ R^n
+    prices: np.ndarray  # P_t ∈ R^n
+    session_features: pd.DataFrame = field(default_factory=pd.DataFrame)  # S_t
+
+    # augmented state components
+    product_ids: Optional[np.ndarray] = None
+    elasticity: Optional[np.ndarray] = None
+
+    # historical trajectory H_t = {(Q_{t-k}, P_{t-k}, S_{t-k})}
+    history: List[Dict[str, Any]] = field(default_factory=list)
+
+    # metadata for context
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+    def __post_init__(self):
+        """Validate dimensions."""
+        n = len(self.demand)
+        assert len(self.prices) == n, "demand and prices must have same dimension"
+        if self.elasticity is not None:
+            assert len(self.elasticity) == n, "elasticity must match dimension"
+        if self.product_ids is not None:
+            assert len(self.product_ids) == n, "product_ids must match dimension"
+
+    @property
+    def n_products(self) -> int:
+        """Number of products in state space."""
+        return len(self.demand)
+
+    def add_history(self, q: np.ndarray, p: np.ndarray, s: pd.DataFrame, max_length: int = 10):
+        """Append historical state to trajectory H_t.
+
+        After appending, the oldest entries are discarded until at most
+        ``max_length`` remain (also when ``max_length`` is smaller than on
+        earlier calls).
+        """
+        self.history.append({'demand': q, 'prices': p, 'session_features': s})
+        overflow = len(self.history) - max_length
+        if overflow > 0:
+            # trim in place so existing references to the list stay valid
+            del self.history[:overflow]
+
+    def get_history_window(self, k: int = 5) -> List[Dict[str, Any]]:
+        """Retrieve last k historical states (oldest first).
+
+        Returns a new list with at most ``k`` entries; empty when ``k <= 0``.
+        """
+        if k <= 0:
+            # history[-0:] would be the whole history, not "the last zero"
+            return []
+        return self.history[-k:]
+
+
+class BuildStateSpaceStep(BaseContextStep):
+    """
+    Build state space from elasticity, demand, and price data.
+
+    Input: elasticity_df [productId, elasticity, ...], optional demand_df
+    Output: StateSpace instance with Q_t, P_t, elasticity, product_ids
+    """
+
+    def transform(self, elasticity_df: pd.DataFrame, demand_df: Optional[pd.DataFrame] = None):
+        """
+        Args:
+            elasticity_df: frame with ``productId`` and ``elasticity`` columns.
+            demand_df: optional frame with ``productId`` and a demand column
+                named ``demand`` or ``demand_score`` (the column emitted by
+                ComputeDemandStep); ``demand`` wins when both exist.
+        Returns:
+            StateSpace with one entry per row of ``context.products``. Missing
+            elasticities, base prices and demands are 0; without a usable
+            demand frame every product gets a demand of 10.0.
+        """
+        # extract base prices from product metadata
+        products_with_prices = self.context.products.copy()
+        if 'metadata' in products_with_prices.columns:
+            products_with_prices['base_price'] = products_with_prices['metadata'].apply(
+                lambda m: m.get('base_price', 0) if isinstance(m, dict) else 0
+            )
+        else:
+            products_with_prices['base_price'] = 0
+
+        # merge with elasticity
+        merged = products_with_prices[['id', 'base_price']].rename(
+            columns={'id': 'productId'}
+        ).merge(
+            elasticity_df[['productId', 'elasticity']],
+            on='productId',
+            how='left'
+        ).fillna({'elasticity': 0.0, 'base_price': 0.0})
+
+        # locate the demand column; accepting 'demand_score' keeps real demand
+        # from being silently replaced by the default below
+        demand_col = None
+        if demand_df is not None:
+            demand_col = next(
+                (name for name in ('demand', 'demand_score') if name in demand_df.columns),
+                None
+            )
+
+        # merge with demand if provided, else use default
+        if demand_col is not None:
+            demand_part = demand_df[['productId', demand_col]].rename(
+                columns={demand_col: 'demand'}
+            )
+            merged = merged.merge(
+                demand_part,
+                on='productId',
+                how='left'
+            ).fillna({'demand': 0.0})
+            demand_vector = merged['demand'].values
+        else:
+            # default: uniform demand
+            demand_vector = np.ones(len(merged)) * 10.0
+
+        return StateSpace(
+            demand=demand_vector,
+            prices=merged['base_price'].values,
+            session_features=pd.DataFrame(),
+            product_ids=merged['productId'].values,
+            elasticity=merged['elasticity'].values,
+            metadata={'timestamp': pd.Timestamp.now().isoformat()}
+        )
+
+
+class FitPricingFunctionStep(BaseContextStep):
+    """
+    Instantiate and fit the configured pricing function.
+    Input: elasticity_df
+    Output: the fitted pricing function instance
+    """
+
+    def transform(self, elasticity_df: pd.DataFrame):
+        """
+        Build the pricer from config and fit it on ``elasticity_df``.
+
+        ``pricing_function_class`` (default ElasticityBasedPricer) is called
+        with ``pricing_function_params`` (default: no arguments) as keywords.
+        """
+        config = self.context.config
+        pricer_cls = config.get('pricing_function_class', ElasticityBasedPricer)
+        pricer_kwargs = config.get('pricing_function_params', {})
+
+        fitted = pricer_cls(**pricer_kwargs)
+        fitted.fit(elasticity_df)
+        return fitted
+
+
+class PredictPricesStep(BaseContextStep):
+    """
+    Run a fitted pricing function on a state space.
+    Input: (pricer, state_space)
+    Output: prices_df [productId, predicted_price]
+    """
+
+    def transform(self, data: tuple):
+        """
+        Pair ``pricer.predict(state_space)`` with the ids in ``context.products``.
+
+        NOTE(review): the ids come from ``context.products['id']``, not from
+        the state space; this presumably assumes the pricer returns one price
+        per product in that same order -- confirm against the pricers.
+        """
+        pricer, state_space = data
+
+        ids = self.context.products['id'].values
+        predictions = pricer.predict(state_space)
+
+        columns = {'productId': ids, 'predicted_price': predictions}
+        return pd.DataFrame(columns)
--- a/experiments/procesing/steps/session.py
+++ b/experiments/procesing/steps/session.py
@@ -0,0 +1,114 @@
+"""
+Session feature extraction for S_t component of state space.
+Computes behavioral signals from interaction data already in pipeline.
+"""
+import pandas as pd
+import numpy as np
+from typing import Optional, Dict, Any
+from collections import Counter
+from procesing.steps.base import BaseContextStep
+
+
+class ExtractSessionFeaturesStep(BaseContextStep):
+    """
+    Extract session-level behavioral features from interaction logs.
+
+    Input: interactions_df (user-interactions from earlier pipeline step)
+    Output: session_features DataFrame [sessionId, feature_1, feature_2, ...]
+
+    Features computed:
+        - total_interactions: count of all events
+        - page_views, item_views, searches, cart_adds: event type counts
+        - hovers: hover event counts
+        - unique_products_viewed: distinct product IDs
+        - product_view_depth: max views for single product (attention signal)
+        - session_duration_sec: time span of session
+        - interaction_velocity: events per minute
+        - avg_time_between_events: mean inter-event time
+        - std_time_between_events: spread of inter-event time
+        - cart_to_view_ratio: cart_adds / item_views (0 without item views)
+
+    Event counts match ``eventName`` exactly.
+    NOTE(review): AugmentEventNamesStep appends a suffix to ``eventName``, so
+    this step presumably has to run on un-augmented names -- confirm.
+    """
+
+    def transform(self, interactions_df: pd.DataFrame) -> pd.DataFrame:
+        """Return one feature row per ``sessionId`` (empty frame for empty input)."""
+        if interactions_df.empty:
+            return pd.DataFrame()
+
+        # ensure timestamp column
+        if 'ts' in interactions_df.columns:
+            interactions_df = interactions_df.copy()
+            interactions_df['ts'] = pd.to_datetime(interactions_df['ts'])
+
+        # group by session and compute features
+        return pd.DataFrame([
+            self._extract_features_for_session(session_id, session_df)
+            for session_id, session_df in interactions_df.groupby('sessionId')
+        ])
+
+    def _extract_features_for_session(self, session_id: str, session_df: pd.DataFrame) -> Dict[str, Any]:
+        """Compute features for single session."""
+        features = {'sessionId': session_id}
+
+        # basic counts
+        features['total_interactions'] = len(session_df)
+
+        event_counts = session_df['eventName'].value_counts().to_dict()
+        features['page_views'] = event_counts.get('page_view', 0) + event_counts.get('view_item_page', 0)
+        features['item_views'] = event_counts.get('view_item_page', 0)
+        features['searches'] = event_counts.get('search', 0)
+        features['cart_adds'] = event_counts.get('add_item_to_cart', 0)
+
+        # hover events
+        hover_events = ['hover_over_title', 'hover_over_paragraph', 'hover_over_link', 'hover_over_button']
+        features['hovers'] = sum(event_counts.get(ev, 0) for ev in hover_events)
+
+        # product-level signals
+        product_ids = session_df['productId'].dropna()
+        features['unique_products_viewed'] = product_ids.nunique()
+
+        if len(product_ids) > 0:
+            features['product_view_depth'] = max(Counter(product_ids).values())
+        else:
+            features['product_view_depth'] = 0
+
+        # temporal features
+        if 'ts' in session_df.columns:
+            timestamps = session_df['ts'].sort_values()
+            features['session_duration_sec'] = (timestamps.max() - timestamps.min()).total_seconds()
+
+            if features['session_duration_sec'] > 0:
+                features['interaction_velocity'] = (features['total_interactions'] / features['session_duration_sec']) * 60
+            else:
+                features['interaction_velocity'] = 0.0
+
+            # inter-event timing: a mean needs at least one gap and a sample
+            # std at least two, otherwise pandas yields NaN; report 0.0
+            # instead, like the other "not measurable" cases
+            gaps = timestamps.diff().dropna().dt.total_seconds()
+            features['avg_time_between_events'] = gaps.mean() if len(gaps) > 0 else 0.0
+            features['std_time_between_events'] = gaps.std() if len(gaps) > 1 else 0.0
+        else:
+            features['session_duration_sec'] = 0.0
+            features['interaction_velocity'] = 0.0
+            features['avg_time_between_events'] = 0.0
+            features['std_time_between_events'] = 0.0
+
+        # cart/conversion signals
+        features['cart_to_view_ratio'] = features['cart_adds'] / features['item_views'] if features['item_views'] > 0 else 0.0
+
+        return features
+
+
+class FilterSessionInteractionsStep(BaseContextStep):
+    """
+    Keep only the interactions of one session.
+
+    Input: (interactions_df, session_id)
+    Output: copy of the rows whose ``sessionId`` equals session_id
+    """
+
+    def transform(self, data: tuple) -> pd.DataFrame:
+        interactions_df, session_id = data
+        in_session = interactions_df['sessionId'] == session_id
+        # copy so later edits do not touch the source frame
+        return interactions_df[in_session].copy()