Improving interface after experiment01 (#30)

* fix: fixes of backwords * fixing hotel information with image placeholders * chore: clean up product display in hotel and cleaner interfacing * adding loader with historical data loading * feature: cleaning up pipeline * chore: simple surge pricer * created new pricing pipeline * adding a checkout page to both sites * fix: fixing stale package * test: we won't be using elasticity anymore so it's okay * chore: cleaning elasticity references * chore: store sting * feature: e2e intro pipeline surge pricing * fix: CVE vulnerability patching
2026-07-16 01:53:37 +00:00 · 2025-12-06 17:47:14 +01:00
parent 59d4fb7891
commit 8751583764
27 changed files with 709 additions and 1096 deletions
--- a/experiments/procesing/steps/init.py
+++ b/experiments/procesing/steps/init.py
@@ -1,13 +1,12 @@
 from procesing.steps.base import BaseContextStep
 from procesing.steps.fetch import FetchInteractionsStep, FetchPriceLogsStep, FetchExperimentsStep
-from procesing.steps.join import JoinExperimentsStep
-from procesing.steps.augment import CreatePriceBucketsStep, AugmentEventNamesStep
+from procesing.steps.join import JoinExperimentsStep, JoinProductFeaturesStep
+from procesing.steps.augment import CreatePriceBucketsStep, AugmentEventNamesStep, AugmentInteractionsStep
 from procesing.steps.chunk import ChunkByTimeWindowStep
 from procesing.steps.demand import ComputeDemandStep, ComputeDemandForChunksStep
-from procesing.steps.elasticity import AggregatePriceLogsStep, ComputeElasticityStep
+from procesing.steps.elasticity import AggregatePriceLogsStep
 from procesing.steps.pricing import FitPricingFunctionStep, PredictPricesStep
 from procesing.steps.session import ExtractSessionFeaturesStep, _extract_features_for_session
-# StateSpace, BuildStateSpaceStep,

 __all__ = [
    'BaseContextStep',
@@ -15,13 +14,14 @@ __all__ = [
    'FetchPriceLogsStep',
    'FetchExperimentsStep',
    'JoinExperimentsStep',
+    'JoinProductFeaturesStep',
    'CreatePriceBucketsStep',
    'AugmentEventNamesStep',
+    'AugmentInteractionsStep',
    'ChunkByTimeWindowStep',
    'ComputeDemandStep',
    'ComputeDemandForChunksStep',
    'AggregatePriceLogsStep',
-    'ComputeElasticityStep',
    'FitPricingFunctionStep',
    'PredictPricesStep',
    'ExtractSessionFeaturesStep',
--- a/experiments/procesing/steps/augment.py
+++ b/experiments/procesing/steps/augment.py
@@ -2,6 +2,93 @@ import numpy as np
 import pandas as pd
 from procesing.steps.base import BaseContextStep

+
+class AugmentInteractionsStep(BaseContextStep):
+    """
+    Consolidated step: create price buckets, augment event names, join experiments.
+    Input: (interactions_df, price_logs_df)
+    Output: enriched interactions_df
+    """
+
+    def transform(self, data: tuple):
+        interactions_df, price_logs_df = data
+
+        if interactions_df.empty:
+            return interactions_df
+
+        # Step 1: Create price buckets
+        interactions_df = self._create_price_buckets(interactions_df)
+
+        # Step 2: Augment event names
+        interactions_df = self._augment_event_names(interactions_df)
+
+        # Step 3: Join experiments (optional)
+        if 'experimentId' in interactions_df.columns:
+            interactions_df = self._join_experiments(interactions_df)
+
+        return interactions_df
+
+    def _create_price_buckets(self, df: pd.DataFrame):
+        """Create price bucket labels from price data"""
+        if 'metadata_price' not in df.columns:
+            df['price_bucket'] = ""
+            return df
+
+        n_buckets = self.context.config.get('n_price_buckets', 5)
+
+        if df['metadata_price'].notnull().sum() > 0:
+            try:
+                price_buckets = pd.qcut(
+                    df['metadata_price'],
+                    q=n_buckets,
+                    labels=[f"PB_{i+1}" for i in range(n_buckets)],
+                    duplicates='drop'
+                )
+            except ValueError:
+                # fallback for insufficient unique values
+                price_buckets = df['metadata_price'].apply(
+                    lambda x: f"P_{int(x)}" if pd.notnull(x) else ""
+                )
+        else:
+            price_buckets = pd.Series([""] * len(df), index=df.index)
+
+        df['price_bucket'] = price_buckets
+        return df
+
+    def _augment_event_names(self, df: pd.DataFrame):
+        """Augment event names with product and price bucket schema"""
+        # Create schema: _productId@price_bucket
+        has_product = df.get('productId', pd.Series()).notnull()
+        has_bucket = df.get('price_bucket', pd.Series()).notnull()
+
+        df['metadata_schema'] = np.where(
+            has_product & has_bucket,
+            "_" + df['productId'].astype(str) + "@" + df['price_bucket'].astype(str),
+            ""
+        )
+
+        df['eventName'] = df['eventName'] + df['metadata_schema']
+        return df
+
+    def _join_experiments(self, df: pd.DataFrame):
+        """Join experiment metadata if experimentId present"""
+        exp_ids = df['experimentId'].dropna().unique().tolist()
+        if not exp_ids:
+            return df
+
+        experiments_df = self.context.provider.fetch_experiments(exp_ids)
+        if experiments_df.empty:
+            return df
+
+        return df.merge(
+            experiments_df,
+            left_on='experimentId',
+            right_on='id',
+            how='left',
+            suffixes=('', '_exp')
+        )
+
+
 class CreatePriceBucketsStep(BaseContextStep):
    """Create price bucket labels from price data"""

--- a/experiments/procesing/steps/elasticity.py
+++ b/experiments/procesing/steps/elasticity.py
@@ -16,7 +16,7 @@ class AggregatePriceLogsStep(BaseContextStep):

        df = price_logs_df.copy()
        ts_col = self.context.config.get('ts_col', 'ts')
-        window_size = self.context.window_size
+        # NOTE: window_size = self.context.window_size is disabled here; this step is not using chunks anymore

        # ensure datetime
        if not pd.api.types.is_datetime64_any_dtype(df[ts_col]):
@@ -24,230 +24,19 @@ class AggregatePriceLogsStep(BaseContextStep):

        df = df.sort_values([ts_col, 'productId'])
        products = self.context.products
-        unique_products = products['id'].unique()
-
-        # VECTORIZED: group by product, resample by time window, compute mean
-        df_indexed = df.set_index(ts_col)
-
-        windowed = (
-            df_indexed
-            .groupby('productId')['price']
-            .resample(window_size)
-            .mean()
-            .reset_index()
+        # derive base_price from each product's metadata when it is a dict; default to 0 if it is not a dict or has no 'base_price'
+        products['base_price'] = products.apply(
+            lambda row: row['metadata'].get('base_price', 0) if isinstance(row['metadata'], dict) else 0,
+            axis=1
        )

-        # forward fill missing windows (carry last known price)
-        windowed = windowed.sort_values([ts_col, 'productId'])
-        windowed['price'] = windowed.groupby('productId')['price'].ffill()
-        windowed = windowed.dropna(subset=['price'])
+        unique_products = products['id'].unique()

-        # group into chunks by window
-        chunks = []
-        for window_start, group in windowed.groupby(ts_col):
-            price_vector = group[['productId', 'price']].copy()
-
-            # fill missing products with last known price before this window
-            missing_products = set(unique_products) - set(price_vector['productId'])
-            if missing_products:
-                for pid in missing_products:
-                    last_price = df_indexed[
-                        (df_indexed['productId'] == pid) &
-                        (df_indexed.index < window_start)
-                    ]['price']
-
-                    if not last_price.empty:
-                        price_vector = pd.concat([
-                            price_vector,
-                            pd.DataFrame({'productId': [pid], 'price': [last_price.iloc[-1]]})
-                        ], ignore_index=True)
-
-            if not price_vector.empty:
-                chunks.append({
-                    'window_start': window_start,
-                    'window_end': window_start + pd.Timedelta(window_size),
-                    'price_vector': price_vector
-                })
-
-        return chunks
-
-
-class ComputeElasticityStep(BaseContextStep):
-    """
-    Compute price elasticity from demand and price chunks.
-    Input: (demand_chunks, price_chunks)
-    Output: elasticity_df [productId, elasticity, std_error, n_obs]
-    """
-
-    def transform(self, chunk_tuple: tuple):
-        demand_chunks, price_chunks = chunk_tuple
-
-        method = self.context.config.get('elasticity_method', 'point')
-        min_obs = self.context.config.get('min_observations', 2)
-
-        products = self.context.products
-        all_product_ids = products['id'].unique()
-
-        # align chunks by window_start
-        aligned = self._align_chunks(demand_chunks, price_chunks)
-
-        if not aligned:
-            return pd.DataFrame({
-                'productId': all_product_ids,
-                'elasticity': 0.0,
-                'std_error': 0.0,
-                'n_obs': 0
-            })
-
-        # build time series per product
-        product_series = self._build_timeseries(aligned)
-
-        # compute elasticity per product
-        elasticities = []
-        for pid, series in product_series.items():
-            if len(series) < min_obs:
-                elasticities.append({
-                    'productId': pid,
-                    'elasticity': 0.0,
-                    'std_error': 0.0,
-                    'n_obs': len(series)
-                })
-                continue
-
-            elast = self._compute_elasticity(series, method)
-            elasticities.append({
-                'productId': pid,
-                'elasticity': elast['value'],
-                'std_error': elast.get('std_error', 0.0),
-                'n_obs': len(series)
-            })
-
-        result_df = pd.DataFrame(elasticities)
-
-        # fill missing products with zero elasticity
-        observed_pids = set(result_df['productId'])
-        missing_pids = [p for p in all_product_ids if p not in observed_pids]
-
-        if missing_pids:
-            missing_df = pd.DataFrame({
-                'productId': missing_pids,
-                'elasticity': 0.0,
-                'std_error': 0.0,
-                'n_obs': 0
-            })
-            result_df = pd.concat([result_df, missing_df], ignore_index=True)
-
-        return result_df
-
-    def _align_chunks(self, demand_chunks: List[Dict], price_chunks: List[Dict]):
-        """Align demand and price chunks by window_start"""
-        price_lookup = {c['window_start']: c for c in price_chunks}
-        aligned = []
-
-        for dc in demand_chunks:
-            ws = dc['window_start']
-            if ws in price_lookup:
-                aligned.append({
-                    'window_start': ws,
-                    'window_end': dc['window_end'],
-                    'demand': dc['demand_vector'],
-                    'prices': price_lookup[ws]['price_vector']
-                })
-
-        return aligned
-
-    def _build_timeseries(self, aligned: List[Dict]):
-        """Build time series [timestamp, price, quantity] per product"""
-        series_by_product = {}
-
-        for chunk in aligned:
-            merged = chunk['demand'].merge(chunk['prices'], on='productId', how='inner')
-
-            for _, row in merged.iterrows():
-                pid = row['productId']
-                if pid not in series_by_product:
-                    series_by_product[pid] = []
-
-                series_by_product[pid].append({
-                    'timestamp': chunk['window_start'],
-                    'price': row['price'],
-                    'quantity': row['demand_score']
-                })
-
-        return series_by_product
-
-    def _compute_elasticity(self, series: List[Dict], method: str):
-        """Compute point or arc elasticity"""
-        prices = np.array([s['price'] for s in series])
-        quantities = np.array([s['quantity'] for s in series])
-
-        # filter out zero/negative values
-        valid = (prices > 0) & (quantities > 0)
-        if valid.sum() < 2:
-            return {'value': 0.0, 'std_error': 0.0}
-
-        prices = prices[valid]
-        quantities = quantities[valid]
-
-        if method == 'point':
-            return self._point_elasticity(prices, quantities)
-        elif method == 'arc':
-            return self._arc_elasticity(prices, quantities)
-        else:
-            raise ValueError(f"Unknown elasticity method: {method}")
-
-    def _point_elasticity(self, prices: np.ndarray, quantities: np.ndarray):
-        """Point elasticity via log-log regression: log(Q) = a + b*log(P), elasticity = b"""
-        if len(prices) < 2:
-            return {'value': 0.0, 'std_error': 0.0}
-
-        log_p = np.log(prices)
-        log_q = np.log(quantities)
-
-        if log_p.std() == 0:
-            return {'value': 0.0, 'std_error': 0.0}
-
-        cov = np.cov(log_p, log_q)[0, 1]
-        var = np.var(log_p)
-        b = cov / var
-
-        # std error estimate
-        if len(prices) > 2:
-            residuals = log_q - (log_q.mean() + b * (log_p - log_p.mean()))
-            mse = (residuals ** 2).sum() / (len(prices) - 2)
-            se_b = np.sqrt(mse / (len(prices) * var))
-        else:
-            se_b = 0.0
-
-        return {'value': b, 'std_error': se_b}
-
-    def _arc_elasticity(self, prices: np.ndarray, quantities: np.ndarray):
-        """Arc elasticity: average period-over-period elasticity"""
-        elasticities = []
-
-        for i in range(1, len(prices)):
-            p1, p2 = prices[i-1], prices[i]
-            q1, q2 = quantities[i-1], quantities[i]
-
-            p_avg = (p1 + p2) / 2
-            q_avg = (q1 + q2) / 2
-
-            if p_avg == 0 or q_avg == 0:
-                continue
-
-            delta_p = p2 - p1
-            delta_q = q2 - q1
-
-            if delta_p == 0:
-                continue
-
-            e = (delta_q / q_avg) / (delta_p / p_avg)
-            elasticities.append(e)
-
-        if not elasticities:
-            return {'value': 0.0, 'std_error': 0.0}
-
-        return {
-            'value': np.mean(elasticities),
-            'std_error': np.std(elasticities) / np.sqrt(len(elasticities))
-        }
+        df_indexed = df.set_index(ts_col)
+        # we return a df of average price per product over the entire period
+        # TODO: maybe consider a different operation to handle price aggregation over time
+        avg_prices = df_indexed.groupby('productId')['price'].mean().reindex(unique_products, fill_value=0).reset_index()
+        avg_prices.columns = ['productId', 'price']
+        # TODO: fill 0s with base_price from products (the map below is built but not applied to avg_prices yet)
+        base_price_map = products.set_index('id')['base_price'].to_dict()
+        return avg_prices
--- a/experiments/procesing/steps/fetch.py
+++ b/experiments/procesing/steps/fetch.py
@@ -2,7 +2,11 @@ import pandas as pd
 from procesing.steps.base import BaseContextStep

 class FetchInteractionsStep(BaseContextStep):
-    """Fetch raw interaction data from Kafka topic"""
+    """Fetch raw interaction data from Kafka topic with optional time filtering"""
+
+    def __init__(self, context, lookback: str = None):
+        super().__init__(context)
+        self.lookback = lookback

    def transform(self, X=None):
        df = self.context.provider.fetch_kafka_topic('user-interactions')
@@ -24,14 +28,35 @@ class FetchInteractionsStep(BaseContextStep):
        if 'metadata_dateIndex' in df.columns:
            df['dateIndex'] = df['metadata_dateIndex'].astype('Int64')

+        # Apply time filtering if lookback specified
+        if self.lookback and 'ts' in df.columns:
+            df['ts'] = pd.to_datetime(df['ts'])
+            cutoff = pd.Timestamp.now() - pd.Timedelta(self.lookback)
+            df = df[df['ts'] >= cutoff]
+
        return df


 class FetchPriceLogsStep(BaseContextStep):
-    """Fetch price log data from Kafka topic"""
+    """Fetch price log data from Kafka topic with optional time filtering"""
+
+    def __init__(self, context, lookback: str = None):
+        super().__init__(context)
+        self.lookback = lookback

    def transform(self, X=None):
-        return self.context.provider.fetch_kafka_topic('price-logs')
+        df = self.context.provider.fetch_kafka_topic('price-logs')
+
+        if df.empty:
+            return df
+
+        # Apply time filtering if lookback specified
+        if self.lookback and 'ts' in df.columns:
+            df['ts'] = pd.to_datetime(df['ts'])
+            cutoff = pd.Timestamp.now() - pd.Timedelta(self.lookback)
+            df = df[df['ts'] >= cutoff]
+
+        return df


 class FetchExperimentsStep(BaseContextStep):
--- a/experiments/procesing/steps/join.py
+++ b/experiments/procesing/steps/join.py
@@ -32,3 +32,27 @@ class JoinExperimentsStep(BaseContextStep):
        })

        return interactions_df.merge(experiments_df, on='experimentId', how='left')
+
+class JoinProductFeaturesStep(BaseContextStep):
+    """Join product features to interactions"""
+
+    def transform(self, data: tuple):
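+        """
+        Args:
+            data: (demand_df, price_df); base prices are read from self.context.products
+        Returns:
+            demand_df left-joined with price_df and base_price on productId (demand_df as-is if price_df is empty)
+        """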
+        demand_df, price_df = data
+
+        # get base prices from products if available
+        products = self.context.products
+        products['base_price'] = products.apply(
+            lambda row: float(row['metadata'].get('base_price', 0.0)) if isinstance(row['metadata'], dict) else 0,
+            axis=1
+        )
+        products = products[['id', 'base_price']].rename(columns={'id': 'productId'})
+
+        if price_df.empty:
+            return demand_df
+        return demand_df.merge(price_df, on='productId', how='left').merge(products, on='productId', how='left')