First pricing implementation (#27)

* first implementation of elasticity demand computation * chore: fixing test :( * feature: rudimentary definition of pricing pipeline * chore: fixing cross product missing data * add warning * feature: e2e pricing pipeline with inference
2026-05-31 16:43:36 +00:00 · 2025-11-27 18:25:27 +01:00
parent 8b76d24ade
commit c432c45343
8 changed files with 829 additions and 39 deletions
--- a/experiments/procesing/extract.py
+++ b/experiments/procesing/extract.py
@@ -6,6 +6,7 @@ import requests
 from dotenv import load_dotenv
 from sklearn.base import BaseEstimator, TransformerMixin
 from supabase import create_client, Client
+from typing import Tuple, List, Dict
 load_dotenv()

 BACKEND_URL = os.getenv("BACKEND_URL", "http://localhost:5000")
@@ -17,11 +18,13 @@ supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)


 class KafkaDataFetcher(BaseEstimator, TransformerMixin):
+    def __init__(self, topic: str = "user-interactions"):
+        self.topic = topic # also can be price-logs
    def fit(self, X=None, y=None):
        return self

    def transform(self, X=None):
-        resp = requests.get(f"{BACKEND_URL}/api/kafka/dump")
+        resp = requests.get(f"{BACKEND_URL}/api/kafka/dump?topic={self.topic}")
        resp.raise_for_status()
        data = resp.json()

@@ -29,12 +32,12 @@ class KafkaDataFetcher(BaseEstimator, TransformerMixin):
            return pd.DataFrame()

        df = pd.DataFrame(data['data'])
-        # explode metadata col json
-        if 'metadata' in df.columns:
-            df = df.join(pd.json_normalize(df.pop("metadata"), sep=".").add_prefix("metadata_"))
-        df = df.dropna(subset=['eventName'])
-        # remape dateIndex
-        df['dateIndex'] = df['metadata_dateIndex'].astype('Int64')
+        if self.topic == 'user-interactions':
+            if 'metadata' in df.columns: # explode metadata col json
+                df = df.join(pd.json_normalize(df.pop("metadata"), sep=".").add_prefix("metadata_"))
+            df = df.dropna(subset=['eventName'])
+            # remape dateIndex
+            df['dateIndex'] = df['metadata_dateIndex'].astype('Int64')
        return df


@@ -110,3 +113,95 @@ class EventTitleAugmenter(BaseEstimator, TransformerMixin):
        )
        df["eventName"] = df["eventName"] + df["metadata_schema"].astype(str)
        return df
+
+
+def chunk_shared_data(interactions_df: pd.DataFrame,
+                      price_logs_df: pd.DataFrame,
+                      window_size: str = '30s',
+                      ts_col: str = 'ts',
+                      include_partial: bool = False) -> Tuple[List[Dict], List[Dict]]:
+    """
+    Chunk interaction and price data into aligned time windows.
+
+    Window edges come from pd.date_range, starting at the earliest timestamp
+    found in either frame and advancing by window_size. Both returned lists
+    share those edges, so entry i of each list describes the same window.
+
+    Neither input frame is modified; timestamps are parsed on copies.
+
+    Args:
+        interactions_df: interaction data with timestamp column
+        price_logs_df: price log data with timestamp column
+        window_size: pandas freq string ('30s', '1min', '1h', etc)
+        ts_col: name of timestamp column
+        include_partial: if False (default) only complete windows are
+            returned: rows at or after the last full edge (the latest row
+            is always one of them) are dropped, and a time span shorter than
+            one window yields no chunks. If True a trailing window is
+            appended so rows up to and including the latest one are kept.
+
+    Returns:
+        tuple of (interaction_chunks, price_chunks) where each is list of dicts:
+        {
+            'window_start': timestamp (inclusive),
+            'window_end': timestamp (exclusive),
+            'data': dataframe for this window
+        }
+
+    Raises:
+        KeyError: if a non-empty frame has no ts_col column
+    """
+    # copy the populated frames so the callers' data is never mutated;
+    # an empty frame is tracked as None and may lack ts_col entirely
+    frames = []
+    for df in (interactions_df, price_logs_df):
+        if df.empty:
+            frames.append(None)
+            continue
+        df = df.copy()
+        if not pd.api.types.is_datetime64_any_dtype(df[ts_col]):
+            df[ts_col] = pd.to_datetime(df[ts_col])
+        frames.append(df)
+
+    # nothing to chunk when both inputs are empty
+    populated = [df for df in frames if df is not None]
+    if not populated:
+        return [], []
+
+    # global time bounds across both datasets
+    earliest = min(df[ts_col].min() for df in populated)
+    latest = max(df[ts_col].max() for df in populated)
+
+    # by default stop at latest, i.e. keep complete windows only
+    end = latest
+    if include_partial:
+        # local import: keeps this block independent of the file's import list
+        from pandas.tseries.frequencies import to_offset
+        # one extra step guarantees a final edge strictly after latest
+        end = latest + to_offset(window_size)
+
+    # shared window edges; N edges describe N - 1 windows
+    windows = pd.date_range(start=earliest, end=end, freq=window_size)
+    if len(windows) < 2:
+        return [], []
+
+    interaction_chunks: List[Dict] = []
+    price_chunks: List[Dict] = []
+
+    # slice both frames against the same edges so the two lists stay aligned
+    for window_start, window_end in zip(windows[:-1], windows[1:]):
+        for df, chunks in zip(frames, (interaction_chunks, price_chunks)):
+            if df is None:
+                # empty input: still emit a chunk so indices line up
+                data = pd.DataFrame()
+            else:
+                # half-open interval: a row on an edge goes to the later window
+                ts = df[ts_col]
+                data = df[(ts >= window_start) & (ts < window_end)]
+            chunks.append({
+                'window_start': window_start,
+                'window_end': window_end,
+                'data': data,
+            })
+
+    return interaction_chunks, price_chunks