First pricing implementation (#27)

* first implementation of elasticity demand computation * chore: fixing test :( * feature: rudimentary definition of pricing pipeline * chore: fixing cross product missing data * add warning * feature: e2e pricing pipeline with inference
2026-05-31 08:33:36 +00:00 · 2025-11-27 18:25:27 +01:00
parent 8b76d24ade
commit c432c45343
8 changed files with 829 additions and 39 deletions
--- a/experiments/procesing/demand.py
+++ b/experiments/procesing/demand.py
@@ -2,14 +2,81 @@ from sklearn.base import BaseEstimator, TransformerMixin
 import numpy as np
 import pandas as pd
 from supabase import create_client, Client
-import pandas as pd
+from typing import Optional, Literal
 import os
+import logging
+log = logging.getLogger(__name__)

-SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL")
-SUPABASE_KEY = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY")
+SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL", "")
+SUPABASE_KEY = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY", "")

 supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

+class ChunkInteractionsIntoSteps(BaseEstimator, TransformerMixin):
+    """
+    Split interaction data into time windows for temporal analysis.
+    Returns one chunk per non-empty window (empty windows are skipped).
+    """
+    def __init__(self,
+                 window_size:str='1h',
+                 ts_col:str='ts',
+                 return_metadata:bool=True):
+        """
+        Args:
+            window_size: fixed pandas frequency string ('1h', '30min', '1D', etc)
+            ts_col: timestamp column name
+            return_metadata: if True, return dict with metadata per chunk
+        """
+        self.window_size = window_size
+        self.ts_col = ts_col
+        self.return_metadata = return_metadata
+
+    def fit(self, X, y=None):
+        """No-op; ``y`` is accepted so sklearn can call ``fit(X, y)``."""
+        return self
+
+    def transform(self, interactions: pd.DataFrame):
+        """
+        Returns:
+            if return_metadata=False: list of dataframes, one per window
+            if return_metadata=True: list of dicts with keys:
+                - 'data': dataframe for this window
+                - 'window_start': start timestamp (inclusive)
+                - 'window_end': end timestamp (exclusive)
+                - 'window_idx': integer index over non-empty windows
+        """
+        if interactions.empty:
+            return []
+
+        df = interactions.copy()
+
+        # ensure timestamp is datetime
+        if not pd.api.types.is_datetime64_any_dtype(df[self.ts_col]):
+            df[self.ts_col] = pd.to_datetime(df[self.ts_col])
+
+        # sort by time so rows inside each chunk are chronological
+        df = df.sort_values(self.ts_col)
+
+        # Group on a standalone key series rather than a temporary column,
+        # so an input column that happens to be named '_window' survives.
+        window_key = df[self.ts_col].dt.floor(self.window_size)
+        window_len = pd.Timedelta(self.window_size)  # loop-invariant
+
+        chunks = []
+        for idx, (window_start, group) in enumerate(df.groupby(window_key)):
+            if not self.return_metadata:
+                chunks.append(group)
+                continue
+            chunks.append({
+                'data': group,
+                'window_start': window_start,
+                'window_end': window_start + window_len,
+                'window_idx': idx
+            })
+
+        return chunks
+
+
 class DemandEstimator(BaseEstimator, TransformerMixin):
    def __init__(self,
                 store_mode:str='hotel',
@@ -28,12 +95,25 @@ class DemandEstimator(BaseEstimator, TransformerMixin):
            interactions = interactions[interactions['sessionId'] == self.session_filter]
        if self.experiment_filter:
            interactions = interactions[interactions['experimentId'] == self.experiment_filter]
+
        products=supabase.table(f'{self.store}_products').select("id, room_type, date_index, metadata, availability").execute()
        products = pd.DataFrame(products.data)
        unique_products = products['id'].unique()
+        log.info(f"Demand estimator found {len(unique_products)} in data")
+
+        # filter out rows without productId
+        interactions_with_products = interactions.dropna(subset=['productId'])
+
+        if interactions_with_products.empty:
+            # no interactions with products, return all zeros
+            return pd.DataFrame({
+                'productId': unique_products,
+                'demand_score': 0
+            })
+
        # TODO: improve demand score calculation rather than just counting interactions (use weights..)
        # while maintaining simplicity of a simple cross tab approach
-        product_demand = pd.crosstab(interactions['productId'], "no_of_interactions")
+        product_demand = pd.crosstab(interactions_with_products['productId'], "no_of_interactions")
        product_demand = product_demand.reindex(unique_products, fill_value=0).reset_index()
        product_demand.columns = ['productId', 'demand_score']
        return product_demand