# PHANTOM/experiments/procesing/demand.py

from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from supabase import create_client, Client
from typing import Optional, Literal
import os
import logging
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Supabase connection settings, read from the environment once at import time.
# Each value falls back to an empty string when its variable is unset.
SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL", "")
SUPABASE_KEY = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY", "")

# Shared Supabase client, created as a side effect of importing this module;
# DemandEstimator.transform uses it to read the product table.
# NOTE(review): when the variables above are unset this is called with empty
# strings — confirm how create_client behaves in that case.
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

class ChunkInteractionsIntoSteps(BaseEstimator, TransformerMixin):
    """
    Split interaction data into time windows for temporal analysis.

    Every row is assigned to the window obtained by flooring its timestamp to
    ``window_size``. Only windows that contain at least one row are produced,
    ordered by window start. Returns a list with one entry per window.
    """
    def __init__(self,
                 window_size:str='1h',
                 ts_col:str='ts',
                 return_metadata:bool=True):
        """
        Args:
            window_size: pandas frequency string ('1h', '30min', '1D', etc);
                it is passed to ``Series.dt.floor`` and, when metadata is
                requested, to ``pd.Timedelta``
            ts_col: timestamp column name
            return_metadata: if True, return dict with metadata per chunk
        """
        self.window_size = window_size
        self.ts_col = ts_col
        self.return_metadata = return_metadata

    def fit(self, X, y=None):
        """
        No-op fit; the transformer is stateless.

        ``y`` is accepted (and ignored) so the estimator can be called as
        ``fit(X, y)`` / ``fit_transform(X, y)`` like other transformers.

        Returns:
            self
        """
        return self

    def transform(self, interactions: pd.DataFrame):
        """
        Chunk ``interactions`` into consecutive time windows.

        The input frame is not modified. The timestamp column is converted
        with ``pd.to_datetime`` when it is not already a datetime dtype, and
        the returned chunks carry the converted column.

        Args:
            interactions: dataframe containing the ``ts_col`` column

        Returns:
            [] when ``interactions`` is empty, otherwise
            if return_metadata=False: list of dataframes, one per window
            if return_metadata=True: list of dicts with keys:
                - 'data': dataframe for this window
                - 'window_start': start timestamp
                - 'window_end': end timestamp
                - 'window_idx': integer index
        """
        if interactions.empty:
            return []

        df = interactions.copy()

        # ensure timestamp is datetime
        if not pd.api.types.is_datetime64_any_dtype(df[self.ts_col]):
            df[self.ts_col] = pd.to_datetime(df[self.ts_col])

        # sort by time; a stable sort keeps the incoming order of rows that
        # share the same timestamp, so the output is deterministic
        df = df.sort_values(self.ts_col, kind="stable")

        # Window start for every row. It is kept as a separate Series rather
        # than a temporary column so that a caller column that happens to be
        # named '_window' is neither overwritten nor dropped.
        window_starts = df[self.ts_col].dt.floor(self.window_size)

        # Window length is loop-invariant; only needed for the metadata dicts.
        window_length = pd.Timedelta(self.window_size) if self.return_metadata else None

        # group by window (groupby sorts the window starts ascending)
        chunks = []
        for idx, (window_start, chunk_data) in enumerate(df.groupby(window_starts)):
            if self.return_metadata:
                chunks.append({
                    'data': chunk_data,
                    'window_start': window_start,
                    'window_end': window_start + window_length,
                    'window_idx': idx
                })
            else:
                chunks.append(chunk_data)

        return chunks


class DemandEstimator(BaseEstimator, TransformerMixin):
    """
    Estimate per-product demand by counting interactions.

    The product catalogue is read from the ``{store_mode}_products`` Supabase
    table; the demand score of a product is the number of interaction rows
    that reference it. Products without interactions get a score of 0.
    """
    def __init__(self,
                 store_mode:str='hotel',
                 session_filter:str="",
                 experiment_filter:str=""):
        """
        Args:
            store_mode: prefix of the product table (``{store_mode}_products``)
            session_filter: if non-empty, keep only rows with this 'sessionId'
            experiment_filter: if non-empty, keep only rows with this
                'experimentId'
        """
        # Constructor arguments are stored unmodified under their own names so
        # that get_params(), repr() and sklearn.base.clone() work; an empty
        # (or None) filter is treated as "no filter" in transform().
        self.store_mode = store_mode
        self.session_filter = session_filter
        self.experiment_filter = experiment_filter

    @property
    def store(self):
        """Backward-compatible alias for ``store_mode``."""
        return self.store_mode

    @store.setter
    def store(self, value):
        self.store_mode = value

    def fit(self, X, y=None):
        """
        No-op fit; the estimator is stateless.

        ``y`` is accepted (and ignored) so the estimator can be called as
        ``fit(X, y)`` / ``fit_transform(X, y)`` like other transformers.

        Returns:
            self
        """
        return self

    def transform(self, interactions : pd.DataFrame):
        """
        Compute a demand score for every product in the store.

        Args:
            interactions: dataframe with a 'productId' column, plus
                'sessionId' / 'experimentId' when the matching filter is set

        Returns:
            dataframe with columns 'productId' and 'demand_score', one row
            per product id found in the product table. An empty dataframe
            with those columns is returned when ``interactions`` is empty
            (no query is made in that case) or when the product table
            yields no 'id' column.
        """
        if interactions.empty:
            return pd.DataFrame(columns=["productId", "demand_score"])
        if self.session_filter:
            interactions = interactions[interactions['sessionId'] == self.session_filter]
        if self.experiment_filter:
            interactions = interactions[interactions['experimentId'] == self.experiment_filter]

        # NOTE(review): only 'id' is used below; the other selected columns
        # are fetched but never read in this method.
        response = supabase.table(f'{self.store_mode}_products').select("id, room_type, date_index, metadata, availability").execute()
        products = pd.DataFrame(response.data)
        if 'id' not in products.columns:
            # An empty result set produces a frame without columns; report it
            # instead of failing with an opaque KeyError on 'id'.
            log.warning("Demand estimator found no products in table %s_products", self.store_mode)
            return pd.DataFrame(columns=["productId", "demand_score"])
        unique_products = products['id'].unique()
        log.info("Demand estimator found %s in data", len(unique_products))

        # filter out rows without productId
        interactions_with_products = interactions.dropna(subset=['productId'])

        if interactions_with_products.empty:
            # no interactions with products, return all zeros
            return pd.DataFrame({
                'productId': unique_products,
                'demand_score': 0
            })

        # TODO: improve demand score calculation rather than just counting interactions (use weights..)
        # while maintaining simplicity of a simple cross tab approach
        product_demand = pd.crosstab(interactions_with_products['productId'], "no_of_interactions")
        # Align to the catalogue: products never interacted with get 0, and
        # interaction ids that are not in the catalogue are discarded.
        product_demand = product_demand.reindex(unique_products, fill_value=0).reset_index()
        product_demand.columns = ['productId', 'demand_score']
        return product_demand