refactoring and demand estimation

This commit is contained in:
2025-11-22 22:07:07 +01:00
parent 2661b841fc
commit 2ae027dba2
4 changed files with 151 additions and 99 deletions

View File

@@ -0,0 +1,19 @@
from .extract import (
KafkaDataFetcher,
ExperimentJoiner,
EventTitleAugmenter,
)
from .demand import DemandEstimator
from .mapping import SessionTransitionProbMatrixTransformer, render_graph
from .pipeline import etl_pipeline, pricing_pipeline
# Public API of the package: every name listed here is imported above from
# the extract, demand, mapping and pipeline submodules.
__all__ = [
'KafkaDataFetcher',
'ExperimentJoiner',
'EventTitleAugmenter',
'DemandEstimator',
'SessionTransitionProbMatrixTransformer',
'render_graph',
'etl_pipeline',
'pricing_pipeline',
]

View File

@@ -0,0 +1,39 @@
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from supabase import create_client, Client
import pandas as pd
import os
# Supabase connection settings are read from the environment when this module
# is imported; os.getenv returns None for a variable that is not set.
SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL")
SUPABASE_KEY = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY")
# Module-level client used by DemandEstimator.transform. It is created at
# import time, so importing this module already requires the settings above.
# NOTE(review): confirm how create_client behaves when either value is None.
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
class DemandEstimator(BaseEstimator, TransformerMixin):
    """Turn an interactions frame into a per-product demand score.

    The demand score of a product is the number of interaction rows whose
    ``productId`` equals the product's ``id`` in the Supabase table
    ``{store_mode}_products``. Products without any interaction score 0.

    Parameters
    ----------
    store_mode : str, default 'hotel'
        Prefix of the products table that is queried
        (``f'{store_mode}_products'``).
    session_filter : str, default ""
        When non-empty, only rows whose ``sessionId`` equals this value are
        counted. An empty string disables the filter.
    experiment_filter : str, default ""
        When non-empty, only rows whose ``experimentId`` equals this value
        are counted. An empty string disables the filter.
    """

    def __init__(self,
                 store_mode: str = 'hotel',
                 session_filter: str = "",
                 experiment_filter: str = ""):
        # scikit-learn's get_params/set_params/clone look up each __init__
        # argument as an attribute of the same name and expect it to be
        # stored unmodified, so nothing is renamed or normalised here.
        self.store_mode = store_mode
        self.session_filter = session_filter
        self.experiment_filter = experiment_filter

    @property
    def store(self) -> str:
        """Backward-compatible alias for ``store_mode``."""
        return self.store_mode

    @store.setter
    def store(self, value: str) -> None:
        self.store_mode = value

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the estimator has no fitted state.

        ``y`` is accepted (and ignored) to match the other transformers'
        ``fit(X, y)`` signature.
        """
        return self

    def transform(self, interactions: pd.DataFrame) -> pd.DataFrame:
        """Count interactions per product.

        Parameters
        ----------
        interactions : pd.DataFrame
            Must contain a ``productId`` column, plus ``sessionId`` /
            ``experimentId`` when the corresponding filter is set.

        Returns
        -------
        pd.DataFrame
            Columns ``productId`` and ``demand_score``, one row per distinct
            ``id`` of the products table. If ``interactions`` is empty on
            entry, or the products table has no rows, an empty frame with
            these two columns is returned.
        """
        if interactions.empty:
            # Nothing to count: return early without querying Supabase.
            return pd.DataFrame(columns=["productId", "demand_score"])

        # Empty-string (or None) filters are falsy and therefore skipped.
        if self.session_filter:
            interactions = interactions[interactions['sessionId'] == self.session_filter]
        if self.experiment_filter:
            interactions = interactions[interactions['experimentId'] == self.experiment_filter]

        response = supabase.table(f'{self.store_mode}_products').select("id, room_type, date_index, metadata, availability").execute()
        products = pd.DataFrame(response.data)
        if products.empty:
            # No rows means no 'id' column to index into.
            return pd.DataFrame(columns=["productId", "demand_score"])
        unique_products = products['id'].unique()

        # TODO: improve demand score calculation rather than just counting interactions (use weights..)
        # while maintaining simplicity of a simple counting approach
        # value_counts also copes with filters that removed every row: the
        # reindex below then yields a zero score for every product.
        counts = interactions['productId'].value_counts()
        product_demand = (
            counts.reindex(unique_products, fill_value=0)
            .rename_axis('productId')
            .reset_index(name='demand_score')
        )
        return product_demand

View File

@@ -15,8 +15,12 @@ N_PRICE_BUCKETS = 5
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
def get_data_from_kafka() -> pd.DataFrame:
"""fetch all events from backend dump endpoint"""
class KafkaDataFetcher(BaseEstimator, TransformerMixin):
def fit(self, X=None, y=None):
return self
def transform(self, X=None):
resp = requests.get(f"{BACKEND_URL}/api/kafka/dump")
resp.raise_for_status()
data = resp.json()
@@ -29,10 +33,16 @@ def get_data_from_kafka() -> pd.DataFrame:
if 'metadata' in df.columns:
df = df.join(pd.json_normalize(df.pop("metadata"), sep=".").add_prefix("metadata_"))
df = df.dropna(subset=['eventName'])
# remap metadata_dateIndex to dateIndex
df['dateIndex'] = df['metadata_dateIndex'].astype('Int64')
return df
def join_with_experiments(df: pd.DataFrame) -> pd.DataFrame:
class ExperimentJoiner(BaseEstimator, TransformerMixin):
def fit(self, X=None, y=None):
return self
def transform(self, df):
if df.empty or 'experimentId' not in df.columns:
return df
@@ -68,7 +78,11 @@ def join_with_experiments(df: pd.DataFrame) -> pd.DataFrame:
return df
def augment_event_titles(df: pd.DataFrame) -> pd.DataFrame:
class EventTitleAugmenter(BaseEstimator, TransformerMixin):
def fit(self, X=None, y=None):
return self
def transform(self, df):
# turn the standard eventName (e.g. view_item_page) into view_item_page_{metadata_schema}:
# the metadata schema is used to create product-specific event names
@@ -96,25 +110,3 @@ def augment_event_titles(df: pd.DataFrame) -> pd.DataFrame:
)
df["eventName"] = df["eventName"] + df["metadata_schema"].astype(str)
return df
def extract() -> pd.DataFrame:
df = get_data_from_kafka()
df = join_with_experiments(df)
df = augment_event_titles(df)
return df
class DataExtractor(BaseEstimator, TransformerMixin):
def fit(self, X=None, y=None):
return self
def transform(self, X=None):
return extract()
if __name__ == "__main__":
df = extract()
print(df.head())
print(df.tail())
print(df.info())

View File

@@ -1,20 +1,22 @@
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from extract import DataExtractor
from extract import KafkaDataFetcher, ExperimentJoiner, EventTitleAugmenter
from mapping import SessionTransitionProbMatrixTransformer, render_graph
from demand import DemandEstimator
# exposable pipelines
etl_pipeline = Pipeline([
('data_extraction', DataExtractor()),
('kafka_fetch', KafkaDataFetcher()),
('experiment_join', ExperimentJoiner()),
('event_augment', EventTitleAugmenter()),
])
# Pricing pipeline: the output of DemandEstimator is passed straight into
# StandardScaler.
# NOTE(review): DemandEstimator.transform returns both productId and
# demand_score, so the scaler also receives productId — confirm that column
# is numeric or drop it before scaling.
pricing_pipeline = Pipeline([
('demand_estimation', DemandEstimator()),
('scaling', StandardScaler()),
])
# Manual run: execute the ETL pipeline with no input (None), feed its result
# to the pricing pipeline, and print what comes out.
if __name__ == "__main__":
processed_data = etl_pipeline.fit_transform(None)
pricing = pricing_pipeline.fit_transform(processed_data)
print(pricing)