refactoring and demand estimation

This commit is contained in:
2025-11-22 22:07:07 +01:00
parent 2661b841fc
commit 2ae027dba2
4 changed files with 151 additions and 99 deletions

View File

@@ -0,0 +1,19 @@
from .extract import (
KafkaDataFetcher,
ExperimentJoiner,
EventTitleAugmenter,
)
from .demand import DemandEstimator
from .mapping import SessionTransitionProbMatrixTransformer, render_graph
from .pipeline import etl_pipeline, pricing_pipeline
# Public names of the package; each entry is imported above from one of the
# submodules (.extract, .demand, .mapping, .pipeline).
__all__ = [
'KafkaDataFetcher',
'ExperimentJoiner',
'EventTitleAugmenter',
'DemandEstimator',
'SessionTransitionProbMatrixTransformer',
'render_graph',
'etl_pipeline',
'pricing_pipeline',
]

View File

@@ -0,0 +1,39 @@
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from supabase import create_client, Client
import pandas as pd
import os
# Supabase connection settings are read from the environment. os.getenv returns
# None for an unset variable, and that value is passed straight to create_client.
SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL")
SUPABASE_KEY = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY")
# Module-level client, created once at import time; DemandEstimator.transform
# uses it to query the products table.
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
class DemandEstimator(BaseEstimator, TransformerMixin):
    """Turn an interaction log into a per-product demand score.

    The demand score of a product is the number of interaction rows whose
    ``productId`` equals the product's ``id``. Every product found in the
    ``{store_mode}_products`` Supabase table is reported, with a score of 0
    when it has no matching interactions.

    Parameters
    ----------
    store_mode:
        Prefix of the products table to query (``f"{store_mode}_products"``).
    session_filter:
        When non-empty, only rows whose ``sessionId`` equals this value are
        counted. An empty string or ``None`` disables the filter.
    experiment_filter:
        When non-empty, only rows whose ``experimentId`` equals this value are
        counted. An empty string or ``None`` disables the filter.
    """

    def __init__(self,
                 store_mode: str = 'hotel',
                 session_filter: str = "",
                 experiment_filter: str = ""):
        # scikit-learn's get_params()/clone() read every constructor argument
        # back through an attribute of the same name and expect the value to
        # be stored unmodified, so no renaming or normalisation happens here.
        self.store_mode = store_mode
        self.session_filter = session_filter
        self.experiment_filter = experiment_filter

    @property
    def store(self) -> str:
        """Alias of ``store_mode``, kept for code that reads ``.store``."""
        return self.store_mode

    @store.setter
    def store(self, value: str) -> None:
        self.store_mode = value

    def fit(self, X=None, y=None):
        """No-op fit; returns ``self`` so the estimator can sit in a Pipeline."""
        return self

    def transform(self, interactions: pd.DataFrame) -> pd.DataFrame:
        """Count interactions per product.

        Parameters
        ----------
        interactions:
            Interaction log with a ``productId`` column, plus ``sessionId`` /
            ``experimentId`` columns when the matching filter is set.

        Returns
        -------
        pd.DataFrame
            Columns ``productId`` and ``demand_score``, one row per distinct
            product id in the products table. The frame has no rows when
            ``interactions`` is empty or the products table returns nothing.
        """
        empty_result = pd.DataFrame(columns=["productId", "demand_score"])
        if interactions.empty:
            return empty_result

        # Truthiness check: both "" and None mean "filter disabled".
        if self.session_filter:
            interactions = interactions[interactions['sessionId'] == self.session_filter]
        if self.experiment_filter:
            interactions = interactions[interactions['experimentId'] == self.experiment_filter]

        response = (
            supabase.table(f'{self.store_mode}_products')
            .select("id, room_type, date_index, metadata, availability")
            .execute()
        )
        products = pd.DataFrame(response.data)
        if products.empty:
            # No rows means no 'id' column to index; there is nothing to score.
            return empty_result
        unique_products = products['id'].unique()

        # TODO: improve demand score calculation rather than just counting
        # interactions (use weights..) while keeping the approach this simple.
        # value_counts() also works when the filters above left zero rows:
        # every product then gets a score of 0.
        counts = (
            interactions['productId']
            .value_counts()
            .reindex(unique_products, fill_value=0)
        )
        return pd.DataFrame({
            'productId': counts.index,
            'demand_score': counts.to_numpy(),
        })

View File

@@ -15,8 +15,12 @@ N_PRICE_BUCKETS = 5
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY) supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
def get_data_from_kafka() -> pd.DataFrame:
"""fetch all events from backend dump endpoint""" class KafkaDataFetcher(BaseEstimator, TransformerMixin):
def fit(self, X=None, y=None):
return self
def transform(self, X=None):
resp = requests.get(f"{BACKEND_URL}/api/kafka/dump") resp = requests.get(f"{BACKEND_URL}/api/kafka/dump")
resp.raise_for_status() resp.raise_for_status()
data = resp.json() data = resp.json()
@@ -29,10 +33,16 @@ def get_data_from_kafka() -> pd.DataFrame:
if 'metadata' in df.columns: if 'metadata' in df.columns:
df = df.join(pd.json_normalize(df.pop("metadata"), sep=".").add_prefix("metadata_")) df = df.join(pd.json_normalize(df.pop("metadata"), sep=".").add_prefix("metadata_"))
df = df.dropna(subset=['eventName']) df = df.dropna(subset=['eventName'])
# remap dateIndex
df['dateIndex'] = df['metadata_dateIndex'].astype('Int64')
return df return df
def join_with_experiments(df: pd.DataFrame) -> pd.DataFrame: class ExperimentJoiner(BaseEstimator, TransformerMixin):
def fit(self, X=None, y=None):
return self
def transform(self, df):
if df.empty or 'experimentId' not in df.columns: if df.empty or 'experimentId' not in df.columns:
return df return df
@@ -68,7 +78,11 @@ def join_with_experiments(df: pd.DataFrame) -> pd.DataFrame:
return df return df
def augment_event_titles(df: pd.DataFrame) -> pd.DataFrame: class EventTitleAugmenter(BaseEstimator, TransformerMixin):
def fit(self, X=None, y=None):
return self
def transform(self, df):
# from taking standard view_item_page in eventName to view_item_page_{metadata_schema} # from taking standard view_item_page in eventName to view_item_page_{metadata_schema}
# we want metadata schema to create product specific event names # we want metadata schema to create product specific event names
@@ -96,25 +110,3 @@ def augment_event_titles(df: pd.DataFrame) -> pd.DataFrame:
) )
df["eventName"] = df["eventName"] + df["metadata_schema"].astype(str) df["eventName"] = df["eventName"] + df["metadata_schema"].astype(str)
return df return df
def extract() -> pd.DataFrame:
df = get_data_from_kafka()
df = join_with_experiments(df)
df = augment_event_titles(df)
return df
class DataExtractor(BaseEstimator, TransformerMixin):
def fit(self, X=None, y=None):
return self
def transform(self, X=None):
return extract()
if __name__ == "__main__":
df = extract()
print(df.head())
print(df.tail())
print(df.info())

View File

@@ -1,20 +1,22 @@
from sklearn.pipeline import Pipeline from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import StandardScaler
from extract import DataExtractor from extract import KafkaDataFetcher, ExperimentJoiner, EventTitleAugmenter
from mapping import SessionTransitionProbMatrixTransformer, render_graph from mapping import SessionTransitionProbMatrixTransformer, render_graph
from demand import DemandEstimator from demand import DemandEstimator
# exposable pipelines # exposable pipelines
etl_pipeline = Pipeline([ etl_pipeline = Pipeline([
('data_extraction', DataExtractor()), ('kafka_fetch', KafkaDataFetcher()),
('experiment_join', ExperimentJoiner()),
('event_augment', EventTitleAugmenter()),
]) ])
pricing_pipeline = Pipeline([ pricing_pipeline = Pipeline([
('demand_estimation', DemandEstimator()), ('demand_estimation', DemandEstimator()),
('scaling', StandardScaler()),
]) ])
if __name__ == "__main__": if __name__ == "__main__":
processed_data = etl_pipeline.fit_transform(None) processed_data = etl_pipeline.fit_transform(None)
pricing = pricing_pipeline.fit_transform(processed_data) pricing = pricing_pipeline.fit_transform(processed_data)
print(pricing)