Improving interface after experiment01 (#30)

* fix: fixes of backwords

* fixing hotel information with image placeholders

* chore: clean up product display in hotel and cleaner interfacing

* adding loader with historical data loading

* feature: cleaning up pipeline

* chore: simple surge pricer

* created new pricing pipeline

* adding a checkout page to both sites

* fix: fixing stale package

* test: we won't be using elasticity anymore so it's okay

* chore: cleaning elasticity references

* chore: store sting

* feature: e2e intro pipeline surge pricing

* fix: CVE vulnerability patching
This commit is contained in:
Daniel Alves Rösel
2025-12-06 17:47:14 +01:00
committed by GitHub
parent 59d4fb7891
commit 8751583764
27 changed files with 709 additions and 1096 deletions

View File

@@ -1,13 +1,12 @@
from procesing.steps.base import BaseContextStep
from procesing.steps.fetch import FetchInteractionsStep, FetchPriceLogsStep, FetchExperimentsStep
from procesing.steps.join import JoinExperimentsStep
from procesing.steps.augment import CreatePriceBucketsStep, AugmentEventNamesStep
from procesing.steps.join import JoinExperimentsStep, JoinProductFeaturesStep
from procesing.steps.augment import CreatePriceBucketsStep, AugmentEventNamesStep, AugmentInteractionsStep
from procesing.steps.chunk import ChunkByTimeWindowStep
from procesing.steps.demand import ComputeDemandStep, ComputeDemandForChunksStep
from procesing.steps.elasticity import AggregatePriceLogsStep, ComputeElasticityStep
from procesing.steps.elasticity import AggregatePriceLogsStep
from procesing.steps.pricing import FitPricingFunctionStep, PredictPricesStep
from procesing.steps.session import ExtractSessionFeaturesStep, _extract_features_for_session
# StateSpace, BuildStateSpaceStep,
__all__ = [
'BaseContextStep',
@@ -15,13 +14,14 @@ __all__ = [
'FetchPriceLogsStep',
'FetchExperimentsStep',
'JoinExperimentsStep',
'JoinProductFeaturesStep',
'CreatePriceBucketsStep',
'AugmentEventNamesStep',
'AugmentInteractionsStep',
'ChunkByTimeWindowStep',
'ComputeDemandStep',
'ComputeDemandForChunksStep',
'AggregatePriceLogsStep',
'ComputeElasticityStep',
'FitPricingFunctionStep',
'PredictPricesStep',
'ExtractSessionFeaturesStep',

View File

@@ -2,6 +2,93 @@ import numpy as np
import pandas as pd
from procesing.steps.base import BaseContextStep
class AugmentInteractionsStep(BaseContextStep):
    """
    Consolidated step: create price buckets, augment event names, join experiments.

    Input: (interactions_df, price_logs_df)
    Output: enriched interactions_df
    """

    def transform(self, data: tuple):
        """Enrich the interactions frame with buckets, schema-tagged event names and experiments.

        Args:
            data: ``(interactions_df, price_logs_df)``. Only ``interactions_df``
                is read by this step; ``price_logs_df`` is unpacked but unused.

        Returns:
            The enriched interactions frame. An empty input is returned as-is.
        """
        interactions_df, _price_logs_df = data

        if interactions_df.empty:
            return interactions_df

        # Work on a copy: the helpers below add columns in place, and the
        # incoming frame may be a filtered slice of another frame (which would
        # trigger SettingWithCopyWarning) or still be held by the caller.
        interactions_df = interactions_df.copy()

        # Step 1: Create price buckets
        interactions_df = self._create_price_buckets(interactions_df)

        # Step 2: Augment event names
        interactions_df = self._augment_event_names(interactions_df)

        # Step 3: Join experiments (optional)
        if 'experimentId' in interactions_df.columns:
            interactions_df = self._join_experiments(interactions_df)

        return interactions_df

    def _create_price_buckets(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add a ``price_bucket`` column derived from ``metadata_price``.

        Without a ``metadata_price`` column, or when every price is null, the
        bucket is the empty string. Otherwise prices are cut into
        ``n_price_buckets`` quantiles (config key, default 5) labelled
        ``PB_1`` .. ``PB_n``.
        """
        if 'metadata_price' not in df.columns:
            df['price_bucket'] = ""
            return df

        n_buckets = self.context.config.get('n_price_buckets', 5)
        prices = df['metadata_price']

        if prices.notnull().any():
            try:
                price_buckets = pd.qcut(
                    prices,
                    q=n_buckets,
                    labels=[f"PB_{i+1}" for i in range(n_buckets)],
                    duplicates='drop'
                )
            except ValueError:
                # fallback for insufficient unique values: label each row by
                # its integer price instead of a quantile
                price_buckets = prices.apply(
                    lambda x: f"P_{int(x)}" if pd.notnull(x) else ""
                )
        else:
            price_buckets = pd.Series([""] * len(df), index=df.index)

        df['price_bucket'] = price_buckets
        return df

    def _augment_event_names(self, df: pd.DataFrame) -> pd.DataFrame:
        """Append a ``_<productId>@<price_bucket>`` suffix to ``eventName``.

        The suffix is stored in ``metadata_schema`` and is empty for rows whose
        product or bucket is null. If either column is missing entirely, every
        row gets an empty suffix and ``eventName`` is left unchanged.
        """
        if 'productId' in df.columns and 'price_bucket' in df.columns:
            has_product = df['productId'].notnull()
            # NOTE(review): empty-string buckets count as "present" here, so
            # such rows still receive a "_<productId>@" suffix - confirm intended.
            has_bucket = df['price_bucket'].notnull()
            df['metadata_schema'] = np.where(
                has_product & has_bucket,
                "_" + df['productId'].astype(str) + "@" + df['price_bucket'].astype(str),
                ""
            )
        else:
            # Nothing to build the schema from; previously this path raised
            # KeyError when indexing the missing column.
            df['metadata_schema'] = ""

        df['eventName'] = df['eventName'] + df['metadata_schema']
        return df

    def _join_experiments(self, df: pd.DataFrame) -> pd.DataFrame:
        """Left-join experiment metadata onto rows via ``experimentId``.

        Returns ``df`` unchanged when there are no non-null experiment ids or
        the provider returns no experiments.
        """
        exp_ids = df['experimentId'].dropna().unique().tolist()
        if not exp_ids:
            return df

        experiments_df = self.context.provider.fetch_experiments(exp_ids)
        if experiments_df.empty:
            return df

        # Colliding experiment columns get an "_exp" suffix; interaction
        # columns keep their names.
        return df.merge(
            experiments_df,
            left_on='experimentId',
            right_on='id',
            how='left',
            suffixes=('', '_exp')
        )
class CreatePriceBucketsStep(BaseContextStep):
"""Create price bucket labels from price data"""

View File

@@ -16,7 +16,7 @@ class AggregatePriceLogsStep(BaseContextStep):
df = price_logs_df.copy()
ts_col = self.context.config.get('ts_col', 'ts')
window_size = self.context.window_size
#window_size = self.context.window_size WE ARE NOT USING CHUNKS ANYMORE
# ensure datetime
if not pd.api.types.is_datetime64_any_dtype(df[ts_col]):
@@ -24,230 +24,19 @@ class AggregatePriceLogsStep(BaseContextStep):
df = df.sort_values([ts_col, 'productId'])
products = self.context.products
unique_products = products['id'].unique()
# VECTORIZED: group by product, resample by time window, compute mean
df_indexed = df.set_index(ts_col)
windowed = (
df_indexed
.groupby('productId')['price']
.resample(window_size)
.mean()
.reset_index()
# get base price from metadata if available 1) read the metadata col as json and get the base_price
products['base_price'] = products.apply(
lambda row: row['metadata'].get('base_price', 0) if isinstance(row['metadata'], dict) else 0,
axis=1
)
# forward fill missing windows (carry last known price)
windowed = windowed.sort_values([ts_col, 'productId'])
windowed['price'] = windowed.groupby('productId')['price'].ffill()
windowed = windowed.dropna(subset=['price'])
unique_products = products['id'].unique()
# group into chunks by window
chunks = []
for window_start, group in windowed.groupby(ts_col):
price_vector = group[['productId', 'price']].copy()
# fill missing products with last known price before this window
missing_products = set(unique_products) - set(price_vector['productId'])
if missing_products:
for pid in missing_products:
last_price = df_indexed[
(df_indexed['productId'] == pid) &
(df_indexed.index < window_start)
]['price']
if not last_price.empty:
price_vector = pd.concat([
price_vector,
pd.DataFrame({'productId': [pid], 'price': [last_price.iloc[-1]]})
], ignore_index=True)
if not price_vector.empty:
chunks.append({
'window_start': window_start,
'window_end': window_start + pd.Timedelta(window_size),
'price_vector': price_vector
})
return chunks
class ComputeElasticityStep(BaseContextStep):
    """
    Compute price elasticity from demand and price chunks.

    Input: (demand_chunks, price_chunks)
    Output: elasticity_df [productId, elasticity, std_error, n_obs]
    """

    def transform(self, chunk_tuple: tuple):
        """Estimate one elasticity per product.

        Args:
            chunk_tuple: ``(demand_chunks, price_chunks)``; each is a list of
                dicts keyed by ``window_start`` (see ``_align_chunks``).

        Returns:
            DataFrame with columns ``productId``, ``elasticity``,
            ``std_error``, ``n_obs``. Every id in ``context.products['id']``
            is present; products without enough observations get zeros.

        Raises:
            ValueError: if the configured ``elasticity_method`` is unknown.
        """
        demand_chunks, price_chunks = chunk_tuple

        method = self.context.config.get('elasticity_method', 'point')
        min_obs = self.context.config.get('min_observations', 2)

        all_product_ids = self.context.products['id'].unique()

        # align chunks by window_start
        aligned = self._align_chunks(demand_chunks, price_chunks)
        if not aligned:
            return pd.DataFrame({
                'productId': all_product_ids,
                'elasticity': 0.0,
                'std_error': 0.0,
                'n_obs': 0
            })

        # build time series per product
        product_series = self._build_timeseries(aligned)

        # compute elasticity per product
        rows = []
        for pid, series in product_series.items():
            if len(series) < min_obs:
                elast = {'value': 0.0, 'std_error': 0.0}
            else:
                elast = self._compute_elasticity(series, method)
            rows.append({
                'productId': pid,
                'elasticity': elast['value'],
                'std_error': elast.get('std_error', 0.0),
                'n_obs': len(series)
            })

        # Explicit columns keep 'productId' available even when no product
        # produced any row (aligned windows whose merges were all empty).
        result_df = pd.DataFrame(
            rows, columns=['productId', 'elasticity', 'std_error', 'n_obs']
        )

        # fill missing products with zero elasticity
        observed_pids = set(result_df['productId'])
        missing_pids = [p for p in all_product_ids if p not in observed_pids]
        if missing_pids:
            missing_df = pd.DataFrame({
                'productId': missing_pids,
                'elasticity': 0.0,
                'std_error': 0.0,
                'n_obs': 0
            })
            if result_df.empty:
                result_df = missing_df
            else:
                result_df = pd.concat([result_df, missing_df], ignore_index=True)

        return result_df

    def _align_chunks(self, demand_chunks: List[Dict], price_chunks: List[Dict]):
        """Pair demand and price chunks that share the same ``window_start``.

        Demand chunks with no matching price chunk are dropped.
        """
        price_lookup = {c['window_start']: c for c in price_chunks}

        return [
            {
                'window_start': dc['window_start'],
                'window_end': dc['window_end'],
                'demand': dc['demand_vector'],
                'prices': price_lookup[dc['window_start']]['price_vector']
            }
            for dc in demand_chunks
            if dc['window_start'] in price_lookup
        ]

    def _build_timeseries(self, aligned: List[Dict]):
        """Build a list of {timestamp, price, quantity} observations per product.

        Each aligned chunk's demand and price frames are inner-joined on
        ``productId``; ``quantity`` is taken from ``demand_score``.
        """
        series_by_product = {}

        for chunk in aligned:
            merged = chunk['demand'].merge(chunk['prices'], on='productId', how='inner')
            # itertuples avoids the per-row Series construction of iterrows
            for row in merged.itertuples(index=False):
                series_by_product.setdefault(row.productId, []).append({
                    'timestamp': chunk['window_start'],
                    'price': row.price,
                    'quantity': row.demand_score
                })

        return series_by_product

    def _compute_elasticity(self, series: List[Dict], method: str):
        """Compute point or arc elasticity from one product's observations.

        Observations with a non-positive price or quantity are discarded;
        fewer than two remaining observations yields a zero result.

        Raises:
            ValueError: if ``method`` is neither ``'point'`` nor ``'arc'``.
        """
        prices = np.asarray([s['price'] for s in series], dtype=float)
        quantities = np.asarray([s['quantity'] for s in series], dtype=float)

        # filter out zero/negative values (logs and ratios need positives)
        valid = (prices > 0) & (quantities > 0)
        if valid.sum() < 2:
            return {'value': 0.0, 'std_error': 0.0}

        prices = prices[valid]
        quantities = quantities[valid]

        if method == 'point':
            return self._point_elasticity(prices, quantities)
        elif method == 'arc':
            return self._arc_elasticity(prices, quantities)
        else:
            raise ValueError(f"Unknown elasticity method: {method}")

    def _point_elasticity(self, prices: np.ndarray, quantities: np.ndarray):
        """Point elasticity via log-log regression: log(Q) = a + b*log(P), elasticity = b"""
        n_obs = len(prices)
        if n_obs < 2:
            return {'value': 0.0, 'std_error': 0.0}

        log_p = np.log(prices)
        log_q = np.log(quantities)

        # No price variation: the slope is undefined.
        if log_p.max() == log_p.min():
            return {'value': 0.0, 'std_error': 0.0}

        dev_p = log_p - log_p.mean()
        dev_q = log_q - log_q.mean()
        sxx = float(np.dot(dev_p, dev_p))
        if sxx == 0:
            return {'value': 0.0, 'std_error': 0.0}

        # OLS slope Sxy / Sxx. Both sums use the same normalisation; mixing
        # np.cov (ddof=1) with np.var (ddof=0) would scale the slope by n/(n-1).
        b = float(np.dot(dev_p, dev_q)) / sxx

        # std error estimate (needs at least one residual degree of freedom)
        if n_obs > 2:
            residuals = dev_q - b * dev_p
            mse = float(np.dot(residuals, residuals)) / (n_obs - 2)
            se_b = float(np.sqrt(mse / sxx))
        else:
            se_b = 0.0

        return {'value': b, 'std_error': se_b}

    def _arc_elasticity(self, prices: np.ndarray, quantities: np.ndarray):
        """Arc elasticity: average period-over-period (midpoint) elasticity"""
        elasticities = []

        for p1, p2, q1, q2 in zip(prices[:-1], prices[1:], quantities[:-1], quantities[1:]):
            p_avg = (p1 + p2) / 2
            q_avg = (q1 + q2) / 2
            if p_avg == 0 or q_avg == 0:
                continue

            delta_p = p2 - p1
            if delta_p == 0:
                # no price change between periods: elasticity undefined
                continue
            delta_q = q2 - q1

            elasticities.append((delta_q / q_avg) / (delta_p / p_avg))

        if not elasticities:
            return {'value': 0.0, 'std_error': 0.0}

        return {
            'value': float(np.mean(elasticities)),
            'std_error': float(np.std(elasticities) / np.sqrt(len(elasticities)))
        }
df_indexed = df.set_index(ts_col)
# we return a df of average price per product over the entire period
# TODO: maybe consider a different operation to handle price aggregation over time
avg_prices = df_indexed.groupby('productId')['price'].mean().reindex(unique_products, fill_value=0).reset_index()
avg_prices.columns = ['productId', 'price']
# fill 0s with base_price from products
base_price_map = products.set_index('id')['base_price'].to_dict()
return avg_prices

View File

@@ -2,7 +2,11 @@ import pandas as pd
from procesing.steps.base import BaseContextStep
class FetchInteractionsStep(BaseContextStep):
"""Fetch raw interaction data from Kafka topic"""
"""Fetch raw interaction data from Kafka topic with optional time filtering"""
def __init__(self, context, lookback: str = None):
super().__init__(context)
self.lookback = lookback
def transform(self, X=None):
df = self.context.provider.fetch_kafka_topic('user-interactions')
@@ -24,14 +28,35 @@ class FetchInteractionsStep(BaseContextStep):
if 'metadata_dateIndex' in df.columns:
df['dateIndex'] = df['metadata_dateIndex'].astype('Int64')
# Apply time filtering if lookback specified
if self.lookback and 'ts' in df.columns:
df['ts'] = pd.to_datetime(df['ts'])
cutoff = pd.Timestamp.now() - pd.Timedelta(self.lookback)
df = df[df['ts'] >= cutoff]
return df
class FetchPriceLogsStep(BaseContextStep):
    """Fetch price log data from Kafka topic with optional time filtering"""

    def __init__(self, context, lookback: str = None):
        """
        Args:
            context: pipeline context passed through to ``BaseContextStep``.
            lookback: optional window parsed by ``pd.Timedelta``; when set,
                only rows whose ``ts`` falls within that window before now
                are kept.
        """
        super().__init__(context)
        self.lookback = lookback

    def transform(self, X=None):
        """Return the ``price-logs`` topic as a frame, filtered by ``lookback`` if set.

        ``X`` is ignored. An empty frame is returned unchanged.
        """
        df = self.context.provider.fetch_kafka_topic('price-logs')

        if df.empty:
            return df

        # Apply time filtering if lookback specified
        if self.lookback and 'ts' in df.columns:
            df['ts'] = pd.to_datetime(df['ts'])
            ts = df['ts']
            # Build "now" in the column's timezone (None for naive data) so
            # the comparison below never mixes tz-aware and tz-naive values.
            tz = ts.dt.tz if pd.api.types.is_datetime64_any_dtype(ts) else None
            cutoff = pd.Timestamp.now(tz=tz) - pd.Timedelta(self.lookback)
            df = df[ts >= cutoff]

        return df
class FetchExperimentsStep(BaseContextStep):

View File

@@ -32,3 +32,27 @@ class JoinExperimentsStep(BaseContextStep):
})
return interactions_df.merge(experiments_df, on='experimentId', how='left')
class JoinProductFeaturesStep(BaseContextStep):
    """Join price data and product base prices onto a per-product demand frame"""

    def transform(self, data: tuple):
        """
        Args:
            data: (demand_df, price_df); both are joined on ``productId``.

        Returns:
            ``demand_df`` left-joined with ``price_df`` and then with each
            product's ``base_price``. If ``price_df`` is empty, ``demand_df``
            is returned unchanged (no base price is attached).
        """
        demand_df, price_df = data

        # get base prices from products if available
        # NOTE: as before, the base_price column is written onto
        # context.products itself, so it stays visible to later steps.
        products = self.context.products
        products['base_price'] = [
            self._base_price(metadata) for metadata in products['metadata']
        ]
        base_prices = products[['id', 'base_price']].rename(columns={'id': 'productId'})

        if price_df.empty:
            return demand_df

        return (
            demand_df
            .merge(price_df, on='productId', how='left')
            .merge(base_prices, on='productId', how='left')
        )

    @staticmethod
    def _base_price(metadata) -> float:
        """Read ``base_price`` from a product metadata dict, defaulting to 0.0.

        Non-dict metadata, a missing key, or a value that cannot be converted
        to float all yield 0.0.
        """
        if not isinstance(metadata, dict):
            return 0.0
        try:
            return float(metadata.get('base_price', 0.0))
        except (TypeError, ValueError):
            return 0.0