Airflow addition (#28)

* introducing airflow to run pipeline

* chore: updating dag with upload to registry

* introducing complete provider (non refactored and noisy)

* chore: removing old shit

* generic pricing baselines

* feature: super simple model registry (to be updated maybe third party OS software)

* chore: refactoring the providers docker config and requirements

* chore: refactored and broke down components (breaking)

* exporting all

* local pipeline execution working

* fix: fixing import structures from nonrelativistic

* chore: enables cross comm pickling with fully e2e pipeline compilation

* docs: what the pipeline is like now

* pipelines local running and pipeline high level definition

* cleaning old pipeline and vectorization

* leaked but fixing, not so important

* test: started with pipeline step testing

* chore: cleaning up provider of prices

* test: extra tests with semantic meaning checks

* migrating pricers

* feature: introducing pricing predictors (pricers)

* chore: e2e is done with new pipeline

* extra session feature extraction

* feature: experimental session pricer and metrics (vibe)

* chore: redefined and connected pricers (#29)
This commit is contained in:
Daniel Alves Rösel
2025-11-29 17:50:16 +01:00
committed by GitHub
parent 2a0e44ab24
commit ad9423bf59
49 changed files with 3642 additions and 619 deletions

View File

@@ -1,19 +1,55 @@
from .extract import (
KafkaDataFetcher,
ExperimentJoiner,
EventTitleAugmenter,
from procesing.context import PipelineContext
from procesing.providers import DataProvider, SupabaseProvider, BackendAPIProvider
from procesing.steps import (
BaseContextStep,
FetchInteractionsStep,
FetchPriceLogsStep,
FetchExperimentsStep,
JoinExperimentsStep,
CreatePriceBucketsStep,
AugmentEventNamesStep,
ChunkByTimeWindowStep,
ComputeDemandStep,
ComputeDemandForChunksStep,
AggregatePriceLogsStep,
ComputeElasticityStep,
StateSpace,
BuildStateSpaceStep,
FitPricingFunctionStep,
PredictPricesStep,
)
from procesing.pipelines import (
interaction_extraction_pipeline,
price_extraction_pipeline,
elasticity_computation_pipeline,
pricing_pipeline,
full_pipeline,
)
from .demand import DemandEstimator
from .mapping import SessionTransitionProbMatrixTransformer, render_graph
from .pipeline import etl_pipeline, pricing_pipeline
__all__ = [
'KafkaDataFetcher',
'ExperimentJoiner',
'EventTitleAugmenter',
'DemandEstimator',
'SessionTransitionProbMatrixTransformer',
'render_graph',
'etl_pipeline',
'PipelineContext',
'DataProvider',
'SupabaseProvider',
'BackendAPIProvider',
'BaseContextStep',
'FetchInteractionsStep',
'FetchPriceLogsStep',
'FetchExperimentsStep',
'JoinExperimentsStep',
'CreatePriceBucketsStep',
'AugmentEventNamesStep',
'ChunkByTimeWindowStep',
'ComputeDemandStep',
'ComputeDemandForChunksStep',
'AggregatePriceLogsStep',
'ComputeElasticityStep',
'StateSpace',
'BuildStateSpaceStep',
'FitPricingFunctionStep',
'PredictPricesStep',
'interaction_extraction_pipeline',
'price_extraction_pipeline',
'elasticity_computation_pipeline',
'pricing_pipeline',
'full_pipeline',
]

View File

@@ -0,0 +1,34 @@
from typing import Any, Dict
import pandas as pd
from procesing.providers.base import DataProvider
class PipelineContext:
    """
    Execution context shared by the steps of a pipeline run.

    Bundles the data provider, the run settings and a small in-memory cache
    so that steps receive their dependencies explicitly instead of reading
    module-level globals.
    """

    def __init__(self,
                 provider: DataProvider,
                 store_mode: str,
                 window_size: str = '30s',
                 **config):
        # per-context cache; keys are chosen by the steps that use it
        self._cache: Dict[str, Any] = {}
        # any extra keyword arguments are kept verbatim as free-form config
        self.config = config
        self.window_size = window_size
        self.store_mode = store_mode
        self.provider = provider

    def get_cached(self, key: str, default=None):
        """Return the value cached under `key`, or `default` when absent."""
        return self._cache.get(key, default)

    def cache(self, key: str, value):
        """Store `value` under `key` and return it, so calls can be chained."""
        self._cache[key] = value
        return value

    @property
    def products(self) -> pd.DataFrame:
        """Product catalog for `store_mode`; the provider is asked only on first access."""
        store = self._cache
        if 'products' in store:
            return store['products']
        store['products'] = self.provider.fetch_products(self.store_mode)
        return store['products']

View File

@@ -1,119 +0,0 @@
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from supabase import create_client, Client
from typing import Optional, Literal
import os
import logging
# Module-level logger named after this module.
log = logging.getLogger(__name__)
# Supabase connection settings are read from the environment at import time;
# each falls back to an empty string when the variable is unset.
SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL", "")
SUPABASE_KEY = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY", "")
# Shared client, created once as a side effect of importing this module.
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
class ChunkInteractionsIntoSteps(BaseEstimator, TransformerMixin):
    """
    Split interaction data into time windows for temporal analysis.

    `transform` returns a list with one entry per time window that contains
    at least one interaction (windows without rows produce no entry).
    """

    def __init__(self,
                 window_size: str = '1h',
                 ts_col: str = 'ts',
                 return_metadata: bool = True):
        """
        Args:
            window_size: pandas freq string ('1h', '30min', '1D', etc)
            ts_col: timestamp column name
            return_metadata: if True, return dict with metadata per chunk
        """
        self.window_size = window_size
        self.ts_col = ts_col
        self.return_metadata = return_metadata

    def fit(self, X, y=None):
        """No-op fit; `y` is accepted (and ignored) to follow the estimator convention."""
        return self

    def transform(self, interactions: pd.DataFrame):
        """
        Returns:
            [] when `interactions` is empty, otherwise
            if return_metadata=False: list of dataframes, one per window
            if return_metadata=True: list of dicts with keys:
                - 'data': dataframe for this window
                - 'window_start': start timestamp
                - 'window_end': end timestamp
                - 'window_idx': integer index
        """
        if interactions.empty:
            return []

        # work on a copy so the caller's frame is never modified
        df = interactions.copy()

        # ensure timestamp is datetime
        if not pd.api.types.is_datetime64_any_dtype(df[self.ts_col]):
            df[self.ts_col] = pd.to_datetime(df[self.ts_col])

        # sort by time, then label each row with the start of its window
        df = df.sort_values(self.ts_col)
        df['_window'] = df[self.ts_col].dt.floor(self.window_size)

        # the window length is loop-invariant, so compute it once
        window_span = pd.Timedelta(self.window_size)

        chunks = []
        for idx, (window_start, group) in enumerate(df.groupby('_window')):
            chunk_data = group.drop(columns=['_window'])
            if self.return_metadata:
                chunks.append({
                    'data': chunk_data,
                    'window_start': window_start,
                    'window_end': window_start + window_span,
                    'window_idx': idx,
                })
            else:
                chunks.append(chunk_data)
        return chunks
class DemandEstimator(BaseEstimator, TransformerMixin):
    """
    Score per-product demand as the number of interactions that reference it.

    The product list is read from the `{store_mode}_products` Supabase table on
    every `transform` call; products without interactions get a score of 0.
    """

    def __init__(self,
                 store_mode: str = 'hotel',
                 session_filter: str = "",
                 experiment_filter: str = ""):
        # get_params() looks up an attribute named after every __init__
        # parameter, so `store_mode` must exist under that exact name.
        self.store_mode = store_mode
        # original attribute name, kept for existing readers of `.store`
        self.store = store_mode
        # empty strings mean "no filter" and are normalised to None
        # NOTE(review): rewriting a parameter in __init__ may still make
        # sklearn.base.clone() reject this estimator - confirm before cloning.
        self.session_filter = session_filter if len(session_filter) > 0 else None
        self.experiment_filter = experiment_filter if len(experiment_filter) > 0 else None

    def fit(self, X, y=None):
        """No-op fit; `y` is accepted (and ignored) to follow the estimator convention."""
        return self

    def transform(self, interactions: pd.DataFrame):
        """
        Args:
            interactions: interaction rows; uses the 'productId' column and,
                when the matching filter is set, 'sessionId' / 'experimentId'.

        Returns:
            DataFrame with columns ['productId', 'demand_score']. It is empty
            when `interactions` is empty; otherwise it has one row per product
            id found in the products table.
        """
        if interactions.empty:
            return pd.DataFrame(columns=["productId", "demand_score"])

        if self.session_filter:
            interactions = interactions[interactions['sessionId'] == self.session_filter]
        if self.experiment_filter:
            interactions = interactions[interactions['experimentId'] == self.experiment_filter]

        response = supabase.table(f'{self.store}_products').select(
            "id, room_type, date_index, metadata, availability"
        ).execute()
        products = pd.DataFrame(response.data)
        unique_products = products['id'].unique()
        log.info("Demand estimator found %d in data", len(unique_products))

        # filter out rows without productId
        interactions_with_products = interactions.dropna(subset=['productId'])
        if interactions_with_products.empty:
            # no interactions with products, return all zeros
            return pd.DataFrame({
                'productId': unique_products,
                'demand_score': 0,
            })

        # TODO: improve demand score calculation rather than just counting interactions (use weights..)
        # while maintaining simplicity of a simple cross tab approach
        product_demand = pd.crosstab(interactions_with_products['productId'], "no_of_interactions")
        # reindex so every known product appears, with 0 for those never touched
        product_demand = product_demand.reindex(unique_products, fill_value=0).reset_index()
        product_demand.columns = ['productId', 'demand_score']
        return product_demand

View File

@@ -130,25 +130,24 @@ class TemporalElasticityEstimator(BaseEstimator, TransformerMixin):
def _build_product_timeseries(self, aligned_chunks):
"""Build time series [price, quantity] per product."""
series_by_product = {}
# vectorize chunk merging instead of iterating rows
all_merged = []
for chunk in aligned_chunks:
demand_df = chunk['demand']
price_df = chunk['prices']
merged = chunk['demand'].merge(chunk['prices'], on='productId', how='inner')
merged['timestamp'] = chunk['window_start']
all_merged.append(merged[['productId', 'timestamp', 'price', 'demand_score']])
# merge on productId
merged = demand_df.merge(price_df, on='productId', how='inner')
if not all_merged:
return {}
for _, row in merged.iterrows():
pid = row['productId']
if pid not in series_by_product:
series_by_product[pid] = []
series_by_product[pid].append({
'timestamp': chunk['window_start'],
'price': row['price'],
'quantity': row['demand_score']
})
# concat all chunks and group by productId in one pass
combined = pd.concat(all_merged, ignore_index=True)
series_by_product = {
pid: group[['timestamp', 'price', 'demand_score']].rename(
columns={'demand_score': 'quantity'}
).to_dict('records')
for pid, group in combined.groupby('productId')
}
return series_by_product

View File

@@ -1,207 +0,0 @@
import pandas as pd
import json
import numpy as np
import os
import requests
from dotenv import load_dotenv
from sklearn.base import BaseEstimator, TransformerMixin
from supabase import create_client, Client
from typing import Tuple, List, Dict
# Load variables from a .env file into the process environment before reading them.
load_dotenv()
# Base URL of the backend that exposes the Kafka dump endpoint.
BACKEND_URL = os.getenv("BACKEND_URL", "http://localhost:5000")
# No default is given here, so both values are None when the variable is unset.
SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL")
SUPABASE_KEY = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY")
# Number of quantile buckets requested when bucketing prices.
N_PRICE_BUCKETS = 5
# Shared client, created once as a side effect of importing this module.
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
class KafkaDataFetcher(BaseEstimator, TransformerMixin):
    """
    Fetch a dump of one Kafka topic from the backend as a DataFrame.

    Args:
        topic: topic to dump ('user-interactions' or 'price-logs').
        timeout: seconds to wait for the backend before `requests` raises,
            so an unresponsive backend cannot block the pipeline forever.
    """

    def __init__(self, topic: str = "user-interactions", timeout: float = 30.0):
        self.topic = topic  # also can be price-logs
        self.timeout = timeout

    def fit(self, X=None, y=None):
        """No-op fit; nothing is learned."""
        return self

    def transform(self, X=None):
        """
        Returns:
            DataFrame built from the response's 'data' list, or an empty
            DataFrame when the response is unsuccessful or carries no data.

        Raises:
            requests.HTTPError: on a non-2xx response (via raise_for_status).
        """
        resp = requests.get(
            f"{BACKEND_URL}/api/kafka/dump",
            # let requests build and encode the query string
            params={"topic": self.topic},
            timeout=self.timeout,
        )
        resp.raise_for_status()
        data = resp.json()

        if not data.get('success') or not data.get('data'):
            return pd.DataFrame()

        df = pd.DataFrame(data['data'])

        if self.topic == 'user-interactions':
            if 'metadata' in df.columns:  # explode metadata col json
                df = df.join(pd.json_normalize(df.pop("metadata"), sep=".").add_prefix("metadata_"))
            df = df.dropna(subset=['eventName'])
            # remap dateIndex; events without that metadata field get an all-NA column
            if 'metadata_dateIndex' in df.columns:
                df['dateIndex'] = df['metadata_dateIndex'].astype('Int64')
            else:
                df['dateIndex'] = pd.array([pd.NA] * len(df), dtype='Int64')
        return df
class ExperimentJoiner(BaseEstimator, TransformerMixin):
    """Left-join experiment attributes (and the nested task, if any) onto interaction rows."""

    def fit(self, X=None, y=None):
        """No-op fit; nothing is learned."""
        return self

    def transform(self, df):
        """
        Return `df` merged with rows of the 'experiments' table on 'experimentId'.

        The input is returned unchanged when it is empty, has no
        'experimentId' column, has no non-null experiment ids, or the query
        returns no rows.
        """
        joinable = (not df.empty) and ('experimentId' in df.columns)
        if not joinable:
            return df

        experiment_ids = df['experimentId'].dropna().unique()
        if len(experiment_ids) == 0:
            return df

        query = supabase.table('experiments').select(
            'id, subject_name, xp_human_only, xp_market_mode, xp_task_id, task:tasks(task_name, task_description, task_def_of_done)'
        )
        resp = query.in_('id', experiment_ids.tolist()).execute()
        if not resp.data:
            return df

        exp_df = pd.DataFrame(resp.data)

        # expand the nested task object into flat columns, if any row has one
        if 'task' in exp_df.columns and exp_df['task'].notnull().any():
            task_columns = pd.json_normalize(exp_df['task'].dropna())
            # re-attach the original row labels so the join lines up
            task_columns.index = exp_df[exp_df['task'].notnull()].index
            exp_df = exp_df.drop(columns=['task']).join(task_columns, rsuffix='_task')

        # experiment columns get an explicit exp_ prefix for clarity
        column_names = {
            'id': 'experimentId',
            'subject_name': 'exp_subject',
            'xp_human_only': 'exp_human_only',
            'xp_market_mode': 'exp_market_mode',
            'xp_task_id': 'exp_task_id',
        }
        exp_df = exp_df.rename(columns=column_names)

        return df.merge(exp_df, on='experimentId', how='left')
class EventTitleAugmenter(BaseEstimator, TransformerMixin):
    """
    Make event names product specific by appending `_{productId}@{price_bucket}`.

    Rows without a product id or without a price keep their original event name.
    """

    def fit(self, X=None, y=None):
        """No-op fit; nothing is learned."""
        return self

    def transform(self, df):
        """
        Args:
            df: interactions with 'eventName', 'productId' and
                'metadata_price' columns.

        Returns:
            A new DataFrame with an added 'metadata_schema' column and the
            suffixed 'eventName'. The input frame is left untouched.
        """
        # copy first: a transformer must not modify the frame it was handed
        df = df.copy()

        # from taking standard view_item_page in eventName to view_item_page_{metadata_schema}
        # we want metadata schema to create product specific event names

        # only create price buckets if we have at least one price
        if df["metadata_price"].notnull().sum() > 0:
            bucket_labels = [f"PB_{i+1}" for i in range(N_PRICE_BUCKETS)]
            try:
                price_buckets = pd.qcut(
                    df["metadata_price"],
                    q=N_PRICE_BUCKETS,
                    labels=bucket_labels,
                    duplicates='drop'  # handle duplicate bin edges
                )
            except ValueError:
                # fallback when quantile bucketing fails (e.g. too few distinct
                # prices): label by the raw integer price instead
                price_buckets = df["metadata_price"].apply(lambda x: f"P_{int(x)}" if pd.notnull(x) else "")
        else:
            price_buckets = pd.Series([""] * len(df), index=df.index)

        # metadata_schema: _product_id@price_bucket_{i} only if we have product metadata, otherwise keep original event name
        # TODO: make this adaptive, if we have hover_over_title we append the title, if its view_page we say which page
        df["metadata_schema"] = np.where(
            df["productId"].notnull() & df["metadata_price"].notnull(),
            "_" + df["productId"].astype(str) + "@" + price_buckets.astype(str),
            ""
        )
        df["eventName"] = df["eventName"] + df["metadata_schema"].astype(str)
        return df
def _with_datetime_ts(df: pd.DataFrame, ts_col: str) -> pd.DataFrame:
    """Return a copy of `df` whose `ts_col` is datetime-typed (frames without rows are only copied)."""
    out = df.copy()
    if not out.empty and not pd.api.types.is_datetime64_any_dtype(out[ts_col]):
        out[ts_col] = pd.to_datetime(out[ts_col])
    return out


def _rows_in_window(df: pd.DataFrame, ts_col: str, window_start, window_end) -> pd.DataFrame:
    """Rows with window_start <= ts < window_end; an empty input yields a bare DataFrame()."""
    if df.empty:
        return pd.DataFrame()
    mask = (df[ts_col] >= window_start) & (df[ts_col] < window_end)
    return df[mask]


def chunk_shared_data(interactions_df: pd.DataFrame,
                      price_logs_df: pd.DataFrame,
                      window_size: str = '30s',
                      ts_col: str = 'ts') -> Tuple[List[Dict], List[Dict]]:
    """
    Chunk interaction and price data into aligned time windows.

    Windows are half-open [start, end), start at the earliest timestamp seen
    in either frame and repeat every `window_size`. The final window is the
    one that contains the latest timestamp, so no row is left out even when
    the data does not end on a window boundary or spans less than one window.

    Args:
        interactions_df: interaction data with timestamp column
        price_logs_df: price log data with timestamp column
        window_size: pandas freq string ('30s', '1min', '1h', etc)
        ts_col: name of timestamp column

    Returns:
        tuple of (interaction_chunks, price_chunks) where each is list of dicts:
        {
            'window_start': timestamp,
            'window_end': timestamp,
            'data': dataframe for this window
        }
        Both lists have the same length and share the same windows.
    """
    if interactions_df.empty and price_logs_df.empty:
        return [], []

    # copies with datetime timestamps; the caller's frames are not modified
    interactions_df = _with_datetime_ts(interactions_df, ts_col)
    price_logs_df = _with_datetime_ts(price_logs_df, ts_col)

    # global time bounds over whichever frames actually have rows
    ts_series = [df[ts_col] for df in (interactions_df, price_logs_df) if not df.empty]
    earliest = min(series.min() for series in ts_series)
    latest = max(series.max() for series in ts_series)

    # window starts up to and including the window that holds `latest`
    windows = pd.date_range(start=earliest, end=latest, freq=window_size)
    if len(windows) == 0:
        return [], []

    # one extra edge closes the last window; it lies strictly after `latest`
    edges = list(windows) + [windows[-1] + windows.freq]

    interaction_chunks = []
    price_chunks = []
    for window_start, window_end in zip(edges, edges[1:]):
        interaction_chunks.append({
            'window_start': window_start,
            'window_end': window_end,
            'data': _rows_in_window(interactions_df, ts_col, window_start, window_end),
        })
        price_chunks.append({
            'window_start': window_start,
            'window_end': window_end,
            'data': _rows_in_window(price_logs_df, ts_col, window_start, window_end),
        })
    return interaction_chunks, price_chunks

View File

@@ -1,158 +0,0 @@
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
def build_transition_prob_matrix(df: pd.DataFrame):
    """
    Build a row-normalised transition matrix from consecutive 'eventName' values.

    Rows of `df` with a null 'eventName' are ignored. Labels are the distinct
    event names in order of first appearance; entry [i, j] is the share of
    transitions leaving label i that go to label j.

    Args:
        df: frame with an 'eventName' column, already in the desired order.

    Returns:
        (P, labels): P is a float ndarray of shape (len(labels), len(labels));
        a label with no outgoing transition has an all-zero row.
    """
    events = df['eventName'].dropna()
    # integer code per event plus the distinct labels in first-seen order
    codes, uniques = pd.factorize(events)
    labels = uniques.tolist()

    M = np.zeros((len(labels), len(labels)), dtype=float)
    # count each (current, next) pair; add.at accumulates repeated pairs
    np.add.at(M, (codes[:-1], codes[1:]), 1)

    row_sums = M.sum(axis=1, keepdims=True)
    # `out` must be supplied: with `where` alone, rows whose sum is 0 would be
    # left as uninitialised memory instead of zeros
    P = np.divide(M, row_sums, out=np.zeros_like(M), where=row_sums > 0)  # row-normalized
    return P, labels
# https://medium.com/data-science/time-series-data-markov-transition-matrices-7060771e362b
from graphviz import Digraph
import numpy as np
import pandas as pd
def _as_prob_df(matrix, labels=None):
"""Return a square DataFrame with index=columns=labels."""
if isinstance(matrix, pd.DataFrame):
# Ensure square and aligned
assert (matrix.index == matrix.columns).all(), "Index/columns must match."
return matrix
matrix = np.asarray(matrix, dtype=float)
assert matrix.shape[0] == matrix.shape[1], "Matrix must be square."
if labels is None:
raise ValueError("labels are required when matrix is not a DataFrame")
assert len(labels) == matrix.shape[0], "labels length must match matrix size."
return pd.DataFrame(matrix, index=list(labels), columns=list(labels))
def _df_to_edgelist(P: pd.DataFrame, threshold=0.0, round_digits=2):
"""Build weighted edges > threshold."""
edges = []
for src in P.index:
for dst in P.columns:
w = float(P.loc[src, dst])
if w > threshold:
edges.append((str(src), str(dst), f"{w:.{round_digits}f}"))
return edges
def render_graph(fname, matrix, ls_index=None, threshold=0.0, fmt="svg", view=False):
    """
    Render a transition-probability matrix as a left-to-right Graphviz digraph.

    Args:
        fname: output file stem (no extension)
        matrix: NumPy array or pandas DataFrame of transition PROBABILITIES
        ls_index: ordered labels (required if matrix is not a DataFrame)
        threshold: edges with weight <= threshold are not drawn
        fmt: output format, e.g. 'svg', 'png', 'pdf'
        view: open the result after rendering

    Returns:
        The Digraph that was rendered.
    """
    prob_df = _as_prob_df(matrix, labels=ls_index)
    weighted_edges = _df_to_edgelist(prob_df, threshold=threshold)

    graph = Digraph(format=fmt)
    graph.attr(rankdir="LR", size="30")
    graph.attr("node", shape="circle")

    # declare every node up front so states without edges still appear
    for state in prob_df.index:
        graph.node(str(state), width="1", height="1")

    for source, target, weight_label in weighted_edges:
        graph.edge(source, target, label=weight_label)

    graph.render(fname, view=view, cleanup=True)
    return graph
class TransitionProbMatrixTransformer(BaseEstimator, TransformerMixin):
    """Fit one transition-probability matrix over all rows of a frame."""

    def __init__(self, threshold=0.0):
        self.threshold = threshold
        # populated by fit()
        self.P_ = None
        self.labels_ = None

    def fit(self, X: pd.DataFrame, y=None):
        """Compute and store the matrix and its labels from `X`."""
        self.P_, self.labels_ = build_transition_prob_matrix(X)
        return self

    def transform(self, X: pd.DataFrame = None):
        """Return the stored (matrix, labels) pair; `X` is ignored."""
        return self.P_, self.labels_

    def render(self, fname: str, fmt="svg", view=False):
        """Draw the fitted matrix with render_graph, hiding edges at or below `threshold`."""
        is_fitted = self.P_ is not None and self.labels_ is not None
        if not is_fitted:
            raise ValueError("Transformer has not been fitted yet.")
        return render_graph(
            fname,
            self.P_,
            ls_index=self.labels_,
            threshold=self.threshold,
            fmt=fmt,
            view=view,
        )
class SessionTransitionProbMatrixTransformer(BaseEstimator, TransformerMixin):
    """Fit a separate transition-probability matrix for every session."""

    def __init__(self, threshold=0.0, session_col='sessionId'):
        self.threshold = threshold
        self.session_col = session_col
        # populated by fit(): session id -> {'matrix': ..., 'labels': ...}
        self.session_matrices_ = None

    def fit(self, X: pd.DataFrame, y=None):
        """Build one matrix per session that has more than one row."""
        if self.session_col not in X.columns:
            raise ValueError(f"Column '{self.session_col}' not found in DataFrame")

        per_session = {}
        for session_id, session_rows in X.groupby(self.session_col):
            # need at least 2 events for transitions
            if len(session_rows) <= 1:
                continue
            matrix, labels = build_transition_prob_matrix(session_rows)
            per_session[session_id] = {'matrix': matrix, 'labels': labels}

        self.session_matrices_ = per_session
        return self

    def transform(self, X: pd.DataFrame = None):
        """Return the fitted per-session entries as a Series indexed by session id."""
        if self.session_matrices_ is None:
            raise ValueError("Transformer has not been fitted yet.")
        return pd.Series(self.session_matrices_)

    def render_session(self, session_id: str, fname: str, fmt="svg", view=False):
        """Draw the matrix of one fitted session with render_graph."""
        fitted = self.session_matrices_
        if fitted is None:
            raise ValueError("Transformer has not been fitted yet.")
        if session_id not in fitted:
            raise ValueError(f"Session '{session_id}' not found in fitted data.")

        entry = fitted[session_id]
        return render_graph(
            fname,
            entry['matrix'],
            ls_index=entry['labels'],
            threshold=self.threshold,
            fmt=fmt,
            view=view,
        )
if __name__ == "__main__":
    # Demo: fit on a short hand-written event sequence, print the matrix, draw it.
    sample_events = [
        'A', 'B', 'A', 'C', 'B', 'A', 'A', 'C', 'B', 'C',
        'A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C', 'A'
    ]
    df = pd.DataFrame({'eventName': sample_events})

    transformer = TransitionProbMatrixTransformer(threshold=0.1)
    P, labels = transformer.fit(df).transform(None)

    print("Transition Probability Matrix:")
    print(pd.DataFrame(P, index=labels, columns=labels))

    # writes transition_graph.svg without opening a viewer
    transformer.render("transition_graph", fmt="svg", view=False)

View File

@@ -0,0 +1,245 @@
"""
Revenue and KPI benchmark framework for pricing strategies.
Computes session-level and aggregate metrics to compare pricing functions:
- Revenue: R_T = Σ P_t^T · Q_t
- Conversion rate
- Average order value (AOV)
- Agent exploitation loss: L_agent = R_oracle - R_observed
"""
from typing import Dict, List, Any, Optional
from dataclasses import dataclass, field, asdict
import pandas as pd
import numpy as np
@dataclass
class SessionMetrics:
    """Per-session KPI record; every field except `session_id` has a zero/empty default."""

    session_id: str
    experiment_id: Optional[str] = None

    # --- interaction counts ---
    total_interactions: int = 0
    page_views: int = 0
    item_views: int = 0
    searches: int = 0
    cart_adds: int = 0

    # --- revenue ---
    items_purchased: int = 0
    total_revenue: float = 0.0
    avg_item_price: float = 0.0
    conversion_rate: float = 0.0

    # --- pricing signals ---
    # sum of all prices displayed
    total_price_shown: float = 0.0
    # avg (price / base_price)
    avg_markup: float = 0.0

    # --- behavioral features (for agent detection) ---
    # interactions per minute
    interaction_velocity: float = 0.0
    session_duration_sec: float = 0.0
    unique_products_viewed: int = 0

    # free-form extras; each instance gets its own dict
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict, in declaration order."""
        return asdict(self)
@dataclass
class AggregateMetrics:
    """KPI totals and averages over a group of sessions; all fields default to zero/None."""

    experiment_id: Optional[str] = None
    n_sessions: int = 0

    # --- revenue ---
    total_revenue: float = 0.0
    avg_revenue_per_session: float = 0.0
    median_revenue_per_session: float = 0.0

    # --- conversion ---
    total_conversions: int = 0
    # purchases / sessions
    conversion_rate: float = 0.0

    # --- pricing ---
    avg_markup: float = 0.0
    median_markup: float = 0.0

    # --- agent exploitation ---
    # sessions flagged as agent-driven
    estimated_agent_sessions: int = 0
    agent_revenue: float = 0.0
    human_revenue: float = 0.0
    # L_agent = R_oracle - R_observed (if available)
    agent_loss: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict, in declaration order."""
        return asdict(self)
class MetricsComputer:
    """Compute session and aggregate metrics from interaction/price logs."""

    @staticmethod
    def compute_session_metrics(
        session_id: str,
        interactions: pd.DataFrame,
        price_logs: pd.DataFrame,
        purchases: Optional[pd.DataFrame] = None,
        experiment_id: Optional[str] = None
    ) -> SessionMetrics:
        """
        Compute metrics for single session.

        Values are stored as plain Python int/float so the resulting record
        serialises without NumPy scalar types.

        Args:
            session_id: session identifier
            interactions: user-interactions events for this session
            price_logs: price-logs for this session
            purchases: purchase events (if available)
            experiment_id: experiment identifier

        Returns:
            SessionMetrics; all defaults when `interactions` is empty.
        """
        metrics = SessionMetrics(session_id=session_id, experiment_id=experiment_id)
        if interactions.empty:
            return metrics

        # interaction counts
        event_counts = interactions['eventName'].value_counts().to_dict()
        item_page_views = int(event_counts.get('view_item_page', 0))
        metrics.total_interactions = len(interactions)
        # item pages are counted as page views as well
        metrics.page_views = int(event_counts.get('page_view', 0)) + item_page_views
        metrics.item_views = item_page_views
        metrics.searches = int(event_counts.get('search', 0))
        metrics.cart_adds = int(event_counts.get('add_item_to_cart', 0))

        # unique products viewed (stays 0 when the column is absent)
        if 'productId' in interactions.columns:
            metrics.unique_products_viewed = int(interactions['productId'].dropna().nunique())

        # session duration
        if 'ts' in interactions.columns:
            timestamps = pd.to_datetime(interactions['ts'])
            metrics.session_duration_sec = float((timestamps.max() - timestamps.min()).total_seconds())
            if metrics.session_duration_sec > 0:
                # events per second scaled to events per minute
                metrics.interaction_velocity = (metrics.total_interactions / metrics.session_duration_sec) * 60

        # revenue from purchases
        if purchases is not None and not purchases.empty:
            metrics.items_purchased = len(purchases)
            metrics.total_revenue = float(purchases['price'].sum()) if 'price' in purchases.columns else 0.0
            metrics.avg_item_price = metrics.total_revenue / metrics.items_purchased if metrics.items_purchased > 0 else 0.0
            metrics.conversion_rate = 1.0 if metrics.items_purchased > 0 else 0.0

        # pricing metrics
        if not price_logs.empty:
            metrics.total_price_shown = float(price_logs['price'].sum())
            # compute markup if base_price available in price logs
            if 'base_price' in price_logs.columns:
                # rows with a non-positive base price cannot yield a ratio
                valid_markup = price_logs[price_logs['base_price'] > 0]
                if not valid_markup.empty:
                    metrics.avg_markup = float((valid_markup['price'] / valid_markup['base_price']).mean())
        return metrics

    @staticmethod
    def compute_aggregate_metrics(
        session_metrics_list: List[SessionMetrics],
        experiment_id: Optional[str] = None,
        agent_detector_fn: Optional[callable] = None
    ) -> AggregateMetrics:
        """
        Aggregate metrics across sessions.

        Args:
            session_metrics_list: list of SessionMetrics
            experiment_id: experiment identifier
            agent_detector_fn: optional function to classify session as agent (returns bool)

        Returns:
            AggregateMetrics with plain Python numbers; only `n_sessions` (0)
            is set when the list is empty.
        """
        agg = AggregateMetrics(experiment_id=experiment_id)
        agg.n_sessions = len(session_metrics_list)
        if agg.n_sessions == 0:
            return agg

        df = pd.DataFrame([m.to_dict() for m in session_metrics_list])

        # revenue aggregates
        revenue = df['total_revenue']
        agg.total_revenue = float(revenue.sum())
        agg.avg_revenue_per_session = float(revenue.mean())
        agg.median_revenue_per_session = float(revenue.median())

        # conversion aggregates: a session converts if it bought anything
        agg.total_conversions = int((df['items_purchased'] > 0).sum())
        agg.conversion_rate = agg.total_conversions / agg.n_sessions

        # pricing aggregates; a markup of 0 means "not computed" and is skipped
        valid_markups = df[df['avg_markup'] > 0]
        if not valid_markups.empty:
            agg.avg_markup = float(valid_markups['avg_markup'].mean())
            agg.median_markup = float(valid_markups['avg_markup'].median())

        # agent detection (if detector provided)
        if agent_detector_fn is not None:
            agent_flags = [bool(agent_detector_fn(m)) for m in session_metrics_list]
            agg.estimated_agent_sessions = sum(agent_flags)
            agent_revenue = 0.0
            human_revenue = 0.0
            # split revenue between flagged and unflagged sessions in one pass
            for m, is_agent in zip(session_metrics_list, agent_flags):
                if is_agent:
                    agent_revenue += m.total_revenue
                else:
                    human_revenue += m.total_revenue
            agg.agent_revenue = float(agent_revenue)
            agg.human_revenue = float(human_revenue)
        return agg

    @staticmethod
    def _relative_lift_pct(values: pd.Series, baseline: float) -> pd.Series:
        """Percentage change of `values` against `baseline`; NaN everywhere when the baseline is 0."""
        if baseline == 0:
            # a lift relative to zero is undefined; avoid dividing by zero
            return pd.Series(np.nan, index=values.index)
        return (values - baseline) / baseline * 100

    @staticmethod
    def compare_pricing_strategies(
        experiments: Dict[str, List[SessionMetrics]],
        baseline_experiment_id: Optional[str] = None
    ) -> pd.DataFrame:
        """
        Compare multiple pricing strategies/experiments.

        Args:
            experiments: dict mapping experiment_id -> list of SessionMetrics
            baseline_experiment_id: experiment to use as baseline for comparison

        Returns:
            DataFrame with one row of aggregate metrics per experiment. When
            the baseline id is one of the experiments, 'revenue_lift_pct' and
            'conversion_lift_pct' columns are added; a lift is NaN when the
            corresponding baseline value is 0.
        """
        results = []
        baseline_agg = None
        for exp_id, session_metrics in experiments.items():
            agg = MetricsComputer.compute_aggregate_metrics(session_metrics, experiment_id=exp_id)
            if exp_id == baseline_experiment_id:
                baseline_agg = agg
            results.append(agg.to_dict())

        df = pd.DataFrame(results)

        # add relative metrics if baseline exists
        if baseline_agg is not None:
            df['revenue_lift_pct'] = MetricsComputer._relative_lift_pct(
                df['total_revenue'], baseline_agg.total_revenue
            )
            df['conversion_lift_pct'] = MetricsComputer._relative_lift_pct(
                df['conversion_rate'], baseline_agg.conversion_rate
            )
        return df
def simple_agent_detector(session_metrics: SessionMetrics, velocity_threshold: float = 5.0) -> bool:
    """
    Heuristically flag a session as agent-driven.

    A session is flagged when either rule holds:
      1. its interaction velocity is above `velocity_threshold`, or
      2. it viewed more than 10 distinct products and its conversion rate is 0.

    Args:
        session_metrics: SessionMetrics instance
        velocity_threshold: interactions per minute threshold (default: 5.0)

    Returns:
        True if session likely agent-driven
    """
    # rule 1: unusually fast interaction pace
    is_fast = session_metrics.interaction_velocity > velocity_threshold
    if is_fast:
        return True

    # rule 2: broad browsing that never converts
    if not (session_metrics.unique_products_viewed > 10 and session_metrics.conversion_rate == 0):
        return False
    return True

View File

@@ -1,90 +0,0 @@
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
import logging
log = logging.getLogger(__name__)
from extract import KafkaDataFetcher, ExperimentJoiner, EventTitleAugmenter, chunk_shared_data
from mapping import SessionTransitionProbMatrixTransformer, render_graph
from demand import DemandEstimator, ChunkInteractionsIntoSteps
from elasticity import TemporalElasticityEstimator, aggregate_price_logs
# elasticity pipeline components (not sklearn compatible, manual orchestration)
def elasticity_pipeline(interactions_df, price_logs_df, window_size='30s', store_mode='hotel'):
    """
    Estimate price elasticity from interaction and price data.

    Chains four stages by hand: window the interactions, score demand per
    window, aggregate the price logs per window, then hand both to the
    temporal elasticity estimator.

    Args:
        interactions_df: raw interaction data from demand_data_pipeline
        price_logs_df: price log data from price_data_pipeline
        window_size: time window for chunking
        store_mode: 'hotel' or 'airline'

    Returns:
        None when the interactions produce no windows; otherwise whatever
        TemporalElasticityEstimator.transform returns (described upstream as
        a df with [productId, elasticity, std_error, n_obs]).
    """
    # stage 1: time-window the interactions
    interaction_chunks = ChunkInteractionsIntoSteps(
        window_size=window_size, return_metadata=True
    ).transform(interactions_df)
    log.info(f"Chunked interactions into {len(interaction_chunks)} windows of size {window_size}")
    if not interaction_chunks:
        return None

    # stage 2: one demand vector per window
    # (each has a full list of all products, even if demand is 0)
    demand_estimator = DemandEstimator(store_mode=store_mode)
    demand_chunks = [
        {
            'window_start': chunk['window_start'],
            'window_end': chunk['window_end'],
            'demand_vector': demand_estimator.transform(chunk['data']),
        }
        for chunk in interaction_chunks
    ]

    # stage 3: price logs aggregated over the same window size
    price_chunks = aggregate_price_logs(price_logs_df, window_size=window_size)

    # stage 4: elasticity from the demand and price chunks
    estimator = TemporalElasticityEstimator(method='point', min_observations=2)
    return estimator.transform(demand_chunks, price_chunks, store_mode=store_mode)
# exposable pipelines
# Fetches the 'user-interactions' topic, joins experiment data, then augments event names.
interaction_pipeline = Pipeline([
('kafka_fetch', KafkaDataFetcher(topic='user-interactions')),
('experiment_join', ExperimentJoiner()),
('event_augment', EventTitleAugmenter()),
])
# Fetches the 'price-logs' topic with no further processing.
price_data_pipeline = Pipeline([
('kafka_fetch', KafkaDataFetcher(topic='price-logs')),
])
# interaction_data + price_data -> elasticity (demand)
# elasticity -> pricing
# Currently a single demand-estimation step built with default arguments.
pricing_pipeline = Pipeline([
('demand_estimation', DemandEstimator()),
])
if __name__ == "__main__":
    # Local import: this module never imported numpy, so the `np.array([])`
    # fallback below raised NameError whenever no elasticity was returned.
    import numpy as np

    # fetch both datasets
    interaction_data = interaction_pipeline.fit_transform(None)
    pricing_data = price_data_pipeline.fit_transform(None)

    if interaction_data.empty or pricing_data.empty:
        print("Insufficient data for elasticity computation")
        # SystemExit instead of exit(): exit() is only injected by the site module
        raise SystemExit(0)

    # compute elasticity via unified pipeline
    window_size = "30s"
    elasticity_results = elasticity_pipeline(interaction_data, pricing_data, window_size=window_size)

    # elasticity_pipeline returns None when no time windows could be built
    elasticity_value_array = elasticity_results['elasticity'].values if elasticity_results is not None else np.array([])
    print(elasticity_value_array)

    if elasticity_results is not None and not elasticity_results.empty:
        print(elasticity_results.to_string(index=False))
    else:
        print("\nInsufficient data for elasticity computation")

View File

@@ -0,0 +1,138 @@
from sklearn.pipeline import Pipeline
import pandas as pd
from procesing.context import PipelineContext
from procesing.providers import SupabaseProvider, BackendAPIProvider
from typing import Union
from procesing.steps import (
FetchInteractionsStep,
FetchPriceLogsStep,
FetchExperimentsStep,
JoinExperimentsStep,
CreatePriceBucketsStep,
AugmentEventNamesStep,
ChunkByTimeWindowStep,
ComputeDemandForChunksStep,
AggregatePriceLogsStep,
ComputeElasticityStep,
BuildStateSpaceStep,
FitPricingFunctionStep,
PredictPricesStep,
)
def interaction_extraction_pipeline(context: PipelineContext):
    """Build the sklearn Pipeline that fetches interactions, buckets prices and augments event names."""
    steps = [
        ('fetch', FetchInteractionsStep(context)),
        ('create_buckets', CreatePriceBucketsStep(context)),
        ('augment_events', AugmentEventNamesStep(context)),
    ]
    return Pipeline(steps)
def price_extraction_pipeline(context: PipelineContext):
    """Build the single-step sklearn Pipeline that fetches price logs."""
    fetch_step = ('fetch', FetchPriceLogsStep(context))
    return Pipeline([fetch_step])
def elasticity_computation_pipeline(context: PipelineContext,
                                    interactions_df: pd.DataFrame,
                                    price_logs_df: pd.DataFrame):
    """
    Derive elasticity estimates from interactions and price logs.

    The two inputs are processed on separate branches and only meet in the
    final step, which is why the steps are orchestrated by hand instead of
    being chained in a linear sklearn Pipeline.
    """
    # Branch 1: interactions -> time-window chunks -> demand per chunk.
    interaction_chunks = ChunkByTimeWindowStep(context).transform(interactions_df)
    demand_chunks = ComputeDemandForChunksStep(context).transform(interaction_chunks)

    # Branch 2: price logs -> aggregated price chunks.
    price_chunks = AggregatePriceLogsStep(context).transform(price_logs_df)

    # Convergence: both branches feed the elasticity computation as a tuple.
    return ComputeElasticityStep(context).transform((demand_chunks, price_chunks))
def pricing_pipeline(context: PipelineContext, elasticity_df: pd.DataFrame):
    """
    Turn elasticity estimates into predicted prices.

    The same ``elasticity_df`` is used twice: once to build the state space
    and once to fit the pricing function. The fitted pricer and the state
    space are then handed together to the prediction step.
    """
    state_space = BuildStateSpaceStep(context).transform(elasticity_df)
    pricer = FitPricingFunctionStep(context).transform(elasticity_df)
    return PredictPricesStep(context).transform((pricer, state_space))
def full_pipeline(context: PipelineContext):
    """
    Run the complete flow: data extraction -> elasticity -> pricing.

    Returns:
        A tuple ``(elasticity_df, prices_df)``:
        - ``(None, None)`` when either extraction yields no data;
        - ``(elasticity_df, None)`` when the elasticity result is None or
          empty, so no prices can be generated;
        - both frames otherwise.
    """
    # Both extraction pipelines start from nothing, hence the None input.
    interactions_df = interaction_extraction_pipeline(context).fit_transform(None)
    price_logs_df = price_extraction_pipeline(context).fit_transform(None)

    # Treat a None result like an empty one, mirroring the guard applied to
    # the elasticity result below, instead of failing on `.empty`.
    if any(df is None or df.empty for df in (interactions_df, price_logs_df)):
        return None, None

    elasticity_df = elasticity_computation_pipeline(
        context,
        interactions_df,
        price_logs_df,
    )
    if elasticity_df is None or elasticity_df.empty:
        return elasticity_df, None

    prices_df = pricing_pipeline(context, elasticity_df)
    return elasticity_df, prices_df
if __name__ == '__main__':
    class Provider(SupabaseProvider, BackendAPIProvider):
        """Combined provider: both parent initialisers are invoked explicitly."""

        def __init__(self, backend_url: str):
            SupabaseProvider.__init__(self)
            BackendAPIProvider.__init__(self, backend_url=backend_url)

    # Example run against a local backend.
    provider = Provider(backend_url="http://localhost:5000")
    context = PipelineContext(provider=provider, store_mode='hotel')

    elasticity_df, prices_df = full_pipeline(context)

    if elasticity_df is None or elasticity_df.empty:
        print("No elasticity estimates computed.")
    else:
        print("Elasticity Estimates:")
        print(elasticity_df.to_string(index=False))

    if prices_df is None or prices_df.empty:
        print("No prices predicted.")
    else:
        print("\nPredicted Prices:")
        print(prices_df.to_string(index=False))

View File

@@ -0,0 +1,13 @@
from procesing.pricers.base import PricingFunction
from procesing.pricers.elasticity import ElasticityBasedPricer
from procesing.pricers.simple import StaticPricer, RandomPricer
from procesing.pricers.session_aware import SessionAwarePricer, ProductSpecificSessionPricer
__all__ = [
'PricingFunction',
'ElasticityBasedPricer',
'StaticPricer',
'RandomPricer',
'SessionAwarePricer',
'ProductSpecificSessionPricer'
]

View File

@@ -0,0 +1,70 @@
from abc import ABC, abstractmethod
from typing import Optional, Dict, Any, List
import numpy as np
import pandas as pd
class PricingFunction(ABC):
    """
    Abstract contract for pricing functions.

    A pricing function maps f(Q_t, P_t, S_t, H_t) -> P_{t+1}, where:
        Q_t ∈ R^n: demand vector at time t
        P_t ∈ R^n: price vector at time t
        S_t: session features (behavioral signals, interactions)
        H_t = {Q_{t-k}, P_{t-k}, S_{t-k}}: historical state trajectory

    Objective:
        maximize E[R_T] = E[Σ P_t^T · Q_t]
        subject to:
            Q_t = g(P_t, S_t)  (demand response via elasticity)
            P_t ≥ C            (cost floor)
        minimize L_agent = R_oracle - R_observed

    Subclasses must implement ``fit`` and ``predict``; ``update``,
    ``get_params`` and ``set_params`` have no-op defaults.
    """

    @abstractmethod
    def fit(self, historical_data: pd.DataFrame, **kwargs):
        """
        Train offline on historical data.

        Args:
            historical_data: DataFrame with elasticity, prices, demand signals
            **kwargs: additional training parameters
        """

    @abstractmethod
    def predict(self, state_space) -> np.ndarray:
        """
        Produce prices for the current state.

        Args:
            state_space: StateSpace object containing Q_t, P_t, S_t, H_t

        Returns:
            P_{t+1}: price vector in R^n
        """

    def update(self, observation: Dict[str, Any]):
        """
        Optional online-learning hook; the default does nothing.

        Args:
            observation: dict with {state, action, reward, next_state}
                - state: StateSpace before pricing decision
                - action: prices shown (P_t)
                - reward: revenue/conversion signal
                - next_state: StateSpace after user interaction
        """
        return None  # default: no online learning

    def get_params(self) -> Dict[str, Any]:
        """Return the parameters to serialize; empty by default."""
        return dict()

    def set_params(self, params: Dict[str, Any]):
        """Load parameters from a dict; the default ignores them."""
        return None

View File

@@ -0,0 +1,59 @@
import numpy as np
import pandas as pd
from procesing.pricers.base import PricingFunction
class ElasticityBasedPricer(PricingFunction):
    """
    Pricing based on demand elasticity estimates.

    f(Q, S) = base_price * (1 + alpha * |elasticity| * demand_deviation)

    where demand_deviation is the relative deviation of current demand from
    the fitted mean demand. The result is clipped to
    [price_floor, price_ceil].
    """

    # Fallbacks used by fit() when the corresponding column is absent.
    DEFAULT_BASE_PRICE = 100
    DEFAULT_MEAN_DEMAND = 10

    def __init__(self, alpha: float = 0.1, price_floor: float = 0.0, price_ceil: float = np.inf):
        self.alpha = alpha
        self.price_floor = price_floor
        self.price_ceil = price_ceil
        # Fitted state, populated by fit().
        self.elasticity = None
        self.base_prices = None
        self.mean_demand = None

    def fit(self, historical_data: pd.DataFrame, **kwargs):
        """
        Calibrate from historical elasticity estimates.

        Args:
            historical_data: one row per product. 'elasticity' is required;
                'base_price' and 'mean_demand' are optional and default to
                100 and 10 per row respectively.
            **kwargs: accepted for compatibility with the abstract
                ``PricingFunction.fit`` signature; ignored here.

        Returns:
            self

        Raises:
            ValueError: if the 'elasticity' column is missing.
        """
        if 'elasticity' not in historical_data.columns:
            raise ValueError("historical_data must contain 'elasticity' column")

        n_rows = len(historical_data)
        self.elasticity = historical_data['elasticity'].values
        self.base_prices = (historical_data['base_price'].values
                            if 'base_price' in historical_data.columns
                            else np.ones(n_rows) * self.DEFAULT_BASE_PRICE)
        self.mean_demand = (historical_data['mean_demand'].values
                            if 'mean_demand' in historical_data.columns
                            else np.ones(n_rows) * self.DEFAULT_MEAN_DEMAND)
        return self

    def predict(self, state_space) -> np.ndarray:
        """
        Adjust base prices by demand deviation, scaled by |elasticity|.

        Demand above the fitted mean raises the price and demand below it
        lowers the price; the size of the adjustment grows with |elasticity|.

        Args:
            state_space: object exposing ``demand``, aligned with the rows
                passed to fit().

        Returns:
            Price vector clipped to [price_floor, price_ceil].

        Raises:
            ValueError: if fit() has not been called, or if the demand
                vector length differs from the fitted elasticity length.
        """
        if self.elasticity is None:
            raise ValueError("Must call fit() before predict()")

        demand = np.asarray(state_space.demand)
        if len(demand) != len(self.elasticity):
            raise ValueError(f"Demand vector size {len(demand)} != elasticity size {len(self.elasticity)}")

        # Relative deviation from mean demand; the epsilon avoids division
        # by zero for products whose mean demand is 0.
        demand_dev = (demand - self.mean_demand) / (self.mean_demand + 1e-6)

        # NOTE(review): the original comments said elastic goods should be
        # increased *less* than inelastic ones, but scaling by |elasticity|
        # does the opposite (larger |elasticity| -> larger adjustment).
        # Behavior is left unchanged; confirm the intended direction.
        price_multiplier = 1 + self.alpha * np.abs(self.elasticity) * demand_dev
        prices = self.base_prices * price_multiplier

        # Enforce bounds.
        return np.clip(prices, self.price_floor, self.price_ceil)

View File

@@ -0,0 +1,172 @@
"""
Session-aware pricing functions that leverage behavioral features S_t.
These pricers aim to minimize L_agent = R_oracle - R_observed.
"""
import numpy as np
import pandas as pd
from procesing.pricers.base import PricingFunction
from procesing.pricers.elasticity import ElasticityBasedPricer
class SessionAwarePricer(PricingFunction):
    """
    Extends elasticity-based pricing with session behavioral signals.

    f(Q, P, S) = base_price * elasticity_factor * session_factor

    session_factor is built from the first row of the session features:
        - interaction_velocity (agent detection proxy)
        - product_view_depth (interest signal)
        - cart_to_view_ratio (conversion intent)

    Strategy: charge higher prices to suspected agents (high velocity)
    to recover oracle revenue from reconnaissance sessions.
    """

    def __init__(self,
                 alpha: float = 0.1,
                 beta_velocity: float = 0.05,
                 beta_attention: float = 0.03,
                 agent_velocity_threshold: float = 5.0,
                 agent_markup: float = 1.2,
                 price_floor: float = 0.0,
                 price_ceil: float = np.inf,
                 cart_ratio_threshold: float = 0.1,
                 cart_boost: float = 0.02):
        """
        Args:
            alpha: elasticity sensitivity
            beta_velocity: interaction velocity weight
            beta_attention: product attention weight
            agent_velocity_threshold: velocity above which to apply agent markup
            agent_markup: price multiplier for suspected agent sessions
            price_floor, price_ceil: price bounds
            cart_ratio_threshold: cart_to_view_ratio above which the
                conversion-intent boost applies (previously hard-coded 0.1)
            cart_boost: relative price increase for conversion intent
                (previously hard-coded 0.02)
        """
        self.alpha = alpha
        # NOTE(review): beta_velocity is stored but never read by predict();
        # velocity only triggers the fixed agent_markup. Confirm intent.
        self.beta_velocity = beta_velocity
        self.beta_attention = beta_attention
        self.agent_velocity_threshold = agent_velocity_threshold
        self.agent_markup = agent_markup
        self.price_floor = price_floor
        self.price_ceil = price_ceil
        self.cart_ratio_threshold = cart_ratio_threshold
        self.cart_boost = cart_boost

        # Fitted parameters, populated by fit().
        self.elasticity = None
        self.base_prices = None
        self.mean_demand = None

    def fit(self, historical_data: pd.DataFrame, **kwargs):
        """
        Calibrate from historical elasticity data.

        'elasticity' is required; 'base_price' and 'mean_demand' default to
        100 and 10 per row when their columns are absent.

        Raises:
            ValueError: if the 'elasticity' column is missing.
        """
        if 'elasticity' not in historical_data.columns:
            raise ValueError("historical_data must contain 'elasticity'")

        n_rows = len(historical_data)
        self.elasticity = historical_data['elasticity'].values
        self.base_prices = (historical_data['base_price'].values
                            if 'base_price' in historical_data.columns
                            else np.ones(n_rows) * 100)
        self.mean_demand = (historical_data['mean_demand'].values
                            if 'mean_demand' in historical_data.columns
                            else np.ones(n_rows) * 10)
        return self

    def predict(self, state_space) -> np.ndarray:
        """
        Generate prices with session awareness.

        Args:
            state_space: object exposing ``demand`` and, optionally,
                ``session_features`` (a DataFrame; only its first row is
                used). A missing or None ``session_features`` is treated
                as "no session signal".

        Returns:
            Price vector clipped to [price_floor, price_ceil].

        Raises:
            ValueError: if fit() has not been called, or if the demand
                vector length differs from the fitted elasticity length.
        """
        if self.elasticity is None:
            raise ValueError("Must call fit() before predict()")

        demand = np.asarray(state_space.demand)
        n_products = len(demand)
        # Same explicit alignment check as ElasticityBasedPricer, instead of
        # relying on numpy broadcasting to fail (or silently succeed).
        if n_products != len(self.elasticity):
            raise ValueError(f"Demand vector size {n_products} != elasticity size {len(self.elasticity)}")

        # Base elasticity-driven pricing.
        demand_dev = (demand - self.mean_demand) / (self.mean_demand + 1e-6)
        elasticity_factor = 1 + self.alpha * np.abs(self.elasticity) * demand_dev

        # Session-aware adjustments; one multiplier applied to all products.
        session_factor = np.ones(n_products)
        features = getattr(state_space, 'session_features', None)
        if features is not None and not features.empty:
            sf = features.iloc[0]  # single session features

            # Agent detection via velocity: suspected agents get the markup.
            velocity = sf.get('interaction_velocity', 0.0)
            if velocity > self.agent_velocity_threshold:
                session_factor *= self.agent_markup

            # Attention signal: deeper viewing -> log-scaled boost.
            view_depth = sf.get('product_view_depth', 0)
            if view_depth > 0:
                session_factor *= 1 + self.beta_attention * np.log1p(view_depth)

            # Cart presence: small boost for conversion intent.
            cart_to_view = sf.get('cart_to_view_ratio', 0.0)
            if cart_to_view > self.cart_ratio_threshold:
                session_factor *= 1 + self.cart_boost

        prices = self.base_prices * elasticity_factor * session_factor
        return np.clip(prices, self.price_floor, self.price_ceil)
class ProductSpecificSessionPricer(PricingFunction):
    """
    Session-aware pricer with product-specific demand signals.

    Intended strategy: products viewed multiple times = high interest ->
    price up, using per-product interaction counts extracted from S_t.

    Current state: the per-product adjustment is not implemented yet, so
    predict() returns plain elasticity-based prices; ``view_boost`` and
    ``max_view_boost`` are stored but not used.
    """

    def __init__(self,
                 alpha: float = 0.1,
                 view_boost: float = 0.02,
                 max_view_boost: float = 0.15,
                 price_floor: float = 0.0,
                 price_ceil: float = np.inf):
        self.alpha = alpha
        self.view_boost = view_boost
        self.max_view_boost = max_view_boost
        self.price_floor = price_floor
        self.price_ceil = price_ceil
        # Fitted parameters, populated by fit().
        self.elasticity = None
        self.base_prices = None
        self.mean_demand = None
        self.product_ids = None

    def fit(self, historical_data: pd.DataFrame, **kwargs):
        """
        Calibrate from historical elasticity data.

        'elasticity' and 'productId' are required; 'base_price' and
        'mean_demand' default to 100 and 10 per row when absent.

        Raises:
            ValueError: if 'elasticity' or 'productId' is missing.
        """
        columns = historical_data.columns
        if 'elasticity' not in columns or 'productId' not in columns:
            raise ValueError("historical_data must contain 'elasticity' and 'productId'")

        n_rows = len(historical_data)
        self.elasticity = historical_data['elasticity'].values
        self.base_prices = (historical_data['base_price'].values
                            if 'base_price' in columns
                            else np.ones(n_rows) * 100)
        self.mean_demand = (historical_data['mean_demand'].values
                            if 'mean_demand' in columns
                            else np.ones(n_rows) * 10)
        self.product_ids = historical_data['productId'].values
        return self

    def predict(self, state_space) -> np.ndarray:
        """
        Return elasticity-adjusted prices clipped to the configured bounds.

        Raises:
            ValueError: if fit() has not been called, or if the demand
                vector length differs from the fitted elasticity length.
        """
        if self.elasticity is None:
            raise ValueError("Must call fit() before predict()")

        demand = np.asarray(state_space.demand)
        if len(demand) != len(self.elasticity):
            raise ValueError(f"Demand vector size {len(demand)} != elasticity size {len(self.elasticity)}")

        # Base pricing.
        demand_dev = (demand - self.mean_demand) / (self.mean_demand + 1e-6)
        base_prices = self.base_prices * (1 + self.alpha * np.abs(self.elasticity) * demand_dev)

        # TODO: extend session feature extraction to include product-specific
        # counts, then apply a per-product boost here (view_boost capped by
        # max_view_boost). The previous placeholder branch had no effect but
        # dereferenced state_space.session_features, so it could fail when
        # that attribute was None; it has been removed until implemented.

        return np.clip(base_prices, self.price_floor, self.price_ceil)

View File

@@ -0,0 +1,48 @@
import numpy as np
import pandas as pd
from procesing.pricers.base import PricingFunction
class StaticPricer(PricingFunction):
    """Static pricing: always return fixed base prices."""

    def __init__(self, base_prices: np.ndarray = None):
        self.base_prices = base_prices

    def fit(self, historical_data: pd.DataFrame, **kwargs):
        """
        Extract base prices from historical data.

        Uses the 'base_price' column when present, otherwise 'price'.

        Args:
            historical_data: DataFrame holding the price column.
            **kwargs: accepted for compatibility with the abstract
                ``PricingFunction.fit`` signature; ignored here.

        Raises:
            ValueError: if neither column exists.
        """
        for column in ('base_price', 'price'):
            if column in historical_data.columns:
                self.base_prices = historical_data[column].values
                return self
        raise ValueError("historical_data must contain 'base_price' or 'price' column")

    def predict(self, state_space) -> np.ndarray:
        """
        Return the static base prices regardless of state.

        A fresh array is returned on every call so callers cannot mutate the
        stored prices; np.array also turns a list passed to the constructor
        into the ndarray promised by the return annotation.

        Raises:
            ValueError: if no base prices are available.
        """
        if self.base_prices is None:
            raise ValueError("Must call fit() or provide base_prices in constructor")
        return np.array(self.base_prices)
class RandomPricer(PricingFunction):
    """Random pricing within bounds (for baseline comparison)."""

    def __init__(self, price_min: float = 50.0, price_max: float = 500.0, seed: int = None):
        self.price_min = price_min
        self.price_max = price_max
        self.seed = seed
        # Set by fit(); when left as None, predict() sizes its output from
        # the state it is given.
        self.n_products = None
        self.rng = np.random.default_rng(seed)

    def fit(self, historical_data: pd.DataFrame, **kwargs):
        """
        Learn the number of products (one per row of ``historical_data``).

        ``**kwargs`` is accepted for compatibility with the abstract
        ``PricingFunction.fit`` signature and ignored.
        """
        self.n_products = len(historical_data)
        return self

    def predict(self, state_space) -> np.ndarray:
        """
        Draw uniform random prices in [price_min, price_max).

        The output length is the fitted product count; an unfitted pricer
        follows the length of ``state_space.demand`` on each call, without
        permanently caching the size of the first state it sees.
        """
        n_products = self.n_products
        if n_products is None:
            n_products = len(state_space.demand)
        return self.rng.uniform(self.price_min, self.price_max, size=n_products)

View File

@@ -35,8 +35,9 @@ from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
import os
from dotenv import load_dotenv
load_dotenv()
from supabase import create_client, Client
from pipeline import interaction_pipeline, price_data_pipeline, elasticity_pipeline
SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL", "")
SUPABASE_KEY = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY", "")
@@ -79,18 +80,136 @@ class PricingFunction(BaseEstimator, TransformerMixin, ABC):
class SimpleLinearPricingFunction(PricingFunction):
    """Linear price update: P_{t+1} = max(P_t + sensitivity * Q_t, 0)."""

    def __init__(self, price_sensitivity: float = -0.1):
        super().__init__()
        # Coefficient applied to demand (assigned once; the duplicated
        # assignment from the interleaved diff lines is dropped).
        self.price_sensitivity = price_sensitivity

    def fit(self, historical_data):
        """No training needed; returns self."""
        return self

    def transform(self, state_space: StateSpace) -> np.ndarray:
        """Return the adjusted prices, floored at zero."""
        # Simple linear adjustment: P_{t+1} = P_t + sensitivity * Q_t
        new_prices = state_space.prices + self.price_sensitivity * state_space.demand
        return np.maximum(new_prices, 0)
class ElasticityBasedPricingFunction(PricingFunction):
    """
    Elasticity-driven pricing with bounded markups.

    Per product, with elasticity ε looked up by productId (0.0 if unknown):
        ε < -1       (elastic):   P = P_0 * (1 + 1/ε)
        -1 < ε < 0   (inelastic): P = P_0 * inelastic_markup
        otherwise    (ε ≥ 0, ε == -1, NaN, or no data): P = P_0
    The result is then clipped to [P_0 * min_markup, P_0 * max_markup] and
    finally raised to at least ``cost_floor``.

    NOTE(review): for ε < -1 the factor (1 + 1/ε) lies in (0, 1), so with
    the default min_markup=1.0 (and positive base prices) the clip brings
    the elastic case back to the base price. Confirm this is intended.
    """

    def __init__(self,
                 cost_floor: float = 0.5,
                 max_markup: float = 2.0,
                 min_markup: float = 1.0,
                 inelastic_markup: float = 1.3):
        super().__init__()
        # NOTE(review): originally commented as "prices as fraction of
        # base", but transform() applies it as an absolute lower bound.
        self.cost_floor = cost_floor
        self.max_markup = max_markup              # max price = base * max_markup
        self.min_markup = min_markup              # min price = base * min_markup
        self.inelastic_markup = inelastic_markup  # default for |ε| < 1
        self.elasticity_map = {}                  # productId -> elasticity

    def fit(self, elasticity_df: pd.DataFrame):
        """
        Store the productId -> elasticity mapping.

        Args:
            elasticity_df: df with [productId, elasticity, std_error, n_obs].
                A None or empty frame leaves the current mapping untouched.

        Returns:
            self
        """
        if elasticity_df is not None and not elasticity_df.empty:
            self.elasticity_map = dict(zip(
                elasticity_df['productId'],
                elasticity_df['elasticity'],
            ))
        return self

    def transform(self, state_space: StateSpace, product_ids: np.ndarray = None) -> np.ndarray:
        """
        Compute next prices from base prices and fitted elasticities.

        Args:
            state_space: current state (prices = base prices)
            product_ids: array of productIds aligned with state_space.prices;
                defaults to positional indices (not ideal).

        Returns:
            Float array of prices P_{t+1}.
        """
        # Work in float: np.zeros_like on integer base prices would create
        # an integer output and silently truncate every computed price.
        base_prices = np.asarray(state_space.prices, dtype=float)

        if product_ids is None:
            # Fallback: use positional index as productId (not ideal).
            product_ids = np.arange(len(base_prices))

        new_prices = np.zeros_like(base_prices)
        for i, (base_p, pid) in enumerate(zip(base_prices, product_ids)):
            elasticity = self.elasticity_map.get(pid, 0.0)

            if elasticity < -1:  # elastic demand
                optimal_p = base_p * (1 + (1 / elasticity))
            elif -1 < elasticity < 0:  # inelastic: conservative markup
                optimal_p = base_p * self.inelastic_markup
            else:
                # No elasticity data or anomalous value: keep base price.
                optimal_p = base_p

            # Apply relative bounds, then the absolute floor.
            optimal_p = np.clip(
                optimal_p,
                base_p * self.min_markup,
                base_p * self.max_markup,
            )
            new_prices[i] = max(optimal_p, self.cost_floor)

        return new_prices
class ContextualElasticityPricing(PricingFunction):
    """
    Elasticity-based prices multiplied by a demand-driven surge factor.

    The wrapped base pricer supplies the elasticity-based price; products
    whose demand, normalized by the maximum demand, exceeds
    ``surge_threshold`` get a proportional uplift.
    """

    def __init__(self,
                 base_pricer: ElasticityBasedPricingFunction = None,
                 demand_sensitivity: float = 0.1,
                 surge_threshold: float = 0.7):
        super().__init__()
        self.base_pricer = base_pricer or ElasticityBasedPricingFunction()
        self.demand_sensitivity = demand_sensitivity
        self.surge_threshold = surge_threshold

    def fit(self, elasticity_df: pd.DataFrame):
        """Fit the wrapped base pricer and return self."""
        self.base_pricer.fit(elasticity_df)
        return self

    def transform(self, state_space: StateSpace, product_ids: np.ndarray = None) -> np.ndarray:
        """Return base prices scaled by the per-product surge multiplier."""
        # Elasticity-based prices from the wrapped pricer.
        base_optimal = self.base_pricer.transform(state_space, product_ids)

        # No demand at all: neutral multiplier.
        if not len(state_space.demand) > 0:
            return base_optimal * np.ones_like(base_optimal)

        # Normalize by peak demand (epsilon guards an all-zero vector) and
        # surge only on the share above the threshold.
        demand_normalized = state_space.demand / (state_space.demand.max() + 1e-8)
        excess = np.maximum(demand_normalized - self.surge_threshold, 0)
        surge_multiplier = 1 + self.demand_sensitivity * excess
        return base_optimal * surge_multiplier
# Example usage (runs only when the module is executed as a script).
# NOTE(review): this view ends at a diff hunk boundary; the block may
# continue beyond the lines shown here.
if __name__ == "__main__":
    # Imported inside the guard, so a plain import of this module does not
    # import the pipeline module.
    from pipeline import interaction_pipeline, price_data_pipeline, elasticity_pipeline
    store_mode = 'hotel'
    # Both pipelines are invoked with None as their input.
    interaction_data = interaction_pipeline.fit_transform(None)
    price_data = price_data_pipeline.fit_transform(None)

View File

@@ -0,0 +1,5 @@
from procesing.providers.base import DataProvider
from procesing.providers.supabase import SupabaseProvider
from procesing.providers.backend import BackendAPIProvider
__all__ = ['DataProvider', 'SupabaseProvider', 'BackendAPIProvider']

View File

@@ -0,0 +1,19 @@
import os
import pandas as pd
import requests
from typing import List
from procesing.providers.base import DataProvider
class BackendAPIProvider(DataProvider):
    """
    Backend API implementation of the Kafka-topic part of DataProvider.

    Only ``fetch_kafka_topic`` is implemented here; the remaining abstract
    methods of DataProvider must come from another class for a concrete
    provider to be instantiable.
    """

    # Class-level default so the attribute exists even when a subclass
    # skips this __init__.
    timeout: float = 30.0

    def __init__(self, backend_url: str = None, timeout: float = 30.0):
        """
        Args:
            backend_url: base URL of the backend; falls back to the
                BACKEND_URL environment variable, then to
                "http://localhost:5000".
            timeout: seconds to wait for the backend before the request
                fails instead of blocking indefinitely.
        """
        self.backend_url = backend_url or os.getenv("BACKEND_URL", "http://localhost:5000")
        self.timeout = timeout

    def fetch_kafka_topic(self, topic: str) -> pd.DataFrame:
        """
        Fetch the dump of ``topic`` from the backend.

        Returns:
            DataFrame built from the payload's 'data' field, or an empty
            DataFrame when the payload is not a successful, non-empty result.

        Raises:
            requests.HTTPError: on a non-2xx response.
            requests.Timeout: when the backend does not answer in time.
        """
        # Pass the topic through `params` so it is URL-encoded, rather than
        # interpolating it into the query string by hand.
        resp = requests.get(
            f"{self.backend_url}/api/kafka/dump",
            params={"topic": topic},
            timeout=self.timeout,
        )
        resp.raise_for_status()

        payload = resp.json()
        # A non-dict payload (e.g. a bare list) is treated as "no data"
        # instead of failing on .get().
        if not isinstance(payload, dict):
            return pd.DataFrame()
        if not payload.get('success') or not payload.get('data'):
            return pd.DataFrame()
        return pd.DataFrame(payload['data'])

View File

@@ -0,0 +1,21 @@
from abc import ABC, abstractmethod
from typing import List
import pandas as pd
class DataProvider(ABC):
    """Abstract data-access interface; enables dependency injection and testing."""

    @abstractmethod
    def fetch_products(self, store_mode: str) -> pd.DataFrame:
        """Return the product catalog for the given store mode."""

    @abstractmethod
    def fetch_experiments(self, experiment_ids: List[str]) -> pd.DataFrame:
        """Return experiment metadata for the given IDs."""

    @abstractmethod
    def fetch_kafka_topic(self, topic: str) -> pd.DataFrame:
        """Return the data of a Kafka topic, obtained via the backend API."""

View File

@@ -0,0 +1,35 @@
import os
import pandas as pd
import requests
from typing import List
from supabase import create_client, Client
from procesing.providers.base import DataProvider
from dotenv import load_dotenv
class SupabaseProvider(DataProvider):
    """
    Supabase-backed implementation of the product and experiment lookups.

    Credentials come from the constructor arguments or, when omitted, from
    the NEXT_PUBLIC_SUPABASE_URL / NEXT_PUBLIC_SUPABASE_ANON_KEY environment
    variables (a .env file is loaded first).
    """

    def __init__(self, supabase_url: str = None, supabase_key: str = None):
        load_dotenv()
        env_url = os.getenv("NEXT_PUBLIC_SUPABASE_URL")
        env_key = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY")
        self.supabase_url = supabase_url or env_url
        self.supabase_key = supabase_key or env_key
        self.supabase: Client = create_client(self.supabase_url, self.supabase_key)

    def fetch_products(self, store_mode: str) -> pd.DataFrame:
        """Read the ``<store_mode>_products`` table; empty frame when no rows."""
        table = self.supabase.table(f'{store_mode}_products')
        rows = table.select(
            "id, room_type, date_index, metadata, availability"
        ).execute().data
        if not rows:
            return pd.DataFrame()
        return pd.DataFrame(rows)

    def fetch_experiments(self, experiment_ids: List[str]) -> pd.DataFrame:
        """Read experiments (with their joined task) for the given IDs."""
        # Nothing requested: skip the query entirely.
        if not experiment_ids:
            return pd.DataFrame()

        query = self.supabase.table('experiments').select(
            'id, subject_name, xp_human_only, xp_market_mode, xp_task_id, '
            'task:tasks(task_name, task_description, task_def_of_done)'
        )
        rows = query.in_('id', experiment_ids).execute().data
        if not rows:
            return pd.DataFrame()
        return pd.DataFrame(rows)

View File

@@ -0,0 +1,27 @@
from procesing.steps.base import BaseContextStep
from procesing.steps.fetch import FetchInteractionsStep, FetchPriceLogsStep, FetchExperimentsStep
from procesing.steps.join import JoinExperimentsStep
from procesing.steps.augment import CreatePriceBucketsStep, AugmentEventNamesStep
from procesing.steps.chunk import ChunkByTimeWindowStep
from procesing.steps.demand import ComputeDemandStep, ComputeDemandForChunksStep
from procesing.steps.elasticity import AggregatePriceLogsStep, ComputeElasticityStep
from procesing.steps.pricing import StateSpace, BuildStateSpaceStep, FitPricingFunctionStep, PredictPricesStep
__all__ = [
'BaseContextStep',
'FetchInteractionsStep',
'FetchPriceLogsStep',
'FetchExperimentsStep',
'JoinExperimentsStep',
'CreatePriceBucketsStep',
'AugmentEventNamesStep',
'ChunkByTimeWindowStep',
'ComputeDemandStep',
'ComputeDemandForChunksStep',
'AggregatePriceLogsStep',
'ComputeElasticityStep',
'StateSpace',
'BuildStateSpaceStep',
'FitPricingFunctionStep',
'PredictPricesStep',
]

View File

@@ -0,0 +1,53 @@
import numpy as np
import pandas as pd
from procesing.steps.base import BaseContextStep
class CreatePriceBucketsStep(BaseContextStep):
    """
    Add a 'price_bucket' label column derived from 'metadata_price'.

    The input frame is copied, so the caller's DataFrame is never mutated
    (consistent with the other steps that copy before adding columns).
    """

    def transform(self, df: pd.DataFrame):
        """
        Args:
            df: interactions; 'metadata_price' is optional.

        Returns:
            A copy of ``df`` with a 'price_bucket' column:
            - "" everywhere when the frame is empty, the price column is
              missing, or every price is null;
            - quantile labels "PB_1".."PB_n" (n = config 'n_price_buckets',
              default 5) when quantile bucketing succeeds;
            - "P_<int(price)>" (or "" for null prices) when it does not.
        """
        df = df.copy()

        if df.empty or 'metadata_price' not in df.columns:
            df['price_bucket'] = ""
            return df

        n_buckets = self.context.config.get('n_price_buckets', 5)
        prices = df['metadata_price']

        if prices.notnull().any():
            try:
                price_buckets = pd.qcut(
                    prices,
                    q=n_buckets,
                    labels=[f"PB_{i+1}" for i in range(n_buckets)],
                    duplicates='drop',
                )
            except ValueError:
                # Fallback for insufficient unique values: label each row
                # by its integer price instead.
                price_buckets = prices.apply(
                    lambda x: f"P_{int(x)}" if pd.notnull(x) else ""
                )
        else:
            price_buckets = pd.Series([""] * len(df), index=df.index)

        df['price_bucket'] = price_buckets
        return df
class AugmentEventNamesStep(BaseContextStep):
    """
    Suffix event names with a ``_productId@price_bucket`` schema.

    Rows lacking a product or a bucket keep their event name unchanged.
    """

    def transform(self, df: pd.DataFrame):
        """
        Args:
            df: interactions with 'eventName'; 'productId' and
                'price_bucket' are optional.

        Returns:
            The input itself when it is empty; otherwise a copy with a
            'metadata_schema' column and the augmented 'eventName'.
        """
        if df.empty:
            return df

        # Work on a copy so the caller's frame is not mutated.
        df = df.copy()

        if 'productId' not in df.columns or 'price_bucket' not in df.columns:
            # The previous `.get(col, pd.Series())` default did not help:
            # the schema expression indexed both columns unconditionally
            # and raised KeyError. Without both columns there is no schema.
            df['metadata_schema'] = ""
        else:
            has_schema = df['productId'].notnull() & df['price_bucket'].notnull()
            # Schema: _productId@price_bucket
            df['metadata_schema'] = np.where(
                has_schema,
                "_" + df['productId'].astype(str) + "@" + df['price_bucket'].astype(str),
                "",
            )

        df['eventName'] = df['eventName'] + df['metadata_schema']
        return df

View File

@@ -0,0 +1,31 @@
from abc import ABC, abstractmethod
from sklearn.base import BaseEstimator, TransformerMixin
from procesing.context import PipelineContext
class BaseContextStep(BaseEstimator, TransformerMixin, ABC):
    """
    Common base of every pipeline step.

    A step holds nothing but the shared context and performs ONE
    transformation; subclasses only need to implement ``transform``.
    """

    def __init__(self, context: PipelineContext):
        self.context = context

    def fit(self, X=None, y=None):
        """No-op fit: most steps don't need training. Returns self."""
        return self

    @abstractmethod
    def transform(self, X):
        """Transform the input using the context; implemented by subclasses."""

    def get_params(self, deep=True):
        """sklearn compatibility: the context is the only parameter."""
        return dict(context=self.context)

    def set_params(self, **params):
        """sklearn compatibility: only 'context' is honoured; returns self."""
        for key, value in params.items():
            if key == 'context':
                self.context = value
        return self

View File

@@ -0,0 +1,34 @@
import pandas as pd
from procesing.steps.base import BaseContextStep
class ChunkByTimeWindowStep(BaseContextStep):
    """
    Split a dataframe into consecutive time windows.

    Produces one dict per non-empty window holding 'window_start',
    'window_end', 'window_idx' and the window's rows under 'data'.
    """

    def transform(self, df: pd.DataFrame):
        if df.empty:
            return []

        frame = df.copy()
        ts_col = self.context.config.get('ts_col', 'ts')
        window_size = self.context.window_size

        # Timestamps must be datetimes before they can be floored.
        already_datetime = pd.api.types.is_datetime64_any_dtype(frame[ts_col])
        if not already_datetime:
            frame[ts_col] = pd.to_datetime(frame[ts_col])

        frame = frame.sort_values(ts_col)
        # Each row is labelled with the start of the window it falls into.
        frame['_window'] = frame[ts_col].dt.floor(window_size)

        return [
            {
                'window_start': start,
                'window_end': start + pd.Timedelta(window_size),
                'window_idx': position,
                'data': rows.drop(columns=['_window']),
            }
            for position, (start, rows) in enumerate(frame.groupby('_window'))
        ]

View File

@@ -0,0 +1,61 @@
import pandas as pd
from procesing.steps.base import BaseContextStep
class ComputeDemandStep(BaseContextStep):
    """
    Count interactions per product for one time window or raw dataframe.

    Input: a chunk dict (rows under 'data') OR a raw dataframe
    Output: demand dataframe [productId, demand_score]; for a chunk dict,
            the chunk's metadata plus the frame under 'demand_vector'
    """

    def transform(self, chunk):
        # Accept both a chunk dict and a raw dataframe.
        if isinstance(chunk, dict):
            interactions = chunk['data']
            window_meta = {key: value for key, value in chunk.items() if key != 'data'}
        else:
            interactions, window_meta = chunk, {}

        unique_products = self.context.products['id'].unique()

        # Optional equality filters, applied only when configured and the
        # column exists.
        config = self.context.config
        session_filter = config.get('session_filter')
        experiment_filter = config.get('experiment_filter')
        for wanted, column in ((session_filter, 'sessionId'),
                               (experiment_filter, 'experimentId')):
            if wanted and column in interactions.columns:
                interactions = interactions[interactions[column] == wanted]

        with_product = interactions.dropna(subset=['productId'])

        if not with_product.empty:
            # Crosstab against a constant gives a simple per-product count;
            # products without interactions are filled in with 0.
            counts = pd.crosstab(with_product['productId'], 'count')
            demand_df = counts.reindex(unique_products, fill_value=0).reset_index()
            demand_df.columns = ['productId', 'demand_score']
        else:
            demand_df = pd.DataFrame({
                'productId': unique_products,
                'demand_score': 0
            })

        # Attach window metadata if present.
        if not window_meta:
            return demand_df
        return {**window_meta, 'demand_vector': demand_df}
class ComputeDemandForChunksStep(BaseContextStep):
    """Run ComputeDemandStep over every chunk of a list."""

    def transform(self, chunks: list):
        if not chunks:
            return []
        # One step instance, sharing this step's context, is reused for
        # all chunks.
        per_chunk = ComputeDemandStep(self.context)
        return list(map(per_chunk.transform, chunks))

View File

@@ -0,0 +1,253 @@
import numpy as np
import pandas as pd
from typing import Dict, List
from procesing.steps.base import BaseContextStep
class AggregatePriceLogsStep(BaseContextStep):
    """
    Aggregate price logs into time windows using VECTORIZED operations.
    Input: price_logs_df
    Output: list of price chunks with [productId, price]

    Each returned chunk is a dict with 'window_start', 'window_end' and
    'price_vector' (a DataFrame of productId/price for that window).
    """
    def transform(self, price_logs_df: pd.DataFrame):
        """Return one price chunk per time window; [] for empty input."""
        if price_logs_df.empty:
            return []
        # Copy so the timestamp conversion below does not touch the input.
        df = price_logs_df.copy()
        ts_col = self.context.config.get('ts_col', 'ts')
        window_size = self.context.window_size
        # ensure datetime
        if not pd.api.types.is_datetime64_any_dtype(df[ts_col]):
            df[ts_col] = pd.to_datetime(df[ts_col])
        # Sorted by time so that iloc[-1] in the backfill below is the most
        # recent raw log entry.
        df = df.sort_values([ts_col, 'productId'])
        products = self.context.products
        unique_products = products['id'].unique()
        # VECTORIZED: group by product, resample by time window, compute mean
        df_indexed = df.set_index(ts_col)
        windowed = (
            df_indexed
            .groupby('productId')['price']
            .resample(window_size)
            .mean()
            .reset_index()
        )
        # forward fill missing windows (carry last known price)
        windowed = windowed.sort_values([ts_col, 'productId'])
        windowed['price'] = windowed.groupby('productId')['price'].ffill()
        # Rows still null after the forward fill are dropped.
        windowed = windowed.dropna(subset=['price'])
        # group into chunks by window
        chunks = []
        for window_start, group in windowed.groupby(ts_col):
            price_vector = group[['productId', 'price']].copy()
            # fill missing products with last known price before this window
            missing_products = set(unique_products) - set(price_vector['productId'])
            if missing_products:
                # NOTE(review): this scans the full raw log once per missing
                # product per window; may be slow on large logs.
                for pid in missing_products:
                    last_price = df_indexed[
                        (df_indexed['productId'] == pid) &
                        (df_indexed.index < window_start)
                    ]['price']
                    # Products with no log entry before this window stay
                    # absent from the chunk.
                    if not last_price.empty:
                        price_vector = pd.concat([
                            price_vector,
                            pd.DataFrame({'productId': [pid], 'price': [last_price.iloc[-1]]})
                        ], ignore_index=True)
            if not price_vector.empty:
                chunks.append({
                    'window_start': window_start,
                    'window_end': window_start + pd.Timedelta(window_size),
                    'price_vector': price_vector
                })
        return chunks
class ComputeElasticityStep(BaseContextStep):
    """
    Compute price elasticity from demand and price chunks.

    Input: (demand_chunks, price_chunks)
    Output: elasticity_df [productId, elasticity, std_error, n_obs], with
            one row per product seen in the aligned data plus a zero row for
            every catalog product that was not observed.

    Config keys read from the context:
        'elasticity_method': 'point' (default) or 'arc'
        'min_observations': minimum series length to estimate (default 2)
    """

    _RESULT_COLUMNS = ['productId', 'elasticity', 'std_error', 'n_obs']

    def transform(self, chunk_tuple: tuple):
        """
        Args:
            chunk_tuple: ``(demand_chunks, price_chunks)``.

        Returns:
            DataFrame with the columns listed in the class docstring.

        Raises:
            ValueError: if the configured elasticity method is unknown and
                at least one product has enough observations to estimate.
        """
        demand_chunks, price_chunks = chunk_tuple

        method = self.context.config.get('elasticity_method', 'point')
        min_obs = self.context.config.get('min_observations', 2)
        all_product_ids = self.context.products['id'].unique()

        # Align chunks by window_start.
        aligned = self._align_chunks(demand_chunks, price_chunks)
        if not aligned:
            return self._zero_frame(all_product_ids)

        # Build time series per product.
        product_series = self._build_timeseries(aligned)

        rows = []
        for pid, series in product_series.items():
            if len(series) < min_obs:
                estimate = {'value': 0.0, 'std_error': 0.0}
            else:
                estimate = self._compute_elasticity(series, method)
            rows.append({
                'productId': pid,
                'elasticity': estimate['value'],
                'std_error': estimate.get('std_error', 0.0),
                'n_obs': len(series),
            })

        # Aligned windows can still share no product (the inner merge is
        # empty); previously this produced a column-less frame and a
        # KeyError on 'productId'.
        if not rows:
            return self._zero_frame(all_product_ids)

        result_df = pd.DataFrame(rows, columns=self._RESULT_COLUMNS)

        # Fill missing products with zero elasticity.
        observed_pids = set(result_df['productId'])
        missing_pids = [p for p in all_product_ids if p not in observed_pids]
        if missing_pids:
            result_df = pd.concat(
                [result_df, self._zero_frame(missing_pids)],
                ignore_index=True,
            )
        return result_df

    def _zero_frame(self, product_ids):
        """Result rows with zero elasticity / error / observations."""
        return pd.DataFrame({
            'productId': product_ids,
            'elasticity': 0.0,
            'std_error': 0.0,
            'n_obs': 0
        })

    def _align_chunks(self, demand_chunks: List[Dict], price_chunks: List[Dict]):
        """Pair demand and price chunks that share the same window_start."""
        price_lookup = {c['window_start']: c for c in price_chunks}
        return [
            {
                'window_start': dc['window_start'],
                'window_end': dc['window_end'],
                'demand': dc['demand_vector'],
                'prices': price_lookup[dc['window_start']]['price_vector'],
            }
            for dc in demand_chunks
            if dc['window_start'] in price_lookup
        ]

    def _build_timeseries(self, aligned: List[Dict]):
        """Build time series [timestamp, price, quantity] per product."""
        series_by_product = {}
        for chunk in aligned:
            merged = chunk['demand'].merge(chunk['prices'], on='productId', how='inner')
            # Column-wise iteration instead of iterrows(): no per-row Series
            # construction.
            for pid, price, quantity in zip(merged['productId'],
                                            merged['price'],
                                            merged['demand_score']):
                series_by_product.setdefault(pid, []).append({
                    'timestamp': chunk['window_start'],
                    'price': price,
                    'quantity': quantity,
                })
        return series_by_product

    def _compute_elasticity(self, series: List[Dict], method: str):
        """
        Compute point or arc elasticity for one product's series.

        Only observations with strictly positive price and quantity are
        used; fewer than two such observations yield a zero estimate.

        Raises:
            ValueError: for an unknown ``method``.
        """
        prices = np.array([s['price'] for s in series])
        quantities = np.array([s['quantity'] for s in series])

        # Filter out zero/negative values (required for the log transform).
        valid = (prices > 0) & (quantities > 0)
        if valid.sum() < 2:
            return {'value': 0.0, 'std_error': 0.0}
        prices = prices[valid]
        quantities = quantities[valid]

        if method == 'point':
            return self._point_elasticity(prices, quantities)
        elif method == 'arc':
            return self._arc_elasticity(prices, quantities)
        else:
            raise ValueError(f"Unknown elasticity method: {method}")

    def _point_elasticity(self, prices: np.ndarray, quantities: np.ndarray):
        """
        Point elasticity via log-log regression.

        Fits log(Q) = a + b*log(P) by least squares; elasticity = b.
        The slope is Sxy / Sxx with both sums taken over the same centred
        data. The previous code divided np.cov (ddof=1) by np.var (ddof=0),
        inflating the slope by n/(n-1), i.e. doubling it for two
        observations.
        """
        n = len(prices)
        if n < 2:
            return {'value': 0.0, 'std_error': 0.0}

        log_p = np.log(prices)
        log_q = np.log(quantities)

        # No price variation: the slope is undefined.
        if log_p.max() == log_p.min():
            return {'value': 0.0, 'std_error': 0.0}

        dp = log_p - log_p.mean()
        dq = log_q - log_q.mean()
        sxx = float(np.dot(dp, dp))
        b = float(np.dot(dp, dq)) / sxx

        # Standard error of the slope; needs at least one residual degree
        # of freedom (n - 2 > 0).
        if n > 2:
            residuals = dq - b * dp
            mse = float((residuals ** 2).sum()) / (n - 2)
            se_b = float(np.sqrt(mse / sxx))
        else:
            se_b = 0.0

        return {'value': b, 'std_error': se_b}

    def _arc_elasticity(self, prices: np.ndarray, quantities: np.ndarray):
        """
        Arc elasticity: mean of the period-over-period midpoint elasticities.

        Consecutive pairs with no price change (or a zero midpoint) are
        skipped; with no usable pair the estimate is zero.
        """
        elasticities = []
        for i in range(1, len(prices)):
            p1, p2 = prices[i - 1], prices[i]
            q1, q2 = quantities[i - 1], quantities[i]

            p_avg = (p1 + p2) / 2
            q_avg = (q1 + q2) / 2
            if p_avg == 0 or q_avg == 0:
                continue

            delta_p = p2 - p1
            if delta_p == 0:
                continue
            delta_q = q2 - q1

            elasticities.append((delta_q / q_avg) / (delta_p / p_avg))

        if not elasticities:
            return {'value': 0.0, 'std_error': 0.0}

        return {
            'value': np.mean(elasticities),
            'std_error': np.std(elasticities) / np.sqrt(len(elasticities))
        }

View File

@@ -0,0 +1,46 @@
import pandas as pd
from procesing.steps.base import BaseContextStep
class FetchInteractionsStep(BaseContextStep):
    """Load the ``user-interactions`` topic and flatten its metadata column."""

    def transform(self, X=None):
        """Return the interactions frame from the context provider.

        ``X`` is ignored. An empty frame is returned untouched. Otherwise a
        ``metadata`` column (if present) is expanded into ``metadata_*``
        columns, rows without an ``eventName`` are dropped, and
        ``metadata_dateIndex`` (if present) is copied to a nullable-integer
        ``dateIndex`` column.
        """
        interactions = self.context.provider.fetch_kafka_topic('user-interactions')
        if interactions.empty:
            return interactions

        if 'metadata' in interactions.columns:
            # NOTE(review): the flattened frame is joined by index; assumes the
            # provider frame's index lines up with json_normalize's output.
            raw_metadata = interactions.pop('metadata')
            flattened = pd.json_normalize(raw_metadata, sep='.').add_prefix('metadata_')
            interactions = interactions.join(flattened)

        interactions = interactions.dropna(subset=['eventName'])

        if 'metadata_dateIndex' in interactions.columns:
            interactions['dateIndex'] = interactions['metadata_dateIndex'].astype('Int64')
        return interactions
class FetchPriceLogsStep(BaseContextStep):
    """Load the ``price-logs`` topic through the context provider."""

    def transform(self, X=None):
        """Return whatever the provider yields for ``price-logs``; ``X`` is ignored."""
        price_logs = self.context.provider.fetch_kafka_topic('price-logs')
        return price_logs
class FetchExperimentsStep(BaseContextStep):
    """Look up experiment metadata for the experiments seen in the interactions."""

    def transform(self, interactions_df: pd.DataFrame):
        """Fetch experiments for the distinct non-null ``experimentId`` values.

        Returns an empty DataFrame when the input is empty, lacks an
        ``experimentId`` column, or contains no non-null ids.
        """
        has_ids = (not interactions_df.empty) and 'experimentId' in interactions_df.columns
        if not has_ids:
            return pd.DataFrame()

        unique_ids = interactions_df['experimentId'].dropna().unique().tolist()
        if unique_ids:
            return self.context.provider.fetch_experiments(unique_ids)
        return pd.DataFrame()

View File

@@ -0,0 +1,34 @@
import pandas as pd
from procesing.steps.base import BaseContextStep
class JoinExperimentsStep(BaseContextStep):
    """Attach experiment metadata columns to the interaction rows."""

    def transform(self, data: tuple):
        """Left-join experiments onto interactions by ``experimentId``.

        Args:
            data: ``(interactions_df, experiments_df)``.

        Returns:
            ``interactions_df`` unchanged when there are no experiments,
            otherwise the merged frame (one row per interaction, assuming
            experiment ids are unique).
        """
        interactions_df, experiments_df = data
        if experiments_df.empty:
            return interactions_df

        # Expand a nested ``task`` object into flat columns when at least one
        # experiment carries one.
        if 'task' in experiments_df.columns and experiments_df['task'].notnull().any():
            has_task = experiments_df['task'].notnull()
            flattened_task = pd.json_normalize(experiments_df['task'].dropna())
            flattened_task.index = experiments_df[has_task].index
            without_task = experiments_df.drop('task', axis=1)
            experiments_df = without_task.join(flattened_task, rsuffix='_task')

        column_aliases = {
            'id': 'experimentId',
            'subject_name': 'exp_subject',
            'xp_human_only': 'exp_human_only',
            'xp_market_mode': 'exp_market_mode',
            'xp_task_id': 'exp_task_id',
        }
        renamed = experiments_df.rename(columns=column_aliases)
        return interactions_df.merge(renamed, on='experimentId', how='left')

View File

@@ -0,0 +1,149 @@
import numpy as np
import pandas as pd
from typing import Optional, List, Dict, Any
from dataclasses import dataclass, field
from procesing.steps.base import BaseContextStep
from procesing.pricers import ElasticityBasedPricer
@dataclass
class StateSpace:
    """
    State representation for pricing functions.

    Components:
        Q_t: demand ∈ R^n (current demand signal per product)
        P_t: prices ∈ R^n (current/base prices)
        S_t: session_features (behavioral signals, interaction data)
        H_t: history = {Q_{t-k}, P_{t-k}, S_{t-k}} for k in [1, history_length]

    Additionally stores:
        - product_ids: product identifiers (n,)
        - elasticity: price elasticity per product (n,)
        - metadata: arbitrary context (experiment_id, timestamp, etc.)
    """
    demand: np.ndarray  # Q_t ∈ R^n
    prices: np.ndarray  # P_t ∈ R^n
    session_features: pd.DataFrame = field(default_factory=pd.DataFrame)  # S_t
    # augmented state components
    product_ids: Optional[np.ndarray] = None
    elasticity: Optional[np.ndarray] = None
    # historical trajectory H_t = {(Q_{t-k}, P_{t-k}, S_{t-k})}, oldest first
    history: List[Dict[str, Any]] = field(default_factory=list)
    # metadata for context
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """Validate that all per-product vectors share the same length."""
        n = len(self.demand)
        assert len(self.prices) == n, "demand and prices must have same dimension"
        if self.elasticity is not None:
            assert len(self.elasticity) == n, "elasticity must match dimension"
        if self.product_ids is not None:
            assert len(self.product_ids) == n, "product_ids must match dimension"

    @property
    def n_products(self) -> int:
        """Number of products in state space."""
        return len(self.demand)

    def add_history(self, q: np.ndarray, p: np.ndarray, s: pd.DataFrame, max_length: int = 10):
        """Append a historical state to H_t, keeping at most ``max_length`` entries.

        The oldest entries are discarded first. Trimming removes every surplus
        entry, so a history that is already too long shrinks to the limit.
        """
        self.history.append({'demand': q, 'prices': p, 'session_features': s})
        overflow = len(self.history) - max_length
        if overflow > 0:
            del self.history[:overflow]

    def get_history_window(self, k: int = 5) -> List[Dict[str, Any]]:
        """Return a new list with the last ``k`` historical states (oldest first).

        Fewer than ``k`` entries are returned when the history is shorter;
        ``k <= 0`` yields an empty list.
        """
        if k <= 0:
            # history[-0:] would be the whole list, so handle this explicitly
            return []
        return self.history[-k:]
class BuildStateSpaceStep(BaseContextStep):
    """
    Build state space from elasticity, demand, and price data.

    Input: elasticity_df [productId, elasticity, ...], optional demand_df
    Output: StateSpace instance with Q_t, P_t, elasticity, product_ids
    """

    def transform(self, elasticity_df: pd.DataFrame, demand_df: Optional[pd.DataFrame] = None):
        """Assemble a StateSpace with one entry per product in the context catalog.

        Base prices come from each product's ``metadata['base_price']`` when
        available, otherwise from an existing ``base_price`` column on the
        product frame, otherwise 0. Products without an elasticity get 0.0.
        Demand comes from ``demand_df['demand']`` when provided (missing
        products get 0.0); otherwise a uniform placeholder of 10.0 is used.
        """
        products = self.context.products
        products_with_prices = products.copy()

        # Resolve the base price row by row. Previously an existing
        # ``base_price`` column was overwritten with 0 whenever the metadata
        # did not carry a price.
        if 'metadata' in products.columns:
            metadata_values = products['metadata']
        else:
            metadata_values = [None] * len(products)
        if 'base_price' in products.columns:
            fallback_prices = products['base_price']
        else:
            fallback_prices = [0] * len(products)
        products_with_prices['base_price'] = [
            meta['base_price']
            if isinstance(meta, dict) and meta.get('base_price') is not None
            else fallback
            for meta, fallback in zip(metadata_values, fallback_prices)
        ]

        # merge with elasticity; left join keeps the catalog order
        merged = products_with_prices[['id', 'base_price']].rename(
            columns={'id': 'productId'}
        ).merge(
            elasticity_df[['productId', 'elasticity']],
            on='productId',
            how='left'
        ).fillna({'elasticity': 0.0, 'base_price': 0.0})

        # merge with demand if provided, else use default
        if demand_df is not None and 'demand' in demand_df.columns:
            merged = merged.merge(
                demand_df[['productId', 'demand']],
                on='productId',
                how='left'
            ).fillna({'demand': 0.0})
            demand_vector = merged['demand'].values
        else:
            # default: uniform placeholder demand
            demand_vector = np.ones(len(merged)) * 10.0

        return StateSpace(
            demand=demand_vector,
            prices=merged['base_price'].values,
            session_features=pd.DataFrame(),
            product_ids=merged['productId'].values,
            elasticity=merged['elasticity'].values,
            metadata={'timestamp': pd.Timestamp.now().isoformat()}
        )
class FitPricingFunctionStep(BaseContextStep):
    """
    Instantiate and fit the configured pricing function.

    Input: elasticity_df
    Output: the fitted pricer instance
    """

    def transform(self, elasticity_df: pd.DataFrame):
        """Build the pricer named in the context config and fit it.

        ``pricing_function_class`` (default ``ElasticityBasedPricer``) is
        called with ``pricing_function_params`` (default ``{}``) as keyword
        arguments, then ``fit(elasticity_df)`` is invoked on the instance.
        """
        config = self.context.config
        pricer_factory = config.get('pricing_function_class', ElasticityBasedPricer)
        pricer_kwargs = config.get('pricing_function_params', {})

        fitted = pricer_factory(**pricer_kwargs)
        fitted.fit(elasticity_df)
        return fitted
class PredictPricesStep(BaseContextStep):
    """
    Run the fitted pricer on a state space.

    Input: (pricer, state_space)
    Output: prices_df [productId, predicted_price]
    """

    def transform(self, data: tuple):
        """Return one predicted price per product in the context catalog.

        NOTE(review): rows are labelled with the catalog's ``id`` column, so
        this presumes ``pricer.predict`` returns prices in catalog order.
        """
        pricer, state_space = data
        catalog_ids = self.context.products['id'].values
        prices = pricer.predict(state_space)
        columns = {
            'productId': catalog_ids,
            'predicted_price': prices,
        }
        return pd.DataFrame(columns)

View File

@@ -0,0 +1,114 @@
"""
Session feature extraction for S_t component of state space.
Computes behavioral signals from interaction data already in pipeline.
"""
import pandas as pd
import numpy as np
from typing import Optional, Dict, Any
from collections import Counter
from procesing.steps.base import BaseContextStep
class ExtractSessionFeaturesStep(BaseContextStep):
    """
    Extract session-level behavioral features from interaction logs.

    Input: interactions_df (user-interactions from earlier pipeline step)
    Output: session_features DataFrame [sessionId, feature_1, feature_2, ...]

    Features computed:
        - total_interactions: count of all events
        - page_views: ``page_view`` plus ``view_item_page`` events
        - item_views: ``view_item_page`` events
        - searches, cart_adds: event type counts
        - hovers: hover event counts
        - unique_products_viewed: distinct non-null product IDs
        - product_view_depth: max number of events on a single product
        - session_duration_sec: time span of session
        - interaction_velocity: events per minute
        - avg_time_between_events / std_time_between_events: inter-event timing
        - cart_to_view_ratio: cart_adds / item_views (0.0 without item views)
    """

    def transform(self, interactions_df: pd.DataFrame) -> pd.DataFrame:
        """Return one feature row per ``sessionId``; empty input gives an empty frame."""
        if interactions_df.empty:
            return pd.DataFrame()

        # parse timestamps on a copy so the caller's frame is not modified
        if 'ts' in interactions_df.columns:
            interactions_df = interactions_df.copy()
            interactions_df['ts'] = pd.to_datetime(interactions_df['ts'])

        session_features = [
            self._extract_features_for_session(session_id, session_df)
            for session_id, session_df in interactions_df.groupby('sessionId')
        ]
        return pd.DataFrame(session_features)

    def _extract_features_for_session(self, session_id: str, session_df: pd.DataFrame) -> Dict[str, Any]:
        """Compute the feature dict for a single session's rows."""
        features = {'sessionId': session_id}

        # basic counts
        features['total_interactions'] = len(session_df)
        event_counts = session_df['eventName'].value_counts().to_dict()
        features['page_views'] = event_counts.get('page_view', 0) + event_counts.get('view_item_page', 0)
        features['item_views'] = event_counts.get('view_item_page', 0)
        features['searches'] = event_counts.get('search', 0)
        features['cart_adds'] = event_counts.get('add_item_to_cart', 0)

        # hover events
        hover_events = ['hover_over_title', 'hover_over_paragraph', 'hover_over_link', 'hover_over_button']
        features['hovers'] = sum(event_counts.get(ev, 0) for ev in hover_events)

        # product-level signals
        product_ids = session_df['productId'].dropna()
        features['unique_products_viewed'] = product_ids.nunique()
        if len(product_ids) > 0:
            product_view_counts = Counter(product_ids)
            features['product_view_depth'] = max(product_view_counts.values())
        else:
            features['product_view_depth'] = 0

        # temporal features
        if 'ts' in session_df.columns:
            timestamps = session_df['ts'].sort_values()
            features['session_duration_sec'] = (timestamps.max() - timestamps.min()).total_seconds()
            if features['session_duration_sec'] > 0:
                features['interaction_velocity'] = (features['total_interactions'] / features['session_duration_sec']) * 60
            else:
                features['interaction_velocity'] = 0.0

            # inter-event timing
            if len(timestamps) > 1:
                time_diffs = timestamps.diff().dropna().dt.total_seconds()
                features['avg_time_between_events'] = time_diffs.mean()
                # The sample std of a single gap (two-event session) is NaN;
                # report 0.0 like the other "no spread available" cases.
                gap_std = time_diffs.std()
                features['std_time_between_events'] = 0.0 if pd.isna(gap_std) else gap_std
            else:
                features['avg_time_between_events'] = 0.0
                features['std_time_between_events'] = 0.0
        else:
            features['session_duration_sec'] = 0.0
            features['interaction_velocity'] = 0.0
            features['avg_time_between_events'] = 0.0
            features['std_time_between_events'] = 0.0

        # cart/conversion signals
        features['cart_to_view_ratio'] = features['cart_adds'] / features['item_views'] if features['item_views'] > 0 else 0.0
        return features
class FilterSessionInteractionsStep(BaseContextStep):
    """
    Restrict an interactions frame to a single session.

    Input: (interactions_df, session_id)
    Output: copy of the rows whose sessionId equals session_id
    """

    def transform(self, data: tuple) -> pd.DataFrame:
        """Return a copy of the rows belonging to the requested session."""
        interactions_df, session_id = data
        in_session = interactions_df['sessionId'] == session_id
        selected = interactions_df[in_session]
        return selected.copy()

View File

View File

@@ -0,0 +1,271 @@
import pytest
import pandas as pd
from typing import List
from procesing.providers.base import DataProvider
from procesing.context import PipelineContext
class MockProvider(DataProvider):
    """In-memory DataProvider used by the tests; every fetch returns a copy."""

    def __init__(self, products_df=None, experiments_df=None, kafka_data=None):
        """Store the fixtures; anything omitted defaults to an empty container."""
        self._products = pd.DataFrame() if products_df is None else products_df
        self._experiments = pd.DataFrame() if experiments_df is None else experiments_df
        self._kafka_data = {} if kafka_data is None else kafka_data

    def fetch_products(self, store_mode: str) -> pd.DataFrame:
        """Return the product fixture regardless of ``store_mode``."""
        return self._products.copy()

    def fetch_experiments(self, experiment_ids: List[str]) -> pd.DataFrame:
        """Return the experiment rows whose ``id`` is in ``experiment_ids``."""
        if self._experiments.empty:
            return pd.DataFrame()
        wanted = self._experiments['id'].isin(experiment_ids)
        return self._experiments[wanted].copy()

    def fetch_kafka_topic(self, topic: str) -> pd.DataFrame:
        """Return the frame registered for ``topic`` (empty frame if unknown)."""
        frame = self._kafka_data.get(topic, pd.DataFrame())
        return frame.copy()
@pytest.fixture
def mock_products():
    """Three-room product catalog whose ids match the other fixtures."""
    product_ids = [
        'd018efc1-25e9-4284-b276-80386e048b25',
        '51266ddb-5b07-47b7-89ee-5b5cae94bb11',
        '2cd7f756-fc65-4ba0-ab01-74521c1fff43',
    ]
    room_names = ['Junior Suite', 'Superior Room', 'Deluxe Room']
    base_prices = [200.0, 150.0, 180.0]
    catalog = {
        'id': product_ids,
        'name': room_names,
        'base_price': base_prices,
    }
    return pd.DataFrame(catalog)
@pytest.fixture
def mock_interactions_raw_kafka():
    """Five raw Kafka interaction messages for one session across two experiments."""
    session_id = 'd423ce8a-77aa-4c9a-94d4-d1adddcc3472'
    experiment_a = '53aefd07-f66a-4d7f-ba8b-7ea1fc562d35'
    experiment_b = 'bbbbcccc-dddd-eeee-ffff-000011112222'
    junior_suite = 'd018efc1-25e9-4284-b276-80386e048b25'
    superior_room = '51266ddb-5b07-47b7-89ee-5b5cae94bb11'
    listing_page = '/hotel/products'

    def _message(offset, kafka_ts, experiment_id, event_name, page, product_id, metadata, ts):
        # key order mirrors the production payload layout
        return {
            'partitionID': 0, 'offset': offset, 'timestamp': kafka_ts,
            'value': {
                'payload': {
                    'sessionId': session_id,
                    'experimentId': experiment_id,
                    'eventName': event_name,
                    'page': page,
                    'productId': product_id,
                    'metadata': metadata,
                    'storeMode': 'hotel',
                    'ts': ts,
                }
            },
        }

    return [
        _message(
            203, 1764102082676, experiment_a, 'learn_more_about_item',
            '/hotel/products/d018efc1-25e9-4284-b276-80386e048b25', junior_suite,
            {'type': 'hotel', 'dateIndex': 1, 'roomType': 'Junior Suite'},
            '2025-11-25T20:21:22.674Z',
        ),
        _message(
            204, 1764102086982, experiment_a, 'page_view',
            listing_page, None,
            {'referrer': ''},
            '2025-11-25T20:21:26.947Z',
        ),
        _message(
            205, 1764102091825, experiment_a, 'hover_over_title',
            listing_page, superior_room,
            {'elementText': 'Superior Room', 'dateIndex': 1, 'dwellTime': 1200},
            '2025-11-25T20:21:31.823Z',
        ),
        _message(
            206, 1764102094193, experiment_b, 'hover_over_paragraph',
            listing_page, superior_room,
            {'elementText': 'price', 'dateIndex': 1, 'dwellTime': 1307},
            '2025-11-25T20:21:34.191Z',
        ),
        _message(
            207, 1764102101970, experiment_b, 'hover_over_paragraph',
            listing_page, junior_suite,
            {'elementText': 'price', 'dateIndex': 1, 'dwellTime': 1201},
            '2025-11-25T20:21:41.967Z',
        ),
    ]
@pytest.fixture
def mock_interactions(mock_interactions_raw_kafka):
    """Interaction payloads as a DataFrame with a parsed ``timestamp`` column."""
    payloads = []
    for message in mock_interactions_raw_kafka:
        payloads.append(message['value']['payload'])
    frame = pd.DataFrame(payloads)
    frame['timestamp'] = pd.to_datetime(frame['ts'])
    return frame
@pytest.fixture
def mock_price_logs_raw_kafka():
    """Five raw Kafka price-log messages covering two products."""
    session_id = 'd423ce8a-77aa-4c9a-94d4-d1adddcc3472'
    experiment_id = '53aefd07-f66a-4d7f-ba8b-7ea1fc562d35'
    product_a = '2cd7f756-fc65-4ba0-ab01-74521c1fff43'
    product_b = '2ddabbfc-4127-48fc-86dc-ebc4c677efa2'

    # (offset, kafka timestamp, productId, price, payload ts)
    entries = [
        (32, 1764104757969, product_a, 162.47, '2025-11-25T21:05:57.967Z'),
        (33, 1764104757995, product_b, 743.49, '2025-11-25T21:05:57.993Z'),
        (34, 1764104758011, product_a, 163.87, '2025-11-25T21:05:58.009Z'),
        (35, 1764104758050, product_b, 397.46, '2025-11-25T21:05:58.049Z'),
        (36, 1764104768865, product_a, 401.66, '2025-11-25T21:06:08.864Z'),
    ]

    messages = []
    for offset, kafka_ts, product_id, price, ts in entries:
        messages.append({
            'partitionID': 0, 'offset': offset, 'timestamp': kafka_ts,
            'value': {
                'payload': {
                    'productId': product_id,
                    'price': price,
                    'sessionId': session_id,
                    'experimentId': experiment_id,
                    'storeMode': 'shop',
                    'ts': ts,
                }
            },
        })
    return messages
@pytest.fixture
def mock_price_logs(mock_price_logs_raw_kafka):
    """Price-log payloads as a DataFrame with a parsed ``timestamp`` column."""
    # unwrap the Kafka envelope, keeping only the payload dicts
    payloads = []
    for message in mock_price_logs_raw_kafka:
        payloads.append(message['value']['payload'])
    frame = pd.DataFrame(payloads)
    frame['timestamp'] = pd.to_datetime(frame['ts'])
    return frame
@pytest.fixture
def mock_experiments():
    """Two experiment rows whose ids match those used by the interaction fixtures."""
    experiment_ids = ['53aefd07-f66a-4d7f-ba8b-7ea1fc562d35', 'bbbbcccc-dddd-eeee-ffff-000011112222']
    created = pd.to_datetime(['2025-11-25T20:00:00Z', '2025-11-26T10:00:00Z'])
    columns = {
        'id': experiment_ids,
        'created_at': created,
        'subject_name': ['Session A', 'Session B'],
        'xp_human_only': [True, False],
        'xp_market_mode': ['hotel', 'shop'],
        'xp_task_id': [None, None],
    }
    return pd.DataFrame(columns)
@pytest.fixture
def mock_provider(mock_products, mock_experiments, mock_interactions, mock_price_logs):
    """MockProvider loaded with the product, experiment and Kafka fixtures."""
    topics = {
        'user-interactions': mock_interactions,
        'price-logs': mock_price_logs,
    }
    provider = MockProvider(
        products_df=mock_products,
        experiments_df=mock_experiments,
        kafka_data=topics,
    )
    return provider
@pytest.fixture
def pipeline_context(mock_provider):
    """Hotel-mode PipelineContext with 30s windows and three price buckets."""
    settings = {
        'store_mode': 'hotel',
        'window_size': '30s',
        'n_price_buckets': 3,
    }
    return PipelineContext(provider=mock_provider, **settings)
@pytest.fixture
def empty_provider():
    """MockProvider whose frames have the expected columns but no rows."""
    product_columns = ['id', 'name', 'base_price']
    experiment_columns = ['id', 'created_at', 'subject_name', 'xp_human_only', 'xp_market_mode', 'xp_task_id']
    no_products = pd.DataFrame(columns=product_columns)
    no_experiments = pd.DataFrame(columns=experiment_columns)
    no_topics = {'user-interactions': pd.DataFrame(), 'price-logs': pd.DataFrame()}
    return MockProvider(
        products_df=no_products,
        experiments_df=no_experiments,
        kafka_data=no_topics,
    )
@pytest.fixture
def empty_context(empty_provider):
    """Hotel-mode PipelineContext (30s windows) backed by the empty provider."""
    settings = {'store_mode': 'hotel', 'window_size': '30s'}
    return PipelineContext(provider=empty_provider, **settings)

View File

@@ -0,0 +1,45 @@
import pytest
import random
import pandas as pd
from procesing.steps import (
CreatePriceBucketsStep,
AugmentEventNamesStep
)
def test_bucketing(pipeline_context):
    """Price bucketing yields a categorical column with three balanced buckets."""
    bucketing_step = CreatePriceBucketsStep(context=pipeline_context)

    # 100 distinct prices
    prices = pd.DataFrame({
        'metadata_price': random.sample(range(10, 1000), 100)
    })
    bucketed = bucketing_step.transform(prices)
    assert 'price_bucket' in bucketed.columns
    # the bucket column must be categorical
    assert isinstance(bucketed['price_bucket'].dtype, pd.CategoricalDtype)
    assert bucketed['price_bucket'].nunique() == 3  # as per context config

    # every bucket is populated and the sizes are close to each other
    bucket_sizes = bucketed['price_bucket'].value_counts()
    assert all(bucket_sizes > 0)
    assert bucket_sizes.max() - bucket_sizes.min() <= 10  # roughly equal distribution for 100 samples

    # an empty frame still gets the column
    empty_result = bucketing_step.transform(pd.DataFrame())
    assert 'price_bucket' in empty_result.columns
    assert empty_result.empty
def test_augment_names(pipeline_context):
    """Event names are augmented only when both product and bucket are present."""
    events = pd.DataFrame({
        'eventName': ['click', 'view', 'purchase'],
        'productId': ['prod_1', 'prod_2', None],
        'price_bucket': ['PB_1', None, 'PB_3']
    })
    augmented = AugmentEventNamesStep(context=pipeline_context).transform(events)
    assert augmented['eventName'].tolist() == [
        'click_prod_1@PB_1',
        'view',
        'purchase',
    ]

View File

@@ -0,0 +1,49 @@
import pytest
import random
import pandas as pd
from procesing.steps import (
ComputeDemandStep
)
def test_compute_demand(pipeline_context):
    """Evenly spread interactions give every product a comparable demand score."""
    step = ComputeDemandStep(context=pipeline_context)
    product_ids = [
        'd018efc1-25e9-4284-b276-80386e048b25',
        '51266ddb-5b07-47b7-89ee-5b5cae94bb11',
        '2cd7f756-fc65-4ba0-ab01-74521c1fff43'
    ]
    # Deterministic, balanced data (34/33/33 events per product) in a seeded
    # random order. Unseeded sampling made the per-product lower bound below
    # fail on unlucky draws.
    rng = random.Random(0)
    assigned_products = (product_ids * 34)[:100]
    rng.shuffle(assigned_products)
    df = pd.DataFrame({
        'ts': pd.date_range(start='2023-01-01', periods=100, freq='h'),
        'productId': assigned_products,
        'eventName': rng.choices(['view', 'click', 'purchase'], k=100)
    })
    result = step.transform(df)
    assert isinstance(result, pd.DataFrame)
    assert not result.empty
    assert set(result['productId']) == set(pipeline_context.products['id'])
    assert all(result['demand_score'] > 100/3 - 10)
def test_compute_demand_skewed(pipeline_context):
    """Demand scores follow the ordering of a skewed interaction distribution."""
    step = ComputeDemandStep(context=pipeline_context)
    # Skewed but deterministic data: exactly 70/20/10 events per product in a
    # seeded random order. Unseeded weighted sampling occasionally produced
    # ties or inversions between the two rarer products.
    rng = random.Random(0)
    assigned_products = (
        ['d018efc1-25e9-4284-b276-80386e048b25'] * 70
        + ['51266ddb-5b07-47b7-89ee-5b5cae94bb11'] * 20
        + ['2cd7f756-fc65-4ba0-ab01-74521c1fff43'] * 10
    )
    rng.shuffle(assigned_products)
    df = pd.DataFrame({
        'ts': pd.date_range(start='2023-01-01', periods=100, freq='h'),
        'productId': assigned_products,
        'eventName': rng.choices(['view', 'click', 'purchase'], k=100)
    })
    result = step.transform(df)
    assert isinstance(result, pd.DataFrame)
    assert not result.empty
    assert set(result['productId']) == set(pipeline_context.products['id'])
    # test for skewness
    scores = result.set_index('productId')['demand_score'].to_dict()
    assert scores['d018efc1-25e9-4284-b276-80386e048b25'] > \
        scores['51266ddb-5b07-47b7-89ee-5b5cae94bb11'] > \
        scores['2cd7f756-fc65-4ba0-ab01-74521c1fff43']

View File

@@ -0,0 +1,353 @@
import pytest
import pandas as pd
import numpy as np
from procesing.steps import (
AggregatePriceLogsStep,
ComputeElasticityStep
)
def test_aggregate_price_logs_basic(pipeline_context):
    """Price logs are aggregated into window chunks carrying a price vector."""
    aggregator = AggregatePriceLogsStep(pipeline_context)

    # one log every 10 seconds, cycling through the three products
    product_cycle = np.tile([
        'd018efc1-25e9-4284-b276-80386e048b25',
        '51266ddb-5b07-47b7-89ee-5b5cae94bb11',
        '2cd7f756-fc65-4ba0-ab01-74521c1fff43'
    ], 34)[:100]
    price_logs = pd.DataFrame({
        'ts': pd.date_range(start='2023-01-01 10:00:00', periods=100, freq='10s'),
        'productId': product_cycle,
        'price': np.random.uniform(100, 200, 100)
    })

    chunks = aggregator.transform(price_logs)
    assert isinstance(chunks, list)
    assert len(chunks) > 0

    # every chunk exposes its window bounds and a per-product price frame
    for window_chunk in chunks:
        for required_key in ('window_start', 'window_end', 'price_vector'):
            assert required_key in window_chunk
        price_vector = window_chunk['price_vector']
        assert isinstance(price_vector, pd.DataFrame)
        assert 'productId' in price_vector.columns
        assert 'price' in price_vector.columns
def test_aggregate_price_logs_handles_gaps(pipeline_context):
    """Sparse price logs with a two-minute gap still produce several windows."""
    aggregator = AggregatePriceLogsStep(pipeline_context)

    junior_suite = 'd018efc1-25e9-4284-b276-80386e048b25'
    superior_room = '51266ddb-5b07-47b7-89ee-5b5cae94bb11'
    log_times = pd.to_datetime([
        '2023-01-01 10:00:00',
        '2023-01-01 10:00:05',
        '2023-01-01 10:02:00',  # gap of ~2 mins
        '2023-01-01 10:02:30'
    ])
    sparse_logs = pd.DataFrame({
        'ts': log_times,
        'productId': [junior_suite, junior_suite, superior_room, superior_room],
        'price': [100, 102, 150, 153]
    })

    chunks = aggregator.transform(sparse_logs)
    assert isinstance(chunks, list)
    # should have multiple windows despite gaps
    assert len(chunks) >= 2
def test_compute_elasticity_with_known_relationship(pipeline_context):
    """Elastic demand (price up 10%, demand down 15% per step) gives elasticity < -1."""
    step = ComputeElasticityStep(pipeline_context)
    product_id = 'd018efc1-25e9-4284-b276-80386e048b25'
    base_price = 100
    base_demand = 50
    window = pd.Timedelta('30s')
    first_start = pd.Timestamp('2023-01-01 10:00:00')

    # three consecutive 30s windows: demand falls while price rises
    demand_levels = [base_demand, base_demand * 0.85, base_demand * 0.70]
    price_levels = [base_price, base_price * 1.10, base_price * 1.20]

    demand_chunks = []
    price_chunks = []
    for i, (demand, price) in enumerate(zip(demand_levels, price_levels)):
        start = first_start + i * window
        demand_chunks.append({
            'window_start': start,
            'window_end': start + window,
            'demand_vector': pd.DataFrame({
                'productId': [product_id],
                'demand_score': [demand]
            })
        })
        price_chunks.append({
            'window_start': start,
            'window_end': start + window,
            'price_vector': pd.DataFrame({
                'productId': [product_id],
                'price': [price]
            })
        })

    result = step.transform((demand_chunks, price_chunks))
    assert isinstance(result, pd.DataFrame)
    assert not result.empty
    assert 'productId' in result.columns
    assert 'elasticity' in result.columns
    assert 'n_obs' in result.columns

    # check elasticity is negative (normal good)
    product_elast = result[result['productId'] == product_id]
    assert len(product_elast) == 1
    assert product_elast.iloc[0]['elasticity'] < 0
    # Demand falls faster than price rises, so the estimate must be elastic.
    # This bound was described in a comment but never asserted.
    assert product_elast.iloc[0]['elasticity'] < -1
    assert product_elast.iloc[0]['n_obs'] == 3
def test_compute_elasticity_inelastic_product(pipeline_context):
    """A 20% price rise with a 2% demand drop yields an elasticity in (-1, 0)."""
    elasticity_step = ComputeElasticityStep(pipeline_context)
    room_id = '51266ddb-5b07-47b7-89ee-5b5cae94bb11'
    base_price = 150
    base_demand = 40

    windows = [
        (pd.Timestamp('2023-01-01 10:00:00'), pd.Timestamp('2023-01-01 10:00:30')),
        (pd.Timestamp('2023-01-01 10:00:30'), pd.Timestamp('2023-01-01 10:01:00')),
    ]
    demand_levels = [base_demand, base_demand * 0.98]  # tiny 2% decrease
    price_levels = [base_price, base_price * 1.20]  # 20% increase

    demand_chunks = [
        {
            'window_start': opened,
            'window_end': closed,
            'demand_vector': pd.DataFrame({
                'productId': [room_id],
                'demand_score': [level]
            })
        }
        for (opened, closed), level in zip(windows, demand_levels)
    ]
    price_chunks = [
        {
            'window_start': opened,
            'window_end': closed,
            'price_vector': pd.DataFrame({
                'productId': [room_id],
                'price': [level]
            })
        }
        for (opened, closed), level in zip(windows, price_levels)
    ]

    estimates = elasticity_step.transform((demand_chunks, price_chunks))
    room_rows = estimates[estimates['productId'] == room_id]
    assert len(room_rows) == 1
    # inelastic: elasticity between 0 and -1
    assert -1 < room_rows.iloc[0]['elasticity'] < 0
def test_compute_elasticity_multiple_products(pipeline_context):
    """Three products observed over five windows all get a negative elasticity."""
    elasticity_step = ComputeElasticityStep(pipeline_context)
    products = [
        'd018efc1-25e9-4284-b276-80386e048b25',
        '51266ddb-5b07-47b7-89ee-5b5cae94bb11',
        '2cd7f756-fc65-4ba0-ab01-74521c1fff43'
    ]

    # five consecutive 30s windows, each containing all three products
    window_starts = [
        pd.Timestamp('2023-01-01 10:00:00') + pd.Timedelta(f'{i*30}s')
        for i in range(5)
    ]
    demand_chunks = [
        {
            'window_start': ts,
            'window_end': ts + pd.Timedelta('30s'),
            'demand_vector': pd.DataFrame({
                'productId': products,
                'demand_score': [
                    50 * (0.9 ** i),  # elastic: decreases as price rises
                    40 * (0.98 ** i),  # inelastic: barely changes
                    30 * (0.85 ** i)  # very elastic
                ]
            })
        }
        for i, ts in enumerate(window_starts)
    ]
    price_chunks = [
        {
            'window_start': ts,
            'window_end': ts + pd.Timedelta('30s'),
            'price_vector': pd.DataFrame({
                'productId': products,
                'price': [
                    100 * (1.05 ** i),
                    150 * (1.10 ** i),
                    120 * (1.08 ** i)
                ]
            })
        }
        for i, ts in enumerate(window_starts)
    ]

    estimates = elasticity_step.transform((demand_chunks, price_chunks))
    assert isinstance(estimates, pd.DataFrame)
    assert len(estimates) == 3  # all products should have elasticity
    assert set(estimates['productId']) == set(products)
    assert all(estimates['n_obs'] == 5)
    assert all(estimates['elasticity'] < 0)  # all normal goods
def test_compute_elasticity_insufficient_data(pipeline_context):
    """A single observation yields n_obs == 1 and a zero elasticity."""
    elasticity_step = ComputeElasticityStep(pipeline_context)
    room_id = 'd018efc1-25e9-4284-b276-80386e048b25'
    opened = pd.Timestamp('2023-01-01 10:00:00')
    closed = pd.Timestamp('2023-01-01 10:00:30')

    # only 1 observation
    single_demand = {
        'window_start': opened,
        'window_end': closed,
        'demand_vector': pd.DataFrame({
            'productId': [room_id],
            'demand_score': [50]
        })
    }
    single_price = {
        'window_start': opened,
        'window_end': closed,
        'price_vector': pd.DataFrame({
            'productId': [room_id],
            'price': [100]
        })
    }

    estimates = elasticity_step.transform(([single_demand], [single_price]))
    # should still return result but with low n_obs
    room_rows = estimates[estimates['productId'] == room_id]
    assert len(room_rows) == 1
    only_row = room_rows.iloc[0]
    assert only_row['n_obs'] == 1
    assert only_row['elasticity'] == 0.0  # not enough data
def test_compute_elasticity_misaligned_chunks(pipeline_context):
    """Demand and price windows that never overlap give zero observations."""
    elasticity_step = ComputeElasticityStep(pipeline_context)
    room_id = 'd018efc1-25e9-4284-b276-80386e048b25'

    demand_window = {
        'window_start': pd.Timestamp('2023-01-01 10:00:00'),
        'window_end': pd.Timestamp('2023-01-01 10:00:30'),
        'demand_vector': pd.DataFrame({
            'productId': [room_id],
            'demand_score': [50]
        })
    }
    price_window = {
        'window_start': pd.Timestamp('2023-01-01 11:00:00'),  # different time
        'window_end': pd.Timestamp('2023-01-01 11:00:30'),
        'price_vector': pd.DataFrame({
            'productId': [room_id],
            'price': [100]
        })
    }

    estimates = elasticity_step.transform(([demand_window], [price_window]))
    # should handle gracefully with no aligned data
    assert isinstance(estimates, pd.DataFrame)
    assert all(estimates['n_obs'] == 0)
def test_elasticity_arc_method(pipeline_context):
    """Check elasticity computation with the ``'arc'`` method configured.

    Two consecutive 30-second windows are supplied for one product: the
    price rises from 100 to 110 while the demand score falls from 100 to
    80. The test expects a single result row with a negative elasticity.

    The test switches ``pipeline_context.config['elasticity_method']`` to
    ``'arc'`` and always sets it back to ``'point'`` afterwards, even when
    an assertion fails or the step raises, so the changed setting cannot
    leak into other tests that receive the same context object.
    """
    product_id = 'd018efc1-25e9-4284-b276-80386e048b25'

    # Configure the context for the arc method before building the step.
    pipeline_context.config['elasticity_method'] = 'arc'
    try:
        step = ComputeElasticityStep(pipeline_context)
        demand_chunks = [
            {
                'window_start': pd.Timestamp('2023-01-01 10:00:00'),
                'window_end': pd.Timestamp('2023-01-01 10:00:30'),
                'demand_vector': pd.DataFrame({
                    'productId': [product_id],
                    'demand_score': [100],
                }),
            },
            {
                'window_start': pd.Timestamp('2023-01-01 10:00:30'),
                'window_end': pd.Timestamp('2023-01-01 10:01:00'),
                'demand_vector': pd.DataFrame({
                    'productId': [product_id],
                    'demand_score': [80],
                }),
            },
        ]
        price_chunks = [
            {
                'window_start': pd.Timestamp('2023-01-01 10:00:00'),
                'window_end': pd.Timestamp('2023-01-01 10:00:30'),
                'price_vector': pd.DataFrame({
                    'productId': [product_id],
                    'price': [100],
                }),
            },
            {
                'window_start': pd.Timestamp('2023-01-01 10:00:30'),
                'window_end': pd.Timestamp('2023-01-01 10:01:00'),
                'price_vector': pd.DataFrame({
                    'productId': [product_id],
                    'price': [110],
                }),
            },
        ]

        result = step.transform((demand_chunks, price_chunks))

        product_elast = result[result['productId'] == product_id]
        assert len(product_elast) == 1
        # Price went up and demand went down, so elasticity must be negative.
        assert product_elast.iloc[0]['elasticity'] < 0
    finally:
        # Reset config on every exit path, not only when the test passes.
        pipeline_context.config['elasticity_method'] = 'point'

View File

@@ -0,0 +1,51 @@
import pytest
import pandas as pd
from procesing.steps import (
FetchInteractionsStep,
FetchPriceLogsStep,
FetchExperimentsStep,
)
def test_fetch_interactions_data(pipeline_context):
    """Check that fetched interactions are a DataFrame with the expected columns."""
    interactions = FetchInteractionsStep(pipeline_context).transform(None)

    assert interactions is not None
    assert isinstance(interactions, pd.DataFrame)

    # Columns the test requires to be present in the fetched frame.
    required_columns = (
        "eventName",
        "dateIndex",
        "experimentId",
        "storeMode",
        "metadata_elementText",
    )
    for column in required_columns:
        assert column in interactions.columns
def test_fetch_price_logs(pipeline_context):
    """Check that fetched price logs have the expected columns and price range."""
    price_logs = FetchPriceLogsStep(pipeline_context).transform(None)

    assert price_logs is not None
    assert isinstance(price_logs, pd.DataFrame)

    for column in ("price", "productId"):
        assert column in price_logs.columns

    # Sanity bounds: every fetched price must lie within [0, 9999].
    observed_prices = price_logs['price'].to_list()
    lowest = min(observed_prices)
    assert lowest >= 0
    highest = max(observed_prices)
    assert highest <= 9999
def test_experiments_fetching(pipeline_context):
    """Check that experiments are fetched for the fetched interactions.

    The interactions frame is fetched first and fed into the experiments
    step. The test expects a non-empty DataFrame with an ``id`` column,
    exactly two rows, and one specific experiment id among them.
    """
    interactions = FetchInteractionsStep(pipeline_context).transform(None)
    assert interactions is not None

    fetch_experiments = FetchExperimentsStep(pipeline_context)
    experiment_data = fetch_experiments.transform(interactions)

    assert experiment_data is not None
    assert isinstance(experiment_data, pd.DataFrame)
    assert not experiment_data.empty
    assert 'id' in experiment_data.columns
    assert len(experiment_data) == 2

    known_experiment_id = '53aefd07-f66a-4d7f-ba8b-7ea1fc562d35'
    assert known_experiment_id in experiment_data['id'].values

View File

@@ -0,0 +1,87 @@
import pytest
import pandas as pd
from procesing.pricers import (
StaticPricer,
RandomPricer,
ElasticityBasedPricer
)
def test_static_pricer_fit_and_predict():
    """Check that a fitted StaticPricer predicts exactly the base prices it was fitted on."""
    # Three products with their historical base prices.
    history = pd.DataFrame({
        'product_id': [1, 2, 3],
        'base_price': [100.0, 150.0, 200.0],
    })

    static_pricer = StaticPricer()
    static_pricer.fit(history)

    # No state is passed to predict in this test.
    predictions = static_pricer.predict(None)

    base_prices = history['base_price'].values
    assert all(predictions == base_prices), "Predicted prices do not match base prices"
def test_random_pricer_fit_and_predict():
    """Check that a seeded RandomPricer predicts varied prices within its bounds.

    The pricer is built with ``price_min=50.0``, ``price_max=250.0`` and
    ``seed=42``. The test expects one price per product, all inside the
    bounds, and not all identical.
    """
    history = pd.DataFrame({
        'product_id': [1, 2, 3],
        'base_price': [100.0, 150.0, 200.0],
    })

    random_pricer = RandomPricer(price_min=50.0, price_max=250.0, seed=42)
    random_pricer.fit(history)
    predictions = random_pricer.predict(None)

    # Bounds check.
    assert predictions.min() >= 50.0, "Predicted prices are below minimum bound"
    assert predictions.max() <= 250.0, "Predicted prices are above maximum bound"

    # Loose distribution check: at least two distinct values.
    distinct_prices = set(predictions)
    assert len(distinct_prices) > 1, "Predicted prices are not varied enough"

    assert len(predictions) == len(history), "Number of predicted prices does not match number of products"
def test_elasticity_based_pricer_fit_and_predict():
    """Check ElasticityBasedPricer bounds and its expected price formula.

    The pricer is fitted on three products and asked to predict for a state
    whose demand is above the mean demand of every product. The test expects
    each predicted price to stay within the floor and ceiling, to be no lower
    than the base price, and to match
    ``base_price * (1 + 0.1 * |elasticity| * (demand - mean) / mean)``
    within a tolerance of 1e-5.
    """
    class MockStateSpace:
        # Minimal stand-in that carries only the ``demand`` attribute.
        def __init__(self, demand):
            self.demand = demand

    history = pd.DataFrame({
        'productId': [1, 2, 3],
        'elasticity': [-1.5, -0.5, -2.0],
        'base_price': [100.0, 150.0, 200.0],
        'mean_demand': [10, 20, 15],
    })

    elasticity_pricer = ElasticityBasedPricer(alpha=0.1, price_floor=50.0, price_ceil=300.0)
    elasticity_pricer.fit(history)

    # Demand is higher than the mean for every product.
    state_space = MockStateSpace(demand=[15, 25, 20])
    predictions = elasticity_pricer.predict(state_space)

    # Bounds and shape checks.
    assert predictions.min() >= 50.0, "Predicted prices are below minimum bound"
    assert predictions.max() <= 300.0, "Predicted prices are above maximum bound"
    assert len(predictions) == len(history), "Number of predicted prices does not match number of products"

    # Semantic check: above-mean demand should push each price upwards,
    # by the amount the formula in the docstring predicts.
    for idx, product in history.iterrows():
        baseline = product['base_price']
        mean_demand = product['mean_demand']
        relative_deviation = (state_space.demand[idx] - mean_demand) / mean_demand
        expected_price = baseline * (1 + 0.1 * abs(product['elasticity']) * relative_deviation)

        predicted = predictions[idx]
        assert predicted >= baseline, f"Predicted price for product {product['productId']} did not increase as expected"
        assert abs(predicted - expected_price) < 1e-5, f"Predicted price for product {product['productId']} does not match expected calculation within 1e-5 tolerance"