mirror of
https://github.com/velocitatem/PHANTOM.git
synced 2026-05-31 16:43:36 +00:00
Airflow addition (#28)
* introducing airflow to run pipeline * chore: updating dag with upload to registry * introducing complete provider (non refactored and noisy) * chore: removing old shit * generic pricing baselines * feature: super simple model registry (to be updated maybe third party OS software) * chore: refactoring the providers docker config and requirements * chore: refactored and broke down components (breaking) * exporting all * local pipeline execution working * fix: fixing import structures from nonrelativistic * chore: enables cross comm pickling with fully e2e pipeline compilation * docs: what the pipeline is like now * pipelines local running and pipeline high level definition * cleaning old pipeline and vectorization * leaked but fixing, not so important * test: started with pipeline step testing * chore: cleaning up provider of prices * test: extra tests with semantic meaning checks * migrating pricers * feature: introducing pricing predictors (pricers) * chore: e2e is done with new pipeline * extra session feature extraction * feature: experimental session pricer and metrics (vibe) * chore: redefined and connected pricers (#29)
This commit is contained in:
committed by
GitHub
parent
2a0e44ab24
commit
ad9423bf59
27
experiments/procesing/steps/__init__.py
Executable file
27
experiments/procesing/steps/__init__.py
Executable file
@@ -0,0 +1,27 @@
|
||||
from procesing.steps.base import BaseContextStep
|
||||
from procesing.steps.fetch import FetchInteractionsStep, FetchPriceLogsStep, FetchExperimentsStep
|
||||
from procesing.steps.join import JoinExperimentsStep
|
||||
from procesing.steps.augment import CreatePriceBucketsStep, AugmentEventNamesStep
|
||||
from procesing.steps.chunk import ChunkByTimeWindowStep
|
||||
from procesing.steps.demand import ComputeDemandStep, ComputeDemandForChunksStep
|
||||
from procesing.steps.elasticity import AggregatePriceLogsStep, ComputeElasticityStep
|
||||
from procesing.steps.pricing import StateSpace, BuildStateSpaceStep, FitPricingFunctionStep, PredictPricesStep
|
||||
|
||||
# Names re-exported by this package, grouped by the module they are imported from.
__all__ = [
    # base
    'BaseContextStep',
    # fetch
    'FetchInteractionsStep',
    'FetchPriceLogsStep',
    'FetchExperimentsStep',
    # join
    'JoinExperimentsStep',
    # augment
    'CreatePriceBucketsStep',
    'AugmentEventNamesStep',
    # chunk
    'ChunkByTimeWindowStep',
    # demand
    'ComputeDemandStep',
    'ComputeDemandForChunksStep',
    # elasticity
    'AggregatePriceLogsStep',
    'ComputeElasticityStep',
    # pricing
    'StateSpace',
    'BuildStateSpaceStep',
    'FitPricingFunctionStep',
    'PredictPricesStep',
]
|
||||
53
experiments/procesing/steps/augment.py
Executable file
53
experiments/procesing/steps/augment.py
Executable file
@@ -0,0 +1,53 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from procesing.steps.base import BaseContextStep
|
||||
|
||||
class CreatePriceBucketsStep(BaseContextStep):
    """
    Add a 'price_bucket' column derived from 'metadata_price'.

    The frame is modified in place and returned. The number of quantile
    buckets comes from context.config['n_price_buckets'] (default 5).
    """

    def transform(self, df: pd.DataFrame):
        # nothing to bucket: still guarantee the column exists
        if df.empty or 'metadata_price' not in df.columns:
            df['price_bucket'] = ""
            return df

        bucket_count = self.context.config.get('n_price_buckets', 5)
        prices = df['metadata_price']

        if prices.notnull().any():
            df['price_bucket'] = self._bucketize(prices, bucket_count)
        else:
            # column present but entirely null
            df['price_bucket'] = pd.Series([""] * len(df), index=df.index)
        return df

    @staticmethod
    def _bucketize(prices: pd.Series, bucket_count: int):
        """Quantile-bucket prices as PB_1..PB_n, or label by integer price on failure."""
        quantile_labels = [f"PB_{i+1}" for i in range(bucket_count)]
        try:
            return pd.qcut(
                prices,
                q=bucket_count,
                labels=quantile_labels,
                duplicates='drop',
            )
        except ValueError:
            # qcut could not build the requested bins (e.g. too few distinct
            # prices): fall back to one label per truncated price value
            return prices.apply(
                lambda value: "" if pd.isnull(value) else f"P_{int(value)}"
            )
|
||||
|
||||
|
||||
class AugmentEventNamesStep(BaseContextStep):
    """
    Append a "_<productId>@<price_bucket>" suffix to event names.

    Adds a 'metadata_schema' column holding the suffix and concatenates it onto
    'eventName'. The frame is modified in place and returned.

    A row gets a suffix only when both 'productId' and 'price_bucket' are
    non-null. If either column is absent, every suffix is the empty string and
    event names are left unchanged (previously this raised KeyError).

    NOTE(review): an empty-string bucket counts as non-null, so such rows get a
    "_<productId>@" suffix - confirm this is intended.
    """

    def transform(self, df: pd.DataFrame):
        if df.empty:
            return df

        if 'productId' in df.columns and 'price_bucket' in df.columns:
            has_both = df['productId'].notnull() & df['price_bucket'].notnull()
            # Create schema: _productId@price_bucket
            suffix = "_" + df['productId'].astype(str) + "@" + df['price_bucket'].astype(str)
            df['metadata_schema'] = np.where(has_both, suffix, "")
        else:
            # without both columns there is nothing to encode
            df['metadata_schema'] = ""

        df['eventName'] = df['eventName'] + df['metadata_schema']
        return df
|
||||
31
experiments/procesing/steps/base.py
Executable file
31
experiments/procesing/steps/base.py
Executable file
@@ -0,0 +1,31 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from sklearn.base import BaseEstimator, TransformerMixin
|
||||
from procesing.context import PipelineContext
|
||||
|
||||
class BaseContextStep(BaseEstimator, TransformerMixin, ABC):
    """
    Abstract parent of every pipeline step.

    A step holds a single PipelineContext and must implement transform().
    fit() is a no-op, and the sklearn parameter API exposes only 'context'.
    """

    def __init__(self, context: PipelineContext):
        self.context = context

    def fit(self, X=None, y=None):
        """No training by default; returns the step itself."""
        return self

    @abstractmethod
    def transform(self, X):
        """Transform input using context. Must be implemented by subclass."""

    def get_params(self, deep=True):
        """sklearn compatibility: the context is the only parameter."""
        return dict(context=self.context)

    def set_params(self, **params):
        """sklearn compatibility: only 'context' is honoured, other keys are ignored."""
        missing = object()
        new_context = params.get('context', missing)
        if new_context is not missing:
            self.context = new_context
        return self
|
||||
34
experiments/procesing/steps/chunk.py
Executable file
34
experiments/procesing/steps/chunk.py
Executable file
@@ -0,0 +1,34 @@
|
||||
import pandas as pd
|
||||
from procesing.steps.base import BaseContextStep
|
||||
|
||||
class ChunkByTimeWindowStep(BaseContextStep):
    """
    Split a dataframe into consecutive time windows.

    The timestamp column is context.config['ts_col'] (default 'ts') and the
    window length is context.window_size. Returns one dict per non-empty
    window with keys 'window_start', 'window_end', 'window_idx' and 'data';
    an empty input yields an empty list. The input frame is not modified.
    """

    def transform(self, df: pd.DataFrame):
        if df.empty:
            return []

        ts_col = self.context.config.get('ts_col', 'ts')
        window_size = self.context.window_size

        frame = df.copy()
        # parse timestamps only when the column is not datetime already
        if not pd.api.types.is_datetime64_any_dtype(frame[ts_col]):
            frame[ts_col] = pd.to_datetime(frame[ts_col])

        frame = frame.sort_values(ts_col)
        # window key = timestamp floored to the window length
        frame['_window'] = frame[ts_col].dt.floor(window_size)

        return [
            {
                'window_start': start,
                'window_end': start + pd.Timedelta(window_size),
                'window_idx': position,
                'data': rows.drop(columns=['_window']),
            }
            for position, (start, rows) in enumerate(frame.groupby('_window'))
        ]
|
||||
61
experiments/procesing/steps/demand.py
Executable file
61
experiments/procesing/steps/demand.py
Executable file
@@ -0,0 +1,61 @@
|
||||
import pandas as pd
|
||||
from procesing.steps.base import BaseContextStep
|
||||
|
||||
class ComputeDemandStep(BaseContextStep):
    """
    Count interactions per product for one time window or one raw dataframe.

    Input: a chunk dict (interactions under 'data') OR a raw dataframe
    Output: dataframe [productId, demand_score] with one row per id in
        context.products['id']. When the chunk dict carries other keys, a dict
        of those keys plus 'demand_vector' is returned instead.

    Optional config keys 'session_filter' and 'experiment_filter' restrict the
    interactions to one sessionId / experimentId when those columns exist.
    """

    def transform(self, chunk):
        # accept either a chunk dict or a bare dataframe
        if isinstance(chunk, dict):
            interactions = chunk['data']
            window_meta = {key: value for key, value in chunk.items() if key != 'data'}
        else:
            interactions, window_meta = chunk, {}

        unique_products = self.context.products['id'].unique()

        # optional equality filters, applied only when configured
        config = self.context.config
        session_filter = config.get('session_filter')
        experiment_filter = config.get('experiment_filter')
        for column, wanted in (('sessionId', session_filter), ('experimentId', experiment_filter)):
            if wanted and column in interactions.columns:
                interactions = interactions[interactions[column] == wanted]

        with_product = interactions.dropna(subset=['productId'])

        if with_product.empty:
            demand_df = pd.DataFrame({
                'productId': unique_products,
                'demand_score': 0
            })
        else:
            # frequency table of product ids, padded with zeros for the
            # products that never occur in this window
            counts = pd.crosstab(with_product['productId'], 'count')
            demand_df = counts.reindex(unique_products, fill_value=0).reset_index()
            demand_df.columns = ['productId', 'demand_score']

        # keep the window metadata attached when there is any
        return {**window_meta, 'demand_vector': demand_df} if window_meta else demand_df
|
||||
|
||||
|
||||
class ComputeDemandForChunksStep(BaseContextStep):
    """Run ComputeDemandStep over every chunk and collect the results in order."""

    def transform(self, chunks: list):
        if not chunks:
            return []

        # one shared step instance, bound to this step's context
        per_chunk = ComputeDemandStep(self.context).transform
        return list(map(per_chunk, chunks))
|
||||
253
experiments/procesing/steps/elasticity.py
Executable file
253
experiments/procesing/steps/elasticity.py
Executable file
@@ -0,0 +1,253 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from typing import Dict, List
|
||||
from procesing.steps.base import BaseContextStep
|
||||
|
||||
class AggregatePriceLogsStep(BaseContextStep):
    """
    Aggregate price logs into per-window price vectors.

    Input: price_logs_df with the timestamp column named by
        context.config['ts_col'] (default 'ts'), plus 'productId' and 'price'
    Output: list of dicts with 'window_start', 'window_end' and 'price_vector',
        the latter a dataframe [productId, price]. Empty input yields [].

    Prices are averaged per product and window of length context.window_size,
    and gaps inside a product's own series are forward filled.
    """

    def transform(self, price_logs_df: pd.DataFrame):
        if price_logs_df.empty:
            return []

        ts_col = self.context.config.get('ts_col', 'ts')
        window_size = self.context.window_size

        logs = price_logs_df.copy()
        # parse timestamps only when the column is not datetime already
        if not pd.api.types.is_datetime64_any_dtype(logs[ts_col]):
            logs[ts_col] = pd.to_datetime(logs[ts_col])
        logs = logs.sort_values([ts_col, 'productId'])

        catalogue_ids = self.context.products['id'].unique()

        by_time = logs.set_index(ts_col)

        # mean price per product per window
        per_product_price = by_time.groupby('productId')['price']
        windowed = per_product_price.resample(window_size).mean().reset_index()

        # windows without observations carry the product's previous mean
        windowed = windowed.sort_values([ts_col, 'productId'])
        windowed['price'] = windowed.groupby('productId')['price'].ffill()
        windowed = windowed.dropna(subset=['price'])

        chunks = []
        for window_start, rows in windowed.groupby(ts_col):
            price_vector = rows[['productId', 'price']].copy()

            # products absent from this window fall back to their latest raw
            # log price strictly before the window start, if one exists
            # NOTE(review): this scans the raw logs once per absent product
            # per window, which may get slow on large inputs
            carried = []
            for pid in set(catalogue_ids) - set(price_vector['productId']):
                earlier = by_time.loc[
                    (by_time['productId'] == pid) & (by_time.index < window_start),
                    'price',
                ]
                if not earlier.empty:
                    carried.append(
                        pd.DataFrame({'productId': [pid], 'price': [earlier.iloc[-1]]})
                    )

            if carried:
                price_vector = pd.concat([price_vector, *carried], ignore_index=True)

            if not price_vector.empty:
                chunks.append({
                    'window_start': window_start,
                    'window_end': window_start + pd.Timedelta(window_size),
                    'price_vector': price_vector
                })

        return chunks
|
||||
|
||||
|
||||
class ComputeElasticityStep(BaseContextStep):
    """
    Estimate a price elasticity of demand per product from windowed data.

    Input: (demand_chunks, price_chunks)
        demand_chunks: dicts with 'window_start', 'window_end' and a
            'demand_vector' dataframe [productId, demand_score]
        price_chunks: dicts with 'window_start' and a 'price_vector'
            dataframe [productId, price]
    Output: elasticity_df [productId, elasticity, std_error, n_obs] with one
        row per observed product plus a zero row for every id in
        context.products['id'] that has no observations.

    Config keys read from context.config:
        elasticity_method: 'point' (default) or 'arc'
        min_observations: shortest series for which an estimate is attempted
            (default 2)

    Raises:
        ValueError: if elasticity_method is neither 'point' nor 'arc' and at
            least one product has enough valid observations to be estimated.
    """

    def transform(self, chunk_tuple: tuple):
        demand_chunks, price_chunks = chunk_tuple

        method = self.context.config.get('elasticity_method', 'point')
        min_obs = self.context.config.get('min_observations', 2)
        all_product_ids = self.context.products['id'].unique()

        # align chunks by window_start
        aligned = self._align_chunks(demand_chunks, price_chunks)
        if not aligned:
            return self._zero_rows(all_product_ids)

        # build time series per product
        product_series = self._build_timeseries(aligned)

        rows = []
        for pid, series in product_series.items():
            if len(series) < min_obs:
                estimate = {'value': 0.0, 'std_error': 0.0}
            else:
                estimate = self._compute_elasticity(series, method)
            rows.append({
                'productId': pid,
                'elasticity': estimate['value'],
                'std_error': estimate.get('std_error', 0.0),
                'n_obs': len(series)
            })

        # aligned windows may share no product at all; an empty list would
        # give a column-less frame and break the 'productId' lookup below
        if not rows:
            return self._zero_rows(all_product_ids)

        result_df = pd.DataFrame(rows)

        # fill missing products with zero elasticity
        observed_pids = set(result_df['productId'])
        missing_pids = [p for p in all_product_ids if p not in observed_pids]
        if missing_pids:
            result_df = pd.concat(
                [result_df, self._zero_rows(missing_pids)], ignore_index=True
            )

        return result_df

    @staticmethod
    def _zero_rows(product_ids):
        """Build result rows with zero elasticity, zero error and no observations."""
        return pd.DataFrame({
            'productId': product_ids,
            'elasticity': 0.0,
            'std_error': 0.0,
            'n_obs': 0
        })

    def _align_chunks(self, demand_chunks: List[Dict], price_chunks: List[Dict]):
        """Pair each demand chunk with the price chunk sharing its window_start.

        Demand chunks without a matching price chunk are dropped; the order of
        demand_chunks is preserved.
        """
        price_lookup = {c['window_start']: c for c in price_chunks}
        return [
            {
                'window_start': dc['window_start'],
                'window_end': dc['window_end'],
                'demand': dc['demand_vector'],
                'prices': price_lookup[dc['window_start']]['price_vector']
            }
            for dc in demand_chunks
            if dc['window_start'] in price_lookup
        ]

    def _build_timeseries(self, aligned: List[Dict]):
        """Build a list of {timestamp, price, quantity} observations per product.

        Observations are appended in the order of the aligned windows.
        """
        series_by_product = {}

        for chunk in aligned:
            merged = chunk['demand'].merge(chunk['prices'], on='productId', how='inner')

            # iterate column-wise: iterrows() builds one Series per row and
            # would upcast integer product ids to float next to float prices
            for pid, price, quantity in zip(
                merged['productId'], merged['price'], merged['demand_score']
            ):
                series_by_product.setdefault(pid, []).append({
                    'timestamp': chunk['window_start'],
                    'price': price,
                    'quantity': quantity
                })

        return series_by_product

    def _compute_elasticity(self, series: List[Dict], method: str):
        """Compute point or arc elasticity from the strictly positive observations.

        Returns {'value', 'std_error'}; both are 0.0 when fewer than two
        observations have price > 0 and quantity > 0.
        """
        prices = np.array([s['price'] for s in series])
        quantities = np.array([s['quantity'] for s in series])

        # filter out zero/negative values (logs and ratios need positives)
        valid = (prices > 0) & (quantities > 0)
        if valid.sum() < 2:
            return {'value': 0.0, 'std_error': 0.0}

        prices = prices[valid]
        quantities = quantities[valid]

        if method == 'point':
            return self._point_elasticity(prices, quantities)
        elif method == 'arc':
            return self._arc_elasticity(prices, quantities)
        else:
            raise ValueError(f"Unknown elasticity method: {method}")

    def _point_elasticity(self, prices: np.ndarray, quantities: np.ndarray):
        """Point elasticity via log-log regression: log(Q) = a + b*log(P), elasticity = b"""
        n = len(prices)
        if n < 2:
            return {'value': 0.0, 'std_error': 0.0}

        log_p = np.log(prices)
        log_q = np.log(quantities)

        # constant price: the slope is undefined, report zero. Comparing the
        # extremes is exact, unlike a std() that passes through a rounded mean.
        if log_p.max() == log_p.min():
            return {'value': 0.0, 'std_error': 0.0}

        dp = log_p - log_p.mean()
        dq = log_q - log_q.mean()

        # OLS slope b = Sxy / Sxx. Both sums use the same normalisation;
        # mixing np.cov (divides by n-1) with np.var (divides by n) would
        # scale the slope by n/(n-1).
        sxx = np.dot(dp, dp)
        b = np.dot(dp, dq) / sxx

        # std error estimate (needs at least one residual degree of freedom)
        if n > 2:
            residuals = dq - b * dp
            mse = (residuals ** 2).sum() / (n - 2)
            se_b = np.sqrt(mse / sxx)
        else:
            se_b = 0.0

        return {'value': b, 'std_error': se_b}

    def _arc_elasticity(self, prices: np.ndarray, quantities: np.ndarray):
        """Arc elasticity: average period-over-period (midpoint) elasticity.

        Pairs with an unchanged price or a zero midpoint are skipped.
        """
        estimates = []

        for i in range(1, len(prices)):
            p1, p2 = prices[i - 1], prices[i]
            q1, q2 = quantities[i - 1], quantities[i]

            p_avg = (p1 + p2) / 2
            q_avg = (q1 + q2) / 2
            delta_p = p2 - p1

            if p_avg == 0 or q_avg == 0 or delta_p == 0:
                continue

            estimates.append(((q2 - q1) / q_avg) / (delta_p / p_avg))

        if not estimates:
            return {'value': 0.0, 'std_error': 0.0}

        return {
            'value': np.mean(estimates),
            'std_error': np.std(estimates) / np.sqrt(len(estimates))
        }
|
||||
46
experiments/procesing/steps/fetch.py
Executable file
46
experiments/procesing/steps/fetch.py
Executable file
@@ -0,0 +1,46 @@
|
||||
import pandas as pd
|
||||
from procesing.steps.base import BaseContextStep
|
||||
|
||||
class FetchInteractionsStep(BaseContextStep):
    """
    Fetch raw interaction data from the 'user-interactions' Kafka topic.

    The 'metadata' column, when present, is flattened into 'metadata_<key>'
    columns (nested keys joined with '.'). Rows without an eventName are
    dropped, and 'metadata_dateIndex' is copied to a nullable-integer
    'dateIndex' column when available. The input X is ignored.
    """

    def transform(self, X=None):
        df = self.context.provider.fetch_kafka_topic('user-interactions')

        if df.empty:
            return df

        # Explode metadata JSON column
        if 'metadata' in df.columns:
            original_index = df.index
            metadata = df.pop('metadata')
            flattened = pd.json_normalize(metadata.tolist(), sep='.').add_prefix('metadata_')
            # The flattened frame is numbered 0..n-1 in row order. Join it
            # positionally so a non-default index on the fetched frame cannot
            # misalign rows, then restore the original labels.
            df = df.reset_index(drop=True).join(flattened)
            df.index = original_index

        df = df.dropna(subset=['eventName'])

        # Remap dateIndex if present
        if 'metadata_dateIndex' in df.columns:
            df['dateIndex'] = df['metadata_dateIndex'].astype('Int64')

        return df
|
||||
|
||||
|
||||
class FetchPriceLogsStep(BaseContextStep):
    """Return the 'price-logs' Kafka topic as delivered by the context provider.

    The input X is ignored.
    """

    def transform(self, X=None):
        provider = self.context.provider
        return provider.fetch_kafka_topic('price-logs')
|
||||
|
||||
|
||||
class FetchExperimentsStep(BaseContextStep):
    """
    Fetch experiment metadata for the experiments referenced by interactions.

    Returns an empty dataframe when the input is empty, has no 'experimentId'
    column, or contains no non-null experiment ids.
    """

    def transform(self, interactions_df: pd.DataFrame):
        if not interactions_df.empty and 'experimentId' in interactions_df.columns:
            exp_ids = interactions_df['experimentId'].dropna().unique().tolist()
            if exp_ids:
                return self.context.provider.fetch_experiments(exp_ids)

        # nothing to look up
        return pd.DataFrame()
|
||||
34
experiments/procesing/steps/join.py
Executable file
34
experiments/procesing/steps/join.py
Executable file
@@ -0,0 +1,34 @@
|
||||
import pandas as pd
|
||||
from procesing.steps.base import BaseContextStep
|
||||
|
||||
class JoinExperimentsStep(BaseContextStep):
    """Left-join experiment metadata onto interactions via 'experimentId'."""

    def transform(self, data: tuple):
        """
        Args:
            data: (interactions_df, experiments_df)
        Returns:
            interactions merged with the renamed experiment columns, or the
            interactions unchanged when experiments_df is empty
        """
        interactions_df, experiments_df = data

        if experiments_df.empty:
            return interactions_df

        # Expand the nested 'task' records into their own columns
        if 'task' in experiments_df.columns:
            has_task = experiments_df['task'].notnull()
            if has_task.any():
                task_norm = pd.json_normalize(experiments_df['task'].dropna())
                # put the flattened rows back on the labels they came from
                task_norm.index = experiments_df.loc[has_task].index
                without_task = experiments_df.drop(columns=['task'])
                experiments_df = without_task.join(task_norm, rsuffix='_task')

        # Rename for clarity
        column_names = {
            'id': 'experimentId',
            'subject_name': 'exp_subject',
            'xp_human_only': 'exp_human_only',
            'xp_market_mode': 'exp_market_mode',
            'xp_task_id': 'exp_task_id'
        }
        experiments_df = experiments_df.rename(columns=column_names)

        return interactions_df.merge(experiments_df, on='experimentId', how='left')
|
||||
149
experiments/procesing/steps/pricing.py
Executable file
149
experiments/procesing/steps/pricing.py
Executable file
@@ -0,0 +1,149 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from typing import Optional, List, Dict, Any
|
||||
from dataclasses import dataclass, field
|
||||
from procesing.steps.base import BaseContextStep
|
||||
from procesing.pricers import ElasticityBasedPricer
|
||||
|
||||
@dataclass
class StateSpace:
    """
    State representation for pricing functions.

    Components:
        Q_t: demand ∈ R^n (current demand signal per product)
        P_t: prices ∈ R^n (current/base prices)
        S_t: session_features (behavioral signals, interaction data)
        H_t: history = {Q_{t-k}, P_{t-k}, S_{t-k}} for k in [1, history_length]

    Additionally stores:
        - product_ids: product identifiers (n,)
        - elasticity: price elasticity per product (n,)
        - metadata: arbitrary context (experiment_id, timestamp, etc.)

    Raises:
        AssertionError: on construction, when prices, elasticity or
            product_ids do not have the same length as demand.
    """
    demand: np.ndarray  # Q_t ∈ R^n
    prices: np.ndarray  # P_t ∈ R^n
    session_features: pd.DataFrame = field(default_factory=pd.DataFrame)  # S_t

    # augmented state components
    product_ids: Optional[np.ndarray] = None
    elasticity: Optional[np.ndarray] = None

    # historical trajectory H_t = {(Q_{t-k}, P_{t-k}, S_{t-k})}, oldest first
    history: List[Dict[str, Any]] = field(default_factory=list)

    # metadata for context
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """Validate dimensions."""
        # NOTE(review): assert statements are skipped under `python -O`; kept
        # so the raised exception type stays AssertionError for callers.
        n = len(self.demand)
        assert len(self.prices) == n, "demand and prices must have same dimension"
        if self.elasticity is not None:
            assert len(self.elasticity) == n, "elasticity must match dimension"
        if self.product_ids is not None:
            assert len(self.product_ids) == n, "product_ids must match dimension"

    @property
    def n_products(self) -> int:
        """Number of products in state space."""
        return len(self.demand)

    def add_history(self, q: np.ndarray, p: np.ndarray, s: pd.DataFrame, max_length: int = 10):
        """Append a historical state to H_t and keep at most max_length entries.

        The oldest entries are discarded first. Every surplus entry is removed,
        so a history that was already too long is cut back to max_length.
        """
        self.history.append({'demand': q, 'prices': p, 'session_features': s})
        overflow = len(self.history) - max_length
        if overflow > 0:
            del self.history[:overflow]

    def get_history_window(self, k: int = 5) -> List[Dict[str, Any]]:
        """Return a new list with the last k historical states, oldest first.

        Fewer than k entries are returned when the history is shorter, and an
        empty list when k <= 0.
        """
        if k <= 0:
            # history[-0:] would be the whole list, not an empty window
            return []
        return self.history[-k:]
|
||||
|
||||
|
||||
class BuildStateSpaceStep(BaseContextStep):
    """
    Build state space from elasticity, demand, and price data.

    Input: elasticity_df [productId, elasticity, ...], optional demand_df
        [productId, demand] or [productId, demand_score] (the column name
        emitted by ComputeDemandStep); 'demand' wins when both are present
    Output: StateSpace instance with Q_t, P_t, elasticity, product_ids, one
        entry per row of context.products

    Base prices come from the 'base_price' key of each product's 'metadata'
    dict; products without one get 0. Products without an elasticity or a
    demand row get 0.0. Without a usable demand_df every product gets a
    demand of 10.0.
    """

    # demand column names understood by transform(), in order of preference
    _DEMAND_COLUMNS = ('demand', 'demand_score')

    def transform(self, elasticity_df: pd.DataFrame, demand_df: Optional[pd.DataFrame] = None):
        products = self.context.products

        # extract base prices from product metadata
        products_with_prices = products.copy()
        if 'metadata' in products_with_prices.columns:
            products_with_prices['base_price'] = products_with_prices['metadata'].apply(
                lambda m: m.get('base_price', 0) if isinstance(m, dict) else 0
            )
        else:
            products_with_prices['base_price'] = 0

        # merge with elasticity
        merged = products_with_prices[['id', 'base_price']].rename(
            columns={'id': 'productId'}
        ).merge(
            elasticity_df[['productId', 'elasticity']],
            on='productId',
            how='left'
        ).fillna({'elasticity': 0.0, 'base_price': 0.0})

        # merge with demand if provided, else use default
        demand_column = self._find_demand_column(demand_df)
        if demand_column is not None:
            demand_values = demand_df[['productId', demand_column]].rename(
                columns={demand_column: 'demand'}
            )
            merged = merged.merge(
                demand_values,
                on='productId',
                how='left'
            ).fillna({'demand': 0.0})
            demand_vector = merged['demand'].values
        else:
            # default: uniform demand
            demand_vector = np.ones(len(merged)) * 10.0

        return StateSpace(
            demand=demand_vector,
            prices=merged['base_price'].values,
            session_features=pd.DataFrame(),
            product_ids=merged['productId'].values,
            elasticity=merged['elasticity'].values,
            # local wall-clock time of construction, without timezone
            metadata={'timestamp': pd.Timestamp.now().isoformat()}
        )

    @classmethod
    def _find_demand_column(cls, demand_df):
        """Return the first recognised demand column of demand_df, or None."""
        if demand_df is None:
            return None
        for name in cls._DEMAND_COLUMNS:
            if name in demand_df.columns:
                return name
        return None
|
||||
|
||||
|
||||
class FitPricingFunctionStep(BaseContextStep):
    """
    Instantiate and fit the configured pricing function.

    Input: elasticity_df
    Output: the fitted pricer instance

    The class comes from context.config['pricing_function_class'] (default
    ElasticityBasedPricer) and its keyword arguments from
    context.config['pricing_function_params'] (default: none).
    """

    def transform(self, elasticity_df: pd.DataFrame):
        config = self.context.config
        pricer_cls = config.get('pricing_function_class', ElasticityBasedPricer)
        pricer_kwargs = config.get('pricing_function_params', {})

        fitted = pricer_cls(**pricer_kwargs)
        fitted.fit(elasticity_df)
        return fitted
|
||||
|
||||
|
||||
class PredictPricesStep(BaseContextStep):
    """
    Predict prices with a fitted pricing function.

    Input: (pricer, state_space)
    Output: prices_df [productId, predicted_price]

    NOTE(review): product ids are taken from context.products, not from the
    state space, so this assumes pricer.predict() returns one value per row of
    context.products in the same order - confirm against the pricers.
    """

    def transform(self, data: tuple):
        pricer, state_space = data

        ids = self.context.products['id'].values
        prices = pricer.predict(state_space)

        columns = {'productId': ids, 'predicted_price': prices}
        return pd.DataFrame(columns)
|
||||
114
experiments/procesing/steps/session.py
Normal file
114
experiments/procesing/steps/session.py
Normal file
@@ -0,0 +1,114 @@
|
||||
"""
|
||||
Session feature extraction for S_t component of state space.
|
||||
Computes behavioral signals from interaction data already in pipeline.
|
||||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from typing import Optional, Dict, Any
|
||||
from collections import Counter
|
||||
from procesing.steps.base import BaseContextStep
|
||||
|
||||
|
||||
class ExtractSessionFeaturesStep(BaseContextStep):
    """
    Extract session-level behavioral features from interaction logs.

    Input: interactions_df (user-interactions from earlier pipeline step)
    Output: session_features DataFrame with one row per sessionId; an empty
        DataFrame when the input is empty

    Features computed:
        - total_interactions: count of all events
        - page_views: 'page_view' plus 'view_item_page' events
        - item_views: 'view_item_page' events
        - searches: 'search' events
        - cart_adds: 'add_item_to_cart' events
        - hovers: hover_over_title/paragraph/link/button events
        - unique_products_viewed: distinct non-null product IDs
        - product_view_depth: max events for a single product (attention signal)
        - session_duration_sec: time span of session
        - interaction_velocity: events per minute
        - avg_time_between_events: mean inter-event time in seconds
        - std_time_between_events: sample std of inter-event time in seconds
        - cart_to_view_ratio: cart_adds / item_views (0.0 without item views)

    Temporal features are 0.0 when there is no 'ts' column. The mean is 0.0
    when no inter-event gap exists and the std is 0.0 with fewer than two
    gaps, instead of NaN.
    """

    _HOVER_EVENTS = ('hover_over_title', 'hover_over_paragraph', 'hover_over_link', 'hover_over_button')

    def transform(self, interactions_df: pd.DataFrame) -> pd.DataFrame:
        if interactions_df.empty:
            return pd.DataFrame()

        # parse timestamps on a copy so the caller's frame is left untouched
        if 'ts' in interactions_df.columns:
            interactions_df = interactions_df.copy()
            interactions_df['ts'] = pd.to_datetime(interactions_df['ts'])

        # group by session and compute features
        return pd.DataFrame([
            self._extract_features_for_session(session_id, session_df)
            for session_id, session_df in interactions_df.groupby('sessionId')
        ])

    def _extract_features_for_session(self, session_id: str, session_df: pd.DataFrame) -> Dict[str, Any]:
        """Compute features for single session."""
        features = {'sessionId': session_id}

        # basic counts
        features['total_interactions'] = len(session_df)

        event_counts = session_df['eventName'].value_counts().to_dict()
        features['page_views'] = event_counts.get('page_view', 0) + event_counts.get('view_item_page', 0)
        features['item_views'] = event_counts.get('view_item_page', 0)
        features['searches'] = event_counts.get('search', 0)
        features['cart_adds'] = event_counts.get('add_item_to_cart', 0)

        # hover events
        features['hovers'] = sum(event_counts.get(ev, 0) for ev in self._HOVER_EVENTS)

        # product-level signals
        product_ids = session_df['productId'].dropna()
        features['unique_products_viewed'] = product_ids.nunique()
        if len(product_ids) > 0:
            features['product_view_depth'] = max(Counter(product_ids).values())
        else:
            features['product_view_depth'] = 0

        # temporal features
        if 'ts' in session_df.columns:
            timestamps = session_df['ts'].sort_values()
            duration = (timestamps.max() - timestamps.min()).total_seconds()
            features['session_duration_sec'] = duration

            if duration > 0:
                features['interaction_velocity'] = (features['total_interactions'] / duration) * 60
            else:
                features['interaction_velocity'] = 0.0

            # inter-event timing
            gaps = timestamps.diff().dropna().dt.total_seconds()
            features['avg_time_between_events'] = gaps.mean() if len(gaps) > 0 else 0.0
            # the sample std of a single gap is NaN, so it needs two gaps
            features['std_time_between_events'] = gaps.std() if len(gaps) > 1 else 0.0
        else:
            features['session_duration_sec'] = 0.0
            features['interaction_velocity'] = 0.0
            features['avg_time_between_events'] = 0.0
            features['std_time_between_events'] = 0.0

        # cart/conversion signals
        features['cart_to_view_ratio'] = features['cart_adds'] / features['item_views'] if features['item_views'] > 0 else 0.0

        return features
|
||||
|
||||
|
||||
class FilterSessionInteractionsStep(BaseContextStep):
    """
    Keep only the interactions that belong to one session.

    Input: (interactions_df, session_id)
    Output: copy of the rows whose 'sessionId' equals session_id
    """

    def transform(self, data: tuple) -> pd.DataFrame:
        interactions_df, session_id = data
        in_session = interactions_df['sessionId'] == session_id
        return interactions_df[in_session].copy()
|
||||
Reference in New Issue
Block a user