mirror of
https://github.com/velocitatem/PHANTOM.git
synced 2026-05-31 08:33:36 +00:00
feature: cleaning up pipeline
This commit is contained in:
@@ -21,7 +21,6 @@ from procesing.steps import (
|
|||||||
from procesing.pipelines import (
|
from procesing.pipelines import (
|
||||||
interaction_extraction_pipeline,
|
interaction_extraction_pipeline,
|
||||||
price_extraction_pipeline,
|
price_extraction_pipeline,
|
||||||
elasticity_computation_pipeline,
|
|
||||||
pricing_pipeline,
|
pricing_pipeline,
|
||||||
full_pipeline,
|
full_pipeline,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -2,7 +2,6 @@ from sklearn.pipeline import Pipeline
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from procesing.context import PipelineContext
|
from procesing.context import PipelineContext
|
||||||
from procesing.providers import SupabaseProvider, BackendAPIProvider
|
from procesing.providers import SupabaseProvider, BackendAPIProvider
|
||||||
from typing import Union
|
|
||||||
from procesing.steps import (
|
from procesing.steps import (
|
||||||
FetchInteractionsStep,
|
FetchInteractionsStep,
|
||||||
FetchPriceLogsStep,
|
FetchPriceLogsStep,
|
||||||
@@ -17,6 +16,8 @@ from procesing.steps import (
|
|||||||
# BuildStateSpaceStep,
|
# BuildStateSpaceStep,
|
||||||
FitPricingFunctionStep,
|
FitPricingFunctionStep,
|
||||||
PredictPricesStep,
|
PredictPricesStep,
|
||||||
|
ComputeDemandStep,
|
||||||
|
JoinProductFeaturesStep
|
||||||
)
|
)
|
||||||
|
|
||||||
def interaction_extraction_pipeline(context: PipelineContext):
|
def interaction_extraction_pipeline(context: PipelineContext):
|
||||||
@@ -35,80 +36,127 @@ def price_extraction_pipeline(context: PipelineContext):
|
|||||||
])
|
])
|
||||||
|
|
||||||
|
|
||||||
def elasticity_computation_pipeline(context: PipelineContext,
|
def product_features_pipeline(context: PipelineContext,
|
||||||
interactions_df: pd.DataFrame,
|
interactions_df: pd.DataFrame,
|
||||||
price_logs_df: pd.DataFrame):
|
price_logs_df: pd.DataFrame):
|
||||||
"""
|
# elasticity_step = ComputeElasticityStep(context)
|
||||||
Compute elasticity from interactions and price logs.
|
demand_step = ComputeDemandStep(context)
|
||||||
Manual orchestration needed for branching logic.
|
|
||||||
"""
|
|
||||||
# branch 1: chunk interactions and compute demand
|
|
||||||
chunk_step = ChunkByTimeWindowStep(context)
|
|
||||||
interaction_chunks = chunk_step.transform(interactions_df)
|
|
||||||
|
|
||||||
demand_step = ComputeDemandForChunksStep(context)
|
|
||||||
demand_chunks = demand_step.transform(interaction_chunks)
|
|
||||||
|
|
||||||
# branch 2: aggregate price logs
|
|
||||||
price_step = AggregatePriceLogsStep(context)
|
price_step = AggregatePriceLogsStep(context)
|
||||||
price_chunks = price_step.transform(price_logs_df)
|
join_step = JoinProductFeaturesStep(context)
|
||||||
|
|
||||||
# convergence: compute elasticity
|
|
||||||
elasticity_step = ComputeElasticityStep(context)
|
|
||||||
elasticity_df = elasticity_step.transform((demand_chunks, price_chunks))
|
|
||||||
|
|
||||||
return elasticity_df
|
|
||||||
|
|
||||||
|
|
||||||
def pricing_pipeline(context: PipelineContext, elasticity_df: pd.DataFrame):
|
demand_data = demand_step.transform(interactions_df)
|
||||||
|
price_data= price_step.transform(price_logs_df)
|
||||||
|
joined_data = join_step.transform((demand_data, price_data))
|
||||||
|
|
||||||
|
return joined_data
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def pricing_pipeline(context: "PipelineContext",
|
||||||
|
data: pd.DataFrame,
|
||||||
|
high_threshold: int = 10,
|
||||||
|
low_threshold: int = 2,
|
||||||
|
surge_multiplier: float = 1.2,
|
||||||
|
discount_multiplier: float = 0.9) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Generate optimal prices from elasticity estimates.
|
Generate product-level optimal prices using simple surge pricing rules.
|
||||||
|
Replaces complex Bayesian curve fitting with threshold-based adjustments.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
context: Pipeline context
|
||||||
|
data: DataFrame with [productId, demand_score, price]
|
||||||
|
high_threshold: Demand threshold for surge pricing (default 10)
|
||||||
|
low_threshold: Demand threshold for discounts (default 2)
|
||||||
|
surge_multiplier: Price multiplier for high demand (default 1.2 = +20%)
|
||||||
|
discount_multiplier: Price multiplier for low demand (default 0.9 = -10%)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
DataFrame with [productId, current_price, optimal_price, demand_score]
|
||||||
"""
|
"""
|
||||||
# build state space
|
|
||||||
state_step = BuildStateSpaceStep(context)
|
|
||||||
state_space = state_step.transform(elasticity_df)
|
|
||||||
|
|
||||||
# fit pricing function
|
if data.empty or 'productId' not in data.columns:
|
||||||
fit_step = FitPricingFunctionStep(context)
|
return pd.DataFrame()
|
||||||
pricer = fit_step.transform(elasticity_df)
|
|
||||||
|
|
||||||
# predict prices
|
products = context.products
|
||||||
predict_step = PredictPricesStep(context)
|
results = []
|
||||||
prices_df = predict_step.transform((pricer, state_space))
|
|
||||||
|
|
||||||
return prices_df
|
for pid in data['productId'].unique():
|
||||||
|
prod_data = data[data['productId'] == pid]
|
||||||
|
|
||||||
|
if prod_data.empty:
|
||||||
|
continue
|
||||||
|
|
||||||
|
demand = prod_data["demand_score"].mean()
|
||||||
|
current_price = prod_data["price"].mean()
|
||||||
|
|
||||||
|
# get base price from metadata or use current price
|
||||||
|
prod_meta = products[products['id'] == pid]
|
||||||
|
if not prod_meta.empty:
|
||||||
|
meta = prod_meta.iloc[0]['metadata']
|
||||||
|
base_price = meta.get('base_price', current_price) if isinstance(meta, dict) else current_price
|
||||||
|
else:
|
||||||
|
base_price = current_price
|
||||||
|
|
||||||
|
# apply surge rules
|
||||||
|
if demand >= high_threshold:
|
||||||
|
optimal_price = base_price * surge_multiplier
|
||||||
|
elif demand <= low_threshold:
|
||||||
|
optimal_price = base_price * discount_multiplier
|
||||||
|
else:
|
||||||
|
optimal_price = base_price
|
||||||
|
|
||||||
|
results.append({
|
||||||
|
'productId': pid,
|
||||||
|
'current_price': current_price,
|
||||||
|
'base_price': base_price,
|
||||||
|
'optimal_price': optimal_price,
|
||||||
|
'demand_score': demand
|
||||||
|
})
|
||||||
|
|
||||||
|
return pd.DataFrame(results)
|
||||||
|
|
||||||
|
|
||||||
def full_pipeline(context: PipelineContext):
|
|
||||||
|
|
||||||
|
|
||||||
|
def full_pipeline(context: PipelineContext,
|
||||||
|
high_threshold: int = 10,
|
||||||
|
low_threshold: int = 2,
|
||||||
|
surge_multiplier: float = 1.2,
|
||||||
|
discount_multiplier: float = 0.9):
|
||||||
"""
|
"""
|
||||||
Complete end-to-end pipeline: data extraction -> elasticity -> pricing
|
Complete end-to-end pipeline: data extraction -> demand/price aggregation -> surge pricing
|
||||||
Returns: (elasticity_df, prices_df)
|
|
||||||
|
Args:
|
||||||
|
context: Pipeline context
|
||||||
|
high_threshold: Demand threshold for surge pricing
|
||||||
|
low_threshold: Demand threshold for discounts
|
||||||
|
surge_multiplier: Price multiplier for high demand
|
||||||
|
discount_multiplier: Price multiplier for low demand
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tuple: (product_features_df, optimal_prices_df)
|
||||||
|
- product_features_df: [productId, demand_score, price]
|
||||||
|
- optimal_prices_df: [productId, current_price, optimal_price, demand_score]
|
||||||
"""
|
"""
|
||||||
# extract interactions
|
|
||||||
interaction_pipe = interaction_extraction_pipeline(context)
|
interaction_pipe = interaction_extraction_pipeline(context)
|
||||||
interactions_df = interaction_pipe.fit_transform(None)
|
|
||||||
|
|
||||||
# extract price logs
|
|
||||||
price_pipe = price_extraction_pipeline(context)
|
price_pipe = price_extraction_pipeline(context)
|
||||||
|
|
||||||
|
interactions_df = interaction_pipe.fit_transform(None)
|
||||||
price_logs_df = price_pipe.fit_transform(None)
|
price_logs_df = price_pipe.fit_transform(None)
|
||||||
|
product_features_df = product_features_pipeline(context, interactions_df, price_logs_df)
|
||||||
|
print(product_features_df.to_string())
|
||||||
|
|
||||||
if interactions_df.empty or price_logs_df.empty:
|
# generate optimal prices using surge rules
|
||||||
return None, None
|
optimal_prices_df = pricing_pipeline(context, product_features_df,
|
||||||
|
high_threshold=high_threshold,
|
||||||
|
low_threshold=low_threshold,
|
||||||
|
surge_multiplier=surge_multiplier,
|
||||||
|
discount_multiplier=discount_multiplier)
|
||||||
|
|
||||||
# compute elasticity
|
return product_features_df, optimal_prices_df
|
||||||
elasticity_df = elasticity_computation_pipeline(
|
|
||||||
context,
|
|
||||||
interactions_df,
|
|
||||||
price_logs_df
|
|
||||||
)
|
|
||||||
|
|
||||||
if elasticity_df is None or elasticity_df.empty:
|
|
||||||
return elasticity_df, None
|
|
||||||
|
|
||||||
# generate prices
|
|
||||||
prices_df = pricing_pipeline(context, elasticity_df)
|
|
||||||
|
|
||||||
return elasticity_df, prices_df
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -140,20 +188,7 @@ if __name__ == '__main__':
|
|||||||
context = PipelineContext(
|
context = PipelineContext(
|
||||||
provider=HistoricalProvider(),
|
provider=HistoricalProvider(),
|
||||||
store_mode='hotel',
|
store_mode='hotel',
|
||||||
# 15 min not month
|
|
||||||
window_size='15min',
|
|
||||||
)
|
)
|
||||||
|
|
||||||
elasticity_df, prices_df = full_pipeline(context)
|
product_features, prices = full_pipeline(context)
|
||||||
|
print(prices.to_string())
|
||||||
if elasticity_df is not None and not elasticity_df.empty:
|
|
||||||
print("Elasticity Estimates:")
|
|
||||||
print(elasticity_df.to_string(index=False))
|
|
||||||
else:
|
|
||||||
print("No elasticity estimates computed.")
|
|
||||||
|
|
||||||
if prices_df is not None and not prices_df.empty:
|
|
||||||
print("\nPredicted Prices:")
|
|
||||||
print(prices_df.to_string(index=False))
|
|
||||||
else:
|
|
||||||
print("No prices predicted.")
|
|
||||||
|
|||||||
@@ -1,13 +1,12 @@
|
|||||||
from procesing.steps.base import BaseContextStep
|
from procesing.steps.base import BaseContextStep
|
||||||
from procesing.steps.fetch import FetchInteractionsStep, FetchPriceLogsStep, FetchExperimentsStep
|
from procesing.steps.fetch import FetchInteractionsStep, FetchPriceLogsStep, FetchExperimentsStep
|
||||||
from procesing.steps.join import JoinExperimentsStep
|
from procesing.steps.join import JoinExperimentsStep, JoinProductFeaturesStep
|
||||||
from procesing.steps.augment import CreatePriceBucketsStep, AugmentEventNamesStep
|
from procesing.steps.augment import CreatePriceBucketsStep, AugmentEventNamesStep, AugmentInteractionsStep
|
||||||
from procesing.steps.chunk import ChunkByTimeWindowStep
|
from procesing.steps.chunk import ChunkByTimeWindowStep
|
||||||
from procesing.steps.demand import ComputeDemandStep, ComputeDemandForChunksStep
|
from procesing.steps.demand import ComputeDemandStep, ComputeDemandForChunksStep
|
||||||
from procesing.steps.elasticity import AggregatePriceLogsStep, ComputeElasticityStep
|
from procesing.steps.elasticity import AggregatePriceLogsStep, ComputeElasticityStep
|
||||||
from procesing.steps.pricing import FitPricingFunctionStep, PredictPricesStep
|
from procesing.steps.pricing import FitPricingFunctionStep, PredictPricesStep
|
||||||
from procesing.steps.session import ExtractSessionFeaturesStep, _extract_features_for_session
|
from procesing.steps.session import ExtractSessionFeaturesStep, _extract_features_for_session
|
||||||
# StateSpace, BuildStateSpaceStep,
|
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'BaseContextStep',
|
'BaseContextStep',
|
||||||
@@ -15,8 +14,10 @@ __all__ = [
|
|||||||
'FetchPriceLogsStep',
|
'FetchPriceLogsStep',
|
||||||
'FetchExperimentsStep',
|
'FetchExperimentsStep',
|
||||||
'JoinExperimentsStep',
|
'JoinExperimentsStep',
|
||||||
|
'JoinProductFeaturesStep',
|
||||||
'CreatePriceBucketsStep',
|
'CreatePriceBucketsStep',
|
||||||
'AugmentEventNamesStep',
|
'AugmentEventNamesStep',
|
||||||
|
'AugmentInteractionsStep',
|
||||||
'ChunkByTimeWindowStep',
|
'ChunkByTimeWindowStep',
|
||||||
'ComputeDemandStep',
|
'ComputeDemandStep',
|
||||||
'ComputeDemandForChunksStep',
|
'ComputeDemandForChunksStep',
|
||||||
|
|||||||
@@ -2,6 +2,93 @@ import numpy as np
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from procesing.steps.base import BaseContextStep
|
from procesing.steps.base import BaseContextStep
|
||||||
|
|
||||||
|
|
||||||
|
class AugmentInteractionsStep(BaseContextStep):
|
||||||
|
"""
|
||||||
|
Consolidated step: create price buckets, augment event names, join experiments.
|
||||||
|
Input: (interactions_df, price_logs_df)
|
||||||
|
Output: enriched interactions_df
|
||||||
|
"""
|
||||||
|
|
||||||
|
def transform(self, data: tuple):
|
||||||
|
interactions_df, price_logs_df = data
|
||||||
|
|
||||||
|
if interactions_df.empty:
|
||||||
|
return interactions_df
|
||||||
|
|
||||||
|
# Step 1: Create price buckets
|
||||||
|
interactions_df = self._create_price_buckets(interactions_df)
|
||||||
|
|
||||||
|
# Step 2: Augment event names
|
||||||
|
interactions_df = self._augment_event_names(interactions_df)
|
||||||
|
|
||||||
|
# Step 3: Join experiments (optional)
|
||||||
|
if 'experimentId' in interactions_df.columns:
|
||||||
|
interactions_df = self._join_experiments(interactions_df)
|
||||||
|
|
||||||
|
return interactions_df
|
||||||
|
|
||||||
|
def _create_price_buckets(self, df: pd.DataFrame):
|
||||||
|
"""Create price bucket labels from price data"""
|
||||||
|
if 'metadata_price' not in df.columns:
|
||||||
|
df['price_bucket'] = ""
|
||||||
|
return df
|
||||||
|
|
||||||
|
n_buckets = self.context.config.get('n_price_buckets', 5)
|
||||||
|
|
||||||
|
if df['metadata_price'].notnull().sum() > 0:
|
||||||
|
try:
|
||||||
|
price_buckets = pd.qcut(
|
||||||
|
df['metadata_price'],
|
||||||
|
q=n_buckets,
|
||||||
|
labels=[f"PB_{i+1}" for i in range(n_buckets)],
|
||||||
|
duplicates='drop'
|
||||||
|
)
|
||||||
|
except ValueError:
|
||||||
|
# fallback for insufficient unique values
|
||||||
|
price_buckets = df['metadata_price'].apply(
|
||||||
|
lambda x: f"P_{int(x)}" if pd.notnull(x) else ""
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
price_buckets = pd.Series([""] * len(df), index=df.index)
|
||||||
|
|
||||||
|
df['price_bucket'] = price_buckets
|
||||||
|
return df
|
||||||
|
|
||||||
|
def _augment_event_names(self, df: pd.DataFrame):
|
||||||
|
"""Augment event names with product and price bucket schema"""
|
||||||
|
# Create schema: _productId@price_bucket
|
||||||
|
has_product = df.get('productId', pd.Series()).notnull()
|
||||||
|
has_bucket = df.get('price_bucket', pd.Series()).notnull()
|
||||||
|
|
||||||
|
df['metadata_schema'] = np.where(
|
||||||
|
has_product & has_bucket,
|
||||||
|
"_" + df['productId'].astype(str) + "@" + df['price_bucket'].astype(str),
|
||||||
|
""
|
||||||
|
)
|
||||||
|
|
||||||
|
df['eventName'] = df['eventName'] + df['metadata_schema']
|
||||||
|
return df
|
||||||
|
|
||||||
|
def _join_experiments(self, df: pd.DataFrame):
|
||||||
|
"""Join experiment metadata if experimentId present"""
|
||||||
|
exp_ids = df['experimentId'].dropna().unique().tolist()
|
||||||
|
if not exp_ids:
|
||||||
|
return df
|
||||||
|
|
||||||
|
experiments_df = self.context.provider.fetch_experiments(exp_ids)
|
||||||
|
if experiments_df.empty:
|
||||||
|
return df
|
||||||
|
|
||||||
|
return df.merge(
|
||||||
|
experiments_df,
|
||||||
|
left_on='experimentId',
|
||||||
|
right_on='id',
|
||||||
|
how='left',
|
||||||
|
suffixes=('', '_exp')
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class CreatePriceBucketsStep(BaseContextStep):
|
class CreatePriceBucketsStep(BaseContextStep):
|
||||||
"""Create price bucket labels from price data"""
|
"""Create price bucket labels from price data"""
|
||||||
|
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ class AggregatePriceLogsStep(BaseContextStep):
|
|||||||
|
|
||||||
df = price_logs_df.copy()
|
df = price_logs_df.copy()
|
||||||
ts_col = self.context.config.get('ts_col', 'ts')
|
ts_col = self.context.config.get('ts_col', 'ts')
|
||||||
window_size = self.context.window_size
|
#window_size = self.context.window_size WE ARE NOT USING CHUNKS ANYMORE
|
||||||
|
|
||||||
# ensure datetime
|
# ensure datetime
|
||||||
if not pd.api.types.is_datetime64_any_dtype(df[ts_col]):
|
if not pd.api.types.is_datetime64_any_dtype(df[ts_col]):
|
||||||
@@ -24,52 +24,23 @@ class AggregatePriceLogsStep(BaseContextStep):
|
|||||||
|
|
||||||
df = df.sort_values([ts_col, 'productId'])
|
df = df.sort_values([ts_col, 'productId'])
|
||||||
products = self.context.products
|
products = self.context.products
|
||||||
unique_products = products['id'].unique()
|
# get base price from metadata if available 1) read the metadata col as json and get the base_price
|
||||||
|
products['base_price'] = products.apply(
|
||||||
# VECTORIZED: group by product, resample by time window, compute mean
|
lambda row: row['metadata'].get('base_price', 0) if isinstance(row['metadata'], dict) else 0,
|
||||||
df_indexed = df.set_index(ts_col)
|
axis=1
|
||||||
|
|
||||||
windowed = (
|
|
||||||
df_indexed
|
|
||||||
.groupby('productId')['price']
|
|
||||||
.resample(window_size)
|
|
||||||
.mean()
|
|
||||||
.reset_index()
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# forward fill missing windows (carry last known price)
|
unique_products = products['id'].unique()
|
||||||
windowed = windowed.sort_values([ts_col, 'productId'])
|
|
||||||
windowed['price'] = windowed.groupby('productId')['price'].ffill()
|
|
||||||
windowed = windowed.dropna(subset=['price'])
|
|
||||||
|
|
||||||
# group into chunks by window
|
df_indexed = df.set_index(ts_col)
|
||||||
chunks = []
|
# we return a df of average price per product over the entire period
|
||||||
for window_start, group in windowed.groupby(ts_col):
|
# TODO: maybe consider different opration to handle price aggregation over time
|
||||||
price_vector = group[['productId', 'price']].copy()
|
avg_prices = df_indexed.groupby('productId')['price'].mean().reindex(unique_products, fill_value=0).reset_index()
|
||||||
|
avg_prices.columns = ['productId', 'price']
|
||||||
|
# fill 0s with base_price from products
|
||||||
|
base_price_map = products.set_index('id')['base_price'].to_dict()
|
||||||
|
return avg_prices
|
||||||
|
|
||||||
# fill missing products with last known price before this window
|
|
||||||
missing_products = set(unique_products) - set(price_vector['productId'])
|
|
||||||
if missing_products:
|
|
||||||
for pid in missing_products:
|
|
||||||
last_price = df_indexed[
|
|
||||||
(df_indexed['productId'] == pid) &
|
|
||||||
(df_indexed.index < window_start)
|
|
||||||
]['price']
|
|
||||||
|
|
||||||
if not last_price.empty:
|
|
||||||
price_vector = pd.concat([
|
|
||||||
price_vector,
|
|
||||||
pd.DataFrame({'productId': [pid], 'price': [last_price.iloc[-1]]})
|
|
||||||
], ignore_index=True)
|
|
||||||
|
|
||||||
if not price_vector.empty:
|
|
||||||
chunks.append({
|
|
||||||
'window_start': window_start,
|
|
||||||
'window_end': window_start + pd.Timedelta(window_size),
|
|
||||||
'price_vector': price_vector
|
|
||||||
})
|
|
||||||
|
|
||||||
return chunks
|
|
||||||
|
|
||||||
|
|
||||||
class ComputeElasticityStep(BaseContextStep):
|
class ComputeElasticityStep(BaseContextStep):
|
||||||
@@ -89,9 +60,9 @@ class ComputeElasticityStep(BaseContextStep):
|
|||||||
all_product_ids = products['id'].unique()
|
all_product_ids = products['id'].unique()
|
||||||
|
|
||||||
# align chunks by window_start
|
# align chunks by window_start
|
||||||
aligned = self._align_chunks(demand_chunks, price_chunks)
|
# aligned = self._align_chunks(demand_chunks, price_chunks)
|
||||||
|
|
||||||
if not aligned:
|
if None:
|
||||||
return pd.DataFrame({
|
return pd.DataFrame({
|
||||||
'productId': all_product_ids,
|
'productId': all_product_ids,
|
||||||
'elasticity': 0.0,
|
'elasticity': 0.0,
|
||||||
|
|||||||
@@ -2,7 +2,11 @@ import pandas as pd
|
|||||||
from procesing.steps.base import BaseContextStep
|
from procesing.steps.base import BaseContextStep
|
||||||
|
|
||||||
class FetchInteractionsStep(BaseContextStep):
|
class FetchInteractionsStep(BaseContextStep):
|
||||||
"""Fetch raw interaction data from Kafka topic"""
|
"""Fetch raw interaction data from Kafka topic with optional time filtering"""
|
||||||
|
|
||||||
|
def __init__(self, context, lookback: str = None):
|
||||||
|
super().__init__(context)
|
||||||
|
self.lookback = lookback
|
||||||
|
|
||||||
def transform(self, X=None):
|
def transform(self, X=None):
|
||||||
df = self.context.provider.fetch_kafka_topic('user-interactions')
|
df = self.context.provider.fetch_kafka_topic('user-interactions')
|
||||||
@@ -24,14 +28,35 @@ class FetchInteractionsStep(BaseContextStep):
|
|||||||
if 'metadata_dateIndex' in df.columns:
|
if 'metadata_dateIndex' in df.columns:
|
||||||
df['dateIndex'] = df['metadata_dateIndex'].astype('Int64')
|
df['dateIndex'] = df['metadata_dateIndex'].astype('Int64')
|
||||||
|
|
||||||
|
# Apply time filtering if lookback specified
|
||||||
|
if self.lookback and 'ts' in df.columns:
|
||||||
|
df['ts'] = pd.to_datetime(df['ts'])
|
||||||
|
cutoff = pd.Timestamp.now() - pd.Timedelta(self.lookback)
|
||||||
|
df = df[df['ts'] >= cutoff]
|
||||||
|
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
class FetchPriceLogsStep(BaseContextStep):
|
class FetchPriceLogsStep(BaseContextStep):
|
||||||
"""Fetch price log data from Kafka topic"""
|
"""Fetch price log data from Kafka topic with optional time filtering"""
|
||||||
|
|
||||||
|
def __init__(self, context, lookback: str = None):
|
||||||
|
super().__init__(context)
|
||||||
|
self.lookback = lookback
|
||||||
|
|
||||||
def transform(self, X=None):
|
def transform(self, X=None):
|
||||||
return self.context.provider.fetch_kafka_topic('price-logs')
|
df = self.context.provider.fetch_kafka_topic('price-logs')
|
||||||
|
|
||||||
|
if df.empty:
|
||||||
|
return df
|
||||||
|
|
||||||
|
# Apply time filtering if lookback specified
|
||||||
|
if self.lookback and 'ts' in df.columns:
|
||||||
|
df['ts'] = pd.to_datetime(df['ts'])
|
||||||
|
cutoff = pd.Timestamp.now() - pd.Timedelta(self.lookback)
|
||||||
|
df = df[df['ts'] >= cutoff]
|
||||||
|
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
class FetchExperimentsStep(BaseContextStep):
|
class FetchExperimentsStep(BaseContextStep):
|
||||||
|
|||||||
@@ -32,3 +32,19 @@ class JoinExperimentsStep(BaseContextStep):
|
|||||||
})
|
})
|
||||||
|
|
||||||
return interactions_df.merge(experiments_df, on='experimentId', how='left')
|
return interactions_df.merge(experiments_df, on='experimentId', how='left')
|
||||||
|
|
||||||
|
class JoinProductFeaturesStep(BaseContextStep):
|
||||||
|
"""Join product features to interactions"""
|
||||||
|
|
||||||
|
def transform(self, data: tuple):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
data: (interactions_df, products_df)
|
||||||
|
Returns:
|
||||||
|
merged interactions dataframe
|
||||||
|
"""
|
||||||
|
demand_df, price_df = data
|
||||||
|
|
||||||
|
if price_df.empty:
|
||||||
|
return demand_df
|
||||||
|
return demand_df.merge(price_df, on='productId', how='left')
|
||||||
|
|||||||
@@ -11,3 +11,4 @@ pytest-asyncio
|
|||||||
uv
|
uv
|
||||||
scikit-learn
|
scikit-learn
|
||||||
supabase
|
supabase
|
||||||
|
pymc3
|
||||||
|
|||||||
Reference in New Issue
Block a user