local pipeline excution working

This commit is contained in:
2025-11-28 13:52:41 +01:00
parent 519b3b7f93
commit eb30b04271
3 changed files with 245 additions and 189 deletions

View File

@@ -5,14 +5,27 @@ from datetime import timedelta
import pandas as pd import pandas as pd
import logging import logging
import sys import sys
import os import pickle
import io
# add procesing module to path (mounted at /opt/airflow/procesing in container) # add procesing module to path (mounted at /opt/airflow/procesing in container)
sys.path.insert(0, '/opt/airflow/procesing') sys.path.insert(0, '/opt/airflow/procesing')
from extract import KafkaDataFetcher, ExperimentJoiner, EventTitleAugmenter from context import PipelineContext
from demand import DemandEstimator, ChunkInteractionsIntoSteps from providers import SupabaseProvider, BackendAPIProvider
from elasticity import TemporalElasticityEstimator, aggregate_price_logs from steps import (
FetchInteractionsStep,
FetchPriceLogsStep,
CreatePriceBucketsStep,
AugmentEventNamesStep,
ChunkByTimeWindowStep,
ComputeDemandForChunksStep,
AggregatePriceLogsStep,
ComputeElasticityStep,
BuildStateSpaceStep,
FitPricingFunctionStep,
PredictPricesStep,
)
default_args = { default_args = {
'owner': 'phantom-research', 'owner': 'phantom-research',
@@ -23,214 +36,210 @@ default_args = {
'retry_delay': timedelta(minutes=5), 'retry_delay': timedelta(minutes=5),
} }
# callable functions for tasks (stateless, idempotent) def get_provider():
def fetch_interactions(**context): """Factory to create composite provider"""
"""Extract interaction data from Kafka and augment""" class CompositeProvider(SupabaseProvider, BackendAPIProvider):
fetcher = KafkaDataFetcher(topic='user-interactions') def __init__(self):
data = fetcher.fit_transform(None) SupabaseProvider.__init__(self)
BackendAPIProvider.__init__(self)
return CompositeProvider()
if data.empty: def get_context(**kwargs):
logging.warning("No interaction data fetched") """Build pipeline context from Airflow config"""
return None dag_conf = kwargs.get('dag_run').conf if kwargs.get('dag_run') else {}
return PipelineContext(
provider=get_provider(),
store_mode=dag_conf.get('store_mode', 'hotel'),
window_size=dag_conf.get('window_size', '30s'),
n_price_buckets=dag_conf.get('n_price_buckets', 5),
elasticity_method=dag_conf.get('elasticity_method', 'point'),
min_observations=dag_conf.get('min_observations', 2),
)
data = ExperimentJoiner().fit_transform(data) # atomic task functions (each wraps one sklearn step)
data = EventTitleAugmenter().fit_transform(data) def fetch_interactions(**kwargs):
"""Task: Fetch interaction data from Kafka"""
context = get_context(**kwargs)
step = FetchInteractionsStep(context)
df = step.transform(None)
# push to XCom for downstream tasks kwargs['ti'].xcom_push(key='interactions_raw', value=df.to_json())
context['task_instance'].xcom_push(key='interaction_data', value=data.to_json()) logging.info(f"Fetched {len(df)} interaction records")
logging.info(f"Fetched {len(data)} interaction records") return len(df)
return len(data)
def fetch_price_logs(**context): def fetch_price_logs(**kwargs):
"""Extract price logs from Kafka""" """Task: Fetch price logs from Kafka"""
fetcher = KafkaDataFetcher(topic='price-logs') context = get_context(**kwargs)
data = fetcher.fit_transform(None) step = FetchPriceLogsStep(context)
df = step.transform(None)
if data.empty: kwargs['ti'].xcom_push(key='price_logs_raw', value=df.to_json())
logging.warning("No price data fetched") logging.info(f"Fetched {len(df)} price records")
return None return len(df)
context['task_instance'].xcom_push(key='price_data', value=data.to_json()) def create_price_buckets(**kwargs):
logging.info(f"Fetched {len(data)} price records") """Task: Create price buckets for interactions"""
return len(data) ti = kwargs['ti']
interactions_json = ti.xcom_pull(key='interactions_raw')
df = pd.read_json(io.StringIO(interactions_json))
def compute_demand_chunks(**context): context = get_context(**kwargs)
"""Chunk interactions and compute demand per window""" step = CreatePriceBucketsStep(context)
import io df = step.transform(df)
ti = context['task_instance']
window_size = context['dag_run'].conf.get('window_size', '30s')
store_mode = context['dag_run'].conf.get('store_mode', 'hotel')
# pull from XCom ti.xcom_push(key='interactions_bucketed', value=df.to_json())
interaction_json = ti.xcom_pull(task_ids='fetch_interactions', key='interaction_data') logging.info(f"Created price buckets for {len(df)} interactions")
if not interaction_json: return len(df)
logging.error("No interaction data available")
return None
interactions_df = pd.read_json(io.StringIO(interaction_json)) def augment_event_names(**kwargs):
"""Task: Augment event names with product and price schema"""
ti = kwargs['ti']
interactions_json = ti.xcom_pull(key='interactions_bucketed')
df = pd.read_json(io.StringIO(interactions_json))
# chunk into windows context = get_context(**kwargs)
chunker = ChunkInteractionsIntoSteps(window_size=window_size, return_metadata=True) step = AugmentEventNamesStep(context)
chunks = chunker.transform(interactions_df) df = step.transform(df)
if not chunks: ti.xcom_push(key='interactions_final', value=df.to_json())
logging.warning("No chunks generated") logging.info(f"Augmented event names for {len(df)} interactions")
return None return len(df)
# compute demand per chunk def chunk_interactions(**kwargs):
estimator = DemandEstimator(store_mode=store_mode) """Task: Chunk interactions into time windows"""
demand_chunks = [ ti = kwargs['ti']
{ interactions_json = ti.xcom_pull(key='interactions_final')
'window_start': c['window_start'].isoformat(), df = pd.read_json(io.StringIO(interactions_json))
'window_end': c['window_end'].isoformat(),
'demand_vector': estimator.transform(c['data']).to_json()
}
for c in chunks
]
ti.xcom_push(key='demand_chunks', value=demand_chunks) context = get_context(**kwargs)
logging.info(f"Generated {len(demand_chunks)} demand chunks @ {window_size}") step = ChunkByTimeWindowStep(context)
chunks = step.transform(df)
ti.xcom_push(key='interaction_chunks', value=pickle.dumps(chunks))
logging.info(f"Generated {len(chunks)} interaction chunks")
return len(chunks)
def compute_demand(**kwargs):
"""Task: Compute demand vectors for all chunks"""
ti = kwargs['ti']
chunks = pickle.loads(ti.xcom_pull(key='interaction_chunks'))
context = get_context(**kwargs)
step = ComputeDemandForChunksStep(context)
demand_chunks = step.transform(chunks)
ti.xcom_push(key='demand_chunks', value=pickle.dumps(demand_chunks))
logging.info(f"Computed demand for {len(demand_chunks)} chunks")
return len(demand_chunks) return len(demand_chunks)
def aggregate_prices(**context): def aggregate_price_logs(**kwargs):
"""Aggregate price logs into aligned windows""" """Task: Aggregate price logs into time windows (VECTORIZED)"""
import io ti = kwargs['ti']
ti = context['task_instance'] price_logs_json = ti.xcom_pull(key='price_logs_raw')
window_size = context['dag_run'].conf.get('window_size', '30s') df = pd.read_json(io.StringIO(price_logs_json))
store_mode = context['dag_run'].conf.get('store_mode', 'hotel')
price_json = ti.xcom_pull(task_ids='fetch_price_logs', key='price_data') context = get_context(**kwargs)
if not price_json: step = AggregatePriceLogsStep(context)
logging.error("No price data available") price_chunks = step.transform(df)
return None
price_df = pd.read_json(io.StringIO(price_json)) ti.xcom_push(key='price_chunks', value=pickle.dumps(price_chunks))
price_chunks = aggregate_price_logs(price_df, window_size=window_size, store_mode=store_mode) logging.info(f"Aggregated {len(price_chunks)} price chunks (vectorized)")
# serialize for XCom
serialized = [
{
'window_start': c['window_start'].isoformat(),
'window_end': c['window_end'].isoformat(),
'price_vector': c['price_vector'].to_json()
}
for c in price_chunks
]
ti.xcom_push(key='price_chunks', value=serialized)
logging.info(f"Aggregated {len(price_chunks)} price chunks")
return len(price_chunks) return len(price_chunks)
def compute_elasticity(**context): def compute_elasticity(**kwargs):
"""Compute price elasticity from demand and price chunks""" """Task: Compute price elasticity from demand and price chunks"""
import io ti = kwargs['ti']
ti = context['task_instance'] demand_chunks = pickle.loads(ti.xcom_pull(key='demand_chunks'))
store_mode = context['dag_run'].conf.get('store_mode', 'hotel') price_chunks = pickle.loads(ti.xcom_pull(key='price_chunks'))
method = context['dag_run'].conf.get('elasticity_method', 'point')
min_obs = context['dag_run'].conf.get('min_observations', 2)
# pull chunks from XCom context = get_context(**kwargs)
demand_chunks_raw = ti.xcom_pull(task_ids='compute_demand', key='demand_chunks') step = ComputeElasticityStep(context)
price_chunks_raw = ti.xcom_pull(task_ids='aggregate_prices', key='price_chunks') elasticity_df = step.transform((demand_chunks, price_chunks))
if not demand_chunks_raw or not price_chunks_raw:
logging.error("Missing demand or price chunks")
return None
# deserialize
demand_chunks = [
{
'window_start': pd.Timestamp(c['window_start']),
'window_end': pd.Timestamp(c['window_end']),
'demand_vector': pd.read_json(io.StringIO(c['demand_vector']))
}
for c in demand_chunks_raw
]
price_chunks = [
{
'window_start': pd.Timestamp(c['window_start']),
'window_end': pd.Timestamp(c['window_end']),
'price_vector': pd.read_json(io.StringIO(c['price_vector']))
}
for c in price_chunks_raw
]
# compute elasticity
estimator = TemporalElasticityEstimator(method=method, min_observations=min_obs)
elasticity_df = estimator.transform(demand_chunks, price_chunks, store_mode=store_mode)
if elasticity_df is None or elasticity_df.empty:
logging.warning("No elasticity results computed")
return None
# store results (could push to DB, S3, or XCom)
ti.xcom_push(key='elasticity_results', value=elasticity_df.to_json()) ti.xcom_push(key='elasticity_results', value=elasticity_df.to_json())
logging.info(f"Computed elasticity for {len(elasticity_df)} products") logging.info(f"Computed elasticity for {len(elasticity_df)} products")
# summary stats
return { return {
'n_products': len(elasticity_df), 'n_products': len(elasticity_df),
'mean_elasticity': float(elasticity_df['elasticity'].mean()), 'mean_elasticity': float(elasticity_df['elasticity'].mean()),
'median_elasticity': float(elasticity_df['elasticity'].median()) 'median_elasticity': float(elasticity_df['elasticity'].median())
} }
def publish_results(**context): def build_state_space(**kwargs):
"""Publish elasticity results to model registry and train pricing model""" """Task: Build state space from elasticity"""
import io ti = kwargs['ti']
ti = context['task_instance'] elasticity_json = ti.xcom_pull(key='elasticity_results')
elasticity_json = ti.xcom_pull(task_ids='compute_elasticity', key='elasticity_results')
if not elasticity_json:
logging.error("No elasticity results to publish")
return None
elasticity_df = pd.read_json(io.StringIO(elasticity_json)) elasticity_df = pd.read_json(io.StringIO(elasticity_json))
# import registry and pricing modules context = get_context(**kwargs)
import sys step = BuildStateSpaceStep(context)
sys.path.insert(0, '/opt/airflow/procesing') # this is pretty janky state_space = step.transform(elasticity_df)
ti.xcom_push(key='state_space', value=pickle.dumps(state_space))
logging.info("Built state space for pricing")
return True
def fit_pricing_function(**kwargs):
"""Task: Fit pricing function using elasticity"""
ti = kwargs['ti']
elasticity_json = ti.xcom_pull(key='elasticity_results')
elasticity_df = pd.read_json(io.StringIO(elasticity_json))
context = get_context(**kwargs)
step = FitPricingFunctionStep(context)
pricer = step.transform(elasticity_df)
ti.xcom_push(key='pricer', value=pickle.dumps(pricer))
logging.info("Fitted pricing function")
return True
def predict_prices(**kwargs):
"""Task: Predict optimal prices"""
ti = kwargs['ti']
pricer = pickle.loads(ti.xcom_pull(key='pricer'))
state_space = pickle.loads(ti.xcom_pull(key='state_space'))
context = get_context(**kwargs)
step = PredictPricesStep(context)
prices_df = step.transform((pricer, state_space))
ti.xcom_push(key='predicted_prices', value=prices_df.to_json())
logging.info(f"Predicted prices for {len(prices_df)} products")
return len(prices_df)
def publish_results(**kwargs):
"""Task: Publish elasticity and pricing results to model registry"""
ti = kwargs['ti']
elasticity_json = ti.xcom_pull(key='elasticity_results')
prices_json = ti.xcom_pull(key='predicted_prices')
elasticity_df = pd.read_json(io.StringIO(elasticity_json))
prices_df = pd.read_json(io.StringIO(prices_json))
sys.path.insert(0, '/opt/airflow') sys.path.insert(0, '/opt/airflow')
from lib.model_registry import ModelRegistry from lib.model_registry import ModelRegistry
from procesing.pricing import ElasticityBasedPricingFunction
# initialize registry
registry = ModelRegistry() registry = ModelRegistry()
dag_conf = kwargs.get('dag_run').conf if kwargs.get('dag_run') else {}
# publish elasticity data
metadata = { metadata = {
'timestamp': pd.Timestamp.now().isoformat(), 'timestamp': pd.Timestamp.now().isoformat(),
'window_size': context['dag_run'].conf.get('window_size', '30s'), 'window_size': dag_conf.get('window_size', '30s'),
'store_mode': context['dag_run'].conf.get('store_mode', 'hotel'), 'store_mode': dag_conf.get('store_mode', 'hotel'),
'dag_run_id': context['dag_run'].run_id 'dag_run_id': kwargs['dag_run'].run_id if kwargs.get('dag_run') else 'manual'
} }
registry.publish_elasticity( registry.publish_elasticity(elasticity_df, model_name='latest', metadata=metadata)
elasticity_df,
model_name='latest',
metadata=metadata
)
# train and publish pricing model
pricing_model = ElasticityBasedPricingFunction(
cost_floor=0.5,
max_markup=2.5,
min_markup=1.0,
inelastic_markup=1.2
)
pricing_model.fit(elasticity_df)
# get fitted pricer from XCom
pricer = pickle.loads(ti.xcom_pull(key='pricer'))
registry.publish_pricing_model( registry.publish_pricing_model(
pricing_model, pricer,
model_name='latest', model_name='latest',
metadata={ metadata={**metadata, 'model_type': type(pricer).__name__}
**metadata,
'model_type': 'ElasticityBasedPricingFunction'
}
) )
logging.info(f"Published elasticity + pricing model for {len(elasticity_df)} products to registry") logging.info(f"Published elasticity + pricing for {len(elasticity_df)} products")
return { return {
'n_products': len(elasticity_df), 'n_products': len(elasticity_df),
@@ -243,57 +252,104 @@ def publish_results(**context):
with DAG( with DAG(
'elasticity_pricing_pipeline', 'elasticity_pricing_pipeline',
default_args=default_args, default_args=default_args,
description='E2E pipeline: interactions -> demand -> elasticity -> pricing', description='E2E refactored pipeline: atomic steps with proper separation',
schedule_interval='*/15 * * * *', # every 5 minutes for real-time pricing schedule_interval='*/15 * * * *',
start_date=days_ago(1), start_date=days_ago(1),
catchup=False, catchup=False,
max_active_runs=1, max_active_runs=1,
tags=['pricing', 'elasticity', 'research'], tags=['pricing', 'elasticity', 'research', 'refactored'],
) as dag: ) as dag:
# parallel data fetching # parallel data fetching
fetch_interactions_task = PythonOperator( t_fetch_interactions = PythonOperator(
task_id='fetch_interactions', task_id='fetch_interactions',
python_callable=fetch_interactions, python_callable=fetch_interactions,
provide_context=True, provide_context=True,
) )
fetch_price_logs_task = PythonOperator( t_fetch_price_logs = PythonOperator(
task_id='fetch_price_logs', task_id='fetch_price_logs',
python_callable=fetch_price_logs, python_callable=fetch_price_logs,
provide_context=True, provide_context=True,
) )
# demand computation (depends on interactions) # interaction processing branch
compute_demand_task = PythonOperator( t_create_buckets = PythonOperator(
task_id='create_price_buckets',
python_callable=create_price_buckets,
provide_context=True,
)
t_augment_events = PythonOperator(
task_id='augment_event_names',
python_callable=augment_event_names,
provide_context=True,
)
t_chunk_interactions = PythonOperator(
task_id='chunk_interactions',
python_callable=chunk_interactions,
provide_context=True,
)
t_compute_demand = PythonOperator(
task_id='compute_demand', task_id='compute_demand',
python_callable=compute_demand_chunks, python_callable=compute_demand,
provide_context=True, provide_context=True,
) )
# price aggregation (depends on price logs) # price processing branch (VECTORIZED)
aggregate_prices_task = PythonOperator( t_aggregate_prices = PythonOperator(
task_id='aggregate_prices', task_id='aggregate_price_logs',
python_callable=aggregate_prices, python_callable=aggregate_price_logs,
provide_context=True, provide_context=True,
) )
# elasticity computation (depends on both demand and prices) # convergence: compute elasticity
compute_elasticity_task = PythonOperator( t_compute_elasticity = PythonOperator(
task_id='compute_elasticity', task_id='compute_elasticity',
python_callable=compute_elasticity, python_callable=compute_elasticity,
provide_context=True, provide_context=True,
) )
# publish results (depends on elasticity) # pricing tasks
publish_results_task = PythonOperator( t_build_state = PythonOperator(
task_id='build_state_space',
python_callable=build_state_space,
provide_context=True,
)
t_fit_pricer = PythonOperator(
task_id='fit_pricing_function',
python_callable=fit_pricing_function,
provide_context=True,
)
t_predict_prices = PythonOperator(
task_id='predict_prices',
python_callable=predict_prices,
provide_context=True,
)
# publish to registry
t_publish = PythonOperator(
task_id='publish_results', task_id='publish_results',
python_callable=publish_results, python_callable=publish_results,
provide_context=True, provide_context=True,
) )
# dependency graph # dependency graph (clear atomic flow)
fetch_interactions_task >> compute_demand_task # parallel fetches
fetch_price_logs_task >> aggregate_prices_task [t_fetch_interactions, t_fetch_price_logs]
[compute_demand_task, aggregate_prices_task] >> compute_elasticity_task
compute_elasticity_task >> publish_results_task # interaction branch: fetch -> bucket -> augment -> chunk -> demand
t_fetch_interactions >> t_create_buckets >> t_augment_events >> t_chunk_interactions >> t_compute_demand
# price branch: fetch -> aggregate (vectorized)
t_fetch_price_logs >> t_aggregate_prices
# convergence: both branches -> elasticity
[t_compute_demand, t_aggregate_prices] >> t_compute_elasticity
# pricing: elasticity -> state + fit -> predict -> publish
t_compute_elasticity >> [t_build_state, t_fit_pricer] >> t_predict_prices >> t_publish

View File

@@ -35,6 +35,8 @@ from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import os import os
from dotenv import load_dotenv
load_dotenv()
from supabase import create_client, Client from supabase import create_client, Client
SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL", "") SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL", "")

View File

@@ -57,8 +57,6 @@ class FitPricingFunctionStep(BaseContextStep):
""" """
def transform(self, elasticity_df: pd.DataFrame): def transform(self, elasticity_df: pd.DataFrame):
from pricing import ElasticityBasedPricingFunction
pricing_class = self.context.config.get('pricing_function_class', ElasticityBasedPricingFunction) pricing_class = self.context.config.get('pricing_function_class', ElasticityBasedPricingFunction)
pricing_params = self.context.config.get('pricing_function_params', {}) pricing_params = self.context.config.get('pricing_function_params', {})