Airflow addition (#28)

* introducing airflow to run pipeline

* chore: updating dag with upload to registry

* introducing complete provider (non refactored and noisy)

* chore: removing old shit

* generic pricing baselines

* feature: super simple model registry (to be updated maybe third party OS software)

* chore: refactoring the providers docker config and requirements

* chore: refactored and broke down components (breaking)

* exporting all

* local pipeline execution working

* fix: fixing import structures from nonrelativistic

* chore: enables cross comm pickling with fully e2e pipeline compilation

* docs: what the pipeline is like now

* pipelines local running and pipeline high level definition

* cleaning old pipeline and vectorization

* leaked but fixing, not so important

* test: started with pipeline step testing

* chore: cleaning up provider of prices

* test: extra tests with semantic meaning checks

* migrating pricers

* feature: introducing pricing predictors (pricers)

* chore: e2e is done with new pipeline

* extra session feature extraction

* feature: experimental session pricer and metrics (vibe)

* chore: redefined and connected pricers (#29)
This commit is contained in:
Daniel Alves Rösel
2025-11-29 17:50:16 +01:00
committed by GitHub
parent 2a0e44ab24
commit ad9423bf59
49 changed files with 3642 additions and 619 deletions

View File

@@ -0,0 +1,27 @@
from procesing.steps.base import BaseContextStep
from procesing.steps.fetch import FetchInteractionsStep, FetchPriceLogsStep, FetchExperimentsStep
from procesing.steps.join import JoinExperimentsStep
from procesing.steps.augment import CreatePriceBucketsStep, AugmentEventNamesStep
from procesing.steps.chunk import ChunkByTimeWindowStep
from procesing.steps.demand import ComputeDemandStep, ComputeDemandForChunksStep
from procesing.steps.elasticity import AggregatePriceLogsStep, ComputeElasticityStep
from procesing.steps.pricing import StateSpace, BuildStateSpaceStep, FitPricingFunctionStep, PredictPricesStep
# Public API of the steps package: re-exports every name imported above
# (the step classes plus the StateSpace container).
__all__ = [
'BaseContextStep',
'FetchInteractionsStep',
'FetchPriceLogsStep',
'FetchExperimentsStep',
'JoinExperimentsStep',
'CreatePriceBucketsStep',
'AugmentEventNamesStep',
'ChunkByTimeWindowStep',
'ComputeDemandStep',
'ComputeDemandForChunksStep',
'AggregatePriceLogsStep',
'ComputeElasticityStep',
'StateSpace',
'BuildStateSpaceStep',
'FitPricingFunctionStep',
'PredictPricesStep',
]

View File

@@ -0,0 +1,53 @@
import numpy as np
import pandas as pd
from procesing.steps.base import BaseContextStep
class CreatePriceBucketsStep(BaseContextStep):
    """Add a ``price_bucket`` label column derived from ``metadata_price``."""

    def transform(self, df: pd.DataFrame):
        """
        Label every row of ``df`` with a price bucket (the frame is modified in place).

        Rows are bucketed into ``n_price_buckets`` quantiles (config key,
        default 5) labelled ``PB_1`` .. ``PB_n``. If ``pd.qcut`` raises
        ``ValueError`` the label falls back to ``P_<integer price>``.
        An empty frame, a missing ``metadata_price`` column, or a column
        with no non-null value yields an empty-string label everywhere.

        Returns:
            The same dataframe with the ``price_bucket`` column set.
        """
        if df.empty or 'metadata_price' not in df.columns:
            df['price_bucket'] = ""
            return df

        bucket_count = self.context.config.get('n_price_buckets', 5)

        def _integer_price_label(value):
            # used only when quantile bucketing is not possible
            if pd.notnull(value):
                return f"P_{int(value)}"
            return ""

        if df['metadata_price'].notnull().sum() > 0:
            try:
                labelled = pd.qcut(
                    df['metadata_price'],
                    q=bucket_count,
                    labels=[f"PB_{idx + 1}" for idx in range(bucket_count)],
                    duplicates='drop',
                )
            except ValueError:
                # fallback for insufficient unique values
                labelled = df['metadata_price'].apply(_integer_price_label)
        else:
            labelled = pd.Series([""] * len(df), index=df.index)

        df['price_bucket'] = labelled
        return df
class AugmentEventNamesStep(BaseContextStep):
    """Augment event names with a ``_<productId>@<price_bucket>`` schema suffix."""

    def transform(self, df: pd.DataFrame):
        """
        Append the product/price-bucket schema to ``eventName`` (in place).

        A row receives the suffix only when both ``productId`` and
        ``price_bucket`` are non-null; every other row gets an empty
        suffix. When either column is absent from the frame no row is
        augmented (previously this case raised ``KeyError``).

        Returns:
            The same dataframe with ``metadata_schema`` added and
            ``eventName`` suffixed.
        """
        if df.empty:
            return df

        if 'productId' in df.columns and 'price_bucket' in df.columns:
            has_product = df['productId'].notnull()
            has_bucket = df['price_bucket'].notnull()
            # Create schema: _productId@price_bucket
            df['metadata_schema'] = np.where(
                has_product & has_bucket,
                "_" + df['productId'].astype(str) + "@" + df['price_bucket'].astype(str),
                ""
            )
        else:
            # nothing to build the schema from: leave event names untouched
            df['metadata_schema'] = ""

        df['eventName'] = df['eventName'] + df['metadata_schema']
        return df

View File

@@ -0,0 +1,31 @@
from abc import ABC, abstractmethod
from sklearn.base import BaseEstimator, TransformerMixin
from procesing.context import PipelineContext
class BaseContextStep(BaseEstimator, TransformerMixin, ABC):
    """
    Common base class of the pipeline steps.

    A step keeps a reference to the shared pipeline context and exposes
    the sklearn transformer interface; subclasses implement ``transform``.
    """

    def __init__(self, context: PipelineContext):
        self.context = context

    def fit(self, X=None, y=None):
        """No-op fit: returns the step itself so it can be chained."""
        return self

    @abstractmethod
    def transform(self, X):
        """Transform ``X`` using the context. Subclasses must override this."""

    def get_params(self, deep=True):
        """sklearn compatibility: the context is the only parameter."""
        return dict(context=self.context)

    def set_params(self, **params):
        """sklearn compatibility: only ``context`` is applied, other keys are ignored."""
        for key, value in params.items():
            if key == 'context':
                self.context = value
        return self

View File

@@ -0,0 +1,34 @@
import pandas as pd
from procesing.steps.base import BaseContextStep
class ChunkByTimeWindowStep(BaseContextStep):
    """
    Split a dataframe into consecutive time windows.

    Produces a list of dicts, one per non-empty window, carrying the
    window bounds, its position and the rows that fall inside it.
    """

    def transform(self, df: pd.DataFrame):
        """
        Args:
            df: frame with a timestamp column (config key ``ts_col``,
                default ``'ts'``).

        Returns:
            List of ``{'window_start', 'window_end', 'window_idx', 'data'}``
            dicts ordered by window start; ``[]`` for an empty frame.
        """
        if df.empty:
            return []

        frame = df.copy()
        time_column = self.context.config.get('ts_col', 'ts')
        width = self.context.window_size

        # the timestamp column must be datetime-typed before flooring
        already_datetime = pd.api.types.is_datetime64_any_dtype(frame[time_column])
        if not already_datetime:
            frame[time_column] = pd.to_datetime(frame[time_column])

        frame = frame.sort_values(time_column)
        frame['_window'] = frame[time_column].dt.floor(width)

        return [
            {
                'window_start': start,
                'window_end': start + pd.Timedelta(width),
                'window_idx': position,
                'data': rows.drop(columns=['_window']),
            }
            for position, (start, rows) in enumerate(frame.groupby('_window'))
        ]

View File

@@ -0,0 +1,61 @@
import pandas as pd
from procesing.steps.base import BaseContextStep
class ComputeDemandStep(BaseContextStep):
    """
    Count interactions per product for one time window or one dataframe.

    Input: single chunk dict OR raw dataframe
    Output: demand dataframe with [productId, demand_score]; for a chunk
    dict with window metadata, that metadata plus a ``demand_vector`` entry.
    """

    def transform(self, chunk):
        """Return the demand per known product (zero when a product has no interaction)."""
        came_as_chunk = isinstance(chunk, dict)
        if came_as_chunk:
            events = chunk['data']
            meta = {key: value for key, value in chunk.items() if key != 'data'}
        else:
            events = chunk
            meta = {}

        known_ids = self.context.products['id'].unique()

        # optional restriction to one session and/or one experiment
        session_filter = self.context.config.get('session_filter')
        experiment_filter = self.context.config.get('experiment_filter')
        if session_filter and 'sessionId' in events.columns:
            events = events[events['sessionId'] == session_filter]
        if experiment_filter and 'experimentId' in events.columns:
            events = events[events['experimentId'] == experiment_filter]

        product_events = events.dropna(subset=['productId'])

        if not product_events.empty:
            # crosstab for simple demand count
            counts = pd.crosstab(product_events['productId'], 'count')
            demand = counts.reindex(known_ids, fill_value=0).reset_index()
        else:
            demand = pd.DataFrame({
                'productId': known_ids,
                'demand_score': 0
            })
        demand.columns = ['productId', 'demand_score']

        if not meta:
            return demand

        # attach window metadata if present
        result = dict(meta)
        result['demand_vector'] = demand
        return result
class ComputeDemandForChunksStep(BaseContextStep):
    """Run ComputeDemandStep over every chunk of a list."""

    def transform(self, chunks: list):
        """Return one demand result per chunk, in order; ``[]`` when there are no chunks."""
        if not chunks:
            return []

        compute_for_chunk = ComputeDemandStep(self.context).transform
        return list(map(compute_for_chunk, chunks))

View File

@@ -0,0 +1,253 @@
import numpy as np
import pandas as pd
from typing import Dict, List
from procesing.steps.base import BaseContextStep
class AggregatePriceLogsStep(BaseContextStep):
    """
    Aggregate price logs into time windows.

    Input: price_logs_df
    Output: list of price chunks with [productId, price]
    """

    def transform(self, price_logs_df: pd.DataFrame):
        """
        Build one price vector per time window.

        Within a window a product's price is the mean of its logged prices;
        windows without a log inherit the previous window's value. Products
        from the context that have no row for a window are added with their
        last raw logged price before the window start, when there is one.

        Returns:
            List of ``{'window_start', 'window_end', 'price_vector'}`` dicts
            ordered by window start; ``[]`` for an empty input.
        """
        if price_logs_df.empty:
            return []

        df = price_logs_df.copy()
        ts_col = self.context.config.get('ts_col', 'ts')
        window_size = self.context.window_size

        # ensure datetime
        if not pd.api.types.is_datetime64_any_dtype(df[ts_col]):
            df[ts_col] = pd.to_datetime(df[ts_col])
        df = df.sort_values([ts_col, 'productId'])

        unique_products = self.context.products['id'].unique()

        # group by product, resample by time window, compute mean
        df_indexed = df.set_index(ts_col)
        windowed = (
            df_indexed
            .groupby('productId')['price']
            .resample(window_size)
            .mean()
            .reset_index()
        )

        # forward fill missing windows (carry last known price)
        windowed = windowed.sort_values([ts_col, 'productId'])
        windowed['price'] = windowed.groupby('productId')['price'].ffill()
        windowed = windowed.dropna(subset=['price'])

        # Time-ordered raw prices per product, split once so the carry-forward
        # lookup below does not rescan the whole log for every window/product.
        raw_prices = {
            pid: series
            for pid, series in df_indexed.groupby('productId')['price']
        }

        chunks = []
        for window_start, group in windowed.groupby(ts_col):
            price_vector = group[['productId', 'price']].copy()
            present = set(price_vector['productId'])

            # Walk the products in context order (not set order) so the
            # carried-forward rows come out in a stable order.
            carried = []
            for pid in unique_products:
                if pid in present:
                    continue
                history = raw_prices.get(pid)
                if history is None:
                    continue
                earlier = history[history.index < window_start]
                if not earlier.empty:
                    carried.append({'productId': pid, 'price': earlier.iloc[-1]})

            if carried:
                # single concat per window instead of one per carried product
                price_vector = pd.concat(
                    [price_vector, pd.DataFrame(carried)],
                    ignore_index=True
                )

            if not price_vector.empty:
                chunks.append({
                    'window_start': window_start,
                    'window_end': window_start + pd.Timedelta(window_size),
                    'price_vector': price_vector
                })
        return chunks
class ComputeElasticityStep(BaseContextStep):
    """
    Compute price elasticity from demand and price chunks.

    Input: (demand_chunks, price_chunks)
    Output: elasticity_df [productId, elasticity, std_error, n_obs]
    """

    def transform(self, chunk_tuple: tuple):
        """
        Estimate one elasticity per product.

        Config keys read: ``elasticity_method`` ('point' or 'arc', default
        'point') and ``min_observations`` (default 2). Products with fewer
        aligned observations than ``min_observations``, and products of the
        context that were never observed, get a zero elasticity.

        Raises:
            ValueError: if ``elasticity_method`` is neither 'point' nor 'arc'
                and at least one product has enough valid observations.
        """
        demand_chunks, price_chunks = chunk_tuple
        method = self.context.config.get('elasticity_method', 'point')
        min_obs = self.context.config.get('min_observations', 2)
        all_product_ids = self.context.products['id'].unique()
        columns = ['productId', 'elasticity', 'std_error', 'n_obs']

        # align chunks by window_start
        aligned = self._align_chunks(demand_chunks, price_chunks)
        if not aligned:
            return self._zero_frame(all_product_ids)

        # build time series per product
        product_series = self._build_timeseries(aligned)

        # compute elasticity per product
        rows = []
        for pid, series in product_series.items():
            if len(series) < min_obs:
                estimate = {'value': 0.0, 'std_error': 0.0}
            else:
                estimate = self._compute_elasticity(series, method)
            rows.append({
                'productId': pid,
                'elasticity': estimate['value'],
                'std_error': estimate.get('std_error', 0.0),
                'n_obs': len(series)
            })

        frames = []
        if rows:
            frames.append(pd.DataFrame(rows, columns=columns))

        # fill missing products with zero elasticity
        observed_pids = {row['productId'] for row in rows}
        missing_pids = [p for p in all_product_ids if p not in observed_pids]
        if missing_pids:
            frames.append(self._zero_frame(missing_pids))

        if not frames:
            # no observation and no known product: empty but well-formed frame
            return pd.DataFrame(columns=columns)
        if len(frames) == 1:
            return frames[0]
        return pd.concat(frames, ignore_index=True)

    @staticmethod
    def _zero_frame(product_ids):
        """Return a zero-elasticity frame with one row per given product id."""
        return pd.DataFrame({
            'productId': product_ids,
            'elasticity': 0.0,
            'std_error': 0.0,
            'n_obs': 0
        })

    def _align_chunks(self, demand_chunks: List[Dict], price_chunks: List[Dict]):
        """Pair each demand chunk with the price chunk sharing its window_start."""
        price_lookup = {c['window_start']: c for c in price_chunks}
        aligned = []
        for dc in demand_chunks:
            ws = dc['window_start']
            if ws in price_lookup:
                aligned.append({
                    'window_start': ws,
                    'window_end': dc['window_end'],
                    'demand': dc['demand_vector'],
                    'prices': price_lookup[ws]['price_vector']
                })
        return aligned

    def _build_timeseries(self, aligned: List[Dict]):
        """Build the list of {timestamp, price, quantity} observations per product."""
        series_by_product = {}
        for chunk in aligned:
            merged = chunk['demand'].merge(chunk['prices'], on='productId', how='inner')
            # Iterate the columns directly: iterrows() would upcast each row to
            # one common dtype and turn integer product ids into floats.
            observations = zip(
                merged['productId'], merged['price'], merged['demand_score']
            )
            for pid, price, quantity in observations:
                series_by_product.setdefault(pid, []).append({
                    'timestamp': chunk['window_start'],
                    'price': price,
                    'quantity': quantity
                })
        return series_by_product

    def _compute_elasticity(self, series: List[Dict], method: str):
        """Compute point or arc elasticity from the strictly positive observations."""
        prices = np.array([s['price'] for s in series])
        quantities = np.array([s['quantity'] for s in series])

        # filter out zero/negative values (logs and ratios need positives)
        valid = (prices > 0) & (quantities > 0)
        if valid.sum() < 2:
            return {'value': 0.0, 'std_error': 0.0}
        prices = prices[valid]
        quantities = quantities[valid]

        if method == 'point':
            return self._point_elasticity(prices, quantities)
        elif method == 'arc':
            return self._arc_elasticity(prices, quantities)
        else:
            raise ValueError(f"Unknown elasticity method: {method}")

    def _point_elasticity(self, prices: np.ndarray, quantities: np.ndarray):
        """Point elasticity via log-log regression: log(Q) = a + b*log(P), elasticity = b"""
        n = len(prices)
        if n < 2:
            return {'value': 0.0, 'std_error': 0.0}

        log_p = np.log(prices)
        log_q = np.log(quantities)

        # no price variation: the slope is undefined, report zero
        if np.ptp(log_p) == 0:
            return {'value': 0.0, 'std_error': 0.0}

        dev_p = log_p - log_p.mean()
        dev_q = log_q - log_q.mean()
        sxx = float(np.dot(dev_p, dev_p))
        if sxx == 0:
            return {'value': 0.0, 'std_error': 0.0}

        # OLS slope = Sxy / Sxx. Numerator and denominator use the same
        # normalisation; mixing np.cov (ddof=1) with np.var (ddof=0) would
        # inflate the slope by n / (n - 1).
        b = float(np.dot(dev_p, dev_q)) / sxx

        # std error estimate
        if n > 2:
            residuals = dev_q - b * dev_p
            mse = (residuals ** 2).sum() / (n - 2)
            se_b = np.sqrt(mse / sxx)
        else:
            se_b = 0.0
        return {'value': b, 'std_error': se_b}

    def _arc_elasticity(self, prices: np.ndarray, quantities: np.ndarray):
        """Arc elasticity: average period-over-period (midpoint) elasticity"""
        elasticities = []
        consecutive = zip(prices[:-1], prices[1:], quantities[:-1], quantities[1:])
        for p1, p2, q1, q2 in consecutive:
            p_avg = (p1 + p2) / 2
            q_avg = (q1 + q2) / 2
            if p_avg == 0 or q_avg == 0:
                continue
            delta_p = p2 - p1
            delta_q = q2 - q1
            # periods without a price change carry no elasticity information
            if delta_p == 0:
                continue
            elasticities.append((delta_q / q_avg) / (delta_p / p_avg))

        if not elasticities:
            return {'value': 0.0, 'std_error': 0.0}
        return {
            'value': np.mean(elasticities),
            'std_error': np.std(elasticities) / np.sqrt(len(elasticities))
        }

View File

@@ -0,0 +1,46 @@
import pandas as pd
from procesing.steps.base import BaseContextStep
class FetchInteractionsStep(BaseContextStep):
    """Fetch raw interaction data from the 'user-interactions' Kafka topic"""

    def transform(self, X=None):
        """
        Fetch the interactions and flatten their ``metadata`` column.

        Each key of a row's metadata dict becomes a ``metadata_<key>`` column
        (nested keys are joined with '.'). Rows without an ``eventName`` are
        dropped, and ``metadata_dateIndex`` (when present) is exposed as a
        nullable-integer ``dateIndex`` column.

        Args:
            X: ignored; present for the transformer interface.
        """
        df = self.context.provider.fetch_kafka_topic('user-interactions')
        if df.empty:
            return df

        # Explode metadata JSON column
        if 'metadata' in df.columns:
            raw_metadata = df.pop('metadata')
            # Entries that are not dicts (None/NaN for events without
            # metadata) contribute no columns instead of breaking the
            # normalisation.
            records = [m if isinstance(m, dict) else {} for m in raw_metadata]
            flattened = pd.json_normalize(records, sep='.').add_prefix('metadata_')
            # json_normalize numbers its rows 0..n-1; re-use the frame's own
            # index so the join stays row-aligned when that index is not the
            # default range.
            flattened.index = df.index
            df = df.join(flattened)

        df = df.dropna(subset=['eventName'])

        # Remap dateIndex if present
        if 'metadata_dateIndex' in df.columns:
            df['dateIndex'] = df['metadata_dateIndex'].astype('Int64')
        return df
class FetchPriceLogsStep(BaseContextStep):
    """Load the price logs from the 'price-logs' Kafka topic."""

    def transform(self, X=None):
        """Return whatever the context provider fetches for the topic; ``X`` is ignored."""
        provider = self.context.provider
        price_logs = provider.fetch_kafka_topic('price-logs')
        return price_logs
class FetchExperimentsStep(BaseContextStep):
    """Load the experiment metadata referenced by a set of interactions."""

    def transform(self, interactions_df: pd.DataFrame):
        """
        Fetch the experiments whose ids appear in ``experimentId``.

        Returns an empty dataframe when the interactions are empty, have no
        ``experimentId`` column, or contain no non-null experiment id.
        """
        usable = (not interactions_df.empty) and 'experimentId' in interactions_df.columns
        if not usable:
            return pd.DataFrame()

        experiment_ids = interactions_df['experimentId'].dropna().unique().tolist()
        if experiment_ids:
            return self.context.provider.fetch_experiments(experiment_ids)
        return pd.DataFrame()

View File

@@ -0,0 +1,34 @@
import pandas as pd
from procesing.steps.base import BaseContextStep
class JoinExperimentsStep(BaseContextStep):
    """Attach experiment metadata to the interaction rows."""

    def transform(self, data: tuple):
        """
        Left-join the experiments onto the interactions by ``experimentId``.

        Args:
            data: (interactions_df, experiments_df)

        Returns:
            The merged interactions dataframe, or the interactions unchanged
            when there are no experiments.
        """
        interactions_df, experiments_df = data
        if experiments_df.empty:
            return interactions_df

        # Flatten nested task field if present
        if 'task' in experiments_df.columns:
            has_task = experiments_df['task'].notnull()
            if has_task.any():
                flat_task = pd.json_normalize(experiments_df['task'].dropna())
                # put the flattened rows back on the rows they came from
                flat_task.index = experiments_df[has_task].index
                without_task = experiments_df.drop('task', axis=1)
                experiments_df = without_task.join(flat_task, rsuffix='_task')

        # Rename for clarity
        column_names = {
            'id': 'experimentId',
            'subject_name': 'exp_subject',
            'xp_human_only': 'exp_human_only',
            'xp_market_mode': 'exp_market_mode',
            'xp_task_id': 'exp_task_id'
        }
        renamed = experiments_df.rename(columns=column_names)
        return interactions_df.merge(renamed, on='experimentId', how='left')

View File

@@ -0,0 +1,149 @@
import numpy as np
import pandas as pd
from typing import Optional, List, Dict, Any
from dataclasses import dataclass, field
from procesing.steps.base import BaseContextStep
from procesing.pricers import ElasticityBasedPricer
@dataclass
class StateSpace:
    """
    State representation for pricing functions.

    Components:
        Q_t: demand ∈ R^n (current demand signal per product)
        P_t: prices ∈ R^n (current/base prices)
        S_t: session_features (behavioral signals, interaction data)
        H_t: history = {Q_{t-k}, P_{t-k}, S_{t-k}} for k in [1, history_length]

    Additionally stores:
        - product_ids: product identifiers (n,)
        - elasticity: price elasticity per product (n,)
        - metadata: arbitrary context (experiment_id, timestamp, etc.)
    """
    demand: np.ndarray  # Q_t ∈ R^n
    prices: np.ndarray  # P_t ∈ R^n
    session_features: pd.DataFrame = field(default_factory=pd.DataFrame)  # S_t
    # augmented state components
    product_ids: Optional[np.ndarray] = None
    elasticity: Optional[np.ndarray] = None
    # historical trajectory H_t = {(Q_{t-k}, P_{t-k}, S_{t-k})}, oldest first
    history: List[Dict[str, Any]] = field(default_factory=list)
    # metadata for context
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """Validate that every per-product vector has the same length as demand."""
        n = len(self.demand)
        assert len(self.prices) == n, "demand and prices must have same dimension"
        if self.elasticity is not None:
            assert len(self.elasticity) == n, "elasticity must match dimension"
        if self.product_ids is not None:
            assert len(self.product_ids) == n, "product_ids must match dimension"

    @property
    def n_products(self) -> int:
        """Number of products in state space."""
        return len(self.demand)

    def add_history(self, q: np.ndarray, p: np.ndarray, s: pd.DataFrame, max_length: int = 10):
        """
        Append a historical state to the trajectory H_t.

        After the append the trajectory is trimmed from the oldest end so
        that at most ``max_length`` states remain (also when a previous call
        used a larger ``max_length``).
        """
        self.history.append({'demand': q, 'prices': p, 'session_features': s})
        overflow = len(self.history) - max(max_length, 0)
        if overflow > 0:
            del self.history[:overflow]

    def get_history_window(self, k: int = 5) -> List[Dict[str, Any]]:
        """
        Retrieve the last ``k`` historical states, oldest first.

        Returns a new list in every case (fewer than ``k`` entries when the
        trajectory is shorter); ``k <= 0`` yields an empty list.
        """
        if k <= 0:
            # history[-0:] would be the whole list, not an empty window
            return []
        return self.history[-k:]
class BuildStateSpaceStep(BaseContextStep):
    """
    Build state space from elasticity, demand, and price data.

    Input: elasticity_df [productId, elasticity, ...], optional demand_df
    Output: StateSpace instance with Q_t, P_t, elasticity, product_ids
    """

    def transform(self, elasticity_df: pd.DataFrame, demand_df: Optional[pd.DataFrame] = None):
        """
        Assemble the state vectors, one entry per product of the context.

        Args:
            elasticity_df: frame with ``productId`` and ``elasticity``.
            demand_df: optional frame with ``productId`` and a demand column
                named ``demand`` or ``demand_score`` (the name produced by
                ComputeDemandStep); ``demand`` wins when both exist.

        Returns:
            StateSpace whose prices come from each product's
            ``metadata['base_price']`` (0 when unavailable). Without a usable
            demand column every product gets a constant demand of 10.0.
        """
        products = self.context.products

        # extract base prices from product metadata
        products_with_prices = products.copy()
        if 'metadata' in products_with_prices.columns:
            products_with_prices['base_price'] = products_with_prices['metadata'].apply(
                lambda m: m.get('base_price', 0) if isinstance(m, dict) else 0
            )
        else:
            products_with_prices['base_price'] = 0

        # merge with elasticity
        merged = products_with_prices[['id', 'base_price']].rename(
            columns={'id': 'productId'}
        ).merge(
            elasticity_df[['productId', 'elasticity']],
            on='productId',
            how='left'
        ).fillna({'elasticity': 0.0, 'base_price': 0.0})

        # Accept both spellings of the demand column so the output of
        # ComputeDemandStep is not silently replaced by the default.
        demand_col = None
        if demand_df is not None:
            demand_col = next(
                (name for name in ('demand', 'demand_score') if name in demand_df.columns),
                None
            )

        # merge with demand if provided, else use default
        if demand_col is not None:
            merged = merged.merge(
                demand_df[['productId', demand_col]].rename(columns={demand_col: 'demand'}),
                on='productId',
                how='left'
            ).fillna({'demand': 0.0})
            demand_vector = merged['demand'].values
        else:
            # default: uniform demand
            demand_vector = np.ones(len(merged)) * 10.0

        return StateSpace(
            demand=demand_vector,
            prices=merged['base_price'].values,
            session_features=pd.DataFrame(),
            product_ids=merged['productId'].values,
            elasticity=merged['elasticity'].values,
            metadata={'timestamp': pd.Timestamp.now().isoformat()}
        )
class FitPricingFunctionStep(BaseContextStep):
    """
    Instantiate and fit the configured pricing function.

    Input: elasticity_df
    Output: fitted pricing function instance
    """

    def transform(self, elasticity_df: pd.DataFrame):
        """
        Build the pricer from config and fit it on ``elasticity_df``.

        Config keys read: ``pricing_function_class`` (default
        ElasticityBasedPricer) and ``pricing_function_params`` (keyword
        arguments for its constructor, default none).
        """
        config = self.context.config
        pricer_factory = config.get('pricing_function_class', ElasticityBasedPricer)
        constructor_kwargs = config.get('pricing_function_params', {})

        fitted = pricer_factory(**constructor_kwargs)
        fitted.fit(elasticity_df)
        return fitted
class PredictPricesStep(BaseContextStep):
    """
    Predict prices using a fitted pricing function.

    Input: (pricer, state_space)
    Output: prices_df [productId, predicted_price]
    """

    def transform(self, data: tuple):
        """
        Run ``pricer.predict(state_space)`` and label the result per product.

        The product ids are taken from ``state_space.product_ids`` when the
        state carries them, so labels follow the state the predictions were
        computed from; otherwise the ids of the context's products are used.
        NOTE(review): assumes predict() returns one value per state-space
        product, in state-space order — confirm against the pricers.
        """
        pricer, state_space = data
        predicted_prices = pricer.predict(state_space)

        product_ids = getattr(state_space, 'product_ids', None)
        if product_ids is None:
            product_ids = self.context.products['id'].values

        return pd.DataFrame({
            'productId': product_ids,
            'predicted_price': predicted_prices
        })

View File

@@ -0,0 +1,114 @@
"""
Session feature extraction for S_t component of state space.
Computes behavioral signals from interaction data already in pipeline.
"""
import pandas as pd
import numpy as np
from typing import Optional, Dict, Any
from collections import Counter
from procesing.steps.base import BaseContextStep
class ExtractSessionFeaturesStep(BaseContextStep):
    """
    Extract session-level behavioral features from interaction logs.

    Input: interactions_df (user-interactions from earlier pipeline step)
    Output: session_features DataFrame [sessionId, feature_1, feature_2, ...]

    Features computed:
        - total_interactions: count of all events
        - page_views, item_views, searches, cart_adds: event type counts
        - hovers: hover event counts
        - unique_products_viewed: distinct product IDs
        - product_view_depth: max event count for a single product
        - session_duration_sec: time span of session
        - interaction_velocity: events per minute
        - avg_time_between_events: mean inter-event time (seconds)
        - std_time_between_events: std of inter-event time (seconds)
        - cart_to_view_ratio: cart_adds / item_views
    """

    def transform(self, interactions_df: pd.DataFrame) -> pd.DataFrame:
        """Return one feature row per ``sessionId``; an empty frame for empty input."""
        if interactions_df.empty:
            return pd.DataFrame()

        # ensure timestamp column (on a copy, the caller's frame is left alone)
        if 'ts' in interactions_df.columns:
            interactions_df = interactions_df.copy()
            interactions_df['ts'] = pd.to_datetime(interactions_df['ts'])

        # group by session and compute features
        return pd.DataFrame([
            self._extract_features_for_session(session_id, session_df)
            for session_id, session_df in interactions_df.groupby('sessionId')
        ])

    def _extract_features_for_session(self, session_id: str, session_df: pd.DataFrame) -> Dict[str, Any]:
        """Compute features for single session."""
        features = {'sessionId': session_id}

        # basic counts
        features['total_interactions'] = len(session_df)
        event_counts = session_df['eventName'].value_counts().to_dict()
        # item pages count as page views too
        features['page_views'] = event_counts.get('page_view', 0) + event_counts.get('view_item_page', 0)
        features['item_views'] = event_counts.get('view_item_page', 0)
        features['searches'] = event_counts.get('search', 0)
        features['cart_adds'] = event_counts.get('add_item_to_cart', 0)

        # hover events
        hover_events = ['hover_over_title', 'hover_over_paragraph', 'hover_over_link', 'hover_over_button']
        features['hovers'] = sum(event_counts.get(ev, 0) for ev in hover_events)

        # product-level signals
        product_ids = session_df['productId'].dropna()
        features['unique_products_viewed'] = product_ids.nunique()
        if len(product_ids) > 0:
            features['product_view_depth'] = max(Counter(product_ids).values())
        else:
            features['product_view_depth'] = 0

        # temporal features
        if 'ts' in session_df.columns:
            timestamps = session_df['ts'].sort_values()
            features['session_duration_sec'] = (timestamps.max() - timestamps.min()).total_seconds()
            if features['session_duration_sec'] > 0:
                features['interaction_velocity'] = (features['total_interactions'] / features['session_duration_sec']) * 60
            else:
                features['interaction_velocity'] = 0.0

            # inter-event timing
            if len(timestamps) > 1:
                time_diffs = timestamps.diff().dropna().dt.total_seconds()
                features['avg_time_between_events'] = time_diffs.mean()
                # A single interval has no sample std (NaN); report 0.0 like
                # the other degenerate cases instead of leaking NaN downstream.
                spread = time_diffs.std()
                features['std_time_between_events'] = 0.0 if pd.isna(spread) else spread
            else:
                features['avg_time_between_events'] = 0.0
                features['std_time_between_events'] = 0.0
        else:
            features['session_duration_sec'] = 0.0
            features['interaction_velocity'] = 0.0
            features['avg_time_between_events'] = 0.0
            features['std_time_between_events'] = 0.0

        # cart/conversion signals
        features['cart_to_view_ratio'] = features['cart_adds'] / features['item_views'] if features['item_views'] > 0 else 0.0
        return features
class FilterSessionInteractionsStep(BaseContextStep):
    """
    Keep only the interactions of one session.

    Input: (interactions_df, session_id)
    Output: copy of the rows of interactions_df whose sessionId equals session_id
    """

    def transform(self, data: tuple) -> pd.DataFrame:
        """Return an independent copy of the rows belonging to the given session."""
        interactions_df, session_id = data
        belongs_to_session = interactions_df['sessionId'] == session_id
        session_rows = interactions_df[belongs_to_session]
        return session_rows.copy()