catchup: rogue scripts

This commit is contained in:
2026-02-27 12:45:46 +01:00
parent e8a9716f69
commit 5444a4ea13
27 changed files with 6908 additions and 2 deletions

View File

@@ -0,0 +1,269 @@
"""
Session-Aware Pricing DAG
THIS implements the core pricing computation (policy layer).
Flow: τ → θ̂ → D → p*
1. Fetch recent sessions from Kafka (last 10 active)
2. Extract features per session (τ → θ̂)
3. Map features to demand proxy (θ̂ → D)
4. Compute optimal prices (D → p*)
5. Write to Redis session:{sessionId}:prices
Scheduled: every 1 minute when enabled
"""
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago
from datetime import timedelta
import pandas as pd
import numpy as np
import logging
import sys
import pickle
sys.path.insert(0, '/opt/airflow')
from procesing.context import PipelineContext
from procesing.providers import SupabaseProvider, BackendAPIProvider
from procesing.steps.session import ExtractSessionFeaturesStep
from procesing.pricers.simple import SimpleSurgePricer, session_features_to_demand
from procesing.pricing import StateSpace
from lib.model_registry import ModelRegistry
# Operator-level defaults handed to the DAG via ``default_args``:
# no dependency on past runs, no e-mail alerts, one retry after 30 seconds.
DEFAULT_ARGS = dict(
    owner='phantom-research',
    depends_on_past=False,
    email_on_failure=False,
    email_on_retry=False,
    retries=1,
    retry_delay=timedelta(seconds=30),
)
class CompositeProvider(SupabaseProvider, BackendAPIProvider):
    """Single provider object exposing both the Supabase and backend-API providers."""

    def __init__(self):
        # Each base initialiser is invoked explicitly, in declaration order,
        # instead of relying on cooperative super() chaining.
        for base in (SupabaseProvider, BackendAPIProvider):
            base.__init__(self)
def _get_context(store_mode: str = 'hotel') -> PipelineContext:
    """Create a PipelineContext for ``store_mode`` backed by a new CompositeProvider."""
    provider = CompositeProvider()
    context = PipelineContext(provider=provider, store_mode=store_mode)
    return context
def fetch_recent_sessions(**kwargs):
    """
    Task: Fetch interaction events for the N most active sessions from Kafka.

    Reads ``store_mode`` (default 'hotel') and ``session_limit`` (default 10)
    from the DAG run conf, pulls the "user-interactions" topic through the
    provider, and keeps only the events of the ``session_limit`` sessions that
    have the most events.

    XCom pushes:
        sessions_data: pickled DataFrame of the kept events; an empty
            DataFrame when the fetch fails or yields nothing usable.
        session_ids: list of the selected session ids (success path only).

    Returns: number of sessions selected (0 when nothing was fetched).

    NOTE(review): XCom values are pickled bytes; presumably the deployment's
    XCom backend accepts bytes payloads - confirm.
    """
    ti = kwargs['ti']
    dag_run = kwargs.get('dag_run')
    # conf may be None as well as absent; `or {}` covers both so .get() is safe.
    dag_conf = (dag_run.conf if dag_run else None) or {}
    store_mode = dag_conf.get('store_mode', 'hotel')
    session_limit = dag_conf.get('session_limit', 10)

    def _push_empty():
        # Downstream tasks unpickle this key unconditionally, so always push.
        ti.xcom_push(key='sessions_data', value=pickle.dumps(pd.DataFrame()))
        return 0

    ctx = _get_context(store_mode)
    provider = ctx.provider
    # fetch all recent interactions from Kafka
    try:
        interactions_df = provider.fetch_kafka_topic("user-interactions")
    except Exception as e:
        # Deliberate best-effort: a failed fetch yields an empty result rather
        # than failing the task. exc_info keeps the traceback in the task log.
        logging.error(f"Failed to fetch interactions: {e}", exc_info=True)
        return _push_empty()
    if interactions_df.empty or 'sessionId' not in interactions_df.columns:
        return _push_empty()

    # Select the top-N sessions by event count.
    # NOTE(review): value_counts() ranks by activity volume, not by recency;
    # if "most recent" is the intent this should rank on a timestamp instead.
    recent_sessions = interactions_df['sessionId'].value_counts().head(session_limit).index.tolist()
    # filter to only those sessions
    filtered_df = interactions_df[interactions_df['sessionId'].isin(recent_sessions)].copy()
    ti.xcom_push(key='sessions_data', value=pickle.dumps(filtered_df))
    ti.xcom_push(key='session_ids', value=recent_sessions)
    logging.info(f"Fetched {len(filtered_df)} events for {len(recent_sessions)} sessions")
    return len(recent_sessions)
def extract_session_features(**kwargs):
    """
    Task: Extract behavioral features from session trajectories.
    THIS implements τ → θ̂ transformation.

    Pulls the pickled ``sessions_data`` DataFrame from XCom, runs it through
    ExtractSessionFeaturesStep.transform, and pushes the pickled result under
    ``session_features`` (an empty DataFrame when there are no events).

    Returns: number of rows in the feature frame (0 when there are no events).
    """
    ti = kwargs['ti']
    sessions_df = pickle.loads(ti.xcom_pull(key='sessions_data'))
    if sessions_df.empty:
        ti.xcom_push(key='session_features', value=pickle.dumps(pd.DataFrame()))
        return 0

    dag_run = kwargs.get('dag_run')
    # conf may be None as well as absent; `or {}` covers both so .get() is safe.
    dag_conf = (dag_run.conf if dag_run else None) or {}
    ctx = _get_context(dag_conf.get('store_mode', 'hotel'))

    # extract features using vectorized pipeline
    feature_extractor = ExtractSessionFeaturesStep(ctx)
    features_df = feature_extractor.transform(sessions_df)
    ti.xcom_push(key='session_features', value=pickle.dumps(features_df))

    logging.info(f"Extracted {len(features_df.columns)} features for {len(features_df)} sessions")
    logging.info(f"Feature columns: {list(features_df.columns)}")
    # iloc[0] raises IndexError on a zero-row frame; only log a sample if one exists.
    if not features_df.empty:
        logging.info(f"Sample features (first session):\n{features_df.iloc[0].to_dict()}")
    return len(features_df)
def compute_session_prices(**kwargs):
    """
    Task: Compute optimal prices for each session.
    THIS implements θ̂ → D → p* transformation.

    Pulls the pickled ``session_features`` frame from XCom, loads the product
    catalog for the configured store mode, fits a SimpleSurgePricer on it and,
    for every feature row with a ``sessionId``, maps the row to a scalar
    demand proxy, broadcasts it over all products and asks the pricer for
    prices.

    DAG run conf keys read: ``store_mode`` ('hotel'), ``high_threshold`` (10),
    ``low_threshold`` (2), ``surge_multiplier`` (1.15),
    ``discount_multiplier`` (0.95).

    XCom push: ``price_results`` - pickled ``{session_id: {product_id: price}}``
    (an empty dict when there are no features or no products).

    Returns: number of sessions priced.
    """
    ti = kwargs['ti']
    features_df = pickle.loads(ti.xcom_pull(key='session_features'))
    if features_df.empty:
        ti.xcom_push(key='price_results', value=pickle.dumps({}))
        return 0

    dag_run = kwargs.get('dag_run')
    # conf may be None as well as absent; `or {}` covers both so .get() is safe.
    dag_conf = (dag_run.conf if dag_run else None) or {}
    store_mode = dag_conf.get('store_mode', 'hotel')
    ctx = _get_context(store_mode)

    # fetch product catalog for base prices
    products_df = ctx.provider.fetch_products(store_mode)
    if products_df.empty:
        logging.error("No products found in catalog")
        ti.xcom_push(key='price_results', value=pickle.dumps({}))
        return 0
    # Base price comes from the metadata dict; 100.0 when the key is missing
    # or the metadata value is not a dict.
    products_df['base_price'] = products_df['metadata'].apply(
        lambda m: m.get('base_price', 100.0) if isinstance(m, dict) else 100.0
    )

    # initialize pricing model
    pricer = SimpleSurgePricer(
        high_threshold=dag_conf.get('high_threshold', 10),
        low_threshold=dag_conf.get('low_threshold', 2),
        surge_multiplier=dag_conf.get('surge_multiplier', 1.15),
        discount_multiplier=dag_conf.get('discount_multiplier', 0.95)
    )
    pricer.fit(products_df)

    # Catalog-level values do not depend on the session: compute them once
    # instead of once per session (and once per product for the ids).
    n_products = len(products_df)
    product_ids = [str(pid) for pid in products_df['id'].tolist()]
    base_avg = products_df['base_price'].mean()

    # compute prices per session
    price_results = {}
    logging.info(f"Starting price computation for {len(features_df)} sessions, {n_products} products")
    logging.info(f"Pricer config: high_thresh={pricer.high_threshold}, low_thresh={pricer.low_threshold}, surge_mult={pricer.surge_multiplier}")
    for _, session_row in features_df.iterrows():
        session_id = session_row.get('sessionId')
        if not session_id:
            continue

        # map features to demand proxy (θ̂ → D)
        session_features_single = pd.DataFrame([session_row])
        demand_proxy = session_features_to_demand(session_features_single)
        logging.info(f"[Session {session_id}] Features → Demand: {demand_proxy:.2f}")
        logging.info(f"[Session {session_id}] Key features: velocity={session_row.get('interaction_velocity', 0):.2f}, cart_ratio={session_row.get('cart_to_view_ratio', 0):.2f}, item_views={session_row.get('item_views', 0)}")

        # build state space
        state_space = StateSpace(
            demand=np.full(n_products, demand_proxy),  # broadcast session demand to all products
            prices=products_df['base_price'].values,
            session_features=session_features_single
        )

        # compute optimal prices (D → p*)
        optimal_prices = pricer.predict(state_space)
        optimal_avg = optimal_prices.mean()
        price_change_pct = ((optimal_avg - base_avg) / base_avg) * 100
        logging.info(f"[Session {session_id}] Price adjustment: base_avg={base_avg:.2f}, optimal_avg={optimal_avg:.2f}, change={price_change_pct:+.1f}%")

        # store as dict {productId: price}
        price_results[session_id] = {
            pid: float(optimal_prices[i]) for i, pid in enumerate(product_ids)
        }

    ti.xcom_push(key='price_results', value=pickle.dumps(price_results))
    logging.info(f"Computed prices for {len(price_results)} sessions, {n_products} products each")
    return len(price_results)
def publish_to_registry(**kwargs):
    """
    Task: Write session prices to Redis registry.
    THIS is the write path: prices → session:{sessionId}:prices

    Pulls the pickled ``price_results`` mapping from XCom and stores each
    session's price map through ModelRegistry.set_session_prices, using the
    ``ttl`` from the DAG run conf (1800 when there is no run, no conf, or no
    ``ttl`` key).

    Returns: 0 when there is nothing to publish, otherwise a summary dict with
    ``sessions_published``, ``products_per_session`` and ``status``.
    """
    ti = kwargs['ti']
    price_results = pickle.loads(ti.xcom_pull(key='price_results'))
    if not price_results:
        logging.warning("No prices to publish")
        return 0

    registry = ModelRegistry()

    # Resolve the TTL: only consult conf when both the run and its conf exist.
    dag_run = kwargs.get('dag_run')
    run_conf = dag_run.conf if dag_run else None
    ttl = run_conf.get('ttl', 1800) if run_conf else 1800

    published_count = 0
    for published_count, (session_id, price_map) in enumerate(price_results.items(), start=1):
        registry.set_session_prices(session_id, price_map, ttl=ttl)
    logging.info(f"Published prices for {published_count} sessions to registry (TTL={ttl}s)")

    # price_results is non-empty here, so the first price map always exists.
    first_price_map = next(iter(price_results.values()))
    summary = {
        'sessions_published': published_count,
        'products_per_session': len(first_price_map),
        'status': 'success'
    }
    return summary
# DAG definition
# Four PythonOperator tasks chained linearly; scheduled every minute, with no
# backfill (catchup=False) and at most one active run at a time.
with DAG(
    'session_pricing_pipeline',
    default_args=DEFAULT_ARGS,
    description='Session-aware pricing: extract features → compute prices → publish to registry',
    schedule_interval='*/1 * * * *',  # every 1 minute
    start_date=days_ago(1),
    catchup=False,
    max_active_runs=1,
    tags=['pricing', 'session-aware', 'research', 'real-time'],
) as dag:
    # `provide_context=True` is intentionally not passed: with the
    # airflow.operators.python PythonOperator the task context is always
    # supplied to callables that accept **kwargs, and the flag is deprecated.
    t_fetch_sessions = PythonOperator(
        task_id='fetch_recent_sessions',
        python_callable=fetch_recent_sessions,
    )
    t_extract_features = PythonOperator(
        task_id='extract_session_features',
        python_callable=extract_session_features,
    )
    t_compute_prices = PythonOperator(
        task_id='compute_session_prices',
        python_callable=compute_session_prices,
    )
    t_publish = PythonOperator(
        task_id='publish_to_registry',
        python_callable=publish_to_registry,
    )

    # linear dependency: fetch → extract → compute → publish
    t_fetch_sessions >> t_extract_features >> t_compute_prices >> t_publish

View File

@@ -0,0 +1 @@
from .encoder import Window, extract_windows, build_windows, WindowDataset, PrototypeClassifier, train, loocv

View File

@@ -0,0 +1,210 @@
"""Contrastive encoder via trajectory windowing. Classification by prototype distance."""
import sys
sys.path.insert(0, "/home/velocitatem/Documents/Projects/PHANTOM/sim/rl/behavior_loader")
sys.path.insert(0, "/home/velocitatem/Documents/Projects/PHANTOM/experiments/ml")
from sim.rl.behavior_loader.loader import JointLoader, PayloadModel
from arch import TrajectoryEncoder, featurize_trajectory, nt_xent_loss
from typing import List, Dict, Tuple
from dataclasses import dataclass
from datetime import datetime
import numpy as np, torch, torch.nn.functional as F, random, optuna
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter
# Absolute locations inside one local PHANTOM checkout (machine-specific).
# RUNS: root for TensorBoard run directories; AGENT_DIR / HUMAN_DIR: the two
# collected-data directories handed to JointLoader.
_PHANTOM_ROOT = "/home/velocitatem/Documents/Projects/PHANTOM"
RUNS = _PHANTOM_ROOT + "/experiments/ml/runs"
AGENT_DIR = _PHANTOM_ROOT + "/experiments/agents/collected_data/"
HUMAN_DIR = _PHANTOM_ROOT + "/experiments/collected_data/"
@dataclass
class Window:
    """A run of events cut from one trajectory, tagged with its origin and class."""

    # Events covered by this window (a slice of the trajectory, or all of it).
    events: List[PayloadModel]
    # Identifier of the trajectory the events came from.
    traj_id: str
    # Class label: 0=human, 1=agent.
    label: int
def extract_windows(events: List[PayloadModel], traj_id: str, label: int,
                    sizes: List[int] = [5, 10, 15], stride: int = 2) -> List[Window]:
    """Cut one trajectory into overlapping windows at several scales.

    For each size in ``sizes`` that fits inside the trajectory, one window is
    taken at every ``stride``-spaced start offset. A trajectory with at least
    3 events additionally contributes a window spanning all of its events.
    ``sizes`` is only read, never modified.
    """
    total = len(events)
    windows = []
    for size in sizes:
        if total < size:
            # this scale does not fit; skip it
            continue
        for start in range(0, total - size + 1, stride):
            windows.append(Window(events[start:start + size], traj_id, label))
    if total >= 3:
        # full trajectory as one extra window
        windows.append(Window(events, traj_id, label))
    return windows
def build_windows(data: Dict[str, List], sizes=[5,10,15], stride=2) -> List[Window]:
    """Window every trajectory in ``data`` and concatenate the results.

    The label is derived from the trajectory id: ids starting with 'human_'
    get 0, every other id gets 1.
    """
    windows = []
    for traj_id, events in data.items():
        label = 0 if traj_id.startswith('human_') else 1
        windows.extend(extract_windows(events, traj_id, label, sizes, stride))
    return windows
class WindowDataset(Dataset):
    """Contrastive dataset over trajectory windows.

    Each item is a triple ``(anchor, positive, label)``. The anchor is the
    indexed window; the positive is a randomly chosen window with the same
    label taken from a *different* trajectory, falling back to the anchor
    window itself when no such window exists. Both are augmented
    independently and featurized into float32 tensors.
    """

    def __init__(self, windows: List[Window], dim: int = 64):
        self.wins, self.dim = windows, dim
        # window indices grouped by class label
        self.by_label = {0: [i for i, w in enumerate(windows) if w.label == 0],
                         1: [i for i, w in enumerate(windows) if w.label == 1]}
        # window indices grouped by source trajectory
        self.by_traj = {}
        for i, w in enumerate(windows):
            self.by_traj.setdefault(w.traj_id, []).append(i)
        # (label, traj_id) -> candidate positive indices, filled lazily so the
        # same-label scan runs once per trajectory instead of once per item
        self._pos_pools = {}

    def __len__(self):
        return len(self.wins)

    def _feat(self, evts):
        return featurize_trajectory(evts, None, self.dim)

    def _aug(self, evts):
        """Return a random contiguous sub-span covering 70-100% of ``evts`` (min 3 events)."""
        if len(evts) < 4:
            return evts
        k = max(3, int(len(evts) * random.uniform(0.7, 1.0)))
        start = random.randint(0, len(evts) - k)
        return evts[start:start + k]

    def _positive_pool(self, w):
        """Indices of windows sharing ``w``'s label but not its trajectory (cached)."""
        key = (w.label, w.traj_id)
        pool = self._pos_pools.get(key)
        if pool is None:
            pool = [i for i in self.by_label[w.label] if self.wins[i].traj_id != w.traj_id]
            self._pos_pools[key] = pool
        return pool

    def __getitem__(self, idx):
        w = self.wins[idx]
        pool = self._positive_pool(w)
        # no other trajectory of this class: pair the window with itself
        pos_idx = random.choice(pool) if pool else idx
        a = torch.tensor(self._feat(self._aug(w.events)), dtype=torch.float32)
        p = torch.tensor(self._feat(self._aug(self.wins[pos_idx].events)), dtype=torch.float32)
        return a, p, w.label
class PrototypeClassifier:
    """Nearest-centroid classifier on top of encoder embeddings."""

    def __init__(self, encoder: TrajectoryEncoder, device = 'cuda', dim=64):
        self.enc = encoder
        self.dev = device
        self.dim = dim
        # one centroid per class label; None until fit() has seen that class
        self.centroids = {0: None, 1: None}

    def _embed(self, events):
        """Featurize ``events`` and run them through the encoder as a single-item batch."""
        feats = featurize_trajectory(events, None, self.dim)
        x = torch.tensor(feats, dtype=torch.float32)
        # two leading singleton axes are added before encoding
        return self.enc(x.unsqueeze(0).unsqueeze(1).to(self.dev))

    def fit(self, windows: List[Window]):
        """Embed every window and store the per-class mean embedding. Returns self."""
        self.enc.eval()
        per_class = {0: [], 1: []}
        with torch.no_grad():
            for win in windows:
                per_class[win.label].append(self._embed(win.events))
        centroids = {}
        for label, zs in per_class.items():
            # a class with no windows keeps a None centroid
            centroids[label] = torch.cat(zs).mean(0, keepdim=True) if zs else None
        self.centroids = centroids
        return self

    def predict(self, events: List[PayloadModel]) -> Tuple[int, float, Dict]:
        """Returns (pred, confidence, debug). Confidence via softmax over -distances."""
        self.enc.eval()
        with torch.no_grad():
            z = self._embed(events)
            dists = {}
            for label, centroid in self.centroids.items():
                if centroid is None:
                    continue
                dists[label] = torch.norm(z - centroid, dim=1).item()
        if not dists:
            # no centroid available at all: default prediction, zero confidence
            return 0, 0.0, {'d': {}, 'p': [0.5, 0.5]}
        pred = min(dists, key=dists.get)
        # a missing centroid counts as distance 1e6;
        # softmax(-d) gives higher prob to closer centroid
        d0 = dists.get(0, 1e6)
        d1 = dists.get(1, 1e6)
        probs = F.softmax(torch.tensor([[-d0, -d1]]), dim=1).squeeze()
        confidence = probs[pred].item()
        return pred, confidence, {'d': dists, 'p': probs.tolist()}
def train(epochs=200, lr=5e-4, batch=16, dim=64, emb=32, temp=0.5,
          sizes=[5,10,15], stride=2, name=None, verbose=True):
    """Train a TrajectoryEncoder with the NT-Xent loss on all windowed trajectories.

    Loads data through JointLoader(HUMAN_DIR, AGENT_DIR), windows it with
    ``sizes``/``stride``, and optimises with Adam for ``epochs`` epochs,
    logging the mean epoch loss to TensorBoard under
    ``{RUNS}/encoder/{name}``. ``name`` defaults to a dim/emb/timestamp tag.

    Returns: ``(encoder, windows, device)``.
    """
    data = JointLoader(HUMAN_DIR, AGENT_DIR).get_data()
    wins = build_windows(data, sizes, stride)
    if verbose:
        print(f"Windows: {len(wins)} ({sum(w.label==0 for w in wins)}h/{sum(w.label==1 for w in wins)}a)")
    dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    enc = TrajectoryEncoder(dim, emb).to(dev)
    opt = Adam(enc.parameters(), lr=lr)
    # drop_last=True: incomplete final batches are skipped
    loader = DataLoader(WindowDataset(wins, dim), batch_size=batch, shuffle=True, drop_last=True)
    name = name or f"enc_{dim}_{emb}_{datetime.now():%Y%m%d_%H%M%S}"
    writer = SummaryWriter(f"{RUNS}/encoder/{name}")
    # try/finally so the event file is flushed and closed even if training raises
    try:
        for ep in range(epochs):
            enc.train()
            total, n = 0.0, 0
            for a, p, _ in loader:
                loss = nt_xent_loss(enc(a.unsqueeze(1).to(dev)), enc(p.unsqueeze(1).to(dev)), temp)
                opt.zero_grad()
                loss.backward()
                opt.step()
                total += loss.item()
                n += 1
            # max(n, 1) avoids dividing by zero when the loader yields no batch
            avg = total / max(n, 1)
            writer.add_scalar('loss-ntxent', avg, ep)
            if verbose and (ep+1) % 20 == 0:
                print(f"Epoch {ep+1}: {avg:.4f}")
    finally:
        writer.close()
    return enc, wins, dev
def loocv(epochs=100, lr=5e-4, dim=64, emb=32, temp=0.5, sizes=[5,10,15], stride=2, verbose=True):
    """Leave-one-trajectory-out CV.

    For each trajectory: train a fresh encoder on all other trajectories, fit
    a PrototypeClassifier on their windows, and classify the held-out
    trajectory as a whole. Folds whose training split lacks a 'human_' or an
    'agent_' trajectory are skipped.

    Returns: ``(accuracy, [(pred, actual, confidence), ...])``, or
    ``(0.0, [])`` when no fold was evaluated.
    """
    data = JointLoader(HUMAN_DIR, AGENT_DIR).get_data()
    dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    results = []
    for test_id in data:
        train_data = {tid: evts for tid, evts in data.items() if tid != test_id}
        has_human = any(tid.startswith('human_') for tid in train_data)
        has_agent = has_human and any(tid.startswith('agent_') for tid in train_data)
        if not (has_human and has_agent):
            continue

        wins = build_windows(train_data, sizes, stride)
        enc = TrajectoryEncoder(dim, emb).to(dev)
        opt = Adam(enc.parameters(), lr=lr)
        # batch size is capped at 16 and at half the window count (at least 1)
        batch_size = min(16, len(wins)//2 or 1)
        loader = DataLoader(WindowDataset(wins, dim), batch_size=batch_size,
                            shuffle=True, drop_last=len(wins)>2)
        for _ in range(epochs):
            enc.train()
            for a, p, _ in loader:
                za = enc(a.unsqueeze(1).to(dev))
                zp = enc(p.unsqueeze(1).to(dev))
                loss = nt_xent_loss(za, zp, temp)
                opt.zero_grad()
                loss.backward()
                opt.step()

        clf = PrototypeClassifier(enc, dev, dim).fit(wins)
        pred, conf, dbg = clf.predict(data[test_id])
        actual = 0 if test_id.startswith('human_') else 1
        results.append((pred, actual, conf))
        if verbose:
            print(f"{test_id[:18]}: pred={pred} conf={conf:.2f} actual={actual} {'OK' if pred==actual else 'MISS'}")

    if not results:
        return 0.0, []
    acc = sum(p==a for p,a,_ in results) / len(results)
    if verbose:
        print(f"\nAccuracy: {acc:.1%} ({sum(p==a for p,a,_ in results)}/{len(results)})")
    return acc, results
def hparam_tune(n_trials=50, epochs=60, n_jobs=2, verbose=True):
    """Optuna hyperparameter search maximizing LOOCV accuracy.

    Searches lr, dim, emb, temp, stride and three window sizes (deduplicated
    and sorted before use) with a seeded TPE sampler.

    Returns: ``(best_params, study)``.
    """
    def objective(trial):
        # suggestions are requested in a fixed order: lr, dim, emb, temp, stride, s0..s2
        params = {
            'lr': trial.suggest_float('lr', 1e-5, 1e-2, log=True),
            'dim': trial.suggest_categorical('dim', [32, 64, 128, 256]),
            'emb': trial.suggest_categorical('emb', [16, 32, 64, 128]),
            'temp': trial.suggest_float('temp', 0.05, 1.0),
            'stride': trial.suggest_int('stride', 1, 4),
        }
        raw_sizes = [trial.suggest_int(f's{i}', 3, 20) for i in range(3)]
        params['sizes'] = sorted(set(raw_sizes))  # unique sorted
        accuracy, _ = loocv(epochs=epochs, verbose=False, **params)
        return accuracy

    sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(direction='maximize', study_name='encoder_hparam',
                                sampler=sampler)
    study.optimize(objective, n_trials=n_trials, n_jobs=n_jobs, show_progress_bar=verbose)
    best = study.best_params
    if verbose:
        print(f"\nBest accuracy: {study.best_value:.1%}")
        print(f"Best params: {best}")
    return best, study
if __name__ == "__main__":
    # CLI entry point: --mode train | eval | hparam ('eval' runs LOOCV).
    import argparse
    p = argparse.ArgumentParser()
    p.add_argument('--mode', choices=['train', 'eval', 'hparam'], default='train')
    # remaining options share the same shape: (flag, type, default)
    for flag, kind, default in (
        ('--epochs', int, 200),
        ('--lr', float, 5e-4),
        ('--dim', int, 128),
        ('--emb', int, 64),
        ('--temp', float, 0.1),
        ('--sizes', str, '5,10,15'),
        ('--stride', int, 2),
        ('--n_trials', int, 50),
    ):
        p.add_argument(flag, type=kind, default=default)
    args = p.parse_args()
    # --sizes is a comma-separated list of window lengths
    sizes = list(map(int, args.sizes.split(',')))
    if args.mode == 'hparam':
        # the search caps per-trial training at 60 epochs
        best, study = hparam_tune(args.n_trials, min(args.epochs, 60))
    elif args.mode == 'train':
        enc, wins, dev = train(args.epochs, args.lr, 16, args.dim, args.emb, args.temp, sizes, args.stride)
    else:
        loocv(args.epochs, args.lr, args.dim, args.emb, args.temp, sizes, args.stride)

View File

@@ -0,0 +1,957 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"id": "62eafcd9-5462-4063-8873-0e7fb9add907",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from kafka import KafkaConsumer\n",
"import pandas as pd\n",
"import json\n",
"import numpy as np\n",
"import os\n",
"from dotenv import load_dotenv\n",
"import matplotlib.pyplot as plt\n",
"from IPython.display import display, SVG, Image\n",
"load_dotenv()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "4af65cb4-e8cf-4877-b2db-13ac19f3838f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 73 entries, 0 to 72\n",
"Data columns (total 13 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 sessionId 73 non-null object \n",
" 1 eventName 73 non-null object \n",
" 2 page 73 non-null object \n",
" 3 productId 67 non-null object \n",
" 4 storeMode 73 non-null object \n",
" 5 userAgent 73 non-null object \n",
" 6 ts 73 non-null object \n",
" 7 metadata_referrer 6 non-null object \n",
" 8 metadata_roomType 45 non-null object \n",
" 9 metadata_price 45 non-null float64\n",
" 10 metadata_nights 45 non-null float64\n",
" 11 metadata_elementText 22 non-null object \n",
" 12 metadata_dwellTime 22 non-null float64\n",
"dtypes: float64(3), object(10)\n",
"memory usage: 7.5+ KB\n"
]
}
],
"source": [
"KAFKA_PORT=os.getenv(\"KAFKA_PORT\", 9092)\n",
"topic = \"user-interactions\"\n",
"consumer = KafkaConsumer(\n",
" topic, \n",
" enable_auto_commit=True,\n",
" value_deserializer=lambda x: json.loads(x.decode('utf-8')),\n",
" auto_offset_reset='earliest', \n",
" bootstrap_servers=['localhost:9092'])\n",
"messages=consumer.poll(timeout_ms=1000,max_records=10000)\n",
"df = []\n",
"for m in messages.values():\n",
" for i in m:\n",
" df.append(i.value)\n",
"df = pd.DataFrame(df)\n",
"# explode metadata col json\n",
"df = df.join(pd.json_normalize(df.pop(\"metadata\"), sep=\".\").add_prefix(\"metadata_\"))\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "f6819a1c-32ab-49c7-845b-5df7bf60f561",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sessionId</th>\n",
" <th>eventName</th>\n",
" <th>page</th>\n",
" <th>productId</th>\n",
" <th>storeMode</th>\n",
" <th>userAgent</th>\n",
" <th>ts</th>\n",
" <th>metadata_referrer</th>\n",
" <th>metadata_roomType</th>\n",
" <th>metadata_price</th>\n",
" <th>metadata_nights</th>\n",
" <th>metadata_elementText</th>\n",
" <th>metadata_dwellTime</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>d176d7c9-4027-4702-9e31-2a71395cdda0</td>\n",
" <td>page_view</td>\n",
" <td>/products</td>\n",
" <td>None</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...</td>\n",
" <td>2025-11-14T13:23:46.270Z</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>f0317a5d-e424-44e9-b784-c8f7291ffe31</td>\n",
" <td>page_view</td>\n",
" <td>/</td>\n",
" <td>None</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck...</td>\n",
" <td>2025-11-14T13:26:00.291Z</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>f0317a5d-e424-44e9-b784-c8f7291ffe31</td>\n",
" <td>page_view</td>\n",
" <td>/products</td>\n",
" <td>None</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck...</td>\n",
" <td>2025-11-14T13:26:07.769Z</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>f0317a5d-e424-44e9-b784-c8f7291ffe31</td>\n",
" <td>view_item_page</td>\n",
" <td>/products</td>\n",
" <td>htl-0</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck...</td>\n",
" <td>2025-11-14T13:26:15.010Z</td>\n",
" <td>NaN</td>\n",
" <td>Premium Room</td>\n",
" <td>269.0</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>238dc588-a7ab-4c0e-bccd-6abca5076c66</td>\n",
" <td>page_view</td>\n",
" <td>/products</td>\n",
" <td>None</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...</td>\n",
" <td>2025-11-14T13:27:15.457Z</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>238dc588-a7ab-4c0e-bccd-6abca5076c66</td>\n",
" <td>view_item_page</td>\n",
" <td>/products</td>\n",
" <td>htl-0</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...</td>\n",
" <td>2025-11-14T13:27:15.591Z</td>\n",
" <td>NaN</td>\n",
" <td>Premium Room</td>\n",
" <td>264.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>432</th>\n",
" <td>214d9fad-9b00-40c3-bd0e-7739b6acd654</td>\n",
" <td>click</td>\n",
" <td>1762448192425</td>\n",
" <td>DIV</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>/</td>\n",
" <td>NaN</td>\n",
" <td>1623.0</td>\n",
" <td>493.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>238dc588-a7ab-4c0e-bccd-6abca5076c66</td>\n",
" <td>view_item_page</td>\n",
" <td>/products</td>\n",
" <td>htl-0</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...</td>\n",
" <td>2025-11-14T13:27:21.483Z</td>\n",
" <td>NaN</td>\n",
" <td>Premium Room</td>\n",
" <td>264.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>238dc588-a7ab-4c0e-bccd-6abca5076c66</td>\n",
" <td>hover_over_title</td>\n",
" <td>/products</td>\n",
" <td>htl-0</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...</td>\n",
" <td>2025-11-14T13:27:22.646Z</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Grand Plaza Hotel</td>\n",
" <td>1200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>238dc588-a7ab-4c0e-bccd-6abca5076c66</td>\n",
" <td>view_item_page</td>\n",
" <td>/products</td>\n",
" <td>htl-0</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...</td>\n",
" <td>2025-11-14T13:27:25.889Z</td>\n",
" <td>NaN</td>\n",
" <td>Premium Room</td>\n",
" <td>264.0</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35</th>\n",
" <td>013fc334-4045-4d5a-8739-dd0a8766a63b</td>\n",
" <td>page_view</td>\n",
" <td>/products</td>\n",
" <td>None</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...</td>\n",
" <td>2025-11-14T13:53:59.993Z</td>\n",
" <td></td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>013fc334-4045-4d5a-8739-dd0a8766a63b</td>\n",
" <td>view_item_page</td>\n",
" <td>/products</td>\n",
" <td>htl-0</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...</td>\n",
" <td>2025-11-14T13:54:10.705Z</td>\n",
" <td>NaN</td>\n",
" <td>Premium Room</td>\n",
" <td>223.0</td>\n",
" <td>3.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>013fc334-4045-4d5a-8739-dd0a8766a63b</td>\n",
" <td>hover_over_title</td>\n",
" <td>/products</td>\n",
" <td>htl-0</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...</td>\n",
" <td>2025-11-14T13:54:11.771Z</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>416.0</td>\n",
" <td>397.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Grand Plaza Hotel</td>\n",
" <td>1200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>013fc334-4045-4d5a-8739-dd0a8766a63b</td>\n",
" <td>view_item_page</td>\n",
" <td>/products</td>\n",
" <td>htl-1</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...</td>\n",
" <td>2025-11-14T13:54:29.772Z</td>\n",
" <td>NaN</td>\n",
" <td>Standard Room</td>\n",
" <td>267.0</td>\n",
" <td>5.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>013fc334-4045-4d5a-8739-dd0a8766a63b</td>\n",
" <td>hover_over_title</td>\n",
" <td>/products</td>\n",
" <td>htl-1</td>\n",
" <td>hotel</td>\n",
" <td>Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...</td>\n",
" <td>2025-11-14T13:54:30.833Z</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Seaside Resort</td>\n",
" <td>1200.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sessionId eventName page \\\n",
"0 d176d7c9-4027-4702-9e31-2a71395cdda0 page_view /products \n",
"1 f0317a5d-e424-44e9-b784-c8f7291ffe31 page_view / \n",
"2 f0317a5d-e424-44e9-b784-c8f7291ffe31 page_view /products \n",
"3 f0317a5d-e424-44e9-b784-c8f7291ffe31 view_item_page /products \n",
"4 238dc588-a7ab-4c0e-bccd-6abca5076c66 page_view /products \n",
"5 238dc588-a7ab-4c0e-bccd-6abca5076c66 view_item_page /products \n",
"6 238dc588-a7ab-4c0e-bccd-6abca5076c66 view_item_page /products \n",
"7 238dc588-a7ab-4c0e-bccd-6abca5076c66 hover_over_title /products \n",
"8 238dc588-a7ab-4c0e-bccd-6abca5076c66 view_item_page /products \n",
"35 013fc334-4045-4d5a-8739-dd0a8766a63b page_view /products \n",
"36 013fc334-4045-4d5a-8739-dd0a8766a63b view_item_page /products \n",
"37 013fc334-4045-4d5a-8739-dd0a8766a63b hover_over_title /products \n",
"38 013fc334-4045-4d5a-8739-dd0a8766a63b view_item_page /products \n",
"39 013fc334-4045-4d5a-8739-dd0a8766a63b hover_over_title /products \n",
"\n",
" productId storeMode userAgent \\\n",
"0 None hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n",
"1 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck... \n",
"2 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck... \n",
"3 htl-0 hotel Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck... \n",
"4 None hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n",
"5 htl-0 hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n",
"6 htl-0 hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n",
"7 htl-0 hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n",
"8 htl-0 hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n",
"35 None hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n",
"36 htl-0 hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n",
"37 htl-0 hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n",
"38 htl-1 hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n",
"39 htl-1 hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n",
"\n",
" ts metadata_referrer metadata_roomType \\\n",
"0 2025-11-14T13:23:46.270Z NaN \n",
"1 2025-11-14T13:26:00.291Z NaN \n",
"2 2025-11-14T13:26:07.769Z NaN \n",
"3 2025-11-14T13:26:15.010Z NaN Premium Room \n",
"4 2025-11-14T13:27:15.457Z NaN \n",
"5 2025-11-14T13:27:15.591Z NaN Premium Room \n",
"6 2025-11-14T13:27:21.483Z NaN Premium Room \n",
"7 2025-11-14T13:27:22.646Z NaN NaN \n",
"8 2025-11-14T13:27:25.889Z NaN Premium Room \n",
"35 2025-11-14T13:53:59.993Z NaN \n",
"36 2025-11-14T13:54:10.705Z NaN Premium Room \n",
"37 2025-11-14T13:54:11.771Z NaN NaN \n",
"38 2025-11-14T13:54:29.772Z NaN Standard Room \n",
"39 2025-11-14T13:54:30.833Z NaN NaN \n",
"\n",
" metadata_price metadata_nights metadata_elementText metadata_dwellTime \n",
"0 NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN \n",
"3 269.0 1.0 NaN NaN \n",
"4 NaN NaN NaN NaN \n",
"5 264.0 2.0 NaN NaN \n",
"6 264.0 2.0 NaN NaN \n",
"7 NaN NaN Grand Plaza Hotel 1200.0 \n",
"8 264.0 2.0 NaN NaN \n",
"35 NaN NaN NaN NaN \n",
"36 223.0 3.0 NaN NaN \n",
"37 NaN NaN Grand Plaza Hotel 1200.0 \n",
"38 267.0 5.0 NaN NaN \n",
"39 NaN NaN Seaside Resort 1200.0 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby('sessionId').head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "380eca5f-8304-4fb2-be32-e8bcfd312085",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['013fc334-4045-4d5a-8739-dd0a8766a63b',\n",
" '238dc588-a7ab-4c0e-bccd-6abca5076c66',\n",
" 'd176d7c9-4027-4702-9e31-2a71395cdda0',\n",
" 'f0317a5d-e424-44e9-b784-c8f7291ffe31']"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sessions = list(set(df['sessionId'])); sessions # 238dc588-a7ab-4c0e-bccd-6abca5076c66"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "f4ae6f81-dcb8-44be-aee7-30dbc3a6bae1",
"metadata": {},
"outputs": [],
"source": [
"# map sessions to experiments"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "050d90a4-20a9-47f5-b998-c31178a54cb3",
"metadata": {},
"outputs": [],
"source": [
"def build_transition_prob_matrix(df: pd.DataFrame):\n",
" df = df.dropna(subset=['eventName'])\n",
" events = df['eventName'].tolist()\n",
" labels = pd.Index(events).unique().tolist()\n",
" idx = {e:i for i,e in enumerate(labels)}\n",
" M = np.zeros((len(labels), len(labels)), dtype=float)\n",
" for a, b in zip(events, events[1:]):\n",
" M[idx[a], idx[b]] += 1\n",
" row_sums = M.sum(axis=1, keepdims=True)\n",
" with np.errstate(divide='ignore', invalid='ignore'):\n",
" P = np.divide(M, row_sums, where=row_sums>0) # row-normalized\n",
" return P, labels"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "e68f9004-82f5-4826-aece-e3dc6e15a18f",
"metadata": {},
"outputs": [],
"source": [
"# https://medium.com/data-science/time-series-data-markov-transition-matrices-7060771e362b\n",
"from graphviz import Digraph\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"def _as_prob_df(matrix, labels=None):\n",
"    \"\"\"Coerce *matrix* into a square DataFrame whose index and columns share labels.\n",
"\n",
"    A DataFrame is returned as-is once its index and columns are checked to match.\n",
"    Anything else is converted to a float array, checked for squareness, and\n",
"    wrapped with *labels* on both axes (``ValueError`` if *labels* is None).\n",
"    \"\"\"\n",
"    if not isinstance(matrix, pd.DataFrame):\n",
"        arr = np.asarray(matrix, dtype=float)\n",
"        assert arr.shape[0] == arr.shape[1], \"Matrix must be square.\"\n",
"        if labels is None:\n",
"            raise ValueError(\"labels are required when matrix is not a DataFrame\")\n",
"        assert len(labels) == arr.shape[0], \"labels length must match matrix size.\"\n",
"        return pd.DataFrame(arr, index=list(labels), columns=list(labels))\n",
"\n",
"    # Already a DataFrame: only verify that both axes carry the same labels.\n",
"    axes_match = (matrix.index == matrix.columns).all()\n",
"    assert axes_match, \"Index/columns must match.\"\n",
"    return matrix\n",
"\n",
"def _df_to_edgelist(P: pd.DataFrame, threshold=0.0, round_digits=2):\n",
"    \"\"\"List ``(source, target, label)`` triples for every weight above *threshold*.\n",
"\n",
"    Source and target are the stringified row/column labels; the label is the\n",
"    weight formatted with *round_digits* decimal places.\n",
"    \"\"\"\n",
"    spec = f\".{round_digits}f\"\n",
"    triples = []\n",
"    for src in P.index:\n",
"        for dst in P.columns:\n",
"            weight = float(P.loc[src, dst])\n",
"            # Written as `not (w > t)` so NaN weights are skipped as well.\n",
"            if not (weight > threshold):\n",
"                continue\n",
"            triples.append((str(src), str(dst), format(weight, spec)))\n",
"    return triples\n",
"\n",
"def render_graph(fname, matrix, ls_index=None, threshold=0.0, fmt=\"svg\", view=False):\n",
"    \"\"\"Render a transition-probability matrix as a left-to-right Graphviz digraph.\n",
"\n",
"    Args:\n",
"        fname: output file stem (no extension).\n",
"        matrix: NumPy array or pandas DataFrame of transition probabilities.\n",
"        ls_index: ordered state labels; required when matrix is not a DataFrame.\n",
"        threshold: edges whose weight is <= threshold are not drawn.\n",
"        fmt: Graphviz output format such as 'svg', 'png' or 'pdf'.\n",
"        view: open the rendered file after writing it.\n",
"\n",
"    Returns:\n",
"        The ``Digraph`` object that was rendered.\n",
"    \"\"\"\n",
"    probs = _as_prob_df(matrix, labels=ls_index)\n",
"    weighted_edges = _df_to_edgelist(probs, threshold=threshold)\n",
"\n",
"    graph = Digraph(format=fmt)\n",
"    graph.attr(rankdir=\"LR\", size=\"30\")\n",
"    graph.attr(\"node\", shape=\"circle\")\n",
"\n",
"    # Declare every state first so states without a drawn edge still show up.\n",
"    for state in probs.index:\n",
"        graph.node(str(state), width=\"1\", height=\"1\")\n",
"\n",
"    for tail, head, weight_text in weighted_edges:\n",
"        graph.edge(tail, head, label=weight_text)\n",
"\n",
"    graph.render(fname, view=view, cleanup=True)\n",
"    return graph\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "e255a2c1-6454-4e5e-89f6-ef8ac51ab6cc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"013fc334-4045-4d5a-8739-dd0a8766a63b\n"
]
},
{
"data": {
"image/svg+xml": [
"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
"<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
"<!-- Generated by graphviz version 13.1.2 (0)\n",
" -->\n",
"<!-- Pages: 1 -->\n",
"<svg width=\"565pt\" height=\"354pt\"\n",
" viewBox=\"0.00 0.00 565.00 354.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
"<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 349.64)\">\n",
"<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-349.64 561.05,-349.64 561.05,4 -4,4\"/>\n",
"<!-- page_view -->\n",
"<g id=\"node1\" class=\"node\">\n",
"<title>page_view</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"48.19\" cy=\"-235.83\" rx=\"48.19\" ry=\"48.19\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"48.19\" y=\"-231.16\" font-family=\"Times,serif\" font-size=\"14.00\">page_view</text>\n",
"</g>\n",
"<!-- view_item_page -->\n",
"<g id=\"node2\" class=\"node\">\n",
"<title>view_item_page</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"232.88\" cy=\"-235.83\" rx=\"69.01\" ry=\"69.01\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"232.88\" y=\"-231.16\" font-family=\"Times,serif\" font-size=\"14.00\">view_item_page</text>\n",
"</g>\n",
"<!-- page_view&#45;&gt;view_item_page -->\n",
"<g id=\"edge1\" class=\"edge\">\n",
"<title>page_view&#45;&gt;view_item_page</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M96.71,-235.83C113.69,-235.83 133.31,-235.83 152.25,-235.83\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"152.1,-239.33 162.1,-235.83 152.1,-232.33 152.1,-239.33\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"130.12\" y=\"-239.78\" font-family=\"Times,serif\" font-size=\"14.00\">1.00</text>\n",
"</g>\n",
"<!-- view_item_page&#45;&gt;view_item_page -->\n",
"<g id=\"edge2\" class=\"edge\">\n",
"<title>view_item_page&#45;&gt;view_item_page</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M214.74,-302.59C217.1,-314.51 223.14,-322.84 232.88,-322.84 239.27,-322.84 244.07,-319.26 247.28,-313.42\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"250.57,-314.62 250.52,-304.02 243.95,-312.33 250.57,-314.62\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"232.88\" y=\"-326.79\" font-family=\"Times,serif\" font-size=\"14.00\">0.68</text>\n",
"</g>\n",
"<!-- hover_over_title -->\n",
"<g id=\"node3\" class=\"node\">\n",
"<title>hover_over_title</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"463.22\" cy=\"-275.83\" rx=\"69.81\" ry=\"69.81\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"463.22\" y=\"-271.16\" font-family=\"Times,serif\" font-size=\"14.00\">hover_over_title</text>\n",
"</g>\n",
"<!-- view_item_page&#45;&gt;hover_over_title -->\n",
"<g id=\"edge3\" class=\"edge\">\n",
"<title>view_item_page&#45;&gt;hover_over_title</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M300.48,-250.14C307.03,-251.43 313.58,-252.69 319.89,-253.83 340.12,-257.51 362.05,-261.1 382.5,-264.27\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"381.77,-267.7 392.19,-265.76 382.83,-260.78 381.77,-267.7\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"335.64\" y=\"-263.17\" font-family=\"Times,serif\" font-size=\"14.00\">0.29</text>\n",
"</g>\n",
"<!-- hover_over_paragraph -->\n",
"<g id=\"node4\" class=\"node\">\n",
"<title>hover_over_paragraph</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"463.22\" cy=\"-93.83\" rx=\"93.83\" ry=\"93.83\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"463.22\" y=\"-89.16\" font-family=\"Times,serif\" font-size=\"14.00\">hover_over_paragraph</text>\n",
"</g>\n",
"<!-- view_item_page&#45;&gt;hover_over_paragraph -->\n",
"<g id=\"edge4\" class=\"edge\">\n",
"<title>view_item_page&#45;&gt;hover_over_paragraph</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M292.09,-199.63C316.79,-184.27 346.14,-166.02 373.44,-149.04\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"375.08,-152.15 381.72,-143.89 371.38,-146.2 375.08,-152.15\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"335.64\" y=\"-185.68\" font-family=\"Times,serif\" font-size=\"14.00\">0.04</text>\n",
"</g>\n",
"<!-- hover_over_title&#45;&gt;view_item_page -->\n",
"<g id=\"edge5\" class=\"edge\">\n",
"<title>hover_over_title&#45;&gt;view_item_page</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M399.53,-246.73C384.12,-240.88 367.42,-235.6 351.39,-232.58 339.13,-230.28 326.03,-229.26 313.19,-229.04\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"313.51,-225.54 303.51,-229.04 313.51,-232.54 313.51,-225.54\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"335.64\" y=\"-236.53\" font-family=\"Times,serif\" font-size=\"14.00\">1.00</text>\n",
"</g>\n",
"</svg>\n"
],
"text/plain": [
"<graphviz.graphs.Digraph at 0x7f0779e818b0>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[]\n"
]
},
{
"data": {
"image/svg+xml": [
"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
"<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
"<!-- Generated by graphviz version 13.1.2 (0)\n",
" -->\n",
"<!-- Pages: 1 -->\n",
"<svg width=\"8pt\" height=\"8pt\"\n",
" viewBox=\"0.00 0.00 8.00 8.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
"<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 4)\">\n",
"<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-4 4,-4 4,4 -4,4\"/>\n",
"</g>\n",
"</svg>\n"
],
"text/plain": [
"<graphviz.graphs.Digraph at 0x7f6800fac980>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0.00000000e+000 1.00000000e+000 0.00000000e+000 0.00000000e+000]\n",
" [0.00000000e+000 6.78571429e-001 2.85714286e-001 3.57142857e-002]\n",
" [0.00000000e+000 1.00000000e+000 0.00000000e+000 0.00000000e+000]\n",
" [2.05833592e-312 2.29175545e-312 4.94065646e-324 6.92110218e-310]]\n",
"238dc588-a7ab-4c0e-bccd-6abca5076c66\n"
]
},
{
"data": {
"image/svg+xml": [
"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
"<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
"<!-- Generated by graphviz version 13.1.2 (0)\n",
" -->\n",
"<!-- Pages: 1 -->\n",
"<svg width=\"565pt\" height=\"354pt\"\n",
" viewBox=\"0.00 0.00 565.00 354.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
"<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 349.64)\">\n",
"<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-349.64 561.05,-349.64 561.05,4 -4,4\"/>\n",
"<!-- page_view -->\n",
"<g id=\"node1\" class=\"node\">\n",
"<title>page_view</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"48.19\" cy=\"-109.83\" rx=\"48.19\" ry=\"48.19\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"48.19\" y=\"-105.16\" font-family=\"Times,serif\" font-size=\"14.00\">page_view</text>\n",
"</g>\n",
"<!-- view_item_page -->\n",
"<g id=\"node2\" class=\"node\">\n",
"<title>view_item_page</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"232.88\" cy=\"-197.83\" rx=\"69.01\" ry=\"69.01\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"232.88\" y=\"-193.16\" font-family=\"Times,serif\" font-size=\"14.00\">view_item_page</text>\n",
"</g>\n",
"<!-- page_view&#45;&gt;view_item_page -->\n",
"<g id=\"edge1\" class=\"edge\">\n",
"<title>page_view&#45;&gt;view_item_page</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M92.02,-130.47C112.32,-140.25 137.13,-152.2 160.18,-163.3\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"158.39,-166.32 168.92,-167.51 161.43,-160.02 158.39,-166.32\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"130.12\" y=\"-157.78\" font-family=\"Times,serif\" font-size=\"14.00\">1.00</text>\n",
"</g>\n",
"<!-- view_item_page&#45;&gt;view_item_page -->\n",
"<g id=\"edge2\" class=\"edge\">\n",
"<title>view_item_page&#45;&gt;view_item_page</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M214.74,-264.59C217.1,-276.51 223.14,-284.84 232.88,-284.84 239.27,-284.84 244.07,-281.26 247.28,-275.42\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"250.57,-276.62 250.52,-266.02 243.95,-274.33 250.57,-276.62\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"232.88\" y=\"-288.79\" font-family=\"Times,serif\" font-size=\"14.00\">0.19</text>\n",
"</g>\n",
"<!-- hover_over_title -->\n",
"<g id=\"node3\" class=\"node\">\n",
"<title>hover_over_title</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"463.22\" cy=\"-275.83\" rx=\"69.81\" ry=\"69.81\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"463.22\" y=\"-271.16\" font-family=\"Times,serif\" font-size=\"14.00\">hover_over_title</text>\n",
"</g>\n",
"<!-- view_item_page&#45;&gt;hover_over_title -->\n",
"<g id=\"edge3\" class=\"edge\">\n",
"<title>view_item_page&#45;&gt;hover_over_title</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M289.6,-237.16C299.36,-242.77 309.67,-247.94 319.89,-251.83 339.45,-259.28 361.4,-264.43 382.1,-267.98\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"381.52,-271.43 391.95,-269.55 382.62,-264.52 381.52,-271.43\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"335.64\" y=\"-265.16\" font-family=\"Times,serif\" font-size=\"14.00\">0.38</text>\n",
"</g>\n",
"<!-- hover_over_paragraph -->\n",
"<g id=\"node4\" class=\"node\">\n",
"<title>hover_over_paragraph</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"463.22\" cy=\"-93.83\" rx=\"93.83\" ry=\"93.83\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"463.22\" y=\"-89.16\" font-family=\"Times,serif\" font-size=\"14.00\">hover_over_paragraph</text>\n",
"</g>\n",
"<!-- view_item_page&#45;&gt;hover_over_paragraph -->\n",
"<g id=\"edge4\" class=\"edge\">\n",
"<title>view_item_page&#45;&gt;hover_over_paragraph</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M300.22,-180.71C317.22,-175.46 335.24,-169.12 351.39,-161.83 358.97,-158.41 366.67,-154.57 374.29,-150.49\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"375.84,-153.63 382.92,-145.75 372.47,-147.5 375.84,-153.63\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"335.64\" y=\"-178.15\" font-family=\"Times,serif\" font-size=\"14.00\">0.44</text>\n",
"</g>\n",
"<!-- hover_over_title&#45;&gt;view_item_page -->\n",
"<g id=\"edge5\" class=\"edge\">\n",
"<title>hover_over_title&#45;&gt;view_item_page</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M398.52,-248.36C383.21,-242.16 366.82,-235.87 351.39,-230.58 338.42,-226.15 324.5,-221.86 310.94,-217.93\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"312.2,-214.65 301.62,-215.28 310.28,-221.39 312.2,-214.65\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"335.64\" y=\"-234.53\" font-family=\"Times,serif\" font-size=\"14.00\">1.00</text>\n",
"</g>\n",
"<!-- hover_over_paragraph&#45;&gt;page_view -->\n",
"<g id=\"edge6\" class=\"edge\">\n",
"<title>hover_over_paragraph&#45;&gt;page_view</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M369.13,-95.76C310.26,-97.17 232.59,-99.41 163.87,-102.58 145.72,-103.42 125.98,-104.58 108.06,-105.73\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"107.86,-102.24 98.1,-106.38 108.31,-109.22 107.86,-102.24\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"232.88\" y=\"-106.53\" font-family=\"Times,serif\" font-size=\"14.00\">0.14</text>\n",
"</g>\n",
"<!-- hover_over_paragraph&#45;&gt;view_item_page -->\n",
"<g id=\"edge7\" class=\"edge\">\n",
"<title>hover_over_paragraph&#45;&gt;view_item_page</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M372.68,-119.15C354.84,-125.32 336.5,-132.51 319.89,-140.58 312.9,-143.98 305.81,-147.87 298.86,-151.98\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"297.49,-148.71 290.78,-156.91 301.14,-154.69 297.49,-148.71\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"335.64\" y=\"-144.53\" font-family=\"Times,serif\" font-size=\"14.00\">0.86</text>\n",
"</g>\n",
"</g>\n",
"</svg>\n"
],
"text/plain": [
"<graphviz.graphs.Digraph at 0x7f6800f97110>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0. 1. 0. 0. ]\n",
" [0. 0.1875 0.375 0.4375 ]\n",
" [0. 1. 0. 0. ]\n",
" [0.14285714 0.85714286 0. 0. ]]\n",
"d176d7c9-4027-4702-9e31-2a71395cdda0\n"
]
},
{
"data": {
"image/svg+xml": [
"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
"<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
"<!-- Generated by graphviz version 13.1.2 (0)\n",
" -->\n",
"<!-- Pages: 1 -->\n",
"<svg width=\"104pt\" height=\"104pt\"\n",
" viewBox=\"0.00 0.00 104.00 104.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
"<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 100.37)\">\n",
"<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-100.37 100.37,-100.37 100.37,4 -4,4\"/>\n",
"<!-- page_view -->\n",
"<g id=\"node1\" class=\"node\">\n",
"<title>page_view</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"48.19\" cy=\"-48.19\" rx=\"48.19\" ry=\"48.19\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"48.19\" y=\"-43.51\" font-family=\"Times,serif\" font-size=\"14.00\">page_view</text>\n",
"</g>\n",
"</g>\n",
"</svg>\n"
],
"text/plain": [
"<graphviz.graphs.Digraph at 0x7f6800f97110>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0.]]\n",
"f0317a5d-e424-44e9-b784-c8f7291ffe31\n"
]
},
{
"data": {
"image/svg+xml": [
"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
"<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
" \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
"<!-- Generated by graphviz version 13.1.2 (0)\n",
" -->\n",
"<!-- Pages: 1 -->\n",
"<svg width=\"310pt\" height=\"160pt\"\n",
" viewBox=\"0.00 0.00 310.00 160.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
"<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 156.44)\">\n",
"<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-156.44 305.89,-156.44 305.89,4 -4,4\"/>\n",
"<!-- page_view -->\n",
"<g id=\"node1\" class=\"node\">\n",
"<title>page_view</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"48.19\" cy=\"-69.01\" rx=\"48.19\" ry=\"48.19\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"48.19\" y=\"-64.33\" font-family=\"Times,serif\" font-size=\"14.00\">page_view</text>\n",
"</g>\n",
"<!-- page_view&#45;&gt;page_view -->\n",
"<g id=\"edge1\" class=\"edge\">\n",
"<title>page_view&#45;&gt;page_view</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M33.03,-115.09C34.09,-126.6 39.14,-135.19 48.19,-135.19 53.98,-135.19 58.13,-131.66 60.65,-126.1\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"64.01,-127.11 62.98,-116.56 57.21,-125.45 64.01,-127.11\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"48.19\" y=\"-139.14\" font-family=\"Times,serif\" font-size=\"14.00\">0.50</text>\n",
"</g>\n",
"<!-- view_item_page -->\n",
"<g id=\"node2\" class=\"node\">\n",
"<title>view_item_page</title>\n",
"<ellipse fill=\"none\" stroke=\"black\" cx=\"232.88\" cy=\"-69.01\" rx=\"69.01\" ry=\"69.01\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"232.88\" y=\"-64.33\" font-family=\"Times,serif\" font-size=\"14.00\">view_item_page</text>\n",
"</g>\n",
"<!-- page_view&#45;&gt;view_item_page -->\n",
"<g id=\"edge2\" class=\"edge\">\n",
"<title>page_view&#45;&gt;view_item_page</title>\n",
"<path fill=\"none\" stroke=\"black\" d=\"M96.71,-69.01C113.69,-69.01 133.31,-69.01 152.25,-69.01\"/>\n",
"<polygon fill=\"black\" stroke=\"black\" points=\"152.1,-72.51 162.1,-69.01 152.1,-65.51 152.1,-72.51\"/>\n",
"<text xml:space=\"preserve\" text-anchor=\"middle\" x=\"130.12\" y=\"-72.96\" font-family=\"Times,serif\" font-size=\"14.00\">0.50</text>\n",
"</g>\n",
"</g>\n",
"</svg>\n"
],
"text/plain": [
"<graphviz.graphs.Digraph at 0x7f6800bf50f0>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[5.0e-001 5.0e-001]\n",
" [9.9e-324 1.5e-323]]\n"
]
}
],
"source": [
"def explore_session(session_id: str):\n",
"    \"\"\"Print a session id, display its event-transition graph, and return the matrix.\"\"\"\n",
"    session_events = df[df['sessionId'] == session_id]\n",
"    print(session_id)\n",
"    probs, states = build_transition_prob_matrix(session_events)\n",
"    graph = render_graph(\n",
"        f\"session_{session_id}\",\n",
"        probs,\n",
"        ls_index=states,\n",
"        threshold=0.01,\n",
"        fmt=\"svg\",\n",
"        view=False,\n",
"    )\n",
"    display(graph)\n",
"    return probs\n",
"\n",
"\n",
"# Walk every known session and print its transition matrix.\n",
"for session in sessions:\n",
"    print(explore_session(session))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (PHANTOM)",
"language": "python",
"name": "phantom"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,165 @@
import pytest
import pandas as pd
import numpy as np
from procesing.steps.session import (
TemporalFeatureStep,
BehavioralFeatureStep,
ProductFeatureStep,
UserAgentFeatureStep,
ExtractSessionFeaturesStep,
JoinLabelsStep,
ValidateDataStep,
)
# TemporalFeatureStep tests
def test_temporal_empty(pipeline_context):
    """An empty frame yields an empty result that still has a ``sessionId`` column."""
    step = TemporalFeatureStep(pipeline_context)
    features = step.transform(pd.DataFrame())
    assert 'sessionId' in features.columns
    assert features.empty
def test_temporal_basic(pipeline_context, session_interactions):
    """Timing columns are present and ``total_interactions`` sums to the input row count."""
    features = TemporalFeatureStep(pipeline_context).transform(session_interactions)
    for column in ('session_duration_sec', 'interaction_velocity', 'max_velocity_5min'):
        assert column in features.columns
    assert features['total_interactions'].sum() == len(session_interactions)
def test_temporal_timeout(pipeline_context):
    """Two events one hour apart with ``timeout_sec=900`` give a duration of 0."""
    timestamps = ['2025-01-01T10:00:00Z', '2025-01-01T11:00:00Z']  # 1 hour gap
    events = pd.DataFrame({
        'sessionId': ['s1', 's1'],
        'ts': timestamps,
    })
    step = TemporalFeatureStep(pipeline_context, timeout_sec=900)
    first_row = step.transform(events).iloc[0]
    # The gap exceeds the timeout, so the duration is expected to be zero.
    assert first_row['session_duration_sec'] == 0
# BehavioralFeatureStep tests
def test_behavioral_empty(pipeline_context):
    """An empty frame still produces a result with a ``sessionId`` column."""
    step = BehavioralFeatureStep(pipeline_context)
    features = step.transform(pd.DataFrame())
    assert 'sessionId' in features.columns
def test_behavioral_counts(pipeline_context, session_interactions):
    """Count columns are present and ``total_events`` sums to the input row count."""
    counts = BehavioralFeatureStep(pipeline_context).transform(session_interactions)
    for column in ('page_views', 'item_views', 'hover_events'):
        assert column in counts.columns
    assert counts['total_events'].sum() == len(session_interactions)
def test_behavioral_hover_prefix(pipeline_context):
    """Two ``hover_over_*`` events in one session give ``hover_events == 2``."""
    hover_names = ['hover_over_custom', 'hover_over_button']
    events = pd.DataFrame({
        'sessionId': ['s1', 's1'],
        'eventName': hover_names,
        'page': ['/products', '/products'],
    })
    step = BehavioralFeatureStep(pipeline_context)
    first_row = step.transform(events).iloc[0]
    assert first_row['hover_events'] == 2
# ProductFeatureStep tests
def test_product_empty(pipeline_context):
    """An empty frame still produces a result with a ``sessionId`` column."""
    step = ProductFeatureStep(pipeline_context)
    features = step.transform(pd.DataFrame())
    assert 'sessionId' in features.columns
def test_product_features(pipeline_context, session_interactions):
    """Product columns are present and at least one product view is counted."""
    step = ProductFeatureStep(pipeline_context)
    product_stats = step.transform(session_interactions)
    for column in ('unique_products_viewed', 'price_range'):
        assert column in product_stats.columns
    assert product_stats['unique_products_viewed'].sum() > 0
# UserAgentFeatureStep tests
def test_ua_empty(pipeline_context):
    """An empty frame still produces a result with a ``sessionId`` column."""
    step = UserAgentFeatureStep(pipeline_context)
    features = step.transform(pd.DataFrame())
    assert 'sessionId' in features.columns
def test_ua_headless_detection(pipeline_context):
    """``is_headless`` is False for a regular Chrome UA and True for HeadlessChrome."""
    agents = pd.DataFrame({
        'sessionId': ['s1', 's2'],
        'userAgent': ['Mozilla/5.0 Chrome/120', 'HeadlessChrome/120'],
    })
    features = UserAgentFeatureStep(pipeline_context).transform(agents)
    assert 'is_headless' in features.columns
    flag_by_session = {
        session_id: flag
        for session_id, flag in zip(features['sessionId'], features['is_headless'])
    }
    # Equality (not identity) on purpose: the flags may be NumPy booleans.
    assert flag_by_session['s1'] == False
    assert flag_by_session['s2'] == True
def test_ua_browser_family(pipeline_context):
    """Firefox and Safari UAs map to their family; an unrecognised UA maps to 'Other'."""
    agents = pd.DataFrame({
        'sessionId': ['s1', 's2', 's3'],
        'userAgent': ['Mozilla/5.0 Firefox/120', 'Safari/605.1.15', 'Unknown'],
    })
    features = UserAgentFeatureStep(pipeline_context).transform(agents)
    family_by_session = dict(zip(features['sessionId'], features['browser_family']))
    expected = {'s1': 'Firefox', 's2': 'Safari', 's3': 'Other'}
    for session_id, family in expected.items():
        assert family_by_session[session_id] == family
def test_ua_automation_detection(pipeline_context):
    """``is_automation`` is True for a Selenium WebDriver UA and False for plain Chrome."""
    agents = pd.DataFrame({
        'sessionId': ['s1', 's2'],
        'userAgent': ['Selenium WebDriver', 'Normal Chrome/120'],
    })
    features = UserAgentFeatureStep(pipeline_context).transform(agents)
    flag_by_session = {
        session_id: flag
        for session_id, flag in zip(features['sessionId'], features['is_automation'])
    }
    # Equality (not identity) on purpose: the flags may be NumPy booleans.
    assert flag_by_session['s1'] == True
    assert flag_by_session['s2'] == False
# ExtractSessionFeaturesStep tests
def test_extract_empty(pipeline_context):
    """An empty input frame yields an empty feature frame."""
    step = ExtractSessionFeaturesStep(pipeline_context)
    features = step.transform(pd.DataFrame())
    assert features.empty
def test_extract_merges_all(pipeline_context, session_interactions):
    """The combined output carries one column from each feature step plus ``experimentId``."""
    step = ExtractSessionFeaturesStep(pipeline_context)
    features = step.transform(session_interactions)
    required = (
        'session_duration_sec',
        'total_events',
        'unique_products_viewed',
        'is_headless',
    )
    for column in required:
        assert column in features.columns
    assert 'experimentId' in features.columns
# JoinLabelsStep tests
def test_join_labels_tuple_input(pipeline_context):
    """A (features, experiments) tuple with a human-only experiment gives ``is_agent == False``."""
    features = pd.DataFrame({
        'sessionId': ['s1'],
        'experimentId': ['exp1'],
        'total_events': [5],
    })
    experiments = pd.DataFrame({
        'id': ['exp1'],
        'xp_human_only': [True],
    })
    labelled = JoinLabelsStep(pipeline_context).transform((features, experiments))
    assert 'is_agent' in labelled.columns
    first_row = labelled.iloc[0]
    assert first_row['is_agent'] == False
def test_join_labels_empty_experiments(pipeline_context):
    """With an empty experiments frame, ``is_agent`` comes back missing (NA)."""
    features = pd.DataFrame({'sessionId': ['s1'], 'experimentId': ['exp1']})
    no_experiments = pd.DataFrame()
    labelled = JoinLabelsStep(pipeline_context).transform((features, no_experiments))
    first_row = labelled.iloc[0]
    assert pd.isna(first_row['is_agent'])
# ValidateDataStep tests
def test_validate_empty(pipeline_context):
    """Validating an empty frame caches a report whose status is 'empty'."""
    step = ValidateDataStep(pipeline_context)
    step.transform(pd.DataFrame())
    cached_report = pipeline_context.get_cached('validation_report')
    assert cached_report['status'] == 'empty'
def test_validate_missing_cols(pipeline_context):
    """A frame without ``eventName`` is reported 'invalid' with that column listed as missing."""
    incomplete = pd.DataFrame({'sessionId': ['s1'], 'ts': ['2025-01-01']})
    step = ValidateDataStep(pipeline_context)
    step.transform(incomplete)
    cached_report = pipeline_context.get_cached('validation_report')
    assert cached_report['status'] == 'invalid'
    assert 'eventName' in cached_report['missing_cols']
def test_validate_valid(pipeline_context, session_interactions):
    """The fixture interactions validate as 'valid' with a positive session count."""
    step = ValidateDataStep(pipeline_context)
    step.transform(session_interactions)
    cached_report = pipeline_context.get_cached('validation_report')
    assert cached_report['status'] == 'valid'
    assert cached_report['sessions'] > 0