diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..2bb1107 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,17 @@ +.git +.venv +.venv-tpu +**/__pycache__ +**/*.pyc +**/*.pyo +**/.pytest_cache +**/.mypy_cache +**/.ruff_cache +**/.ipynb_checkpoints +wandb +build +paper/build +paper/build-cais +node_modules +**/node_modules +*.egg-info diff --git a/.env.sweep.example b/.env.sweep.example new file mode 100644 index 0000000..1cfb168 --- /dev/null +++ b/.env.sweep.example @@ -0,0 +1,18 @@ +# Copy this file to .env.sweep and fill in values. + +# Required for wandb runs and sweep agent workers. +WANDB_API_KEY= +WANDB_ENTITY= +WANDB_PROJECT=phantom-pricing + +# Required for private repo bootstrap workers. +GITHUB_TOKEN= + +# Optional defaults for bootstrap mode. +# REPO_URL=https://github.com/org/repo.git +# BRANCH=main +# WORKDIR=$HOME/PHANTOM-agent +# SWEEP_ID=entity/project/id +# AGENT_COUNT=0 +# AGENT_LOOP=1 +# RETRY_SECONDS=20 diff --git a/.gitignore b/.gitignore index f18e3d4..8ae7e83 100644 --- a/.gitignore +++ b/.gitignore @@ -1,21 +1,50 @@ +# environment and secrets **/.env +.env.* +!.env.*.example **/.venv + +# python build/cache artifacts **/__pycache__ +phantom.egg-info/ +*.egg-info/ + +# notebook artifacts **/.ipynb_checkpoints/ **/.virtual_documents/ + +# editor/tool state +**/.pdf-view-restore +.nextstep +.ignore-gitlogue +.cloudflare + +# generated svg/graphics **/session_*.svg **/*graph.svg **/auto/*.el + +# misc generated *.old **/package-lock.json **/*.parquet **/_build/ +# paper build artifacts paper/src/bib/auto -**/_build/ paper/src/auto/* paper/src/bib/auto paper/template/* +paper/build-cais/ +paper/src/main.pdf +paper/src/main-blx.bib +paper/src/svg-inkscape/ +paper/src/mirrors/ +paper/variations/ +paper/src/graphics/test_*.png +thesis-latest.pdf + +# experiment run artifacts and logs docs/goals/*.md PHANTOM.wiki/ experiments/airflow/logs/* @@ -23,11 +52,35 @@ experiments/airflow/logs/scheduler/ 
experiments/airflow/logs/dag_processor_manager/ experiments/collected_data/ experiments/agents/collected_data/ +tests/e2e/test-results/ +tests/e2e/node_modules/** + +# rl/sim run outputs sim/rl/behavior_loader/*.dot sim/rl/behavior_loader/*.png sim/rl/behavior_loader/*.svg sim/rl/behavior_loader/*.pdf -tests/e2e/node_modules/** +sim/rl/runs/ lab/case/thesis/runs*/ sim/case/thesis_simplified/runs*/ + +# model binaries +engine/models/*.zip +*.zip + +# wandb local state +wandb/ + +# data directory (large datasets) +data/ + +# ktem local app data +ktem_app_data/ + +# generated visualization pdfs +*_mdp_viz.pdf +phantom_env_comparison.png +sim/phantom_env_comparison.png + +# web clone PHANTOM_web/* diff --git a/AGENTS.md b/AGENTS.md new file mode 120000 index 0000000..681311e --- /dev/null +++ b/AGENTS.md @@ -0,0 +1 @@ +CLAUDE.md \ No newline at end of file diff --git a/engine/sweeps/tpu_jax.yaml b/engine/sweeps/tpu_jax.yaml new file mode 100644 index 0000000..6b4e001 --- /dev/null +++ b/engine/sweeps/tpu_jax.yaml @@ -0,0 +1,93 @@ +method: bayes +metric: + name: sweep/score + goal: maximize +command: + - ${env} + - python + - -m + - engine.train +parameters: + # fixed: always use JAX backend so TPU chips are actually exercised + use_jax: + value: true + # all four algos have JAX implementations + algo: + values: [ppo, a2c, dqn, qtable] + total_timesteps: + values: [50000, 80000, 120000] + checkpoint_interval: + value: 200000 + seed: + values: [13, 42, 77] + n_products: + values: [8, 10, 12] + # COI framework parameters -- primary research variables + alpha: + distribution: uniform + min: 0.1 + max: 0.6 + lambda_coi: + distribution: uniform + min: 0.05 + max: 0.6 + robust_radius: + distribution: uniform + min: 0.0 + max: 0.3 + robust_points: + values: [3, 5, 7] + info_value: + distribution: uniform + min: 0.5 + max: 2.0 + revenue_weight: + values: [0.005, 0.01, 0.02] + # shared hyperparameters + learning_rate: + distribution: log_uniform_values + min: 1.0e-5 + max: 
1.0e-3 + gamma: + values: [0.97, 0.99, 0.995] + # JAX parallelism -- key lever for TPU throughput + jax_num_envs: + values: [8, 16, 32] + jax_num_steps: + values: [64, 128, 256] + jax_num_minibatches: + values: [2, 4, 8] + jax_update_epochs: + values: [2, 4, 8] + # PPO/A2C specific + gae_lambda: + values: [0.9, 0.95, 0.98] + clip_range: + values: [0.1, 0.2, 0.3] + ent_coef: + values: [0.0, 0.005, 0.01] + # DQN specific + buffer_size: + values: [20000, 50000, 100000] + batch_size: + values: [128, 256, 512] + learning_starts: + values: [500, 1000, 3000] + exploration_fraction: + values: [0.1, 0.2, 0.3] + exploration_final_eps: + values: [0.01, 0.03, 0.05] + # QTable specific + q_lr: + values: [0.03, 0.05, 0.1, 0.2] + eps_end: + values: [0.02, 0.05, 0.1] + eps_decay: + values: [0.999, 0.9995, 0.9999] + # action space + action_levels: + values: [7, 9, 11] + action_scale_low: + values: [0.75, 0.8, 0.85] + action_scale_high: + values: [1.15, 1.2, 1.25] diff --git a/engine/sweeps/tpu_pod.yaml b/engine/sweeps/tpu_pod.yaml new file mode 100644 index 0000000..35d8ded --- /dev/null +++ b/engine/sweeps/tpu_pod.yaml @@ -0,0 +1,64 @@ +method: bayes +metric: + name: sweep/score + goal: maximize +command: + - ${env} + - python + - -m + - engine.train +parameters: + use_jax: + value: true + # pmap requires all workers to compile the same computation graph shape, + # so structural params are fixed -- only research/scalar params are swept + algo: + values: [ppo, a2c] + jax_num_envs: + value: 32 + jax_num_steps: + value: 128 + jax_num_minibatches: + value: 4 + jax_update_epochs: + value: 4 + total_timesteps: + value: 100000 + checkpoint_interval: + value: 200000 + n_products: + value: 10 + action_levels: + value: 9 + # research parameters -- primary sweep targets + alpha: + distribution: uniform + min: 0.1 + max: 0.6 + lambda_coi: + distribution: uniform + min: 0.05 + max: 0.6 + robust_radius: + distribution: uniform + min: 0.0 + max: 0.3 + info_value: + distribution: uniform + min: 
0.5 + max: 2.0 + revenue_weight: + values: [0.005, 0.01, 0.02] + # training hyperparameters + learning_rate: + distribution: log_uniform_values + min: 1.0e-5 + max: 1.0e-3 + gamma: + values: [0.97, 0.99, 0.995] + gae_lambda: + values: [0.9, 0.95, 0.98] + clip_range: + values: [0.1, 0.2, 0.3] + ent_coef: + values: [0.0, 0.005, 0.01] diff --git a/engine/wandb_checkpoint.py b/engine/wandb_checkpoint.py new file mode 100644 index 0000000..4deea92 --- /dev/null +++ b/engine/wandb_checkpoint.py @@ -0,0 +1,130 @@ +from __future__ import annotations + +import hashlib +import json +import re +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Any, Mapping + +try: + import wandb + from wandb.errors import CommError + + HAS_WANDB = True +except ImportError: + HAS_WANDB = False + wandb = None # type: ignore[assignment] + CommError = RuntimeError # type: ignore[assignment] + + +def _safe_value(value: Any) -> Any: + if isinstance(value, (str, int, float, bool)) or value is None: + return value + if isinstance(value, (list, tuple)): + return [_safe_value(v) for v in value] + if isinstance(value, dict): + return {str(k): _safe_value(value[k]) for k in sorted(value)} + return str(value) + + +def _safe_scope(scope: str | None) -> str: + raw = "manual" if scope in (None, "") else str(scope) + cleaned = re.sub(r"[^A-Za-z0-9_.-]+", "-", raw).strip("-") + return cleaned or "manual" + + +def checkpoint_artifact_name( + cfg: Mapping[str, Any], *, backend: str, sweep_id: str | None = None +) -> str: + payload = {k: _safe_value(cfg[k]) for k in sorted(cfg)} + scope = _safe_scope(sweep_id) + canonical = json.dumps( + {"backend": backend, "scope": scope, "cfg": payload}, + sort_keys=True, + separators=(",", ":"), + ) + digest = hashlib.sha1(canonical.encode("utf-8")).hexdigest()[:14] + return f"phantom-{backend}-ckpt-{scope}-{digest}"[:128] + + +def _is_missing_artifact_error(exc: Exception) -> bool: + if isinstance(exc, CommError): + msg = str(exc).lower() 
+ return "not found" in msg or "does not exist" in msg + return False + + +def download_latest_checkpoint( + artifact_name: str, *, file_name: str +) -> tuple[Path, dict[str, Any]] | None: + if not HAS_WANDB or wandb.run is None: + return None + try: + artifact = wandb.run.use_artifact(f"{artifact_name}:latest") + except Exception as exc: + if _is_missing_artifact_error(exc): + return None + raise + directory = Path(artifact.download()) + checkpoint_path = directory / file_name + if not checkpoint_path.exists(): + return None + metadata = dict(getattr(artifact, "metadata", {}) or {}) + return checkpoint_path, metadata + + +def _aliases_from_metadata(metadata: dict[str, Any] | None) -> list[str]: + aliases = ["latest"] + if metadata is None: + return aliases + if "step" in metadata: + try: + aliases.append(f"step-{int(metadata['step'])}") + except (TypeError, ValueError): + pass + return aliases + + +def log_checkpoint_bytes( + artifact_name: str, + *, + file_name: str, + payload: bytes, + metadata: dict[str, Any] | None = None, +) -> bool: + if not HAS_WANDB or wandb.run is None: + return False + with TemporaryDirectory(prefix="phantom-ckpt-") as tmpdir: + path = Path(tmpdir) / file_name + path.write_bytes(payload) + artifact = wandb.Artifact( + name=artifact_name, + type="checkpoint", + metadata=metadata or {}, + ) + artifact.add_file(path.as_posix(), name=file_name) + wandb.log_artifact(artifact, aliases=_aliases_from_metadata(metadata)) + return True + + +def log_checkpoint_file( + artifact_name: str, + *, + file_path: str | Path, + artifact_file_name: str, + metadata: dict[str, Any] | None = None, +) -> bool: + if not HAS_WANDB or wandb.run is None: + return False + src = Path(file_path) + if not src.exists(): + return False + artifact = wandb.Artifact( + name=artifact_name, + type="checkpoint", + metadata=metadata or {}, + ) + artifact.add_file(src.as_posix(), name=artifact_file_name) + wandb.log_artifact(artifact, aliases=_aliases_from_metadata(metadata)) + 
return True diff --git a/experiments/airflow/dags/session_pricing_pipeline.py b/experiments/airflow/dags/session_pricing_pipeline.py new file mode 100644 index 0000000..ab8db77 --- /dev/null +++ b/experiments/airflow/dags/session_pricing_pipeline.py @@ -0,0 +1,269 @@ +""" +Session-Aware Pricing DAG +THIS implements the core pricing computation (policy layer). + +Flow: τ → θ̂ → D → p* + 1. Fetch recent sessions from Kafka (last 10 active) + 2. Extract features per session (τ → θ̂) + 3. Map features to demand proxy (θ̂ → D) + 4. Compute optimal prices (D → p*) + 5. Write to Redis session:{sessionId}:prices + +Scheduled: every 1 minute when enabled +""" +from airflow import DAG +from airflow.operators.python import PythonOperator +from airflow.utils.dates import days_ago +from datetime import timedelta +import pandas as pd +import numpy as np +import logging +import sys +import pickle + +sys.path.insert(0, '/opt/airflow') + +from procesing.context import PipelineContext +from procesing.providers import SupabaseProvider, BackendAPIProvider +from procesing.steps.session import ExtractSessionFeaturesStep +from procesing.pricers.simple import SimpleSurgePricer, session_features_to_demand +from procesing.pricing import StateSpace +from lib.model_registry import ModelRegistry + +DEFAULT_ARGS = { + 'owner': 'phantom-research', + 'depends_on_past': False, + 'email_on_failure': False, + 'email_on_retry': False, + 'retries': 1, + 'retry_delay': timedelta(seconds=30), +} + + +class CompositeProvider(SupabaseProvider, BackendAPIProvider): + def __init__(self): + SupabaseProvider.__init__(self) + BackendAPIProvider.__init__(self) + + +def _get_context(store_mode: str = 'hotel') -> PipelineContext: + return PipelineContext(provider=CompositeProvider(), store_mode=store_mode) + + +def fetch_recent_sessions(**kwargs): + """ + Task: Fetch last N active sessions from Kafka. + Returns: DataFrame of interaction events for recent sessions. 
+ """ + dag_conf = kwargs.get('dag_run').conf if kwargs.get('dag_run') else {} + store_mode = dag_conf.get('store_mode', 'hotel') + session_limit = dag_conf.get('session_limit', 10) + + ctx = _get_context(store_mode) + provider = ctx.provider + + # fetch all recent interactions from Kafka + try: + interactions_df = provider.fetch_kafka_topic("user-interactions") + except Exception as e: + logging.error(f"Failed to fetch interactions: {e}") + kwargs['ti'].xcom_push(key='sessions_data', value=pickle.dumps(pd.DataFrame())) + return 0 + + if interactions_df.empty or 'sessionId' not in interactions_df.columns: + kwargs['ti'].xcom_push(key='sessions_data', value=pickle.dumps(pd.DataFrame())) + return 0 + + # identify last N active sessions (most recent by event count) + recent_sessions = interactions_df['sessionId'].value_counts().head(session_limit).index.tolist() + + # filter to only those sessions + filtered_df = interactions_df[interactions_df['sessionId'].isin(recent_sessions)].copy() + + kwargs['ti'].xcom_push(key='sessions_data', value=pickle.dumps(filtered_df)) + kwargs['ti'].xcom_push(key='session_ids', value=recent_sessions) + + logging.info(f"Fetched {len(filtered_df)} events for {len(recent_sessions)} sessions") + return len(recent_sessions) + + +def extract_session_features(**kwargs): + """ + Task: Extract behavioral features from session trajectories. + THIS implements τ → θ̂ transformation. 
+ """ + ti = kwargs['ti'] + sessions_df = pickle.loads(ti.xcom_pull(key='sessions_data')) + + if sessions_df.empty: + ti.xcom_push(key='session_features', value=pickle.dumps(pd.DataFrame())) + return 0 + + dag_conf = kwargs.get('dag_run').conf if kwargs.get('dag_run') else {} + ctx = _get_context(dag_conf.get('store_mode', 'hotel')) + + # extract features using vectorized pipeline + feature_extractor = ExtractSessionFeaturesStep(ctx) + features_df = feature_extractor.transform(sessions_df) + + ti.xcom_push(key='session_features', value=pickle.dumps(features_df)) + + logging.info(f"Extracted {len(features_df.columns)} features for {len(features_df)} sessions") + logging.info(f"Feature columns: {list(features_df.columns)}") + logging.info(f"Sample features (first session):\n{features_df.iloc[0].to_dict()}") + + return len(features_df) + + +def compute_session_prices(**kwargs): + """ + Task: Compute optimal prices for each session. + THIS implements θ̂ → D → p* transformation. + """ + ti = kwargs['ti'] + features_df = pickle.loads(ti.xcom_pull(key='session_features')) + + if features_df.empty: + ti.xcom_push(key='price_results', value=pickle.dumps({})) + return 0 + + dag_conf = kwargs.get('dag_run').conf if kwargs.get('dag_run') else {} + store_mode = dag_conf.get('store_mode', 'hotel') + ctx = _get_context(store_mode) + + # fetch product catalog for base prices + products_df = ctx.provider.fetch_products(store_mode) + if products_df.empty: + logging.error("No products found in catalog") + ti.xcom_push(key='price_results', value=pickle.dumps({})) + return 0 + + products_df['base_price'] = products_df['metadata'].apply( + lambda m: m.get('base_price', 100.0) if isinstance(m, dict) else 100.0 + ) + + # initialize pricing model + pricer = SimpleSurgePricer( + high_threshold=dag_conf.get('high_threshold', 10), + low_threshold=dag_conf.get('low_threshold', 2), + surge_multiplier=dag_conf.get('surge_multiplier', 1.15), + 
discount_multiplier=dag_conf.get('discount_multiplier', 0.95) + ) + pricer.fit(products_df) + + # compute prices per session + price_results = {} + n_products = len(products_df) + + logging.info(f"Starting price computation for {len(features_df)} sessions, {n_products} products") + logging.info(f"Pricer config: high_thresh={pricer.high_threshold}, low_thresh={pricer.low_threshold}, surge_mult={pricer.surge_multiplier}") + + for idx, session_row in features_df.iterrows(): + session_id = session_row.get('sessionId') + if not session_id: + continue + + # map features to demand proxy (θ̂ → D) + session_features_single = pd.DataFrame([session_row]) + demand_proxy = session_features_to_demand(session_features_single) + + logging.info(f"[Session {session_id}] Features → Demand: {demand_proxy:.2f}") + logging.info(f"[Session {session_id}] Key features: velocity={session_row.get('interaction_velocity', 0):.2f}, cart_ratio={session_row.get('cart_to_view_ratio', 0):.2f}, item_views={session_row.get('item_views', 0)}") + + # build state space + state_space = StateSpace( + demand=np.full(n_products, demand_proxy), # broadcast session demand to all products + prices=products_df['base_price'].values, + session_features=session_features_single + ) + + # compute optimal prices (D → p*) + optimal_prices = pricer.predict(state_space) + + base_avg = products_df['base_price'].mean() + optimal_avg = optimal_prices.mean() + price_change_pct = ((optimal_avg - base_avg) / base_avg) * 100 + + logging.info(f"[Session {session_id}] Price adjustment: base_avg={base_avg:.2f}, optimal_avg={optimal_avg:.2f}, change={price_change_pct:+.1f}%") + + # store as dict {productId: price} + price_map = { + str(products_df.iloc[i]['id']): float(optimal_prices[i]) + for i in range(n_products) + } + + price_results[session_id] = price_map + + ti.xcom_push(key='price_results', value=pickle.dumps(price_results)) + + logging.info(f"Computed prices for {len(price_results)} sessions, {n_products} products each") 
+ return len(price_results) + + +def publish_to_registry(**kwargs): + """ + Task: Write session prices to Redis registry. + THIS is the write path: prices → session:{sessionId}:prices + """ + ti = kwargs['ti'] + price_results = pickle.loads(ti.xcom_pull(key='price_results')) + + if not price_results: + logging.warning("No prices to publish") + return 0 + + registry = ModelRegistry() + ttl = kwargs.get('dag_run').conf.get('ttl', 1800) if kwargs.get('dag_run') and kwargs.get('dag_run').conf else 1800 + + published_count = 0 + for session_id, price_map in price_results.items(): + registry.set_session_prices(session_id, price_map, ttl=ttl) + published_count += 1 + + logging.info(f"Published prices for {published_count} sessions to registry (TTL={ttl}s)") + + return { + 'sessions_published': published_count, + 'products_per_session': len(next(iter(price_results.values()))) if price_results else 0, + 'status': 'success' + } + + +# DAG definition +with DAG( + 'session_pricing_pipeline', + default_args=DEFAULT_ARGS, + description='Session-aware pricing: extract features → compute prices → publish to registry', + schedule_interval='*/1 * * * *', # every 1 minute + start_date=days_ago(1), + catchup=False, + max_active_runs=1, + tags=['pricing', 'session-aware', 'research', 'real-time'], +) as dag: + + t_fetch_sessions = PythonOperator( + task_id='fetch_recent_sessions', + python_callable=fetch_recent_sessions, + provide_context=True, + ) + + t_extract_features = PythonOperator( + task_id='extract_session_features', + python_callable=extract_session_features, + provide_context=True, + ) + + t_compute_prices = PythonOperator( + task_id='compute_session_prices', + python_callable=compute_session_prices, + provide_context=True, + ) + + t_publish = PythonOperator( + task_id='publish_to_registry', + python_callable=publish_to_registry, + provide_context=True, + ) + + # linear dependency: fetch → extract → compute → publish + t_fetch_sessions >> t_extract_features >> 
t_compute_prices >> t_publish diff --git a/experiments/ml/encoder/__init__.py b/experiments/ml/encoder/__init__.py new file mode 100644 index 0000000..0b18d1a --- /dev/null +++ b/experiments/ml/encoder/__init__.py @@ -0,0 +1 @@ +from .encoder import Window, extract_windows, build_windows, WindowDataset, PrototypeClassifier, train, loocv diff --git a/experiments/ml/encoder/encoder.py b/experiments/ml/encoder/encoder.py new file mode 100644 index 0000000..2d9f3c2 --- /dev/null +++ b/experiments/ml/encoder/encoder.py @@ -0,0 +1,210 @@ +"""Contrastive encoder via trajectory windowing. Classification by prototype distance.""" +import sys +sys.path.insert(0, "/home/velocitatem/Documents/Projects/PHANTOM/sim/rl/behavior_loader") +sys.path.insert(0, "/home/velocitatem/Documents/Projects/PHANTOM/experiments/ml") + +from sim.rl.behavior_loader.loader import JointLoader, PayloadModel +from arch import TrajectoryEncoder, featurize_trajectory, nt_xent_loss +from typing import List, Dict, Tuple +from dataclasses import dataclass +from datetime import datetime +import numpy as np, torch, torch.nn.functional as F, random, optuna +from torch.utils.data import Dataset, DataLoader +from torch.optim import Adam +from torch.utils.tensorboard import SummaryWriter + +RUNS = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/ml/runs" +AGENT_DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/" +HUMAN_DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/" + + +@dataclass +class Window: + events: List[PayloadModel] + traj_id: str + label: int # 0=human, 1=agent + + +def extract_windows(events: List[PayloadModel], traj_id: str, label: int, + sizes: List[int] = [5, 10, 15], stride: int = 2) -> List[Window]: + """Multi-scale overlapping windows from trajectory""" + n = len(events) + wins = [Window(events[i:i+s], traj_id, label) for s in sizes if n >= s for i in range(0, n-s+1, stride)] + if n >= 3: wins.append(Window(events, 
traj_id, label)) # full traj + return wins + + +def build_windows(data: Dict[str, List], sizes=[5,10,15], stride=2) -> List[Window]: + return [w for tid, evts in data.items() + for w in extract_windows(evts, tid, 0 if tid.startswith('human_') else 1, sizes, stride)] + + +class WindowDataset(Dataset): + """Yields (anchor, positive) pairs from same class""" + def __init__(self, windows: List[Window], dim: int = 64): + self.wins, self.dim = windows, dim + self.by_label = {0: [i for i,w in enumerate(windows) if w.label==0], + 1: [i for i,w in enumerate(windows) if w.label==1]} + self.by_traj = {} + for i, w in enumerate(windows): self.by_traj.setdefault(w.traj_id, []).append(i) + + def __len__(self): return len(self.wins) + + def _feat(self, evts): return featurize_trajectory(evts, None, self.dim) + + def _aug(self, evts): # subsample 70-100% + if len(evts) < 4: return evts + k = max(3, int(len(evts) * random.uniform(0.7, 1.0))) + start = random.randint(0, len(evts) - k) + return evts[start:start+k] + + def __getitem__(self, idx): + w = self.wins[idx] + pool = [i for i in self.by_label[w.label] if self.wins[i].traj_id != w.traj_id] + pos_idx = random.choice(pool) if pool else idx + a = torch.tensor(self._feat(self._aug(w.events)), dtype=torch.float32) + p = torch.tensor(self._feat(self._aug(self.wins[pos_idx].events)), dtype=torch.float32) + return a, p, w.label + + +class PrototypeClassifier: + """Classify by distance to class centroids""" + def __init__(self, encoder: TrajectoryEncoder, device = 'cuda', dim=64): + self.enc, self.dev, self.dim = encoder, device, dim + self.centroids = {0: None, 1: None} + + def fit(self, windows: List[Window]): + self.enc.eval() + embs = {0: [], 1: []} + with torch.no_grad(): + for w in windows: + x = torch.tensor(featurize_trajectory(w.events, None, self.dim), dtype=torch.float32) + z = self.enc(x.unsqueeze(0).unsqueeze(1).to(self.dev)) + embs[w.label].append(z) + self.centroids = {k: torch.cat(v).mean(0, keepdim=True) if v else None 
for k, v in embs.items()} + return self + + def predict(self, events: List[PayloadModel]) -> Tuple[int, float, Dict]: + """Returns (pred, confidence, debug). Confidence via softmax over -distances.""" + self.enc.eval() + with torch.no_grad(): + x = torch.tensor(featurize_trajectory(events, None, self.dim), dtype=torch.float32) + z = self.enc(x.unsqueeze(0).unsqueeze(1).to(self.dev)) + dists = {k: torch.norm(z - c, dim=1).item() for k, c in self.centroids.items() if c is not None} + if not dists: return 0, 0.0, {'d': {}, 'p': [0.5, 0.5]} + pred = min(dists, key=dists.get) + d0, d1 = dists.get(0, 1e6), dists.get(1, 1e6) # softmax(-d) gives higher prob to closer centroid + probs = F.softmax(torch.tensor([[-d0, -d1]]), dim=1).squeeze() + return pred, probs[pred].item(), {'d': dists, 'p': probs.tolist()} + + +def train(epochs=200, lr=5e-4, batch=16, dim=64, emb=32, temp=0.5, + sizes=[5,10,15], stride=2, name=None, verbose=True): + data = JointLoader(HUMAN_DIR, AGENT_DIR).get_data() + wins = build_windows(data, sizes, stride) + if verbose: print(f"Windows: {len(wins)} ({sum(w.label==0 for w in wins)}h/{sum(w.label==1 for w in wins)}a)") + + dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + enc = TrajectoryEncoder(dim, emb).to(dev) + opt = Adam(enc.parameters(), lr=lr) + loader = DataLoader(WindowDataset(wins, dim), batch_size=batch, shuffle=True, drop_last=True) + + name = name or f"enc_{dim}_{emb}_{datetime.now():%Y%m%d_%H%M%S}" + writer = SummaryWriter(f"{RUNS}/encoder/{name}") + + for ep in range(epochs): + enc.train() + total, n = 0.0, 0 + for a, p, _ in loader: + loss = nt_xent_loss(enc(a.unsqueeze(1).to(dev)), enc(p.unsqueeze(1).to(dev)), temp) + opt.zero_grad(); loss.backward(); opt.step() + total += loss.item(); n += 1 + avg = total / max(n, 1) + writer.add_scalar('loss-ntxent', avg, ep) + if verbose and (ep+1) % 20 == 0: print(f"Epoch {ep+1}: {avg:.4f}") + + writer.close() + return enc, wins, dev + + +def loocv(epochs=100, lr=5e-4, dim=64, 
emb=32, temp=0.5, sizes=[5,10,15], stride=2, verbose=True): + """Leave-one-trajectory-out CV""" + data = JointLoader(HUMAN_DIR, AGENT_DIR).get_data() + dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + results = [] + + for test_id in data: + train_data = {k: v for k, v in data.items() if k != test_id} + if not any(k.startswith('human_') for k in train_data) or not any(k.startswith('agent_') for k in train_data): + continue + + wins = build_windows(train_data, sizes, stride) + enc = TrajectoryEncoder(dim, emb).to(dev) + opt = Adam(enc.parameters(), lr=lr) + loader = DataLoader(WindowDataset(wins, dim), batch_size=min(16, len(wins)//2 or 1), + shuffle=True, drop_last=len(wins)>2) + + for _ in range(epochs): + enc.train() + for a, p, _ in loader: + loss = nt_xent_loss(enc(a.unsqueeze(1).to(dev)), enc(p.unsqueeze(1).to(dev)), temp) + opt.zero_grad(); loss.backward(); opt.step() + + clf = PrototypeClassifier(enc, dev, dim).fit(wins) + pred, conf, dbg = clf.predict(data[test_id]) + actual = 0 if test_id.startswith('human_') else 1 + results.append((pred, actual, conf)) + if verbose: print(f"{test_id[:18]}: pred={pred} conf={conf:.2f} actual={actual} {'OK' if pred==actual else 'MISS'}") + + if results: + acc = sum(p==a for p,a,_ in results) / len(results) + if verbose: print(f"\nAccuracy: {acc:.1%} ({sum(p==a for p,a,_ in results)}/{len(results)})") + return acc, results + return 0.0, [] + + +def hparam_tune(n_trials=50, epochs=60, n_jobs=2, verbose=True): + """Optuna hyperparameter search maximizing LOOCV accuracy""" + def objective(trial): + lr = trial.suggest_float('lr', 1e-5, 1e-2, log=True) + dim = trial.suggest_categorical('dim', [32, 64, 128, 256]) + emb = trial.suggest_categorical('emb', [16, 32, 64, 128]) + temp = trial.suggest_float('temp', 0.05, 1.0) + stride = trial.suggest_int('stride', 1, 4) + sizes = [trial.suggest_int(f's{i}', 3, 20) for i in range(3)] + sizes = sorted(set(sizes)) # unique sorted + acc, _ = loocv(epochs, lr, dim, emb, 
temp, sizes, stride, verbose=False) + return acc + + study = optuna.create_study(direction='maximize', study_name='encoder_hparam', + sampler=optuna.samplers.TPESampler(seed=42)) + study.optimize(objective, n_trials=n_trials, n_jobs=n_jobs, show_progress_bar=verbose) + + best = study.best_params + if verbose: + print(f"\nBest accuracy: {study.best_value:.1%}") + print(f"Best params: {best}") + return best, study + + +if __name__ == "__main__": + import argparse + p = argparse.ArgumentParser() + p.add_argument('--mode', choices=['train', 'eval', 'hparam'], default='train') + p.add_argument('--epochs', type=int, default=200) + p.add_argument('--lr', type=float, default=5e-4) + p.add_argument('--dim', type=int, default=128) + p.add_argument('--emb', type=int, default=64) + p.add_argument('--temp', type=float, default=0.1) + p.add_argument('--sizes', type=str, default='5,10,15') + p.add_argument('--stride', type=int, default=2) + p.add_argument('--n_trials', type=int, default=50) + args = p.parse_args() + sizes = [int(x) for x in args.sizes.split(',')] + + if args.mode == 'train': + enc, wins, dev = train(args.epochs, args.lr, 16, args.dim, args.emb, args.temp, sizes, args.stride) + elif args.mode == 'hparam': + best, study = hparam_tune(args.n_trials, min(args.epochs, 60)) + else: + loocv(args.epochs, args.lr, args.dim, args.emb, args.temp, sizes, args.stride) diff --git a/experiments/notebooks/data_export.ipynb b/experiments/notebooks/data_export.ipynb new file mode 100644 index 0000000..7cd9366 --- /dev/null +++ b/experiments/notebooks/data_export.ipynb @@ -0,0 +1,957 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 10, + "id": "62eafcd9-5462-4063-8873-0e7fb9add907", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from kafka import KafkaConsumer\n", + "import pandas as pd\n", + "import json\n", + "import numpy 
as np\n", + "import os\n", + "from dotenv import load_dotenv\n", + "import matplotlib.pyplot as plt\n", + "from IPython.display import display, SVG, Image\n", + "load_dotenv()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4af65cb4-e8cf-4877-b2db-13ac19f3838f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 73 entries, 0 to 72\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 sessionId 73 non-null object \n", + " 1 eventName 73 non-null object \n", + " 2 page 73 non-null object \n", + " 3 productId 67 non-null object \n", + " 4 storeMode 73 non-null object \n", + " 5 userAgent 73 non-null object \n", + " 6 ts 73 non-null object \n", + " 7 metadata_referrer 6 non-null object \n", + " 8 metadata_roomType 45 non-null object \n", + " 9 metadata_price 45 non-null float64\n", + " 10 metadata_nights 45 non-null float64\n", + " 11 metadata_elementText 22 non-null object \n", + " 12 metadata_dwellTime 22 non-null float64\n", + "dtypes: float64(3), object(10)\n", + "memory usage: 7.5+ KB\n" + ] + } + ], + "source": [ + "KAFKA_PORT=os.getenv(\"KAFKA_PORT\", 9092)\n", + "topic = \"user-interactions\"\n", + "consumer = KafkaConsumer(\n", + " topic, \n", + " enable_auto_commit=True,\n", + " value_deserializer=lambda x: json.loads(x.decode('utf-8')),\n", + " auto_offset_reset='earliest', \n", + " bootstrap_servers=['localhost:9092'])\n", + "messages=consumer.poll(timeout_ms=1000,max_records=10000)\n", + "df = []\n", + "for m in messages.values():\n", + " for i in m:\n", + " df.append(i.value)\n", + "df = pd.DataFrame(df)\n", + "# explode metadata col json\n", + "df = df.join(pd.json_normalize(df.pop(\"metadata\"), sep=\".\").add_prefix(\"metadata_\"))\n", + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "f6819a1c-32ab-49c7-845b-5df7bf60f561", + "metadata": {}, + "outputs": [ 
+ { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sessionIdeventNamepageproductIdstoreModeuserAgenttsmetadata_referrermetadata_roomTypemetadata_pricemetadata_nightsmetadata_elementTextmetadata_dwellTime
0d176d7c9-4027-4702-9e31-2a71395cdda0page_view/productsNonehotelMozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...2025-11-14T13:23:46.270ZNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1f0317a5d-e424-44e9-b784-c8f7291ffe31page_view/NonehotelMozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck...2025-11-14T13:26:00.291ZNaNNaNNaNNaNNaN
2f0317a5d-e424-44e9-b784-c8f7291ffe31page_view/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck...2025-11-14T13:26:07.769ZNaNNaNNaNNaNNaN
3f0317a5d-e424-44e9-b784-c8f7291ffe31view_item_page/productshtl-0hotelMozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck...2025-11-14T13:26:15.010ZNaNPremium Room269.01.0NaNNaN
4238dc588-a7ab-4c0e-bccd-6abca5076c66page_view/productsNonehotelMozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...2025-11-14T13:27:15.457ZNaNNaNNaNNaNNaN
5238dc588-a7ab-4c0e-bccd-6abca5076c66view_item_page/productshtl-0hotelMozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...2025-11-14T13:27:15.591ZNaNPremium Room264.02.0NaNNaNNaNNaNNaNNaNNaN
432214d9fad-9b00-40c3-bd0e-7739b6acd654click1762448192425DIVNaNNaNNaNNaNNaN/NaN1623.0493.0NaNNaNNaNNaNNaNNaN
6238dc588-a7ab-4c0e-bccd-6abca5076c66view_item_page/productshtl-0hotelMozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...2025-11-14T13:27:21.483ZNaNPremium Room264.02.0NaNNaN
7238dc588-a7ab-4c0e-bccd-6abca5076c66hover_over_title/productshtl-0hotelMozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...2025-11-14T13:27:22.646ZNaNNaNNaNNaNGrand Plaza Hotel1200.0
8238dc588-a7ab-4c0e-bccd-6abca5076c66view_item_page/productshtl-0hotelMozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...2025-11-14T13:27:25.889ZNaNPremium Room264.02.0NaNNaN
35013fc334-4045-4d5a-8739-dd0a8766a63bpage_view/productsNonehotelMozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...2025-11-14T13:53:59.993ZNaNNaNNaNNaNNaN
36013fc334-4045-4d5a-8739-dd0a8766a63bview_item_page/productshtl-0hotelMozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...2025-11-14T13:54:10.705ZNaNPremium Room223.03.0NaNNaN
37013fc334-4045-4d5a-8739-dd0a8766a63bhover_over_title/productshtl-0hotelMozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...2025-11-14T13:54:11.771ZNaNNaN416.0397.0NaNNaNNaNNaNNaNNaNGrand Plaza Hotel1200.0
38013fc334-4045-4d5a-8739-dd0a8766a63bview_item_page/productshtl-1hotelMozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...2025-11-14T13:54:29.772ZNaNStandard Room267.05.0NaNNaN
39013fc334-4045-4d5a-8739-dd0a8766a63bhover_over_title/productshtl-1hotelMozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...2025-11-14T13:54:30.833ZNaNNaNNaNNaNSeaside Resort1200.0
\n", + "
" + ], + "text/plain": [ + " sessionId eventName page \\\n", + "0 d176d7c9-4027-4702-9e31-2a71395cdda0 page_view /products \n", + "1 f0317a5d-e424-44e9-b784-c8f7291ffe31 page_view / \n", + "2 f0317a5d-e424-44e9-b784-c8f7291ffe31 page_view /products \n", + "3 f0317a5d-e424-44e9-b784-c8f7291ffe31 view_item_page /products \n", + "4 238dc588-a7ab-4c0e-bccd-6abca5076c66 page_view /products \n", + "5 238dc588-a7ab-4c0e-bccd-6abca5076c66 view_item_page /products \n", + "6 238dc588-a7ab-4c0e-bccd-6abca5076c66 view_item_page /products \n", + "7 238dc588-a7ab-4c0e-bccd-6abca5076c66 hover_over_title /products \n", + "8 238dc588-a7ab-4c0e-bccd-6abca5076c66 view_item_page /products \n", + "35 013fc334-4045-4d5a-8739-dd0a8766a63b page_view /products \n", + "36 013fc334-4045-4d5a-8739-dd0a8766a63b view_item_page /products \n", + "37 013fc334-4045-4d5a-8739-dd0a8766a63b hover_over_title /products \n", + "38 013fc334-4045-4d5a-8739-dd0a8766a63b view_item_page /products \n", + "39 013fc334-4045-4d5a-8739-dd0a8766a63b hover_over_title /products \n", + "\n", + " productId storeMode userAgent \\\n", + "0 None hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", + "1 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck... \n", + "2 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck... \n", + "3 htl-0 hotel Mozilla/5.0 (X11; Linux x86_64; rv:143.0) Geck... \n", + "4 None hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n", + "5 htl-0 hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n", + "6 htl-0 hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n", + "7 htl-0 hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n", + "8 htl-0 hotel Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7... \n", + "35 None hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", + "36 htl-0 hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", + "37 htl-0 hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... 
\n", + "38 htl-1 hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", + "39 htl-1 hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", + "\n", + " ts metadata_referrer metadata_roomType \\\n", + "0 2025-11-14T13:23:46.270Z NaN \n", + "1 2025-11-14T13:26:00.291Z NaN \n", + "2 2025-11-14T13:26:07.769Z NaN \n", + "3 2025-11-14T13:26:15.010Z NaN Premium Room \n", + "4 2025-11-14T13:27:15.457Z NaN \n", + "5 2025-11-14T13:27:15.591Z NaN Premium Room \n", + "6 2025-11-14T13:27:21.483Z NaN Premium Room \n", + "7 2025-11-14T13:27:22.646Z NaN NaN \n", + "8 2025-11-14T13:27:25.889Z NaN Premium Room \n", + "35 2025-11-14T13:53:59.993Z NaN \n", + "36 2025-11-14T13:54:10.705Z NaN Premium Room \n", + "37 2025-11-14T13:54:11.771Z NaN NaN \n", + "38 2025-11-14T13:54:29.772Z NaN Standard Room \n", + "39 2025-11-14T13:54:30.833Z NaN NaN \n", + "\n", + " metadata_price metadata_nights metadata_elementText metadata_dwellTime \n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 269.0 1.0 NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "5 264.0 2.0 NaN NaN \n", + "6 264.0 2.0 NaN NaN \n", + "7 NaN NaN Grand Plaza Hotel 1200.0 \n", + "8 264.0 2.0 NaN NaN \n", + "35 NaN NaN NaN NaN \n", + "36 223.0 3.0 NaN NaN \n", + "37 NaN NaN Grand Plaza Hotel 1200.0 \n", + "38 267.0 5.0 NaN NaN \n", + "39 NaN NaN Seaside Resort 1200.0 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby('sessionId').head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "380eca5f-8304-4fb2-be32-e8bcfd312085", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['013fc334-4045-4d5a-8739-dd0a8766a63b',\n", + " '238dc588-a7ab-4c0e-bccd-6abca5076c66',\n", + " 'd176d7c9-4027-4702-9e31-2a71395cdda0',\n", + " 'f0317a5d-e424-44e9-b784-c8f7291ffe31']" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sessions = 
def build_transition_prob_matrix(df: pd.DataFrame):
    """Build a first-order transition probability matrix over ``eventName``.

    Rows whose ``eventName`` is missing are dropped. Every pair of
    consecutive remaining rows, in the frame's existing row order, counts as
    one transition ``previous -> next``.

    Args:
        df: Frame with an ``eventName`` column.
            NOTE(review): presumably the rows of a single session already in
            chronological order; nothing here sorts or groups them - confirm
            at the call site.

    Returns:
        ``(P, labels)`` where ``labels`` lists the distinct event names in
        order of first appearance and ``P[i, j]`` is the fraction of
        transitions leaving ``labels[i]`` that go to ``labels[j]``. A state
        that is never left (e.g. the last event of the sequence) gets an
        all-zero row.
    """
    events = df.dropna(subset=['eventName'])['eventName'].tolist()
    labels = pd.Index(events).unique().tolist()
    position = {event: i for i, event in enumerate(labels)}

    counts = np.zeros((len(labels), len(labels)), dtype=float)
    for src, dst in zip(events, events[1:]):
        counts[position[src], position[dst]] += 1

    row_sums = counts.sum(axis=1, keepdims=True)
    # `where=` skips rows without outgoing transitions; without an explicit
    # `out` buffer those rows would be left as uninitialised memory, so start
    # from zeros to make them well-defined.
    P = np.divide(
        counts,
        row_sums,
        out=np.zeros_like(counts),
        where=row_sums > 0,
    )  # row-normalized
    return P, labels
def _as_prob_df(matrix, labels=None):
    """Return a square DataFrame with index=columns=labels.

    A DataFrame is returned unchanged after checking that its index and
    columns are identical. Anything else is converted to a float array and
    wrapped with ``labels`` on both axes.

    Raises:
        AssertionError: if the input is not square / not aligned, or if the
            number of labels does not match the matrix size.
        ValueError: if ``labels`` is missing for a non-DataFrame input.
    """
    if isinstance(matrix, pd.DataFrame):
        # Index.equals copes with differing lengths; an elementwise `==`
        # raises a pandas ValueError before the assertion message is shown.
        assert matrix.index.equals(matrix.columns), "Index/columns must match."
        return matrix
    matrix = np.asarray(matrix, dtype=float)
    assert matrix.ndim == 2 and matrix.shape[0] == matrix.shape[1], "Matrix must be square."
    if labels is None:
        raise ValueError("labels are required when matrix is not a DataFrame")
    assert len(labels) == matrix.shape[0], "labels length must match matrix size."
    return pd.DataFrame(matrix, index=list(labels), columns=list(labels))


def _df_to_edgelist(P: pd.DataFrame, threshold=0.0, round_digits=2):
    """Build weighted edges > threshold.

    Returns a list of ``(src, dst, label)`` string triples in row-major
    order, where ``label`` is the weight formatted with ``round_digits``
    decimals.
    """
    # Positional access avoids one label lookup per cell and stays scalar
    # even if a label happens to be repeated.
    weights = P.to_numpy(dtype=float)
    targets = [str(dst) for dst in P.columns]
    edges = []
    for i, src in enumerate(P.index):
        for j, dst in enumerate(targets):
            w = float(weights[i, j])
            if w > threshold:
                edges.append((str(src), dst, f"{w:.{round_digits}f}"))
    return edges


def render_graph(fname, matrix, ls_index=None, threshold=0.0, fmt="svg", view=False, round_digits=2):
    """
    Render a transition matrix as a left-to-right Graphviz digraph.

    fname: output file stem (no extension)
    matrix: NumPy array or pandas DataFrame of transition PROBABILITIES
    ls_index: ordered labels (required if matrix is not a DataFrame)
    threshold: hide edges with weight <= threshold
    fmt: 'svg'|'png'|'pdf' etc.
    view: open after rendering
    round_digits: number of decimals shown on edge labels

    Returns the Digraph after it has been rendered to disk.
    """
    P = _as_prob_df(matrix, labels=ls_index)
    edges = _df_to_edgelist(P, threshold=threshold, round_digits=round_digits)

    g = Digraph(format=fmt)
    g.attr(rankdir="LR", size="30")
    g.attr("node", shape="circle")

    # ensure isolated nodes appear
    for node in P.index:
        g.node(str(node), width="1", height="1")

    for src, dst, label in edges:
        g.edge(src, dst, label=label)

    # cleanup=True removes the intermediate DOT source after rendering
    g.render(fname, view=view, cleanup=True)
    return g
+ "\n", + "view_item_page->view_item_page\n", + "\n", + "\n", + "0.68\n", + "\n", + "\n", + "\n", + "hover_over_title\n", + "\n", + "hover_over_title\n", + "\n", + "\n", + "\n", + "view_item_page->hover_over_title\n", + "\n", + "\n", + "0.29\n", + "\n", + "\n", + "\n", + "hover_over_paragraph\n", + "\n", + "hover_over_paragraph\n", + "\n", + "\n", + "\n", + "view_item_page->hover_over_paragraph\n", + "\n", + "\n", + "0.04\n", + "\n", + "\n", + "\n", + "hover_over_title->view_item_page\n", + "\n", + "\n", + "1.00\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.00000000e+000 1.00000000e+000 0.00000000e+000 0.00000000e+000]\n", + " [0.00000000e+000 6.78571429e-001 2.85714286e-001 3.57142857e-002]\n", + " [0.00000000e+000 1.00000000e+000 0.00000000e+000 0.00000000e+000]\n", + " [2.05833592e-312 2.29175545e-312 4.94065646e-324 6.92110218e-310]]\n", + "238dc588-a7ab-4c0e-bccd-6abca5076c66\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "page_view\n", + "\n", + "page_view\n", + "\n", + "\n", + "\n", + "view_item_page\n", + "\n", + "view_item_page\n", + "\n", + "\n", + "\n", + "page_view->view_item_page\n", + "\n", + "\n", + "1.00\n", + "\n", + "\n", + "\n", + "view_item_page->view_item_page\n", + "\n", + "\n", + "0.19\n", + "\n", + "\n", + "\n", + "hover_over_title\n", + "\n", + "hover_over_title\n", + "\n", + "\n", + "\n", + "view_item_page->hover_over_title\n", + "\n", + "\n", + "0.38\n", + "\n", + "\n", + "\n", + "hover_over_paragraph\n", + "\n", + "hover_over_paragraph\n", + "\n", + "\n", 
def explore_session(session_id: str):
    """Show the event-transition graph of one session and return its matrix.

    Selects the rows of the notebook-level ``df`` whose ``sessionId`` equals
    ``session_id``, prints the id, builds the transition matrix, renders it
    to ``session_<id>.svg`` (edges with weight <= 0.01 are hidden) and
    displays the resulting graph inline.

    Returns the transition probability matrix for that session.
    """
    subset = df.loc[df['sessionId'] == session_id]
    print(session_id)
    P, labels = build_transition_prob_matrix(subset)
    graph = render_graph(
        f"session_{session_id}",
        P,
        ls_index=labels,
        threshold=0.01,
        fmt="svg",
        view=False,
    )
    display(graph)
    return P


# Walk every known session, showing its graph and printing its matrix.
for session in sessions:
    print(explore_session(session))
BackendAPIProvider.__init__(self, backend_url=backend_url)\n", + "# example run\n", + "context = PipelineContext(\n", + " provider=Provider(backend_url=\"http://localhost:5000\"),\n", + " store_mode='hotel',\n", + " window_size='5min',\n", + "\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f2a52096-4c3c-4168-bf5b-5567c7aade41", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sessionIdexperimentIdeventNamepageproductIdstoreModeuserAgenttsmetadata_referrermetadata_elementTextmetadata_dateIndexmetadata_dwellTimemetadata_typemetadata_roomTypemetadata_pricemetadata_nightsmetadata_totalmetadata_itemCountdateIndex
0d423ce8a-77aa-4c9a-94d4-d1adddcc3472Nonepage_view/NonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25T20:20:13.061ZNaNNaNNaNNaNNaNNaNNaNNaNNaN<NA>
1d423ce8a-77aa-4c9a-94d4-d1adddcc3472Nonehover_over_title/hotel/productsd018efc1-25e9-4284-b276-80386e048b25hotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25T20:21:17.425ZNaNJunior Suite1.01200.0NaNNaNNaNNaNNaNNaN1
2d423ce8a-77aa-4c9a-94d4-d1adddcc3472Nonehover_over_paragraph/hotel/productsd018efc1-25e9-4284-b276-80386e048b25hotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25T20:21:19.496ZNaNprice1.01202.0NaNNaNNaNNaNNaNNaN1
3d423ce8a-77aa-4c9a-94d4-d1adddcc3472Nonepage_view/hotel/products/d018efc1-25e9-4284-b276-80386e...NonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25T20:21:21.922Zhttp://localhost:3000/hotel/products?dateIndex...NaNNaNNaNNaNNaNNaNNaNNaNNaN<NA>
4d423ce8a-77aa-4c9a-94d4-d1adddcc3472Nonelearn_more_about_item/hotel/products/d018efc1-25e9-4284-b276-80386e...d018efc1-25e9-4284-b276-80386e048b25hotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25T20:21:22.674ZNaNNaN1.0NaNhotelJunior SuiteNaNNaNNaNNaN1
\n", + "
" + ], + "text/plain": [ + " sessionId experimentId eventName \\\n", + "0 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 None page_view \n", + "1 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 None hover_over_title \n", + "2 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 None hover_over_paragraph \n", + "3 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 None page_view \n", + "4 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 None learn_more_about_item \n", + "\n", + " page \\\n", + "0 / \n", + "1 /hotel/products \n", + "2 /hotel/products \n", + "3 /hotel/products/d018efc1-25e9-4284-b276-80386e... \n", + "4 /hotel/products/d018efc1-25e9-4284-b276-80386e... \n", + "\n", + " productId storeMode \\\n", + "0 None hotel \n", + "1 d018efc1-25e9-4284-b276-80386e048b25 hotel \n", + "2 d018efc1-25e9-4284-b276-80386e048b25 hotel \n", + "3 None hotel \n", + "4 d018efc1-25e9-4284-b276-80386e048b25 hotel \n", + "\n", + " userAgent \\\n", + "0 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "1 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "2 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "3 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "4 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "\n", + " ts \\\n", + "0 2025-11-25T20:20:13.061Z \n", + "1 2025-11-25T20:21:17.425Z \n", + "2 2025-11-25T20:21:19.496Z \n", + "3 2025-11-25T20:21:21.922Z \n", + "4 2025-11-25T20:21:22.674Z \n", + "\n", + " metadata_referrer metadata_elementText \\\n", + "0 NaN \n", + "1 NaN Junior Suite \n", + "2 NaN price \n", + "3 http://localhost:3000/hotel/products?dateIndex... 
NaN \n", + "4 NaN NaN \n", + "\n", + " metadata_dateIndex metadata_dwellTime metadata_type metadata_roomType \\\n", + "0 NaN NaN NaN NaN \n", + "1 1.0 1200.0 NaN NaN \n", + "2 1.0 1202.0 NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 1.0 NaN hotel Junior Suite \n", + "\n", + " metadata_price metadata_nights metadata_total metadata_itemCount \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " dateIndex \n", + "0 \n", + "1 1 \n", + "2 1 \n", + "3 \n", + "4 1 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df=FetchInteractionsStep(context).transform(None)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "72160b99-8f5f-4d9e-8a54-99116fe9d202", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['d423ce8a-77aa-4c9a-94d4-d1adddcc3472',\n", + " 'fba26fde-4c50-4545-9734-ff415ac2d791',\n", + " 'e48ae739-dff8-4e56-b9b9-efff9de55a48',\n", + " '3d0fed38-45fd-4d44-8511-d157adacb238',\n", + " 'c404dbe5-116f-42c0-b199-503516dbbe91'], dtype=object)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['sessionId'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "173d968d-fd66-4e16-97d3-da18fbbdc0f2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "sessionId\n", + "d423ce8a-77aa-4c9a-94d4-d1adddcc3472 50\n", + "c404dbe5-116f-42c0-b199-503516dbbe91 18\n", + "e48ae739-dff8-4e56-b9b9-efff9de55a48 8\n", + "fba26fde-4c50-4545-9734-ff415ac2d791 2\n", + "3d0fed38-45fd-4d44-8511-d157adacb238 2\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['sessionId'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": 
"924bf0a9-6143-42f5-b779-780caf902ae8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sessionIdexperimentIdeventNamepageproductIdstoreModeuserAgenttsmetadata_referrermetadata_elementTextmetadata_dateIndexmetadata_dwellTimemetadata_typemetadata_roomTypemetadata_pricemetadata_nightsmetadata_totalmetadata_itemCountdateIndex
30d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35hover_over_title/hotel/productsd018efc1-25e9-4284-b276-80386e048b25hotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25T21:05:43.942ZNaNJunior Suite1.01252.0NaNNaNNaNNaNNaNNaN1
31d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/products/d018efc1-25e9-4284-b276-80386e...NonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25T21:05:45.407Zhttp://localhost:3000/hotel/products?dateIndex...NaNNaNNaNNaNNaNNaNNaNNaNNaN<NA>
32d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35learn_more_about_item/hotel/products/d018efc1-25e9-4284-b276-80386e...d018efc1-25e9-4284-b276-80386e048b25hotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25T21:05:45.515ZNaNNaN1.0NaNhotelJunior SuiteNaNNaNNaNNaN1
33d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25T21:05:50.176ZNaNNaNNaNNaNNaNNaNNaNNaNNaN<NA>
34d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/products/2ddabbfc-4127-48fc-86dc-ebc4c6...NonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25T21:05:54.666Zhttp://localhost:3000/hotel/products?dateIndex...NaNNaNNaNNaNNaNNaNNaNNaNNaN<NA>
\n", + "
" + ], + "text/plain": [ + " sessionId \\\n", + "30 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "31 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "32 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "33 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "34 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "\n", + " experimentId eventName \\\n", + "30 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 hover_over_title \n", + "31 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view \n", + "32 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 learn_more_about_item \n", + "33 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view \n", + "34 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view \n", + "\n", + " page \\\n", + "30 /hotel/products \n", + "31 /hotel/products/d018efc1-25e9-4284-b276-80386e... \n", + "32 /hotel/products/d018efc1-25e9-4284-b276-80386e... \n", + "33 /hotel/products \n", + "34 /hotel/products/2ddabbfc-4127-48fc-86dc-ebc4c6... \n", + "\n", + " productId storeMode \\\n", + "30 d018efc1-25e9-4284-b276-80386e048b25 hotel \n", + "31 None hotel \n", + "32 d018efc1-25e9-4284-b276-80386e048b25 hotel \n", + "33 None hotel \n", + "34 None hotel \n", + "\n", + " userAgent \\\n", + "30 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "31 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "32 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "33 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "34 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "\n", + " ts \\\n", + "30 2025-11-25T21:05:43.942Z \n", + "31 2025-11-25T21:05:45.407Z \n", + "32 2025-11-25T21:05:45.515Z \n", + "33 2025-11-25T21:05:50.176Z \n", + "34 2025-11-25T21:05:54.666Z \n", + "\n", + " metadata_referrer metadata_elementText \\\n", + "30 NaN Junior Suite \n", + "31 http://localhost:3000/hotel/products?dateIndex... NaN \n", + "32 NaN NaN \n", + "33 NaN \n", + "34 http://localhost:3000/hotel/products?dateIndex... 
NaN \n", + "\n", + " metadata_dateIndex metadata_dwellTime metadata_type metadata_roomType \\\n", + "30 1.0 1252.0 NaN NaN \n", + "31 NaN NaN NaN NaN \n", + "32 1.0 NaN hotel Junior Suite \n", + "33 NaN NaN NaN NaN \n", + "34 NaN NaN NaN NaN \n", + "\n", + " metadata_price metadata_nights metadata_total metadata_itemCount \\\n", + "30 NaN NaN NaN NaN \n", + "31 NaN NaN NaN NaN \n", + "32 NaN NaN NaN NaN \n", + "33 NaN NaN NaN NaN \n", + "34 NaN NaN NaN NaN \n", + "\n", + " dateIndex \n", + "30 1 \n", + "31 \n", + "32 1 \n", + "33 \n", + "34 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "session=\"d423ce8a-77aa-4c9a-94d4-d1adddcc3472\"\n", + "df=df[df['sessionId'] == session]\n", + "df=df.dropna(subset=[\"experimentId\"])\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "3db0b073-e141-4ea4-bbf7-7123aeb9057f", + "metadata": {}, + "outputs": [], + "source": [ + "from procesing.steps import ExtractSessionFeaturesStep\n", + "feats = ExtractSessionFeaturesStep(context).transform(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c6527d2c-b68a-42e7-8558-cc957a371260", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sessionIdexperimentIdeventNamepageproductIdstoreModeuserAgenttsmetadata_referrermetadata_elementText...searchescart_addshoversunique_products_viewedproduct_view_depthsession_duration_secinteraction_velocityavg_time_between_eventsstd_time_between_eventscart_to_view_ratio
0d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35hover_over_title/hotel/productsd018efc1-25e9-4284-b276-80386e048b25hotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:05:43.942000+00:00NaNJunior Suite...0.00.01.01.01.00.0000.0000000.0000000.0000000.0
1d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/products/d018efc1-25e9-4284-b276-80386e...NonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:05:45.407000+00:00http://localhost:3000/hotel/products?dateIndex...NaN...0.00.01.01.01.01.46581.9112631.465000NaN0.0
2d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35learn_more_about_item/hotel/products/d018efc1-25e9-4284-b276-80386e...d018efc1-25e9-4284-b276-80386e048b25hotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:05:45.515000+00:00NaNNaN...0.00.01.01.02.01.573114.4310240.7865000.9595440.0
3d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:05:50.176000+00:00NaN...0.00.01.01.02.06.23438.4985562.0780002.3375800.0
4d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/products/2ddabbfc-4127-48fc-86dc-ebc4c6...NonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:05:54.666000+00:00http://localhost:3000/hotel/products?dateIndex...NaN...0.00.01.01.02.010.72427.9746362.6810002.2577180.0
5d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35learn_more_about_item/hotel/products/2ddabbfc-4127-48fc-86dc-ebc4c6...2ddabbfc-4127-48fc-86dc-ebc4c677efa2hotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:05:54.794000+00:00NaNNaN...0.00.01.02.02.010.85233.1736092.1704002.2641840.0
6d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:05:57.670000+00:00NaN...0.00.01.02.02.013.72830.5944062.2880002.0455320.0
7d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/products/2cd7f756-fc65-4ba0-ab01-74521c...NonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:06:03.130000+00:00http://localhost:3000/hotel/products?dateIndex...NaN...0.00.01.02.02.019.18825.0156352.7411432.2190550.0
8d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35learn_more_about_item/hotel/products/2cd7f756-fc65-4ba0-ab01-74521c...2cd7f756-fc65-4ba0-ab01-74521c1fff43hotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:06:03.253000+00:00NaNNaN...0.00.01.03.02.019.31127.9633372.4138752.2533490.0
9d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35add_item_to_cart/hotel/products/2cd7f756-fc65-4ba0-ab01-74521c...2cd7f756-fc65-4ba0-ab01-74521c1fff43hotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:06:06.815000+00:00NaNNaN...0.01.01.03.02.022.87326.2318022.5414442.1422760.0
10d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:06:08.600000+00:00NaN...0.01.01.03.02.024.65826.7661612.4658002.0338730.0
11d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35checkout_start/cartNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:06:11.586000+00:00NaNNaN...0.01.01.03.02.027.64426.0454352.5130911.9358660.0
12d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35hover_over_title/hotel/products7f71fbe2-343c-4a46-94ea-07cbd903a86chotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:06:28.050000+00:00NaNExecutive Suite...0.01.02.04.02.044.10817.6838673.6756674.4301100.0
13d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35hover_over_paragraph/hotel/products7f71fbe2-343c-4a46-94ea-07cbd903a86chotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:06:31.806000+00:00NaNprice...0.01.03.04.02.047.86417.5497243.6818464.2415660.0
14d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/products/7f71fbe2-343c-4a46-94ea-07cbd9...NonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:06:33.847000+00:00http://localhost:3000/hotel/products?dateIndex...NaN...0.01.03.04.02.049.90518.0342653.5646434.0986930.0
15d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35learn_more_about_item/hotel/products/7f71fbe2-343c-4a46-94ea-07cbd9...7f71fbe2-343c-4a46-94ea-07cbd903a86chotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:06:34.029000+00:00NaNNaN...0.01.03.04.03.050.08719.1666503.3391334.0450160.0
16d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:06:37.255000+00:00NaN...0.01.03.04.03.053.31319.1322943.3320633.9079590.0
17d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35hover_over_title/hotel/products7f71fbe2-343c-4a46-94ea-07cbd903a86chotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:06:43.694000+00:00NaNExecutive Suite...0.01.04.04.04.059.75218.0747093.5148243.8581680.0
18d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/products/7f71fbe2-343c-4a46-94ea-07cbd9...NonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:06:44.387000+00:00http://localhost:3000/hotel/products?dateIndex...NaN...0.01.04.04.04.060.44518.8601213.3580563.8016070.0
19d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35learn_more_about_item/hotel/products/7f71fbe2-343c-4a46-94ea-07cbd9...7f71fbe2-343c-4a46-94ea-07cbd903a86chotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:06:44.492000+00:00NaNNaN...0.01.04.04.05.060.55019.8183323.1868423.7691220.0
20d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35add_item_to_cart/hotel/products/7f71fbe2-343c-4a46-94ea-07cbd9...7f71fbe2-343c-4a46-94ea-07cbd903a86chotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:06:46.279000+00:00NaNNaN...0.02.04.04.06.062.33720.2127153.1168503.6819240.0
21d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35checkout_start/cartNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25 21:06:48.275000+00:00NaNNaN...0.02.04.04.06.064.33320.5182413.0634763.5970210.0
22d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/NonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-28 18:26:19.145000+00:00NaN...0.02.04.04.06.064.33321.45088811347.05468253208.0352460.0
23d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35hover_over_title/hotel/productsd018efc1-25e9-4284-b276-80386e048b25hotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-28 18:27:27.527000+00:00NaNJunior Suite...0.02.05.04.06.0132.71510.85031810856.67760952037.8677520.0
24d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-28 18:30:23.429000+00:00NaN...0.02.05.04.06.0308.6174.86039310411.64529250940.7151760.0
25d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-28 18:34:12.526000+00:00NaN...0.02.05.04.06.0537.7142.90117110004.34336049909.7249870.0
26d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-28 18:34:42.426000+00:00NaN...0.02.05.04.06.0567.6142.8540529620.71092348940.4530260.0
27d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-28 18:39:39.397000+00:00NaN...0.02.05.04.06.0864.5851.9431299275.38722248023.5963550.0
28d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-28 18:40:49.203000+00:00NaN...0.02.05.04.06.0934.3911.8621758946.61646447157.9800320.0
29d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/admin/experimentsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-29 17:27:43.775000+00:00NaN...0.02.05.04.06.0934.3911.92638811466.20113848255.0709410.0
\n", + "

30 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " sessionId \\\n", + "0 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "1 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "2 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "3 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "4 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "5 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "6 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "7 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "8 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "9 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "10 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "11 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "12 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "13 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "14 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "15 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "16 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "17 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "18 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "19 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "20 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "21 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "22 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "23 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "24 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "25 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "26 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "27 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "28 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "29 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "\n", + " experimentId eventName \\\n", + "0 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 hover_over_title \n", + "1 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view \n", + "2 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 learn_more_about_item \n", + "3 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view \n", + "4 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view \n", + "5 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 learn_more_about_item \n", + "6 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view \n", + "7 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view 
\n", + "8 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 learn_more_about_item \n", + "9 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 add_item_to_cart \n", + "10 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view \n", + "11 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 checkout_start \n", + "12 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 hover_over_title \n", + "13 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 hover_over_paragraph \n", + "14 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view \n", + "15 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 learn_more_about_item \n", + "16 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view \n", + "17 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 hover_over_title \n", + "18 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view \n", + "19 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 learn_more_about_item \n", + "20 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 add_item_to_cart \n", + "21 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 checkout_start \n", + "22 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view \n", + "23 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 hover_over_title \n", + "24 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view \n", + "25 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view \n", + "26 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view \n", + "27 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view \n", + "28 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view \n", + "29 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view \n", + "\n", + " page \\\n", + "0 /hotel/products \n", + "1 /hotel/products/d018efc1-25e9-4284-b276-80386e... \n", + "2 /hotel/products/d018efc1-25e9-4284-b276-80386e... \n", + "3 /hotel/products \n", + "4 /hotel/products/2ddabbfc-4127-48fc-86dc-ebc4c6... \n", + "5 /hotel/products/2ddabbfc-4127-48fc-86dc-ebc4c6... \n", + "6 /hotel/products \n", + "7 /hotel/products/2cd7f756-fc65-4ba0-ab01-74521c... \n", + "8 /hotel/products/2cd7f756-fc65-4ba0-ab01-74521c... \n", + "9 /hotel/products/2cd7f756-fc65-4ba0-ab01-74521c... 
\n", + "10 /hotel/products \n", + "11 /cart \n", + "12 /hotel/products \n", + "13 /hotel/products \n", + "14 /hotel/products/7f71fbe2-343c-4a46-94ea-07cbd9... \n", + "15 /hotel/products/7f71fbe2-343c-4a46-94ea-07cbd9... \n", + "16 /hotel/products \n", + "17 /hotel/products \n", + "18 /hotel/products/7f71fbe2-343c-4a46-94ea-07cbd9... \n", + "19 /hotel/products/7f71fbe2-343c-4a46-94ea-07cbd9... \n", + "20 /hotel/products/7f71fbe2-343c-4a46-94ea-07cbd9... \n", + "21 /cart \n", + "22 / \n", + "23 /hotel/products \n", + "24 /hotel/products \n", + "25 /hotel/products \n", + "26 /hotel/products \n", + "27 /hotel/products \n", + "28 /hotel/products \n", + "29 /admin/experiments \n", + "\n", + " productId storeMode \\\n", + "0 d018efc1-25e9-4284-b276-80386e048b25 hotel \n", + "1 None hotel \n", + "2 d018efc1-25e9-4284-b276-80386e048b25 hotel \n", + "3 None hotel \n", + "4 None hotel \n", + "5 2ddabbfc-4127-48fc-86dc-ebc4c677efa2 hotel \n", + "6 None hotel \n", + "7 None hotel \n", + "8 2cd7f756-fc65-4ba0-ab01-74521c1fff43 hotel \n", + "9 2cd7f756-fc65-4ba0-ab01-74521c1fff43 hotel \n", + "10 None hotel \n", + "11 None hotel \n", + "12 7f71fbe2-343c-4a46-94ea-07cbd903a86c hotel \n", + "13 7f71fbe2-343c-4a46-94ea-07cbd903a86c hotel \n", + "14 None hotel \n", + "15 7f71fbe2-343c-4a46-94ea-07cbd903a86c hotel \n", + "16 None hotel \n", + "17 7f71fbe2-343c-4a46-94ea-07cbd903a86c hotel \n", + "18 None hotel \n", + "19 7f71fbe2-343c-4a46-94ea-07cbd903a86c hotel \n", + "20 7f71fbe2-343c-4a46-94ea-07cbd903a86c hotel \n", + "21 None hotel \n", + "22 None hotel \n", + "23 d018efc1-25e9-4284-b276-80386e048b25 hotel \n", + "24 None hotel \n", + "25 None hotel \n", + "26 None hotel \n", + "27 None hotel \n", + "28 None hotel \n", + "29 None hotel \n", + "\n", + " userAgent \\\n", + "0 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "1 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "2 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... 
\n", + "3 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "4 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "5 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "6 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "7 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "8 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "9 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "10 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "11 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "12 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "13 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "14 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "15 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "16 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "17 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "18 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "19 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "20 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "21 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "22 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "23 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "24 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "25 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "26 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "27 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "28 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "29 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... 
\n", + "\n", + " ts \\\n", + "0 2025-11-25 21:05:43.942000+00:00 \n", + "1 2025-11-25 21:05:45.407000+00:00 \n", + "2 2025-11-25 21:05:45.515000+00:00 \n", + "3 2025-11-25 21:05:50.176000+00:00 \n", + "4 2025-11-25 21:05:54.666000+00:00 \n", + "5 2025-11-25 21:05:54.794000+00:00 \n", + "6 2025-11-25 21:05:57.670000+00:00 \n", + "7 2025-11-25 21:06:03.130000+00:00 \n", + "8 2025-11-25 21:06:03.253000+00:00 \n", + "9 2025-11-25 21:06:06.815000+00:00 \n", + "10 2025-11-25 21:06:08.600000+00:00 \n", + "11 2025-11-25 21:06:11.586000+00:00 \n", + "12 2025-11-25 21:06:28.050000+00:00 \n", + "13 2025-11-25 21:06:31.806000+00:00 \n", + "14 2025-11-25 21:06:33.847000+00:00 \n", + "15 2025-11-25 21:06:34.029000+00:00 \n", + "16 2025-11-25 21:06:37.255000+00:00 \n", + "17 2025-11-25 21:06:43.694000+00:00 \n", + "18 2025-11-25 21:06:44.387000+00:00 \n", + "19 2025-11-25 21:06:44.492000+00:00 \n", + "20 2025-11-25 21:06:46.279000+00:00 \n", + "21 2025-11-25 21:06:48.275000+00:00 \n", + "22 2025-11-28 18:26:19.145000+00:00 \n", + "23 2025-11-28 18:27:27.527000+00:00 \n", + "24 2025-11-28 18:30:23.429000+00:00 \n", + "25 2025-11-28 18:34:12.526000+00:00 \n", + "26 2025-11-28 18:34:42.426000+00:00 \n", + "27 2025-11-28 18:39:39.397000+00:00 \n", + "28 2025-11-28 18:40:49.203000+00:00 \n", + "29 2025-11-29 17:27:43.775000+00:00 \n", + "\n", + " metadata_referrer metadata_elementText \\\n", + "0 NaN Junior Suite \n", + "1 http://localhost:3000/hotel/products?dateIndex... NaN \n", + "2 NaN NaN \n", + "3 NaN \n", + "4 http://localhost:3000/hotel/products?dateIndex... NaN \n", + "5 NaN NaN \n", + "6 NaN \n", + "7 http://localhost:3000/hotel/products?dateIndex... NaN \n", + "8 NaN NaN \n", + "9 NaN NaN \n", + "10 NaN \n", + "11 NaN NaN \n", + "12 NaN Executive Suite \n", + "13 NaN price \n", + "14 http://localhost:3000/hotel/products?dateIndex... NaN \n", + "15 NaN NaN \n", + "16 NaN \n", + "17 NaN Executive Suite \n", + "18 http://localhost:3000/hotel/products?dateIndex... 
NaN \n", + "19 NaN NaN \n", + "20 NaN NaN \n", + "21 NaN NaN \n", + "22 NaN \n", + "23 NaN Junior Suite \n", + "24 NaN \n", + "25 NaN \n", + "26 NaN \n", + "27 NaN \n", + "28 NaN \n", + "29 NaN \n", + "\n", + " ... searches cart_adds hovers unique_products_viewed \\\n", + "0 ... 0.0 0.0 1.0 1.0 \n", + "1 ... 0.0 0.0 1.0 1.0 \n", + "2 ... 0.0 0.0 1.0 1.0 \n", + "3 ... 0.0 0.0 1.0 1.0 \n", + "4 ... 0.0 0.0 1.0 1.0 \n", + "5 ... 0.0 0.0 1.0 2.0 \n", + "6 ... 0.0 0.0 1.0 2.0 \n", + "7 ... 0.0 0.0 1.0 2.0 \n", + "8 ... 0.0 0.0 1.0 3.0 \n", + "9 ... 0.0 1.0 1.0 3.0 \n", + "10 ... 0.0 1.0 1.0 3.0 \n", + "11 ... 0.0 1.0 1.0 3.0 \n", + "12 ... 0.0 1.0 2.0 4.0 \n", + "13 ... 0.0 1.0 3.0 4.0 \n", + "14 ... 0.0 1.0 3.0 4.0 \n", + "15 ... 0.0 1.0 3.0 4.0 \n", + "16 ... 0.0 1.0 3.0 4.0 \n", + "17 ... 0.0 1.0 4.0 4.0 \n", + "18 ... 0.0 1.0 4.0 4.0 \n", + "19 ... 0.0 1.0 4.0 4.0 \n", + "20 ... 0.0 2.0 4.0 4.0 \n", + "21 ... 0.0 2.0 4.0 4.0 \n", + "22 ... 0.0 2.0 4.0 4.0 \n", + "23 ... 0.0 2.0 5.0 4.0 \n", + "24 ... 0.0 2.0 5.0 4.0 \n", + "25 ... 0.0 2.0 5.0 4.0 \n", + "26 ... 0.0 2.0 5.0 4.0 \n", + "27 ... 0.0 2.0 5.0 4.0 \n", + "28 ... 0.0 2.0 5.0 4.0 \n", + "29 ... 
0.0 2.0 5.0 4.0 \n", + "\n", + " product_view_depth session_duration_sec interaction_velocity \\\n", + "0 1.0 0.000 0.000000 \n", + "1 1.0 1.465 81.911263 \n", + "2 2.0 1.573 114.431024 \n", + "3 2.0 6.234 38.498556 \n", + "4 2.0 10.724 27.974636 \n", + "5 2.0 10.852 33.173609 \n", + "6 2.0 13.728 30.594406 \n", + "7 2.0 19.188 25.015635 \n", + "8 2.0 19.311 27.963337 \n", + "9 2.0 22.873 26.231802 \n", + "10 2.0 24.658 26.766161 \n", + "11 2.0 27.644 26.045435 \n", + "12 2.0 44.108 17.683867 \n", + "13 2.0 47.864 17.549724 \n", + "14 2.0 49.905 18.034265 \n", + "15 3.0 50.087 19.166650 \n", + "16 3.0 53.313 19.132294 \n", + "17 4.0 59.752 18.074709 \n", + "18 4.0 60.445 18.860121 \n", + "19 5.0 60.550 19.818332 \n", + "20 6.0 62.337 20.212715 \n", + "21 6.0 64.333 20.518241 \n", + "22 6.0 64.333 21.450888 \n", + "23 6.0 132.715 10.850318 \n", + "24 6.0 308.617 4.860393 \n", + "25 6.0 537.714 2.901171 \n", + "26 6.0 567.614 2.854052 \n", + "27 6.0 864.585 1.943129 \n", + "28 6.0 934.391 1.862175 \n", + "29 6.0 934.391 1.926388 \n", + "\n", + " avg_time_between_events std_time_between_events cart_to_view_ratio \n", + "0 0.000000 0.000000 0.0 \n", + "1 1.465000 NaN 0.0 \n", + "2 0.786500 0.959544 0.0 \n", + "3 2.078000 2.337580 0.0 \n", + "4 2.681000 2.257718 0.0 \n", + "5 2.170400 2.264184 0.0 \n", + "6 2.288000 2.045532 0.0 \n", + "7 2.741143 2.219055 0.0 \n", + "8 2.413875 2.253349 0.0 \n", + "9 2.541444 2.142276 0.0 \n", + "10 2.465800 2.033873 0.0 \n", + "11 2.513091 1.935866 0.0 \n", + "12 3.675667 4.430110 0.0 \n", + "13 3.681846 4.241566 0.0 \n", + "14 3.564643 4.098693 0.0 \n", + "15 3.339133 4.045016 0.0 \n", + "16 3.332063 3.907959 0.0 \n", + "17 3.514824 3.858168 0.0 \n", + "18 3.358056 3.801607 0.0 \n", + "19 3.186842 3.769122 0.0 \n", + "20 3.116850 3.681924 0.0 \n", + "21 3.063476 3.597021 0.0 \n", + "22 11347.054682 53208.035246 0.0 \n", + "23 10856.677609 52037.867752 0.0 \n", + "24 10411.645292 50940.715176 0.0 \n", + "25 10004.343360 49909.724987 
0.0 \n", + "26 9620.710923 48940.453026 0.0 \n", + "27 9275.387222 48023.596355 0.0 \n", + "28 8946.616464 47157.980032 0.0 \n", + "29 11466.201138 48255.070941 0.0 \n", + "\n", + "[30 rows x 32 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdf92ad5-a15a-47d1-988e-f2d976f81416", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (PHANTOM)", + "language": "python", + "name": "phantom" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/experiments/notebooks/step_breakdown.ipynb b/experiments/notebooks/step_breakdown.ipynb new file mode 100644 index 0000000..a2dfa18 --- /dev/null +++ b/experiments/notebooks/step_breakdown.ipynb @@ -0,0 +1,2320 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "d6bc6a6d-2454-4222-a1ed-1b06bb7b95d1", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "project_root = \"/home/velocitatem/Documents/Projects/PHANTOM/experiments\"\n", + "if str(Path.cwd().parent if 'notebooks' in str(Path.cwd()) else Path.cwd()) not in sys.path:\n", + " sys.path.insert(0, str(project_root))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6926d0a1-02f2-47c1-b927-6b5bd28ae8cc", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a48f6ab9-46bf-4553-8489-f5ecb58f8d7c", + "metadata": {}, + "outputs": [], + "source": [ + "from procesing.steps import (\n", + " FetchInteractionsStep,\n", + " 
FetchPriceLogsStep,\n", + " FetchExperimentsStep,\n", + " JoinExperimentsStep,\n", + " CreatePriceBucketsStep,\n", + " AugmentEventNamesStep,\n", + " ChunkByTimeWindowStep,\n", + " ComputeDemandForChunksStep,\n", + " AggregatePriceLogsStep,\n", + " ComputeElasticityStep,\n", + " FitPricingFunctionStep,\n", + " PredictPricesStep,\n", + ")\n", + "from procesing.context import PipelineContext\n", + "from procesing.providers import SupabaseProvider, BackendAPIProvider" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bb7ebdfb-432e-4cf6-b0ee-dc030107ebc5", + "metadata": {}, + "outputs": [], + "source": [ + "class Provider(SupabaseProvider, BackendAPIProvider):\n", + " def __init__(self, backend_url: str):\n", + " SupabaseProvider.__init__(self)\n", + " BackendAPIProvider.__init__(self, backend_url=backend_url)\n", + "# example run\n", + "context = PipelineContext(\n", + " provider=Provider(backend_url=\"http://localhost:5000\"),\n", + " store_mode='hotel',\n", + " window_size='15min',\n", + "\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "587b1fdc-30f4-4ee0-b603-7a54b8bed5eb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sessionIdexperimentIdeventNamepageproductIdstoreModeuserAgenttsmetadata_referrermetadata_elementTextmetadata_dateIndexmetadata_dwellTimemetadata_typemetadata_roomTypemetadata_pricemetadata_nightsmetadata_totalmetadata_itemCountdateIndex
0d423ce8a-77aa-4c9a-94d4-d1adddcc3472Nonepage_view/NonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25T20:20:13.061ZNaNNaNNaNNaNNaNNaNNaNNaNNaN<NA>
1d423ce8a-77aa-4c9a-94d4-d1adddcc3472Nonehover_over_title/hotel/productsd018efc1-25e9-4284-b276-80386e048b25hotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25T20:21:17.425ZNaNJunior Suite1.01200.0NaNNaNNaNNaNNaNNaN1
2d423ce8a-77aa-4c9a-94d4-d1adddcc3472Nonehover_over_paragraph/hotel/productsd018efc1-25e9-4284-b276-80386e048b25hotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25T20:21:19.496ZNaNprice1.01202.0NaNNaNNaNNaNNaNNaN1
3d423ce8a-77aa-4c9a-94d4-d1adddcc3472Nonepage_view/hotel/products/d018efc1-25e9-4284-b276-80386e...NonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25T20:21:21.922Zhttp://localhost:3000/hotel/products?dateIndex...NaNNaNNaNNaNNaNNaNNaNNaNNaN<NA>
4d423ce8a-77aa-4c9a-94d4-d1adddcc3472Nonelearn_more_about_item/hotel/products/d018efc1-25e9-4284-b276-80386e...d018efc1-25e9-4284-b276-80386e048b25hotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25T20:21:22.674ZNaNNaN1.0NaNhotelJunior SuiteNaNNaNNaNNaN1
\n", + "
" + ], + "text/plain": [ + " sessionId experimentId eventName \\\n", + "0 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 None page_view \n", + "1 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 None hover_over_title \n", + "2 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 None hover_over_paragraph \n", + "3 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 None page_view \n", + "4 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 None learn_more_about_item \n", + "\n", + " page \\\n", + "0 / \n", + "1 /hotel/products \n", + "2 /hotel/products \n", + "3 /hotel/products/d018efc1-25e9-4284-b276-80386e... \n", + "4 /hotel/products/d018efc1-25e9-4284-b276-80386e... \n", + "\n", + " productId storeMode \\\n", + "0 None hotel \n", + "1 d018efc1-25e9-4284-b276-80386e048b25 hotel \n", + "2 d018efc1-25e9-4284-b276-80386e048b25 hotel \n", + "3 None hotel \n", + "4 d018efc1-25e9-4284-b276-80386e048b25 hotel \n", + "\n", + " userAgent \\\n", + "0 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "1 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "2 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "3 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "4 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "\n", + " ts \\\n", + "0 2025-11-25T20:20:13.061Z \n", + "1 2025-11-25T20:21:17.425Z \n", + "2 2025-11-25T20:21:19.496Z \n", + "3 2025-11-25T20:21:21.922Z \n", + "4 2025-11-25T20:21:22.674Z \n", + "\n", + " metadata_referrer metadata_elementText \\\n", + "0 NaN \n", + "1 NaN Junior Suite \n", + "2 NaN price \n", + "3 http://localhost:3000/hotel/products?dateIndex... 
NaN \n", + "4 NaN NaN \n", + "\n", + " metadata_dateIndex metadata_dwellTime metadata_type metadata_roomType \\\n", + "0 NaN NaN NaN NaN \n", + "1 1.0 1200.0 NaN NaN \n", + "2 1.0 1202.0 NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 1.0 NaN hotel Junior Suite \n", + "\n", + " metadata_price metadata_nights metadata_total metadata_itemCount \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " dateIndex \n", + "0 \n", + "1 1 \n", + "2 1 \n", + "3 \n", + "4 1 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df=FetchInteractionsStep(context).transform(None)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "45022876-30d9-4607-a10f-932df9b4dbda", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sessionIdexperimentIdeventNamepageproductIdstoreModeuserAgenttsmetadata_referrermetadata_elementTextmetadata_dateIndexmetadata_dwellTimemetadata_typemetadata_roomTypemetadata_pricemetadata_nightsmetadata_totalmetadata_itemCountdateIndexprice_bucket
0d423ce8a-77aa-4c9a-94d4-d1adddcc3472Nonepage_view/NonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25T20:20:13.061ZNaNNaNNaNNaNNaNNaNNaNNaNNaN<NA>
1d423ce8a-77aa-4c9a-94d4-d1adddcc3472Nonehover_over_title/hotel/productsd018efc1-25e9-4284-b276-80386e048b25hotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25T20:21:17.425ZNaNJunior Suite1.01200.0NaNNaNNaNNaNNaNNaN1
2d423ce8a-77aa-4c9a-94d4-d1adddcc3472Nonehover_over_paragraph/hotel/productsd018efc1-25e9-4284-b276-80386e048b25hotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25T20:21:19.496ZNaNprice1.01202.0NaNNaNNaNNaNNaNNaN1
3d423ce8a-77aa-4c9a-94d4-d1adddcc3472Nonepage_view/hotel/products/d018efc1-25e9-4284-b276-80386e...NonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25T20:21:21.922Zhttp://localhost:3000/hotel/products?dateIndex...NaNNaNNaNNaNNaNNaNNaNNaNNaN<NA>
4d423ce8a-77aa-4c9a-94d4-d1adddcc3472Nonelearn_more_about_item/hotel/products/d018efc1-25e9-4284-b276-80386e...d018efc1-25e9-4284-b276-80386e048b25hotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-11-25T20:21:22.674ZNaNNaN1.0NaNhotelJunior SuiteNaNNaNNaNNaN1
\n", + "
" + ], + "text/plain": [ + " sessionId experimentId eventName \\\n", + "0 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 None page_view \n", + "1 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 None hover_over_title \n", + "2 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 None hover_over_paragraph \n", + "3 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 None page_view \n", + "4 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 None learn_more_about_item \n", + "\n", + " page \\\n", + "0 / \n", + "1 /hotel/products \n", + "2 /hotel/products \n", + "3 /hotel/products/d018efc1-25e9-4284-b276-80386e... \n", + "4 /hotel/products/d018efc1-25e9-4284-b276-80386e... \n", + "\n", + " productId storeMode \\\n", + "0 None hotel \n", + "1 d018efc1-25e9-4284-b276-80386e048b25 hotel \n", + "2 d018efc1-25e9-4284-b276-80386e048b25 hotel \n", + "3 None hotel \n", + "4 d018efc1-25e9-4284-b276-80386e048b25 hotel \n", + "\n", + " userAgent \\\n", + "0 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "1 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "2 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "3 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "4 Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "\n", + " ts \\\n", + "0 2025-11-25T20:20:13.061Z \n", + "1 2025-11-25T20:21:17.425Z \n", + "2 2025-11-25T20:21:19.496Z \n", + "3 2025-11-25T20:21:21.922Z \n", + "4 2025-11-25T20:21:22.674Z \n", + "\n", + " metadata_referrer metadata_elementText \\\n", + "0 NaN \n", + "1 NaN Junior Suite \n", + "2 NaN price \n", + "3 http://localhost:3000/hotel/products?dateIndex... 
NaN \n", + "4 NaN NaN \n", + "\n", + " metadata_dateIndex metadata_dwellTime metadata_type metadata_roomType \\\n", + "0 NaN NaN NaN NaN \n", + "1 1.0 1200.0 NaN NaN \n", + "2 1.0 1202.0 NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 1.0 NaN hotel Junior Suite \n", + "\n", + " metadata_price metadata_nights metadata_total metadata_itemCount \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " dateIndex price_bucket \n", + "0 \n", + "1 1 \n", + "2 1 \n", + "3 \n", + "4 1 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = CreatePriceBucketsStep(context).transform(df)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "720b9631-4350-425a-ad29-ded745ce28f9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sessionIdexperimentIdeventNamepageproductIdstoreModeuserAgenttsmetadata_referrermetadata_elementText...metadata_dwellTimemetadata_typemetadata_roomTypemetadata_pricemetadata_nightsmetadata_totalmetadata_itemCountdateIndexprice_bucketmetadata_schema
78c404dbe5-116f-42c0-b199-503516dbbe91fd01774c-f629-4bcb-88b8-c818856af72apage_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...2025-11-29T17:32:45.064ZNaN...NaNNaNNaNNaNNaNNaNNaN<NA>
79c404dbe5-116f-42c0-b199-503516dbbe91fd01774c-f629-4bcb-88b8-c818856af72apage_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...2025-11-29T18:13:53.858ZNaN...NaNNaNNaNNaNNaNNaNNaN<NA>
80d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-12-04T11:13:15.884ZNaN...NaNNaNNaNNaNNaNNaNNaN<NA>
81d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-12-04T11:18:53.473ZNaN...NaNNaNNaNNaNNaNNaNNaN<NA>
82d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-12-04T11:19:05.094ZNaN...NaNNaNNaNNaNNaNNaNNaN<NA>
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " sessionId \\\n", + "78 c404dbe5-116f-42c0-b199-503516dbbe91 \n", + "79 c404dbe5-116f-42c0-b199-503516dbbe91 \n", + "80 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "81 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "82 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "\n", + " experimentId eventName page \\\n", + "78 fd01774c-f629-4bcb-88b8-c818856af72a page_view /hotel/products \n", + "79 fd01774c-f629-4bcb-88b8-c818856af72a page_view /hotel/products \n", + "80 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view /hotel/products \n", + "81 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view /hotel/products \n", + "82 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view /hotel/products \n", + "\n", + " productId storeMode userAgent \\\n", + "78 None hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", + "79 None hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", + "80 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "81 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "82 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "\n", + " ts metadata_referrer metadata_elementText ... \\\n", + "78 2025-11-29T17:32:45.064Z NaN ... \n", + "79 2025-11-29T18:13:53.858Z NaN ... \n", + "80 2025-12-04T11:13:15.884Z NaN ... \n", + "81 2025-12-04T11:18:53.473Z NaN ... \n", + "82 2025-12-04T11:19:05.094Z NaN ... 
\n", + "\n", + " metadata_dwellTime metadata_type metadata_roomType metadata_price \\\n", + "78 NaN NaN NaN NaN \n", + "79 NaN NaN NaN NaN \n", + "80 NaN NaN NaN NaN \n", + "81 NaN NaN NaN NaN \n", + "82 NaN NaN NaN NaN \n", + "\n", + " metadata_nights metadata_total metadata_itemCount dateIndex \\\n", + "78 NaN NaN NaN \n", + "79 NaN NaN NaN \n", + "80 NaN NaN NaN \n", + "81 NaN NaN NaN \n", + "82 NaN NaN NaN \n", + "\n", + " price_bucket metadata_schema \n", + "78 \n", + "79 \n", + "80 \n", + "81 \n", + "82 \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = AugmentEventNamesStep(context).transform(df)\n", + "df.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "9c6add1a-147a-4086-a437-3f47f17d69bb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
productIdpricesessionIdexperimentIdstoreModets
2132cd7f756-fc65-4ba0-ab01-74521c1fff43100.0d423ce8a-77aa-4c9a-94d4-d1adddcc3472Nonehotel2025-12-04T11:18:56.320Z
2142ddabbfc-4127-48fc-86dc-ebc4c677efa2100.0d423ce8a-77aa-4c9a-94d4-d1adddcc3472Nonehotel2025-12-04T11:19:05.434Z
2152cd7f756-fc65-4ba0-ab01-74521c1fff43100.0d423ce8a-77aa-4c9a-94d4-d1adddcc3472Nonehotel2025-12-04T11:19:05.338Z
2162cd7f756-fc65-4ba0-ab01-74521c1fff43100.0d423ce8a-77aa-4c9a-94d4-d1adddcc3472Nonehotel2025-12-04T11:19:05.597Z
2172ddabbfc-4127-48fc-86dc-ebc4c677efa2100.0d423ce8a-77aa-4c9a-94d4-d1adddcc3472Nonehotel2025-12-04T11:19:05.594Z
\n", + "
" + ], + "text/plain": [ + " productId price \\\n", + "213 2cd7f756-fc65-4ba0-ab01-74521c1fff43 100.0 \n", + "214 2ddabbfc-4127-48fc-86dc-ebc4c677efa2 100.0 \n", + "215 2cd7f756-fc65-4ba0-ab01-74521c1fff43 100.0 \n", + "216 2cd7f756-fc65-4ba0-ab01-74521c1fff43 100.0 \n", + "217 2ddabbfc-4127-48fc-86dc-ebc4c677efa2 100.0 \n", + "\n", + " sessionId experimentId storeMode \\\n", + "213 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 None hotel \n", + "214 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 None hotel \n", + "215 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 None hotel \n", + "216 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 None hotel \n", + "217 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 None hotel \n", + "\n", + " ts \n", + "213 2025-12-04T11:18:56.320Z \n", + "214 2025-12-04T11:19:05.434Z \n", + "215 2025-12-04T11:19:05.338Z \n", + "216 2025-12-04T11:19:05.597Z \n", + "217 2025-12-04T11:19:05.594Z " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "price_df=FetchPriceLogsStep(context).fit_transform(None)\n", + "price_df.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "28c3d101-8e82-46c3-aa06-765bd2799177", + "metadata": {}, + "outputs": [], + "source": [ + "df_chunks = ChunkByTimeWindowStep(context).transform(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "9f1e4005-a1ed-423e-ad55-cc00a18ac10b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "11" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df_chunks)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "f427318c-c4d3-448a-9b02-f5f71ef15f5c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2025-12-04 11:15:00+0000', tz='UTC')" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_chunks[-1]['window_start']" + ] + }, + { + 
"cell_type": "code", + "execution_count": 14, + "id": "fc87a9cc-30f0-460d-a13d-b61f7bf83fab", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2025-12-04 11:30:00+0000', tz='UTC')" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_chunks[-1]['window_end']" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "2d850541-47f5-4cd5-8777-ca526645a39b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sessionIdexperimentIdeventNamepageproductIdstoreModeuserAgenttsmetadata_referrermetadata_elementText...metadata_dwellTimemetadata_typemetadata_roomTypemetadata_pricemetadata_nightsmetadata_totalmetadata_itemCountdateIndexprice_bucketmetadata_schema
81d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-12-04 11:18:53.473000+00:00NaN...NaNNaNNaNNaNNaNNaNNaN<NA>
82d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-12-04 11:19:05.094000+00:00NaN...NaNNaNNaNNaNNaNNaNNaN<NA>
\n", + "

2 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " sessionId \\\n", + "81 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "82 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "\n", + " experimentId eventName page \\\n", + "81 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view /hotel/products \n", + "82 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view /hotel/products \n", + "\n", + " productId storeMode userAgent \\\n", + "81 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "82 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "\n", + " ts metadata_referrer metadata_elementText \\\n", + "81 2025-12-04 11:18:53.473000+00:00 NaN \n", + "82 2025-12-04 11:19:05.094000+00:00 NaN \n", + "\n", + " ... metadata_dwellTime metadata_type metadata_roomType metadata_price \\\n", + "81 ... NaN NaN NaN NaN \n", + "82 ... NaN NaN NaN NaN \n", + "\n", + " metadata_nights metadata_total metadata_itemCount dateIndex \\\n", + "81 NaN NaN NaN \n", + "82 NaN NaN NaN \n", + "\n", + " price_bucket metadata_schema \n", + "81 \n", + "82 \n", + "\n", + "[2 rows x 21 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_chunks[-1]['data'].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "eda09da8-324b-4af9-971c-d366d8b870d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "11" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "demand = ComputeDemandForChunksStep(context).transform(df_chunks)\n", + "len(demand)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "bd2d40ce-6f51-42a8-8849-cc1a4479f4d9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
productIddemand_score
0bec37f41-7756-47ae-9219-f5854290f4e70
15e666c06-023a-415b-9976-be0956bbc4050
2d018efc1-25e9-4284-b276-80386e048b250
32cd7f756-fc65-4ba0-ab01-74521c1fff430
451266ddb-5b07-47b7-89ee-5b5cae94bb110
.........
790d1c9a3a-bc37-4417-a59f-de4b994944cb0
80fc64bd74-4dfa-4f78-802a-39d6aa4c39fe0
81d85d4c52-baa0-435f-81ac-b0c27a5251b30
8293bc00e5-8cfe-42af-8322-49bc274076880
8318cc01db-55cc-42a5-aab5-e3ec448548d80
\n", + "

84 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " productId demand_score\n", + "0 bec37f41-7756-47ae-9219-f5854290f4e7 0\n", + "1 5e666c06-023a-415b-9976-be0956bbc405 0\n", + "2 d018efc1-25e9-4284-b276-80386e048b25 0\n", + "3 2cd7f756-fc65-4ba0-ab01-74521c1fff43 0\n", + "4 51266ddb-5b07-47b7-89ee-5b5cae94bb11 0\n", + ".. ... ...\n", + "79 0d1c9a3a-bc37-4417-a59f-de4b994944cb 0\n", + "80 fc64bd74-4dfa-4f78-802a-39d6aa4c39fe 0\n", + "81 d85d4c52-baa0-435f-81ac-b0c27a5251b3 0\n", + "82 93bc00e5-8cfe-42af-8322-49bc27407688 0\n", + "83 18cc01db-55cc-42a5-aab5-e3ec448548d8 0\n", + "\n", + "[84 rows x 2 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "demand[-1]['demand_vector']" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "20293a35-c0a8-416e-bfb9-178883f4ca5f", + "metadata": {}, + "outputs": [], + "source": [ + "price_df_agg = AggregatePriceLogsStep(context).transform(price_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "bf378587-f5ef-431b-a1c2-663124b5e42b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
productIdprice
02cd7f756-fc65-4ba0-ab01-74521c1fff43100.00
12ddabbfc-4127-48fc-86dc-ebc4c677efa2100.00
251266ddb-5b07-47b7-89ee-5b5cae94bb11100.00
3d018efc1-25e9-4284-b276-80386e048b25100.00
4aaae8177-0803-4421-8702-f3ffeeeadcd9389.04
57f71fbe2-343c-4a46-94ea-07cbd903a86c327.94
6d6affcb8-6616-47f8-af14-2ec8583f0781391.43
70fbcf915-ecf1-4ec3-9b00-8bbc314e2a81900.97
8eceedfb3-ec52-4453-9aab-88dd9a6b6ca3640.54
\n", + "
" + ], + "text/plain": [ + " productId price\n", + "0 2cd7f756-fc65-4ba0-ab01-74521c1fff43 100.00\n", + "1 2ddabbfc-4127-48fc-86dc-ebc4c677efa2 100.00\n", + "2 51266ddb-5b07-47b7-89ee-5b5cae94bb11 100.00\n", + "3 d018efc1-25e9-4284-b276-80386e048b25 100.00\n", + "4 aaae8177-0803-4421-8702-f3ffeeeadcd9 389.04\n", + "5 7f71fbe2-343c-4a46-94ea-07cbd903a86c 327.94\n", + "6 d6affcb8-6616-47f8-af14-2ec8583f0781 391.43\n", + "7 0fbcf915-ecf1-4ec3-9b00-8bbc314e2a81 900.97\n", + "8 eceedfb3-ec52-4453-9aab-88dd9a6b6ca3 640.54" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "price_df_agg[-1]['price_vector']" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "d05f003a-9537-49f6-a814-118e80cd8748", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
productIdelasticitystd_errorn_obs
0d018efc1-25e9-4284-b276-80386e048b25-0.2220540.44710211
12cd7f756-fc65-4ba0-ab01-74521c1fff43-0.0728570.51013011
251266ddb-5b07-47b7-89ee-5b5cae94bb11-0.2910830.34687911
32ddabbfc-4127-48fc-86dc-ebc4c677efa2-10.2239680.00000011
47f71fbe2-343c-4a46-94ea-07cbd903a86c0.0000000.0000009
...............
790d1c9a3a-bc37-4417-a59f-de4b994944cb0.0000000.0000000
80fc64bd74-4dfa-4f78-802a-39d6aa4c39fe0.0000000.0000000
81d85d4c52-baa0-435f-81ac-b0c27a5251b30.0000000.0000000
8293bc00e5-8cfe-42af-8322-49bc274076880.0000000.0000000
8318cc01db-55cc-42a5-aab5-e3ec448548d80.0000000.0000000
\n", + "

84 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " productId elasticity std_error n_obs\n", + "0 d018efc1-25e9-4284-b276-80386e048b25 -0.222054 0.447102 11\n", + "1 2cd7f756-fc65-4ba0-ab01-74521c1fff43 -0.072857 0.510130 11\n", + "2 51266ddb-5b07-47b7-89ee-5b5cae94bb11 -0.291083 0.346879 11\n", + "3 2ddabbfc-4127-48fc-86dc-ebc4c677efa2 -10.223968 0.000000 11\n", + "4 7f71fbe2-343c-4a46-94ea-07cbd903a86c 0.000000 0.000000 9\n", + ".. ... ... ... ...\n", + "79 0d1c9a3a-bc37-4417-a59f-de4b994944cb 0.000000 0.000000 0\n", + "80 fc64bd74-4dfa-4f78-802a-39d6aa4c39fe 0.000000 0.000000 0\n", + "81 d85d4c52-baa0-435f-81ac-b0c27a5251b3 0.000000 0.000000 0\n", + "82 93bc00e5-8cfe-42af-8322-49bc27407688 0.000000 0.000000 0\n", + "83 18cc01db-55cc-42a5-aab5-e3ec448548d8 0.000000 0.000000 0\n", + "\n", + "[84 rows x 4 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "elasticity = ComputeElasticityStep(context).transform((demand, price_df_agg))\n", + "elasticity" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "926cd9ea-8f6b-43b5-95a5-8fdf5309e1bd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
elasticitystd_errorn_obs
productId
d018efc1-25e9-4284-b276-80386e048b25-0.2220540.44710211
2cd7f756-fc65-4ba0-ab01-74521c1fff43-0.0728570.51013011
51266ddb-5b07-47b7-89ee-5b5cae94bb11-0.2910830.34687911
2ddabbfc-4127-48fc-86dc-ebc4c677efa2-10.2239680.00000011
7f71fbe2-343c-4a46-94ea-07cbd903a86c0.0000000.0000009
............
0d1c9a3a-bc37-4417-a59f-de4b994944cb0.0000000.0000000
fc64bd74-4dfa-4f78-802a-39d6aa4c39fe0.0000000.0000000
d85d4c52-baa0-435f-81ac-b0c27a5251b30.0000000.0000000
93bc00e5-8cfe-42af-8322-49bc274076880.0000000.0000000
18cc01db-55cc-42a5-aab5-e3ec448548d80.0000000.0000000
\n", + "

84 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " elasticity std_error n_obs\n", + "productId \n", + "d018efc1-25e9-4284-b276-80386e048b25 -0.222054 0.447102 11\n", + "2cd7f756-fc65-4ba0-ab01-74521c1fff43 -0.072857 0.510130 11\n", + "51266ddb-5b07-47b7-89ee-5b5cae94bb11 -0.291083 0.346879 11\n", + "2ddabbfc-4127-48fc-86dc-ebc4c677efa2 -10.223968 0.000000 11\n", + "7f71fbe2-343c-4a46-94ea-07cbd903a86c 0.000000 0.000000 9\n", + "... ... ... ...\n", + "0d1c9a3a-bc37-4417-a59f-de4b994944cb 0.000000 0.000000 0\n", + "fc64bd74-4dfa-4f78-802a-39d6aa4c39fe 0.000000 0.000000 0\n", + "d85d4c52-baa0-435f-81ac-b0c27a5251b3 0.000000 0.000000 0\n", + "93bc00e5-8cfe-42af-8322-49bc27407688 0.000000 0.000000 0\n", + "18cc01db-55cc-42a5-aab5-e3ec448548d8 0.000000 0.000000 0\n", + "\n", + "[84 rows x 3 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "elasticity.set_index('productId')" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "bfa8a023-24da-4b3a-bafb-b174e54bf3b0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 84 entries, 0 to 83\n", + "Data columns (total 4 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 productId 84 non-null object \n", + " 1 elasticity 84 non-null float64\n", + " 2 std_error 84 non-null float64\n", + " 3 n_obs 84 non-null int64 \n", + "dtypes: float64(2), int64(1), object(1)\n", + "memory usage: 2.8+ KB\n" + ] + } + ], + "source": [ + "elasticity.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "5a8823da-da7a-41e4-8166-399538c80a73", + "metadata": {}, + "outputs": [], + "source": [ + "df['productId'] = df['productId'].astype(str)\n", + "elasticity['productId'] = elasticity['productId'].astype(str)\n", + "dff=df.join(elasticity.set_index('productId'), how=\"left\", on=\"productId\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": 24, + "id": "e5d1639b-768b-4217-8821-ee09bb3e60c2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sessionIdexperimentIdeventNamepageproductIdstoreModeuserAgenttsmetadata_referrermetadata_elementText...metadata_pricemetadata_nightsmetadata_totalmetadata_itemCountdateIndexprice_bucketmetadata_schemaelasticitystd_errorn_obs
78c404dbe5-116f-42c0-b199-503516dbbe91fd01774c-f629-4bcb-88b8-c818856af72apage_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...2025-11-29T17:32:45.064ZNaN...NaNNaNNaNNaN<NA>NaNNaNNaN
79c404dbe5-116f-42c0-b199-503516dbbe91fd01774c-f629-4bcb-88b8-c818856af72apage_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...2025-11-29T18:13:53.858ZNaN...NaNNaNNaNNaN<NA>NaNNaNNaN
80d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-12-04T11:13:15.884ZNaN...NaNNaNNaNNaN<NA>NaNNaNNaN
81d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-12-04T11:18:53.473ZNaN...NaNNaNNaNNaN<NA>NaNNaNNaN
82d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-12-04T11:19:05.094ZNaN...NaNNaNNaNNaN<NA>NaNNaNNaN
\n", + "

5 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + " sessionId \\\n", + "78 c404dbe5-116f-42c0-b199-503516dbbe91 \n", + "79 c404dbe5-116f-42c0-b199-503516dbbe91 \n", + "80 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "81 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "82 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "\n", + " experimentId eventName page \\\n", + "78 fd01774c-f629-4bcb-88b8-c818856af72a page_view /hotel/products \n", + "79 fd01774c-f629-4bcb-88b8-c818856af72a page_view /hotel/products \n", + "80 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view /hotel/products \n", + "81 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view /hotel/products \n", + "82 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view /hotel/products \n", + "\n", + " productId storeMode userAgent \\\n", + "78 None hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", + "79 None hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", + "80 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "81 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "82 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "\n", + " ts metadata_referrer metadata_elementText ... \\\n", + "78 2025-11-29T17:32:45.064Z NaN ... \n", + "79 2025-11-29T18:13:53.858Z NaN ... \n", + "80 2025-12-04T11:13:15.884Z NaN ... \n", + "81 2025-12-04T11:18:53.473Z NaN ... \n", + "82 2025-12-04T11:19:05.094Z NaN ... 
\n", + "\n", + " metadata_price metadata_nights metadata_total metadata_itemCount \\\n", + "78 NaN NaN NaN NaN \n", + "79 NaN NaN NaN NaN \n", + "80 NaN NaN NaN NaN \n", + "81 NaN NaN NaN NaN \n", + "82 NaN NaN NaN NaN \n", + "\n", + " dateIndex price_bucket metadata_schema elasticity std_error n_obs \n", + "78 NaN NaN NaN \n", + "79 NaN NaN NaN \n", + "80 NaN NaN NaN \n", + "81 NaN NaN NaN \n", + "82 NaN NaN NaN \n", + "\n", + "[5 rows x 24 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dff.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "38d4c603-35ed-4de4-847a-48e74044d6d3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idsubject_namexp_human_onlyxp_market_modexp_task_idtask
053aefd07-f66a-4d7f-ba8b-7ea1fc562d35DanielFalsehotel517b8078-cf4c-4a1f-b943-75281c69a5b3{'task_name': 'Cheapest Room', 'task_def_of_do...
1d10f5ab3-a7b7-4e97-8d94-ab06f1537c0aFull AgentFalsehotel517b8078-cf4c-4a1f-b943-75281c69a5b3{'task_name': 'Cheapest Room', 'task_def_of_do...
2fd01774c-f629-4bcb-88b8-c818856af72aDaniel 1Truehotel920c3deb-18c6-4586-bbc4-4ce4d1ae6f2d{'task_name': 'Cheapest Room w/ View', 'task_d...
\n", + "
" + ], + "text/plain": [ + " id subject_name xp_human_only \\\n", + "0 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 Daniel False \n", + "1 d10f5ab3-a7b7-4e97-8d94-ab06f1537c0a Full Agent False \n", + "2 fd01774c-f629-4bcb-88b8-c818856af72a Daniel 1 True \n", + "\n", + " xp_market_mode xp_task_id \\\n", + "0 hotel 517b8078-cf4c-4a1f-b943-75281c69a5b3 \n", + "1 hotel 517b8078-cf4c-4a1f-b943-75281c69a5b3 \n", + "2 hotel 920c3deb-18c6-4586-bbc4-4ce4d1ae6f2d \n", + "\n", + " task \n", + "0 {'task_name': 'Cheapest Room', 'task_def_of_do... \n", + "1 {'task_name': 'Cheapest Room', 'task_def_of_do... \n", + "2 {'task_name': 'Cheapest Room w/ View', 'task_d... " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "experiments = FetchExperimentsStep(context).transform(dff)\n", + "experiments.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fd9f831-8a9e-40c2-89a5-de81fb4f77f3", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "92f354ee-6550-48c6-87fd-46b0217e57ed", + "metadata": {}, + "outputs": [], + "source": [ + "dff_exp = JoinExperimentsStep(context).transform((dff,experiments))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "d2823849-6ff6-46d2-abc8-9e363b0f66dc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sessionIdexperimentIdeventNamepageproductIdstoreModeuserAgenttsmetadata_referrermetadata_elementText...elasticitystd_errorn_obsexp_subjectexp_human_onlyexp_market_modeexp_task_idtask_nametask_def_of_donetask_description
76c404dbe5-116f-42c0-b199-503516dbbe91fd01774c-f629-4bcb-88b8-c818856af72apage_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...2025-11-29T17:32:45.064ZNaN...NaNNaNNaNDaniel 1Truehotel920c3deb-18c6-4586-bbc4-4ce4d1ae6f2dCheapest Room w/ ViewUser added to cart a the cheapest room of all ...Find the cheapest room with a nice view in the...
77c404dbe5-116f-42c0-b199-503516dbbe91fd01774c-f629-4bcb-88b8-c818856af72apage_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...2025-11-29T18:13:53.858ZNaN...NaNNaNNaNDaniel 1Truehotel920c3deb-18c6-4586-bbc4-4ce4d1ae6f2dCheapest Room w/ ViewUser added to cart a the cheapest room of all ...Find the cheapest room with a nice view in the...
78d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-12-04T11:13:15.884ZNaN...NaNNaNNaNDanielFalsehotel517b8078-cf4c-4a1f-b943-75281c69a5b3Cheapest RoomA room was added and purchased.Find the cheapest hotel room in multiple steps...
79d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-12-04T11:18:53.473ZNaN...NaNNaNNaNDanielFalsehotel517b8078-cf4c-4a1f-b943-75281c69a5b3Cheapest RoomA room was added and purchased.Find the cheapest hotel room in multiple steps...
80d423ce8a-77aa-4c9a-94d4-d1adddcc347253aefd07-f66a-4d7f-ba8b-7ea1fc562d35page_view/hotel/productsNonehotelMozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck...2025-12-04T11:19:05.094ZNaN...NaNNaNNaNDanielFalsehotel517b8078-cf4c-4a1f-b943-75281c69a5b3Cheapest RoomA room was added and purchased.Find the cheapest hotel room in multiple steps...
\n", + "

5 rows × 31 columns

\n", + "
" + ], + "text/plain": [ + " sessionId \\\n", + "76 c404dbe5-116f-42c0-b199-503516dbbe91 \n", + "77 c404dbe5-116f-42c0-b199-503516dbbe91 \n", + "78 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "79 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "80 d423ce8a-77aa-4c9a-94d4-d1adddcc3472 \n", + "\n", + " experimentId eventName page \\\n", + "76 fd01774c-f629-4bcb-88b8-c818856af72a page_view /hotel/products \n", + "77 fd01774c-f629-4bcb-88b8-c818856af72a page_view /hotel/products \n", + "78 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view /hotel/products \n", + "79 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view /hotel/products \n", + "80 53aefd07-f66a-4d7f-ba8b-7ea1fc562d35 page_view /hotel/products \n", + "\n", + " productId storeMode userAgent \\\n", + "76 None hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", + "77 None hotel Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53... \n", + "78 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "79 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "80 None hotel Mozilla/5.0 (X11; Linux x86_64; rv:145.0) Geck... \n", + "\n", + " ts metadata_referrer metadata_elementText ... \\\n", + "76 2025-11-29T17:32:45.064Z NaN ... \n", + "77 2025-11-29T18:13:53.858Z NaN ... \n", + "78 2025-12-04T11:13:15.884Z NaN ... \n", + "79 2025-12-04T11:18:53.473Z NaN ... \n", + "80 2025-12-04T11:19:05.094Z NaN ... 
\n", + "\n", + " elasticity std_error n_obs exp_subject exp_human_only exp_market_mode \\\n", + "76 NaN NaN NaN Daniel 1 True hotel \n", + "77 NaN NaN NaN Daniel 1 True hotel \n", + "78 NaN NaN NaN Daniel False hotel \n", + "79 NaN NaN NaN Daniel False hotel \n", + "80 NaN NaN NaN Daniel False hotel \n", + "\n", + " exp_task_id task_name \\\n", + "76 920c3deb-18c6-4586-bbc4-4ce4d1ae6f2d Cheapest Room w/ View \n", + "77 920c3deb-18c6-4586-bbc4-4ce4d1ae6f2d Cheapest Room w/ View \n", + "78 517b8078-cf4c-4a1f-b943-75281c69a5b3 Cheapest Room \n", + "79 517b8078-cf4c-4a1f-b943-75281c69a5b3 Cheapest Room \n", + "80 517b8078-cf4c-4a1f-b943-75281c69a5b3 Cheapest Room \n", + "\n", + " task_def_of_done \\\n", + "76 User added to cart a the cheapest room of all ... \n", + "77 User added to cart a the cheapest room of all ... \n", + "78 A room was added and purchased. \n", + "79 A room was added and purchased. \n", + "80 A room was added and purchased. \n", + "\n", + " task_description \n", + "76 Find the cheapest room with a nice view in the... \n", + "77 Find the cheapest room with a nice view in the... \n", + "78 Find the cheapest hotel room in multiple steps... \n", + "79 Find the cheapest hotel room in multiple steps... \n", + "80 Find the cheapest hotel room in multiple steps... 
\n", + "\n", + "[5 rows x 31 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dff_exp.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb61bb99-3597-4473-86e2-c90cc51a9c9a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (PHANTOM)", + "language": "python", + "name": "phantom" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/experiments/procesing/tests/test_session.py b/experiments/procesing/tests/test_session.py new file mode 100644 index 0000000..bb45d87 --- /dev/null +++ b/experiments/procesing/tests/test_session.py @@ -0,0 +1,165 @@ +import pytest +import pandas as pd +import numpy as np +from procesing.steps.session import ( + TemporalFeatureStep, + BehavioralFeatureStep, + ProductFeatureStep, + UserAgentFeatureStep, + ExtractSessionFeaturesStep, + JoinLabelsStep, + ValidateDataStep, +) + + +# TemporalFeatureStep tests +def test_temporal_empty(pipeline_context): + result = TemporalFeatureStep(pipeline_context).transform(pd.DataFrame()) + assert 'sessionId' in result.columns + assert result.empty + + +def test_temporal_basic(pipeline_context, session_interactions): + result = TemporalFeatureStep(pipeline_context).transform(session_interactions) + assert 'session_duration_sec' in result.columns + assert 'interaction_velocity' in result.columns + assert 'max_velocity_5min' in result.columns + assert result['total_interactions'].sum() == len(session_interactions) + + +def test_temporal_timeout(pipeline_context): + df = pd.DataFrame({ + 'sessionId': ['s1', 's1'], + 'ts': ['2025-01-01T10:00:00Z', '2025-01-01T11:00:00Z'], # 1 
hour gap + }) + result = TemporalFeatureStep(pipeline_context, timeout_sec=900).transform(df) + assert result.iloc[0]['session_duration_sec'] == 0 # gap exceeds timeout + + +# BehavioralFeatureStep tests +def test_behavioral_empty(pipeline_context): + result = BehavioralFeatureStep(pipeline_context).transform(pd.DataFrame()) + assert 'sessionId' in result.columns + + +def test_behavioral_counts(pipeline_context, session_interactions): + result = BehavioralFeatureStep(pipeline_context).transform(session_interactions) + assert 'page_views' in result.columns + assert 'item_views' in result.columns + assert 'hover_events' in result.columns + assert result['total_events'].sum() == len(session_interactions) + + +def test_behavioral_hover_prefix(pipeline_context): + df = pd.DataFrame({ + 'sessionId': ['s1', 's1'], + 'eventName': ['hover_over_custom', 'hover_over_button'], + 'page': ['/products', '/products'], + }) + result = BehavioralFeatureStep(pipeline_context).transform(df) + assert result.iloc[0]['hover_events'] == 2 + + +# ProductFeatureStep tests +def test_product_empty(pipeline_context): + result = ProductFeatureStep(pipeline_context).transform(pd.DataFrame()) + assert 'sessionId' in result.columns + + +def test_product_features(pipeline_context, session_interactions): + result = ProductFeatureStep(pipeline_context).transform(session_interactions) + assert 'unique_products_viewed' in result.columns + assert 'price_range' in result.columns + assert result['unique_products_viewed'].sum() > 0 + + +# UserAgentFeatureStep tests +def test_ua_empty(pipeline_context): + result = UserAgentFeatureStep(pipeline_context).transform(pd.DataFrame()) + assert 'sessionId' in result.columns + + +def test_ua_headless_detection(pipeline_context): + df = pd.DataFrame({ + 'sessionId': ['s1', 's2'], + 'userAgent': ['Mozilla/5.0 Chrome/120', 'HeadlessChrome/120'], + }) + result = UserAgentFeatureStep(pipeline_context).transform(df) + assert 'is_headless' in result.columns + headless = 
dict(zip(result['sessionId'], result['is_headless'])) + assert headless['s1'] == False + assert headless['s2'] == True + + +def test_ua_browser_family(pipeline_context): + df = pd.DataFrame({ + 'sessionId': ['s1', 's2', 's3'], + 'userAgent': ['Mozilla/5.0 Firefox/120', 'Safari/605.1.15', 'Unknown'], + }) + result = UserAgentFeatureStep(pipeline_context).transform(df) + browsers = dict(zip(result['sessionId'], result['browser_family'])) + assert browsers['s1'] == 'Firefox' + assert browsers['s2'] == 'Safari' + assert browsers['s3'] == 'Other' + + +def test_ua_automation_detection(pipeline_context): + df = pd.DataFrame({ + 'sessionId': ['s1', 's2'], + 'userAgent': ['Selenium WebDriver', 'Normal Chrome/120'], + }) + result = UserAgentFeatureStep(pipeline_context).transform(df) + auto = dict(zip(result['sessionId'], result['is_automation'])) + assert auto['s1'] == True + assert auto['s2'] == False + + +# ExtractSessionFeaturesStep tests +def test_extract_empty(pipeline_context): + result = ExtractSessionFeaturesStep(pipeline_context).transform(pd.DataFrame()) + assert result.empty + + +def test_extract_merges_all(pipeline_context, session_interactions): + result = ExtractSessionFeaturesStep(pipeline_context).transform(session_interactions) + expected = ['session_duration_sec', 'total_events', 'unique_products_viewed', 'is_headless'] + for col in expected: + assert col in result.columns + assert 'experimentId' in result.columns + + +# JoinLabelsStep tests +def test_join_labels_tuple_input(pipeline_context): + features = pd.DataFrame({'sessionId': ['s1'], 'experimentId': ['exp1'], 'total_events': [5]}) + experiments = pd.DataFrame({'id': ['exp1'], 'xp_human_only': [True]}) + result = JoinLabelsStep(pipeline_context).transform((features, experiments)) + assert 'is_agent' in result.columns + assert result.iloc[0]['is_agent'] == False + + +def test_join_labels_empty_experiments(pipeline_context): + features = pd.DataFrame({'sessionId': ['s1'], 'experimentId': ['exp1']}) + 
result = JoinLabelsStep(pipeline_context).transform((features, pd.DataFrame())) + assert pd.isna(result.iloc[0]['is_agent']) + + +# ValidateDataStep tests +def test_validate_empty(pipeline_context): + ValidateDataStep(pipeline_context).transform(pd.DataFrame()) + report = pipeline_context.get_cached('validation_report') + assert report['status'] == 'empty' + + +def test_validate_missing_cols(pipeline_context): + df = pd.DataFrame({'sessionId': ['s1'], 'ts': ['2025-01-01']}) + ValidateDataStep(pipeline_context).transform(df) + report = pipeline_context.get_cached('validation_report') + assert report['status'] == 'invalid' + assert 'eventName' in report['missing_cols'] + + +def test_validate_valid(pipeline_context, session_interactions): + ValidateDataStep(pipeline_context).transform(session_interactions) + report = pipeline_context.get_cached('validation_report') + assert report['status'] == 'valid' + assert report['sessions'] > 0 diff --git a/lib/separability.py b/lib/separability.py new file mode 100644 index 0000000..a93ddeb --- /dev/null +++ b/lib/separability.py @@ -0,0 +1,128 @@ +"""Utilities for loading separability artifacts and scoring interaction sessions.""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Iterable, List, Sequence + +import joblib +import numpy as np + +from experiments.ml.arch import featurize_trajectory + + +DEFAULT_ARTIFACT_DIR = Path("data/separability") + + +@dataclass +class SeparabilityArtifacts: + scaler: object + classifier: object + states: List[str] + event_transitions: Dict[str, Dict[str, float]] + feature_dim: int + + +def _normalize_events(raw_events: Sequence[object]) -> List[object]: + events: List[object] = [] + for evt in raw_events: + if hasattr(evt, "value") and hasattr(evt.value, "payload"): + events.append(evt.value.payload) + else: + events.append(evt) + events.sort(key=lambda e: getattr(e, "ts", "")) + return events + + +def 
_event_transition_distribution(events: Sequence[object]) -> Dict[str, Dict[str, float]]: + counts: Dict[str, Dict[str, int]] = {} + for src_evt, dst_evt in zip(events, events[1:]): + src_name = getattr(src_evt, "eventName", "unknown") + dst_name = getattr(dst_evt, "eventName", "unknown") + counts.setdefault(src_name, {}) + counts[src_name][dst_name] = counts[src_name].get(dst_name, 0) + 1 + + distribution: Dict[str, Dict[str, float]] = {} + for src, dsts in counts.items(): + total = float(sum(dsts.values())) + distribution[src] = {dst: val / total for dst, val in dsts.items()} if total else {} + return distribution + + +def _kl_divergence(p: Dict[str, Dict[str, float]], q: Dict[str, Dict[str, float]]) -> float: + eps = 1e-10 + total = 0.0 + for src, dsts in p.items(): + for dst, prob in dsts.items(): + ref = q.get(src, {}).get(dst, 0.0) + total += (prob + eps) * np.log((prob + eps) / (ref + eps)) + return float(total) + + +def load_artifacts(artifact_dir: Path | str = DEFAULT_ARTIFACT_DIR) -> SeparabilityArtifacts: + artifact_dir = Path(artifact_dir) + scaler_path = artifact_dir / "scaler.joblib" + model_path = artifact_dir / "classifier.joblib" + metadata_path = artifact_dir / "metadata.json" + + if not (scaler_path.exists() and model_path.exists() and metadata_path.exists()): + raise FileNotFoundError( + f"Separability artifacts not found in {artifact_dir}. Run sim.strong_learner.train first." 
+ ) + + scaler = joblib.load(scaler_path) + classifier = joblib.load(model_path) + with open(metadata_path, "r", encoding="utf-8") as fin: + metadata = json.load(fin) + + return SeparabilityArtifacts( + scaler=scaler, + classifier=classifier, + states=list(metadata["reference_states"]), + event_transitions=metadata["event_transitions"], + feature_dim=int(metadata["feature_dim"]), + ) + + +def score_session( + raw_events: Sequence[object], + artifacts: SeparabilityArtifacts, +) -> dict: + events = _normalize_events(raw_events) + if not events: + return {"prob_agent": 0.0, "delta_h": 0.0, "delta_a": 0.0} + + reference_mdp = {"states": artifacts.states} + features = featurize_trajectory(events, mdp=reference_mdp, input_dim=artifacts.feature_dim) + scaled = artifacts.scaler.transform(features.reshape(1, -1)) + prob_agent = float(artifacts.classifier.predict_proba(scaled)[0, 1]) + + session_dist = _event_transition_distribution(events) + delta_h = _kl_divergence(session_dist, artifacts.event_transitions.get("human", {})) + delta_a = _kl_divergence(session_dist, artifacts.event_transitions.get("agent", {})) + + return { + "prob_agent": prob_agent, + "delta_h": delta_h, + "delta_a": delta_a, + } + + +def estimate_alpha(prob_agent: float, delta_h: float, delta_a: float, temperature: float = 1.0) -> float: + divergence_mass = delta_h + delta_a + if divergence_mass <= 1e-8: + return float(prob_agent) + + ratio = delta_a / divergence_mass + blended = 0.5 * prob_agent + 0.5 * ratio + if temperature <= 0: + return float(np.clip(blended, 0.0, 1.0)) + + scaled = 1.0 / (1.0 + np.exp(-temperature * (blended - 0.5))) + return float(np.clip(scaled, 0.0, 1.0)) + + +def score_sessions(raw_sessions: Iterable[Sequence[object]], artifacts: SeparabilityArtifacts) -> List[dict]: + return [score_session(events, artifacts) for events in raw_sessions] diff --git a/paper/src/chapters/slacberger.tex b/paper/src/chapters/slacberger.tex new file mode 100644 index 0000000..7728c91 --- /dev/null +++ 
b/paper/src/chapters/slacberger.tex @@ -0,0 +1,69 @@ + +\section{Problem Formulation: A Stackelberg Game Approach} +\label{sec:math_formulation} + +We formalize the interaction between the dynamic pricing system and non-human actors as a \textit{Stackelberg Game} (Leader-Follower) with incomplete information. This framework captures the hierarchical nature of the problem: the Platform (Leader) sets a pricing policy, and the Actors (Followers)---both Humans and Agents---observe these prices and react strategically. + +\subsection{The Players and Objectives} + +Let $t \in \{1, \dots, T\}$ denote discrete time steps. At each step, the system interactions are defined by the following entities: + +\paragraph{1. The Leader (The Platform)} +The e-commerce platform acts as the leader, choosing a pricing policy $\pi$ to maximize total expected revenue. At time $t$, given a state $s_t \in \mathcal{S}$ (representing inventory, time of day, and historical interactions), the platform sets a price $p_t \in [p_{\min}, p_{\max}]$. + +The platform's goal is to maximize the cumulative revenue from genuine human transactions while mitigating the distortion caused by agent interactions. + +\paragraph{2. The Followers (The Demand Mixture)} +The observed demand is not a monolithic signal but a mixture of two distinct populations with divergent objective functions. Let $u$ denote an incoming actor. The type of the actor $\theta \in \{H, A\}$ is a latent variable, where $H$ denotes a Human and $A$ denotes an Agent. + +\begin{itemize} + \item \textbf{The Human ($H$):} Acts as a \textit{myopic utility maximizer}. A human $i$ has a private valuation $v_i$ for the product. They execute a purchase decision $d_i \in \{0, 1\}$ based on the consumer surplus: + \begin{equation} + d_i(p_t) = \mathbb{I}(v_i - p_t \geq 0) + \end{equation} + where $\mathbb{I}(\cdot)$ is the indicator function. The aggregate human demand $q_H(p_t)$ follows a standard downward-sloping demand curve $D(p_t)$. 
+ + \item \textbf{The Agent ($A$):} Acts as an \textit{information maximizer} (reconnaissance). The agent does not intend to purchase at the displayed price $p_t$ unless an arbitrage condition is met. Instead, the agent generates interaction events (queries) to estimate the platform's pricing function $f(p)$. The agent's reward function $R_A$ is defined by Information Gain: + \begin{equation} + R_A(p_t) = H(\mathcal{P}) - H(\mathcal{P} \mid p_t) - c_{query} + \end{equation} + where $H(\mathcal{P})$ is the entropy of the agent's belief regarding the price distribution, and $c_{query}$ is the marginal cost of interaction (assumed $\approx 0$ for LLMs). +\end{itemize} + +\subsection{The Demand Contamination Model} + +% MAYBE alpha has to be \lambda which we also need to formally define still + +The core difficulty in this setting is that the platform observes only the aggregate interaction volume $\hat{q}_t$, which is a contaminated signal. Let $\alpha_t \in [0, 1]$ represent the proportion of traffic generated by agents at time $t$. The observed signal is: + +\begin{equation} + \hat{q}_t(p_t) = (1 - \alpha_t) \cdot q_H(p_t) + \alpha_t \cdot q_A(p_t) + \epsilon_t +\end{equation} + +where: +\begin{itemize} + \item $q_H(p_t)$ is the \textit{true signal} (conversion intent). + \item $q_A(p_t)$ is the \textit{adversarial noise} (reconnaissance queries). + \item $\epsilon_t$ is random market noise. +\end{itemize} + +Crucially, $q_A(p_t)$ is often inversely correlated with $q_H(p_t)$ in terms of utility; agents may flood the system with queries during high-volatility periods to map price boundaries, artificially inflating $\hat{q}_t$ without converting. + +\subsection{The Optimization Objective: Robust Revenue} + +Standard dynamic pricing algorithms (e.g., Thompson Sampling or UCB) assume $\alpha_t = 0$, estimating demand $\hat{D}(p) \approx \mathbb{E}[\hat{q} | p]$. 
In the presence of agents ($\alpha_t > 0$), this estimator becomes biased, leading to the \textit{Cost of Information} (COI) defined in Section 3.2. + +We propose a robust optimization objective. The platform seeks a pricing policy $\pi^*$ that maximizes worst-case revenue over a statistically plausible set $\mathcal{A} \subseteq [0, 1]$ of contamination rates $\alpha$: + +\begin{equation} + \pi^* = \argmax_{\pi} \sum_{t=1}^T \mathbb{E}_{s_t} \left[ \min_{\alpha \in \mathcal{A}} \left( p_t \cdot \hat{q}_t(p_t \mid \theta = H) \right) - \lambda \cdot \mathcal{L}_{detect}(\hat{q}_t) \right] +\end{equation} + +Here: +\begin{itemize} + \item The first term, $p_t \cdot \hat{q}_t(p_t \mid \theta = H)$, represents the revenue generated strictly from the estimated human segment; the inner minimum evaluates it under the least favorable contamination rate in $\mathcal{A}$. + \item $\mathcal{L}_{detect}$ is a penalty term for failing to separate distributions (the cost of confusion). + \item $\lambda$ is a hyperparameter balancing revenue exploitation vs. robust detection. +\end{itemize} + +This formulation effectively transforms the pricing problem into a \textit{Distributionally Robust Optimization (DRO)} problem, where the learner must guard against adversarial perturbations (Agent traffic) in the observed demand distribution. 
diff --git a/paper/src/graphics/gcp.png b/paper/src/graphics/gcp.png new file mode 100644 index 0000000..12e95ae Binary files /dev/null and b/paper/src/graphics/gcp.png differ diff --git a/paper/src/graphics/gcp.webp b/paper/src/graphics/gcp.webp new file mode 100644 index 0000000..c51bd7e Binary files /dev/null and b/paper/src/graphics/gcp.webp differ diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..70b1f11 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,12 @@ +[build-system] +requires = ["setuptools>=45", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "phantom" +version = "0.1.0" +description = "Pricing Heuristics Against Non-human Transaction Orchestration Mechanisms" +requires-python = ">=3.8" + +[tool.setuptools.packages.find] +include = ["experiments*", "lib*"] diff --git a/scripts/tpu_pod_run.sh b/scripts/tpu_pod_run.sh new file mode 100755 index 0000000..8e1d722 --- /dev/null +++ b/scripts/tpu_pod_run.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env sh +# Executed on each TPU pod worker via `gcloud tpu-vm scp` + `gcloud tpu-vm ssh --worker=all`. +# Authenticates with Artifact Registry using the VM's service account metadata token, +# pulls the TPU trainer image, then runs the W&B sweep agent inside Docker. +# TPU chip devices (/dev/accel*) are exposed via --privileged + /dev volume mount. 
+# Required env vars: WANDB_API_KEY, SWEEP_ID +# Optional: AGENT_COUNT (default 1, 0 = run until sweep ends) +set -eu + +IMAGE="us-central1-docker.pkg.dev/phantom-trc/phantom/phantom-trainer:tpu-latest" +AGENT_COUNT="${AGENT_COUNT:-1}" + +# use VM service account — no manual key needed on the pod +TOKEN=$(curl -sf -H "Metadata-Flavor: Google" \ + "http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/token" \ + | python3 -c 'import sys, json; print(json.load(sys.stdin)["access_token"])') + +echo "$TOKEN" | sudo docker login -u oauth2accesstoken \ + --password-stdin https://us-central1-docker.pkg.dev + +sudo docker pull "$IMAGE" + +# --privileged + /dev mount gives the container access to /dev/accel* (TPU chips) +# --network host lets JAX reach the other pod workers for distributed init +sudo docker run --rm \ + --privileged \ + --network host \ + --volume /dev:/dev \ + -e WANDB_API_KEY="$WANDB_API_KEY" \ + -e SWEEP_ID="$SWEEP_ID" \ + -e AGENT_COUNT="$AGENT_COUNT" \ + "$IMAGE" diff --git a/scripts/tpu_sync_repo.sh b/scripts/tpu_sync_repo.sh new file mode 100644 index 0000000..a26e241 --- /dev/null +++ b/scripts/tpu_sync_repo.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env sh +set -eu + +TPU_NAME="${TPU_NAME:?TPU_NAME is required}" +TPU_ZONE="${TPU_ZONE:-us-central2-b}" +TPU_PROJECT="${TPU_PROJECT:-phantom-trc}" +LOCAL_REPO_DIR="${LOCAL_REPO_DIR:-$(pwd)}" +REMOTE_REPO_DIR="${REMOTE_REPO_DIR:-/tmp/PHANTOM}" +ARCHIVE_PATH="${ARCHIVE_PATH:-/tmp/phantom-sync.tgz}" + +FILE_LIST="$(mktemp /tmp/phantom-sync-files.XXXXXX)" +CLEANUP_LIST=true + +cleanup() { + if [ "$CLEANUP_LIST" = "true" ]; then + rm -f "$FILE_LIST" + fi +} +trap cleanup EXIT + +if [ ! 
-d "$LOCAL_REPO_DIR" ]; then + echo "local repo directory not found: $LOCAL_REPO_DIR" + exit 1 +fi + +if git -C "$LOCAL_REPO_DIR" rev-parse --is-inside-work-tree >/dev/null 2>&1; then + git -C "$LOCAL_REPO_DIR" ls-files -co --exclude-standard > "$FILE_LIST" + python3 - "$FILE_LIST" <<'PY' +import sys +from pathlib import Path + +file_list = Path(sys.argv[1]) +skip_prefixes = ( + "wandb/", + ".venv/", + "venv/", + "node_modules/", + ".next/", + ".turbo/", + "__pycache__/", + ".mypy_cache/", + ".pytest_cache/", + ".ruff_cache/", + "paper/build/", + "tests/e2e/test-results/", +) + +rows = file_list.read_text().splitlines() +kept = [ + row + for row in rows + if row and not any(row == p.rstrip("/") or row.startswith(p) for p in skip_prefixes) +] +file_list.write_text("\n".join(kept) + ("\n" if kept else "")) +PY + tar -czf "$ARCHIVE_PATH" -C "$LOCAL_REPO_DIR" -T "$FILE_LIST" +else + tar \ + --exclude-vcs \ + --exclude=".venv" --exclude="*/.venv" \ + --exclude="venv" --exclude="*/venv" \ + --exclude="node_modules" --exclude="*/node_modules" \ + --exclude=".next" --exclude="*/.next" \ + --exclude=".turbo" --exclude="*/.turbo" \ + --exclude="__pycache__" --exclude="*/__pycache__" \ + --exclude=".mypy_cache" --exclude="*/.mypy_cache" \ + --exclude=".pytest_cache" --exclude="*/.pytest_cache" \ + --exclude=".ruff_cache" --exclude="*/.ruff_cache" \ + --exclude="wandb" --exclude="*/wandb" \ + --exclude="paper/build" \ + --exclude="tests/e2e/test-results" \ + -czf "$ARCHIVE_PATH" \ + -C "$LOCAL_REPO_DIR" . 
+fi + +gcloud compute tpus tpu-vm scp "$ARCHIVE_PATH" "$TPU_NAME:/tmp/phantom-sync.tgz" \ + --zone="$TPU_ZONE" --project="$TPU_PROJECT" --worker=all + +gcloud compute tpus tpu-vm ssh "$TPU_NAME" \ + --zone="$TPU_ZONE" --project="$TPU_PROJECT" --worker=all \ + --command="rm -rf '$REMOTE_REPO_DIR' && mkdir -p '$REMOTE_REPO_DIR' && tar -xzf /tmp/phantom-sync.tgz -C '$REMOTE_REPO_DIR' && rm -f /tmp/phantom-sync.tgz" + +rm -f "$ARCHIVE_PATH" diff --git a/scripts/tpu_vm_sweep_agent.py b/scripts/tpu_vm_sweep_agent.py new file mode 100644 index 0000000..f0d99b6 --- /dev/null +++ b/scripts/tpu_vm_sweep_agent.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +import os +import re +import shlex +import subprocess +import time +from pathlib import Path + +import wandb + + +CLI_MAP: dict[str, str] = { + "algo": "--algo", + "total_timesteps": "--total-timesteps", + "alpha": "--alpha", + "N": "--N", + "n_products": "--n-products", + "lambda_coi": "--lambda-coi", + "info_value": "--info-value", + "robust_radius": "--robust-radius", + "robust_points": "--robust-points", + "learning_rate": "--learning-rate", + "gamma": "--gamma", + "gae_lambda": "--gae-lambda", + "clip_range": "--clip-range", + "ent_coef": "--ent-coef", + "revenue_weight": "--revenue-weight", + "max_steps": "--max-steps", + "margin_floor": "--margin-floor", + "margin_floor_patience": "--margin-floor-patience", + "arch": "--arch", + "activation": "--activation", + "jax_num_envs": "--jax-num-envs", + "jax_num_steps": "--jax-num-steps", + "jax_num_minibatches": "--jax-num-minibatches", + "jax_update_epochs": "--jax-update-epochs", + "jax_anneal_lr": "--jax-anneal-lr", + "checkpoint_interval": "--checkpoint-interval", + "action_levels": "--action-levels", + "action_scale_low": "--action-scale-low", + "action_scale_high": "--action-scale-high", +} + + +def _to_cli_args(cfg: dict) -> str: + parts: list[str] = ["--jax", "--no-wandb"] + for key, flag in 
CLI_MAP.items(): + if key not in cfg: + continue + value = cfg[key] + if value is None: + continue + if isinstance(value, bool): + if key == "jax_anneal_lr": + parts.extend([flag, "true" if value else "false"]) + elif value: + parts.append(flag) + continue + parts.extend([flag, str(value)]) + return " ".join(shlex.quote(p) for p in parts) + + +_SENTINEL = "PHANTOM_METRICS:" + + +def _extract_metrics(output: str) -> dict: + # fast path: look for the dedicated sentinel line emitted by run_local + for line in output.splitlines(): + if line.startswith(_SENTINEL): + try: + return json.loads(line[len(_SENTINEL) :]) + except Exception: + break + # fallback: scan for any JSON block containing eval/sweep keys; + # use greedy match to capture the largest possible block first + for block in re.findall(r"\{[^{}]*\}", output): + try: + obj = json.loads(block) + except Exception: + continue + if isinstance(obj, dict) and ("sweep/score" in obj or "eval/reward" in obj): + return obj + return {} + + +def main() -> None: + p = argparse.ArgumentParser( + description="Run W&B sweep where each trial uses full TPU pod" + ) + p.add_argument("--sweep-id", required=True) + p.add_argument("--tpu-name", required=True) + p.add_argument("--tpu-zone", default="us-central2-b") + p.add_argument("--tpu-project", default="phantom-trc") + p.add_argument("--tpu-repo-dir", default="/tmp/PHANTOM") + p.add_argument("--count", type=int, default=0) + p.add_argument("--workdir", default=str(Path(__file__).resolve().parents[1])) + args = p.parse_args() + + workdir = Path(args.workdir).resolve() + env = os.environ.copy() + + prepare_cmd = [ + "make", + "train.tpu.vm.prepare", + f"TPU_NAME={args.tpu_name}", + f"TPU_ZONE={args.tpu_zone}", + f"TPU_PROJECT={args.tpu_project}", + f"TPU_REPO_DIR={args.tpu_repo_dir}", + ] + prepare = subprocess.run( + prepare_cmd, + cwd=workdir, + env=env, + text=True, + capture_output=False, + check=False, + ) + if prepare.returncode != 0: + raise RuntimeError("Failed to prepare 
def main() -> None:
    """Prepare the TPU workers once, then serve W&B sweep trials on them.

    Command-line arguments select the sweep and the TPU target. After a
    single ``make train.tpu.vm.prepare``, ``wandb.agent`` calls ``run_trial``
    for each trial. ``--count 0`` (the default) means no trial limit.

    Raises:
        RuntimeError: If the prepare step exits with a non-zero status.
    """
    import sys  # function-scope import: only needed to route captured stderr

    p = argparse.ArgumentParser(
        description="Run W&B sweep where each trial uses full TPU pod"
    )
    p.add_argument("--sweep-id", required=True)
    p.add_argument("--tpu-name", required=True)
    p.add_argument("--tpu-zone", default="us-central2-b")
    p.add_argument("--tpu-project", default="phantom-trc")
    p.add_argument("--tpu-repo-dir", default="/tmp/PHANTOM")
    p.add_argument("--count", type=int, default=0)
    p.add_argument("--workdir", default=str(Path(__file__).resolve().parents[1]))
    args = p.parse_args()

    workdir = Path(args.workdir).resolve()
    env = os.environ.copy()

    prepare_cmd = [
        "make",
        "train.tpu.vm.prepare",
        f"TPU_NAME={args.tpu_name}",
        f"TPU_ZONE={args.tpu_zone}",
        f"TPU_PROJECT={args.tpu_project}",
        f"TPU_REPO_DIR={args.tpu_repo_dir}",
    ]
    # Output is not captured here so the prepare logs stream to the terminal.
    prepare = subprocess.run(
        prepare_cmd,
        cwd=workdir,
        env=env,
        text=True,
        capture_output=False,
        check=False,
    )
    if prepare.returncode != 0:
        raise RuntimeError("Failed to prepare TPU workers for sweep")

    def run_trial() -> None:
        """Run one sweep trial via ``make train.tpu.vm.run`` and log its metrics."""
        run = None
        # Assume failure until the trial completes, so that any exception
        # closes the W&B run as failed instead of successful.
        exit_code = 1
        try:
            run = wandb.init()
            cfg = dict(wandb.config)
            cli_args = _to_cli_args(cfg)
            env_trial = dict(env)
            # The make target receives the trainer flags through this variable.
            env_trial["LOCAL_TRAIN_ARGS"] = cli_args

            cmd = [
                "make",
                "train.tpu.vm.run",
                f"TPU_NAME={args.tpu_name}",
                f"TPU_ZONE={args.tpu_zone}",
                f"TPU_PROJECT={args.tpu_project}",
                f"TPU_REPO_DIR={args.tpu_repo_dir}",
            ]

            # stdout must be captured: the metrics are parsed out of it below.
            proc = subprocess.run(
                cmd,
                cwd=workdir,
                env=env_trial,
                text=True,
                capture_output=True,
                check=False,
            )

            if proc.stdout:
                print(proc.stdout)
            if proc.stderr:
                # Keep the child's stderr on our stderr rather than mixing it
                # into stdout.
                print(proc.stderr, file=sys.stderr)

            if proc.returncode != 0:
                exit_code = proc.returncode
                if run is not None:
                    run.summary["runner/exit_code"] = proc.returncode
                raise RuntimeError(f"TPU trial failed with exit code {proc.returncode}")

            metrics = _extract_metrics(proc.stdout)
            if metrics:
                wandb.log(metrics)
                for k, v in metrics.items():
                    run.summary[k] = v
            run.summary["runner/exit_code"] = 0
            exit_code = 0
        except Exception:
            # NOTE(review): the short pause presumably throttles back-to-back
            # failing trials - confirm intent.
            time.sleep(2)
            raise
        finally:
            if run is not None and wandb.run is not None:
                wandb.finish(exit_code=exit_code)

    wandb.agent(
        args.sweep_id,
        function=run_trial,
        count=args.count if args.count > 0 else None,
    )


if __name__ == "__main__":
    main()
#!/usr/bin/env sh
# Install the JAX training dependencies if they are missing, then replace this
# shell with `python -m engine.train`.
#
# Environment overrides (defaults below):
#   REPO_DIR                   repository checkout to run from
#   PYTHON_BIN                 interpreter used for pip and training
#   TRAIN_ARGS                 space-separated arguments for engine.train
#   EXTRA_PIP                  space-separated packages installed when the JAX stack is missing
#   INSTALL_FULL_REQUIREMENTS  "1" to also install requirements.txt
#   WANDB_API_KEY              when set, wandb is ensured and training runs with it enabled
set -eu

REPO_DIR="${REPO_DIR:-$HOME/PHANTOM}"
PYTHON_BIN="${PYTHON_BIN:-python3}"
TRAIN_ARGS="${TRAIN_ARGS:---algo ppo --jax --total-timesteps 200000 --jax-num-envs 32 --jax-num-steps 128 --jax-num-minibatches 4 --jax-update-epochs 4}"
EXTRA_PIP="${EXTRA_PIP:-flax optax distrax}"
INSTALL_FULL_REQUIREMENTS="${INSTALL_FULL_REQUIREMENTS:-0}"

if [ ! -d "$REPO_DIR" ]; then
  echo "repo directory not found: $REPO_DIR" >&2
  exit 1
fi

cd "$REPO_DIR"

# NOTE(review): presumably clears stale local run data (or a directory that
# would shadow the installed wandb package) - confirm intent.
if [ -d "wandb" ]; then
  rm -rf wandb
fi

# keep install idempotent and avoid re-installing jax/libtpu each run
if [ "$INSTALL_FULL_REQUIREMENTS" = "1" ] && [ -f "requirements.txt" ]; then
  $PYTHON_BIN -m pip install -r requirements.txt
fi
if ! $PYTHON_BIN -c 'import flax, optax, distrax' >/dev/null 2>&1; then
  if [ -f "engine/jax/requirements.txt" ]; then
    $PYTHON_BIN -m pip install -r engine/jax/requirements.txt
  fi
  # EXTRA_PIP is intentionally unquoted so it splits into separate packages.
  $PYTHON_BIN -m pip install -U $EXTRA_PIP
fi

if [ -n "${WANDB_API_KEY:-}" ]; then
  # Install wandb only when the importable module lacks a callable init().
  if ! $PYTHON_BIN -c 'import wandb; assert hasattr(wandb, "init") and callable(wandb.init)' >/dev/null 2>&1; then
    $PYTHON_BIN -m pip install -U wandb
  fi
  export WANDB_API_KEY
  # TRAIN_ARGS is intentionally unquoted so it splits into separate arguments.
  exec $PYTHON_BIN -m engine.train $TRAIN_ARGS
fi

exec $PYTHON_BIN -m engine.train $TRAIN_ARGS --no-wandb
#!/usr/bin/env bash
# Bootstrap a W&B sweep worker: clone or refresh the repository, build a
# virtualenv, and run `python -m engine.train --sweep-agent` in a retry loop.
#
# Required environment: WANDB_API_KEY, GITHUB_TOKEN, REPO_URL, SWEEP_ID.
# Optional environment: BRANCH, WORKDIR, AGENT_COUNT, AGENT_LOOP,
# RETRY_SECONDS, PYTHON_BIN, WANDB_ENTITY, WANDB_PROJECT.
set -euo pipefail

# Abort unless the named variable is set and non-empty.
need_env() {
  local name="$1"
  if [ -z "${!name:-}" ]; then
    echo "$name is required" >&2
    exit 1
  fi
}

# Abort unless the named command is on PATH.
need_cmd() {
  local c="$1"
  command -v "$c" >/dev/null 2>&1 || {
    echo "Missing command: $c" >&2
    exit 1
  }
}

need_cmd git
need_cmd python3

need_env WANDB_API_KEY
need_env GITHUB_TOKEN
need_env REPO_URL
need_env SWEEP_ID

BRANCH="${BRANCH:-main}"
WORKDIR="${WORKDIR:-$HOME/PHANTOM-agent}"
AGENT_COUNT="${AGENT_COUNT:-0}"
AGENT_LOOP="${AGENT_LOOP:-1}"
RETRY_SECONDS="${RETRY_SECONDS:-20}"
PYTHON_BIN="${PYTHON_BIN:-python3}"

mkdir -p "$(dirname "$WORKDIR")"

# Temporary askpass helper: answers git's credential prompts from the
# environment so the token never appears in the remote URL.
ASKPASS_FILE="$(mktemp)"
cat >"$ASKPASS_FILE" <<'EOF'
#!/usr/bin/env sh
case "$1" in
  *Username*) echo "x-access-token" ;;
  *Password*) echo "$GITHUB_TOKEN" ;;
  *) echo "" ;;
esac
EOF
chmod 700 "$ASKPASS_FILE"

cleanup() {
  rm -f "$ASKPASS_FILE"
}
trap cleanup EXIT

# Run git with prompts disabled and credentials supplied by the askpass helper.
# GITHUB_TOKEN is passed explicitly because the helper runs as a child process
# and would see an empty token if the variable was set but never exported.
git_auth() {
  GITHUB_TOKEN="$GITHUB_TOKEN" GIT_TERMINAL_PROMPT=0 GIT_ASKPASS="$ASKPASS_FILE" git "$@"
}

# Clone WORKDIR if it is not a git checkout yet; otherwise hard-reset it to
# origin/BRANCH (local changes are discarded).
sync_repo() {
  if [ ! -d "$WORKDIR/.git" ]; then
    rm -rf "$WORKDIR"
    git_auth clone --single-branch --branch "$BRANCH" "$REPO_URL" "$WORKDIR"
    return
  fi

  git -C "$WORKDIR" remote set-url origin "$REPO_URL"
  git_auth -C "$WORKDIR" fetch origin "$BRANCH" --prune
  git -C "$WORKDIR" checkout -B "$BRANCH" "origin/$BRANCH"
  git -C "$WORKDIR" reset --hard "origin/$BRANCH"
}

# Create (or reuse) the virtualenv inside WORKDIR and install requirements.
install_deps() {
  "$PYTHON_BIN" -m venv "$WORKDIR/.venv"
  "$WORKDIR/.venv/bin/pip" install --upgrade pip
  "$WORKDIR/.venv/bin/pip" install -r "$WORKDIR/requirements.txt"
}

# Run the sweep agent from WORKDIR; --count is passed only when AGENT_COUNT != 0.
run_agent() {
  local cmd=("$WORKDIR/.venv/bin/python" -m engine.train --sweep-agent --sweep-id "$SWEEP_ID")
  if [ "$AGENT_COUNT" != "0" ]; then
    cmd+=(--count "$AGENT_COUNT")
  fi

  (
    cd "$WORKDIR"
    WANDB_API_KEY="$WANDB_API_KEY" \
    WANDB_ENTITY="${WANDB_ENTITY:-}" \
    WANDB_PROJECT="${WANDB_PROJECT:-}" \
    "${cmd[@]}"
  )
}

while true; do
  sync_repo
  install_deps

  if run_agent; then
    # A clean exit restarts the agent only in unlimited looping mode.
    if [ "$AGENT_LOOP" = "1" ] && [ "$AGENT_COUNT" = "0" ]; then
      sleep "$RETRY_SECONDS"
      continue
    fi
    exit 0
  fi

  # The agent failed: give up unless looping is enabled.
  if [ "$AGENT_LOOP" != "1" ]; then
    exit 1
  fi

  sleep "$RETRY_SECONDS"
done
def event_frequency_distribution(mdp):
    """Return the share of transition counts leaving each event type.

    State keys are ``'|'``-separated strings whose third field is the event
    type. For every state listed in ``mdp['transitions']`` the outgoing counts
    in ``mdp['trans_counts'][state]`` are added to that state's event type.

    Returns:
        ``{event: fraction}``, or ``{}`` when the total count is zero.
    """
    evt_cnt, total = defaultdict(int), 0
    for s in mdp['transitions']:
        evt = s.split('|')[2]
        for cnt in mdp['trans_counts'][s].values():
            evt_cnt[evt] += cnt
            total += cnt
    return {evt: cnt/total for evt, cnt in evt_cnt.items()} if total > 0 else {}


def transition_distribution(mdp):
    """Return the share of counts for each ``"src->dst"`` event-type pair.

    Source and destination event types are the third ``'|'``-separated field
    of the state keys found in ``mdp['trans_counts']``.

    Returns:
        ``{"src->dst": fraction}``, or ``{}`` when the total count is zero.
    """
    trans_cnt, total = defaultdict(int), 0
    for s, trans in mdp['trans_counts'].items():
        src = s.split('|')[2]
        for s_next, cnt in trans.items():
            dst = s_next.split('|')[2]
            trans_cnt[f"{src}->{dst}"] += cnt
            total += cnt
    return {t: cnt/total for t, cnt in trans_cnt.items()} if total > 0 else {}


def kl_color(kl):
    """Map a KL value to a hex colour: above 2.0, above 0.5, or otherwise."""
    return '#d62828' if kl > 2.0 else '#f77f00' if kl > 0.5 else '#2a9d8f'


def plot_comparison(ax, human_vals, agent_vals, labels, title, ylabel, kl_val=None):
    """Draw paired Human/Agent bars for ``labels`` on ``ax`` and return ``ax``.

    Args:
        ax: Axes-like object to draw on.
        human_vals: Bar heights for the "Human" series, one per label.
        agent_vals: Bar heights for the "Agent" series, one per label.
        labels: Tick labels; more than 10 labels selects smaller fonts.
        title: Plot title.
        ylabel: Y-axis label.
        kl_val: Optional KL value appended to the title as ``KL=<value>``.
            Only ``None`` suppresses it, so a divergence of exactly 0.0 is
            still shown.
    """
    x, w = np.arange(len(labels)), 0.35
    ax.bar(x - w/2, human_vals, w, label='Human', alpha=0.8, color='#2E86AB')
    ax.bar(x + w/2, agent_vals, w, label='Agent', alpha=0.8, color='#A23B72')
    ax.set_ylabel(ylabel, fontsize=9 if len(labels) > 10 else 11, fontweight='bold')
    ax.set_title(title if kl_val is None else f"{title}\nKL={kl_val:.4f}",
                 fontsize=10 if len(labels) > 10 else 12, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=45, ha='right', fontsize=8)
    ax.legend(fontsize=8)
    ax.grid(axis='y', alpha=0.3, linestyle='--')
    return ax
if __name__ == "__main__":
    # Script entry point: build the human and agent MDPs, then write three
    # figures (per-event comparison, KL summary, global distributions) to the
    # current working directory.
    import os  # script-scope import: only needed for the directory override

    # The original absolute path stays the default; PHANTOM_EXPERIMENTS_DIR
    # lets the script run from other checkouts or machines.
    base_dir = os.environ.get(
        "PHANTOM_EXPERIMENTS_DIR",
        "/home/velocitatem/Documents/Projects/PHANTOM/experiments",
    )
    human_dir, agent_dir = f"{base_dir}/collected_data/", f"{base_dir}/agents/collected_data/"

    human_model, agent_model = BehaviorModel(human_dir), AgentBehaviorModel(agent_dir)
    human_mdp, agent_mdp = human_model.build_MDP(), agent_model.build_MDP()

    # Per-event KL, restricted to events present in both models, sorted
    # from highest to lowest divergence.
    human_evt, agent_evt = aggregate_event_transitions(human_mdp), aggregate_event_transitions(agent_mdp)
    common = set(human_evt.keys()) & set(agent_evt.keys())
    kl_results = sorted([(e, kl_divergence(human_evt[e], agent_evt[e])) for e in common],
                        key=lambda x: x[1], reverse=True)

    # Figure 1: one subplot per common event, two columns.
    fig = plt.figure(figsize=(16, 10))
    n_rows, n_cols = (len(kl_results) + 1) // 2, 2

    for idx, (evt, kl) in enumerate(kl_results):
        ax = plt.subplot(n_rows, n_cols, idx + 1)
        h_dist, a_dist = human_evt.get(evt, {}), agent_evt.get(evt, {})
        dests = sorted(set(h_dist.keys()) | set(a_dist.keys()))
        if not dests:
            continue

        h_probs, a_probs = [h_dist.get(d, 0) for d in dests], [a_dist.get(d, 0) for d in dests]
        plot_comparison(ax, h_probs, a_probs, dests, f'From: {evt}', 'Probability')
        # Leave 10% headroom but never collapse the axis below 0.1.
        ax.set_ylim([0, max(max(h_probs + a_probs, default=0) * 1.1, 0.1)])
        ax.text(0.95, 0.95, f'KL={kl:.2f}', transform=ax.transAxes, fontsize=11,
                fontweight='bold', va='top', ha='right',
                bbox=dict(boxstyle='round', facecolor=kl_color(kl), alpha=0.3))

    plt.tight_layout()
    plt.savefig('kl_divergence_comparison.png', dpi=300, bbox_inches='tight')
    print("Saved visualization to kl_divergence_comparison.png")

    # Figure 2: horizontal bar summary of the per-event KL values.
    fig2, ax2 = plt.subplots(figsize=(10, 6))
    evts, kls = zip(*kl_results) if kl_results else ([], [])
    colors = [kl_color(kl) for kl in kls]
    bars = ax2.barh(evts, kls, color=colors, alpha=0.8)
    ax2.set_xlabel('KL Divergence D(Human || Agent)', fontsize=12, fontweight='bold')
    ax2.set_ylabel('Event Type', fontsize=12, fontweight='bold')
    ax2.set_title('Behavioral Divergence Between Human and Agent Traffic', fontsize=14, fontweight='bold')
    if kls:
        ax2.axvline(x=np.mean(kls), color='black', linestyle='--', linewidth=2,
                    alpha=0.5, label=f'Mean={np.mean(kls):.2f}')
        for bar, kl in zip(bars, kls):
            ax2.text(bar.get_width() + 0.1, bar.get_y() + bar.get_height()/2,
                     f'{kl:.2f}', ha='left', va='center', fontsize=10, fontweight='bold')
        ax2.legend()
    ax2.grid(axis='x', alpha=0.3, linestyle='--')

    plt.tight_layout()
    plt.savefig('kl_summary.png', dpi=300, bbox_inches='tight')
    print("Saved KL summary to kl_summary.png")

    # Global (not per-event) distributions and their KL divergences.
    h_freq, a_freq = event_frequency_distribution(human_mdp), event_frequency_distribution(agent_mdp)
    h_trans, a_trans = transition_distribution(human_mdp), transition_distribution(agent_mdp)
    freq_kl, trans_kl = kl_divergence(h_freq, a_freq), kl_divergence(h_trans, a_trans)

    print("\n=== Global Distribution KL Divergence ===")
    print(f"Event frequency KL: {freq_kl:.4f}")
    print(f"Transition pair KL: {trans_kl:.4f}")

    # Figure 3: event frequencies (left) and the 15 heaviest transition
    # pairs by combined human + agent probability (right).
    fig3, (ax_freq, ax_trans) = plt.subplots(1, 2, figsize=(16, 6))

    all_evts = sorted(set(h_freq.keys()) | set(a_freq.keys()))
    h_freqs, a_freqs = [h_freq.get(e, 0) for e in all_evts], [a_freq.get(e, 0) for e in all_evts]
    plot_comparison(ax_freq, h_freqs, a_freqs, all_evts, 'Event Frequency Distribution',
                    'Frequency', freq_kl)

    all_trans = sorted(set(h_trans.keys()) | set(a_trans.keys()))
    top_trans = [t for t, _ in sorted([(t, h_trans.get(t, 0) + a_trans.get(t, 0))
                                       for t in all_trans], key=lambda x: x[1], reverse=True)[:15]]
    h_tprobs, a_tprobs = [h_trans.get(t, 0) for t in top_trans], [a_trans.get(t, 0) for t in top_trans]
    plot_comparison(ax_trans, h_tprobs, a_tprobs, top_trans, 'Top Transition Pairs Distribution',
                    'Probability', trans_kl)

    plt.tight_layout()
    plt.savefig('global_distributions.png', dpi=300, bbox_inches='tight')
    print("Saved global distributions to global_distributions.png")
@dataclass(frozen=True)
class PricingStep:
    """Immutable bundle of the quantities recorded for one pricing step.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``. The array fields are ordinary NumPy
    arrays, so their contents are not protected by the freeze.
    """

    # Sessions belonging to this step.
    sessions: list[Session]
    # Demand value keyed by session id.
    demand_by_session: Dict[str, float]
    # Demand per product - presumably indexed by product index; verify against caller.
    demand_by_product: np.ndarray
    # Purchases per product - presumably indexed by product index; verify against caller.
    purchases_by_product: np.ndarray
    # Scalar revenue total for the step.
    revenue: float
    # Scalar cost total for the step.
    cost: float
    # Number of agent sessions in the step.
    n_agents: int
def clip_prices(prices: np.ndarray, min_price: float, max_price: float) -> np.ndarray:
    """Bound every price to ``[min_price, max_price]`` and return float32."""
    bounded = np.clip(prices, min_price, max_price)
    return bounded.astype(np.float32)


def constrain_prices(
    prev_prices: Optional[np.ndarray],
    proposed: np.ndarray,
    *,
    costs: np.ndarray,
    min_price: float,
    max_price: float,
    max_adjustment: float,
    min_margin_pct: float,
) -> np.ndarray:
    """Apply price bounds, a margin floor and a per-step change limit.

    Steps, in order:

    1. clip ``proposed`` to ``[min_price, max_price]``;
    2. raise each price to at least ``costs * (1 + min_margin_pct)``;
    3. if ``prev_prices`` is given, limit the ratio new/previous to
       ``[1 - max_adjustment, 1 + max_adjustment]``.

    Step 3 is applied last, so its result is not re-checked against the
    bounds or the floor.

    Returns:
        float32 array of constrained prices.
    """
    bounded = clip_prices(proposed, min_price, max_price)
    margin_floor = (costs * (1.0 + float(min_margin_pct))).astype(np.float32)
    candidate = np.maximum(bounded, margin_floor)
    if prev_prices is not None:
        baseline = prev_prices.astype(np.float32)
        lowest, highest = 1.0 - max_adjustment, 1.0 + max_adjustment
        # The small epsilon guards against division by a zero previous price.
        step = np.clip(candidate / (baseline + 1e-6), lowest, highest)
        candidate = (baseline * step).astype(np.float32)
    return candidate


def aggregate_demand_by_product(
    sessions: list[Session],
    demand_by_session: Dict[str, float],
    n_products: int,
) -> np.ndarray:
    """Sum per-session demand into a per-product float32 vector.

    Each session's demand is credited to the product of its first event.
    Unknown session ids, sessions without events and product indices outside
    ``[0, n_products)`` are ignored.
    """
    totals = np.zeros(n_products, dtype=np.float32)
    lookup = {s.sid: s for s in sessions}
    for session_id, quantity in demand_by_session.items():
        session = lookup.get(session_id)
        if session and session.events:
            product = int(session.events[0].product_idx)
            if 0 <= product < n_products:
                totals[product] += float(quantity)
    return totals


def aggregate_purchases(
    sessions: list[Session],
    costs: np.ndarray,
    n_products: int,
) -> tuple[np.ndarray, float, float, int]:
    """Tally purchase events across sessions.

    Only events whose ``action`` is ``"purchase"`` and whose product index
    lies in ``[0, n_products)`` are counted.

    Returns:
        ``(purchases, revenue, cost, n_agents)``: purchase counts per product
        (float32), the summed ``price_seen``, the summed ``costs`` of the
        purchased products, and the number of sessions with ``actor == "A"``.
    """
    sold = np.zeros(n_products, dtype=np.float32)
    total_revenue = 0.0
    total_cost = 0.0
    agent_sessions = 0

    for session in sessions:
        if session.actor == "A":
            agent_sessions += 1
        bought = (ev for ev in session.events if ev.action == "purchase")
        for ev in bought:
            product = int(ev.product_idx)
            if not 0 <= product < n_products:
                continue
            sold[product] += 1.0
            total_revenue += float(ev.price_seen)
            total_cost += float(costs[product])

    return sold, total_revenue, total_cost, agent_sessions