From fa65fe992d93f064b2c1332272387c498e5f5518 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Sun, 14 Dec 2025 17:30:01 +0100
Subject: [PATCH 01/99] initial environment definitions

---
 sim/rl/environment.py | 80 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 sim/rl/environment.py

diff --git a/sim/rl/environment.py b/sim/rl/environment.py
new file mode 100644
index 0000000..803a4fd
--- /dev/null
+++ b/sim/rl/environment.py
@@ -0,0 +1,80 @@
+import gymnasium as gym
+from gymnasium import spaces
+import numpy as np
+from dataclasses import dataclass
+
+# here when we say "learner" we mean the agent that is learning to optimize the pricing and "agent" is part of the envrionment where the agent is creating demand that that "learner" is processing"
+
+@dataclass
+class BusinessLogicConstraints():
+    max_price_adjustment : float = 0.3 # maximum adjustment of price
+    system_max_price : float = 500.0 # maximum price allowed in the system
+    product_catelogue_size : int = 100 # number of products in the catalogue
+
+
+class PHANTOMEnv(gym.Env):
+    def __init__(self):
+        super(PHANTOMEnv, self).__init__()
+        self.constraints = BusinessLogicConstraints()
+        self.action_space = spaces.Box(
+            low=-self.constraints.max_price_adjustment, high=self.constraints.max_price_adjustment,
+            shape=(1,), dtype=np.float32) #  we allow teh learner to adjust price by some BusinessLogicConstraints factor
+        # Example for using image as input:
+        self.observation_space = spaces.Dict({
+            'elasticity': spaces.Dict({
+                'price': spaces.Box(low=0, high=self.constraints.system_max_price,
+                                    shape=(self.constraints.product_catelogue_size,), dtype=np.float32),
+                'demand': spaces.Box(low=0, high=np.inf,
+                                     shape=(self.constraints.product_catelogue_size,), dtype=np.float32)
+            })
+        })
+
+    def reset(self, seed=None, options=None):
+        super().reset(seed=seed)
+        # Initialize state
+        self.state = {
+            'price': 100.0,  # base price
+            'demand': 0.0
+        }
+        return self.state, {}
+
+    def step(self, action):
+        # Apply action
+        price_adjustment = action[0]
+        new_price = self.state['price'] * (1 + price_adjustment)
+        self.state['price'] = new_price
+
+        # Simulate demand based on new price
+        demand = self.simulate_demand(new_price)
+        self.state['demand'] = demand
+
+        # Calculate reward (e.g., revenue)
+        reward = new_price * demand
+
+        # Check if episode is done
+        done = self.state['price'] <= 0.0 or self.state['demand'] <= 0.0
+
+
+        return self.state, reward, done, False, {}
+    def simulate_demand(self, price):
+        # Simple linear demand model: demand decreases as price increases
+        base_demand = 200
+        price_sensitivity = 0.5
+        demand = max(0, base_demand - price_sensitivity * price)
+        return demand
+
+if __name__ == "__main__":
+    env = PHANTOMEnv()
+    obs, _ = env.reset()
+    done = False
+    total_reward = 0
+
+    while not done:
+        action = env.action_space.sample()  # Random action
+        obs, reward, done, _, _ = env.step(action)
+        total_reward += reward
+        print(f"Price: {obs['price']:.2f}, Demand: {obs['demand']:.2f}, Reward: {reward:.2f}")
+        if done:
+            break
+
+    print(f"Total Reward: {total_reward:.2f}")

From fe7dafed0a45dcb36df950a12abd25befaa543f3 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Sun, 14 Dec 2025 17:53:48 +0100
Subject: [PATCH 02/99] high level definition

---
 sim/rl/environment.py | 94 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 83 insertions(+), 11 deletions(-)

diff --git a/sim/rl/environment.py b/sim/rl/environment.py
index 803a4fd..a09438f 100644
--- a/sim/rl/environment.py
+++ b/sim/rl/environment.py
@@ -2,6 +2,7 @@ import gymnasium as gym
 from gymnasium import spaces
 import numpy as np
 from dataclasses import dataclass
+import pandas as pd
 
 # here when we say "learner" we mean the agent that is learning to optimize the pricing and "agent" is part of the envrionment where the agent is creating demand that that "learner" is processing"
 
@@ -9,17 +10,89 @@ from dataclasses import dataclass
 class BusinessLogicConstraints():
     max_price_adjustment : float = 0.3 # maximum adjustment of price
     system_max_price : float = 500.0 # maximum price allowed in the system
+    system_min_price : float = 1.0 # minimum price allowed in the system
     product_catelogue_size : int = 100 # number of products in the catalogue
 
 
+class CommercePlatform:
+    def __init__(self, product_catelogue_size: int, max_price: float, min_price: float):
+        self.product_catelogue_size = product_catelogue_size
+        self.max_price = max_price
+        self.min_price = min_price
+        self.simulation_history = []
+
+
+    def setup_true_demand(self,prices: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+        human_price_elasticity = -1.5  # Example elasticity value
+        base_demand = 100  # Base demand for products
+        demand = base_demand * (prices / self.max_price) ** human_price_elasticity
+
+        agent_price_elasticity = -2.0  # Example elasticity value for agents
+        agent_base_demand = 150  # Base demand for agents
+        agent_demand = agent_base_demand * (prices / self.max_price) ** agent_price_elasticity
+
+        return demand + agent_demand, agent_demand
+
+
+    def compute_interaction_features(self, interaction_data: np.ndarray) -> dict:
+        df = pd.DataFrame(interaction_data)
+        return {
+            'mean_sale_price': df[df['action'] == 'purchase']['price'].mean(),
+        }
+
+    def run_pricing_simulation(self, prices: np.ndarray) -> np.ndarray:
+        # Simulate demand based on prices
+
+        observed_demand, demand_from_agents = self.setup_true_demand(prices)
+        true_demand = observed_demand - demand_from_agents
+
+        interaction_data = self.get_interaction_data()
+        interaction_features = self.compute_interaction_features(interaction_data)
+        demand_estimates = self.demand_estimate(interaction_data)
+        internal_error = np.abs(true_demand - demand_estimates) / (true_demand + 1e-6)
+
+        self.simulation_history.append(
+            {
+                'prices': prices,
+                'true_demand': true_demand,
+                'demand_estimates': demand_estimates,
+                'internal_error': internal_error,
+                'interaction_data': interaction_data,
+                'interaction_features': interaction_features
+            })
+        return np.array(interaction_data)
+
+    def get_interaction_data(self) -> np.ndarray:
+        # Simulate interaction data
+        interaction_data = []
+        return np.array(interaction_data)
+
+
+    def demand_estimate(self, interactions : np.ndarray) -> np.ndarray:
+        demand_estimates = np.random.rand(self.product_catelogue_size) * 100  # Dummy demand estimates
+        return demand_estimates
+
+
+
+
+
+
+
+
+
 class PHANTOMEnv(gym.Env):
     def __init__(self):
         super(PHANTOMEnv, self).__init__()
         self.constraints = BusinessLogicConstraints()
         self.action_space = spaces.Box(
             low=-self.constraints.max_price_adjustment, high=self.constraints.max_price_adjustment,
-            shape=(1,), dtype=np.float32) #  we allow teh learner to adjust price by some BusinessLogicConstraints factor
+            shape=(self.constraints.product_catelogue_size,), dtype=np.float32) #  we allow teh learner to adjust price by some BusinessLogicConstraints factor
         # Example for using image as input:
+        self.commerce_platform = CommercePlatform(
+            product_catelogue_size=self.constraints.product_catelogue_size,
+            max_price=self.constraints.system_max_price,
+            min_price=self.constraints.system_min_price
+        )
         self.observation_space = spaces.Dict({
             'elasticity': spaces.Dict({
                 'price': spaces.Box(low=0, high=self.constraints.system_max_price,
@@ -29,24 +102,23 @@ class PHANTOMEnv(gym.Env):
             })
         })
 
-    def reset(self, seed=None, options=None):
+    def reset(self, seed :int, options) -> tuple[dict, dict]:
         super().reset(seed=seed)
         # Initialize state
         self.state = {
-            'price': 100.0,  # base price
-            'demand': 0.0
+            'elasticity': {
+                'price': np.full((self.constraints.product_catelogue_size,), 100.0, dtype=np.float32),
+                'demand': np.full((self.constraints.product_catelogue_size,), 50.0, dtype=np.float32)
+            }
         }
         return self.state, {}
 
     def step(self, action):
-        # Apply action
-        price_adjustment = action[0]
-        new_price = self.state['price'] * (1 + price_adjustment)
-        self.state['price'] = new_price
+        self.state['price'] = np.clip(self.state['price'] * (1 + action),
+                            self.constraints.system_min_price,
+                            self.constraints.system_max_price)
+
 
-        # Simulate demand based on new price
-        demand = self.simulate_demand(new_price)
-        self.state['demand'] = demand
 
         # Calculate reward (e.g., revenue)
         reward = new_price * demand

From c5caee21b168b9ca05b0fe3990f9b9c8de5c6f33 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Sun, 14 Dec 2025 17:59:34 +0100
Subject: [PATCH 03/99] formulating the reward simply

---
 sim/rl/environment.py | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/sim/rl/environment.py b/sim/rl/environment.py
index a09438f..ca7159b 100644
--- a/sim/rl/environment.py
+++ b/sim/rl/environment.py
@@ -40,7 +40,7 @@ class CommercePlatform:
             'mean_sale_price': df[df['action'] == 'purchase']['price'].mean(),
         }
 
-    def run_pricing_simulation(self, prices: np.ndarray) -> np.ndarray:
+    def run_pricing_simulation(self, prices: np.ndarray) -> dict:
         # Simulate demand based on prices
 
         observed_demand, demand_from_agents = self.setup_true_demand(prices)
@@ -51,16 +51,17 @@ class CommercePlatform:
         demand_estimates = self.demand_estimate(interaction_data)
         internal_error = np.abs(true_demand - demand_estimates) / (true_demand + 1e-6)
 
-        self.simulation_history.append(
-            {
+
+        summary = {
                 'prices': prices,
                 'true_demand': true_demand,
                 'demand_estimates': demand_estimates,
                 'internal_error': internal_error,
                 'interaction_data': interaction_data,
                 'interaction_features': interaction_features
-            })
-        return np.array(interaction_data)
+            }
+        self.simulation_history.append(summary)
+        return summary
 
     def get_interaction_data(self) -> np.ndarray:
         # Simulate interaction data
@@ -118,10 +119,24 @@ class PHANTOMEnv(gym.Env):
                             self.constraints.system_min_price,
                             self.constraints.system_max_price)
 
+        result = self.commerce_platform.run_pricing_simulation(self.state['price'])
+        history = self.commerce_platform.simulation_history
+        self.state['demand'] = result['demand_estimates']
+
+
+
+        reward = sum(
+            self.state['price'] * self.state['demand'],
+            # performance historically, to take into account business kpi trends (using features from interaction data)
+            sum(
+                [-0.05 * i * history[-1]['internal_error'] for i in range(1, len(history))],
+            ) if len(history) > 1 else 0,
+            sum(
+                [0.1 * history[-1]['interaction_features']['mean_sale_price'] - 0.1 * history[i]['interaction_features']['mean_sale_price'] for i in range(len(history)-1)],
+            ) if len(history) > 1 else 0
+        )
 
 
-        # Calculate reward (e.g., revenue)
-        reward = new_price * demand
 
         # Check if episode is done
         done = self.state['price'] <= 0.0 or self.state['demand'] <= 0.0

From aae124f5eaf7def261d77a0bcaa10e2197d098d4 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Sun, 14 Dec 2025 18:59:02 +0100
Subject: [PATCH 04/99] improved implementation

---
 sim/rl/environment.py | 512 ++++++++++++++++++++++++++++++++----------
 1 file changed, 398 insertions(+), 114 deletions(-)

diff --git a/sim/rl/environment.py b/sim/rl/environment.py
index ca7159b..19f9ad4 100644
--- a/sim/rl/environment.py
+++ b/sim/rl/environment.py
@@ -3,165 +3,449 @@ from gymnasium import spaces
 import numpy as np
 from dataclasses import dataclass
 import pandas as pd
+from typing import Callable, Optional, Dict, Any, List
 
-# here when we say "learner" we mean the agent that is learning to optimize the pricing and "agent" is part of the envrionment where the agent is creating demand that that "learner" is processing"
+# "learner"  agent learning to optimize pricing
+# "agent"  part of environment creating demand signals that learner processes
 
 @dataclass
 class BusinessLogicConstraints():
-    max_price_adjustment : float = 0.3 # maximum adjustment of price
-    system_max_price : float = 500.0 # maximum price allowed in the system
-    system_min_price : float = 1.0 # minimum price allowed in the system
-    product_catelogue_size : int = 100 # number of products in the catalogue
+    max_price_adjustment: float = 0.30
+    system_max_price: float = 500.0
+    system_min_price: float = 1.0
+    product_catelogue_size: int = 100
+    episode_length: int = 200
+    sessions_per_step: int = 250
+    agent_share: float = 0.25
+    agent_recon_multiplier: float = 6.0
+    agent_purchase_probability: float = 0.20
+    coi_strength: float = 0.25
+    coi_threshold: float = 4.0
+    coi_sigmoid_temp: float = 1.25
+    base_human_demand: float = 0.08
+    base_agent_demand: float = 0.05
+    human_price_elasticity: float = -1.2
+    agent_price_elasticity: float = -0.6
+    w_agent_loss: float = 1.0
+    w_volatility: float = 5.0
+    w_estimation_error: float = 0.25
+    seed: int = 7
+
+
+def _sigmoid(x: np.ndarray) -> np.ndarray:
+    return 1.0 / (1.0 + np.exp(-x))
+
+
+def simple_agent_detector(session_df: pd.DataFrame) -> pd.Series:
+    # baseline heuristic: high velocity + low conversion
+    v = session_df.get("interaction_velocity", pd.Series(0.0, index=session_df.index))
+    cr = session_df.get("conversion_rate", pd.Series(0.0, index=session_df.index))
+    total = session_df.get("total_interactions", pd.Series(0, index=session_df.index))
+    return (total >= 12) & (v >= 0.20) & (cr <= 0.01)
 
 
 class CommercePlatform:
-    def __init__(self, product_catelogue_size: int, max_price: float, min_price: float):
+    def __init__(self, product_catelogue_size: int, max_price: float, min_price: float,
+                 constraints: BusinessLogicConstraints, agent_detector: Optional[Callable[[pd.DataFrame], pd.Series]] = None,
+                 use_defense: bool = False):
         self.product_catelogue_size = product_catelogue_size
         self.max_price = max_price
         self.min_price = min_price
-        self.simulation_history = []
+        self.constraints = constraints
+        self.use_defense = use_defense
+        self.agent_detector = agent_detector
+        self.simulation_history: List[Dict[str, Any]] = []
+        self._rng = np.random.default_rng(constraints.seed)
+        self._popularity = self._rng.lognormal(mean=0.0, sigma=0.6, size=self.product_catelogue_size)
+        self._popularity = self._popularity / (self._popularity.mean() + 1e-12)
+        self._last_interaction_df: pd.DataFrame = pd.DataFrame()
 
-
-    def setup_true_demand(self,prices: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
-        human_price_elasticity = -1.5  # Example elasticity value
-        base_demand = 100  # Base demand for products
-        demand = base_demand * (prices / self.max_price) ** human_price_elasticity
-
-        agent_price_elasticity = -2.0  # Example elasticity value for agents
-        agent_base_demand = 150  # Base demand for agents
-        agent_demand = agent_base_demand * (prices / self.max_price) ** agent_price_elasticity
-
-        return demand + agent_demand, agent_demand
-
-
-    def compute_interaction_features(self, interaction_data: np.ndarray) -> dict:
-        df = pd.DataFrame(interaction_data)
+    def setup_true_demand(self, prices: np.ndarray) -> Dict[str, np.ndarray]:
+        # ground truth purchase propensities
+        p = np.clip(prices, self.min_price, self.max_price)
+        pn = p / self.max_price
+        human_prob = self.constraints.base_human_demand * (pn ** self.constraints.human_price_elasticity)
+        agent_prob = self.constraints.base_agent_demand * (pn ** self.constraints.agent_price_elasticity)
         return {
-            'mean_sale_price': df[df['action'] == 'purchase']['price'].mean(),
+            "human_purchase_prob": np.clip(human_prob * self._popularity, 0.0, 0.95),
+            "agent_purchase_prob": np.clip(agent_prob * self._popularity, 0.0, 0.95)
         }
 
-    def run_pricing_simulation(self, prices: np.ndarray) -> dict:
-        # Simulate demand based on prices
+    def _session_markup_multiplier(self, signal_score: float) -> float:
+        # session-based COI markup based on demand signal expression
+        x = (signal_score - self.constraints.coi_threshold) / max(self.constraints.coi_sigmoid_temp, 1e-6)
+        return 1.0 + self.constraints.coi_strength * float(_sigmoid(np.array([x]))[0])
 
-        observed_demand, demand_from_agents = self.setup_true_demand(prices)
-        true_demand = observed_demand - demand_from_agents
+    def _simulate_sessions(self, base_prices: np.ndarray) -> pd.DataFrame:
+        demand = self.setup_true_demand(base_prices)
+        human_pprob = demand["human_purchase_prob"]
+        agent_pprob = demand["agent_purchase_prob"]
+        events: List[Dict[str, Any]] = []
+        T = self.constraints.sessions_per_step
+        n_agent_sessions = int(round(T * self.constraints.agent_share))
+        n_human_sessions = T - n_agent_sessions
 
-        interaction_data = self.get_interaction_data()
-        interaction_features = self.compute_interaction_features(interaction_data)
-        demand_estimates = self.demand_estimate(interaction_data)
-        internal_error = np.abs(true_demand - demand_estimates) / (true_demand + 1e-6)
+        # human sessions: normal browse with possible purchase
+        for s in range(n_human_sessions):
+            session_id = f"h_{len(events)}_{s}"
+            k = int(self._rng.integers(1, 4))
+            prod_ids = self._rng.choice(self.product_catelogue_size, size=k, replace=False)
+            t = 0.0
+            inter_times = self._rng.gamma(shape=2.0, scale=3.0, size=3 * k)
+            signal_score = 0.0
+            purchased_any = False
 
+            for i, pid in enumerate(prod_ids):
+                t += float(inter_times[i])
+                price_shown = float(base_prices[pid])
+                events.append({
+                    "session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
+                    "action": "view", "t": t, "price_shown": price_shown, "is_purchase": 0,
+                    "price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
+                })
+                signal_score += 1.0
+
+                if self._rng.random() < 0.35:
+                    t += float(inter_times[i + k])
+                    events.append({
+                        "session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
+                        "action": "cart", "t": t, "price_shown": price_shown, "is_purchase": 0,
+                        "price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
+                    })
+                    signal_score += 2.0
+
+                if (not purchased_any) and (self._rng.random() < float(human_pprob[pid])):
+                    t += float(inter_times[i + 2 * k])
+                    mult = self._session_markup_multiplier(signal_score)
+                    price_paid = float(np.clip(base_prices[pid] * mult, self.min_price, self.max_price))
+                    events.append({
+                        "session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
+                        "action": "purchase", "t": t, "price_shown": float(base_prices[pid]), "is_purchase": 1,
+                        "price_paid": price_paid, "oracle_price_paid": price_paid, "signal_score": signal_score,
+                    })
+                    purchased_any = True
+
+        # agent sessions: split recon/purchase to circumvent COI
+        n_agent_ids = max(1, n_agent_sessions // 2)
+        for a in range(n_agent_ids):
+            agent_id = f"a_{a}"
+            recon_session_id = f"{agent_id}_recon"
+            t = 0.0
+            n_views = int(self._rng.poisson(lam=8) * self.constraints.agent_recon_multiplier) + 5
+            inter_times = self._rng.gamma(shape=2.0, scale=0.6, size=max(n_views, 1))
+            prod_ids = self._rng.integers(0, self.product_catelogue_size, size=n_views)
+            recon_signal = 0.0
+
+            for i, pid in enumerate(prod_ids):
+                t += float(inter_times[i])
+                events.append({
+                    "session_id": recon_session_id, "actor": "agent", "agent_id": agent_id, "product_id": int(pid),
+                    "action": "view", "t": t, "price_shown": float(base_prices[pid]), "is_purchase": 0,
+                    "price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
+                })
+                recon_signal += 1.0
+
+            # clean purchase session with minimal interactions
+            if self._rng.random() < self.constraints.agent_purchase_probability:
+                purchase_session_id = f"{agent_id}_clean"
+                pid = int(self._rng.integers(0, self.product_catelogue_size))
+                t2 = 0.0
+                clean_signal = 0.0
+                t2 += float(self._rng.gamma(shape=2.0, scale=0.7))
+                events.append({
+                    "session_id": purchase_session_id, "actor": "agent", "agent_id": agent_id, "product_id": pid,
+                    "action": "view", "t": t2, "price_shown": float(base_prices[pid]), "is_purchase": 0,
+                    "price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
+                })
+                clean_signal += 1.0
+
+                if self._rng.random() < float(agent_pprob[pid]):
+                    t2 += float(self._rng.gamma(shape=2.0, scale=0.7))
+                    obs_mult = self._session_markup_multiplier(clean_signal)
+                    obs_paid = float(np.clip(base_prices[pid] * obs_mult, self.min_price, self.max_price))
+                    oracle_mult = self._session_markup_multiplier(recon_signal)  # oracle links recon->purchase
+                    oracle_paid = float(np.clip(base_prices[pid] * oracle_mult, self.min_price, self.max_price))
+                    events.append({
+                        "session_id": purchase_session_id, "actor": "agent", "agent_id": agent_id, "product_id": pid,
+                        "action": "purchase", "t": t2, "price_shown": float(base_prices[pid]), "is_purchase": 1,
+                        "price_paid": obs_paid, "oracle_price_paid": oracle_paid, "signal_score": clean_signal,
+                    })
+
+        return pd.DataFrame(events)
+
+    def compute_interaction_features(self, interaction_df: pd.DataFrame) -> Dict[str, float]:
+        if interaction_df.empty:
+            return {"mean_sale_price": 0.0, "look_to_book": 0.0}
+        purchases = interaction_df[interaction_df["action"] == "purchase"]
+        mean_sale_price = float(purchases["price_paid"].mean()) if not purchases.empty else 0.0
+        views = float((interaction_df["action"] == "view").sum())
+        buys = float((interaction_df["action"] == "purchase").sum())
+        return {"mean_sale_price": mean_sale_price, "look_to_book": float(views / (buys + 1e-6))}
+
+    def _session_feature_table(self, df: pd.DataFrame) -> pd.DataFrame:
+        if df.empty:
+            return pd.DataFrame()
+        g = df.groupby("session_id", sort=False)
+        session_duration = g["t"].max() - g["t"].min()
+        total_interactions = g.size()
+        avg_time_between = g["t"].apply(lambda x: float(np.diff(np.sort(x.to_numpy())).mean()) if len(x) > 1 else 0.0)
+        interaction_velocity = total_interactions / (session_duration + 1e-6)
+        views = g.apply(lambda x: int((x["action"] == "view").sum()), include_groups=False)
+        cart_adds = g.apply(lambda x: int((x["action"] == "cart").sum()), include_groups=False)
+        purchases = g.apply(lambda x: int((x["action"] == "purchase").sum()), include_groups=False)
+        conversion_rate = purchases / (views + 1e-6)
+        is_agent = g["actor"].apply(lambda s: bool((s == "agent").any()), include_groups=False)
+
+        return pd.DataFrame({
+            "session_duration_sec": session_duration.astype(float),
+            "avg_time_between_events": avg_time_between.astype(float),
+            "total_interactions": total_interactions.astype(int),
+            "interaction_velocity": interaction_velocity.astype(float),
+            "item_views": views.astype(int),
+            "cart_adds": cart_adds.astype(int),
+            "purchases": purchases.astype(int),
+            "conversion_rate": conversion_rate.astype(float),
+            "is_agent": is_agent.astype(bool),
+        }).reset_index()
+
+    def demand_estimate(self, interaction_df: pd.DataFrame, exclude_sessions: Optional[pd.Series] = None) -> np.ndarray:
+        # proxy demand from weighted interaction events
+        if interaction_df.empty:
+            return np.zeros(self.product_catelogue_size, dtype=np.float32)
+        df = interaction_df
+        if exclude_sessions is not None:
+            bad_sessions = set(exclude_sessions.loc[exclude_sessions].index)
+            df = df[~df["session_id"].isin(bad_sessions)]
+        weights = {"view": 0.15, "cart": 0.75, "purchase": 2.5}
+        w = df["action"].map(weights).fillna(0.0).to_numpy(dtype=float)
+        prod = df["product_id"].to_numpy(dtype=int)
+        q_hat = np.zeros(self.product_catelogue_size, dtype=float)
+        np.add.at(q_hat, prod, w)
+        return q_hat.astype(np.float32)
+
+    def run_pricing_simulation(self, prices: np.ndarray) -> Dict[str, Any]:
+        interaction_df = self._simulate_sessions(prices)
+        self._last_interaction_df = interaction_df
+        session_df = self._session_feature_table(interaction_df)
+
+        predicted_agent_sessions = None
+        if (self.use_defense and self.agent_detector is not None and not session_df.empty):
+            predicted_agent_sessions = self.agent_detector(session_df.set_index("session_id"))
+
+        q_hat_naive = self.demand_estimate(interaction_df, exclude_sessions=None)
+        q_hat_defended = self.demand_estimate(interaction_df, exclude_sessions=predicted_agent_sessions) \
+            if predicted_agent_sessions is not None else q_hat_naive.copy()
+
+        true_human = np.zeros(self.product_catelogue_size, dtype=float)
+        true_agent = np.zeros(self.product_catelogue_size, dtype=float)
+        if not interaction_df.empty:
+            purchases = interaction_df[interaction_df["action"] == "purchase"]
+            if not purchases.empty:
+                for _, r in purchases.iterrows():
+                    if r["actor"] == "human":
+                        true_human[int(r["product_id"])] += 1.0
+                    else:
+                        true_agent[int(r["product_id"])] += 1.0
+
+        revenue_observed = float(interaction_df["price_paid"].sum()) if not interaction_df.empty else 0.0
+        revenue_oracle = float(interaction_df["oracle_price_paid"].sum()) if not interaction_df.empty else 0.0
+        agent_loss = max(0.0, revenue_oracle - revenue_observed)
+
+        eps = 1e-6
+        internal_error_naive = np.abs(true_human - q_hat_naive) / (true_human + eps)
+        internal_error_def = np.abs(true_human - q_hat_defended) / (true_human + eps)
+        interaction_features = self.compute_interaction_features(interaction_df)
 
         summary = {
-                'prices': prices,
-                'true_demand': true_demand,
-                'demand_estimates': demand_estimates,
-                'internal_error': internal_error,
-                'interaction_data': interaction_data,
-                'interaction_features': interaction_features
-            }
+            "prices": prices.copy(),
+            "interaction_df": interaction_df,
+            "session_df": session_df,
+            "q_hat_naive": q_hat_naive,
+            "q_hat_defended": q_hat_defended,
+            "true_human_demand": true_human.astype(np.float32),
+            "true_agent_purchases": true_agent.astype(np.float32),
+            "internal_error_naive": internal_error_naive.astype(np.float32),
+            "internal_error_defended": internal_error_def.astype(np.float32),
+            "interaction_features": interaction_features,
+            "revenue_observed": revenue_observed,
+            "revenue_oracle": revenue_oracle,
+            "agent_loss": agent_loss,
+            "predicted_agent_sessions": predicted_agent_sessions,
+        }
         self.simulation_history.append(summary)
         return summary
 
     def get_interaction_data(self) -> np.ndarray:
-        # Simulate interaction data
-        interaction_data = []
-        return np.array(interaction_data)
-
-
-    def demand_estimate(self, interactions : np.ndarray) -> np.ndarray:
-        demand_estimates = np.random.rand(self.product_catelogue_size) * 100  # Dummy demand estimates
-        return demand_estimates
-
-
-
-
-
-
-
+        if self._last_interaction_df.empty:
+            return np.array([], dtype=object)
+        return self._last_interaction_df.to_dict(orient="records")
 
 
 class PHANTOMEnv(gym.Env):
-    def __init__(self):
-        super(PHANTOMEnv, self).__init__()
+    metadata = {"render_modes": []}
+
+    def __init__(self, use_defense: bool = False):
+        super().__init__()
         self.constraints = BusinessLogicConstraints()
-        self.action_space = spaces.Box(
-            low=-self.constraints.max_price_adjustment, high=self.constraints.max_price_adjustment,
-            shape=(self.constraints.product_catelogue_size,), dtype=np.float32) #  we allow teh learner to adjust price by some BusinessLogicConstraints factor
-        # Example for using image as input:
+        self.action_space = spaces.Box(low=-self.constraints.max_price_adjustment,
+                                       high=self.constraints.max_price_adjustment,
+                                       shape=(self.constraints.product_catelogue_size,), dtype=np.float32)
+        self.observation_space = spaces.Dict({
+            "elasticity": spaces.Dict({
+                "price": spaces.Box(
+                    low=np.full((self.constraints.product_catelogue_size,), self.constraints.system_min_price, dtype=np.float32),
+                    high=np.full((self.constraints.product_catelogue_size,), self.constraints.system_max_price, dtype=np.float32),
+                    dtype=np.float32),
+                "demand": spaces.Box(
+                    low=np.zeros((self.constraints.product_catelogue_size,), dtype=np.float32),
+                    high=np.full((self.constraints.product_catelogue_size,), 1e6, dtype=np.float32),
+                    dtype=np.float32),
+            })
+        })
         self.commerce_platform = CommercePlatform(
             product_catelogue_size=self.constraints.product_catelogue_size,
             max_price=self.constraints.system_max_price,
-            min_price=self.constraints.system_min_price
-        )
-        self.observation_space = spaces.Dict({
-            'elasticity': spaces.Dict({
-                'price': spaces.Box(low=0, high=self.constraints.system_max_price,
-                                    shape=(self.constraints.product_catelogue_size,), dtype=np.float32),
-                'demand': spaces.Box(low=0, high=np.inf,
-                                     shape=(self.constraints.product_catelogue_size,), dtype=np.float32)
-            })
-        })
+            min_price=self.constraints.system_min_price,
+            constraints=self.constraints,
+            agent_detector=simple_agent_detector,
+            use_defense=use_defense)
+        self._rng = np.random.default_rng(self.constraints.seed)
+        self.t = 0
+        self._prev_prices: Optional[np.ndarray] = None
+        self.state: Dict[str, Any] = {}
 
-    def reset(self, seed :int, options) -> tuple[dict, dict]:
+    def reset(self, seed: Optional[int] = None, options: Optional[dict] = None):
         super().reset(seed=seed)
-        # Initialize state
+        if seed is not None:
+            self._rng = np.random.default_rng(seed)
+            self.commerce_platform._rng = np.random.default_rng(seed)
+        self.t = 0
+        init_prices = self._rng.uniform(low=60.0, high=140.0, size=(self.constraints.product_catelogue_size,)).astype(np.float32)
+        self._prev_prices = init_prices.copy()
         self.state = {
-            'elasticity': {
-                'price': np.full((self.constraints.product_catelogue_size,), 100.0, dtype=np.float32),
-                'demand': np.full((self.constraints.product_catelogue_size,), 50.0, dtype=np.float32)
+            "elasticity": {
+                "price": init_prices,
+                "demand": np.zeros((self.constraints.product_catelogue_size,), dtype=np.float32),
             }
         }
         return self.state, {}
 
-    def step(self, action):
-        self.state['price'] = np.clip(self.state['price'] * (1 + action),
-                            self.constraints.system_min_price,
-                            self.constraints.system_max_price)
+    def step(self, action: np.ndarray):
+        self.t += 1
+        base_prices = self.state["elasticity"]["price"].astype(np.float32)
+        new_prices = np.clip(base_prices * (1.0 + action.astype(np.float32)),
+                           self.constraints.system_min_price,
+                           self.constraints.system_max_price).astype(np.float32)
+        result = self.commerce_platform.run_pricing_simulation(new_prices)
 
-        result = self.commerce_platform.run_pricing_simulation(self.state['price'])
-        history = self.commerce_platform.simulation_history
-        self.state['demand'] = result['demand_estimates']
+        if self.commerce_platform.use_defense:
+            demand_est = result["q_hat_defended"]
+            internal_err = result["internal_error_defended"]
+        else:
+            demand_est = result["q_hat_naive"]
+            internal_err = result["internal_error_naive"]
 
+        self.state["elasticity"]["price"] = new_prices
+        self.state["elasticity"]["demand"] = demand_est
 
+        volatility = 0.0 if self._prev_prices is None else \
+            float(np.mean(np.abs((new_prices - self._prev_prices) / (self._prev_prices + 1e-6))))
+        self._prev_prices = new_prices.copy()
 
-        reward = sum(
-            self.state['price'] * self.state['demand'],
-            # performance historically, to take into account business kpi trends (using features from interaction data)
-            sum(
-                [-0.05 * i * history[-1]['internal_error'] for i in range(1, len(history))],
-            ) if len(history) > 1 else 0,
-            sum(
-                [0.1 * history[-1]['interaction_features']['mean_sale_price'] - 0.1 * history[i]['interaction_features']['mean_sale_price'] for i in range(len(history)-1)],
-            ) if len(history) > 1 else 0
-        )
+        revenue_observed = float(result["revenue_observed"])
+        agent_loss = float(result["agent_loss"])
+        err_mean = float(np.mean(internal_err))
 
+        reward = (revenue_observed
+                 - self.constraints.w_agent_loss * agent_loss
+                 - self.constraints.w_volatility * volatility
+                 - self.constraints.w_estimation_error * err_mean)
 
+        terminated = self.t >= self.constraints.episode_length
+        info = {
+            "t": self.t,
+            "revenue_observed": revenue_observed,
+            "revenue_oracle": float(result["revenue_oracle"]),
+            "agent_loss": agent_loss,
+            "ux_volatility": volatility,
+            "mean_internal_error": err_mean,
+            "look_to_book": float(result["interaction_features"].get("look_to_book", 0.0)),
+            "mean_sale_price": float(result["interaction_features"].get("mean_sale_price", 0.0)),
+            "true_human_purchases_total": float(np.sum(result["true_human_demand"])),
+            "true_agent_purchases_total": float(np.sum(result["true_agent_purchases"])),
+        }
+        return self.state, float(reward), terminated, False, info
 
-        # Check if episode is done
-        done = self.state['price'] <= 0.0 or self.state['demand'] <= 0.0
-
-
-        return self.state, reward, done, False, {}
-    def simulate_demand(self, price):
-        # Simple linear demand model: demand decreases as price increases
-        base_demand = 200
-        price_sensitivity = 0.5
-        demand = max(0, base_demand - price_sensitivity * price)
-        return demand
 
 if __name__ == "__main__":
-    env = PHANTOMEnv()
-    obs, _ = env.reset()
-    done = False
-    total_reward = 0
+    import matplotlib.pyplot as plt
+    from collections import defaultdict
 
-    while not done:
-        action = env.action_space.sample()  # Random action
-        obs, reward, done, _, _ = env.step(action)
-        total_reward += reward
-        print(f"Price: {obs['price']:.2f}, Demand: {obs['demand']:.2f}, Reward: {reward:.2f}")
-        if done:
-            break
+    runs = {}
+    for use_defense in (False, True):
+        env = PHANTOMEnv(use_defense=use_defense)
+        obs, _ = env.reset(seed=42)
+        metrics = defaultdict(list)
+        total_reward = 0.0
+        done = False
 
-    print(f"Total Reward: {total_reward:.2f}")
+        while not done:
+            action = env.action_space.sample()
+            obs, reward, done, _, info = env.step(action)
+            total_reward += reward
+            p_mean = float(np.mean(obs["elasticity"]["price"]))
+            q_mean = float(np.mean(obs["elasticity"]["demand"]))
+            p_std = float(np.std(obs["elasticity"]["price"]))
+
+            metrics['t'].append(info['t'])
+            metrics['price_mean'].append(p_mean)
+            metrics['price_std'].append(p_std)
+            metrics['demand_mean'].append(q_mean)
+            metrics['revenue_observed'].append(info['revenue_observed'])
+            metrics['revenue_oracle'].append(info['revenue_oracle'])
+            metrics['agent_loss'].append(info['agent_loss'])
+            metrics['ux_volatility'].append(info['ux_volatility'])
+            metrics['look_to_book'].append(info['look_to_book'])
+            metrics['reward'].append(reward)
+            metrics['human_purchases'].append(info['true_human_purchases_total'])
+            metrics['agent_purchases'].append(info['true_agent_purchases_total'])
+
+            if info['t'] % 20 == 0 or done:
+                print(f"defense={'ON ' if use_defense else 'OFF'} t={info['t']:03d} p={p_mean:6.2f}±{p_std:4.2f} "
+                      f"q={q_mean:6.2f} rev={info['revenue_observed']:7.2f} oracle={info['revenue_oracle']:7.2f} "
+                      f"loss={info['agent_loss']:6.2f} ux={info['ux_volatility']:.3f} "
+                      f"ltb={info['look_to_book']:5.2f} r={reward:7.2f}")
+
+        runs[use_defense] = metrics
+        print(f"defense={'ON ' if use_defense else 'OFF'} total_reward={total_reward:.2f}\n")
+
+    fig, axes = plt.subplots(3, 3, figsize=(15, 12))
+    fig.suptitle('PHANTOM Environment: Defense OFF vs ON', fontsize=14, fontweight='bold')
+
+    plot_configs = [
+        ('price_mean', 'Mean Price', 'Price'),
+        ('demand_mean', 'Mean Demand Estimate', 'Demand'),
+        ('revenue_observed', 'Revenue (Observed)', 'Revenue'),
+        ('agent_loss', 'Agent Loss (Oracle - Observed)', 'Loss'),
+        ('ux_volatility', 'UX Volatility (Price Change)', 'Volatility'),
+        ('look_to_book', 'Look-to-Book Ratio', 'Ratio'),
+        ('reward', 'Step Reward', 'Reward'),
+        ('human_purchases', 'Human Purchases', 'Count'),
+        ('agent_purchases', 'Agent Purchases', 'Count'),
+    ]
+
+    for idx, (key, title, ylabel) in enumerate(plot_configs):
+        ax = axes[idx // 3, idx % 3]
+        for use_defense, label, color in [(False, 'No Defense', 'red'), (True, 'With Defense', 'blue')]:
+            m = runs[use_defense]
+            ax.plot(m['t'], m[key], label=label, color=color, alpha=0.7, linewidth=1.5)
+        ax.set_xlabel('Step')
+        ax.set_ylabel(ylabel)
+        ax.set_title(title, fontsize=10, fontweight='bold')
+        ax.legend(loc='best', fontsize=8)
+        ax.grid(True, alpha=0.3)
+
+    plt.tight_layout()
+    plt.savefig('phantom_env_comparison.png', dpi=150, bbox_inches='tight')
+    print("Plot saved to phantom_env_comparison.png")
+    plt.show()

From f95056526497e5e44f868fffd4241072fd5b86f5 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Mon, 15 Dec 2025 14:18:35 +0100
Subject: [PATCH 05/99] tailored docker compose image for secondary
 tensorboard

---
 docker-compose.yml | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index f572758..f72f415 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,8 +1,17 @@
 services:
-
-  tensorboard:
+  tensorboard-rl:
     image: tensorflow/tensorflow:latest
-    container_name: "PHANTOM-tensorboard"
+    container_name: "PHANTOM-tensorboard-rl"
+    ports:
+      - "6007:6006"
+    volumes:
+      - ./sim/rl/runs:/logs
+    command: tensorboard --logdir=/logs --host=0.0.0.0 --port=6006
+    restart: unless-stopped
+
+  tensorboard-ml:
+    image: tensorflow/tensorflow:latest
+    container_name: "PHANTOM-tensorboard-ml"
     ports:
       - "6006:6006"
     volumes:

From c8c44d0453c235d03318c5a0f6019bf24456a02c Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 17 Dec 2025 17:41:16 +0100
Subject: [PATCH 06/99] refactor to align more with research in the env sims

---
 sim/rl/engine.py      | 220 ++++++++++++++++++++++++++++++++++++
 sim/rl/environment.py | 255 ++++++++++--------------------------------
 sim/rl/train.py       | 149 ++++++++++++++++++++++++
 3 files changed, 431 insertions(+), 193 deletions(-)
 create mode 100644 sim/rl/engine.py
 create mode 100644 sim/rl/train.py

diff --git a/sim/rl/engine.py b/sim/rl/engine.py
new file mode 100644
index 0000000..6d913f3
--- /dev/null
+++ b/sim/rl/engine.py
@@ -0,0 +1,220 @@
+import numpy as np
+import pandas as pd
+from abc import ABC, abstractmethod
+from typing import Dict, Any
+from environment import BusinessLogicConstraints
+
+
+class BasePricingEngine(ABC):
+    """base interface for all pricing engines"""
+    def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
+        self.c = constraints
+        self.rng = np.random.default_rng(seed)
+        self.step_count = 0
+
+    @abstractmethod
+    def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
+        """compute new prices given current state and observation from environment
+
+        args:
+            current_prices: current price vector [N]
+            observation: dict containing 'price', 'demand', and possibly interaction data
+
+        returns:
+            new_prices: updated price vector [N]
+        """
+        pass
+
+    @abstractmethod
+    def update(obs, reward, done, info):
+        pass
+
+
+
+    def reset(self):
+        """reset engine state for new episode"""
+        self.step_count = 0
+
+
+class WildPricingEngine(BasePricingEngine):
+    """production-like pricing using online elasticity estimation via EWMA regression"""
+    def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
+        super().__init__(constraints, seed)
+        # per-product unit costs (unknown to customers; known to platform)
+        self.unit_cost = self.rng.uniform(8.0, 40.0, size=self.c.product_catelogue_size).astype(np.float32)
+        # online elasticity estimate (start moderately elastic)
+        self.e_hat = np.full((self.c.product_catelogue_size,), -1.3, dtype=np.float32)
+        # EWMA state for log-log regression
+        self.mu_logp = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        self.mu_logq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        self.cov_pq  = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        self.var_p   = np.ones(self.c.product_catelogue_size, dtype=np.float32)
+        # knobs typical in production
+        self.lr = 0.08  # step size when smoothing prices toward p_star
+        self.ewma = 0.05  # EWMA weight given to the newest observation
+        self.eps_explore = 0.03  # per-step probability of adding exploration noise
+        self.explore_scale = 0.03  # std-dev of the multiplicative exploration noise
+
+    def _safe_elasticity(self, e: np.ndarray) -> np.ndarray:
+        return np.clip(e, -5.0, -1.05)  # keeps e + 1 < 0 so e / (e + 1) in the p_star formula stays finite and positive
+
+    def reset(self):
+        super().reset()  # note: unit_cost and self.rng are not re-initialised by this method
+        self.e_hat = np.full((self.c.product_catelogue_size,), -1.3, dtype=np.float32)
+        self.mu_logp = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        self.mu_logq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        self.cov_pq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        self.var_p = np.ones(self.c.product_catelogue_size, dtype=np.float32)
+
+    def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
+        self.step_count += 1
+        # extract demand signal (from env observation) as proxy for sales; all zeros if the 'demand' key is absent
+        demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32))
+        return self._update_from_demand(current_prices, demand)
+
+    def _update_from_demand(self, prices: np.ndarray, sold: np.ndarray) -> np.ndarray:
+        # log transforms (prices floored at 1e-3; add 1 to quantities to handle zeros)
+        logp = np.log(np.clip(prices, 1e-3, None)).astype(np.float32)
+        logq = np.log(sold + 1.0).astype(np.float32)
+        # EWMA moments for per-product regression: logq ≈ a + e*logp
+        a = self.ewma
+        dp = logp - self.mu_logp  # deviations are taken against the means *before* this step's update
+        dq = logq - self.mu_logq
+        self.mu_logp = (1 - a) * self.mu_logp + a * logp
+        self.mu_logq = (1 - a) * self.mu_logq + a * logq
+        self.cov_pq = (1 - a) * self.cov_pq + a * (dp * dq)
+        self.var_p = (1 - a) * self.var_p + a * (dp * dp + 1e-6)
+        e_new = self.cov_pq / (self.var_p + 1e-6)  # regression slope = cov / var
+        self.e_hat = self._safe_elasticity(0.9 * self.e_hat + 0.1 * e_new)  # blend 90% old / 10% new, then clamp
+        # profit-optimal price for isoelastic demand (if e < -1)
+        e = self.e_hat
+        p_star = self.unit_cost * (e / (e + 1.0))
+        # smooth toward p_star
+        new_prices = (1 - self.lr) * prices + self.lr * p_star
+        # exploration (small random perturbations)
+        if self.rng.random() < self.eps_explore:
+            noise = self.rng.normal(0.0, self.explore_scale, size=new_prices.shape).astype(np.float32)
+            new_prices = new_prices * (1.0 + noise)
+        # apply business guardrails (max change + bounds)
+        max_adj = self.c.max_price_adjustment
+        ratio = np.clip(new_prices / (prices + 1e-6), 1 - max_adj, 1 + max_adj)  # cap the relative change per step
+        new_prices = prices * ratio
+        new_prices = np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)
+        return new_prices
+
+
+class StaticPricingEngine(BasePricingEngine):
+    """baseline: fixed prices throughout episode"""
+    def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
+        super().__init__(constraints, seed)
+        self.fixed_prices = None  # captured lazily from the first compute_prices call
+
+    def reset(self):
+        super().reset()
+        self.fixed_prices = None  # forget the stored prices so the next call captures them again
+
+    def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
+        self.step_count += 1
+        if self.fixed_prices is None:
+            self.fixed_prices = current_prices.copy()
+        return self.fixed_prices.copy()  # hand out a copy so callers cannot mutate the stored prices
+
+
+class SimpleDemandEngine(BasePricingEngine):
+    """demand-driven pricing: increase price when demand rises, decrease when it falls"""
+    def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
+        super().__init__(constraints, seed)
+        self.prev_demand = None
+        self.lr = 0.05
+
+    def reset(self):
+        super().reset()
+        self.prev_demand = None
+
+    def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
+        self.step_count += 1
+        demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32))
+        if self.prev_demand is None:
+            self.prev_demand = demand.copy()
+            return current_prices.copy()
+        # simple rule: if demand increases, raise price; if decreases, lower price
+        delta_d = demand - self.prev_demand
+        price_adj = self.lr * np.sign(delta_d) * np.abs(delta_d) / (np.abs(self.prev_demand) + 1.0)
+        new_prices = current_prices * (1.0 + price_adj)
+        self.prev_demand = demand.copy()
+        # apply constraints
+        max_adj = self.c.max_price_adjustment
+        ratio = np.clip(new_prices / (current_prices + 1e-6), 1 - max_adj, 1 + max_adj)
+        new_prices = current_prices * ratio
+        return np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)
+
+
+class RandomWalkEngine(BasePricingEngine):
+    """random walk pricing with mean reversion"""
+    def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
+        super().__init__(constraints, seed)
+        self.target_price = None  # reversion anchor, captured from the first compute_prices call
+        self.volatility = 0.02  # std-dev of the per-step multiplicative noise
+
+    def reset(self):
+        super().reset()
+        self.target_price = None
+
+    def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
+        self.step_count += 1
+        if self.target_price is None:
+            self.target_price = current_prices.copy()
+        # random walk with mean reversion toward target (observation is not used by this engine)
+        noise = self.rng.normal(0.0, self.volatility, size=current_prices.shape).astype(np.float32)
+        reversion = 0.01 * (self.target_price - current_prices)  # additive pull back toward the anchor
+        new_prices = current_prices * (1.0 + noise) + reversion
+        # apply constraints
+        max_adj = self.c.max_price_adjustment
+        ratio = np.clip(new_prices / (current_prices + 1e-6), 1 - max_adj, 1 + max_adj)  # cap the relative change per step
+        new_prices = current_prices * ratio
+        return np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)
+
+
+class ThompsonSamplingEngine(BasePricingEngine):
+    """bayesian bandit approach per product treating price as discrete action"""
+    def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
+        super().__init__(constraints, seed)
+        self.n_price_levels = 5  # number of discrete price arms per product
+        self.alpha = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)  # Beta "a" parameter per (product, arm)
+        self.beta = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)  # Beta "b" parameter per (product, arm)
+        self.price_grid = None  # built lazily from the first prices seen in an episode
+        self.last_actions = None  # arm index chosen per product on the previous step
+
+    def reset(self):
+        super().reset()
+        self.alpha = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
+        self.beta = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
+        self.price_grid = None
+        self.last_actions = None
+
+    def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
+        self.step_count += 1
+        if self.price_grid is None:
+            # define price grid per product: evenly spaced levels from 70% to 130% of the first prices seen
+            lo = current_prices * 0.7
+            hi = current_prices * 1.3
+            self.price_grid = np.linspace(lo, hi, self.n_price_levels).T  # transposed so it is indexed [product, level]
+        demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32))
+        # update beliefs based on last action
+        if self.last_actions is not None:
+            for i in range(self.c.product_catelogue_size):
+                a = self.last_actions[i]
+                reward = demand[i]  # the observed demand is used directly as the bandit reward
+                if reward > 0.5:
+                    self.alpha[i, a] += reward  # counted as success, weighted by the demand value itself
+                else:
+                    self.beta[i, a] += 1.0  # counted as a single failure
+        # thompson sampling: sample from posterior, pick best
+        new_prices = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        actions = np.zeros(self.c.product_catelogue_size, dtype=int)
+        for i in range(self.c.product_catelogue_size):
+            theta = self.rng.beta(self.alpha[i], self.beta[i]).astype(np.float32)  # one posterior draw per arm
+            actions[i] = int(np.argmax(theta))
+            new_prices[i] = self.price_grid[i, actions[i]]
+        self.last_actions = actions
+        return np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)  # NOTE(review): no max_price_adjustment cap here, unlike the other engines — confirm intended
diff --git a/sim/rl/environment.py b/sim/rl/environment.py
index 19f9ad4..fd725f8 100644
--- a/sim/rl/environment.py
+++ b/sim/rl/environment.py
@@ -1,5 +1,7 @@
+from sys import intern
 import gymnasium as gym
 from gymnasium import spaces
+from matplotlib import interactive
 import numpy as np
 from dataclasses import dataclass
 import pandas as pd
@@ -24,7 +26,7 @@ class BusinessLogicConstraints():
     coi_sigmoid_temp: float = 1.25
     base_human_demand: float = 0.08
     base_agent_demand: float = 0.05
-    human_price_elasticity: float = -1.2
+    human_price_elasticity: float = -1.2 # assumptions here
     agent_price_elasticity: float = -0.6
     w_agent_loss: float = 1.0
     w_volatility: float = 5.0
@@ -35,31 +37,25 @@ class BusinessLogicConstraints():
 def _sigmoid(x: np.ndarray) -> np.ndarray:
     return 1.0 / (1.0 + np.exp(-x))
 
-
-def simple_agent_detector(session_df: pd.DataFrame) -> pd.Series:
-    # baseline heuristic: high velocity + low conversion
-    v = session_df.get("interaction_velocity", pd.Series(0.0, index=session_df.index))
-    cr = session_df.get("conversion_rate", pd.Series(0.0, index=session_df.index))
-    total = session_df.get("total_interactions", pd.Series(0, index=session_df.index))
-    return (total >= 12) & (v >= 0.20) & (cr <= 0.01)
-
-
 class CommercePlatform:
-    def __init__(self, product_catelogue_size: int, max_price: float, min_price: float,
-                 constraints: BusinessLogicConstraints, agent_detector: Optional[Callable[[pd.DataFrame], pd.Series]] = None,
-                 use_defense: bool = False):
+    """
+    This is just an extension of the state management for the environment; it does not implement anything dynamic, it only helps us simulate demand.
+    """
+    def __init__(self,
+                 product_catelogue_size: int,
+                 max_price: float,
+                 min_price: float,
+                 constraints: BusinessLogicConstraints):
         self.product_catelogue_size = product_catelogue_size
+        self.product_supply = np.random.uniform(low=10, high=50, size=(self.product_catelogue_size,))
         self.max_price = max_price
         self.min_price = min_price
         self.constraints = constraints
-        self.use_defense = use_defense
-        self.agent_detector = agent_detector
         self.simulation_history: List[Dict[str, Any]] = []
         self._rng = np.random.default_rng(constraints.seed)
-        self._popularity = self._rng.lognormal(mean=0.0, sigma=0.6, size=self.product_catelogue_size)
-        self._popularity = self._popularity / (self._popularity.mean() + 1e-12)
         self._last_interaction_df: pd.DataFrame = pd.DataFrame()
 
+
     def setup_true_demand(self, prices: np.ndarray) -> Dict[str, np.ndarray]:
         # ground truth purchase propensities
         p = np.clip(prices, self.min_price, self.max_price)
@@ -67,14 +63,19 @@ class CommercePlatform:
         human_prob = self.constraints.base_human_demand * (pn ** self.constraints.human_price_elasticity)
         agent_prob = self.constraints.base_agent_demand * (pn ** self.constraints.agent_price_elasticity)
         return {
-            "human_purchase_prob": np.clip(human_prob * self._popularity, 0.0, 0.95),
-            "agent_purchase_prob": np.clip(agent_prob * self._popularity, 0.0, 0.95)
+            "human_purchase_prob": np.clip(human_prob, 0.0, 0.95),
+            "agent_purchase_prob": np.clip(agent_prob, 0.0, 0.95)
         }
 
-    def _session_markup_multiplier(self, signal_score: float) -> float:
-        # session-based COI markup based on demand signal expression
-        x = (signal_score - self.constraints.coi_threshold) / max(self.constraints.coi_sigmoid_temp, 1e-6)
-        return 1.0 + self.constraints.coi_strength * float(_sigmoid(np.array([x]))[0])
+    def _load_behavioral_profile(actor : str, demand_forcing):
+        """
+        This returns a markov chain with average weights which we get from interaction data of our experiments.
+        This defines transition probabilities between different events:
+        search -> view_item_price_binN: 0.7
+        view_item_price_binN -> add_to_cart: 0.2
+        we also must reweight with the demand_forcing vector or purchase probabilities per-product
+        """
+
 
     def _simulate_sessions(self, base_prices: np.ndarray) -> pd.DataFrame:
         demand = self.setup_true_demand(base_prices)
@@ -84,94 +85,32 @@ class CommercePlatform:
         T = self.constraints.sessions_per_step
         n_agent_sessions = int(round(T * self.constraints.agent_share))
         n_human_sessions = T - n_agent_sessions
-
-        # human sessions: normal browse with possible purchase
-        for s in range(n_human_sessions):
-            session_id = f"h_{len(events)}_{s}"
-            k = int(self._rng.integers(1, 4))
-            prod_ids = self._rng.choice(self.product_catelogue_size, size=k, replace=False)
-            t = 0.0
-            inter_times = self._rng.gamma(shape=2.0, scale=3.0, size=3 * k)
-            signal_score = 0.0
-            purchased_any = False
-
-            for i, pid in enumerate(prod_ids):
-                t += float(inter_times[i])
-                price_shown = float(base_prices[pid])
-                events.append({
-                    "session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
-                    "action": "view", "t": t, "price_shown": price_shown, "is_purchase": 0,
-                    "price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
-                })
-                signal_score += 1.0
-
-                if self._rng.random() < 0.35:
-                    t += float(inter_times[i + k])
-                    events.append({
-                        "session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
-                        "action": "cart", "t": t, "price_shown": price_shown, "is_purchase": 0,
-                        "price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
-                    })
-                    signal_score += 2.0
-
-                if (not purchased_any) and (self._rng.random() < float(human_pprob[pid])):
-                    t += float(inter_times[i + 2 * k])
-                    mult = self._session_markup_multiplier(signal_score)
-                    price_paid = float(np.clip(base_prices[pid] * mult, self.min_price, self.max_price))
-                    events.append({
-                        "session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
-                        "action": "purchase", "t": t, "price_shown": float(base_prices[pid]), "is_purchase": 1,
-                        "price_paid": price_paid, "oracle_price_paid": price_paid, "signal_score": signal_score,
-                    })
-                    purchased_any = True
-
-        # agent sessions: split recon/purchase to circumvent COI
         n_agent_ids = max(1, n_agent_sessions // 2)
-        for a in range(n_agent_ids):
-            agent_id = f"a_{a}"
-            recon_session_id = f"{agent_id}_recon"
-            t = 0.0
-            n_views = int(self._rng.poisson(lam=8) * self.constraints.agent_recon_multiplier) + 5
-            inter_times = self._rng.gamma(shape=2.0, scale=0.6, size=max(n_views, 1))
-            prod_ids = self._rng.integers(0, self.product_catelogue_size, size=n_views)
-            recon_signal = 0.0
+        session_map = {
+            'humans': n_human_sessions,
+            'agents': n_agent_ids
+        }
+        pprob_map = {
+            'humans': human_pprob,
+            'agents': agent_pprob
+        }
+        joint_events = []
+        for actor, n_sessions in session_map.items():
+            bp = _load_behavioral_profile(actor, pprob_map[actor])
+            counter = 0
+            events = []
+            while counter < n_sessions:
+                session_events = []
+                while len(session_events) == 0 or session_events[-1]['action'] == 'checkout':
+                    interaction_event = bp.sample(self._rng)
+                    interaction_event['session_id'] = f'{actor}_{counter:06d}'
+                    # TODO any other assignments
+                    session_events.append(interaction_event)
+                events.extend(session_events)
+                counter += 1
+            joint_events.extend(events)
 
-            for i, pid in enumerate(prod_ids):
-                t += float(inter_times[i])
-                events.append({
-                    "session_id": recon_session_id, "actor": "agent", "agent_id": agent_id, "product_id": int(pid),
-                    "action": "view", "t": t, "price_shown": float(base_prices[pid]), "is_purchase": 0,
-                    "price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
-                })
-                recon_signal += 1.0
-
-            # clean purchase session with minimal interactions
-            if self._rng.random() < self.constraints.agent_purchase_probability:
-                purchase_session_id = f"{agent_id}_clean"
-                pid = int(self._rng.integers(0, self.product_catelogue_size))
-                t2 = 0.0
-                clean_signal = 0.0
-                t2 += float(self._rng.gamma(shape=2.0, scale=0.7))
-                events.append({
-                    "session_id": purchase_session_id, "actor": "agent", "agent_id": agent_id, "product_id": pid,
-                    "action": "view", "t": t2, "price_shown": float(base_prices[pid]), "is_purchase": 0,
-                    "price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
-                })
-                clean_signal += 1.0
-
-                if self._rng.random() < float(agent_pprob[pid]):
-                    t2 += float(self._rng.gamma(shape=2.0, scale=0.7))
-                    obs_mult = self._session_markup_multiplier(clean_signal)
-                    obs_paid = float(np.clip(base_prices[pid] * obs_mult, self.min_price, self.max_price))
-                    oracle_mult = self._session_markup_multiplier(recon_signal)  # oracle links recon->purchase
-                    oracle_paid = float(np.clip(base_prices[pid] * oracle_mult, self.min_price, self.max_price))
-                    events.append({
-                        "session_id": purchase_session_id, "actor": "agent", "agent_id": agent_id, "product_id": pid,
-                        "action": "purchase", "t": t2, "price_shown": float(base_prices[pid]), "is_purchase": 1,
-                        "price_paid": obs_paid, "oracle_price_paid": oracle_paid, "signal_score": clean_signal,
-                    })
-
-        return pd.DataFrame(events)
+        return pd.DataFrame(joint_events)
 
     def compute_interaction_features(self, interaction_df: pd.DataFrame) -> Dict[str, float]:
         if interaction_df.empty:
@@ -183,6 +122,7 @@ class CommercePlatform:
         return {"mean_sale_price": mean_sale_price, "look_to_book": float(views / (buys + 1e-6))}
 
     def _session_feature_table(self, df: pd.DataFrame) -> pd.DataFrame:
+        # TODO: adapt this
         if df.empty:
             return pd.DataFrame()
         g = df.groupby("session_id", sort=False)
@@ -208,73 +148,6 @@ class CommercePlatform:
             "is_agent": is_agent.astype(bool),
         }).reset_index()
 
-    def demand_estimate(self, interaction_df: pd.DataFrame, exclude_sessions: Optional[pd.Series] = None) -> np.ndarray:
-        # proxy demand from weighted interaction events
-        if interaction_df.empty:
-            return np.zeros(self.product_catelogue_size, dtype=np.float32)
-        df = interaction_df
-        if exclude_sessions is not None:
-            bad_sessions = set(exclude_sessions.loc[exclude_sessions].index)
-            df = df[~df["session_id"].isin(bad_sessions)]
-        weights = {"view": 0.15, "cart": 0.75, "purchase": 2.5}
-        w = df["action"].map(weights).fillna(0.0).to_numpy(dtype=float)
-        prod = df["product_id"].to_numpy(dtype=int)
-        q_hat = np.zeros(self.product_catelogue_size, dtype=float)
-        np.add.at(q_hat, prod, w)
-        return q_hat.astype(np.float32)
-
-    def run_pricing_simulation(self, prices: np.ndarray) -> Dict[str, Any]:
-        interaction_df = self._simulate_sessions(prices)
-        self._last_interaction_df = interaction_df
-        session_df = self._session_feature_table(interaction_df)
-
-        predicted_agent_sessions = None
-        if (self.use_defense and self.agent_detector is not None and not session_df.empty):
-            predicted_agent_sessions = self.agent_detector(session_df.set_index("session_id"))
-
-        q_hat_naive = self.demand_estimate(interaction_df, exclude_sessions=None)
-        q_hat_defended = self.demand_estimate(interaction_df, exclude_sessions=predicted_agent_sessions) \
-            if predicted_agent_sessions is not None else q_hat_naive.copy()
-
-        true_human = np.zeros(self.product_catelogue_size, dtype=float)
-        true_agent = np.zeros(self.product_catelogue_size, dtype=float)
-        if not interaction_df.empty:
-            purchases = interaction_df[interaction_df["action"] == "purchase"]
-            if not purchases.empty:
-                for _, r in purchases.iterrows():
-                    if r["actor"] == "human":
-                        true_human[int(r["product_id"])] += 1.0
-                    else:
-                        true_agent[int(r["product_id"])] += 1.0
-
-        revenue_observed = float(interaction_df["price_paid"].sum()) if not interaction_df.empty else 0.0
-        revenue_oracle = float(interaction_df["oracle_price_paid"].sum()) if not interaction_df.empty else 0.0
-        agent_loss = max(0.0, revenue_oracle - revenue_observed)
-
-        eps = 1e-6
-        internal_error_naive = np.abs(true_human - q_hat_naive) / (true_human + eps)
-        internal_error_def = np.abs(true_human - q_hat_defended) / (true_human + eps)
-        interaction_features = self.compute_interaction_features(interaction_df)
-
-        summary = {
-            "prices": prices.copy(),
-            "interaction_df": interaction_df,
-            "session_df": session_df,
-            "q_hat_naive": q_hat_naive,
-            "q_hat_defended": q_hat_defended,
-            "true_human_demand": true_human.astype(np.float32),
-            "true_agent_purchases": true_agent.astype(np.float32),
-            "internal_error_naive": internal_error_naive.astype(np.float32),
-            "internal_error_defended": internal_error_def.astype(np.float32),
-            "interaction_features": interaction_features,
-            "revenue_observed": revenue_observed,
-            "revenue_oracle": revenue_oracle,
-            "agent_loss": agent_loss,
-            "predicted_agent_sessions": predicted_agent_sessions,
-        }
-        self.simulation_history.append(summary)
-        return summary
-
     def get_interaction_data(self) -> np.ndarray:
         if self._last_interaction_df.empty:
             return np.array([], dtype=object)
@@ -284,7 +157,7 @@ class CommercePlatform:
 class PHANTOMEnv(gym.Env):
     metadata = {"render_modes": []}
 
-    def __init__(self, use_defense: bool = False):
+    def __init__(self, constraints):
         super().__init__()
         self.constraints = BusinessLogicConstraints()
         self.action_space = spaces.Box(low=-self.constraints.max_price_adjustment,
@@ -301,14 +174,13 @@ class PHANTOMEnv(gym.Env):
                     high=np.full((self.constraints.product_catelogue_size,), 1e6, dtype=np.float32),
                     dtype=np.float32),
             })
+            # TODO: define more features that we compute from the interaction data
         })
         self.commerce_platform = CommercePlatform(
             product_catelogue_size=self.constraints.product_catelogue_size,
             max_price=self.constraints.system_max_price,
             min_price=self.constraints.system_min_price,
-            constraints=self.constraints,
-            agent_detector=simple_agent_detector,
-            use_defense=use_defense)
+            constraints=self.constraints)
         self._rng = np.random.default_rng(self.constraints.seed)
         self.t = 0
         self._prev_prices: Optional[np.ndarray] = None
@@ -336,17 +208,13 @@ class PHANTOMEnv(gym.Env):
         new_prices = np.clip(base_prices * (1.0 + action.astype(np.float32)),
                            self.constraints.system_min_price,
                            self.constraints.system_max_price).astype(np.float32)
-        result = self.commerce_platform.run_pricing_simulation(new_prices)
-
-        if self.commerce_platform.use_defense:
-            demand_est = result["q_hat_defended"]
-            internal_err = result["internal_error_defended"]
-        else:
-            demand_est = result["q_hat_naive"]
-            internal_err = result["internal_error_naive"]
 
         self.state["elasticity"]["price"] = new_prices
-        self.state["elasticity"]["demand"] = demand_est
+        # TODO: use the commerce platform to simulate sessions
+        interactions_df = self.commerce_platform._simulate_sessions(new_prices)
+        result = self.commerce_platform.compute_interaction_features(interactions_df)
+        # TODO: implement COI computation to use in reward
+        COI = 0.0
 
         volatility = 0.0 if self._prev_prices is None else \
             float(np.mean(np.abs((new_prices - self._prev_prices) / (self._prev_prices + 1e-6))))
@@ -354,12 +222,13 @@ class PHANTOMEnv(gym.Env):
 
         revenue_observed = float(result["revenue_observed"])
         agent_loss = float(result["agent_loss"])
-        err_mean = float(np.mean(internal_err))
 
         reward = (revenue_observed
-                 - self.constraints.w_agent_loss * agent_loss
-                 - self.constraints.w_volatility * volatility
-                 - self.constraints.w_estimation_error * err_mean)
+                  - COI
+                  - self.constraints.w_agent_loss * agent_loss
+                  - self.constraints.w_volatility * volatility
+                  - self.constraints.w_estimation_error
+                  )
 
         terminated = self.t >= self.constraints.episode_length
         info = {
diff --git a/sim/rl/train.py b/sim/rl/train.py
new file mode 100644
index 0000000..41a87ab
--- /dev/null
+++ b/sim/rl/train.py
@@ -0,0 +1,149 @@
+import numpy as np
+import logging
+from pathlib import Path
+from typing import Dict, Type, Optional
+import pickle
+from torch import neg_
+from torch.utils.tensorboard import SummaryWriter
+from environment import PHANTOMEnv, FastTrainingConstraints, BusinessLogicConstraints
+from engine import (BasePricingEngine, WildPricingEngine, StaticPricingEngine,
+                   SimpleDemandEngine, RandomWalkEngine, ThompsonSamplingEngine)
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
+logger = logging.getLogger(__name__)
+
+
+
+"""
+Target training loop:
+have base prices p0 from env reset and run the env step, collect reward and metrics
+pass this to the pricing engine which computes the price action to take based on previous reward by learning
+the new action gets passed to the step
+so we alternate, step -> reward -> engine (produces price delta) -> step with price delta -> reward
+to make sure the reinforcement learning inside the engine can learn we need to have trajectory of prices
+CURRENT SOLUTION BELOW does not implement correct learning or updates.
+"""
+
+class EngineTrainer:
+    """wrapper to run pricing engines through episodes and collect metrics"""
+    def __init__(self, engine: BasePricingEngine, env: PHANTOMEnv,
+                 tb_writer: Optional[SummaryWriter] = None):
+        self.engine = engine
+        self.env = env
+        self.episode_metrics = []
+        self.tb_writer = tb_writer
+        self.global_step = 0
+
+    def train(self, n_episodes: int, seed: int = 42):
+
+        obs, _ = self.env.reset(seed=seed)
+        prices = None
+        for ep in range(n_episodes):
+            prices = self.engine.compute_prices(prices, obs
+            obs, reward, done, _, info = self.env.step(prices)
+            self.engine.update(obs, reward, done, info)
+        return self
+
+
+
+
+
+
+        return self.episode_metrics
+
+    def evaluate(self, n_episodes: int = 10, seed: int = 100) -> Dict:
+        """evaluate trained engine"""
+        results = {k: [] for k in ['total_reward', 'revenue_observed', 'revenue_oracle',
+                                   'agent_loss', 'ux_volatility', 'look_to_book']}
+        for ep in range(n_episodes):
+            metrics = self.run_episode(seed=seed + ep)
+            for k in results:                results[k].append(metrics[k])
+        return {k: (np.mean(v), np.std(v)) for k, v in results.items()}
+
+
+def make_env(fast: bool = True):
+    constraints = FastTrainingConstraints() if fast else BusinessLogicConstraints()
+    return PHANTOMEnv(constraints=constraints)
+
+
+def train_engine(engine_cls: Type[BasePricingEngine], env: PHANTOMEnv,
+                n_episodes: int, seed: int = 42,
+                tb_writer: Optional[SummaryWriter] = None) -> EngineTrainer:
+    constraints = env.constraints
+    engine = engine_cls(constraints=constraints, seed=seed)
+    trainer = EngineTrainer(engine, env, tb_writer=tb_writer)
+    trainer.train(n_episodes, seed=seed)
+    return trainer
+
+
+def save_trainer(trainer: EngineTrainer, path: Path):
+    """save engine state and metrics"""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with open(path, 'wb') as f:
+        pickle.dump({
+            'engine': trainer.engine,
+            'metrics': trainer.episode_metrics
+        }, f)
+    logger.info(f"Saved trainer to {path}")
+
+
+def load_trainer(path: Path, env: PHANTOMEnv,
+                 tb_writer: Optional[SummaryWriter] = None) -> EngineTrainer:
+    """load saved engine"""
+    with open(path, 'rb') as f:
+        data = pickle.load(f)
+    trainer = EngineTrainer(data['engine'], env, tb_writer=tb_writer)
+    trainer.episode_metrics = data['metrics']
+    return trainer
+
+
+if __name__ == "__main__":
+    base_dir = Path("./runs")
+    base_dir.mkdir(exist_ok=True)
+
+    engines = {
+        "Wild": WildPricingEngine,
+        "Static": StaticPricingEngine,
+#        "SimpleDemand": SimpleDemandEngine,
+        "RandomWalk": RandomWalkEngine,
+        "ThompsonSampling": ThompsonSamplingEngine,
+    }
+    defenses = [False, True]
+    n_train_episodes = 50
+    n_eval_episodes = 10
+    seed = 42
+    fast_mode = True
+
+    logger.info(f"Training config: {n_train_episodes} episodes per engine, fast_mode={fast_mode}")
+
+    trained_trainers = {}
+
+    for engine_name, engine_cls in engines.items():
+        for use_defense in defenses:
+            defense_label = "defense_on" if use_defense else "defense_off"
+            run_name = f"{engine_name}_{defense_label}"
+            log_dir = base_dir / run_name
+            log_dir.mkdir(parents=True, exist_ok=True)
+
+            logger.info(f"Training {engine_name} with defense={use_defense}")
+            logger.info(f"Log directory: {log_dir}")
+
+            env = make_env(fast=fast_mode)
+            tb_writer = SummaryWriter(log_dir=str(log_dir))
+            trainer = train_engine(engine_cls, env, n_train_episodes, seed, tb_writer=tb_writer)
+            tb_writer.close()
+
+            save_path = log_dir / "trainer.pkl"
+            save_trainer(trainer, save_path)
+
+            trained_trainers[run_name] = (trainer, env)
+
+    logger.info("Starting evaluation")
+
+    for run_name, (trainer, env) in trained_trainers.items():
+        logger.info(f"Evaluating {run_name}")
+        results = trainer.evaluate(n_episodes=n_eval_episodes, seed=seed + 1000)
+        for metric, (mean, std) in results.items():
+            logger.info(f"  {metric:20s}: {mean:10.2f} ± {std:6.2f}")
+
+    logger.info(f"Results saved to: {base_dir}")

From 57a7e0c5717132266d55287f96327e9546f647c8 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 17 Dec 2025 18:50:04 +0100
Subject: [PATCH 07/99] simple code cleanup

---
 sim/rl/engine.py | 7 +++++++
 sim/rl/train.py  | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/sim/rl/engine.py b/sim/rl/engine.py
index 6d913f3..e0caca8 100644
--- a/sim/rl/engine.py
+++ b/sim/rl/engine.py
@@ -1,3 +1,4 @@
+from os import kill
 import numpy as np
 import pandas as pd
 from abc import ABC, abstractmethod
@@ -5,6 +6,11 @@ from typing import Dict, Any
 from environment import BusinessLogicConstraints
 
 
+"""
+An angine by default should have its own demand estimation mechanism from the observed observations whihc are the computer feature.
+From these features we then follow the researc hstructure of q -> p with a testable and must be updatable mechanism.
+"""
+
 class BasePricingEngine(ABC):
     """base interface for all pricing engines"""
     def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
@@ -12,6 +18,7 @@ class BasePricingEngine(ABC):
         self.rng = np.random.default_rng(seed)
         self.step_count = 0
 
+
     @abstractmethod
     def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
         """compute new prices given current state and observation from environment
diff --git a/sim/rl/train.py b/sim/rl/train.py
index 41a87ab..ba257de 100644
--- a/sim/rl/train.py
+++ b/sim/rl/train.py
@@ -39,7 +39,7 @@ class EngineTrainer:
         obs, _ = self.env.reset(seed=seed)
         prices = None
         for ep in range(n_episodes):
-            prices = self.engine.compute_prices(prices, obs
+            prices = self.engine.compute_prices(prices, obs)
             obs, reward, done, _, info = self.env.step(prices)
             self.engine.update(obs, reward, done, info)
         return self

From b1882b6049bcb25462ed24321932709675af767b Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Fri, 9 Jan 2026 20:20:31 +0100
Subject: [PATCH 08/99] feature: MDP behavior mappers (unlinked)

---
 sim/rl/behavior_loader/loader.py |  63 ++++++++++++++
 sim/rl/behavior_loader/models.py | 137 +++++++++++++++++++++++++++++++
 2 files changed, 200 insertions(+)
 create mode 100644 sim/rl/behavior_loader/loader.py
 create mode 100644 sim/rl/behavior_loader/models.py

diff --git a/sim/rl/behavior_loader/loader.py b/sim/rl/behavior_loader/loader.py
new file mode 100644
index 0000000..99a1541
--- /dev/null
+++ b/sim/rl/behavior_loader/loader.py
@@ -0,0 +1,63 @@
+import os
+from pydantic import BaseModel as Base
+import json
+
+class PayloadModel(Base):
+    sessionId: str
+    experimentId: str | None
+    eventName: str
+    page: str | None
+    productId: str | None
+    metadata: dict
+    storeMode: str
+    userAgent: str
+    ts: str
+
+class ValueModel(Base):
+    payload: PayloadModel
+    encoding: str
+    isPayloadNull: bool
+    schemaId: int
+    size: int
+
+class InteractionModel(Base):
+    partitionID: int
+    offset: int
+    timestamp: int
+    compression: str
+    isTransactional: bool
+    headers: list
+    key: dict
+    value: ValueModel
+
+class Loader:
+    def __init__(self, src_dir: str):
+        self.src_dir = src_dir
+        self.entries = os.listdir(src_dir)
+        if not self.entries: raise ValueError("empty directory")
+        self.data = self._load_sessions()
+
+    def _is_admin_page(self, interaction: InteractionModel) -> bool:
+        page = interaction.value.payload.page
+        return page and page.startswith("/admin/")
+
+    def _load_sessions(self) -> dict:
+        sessions = {}
+        for entry in self.entries:
+            int_path = f"{self.src_dir}/{entry}/int.json"
+            raw = json.load(open(int_path))
+            ints = [InteractionModel(**i) for i in raw]
+            sessions[entry] = [i for i in ints if not self._is_admin_page(i)]
+        return sessions
+
+    def get_data(self) -> dict:
+        return self.data
+
+    def get_entries(self) -> tuple[list[str], int]:
+        return self.entries, len(self.entries)
+
+if __name__ == "__main__":
+    DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
+    loader = Loader(DIR)
+    _, n = loader.get_entries()
+    print(f"Loaded {n} sessions from {DIR}")
diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py
new file mode 100644
index 0000000..f8e92b7
--- /dev/null
+++ b/sim/rl/behavior_loader/models.py
@@ -0,0 +1,137 @@
+from loader import Loader
+from collections import defaultdict
+from typing import Dict, List, Tuple, Set
+import numpy as np
+import graphviz
+
+DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
+
+class BehaviorModel:
+    def __init__(self, src_dir: str = DIR):
+        self.loader = Loader(src_dir)
+        self.data = self.loader.get_data()
+        self.entries, self.num_entries = self.loader.get_entries()
+        self.mdp = None
+
+    def _state_repr(self, evt) -> str:
+        p = evt.value.payload
+        return f"{p.page or 'unk'}|{p.productId or 'none'}|{p.eventName}"
+
+    def _extract_sessions(self):
+        # transform raw events into sequential state trajectories per session
+        trajectories = []
+        for sid, evts in self.data.items():
+            if len(evts) < 2: continue
+            states = [self._state_repr(e) for e in sorted(evts, key=lambda x: x.timestamp)]
+            trajectories.append(states)
+        return trajectories
+
+    def _calc_transitions(self, trajectories: List[List[str]]) -> Tuple[Dict, Set]:
+        trans = defaultdict(lambda: defaultdict(int))
+        states = set()
+        for traj in trajectories:
+            for i in range(len(traj) - 1):
+                s, s_next = traj[i], traj[i+1]
+                trans[s][s_next] += 1
+                states.update([s, s_next])
+        return trans, states
+
+    def _calc_rewards(self, trajectories: List[List[str]]) -> Dict:
+        # reward based on session progression depth
+        rwd = defaultdict(list)
+        for traj in trajectories:
+            n = len(traj)
+            for i, s in enumerate(traj):
+                rwd[s].append(i / n)
+        return rwd
+
+    def _normalize_trans(self, counts: Dict) -> Dict:
+        return {s: {s_n: cnt/sum(nxt.values()) for s_n, cnt in nxt.items()}
+                for s, nxt in counts.items()}
+
+    def build_MDP(self) -> Dict:
+        trajs = self._extract_sessions()
+        trans_cnt, states = self._calc_transitions(trajs)
+        trans_prob = self._normalize_trans(trans_cnt)
+        state_rwd = self._calc_rewards(trajs)
+        state_val = {s: np.mean(r) for s, r in state_rwd.items()}
+
+        self.mdp = {
+            'states': sorted(list(states)),
+            'num_states': len(states),
+            'transitions': trans_prob,
+            'state_values': state_val,
+            'state_rewards': state_rwd,
+            'trans_counts': trans_cnt,
+        }
+        return self.mdp
+
+    def transition_prob(self, s: str, s_next: str) -> float:
+        if not self.mdp: raise ValueError("build MDP first")
+        return self.mdp['transitions'].get(s, {}).get(s_next, 0.0)
+
+    def state_value(self, s: str) -> float:
+        if not self.mdp: raise ValueError("build MDP first")
+        return self.mdp['state_values'].get(s, 0.0)
+
+    def sample_traj(self, start: str, max_len: int = 50) -> List[str]:
+        if not self.mdp: raise ValueError("build MDP first")
+        path = [start]
+        curr = start
+        for _ in range(max_len):
+            nxt = self.mdp['transitions'].get(curr, {})
+            if not nxt: break
+            curr = np.random.choice(list(nxt.keys()), p=list(nxt.values()))
+            path.append(curr)
+        return path
+
+def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "mdp_graph", fmt: str = "svg", view: bool = False):
+    """visualize MDP as directed graph using graphviz, aggregated by event type"""
+    if not model.mdp: raise ValueError("build MDP first")
+
+    # aggregate transitions by event type
+    evt_trans = defaultdict(lambda: defaultdict(float))
+    for s, trans in model.mdp['transitions'].items():
+        evt_src = s.split('|')[2]
+        for s_next, prob in trans.items():
+            evt_dst = s_next.split('|')[2]
+            evt_trans[evt_src][evt_dst] += prob
+
+    # normalize aggregated transitions
+    for evt_src in evt_trans:
+        total = sum(evt_trans[evt_src].values())
+        if total > 0:
+            for evt_dst in evt_trans[evt_src]:
+                evt_trans[evt_src][evt_dst] /= total
+
+    g = graphviz.Digraph(format=fmt)
+    g.attr(rankdir='LR', size='30')
+    g.attr('node', shape='circle', width='1', height='1')
+
+    # collect all event types
+    events = set(evt_trans.keys())
+    for trans in evt_trans.values():
+        events.update(trans.keys())
+
+    # add nodes for each event type
+    for evt in events:
+        g.node(evt)
+
+    # add edges above threshold
+    for evt_src in evt_trans:
+        for evt_dst, prob in evt_trans[evt_src].items():
+            if prob > threshold:
+                g.edge(evt_src, evt_dst, label=f'{prob:.2f}')
+
+    g.render(output, view=view, cleanup=True)
+    print(f"Saved MDP graph to {output}.{fmt}")
+    return g
+
+if __name__ == "__main__":
+    model = BehaviorModel(DIR)
+    mdp = model.build_MDP()
+    print(f"Built MDP: {mdp['num_states']} states, {sum(len(t) for t in mdp['transitions'].values())} transitions")
+    if not mdp['states']:
+        print("No states found")
+        exit(1)
+    visualize_mdp(model, threshold=0.05, output="mdp_viz", fmt="svg")

From c56c7f653799e2d943a40c539c05edf2eaf03d6f Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Sat, 10 Jan 2026 10:33:56 +0100
Subject: [PATCH 09/99] feature: dot exporter

---
 sim/rl/behavior_loader/models.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py
index f8e92b7..6e4201e 100644
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -85,7 +85,7 @@ class BehaviorModel:
             path.append(curr)
         return path
 
-def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "mdp_graph", fmt: str = "svg", view: bool = False):
+def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "mdp_graph", fmt: str = "svg", view: bool = False, export_dot: bool = False):
     """visualize MDP as directed graph using graphviz, aggregated by event type"""
     if not model.mdp: raise ValueError("build MDP first")
 
@@ -125,6 +125,13 @@ def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "
 
     g.render(output, view=view, cleanup=True)
     print(f"Saved MDP graph to {output}.{fmt}")
+
+    if export_dot:
+        dot_file = f"{output}.dot"
+        with open(dot_file, 'w') as f:
+            f.write(g.source)
+        print(f"Exported DOT source to {dot_file}")
+
     return g
 
 if __name__ == "__main__":
@@ -134,4 +141,4 @@ if __name__ == "__main__":
     if not mdp['states']:
         print("No states found")
         exit(1)
-    visualize_mdp(model, threshold=0.05, output="mdp_viz", fmt="svg")
+    visualize_mdp(model, threshold=0.05, output="mdp_viz", fmt="svg", export_dot=True)

From 29f51d56d1b19329980ef0eaef3c23f9dc328de7 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Sat, 10 Jan 2026 11:48:03 +0100
Subject: [PATCH 10/99] pdf rendering

---
 sim/rl/behavior_loader/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py
index 6e4201e..bce2429 100644
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -141,4 +141,4 @@ if __name__ == "__main__":
     if not mdp['states']:
         print("No states found")
         exit(1)
-    visualize_mdp(model, threshold=0.05, output="mdp_viz", fmt="svg", export_dot=True)
+    visualize_mdp(model, threshold=0.05, output="mdp_viz", fmt="pdf", export_dot=True)

From 9a8525a8544ccc6449da1b1392e5429fba710ca2 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Mon, 12 Jan 2026 10:09:55 +0100
Subject: [PATCH 11/99] chore: refactor to better map end to end

---
 backend/provider/app.py                 | 71 ++++++++++++-------------
 experiments/procesing/pricers/simple.py | 59 +++++++++++++++++---
 experiments/procesing/steps/session.py  |  1 +
 lib/model_registry.py                   | 46 ++++++++++++++++
 web/src/app/api/pricing/route.ts        | 37 +++++++------
 5 files changed, 153 insertions(+), 61 deletions(-)

diff --git a/backend/provider/app.py b/backend/provider/app.py
index fb72a9d..6f9a55d 100644
--- a/backend/provider/app.py
+++ b/backend/provider/app.py
@@ -47,53 +47,52 @@ def health() -> dict:
 
 @app.get("/api/{mode}/price/{productId}", response_model=PriceResponse)
 def get_price(mode: Literal['hotel', 'airline'], productId: str, sessionId: Optional[str] = Query(None), experimentId: Optional[str] = Query(None)):
+    """
+    THIS is the fast lookup service (mechanism).
+    Priority: session-keyed price > global optimal price > base price
+    """
     product = supabase.table(f'{mode}_products').select("metadata").eq('id', productId).execute().data[0]
     if not product: raise HTTPException(404, f"Product {productId} not found")
 
     metadata = product['metadata']
     base_price = metadata.get('base_price', 100.0)
 
-    # fetch pre-computed prices from registry
+    # PRIORITY 1: session-aware price (computed by Airflow worker)
+    if sessionId:
+        session_price = registry.get_session_price(sessionId, productId)
+        if session_price is not None:
+            return PriceResponse(
+                productId=productId,
+                price=session_price,
+                base_price=base_price,
+                markup=session_price/base_price,
+                elasticity=None,
+                model_version='session-aware'
+            )
+
+    # PRIORITY 2: global pre-computed prices (surge pricing)
     prices_df = registry.get_prices('latest')
-    elasticity_df = registry.get_elasticity('latest')
-
-    if prices_df is None:
-        # fallback: no pre-computed prices available
-        return PriceResponse(
-            productId=productId,
-            price=base_price,
-            base_price=base_price,
-            markup=1.0,
-            elasticity=None
-        )
-
-    # lookup pre-computed price for this product
-    product_price_row = prices_df[prices_df['productId'] == productId]
-    if product_price_row.empty:
-        # product not in pre-computed prices, fallback to base
-        return PriceResponse(
-            productId=productId,
-            price=base_price,
-            base_price=base_price,
-            markup=1.0,
-            elasticity=None
-        )
-
-    optimal_price = float(product_price_row['optimal_price'].iloc[0]) # TODO: use optimal_price everywhere as  aresult
-
-    # get elasticity if available
-    product_elasticity = None
-    if elasticity_df is not None:
-        product_elasticity_row = elasticity_df[elasticity_df['productId'] == productId]
-        if not product_elasticity_row.empty:
-            product_elasticity = float(product_elasticity_row['elasticity'].iloc[0])
+    if prices_df is not None:
+        product_price_row = prices_df[prices_df['productId'] == productId]
+        if not product_price_row.empty:
+            optimal_price = float(product_price_row['optimal_price'].iloc[0])
+            return PriceResponse(
+                productId=productId,
+                price=optimal_price,
+                base_price=base_price,
+                markup=optimal_price/base_price,
+                elasticity=None,
+                model_version='surge'
+            )
 
+    # PRIORITY 3: fallback to base price
     return PriceResponse(
         productId=productId,
-        price=optimal_price,
+        price=base_price,
         base_price=base_price,
-        markup=optimal_price/base_price,
-        elasticity=product_elasticity
+        markup=1.0,
+        elasticity=None,
+        model_version='base'
     )
 
 @app.get("/models")
diff --git a/experiments/procesing/pricers/simple.py b/experiments/procesing/pricers/simple.py
index 39be37a..6bdd1ca 100644
--- a/experiments/procesing/pricers/simple.py
+++ b/experiments/procesing/pricers/simple.py
@@ -3,6 +3,46 @@ import pandas as pd
 from procesing.pricers.base import PricingFunction
 
 
+def session_features_to_demand(session_features: pd.DataFrame) -> float:
+    """
+    Map session behavioral features to demand proxy.
+    THIS is the critical θ̂ → D transformation for rule-based pricing.
+
+    Logic:
+      - High velocity → agent behavior → price up (revenue recovery)
+      - High cart ratio → purchase intent → price up
+      - Low activity → discount to convert
+
+    Returns: demand proxy score (0-20 range, higher = more demand)
+    """
+    if session_features.empty:
+        return 1.0
+
+    feat = session_features.iloc[0] if len(session_features) > 0 else {}
+
+    velocity = feat.get('interaction_velocity', 0)
+    cart_ratio = feat.get('cart_to_view_ratio', 0)
+    item_views = feat.get('item_views', 0)
+    cart_adds = feat.get('cart_adds', 0)
+
+    # baseline demand
+    demand = 1.0
+
+    # agent detection: high velocity → treat as high "demand" to price up
+    if velocity > 2.0:
+        demand += 10.0  # strong agent signal
+
+    # conversion intent: cart interaction → price up
+    if cart_ratio > 0.1 or cart_adds > 0:
+        demand += 5.0
+
+    # browsing depth: many views → interest signal
+    if item_views > 3:
+        demand += min(item_views, 5.0)
+
+    return min(demand, 20.0)  # cap at 20
+
+
 class StaticPricer(PricingFunction):
     """Static pricing: always return fixed base prices"""
 
@@ -67,21 +107,24 @@ class SimpleSurgePricer(PricingFunction):
         self.surge_multiplier = surge_multiplier
         self.discount_multiplier = discount_multiplier
 
-    def fit(self, market_data : pd.DataFrame):
+    def fit(self, market_data: pd.DataFrame):
         """Extract base prices from product catalog or historical averages"""
         self.base_prices = market_data['base_price'].to_numpy() if 'base_price' in market_data.columns else market_data['price'].values
-        self.demand_history = market_data['demand'].to_numpy() if 'demand' in market_data.columns else np.zeros_like(self.base_prices)
+        return self
 
-    def predict(self) -> np.ndarray:
+    def predict(self, state_space) -> np.ndarray:
         """
         Adjust prices based on current demand using surge rules.
-        state_space.demand: demand counts per product
-        state_space.prices: current prices (fallback if base_prices not set)
+        state_space.demand: demand proxy per product (from session features)
+        state_space.prices: base prices
         """
-        current_prices = self.base_prices if self.base_prices is not None else np.ones_like(demand_vector) * 99.99
-        demand = self.demand_history if self.demand_history is not None else np.zeros_like(current_prices)
-        new_prices = current_prices.copy()
+        demand = np.asarray(state_space.demand) if state_space and hasattr(state_space, 'demand') else np.array([0])
+        base = np.asarray(state_space.prices) if state_space and hasattr(state_space, 'prices') else self.base_prices
 
+        if base is None:
+            base = np.ones(len(demand)) * 99.99
+
+        new_prices = base.copy()
         high_mask = demand >= self.high_threshold
         new_prices[high_mask] *= self.surge_multiplier
 
diff --git a/experiments/procesing/steps/session.py b/experiments/procesing/steps/session.py
index 4b950aa..ec6f27c 100644
--- a/experiments/procesing/steps/session.py
+++ b/experiments/procesing/steps/session.py
@@ -135,6 +135,7 @@ class ExtractSessionFeaturesStep(BaseContextStep):
     Vectorized session feature extraction - replaces O(n^2) per-row loop.
     Input: interactions_df
     Output: session-level feature matrix
+    THIS is our main mapping from tau (trajectory) to a feature vector theta - we need to do this very well. This is what will go into demand estimation.
     """
 
     def transform(self, X: pd.DataFrame) -> pd.DataFrame:
diff --git a/lib/model_registry.py b/lib/model_registry.py
index 92d7934..e833a1a 100755
--- a/lib/model_registry.py
+++ b/lib/model_registry.py
@@ -178,3 +178,49 @@ class ModelRegistry:
             return True
         except:
             return False
+
+    def set_session_prices(self, session_id: str, prices: Dict[str, float], ttl: int = 1800):
+        """
+        Store prices for a specific session.
+        THIS is the write path for session-aware pricing.
+
+        Args:
+            session_id: session identifier
+            prices: dict of {productId: price}
+            ttl: time-to-live in seconds (default 30min)
+        """
+        if not prices:
+            return
+
+        key = f"session:{session_id}:prices"
+        # use Redis hash for O(1) lookup per product
+        self.redis_client.hset(key, mapping={k: str(v) for k, v in prices.items()})
+        self.redis_client.expire(key, ttl)
+
+    def get_session_price(self, session_id: str, product_id: str) -> Optional[float]:
+        """
+        Lookup price for (sessionId, productId).
+        THIS is the read path for fast provider lookup.
+
+        Returns: price or None if not found
+        """
+        key = f"session:{session_id}:prices"
+        price_str = self.redis_client.hget(key, product_id)
+
+        if price_str is None:
+            return None
+
+        return float(price_str.decode('utf-8') if isinstance(price_str, bytes) else price_str)
+
+    def get_session_all_prices(self, session_id: str) -> Dict[str, float]:
+        """Get all prices for a session."""
+        key = f"session:{session_id}:prices"
+        prices_raw = self.redis_client.hgetall(key)
+
+        if not prices_raw:
+            return {}
+
+        return {
+            (k.decode('utf-8') if isinstance(k, bytes) else k): float(v.decode('utf-8') if isinstance(v, bytes) else v)
+            for k, v in prices_raw.items()
+        }
diff --git a/web/src/app/api/pricing/route.ts b/web/src/app/api/pricing/route.ts
index 1aec75b..6532131 100644
--- a/web/src/app/api/pricing/route.ts
+++ b/web/src/app/api/pricing/route.ts
@@ -30,6 +30,8 @@ export async function GET(req: NextRequest) {
     const providerUrl = process.env.PRICING_PROVIDER_URL || 'http://localhost:5001';
     try {
         const queryParams = new URLSearchParams();
+        // THIS is our entry point into dynamic pricing: we pass the session and experiment context and ask for a price to assign to the trajectory that this context expresses.
+        // The whole pipeline gets triggered from here.
         if (sessionId) queryParams.append('sessionId', sessionId);
         if (experimentId) queryParams.append('experimentId', experimentId);
 
@@ -55,25 +57,26 @@ export async function GET(req: NextRequest) {
         price = Math.round(randomBase * 100) / 100;
     }
 
-    // log price to kafka for elasticity computation
+    // log price to kafka asynchronously (non-blocking)
     if (sessionId) {
         const backendUrl = process.env.BACKEND_URL || 'http://localhost:5000';
-        try {
-            await fetch(`${backendUrl}/api/kafka/price-log`, {
-                method: 'POST',
-                headers: { 'Content-Type': 'application/json' },
-                body: JSON.stringify({
-                    productId,
-                    price,
-                    sessionId,
-                    experimentId: experimentId || undefined,
-                    storeMode,
-                    ts: timestamp,
-                }),
-            });
-        } catch (err) {
-            console.error('[price-log-error]', err);
-        }
+        // fire and forget - don't await to avoid blocking response
+        fetch(`${backendUrl}/api/kafka/price-log`, {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({
+                productId,
+                price,
+                sessionId,
+                experimentId: experimentId || undefined,
+                storeMode,
+                ts: timestamp,
+            }),
+        }).catch(err => {
+            if (process.env.NODE_ENV === 'development') {
+                console.error('[price-log-error]', err);
+            }
+        });
     }
 
     if (process.env.NODE_ENV === 'development') {

From acf731efcb9d725399443dc8e73999f71c8835f0 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Mon, 12 Jan 2026 13:37:48 +0100
Subject: [PATCH 12/99] feat: integration of pipeline hooks into testing

---
 Makefile                                      |  2 ++
 backend/server/app.py                         |  6 ++++-
 docker-compose.yml                            |  2 ++
 .../airflow/dags/surge_pricing_pipeline.py    | 24 +++++++++++++++----
 experiments/procesing/pricers/simple.py       |  3 ++-
 tests/e2e/helpers/kafka.ts                    |  4 ++--
 tests/e2e/playwright.config.ts                |  4 ++--
 tests/e2e/scenarios/session-aware.spec.ts     | 21 ++++++++++------
 tests/e2e/scenarios/surge-pricing.spec.ts     | 11 +++++++--
 9 files changed, 58 insertions(+), 19 deletions(-)

diff --git a/Makefile b/Makefile
index d2d2d7f..6e6c521 100644
--- a/Makefile
+++ b/Makefile
@@ -48,8 +48,10 @@ test.backend: $(VENV)
 test.e2e:
 	@cd tests/e2e && npm install
 	@cd tests/e2e && npx playwright install chromium
+	@test -f tests/e2e/.env || cp tests/e2e/.env.example tests/e2e/.env
 	@timeout 30 bash -c 'until curl -sf http://localhost:5000/health > /dev/null 2>&1; do sleep 1; done' || (echo "Backend not ready" && exit 1)
 	@timeout 30 bash -c 'until curl -sf http://localhost:3000 > /dev/null 2>&1; do sleep 1; done' || (echo "Web app not ready" && exit 1)
+	@timeout 30 bash -c 'until curl -sf http://localhost:8085/health > /dev/null 2>&1; do sleep 1; done' || (echo "Airflow not ready" && exit 1)
 	@cd tests/e2e && npm test
 
 .PHONY: test.all
diff --git a/backend/server/app.py b/backend/server/app.py
index d338408..f100811 100644
--- a/backend/server/app.py
+++ b/backend/server/app.py
@@ -198,12 +198,16 @@ def dump_logs(
             auto_offset_reset='earliest',
             enable_auto_commit=False,
             value_deserializer=lambda x: json.loads(x.decode('utf-8')),
-            consumer_timeout_ms=5000
+            consumer_timeout_ms=30000,
+            fetch_max_wait_ms=10000,
+            max_poll_records=1000
         )
 
         events = []
         for msg in consumer:
             events.append(msg.value)
+            if last_n and len(events) >= last_n * 2:
+                break
 
         consumer.close()
 
diff --git a/docker-compose.yml b/docker-compose.yml
index f72f415..561c393 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -144,6 +144,7 @@ services:
       - AIRFLOW__CORE__ENABLE_XCOM_PICKLING=true
       - AIRFLOW__WEBSERVER__EXPOSE_CONFIG=true
       - AIRFLOW__WEBSERVER__SECRET_KEY=${AIRFLOW_SECRET_KEY}
+      - AIRFLOW__API__AUTH_BACKENDS=airflow.api.auth.backend.basic_auth
       - KAFKA_HOST=kafka
       - KAFKA_PORT=29092
       - BACKEND_URL=http://backend:5000
@@ -180,6 +181,7 @@ services:
       - AIRFLOW__CORE__LOAD_EXAMPLES=false
       - AIRFLOW__CORE__ENABLE_XCOM_PICKLING=true
       - AIRFLOW__WEBSERVER__SECRET_KEY=${AIRFLOW_SECRET_KEY}
+      - AIRFLOW__API__AUTH_BACKENDS=airflow.api.auth.backend.basic_auth
       - KAFKA_HOST=kafka
       - KAFKA_PORT=29092
       - BACKEND_URL=http://backend:5000
diff --git a/experiments/airflow/dags/surge_pricing_pipeline.py b/experiments/airflow/dags/surge_pricing_pipeline.py
index b1d7c61..1a3b3d0 100644
--- a/experiments/airflow/dags/surge_pricing_pipeline.py
+++ b/experiments/airflow/dags/surge_pricing_pipeline.py
@@ -120,15 +120,31 @@ def apply_surge_pricing(**kwargs):
     # rename demand_score to demand for pricer compatibility
     data = product_features.rename(columns={'demand_score': 'demand'})
 
+    high_thresh = dag_conf.get('high_threshold', 10)
+    low_thresh = dag_conf.get('low_threshold', 2)
+    surge_mult = dag_conf.get('surge_multiplier', 1.2)
+    discount_mult = dag_conf.get('discount_multiplier', 0.9)
+
+    logging.info(f"Surge pricing config: high_thresh={high_thresh}, low_thresh={low_thresh}, surge_mult={surge_mult}, discount_mult={discount_mult}")
+    logging.info(f"Demand stats: min={data['demand'].min():.2f}, max={data['demand'].max():.2f}, mean={data['demand'].mean():.2f}")
+    logging.info(f"Products with high demand (>={high_thresh}): {(data['demand'] >= high_thresh).sum()}")
+    logging.info(f"Products with low demand (<={low_thresh}): {(data['demand'] <= low_thresh).sum()}")
+
     surge_pricer = SimpleSurgePricer(
-        high_threshold=dag_conf.get('high_threshold', 10),
-        low_threshold=dag_conf.get('low_threshold', 2),
-        surge_multiplier=dag_conf.get('surge_multiplier', 1.2),
-        discount_multiplier=dag_conf.get('discount_multiplier', 0.9)
+        high_threshold=high_thresh,
+        low_threshold=low_thresh,
+        surge_multiplier=surge_mult,
+        discount_multiplier=discount_mult
     )
     surge_pricer.fit(data)
     data['optimal_price'] = surge_pricer.predict()
 
+    base_avg = data['base_price'].mean()
+    optimal_avg = data['optimal_price'].mean()
+    price_change_pct = ((optimal_avg - base_avg) / base_avg) * 100
+
+    logging.info(f"Price adjustment: base_avg={base_avg:.2f}, optimal_avg={optimal_avg:.2f}, change={price_change_pct:+.1f}%")
+
     prices_df = data[['productId', 'price', 'base_price', 'optimal_price', 'demand']].rename(columns={
         'price': 'current_price',
         'demand': 'demand_score'
diff --git a/experiments/procesing/pricers/simple.py b/experiments/procesing/pricers/simple.py
index 6bdd1ca..1a03f9f 100644
--- a/experiments/procesing/pricers/simple.py
+++ b/experiments/procesing/pricers/simple.py
@@ -124,7 +124,8 @@ class SimpleSurgePricer(PricingFunction):
         if base is None:
             base = np.ones(len(demand)) * 99.99
 
-        new_prices = base.copy()
+        # ensure float dtype to allow multiplication by float multipliers
+        new_prices = base.astype(np.float64).copy()
         high_mask = demand >= self.high_threshold
         new_prices[high_mask] *= self.surge_multiplier
 
diff --git a/tests/e2e/helpers/kafka.ts b/tests/e2e/helpers/kafka.ts
index c0a95dd..18b977d 100644
--- a/tests/e2e/helpers/kafka.ts
+++ b/tests/e2e/helpers/kafka.ts
@@ -9,8 +9,8 @@ interface InteractionEvent {
 const dumpKafkaTopic = async (backendUrl: string, topic: string) => {
   const resp = await fetch(`${backendUrl}/api/kafka/dump?topic=${topic}`);
   if (!resp.ok) throw new Error(`Kafka dump failed: ${resp.status}`);
-  const { messages = [] } = await resp.json();
-  return messages as any[];
+  const { data = [] } = await resp.json();
+  return data as any[];
 };
 
 export const waitForInteractionEvent = async (
diff --git a/tests/e2e/playwright.config.ts b/tests/e2e/playwright.config.ts
index 54a5561..dc3c815 100644
--- a/tests/e2e/playwright.config.ts
+++ b/tests/e2e/playwright.config.ts
@@ -5,14 +5,14 @@ export default defineConfig({
   fullyParallel: true,
   forbidOnly: !!process.env.CI,
   retries: 0,
-  workers: 5,
+  workers: 1,
   reporter: 'list',
   use: {
     baseURL: process.env.WEB_URL || 'http://localhost:3000',
     trace: 'retain-on-failure',
     screenshot: 'only-on-failure',
   },
-  timeout: 60000,
+  timeout: 180000,
   expect: {
     timeout: 10000,
   },
diff --git a/tests/e2e/scenarios/session-aware.spec.ts b/tests/e2e/scenarios/session-aware.spec.ts
index b204984..5c27747 100644
--- a/tests/e2e/scenarios/session-aware.spec.ts
+++ b/tests/e2e/scenarios/session-aware.spec.ts
@@ -9,6 +9,7 @@ import {
   addToCart,
 } from '../helpers/interactions';
 import { getSessionEvents } from '../helpers/kafka';
+import { runSessionPricing } from '../helpers/airflow';
 
 test.describe('SessionAwarePricer E2E', () => {
   const STORE_TYPE = 'hotel';
@@ -23,6 +24,9 @@ test.describe('SessionAwarePricer E2E', () => {
     await page.waitForTimeout(1500);
 
     const productId2 = await humanLikeViewProduct(page, STORE_TYPE);
+
+    await runSessionPricing(STORE_TYPE);
+
     const secondPrice = await getPriceFromDOM(page);
     expect(await verifySessionConsistency(page, sessionId)).toBeTruthy();
 
@@ -40,11 +44,13 @@ test.describe('SessionAwarePricer E2E', () => {
     await rapidViewProductViaFlow(page, 8, 100, STORE_TYPE);
     expect(await verifySessionConsistency(page, sessionId)).toBeTruthy();
 
-    await page.waitForTimeout(2500);
+    await page.waitForTimeout(1000);
 
     const events = await getSessionEvents(backendUrl, sessionId);
     expect(events.length).toBeGreaterThanOrEqual(8);
 
+    await runSessionPricing(STORE_TYPE);
+
     await page.goto(`/products/${productId}`);
     await page.waitForLoadState('networkidle');
     const agentPrice = await getPriceFromDOM(page);
@@ -59,14 +65,12 @@ test.describe('SessionAwarePricer E2E', () => {
     const productId = await viewProductViaFlow(page, STORE_TYPE);
     const baselinePrice = await getPriceFromDOM(page);
 
-    const startTime = Date.now();
     await rapidViewProductViaFlow(page, 10, 80, STORE_TYPE);
-    const duration = (Date.now() - startTime) / 1000;
 
-    const eventsPerSec = 10 / duration;
-    expect(eventsPerSec).toBeGreaterThan(2.0);
+    const events = await getSessionEvents(backendUrl, sessionId);
+    expect(events.length).toBeGreaterThanOrEqual(10);
 
-    await page.waitForTimeout(2000);
+    await runSessionPricing(STORE_TYPE);
 
     await page.goto(`/products/${productId}`);
     await page.waitForLoadState('networkidle');
@@ -105,8 +109,11 @@ test.describe('SessionAwarePricer E2E', () => {
 
     await rapidViewProductViaFlow(page, 2, 150, STORE_TYPE);
 
-    await page.waitForTimeout(1500);
+    await page.waitForTimeout(1000);
     await humanLikeViewProduct(page, STORE_TYPE);
+
+    await runSessionPricing(STORE_TYPE);
+
     const finalPrice = await getPriceFromDOM(page);
 
     expect(Math.abs(finalPrice - baselinePrice) / baselinePrice).toBeLessThan(0.3);
diff --git a/tests/e2e/scenarios/surge-pricing.spec.ts b/tests/e2e/scenarios/surge-pricing.spec.ts
index e3e2f8d..26d29d3 100644
--- a/tests/e2e/scenarios/surge-pricing.spec.ts
+++ b/tests/e2e/scenarios/surge-pricing.spec.ts
@@ -7,6 +7,7 @@ import {
   verifySessionConsistency,
 } from '../helpers/interactions';
 import { waitForInteractionEvent, countProductViews } from '../helpers/kafka';
+import { runSurgePricing } from '../helpers/airflow';
 
 test.describe('SimpleSurgePricer E2E', () => {
   const STORE_TYPE = 'hotel';
@@ -29,7 +30,7 @@ test.describe('SimpleSurgePricer E2E', () => {
 
     await rapidViewProductViaFlow(page, 5, 200, STORE_TYPE);
 
-    await page.waitForTimeout(2000);
+    await page.waitForTimeout(1000);
 
     const evt = await waitForInteractionEvent(backendUrl, sessionId, 'view_item_page');
     expect(evt).not.toBeNull();
@@ -37,6 +38,8 @@ test.describe('SimpleSurgePricer E2E', () => {
     const viewCount = await countProductViews(backendUrl, productId);
     expect(viewCount).toBeGreaterThanOrEqual(5);
 
+    await runSurgePricing(STORE_TYPE, 3, 1);
+
     await page.goto(`/products/${productId}`);
     await page.waitForLoadState('networkidle');
     const surgedPrice = await getPriceFromDOM(page);
@@ -72,7 +75,9 @@ test.describe('SimpleSurgePricer E2E', () => {
 
     await rapidViewProductViaFlow(page, 5, 150, STORE_TYPE);
 
-    await page.waitForTimeout(1500);
+    await page.waitForTimeout(1000);
+
+    await runSurgePricing(STORE_TYPE, 3, 1);
 
     await page.goto(`/products/${productId}`);
     await page.waitForLoadState('networkidle');
@@ -81,6 +86,8 @@ test.describe('SimpleSurgePricer E2E', () => {
 
     await page.waitForTimeout(12000);
 
+    await runSurgePricing(STORE_TYPE, 3, 1);
+
     await page.goto(`/products/${productId}`);
     await page.waitForLoadState('networkidle');
     const decayedPrice = await getPriceFromDOM(page);

From 0d214a469f64194dd9b1bb4247e157b6579908f6 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Mon, 12 Jan 2026 20:59:09 +0100
Subject: [PATCH 13/99] planning

---
 .../airflow/dags/surge_pricing_factory.py     | 10 +++++++
 experiments/procesing/pricers/base.py         | 29 +++++++++----------
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/experiments/airflow/dags/surge_pricing_factory.py b/experiments/airflow/dags/surge_pricing_factory.py
index a886d5b..b61e65c 100644
--- a/experiments/airflow/dags/surge_pricing_factory.py
+++ b/experiments/airflow/dags/surge_pricing_factory.py
@@ -1,3 +1,4 @@
+from pandas.core.algorithms import factorize_array
 from airflow import DAG
 from airflow.operators.python import PythonOperator
 from airflow.utils.dates import days_ago
@@ -208,3 +209,12 @@ def create_surge_pricing_dag(store_mode: str) -> DAG:
 # instantiate DAGs for Airflow to discover
 dag_airline = create_surge_pricing_dag('airline')
 dag_hotel = create_surge_pricing_dag('hotel')
+
+# TODO: Refactor this factory from a surge pricing factory to a general pricing factory
+# We will do this by passing a pricing strategy class to the factory, since the generic pipeline is:
+# take all interaction data, group by sessionId and assign a new price vector to each session
+# in the grouping we get a subset of the interactions per sessionId and we can map that to some features
+# we define a custom _get_features(interactions, ...) method in the strategy class
+# we then run only the inference which is the .predict(trajectory) per-session which will give us a new price vector
+# this we then publish for each sessionId group
+# this might include deleting most of the pricers we have defined and starting with a super simple surge-pricing algorithm that is no-fit, predict-only. We can then test this end-to-end and observe changes to prices according to a desired strategy - we have to define this one as a very short-term strategy because we run sessions that take only a few minutes.
diff --git a/experiments/procesing/pricers/base.py b/experiments/procesing/pricers/base.py
index 6569556..ecaabed 100644
--- a/experiments/procesing/pricers/base.py
+++ b/experiments/procesing/pricers/base.py
@@ -7,15 +7,6 @@ import pandas as pd
 class PricingFunction(ABC):
     """
     Abstract base for pricing functions.
-
-    Defines mapping: f(Q_t, P_t, S_t, H_t) -> P_{t+1}
-
-    Where:
-        Q_t ∈ R^n: demand vector at time t
-        P_t ∈ R^n: price vector at time t
-        S_t: session features (behavioral signals, interactions)
-        H_t = {Q_{t-k}, P_{t-k}, S_{t-k}}: historical state trajectory
-
     Objective:
         maximize E[R_T] = E[Σ P_t^T · Q_t]
         subject to:
@@ -28,10 +19,10 @@ class PricingFunction(ABC):
     def fit(self, *kwargs):
         """
         Offline training on historical data.
+        This is where we can think about some maximization of expected revenue
+        over historical trajectories to learn parameters of the pricing function.
+        (We cover this more on the RL side of things, however.)
 
-        Args:
-            historical_data: DataFrame with elasticity, prices, demand signals
-            **kwargs: additional training parameters
         """
         pass
 
@@ -39,12 +30,18 @@ class PricingFunction(ABC):
     def predict(self, *kwargs) -> np.ndarray:
         """
         Generate optimal prices given current state.
+        This is an abstract method that transitions from τ -> P*
+        which is the mapping from the trajectory to optimal prices under
+        some subset of session grouping (so, per sessionId)
+        """
+        pass
 
-        Args:
-            state_space: StateSpace object containing Q_t, P_t, S_t, H_t
-
+    @abstractmethod
+    def _get_features(self, *kwargs) -> np.ndarray:
+        """
+        Extract features from trajectory for pricing decision.
         Returns:
-            P_{t+1}: price vector in R^n
+            np.ndarray of shape (n_products, n_features)
         """
         pass
 

From 961302a21a1189bf4bdc3d0a8998ecfd9b057532 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Mon, 12 Jan 2026 22:33:47 +0100
Subject: [PATCH 14/99] chore: better test consistency before agnet

---
 docker-compose.yml                            | 20 ++++++++--
 experiments/procesing/pricers/elasticity.py   | 10 +++++
 .../procesing/pricers/session_aware.py        | 39 +++++++++++++++++++
 experiments/procesing/pricers/simple.py       | 23 +++++++++++
 4 files changed, 89 insertions(+), 3 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 561c393..ba2e8a3 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -112,11 +112,14 @@ services:
     depends_on:
       - postgres
     environment:
-      - AIRFLOW__CORE__EXECUTOR=SequentialExecutor
+      - AIRFLOW__CORE__EXECUTOR=LocalExecutor
       - AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow
       - AIRFLOW__CORE__FERNET_KEY=${AIRFLOW_FERNET_KEY}
       - AIRFLOW__CORE__LOAD_EXAMPLES=false
       - AIRFLOW__CORE__ENABLE_XCOM_PICKLING=true
+      - AIRFLOW__CORE__PARALLELISM=16
+      - AIRFLOW__CORE__DAG_CONCURRENCY=8
+      - AIRFLOW__CORE__MAX_ACTIVE_RUNS_PER_DAG=4
       - _AIRFLOW_DB_MIGRATE=true
       - _AIRFLOW_WWW_USER_CREATE=true
       - _AIRFLOW_WWW_USER_USERNAME=admin
@@ -136,12 +139,17 @@ services:
       - airflow-init
       - redis
     environment:
-      - AIRFLOW__CORE__EXECUTOR=SequentialExecutor
+      - AIRFLOW__CORE__EXECUTOR=LocalExecutor
       - AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow
       - AIRFLOW__CORE__FERNET_KEY=${AIRFLOW_FERNET_KEY}
       - AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION=true
       - AIRFLOW__CORE__LOAD_EXAMPLES=false
       - AIRFLOW__CORE__ENABLE_XCOM_PICKLING=true
+      - AIRFLOW__CORE__PARALLELISM=16
+      - AIRFLOW__CORE__DAG_CONCURRENCY=8
+      - AIRFLOW__CORE__MAX_ACTIVE_RUNS_PER_DAG=4
+      - AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL=30
+      - AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL=60
       - AIRFLOW__WEBSERVER__EXPOSE_CONFIG=true
       - AIRFLOW__WEBSERVER__SECRET_KEY=${AIRFLOW_SECRET_KEY}
       - AIRFLOW__API__AUTH_BACKENDS=airflow.api.auth.backend.basic_auth
@@ -174,12 +182,18 @@ services:
       redis:
         condition: service_started
     environment:
-      - AIRFLOW__CORE__EXECUTOR=SequentialExecutor
+      - AIRFLOW__CORE__EXECUTOR=LocalExecutor
       - AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow
       - AIRFLOW__CORE__FERNET_KEY=${AIRFLOW_FERNET_KEY}
       - AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION=true
       - AIRFLOW__CORE__LOAD_EXAMPLES=false
       - AIRFLOW__CORE__ENABLE_XCOM_PICKLING=true
+      - AIRFLOW__CORE__PARALLELISM=16
+      - AIRFLOW__CORE__DAG_CONCURRENCY=8
+      - AIRFLOW__CORE__MAX_ACTIVE_RUNS_PER_DAG=4
+      - AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL=30
+      - AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL=60
+      - AIRFLOW__SCHEDULER__PARSING_PROCESSES=2
       - AIRFLOW__WEBSERVER__SECRET_KEY=${AIRFLOW_SECRET_KEY}
       - AIRFLOW__API__AUTH_BACKENDS=airflow.api.auth.backend.basic_auth
       - KAFKA_HOST=kafka
diff --git a/experiments/procesing/pricers/elasticity.py b/experiments/procesing/pricers/elasticity.py
index b203159..3ce3b42 100644
--- a/experiments/procesing/pricers/elasticity.py
+++ b/experiments/procesing/pricers/elasticity.py
@@ -57,3 +57,13 @@ class ElasticityBasedPricer(PricingFunction):
         # enforce bounds
         prices = np.clip(prices, self.price_floor, self.price_ceil)
         return prices
+
+    def _get_features(self, state_space=None) -> np.ndarray:
+        """Extract elasticity, demand, and demand deviation for each product"""
+        if state_space is None or self.elasticity is None:
+            n = len(self.elasticity) if self.elasticity is not None else 0
+            return np.zeros((n, 3))
+
+        demand = np.asarray(state_space.demand)
+        demand_dev = (demand - self.mean_demand) / (self.mean_demand + 1e-6)
+        return np.column_stack([self.elasticity, demand, demand_dev])
diff --git a/experiments/procesing/pricers/session_aware.py b/experiments/procesing/pricers/session_aware.py
index 40343a7..dbc859f 100644
--- a/experiments/procesing/pricers/session_aware.py
+++ b/experiments/procesing/pricers/session_aware.py
@@ -107,6 +107,36 @@ class SessionAwarePricer(PricingFunction):
 
         return prices
 
+    def _get_features(self, state_space=None) -> np.ndarray:
+        """Extract elasticity, demand, and session features"""
+        if state_space is None or self.elasticity is None:
+            n = len(self.elasticity) if self.elasticity is not None else 0
+            return np.zeros((n, 5))
+
+        demand = np.asarray(state_space.demand)
+        n_products = len(demand)
+
+        # extract session features
+        velocity = 0.0
+        view_depth = 0.0
+        cart_to_view = 0.0
+
+        if not state_space.session_features.empty:
+            sf = state_space.session_features.iloc[0]
+            velocity = sf.get('interaction_velocity', 0.0)
+            view_depth = sf.get('product_view_depth', 0.0)
+            cart_to_view = sf.get('cart_to_view_ratio', 0.0)
+
+        # broadcast session features to all products
+        features = np.column_stack([
+            self.elasticity,
+            demand,
+            np.full(n_products, velocity),
+            np.full(n_products, view_depth),
+            np.full(n_products, cart_to_view)
+        ])
+        return features
+
 
 class ProductSpecificSessionPricer(PricingFunction):
     """
@@ -170,3 +200,12 @@ class ProductSpecificSessionPricer(PricingFunction):
 
         prices = np.clip(base_prices, self.price_floor, self.price_ceil)
         return prices
+
+    def _get_features(self, state_space=None) -> np.ndarray:
+        """Extract elasticity and demand features for product-specific pricing"""
+        if state_space is None or self.elasticity is None:
+            n = len(self.elasticity) if self.elasticity is not None else 0
+            return np.zeros((n, 2))
+
+        demand = np.asarray(state_space.demand)
+        return np.column_stack([self.elasticity, demand])
diff --git a/experiments/procesing/pricers/simple.py b/experiments/procesing/pricers/simple.py
index 1a03f9f..d7fa699 100644
--- a/experiments/procesing/pricers/simple.py
+++ b/experiments/procesing/pricers/simple.py
@@ -65,6 +65,11 @@ class StaticPricer(PricingFunction):
             raise ValueError("Must call fit() or provide base_prices in constructor")
         return self.base_prices.copy()
 
+    def _get_features(self, state_space=None) -> np.ndarray:
+        """Static pricer uses no features, returns empty array"""
+        n = len(self.base_prices) if self.base_prices is not None else 0
+        return np.zeros((n, 0))
+
 
 class RandomPricer(PricingFunction):
     """Random pricing within bounds (for baseline comparison)"""
@@ -87,6 +92,11 @@ class RandomPricer(PricingFunction):
             self.n_products = len(state_space.demand)
         return self.rng.uniform(self.price_min, self.price_max, size=self.n_products)
 
+    def _get_features(self, state_space=None) -> np.ndarray:
+        """Random pricer uses no features"""
+        n = self.n_products if self.n_products else 0
+        return np.zeros((n, 0))
+
 
 class SimpleSurgePricer(PricingFunction):
     """
@@ -133,3 +143,16 @@ class SimpleSurgePricer(PricingFunction):
         new_prices[low_mask] *= self.discount_multiplier
 
         return new_prices
+
+    def _get_features(self, state_space=None) -> np.ndarray:
+        """Extract demand and base price features for each product"""
+        if state_space is None:
+            n = len(self.base_prices) if self.base_prices is not None else 0
+            return np.zeros((n, 2))
+
+        demand = np.asarray(state_space.demand) if hasattr(state_space, 'demand') else np.array([0])
+        base = np.asarray(state_space.prices) if hasattr(state_space, 'prices') else self.base_prices
+        if base is None:
+            base = np.ones(len(demand)) * 99.99
+
+        return np.column_stack([demand, base])

From d86535769516aab38beda3184465b915cd9b967e Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Tue, 13 Jan 2026 15:05:33 +0100
Subject: [PATCH 15/99] chore: fixing visual bugs in cart

---
 web/src/app/cart/page.tsx | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/web/src/app/cart/page.tsx b/web/src/app/cart/page.tsx
index 30ac3f2..dbcb30b 100644
--- a/web/src/app/cart/page.tsx
+++ b/web/src/app/cart/page.tsx
@@ -32,7 +32,8 @@ export default function CartPage() {
                     {itemCount > 0 && (
                         <button
                             onClick={clearCart}
-                            className="text-sm text-red-600 hover:underline"
+                            className="text-sm hover:underline"
+                            style={{ color: 'var(--accent-warning)' }}
                         >
                             Clear cart
                         </button>
@@ -42,7 +43,7 @@ export default function CartPage() {
                 {itemCount === 0 ? (
                     <div className="text-center py-12">
                         <p className="text-gray-500 mb-4">Your cart is empty</p>
-                        <a href="/" className="text-blue-600 hover:underline">Browse our selection</a>
+                        <a href="/" className="hover:underline" style={{ color: 'var(--text-accent)' }}>Browse our selection</a>
                     </div>
                 ) : (
                     <>
@@ -54,15 +55,11 @@ export default function CartPage() {
                                 >
                                     <div className="flex-1">
                                         <div className="flex items-center gap-2 mb-1">
-                                            <span className="px-2 py-0.5 text-xs font-medium rounded bg-blue-100 text-blue-800">
-                                                {item.type}
-                                            </span>
                                             <h3 className="font-semibold">{item.name}</h3>
                                         </div>
 
                                         {item.type === 'hotel' && (
                                             <div className="text-sm text-gray-600">
-                                                <p>{String(item.metadata.roomType)}</p>
                                                 <p>{String(item.metadata.checkIn)} - {String(item.metadata.checkOut)}</p>
                                                 <p>{String(item.metadata.nights)} night{Number(item.metadata.nights) > 1 ? 's' : ''}</p>
                                             </div>
@@ -81,7 +78,8 @@ export default function CartPage() {
                                         <p className="text-xl font-bold mb-2">${item.price}</p>
                                         <button
                                             onClick={() => handleRemove(item.id, item.type)}
-                                            className="text-sm text-red-600 hover:underline"
+                                            className="text-sm hover:underline"
+                                            style={{ color: 'var(--accent-warning)' }}
                                         >
                                             Remove
                                         </button>
@@ -100,7 +98,7 @@ export default function CartPage() {
                                     dispatchInteraction('checkout_start', undefined, { total, itemCount });
                                     window.location.href = '/checkout';
                                 }}
-                                className="w-full py-3 bg-blue-600 hover:bg-blue-700 text-white rounded-lg font-medium transition-colors"
+                                className="btn-primary w-full"
                             >
                                 Proceed to Checkout
                             </button>

From 90f57cb9b9fa12b2e2eaff43cc15d79c34727c7d Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Tue, 13 Jan 2026 15:09:52 +0100
Subject: [PATCH 16/99] chore: styling and title updates

---
 web/src/app/globals.css              | 3 +++
 web/src/app/layout.tsx               | 4 ++--
 web/src/components/ui/Navigation.tsx | 2 +-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/web/src/app/globals.css b/web/src/app/globals.css
index 4a5b0c9..457b974 100644
--- a/web/src/app/globals.css
+++ b/web/src/app/globals.css
@@ -8,6 +8,9 @@
   --bg-secondary: #f5f5f5;
   --text-primary: #333333;
   --text-secondary: #666666;
+  --accent-primary: #007aff;
+  --accent-primary-hover: #0051d5;
+  --accent-primary-light: #e6f2ff;
   --spacing-sm: 8px;
   --spacing-md: 16px;
   --spacing-lg: 32px;
diff --git a/web/src/app/layout.tsx b/web/src/app/layout.tsx
index e9f9b63..5ff49ae 100644
--- a/web/src/app/layout.tsx
+++ b/web/src/app/layout.tsx
@@ -15,8 +15,8 @@ const geistMono = Geist_Mono({
 });
 
 export const metadata: Metadata = {
-  title: "Create Next App",
-  description: "Generated by create next app",
+  title: "Travel Booking Platform",
+  description: "Book flights and hotels with dynamic pricing",
 };
 
 export default function RootLayout({
diff --git a/web/src/components/ui/Navigation.tsx b/web/src/components/ui/Navigation.tsx
index 9d9d4cf..6f0ecbb 100644
--- a/web/src/components/ui/Navigation.tsx
+++ b/web/src/components/ui/Navigation.tsx
@@ -20,7 +20,7 @@ const NavLink = ({ href, children }: { href: string; children: React.ReactNode }
       href={href}
       className={`px-4 py-2 rounded-md transition-colors ${
         isActive
-          ? 'bg-[var(--accent-primary)] font-semibold'
+          ? 'bg-[var(--accent-primary)] text-white font-semibold'
           : 'hover:bg-[var(--accent-primary-light)] text-[var(--text-primary)]'
       }`}
     >

From e60c0c64e1f89381603c8aaeadfebcf5f9531a42 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Alves=20R=C3=B6sel?=
 <60182044+velocitatem@users.noreply.github.com>
Date: Tue, 13 Jan 2026 15:35:27 +0100
Subject: [PATCH 17/99] Pre run web refactors (#43)

* chore: refactor date utilities

* feat: improve images of hotel rooms

* fix: adding date utils
---
 web/src/components/feats/hotel/HotelCard.tsx  |  5 +-
 .../components/feats/hotel/HotelDetails.tsx   |  5 +-
 web/src/lib/airline-utils.ts                  | 24 +--------
 web/src/lib/date-utils.ts                     | 23 ++++++++
 web/src/lib/hotel-utils.ts                    | 52 +++++++++++--------
 5 files changed, 60 insertions(+), 49 deletions(-)
 create mode 100644 web/src/lib/date-utils.ts

diff --git a/web/src/components/feats/hotel/HotelCard.tsx b/web/src/components/feats/hotel/HotelCard.tsx
index 5bf234d..847e1b2 100644
--- a/web/src/components/feats/hotel/HotelCard.tsx
+++ b/web/src/components/feats/hotel/HotelCard.tsx
@@ -2,6 +2,7 @@
 
 import type { EventName } from '@/lib/events';
 import type { Hotel } from '@/lib/hotel-utils';
+import { getHotelImageUrl } from '@/lib/hotel-utils';
 import { useHoverTracking } from '@/hooks/useHoverTracking';
 import PriceDisplay from '@/components/ui/PriceDisplay';
 
@@ -47,8 +48,6 @@ export default function HotelCard({ hotel }: { hotel: Hotel }) {
         window.location.href = `/hotel/products/${hotel.id}`;
     };
 
-    const imageUrl = `https://images.unsplash.com/photo-1551882547-ff40c63fe5fa?w=400&h=300&fit=crop`;
-
     return (
         <div
             className="hotel-card cursor-pointer"
@@ -56,7 +55,7 @@ export default function HotelCard({ hotel }: { hotel: Hotel }) {
         >
             <div className="hotel-image relative overflow-hidden">
                 <img
-                    src={imageUrl}
+                    src={getHotelImageUrl(hotel.id, { w: 400, h: 300 })}
                     alt={hotel.name}
                     className="w-full h-full object-cover"
                     onError={(e) => {
diff --git a/web/src/components/feats/hotel/HotelDetails.tsx b/web/src/components/feats/hotel/HotelDetails.tsx
index 6cdbbdd..030769f 100644
--- a/web/src/components/feats/hotel/HotelDetails.tsx
+++ b/web/src/components/feats/hotel/HotelDetails.tsx
@@ -2,6 +2,7 @@
 
 import { useState, useEffect } from 'react';
 import type { Hotel } from '@/lib/hotel-utils';
+import { getHotelImageUrl } from '@/lib/hotel-utils';
 import PriceDisplay from '@/components/ui/PriceDisplay';
 
 interface HotelDetailsProps {
@@ -43,13 +44,11 @@ const PriceTotalDisplay = ({ productId, nights }: { productId: string; nights: n
 };
 
 export default function HotelDetails({ product, onAddToCart, addedToCart }: HotelDetailsProps) {
-  const imageUrl = `https://images.unsplash.com/photo-1566073771259-6a8506099945?w=800&h=600&fit=crop`;
-
   return (
     <div className="w-full flex flex-col lg:flex-row gap-12 py-8">
       <div className="w-full lg:w-1/2 rounded-lg aspect-[4/3] overflow-hidden shrink-0">
         <img
-          src={imageUrl}
+          src={getHotelImageUrl(product.id, { w: 800, h: 600 })}
           alt={product.name}
           className="w-full h-full object-cover"
           onError={(e) => {
diff --git a/web/src/lib/airline-utils.ts b/web/src/lib/airline-utils.ts
index 74a1916..b801e14 100644
--- a/web/src/lib/airline-utils.ts
+++ b/web/src/lib/airline-utils.ts
@@ -31,7 +31,7 @@ export interface Flight {
   availability: number;
 }
 
-const EPOCH = new Date(0);
+import { dateToDaysFromToday, dateToIndex, todayIndex } from './date-utils';
 
 export const transformProduct = (p: AirlineProduct): Flight => {
   const { id, flight_type, date_index, metadata, availability } = p;
@@ -52,24 +52,4 @@ export const transformProduct = (p: AirlineProduct): Flight => {
   };
 };
 
-// convert date string to days from today
-export const dateToDaysFromToday = (dateStr: string): number => {
-  const target = new Date(dateStr);
-  target.setHours(0, 0, 0, 0);
-  const today = new Date();
-  today.setHours(0, 0, 0, 0);
-  return Math.floor((target.getTime() - today.getTime()) / 86400000);
-};
-
-// convert date string to date_index (days since epoch)
-export const dateToIndex = (dateStr: string): number => {
-  const d = new Date(dateStr);
-  return Math.floor((d.getTime() - EPOCH.getTime()) / 86400000);
-};
-
-// get current date_index
-export const todayIndex = (): number => {
-  const now = new Date();
-  now.setHours(0, 0, 0, 0);
-  return Math.floor((now.getTime() - EPOCH.getTime()) / 86400000);
-};
+export { dateToDaysFromToday, dateToIndex, todayIndex };
diff --git a/web/src/lib/date-utils.ts b/web/src/lib/date-utils.ts
new file mode 100644
index 0000000..bad1a90
--- /dev/null
+++ b/web/src/lib/date-utils.ts
@@ -0,0 +1,23 @@
+const EPOCH = new Date(0);
+const MS_PER_DAY = 86400000;
+
+export const dateToDaysFromToday = (dateStr: string): number => { // whole local-calendar days from today to dateStr (negative if in the past)
+  const target = new Date(dateStr);
+  target.setHours(0, 0, 0, 0); // normalise both sides to local midnight
+  const today = new Date();
+  today.setHours(0, 0, 0, 0);
+  // round, not floor: across a DST change local midnights are 23h/25h apart, and floor would drop a day
+  return Math.round((target.getTime() - today.getTime()) / MS_PER_DAY);
+
+export const dateToIndex = (dateStr: string): number => {
+  const d = new Date(dateStr);
+  return Math.floor((d.getTime() - EPOCH.getTime()) / MS_PER_DAY);
+};
+
+export const todayIndex = (): number => {
+  const now = new Date();
+  now.setHours(0, 0, 0, 0);
+  return Math.floor((now.getTime() - EPOCH.getTime()) / MS_PER_DAY);
+};
+
+export { EPOCH, MS_PER_DAY };
diff --git a/web/src/lib/hotel-utils.ts b/web/src/lib/hotel-utils.ts
index b59994a..e5ba5c2 100644
--- a/web/src/lib/hotel-utils.ts
+++ b/web/src/lib/hotel-utils.ts
@@ -25,7 +25,7 @@ export interface Hotel {
   nights: number;
 }
 
-const EPOCH = new Date(0);
+import { EPOCH, MS_PER_DAY, dateToDaysFromToday, dateToIndex, todayIndex } from './date-utils';
 
 export const transformProduct = (p: HotelProduct): Hotel => {
   const { id, room_type, date_index, metadata } = p;
@@ -37,14 +37,14 @@ export const transformProduct = (p: HotelProduct): Hotel => {
     // legacy: treat as offset from today
     const today = new Date();
     today.setHours(0, 0, 0, 0);
-    checkIn = new Date(today.getTime() + date_index * 86400000);
+    checkIn = new Date(today.getTime() + date_index * MS_PER_DAY);
   } else {
     // proper: days since epoch
-    checkIn = new Date(EPOCH.getTime() + date_index * 86400000);
+    checkIn = new Date(EPOCH.getTime() + date_index * MS_PER_DAY);
   }
 
   const nights = 1;
-  const checkOut = new Date(checkIn.getTime() + nights * 86400000);
+  const checkOut = new Date(checkIn.getTime() + nights * MS_PER_DAY);
 
   const formatOpts: Intl.DateTimeFormatOptions = {
     month: 'short',
@@ -65,24 +65,34 @@ export const transformProduct = (p: HotelProduct): Hotel => {
   };
 };
 
-// convert date string to days from today
-export const dateToDaysFromToday = (dateStr: string): number => {
-  const target = new Date(dateStr);
-  target.setHours(0, 0, 0, 0);
-  const today = new Date();
-  today.setHours(0, 0, 0, 0);
-  return Math.floor((target.getTime() - today.getTime()) / 86400000);
+const hotelImagePool = [
+  'photo-1566073771259-6a8506099945',
+  'photo-1551882547-ff40c63fe5fa',
+  'photo-1590490360182-c33d57733427',
+  'photo-1582719478250-c89cae4dc85b',
+  'photo-1596701062351-8c2c14d1fdd0',
+  'photo-1631049307264-da0ec9d70304',
+  'photo-1578683010236-d716f9a3f461',
+  'photo-1540518614846-7eded433c457',
+  'photo-1505693416388-ac5ce068fe85',
+  'photo-1522771739844-6a9f6d5f14af',
+  'photo-1562438668-bcf0ca6578f0',
+  'photo-1595576508898-0ad5c879a061',
+];
+
+const hashString = (s: string): number => {
+  let h = 0;
+  for (let i = 0; i < s.length; i++) {
+    h = ((h << 5) - h) + s.charCodeAt(i);
+    h = h & h;
+  }
+  return Math.abs(h);
 };
 
-// convert date string to date_index (days since epoch)
-export const dateToIndex = (dateStr: string): number => {
-  const d = new Date(dateStr);
-  return Math.floor((d.getTime() - EPOCH.getTime()) / 86400000);
+export const getHotelImageUrl = (hotelId: string, size: { w: number; h: number } = { w: 400, h: 300 }): string => {
+  const idx = hashString(hotelId) % hotelImagePool.length;
+  const photoId = hotelImagePool[idx];
+  return `https://images.unsplash.com/${photoId}?w=${size.w}&h=${size.h}&fit=crop`;
 };
 
-// get current date_index
-export const todayIndex = (): number => {
-  const now = new Date();
-  now.setHours(0, 0, 0, 0);
-  return Math.floor((now.getTime() - EPOCH.getTime()) / 86400000);
-};
+export { dateToDaysFromToday, dateToIndex, todayIndex };

From 96180e9af15eb79f83e694d5ca10f41b5abd5e66 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Tue, 13 Jan 2026 15:36:20 +0100
Subject: [PATCH 18/99] feat: added a runner script for agent orchestration

---
 experiments/agents/run.py | 117 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 experiments/agents/run.py

diff --git a/experiments/agents/run.py b/experiments/agents/run.py
new file mode 100644
index 0000000..823c3d9
--- /dev/null
+++ b/experiments/agents/run.py
@@ -0,0 +1,117 @@
+from supabase import create_client, Client
+import os
+import random
+import asyncio
+import json
+from dotenv import load_dotenv
+
+from experiments.agents.agent import get_agent, AgentTypes
+from lib.kafka_client import get_interactions
+
+load_dotenv()
+
+RESULTS="/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
+
+client = create_client(
+    os.getenv("NEXT_PUBLIC_SUPABASE_URL"),
+    os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY")
+)
+def pick_random_task():
+    mode = 'hotel'
+    tasks = client.table("tasks").select("*").execute().data
+    if mode == 'hotel':
+        # drop all that have 'flight' in the description
+        tasks = [task for task in tasks if 'flight' not in task['task_description'].lower()]
+    return random.choice(tasks) if tasks else None
+
+def clear_kafka_data():
+    """Delete and recreate Kafka topics to clear all data"""
+    from kafka.admin import KafkaAdminClient, NewTopic
+    from kafka.errors import UnknownTopicOrPartitionError
+    import time
+
+    kafka_host = os.getenv('KAFKA_HOST', 'localhost')
+    kafka_port = os.getenv('KAFKA_PORT', '9092')
+    broker = f'{kafka_host}:{kafka_port}'
+
+    admin = KafkaAdminClient(bootstrap_servers=broker)
+    topics = ['user-interactions', 'price-logs']
+
+    try:
+        admin.delete_topics(topics, timeout_ms=5000)
+        print(f"Deleted topics: {topics}")
+        time.sleep(2)
+    except UnknownTopicOrPartitionError:
+        print("Topics don't exist, skipping delete")
+    except Exception as e:
+        print(f"Error deleting topics: {e}")
+
+    new_topics = [
+        NewTopic(name='user-interactions', num_partitions=3, replication_factor=1),
+        NewTopic(name='price-logs', num_partitions=3, replication_factor=1)
+    ]
+
+    try:
+        admin.create_topics(new_topics=new_topics, validate_only=False)
+        print(f"Recreated topics: {topics}")
+    except Exception as e:
+        print(f"Error creating topics: {e}")
+    finally:
+        admin.close()
+
+def create_new_experiment(task_id):
+    import uuid
+    subject_name = f"agent_{str(uuid.uuid4())[:8]}"
+    experiment = {
+        "subject_name": subject_name,
+        "xp_human_only": False,
+        "xp_market_mode": "hotel",
+        "xp_task_id": task_id,
+    }
+    response = client.table("experiments").insert(experiment).execute()
+    return response.data[0] if response.data else None
+
+if __name__ == "__main__":
+    clear_kafka_data()
+
+    task = pick_random_task()
+    if not task:
+        print("No tasks available")
+        exit(1)
+
+    experiment = create_new_experiment(task['id'])
+    exp_id = experiment['id']
+    exp_dir = f"{RESULTS}{exp_id}"
+    os.makedirs(exp_dir, exist_ok=True)
+
+    # construct experiment URL with uuid param
+    base_url = os.getenv('NEXT_PUBLIC_API_BASE', 'http://localhost:3000')
+    agent_url = f"{base_url}/start-task?uuid={exp_id}"
+
+    print(f"Created experiment {exp_id} for task {task['id']}")
+    print(f"Agent will interact with: {agent_url}")
+
+    # instantiate and run agent
+    agent = get_agent(
+        AgentTypes.GENERIC_BROWSER_USE_AGENT,
+        goal=task['task_description'],
+        url=agent_url,
+        timeout=300,
+        headless=True
+    )
+
+    result = asyncio.run(agent.act())
+    print(f"Agent result: {result}")
+
+    # export interaction and price data from kafka
+    interactions = get_interactions(topic='user-interactions', timeout_ms=3000)
+    prices = get_interactions(topic='price-logs', timeout_ms=3000)
+
+    with open(f"{exp_dir}/int.json", 'w') as f:
+        json.dump(interactions, f, indent=2)
+
+    with open(f"{exp_dir}/price.json", 'w') as f:
+        json.dump(prices, f, indent=2)
+
+    print(f"Experiment {exp_id} completed.")
+    print(f"Exported {len(interactions)} interactions and {len(prices)} price logs to {exp_dir}")

From a36973cb42192751ee3f0444cbc6a5e1cdd48678 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Tue, 13 Jan 2026 15:37:06 +0100
Subject: [PATCH 19/99] feat: forgot airflow helper staging

---
 tests/e2e/helpers/airflow.ts | 61 ++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 tests/e2e/helpers/airflow.ts

diff --git a/tests/e2e/helpers/airflow.ts b/tests/e2e/helpers/airflow.ts
new file mode 100644
index 0000000..82d4a75
--- /dev/null
+++ b/tests/e2e/helpers/airflow.ts
@@ -0,0 +1,61 @@
+const AIRFLOW_URL = process.env.AIRFLOW_URL || 'http://localhost:8085';
+const AUTH = 'Basic ' + Buffer.from(`${process.env.AIRFLOW_USER || 'admin'}:${process.env.AIRFLOW_PASS || 'admin'}`).toString('base64');
+
+const req = (path: string, opts: any = {}) => {
+  const headers = { Authorization: AUTH, ...opts.headers };
+  return fetch(`${AIRFLOW_URL}${path}`, { ...opts, headers });
+};
+
+export const triggerDag = async (dagId: string, conf = {}) => {
+  const r = await req(`/api/v1/dags/${dagId}/dagRuns`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify({ conf }),
+  });
+  if (!r.ok) throw new Error(`Trigger DAG failed: ${r.status}`);
+  return (await r.json()).dag_run_id;
+};
+
+export const getDagStatus = async (dagId: string, runId: string) => {
+  const r = await req(`/api/v1/dags/${dagId}/dagRuns/${runId}`);
+  if (!r.ok) throw new Error(`Get status failed: ${r.status}`);
+  return (await r.json()).state;
+};
+
+export const cancelDag = async (dagId: string, runId: string) => {
+  const r = await req(`/api/v1/dags/${dagId}/dagRuns/${runId}`, {
+    method: 'PATCH',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify({ state: 'failed' }),
+  });
+  if (!r.ok) console.warn(`Failed to cancel DAG ${runId}: ${r.status}`);
+};
+
+export const waitForDag = async (dagId: string, runId: string, maxMs = 30000, pollMs = 1000) => {
+  const t0 = Date.now();
+  while (Date.now() - t0 < maxMs) {
+    const state = await getDagStatus(dagId, runId);
+    if (state === 'success') return;
+    if (state === 'failed') throw new Error(`DAG ${runId} failed`);
+    await new Promise(r => setTimeout(r, pollMs));
+  }
+  await cancelDag(dagId, runId);
+  throw new Error(`DAG ${runId} timeout`);
+};
+
+export const runDag = async (dagId: string, conf = {}, maxMs = 60000) => {
+  const runId = await triggerDag(dagId, conf);
+  await waitForDag(dagId, runId, maxMs);
+};
+
+export const runSessionPricing = (mode = 'hotel') =>
+  runDag('session_pricing_pipeline', { store_mode: mode, session_limit: 10 }, 90000);
+
+export const runSurgePricing = (mode = 'hotel', highThresh = 10, lowThresh = 2) =>
+  runDag('surge_pricing_pipeline', {
+    store_mode: mode,
+    high_threshold: highThresh,
+    low_threshold: lowThresh,
+    surge_multiplier: 1.2,
+    discount_multiplier: 0.9
+  }, 90000);

From eea019ab3fca258f686062174a81bd45707e6f6d Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Tue, 13 Jan 2026 15:57:05 +0100
Subject: [PATCH 20/99] feat: introduction of agentic MDPs and KL divergence
 of > 2

---
 sim/rl/behavior_loader/loader.py | 20 +++++++
 sim/rl/behavior_loader/models.py | 89 ++++++++++++++++++++++++++++----
 2 files changed, 98 insertions(+), 11 deletions(-)

diff --git a/sim/rl/behavior_loader/loader.py b/sim/rl/behavior_loader/loader.py
index 99a1541..bd18442 100644
--- a/sim/rl/behavior_loader/loader.py
+++ b/sim/rl/behavior_loader/loader.py
@@ -56,7 +56,27 @@ class Loader:
     def get_entries(self) -> tuple[list[str], int]:
         return self.entries, len(self.entries)
 
+class AgentLoader(Loader):
+    """Loader for agent interaction data with simplified schema (direct PayloadModel format)"""
+
+    def _is_admin_page_simple(self, interaction: PayloadModel) -> bool:
+        return interaction.page and interaction.page.startswith("/admin/")
+
+    def _load_sessions(self) -> dict:
+        sessions = {}
+        for entry in self.entries:
+            int_path = f"{self.src_dir}/{entry}/int.json"
+            raw = json.load(open(int_path))
+            ints = [PayloadModel(**i) for i in raw]
+            sessions[entry] = [i for i in ints if not self._is_admin_page_simple(i)]
+        return sessions
+
 if __name__ == "__main__":
+    DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
+    loader = AgentLoader(DIR)
+    _, n = loader.get_entries()
+    print(f"Loaded {n} sessions from {DIR}")
+
     DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
     loader = Loader(DIR)
     _, n = loader.get_entries()
diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py
index bce2429..7254606 100644
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -1,10 +1,12 @@
-from loader import Loader
+from experiments.agents.base import Agent
+from loader import Loader, AgentLoader
 from collections import defaultdict
 from typing import Dict, List, Tuple, Set
 import numpy as np
 import graphviz
 
 DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
+AGENT_DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
 
 class BehaviorModel:
     def __init__(self, src_dir: str = DIR):
@@ -85,13 +87,32 @@ class BehaviorModel:
             path.append(curr)
         return path
 
-def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "mdp_graph", fmt: str = "svg", view: bool = False, export_dot: bool = False):
-    """visualize MDP as directed graph using graphviz, aggregated by event type"""
-    if not model.mdp: raise ValueError("build MDP first")
+class AgentBehaviorModel(BehaviorModel):
+    """behavior model for agent interaction data (simplified PayloadModel schema)"""
 
-    # aggregate transitions by event type
+    def __init__(self, src_dir: str = AGENT_DIR):
+        self.loader = AgentLoader(src_dir)
+        self.data = self.loader.get_data()
+        self.entries, self.num_entries = self.loader.get_entries()
+        self.mdp = None
+
+    def _state_repr(self, evt) -> str:
+        # direct access to PayloadModel fields (no .value.payload nesting)
+        return f"{evt.page or 'unk'}|{evt.productId or 'none'}|{evt.eventName}"
+
+    def _extract_sessions(self):
+        trajectories = []
+        for sid, evts in self.data.items():
+            if len(evts) < 2: continue
+            # sort by timestamp string (ISO format sorts lexicographically)
+            states = [self._state_repr(e) for e in sorted(evts, key=lambda x: x.ts)]
+            trajectories.append(states)
+        return trajectories
+
+def aggregate_event_transitions(mdp: Dict) -> Dict[str, Dict[str, float]]:
+    """aggregate state transitions by event type and normalize"""
     evt_trans = defaultdict(lambda: defaultdict(float))
-    for s, trans in model.mdp['transitions'].items():
+    for s, trans in mdp['transitions'].items():
         evt_src = s.split('|')[2]
         for s_next, prob in trans.items():
             evt_dst = s_next.split('|')[2]
@@ -103,6 +124,13 @@ def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "
         if total > 0:
             for evt_dst in evt_trans[evt_src]:
                 evt_trans[evt_src][evt_dst] /= total
+    return dict(evt_trans)
+
+def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "mdp_graph", fmt: str = "svg", view: bool = False, export_dot: bool = False):
+    """visualize MDP as directed graph using graphviz, aggregated by event type"""
+    if not model.mdp: raise ValueError("build MDP first")
+
+    evt_trans = aggregate_event_transitions(model.mdp)
 
     g = graphviz.Digraph(format=fmt)
     g.attr(rankdir='LR', size='30')
@@ -134,11 +162,50 @@ def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "
 
     return g
 
+
+def kl_divergence(p: Dict[str, float], q: Dict[str, float]) -> float:
+    """Compute KL divergence D_KL(P || Q) for discrete distributions P and Q."""
+    epsilon = 1e-10  # small constant to avoid log(0)
+    kl_div = 0.0
+    for key in p:
+        p_val = p[key] + epsilon
+        q_val = q.get(key, 0.0) + epsilon
+        kl_div += p_val * np.log(p_val / q_val)
+    return kl_div
+
 if __name__ == "__main__":
-    model = BehaviorModel(DIR)
-    mdp = model.build_MDP()
-    print(f"Built MDP: {mdp['num_states']} states, {sum(len(t) for t in mdp['transitions'].values())} transitions")
-    if not mdp['states']:
+    human_model = BehaviorModel(DIR)
+    human_mdp = human_model.build_MDP()
+    print(f"Built MDP: {human_mdp['num_states']} states, {sum(len(t) for t in human_mdp['transitions'].values())} transitions")
+    if not human_mdp['states']:
         print("No states found")
         exit(1)
-    visualize_mdp(model, threshold=0.05, output="mdp_viz", fmt="pdf", export_dot=True)
+    visualize_mdp(human_model, threshold=0.05, output="human_mdp_viz", fmt="pdf", export_dot=True)
+
+    agent_model = AgentBehaviorModel()
+    agent_mdp = agent_model.build_MDP()
+    print(f"AGENT... Built MDP: {agent_mdp['num_states']} states, {sum(len(t) for t in agent_mdp['transitions'].values())} transitions")
+    if not agent_mdp['states']:
+        print("No states found")
+        exit(1)
+    visualize_mdp(agent_model, threshold=0.05, output="agent_mdp_viz", fmt="pdf", export_dot=True)
+
+    # aggregate transitions by event type for both models
+    human_evt_trans = aggregate_event_transitions(human_mdp)
+    agent_evt_trans = aggregate_event_transitions(agent_mdp)
+
+    common_evts = set(human_evt_trans.keys()) & set(agent_evt_trans.keys())
+    if not common_evts: import sys; sys.exit("No common event types for KL divergence analysis")
+
+    kl_divs = []
+    for evt in common_evts:
+        kl = kl_divergence(human_evt_trans[evt], agent_evt_trans[evt])
+        kl_divs.append((evt, kl))
+
+    kl_divs.sort(key=lambda x: x[1], reverse=True)
+    avg_kl = np.mean([kl for _, kl in kl_divs])
+
+    print(f"Average KL divergence: {avg_kl:.4f}")
+    print(f"\nMost divergent event types:")
+    for evt, kl in kl_divs:
+        print(f"  {evt}: {kl:.4f}")

From 6f361b96a813d52bfb6132a863a07df5e7401591 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Tue, 13 Jan 2026 16:42:50 +0100
Subject: [PATCH 21/99] feat: joint loader

---
 sim/rl/behavior_loader/loader.py | 47 ++++++++++++++++++++++++++------
 sim/rl/behavior_loader/models.py | 32 +++++++++++++++++++++-
 2 files changed, 70 insertions(+), 9 deletions(-)

diff --git a/sim/rl/behavior_loader/loader.py b/sim/rl/behavior_loader/loader.py
index bd18442..620576c 100644
--- a/sim/rl/behavior_loader/loader.py
+++ b/sim/rl/behavior_loader/loader.py
@@ -71,13 +71,44 @@ class AgentLoader(Loader):
             sessions[entry] = [i for i in ints if not self._is_admin_page_simple(i)]
         return sessions
 
-if __name__ == "__main__":
-    DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
-    loader = AgentLoader(DIR)
-    _, n = loader.get_entries()
-    print(f"Loaded {n} sessions from {DIR}")
+class JointLoader:
+    """Loader for combined human (Kafka) and agent (direct) data without discrimination"""
 
-    DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
-    loader = Loader(DIR)
+    def __init__(self, human_dir: str, agent_dir: str):
+        self.human_dir = human_dir
+        self.agent_dir = agent_dir
+        self.human_loader = Loader(human_dir)
+        self.agent_loader = AgentLoader(agent_dir)
+        self.data = self._load_joint_sessions()
+        self.entries = list(self.data.keys())
+
+    def _load_joint_sessions(self) -> dict:
+        sessions = {}
+        # load human sessions (unwrap from Kafka format to PayloadModel)
+        for sid, evts in self.human_loader.get_data().items():
+            sessions[f"human_{sid}"] = [evt.value.payload for evt in evts]
+        # load agent sessions (already PayloadModel)
+        for sid, evts in self.agent_loader.get_data().items():
+            sessions[f"agent_{sid}"] = evts
+        return sessions
+
+    def get_data(self) -> dict:
+        return self.data
+
+    def get_entries(self) -> tuple[list[str], int]:
+        return self.entries, len(self.entries)
+
+if __name__ == "__main__":
+    AGENT_DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
+    loader = AgentLoader(AGENT_DIR)
     _, n = loader.get_entries()
-    print(f"Loaded {n} sessions from {DIR}")
+    print(f"Loaded {n} agent sessions from {AGENT_DIR}")
+
+    HUMAN_DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
+    loader = Loader(HUMAN_DIR)
+    _, n = loader.get_entries()
+    print(f"Loaded {n} human sessions from {HUMAN_DIR}")
+
+    joint_loader = JointLoader(HUMAN_DIR, AGENT_DIR)
+    _, n = joint_loader.get_entries()
+    print(f"Loaded {n} total sessions (combined) from joint loader")
diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py
index 7254606..46ac99d 100644
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -1,5 +1,5 @@
 from experiments.agents.base import Agent
-from loader import Loader, AgentLoader
+from loader import Loader, AgentLoader, JointLoader
 from collections import defaultdict
 from typing import Dict, List, Tuple, Set
 import numpy as np
@@ -109,6 +109,28 @@ class AgentBehaviorModel(BehaviorModel):
             trajectories.append(states)
         return trajectories
 
+class JointBehaviorModel(BehaviorModel):
+    """behavior model for combined human+agent data (flat PayloadModel distribution)"""
+
+    def __init__(self, human_dir: str = DIR, agent_dir: str = AGENT_DIR):
+        self.loader = JointLoader(human_dir, agent_dir)
+        self.data = self.loader.get_data()
+        self.entries, self.num_entries = self.loader.get_entries()
+        self.mdp = None
+
+    def _state_repr(self, evt) -> str:
+        # direct access to PayloadModel fields (JointLoader unwraps to PayloadModel)
+        return f"{evt.page or 'unk'}|{evt.productId or 'none'}|{evt.eventName}"
+
+    def _extract_sessions(self):
+        trajectories = []
+        for sid, evts in self.data.items():
+            if len(evts) < 2: continue
+            # sort by timestamp string (ISO format sorts lexicographically)
+            states = [self._state_repr(e) for e in sorted(evts, key=lambda x: x.ts)]
+            trajectories.append(states)
+        return trajectories
+
 def aggregate_event_transitions(mdp: Dict) -> Dict[str, Dict[str, float]]:
     """aggregate state transitions by event type and normalize"""
     evt_trans = defaultdict(lambda: defaultdict(float))
@@ -209,3 +231,11 @@ if __name__ == "__main__":
     print(f"\nMost divergent event types:")
     for evt, kl in kl_divs:
         print(f"  {evt}: {kl:.4f}")
+
+    # build joint model (combined distribution)
+    print("\n=== Joint Model (Human + Agent Combined) ===")
+    joint_model = JointBehaviorModel()
+    joint_mdp = joint_model.build_MDP()
+    print(f"Built joint MDP: {joint_mdp['num_states']} states, {sum(len(t) for t in joint_mdp['transitions'].values())} transitions")
+    if joint_mdp['states']:
+        visualize_mdp(joint_model, threshold=0.05, output="joint_mdp_viz", fmt="pdf", export_dot=True)

From a1e31663223736694327127dc006b441d91ddd81 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Tue, 13 Jan 2026 16:46:17 +0100
Subject: [PATCH 22/99] chore: refactor the loader class

---
 sim/rl/behavior_loader/loader.py | 67 ++++++++++++--------------------
 1 file changed, 25 insertions(+), 42 deletions(-)

diff --git a/sim/rl/behavior_loader/loader.py b/sim/rl/behavior_loader/loader.py
index 620576c..3336956 100644
--- a/sim/rl/behavior_loader/loader.py
+++ b/sim/rl/behavior_loader/loader.py
@@ -1,6 +1,6 @@
 import os
-from pydantic import BaseModel as Base
 import json
+from pydantic import BaseModel as Base
 
 class PayloadModel(Base):
     sessionId: str
@@ -30,6 +30,9 @@ class InteractionModel(Base):
     key: dict
     value: ValueModel
 
+def _is_admin(page: str | None) -> bool:
+    return page is not None and page.startswith("/admin/")
+
 class Loader:
     def __init__(self, src_dir: str):
         self.src_dir = src_dir
@@ -37,17 +40,13 @@ class Loader:
         if not self.entries: raise ValueError("empty directory")
         self.data = self._load_sessions()
 
-    def _is_admin_page(self, interaction: InteractionModel) -> bool:
-        page = interaction.value.payload.page
-        return page and page.startswith("/admin/")
-
     def _load_sessions(self) -> dict:
         sessions = {}
         for entry in self.entries:
-            int_path = f"{self.src_dir}/{entry}/int.json"
-            raw = json.load(open(int_path))
+            with open(f"{self.src_dir}/{entry}/int.json") as f:
+                raw = json.load(f)
             ints = [InteractionModel(**i) for i in raw]
-            sessions[entry] = [i for i in ints if not self._is_admin_page(i)]
+            sessions[entry] = [i for i in ints if not _is_admin(i.value.payload.page)]
         return sessions
 
     def get_data(self) -> dict:
@@ -57,40 +56,29 @@ class Loader:
         return self.entries, len(self.entries)
 
 class AgentLoader(Loader):
-    """Loader for agent interaction data with simplified schema (direct PayloadModel format)"""
-
-    def _is_admin_page_simple(self, interaction: PayloadModel) -> bool:
-        return interaction.page and interaction.page.startswith("/admin/")
-
     def _load_sessions(self) -> dict:
         sessions = {}
         for entry in self.entries:
-            int_path = f"{self.src_dir}/{entry}/int.json"
-            raw = json.load(open(int_path))
+            with open(f"{self.src_dir}/{entry}/int.json") as f:
+                raw = json.load(f)
             ints = [PayloadModel(**i) for i in raw]
-            sessions[entry] = [i for i in ints if not self._is_admin_page_simple(i)]
+            sessions[entry] = [i for i in ints if not _is_admin(i.page)]
         return sessions
 
 class JointLoader:
-    """Loader for combined human (Kafka) and agent (direct) data without discrimination"""
-
     def __init__(self, human_dir: str, agent_dir: str):
-        self.human_dir = human_dir
-        self.agent_dir = agent_dir
         self.human_loader = Loader(human_dir)
         self.agent_loader = AgentLoader(agent_dir)
-        self.data = self._load_joint_sessions()
+        self.data = self._merge()
         self.entries = list(self.data.keys())
 
-    def _load_joint_sessions(self) -> dict:
-        sessions = {}
-        # load human sessions (unwrap from Kafka format to PayloadModel)
-        for sid, evts in self.human_loader.get_data().items():
-            sessions[f"human_{sid}"] = [evt.value.payload for evt in evts]
-        # load agent sessions (already PayloadModel)
-        for sid, evts in self.agent_loader.get_data().items():
-            sessions[f"agent_{sid}"] = evts
-        return sessions
+    def _merge(self) -> dict:
+        return {
+            **{f"human_{sid}": [e.value.payload for e in evts]
+               for sid, evts in self.human_loader.get_data().items()},
+            **{f"agent_{sid}": evts
+               for sid, evts in self.agent_loader.get_data().items()}
+        }
 
     def get_data(self) -> dict:
         return self.data
@@ -99,16 +87,11 @@ class JointLoader:
         return self.entries, len(self.entries)
 
 if __name__ == "__main__":
-    AGENT_DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
-    loader = AgentLoader(AGENT_DIR)
-    _, n = loader.get_entries()
-    print(f"Loaded {n} agent sessions from {AGENT_DIR}")
+    agent_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
+    human_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
 
-    HUMAN_DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
-    loader = Loader(HUMAN_DIR)
-    _, n = loader.get_entries()
-    print(f"Loaded {n} human sessions from {HUMAN_DIR}")
-
-    joint_loader = JointLoader(HUMAN_DIR, AGENT_DIR)
-    _, n = joint_loader.get_entries()
-    print(f"Loaded {n} total sessions (combined) from joint loader")
+    for name, cls, path in [("agent", AgentLoader, agent_dir),
+                             ("human", Loader, human_dir),
+                             ("joint", lambda d: JointLoader(human_dir, d), agent_dir)]:
+        ldr = cls(path) if name != "joint" else cls(agent_dir)
+        print(f"Loaded {len(ldr.get_entries()[0])} {name} sessions")

From 3072e5f46e13cab788afefe60b1b2e22289ace37 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Tue, 13 Jan 2026 16:51:00 +0100
Subject: [PATCH 23/99] refactor models computations

---
 sim/rl/behavior_loader/models.py | 186 ++++++++++++-------------------
 1 file changed, 69 insertions(+), 117 deletions(-)

diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py
index 46ac99d..84c2fe4 100644
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -1,16 +1,12 @@
-from experiments.agents.base import Agent
 from loader import Loader, AgentLoader, JointLoader
 from collections import defaultdict
 from typing import Dict, List, Tuple, Set
 import numpy as np
 import graphviz
 
-DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
-AGENT_DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
-
 class BehaviorModel:
-    def __init__(self, src_dir: str = DIR):
-        self.loader = Loader(src_dir)
+    def __init__(self, src_dir: str, loader_cls=Loader):
+        self.loader = loader_cls(src_dir)
         self.data = self.loader.get_data()
         self.entries, self.num_entries = self.loader.get_entries()
         self.mdp = None
@@ -19,50 +15,48 @@ class BehaviorModel:
         p = evt.value.payload
         return f"{p.page or 'unk'}|{p.productId or 'none'}|{p.eventName}"
 
-    def _extract_sessions(self):
-        # transform raw events into sequential state trajectories per session
-        trajectories = []
-        for sid, evts in self.data.items():
-            if len(evts) < 2: continue
-            states = [self._state_repr(e) for e in sorted(evts, key=lambda x: x.timestamp)]
-            trajectories.append(states)
-        return trajectories
+    def _sort_key(self, evt):
+        return evt.timestamp
 
-    def _calc_transitions(self, trajectories: List[List[str]]) -> Tuple[Dict, Set]:
-        trans = defaultdict(lambda: defaultdict(int))
-        states = set()
-        for traj in trajectories:
-            for i in range(len(traj) - 1):
-                s, s_next = traj[i], traj[i+1]
+    def _extract_sessions(self) -> List[List[str]]:
+        trajs = []
+        for evts in self.data.values():
+            if len(evts) < 2: continue
+            states = [self._state_repr(e) for e in sorted(evts, key=self._sort_key)]
+            trajs.append(states)
+        return trajs
+
+    def _calc_transitions(self, trajs: List[List[str]]) -> Tuple[Dict, Set]:
+        trans, states = defaultdict(lambda: defaultdict(int)), set()
+        for traj in trajs:
+            for s, s_next in zip(traj, traj[1:]):
                 trans[s][s_next] += 1
                 states.update([s, s_next])
         return trans, states
 
-    def _calc_rewards(self, trajectories: List[List[str]]) -> Dict:
-        # reward based on session progression depth
+    def _calc_rewards(self, trajs: List[List[str]]) -> Dict:
         rwd = defaultdict(list)
-        for traj in trajectories:
+        for traj in trajs:
             n = len(traj)
             for i, s in enumerate(traj):
                 rwd[s].append(i / n)
         return rwd
 
-    def _normalize_trans(self, counts: Dict) -> Dict:
+    def _normalize_trans(self, cnts: Dict) -> Dict:
         return {s: {s_n: cnt/sum(nxt.values()) for s_n, cnt in nxt.items()}
-                for s, nxt in counts.items()}
+                for s, nxt in cnts.items()}
 
     def build_MDP(self) -> Dict:
         trajs = self._extract_sessions()
         trans_cnt, states = self._calc_transitions(trajs)
         trans_prob = self._normalize_trans(trans_cnt)
         state_rwd = self._calc_rewards(trajs)
-        state_val = {s: np.mean(r) for s, r in state_rwd.items()}
 
         self.mdp = {
-            'states': sorted(list(states)),
+            'states': sorted(states),
             'num_states': len(states),
             'transitions': trans_prob,
-            'state_values': state_val,
+            'state_values': {s: np.mean(r) for s, r in state_rwd.items()},
             'state_rewards': state_rwd,
             'trans_counts': trans_cnt,
         }
@@ -78,8 +72,7 @@ class BehaviorModel:
 
     def sample_traj(self, start: str, max_len: int = 50) -> List[str]:
         if not self.mdp: raise ValueError("build MDP first")
-        path = [start]
-        curr = start
+        path, curr = [start], start
         for _ in range(max_len):
             nxt = self.mdp['transitions'].get(curr, {})
             if not nxt: break
@@ -88,154 +81,113 @@ class BehaviorModel:
         return path
 
 class AgentBehaviorModel(BehaviorModel):
-    """behavior model for agent interaction data (simplified PayloadModel schema)"""
-
-    def __init__(self, src_dir: str = AGENT_DIR):
-        self.loader = AgentLoader(src_dir)
-        self.data = self.loader.get_data()
-        self.entries, self.num_entries = self.loader.get_entries()
-        self.mdp = None
+    def __init__(self, src_dir: str):
+        super().__init__(src_dir, AgentLoader)
 
     def _state_repr(self, evt) -> str:
-        # direct access to PayloadModel fields (no .value.payload nesting)
         return f"{evt.page or 'unk'}|{evt.productId or 'none'}|{evt.eventName}"
 
-    def _extract_sessions(self):
-        trajectories = []
-        for sid, evts in self.data.items():
-            if len(evts) < 2: continue
-            # sort by timestamp string (ISO format sorts lexicographically)
-            states = [self._state_repr(e) for e in sorted(evts, key=lambda x: x.ts)]
-            trajectories.append(states)
-        return trajectories
+    def _sort_key(self, evt):
+        return evt.ts
 
 class JointBehaviorModel(BehaviorModel):
-    """behavior model for combined human+agent data (flat PayloadModel distribution)"""
-
-    def __init__(self, human_dir: str = DIR, agent_dir: str = AGENT_DIR):
+    def __init__(self, human_dir: str, agent_dir: str):
         self.loader = JointLoader(human_dir, agent_dir)
         self.data = self.loader.get_data()
         self.entries, self.num_entries = self.loader.get_entries()
         self.mdp = None
 
     def _state_repr(self, evt) -> str:
-        # direct access to PayloadModel fields (JointLoader unwraps to PayloadModel)
         return f"{evt.page or 'unk'}|{evt.productId or 'none'}|{evt.eventName}"
 
-    def _extract_sessions(self):
-        trajectories = []
-        for sid, evts in self.data.items():
-            if len(evts) < 2: continue
-            # sort by timestamp string (ISO format sorts lexicographically)
-            states = [self._state_repr(e) for e in sorted(evts, key=lambda x: x.ts)]
-            trajectories.append(states)
-        return trajectories
+    def _sort_key(self, evt):
+        return evt.ts
 
 def aggregate_event_transitions(mdp: Dict) -> Dict[str, Dict[str, float]]:
-    """aggregate state transitions by event type and normalize"""
     evt_trans = defaultdict(lambda: defaultdict(float))
     for s, trans in mdp['transitions'].items():
-        evt_src = s.split('|')[2]
+        src = s.split('|')[2]
         for s_next, prob in trans.items():
-            evt_dst = s_next.split('|')[2]
-            evt_trans[evt_src][evt_dst] += prob
+            dst = s_next.split('|')[2]
+            evt_trans[src][dst] += prob
 
-    # normalize aggregated transitions
-    for evt_src in evt_trans:
-        total = sum(evt_trans[evt_src].values())
+    for src in evt_trans:
+        total = sum(evt_trans[src].values())
         if total > 0:
-            for evt_dst in evt_trans[evt_src]:
-                evt_trans[evt_src][evt_dst] /= total
+            evt_trans[src] = {dst: p/total for dst, p in evt_trans[src].items()}
     return dict(evt_trans)
 
-def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "mdp_graph", fmt: str = "svg", view: bool = False, export_dot: bool = False):
-    """visualize MDP as directed graph using graphviz, aggregated by event type"""
+def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "mdp_graph",
+                  fmt: str = "svg", view: bool = False, export_dot: bool = False):
     if not model.mdp: raise ValueError("build MDP first")
 
     evt_trans = aggregate_event_transitions(model.mdp)
-
     g = graphviz.Digraph(format=fmt)
     g.attr(rankdir='LR', size='30')
     g.attr('node', shape='circle', width='1', height='1')
 
-    # collect all event types
-    events = set(evt_trans.keys())
-    for trans in evt_trans.values():
-        events.update(trans.keys())
-
-    # add nodes for each event type
+    events = set(evt_trans.keys()) | {e for trans in evt_trans.values() for e in trans.keys()}
     for evt in events:
         g.node(evt)
 
-    # add edges above threshold
-    for evt_src in evt_trans:
-        for evt_dst, prob in evt_trans[evt_src].items():
+    for src, dsts in evt_trans.items():
+        for dst, prob in dsts.items():
             if prob > threshold:
-                g.edge(evt_src, evt_dst, label=f'{prob:.2f}')
+                g.edge(src, dst, label=f'{prob:.2f}')
 
     g.render(output, view=view, cleanup=True)
     print(f"Saved MDP graph to {output}.{fmt}")
 
     if export_dot:
-        dot_file = f"{output}.dot"
-        with open(dot_file, 'w') as f:
+        with open(f"{output}.dot", 'w') as f:
             f.write(g.source)
-        print(f"Exported DOT source to {dot_file}")
+        print(f"Exported DOT source to {output}.dot")
 
     return g
 
-
 def kl_divergence(p: Dict[str, float], q: Dict[str, float]) -> float:
-    """Compute KL divergence D_KL(P || Q) for discrete distributions P and Q."""
-    epsilon = 1e-10  # small constant to avoid log(0)
-    kl_div = 0.0
-    for key in p:
-        p_val = p[key] + epsilon
-        q_val = q.get(key, 0.0) + epsilon
-        kl_div += p_val * np.log(p_val / q_val)
-    return kl_div
+    eps = 1e-10
+    return sum((p[k] + eps) * np.log((p[k] + eps) / (q.get(k, 0.0) + eps)) for k in p)
 
 if __name__ == "__main__":
-    human_model = BehaviorModel(DIR)
+    base_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments"
+    human_dir, agent_dir = f"{base_dir}/collected_data/", f"{base_dir}/agents/collected_data/"
+
+    human_model = BehaviorModel(human_dir)
     human_mdp = human_model.build_MDP()
-    print(f"Built MDP: {human_mdp['num_states']} states, {sum(len(t) for t in human_mdp['transitions'].values())} transitions")
+    print(f"Built MDP: {human_mdp['num_states']} states, "
+          f"{sum(len(t) for t in human_mdp['transitions'].values())} transitions")
     if not human_mdp['states']:
-        print("No states found")
-        exit(1)
+        exit("No states found")
     visualize_mdp(human_model, threshold=0.05, output="human_mdp_viz", fmt="pdf", export_dot=True)
 
-    agent_model = AgentBehaviorModel()
+    agent_model = AgentBehaviorModel(agent_dir)
     agent_mdp = agent_model.build_MDP()
-    print(f"AGENT... Built MDP: {agent_mdp['num_states']} states, {sum(len(t) for t in agent_mdp['transitions'].values())} transitions")
+    print(f"AGENT... Built MDP: {agent_mdp['num_states']} states, "
+          f"{sum(len(t) for t in agent_mdp['transitions'].values())} transitions")
     if not agent_mdp['states']:
-        print("No states found")
-        exit(1)
+        exit("No states found")
     visualize_mdp(agent_model, threshold=0.05, output="agent_mdp_viz", fmt="pdf", export_dot=True)
 
-    # aggregate transitions by event type for both models
-    human_evt_trans = aggregate_event_transitions(human_mdp)
-    agent_evt_trans = aggregate_event_transitions(agent_mdp)
+    human_evt = aggregate_event_transitions(human_mdp)
+    agent_evt = aggregate_event_transitions(agent_mdp)
+    common = set(human_evt.keys()) & set(agent_evt.keys())
 
-    common_evts = set(human_evt_trans.keys()) & set(agent_evt_trans.keys())
-    if not common_evts: import sys; sys.exit("No common event types for KL divergence analysis")
+    if not common:
+        exit("No common event types for KL divergence analysis")
 
-    kl_divs = []
-    for evt in common_evts:
-        kl = kl_divergence(human_evt_trans[evt], agent_evt_trans[evt])
-        kl_divs.append((evt, kl))
+    kl_divs = sorted([(e, kl_divergence(human_evt[e], agent_evt[e])) for e in common],
+                     key=lambda x: x[1], reverse=True)
 
-    kl_divs.sort(key=lambda x: x[1], reverse=True)
-    avg_kl = np.mean([kl for _, kl in kl_divs])
-
-    print(f"Average KL divergence: {avg_kl:.4f}")
-    print(f"\nMost divergent event types:")
+    print(f"Average KL divergence: {np.mean([kl for _, kl in kl_divs]):.4f}")
+    print("\nMost divergent event types:")
     for evt, kl in kl_divs:
         print(f"  {evt}: {kl:.4f}")
 
-    # build joint model (combined distribution)
     print("\n=== Joint Model (Human + Agent Combined) ===")
-    joint_model = JointBehaviorModel()
+    joint_model = JointBehaviorModel(human_dir, agent_dir)
     joint_mdp = joint_model.build_MDP()
-    print(f"Built joint MDP: {joint_mdp['num_states']} states, {sum(len(t) for t in joint_mdp['transitions'].values())} transitions")
+    print(f"Built joint MDP: {joint_mdp['num_states']} states, "
+          f"{sum(len(t) for t in joint_mdp['transitions'].values())} transitions")
     if joint_mdp['states']:
         visualize_mdp(joint_model, threshold=0.05, output="joint_mdp_viz", fmt="pdf", export_dot=True)

From 95d4f0cee2e921d444c7e8a3ba5177be22f6f7ab Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Tue, 13 Jan 2026 19:50:36 +0100
Subject: [PATCH 24/99] chore: ignores

---
 .gitignore | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index 733e405..9101b2f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,12 +5,19 @@
 **/.virtual_documents/
 **/session_*.svg
 **/*graph.svg
-paper/src/bib/auto
+**/auto/*.el
+*.old
+**/package-lock.json
+**/*.parquet
 
-# Airflow logs - exclude DAG run logs
+paper/src/bib/auto
 experiments/airflow/logs/*
 experiments/airflow/logs/scheduler/
 experiments/airflow/logs/dag_processor_manager/
+experiments/collected_data/
+experiments/agents/collected_data/
+sim/rl/behavior_loader/*.dot
+sim/rl/behavior_loader/*.png
+sim/rl/behavior_loader/*.svg
+sim/rl/behavior_loader/*.pdf
 tests/e2e/node_modules/**
-**/auto/*.el
-*.old

From 08ade8dc891f0c417154479fa65b8d3f3b7a708e Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Tue, 20 Jan 2026 21:00:47 +0100
Subject: [PATCH 25/99] feat: wip contaminator

---
 experiments/procesing/contaminator.py | 44 +++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 experiments/procesing/contaminator.py

diff --git a/experiments/procesing/contaminator.py b/experiments/procesing/contaminator.py
new file mode 100644
index 0000000..0a3651d
--- /dev/null
+++ b/experiments/procesing/contaminator.py
@@ -0,0 +1,44 @@
+import pandas as pd
+import random
+from sim.rl.behavior_loader import AgentBehaviorModel
+
+base_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments"
+human_dir, agent_dir = f"{base_dir}/collected_data/", f"{base_dir}/agents/collected_data/"
+
+
+
+def remap_schema(df : pd.DataFrame, mapping: dict, on: str = "event_type"):
+    df = df.copy()
+    df[on] = df[on].map(mapping).fillna(df[on])
+    return df
+
+
+def contaminate_dataset(df : pd.DataFrame, on : str = "event_type",
+                        contamination_rate: float = 0.1) -> pd.DataFrame:
+    model = AgentBehaviorModel(agent_dir)
+    target_df_schema = df[on].unique().tolist()
+    mapping = {
+        'view': 'view_page'
+        # TODO: define properly for the given dataset
+    }
+    OG_event_distribution = df[on].value_counts(normalize=True).to_dict()
+    # normalize to weights
+    OG_event_distribution = {k: v / sum(OG_event_distribution.values()) for k, v in OG_event_distribution.items()}
+    mapped_df = remap_schema(df, mapping, on=on)
+    N = len(df)
+    N_final = N / (1 - contamination_rate) # TODO: explain this in paper
+    N_contaminate = int(N_final - N)
+    start_event_types = random.choices(list(OG_event_distribution.keys()),
+                                    weights=list(OG_event_distribution.values()), k=N_contaminate)
+    # it makes sense
+    new_trajectories = []
+    for start_event in start_event_types:
+        # sample from og start
+        start = None # TODO: define start according to dataset (randomly sample with weights of event distribution)
+        trajectory = model.sample_trajectory(start) # TODO: explain this method in paper
+        new_trajectories.extend(trajectory)
+
+    # TODO: make sure the new trajectories' schema conforms to the dataset
+    contaminate_df = pd.DataFrame(new_trajectories)
+    df = pd.concat([df, contaminate_df], ignore_index=True)
+    return df

From c102ac482e1727e4da8695d735bbf2f2629e75aa Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 21 Jan 2026 11:11:49 +0100
Subject: [PATCH 26/99] chore: extra commenting

---
 experiments/procesing/contaminator.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/experiments/procesing/contaminator.py b/experiments/procesing/contaminator.py
index 0a3651d..da44c3d 100644
--- a/experiments/procesing/contaminator.py
+++ b/experiments/procesing/contaminator.py
@@ -1,9 +1,9 @@
 import pandas as pd
 import random
-from sim.rl.behavior_loader import AgentBehaviorModel
+from sim.rl.behavior_loader import AgentBehaviorModel # TODO: import this properly
 
 base_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments"
-human_dir, agent_dir = f"{base_dir}/collected_data/", f"{base_dir}/agents/collected_data/"
+agent_dir = f"{base_dir}/agents/collected_data/"
 
 
 
@@ -21,6 +21,7 @@ def contaminate_dataset(df : pd.DataFrame, on : str = "event_type",
         'view': 'view_page'
         # TODO: define properly for the given dataset
     }
+    # think about replacing with freqdist method from library
     OG_event_distribution = df[on].value_counts(normalize=True).to_dict()
     # normalize to weights
     OG_event_distribution = {k: v / sum(OG_event_distribution.values()) for k, v in OG_event_distribution.items()}

From a5029f2eabd0d7ac2eaa7187233598c2ef694c41 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 21 Jan 2026 11:27:03 +0100
Subject: [PATCH 27/99] feat: weak train scaffold

---
 experiments/ml/arch.py       | 117 +++--------------------------------
 experiments/ml/weak.train.py |  30 +++++++++
 2 files changed, 39 insertions(+), 108 deletions(-)
 create mode 100644 experiments/ml/weak.train.py

diff --git a/experiments/ml/arch.py b/experiments/ml/arch.py
index 4f36e18..a187959 100644
--- a/experiments/ml/arch.py
+++ b/experiments/ml/arch.py
@@ -12,111 +12,12 @@ TASK = 'classification'
 LABELS = ['human', 'agent']
 
 
-class BaseAgentClassifier(BaseEstimator, ClassifierMixin, ABC):
-    """Base class for tree-based agent detection classifiers with common logic"""
-
-    def __init__(self, context: Optional[PipelineContext] = None, n_estimators: int = 200,
-                 max_depth: int = 6, learning_rate: float = 0.05,
-                 early_stopping_rounds: int = 20):
-        self.context = context
-        self.n_estimators = n_estimators
-        self.max_depth = max_depth
-        self.learning_rate = learning_rate
-        self.early_stopping_rounds = early_stopping_rounds
-        self.model_ = None
-        self.feature_names_ = None
-
-    def _to_array(self, X):
-        """Convert pandas structures to numpy arrays"""
-        return X.values if isinstance(X, (pd.DataFrame, pd.Series)) else X
-
-    def _compute_pos_weight(self, y_arr):
-        """Calculate scale_pos_weight for class imbalance handling"""
-        n_neg, n_pos = (y_arr == 0).sum(), (y_arr == 1).sum()
-        return n_neg / n_pos if n_pos > 0 else 1.0
-
-    def _prepare_eval_set(self, eval_set):
-        """Convert eval_set to numpy arrays if needed"""
-        if not eval_set:
-            return None
-        X_val, y_val = eval_set[0]
-        return [(self._to_array(X_val), self._to_array(y_val))]
-
-    @abstractmethod
-    def _build_model(self, scale_pos: float):
-        """Build the underlying model instance (must be implemented by subclasses)"""
-        pass
-
-    @abstractmethod
-    def _fit_with_eval(self, X_arr, y_arr, eval_arr):
-        """Fit model with evaluation set (must be implemented by subclasses)"""
-        pass
-
-    def fit(self, X, y, eval_set=None):
-        X_arr, y_arr = self._to_array(X), self._to_array(y)
-
-        if isinstance(X, pd.DataFrame):
-            self.feature_names_ = X.columns.tolist()
-
-        scale_pos = self._compute_pos_weight(y_arr)
-        self.model_ = self._build_model(scale_pos)
-
-        eval_arr = self._prepare_eval_set(eval_set)
-        if eval_arr:
-            self._fit_with_eval(X_arr, y_arr, eval_arr)
-        else:
-            self.model_.fit(X_arr, y_arr)
-
-        return self
-
-    def predict(self, X):
-        return self.model_.predict(self._to_array(X))
-
-    def predict_proba(self, X):
-        return self.model_.predict_proba(self._to_array(X))
-
-    @property
-    def feature_importances_(self):
-        return self.model_.feature_importances_ if self.model_ else None
-
-
-class XGBoostAgentClassifier(BaseAgentClassifier):
-    """XGBoost binary classifier for agent detection with class imbalance handling"""
-
-    def _build_model(self, scale_pos: float):
-        return xgb.XGBClassifier(
-            n_estimators=self.n_estimators,
-            max_depth=self.max_depth,
-            learning_rate=self.learning_rate,
-            scale_pos_weight=scale_pos,
-            eval_metric='auc',
-            early_stopping_rounds=self.early_stopping_rounds,
-            random_state=42,
-            tree_method='hist',
-            enable_categorical=False
-        )
-
-    def _fit_with_eval(self, X_arr, y_arr, eval_arr):
-        self.model_.fit(X_arr, y_arr, eval_set=eval_arr, verbose=False)
-
-
-class LightGBMAgentClassifier(BaseAgentClassifier):
-    """LightGBM binary classifier for agent detection with class imbalance handling"""
-
-    def _build_model(self, scale_pos: float):
-        return lgb.LGBMClassifier(
-            n_estimators=self.n_estimators,
-            max_depth=self.max_depth,
-            learning_rate=self.learning_rate,
-            scale_pos_weight=scale_pos,
-            metric='auc',
-            random_state=42,
-            verbosity=-1
-        )
-
-    def _fit_with_eval(self, X_arr, y_arr, eval_arr):
-        self.model_.fit(
-            X_arr, y_arr,
-            eval_set=eval_arr,
-            callbacks=[lgb.early_stopping(self.early_stopping_rounds, verbose=False)]
-        )
+class WeakClassifier(BaseEstimator, ClassifierMixin, ABC):
+    # a simple contrastive machine learning model
+    # this model should learn to distinguish between human and agent behavior
+    # using a weakly supervised approach and contrastive learning + augmentation
+    #
+    def __init__(self, **kwargs):
+        super().__init__()
+        self.model = None
+        self.kwargs = kwargs
diff --git a/experiments/ml/weak.train.py b/experiments/ml/weak.train.py
new file mode 100644
index 0000000..36e11ee
--- /dev/null
+++ b/experiments/ml/weak.train.py
@@ -0,0 +1,30 @@
+from sim.rl.behavior_loader.loader import AgentLoader, Loader, JointLoader
+from sim.rl.behavior_loader.loader import PayloadModel
+from arch import WeakClassifier
+
+agent_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
+human_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
+
+def augment_trajectory(trajectory : list[PayloadModel], augmentation_rate: float = 0.1) -> list[PayloadModel]:
+    # augmentations possible:
+    # return a sub-trajectory window of the original trajectory
+    # insert random noise events
+    # shuffle a few events (find a few indices and swap them with i+1 neighbor)
+    # adjust metadata
+    return trajectory
+
+
+def train():
+    pass
+
+
+
+if __name__ == "__main__":
+    joint_loader = JointLoader(human_dir, agent_dir)
+    data = joint_loader.get_data()
+    entries, num_entries = joint_loader.get_entries()
+    print(f"Loaded {num_entries} entries")
+    # TODO: augment
+    # fit model
+    model = WeakClassifier()
+    model.fit(data)

From 80863e9b17d59de0701547ffe9d277c435e93e6d Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 21 Jan 2026 14:05:30 +0100
Subject: [PATCH 28/99] strong dataset gathering

---
 sim/strong_learner/data.py | 99 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)
 create mode 100644 sim/strong_learner/data.py

diff --git a/sim/strong_learner/data.py b/sim/strong_learner/data.py
new file mode 100644
index 0000000..80129aa
--- /dev/null
+++ b/sim/strong_learner/data.py
@@ -0,0 +1,99 @@
+import os, requests, py7zr
+import pandas as pd
+from typing import Generator
+try:
+    from sim.rl.behavior_loader.loader import PayloadModel, ValueModel, InteractionModel, Loader
+except ImportError:
+    from loader import PayloadModel, ValueModel, InteractionModel, Loader
+
+class YooChooseLoader(Loader):  # YooChoose click/buy logs exposed as per-session InteractionModel lists
+    URL = "https://s3-eu-west-1.amazonaws.com/yc-rdata/yoochoose-data.7z"
+    CLICK_COLS = ['session_id', 'ts', 'item_id', 'category']
+    BUY_COLS = ['session_id', 'ts', 'item_id', 'price', 'quantity']
+
+    def __init__(self, root_dir: str = "data/yoochoose", chunk_size: int = 500_000, max_sessions: int = 1000):
+        self.root = root_dir
+        self.chunk_size = chunk_size  # rows per pandas chunk when streaming the .dat files
+        self.max_sessions = max_sessions
+        self.click_path = f"{root_dir}/yoochoose-clicks.dat"
+        self.buy_path = f"{root_dir}/yoochoose-buys.dat"
+        if not os.path.exists(self.click_path): self._setup()  # download + extract on first use
+        self.data = self._load_sessions(max_sessions)  # {session_id: [InteractionModel, ...]}
+        self.entries = list(self.data.keys())
+
+    def _setup(self):  # download the archive into self.root, extract it, then delete the archive
+        os.makedirs(self.root, exist_ok=True)
+        zip_path = f"{self.root}/temp.7z"
+        with requests.get(self.URL, stream=True, timeout=60) as r:
+            r.raise_for_status()  # fail loudly instead of saving an HTTP error page as temp.7z
+            with open(zip_path, 'wb') as f: f.writelines(r.iter_content(8192))
+        with py7zr.SevenZipFile(zip_path, 'r') as z: z.extractall(self.root)
+        os.remove(zip_path)
+
+    def _make_interaction(self, sid: str, ts: str, item_id: str, event: str, page: str, meta: dict) -> InteractionModel:
+        payload = PayloadModel(  # one dataset row expressed as a tracking payload
+            sessionId=sid, experimentId=None, eventName=event,
+            page=page, productId=item_id, metadata=meta,
+            storeMode="yoochoose", userAgent="dataset", ts=ts
+        )
+        return InteractionModel(  # envelope fields are fixed placeholders; only `value` carries data
+            partitionID=0, offset=0, timestamp=0, compression="",
+            isTransactional=False, headers=[], key={},
+            value=ValueModel(payload=payload, encoding="json", isPayloadNull=False, schemaId=1, size=0)
+        )
+
+    def _parse_category(self, cat) -> str:  # raw category value -> label; NaN and 0 (str or int) mean unknown
+        if pd.isna(cat): return "unknown"
+        if cat == "S": return "special_offer"
+        try:
+            n = int(cat)  # compare numerically: a chunk with only numeric categories is parsed as ints, not "0"
+            return "unknown" if n == 0 else f"category_{n}" if 1 <= n <= 12 else f"brand_{n}"
+        except (TypeError, ValueError): return str(cat)  # non-numeric code: pass it through as text
+
+    def stream_clicks(self) -> Generator[InteractionModel, None, None]:  # one "view_item_page" event per click row
+        with pd.read_csv(self.click_path, names=self.CLICK_COLS, chunksize=self.chunk_size, header=None) as reader:
+            for chunk in reader:
+                for r in chunk.itertuples(index=False):
+                    yield self._make_interaction(
+                        str(r.session_id), r.ts, str(r.item_id),
+                        "view_item_page", self._parse_category(r.category), {}
+                    )
+
+    def stream_buys(self) -> Generator[InteractionModel, None, None]:  # one "purchase_complete" event per buy row
+        with pd.read_csv(self.buy_path, names=self.BUY_COLS, chunksize=self.chunk_size, header=None) as reader:
+            for chunk in reader:
+                for r in chunk.itertuples(index=False):
+                    yield self._make_interaction(
+                        str(r.session_id), r.ts, str(r.item_id),
+                        "purchase_complete", "/checkout", {"price": r.price, "quantity": r.quantity}
+                    )
+
+    def stream(self) -> Generator[InteractionModel, None, None]:  # all clicks first, then all buys
+        yield from self.stream_clicks()
+        yield from self.stream_buys()
+
+    def _load_sessions(self, max_sessions: int | None = None) -> dict:  # group streamed events by session id
+        sessions = {}
+        for interaction in self.stream():
+            sid = interaction.value.payload.sessionId
+            if sid not in sessions:
+                if max_sessions and len(sessions) >= max_sessions: continue  # cap reached: skip new sessions, keep filling accepted ones
+                sessions[sid] = []
+            sessions[sid].append(interaction)
+        for sid in sessions: sessions[sid].sort(key=lambda x: x.value.payload.ts)  # clicks and buys arrive in separate passes
+        return sessions
+
+    def get_data(self) -> dict:
+        return self.data
+
+    def get_entries(self) -> tuple[list[str], int]:  # (session ids, number of sessions)
+        return self.entries, len(self.entries)
+
+if __name__ == "__main__":
+    loader = YooChooseLoader(max_sessions=100)
+    views, purchases = 0, 0
+    for sid, evts in loader.get_data().items():
+        for e in evts:
+            if e.value.payload.eventName == "view_item_page": views += 1
+            elif e.value.payload.eventName == "purchase_complete": purchases += 1
+    print(f"Loaded {len(loader.entries)} sessions: {views} view_item_page, {purchases} purchase_complete")

From e5060babfaae551bae79981ad4948853315fdc03 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 21 Jan 2026 14:05:39 +0100
Subject: [PATCH 29/99] feat: initial feature engineering of trajectories

---
 sim/rl/behavior_loader/models.py | 49 +++++++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py
index 84c2fe4..4c6bf21 100644
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -1,4 +1,7 @@
-from loader import Loader, AgentLoader, JointLoader
+try:
+    from loader import Loader, AgentLoader, JointLoader
+except ImportError:
+    from sim.rl.behavior_loader.loader import Loader, AgentLoader, JointLoader
 from collections import defaultdict
 from typing import Dict, List, Tuple, Set
 import numpy as np
@@ -80,6 +83,50 @@ class BehaviorModel:
             path.append(curr)
         return path
 
+    def extract_trajectory_features(self, events: List, max_trans_dim: int = 50) -> np.ndarray:
+        """Convert trajectory to feature vector using MDP structure for contrastive learning"""
+        if not self.mdp:
+            self.build_MDP()
+
+        states = [self._state_repr(e) for e in sorted(events, key=self._sort_key)]
+        features = []
+
+        # transition histogram over MDP state space
+        trans_counts = defaultdict(int)
+        for s, s_next in zip(states, states[1:]):
+            trans_counts[(s, s_next)] += 1
+        all_trans = [(s, t) for s in self.mdp['states'] for t in self.mdp['transitions'].get(s, {}).keys()]
+        trans_vec = [trans_counts.get(tr, 0) for tr in all_trans[:max_trans_dim]]
+        trans_vec = trans_vec + [0] * (max_trans_dim - len(trans_vec))  # pad
+        total_trans = sum(trans_counts.values()) or 1
+        features.extend([v / total_trans for v in trans_vec])
+
+        # state coverage ratio
+        visited = set(states)
+        features.append(len(visited) / max(self.mdp['num_states'], 1))
+
+        # temporal entropy of transitions
+        if len(states) > 1:
+            trans_probs = [self.transition_prob(s, s_n) for s, s_n in zip(states, states[1:])]
+            entropy = -sum(p * np.log(p + 1e-10) for p in trans_probs if p > 0)
+            features.append(entropy / max(len(states), 1))
+        else:
+            features.append(0.0)
+
+        # trajectory length and unique state count
+        features.append(len(states))
+        features.append(len(visited))
+
+        # state value statistics along trajectory
+        vals = [self.state_value(s) for s in states]
+        if vals:
+            features.extend([np.mean(vals), np.std(vals), np.min(vals), np.max(vals)])
+        else:
+            features.extend([0.0, 0.0, 0.0, 0.0])
+
+        return np.array(features, dtype=np.float32)
+
+
 class AgentBehaviorModel(BehaviorModel):
     def __init__(self, src_dir: str):
         super().__init__(src_dir, AgentLoader)

From 6aad196234de1a0c582ad463f805c4113aa79b8a Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 21 Jan 2026 18:22:31 +0100
Subject: [PATCH 30/99] migrating weak learning

---
 experiments/ml/weak.train.py |  30 -----
 experiments/ml/weak_train.py | 246 +++++++++++++++++++++++++++++++++++
 2 files changed, 246 insertions(+), 30 deletions(-)
 delete mode 100644 experiments/ml/weak.train.py
 create mode 100644 experiments/ml/weak_train.py

diff --git a/experiments/ml/weak.train.py b/experiments/ml/weak.train.py
deleted file mode 100644
index 36e11ee..0000000
--- a/experiments/ml/weak.train.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from sim.rl.behavior_loader.loader import AgentLoader, Loader, JointLoader
-from sim.rl.behavior_loader.loader import PayloadModel
-from arch import WeakClassifier
-
-agent_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
-human_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
-
-def augment_trajectory(trajectory : list[PayloadModel], augmentation_rate: float = 0.1) -> list[PayloadModel]:
-    # augmentations possible:
-    # return a sub-trajectory window of the original trajectory
-    # insert random noise events
-    # shuffle a few events (find a few indices and swap them with i+1 neighbor)
-    # adjust metadata
-    return trajectory
-
-
-def train():
-    pass
-
-
-
-if __name__ == "__main__":
-    joint_loader = JointLoader(human_dir, agent_dir)
-    data = joint_loader.get_data()
-    entries, num_entries = joint_loader.get_entries()
-    print(f"Loaded {num_entries} entries")
-    # TODO: augment
-    # fit model
-    model = WeakClassifier()
-    model.fit(data)
diff --git a/experiments/ml/weak_train.py b/experiments/ml/weak_train.py
new file mode 100644
index 0000000..eb87a9c
--- /dev/null
+++ b/experiments/ml/weak_train.py
@@ -0,0 +1,246 @@
+import sys
+sys.path.insert(0, "/home/velocitatem/Documents/Projects/PHANTOM/sim/rl/behavior_loader")
+sys.path.insert(0, "/home/velocitatem/Documents/Projects/PHANTOM/experiments/ml")
+
+from sim.rl.behavior_loader.loader import AgentLoader, Loader, JointLoader, PayloadModel
+from sim.rl.behavior_loader.models import JointBehaviorModel
+from arch import ContrastiveWeakClassifier, contrastive_loss, featurize_trajectory
+from typing import List, Optional, Dict
+from datetime import datetime, timedelta
+from copy import deepcopy
+import numpy as np
+import random
+import torch
+from torch.utils.data import Dataset, DataLoader
+from torch.optim import Adam
+from torch.utils.tensorboard import SummaryWriter
+
+RUNS_DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/ml/runs"
+agent_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
+human_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
+
+
+def _perturb_ts(evt: PayloadModel, jitter_ms: int = 500) -> PayloadModel:
+    """Return a deep copy of evt with its ISO-8601 ts shifted by a random offset within +/- jitter_ms milliseconds"""
+    new_evt = deepcopy(evt)
+    try:
+        ts = datetime.fromisoformat(evt.ts.replace('Z', '+00:00'))  # fromisoformat may reject a trailing 'Z'
+        delta = timedelta(milliseconds=random.randint(-jitter_ms, jitter_ms))
+        new_evt.ts = (ts + delta).isoformat()
+    except (AttributeError, TypeError, ValueError, OverflowError):
+        pass  # missing, non-string or unparseable ts: best effort, return the copy unchanged
+    return new_evt
+
+
+def augment_trajectory(trajectory: List[PayloadModel], rate: float = 0.1) -> List[PayloadModel]:
+    """Apply random augmentation to trajectory for contrastive learning"""
+    if len(trajectory) < 2:
+        return trajectory
+
+    aug_type = random.choice(['window', 'shuffle', 'noise', 'drop'])
+
+    if aug_type == 'window':  # random contiguous sub-sequence (70-100% length)
+        min_len = max(2, int(len(trajectory) * 0.7))
+        sub_len = random.randint(min_len, len(trajectory))
+        start = random.randint(0, len(trajectory) - sub_len)
+        return trajectory[start:start + sub_len]
+
+    elif aug_type == 'shuffle':  # swap adjacent pairs with probability rate
+        result = list(trajectory)
+        for i in range(len(result) - 1):
+            if random.random() < rate:
+                result[i], result[i + 1] = result[i + 1], result[i]
+        return result
+
+    elif aug_type == 'drop':  # drop events with probability rate
+        result = [e for e in trajectory if random.random() > rate]
+        return result if len(result) >= 2 else trajectory[:2]
+
+    elif aug_type == 'noise':  # perturb timestamps
+        return [_perturb_ts(e, jitter_ms=500) for e in trajectory]
+
+    return trajectory
+
+
+class TripletDataset(Dataset):
+    """Generate (anchor, positive, negative) triplets on-the-fly with augmentation"""
+    def __init__(self, data: Dict[str, List[PayloadModel]], mdp: Optional[Dict], augment_fn, input_dim: int = 64, multiplier: int = 10):
+        self.sessions = list(data.items())
+        self.human_ids = [i for i, (sid, _) in enumerate(self.sessions) if sid.startswith('human_')]
+        self.agent_ids = [i for i, (sid, _) in enumerate(self.sessions) if sid.startswith('agent_')]
+        self.mdp = mdp
+        self.augment = augment_fn
+        self.input_dim = input_dim
+        self.multiplier = multiplier
+
+        if not self.human_ids or not self.agent_ids:
+            raise ValueError(f"Need both human ({len(self.human_ids)}) and agent ({len(self.agent_ids)}) sessions")
+
+    def __len__(self) -> int:
+        return len(self.sessions) * self.multiplier
+
+    def __getitem__(self, idx: int):
+        anchor_idx = idx % len(self.sessions)
+        sid, events = self.sessions[anchor_idx]
+        is_human = sid.startswith('human_')
+
+        anchor = featurize_trajectory(events, self.mdp, self.input_dim)
+        positive = featurize_trajectory(self.augment(events), self.mdp, self.input_dim)
+
+        neg_pool = self.agent_ids if is_human else self.human_ids
+        neg_idx = random.choice(neg_pool)
+        negative = featurize_trajectory(self.sessions[neg_idx][1], self.mdp, self.input_dim)
+
+        label = 0 if is_human else 1  # 0=human, 1=agent
+        return (torch.tensor(anchor, dtype=torch.float32),
+                torch.tensor(positive, dtype=torch.float32),
+                torch.tensor(negative, dtype=torch.float32),
+                torch.tensor(label, dtype=torch.long))
+
+
+def train(epochs: int = 100, lr: float = 1e-3, batch_size: int = 4, input_dim: int = 64,
+          embed_dim: int = 32, margin: float = 0.3, verbose: bool = True, run_name: str = None):
+    """Train contrastive weak classifier on human/agent trajectories"""
+    joint = JointLoader(human_dir, agent_dir)
+    data = joint.get_data()
+    if verbose:
+        print(f"Loaded {len(data)} sessions")
+
+    joint_model = JointBehaviorModel(human_dir, agent_dir)
+    ref_mdp = joint_model.build_MDP()
+
+    dataset = TripletDataset(data, ref_mdp, augment_trajectory, input_dim=input_dim)
+    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)
+
+    model = ContrastiveWeakClassifier(input_dim=input_dim, embed_dim=embed_dim, margin=margin)
+    model.to_device()
+
+    run_name = run_name or f"d{input_dim}_e{embed_dim}_lr{lr}_m{margin}_{datetime.now():%Y%m%d_%H%M%S}"
+    writer = SummaryWriter(f"{RUNS_DIR}/train/{run_name}")
+
+    optimizer = Adam(list(model.encoder.parameters()) + list(model.classifier.parameters()), lr=lr)
+    ce_loss_fn = torch.nn.CrossEntropyLoss()
+
+    best_loss = float('inf')
+    for epoch in range(epochs):
+        model.encoder.train()
+        model.classifier.train()
+        total_loss, n_batches = 0.0, 0
+
+        for anchor, positive, negative, labels in loader:
+            anchor, positive, negative, labels = [t.to(model.device) for t in [anchor, positive, negative, labels]]
+            z_a, z_p, z_n = [model.encoder(t.unsqueeze(1)) for t in [anchor, positive, negative]]
+
+            trip_loss = contrastive_loss(z_a, z_p, z_n, margin=model.margin)
+            ce = ce_loss_fn(model.classifier(z_a), labels)
+            loss = trip_loss + 0.5 * ce
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+            total_loss += loss.item()
+            n_batches += 1
+
+        avg_loss = total_loss / max(n_batches, 1)
+        writer.add_scalar('loss', avg_loss, epoch)
+
+        if verbose and (epoch + 1) % 10 == 0:
+            print(f"Epoch {epoch+1}/{epochs}: loss={avg_loss:.4f}")
+        if avg_loss < best_loss:
+            best_loss = avg_loss
+
+    writer.close()
+    if verbose:
+        print(f"Done. Best={best_loss:.4f} TB:{RUNS_DIR}/train/{run_name}")
+
+    return model, ref_mdp
+
+
+def evaluate_loocv(input_dim: int = 64, embed_dim: int = 32, epochs_per_fold: int = 50,
+                   lr: float = 1e-3, margin: float = 0.3, run_name: str = None):
+    """Leave-one-out CV: train a fresh model per held-out session; return (accuracy, predictions, actuals)"""
+    joint = JointLoader(human_dir, agent_dir)
+    data = joint.get_data()
+    session_ids = list(data.keys())
+
+    joint_model = JointBehaviorModel(human_dir, agent_dir)
+    ref_mdp = joint_model.build_MDP()  # NOTE(review): presumably built from every session, held-out ones included - confirm
+
+    run_name = run_name or f"loocv_d{input_dim}_e{embed_dim}_m{margin}_{datetime.now():%Y%m%d_%H%M%S}"
+    writer = SummaryWriter(f"{RUNS_DIR}/eval/{run_name}")
+
+    predictions, actuals = [], []
+
+    for fold_idx, test_sid in enumerate(session_ids):
+        train_data = {k: v for k, v in data.items() if k != test_sid}
+        test_events = data[test_sid]
+        test_label = 0 if test_sid.startswith('human_') else 1  # 0=human, 1=agent
+
+        n_human = sum(1 for k in train_data if k.startswith('human_'))
+        n_agent = sum(1 for k in train_data if k.startswith('agent_'))
+        if n_human == 0 or n_agent == 0:
+            continue  # triplets need at least one session of each class
+
+        try:
+            dataset = TripletDataset(train_data, ref_mdp, augment_trajectory, input_dim=input_dim, multiplier=5)
+            loader = DataLoader(dataset, batch_size=2, shuffle=True, drop_last=True)
+
+            model = ContrastiveWeakClassifier(input_dim=input_dim, embed_dim=embed_dim, margin=margin)
+            model.to_device()
+            optimizer = Adam(list(model.encoder.parameters()) + list(model.classifier.parameters()), lr=lr)
+            model.encoder.train()
+            model.classifier.train()
+            for _ in range(epochs_per_fold):
+                for anchor, positive, negative, labels in loader:
+                    z_a, z_p, z_n = [model.encoder(t.unsqueeze(1).to(model.device)) for t in [anchor, positive, negative]]
+                    ce = torch.nn.functional.cross_entropy(model.classifier(z_a), labels.to(model.device))
+                    loss = contrastive_loss(z_a, z_p, z_n, margin=margin) + 0.5 * ce  # same objective as train(); triplet loss alone never updates the head predict() uses
+                    optimizer.zero_grad()
+                    loss.backward()
+                    optimizer.step()
+
+            test_feat = featurize_trajectory(test_events, ref_mdp, input_dim)
+            pred = model.predict(test_feat.reshape(1, -1))[0]
+            predictions.append(pred)
+            actuals.append(test_label)
+            print(f"  {test_sid[:12]}...: pred={pred}, actual={test_label}, {'OK' if pred == test_label else 'MISS'}")
+
+        except Exception as e:
+            print(f"Error: {e}")  # a failing fold is reported and excluded from the metrics
+
+    if predictions:
+        acc = sum(p == a for p, a in zip(predictions, actuals)) / len(predictions)
+        tp = sum(1 for p, a in zip(predictions, actuals) if p == 1 and a == 1)  # agent is the positive class
+        fp = sum(1 for p, a in zip(predictions, actuals) if p == 1 and a == 0)
+        fn = sum(1 for p, a in zip(predictions, actuals) if p == 0 and a == 1)
+        prec, rec = tp / max(tp + fp, 1), tp / max(tp + fn, 1)
+        f1 = 2 * prec * rec / max(prec + rec, 1e-10)
+        writer.add_scalar('accuracy', acc, 0)
+        writer.add_scalar('f1', f1, 0)
+        writer.add_scalar('precision', prec, 0)
+        writer.add_scalar('recall', rec, 0)
+        writer.close()
+        print(f"\nAccuracy: {acc:.2%} F1: {f1:.3f} TB:{RUNS_DIR}/eval/{run_name}")
+        return acc, predictions, actuals
+    writer.close()
+    return 0.0, [], []
+
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--mode', choices=['train', 'eval'], default='train')
+    parser.add_argument('--epochs', type=int, default=100)
+    parser.add_argument('--lr', type=float, default=1e-3)
+    parser.add_argument('--margin', type=float, default=0.3)
+    parser.add_argument('--input-dim', type=int, default=64)
+    parser.add_argument('--embed-dim', type=int, default=32)
+    parser.add_argument('--run-name', type=str, default=None)
+    args = parser.parse_args()
+
+    if args.mode == 'train':
+        model, mdp = train(epochs=args.epochs, lr=args.lr, input_dim=args.input_dim,
+                           embed_dim=args.embed_dim, margin=args.margin, run_name=args.run_name)
+    else:
+        evaluate_loocv(input_dim=args.input_dim, embed_dim=args.embed_dim, epochs_per_fold=args.epochs,
+                       lr=args.lr, margin=args.margin, run_name=args.run_name)

From 5f607a58eb7883e0817d9ca738b7c28ca5f275a5 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 21 Jan 2026 18:22:39 +0100
Subject: [PATCH 31/99] adapting some architectures

---
 experiments/ml/__init__.py |  16 ++-
 experiments/ml/arch.py     | 242 +++++++++++++++++++++++++++++++++++--
 2 files changed, 247 insertions(+), 11 deletions(-)

diff --git a/experiments/ml/__init__.py b/experiments/ml/__init__.py
index 11b65df..c97eaa9 100644
--- a/experiments/ml/__init__.py
+++ b/experiments/ml/__init__.py
@@ -1,11 +1,21 @@
 from .evals import evaluate
 from .arch import (
     XGBoostAgentClassifier,
-    LightGBMAgentClassifier
+    LightGBMAgentClassifier,
+    ContrastiveWeakClassifier,
+    TrajectoryEncoder,
+    WeakClassifier,
+    contrastive_loss,
+    featurize_trajectory,
 )
 
-__all__ =[
+__all__ = [
     'evaluate',
     'XGBoostAgentClassifier',
-    'LightGBMAgentClassifier'
+    'LightGBMAgentClassifier',
+    'ContrastiveWeakClassifier',
+    'TrajectoryEncoder',
+    'WeakClassifier',
+    'contrastive_loss',
+    'featurize_trajectory',
 ]
diff --git a/experiments/ml/arch.py b/experiments/ml/arch.py
index a187959..4ceb2e0 100644
--- a/experiments/ml/arch.py
+++ b/experiments/ml/arch.py
@@ -1,23 +1,249 @@
 # sklearn compatible models for agent detection
 from sklearn.base import BaseEstimator, ClassifierMixin
-from procesing.context import PipelineContext
-from typing import Any, Optional, Tuple
+from typing import Any, Optional, Tuple, Dict, List
 from abc import ABC, abstractmethod
-import xgboost as xgb
-import lightgbm as lgb
+from collections import defaultdict
 import numpy as np
 import pandas as pd
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
 
 TASK = 'classification'
 LABELS = ['human', 'agent']
 
 
 class WeakClassifier(BaseEstimator, ClassifierMixin, ABC):
-    # a simple contrastive machine learning model
-    # this model should learn to distinguish between human and agent behavior
-    # using a weakly supervised approach and contrastive learning + augmentation
-    #
+    # a simple contrastive machine learning model learns to distinguish human/agent behavior
+    # using weakly supervised contrastive learning + augmentation
     def __init__(self, **kwargs):
         super().__init__()
         self.model = None
         self.kwargs = kwargs
+
+
+class TrajectoryEncoder(nn.Module):
+    """Encode variable-length event sequences to fixed-dim embedding via bidirectional LSTM"""
+    def __init__(self, input_dim: int, embed_dim: int = 32, hidden_dim: int = 64):
+        super().__init__()
+        self.event_embed = nn.Linear(input_dim, hidden_dim)
+        self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)
+        self.proj = nn.Linear(hidden_dim * 2, embed_dim)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: (batch, seq_len, input_dim)
+        h = F.relu(self.event_embed(x))
+        _, (hn, _) = self.lstm(h)
+        hn = torch.cat([hn[-2], hn[-1]], dim=1)  # concat bidirectional hidden states
+        return F.normalize(self.proj(hn), dim=1)  # L2 normalized
+
+
+class ContrastiveWeakClassifier(WeakClassifier):
+    """Contrastive learning classifier for human/agent trajectory discrimination (TrajectoryEncoder + linear 2-class head)"""
+    def __init__(self, input_dim: int = 64, embed_dim: int = 32, margin: float = 1.0, **kwargs):
+        super().__init__(**kwargs)
+        self.input_dim = input_dim
+        self.embed_dim = embed_dim
+        self.margin = margin  # triplet margin; read by the training loop in weak_train.py
+        self.encoder = TrajectoryEncoder(input_dim, embed_dim)
+        self.classifier = nn.Linear(embed_dim, 2)  # 2 logits; weak_train.py labels 0=human, 1=agent
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self._fitted = False  # only ever flipped by fit(); nothing in this class reads it
+
+    def to_device(self):  # move encoder and head to self.device; returns self
+        self.encoder.to(self.device)
+        self.classifier.to(self.device)
+        return self
+
+    def encode(self, x: torch.Tensor) -> torch.Tensor:  # x is moved to self.device before encoding
+        return self.encoder(x.to(self.device))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:  # returns class logits, not probabilities
+        emb = self.encode(x)
+        return self.classifier(emb)
+
+    def fit(self, X, y=None):  # sklearn interface only: sets _fitted, does no training (see train() in weak_train.py)
+        self._fitted = True
+        return self
+
+    def predict(self, X: np.ndarray) -> np.ndarray:  # X: (n_samples, input_dim); returns argmax class index per row
+        self.encoder.eval()
+        self.classifier.eval()
+        with torch.no_grad():
+            x = torch.tensor(X, dtype=torch.float32).unsqueeze(1).to(self.device)  # each row becomes a length-1 sequence
+            logits = self.forward(x)
+            return torch.argmax(logits, dim=1).cpu().numpy()
+
+    def predict_proba(self, X: np.ndarray) -> np.ndarray:  # same input as predict(); returns softmax over the 2 logits
+        self.encoder.eval()
+        self.classifier.eval()
+        with torch.no_grad():
+            x = torch.tensor(X, dtype=torch.float32).unsqueeze(1).to(self.device)
+            logits = self.forward(x)
+            return F.softmax(logits, dim=1).cpu().numpy()
+
+
+def contrastive_loss(anchor: torch.Tensor, positive: torch.Tensor, negative: torch.Tensor, margin: float = 0.3) -> torch.Tensor:
+    """Triplet loss using cosine similarity (for L2-normalized embeddings). margin in [0,1] range."""
+    pos_sim = F.cosine_similarity(anchor, positive)  # higher = more similar
+    neg_sim = F.cosine_similarity(anchor, negative)
+    return F.relu(neg_sim - pos_sim + margin).mean()  # want pos_sim > neg_sim + margin
+
+
+def nt_xent_loss(z_i: torch.Tensor, z_j: torch.Tensor, temperature: float = 0.5) -> torch.Tensor:
+    """Normalized temperature-scaled cross entropy loss (SimCLR style)"""
+    batch_size = z_i.size(0)
+    z = torch.cat([z_i, z_j], dim=0)  # (2N, embed_dim)
+    sim = F.cosine_similarity(z.unsqueeze(1), z.unsqueeze(0), dim=2) / temperature
+    mask = torch.eye(2 * batch_size, dtype=torch.bool, device=z.device)
+    sim.masked_fill_(mask, -float('inf'))
+    labels = torch.arange(batch_size, device=z.device)
+    labels = torch.cat([labels + batch_size, labels])  # positive pairs
+    return F.cross_entropy(sim, labels)
+
+
+# feature extraction utilities for trajectory -> feature vector
+def transition_histogram(events: List, state_fn, max_states: int = 50) -> np.ndarray:
+    """Return a float32 vector of length max_states with the trajectory's transition counts divided by their total"""
+    if len(events) < 2:
+        return np.zeros(max_states, dtype=np.float32)  # same dtype as the non-empty path
+    states = [state_fn(e) for e in events]
+    trans_counts = defaultdict(int)
+    for s, s_next in zip(states, states[1:]):
+        trans_counts[(s, s_next)] += 1
+    total = sum(trans_counts.values())  # includes transitions beyond the first max_states kept below
+    hist = np.array(list(trans_counts.values())[:max_states], dtype=np.float32)  # NOTE(review): bins follow first-seen order, so bin i is not the same transition across trajectories
+    hist = np.pad(hist, (0, max(0, max_states - len(hist))))
+    return hist / (total + 1e-10)
+
+
+def temporal_signature(events: List, ts_fn) -> np.ndarray:
+    """Extract temporal features: mean/std/skew of inter-event times"""
+    if len(events) < 2:
+        return np.zeros(4, dtype=np.float32)
+    times = sorted([ts_fn(e) for e in events])
+    diffs = np.diff(times).astype(np.float32)
+    if len(diffs) == 0:
+        return np.zeros(4, dtype=np.float32)
+    mean_dt, std_dt = np.mean(diffs), np.std(diffs) + 1e-10
+    skew = np.mean(((diffs - mean_dt) / std_dt) ** 3) if std_dt > 1e-8 else 0.0
+    return np.array([mean_dt, std_dt, skew, len(diffs)], dtype=np.float32)
+
+
+def state_coverage(events: List, state_fn, mdp_states: set) -> float:
+    """Fraction of MDP states visited by trajectory"""
+    if not mdp_states:
+        return 0.0
+    visited = set(state_fn(e) for e in events)
+    return len(visited & mdp_states) / len(mdp_states)
+
+
+def transition_entropy(events: List, state_fn) -> float:
+    """Compute entropy of transition distribution (randomness of navigation)"""
+    if len(events) < 2:
+        return 0.0
+    states = [state_fn(e) for e in events]
+    trans_counts = defaultdict(int)
+    for s, s_next in zip(states, states[1:]):
+        trans_counts[(s, s_next)] += 1
+    total = sum(trans_counts.values())
+    probs = [c / total for c in trans_counts.values()]
+    return -sum(p * np.log(p + 1e-10) for p in probs)
+
+
+def featurize_trajectory(events: List, mdp: Optional[Dict] = None, input_dim: int = 64) -> np.ndarray:
+    """Convert trajectory to a float32 vector: 52 hand-built features, zero-padded or truncated to input_dim"""
+    def _state_repr(e):  # state key: "page|productId|eventName" with placeholders for missing fields
+        return f"{getattr(e, 'page', None) or 'unk'}|{getattr(e, 'productId', None) or 'none'}|{e.eventName}"
+
+    def _ts_fn(e):  # event time as a float; 0.0 when ts is missing or unparseable
+        ts = getattr(e, 'ts', None)
+        if isinstance(ts, str):
+            from datetime import datetime
+            try:
+                return datetime.fromisoformat(ts.replace('Z', '+00:00')).timestamp()
+            except (ValueError, OverflowError, OSError):  # malformed or out-of-range timestamp only
+                return 0.0
+        return float(ts) if ts else 0.0
+
+    feats = []
+    feats.extend(transition_histogram(events, _state_repr, max_states=40))  # 40 dims
+    feats.extend(temporal_signature(events, _ts_fn))  # 4 dims
+    mdp_states = set(mdp.get('states', [])) if mdp else set()  # no mdp -> coverage feature is 0.0
+    feats.append(state_coverage(events, _state_repr, mdp_states))  # 1 dim
+    feats.append(transition_entropy(events, _state_repr))  # 1 dim
+    feats.append(len(events))  # trajectory length
+    feats.append(len(set(_state_repr(e) for e in events)))  # unique states
+
+    # event type distribution (page_view, hover, cart, purchase indicators); +1 avoids division by zero
+    event_names = [e.eventName for e in events]
+    feats.append(sum(1 for n in event_names if 'page' in n.lower()) / (len(events) + 1))
+    feats.append(sum(1 for n in event_names if 'hover' in n.lower()) / (len(events) + 1))
+    feats.append(sum(1 for n in event_names if 'cart' in n.lower()) / (len(events) + 1))
+    feats.append(sum(1 for n in event_names if 'purchase' in n.lower() or 'checkout' in n.lower()) / (len(events) + 1))
+
+    # pad/truncate to input_dim
+    feats = np.array(feats[:input_dim], dtype=np.float32)
+    if len(feats) < input_dim:
+        feats = np.pad(feats, (0, input_dim - len(feats)))
+    return feats
+
+
+# gradient boosting classifiers for comparison baselines
+class XGBoostAgentClassifier(BaseEstimator, ClassifierMixin):
+    """XGBoost classifier for human/agent detection from session features"""
+    def __init__(self, n_estimators: int = 100, max_depth: int = 6, learning_rate: float = 0.1, **kwargs):
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.learning_rate = learning_rate
+        self.model = None
+        self.kwargs = kwargs
+
+    def fit(self, X: np.ndarray, y: np.ndarray):
+        try:
+            import xgboost as xgb
+            self.model = xgb.XGBClassifier(n_estimators=self.n_estimators, max_depth=self.max_depth,
+                                           learning_rate=self.learning_rate, **self.kwargs)
+            self.model.fit(X, y)
+        except ImportError:
+            raise ImportError("xgboost required for XGBoostAgentClassifier")
+        return self
+
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        if self.model is None:
+            raise ValueError("fit the model first")
+        return self.model.predict(X)
+
+    def predict_proba(self, X: np.ndarray) -> np.ndarray:
+        if self.model is None:
+            raise ValueError("fit the model first")
+        return self.model.predict_proba(X)
+
+
+class LightGBMAgentClassifier(BaseEstimator, ClassifierMixin):
+    """LightGBM classifier for human/agent detection from session features"""
+    def __init__(self, n_estimators: int = 100, max_depth: int = -1, learning_rate: float = 0.1, **kwargs):
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.learning_rate = learning_rate
+        self.model = None
+        self.kwargs = kwargs
+
+    def fit(self, X: np.ndarray, y: np.ndarray):
+        try:
+            import lightgbm as lgb
+            self.model = lgb.LGBMClassifier(n_estimators=self.n_estimators, max_depth=self.max_depth,
+                                            learning_rate=self.learning_rate, verbose=-1, **self.kwargs)
+            self.model.fit(X, y)
+        except ImportError:
+            raise ImportError("lightgbm required for LightGBMAgentClassifier")
+        return self
+
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        if self.model is None:
+            raise ValueError("fit the model first")
+        return self.model.predict(X)
+
+    def predict_proba(self, X: np.ndarray) -> np.ndarray:
+        if self.model is None:
+            raise ValueError("fit the model first")
+        return self.model.predict_proba(X)

From 7fcd18c3cb3f98d77aca7e4e83a9016efb5dc4d2 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 21 Jan 2026 19:11:54 +0100
Subject: [PATCH 32/99] chore: remove boilerplate

---
 web/src/app/page.tsx | 64 ++------------------------------------------
 1 file changed, 2 insertions(+), 62 deletions(-)

diff --git a/web/src/app/page.tsx b/web/src/app/page.tsx
index 295f8fd..c97c8ed 100644
--- a/web/src/app/page.tsx
+++ b/web/src/app/page.tsx
@@ -1,65 +1,5 @@
-import Image from "next/image";
+import { redirect } from 'next/navigation';
 
 export default function Home() {
-  return (
-    <div className="flex min-h-screen items-center justify-center bg-zinc-50 font-sans dark:bg-black">
-      <main className="flex min-h-screen w-full max-w-3xl flex-col items-center justify-between py-32 px-16 bg-white dark:bg-black sm:items-start">
-        <Image
-          className="dark:invert"
-          src="/next.svg"
-          alt="Next.js logo"
-          width={100}
-          height={20}
-          priority
-        />
-        <div className="flex flex-col items-center gap-6 text-center sm:items-start sm:text-left">
-          <h1 className="max-w-xs text-3xl font-semibold leading-10 tracking-tight text-black dark:text-zinc-50">
-            To get started, edit the page.tsx file.
-          </h1>
-          <p className="max-w-md text-lg leading-8 text-zinc-600 dark:text-zinc-400">
-            Looking for a starting point or more instructions? Head over to{" "}
-            <a
-              href="https://vercel.com/templates?framework=next.js&utm_source=create-next-app&utm_medium=appdir-template-tw&utm_campaign=create-next-app"
-              className="font-medium text-zinc-950 dark:text-zinc-50"
-            >
-              Templates
-            </a>{" "}
-            or the{" "}
-            <a
-              href="https://nextjs.org/learn?utm_source=create-next-app&utm_medium=appdir-template-tw&utm_campaign=create-next-app"
-              className="font-medium text-zinc-950 dark:text-zinc-50"
-            >
-              Learning
-            </a>{" "}
-            center.
-          </p>
-        </div>
-        <div className="flex flex-col gap-4 text-base font-medium sm:flex-row">
-          <a
-            className="flex h-12 w-full items-center justify-center gap-2 rounded-full bg-foreground px-5 text-background transition-colors hover:bg-[#383838] dark:hover:bg-[#ccc] md:w-[158px]"
-            href="https://vercel.com/new?utm_source=create-next-app&utm_medium=appdir-template-tw&utm_campaign=create-next-app"
-            target="_blank"
-            rel="noopener noreferrer"
-          >
-            <Image
-              className="dark:invert"
-              src="/vercel.svg"
-              alt="Vercel logomark"
-              width={16}
-              height={16}
-            />
-            Deploy Now
-          </a>
-          <a
-            className="flex h-12 w-full items-center justify-center rounded-full border border-solid border-black/[.08] px-5 transition-colors hover:border-transparent hover:bg-black/[.04] dark:border-white/[.145] dark:hover:bg-[#1a1a1a] md:w-[158px]"
-            href="https://nextjs.org/docs?utm_source=create-next-app&utm_medium=appdir-template-tw&utm_campaign=create-next-app"
-            target="_blank"
-            rel="noopener noreferrer"
-          >
-            Documentation
-          </a>
-        </div>
-      </main>
-    </div>
-  );
+  redirect('/hotel');
 }

From 56308ecb1056a9179c34e2f50f0216704b4a47d5 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 21 Jan 2026 19:12:11 +0100
Subject: [PATCH 33/99] chore: export repeated methods into lib

---
 lib/__init__.py     |  41 +++++++++++++++
 lib/config.py       |  65 +++++++++++++++++++++++
 lib/features.py     | 125 ++++++++++++++++++++++++++++++++++++++++++++
 lib/kafka_client.py |  54 +++++++++++++++++++
 lib/state.py        |  72 +++++++++++++++++++++++++
 5 files changed, 357 insertions(+)
 create mode 100644 lib/__init__.py
 create mode 100644 lib/config.py
 create mode 100644 lib/features.py
 create mode 100755 lib/kafka_client.py
 create mode 100644 lib/state.py

diff --git a/lib/__init__.py b/lib/__init__.py
new file mode 100644
index 0000000..7f8ec2d
--- /dev/null
+++ b/lib/__init__.py
@@ -0,0 +1,41 @@
+"""PHANTOM shared library
+Exports unified utilities for features, state, config, kafka, and model registry
+"""
+from .config import (
+    PROJECT_ROOT, DATA_DIR, EXPERIMENTS_DIR,
+    AGENT_DATA_DIR, HUMAN_DATA_DIR, SIM_RUNS_DIR, MODEL_REGISTRY_DIR,
+    COLLECTED_DATA_DIR, NOTEBOOK_OUTPUT_DIR,
+    ensure_dir, get_data_path, get_experiments_path, get_sim_path,
+    KAFKA_HOST, KAFKA_PORT, KAFKA_BROKER,
+    REDIS_HOST, REDIS_PORT,
+    SUPABASE_URL, SUPABASE_ANON_KEY,
+    BACKEND_PORT, PROVIDER_PORT
+)
+from .state import (
+    make_state_repr, event_to_state, parse_state,
+    get_event_name, get_timestamp,
+    create_state_fn, create_event_name_fn, create_timestamp_fn
+)
+from .features import (
+    transition_histogram, temporal_signature, state_coverage, transition_entropy,
+    event_type_distribution, featurize_trajectory, parse_timestamp
+)
+
+__all__ = [
+    # config
+    'PROJECT_ROOT', 'DATA_DIR', 'EXPERIMENTS_DIR',
+    'AGENT_DATA_DIR', 'HUMAN_DATA_DIR', 'SIM_RUNS_DIR', 'MODEL_REGISTRY_DIR',
+    'COLLECTED_DATA_DIR', 'NOTEBOOK_OUTPUT_DIR',
+    'ensure_dir', 'get_data_path', 'get_experiments_path', 'get_sim_path',
+    'KAFKA_HOST', 'KAFKA_PORT', 'KAFKA_BROKER',
+    'REDIS_HOST', 'REDIS_PORT',
+    'SUPABASE_URL', 'SUPABASE_ANON_KEY',
+    'BACKEND_PORT', 'PROVIDER_PORT',
+    # state
+    'make_state_repr', 'event_to_state', 'parse_state',
+    'get_event_name', 'get_timestamp',
+    'create_state_fn', 'create_event_name_fn', 'create_timestamp_fn',
+    # features
+    'transition_histogram', 'temporal_signature', 'state_coverage', 'transition_entropy',
+    'event_type_distribution', 'featurize_trajectory', 'parse_timestamp',
+]
diff --git a/lib/config.py b/lib/config.py
new file mode 100644
index 0000000..a27ffd9
--- /dev/null
+++ b/lib/config.py
@@ -0,0 +1,65 @@
+"""Unified path configuration for PHANTOM project
+All hardcoded paths should reference this module
+Paths can be overridden via environment variables
+"""
+import os
+from pathlib import Path
+
+# project root (directory containing lib/, experiments/, sim/, web/, backend/)
+PROJECT_ROOT = Path(__file__).parent.parent.resolve()
+
+# data directories
+DATA_DIR = Path(os.getenv('PHANTOM_DATA_DIR', PROJECT_ROOT / 'data'))
+EXPERIMENTS_DIR = Path(os.getenv('PHANTOM_EXPERIMENTS_DIR', PROJECT_ROOT / 'experiments'))
+
+# agent/human interaction data
+AGENT_DATA_DIR = Path(os.getenv('PHANTOM_AGENT_DATA_DIR', DATA_DIR / 'agents'))
+HUMAN_DATA_DIR = Path(os.getenv('PHANTOM_HUMAN_DATA_DIR', DATA_DIR / 'humans'))
+
+# RL simulation runs
+SIM_RUNS_DIR = Path(os.getenv('PHANTOM_SIM_RUNS_DIR', PROJECT_ROOT / 'sim' / 'rl' / 'runs'))
+
+# model artifacts
+MODEL_REGISTRY_DIR = Path(os.getenv('PHANTOM_MODEL_REGISTRY_DIR', DATA_DIR / 'models'))
+
+# collected experiment data
+COLLECTED_DATA_DIR = Path(os.getenv('PHANTOM_COLLECTED_DATA_DIR', EXPERIMENTS_DIR / 'agents' / 'collected_data'))
+
+# notebook outputs
+NOTEBOOK_OUTPUT_DIR = Path(os.getenv('PHANTOM_NOTEBOOK_OUTPUT_DIR', EXPERIMENTS_DIR / 'notebooks' / 'outputs'))
+
+
+def ensure_dir(path: Path) -> Path:
+    """create `path` as a directory (missing parents included) if absent, then return it"""
+    os.makedirs(path, exist_ok=True)
+    return path
+
+
+def get_data_path(*parts: str) -> Path:
+    """join `parts` onto DATA_DIR and return the resulting path"""
+    return Path(DATA_DIR, *parts)
+
+
+def get_experiments_path(*parts: str) -> Path:
+    """join `parts` onto EXPERIMENTS_DIR and return the resulting path"""
+    return Path(EXPERIMENTS_DIR, *parts)
+
+
+def get_sim_path(*parts: str) -> Path:
+    """join `parts` onto SIM_RUNS_DIR and return the resulting path"""
+    return Path(SIM_RUNS_DIR, *parts)
+
+
+# service configuration (read from process environment variables; this module does not load a .env file itself)
+KAFKA_HOST = os.getenv('KAFKA_HOST', 'localhost')
+KAFKA_PORT = os.getenv('KAFKA_PORT', '9092')  # kept as a string, unlike the int-converted ports below
+KAFKA_BROKER = f"{KAFKA_HOST}:{KAFKA_PORT}"
+
+REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
+REDIS_PORT = int(os.getenv('REDIS_PORT', '6379'))
+
+SUPABASE_URL = os.getenv('NEXT_PUBLIC_SUPABASE_URL', '')  # empty string when the variable is unset
+SUPABASE_ANON_KEY = os.getenv('NEXT_PUBLIC_SUPABASE_ANON_KEY', '')  # empty string when the variable is unset
+
+BACKEND_PORT = int(os.getenv('BACKEND_PORT', '5000'))
+PROVIDER_PORT = int(os.getenv('PROVIDER_PORT', '5001'))
diff --git a/lib/features.py b/lib/features.py
new file mode 100644
index 0000000..f2d88f5
--- /dev/null
+++ b/lib/features.py
@@ -0,0 +1,125 @@
+"""Unified featurization utilities for trajectory -> feature vector conversion
+Used by both experiments/ml/ and sim/rl/ components
+"""
+import numpy as np
+from collections import defaultdict
+from typing import List, Dict, Callable, Optional, Any, Set
+from datetime import datetime
+
+
+def transition_histogram(events: List, state_fn: Callable, max_states: int = 50) -> np.ndarray:
+    """normalized counts of consecutive state pairs, one slot per distinct pair in first-seen order
+    events: list of event objects/dicts
+    state_fn: function mapping event -> state string
+    max_states: length of the returned vector (extra pairs are dropped, missing slots are zero)
+    """
+    if len(events) < 2:
+        return np.zeros(max_states, dtype=np.float32)
+    seq = [state_fn(e) for e in events]
+    pair_counts: Dict[Any, int] = {}
+    for pair in zip(seq[:-1], seq[1:]):
+        pair_counts[pair] = pair_counts.get(pair, 0) + 1
+    kept = np.asarray(list(pair_counts.values())[:max_states], dtype=np.float32)
+    missing = max(0, max_states - kept.size)
+    padded = np.concatenate([kept, np.zeros(missing, dtype=np.float32)])
+    return padded / (sum(pair_counts.values()) + 1e-10)
+
+
+def temporal_signature(events: List, ts_fn: Callable) -> np.ndarray:
+    """inter-event timing features computed over the sorted timestamps
+    ts_fn: function mapping event -> timestamp (float seconds)
+    returns: float32 [mean_dt, std_dt, skew, n_intervals]; all zeros for fewer than two events
+    """
+    if len(events) < 2:
+        return np.zeros(4, dtype=np.float32)
+    ordered = sorted(ts_fn(e) for e in events)
+    gaps = np.diff(ordered).astype(np.float32)
+    if gaps.size == 0:
+        return np.zeros(4, dtype=np.float32)
+    mean_dt = np.mean(gaps)
+    std_dt = np.std(gaps) + 1e-10  # epsilon keeps the division below finite for constant gaps
+    skew = 0.0 if not (std_dt > 1e-8) else np.mean(((gaps - mean_dt) / std_dt) ** 3)
+    return np.array([mean_dt, std_dt, skew, len(gaps)], dtype=np.float32)
+
+
+def state_coverage(events: List, state_fn: Callable, mdp_states: Set[str]) -> float:
+    """share of the known MDP states that the trajectory touches at least once
+    events: list of event objects/dicts
+    state_fn: function mapping event -> state string
+    mdp_states: set of all possible MDP states (empty -> 0.0)
+    """
+    if mdp_states:
+        hits = {state_fn(e) for e in events} & mdp_states
+        return len(hits) / len(mdp_states)
+    return 0.0
+
+
+def transition_entropy(events: List, state_fn: Callable) -> float:
+    """entropy (natural log) of the empirical distribution over consecutive state pairs
+    0.0 for fewer than two events; larger values mean less repetitive navigation
+    """
+    if len(events) < 2:
+        return 0.0
+    seq = [state_fn(e) for e in events]
+    pair_counts: Dict[Any, int] = {}
+    for pair in zip(seq[:-1], seq[1:]):
+        pair_counts[pair] = pair_counts.get(pair, 0) + 1
+    n_trans = len(seq) - 1  # equals the sum of all pair counts
+    terms = [(c / n_trans) * np.log(c / n_trans + 1e-10) for c in pair_counts.values()]
+    return -sum(terms)
+
+
+def event_type_distribution(events: List, event_name_fn: Callable) -> np.ndarray:
+    """fraction of events whose lower-cased name contains each keyword group
+    returns: [page_view_ratio, hover_ratio, cart_ratio, purchase_ratio]
+    """
+    if not events:
+        return np.zeros(4, dtype=np.float32)
+    keyword_groups = (
+        ('page', 'view'), ('hover',),
+        ('cart',), ('purchase', 'checkout'),
+    )
+    lowered = [event_name_fn(e).lower() for e in events]
+    hits = [sum(1 for nm in lowered if any(k in nm for k in grp))
+            for grp in keyword_groups]
+    return np.array([h / len(events) for h in hits], dtype=np.float32)
+
+
+def featurize_trajectory(events: List, state_fn: Callable, ts_fn: Callable,
+                         event_name_fn: Callable, mdp_states: Optional[Set[str]] = None,
+                         output_dim: int = 64) -> np.ndarray:
+    """build one fixed-length float32 feature vector for a trajectory
+    events: list of event objects/dicts
+    state_fn: function mapping event -> state string
+    ts_fn: function mapping event -> timestamp (float)
+    event_name_fn: function mapping event -> event name string
+    mdp_states: optional set of all MDP states for coverage calculation
+    output_dim: length of the result (features are truncated or zero-padded to fit)
+    """
+    parts = [
+        *transition_histogram(events, state_fn, max_states=40),  # 40 dims
+        *temporal_signature(events, ts_fn),  # 4 dims
+        state_coverage(events, state_fn, mdp_states or set()),  # 1 dim
+        transition_entropy(events, state_fn),  # 1 dim
+        float(len(events)),  # trajectory length
+        float(len({state_fn(e) for e in events})),  # unique states
+        *event_type_distribution(events, event_name_fn),  # 4 dims
+    ]
+
+    vec = np.array(parts[:output_dim], dtype=np.float32)
+    shortfall = output_dim - len(vec)
+    return np.pad(vec, (0, shortfall)) if shortfall > 0 else vec
+
+
+def parse_timestamp(ts: Any) -> float:
+    """turn a number or ISO-8601 string into float seconds; anything else yields 0.0"""
+    if isinstance(ts, str):
+        iso = ts.replace('Z', '+00:00')
+        try:
+            return datetime.fromisoformat(iso).timestamp()
+        except ValueError:
+            return 0.0
+    if isinstance(ts, (int, float)):
+        return float(ts)
+    # None and unsupported types fall through to the neutral default
+    return 0.0
diff --git a/lib/kafka_client.py b/lib/kafka_client.py
new file mode 100755
index 0000000..d61cd9e
--- /dev/null
+++ b/lib/kafka_client.py
@@ -0,0 +1,54 @@
+from kafka import KafkaConsumer
+import json
+import os
+from dotenv import load_dotenv
+load_dotenv()
+
+def get_interactions(
+    topic='user-interactions',
+    bootstrap_servers=None,
+    from_beginning=True,
+    max_records=None,
+    timeout_ms=5000
+):
+    """Read interaction events from a Kafka topic and return them as a list.
+
+    Args:
+        topic: Kafka topic name
+        bootstrap_servers: broker address; falsy -> KAFKA_HOST:KAFKA_PORT from env (localhost:9092)
+        from_beginning: True -> auto_offset_reset='earliest', False -> 'latest'
+        max_records: stop after this many records (None or 0 = no cap)
+        timeout_ms: passed to the consumer as consumer_timeout_ms
+
+    Returns:
+        List of JSON-decoded message values
+    """
+    if not bootstrap_servers:
+        # fall back to the broker address configured in the environment
+        broker_host = os.getenv('KAFKA_HOST', 'localhost')
+        bootstrap_servers = f"{broker_host}:{os.getenv('KAFKA_PORT', '9092')}"
+    offset_policy = 'earliest' if from_beginning else 'latest'
+    consumer = KafkaConsumer(
+        topic,
+        bootstrap_servers=bootstrap_servers,
+        auto_offset_reset=offset_policy,
+        enable_auto_commit=False,
+        value_deserializer=lambda raw: json.loads(raw.decode('utf-8')),
+        consumer_timeout_ms=timeout_ms
+    )
+
+    collected = []
+    try:
+        for record in consumer:
+            collected.append(record.value)
+            if max_records and not len(collected) < max_records:
+                break
+    finally:
+        consumer.close()  # always release the connection, even if decoding a message raised
+
+    return collected
+
+if __name__ == '__main__':
+    interactions = get_interactions(max_records=10)
+    for event in interactions:
+        print(event)
diff --git a/lib/state.py b/lib/state.py
new file mode 100644
index 0000000..cfb4251
--- /dev/null
+++ b/lib/state.py
@@ -0,0 +1,72 @@
+"""Unified state representation utilities for MDP state encoding
+Used by both experiments/ and sim/ components for consistent state handling
+"""
+from typing import Any, Callable
+
+
+def make_state_repr(page: str = None, product_id: str = None, event_name: str = None) -> str:
+    """build the canonical state string, substituting placeholders for falsy components
+    format: page|productId|eventName (placeholders: unk / none / unknown)
+    """
+    page_part = page if page else 'unk'
+    product_part = product_id if product_id else 'none'
+    name_part = event_name if event_name else 'unknown'
+    return "|".join((f"{page_part}", f"{product_part}", f"{name_part}"))
+
+
+def event_to_state(evt: Any) -> str:
+    """map an event to its state string
+    dicts are read by key, any other object by attribute (missing -> None)
+    """
+    # choose one accessor up front so both event shapes share the same field list
+    if isinstance(evt, dict):
+        read = evt.get
+    else:
+        def read(key):
+            return getattr(evt, key, None)
+    return make_state_repr(
+        page=read('page'),
+        product_id=read('productId'),
+        event_name=read('eventName') or read('event_type')
+    )
+
+
+def parse_state(state_str: str) -> dict:
+    """split a state string on '|' into its components; placeholders and missing parts become None
+    returns: {'page': str, 'productId': str, 'eventName': str}
+    """
+    pieces = state_str.split('|')
+    layout = (('page', 'unk'), ('productId', 'none'), ('eventName', 'unknown'))
+    decoded = {}
+    for idx, (key, placeholder) in enumerate(layout):
+        decoded[key] = pieces[idx] if idx < len(pieces) and pieces[idx] != placeholder else None
+    return decoded
+
+
+def get_event_name(evt: Any) -> str:
+    """event name from 'eventName', else 'event_type', else '' (dict key or object attribute)"""
+    lookup = evt.get if isinstance(evt, dict) else (lambda key: getattr(evt, key, None))
+    primary = lookup('eventName')
+    return primary or lookup('event_type') or ''
+
+
+def get_timestamp(evt: Any) -> Any:
+    """raw timestamp from 'ts', else 'timestamp'; a ts of 0 is kept, only a None/'' ts falls back"""
+    read = evt.get if isinstance(evt, dict) else (lambda key: getattr(evt, key, None))
+    ts = read('ts')
+    return ts if ts not in (None, '') else read('timestamp')
+
+
+def create_state_fn() -> Callable:
+    """factory for the state representation function; returns event_to_state itself (no wrapping)"""
+    return event_to_state
+
+
+def create_event_name_fn() -> Callable:
+    """factory for the event name extraction function; returns get_event_name itself (no wrapping)"""
+    return get_event_name
+
+
+def create_timestamp_fn() -> Callable:
+    """factory for the timestamp extraction function; returns get_timestamp, which yields the raw value (convert it with features.parse_timestamp)"""
+    return get_timestamp

From 2ed200f8702d1bebba6937a281547f1383212188 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 21 Jan 2026 19:12:35 +0100
Subject: [PATCH 34/99] chore: make lib backwards compatible

---
 experiments/ml/arch.py           |  91 +++++-----------
 sim/rl/behavior_loader/models.py |  12 +++
 sim/rl/environment.py            | 175 +++++++++++++++----------------
 3 files changed, 126 insertions(+), 152 deletions(-)

diff --git a/experiments/ml/arch.py b/experiments/ml/arch.py
index 4ceb2e0..1fa4f96 100644
--- a/experiments/ml/arch.py
+++ b/experiments/ml/arch.py
@@ -8,6 +8,20 @@ import pandas as pd
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import sys
+from pathlib import Path
+
+# add the project root (the directory that contains lib/) to sys.path so `from lib...` resolves
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+from lib.features import (
+    transition_histogram as _lib_transition_histogram,
+    temporal_signature as _lib_temporal_signature,
+    state_coverage as _lib_state_coverage,
+    transition_entropy as _lib_transition_entropy,
+    featurize_trajectory as _lib_featurize_trajectory,
+    parse_timestamp
+)
+from lib.state import event_to_state, get_event_name, get_timestamp
 
 TASK = 'classification'
 LABELS = ['human', 'agent']
@@ -101,91 +115,40 @@ def nt_xent_loss(z_i: torch.Tensor, z_j: torch.Tensor, temperature: float = 0.5)
     return F.cross_entropy(sim, labels)
 
 
-# feature extraction utilities for trajectory -> feature vector
+# feature extraction utilities - delegating to lib.features for unified implementation
+# these wrappers maintain backwards compatibility for existing imports
+
 def transition_histogram(events: List, state_fn, max_states: int = 50) -> np.ndarray:
     """Compute normalized histogram of state transitions in trajectory"""
-    if len(events) < 2:
-        return np.zeros(max_states)
-    states = [state_fn(e) for e in events]
-    trans_counts = defaultdict(int)
-    for s, s_next in zip(states, states[1:]):
-        trans_counts[(s, s_next)] += 1
-    total = sum(trans_counts.values())
-    hist = np.array(list(trans_counts.values())[:max_states], dtype=np.float32)
-    hist = np.pad(hist, (0, max(0, max_states - len(hist))))
-    return hist / (total + 1e-10)
+    return _lib_transition_histogram(events, state_fn, max_states)
 
 
 def temporal_signature(events: List, ts_fn) -> np.ndarray:
     """Extract temporal features: mean/std/skew of inter-event times"""
-    if len(events) < 2:
-        return np.zeros(4, dtype=np.float32)
-    times = sorted([ts_fn(e) for e in events])
-    diffs = np.diff(times).astype(np.float32)
-    if len(diffs) == 0:
-        return np.zeros(4, dtype=np.float32)
-    mean_dt, std_dt = np.mean(diffs), np.std(diffs) + 1e-10
-    skew = np.mean(((diffs - mean_dt) / std_dt) ** 3) if std_dt > 1e-8 else 0.0
-    return np.array([mean_dt, std_dt, skew, len(diffs)], dtype=np.float32)
+    return _lib_temporal_signature(events, ts_fn)
 
 
 def state_coverage(events: List, state_fn, mdp_states: set) -> float:
     """Fraction of MDP states visited by trajectory"""
-    if not mdp_states:
-        return 0.0
-    visited = set(state_fn(e) for e in events)
-    return len(visited & mdp_states) / len(mdp_states)
+    return _lib_state_coverage(events, state_fn, mdp_states)
 
 
 def transition_entropy(events: List, state_fn) -> float:
     """Compute entropy of transition distribution (randomness of navigation)"""
-    if len(events) < 2:
-        return 0.0
-    states = [state_fn(e) for e in events]
-    trans_counts = defaultdict(int)
-    for s, s_next in zip(states, states[1:]):
-        trans_counts[(s, s_next)] += 1
-    total = sum(trans_counts.values())
-    probs = [c / total for c in trans_counts.values()]
-    return -sum(p * np.log(p + 1e-10) for p in probs)
+    return _lib_transition_entropy(events, state_fn)
 
 
 def featurize_trajectory(events: List, mdp: Optional[Dict] = None, input_dim: int = 64) -> np.ndarray:
-    """Convert trajectory to fixed-dim feature vector"""
-    def _state_repr(e):
-        return f"{getattr(e, 'page', None) or 'unk'}|{getattr(e, 'productId', None) or 'none'}|{e.eventName}"
+    """Convert trajectory to fixed-dim feature vector - uses lib.features implementation"""
+    mdp_states = set(mdp.get('states', [])) if mdp else set()
 
     def _ts_fn(e):
-        ts = getattr(e, 'ts', None)
-        if isinstance(ts, str):
-            from datetime import datetime
-            try:
-                return datetime.fromisoformat(ts.replace('Z', '+00:00')).timestamp()
-            except:
-                return 0.0
-        return float(ts) if ts else 0.0
+        return parse_timestamp(get_timestamp(e))
 
-    feats = []
-    feats.extend(transition_histogram(events, _state_repr, max_states=40))  # 40 dims
-    feats.extend(temporal_signature(events, _ts_fn))  # 4 dims
-    mdp_states = set(mdp.get('states', [])) if mdp else set()
-    feats.append(state_coverage(events, _state_repr, mdp_states))  # 1 dim
-    feats.append(transition_entropy(events, _state_repr))  # 1 dim
-    feats.append(len(events))  # trajectory length
-    feats.append(len(set(_state_repr(e) for e in events)))  # unique states
+    def _event_name_fn(e):
+        return get_event_name(e)
 
-    # event type distribution (page_view, hover, cart, purchase indicators)
-    event_names = [e.eventName for e in events]
-    feats.append(sum(1 for n in event_names if 'page' in n.lower()) / (len(events) + 1))
-    feats.append(sum(1 for n in event_names if 'hover' in n.lower()) / (len(events) + 1))
-    feats.append(sum(1 for n in event_names if 'cart' in n.lower()) / (len(events) + 1))
-    feats.append(sum(1 for n in event_names if 'purchase' in n.lower() or 'checkout' in n.lower()) / (len(events) + 1))
-
-    # pad/truncate to input_dim
-    feats = np.array(feats[:input_dim], dtype=np.float32)
-    if len(feats) < input_dim:
-        feats = np.pad(feats, (0, input_dim - len(feats)))
-    return feats
+    return _lib_featurize_trajectory(events, event_to_state, _ts_fn, _event_name_fn, mdp_states, input_dim)
 
 
 # gradient boosting classifiers for comparison baselines
diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py
index 4c6bf21..3530724 100644
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -6,6 +6,18 @@ from collections import defaultdict
 from typing import Dict, List, Tuple, Set
 import numpy as np
 import graphviz
+import sys
+from pathlib import Path
+
+# import lib utilities for optional use - models keep their own _state_repr for backwards compat
+# with the specific event structure (evt.value.payload)
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))  # project root, so `lib` is importable as a package
+try:
+    from lib.state import make_state_repr as lib_make_state_repr
+    from lib.features import transition_histogram as lib_transition_histogram
+except ImportError:
+    lib_make_state_repr = None
+    lib_transition_histogram = None
 
 class BehaviorModel:
     def __init__(self, src_dir: str, loader_cls=Loader):
diff --git a/sim/rl/environment.py b/sim/rl/environment.py
index fd725f8..d9ccbcb 100644
--- a/sim/rl/environment.py
+++ b/sim/rl/environment.py
@@ -1,7 +1,5 @@
-from sys import intern
 import gymnasium as gym
 from gymnasium import spaces
-from matplotlib import interactive
 import numpy as np
 from dataclasses import dataclass
 import pandas as pd
@@ -15,7 +13,7 @@ class BusinessLogicConstraints():
     max_price_adjustment: float = 0.30
     system_max_price: float = 500.0
     system_min_price: float = 1.0
-    product_catelogue_size: int = 100
+    product_catalogue_size: int = 100
     episode_length: int = 200
     sessions_per_step: int = 250
     agent_share: float = 0.25
@@ -37,17 +35,42 @@ class BusinessLogicConstraints():
 def _sigmoid(x: np.ndarray) -> np.ndarray:
     return 1.0 / (1.0 + np.exp(-x))
 
+class BehavioralProfile:
+    """simple markov chain model for generating synthetic interaction events"""
+    def __init__(self, actor: str, purchase_probs: np.ndarray):
+        self.actor = actor
+        self.purchase_probs = purchase_probs  # within this class only its length is read (see sample)
+        self.states = ['view', 'cart', 'checkout']
+        # transition matrix: view->cart 0.3, view->view 0.6, view->exit 0.1, cart->checkout 0.5, cart->view 0.4, cart->exit 0.1
+        self.trans = {'view': {'view': 0.6, 'cart': 0.3, 'exit': 0.1}, 'cart': {'checkout': 0.5, 'view': 0.4, 'exit': 0.1}, 'checkout': {'exit': 1.0}}
+        if actor == 'agents':  # agents browse more before purchasing
+            self.trans['view'] = {'view': 0.75, 'cart': 0.15, 'exit': 0.1}
+            self.trans['cart'] = {'checkout': 0.3, 'view': 0.6, 'exit': 0.1}
+
+    def sample(self, rng: np.random.Generator) -> Dict[str, Any]:
+        """sample single interaction event: one transition out of 'view' for a uniformly drawn product index"""
+        product_idx = rng.integers(0, len(self.purchase_probs))
+        state = 'view'  # always start with view
+        # pick next state based on transition probs
+        trans = self.trans.get(state, {'exit': 1.0})
+        next_state = rng.choice(list(trans.keys()), p=list(trans.values()))
+        price_paid = 0.0 if next_state != 'checkout' else float(rng.uniform(50, 200))  # NOTE(review): 'view' has no 'checkout' transition, so this is always 0.0 here - confirm intended
+        return {'action': state, 'product_idx': product_idx, 'actor': 'agent' if self.actor == 'agents' else 'human', 't': 0.0, 'price_paid': price_paid}  # NOTE(review): 'action' is the start state, next_state is not reported
+
+
+def _load_behavioral_profile(actor: str, demand_forcing: np.ndarray) -> BehavioralProfile:
+    """build the BehavioralProfile used for generating synthetic sessions
+    actor: 'humans' or 'agents'
+    demand_forcing: per-product purchase probabilities, passed through as the profile's purchase_probs
+    """
+    return BehavioralProfile(actor, demand_forcing)
+
+
 class CommercePlatform:
-    """
-    This is just an extension of the state management for the environment, it does not implement anything dynamic just helps us simulate demand.
-    """
-    def __init__(self,
-                 product_catelogue_size: int,
-                 max_price: float,
-                 min_price: float,
-                 constraints: BusinessLogicConstraints):
-        self.product_catelogue_size = product_catelogue_size
-        self.product_supply = np.random.uniform(low=10, high=50, size=(self.product_catelogue_size,))
+    """state management for the environment, simulates demand"""
+    def __init__(self, product_catalogue_size: int, max_price: float, min_price: float, constraints: BusinessLogicConstraints):
+        self.product_catalogue_size = product_catalogue_size
+        self.product_supply = np.random.uniform(low=10, high=50, size=(self.product_catalogue_size,))
         self.max_price = max_price
         self.min_price = min_price
         self.constraints = constraints
@@ -55,27 +78,12 @@ class CommercePlatform:
         self._rng = np.random.default_rng(constraints.seed)
         self._last_interaction_df: pd.DataFrame = pd.DataFrame()
 
-
     def setup_true_demand(self, prices: np.ndarray) -> Dict[str, np.ndarray]:
-        # ground truth purchase propensities
         p = np.clip(prices, self.min_price, self.max_price)
         pn = p / self.max_price
         human_prob = self.constraints.base_human_demand * (pn ** self.constraints.human_price_elasticity)
         agent_prob = self.constraints.base_agent_demand * (pn ** self.constraints.agent_price_elasticity)
-        return {
-            "human_purchase_prob": np.clip(human_prob, 0.0, 0.95),
-            "agent_purchase_prob": np.clip(agent_prob, 0.0, 0.95)
-        }
-
-    def _load_behavioral_profile(actor : str, demand_forcing):
-        """
-        This returns a markov chain with average weights which we get from interaction data of our experiments.
-        This defines transition probabilities between different events:
-        search -> view_item_price_binN: 0.7
-        view_item_price_binN -> add_to_cart: 0.2
-        we also must reweight with the demand_forcing vector or purchase probabilities per-product
-        """
-
+        return {"human_purchase_prob": np.clip(human_prob, 0.0, 0.95), "agent_purchase_prob": np.clip(agent_prob, 0.0, 0.95)}
 
     def _simulate_sessions(self, base_prices: np.ndarray) -> pd.DataFrame:
         demand = self.setup_true_demand(base_prices)
@@ -162,22 +170,22 @@ class PHANTOMEnv(gym.Env):
         self.constraints = BusinessLogicConstraints()
         self.action_space = spaces.Box(low=-self.constraints.max_price_adjustment,
                                        high=self.constraints.max_price_adjustment,
-                                       shape=(self.constraints.product_catelogue_size,), dtype=np.float32)
+                                       shape=(self.constraints.product_catalogue_size,), dtype=np.float32)
         self.observation_space = spaces.Dict({
             "elasticity": spaces.Dict({
                 "price": spaces.Box(
-                    low=np.full((self.constraints.product_catelogue_size,), self.constraints.system_min_price, dtype=np.float32),
-                    high=np.full((self.constraints.product_catelogue_size,), self.constraints.system_max_price, dtype=np.float32),
+                    low=np.full((self.constraints.product_catalogue_size,), self.constraints.system_min_price, dtype=np.float32),
+                    high=np.full((self.constraints.product_catalogue_size,), self.constraints.system_max_price, dtype=np.float32),
                     dtype=np.float32),
                 "demand": spaces.Box(
-                    low=np.zeros((self.constraints.product_catelogue_size,), dtype=np.float32),
-                    high=np.full((self.constraints.product_catelogue_size,), 1e6, dtype=np.float32),
+                    low=np.zeros((self.constraints.product_catalogue_size,), dtype=np.float32),
+                    high=np.full((self.constraints.product_catalogue_size,), 1e6, dtype=np.float32),
                     dtype=np.float32),
             })
             # TODO: define more features that we compute from the interaction data
         })
         self.commerce_platform = CommercePlatform(
-            product_catelogue_size=self.constraints.product_catelogue_size,
+            product_catalogue_size=self.constraints.product_catalogue_size,
             max_price=self.constraints.system_max_price,
             min_price=self.constraints.system_min_price,
             constraints=self.constraints)
@@ -192,12 +200,12 @@ class PHANTOMEnv(gym.Env):
             self._rng = np.random.default_rng(seed)
             self.commerce_platform._rng = np.random.default_rng(seed)
         self.t = 0
-        init_prices = self._rng.uniform(low=60.0, high=140.0, size=(self.constraints.product_catelogue_size,)).astype(np.float32)
+        init_prices = self._rng.uniform(low=60.0, high=140.0, size=(self.constraints.product_catalogue_size,)).astype(np.float32)
         self._prev_prices = init_prices.copy()
         self.state = {
             "elasticity": {
                 "price": init_prices,
-                "demand": np.zeros((self.constraints.product_catelogue_size,), dtype=np.float32),
+                "demand": np.zeros((self.constraints.product_catalogue_size,), dtype=np.float32),
             }
         }
         return self.state, {}
@@ -210,38 +218,35 @@ class PHANTOMEnv(gym.Env):
                            self.constraints.system_max_price).astype(np.float32)
 
         self.state["elasticity"]["price"] = new_prices
-        # TODO: use the commerce platform to simulate sessions
         interactions_df = self.commerce_platform._simulate_sessions(new_prices)
         result = self.commerce_platform.compute_interaction_features(interactions_df)
-        # TODO: implement COI computation to use in reward
-        COI = 0.0
+        COI = 0.0  # TODO: implement cost-of-information computation
 
         volatility = 0.0 if self._prev_prices is None else \
             float(np.mean(np.abs((new_prices - self._prev_prices) / (self._prev_prices + 1e-6))))
         self._prev_prices = new_prices.copy()
 
-        revenue_observed = float(result["revenue_observed"])
-        agent_loss = float(result["agent_loss"])
+        # extract metrics with safe defaults for incomplete simulation
+        revenue_observed = float(result.get("revenue_observed", result.get("mean_sale_price", 0.0)))
+        agent_loss = float(result.get("agent_loss", 0.0))
 
         reward = (revenue_observed
                   - COI
                   - self.constraints.w_agent_loss * agent_loss
                   - self.constraints.w_volatility * volatility
-                  - self.constraints.w_estimation_error
-                  )
+                  - self.constraints.w_estimation_error)
 
         terminated = self.t >= self.constraints.episode_length
         info = {
             "t": self.t,
             "revenue_observed": revenue_observed,
-            "revenue_oracle": float(result["revenue_oracle"]),
+            "revenue_oracle": float(result.get("revenue_oracle", revenue_observed)),
             "agent_loss": agent_loss,
             "ux_volatility": volatility,
-            "mean_internal_error": err_mean,
-            "look_to_book": float(result["interaction_features"].get("look_to_book", 0.0)),
-            "mean_sale_price": float(result["interaction_features"].get("mean_sale_price", 0.0)),
-            "true_human_purchases_total": float(np.sum(result["true_human_demand"])),
-            "true_agent_purchases_total": float(np.sum(result["true_agent_purchases"])),
+            "look_to_book": float(result.get("look_to_book", 0.0)),
+            "mean_sale_price": float(result.get("mean_sale_price", 0.0)),
+            "true_human_purchases_total": 0.0,  # TODO: track from simulation
+            "true_agent_purchases_total": 0.0,  # TODO: track from simulation
         }
         return self.state, float(reward), terminated, False, info
 
@@ -250,46 +255,43 @@ if __name__ == "__main__":
     import matplotlib.pyplot as plt
     from collections import defaultdict
 
-    runs = {}
-    for use_defense in (False, True):
-        env = PHANTOMEnv(use_defense=use_defense)
-        obs, _ = env.reset(seed=42)
-        metrics = defaultdict(list)
-        total_reward = 0.0
-        done = False
+    env = PHANTOMEnv(constraints=BusinessLogicConstraints())
+    obs, _ = env.reset(seed=42)
+    metrics = defaultdict(list)
+    total_reward = 0.0
+    done = False
 
-        while not done:
-            action = env.action_space.sample()
-            obs, reward, done, _, info = env.step(action)
-            total_reward += reward
-            p_mean = float(np.mean(obs["elasticity"]["price"]))
-            q_mean = float(np.mean(obs["elasticity"]["demand"]))
-            p_std = float(np.std(obs["elasticity"]["price"]))
+    while not done:
+        action = env.action_space.sample()
+        obs, reward, done, _, info = env.step(action)
+        total_reward += reward
+        p_mean = float(np.mean(obs["elasticity"]["price"]))
+        q_mean = float(np.mean(obs["elasticity"]["demand"]))
+        p_std = float(np.std(obs["elasticity"]["price"]))
 
-            metrics['t'].append(info['t'])
-            metrics['price_mean'].append(p_mean)
-            metrics['price_std'].append(p_std)
-            metrics['demand_mean'].append(q_mean)
-            metrics['revenue_observed'].append(info['revenue_observed'])
-            metrics['revenue_oracle'].append(info['revenue_oracle'])
-            metrics['agent_loss'].append(info['agent_loss'])
-            metrics['ux_volatility'].append(info['ux_volatility'])
-            metrics['look_to_book'].append(info['look_to_book'])
-            metrics['reward'].append(reward)
-            metrics['human_purchases'].append(info['true_human_purchases_total'])
-            metrics['agent_purchases'].append(info['true_agent_purchases_total'])
+        metrics['t'].append(info['t'])
+        metrics['price_mean'].append(p_mean)
+        metrics['price_std'].append(p_std)
+        metrics['demand_mean'].append(q_mean)
+        metrics['revenue_observed'].append(info['revenue_observed'])
+        metrics['revenue_oracle'].append(info['revenue_oracle'])
+        metrics['agent_loss'].append(info['agent_loss'])
+        metrics['ux_volatility'].append(info['ux_volatility'])
+        metrics['look_to_book'].append(info['look_to_book'])
+        metrics['reward'].append(reward)
+        metrics['human_purchases'].append(info['true_human_purchases_total'])
+        metrics['agent_purchases'].append(info['true_agent_purchases_total'])
 
-            if info['t'] % 20 == 0 or done:
-                print(f"defense={'ON ' if use_defense else 'OFF'} t={info['t']:03d} p={p_mean:6.2f}±{p_std:4.2f} "
-                      f"q={q_mean:6.2f} rev={info['revenue_observed']:7.2f} oracle={info['revenue_oracle']:7.2f} "
-                      f"loss={info['agent_loss']:6.2f} ux={info['ux_volatility']:.3f} "
-                      f"ltb={info['look_to_book']:5.2f} r={reward:7.2f}")
+        if info['t'] % 20 == 0 or done:
+            print(f"t={info['t']:03d} p={p_mean:6.2f}±{p_std:4.2f} q={q_mean:6.2f} "
+                  f"rev={info['revenue_observed']:7.2f} oracle={info['revenue_oracle']:7.2f} "
+                  f"loss={info['agent_loss']:6.2f} ux={info['ux_volatility']:.3f} "
+                  f"ltb={info['look_to_book']:5.2f} r={reward:7.2f}")
 
-        runs[use_defense] = metrics
-        print(f"defense={'ON ' if use_defense else 'OFF'} total_reward={total_reward:.2f}\n")
+    print(f"total_reward={total_reward:.2f}")
 
     fig, axes = plt.subplots(3, 3, figsize=(15, 12))
-    fig.suptitle('PHANTOM Environment: Defense OFF vs ON', fontsize=14, fontweight='bold')
+    fig.suptitle('PHANTOM Environment Run', fontsize=14, fontweight='bold')
 
     plot_configs = [
         ('price_mean', 'Mean Price', 'Price'),
@@ -305,13 +307,10 @@ if __name__ == "__main__":
 
     for idx, (key, title, ylabel) in enumerate(plot_configs):
         ax = axes[idx // 3, idx % 3]
-        for use_defense, label, color in [(False, 'No Defense', 'red'), (True, 'With Defense', 'blue')]:
-            m = runs[use_defense]
-            ax.plot(m['t'], m[key], label=label, color=color, alpha=0.7, linewidth=1.5)
+        ax.plot(metrics['t'], metrics[key], color='blue', alpha=0.7, linewidth=1.5)
         ax.set_xlabel('Step')
         ax.set_ylabel(ylabel)
         ax.set_title(title, fontsize=10, fontweight='bold')
-        ax.legend(loc='best', fontsize=8)
         ax.grid(True, alpha=0.3)
 
     plt.tight_layout()

From dee6f573e34f8c3ab00349b1e556126f83f5ad9c Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 21 Jan 2026 19:12:56 +0100
Subject: [PATCH 35/99] feat: contaminator and training

---
 experiments/procesing/contaminator.py | 87 ++++++++++++++++----------
 sim/rl/train.py                       | 89 ++++++++++++++-------------
 2 files changed, 100 insertions(+), 76 deletions(-)

diff --git a/experiments/procesing/contaminator.py b/experiments/procesing/contaminator.py
index da44c3d..2f23b2b 100644
--- a/experiments/procesing/contaminator.py
+++ b/experiments/procesing/contaminator.py
@@ -1,45 +1,66 @@
 import pandas as pd
 import random
-from sim.rl.behavior_loader import AgentBehaviorModel # TODO: proper import this
+import os
+from pathlib import Path
 
-base_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments"
-agent_dir = f"{base_dir}/agents/collected_data/"
+# prefer the package import; fall back to a sys.path tweak when run standalone
+try:
+    from sim.rl.behavior_loader.models import AgentBehaviorModel
+except ImportError:
+    import sys
+    sys.path.insert(0, str(Path(__file__).parent.parent.parent / "sim" / "rl" / "behavior_loader"))
+    from models import AgentBehaviorModel
+
+# paths should be configurable via environment or relative to project root
+PROJECT_ROOT = Path(__file__).parent.parent.parent
+AGENT_DATA_DIR = Path(os.getenv('PHANTOM_AGENT_DATA_DIR', PROJECT_ROOT / "experiments" / "agents" / "collected_data"))
 
 
-
-def remap_schema(df : pd.DataFrame, mapping: dict, on: str = "event_type"):
+def remap_schema(df: pd.DataFrame, mapping: dict, on: str = "event_type") -> pd.DataFrame:
+    """remap column values according to mapping dict, preserving unmapped values"""
     df = df.copy()
     df[on] = df[on].map(mapping).fillna(df[on])
     return df
 
 
-def contaminate_dataset(df : pd.DataFrame, on : str = "event_type",
-                        contamination_rate: float = 0.1) -> pd.DataFrame:
-    model = AgentBehaviorModel(agent_dir)
-    target_df_schema = df[on].unique().tolist()
-    mapping = {
-        'view': 'view_page'
-        # TODO: define properly for the given dataset
-    }
-    # think about replacing with freqdist method from library
-    OG_event_distribution = df[on].value_counts(normalize=True).to_dict()
-    # normalize to weights
-    OG_event_distribution = {k: v / sum(OG_event_distribution.values()) for k, v in OG_event_distribution.items()}
-    mapped_df = remap_schema(df, mapping, on=on)
-    N = len(df)
-    N_final = N / (1 - contamination_rate) # TODO: explain this in paper
-    N_contaminate = int(N_final - N)
-    start_event_types = random.choices(list(OG_event_distribution.keys()),
-                                    weights=list(OG_event_distribution.values()), k=N_contaminate)
-    # it makes sense
-    new_trajectories = []
-    for start_event in start_event_types:
-        # sample from og start
-        start = None # TODO: defin start accoding to dataset (randomly sample with weights of event distr)
-        trajectory = model.sample_trajectory(start) # TODO: explain this method in paper
-        new_trajectories.extend(trajectory)
+def contaminate_dataset(df: pd.DataFrame, on: str = "event_type",
+                        contamination_rate: float = 0.1,
+                        agent_data_dir: Path = None) -> pd.DataFrame:
+    """inject synthetic agent trajectories into a dataset
+    contamination_rate: fraction of final dataset that should be agent data (0.1 = 10% agents)
+    """
+    data_dir = agent_data_dir or AGENT_DATA_DIR
+    model = AgentBehaviorModel(str(data_dir))
+    model.build_MDP()  # ensure MDP is built before sampling
 
-    # TODO: make sure the new trajctories schema conforms with dataset
-    contaminate_df = pd.DataFrame(new_trajectories)
-    df = pd.concat([df, contaminate_df], ignore_index=True)
+    # compute event distribution from original data
+    event_dist = df[on].value_counts(normalize=True).to_dict()
+    total = sum(event_dist.values())
+    event_dist = {k: v / total for k, v in event_dist.items()}
+
+    # calculate how many synthetic events to add
+    N = len(df)
+    N_final = N / (1 - contamination_rate)
+    N_contaminate = int(N_final - N)
+
+    # sample start states weighted by original distribution
+    start_events = random.choices(list(event_dist.keys()), weights=list(event_dist.values()), k=N_contaminate)
+
+    # generate synthetic trajectories
+    new_rows = []
+    for start_event in start_events:
+        # sample trajectory from agent model, using a state that contains the event type
+        mdp_states = model.mdp.get('states', []) if model.mdp else []
+        matching_starts = [s for s in mdp_states if start_event in s]
+        if not matching_starts:
+            continue  # skip if no matching start state
+        start_state = random.choice(matching_starts)
+        trajectory = model.sample_traj(start_state, max_len=20)
+        for state in trajectory:
+            parts = state.split('|')  # page|productId|eventName format
+            new_rows.append({on: parts[-1] if parts else start_event, 'source': 'synthetic_agent'})
+
+    if new_rows:
+        contaminate_df = pd.DataFrame(new_rows)
+        df = pd.concat([df, contaminate_df], ignore_index=True)
     return df
diff --git a/sim/rl/train.py b/sim/rl/train.py
index ba257de..01e6809 100644
--- a/sim/rl/train.py
+++ b/sim/rl/train.py
@@ -3,15 +3,17 @@ import logging
 from pathlib import Path
 from typing import Dict, Type, Optional
 import pickle
-from torch import neg_
 from torch.utils.tensorboard import SummaryWriter
-from environment import PHANTOMEnv, FastTrainingConstraints, BusinessLogicConstraints
-from engine import (BasePricingEngine, WildPricingEngine, StaticPricingEngine,
-                   SimpleDemandEngine, RandomWalkEngine, ThompsonSamplingEngine)
+from environment import PHANTOMEnv, BusinessLogicConstraints
 
 logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
 logger = logging.getLogger(__name__)
 
+try:
+    from engine import (BasePricingEngine, WildPricingEngine, StaticPricingEngine,
+                       SimpleDemandEngine, RandomWalkEngine, ThompsonSamplingEngine)
+except ImportError:
+    BasePricingEngine = None  # engines not required for basic usage
 
 
 """
@@ -26,8 +28,7 @@ CURRENT SOLUTION BELOW does not implement correct learning or updates.
 
 class EngineTrainer:
     """wrapper to run pricing engines through episodes and collect metrics"""
-    def __init__(self, engine: BasePricingEngine, env: PHANTOMEnv,
-                 tb_writer: Optional[SummaryWriter] = None):
+    def __init__(self, engine, env: PHANTOMEnv, tb_writer: Optional[SummaryWriter] = None):
         self.engine = engine
         self.env = env
         self.episode_metrics = []
@@ -35,7 +36,6 @@ class EngineTrainer:
         self.global_step = 0
 
     def train(self, n_episodes: int, seed: int = 42):
-
         obs, _ = self.env.reset(seed=seed)
         prices = None
         for ep in range(n_episodes):
@@ -44,12 +44,21 @@ class EngineTrainer:
             self.engine.update(obs, reward, done, info)
         return self
 
-
-
-
-
-
-        return self.episode_metrics
+    def run_episode(self, seed: int = 42) -> Dict:
+        """run single evaluation episode and return metrics"""
+        obs, _ = self.env.reset(seed=seed)
+        self.engine.reset()
+        total_reward, prices = 0.0, None
+        ep_metrics = {'total_reward': 0.0}
+        done = False
+        while not done:
+            prices = self.engine.compute_prices(prices, obs) if prices is not None else obs["elasticity"]["price"]
+            obs, reward, done, _, info = self.env.step(prices)
+            total_reward += reward
+            for k, v in info.items():
+                ep_metrics[k] = v
+        ep_metrics['total_reward'] = total_reward
+        return ep_metrics
 
     def evaluate(self, n_episodes: int = 10, seed: int = 100) -> Dict:
         """evaluate trained engine"""
@@ -57,17 +66,16 @@ class EngineTrainer:
                                    'agent_loss', 'ux_volatility', 'look_to_book']}
         for ep in range(n_episodes):
             metrics = self.run_episode(seed=seed + ep)
-            for k in results:                results[k].append(metrics[k])
+            for k in results:
+                results[k].append(metrics.get(k, 0.0))
         return {k: (np.mean(v), np.std(v)) for k, v in results.items()}
 
 
-def make_env(fast: bool = True):
-    constraints = FastTrainingConstraints() if fast else BusinessLogicConstraints()
-    return PHANTOMEnv(constraints=constraints)
+def make_env():
+    return PHANTOMEnv(constraints=BusinessLogicConstraints())
 
 
-def train_engine(engine_cls: Type[BasePricingEngine], env: PHANTOMEnv,
-                n_episodes: int, seed: int = 42,
+def train_engine(engine_cls, env: PHANTOMEnv, n_episodes: int, seed: int = 42,
                 tb_writer: Optional[SummaryWriter] = None) -> EngineTrainer:
     constraints = env.constraints
     engine = engine_cls(constraints=constraints, seed=seed)
@@ -80,15 +88,11 @@ def save_trainer(trainer: EngineTrainer, path: Path):
     """save engine state and metrics"""
     path.parent.mkdir(parents=True, exist_ok=True)
     with open(path, 'wb') as f:
-        pickle.dump({
-            'engine': trainer.engine,
-            'metrics': trainer.episode_metrics
-        }, f)
+        pickle.dump({'engine': trainer.engine, 'metrics': trainer.episode_metrics}, f)
     logger.info(f"Saved trainer to {path}")
 
 
-def load_trainer(path: Path, env: PHANTOMEnv,
-                 tb_writer: Optional[SummaryWriter] = None) -> EngineTrainer:
+def load_trainer(path: Path, env: PHANTOMEnv, tb_writer: Optional[SummaryWriter] = None) -> EngineTrainer:
     """load saved engine"""
     with open(path, 'rb') as f:
         data = pickle.load(f)
@@ -98,45 +102,44 @@ def load_trainer(path: Path, env: PHANTOMEnv,
 
 
 if __name__ == "__main__":
+    if BasePricingEngine is None:
+        logger.error("Engines not available, cannot run training")
+        exit(1)
+
     base_dir = Path("./runs")
     base_dir.mkdir(exist_ok=True)
 
     engines = {
         "Wild": WildPricingEngine,
         "Static": StaticPricingEngine,
-#        "SimpleDemand": SimpleDemandEngine,
         "RandomWalk": RandomWalkEngine,
         "ThompsonSampling": ThompsonSamplingEngine,
     }
-    defenses = [False, True]
     n_train_episodes = 50
     n_eval_episodes = 10
     seed = 42
-    fast_mode = True
 
-    logger.info(f"Training config: {n_train_episodes} episodes per engine, fast_mode={fast_mode}")
+    logger.info(f"Training config: {n_train_episodes} episodes per engine")
 
     trained_trainers = {}
 
     for engine_name, engine_cls in engines.items():
-        for use_defense in defenses:
-            defense_label = "defense_on" if use_defense else "defense_off"
-            run_name = f"{engine_name}_{defense_label}"
-            log_dir = base_dir / run_name
-            log_dir.mkdir(parents=True, exist_ok=True)
+        run_name = engine_name
+        log_dir = base_dir / run_name
+        log_dir.mkdir(parents=True, exist_ok=True)
 
-            logger.info(f"Training {engine_name} with defense={use_defense}")
-            logger.info(f"Log directory: {log_dir}")
+        logger.info(f"Training {engine_name}")
+        logger.info(f"Log directory: {log_dir}")
 
-            env = make_env(fast=fast_mode)
-            tb_writer = SummaryWriter(log_dir=str(log_dir))
-            trainer = train_engine(engine_cls, env, n_train_episodes, seed, tb_writer=tb_writer)
-            tb_writer.close()
+        env = make_env()
+        tb_writer = SummaryWriter(log_dir=str(log_dir))
+        trainer = train_engine(engine_cls, env, n_train_episodes, seed, tb_writer=tb_writer)
+        tb_writer.close()
 
-            save_path = log_dir / "trainer.pkl"
-            save_trainer(trainer, save_path)
+        save_path = log_dir / "trainer.pkl"
+        save_trainer(trainer, save_path)
 
-            trained_trainers[run_name] = (trainer, env)
+        trained_trainers[run_name] = (trainer, env)
 
     logger.info("Starting evaluation")
 

From c15bb1882e2e7ab34c978c3f470beab56f9ddab1 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Thu, 22 Jan 2026 11:40:12 +0100
Subject: [PATCH 36/99] chore: training and data refactors

---
 sim/rl/train.py            | 47 ++++++++++++++++++++++++++++----------
 sim/strong_learner/data.py | 15 +++++++++---
 2 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/sim/rl/train.py b/sim/rl/train.py
index 01e6809..1d21f24 100644
--- a/sim/rl/train.py
+++ b/sim/rl/train.py
@@ -4,16 +4,17 @@ from pathlib import Path
 from typing import Dict, Type, Optional
 import pickle
 from torch.utils.tensorboard import SummaryWriter
-from environment import PHANTOMEnv, BusinessLogicConstraints
+from sim.rl.environment import PHANTOMEnv, BusinessLogicConstraints
 
 logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
 logger = logging.getLogger(__name__)
 
 try:
-    from engine import (BasePricingEngine, WildPricingEngine, StaticPricingEngine,
+    from sim.rl.engine import (BasePricingEngine, WildPricingEngine, StaticPricingEngine,
                        SimpleDemandEngine, RandomWalkEngine, ThompsonSamplingEngine)
-except ImportError:
+except ImportError as e:
     BasePricingEngine = None  # engines not required for basic usage
+    print(e)
 
 
 """
@@ -36,27 +37,49 @@ class EngineTrainer:
         self.global_step = 0
 
     def train(self, n_episodes: int, seed: int = 42):
-        obs, _ = self.env.reset(seed=seed)
-        prices = None
         for ep in range(n_episodes):
-            prices = self.engine.compute_prices(prices, obs)
-            obs, reward, done, _, info = self.env.step(prices)
-            self.engine.update(obs, reward, done, info)
+            obs, _ = self.env.reset(seed=seed + ep)
+            self.engine.reset()
+            done = False
+            prev_prices = obs["elasticity"]["price"]
+            episode_reward = 0.0
+            last_info: Dict[str, float] = {}
+            while not done:
+                action_prices = self.engine.compute_prices(prev_prices, obs)
+                obs, reward, done, _, info = self.env.step(action_prices)
+                self.engine.update(obs, reward, done, info)
+                episode_reward += reward
+                prev_prices = obs["elasticity"]["price"]
+                last_info = info
+                if self.tb_writer:
+                    self.tb_writer.add_scalar("reward/step", reward, self.global_step)
+                    if "coi" in info:
+                        self.tb_writer.add_scalar("diagnostics/coi", info["coi"], self.global_step)
+                    if "alpha_hat" in info:
+                        self.tb_writer.add_scalar("diagnostics/alpha_hat", info["alpha_hat"], self.global_step)
+                self.global_step += 1
+            last_info = dict(last_info)
+            last_info.update({"episode_reward": episode_reward, "episode": ep})
+            self.episode_metrics.append(last_info)
+            if self.tb_writer:
+                self.tb_writer.add_scalar("reward/episode", episode_reward, ep)
         return self
 
     def run_episode(self, seed: int = 42) -> Dict:
         """run single evaluation episode and return metrics"""
         obs, _ = self.env.reset(seed=seed)
         self.engine.reset()
-        total_reward, prices = 0.0, None
+        total_reward = 0.0
+        prev_prices = obs["elasticity"]["price"]
         ep_metrics = {'total_reward': 0.0}
         done = False
         while not done:
-            prices = self.engine.compute_prices(prices, obs) if prices is not None else obs["elasticity"]["price"]
-            obs, reward, done, _, info = self.env.step(prices)
+            action_prices = self.engine.compute_prices(prev_prices, obs)
+            obs, reward, done, _, info = self.env.step(action_prices)
             total_reward += reward
             for k, v in info.items():
                 ep_metrics[k] = v
+            prev_prices = obs["elasticity"]["price"]
         ep_metrics['total_reward'] = total_reward
         return ep_metrics
 
@@ -106,7 +129,7 @@ if __name__ == "__main__":
         logger.error("Engines not available, cannot run training")
         exit(1)
 
-    base_dir = Path("./runs")
+    base_dir = Path("./sim/rl/runs")
     base_dir.mkdir(exist_ok=True)
 
     engines = {
diff --git a/sim/strong_learner/data.py b/sim/strong_learner/data.py
index 80129aa..e22c7db 100644
--- a/sim/strong_learner/data.py
+++ b/sim/strong_learner/data.py
@@ -1,4 +1,9 @@
-import os, requests, py7zr
+import os
+import requests
+try:
+    import py7zr  # type: ignore
+except ImportError:  # pragma: no cover - optional dependency
+    py7zr = None
 import pandas as pd
 from typing import Generator
 try:
@@ -22,12 +27,16 @@ class YooChooseLoader(Loader):
         self.entries = list(self.data.keys())
 
     def _setup(self):
+        if py7zr is None:
+            raise RuntimeError("py7zr is required to unpack YooChoose dataset. Install py7zr first.")
         os.makedirs(self.root, exist_ok=True)
         zip_path = f"{self.root}/temp.7z"
         with requests.get(self.URL, stream=True) as r:
             with open(zip_path, 'wb') as f:
-                for chunk in r.iter_content(8192): f.write(chunk)
-        with py7zr.SevenZipFile(zip_path, 'r') as z: z.extractall(self.root)
+                for chunk in r.iter_content(8192):
+                    f.write(chunk)
+        with py7zr.SevenZipFile(zip_path, 'r') as z:
+            z.extractall(self.root)
         os.remove(zip_path)
 
     def _make_interaction(self, sid: str, ts: str, item_id: str, event: str, page: str, meta: dict) -> InteractionModel:

From b7161573d7f11ae99b8816adde91cdd809eb6e65 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Thu, 22 Jan 2026 11:40:27 +0100
Subject: [PATCH 37/99] chore: mini docs

---
 sim/rl/behavior_loader/models.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py
index 3530724..33f83f4 100644
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -18,6 +18,9 @@ try:
 except ImportError:
     lib_make_state_repr = None
     lib_transition_histogram = None
+    print("lib no includable")
+
+
 
 class BehaviorModel:
     def __init__(self, src_dir: str, loader_cls=Loader):
@@ -206,6 +209,7 @@ def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "
 
 def kl_divergence(p: Dict[str, float], q: Dict[str, float]) -> float:
     eps = 1e-10
+    # KL(p || q): sum over all keys k in p of p[k] * log(p[k] / q[k]), eps-smoothed to avoid log(0)
     return sum((p[k] + eps) * np.log((p[k] + eps) / (q.get(k, 0.0) + eps)) for k in p)
 
 if __name__ == "__main__":

From 20c47fe85fbed16372204719ea0d2d4c242c4206 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Thu, 22 Jan 2026 11:40:47 +0100
Subject: [PATCH 38/99] review: planning environment refactoring

---
 sim/rl/environment.py | 366 +++++++++++++++++++++++++++++++++---------
 1 file changed, 290 insertions(+), 76 deletions(-)

diff --git a/sim/rl/environment.py b/sim/rl/environment.py
index d9ccbcb..926e152 100644
--- a/sim/rl/environment.py
+++ b/sim/rl/environment.py
@@ -3,11 +3,17 @@ from gymnasium import spaces
 import numpy as np
 from dataclasses import dataclass
 import pandas as pd
-from typing import Callable, Optional, Dict, Any, List
+from types import SimpleNamespace
+from typing import Optional, Dict, Any, List, Tuple
 
-# "learner"  agent learning to optimize pricing
-# "agent"  part of environment creating demand signals that learner processes
+from lib.separability import load_artifacts, score_session, estimate_alpha
+from sim.rl.behavior_loader.models import AgentBehaviorModel, BehaviorModel
 
+# "learner" agent learning to optimize pricing
+# "agent" part of environment creating demand signals that learner processes
+
+base_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments"
+human_dir, agent_dir = f"{base_dir}/collected_data/", f"{base_dir}/agents/collected_data/"
 @dataclass
 class BusinessLogicConstraints():
     max_price_adjustment: float = 0.30
@@ -35,27 +41,113 @@ class BusinessLogicConstraints():
 def _sigmoid(x: np.ndarray) -> np.ndarray:
     return 1.0 / (1.0 + np.exp(-x))
 
+EVENT_PAGE_MAP = {
+    "session_start": "/",
+    "view_item_page": "/products",
+    "learn_more_about_item": "/products/details",
+    "add_item_to_cart": "/cart",
+    "purchase_complete": "/checkout",
+    "session_end": "/checkout/success",
+}
+
+
 class BehavioralProfile:
-    """simple markov chain model for generating synthetic interaction events"""
+    """Synthetic Markov profile used to generate interaction sessions."""
+    # TODO: a lot of this is duplicated from models.py - refactor to share code better
+
     def __init__(self, actor: str, purchase_probs: np.ndarray):
         self.actor = actor
-        self.purchase_probs = purchase_probs
-        self.states = ['view', 'cart', 'checkout']
-        # transition matrix: view->cart 0.3, view->view 0.6, view->exit 0.1, cart->checkout 0.5, cart->view 0.4, cart->exit 0.1
-        self.trans = {'view': {'view': 0.6, 'cart': 0.3, 'exit': 0.1}, 'cart': {'checkout': 0.5, 'view': 0.4, 'exit': 0.1}, 'checkout': {'exit': 1.0}}
-        if actor == 'agents':  # agents browse more before purchasing
-            self.trans['view'] = {'view': 0.75, 'cart': 0.15, 'exit': 0.1}
-            self.trans['cart'] = {'checkout': 0.3, 'view': 0.6, 'exit': 0.1}
+        self.purchase_probs = np.clip(purchase_probs, 0.0, 0.95)
+        self.states = [
+            "session_start",
+            "view_item_page",
+            "learn_more_about_item",
+            "add_item_to_cart",
+            "purchase_complete",
+            "session_end",
+        ]
+        # base transition structure (human default)
+        self.transitions : Dict[str, Dict[str, float]];
 
-    def sample(self, rng: np.random.Generator) -> Dict[str, Any]:
-        """sample single interaction event"""
-        product_idx = rng.integers(0, len(self.purchase_probs))
-        state = 'view'  # always start with view
-        # pick next state based on transition probs
-        trans = self.trans.get(state, {'exit': 1.0})
-        next_state = rng.choice(list(trans.keys()), p=list(trans.values()))
-        price_paid = 0.0 if next_state != 'checkout' else float(rng.uniform(50, 200))
-        return {'action': state, 'product_idx': product_idx, 'actor': 'agent' if self.actor == 'agents' else 'human', 't': 0.0, 'price_paid': price_paid}
+        model = AgentBehaviorModel(agent_dir) if actor == "agents" else BehaviorModel(human_dir)
+        self.transitions = # TODO similarly to model.build_MDP_event_transitions() in models.py buidl the dict
+
+    def _transition_probs(self, state: str, product_idx: int) -> Dict[str, float]:
+        probs = dict(self.transitions.get(state, {"session_end": 1.0}))
+        if state == "add_item_to_cart":
+            base = probs.get("purchase_complete", 0.0)
+            demand_factor = float(self.purchase_probs[int(product_idx)])
+            if self.actor == "agents":
+                demand_factor *= 0.7
+            adjusted = np.clip(base * 0.5 + demand_factor * 0.5, 0.0, 0.95)
+            remainder = max(1e-6, 1.0 - adjusted)
+            other_total = sum(v for k, v in probs.items() if k != "purchase_complete")
+            scale = remainder / max(other_total, 1e-6)
+            for key in probs:
+                if key == "purchase_complete":
+                    probs[key] = adjusted
+                else:
+                    probs[key] = probs[key] * scale
+        total = sum(probs.values())
+        if total <= 0:
+            return {"session_end": 1.0}
+        return {state: val / total for state, val in probs.items()}
+
+    def sample_session(
+        self,
+        rng: np.random.Generator,
+        session_id: str,
+        prices: np.ndarray,
+        unit_cost: np.ndarray,
+    ) -> Tuple[List[Dict[str, Any]], List[SimpleNamespace]]:
+        """Generate a single session trajectory."""
+        # TODO: this is similar to the sample-trajectory method in models.py.
+        # We also have to respect the business constraints, which bound the Lipschitz continuity of the transitions and prices.
+        # We must constrain purchases so that the platform never offers a price below the cost of a product.
+
+        events: List[Dict[str, Any]] = []
+        feature_events: List[SimpleNamespace] = []
+        state = "session_start"
+        t = 0.0
+        product_idx = int(rng.integers(0, len(prices)))
+        product_id = f"product-{product_idx:04d}"
+
+        while state != "session_end" and len(events) < 40:
+            if state != "session_start":
+                price = float(prices[product_idx])
+                row = {
+                    "session_id": session_id,
+                    "actor": "agent" if self.actor == "agents" else "human",
+                    "eventName": state,
+                    "product_idx": product_idx,
+                    "productId": product_id,
+                    "price_offered": price,
+                    "price_paid": 0.0,
+                    "page": EVENT_PAGE_MAP.get(state, "/"),
+                    "ts": t,
+                    "unit_cost": float(unit_cost[product_idx]),
+                    "base_price": float(prices[product_idx]),
+                }
+                if state == "purchase_complete":
+                    noise = float(rng.normal(0.0, 0.015))
+                    row["price_paid"] = max(price * (1.0 + noise), row["unit_cost"])
+                events.append(row)
+                feature_events.append(
+                    SimpleNamespace(
+                        eventName=row["eventName"],
+                        page=row["page"],
+                        productId=row["productId"],
+                        ts=row["ts"],
+                    )
+                )
+
+            transitions = self._transition_probs(state, product_idx)
+            next_state = rng.choice(list(transitions.keys()), p=list(transitions.values()))
+            dwell = max(0.5, rng.gamma(shape=2.0, scale=1.0)) # TODO: should use params from the profile data
+            t += dwell
+            state = next_state
+
+        return events, feature_events
 
 
 def _load_behavioral_profile(actor: str, demand_forcing: np.ndarray) -> BehavioralProfile:
@@ -70,77 +162,160 @@ class CommercePlatform:
     """state management for the environment, simulates demand"""
     def __init__(self, product_catalogue_size: int, max_price: float, min_price: float, constraints: BusinessLogicConstraints):
         self.product_catalogue_size = product_catalogue_size
-        self.product_supply = np.random.uniform(low=10, high=50, size=(self.product_catalogue_size,))
         self.max_price = max_price
         self.min_price = min_price
         self.constraints = constraints
         self.simulation_history: List[Dict[str, Any]] = []
         self._rng = np.random.default_rng(constraints.seed)
         self._last_interaction_df: pd.DataFrame = pd.DataFrame()
+        self.unit_cost = np.random.uniform(low=15.0, high=60.0, size=(self.product_catalogue_size,)).astype(np.float32)
+        self.base_price = np.random.uniform(low=60.0, high=140.0, size=(self.product_catalogue_size,)).astype(np.float32)
+        self.alpha_hat = constraints.agent_share
+        try:
+            self.separability_artifacts = load_artifacts()
+        except FileNotFoundError:
+            self.separability_artifacts = None
 
     def setup_true_demand(self, prices: np.ndarray) -> Dict[str, np.ndarray]:
         p = np.clip(prices, self.min_price, self.max_price)
-        pn = p / self.max_price
-        human_prob = self.constraints.base_human_demand * (pn ** self.constraints.human_price_elasticity)
-        agent_prob = self.constraints.base_agent_demand * (pn ** self.constraints.agent_price_elasticity)
-        return {"human_purchase_prob": np.clip(human_prob, 0.0, 0.95), "agent_purchase_prob": np.clip(agent_prob, 0.0, 0.95)}
+        cost = np.clip(self.unit_cost, self.min_price * 0.2, self.max_price)
+        margin = np.clip((p - cost) / np.maximum(cost, 1e-3), -0.9, 2.0)
+        # isoelastic demand approximation: (p/c)**e ~= exp(e * (p - c) / c) for p near cost c
+        human_prob = self.constraints.base_human_demand * np.exp(self.constraints.human_price_elasticity * margin)
+        agent_prob = self.constraints.base_agent_demand * np.exp(self.constraints.agent_price_elasticity * margin)
+        return {
+            "human_purchase_prob": np.clip(human_prob, 0.0, 0.95),
+            "agent_purchase_prob": np.clip(agent_prob, 0.0, 0.95),
+        }
 
-    def _simulate_sessions(self, base_prices: np.ndarray) -> pd.DataFrame:
-        demand = self.setup_true_demand(base_prices)
-        human_pprob = demand["human_purchase_prob"]
-        agent_pprob = demand["agent_purchase_prob"]
-        events: List[Dict[str, Any]] = []
+    def _simulate_sessions(self, prices: np.ndarray) -> Tuple[pd.DataFrame, Dict[str, Any]]:
+        demand = self.setup_true_demand(prices)
         T = self.constraints.sessions_per_step
-        n_agent_sessions = int(round(T * self.constraints.agent_share))
-        n_human_sessions = T - n_agent_sessions
-        n_agent_ids = max(1, n_agent_sessions // 2)
+        effective_share = float(np.clip(self.alpha_hat, 0.0, 0.95))
+        n_agent_sessions = max(1, int(round(T * effective_share)))
+        n_human_sessions = max(1, T - n_agent_sessions)
+
         session_map = {
-            'humans': n_human_sessions,
-            'agents': n_agent_ids
+            "humans": n_human_sessions,
+            "agents": n_agent_sessions,
         }
         pprob_map = {
-            'humans': human_pprob,
-            'agents': agent_pprob
+            "humans": demand["human_purchase_prob"],
+            "agents": demand["agent_purchase_prob"],
         }
-        joint_events = []
-        for actor, n_sessions in session_map.items():
-            bp = _load_behavioral_profile(actor, pprob_map[actor])
-            counter = 0
-            events = []
-            while counter < n_sessions:
-                session_events = []
-                while len(session_events) == 0 or session_events[-1]['action'] == 'checkout':
-                    interaction_event = bp.sample(self._rng)
-                    interaction_event['session_id'] = f'{actor}_{counter:06d}'
-                    # TODO any other assignments
-                    session_events.append(interaction_event)
-                events.extend(session_events)
-                counter += 1
-            joint_events.extend(events)
 
-        return pd.DataFrame(joint_events)
+        rows: List[Dict[str, Any]] = []
+        session_scores: List[Dict[str, float]] = []
+        demand_human = np.zeros_like(prices, dtype=np.float32)
+        demand_agent = np.zeros_like(prices, dtype=np.float32)
+
+        for actor, n_sessions in session_map.items():
+            profile = _load_behavioral_profile(actor, pprob_map[actor])
+            for idx in range(n_sessions):
+                session_id = f"{actor}_{idx:06d}"
+                session_rows, feature_events = profile.sample_session(
+                    self._rng, session_id, prices, self.unit_cost
+                )
+                rows.extend(session_rows)
+                if session_rows:
+                    df_session = pd.DataFrame(session_rows)
+                    purchases = df_session[df_session["eventName"] == "purchase_complete"]
+                    if not purchases.empty:
+                        counts = purchases.groupby("product_idx").size()
+                        if actor == "agents":
+                            demand_agent[counts.index.to_numpy(dtype=int)] += counts.to_numpy(dtype=np.float32)
+                        else:
+                            demand_human[counts.index.to_numpy(dtype=int)] += counts.to_numpy(dtype=np.float32)
+                if self.separability_artifacts and feature_events:
+                    score = score_session(feature_events, self.separability_artifacts)
+                    session_scores.append(score)
+
+        interactions_df = pd.DataFrame(rows)
+        diagnostics = {
+            "alpha_hat": float(self.alpha_hat),
+            "session_scores": session_scores,
+            "demand_human": demand_human,
+            "demand_agent": demand_agent,
+        }
+
+        if session_scores:
+            alphas = [
+                estimate_alpha(s["prob_agent"], s["delta_h"], s["delta_a"], temperature=2.0)
+                for s in session_scores
+            ]
+            mean_alpha = float(np.mean(alphas))
+            # exponential moving average for stability
+            self.alpha_hat = 0.7 * self.alpha_hat + 0.3 * mean_alpha
+            diagnostics.update(
+                {
+                    "alpha_hat": float(self.alpha_hat),
+                    "delta_h_mean": float(np.mean([s["delta_h"] for s in session_scores])),
+                    "delta_a_mean": float(np.mean([s["delta_a"] for s in session_scores])),
+                    "prob_agent_mean": float(np.mean([s["prob_agent"] for s in session_scores])),
+                }
+            )
+
+        self._last_interaction_df = interactions_df
+        return interactions_df, diagnostics
 
     def compute_interaction_features(self, interaction_df: pd.DataFrame) -> Dict[str, float]:
         if interaction_df.empty:
-            return {"mean_sale_price": 0.0, "look_to_book": 0.0}
-        purchases = interaction_df[interaction_df["action"] == "purchase"]
+            return {
+                "revenue_observed": 0.0,
+                "revenue_oracle": 0.0,
+                "agent_loss": 0.0,
+                "true_human_purchases": 0.0,
+                "true_agent_purchases": 0.0,
+                "mean_sale_price": 0.0,
+                "look_to_book": 0.0,
+                "coi": 0.0,
+            }
+
+        purchases = interaction_df[interaction_df["eventName"] == "purchase_complete"]
+        human_purchases = purchases[purchases["actor"] == "human"]
+        agent_purchases = purchases[purchases["actor"] == "agent"]
+
+        revenue_observed = float(purchases["price_paid"].sum())
+        revenue_oracle = float(purchases["base_price"].sum())
+        agent_loss = float((agent_purchases["base_price"] - agent_purchases["price_paid"]).sum())
+
         mean_sale_price = float(purchases["price_paid"].mean()) if not purchases.empty else 0.0
-        views = float((interaction_df["action"] == "view").sum())
-        buys = float((interaction_df["action"] == "purchase").sum())
-        return {"mean_sale_price": mean_sale_price, "look_to_book": float(views / (buys + 1e-6))}
+        views = float((interaction_df["eventName"] == "view_item_page").sum())
+        look_to_book = float(views / (len(purchases) + 1e-6))
+        true_human = float(len(human_purchases))
+        true_agent = float(len(agent_purchases))
+
+        human_prices = human_purchases["price_offered"] if not human_purchases.empty else pd.Series(dtype=float)
+        human_costs = human_purchases["unit_cost"] if not human_purchases.empty else pd.Series(dtype=float)
+        coi = 0.0
+        if not human_prices.empty and not human_costs.empty:
+            # of the purchased items, what is the margin between the price and cost
+            # TODO: this should take into account the expected price we could have charged also
+            coi = float(np.maximum(0.0, human_prices.mean() - human_costs.mean()))
+
+        return {
+            "revenue_observed": revenue_observed,
+            "revenue_oracle": revenue_oracle,
+            "agent_loss": agent_loss,
+            "true_human_purchases": true_human,
+            "true_agent_purchases": true_agent,
+            "mean_sale_price": mean_sale_price,
+            "look_to_book": look_to_book,
+            "coi": coi,
+        }
 
     def _session_feature_table(self, df: pd.DataFrame) -> pd.DataFrame:
         # TODO: adapt this
         if df.empty:
             return pd.DataFrame()
         g = df.groupby("session_id", sort=False)
-        session_duration = g["t"].max() - g["t"].min()
+        session_duration = g["ts"].max() - g["ts"].min()
         total_interactions = g.size()
-        avg_time_between = g["t"].apply(lambda x: float(np.diff(np.sort(x.to_numpy())).mean()) if len(x) > 1 else 0.0)
+        avg_time_between = g["ts"].apply(lambda x: float(np.diff(np.sort(x.to_numpy())).mean()) if len(x) > 1 else 0.0)
         interaction_velocity = total_interactions / (session_duration + 1e-6)
-        views = g.apply(lambda x: int((x["action"] == "view").sum()), include_groups=False)
-        cart_adds = g.apply(lambda x: int((x["action"] == "cart").sum()), include_groups=False)
-        purchases = g.apply(lambda x: int((x["action"] == "purchase").sum()), include_groups=False)
+        views = g.apply(lambda x: int((x["eventName"] == "view_item_page").sum()), include_groups=False)
+        cart_adds = g.apply(lambda x: int((x["eventName"] == "add_item_to_cart").sum()), include_groups=False)
+        purchases = g.apply(lambda x: int((x["eventName"] == "purchase_complete").sum()), include_groups=False)
         conversion_rate = purchases / (views + 1e-6)
         is_agent = g["actor"].apply(lambda s: bool((s == "agent").any()), include_groups=False)
 
@@ -165,9 +340,9 @@ class CommercePlatform:
 class PHANTOMEnv(gym.Env):
     metadata = {"render_modes": []}
 
-    def __init__(self, constraints):
+    def __init__(self, constraints: Optional[BusinessLogicConstraints] = None):
         super().__init__()
-        self.constraints = BusinessLogicConstraints()
+        self.constraints = constraints if isinstance(constraints, BusinessLogicConstraints) else BusinessLogicConstraints()
         self.action_space = spaces.Box(low=-self.constraints.max_price_adjustment,
                                        high=self.constraints.max_price_adjustment,
                                        shape=(self.constraints.product_catalogue_size,), dtype=np.float32)
@@ -199,8 +374,19 @@ class PHANTOMEnv(gym.Env):
         if seed is not None:
             self._rng = np.random.default_rng(seed)
             self.commerce_platform._rng = np.random.default_rng(seed)
+        self.commerce_platform.alpha_hat = self.constraints.agent_share
         self.t = 0
-        init_prices = self._rng.uniform(low=60.0, high=140.0, size=(self.constraints.product_catalogue_size,)).astype(np.float32)
+        init_prices = self._rng.uniform(
+            low=60.0,
+            high=140.0,
+            size=(self.constraints.product_catalogue_size,),
+        ).astype(np.float32)
+        self.commerce_platform.unit_cost = self._rng.uniform(
+            low=15.0,
+            high=60.0,
+            size=(self.constraints.product_catalogue_size,),
+        ).astype(np.float32)
+        self.commerce_platform.base_price = init_prices.copy()
         self._prev_prices = init_prices.copy()
         self.state = {
             "elasticity": {
@@ -218,16 +404,21 @@ class PHANTOMEnv(gym.Env):
                            self.constraints.system_max_price).astype(np.float32)
 
         self.state["elasticity"]["price"] = new_prices
-        interactions_df = self.commerce_platform._simulate_sessions(new_prices)
+        interactions_df, diagnostics = self.commerce_platform._simulate_sessions(new_prices)
         result = self.commerce_platform.compute_interaction_features(interactions_df)
-        COI = 0.0  # TODO: implement cost-of-information computation
+        COI = float(result.get("coi", 0.0))
+
+        demand_vector = diagnostics.get("demand_human", np.zeros_like(new_prices)) + diagnostics.get(
+            "demand_agent", np.zeros_like(new_prices)
+        )
+        self.state["elasticity"]["demand"] = demand_vector.astype(np.float32)
 
         volatility = 0.0 if self._prev_prices is None else \
             float(np.mean(np.abs((new_prices - self._prev_prices) / (self._prev_prices + 1e-6))))
         self._prev_prices = new_prices.copy()
 
         # extract metrics with safe defaults for incomplete simulation
-        revenue_observed = float(result.get("revenue_observed", result.get("mean_sale_price", 0.0)))
+        revenue_observed = float(result.get("revenue_observed", 0.0))
         agent_loss = float(result.get("agent_loss", 0.0))
 
         reward = (revenue_observed
@@ -245,9 +436,21 @@ class PHANTOMEnv(gym.Env):
             "ux_volatility": volatility,
             "look_to_book": float(result.get("look_to_book", 0.0)),
             "mean_sale_price": float(result.get("mean_sale_price", 0.0)),
-            "true_human_purchases_total": 0.0,  # TODO: track from simulation
-            "true_agent_purchases_total": 0.0,  # TODO: track from simulation
+            "true_human_purchases_total": float(result.get("true_human_purchases", 0.0)),
+            "true_agent_purchases_total": float(result.get("true_agent_purchases", 0.0)),
+            "coi": COI,
+            "alpha_hat": diagnostics.get("alpha_hat", self.commerce_platform.alpha_hat),
+            "mean_human_demand": float(np.mean(diagnostics.get("demand_human", np.zeros_like(new_prices)))),
+            "mean_agent_demand": float(np.mean(diagnostics.get("demand_agent", np.zeros_like(new_prices)))),
         }
+        if "delta_h_mean" in diagnostics:
+            info.update(
+                {
+                    "delta_h_mean": diagnostics["delta_h_mean"],
+                    "delta_a_mean": diagnostics["delta_a_mean"],
+                    "prob_agent_mean": diagnostics["prob_agent_mean"],
+                }
+            )
         return self.state, float(reward), terminated, False, info
 
 
@@ -281,32 +484,43 @@ if __name__ == "__main__":
         metrics['reward'].append(reward)
         metrics['human_purchases'].append(info['true_human_purchases_total'])
         metrics['agent_purchases'].append(info['true_agent_purchases_total'])
+        metrics['coi'].append(info.get('coi', 0.0))
+        metrics['alpha_hat'].append(info.get('alpha_hat', env.commerce_platform.alpha_hat))
+        metrics['mean_human_demand'].append(info.get('mean_human_demand', 0.0))
+        metrics['mean_agent_demand'].append(info.get('mean_agent_demand', 0.0))
+        metrics['delta_h_mean'].append(info.get('delta_h_mean', 0.0))
+        metrics['delta_a_mean'].append(info.get('delta_a_mean', 0.0))
+        metrics['prob_agent_mean'].append(info.get('prob_agent_mean', 0.0))
 
         if info['t'] % 20 == 0 or done:
             print(f"t={info['t']:03d} p={p_mean:6.2f}±{p_std:4.2f} q={q_mean:6.2f} "
                   f"rev={info['revenue_observed']:7.2f} oracle={info['revenue_oracle']:7.2f} "
                   f"loss={info['agent_loss']:6.2f} ux={info['ux_volatility']:.3f} "
+                  f"coi={info.get('coi', 0.0):6.2f} alpha={info.get('alpha_hat', 0.0):4.2f} "
                   f"ltb={info['look_to_book']:5.2f} r={reward:7.2f}")
 
     print(f"total_reward={total_reward:.2f}")
 
-    fig, axes = plt.subplots(3, 3, figsize=(15, 12))
+    fig, axes = plt.subplots(3, 4, figsize=(18, 12))
     fig.suptitle('PHANTOM Environment Run', fontsize=14, fontweight='bold')
 
     plot_configs = [
         ('price_mean', 'Mean Price', 'Price'),
-        ('demand_mean', 'Mean Demand Estimate', 'Demand'),
+        ('demand_mean', 'Mean Demand (All)', 'Demand'),
+        ('mean_human_demand', 'Mean Human Demand', 'Count'),
+        ('mean_agent_demand', 'Mean Agent Demand', 'Count'),
         ('revenue_observed', 'Revenue (Observed)', 'Revenue'),
         ('agent_loss', 'Agent Loss (Oracle - Observed)', 'Loss'),
+        ('coi', 'Cost of Information', 'COI'),
+        ('alpha_hat', 'Estimated α̂', 'alpha'),
         ('ux_volatility', 'UX Volatility (Price Change)', 'Volatility'),
         ('look_to_book', 'Look-to-Book Ratio', 'Ratio'),
         ('reward', 'Step Reward', 'Reward'),
-        ('human_purchases', 'Human Purchases', 'Count'),
-        ('agent_purchases', 'Agent Purchases', 'Count'),
+        ('prob_agent_mean', 'Avg Agent Probability', 'Probability'),
     ]
 
     for idx, (key, title, ylabel) in enumerate(plot_configs):
-        ax = axes[idx // 3, idx % 3]
+        ax = axes[idx // 4, idx % 4]
         ax.plot(metrics['t'], metrics[key], color='blue', alpha=0.7, linewidth=1.5)
         ax.set_xlabel('Step')
         ax.set_ylabel(ylabel)

From 2b3d937be6edc4c8e1a1b09b74837ac18fbe0495 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Thu, 22 Jan 2026 11:46:32 +0100
Subject: [PATCH 39/99] feat: fixing alignment with premiums and specific
 extraction of data

---
 sim/rl/environment.py | 65 ++++++++++++++++++++++++++++++-------------
 1 file changed, 45 insertions(+), 20 deletions(-)

diff --git a/sim/rl/environment.py b/sim/rl/environment.py
index 926e152..f7877a5 100644
--- a/sim/rl/environment.py
+++ b/sim/rl/environment.py
@@ -7,7 +7,7 @@ from types import SimpleNamespace
 from typing import Optional, Dict, Any, List, Tuple
 
 from lib.separability import load_artifacts, score_session, estimate_alpha
-from sim.rl.behavior_loader.models import AgentBehaviorModel, BehaviorModel
+from sim.rl.behavior_loader.models import AgentBehaviorModel, BehaviorModel, aggregate_event_transitions
 
 # "learner" agent learning to optimize pricing
 # "agent" part of environment creating demand signals that learner processes
@@ -52,8 +52,8 @@ EVENT_PAGE_MAP = {
 
 
 class BehavioralProfile:
-    """Synthetic Markov profile used to generate interaction sessions."""
-    # TODO: a lot of this is duplicated from models.py - refactor to share code better
+    """Synthetic Markov profile used to generate interaction sessions.
+    Uses aggregate_event_transitions from models.py to build transition kernels from real data."""
 
     def __init__(self, actor: str, purchase_probs: np.ndarray):
         self.actor = actor
@@ -66,11 +66,31 @@ class BehavioralProfile:
             "purchase_complete",
             "session_end",
         ]
-        # base transition structure (human default)
-        self.transitions : Dict[str, Dict[str, float]];
-
         model = AgentBehaviorModel(agent_dir) if actor == "agents" else BehaviorModel(human_dir)
-        self.transitions = # TODO similarly to model.build_MDP_event_transitions() in models.py buidl the dict
+        mdp = model.build_MDP()
+        self.transitions = aggregate_event_transitions(mdp) if mdp.get("transitions") else self._fallback_transitions()
+        self.dwell_params = self._extract_dwell_params(mdp)
+
+    def _fallback_transitions(self) -> Dict[str, Dict[str, float]]:
+        # sensible defaults if no data available
+        return {
+            "session_start": {"view_item_page": 0.85, "session_end": 0.15},
+            "view_item_page": {"learn_more_about_item": 0.4, "add_item_to_cart": 0.3, "view_item_page": 0.2, "session_end": 0.1},
+            "learn_more_about_item": {"add_item_to_cart": 0.5, "view_item_page": 0.3, "session_end": 0.2},
+            "add_item_to_cart": {"purchase_complete": 0.6, "view_item_page": 0.25, "session_end": 0.15},
+            "purchase_complete": {"session_end": 1.0},
+        }
+
+    def _extract_dwell_params(self, mdp: Dict) -> Dict[str, Tuple[float, float]]:
+        # derive gamma params (shape, scale) from the MDP's state_values, which encode temporal progression
+        state_vals = mdp.get("state_values", {})
+        params = {}
+        for state in self.states:
+            val = state_vals.get(state, 0.5)
+            shape = 1.5 + val * 2.0  # higher progression -> longer dwell
+            scale = 0.8 + (1.0 - val) * 1.2
+            params[state] = (shape, scale)
+        return params
 
     def _transition_probs(self, state: str, product_idx: int) -> Dict[str, float]:
         probs = dict(self.transitions.get(state, {"session_end": 1.0}))
@@ -100,11 +120,7 @@ class BehavioralProfile:
         prices: np.ndarray,
         unit_cost: np.ndarray,
     ) -> Tuple[List[Dict[str, Any]], List[SimpleNamespace]]:
-        """Generate a single session trajectory."""
-        # TODO: this is similar to the sample trajectory method in models.
-        # we also have to respect business constraints which constrain the lipshitz continuity of the transitions and prices
-        # we must apply constraints on purcahses not to let the platform offer prices under the cost of a productid
-
+        """Generate a single session trajectory respecting business constraints."""
         events: List[Dict[str, Any]] = []
         feature_events: List[SimpleNamespace] = []
         state = "session_start"
@@ -112,25 +128,30 @@ class BehavioralProfile:
         product_idx = int(rng.integers(0, len(prices)))
         product_id = f"product-{product_idx:04d}"
 
+
+        # enforce price >= cost constraint (lipschitz bound on pricing)
+        # This is a sort of last resort so that a pricing learner cannot go rogue
+        cost = float(unit_cost[product_idx])
+        constrained_price = max(float(prices[product_idx]), cost * 1.05)  # 5% min margin
+
         while state != "session_end" and len(events) < 40:
             if state != "session_start":
-                price = float(prices[product_idx])
                 row = {
                     "session_id": session_id,
                     "actor": "agent" if self.actor == "agents" else "human",
                     "eventName": state,
                     "product_idx": product_idx,
                     "productId": product_id,
-                    "price_offered": price,
+                    "price_offered": constrained_price,
                     "price_paid": 0.0,
                     "page": EVENT_PAGE_MAP.get(state, "/"),
                     "ts": t,
-                    "unit_cost": float(unit_cost[product_idx]),
+                    "unit_cost": cost,
                     "base_price": float(prices[product_idx]),
                 }
                 if state == "purchase_complete":
                     noise = float(rng.normal(0.0, 0.015))
-                    row["price_paid"] = max(price * (1.0 + noise), row["unit_cost"])
+                    row["price_paid"] = max(constrained_price * (1.0 + noise), cost)
                 events.append(row)
                 feature_events.append(
                     SimpleNamespace(
@@ -143,7 +164,8 @@ class BehavioralProfile:
 
             transitions = self._transition_probs(state, product_idx)
             next_state = rng.choice(list(transitions.keys()), p=list(transitions.values()))
-            dwell = max(0.5, rng.gamma(shape=2.0, scale=1.0)) # TODO: should use params from the profile data
+            shape, scale = self.dwell_params.get(state, (2.0, 1.0))
+            dwell = max(0.3, rng.gamma(shape=shape, scale=scale))
             t += dwell
             state = next_state
 
@@ -287,11 +309,13 @@ class CommercePlatform:
 
         human_prices = human_purchases["price_offered"] if not human_purchases.empty else pd.Series(dtype=float)
         human_costs = human_purchases["unit_cost"] if not human_purchases.empty else pd.Series(dtype=float)
+        human_base = human_purchases["base_price"] if not human_purchases.empty else pd.Series(dtype=float)
         coi = 0.0
         if not human_prices.empty and not human_costs.empty:
-            # of the purchased items, what is the margin between the price and cost
-            # TODO: this should take into account the expected price we could have charged also
-            coi = float(np.maximum(0.0, human_prices.mean() - human_costs.mean()))
+            # COI = max(0, (E[P] - p_min) - 0.5 * expected premium), where p_min is cost and premium = base - realized
+            margin = human_prices.mean() - human_costs.mean()
+            expected_premium = human_base.mean() - human_prices.mean() if not human_base.empty else 0.0
+            coi = float(np.maximum(0.0, margin - expected_premium * 0.5))
 
         return {
             "revenue_observed": revenue_observed,
@@ -302,6 +326,7 @@ class CommercePlatform:
             "mean_sale_price": mean_sale_price,
             "look_to_book": look_to_book,
             "coi": coi,
+            "expected_premium": float(expected_premium) if not human_base.empty else 0.0,
         }
 
     def _session_feature_table(self, df: pd.DataFrame) -> pd.DataFrame:

From fa89347c4e474c7f2bc73ac769a986de78ead1aa Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Thu, 22 Jan 2026 11:48:24 +0100
Subject: [PATCH 40/99] feat: expanding market observation space

---
 sim/rl/environment.py | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/sim/rl/environment.py b/sim/rl/environment.py
index f7877a5..77fe4d5 100644
--- a/sim/rl/environment.py
+++ b/sim/rl/environment.py
@@ -291,6 +291,7 @@ class CommercePlatform:
                 "mean_sale_price": 0.0,
                 "look_to_book": 0.0,
                 "coi": 0.0,
+                "expected_premium": 0.0,
             }
 
         purchases = interaction_df[interaction_df["eventName"] == "purchase_complete"]
@@ -330,7 +331,7 @@ class CommercePlatform:
         }
 
     def _session_feature_table(self, df: pd.DataFrame) -> pd.DataFrame:
-        # TODO: adapt this
+        """Extract per-session behavioral features for separability analysis."""
         if df.empty:
             return pd.DataFrame()
         g = df.groupby("session_id", sort=False)
@@ -341,8 +342,13 @@ class CommercePlatform:
         views = g.apply(lambda x: int((x["eventName"] == "view_item_page").sum()), include_groups=False)
         cart_adds = g.apply(lambda x: int((x["eventName"] == "add_item_to_cart").sum()), include_groups=False)
         purchases = g.apply(lambda x: int((x["eventName"] == "purchase_complete").sum()), include_groups=False)
+        learn_more = g.apply(lambda x: int((x["eventName"] == "learn_more_about_item").sum()), include_groups=False)
         conversion_rate = purchases / (views + 1e-6)
         is_agent = g["actor"].apply(lambda s: bool((s == "agent").any()), include_groups=False)
+        # price sensitivity features
+        price_variance = g["price_offered"].var().fillna(0.0)
+        avg_price_seen = g["price_offered"].mean().fillna(0.0)
+        products_viewed = g["product_idx"].nunique()
 
         return pd.DataFrame({
             "session_duration_sec": session_duration.astype(float),
@@ -352,7 +358,11 @@ class CommercePlatform:
             "item_views": views.astype(int),
             "cart_adds": cart_adds.astype(int),
             "purchases": purchases.astype(int),
+            "learn_more_clicks": learn_more.astype(int),
             "conversion_rate": conversion_rate.astype(float),
+            "price_variance": price_variance.astype(float),
+            "avg_price_seen": avg_price_seen.astype(float),
+            "products_viewed": products_viewed.astype(int),
             "is_agent": is_agent.astype(bool),
         }).reset_index()
 
@@ -371,18 +381,25 @@ class PHANTOMEnv(gym.Env):
         self.action_space = spaces.Box(low=-self.constraints.max_price_adjustment,
                                        high=self.constraints.max_price_adjustment,
                                        shape=(self.constraints.product_catalogue_size,), dtype=np.float32)
+        n_products = self.constraints.product_catalogue_size
         self.observation_space = spaces.Dict({
             "elasticity": spaces.Dict({
                 "price": spaces.Box(
-                    low=np.full((self.constraints.product_catalogue_size,), self.constraints.system_min_price, dtype=np.float32),
-                    high=np.full((self.constraints.product_catalogue_size,), self.constraints.system_max_price, dtype=np.float32),
+                    low=np.full((n_products,), self.constraints.system_min_price, dtype=np.float32),
+                    high=np.full((n_products,), self.constraints.system_max_price, dtype=np.float32),
                     dtype=np.float32),
                 "demand": spaces.Box(
-                    low=np.zeros((self.constraints.product_catalogue_size,), dtype=np.float32),
-                    high=np.full((self.constraints.product_catalogue_size,), 1e6, dtype=np.float32),
+                    low=np.zeros((n_products,), dtype=np.float32),
+                    high=np.full((n_products,), 1e6, dtype=np.float32),
                     dtype=np.float32),
-            })
-            # TODO: define more features that we compute from the interaction data
+            }),
+            "market": spaces.Dict({
+                "alpha_hat": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),  # estimated agent share
+                "revenue_rate": spaces.Box(low=0.0, high=1e6, shape=(1,), dtype=np.float32),  # recent revenue
+                "conversion_rate": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
+                "price_volatility": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
+            }),
+            "cost": spaces.Box(low=0.0, high=self.constraints.system_max_price, shape=(n_products,), dtype=np.float32),
         })
         self.commerce_platform = CommercePlatform(
             product_catalogue_size=self.constraints.product_catalogue_size,

From a6e6cc5d60e4c99d2bbc7a6373a899d6f13679d1 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Thu, 22 Jan 2026 12:52:41 +0100
Subject: [PATCH 41/99] feat: baseline setup for RL modeling

---
 experiments/procesing/contaminator.py | 55 +++++++++++++++++--
 sim/rl/engine.py                      | 55 ++++++++++---------
 sim/rl/environment.py                 | 79 +++++++++++++++++++++++++--
 3 files changed, 152 insertions(+), 37 deletions(-)

diff --git a/experiments/procesing/contaminator.py b/experiments/procesing/contaminator.py
index 2f23b2b..73f9bfd 100644
--- a/experiments/procesing/contaminator.py
+++ b/experiments/procesing/contaminator.py
@@ -1,7 +1,13 @@
-import pandas as pd
-import random
+from __future__ import annotations
+
 import os
+import random
 from pathlib import Path
+from types import SimpleNamespace
+
+import pandas as pd
+
+from lib.separability import estimate_alpha, load_artifacts, score_session
 
 # use relative import when in package context, fallback for standalone
 try:
@@ -15,6 +21,11 @@ except ImportError:
 PROJECT_ROOT = Path(__file__).parent.parent.parent
 AGENT_DATA_DIR = Path(os.getenv('PHANTOM_AGENT_DATA_DIR', PROJECT_ROOT / "experiments" / "agents" / "collected_data"))
 
+try:
+    SEPARABILITY_ARTIFACTS = load_artifacts()
+except FileNotFoundError:
+    SEPARABILITY_ARTIFACTS = None
+
 
 def remap_schema(df: pd.DataFrame, mapping: dict, on: str = "event_type") -> pd.DataFrame:
     """remap column values according to mapping dict, preserving unmapped values"""
@@ -23,6 +34,24 @@ def remap_schema(df: pd.DataFrame, mapping: dict, on: str = "event_type") -> pd.
     return df
 
 
+def _states_to_events(states: list[str]) -> list[SimpleNamespace]:  # adapt sampled states into event objects for score_session
+    events: list[SimpleNamespace] = []
+    for idx, state in enumerate(states):
+        parts = state.split("|") if isinstance(state, str) else ["page", "product", str(state)]  # string states look like page|productId|eventName
+        page = f"/{parts[0]}" if parts else "/"
+        product = parts[1] if len(parts) > 1 else "unknown"  # placeholder when the state has no product segment
+        event_name = parts[2] if len(parts) > 2 else parts[-1]  # short states: the last segment doubles as the event name
+        events.append(
+            SimpleNamespace(
+                eventName=event_name,
+                page=page,
+                productId=product,
+                ts=float(idx),  # trajectory position stands in for a timestamp
+            )
+        )
+    return events
+
+
 def contaminate_dataset(df: pd.DataFrame, on: str = "event_type",
                         contamination_rate: float = 0.1,
                         agent_data_dir: Path = None) -> pd.DataFrame:
@@ -48,6 +77,7 @@ def contaminate_dataset(df: pd.DataFrame, on: str = "event_type",
 
     # generate synthetic trajectories
     new_rows = []
+    alpha_estimates = []
     for start_event in start_events:
         # sample trajectory from agent model, using a state that contains the event type
         mdp_states = model.mdp.get('states', []) if model.mdp else []
@@ -56,11 +86,28 @@ def contaminate_dataset(df: pd.DataFrame, on: str = "event_type",
             continue  # skip if no matching start state
         start_state = random.choice(matching_starts)
         trajectory = model.sample_traj(start_state, max_len=20)
+        score_payload: list[SimpleNamespace] = []
+        score: dict[str, float] = {}
+        if SEPARABILITY_ARTIFACTS:
+            score_payload = _states_to_events(trajectory)
+            score = score_session(score_payload, SEPARABILITY_ARTIFACTS)
+            alpha_estimates.append(
+                estimate_alpha(score["prob_agent"], score["delta_h"], score["delta_a"], temperature=2.0)
+            )
+
         for state in trajectory:
-            parts = state.split('|')  # page|productId|eventName format
-            new_rows.append({on: parts[-1] if parts else start_event, 'source': 'synthetic_agent'})
+            parts = state.split('|') if isinstance(state, str) else [start_event]
+            new_rows.append({
+                on: parts[-1] if parts else start_event,
+                'source': 'synthetic_agent',
+                'prob_agent': score.get('prob_agent') if SEPARABILITY_ARTIFACTS and score_payload else None,
+                'delta_h': score.get('delta_h') if SEPARABILITY_ARTIFACTS and score_payload else None,
+                'delta_a': score.get('delta_a') if SEPARABILITY_ARTIFACTS and score_payload else None,
+            })
 
     if new_rows:
         contaminate_df = pd.DataFrame(new_rows)
         df = pd.concat([df, contaminate_df], ignore_index=True)
+        if alpha_estimates:
+            df['estimated_alpha'] = sum(alpha_estimates) / len(alpha_estimates)
     return df
diff --git a/sim/rl/engine.py b/sim/rl/engine.py
index e0caca8..ab751e3 100644
--- a/sim/rl/engine.py
+++ b/sim/rl/engine.py
@@ -1,9 +1,8 @@
-from os import kill
 import numpy as np
 import pandas as pd
 from abc import ABC, abstractmethod
 from typing import Dict, Any
-from environment import BusinessLogicConstraints
+from sim.rl.environment import BusinessLogicConstraints
 
 
 """
@@ -32,9 +31,11 @@ class BasePricingEngine(ABC):
         """
         pass
 
-    @abstractmethod
-    def update(obs, reward, done, info):
-        pass
+    def update(self, observation: Dict[str, Any], reward: float, done: bool, info: Dict[str, Any]) -> None:
+        """Default update: remember the latest observation, reward and info. Engines can override as needed."""
+        self.last_observation = observation
+        self.last_reward = reward
+        self.last_info = info  # note: `done` is accepted but not stored
 
 
 
@@ -48,14 +49,14 @@ class WildPricingEngine(BasePricingEngine):
     def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
         super().__init__(constraints, seed)
         # per-product unit costs (unknown to customers; known to platform)
-        self.unit_cost = self.rng.uniform(8.0, 40.0, size=self.c.product_catelogue_size).astype(np.float32)
+        self.unit_cost = self.rng.uniform(8.0, 40.0, size=self.c.product_catalogue_size).astype(np.float32)
         # online elasticity estimate (start moderately elastic)
-        self.e_hat = np.full((self.c.product_catelogue_size,), -1.3, dtype=np.float32)
+        self.e_hat = np.full((self.c.product_catalogue_size,), -1.3, dtype=np.float32)
         # EWMA state for log-log regression
-        self.mu_logp = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
-        self.mu_logq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
-        self.cov_pq  = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
-        self.var_p   = np.ones(self.c.product_catelogue_size, dtype=np.float32)
+        self.mu_logp = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
+        self.mu_logq = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
+        self.cov_pq  = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
+        self.var_p   = np.ones(self.c.product_catalogue_size, dtype=np.float32)
         # knobs typical in production
         self.lr = 0.08
         self.ewma = 0.05
@@ -67,16 +68,16 @@ class WildPricingEngine(BasePricingEngine):
 
     def reset(self):
         super().reset()
-        self.e_hat = np.full((self.c.product_catelogue_size,), -1.3, dtype=np.float32)
-        self.mu_logp = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
-        self.mu_logq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
-        self.cov_pq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
-        self.var_p = np.ones(self.c.product_catelogue_size, dtype=np.float32)
+        self.e_hat = np.full((self.c.product_catalogue_size,), -1.3, dtype=np.float32)
+        self.mu_logp = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
+        self.mu_logq = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
+        self.cov_pq = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
+        self.var_p = np.ones(self.c.product_catalogue_size, dtype=np.float32)
 
     def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
         self.step_count += 1
         # extract demand signal (from env observation) as proxy for sales
-        demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32))
+        demand = observation.get('demand', np.zeros(self.c.product_catalogue_size, dtype=np.float32))
         return self._update_from_demand(current_prices, demand)
 
     def _update_from_demand(self, prices: np.ndarray, sold: np.ndarray) -> np.ndarray:
@@ -140,7 +141,7 @@ class SimpleDemandEngine(BasePricingEngine):
 
     def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
         self.step_count += 1
-        demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32))
+        demand = observation.get('demand', np.zeros(self.c.product_catalogue_size, dtype=np.float32))
         if self.prev_demand is None:
             self.prev_demand = demand.copy()
             return current_prices.copy()
@@ -187,15 +188,15 @@ class ThompsonSamplingEngine(BasePricingEngine):
     def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
         super().__init__(constraints, seed)
         self.n_price_levels = 5
-        self.alpha = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
-        self.beta = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
+        self.alpha = np.ones((self.c.product_catalogue_size, self.n_price_levels), dtype=np.float32)
+        self.beta = np.ones((self.c.product_catalogue_size, self.n_price_levels), dtype=np.float32)
         self.price_grid = None
         self.last_actions = None
 
     def reset(self):
         super().reset()
-        self.alpha = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
-        self.beta = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
+        self.alpha = np.ones((self.c.product_catalogue_size, self.n_price_levels), dtype=np.float32)
+        self.beta = np.ones((self.c.product_catalogue_size, self.n_price_levels), dtype=np.float32)
         self.price_grid = None
         self.last_actions = None
 
@@ -206,10 +207,10 @@ class ThompsonSamplingEngine(BasePricingEngine):
             lo = current_prices * 0.7
             hi = current_prices * 1.3
             self.price_grid = np.linspace(lo, hi, self.n_price_levels).T
-        demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32))
+        demand = observation.get('demand', np.zeros(self.c.product_catalogue_size, dtype=np.float32))
         # update beliefs based on last action
         if self.last_actions is not None:
-            for i in range(self.c.product_catelogue_size):
+            for i in range(self.c.product_catalogue_size):
                 a = self.last_actions[i]
                 reward = demand[i]
                 if reward > 0.5:
@@ -217,9 +218,9 @@ class ThompsonSamplingEngine(BasePricingEngine):
                 else:
                     self.beta[i, a] += 1.0
         # thompson sampling: sample from posterior, pick best
-        new_prices = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
-        actions = np.zeros(self.c.product_catelogue_size, dtype=int)
-        for i in range(self.c.product_catelogue_size):
+        new_prices = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
+        actions = np.zeros(self.c.product_catalogue_size, dtype=int)
+        for i in range(self.c.product_catalogue_size):
             theta = self.rng.beta(self.alpha[i], self.beta[i]).astype(np.float32)
             actions[i] = int(np.argmax(theta))
             new_prices[i] = self.price_grid[i, actions[i]]
diff --git a/sim/rl/environment.py b/sim/rl/environment.py
index 77fe4d5..f1a7f53 100644
--- a/sim/rl/environment.py
+++ b/sim/rl/environment.py
@@ -22,7 +22,7 @@ class BusinessLogicConstraints():
     product_catalogue_size: int = 100
     episode_length: int = 200
     sessions_per_step: int = 250
-    agent_share: float = 0.25
+    agent_share: float = 0.5
     agent_recon_multiplier: float = 6.0
     agent_purchase_probability: float = 0.20
     coi_strength: float = 0.25
@@ -43,13 +43,45 @@ def _sigmoid(x: np.ndarray) -> np.ndarray:
 
 EVENT_PAGE_MAP = {
     "session_start": "/",
+    "page_view": "/",
     "view_item_page": "/products",
     "learn_more_about_item": "/products/details",
     "add_item_to_cart": "/cart",
+    "checkout_start": "/checkout",
     "purchase_complete": "/checkout",
     "session_end": "/checkout/success",
 }
 
+# map real collected event names to canonical simulation states (several raw events may share one state)
+EVENT_CANONICAL_MAP = {
+    "page_view": "session_start",
+    "hover_over_paragraph": "view_item_page",
+    "hover_over_title": "view_item_page",
+    "view_item_page": "view_item_page",
+    "learn_more_about_item": "learn_more_about_item",
+    "add_item_to_cart": "add_item_to_cart",
+    "checkout_start": "purchase_complete",  # starting checkout is mapped onto the purchase state
+    "remove_item": "view_item_page",
+}
+
+
+def _canonicalize_transitions(raw_trans: Dict[str, Dict[str, float]]) -> Dict[str, Dict[str, float]]:
+    """Rename raw event names to canonical states, merging and re-normalising rows."""
+    merged: Dict[str, Dict[str, float]] = {}
+    for raw_src, raw_dsts in raw_trans.items():
+        # several raw sources may collapse onto the same canonical row
+        row = merged.setdefault(EVENT_CANONICAL_MAP.get(raw_src, raw_src), {})
+        for raw_dst, weight in raw_dsts.items():
+            target = EVENT_CANONICAL_MAP.get(raw_dst, raw_dst)
+            # accumulate mass when distinct raw destinations share a canonical name
+            row[target] = row.get(target, 0.0) + weight
+    # merged rows no longer sum to 1, so rescale every row with positive mass
+    for state, row in merged.items():
+        row_total = sum(row.values())
+        if row_total > 0:
+            merged[state] = {name: mass / row_total for name, mass in row.items()}
+    return merged
+
 
 class BehavioralProfile:
     """Synthetic Markov profile used to generate interaction sessions.
@@ -68,11 +100,23 @@ class BehavioralProfile:
         ]
         model = AgentBehaviorModel(agent_dir) if actor == "agents" else BehaviorModel(human_dir)
         mdp = model.build_MDP()
-        self.transitions = aggregate_event_transitions(mdp) if mdp.get("transitions") else self._fallback_transitions()
+        raw_trans = aggregate_event_transitions(mdp) if mdp.get("transitions") else {}
+        self.transitions = _canonicalize_transitions(raw_trans) if raw_trans else self._fallback_transitions()
+        self._ensure_terminal_states()
         self.dwell_params = self._extract_dwell_params(mdp)
 
+    def _ensure_terminal_states(self):
+        """Make sure purchase_complete can reach session_end and session_start has a row."""
+        table = self.transitions
+        if "purchase_complete" not in table:
+            table["purchase_complete"] = {"session_end": 1.0}
+        elif "session_end" not in table["purchase_complete"]:
+            table["purchase_complete"]["session_end"] = 1.0  # added with weight 1.0, then the row is rescaled
+            norm = sum(table["purchase_complete"].values())
+            table["purchase_complete"] = {k: v / norm for k, v in table["purchase_complete"].items()}
+        table.setdefault("session_start", {"view_item_page": 0.7, "learn_more_about_item": 0.2, "session_end": 0.1})
+
     def _fallback_transitions(self) -> Dict[str, Dict[str, float]]:
-        # sensible defaults if no data available
         return {
             "session_start": {"view_item_page": 0.85, "session_end": 0.15},
             "view_item_page": {"learn_more_about_item": 0.4, "add_item_to_cart": 0.3, "view_item_page": 0.2, "session_end": 0.1},
@@ -82,12 +126,16 @@ class BehavioralProfile:
         }
 
     def _extract_dwell_params(self, mdp: Dict) -> Dict[str, Tuple[float, float]]:
-        # derive gamma params (shape, scale) from state_rewards which encode temporal progression
         state_vals = mdp.get("state_values", {})
         params = {}
         for state in self.states:
+            # try canonical and raw state names
             val = state_vals.get(state, 0.5)
-            shape = 1.5 + val * 2.0  # higher progression -> longer dwell
+            for raw, canon in EVENT_CANONICAL_MAP.items():
+                if canon == state and raw in state_vals:
+                    val = state_vals[raw]
+                    break
+            shape = 1.5 + val * 2.0
             scale = 0.8 + (1.0 - val) * 1.2
             params[state] = (shape, scale)
         return params
@@ -434,7 +482,14 @@ class PHANTOMEnv(gym.Env):
             "elasticity": {
                 "price": init_prices,
                 "demand": np.zeros((self.constraints.product_catalogue_size,), dtype=np.float32),
-            }
+            },
+            "market": {
+                "alpha_hat": np.array([self.constraints.agent_share], dtype=np.float32),
+                "revenue_rate": np.array([0.0], dtype=np.float32),
+                "conversion_rate": np.array([0.0], dtype=np.float32),
+                "price_volatility": np.array([0.0], dtype=np.float32),
+            },
+            "cost": self.commerce_platform.unit_cost.astype(np.float32),
         }
         return self.state, {}
 
@@ -459,6 +514,18 @@ class PHANTOMEnv(gym.Env):
             float(np.mean(np.abs((new_prices - self._prev_prices) / (self._prev_prices + 1e-6))))
         self._prev_prices = new_prices.copy()
 
+        # update market observation features
+        total_demand = float(np.sum(demand_vector))
+        total_purchases = float(result.get("true_human_purchases", 0.0) + result.get("true_agent_purchases", 0.0))
+        conv_rate = total_purchases / max(total_demand, 1.0)
+        self.state["market"] = {
+            "alpha_hat": np.array([float(diagnostics.get("alpha_hat", self.commerce_platform.alpha_hat))], dtype=np.float32),
+            "revenue_rate": np.array([float(result.get("revenue_observed", 0.0))], dtype=np.float32),
+            "conversion_rate": np.array([float(np.clip(conv_rate, 0.0, 1.0))], dtype=np.float32),
+            "price_volatility": np.array([float(volatility)], dtype=np.float32),
+        }
+        self.state["cost"] = self.commerce_platform.unit_cost.astype(np.float32)
+
         # extract metrics with safe defaults for incomplete simulation
         revenue_observed = float(result.get("revenue_observed", 0.0))
         agent_loss = float(result.get("agent_loss", 0.0))

From a217d53556fbad0dab100eb5c8d601b63c971803 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Thu, 22 Jan 2026 13:10:01 +0100
Subject: [PATCH 42/99] feat: translating features to jax

---
 sim/rl/jax_core/features.py     | 69 +++++++++++++++++++++++++
 sim/rl/jax_core/separability.py | 43 +++++++++++++++
 sim/rl/jax_core/simulation.py   | 92 +++++++++++++++++++++++++++++++++
 sim/rl/jax_core/transitions.py  | 47 +++++++++++++++++
 4 files changed, 251 insertions(+)
 create mode 100644 sim/rl/jax_core/features.py
 create mode 100644 sim/rl/jax_core/separability.py
 create mode 100644 sim/rl/jax_core/simulation.py
 create mode 100644 sim/rl/jax_core/transitions.py

diff --git a/sim/rl/jax_core/features.py b/sim/rl/jax_core/features.py
new file mode 100644
index 0000000..d5af957
--- /dev/null
+++ b/sim/rl/jax_core/features.py
@@ -0,0 +1,69 @@
+"""Vectorized session feature extraction."""
+import numpy as np
+from .transitions import N_STATES, PURCHASE_IDX, CART_IDX
+from .simulation import SessionBatch
+
+try:
+    import jax.numpy as jnp
+    from jax import jit
+    JAX_AVAILABLE = True
+except ImportError:
+    jnp, JAX_AVAILABLE = np, False
+    def jit(f): return f
+
+@jit
+def extract_features(states, dwells, lengths):
+    """Extract per-session features. Returns (n_sess, 9) array: duration, avg_dwell, total (event count), velocity, views, carts, purchases, learn, conversion."""
+    n, max_len = states.shape
+    mask = jnp.arange(max_len)[None,:] < lengths[:,None]  # True for the first lengths[i] steps of each row
+    duration = jnp.sum(dwells * mask, axis=1)
+    total = lengths.astype(jnp.float32)
+    count = lambda idx: jnp.sum((states == idx) & mask, axis=1).astype(jnp.float32)  # occurrences of one state index within the valid steps
+    views, learn, carts, purchases = count(1), count(2), count(3), count(4)  # indices follow STATES in transitions.py (1=view ... 4=purchase)
+    velocity = total / (duration + 1e-6)  # events per unit of dwell time; epsilon avoids division by zero
+    conversion = purchases / (views + 1e-6)
+    avg_dwell = duration / (total + 1e-6)
+    return jnp.stack([duration, avg_dwell, total, velocity, views, carts, purchases, learn, conversion], axis=1)
+
+def session_features(batch: SessionBatch) -> np.ndarray:  # per-session feature matrix, via JAX when available
+    if JAX_AVAILABLE:
+        return np.asarray(extract_features(jnp.array(batch.states), jnp.array(batch.dwells), jnp.array(batch.lengths)))
+    # numpy fallback: same nine columns, in the same order, as extract_features
+    n, max_len = batch.states.shape
+    mask = np.arange(max_len)[None,:] < batch.lengths[:,None]  # True for the first lengths[i] steps of each row
+    duration = np.sum(batch.dwells * mask, axis=1)
+    total = batch.lengths.astype(np.float32)
+    count = lambda idx: np.sum((batch.states == idx) & mask, axis=1).astype(np.float32)
+    views, learn, carts, purchases = count(1), count(2), count(3), count(4)  # indices follow STATES in transitions.py
+    return np.stack([duration, duration/(total+1e-6), total, total/(duration+1e-6), views, carts, purchases, learn, purchases/(views+1e-6)], axis=1)
+
+@jit
+def session_transitions(states, lengths, n_states=N_STATES):
+    """Compute row-normalised empirical transition matrices per session. Returns (n_sess, n_states, n_states)."""
+    n, max_len = states.shape
+    mask = jnp.arange(max_len - 1)[None,:] < (lengths[:,None] - 1)
+    src, dst = states[:, :-1], states[:, 1:]
+    # handle -1 padding by clamping to valid range
+    src_c, dst_c = jnp.clip(src, 0, n_states-1), jnp.clip(dst, 0, n_states-1)
+    valid = mask & (src >= 0) & (dst >= 0)
+    def per_session(i):
+        s, d, v = src_c[i], dst_c[i], valid[i]
+        # mask each step's one-hot outer product BEFORE summing over steps, so clamped padding is never counted
+        return (jnp.eye(n_states)[s,:,None] * jnp.eye(n_states)[d,None,:] * v[:,None,None]).sum(0)
+    # plain Python loop over sessions (n is a static shape, so it is unrolled when traced)
+    trans = jnp.stack([per_session(i) for i in range(n)])
+    row_sums = trans.sum(axis=-1, keepdims=True)
+    return trans / (row_sums + 1e-10)
+
+def compute_session_transitions(batch: SessionBatch) -> np.ndarray:  # per-session row-normalised transition matrices
+    if JAX_AVAILABLE:
+        return np.asarray(session_transitions(jnp.array(batch.states), jnp.array(batch.lengths)))
+    # numpy fallback: count consecutive state pairs inside each session's valid length
+    n, max_len = batch.states.shape
+    trans = np.zeros((n, N_STATES, N_STATES), dtype=np.float32)
+    for i in range(n):
+        for t in range(batch.lengths[i] - 1):
+            s, d = batch.states[i, t], batch.states[i, t+1]
+            if s >= 0 and d >= 0: trans[i, s, d] += 1  # negative indices are padding and are skipped
+    row_sums = trans.sum(axis=-1, keepdims=True)
+    return trans / (row_sums + 1e-10)  # rows with no observations stay all-zero
diff --git a/sim/rl/jax_core/separability.py b/sim/rl/jax_core/separability.py
new file mode 100644
index 0000000..c0c0293
--- /dev/null
+++ b/sim/rl/jax_core/separability.py
@@ -0,0 +1,43 @@
+"""Vectorized KL divergence for separability scoring."""
+import numpy as np
+from typing import Tuple
+
+try:
+    import jax.numpy as jnp
+    from jax import jit
+    JAX_AVAILABLE = True
+except ImportError:
+    jnp, JAX_AVAILABLE = np, False
+    def jit(f): return f
+
+@jit
+def batch_kl(P, Q_human, Q_agent, eps=1e-10):
+    """Compute KL(P||Q) for batched P. P:(n,s,s), Q:(s,s). Returns (delta_h, delta_a) each (n,)."""
+    p = P + eps  # smooth zeros so the logarithm below stays finite
+    p = p / p.sum(axis=-1, keepdims=True)  # re-normalise each row; an all-zero row becomes uniform
+    qh, qa = Q_human[None] + eps, Q_agent[None] + eps  # references are smoothed but not re-normalised
+    delta_h = jnp.sum(p * jnp.log(p / qh), axis=(1, 2))  # summed over every row and column of the matrix
+    delta_a = jnp.sum(p * jnp.log(p / qa), axis=(1, 2))
+    return delta_h, delta_a
+
+def compute_divergences(session_trans: np.ndarray, ref_human: np.ndarray, ref_agent: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+    """Compute KL divergence of each session from human/agent prototypes. Returns (delta_h, delta_a)."""
+    if JAX_AVAILABLE:
+        dh, da = batch_kl(jnp.array(session_trans), jnp.array(ref_human), jnp.array(ref_agent))
+        return np.asarray(dh), np.asarray(da)
+    # numpy fallback: same arithmetic as batch_kl
+    eps = 1e-10
+    p = session_trans + eps  # smooth zeros so the logarithm below stays finite
+    p = p / p.sum(axis=-1, keepdims=True)  # re-normalise each row; an all-zero row becomes uniform
+    qh, qa = ref_human[None] + eps, ref_agent[None] + eps  # references are smoothed but not re-normalised
+    delta_h = np.sum(p * np.log(p / qh), axis=(1, 2))
+    delta_a = np.sum(p * np.log(p / qa), axis=(1, 2))
+    return delta_h, delta_a
+
+def estimate_alpha_batch(prob_agent: np.ndarray, delta_h: np.ndarray, delta_a: np.ndarray, temp: float = 1.0) -> np.ndarray:
+    """Blend classifier probs 50/50 with delta_a/(delta_h+delta_a); squash with a sigmoid of slope temp when temp > 0."""
+    mass = np.asarray(delta_h + delta_a)
+    ratio = np.divide(delta_a, mass, out=np.full_like(mass, 0.5, dtype=np.result_type(mass, np.float32)), where=mass > 1e-8)  # neutral 0.5 where mass ~ 0, with no divide warnings
+    blended = 0.5 * prob_agent + 0.5 * ratio
+    if temp <= 0: return np.clip(blended, 0.0, 1.0)
+    return np.clip(1.0 / (1.0 + np.exp(-temp * (blended - 0.5))), 0.0, 1.0)
diff --git a/sim/rl/jax_core/simulation.py b/sim/rl/jax_core/simulation.py
new file mode 100644
index 0000000..ee8ca6f
--- /dev/null
+++ b/sim/rl/jax_core/simulation.py
@@ -0,0 +1,92 @@
+"""Vectorized Markov chain session sampling with JAX."""
+from typing import NamedTuple, Tuple
+import numpy as np
+from functools import partial
+
+try:
+    import jax, jax.numpy as jnp
+    from jax import lax
+    JAX_AVAILABLE = True
+except ImportError:
+    JAX_AVAILABLE = False
+
+from .transitions import TransitionData, N_STATES, TERM_IDX, PURCHASE_IDX, CART_IDX
+
+class SessionBatch(NamedTuple):  # one batch of sampled sessions, stored as parallel arrays
+    states: np.ndarray      # (n_sess, max_len) state indices, -1=padding
+    dwells: np.ndarray      # (n_sess, max_len) dwell times
+    products: np.ndarray    # (n_sess,) product index per session
+    actors: np.ndarray      # (n_sess,) 0=human, 1=agent
+    lengths: np.ndarray     # (n_sess,) actual session length
+
+class SimResult(NamedTuple):  # aggregates produced by compute_metrics for one batch
+    demand_human: np.ndarray  # per-product count of purchasing human sessions
+    demand_agent: np.ndarray  # per-product count of purchasing agent sessions
+    revenue: float  # sum of the price of the product in every purchasing session
+    n_human_purchases: int
+    n_agent_purchases: int
+    sessions: SessionBatch  # the batch the metrics were computed from
+
+if JAX_AVAILABLE:
+    @partial(jax.jit, static_argnums=(5,6,7))  # n_human, n_agent and max_steps are static: they fix array shapes
+    def _sample_sessions_jax(key, T_human, T_agent, dwell_human, dwell_agent, n_human, n_agent, max_steps):
+        n = n_human + n_agent
+        k1, k2, k3, k4 = jax.random.split(key, 4)  # only k3 is used below; k1/k2 are rebound inside step
+        actors = jnp.concatenate([jnp.zeros(n_human, dtype=jnp.int32), jnp.ones(n_agent, dtype=jnp.int32)])
+        T = jnp.where(actors[:,None,None]==0, T_human[None], T_agent[None])  # (n,6,6)
+        dwell_p = jnp.where(actors[:,None,None]==0, dwell_human[None], dwell_agent[None])  # (n,6,2)
+
+        def step(carry, _):
+            s, active, k = carry
+            k, k1, k2 = jax.random.split(k, 3)
+            probs = T[jnp.arange(n), s]  # (n,6)
+            nxt = jax.random.categorical(k1, jnp.log(probs + 1e-10))
+            nxt = jnp.where(active, nxt, -1)  # sessions that already ended emit -1 padding
+            shape = dwell_p[jnp.arange(n), s, 0]  # dwell parameters come from the state being left (s), not nxt
+            scale = dwell_p[jnp.arange(n), s, 1]
+            dwell = jnp.maximum(0.3, jax.random.gamma(k2, shape) * scale)  # dwell is floored at 0.3
+            still = active & (nxt != TERM_IDX) & (nxt >= 0)  # a session stays active until it samples TERM_IDX
+            return (nxt, still, k), (nxt, dwell)
+
+        init = (jnp.zeros(n, dtype=jnp.int32), jnp.ones(n, dtype=jnp.bool_), k3)  # every session starts active in state 0
+        _, (states, dwells) = lax.scan(step, init, None, length=max_steps)  # NOTE(review): state 0 itself is not emitted, unlike the numpy fallback in sample_sessions — confirm intended
+        states, dwells = states.T, dwells.T  # (n, max_steps)
+        is_term = (states == -1) | (states == TERM_IDX)
+        lengths = jnp.argmax(is_term, axis=1) + 1  # index of the first terminal/padding step, plus one
+        lengths = jnp.where(jnp.any(is_term, axis=1), lengths, max_steps)  # sessions that never terminate use the full horizon
+        return states, dwells, actors, lengths
+
+def sample_sessions(key, trans: TransitionData, n_human: int, n_agent: int, n_products: int, max_steps: int = 40) -> SessionBatch:  # sample n_human + n_agent Markov sessions
+    if JAX_AVAILABLE:
+        k1, k2 = jax.random.split(key)  # independent keys for the chains and for product assignment
+        states, dwells, actors, lengths = _sample_sessions_jax(k1, trans.human_T, trans.agent_T, trans.human_dwell, trans.agent_dwell, n_human, n_agent, max_steps)
+        products = jax.random.randint(k2, (n_human + n_agent,), 0, n_products)
+        return SessionBatch(np.asarray(states), np.asarray(dwells), np.asarray(products), np.asarray(actors), np.asarray(lengths))
+    seed = int(key[0]) if hasattr(key, '__getitem__') else (int(key) if isinstance(key, (int, np.integer)) and key >= 0 else 42)  # numpy fallback: honour plain integer seeds instead of discarding them
+    rng = np.random.default_rng(seed)
+    n = n_human + n_agent
+    actors = np.concatenate([np.zeros(n_human, dtype=np.int32), np.ones(n_agent, dtype=np.int32)])
+    products = rng.integers(0, n_products, size=n)
+    states, dwells = np.full((n, max_steps), -1, dtype=np.int32), np.zeros((n, max_steps), dtype=np.float32)
+    lengths = np.zeros(n, dtype=np.int32)
+    for i in range(n):
+        T = trans.human_T if actors[i] == 0 else trans.agent_T
+        dp = trans.human_dwell if actors[i] == 0 else trans.agent_dwell
+        s, t = 0, 0  # every session starts in state 0
+        while t < max_steps and s != TERM_IDX:
+            states[i, t] = s  # the current state is recorded before the next one is drawn
+            dwells[i, t] = max(0.3, rng.gamma(dp[s, 0], dp[s, 1]))  # dwell is floored at 0.3
+            s = rng.choice(N_STATES, p=T[s])
+            t += 1
+        lengths[i] = t  # the terminal state is not recorded and not counted
+    return SessionBatch(states, dwells, products, actors, lengths)
+
+def compute_metrics(batch: SessionBatch, prices: np.ndarray, unit_cost: np.ndarray) -> SimResult:  # unit_cost is accepted but not used here
+    purchased = np.any(batch.states == PURCHASE_IDX, axis=1)  # a session purchases if it ever visits the purchase state
+    human_mask, agent_mask = batch.actors == 0, batch.actors == 1
+    human_purch = purchased & human_mask
+    agent_purch = purchased & agent_mask
+    demand_h = np.bincount(batch.products[human_purch], minlength=len(prices)).astype(np.float32)  # per-product purchase counts
+    demand_a = np.bincount(batch.products[agent_purch], minlength=len(prices)).astype(np.float32)
+    revenue = float(np.sum(prices[batch.products[purchased]]))  # one unit at the current price per purchasing session
+    return SimResult(demand_h, demand_a, revenue, int(human_purch.sum()), int(agent_purch.sum()), batch)
diff --git a/sim/rl/jax_core/transitions.py b/sim/rl/jax_core/transitions.py
new file mode 100644
index 0000000..6aec650
--- /dev/null
+++ b/sim/rl/jax_core/transitions.py
@@ -0,0 +1,47 @@
+"""Dense transition matrices for JAX Markov chain sampling."""
+from dataclasses import dataclass
+import numpy as np
+
+try:
+    import jax.numpy as jnp
+    JAX_AVAILABLE = True
+except ImportError:
+    jnp, JAX_AVAILABLE = np, False
+
+STATES = ["session_start", "view_item_page", "learn_more_about_item", "add_item_to_cart", "purchase_complete", "session_end"]
+S2I = {s: i for i, s in enumerate(STATES)}
+N_STATES, TERM_IDX, PURCHASE_IDX, CART_IDX = len(STATES), 5, 4, 3
+
+@dataclass
+class TransitionData:  # dense transition and dwell parameters for the human and agent profiles
+    human_T: np.ndarray   # (6,6) transition probs
+    agent_T: np.ndarray   # (6,6)
+    human_dwell: np.ndarray  # (6,2) shape,scale
+    agent_dwell: np.ndarray  # (6,2)
+
+    def to_jax(self):  # returns self unchanged when JAX is unavailable, else a copy holding jnp arrays
+        if not JAX_AVAILABLE: return self
+        return TransitionData(*[jnp.array(x) for x in [self.human_T, self.agent_T, self.human_dwell, self.agent_dwell]])
+
+def dict_to_dense(d):  # {src: {dst: prob}} -> row-normalised (N_STATES, N_STATES) float32 matrix
+    m = np.zeros((N_STATES, N_STATES), dtype=np.float32)
+    for src, dsts in d.items():
+        if (i := S2I.get(src)) is not None:  # state names missing from STATES are dropped
+            for dst, p in dsts.items():
+                if (j := S2I.get(dst)) is not None: m[i,j] = p
+    m /= np.maximum(m.sum(1, keepdims=True), 1e-8)  # the floor avoids dividing an empty row by zero
+    m[TERM_IDX] = 0; m[m.sum(1) <= 0, TERM_IDX] = 1.0  # the terminal row and any row with no known exits absorb into session_end
+    return m
+
+def compile_transitions(human_profile, agent_profile):  # reads .transitions and .dwell_params from each profile
+    def dwell_arr(params): return np.array([[params.get(s, (2.0, 1.0)) for s in STATES]], dtype=np.float32).reshape(N_STATES, 2)  # (shape, scale) per state; (2.0, 1.0) when a state is missing
+    return TransitionData(dict_to_dense(human_profile.transitions), dict_to_dense(agent_profile.transitions),
+                          dwell_arr(human_profile.dwell_params), dwell_arr(agent_profile.dwell_params))
+
+def fallback_transitions():  # hard-coded human (H) and agent (A) transition tables, used in place of profile-derived ones
+    H = {"session_start": {"view_item_page": .85, "session_end": .15}, "view_item_page": {"learn_more_about_item": .4, "add_item_to_cart": .3, "view_item_page": .2, "session_end": .1},
+         "learn_more_about_item": {"add_item_to_cart": .5, "view_item_page": .3, "session_end": .2}, "add_item_to_cart": {"purchase_complete": .6, "view_item_page": .25, "session_end": .15}, "purchase_complete": {"session_end": 1.0}}
+    A = {"session_start": {"view_item_page": .9, "session_end": .1}, "view_item_page": {"learn_more_about_item": .5, "add_item_to_cart": .25, "view_item_page": .15, "session_end": .1},
+         "learn_more_about_item": {"add_item_to_cart": .4, "view_item_page": .4, "session_end": .2}, "add_item_to_cart": {"purchase_complete": .5, "view_item_page": .3, "session_end": .2}, "purchase_complete": {"session_end": 1.0}}
+    dwell = np.full((N_STATES, 2), [2.0, 1.0], dtype=np.float32)  # the same gamma (shape=2.0, scale=1.0) for every state
+    return TransitionData(dict_to_dense(H), dict_to_dense(A), dwell.copy(), dwell.copy())  # separate copies so the two profiles never share a buffer

From 40e0b201e61fa0604fc8946450305dcba7f83e5d Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Thu, 22 Jan 2026 13:10:15 +0100
Subject: [PATCH 43/99] chore: init code for jax core

---
 sim/rl/jax_core/__init__.py | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 sim/rl/jax_core/__init__.py

diff --git a/sim/rl/jax_core/__init__.py b/sim/rl/jax_core/__init__.py
new file mode 100644
index 0000000..99d5a87
--- /dev/null
+++ b/sim/rl/jax_core/__init__.py
@@ -0,0 +1,11 @@
+"""JAX-accelerated simulation core for PHANTOM environment."""
+from .transitions import TransitionData, compile_transitions, fallback_transitions, JAX_AVAILABLE
+from .simulation import SessionBatch, SimResult, sample_sessions, compute_metrics
+from .features import session_features, compute_session_transitions
+from .separability import compute_divergences, estimate_alpha_batch
+
+__all__ = [
+    "JAX_AVAILABLE", "TransitionData", "compile_transitions", "fallback_transitions",
+    "SessionBatch", "SimResult", "sample_sessions", "compute_metrics",
+    "session_features", "compute_session_transitions", "compute_divergences", "estimate_alpha_batch",
+]

From a033e776973c435c3ebce672403c2d3a33b8ad9f Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Thu, 22 Jan 2026 21:02:10 +0100
Subject: [PATCH 44/99] introducing jax for computation

---
 sim/rl/environment.py         | 54 ++++++++++++++++++++++++++++++-----
 sim/rl/jax_core/simulation.py | 34 ++++++++++++++++++----
 2 files changed, 76 insertions(+), 12 deletions(-)

diff --git a/sim/rl/environment.py b/sim/rl/environment.py
index f1a7f53..597359f 100644
--- a/sim/rl/environment.py
+++ b/sim/rl/environment.py
@@ -9,6 +9,13 @@ from typing import Optional, Dict, Any, List, Tuple
 from lib.separability import load_artifacts, score_session, estimate_alpha
 from sim.rl.behavior_loader.models import AgentBehaviorModel, BehaviorModel, aggregate_event_transitions
 
+try:
+    import jax
+    from sim.rl.jax_core import JAX_AVAILABLE, compile_transitions, fallback_transitions, sample_sessions, compute_metrics
+    from sim.rl.jax_core import session_features, compute_session_transitions, compute_divergences, estimate_alpha_batch
+except ImportError:
+    JAX_AVAILABLE = False
+
 # "learner" agent learning to optimize pricing
 # "agent" part of environment creating demand signals that learner processes
 
@@ -20,9 +27,9 @@ class BusinessLogicConstraints():
     system_max_price: float = 500.0
     system_min_price: float = 1.0
     product_catalogue_size: int = 100
-    episode_length: int = 200
+    episode_length: int = 2000
     sessions_per_step: int = 250
-    agent_share: float = 0.5
+    agent_share: float = 0.2
     agent_recon_multiplier: float = 6.0
     agent_purchase_probability: float = 0.20
     coi_strength: float = 0.25
@@ -423,9 +430,10 @@ class CommercePlatform:
 class PHANTOMEnv(gym.Env):
     metadata = {"render_modes": []}
 
-    def __init__(self, constraints: Optional[BusinessLogicConstraints] = None):
+    def __init__(self, constraints: Optional[BusinessLogicConstraints] = None, use_jax: bool = True):
         super().__init__()
         self.constraints = constraints if isinstance(constraints, BusinessLogicConstraints) else BusinessLogicConstraints()
+        self.use_jax = use_jax and JAX_AVAILABLE
         self.action_space = spaces.Box(low=-self.constraints.max_price_adjustment,
                                        high=self.constraints.max_price_adjustment,
                                        shape=(self.constraints.product_catalogue_size,), dtype=np.float32)
@@ -442,8 +450,8 @@ class PHANTOMEnv(gym.Env):
                     dtype=np.float32),
             }),
             "market": spaces.Dict({
-                "alpha_hat": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),  # estimated agent share
-                "revenue_rate": spaces.Box(low=0.0, high=1e6, shape=(1,), dtype=np.float32),  # recent revenue
+                "alpha_hat": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
+                "revenue_rate": spaces.Box(low=0.0, high=1e6, shape=(1,), dtype=np.float32),
                 "conversion_rate": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
                 "price_volatility": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
             }),
@@ -458,12 +466,27 @@ class PHANTOMEnv(gym.Env):
         self.t = 0
         self._prev_prices: Optional[np.ndarray] = None
         self.state: Dict[str, Any] = {}
+        self._jax_key = None
+        self._jax_trans = None
+        if self.use_jax:
+            self._jax_key = jax.random.PRNGKey(self.constraints.seed)
+            self._init_jax_transitions()
+
+    def _init_jax_transitions(self):
+        try:
+            human_profile = _load_behavioral_profile("humans", np.ones(self.constraints.product_catalogue_size) * 0.1)
+            agent_profile = _load_behavioral_profile("agents", np.ones(self.constraints.product_catalogue_size) * 0.1)
+            self._jax_trans = compile_transitions(human_profile, agent_profile).to_jax()
+        except Exception:
+            self._jax_trans = fallback_transitions().to_jax()
 
     def reset(self, seed: Optional[int] = None, options: Optional[dict] = None):
         super().reset(seed=seed)
         if seed is not None:
             self._rng = np.random.default_rng(seed)
             self.commerce_platform._rng = np.random.default_rng(seed)
+            if self.use_jax:
+                self._jax_key = jax.random.PRNGKey(seed)
         self.commerce_platform.alpha_hat = self.constraints.agent_share
         self.t = 0
         init_prices = self._rng.uniform(
@@ -493,6 +516,20 @@ class PHANTOMEnv(gym.Env):
         }
         return self.state, {}
 
+    def _step_jax(self, new_prices: np.ndarray) -> Tuple[Dict, Dict]:
+        self._jax_key, subkey = jax.random.split(self._jax_key)
+        alpha = float(np.clip(self.commerce_platform.alpha_hat, 0.0, 0.95))
+        n_agent = max(1, int(self.constraints.sessions_per_step * alpha))
+        n_human = max(1, self.constraints.sessions_per_step - n_agent)
+        batch = sample_sessions(subkey, self._jax_trans, n_human, n_agent, len(new_prices))
+        sim = compute_metrics(batch, new_prices, self.commerce_platform.unit_cost, self.commerce_platform.base_price)
+        result = {"revenue_observed": sim.revenue, "revenue_oracle": sim.revenue_oracle,
+                  "agent_loss": sim.agent_loss, "coi": sim.coi, "look_to_book": sim.look_to_book,
+                  "mean_sale_price": sim.mean_sale_price, "true_human_purchases": sim.n_human_purchases,
+                  "true_agent_purchases": sim.n_agent_purchases}
+        diagnostics = {"demand_human": sim.demand_human, "demand_agent": sim.demand_agent, "alpha_hat": alpha}
+        return result, diagnostics
+
     def step(self, action: np.ndarray):
         self.t += 1
         base_prices = self.state["elasticity"]["price"].astype(np.float32)
@@ -501,8 +538,11 @@ class PHANTOMEnv(gym.Env):
                            self.constraints.system_max_price).astype(np.float32)
 
         self.state["elasticity"]["price"] = new_prices
-        interactions_df, diagnostics = self.commerce_platform._simulate_sessions(new_prices)
-        result = self.commerce_platform.compute_interaction_features(interactions_df)
+        if self.use_jax:
+            result, diagnostics = self._step_jax(new_prices)
+        else:
+            interactions_df, diagnostics = self.commerce_platform._simulate_sessions(new_prices)
+            result = self.commerce_platform.compute_interaction_features(interactions_df)
         COI = float(result.get("coi", 0.0))
 
         demand_vector = diagnostics.get("demand_human", np.zeros_like(new_prices)) + diagnostics.get(
diff --git a/sim/rl/jax_core/simulation.py b/sim/rl/jax_core/simulation.py
index ee8ca6f..9532b3d 100644
--- a/sim/rl/jax_core/simulation.py
+++ b/sim/rl/jax_core/simulation.py
@@ -23,6 +23,11 @@ class SimResult(NamedTuple):
     demand_human: np.ndarray
     demand_agent: np.ndarray
     revenue: float
+    revenue_oracle: float
+    agent_loss: float
+    coi: float
+    look_to_book: float
+    mean_sale_price: float
     n_human_purchases: int
     n_agent_purchases: int
     sessions: SessionBatch
@@ -81,12 +86,31 @@ def sample_sessions(key, trans: TransitionData, n_human: int, n_agent: int, n_pr
         lengths[i] = t
     return SessionBatch(states, dwells, products, actors, lengths)
 
-def compute_metrics(batch: SessionBatch, prices: np.ndarray, unit_cost: np.ndarray) -> SimResult:
+def compute_metrics(batch: SessionBatch, prices: np.ndarray, unit_cost: np.ndarray, base_price: np.ndarray) -> SimResult:
     purchased = np.any(batch.states == PURCHASE_IDX, axis=1)
     human_mask, agent_mask = batch.actors == 0, batch.actors == 1
-    human_purch = purchased & human_mask
-    agent_purch = purchased & agent_mask
+    human_purch, agent_purch = purchased & human_mask, purchased & agent_mask
     demand_h = np.bincount(batch.products[human_purch], minlength=len(prices)).astype(np.float32)
     demand_a = np.bincount(batch.products[agent_purch], minlength=len(prices)).astype(np.float32)
-    revenue = float(np.sum(prices[batch.products[purchased]]))
-    return SimResult(demand_h, demand_a, revenue, int(human_purch.sum()), int(agent_purch.sum()), batch)
+    # revenue and oracle
+    purch_products = batch.products[purchased]
+    revenue = float(np.sum(prices[purch_products]))
+    revenue_oracle = float(np.sum(base_price[purch_products]))
+    # agent loss: base_price - price_paid for agent purchases (agents gaming the system)
+    agent_products = batch.products[agent_purch]
+    agent_loss = float(np.sum(base_price[agent_products] - prices[agent_products]))
+    # COI: margin - expected_premium*0.5 for human purchases
+    human_products = batch.products[human_purch]
+    if len(human_products) > 0:
+        margin = float(np.mean(prices[human_products] - unit_cost[human_products]))
+        premium = float(np.mean(base_price[human_products] - prices[human_products]))
+        coi = max(0.0, margin - premium * 0.5)
+    else:
+        coi = 0.0
+    # look to book: views / purchases
+    views = float(np.sum(batch.states == 1))  # view_item_page = index 1
+    n_purch = int(purchased.sum())
+    look_to_book = views / (n_purch + 1e-6)
+    mean_sale = float(np.mean(prices[purch_products])) if n_purch > 0 else 0.0
+    return SimResult(demand_h, demand_a, revenue, revenue_oracle, agent_loss, coi, look_to_book, mean_sale,
+                     int(human_purch.sum()), int(agent_purch.sum()), batch)

From 4e2e41d943ee8bc49119260738596ba440d784ae Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Fri, 23 Jan 2026 10:37:32 +0100
Subject: [PATCH 45/99] shock: defining new lab environment and formulation

---
 lab/README.md                         |   1 +
 lab/__init__.py                       |  27 +++
 lab/case/__init__.py                  |   6 +
 lab/case/thesis/__init__.py           |  25 ++
 lab/case/thesis/arrivals.py           | 327 ++++++++++++++++++++++++++
 lab/case/thesis/execution.py          |  91 +++++++
 lab/case/thesis/metrics.py            | 102 ++++++++
 lab/case/thesis/objectives.py         | 228 ++++++++++++++++++
 lab/case/thesis/platform.py           | 176 ++++++++++++++
 lab/case/thesis/run_experiment.py     | 136 +++++++++++
 lab/config.py                         | 156 ++++++++++++
 lab/docs/Makefile                     |  12 +
 lab/docs/conf.py                      |  39 +++
 lab/docs/index.rst                    |  39 +++
 lab/docs/modules/experiments.rst      |  14 ++
 lab/docs/modules/outlet.rst           |  77 ++++++
 lab/docs/modules/population.rst       |  20 ++
 lab/experiments/__init__.py           |   7 +
 lab/experiments/eval.py               | 213 +++++++++++++++++
 lab/outlet/__init__.py                |  17 ++
 lab/outlet/constants.py               |  83 +++++++
 lab/outlet/gym_wrapper.py             |  86 +++++++
 lab/outlet/math_util.py               |  57 +++++
 lab/outlet/mechanisms/__init__.py     |   5 +
 lab/outlet/mechanisms/auction.py      |  73 ++++++
 lab/outlet/mechanisms/posted_price.py |  84 +++++++
 lab/outlet/mechanisms/two_sided.py    |  89 +++++++
 lab/outlet/objectives/__init__.py     |  11 +
 lab/outlet/objectives/base.py         |  48 ++++
 lab/outlet/objectives/factory.py      |  82 +++++++
 lab/outlet/objectives/penalties.py    | 101 ++++++++
 lab/outlet/observation.py             |  92 ++++++++
 lab/outlet/platform.py                | 285 ++++++++++++++++++++++
 lab/outlet/protocols.py               | 297 +++++++++++++++++++++++
 lab/outlet/stock.py                   | 151 ++++++++++++
 lab/outlet/types.py                   | 318 +++++++++++++++++++++++++
 lab/population/__init__.py            |  10 +
 lab/population/arrivals.py            | 168 +++++++++++++
 lab/population/competitors.py         | 189 +++++++++++++++
 lab/population/execution.py           | 174 ++++++++++++++
 lab/run_example.py                    |  59 +++++
 41 files changed, 4175 insertions(+)
 create mode 100644 lab/README.md
 create mode 100644 lab/__init__.py
 create mode 100644 lab/case/__init__.py
 create mode 100644 lab/case/thesis/__init__.py
 create mode 100644 lab/case/thesis/arrivals.py
 create mode 100644 lab/case/thesis/execution.py
 create mode 100644 lab/case/thesis/metrics.py
 create mode 100644 lab/case/thesis/objectives.py
 create mode 100644 lab/case/thesis/platform.py
 create mode 100644 lab/case/thesis/run_experiment.py
 create mode 100644 lab/config.py
 create mode 100644 lab/docs/Makefile
 create mode 100644 lab/docs/conf.py
 create mode 100644 lab/docs/index.rst
 create mode 100644 lab/docs/modules/experiments.rst
 create mode 100644 lab/docs/modules/outlet.rst
 create mode 100644 lab/docs/modules/population.rst
 create mode 100644 lab/experiments/__init__.py
 create mode 100644 lab/experiments/eval.py
 create mode 100644 lab/outlet/__init__.py
 create mode 100644 lab/outlet/constants.py
 create mode 100644 lab/outlet/gym_wrapper.py
 create mode 100644 lab/outlet/math_util.py
 create mode 100644 lab/outlet/mechanisms/__init__.py
 create mode 100644 lab/outlet/mechanisms/auction.py
 create mode 100644 lab/outlet/mechanisms/posted_price.py
 create mode 100644 lab/outlet/mechanisms/two_sided.py
 create mode 100644 lab/outlet/objectives/__init__.py
 create mode 100644 lab/outlet/objectives/base.py
 create mode 100644 lab/outlet/objectives/factory.py
 create mode 100644 lab/outlet/objectives/penalties.py
 create mode 100644 lab/outlet/observation.py
 create mode 100644 lab/outlet/platform.py
 create mode 100644 lab/outlet/protocols.py
 create mode 100644 lab/outlet/stock.py
 create mode 100644 lab/outlet/types.py
 create mode 100644 lab/population/__init__.py
 create mode 100644 lab/population/arrivals.py
 create mode 100644 lab/population/competitors.py
 create mode 100644 lab/population/execution.py
 create mode 100644 lab/run_example.py

diff --git a/lab/README.md b/lab/README.md
new file mode 100644
index 0000000..c4db76a
--- /dev/null
+++ b/lab/README.md
@@ -0,0 +1 @@
+# MOS (Money Operating System)
diff --git a/lab/__init__.py b/lab/__init__.py
new file mode 100644
index 0000000..cc6df0c
--- /dev/null
+++ b/lab/__init__.py
@@ -0,0 +1,27 @@
+"""
+Quote-Control Simulator: Research-grade platform for dynamic pricing and market making
+
+The platform abstracts pricing as: Quote -> Arrival -> Execution -> Position
+Supports multiple mechanisms:
+  - PostedPrice: retail dynamic pricing
+  - TwoSided: market making with bid-ask spreads
+  - Auction: reserve/shading for auction settings
+
+Example usage:
+    from lab.config import make_retail_platform
+    from lab.experiments import rollout, fixed_price_policy
+
+    platform = make_retail_platform()
+    policy = fixed_price_policy(platform.instruments.refs)
+    result = rollout(platform, policy, n_steps=100)
+    print(f"Total PnL: {result.total_pnl:.2f}")
+"""
+
+from .config import make_retail_platform, make_market_making_platform, RetailConfig, MarketMakingConfig
+from .outlet import Platform, PlatformConfig, Quote, Observation, StepResult
+
+__all__ = [
+    'make_retail_platform', 'make_market_making_platform',
+    'RetailConfig', 'MarketMakingConfig',
+    'Platform', 'PlatformConfig', 'Quote', 'Observation', 'StepResult',
+]
diff --git a/lab/case/__init__.py b/lab/case/__init__.py
new file mode 100644
index 0000000..44fbf8c
--- /dev/null
+++ b/lab/case/__init__.py
@@ -0,0 +1,6 @@
+"""
+Case studies implementing specific research scenarios.
+
+Available cases:
+- thesis: PHANTOM thesis implementation with contaminated demand and DR-RL
+"""
diff --git a/lab/case/thesis/__init__.py b/lab/case/thesis/__init__.py
new file mode 100644
index 0000000..31db465
--- /dev/null
+++ b/lab/case/thesis/__init__.py
@@ -0,0 +1,25 @@
+"""
+Thesis-specific implementation of the PHANTOM pricing defense framework.
+
+This module implements the mathematical models from the thesis:
+- ContaminatedArrivalModel: Mixture demand Q(p) = (1-α)d_H + αd_A (Eq 3)
+- HybridExecutionModel: Divergent H/A behavior with separability (Section 2.1)
+- RobustStackelbergObjective: Maximin objective with COI penalty (Eq 23)
+- COIMetrics: Cost of Information tracking (Definition 1)
+
+The platform configuration creates a research environment that directly
+maps to the thesis mathematical framework for DR-RL experiments.
+"""
+from .arrivals import ContaminatedArrivalModel, ContaminatedArrivalConfig
+from .execution import HybridExecutionModel, HybridExecutionConfig
+from .objectives import RobustStackelbergObjective, COIObjective
+from .platform import make_thesis_platform, ThesisConfig
+from .metrics import COIMetrics, compute_coi, compute_separability
+
+__all__ = [
+    'ContaminatedArrivalModel', 'ContaminatedArrivalConfig',
+    'HybridExecutionModel', 'HybridExecutionConfig',
+    'RobustStackelbergObjective', 'COIObjective',
+    'make_thesis_platform', 'ThesisConfig',
+    'COIMetrics', 'compute_coi', 'compute_separability',
+]
diff --git a/lab/case/thesis/arrivals.py b/lab/case/thesis/arrivals.py
new file mode 100644
index 0000000..909cab5
--- /dev/null
+++ b/lab/case/thesis/arrivals.py
@@ -0,0 +1,327 @@
+"""Contaminated arrivals using learned MDP kernels from behavior_loader.
+
+Implements thesis demand model (Section 3.1):
+- Aggregate demand Q(p) = (1-α)E[d(p;θ_H)] + αE[d(p;θ_A)] + ε_t  (Eq 3)
+- Demand proxy q̂_{t,i} = Σ_s Σ_k ω(a_{s,k}) · 1[i_{s,k} = i]     (Eq 2)
+- Per-session separability via KL divergence Δ_H, Δ_A              (Eq 20-21)
+
+The arrival model samples sessions from a mixture of human/agent behavioral profiles,
+each session produces a trajectory τ_s and associated demand computation q(τ').
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from types import SimpleNamespace
+from typing import Dict, List, Tuple, Optional
+import numpy as np
+from ...outlet.types import Opportunity, InstrumentSet, MarketState, HiddenState
+from ...outlet.constants import Side, OpportunityType
+from ...outlet.math_util import poisson_arrivals
+
+try:
+    import sys
+    from pathlib import Path
+    sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
+    from sim.rl.behavior_loader.models import (
+        BehaviorModel, AgentBehaviorModel, aggregate_event_transitions, kl_divergence
+    )
+    REAL_MDP = True
+except ImportError:
+    REAL_MDP = False
+    kl_divergence = None
+
+EVENT_PAGE = {"session_start": "/", "view_item_page": "/products", "learn_more_about_item": "/products/details",
+              "add_item_to_cart": "/cart", "purchase_complete": "/checkout", "session_end": "/checkout/success"}
+EVENT_CANON = {"page_view": "session_start", "hover_over_paragraph": "view_item_page", "hover_over_title": "view_item_page",
+               "view_item_page": "view_item_page", "learn_more_about_item": "learn_more_about_item",
+               "add_item_to_cart": "add_item_to_cart", "checkout_start": "purchase_complete", "remove_item": "view_item_page"}
+
+# action space partition A = A_nav ∪ A_cart ∪ A_filter ∪ A_dwell with signal weights ω (Table 1)
+ACTION_WEIGHTS: Dict[str, float] = {
+    "add_item_to_cart": 0.8, "remove_item": 0.6, "checkout_start": 0.9, "purchase_complete": 1.0,  # A_cart
+    "hover_over_title": 0.3, "hover_over_paragraph": 0.35, "hover_over_link": 0.25,               # A_dwell
+    "page_view": 0.1, "session_start": 0.05, "view_item_page": 0.15, "learn_more_about_item": 0.2, # A_nav
+    "search": 0.05, "filter_date": 0.05, "filter_price": 0.08, "sort": 0.03, "session_end": 0.0,   # A_filter
+}
+
+
+@dataclass
+class SessionDemand:
+    """Per-session demand computation per thesis formulation (Section 3.1).
+
+    Each session s ∈ S produces trajectory τ_s and demand proxy q̂. The platform uses
+    divergence signals Δ_H, Δ_A to estimate per-session contamination α̂(τ').
+    """
+    session_id: str
+    q: Dict[int, float]               # q̂_i demand proxy per product (Eq 2)
+    trajectory: List[Dict]            # τ_s = (e_{s,1}, ..., e_{s,L_s})
+    delta_h: float = 0.0              # D_KL(T̂' || T̄_H) (Eq 20)
+    delta_a: float = 0.0              # D_KL(T̂' || T̄_A) (Eq 21)
+    alpha_hat: float = 0.0            # per-session contamination estimate
+    actor_class: str = "H"            # ground truth Y_s ∈ {H, A}
+    theta: Dict[str, float] = field(default_factory=dict)
+
+
+def compute_demand_proxy(events: List[Dict], n_products: int) -> Dict[int, float]:
+    """Compute q̂_{t,i} = Σ_k ω(a_{s,k}) · 1[i_{s,k} = i] per Eq 2."""
+    q = {i: 0.0 for i in range(n_products)}
+    for e in events:
+        action, pidx = e.get("eventName", ""), e.get("product_idx")
+        if pidx is not None and 0 <= pidx < n_products:
+            q[pidx] += ACTION_WEIGHTS.get(action, 0.1)
+    return q
+
+
+def compute_session_divergence(events: List[Dict], ref_h: Dict, ref_a: Dict) -> Tuple[float, float]:
+    """Compute Δ_H, Δ_A divergence signals from trajectory (Eq 20-21)."""
+    if not events or kl_divergence is None:
+        return 0.0, 0.0
+    # build empirical transition kernel from trajectory
+    trans: Dict[str, Dict[str, int]] = {}
+    prev = "session_start"
+    for e in events:
+        curr = e.get("eventName", "session_end")
+        trans.setdefault(prev, {})
+        trans[prev][curr] = trans[prev].get(curr, 0) + 1
+        prev = curr
+    # normalize to probabilities
+    kernel = {}
+    for s, dests in trans.items():
+        total = sum(dests.values())
+        kernel[s] = {d: c / total for d, c in dests.items()} if total > 0 else {}
+    # aggregate to event-level and compute KL divergence against reference kernels
+    delta_h = sum(kl_divergence(kernel.get(s, {}), ref_h.get(s, {})) for s in kernel) / max(len(kernel), 1)
+    delta_a = sum(kl_divergence(kernel.get(s, {}), ref_a.get(s, {})) for s in kernel) / max(len(kernel), 1)
+    return delta_h, delta_a
+
+def _canonicalize(raw: Dict) -> Dict:
+    out = {}
+    for src, dsts in raw.items():
+        sc = EVENT_CANON.get(src, src)
+        out.setdefault(sc, {})
+        for dst, p in dsts.items():
+            dc = EVENT_CANON.get(dst, dst)
+            out[sc][dc] = out[sc].get(dc, 0.0) + p
+    return {s: {k: v/sum(d.values()) for k, v in d.items()} for s, d in out.items() if sum(d.values()) > 0}
+
+
+class BehavioralProfile:
+    """Markov profile from learned MDP kernels (Section 3.5.2).
+
+    Transition kernel T̂_Y estimated via MLE: P̂(s'|s) = N(s,s') / Σ_k N(s,k) (Eq 19)
+    """
+    STATES = ["session_start", "view_item_page", "learn_more_about_item", "add_item_to_cart", "purchase_complete", "session_end"]
+    # fallback kernels T̄_H, T̄_A when real data unavailable
+    FALLBACK_H = {"session_start": {"view_item_page": 0.85, "session_end": 0.15},
+                  "view_item_page": {"learn_more_about_item": 0.4, "add_item_to_cart": 0.3, "view_item_page": 0.2, "session_end": 0.1},
+                  "learn_more_about_item": {"add_item_to_cart": 0.5, "view_item_page": 0.3, "session_end": 0.2},
+                  "add_item_to_cart": {"purchase_complete": 0.6, "view_item_page": 0.25, "session_end": 0.15},
+                  "purchase_complete": {"session_end": 1.0}}
+    FALLBACK_A = {"session_start": {"view_item_page": 0.95, "session_end": 0.05},
+                  "view_item_page": {"learn_more_about_item": 0.6, "view_item_page": 0.25, "add_item_to_cart": 0.1, "session_end": 0.05},
+                  "learn_more_about_item": {"view_item_page": 0.5, "add_item_to_cart": 0.15, "learn_more_about_item": 0.3, "session_end": 0.05},
+                  "add_item_to_cart": {"view_item_page": 0.4, "purchase_complete": 0.2, "session_end": 0.4},
+                  "purchase_complete": {"session_end": 1.0}}
+
+    def __init__(self, actor: str, pprobs: np.ndarray, data_dir: str = ""):
+        self.actor, self.pprobs = actor, np.clip(pprobs, 0.0, 0.95)
+        self.trans = self._load(data_dir)  # T̂_Y transition kernel
+        self._ensure_terminal()
+        self.dwell = {s: (1.2, 0.5) if actor == "agents" else (2.0, 1.2) for s in self.STATES}
+
+    def _load(self, data_dir: str) -> Dict:
+        if not REAL_MDP or not data_dir:
+            print("using fallback")
+            return dict(self.FALLBACK_A if self.actor == "agents" else self.FALLBACK_H)
+        try:
+            mdp = (AgentBehaviorModel if self.actor == "agents" else BehaviorModel)(data_dir).build_MDP()
+            raw = aggregate_event_transitions(mdp) if mdp.get("transitions") else {}
+            return _canonicalize(raw) if raw else dict(self.FALLBACK_A if self.actor == "agents" else self.FALLBACK_H)
+        except Exception:
+            print("using fallback")
+            return dict(self.FALLBACK_A if self.actor == "agents" else self.FALLBACK_H)
+
+    def _ensure_terminal(self):
+        self.trans.setdefault("purchase_complete", {})["session_end"] = self.trans.get("purchase_complete", {}).get("session_end", 1.0)
+        self.trans.setdefault("session_start", {"view_item_page": 0.7, "learn_more_about_item": 0.2, "session_end": 0.1})
+
+    def _tprobs(self, state: str, pidx: int) -> Dict[str, float]:
+        probs = dict(self.trans.get(state, {"session_end": 1.0}))
+        if state == "add_item_to_cart":
+            base = probs.get("purchase_complete", 0.0)
+            df = float(self.pprobs[pidx]) * (0.3 if self.actor == "agents" else 1.0)
+            adj = np.clip(base * 0.5 + df * 0.5, 0.0, 0.95)
+            rem = max(1e-6, 1.0 - adj)
+            other = sum(v for k, v in probs.items() if k != "purchase_complete")
+            probs = {k: (adj if k == "purchase_complete" else v * rem / max(other, 1e-6)) for k, v in probs.items()}
+        total = sum(probs.values())
+        return {k: v/total for k, v in probs.items()} if total > 0 else {"session_end": 1.0}
+
+    def sample(self, rng: np.random.Generator, sid: str, prices: np.ndarray, costs: np.ndarray) -> Tuple[List[Dict], List[SimpleNamespace]]:
+        events, fevts = [], []
+        state, t, pidx = "session_start", 0.0, int(rng.integers(0, len(prices)))
+        cost, cprice = float(costs[pidx]), max(float(prices[pidx]), float(costs[pidx]) * 1.05)
+
+        while state != "session_end" and len(events) < 40:
+            if state != "session_start":
+                row = {"session_id": sid, "actor": "agent" if self.actor == "agents" else "human",
+                       "eventName": state, "product_idx": pidx, "productId": f"product-{pidx:04d}",
+                       "price_offered": cprice, "price_paid": 0.0, "page": EVENT_PAGE.get(state, "/"),
+                       "ts": t, "unit_cost": cost, "base_price": float(prices[pidx])}
+                if state == "purchase_complete":
+                    row["price_paid"] = max(cprice * (1.0 + rng.normal(0.0, 0.015)), cost)
+                events.append(row)
+                fevts.append(SimpleNamespace(eventName=state, page=row["page"], productId=row["productId"], ts=t))
+
+            probs = self._tprobs(state, pidx)
+            state = rng.choice(list(probs.keys()), p=list(probs.values()))
+            sh, sc = self.dwell.get(state, (2.0, 1.0))
+            t += max(0.3, rng.gamma(shape=sh, scale=sc))
+        return events, fevts
+
+
+@dataclass
+class ContaminatedArrivalConfig:
+    base_rate: float = 20.0
+    alpha_contamination: float = 0.2
+    alpha_drift: float = 0.0
+    alpha_bounds: tuple[float, float] = (0.0, 0.5)
+    human_views_range: tuple[int, int] = (1, 4)
+    agent_views_range: tuple[int, int] = (3, 10)
+    agent_systematic: bool = True
+    use_real_behavior: bool = True
+    human_data_dir: str = ""
+    agent_data_dir: str = ""
+
+
+class ContaminatedArrivalModel:
+    """Mixture model Q(p) = (1-α)E[d(p;θ_H)] + αE[d(p;θ_A)] + ε_t (Eq 3).
+
+    Samples sessions from human/agent behavioral profiles, computes per-session
+    demand proxy q̂ and divergence signals Δ_H, Δ_A for separability.
+    """
+
+    def __init__(self, cfg: ContaminatedArrivalConfig | None = None):
+        self.cfg = cfg or ContaminatedArrivalConfig()
+        self._alpha = self.cfg.alpha_contamination
+        self._scount = 0
+        self._profiles: Dict[str, BehavioralProfile] = {}
+        self._ref_kernels: Dict[str, Dict] = {}  # T̄_H, T̄_A reference kernels
+        self._session_demands: List[SessionDemand] = []  # collected session demands
+
+    @property
+    def alpha(self) -> float:
+        return self._alpha
+
+    def _profile(self, actor: str, pprobs: np.ndarray) -> BehavioralProfile:
+        key = actor
+        if key not in self._profiles:
+            ddir = self.cfg.agent_data_dir if actor == "agents" else self.cfg.human_data_dir
+            if not ddir and self.cfg.use_real_behavior:
+                base = Path(__file__).parent.parent.parent.parent / "experiments"
+                ddir = str(base / ("agents/collected_data" if actor == "agents" else "collected_data"))
+            profile = BehavioralProfile(actor, pprobs, ddir if self.cfg.use_real_behavior else "")
+            self._profiles[key] = profile
+            self._ref_kernels[key] = profile.trans  # cache T̄_Y for divergence
+        return self._profiles[key]
+
+    def get_ref_kernels(self) -> Tuple[Dict, Dict]:
+        """Return reference transition kernels T̄_H, T̄_A for divergence computation."""
+        return (self._ref_kernels.get("humans", BehavioralProfile.FALLBACK_H),
+                self._ref_kernels.get("agents", BehavioralProfile.FALLBACK_A))
+
+    def get_session_demands(self) -> List[SessionDemand]:
+        """Return collected session demands for downstream analysis."""
+        return self._session_demands
+
+    def sample(self, t: float, dt: float, instruments: InstrumentSet,
+               market: MarketState | None, hidden: HiddenState, rng: np.random.Generator) -> list[Opportunity]:
+        """Sample arrivals as per Eq 3: mixture of human/agent demand distributions.
+
+        For each session s, computes:
+        - Trajectory τ_s from behavioral profile sampling
+        - Demand proxy q̂ via weighted action aggregation (Eq 2)
+        - Divergence signals Δ_H, Δ_A for separability (Eq 20-21)
+        - Per-session contamination estimate α̂(τ')
+        """
+        cfg = self.cfg
+        if cfg.alpha_drift != 0:
+            self._alpha = np.clip(self._alpha + cfg.alpha_drift * rng.normal(), *cfg.alpha_bounds)
+        hidden.contamination = self._alpha
+
+        n_sess = poisson_arrivals(cfg.base_rate * hidden.true_demand_intensity, dt, rng)
+        prices, costs = instruments.refs, instruments.costs
+        margin = np.clip((prices - costs) / np.maximum(costs, 1e-3), -0.9, 2.0)
+        hprob, aprob = 0.08 * np.exp(-1.2 * margin), 0.05 * np.exp(-0.6 * margin)
+        ref_h, ref_a = self.get_ref_kernels()
+
+        opps = []
+        for _ in range(n_sess):
+            self._scount += 1
+            sid = f"s{self._scount:06d}"
+            is_agent = rng.random() < self._alpha
+            actor, probs = ("agents", aprob) if is_agent else ("humans", hprob)
+            profile = self._profile(actor, probs)
+            events, fevts = profile.sample(rng, sid, prices, costs)
+
+            # compute demand proxy q̂ per Eq 2
+            q = compute_demand_proxy(events, instruments.n)
+
+            # compute divergence signals Δ_H, Δ_A per Eq 20-21
+            delta_h, delta_a = compute_session_divergence(events, ref_h, ref_a)
+            # per-session contamination estimate α̂(τ') = σ(β(Δ_H - Δ_A))
+            alpha_hat = 1.0 / (1.0 + np.exp(-2.0 * (delta_h - delta_a))) if (delta_h + delta_a) > 0 else 0.5
+
+            theta = ({'price_sensitivity': rng.uniform(0.05, 0.2), 'base_conversion': 0.01, 'info_value': 1.0} if is_agent
+                     else {'price_sensitivity': rng.uniform(1.5, 4.0), 'base_conversion': rng.uniform(0.2, 0.5), 'info_value': 0.0})
+
+            # store session demand for downstream analysis
+            self._session_demands.append(SessionDemand(
+                session_id=sid, q=q, trajectory=events, delta_h=delta_h, delta_a=delta_a,
+                alpha_hat=alpha_hat, actor_class="A" if is_agent else "H", theta=theta))
+
+            viewed = list({e["product_idx"] for e in events if "product_idx" in e})
+            if not viewed:
+                vr = cfg.agent_views_range if is_agent else cfg.human_views_range
+                viewed = list(rng.choice(instruments.n, size=min(rng.integers(*vr), instruments.n), replace=False))
+
+            for vi, iid in enumerate(viewed):
+                opps.append(Opportunity(
+                    id=f"{sid}-{iid}", type=OpportunityType.SESSION, side=Side.BUY,
+                    instrument_id=int(iid), size=1.0, t=t + rng.uniform(0, dt),
+                    context={'session_id': sid, 'actor_class': 'AGENT' if is_agent else 'HUMAN', 'is_agent': is_agent,
+                             'reconnaissance_intent': is_agent, 'view_index': vi, 'total_views': len(viewed),
+                             'theta': theta, 'trajectory_events': fevts, 'mdp_trajectory': events,
+                             'demand_proxy': q, 'alpha_hat': alpha_hat, 'delta_h': delta_h, 'delta_a': delta_a}))
+        return opps
+
+
+@dataclass
+class AdversarialArrivalConfig:
+    base_rate: float = 5.0
+    n_parallel_agents: int = 3
+    query_all_products: bool = True
+
+
+class AdversarialArrivalModel:
+    """Adversarial coordination (Theorem 1): as N->inf, COI->0."""
+
+    def __init__(self, cfg: AdversarialArrivalConfig | None = None):
+        self.cfg = cfg or AdversarialArrivalConfig()
+        self._qcount = 0
+
+    def sample(self, t: float, dt: float, instruments: InstrumentSet,
+               market: MarketState | None, hidden: HiddenState, rng: np.random.Generator) -> list[Opportunity]:
+        cfg, opps = self.cfg, []
+        for _ in range(poisson_arrivals(cfg.base_rate, dt, rng)):
+            self._qcount += 1
+            for ai in range(cfg.n_parallel_agents):
+                sid = f"adv{self._qcount:06d}-{ai}"
+                prods = np.arange(instruments.n) if cfg.query_all_products else rng.choice(instruments.n, size=1)
+                for iid in prods:
+                    opps.append(Opportunity(
+                        id=f"{sid}-{iid}", type=OpportunityType.SESSION, side=Side.BUY,
+                        instrument_id=int(iid), size=1.0, t=t,
+                        context={'session_id': sid, 'actor_class': 'AGENT', 'is_agent': True, 'adversarial': True,
+                                 'agent_index': ai, 'query_group': self._qcount,
+                                 'theta': {'price_sensitivity': 0.0, 'base_conversion': 0.0, 'info_value': 1.0}}))
+        return opps
diff --git a/lab/case/thesis/execution.py b/lab/case/thesis/execution.py
new file mode 100644
index 0000000..5d2aa37
--- /dev/null
+++ b/lab/case/thesis/execution.py
@@ -0,0 +1,91 @@
+"""Execution models with divergent H/A behavior using ground truth labels."""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any, Dict
+import numpy as np
+from ...outlet.types import Opportunity, Quote, InstrumentSet, MarketState
+from ...outlet.math_util import sigmoid, safe_log, EPS
+
+
+@dataclass
+class HybridExecutionConfig:
+    human_base_prob: float = 0.3  # fallback base conversion for humans; also the divisor used by uncensor()
+    human_elasticity: float = 2.5  # fallback price sensitivity when theta has no 'price_sensitivity'
+    agent_conversion: float = 0.01  # agent fill probability before scaling by theta['base_conversion']
+    cross_elasticity: float = 0.4  # weight of the penalty applied when a competitor quote undercuts ours
+    quality_weight: float = 0.2  # utility weight on the instrument's 'quality' attribute
+    use_separability: bool = False  # NOTE(review): not read by HybridExecutionModel in this module
+
+
+class HybridExecutionModel:
+    """Fill-probability model in which labelled agents and humans convert differently."""
+
+    def __init__(self, cfg: HybridExecutionConfig | None = None):
+        self.cfg = cfg or HybridExecutionConfig()
+
+    def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet,
+             market: MarketState | None, rng: np.random.Generator) -> float:
+        """Return the fill probability of ``opp`` at the quoted price (``rng`` is not used)."""
+        cfg, idx = self.cfg, int(opp.instrument_id)
+        price, ref = float(quote.prices[idx]), float(instruments.refs[idx])
+        theta = opp.context.get('theta', {})
+
+        # agents: flat conversion scaled by the per-session base conversion, independent of price
+        if opp.context.get('is_agent', False):
+            return float(cfg.agent_conversion * theta.get('base_conversion', 1.0))
+
+        # humans: logit choice = log-odds of base conversion + price, quality and competitor utilities
+        sens = theta.get('price_sensitivity', cfg.human_elasticity)
+        base = theta.get('base_conversion', cfg.human_base_prob)
+        u_price = -sens * safe_log(price / (ref + EPS))
+        quality = instruments.instruments[idx].attrs.get('quality', 0.5)
+        u_quality = cfg.quality_weight * quality
+
+        u_comp = 0.0
+        if market and market.competitor_quotes is not None:
+            cp = market.competitor_quotes[idx]
+            if cp < price:  # only an undercutting competitor lowers our utility
+                u_comp = -cfg.cross_elasticity * (price - cp) / (ref + EPS)  # EPS guards ref == 0, as in u_price
+
+        utility = safe_log(base / (1 - base + EPS)) + u_price + u_quality + u_comp
+        return float(sigmoid(utility))
+
+    def uncensor(self, fills: np.ndarray, instruments: InstrumentSet, context: dict[str, Any] | None = None) -> np.ndarray:
+        """Divide fills by the human base conversion, discounted by context['contamination'] when given."""
+        if context is None:
+            return fills / (self.cfg.human_base_prob + EPS)
+        return fills / (self.cfg.human_base_prob * (1 - context.get('contamination', 0.0)) + EPS)
+
+
+@dataclass
+class SeparableExecutionConfig:
+    human_funnel: Dict[str, float] | None = None  # stage name -> transition probability; None/empty selects the defaults
+    agent_funnel: Dict[str, float] | None = None  # same three stage keys as human_funnel
+
+    def __post_init__(self) -> None:
+        self.human_funnel = self.human_funnel or {'view_to_detail': 0.4, 'detail_to_cart': 0.3, 'cart_to_purchase': 0.6}
+        self.agent_funnel = self.agent_funnel or {'view_to_detail': 0.8, 'detail_to_cart': 0.05, 'cart_to_purchase': 0.1}
+
+
+class SeparableExecutionModel:
+    """Three-stage funnel execution model keyed on the ground-truth ``is_agent`` label."""
+
+    def __init__(self, cfg: SeparableExecutionConfig | None = None):
+        self.cfg = cfg or SeparableExecutionConfig()
+
+    def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet,
+             market: MarketState | None, rng: np.random.Generator) -> float:
+        """Return the end-to-end funnel conversion; humans are further scaled by a price-ratio factor."""
+        if opp.context.get('is_agent', False):
+            stages = self.cfg.agent_funnel
+            return float(np.clip(stages['view_to_detail'] * stages['detail_to_cart'] * stages['cart_to_purchase'], 0, 1))
+        stages = self.cfg.human_funnel
+        through = stages['view_to_detail'] * stages['detail_to_cart'] * stages['cart_to_purchase']
+        i = int(opp.instrument_id)
+        through *= np.exp(-0.5 * (quote.prices[i] / (instruments.refs[i] + EPS) - 1.0))  # factor is 1 at price == ref
+        return float(np.clip(through, 0, 1))
+
+    def uncensor(self, fills: np.ndarray, instruments: InstrumentSet, context: dict[str, Any] | None = None) -> np.ndarray:
+        """Divide fills by the baseline human funnel conversion (``context`` is ignored)."""
+        funnel = self.cfg.human_funnel
+        return fills / (funnel['view_to_detail'] * funnel['detail_to_cart'] * funnel['cart_to_purchase'] + EPS)
diff --git a/lab/case/thesis/metrics.py b/lab/case/thesis/metrics.py
new file mode 100644
index 0000000..0cd9680
--- /dev/null
+++ b/lab/case/thesis/metrics.py
@@ -0,0 +1,102 @@
+"""Thesis metrics for COI and behavioral analysis using ground truth labels."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Dict
+import numpy as np
+from ...outlet.types import StepLogs, StepMetrics, Quote, InstrumentSet
+from ...outlet.math_util import safe_log, EPS
+
+
+@dataclass
+class COIMetrics:
+    coi_level: float = 0.0  # mean absolute margin (price - cost), as filled in by compute_coi
+    coi_leakage: float = 0.0  # contamination * (coi_level + variance of the price/ref ratios)
+    realized_premium: float = 0.0  # (revenue - cost) per unit traded; 0.0 when nothing traded
+    theoretical_max: float = 0.0  # mean instrument cost; used as the erosion normaliser
+    erosion_rate: float = 0.0  # contamination * coi_level / theoretical_max
+
+    def to_dict(self) -> dict[str, float]:  # plain-dict view of the five fields above
+        return {k: getattr(self, k) for k in ['coi_level', 'coi_leakage', 'realized_premium', 'theoretical_max', 'erosion_rate']}
+
+
+def compute_coi(quote: Quote, instruments: InstrumentSet, metrics: StepMetrics, contamination: float) -> COIMetrics:
+    """Summarise margin level, contamination-weighted leakage and erosion for one step's quote."""
+    prices, costs, refs = quote.prices, instruments.costs, instruments.refs
+    coi_level = float(np.mean(prices - costs))
+    theoretical_max = float(np.mean(costs))
+    realized_premium = float((metrics.revenue - metrics.cost) / metrics.units_traded) if metrics.units_traded > 0 else 0.0
+    price_var = float(np.var(prices / (refs + EPS)))  # EPS keeps a zero reference price from producing inf/nan
+    coi_leakage = contamination * (coi_level + price_var)
+    erosion_rate = contamination * coi_level / (theoretical_max + EPS)
+    return COIMetrics(coi_level=coi_level, coi_leakage=coi_leakage, realized_premium=realized_premium,
+                      theoretical_max=theoretical_max, erosion_rate=erosion_rate)
+
+
+@dataclass
+class SeparabilityMetrics:
+    classification_accuracy: float = 0.0  # set to 1.0 by compute_separability whenever events exist
+    estimated_alpha: float = 0.0  # agent sessions / all sessions (or the supplied true_alpha when there are no events)
+    n_human_sessions: int = 0  # distinct sessions whose first event is not flagged is_agent
+    n_agent_sessions: int = 0  # distinct sessions whose first event is flagged is_agent
+
+
+def compute_separability(logs: StepLogs, true_alpha: float) -> SeparabilityMetrics:
+    """Count human vs agent sessions in ``logs.events`` from their ground-truth ``is_agent`` flag."""
+    if logs.events is None or len(logs.events) == 0:
+        return SeparabilityMetrics(estimated_alpha=true_alpha)
+
+    # the first event seen for a session fixes its label; events without a session id are keyed by opportunity id
+    label_by_session: Dict[str, bool] = {}
+    for event in logs.events:
+        meta = event.metadata
+        label_by_session.setdefault(meta.get('session_id', event.opportunity_id), meta.get('is_agent', False))
+
+    agents = sum(1 for flagged in label_by_session.values() if flagged)
+    humans = len(label_by_session) - agents
+    alpha_hat = agents / len(label_by_session) if label_by_session else 0.0
+
+    return SeparabilityMetrics(
+        classification_accuracy=1.0,  # ground truth is always correct
+        estimated_alpha=alpha_hat,
+        n_human_sessions=humans,
+        n_agent_sessions=agents)
+
+
+@dataclass
+class RevenueAttribution:
+    total_revenue: float = 0.0  # metrics.revenue for the step
+    human_revenue: float = 0.0  # sum of price * size_filled over executions attributed to humans
+    agent_revenue: float = 0.0  # sum of price * size_filled over executions attributed to agents
+    human_conversion: float = 0.0  # human execution count / assumed number of human arrivals
+    agent_conversion: float = 0.0  # agent execution count / assumed number of agent arrivals
+
+
+def compute_attribution(logs: StepLogs, metrics: StepMetrics, agent_share: float = 0.2,
+                        propensity_threshold: float = 0.05) -> RevenueAttribution:
+    """Split executed revenue into human/agent parts; fills with propensity below the threshold count as agent."""
+    if logs.executions is None:
+        return RevenueAttribution(total_revenue=metrics.revenue)
+    human_rev, agent_rev, human_cnt, agent_cnt = 0.0, 0.0, 0, 0
+    for exe in logs.executions:
+        value = exe.price * exe.size_filled
+        if exe.propensity < propensity_threshold:
+            agent_rev, agent_cnt = agent_rev + value, agent_cnt + 1
+        else:
+            human_rev, human_cnt = human_rev + value, human_cnt + 1
+    # arrivals are apportioned with agent_share; the default 0.2 reproduces the original fixed 80/20 split
+    total_exp = logs.aggregates.get('n_arrivals', 1)
+    return RevenueAttribution(
+        total_revenue=metrics.revenue, human_revenue=human_rev, agent_revenue=agent_rev,
+        human_conversion=human_cnt / (total_exp * (1.0 - agent_share) + EPS),
+        agent_conversion=agent_cnt / (total_exp * agent_share + EPS))
+
+
+def order_statistic_erosion(n_agents: int, price_variance: float) -> float:
+    """COI erosion from Theorem 1 (as N->inf, min(p_1..p_N)->p_min); the result is capped at 1.0."""
+    if n_agents <= 1 or price_variance <= 0:  # a single agent, or no valid price dispersion, erodes nothing
+        return 0.0
+    sigma, log_n = np.sqrt(price_variance), safe_log(n_agents)
+    if log_n < 1:  # the expansion below is only applied once ln(N) >= 1
+        return 0.0
+    shift = sigma * (np.sqrt(2 * log_n) - (safe_log(log_n) + safe_log(4 * np.pi)) / (2 * np.sqrt(2 * log_n) + EPS))
+    return float(min(shift / (sigma * 2 + EPS), 1.0))
diff --git a/lab/case/thesis/objectives.py b/lab/case/thesis/objectives.py
new file mode 100644
index 0000000..ba70320
--- /dev/null
+++ b/lab/case/thesis/objectives.py
@@ -0,0 +1,228 @@
+"""
+Thesis-specific objectives implementing robust pricing under contamination.
+
+Implements the Maximin objective from Eq 23:
+π* = argmax_π min_{Q ∈ U_ε} E_d~Q[R(p,d) - λ·COI(p)]
+
+Key components:
+- COIObjective: Cost of Information penalty (Definition 1)
+- RobustStackelbergObjective: Full maximin objective with Wasserstein robustness
+- UXPenalty: User experience degradation from volatility
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+import numpy as np
+from ...outlet.objectives.base import BaseObjective, CompositeObjective
+from ...outlet.types import Quote, InstrumentSet, StepMetrics, HiddenState, Observation
+from ...outlet.math_util import safe_log, EPS
+
+class COIObjective(BaseObjective):
+    """Cost of Information penalty from Definition 1.
+
+    COI(π) = E[P] - p_min
+
+    The expected price premium over marginal cost represents the platform's
+    pricing power. Agent reconnaissance erodes this by revealing price
+    distribution to buyers.
+
+    We implement COI_leakage = f(τ') · InfoValue(p, τ')
+    where f(τ') is read from hidden.contamination in reward().
+    """
+
+    def __init__(self, lambda_coi: float = 1.0, use_revelation: bool = False):
+        """
+        Args:
+            lambda_coi: Weight on COI penalty
+            use_revelation: If True, info value is the mean relative |price - ref| deviation (stand-in for -log(π(p))); else 1.0
+        """
+        self.lambda_coi = lambda_coi
+        self.use_revelation = use_revelation
+
+    def reward(self, quote: Quote, instruments: InstrumentSet,
+               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
+        # COI_leakage = α · InfoValue; the returned reward is -lambda_coi * leakage
+        alpha = hidden.contamination
+
+        if self.use_revelation:
+            # revelation surrogate: prices far from their reference reveal more about the policy
+            # InfoValue = mean(|p - ref| / ref), used here in place of the surprise -log(π(p|τ'))
+            price_surprise = np.mean(np.abs(quote.prices - instruments.refs) / (instruments.refs + EPS))
+            info_value = price_surprise
+        else:
+            # query-tax surrogate: each agent query incurs constant leakage
+            info_value = 1.0
+
+        leakage = alpha * info_value
+        return -self.lambda_coi * leakage
+
+    def breakdown(self, quote: Quote, instruments: InstrumentSet,
+                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
+        alpha = hidden.contamination
+        margins = (quote.prices - instruments.costs) / (instruments.costs + EPS)  # relative margin; reported only
+        return {
+            'coi_penalty': self.reward(quote, instruments, metrics, hidden, obs),
+            'contamination': alpha,
+            'avg_margin': float(np.mean(margins)),
+        }
+
+@dataclass
+class RobustObjectiveConfig:
+    """Configuration for robust Stackelberg objective.
+
+    Attributes:
+        lambda_coi: Weight on COI penalty (λ in Eq 23)
+        lambda_ux: Weight on UX penalty (NOTE(review): never read by RobustStackelbergObjective as written)
+        lambda_volatility: Weight on price volatility penalty
+        gamma_inventory: Inventory risk aversion
+        wasserstein_epsilon: Ambiguity set radius (ε in Eq 21)
+    """
+    lambda_coi: float = 0.5
+    lambda_ux: float = 0.1
+    lambda_volatility: float = 0.2
+    gamma_inventory: float = 0.1
+    wasserstein_epsilon: float = 0.1
+
+class RobustStackelbergObjective(BaseObjective):
+    """Implements the Maximin Objective from thesis Eq 23.
+
+    π* = argmax_π min_{Q ∈ U_ε(P̂_N)} E_d~Q[R(p,d) - λ·COI(p)]
+
+    The objective balances:
+    1. Profit (metrics.revenue - metrics.cost), standing in for R(p,d)
+    2. COI penalty for information leakage to agents
+    3. UX penalty for price volatility (weighted by lambda_volatility; lambda_ux is not read here)
+    4. Inventory/holding costs
+
+    The min over ambiguity set U_ε is approximated by penalizing
+    high contamination scenarios more heavily.
+    """
+
+    def __init__(self, cfg: RobustObjectiveConfig | None = None):
+        self.cfg = cfg or RobustObjectiveConfig()
+
+    def reward(self, quote: Quote, instruments: InstrumentSet,
+               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
+        cfg = self.cfg
+
+        # 1. base revenue (R(p,d))
+        revenue = metrics.revenue
+        cost = metrics.cost
+        profit = revenue - cost
+
+        # 2. COI penalty: scales with contamination and margin extraction
+        # high margins + high contamination = high leakage
+        alpha = hidden.contamination
+        margins = quote.prices - instruments.costs
+        avg_margin = float(np.mean(margins))
+        coi_penalty = cfg.lambda_coi * avg_margin * alpha
+
+        # 3. UX penalty: price volatility harms legitimate users
+        volatility_penalty = cfg.lambda_volatility * metrics.volatility
+
+        # 4. inventory/position cost
+        position_penalty = cfg.gamma_inventory * metrics.position_cost
+
+        # 5. lost opportunity cost (stockouts); NOTE(review): the 0.1 weight is hard-coded here and in breakdown()
+        lost_penalty = 0.1 * metrics.lost_opportunity
+
+        # robust adjustment: under adversarial distribution Q,
+        # expect lower revenue and higher costs
+        # approximate via worst-case contamination within ε-ball
+        worst_case_alpha = min(alpha + cfg.wasserstein_epsilon, 1.0)
+        robustness_penalty = cfg.wasserstein_epsilon * avg_margin * worst_case_alpha
+
+        total = profit - coi_penalty - volatility_penalty - position_penalty - lost_penalty - robustness_penalty
+
+        return total
+
+    def breakdown(self, quote: Quote, instruments: InstrumentSet,
+                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
+        cfg = self.cfg
+        alpha = hidden.contamination
+        margins = quote.prices - instruments.costs
+        avg_margin = float(np.mean(margins))
+
+        # penalty entries carry a negative sign and repeat the formulas used in reward()
+        return {
+            'revenue': metrics.revenue,
+            'cost': metrics.cost,
+            'profit': metrics.revenue - metrics.cost,
+            'coi_penalty': -cfg.lambda_coi * avg_margin * alpha,
+            'volatility_penalty': -cfg.lambda_volatility * metrics.volatility,
+            'position_penalty': -cfg.gamma_inventory * metrics.position_cost,
+            'lost_penalty': -0.1 * metrics.lost_opportunity,
+            'robustness_penalty': -cfg.wasserstein_epsilon * avg_margin * min(alpha + cfg.wasserstein_epsilon, 1.0),
+            'contamination': alpha,
+            'avg_margin_pct': avg_margin / (float(np.mean(instruments.costs)) + EPS),
+        }
+
+class UXPenalty(BaseObjective):
+    """Quadratic penalty on price volatility above a tolerated level.
+
+    Volatile prices degrade the experience of legitimate human users.
+    Including this term keeps the defence against agent reconnaissance
+    from being paid for by real customers.
+    """
+
+    def __init__(self, scale: float = 1.0, max_acceptable_volatility: float = 0.1):
+        # max_vol is the volatility level below which no penalty applies
+        self.scale, self.max_vol = scale, max_acceptable_volatility
+
+    def reward(self, quote: Quote, instruments: InstrumentSet,
+               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
+        """Return ``-scale * overshoot**2``, where overshoot is the volatility in excess of ``max_vol``."""
+        overshoot = max(0, metrics.volatility - self.max_vol)
+        return -self.scale * (overshoot ** 2)
+
+    def breakdown(self, quote: Quote, instruments: InstrumentSet,
+                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
+        """Report the penalty together with the raw volatility it was computed from."""
+        penalty = self.reward(quote, instruments, metrics, hidden, obs)
+        return {'ux_penalty': penalty,
+                'volatility': metrics.volatility}
+
+class AdaptiveObjective(BaseObjective):
+    """Objective that adapts weights based on estimated contamination.
+
+    When contamination is low, focus on revenue maximization.
+    When contamination is high, increase COI defense weight.
+    """
+
+    def __init__(self, base_lambda_coi: float = 0.3, max_lambda_coi: float = 2.0,
+                 adaptation_rate: float = 2.0, midpoint: float = 0.3):
+        self.base_lambda, self.max_lambda = base_lambda_coi, max_lambda_coi
+        self.rate = adaptation_rate
+        self.midpoint = midpoint  # contamination at which the sigmoid argument is zero (previously a literal 0.3)
+
+    def _adaptive_lambda(self, alpha: float) -> float:
+        # sigmoid scaling: λ(α) = base + (max-base) * sigmoid(rate*(α-midpoint))
+        from ...outlet.math_util import sigmoid
+        scale = sigmoid(self.rate * (alpha - self.midpoint))
+        return self.base_lambda + (self.max_lambda - self.base_lambda) * scale
+
+    def reward(self, quote: Quote, instruments: InstrumentSet,
+               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
+        """Return profit minus the COI penalty λ(α) · mean margin · α, with α = hidden.contamination."""
+        alpha = hidden.contamination
+        lambda_coi = self._adaptive_lambda(alpha)
+        profit = metrics.revenue - metrics.cost
+        margins = quote.prices - instruments.costs
+        coi_penalty = lambda_coi * float(np.mean(margins)) * alpha
+
+        return profit - coi_penalty
+
+    def breakdown(self, quote: Quote, instruments: InstrumentSet,
+                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
+        alpha = hidden.contamination
+        return {
+            'profit': metrics.revenue - metrics.cost,
+            'adaptive_lambda': self._adaptive_lambda(alpha),
+            'contamination': alpha,
+        }
+
+def make_thesis_objective(lambda_coi: float = 0.5, lambda_ux: float = 0.1,
+                          lambda_vol: float = 0.2) -> CompositeObjective:
+    """Wrap a single RobustStackelbergObjective (weight 1.0) in a CompositeObjective."""
+    robust_cfg = RobustObjectiveConfig(
+        lambda_coi=lambda_coi, lambda_ux=lambda_ux, lambda_volatility=lambda_vol)
+    weighted_terms = [(RobustStackelbergObjective(robust_cfg), 1.0)]
+    return CompositeObjective(weighted_terms)
diff --git a/lab/case/thesis/platform.py b/lab/case/thesis/platform.py
new file mode 100644
index 0000000..ec00da5
--- /dev/null
+++ b/lab/case/thesis/platform.py
@@ -0,0 +1,176 @@
+"""Thesis platform with real MDP behavioral models and separability scoring."""
+from __future__ import annotations
+from dataclasses import dataclass
+from pathlib import Path
+import numpy as np
+from ...outlet import (Platform, PlatformConfig, PositionModel, PositionConfig,
+                       PostedPriceMechanism, make_instruments, InstrumentType, LogLevel)
+from ...outlet.mechanisms.posted_price import PostedPriceConfig
+from ...outlet.observation import DefaultObservationBuilder, ObservationConfig
+from .arrivals import ContaminatedArrivalModel, ContaminatedArrivalConfig
+from .execution import HybridExecutionModel, HybridExecutionConfig
+from .objectives import RobustStackelbergObjective, RobustObjectiveConfig
+
+
+@dataclass
+class ThesisConfig:
+    # instruments
+    n_instruments: int = 10
+    cost_range: tuple[float, float] = (5.0, 50.0)
+    margin_range: tuple[float, float] = (0.2, 0.5)
+
+    # contamination (Section 3.1)
+    alpha_contamination: float = 0.2
+    alpha_drift: float = 0.0
+    alpha_bounds: tuple[float, float] = (0.0, 0.5)
+
+    # objectives (Eq 23)
+    lambda_coi: float = 0.5
+    lambda_ux: float = 0.1
+    lambda_volatility: float = 0.2
+    wasserstein_epsilon: float = 0.1
+
+    # arrivals
+    sessions_per_step: int = 30
+    human_views_range: tuple[int, int] = (1, 4)
+    agent_views_range: tuple[int, int] = (3, 10)
+
+    # inventory
+    initial_inventory: float = 100.0
+    holding_cost_rate: float = 0.002
+
+    # real behavioral models (from sim.rl)
+    use_real_behavior: bool = True
+    use_separability: bool = False  # disabled until classifier trained
+    human_data_dir: str = ""  # empty -> _resolve_data_dirs falls back to <four dirs up>/experiments/collected_data
+    agent_data_dir: str = ""  # empty -> _resolve_data_dirs falls back to <four dirs up>/experiments/agents/collected_data
+
+    # simulation
+    max_steps: int = 500
+    seed: int | None = 24
+    log_level: LogLevel = LogLevel.AGG_ONLY
+
+
+def _resolve_data_dirs(cfg: ThesisConfig) -> tuple[str, str]:
+    """Return (human_dir, agent_dir); an empty setting falls back to the experiments tree four directories up."""
+    experiments_root = Path(__file__).parent.parent.parent.parent / "experiments"
+    human_dir = cfg.human_data_dir if cfg.human_data_dir else str(experiments_root / "collected_data")
+    agent_dir = cfg.agent_data_dir if cfg.agent_data_dir else str(experiments_root / "agents/collected_data")
+    return human_dir, agent_dir
+
+
+def make_thesis_platform(cfg: ThesisConfig | None = None) -> Platform:
+    """Create platform with real MDP behavioral models.
+
+    Implements:
+    - Contaminated arrivals (ContaminatedArrivalModel; behaviour flags and data dirs are forwarded to it)
+    - Hybrid execution (HybridExecutionModel; only use_separability is forwarded, other settings keep defaults)
+    - Robust Stackelberg objective (Eq 23)
+    """
+    cfg = cfg or ThesisConfig()
+    rng = np.random.default_rng(cfg.seed)  # used here only to draw the instrument set
+    human_dir, agent_dir = _resolve_data_dirs(cfg)
+
+    instruments = make_instruments(
+        n=cfg.n_instruments, cost_range=cfg.cost_range, margin_range=cfg.margin_range,
+        inst_type=InstrumentType.SKU, rng=rng)
+    instruments.position = np.full(cfg.n_instruments, cfg.initial_inventory)  # same starting inventory for every SKU
+
+    arrival = ContaminatedArrivalModel(ContaminatedArrivalConfig(
+        base_rate=cfg.sessions_per_step,
+        alpha_contamination=cfg.alpha_contamination,
+        alpha_drift=cfg.alpha_drift,
+        alpha_bounds=cfg.alpha_bounds,
+        human_views_range=cfg.human_views_range,
+        agent_views_range=cfg.agent_views_range,
+        use_real_behavior=cfg.use_real_behavior,
+        human_data_dir=human_dir,
+        agent_data_dir=agent_dir,
+    ))
+
+    execution = HybridExecutionModel(HybridExecutionConfig(
+        use_separability=cfg.use_separability,
+    ))
+
+    mechanism = PostedPriceMechanism(PostedPriceConfig(max_delta_pct=0.15, min_margin_pct=0.05))
+    position = PositionModel(PositionConfig(initial_position=cfg.initial_inventory, holding_cost_rate=cfg.holding_cost_rate))
+
+    market = None  # no competitor/market model is attached in this scenario
+    objective = RobustStackelbergObjective(RobustObjectiveConfig(
+        lambda_coi=cfg.lambda_coi, lambda_ux=cfg.lambda_ux,
+        lambda_volatility=cfg.lambda_volatility, wasserstein_epsilon=cfg.wasserstein_epsilon))
+
+    obs_builder = DefaultObservationBuilder(ObservationConfig(mask_true_demand=True))
+    platform_cfg = PlatformConfig(n_instruments=cfg.n_instruments, max_steps=cfg.max_steps,
+                                   seed=cfg.seed, log_level=cfg.log_level, mask_demand=True)
+
+    return Platform(instruments=instruments, mechanism=mechanism, arrival=arrival, execution=execution,
+                    position=position, market=market, obs_builder=obs_builder, objective=objective, cfg=platform_cfg)
+
+
+@dataclass
+class AblationConfig(ThesisConfig):
+    disable_coi_penalty: bool = False  # make_ablation_platform sets lambda_coi to 0.0
+    disable_ux_penalty: bool = False  # sets lambda_ux to 0.0; NOTE(review): lambda_ux is not read by RobustStackelbergObjective
+    disable_contamination: bool = False  # sets alpha_contamination to 0.0
+    disable_real_behavior: bool = False  # turns off both use_real_behavior and use_separability
+
+
+def make_ablation_platform(cfg: AblationConfig) -> Platform:
+    """Build a thesis platform with the flagged components switched off; the caller's ``cfg`` is left untouched."""
+    from dataclasses import replace
+    switches = (
+        (cfg.disable_coi_penalty, {'lambda_coi': 0.0}),
+        (cfg.disable_ux_penalty, {'lambda_ux': 0.0}),
+        (cfg.disable_contamination, {'alpha_contamination': 0.0}),
+        (cfg.disable_real_behavior, {'use_real_behavior': False, 'use_separability': False}),
+    )
+    overrides = {k: v for on, patch in switches if on for k, v in patch.items()}
+    return make_thesis_platform(replace(cfg, **overrides))
+
+
+def sweep_contamination(alpha_values: list[float], base_cfg: ThesisConfig | None = None,
+                        n_steps: int = 100, seed: int = 42) -> dict[float, dict]:
+    """Test performance across contamination levels (Theorem 1 validation)."""
+    from ...experiments.eval import rollout, fixed_price_policy
+    from dataclasses import replace
+    results = {}
+    base_cfg = base_cfg or ThesisConfig()
+
+    for alpha in alpha_values:
+        # replace() copies every field and keeps the concrete class, so ThesisConfig subclasses work too
+        cfg = replace(base_cfg, alpha_contamination=alpha)
+        platform = make_thesis_platform(cfg)
+        policy = fixed_price_policy(platform.instruments.refs)
+        result = rollout(platform, policy, n_steps, seed=seed)
+        results[alpha] = {
+            'total_reward': result.total_reward,
+            'total_pnl': result.total_pnl,
+            'avg_conversion': result.avg_conversion,
+            'final_contamination': platform._hidden.contamination,
+        }
+    return results
+
+
+def sweep_behavior_modes(base_cfg: ThesisConfig | None = None, n_steps: int = 100, seed: int = 42) -> dict[str, dict]:
+    """Compare real vs synthetic behavioral models."""
+    from ...experiments.eval import rollout, fixed_price_policy
+    from dataclasses import replace  # copies base_cfg and keeps its concrete (sub)class
+    base_cfg = base_cfg or ThesisConfig()
+    modes = {
+        'real_mdp': replace(base_cfg, use_real_behavior=True, use_separability=True),
+        'synthetic': replace(base_cfg, use_real_behavior=False, use_separability=False),
+        'real_mdp_no_sep': replace(base_cfg, use_real_behavior=True, use_separability=False),
+    }
+
+    results = {}
+    for name, cfg in modes.items():
+        platform = make_thesis_platform(cfg)
+        policy = fixed_price_policy(platform.instruments.refs)
+        result = rollout(platform, policy, n_steps, seed=seed)
+        results[name] = {
+            'total_reward': result.total_reward,
+            'total_pnl': result.total_pnl,
+            'avg_conversion': result.avg_conversion,
+        }
+    return results
diff --git a/lab/case/thesis/run_experiment.py b/lab/case/thesis/run_experiment.py
new file mode 100644
index 0000000..962db4f
--- /dev/null
+++ b/lab/case/thesis/run_experiment.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+"""Thesis simulation experiments with real MDP behavioral models."""
+from __future__ import annotations
+import sys
+from pathlib import Path
+
+if __name__ == '__main__':
+    sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
+
+from lab.case.thesis.platform import make_thesis_platform, ThesisConfig
+from lab.case.thesis.metrics import compute_coi, compute_separability
+from lab.experiments.eval import compare_policies
+import numpy as np
+
+
+def demo_basic_simulation():
+    """Run one seeded episode with randomly perturbed reference prices and print COI statistics."""
+    print("=" * 70)
+    print("THESIS SIMULATION: Contaminated Dynamic Pricing (Real MDP Kernels)")
+    print("=" * 70)
+    cfg = ThesisConfig(n_instruments=5, alpha_contamination=0.3, lambda_coi=0.5,
+                       max_steps=100, seed=42, use_real_behavior=True)
+    platform = make_thesis_platform(cfg)
+
+    print(f"\nInstruments: {platform.instruments.n}")
+    print(f"Reference prices: {platform.instruments.refs.round(2)}")
+    print(f"Costs: {platform.instruments.costs.round(2)}")
+    print(f"Initial contamination alpha={cfg.alpha_contamination}")
+    print(f"Using real behavior: {cfg.use_real_behavior}")
+
+    platform.reset(seed=cfg.seed)
+    rng = np.random.default_rng(cfg.seed)  # seeded generator so the price perturbations are reproducible
+    total_reward, coi_history = 0, []
+    print(f"\n{'Step':>5} {'Reward':>10} {'PnL':>10} {'COI':>8} {'alpha':>6} {'Conv':>8}")
+    print("-" * 55)
+
+    for t in range(cfg.max_steps):
+        action = platform.instruments.refs * rng.uniform(0.95, 1.15, size=platform.instruments.n)
+        result = platform.step(action)
+        total_reward += result.reward
+        coi = compute_coi(platform._quote, platform.instruments, result.metrics, result.hidden.contamination)
+        coi_history.append(coi.coi_level)
+
+        if t % 20 == 0:  # print every 20th step only
+            print(f"{t:5d} {result.reward:10.2f} {result.metrics.pnl:10.2f} "
+                  f"{coi.coi_level:8.2f} {result.hidden.contamination:6.2f} {result.metrics.conversion:8.3f}")
+
+    print("-" * 55)
+    print(f"Total Reward: {total_reward:.2f}")
+    print(f"Average COI: {np.mean(coi_history):.2f}")
+    print(f"COI Trend: {coi_history[-1] - coi_history[0]:+.2f}")
+
+
+def demo_contamination_sweep():
+    """Sweep contamination over an evenly spaced grid and print per-level results plus the alpha/reward correlation."""
+    print("\n" + "=" * 70)
+    print("EXPERIMENT: COI Erosion vs Contamination (Theorem 1)")
+    print("=" * 70)
+    from lab.case.thesis.platform import sweep_contamination
+    n_levels = 20
+    alpha_values = [i/n_levels for i in range(n_levels)]
+    results = sweep_contamination(alpha_values, n_steps=100, seed=42)
+
+    print(f"\n{'alpha':>6} {'Reward':>12} {'PnL':>12} {'Conv':>10}")
+    print("-" * 45)
+    for alpha, row in sorted(results.items()):
+        print(f"{alpha:6.2f} {row['total_reward']:12.2f} {row['total_pnl']:12.2f} {row['avg_conversion']:10.3f}")
+
+    # alpha_values is already ascending, so it lines up with the rewards taken in sorted-key order
+    reward_series = [results[a]['total_reward'] for a in sorted(results)]
+    trend = np.corrcoef(alpha_values, reward_series)[0, 1]
+    print(f"Trend (alpha~reward correlation): {trend:.3f}")
+
+
+def demo_policy_comparison():
+    print("\n" + "=" * 70)
+    print("EXPERIMENT: Policy Comparison under Contamination")
+    print("=" * 70)
+
+    cfg = ThesisConfig(n_instruments=5, alpha_contamination=0.25, max_steps=100, seed=42)
+    platform = make_thesis_platform(cfg)
+
+    # each policy returns a (prices, 1.0) pair; presumably the second element is a propensity — TODO confirm
+    def fixed_policy(obs, t): return platform.instruments.refs.copy(), 1.0
+    def aggressive_policy(obs, t): return platform.instruments.refs * 1.3, 1.0
+    def conservative_policy(obs, t): return platform.instruments.refs * 1.05, 1.0
+    def adaptive_policy(obs, t):
+        fills = obs[platform.instruments.n:2*platform.instruments.n]  # assumes obs is flat with fills in the 2nd block of n — TODO confirm
+        exp = obs[2*platform.instruments.n:3*platform.instruments.n]  # assumes the 3rd block of n holds exposures — TODO confirm
+        conv = np.sum(fills) / (np.sum(exp) + 1e-8)  # 1e-8 avoids division by zero
+        return platform.instruments.refs * (1.0 + 0.2 * conv), 1.0
+
+    policies = {'fixed': fixed_policy, 'aggressive': aggressive_policy,
+                'conservative': conservative_policy, 'adaptive': adaptive_policy}
+    results = compare_policies(platform, policies, n_steps=100, n_runs=3, seed=42)
+
+    print(f"\n{'Policy':>15} {'Reward':>12} {'Std':>10} {'PnL':>12} {'Conv':>10}")
+    print("-" * 65)
+    for name, r in sorted(results.items(), key=lambda x: -x[1]['mean_reward']):  # best mean reward first
+        print(f"{name:>15} {r['mean_reward']:12.2f} {r['std_reward']:10.2f} "
+              f"{r['mean_pnl']:12.2f} {r['mean_conversion']:10.3f}")
+
+
+def demo_session_analysis():
+    """Analyze session-level behavior from MDP trajectories."""
+    print("\n" + "=" * 70)
+    print("EXPERIMENT: Session Analysis (Ground Truth)")
+    print("=" * 70)
+    from lab.outlet.constants import LogLevel
+    cfg = ThesisConfig(n_instruments=5, alpha_contamination=0.3, max_steps=50,
+                       log_level=LogLevel.FULL, seed=42, use_real_behavior=True)
+    platform = make_thesis_platform(cfg)
+    platform.reset(seed=42)
+    human_sessions, agent_sessions = 0, 0
+    for _ in range(cfg.max_steps):
+        action = platform.instruments.refs * 1.1
+        result = platform.step(action)
+        sep = compute_separability(result.logs, result.hidden.contamination)
+        human_sessions += sep.n_human_sessions
+        agent_sessions += sep.n_agent_sessions
+
+    total = human_sessions + agent_sessions
+    if total == 0:  # nothing was logged: report it instead of dividing by zero below
+        print("\nTotal sessions: 0")
+        return
+    print(f"\nTotal sessions: {total}")
+    print(f"Human sessions: {human_sessions} ({100*human_sessions/total:.1f}%)")
+    print(f"Agent sessions: {agent_sessions} ({100*agent_sessions/total:.1f}%)")
+    print(f"True contamination: {cfg.alpha_contamination:.1%}")
+    print(f"Observed contamination: {agent_sessions/total:.1%}")
+
+
+if __name__ == '__main__':  # script entry point: only the first two demos run by default
+    demo_basic_simulation()
+    demo_contamination_sweep()
+    # demo_policy_comparison()
+    # demo_session_analysis()
diff --git a/lab/config.py b/lab/config.py
new file mode 100644
index 0000000..441085d
--- /dev/null
+++ b/lab/config.py
@@ -0,0 +1,156 @@
+"""
+Configuration and factory functions for creating pre-configured platforms.
+
+This module provides:
+- RetailConfig, MarketMakingConfig: Configuration dataclasses
+- make_retail_platform: Factory for retail dynamic pricing scenarios
+- make_market_making_platform: Factory for market making scenarios
+
+Example:
+    >>> from lab.config import make_retail_platform, RetailConfig
+    >>> platform = make_retail_platform(RetailConfig(n_instruments=5))
+    >>> result = platform.reset(seed=42)
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+import numpy as np
+from .outlet import (Platform, PlatformConfig, PositionModel, PositionConfig,
+                     PostedPriceMechanism, TwoSidedMechanism, make_instruments,
+                     InstrumentType, LogLevel)
+from .outlet.mechanisms.posted_price import PostedPriceConfig
+from .outlet.mechanisms.two_sided import TwoSidedConfig
+from .population import (SessionArrivalModel, PoissonArrivalModel, HawkesArrivalModel,
+                         ElasticityExecutionModel, IntensityExecutionModel,
+                         ReactiveCompetitorModel, GBMMarketModel)
+from .population.arrivals import SessionArrivalConfig, PoissonArrivalConfig, HawkesArrivalConfig
+from .population.execution import ElasticityConfig, IntensityConfig
+from .population.competitors import ReactiveCompetitorConfig, GBMMarketConfig
+from .outlet.objectives.factory import retail_objective, market_making_objective
+
+@dataclass
+class RetailConfig:
+    """Configuration for retail dynamic pricing scenario.
+
+    Consumed by make_retail_platform, which forwards each field to the
+    component it configures (see the per-field comments below).
+
+    Attributes:
+        n_instruments: Number of products to price
+        cost_range: (min, max) for random product costs
+        margin_range: (min, max) for random initial margins
+        initial_inventory: Starting inventory per product
+        holding_cost_rate: Cost per unit per step for holding
+        sessions_per_step: Number of browsing sessions per step
+        contamination: Fraction of sessions that are scrapers
+        max_steps: Maximum episode length
+        seed: Random seed for reproducibility
+    """
+    # Sizes the instrument set, the inventory vector and PlatformConfig.
+    n_instruments: int = 10
+    # Both ranges are passed unchanged to make_instruments along with the RNG.
+    cost_range: tuple[float, float] = (5.0, 50.0)
+    margin_range: tuple[float, float] = (0.2, 0.5)
+    # Sets instruments.position and PositionConfig.initial_position alike.
+    initial_inventory: float = 100.0
+    # Forwarded to PositionConfig.
+    holding_cost_rate: float = 0.002
+    # Both are forwarded to SessionArrivalConfig.
+    sessions_per_step: int = 30
+    contamination: float = 0.1
+    # Forwarded to PlatformConfig.
+    max_steps: int = 500
+    # Seeds np.random.default_rng and PlatformConfig; None leaves the RNG unseeded.
+    seed: int | None = None
+
+def make_retail_platform(cfg: RetailConfig | None = None) -> Platform:
+    """Create a pre-configured retail dynamic pricing platform.
+
+    Components:
+    - Mechanism: PostedPriceMechanism (single price per product)
+    - Arrivals: SessionArrivalModel (browsing sessions with views)
+    - Execution: ElasticityExecutionModel (price sensitivity)
+    - Market: ReactiveCompetitorModel (can trigger price wars)
+    - Objective: PnL - holding_cost - volatility - lost_opportunity
+
+    Args:
+        cfg: Configuration (uses defaults if None)
+
+    Returns:
+        Configured Platform instance
+    """
+    cfg = cfg or RetailConfig()
+    rng = np.random.default_rng(cfg.seed)
+
+    instruments = make_instruments(cfg.n_instruments, cfg.cost_range, cfg.margin_range,
+                                   InstrumentType.SKU, rng)
+    instruments.position = np.full(cfg.n_instruments, cfg.initial_inventory)
+
+    mechanism = PostedPriceMechanism(PostedPriceConfig())
+    arrival = SessionArrivalModel(SessionArrivalConfig(
+        sessions_per_step=cfg.sessions_per_step, contamination=cfg.contamination))
+    execution = ElasticityExecutionModel(ElasticityConfig())
+    position = PositionModel(PositionConfig(
+        initial_position=cfg.initial_inventory,
+        holding_cost_rate=cfg.holding_cost_rate))
+    market = ReactiveCompetitorModel(ReactiveCompetitorConfig(), refs=instruments.refs)
+    objective = retail_objective()
+
+    return Platform(
+        instruments=instruments, mechanism=mechanism, arrival=arrival,
+        execution=execution, position=position, market=market, objective=objective,
+        cfg=PlatformConfig(n_instruments=cfg.n_instruments, max_steps=cfg.max_steps,
+                           seed=cfg.seed, log_level=LogLevel.AGG_ONLY)
+    )
+
+@dataclass
+class MarketMakingConfig:
+    """Configuration for market making scenario.
+
+    Consumed by make_market_making_platform, which forwards each field to the
+    component it configures (see the per-field comments below).
+
+    Attributes:
+        n_instruments: Number of assets to quote
+        initial_mid: Initial mid-price for assets
+        mu: Price drift (expected return)
+        sigma: Price volatility
+        gamma: Inventory risk aversion parameter
+        base_arrival_rate: Order arrival rate (Hawkes baseline)
+        max_steps: Maximum episode length
+        seed: Random seed for reproducibility
+    """
+    # Sizes the instrument set, the zero position vector and PlatformConfig.
+    n_instruments: int = 5
+    # make_instruments receives the range (0.9 * initial_mid, 1.1 * initial_mid).
+    initial_mid: float = 100.0
+    # mu and sigma go to GBMMarketConfig; sigma is also given to the objective.
+    mu: float = 0.0
+    sigma: float = 0.02
+    # Passed to market_making_objective.
+    gamma: float = 0.1
+    # Becomes HawkesArrivalConfig.base_rate.
+    base_arrival_rate: float = 20.0
+    # Forwarded to PlatformConfig.
+    max_steps: int = 1000
+    # Seeds np.random.default_rng and PlatformConfig; None leaves the RNG unseeded.
+    seed: int | None = None
+
+def make_market_making_platform(cfg: MarketMakingConfig | None = None) -> Platform:
+    """Create a pre-configured market making platform.
+
+    Components:
+    - Mechanism: TwoSidedMechanism (bid-ask spread quoting)
+    - Arrivals: HawkesArrivalModel (clustered order flow)
+    - Execution: IntensityExecutionModel (distance-based fills)
+    - Market: GBMMarketModel (geometric Brownian motion mid-prices)
+    - Objective: PnL + spread_capture - inventory_risk
+
+    Args:
+        cfg: Configuration (uses defaults if None)
+
+    Returns:
+        Configured Platform instance
+    """
+    cfg = cfg or MarketMakingConfig()
+    rng = np.random.default_rng(cfg.seed)
+
+    instruments = make_instruments(cfg.n_instruments, (cfg.initial_mid*0.9, cfg.initial_mid*1.1),
+                                   (0.0, 0.0), InstrumentType.ASSET, rng)
+    instruments.position = np.zeros(cfg.n_instruments)
+
+    mechanism = TwoSidedMechanism(TwoSidedConfig())
+    arrival = HawkesArrivalModel(HawkesArrivalConfig(base_rate=cfg.base_arrival_rate))
+    execution = IntensityExecutionModel(IntensityConfig())
+    position = PositionModel(PositionConfig(
+        initial_position=0.0, min_position=-500, max_position=500,
+        holding_cost_rate=0.0))  # use inventory risk penalty instead
+    market = GBMMarketModel(GBMMarketConfig(mu=cfg.mu, sigma=cfg.sigma),
+                            initial=instruments.refs)
+    objective = market_making_objective(gamma=cfg.gamma, sigma=cfg.sigma)
+
+    return Platform(
+        instruments=instruments, mechanism=mechanism, arrival=arrival,
+        execution=execution, position=position, market=market, objective=objective,
+        cfg=PlatformConfig(n_instruments=cfg.n_instruments, max_steps=cfg.max_steps,
+                           seed=cfg.seed, log_level=LogLevel.AGG_ONLY)
+    )
diff --git a/lab/docs/Makefile b/lab/docs/Makefile
new file mode 100644
index 0000000..fe8e88c
--- /dev/null
+++ b/lab/docs/Makefile
@@ -0,0 +1,12 @@
+# Minimal makefile for Sphinx documentation.
+#
+# `?=` keeps any value already supplied on the command line or through the
+# environment, so SPHINXOPTS and SPHINXBUILD can be overridden by the caller.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# First target in the file, hence the default: print sphinx-build's help.
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: any other target name is passed to sphinx-build via its
+# "make mode" option (-M); $(O) carries extra options given as `make O=...`.
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/lab/docs/conf.py b/lab/docs/conf.py
new file mode 100644
index 0000000..0e39351
--- /dev/null
+++ b/lab/docs/conf.py
@@ -0,0 +1,39 @@
+# Sphinx configuration for the documentation build.
+import os
+import sys
+# Make the directory two levels up importable so autodoc can find the `lab` package.
+# NOTE(review): the relative path resolves against the current working
+# directory - presumably the directory holding this conf.py; confirm.
+sys.path.insert(0, os.path.abspath('../..'))
+
+# -- Project information ------------------------------------------------------
+project = 'Quote-Control Simulator'
+copyright = '2025, PHANTOM Research'
+author = 'PHANTOM Research'
+release = '0.1.0'
+
+# -- General configuration ----------------------------------------------------
+# Only extensions bundled with Sphinx are enabled.
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.napoleon',
+    'sphinx.ext.viewcode',
+    'sphinx.ext.intersphinx',
+    'sphinx.ext.autosummary',
+]
+
+templates_path = ['_templates']
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# -- HTML output --------------------------------------------------------------
+html_theme = 'alabaster'
+html_static_path = ['_static']
+
+# -- autodoc ------------------------------------------------------------------
+# Defaults applied to every autodoc directive in the .rst pages.
+autodoc_default_options = {
+    'members': True,
+    'undoc-members': True,
+    'show-inheritance': True,
+}
+
+# -- napoleon -----------------------------------------------------------------
+# Both Google-style and NumPy-style docstring parsing are enabled.
+napoleon_google_docstring = True
+napoleon_numpy_docstring = True
+napoleon_include_init_with_doc = True
+
+# -- intersphinx --------------------------------------------------------------
+# External inventories used to resolve cross-references to Python and NumPy.
+intersphinx_mapping = {
+    'python': ('https://docs.python.org/3', None),
+    'numpy': ('https://numpy.org/doc/stable/', None),
+}
+
+# Let autosummary generate its stub pages during the build.
+autosummary_generate = True
diff --git a/lab/docs/index.rst b/lab/docs/index.rst
new file mode 100644
index 0000000..b53fbba
--- /dev/null
+++ b/lab/docs/index.rst
@@ -0,0 +1,39 @@
+Quote-Control Simulator
+=======================
+
+Research-grade platform for dynamic pricing and market making experiments.
+
+The platform abstracts pricing as: **Quote → Arrival → Execution → Position**
+
+Supports multiple mechanisms:
+
+* **PostedPrice**: retail dynamic pricing
+* **TwoSided**: market making with bid-ask spreads
+* **Auction**: reserve/shading for auction settings
+
+Quick Start
+-----------
+
+.. code-block:: python
+
+   from lab.config import make_retail_platform
+   from lab.experiments import rollout, fixed_price_policy
+
+   platform = make_retail_platform()
+   policy = fixed_price_policy(platform.instruments.refs)
+   result = rollout(platform, policy, n_steps=100)
+   print(f"Total PnL: {result.total_pnl:.2f}")
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+   modules/outlet
+   modules/population
+   modules/experiments
+
+Indices
+-------
+
+* :ref:`genindex`
+* :ref:`modindex`
diff --git a/lab/docs/modules/experiments.rst b/lab/docs/modules/experiments.rst
new file mode 100644
index 0000000..c71ee36
--- /dev/null
+++ b/lab/docs/modules/experiments.rst
@@ -0,0 +1,14 @@
+Experiments
+===========
+
+Evaluation & OPE
+----------------
+
+.. automodule:: lab.experiments.eval
+   :members:
+
+Configuration
+-------------
+
+.. automodule:: lab.config
+   :members:
diff --git a/lab/docs/modules/outlet.rst b/lab/docs/modules/outlet.rst
new file mode 100644
index 0000000..9f3b8c3
--- /dev/null
+++ b/lab/docs/modules/outlet.rst
@@ -0,0 +1,77 @@
+Outlet (Core Simulator)
+=======================
+
+Types
+-----
+
+.. automodule:: lab.outlet.types
+   :members:
+
+Constants
+---------
+
+.. automodule:: lab.outlet.constants
+   :members:
+
+Protocols
+---------
+
+.. automodule:: lab.outlet.protocols
+   :members:
+
+Platform
+--------
+
+.. automodule:: lab.outlet.platform
+   :members:
+
+Stock & Position
+----------------
+
+.. automodule:: lab.outlet.stock
+   :members:
+
+Observation
+-----------
+
+.. automodule:: lab.outlet.observation
+   :members:
+
+Mechanisms
+----------
+
+Posted Price
+~~~~~~~~~~~~
+
+.. automodule:: lab.outlet.mechanisms.posted_price
+   :members:
+
+Two-Sided (Market Making)
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: lab.outlet.mechanisms.two_sided
+   :members:
+
+Auction
+~~~~~~~
+
+.. automodule:: lab.outlet.mechanisms.auction
+   :members:
+
+Objectives
+----------
+
+.. automodule:: lab.outlet.objectives.base
+   :members:
+
+.. automodule:: lab.outlet.objectives.penalties
+   :members:
+
+.. automodule:: lab.outlet.objectives.factory
+   :members:
+
+Math Utilities
+--------------
+
+.. automodule:: lab.outlet.math_util
+   :members:
diff --git a/lab/docs/modules/population.rst b/lab/docs/modules/population.rst
new file mode 100644
index 0000000..0b7ef75
--- /dev/null
+++ b/lab/docs/modules/population.rst
@@ -0,0 +1,20 @@
+Population Models
+=================
+
+Arrival Models
+--------------
+
+.. automodule:: lab.population.arrivals
+   :members:
+
+Execution Models
+----------------
+
+.. automodule:: lab.population.execution
+   :members:
+
+Competitor / Market Models
+--------------------------
+
+.. automodule:: lab.population.competitors
+   :members:
diff --git a/lab/experiments/__init__.py b/lab/experiments/__init__.py
new file mode 100644
index 0000000..ac427f3
--- /dev/null
+++ b/lab/experiments/__init__.py
@@ -0,0 +1,7 @@
+from .eval import (rollout, RolloutResult, compare_policies, compute_ips, OPEResult,
+                   fixed_price_policy, cost_plus_margin_policy, random_walk_policy, epsilon_greedy_policy)
+
+__all__ = [
+    'rollout', 'RolloutResult', 'compare_policies', 'compute_ips', 'OPEResult',
+    'fixed_price_policy', 'cost_plus_margin_policy', 'random_walk_policy', 'epsilon_greedy_policy',
+]
diff --git a/lab/experiments/eval.py b/lab/experiments/eval.py
new file mode 100644
index 0000000..8bc9330
--- /dev/null
+++ b/lab/experiments/eval.py
@@ -0,0 +1,213 @@
+"""
+Evaluation utilities for policy testing and off-policy evaluation.
+
+This module provides:
+- rollout: Run a policy on the platform for multiple steps
+- compare_policies: Compare multiple policies with statistics
+- Baseline policies: fixed_price, cost_plus_margin, random_walk, epsilon_greedy
+- OPE estimators: IPS and SNIPS for off-policy evaluation
+
+Example:
+    >>> from lab.config import make_retail_platform
+    >>> from lab.experiments.eval import rollout, fixed_price_policy
+    >>> platform = make_retail_platform()
+    >>> policy = fixed_price_policy(platform.instruments.refs)
+    >>> result = rollout(platform, policy, n_steps=100)
+    >>> print(f"Total PnL: {result.total_pnl:.2f}")
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Callable, Any
+import numpy as np
+from ..outlet.platform import Platform
+from ..outlet.types import StepResult, StepLogs, Quote
+
+# Policy signature: takes (observation_flat, timestep) -> (action_prices, propensity)
+Policy = Callable[[np.ndarray, int], tuple[np.ndarray, float]]
+
+@dataclass
+class RolloutResult:
+    """Results from a policy rollout.
+
+    Produced by rollout(); the three lists hold one entry per executed step.
+
+    Attributes:
+        rewards: Per-step rewards
+        metrics: Per-step StepMetrics objects
+        logs: Per-step StepLogs objects
+        total_reward: Sum of rewards
+        total_pnl: Sum of PnL from metrics
+        avg_conversion: Average conversion rate
+    """
+    rewards: list[float]
+    # Typed as Any; rollout() reads `.pnl` and `.conversion` from each entry.
+    metrics: list[Any]
+    logs: list[StepLogs]
+    # Summary statistics computed by rollout() from the lists above.
+    total_reward: float
+    total_pnl: float
+    avg_conversion: float
+
+def rollout(platform: Platform, policy: Policy, n_steps: int, seed: int | None = None) -> RolloutResult:
+    """Execute a policy on the platform for n_steps.
+
+    Args:
+        platform: The simulation platform
+        policy: Function (obs, t) -> (action, propensity)
+        n_steps: Number of steps to run
+        seed: Random seed for reproducibility
+
+    Returns:
+        RolloutResult with rewards, metrics, and summary statistics
+    """
+    result = platform.reset(seed)
+    rewards, metrics, logs = [], [], []
+
+    for t in range(n_steps):
+        obs_flat = result.obs.to_flat()
+        action, propensity = policy(obs_flat, t)
+        result = platform.step(action, propensity)
+        rewards.append(result.reward)
+        metrics.append(result.metrics)
+        logs.append(result.logs)
+        if result.terminated or result.truncated:
+            break
+
+    return RolloutResult(
+        rewards=rewards, metrics=metrics, logs=logs,
+        total_reward=sum(rewards),
+        total_pnl=sum(m.pnl for m in metrics),
+        avg_conversion=np.mean([m.conversion for m in metrics])
+    )
+
+# Baseline policies for comparison
+
+def fixed_price_policy(refs: np.ndarray) -> Policy:
+    """Policy that always quotes at reference prices."""
+    def policy(obs: np.ndarray, t: int) -> tuple[np.ndarray, float]:
+        return refs.copy(), 1.0
+    return policy
+
+def cost_plus_margin_policy(costs: np.ndarray, margin: float = 0.3) -> Policy:
+    """Policy that quotes at cost * (1 + margin)."""
+    prices = costs * (1 + margin)
+    def policy(obs: np.ndarray, t: int) -> tuple[np.ndarray, float]:
+        return prices.copy(), 1.0
+    return policy
+
+def random_walk_policy(refs: np.ndarray, volatility: float = 0.05,
+                       rng: np.random.Generator | None = None) -> Policy:
+    """Policy that performs a random walk around reference prices."""
+    rng = rng or np.random.default_rng()
+    prices = refs.copy()
+    def policy(obs: np.ndarray, t: int) -> tuple[np.ndarray, float]:
+        nonlocal prices
+        delta = rng.normal(0, volatility, len(prices))
+        prices = prices * (1 + delta)
+        prices = np.clip(prices, refs * 0.5, refs * 2.0)
+        return prices.copy(), 1.0
+    return policy
+
+def epsilon_greedy_policy(base_policy: Policy, refs: np.ndarray,
+                          epsilon: float = 0.1, rng: np.random.Generator | None = None) -> Policy:
+    """Wrap a policy with epsilon-greedy exploration."""
+    rng = rng or np.random.default_rng()
+    def policy(obs: np.ndarray, t: int) -> tuple[np.ndarray, float]:
+        if rng.random() < epsilon:
+            action = refs * rng.uniform(0.8, 1.2, len(refs))
+            return action, epsilon / len(refs)
+        else:
+            action, _ = base_policy(obs, t)
+            return action, 1 - epsilon
+    return policy
+
+# Off-Policy Evaluation (OPE)
+
+@dataclass
+class OPEResult:
+    """Results from off-policy evaluation.
+
+    Returned by compute_ips(); w_i denotes the importance weight of sample i
+    and r_i its reward.
+
+    Attributes:
+        ips_estimate: Inverse Propensity Scoring estimate
+        snips_estimate: Self-normalized IPS estimate (more stable)
+        n_samples: Number of samples used
+        effective_samples: Effective sample size (accounts for variance)
+    """
+    # sum(w_i * r_i) / n_samples
+    ips_estimate: float
+    # sum(w_i * r_i) / (sum(w_i) + 1e-8)
+    snips_estimate: float
+    # Number of rewards passed to compute_ips.
+    n_samples: int
+    # sum(w_i)^2 / (sum(w_i^2) + 1e-8)
+    effective_samples: float
+
+def compute_ips(logs: list[StepLogs], rewards: list[float],
+                target_policy: Policy, behavior_propensities: list[float] | None = None) -> OPEResult:
+    """Compute IPS and SNIPS estimators for off-policy evaluation.
+
+    Uses logged propensities to estimate expected reward under a target
+    policy from data collected under a behavior policy.
+
+    Args:
+        logs: Step logs containing propensities
+        rewards: Observed rewards from behavior policy
+        target_policy: Policy to evaluate (not currently used, assumes deterministic)
+        behavior_propensities: Override propensities if not in logs
+
+    Returns:
+        OPEResult with IPS, SNIPS estimates and sample statistics
+    """
+    if behavior_propensities is None:
+        # extract from logs
+        behavior_propensities = []
+        for log in logs:
+            if log.executions:
+                avg_prop = np.mean([e.propensity for e in log.executions])
+            else:
+                avg_prop = 1.0
+            behavior_propensities.append(avg_prop)
+
+    # compute importance weights
+    weights = []
+    for i, (log, bp) in enumerate(zip(logs, behavior_propensities)):
+        # target propensity would need obs reconstruction - simplified here
+        tp = 1.0  # assume deterministic target
+        w = tp / (bp + 1e-8)
+        weights.append(w)
+
+    weights = np.array(weights)
+    rewards = np.array(rewards)
+
+    # IPS estimate
+    ips = np.sum(weights * rewards) / len(rewards)
+
+    # SNIPS (self-normalized)
+    snips = np.sum(weights * rewards) / (np.sum(weights) + 1e-8)
+
+    # effective sample size
+    ess = (np.sum(weights) ** 2) / (np.sum(weights ** 2) + 1e-8)
+
+    return OPEResult(ips_estimate=ips, snips_estimate=snips,
+                     n_samples=len(rewards), effective_samples=ess)
+
+def compare_policies(platform: Platform, policies: dict[str, Policy],
+                     n_steps: int = 100, n_runs: int = 5, seed: int = 42) -> dict[str, dict]:
+    """Compare multiple policies with statistical summary.
+
+    Args:
+        platform: Simulation platform
+        policies: Dict mapping policy names to policy functions
+        n_steps: Steps per rollout
+        n_runs: Number of rollouts per policy (different seeds)
+        seed: Base random seed
+
+    Returns:
+        Dict mapping policy names to result dicts with mean/std statistics
+    """
+    results = {}
+    for name, policy in policies.items():
+        run_results = []
+        for i in range(n_runs):
+            r = rollout(platform, policy, n_steps, seed=seed + i)
+            run_results.append(r)
+
+        results[name] = {
+            'mean_reward': np.mean([r.total_reward for r in run_results]),
+            'std_reward': np.std([r.total_reward for r in run_results]),
+            'mean_pnl': np.mean([r.total_pnl for r in run_results]),
+            'mean_conversion': np.mean([r.avg_conversion for r in run_results]),
+        }
+    return results
diff --git a/lab/outlet/__init__.py b/lab/outlet/__init__.py
new file mode 100644
index 0000000..11a8d76
--- /dev/null
+++ b/lab/outlet/__init__.py
@@ -0,0 +1,17 @@
+from .constants import Side, MechanismType, InstrumentType, OpportunityType, EventType, LogLevel
+from .types import (Instrument, InstrumentSet, Quote, Opportunity, Execution,
+                    StepEvent, StepLogs, StepMetrics, MarketState, HiddenState, Observation, StepResult)
+from .stock import PositionModel, PositionConfig, make_instruments
+from .platform import Platform, PlatformConfig
+from .observation import DefaultObservationBuilder, ObservationConfig
+from .mechanisms import PostedPriceMechanism, TwoSidedMechanism, AuctionMechanism
+
+__all__ = [
+    'Side', 'MechanismType', 'InstrumentType', 'OpportunityType', 'EventType', 'LogLevel',
+    'Instrument', 'InstrumentSet', 'Quote', 'Opportunity', 'Execution',
+    'StepEvent', 'StepLogs', 'StepMetrics', 'MarketState', 'HiddenState', 'Observation', 'StepResult',
+    'PositionModel', 'PositionConfig', 'make_instruments',
+    'Platform', 'PlatformConfig',
+    'DefaultObservationBuilder', 'ObservationConfig',
+    'PostedPriceMechanism', 'TwoSidedMechanism', 'AuctionMechanism',
+]
diff --git a/lab/outlet/constants.py b/lab/outlet/constants.py
new file mode 100644
index 0000000..27c7da2
--- /dev/null
+++ b/lab/outlet/constants.py
@@ -0,0 +1,83 @@
+"""
+Constants and enumerations for the Quote-Control simulator.
+
+This module defines the core enums used throughout the platform to ensure
+type safety and consistent semantics across different pricing mechanisms.
+"""
+from enum import Enum, auto
+
+class Side(Enum):
+    """Transaction side indicator.
+
+    Attributes:
+        BUY: Buyer-initiated transaction (customer purchases, market buy order)
+        SELL: Seller-initiated transaction (market sell order, short sale)
+    """
+    # auto() numbers members 1, 2, ... in definition order.
+    BUY = auto()
+    SELL = auto()
+
+class MechanismType(Enum):
+    """Pricing mechanism type defining how quotes translate to executions.
+
+    Attributes:
+        POSTED_PRICE: Single posted price per instrument (retail dynamic pricing)
+        TWO_SIDED_QUOTE: Bid-ask spread quoting (market making, liquidity provision)
+        AUCTION: Reserve price or bid shading (ad auctions, marketplaces)
+    """
+    # Values come from auto(), i.e. they follow definition order starting at 1.
+    POSTED_PRICE = auto()
+    TWO_SIDED_QUOTE = auto()
+    AUCTION = auto()
+
+class InstrumentType(Enum):
+    """Type of instrument being priced.
+
+    Attributes:
+        SKU: Retail product with inventory constraints
+        ASSET: Financial instrument with position limits
+        LOAN: Credit product with interest rate pricing
+        SUBSCRIPTION: Recurring service with periodic fees
+    """
+    # Values come from auto(), i.e. they follow definition order starting at 1.
+    SKU = auto()
+    ASSET = auto()
+    LOAN = auto()
+    SUBSCRIPTION = auto()
+
+class OpportunityType(Enum):
+    """Type of arrival opportunity.
+
+    Attributes:
+        SESSION: Retail browsing session with potential purchase intent
+        MARKET_ORDER: Financial market order arrival (buy or sell)
+        REQUEST: Service or credit request requiring quote response
+    """
+    # Values come from auto(), i.e. they follow definition order starting at 1.
+    SESSION = auto()
+    MARKET_ORDER = auto()
+    REQUEST = auto()
+
+class EventType(Enum):
+    """Type of logged event during simulation.
+
+    Attributes:
+        ARRIVAL: New opportunity arrived in the system
+        EXPOSURE: Quote was shown to an arrival
+        EXECUTION: Transaction was executed
+        ABANDON: Opportunity abandoned without execution
+        CANCEL: Pending order was cancelled
+    """
+    # Values come from auto(), i.e. they follow definition order starting at 1.
+    ARRIVAL = auto()
+    EXPOSURE = auto()
+    EXECUTION = auto()
+    ABANDON = auto()
+    CANCEL = auto()
+
+class LogLevel(Enum):
+    """Verbosity level for step logging.
+
+    Attributes:
+        NONE: No logging, fastest execution
+        AGG_ONLY: Only aggregate statistics per step
+        FULL: Full event-level logging with propensities for OPE
+    """
+    # auto() numbers members 1, 2, 3 in definition order, so the values
+    # happen to increase with verbosity.
+    NONE = auto()
+    AGG_ONLY = auto()
+    FULL = auto()
diff --git a/lab/outlet/gym_wrapper.py b/lab/outlet/gym_wrapper.py
new file mode 100644
index 0000000..790adcf
--- /dev/null
+++ b/lab/outlet/gym_wrapper.py
@@ -0,0 +1,86 @@
+"""
+Gymnasium-compatible wrapper for the Quote-Control platform.
+
+Provides a standard Gym interface for RL training:
+- observation_space: Box space with flattened observation
+- action_space: Box space with price multipliers [0.5, 2.0]
+- reset(), step(), render(), close() methods
+
+Example:
+    >>> from lab.config import make_retail_platform
+    >>> from lab.outlet.gym_wrapper import QuoteGymEnv
+    >>> env = QuoteGymEnv(make_retail_platform())
+    >>> obs, info = env.reset()
+    >>> obs, reward, done, truncated, info = env.step(env.action_space.sample())
+"""
+from __future__ import annotations
+from typing import Any
+import numpy as np
+
+try:
+    import gymnasium as gym
+    from gymnasium import spaces
+    HAS_GYM = True
+except ImportError:
+    HAS_GYM = False
+
+from .platform import Platform, PlatformConfig
+from .types import Quote, InstrumentSet, StepResult
+
+class QuoteGymEnv:
+    """Gymnasium-compatible environment wrapper.
+
+    Wraps a Platform instance with standard Gym interface.
+    Actions are price multipliers in [0.5, 2.0] applied to reference prices.
+    Observations are flattened numpy arrays containing quotes, fills, exposures.
+    """
+
+    def __init__(self, platform: Platform):
+        if not HAS_GYM:
+            raise ImportError("gymnasium required for QuoteGymEnv")
+        self.platform = platform
+        self.n = platform.instruments.n
+        self._last_result: StepResult | None = None
+
+        # action space: price adjustments as multipliers [0.5, 2.0]
+        self.action_space = spaces.Box(low=0.5, high=2.0, shape=(self.n,), dtype=np.float32)
+
+        # observation space
+        obs_dim = self.n * 4  # quotes + fills + exposures + position
+        if platform.market:
+            obs_dim += self.n  # competitor quotes
+        self.observation_space = spaces.Box(low=-np.inf, high=np.inf,
+                                            shape=(obs_dim,), dtype=np.float32)
+
+    def reset(self, seed: int | None = None, options: dict | None = None) -> tuple[np.ndarray, dict]:
+        result = self.platform.reset(seed)
+        self._last_result = result
+        return result.obs.to_flat().astype(np.float32), result.info
+
+    def step(self, action: np.ndarray) -> tuple[np.ndarray, float, bool, bool, dict]:
+        # convert action (multipliers) to absolute prices
+        refs = self.platform.instruments.refs
+        prices = refs * action
+        result = self.platform.step(prices)
+        self._last_result = result
+        return (result.obs.to_flat().astype(np.float32), result.reward,
+                result.terminated, result.truncated, result.info)
+
+    def render(self) -> None:
+        if self._last_result:
+            m = self._last_result.metrics
+            print(f"t={self.platform._t} pnl={m.pnl:.2f} units={m.units_traded:.0f} "
+                  f"conv={m.conversion:.3f} vol={m.volatility:.3f}")
+
+    def close(self) -> None:
+        pass
+
+def make_env(platform: Platform) -> QuoteGymEnv:
+    """Return a QuoteGymEnv wrapping ``platform``.
+
+    Raises:
+        ImportError: If gymnasium is not installed (raised by QuoteGymEnv)
+    """
+    return QuoteGymEnv(platform)
+
+if HAS_GYM:
+    # register if gymnasium available
+    try:
+        gym.register(id='QuoteControl-v0', entry_point='outlet.gym_wrapper:QuoteGymEnv')
+    except:
+        pass  # already registered or other issue
diff --git a/lab/outlet/math_util.py b/lab/outlet/math_util.py
new file mode 100644
index 0000000..da78745
--- /dev/null
+++ b/lab/outlet/math_util.py
@@ -0,0 +1,57 @@
+"""
+Numerical utilities for stable computation.
+
+This module provides numerically stable implementations of common operations:
+- safe_exp, safe_log: Avoid overflow/underflow
+- softmax: Numerically stable softmax
+- sigmoid, clamp: Standard transformations
+- intensity_decay: Avellaneda-Stoikov fill intensity
+- inventory_penalty: Quadratic inventory risk
+- poisson_arrivals, hawkes_intensity: Arrival process helpers
+
+The element-wise helpers accept both scalars and numpy arrays; softmax and
+hawkes_intensity expect numpy arrays.
+"""
+import numpy as np
+
+EPS = 1e-8  # small constant to avoid division by zero
+MAX_EXP = 700.0  # maximum safe exponent to avoid overflow
+
+def safe_exp(x: np.ndarray | float) -> np.ndarray | float:
+    return np.exp(np.clip(x, -MAX_EXP, MAX_EXP))
+
+def safe_log(x: np.ndarray | float) -> np.ndarray | float:
+    return np.log(np.maximum(x, EPS))
+
+def clamp(x: np.ndarray | float, lo: float, hi: float) -> np.ndarray | float:
+    return np.clip(x, lo, hi)
+
+def sigmoid(x: np.ndarray | float) -> np.ndarray | float:
+    return 1.0 / (1.0 + safe_exp(-x))
+
+def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
+    x_max = np.max(x, axis=axis, keepdims=True)
+    exp_x = safe_exp(x - x_max)
+    return exp_x / (np.sum(exp_x, axis=axis, keepdims=True) + EPS)
+
+def geometric_series(base: float, ratio: float, n: int) -> np.ndarray:
+    return base * (ratio ** np.arange(n))
+
+def ema(old: float, new: float, alpha: float = 0.1) -> float:
+    return alpha * new + (1 - alpha) * old
+
+def intensity_decay(distance: float, kappa: float = 1.0) -> float:
+    """Avellaneda-Stoikov style fill intensity decay with quote distance"""
+    return safe_exp(-kappa * distance)
+
+def inventory_penalty(q: float, gamma: float = 0.1, sigma: float = 1.0) -> float:
+    """Quadratic inventory risk penalty"""
+    return gamma * sigma**2 * q**2 / 2
+
+def poisson_arrivals(rate: float, dt: float, rng: np.random.Generator) -> int:
+    return rng.poisson(rate * dt)
+
+def hawkes_intensity(base: float, history: np.ndarray, alpha: float, beta: float, t: float) -> float:
+    """Self-exciting Hawkes process intensity"""
+    if len(history) == 0: return base
+    decays = safe_exp(-beta * (t - history[history < t]))
+    return base + alpha * np.sum(decays)
diff --git a/lab/outlet/mechanisms/__init__.py b/lab/outlet/mechanisms/__init__.py
new file mode 100644
index 0000000..3c3c36e
--- /dev/null
+++ b/lab/outlet/mechanisms/__init__.py
@@ -0,0 +1,5 @@
+from .posted_price import PostedPriceMechanism
+from .two_sided import TwoSidedMechanism
+from .auction import AuctionMechanism
+
+__all__ = ['PostedPriceMechanism', 'TwoSidedMechanism', 'AuctionMechanism']
diff --git a/lab/outlet/mechanisms/auction.py b/lab/outlet/mechanisms/auction.py
new file mode 100644
index 0000000..2260aef
--- /dev/null
+++ b/lab/outlet/mechanisms/auction.py
@@ -0,0 +1,73 @@
+"""
+Auction mechanism for reserve pricing and bid shading.
+
+In this mechanism, the agent sets reserve prices that affect
+win probability and clearing prices. Used for ad auctions,
+marketplace auctions, and similar settings.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+import numpy as np
+from ..types import Quote, Opportunity, Execution, InstrumentSet, MarketState
+from ..constants import Side
+from ..math_util import clamp, sigmoid
+
+@dataclass
+class AuctionConfig:
+    """Configuration for auction mechanism.
+
+    Attributes:
+        min_reserve: Minimum reserve price (lower clamp applied to quoted reserves)
+        max_reserve: Maximum reserve price (upper clamp applied to quoted reserves)
+        base_win_prob: Scale on the win probability; it multiplies sigmoid(0) (0.5 for a logistic) at the reference reserve
+        sensitivity: How much higher reserves reduce win probability
+    """
+    min_reserve: float = 0.0
+    max_reserve: float = 100.0
+    base_win_prob: float = 0.3
+    sensitivity: float = 2.0
+
+class AuctionMechanism:
+    """Auction mechanism for reserve pricing.
+
+    The agent sets reserve prices that affect:
+    - Win probability: higher reserves reduce chance of winning
+    - Clearing price: never below the reserve; capped by the simulated max bid when that is higher
+
+    Win probability: base_win_prob * sigmoid(-sensitivity * (reserve - ref) / (ref + 1e-8))
+    Clearing price: max(reserve, min(max_bid, reserve + random_increment))
+
+    Only BUY-side opportunities are processed (auction wins); other sides return None.
+    """
+
+    def __init__(self, cfg: AuctionConfig | None = None):
+        self.cfg = cfg or AuctionConfig()  # fall back to default settings when no config is given
+
+    def apply_quote(self, quote: Quote, instruments: InstrumentSet,
+                    rng: np.random.Generator) -> Quote:
+        reserves = clamp(quote.prices, self.cfg.min_reserve, self.cfg.max_reserve)  # instruments and rng are not used here
+        return Quote(prices=reserves, propensity=quote.propensity, metadata=quote.metadata)
+
+    def process_opportunity(self, opp: Opportunity, quote: Quote,
+                            instruments: InstrumentSet, market: MarketState | None,
+                            rng: np.random.Generator) -> Execution | None:
+        if opp.side != Side.BUY: return None
+        idx = int(opp.instrument_id)
+        reserve = float(quote.prices[idx])
+        ref = instruments.refs[idx]
+
+        # win probability decreases with higher reserve
+        relative_reserve = (reserve - ref) / (ref + 1e-8)  # 1e-8 guards against division by a zero reference price
+        win_prob = self.cfg.base_win_prob * sigmoid(-self.cfg.sensitivity * relative_reserve)
+
+        if rng.random() > win_prob: return None  # lost the auction: no execution
+
+        # clearing price is between reserve and some max bid (simulated)
+        max_bid = ref * (1 + rng.exponential(0.2))  # reference price plus an exponential markup (scale 0.2)
+        clearing = max(reserve, min(max_bid, reserve + rng.exponential(0.1) * ref))
+
+        return Execution(
+            opportunity_id=opp.id, instrument_id=opp.instrument_id,
+            side=opp.side, size_requested=opp.size, size_filled=opp.size,
+            price=clearing, propensity=quote.propensity * win_prob, t=opp.t  # always a full fill; propensity scaled by win_prob
+        )
diff --git a/lab/outlet/mechanisms/posted_price.py b/lab/outlet/mechanisms/posted_price.py
new file mode 100644
index 0000000..92bac12
--- /dev/null
+++ b/lab/outlet/mechanisms/posted_price.py
@@ -0,0 +1,84 @@
+"""
+Posted price mechanism for retail dynamic pricing.
+
+In this mechanism, the agent posts a single price per instrument.
+Buyers decide whether to purchase based on the posted price.
+This is the standard e-commerce dynamic pricing model.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+import numpy as np
+from ..types import Quote, Opportunity, Execution, InstrumentSet, MarketState
+from ..constants import Side
+from ..math_util import clamp
+
+@dataclass
+class PostedPriceConfig:
+    """Configuration for posted price mechanism.
+
+    Attributes:
+        min_price: Absolute minimum price
+        max_price: Absolute maximum price
+        max_delta_pct: Maximum price change per step as fraction of the previous price (0.2 = +/-20%)
+        min_margin_pct: Minimum markup over cost; the price floor is cost * (1 + min_margin_pct)
+        round_to: Price rounding granularity (None or 0 = no rounding)
+    """
+    min_price: float = 0.01
+    max_price: float = 1000.0
+    max_delta_pct: float = 0.2
+    min_margin_pct: float = 0.05
+    round_to: float | None = 0.01
+
+class PostedPriceMechanism:
+    """Posted price mechanism for retail dynamic pricing.
+
+    One price per product. Constraints, in application order (a later one wins on conflict):
+    - Margin at least min_margin_pct above cost
+    - Prices within [min_price, max_price]
+    - Price changes limited to max_delta_pct of the previous price
+    - Prices rounded to round_to granularity
+
+    Only BUY-side opportunities are processed (customers purchasing).
+    """
+
+    def __init__(self, cfg: PostedPriceConfig | None = None):
+        self.cfg = cfg or PostedPriceConfig()
+
+    def apply_quote(self, quote: Quote, instruments: InstrumentSet,
+                    rng: np.random.Generator) -> Quote:
+        """Return a new Quote with constrained prices; the input quote is not modified (rng is unused)."""
+        prices = quote.prices.copy()
+        costs = instruments.costs
+        c = self.cfg
+
+        # enforce min margin
+        min_prices = costs * (1 + c.min_margin_pct)
+        prices = np.maximum(prices, min_prices)
+
+        # enforce absolute bounds
+        prices = clamp(prices, c.min_price, c.max_price)
+
+        # enforce max delta if we have history (the key may be present but None before the first reset)
+        prev = quote.metadata.get('prev_prices')
+        if prev is not None:
+            max_change = prev * c.max_delta_pct
+            prices = clamp(prices, prev - max_change, prev + max_change)
+
+        # round prices
+        if c.round_to:
+            prices = np.round(prices / c.round_to) * c.round_to
+
+        return Quote(prices=prices, propensity=quote.propensity,
+                     metadata={**quote.metadata, 'prev_prices': prices.copy()})  # copy: metadata must not alias prices
+
+    def process_opportunity(self, opp: Opportunity, quote: Quote,
+                            instruments: InstrumentSet, market: MarketState | None,
+                            rng: np.random.Generator) -> Execution | None:
+        if opp.side != Side.BUY: return None  # posted price is buy-only
+        idx = int(opp.instrument_id)
+        price = float(quote.prices[idx])
+        return Execution(
+            opportunity_id=opp.id, instrument_id=opp.instrument_id,
+            side=opp.side, size_requested=opp.size, size_filled=opp.size,
+            price=price, propensity=quote.propensity, t=opp.t  # posted prices always fill in full at the quoted price
+        )
diff --git a/lab/outlet/mechanisms/two_sided.py b/lab/outlet/mechanisms/two_sided.py
new file mode 100644
index 0000000..166f4d9
--- /dev/null
+++ b/lab/outlet/mechanisms/two_sided.py
@@ -0,0 +1,89 @@
+"""
+Two-sided quoting mechanism for market making.
+
+In this mechanism, the agent posts both bid and ask prices.
+Execution depends on the distance from the market mid-price.
+This models liquidity provision in financial markets.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+import numpy as np
+from ..types import Quote, Opportunity, Execution, InstrumentSet, MarketState
+from ..constants import Side
+from ..math_util import clamp, intensity_decay
+
+@dataclass
+class TwoSidedConfig:
+    """Configuration for two-sided quoting mechanism.
+
+    Attributes:
+        min_spread: Minimum bid-ask spread (same units as price; requested spreads are clamped up to it)
+        max_spread: Maximum bid-ask spread (same units as price; requested spreads are clamped down to it)
+        min_price: Absolute minimum price (floor for mid prices and bids)
+        max_price: Absolute maximum price (cap for mid prices and asks)
+        fill_kappa: Intensity decay parameter (higher = faster decay with distance)
+    """
+    min_spread: float = 0.01
+    max_spread: float = 0.5
+    min_price: float = 0.01
+    max_price: float = 10000.0
+    fill_kappa: float = 1.5
+
+class TwoSidedMechanism:
+    """Two-sided quoting mechanism for market making.
+
+    The agent posts bid (buy) and ask (sell) prices around a mid-point.
+    Fill probability decays exponentially with how far the quote sits on the passive side
+    of the mid-price (Avellaneda-Stoikov intensity); quotes at or through the mid are not penalised.
+
+    Both BUY and SELL opportunities are processed:
+    - BUY: customer buys at agent's ask price
+    - SELL: customer sells at agent's bid price
+    """
+
+    def __init__(self, cfg: TwoSidedConfig | None = None):
+        self.cfg = cfg or TwoSidedConfig()
+
+    def apply_quote(self, quote: Quote, instruments: InstrumentSet,
+                    rng: np.random.Generator) -> Quote:
+        prices = quote.prices.copy()
+        spreads = quote.spreads.copy() if quote.spreads is not None else np.full_like(prices, 0.02)  # 0.02 default spread
+        c = self.cfg
+
+        prices = clamp(prices, c.min_price, c.max_price)
+        spreads = clamp(spreads, c.min_spread, c.max_spread)
+
+        # ensure bids < asks
+        half_spread = spreads / 2
+        bids = prices - half_spread
+        asks = prices + half_spread
+        bids = np.maximum(bids, c.min_price)
+        asks = np.minimum(asks, c.max_price)
+        spreads = asks - bids  # recomputed: clipping at a price bound can shrink the spread
+        prices = (bids + asks) / 2  # mid is re-centred between the clipped bid and ask
+
+        return Quote(prices=prices, spreads=spreads, propensity=quote.propensity,
+                     metadata=quote.metadata)
+
+    def process_opportunity(self, opp: Opportunity, quote: Quote,
+                            instruments: InstrumentSet, market: MarketState | None,
+                            rng: np.random.Generator) -> Execution | None:
+        idx = int(opp.instrument_id)
+        mid = market.mid_prices[idx] if market and market.mid_prices is not None else quote.prices[idx]
+
+        if opp.side == Side.BUY:
+            price = float(quote.asks[idx]) if quote.asks is not None else float(quote.prices[idx])
+            distance = price - mid  # positive when the ask sits above mid (passive side)
+        else:
+            price = float(quote.bids[idx]) if quote.bids is not None else float(quote.prices[idx])
+            distance = mid - price  # positive when the bid sits below mid (passive side)
+
+        # probabilistic fill: only distance on the passive side of mid lowers the fill probability
+        fill_prob = intensity_decay(max(distance, 0.0), self.cfg.fill_kappa)
+        if rng.random() > fill_prob: return None
+
+        return Execution(
+            opportunity_id=opp.id, instrument_id=opp.instrument_id,
+            side=opp.side, size_requested=opp.size, size_filled=opp.size,
+            price=price, propensity=quote.propensity * fill_prob, t=opp.t
+        )
diff --git a/lab/outlet/objectives/__init__.py b/lab/outlet/objectives/__init__.py
new file mode 100644
index 0000000..063b7a5
--- /dev/null
+++ b/lab/outlet/objectives/__init__.py
@@ -0,0 +1,11 @@
+from .base import BaseObjective, CompositeObjective
+from .penalties import (PnLObjective, VolatilityPenalty, HoldingCostPenalty,
+                        LostOpportunityCostPenalty, InventoryRiskPenalty, SpreadCaptureReward)
+from .factory import make_objective, make_composite, retail_objective, market_making_objective
+
+__all__ = [
+    'BaseObjective', 'CompositeObjective',
+    'PnLObjective', 'VolatilityPenalty', 'HoldingCostPenalty',
+    'LostOpportunityCostPenalty', 'InventoryRiskPenalty', 'SpreadCaptureReward',
+    'make_objective', 'make_composite', 'retail_objective', 'market_making_objective',
+]
diff --git a/lab/outlet/objectives/base.py b/lab/outlet/objectives/base.py
new file mode 100644
index 0000000..49847aa
--- /dev/null
+++ b/lab/outlet/objectives/base.py
@@ -0,0 +1,48 @@
+"""
+Base classes for reward objectives.
+
+Objectives compute scalar rewards from step metrics. The CompositeObjective
+allows combining multiple objectives with weights for multi-objective optimization.
+"""
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from ..types import Quote, InstrumentSet, StepMetrics, HiddenState, Observation
+
+class BaseObjective(ABC):
+    """Abstract interface for reward objectives; both methods take the same five step arguments.
+
+    Subclasses must implement reward() (a scalar) and breakdown() (named float terms keyed by str).
+    """
+
+    @abstractmethod
+    def reward(self, quote: Quote, instruments: InstrumentSet,
+               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float: ...  # scalar reward for one step
+
+    @abstractmethod
+    def breakdown(self, quote: Quote, instruments: InstrumentSet,
+                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]: ...  # named reward terms
+
+class CompositeObjective(BaseObjective):
+    """Weighted sum of multiple objectives.
+
+    Allows combining multiple reward terms (e.g., PnL - holding_cost - volatility).
+    breakdown() sums the weighted terms that share a key instead of keeping only the last one.
+    Args:
+        objectives: List of (objective, weight) tuples
+    """
+
+    def __init__(self, objectives: list[tuple[BaseObjective, float]]):
+        self.objectives = objectives
+
+    def reward(self, quote: Quote, instruments: InstrumentSet,
+               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
+        return sum(w * obj.reward(quote, instruments, metrics, hidden, obs)
+                   for obj, w in self.objectives)
+
+    def breakdown(self, quote: Quote, instruments: InstrumentSet,
+                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
+        bd: dict[str, float] = {}
+        for obj, w in self.objectives:
+            for k, v in obj.breakdown(quote, instruments, metrics, hidden, obs).items():
+                bd[k] = bd.get(k, 0.0) + w * v  # accumulate: two components may report the same key
+        return bd
diff --git a/lab/outlet/objectives/factory.py b/lab/outlet/objectives/factory.py
new file mode 100644
index 0000000..6e75294
--- /dev/null
+++ b/lab/outlet/objectives/factory.py
@@ -0,0 +1,82 @@
+"""
+Factory functions for creating objectives.
+
+Provides:
+- make_objective: Create single objective by name
+- make_composite: Create weighted combination of objectives
+- retail_objective: Default objective for retail pricing
+- market_making_objective: Default objective for market making
+"""
+from __future__ import annotations
+from .base import BaseObjective, CompositeObjective
+from .penalties import (PnLObjective, VolatilityPenalty, HoldingCostPenalty,
+                        LostOpportunityCostPenalty, InventoryRiskPenalty, SpreadCaptureReward)
+
+REGISTRY: dict[str, type[BaseObjective]] = {  # objective name -> class, looked up by make_objective()
+    'pnl': PnLObjective,
+    'volatility': VolatilityPenalty,
+    'holding_cost': HoldingCostPenalty,
+    'lost_opportunity': LostOpportunityCostPenalty,
+    'inventory_risk': InventoryRiskPenalty,
+    'spread_capture': SpreadCaptureReward,
+}
+
+def make_objective(name: str, **kwargs) -> BaseObjective:
+    """Instantiate the objective class registered under ``name``.
+
+    Args:
+        name: Registry key (pnl, volatility, holding_cost, lost_opportunity,
+              inventory_risk, spread_capture)
+        **kwargs: Forwarded unchanged to the objective's constructor
+
+    Raises:
+        ValueError: If ``name`` is not a key of REGISTRY
+    """
+    objective_cls = REGISTRY.get(name)
+    if objective_cls is None: raise ValueError(f"Unknown objective: {name}. Available: {list(REGISTRY.keys())}")
+    return objective_cls(**kwargs)
+
+def make_composite(spec: list[tuple[str, float, dict]] | dict[str, float]) -> CompositeObjective:
+    """Build a CompositeObjective from a compact specification.
+
+    Args:
+        spec: One of:
+            - dict of {name: weight}; objectives use their default constructor arguments
+            - list of (name, weight, kwargs) tuples; kwargs go to the objective constructor
+
+    Returns:
+        CompositeObjective holding one (objective, weight) pair per entry, in spec order
+    """
+    if isinstance(spec, dict):
+        # simple form: every objective is built with its default constructor arguments
+        objectives = [(make_objective(name), weight) for name, weight in spec.items()]
+    else:
+        # full form: each entry carries its own constructor kwargs
+        objectives = [(make_objective(name, **kwargs), weight)
+                      for name, weight, kwargs in spec]
+    return CompositeObjective(objectives)
+
+def retail_objective(volatility_weight: float = 0.1, holding_weight: float = 0.5,
+                     stockout_weight: float = 0.3) -> CompositeObjective:
+    """Default composite objective for retail dynamic pricing.
+
+    Reward = PnL - volatility_weight*volatility - holding_weight*holding_cost
+             - stockout_weight*lost_opportunity
+    """
+    # weights stay positive: each registered penalty already returns -scale * metric
+    weights = {
+        'pnl': 1.0, 'volatility': volatility_weight,
+        'holding_cost': holding_weight, 'lost_opportunity': stockout_weight,
+    }
+    return make_composite(weights)
+
+def market_making_objective(gamma: float = 0.1, sigma: float = 1.0) -> CompositeObjective:
+    """Default composite objective for market making.
+
+    Reward = PnL + 0.5*spread_capture - inventory_risk(gamma, sigma)
+    """
+    # the inventory term keeps weight 1.0 because InventoryRiskPenalty
+    # already returns the negated penalty
+    components = [(PnLObjective(), 1.0), (SpreadCaptureReward(), 0.5),
+                  (InventoryRiskPenalty(gamma=gamma, sigma=sigma), 1.0)]
+    return CompositeObjective(components)
diff --git a/lab/outlet/objectives/penalties.py b/lab/outlet/objectives/penalties.py
new file mode 100644
index 0000000..916e0e2
--- /dev/null
+++ b/lab/outlet/objectives/penalties.py
@@ -0,0 +1,101 @@
+"""
+Standard objective components and penalties.
+
+This module provides common reward terms:
+- PnLObjective: Basic profit and loss
+- VolatilityPenalty: Penalize price volatility for UX
+- HoldingCostPenalty: Inventory holding cost
+- LostOpportunityCostPenalty: Stockout/missed fill cost
+- InventoryRiskPenalty: Quadratic inventory risk (market making)
+- SpreadCaptureReward: Bid-ask spread capture (market making)
+"""
+from __future__ import annotations
+import numpy as np
+from .base import BaseObjective
+from ..types import Quote, InstrumentSet, StepMetrics, HiddenState, Observation
+from ..math_util import inventory_penalty
+
+class PnLObjective(BaseObjective):
+    """Reward equal to the step's profit and loss, read from ``metrics.pnl``."""
+
+    def reward(self, quote: Quote, instruments: InstrumentSet,
+               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
+        return metrics.pnl
+
+    def breakdown(self, quote: Quote, instruments: InstrumentSet,
+                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
+        return dict(pnl=metrics.pnl, revenue=metrics.revenue, cost=metrics.cost)
+
+class VolatilityPenalty(BaseObjective):
+    """Negative reward proportional to ``metrics.volatility`` (price stability for user experience)."""
+
+    def __init__(self, scale: float = 1.0):
+        self.scale = scale
+
+    def reward(self, quote: Quote, instruments: InstrumentSet,
+               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
+        return -(self.scale * metrics.volatility)
+
+    def breakdown(self, quote: Quote, instruments: InstrumentSet,
+                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
+        return dict(volatility_penalty=-(self.scale * metrics.volatility))
+
+class HoldingCostPenalty(BaseObjective):
+    """Negative reward proportional to ``metrics.position_cost`` (inventory holding cost)."""
+
+    def __init__(self, scale: float = 1.0):
+        self.scale = scale
+
+    def reward(self, quote: Quote, instruments: InstrumentSet,
+               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
+        return -(self.scale * metrics.position_cost)
+
+    def breakdown(self, quote: Quote, instruments: InstrumentSet,
+                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
+        return dict(holding_cost_penalty=-(self.scale * metrics.position_cost))
+
+class LostOpportunityCostPenalty(BaseObjective):
+    """Negative reward proportional to ``metrics.lost_opportunity`` (stockouts or missed fills)."""
+
+    def __init__(self, scale: float = 1.0):
+        self.scale = scale
+
+    def reward(self, quote: Quote, instruments: InstrumentSet,
+               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
+        return -(self.scale * metrics.lost_opportunity)
+
+    def breakdown(self, quote: Quote, instruments: InstrumentSet,
+                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
+        return dict(lost_opportunity_penalty=-(self.scale * metrics.lost_opportunity))
+
+class InventoryRiskPenalty(BaseObjective):
+    """Avellaneda-Stoikov style quadratic inventory risk, returned as a negative reward.
+
+    Penalty = gamma * sigma^2 * q^2 / 2, where q is the sum of ``obs.position``.
+    The reward is 0.0 when the observation carries no position.
+    """
+
+    def __init__(self, gamma: float = 0.1, sigma: float = 1.0):
+        self.gamma = gamma
+        self.sigma = sigma
+
+    def reward(self, quote: Quote, instruments: InstrumentSet,
+               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
+        position = obs.position
+        if position is None: return 0.0
+        return -inventory_penalty(np.sum(position), self.gamma, self.sigma)
+
+    def breakdown(self, quote: Quote, instruments: InstrumentSet,
+                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
+        return dict(inventory_risk_penalty=self.reward(quote, instruments, metrics, hidden, obs))
+
+class SpreadCaptureReward(BaseObjective):
+    """Reward equal to ``metrics.spread_capture`` (bid-ask spread earned in market making)."""
+
+    def reward(self, quote: Quote, instruments: InstrumentSet,
+               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
+        return metrics.spread_capture
+
+    def breakdown(self, quote: Quote, instruments: InstrumentSet,
+                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
+        return dict(spread_capture=metrics.spread_capture)
diff --git a/lab/outlet/observation.py b/lab/outlet/observation.py
new file mode 100644
index 0000000..cffc71b
--- /dev/null
+++ b/lab/outlet/observation.py
@@ -0,0 +1,92 @@
+"""
+Observation construction with demand censoring.
+
+This module provides the ObservationBuilder that constructs agent observations
+from step data. The key invariant is that observations only contain censored
+data (fills) and never true demand, ensuring proper research conditions.
+
+The ObservationConfig controls what is included in observations:
+- Position visibility
+- Market/competitor visibility
+- Demand proxy method
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+import numpy as np
+from .types import Quote, InstrumentSet, StepLogs, StepMetrics, MarketState, HiddenState, Observation
+
+@dataclass
+class ObservationConfig:
+    """Configuration for observation construction.
+
+    Attributes:
+        include_position: Include current position in observation
+        include_market: Include market/competitor state in observation
+        mask_true_demand: If True, observation excludes true demand (not read by DefaultObservationBuilder in this module)
+        demand_proxy: Method for demand proxy ('fills', 'exposures', 'weighted'); not read by DefaultObservationBuilder
+        exposure_weights: Weights for weighted demand proxy (not read by DefaultObservationBuilder in this module)
+    """
+    include_position: bool = True
+    include_market: bool = True
+    mask_true_demand: bool = True
+    demand_proxy: str = 'fills'
+    exposure_weights: dict[str, float] | None = None
+
+class DefaultObservationBuilder:
+    """Builds the agent-facing Observation from one step's data.
+
+    Key research invariant: the observation carries only censored
+    fills (realized sales), never true demand. True demand is
+    placed in the info dict for research analysis only.
+    """
+
+    def __init__(self, cfg: ObservationConfig | None = None):
+        self.cfg = cfg or ObservationConfig()
+
+    def build(self, quote: Quote, instruments: InstrumentSet, logs: StepLogs,
+              metrics: StepMetrics, market: MarketState | None,
+              hidden: HiddenState, mask_demand: bool, t: int) -> Observation:
+        """Assemble the Observation for step ``t``.
+
+        ``metrics``, ``hidden`` and ``mask_demand`` are accepted but not read here.
+        """
+        n, cfg = instruments.n, self.cfg
+
+        # censored fills are always exposed; default to zeros when none were logged
+        fills = np.zeros(n) if logs.censored_fills is None else logs.censored_fills
+
+        # exposures: one count per logged event that names an instrument,
+        # otherwise whatever the aggregates carry (zeros if absent)
+        if not logs.events:
+            exposures = logs.aggregates.get('exposures', np.zeros(n))
+        else:
+            exposures = np.zeros(n)
+            for event in logs.events:
+                if event.instrument_id is None: continue
+                exposures[event.instrument_id] += 1
+
+        # position is copied, and only when configured and available
+        show_position = cfg.include_position and instruments.position is not None
+        position = instruments.position.copy() if show_position else None
+
+        # market state is passed through only when configured
+        obs_market = None if not cfg.include_market else market
+
+        return Observation(
+            quotes=quote.prices.copy(),
+            position=position, fills=fills, exposures=exposures,
+            market=obs_market, t=t,
+        )
+
+    def make_space(self, n_instruments: int, include_market: bool = True) -> dict:
+        """Returns dict describing observation space for gym.
+        Market visibility comes from the ``include_market`` argument, not ``self.cfg``.
+        """
+        shape = (n_instruments,)
+        space = {key: {'shape': shape, 'low': 0, 'high': np.inf}
+                 for key in ('quotes', 'fills', 'exposures')}
+        if self.cfg.include_position:
+            space['position'] = {'shape': shape, 'low': -np.inf, 'high': np.inf}
+        if include_market:
+            space['competitor_quotes'] = {'shape': shape, 'low': 0, 'high': np.inf}
+        return space
diff --git a/lab/outlet/platform.py b/lab/outlet/platform.py
new file mode 100644
index 0000000..eabb69a
--- /dev/null
+++ b/lab/outlet/platform.py
@@ -0,0 +1,285 @@
+"""
+Main simulation platform orchestrating the Quote-Control loop.
+
+The Platform class is the central coordinator that:
+1. Receives pricing actions (quotes) from the agent
+2. Generates arrivals via the ArrivalModel
+3. Processes executions via Mechanism and ExecutionModel
+4. Applies position censorship via PositionModel
+5. Computes metrics and reward via Objective
+6. Returns censored observations
+
+Example:
+    >>> from lab.config import make_retail_platform
+    >>> platform = make_retail_platform()
+    >>> result = platform.reset(seed=42)
+    >>> result = platform.step(platform.instruments.refs * 1.1)
+    >>> print(f"PnL: {result.metrics.pnl:.2f}")
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any
+import numpy as np
+from .types import (Quote, Opportunity, Execution, InstrumentSet, StepLogs, StepMetrics,
+                    StepEvent, MarketState, HiddenState, Observation, StepResult)
+from .constants import LogLevel, EventType, Side
+from .protocols import Mechanism, ArrivalModel, ExecutionModel, PositionModel, MarketModel, ObservationBuilder, Objective
+from .stock import PositionModel as DefaultPositionModel, PositionConfig
+from .observation import DefaultObservationBuilder, ObservationConfig
+from .objectives.factory import retail_objective
+
+@dataclass
+class PlatformConfig:
+    """Configuration for the simulation platform.
+
+    Attributes:
+        n_instruments: Number of instruments in the simulation (Platform sets it to instruments.n when it builds the default config)
+        max_steps: Maximum steps before episode terminates (enforcement is outside this excerpt; confirm)
+        dt: Time duration per step; passed to the arrival model's sample() call
+        log_level: Verbosity of logging (NONE, AGG_ONLY, FULL); FULL records a per-opportunity EXPOSURE event
+        mask_demand: If True, observations exclude true demand (research mode); forwarded to the observation builder
+        seed: Random seed for reproducibility; None leaves numpy's default_rng unseeded
+    """
+    n_instruments: int = 10
+    max_steps: int = 1000
+    dt: float = 1.0
+    log_level: LogLevel = LogLevel.AGG_ONLY
+    mask_demand: bool = True
+    seed: int | None = None
+
+class Platform:
+    """Main simulation orchestrator implementing Quote -> Arrival -> Execution -> Position.
+
+    The Platform coordinates all components to simulate a pricing environment:
+    - Mechanism: validates quotes and determines execution logic
+    - ArrivalModel: generates demand opportunities
+    - ExecutionModel: computes acceptance probabilities
+    - PositionModel: manages inventory/position and censorship
+    - MarketModel: updates competitor/market state
+    - ObservationBuilder: constructs censored observations
+    - Objective: computes reward from metrics
+
+    Attributes:
+        instruments: The instrument set being priced
+        mechanism: Quote validation and execution mechanism
+        arrival: Demand arrival generator
+        execution: Acceptance probability model
+        position: Inventory/position manager
+        market: Competitor/market dynamics (optional)
+        obs_builder: Observation constructor
+        objective: Reward function
+        cfg: Platform configuration
+    """
+
+    def __init__(self, instruments: InstrumentSet, mechanism: Mechanism,
+                 arrival: ArrivalModel, execution: ExecutionModel,
+                 position: PositionModel | None = None,
+                 market: MarketModel | None = None,
+                 obs_builder: ObservationBuilder | None = None,
+                 objective: Objective | None = None,
+                 cfg: PlatformConfig | None = None):
+        self.instruments = instruments
+        self.mechanism = mechanism
+        self.arrival = arrival
+        self.execution = execution
+        self.position = position or DefaultPositionModel(PositionConfig())  # default position model when none is supplied
+        self.market = market  # optional; stays None when no market model is given
+        self.obs_builder = obs_builder or DefaultObservationBuilder()
+        self.objective = objective or retail_objective()  # the retail objective is the default reward
+        self.cfg = cfg or PlatformConfig(n_instruments=instruments.n)  # default config sized to the instrument set
+
+        self._t: int = 0  # step counter
+        self._rng: np.random.Generator = np.random.default_rng(self.cfg.seed)
+        self._quote: Quote | None = None  # last constrained quote; None until reset() or step() runs
+        self._market_state: MarketState | None = None
+        self._hidden: HiddenState = HiddenState()
+        self._prev_prices: np.ndarray | None = None  # None until reset() copies the reference prices
+
+    def reset(self, seed: int | None = None) -> StepResult:
+        """Reset the platform to initial state.
+
+        Args:
+            seed: Random seed; any non-None value (including 0) overrides the config seed
+
+        Returns:
+            Initial StepResult with zeroed metrics and initial observation
+        """
+        self._t = 0
+        self._rng = np.random.default_rng(seed if seed is not None else self.cfg.seed)  # `seed or ...` would drop seed=0
+        self._hidden = HiddenState()
+        self._prev_prices = self.instruments.refs.copy()
+
+        # reset position
+        self.position.reset(self.instruments, self._rng)
+        self.instruments.position = self.position.position
+
+        # initial quote at reference prices
+        self._quote = Quote(prices=self.instruments.refs.copy(), propensity=1.0,
+                            metadata={'prev_prices': self._prev_prices})
+        self._quote = self.mechanism.apply_quote(self._quote, self.instruments, self._rng)
+
+        # initial market state
+        if self.market:
+            self._market_state = self.market.step(0, self._quote, self._hidden, self._rng)
+
+        # build initial observation
+        logs = StepLogs(aggregates={'reset': True},
+                        true_demand=np.zeros(self.instruments.n),
+                        censored_fills=np.zeros(self.instruments.n))
+        metrics = StepMetrics()
+        obs = self.obs_builder.build(self._quote, self.instruments, logs, metrics,
+                                     self._market_state, self._hidden, self.cfg.mask_demand, 0)
+
+        return StepResult(obs=obs, reward=0.0, terminated=False, truncated=False,
+                          info={'true_demand': logs.true_demand}, metrics=metrics,
+                          logs=logs, hidden=self._hidden)
+
+    def step(self, action: np.ndarray, propensity: float = 1.0) -> StepResult:
+        """Execute one simulation step with the given pricing action.
+
+        The step proceeds as follows:
+        1. Apply quote constraints via mechanism
+        2. Update market/competitor state
+        3. Generate arrivals
+        4. Process arrivals -> executions (acceptance check, position censorship)
+        5. Update position state
+        6. Compute metrics (PnL, costs, etc.)
+        7. Build logs with propensities
+        8. Construct censored observation
+        9. Compute reward and its per-term breakdown
+        10. Check termination
+
+        Args:
+            action: Price vector for all instruments
+            propensity: P(action | behavior policy) for OPE logging
+
+        Returns:
+            StepResult containing observation, reward, metrics, logs, and hidden state
+        """
+        self._t += 1
+        cfg = self.cfg
+
+        # 1. apply quote from action
+        self._quote = Quote(prices=action, propensity=propensity,
+                            metadata={'prev_prices': self._prev_prices})
+        self._quote = self.mechanism.apply_quote(self._quote, self.instruments, self._rng)
+        self._prev_prices = self._quote.prices.copy()
+        self._hidden.quote_history.append(self._quote.prices.copy())
+
+        # 2. update market/competitors
+        if self.market:
+            self._market_state = self.market.step(self._t, self._quote, self._hidden, self._rng)
+            self._hidden.market_history.append(self._market_state)
+
+        # 3. generate arrivals
+        opps = self.arrival.sample(self._t, cfg.dt, self.instruments,
+                                   self._market_state, self._hidden, self._rng)
+
+        # 4. process opportunities -> executions
+        executions: list[Execution] = []
+        events: list[StepEvent] = []
+        true_demand = np.zeros(self.instruments.n)
+
+        for opp in opps:
+            # log exposure
+            if cfg.log_level == LogLevel.FULL:
+                events.append(StepEvent(t=opp.t, type=EventType.EXPOSURE,
+                                        instrument_id=opp.instrument_id,
+                                        opportunity_id=opp.id,
+                                        price=float(self._quote.prices[opp.instrument_id]),
+                                        propensity=self._quote.propensity))
+
+            # check acceptance
+            prob = self.execution.prob(opp, self._quote, self.instruments,
+                                       self._market_state, self._rng)
+            if self._rng.random() < prob:
+                # create execution
+                exe = self.mechanism.process_opportunity(opp, self._quote, self.instruments,
+                                                         self._market_state, self._rng)
+                if exe:
+                    true_demand[exe.instrument_id] += exe.size_requested
+                    # apply position censorship
+                    exe = self.position.apply_execution(exe)
+                    executions.append(exe)
+                    if cfg.log_level == LogLevel.FULL:
+                        events.append(StepEvent(t=exe.t, type=EventType.EXECUTION,
+                                                instrument_id=exe.instrument_id,
+                                                opportunity_id=exe.opportunity_id,
+                                                price=exe.price, size=exe.size_filled,
+                                                propensity=exe.propensity))
+
+        # 5. update position state
+        self.position.step(self._t)
+        self.instruments.position = self.position.position
+
+        # 6. compute metrics
+        censored_fills = np.zeros(self.instruments.n)
+        revenue = 0.0
+        cost = 0.0
+        spread_capture = 0.0
+        # read the cost vector once: InstrumentSet.costs builds a new array on every access
+        unit_costs = self.instruments.costs
+
+        for exe in executions:
+            censored_fills[exe.instrument_id] += exe.size_filled
+            if exe.side == Side.BUY:
+                revenue += exe.price * exe.size_filled
+                cost += unit_costs[exe.instrument_id] * exe.size_filled
+            else:
+                revenue -= exe.price * exe.size_filled
+                cost -= unit_costs[exe.instrument_id] * exe.size_filled
+            # spread capture for market making
+            if self._quote.spreads is not None and self._market_state and self._market_state.mid_prices is not None:
+                mid = self._market_state.mid_prices[exe.instrument_id]
+                if exe.side == Side.BUY:
+                    spread_capture += (exe.price - mid) * exe.size_filled
+                else:
+                    spread_capture += (mid - exe.price) * exe.size_filled
+
+        pnl = revenue - cost
+        units = float(np.sum(censored_fills))
+        lost = float(np.sum(true_demand - censored_fills))
+
+        # volatility
+        volatility = 0.0
+        if len(self._hidden.quote_history) > 1:
+            prev = self._hidden.quote_history[-2]
+            volatility = float(np.mean(np.abs(self._quote.prices - prev) / (prev + 1e-8)))
+
+        # lost_opportunity = position shortage cost + unfilled units charged at 10% of the mean quote
+        metrics = StepMetrics(
+            pnl=pnl, revenue=revenue, cost=cost, units_traded=units,
+            position_cost=self.position.holding_cost,
+            lost_opportunity=self.position.shortage_cost + lost * np.mean(self._quote.prices) * 0.1,
+            spread_capture=spread_capture, volatility=volatility,
+            conversion=units / (len(opps) + 1e-8),
+            per_instrument={'fills': censored_fills, 'demand': true_demand}
+        )
+
+        # 7. build logs
+        logs = StepLogs(
+            events=events if cfg.log_level == LogLevel.FULL else None,
+            executions=executions if cfg.log_level == LogLevel.FULL else None,
+            aggregates={'n_arrivals': len(opps), 'n_executions': len(executions),
+                        'exposures': np.bincount([o.instrument_id for o in opps],
+                                                 minlength=self.instruments.n).astype(float)},
+            true_demand=true_demand,
+            censored_fills=censored_fills
+        )
+
+        # 8. build observation
+        obs = self.obs_builder.build(self._quote, self.instruments, logs, metrics,
+                                     self._market_state, self._hidden, cfg.mask_demand, self._t)
+
+        # 9. compute reward; the breakdown is evaluated once here and reused in `info`
+        reward = self.objective.reward(self._quote, self.instruments, metrics, self._hidden, obs)
+        breakdown = self.objective.breakdown(self._quote, self.instruments, metrics, self._hidden, obs)
+
+        # 10. check termination
+        terminated = self._t >= cfg.max_steps
+        truncated = False
+
+        info = {'true_demand': true_demand, 'breakdown': breakdown}
+
+        return StepResult(obs=obs, reward=reward, terminated=terminated, truncated=truncated,
+                          info=info, metrics=metrics, logs=logs, hidden=self._hidden)
diff --git a/lab/outlet/protocols.py b/lab/outlet/protocols.py
new file mode 100644
index 0000000..13bf967
--- /dev/null
+++ b/lab/outlet/protocols.py
@@ -0,0 +1,297 @@
+"""
+Protocol definitions for pluggable simulator components.
+
+This module defines the interfaces (Protocols) that allow swapping different
+implementations for each stage of the Quote -> Arrival -> Execution -> Position
+pipeline. All protocols use structural subtyping (duck typing).
+
+Protocols:
+    Mechanism: How quotes translate to executions (posted price, two-sided, auction)
+    ArrivalModel: How opportunities arrive (Poisson, Hawkes, sessions)
+    ExecutionModel: Acceptance probability given quote (elasticity, intensity)
+    PositionModel: Inventory/position management and censorship
+    MarketModel: Competitor/market dynamics
+    ObservationBuilder: Constructs agent observations with censoring
+    Objective: Computes reward from metrics
+"""
+from __future__ import annotations
+from typing import Protocol, Any, TYPE_CHECKING
+import numpy as np
+if TYPE_CHECKING:
+    from .types import (Quote, Opportunity, Execution, InstrumentSet, StepLogs,
+                        StepMetrics, HiddenState, Observation, MarketState)
+    from .constants import LogLevel
+
+class Mechanism(Protocol):
+    """Defines how quotes translate to executions.
+
+    The Mechanism is the core abstraction that differentiates pricing domains:
+    - PostedPrice: single price, buyer decides to purchase or not
+    - TwoSided: bid/ask spread, execution depends on distance from mid
+    - Auction: reserve price affects win probability and clearing price
+
+    Methods:
+        apply_quote: Enforce constraints and return valid quote
+        process_opportunity: Determine execution given opportunity and quote
+    """
+    def apply_quote(self, quote: Quote, instruments: InstrumentSet,
+                    rng: np.random.Generator) -> Quote:
+        """Apply mechanism-specific constraints to a quote.
+
+        Args:
+            quote: Raw quote from policy (the engine puts 'prev_prices' in its metadata)
+            instruments: Current instrument set with costs/refs
+            rng: Random generator for stochastic constraints
+
+        Returns:
+            Constrained quote satisfying mechanism rules (min margin, max delta, etc.)
+        """
+        ...
+
+    def process_opportunity(self, opp: Opportunity, quote: Quote,
+                            instruments: InstrumentSet, market: MarketState | None,
+                            rng: np.random.Generator) -> Execution | None:
+        """Process an opportunity against the current quote.
+
+        Args:
+            opp: Incoming opportunity (session, order, request)
+            quote: Current posted quote
+            instruments: Instrument set
+            market: Current market state (competitor prices, mid-prices)
+            rng: Random generator
+
+        Returns:
+            Execution if opportunity converts, None otherwise (position censorship is applied later)
+        """
+        ...
+
+class ArrivalModel(Protocol):
+    """Generates opportunities (demand arrivals) for each step.
+
+    Different arrival models capture different demand dynamics:
+    - Poisson: constant rate, memoryless
+    - Hawkes: self-exciting, clustered arrivals
+    - Session: retail browsing with multi-product views
+
+    Methods:
+        sample: Generate opportunities for a time interval
+    """
+    def sample(self, t: float, dt: float, instruments: InstrumentSet,
+               market: MarketState | None, hidden: HiddenState,
+               rng: np.random.Generator) -> list[Opportunity]:
+        """Sample opportunities for time interval [t, t+dt).
+
+        Args:
+            t: Current time (the engine passes its integer step counter)
+            dt: Time interval length (the engine passes cfg.dt)
+            instruments: Available instruments
+            market: Current market state
+            hidden: Hidden state (contains demand intensity, contamination)
+            rng: Random generator
+
+        Returns:
+            List of opportunities arriving in this interval
+        """
+        ...
+
+class ExecutionModel(Protocol):
+    """Computes acceptance/execution probability given quote and context.
+
+    Different models capture different demand responses:
+    - Elasticity: price sensitivity with competitor cross-effects
+    - Intensity: distance-based fill probability (market making)
+    - Logit: discrete choice model
+
+    Methods:
+        prob: Compute acceptance probability
+        uncensor: Estimate true demand from censored fills
+    """
+    def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet,
+             market: MarketState | None, rng: np.random.Generator) -> float:
+        """Compute probability that opportunity accepts the quote.
+
+        Args:
+            opp: Opportunity to evaluate
+            quote: Current quote
+            instruments: Instrument set
+            market: Market state (competitor prices affect cross-elasticity)
+            rng: Random generator
+
+        Returns:
+            Probability in [0, 1] that opportunity executes (the engine compares it to a uniform draw)
+        """
+        ...
+
+    def uncensor(self, fills: np.ndarray, instruments: InstrumentSet,
+                 context: dict[str, Any] | None = None) -> np.ndarray:
+        """Estimate true demand from censored fills.
+
+        Used for demand estimation research under inventory censorship.
+
+        Args:
+            fills: Observed (censored) fill counts
+            instruments: Instrument set
+            context: Additional context (exposures, prices shown)
+
+        Returns:
+            Estimated true demand counts
+        """
+        ...
+
+class PositionModel(Protocol):
+    """Manages inventory (retail) or position (finance).
+
+    Handles:
+    - Position constraints and censorship
+    - Holding costs (retail) or inventory risk (finance)
+    - Replenishment and order receipt
+
+    Methods:
+        reset: Initialize position state
+        available: Query available capacity for a trade
+        apply_execution: Censor execution by available position
+        step: Process time-based updates (replenishment, holding cost)
+
+    Properties (NOTE(review): the engine's step() also reads shortage_cost, not declared here):
+        position: Current position vector
+        holding_cost: Cost incurred this step from holding position
+    """
+    def reset(self, instruments: InstrumentSet, rng: np.random.Generator) -> None:
+        """Initialize position state for new episode."""
+        ...
+
+    def available(self, instrument_id: int, side: Any) -> float:
+        """Query available capacity for a trade.
+
+        Args:
+            instrument_id: Which instrument
+            side: BUY or SELL
+
+        Returns:
+            Maximum tradeable size given current position
+        """
+        ...
+
+    def apply_execution(self, exe: Execution) -> Execution:
+        """Apply position constraints to an execution.
+
+        Args:
+            exe: Proposed execution with size_requested
+
+        Returns:
+            Censored execution with size_filled <= available capacity
+        """
+        ...
+
+    def step(self, t: float) -> None:
+        """Process time-based position updates.
+
+        Handles replenishment receipt, holding cost calculation, etc.
+        """
+        ...
+
+    @property
+    def position(self) -> np.ndarray:
+        """Current position vector (positive=long/inventory, negative=short)."""
+        ...
+
+    @property
+    def holding_cost(self) -> float:
+        """Holding cost incurred this step."""
+        ...
+
+class MarketModel(Protocol):
+    """Models external market dynamics and competitor behavior.
+
+    For retail: competitor price dynamics (static, reactive, stochastic)
+    For finance: mid-price process (GBM, mean-reverting)
+
+    Methods:
+        step: Update market state given agent's quotes
+    """
+    def step(self, t: float, self_quotes: Quote, hidden: HiddenState,
+             rng: np.random.Generator) -> MarketState:
+        """Update market state for this timestep.
+
+        Args:
+            t: Current time (the engine passes 0 at reset, then its step counter)
+            self_quotes: Agent's current quotes (competitors may react)
+            hidden: Hidden state (regime info)
+            rng: Random generator
+
+        Returns:
+            Updated market state with competitor prices, mid-prices, volatility
+        """
+        ...
+
+class ObservationBuilder(Protocol):
+    """Constructs agent observations with appropriate censoring.
+
+    Critical for research: ensures agent only sees censored fills,
+    never true demand (which goes in info dict).
+
+    Methods:
+        build: Construct observation from step data
+    """
+    def build(self, quote: Quote, instruments: InstrumentSet, logs: StepLogs,
+              metrics: StepMetrics, market: MarketState | None,
+              hidden: HiddenState, mask_demand: bool, t: int) -> Observation:
+        """Build observation for agent.
+
+        Args:
+            quote: Current quote
+            instruments: Instrument set with positions
+            logs: Step logs with true_demand and censored_fills
+            metrics: Computed metrics
+            market: Market state
+            hidden: Hidden state (not included in obs)
+            mask_demand: If True, exclude true demand from observation
+            t: Current timestep (0 for the observation built at reset)
+
+        Returns:
+            Observation containing only observable quantities
+        """
+        ...
+
+class Objective(Protocol):
+    """Computes reward from step metrics.
+
+    Supports composite objectives with weighted terms:
+    - PnL (profit)
+    - Position costs (holding, inventory risk)
+    - Lost opportunity (stockouts)
+    - Volatility penalty (UX)
+    - Spread capture (market making)
+
+    Methods:
+        reward: Compute scalar reward
+        breakdown: Get per-term contribution for analysis
+    """
+    def reward(self, quote: Quote, instruments: InstrumentSet,
+               metrics: StepMetrics, hidden: HiddenState,
+               obs: Observation) -> float:
+        """Compute scalar reward for this step.
+
+        Args:
+            quote: Current quote
+            instruments: Instrument set
+            metrics: Step metrics (pnl, costs, etc.)
+            hidden: Hidden state
+            obs: Agent observation
+
+        Returns:
+            Scalar reward value
+        """
+        ...
+
+    def breakdown(self, quote: Quote, instruments: InstrumentSet,
+                  metrics: StepMetrics, hidden: HiddenState,
+                  obs: Observation) -> dict[str, float]:
+        """Get reward breakdown by component (takes the same arguments as ``reward``).
+
+        Useful for analyzing which terms dominate the reward.
+
+        Returns:
+            Dict mapping term names to their contributions
+        """
+        ...
diff --git a/lab/outlet/stock.py b/lab/outlet/stock.py
new file mode 100644
index 0000000..b2c88a2
--- /dev/null
+++ b/lab/outlet/stock.py
@@ -0,0 +1,151 @@
+"""
+Inventory/position management and instrument factories.
+
+This module provides:
+- PositionConfig: Configuration for position constraints and costs
+- PositionModel: Manages inventory (retail) or position (finance)
+- make_instruments: Factory for creating instrument sets
+
+The PositionModel handles demand censorship by limiting executions
+to available inventory, computing holding costs, and managing replenishment.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+import numpy as np
+from .types import Instrument, InstrumentSet, Execution
+from .constants import Side, InstrumentType
+
+@dataclass
+class PositionConfig:
+    """Configuration for position/inventory management.
+
+    Attributes:
+        initial_position: Starting inventory (None = unlimited, float = same for all, array = per instrument)
+        max_position: Maximum long position per instrument
+        min_position: Maximum short position (negative, for finance); not read by PositionModel in this module
+        holding_cost_rate: Cost per unit per step for holding inventory
+        shortage_cost_rate: Opportunity cost rate for stockouts (multiplies unfilled size * price)
+        lead_time: Steps until replenishment orders arrive
+    """
+    initial_position: np.ndarray | float | None = None
+    max_position: float = 1000.0
+    min_position: float = -1000.0
+    holding_cost_rate: float = 0.001
+    shortage_cost_rate: float = 0.05
+    lead_time: int = 0
+
+@dataclass
+class PositionModel:
+    """Manages inventory (retail) or position (finance) with censorship.
+
+    Key responsibilities:
+    - Track current position per instrument
+    - Censor executions when position is insufficient
+    - Compute holding costs per step
+    - Track shortage/stockout costs per step (both costs are finalized by step())
+    - Handle replenishment orders with lead time
+
+    For retail: position is inventory (positive), selling reduces it
+    For finance: position can be positive (long) or negative (short)
+    """
+    cfg: PositionConfig
+    n: int = 0
+    _position: np.ndarray = field(default_factory=lambda: np.array([]))
+    _pending_orders: list[tuple[int, np.ndarray]] = field(default_factory=list)
+    _step_holding_cost: float = 0.0
+    _step_shortage_cost: float = 0.0
+    _accrued_shortage_cost: float = 0.0  # shortage accumulated since the last step()
+
+    def reset(self, instruments: InstrumentSet, rng: np.random.Generator) -> None:
+        """Start a new episode: size the position vector and clear orders and costs."""
+        self.n = instruments.n
+        initial = self.cfg.initial_position
+        if initial is None:
+            self._position = np.full(self.n, np.inf)  # unlimited
+        elif np.ndim(initial) == 0:  # any scalar, including numpy scalar types
+            self._position = np.full(self.n, float(initial))
+        else:
+            self._position = np.array(initial, dtype=np.float64)  # always a private copy
+        self._pending_orders = []
+        self._step_holding_cost = self._step_shortage_cost = self._accrued_shortage_cost = 0.0
+
+    def available(self, instrument_id: int, side: Side) -> float:
+        """Fillable size: current inventory for BUY, room up to max_position otherwise."""
+        pos = self._position[instrument_id]
+        if np.isinf(pos): return np.inf
+        room = pos if side == Side.BUY else self.cfg.max_position - pos
+        return max(0, room)
+
+    def apply_execution(self, exe: Execution) -> Execution:
+        """Censor `exe` by available capacity, update the position, accrue shortage cost."""
+        idx = int(exe.instrument_id)
+        filled = min(exe.size_requested, self.available(idx, exe.side))
+        shortage = exe.size_requested - filled
+        if exe.side == Side.BUY:
+            self._position[idx] -= filled  # sold from inventory
+        else:
+            self._position[idx] += filled  # bought into inventory
+        if shortage > 0:
+            # accrue here; step() publishes the total so shortage_cost stays per-step
+            self._accrued_shortage_cost += shortage * exe.price * self.cfg.shortage_cost_rate
+        return Execution(
+            opportunity_id=exe.opportunity_id, instrument_id=exe.instrument_id,
+            side=exe.side, size_requested=exe.size_requested,
+            size_filled=filled, price=exe.price, propensity=exe.propensity, t=exe.t
+        )
+
+    def order(self, quantity: np.ndarray) -> None:
+        """Place a replenishment order; it is received immediately when lead_time is 0."""
+        if self.cfg.lead_time > 0:
+            self._pending_orders.append((self.cfg.lead_time, quantity.copy()))
+        else:
+            self._position += quantity
+
+    def step(self, t: float) -> None:
+        """Close the step: compute holding cost, publish shortage cost, receive due orders."""
+        pos = np.where(np.isinf(self._position), 0, self._position)
+        self._step_holding_cost = float(np.sum(np.abs(pos)) * self.cfg.holding_cost_rate)
+        # publish this step's shortage and restart the accumulator (it was never cleared before)
+        self._step_shortage_cost, self._accrued_shortage_cost = self._accrued_shortage_cost, 0.0
+        new_pending = []
+        for (remaining, qty) in self._pending_orders:
+            if remaining <= 1:
+                self._position += qty
+            else:
+                new_pending.append((remaining - 1, qty))
+        self._pending_orders = new_pending
+
+    @property
+    def position(self) -> np.ndarray:
+        return np.where(np.isinf(self._position), -1, self._position)  # unlimited shown as -1
+
+    @property
+    def holding_cost(self) -> float:
+        return self._step_holding_cost  # value computed by the most recent step()
+
+    @property
+    def shortage_cost(self) -> float:
+        return self._step_shortage_cost  # shortage of executions applied before the last step()
+
+def make_instruments(n: int, cost_range: tuple[float, float] = (1.0, 10.0),
+                     margin_range: tuple[float, float] = (0.2, 0.5),
+                     inst_type: InstrumentType = InstrumentType.SKU,
+                     rng: np.random.Generator | None = None) -> InstrumentSet:
+    """Build an InstrumentSet of ``n`` randomly parameterised instruments.
+
+    Args:
+        n: How many instruments to generate
+        cost_range: (low, high) bounds of the uniform cost-basis draw
+        margin_range: (low, high) bounds of the uniform margin draw
+        inst_type: Instrument category assigned to every item (SKU, ASSET, etc.)
+        rng: Source of randomness; a fresh default generator when omitted
+
+    Returns:
+        InstrumentSet whose reference prices are cost * (1 + margin)
+    """
+    gen = rng if rng else np.random.default_rng()
+    unit_costs = gen.uniform(*cost_range, n)  # the cost draw must precede the margin draw
+    ref_prices = unit_costs * (1 + gen.uniform(*margin_range, n))
+    catalogue = [Instrument(id=idx, type=inst_type, cost_basis=base, reference_price=ref)
+                 for idx, (base, ref) in enumerate(zip(unit_costs, ref_prices))]
+    return InstrumentSet(instruments=catalogue)
diff --git a/lab/outlet/types.py b/lab/outlet/types.py
new file mode 100644
index 0000000..db49117
--- /dev/null
+++ b/lab/outlet/types.py
@@ -0,0 +1,318 @@
+"""
+Core data types for the Quote-Control simulator.
+
+This module defines the fundamental data structures used throughout the platform:
+- Identifiers (InstrumentId, OpportunityId, AgentId)
+- Domain objects (Instrument, Quote, Opportunity, Execution)
+- Logging structures (StepEvent, StepLogs, StepMetrics)
+- State containers (MarketState, HiddenState, Observation, StepResult)
+
+All dataclasses are designed to be serializable and numpy-compatible.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any, NewType
+import numpy as np
+from .constants import Side, InstrumentType, OpportunityType, EventType
+
+InstrumentId = NewType('InstrumentId', int)  # unique instrument index
+OpportunityId = NewType('OpportunityId', str)  # unique opportunity/session ID
+AgentId = NewType('AgentId', str)  # unique agent/actor ID
+
+@dataclass
+class Instrument:
+    """Represents a priceable entity in the simulation.
+
+    An instrument can be a retail SKU, financial asset, loan product, or subscription.
+    The cost_basis represents the fundamental value (marginal cost for retail,
+    mid-price for assets, funding rate for loans).
+
+    Attributes:
+        id: Unique identifier for this instrument
+        type: Category of instrument (SKU, ASSET, LOAN, SUBSCRIPTION)
+        cost_basis: Fundamental cost or value (marginal cost, mid-price, funding rate)
+        reference_price: Base or fair price used for action scaling (the engine's initial quote at reset)
+        attrs: Additional attributes (quality score, category, volatility, etc.)
+    """
+    id: InstrumentId
+    type: InstrumentType
+    cost_basis: float
+    reference_price: float
+    attrs: dict[str, Any] = field(default_factory=dict)
+
+@dataclass
+class InstrumentSet:
+    """Collection of instruments with optional position tracking.
+
+    Provides vectorized access to instrument properties for efficient computation.
+    Position can be positive (long/inventory) or negative (short) for financial assets.
+
+    Attributes:
+        instruments: List of Instrument objects
+        position: Current position per instrument (None = unlimited capacity); the engine overwrites it each step
+
+    Properties:
+        n: Number of instruments
+        costs: Vector of cost bases (float32, rebuilt from the list on every access)
+        refs: Vector of reference prices (float32, rebuilt from the list on every access)
+    """
+    instruments: list[Instrument]
+    position: np.ndarray | None = None
+
+    @property
+    def n(self) -> int: return len(self.instruments)
+    @property
+    def costs(self) -> np.ndarray: return np.array([i.cost_basis for i in self.instruments], np.float32)
+    @property
+    def refs(self) -> np.ndarray: return np.array([i.reference_price for i in self.instruments], np.float32)
+
+@dataclass
+class Quote:
+    """Price quote set by the policy - the action in the MDP.
+
+    Supports multiple quoting mechanisms:
+    - Posted price: only `prices` field used
+    - Two-sided: `prices` as mid, `spreads` for bid-ask width
+    - Auction: `prices` as reserve prices
+
+    The propensity field is critical for off-policy evaluation (OPE).
+
+    Attributes:
+        prices: Posted prices (retail) or mid-quotes (market making)
+        spreads: Bid-ask spread width for two-sided quoting (None for posted price)
+        propensity: P(this quote | behavior policy) for importance sampling
+        metadata: Additional info (prev_prices for delta constraints, etc.)
+
+    Properties:
+        bids: Computed bid prices (mid - spread/2); None when spreads is None
+        asks: Computed ask prices (mid + spread/2); None when spreads is None
+    """
+    prices: np.ndarray
+    spreads: np.ndarray | None = None
+    propensity: float = 1.0
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+    @property
+    def bids(self) -> np.ndarray | None:
+        return self.prices - self.spreads/2 if self.spreads is not None else None
+    @property
+    def asks(self) -> np.ndarray | None:
+        return self.prices + self.spreads/2 if self.spreads is not None else None
+
+@dataclass
+class Opportunity:
+    """An arrival event that may result in a transaction.
+
+    Opportunities are the demand side of the simulation:
+    - Retail: browsing session with purchase intent
+    - Market making: incoming market order
+    - Lending: loan application
+
+    The context dict carries segment/type information used by execution models.
+
+    Attributes:
+        id: Unique identifier for this opportunity
+        type: Category (SESSION, MARKET_ORDER, REQUEST)
+        side: BUY or SELL intent
+        instrument_id: Which instrument the opportunity targets (the engine uses it as an array index)
+        size: Requested transaction size (units, shares, principal)
+        t: Arrival timestamp
+        context: Segment info (is_scraper, credit_score, urgency, etc.)
+    """
+    id: OpportunityId
+    type: OpportunityType
+    side: Side
+    instrument_id: InstrumentId
+    size: float = 1.0
+    t: float = 0.0
+    context: dict[str, Any] = field(default_factory=dict)
+
+@dataclass
+class Execution:
+    """A realized transaction after acceptance and position censorship.
+
+    The difference between size_requested and size_filled represents
+    censored demand due to inventory/position constraints.
+
+    Attributes:
+        opportunity_id: Links back to the originating Opportunity
+        instrument_id: Which instrument was traded
+        side: BUY or SELL
+        size_requested: Original requested size (true demand)
+        size_filled: Actual filled size after censorship (set by PositionModel.apply_execution)
+        price: Execution price
+        propensity: Combined propensity for OPE (quote * acceptance)
+        t: Execution timestamp
+    """
+    opportunity_id: OpportunityId
+    instrument_id: InstrumentId
+    side: Side
+    size_requested: float
+    size_filled: float
+    price: float
+    propensity: float = 1.0
+    t: float = 0.0
+
+@dataclass
+class StepEvent:
+    """Generic logged event (the engine emits EXPOSURE and EXECUTION events at LogLevel.FULL)."""
+    t: float  # event timestamp (copied from the opportunity or execution)
+    type: EventType
+    instrument_id: InstrumentId | None = None  # instrument the event refers to, if any
+    opportunity_id: OpportunityId | None = None  # originating opportunity, if any
+    price: float | None = None  # quoted price (exposure) or execution price
+    size: float | None = None  # filled size for executions; the engine leaves it None for exposures
+    propensity: float = 1.0  # propensity copied from the quote or the execution
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+@dataclass
+class StepLogs:
+    """Container for all logging data from a simulation step.
+
+    Supports both detailed event logging (for OPE) and aggregate-only mode
+    (for fast simulation). The true_demand vs censored_fills distinction
+    is critical for research on demand estimation under censorship.
+
+    Attributes:
+        events: Detailed event log (None if LogLevel != FULL)
+        executions: List of executed transactions (None if LogLevel != FULL)
+        aggregates: Always-available aggregate statistics (engine keys: n_arrivals, n_executions, exposures)
+        true_demand: Oracle demand before censorship (for research, not in obs)
+        censored_fills: Realized fills after position constraints (observable)
+    """
+    events: list[StepEvent] | None = None
+    executions: list[Execution] | None = None
+    aggregates: dict[str, Any] = field(default_factory=dict)
+    true_demand: np.ndarray | None = None
+    censored_fills: np.ndarray | None = None
+
+@dataclass
+class StepMetrics:
+    """Computed metrics for a single simulation step.
+
+    Metrics are domain-aware: retail uses revenue/cost/holding_cost,
+    market making uses spread_capture and inventory risk.
+
+    Attributes:
+        pnl: Profit and loss (revenue - cost for retail, mark-to-market for finance)
+        revenue: Gross revenue from sales/executions
+        cost: Cost of goods sold or position acquisition cost
+        units_traded: Total units/shares transacted
+        position_cost: Holding cost (retail) or inventory risk penalty (finance)
+        lost_opportunity: Cost of stockouts or missed fills
+        spread_capture: Bid-ask spread captured (market making)
+        volatility: Price volatility metric for UX consideration
+        conversion: Fill rate (the engine computes units filled / number of opportunities)
+        per_instrument: Per-instrument breakdowns (the engine stores 'fills' and 'demand')
+    """
+    pnl: float = 0.0
+    revenue: float = 0.0
+    cost: float = 0.0
+    units_traded: float = 0.0
+    position_cost: float = 0.0
+    lost_opportunity: float = 0.0
+    spread_capture: float = 0.0
+    volatility: float = 0.0
+    conversion: float = 0.0
+    per_instrument: dict[str, np.ndarray] = field(default_factory=dict)
+
+@dataclass
+class MarketState:
+    """External market conditions and competitor state.
+
+    For retail: competitor_quotes drives cross-elasticity effects.
+    For finance: mid_prices and volatility drive execution dynamics.
+
+    Attributes:
+        competitor_quotes: Competitor posted prices (retail); included in Observation.to_flat when set
+        mid_prices: Market mid-prices for assets (finance); the engine's spread capture is measured against them
+        volatility: Per-instrument volatility estimate
+        regime: Market regime identifier (normal, price_war, high_vol, etc.)
+        t: Timestamp of this market state
+    """
+    competitor_quotes: np.ndarray | None = None
+    mid_prices: np.ndarray | None = None
+    volatility: np.ndarray | None = None
+    regime: str = 'normal'
+    t: float = 0.0
+
+@dataclass
+class HiddenState:
+    """Internal simulator state not exposed to the agent.
+
+    Contains oracle information for research analysis and
+    history needed for non-stationary dynamics.
+
+    Attributes:
+        true_demand_intensity: Latent demand multiplier
+        contamination: Fraction of arrivals that are adversarial/scraper
+        regime: Current market/competitor regime
+        quote_history: Constrained agent quotes, appended once per step, for volatility calculation
+        market_history: History of market states for analysis (appended only when a market model is set)
+    """
+    true_demand_intensity: float = 1.0
+    contamination: float = 0.0
+    regime: str = 'normal'
+    quote_history: list[np.ndarray] = field(default_factory=list)
+    market_history: list[MarketState] = field(default_factory=list)
+
+@dataclass
+class Observation:
+    """What the agent is allowed to see after a step - the censored view.
+
+    Invariant: an Observation carries censored fills only and never
+    true_demand, which is what makes this a censorship research setting.
+
+    Attributes:
+        quotes: Quotes currently posted (the agent's most recent action)
+        position: Inventory/position state, or None when not tracked
+        fills: Per-instrument censored execution counts
+        exposures: Per-instrument opportunity exposure counts
+        market: Observable market state (competitor prices, volatility)
+        t: Timestep this observation belongs to
+        extra: Any further observable features
+
+    Methods:
+        to_flat: Flatten the array fields to one numpy vector (gym compatibility)
+    """
+    quotes: np.ndarray
+    position: np.ndarray | None
+    fills: np.ndarray
+    exposures: np.ndarray
+    market: MarketState | None
+    t: int
+    extra: dict[str, Any] = field(default_factory=dict)
+
+    def to_flat(self) -> np.ndarray:
+        """Concatenate the array-valued fields into one 1D vector (``t`` and ``extra`` are left out)."""
+        rivals = self.market.competitor_quotes if self.market else None
+        optional = [arr for arr in (self.position, rivals) if arr is not None]
+        # fixed order: quotes, fills, exposures, then position and competitor quotes when present
+        pieces = (self.quotes, self.fills, self.exposures, *optional)
+        return np.concatenate([arr.flatten() for arr in pieces])
+
+@dataclass
+class StepResult:
+    """Complete result from a simulation step.
+
+    Follows gymnasium convention for obs, reward, terminated, truncated, info.
+    Additionally provides metrics, logs, and hidden state for research.
+
+    Attributes:
+        obs: Observable state (censored)
+        reward: Scalar reward from objective function
+        terminated: Episode ended naturally (max_steps reached)
+        truncated: Episode ended early (bankruptcy, constraint violation)
+        info: Additional info dict (contains true_demand; the engine's step() also adds 'breakdown')
+        metrics: Computed metrics for this step
+        logs: Event logs and aggregates
+        hidden: Internal simulator state (oracle info)
+    """
+    obs: Observation
+    reward: float
+    terminated: bool
+    truncated: bool
+    info: dict[str, Any]
+    metrics: StepMetrics
+    logs: StepLogs
+    hidden: HiddenState
diff --git a/lab/population/__init__.py b/lab/population/__init__.py
new file mode 100644
index 0000000..081dbd0
--- /dev/null
+++ b/lab/population/__init__.py
@@ -0,0 +1,10 @@
+from .arrivals import PoissonArrivalModel, HawkesArrivalModel, SessionArrivalModel
+from .execution import ElasticityExecutionModel, IntensityExecutionModel, LogitExecutionModel
+from .competitors import (StaticCompetitorModel, ReactiveCompetitorModel,
+                          StochasticCompetitorModel, GBMMarketModel)
+
+__all__ = [
+    'PoissonArrivalModel', 'HawkesArrivalModel', 'SessionArrivalModel',
+    'ElasticityExecutionModel', 'IntensityExecutionModel', 'LogitExecutionModel',
+    'StaticCompetitorModel', 'ReactiveCompetitorModel', 'StochasticCompetitorModel', 'GBMMarketModel',
+]
diff --git a/lab/population/arrivals.py b/lab/population/arrivals.py
new file mode 100644
index 0000000..b7e7ed6
--- /dev/null
+++ b/lab/population/arrivals.py
@@ -0,0 +1,168 @@
+"""
+Arrival models for generating demand opportunities.
+
+This module provides different arrival processes:
+- PoissonArrivalModel: Constant-rate memoryless arrivals
+- HawkesArrivalModel: Self-exciting clustered arrivals (market orders)
+- SessionArrivalModel: Retail browsing sessions with multi-product views
+
+Each model implements the ArrivalModel protocol and generates Opportunity objects
+that flow through the execution pipeline.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Callable
+import numpy as np
+from uuid import uuid4
+from ..outlet.types import Opportunity, InstrumentSet, MarketState, HiddenState
+from ..outlet.constants import Side, OpportunityType
+from ..outlet.math_util import poisson_arrivals, hawkes_intensity
+
+@dataclass
+class PoissonArrivalConfig:
+    """Configuration for Poisson arrival process.
+
+    Attributes:
+        base_rate: Expected arrivals per unit time (scaled by hidden.true_demand_intensity)
+        side_probs: Probability per Side for each arrival; None is replaced by BUY-only
+    """
+    base_rate: float = 10.0
+    side_probs: dict[Side, float] | None = None
+
+    def __post_init__(self) -> None:
+        if self.side_probs is None:
+            self.side_probs = {Side.BUY: 1.0}
+
+class PoissonArrivalModel:
+    """Homogeneous Poisson arrival process.
+
+    The arrival count per step follows Poisson(rate * dt * intensity); each arrival
+    gets a uniformly random instrument. Suitable for stationary demand or baselines.
+    """
+
+    def __init__(self, cfg: PoissonArrivalConfig | None = None):
+        self.cfg = cfg or PoissonArrivalConfig()
+
+    def sample(self, t: float, dt: float, instruments: InstrumentSet,
+               market: MarketState | None, hidden: HiddenState,
+               rng: np.random.Generator) -> list[Opportunity]:
+        """Sample this step's opportunities; every one is stamped with time t."""
+        n_arrivals = poisson_arrivals(self.cfg.base_rate * hidden.true_demand_intensity, dt, rng)
+        # Draw a side *index* so numpy never coerces the Side members themselves.
+        sides, probs = list(self.cfg.side_probs), list(self.cfg.side_probs.values())
+        opps = []
+        for _ in range(n_arrivals):
+            inst_id = int(rng.integers(0, instruments.n))  # plain int, as in SessionArrivalModel
+            side = sides[rng.choice(len(sides), p=probs)]
+            opps.append(Opportunity(
+                id=str(uuid4())[:8], type=OpportunityType.SESSION,
+                side=side, instrument_id=inst_id, size=1.0, t=t,
+                context={'segment': 'default'}
+            ))
+        return opps
+
+@dataclass
+class HawkesArrivalConfig:
+    """Configuration for Hawkes self-exciting process.
+
+    Attributes:
+        base_rate: Baseline arrival intensity (scaled by hidden.true_demand_intensity)
+        alpha: Excitation strength (how much each arrival increases intensity)
+        beta: Decay rate (how quickly excitation fades)
+        side_probs: Probability per Side for each arrival; None is replaced by 50/50 BUY/SELL
+    """
+    base_rate: float = 5.0
+    alpha: float = 0.5
+    beta: float = 1.0
+    side_probs: dict[Side, float] | None = None
+
+    def __post_init__(self) -> None:
+        if self.side_probs is None:
+            self.side_probs = {Side.BUY: 0.5, Side.SELL: 0.5}
+
+class HawkesArrivalModel:
+    """Self-exciting Hawkes point process for clustered arrivals.
+
+    Models order flow where arrivals cluster in time (momentum, herding).
+    Intensity: lambda(t) = base + alpha * sum(exp(-beta * (t - t_i)))
+    Used for market making scenarios where orders arrive in bursts.
+    """
+    HISTORY_WINDOW = 10  # arrivals older than this (in time units) are forgotten
+
+    def __init__(self, cfg: HawkesArrivalConfig | None = None):
+        self.cfg = cfg or HawkesArrivalConfig()
+        self._history: np.ndarray = np.array([])
+
+    def sample(self, t: float, dt: float, instruments: InstrumentSet,
+               market: MarketState | None, hidden: HiddenState,
+               rng: np.random.Generator) -> list[Opportunity]:
+        """Sample this step's orders (intensity evaluated once, at t) and record their times."""
+        intensity = hawkes_intensity(self.cfg.base_rate * hidden.true_demand_intensity,
+                                     self._history, self.cfg.alpha, self.cfg.beta, t)
+        n_arrivals = poisson_arrivals(intensity, dt, rng)
+        sides, probs = list(self.cfg.side_probs), list(self.cfg.side_probs.values())
+        opps, arr_times = [], []
+        for _ in range(n_arrivals):
+            arr_t = t + rng.uniform(0, dt)
+            arr_times.append(arr_t)
+            inst_id = int(rng.integers(0, instruments.n))
+            side = sides[rng.choice(len(sides), p=probs)]  # index draw keeps Side members intact
+            opps.append(Opportunity(
+                id=str(uuid4())[:8], type=OpportunityType.MARKET_ORDER,
+                side=side, instrument_id=inst_id,
+                size=rng.exponential(1.0), t=arr_t,
+                context={'intensity': intensity}
+            ))
+        # extend the history once per step (not per arrival), then drop stale events
+        history = np.append(self._history, arr_times)
+        self._history = history[history > t - self.HISTORY_WINDOW]
+        return opps
+
+@dataclass
+class SessionArrivalConfig:
+    """Configuration for retail session arrivals.
+
+    Attributes:
+        sessions_per_step: Number of browsing sessions per step
+        views_per_session: (min, max) product views per session, both inclusive
+        contamination: Fraction of scraper/bot sessions, used when no hidden state is given
+    """
+    sessions_per_step: int = 20
+    views_per_session: tuple[int, int] = (1, 5)
+    contamination: float = 0.0
+
+class SessionArrivalModel:
+    """Retail browsing session model with multi-product views.
+
+    Each session views multiple products, generating one opportunity per view.
+    Scraper sessions (controlled by contamination) view more products
+    but convert at lower rates (handled by ExecutionModel).
+    """
+
+    def __init__(self, cfg: SessionArrivalConfig | None = None):
+        self.cfg = cfg or SessionArrivalConfig()
+
+    def sample(self, t: float, dt: float, instruments: InstrumentSet,
+               market: MarketState | None, hidden: HiddenState,
+               rng: np.random.Generator) -> list[Opportunity]:
+        lo, hi = self.cfg.views_per_session
+        contamination = hidden.contamination if hidden else self.cfg.contamination
+        opps = []
+
+        for _ in range(self.cfg.sessions_per_step):
+            is_scraper = rng.random() < contamination
+            # endpoint=True: `hi` is attainable and a fixed count (lo == hi) is valid
+            n_views = int(rng.integers(lo, hi, endpoint=True))
+            sid = str(uuid4())[:8]
+            # scrapers view 3x more products; nobody can view more than the catalogue,
+            # so the n_views reported in the context always equals the views generated
+            if is_scraper:
+                n_views *= 3
+            n_views = min(instruments.n, n_views)
+            viewed = rng.choice(instruments.n, size=n_views, replace=False)
+            for inst_id in viewed:
+                opps.append(Opportunity(
+                    id=f"{sid}-{inst_id}", type=OpportunityType.SESSION,
+                    side=Side.BUY, instrument_id=int(inst_id), size=1.0, t=t,
+                    context={'session_id': sid, 'is_scraper': is_scraper, 'n_views': n_views}
+                ))
+        return opps
diff --git a/lab/population/competitors.py b/lab/population/competitors.py
new file mode 100644
index 0000000..9417709
--- /dev/null
+++ b/lab/population/competitors.py
@@ -0,0 +1,189 @@
+"""
+Market and competitor models for external dynamics.
+
+This module provides models for competitor pricing (retail) and market dynamics (finance):
+- StaticCompetitorModel: Fixed competitor prices
+- ReactiveCompetitorModel: Competitor reacts to agent's prices, can trigger price wars
+- StochasticCompetitorModel: Random walk competitor prices
+- GBMMarketModel: Geometric Brownian Motion for asset mid-prices
+
+Each model implements the MarketModel protocol.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+import numpy as np
+from ..outlet.types import Quote, MarketState, HiddenState
+from ..outlet.math_util import clamp, ema
+
+@dataclass
+class StaticCompetitorConfig:
+    """Configuration for static competitor.
+
+    Attributes:
+        markup: Fixed fractional markup over reference prices (0.1 = +10%)
+    """
+    markup: float = 0.1
+
+class StaticCompetitorModel:
+    """Competitor whose prices are a constant markup over a reference.
+
+    Quotes are reference * (1 + markup); the reference is `refs` when given,
+    otherwise the agent's own current prices. No state is kept between steps.
+    """
+
+    def __init__(self, cfg: StaticCompetitorConfig | None = None, refs: np.ndarray | None = None):
+        self.refs = refs
+        self.cfg = cfg or StaticCompetitorConfig()
+
+    def step(self, t: float, self_quotes: Quote, hidden: HiddenState,
+             rng: np.random.Generator) -> MarketState:
+        base = self_quotes.prices if self.refs is None else self.refs
+        scaled = base * (1 + self.cfg.markup)
+        return MarketState(competitor_quotes=scaled, regime='static', t=t)
+
+@dataclass
+class ReactiveCompetitorConfig:
+    """Configuration for reactive competitor.
+
+    Attributes:
+        follow_weight: Smoothing weight for price following (0=ignore, 1=instant)
+        band_pct: Maximum fractional deviation from reference prices (0.1 = +/-10%)
+        war_threshold: Relative diff (agent vs competitor) below which a war starts; negative = undercut
+        war_aggression: Fraction by which the competitor's target undercuts the agent during a war
+    """
+    follow_weight: float = 0.3
+    band_pct: float = 0.1
+    war_threshold: float = -0.15
+    war_aggression: float = 0.2
+
+class ReactiveCompetitorModel:
+    """Competitor that tracks the agent's prices and can start a price war.
+
+    Each step the competitor smooths its prices toward a target derived from
+    the agent's quotes. Once any agent price sits far enough below the
+    competitor's (war_threshold), the target switches to undercutting the
+    agent until the war flag is cleared again.
+    This creates non-stationary dynamics that test policy robustness.
+    """
+
+    def __init__(self, cfg: ReactiveCompetitorConfig | None = None, refs: np.ndarray | None = None):
+        self.cfg = cfg or ReactiveCompetitorConfig()
+        self.refs = refs
+        self._in_war: bool = False
+        self._prices: np.ndarray | None = None
+
+    def step(self, t: float, self_quotes: Quote, hidden: HiddenState,
+             rng: np.random.Generator) -> MarketState:
+        """Advance one step; also writes the resulting regime onto `hidden`."""
+        cfg = self.cfg
+        ours = self_quotes.prices
+        anchor = ours if self.refs is None else self.refs
+        if self._prices is None:
+            self._prices = anchor.copy()  # first call: start from the reference prices
+
+        # War flag: set when any instrument is undercut past the threshold.
+        # NOTE(review): the exit test negates war_threshold, so with the default (-0.15)
+        # a war only ends once every agent price is >7.5% ABOVE the competitor's -- confirm.
+        gap = (ours - self._prices) / (self._prices + 1e-8)
+        if np.any(gap < cfg.war_threshold):
+            self._in_war = True
+        elif np.all(gap > -cfg.war_threshold / 2):
+            self._in_war = False
+
+        # Target: undercut the agent in a war, otherwise sit slightly above it.
+        if self._in_war:
+            factor, regime = 1 - cfg.war_aggression, 'price_war'
+        else:
+            factor, regime = 1 + cfg.follow_weight * 0.05, 'normal'
+        hidden.regime = regime
+        goal = ours * factor
+
+        # Smooth toward the target, then keep prices inside the band around the reference.
+        smoothed = np.array([ema(prev, nxt, cfg.follow_weight)
+                             for prev, nxt in zip(self._prices, goal)])
+        self._prices = clamp(smoothed, anchor * (1 - cfg.band_pct), anchor * (1 + cfg.band_pct))
+
+        return MarketState(competitor_quotes=self._prices, regime=hidden.regime, t=t)
+
+@dataclass
+class StochasticCompetitorConfig:
+    """Configuration for stochastic competitor.
+
+    Attributes:
+        drift: Additive price drift per step
+        volatility: Std of the per-step random shock, applied proportionally to price
+        mean_revert: Mean reversion strength toward reference
+    """
+    drift: float = 0.0
+    volatility: float = 0.02
+    mean_revert: float = 0.1
+
+class StochasticCompetitorModel:
+    """Mean-reverting random-walk competitor prices (Ornstein-Uhlenbeck style).
+
+    Per step: P <- P + drift + mean_revert*(ref - P) + P*eps, eps ~ N(0, volatility),
+    then floored at half the reference price.
+
+    The dynamics ignore the agent's actions unless `refs` is omitted, in which
+    case the agent's current prices serve as the reference.
+    """
+
+    def __init__(self, cfg: StochasticCompetitorConfig | None = None, refs: np.ndarray | None = None):
+        self.cfg = cfg or StochasticCompetitorConfig()
+        self._prices: np.ndarray | None = None
+        self.refs = refs
+
+    def step(self, t: float, self_quotes: Quote, hidden: HiddenState,
+             rng: np.random.Generator) -> MarketState:
+        """Advance competitor prices by one step and return them as a MarketState."""
+        cfg = self.cfg
+        anchor = self_quotes.prices if self.refs is None else self.refs
+        # first call starts from a copy of the reference prices
+        prev = anchor.copy() if self._prices is None else self._prices
+
+        # one independent proportional shock per instrument
+        shock = rng.normal(0, cfg.volatility, len(prev))
+        pull = cfg.mean_revert * (anchor - prev)
+        stepped = prev + cfg.drift + pull + shock * prev
+        self._prices = np.maximum(stepped, anchor * 0.5)  # floor at 50% of reference
+
+        return MarketState(competitor_quotes=self._prices.copy(), regime='stochastic', t=t)
+
+@dataclass
+class GBMMarketConfig:
+    """Configuration for GBM market model.
+
+    Attributes:
+        mu: Price drift (expected return) per unit time
+        sigma: Price volatility per sqrt(unit time); also reported as the market's volatility
+        dt: Time step size
+    """
+    mu: float = 0.0
+    sigma: float = 0.1
+    dt: float = 1.0
+
+class GBMMarketModel:
+    """Geometric Brownian Motion model for asset mid-prices.
+
+    Log-normal step: S <- S * exp((mu - sigma^2/2)*dt + sigma*sqrt(dt)*Z),
+    with an independent standard normal Z per instrument.
+
+    Mid-prices start from `initial`, or from the agent's first quotes if omitted.
+    """
+
+    def __init__(self, cfg: GBMMarketConfig | None = None, initial: np.ndarray | None = None):
+        self._mids = initial
+        self.cfg = cfg or GBMMarketConfig()
+
+    def step(self, t: float, self_quotes: Quote, hidden: HiddenState,
+             rng: np.random.Generator) -> MarketState:
+        cfg = self.cfg
+        mids = self_quotes.prices.copy() if self._mids is None else self._mids
+        count = len(mids)
+
+        drift = (cfg.mu - 0.5*cfg.sigma**2)*cfg.dt
+        scale = cfg.sigma*np.sqrt(cfg.dt)
+        self._mids = mids * np.exp(drift + scale*rng.standard_normal(count))
+
+        return MarketState(mid_prices=self._mids.copy(), volatility=np.full(count, cfg.sigma),
+                           regime='gbm', t=t)
diff --git a/lab/population/execution.py b/lab/population/execution.py
new file mode 100644
index 0000000..97484b2
--- /dev/null
+++ b/lab/population/execution.py
@@ -0,0 +1,174 @@
+"""
+Execution models for computing acceptance/fill probabilities.
+
+This module provides different models for how opportunities convert to executions:
+- ElasticityExecutionModel: Price elasticity with competitor cross-effects (retail)
+- IntensityExecutionModel: Distance-based fill intensity (market making)
+- LogitExecutionModel: Discrete choice model
+
+Each model implements the ExecutionModel protocol.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any
+import numpy as np
+from ..outlet.types import Opportunity, Quote, InstrumentSet, MarketState
+from ..outlet.constants import Side
+from ..outlet.math_util import sigmoid, safe_log, intensity_decay, EPS
+
+@dataclass
+class ElasticityConfig:
+    """Configuration for price elasticity execution model.
+
+    Attributes:
+        base_prob: Baseline purchase probability at reference price
+        price_sensitivity: Own-price elasticity coefficient (multiplies log(price/ref))
+        cross_elasticity: Penalty strength when the competitor is cheaper, per (price - comp)/ref
+        scraper_conversion: Multiplier for scraper conversion (typically << 1)
+    """
+    base_prob: float = 0.3
+    price_sensitivity: float = 2.0
+    cross_elasticity: float = 0.5
+    scraper_conversion: float = 0.01
+
+class ElasticityExecutionModel:
+    """Price elasticity model for retail dynamic pricing.
+
+    P(buy) = base_prob * exp(-sensitivity * log(price/ref)) * cross_effect * scraper_mult
+
+    Higher prices reduce purchase probability (a power law in price/ref).
+    Competitor undercutting shifts demand away from the platform.
+    Scrapers convert at a much lower rate (reconnaissance, not purchase).
+    """
+
+    def __init__(self, cfg: ElasticityConfig | None = None):
+        self.cfg = cfg or ElasticityConfig()
+
+    def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet,
+             market: MarketState | None, rng: np.random.Generator) -> float:
+        """Return the purchase probability of `opp` at the quoted price, clipped to [0, 1]."""
+        idx = int(opp.instrument_id)
+        price = quote.prices[idx]
+        ref = instruments.refs[idx]
+
+        # base probability adjusted by price ratio
+        log_ratio = safe_log(price / ref)
+        prob = self.cfg.base_prob * np.exp(-self.cfg.price_sensitivity * log_ratio)
+
+        # cross-elasticity: only penalised when the competitor is strictly cheaper
+        if market and market.competitor_quotes is not None:
+            comp_price = market.competitor_quotes[idx]
+            if comp_price < price:
+                prob *= np.exp(-self.cfg.cross_elasticity * (price - comp_price) / ref)
+
+        # scrapers convert at much lower rate
+        if opp.context.get('is_scraper', False):
+            prob *= self.cfg.scraper_conversion
+
+        return float(np.clip(prob, 0, 1))
+
+    def uncensor(self, fills: np.ndarray, instruments: InstrumentSet,
+                 context: dict[str, Any] | None = None) -> np.ndarray:
+        """Impute demand as fills / base_prob; `instruments` and `context` are unused."""
+        # simple imputation: assume fills = base_prob * demand, invert
+        return fills / (self.cfg.base_prob + EPS)
+
+@dataclass
+class IntensityConfig:
+    """Configuration for intensity-based execution model.
+
+    Attributes:
+        base_intensity: Baseline fill intensity, before distance decay and volatility scaling
+        kappa: Decay rate with distance from mid-price
+        vol_scale: Volatility multiplier: intensity is scaled by (1 + vol_scale * volatility)
+    """
+    base_intensity: float = 1.0
+    kappa: float = 1.5
+    vol_scale: float = 0.5
+
+class IntensityExecutionModel:
+    """Avellaneda-Stoikov style fill intensity for market making.
+
+    P(fill) = clip(base * decay(|exec_price - mid|, kappa) * (1 + vol_scale * sigma), 0, 1)
+
+    BUY opportunities execute against the ask and SELL opportunities against
+    the bid; either falls back to the single quoted price when that side is
+    missing, as does the mid when the market supplies none.
+
+    Tighter spreads (closer to mid) have higher fill probability.
+    Higher volatility increases fill probability (more aggressive traders).
+    """
+
+    def __init__(self, cfg: IntensityConfig | None = None):
+        self.cfg = cfg or IntensityConfig()
+
+    def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet,
+             market: MarketState | None, rng: np.random.Generator) -> float:
+        """Return the fill probability of `opp` at the current quote."""
+        idx = int(opp.instrument_id)
+        cfg = self.cfg
+
+        # reference mid: market mid if available, else our own quoted price
+        has_mid = market and market.mid_prices is not None
+        mid = market.mid_prices[idx] if has_mid else quote.prices[idx]
+
+        # buyers lift the ask, sellers hit the bid
+        side_quotes = quote.asks if opp.side == Side.BUY else quote.bids
+        exec_price = quote.prices[idx] if side_quotes is None else side_quotes[idx]
+
+        # exec_price - mid and mid - exec_price share the same magnitude,
+        # and only the magnitude enters the decay
+        intensity = cfg.base_intensity * intensity_decay(abs(exec_price - mid), cfg.kappa)
+
+        # volatile markets fill more often
+        if market and market.volatility is not None:
+            intensity *= (1 + cfg.vol_scale * market.volatility[idx])
+
+        return float(np.clip(intensity, 0, 1))
+
+    def uncensor(self, fills: np.ndarray, instruments: InstrumentSet,
+                 context: dict[str, Any] | None = None) -> np.ndarray:
+        """Return `fills` unchanged."""
+        return fills  # market making doesn't have same censorship concept
+
+@dataclass
+class LogitConfig:
+    """Configuration for logit discrete choice model.
+
+    Attributes:
+        beta_0: Intercept (base utility)
+        beta_price: Coefficient on the price/ref ratio (typically negative)
+        beta_quality: Coefficient on the instrument's 'quality' attribute
+    """
+    beta_0: float = 0.5
+    beta_price: float = -1.5
+    beta_quality: float = 0.3
+
+class LogitExecutionModel:
+    """Discrete choice logit model for purchase probability.
+
+    Utility: U = beta_0 + beta_price * (price/ref) + beta_quality * quality
+    P(buy) = sigmoid(U)
+
+    Provides a theoretically grounded demand model from economics literature.
+    """
+    DEFAULT_QUALITY = 0.5  # used when an instrument has no 'quality' attribute
+
+    def __init__(self, cfg: LogitConfig | None = None):
+        self.cfg = cfg or LogitConfig()
+
+    def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet,
+             market: MarketState | None, rng: np.random.Generator) -> float:
+        """Return P(buy) = sigmoid(U) for the opportunity's instrument."""
+        idx = int(opp.instrument_id)
+        price, ref = quote.prices[idx], instruments.refs[idx]
+        quality = instruments.instruments[idx].attrs.get('quality', self.DEFAULT_QUALITY)
+        u = self.cfg.beta_0 + self.cfg.beta_price * (price / ref) + self.cfg.beta_quality * quality
+        return float(sigmoid(u))
+
+    def uncensor(self, fills: np.ndarray, instruments: InstrumentSet,
+                 context: dict[str, Any] | None = None) -> np.ndarray:
+        """Impute demand as fills / P(buy at the reference price, default quality)."""
+        # beta_0 is a utility intercept, not a probability -> use sigmoid(U) at price == ref
+        u_ref = self.cfg.beta_0 + self.cfg.beta_price + self.cfg.beta_quality * self.DEFAULT_QUALITY
+        return fills / (float(sigmoid(u_ref)) + EPS)
diff --git a/lab/run_example.py b/lab/run_example.py
new file mode 100644
index 0000000..ebe0f18
--- /dev/null
+++ b/lab/run_example.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+"""Example script demonstrating the Quote-Control platform"""
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+import numpy as np
+from lab.config import make_retail_platform, make_market_making_platform
+from lab.experiments.eval import (rollout, compare_policies, fixed_price_policy,
+                                   cost_plus_margin_policy, random_walk_policy)
+
+def demo_retail():
+    """Compare four baseline pricing policies on the retail platform and print a ranking."""
+    print("=" * 60)
+    print("RETAIL DYNAMIC PRICING DEMO")
+    print("=" * 60)
+
+    platform = make_retail_platform()
+    refs, costs = platform.instruments.refs, platform.instruments.costs
+    print(f"Instruments: {platform.instruments.n}")
+    print(f"Reference prices: {refs[:5].round(2)}...")
+
+    policies = {
+        'fixed': fixed_price_policy(refs),
+        'cost_plus_30%': cost_plus_margin_policy(costs, 0.3),
+        'cost_plus_50%': cost_plus_margin_policy(costs, 0.5),
+        'random_walk': random_walk_policy(refs, 0.03),
+    }
+    results = compare_policies(platform, policies, n_steps=100, n_runs=3)
+
+    print("\nPolicy Comparison (100 steps, 3 runs):")
+    print("-" * 50)
+    for name, r in sorted(results.items(), key=lambda x: -x[1]['mean_pnl']):  # NOTE(review): +/- is std of reward, not PnL
+        print(f"{name:20s} PnL={r['mean_pnl']:8.1f} +/- {r['std_reward']:6.1f}  "
+              f"conv={r['mean_conversion']:.3f}")
+
+def demo_market_making():
+    """Roll out a fixed-mid quoting policy on the market-making platform and print a summary."""
+    print("\n" + "=" * 60)
+    print("MARKET MAKING DEMO")
+    print("=" * 60)
+
+    platform = make_market_making_platform()
+    print(f"Instruments: {platform.instruments.n}")
+    print(f"Initial mids: {platform.instruments.refs.round(2)}")
+
+    def mm_policy(obs: np.ndarray, t: int):
+        # ignores obs: always quotes the reference mids with 1.0 (presumably the spread -- confirm)
+        return platform.instruments.refs, 1.0
+
+    result = rollout(platform, mm_policy, n_steps=200, seed=42)
+    print("\nRollout (200 steps):")
+    print(f"  Total PnL: {result.total_pnl:.2f}")
+    print(f"  Avg conversion: {result.avg_conversion:.3f}")
+    print(f"  Total spread capture: {sum(m.spread_capture for m in result.metrics):.2f}")
+
+if __name__ == '__main__':
+    demo_retail()
+    demo_market_making()

From 19bb4fd517532a0d06de9fa8c073e3cac2acb07f Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Fri, 23 Jan 2026 10:37:48 +0100
Subject: [PATCH 46/99] chore: ignoring build of docs

---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 9101b2f..e06db65 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@
 *.old
 **/package-lock.json
 **/*.parquet
+**/_build/
 
 paper/src/bib/auto
 experiments/airflow/logs/*
@@ -20,4 +21,4 @@ sim/rl/behavior_loader/*.dot
 sim/rl/behavior_loader/*.png
 sim/rl/behavior_loader/*.svg
 sim/rl/behavior_loader/*.pdf
-tests/e2e/node_modules/**
+tests/e2e/node_modules/**
\ No newline at end of file

From b0a164795618199c7f6c4c5306e029b1ad5e7942 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Fri, 23 Jan 2026 12:52:58 +0100
Subject: [PATCH 47/99] docs

---
 lab/README.md                | 74 +++++++++++++++++++++++++++
 lab/docs/index.rst           |  1 +
 lab/docs/system_overview.rst | 97 ++++++++++++++++++++++++++++++++++++
 3 files changed, 172 insertions(+)
 create mode 100644 lab/docs/system_overview.rst

diff --git a/lab/README.md b/lab/README.md
index c4db76a..b5226aa 100644
--- a/lab/README.md
+++ b/lab/README.md
@@ -1 +1,75 @@
 # MOS (Money Operating System)
+
+Research-grade quote-control simulator for studying dynamic pricing and market making policies.
+The system models pricing as a closed loop of **Quote → Arrival → Execution → Position**, enabling
+controlled experimentation with demand models, inventory constraints, and reward shaping.
+
+## Core Loop
+
+1. **Quote** – the policy posts prices (one-sided or two-sided depending on the mechanism).
+2. **Arrival** – a population model generates purchase opportunities or market orders.
+3. **Execution** – an execution model decides whether an arrival converts at the quoted price.
+4. **Position** – inventory/position limits censor fills and generate holding/shortage costs.
+5. **Observation & Reward** – censored fills and aggregate metrics are exposed to the agent, while
+   objectives turn metrics into a scalar reward.
+
+Each stage is pluggable via lightweight protocols so you can swap in alternative mechanisms,
+demand models, or objectives without rewriting the rest of the simulator.
+
+## Package Layout
+
+| Module            | Purpose |
+|-------------------|---------|
+| `lab.outlet`      | Core simulation engine, domain types, pricing mechanisms, objectives. |
+| `lab.population`  | Demand arrival models, execution probability models, competitor/market dynamics. |
+| `lab.experiments` | Rollout utilities, baseline policies, and off-policy evaluation helpers. |
+| `lab.config`      | Convenience factories for preconfigured retail and market-making environments. |
+
+## Preconfigured Scenarios
+
+### Retail Dynamic Pricing
+- Mechanism: posted prices with margin and delta constraints.
+- Arrivals: browsing sessions with contamination support (scrapers).
+- Execution: elasticity model with competitor cross-effects.
+- Position: inventory tracking with holding and shortage costs.
+- Market: reactive competitor that can trigger price wars.
+- Objective: PnL minus volatility, holding cost, and lost opportunity penalties.
+
+```python
+from lab.config import make_retail_platform
+from lab.experiments import rollout, fixed_price_policy
+
+platform = make_retail_platform()
+policy = fixed_price_policy(platform.instruments.refs)
+result = rollout(platform, policy, n_steps=100)
+print(result.total_pnl)
+```
+
+### Market Making
+- Mechanism: two-sided quoting with bid/ask spreads.
+- Arrivals: Hawkes order flow for clustered demand.
+- Execution: Avellaneda–Stoikov style intensity model.
+- Position: inventory risk limits and quadratic penalty objective.
+- Market: geometric Brownian motion mid-price process.
+- Objective: PnL plus spread capture minus inventory risk.
+
+```python
+from lab.config import make_market_making_platform
+from lab.experiments import rollout
+
+platform = make_market_making_platform()
+mm_policy = lambda obs, t: (platform.instruments.refs, 1.0)
+result = rollout(platform, mm_policy, n_steps=200, seed=42)
+print(result.total_pnl)
+```
+
+## Extending the Simulator
+
+- Implement `lab.outlet.protocols.Mechanism` or `ArrivalModel` to introduce new pricing
+domains or demand processes.
+- Compose objectives with `lab.outlet.objectives.factory.make_composite` to study alternate
+reward formulations.
+- Use `lab.experiments.compare_policies` to benchmark candidate policies across multiple
+random seeds.
+
+Comprehensive API documentation lives in `lab/docs` (build with `make html`).
diff --git a/lab/docs/index.rst b/lab/docs/index.rst
index b53fbba..bd36ecd 100644
--- a/lab/docs/index.rst
+++ b/lab/docs/index.rst
@@ -28,6 +28,7 @@ Quick Start
    :maxdepth: 2
    :caption: Contents:
 
+   system_overview
    modules/outlet
    modules/population
    modules/experiments
diff --git a/lab/docs/system_overview.rst b/lab/docs/system_overview.rst
new file mode 100644
index 0000000..3fda8ad
--- /dev/null
+++ b/lab/docs/system_overview.rst
@@ -0,0 +1,97 @@
+System Overview
+===============
+
+The simulator organises dynamic pricing and market-making experiments as a
+closed loop with the following stages:
+
+* **Quote** – a policy or agent emits a :class:`lab.outlet.types.Quote`. The
+  quote is normalised and validated by a concrete
+  :class:`lab.outlet.protocols.Mechanism` implementation
+  (posted-price, two-sided, auction).
+* **Arrival** – a :class:`lab.outlet.protocols.ArrivalModel` samples a stream of
+  :class:`lab.outlet.types.Opportunity` objects given the current time,
+  instrument catalogue, and market state.
+* **Execution** – the :class:`lab.outlet.protocols.ExecutionModel` converts an
+  opportunity into a probabilistic fill using the active quote, optional
+  competitor prices, and demand-side context.
+* **Position** – a :class:`lab.outlet.protocols.PositionModel` enforces
+  inventory or position constraints, censors oversized fills, and accrues
+  holding and shortage costs.
+* **Observation & Reward** – the
+  :class:`lab.outlet.protocols.ObservationBuilder` constructs the censored view
+  exposed to the agent, while a :class:`lab.outlet.protocols.Objective`
+  transforms :class:`lab.outlet.types.StepMetrics` into a scalar reward with an
+  optional breakdown per term.
+
+These components are orchestrated by :class:`lab.outlet.platform.Platform`,
+which manages internal hidden state, deterministic seeding, and logging.
+
+Component Matrix
+----------------
+
+===============================  ==============================================
+Layer                            Responsibilities / Examples
+===============================  ==============================================
+Mechanisms                       Quote normalisation, execution semantics
+                                 (``posted_price``, ``two_sided``, ``auction``).
+Population models                Arrivals (:mod:`lab.population.arrivals`),
+                                 execution probability models
+                                 (:mod:`lab.population.execution`), and
+                                 competitor or market dynamics
+                                 (:mod:`lab.population.competitors`).
+Position management              Inventory limits, replenishment, holding and
+                                 shortage costs (:mod:`lab.outlet.stock`).
+Observation & logging            Censored observations and optional event logs
+                                 (:mod:`lab.outlet.observation`).
+Objectives                       Reward composition utilities
+                                 (:mod:`lab.outlet.objectives`).
+Experiments                      Rollout helpers, baseline policies, off-policy
+                                 evaluation (:mod:`lab.experiments.eval`).
+===============================  ==============================================
+
+Preconfigured Platforms
+-----------------------
+
+Two high-level factories in :mod:`lab.config` wire common combinations of the
+building blocks:
+
+* **Retail dynamic pricing** – posted-price mechanism, session arrivals with
+  contamination, elasticity-based executions, reactive competitor model, and a
+  composite objective that penalises volatility, holding costs, and lost
+  opportunities.
+* **Market making** – two-sided quoting, Hawkes order flow, intensity-based
+  executions, geometric Brownian motion mid-prices, and an objective combining
+  PnL, spread capture, and quadratic inventory risk.
+
+State & Reset Behaviour
+-----------------------
+
+When you call :meth:`lab.outlet.platform.Platform.reset`, the platform resets
+instrument positions, quotes, and hidden state, but component implementations
+may maintain their own internal buffers. For reproducible experiments:
+
+* Use freshly instantiated arrival/market models for each episode, or add explicit
+  ``reset`` methods if the model keeps history (for example,
+  :class:`lab.population.arrivals.HawkesArrivalModel` maintains an event
+  history, while :class:`lab.population.competitors.ReactiveCompetitorModel`
+  tracks prior competitor quotes).
+* Seed randomness through the factory configuration (``RetailConfig.seed`` or
+  ``MarketMakingConfig.seed``) or pass a seed to ``Platform.reset`` for
+  deterministic rollouts.
+
+Extending the Platform
+----------------------
+
+To support a new domain:
+
+1. Create custom Mechanism/Arrival/Execution/Market/Observation components by
+   implementing the respective protocol in :mod:`lab.outlet.protocols`.
+2. Compose a new objective with
+   :func:`lab.outlet.objectives.factory.make_composite` or write a bespoke
+   :class:`lab.outlet.objectives.base.BaseObjective`.
+3. Wire everything together via :class:`lab.outlet.platform.Platform` directly
+   or expose a helper factory in :mod:`lab.config`.
+
+Use :func:`lab.experiments.rollout` and
+:func:`lab.experiments.compare_policies` to benchmark candidate policies under
+multiple random seeds, collecting per-step logs for analysis or OPE.

From 28669ea4c330ef0d6c442651a0db1fd4de587cb8 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Fri, 23 Jan 2026 17:16:32 +0100
Subject: [PATCH 48/99] win: refomulated and re-inspired from library

---
 lab/case/thesis/simplified.py     | 288 +++++++++++++++++++++++++
 lab/case/thesis/simplified_env.py | 338 ++++++++++++++++++++++++++++++
 2 files changed, 626 insertions(+)
 create mode 100644 lab/case/thesis/simplified.py
 create mode 100644 lab/case/thesis/simplified_env.py

diff --git a/lab/case/thesis/simplified.py b/lab/case/thesis/simplified.py
new file mode 100644
index 0000000..3b3838e
--- /dev/null
+++ b/lab/case/thesis/simplified.py
@@ -0,0 +1,288 @@
+"""Minimal implementation of thesis pricing system.
+
+Implements the core loop: prices -> sessions -> demand -> prices
+with behavioral separability and robust pricing objective (Eq 23).
+
+Objects:
+- Session trajectories τ_s from mixture of H/A behavioral profiles
+- Demand proxy q̂ via weighted action aggregation (Eq 2)
+- COI leakage penalty for agent reconnaissance
+- Limbo: alternating price/demand history for trajectory analysis
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Dict, List, Tuple
+import numpy as np
+
+ACTION_WEIGHTS = {"add_to_cart": 0.8, "cart": 0.8, "checkout": 0.9, "purchase": 1.0, "view": 0.15, "detail": 0.25, "hover": 0.3, "start": 0.05, "end": 0.0}  # "cart" is the state name TRANS_H/TRANS_A emit for add-to-cart
+TRANS_H = {"start": {"view": 0.85, "end": 0.15}, "view": {"detail": 0.4, "cart": 0.3, "view": 0.2, "end": 0.1},
+           "detail": {"cart": 0.5, "view": 0.3, "end": 0.2}, "cart": {"purchase": 0.6, "view": 0.25, "end": 0.15}, "purchase": {"end": 1.0}}
+TRANS_A = {"start": {"view": 0.95, "end": 0.05}, "view": {"detail": 0.6, "view": 0.25, "cart": 0.1, "end": 0.05},
+           "detail": {"view": 0.5, "cart": 0.15, "detail": 0.3, "end": 0.05}, "cart": {"view": 0.4, "purchase": 0.2, "end": 0.4}, "purchase": {"end": 1.0}}
+
+
+@dataclass
+class Event:
+    action: str
+    product_idx: int
+    price_seen: float
+    ts: float
+
+
+@dataclass
+class Session:
+    sid: str
+    events: List[Event]
+    actor: str  # H or A (ground truth label)
+    theta: Dict[str, float] = field(default_factory=dict)
+
+
+def compute_demand(session: Session) -> float:
+    """Compute demand proxy q̂ = Σ_k ω(a_k) for session (Eq 2)."""
+    return sum(ACTION_WEIGHTS.get(e.action, 0.1) for e in session.events)
+
+
+def kl_div(p: Dict[str, float], q: Dict[str, float]) -> float:
+    """KL divergence D_KL(p || q) for transition kernels."""
+    eps = 1e-10
+    keys = set(p.keys()) | set(q.keys())
+    return sum(p.get(k, eps) * np.log((p.get(k, eps) + eps) / (q.get(k, eps) + eps)) for k in keys)
+
+
+def build_kernel(events: List[Event]) -> Dict[str, Dict[str, float]]:
+    """Build empirical transition kernel from trajectory."""
+    trans: Dict[str, Dict[str, int]] = {}
+    prev = "start"
+    for e in events:
+        curr = e.action
+        trans.setdefault(prev, {})
+        trans[prev][curr] = trans[prev].get(curr, 0) + 1
+        prev = curr
+    kernel = {}
+    for s, dsts in trans.items():
+        total = sum(dsts.values())
+        kernel[s] = {d: c / total for d, c in dsts.items()} if total > 0 else {}
+    return kernel
+
+
+def compute_divergence(session: Session) -> Tuple[float, float]:
+    """Compute Δ_H, Δ_A divergence signals (Eq 20-21)."""
+    kernel = build_kernel(session.events)
+    delta_h = sum(kl_div(kernel.get(s, {}), TRANS_H.get(s, {})) for s in kernel) / max(len(kernel), 1)
+    delta_a = sum(kl_div(kernel.get(s, {}), TRANS_A.get(s, {})) for s in kernel) / max(len(kernel), 1)
+    return delta_h, delta_a
+
+
+def estimate_alpha(session: Session, beta: float = 2.0) -> float:
+    """Per-session contamination estimate α̂(τ') = σ(β(Δ_H - Δ_A))."""
+    dh, da = compute_divergence(session)
+    return 1.0 / (1.0 + np.exp(-beta * (dh - da))) if (dh + da) > 0 else 0.5
+
+
+def sample_trajectory(rng: np.random.Generator, trans: Dict, prices: np.ndarray, is_agent: bool) -> Tuple[List[Event], int]:
+    """Sample session trajectory from behavioral kernel."""
+    state, t, pidx = "start", 0.0, int(rng.integers(0, len(prices)))
+    events = []
+    while state != "end" and len(events) < 30:
+        if state != "start":
+            events.append(Event(action=state, product_idx=pidx, price_seen=float(prices[pidx]), ts=t))
+        probs = trans.get(state, {"end": 1.0})
+        state = rng.choice(list(probs.keys()), p=list(probs.values()))
+        t += max(0.2, rng.gamma(1.5, 0.8) if is_agent else rng.gamma(2.0, 1.2))
+    return events, pidx
+
+
+def put_prices_to_market(prices: np.ndarray, alpha: float = 0.2, n_sessions: int = 50,
+                         seed: int | None = None) -> Tuple[Dict[str, float], Dict[str, str]]:
+    """Generate sessions from mixture model Q(p) = (1-α)E[d_H] + αE[d_A] (Eq 3).
+
+    Returns:
+        demand_mapping: session_id -> demand proxy q̂
+        hidden_labels: session_id -> actor class (H or A)
+    """
+    rng = np.random.default_rng(seed)
+    demand_mapping, hidden_labels = {}, {}
+
+    for i in range(n_sessions):
+        sid = f"s{i:04d}"
+        is_agent = rng.random() < alpha
+        trans = TRANS_A if is_agent else TRANS_H
+        theta = {"price_sens": rng.uniform(0.05, 0.2), "base_conv": 0.01} if is_agent else {"price_sens": rng.uniform(1.5, 4.0), "base_conv": rng.uniform(0.2, 0.5)}
+        events, _ = sample_trajectory(rng, trans, prices, is_agent)
+        session = Session(sid=sid, events=events, actor="A" if is_agent else "H", theta=theta)
+        demand_mapping[sid] = compute_demand(session)
+        hidden_labels[sid] = session.actor
+
+    return demand_mapping, hidden_labels
+
+
+@dataclass
+class LimboUpdate:
+    utype: str  # "prices" or "demand"
+    data: np.ndarray | Dict[str, float]
+    t: int
+
+
+class Limbo:
+    """Historical trajectory of alternating price/demand observations."""
+
+    def __init__(self):
+        self.history: List[LimboUpdate] = []
+        self._t = 0
+
+    def add_update(self, utype: str, data: np.ndarray | Dict[str, float]) -> Dict:
+        self.history.append(LimboUpdate(utype=utype, data=data, t=self._t))
+        self._t += 1
+        return self.on_update(utype)
+
+    def on_update(self, utype: str) -> Dict:
+        """React to update: after prices -> return observed demand; after demand -> signal price update needed."""
+        if utype == "prices":
+            return {"action": "observe_demand", "msg": "awaiting market response"}
+        return {"action": "set_prices", "msg": "demand observed, update prices"}
+
+    def get_prices_history(self) -> List[np.ndarray]:
+        return [u.data for u in self.history if u.utype == "prices"]
+
+    def get_demand_history(self) -> List[Dict[str, float]]:
+        return [u.data for u in self.history if u.utype == "demand"]
+
+
+class System:
+    """Main pricing system implementing robust Stackelberg objective.
+
+    Manages the alternating loop:
+    1. Set prices p_t
+    2. Observe demand response Q̂(p_t)
+    3. Estimate contamination α from behavioral signals
+    4. Compute next prices via robust objective (Eq 23)
+    """
+
+    def __init__(self, n_products: int = 10, costs: np.ndarray | None = None, lambda_coi: float = 0.5, seed: int | None = 42):
+        self.n = n_products
+        self.rng = np.random.default_rng(seed)
+        self.costs = costs if costs is not None else self.rng.uniform(10, 50, n_products)
+        self.refs = self.costs * (1 + self.rng.uniform(0.2, 0.5, n_products))  # base prices with margin
+        self.lambda_coi = lambda_coi
+        self.limbo = Limbo()
+        self._alpha_est = 0.2  # current contamination estimate
+        self._sessions: List[Session] = []
+
+    @property
+    def alpha(self) -> float:
+        return self._alpha_est
+
+    def _estimate_alpha_from_sessions(self) -> float:
+        """Aggregate per-session α̂ estimates."""
+        if not self._sessions:
+            return self._alpha_est
+        alphas = [estimate_alpha(s) for s in self._sessions[-50:]]  # use recent sessions
+        return float(np.mean(alphas))
+
+    def _revenue_under_demand(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
+        """Return R(p, d): credit each session's demand proxy to the product of its first event, then dot with prices."""
+        agg_demand = np.zeros(self.n)
+        # session ids restart at s0000 on every step, so keep only the most recent session per id
+        latest = {s.sid: s for s in self._sessions}
+        for sid, q in demand.items():
+            sess = latest.get(sid)
+            if sess is not None and sess.events:  # sessions without events have no product to credit
+                agg_demand[sess.events[0].product_idx] += q
+        return float(np.dot(prices, agg_demand))
+
+    def _coi_leakage(self, prices: np.ndarray) -> float:
+        """COI_leak = α · InfoValue (query-tax surrogate)."""
+        return self._alpha_est * 1.0
+
+    def _objective(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
+        """Robust objective: R(p,d) - λ·COI_leak (Eq 23 simplified)."""
+        revenue = self._revenue_under_demand(prices, demand)
+        cost = float(np.sum(self.costs))  # fixed cost approximation
+        profit = revenue - cost
+        coi_penalty = self.lambda_coi * self._coi_leakage(prices) * float(np.mean(prices - self.costs))
+        return profit - coi_penalty
+
+    def compute_prices(self, demand: Dict[str, float] | None = None) -> np.ndarray:
+        """Compute next prices via simple gradient-like update on robust objective.
+
+        In a full implementation this would be replaced by DR-RL policy output.
+        Here we use a heuristic: adjust margins based on α estimate.
+        """
+        self._alpha_est = self._estimate_alpha_from_sessions()
+
+        # base margin adjustment: higher α -> lower margins (defensive pricing)
+        margin_scale = 1.0 - 0.5 * self._alpha_est  # reduce margins under high contamination
+        margins = (self.refs - self.costs) * margin_scale
+
+        # add small noise for exploration
+        noise = self.rng.normal(0, 0.02, self.n) * self.costs
+        prices = np.clip(self.costs + margins + noise, self.costs * 1.02, self.refs * 1.3)
+
+        self.limbo.add_update("prices", prices)
+        return prices
+
+    def observe_demand(self, prices: np.ndarray, alpha_true: float = 0.2, n_sessions: int = 50) -> Dict[str, float]:
+        """Sample n_sessions sessions at `prices`, log the demand map to limbo, and return it (session_id -> q̂)."""
+        demand_map, labels = put_prices_to_market(prices, alpha=alpha_true, n_sessions=n_sessions, seed=int(self.rng.integers(0, 10000)))
+
+        # NOTE: trajectories are re-sampled per label for α estimation; they are not the ones behind demand_map
+        for sid, actor in labels.items():
+            events, _ = sample_trajectory(self.rng, TRANS_A if actor == "A" else TRANS_H, prices, actor == "A")
+            self._sessions.append(Session(sid=sid, events=events, actor=actor))
+
+        self.limbo.add_update("demand", demand_map)
+        return demand_map
+
+    def step(self, alpha_true: float = 0.2, n_sessions: int = 50) -> Tuple[np.ndarray, Dict[str, float], float]:
+        """Single simulation step: prices -> demand -> reward."""
+        demand_hist = self.limbo.get_demand_history()
+        prices = self.compute_prices(demand_hist[-1] if demand_hist else None)
+        demand = self.observe_demand(prices, alpha_true, n_sessions)
+        reward = self._objective(prices, demand)
+        return prices, demand, reward
+
+    def run(self, n_steps: int = 100, alpha_true: float = 0.2) -> Dict:
+        """Run simulation for n_steps, return trajectory."""
+        trajectory = {"prices": [], "demand": [], "rewards": [], "alpha_est": [], "alpha_true": alpha_true}
+        for _ in range(n_steps):
+            p, d, r = self.step(alpha_true)
+            trajectory["prices"].append(p)
+            trajectory["demand"].append(d)
+            trajectory["rewards"].append(r)
+            trajectory["alpha_est"].append(self._alpha_est)
+        return trajectory
+
+
+def coi_erosion(n_agents: int, price_std: float) -> float:
+    """COI erosion from Theorem 1: as N->inf, min(p_1..p_N)->p_min."""
+    if n_agents <= 1:
+        return 0.0
+    log_n = np.log(n_agents)
+    shift = price_std * (np.sqrt(2 * log_n) - (np.log(log_n) + np.log(4 * np.pi)) / (2 * np.sqrt(2 * log_n) + 1e-6))
+    return float(min(shift / (price_std * 2 + 1e-6), 1.0))
+
+
+if __name__ == "__main__":
+    # quick demo
+    sys = System(n_products=5, seed=42)
+    traj = sys.run(n_steps=20, alpha_true=0.25)
+    print(f"avg reward: {np.mean(traj['rewards']):.2f}, final α̂: {traj['alpha_est'][-1]:.3f}")
+
+    prices = np.array([20.0, 35.0, 50.0, 25.0, 40.0])
+    demand, labels = put_prices_to_market(prices, alpha=0.3, n_sessions=20, seed=123)
+    print(f'sessions: {len(demand)}, agents: {sum(1 for l in labels.values() if l=="A")}')
+
+    for n in [1, 5, 10, 50, 100]:
+        ero = coi_erosion(n, price_std=5.0)
+        print(f'N={n:3d} agents -> COI erosion: {ero:.3f}')
+
+    # test separability
+    events = [Event('view', 0, 20.0, 0.1), Event('detail', 0, 20.0, 0.5), Event('cart', 0, 20.0, 1.0),
+    Event('purchase', 0, 20.0, 2.0)]
+    sess_h = Session(sid='test', events=events, actor='H')
+    print(f'human-like session α̂: {estimate_alpha(sess_h):.3f}')
+
+    events_a = [Event('view', 0, 20.0, 0.1), Event('detail', 0, 20.0, 0.2), Event('view', 0, 20.0, 0.3),
+    Event('detail', 0, 20.0, 0.4)]
+    sess_a = Session(sid='test2', events=events_a, actor='A')
+    print(f'agent-like session α̂: {estimate_alpha(sess_a):.3f}')
diff --git a/lab/case/thesis/simplified_env.py b/lab/case/thesis/simplified_env.py
new file mode 100644
index 0000000..e18454b
--- /dev/null
+++ b/lab/case/thesis/simplified_env.py
@@ -0,0 +1,338 @@
+"""Gymnasium-compatible RL environment for thesis pricing system.
+
+Wraps simplified.System with standard Gym interface for training pricing policies.
+Supports multiple reward modes and contamination scenarios.
+
+Action: price multipliers [0.5, 1.5] applied to reference prices
+Observation: [price_ratio, demand_share, alpha_est, alpha_true, margins, t_norm]
+Reward: configurable objective (revenue, profit, robust, coi-aware)
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any, Dict, Tuple
+import numpy as np
+
+try:
+    import gymnasium as gym
+    from gymnasium import spaces
+    HAS_GYM = True
+except ImportError:
+    HAS_GYM = False
+
+from .simplified import (System, Session, Event, Limbo, put_prices_to_market,
+                         compute_demand, estimate_alpha, coi_erosion, TRANS_H, TRANS_A)
+
+
+@dataclass
+class EnvConfig:
+    """Configuration for pricing environment."""
+    n_products: int = 5
+    max_steps: int = 200
+    sessions_per_step: int = 30
+    alpha_true: float = 0.2           # true contamination level
+    alpha_drift: float = 0.0          # per-step drift in α
+    alpha_bounds: Tuple[float, float] = (0.0, 0.6)
+    lambda_coi: float = 0.5           # COI penalty weight
+    lambda_vol: float = 0.1           # volatility penalty weight
+    reward_mode: str = "robust"       # revenue | profit | robust | coi_aware
+    normalize_reward: bool = True
+    seed: int | None = 42
+
+
+class PricingEnv:
+    """RL environment for dynamic pricing under agent contamination.
+
+    Implements the thesis formulation where:
+    - Platform sets prices p_t
+    - Market responds with mixture demand Q(p) = (1-α)D_H + αD_A
+    - Platform (System) estimates contamination α̂ from behavioral signals
+    - Reward balances profit vs COI leakage
+
+    Observation space (normalized):
+        [0:n]     - current prices / ref_prices
+        [n:2n]    - aggregated demand per product, as a share of total demand
+        [2n]      - estimated contamination α̂
+        [2n+1]    - true contamination α (ground truth; always included by _build_obs)
+        [2n+2:3n+2] - current margins (prices - costs) / costs
+        [3n+2]    - step / max_steps
+
+    Action space:
+        price multipliers in [0.5, 1.5] applied to reference prices
+    """
+
+    metadata = {"render_modes": ["human", "ansi"]}
+
+    def __init__(self, cfg: EnvConfig | None = None):
+        if not HAS_GYM:
+            raise ImportError("gymnasium required")
+        self.cfg = cfg or EnvConfig()
+        self.n = self.cfg.n_products
+        self._sys: System | None = None
+        self._t = 0
+        self._alpha = self.cfg.alpha_true
+        self._last_prices: np.ndarray | None = None
+        self._last_demand: Dict[str, float] | None = None
+        self._episode_rewards: list[float] = []
+        self._demand_agg = np.zeros(self.n)
+
+        # gymnasium spaces
+        self.action_space = spaces.Box(low=0.5, high=1.5, shape=(self.n,), dtype=np.float32)
+        obs_dim = self.n + self.n + 1 + 1 + self.n + 1  # prices + demand + α̂ + α + margins + t
+        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float32)
+
+    def _build_obs(self) -> np.ndarray:
+        """Construct observation vector."""
+        if self._sys is None:
+            return np.zeros(self.observation_space.shape[0], dtype=np.float32)
+
+        prices = self._last_prices if self._last_prices is not None else self._sys.refs
+        price_ratio = prices / (self._sys.refs + 1e-6)
+        demand_norm = self._demand_agg / (np.sum(self._demand_agg) + 1e-6)
+        margins = (prices - self._sys.costs) / (self._sys.costs + 1e-6)
+        t_norm = self._t / self.cfg.max_steps
+
+        obs = np.concatenate([
+            price_ratio,                          # [0:n]
+            demand_norm,                          # [n:2n]
+            [self._sys.alpha],                    # [2n] estimated α̂
+            [self._alpha],                        # [2n+1] true α
+            margins,                              # [2n+2:3n+2]
+            [t_norm],                             # [3n+2]
+        ])
+        return obs.astype(np.float32)
+
+    def _compute_reward(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
+        """Compute the reward for the configured mode; also refreshes ``self._demand_agg`` as a side effect."""
+        cfg, sys = self.cfg, self._sys
+        if sys is None:
+            return 0.0
+
+        # aggregate demand per product; session ids restart at s0000 each step, so search newest-first
+        agg = np.zeros(self.n)
+        for sid, q in demand.items():
+            sess = next((s for s in reversed(sys._sessions) if s.sid == sid), None)
+            if sess and sess.events:
+                pidx = sess.events[0].product_idx
+                agg[pidx] += q
+        self._demand_agg = agg
+
+        revenue = float(np.dot(prices, agg))
+        cost = float(np.dot(sys.costs, np.clip(agg, 0, 1)))  # simplified cost model
+        profit = revenue - cost
+
+        # volatility penalty (price changes)
+        vol_penalty = 0.0
+        if self._last_prices is not None:
+            price_change = np.abs(prices - self._last_prices) / (sys.refs + 1e-6)
+            vol_penalty = cfg.lambda_vol * float(np.mean(price_change))
+
+        # COI leakage penalty
+        avg_margin = float(np.mean(prices - sys.costs))
+        coi_leak = sys.alpha * avg_margin
+
+        if cfg.reward_mode == "revenue":
+            r = revenue
+        elif cfg.reward_mode == "profit":
+            r = profit
+        elif cfg.reward_mode == "robust":
+            # robust objective: profit - λ_coi * COI_leak - λ_vol * volatility
+            r = profit - cfg.lambda_coi * coi_leak - vol_penalty
+        elif cfg.reward_mode == "coi_aware":
+            # adaptive: heavier penalty at high contamination
+            adaptive_lambda = cfg.lambda_coi * (1 + 2 * sys.alpha)
+            r = profit - adaptive_lambda * coi_leak - vol_penalty
+        else:
+            r = profit
+
+        if cfg.normalize_reward:
+            r = r / (float(np.sum(sys.refs)) + 1e-6)  # normalize by potential revenue
+
+        return float(r)
+
+    def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
+        """Reset environment to initial state."""
+        seed = seed if seed is not None else self.cfg.seed
+        self._sys = System(n_products=self.n, lambda_coi=self.cfg.lambda_coi, seed=seed)
+        self._t = 0
+        self._alpha = self.cfg.alpha_true
+        self._last_prices = None
+        self._last_demand = None
+        self._episode_rewards = []
+        self._demand_agg = np.zeros(self.n)
+
+        info = {"alpha_true": self._alpha, "alpha_est": self._sys.alpha,
+                "costs": self._sys.costs.copy(), "refs": self._sys.refs.copy()}
+        return self._build_obs(), info
+
+    def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]:
+        """Execute one environment step.
+
+        Args:
+            action: per-product price multipliers; clipped to [0.5, 1.5] and applied to reference prices
+
+        Returns:
+            obs, reward, terminated, truncated, info
+        """
+        if self._sys is None:
+            raise RuntimeError("call reset() first")
+
+        # convert action to prices, floored just above cost and capped at twice the reference price
+        action = np.clip(action, 0.5, 1.5)
+        prices = self._sys.refs * action.astype(np.float64)
+        prices = np.clip(prices, self._sys.costs * 1.01, self._sys.refs * 2.0)
+
+        # drift contamination
+        if self.cfg.alpha_drift != 0:
+            self._alpha = np.clip(
+                self._alpha + self.cfg.alpha_drift * self._sys.rng.normal(),
+                *self.cfg.alpha_bounds)
+
+        # log the posted prices first so limbo keeps its prices -> demand ordering, then observe demand
+        self._sys.limbo.add_update("prices", prices)
+        demand = self._sys.observe_demand(prices, alpha_true=self._alpha, n_sessions=self.cfg.sessions_per_step)
+
+        # update α estimate
+        self._sys._alpha_est = self._sys._estimate_alpha_from_sessions()
+
+        reward = self._compute_reward(prices, demand)
+        self._episode_rewards.append(reward)
+
+        self._last_prices = prices.copy()
+        self._last_demand = demand
+        self._t += 1
+
+        terminated = self._t >= self.cfg.max_steps
+        truncated = False
+
+        info = {
+            "alpha_true": self._alpha,
+            "alpha_est": self._sys.alpha,
+            "revenue": float(np.dot(prices, self._demand_agg)),
+            "avg_margin": float(np.mean((prices - self._sys.costs) / self._sys.costs)),
+            "n_sessions": len(demand),
+            "coi_erosion": coi_erosion(int(self._alpha * self.cfg.sessions_per_step), float(np.std(prices))),
+        }
+
+        return self._build_obs(), reward, terminated, truncated, info
+
+    def render(self, mode: str = "human") -> str | None:
+        """Render environment state."""
+        if self._sys is None or self._last_prices is None:
+            return None
+
+        lines = [
+            f"t={self._t}/{self.cfg.max_steps}",
+            f"α_true={self._alpha:.3f} α̂={self._sys.alpha:.3f}",
+            f"prices: {self._last_prices.round(1)}",
+            f"demand: {self._demand_agg.round(2)}",
+            f"reward: {self._episode_rewards[-1] if self._episode_rewards else 0:.3f}",
+        ]
+        out = " | ".join(lines)
+        if mode == "human":
+            print(out)
+        return out
+
+    def close(self) -> None:
+        pass
+
+
+class ContaminationSweepEnv(PricingEnv):
+    """Environment that sweeps through contamination levels during training.
+
+    Useful for curriculum learning: start with low α, gradually increase.
+    """
+
+    def __init__(self, cfg: EnvConfig | None = None, alpha_schedule: list[float] | None = None):
+        super().__init__(cfg)
+        self._schedule = alpha_schedule or [0.1, 0.2, 0.3, 0.4, 0.5]
+        self._schedule_idx = 0
+
+    def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
+        # advance schedule on reset
+        if options and options.get("advance_schedule", False):
+            self._schedule_idx = (self._schedule_idx + 1) % len(self._schedule)
+        self.cfg.alpha_true = self._schedule[self._schedule_idx]
+        return super().reset(seed, options)
+
+
+class AdversarialEnv(PricingEnv):
+    """Environment with adversarial contamination dynamics.
+
+    The contamination level responds to pricing policy: if prices are too predictable,
+    agents learn to exploit and α increases.
+    """
+
+    def __init__(self, cfg: EnvConfig | None = None, exploitation_rate: float = 0.02):
+        super().__init__(cfg)
+        self._exploit_rate = exploitation_rate
+        self._price_history: list[np.ndarray] = []
+
+    def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]:
+        obs, reward, term, trunc, info = super().step(action)
+        # NOTE: obs and info["alpha_true"] were built by the base step, before the α update below
+        # track price history for predictability
+        if self._last_prices is not None:
+            self._price_history.append(self._last_prices.copy())
+
+        # increase α if prices are predictable (low per-product variance over recent history)
+        predictability = 0.0
+        if len(self._price_history) > 10:
+            recent = np.array(self._price_history[-10:])
+            # std along the time axis only; a flattened std would mostly measure price spread across products
+            predictability = 1.0 / (float(np.mean(np.std(recent, axis=0))) + 0.1)
+            self._alpha = np.clip(
+                self._alpha + self._exploit_rate * predictability * self._sys.rng.random(),
+                *self.cfg.alpha_bounds)
+        info["predictability"] = predictability
+        return obs, reward, term, trunc, info
+
+    def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
+        self._price_history = []
+        return super().reset(seed, options)
+
+
+def make_env(cfg: EnvConfig | None = None, env_type: str = "standard") -> PricingEnv:
+    """Factory for creating pricing environments."""
+    if env_type == "sweep":
+        return ContaminationSweepEnv(cfg)
+    elif env_type == "adversarial":
+        return AdversarialEnv(cfg)
+    return PricingEnv(cfg)
+
+
+# simple baseline policies for benchmarking
+def fixed_price_policy(refs: np.ndarray, margin: float = 0.0) -> np.ndarray:
+    """Fixed markup policy: return the constant multiplier (1 + margin) per product; only len(refs) is used."""
+    return np.ones(len(refs), dtype=np.float32) * (1.0 + margin)
+
+
+def random_policy(n: int, rng: np.random.Generator | None = None) -> np.ndarray:
+    """Random policy for exploration baseline."""
+    rng = rng or np.random.default_rng()
+    return rng.uniform(0.7, 1.3, n).astype(np.float32)
+
+
+def adaptive_policy(obs: np.ndarray, n: int, base_margin: float = 0.1) -> np.ndarray:
+    """Simple adaptive policy: reduce margins when α̂ is high."""
+    alpha_est = obs[2 * n]  # α̂ is at position 2n in observation
+    margin_scale = 1.0 - 0.4 * alpha_est  # defensive when α̂ high
+    return np.ones(n, dtype=np.float32) * (1.0 + base_margin * margin_scale)
+
+
+if __name__ == "__main__":
+    # demo run
+    cfg = EnvConfig(n_products=100, max_steps=100, alpha_true=0.25, reward_mode="robust")
+    env = make_env(cfg)
+    obs, info = env.reset()
+    print(f"initial: α={info['alpha_true']:.2f}")
+
+    total_reward = 0.0
+    for t in range(cfg.max_steps):
+        action = adaptive_policy(obs, cfg.n_products)
+        obs, reward, done, _, info = env.step(action)
+        total_reward += reward
+        if t % 10 == 0:
+            env.render()
+        if done:
+            break
+
+    print(f"\ntotal reward: {total_reward:.2f}, final α̂: {info['alpha_est']:.3f}")

From c5eae179245d6dceb74f454b40b5da7c6ed46159 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Sat, 24 Jan 2026 13:20:42 +0100
Subject: [PATCH 49/99] simple baselines and training setup to be refactored

---
 lab/case/thesis/simplified_env.py |  21 +-
 lab/case/thesis/train.py          | 474 ++++++++++++++++++++++++++++++
 2 files changed, 492 insertions(+), 3 deletions(-)
 create mode 100644 lab/case/thesis/train.py

diff --git a/lab/case/thesis/simplified_env.py b/lab/case/thesis/simplified_env.py
index e18454b..af4af87 100644
--- a/lab/case/thesis/simplified_env.py
+++ b/lab/case/thesis/simplified_env.py
@@ -39,7 +39,7 @@ class EnvConfig:
     seed: int | None = 42
 
 
-class PricingEnv:
+class PricingEnv(gym.Env if HAS_GYM else object):
     """RL environment for dynamic pricing under agent contamination.
 
     Implements the thesis formulation where:
@@ -204,13 +204,28 @@ class PricingEnv:
         terminated = self._t >= self.cfg.max_steps
         truncated = False
 
+        # compute metrics for tracking
+        revenue = float(np.dot(prices, self._demand_agg))
+        cost = float(np.dot(self._sys.costs, np.clip(self._demand_agg, 0, 1)))
+        profit = revenue - cost
+        n_agents = int(self._alpha * self.cfg.sessions_per_step)
+        price_std = float(np.std(prices))
+
         info = {
             "alpha_true": self._alpha,
             "alpha_est": self._sys.alpha,
-            "revenue": float(np.dot(prices, self._demand_agg)),
+            "alpha_error": abs(self._alpha - self._sys.alpha),
+            "revenue": revenue,
+            "profit": profit,
+            "cost": cost,
             "avg_margin": float(np.mean((prices - self._sys.costs) / self._sys.costs)),
             "n_sessions": len(demand),
-            "coi_erosion": coi_erosion(int(self._alpha * self.cfg.sessions_per_step), float(np.std(prices))),
+            "n_agents": n_agents,
+            "price_std": price_std,
+            "coi_erosion": coi_erosion(max(1, n_agents), price_std),
+            "coi_leakage": self._sys.alpha * float(np.mean(prices - self._sys.costs)),
+            "cumulative_reward": sum(self._episode_rewards),
+            "step": self._t,
         }
 
         return self._build_obs(), reward, terminated, truncated, info
diff --git a/lab/case/thesis/train.py b/lab/case/thesis/train.py
new file mode 100644
index 0000000..f6fb7d4
--- /dev/null
+++ b/lab/case/thesis/train.py
@@ -0,0 +1,474 @@
+"""RL training for thesis pricing system with COI tracking.
+
+Trains pricing policies using stable-baselines3 with TensorBoard logging.
+Demonstrates COI leakage under different contamination levels and policies.
+
+Usage:
+    python -m lab.case.thesis.train --algo ppo --alpha 0.3 --steps 100000
+    tensorboard --logdir lab/case/thesis/runs
+"""
+from __future__ import annotations
+import argparse
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Dict, List, Callable, Any
+import numpy as np
+
+try:
+    from stable_baselines3 import PPO, SAC, A2C
+    from stable_baselines3.common.callbacks import BaseCallback, EvalCallback
+    from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
+    from stable_baselines3.common.monitor import Monitor
+    from stable_baselines3.common.logger import configure
+    HAS_SB3 = True
+except ImportError:
+    HAS_SB3 = False
+
+try:
+    from torch.utils.tensorboard import SummaryWriter
+    HAS_TB = True
+except ImportError:
+    HAS_TB = False
+
+from .simplified_env import PricingEnv, EnvConfig, make_env, adaptive_policy, fixed_price_policy, random_policy
+from .simplified import coi_erosion
+
+
+class BaselinePolicy:
+    """Adapter giving a hand-written policy function the predict/learn/save surface used for SB3 models."""
+
+    def __init__(self, policy_fn, name: str):
+        self.policy_fn = policy_fn  # called as policy_fn(obs, n_products) -> action
+        self.name = name
+        self.num_timesteps = 0  # overwritten by learn()
+
+    def predict(self, obs, deterministic: bool = True):  # `deterministic` is accepted but unused
+        n_products = (len(obs) - 3) // 3  # inverts len(obs) == 3n + 3; layout assumed from the env, verify
+        return self.policy_fn(obs, n_products), None  # (action, state) pair; state is always None
+
+    def learn(self, total_timesteps: int, callback=None, progress_bar: bool = False):
+        """Record the step budget without training and return self."""
+        self.num_timesteps = total_timesteps
+        return self
+
+    def save(self, path):
+        """Do nothing: a baseline has no parameters to persist."""
+
+    @staticmethod
+    def load(path):
+        raise NotImplementedError("baselines cannot be loaded")
+
+
+def myopic_policy(obs: np.ndarray, n: int, greed: float = 0.3) -> np.ndarray:
+    """Myopic baseline: one shared price multiplier that grows with the observed demand.
+
+    Reads only obs[n:2n] (0.5 is assumed when obs is too short); the alpha estimate is ignored.
+    Returns a float32 vector of length n holding the multiplier clipped to [0.5, 1.5].
+    """
+    demand_norm = obs[n:2*n] if len(obs) > 2*n else np.ones(n) * 0.5
+    multiplier = 1.0 + greed * (1 + np.mean(demand_norm))
+    # np.full pins float32; multiplying a float32 array by the NumPy scalar could promote to float64
+    return np.full(n, np.clip(multiplier, 0.5, 1.5), dtype=np.float32)
+
+
+def random_myopic_policy(obs: np.ndarray, n: int, rng: np.random.Generator | None = None) -> np.ndarray:
+    """Random myopic: n iid float32 multipliers drawn from [0.8, 1.4) each call; obs is ignored.
+
+    Pass `rng` for reproducible draws; None keeps using the global np.random stream as before.
+    """
+    return (np.random if rng is None else rng).uniform(0.8, 1.4, n).astype(np.float32)
+
+
+@dataclass
+class TrainConfig:
+    """Training configuration (field order is part of the positional constructor)."""
+    algo: str = "ppo"  # ppo | sac | a2c, or a baseline: myopic | random_myopic | fixed | adaptive
+    total_timesteps: int = 100_000  # step budget passed to learn() / the baseline loop
+    n_envs: int = 4  # number of training sub-envs
+    eval_freq: int = 5000  # forwarded to EvalCallback
+    n_eval_episodes: int = 10  # forwarded to EvalCallback
+    log_dir: str = "lab/case/thesis/runs"
+    seed: int = 42
+    # env config
+    n_products: int = 10
+    max_steps: int = 200
+    alpha_true: float = 0.2  # true contamination level handed to EnvConfig
+    reward_mode: str = "robust"
+    # baseline sweep
+    run_baselines: bool = True
+    alpha_sweep: List[float] = field(default_factory=lambda: [0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+
+
+class COICallback(BaseCallback if HAS_SB3 else object):
+    """Buffer per-step COI/alpha/economic info and log rollout means to TensorBoard.
+
+    Falls back to `object` as base when SB3 is absent so importing this module cannot raise NameError.
+    """
+
+    def __init__(self, writer: Any = None, verbose: int = 0):
+        super().__init__(verbose)
+        self._writer = writer  # object exposing add_scalar(tag, value, step), or None
+        self._episode_coi_leak = []  # info['coi_erosion'] values since the last rollout end
+        self._episode_alpha_err = []  # |alpha_true - alpha_est| since the last rollout end
+        self._episode_revenues = []
+        self._episode_margins = []
+
+    def _on_step(self) -> bool:
+        """Collect metrics from each sub-env info dict; returning True keeps training running."""
+        for info in self.locals.get('infos', []):
+            if 'alpha_true' in info and 'alpha_est' in info:
+                self._episode_alpha_err.append(abs(info['alpha_true'] - info['alpha_est']))
+            if 'coi_erosion' in info:
+                self._episode_coi_leak.append(info['coi_erosion'])
+            if 'revenue' in info:
+                self._episode_revenues.append(info['revenue'])
+            if 'avg_margin' in info:
+                self._episode_margins.append(info['avg_margin'])
+        return True
+
+    def _on_rollout_end(self) -> None:
+        """Log buffered means (plus max erosion) if a writer exists, then reset every buffer."""
+        buffers = {
+            'coi/erosion_mean': self._episode_coi_leak,
+            'alpha/estimation_error': self._episode_alpha_err,
+            'economics/revenue_mean': self._episode_revenues,
+            'economics/margin_mean': self._episode_margins,
+        }
+        if self._writer is not None and self._episode_coi_leak:
+            self._writer.add_scalar('coi/erosion_max', np.max(self._episode_coi_leak), self.num_timesteps)
+        for tag, values in buffers.items():
+            if self._writer is not None and values:
+                self._writer.add_scalar(tag, np.mean(values), self.num_timesteps)
+            values.clear()  # also without a writer; the old early return let buffers grow forever
+
+
+def run_baseline_with_logging(model: BaselinePolicy, vec_env, total_timesteps: int, writer: Any) -> None:
+    """Step a baseline policy through a vectorized env and log the same scalar tags as RL training."""
+    n_envs = vec_env.num_envs
+    running_return = np.zeros(n_envs)  # reward accumulated in each sub-env's current episode
+    finished_returns, coi_hist, alpha_err_hist, revenue_hist = [], [], [], []
+    # tag -> per-step history; the mean of the last 100 entries is what gets logged
+    tracked = {'coi/erosion_mean': coi_hist, 'alpha/estimation_error': alpha_err_hist, 'economics/revenue_mean': revenue_hist}
+
+    obs = vec_env.reset()
+    step = 0
+    while step < total_timesteps:
+        actions = np.array([model.predict(env_obs)[0] for env_obs in obs])
+        obs, rewards, dones, infos = vec_env.step(actions)
+        step += n_envs
+        running_return += rewards
+
+        for i, (info, done) in enumerate(zip(infos, dones)):
+            if 'coi_erosion' in info:
+                coi_hist.append(info['coi_erosion'])
+            if 'alpha_true' in info and 'alpha_est' in info:
+                alpha_err_hist.append(abs(info['alpha_true'] - info['alpha_est']))
+            if 'revenue' in info:
+                revenue_hist.append(info['revenue'])
+            if done:
+                finished_returns.append(running_return[i])
+                running_return[i] = 0.0
+
+        if writer and len(finished_returns) >= 5 and step % 1000 < n_envs:
+            writer.add_scalar('rollout/ep_rew_mean', np.mean(finished_returns[-10:]), step)
+            for tag, history in tracked.items():
+                if history:
+                    writer.add_scalar(tag, np.mean(history[-100:]), step)
+
+        if step % 10000 < n_envs:
+            avg_reward = np.mean(finished_returns[-20:]) if finished_returns else 0
+            print(f"  step {step}/{total_timesteps}, avg_reward={avg_reward:.2f}")
+
+
+def make_vec_env(cfg: TrainConfig, n_envs: int = 1) -> DummyVecEnv:
+    """Create a DummyVecEnv of Monitor-wrapped pricing envs, each sub-env seeded with cfg.seed + rank."""
+    def _make(rank: int = 0):
+        # offset the seed by rank so parallel sub-envs do not replay identical random streams
+        env_cfg = EnvConfig(
+            n_products=cfg.n_products, max_steps=cfg.max_steps,
+            alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed + rank)
+        return Monitor(make_env(env_cfg))
+    return DummyVecEnv([lambda rank=rank: _make(rank) for rank in range(n_envs)])
+
+
+def run_baseline(
+    policy_fn: Callable[[np.ndarray, int], np.ndarray],
+    env: PricingEnv,
+    n_episodes: int = 20,
+    name: str = "baseline"
+) -> Dict[str, float]:
+    """Roll out `policy_fn` for `n_episodes` and summarize reward, COI erosion and alpha error.
+
+    Returns a dict keyed '<name>/reward_mean', '<name>/reward_std', '<name>/coi_erosion' and
+    '<name>/alpha_error'; the last two are means of per-episode means (0.0 when never reported).
+    """
+    returns, coi_means, alpha_err_means = [], [], []
+
+    for _ in range(n_episodes):
+        obs, _info = env.reset()
+        finished, ep_return, coi_trace, alpha_err_trace = False, 0.0, [], []
+        while not finished:
+            obs, reward, terminated, truncated, info = env.step(policy_fn(obs, env.n))
+            finished = terminated or truncated
+            ep_return += reward
+            if 'coi_erosion' in info:
+                coi_trace.append(info['coi_erosion'])
+            if 'alpha_true' in info and 'alpha_est' in info:
+                alpha_err_trace.append(abs(info['alpha_true'] - info['alpha_est']))
+
+        returns.append(ep_return)
+        if coi_trace:
+            coi_means.append(np.mean(coi_trace))
+        if alpha_err_trace:
+            alpha_err_means.append(np.mean(alpha_err_trace))
+
+    summary = {'reward_mean': np.mean(returns), 'reward_std': np.std(returns),
+               'coi_erosion': np.mean(coi_means) if coi_means else 0.0,
+               'alpha_error': np.mean(alpha_err_means) if alpha_err_means else 0.0}
+    return {f'{name}/{key}': value for key, value in summary.items()}
+
+
+def run_coi_demonstration(writer: Any, cfg: TrainConfig) -> Dict[str, Dict[str, float]]:
+    """Sweep cfg.alpha_sweep, running the fixed and adaptive baselines at each contamination level.
+
+    Returns {'alpha_<a>': {fixed_reward, adaptive_reward, fixed_coi, adaptive_coi, theoretical_erosion}};
+    a truthy `writer` also gets these under 'baseline/*' at global_step round(alpha * 100).
+    """
+    results = {}
+
+    for alpha in cfg.alpha_sweep:
+        env = make_env(EnvConfig(
+            n_products=cfg.n_products, max_steps=cfg.max_steps,
+            alpha_true=alpha, reward_mode=cfg.reward_mode, seed=cfg.seed))
+        fixed_name, adaptive_name = f"fixed_alpha{alpha:.1f}", f"adaptive_alpha{alpha:.1f}"
+
+        fixed_metrics = run_baseline(
+            lambda obs, n: fixed_price_policy(np.ones(n), margin=0.15),
+            env, n_episodes=10, name=fixed_name)
+        adaptive_metrics = run_baseline(
+            lambda obs, n: adaptive_policy(obs, n, base_margin=0.15),
+            env, n_episodes=10, name=adaptive_name)
+
+        # theoretical erosion; 30 is presumably sessions per step -- TODO confirm against EnvConfig
+        n_agents = int(alpha * cfg.max_steps * 30)  # rough agent count
+        row = {
+            'fixed_reward': fixed_metrics[f"{fixed_name}/reward_mean"],
+            'adaptive_reward': adaptive_metrics[f"{adaptive_name}/reward_mean"],
+            'fixed_coi': fixed_metrics[f"{fixed_name}/coi_erosion"],
+            'adaptive_coi': adaptive_metrics[f"{adaptive_name}/coi_erosion"],
+            'theoretical_erosion': coi_erosion(max(1, n_agents), price_std=5.0),
+        }
+        results[f'alpha_{alpha:.1f}'] = row
+
+        if writer:
+            # round instead of truncating: int(0.29 * 100) is 28 because 0.29 * 100 == 28.999999999999996
+            x_axis = int(round(alpha * 100))
+            tags = ('fixed_reward', 'adaptive_reward', 'coi_erosion_fixed', 'coi_erosion_adaptive', 'theoretical_erosion')
+            for tag, value in zip(tags, row.values()):
+                writer.add_scalar(f'baseline/{tag}', value, x_axis)
+
+    return results
+
+
+def train_rl(cfg: TrainConfig) -> Dict[str, Any]:
+    """Train an SB3 agent (ppo | sac | a2c) or run a baseline policy with TensorBoard logging.
+
+    Returns {"model_path": str, "metrics": dict}; baselines save no model file at that path.
+    Raises ImportError for an RL algo without SB3 and ValueError for an unknown algo name.
+    """
+    algo_name = cfg.algo.lower()
+    is_baseline = algo_name in ["myopic", "random_myopic", "fixed", "adaptive"]
+    if not HAS_SB3 and not is_baseline:
+        raise ImportError("stable-baselines3 required: pip install stable-baselines3[extra]")
+    if not is_baseline and algo_name not in ("ppo", "sac", "a2c"):
+        raise ValueError(f"unknown algo: {cfg.algo}")  # reject before any resource is opened
+
+    log_path = Path(cfg.log_dir) / f"{cfg.algo}_alpha{cfg.alpha_true:.1f}_{cfg.reward_mode}"
+    log_path.mkdir(parents=True, exist_ok=True)
+
+    writer = SummaryWriter(log_path) if HAS_TB else None
+    train_env = eval_env = None
+    try:
+        # NOTE(review): `False and` hard-disables the baseline demonstration, so
+        # cfg.run_baselines / --no-baselines currently has no effect; left as found.
+        if False and cfg.run_baselines:
+            print("Running baseline demonstrations...")
+            baseline_results = run_coi_demonstration(writer, cfg)
+            for k, v in baseline_results.items():
+                print(f"  {k}: reward_fixed={v['fixed_reward']:.2f}, reward_adapt={v['adaptive_reward']:.2f}, "
+                      f"coi_fixed={v['fixed_coi']:.3f}, coi_adapt={v['adaptive_coi']:.3f}, theo={v['theoretical_erosion']:.3f}")
+
+        # create envs
+        train_env = make_vec_env(cfg, n_envs=cfg.n_envs)
+        eval_env = make_vec_env(cfg, n_envs=1)
+
+        if is_baseline:
+            # baseline policies wrapped for compatibility
+            policy_map = {
+                "myopic": lambda obs, n: myopic_policy(obs, n, greed=0.3),
+                "random_myopic": random_myopic_policy,
+                "fixed": lambda obs, n: fixed_price_policy(np.ones(n), margin=0.15),
+                "adaptive": lambda obs, n: adaptive_policy(obs, n, base_margin=0.15),
+            }
+            model = BaselinePolicy(policy_map[algo_name], algo_name)
+        else:
+            common_kwargs = dict(
+                verbose=1, seed=cfg.seed, tensorboard_log=str(log_path), device="auto")
+            if algo_name == "ppo":
+                model = PPO(
+                    "MlpPolicy", train_env, learning_rate=3e-4, n_steps=2048,
+                    batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95,
+                    clip_range=0.2, ent_coef=0.01, **common_kwargs)
+            elif algo_name == "sac":
+                model = SAC(
+                    "MlpPolicy", train_env, learning_rate=3e-4, buffer_size=100_000,
+                    batch_size=256, tau=0.005, gamma=0.99, train_freq=1,
+                    gradient_steps=1, ent_coef="auto", **common_kwargs)
+            else:
+                model = A2C(
+                    "MlpPolicy", train_env, learning_rate=7e-4, n_steps=5,
+                    gamma=0.99, gae_lambda=1.0, ent_coef=0.01, **common_kwargs)
+
+        print(f"\nRunning {cfg.algo.upper()} for {cfg.total_timesteps} steps...")
+        print(f"  alpha_true={cfg.alpha_true}, reward_mode={cfg.reward_mode}")
+        print(f"  logs: {log_path}")
+
+        if is_baseline:
+            # run baseline through env manually with logging
+            run_baseline_with_logging(model, train_env, cfg.total_timesteps, writer)
+        else:
+            coi_cb = COICallback(writer=writer, verbose=1)
+            eval_cb = EvalCallback(
+                eval_env, best_model_save_path=str(log_path / "best"),
+                log_path=str(log_path), eval_freq=cfg.eval_freq,
+                n_eval_episodes=cfg.n_eval_episodes, deterministic=True)
+            model.learn(total_timesteps=cfg.total_timesteps, callback=[coi_cb, eval_cb], progress_bar=True)
+            model.save(log_path / "final_model")
+
+        # final evaluation
+        final_metrics = evaluate_trained_model(model, cfg)
+        if writer:
+            for k, v in final_metrics.items():
+                writer.add_scalar(f'final/{k}', v, cfg.total_timesteps)
+    finally:
+        # release the writer and envs even when training or evaluation raises
+        if writer is not None:
+            writer.close()
+        for env in (train_env, eval_env):
+            if env is not None:
+                env.close()
+
+    return {"model_path": str(log_path / "final_model"), "metrics": final_metrics}
+
+
+def evaluate_trained_model(model: Any, cfg: TrainConfig, n_episodes: int = 20) -> Dict[str, float]:
+    """Roll out `model` deterministically on a fresh env seeded with cfg.seed + 1000.
+
+    Returns reward_mean, reward_std and coi_erosion_mean (mean of per-episode means, 0.0 if none).
+    """
+    env = make_env(EnvConfig(
+        n_products=cfg.n_products, max_steps=cfg.max_steps,
+        alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed + 1000))
+
+    returns, coi_means = [], []
+    for _ in range(n_episodes):
+        obs, _ = env.reset()
+        ep_return, coi_trace, finished = 0.0, [], False
+        while not finished:
+            action, _ = model.predict(obs, deterministic=True)
+            obs, reward, terminated, truncated, info = env.step(action)
+            ep_return += reward
+            if 'coi_erosion' in info:
+                coi_trace.append(info['coi_erosion'])
+            finished = terminated or truncated
+        returns.append(ep_return)
+        if coi_trace:
+            coi_means.append(np.mean(coi_trace))
+
+    coi_erosion_mean = np.mean(coi_means) if coi_means else 0.0
+    return {'reward_mean': np.mean(returns), 'reward_std': np.std(returns),
+            'coi_erosion_mean': coi_erosion_mean}
+
+
+def compare_policies(cfg: TrainConfig, model_paths: List[str] = None) -> None:
+    """Print, and log to TensorBoard when available, reward/COI metrics for baselines and saved models.
+
+    Each saved model is loaded with the SB3 class named by its run-directory prefix
+    ("ppo_", "sac_", "a2c_", the naming train_rl uses); any other prefix falls back to PPO.
+    """
+    if model_paths and not HAS_SB3:
+        raise ImportError("stable-baselines3 required for loading trained models")
+
+    env = make_env(EnvConfig(
+        n_products=cfg.n_products, max_steps=cfg.max_steps,
+        alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed))
+
+    # all baseline policies
+    baselines = {
+        'random': lambda obs, n: random_policy(n),
+        'fixed': lambda obs, n: fixed_price_policy(np.ones(n), 0.15),
+        'adaptive': lambda obs, n: adaptive_policy(obs, n, 0.15),
+        'myopic': lambda obs, n: myopic_policy(obs, n, 0.3),
+        'random_myopic': random_myopic_policy,
+    }
+    results = {}
+    for name, policy_fn in baselines.items():
+        results[name] = run_baseline(policy_fn, env, n_episodes=20, name=name)
+
+    # trained models
+    for path in model_paths or []:
+        name = Path(path).parent.name
+        algo_cls = {"ppo": PPO, "sac": SAC, "a2c": A2C}.get(name.split("_")[0].lower(), PPO)
+        metrics = evaluate_trained_model(algo_cls.load(path), cfg)
+        results[name] = {f'{name}/{k}': v for k, v in metrics.items()}
+
+    # open the writer only once results exist, and close it even if reporting fails
+    writer = SummaryWriter(Path(cfg.log_dir) / "comparison") if HAS_TB else None
+    try:
+        print("\n=== Policy Comparison ===")
+        for name, metrics in results.items():
+            reward_key = [k for k in metrics if 'reward_mean' in k][0]
+            coi_key = next((k for k in metrics if 'coi' in k), None)
+            line = f"{name:20s}: reward={metrics[reward_key]:.2f}"
+            if coi_key:
+                line += f", coi={metrics[coi_key]:.3f}"
+            print(line)
+            if writer:
+                for k, v in metrics.items():
+                    writer.add_scalar(f'comparison/{k}', v, 0)
+    finally:
+        if writer is not None:
+            writer.close()
+
+
+def main():
+    """CLI entry point: parse flags into a TrainConfig, then either compare policies or train."""
+    ap = argparse.ArgumentParser(description="Train RL pricing policies")
+    ap.add_argument("--algo", type=str, default="ppo",
+                    choices=["ppo", "sac", "a2c", "myopic", "random_myopic", "fixed", "adaptive"])
+    ap.add_argument("--steps", type=int, default=100_000, help="total training steps")
+    ap.add_argument("--alpha", type=float, default=0.2, help="true contamination level")
+    ap.add_argument("--reward-mode", type=str, default="robust", choices=["revenue", "profit", "robust", "coi_aware"])
+    ap.add_argument("--n-products", type=int, default=10)
+    ap.add_argument("--n-envs", type=int, default=4)
+    ap.add_argument("--seed", type=int, default=42)
+    ap.add_argument("--log-dir", type=str, default="lab/case/thesis/runs")
+    ap.add_argument("--no-baselines", action="store_true", help="skip baseline runs")
+    ap.add_argument("--compare", nargs="*", help="compare model paths")
+    ns = ap.parse_args()
+
+    cfg = TrainConfig(
+        algo=ns.algo, total_timesteps=ns.steps, alpha_true=ns.alpha,
+        reward_mode=ns.reward_mode, n_products=ns.n_products,
+        n_envs=ns.n_envs, seed=ns.seed, log_dir=ns.log_dir,
+        run_baselines=not ns.no_baselines)
+
+    if ns.compare is not None:
+        return compare_policies(cfg, ns.compare or None)  # bare --compare gives []: baselines only
+    result = train_rl(cfg)
+    print(f"\nTraining complete. Model saved to: {result['model_path']}")
+    print(f"Final metrics: {result['metrics']}")
+
+
+if __name__ == "__main__":  # run the CLI, e.g. `python -m lab.case.thesis.train`
+    main()

From bae51daa1c15f8e3e79fe052e92960aec1cadf1f Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Sat, 24 Jan 2026 14:21:35 +0100
Subject: [PATCH 50/99] chore: refactor session mapping

---
 lab/case/thesis/simplified.py |  36 ++-
 lab/case/thesis/train.py      | 582 +++++++++++++---------------------
 2 files changed, 241 insertions(+), 377 deletions(-)

diff --git a/lab/case/thesis/simplified.py b/lab/case/thesis/simplified.py
index 3b3838e..00ed43a 100644
--- a/lab/case/thesis/simplified.py
+++ b/lab/case/thesis/simplified.py
@@ -93,15 +93,15 @@ def sample_trajectory(rng: np.random.Generator, trans: Dict, prices: np.ndarray,
 
 
 def put_prices_to_market(prices: np.ndarray, alpha: float = 0.2, n_sessions: int = 50,
-                         seed: int | None = None) -> Tuple[Dict[str, float], Dict[str, str]]:
+                         seed: int | None = None) -> Tuple[List[Session], Dict[str, float]]:
     """Generate sessions from mixture model Q(p) = (1-α)E[d_H] + αE[d_A] (Eq 3).
 
     Returns:
+        sessions: list of Session objects with events and product attribution
         demand_mapping: session_id -> demand proxy q̂
-        hidden_labels: session_id -> actor class (H or A)
     """
     rng = np.random.default_rng(seed)
-    demand_mapping, hidden_labels = {}, {}
+    sessions, demand_mapping = [], {}
 
     for i in range(n_sessions):
         sid = f"s{i:04d}"
@@ -110,10 +110,10 @@ def put_prices_to_market(prices: np.ndarray, alpha: float = 0.2, n_sessions: int
         theta = {"price_sens": rng.uniform(0.05, 0.2), "base_conv": 0.01} if is_agent else {"price_sens": rng.uniform(1.5, 4.0), "base_conv": rng.uniform(0.2, 0.5)}
         events, _ = sample_trajectory(rng, trans, prices, is_agent)
         session = Session(sid=sid, events=events, actor="A" if is_agent else "H", theta=theta)
+        sessions.append(session)
         demand_mapping[sid] = compute_demand(session)
-        hidden_labels[sid] = session.actor
 
-    return demand_mapping, hidden_labels
+    return sessions, demand_mapping
 
 
 @dataclass
@@ -190,9 +190,16 @@ class System:
                     agg_demand[pidx] += q
         return float(np.dot(prices, agg_demand))
 
-    def _coi_leakage(self, prices: np.ndarray) -> float:
-        """COI_leak = α · InfoValue (query-tax surrogate)."""
-        return self._alpha_est * 1.0
+    def _coi_leakage(self, prices: np.ndarray, n_agents: int = 1) -> float:
+        """COI leakage tied to Theorem 1: erosion rate times the margin at risk.
+
+        Erosion is coi_erosion(max(1, n_agents), std(prices)); the margin at risk is
+        the mean of prices - self.costs.
+        """
+        spread = float(np.std(prices))
+        erosion_rate = coi_erosion(max(1, n_agents), spread)
+        exposed_margin = float(np.mean(prices - self.costs))
+        return erosion_rate * exposed_margin
 
     def _objective(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
         """Robust objective: R(p,d) - λ·COI_leak (Eq 23 simplified)."""
@@ -223,13 +230,8 @@ class System:
 
     def observe_demand(self, prices: np.ndarray, alpha_true: float = 0.2, n_sessions: int = 50) -> Dict[str, float]:
         """Observe market response to prices."""
-        demand_map, labels = put_prices_to_market(prices, alpha=alpha_true, n_sessions=n_sessions, seed=int(self.rng.integers(0, 10000)))
-
-        # reconstruct sessions for α estimation
-        for sid, actor in labels.items():
-            events, _ = sample_trajectory(self.rng, TRANS_A if actor == "A" else TRANS_H, prices, actor == "A")
-            self._sessions.append(Session(sid=sid, events=events, actor=actor))
-
+        sessions, demand_map = put_prices_to_market(prices, alpha=alpha_true, n_sessions=n_sessions, seed=int(self.rng.integers(0, 10000)))
+        self._sessions.extend(sessions)  # store actual sessions for correct product attribution
         self.limbo.add_update("demand", demand_map)
         return demand_map
 
@@ -269,8 +271,8 @@ if __name__ == "__main__":
     print(f"avg reward: {np.mean(traj['rewards']):.2f}, final α̂: {traj['alpha_est'][-1]:.3f}")
 
     prices = np.array([20.0, 35.0, 50.0, 25.0, 40.0])
-    demand, labels = put_prices_to_market(prices, alpha=0.3, n_sessions=20, seed=123)
-    print(f'sessions: {len(demand)}, agents: {sum(1 for l in labels.values() if l=="A")}')
+    sessions, demand = put_prices_to_market(prices, alpha=0.3, n_sessions=20, seed=123)
+    print(f'sessions: {len(sessions)}, agents: {sum(1 for s in sessions if s.actor=="A")}')
 
     for n in [1, 5, 10, 50, 100]:
         ero = coi_erosion(n, price_std=5.0)
diff --git a/lab/case/thesis/train.py b/lab/case/thesis/train.py
index f6fb7d4..cd134fd 100644
--- a/lab/case/thesis/train.py
+++ b/lab/case/thesis/train.py
@@ -1,15 +1,17 @@
-"""RL training for thesis pricing system with COI tracking.
+"""RL training for thesis pricing system with thesis-aligned metrics.
 
 Trains pricing policies using stable-baselines3 with TensorBoard logging.
-Demonstrates COI leakage under different contamination levels and policies.
+Tracks COI erosion, alpha estimation error, and economic KPIs per thesis formulation.
 
 Usage:
     python -m lab.case.thesis.train --algo ppo --alpha 0.3 --steps 100000
+    python -m lab.case.thesis.train --algo adaptive --sweep  # run alpha sweep
     tensorboard --logdir lab/case/thesis/runs
 """
 from __future__ import annotations
 import argparse
-from dataclasses import dataclass, field
+import json
+from dataclasses import dataclass, asdict
 from pathlib import Path
 from typing import Dict, List, Callable, Any
 import numpy as np
@@ -17,9 +19,8 @@ import numpy as np
 try:
     from stable_baselines3 import PPO, SAC, A2C
     from stable_baselines3.common.callbacks import BaseCallback, EvalCallback
-    from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
+    from stable_baselines3.common.vec_env import DummyVecEnv
     from stable_baselines3.common.monitor import Monitor
-    from stable_baselines3.common.logger import configure
     HAS_SB3 = True
 except ImportError:
     HAS_SB3 = False
@@ -34,322 +35,203 @@ from .simplified_env import PricingEnv, EnvConfig, make_env, adaptive_policy, fi
 from .simplified import coi_erosion
 
 
-class BaselinePolicy:
-    """Wrapper to make baseline policies compatible with SB3 interface."""
-
-    def __init__(self, policy_fn, name: str):
-        self.policy_fn = policy_fn
-        self.name = name
-        self.num_timesteps = 0
-
-    def predict(self, obs, deterministic: bool = True):
-        n = (len(obs) - 3) // 3  # infer n_products from obs shape
-        action = self.policy_fn(obs, n)
-        return action, None
-
-    def learn(self, total_timesteps: int, callback=None, progress_bar: bool = False):
-        self.num_timesteps = total_timesteps
-        return self
-
-    def save(self, path):
-        pass  # no-op for baselines
-
-    @staticmethod
-    def load(path):
-        raise NotImplementedError("baselines cannot be loaded")
-
-
-def myopic_policy(obs: np.ndarray, n: int, greed: float = 0.3) -> np.ndarray:
-    """Myopic: maximize immediate margin, ignore alpha and future COI erosion.
-
-    Greedy short-term optimizer that sets high prices when demand looks good,
-    completely ignoring the alpha estimate and long-term consequences.
-    """
-    demand_norm = obs[n:2*n] if len(obs) > 2*n else np.ones(n) * 0.5
-    avg_demand = np.mean(demand_norm)
-    multiplier = 1.0 + greed * (1 + avg_demand)
-    return np.ones(n, dtype=np.float32) * np.clip(multiplier, 0.5, 1.5)
-
-
-def random_myopic_policy(obs: np.ndarray, n: int) -> np.ndarray:
-    """Random myopic: iid random prices each step, no state awareness.
-
-    Represents worst-case baseline where pricing has no strategy at all.
-    """
-    return np.random.uniform(0.8, 1.4, n).astype(np.float32)
+# thesis-aligned KPIs tracked per episode
+@dataclass
+class EpisodeMetrics:
+    reward: float = 0.0
+    revenue: float = 0.0
+    profit: float = 0.0
+    coi_erosion: float = 0.0      # theorem 1: order statistic erosion
+    coi_leakage: float = 0.0      # per-step leakage penalty
+    alpha_error: float = 0.0      # |α - α̂|
+    avg_margin: float = 0.0
+    n_agents: int = 0
+    steps: int = 0
 
 
 @dataclass
-class TrainConfig:
-    """Training configuration."""
-    algo: str = "ppo"  # ppo | sac | a2c
+class ExperimentConfig:
+    """Full experiment specification for reproducibility."""
+    algo: str = "ppo"
     total_timesteps: int = 100_000
     n_envs: int = 4
     eval_freq: int = 5000
     n_eval_episodes: int = 10
     log_dir: str = "lab/case/thesis/runs"
     seed: int = 42
-    # env config
     n_products: int = 10
     max_steps: int = 200
     alpha_true: float = 0.2
     reward_mode: str = "robust"
-    # baseline sweep
-    run_baselines: bool = True
-    alpha_sweep: List[float] = field(default_factory=lambda: [0.0, 0.1, 0.2, 0.3, 0.4, 0.5])
+    experiment_name: str | None = None
+
+    def __post_init__(self):
+        if self.experiment_name is None:
+            self.experiment_name = f"{self.algo}_a{self.alpha_true:.2f}_{self.reward_mode}"
 
 
-class COICallback(BaseCallback):
-    """Custom callback for tracking COI metrics in TensorBoard."""
+# unified policy interface wrapping all baselines
+class Policy:
+    """Unified policy interface for baselines and trained models."""
 
-    def __init__(self, writer: Any = None, verbose: int = 0):
+    def __init__(self, policy_fn: Callable[[np.ndarray, int], np.ndarray], name: str):
+        self._fn = policy_fn
+        self.name = name
+
+    def predict(self, obs: np.ndarray, deterministic: bool = True) -> tuple[np.ndarray, None]:
+        n = (len(obs) - 3) // 3
+        return self._fn(obs, n), None
+
+    @staticmethod
+    def fixed(margin: float = 0.15) -> "Policy":
+        return Policy(lambda obs, n: fixed_price_policy(np.ones(n), margin), f"fixed_{margin:.2f}")
+
+    @staticmethod
+    def adaptive(base_margin: float = 0.15) -> "Policy":
+        return Policy(lambda obs, n: adaptive_policy(obs, n, base_margin), f"adaptive_{base_margin:.2f}")
+
+    @staticmethod
+    def random() -> "Policy":
+        return Policy(lambda obs, n: random_policy(n), "random")
+
+    @staticmethod
+    def myopic(greed: float = 0.3) -> "Policy":
+        """Myopic: maximize immediate margin, ignore alpha."""
+        def _fn(obs: np.ndarray, n: int) -> np.ndarray:
+            demand_norm = obs[n:2*n] if len(obs) > 2*n else np.ones(n) * 0.5
+            mult = 1.0 + greed * (1 + np.mean(demand_norm))
+            return np.ones(n, dtype=np.float32) * np.clip(mult, 0.5, 1.5)
+        return Policy(_fn, f"myopic_{greed:.1f}")
+
+
+class MetricsCallback(BaseCallback):
+    """Tracks thesis-aligned metrics during RL training."""
+
+    def __init__(self, writer: SummaryWriter | None, verbose: int = 0):
         super().__init__(verbose)
         self._writer = writer
-        self._episode_coi_leak = []
-        self._episode_alpha_err = []
-        self._episode_revenues = []
-        self._episode_margins = []
+        self._ep = EpisodeMetrics()
+        self._buffer: List[EpisodeMetrics] = []
 
     def _on_step(self) -> bool:
-        infos = self.locals.get('infos', [])
-        for info in infos:
-            if 'alpha_true' in info and 'alpha_est' in info:
-                self._episode_alpha_err.append(abs(info['alpha_true'] - info['alpha_est']))
-            if 'coi_erosion' in info:
-                self._episode_coi_leak.append(info['coi_erosion'])
-            if 'revenue' in info:
-                self._episode_revenues.append(info['revenue'])
-            if 'avg_margin' in info:
-                self._episode_margins.append(info['avg_margin'])
+        for info in self.locals.get('infos', []):
+            self._ep.steps += 1
+            self._ep.reward += info.get('reward', 0)
+            self._ep.revenue += info.get('revenue', 0)
+            self._ep.profit += info.get('profit', 0)
+            self._ep.coi_erosion += info.get('coi_erosion', 0)
+            self._ep.coi_leakage += info.get('coi_leakage', 0)
+            self._ep.alpha_error += abs(info.get('alpha_true', 0) - info.get('alpha_est', 0))
+            self._ep.avg_margin += info.get('avg_margin', 0)
+            self._ep.n_agents += info.get('n_agents', 0)
         return True
 
     def _on_rollout_end(self) -> None:
-        if self._writer is None:
+        if self._ep.steps == 0 or self._writer is None:
             return
-        step = self.num_timesteps
-        if self._episode_coi_leak:
-            self._writer.add_scalar('coi/erosion_mean', np.mean(self._episode_coi_leak), step)
-            self._writer.add_scalar('coi/erosion_max', np.max(self._episode_coi_leak), step)
-        if self._episode_alpha_err:
-            self._writer.add_scalar('alpha/estimation_error', np.mean(self._episode_alpha_err), step)
-        if self._episode_revenues:
-            self._writer.add_scalar('economics/revenue_mean', np.mean(self._episode_revenues), step)
-        if self._episode_margins:
-            self._writer.add_scalar('economics/margin_mean', np.mean(self._episode_margins), step)
-        self._episode_coi_leak.clear()
-        self._episode_alpha_err.clear()
-        self._episode_revenues.clear()
-        self._episode_margins.clear()
+        s, step = self._ep.steps, self.num_timesteps
+        self._writer.add_scalar('economics/revenue', self._ep.revenue / s, step)
+        self._writer.add_scalar('economics/profit', self._ep.profit / s, step)
+        self._writer.add_scalar('economics/margin', self._ep.avg_margin / s, step)
+        self._writer.add_scalar('coi/erosion', self._ep.coi_erosion / s, step)
+        self._writer.add_scalar('coi/leakage', self._ep.coi_leakage / s, step)
+        self._writer.add_scalar('alpha/estimation_error', self._ep.alpha_error / s, step)
+        self._writer.add_scalar('agents/count', self._ep.n_agents / s, step)
+        self._buffer.append(self._ep)
+        self._ep = EpisodeMetrics()
 
 
-def run_baseline_with_logging(model: BaselinePolicy, vec_env, total_timesteps: int, writer: Any) -> None:
-    """Run baseline policy and log metrics identically to RL training."""
-    n_envs = vec_env.num_envs
-    obs = vec_env.reset()
-    step = 0
-    episode_rewards, episode_coi, episode_alpha_err, episode_revenues = [], [], [], []
-    ep_rewards = np.zeros(n_envs)
-
-    while step < total_timesteps:
-        actions = np.array([model.predict(obs[i])[0] for i in range(n_envs)])
-        obs, rewards, dones, infos = vec_env.step(actions)
-        step += n_envs
-        ep_rewards += rewards
-
-        for i, info in enumerate(infos):
-            if 'coi_erosion' in info:
-                episode_coi.append(info['coi_erosion'])
-            if 'alpha_true' in info and 'alpha_est' in info:
-                episode_alpha_err.append(abs(info['alpha_true'] - info['alpha_est']))
-            if 'revenue' in info:
-                episode_revenues.append(info['revenue'])
-            if dones[i]:
-                episode_rewards.append(ep_rewards[i])
-                ep_rewards[i] = 0.0
-
-        if writer and len(episode_rewards) >= 5 and step % 1000 < n_envs:
-            writer.add_scalar('rollout/ep_rew_mean', np.mean(episode_rewards[-10:]), step)
-            if episode_coi:
-                writer.add_scalar('coi/erosion_mean', np.mean(episode_coi[-100:]), step)
-            if episode_alpha_err:
-                writer.add_scalar('alpha/estimation_error', np.mean(episode_alpha_err[-100:]), step)
-            if episode_revenues:
-                writer.add_scalar('economics/revenue_mean', np.mean(episode_revenues[-100:]), step)
-
-        if step % 10000 < n_envs:
-            print(f"  step {step}/{total_timesteps}, avg_reward={np.mean(episode_rewards[-20:]) if episode_rewards else 0:.2f}")
-
-
-def make_vec_env(cfg: TrainConfig, n_envs: int = 1) -> DummyVecEnv:
-    """Create vectorized environment."""
+def make_vec_env(cfg: ExperimentConfig, n_envs: int = 1) -> DummyVecEnv:
     def _make():
-        env_cfg = EnvConfig(
-            n_products=cfg.n_products, max_steps=cfg.max_steps,
-            alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed)
-        env = make_env(env_cfg)
-        return Monitor(env)
+        env_cfg = EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
+                            alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed)
+        return Monitor(make_env(env_cfg))
     return DummyVecEnv([_make for _ in range(n_envs)])
 
 
-def run_baseline(
-    policy_fn: Callable[[np.ndarray, int], np.ndarray],
-    env: PricingEnv,
-    n_episodes: int = 20,
-    name: str = "baseline"
-) -> Dict[str, float]:
-    """Evaluate baseline policy and collect metrics."""
-    episode_rewards, episode_coi, episode_alpha_err = [], [], []
+def evaluate_policy(policy: Policy | Any, cfg: ExperimentConfig, n_episodes: int = 20) -> Dict[str, float]:
+    """Evaluate policy and return thesis-aligned metrics."""
+    env_cfg = EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
+                        alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed + 999)
+    env = make_env(env_cfg)
+    metrics = []
 
     for _ in range(n_episodes):
-        obs, info = env.reset()
-        done, ep_reward, ep_coi, ep_alpha_err = False, 0.0, [], []
-
+        obs, _ = env.reset()
+        ep = EpisodeMetrics()
+        done = False
         while not done:
-            action = policy_fn(obs, env.n)
-            obs, reward, terminated, truncated, info = env.step(action)
-            done = terminated or truncated
-            ep_reward += reward
-            if 'coi_erosion' in info:
-                ep_coi.append(info['coi_erosion'])
-            if 'alpha_true' in info and 'alpha_est' in info:
-                ep_alpha_err.append(abs(info['alpha_true'] - info['alpha_est']))
-
-        episode_rewards.append(ep_reward)
-        if ep_coi:
-            episode_coi.append(np.mean(ep_coi))
-        if ep_alpha_err:
-            episode_alpha_err.append(np.mean(ep_alpha_err))
+            action, _ = policy.predict(obs, deterministic=True)
+            obs, reward, term, trunc, info = env.step(action)
+            done = term or trunc
+            ep.reward += reward
+            ep.revenue += info.get('revenue', 0)
+            ep.profit += info.get('profit', 0)
+            ep.coi_erosion += info.get('coi_erosion', 0)
+            ep.coi_leakage += info.get('coi_leakage', 0)
+            ep.alpha_error += abs(info['alpha_true'] - info['alpha_est'])
+            ep.avg_margin += info.get('avg_margin', 0)
+            ep.steps += 1
+        metrics.append(ep)
 
+    n = len(metrics)
     return {
-        f'{name}/reward_mean': np.mean(episode_rewards),
-        f'{name}/reward_std': np.std(episode_rewards),
-        f'{name}/coi_erosion': np.mean(episode_coi) if episode_coi else 0.0,
-        f'{name}/alpha_error': np.mean(episode_alpha_err) if episode_alpha_err else 0.0,
+        'reward_mean': np.mean([m.reward for m in metrics]),
+        'reward_std': np.std([m.reward for m in metrics]),
+        'revenue_mean': np.mean([m.revenue / m.steps for m in metrics]),
+        'profit_mean': np.mean([m.profit / m.steps for m in metrics]),
+        'coi_erosion_mean': np.mean([m.coi_erosion / m.steps for m in metrics]),
+        'coi_leakage_mean': np.mean([m.coi_leakage / m.steps for m in metrics]),
+        'alpha_error_mean': np.mean([m.alpha_error / m.steps for m in metrics]),
+        'margin_mean': np.mean([m.avg_margin / m.steps for m in metrics]),
     }
 
 
-def run_coi_demonstration(writer: Any, cfg: TrainConfig) -> Dict[str, Dict[str, float]]:
-    """Demonstrate COI leakage across contamination levels."""
-    results = {}
-
-    for alpha in cfg.alpha_sweep:
-        env_cfg = EnvConfig(
-            n_products=cfg.n_products, max_steps=cfg.max_steps,
-            alpha_true=alpha, reward_mode=cfg.reward_mode, seed=cfg.seed)
-        env = make_env(env_cfg)
-
-        # run fixed policy
-        fixed_metrics = run_baseline(
-            lambda obs, n: fixed_price_policy(np.ones(n), margin=0.15),
-            env, n_episodes=10, name=f"fixed_alpha{alpha:.1f}")
-
-        # run adaptive policy
-        adaptive_metrics = run_baseline(
-            lambda obs, n: adaptive_policy(obs, n, base_margin=0.15),
-            env, n_episodes=10, name=f"adaptive_alpha{alpha:.1f}")
-
-        # theoretical erosion
-        n_agents = int(alpha * cfg.max_steps * 30)  # rough agent count
-        theo_erosion = coi_erosion(max(1, n_agents), price_std=5.0)
-
-        results[f'alpha_{alpha:.1f}'] = {
-            'fixed_reward': fixed_metrics[f"fixed_alpha{alpha:.1f}/reward_mean"],
-            'adaptive_reward': adaptive_metrics[f"adaptive_alpha{alpha:.1f}/reward_mean"],
-            'fixed_coi': fixed_metrics[f"fixed_alpha{alpha:.1f}/coi_erosion"],
-            'adaptive_coi': adaptive_metrics[f"adaptive_alpha{alpha:.1f}/coi_erosion"],
-            'theoretical_erosion': theo_erosion,
-        }
-
-        if writer:
-            writer.add_scalar(f'baseline/fixed_reward', fixed_metrics[f"fixed_alpha{alpha:.1f}/reward_mean"], int(alpha * 100))
-            writer.add_scalar(f'baseline/adaptive_reward', adaptive_metrics[f"adaptive_alpha{alpha:.1f}/reward_mean"], int(alpha * 100))
-            writer.add_scalar(f'baseline/coi_erosion_fixed', fixed_metrics[f"fixed_alpha{alpha:.1f}/coi_erosion"], int(alpha * 100))
-            writer.add_scalar(f'baseline/coi_erosion_adaptive', adaptive_metrics[f"adaptive_alpha{alpha:.1f}/coi_erosion"], int(alpha * 100))
-            writer.add_scalar(f'baseline/theoretical_erosion', theo_erosion, int(alpha * 100))
-
-    return results
-
-
-def train_rl(cfg: TrainConfig) -> Dict[str, Any]:
-    """Train RL agent or baseline policy with TensorBoard logging."""
-    is_baseline = cfg.algo.lower() in ["myopic", "random_myopic", "fixed", "adaptive"]
+def train(cfg: ExperimentConfig) -> Dict[str, Any]:
+    """Train RL agent or evaluate baseline policy."""
+    is_baseline = cfg.algo.lower() in ["fixed", "adaptive", "random", "myopic"]
     if not HAS_SB3 and not is_baseline:
         raise ImportError("stable-baselines3 required: pip install stable-baselines3[extra]")
 
-    log_path = Path(cfg.log_dir) / f"{cfg.algo}_alpha{cfg.alpha_true:.1f}_{cfg.reward_mode}"
+    log_path = Path(cfg.log_dir) / cfg.experiment_name
     log_path.mkdir(parents=True, exist_ok=True)
+    with open(log_path / "config.json", "w") as f:
+        json.dump(asdict(cfg), f, indent=2)
 
     writer = SummaryWriter(log_path) if HAS_TB else None
-
-    # baseline demonstration
-    if False and cfg.run_baselines:
-        print("Running baseline demonstrations...")
-        baseline_results = run_coi_demonstration(writer, cfg)
-        for k, v in baseline_results.items():
-            print(f"  {k}: reward_fixed={v['fixed_reward']:.2f}, reward_adapt={v['adaptive_reward']:.2f}, "
-                  f"coi_fixed={v['fixed_coi']:.3f}, coi_adapt={v['adaptive_coi']:.3f}, theo={v['theoretical_erosion']:.3f}")
-
-    # create envs
-    train_env = make_vec_env(cfg, n_envs=cfg.n_envs)
-    eval_env = make_vec_env(cfg, n_envs=1)
-
-    # select algorithm
-    algo_name = cfg.algo.lower()
+    train_env = make_vec_env(cfg, cfg.n_envs)
+    eval_env = make_vec_env(cfg, 1)
 
     if is_baseline:
-        # baseline policies wrapped for compatibility
-        policy_map = {
-            "myopic": lambda obs, n: myopic_policy(obs, n, greed=0.3),
-            "random_myopic": random_myopic_policy,
-            "fixed": lambda obs, n: fixed_price_policy(np.ones(n), margin=0.15),
-            "adaptive": lambda obs, n: adaptive_policy(obs, n, base_margin=0.15),
-        }
-        model = BaselinePolicy(policy_map[algo_name], algo_name)
+        policy_map = {"fixed": Policy.fixed(), "adaptive": Policy.adaptive(),
+                      "random": Policy.random(), "myopic": Policy.myopic()}
+        policy = policy_map[cfg.algo.lower()]
+        run_baseline(policy, train_env, cfg.total_timesteps, writer)
+        final_metrics = evaluate_policy(policy, cfg)
     else:
-        if not HAS_SB3:
-            raise ImportError("stable-baselines3 required for RL algos")
-
-        algo_cls = {"ppo": PPO, "sac": SAC, "a2c": A2C}.get(algo_name)
+        algo_cls = {"ppo": PPO, "sac": SAC, "a2c": A2C}.get(cfg.algo.lower())
         if algo_cls is None:
             raise ValueError(f"unknown algo: {cfg.algo}")
-
-        common_kwargs = dict(
-            verbose=1, seed=cfg.seed, tensorboard_log=str(log_path),
-            device="auto"
-        )
-        if algo_name == "ppo":
-            model = PPO(
-                "MlpPolicy", train_env, learning_rate=3e-4, n_steps=2048,
-                batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95,
-                clip_range=0.2, ent_coef=0.01, **common_kwargs)
-        elif algo_name == "sac":
-            model = SAC(
-                "MlpPolicy", train_env, learning_rate=3e-4, buffer_size=100_000,
-                batch_size=256, tau=0.005, gamma=0.99, train_freq=1,
-                gradient_steps=1, ent_coef="auto", **common_kwargs)
+        common = dict(verbose=1, seed=cfg.seed, tensorboard_log=str(log_path), device="auto")
+        if cfg.algo.lower() == "ppo":
+            model = PPO("MlpPolicy", train_env, learning_rate=3e-4, n_steps=2048,
+                        batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95,
+                        clip_range=0.2, ent_coef=0.01, **common)
+        elif cfg.algo.lower() == "sac":
+            model = SAC("MlpPolicy", train_env, learning_rate=3e-4, buffer_size=100_000,
+                        batch_size=256, tau=0.005, gamma=0.99, **common)
         else:
-            model = A2C(
-                "MlpPolicy", train_env, learning_rate=7e-4, n_steps=5,
-                gamma=0.99, gae_lambda=1.0, ent_coef=0.01, **common_kwargs)
+            model = A2C("MlpPolicy", train_env, learning_rate=7e-4, n_steps=5, gamma=0.99, **common)
 
-    print(f"\nRunning {cfg.algo.upper()} for {cfg.total_timesteps} steps...")
-    print(f"  alpha_true={cfg.alpha_true}, reward_mode={cfg.reward_mode}")
-    print(f"  logs: {log_path}")
-
-    if is_baseline:
-        # run baseline through env manually with logging
-        run_baseline_with_logging(model, train_env, cfg.total_timesteps, writer)
-    else:
-        coi_cb = COICallback(writer=writer, verbose=1)
-        eval_cb = EvalCallback(
-            eval_env, best_model_save_path=str(log_path / "best"),
-            log_path=str(log_path), eval_freq=cfg.eval_freq,
-            n_eval_episodes=cfg.n_eval_episodes, deterministic=True)
-        model.learn(total_timesteps=cfg.total_timesteps, callback=[coi_cb, eval_cb], progress_bar=True)
+        cb = MetricsCallback(writer)
+        eval_cb = EvalCallback(eval_env, best_model_save_path=str(log_path / "best"),
+                               log_path=str(log_path), eval_freq=cfg.eval_freq,
+                               n_eval_episodes=cfg.n_eval_episodes, deterministic=True)
+        model.learn(cfg.total_timesteps, callback=[cb, eval_cb], progress_bar=True)
         model.save(log_path / "final_model")
+        policy = model
+        final_metrics = evaluate_policy(model, cfg)
 
-    # final evaluation
-    final_metrics = evaluate_trained_model(model, cfg)
     if writer:
         for k, v in final_metrics.items():
             writer.add_scalar(f'final/{k}', v, cfg.total_timesteps)
@@ -357,117 +239,97 @@ def train_rl(cfg: TrainConfig) -> Dict[str, Any]:
 
     train_env.close()
     eval_env.close()
-
-    return {"model_path": str(log_path / "final_model"), "metrics": final_metrics}
+    with open(log_path / "results.json", "w") as f:
+        json.dump(final_metrics, f, indent=2)
+    return {"path": str(log_path), "metrics": final_metrics}
 
 
-def evaluate_trained_model(model: Any, cfg: TrainConfig, n_episodes: int = 20) -> Dict[str, float]:
-    """Evaluate trained model."""
-    env_cfg = EnvConfig(
-        n_products=cfg.n_products, max_steps=cfg.max_steps,
-        alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed + 1000)
-    env = make_env(env_cfg)
+def run_baseline(policy: Policy, vec_env: DummyVecEnv, total_steps: int, writer: SummaryWriter | None):
+    """Run baseline policy through environment with logging."""
+    obs = vec_env.reset()
+    n_envs = vec_env.num_envs
+    ep_rewards = np.zeros(n_envs)
+    all_rewards, coi_buf, alpha_buf = [], [], []
 
-    episode_rewards, episode_coi = [], []
-    for _ in range(n_episodes):
-        obs, _ = env.reset()
-        done, ep_reward, ep_coi = False, 0.0, []
-        while not done:
-            action, _ = model.predict(obs, deterministic=True)
-            obs, reward, terminated, truncated, info = env.step(action)
-            done = terminated or truncated
-            ep_reward += reward
-            if 'coi_erosion' in info:
-                ep_coi.append(info['coi_erosion'])
-        episode_rewards.append(ep_reward)
-        if ep_coi:
-            episode_coi.append(np.mean(ep_coi))
-
-    return {
-        'reward_mean': np.mean(episode_rewards),
-        'reward_std': np.std(episode_rewards),
-        'coi_erosion_mean': np.mean(episode_coi) if episode_coi else 0.0,
-    }
+    for step in range(0, total_steps, n_envs):
+        actions = np.array([policy.predict(obs[i])[0] for i in range(n_envs)])
+        obs, rewards, dones, infos = vec_env.step(actions)
+        ep_rewards += rewards
+        for i, info in enumerate(infos):
+            coi_buf.append(info.get('coi_erosion', 0))
+            alpha_buf.append(abs(info.get('alpha_true', 0) - info.get('alpha_est', 0)))
+            if dones[i]:
+                all_rewards.append(ep_rewards[i])
+                ep_rewards[i] = 0
+        if writer and step % 1000 < n_envs and all_rewards:
+            writer.add_scalar('rollout/ep_rew_mean', np.mean(all_rewards[-20:]), step)
+            writer.add_scalar('coi/erosion', np.mean(coi_buf[-100:]), step)
+            writer.add_scalar('alpha/estimation_error', np.mean(alpha_buf[-100:]), step)
 
 
-def compare_policies(cfg: TrainConfig, model_paths: List[str] = None) -> None:
-    """Compare trained models against baselines."""
-    if model_paths and not HAS_SB3:
-        raise ImportError("stable-baselines3 required for loading trained models")
-
-    writer = SummaryWriter(Path(cfg.log_dir) / "comparison") if HAS_TB else None
-
-    env_cfg = EnvConfig(
-        n_products=cfg.n_products, max_steps=cfg.max_steps,
-        alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed)
-    env = make_env(env_cfg)
-
+def run_sweep(cfg: ExperimentConfig, alphas: List[float] | None = None) -> Dict[str, Dict]:
+    """Run experiment across contamination levels for scientific comparison."""
+    alphas = alphas or [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
     results = {}
+    for alpha in alphas:
+        sweep_cfg = ExperimentConfig(**{**asdict(cfg), "alpha_true": alpha,
+                                         "experiment_name": f"{cfg.algo}_a{alpha:.2f}_{cfg.reward_mode}"})
+        print(f"\n=== α={alpha:.2f} ===")
+        out = train(sweep_cfg)
+        results[f"alpha_{alpha:.2f}"] = out["metrics"]
+    summary_path = Path(cfg.log_dir) / f"sweep_{cfg.algo}_{cfg.reward_mode}.json"
+    with open(summary_path, "w") as f:
+        json.dump(results, f, indent=2)
+    print(f"\nSweep results saved to {summary_path}")
+    return results
 
-    # all baseline policies
-    baselines = {
-        'random': lambda obs, n: random_policy(n),
-        'fixed': lambda obs, n: fixed_price_policy(np.ones(n), 0.15),
-        'adaptive': lambda obs, n: adaptive_policy(obs, n, 0.15),
-        'myopic': lambda obs, n: myopic_policy(obs, n, 0.3),
-        'random_myopic': random_myopic_policy,
-    }
-    for name, policy_fn in baselines.items():
-        results[name] = run_baseline(policy_fn, env, n_episodes=20, name=name)
 
-    # trained models
-    if model_paths:
-        for path in model_paths:
-            name = Path(path).parent.name
-            model = PPO.load(path)  # assume PPO, could detect
-            metrics = evaluate_trained_model(model, cfg)
-            results[name] = {f'{name}/{k}': v for k, v in metrics.items()}
-
-    print("\n=== Policy Comparison ===")
-    for name, metrics in results.items():
-        reward_key = [k for k in metrics if 'reward_mean' in k][0]
-        coi_key = [k for k in metrics if 'coi' in k][0] if any('coi' in k for k in metrics) else None
-        print(f"{name:20s}: reward={metrics[reward_key]:.2f}", end="")
-        if coi_key:
-            print(f", coi={metrics[coi_key]:.3f}")
-        else:
-            print()
-
-        if writer:
-            for k, v in metrics.items():
-                writer.add_scalar(f'comparison/{k}', v, 0)
-
-    if writer:
-        writer.close()
+def compare_policies(cfg: ExperimentConfig, policies: List[str] | None = None) -> Dict[str, Dict]:
+    """Compare multiple policies at same contamination level."""
+    policies = policies or ["fixed", "adaptive", "myopic", "random"]
+    results = {}
+    for algo in policies:
+        cmp_cfg = ExperimentConfig(**{**asdict(cfg), "algo": algo,
+                                       "experiment_name": f"cmp_{algo}_a{cfg.alpha_true:.2f}"})
+        print(f"\n=== {algo} ===")
+        out = train(cmp_cfg)
+        results[algo] = out["metrics"]
+    cmp_path = Path(cfg.log_dir) / f"compare_a{cfg.alpha_true:.2f}.json"
+    with open(cmp_path, "w") as f:
+        json.dump(results, f, indent=2)
+    print(f"\nComparison saved to {cmp_path}")
+    for algo, m in results.items():
+        print(f"  {algo:12s}: reward={m['reward_mean']:.2f} coi_erosion={m['coi_erosion_mean']:.4f} "
+              f"alpha_err={m['alpha_error_mean']:.4f}")
+    return results
 
 
 def main():
     parser = argparse.ArgumentParser(description="Train RL pricing policies")
-    parser.add_argument("--algo", type=str, default="ppo",
-                        choices=["ppo", "sac", "a2c", "myopic", "random_myopic", "fixed", "adaptive"])
-    parser.add_argument("--steps", type=int, default=100_000, help="total training steps")
-    parser.add_argument("--alpha", type=float, default=0.2, help="true contamination level")
-    parser.add_argument("--reward-mode", type=str, default="robust", choices=["revenue", "profit", "robust", "coi_aware"])
+    parser.add_argument("--algo", default="ppo", choices=["ppo", "sac", "a2c", "fixed", "adaptive", "random", "myopic"])
+    parser.add_argument("--steps", type=int, default=100_000)
+    parser.add_argument("--alpha", type=float, default=0.2)
+    parser.add_argument("--reward-mode", default="robust", choices=["revenue", "profit", "robust", "coi_aware"])
     parser.add_argument("--n-products", type=int, default=10)
     parser.add_argument("--n-envs", type=int, default=4)
     parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--log-dir", type=str, default="lab/case/thesis/runs")
-    parser.add_argument("--no-baselines", action="store_true", help="skip baseline runs")
-    parser.add_argument("--compare", nargs="*", help="compare model paths")
+    parser.add_argument("--log-dir", default="lab/case/thesis/runs")
+    parser.add_argument("--sweep", action="store_true", help="run contamination sweep")
+    parser.add_argument("--compare", action="store_true", help="compare all baselines")
     args = parser.parse_args()
 
-    cfg = TrainConfig(
-        algo=args.algo, total_timesteps=args.steps, alpha_true=args.alpha,
-        reward_mode=args.reward_mode, n_products=args.n_products,
-        n_envs=args.n_envs, seed=args.seed, log_dir=args.log_dir,
-        run_baselines=not args.no_baselines)
+    cfg = ExperimentConfig(algo=args.algo, total_timesteps=args.steps, alpha_true=args.alpha,
+                           reward_mode=args.reward_mode, n_products=args.n_products,
+                           n_envs=args.n_envs, seed=args.seed, log_dir=args.log_dir)
 
-    if args.compare is not None:
-        compare_policies(cfg, args.compare if args.compare else None)
+    if args.sweep:
+        run_sweep(cfg)
+    elif args.compare:
+        compare_policies(cfg)
     else:
-        result = train_rl(cfg)
-        print(f"\nTraining complete. Model saved to: {result['model_path']}")
-        print(f"Final metrics: {result['metrics']}")
+        result = train(cfg)
+        print(f"\nTraining complete: {result['path']}")
+        print(f"Metrics: {json.dumps(result['metrics'], indent=2)}")
 
 
 if __name__ == "__main__":

From 4033e73ba1694f5758986727bb2729f7325a9d48 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Sat, 24 Jan 2026 15:16:41 +0100
Subject: [PATCH 51/99] feat: consistent failure case

---
 lab/case/thesis/simplified.py     | 233 ++++++++++++++++++++++++++----
 lab/case/thesis/simplified_env.py |  73 +++++++---
 lab/case/thesis/train.py          |  12 +-
 3 files changed, 264 insertions(+), 54 deletions(-)

diff --git a/lab/case/thesis/simplified.py b/lab/case/thesis/simplified.py
index 00ed43a..59aef75 100644
--- a/lab/case/thesis/simplified.py
+++ b/lab/case/thesis/simplified.py
@@ -79,22 +79,175 @@ def estimate_alpha(session: Session, beta: float = 2.0) -> float:
     return 1.0 / (1.0 + np.exp(-beta * (dh - da))) if (dh + da) > 0 else 0.5
 
 
-def sample_trajectory(rng: np.random.Generator, trans: Dict, prices: np.ndarray, is_agent: bool) -> Tuple[List[Event], int]:
+@dataclass(frozen=True)
+class COIWindow:
+    """Windowed COI metrics computed from realized price exposures.
+
+    COI_policy is the definition-level KPI: E[p_shown] - p_min.
+    COI_agent is the theorem-level object: E[p^(1)] - p_min, where p^(1) is the minimum price realized under agent querying.
+    In this simplified simulator, p^(1) is approximated as the minimum price exposed to any agent in the window (per product).
+    Leak is the observable gap between them.
+    """
+
+    policy: float
+    agent: float
+    leak: float
+    survival_ratio: float
+    policy_by_product: np.ndarray
+    agent_by_product: np.ndarray
+    demand_weights: np.ndarray
+
+
+def _prices_by_product(sessions: List[Session]) -> Dict[int, List[float]]:
+    prices: Dict[int, List[float]] = {}
+    for s in sessions:
+        for e in s.events:
+            prices.setdefault(e.product_idx, []).append(float(e.price_seen))
+    return prices
+
+
+def _min_session_prices_by_product(sessions: List[Session]) -> Dict[int, List[float]]:
+    mins: Dict[int, List[float]] = {}
+    for s in sessions:
+        by_p: Dict[int, float] = {}
+        for e in s.events:
+            pidx = int(e.product_idx)
+            price = float(e.price_seen)
+            by_p[pidx] = price if pidx not in by_p else min(by_p[pidx], price)
+        for pidx, pmin in by_p.items():
+            mins.setdefault(pidx, []).append(pmin)
+    return mins
+
+
+def _min_price_across_sessions_by_product(sessions: List[Session]) -> Dict[int, float]:
+    mins: Dict[int, float] = {}
+    for s in sessions:
+        for e in s.events:
+            pidx = int(e.product_idx)
+            price = float(e.price_seen)
+            mins[pidx] = price if pidx not in mins else min(mins[pidx], price)
+    return mins
+
+
+def _demand_weights_by_product(
+    sessions: List[Session],
+    demand_mapping: Dict[str, float],
+    n_products: int,
+) -> np.ndarray:
+    w = np.zeros(n_products, dtype=float)
+    sessions_by_id = {s.sid: s for s in sessions}
+    for sid, q in demand_mapping.items():
+        sess = sessions_by_id.get(sid)
+        if not sess or not sess.events:
+            continue
+        pidx = int(sess.events[0].product_idx)
+        w[pidx] += float(q)
+    s = float(np.sum(w))
+    return (w / s) if s > 0 else w
+
+
+def compute_coi_window(
+    sessions: List[Session],
+    costs: np.ndarray,
+    demand_mapping: Dict[str, float] | None = None,
+) -> COIWindow:
+    n_products = int(len(costs))
+    prices = _prices_by_product(sessions)
+    agent_min_across = _min_price_across_sessions_by_product([s for s in sessions if s.actor == "A"])
+
+    policy_by_product = np.zeros(n_products, dtype=float)
+    agent_by_product = np.zeros(n_products, dtype=float)
+    seen = np.array([(i in prices) for i in range(n_products)], dtype=bool)
+    agent_seen = np.array([(i in agent_min_across) for i in range(n_products)], dtype=bool)
+
+    for pidx, ps in prices.items():
+        if 0 <= pidx < n_products and ps:
+            policy_by_product[pidx] = float(np.mean(ps) - float(costs[pidx]))
+
+    for pidx, pmin in agent_min_across.items():
+        if 0 <= pidx < n_products:
+            agent_by_product[pidx] = float(pmin - float(costs[pidx]))
+
+    # If no agent exposure exists for a product in the window, there is no realized erosion for that product.
+    agent_by_product[seen & ~agent_seen] = policy_by_product[seen & ~agent_seen]
+
+    demand_weights = (
+        _demand_weights_by_product(sessions, demand_mapping, n_products)
+        if demand_mapping is not None
+        else np.zeros(n_products, dtype=float)
+    )
+
+    has_weights = float(np.sum(demand_weights)) > 0
+    if has_weights:
+        policy = float(np.dot(demand_weights, policy_by_product))
+        agent = float(np.dot(demand_weights, agent_by_product))
+    else:
+        if not bool(np.any(seen)):
+            policy = 0.0
+            agent = 0.0
+        else:
+            policy = float(np.mean(policy_by_product[seen]))
+            agent = float(np.mean(agent_by_product[seen]))
+
+    leak = float(max(policy - agent, 0.0))
+    survival_ratio = float(np.clip(agent / policy, 0.0, 1.0)) if policy > 0 else 0.0
+
+    return COIWindow(
+        policy=policy,
+        agent=agent,
+        leak=leak,
+        survival_ratio=survival_ratio,
+        policy_by_product=policy_by_product,
+        agent_by_product=agent_by_product,
+        demand_weights=demand_weights,
+    )
+
+
+def sample_trajectory(
+    rng: np.random.Generator,
+    trans: Dict,
+    prices: np.ndarray,
+    costs: np.ndarray,
+    theta: Dict[str, float],
+    is_agent: bool,
+    session_price_noise: float = 0.02,
+    surge: float = 0.08,
+    max_markup_mult: float = 1.8,
+) -> Tuple[List[Event], int]:
     """Sample session trajectory from behavioral kernel."""
     state, t, pidx = "start", 0.0, int(rng.integers(0, len(prices)))
+    cost = float(costs[pidx])
+    base_price = float(prices[pidx]) * float(1.0 + rng.normal(0.0, session_price_noise))
+    base_price = float(np.clip(base_price, cost * 1.01, float(prices[pidx]) * 2.0))
+    current_price = base_price
+    signal = 0.0
     events = []
+    # TODO: instead of this very controlled setup implement same session samplin as in models.py
     while state != "end" and len(events) < 30:
-        if state != "start":
-            events.append(Event(action=state, product_idx=pidx, price_seen=float(prices[pidx]), ts=t))
         probs = trans.get(state, {"end": 1.0})
-        state = rng.choice(list(probs.keys()), p=list(probs.values()))
+        nxt = rng.choice(list(probs.keys()), p=list(probs.values()))
+
+        if nxt == "purchase":
+            price_sens = float(theta.get("price_sens", 2.0))
+            base_conv = float(theta.get("base_conv", 0.2))
+            rel = max((current_price - cost) / (cost + 1e-6), 0.0)
+            p_buy = float(np.clip(base_conv * np.exp(-price_sens * rel), 0.0, 1.0))
+            if rng.random() > p_buy:
+                nxt = "end"
+
+        state = nxt
+        if state not in {"start", "end"}:
+            events.append(Event(action=state, product_idx=pidx, price_seen=float(current_price), ts=t))
+            signal += float(ACTION_WEIGHTS.get(state, 0.1))
+            current_price = float(np.clip(base_price * (1.0 + surge * signal), cost * 1.01, base_price * max_markup_mult))
+
         t += max(0.2, rng.gamma(1.5, 0.8) if is_agent else rng.gamma(2.0, 1.2))
     return events, pidx
 
 
-def put_prices_to_market(prices: np.ndarray, alpha: float = 0.2, n_sessions: int = 50,
+def put_prices_to_market(prices: np.ndarray, costs: np.ndarray, alpha: float = 0.2, n_sessions: int = 50,
                          seed: int | None = None) -> Tuple[List[Session], Dict[str, float]]:
-    """Generate sessions from mixture model Q(p) = (1-α)E[d_H] + αE[d_A] (Eq 3).
+    """Generate sessions from mixture model
 
     Returns:
         sessions: list of Session objects with events and product attribution
@@ -108,7 +261,7 @@ def put_prices_to_market(prices: np.ndarray, alpha: float = 0.2, n_sessions: int
         is_agent = rng.random() < alpha
         trans = TRANS_A if is_agent else TRANS_H
         theta = {"price_sens": rng.uniform(0.05, 0.2), "base_conv": 0.01} if is_agent else {"price_sens": rng.uniform(1.5, 4.0), "base_conv": rng.uniform(0.2, 0.5)}
-        events, _ = sample_trajectory(rng, trans, prices, is_agent)
+        events, _ = sample_trajectory(rng, trans, prices, costs=costs, theta=theta, is_agent=is_agent)
         session = Session(sid=sid, events=events, actor="A" if is_agent else "H", theta=theta)
         sessions.append(session)
         demand_mapping[sid] = compute_demand(session)
@@ -167,6 +320,8 @@ class System:
         self.limbo = Limbo()
         self._alpha_est = 0.2  # current contamination estimate
         self._sessions: List[Session] = []
+        self._last_sessions: List[Session] = []
+        self._last_coi: COIWindow | None = None
 
     @property
     def alpha(self) -> float:
@@ -190,24 +345,27 @@ class System:
                     agg_demand[pidx] += q
         return float(np.dot(prices, agg_demand))
 
-    def _coi_leakage(self, prices: np.ndarray, n_agents: int = 1) -> float:
-        """COI leakage tied to Theorem 1: erosion from order statistic collapse.
-
-        As N agents query, min(p_1..p_N) → p_min and COI → 0.
-        Leakage = erosion_rate × margin_at_risk
-        """
-        price_std = float(np.std(prices))
-        erosion = coi_erosion(max(1, n_agents), price_std)
-        margin_at_risk = float(np.mean(prices - self.costs))
-        return erosion * margin_at_risk
+    def _compute_coi_window(self, demand: Dict[str, float]) -> COIWindow:
+        if not self._last_sessions:
+            zeros = np.zeros(self.n, dtype=float)
+            return COIWindow(
+                policy=0.0,
+                agent=0.0,
+                leak=0.0,
+                survival_ratio=0.0,
+                policy_by_product=zeros,
+                agent_by_product=zeros,
+                demand_weights=zeros,
+            )
+        return compute_coi_window(self._last_sessions, self.costs, demand_mapping=demand)
 
     def _objective(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
         """Robust objective: R(p,d) - λ·COI_leak (Eq 23 simplified)."""
         revenue = self._revenue_under_demand(prices, demand)
         cost = float(np.sum(self.costs))  # fixed cost approximation
         profit = revenue - cost
-        coi_penalty = self.lambda_coi * self._coi_leakage(prices) * float(np.mean(prices - self.costs))
-        return profit - coi_penalty
+        self._last_coi = self._compute_coi_window(demand)
+        return profit - self.lambda_coi * self._last_coi.leak
 
     def compute_prices(self, demand: Dict[str, float] | None = None) -> np.ndarray:
         """Compute next prices via simple gradient-like update on robust objective.
@@ -230,28 +388,44 @@ class System:
 
     def observe_demand(self, prices: np.ndarray, alpha_true: float = 0.2, n_sessions: int = 50) -> Dict[str, float]:
         """Observe market response to prices."""
-        sessions, demand_map = put_prices_to_market(prices, alpha=alpha_true, n_sessions=n_sessions, seed=int(self.rng.integers(0, 10000)))
+        sessions, demand_map = put_prices_to_market(prices, costs=self.costs, alpha=alpha_true, n_sessions=n_sessions, seed=int(self.rng.integers(0, 10000)))
+        self._last_sessions = sessions
         self._sessions.extend(sessions)  # store actual sessions for correct product attribution
         self.limbo.add_update("demand", demand_map)
         return demand_map
 
-    def step(self, alpha_true: float = 0.2, n_sessions: int = 50) -> Tuple[np.ndarray, Dict[str, float], float]:
+    def step(self, alpha_true: float = 0.2, n_sessions: int = 50) -> Tuple[np.ndarray, Dict[str, float], float, COIWindow]:
         """Single simulation step: prices -> demand -> reward."""
         demand_hist = self.limbo.get_demand_history()
         prices = self.compute_prices(demand_hist[-1] if demand_hist else None)
         demand = self.observe_demand(prices, alpha_true, n_sessions)
         reward = self._objective(prices, demand)
-        return prices, demand, reward
+        coi = self._last_coi or self._compute_coi_window(demand)
+        return prices, demand, reward, coi
 
     def run(self, n_steps: int = 100, alpha_true: float = 0.2) -> Dict:
         """Run simulation for n_steps, return trajectory."""
-        trajectory = {"prices": [], "demand": [], "rewards": [], "alpha_est": [], "alpha_true": alpha_true}
+        trajectory = {
+            "prices": [],
+            "demand": [],
+            "rewards": [],
+            "alpha_est": [],
+            "alpha_true": alpha_true,
+            "coi_policy": [],
+            "coi_agent": [],
+            "coi_leak": [],
+            "coi_survival": [],
+        }
         for _ in range(n_steps):
-            p, d, r = self.step(alpha_true)
+            p, d, r, coi = self.step(alpha_true)
             trajectory["prices"].append(p)
             trajectory["demand"].append(d)
             trajectory["rewards"].append(r)
             trajectory["alpha_est"].append(self._alpha_est)
+            trajectory["coi_policy"].append(coi.policy)
+            trajectory["coi_agent"].append(coi.agent)
+            trajectory["coi_leak"].append(coi.leak)
+            trajectory["coi_survival"].append(coi.survival_ratio)
         return trajectory
 
 
@@ -268,10 +442,17 @@ if __name__ == "__main__":
     # quick demo
     sys = System(n_products=5, seed=42)
     traj = sys.run(n_steps=20, alpha_true=0.25)
-    print(f"avg reward: {np.mean(traj['rewards']):.2f}, final α̂: {traj['alpha_est'][-1]:.3f}")
+    print(
+        f"avg reward: {np.mean(traj['rewards']):.2f}, "
+        f"final α̂: {traj['alpha_est'][-1]:.3f}, "
+        f"COI_policy: {np.mean(traj['coi_policy']):.3f}, "
+        f"COI_agent: {np.mean(traj['coi_agent']):.3f}, "
+        f"leak: {np.mean(traj['coi_leak']):.3f}"
+    )
 
     prices = np.array([20.0, 35.0, 50.0, 25.0, 40.0])
-    sessions, demand = put_prices_to_market(prices, alpha=0.3, n_sessions=20, seed=123)
+    costs = np.array([15.0, 28.0, 40.0, 18.0, 30.0])
+    sessions, demand = put_prices_to_market(prices, costs=costs, alpha=0.3, n_sessions=20, seed=123)
     print(f'sessions: {len(sessions)}, agents: {sum(1 for s in sessions if s.actor=="A")}')
 
     for n in [1, 5, 10, 50, 100]:
diff --git a/lab/case/thesis/simplified_env.py b/lab/case/thesis/simplified_env.py
index af4af87..e59ae41 100644
--- a/lab/case/thesis/simplified_env.py
+++ b/lab/case/thesis/simplified_env.py
@@ -19,8 +19,19 @@ try:
 except ImportError:
     HAS_GYM = False
 
-from .simplified import (System, Session, Event, Limbo, put_prices_to_market,
-                         compute_demand, estimate_alpha, coi_erosion, TRANS_H, TRANS_A)
+from .simplified import (
+    System,
+    Session,
+    Event,
+    Limbo,
+    put_prices_to_market,
+    compute_coi_window,
+    compute_demand,
+    estimate_alpha,
+    coi_erosion,
+    TRANS_H,
+    TRANS_A,
+)
 
 
 @dataclass
@@ -116,9 +127,19 @@ class PricingEnv(gym.Env if HAS_GYM else object):
                 agg[pidx] += q
         self._demand_agg = agg
 
-        revenue = float(np.dot(prices, agg))
-        cost = float(np.dot(sys.costs, np.clip(agg, 0, 1)))  # simplified cost model
-        profit = revenue - cost
+        revenue = 0.0
+        cost = 0.0
+        purchases = np.zeros(self.n, dtype=float)
+        for sess in sys._last_sessions:
+            for e in sess.events:
+                if e.action != "purchase":
+                    continue
+                pidx = int(e.product_idx)
+                if 0 <= pidx < self.n:
+                    purchases[pidx] += 1.0
+                    revenue += float(e.price_seen)
+                    cost += float(sys.costs[pidx])
+        profit = float(revenue - cost)
 
         # volatility penalty (price changes)
         vol_penalty = 0.0
@@ -126,9 +147,8 @@ class PricingEnv(gym.Env if HAS_GYM else object):
             price_change = np.abs(prices - self._last_prices) / (sys.refs + 1e-6)
             vol_penalty = cfg.lambda_vol * float(np.mean(price_change))
 
-        # COI leakage penalty
-        avg_margin = float(np.mean(prices - sys.costs))
-        coi_leak = sys.alpha * avg_margin
+        coi = compute_coi_window(sys._last_sessions, sys.costs, demand_mapping=demand)
+        coi_leak = float(coi.leak)
 
         if cfg.reward_mode == "revenue":
             r = revenue
@@ -181,11 +201,11 @@ class PricingEnv(gym.Env if HAS_GYM else object):
         prices = self._sys.refs * action.astype(np.float64)
         prices = np.clip(prices, self._sys.costs * 1.01, self._sys.refs * 2.0)
 
-        # drift contamination
-        if self.cfg.alpha_drift != 0:
-            self._alpha = np.clip(
-                self._alpha + self.cfg.alpha_drift * self._sys.rng.normal(),
-                *self.cfg.alpha_bounds)
+        # # drift contamination
+        # if self.cfg.alpha_drift != 0:
+        #     self._alpha = np.clip(
+        #         self._alpha + self.cfg.alpha_drift * self._sys.rng.normal(),
+        #         *self.cfg.alpha_bounds)
 
         # observe demand
         demand = self._sys.observe_demand(prices, alpha_true=self._alpha, n_sessions=self.cfg.sessions_per_step)
@@ -205,25 +225,38 @@ class PricingEnv(gym.Env if HAS_GYM else object):
         truncated = False
 
         # compute metrics for tracking
-        revenue = float(np.dot(prices, self._demand_agg))
-        cost = float(np.dot(self._sys.costs, np.clip(self._demand_agg, 0, 1)))
-        profit = revenue - cost
+        revenue = 0.0
+        cost = 0.0
+        n_purchases = 0
+        for sess in self._sys._last_sessions:
+            for e in sess.events:
+                if e.action != "purchase":
+                    continue
+                n_purchases += 1
+                revenue += float(e.price_seen)
+                cost += float(self._sys.costs[int(e.product_idx)])
+        profit = float(revenue - cost)
         n_agents = int(self._alpha * self.cfg.sessions_per_step)
         price_std = float(np.std(prices))
+        coi = compute_coi_window(self._sys._last_sessions, self._sys.costs, demand_mapping=demand)
 
         info = {
             "alpha_true": self._alpha,
             "alpha_est": self._sys.alpha,
             "alpha_error": abs(self._alpha - self._sys.alpha),
-            "revenue": revenue,
-            "profit": profit,
-            "cost": cost,
+            "revenue": float(revenue),
+            "profit": float(profit),
+            "cost": float(cost),
+            "n_purchases": int(n_purchases),
             "avg_margin": float(np.mean((prices - self._sys.costs) / self._sys.costs)),
             "n_sessions": len(demand),
             "n_agents": n_agents,
             "price_std": price_std,
             "coi_erosion": coi_erosion(max(1, n_agents), price_std),
-            "coi_leakage": self._sys.alpha * float(np.mean(prices - self._sys.costs)),
+            "coi_policy": float(coi.policy),
+            "coi_agent": float(coi.agent),
+            "coi_leakage": float(coi.leak),
+            "coi_survival": float(coi.survival_ratio),
             "cumulative_reward": sum(self._episode_rewards),
             "step": self._t,
         }
diff --git a/lab/case/thesis/train.py b/lab/case/thesis/train.py
index cd134fd..cc152b5 100644
--- a/lab/case/thesis/train.py
+++ b/lab/case/thesis/train.py
@@ -2,11 +2,6 @@
 
 Trains pricing policies using stable-baselines3 with TensorBoard logging.
 Tracks COI erosion, alpha estimation error, and economic KPIs per thesis formulation.
-
-Usage:
-    python -m lab.case.thesis.train --algo ppo --alpha 0.3 --steps 100000
-    python -m lab.case.thesis.train --algo adaptive --sweep  # run alpha sweep
-    tensorboard --logdir lab/case/thesis/runs
 """
 from __future__ import annotations
 import argparse
@@ -41,9 +36,9 @@ class EpisodeMetrics:
     reward: float = 0.0
     revenue: float = 0.0
     profit: float = 0.0
-    coi_erosion: float = 0.0      # theorem 1: order statistic erosion
-    coi_leakage: float = 0.0      # per-step leakage penalty
-    alpha_error: float = 0.0      # |α - α̂|
+    coi_erosion: float = 0.0
+    coi_leakage: float = 0.0
+    alpha_error: float = 0.0
     avg_margin: float = 0.0
     n_agents: int = 0
     steps: int = 0
@@ -213,6 +208,7 @@ def train(cfg: ExperimentConfig) -> Dict[str, Any]:
         if algo_cls is None:
             raise ValueError(f"unknown algo: {cfg.algo}")
         common = dict(verbose=1, seed=cfg.seed, tensorboard_log=str(log_path), device="auto")
+        # TODO: setup hyper parameter passing to train different variations (no free lunch)
         if cfg.algo.lower() == "ppo":
             model = PPO("MlpPolicy", train_env, learning_rate=3e-4, n_steps=2048,
                         batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95,

From 1224841a82167e54457899427067731b174000c9 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Sat, 24 Jan 2026 23:51:57 +0100
Subject: [PATCH 52/99] preliminary improved runs

---
 lab/case/thesis/simplified.py     | 373 +++++-------------------------
 lab/case/thesis/simplified_env.py | 293 +++++++----------------
 lab/case/thesis/train.py          | 277 +++++++++++-----------
 3 files changed, 279 insertions(+), 664 deletions(-)

diff --git a/lab/case/thesis/simplified.py b/lab/case/thesis/simplified.py
index 59aef75..3c58fdd 100644
--- a/lab/case/thesis/simplified.py
+++ b/lab/case/thesis/simplified.py
@@ -1,11 +1,11 @@
 """Minimal implementation of thesis pricing system.
 
 Implements the core loop: prices -> sessions -> demand -> prices
-with behavioral separability and robust pricing objective (Eq 23).
+with behavioral separability and robust pricing objective.
 
 Objects:
-- Session trajectories τ_s from mixture of H/A behavioral profiles
-- Demand proxy q̂ via weighted action aggregation (Eq 2)
+- Session trajectories tau_s from mixture of H/A behavioral profiles
+- Demand proxy q_hat via weighted action aggregation
 - COI leakage penalty for agent reconnaissance
 - Limbo: alternating price/demand history for trajectory analysis
 """
@@ -14,11 +14,10 @@ from dataclasses import dataclass, field
 from typing import Dict, List, Tuple
 import numpy as np
 
+from .coi import COIWindow, compute_coi_window, coi_erosion
+from .separability import TRANS_H, TRANS_A, kl_div, build_kernel, compute_divergence, estimate_alpha
+
 ACTION_WEIGHTS = {"add_to_cart": 0.8, "checkout": 0.9, "purchase": 1.0, "view": 0.15, "detail": 0.25, "hover": 0.3, "start": 0.05, "end": 0.0}
-TRANS_H = {"start": {"view": 0.85, "end": 0.15}, "view": {"detail": 0.4, "cart": 0.3, "view": 0.2, "end": 0.1},
-           "detail": {"cart": 0.5, "view": 0.3, "end": 0.2}, "cart": {"purchase": 0.6, "view": 0.25, "end": 0.15}, "purchase": {"end": 1.0}}
-TRANS_A = {"start": {"view": 0.95, "end": 0.05}, "view": {"detail": 0.6, "view": 0.25, "cart": 0.1, "end": 0.05},
-           "detail": {"view": 0.5, "cart": 0.15, "detail": 0.3, "end": 0.05}, "cart": {"view": 0.4, "purchase": 0.2, "end": 0.4}, "purchase": {"end": 1.0}}
 
 
 @dataclass
@@ -38,235 +37,52 @@ class Session:
 
 
 def compute_demand(session: Session) -> float:
-    """Compute demand proxy q̂ = Σ_k ω(a_k) for session (Eq 2)."""
+    """Compute demand proxy q_hat = sum_k omega(a_k) for session."""
     return sum(ACTION_WEIGHTS.get(e.action, 0.1) for e in session.events)
 
 
-def kl_div(p: Dict[str, float], q: Dict[str, float]) -> float:
-    """KL divergence D_KL(p || q) for transition kernels."""
-    eps = 1e-10
-    keys = set(p.keys()) | set(q.keys())
-    return sum(p.get(k, eps) * np.log((p.get(k, eps) + eps) / (q.get(k, eps) + eps)) for k in keys)
-
-
-def build_kernel(events: List[Event]) -> Dict[str, Dict[str, float]]:
-    """Build empirical transition kernel from trajectory."""
-    trans: Dict[str, Dict[str, int]] = {}
-    prev = "start"
-    for e in events:
-        curr = e.action
-        trans.setdefault(prev, {})
-        trans[prev][curr] = trans[prev].get(curr, 0) + 1
-        prev = curr
-    kernel = {}
-    for s, dsts in trans.items():
-        total = sum(dsts.values())
-        kernel[s] = {d: c / total for d, c in dsts.items()} if total > 0 else {}
-    return kernel
-
-
-def compute_divergence(session: Session) -> Tuple[float, float]:
-    """Compute Δ_H, Δ_A divergence signals (Eq 20-21)."""
-    kernel = build_kernel(session.events)
-    delta_h = sum(kl_div(kernel.get(s, {}), TRANS_H.get(s, {})) for s in kernel) / max(len(kernel), 1)
-    delta_a = sum(kl_div(kernel.get(s, {}), TRANS_A.get(s, {})) for s in kernel) / max(len(kernel), 1)
-    return delta_h, delta_a
-
-
-def estimate_alpha(session: Session, beta: float = 2.0) -> float:
-    """Per-session contamination estimate α̂(τ') = σ(β(Δ_H - Δ_A))."""
-    dh, da = compute_divergence(session)
-    return 1.0 / (1.0 + np.exp(-beta * (dh - da))) if (dh + da) > 0 else 0.5
-
-
-@dataclass(frozen=True)
-class COIWindow:
-    """Windowed COI metrics computed from realized price exposures.
-
-    COI_policy is the definition-level KPI: E[p_shown] - p_min.
-    COI_agent is the theorem-level object: E[p^(1)] - p_min, where p^(1) is the minimum price realized under agent querying.
-    In this simplified simulator, p^(1) is approximated as the minimum price exposed to any agent in the window (per product).
-    Leak is the observable gap between them.
-    """
-
-    policy: float
-    agent: float
-    leak: float
-    survival_ratio: float
-    policy_by_product: np.ndarray
-    agent_by_product: np.ndarray
-    demand_weights: np.ndarray
-
-
-def _prices_by_product(sessions: List[Session]) -> Dict[int, List[float]]:
-    prices: Dict[int, List[float]] = {}
-    for s in sessions:
-        for e in s.events:
-            prices.setdefault(e.product_idx, []).append(float(e.price_seen))
-    return prices
-
-
-def _min_session_prices_by_product(sessions: List[Session]) -> Dict[int, List[float]]:
-    mins: Dict[int, List[float]] = {}
-    for s in sessions:
-        by_p: Dict[int, float] = {}
-        for e in s.events:
-            pidx = int(e.product_idx)
-            price = float(e.price_seen)
-            by_p[pidx] = price if pidx not in by_p else min(by_p[pidx], price)
-        for pidx, pmin in by_p.items():
-            mins.setdefault(pidx, []).append(pmin)
-    return mins
-
-
-def _min_price_across_sessions_by_product(sessions: List[Session]) -> Dict[int, float]:
-    mins: Dict[int, float] = {}
-    for s in sessions:
-        for e in s.events:
-            pidx = int(e.product_idx)
-            price = float(e.price_seen)
-            mins[pidx] = price if pidx not in mins else min(mins[pidx], price)
-    return mins
-
-
-def _demand_weights_by_product(
-    sessions: List[Session],
-    demand_mapping: Dict[str, float],
-    n_products: int,
-) -> np.ndarray:
-    w = np.zeros(n_products, dtype=float)
-    sessions_by_id = {s.sid: s for s in sessions}
-    for sid, q in demand_mapping.items():
-        sess = sessions_by_id.get(sid)
-        if not sess or not sess.events:
-            continue
-        pidx = int(sess.events[0].product_idx)
-        w[pidx] += float(q)
-    s = float(np.sum(w))
-    return (w / s) if s > 0 else w
-
-
-def compute_coi_window(
-    sessions: List[Session],
-    costs: np.ndarray,
-    demand_mapping: Dict[str, float] | None = None,
-) -> COIWindow:
-    n_products = int(len(costs))
-    prices = _prices_by_product(sessions)
-    agent_min_across = _min_price_across_sessions_by_product([s for s in sessions if s.actor == "A"])
-
-    policy_by_product = np.zeros(n_products, dtype=float)
-    agent_by_product = np.zeros(n_products, dtype=float)
-    seen = np.array([(i in prices) for i in range(n_products)], dtype=bool)
-    agent_seen = np.array([(i in agent_min_across) for i in range(n_products)], dtype=bool)
-
-    for pidx, ps in prices.items():
-        if 0 <= pidx < n_products and ps:
-            policy_by_product[pidx] = float(np.mean(ps) - float(costs[pidx]))
-
-    for pidx, pmin in agent_min_across.items():
-        if 0 <= pidx < n_products:
-            agent_by_product[pidx] = float(pmin - float(costs[pidx]))
-
-    # If no agent exposure exists for a product in the window, there is no realized erosion for that product.
-    agent_by_product[seen & ~agent_seen] = policy_by_product[seen & ~agent_seen]
-
-    demand_weights = (
-        _demand_weights_by_product(sessions, demand_mapping, n_products)
-        if demand_mapping is not None
-        else np.zeros(n_products, dtype=float)
-    )
-
-    has_weights = float(np.sum(demand_weights)) > 0
-    if has_weights:
-        policy = float(np.dot(demand_weights, policy_by_product))
-        agent = float(np.dot(demand_weights, agent_by_product))
-    else:
-        if not bool(np.any(seen)):
-            policy = 0.0
-            agent = 0.0
-        else:
-            policy = float(np.mean(policy_by_product[seen]))
-            agent = float(np.mean(agent_by_product[seen]))
-
-    leak = float(max(policy - agent, 0.0))
-    survival_ratio = float(np.clip(agent / policy, 0.0, 1.0)) if policy > 0 else 0.0
-
-    return COIWindow(
-        policy=policy,
-        agent=agent,
-        leak=leak,
-        survival_ratio=survival_ratio,
-        policy_by_product=policy_by_product,
-        agent_by_product=agent_by_product,
-        demand_weights=demand_weights,
-    )
-
-
-def sample_trajectory(
-    rng: np.random.Generator,
-    trans: Dict,
-    prices: np.ndarray,
-    costs: np.ndarray,
-    theta: Dict[str, float],
-    is_agent: bool,
-    session_price_noise: float = 0.02,
-    surge: float = 0.08,
-    max_markup_mult: float = 1.8,
-) -> Tuple[List[Event], int]:
+def sample_trajectory(rng: np.random.Generator, trans: Dict, prices: np.ndarray, costs: np.ndarray, theta: Dict[str, float],
+                      is_agent: bool, session_noise: float = 0.02, surge: float = 0.08, max_mult: float = 1.8) -> Tuple[List[Event], int]:
     """Sample session trajectory from behavioral kernel."""
-    state, t, pidx = "start", 0.0, int(rng.integers(0, len(prices)))
-    cost = float(costs[pidx])
-    base_price = float(prices[pidx]) * float(1.0 + rng.normal(0.0, session_price_noise))
-    base_price = float(np.clip(base_price, cost * 1.01, float(prices[pidx]) * 2.0))
-    current_price = base_price
-    signal = 0.0
+    pidx = int(rng.integers(0, len(prices)))
+    cost, base = float(costs[pidx]), float(prices[pidx]) * (1.0 + rng.normal(0.0, session_noise))
+    base = float(np.clip(base, cost * 1.01, float(prices[pidx]) * 2.0))
+    price, signal, state, t = base, 0.0, "start", 0.0
     events = []
-    # TODO: instead of this very controlled setup implement same session samplin as in models.py
+
     while state != "end" and len(events) < 30:
         probs = trans.get(state, {"end": 1.0})
         nxt = rng.choice(list(probs.keys()), p=list(probs.values()))
-
-        if nxt == "purchase":
-            price_sens = float(theta.get("price_sens", 2.0))
-            base_conv = float(theta.get("base_conv", 0.2))
-            rel = max((current_price - cost) / (cost + 1e-6), 0.0)
-            p_buy = float(np.clip(base_conv * np.exp(-price_sens * rel), 0.0, 1.0))
+        if nxt == "purchase":  # purchase conversion check
+            rel = max((price - cost) / (cost + 1e-6), 0.0)
+            p_buy = float(np.clip(theta.get("base_conv", 0.2) * np.exp(-theta.get("price_sens", 2.0) * rel), 0.0, 1.0))
             if rng.random() > p_buy:
                 nxt = "end"
-
         state = nxt
         if state not in {"start", "end"}:
-            events.append(Event(action=state, product_idx=pidx, price_seen=float(current_price), ts=t))
+            events.append(Event(action=state, product_idx=pidx, price_seen=float(price), ts=t))
             signal += float(ACTION_WEIGHTS.get(state, 0.1))
-            current_price = float(np.clip(base_price * (1.0 + surge * signal), cost * 1.01, base_price * max_markup_mult))
-
+            price = float(np.clip(base * (1.0 + surge * signal), cost * 1.01, base * max_mult))
         t += max(0.2, rng.gamma(1.5, 0.8) if is_agent else rng.gamma(2.0, 1.2))
     return events, pidx
 
 
 def put_prices_to_market(prices: np.ndarray, costs: np.ndarray, alpha: float = 0.2, n_sessions: int = 50,
                          seed: int | None = None) -> Tuple[List[Session], Dict[str, float]]:
-    """Generate sessions from mixture model
-
-    Returns:
-        sessions: list of Session objects with events and product attribution
-        demand_mapping: session_id -> demand proxy q̂
-    """
+    """Generate sessions from mixture model. Returns sessions and demand mapping sid -> q_hat."""
     rng = np.random.default_rng(seed)
-    sessions, demand_mapping = [], {}
-
+    sessions, demand = [], {}
     for i in range(n_sessions):
         sid = f"s{i:04d}"
         is_agent = rng.random() < alpha
         trans = TRANS_A if is_agent else TRANS_H
-        theta = {"price_sens": rng.uniform(0.05, 0.2), "base_conv": 0.01} if is_agent else {"price_sens": rng.uniform(1.5, 4.0), "base_conv": rng.uniform(0.2, 0.5)}
+        theta = {"price_sens": rng.uniform(0.05, 0.2), "base_conv": 0.01} if is_agent else \
+                {"price_sens": rng.uniform(1.5, 4.0), "base_conv": rng.uniform(0.2, 0.5)}
         events, _ = sample_trajectory(rng, trans, prices, costs=costs, theta=theta, is_agent=is_agent)
         session = Session(sid=sid, events=events, actor="A" if is_agent else "H", theta=theta)
         sessions.append(session)
-        demand_mapping[sid] = compute_demand(session)
-
-    return sessions, demand_mapping
+        demand[sid] = compute_demand(session)
+    return sessions, demand
 
 
 @dataclass
@@ -286,13 +102,7 @@ class Limbo:
     def add_update(self, utype: str, data: np.ndarray | Dict[str, float]) -> Dict:
         self.history.append(LimboUpdate(utype=utype, data=data, t=self._t))
         self._t += 1
-        return self.on_update(utype)
-
-    def on_update(self, utype: str) -> Dict:
-        """React to update: after prices -> return observed demand; after demand -> signal price update needed."""
-        if utype == "prices":
-            return {"action": "observe_demand", "msg": "awaiting market response"}
-        return {"action": "set_prices", "msg": "demand observed, update prices"}
+        return {"action": "observe_demand" if utype == "prices" else "set_prices"}
 
     def get_prices_history(self) -> List[np.ndarray]:
         return [u.data for u in self.history if u.utype == "prices"]
@@ -304,21 +114,18 @@ class Limbo:
 class System:
     """Main pricing system implementing robust Stackelberg objective.
 
-    Manages the alternating loop:
-    1. Set prices p_t
-    2. Observe demand response Q̂(p_t)
-    3. Estimate contamination α from behavioral signals
-    4. Compute next prices via robust objective (Eq 23)
+    Manages the alternating loop: set prices p_t -> observe demand Q_hat(p_t) ->
+    estimate contamination alpha from behavioral signals -> compute next prices.
     """
 
     def __init__(self, n_products: int = 10, costs: np.ndarray | None = None, lambda_coi: float = 0.5, seed: int | None = 42):
         self.n = n_products
         self.rng = np.random.default_rng(seed)
         self.costs = costs if costs is not None else self.rng.uniform(10, 50, n_products)
-        self.refs = self.costs * (1 + self.rng.uniform(0.2, 0.5, n_products))  # base prices with margin
+        self.refs = self.costs * (1 + self.rng.uniform(0.2, 0.5, n_products))
         self.lambda_coi = lambda_coi
         self.limbo = Limbo()
-        self._alpha_est = 0.2  # current contamination estimate
+        self._alpha_est = 0.2
         self._sessions: List[Session] = []
         self._last_sessions: List[Session] = []
         self._last_coi: COIWindow | None = None
@@ -328,127 +135,73 @@ class System:
         return self._alpha_est
 
     def _estimate_alpha_from_sessions(self) -> float:
-        """Aggregate per-session α̂ estimates."""
         if not self._sessions:
             return self._alpha_est
-        alphas = [estimate_alpha(s) for s in self._sessions[-50:]]  # use recent sessions
-        return float(np.mean(alphas))
+        return float(np.mean([estimate_alpha(s) for s in self._sessions[-50:]]))
 
     def _revenue_under_demand(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
-        """Compute expected revenue R(p, d) from demand proxy."""
-        agg_demand = np.zeros(self.n)
+        agg = np.zeros(self.n)
         for sid, q in demand.items():
-            if self._sessions:
-                sess = next((s for s in self._sessions if s.sid == sid), None)
-                if sess and sess.events:
-                    pidx = sess.events[0].product_idx
-                    agg_demand[pidx] += q
-        return float(np.dot(prices, agg_demand))
+            sess = next((s for s in self._sessions if s.sid == sid), None)
+            if sess and sess.events:
+                agg[sess.events[0].product_idx] += q
+        return float(np.dot(prices, agg))
 
     def _compute_coi_window(self, demand: Dict[str, float]) -> COIWindow:
         if not self._last_sessions:
             zeros = np.zeros(self.n, dtype=float)
-            return COIWindow(
-                policy=0.0,
-                agent=0.0,
-                leak=0.0,
-                survival_ratio=0.0,
-                policy_by_product=zeros,
-                agent_by_product=zeros,
-                demand_weights=zeros,
-            )
+            return COIWindow(policy=0.0, agent=0.0, leak=0.0, survival_ratio=0.0,
+                             policy_by_product=zeros, agent_by_product=zeros, demand_weights=zeros)
         return compute_coi_window(self._last_sessions, self.costs, demand_mapping=demand)
 
     def _objective(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
-        """Robust objective: R(p,d) - λ·COI_leak (Eq 23 simplified)."""
-        revenue = self._revenue_under_demand(prices, demand)
-        cost = float(np.sum(self.costs))  # fixed cost approximation
-        profit = revenue - cost
+        """Robust objective: R(p,d) - lambda * COI_leak."""
+        profit = self._revenue_under_demand(prices, demand) - float(np.sum(self.costs))
         self._last_coi = self._compute_coi_window(demand)
         return profit - self.lambda_coi * self._last_coi.leak
 
     def compute_prices(self, demand: Dict[str, float] | None = None) -> np.ndarray:
-        """Compute next prices via simple gradient-like update on robust objective.
-
-        In a full implementation this would be replaced by DR-RL policy output.
-        Here we use a heuristic: adjust margins based on α estimate.
-        """
+        """Compute next prices via heuristic margin adjustment based on alpha estimate."""
         self._alpha_est = self._estimate_alpha_from_sessions()
-
-        # base margin adjustment: higher α -> lower margins (defensive pricing)
-        margin_scale = 1.0 - 0.5 * self._alpha_est  # reduce margins under high contamination
+        margin_scale = 1.0 - 0.5 * self._alpha_est  # defensive pricing under high contamination
         margins = (self.refs - self.costs) * margin_scale
-
-        # add small noise for exploration
         noise = self.rng.normal(0, 0.02, self.n) * self.costs
         prices = np.clip(self.costs + margins + noise, self.costs * 1.02, self.refs * 1.3)
-
         self.limbo.add_update("prices", prices)
         return prices
 
     def observe_demand(self, prices: np.ndarray, alpha_true: float = 0.2, n_sessions: int = 50) -> Dict[str, float]:
-        """Observe market response to prices."""
-        sessions, demand_map = put_prices_to_market(prices, costs=self.costs, alpha=alpha_true, n_sessions=n_sessions, seed=int(self.rng.integers(0, 10000)))
+        sessions, demand_map = put_prices_to_market(prices, costs=self.costs, alpha=alpha_true,
+                                                    n_sessions=n_sessions, seed=int(self.rng.integers(0, 10000)))
         self._last_sessions = sessions
-        self._sessions.extend(sessions)  # store actual sessions for correct product attribution
+        self._sessions.extend(sessions)
         self.limbo.add_update("demand", demand_map)
         return demand_map
 
     def step(self, alpha_true: float = 0.2, n_sessions: int = 50) -> Tuple[np.ndarray, Dict[str, float], float, COIWindow]:
-        """Single simulation step: prices -> demand -> reward."""
         demand_hist = self.limbo.get_demand_history()
         prices = self.compute_prices(demand_hist[-1] if demand_hist else None)
         demand = self.observe_demand(prices, alpha_true, n_sessions)
         reward = self._objective(prices, demand)
-        coi = self._last_coi or self._compute_coi_window(demand)
-        return prices, demand, reward, coi
+        return prices, demand, reward, self._last_coi or self._compute_coi_window(demand)
 
     def run(self, n_steps: int = 100, alpha_true: float = 0.2) -> Dict:
-        """Run simulation for n_steps, return trajectory."""
-        trajectory = {
-            "prices": [],
-            "demand": [],
-            "rewards": [],
-            "alpha_est": [],
-            "alpha_true": alpha_true,
-            "coi_policy": [],
-            "coi_agent": [],
-            "coi_leak": [],
-            "coi_survival": [],
-        }
+        traj = {"prices": [], "demand": [], "rewards": [], "alpha_est": [], "alpha_true": alpha_true,
+                "coi_policy": [], "coi_agent": [], "coi_leak": [], "coi_survival": []}
         for _ in range(n_steps):
             p, d, r, coi = self.step(alpha_true)
-            trajectory["prices"].append(p)
-            trajectory["demand"].append(d)
-            trajectory["rewards"].append(r)
-            trajectory["alpha_est"].append(self._alpha_est)
-            trajectory["coi_policy"].append(coi.policy)
-            trajectory["coi_agent"].append(coi.agent)
-            trajectory["coi_leak"].append(coi.leak)
-            trajectory["coi_survival"].append(coi.survival_ratio)
-        return trajectory
-
-
-def coi_erosion(n_agents: int, price_std: float) -> float:
-    """COI erosion from Theorem 1: as N->inf, min(p_1..p_N)->p_min."""
-    if n_agents <= 1:
-        return 0.0
-    log_n = np.log(n_agents)
-    shift = price_std * (np.sqrt(2 * log_n) - (np.log(log_n) + np.log(4 * np.pi)) / (2 * np.sqrt(2 * log_n) + 1e-6))
-    return float(min(shift / (price_std * 2 + 1e-6), 1.0))
+            traj["prices"].append(p); traj["demand"].append(d); traj["rewards"].append(r)
+            traj["alpha_est"].append(self._alpha_est)
+            traj["coi_policy"].append(coi.policy); traj["coi_agent"].append(coi.agent)
+            traj["coi_leak"].append(coi.leak); traj["coi_survival"].append(coi.survival_ratio)
+        return traj
 
 
 if __name__ == "__main__":
-    # quick demo
     sys = System(n_products=5, seed=42)
     traj = sys.run(n_steps=20, alpha_true=0.25)
-    print(
-        f"avg reward: {np.mean(traj['rewards']):.2f}, "
-        f"final α̂: {traj['alpha_est'][-1]:.3f}, "
-        f"COI_policy: {np.mean(traj['coi_policy']):.3f}, "
-        f"COI_agent: {np.mean(traj['coi_agent']):.3f}, "
-        f"leak: {np.mean(traj['coi_leak']):.3f}"
-    )
+    print(f"avg reward: {np.mean(traj['rewards']):.2f}, final alpha_hat: {traj['alpha_est'][-1]:.3f}, "
+          f"COI_policy: {np.mean(traj['coi_policy']):.3f}, COI_agent: {np.mean(traj['coi_agent']):.3f}, leak: {np.mean(traj['coi_leak']):.3f}")
 
     prices = np.array([20.0, 35.0, 50.0, 25.0, 40.0])
     costs = np.array([15.0, 28.0, 40.0, 18.0, 30.0])
@@ -456,16 +209,10 @@ if __name__ == "__main__":
     print(f'sessions: {len(sessions)}, agents: {sum(1 for s in sessions if s.actor=="A")}')
 
     for n in [1, 5, 10, 50, 100]:
-        ero = coi_erosion(n, price_std=5.0)
-        print(f'N={n:3d} agents -> COI erosion: {ero:.3f}')
+        print(f'N={n:3d} agents -> COI erosion: {coi_erosion(n, price_std=5.0):.3f}')
 
-    # test separability
-    events = [Event('view', 0, 20.0, 0.1), Event('detail', 0, 20.0, 0.5), Event('cart', 0, 20.0, 1.0),
-    Event('purchase', 0, 20.0, 2.0)]
-    sess_h = Session(sid='test', events=events, actor='H')
-    print(f'human-like session α̂: {estimate_alpha(sess_h):.3f}')
+    events = [Event('view', 0, 20.0, 0.1), Event('detail', 0, 20.0, 0.5), Event('cart', 0, 20.0, 1.0), Event('purchase', 0, 20.0, 2.0)]
+    print(f'human-like session alpha_hat: {estimate_alpha(Session(sid="test", events=events, actor="H")):.3f}')
 
-    events_a = [Event('view', 0, 20.0, 0.1), Event('detail', 0, 20.0, 0.2), Event('view', 0, 20.0, 0.3),
-    Event('detail', 0, 20.0, 0.4)]
-    sess_a = Session(sid='test2', events=events_a, actor='A')
-    print(f'agent-like session α̂: {estimate_alpha(sess_a):.3f}')
+    events_a = [Event('view', 0, 20.0, 0.1), Event('detail', 0, 20.0, 0.2), Event('view', 0, 20.0, 0.3), Event('detail', 0, 20.0, 0.4)]
+    print(f'agent-like session alpha_hat: {estimate_alpha(Session(sid="test2", events=events_a, actor="A")):.3f}')
diff --git a/lab/case/thesis/simplified_env.py b/lab/case/thesis/simplified_env.py
index e59ae41..e4cd84c 100644
--- a/lab/case/thesis/simplified_env.py
+++ b/lab/case/thesis/simplified_env.py
@@ -19,58 +19,45 @@ try:
 except ImportError:
     HAS_GYM = False
 
-from .simplified import (
-    System,
-    Session,
-    Event,
-    Limbo,
-    put_prices_to_market,
-    compute_coi_window,
-    compute_demand,
-    estimate_alpha,
-    coi_erosion,
-    TRANS_H,
-    TRANS_A,
-)
+from .simplified import System, Session, Event, Limbo, put_prices_to_market, compute_demand, estimate_alpha
+from .coi import COIWindow, compute_coi_window, coi_erosion
 
 
 @dataclass
 class EnvConfig:
-    """Configuration for pricing environment."""
     n_products: int = 5
     max_steps: int = 200
     sessions_per_step: int = 30
-    alpha_true: float = 0.2           # true contamination level
-    alpha_drift: float = 0.0          # per-step drift in α
+    alpha_true: float = 0.2
+    alpha_drift: float = 0.0
     alpha_bounds: Tuple[float, float] = (0.0, 0.6)
-    lambda_coi: float = 0.5           # COI penalty weight
-    lambda_vol: float = 0.1           # volatility penalty weight
-    reward_mode: str = "robust"       # revenue | profit | robust | coi_aware
+    lambda_coi: float = 0.5
+    lambda_vol: float = 0.1
+    reward_mode: str = "robust"  # revenue | profit | robust | coi_aware
     normalize_reward: bool = True
     seed: int | None = 42
 
 
+def aggregate_purchases(sessions: list[Session], n_products: int, costs: np.ndarray) -> Tuple[np.ndarray, float, float]:
+    """Aggregate purchases from sessions, returns (counts, revenue, cost)."""
+    purchases = np.zeros(n_products, dtype=float)
+    revenue, cost = 0.0, 0.0
+    for sess in sessions:
+        for e in sess.events:
+            if e.action == "purchase" and 0 <= e.product_idx < n_products:
+                purchases[e.product_idx] += 1.0
+                revenue += float(e.price_seen)
+                cost += float(costs[e.product_idx])
+    return purchases, revenue, cost
+
+
 class PricingEnv(gym.Env if HAS_GYM else object):
     """RL environment for dynamic pricing under agent contamination.
 
-    Implements the thesis formulation where:
-    - Platform sets prices p_t
-    - Market responds with mixture demand Q(p) = (1-α)D_H + αD_A
-    - Agent estimates contamination α̂ from behavioral signals
-    - Reward balances profit vs COI leakage
-
-    Observation space (normalized):
-        [0:n]     - current prices / ref_prices
-        [n:2n]    - aggregated demand per product
-        [2n]      - estimated contamination α̂
-        [2n+1]    - true contamination α (if observable, else 0)
-        [2n+2:3n+2] - current margins (prices - costs) / costs
-        [3n+2]    - step / max_steps
-
-    Action space:
-        price multipliers in [0.5, 1.5] applied to reference prices
+    Platform sets prices p_t, market responds with mixture demand Q(p) = (1-alpha)*D_H + alpha*D_A.
+    Agent estimates contamination alpha_hat from behavioral signals.
+    Reward balances profit vs COI leakage.
     """
-
     metadata = {"render_modes": ["human", "ansi"]}
 
     def __init__(self, cfg: EnvConfig | None = None):
@@ -86,34 +73,23 @@ class PricingEnv(gym.Env if HAS_GYM else object):
         self._episode_rewards: list[float] = []
         self._demand_agg = np.zeros(self.n)
 
-        # gymnasium spaces
         self.action_space = spaces.Box(low=0.5, high=1.5, shape=(self.n,), dtype=np.float32)
-        obs_dim = self.n + self.n + 1 + 1 + self.n + 1  # prices + demand + α̂ + α + margins + t
+        obs_dim = self.n + self.n + 1 + 1 + self.n + 1  # prices + demand + alpha_hat + alpha + margins + t
         self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float32)
 
     def _build_obs(self) -> np.ndarray:
-        """Construct observation vector."""
         if self._sys is None:
             return np.zeros(self.observation_space.shape[0], dtype=np.float32)
-
         prices = self._last_prices if self._last_prices is not None else self._sys.refs
-        price_ratio = prices / (self._sys.refs + 1e-6)
-        demand_norm = self._demand_agg / (np.sum(self._demand_agg) + 1e-6)
-        margins = (prices - self._sys.costs) / (self._sys.costs + 1e-6)
-        t_norm = self._t / self.cfg.max_steps
-
-        obs = np.concatenate([
-            price_ratio,                          # [0:n]
-            demand_norm,                          # [n:2n]
-            [self._sys.alpha],                    # [2n] estimated α̂
-            [self._alpha],                        # [2n+1] true α
-            margins,                              # [2n+2:3n+2]
-            [t_norm],                             # [3n+2]
-        ])
-        return obs.astype(np.float32)
+        return np.concatenate([
+            prices / (self._sys.refs + 1e-6),
+            self._demand_agg / (np.sum(self._demand_agg) + 1e-6),
+            [self._sys.alpha, self._alpha],
+            (prices - self._sys.costs) / (self._sys.costs + 1e-6),
+            [self._t / self.cfg.max_steps],
+        ]).astype(np.float32)
 
     def _compute_reward(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
-        """Compute reward based on configured mode."""
         cfg, sys = self.cfg, self._sys
         if sys is None:
             return 0.0
@@ -123,159 +99,77 @@ class PricingEnv(gym.Env if HAS_GYM else object):
         for sid, q in demand.items():
             sess = next((s for s in sys._sessions if s.sid == sid), None)
             if sess and sess.events:
-                pidx = sess.events[0].product_idx
-                agg[pidx] += q
+                agg[sess.events[0].product_idx] += q
         self._demand_agg = agg
 
-        revenue = 0.0
-        cost = 0.0
-        purchases = np.zeros(self.n, dtype=float)
-        for sess in sys._last_sessions:
-            for e in sess.events:
-                if e.action != "purchase":
-                    continue
-                pidx = int(e.product_idx)
-                if 0 <= pidx < self.n:
-                    purchases[pidx] += 1.0
-                    revenue += float(e.price_seen)
-                    cost += float(sys.costs[pidx])
-        profit = float(revenue - cost)
+        _, revenue, cost = aggregate_purchases(sys._last_sessions, self.n, sys.costs)
+        profit = revenue - cost
 
-        # volatility penalty (price changes)
         vol_penalty = 0.0
         if self._last_prices is not None:
-            price_change = np.abs(prices - self._last_prices) / (sys.refs + 1e-6)
-            vol_penalty = cfg.lambda_vol * float(np.mean(price_change))
+            vol_penalty = cfg.lambda_vol * float(np.mean(np.abs(prices - self._last_prices) / (sys.refs + 1e-6)))
 
         coi = compute_coi_window(sys._last_sessions, sys.costs, demand_mapping=demand)
-        coi_leak = float(coi.leak)
+        leak = float(coi.leak)
 
-        if cfg.reward_mode == "revenue":
-            r = revenue
-        elif cfg.reward_mode == "profit":
-            r = profit
-        elif cfg.reward_mode == "robust":
-            # robust objective: profit - λ_coi * COI_leak - λ_vol * volatility
-            r = profit - cfg.lambda_coi * coi_leak - vol_penalty
-        elif cfg.reward_mode == "coi_aware":
-            # adaptive: heavier penalty at high contamination
-            adaptive_lambda = cfg.lambda_coi * (1 + 2 * sys.alpha)
-            r = profit - adaptive_lambda * coi_leak - vol_penalty
-        else:
-            r = profit
-
-        if cfg.normalize_reward:
-            r = r / (float(np.sum(sys.refs)) + 1e-6)  # normalize by potential revenue
-
-        return float(r)
+        reward_fns = {
+            "revenue": lambda: revenue,
+            "profit": lambda: profit,
+            "robust": lambda: profit - cfg.lambda_coi * leak - vol_penalty,
+            "coi_aware": lambda: profit - cfg.lambda_coi * (1 + 2 * sys.alpha) * leak - vol_penalty,
+        }
+        r = reward_fns.get(cfg.reward_mode, lambda: profit)()
+        return float(r / (float(np.sum(sys.refs)) + 1e-6)) if cfg.normalize_reward else float(r)
 
     def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
-        """Reset environment to initial state."""
         seed = seed if seed is not None else self.cfg.seed
         self._sys = System(n_products=self.n, lambda_coi=self.cfg.lambda_coi, seed=seed)
-        self._t = 0
-        self._alpha = self.cfg.alpha_true
-        self._last_prices = None
-        self._last_demand = None
-        self._episode_rewards = []
-        self._demand_agg = np.zeros(self.n)
-
-        info = {"alpha_true": self._alpha, "alpha_est": self._sys.alpha,
-                "costs": self._sys.costs.copy(), "refs": self._sys.refs.copy()}
-        return self._build_obs(), info
+        self._t, self._alpha = 0, self.cfg.alpha_true
+        self._last_prices, self._last_demand = None, None
+        self._episode_rewards, self._demand_agg = [], np.zeros(self.n)
+        return self._build_obs(), {"alpha_true": self._alpha, "alpha_est": self._sys.alpha,
+                                   "costs": self._sys.costs.copy(), "refs": self._sys.refs.copy()}
 
     def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]:
-        """Execute one environment step.
-
-        Args:
-            action: price multipliers in [0.5, 1.5]
-
-        Returns:
-            obs, reward, terminated, truncated, info
-        """
         if self._sys is None:
             raise RuntimeError("call reset() first")
 
-        # convert action to prices
         action = np.clip(action, 0.5, 1.5)
-        prices = self._sys.refs * action.astype(np.float64)
-        prices = np.clip(prices, self._sys.costs * 1.01, self._sys.refs * 2.0)
-
-        # # drift contamination
-        # if self.cfg.alpha_drift != 0:
-        #     self._alpha = np.clip(
-        #         self._alpha + self.cfg.alpha_drift * self._sys.rng.normal(),
-        #         *self.cfg.alpha_bounds)
-
-        # observe demand
+        prices = np.clip(self._sys.refs * action.astype(np.float64), self._sys.costs * 1.01, self._sys.refs * 2.0)
         demand = self._sys.observe_demand(prices, alpha_true=self._alpha, n_sessions=self.cfg.sessions_per_step)
         self._sys.limbo.add_update("prices", prices)
-
-        # update α estimate
         self._sys._alpha_est = self._sys._estimate_alpha_from_sessions()
 
         reward = self._compute_reward(prices, demand)
         self._episode_rewards.append(reward)
-
-        self._last_prices = prices.copy()
-        self._last_demand = demand
+        self._last_prices, self._last_demand = prices.copy(), demand
         self._t += 1
 
-        terminated = self._t >= self.cfg.max_steps
-        truncated = False
-
-        # compute metrics for tracking
-        revenue = 0.0
-        cost = 0.0
-        n_purchases = 0
-        for sess in self._sys._last_sessions:
-            for e in sess.events:
-                if e.action != "purchase":
-                    continue
-                n_purchases += 1
-                revenue += float(e.price_seen)
-                cost += float(self._sys.costs[int(e.product_idx)])
-        profit = float(revenue - cost)
+        # compute info metrics using shared helper
+        purchases, revenue, cost = aggregate_purchases(self._sys._last_sessions, self.n, self._sys.costs)
         n_agents = int(self._alpha * self.cfg.sessions_per_step)
-        price_std = float(np.std(prices))
         coi = compute_coi_window(self._sys._last_sessions, self._sys.costs, demand_mapping=demand)
 
         info = {
-            "alpha_true": self._alpha,
-            "alpha_est": self._sys.alpha,
+            "alpha_true": self._alpha, "alpha_est": self._sys.alpha,
             "alpha_error": abs(self._alpha - self._sys.alpha),
-            "revenue": float(revenue),
-            "profit": float(profit),
-            "cost": float(cost),
-            "n_purchases": int(n_purchases),
+            "revenue": float(revenue), "profit": float(revenue - cost), "cost": float(cost),
+            "n_purchases": int(np.sum(purchases)),
             "avg_margin": float(np.mean((prices - self._sys.costs) / self._sys.costs)),
-            "n_sessions": len(demand),
-            "n_agents": n_agents,
-            "price_std": price_std,
-            "coi_erosion": coi_erosion(max(1, n_agents), price_std),
-            "coi_policy": float(coi.policy),
-            "coi_agent": float(coi.agent),
-            "coi_leakage": float(coi.leak),
-            "coi_survival": float(coi.survival_ratio),
-            "cumulative_reward": sum(self._episode_rewards),
-            "step": self._t,
+            "n_sessions": len(demand), "n_agents": n_agents, "price_std": float(np.std(prices)),
+            "coi_erosion": coi_erosion(max(1, n_agents), float(np.std(prices))),
+            "coi_policy": float(coi.policy), "coi_agent": float(coi.agent),
+            "coi_leakage": float(coi.leak), "coi_survival": float(coi.survival_ratio),
+            "cumulative_reward": sum(self._episode_rewards), "step": self._t,
         }
-
-        return self._build_obs(), reward, terminated, truncated, info
+        return self._build_obs(), reward, self._t >= self.cfg.max_steps, False, info
 
     def render(self, mode: str = "human") -> str | None:
-        """Render environment state."""
         if self._sys is None or self._last_prices is None:
             return None
-
-        lines = [
-            f"t={self._t}/{self.cfg.max_steps}",
-            f"α_true={self._alpha:.3f} α̂={self._sys.alpha:.3f}",
-            f"prices: {self._last_prices.round(1)}",
-            f"demand: {self._demand_agg.round(2)}",
-            f"reward: {self._episode_rewards[-1] if self._episode_rewards else 0:.3f}",
-        ]
-        out = " | ".join(lines)
+        out = f"t={self._t}/{self.cfg.max_steps} | alpha_true={self._alpha:.3f} alpha_hat={self._sys.alpha:.3f} | " \
+              f"prices: {self._last_prices.round(1)} | demand: {self._demand_agg.round(2)} | " \
+              f"reward: {self._episode_rewards[-1] if self._episode_rewards else 0:.3f}"
         if mode == "human":
             print(out)
         return out
@@ -285,10 +179,7 @@ class PricingEnv(gym.Env if HAS_GYM else object):
 
 
 class ContaminationSweepEnv(PricingEnv):
-    """Environment that sweeps through contamination levels during training.
-
-    Useful for curriculum learning: start with low α, gradually increase.
-    """
+    """Environment that sweeps through contamination levels during training."""
 
     def __init__(self, cfg: EnvConfig | None = None, alpha_schedule: list[float] | None = None):
         super().__init__(cfg)
@@ -296,7 +187,6 @@ class ContaminationSweepEnv(PricingEnv):
         self._schedule_idx = 0
 
     def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
-        # advance schedule on reset
         if options and options.get("advance_schedule", False):
             self._schedule_idx = (self._schedule_idx + 1) % len(self._schedule)
         self.cfg.alpha_true = self._schedule[self._schedule_idx]
@@ -306,8 +196,7 @@ class ContaminationSweepEnv(PricingEnv):
 class AdversarialEnv(PricingEnv):
     """Environment with adversarial contamination dynamics.
 
-    The contamination level responds to pricing policy: if prices are too predictable,
-    agents learn to exploit and α increases.
+    Contamination increases when prices are predictable (agents exploit).
     """
 
     def __init__(self, cfg: EnvConfig | None = None, exploitation_rate: float = 0.02):
@@ -317,20 +206,13 @@ class AdversarialEnv(PricingEnv):
 
     def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]:
         obs, reward, term, trunc, info = super().step(action)
-
-        # track price history for predictability
         if self._last_prices is not None:
             self._price_history.append(self._last_prices.copy())
-
-        # increase α if prices are predictable (low variance over recent history)
+        predictability = 0.0
         if len(self._price_history) > 10:
-            recent = np.array(self._price_history[-10:])
-            predictability = 1.0 / (float(np.std(recent)) + 0.1)
-            self._alpha = np.clip(
-                self._alpha + self._exploit_rate * predictability * self._sys.rng.random(),
-                *self.cfg.alpha_bounds)
-
-        info["predictability"] = predictability if len(self._price_history) > 10 else 0.0
+            predictability = 1.0 / (float(np.std(self._price_history[-10:])) + 0.1)
+            self._alpha = np.clip(self._alpha + self._exploit_rate * predictability * self._sys.rng.random(), *self.cfg.alpha_bounds)
+        info["predictability"] = predictability
         return obs, reward, term, trunc, info
 
     def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
@@ -339,39 +221,20 @@ class AdversarialEnv(PricingEnv):
 
 
 def make_env(cfg: EnvConfig | None = None, env_type: str = "standard") -> PricingEnv:
-    """Factory for creating pricing environments."""
-    if env_type == "sweep":
-        return ContaminationSweepEnv(cfg)
-    elif env_type == "adversarial":
-        return AdversarialEnv(cfg)
-    return PricingEnv(cfg)
+    return {"sweep": ContaminationSweepEnv, "adversarial": AdversarialEnv}.get(env_type, PricingEnv)(cfg)
 
 
-# simple baseline policies for benchmarking
-def fixed_price_policy(refs: np.ndarray, margin: float = 0.0) -> np.ndarray:
-    """Fixed markup policy: always return ref * (1 + margin)."""
-    return np.ones(len(refs), dtype=np.float32) * (1.0 + margin)
-
-
-def random_policy(n: int, rng: np.random.Generator | None = None) -> np.ndarray:
-    """Random policy for exploration baseline."""
-    rng = rng or np.random.default_rng()
-    return rng.uniform(0.7, 1.3, n).astype(np.float32)
-
-
-def adaptive_policy(obs: np.ndarray, n: int, base_margin: float = 0.1) -> np.ndarray:
-    """Simple adaptive policy: reduce margins when α̂ is high."""
-    alpha_est = obs[2 * n]  # α̂ is at position 2n in observation
-    margin_scale = 1.0 - 0.4 * alpha_est  # defensive when α̂ high
-    return np.ones(n, dtype=np.float32) * (1.0 + base_margin * margin_scale)
+# baseline policies
+fixed_price_policy = lambda refs, margin=0.0: np.ones(len(refs), dtype=np.float32) * (1.0 + margin)
+random_policy = lambda n, rng=None: (rng or np.random.default_rng()).uniform(0.7, 1.3, n).astype(np.float32)
+adaptive_policy = lambda obs, n, base=0.1: np.ones(n, dtype=np.float32) * (1.0 + base * (1.0 - 0.4 * obs[2 * n]))
 
 
 if __name__ == "__main__":
-    # demo run
     cfg = EnvConfig(n_products=100, max_steps=100, alpha_true=0.25, reward_mode="robust")
     env = make_env(cfg)
     obs, info = env.reset()
-    print(f"initial: α={info['alpha_true']:.2f}")
+    print(f"initial: alpha={info['alpha_true']:.2f}")
 
     total_reward = 0.0
     for t in range(cfg.max_steps):
@@ -383,4 +246,4 @@ if __name__ == "__main__":
         if done:
             break
 
-    print(f"\ntotal reward: {total_reward:.2f}, final α̂: {info['alpha_est']:.3f}")
+    print(f"\ntotal reward: {total_reward:.2f}, final alpha_hat: {info['alpha_est']:.3f}")
diff --git a/lab/case/thesis/train.py b/lab/case/thesis/train.py
index cc152b5..753a5f1 100644
--- a/lab/case/thesis/train.py
+++ b/lab/case/thesis/train.py
@@ -6,7 +6,8 @@ Tracks COI erosion, alpha estimation error, and economic KPIs per thesis formula
 from __future__ import annotations
 import argparse
 import json
-from dataclasses import dataclass, asdict
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from dataclasses import dataclass, asdict, field
 from pathlib import Path
 from typing import Dict, List, Callable, Any
 import numpy as np
@@ -27,10 +28,9 @@ except ImportError:
     HAS_TB = False
 
 from .simplified_env import PricingEnv, EnvConfig, make_env, adaptive_policy, fixed_price_policy, random_policy
-from .simplified import coi_erosion
+from .coi import coi_erosion
 
 
-# thesis-aligned KPIs tracked per episode
 @dataclass
 class EpisodeMetrics:
     reward: float = 0.0
@@ -43,10 +43,24 @@ class EpisodeMetrics:
     n_agents: int = 0
     steps: int = 0
 
+    def accumulate(self, info: Dict[str, Any]) -> None:
+        self.steps += 1
+        self.reward += info.get('reward', 0)
+        self.revenue += info.get('revenue', 0)
+        self.profit += info.get('profit', 0)
+        self.coi_erosion += info.get('coi_erosion', 0)
+        self.coi_leakage += info.get('coi_leakage', 0)
+        self.alpha_error += abs(info.get('alpha_true', 0) - info.get('alpha_est', 0))
+        self.avg_margin += info.get('avg_margin', 0)
+        self.n_agents += info.get('n_agents', 0)
+
+    def normalized(self) -> Dict[str, float]:
+        s = max(self.steps, 1)
+        return {k: getattr(self, k) / s for k in ['revenue', 'profit', 'coi_erosion', 'coi_leakage', 'alpha_error', 'avg_margin', 'n_agents']}
+
 
 @dataclass
 class ExperimentConfig:
-    """Full experiment specification for reproducibility."""
     algo: str = "ppo"
     total_timesteps: int = 100_000
     n_envs: int = 4
@@ -65,17 +79,14 @@ class ExperimentConfig:
             self.experiment_name = f"{self.algo}_a{self.alpha_true:.2f}_{self.reward_mode}"
 
 
-# unified policy interface wrapping all baselines
 class Policy:
     """Unified policy interface for baselines and trained models."""
 
     def __init__(self, policy_fn: Callable[[np.ndarray, int], np.ndarray], name: str):
-        self._fn = policy_fn
-        self.name = name
+        self._fn, self.name = policy_fn, name
 
     def predict(self, obs: np.ndarray, deterministic: bool = True) -> tuple[np.ndarray, None]:
-        n = (len(obs) - 3) // 3
-        return self._fn(obs, n), None
+        return self._fn(obs, (len(obs) - 3) // 3), None
 
     @staticmethod
     def fixed(margin: float = 0.15) -> "Policy":
@@ -91,99 +102,97 @@ class Policy:
 
     @staticmethod
     def myopic(greed: float = 0.3) -> "Policy":
-        """Myopic: maximize immediate margin, ignore alpha."""
         def _fn(obs: np.ndarray, n: int) -> np.ndarray:
             demand_norm = obs[n:2*n] if len(obs) > 2*n else np.ones(n) * 0.5
-            mult = 1.0 + greed * (1 + np.mean(demand_norm))
-            return np.ones(n, dtype=np.float32) * np.clip(mult, 0.5, 1.5)
+            return np.ones(n, dtype=np.float32) * np.clip(1.0 + greed * (1 + np.mean(demand_norm)), 0.5, 1.5)
         return Policy(_fn, f"myopic_{greed:.1f}")
 
 
-class MetricsCallback(BaseCallback):
-    """Tracks thesis-aligned metrics during RL training."""
+def log_metrics(writer: SummaryWriter | None, metrics: Dict[str, float], prefix: str, step: int) -> None:
+    if writer is None:
+        return
+    for k, v in metrics.items():
+        writer.add_scalar(f'{prefix}/{k}', v, step)
 
+
+class MetricsCallback(BaseCallback):
     def __init__(self, writer: SummaryWriter | None, verbose: int = 0):
         super().__init__(verbose)
         self._writer = writer
-        self._ep = EpisodeMetrics()
-        self._buffer: List[EpisodeMetrics] = []
 
     def _on_step(self) -> bool:
+        if self._writer is None:
+            return True
         for info in self.locals.get('infos', []):
-            self._ep.steps += 1
-            self._ep.reward += info.get('reward', 0)
-            self._ep.revenue += info.get('revenue', 0)
-            self._ep.profit += info.get('profit', 0)
-            self._ep.coi_erosion += info.get('coi_erosion', 0)
-            self._ep.coi_leakage += info.get('coi_leakage', 0)
-            self._ep.alpha_error += abs(info.get('alpha_true', 0) - info.get('alpha_est', 0))
-            self._ep.avg_margin += info.get('avg_margin', 0)
-            self._ep.n_agents += info.get('n_agents', 0)
+            t = self.num_timesteps
+            self._writer.add_scalar('economics/revenue', info.get('revenue', 0), t)
+            self._writer.add_scalar('economics/profit', info.get('profit', 0), t)
+            self._writer.add_scalar('economics/margin', info.get('avg_margin', 0), t)
+            self._writer.add_scalar('coi/erosion', info.get('coi_erosion', 0), t)
+            self._writer.add_scalar('coi/leakage', info.get('coi_leakage', 0), t)
+            self._writer.add_scalar('alpha/estimation_error', abs(info.get('alpha_true', 0) - info.get('alpha_est', 0)), t)
+            self._writer.add_scalar('agents/count', info.get('n_agents', 0), t)
         return True
 
-    def _on_rollout_end(self) -> None:
-        if self._ep.steps == 0 or self._writer is None:
-            return
-        s, step = self._ep.steps, self.num_timesteps
-        self._writer.add_scalar('economics/revenue', self._ep.revenue / s, step)
-        self._writer.add_scalar('economics/profit', self._ep.profit / s, step)
-        self._writer.add_scalar('economics/margin', self._ep.avg_margin / s, step)
-        self._writer.add_scalar('coi/erosion', self._ep.coi_erosion / s, step)
-        self._writer.add_scalar('coi/leakage', self._ep.coi_leakage / s, step)
-        self._writer.add_scalar('alpha/estimation_error', self._ep.alpha_error / s, step)
-        self._writer.add_scalar('agents/count', self._ep.n_agents / s, step)
-        self._buffer.append(self._ep)
-        self._ep = EpisodeMetrics()
-
 
 def make_vec_env(cfg: ExperimentConfig, n_envs: int = 1) -> DummyVecEnv:
     def _make():
-        env_cfg = EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
-                            alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed)
-        return Monitor(make_env(env_cfg))
+        return Monitor(make_env(EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
+                                          alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed)))
     return DummyVecEnv([_make for _ in range(n_envs)])
 
 
-def evaluate_policy(policy: Policy | Any, cfg: ExperimentConfig, n_episodes: int = 20) -> Dict[str, float]:
-    """Evaluate policy and return thesis-aligned metrics."""
-    env_cfg = EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
-                        alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed + 999)
-    env = make_env(env_cfg)
+def run_episodes(policy: Policy | Any, env: PricingEnv, n_episodes: int) -> List[EpisodeMetrics]:
+    """Run policy for n episodes and collect metrics."""
     metrics = []
-
     for _ in range(n_episodes):
         obs, _ = env.reset()
-        ep = EpisodeMetrics()
-        done = False
+        ep, done = EpisodeMetrics(), False
         while not done:
             action, _ = policy.predict(obs, deterministic=True)
             obs, reward, term, trunc, info = env.step(action)
             done = term or trunc
+            ep.accumulate(info)
             ep.reward += reward
-            ep.revenue += info.get('revenue', 0)
-            ep.profit += info.get('profit', 0)
-            ep.coi_erosion += info.get('coi_erosion', 0)
-            ep.coi_leakage += info.get('coi_leakage', 0)
-            ep.alpha_error += abs(info['alpha_true'] - info['alpha_est'])
-            ep.avg_margin += info.get('avg_margin', 0)
-            ep.steps += 1
         metrics.append(ep)
+    return metrics
 
-    n = len(metrics)
+
+def evaluate_policy(policy: Policy | Any, cfg: ExperimentConfig, n_episodes: int = 20) -> Dict[str, float]:
+    env = make_env(EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
+                             alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed + 999))
+    metrics = run_episodes(policy, env, n_episodes)
     return {
-        'reward_mean': np.mean([m.reward for m in metrics]),
-        'reward_std': np.std([m.reward for m in metrics]),
-        'revenue_mean': np.mean([m.revenue / m.steps for m in metrics]),
-        'profit_mean': np.mean([m.profit / m.steps for m in metrics]),
-        'coi_erosion_mean': np.mean([m.coi_erosion / m.steps for m in metrics]),
-        'coi_leakage_mean': np.mean([m.coi_leakage / m.steps for m in metrics]),
-        'alpha_error_mean': np.mean([m.alpha_error / m.steps for m in metrics]),
-        'margin_mean': np.mean([m.avg_margin / m.steps for m in metrics]),
+        'reward_mean': np.mean([m.reward for m in metrics]), 'reward_std': np.std([m.reward for m in metrics]),
+        **{f'{k}_mean': np.mean([m.normalized()[k] for m in metrics])
+           for k in ['revenue', 'profit', 'coi_erosion', 'coi_leakage', 'alpha_error', 'avg_margin']},
     }
 
 
+def run_baseline(policy: Policy, vec_env: DummyVecEnv, total_steps: int, writer: SummaryWriter | None):
+    obs, n_envs = vec_env.reset(), vec_env.num_envs
+    ep_rewards = np.zeros(n_envs)
+
+    for step in range(0, total_steps, n_envs):
+        actions = np.array([policy.predict(obs[i])[0] for i in range(n_envs)])
+        obs, rewards, dones, infos = vec_env.step(actions)
+        ep_rewards += rewards
+        for i, info in enumerate(infos):
+            if writer:
+                writer.add_scalar('economics/revenue', info.get('revenue', 0), step)
+                writer.add_scalar('economics/profit', info.get('profit', 0), step)
+                writer.add_scalar('economics/margin', info.get('avg_margin', 0), step)
+                writer.add_scalar('coi/erosion', info.get('coi_erosion', 0), step)
+                writer.add_scalar('coi/leakage', info.get('coi_leakage', 0), step)
+                writer.add_scalar('alpha/estimation_error', abs(info.get('alpha_true', 0) - info.get('alpha_est', 0)), step)
+                writer.add_scalar('agents/count', info.get('n_agents', 0), step)
+            if dones[i]:
+                if writer:
+                    writer.add_scalar('rollout/ep_reward', ep_rewards[i], step)
+                ep_rewards[i] = 0
+
+
 def train(cfg: ExperimentConfig) -> Dict[str, Any]:
-    """Train RL agent or evaluate baseline policy."""
     is_baseline = cfg.algo.lower() in ["fixed", "adaptive", "random", "myopic"]
     if not HAS_SB3 and not is_baseline:
         raise ImportError("stable-baselines3 required: pip install stable-baselines3[extra]")
@@ -194,85 +203,65 @@ def train(cfg: ExperimentConfig) -> Dict[str, Any]:
         json.dump(asdict(cfg), f, indent=2)
 
     writer = SummaryWriter(log_path) if HAS_TB else None
-    train_env = make_vec_env(cfg, cfg.n_envs)
-    eval_env = make_vec_env(cfg, 1)
+    train_env, eval_env = make_vec_env(cfg, cfg.n_envs), make_vec_env(cfg, 1)
 
     if is_baseline:
-        policy_map = {"fixed": Policy.fixed(), "adaptive": Policy.adaptive(),
-                      "random": Policy.random(), "myopic": Policy.myopic()}
-        policy = policy_map[cfg.algo.lower()]
+        policy = {"fixed": Policy.fixed, "adaptive": Policy.adaptive, "random": Policy.random, "myopic": Policy.myopic}[cfg.algo.lower()]()
         run_baseline(policy, train_env, cfg.total_timesteps, writer)
         final_metrics = evaluate_policy(policy, cfg)
     else:
-        algo_cls = {"ppo": PPO, "sac": SAC, "a2c": A2C}.get(cfg.algo.lower())
-        if algo_cls is None:
-            raise ValueError(f"unknown algo: {cfg.algo}")
+        algo_cls = {"ppo": PPO, "sac": SAC, "a2c": A2C}[cfg.algo.lower()]
         common = dict(verbose=1, seed=cfg.seed, tensorboard_log=str(log_path), device="auto")
-        # TODO: setup hyper parameter passing to train different variations (no free lunch)
-        if cfg.algo.lower() == "ppo":
-            model = PPO("MlpPolicy", train_env, learning_rate=3e-4, n_steps=2048,
-                        batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95,
-                        clip_range=0.2, ent_coef=0.01, **common)
-        elif cfg.algo.lower() == "sac":
-            model = SAC("MlpPolicy", train_env, learning_rate=3e-4, buffer_size=100_000,
-                        batch_size=256, tau=0.005, gamma=0.99, **common)
-        else:
-            model = A2C("MlpPolicy", train_env, learning_rate=7e-4, n_steps=5, gamma=0.99, **common)
+        model = {
+            "ppo": lambda: PPO("MlpPolicy", train_env, learning_rate=3e-4, n_steps=2048, batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95, clip_range=0.2, ent_coef=0.01, **common),
+            "sac": lambda: SAC("MlpPolicy", train_env, learning_rate=3e-4, buffer_size=100_000, batch_size=256, tau=0.005, gamma=0.99, **common),
+            "a2c": lambda: A2C("MlpPolicy", train_env, learning_rate=7e-4, n_steps=5, gamma=0.99, **common),
+        }[cfg.algo.lower()]()
 
         cb = MetricsCallback(writer)
-        eval_cb = EvalCallback(eval_env, best_model_save_path=str(log_path / "best"),
-                               log_path=str(log_path), eval_freq=cfg.eval_freq,
-                               n_eval_episodes=cfg.n_eval_episodes, deterministic=True)
+        eval_cb = EvalCallback(eval_env, best_model_save_path=str(log_path / "best"), log_path=str(log_path),
+                               eval_freq=cfg.eval_freq, n_eval_episodes=cfg.n_eval_episodes, deterministic=True)
         model.learn(cfg.total_timesteps, callback=[cb, eval_cb], progress_bar=True)
         model.save(log_path / "final_model")
         policy = model
         final_metrics = evaluate_policy(model, cfg)
 
     if writer:
-        for k, v in final_metrics.items():
-            writer.add_scalar(f'final/{k}', v, cfg.total_timesteps)
+        log_metrics(writer, final_metrics, 'final', cfg.total_timesteps)
         writer.close()
 
-    train_env.close()
-    eval_env.close()
+    train_env.close(); eval_env.close()
     with open(log_path / "results.json", "w") as f:
         json.dump(final_metrics, f, indent=2)
     return {"path": str(log_path), "metrics": final_metrics}
 
 
-def run_baseline(policy: Policy, vec_env: DummyVecEnv, total_steps: int, writer: SummaryWriter | None):
-    """Run baseline policy through environment with logging."""
-    obs = vec_env.reset()
-    n_envs = vec_env.num_envs
-    ep_rewards = np.zeros(n_envs)
-    all_rewards, coi_buf, alpha_buf = [], [], []
-
-    for step in range(0, total_steps, n_envs):
-        actions = np.array([policy.predict(obs[i])[0] for i in range(n_envs)])
-        obs, rewards, dones, infos = vec_env.step(actions)
-        ep_rewards += rewards
-        for i, info in enumerate(infos):
-            coi_buf.append(info.get('coi_erosion', 0))
-            alpha_buf.append(abs(info.get('alpha_true', 0) - info.get('alpha_est', 0)))
-            if dones[i]:
-                all_rewards.append(ep_rewards[i])
-                ep_rewards[i] = 0
-        if writer and step % 1000 < n_envs and all_rewards:
-            writer.add_scalar('rollout/ep_rew_mean', np.mean(all_rewards[-20:]), step)
-            writer.add_scalar('coi/erosion', np.mean(coi_buf[-100:]), step)
-            writer.add_scalar('alpha/estimation_error', np.mean(alpha_buf[-100:]), step)
+def _train_alpha(args: tuple) -> tuple[str, Dict]:
+    """Worker for parallel sweep - must be top-level for pickling."""
+    cfg_dict, alpha = args
+    cfg_dict["alpha_true"] = alpha
+    cfg_dict["experiment_name"] = f"{cfg_dict['algo']}_a{alpha:.2f}_{cfg_dict['reward_mode']}"
+    sweep_cfg = ExperimentConfig(**cfg_dict)
+    print(f"[alpha={alpha:.2f}] starting")
+    metrics = train(sweep_cfg)["metrics"]
+    print(f"[alpha={alpha:.2f}] done")
+    return f"alpha_{alpha:.2f}", metrics
 
 
-def run_sweep(cfg: ExperimentConfig, alphas: List[float] | None = None) -> Dict[str, Dict]:
-    """Run experiment across contamination levels for scientific comparison."""
-    alphas = alphas or [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
-    results = {}
-    for alpha in alphas:
-        sweep_cfg = ExperimentConfig(**{**asdict(cfg), "alpha_true": alpha,
-                                         "experiment_name": f"{cfg.algo}_a{alpha:.2f}_{cfg.reward_mode}"})
-        print(f"\n=== α={alpha:.2f} ===")
-        out = train(sweep_cfg)
-        results[f"alpha_{alpha:.2f}"] = out["metrics"]
+def run_sweep(cfg: ExperimentConfig, alphas: List[float] | None = None, max_workers: int | None = None) -> Dict[str, Dict]:
+    alphas = alphas or [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
+    cfg_dict = asdict(cfg)
+
+    if max_workers == 1:  # sequential fallback
+        results = dict(_train_alpha((cfg_dict.copy(), a)) for a in alphas)
+    else:
+        with ProcessPoolExecutor(max_workers=max_workers) as pool:
+            futures = {pool.submit(_train_alpha, (cfg_dict.copy(), a)): a for a in alphas}
+            results = {}
+            for fut in as_completed(futures):
+                key, metrics = fut.result()
+                results[key] = metrics
+
     summary_path = Path(cfg.log_dir) / f"sweep_{cfg.algo}_{cfg.reward_mode}.json"
     with open(summary_path, "w") as f:
         json.dump(results, f, indent=2)
@@ -280,23 +269,38 @@ def run_sweep(cfg: ExperimentConfig, alphas: List[float] | None = None) -> Dict[
     return results
 
 
-def compare_policies(cfg: ExperimentConfig, policies: List[str] | None = None) -> Dict[str, Dict]:
-    """Compare multiple policies at same contamination level."""
+def _train_policy(args: tuple) -> tuple[str, Dict]:
+    """Worker for parallel policy comparison."""
+    cfg_dict, algo = args
+    cfg_dict["algo"] = algo
+    cfg_dict["experiment_name"] = f"cmp_{algo}_a{cfg_dict['alpha_true']:.2f}"
+    cmp_cfg = ExperimentConfig(**cfg_dict)
+    print(f"[{algo}] starting")
+    metrics = train(cmp_cfg)["metrics"]
+    print(f"[{algo}] done")
+    return algo, metrics
+
+
+def compare_policies(cfg: ExperimentConfig, policies: List[str] | None = None, max_workers: int | None = None) -> Dict[str, Dict]:
     policies = policies or ["fixed", "adaptive", "myopic", "random"]
-    results = {}
-    for algo in policies:
-        cmp_cfg = ExperimentConfig(**{**asdict(cfg), "algo": algo,
-                                       "experiment_name": f"cmp_{algo}_a{cfg.alpha_true:.2f}"})
-        print(f"\n=== {algo} ===")
-        out = train(cmp_cfg)
-        results[algo] = out["metrics"]
+    cfg_dict = asdict(cfg)
+
+    if max_workers == 1:
+        results = dict(_train_policy((cfg_dict.copy(), p)) for p in policies)
+    else:
+        with ProcessPoolExecutor(max_workers=max_workers) as pool:
+            futures = {pool.submit(_train_policy, (cfg_dict.copy(), p)): p for p in policies}
+            results = {}
+            for fut in as_completed(futures):
+                algo, metrics = fut.result()
+                results[algo] = metrics
+
     cmp_path = Path(cfg.log_dir) / f"compare_a{cfg.alpha_true:.2f}.json"
     with open(cmp_path, "w") as f:
         json.dump(results, f, indent=2)
     print(f"\nComparison saved to {cmp_path}")
     for algo, m in results.items():
-        print(f"  {algo:12s}: reward={m['reward_mean']:.2f} coi_erosion={m['coi_erosion_mean']:.4f} "
-              f"alpha_err={m['alpha_error_mean']:.4f}")
+        print(f"  {algo:12s}: reward={m['reward_mean']:.2f} coi_erosion={m['coi_erosion_mean']:.4f} alpha_err={m['alpha_error_mean']:.4f}")
     return results
 
 
@@ -312,6 +316,7 @@ def main():
     parser.add_argument("--log-dir", default="lab/case/thesis/runs")
     parser.add_argument("--sweep", action="store_true", help="run contamination sweep")
     parser.add_argument("--compare", action="store_true", help="compare all baselines")
+    parser.add_argument("--workers", type=int, default=None, help="max parallel workers for sweep (None=auto, 1=sequential)")
     args = parser.parse_args()
 
     cfg = ExperimentConfig(algo=args.algo, total_timesteps=args.steps, alpha_true=args.alpha,
@@ -319,9 +324,9 @@ def main():
                            n_envs=args.n_envs, seed=args.seed, log_dir=args.log_dir)
 
     if args.sweep:
-        run_sweep(cfg)
+        run_sweep(cfg, max_workers=args.workers)
     elif args.compare:
-        compare_policies(cfg)
+        compare_policies(cfg, max_workers=args.workers)
     else:
         result = train(cfg)
         print(f"\nTraining complete: {result['path']}")

From 98a9a3738c16d640b93f086451b5b22faede90a6 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Sun, 25 Jan 2026 10:36:37 +0100
Subject: [PATCH 53/99] fix: coi better defined and aligned and sac improved

---
 lab/case/thesis/simplified.py     | 5 +++--
 lab/case/thesis/simplified_env.py | 2 +-
 lab/case/thesis/train.py          | 3 +--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/lab/case/thesis/simplified.py b/lab/case/thesis/simplified.py
index 3c58fdd..450f01a 100644
--- a/lab/case/thesis/simplified.py
+++ b/lab/case/thesis/simplified.py
@@ -14,7 +14,7 @@ from dataclasses import dataclass, field
 from typing import Dict, List, Tuple
 import numpy as np
 
-from .coi import COIWindow, compute_coi_window, coi_erosion
+from .coi import COIWindow, compute_coi_window
 from .separability import TRANS_H, TRANS_A, kl_div, build_kernel, compute_divergence, estimate_alpha
 
 ACTION_WEIGHTS = {"add_to_cart": 0.8, "checkout": 0.9, "purchase": 1.0, "view": 0.15, "detail": 0.25, "hover": 0.3, "start": 0.05, "end": 0.0}
@@ -209,7 +209,8 @@ if __name__ == "__main__":
     print(f'sessions: {len(sessions)}, agents: {sum(1 for s in sessions if s.actor=="A")}')
 
     for n in [1, 5, 10, 50, 100]:
-        print(f'N={n:3d} agents -> COI erosion: {coi_erosion(n, price_std=5.0):.3f}')
+        # theoretical: erosion = 1 - 2/(N+1) for uniform order statistic
+        print(f'N={n:3d} agents -> COI erosion: {1.0 - 2.0/(n+1):.3f}')
 
     events = [Event('view', 0, 20.0, 0.1), Event('detail', 0, 20.0, 0.5), Event('cart', 0, 20.0, 1.0), Event('purchase', 0, 20.0, 2.0)]
     print(f'human-like session alpha_hat: {estimate_alpha(Session(sid="test", events=events, actor="H")):.3f}')
diff --git a/lab/case/thesis/simplified_env.py b/lab/case/thesis/simplified_env.py
index e4cd84c..70b3904 100644
--- a/lab/case/thesis/simplified_env.py
+++ b/lab/case/thesis/simplified_env.py
@@ -157,7 +157,7 @@ class PricingEnv(gym.Env if HAS_GYM else object):
             "n_purchases": int(np.sum(purchases)),
             "avg_margin": float(np.mean((prices - self._sys.costs) / self._sys.costs)),
             "n_sessions": len(demand), "n_agents": n_agents, "price_std": float(np.std(prices)),
-            "coi_erosion": coi_erosion(max(1, n_agents), float(np.std(prices))),
+            "coi_erosion": coi_erosion(coi.policy, coi.agent),
             "coi_policy": float(coi.policy), "coi_agent": float(coi.agent),
             "coi_leakage": float(coi.leak), "coi_survival": float(coi.survival_ratio),
             "cumulative_reward": sum(self._episode_rewards), "step": self._t,
diff --git a/lab/case/thesis/train.py b/lab/case/thesis/train.py
index 753a5f1..c1273eb 100644
--- a/lab/case/thesis/train.py
+++ b/lab/case/thesis/train.py
@@ -28,7 +28,6 @@ except ImportError:
     HAS_TB = False
 
 from .simplified_env import PricingEnv, EnvConfig, make_env, adaptive_policy, fixed_price_policy, random_policy
-from .coi import coi_erosion
 
 
 @dataclass
@@ -214,7 +213,7 @@ def train(cfg: ExperimentConfig) -> Dict[str, Any]:
         common = dict(verbose=1, seed=cfg.seed, tensorboard_log=str(log_path), device="auto")
         model = {
             "ppo": lambda: PPO("MlpPolicy", train_env, learning_rate=3e-4, n_steps=2048, batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95, clip_range=0.2, ent_coef=0.01, **common),
-            "sac": lambda: SAC("MlpPolicy", train_env, learning_rate=3e-4, buffer_size=100_000, batch_size=256, tau=0.005, gamma=0.99, **common),
+            "sac": lambda: SAC("MlpPolicy", train_env, learning_rate=1e-4, buffer_size=50_000, batch_size=512, tau=0.02, gamma=0.99, learning_starts=1000, ent_coef="auto_0.1", train_freq=4, **common),
             "a2c": lambda: A2C("MlpPolicy", train_env, learning_rate=7e-4, n_steps=5, gamma=0.99, **common),
         }[cfg.algo.lower()]()
 

From cd6c3d600685bb51618226471fbb196d1355eec2 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Mon, 26 Jan 2026 13:19:55 +0100
Subject: [PATCH 54/99] chore: migrating thesis case definition

---
 .gitignore                                    |   4 +-
 sim/case/__init__.py                          |   2 +
 sim/case/thesis_simplified/__init__.py        |   2 +
 sim/case/thesis_simplified/coi.py             | 125 +++++++
 sim/case/thesis_simplified/experiments.py     | 325 ++++++++++++++++++
 sim/case/thesis_simplified/separability.py    |  72 ++++
 .../case/thesis_simplified}/simplified.py     |   0
 .../case/thesis_simplified}/simplified_env.py |   0
 sim/case/thesis_simplified/summarize.py       | 168 +++++++++
 .../case/thesis_simplified}/train.py          |   4 +-
 sim/rl/environment.py                         |  51 ++-
 11 files changed, 741 insertions(+), 12 deletions(-)
 create mode 100644 sim/case/__init__.py
 create mode 100644 sim/case/thesis_simplified/__init__.py
 create mode 100644 sim/case/thesis_simplified/coi.py
 create mode 100644 sim/case/thesis_simplified/experiments.py
 create mode 100644 sim/case/thesis_simplified/separability.py
 rename {lab/case/thesis => sim/case/thesis_simplified}/simplified.py (100%)
 rename {lab/case/thesis => sim/case/thesis_simplified}/simplified_env.py (100%)
 create mode 100644 sim/case/thesis_simplified/summarize.py
 rename {lab/case/thesis => sim/case/thesis_simplified}/train.py (99%)

diff --git a/.gitignore b/.gitignore
index e06db65..90077a7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,4 +21,6 @@ sim/rl/behavior_loader/*.dot
 sim/rl/behavior_loader/*.png
 sim/rl/behavior_loader/*.svg
 sim/rl/behavior_loader/*.pdf
-tests/e2e/node_modules/**
\ No newline at end of file
+tests/e2e/node_modules/**
+lab/case/thesis/runs*/
+sim/case/thesis_simplified/runs*/
diff --git a/sim/case/__init__.py b/sim/case/__init__.py
new file mode 100644
index 0000000..cb6c13c
--- /dev/null
+++ b/sim/case/__init__.py
@@ -0,0 +1,2 @@
+"""Case-specific simulations and experiments."""
+
diff --git a/sim/case/thesis_simplified/__init__.py b/sim/case/thesis_simplified/__init__.py
new file mode 100644
index 0000000..6259958
--- /dev/null
+++ b/sim/case/thesis_simplified/__init__.py
@@ -0,0 +1,2 @@
+"""Minimal thesis-aligned pricing simulation (self-contained)."""
+
diff --git a/sim/case/thesis_simplified/coi.py b/sim/case/thesis_simplified/coi.py
new file mode 100644
index 0000000..1657f65
--- /dev/null
+++ b/sim/case/thesis_simplified/coi.py
@@ -0,0 +1,125 @@
+"""Cost of Information (COI) computation for thesis pricing system.
+
+Core KPI: COI = E[p_shown] - p_min measures pricing power from information asymmetry.
+Theorem 1 shows COI erodes as agent queries increase: as N->inf, p^(1)->p_min.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Dict, List, TYPE_CHECKING
+import numpy as np
+
+if TYPE_CHECKING:
+    from .simplified import Session
+
+
+@dataclass(frozen=True)
+class COIWindow:
+    """Windowed COI metrics computed from realized price exposures.
+
+    policy: E[p_shown] - cost, the definition-level KPI
+    agent: E[p^(1)] - cost where p^(1) is min price under agent querying
+    leak: max(policy - agent, 0), observable gap from reconnaissance
+    survival_ratio: agent/policy, fraction of pricing power retained
+    """
+    policy: float
+    agent: float
+    leak: float
+    survival_ratio: float
+    policy_by_product: np.ndarray
+    agent_by_product: np.ndarray
+    demand_weights: np.ndarray
+
+
+def aggregate_prices(sessions: List["Session"], mode: str = "all") -> Dict[int, List[float] | float]:
+    """Unified price aggregation across sessions, keyed by integer product index.
+
+    mode: "all" returns all prices per product, "min_per_session" returns min price per session per product,
+          "min_across" returns single min price per product; any other value is treated as "all"
+    """
+    if mode == "min_across":
+        mins: Dict[int, float] = {}
+        for s in sessions:
+            for e in s.events:
+                pidx, price = int(e.product_idx), float(e.price_seen)
+                mins[pidx] = min(mins.get(pidx, price), price)
+        return mins
+    elif mode == "min_per_session":
+        result: Dict[int, List[float]] = {}
+        for s in sessions:
+            by_p: Dict[int, float] = {}
+            for e in s.events:
+                pidx, price = int(e.product_idx), float(e.price_seen)
+                by_p[pidx] = min(by_p.get(pidx, price), price)
+            for pidx, pmin in by_p.items():
+                result.setdefault(pidx, []).append(pmin)
+        return result
+    else:  # "all"
+        prices: Dict[int, List[float]] = {}
+        for s in sessions:
+            for e in s.events:
+                prices.setdefault(int(e.product_idx), []).append(float(e.price_seen))  # int key, same as the min_* modes
+        return prices
+
+
+def demand_weights_by_product(sessions: List["Session"], demand_mapping: Dict[str, float], n_products: int) -> np.ndarray:
+    """Return demand per product (credited to each session's first event), normalized to sum to 1 when the total is positive."""
+    w = np.zeros(n_products, dtype=float)
+    sessions_by_id = {s.sid: s for s in sessions}
+    for sid, q in demand_mapping.items():
+        sess = sessions_by_id.get(sid)
+        if sess and sess.events and 0 <= int(sess.events[0].product_idx) < n_products:  # skip ids outside the catalogue
+            w[int(sess.events[0].product_idx)] += float(q)
+    total = float(np.sum(w))
+    return (w / total) if total > 0 else w
+
+
+def compute_coi_window(sessions: List["Session"], costs: np.ndarray, demand_mapping: Dict[str, float] | None = None) -> COIWindow:
+    """Compute COI metrics over session window.
+
+    Aggregates price exposures and computes policy-level vs agent-realized COI.
+    """
+    n = int(len(costs))
+    prices = aggregate_prices(sessions, mode="all")
+    agent_sessions = [s for s in sessions if s.actor == "A"]
+    agent_min = aggregate_prices(agent_sessions, mode="min_across") if agent_sessions else {}
+
+    policy_by = np.zeros(n, dtype=float)
+    agent_by = np.zeros(n, dtype=float)
+    seen = np.array([(i in prices) for i in range(n)], dtype=bool)
+    agent_seen = np.array([(i in agent_min) for i in range(n)], dtype=bool)
+
+    for pidx, ps in prices.items():
+        if 0 <= pidx < n and ps:
+            policy_by[pidx] = float(np.mean(ps) - float(costs[pidx]))
+    for pidx, pmin in agent_min.items():
+        if 0 <= pidx < n:
+            agent_by[pidx] = float(pmin - float(costs[pidx]))
+
+    agent_by[seen & ~agent_seen] = policy_by[seen & ~agent_seen]  # no erosion if no agent exposure
+
+    demand_w = demand_weights_by_product(sessions, demand_mapping, n) if demand_mapping else np.zeros(n, dtype=float)
+    has_weights = float(np.sum(demand_w)) > 0
+
+    if has_weights:
+        policy, agent = float(np.dot(demand_w, policy_by)), float(np.dot(demand_w, agent_by))
+    elif np.any(seen):
+        policy, agent = float(np.mean(policy_by[seen])), float(np.mean(agent_by[seen]))
+    else:
+        policy, agent = 0.0, 0.0
+
+    leak = float(max(policy - agent, 0.0))
+    survival = float(np.clip(agent / policy, 0.0, 1.0)) if policy > 0 else 0.0
+
+    return COIWindow(policy=policy, agent=agent, leak=leak, survival_ratio=survival,
+                     policy_by_product=policy_by, agent_by_product=agent_by, demand_weights=demand_w)
+
+
+def coi_erosion(coi_policy: float, coi_agent: float, eps: float = 1e-9) -> float:
+    """Thesis-consistent COI erosion: fraction of pricing power destroyed by agent queries.
+
+    erosion = 1 - (COI_agent / COI_policy)
+    When agents find low prices, COI_agent -> 0, erosion -> 1.
+    """
+    if coi_policy <= eps:
+        return 0.0
+    return float(np.clip(1.0 - (coi_agent / (coi_policy + eps)), 0.0, 1.0))
diff --git a/sim/case/thesis_simplified/experiments.py b/sim/case/thesis_simplified/experiments.py
new file mode 100644
index 0000000..74458d7
--- /dev/null
+++ b/sim/case/thesis_simplified/experiments.py
@@ -0,0 +1,325 @@
+"""COI leakage experiments and policy comparisons.
+
+Demonstrates the core thesis contribution: COI erosion under agent contamination
+and recovery via robust pricing policies.
+
+Generates TensorBoard logs for:
+- COI erosion curves across contamination levels
+- Policy comparison (fixed vs adaptive vs RL)
+- Revenue/margin trade-offs
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Tuple
+import json
+import numpy as np
+
+try:
+    from torch.utils.tensorboard import SummaryWriter
+    HAS_TB = True
+except ImportError:
+    HAS_TB = False
+
+from .simplified_env import PricingEnv, EnvConfig, make_env
+from .simplified import System
+
+
+@dataclass
+class ExperimentResult:
+    """Container for experiment metrics of one (policy, alpha) evaluation."""
+    name: str  # policy name (key of the policies dict given to the sweep)
+    alpha: float  # contamination level passed to EnvConfig as alpha_true
+    reward_mean: float  # mean of the per-step rewards over all episodes
+    reward_std: float  # standard deviation of the per-step rewards
+    coi_erosion: float  # mean of info['coi_erosion'] (0.0 if never reported)
+    alpha_error: float  # mean |alpha_true - alpha_est| from step info (0.0 if never reported)
+    revenue: float  # mean of info['revenue'] (0.0 if never reported)
+    margin: float  # mean policy action on an all-zero observation, minus 1
+
+    def to_dict(self) -> dict:  # {field name: value} for every dataclass field
+        return {k: getattr(self, k) for k in self.__dataclass_fields__}
+
+
+def theoretical_coi_erosion_curve(alphas: np.ndarray, n_sessions: int = 1000) -> np.ndarray:
+    """Evaluate the Theorem 1 erosion 1 - 2/(N+1) at each contamination level.
+
+    N = max(1, int(alpha * n_sessions)) is the agent count used for that level.
+    Order statistic model: for N i.i.d. uniform queries on [p_min, p_max],
+    E[p^(1)] = p_min + (p_max - p_min)/(N+1).
+    """
+    def erosion_at(alpha: float) -> float:
+        n_agents = max(1, int(alpha * n_sessions))  # truncated, but never below one
+        return 1.0 - 2.0 / (n_agents + 1)
+    return np.array([erosion_at(a) for a in alphas])
+
+
+def run_policy_episode(
+    env: PricingEnv,
+    policy_fn,
+    n_episodes: int = 10
+) -> Tuple[List[float], List[float], List[float], List[float]]:
+    """Roll out policy_fn for n_episodes episodes and pool the per-step metrics.
+
+    Returns (rewards, coi_erosions, alpha_errors, revenues); the last three only grow on steps whose info has the keys.
+    """
+    rewards, coi_erosions, alpha_errors, revenues = [], [], [], []
+    for _ in range(n_episodes):
+        obs, _reset_info = env.reset()
+        while True:
+            obs, reward, terminated, truncated, step_info = env.step(policy_fn(obs, env.n))
+            rewards.append(reward)
+            if 'coi_erosion' in step_info:
+                coi_erosions.append(step_info['coi_erosion'])
+            if 'alpha_true' in step_info and 'alpha_est' in step_info:
+                alpha_errors.append(abs(step_info['alpha_true'] - step_info['alpha_est']))
+            if 'revenue' in step_info:
+                revenues.append(step_info['revenue'])
+            if terminated or truncated:  # episode finished, start the next one
+                break
+    return rewards, coi_erosions, alpha_errors, revenues
+
+
+class PolicyRegistry:
+    """Baseline policies: static callables mapping (obs, n) to a float32 action vector of length n."""
+
+    @staticmethod
+    def fixed(obs: np.ndarray, n: int, margin: float = 0.15) -> np.ndarray:
+        return (1.0 + margin) * np.ones(n, dtype=np.float32)  # same value for every entry
+
+    @staticmethod
+    def random(obs: np.ndarray, n: int, rng: np.random.Generator = None) -> np.ndarray:
+        generator = rng if rng else np.random.default_rng()
+        return generator.uniform(0.7, 1.3, n).astype(np.float32)
+
+    @staticmethod
+    def adaptive(obs: np.ndarray, n: int, base_margin: float = 0.15) -> np.ndarray:
+        """Reduce margins when alpha estimate is high."""
+        alpha_est = 0.2 if len(obs) <= 2 * n else obs[2 * n]  # assumes obs[2n] holds the alpha estimate; confirm vs env
+        scaled_margin = base_margin * (1.0 - 0.4 * alpha_est)
+        return np.ones(n, dtype=np.float32) * (1.0 + scaled_margin)
+
+    @staticmethod
+    def aggressive(obs: np.ndarray, n: int) -> np.ndarray:
+        """High margins, ignores contamination."""
+        return np.full(n, 1.4, dtype=np.float32)
+
+    @staticmethod
+    def defensive(obs: np.ndarray, n: int) -> np.ndarray:
+        """Low margins, always cautious."""
+        return np.full(n, 1.05, dtype=np.float32)
+
+    @staticmethod
+    def alpha_proportional(obs: np.ndarray, n: int, max_margin: float = 0.3) -> np.ndarray:
+        """Margin shrinks linearly with estimated alpha: max_margin * (1 - alpha_est)."""
+        alpha_est = 0.2 if len(obs) <= 2 * n else obs[2 * n]  # 0.2 is the fallback when obs is too short
+        markup = 1.0 + max_margin * (1.0 - alpha_est)
+        return np.ones(n, dtype=np.float32) * markup
+
+
+def run_contamination_sweep(
+    alphas: List[float],
+    policies: Dict[str, callable],
+    n_products: int = 10,
+    max_steps: int = 200,
+    n_episodes: int = 10,
+    seed: int = 42,
+    log_dir: str = None
+) -> Dict[str, List[ExperimentResult]]:
+    """Run every policy at every contamination level and collect summary metrics.
+
+    Returns {policy name: [one ExperimentResult per alpha, in input order]}. When
+    ``log_dir`` is set and TensorBoard is importable, scalars are also written to
+    ``<log_dir>/sweep`` using the alpha value in percent as the global step.
+    """
+    results = {name: [] for name in policies}
+    writer = SummaryWriter(Path(log_dir) / "sweep") if log_dir and HAS_TB else None
+    for alpha in alphas:
+        print(f"  alpha={alpha:.2f}", end=" ")
+        env_cfg = EnvConfig(
+            n_products=n_products, max_steps=max_steps,
+            alpha_true=alpha, reward_mode="robust", seed=seed)
+        env = make_env(env_cfg)  # one env per alpha, reused by every policy below
+
+        for name, policy_fn in policies.items():
+            rewards, coi_vals, alpha_errs, revenues = run_policy_episode(env, policy_fn, n_episodes)
+            # nominal margin: mean action on an all-zero observation, minus 1
+            probe = policy_fn(np.zeros(3 * n_products + 3), n_products)
+            result = ExperimentResult(
+                name=name, alpha=alpha,
+                reward_mean=float(np.mean(rewards)),
+                reward_std=float(np.std(rewards)),
+                coi_erosion=float(np.mean(coi_vals)) if coi_vals else 0.0,
+                alpha_error=float(np.mean(alpha_errs)) if alpha_errs else 0.0,
+                revenue=float(np.mean(revenues)) if revenues else 0.0,
+                margin=float(np.mean([probe]) - 1.0))
+            results[name].append(result)
+            if writer:
+                # round, don't truncate: int(0.29 * 100) == 28 would collide with alpha=0.28
+                step = int(round(alpha * 100))
+                writer.add_scalar(f'{name}/reward', result.reward_mean, step)
+                writer.add_scalar(f'{name}/coi_erosion', result.coi_erosion, step)
+                writer.add_scalar(f'{name}/alpha_error', result.alpha_error, step)
+                writer.add_scalar(f'{name}/revenue', result.revenue, step)
+        print("done")
+
+    if writer:  # overlay the theoretical curve on the same step axis
+        for a, e in zip(alphas, theoretical_coi_erosion_curve(np.array(alphas))):
+            writer.add_scalar('theoretical/coi_erosion', e, int(round(a * 100)))
+        writer.close()
+
+    return results
+
+
+def run_coi_demonstration(log_dir: str = "sim/case/thesis_simplified/runs", seed: int = 42) -> Dict:
+    """Main COI demonstration: theoretical erosion curve plus a baseline-policy sweep.
+
+    Writes ``coi_demo_results.json`` (and TensorBoard scalars when available) under
+    ``log_dir`` and returns the same {'theoretical': ..., 'empirical': ...} dict.
+    """
+    print("=== COI Leakage Demonstration ===\n")
+    Path(log_dir).mkdir(parents=True, exist_ok=True)
+    writer = SummaryWriter(Path(log_dir) / "coi_demo") if HAS_TB else None
+
+    # theoretical erosion curve
+    print("1. Theoretical COI erosion (Theorem 1)")
+    alphas = np.linspace(0.0, 0.6, 13)
+    theo_erosion = theoretical_coi_erosion_curve(alphas, n_sessions=1000)
+    for a, e in zip(alphas, theo_erosion):
+        print(f"   alpha={a:.2f} -> erosion={e:.3f}")
+        if writer:
+            # round, don't truncate: a float alpha * 100 can land just below an integer (0.29 * 100 == 28.999...)
+            writer.add_scalar('theory/coi_erosion', e, int(round(a * 100)))
+
+    # policy comparison
+    print("\n2. Policy comparison across contamination levels")
+    policies = {
+        'fixed': lambda obs, n: PolicyRegistry.fixed(obs, n),
+        'aggressive': PolicyRegistry.aggressive,
+        'defensive': PolicyRegistry.defensive,
+        'adaptive': PolicyRegistry.adaptive,
+        'alpha_proportional': PolicyRegistry.alpha_proportional,
+    }
+    sweep_alphas = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
+    results = run_contamination_sweep(
+        sweep_alphas, policies, n_products=10, max_steps=100,
+        n_episodes=5, seed=seed, log_dir=log_dir)
+
+    # summarize: average each policy's metrics over the swept alphas
+    print("\n3. Summary by policy")
+    for name, res_list in results.items():
+        avg_reward = np.mean([r.reward_mean for r in res_list])
+        avg_coi = np.mean([r.coi_erosion for r in res_list])
+        print(f"   {name:20s}: avg_reward={avg_reward:.2f}, avg_coi={avg_coi:.3f}")
+
+    # save results
+    output = {
+        'theoretical': {'alphas': alphas.tolist(), 'erosion': theo_erosion.tolist()},
+        'empirical': {name: [r.to_dict() for r in res_list] for name, res_list in results.items()}}
+    with open(Path(log_dir) / "coi_demo_results.json", 'w') as f:
+        json.dump(output, f, indent=2)
+
+    if writer:
+        writer.close()
+
+    print(f"\nResults saved to {log_dir}/coi_demo_results.json")
+    print(f"TensorBoard: tensorboard --logdir {log_dir}")
+    return output
+
+
+def run_reward_mode_comparison(log_dir: str = "sim/case/thesis_simplified/runs", seed: int = 42) -> Dict:
+    """Evaluate the adaptive baseline under each reward mode at alpha_true = 0.3.
+
+    Returns {mode: {'reward_mean', 'reward_std', 'coi_erosion', 'revenue'}} and writes
+    the same dict to ``<log_dir>/reward_mode_results.json``.
+    """
+    print("=== Reward Mode Comparison ===\n")
+
+    out_dir = Path(log_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    writer = SummaryWriter(out_dir / "reward_modes") if HAS_TB else None
+    alpha = 0.3  # moderate contamination
+
+    results = {}
+    for mode in ("revenue", "profit", "robust", "coi_aware"):
+        print(f"  mode={mode}", end=" ")
+        env = make_env(EnvConfig(
+            n_products=10, max_steps=200, alpha_true=alpha,
+            reward_mode=mode, seed=seed))
+        rewards, coi_vals, _, revenues = run_policy_episode(
+            env, PolicyRegistry.adaptive, n_episodes=10)
+
+        stats = {
+            'reward_mean': float(np.mean(rewards)),
+            'reward_std': float(np.std(rewards)),
+            'coi_erosion': float(np.mean(coi_vals)) if coi_vals else 0.0,
+            'revenue': float(np.mean(revenues)) if revenues else 0.0}
+        results[mode] = stats
+
+        if writer:
+            for key, value in stats.items():
+                writer.add_scalar(f'{mode}/{key}', value, 0)
+        print(f"reward={stats['reward_mean']:.2f}, coi={stats['coi_erosion']:.3f}")
+
+    if writer:
+        writer.close()
+    with open(out_dir / "reward_mode_results.json", 'w') as f:
+        json.dump(results, f, indent=2)
+
+    return results
+
+
+def run_alpha_drift_experiment(log_dir: str = "sim/case/thesis_simplified/runs", seed: int = 42) -> Dict:
+    """Evaluate the adaptive baseline for several alpha_drift settings (alpha_true = 0.2).
+
+    Returns {'drift_<rate>': {'reward_mean', 'coi_erosion', 'alpha_tracking_error'}};
+    results go to TensorBoard when available, no JSON file is written here.
+    """
+    print("=== Alpha Drift Experiment ===\n")
+    out_dir = Path(log_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    writer = SummaryWriter(out_dir / "alpha_drift") if HAS_TB else None
+
+    results = {}
+    for drift in (0.0, 0.01, 0.02, 0.05):
+        print(f"  drift={drift:.2f}", end=" ")
+        env = make_env(EnvConfig(
+            n_products=10, max_steps=200, alpha_true=0.2,
+            alpha_drift=drift, reward_mode="robust", seed=seed))
+        rewards, coi_vals, alpha_errs, _ = run_policy_episode(
+            env, PolicyRegistry.adaptive, n_episodes=10)
+
+        label = f'drift_{drift}'
+        stats = {
+            'reward_mean': float(np.mean(rewards)),
+            'coi_erosion': float(np.mean(coi_vals)) if coi_vals else 0.0,
+            'alpha_tracking_error': float(np.mean(alpha_errs)) if alpha_errs else 0.0}
+        results[label] = stats
+
+        if writer:
+            for key, value in stats.items():
+                writer.add_scalar(f'{label}/{key}', value, 0)
+        print(f"reward={stats['reward_mean']:.2f}, "
+              f"alpha_err={stats['alpha_tracking_error']:.3f}")
+
+    if writer:
+        writer.close()
+    return results
+
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description="Run COI experiments")
+    parser.add_argument("--exp", type=str, default="coi", choices=["coi", "reward", "drift", "all"])
+    parser.add_argument("--log-dir", type=str, default="sim/case/thesis_simplified/runs")
+    parser.add_argument("--seed", type=int, default=42)
+    args = parser.parse_args()
+
+    # "all" runs every experiment, always in the order coi -> reward -> drift
+    selected = ("coi", "reward", "drift") if args.exp == "all" else (args.exp,)
+    if "coi" in selected:
+        run_coi_demonstration(args.log_dir, args.seed)
+    if "reward" in selected:
+        run_reward_mode_comparison(args.log_dir, args.seed)
+    if "drift" in selected:
+        run_alpha_drift_experiment(args.log_dir, args.seed)
diff --git a/sim/case/thesis_simplified/separability.py b/sim/case/thesis_simplified/separability.py
new file mode 100644
index 0000000..eaabaa3
--- /dev/null
+++ b/sim/case/thesis_simplified/separability.py
@@ -0,0 +1,72 @@
+"""Behavioral separability for human/agent detection.
+
+Computes divergence signals delta_H, delta_A from session trajectories using
+transition kernel estimation and KL divergence to prototype behavioral profiles.
+"""
+from __future__ import annotations
+from typing import Dict, List, Tuple, TYPE_CHECKING
+import numpy as np
+
+if TYPE_CHECKING:
+    from .simplified import Event, Session
+
+
+# prototype behavioral kernels {state: {next action: probability}}; every row sums to 1
+TRANS_H = {  # human prototype
+    "start": {"view": 0.85, "end": 0.15},
+    "view": {"detail": 0.4, "cart": 0.3, "view": 0.2, "end": 0.1},
+    "detail": {"cart": 0.5, "view": 0.3, "end": 0.2},
+    "cart": {"purchase": 0.6, "view": 0.25, "end": 0.15},
+    "purchase": {"end": 1.0},
+}
+
+TRANS_A = {  # agent prototype
+    "start": {"view": 0.95, "end": 0.05},
+    "view": {"detail": 0.6, "view": 0.25, "cart": 0.1, "end": 0.05},
+    "detail": {"view": 0.5, "cart": 0.15, "detail": 0.3, "end": 0.05},
+    "cart": {"view": 0.4, "purchase": 0.2, "end": 0.4},
+    "purchase": {"end": 1.0},
+}
+
+
+def kl_div(p: Dict[str, float], q: Dict[str, float], eps: float = 1e-10) -> float:
+    """KL divergence D_KL(p || q) in nats over the union of keys; a key missing from p or q counts as mass eps."""
+    keys = set(p.keys()) | set(q.keys())
+    return sum(p.get(k, eps) * np.log((p.get(k, eps) + eps) / (q.get(k, eps) + eps)) for k in keys)  # eps also added inside the log
+
+
+def build_kernel(events: List["Event"]) -> Dict[str, Dict[str, float]]:
+    """Build empirical transition kernel T' (row-normalised counts); the first event transitions out of "start"."""
+    counts: Dict[str, Dict[str, int]] = {}
+    sources = ["start"] + [e.action for e in events]
+    for src, dst in zip(sources, sources[1:]):
+        row = counts.setdefault(src, {})
+        row[dst] = row.get(dst, 0) + 1
+    totals = {src: sum(row.values()) for src, row in counts.items()}
+    # a row only exists once a transition was counted, so the > 0 guard is purely defensive
+    return {src: {dst: c / totals[src] for dst, c in row.items()} for src, row in counts.items() if totals[src] > 0}
+
+
+def compute_divergence(session: "Session") -> Tuple[float, float]:
+    """Return (delta_H, delta_A): mean KL(T' || T_H) and mean KL(T' || T_A) over the observed states.
+
+    T' is the session's empirical kernel; a session with no events yields (0.5, 0.5).
+    """
+    kernel = build_kernel(session.events)
+    if not kernel:
+        return 0.5, 0.5
+    n_states = len(kernel)
+    delta_h = sum(kl_div(row, TRANS_H.get(state, {})) for state, row in kernel.items()) / n_states
+    delta_a = sum(kl_div(row, TRANS_A.get(state, {})) for state, row in kernel.items()) / n_states
+    return delta_h, delta_a
+
+
+def estimate_alpha(session: "Session", beta: float = 2.0) -> float:
+    """Per-session contamination estimate alpha_hat = sigma(beta*(delta_H - delta_A)).
+
+    Read as the probability the session is agent-generated; 0.5 when delta_H + delta_A <= 0."""
+    dh, da = compute_divergence(session)
+    if (dh + da) <= 0:
+        return 0.5
+    logit = beta * (dh - da)
+    return 1.0 / (1.0 + np.exp(-logit))
diff --git a/lab/case/thesis/simplified.py b/sim/case/thesis_simplified/simplified.py
similarity index 100%
rename from lab/case/thesis/simplified.py
rename to sim/case/thesis_simplified/simplified.py
diff --git a/lab/case/thesis/simplified_env.py b/sim/case/thesis_simplified/simplified_env.py
similarity index 100%
rename from lab/case/thesis/simplified_env.py
rename to sim/case/thesis_simplified/simplified_env.py
diff --git a/sim/case/thesis_simplified/summarize.py b/sim/case/thesis_simplified/summarize.py
new file mode 100644
index 0000000..10406aa
--- /dev/null
+++ b/sim/case/thesis_simplified/summarize.py
@@ -0,0 +1,168 @@
+"""Summarize TensorBoard logs into comparison tables."""
+from __future__ import annotations
+import json
+import re
+from pathlib import Path
+from collections import defaultdict
+from dataclasses import dataclass
+import pandas as pd
+
+try:
+    from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
+    HAS_TB = True
+except ImportError:
+    HAS_TB = False
+
+
+@dataclass
+class RunInfo:
+    algo: str  # algorithm/policy token parsed from the run directory name
+    alpha: float  # contamination level parsed from the "_a<number>" part of the name
+    reward_mode: str  # trailing token of the name; 'robust' when the name has none
+    path: Path  # run directory (placeholder Path() until discover_runs fills it in)
+
+
+def parse_run_name(name: str) -> RunInfo | None:
+    """Extract algo, alpha, reward_mode from a run directory name; None if the name is not a run."""
+    # patterns: ppo_a0.20_robust, cmp_fixed_a0.20, sac_a0.90_robust
+    m = re.match(r'(cmp_)?(\w+)_a([\d.]+)_?(\w+)?', name)
+    if not m or m.group(3).count('.') > 1 or m.group(3) == '.':
+        return None  # no match, or an alpha token that float() would reject (e.g. "1.2.3")
+    _prefix, algo, alpha, mode = m.groups()
+    return RunInfo(algo=algo, alpha=float(alpha), reward_mode=mode or 'robust', path=Path())
+
+
+def load_tb_scalars(log_dir: Path, tags: list[str], reduce: str = 'last') -> dict[str, float]:
+    """Reduce each requested scalar tag in a TensorBoard log directory to one number.
+
+    ``reduce`` is one of 'last', 'mean', 'max', 'min'. Tags that are absent or empty, and
+    unrecognised ``reduce`` values, produce no entry. Returns {} when TensorBoard is unavailable.
+    """
+    if not HAS_TB:
+        return {}
+    reducers = {
+        'last': lambda vals: vals[-1],
+        'mean': lambda vals: sum(vals) / len(vals),
+        'max': max,
+        'min': min,
+    }
+    ea = EventAccumulator(str(log_dir))
+    ea.Reload()
+    results = {}
+    for tag in tags:
+        events = ea.Scalars(tag) if tag in ea.Tags().get('scalars', []) else []
+        if events and reduce in reducers:
+            results[tag] = reducers[reduce]([e.value for e in events])
+    return results
+
+
+def load_json_results(log_dir: Path) -> dict[str, float]:
+    """Return the parsed contents of log_dir/results.json, or {} when that file does not exist."""
+    results_file = log_dir / 'results.json'
+    if not results_file.exists():
+        return {}
+    with open(results_file) as f:
+        return json.load(f)
+
+
+def discover_runs(base_dir: Path) -> list[RunInfo]:
+    """Find all experiment runs directly under base_dir (sorted by path); [] if base_dir is missing."""
+    if not base_dir.is_dir():
+        return []
+    runs = []
+    for d in sorted(base_dir.iterdir()):  # sorted: iterdir() order is filesystem-dependent
+        info = parse_run_name(d.name) if d.is_dir() else None
+        if info:
+            info.path = d
+            runs.append(info)
+    return runs
+
+
+def build_tables(runs: list[RunInfo], metrics: list[str], reduce: str = 'last') -> dict[str, dict[str, pd.DataFrame]]:
+    """Build pivot tables: reward_mode -> metric -> DataFrame[alpha x algo].
+
+    Per run and metric, ``<metric>_mean`` from results.json wins over the TensorBoard scalar.
+    """
+    def tb_tag(metric: str) -> str:
+        if metric in ['revenue', 'profit', 'margin']:
+            return f'economics/{metric}'
+        if metric in ['erosion', 'leakage']:
+            return f'coi/{metric}'
+        return f'alpha/{metric}'  # everything else is looked up under alpha/
+
+    tag_map = {tb_tag(m): m for m in metrics}
+    tb_tags = list(tag_map)
+    collected = defaultdict(lambda: defaultdict(dict))  # [reward_mode][metric][(alpha, algo)] = value
+    for run in runs:
+        json_metrics = load_json_results(run.path)
+        tb_metrics = load_tb_scalars(run.path, tb_tags, reduce)
+        for tag, metric in tag_map.items():
+            json_key = f'{metric}_mean'
+            value = json_metrics[json_key] if json_key in json_metrics else tb_metrics.get(tag)
+            if value is not None:
+                collected[run.reward_mode][metric][(run.alpha, run.algo)] = value
+
+    tables = {}
+    for mode, per_metric in collected.items():
+        tables[mode] = {}
+        for metric, cells in per_metric.items():
+            if not cells:
+                continue
+            alphas = sorted({a for a, _ in cells})
+            algos = sorted({al for _, al in cells})
+            df = pd.DataFrame(index=alphas, columns=algos, dtype=float)
+            for (a, al), v in cells.items():
+                df.loc[a, al] = v
+            df.index.name = 'alpha'
+            tables[mode][metric] = df
+    return tables
+
+
+def format_table(df: pd.DataFrame, fmt: str = '.3f') -> str:
+    """Format DataFrame as markdown table, passing ``fmt`` as ``floatfmt`` to DataFrame.to_markdown."""
+    return df.to_markdown(floatfmt=fmt)  # NOTE(review): to_markdown needs the optional 'tabulate' package; confirm it is installed
+
+
+def summarize(base_dir: str = 'sim/case/thesis_simplified/runs',
+              metrics: list[str] | None = None,
+              reduce: str = 'last',
+              output: str | None = None) -> dict:
+    """Print (and optionally save) markdown tables for every run found under base_dir.
+
+    Returns the nested {reward_mode: {metric: DataFrame}} tables, or {} when no runs are found.
+    """
+    base = Path(base_dir)
+    runs = discover_runs(base)
+    if not runs:
+        print(f"No runs found in {base}")
+        return {}
+    print(f"Found {len(runs)} runs")
+
+    selected = metrics or ['revenue', 'profit', 'margin', 'erosion', 'leakage']
+    tables = build_tables(runs, selected, reduce)
+
+    # one "# Reward Mode" section per mode, one "## metric" table per metric, both sorted by name
+    sections = []
+    for mode, metric_tables in sorted(tables.items()):
+        sections.append(f"\n# Reward Mode: {mode}\n")
+        for metric, df in sorted(metric_tables.items()):
+            sections.extend([f"\n## {metric}\n", format_table(df), ""])
+    report = '\n'.join(sections)
+    print(report)
+
+    if output:
+        Path(output).write_text(report)
+        print(f"\nSaved to {output}")
+
+    return tables
+
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--dir', default='sim/case/thesis_simplified/runs')
+    parser.add_argument('--metrics', nargs='+', default=['revenue', 'profit', 'margin', 'erosion', 'leakage'])
+    parser.add_argument('--reduce', default='last', choices=['last', 'mean', 'max', 'min'])
+    parser.add_argument('--output', '-o', help='save markdown to file')
+    cli = parser.parse_args()
+    summarize(base_dir=cli.dir, metrics=cli.metrics, reduce=cli.reduce, output=cli.output)
diff --git a/lab/case/thesis/train.py b/sim/case/thesis_simplified/train.py
similarity index 99%
rename from lab/case/thesis/train.py
rename to sim/case/thesis_simplified/train.py
index c1273eb..a405c44 100644
--- a/lab/case/thesis/train.py
+++ b/sim/case/thesis_simplified/train.py
@@ -65,7 +65,7 @@ class ExperimentConfig:
     n_envs: int = 4
     eval_freq: int = 5000
     n_eval_episodes: int = 10
-    log_dir: str = "lab/case/thesis/runs"
+    log_dir: str = "sim/case/thesis_simplified/runs"
     seed: int = 42
     n_products: int = 10
     max_steps: int = 200
@@ -312,7 +312,7 @@ def main():
     parser.add_argument("--n-products", type=int, default=10)
     parser.add_argument("--n-envs", type=int, default=4)
     parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--log-dir", default="lab/case/thesis/runs")
+    parser.add_argument("--log-dir", default="sim/case/thesis_simplified/runs")
     parser.add_argument("--sweep", action="store_true", help="run contamination sweep")
     parser.add_argument("--compare", action="store_true", help="compare all baselines")
     parser.add_argument("--workers", type=int, default=None, help="max parallel workers for sweep (None=auto, 1=sequential)")
diff --git a/sim/rl/environment.py b/sim/rl/environment.py
index 597359f..a4cf7c9 100644
--- a/sim/rl/environment.py
+++ b/sim/rl/environment.py
@@ -2,6 +2,7 @@ import gymnasium as gym
 from gymnasium import spaces
 import numpy as np
 from dataclasses import dataclass
+from pathlib import Path
 import pandas as pd
 from types import SimpleNamespace
 from typing import Optional, Dict, Any, List, Tuple
@@ -19,8 +20,6 @@ except ImportError:
 # "learner" agent learning to optimize pricing
 # "agent" part of environment creating demand signals that learner processes
 
-base_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments"
-human_dir, agent_dir = f"{base_dir}/collected_data/", f"{base_dir}/agents/collected_data/"
 @dataclass
 class BusinessLogicConstraints():
     max_price_adjustment: float = 0.30
@@ -43,6 +42,17 @@ class BusinessLogicConstraints():
     w_volatility: float = 5.0
     w_estimation_error: float = 0.25
     seed: int = 7
+    human_data_dir: str | None = None
+    agent_data_dir: str | None = None
+
+
+def _resolve_behavior_data_dirs(constraints: BusinessLogicConstraints) -> tuple[str, str]:
+    """Return (human_dir, agent_dir): the constraint values if set, else defaults under parents[2]/experiments."""
+    experiments = Path(__file__).resolve().parents[2] / "experiments"
+    defaults = {"human": experiments / "collected_data", "agent": experiments / "agents" / "collected_data"}
+    human = constraints.human_data_dir or str(defaults["human"])
+    agent = constraints.agent_data_dir or str(defaults["agent"])
+    return human, agent
 
 
 def _sigmoid(x: np.ndarray) -> np.ndarray:
@@ -94,7 +104,7 @@ class BehavioralProfile:
     """Synthetic Markov profile used to generate interaction sessions.
     Uses aggregate_event_transitions from models.py to build transition kernels from real data."""
 
-    def __init__(self, actor: str, purchase_probs: np.ndarray):
+    def __init__(self, actor: str, purchase_probs: np.ndarray, *, human_data_dir: str, agent_data_dir: str):
         self.actor = actor
         self.purchase_probs = np.clip(purchase_probs, 0.0, 0.95)
         self.states = [
@@ -105,7 +115,7 @@ class BehavioralProfile:
             "purchase_complete",
             "session_end",
         ]
-        model = AgentBehaviorModel(agent_dir) if actor == "agents" else BehaviorModel(human_dir)
+        model = AgentBehaviorModel(agent_data_dir) if actor == "agents" else BehaviorModel(human_data_dir)
         mdp = model.build_MDP()
         raw_trans = aggregate_event_transitions(mdp) if mdp.get("transitions") else {}
         self.transitions = _canonicalize_transitions(raw_trans) if raw_trans else self._fallback_transitions()
@@ -227,12 +237,18 @@ class BehavioralProfile:
         return events, feature_events
 
 
-def _load_behavioral_profile(actor: str, demand_forcing: np.ndarray) -> BehavioralProfile:
+def _load_behavioral_profile(
+    actor: str,
+    demand_forcing: np.ndarray,
+    *,
+    human_data_dir: str,
+    agent_data_dir: str,
+) -> BehavioralProfile:
     """returns a behavioral profile for generating synthetic sessions
     actor: 'humans' or 'agents'
     demand_forcing: per-product purchase probabilities used to weight interactions
     """
-    return BehavioralProfile(actor, demand_forcing)
+    return BehavioralProfile(actor, demand_forcing, human_data_dir=human_data_dir, agent_data_dir=agent_data_dir)
 
 
 class CommercePlatform:
@@ -248,6 +264,7 @@ class CommercePlatform:
         self.unit_cost = np.random.uniform(low=15.0, high=60.0, size=(self.product_catalogue_size,)).astype(np.float32)
         self.base_price = np.random.uniform(low=60.0, high=140.0, size=(self.product_catalogue_size,)).astype(np.float32)
         self.alpha_hat = constraints.agent_share
+        self._human_data_dir, self._agent_data_dir = _resolve_behavior_data_dirs(constraints)
         try:
             self.separability_artifacts = load_artifacts()
         except FileNotFoundError:
@@ -287,7 +304,12 @@ class CommercePlatform:
         demand_agent = np.zeros_like(prices, dtype=np.float32)
 
         for actor, n_sessions in session_map.items():
-            profile = _load_behavioral_profile(actor, pprob_map[actor])
+            profile = _load_behavioral_profile(
+                actor,
+                pprob_map[actor],
+                human_data_dir=self._human_data_dir,
+                agent_data_dir=self._agent_data_dir,
+            )
             for idx in range(n_sessions):
                 session_id = f"{actor}_{idx:06d}"
                 session_rows, feature_events = profile.sample_session(
@@ -474,8 +496,19 @@ class PHANTOMEnv(gym.Env):
 
     def _init_jax_transitions(self):
         try:
-            human_profile = _load_behavioral_profile("humans", np.ones(self.constraints.product_catalogue_size) * 0.1)
-            agent_profile = _load_behavioral_profile("agents", np.ones(self.constraints.product_catalogue_size) * 0.1)
+            human_dir, agent_dir = _resolve_behavior_data_dirs(self.constraints)
+            human_profile = _load_behavioral_profile(
+                "humans",
+                np.ones(self.constraints.product_catalogue_size) * 0.1,
+                human_data_dir=human_dir,
+                agent_data_dir=agent_dir,
+            )
+            agent_profile = _load_behavioral_profile(
+                "agents",
+                np.ones(self.constraints.product_catalogue_size) * 0.1,
+                human_data_dir=human_dir,
+                agent_data_dir=agent_dir,
+            )
             self._jax_trans = compile_transitions(human_profile, agent_profile).to_jax()
         except Exception:
             self._jax_trans = fallback_transitions().to_jax()

From fa2aca8b13a331dff9a6968aa01f2b75b07c4308 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Mon, 26 Jan 2026 14:12:41 +0100
Subject: [PATCH 55/99] chore: rough migration of environment configuration

---
 sim/rl/engine.py      |  18 +-
 sim/rl/environment.py | 875 ++++++++++--------------------------------
 2 files changed, 216 insertions(+), 677 deletions(-)

diff --git a/sim/rl/engine.py b/sim/rl/engine.py
index ab751e3..ec4d871 100644
--- a/sim/rl/engine.py
+++ b/sim/rl/engine.py
@@ -76,8 +76,7 @@ class WildPricingEngine(BasePricingEngine):
 
     def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
         self.step_count += 1
-        # extract demand signal (from env observation) as proxy for sales
-        demand = observation.get('demand', np.zeros(self.c.product_catalogue_size, dtype=np.float32))
+        demand = _extract_demand(observation, self.c.product_catalogue_size)
         return self._update_from_demand(current_prices, demand)
 
     def _update_from_demand(self, prices: np.ndarray, sold: np.ndarray) -> np.ndarray:
@@ -141,7 +140,7 @@ class SimpleDemandEngine(BasePricingEngine):
 
     def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
         self.step_count += 1
-        demand = observation.get('demand', np.zeros(self.c.product_catalogue_size, dtype=np.float32))
+        demand = _extract_demand(observation, self.c.product_catalogue_size)
         if self.prev_demand is None:
             self.prev_demand = demand.copy()
             return current_prices.copy()
@@ -207,7 +206,7 @@ class ThompsonSamplingEngine(BasePricingEngine):
             lo = current_prices * 0.7
             hi = current_prices * 1.3
             self.price_grid = np.linspace(lo, hi, self.n_price_levels).T
-        demand = observation.get('demand', np.zeros(self.c.product_catalogue_size, dtype=np.float32))
+        demand = _extract_demand(observation, self.c.product_catalogue_size)
         # update beliefs based on last action
         if self.last_actions is not None:
             for i in range(self.c.product_catalogue_size):
@@ -226,3 +225,14 @@ class ThompsonSamplingEngine(BasePricingEngine):
             new_prices[i] = self.price_grid[i, actions[i]]
         self.last_actions = actions
         return np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)
+
+
+def _extract_demand(observation: Dict[str, Any], n: int) -> np.ndarray:
+    """Return demand as float32: observation['elasticity']['demand'], else observation['demand'], else zeros(n)."""
+    nested = observation["elasticity"] if "elasticity" in observation else None
+    demand = nested.get("demand") if isinstance(nested, dict) else None
+    if demand is None:
+        demand = observation.get("demand")
+    if demand is None:
+        return np.zeros(n, dtype=np.float32)
+    return np.asarray(demand, dtype=np.float32)
diff --git a/sim/rl/environment.py b/sim/rl/environment.py
index a4cf7c9..94bc8e1 100644
--- a/sim/rl/environment.py
+++ b/sim/rl/environment.py
@@ -1,715 +1,244 @@
-import gymnasium as gym
-from gymnasium import spaces
-import numpy as np
-from dataclasses import dataclass
-from pathlib import Path
-import pandas as pd
-from types import SimpleNamespace
-from typing import Optional, Dict, Any, List, Tuple
+from __future__ import annotations
 
-from lib.separability import load_artifacts, score_session, estimate_alpha
-from sim.rl.behavior_loader.models import AgentBehaviorModel, BehaviorModel, aggregate_event_transitions
+from dataclasses import dataclass
+from typing import Any, Dict, Optional, Tuple
+
+import numpy as np
 
 try:
-    import jax
-    from sim.rl.jax_core import JAX_AVAILABLE, compile_transitions, fallback_transitions, sample_sessions, compute_metrics
-    from sim.rl.jax_core import session_features, compute_session_transitions, compute_divergences, estimate_alpha_batch
-except ImportError:
-    JAX_AVAILABLE = False
+    import gymnasium as gym
+    from gymnasium import spaces
+except ImportError as e:
+    raise ImportError("sim.rl.environment requires gymnasium") from e
 
-# "learner" agent learning to optimize pricing
-# "agent" part of environment creating demand signals that learner processes
+from sim.case.thesis_simplified.coi import COIWindow, coi_erosion, compute_coi_window
+from sim.case.thesis_simplified.separability import estimate_alpha as estimate_session_alpha
+from sim.case.thesis_simplified.simplified import Limbo, Session, put_prices_to_market
+from sim.rl.thesis_core import aggregate_demand_by_product, aggregate_purchases, constrain_prices
+
+
+@dataclass(frozen=True)
+class BusinessLogicConstraints:
+    product_catalogue_size: int = 100
+    max_steps: int = 2000
+    sessions_per_step: int = 250
 
-@dataclass
-class BusinessLogicConstraints():
-    max_price_adjustment: float = 0.30
     system_max_price: float = 500.0
     system_min_price: float = 1.0
-    product_catalogue_size: int = 100
-    episode_length: int = 2000
-    sessions_per_step: int = 250
+    max_price_adjustment: float = 0.30
+    min_margin_pct: float = 0.05
+
     agent_share: float = 0.2
-    agent_recon_multiplier: float = 6.0
-    agent_purchase_probability: float = 0.20
+    alpha_drift: float = 0.0
+    alpha_bounds: tuple[float, float] = (0.0, 0.8)
+
     coi_strength: float = 0.25
-    coi_threshold: float = 4.0
-    coi_sigmoid_temp: float = 1.25
-    base_human_demand: float = 0.08
-    base_agent_demand: float = 0.05
-    human_price_elasticity: float = -1.2 # assumptions here
-    agent_price_elasticity: float = -0.6
-    w_agent_loss: float = 1.0
     w_volatility: float = 5.0
     w_estimation_error: float = 0.25
+
     seed: int = 7
-    human_data_dir: str | None = None
-    agent_data_dir: str | None = None
 
 
-def _resolve_behavior_data_dirs(constraints: BusinessLogicConstraints) -> tuple[str, str]:
-    base = Path(__file__).resolve().parents[2] / "experiments"
-    human_default = str(base / "collected_data")
-    agent_default = str(base / "agents" / "collected_data")
-    human = constraints.human_data_dir or human_default
-    agent = constraints.agent_data_dir or agent_default
-    return human, agent
-
-
-def _sigmoid(x: np.ndarray) -> np.ndarray:
-    return 1.0 / (1.0 + np.exp(-x))
-
-EVENT_PAGE_MAP = {
-    "session_start": "/",
-    "page_view": "/",
-    "view_item_page": "/products",
-    "learn_more_about_item": "/products/details",
-    "add_item_to_cart": "/cart",
-    "checkout_start": "/checkout",
-    "purchase_complete": "/checkout",
-    "session_end": "/checkout/success",
-}
-
-# map real collected event names to canonical simulation states
-EVENT_CANONICAL_MAP = {
-    "page_view": "session_start",
-    "hover_over_paragraph": "view_item_page",
-    "hover_over_title": "view_item_page",
-    "view_item_page": "view_item_page",
-    "learn_more_about_item": "learn_more_about_item",
-    "add_item_to_cart": "add_item_to_cart",
-    "checkout_start": "purchase_complete",
-    "remove_item": "view_item_page",
-}
-
-
-def _canonicalize_transitions(raw_trans: Dict[str, Dict[str, float]]) -> Dict[str, Dict[str, float]]:
-    """Map real event transition names to canonical simulation states."""
-    canonical: Dict[str, Dict[str, float]] = {}
-    for src, dsts in raw_trans.items():
-        src_canon = EVENT_CANONICAL_MAP.get(src, src)
-        if src_canon not in canonical:
-            canonical[src_canon] = {}
-        for dst, prob in dsts.items():
-            dst_canon = EVENT_CANONICAL_MAP.get(dst, dst)
-            canonical[src_canon][dst_canon] = canonical[src_canon].get(dst_canon, 0.0) + prob
-    # re-normalize after aggregation
-    for src in canonical:
-        total = sum(canonical[src].values())
-        if total > 0:
-            canonical[src] = {k: v / total for k, v in canonical[src].items()}
-    return canonical
-
-
-class BehavioralProfile:
-    """Synthetic Markov profile used to generate interaction sessions.
-    Uses aggregate_event_transitions from models.py to build transition kernels from real data."""
-
-    def __init__(self, actor: str, purchase_probs: np.ndarray, *, human_data_dir: str, agent_data_dir: str):
-        self.actor = actor
-        self.purchase_probs = np.clip(purchase_probs, 0.0, 0.95)
-        self.states = [
-            "session_start",
-            "view_item_page",
-            "learn_more_about_item",
-            "add_item_to_cart",
-            "purchase_complete",
-            "session_end",
-        ]
-        model = AgentBehaviorModel(agent_data_dir) if actor == "agents" else BehaviorModel(human_data_dir)
-        mdp = model.build_MDP()
-        raw_trans = aggregate_event_transitions(mdp) if mdp.get("transitions") else {}
-        self.transitions = _canonicalize_transitions(raw_trans) if raw_trans else self._fallback_transitions()
-        self._ensure_terminal_states()
-        self.dwell_params = self._extract_dwell_params(mdp)
-
-    def _ensure_terminal_states(self):
-        # guarantee purchase_complete leads to session_end and session_start exists
-        if "purchase_complete" not in self.transitions:
-            self.transitions["purchase_complete"] = {"session_end": 1.0}
-        elif "session_end" not in self.transitions.get("purchase_complete", {}):
-            self.transitions["purchase_complete"]["session_end"] = 1.0
-            total = sum(self.transitions["purchase_complete"].values())
-            self.transitions["purchase_complete"] = {k: v/total for k, v in self.transitions["purchase_complete"].items()}
-        if "session_start" not in self.transitions:
-            self.transitions["session_start"] = {"view_item_page": 0.7, "learn_more_about_item": 0.2, "session_end": 0.1}
-
-    def _fallback_transitions(self) -> Dict[str, Dict[str, float]]:
-        return {
-            "session_start": {"view_item_page": 0.85, "session_end": 0.15},
-            "view_item_page": {"learn_more_about_item": 0.4, "add_item_to_cart": 0.3, "view_item_page": 0.2, "session_end": 0.1},
-            "learn_more_about_item": {"add_item_to_cart": 0.5, "view_item_page": 0.3, "session_end": 0.2},
-            "add_item_to_cart": {"purchase_complete": 0.6, "view_item_page": 0.25, "session_end": 0.15},
-            "purchase_complete": {"session_end": 1.0},
-        }
-
-    def _extract_dwell_params(self, mdp: Dict) -> Dict[str, Tuple[float, float]]:
-        state_vals = mdp.get("state_values", {})
-        params = {}
-        for state in self.states:
-            # try canonical and raw state names
-            val = state_vals.get(state, 0.5)
-            for raw, canon in EVENT_CANONICAL_MAP.items():
-                if canon == state and raw in state_vals:
-                    val = state_vals[raw]
-                    break
-            shape = 1.5 + val * 2.0
-            scale = 0.8 + (1.0 - val) * 1.2
-            params[state] = (shape, scale)
-        return params
-
-    def _transition_probs(self, state: str, product_idx: int) -> Dict[str, float]:
-        probs = dict(self.transitions.get(state, {"session_end": 1.0}))
-        if state == "add_item_to_cart":
-            base = probs.get("purchase_complete", 0.0)
-            demand_factor = float(self.purchase_probs[int(product_idx)])
-            if self.actor == "agents":
-                demand_factor *= 0.7
-            adjusted = np.clip(base * 0.5 + demand_factor * 0.5, 0.0, 0.95)
-            remainder = max(1e-6, 1.0 - adjusted)
-            other_total = sum(v for k, v in probs.items() if k != "purchase_complete")
-            scale = remainder / max(other_total, 1e-6)
-            for key in probs:
-                if key == "purchase_complete":
-                    probs[key] = adjusted
-                else:
-                    probs[key] = probs[key] * scale
-        total = sum(probs.values())
-        if total <= 0:
-            return {"session_end": 1.0}
-        return {state: val / total for state, val in probs.items()}
-
-    def sample_session(
-        self,
-        rng: np.random.Generator,
-        session_id: str,
-        prices: np.ndarray,
-        unit_cost: np.ndarray,
-    ) -> Tuple[List[Dict[str, Any]], List[SimpleNamespace]]:
-        """Generate a single session trajectory respecting business constraints."""
-        events: List[Dict[str, Any]] = []
-        feature_events: List[SimpleNamespace] = []
-        state = "session_start"
-        t = 0.0
-        product_idx = int(rng.integers(0, len(prices)))
-        product_id = f"product-{product_idx:04d}"
-
-
-        # enforce price >= cost constraint (lipschitz bound on pricing)
-        # This is a sort of last resort to not let an pricing learner go rogue
-        cost = float(unit_cost[product_idx])
-        constrained_price = max(float(prices[product_idx]), cost * 1.05)  # 5% min margin
-
-        while state != "session_end" and len(events) < 40:
-            if state != "session_start":
-                row = {
-                    "session_id": session_id,
-                    "actor": "agent" if self.actor == "agents" else "human",
-                    "eventName": state,
-                    "product_idx": product_idx,
-                    "productId": product_id,
-                    "price_offered": constrained_price,
-                    "price_paid": 0.0,
-                    "page": EVENT_PAGE_MAP.get(state, "/"),
-                    "ts": t,
-                    "unit_cost": cost,
-                    "base_price": float(prices[product_idx]),
-                }
-                if state == "purchase_complete":
-                    noise = float(rng.normal(0.0, 0.015))
-                    row["price_paid"] = max(constrained_price * (1.0 + noise), cost)
-                events.append(row)
-                feature_events.append(
-                    SimpleNamespace(
-                        eventName=row["eventName"],
-                        page=row["page"],
-                        productId=row["productId"],
-                        ts=row["ts"],
-                    )
-                )
-
-            transitions = self._transition_probs(state, product_idx)
-            next_state = rng.choice(list(transitions.keys()), p=list(transitions.values()))
-            shape, scale = self.dwell_params.get(state, (2.0, 1.0))
-            dwell = max(0.3, rng.gamma(shape=shape, scale=scale))
-            t += dwell
-            state = next_state
-
-        return events, feature_events
-
-
-def _load_behavioral_profile(
-    actor: str,
-    demand_forcing: np.ndarray,
-    *,
-    human_data_dir: str,
-    agent_data_dir: str,
-) -> BehavioralProfile:
-    """returns a behavioral profile for generating synthetic sessions
-    actor: 'humans' or 'agents'
-    demand_forcing: per-product purchase probabilities used to weight interactions
-    """
-    return BehavioralProfile(actor, demand_forcing, human_data_dir=human_data_dir, agent_data_dir=agent_data_dir)
-
-
-class CommercePlatform:
-    """state management for the environment, simulates demand"""
-    def __init__(self, product_catalogue_size: int, max_price: float, min_price: float, constraints: BusinessLogicConstraints):
-        self.product_catalogue_size = product_catalogue_size
-        self.max_price = max_price
-        self.min_price = min_price
-        self.constraints = constraints
-        self.simulation_history: List[Dict[str, Any]] = []
-        self._rng = np.random.default_rng(constraints.seed)
-        self._last_interaction_df: pd.DataFrame = pd.DataFrame()
-        self.unit_cost = np.random.uniform(low=15.0, high=60.0, size=(self.product_catalogue_size,)).astype(np.float32)
-        self.base_price = np.random.uniform(low=60.0, high=140.0, size=(self.product_catalogue_size,)).astype(np.float32)
-        self.alpha_hat = constraints.agent_share
-        self._human_data_dir, self._agent_data_dir = _resolve_behavior_data_dirs(constraints)
-        try:
-            self.separability_artifacts = load_artifacts()
-        except FileNotFoundError:
-            self.separability_artifacts = None
-
-    def setup_true_demand(self, prices: np.ndarray) -> Dict[str, np.ndarray]:
-        p = np.clip(prices, self.min_price, self.max_price)
-        cost = np.clip(self.unit_cost, self.min_price * 0.2, self.max_price)
-        margin = np.clip((p - cost) / np.maximum(cost, 1e-3), -0.9, 2.0)
-        # isoelastic demand approximation
-        human_prob = self.constraints.base_human_demand * np.exp(self.constraints.human_price_elasticity * margin)
-        agent_prob = self.constraints.base_agent_demand * np.exp(self.constraints.agent_price_elasticity * margin)
-        return {
-            "human_purchase_prob": np.clip(human_prob, 0.0, 0.95),
-            "agent_purchase_prob": np.clip(agent_prob, 0.0, 0.95),
-        }
-
-    def _simulate_sessions(self, prices: np.ndarray) -> Tuple[pd.DataFrame, Dict[str, Any]]:
-        demand = self.setup_true_demand(prices)
-        T = self.constraints.sessions_per_step
-        effective_share = float(np.clip(self.alpha_hat, 0.0, 0.95))
-        n_agent_sessions = max(1, int(round(T * effective_share)))
-        n_human_sessions = max(1, T - n_agent_sessions)
-
-        session_map = {
-            "humans": n_human_sessions,
-            "agents": n_agent_sessions,
-        }
-        pprob_map = {
-            "humans": demand["human_purchase_prob"],
-            "agents": demand["agent_purchase_prob"],
-        }
-
-        rows: List[Dict[str, Any]] = []
-        session_scores: List[Dict[str, float]] = []
-        demand_human = np.zeros_like(prices, dtype=np.float32)
-        demand_agent = np.zeros_like(prices, dtype=np.float32)
-
-        for actor, n_sessions in session_map.items():
-            profile = _load_behavioral_profile(
-                actor,
-                pprob_map[actor],
-                human_data_dir=self._human_data_dir,
-                agent_data_dir=self._agent_data_dir,
-            )
-            for idx in range(n_sessions):
-                session_id = f"{actor}_{idx:06d}"
-                session_rows, feature_events = profile.sample_session(
-                    self._rng, session_id, prices, self.unit_cost
-                )
-                rows.extend(session_rows)
-                if session_rows:
-                    df_session = pd.DataFrame(session_rows)
-                    purchases = df_session[df_session["eventName"] == "purchase_complete"]
-                    if not purchases.empty:
-                        counts = purchases.groupby("product_idx").size()
-                        if actor == "agents":
-                            demand_agent[counts.index.to_numpy(dtype=int)] += counts.to_numpy(dtype=np.float32)
-                        else:
-                            demand_human[counts.index.to_numpy(dtype=int)] += counts.to_numpy(dtype=np.float32)
-                if self.separability_artifacts and feature_events:
-                    score = score_session(feature_events, self.separability_artifacts)
-                    session_scores.append(score)
-
-        interactions_df = pd.DataFrame(rows)
-        diagnostics = {
-            "alpha_hat": float(self.alpha_hat),
-            "session_scores": session_scores,
-            "demand_human": demand_human,
-            "demand_agent": demand_agent,
-        }
-
-        if session_scores:
-            alphas = [
-                estimate_alpha(s["prob_agent"], s["delta_h"], s["delta_a"], temperature=2.0)
-                for s in session_scores
-            ]
-            mean_alpha = float(np.mean(alphas))
-            # exponential moving average for stability
-            self.alpha_hat = 0.7 * self.alpha_hat + 0.3 * mean_alpha
-            diagnostics.update(
-                {
-                    "alpha_hat": float(self.alpha_hat),
-                    "delta_h_mean": float(np.mean([s["delta_h"] for s in session_scores])),
-                    "delta_a_mean": float(np.mean([s["delta_a"] for s in session_scores])),
-                    "prob_agent_mean": float(np.mean([s["prob_agent"] for s in session_scores])),
-                }
-            )
-
-        self._last_interaction_df = interactions_df
-        return interactions_df, diagnostics
-
-    def compute_interaction_features(self, interaction_df: pd.DataFrame) -> Dict[str, float]:
-        if interaction_df.empty:
-            return {
-                "revenue_observed": 0.0,
-                "revenue_oracle": 0.0,
-                "agent_loss": 0.0,
-                "true_human_purchases": 0.0,
-                "true_agent_purchases": 0.0,
-                "mean_sale_price": 0.0,
-                "look_to_book": 0.0,
-                "coi": 0.0,
-                "expected_premium": 0.0,
-            }
-
-        purchases = interaction_df[interaction_df["eventName"] == "purchase_complete"]
-        human_purchases = purchases[purchases["actor"] == "human"]
-        agent_purchases = purchases[purchases["actor"] == "agent"]
-
-        revenue_observed = float(purchases["price_paid"].sum())
-        revenue_oracle = float(purchases["base_price"].sum())
-        agent_loss = float((agent_purchases["base_price"] - agent_purchases["price_paid"]).sum())
-
-        mean_sale_price = float(purchases["price_paid"].mean()) if not purchases.empty else 0.0
-        views = float((interaction_df["eventName"] == "view_item_page").sum())
-        look_to_book = float(views / (len(purchases) + 1e-6))
-        true_human = float(len(human_purchases))
-        true_agent = float(len(agent_purchases))
-
-        human_prices = human_purchases["price_offered"] if not human_purchases.empty else pd.Series(dtype=float)
-        human_costs = human_purchases["unit_cost"] if not human_purchases.empty else pd.Series(dtype=float)
-        human_base = human_purchases["base_price"] if not human_purchases.empty else pd.Series(dtype=float)
-        coi = 0.0
-        if not human_prices.empty and not human_costs.empty:
-            # COI = E[P] - p_min where p_min is cost, accounting for expected premium (base - realized)
-            margin = human_prices.mean() - human_costs.mean()
-            expected_premium = human_base.mean() - human_prices.mean() if not human_base.empty else 0.0
-            coi = float(np.maximum(0.0, margin - expected_premium * 0.5))
-
-        return {
-            "revenue_observed": revenue_observed,
-            "revenue_oracle": revenue_oracle,
-            "agent_loss": agent_loss,
-            "true_human_purchases": true_human,
-            "true_agent_purchases": true_agent,
-            "mean_sale_price": mean_sale_price,
-            "look_to_book": look_to_book,
-            "coi": coi,
-            "expected_premium": float(expected_premium) if not human_base.empty else 0.0,
-        }
-
-    def _session_feature_table(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Extract per-session behavioral features for separability analysis."""
-        if df.empty:
-            return pd.DataFrame()
-        g = df.groupby("session_id", sort=False)
-        session_duration = g["ts"].max() - g["ts"].min()
-        total_interactions = g.size()
-        avg_time_between = g["ts"].apply(lambda x: float(np.diff(np.sort(x.to_numpy())).mean()) if len(x) > 1 else 0.0)
-        interaction_velocity = total_interactions / (session_duration + 1e-6)
-        views = g.apply(lambda x: int((x["eventName"] == "view_item_page").sum()), include_groups=False)
-        cart_adds = g.apply(lambda x: int((x["eventName"] == "add_item_to_cart").sum()), include_groups=False)
-        purchases = g.apply(lambda x: int((x["eventName"] == "purchase_complete").sum()), include_groups=False)
-        learn_more = g.apply(lambda x: int((x["eventName"] == "learn_more_about_item").sum()), include_groups=False)
-        conversion_rate = purchases / (views + 1e-6)
-        is_agent = g["actor"].apply(lambda s: bool((s == "agent").any()), include_groups=False)
-        # price sensitivity features
-        price_variance = g["price_offered"].var().fillna(0.0)
-        avg_price_seen = g["price_offered"].mean().fillna(0.0)
-        products_viewed = g["product_idx"].nunique()
-
-        return pd.DataFrame({
-            "session_duration_sec": session_duration.astype(float),
-            "avg_time_between_events": avg_time_between.astype(float),
-            "total_interactions": total_interactions.astype(int),
-            "interaction_velocity": interaction_velocity.astype(float),
-            "item_views": views.astype(int),
-            "cart_adds": cart_adds.astype(int),
-            "purchases": purchases.astype(int),
-            "learn_more_clicks": learn_more.astype(int),
-            "conversion_rate": conversion_rate.astype(float),
-            "price_variance": price_variance.astype(float),
-            "avg_price_seen": avg_price_seen.astype(float),
-            "products_viewed": products_viewed.astype(int),
-            "is_agent": is_agent.astype(bool),
-        }).reset_index()
-
-    def get_interaction_data(self) -> np.ndarray:
-        if self._last_interaction_df.empty:
-            return np.array([], dtype=object)
-        return self._last_interaction_df.to_dict(orient="records")
+def make_env(constraints: Optional[BusinessLogicConstraints] = None) -> "PHANTOMEnv":
+    return PHANTOMEnv(constraints=constraints if constraints else BusinessLogicConstraints())  # defaults when none given
 
 
 class PHANTOMEnv(gym.Env):
-    metadata = {"render_modes": []}
+    metadata = {"render_modes": ["human", "ansi"]}
 
-    def __init__(self, constraints: Optional[BusinessLogicConstraints] = None, use_jax: bool = True):
+    def __init__(self, constraints: Optional[BusinessLogicConstraints] = None):
         super().__init__()
-        self.constraints = constraints if isinstance(constraints, BusinessLogicConstraints) else BusinessLogicConstraints()
-        self.use_jax = use_jax and JAX_AVAILABLE
-        self.action_space = spaces.Box(low=-self.constraints.max_price_adjustment,
-                                       high=self.constraints.max_price_adjustment,
-                                       shape=(self.constraints.product_catalogue_size,), dtype=np.float32)
-        n_products = self.constraints.product_catalogue_size
-        self.observation_space = spaces.Dict({
-            "elasticity": spaces.Dict({
-                "price": spaces.Box(
-                    low=np.full((n_products,), self.constraints.system_min_price, dtype=np.float32),
-                    high=np.full((n_products,), self.constraints.system_max_price, dtype=np.float32),
-                    dtype=np.float32),
-                "demand": spaces.Box(
-                    low=np.zeros((n_products,), dtype=np.float32),
-                    high=np.full((n_products,), 1e6, dtype=np.float32),
-                    dtype=np.float32),
-            }),
-            "market": spaces.Dict({
-                "alpha_hat": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
-                "revenue_rate": spaces.Box(low=0.0, high=1e6, shape=(1,), dtype=np.float32),
-                "conversion_rate": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
-                "price_volatility": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
-            }),
-            "cost": spaces.Box(low=0.0, high=self.constraints.system_max_price, shape=(n_products,), dtype=np.float32),
-        })
-        self.commerce_platform = CommercePlatform(
-            product_catalogue_size=self.constraints.product_catalogue_size,
-            max_price=self.constraints.system_max_price,
-            min_price=self.constraints.system_min_price,
-            constraints=self.constraints)
-        self._rng = np.random.default_rng(self.constraints.seed)
-        self.t = 0
-        self._prev_prices: Optional[np.ndarray] = None
-        self.state: Dict[str, Any] = {}
-        self._jax_key = None
-        self._jax_trans = None
-        if self.use_jax:
-            self._jax_key = jax.random.PRNGKey(self.constraints.seed)
-            self._init_jax_transitions()
+        self.c = constraints or BusinessLogicConstraints()
+        self.n = int(self.c.product_catalogue_size)
 
-    def _init_jax_transitions(self):
-        try:
-            human_dir, agent_dir = _resolve_behavior_data_dirs(self.constraints)
-            human_profile = _load_behavioral_profile(
-                "humans",
-                np.ones(self.constraints.product_catalogue_size) * 0.1,
-                human_data_dir=human_dir,
-                agent_data_dir=agent_dir,
-            )
-            agent_profile = _load_behavioral_profile(
-                "agents",
-                np.ones(self.constraints.product_catalogue_size) * 0.1,
-                human_data_dir=human_dir,
-                agent_data_dir=agent_dir,
-            )
-            self._jax_trans = compile_transitions(human_profile, agent_profile).to_jax()
-        except Exception:
-            self._jax_trans = fallback_transitions().to_jax()
+        self._rng = np.random.default_rng(self.c.seed)
+        self._t = 0
+        self._alpha_true = float(self.c.agent_share)
+        self._alpha_hat = float(self.c.agent_share)
+        self._costs = np.zeros(self.n, dtype=np.float32)
+        self._refs = np.zeros(self.n, dtype=np.float32)
+        self._prices: Optional[np.ndarray] = None
+        self._last_sessions: list[Session] = []
+        self._last_coi: COIWindow | None = None
+        self._limbo = Limbo()
+
+        self.action_space = spaces.Box(
+            low=np.full((self.n,), self.c.system_min_price, dtype=np.float32),
+            high=np.full((self.n,), self.c.system_max_price, dtype=np.float32),
+            dtype=np.float32,
+        )
+        self.observation_space = spaces.Dict(
+            {
+                "elasticity": spaces.Dict(
+                    {
+                        "price": spaces.Box(
+                            low=np.full((self.n,), self.c.system_min_price, dtype=np.float32),
+                            high=np.full((self.n,), self.c.system_max_price, dtype=np.float32),
+                            dtype=np.float32,
+                        ),
+                        "demand": spaces.Box(
+                            low=np.zeros((self.n,), dtype=np.float32),
+                            high=np.full((self.n,), 1e9, dtype=np.float32),
+                            dtype=np.float32,
+                        ),
+                    }
+                ),
+                "market": spaces.Dict(
+                    {
+                        "alpha_hat": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
+                        "revenue_rate": spaces.Box(low=0.0, high=1e12, shape=(1,), dtype=np.float32),
+                        "conversion_rate": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
+                        "price_volatility": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
+                    }
+                ),
+                "cost": spaces.Box(
+                    low=np.zeros((self.n,), dtype=np.float32),
+                    high=np.full((self.n,), self.c.system_max_price, dtype=np.float32),
+                    dtype=np.float32,
+                ),
+            }
+        )
+
+    def _reset_catalogue(self) -> None:  # new unit costs in [15, 60), reference prices at a 20-60% markup
+        self._costs = self._rng.uniform(low=15.0, high=60.0, size=self.n).astype(np.float32)
+        markup = 1.0 + self._rng.uniform(low=0.2, high=0.6, size=self.n).astype(np.float32)
+        self._refs = (self._costs * markup).astype(np.float32)
+        self._prices = self._refs.copy()  # current prices start at the reference prices, as an independent array
+
+    def _observe_market(
+        self, prices: np.ndarray
+    ) -> tuple[list[Session], Dict[str, float], np.ndarray, np.ndarray, float, float, int]:
+        """Run one batch of sessions at `prices`; return (sessions, demand_map, demand, purchases, revenue, cost, n_agents)."""
+        sessions, demand_map = put_prices_to_market(
+            prices,
+            costs=self._costs,
+            alpha=self._alpha_true,
+            n_sessions=int(self.c.sessions_per_step),
+            seed=int(self._rng.integers(0, 2**31 - 1)),  # fresh sub-seed per call, drawn from the env RNG
+        )
+        demand_by_product = aggregate_demand_by_product(sessions, demand_map, self.n)
+        purchases, revenue, cost, n_agents = aggregate_purchases(sessions, self._costs, self.n)
+        return sessions, demand_map, demand_by_product, purchases, revenue, cost, n_agents
+
+    def _update_alpha_hat(self, sessions: list[Session]) -> float:
+        """Blend the mean per-session alpha estimate into the running estimate (weight 0.2), clipped to [0, 1]."""
+        per_session = [estimate_session_alpha(sess) for sess in sessions if sess.events]
+        # Sessions without events are not scored; if nothing was scored the estimate is left untouched.
+        if per_session:
+            blended = 0.8 * self._alpha_hat + 0.2 * float(np.mean(per_session))
+            self._alpha_hat = float(np.clip(blended, 0.0, 1.0))
+        return self._alpha_hat
+
+    def _reward(self, prices: np.ndarray, revenue: float, cost: float, volatility: float) -> float:
+        """Profit minus weighted COI leak, price volatility and alpha-estimation error (`prices` is not read)."""
+        leak = float(self._last_coi.leak) if self._last_coi else 0.0
+        reward = float(revenue - cost) - self.c.coi_strength * leak - self.c.w_volatility * volatility
+        return reward - self.c.w_estimation_error * abs(self._alpha_hat - self._alpha_true)
+
+    def _build_obs(
+        self,
+        prices: np.ndarray,
+        demand_by_product: np.ndarray,
+        revenue: float,
+        conversion: float,
+        volatility: float,
+    ) -> Dict[str, Any]:
+        """Pack the arrays and scalars into the nested float32 dict ("elasticity", "market", "cost")."""
+        f32 = np.float32
+        market = {
+            "alpha_hat": np.array([self._alpha_hat], dtype=f32),
+            "revenue_rate": np.array([revenue], dtype=f32),
+            "conversion_rate": np.array([conversion], dtype=f32),
+            "price_volatility": np.array([volatility], dtype=f32),
+        }
+        elasticity = {"price": prices.astype(f32), "demand": demand_by_product.astype(f32)}
+        return {"elasticity": elasticity, "market": market, "cost": self._costs.astype(f32)}
 
     def reset(self, seed: Optional[int] = None, options: Optional[dict] = None):
         super().reset(seed=seed)
         if seed is not None:
             self._rng = np.random.default_rng(seed)
-            self.commerce_platform._rng = np.random.default_rng(seed)
-            if self.use_jax:
-                self._jax_key = jax.random.PRNGKey(seed)
-        self.commerce_platform.alpha_hat = self.constraints.agent_share
-        self.t = 0
-        init_prices = self._rng.uniform(
-            low=60.0,
-            high=140.0,
-            size=(self.constraints.product_catalogue_size,),
-        ).astype(np.float32)
-        self.commerce_platform.unit_cost = self._rng.uniform(
-            low=15.0,
-            high=60.0,
-            size=(self.constraints.product_catalogue_size,),
-        ).astype(np.float32)
-        self.commerce_platform.base_price = init_prices.copy()
-        self._prev_prices = init_prices.copy()
-        self.state = {
-            "elasticity": {
-                "price": init_prices,
-                "demand": np.zeros((self.constraints.product_catalogue_size,), dtype=np.float32),
-            },
-            "market": {
-                "alpha_hat": np.array([self.constraints.agent_share], dtype=np.float32),
-                "revenue_rate": np.array([0.0], dtype=np.float32),
-                "conversion_rate": np.array([0.0], dtype=np.float32),
-                "price_volatility": np.array([0.0], dtype=np.float32),
-            },
-            "cost": self.commerce_platform.unit_cost.astype(np.float32),
-        }
-        return self.state, {}
+        self._t = 0
+        self._alpha_true = float(np.clip(self.c.agent_share, *self.c.alpha_bounds))
+        self._alpha_hat = float(self.c.agent_share)
+        self._reset_catalogue()
+        self._limbo = Limbo()
+        self._last_sessions = []
+        self._last_coi = None
 
-    def _step_jax(self, new_prices: np.ndarray) -> Tuple[Dict, Dict]:
-        self._jax_key, subkey = jax.random.split(self._jax_key)
-        alpha = float(np.clip(self.commerce_platform.alpha_hat, 0.0, 0.95))
-        n_agent = max(1, int(self.constraints.sessions_per_step * alpha))
-        n_human = max(1, self.constraints.sessions_per_step - n_agent)
-        batch = sample_sessions(subkey, self._jax_trans, n_human, n_agent, len(new_prices))
-        sim = compute_metrics(batch, new_prices, self.commerce_platform.unit_cost, self.commerce_platform.base_price)
-        result = {"revenue_observed": sim.revenue, "revenue_oracle": sim.revenue_oracle,
-                  "agent_loss": sim.agent_loss, "coi": sim.coi, "look_to_book": sim.look_to_book,
-                  "mean_sale_price": sim.mean_sale_price, "true_human_purchases": sim.n_human_purchases,
-                  "true_agent_purchases": sim.n_agent_purchases}
-        diagnostics = {"demand_human": sim.demand_human, "demand_agent": sim.demand_agent, "alpha_hat": alpha}
-        return result, diagnostics
+        prices = self._prices if self._prices is not None else np.zeros(self.n, dtype=np.float32)
+        obs = self._build_obs(prices, np.zeros(self.n, dtype=np.float32), 0.0, 0.0, 0.0)
+        return obs, {"alpha_true": self._alpha_true}
 
-    def step(self, action: np.ndarray):
-        self.t += 1
-        base_prices = self.state["elasticity"]["price"].astype(np.float32)
-        new_prices = np.clip(base_prices * (1.0 + action.astype(np.float32)),
-                           self.constraints.system_min_price,
-                           self.constraints.system_max_price).astype(np.float32)
+    def step(self, action: np.ndarray) -> Tuple[Dict[str, Any], float, bool, bool, Dict[str, Any]]:
+        if self._prices is None:
+            raise RuntimeError("reset() must be called before step()")
 
-        self.state["elasticity"]["price"] = new_prices
-        if self.use_jax:
-            result, diagnostics = self._step_jax(new_prices)
-        else:
-            interactions_df, diagnostics = self.commerce_platform._simulate_sessions(new_prices)
-            result = self.commerce_platform.compute_interaction_features(interactions_df)
-        COI = float(result.get("coi", 0.0))
-
-        demand_vector = diagnostics.get("demand_human", np.zeros_like(new_prices)) + diagnostics.get(
-            "demand_agent", np.zeros_like(new_prices)
+        prev = self._prices
+        prices = constrain_prices(
+            prev,
+            np.asarray(action, dtype=np.float32),
+            costs=self._costs,
+            min_price=float(self.c.system_min_price),
+            max_price=float(self.c.system_max_price),
+            max_adjustment=float(self.c.max_price_adjustment),
+            min_margin_pct=float(self.c.min_margin_pct),
         )
-        self.state["elasticity"]["demand"] = demand_vector.astype(np.float32)
+        self._prices = prices
+        self._limbo.add_update("prices", prices)
 
-        volatility = 0.0 if self._prev_prices is None else \
-            float(np.mean(np.abs((new_prices - self._prev_prices) / (self._prev_prices + 1e-6))))
-        self._prev_prices = new_prices.copy()
+        sessions, demand_map, demand_by_product, purchases, revenue, cost, n_agents = self._observe_market(prices)
+        self._last_sessions = sessions
+        self._limbo.add_update("demand", demand_map)
 
-        # update market observation features
-        total_demand = float(np.sum(demand_vector))
-        total_purchases = float(result.get("true_human_purchases", 0.0) + result.get("true_agent_purchases", 0.0))
-        conv_rate = total_purchases / max(total_demand, 1.0)
-        self.state["market"] = {
-            "alpha_hat": np.array([float(diagnostics.get("alpha_hat", self.commerce_platform.alpha_hat))], dtype=np.float32),
-            "revenue_rate": np.array([float(result.get("revenue_observed", 0.0))], dtype=np.float32),
-            "conversion_rate": np.array([float(np.clip(conv_rate, 0.0, 1.0))], dtype=np.float32),
-            "price_volatility": np.array([float(volatility)], dtype=np.float32),
-        }
-        self.state["cost"] = self.commerce_platform.unit_cost.astype(np.float32)
+        self._update_alpha_hat(self._last_sessions)
+        self._last_coi = compute_coi_window(self._last_sessions, self._costs, demand_mapping=demand_map)
 
-        # extract metrics with safe defaults for incomplete simulation
-        revenue_observed = float(result.get("revenue_observed", 0.0))
-        agent_loss = float(result.get("agent_loss", 0.0))
+        self._alpha_true = float(np.clip(self._alpha_true + self.c.alpha_drift, *self.c.alpha_bounds))
+        volatility = float(np.std((prices - prev) / (prev + 1e-6)))
+        reward = float(self._reward(prices, revenue, cost, volatility))
+        conversion = float(np.sum(purchases) / max(len(self._last_sessions), 1))
 
-        reward = (revenue_observed
-                  - COI
-                  - self.constraints.w_agent_loss * agent_loss
-                  - self.constraints.w_volatility * volatility
-                  - self.constraints.w_estimation_error)
+        self._t += 1
+        terminated = self._t >= int(self.c.max_steps)
 
-        terminated = self.t >= self.constraints.episode_length
+        obs = self._build_obs(prices, demand_by_product, revenue, conversion, min(volatility, 1.0))
         info = {
-            "t": self.t,
-            "revenue_observed": revenue_observed,
-            "revenue_oracle": float(result.get("revenue_oracle", revenue_observed)),
-            "agent_loss": agent_loss,
-            "ux_volatility": volatility,
-            "look_to_book": float(result.get("look_to_book", 0.0)),
-            "mean_sale_price": float(result.get("mean_sale_price", 0.0)),
-            "true_human_purchases_total": float(result.get("true_human_purchases", 0.0)),
-            "true_agent_purchases_total": float(result.get("true_agent_purchases", 0.0)),
-            "coi": COI,
-            "alpha_hat": diagnostics.get("alpha_hat", self.commerce_platform.alpha_hat),
-            "mean_human_demand": float(np.mean(diagnostics.get("demand_human", np.zeros_like(new_prices)))),
-            "mean_agent_demand": float(np.mean(diagnostics.get("demand_agent", np.zeros_like(new_prices)))),
+            "step": self._t,
+            "reward": reward,
+            "revenue": float(revenue),
+            "profit": float(revenue - cost),
+            "n_sessions": int(self.c.sessions_per_step),
+            "n_agents": int(n_agents),
+            "alpha_true": float(self._alpha_true),
+            "alpha_hat": float(self._alpha_hat),
+            "alpha_error": float(abs(self._alpha_hat - self._alpha_true)),
+            "price_std": float(np.std(prices)),
+            "price_volatility": float(volatility),
         }
-        if "delta_h_mean" in diagnostics:
+        if self._last_coi is not None:
             info.update(
                 {
-                    "delta_h_mean": diagnostics["delta_h_mean"],
-                    "delta_a_mean": diagnostics["delta_a_mean"],
-                    "prob_agent_mean": diagnostics["prob_agent_mean"],
+                    "coi_policy": float(self._last_coi.policy),
+                    "coi_agent": float(self._last_coi.agent),
+                    "coi_leakage": float(self._last_coi.leak),
+                    "coi_survival": float(self._last_coi.survival_ratio),
+                    "coi_erosion": float(coi_erosion(self._last_coi.policy, self._last_coi.agent)),
                 }
             )
-        return self.state, float(reward), terminated, False, info
+        return obs, reward, terminated, False, info
 
+    def render(self, mode: str = "human") -> str | None:
+        if self._prices is None:
+            return None
+        out = (
+            f"t={self._t}/{self.c.max_steps} "
+            f"alpha_true={self._alpha_true:.3f} alpha_hat={self._alpha_hat:.3f} "
+            f"price_std={float(np.std(self._prices)):.2f}"
+        )
+        if mode == "human":
+            print(out)
+        return out
 
-if __name__ == "__main__":
-    import matplotlib.pyplot as plt
-    from collections import defaultdict
-
-    env = PHANTOMEnv(constraints=BusinessLogicConstraints())
-    obs, _ = env.reset(seed=42)
-    metrics = defaultdict(list)
-    total_reward = 0.0
-    done = False
-
-    while not done:
-        action = env.action_space.sample()
-        obs, reward, done, _, info = env.step(action)
-        total_reward += reward
-        p_mean = float(np.mean(obs["elasticity"]["price"]))
-        q_mean = float(np.mean(obs["elasticity"]["demand"]))
-        p_std = float(np.std(obs["elasticity"]["price"]))
-
-        metrics['t'].append(info['t'])
-        metrics['price_mean'].append(p_mean)
-        metrics['price_std'].append(p_std)
-        metrics['demand_mean'].append(q_mean)
-        metrics['revenue_observed'].append(info['revenue_observed'])
-        metrics['revenue_oracle'].append(info['revenue_oracle'])
-        metrics['agent_loss'].append(info['agent_loss'])
-        metrics['ux_volatility'].append(info['ux_volatility'])
-        metrics['look_to_book'].append(info['look_to_book'])
-        metrics['reward'].append(reward)
-        metrics['human_purchases'].append(info['true_human_purchases_total'])
-        metrics['agent_purchases'].append(info['true_agent_purchases_total'])
-        metrics['coi'].append(info.get('coi', 0.0))
-        metrics['alpha_hat'].append(info.get('alpha_hat', env.commerce_platform.alpha_hat))
-        metrics['mean_human_demand'].append(info.get('mean_human_demand', 0.0))
-        metrics['mean_agent_demand'].append(info.get('mean_agent_demand', 0.0))
-        metrics['delta_h_mean'].append(info.get('delta_h_mean', 0.0))
-        metrics['delta_a_mean'].append(info.get('delta_a_mean', 0.0))
-        metrics['prob_agent_mean'].append(info.get('prob_agent_mean', 0.0))
-
-        if info['t'] % 20 == 0 or done:
-            print(f"t={info['t']:03d} p={p_mean:6.2f}±{p_std:4.2f} q={q_mean:6.2f} "
-                  f"rev={info['revenue_observed']:7.2f} oracle={info['revenue_oracle']:7.2f} "
-                  f"loss={info['agent_loss']:6.2f} ux={info['ux_volatility']:.3f} "
-                  f"coi={info.get('coi', 0.0):6.2f} alpha={info.get('alpha_hat', 0.0):4.2f} "
-                  f"ltb={info['look_to_book']:5.2f} r={reward:7.2f}")
-
-    print(f"total_reward={total_reward:.2f}")
-
-    fig, axes = plt.subplots(3, 4, figsize=(18, 12))
-    fig.suptitle('PHANTOM Environment Run', fontsize=14, fontweight='bold')
-
-    plot_configs = [
-        ('price_mean', 'Mean Price', 'Price'),
-        ('demand_mean', 'Mean Demand (All)', 'Demand'),
-        ('mean_human_demand', 'Mean Human Demand', 'Count'),
-        ('mean_agent_demand', 'Mean Agent Demand', 'Count'),
-        ('revenue_observed', 'Revenue (Observed)', 'Revenue'),
-        ('agent_loss', 'Agent Loss (Oracle - Observed)', 'Loss'),
-        ('coi', 'Cost of Information', 'COI'),
-        ('alpha_hat', 'Estimated α̂', 'alpha'),
-        ('ux_volatility', 'UX Volatility (Price Change)', 'Volatility'),
-        ('look_to_book', 'Look-to-Book Ratio', 'Ratio'),
-        ('reward', 'Step Reward', 'Reward'),
-        ('prob_agent_mean', 'Avg Agent Probability', 'Probability'),
-    ]
-
-    for idx, (key, title, ylabel) in enumerate(plot_configs):
-        ax = axes[idx // 4, idx % 4]
-        ax.plot(metrics['t'], metrics[key], color='blue', alpha=0.7, linewidth=1.5)
-        ax.set_xlabel('Step')
-        ax.set_ylabel(ylabel)
-        ax.set_title(title, fontsize=10, fontweight='bold')
-        ax.grid(True, alpha=0.3)
-
-    plt.tight_layout()
-    plt.savefig('phantom_env_comparison.png', dpi=150, bbox_inches='tight')
-    print("Plot saved to phantom_env_comparison.png")
-    plt.show()
+    def close(self) -> None:
+        return

From 83d9bb25521d8a0d7cbe22e10630ff2a8e1bbd59 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 28 Jan 2026 14:04:57 +0100
Subject: [PATCH 56/99] chore: properly developing

---
 engine/lib/behavior.py           | 48 ++++++++++++++++++++++++++++++++
 engine/lib/demand.py             | 40 ++++++++++++++++++++++++++
 engine/main.py                   |  0
 sim/rl/behavior_loader/models.py |  4 +++
 4 files changed, 92 insertions(+)
 create mode 100644 engine/lib/behavior.py
 create mode 100644 engine/lib/demand.py
 create mode 100644 engine/main.py

diff --git a/engine/lib/behavior.py b/engine/lib/behavior.py
new file mode 100644
index 0000000..69b6649
--- /dev/null
+++ b/engine/lib/behavior.py
@@ -0,0 +1,48 @@
+from sim.rl.behavior_loader.models import BehaviorModel, AgentBehaviorModel, aggregate_event_transitions
+import pandas as pd
+import numpy as np
+from .demand import generate_demand
+
+base_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments"
+human_dir, agent_dir = f"{base_dir}/collected_data/", f"{base_dir}/agents/collected_data/"
+
+
+def adjust_behavior_to_condition(condition, transition_matrix):
+    # transition matrix just maps probability of eventA to eventB
+    # we enhance this that eventA-productI to eventB-productJ... based on the condition of interest
+    # this is to simulate the impact of demand onto the behavior
+    # NxN -> (N*(P + 1))x(N*(P + 1)) where P is number of products
+    new_transitions = transition_matrix.copy()
+    for col in new_transitions.columns:
+        for product in range(len(condition)):
+            # adjust the transition probability based on the demand condition
+            newname = f"{col}_product{product}"
+            new_transitions[newname] = new_transitions[col] * (condition[product] / np.sum(condition))
+    for row in transition_matrix.index:
+        for product in range(len(condition)):
+            newname = f"{row}_product{product}"
+            new_transitions.loc[newname] = new_transitions.loc[row] * (condition[product] / np.sum(condition))
+
+    return new_transitions.fillna(0.0)
+
+def sample_behavior(condition, human=True, max_len=40):
+    model = BehaviorModel(human_dir) if human else AgentBehaviorModel(agent_dir)
+    mdp = model.build_MDP()
+    raw_events = aggregate_event_transitions(mdp) # this gets us transtition between events (blind to products or prices)
+    # staet: {state: p} is raw_events we needc a matrix a pivot table
+    events_pivot = pd.DataFrame(raw_events).fillna(0.0)
+    # now adjust the transition matrix based on the condition to get a more informed transition matrix
+    adjusted_transitions = adjust_behavior_to_condition(condition, events_pivot)
+
+    trajectory = [np.random.choice(adjusted_transitions.index)]
+    while len(trajectory) < max_len or 'checkout' in trajectory[-1]:
+        probs = adjusted_transitions.loc[trajectory[-1]].values
+        sample = np.random.choice(adjusted_transitions.columns, p=probs/np.sum(probs) if np.sum(probs) > 0 else None)
+        trajectory.append(sample)
+    return trajectory
+
+if __name__ == "__main__":
+    t=sample_behavior(generate_demand(np.array([10,20,30])), human=True)
+    print(t)
+    t=sample_behavior(generate_demand(np.array([10,20,30])), human=False)
+    print(t)
diff --git a/engine/lib/demand.py b/engine/lib/demand.py
new file mode 100644
index 0000000..52e28e7
--- /dev/null
+++ b/engine/lib/demand.py
@@ -0,0 +1,40 @@
+import numpy as np
+
+def generate_demand(prices):
+    # assumption 1: each product has an intrinsic valuation drawn from a normal distribution centered at 50
+    product_valuations = np.random.normal(loc=50.0, scale=10.0, size=len(prices))
+    # assumption 2: demand decreases as price increases, following a simple linear model
+    demand = np.maximum(0, product_valuations - prices) # demand cannot be negative
+    demand = demand / np.sum(demand) * 100  # normalize to total demand of 1000 units so demand output is within [0, 100]
+    return demand
+
+def estimate_demand(trajectories):
+    demand_estimate = {}
+    for traj in trajectories:
+        for event in traj:
+            if 'view_product' in event:
+                product_id = int(event.split('_')[-1].replace('product', ''))
+                demand_estimate[product_id] = demand_estimate.get(product_id, 0) + 1
+    total_views = sum(demand_estimate.values())
+    for product_id in demand_estimate:
+        demand_estimate[product_id] = (demand_estimate[product_id] / total_views) * 100  # normalize to percentage
+    return demand_estimate
+
+# Example usage
+if __name__ == "__main__":
+    np.random.seed(42)
+    prices = np.array([20.0, 35.0, 50.0, 65.0])
+    demand = generate_demand(prices)
+    print("Generated Demand:", demand)
+    from .behavior import sample_behavior
+    N, alphat =200, 0.1
+    trajectories = []
+    for _ in range(int(N*(1 - alphat))):
+        trajectories.append(sample_behavior(demand, human=True))
+    for _ in range(int(N*alphat)):
+        trajectories.append(sample_behavior(demand, human=False))
+    demand_estimate = estimate_demand(trajectories)
+    print("Estimated Demand from Behavior:", demand_estimate)
+    delta = {k: demand_estimate.get(k, 0) - demand[i] for i, k in enumerate(range(len(prices)))}
+    delta = np.mean([np.abs(v) for v in delta.values()])
+    print("Demand Delta:", delta)
diff --git a/engine/main.py b/engine/main.py
new file mode 100644
index 0000000..e69de29
diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py
index 33f83f4..8cc0214 100644
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -226,6 +226,7 @@ if __name__ == "__main__":
 
     agent_model = AgentBehaviorModel(agent_dir)
     agent_mdp = agent_model.build_MDP()
+    print(agent_mdp)
     print(f"AGENT... Built MDP: {agent_mdp['num_states']} states, "
           f"{sum(len(t) for t in agent_mdp['transitions'].values())} transitions")
     if not agent_mdp['states']:
@@ -234,6 +235,9 @@ if __name__ == "__main__":
 
     human_evt = aggregate_event_transitions(human_mdp)
     agent_evt = aggregate_event_transitions(agent_mdp)
+    print(agent_evt)
+
+
     common = set(human_evt.keys()) & set(agent_evt.keys())
 
     if not common:

From 6e06081d60a01909f7bf51c1a8aa7abe8f2d9b62 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 28 Jan 2026 16:09:28 +0100
Subject: [PATCH 57/99] porting to better

---
 engine/engine.py     | 66 ++++++++++++++++++++++++++++++++++++++++++++
 engine/lib/demand.py |  8 ++++--
 engine/main.py       |  0
 engine/wrapper.py    |  5 ++++
 4 files changed, 77 insertions(+), 2 deletions(-)
 create mode 100644 engine/engine.py
 delete mode 100644 engine/main.py
 create mode 100644 engine/wrapper.py

diff --git a/engine/engine.py b/engine/engine.py
new file mode 100644
index 0000000..e304aeb
--- /dev/null
+++ b/engine/engine.py
@@ -0,0 +1,66 @@
+from sys import platform
+import numpy as np
+from .lib.demand import generate_demand, estimate_demand
+from .lib.behavior import sample_behavior
+from logging import INFO, getLogger
+logger = getLogger(__name__)
+logger.setLevel(INFO)
+
+
+
+class MarketEngine():
+    """Simulated market that answers a price vector with a demand estimate built from sampled sessions."""
+    def __init__(self,
+                 alpha = 0.5,
+                 N = 100,
+                 demand_distribution = (50, 10),
+                 demand_sampling_function = np.random.normal):
+        # alpha is the agent share of the N sessions sampled on every act() call
+        self.Nagents = int(N*alpha)
+        self.Nhumans = int(N*(1-alpha))
+        self.demand = (demand_sampling_function, demand_distribution)
+
+    def act(self, prices):
+        """Return the demand estimated from Nhumans human and Nagents agent sessions sampled at `prices`."""
+        demand = generate_demand(prices, *self.demand)
+        sample_n = lambda n, human: [sample_behavior(demand, human=human) for _ in range(n)]
+        return estimate_demand(sample_n(self.Nhumans, True) + sample_n(self.Nagents, False))
+
+    def measure(self):
+        pass  # placeholder: no measurement is implemented yet
+
+class PricingEngine():
+    """Stub pricing policy: ignores the demand it is given and quotes random prices."""
+    def __init__(self) -> None:
+        pass
+
+    def act(self, demand):
+        return np.random.uniform(low=25, high=100, size=10)  # 10 prices drawn uniformly from [25, 100)
+
+
+
+class Limbo():
+    """Turn-taking loop in which the platform and the market alternately act on each other's last output."""
+    def __init__(self, platform, market) -> None:
+        self.platform = platform
+        self.market = market
+        # the platform moves first; its first input is None
+        self.platform_turn = True
+        self.output = None
+
+    def step(self):
+        """Run one half-turn: call the current actor, print its output, then hand over the turn."""
+        if self.platform_turn:
+            actor = self.platform
+        else:
+            actor = self.market
+        self.output = actor.act(self.output)
+        print(self.output)
+        self.platform_turn = not self.platform_turn
+
+if __name__ == "__main__":
+    platform = PricingEngine()
+    market = MarketEngine()
+    limbo = Limbo(platform, market)
+    for _ in range(10):
+        limbo.step()
diff --git a/engine/lib/demand.py b/engine/lib/demand.py
index 52e28e7..687afe8 100644
--- a/engine/lib/demand.py
+++ b/engine/lib/demand.py
@@ -1,11 +1,15 @@
+import logging
 import numpy as np
+from logging import getLogger
+logger = getLogger(__name__)
 
-def generate_demand(prices):
+def generate_demand(prices, distribution_method = np.random.normal, distribution_params = (50.0, 10.0)):
     # assumption 1: each product has an intrinsic valuation drawn from a normal distribution centered at 50
-    product_valuations = np.random.normal(loc=50.0, scale=10.0, size=len(prices))
+    product_valuations = distribution_method(*distribution_params, size=len(prices))
     # assumption 2: demand decreases as price increases, following a simple linear model
     demand = np.maximum(0, product_valuations - prices) # demand cannot be negative
     demand = demand / np.sum(demand) * 100  # normalize to total demand of 1000 units so demand output is within [0, 100]
+    logger.info(f"Generated demand for prices {prices}: {demand} with valuations from distribution {distribution_params}")
     return demand
 
 def estimate_demand(trajectories):
diff --git a/engine/main.py b/engine/main.py
deleted file mode 100644
index e69de29..0000000
diff --git a/engine/wrapper.py b/engine/wrapper.py
new file mode 100644
index 0000000..1cf5815
--- /dev/null
+++ b/engine/wrapper.py
@@ -0,0 +1,5 @@
+import gymnasium as gym
+from gymnasium import spaces
+from engine import Limbo
+
+class PHANTOM(gym.Env, Limbo):

From 772772b5b93323212f2473312de62ad238d9e733 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Thu, 29 Jan 2026 10:01:53 +0100
Subject: [PATCH 58/99] chore: better wrapping and more performant

---
 engine/lib/behavior.py |  43 ++++++-------
 engine/lib/demand.py   |   3 +-
 engine/wrapper.py      | 141 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 162 insertions(+), 25 deletions(-)

diff --git a/engine/lib/behavior.py b/engine/lib/behavior.py
index 69b6649..1822dde 100644
--- a/engine/lib/behavior.py
+++ b/engine/lib/behavior.py
@@ -6,33 +6,32 @@ from .demand import generate_demand
 base_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments"
 human_dir, agent_dir = f"{base_dir}/collected_data/", f"{base_dir}/agents/collected_data/"
 
+_cache = {}  # lazy cache for models and base pivots
+
+def _get_base_pivot(human: bool):
+    key = 'human' if human else 'agent'
+    if key not in _cache:
+        model = BehaviorModel(human_dir) if human else AgentBehaviorModel(agent_dir)
+        mdp = model.build_MDP()
+        _cache[key] = pd.DataFrame(aggregate_event_transitions(mdp)).fillna(0.0)
+    return _cache[key]
 
 def adjust_behavior_to_condition(condition, transition_matrix):
-    # transition matrix just maps probability of eventA to eventB
-    # we enhance this that eventA-productI to eventB-productJ... based on the condition of interest
-    # this is to simulate the impact of demand onto the behavior
-    # NxN -> (N*(P + 1))x(N*(P + 1)) where P is number of products
-    new_transitions = transition_matrix.copy()
-    for col in new_transitions.columns:
-        for product in range(len(condition)):
-            # adjust the transition probability based on the demand condition
-            newname = f"{col}_product{product}"
-            new_transitions[newname] = new_transitions[col] * (condition[product] / np.sum(condition))
-    for row in transition_matrix.index:
-        for product in range(len(condition)):
-            newname = f"{row}_product{product}"
-            new_transitions.loc[newname] = new_transitions.loc[row] * (condition[product] / np.sum(condition))
+    # expand NxN transition matrix to (N*P)x(N*P) weighted by demand condition
+    cond_norm = condition / np.sum(condition)
+    n_products = len(condition)
+    base_vals = transition_matrix.values
+    base_cols, base_rows = transition_matrix.columns.tolist(), transition_matrix.index.tolist()
 
-    return new_transitions.fillna(0.0)
+    # expand via kronecker-like tiling: each cell becomes a P*P block weighted by outer product of cond_norm
+    expanded = np.kron(base_vals, np.outer(cond_norm, cond_norm))
+    new_cols = [f"{c}_product{p}" for c in base_cols for p in range(n_products)]
+    new_rows = [f"{r}_product{p}" for r in base_rows for p in range(n_products)]
+    return pd.DataFrame(expanded, index=new_rows, columns=new_cols)
 
 def sample_behavior(condition, human=True, max_len=40):
-    model = BehaviorModel(human_dir) if human else AgentBehaviorModel(agent_dir)
-    mdp = model.build_MDP()
-    raw_events = aggregate_event_transitions(mdp) # this gets us transtition between events (blind to products or prices)
-    # staet: {state: p} is raw_events we needc a matrix a pivot table
-    events_pivot = pd.DataFrame(raw_events).fillna(0.0)
-    # now adjust the transition matrix based on the condition to get a more informed transition matrix
-    adjusted_transitions = adjust_behavior_to_condition(condition, events_pivot)
+    base_pivot = _get_base_pivot(human)
+    adjusted_transitions = adjust_behavior_to_condition(condition, base_pivot)
 
     trajectory = [np.random.choice(adjusted_transitions.index)]
     while len(trajectory) < max_len or 'checkout' in trajectory[-1]:
diff --git a/engine/lib/demand.py b/engine/lib/demand.py
index 687afe8..7215f7c 100644
--- a/engine/lib/demand.py
+++ b/engine/lib/demand.py
@@ -8,7 +8,8 @@ def generate_demand(prices, distribution_method = np.random.normal, distribution
     product_valuations = distribution_method(*distribution_params, size=len(prices))
     # assumption 2: demand decreases as price increases, following a simple linear model
     demand = np.maximum(0, product_valuations - prices) # demand cannot be negative
-    demand = demand / np.sum(demand) * 100  # normalize to total demand of 1000 units so demand output is within [0, 100]
+    total = np.sum(demand)
+    demand = demand / total * 100 if total > 0 else demand  # normalize to percentage, avoid div by zero
     logger.info(f"Generated demand for prices {prices}: {demand} with valuations from distribution {distribution_params}")
     return demand
 
diff --git a/engine/wrapper.py b/engine/wrapper.py
index 1cf5815..5543670 100644
--- a/engine/wrapper.py
+++ b/engine/wrapper.py
@@ -1,5 +1,142 @@
 import gymnasium as gym
 from gymnasium import spaces
-from engine import Limbo
+import numpy as np
+import matplotlib.pyplot as plt
+from .engine import Limbo, MarketEngine, PricingEngine
 
-class PHANTOM(gym.Env, Limbo):
+
+class PHANTOM(gym.Env):
+    """Gymnasium wrapper for the Limbo pricing-market simulation. Platform sets prices, market responds with demand."""
+    metadata = {"render_modes": ["human", "ansi"]}
+
+    def __init__(self,
+                 n_products: int = 10,
+                 alpha: float = 0.3,
+                 N: int = 100,
+                 price_bounds: tuple = (10.0, 150.0),
+                 lambda_coi: float = 0.1,  # coi leakage penalty weight
+                 render_mode: str = None):
+        super().__init__()
+        self.n_products = n_products
+        self.price_bounds = price_bounds
+        self.lambda_coi = lambda_coi
+        self.render_mode = render_mode
+
+        self.market = MarketEngine(alpha=alpha, N=N)
+        self._platform_stub = PricingEngine()
+        self._limbo = Limbo(self._platform_stub, self.market)
+
+        # action: continuous prices for each product
+        self.action_space = spaces.Box(
+            low=price_bounds[0], high=price_bounds[1],
+            shape=(n_products,), dtype=np.float32
+        )
+        # observation: demand estimate + previous prices
+        self.observation_space = spaces.Dict({
+            "demand": spaces.Box(low=0.0, high=100.0, shape=(n_products,), dtype=np.float32),
+            "prices": spaces.Box(low=price_bounds[0], high=price_bounds[1], shape=(n_products,), dtype=np.float32),
+        })
+
+        self._prices = None
+        self._demand = None
+        self._step_count = 0
+        self._demand_history = []  # list of demand arrays over time
+        self._price_history = []   # list of price arrays over time
+        self._fig, self._axes = None, None
+
+    def _get_obs(self) -> dict:
+        demand_arr = np.array([self._demand.get(i, 0.0) for i in range(self.n_products)], dtype=np.float32)
+        return {"demand": demand_arr, "prices": self._prices.astype(np.float32)}
+
+    def _compute_reward(self, prices: np.ndarray, demand: dict) -> float:
+        """Return the revenue proxy sum(price_i * demand_i); products missing from `demand` count as zero."""
+        demand_arr = np.array([demand.get(i, 0.0) for i in range(self.n_products)])
+        revenue = np.sum(prices * demand_arr)  # the lambda_coi leakage penalty is not applied yet
+        return float(revenue)
+
+    def _record_history(self):
+        demand_arr = np.array([self._demand.get(i, 0.0) for i in range(self.n_products)])
+        self._demand_history.append(demand_arr)
+        self._price_history.append(self._prices.copy())
+
+    def reset(self, seed=None, options=None):
+        super().reset(seed=seed)
+        self._prices = np.random.uniform(*self.price_bounds, size=self.n_products)
+        self._demand = self.market.act(self._prices)
+        self._step_count = 0
+        self._demand_history, self._price_history = [], []
+        self._record_history()
+        if self._fig: plt.close(self._fig)
+        self._fig, self._axes = None, None
+        return self._get_obs(), {}
+
+    def step(self, action: np.ndarray):
+        self._prices = np.clip(action, *self.price_bounds)
+        self._demand = self.market.act(self._prices)
+        self._step_count += 1
+        self._record_history()
+
+        reward = self._compute_reward(self._prices, self._demand)
+        terminated = self._step_count >= 100
+        truncated = False
+
+        return self._get_obs(), reward, terminated, truncated, {"step": self._step_count}
+
+    def render(self):
+        if self.render_mode == "human":
+            if self._fig is None:
+                plt.ion()
+                self._fig, self._axes = plt.subplots(2, 2, figsize=(12, 8))
+                self._fig.suptitle("PHANTOM Environment")
+
+            demand_mat = np.array(self._demand_history).T  # shape: (n_products, timesteps)
+            price_mat = np.array(self._price_history).T
+            revenue_per_step = np.sum(demand_mat * price_mat, axis=0)  # revenue = demand * price
+            demand_variance = np.var(demand_mat, axis=0)  # how spread demand is across products
+
+            for row in self._axes:
+                for ax in row: ax.clear()
+
+            self._axes[0, 0].imshow(demand_mat, aspect='auto', cmap='viridis', origin='lower')
+            self._axes[0, 0].set_xlabel("Time Step")
+            self._axes[0, 0].set_ylabel("Product")
+            self._axes[0, 0].set_title("Demand Over Time")
+
+            self._axes[0, 1].imshow(price_mat, aspect='auto', cmap='plasma', origin='lower')
+            self._axes[0, 1].set_xlabel("Time Step")
+            self._axes[0, 1].set_ylabel("Product")
+            self._axes[0, 1].set_title("Price Over Time")
+
+            self._axes[1, 0].plot(revenue_per_step, color='teal', linewidth=1.5)
+            self._axes[1, 0].set_xlabel("Time Step")
+            self._axes[1, 0].set_ylabel("Revenue")
+            self._axes[1, 0].set_title("Revenue per Step")
+
+            self._axes[1, 1].plot(demand_variance, color='orangered', linewidth=1.5)
+            self._axes[1, 1].set_xlabel("Time Step")
+            self._axes[1, 1].set_ylabel("Variance")
+            self._axes[1, 1].set_title("Demand Variance")
+
+            self._fig.tight_layout()
+            self._fig.canvas.draw()
+            self._fig.canvas.flush_events()
+            plt.pause(0.01)
+
+        elif self.render_mode == "ansi":
+            return f"step={self._step_count}, prices={self._prices}, demand={self._demand}"
+        return None
+
+    def close(self):
+        if self._fig: plt.close(self._fig)
+        self._fig, self._axes = None, None
+
+
+if __name__ == "__main__":
+    env = PHANTOM(n_products=100, render_mode="human")
+    obs, _ = env.reset()
+    for _ in range(100):
+        action = env.action_space.sample()
+        obs, reward, term, trunc, info = env.step(action)
+        env.render()
+        print(f"Reward: {reward:.2f}")
+        if term: break

From 10e8397eec4a0f210d1e2c324f51be3a5731c57c Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Thu, 29 Jan 2026 13:11:52 +0100
Subject: [PATCH 59/99] chore: better plotting

---
 engine/wrapper.py | 175 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 139 insertions(+), 36 deletions(-)

diff --git a/engine/wrapper.py b/engine/wrapper.py
index 5543670..7637998 100644
--- a/engine/wrapper.py
+++ b/engine/wrapper.py
@@ -2,6 +2,8 @@ import gymnasium as gym
 from gymnasium import spaces
 import numpy as np
 import matplotlib.pyplot as plt
+from matplotlib.gridspec import GridSpec
+import matplotlib.colors as mcolors
 from .engine import Limbo, MarketEngine, PricingEngine
 
 
@@ -21,6 +23,8 @@ class PHANTOM(gym.Env):
         self.price_bounds = price_bounds
         self.lambda_coi = lambda_coi
         self.render_mode = render_mode
+        self.alpha = alpha
+        self.N = N
 
         self.market = MarketEngine(alpha=alpha, N=N)
         self._platform_stub = PricingEngine()
@@ -40,9 +44,16 @@ class PHANTOM(gym.Env):
         self._prices = None
         self._demand = None
         self._step_count = 0
-        self._demand_history = []  # list of demand arrays over time
-        self._price_history = []   # list of price arrays over time
-        self._fig, self._axes = None, None
+        self._demand_history = []
+        self._price_history = []
+        self._revenue_history = []
+        self._fig = None
+        self._gs = None
+        self._dashboard_colors = {
+            'bg': '#f5f0e8', 'panel': '#ebe3d5', 'accent': '#c9b99a',
+            'text': '#3d3229', 'green': '#5c7a5c', 'red': '#8b4049',
+            'blue': '#5a7384', 'orange': '#b87333', 'purple': '#7d6b7d'
+        }
 
     def _get_obs(self) -> dict:
         demand_arr = np.array([self._demand.get(i, 0.0) for i in range(self.n_products)], dtype=np.float32)
@@ -58,16 +69,16 @@ class PHANTOM(gym.Env):
         demand_arr = np.array([self._demand.get(i, 0.0) for i in range(self.n_products)])
         self._demand_history.append(demand_arr)
         self._price_history.append(self._prices.copy())
+        revenue = np.sum(self._prices * demand_arr)
+        self._revenue_history.append(revenue)
 
     def reset(self, seed=None, options=None):
         super().reset(seed=seed)
         self._prices = np.random.uniform(*self.price_bounds, size=self.n_products)
         self._demand = self.market.act(self._prices)
         self._step_count = 0
-        self._demand_history, self._price_history = [], []
+        self._demand_history, self._price_history, self._revenue_history = [], [], []
         self._record_history()
-        if self._fig: plt.close(self._fig)
-        self._fig, self._axes = None, None
         return self._get_obs(), {}
 
     def step(self, action: np.ndarray):
@@ -82,45 +93,137 @@ class PHANTOM(gym.Env):
 
         return self._get_obs(), reward, terminated, truncated, {"step": self._step_count}
 
+    def _compute_elasticity(self) -> np.ndarray:
+        """point elasticity: e = (dQ/dP) * (P/Q) estimated via finite differences, clipped to [-5, 5]"""
+        if len(self._price_history) < 2:
+            return np.zeros(self.n_products)
+        p = np.array(self._price_history)
+        q = np.array(self._demand_history)
+        dp = np.diff(p, axis=0)
+        dq = np.diff(q, axis=0)
+        min_dp = 0.5  # ignore tiny price changes to avoid explosions
+        valid = np.abs(dp) > min_dp
+        with np.errstate(divide='ignore', invalid='ignore'):
+            elasticity = np.where(valid, (dq / dp) * (p[:-1] / np.maximum(q[:-1], 1.0)), 0.0)
+            elasticity = np.clip(elasticity, -5.0, 5.0)
+            elasticity = np.nan_to_num(elasticity, nan=0.0)
+        return np.mean(elasticity, axis=0) if len(elasticity) > 0 else np.zeros(self.n_products)
+
+    def _style_axis(self, ax, title: str = None, xlabel: str = None, ylabel: str = None):
+        c = self._dashboard_colors
+        ax.set_facecolor(c['panel'])
+        ax.spines['top'].set_visible(False); ax.spines['right'].set_visible(False)
+        ax.spines['bottom'].set_color(c['accent']); ax.spines['left'].set_color(c['accent'])
+        ax.tick_params(colors=c['text'], labelsize=8)
+        if title: ax.set_title(title, color=c['text'], fontsize=11, fontweight='bold', pad=8)
+        if xlabel: ax.set_xlabel(xlabel, color=c['text'], fontsize=9)
+        if ylabel: ax.set_ylabel(ylabel, color=c['text'], fontsize=9)
+
     def render(self):
         if self.render_mode == "human":
+            c = self._dashboard_colors
             if self._fig is None:
                 plt.ion()
-                self._fig, self._axes = plt.subplots(2, 2, figsize=(12, 8))
-                self._fig.suptitle("PHANTOM Environment")
+                self._fig = plt.figure(figsize=(14, 10), facecolor=c['bg'])
+                self._gs = GridSpec(3, 3, figure=self._fig, hspace=0.35, wspace=0.3,
+                                    left=0.07, right=0.95, top=0.92, bottom=0.08)
+                plt.show(block=False)
 
-            demand_mat = np.array(self._demand_history).T  # shape: (n_products, timesteps)
+            self._fig.clear()
+            self._fig.suptitle(f'PHANTOM  Market Dynamics  [t={self._step_count}, α={self.alpha:.2f}]',
+                              color=c['text'], fontsize=14, fontweight='bold')
+
+            demand_mat = np.array(self._demand_history).T
             price_mat = np.array(self._price_history).T
-            revenue_per_step = np.sum(demand_mat * price_mat, axis=0)  # revenue = demand * price
-            demand_variance = np.var(demand_mat, axis=0)  # how spread demand is across products
+            elasticity = self._compute_elasticity()
+            cmap = mcolors.LinearSegmentedColormap.from_list('phantom', [c['bg'], c['blue'], c['green']])
+            cmap_div = mcolors.LinearSegmentedColormap.from_list('elast', [c['red'], c['bg'], c['blue']])
 
-            for row in self._axes:
-                for ax in row: ax.clear()
+            # price-demand elasticity scatter (all historical data points)
+            ax_elast = self._fig.add_subplot(self._gs[0, 0])
+            prices_flat = np.array(self._price_history).flatten()
+            demands_flat = np.array(self._demand_history).flatten()
+            product_ids = np.tile(np.arange(self.n_products), len(self._price_history))
+            scatter = ax_elast.scatter(prices_flat, demands_flat, c=product_ids, cmap='plasma',
+                                       alpha=0.6, s=15, edgecolors='none')
+            if len(prices_flat) > 1:  # fit regression line
+                z = np.polyfit(prices_flat, demands_flat, 1)
+                p_line = np.linspace(prices_flat.min(), prices_flat.max(), 50)
+                ax_elast.plot(p_line, np.polyval(z, p_line), '--', color=c['red'], lw=1.5, alpha=0.8)
+            self._style_axis(ax_elast, "Price-Demand Relationship", "Price ($)", "Demand")
 
-            self._axes[0, 0].imshow(demand_mat, aspect='auto', cmap='viridis', origin='lower')
-            self._axes[0, 0].set_xlabel("Time Step")
-            self._axes[0, 0].set_ylabel("Product")
-            self._axes[0, 0].set_title("Demand Over Time")
+            # elasticity coefficients bar
+            ax_ebar = self._fig.add_subplot(self._gs[0, 1])
+            colors_e = [c['red'] if e < -0.5 else c['blue'] if e > 0.5 else c['accent'] for e in elasticity]
+            ax_ebar.barh(range(self.n_products), elasticity, color=colors_e, alpha=0.8, edgecolor=c['bg'])
+            ax_ebar.axvline(0, color=c['text'], lw=0.8, alpha=0.5)
+            ax_ebar.axvline(-1, color=c['red'], lw=1, ls='--', alpha=0.5)  # unit elastic reference
+            ax_ebar.set_yticks(range(self.n_products))
+            ax_ebar.set_yticklabels([f'P{i}' for i in range(self.n_products)], fontsize=7)
+            self._style_axis(ax_ebar, "Price Elasticity ε", "ε = (ΔQ/ΔP)·(P/Q)", None)
 
-            self._axes[0, 1].imshow(price_mat, aspect='auto', cmap='plasma', origin='lower')
-            self._axes[0, 1].set_xlabel("Time Step")
-            self._axes[0, 1].set_ylabel("Product")
-            self._axes[0, 1].set_title("Price Over Time")
+            # session composition pie
+            ax_pie = self._fig.add_subplot(self._gs[0, 2])
+            n_humans, n_agents = self.market.Nhumans, self.market.Nagents
+            ax_pie.set_facecolor(c['panel'])
+            wedges, _ = ax_pie.pie([n_humans, n_agents], colors=[c['blue'], c['red']],
+                                   startangle=90, wedgeprops={'linewidth': 2, 'edgecolor': c['bg']})
+            ax_pie.legend(wedges, [f'H ({n_humans})', f'A ({n_agents})'],
+                          loc='lower center', fontsize=8, frameon=False,
+                          labelcolor=c['text'], bbox_to_anchor=(0.5, -0.05))
+            ax_pie.set_title("Session Mix", color=c['text'], fontsize=11, fontweight='bold')
 
-            self._axes[1, 0].plot(revenue_per_step, color='teal', linewidth=1.5)
-            self._axes[1, 0].set_xlabel("Time Step")
-            self._axes[1, 0].set_ylabel("Revenue")
-            self._axes[1, 0].set_title("Revenue per Step")
+            # price heatmap over time
+            ax_pheat = self._fig.add_subplot(self._gs[1, :2])
+            im_p = ax_pheat.imshow(price_mat, aspect='auto', cmap='viridis', origin='lower')
+            self._style_axis(ax_pheat, "Price Heatmap P(product, t)", "Step", "Product")
+            cbar_p = self._fig.colorbar(im_p, ax=ax_pheat, fraction=0.03, pad=0.02)
+            cbar_p.ax.tick_params(colors=c['text'], labelsize=7)
+            cbar_p.set_label('$', color=c['text'], fontsize=8)
 
-            self._axes[1, 1].plot(demand_variance, color='orangered', linewidth=1.5)
-            self._axes[1, 1].set_xlabel("Time Step")
-            self._axes[1, 1].set_ylabel("Variance")
-            self._axes[1, 1].set_title("Demand Variance")
+            # demand heatmap over time
+            ax_dheat = self._fig.add_subplot(self._gs[1, 2])
+            im_d = ax_dheat.imshow(demand_mat, aspect='auto', cmap=cmap, origin='lower')
+            self._style_axis(ax_dheat, "Demand Q(product, t)", "Step", None)
+            cbar_d = self._fig.colorbar(im_d, ax=ax_dheat, fraction=0.046, pad=0.02)
+            cbar_d.ax.tick_params(colors=c['text'], labelsize=7)
 
-            self._fig.tight_layout()
-            self._fig.canvas.draw()
+            # cross-correlation matrix (price-demand covariance per product)
+            ax_corr = self._fig.add_subplot(self._gs[2, 0])
+            if len(self._price_history) > 2:
+                corr_mat = np.corrcoef(price_mat, demand_mat)[:self.n_products, self.n_products:]
+                im_corr = ax_corr.imshow(corr_mat, cmap=cmap_div, vmin=-1, vmax=1, aspect='auto')
+                ax_corr.set_xticks(range(self.n_products))
+                ax_corr.set_yticks(range(self.n_products))
+                ax_corr.set_xticklabels([f'Q{i}' for i in range(self.n_products)], fontsize=6)
+                ax_corr.set_yticklabels([f'P{i}' for i in range(self.n_products)], fontsize=6)
+                cbar_c = self._fig.colorbar(im_corr, ax=ax_corr, fraction=0.046, pad=0.02)
+                cbar_c.ax.tick_params(colors=c['text'], labelsize=7)
+            self._style_axis(ax_corr, "Price-Demand Correlation", None, None)
+
+            # revenue curve with demand dispersion (std dev shows concentration)
+            ax_rev = self._fig.add_subplot(self._gs[2, 1:])
+            n_steps = len(self._revenue_history)
+            demand_std = [np.std(d) for d in self._demand_history]
+            ax_rev.fill_between(range(n_steps), self._revenue_history, alpha=0.3, color=c['green'])
+            ax_rev.plot(self._revenue_history, color=c['green'], linewidth=2, label='Revenue')
+            ax_rev.set_xlim(0, max(n_steps, 1))
+            ax_rev.set_ylim(0, max(self._revenue_history) * 1.1 if self._revenue_history else 1)
+            ax2 = ax_rev.twinx()
+            ax2.plot(range(n_steps), demand_std, color=c['blue'], linewidth=2, ls='-', alpha=0.9, label='σ(Demand)')
+            d_min, d_max = min(demand_std), max(demand_std)
+            margin = (d_max - d_min) * 0.2 if d_max > d_min else 0.5
+            ax2.set_ylim(max(0, d_min - margin), d_max + margin)
+            ax2.tick_params(axis='y', colors=c['blue'], labelsize=8)
+            ax2.spines['right'].set_color(c['blue'])
+            ax2.set_ylabel('Demand σ', color=c['blue'], fontsize=9)
+            self._style_axis(ax_rev, "Revenue & Demand Dispersion", "Step", "Revenue ($)")
+            ax_rev.legend(loc='upper left', fontsize=7, frameon=False, labelcolor=c['text'])
+            ax2.legend(loc='upper right', fontsize=7, frameon=False, labelcolor=c['text'])
+
+            self._fig.canvas.draw_idle()
             self._fig.canvas.flush_events()
-            plt.pause(0.01)
+            plt.pause(0.05)
 
         elif self.render_mode == "ansi":
             return f"step={self._step_count}, prices={self._prices}, demand={self._demand}"
@@ -128,15 +231,15 @@ class PHANTOM(gym.Env):
 
     def close(self):
         if self._fig: plt.close(self._fig)
-        self._fig, self._axes = None, None
+        self._fig = None
 
 
 if __name__ == "__main__":
-    env = PHANTOM(n_products=100, render_mode="human")
+    env = PHANTOM(n_products=15, alpha=0.3, N=100, render_mode="human")
     obs, _ = env.reset()
-    for _ in range(100):
+    for step in range(100):
         action = env.action_space.sample()
         obs, reward, term, trunc, info = env.step(action)
         env.render()
-        print(f"Reward: {reward:.2f}")
         if term: break
+    env.close()

From 28d3f6853e197954ebe43872763c2dde6419f8c4 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Fri, 30 Jan 2026 13:17:12 +0100
Subject: [PATCH 60/99] chore: refactor wrapper

---
 engine/lib/__init__.py |   3 +
 engine/lib/render.py   | 126 +++++++++++++++++++++++++++++++
 engine/train.py        |  45 +++++++++++
 engine/wrapper.py      | 165 +++++------------------------------------
 4 files changed, 193 insertions(+), 146 deletions(-)
 create mode 100644 engine/lib/__init__.py
 create mode 100644 engine/lib/render.py
 create mode 100644 engine/train.py

diff --git a/engine/lib/__init__.py b/engine/lib/__init__.py
new file mode 100644
index 0000000..8e17835
--- /dev/null
+++ b/engine/lib/__init__.py
@@ -0,0 +1,3 @@
+from .demand import generate_demand, estimate_demand
+from .behavior import sample_behavior
+from .render import DashboardRenderer, style_axis
diff --git a/engine/lib/render.py b/engine/lib/render.py
new file mode 100644
index 0000000..a16f215
--- /dev/null
+++ b/engine/lib/render.py
@@ -0,0 +1,126 @@
+"""rendering logic for PHANTOM environment dashboard"""
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib.gridspec import GridSpec
+
+
+def style_axis(ax, title: str = None, xlabel: str = None, ylabel: str = None):
+    ax.spines['top'].set_visible(False)
+    ax.spines['right'].set_visible(False)
+    if title: ax.set_title(title, fontsize=11, fontweight='bold', pad=8)
+    if xlabel: ax.set_xlabel(xlabel, fontsize=9)
+    if ylabel: ax.set_ylabel(ylabel, fontsize=9)
+
+
+class DashboardRenderer:
+    """stateful renderer for PHANTOM market dynamics visualization"""
+
+    def __init__(self):
+        self.fig = None
+        self.gs = None
+
+    def render(self, env) -> None:
+        if self.fig is None:
+            plt.ion()
+            self.fig = plt.figure(figsize=(14, 10))
+            self.gs = GridSpec(3, 3, figure=self.fig, hspace=0.35, wspace=0.3,
+                               left=0.07, right=0.95, top=0.92, bottom=0.08)
+            plt.show(block=False)
+
+        self.fig.clear()
+        self.fig.suptitle(f'PHANTOM  Market Dynamics  [t={env._step_count}, a={env.alpha:.2f}]',
+                          fontsize=14, fontweight='bold')
+
+        demand_mat = np.array(env._demand_history).T
+        price_mat = np.array(env._price_history).T
+        elasticity = env._compute_elasticity()
+
+        self._render_scatter(env)
+        self._render_elasticity_bar(env, elasticity)
+        self._render_session_pie(env)
+        self._render_price_heatmap(price_mat)
+        self._render_demand_heatmap(demand_mat)
+        self._render_correlation(env.n_products, price_mat, demand_mat)
+        self._render_revenue(env)
+
+        self.fig.canvas.draw_idle()
+        self.fig.canvas.flush_events()
+
+    def _render_scatter(self, env):
+        ax = self.fig.add_subplot(self.gs[0, 0])
+        prices_flat = np.array(env._price_history).flatten()
+        demands_flat = np.array(env._demand_history).flatten()
+        product_ids = np.tile(np.arange(env.n_products), len(env._price_history))
+        ax.scatter(prices_flat, demands_flat, c=product_ids, cmap='plasma', alpha=0.6, s=15, edgecolors='none')
+        if len(prices_flat) > 1:
+            z = np.polyfit(prices_flat, demands_flat, 1)
+            p_line = np.linspace(prices_flat.min(), prices_flat.max(), 50)
+            ax.plot(p_line, np.polyval(z, p_line), '--', lw=1.5, alpha=0.8)
+        style_axis(ax, "Price-Demand Relationship", "Price ($)", "Demand")
+
+    def _render_elasticity_bar(self, env, elasticity):
+        ax = self.fig.add_subplot(self.gs[0, 1])
+        ax.barh(range(env.n_products), elasticity, alpha=0.8)
+        ax.axvline(0, lw=0.8, alpha=0.5)
+        ax.axvline(-1, lw=1, ls='--', alpha=0.5)
+        ax.set_yticks(range(env.n_products))
+        ax.set_yticklabels([f'P{i}' for i in range(env.n_products)], fontsize=7)
+        style_axis(ax, "Price Elasticity", "(dQ/dP)(P/Q)", None)
+
+    def _render_session_pie(self, env):
+        ax = self.fig.add_subplot(self.gs[0, 2])
+        n_h, n_a = env.market.Nhumans, env.market.Nagents
+        wedges, _ = ax.pie([n_h, n_a], startangle=90, wedgeprops={'linewidth': 2, 'edgecolor': 'white'})
+        ax.legend(wedges, [f'H ({n_h})', f'A ({n_a})'], loc='lower center', fontsize=8,
+                  frameon=False, bbox_to_anchor=(0.5, -0.05))
+        ax.set_title("Session Mix", fontsize=11, fontweight='bold')
+
+    def _render_price_heatmap(self, price_mat):
+        ax = self.fig.add_subplot(self.gs[1, :2])
+        im = ax.imshow(price_mat, aspect='auto', cmap='viridis', origin='lower')
+        style_axis(ax, "Price Heatmap P(product, t)", "Step", "Product")
+        cbar = self.fig.colorbar(im, ax=ax, fraction=0.03, pad=0.02)
+        cbar.set_label('$', fontsize=8)
+
+    def _render_demand_heatmap(self, demand_mat):
+        ax = self.fig.add_subplot(self.gs[1, 2])
+        im = ax.imshow(demand_mat, aspect='auto', cmap='Blues', origin='lower')
+        style_axis(ax, "Demand Q(product, t)", "Step", None)
+        self.fig.colorbar(im, ax=ax, fraction=0.046, pad=0.02)
+
+    def _render_correlation(self, n_products, price_mat, demand_mat):
+        ax = self.fig.add_subplot(self.gs[2, 0])
+        if price_mat.shape[1] > 2:
+            corr = np.corrcoef(price_mat, demand_mat)[:n_products, n_products:]
+            im = ax.imshow(corr, cmap='RdBu', vmin=-1, vmax=1, aspect='auto')
+            ax.set_xticks(range(n_products))
+            ax.set_yticks(range(n_products))
+            ax.set_xticklabels([f'Q{i}' for i in range(n_products)], fontsize=6)
+            ax.set_yticklabels([f'P{i}' for i in range(n_products)], fontsize=6)
+            self.fig.colorbar(im, ax=ax, fraction=0.046, pad=0.02)
+        style_axis(ax, "Price-Demand Correlation", None, None)
+
+    def _render_revenue(self, env):
+        ax = self.fig.add_subplot(self.gs[2, 1:])
+        n_steps = len(env._revenue_history)
+        demand_std = [np.std(d) for d in env._demand_history]
+        ax.fill_between(range(n_steps), env._revenue_history, alpha=0.3)
+        ax.plot(env._revenue_history, linewidth=2, label='Revenue')
+        ax.set_xlim(0, max(n_steps, 1))
+        ax.set_ylim(0, max(env._revenue_history) * 1.1 if env._revenue_history else 1)
+
+        ax2 = ax.twinx()
+        ax2.plot(range(n_steps), demand_std, linewidth=2, ls='-', alpha=0.9, label='sigma(Demand)')
+        d_min, d_max = min(demand_std), max(demand_std)
+        margin = (d_max - d_min) * 0.2 if d_max > d_min else 0.5
+        ax2.set_ylim(max(0, d_min - margin), d_max + margin)
+        ax2.set_ylabel('Demand sigma', fontsize=9)
+
+        style_axis(ax, "Revenue & Demand Dispersion", "Step", "Revenue ($)")
+        ax.legend(loc='upper left', fontsize=7, frameon=False)
+        ax2.legend(loc='upper right', fontsize=7, frameon=False)
+
+    def close(self):
+        if self.fig:
+            plt.close(self.fig)
+            self.fig = None
diff --git a/engine/train.py b/engine/train.py
new file mode 100644
index 0000000..496ecfd
--- /dev/null
+++ b/engine/train.py
@@ -0,0 +1,45 @@
+from stable_baselines3 import SAC
+from stable_baselines3.common.callbacks import EvalCallback, BaseCallback
+from .wrapper import PHANTOM
+
+
+class RenderCallback(BaseCallback):
+    """Renders environment on every step for live visualization."""
+    def __init__(self, env: PHANTOM):
+        super().__init__()
+        self.env = env
+
+    def _on_step(self) -> bool:
+        self.env.render()
+        return True
+
+
+env = PHANTOM(n_products=10, alpha=0.3, render_mode="human")
+eval_env = PHANTOM(n_products=10, alpha=0.3, render_mode=None)
+
+model = SAC(
+    "MultiInputPolicy",
+    env,
+    verbose=1,
+    learning_rate=3e-4,
+    buffer_size=50000,
+    batch_size=256,
+    tau=0.005,
+    gamma=0.99,
+)
+
+render_cb = RenderCallback(env)
+eval_cb = EvalCallback(eval_env, eval_freq=1000, n_eval_episodes=5, verbose=1)
+
+model.learn(total_timesteps=50000, callback=[render_cb, eval_cb])
+model.save("phantom_sac")
+
+# test trained policy
+env = PHANTOM(n_products=10, alpha=0.3, render_mode="human")
+obs, _ = env.reset()
+for _ in range(100):
+    action, _ = model.predict(obs, deterministic=True)
+    obs, reward, term, trunc, _ = env.step(action)
+    env.render()
+    if term or trunc: break
+env.close()
diff --git a/engine/wrapper.py b/engine/wrapper.py
index 7637998..0301082 100644
--- a/engine/wrapper.py
+++ b/engine/wrapper.py
@@ -1,10 +1,8 @@
 import gymnasium as gym
 from gymnasium import spaces
 import numpy as np
-import matplotlib.pyplot as plt
-from matplotlib.gridspec import GridSpec
-import matplotlib.colors as mcolors
 from .engine import Limbo, MarketEngine, PricingEngine
+from .lib.render import DashboardRenderer
 
 
 class PHANTOM(gym.Env):
@@ -16,7 +14,7 @@ class PHANTOM(gym.Env):
                  alpha: float = 0.3,
                  N: int = 100,
                  price_bounds: tuple = (10.0, 150.0),
-                 lambda_coi: float = 0.1,  # coi leakage penalty weight
+                 lambda_coi: float = 0.1,
                  render_mode: str = None):
         super().__init__()
         self.n_products = n_products
@@ -30,12 +28,10 @@ class PHANTOM(gym.Env):
         self._platform_stub = PricingEngine()
         self._limbo = Limbo(self._platform_stub, self.market)
 
-        # action: continuous prices for each product
         self.action_space = spaces.Box(
             low=price_bounds[0], high=price_bounds[1],
             shape=(n_products,), dtype=np.float32
         )
-        # observation: demand estimate + previous prices
         self.observation_space = spaces.Dict({
             "demand": spaces.Box(low=0.0, high=100.0, shape=(n_products,), dtype=np.float32),
             "prices": spaces.Box(low=price_bounds[0], high=price_bounds[1], shape=(n_products,), dtype=np.float32),
@@ -47,30 +43,22 @@ class PHANTOM(gym.Env):
         self._demand_history = []
         self._price_history = []
         self._revenue_history = []
-        self._fig = None
-        self._gs = None
-        self._dashboard_colors = {
-            'bg': '#f5f0e8', 'panel': '#ebe3d5', 'accent': '#c9b99a',
-            'text': '#3d3229', 'green': '#5c7a5c', 'red': '#8b4049',
-            'blue': '#5a7384', 'orange': '#b87333', 'purple': '#7d6b7d'
-        }
+        self._renderer = None
 
     def _get_obs(self) -> dict:
         demand_arr = np.array([self._demand.get(i, 0.0) for i in range(self.n_products)], dtype=np.float32)
         return {"demand": demand_arr, "prices": self._prices.astype(np.float32)}
 
     def _compute_reward(self, prices: np.ndarray, demand: dict) -> float:
-        demand_arr = np.array([demand.get(i, 0.0) for i in range(self.n_products)])
-        revenue = np.sum(prices * demand_arr)  # revenue = price * quantity proxy
-        base_price = self.price_bounds[0]
-        return float(revenue)# - self.lambda_coi * coi_leak)
+        revenue = np.sum(prices * np.array([demand.get(i, 0.0) for i in range(self.n_products)]))
+        # TODO: implement supra-competitive price punishment
+        return float(revenue)
 
     def _record_history(self):
         demand_arr = np.array([self._demand.get(i, 0.0) for i in range(self.n_products)])
         self._demand_history.append(demand_arr)
         self._price_history.append(self._prices.copy())
-        revenue = np.sum(self._prices * demand_arr)
-        self._revenue_history.append(revenue)
+        self._revenue_history.append(np.sum(self._prices * demand_arr))
 
     def reset(self, seed=None, options=None):
         super().reset(seed=seed)
@@ -89,149 +77,34 @@ class PHANTOM(gym.Env):
 
         reward = self._compute_reward(self._prices, self._demand)
         terminated = self._step_count >= 100
-        truncated = False
 
-        return self._get_obs(), reward, terminated, truncated, {"step": self._step_count}
+        return self._get_obs(), reward, terminated, False, {"step": self._step_count}
 
     def _compute_elasticity(self) -> np.ndarray:
-        """point elasticity: e = (dQ/dP) * (P/Q) estimated via finite differences, clipped to [-5, 5]"""
+        """point elasticity: e = (dQ/dP) * (P/Q) via finite differences, clipped to [-5, 5]"""
         if len(self._price_history) < 2:
             return np.zeros(self.n_products)
-        p = np.array(self._price_history)
-        q = np.array(self._demand_history)
-        dp = np.diff(p, axis=0)
-        dq = np.diff(q, axis=0)
-        min_dp = 0.5  # ignore tiny price changes to avoid explosions
-        valid = np.abs(dp) > min_dp
+        p, q = np.array(self._price_history), np.array(self._demand_history)
+        dp, dq = np.diff(p, axis=0), np.diff(q, axis=0)
+        valid = np.abs(dp) > 0.5
         with np.errstate(divide='ignore', invalid='ignore'):
             elasticity = np.where(valid, (dq / dp) * (p[:-1] / np.maximum(q[:-1], 1.0)), 0.0)
-            elasticity = np.clip(elasticity, -5.0, 5.0)
-            elasticity = np.nan_to_num(elasticity, nan=0.0)
+            elasticity = np.nan_to_num(np.clip(elasticity, -5.0, 5.0), nan=0.0)
         return np.mean(elasticity, axis=0) if len(elasticity) > 0 else np.zeros(self.n_products)
 
-    def _style_axis(self, ax, title: str = None, xlabel: str = None, ylabel: str = None):
-        c = self._dashboard_colors
-        ax.set_facecolor(c['panel'])
-        ax.spines['top'].set_visible(False); ax.spines['right'].set_visible(False)
-        ax.spines['bottom'].set_color(c['accent']); ax.spines['left'].set_color(c['accent'])
-        ax.tick_params(colors=c['text'], labelsize=8)
-        if title: ax.set_title(title, color=c['text'], fontsize=11, fontweight='bold', pad=8)
-        if xlabel: ax.set_xlabel(xlabel, color=c['text'], fontsize=9)
-        if ylabel: ax.set_ylabel(ylabel, color=c['text'], fontsize=9)
-
     def render(self):
         if self.render_mode == "human":
-            c = self._dashboard_colors
-            if self._fig is None:
-                plt.ion()
-                self._fig = plt.figure(figsize=(14, 10), facecolor=c['bg'])
-                self._gs = GridSpec(3, 3, figure=self._fig, hspace=0.35, wspace=0.3,
-                                    left=0.07, right=0.95, top=0.92, bottom=0.08)
-                plt.show(block=False)
-
-            self._fig.clear()
-            self._fig.suptitle(f'PHANTOM  Market Dynamics  [t={self._step_count}, α={self.alpha:.2f}]',
-                              color=c['text'], fontsize=14, fontweight='bold')
-
-            demand_mat = np.array(self._demand_history).T
-            price_mat = np.array(self._price_history).T
-            elasticity = self._compute_elasticity()
-            cmap = mcolors.LinearSegmentedColormap.from_list('phantom', [c['bg'], c['blue'], c['green']])
-            cmap_div = mcolors.LinearSegmentedColormap.from_list('elast', [c['red'], c['bg'], c['blue']])
-
-            # price-demand elasticity scatter (all historical data points)
-            ax_elast = self._fig.add_subplot(self._gs[0, 0])
-            prices_flat = np.array(self._price_history).flatten()
-            demands_flat = np.array(self._demand_history).flatten()
-            product_ids = np.tile(np.arange(self.n_products), len(self._price_history))
-            scatter = ax_elast.scatter(prices_flat, demands_flat, c=product_ids, cmap='plasma',
-                                       alpha=0.6, s=15, edgecolors='none')
-            if len(prices_flat) > 1:  # fit regression line
-                z = np.polyfit(prices_flat, demands_flat, 1)
-                p_line = np.linspace(prices_flat.min(), prices_flat.max(), 50)
-                ax_elast.plot(p_line, np.polyval(z, p_line), '--', color=c['red'], lw=1.5, alpha=0.8)
-            self._style_axis(ax_elast, "Price-Demand Relationship", "Price ($)", "Demand")
-
-            # elasticity coefficients bar
-            ax_ebar = self._fig.add_subplot(self._gs[0, 1])
-            colors_e = [c['red'] if e < -0.5 else c['blue'] if e > 0.5 else c['accent'] for e in elasticity]
-            ax_ebar.barh(range(self.n_products), elasticity, color=colors_e, alpha=0.8, edgecolor=c['bg'])
-            ax_ebar.axvline(0, color=c['text'], lw=0.8, alpha=0.5)
-            ax_ebar.axvline(-1, color=c['red'], lw=1, ls='--', alpha=0.5)  # unit elastic reference
-            ax_ebar.set_yticks(range(self.n_products))
-            ax_ebar.set_yticklabels([f'P{i}' for i in range(self.n_products)], fontsize=7)
-            self._style_axis(ax_ebar, "Price Elasticity ε", "ε = (ΔQ/ΔP)·(P/Q)", None)
-
-            # session composition pie
-            ax_pie = self._fig.add_subplot(self._gs[0, 2])
-            n_humans, n_agents = self.market.Nhumans, self.market.Nagents
-            ax_pie.set_facecolor(c['panel'])
-            wedges, _ = ax_pie.pie([n_humans, n_agents], colors=[c['blue'], c['red']],
-                                   startangle=90, wedgeprops={'linewidth': 2, 'edgecolor': c['bg']})
-            ax_pie.legend(wedges, [f'H ({n_humans})', f'A ({n_agents})'],
-                          loc='lower center', fontsize=8, frameon=False,
-                          labelcolor=c['text'], bbox_to_anchor=(0.5, -0.05))
-            ax_pie.set_title("Session Mix", color=c['text'], fontsize=11, fontweight='bold')
-
-            # price heatmap over time
-            ax_pheat = self._fig.add_subplot(self._gs[1, :2])
-            im_p = ax_pheat.imshow(price_mat, aspect='auto', cmap='viridis', origin='lower')
-            self._style_axis(ax_pheat, "Price Heatmap P(product, t)", "Step", "Product")
-            cbar_p = self._fig.colorbar(im_p, ax=ax_pheat, fraction=0.03, pad=0.02)
-            cbar_p.ax.tick_params(colors=c['text'], labelsize=7)
-            cbar_p.set_label('$', color=c['text'], fontsize=8)
-
-            # demand heatmap over time
-            ax_dheat = self._fig.add_subplot(self._gs[1, 2])
-            im_d = ax_dheat.imshow(demand_mat, aspect='auto', cmap=cmap, origin='lower')
-            self._style_axis(ax_dheat, "Demand Q(product, t)", "Step", None)
-            cbar_d = self._fig.colorbar(im_d, ax=ax_dheat, fraction=0.046, pad=0.02)
-            cbar_d.ax.tick_params(colors=c['text'], labelsize=7)
-
-            # cross-correlation matrix (price-demand covariance per product)
-            ax_corr = self._fig.add_subplot(self._gs[2, 0])
-            if len(self._price_history) > 2:
-                corr_mat = np.corrcoef(price_mat, demand_mat)[:self.n_products, self.n_products:]
-                im_corr = ax_corr.imshow(corr_mat, cmap=cmap_div, vmin=-1, vmax=1, aspect='auto')
-                ax_corr.set_xticks(range(self.n_products))
-                ax_corr.set_yticks(range(self.n_products))
-                ax_corr.set_xticklabels([f'Q{i}' for i in range(self.n_products)], fontsize=6)
-                ax_corr.set_yticklabels([f'P{i}' for i in range(self.n_products)], fontsize=6)
-                cbar_c = self._fig.colorbar(im_corr, ax=ax_corr, fraction=0.046, pad=0.02)
-                cbar_c.ax.tick_params(colors=c['text'], labelsize=7)
-            self._style_axis(ax_corr, "Price-Demand Correlation", None, None)
-
-            # revenue curve with demand dispersion (std dev shows concentration)
-            ax_rev = self._fig.add_subplot(self._gs[2, 1:])
-            n_steps = len(self._revenue_history)
-            demand_std = [np.std(d) for d in self._demand_history]
-            ax_rev.fill_between(range(n_steps), self._revenue_history, alpha=0.3, color=c['green'])
-            ax_rev.plot(self._revenue_history, color=c['green'], linewidth=2, label='Revenue')
-            ax_rev.set_xlim(0, max(n_steps, 1))
-            ax_rev.set_ylim(0, max(self._revenue_history) * 1.1 if self._revenue_history else 1)
-            ax2 = ax_rev.twinx()
-            ax2.plot(range(n_steps), demand_std, color=c['blue'], linewidth=2, ls='-', alpha=0.9, label='σ(Demand)')
-            d_min, d_max = min(demand_std), max(demand_std)
-            margin = (d_max - d_min) * 0.2 if d_max > d_min else 0.5
-            ax2.set_ylim(max(0, d_min - margin), d_max + margin)
-            ax2.tick_params(axis='y', colors=c['blue'], labelsize=8)
-            ax2.spines['right'].set_color(c['blue'])
-            ax2.set_ylabel('Demand σ', color=c['blue'], fontsize=9)
-            self._style_axis(ax_rev, "Revenue & Demand Dispersion", "Step", "Revenue ($)")
-            ax_rev.legend(loc='upper left', fontsize=7, frameon=False, labelcolor=c['text'])
-            ax2.legend(loc='upper right', fontsize=7, frameon=False, labelcolor=c['text'])
-
-            self._fig.canvas.draw_idle()
-            self._fig.canvas.flush_events()
-            plt.pause(0.05)
-
+            if self._renderer is None:
+                self._renderer = DashboardRenderer()
+            self._renderer.render(self)
         elif self.render_mode == "ansi":
             return f"step={self._step_count}, prices={self._prices}, demand={self._demand}"
         return None
 
     def close(self):
-        if self._fig: plt.close(self._fig)
-        self._fig = None
+        if self._renderer:
+            self._renderer.close()
+            self._renderer = None
 
 
 if __name__ == "__main__":

From 52fe8655987876fb8bd22191f455b28f9417784f Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Fri, 30 Jan 2026 13:18:20 +0100
Subject: [PATCH 61/99] feature: drafting studies directory

---
 engine/studies/factors.py        |  34 ++++++++++
 engine/studies/full_factorial.py |  89 ++++++++++++++++++++++++++
 engine/studies/mixed_lh.py       | 106 +++++++++++++++++++++++++++++++
 3 files changed, 229 insertions(+)
 create mode 100644 engine/studies/factors.py
 create mode 100644 engine/studies/full_factorial.py
 create mode 100644 engine/studies/mixed_lh.py

diff --git a/engine/studies/factors.py b/engine/studies/factors.py
new file mode 100644
index 0000000..1fbfbe1
--- /dev/null
+++ b/engine/studies/factors.py
@@ -0,0 +1,34 @@
+"""shared factor definitions for experimental designs"""
+import numpy as np
+from dataclasses import dataclass, field
+from typing import Callable, Any
+
+@dataclass
+class Factor:  # one experimental factor: a named variable plus the discrete levels it may take
+    name: str  # key under which the chosen level is stored in each generated config dict
+    levels: list  # candidate values; every generated config holds exactly one of them
+    primary: bool = True  # True: fully crossed in the mixed design; False: latin-hypercube sampled there
+
+# demand functions with compatible signatures
+def demand_linear(mu, sigma, size): return np.maximum(0, np.random.normal(mu, sigma, size))  # normal(mu, sigma) draws, clipped below at 0
+def demand_uniform(mu, sigma, size): return np.random.uniform(mu - sigma, mu + sigma, size)  # uniform between mu - sigma and mu + sigma; not clipped, so it can be negative when sigma > mu
+def demand_exponential(mu, sigma, size): return np.random.exponential(mu, size)  # exponential with scale mu; sigma is accepted only for signature parity and is ignored
+def demand_logistic(mu, sigma, size): return np.random.logistic(mu, sigma, size)  # logistic(loc=mu, scale=sigma); not clipped, so it can return negative values
+
+DEMAND_FUNCTIONS = {  # value of the "demand_fn" factor -> sampler called as fn(mu, sigma, size)
+    "linear": demand_linear,
+    "uniform": demand_uniform,
+    "exponential": demand_exponential,
+    "logistic": demand_logistic,
+}
+
+FACTORS = [  # design space; list order fixes the factor order used when levels are crossed
+    Factor("demand_fn", list(DEMAND_FUNCTIONS.keys()), primary=True),  # levels are the keys of DEMAND_FUNCTIONS
+    Factor("alpha", [0.1, 0.3, 0.5, 0.7], primary=True),
+    Factor("n_products", [5, 15, 30, 50], primary=True),
+    Factor("demand_mu", [30.0, 50.0, 70.0], primary=False),  # primary=False: LH-sampled in mixed_lh.py, still fully crossed in full_factorial.py
+    Factor("demand_sigma", [5.0, 10.0, 20.0], primary=False),
+    Factor("N", [100, 500, 1000], primary=False),
+]
+
+SEEDS_PER_CONFIG = 5  # each factor combination is replicated with seeds 0 .. SEEDS_PER_CONFIG - 1
diff --git a/engine/studies/full_factorial.py b/engine/studies/full_factorial.py
new file mode 100644
index 0000000..9b4d1eb
--- /dev/null
+++ b/engine/studies/full_factorial.py
@@ -0,0 +1,89 @@
+"""full factorial design - all factor combinations"""
+import sys
+sys.path.insert(0, "..")
+import logging
+from itertools import product
+import json
+import hashlib
+from pathlib import Path
+from concurrent.futures import ProcessPoolExecutor
+from .factors import FACTORS, DEMAND_FUNCTIONS, SEEDS_PER_CONFIG
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)
+
+def generate_configs():
+    """cross every level of every factor, replicate each cell once per seed, tag each config with an id"""
+    factor_names = [factor.name for factor in FACTORS]
+    level_grid = product(*(factor.levels for factor in FACTORS))
+    runs = []
+    for cell in level_grid:
+        for replicate in range(SEEDS_PER_CONFIG):
+            run = dict(zip(factor_names, cell), seed=replicate)
+            # id = first 8 hex chars of the md5 of the key-sorted json dump of the config
+            digest = hashlib.md5(json.dumps(run, sort_keys=True).encode()).hexdigest()
+            run["id"] = digest[:8]
+            runs.append(run)
+    return runs
+
+def run_single(cfg: dict) -> dict:
+    """run one random-policy episode (max 100 steps) for cfg; return id, config, and reward metrics"""
+    from engine.wrapper import PHANTOM
+    import numpy as np
+
+    np.random.seed(cfg["seed"])  # global numpy RNG: the demand_* samplers draw from it
+    demand_fn = DEMAND_FUNCTIONS[cfg["demand_fn"]]
+
+    env = PHANTOM(
+        n_products=cfg["n_products"],
+        alpha=cfg["alpha"],
+        N=cfg["N"],
+    )
+    env.market.demand = (demand_fn, (cfg["demand_mu"], cfg["demand_sigma"]))
+    # spaces keep their own RNG, so seed it too; otherwise sampled actions differ between reruns
+    env.action_space.seed(cfg["seed"])
+
+    env.reset()
+    total_reward, steps = 0.0, 0
+    for steps in range(1, 101):
+        _, reward, term, trunc, _ = env.step(env.action_space.sample())
+        total_reward += float(reward)  # plain float keeps the result json-serializable
+        # gymnasium: stepping past terminated *or* truncated is undefined without a reset
+        if term or trunc: break
+
+    env.close()
+    return {
+        "id": cfg["id"],
+        "config": cfg,
+        "total_reward": total_reward,
+        "avg_reward": total_reward / steps,
+        "steps": steps,
+    }
+
+def run_study(max_workers: int = None, output: str = "results_full.jsonl"):
+    """run every full-factorial config in a process pool, write one json object per line to output, return results"""
+    configs = generate_configs()
+    total = len(configs)
+    log.info(f"full factorial: {total} configs ({total//SEEDS_PER_CONFIG} unique × {SEEDS_PER_CONFIG} seeds)")
+    results = []
+    with ProcessPoolExecutor(max_workers=max_workers) as pool:
+        for done, outcome in enumerate(pool.map(run_single, configs), start=1):
+            results.append(outcome)
+            if done % 100 == 0: log.info(f"progress: {done}/{total}")  # heartbeat every 100 runs
+    Path(output).write_text("\n".join(map(json.dumps, results)))
+    log.info(f"wrote {len(results)} results to {output}")
+    return results
+
+if __name__ == "__main__":
+    # cli entry point: always log the design size, then run the study unless --dry-run is given
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--workers", type=int, default=None)
+    parser.add_argument("--output", default="results_full.jsonl")
+    parser.add_argument("--dry-run", action="store_true", help="only show design size")
+    opts = parser.parse_args()
+    n_runs = len(generate_configs())
+    level_counts = [len(f.levels) for f in FACTORS]
+    log.info(f"design: {n_runs} runs | factors: {[f.name for f in FACTORS]} | levels: {level_counts}")
+    if not opts.dry_run:
+        run_study(opts.workers, opts.output)
diff --git a/engine/studies/mixed_lh.py b/engine/studies/mixed_lh.py
new file mode 100644
index 0000000..33ea2ee
--- /dev/null
+++ b/engine/studies/mixed_lh.py
@@ -0,0 +1,106 @@
+"""mixed design: full factorial on primary factors, latin hypercube on secondary"""
+import sys
+sys.path.insert(0, "..")
+import logging
+from itertools import product
+import json
+import hashlib
+from pathlib import Path
+from concurrent.futures import ProcessPoolExecutor
+import numpy as np
+from scipy.stats.qmc import LatinHypercube
+from factors import FACTORS, DEMAND_FUNCTIONS, SEEDS_PER_CONFIG
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)
+
+LH_SAMPLES = 10
+
+def generate_configs(lh_samples: int = LH_SAMPLES):
+    """cross all primary factors; within each primary cell draw lh_samples latin-hypercube points over the
+    secondary factors, map each coordinate to a level, and replicate every point once per seed"""
+    primary = [f for f in FACTORS if f.primary]
+    secondary = [f for f in FACTORS if not f.primary]
+
+    primary_names = [f.name for f in primary]
+    sampler = LatinHypercube(d=len(secondary), seed=42)  # fixed seed: same design on every call
+
+    runs = []
+    for cell in product(*[f.levels for f in primary]):
+        for point in sampler.random(n=lh_samples):  # fresh draw for each primary cell
+            settings = dict(zip(primary_names, cell))
+            for factor, u in zip(secondary, point):
+                # u lies in [0, 1): scale by the level count and truncate to get a level index
+                settings[factor.name] = factor.levels[int(u * len(factor.levels))]
+
+            for replicate in range(SEEDS_PER_CONFIG):
+                run = {**settings, "seed": replicate}
+                digest = hashlib.md5(json.dumps(run, sort_keys=True).encode()).hexdigest()
+                run["id"] = digest[:8]
+                runs.append(run)
+    return runs
+
+def run_single(cfg: dict) -> dict:
+    """run one random-policy episode (max 100 steps) for cfg; return id, config, and reward metrics"""
+    from engine.wrapper import PHANTOM
+    import numpy as np
+
+    np.random.seed(cfg["seed"])  # global numpy RNG: the demand_* samplers draw from it
+    demand_fn = DEMAND_FUNCTIONS[cfg["demand_fn"]]
+
+    env = PHANTOM(
+        n_products=cfg["n_products"],
+        alpha=cfg["alpha"],
+        N=cfg["N"],
+    )
+    env.market.demand = (demand_fn, (cfg["demand_mu"], cfg["demand_sigma"]))
+    # spaces keep their own RNG, so seed it too; otherwise sampled actions differ between reruns
+    env.action_space.seed(cfg["seed"])
+
+    env.reset()
+    total_reward, steps = 0.0, 0
+    for steps in range(1, 101):
+        _, reward, term, trunc, _ = env.step(env.action_space.sample())
+        total_reward += float(reward)  # plain float keeps the result json-serializable
+        if term or trunc: break  # gymnasium: no stepping past terminated *or* truncated without a reset
+
+    env.close()
+    return {
+        "id": cfg["id"],
+        "config": cfg,
+        "total_reward": total_reward,
+        "avg_reward": total_reward / steps,
+        "steps": steps,
+    }
+
+def run_study(max_workers: int = None, output: str = "results_mixed.jsonl", lh_samples: int = LH_SAMPLES):
+    """run every mixed-design config in a process pool, write one json object per line to output, return results"""
+    configs = generate_configs(lh_samples)
+    total = len(configs)
+    n_primary_cells = int(np.prod([len(f.levels) for f in FACTORS if f.primary]))
+    log.info(f"mixed LH: {total} configs ({n_primary_cells} primary × {lh_samples} LH × {SEEDS_PER_CONFIG} seeds)")
+    results = []
+    with ProcessPoolExecutor(max_workers=max_workers) as pool:
+        for done, outcome in enumerate(pool.map(run_single, configs), start=1):
+            results.append(outcome)
+            if done % 100 == 0: log.info(f"progress: {done}/{total}")  # heartbeat every 100 runs
+    Path(output).write_text("\n".join(map(json.dumps, results)))
+    log.info(f"wrote {len(results)} results to {output}")
+    return results
+
+if __name__ == "__main__":
+    # cli entry point: always log the design size, then run the study unless --dry-run is given
+    import argparse
+    p = argparse.ArgumentParser()
+    p.add_argument("--workers", type=int, default=None)
+    p.add_argument("--output", default="results_mixed.jsonl")
+    # default comes from LH_SAMPLES so the cli and the function defaults cannot drift apart
+    p.add_argument("--lh-samples", type=int, default=LH_SAMPLES)
+    p.add_argument("--dry-run", action="store_true", help="only show design size")
+    args = p.parse_args()
+    primary = [f for f in FACTORS if f.primary]
+    secondary = [f for f in FACTORS if not f.primary]
+    configs = generate_configs(args.lh_samples)
+    log.info(f"design: {len(configs)} runs | primary: {[f.name for f in primary]} | secondary (LH): {[f.name for f in secondary]}")
+    if not args.dry_run:
+        run_study(args.workers, args.output, args.lh_samples)

From ea4580184596b3a16a1b0ed432102af5c815a77c Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Fri, 30 Jan 2026 13:22:22 +0100
Subject: [PATCH 62/99] chore: removing the lab byproduct

---
 lab/README.md                         |  75 ------
 lab/__init__.py                       |  27 ---
 lab/case/__init__.py                  |   6 -
 lab/case/thesis/__init__.py           |  25 --
 lab/case/thesis/arrivals.py           | 327 --------------------------
 lab/case/thesis/execution.py          |  91 -------
 lab/case/thesis/metrics.py            | 102 --------
 lab/case/thesis/objectives.py         | 228 ------------------
 lab/case/thesis/platform.py           | 176 --------------
 lab/case/thesis/run_experiment.py     | 136 -----------
 lab/config.py                         | 156 ------------
 lab/docs/Makefile                     |  12 -
 lab/docs/conf.py                      |  39 ---
 lab/docs/index.rst                    |  40 ----
 lab/docs/modules/experiments.rst      |  14 --
 lab/docs/modules/outlet.rst           |  77 ------
 lab/docs/modules/population.rst       |  20 --
 lab/docs/system_overview.rst          |  97 --------
 lab/experiments/__init__.py           |   7 -
 lab/experiments/eval.py               | 213 -----------------
 lab/outlet/__init__.py                |  17 --
 lab/outlet/constants.py               |  83 -------
 lab/outlet/gym_wrapper.py             |  86 -------
 lab/outlet/math_util.py               |  57 -----
 lab/outlet/mechanisms/__init__.py     |   5 -
 lab/outlet/mechanisms/auction.py      |  73 ------
 lab/outlet/mechanisms/posted_price.py |  84 -------
 lab/outlet/mechanisms/two_sided.py    |  89 -------
 lab/outlet/objectives/__init__.py     |  11 -
 lab/outlet/objectives/base.py         |  48 ----
 lab/outlet/objectives/factory.py      |  82 -------
 lab/outlet/objectives/penalties.py    | 101 --------
 lab/outlet/observation.py             |  92 --------
 lab/outlet/platform.py                | 285 ----------------------
 lab/outlet/protocols.py               | 297 -----------------------
 lab/outlet/stock.py                   | 151 ------------
 lab/outlet/types.py                   | 318 -------------------------
 lab/population/__init__.py            |  10 -
 lab/population/arrivals.py            | 168 -------------
 lab/population/competitors.py         | 189 ---------------
 lab/population/execution.py           | 174 --------------
 lab/run_example.py                    |  59 -----
 42 files changed, 4347 deletions(-)
 delete mode 100644 lab/README.md
 delete mode 100644 lab/__init__.py
 delete mode 100644 lab/case/__init__.py
 delete mode 100644 lab/case/thesis/__init__.py
 delete mode 100644 lab/case/thesis/arrivals.py
 delete mode 100644 lab/case/thesis/execution.py
 delete mode 100644 lab/case/thesis/metrics.py
 delete mode 100644 lab/case/thesis/objectives.py
 delete mode 100644 lab/case/thesis/platform.py
 delete mode 100644 lab/case/thesis/run_experiment.py
 delete mode 100644 lab/config.py
 delete mode 100644 lab/docs/Makefile
 delete mode 100644 lab/docs/conf.py
 delete mode 100644 lab/docs/index.rst
 delete mode 100644 lab/docs/modules/experiments.rst
 delete mode 100644 lab/docs/modules/outlet.rst
 delete mode 100644 lab/docs/modules/population.rst
 delete mode 100644 lab/docs/system_overview.rst
 delete mode 100644 lab/experiments/__init__.py
 delete mode 100644 lab/experiments/eval.py
 delete mode 100644 lab/outlet/__init__.py
 delete mode 100644 lab/outlet/constants.py
 delete mode 100644 lab/outlet/gym_wrapper.py
 delete mode 100644 lab/outlet/math_util.py
 delete mode 100644 lab/outlet/mechanisms/__init__.py
 delete mode 100644 lab/outlet/mechanisms/auction.py
 delete mode 100644 lab/outlet/mechanisms/posted_price.py
 delete mode 100644 lab/outlet/mechanisms/two_sided.py
 delete mode 100644 lab/outlet/objectives/__init__.py
 delete mode 100644 lab/outlet/objectives/base.py
 delete mode 100644 lab/outlet/objectives/factory.py
 delete mode 100644 lab/outlet/objectives/penalties.py
 delete mode 100644 lab/outlet/observation.py
 delete mode 100644 lab/outlet/platform.py
 delete mode 100644 lab/outlet/protocols.py
 delete mode 100644 lab/outlet/stock.py
 delete mode 100644 lab/outlet/types.py
 delete mode 100644 lab/population/__init__.py
 delete mode 100644 lab/population/arrivals.py
 delete mode 100644 lab/population/competitors.py
 delete mode 100644 lab/population/execution.py
 delete mode 100644 lab/run_example.py

diff --git a/lab/README.md b/lab/README.md
deleted file mode 100644
index b5226aa..0000000
--- a/lab/README.md
+++ /dev/null
@@ -1,75 +0,0 @@
-# MOS (Money Operating System)
-
-Research-grade quote-control simulator for studying dynamic pricing and market making policies.
-The system models pricing as a closed loop of **Quote → Arrival → Execution → Position**, enabling
-controlled experimentation with demand models, inventory constraints, and reward shaping.
-
-## Core Loop
-
-1. **Quote** – the policy posts prices (one-sided or two-sided depending on the mechanism).
-2. **Arrival** – a population model generates purchase opportunities or market orders.
-3. **Execution** – an execution model decides whether an arrival converts at the quoted price.
-4. **Position** – inventory/position limits censor fills and generate holding/shortage costs.
-5. **Observation & Reward** – censored fills and aggregate metrics are exposed to the agent, while
-   objectives turn metrics into a scalar reward.
-
-Each stage is pluggable via light-weight protocols so you can swap in alternative mechanisms,
-demand models, or objectives without rewriting the rest of the simulator.
-
-## Package Layout
-
-| Module            | Purpose |
-|-------------------|---------|
-| `lab.outlet`      | Core simulation engine, domain types, pricing mechanisms, objectives. |
-| `lab.population`  | Demand arrival models, execution probability models, competitor/market dynamics. |
-| `lab.experiments` | Rollout utilities, baseline policies, and off-policy evaluation helpers. |
-| `lab.config`      | Convenience factories for preconfigured retail and market-making environments. |
-
-## Preconfigured Scenarios
-
-### Retail Dynamic Pricing
-- Mechanism: posted prices with margin and delta constraints.
-- Arrivals: browsing sessions with contamination support (scrapers).
-- Execution: elasticity model with competitor cross-effects.
-- Position: inventory tracking with holding and shortage costs.
-- Market: reactive competitor that can trigger price wars.
-- Objective: PnL minus volatility, holding cost, and lost opportunity penalties.
-
-```python
-from lab.config import make_retail_platform
-from lab.experiments import rollout, fixed_price_policy
-
-platform = make_retail_platform()
-policy = fixed_price_policy(platform.instruments.refs)
-result = rollout(platform, policy, n_steps=100)
-print(result.total_pnl)
-```
-
-### Market Making
-- Mechanism: two-sided quoting with bid/ask spreads.
-- Arrivals: Hawkes order flow for clustered demand.
-- Execution: Avellaneda–Stoikov style intensity model.
-- Position: inventory risk limits and quadratic penalty objective.
-- Market: geometric Brownian motion mid-price process.
-- Objective: PnL plus spread capture minus inventory risk.
-
-```python
-from lab.config import make_market_making_platform
-from lab.experiments import rollout
-
-platform = make_market_making_platform()
-mm_policy = lambda obs, t: (platform.instruments.refs, 1.0)
-result = rollout(platform, mm_policy, n_steps=200, seed=42)
-print(result.total_pnl)
-```
-
-## Extending the Simulator
-
-- Implement `lab.outlet.protocols.Mechanism` or `ArrivalModel` to introduce new pricing
-domains or demand processes.
-- Compose objectives with `lab.outlet.objectives.factory.make_composite` to study alternate
-reward formulations.
-- Use `lab.experiments.compare_policies` to benchmark candidate policies across multiple
-random seeds.
-
-Comprehensive API documentation lives in `lab/docs` (build with `make html`).
diff --git a/lab/__init__.py b/lab/__init__.py
deleted file mode 100644
index cc6df0c..0000000
--- a/lab/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-"""
-Quote-Control Simulator: Research-grade platform for dynamic pricing and market making
-
-The platform abstracts pricing as: Quote -> Arrival -> Execution -> Position
-Supports multiple mechanisms:
-  - PostedPrice: retail dynamic pricing
-  - TwoSided: market making with bid-ask spreads
-  - Auction: reserve/shading for auction settings
-
-Example usage:
-    from lab.config import make_retail_platform
-    from lab.experiments import rollout, fixed_price_policy
-
-    platform = make_retail_platform()
-    policy = fixed_price_policy(platform.instruments.refs)
-    result = rollout(platform, policy, n_steps=100)
-    print(f"Total PnL: {result.total_pnl:.2f}")
-"""
-
-from .config import make_retail_platform, make_market_making_platform, RetailConfig, MarketMakingConfig
-from .outlet import Platform, PlatformConfig, Quote, Observation, StepResult
-
-__all__ = [
-    'make_retail_platform', 'make_market_making_platform',
-    'RetailConfig', 'MarketMakingConfig',
-    'Platform', 'PlatformConfig', 'Quote', 'Observation', 'StepResult',
-]
diff --git a/lab/case/__init__.py b/lab/case/__init__.py
deleted file mode 100644
index 44fbf8c..0000000
--- a/lab/case/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-"""
-Case studies implementing specific research scenarios.
-
-Available cases:
-- thesis: PHANTOM thesis implementation with contaminated demand and DR-RL
-"""
diff --git a/lab/case/thesis/__init__.py b/lab/case/thesis/__init__.py
deleted file mode 100644
index 31db465..0000000
--- a/lab/case/thesis/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-"""
-Thesis-specific implementation of the PHANTOM pricing defense framework.
-
-This module implements the mathematical models from the thesis:
-- ContaminatedArrivalModel: Mixture demand Q(p) = (1-α)d_H + αd_A (Eq 3)
-- HybridExecutionModel: Divergent H/A behavior with separability (Section 2.1)
-- RobustStackelbergObjective: Maximin objective with COI penalty (Eq 23)
-- COIMetrics: Cost of Information tracking (Definition 1)
-
-The platform configuration creates a research environment that directly
-maps to the thesis mathematical framework for DR-RL experiments.
-"""
-from .arrivals import ContaminatedArrivalModel, ContaminatedArrivalConfig
-from .execution import HybridExecutionModel, HybridExecutionConfig
-from .objectives import RobustStackelbergObjective, COIObjective
-from .platform import make_thesis_platform, ThesisConfig
-from .metrics import COIMetrics, compute_coi, compute_separability
-
-__all__ = [
-    'ContaminatedArrivalModel', 'ContaminatedArrivalConfig',
-    'HybridExecutionModel', 'HybridExecutionConfig',
-    'RobustStackelbergObjective', 'COIObjective',
-    'make_thesis_platform', 'ThesisConfig',
-    'COIMetrics', 'compute_coi', 'compute_separability',
-]
diff --git a/lab/case/thesis/arrivals.py b/lab/case/thesis/arrivals.py
deleted file mode 100644
index 909cab5..0000000
--- a/lab/case/thesis/arrivals.py
+++ /dev/null
@@ -1,327 +0,0 @@
-"""Contaminated arrivals using learned MDP kernels from behavior_loader.
-
-Implements thesis demand model (Section 3.1):
-- Aggregate demand Q(p) = (1-α)E[d(p;θ_H)] + αE[d(p;θ_A)] + ε_t  (Eq 3)
-- Demand proxy q̂_{t,i} = Σ_s Σ_k ω(a_{s,k}) · 1[i_{s,k} = i]     (Eq 2)
-- Per-session separability via KL divergence Δ_H, Δ_A              (Eq 20-21)
-
-The arrival model samples sessions from a mixture of human/agent behavioral profiles,
-each session produces a trajectory τ_s and associated demand computation q(τ').
-"""
-from __future__ import annotations
-from dataclasses import dataclass, field
-from types import SimpleNamespace
-from typing import Dict, List, Tuple, Optional
-import numpy as np
-from ...outlet.types import Opportunity, InstrumentSet, MarketState, HiddenState
-from ...outlet.constants import Side, OpportunityType
-from ...outlet.math_util import poisson_arrivals
-
-try:
-    import sys
-    from pathlib import Path
-    sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
-    from sim.rl.behavior_loader.models import (
-        BehaviorModel, AgentBehaviorModel, aggregate_event_transitions, kl_divergence
-    )
-    REAL_MDP = True
-except ImportError:
-    REAL_MDP = False
-    kl_divergence = None
-
-EVENT_PAGE = {"session_start": "/", "view_item_page": "/products", "learn_more_about_item": "/products/details",
-              "add_item_to_cart": "/cart", "purchase_complete": "/checkout", "session_end": "/checkout/success"}
-EVENT_CANON = {"page_view": "session_start", "hover_over_paragraph": "view_item_page", "hover_over_title": "view_item_page",
-               "view_item_page": "view_item_page", "learn_more_about_item": "learn_more_about_item",
-               "add_item_to_cart": "add_item_to_cart", "checkout_start": "purchase_complete", "remove_item": "view_item_page"}
-
-# action space partition A = A_nav ∪ A_cart ∪ A_filter ∪ A_dwell with signal weights ω (Table 1)
-ACTION_WEIGHTS: Dict[str, float] = {
-    "add_item_to_cart": 0.8, "remove_item": 0.6, "checkout_start": 0.9, "purchase_complete": 1.0,  # A_cart
-    "hover_over_title": 0.3, "hover_over_paragraph": 0.35, "hover_over_link": 0.25,               # A_dwell
-    "page_view": 0.1, "session_start": 0.05, "view_item_page": 0.15, "learn_more_about_item": 0.2, # A_nav
-    "search": 0.05, "filter_date": 0.05, "filter_price": 0.08, "sort": 0.03, "session_end": 0.0,   # A_filter
-}
-
-
-@dataclass
-class SessionDemand:
-    """Per-session demand computation per thesis formulation (Section 3.1).
-
-    Each session s ∈ S produces trajectory τ_s and demand proxy q̂. The platform uses
-    divergence signals Δ_H, Δ_A to estimate per-session contamination α̂(τ').
-    """
-    session_id: str
-    q: Dict[int, float]               # q̂_i demand proxy per product (Eq 2)
-    trajectory: List[Dict]            # τ_s = (e_{s,1}, ..., e_{s,L_s})
-    delta_h: float = 0.0              # D_KL(T̂' || T̄_H) (Eq 20)
-    delta_a: float = 0.0              # D_KL(T̂' || T̄_A) (Eq 21)
-    alpha_hat: float = 0.0            # per-session contamination estimate
-    actor_class: str = "H"            # ground truth Y_s ∈ {H, A}
-    theta: Dict[str, float] = field(default_factory=dict)
-
-
-def compute_demand_proxy(events: List[Dict], n_products: int) -> Dict[int, float]:
-    """Compute q̂_{t,i} = Σ_k ω(a_{s,k}) · 1[i_{s,k} = i] per Eq 2."""
-    q = {i: 0.0 for i in range(n_products)}
-    for e in events:
-        action, pidx = e.get("eventName", ""), e.get("product_idx")
-        if pidx is not None and 0 <= pidx < n_products:
-            q[pidx] += ACTION_WEIGHTS.get(action, 0.1)
-    return q
-
-
-def compute_session_divergence(events: List[Dict], ref_h: Dict, ref_a: Dict) -> Tuple[float, float]:
-    """Compute Δ_H, Δ_A divergence signals from trajectory (Eq 20-21)."""
-    if not events or kl_divergence is None:
-        return 0.0, 0.0
-    # build empirical transition kernel from trajectory
-    trans: Dict[str, Dict[str, int]] = {}
-    prev = "session_start"
-    for e in events:
-        curr = e.get("eventName", "session_end")
-        trans.setdefault(prev, {})
-        trans[prev][curr] = trans[prev].get(curr, 0) + 1
-        prev = curr
-    # normalize to probabilities
-    kernel = {}
-    for s, dests in trans.items():
-        total = sum(dests.values())
-        kernel[s] = {d: c / total for d, c in dests.items()} if total > 0 else {}
-    # aggregate to event-level and compute KL divergence against reference kernels
-    delta_h = sum(kl_divergence(kernel.get(s, {}), ref_h.get(s, {})) for s in kernel) / max(len(kernel), 1)
-    delta_a = sum(kl_divergence(kernel.get(s, {}), ref_a.get(s, {})) for s in kernel) / max(len(kernel), 1)
-    return delta_h, delta_a
-
-def _canonicalize(raw: Dict) -> Dict:
-    out = {}
-    for src, dsts in raw.items():
-        sc = EVENT_CANON.get(src, src)
-        out.setdefault(sc, {})
-        for dst, p in dsts.items():
-            dc = EVENT_CANON.get(dst, dst)
-            out[sc][dc] = out[sc].get(dc, 0.0) + p
-    return {s: {k: v/sum(d.values()) for k, v in d.items()} for s, d in out.items() if sum(d.values()) > 0}
-
-
-class BehavioralProfile:
-    """Markov profile from learned MDP kernels (Section 3.5.2).
-
-    Transition kernel T̂_Y estimated via MLE: P̂(s'|s) = N(s,s') / Σ_k N(s,k) (Eq 19)
-    """
-    STATES = ["session_start", "view_item_page", "learn_more_about_item", "add_item_to_cart", "purchase_complete", "session_end"]
-    # fallback kernels T̄_H, T̄_A when real data unavailable
-    FALLBACK_H = {"session_start": {"view_item_page": 0.85, "session_end": 0.15},
-                  "view_item_page": {"learn_more_about_item": 0.4, "add_item_to_cart": 0.3, "view_item_page": 0.2, "session_end": 0.1},
-                  "learn_more_about_item": {"add_item_to_cart": 0.5, "view_item_page": 0.3, "session_end": 0.2},
-                  "add_item_to_cart": {"purchase_complete": 0.6, "view_item_page": 0.25, "session_end": 0.15},
-                  "purchase_complete": {"session_end": 1.0}}
-    FALLBACK_A = {"session_start": {"view_item_page": 0.95, "session_end": 0.05},
-                  "view_item_page": {"learn_more_about_item": 0.6, "view_item_page": 0.25, "add_item_to_cart": 0.1, "session_end": 0.05},
-                  "learn_more_about_item": {"view_item_page": 0.5, "add_item_to_cart": 0.15, "learn_more_about_item": 0.3, "session_end": 0.05},
-                  "add_item_to_cart": {"view_item_page": 0.4, "purchase_complete": 0.2, "session_end": 0.4},
-                  "purchase_complete": {"session_end": 1.0}}
-
-    def __init__(self, actor: str, pprobs: np.ndarray, data_dir: str = ""):
-        self.actor, self.pprobs = actor, np.clip(pprobs, 0.0, 0.95)
-        self.trans = self._load(data_dir)  # T̂_Y transition kernel
-        self._ensure_terminal()
-        self.dwell = {s: (1.2, 0.5) if actor == "agents" else (2.0, 1.2) for s in self.STATES}
-
-    def _load(self, data_dir: str) -> Dict:
-        if not REAL_MDP or not data_dir:
-            print("using fallback")
-            return dict(self.FALLBACK_A if self.actor == "agents" else self.FALLBACK_H)
-        try:
-            mdp = (AgentBehaviorModel if self.actor == "agents" else BehaviorModel)(data_dir).build_MDP()
-            raw = aggregate_event_transitions(mdp) if mdp.get("transitions") else {}
-            return _canonicalize(raw) if raw else dict(self.FALLBACK_A if self.actor == "agents" else self.FALLBACK_H)
-        except Exception:
-            print("using fallback")
-            return dict(self.FALLBACK_A if self.actor == "agents" else self.FALLBACK_H)
-
-    def _ensure_terminal(self):
-        self.trans.setdefault("purchase_complete", {})["session_end"] = self.trans.get("purchase_complete", {}).get("session_end", 1.0)
-        self.trans.setdefault("session_start", {"view_item_page": 0.7, "learn_more_about_item": 0.2, "session_end": 0.1})
-
-    def _tprobs(self, state: str, pidx: int) -> Dict[str, float]:
-        probs = dict(self.trans.get(state, {"session_end": 1.0}))
-        if state == "add_item_to_cart":
-            base = probs.get("purchase_complete", 0.0)
-            df = float(self.pprobs[pidx]) * (0.3 if self.actor == "agents" else 1.0)
-            adj = np.clip(base * 0.5 + df * 0.5, 0.0, 0.95)
-            rem = max(1e-6, 1.0 - adj)
-            other = sum(v for k, v in probs.items() if k != "purchase_complete")
-            probs = {k: (adj if k == "purchase_complete" else v * rem / max(other, 1e-6)) for k, v in probs.items()}
-        total = sum(probs.values())
-        return {k: v/total for k, v in probs.items()} if total > 0 else {"session_end": 1.0}
-
-    def sample(self, rng: np.random.Generator, sid: str, prices: np.ndarray, costs: np.ndarray) -> Tuple[List[Dict], List[SimpleNamespace]]:
-        events, fevts = [], []
-        state, t, pidx = "session_start", 0.0, int(rng.integers(0, len(prices)))
-        cost, cprice = float(costs[pidx]), max(float(prices[pidx]), float(costs[pidx]) * 1.05)
-
-        while state != "session_end" and len(events) < 40:
-            if state != "session_start":
-                row = {"session_id": sid, "actor": "agent" if self.actor == "agents" else "human",
-                       "eventName": state, "product_idx": pidx, "productId": f"product-{pidx:04d}",
-                       "price_offered": cprice, "price_paid": 0.0, "page": EVENT_PAGE.get(state, "/"),
-                       "ts": t, "unit_cost": cost, "base_price": float(prices[pidx])}
-                if state == "purchase_complete":
-                    row["price_paid"] = max(cprice * (1.0 + rng.normal(0.0, 0.015)), cost)
-                events.append(row)
-                fevts.append(SimpleNamespace(eventName=state, page=row["page"], productId=row["productId"], ts=t))
-
-            probs = self._tprobs(state, pidx)
-            state = rng.choice(list(probs.keys()), p=list(probs.values()))
-            sh, sc = self.dwell.get(state, (2.0, 1.0))
-            t += max(0.3, rng.gamma(shape=sh, scale=sc))
-        return events, fevts
-
-
-@dataclass
-class ContaminatedArrivalConfig:
-    base_rate: float = 20.0
-    alpha_contamination: float = 0.2
-    alpha_drift: float = 0.0
-    alpha_bounds: tuple[float, float] = (0.0, 0.5)
-    human_views_range: tuple[int, int] = (1, 4)
-    agent_views_range: tuple[int, int] = (3, 10)
-    agent_systematic: bool = True
-    use_real_behavior: bool = True
-    human_data_dir: str = ""
-    agent_data_dir: str = ""
-
-
-class ContaminatedArrivalModel:
-    """Mixture model Q(p) = (1-α)E[d(p;θ_H)] + αE[d(p;θ_A)] + ε_t (Eq 3).
-
-    Samples sessions from human/agent behavioral profiles, computes per-session
-    demand proxy q̂ and divergence signals Δ_H, Δ_A for separability.
-    """
-
-    def __init__(self, cfg: ContaminatedArrivalConfig | None = None):
-        self.cfg = cfg or ContaminatedArrivalConfig()
-        self._alpha = self.cfg.alpha_contamination
-        self._scount = 0
-        self._profiles: Dict[str, BehavioralProfile] = {}
-        self._ref_kernels: Dict[str, Dict] = {}  # T̄_H, T̄_A reference kernels
-        self._session_demands: List[SessionDemand] = []  # collected session demands
-
-    @property
-    def alpha(self) -> float:
-        return self._alpha
-
-    def _profile(self, actor: str, pprobs: np.ndarray) -> BehavioralProfile:
-        key = actor
-        if key not in self._profiles:
-            ddir = self.cfg.agent_data_dir if actor == "agents" else self.cfg.human_data_dir
-            if not ddir and self.cfg.use_real_behavior:
-                base = Path(__file__).parent.parent.parent.parent / "experiments"
-                ddir = str(base / ("agents/collected_data" if actor == "agents" else "collected_data"))
-            profile = BehavioralProfile(actor, pprobs, ddir if self.cfg.use_real_behavior else "")
-            self._profiles[key] = profile
-            self._ref_kernels[key] = profile.trans  # cache T̄_Y for divergence
-        return self._profiles[key]
-
-    def get_ref_kernels(self) -> Tuple[Dict, Dict]:
-        """Return reference transition kernels T̄_H, T̄_A for divergence computation."""
-        return (self._ref_kernels.get("humans", BehavioralProfile.FALLBACK_H),
-                self._ref_kernels.get("agents", BehavioralProfile.FALLBACK_A))
-
-    def get_session_demands(self) -> List[SessionDemand]:
-        """Return collected session demands for downstream analysis."""
-        return self._session_demands
-
-    def sample(self, t: float, dt: float, instruments: InstrumentSet,
-               market: MarketState | None, hidden: HiddenState, rng: np.random.Generator) -> list[Opportunity]:
-        """Sample arrivals as per Eq 3: mixture of human/agent demand distributions.
-
-        For each session s, computes:
-        - Trajectory τ_s from behavioral profile sampling
-        - Demand proxy q̂ via weighted action aggregation (Eq 2)
-        - Divergence signals Δ_H, Δ_A for separability (Eq 20-21)
-        - Per-session contamination estimate α̂(τ')
-        """
-        cfg = self.cfg
-        if cfg.alpha_drift != 0:
-            self._alpha = np.clip(self._alpha + cfg.alpha_drift * rng.normal(), *cfg.alpha_bounds)
-        hidden.contamination = self._alpha
-
-        n_sess = poisson_arrivals(cfg.base_rate * hidden.true_demand_intensity, dt, rng)
-        prices, costs = instruments.refs, instruments.costs
-        margin = np.clip((prices - costs) / np.maximum(costs, 1e-3), -0.9, 2.0)
-        hprob, aprob = 0.08 * np.exp(-1.2 * margin), 0.05 * np.exp(-0.6 * margin)
-        ref_h, ref_a = self.get_ref_kernels()
-
-        opps = []
-        for _ in range(n_sess):
-            self._scount += 1
-            sid = f"s{self._scount:06d}"
-            is_agent = rng.random() < self._alpha
-            actor, probs = ("agents", aprob) if is_agent else ("humans", hprob)
-            profile = self._profile(actor, probs)
-            events, fevts = profile.sample(rng, sid, prices, costs)
-
-            # compute demand proxy q̂ per Eq 2
-            q = compute_demand_proxy(events, instruments.n)
-
-            # compute divergence signals Δ_H, Δ_A per Eq 20-21
-            delta_h, delta_a = compute_session_divergence(events, ref_h, ref_a)
-            # per-session contamination estimate α̂(τ') = σ(β(Δ_H - Δ_A))
-            alpha_hat = 1.0 / (1.0 + np.exp(-2.0 * (delta_h - delta_a))) if (delta_h + delta_a) > 0 else 0.5
-
-            theta = ({'price_sensitivity': rng.uniform(0.05, 0.2), 'base_conversion': 0.01, 'info_value': 1.0} if is_agent
-                     else {'price_sensitivity': rng.uniform(1.5, 4.0), 'base_conversion': rng.uniform(0.2, 0.5), 'info_value': 0.0})
-
-            # store session demand for downstream analysis
-            self._session_demands.append(SessionDemand(
-                session_id=sid, q=q, trajectory=events, delta_h=delta_h, delta_a=delta_a,
-                alpha_hat=alpha_hat, actor_class="A" if is_agent else "H", theta=theta))
-
-            viewed = list({e["product_idx"] for e in events if "product_idx" in e})
-            if not viewed:
-                vr = cfg.agent_views_range if is_agent else cfg.human_views_range
-                viewed = list(rng.choice(instruments.n, size=min(rng.integers(*vr), instruments.n), replace=False))
-
-            for vi, iid in enumerate(viewed):
-                opps.append(Opportunity(
-                    id=f"{sid}-{iid}", type=OpportunityType.SESSION, side=Side.BUY,
-                    instrument_id=int(iid), size=1.0, t=t + rng.uniform(0, dt),
-                    context={'session_id': sid, 'actor_class': 'AGENT' if is_agent else 'HUMAN', 'is_agent': is_agent,
-                             'reconnaissance_intent': is_agent, 'view_index': vi, 'total_views': len(viewed),
-                             'theta': theta, 'trajectory_events': fevts, 'mdp_trajectory': events,
-                             'demand_proxy': q, 'alpha_hat': alpha_hat, 'delta_h': delta_h, 'delta_a': delta_a}))
-        return opps
-
-
-@dataclass
-class AdversarialArrivalConfig:
-    base_rate: float = 5.0
-    n_parallel_agents: int = 3
-    query_all_products: bool = True
-
-
-class AdversarialArrivalModel:
-    """Adversarial coordination (Theorem 1): as N->inf, COI->0."""
-
-    def __init__(self, cfg: AdversarialArrivalConfig | None = None):
-        self.cfg = cfg or AdversarialArrivalConfig()
-        self._qcount = 0
-
-    def sample(self, t: float, dt: float, instruments: InstrumentSet,
-               market: MarketState | None, hidden: HiddenState, rng: np.random.Generator) -> list[Opportunity]:
-        cfg, opps = self.cfg, []
-        for _ in range(poisson_arrivals(cfg.base_rate, dt, rng)):
-            self._qcount += 1
-            for ai in range(cfg.n_parallel_agents):
-                sid = f"adv{self._qcount:06d}-{ai}"
-                prods = np.arange(instruments.n) if cfg.query_all_products else rng.choice(instruments.n, size=1)
-                for iid in prods:
-                    opps.append(Opportunity(
-                        id=f"{sid}-{iid}", type=OpportunityType.SESSION, side=Side.BUY,
-                        instrument_id=int(iid), size=1.0, t=t,
-                        context={'session_id': sid, 'actor_class': 'AGENT', 'is_agent': True, 'adversarial': True,
-                                 'agent_index': ai, 'query_group': self._qcount,
-                                 'theta': {'price_sensitivity': 0.0, 'base_conversion': 0.0, 'info_value': 1.0}}))
-        return opps
diff --git a/lab/case/thesis/execution.py b/lab/case/thesis/execution.py
deleted file mode 100644
index 5d2aa37..0000000
--- a/lab/case/thesis/execution.py
+++ /dev/null
@@ -1,91 +0,0 @@
-"""Execution models with divergent H/A behavior using ground truth labels."""
-from __future__ import annotations
-from dataclasses import dataclass
-from typing import Any, Dict
-import numpy as np
-from ...outlet.types import Opportunity, Quote, InstrumentSet, MarketState
-from ...outlet.math_util import sigmoid, safe_log, EPS
-
-
-@dataclass
-class HybridExecutionConfig:
-    human_base_prob: float = 0.3
-    human_elasticity: float = 2.5
-    agent_conversion: float = 0.01
-    cross_elasticity: float = 0.4
-    quality_weight: float = 0.2
-    use_separability: bool = False
-
-
-class HybridExecutionModel:
-    """Execution with divergent H/A behavior using ground truth labels."""
-
-    def __init__(self, cfg: HybridExecutionConfig | None = None):
-        self.cfg = cfg or HybridExecutionConfig()
-
-    def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet,
-             market: MarketState | None, rng: np.random.Generator) -> float:
-        cfg, idx = self.cfg, int(opp.instrument_id)
-        price, ref, cost = float(quote.prices[idx]), float(instruments.refs[idx]), float(instruments.costs[idx])
-        ctx = opp.context
-        theta = ctx.get('theta', {})
-        is_agent = ctx.get('is_agent', False)
-
-        if is_agent:
-            return cfg.agent_conversion * theta.get('base_conversion', 1.0)
-
-        # human logit discrete choice
-        sens = theta.get('price_sensitivity', cfg.human_elasticity)
-        base = theta.get('base_conversion', cfg.human_base_prob)
-        u_price = -sens * safe_log(price / (ref + EPS))
-        quality = instruments.instruments[idx].attrs.get('quality', 0.5)
-        u_quality = cfg.quality_weight * quality
-
-        u_comp = 0.0
-        if market and market.competitor_quotes is not None:
-            cp = market.competitor_quotes[idx]
-            if cp < price:
-                u_comp = -cfg.cross_elasticity * (price - cp) / ref
-
-        utility = safe_log(base / (1 - base + EPS)) + u_price + u_quality + u_comp
-        return float(sigmoid(utility))
-
-    def uncensor(self, fills: np.ndarray, instruments: InstrumentSet, context: dict[str, Any] | None = None) -> np.ndarray:
-        if context is None:
-            return fills / (self.cfg.human_base_prob + EPS)
-        agent_frac = context.get('contamination', 0.0)
-        return fills / (self.cfg.human_base_prob * (1 - agent_frac) + EPS)
-
-
-@dataclass
-class SeparableExecutionConfig:
-    human_funnel: Dict[str, float] = None
-    agent_funnel: Dict[str, float] = None
-
-    def __post_init__(self):
-        self.human_funnel = self.human_funnel or {'view_to_detail': 0.4, 'detail_to_cart': 0.3, 'cart_to_purchase': 0.6}
-        self.agent_funnel = self.agent_funnel or {'view_to_detail': 0.8, 'detail_to_cart': 0.05, 'cart_to_purchase': 0.1}
-
-
-class SeparableExecutionModel:
-    """Execution with Markov funnel kernels using ground truth labels."""
-
-    def __init__(self, cfg: SeparableExecutionConfig | None = None):
-        self.cfg = cfg or SeparableExecutionConfig()
-
-    def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet,
-             market: MarketState | None, rng: np.random.Generator) -> float:
-        is_agent = opp.context.get('is_agent', False)
-        probs = self.cfg.agent_funnel if is_agent else self.cfg.human_funnel
-        p = probs['view_to_detail'] * probs['detail_to_cart'] * probs['cart_to_purchase']
-
-        if not is_agent:
-            idx = int(opp.instrument_id)
-            price_ratio = quote.prices[idx] / (instruments.refs[idx] + EPS)
-            p *= np.exp(-0.5 * (price_ratio - 1.0))
-        return float(np.clip(p, 0, 1))
-
-    def uncensor(self, fills: np.ndarray, instruments: InstrumentSet, context: dict[str, Any] | None = None) -> np.ndarray:
-        h = self.cfg.human_funnel
-        exp_conv = h['view_to_detail'] * h['detail_to_cart'] * h['cart_to_purchase']
-        return fills / (exp_conv + EPS)
diff --git a/lab/case/thesis/metrics.py b/lab/case/thesis/metrics.py
deleted file mode 100644
index 0cd9680..0000000
--- a/lab/case/thesis/metrics.py
+++ /dev/null
@@ -1,102 +0,0 @@
-"""Thesis metrics for COI and behavioral analysis using ground truth labels."""
-from __future__ import annotations
-from dataclasses import dataclass, field
-from typing import Dict
-import numpy as np
-from ...outlet.types import StepLogs, StepMetrics, Quote, InstrumentSet
-from ...outlet.math_util import safe_log, EPS
-
-
-@dataclass
-class COIMetrics:
-    coi_level: float = 0.0
-    coi_leakage: float = 0.0
-    realized_premium: float = 0.0
-    theoretical_max: float = 0.0
-    erosion_rate: float = 0.0
-
-    def to_dict(self) -> dict[str, float]:
-        return {k: getattr(self, k) for k in ['coi_level', 'coi_leakage', 'realized_premium', 'theoretical_max', 'erosion_rate']}
-
-
-def compute_coi(quote: Quote, instruments: InstrumentSet, metrics: StepMetrics, contamination: float) -> COIMetrics:
-    prices, costs, refs = quote.prices, instruments.costs, instruments.refs
-    margins = prices - costs
-    coi_level = float(np.mean(margins))
-    theoretical_max = float(np.mean(costs))
-    realized_premium = (metrics.revenue - metrics.cost) / metrics.units_traded if metrics.units_traded > 0 else 0.0
-    price_var = float(np.var(prices / refs))
-    coi_leakage = contamination * (coi_level + price_var)
-    erosion_rate = contamination * coi_level / (theoretical_max + EPS)
-    return COIMetrics(coi_level=coi_level, coi_leakage=coi_leakage, realized_premium=realized_premium,
-                      theoretical_max=theoretical_max, erosion_rate=erosion_rate)
-
-
-@dataclass
-class SeparabilityMetrics:
-    classification_accuracy: float = 0.0
-    estimated_alpha: float = 0.0
-    n_human_sessions: int = 0
-    n_agent_sessions: int = 0
-
-
-def compute_separability(logs: StepLogs, true_alpha: float) -> SeparabilityMetrics:
-    """Compute separability using ground truth labels only."""
-    if logs.events is None or len(logs.events) == 0:
-        return SeparabilityMetrics(estimated_alpha=true_alpha)
-
-    sessions: Dict[str, bool] = {}
-    for evt in logs.events:
-        sid = evt.metadata.get('session_id', evt.opportunity_id)
-        if sid not in sessions:
-            sessions[sid] = evt.metadata.get('is_agent', False)
-
-    n_agent = sum(1 for is_agent in sessions.values() if is_agent)
-    n_human = len(sessions) - n_agent
-    est_alpha = n_agent / len(sessions) if sessions else 0.0
-
-    return SeparabilityMetrics(
-        classification_accuracy=1.0,  # ground truth is always correct
-        estimated_alpha=est_alpha,
-        n_human_sessions=n_human,
-        n_agent_sessions=n_agent)
-
-
-@dataclass
-class RevenueAttribution:
-    total_revenue: float = 0.0
-    human_revenue: float = 0.0
-    agent_revenue: float = 0.0
-    human_conversion: float = 0.0
-    agent_conversion: float = 0.0
-
-
-def compute_attribution(logs: StepLogs, metrics: StepMetrics) -> RevenueAttribution:
-    if logs.executions is None:
-        return RevenueAttribution(total_revenue=metrics.revenue)
-
-    human_rev, agent_rev, human_cnt, agent_cnt = 0.0, 0.0, 0, 0
-    for exe in logs.executions:
-        if exe.propensity < 0.05:
-            agent_rev += exe.price * exe.size_filled
-            agent_cnt += 1
-        else:
-            human_rev += exe.price * exe.size_filled
-            human_cnt += 1
-
-    total_exp = logs.aggregates.get('n_arrivals', 1)
-    return RevenueAttribution(
-        total_revenue=metrics.revenue, human_revenue=human_rev, agent_revenue=agent_rev,
-        human_conversion=human_cnt / (total_exp * 0.8 + EPS),
-        agent_conversion=agent_cnt / (total_exp * 0.2 + EPS))
-
-
-def order_statistic_erosion(n_agents: int, price_variance: float) -> float:
-    """COI erosion from Theorem 1: as N->inf, min(p_1..p_N)->p_min."""
-    if n_agents <= 1:
-        return 0.0
-    sigma, log_n = np.sqrt(price_variance), safe_log(n_agents)
-    if log_n < 1:
-        return 0.0
-    shift = sigma * (np.sqrt(2 * log_n) - (safe_log(log_n) + safe_log(4 * np.pi)) / (2 * np.sqrt(2 * log_n) + EPS))
-    return float(min(shift / (sigma * 2 + EPS), 1.0))
diff --git a/lab/case/thesis/objectives.py b/lab/case/thesis/objectives.py
deleted file mode 100644
index ba70320..0000000
--- a/lab/case/thesis/objectives.py
+++ /dev/null
@@ -1,228 +0,0 @@
-"""
-Thesis-specific objectives implementing robust pricing under contamination.
-
-Implements the Maximin objective from Eq 23:
-π* = argmax_π min_{Q ∈ U_ε} E_d~Q[R(p,d) - λ·COI(p)]
-
-Key components:
-- COIObjective: Cost of Information penalty (Definition 1)
-- RobustStackelbergObjective: Full maximin objective with Wasserstein robustness
-- UXPenalty: User experience degradation from volatility
-"""
-from __future__ import annotations
-from dataclasses import dataclass
-import numpy as np
-from ...outlet.objectives.base import BaseObjective, CompositeObjective
-from ...outlet.types import Quote, InstrumentSet, StepMetrics, HiddenState, Observation
-from ...outlet.math_util import safe_log, EPS
-
-class COIObjective(BaseObjective):
-    """Cost of Information penalty from Definition 1.
-
-    COI(π) = E[P] - p_min
-
-    The expected price premium over marginal cost represents the platform's
-    pricing power. Agent reconnaissance erodes this by revealing price
-    distribution to buyers.
-
-    We implement COI_leakage = f(τ') · InfoValue(p, τ')
-    where f(τ') is the estimated agent probability.
-    """
-
-    def __init__(self, lambda_coi: float = 1.0, use_revelation: bool = False):
-        """
-        Args:
-            lambda_coi: Weight on COI penalty
-            use_revelation: If True, use -log(π(p)) as info value (penalizes rare prices)
-        """
-        self.lambda_coi = lambda_coi
-        self.use_revelation = use_revelation
-
-    def reward(self, quote: Quote, instruments: InstrumentSet,
-               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
-        # COI_leakage = α · InfoValue
-        alpha = hidden.contamination
-
-        if self.use_revelation:
-            # revelation surrogate: rare prices reveal more about policy
-            # InfoValue = -log(π(p|τ')) ≈ surprise of the price
-            price_surprise = np.mean(np.abs(quote.prices - instruments.refs) / (instruments.refs + EPS))
-            info_value = price_surprise
-        else:
-            # query-tax surrogate: each agent query incurs constant leakage
-            info_value = 1.0
-
-        leakage = alpha * info_value
-        return -self.lambda_coi * leakage
-
-    def breakdown(self, quote: Quote, instruments: InstrumentSet,
-                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
-        alpha = hidden.contamination
-        margins = (quote.prices - instruments.costs) / (instruments.costs + EPS)
-        return {
-            'coi_penalty': self.reward(quote, instruments, metrics, hidden, obs),
-            'contamination': alpha,
-            'avg_margin': float(np.mean(margins)),
-        }
-
-@dataclass
-class RobustObjectiveConfig:
-    """Configuration for robust Stackelberg objective.
-
-    Attributes:
-        lambda_coi: Weight on COI penalty (λ in Eq 23)
-        lambda_ux: Weight on UX penalty
-        lambda_volatility: Weight on price volatility penalty
-        gamma_inventory: Inventory risk aversion
-        wasserstein_epsilon: Ambiguity set radius (ε in Eq 21)
-    """
-    lambda_coi: float = 0.5
-    lambda_ux: float = 0.1
-    lambda_volatility: float = 0.2
-    gamma_inventory: float = 0.1
-    wasserstein_epsilon: float = 0.1
-
-class RobustStackelbergObjective(BaseObjective):
-    """Implements the Maximin Objective from thesis Eq 23.
-
-    π* = argmax_π min_{Q ∈ U_ε(P̂_N)} E_d~Q[R(p,d) - λ·COI(p)]
-
-    The objective balances:
-    1. Revenue R(p,d) from human purchases
-    2. COI penalty for information leakage to agents
-    3. UX penalty for price volatility
-    4. Inventory/holding costs
-
-    The min over ambiguity set U_ε is approximated by penalizing
-    high contamination scenarios more heavily.
-    """
-
-    def __init__(self, cfg: RobustObjectiveConfig | None = None):
-        self.cfg = cfg or RobustObjectiveConfig()
-
-    def reward(self, quote: Quote, instruments: InstrumentSet,
-               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
-        cfg = self.cfg
-
-        # 1. base revenue (R(p,d))
-        revenue = metrics.revenue
-        cost = metrics.cost
-        profit = revenue - cost
-
-        # 2. COI penalty: scales with contamination and margin extraction
-        # high margins + high contamination = high leakage
-        alpha = hidden.contamination
-        margins = quote.prices - instruments.costs
-        avg_margin = float(np.mean(margins))
-        coi_penalty = cfg.lambda_coi * avg_margin * alpha
-
-        # 3. UX penalty: price volatility harms legitimate users
-        volatility_penalty = cfg.lambda_volatility * metrics.volatility
-
-        # 4. inventory/position cost
-        position_penalty = cfg.gamma_inventory * metrics.position_cost
-
-        # 5. lost opportunity cost (stockouts)
-        lost_penalty = 0.1 * metrics.lost_opportunity
-
-        # robust adjustment: under adversarial distribution Q,
-        # expect lower revenue and higher costs
-        # approximate via worst-case contamination within ε-ball
-        worst_case_alpha = min(alpha + cfg.wasserstein_epsilon, 1.0)
-        robustness_penalty = cfg.wasserstein_epsilon * avg_margin * worst_case_alpha
-
-        total = profit - coi_penalty - volatility_penalty - position_penalty - lost_penalty - robustness_penalty
-
-        return total
-
-    def breakdown(self, quote: Quote, instruments: InstrumentSet,
-                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
-        cfg = self.cfg
-        alpha = hidden.contamination
-        margins = quote.prices - instruments.costs
-        avg_margin = float(np.mean(margins))
-
-        return {
-            'revenue': metrics.revenue,
-            'cost': metrics.cost,
-            'profit': metrics.revenue - metrics.cost,
-            'coi_penalty': -cfg.lambda_coi * avg_margin * alpha,
-            'volatility_penalty': -cfg.lambda_volatility * metrics.volatility,
-            'position_penalty': -cfg.gamma_inventory * metrics.position_cost,
-            'lost_penalty': -0.1 * metrics.lost_opportunity,
-            'robustness_penalty': -cfg.wasserstein_epsilon * avg_margin * min(alpha + cfg.wasserstein_epsilon, 1.0),
-            'contamination': alpha,
-            'avg_margin_pct': avg_margin / (float(np.mean(instruments.costs)) + EPS),
-        }
-
-class UXPenalty(BaseObjective):
-    """User experience penalty from price volatility.
-
-    High price volatility degrades UX for legitimate human users.
-    This term ensures the defense doesn't harm real customers while
-    protecting against agent reconnaissance.
-    """
-
-    def __init__(self, scale: float = 1.0, max_acceptable_volatility: float = 0.1):
-        self.scale = scale
-        self.max_vol = max_acceptable_volatility
-
-    def reward(self, quote: Quote, instruments: InstrumentSet,
-               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
-        # penalty increases quadratically beyond threshold
-        excess_vol = max(0, metrics.volatility - self.max_vol)
-        return -self.scale * (excess_vol ** 2)
-
-    def breakdown(self, quote: Quote, instruments: InstrumentSet,
-                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
-        return {
-            'ux_penalty': self.reward(quote, instruments, metrics, hidden, obs),
-            'volatility': metrics.volatility,
-        }
-
-class AdaptiveObjective(BaseObjective):
-    """Objective that adapts weights based on estimated contamination.
-
-    When contamination is low, focus on revenue maximization.
-    When contamination is high, increase COI defense weight.
-    """
-
-    def __init__(self, base_lambda_coi: float = 0.3, max_lambda_coi: float = 2.0,
-                 adaptation_rate: float = 2.0):
-        self.base_lambda = base_lambda_coi
-        self.max_lambda = max_lambda_coi
-        self.rate = adaptation_rate
-
-    def _adaptive_lambda(self, alpha: float) -> float:
-        # sigmoid scaling: λ(α) = base + (max-base) * sigmoid(rate*(α-0.5))
-        from ...outlet.math_util import sigmoid
-        scale = sigmoid(self.rate * (alpha - 0.3))
-        return self.base_lambda + (self.max_lambda - self.base_lambda) * scale
-
-    def reward(self, quote: Quote, instruments: InstrumentSet,
-               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
-        alpha = hidden.contamination
-        lambda_coi = self._adaptive_lambda(alpha)
-
-        profit = metrics.revenue - metrics.cost
-        margins = quote.prices - instruments.costs
-        coi_penalty = lambda_coi * float(np.mean(margins)) * alpha
-
-        return profit - coi_penalty
-
-    def breakdown(self, quote: Quote, instruments: InstrumentSet,
-                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
-        alpha = hidden.contamination
-        return {
-            'profit': metrics.revenue - metrics.cost,
-            'adaptive_lambda': self._adaptive_lambda(alpha),
-            'contamination': alpha,
-        }
-
-def make_thesis_objective(lambda_coi: float = 0.5, lambda_ux: float = 0.1,
-                          lambda_vol: float = 0.2) -> CompositeObjective:
-    """Create the standard thesis objective composition."""
-    return CompositeObjective([
-        (RobustStackelbergObjective(RobustObjectiveConfig(
-            lambda_coi=lambda_coi, lambda_ux=lambda_ux, lambda_volatility=lambda_vol)), 1.0),
-    ])
diff --git a/lab/case/thesis/platform.py b/lab/case/thesis/platform.py
deleted file mode 100644
index ec00da5..0000000
--- a/lab/case/thesis/platform.py
+++ /dev/null
@@ -1,176 +0,0 @@
-"""Thesis platform with real MDP behavioral models and separability scoring."""
-from __future__ import annotations
-from dataclasses import dataclass
-from pathlib import Path
-import numpy as np
-from ...outlet import (Platform, PlatformConfig, PositionModel, PositionConfig,
-                       PostedPriceMechanism, make_instruments, InstrumentType, LogLevel)
-from ...outlet.mechanisms.posted_price import PostedPriceConfig
-from ...outlet.observation import DefaultObservationBuilder, ObservationConfig
-from .arrivals import ContaminatedArrivalModel, ContaminatedArrivalConfig
-from .execution import HybridExecutionModel, HybridExecutionConfig
-from .objectives import RobustStackelbergObjective, RobustObjectiveConfig
-
-
-@dataclass
-class ThesisConfig:
-    # instruments
-    n_instruments: int = 10
-    cost_range: tuple[float, float] = (5.0, 50.0)
-    margin_range: tuple[float, float] = (0.2, 0.5)
-
-    # contamination (Section 3.1)
-    alpha_contamination: float = 0.2
-    alpha_drift: float = 0.0
-    alpha_bounds: tuple[float, float] = (0.0, 0.5)
-
-    # objectives (Eq 23)
-    lambda_coi: float = 0.5
-    lambda_ux: float = 0.1
-    lambda_volatility: float = 0.2
-    wasserstein_epsilon: float = 0.1
-
-    # arrivals
-    sessions_per_step: int = 30
-    human_views_range: tuple[int, int] = (1, 4)
-    agent_views_range: tuple[int, int] = (3, 10)
-
-    # inventory
-    initial_inventory: float = 100.0
-    holding_cost_rate: float = 0.002
-
-    # real behavioral models (from sim.rl)
-    use_real_behavior: bool = True
-    use_separability: bool = False  # disabled until classifier trained
-    human_data_dir: str = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data"
-    agent_data_dir: str = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data"
-
-    # simulation
-    max_steps: int = 500
-    seed: int | None = 24
-    log_level: LogLevel = LogLevel.AGG_ONLY
-
-
-def _resolve_data_dirs(cfg: ThesisConfig) -> tuple[str, str]:
-    """Resolve data directories for behavioral models."""
-    base = Path(__file__).parent.parent.parent.parent / "experiments"
-    human = cfg.human_data_dir or str(base / "collected_data")
-    agent = cfg.agent_data_dir or str(base / "agents/collected_data")
-    return human, agent
-
-
-def make_thesis_platform(cfg: ThesisConfig | None = None) -> Platform:
-    """Create platform with real MDP behavioral models.
-
-    Implements:
-    - Contaminated arrivals using learned MDP kernels from behavior_loader
-    - Hybrid execution with real separability scoring from lib.separability
-    - Robust Stackelberg objective (Eq 23)
-    """
-    cfg = cfg or ThesisConfig()
-    rng = np.random.default_rng(cfg.seed)
-    human_dir, agent_dir = _resolve_data_dirs(cfg)
-
-    instruments = make_instruments(
-        n=cfg.n_instruments, cost_range=cfg.cost_range, margin_range=cfg.margin_range,
-        inst_type=InstrumentType.SKU, rng=rng)
-    instruments.position = np.full(cfg.n_instruments, cfg.initial_inventory)
-
-    arrival = ContaminatedArrivalModel(ContaminatedArrivalConfig(
-        base_rate=cfg.sessions_per_step,
-        alpha_contamination=cfg.alpha_contamination,
-        alpha_drift=cfg.alpha_drift,
-        alpha_bounds=cfg.alpha_bounds,
-        human_views_range=cfg.human_views_range,
-        agent_views_range=cfg.agent_views_range,
-        use_real_behavior=cfg.use_real_behavior,
-        human_data_dir=human_dir,
-        agent_data_dir=agent_dir,
-    ))
-
-    execution = HybridExecutionModel(HybridExecutionConfig(
-        use_separability=cfg.use_separability,
-    ))
-
-    mechanism = PostedPriceMechanism(PostedPriceConfig(max_delta_pct=0.15, min_margin_pct=0.05))
-    position = PositionModel(PositionConfig(initial_position=cfg.initial_inventory, holding_cost_rate=cfg.holding_cost_rate))
-
-    market = None
-    objective = RobustStackelbergObjective(RobustObjectiveConfig(
-        lambda_coi=cfg.lambda_coi, lambda_ux=cfg.lambda_ux,
-        lambda_volatility=cfg.lambda_volatility, wasserstein_epsilon=cfg.wasserstein_epsilon))
-
-    obs_builder = DefaultObservationBuilder(ObservationConfig(mask_true_demand=True))
-    platform_cfg = PlatformConfig(n_instruments=cfg.n_instruments, max_steps=cfg.max_steps,
-                                   seed=cfg.seed, log_level=cfg.log_level, mask_demand=True)
-
-    return Platform(instruments=instruments, mechanism=mechanism, arrival=arrival, execution=execution,
-                    position=position, market=market, obs_builder=obs_builder, objective=objective, cfg=platform_cfg)
-
-
-@dataclass
-class AblationConfig(ThesisConfig):
-    disable_coi_penalty: bool = False
-    disable_ux_penalty: bool = False
-    disable_contamination: bool = False
-    disable_real_behavior: bool = False
-
-
-def make_ablation_platform(cfg: AblationConfig) -> Platform:
-    if cfg.disable_coi_penalty:
-        cfg.lambda_coi = 0.0
-    if cfg.disable_ux_penalty:
-        cfg.lambda_ux = 0.0
-    if cfg.disable_contamination:
-        cfg.alpha_contamination = 0.0
-    if cfg.disable_real_behavior:
-        cfg.use_real_behavior = False
-        cfg.use_separability = False
-    return make_thesis_platform(cfg)
-
-
-def sweep_contamination(alpha_values: list[float], base_cfg: ThesisConfig | None = None,
-                        n_steps: int = 100, seed: int = 42) -> dict[float, dict]:
-    """Test performance across contamination levels (Theorem 1 validation)."""
-    from ...experiments.eval import rollout, fixed_price_policy
-
-    results = {}
-    base_cfg = base_cfg or ThesisConfig()
-
-    for alpha in alpha_values:
-        cfg = ThesisConfig(**{k: v for k, v in base_cfg.__dict__.items() if k != 'alpha_contamination'},
-                          alpha_contamination=alpha)
-        platform = make_thesis_platform(cfg)
-        policy = fixed_price_policy(platform.instruments.refs)
-        result = rollout(platform, policy, n_steps, seed=seed)
-        results[alpha] = {
-            'total_reward': result.total_reward,
-            'total_pnl': result.total_pnl,
-            'avg_conversion': result.avg_conversion,
-            'final_contamination': platform._hidden.contamination,
-        }
-    return results
-
-
-def sweep_behavior_modes(base_cfg: ThesisConfig | None = None, n_steps: int = 100, seed: int = 42) -> dict[str, dict]:
-    """Compare real vs synthetic behavioral models."""
-    from ...experiments.eval import rollout, fixed_price_policy
-
-    base_cfg = base_cfg or ThesisConfig()
-    modes = {
-        'real_mdp': ThesisConfig(**{**base_cfg.__dict__, 'use_real_behavior': True, 'use_separability': True}),
-        'synthetic': ThesisConfig(**{**base_cfg.__dict__, 'use_real_behavior': False, 'use_separability': False}),
-        'real_mdp_no_sep': ThesisConfig(**{**base_cfg.__dict__, 'use_real_behavior': True, 'use_separability': False}),
-    }
-
-    results = {}
-    for name, cfg in modes.items():
-        platform = make_thesis_platform(cfg)
-        policy = fixed_price_policy(platform.instruments.refs)
-        result = rollout(platform, policy, n_steps, seed=seed)
-        results[name] = {
-            'total_reward': result.total_reward,
-            'total_pnl': result.total_pnl,
-            'avg_conversion': result.avg_conversion,
-        }
-    return results
diff --git a/lab/case/thesis/run_experiment.py b/lab/case/thesis/run_experiment.py
deleted file mode 100644
index 962db4f..0000000
--- a/lab/case/thesis/run_experiment.py
+++ /dev/null
@@ -1,136 +0,0 @@
-#!/usr/bin/env python
-"""Thesis simulation experiments with real MDP behavioral models."""
-from __future__ import annotations
-import sys
-from pathlib import Path
-
-if __name__ == '__main__':
-    sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
-
-from lab.case.thesis.platform import make_thesis_platform, ThesisConfig
-from lab.case.thesis.metrics import compute_coi, compute_separability
-from lab.experiments.eval import compare_policies
-import numpy as np
-
-
-def demo_basic_simulation():
-    print("=" * 70)
-    print("THESIS SIMULATION: Contaminated Dynamic Pricing (Real MDP Kernels)")
-    print("=" * 70)
-
-    cfg = ThesisConfig(n_instruments=5, alpha_contamination=0.3, lambda_coi=0.5,
-                       max_steps=100, seed=42, use_real_behavior=True)
-    platform = make_thesis_platform(cfg)
-
-    print(f"\nInstruments: {platform.instruments.n}")
-    print(f"Reference prices: {platform.instruments.refs.round(2)}")
-    print(f"Costs: {platform.instruments.costs.round(2)}")
-    print(f"Initial contamination alpha={cfg.alpha_contamination}")
-    print(f"Using real behavior: {cfg.use_real_behavior}")
-
-    result = platform.reset(seed=42)
-    total_reward, coi_history = 0, []
-
-    print(f"\n{'Step':>5} {'Reward':>10} {'PnL':>10} {'COI':>8} {'alpha':>6} {'Conv':>8}")
-    print("-" * 55)
-
-    for t in range(cfg.max_steps):
-        action = platform.instruments.refs * np.random.uniform(0.95, 1.15, size=platform.instruments.n)
-        result = platform.step(action)
-        total_reward += result.reward
-        coi = compute_coi(platform._quote, platform.instruments, result.metrics, result.hidden.contamination)
-        coi_history.append(coi.coi_level)
-
-        if t % 20 == 0:
-            print(f"{t:5d} {result.reward:10.2f} {result.metrics.pnl:10.2f} "
-                  f"{coi.coi_level:8.2f} {result.hidden.contamination:6.2f} {result.metrics.conversion:8.3f}")
-
-    print("-" * 55)
-    print(f"Total Reward: {total_reward:.2f}")
-    print(f"Average COI: {np.mean(coi_history):.2f}")
-    print(f"COI Trend: {coi_history[-1] - coi_history[0]:+.2f}")
-
-
-def demo_contamination_sweep():
-    print("\n" + "=" * 70)
-    print("EXPERIMENT: COI Erosion vs Contamination (Theorem 1)")
-    print("=" * 70)
-
-    from lab.case.thesis.platform import sweep_contamination
-    trials = 20
-    alpha_values = [i/trials for i in range(trials)]
-    results = sweep_contamination(alpha_values, n_steps=100, seed=42)
-
-    print(f"\n{'alpha':>6} {'Reward':>12} {'PnL':>12} {'Conv':>10}")
-    print("-" * 45)
-    for alpha, m in sorted(results.items()):
-        print(f"{alpha:6.2f} {m['total_reward']:12.2f} {m['total_pnl']:12.2f} {m['avg_conversion']:10.3f}")
-
-    rewards = [results[a]['total_reward'] for a in sorted(results.keys())]
-    dataset = np.array([[a, r] for a, r in zip(alpha_values, rewards)])
-    trend = np.corrcoef(dataset[:, 0], dataset[:, 1])[0, 1]
-    print(f"Trend (alpha~reward correlation): {trend:.3f}")
-
-
-def demo_policy_comparison():
-    print("\n" + "=" * 70)
-    print("EXPERIMENT: Policy Comparison under Contamination")
-    print("=" * 70)
-
-    cfg = ThesisConfig(n_instruments=5, alpha_contamination=0.25, max_steps=100, seed=42)
-    platform = make_thesis_platform(cfg)
-
-    def fixed_policy(obs, t): return platform.instruments.refs.copy(), 1.0
-    def aggressive_policy(obs, t): return platform.instruments.refs * 1.3, 1.0
-    def conservative_policy(obs, t): return platform.instruments.refs * 1.05, 1.0
-    def adaptive_policy(obs, t):
-        fills = obs[platform.instruments.n:2*platform.instruments.n]
-        exp = obs[2*platform.instruments.n:3*platform.instruments.n]
-        conv = np.sum(fills) / (np.sum(exp) + 1e-8)
-        return platform.instruments.refs * (1.0 + 0.2 * conv), 1.0
-
-    policies = {'fixed': fixed_policy, 'aggressive': aggressive_policy,
-                'conservative': conservative_policy, 'adaptive': adaptive_policy}
-    results = compare_policies(platform, policies, n_steps=100, n_runs=3, seed=42)
-
-    print(f"\n{'Policy':>15} {'Reward':>12} {'Std':>10} {'PnL':>12} {'Conv':>10}")
-    print("-" * 65)
-    for name, r in sorted(results.items(), key=lambda x: -x[1]['mean_reward']):
-        print(f"{name:>15} {r['mean_reward']:12.2f} {r['std_reward']:10.2f} "
-              f"{r['mean_pnl']:12.2f} {r['mean_conversion']:10.3f}")
-
-
-def demo_session_analysis():
-    """Analyze session-level behavior from MDP trajectories."""
-    print("\n" + "=" * 70)
-    print("EXPERIMENT: Session Analysis (Ground Truth)")
-    print("=" * 70)
-
-    from lab.outlet.constants import LogLevel
-    cfg = ThesisConfig(n_instruments=5, alpha_contamination=0.3, max_steps=50,
-                       log_level=LogLevel.FULL, seed=42, use_real_behavior=True)
-    platform = make_thesis_platform(cfg)
-
-    result = platform.reset(seed=42)
-    human_sessions, agent_sessions = 0, 0
-
-    for t in range(cfg.max_steps):
-        action = platform.instruments.refs * 1.1
-        result = platform.step(action)
-        sep = compute_separability(result.logs, result.hidden.contamination)
-        human_sessions += sep.n_human_sessions
-        agent_sessions += sep.n_agent_sessions
-
-    total = human_sessions + agent_sessions
-    print(f"\nTotal sessions: {total}")
-    print(f"Human sessions: {human_sessions} ({100*human_sessions/total:.1f}%)")
-    print(f"Agent sessions: {agent_sessions} ({100*agent_sessions/total:.1f}%)")
-    print(f"True contamination: {cfg.alpha_contamination:.1%}")
-    print(f"Observed contamination: {agent_sessions/total:.1%}")
-
-
-if __name__ == '__main__':
-    demo_basic_simulation()
-    demo_contamination_sweep()
-    # demo_policy_comparison()
-    # demo_session_analysis()
diff --git a/lab/config.py b/lab/config.py
deleted file mode 100644
index 441085d..0000000
--- a/lab/config.py
+++ /dev/null
@@ -1,156 +0,0 @@
-"""
-Configuration and factory functions for creating pre-configured platforms.
-
-This module provides:
-- RetailConfig, MarketMakingConfig: Configuration dataclasses
-- make_retail_platform: Factory for retail dynamic pricing scenarios
-- make_market_making_platform: Factory for market making scenarios
-
-Example:
-    >>> from lab.config import make_retail_platform
-    >>> platform = make_retail_platform(RetailConfig(n_instruments=5))
-    >>> result = platform.reset(seed=42)
-"""
-from __future__ import annotations
-from dataclasses import dataclass
-import numpy as np
-from .outlet import (Platform, PlatformConfig, PositionModel, PositionConfig,
-                     PostedPriceMechanism, TwoSidedMechanism, make_instruments,
-                     InstrumentType, LogLevel)
-from .outlet.mechanisms.posted_price import PostedPriceConfig
-from .outlet.mechanisms.two_sided import TwoSidedConfig
-from .population import (SessionArrivalModel, PoissonArrivalModel, HawkesArrivalModel,
-                         ElasticityExecutionModel, IntensityExecutionModel,
-                         ReactiveCompetitorModel, GBMMarketModel)
-from .population.arrivals import SessionArrivalConfig, PoissonArrivalConfig, HawkesArrivalConfig
-from .population.execution import ElasticityConfig, IntensityConfig
-from .population.competitors import ReactiveCompetitorConfig, GBMMarketConfig
-from .outlet.objectives.factory import retail_objective, market_making_objective
-
-@dataclass
-class RetailConfig:
-    """Configuration for retail dynamic pricing scenario.
-
-    Attributes:
-        n_instruments: Number of products to price
-        cost_range: (min, max) for random product costs
-        margin_range: (min, max) for random initial margins
-        initial_inventory: Starting inventory per product
-        holding_cost_rate: Cost per unit per step for holding
-        sessions_per_step: Number of browsing sessions per step
-        contamination: Fraction of sessions that are scrapers
-        max_steps: Maximum episode length
-        seed: Random seed for reproducibility
-    """
-    n_instruments: int = 10
-    cost_range: tuple[float, float] = (5.0, 50.0)
-    margin_range: tuple[float, float] = (0.2, 0.5)
-    initial_inventory: float = 100.0
-    holding_cost_rate: float = 0.002
-    sessions_per_step: int = 30
-    contamination: float = 0.1
-    max_steps: int = 500
-    seed: int | None = None
-
-def make_retail_platform(cfg: RetailConfig | None = None) -> Platform:
-    """Create a pre-configured retail dynamic pricing platform.
-
-    Components:
-    - Mechanism: PostedPriceMechanism (single price per product)
-    - Arrivals: SessionArrivalModel (browsing sessions with views)
-    - Execution: ElasticityExecutionModel (price sensitivity)
-    - Market: ReactiveCompetitorModel (can trigger price wars)
-    - Objective: PnL - holding_cost - volatility - lost_opportunity
-
-    Args:
-        cfg: Configuration (uses defaults if None)
-
-    Returns:
-        Configured Platform instance
-    """
-    cfg = cfg or RetailConfig()
-    rng = np.random.default_rng(cfg.seed)
-
-    instruments = make_instruments(cfg.n_instruments, cfg.cost_range, cfg.margin_range,
-                                   InstrumentType.SKU, rng)
-    instruments.position = np.full(cfg.n_instruments, cfg.initial_inventory)
-
-    mechanism = PostedPriceMechanism(PostedPriceConfig())
-    arrival = SessionArrivalModel(SessionArrivalConfig(
-        sessions_per_step=cfg.sessions_per_step, contamination=cfg.contamination))
-    execution = ElasticityExecutionModel(ElasticityConfig())
-    position = PositionModel(PositionConfig(
-        initial_position=cfg.initial_inventory,
-        holding_cost_rate=cfg.holding_cost_rate))
-    market = ReactiveCompetitorModel(ReactiveCompetitorConfig(), refs=instruments.refs)
-    objective = retail_objective()
-
-    return Platform(
-        instruments=instruments, mechanism=mechanism, arrival=arrival,
-        execution=execution, position=position, market=market, objective=objective,
-        cfg=PlatformConfig(n_instruments=cfg.n_instruments, max_steps=cfg.max_steps,
-                           seed=cfg.seed, log_level=LogLevel.AGG_ONLY)
-    )
-
-@dataclass
-class MarketMakingConfig:
-    """Configuration for market making scenario.
-
-    Attributes:
-        n_instruments: Number of assets to quote
-        initial_mid: Initial mid-price for assets
-        mu: Price drift (expected return)
-        sigma: Price volatility
-        gamma: Inventory risk aversion parameter
-        base_arrival_rate: Order arrival rate (Hawkes baseline)
-        max_steps: Maximum episode length
-        seed: Random seed for reproducibility
-    """
-    n_instruments: int = 5
-    initial_mid: float = 100.0
-    mu: float = 0.0
-    sigma: float = 0.02
-    gamma: float = 0.1
-    base_arrival_rate: float = 20.0
-    max_steps: int = 1000
-    seed: int | None = None
-
-def make_market_making_platform(cfg: MarketMakingConfig | None = None) -> Platform:
-    """Create a pre-configured market making platform.
-
-    Components:
-    - Mechanism: TwoSidedMechanism (bid-ask spread quoting)
-    - Arrivals: HawkesArrivalModel (clustered order flow)
-    - Execution: IntensityExecutionModel (distance-based fills)
-    - Market: GBMMarketModel (geometric Brownian motion mid-prices)
-    - Objective: PnL + spread_capture - inventory_risk
-
-    Args:
-        cfg: Configuration (uses defaults if None)
-
-    Returns:
-        Configured Platform instance
-    """
-    cfg = cfg or MarketMakingConfig()
-    rng = np.random.default_rng(cfg.seed)
-
-    instruments = make_instruments(cfg.n_instruments, (cfg.initial_mid*0.9, cfg.initial_mid*1.1),
-                                   (0.0, 0.0), InstrumentType.ASSET, rng)
-    instruments.position = np.zeros(cfg.n_instruments)
-
-    mechanism = TwoSidedMechanism(TwoSidedConfig())
-    arrival = HawkesArrivalModel(HawkesArrivalConfig(base_rate=cfg.base_arrival_rate))
-    execution = IntensityExecutionModel(IntensityConfig())
-    position = PositionModel(PositionConfig(
-        initial_position=0.0, min_position=-500, max_position=500,
-        holding_cost_rate=0.0))  # use inventory risk penalty instead
-    market = GBMMarketModel(GBMMarketConfig(mu=cfg.mu, sigma=cfg.sigma),
-                            initial=instruments.refs)
-    objective = market_making_objective(gamma=cfg.gamma, sigma=cfg.sigma)
-
-    return Platform(
-        instruments=instruments, mechanism=mechanism, arrival=arrival,
-        execution=execution, position=position, market=market, objective=objective,
-        cfg=PlatformConfig(n_instruments=cfg.n_instruments, max_steps=cfg.max_steps,
-                           seed=cfg.seed, log_level=LogLevel.AGG_ONLY)
-    )
diff --git a/lab/docs/Makefile b/lab/docs/Makefile
deleted file mode 100644
index fe8e88c..0000000
--- a/lab/docs/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-SPHINXOPTS    ?=
-SPHINXBUILD   ?= sphinx-build
-SOURCEDIR     = .
-BUILDDIR      = _build
-
-help:
-	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-%: Makefile
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/lab/docs/conf.py b/lab/docs/conf.py
deleted file mode 100644
index 0e39351..0000000
--- a/lab/docs/conf.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import os
-import sys
-sys.path.insert(0, os.path.abspath('../..'))
-
-project = 'Quote-Control Simulator'
-copyright = '2025, PHANTOM Research'
-author = 'PHANTOM Research'
-release = '0.1.0'
-
-extensions = [
-    'sphinx.ext.autodoc',
-    'sphinx.ext.napoleon',
-    'sphinx.ext.viewcode',
-    'sphinx.ext.intersphinx',
-    'sphinx.ext.autosummary',
-]
-
-templates_path = ['_templates']
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
-
-html_theme = 'alabaster'
-html_static_path = ['_static']
-
-autodoc_default_options = {
-    'members': True,
-    'undoc-members': True,
-    'show-inheritance': True,
-}
-
-napoleon_google_docstring = True
-napoleon_numpy_docstring = True
-napoleon_include_init_with_doc = True
-
-intersphinx_mapping = {
-    'python': ('https://docs.python.org/3', None),
-    'numpy': ('https://numpy.org/doc/stable/', None),
-}
-
-autosummary_generate = True
diff --git a/lab/docs/index.rst b/lab/docs/index.rst
deleted file mode 100644
index bd36ecd..0000000
--- a/lab/docs/index.rst
+++ /dev/null
@@ -1,40 +0,0 @@
-Quote-Control Simulator
-=======================
-
-Research-grade platform for dynamic pricing and market making experiments.
-
-The platform abstracts pricing as: **Quote → Arrival → Execution → Position**
-
-Supports multiple mechanisms:
-
-* **PostedPrice**: retail dynamic pricing
-* **TwoSided**: market making with bid-ask spreads
-* **Auction**: reserve/shading for auction settings
-
-Quick Start
------------
-
-.. code-block:: python
-
-   from lab.config import make_retail_platform
-   from lab.experiments import rollout, fixed_price_policy
-
-   platform = make_retail_platform()
-   policy = fixed_price_policy(platform.instruments.refs)
-   result = rollout(platform, policy, n_steps=100)
-   print(f"Total PnL: {result.total_pnl:.2f}")
-
-.. toctree::
-   :maxdepth: 2
-   :caption: Contents:
-
-   system_overview
-   modules/outlet
-   modules/population
-   modules/experiments
-
-Indices
--------
-
-* :ref:`genindex`
-* :ref:`modindex`
diff --git a/lab/docs/modules/experiments.rst b/lab/docs/modules/experiments.rst
deleted file mode 100644
index c71ee36..0000000
--- a/lab/docs/modules/experiments.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-Experiments
-===========
-
-Evaluation & OPE
-----------------
-
-.. automodule:: lab.experiments.eval
-   :members:
-
-Configuration
--------------
-
-.. automodule:: lab.config
-   :members:
diff --git a/lab/docs/modules/outlet.rst b/lab/docs/modules/outlet.rst
deleted file mode 100644
index 9f3b8c3..0000000
--- a/lab/docs/modules/outlet.rst
+++ /dev/null
@@ -1,77 +0,0 @@
-Outlet (Core Simulator)
-=======================
-
-Types
------
-
-.. automodule:: lab.outlet.types
-   :members:
-
-Constants
----------
-
-.. automodule:: lab.outlet.constants
-   :members:
-
-Protocols
----------
-
-.. automodule:: lab.outlet.protocols
-   :members:
-
-Platform
---------
-
-.. automodule:: lab.outlet.platform
-   :members:
-
-Stock & Position
-----------------
-
-.. automodule:: lab.outlet.stock
-   :members:
-
-Observation
------------
-
-.. automodule:: lab.outlet.observation
-   :members:
-
-Mechanisms
-----------
-
-Posted Price
-~~~~~~~~~~~~
-
-.. automodule:: lab.outlet.mechanisms.posted_price
-   :members:
-
-Two-Sided (Market Making)
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. automodule:: lab.outlet.mechanisms.two_sided
-   :members:
-
-Auction
-~~~~~~~
-
-.. automodule:: lab.outlet.mechanisms.auction
-   :members:
-
-Objectives
-----------
-
-.. automodule:: lab.outlet.objectives.base
-   :members:
-
-.. automodule:: lab.outlet.objectives.penalties
-   :members:
-
-.. automodule:: lab.outlet.objectives.factory
-   :members:
-
-Math Utilities
---------------
-
-.. automodule:: lab.outlet.math_util
-   :members:
diff --git a/lab/docs/modules/population.rst b/lab/docs/modules/population.rst
deleted file mode 100644
index 0b7ef75..0000000
--- a/lab/docs/modules/population.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-Population Models
-=================
-
-Arrival Models
---------------
-
-.. automodule:: lab.population.arrivals
-   :members:
-
-Execution Models
-----------------
-
-.. automodule:: lab.population.execution
-   :members:
-
-Competitor / Market Models
---------------------------
-
-.. automodule:: lab.population.competitors
-   :members:
diff --git a/lab/docs/system_overview.rst b/lab/docs/system_overview.rst
deleted file mode 100644
index 3fda8ad..0000000
--- a/lab/docs/system_overview.rst
+++ /dev/null
@@ -1,97 +0,0 @@
-System Overview
-===============
-
-The simulator organises dynamic pricing and market-making experiments as a
-closed loop with the following stages:
-
-* **Quote** – a policy or agent emits a :class:`lab.outlet.types.Quote`. The
-  quote is normalised and validated by a concrete
-  :class:`lab.outlet.protocols.Mechanism` implementation
-  (posted-price, two-sided, auction).
-* **Arrival** – a :class:`lab.outlet.protocols.ArrivalModel` samples a stream of
-  :class:`lab.outlet.types.Opportunity` objects given the current time,
-  instrument catalogue, and market state.
-* **Execution** – the :class:`lab.outlet.protocols.ExecutionModel` converts an
-  opportunity into a probabilistic fill using the active quote, optional
-  competitor prices, and demand-side context.
-* **Position** – a :class:`lab.outlet.protocols.PositionModel` enforces
-  inventory or position constraints, censors oversized fills, and accrues
-  holding and shortage costs.
-* **Observation & Reward** – the
-  :class:`lab.outlet.protocols.ObservationBuilder` constructs the censored view
-  exposed to the agent, while a :class:`lab.outlet.protocols.Objective`
-  transforms :class:`lab.outlet.types.StepMetrics` into a scalar reward with an
-  optional breakdown per term.
-
-These components are orchestrated by :class:`lab.outlet.platform.Platform`,
-which manages internal hidden state, deterministic seeding, and logging.
-
-Component Matrix
-----------------
-
-===============================  ==============================================
-Layer                            Responsibilities / Examples
-===============================  ==============================================
-Mechanisms                       Quote normalisation, execution semantics
-                                 (`posted_price`, `two_sided`, `auction`).
-Population models                Arrivals (:mod:`lab.population.arrivals`),
-                                 execution probability models
-                                 (:mod:`lab.population.execution`), and
-                                 competitor or market dynamics
-                                 (:mod:`lab.population.competitors`).
-Position management              Inventory limits, replenishment, holding and
-                                 shortage costs (:mod:`lab.outlet.stock`).
-Observation & logging            Censored observations and optional event logs
-                                 (:mod:`lab.outlet.observation`).
-Objectives                       Reward composition utilities
-                                 (:mod:`lab.outlet.objectives`).
-Experiments                      Rollout helpers, baseline policies, off-policy
-                                 evaluation (:mod:`lab.experiments.eval`).
-===============================  ==============================================
-
-Preconfigured Platforms
------------------------
-
-Two high-level factories in :mod:`lab.config` wire common combinations of the
-building blocks:
-
-* **Retail dynamic pricing** – posted-price mechanism, session arrivals with
-  contamination, elasticity-based executions, reactive competitor model, and a
-  composite objective that penalises volatility, holding costs, and lost
-  opportunities.
-* **Market making** – two-sided quoting, Hawkes order flow, intensity-based
-  executions, geometric Brownian motion mid-prices, and an objective combining
-  PnL, spread capture, and quadratic inventory risk.
-
-State & Reset Behaviour
------------------------
-
-When you call :meth:`lab.outlet.platform.Platform.reset`, the platform resets
-instrument positions, quotes, and hidden state, but component implementations
-may maintain their own internal buffers. For reproducible experiments:
-
-* Reuse freshly instantiated arrival/market models per episode, or add explicit
-  ``reset`` methods if the model keeps history (for example,
-  :class:`lab.population.arrivals.HawkesArrivalModel` maintains an event
-  history, while :class:`lab.population.competitors.ReactiveCompetitorModel`
-  tracks prior competitor quotes).
-* Seed randomness through the factory configuration (``RetailConfig.seed`` or
-  ``MarketMakingConfig.seed``) or pass a seed to ``Platform.reset`` for
-  deterministic rollouts.
-
-Extending the Platform
-----------------------
-
-To support a new domain:
-
-1. Create custom Mechanism/Arrival/Execution/Market/Observation components by
-   implementing the respective protocol in :mod:`lab.outlet.protocols`.
-2. Compose a new objective with
-   :func:`lab.outlet.objectives.factory.make_composite` or write a bespoke
-   :class:`lab.outlet.objectives.base.BaseObjective`.
-3. Wire everything together via :class:`lab.outlet.platform.Platform` directly
-   or expose a helper factory in :mod:`lab.config`.
-
-Use :func:`lab.experiments.rollout` and
-:func:`lab.experiments.compare_policies` to benchmark candidate policies under
-multiple random seeds, collecting per-step logs for analysis or OPE.
diff --git a/lab/experiments/__init__.py b/lab/experiments/__init__.py
deleted file mode 100644
index ac427f3..0000000
--- a/lab/experiments/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from .eval import (rollout, RolloutResult, compare_policies, compute_ips, OPEResult,
-                   fixed_price_policy, cost_plus_margin_policy, random_walk_policy, epsilon_greedy_policy)
-
-__all__ = [
-    'rollout', 'RolloutResult', 'compare_policies', 'compute_ips', 'OPEResult',
-    'fixed_price_policy', 'cost_plus_margin_policy', 'random_walk_policy', 'epsilon_greedy_policy',
-]
diff --git a/lab/experiments/eval.py b/lab/experiments/eval.py
deleted file mode 100644
index 8bc9330..0000000
--- a/lab/experiments/eval.py
+++ /dev/null
@@ -1,213 +0,0 @@
-"""
-Evaluation utilities for policy testing and off-policy evaluation.
-
-This module provides:
-- rollout: Run a policy on the platform for multiple steps
-- compare_policies: Compare multiple policies with statistics
-- Baseline policies: fixed_price, cost_plus_margin, random_walk, epsilon_greedy
-- OPE estimators: IPS and SNIPS for off-policy evaluation
-
-Example:
-    >>> from lab.config import make_retail_platform
-    >>> from lab.experiments.eval import rollout, fixed_price_policy
-    >>> platform = make_retail_platform()
-    >>> policy = fixed_price_policy(platform.instruments.refs)
-    >>> result = rollout(platform, policy, n_steps=100)
-    >>> print(f"Total PnL: {result.total_pnl:.2f}")
-"""
-from __future__ import annotations
-from dataclasses import dataclass
-from typing import Callable, Any
-import numpy as np
-from ..outlet.platform import Platform
-from ..outlet.types import StepResult, StepLogs, Quote
-
-# Policy signature: takes (observation_flat, timestep) -> (action_prices, propensity)
-Policy = Callable[[np.ndarray, int], tuple[np.ndarray, float]]
-
-@dataclass
-class RolloutResult:
-    """Results from a policy rollout.
-
-    Attributes:
-        rewards: Per-step rewards
-        metrics: Per-step StepMetrics objects
-        logs: Per-step StepLogs objects
-        total_reward: Sum of rewards
-        total_pnl: Sum of PnL from metrics
-        avg_conversion: Average conversion rate
-    """
-    rewards: list[float]
-    metrics: list[Any]
-    logs: list[StepLogs]
-    total_reward: float
-    total_pnl: float
-    avg_conversion: float
-
-def rollout(platform: Platform, policy: Policy, n_steps: int, seed: int | None = None) -> RolloutResult:
-    """Execute a policy on the platform for n_steps.
-
-    Args:
-        platform: The simulation platform
-        policy: Function (obs, t) -> (action, propensity)
-        n_steps: Number of steps to run
-        seed: Random seed for reproducibility
-
-    Returns:
-        RolloutResult with rewards, metrics, and summary statistics
-    """
-    result = platform.reset(seed)
-    rewards, metrics, logs = [], [], []
-
-    for t in range(n_steps):
-        obs_flat = result.obs.to_flat()
-        action, propensity = policy(obs_flat, t)
-        result = platform.step(action, propensity)
-        rewards.append(result.reward)
-        metrics.append(result.metrics)
-        logs.append(result.logs)
-        if result.terminated or result.truncated:
-            break
-
-    return RolloutResult(
-        rewards=rewards, metrics=metrics, logs=logs,
-        total_reward=sum(rewards),
-        total_pnl=sum(m.pnl for m in metrics),
-        avg_conversion=np.mean([m.conversion for m in metrics])
-    )
-
-# Baseline policies for comparison
-
-def fixed_price_policy(refs: np.ndarray) -> Policy:
-    """Policy that always quotes at reference prices."""
-    def policy(obs: np.ndarray, t: int) -> tuple[np.ndarray, float]:
-        return refs.copy(), 1.0
-    return policy
-
-def cost_plus_margin_policy(costs: np.ndarray, margin: float = 0.3) -> Policy:
-    """Policy that quotes at cost * (1 + margin)."""
-    prices = costs * (1 + margin)
-    def policy(obs: np.ndarray, t: int) -> tuple[np.ndarray, float]:
-        return prices.copy(), 1.0
-    return policy
-
-def random_walk_policy(refs: np.ndarray, volatility: float = 0.05,
-                       rng: np.random.Generator | None = None) -> Policy:
-    """Policy that performs a random walk around reference prices."""
-    rng = rng or np.random.default_rng()
-    prices = refs.copy()
-    def policy(obs: np.ndarray, t: int) -> tuple[np.ndarray, float]:
-        nonlocal prices
-        delta = rng.normal(0, volatility, len(prices))
-        prices = prices * (1 + delta)
-        prices = np.clip(prices, refs * 0.5, refs * 2.0)
-        return prices.copy(), 1.0
-    return policy
-
-def epsilon_greedy_policy(base_policy: Policy, refs: np.ndarray,
-                          epsilon: float = 0.1, rng: np.random.Generator | None = None) -> Policy:
-    """Wrap a policy with epsilon-greedy exploration."""
-    rng = rng or np.random.default_rng()
-    def policy(obs: np.ndarray, t: int) -> tuple[np.ndarray, float]:
-        if rng.random() < epsilon:
-            action = refs * rng.uniform(0.8, 1.2, len(refs))
-            return action, epsilon / len(refs)
-        else:
-            action, _ = base_policy(obs, t)
-            return action, 1 - epsilon
-    return policy
-
-# Off-Policy Evaluation (OPE)
-
-@dataclass
-class OPEResult:
-    """Results from off-policy evaluation.
-
-    Attributes:
-        ips_estimate: Inverse Propensity Scoring estimate
-        snips_estimate: Self-normalized IPS estimate (more stable)
-        n_samples: Number of samples used
-        effective_samples: Effective sample size (accounts for variance)
-    """
-    ips_estimate: float
-    snips_estimate: float
-    n_samples: int
-    effective_samples: float
-
-def compute_ips(logs: list[StepLogs], rewards: list[float],
-                target_policy: Policy, behavior_propensities: list[float] | None = None) -> OPEResult:
-    """Compute IPS and SNIPS estimators for off-policy evaluation.
-
-    Uses logged propensities to estimate expected reward under a target
-    policy from data collected under a behavior policy.
-
-    Args:
-        logs: Step logs containing propensities
-        rewards: Observed rewards from behavior policy
-        target_policy: Policy to evaluate (not currently used, assumes deterministic)
-        behavior_propensities: Override propensities if not in logs
-
-    Returns:
-        OPEResult with IPS, SNIPS estimates and sample statistics
-    """
-    if behavior_propensities is None:
-        # extract from logs
-        behavior_propensities = []
-        for log in logs:
-            if log.executions:
-                avg_prop = np.mean([e.propensity for e in log.executions])
-            else:
-                avg_prop = 1.0
-            behavior_propensities.append(avg_prop)
-
-    # compute importance weights
-    weights = []
-    for i, (log, bp) in enumerate(zip(logs, behavior_propensities)):
-        # target propensity would need obs reconstruction - simplified here
-        tp = 1.0  # assume deterministic target
-        w = tp / (bp + 1e-8)
-        weights.append(w)
-
-    weights = np.array(weights)
-    rewards = np.array(rewards)
-
-    # IPS estimate
-    ips = np.sum(weights * rewards) / len(rewards)
-
-    # SNIPS (self-normalized)
-    snips = np.sum(weights * rewards) / (np.sum(weights) + 1e-8)
-
-    # effective sample size
-    ess = (np.sum(weights) ** 2) / (np.sum(weights ** 2) + 1e-8)
-
-    return OPEResult(ips_estimate=ips, snips_estimate=snips,
-                     n_samples=len(rewards), effective_samples=ess)
-
-def compare_policies(platform: Platform, policies: dict[str, Policy],
-                     n_steps: int = 100, n_runs: int = 5, seed: int = 42) -> dict[str, dict]:
-    """Compare multiple policies with statistical summary.
-
-    Args:
-        platform: Simulation platform
-        policies: Dict mapping policy names to policy functions
-        n_steps: Steps per rollout
-        n_runs: Number of rollouts per policy (different seeds)
-        seed: Base random seed
-
-    Returns:
-        Dict mapping policy names to result dicts with mean/std statistics
-    """
-    results = {}
-    for name, policy in policies.items():
-        run_results = []
-        for i in range(n_runs):
-            r = rollout(platform, policy, n_steps, seed=seed + i)
-            run_results.append(r)
-
-        results[name] = {
-            'mean_reward': np.mean([r.total_reward for r in run_results]),
-            'std_reward': np.std([r.total_reward for r in run_results]),
-            'mean_pnl': np.mean([r.total_pnl for r in run_results]),
-            'mean_conversion': np.mean([r.avg_conversion for r in run_results]),
-        }
-    return results
diff --git a/lab/outlet/__init__.py b/lab/outlet/__init__.py
deleted file mode 100644
index 11a8d76..0000000
--- a/lab/outlet/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-from .constants import Side, MechanismType, InstrumentType, OpportunityType, EventType, LogLevel
-from .types import (Instrument, InstrumentSet, Quote, Opportunity, Execution,
-                    StepEvent, StepLogs, StepMetrics, MarketState, HiddenState, Observation, StepResult)
-from .stock import PositionModel, PositionConfig, make_instruments
-from .platform import Platform, PlatformConfig
-from .observation import DefaultObservationBuilder, ObservationConfig
-from .mechanisms import PostedPriceMechanism, TwoSidedMechanism, AuctionMechanism
-
-__all__ = [
-    'Side', 'MechanismType', 'InstrumentType', 'OpportunityType', 'EventType', 'LogLevel',
-    'Instrument', 'InstrumentSet', 'Quote', 'Opportunity', 'Execution',
-    'StepEvent', 'StepLogs', 'StepMetrics', 'MarketState', 'HiddenState', 'Observation', 'StepResult',
-    'PositionModel', 'PositionConfig', 'make_instruments',
-    'Platform', 'PlatformConfig',
-    'DefaultObservationBuilder', 'ObservationConfig',
-    'PostedPriceMechanism', 'TwoSidedMechanism', 'AuctionMechanism',
-]
diff --git a/lab/outlet/constants.py b/lab/outlet/constants.py
deleted file mode 100644
index 27c7da2..0000000
--- a/lab/outlet/constants.py
+++ /dev/null
@@ -1,83 +0,0 @@
-"""
-Constants and enumerations for the Quote-Control simulator.
-
-This module defines the core enums used throughout the platform to ensure
-type safety and consistent semantics across different pricing mechanisms.
-"""
-from enum import Enum, auto
-
-class Side(Enum):
-    """Transaction side indicator.
-
-    Attributes:
-        BUY: Buyer-initiated transaction (customer purchases, market buy order)
-        SELL: Seller-initiated transaction (market sell order, short sale)
-    """
-    BUY = auto()
-    SELL = auto()
-
-class MechanismType(Enum):
-    """Pricing mechanism type defining how quotes translate to executions.
-
-    Attributes:
-        POSTED_PRICE: Single posted price per instrument (retail dynamic pricing)
-        TWO_SIDED_QUOTE: Bid-ask spread quoting (market making, liquidity provision)
-        AUCTION: Reserve price or bid shading (ad auctions, marketplaces)
-    """
-    POSTED_PRICE = auto()
-    TWO_SIDED_QUOTE = auto()
-    AUCTION = auto()
-
-class InstrumentType(Enum):
-    """Type of instrument being priced.
-
-    Attributes:
-        SKU: Retail product with inventory constraints
-        ASSET: Financial instrument with position limits
-        LOAN: Credit product with interest rate pricing
-        SUBSCRIPTION: Recurring service with periodic fees
-    """
-    SKU = auto()
-    ASSET = auto()
-    LOAN = auto()
-    SUBSCRIPTION = auto()
-
-class OpportunityType(Enum):
-    """Type of arrival opportunity.
-
-    Attributes:
-        SESSION: Retail browsing session with potential purchase intent
-        MARKET_ORDER: Financial market order arrival (buy or sell)
-        REQUEST: Service or credit request requiring quote response
-    """
-    SESSION = auto()
-    MARKET_ORDER = auto()
-    REQUEST = auto()
-
-class EventType(Enum):
-    """Type of logged event during simulation.
-
-    Attributes:
-        ARRIVAL: New opportunity arrived in the system
-        EXPOSURE: Quote was shown to an arrival
-        EXECUTION: Transaction was executed
-        ABANDON: Opportunity abandoned without execution
-        CANCEL: Pending order was cancelled
-    """
-    ARRIVAL = auto()
-    EXPOSURE = auto()
-    EXECUTION = auto()
-    ABANDON = auto()
-    CANCEL = auto()
-
-class LogLevel(Enum):
-    """Verbosity level for step logging.
-
-    Attributes:
-        NONE: No logging, fastest execution
-        AGG_ONLY: Only aggregate statistics per step
-        FULL: Full event-level logging with propensities for OPE
-    """
-    NONE = auto()
-    AGG_ONLY = auto()
-    FULL = auto()
diff --git a/lab/outlet/gym_wrapper.py b/lab/outlet/gym_wrapper.py
deleted file mode 100644
index 790adcf..0000000
--- a/lab/outlet/gym_wrapper.py
+++ /dev/null
@@ -1,86 +0,0 @@
-"""
-Gymnasium-compatible wrapper for the Quote-Control platform.
-
-Provides a standard Gym interface for RL training:
-- observation_space: Box space with flattened observation
-- action_space: Box space with price multipliers [0.5, 2.0]
-- reset(), step(), render(), close() methods
-
-Example:
-    >>> from lab.config import make_retail_platform
-    >>> from lab.outlet.gym_wrapper import QuoteGymEnv
-    >>> env = QuoteGymEnv(make_retail_platform())
-    >>> obs, info = env.reset()
-    >>> obs, reward, done, truncated, info = env.step(env.action_space.sample())
-"""
-from __future__ import annotations
-from typing import Any
-import numpy as np
-
-try:
-    import gymnasium as gym
-    from gymnasium import spaces
-    HAS_GYM = True
-except ImportError:
-    HAS_GYM = False
-
-from .platform import Platform, PlatformConfig
-from .types import Quote, InstrumentSet, StepResult
-
-class QuoteGymEnv:
-    """Gymnasium-compatible environment wrapper.
-
-    Wraps a Platform instance with standard Gym interface.
-    Actions are price multipliers in [0.5, 2.0] applied to reference prices.
-    Observations are flattened numpy arrays containing quotes, fills, exposures.
-    """
-
-    def __init__(self, platform: Platform):
-        if not HAS_GYM:
-            raise ImportError("gymnasium required for QuoteGymEnv")
-        self.platform = platform
-        self.n = platform.instruments.n
-        self._last_result: StepResult | None = None
-
-        # action space: price adjustments as multipliers [0.5, 2.0]
-        self.action_space = spaces.Box(low=0.5, high=2.0, shape=(self.n,), dtype=np.float32)
-
-        # observation space
-        obs_dim = self.n * 4  # quotes + fills + exposures + position
-        if platform.market:
-            obs_dim += self.n  # competitor quotes
-        self.observation_space = spaces.Box(low=-np.inf, high=np.inf,
-                                            shape=(obs_dim,), dtype=np.float32)
-
-    def reset(self, seed: int | None = None, options: dict | None = None) -> tuple[np.ndarray, dict]:
-        result = self.platform.reset(seed)
-        self._last_result = result
-        return result.obs.to_flat().astype(np.float32), result.info
-
-    def step(self, action: np.ndarray) -> tuple[np.ndarray, float, bool, bool, dict]:
-        # convert action (multipliers) to absolute prices
-        refs = self.platform.instruments.refs
-        prices = refs * action
-        result = self.platform.step(prices)
-        self._last_result = result
-        return (result.obs.to_flat().astype(np.float32), result.reward,
-                result.terminated, result.truncated, result.info)
-
-    def render(self) -> None:
-        if self._last_result:
-            m = self._last_result.metrics
-            print(f"t={self.platform._t} pnl={m.pnl:.2f} units={m.units_traded:.0f} "
-                  f"conv={m.conversion:.3f} vol={m.volatility:.3f}")
-
-    def close(self) -> None:
-        pass
-
-def make_env(platform: Platform) -> QuoteGymEnv:
-    return QuoteGymEnv(platform)
-
-if HAS_GYM:
-    # register if gymnasium available
-    try:
-        gym.register(id='QuoteControl-v0', entry_point='outlet.gym_wrapper:QuoteGymEnv')
-    except:
-        pass  # already registered or other issue
diff --git a/lab/outlet/math_util.py b/lab/outlet/math_util.py
deleted file mode 100644
index da78745..0000000
--- a/lab/outlet/math_util.py
+++ /dev/null
@@ -1,57 +0,0 @@
-"""
-Numerical utilities for stable computation.
-
-This module provides numerically stable implementations of common operations:
-- safe_exp, safe_log: Avoid overflow/underflow
-- softmax: Numerically stable softmax
-- sigmoid, clamp: Standard transformations
-- intensity_decay: Avellaneda-Stoikov fill intensity
-- inventory_penalty: Quadratic inventory risk
-- poisson_arrivals, hawkes_intensity: Arrival process helpers
-
-All functions accept both scalars and numpy arrays.
-"""
-import numpy as np
-
-EPS = 1e-8  # small constant to avoid division by zero
-MAX_EXP = 700.0  # maximum safe exponent to avoid overflow
-
-def safe_exp(x: np.ndarray | float) -> np.ndarray | float:
-    return np.exp(np.clip(x, -MAX_EXP, MAX_EXP))
-
-def safe_log(x: np.ndarray | float) -> np.ndarray | float:
-    return np.log(np.maximum(x, EPS))
-
-def clamp(x: np.ndarray | float, lo: float, hi: float) -> np.ndarray | float:
-    return np.clip(x, lo, hi)
-
-def sigmoid(x: np.ndarray | float) -> np.ndarray | float:
-    return 1.0 / (1.0 + safe_exp(-x))
-
-def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
-    x_max = np.max(x, axis=axis, keepdims=True)
-    exp_x = safe_exp(x - x_max)
-    return exp_x / (np.sum(exp_x, axis=axis, keepdims=True) + EPS)
-
-def geometric_series(base: float, ratio: float, n: int) -> np.ndarray:
-    return base * (ratio ** np.arange(n))
-
-def ema(old: float, new: float, alpha: float = 0.1) -> float:
-    return alpha * new + (1 - alpha) * old
-
-def intensity_decay(distance: float, kappa: float = 1.0) -> float:
-    """Avellaneda-Stoikov style fill intensity decay with quote distance"""
-    return safe_exp(-kappa * distance)
-
-def inventory_penalty(q: float, gamma: float = 0.1, sigma: float = 1.0) -> float:
-    """Quadratic inventory risk penalty"""
-    return gamma * sigma**2 * q**2 / 2
-
-def poisson_arrivals(rate: float, dt: float, rng: np.random.Generator) -> int:
-    return rng.poisson(rate * dt)
-
-def hawkes_intensity(base: float, history: np.ndarray, alpha: float, beta: float, t: float) -> float:
-    """Self-exciting Hawkes process intensity"""
-    if len(history) == 0: return base
-    decays = safe_exp(-beta * (t - history[history < t]))
-    return base + alpha * np.sum(decays)
diff --git a/lab/outlet/mechanisms/__init__.py b/lab/outlet/mechanisms/__init__.py
deleted file mode 100644
index 3c3c36e..0000000
--- a/lab/outlet/mechanisms/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from .posted_price import PostedPriceMechanism
-from .two_sided import TwoSidedMechanism
-from .auction import AuctionMechanism
-
-__all__ = ['PostedPriceMechanism', 'TwoSidedMechanism', 'AuctionMechanism']
diff --git a/lab/outlet/mechanisms/auction.py b/lab/outlet/mechanisms/auction.py
deleted file mode 100644
index 2260aef..0000000
--- a/lab/outlet/mechanisms/auction.py
+++ /dev/null
@@ -1,73 +0,0 @@
-"""
-Auction mechanism for reserve pricing and bid shading.
-
-In this mechanism, the agent sets reserve prices that affect
-win probability and clearing prices. Used for ad auctions,
-marketplace auctions, and similar settings.
-"""
-from __future__ import annotations
-from dataclasses import dataclass
-import numpy as np
-from ..types import Quote, Opportunity, Execution, InstrumentSet, MarketState
-from ..constants import Side
-from ..math_util import clamp, sigmoid
-
-@dataclass
-class AuctionConfig:
-    """Configuration for auction mechanism.
-
-    Attributes:
-        min_reserve: Minimum reserve price
-        max_reserve: Maximum reserve price
-        base_win_prob: Baseline win probability at reference reserve
-        sensitivity: How much higher reserves reduce win probability
-    """
-    min_reserve: float = 0.0
-    max_reserve: float = 100.0
-    base_win_prob: float = 0.3
-    sensitivity: float = 2.0
-
-class AuctionMechanism:
-    """Auction mechanism for reserve pricing.
-
-    The agent sets reserve prices that affect:
-    - Win probability: higher reserves reduce chance of winning
-    - Clearing price: bounded between reserve and simulated max bid
-
-    Win probability: base_prob * sigmoid(-sensitivity * (reserve - ref) / ref)
-    Clearing price: max(reserve, min(max_bid, reserve + random_increment))
-
-    Only BUY-side opportunities are processed (auction wins).
-    """
-
-    def __init__(self, cfg: AuctionConfig | None = None):
-        self.cfg = cfg or AuctionConfig()
-
-    def apply_quote(self, quote: Quote, instruments: InstrumentSet,
-                    rng: np.random.Generator) -> Quote:
-        reserves = clamp(quote.prices, self.cfg.min_reserve, self.cfg.max_reserve)
-        return Quote(prices=reserves, propensity=quote.propensity, metadata=quote.metadata)
-
-    def process_opportunity(self, opp: Opportunity, quote: Quote,
-                            instruments: InstrumentSet, market: MarketState | None,
-                            rng: np.random.Generator) -> Execution | None:
-        if opp.side != Side.BUY: return None
-        idx = int(opp.instrument_id)
-        reserve = float(quote.prices[idx])
-        ref = instruments.refs[idx]
-
-        # win probability decreases with higher reserve
-        relative_reserve = (reserve - ref) / (ref + 1e-8)
-        win_prob = self.cfg.base_win_prob * sigmoid(-self.cfg.sensitivity * relative_reserve)
-
-        if rng.random() > win_prob: return None
-
-        # clearing price is between reserve and some max bid (simulated)
-        max_bid = ref * (1 + rng.exponential(0.2))
-        clearing = max(reserve, min(max_bid, reserve + rng.exponential(0.1) * ref))
-
-        return Execution(
-            opportunity_id=opp.id, instrument_id=opp.instrument_id,
-            side=opp.side, size_requested=opp.size, size_filled=opp.size,
-            price=clearing, propensity=quote.propensity * win_prob, t=opp.t
-        )
diff --git a/lab/outlet/mechanisms/posted_price.py b/lab/outlet/mechanisms/posted_price.py
deleted file mode 100644
index 92bac12..0000000
--- a/lab/outlet/mechanisms/posted_price.py
+++ /dev/null
@@ -1,84 +0,0 @@
-"""
-Posted price mechanism for retail dynamic pricing.
-
-In this mechanism, the agent posts a single price per instrument.
-Buyers decide whether to purchase based on the posted price.
-This is the standard e-commerce dynamic pricing model.
-"""
-from __future__ import annotations
-from dataclasses import dataclass
-import numpy as np
-from ..types import Quote, Opportunity, Execution, InstrumentSet, MarketState
-from ..constants import Side
-from ..math_util import clamp
-
-@dataclass
-class PostedPriceConfig:
-    """Configuration for posted price mechanism.
-
-    Attributes:
-        min_price: Absolute minimum price
-        max_price: Absolute maximum price
-        max_delta_pct: Maximum price change per step as fraction of previous
-        min_margin_pct: Minimum margin over cost basis
-        round_to: Price rounding granularity (None = no rounding)
-    """
-    min_price: float = 0.01
-    max_price: float = 1000.0
-    max_delta_pct: float = 0.2
-    min_margin_pct: float = 0.05
-    round_to: float | None = 0.01
-
-class PostedPriceMechanism:
-    """Posted price mechanism for retail dynamic pricing.
-
-    The agent posts a single price per product. Constraints enforced:
-    - Prices within [min_price, max_price]
-    - Margin at least min_margin_pct above cost
-    - Price changes limited to max_delta_pct per step
-    - Prices rounded to round_to granularity
-
-    Only BUY-side opportunities are processed (customers purchasing).
-    """
-
-    def __init__(self, cfg: PostedPriceConfig | None = None):
-        self.cfg = cfg or PostedPriceConfig()
-
-    def apply_quote(self, quote: Quote, instruments: InstrumentSet,
-                    rng: np.random.Generator) -> Quote:
-        prices = quote.prices.copy()
-        costs = instruments.costs
-        refs = instruments.refs
-        c = self.cfg
-
-        # enforce min margin
-        min_prices = costs * (1 + c.min_margin_pct)
-        prices = np.maximum(prices, min_prices)
-
-        # enforce absolute bounds
-        prices = clamp(prices, c.min_price, c.max_price)
-
-        # enforce max delta if we have history
-        if 'prev_prices' in quote.metadata:
-            prev = quote.metadata['prev_prices']
-            max_change = prev * c.max_delta_pct
-            prices = clamp(prices, prev - max_change, prev + max_change)
-
-        # round prices
-        if c.round_to:
-            prices = np.round(prices / c.round_to) * c.round_to
-
-        return Quote(prices=prices, propensity=quote.propensity,
-                     metadata={**quote.metadata, 'prev_prices': prices})
-
-    def process_opportunity(self, opp: Opportunity, quote: Quote,
-                            instruments: InstrumentSet, market: MarketState | None,
-                            rng: np.random.Generator) -> Execution | None:
-        if opp.side != Side.BUY: return None  # posted price is buy-only
-        idx = int(opp.instrument_id)
-        price = float(quote.prices[idx])
-        return Execution(
-            opportunity_id=opp.id, instrument_id=opp.instrument_id,
-            side=opp.side, size_requested=opp.size, size_filled=opp.size,
-            price=price, propensity=quote.propensity, t=opp.t
-        )
diff --git a/lab/outlet/mechanisms/two_sided.py b/lab/outlet/mechanisms/two_sided.py
deleted file mode 100644
index 166f4d9..0000000
--- a/lab/outlet/mechanisms/two_sided.py
+++ /dev/null
@@ -1,89 +0,0 @@
-"""
-Two-sided quoting mechanism for market making.
-
-In this mechanism, the agent posts both bid and ask prices.
-Execution depends on the distance from the market mid-price.
-This models liquidity provision in financial markets.
-"""
-from __future__ import annotations
-from dataclasses import dataclass
-import numpy as np
-from ..types import Quote, Opportunity, Execution, InstrumentSet, MarketState
-from ..constants import Side
-from ..math_util import clamp, intensity_decay
-
-@dataclass
-class TwoSidedConfig:
-    """Configuration for two-sided quoting mechanism.
-
-    Attributes:
-        min_spread: Minimum bid-ask spread
-        max_spread: Maximum bid-ask spread
-        min_price: Absolute minimum price
-        max_price: Absolute maximum price
-        fill_kappa: Intensity decay parameter (higher = faster decay with distance)
-    """
-    min_spread: float = 0.01
-    max_spread: float = 0.5
-    min_price: float = 0.01
-    max_price: float = 10000.0
-    fill_kappa: float = 1.5
-
-class TwoSidedMechanism:
-    """Two-sided quoting mechanism for market making.
-
-    The agent posts bid (buy) and ask (sell) prices around a mid-point.
-    Fill probability decays exponentially with distance from mid-price,
-    following the Avellaneda-Stoikov intensity model.
-
-    Both BUY and SELL opportunities are processed:
-    - BUY: customer buys at agent's ask price
-    - SELL: customer sells at agent's bid price
-    """
-
-    def __init__(self, cfg: TwoSidedConfig | None = None):
-        self.cfg = cfg or TwoSidedConfig()
-
-    def apply_quote(self, quote: Quote, instruments: InstrumentSet,
-                    rng: np.random.Generator) -> Quote:
-        prices = quote.prices.copy()
-        spreads = quote.spreads.copy() if quote.spreads is not None else np.full_like(prices, 0.02)
-        c = self.cfg
-
-        prices = clamp(prices, c.min_price, c.max_price)
-        spreads = clamp(spreads, c.min_spread, c.max_spread)
-
-        # ensure bids < asks
-        half_spread = spreads / 2
-        bids = prices - half_spread
-        asks = prices + half_spread
-        bids = np.maximum(bids, c.min_price)
-        asks = np.minimum(asks, c.max_price)
-        spreads = asks - bids
-        prices = (bids + asks) / 2
-
-        return Quote(prices=prices, spreads=spreads, propensity=quote.propensity,
-                     metadata=quote.metadata)
-
-    def process_opportunity(self, opp: Opportunity, quote: Quote,
-                            instruments: InstrumentSet, market: MarketState | None,
-                            rng: np.random.Generator) -> Execution | None:
-        idx = int(opp.instrument_id)
-        mid = market.mid_prices[idx] if market and market.mid_prices is not None else quote.prices[idx]
-
-        if opp.side == Side.BUY:
-            price = float(quote.asks[idx]) if quote.asks is not None else float(quote.prices[idx])
-            distance = price - mid
-        else:
-            price = float(quote.bids[idx]) if quote.bids is not None else float(quote.prices[idx])
-            distance = mid - price
-
-        # probabilistic fill based on distance from mid
-        fill_prob = intensity_decay(abs(distance), self.cfg.fill_kappa)
-        if rng.random() > fill_prob: return None
-
-        return Execution(
-            opportunity_id=opp.id, instrument_id=opp.instrument_id,
-            side=opp.side, size_requested=opp.size, size_filled=opp.size,
-            price=price, propensity=quote.propensity * fill_prob, t=opp.t
-        )
diff --git a/lab/outlet/objectives/__init__.py b/lab/outlet/objectives/__init__.py
deleted file mode 100644
index 063b7a5..0000000
--- a/lab/outlet/objectives/__init__.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from .base import BaseObjective, CompositeObjective
-from .penalties import (PnLObjective, VolatilityPenalty, HoldingCostPenalty,
-                        LostOpportunityCostPenalty, InventoryRiskPenalty, SpreadCaptureReward)
-from .factory import make_objective, make_composite, retail_objective, market_making_objective
-
-__all__ = [
-    'BaseObjective', 'CompositeObjective',
-    'PnLObjective', 'VolatilityPenalty', 'HoldingCostPenalty',
-    'LostOpportunityCostPenalty', 'InventoryRiskPenalty', 'SpreadCaptureReward',
-    'make_objective', 'make_composite', 'retail_objective', 'market_making_objective',
-]
diff --git a/lab/outlet/objectives/base.py b/lab/outlet/objectives/base.py
deleted file mode 100644
index 49847aa..0000000
--- a/lab/outlet/objectives/base.py
+++ /dev/null
@@ -1,48 +0,0 @@
-"""
-Base classes for reward objectives.
-
-Objectives compute scalar rewards from step metrics. The CompositeObjective
-allows combining multiple objectives with weights for multi-objective optimization.
-"""
-from __future__ import annotations
-from abc import ABC, abstractmethod
-from ..types import Quote, InstrumentSet, StepMetrics, HiddenState, Observation
-
-class BaseObjective(ABC):
-    """Abstract base class for reward objectives.
-
-    Subclasses must implement reward() and breakdown() methods.
-    """
-
-    @abstractmethod
-    def reward(self, quote: Quote, instruments: InstrumentSet,
-               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float: ...
-
-    @abstractmethod
-    def breakdown(self, quote: Quote, instruments: InstrumentSet,
-                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]: ...
-
-class CompositeObjective(BaseObjective):
-    """Weighted sum of multiple objectives.
-
-    Allows combining multiple reward terms (e.g., PnL - holding_cost - volatility).
-
-    Args:
-        objectives: List of (objective, weight) tuples
-    """
-
-    def __init__(self, objectives: list[tuple[BaseObjective, float]]):
-        self.objectives = objectives
-
-    def reward(self, quote: Quote, instruments: InstrumentSet,
-               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
-        return sum(w * obj.reward(quote, instruments, metrics, hidden, obs)
-                   for obj, w in self.objectives)
-
-    def breakdown(self, quote: Quote, instruments: InstrumentSet,
-                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
-        bd = {}
-        for obj, w in self.objectives:
-            for k, v in obj.breakdown(quote, instruments, metrics, hidden, obs).items():
-                bd[k] = w * v
-        return bd
diff --git a/lab/outlet/objectives/factory.py b/lab/outlet/objectives/factory.py
deleted file mode 100644
index 6e75294..0000000
--- a/lab/outlet/objectives/factory.py
+++ /dev/null
@@ -1,82 +0,0 @@
-"""
-Factory functions for creating objectives.
-
-Provides:
-- make_objective: Create single objective by name
-- make_composite: Create weighted combination of objectives
-- retail_objective: Default objective for retail pricing
-- market_making_objective: Default objective for market making
-"""
-from __future__ import annotations
-from .base import BaseObjective, CompositeObjective
-from .penalties import (PnLObjective, VolatilityPenalty, HoldingCostPenalty,
-                        LostOpportunityCostPenalty, InventoryRiskPenalty, SpreadCaptureReward)
-
-REGISTRY: dict[str, type[BaseObjective]] = {
-    'pnl': PnLObjective,
-    'volatility': VolatilityPenalty,
-    'holding_cost': HoldingCostPenalty,
-    'lost_opportunity': LostOpportunityCostPenalty,
-    'inventory_risk': InventoryRiskPenalty,
-    'spread_capture': SpreadCaptureReward,
-}
-
-def make_objective(name: str, **kwargs) -> BaseObjective:
-    """Create an objective by name.
-
-    Args:
-        name: Objective name (pnl, volatility, holding_cost, lost_opportunity,
-              inventory_risk, spread_capture)
-        **kwargs: Passed to objective constructor
-
-    Returns:
-        Instantiated objective
-    """
-    if name not in REGISTRY:
-        raise ValueError(f"Unknown objective: {name}. Available: {list(REGISTRY.keys())}")
-    return REGISTRY[name](**kwargs)
-
-def make_composite(spec: list[tuple[str, float, dict]] | dict[str, float]) -> CompositeObjective:
-    """Create composite objective from specification.
-
-    Args:
-        spec: Either:
-            - list of (name, weight, kwargs) tuples for full control
-            - dict of {name: weight} for simple cases
-
-    Returns:
-        CompositeObjective with specified components
-    """
-    objectives = []
-    if isinstance(spec, dict):
-        for name, weight in spec.items():
-            objectives.append((make_objective(name), weight))
-    else:
-        for name, weight, kwargs in spec:
-            objectives.append((make_objective(name, **kwargs), weight))
-    return CompositeObjective(objectives)
-
-def retail_objective(volatility_weight: float = 0.1, holding_weight: float = 0.5,
-                     stockout_weight: float = 0.3) -> CompositeObjective:
-    """Default objective for retail dynamic pricing.
-
-    Reward = PnL - volatility_weight*volatility - holding_weight*holding_cost
-             - stockout_weight*lost_opportunity
-    """
-    return make_composite({
-        'pnl': 1.0,
-        'volatility': volatility_weight,
-        'holding_cost': holding_weight,
-        'lost_opportunity': stockout_weight,
-    })
-
-def market_making_objective(gamma: float = 0.1, sigma: float = 1.0) -> CompositeObjective:
-    """Default objective for market making.
-
-    Reward = PnL + 0.5*spread_capture - inventory_risk(gamma, sigma)
-    """
-    return CompositeObjective([
-        (PnLObjective(), 1.0),
-        (SpreadCaptureReward(), 0.5),
-        (InventoryRiskPenalty(gamma=gamma, sigma=sigma), 1.0),
-    ])
diff --git a/lab/outlet/objectives/penalties.py b/lab/outlet/objectives/penalties.py
deleted file mode 100644
index 916e0e2..0000000
--- a/lab/outlet/objectives/penalties.py
+++ /dev/null
@@ -1,101 +0,0 @@
-"""
-Standard objective components and penalties.
-
-This module provides common reward terms:
-- PnLObjective: Basic profit and loss
-- VolatilityPenalty: Penalize price volatility for UX
-- HoldingCostPenalty: Inventory holding cost
-- LostOpportunityCostPenalty: Stockout/missed fill cost
-- InventoryRiskPenalty: Quadratic inventory risk (market making)
-- SpreadCaptureReward: Bid-ask spread capture (market making)
-"""
-from __future__ import annotations
-import numpy as np
-from .base import BaseObjective
-from ..types import Quote, InstrumentSet, StepMetrics, HiddenState, Observation
-from ..math_util import inventory_penalty
-
-class PnLObjective(BaseObjective):
-    """Profit and loss reward (revenue - cost)."""
-
-    def reward(self, quote: Quote, instruments: InstrumentSet,
-               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
-        return metrics.pnl
-
-    def breakdown(self, quote: Quote, instruments: InstrumentSet,
-                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
-        return {'pnl': metrics.pnl, 'revenue': metrics.revenue, 'cost': metrics.cost}
-
-class VolatilityPenalty(BaseObjective):
-    """Penalize price volatility for user experience."""
-
-    def __init__(self, scale: float = 1.0):
-        self.scale = scale
-
-    def reward(self, quote: Quote, instruments: InstrumentSet,
-               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
-        return -self.scale * metrics.volatility
-
-    def breakdown(self, quote: Quote, instruments: InstrumentSet,
-                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
-        return {'volatility_penalty': -self.scale * metrics.volatility}
-
-class HoldingCostPenalty(BaseObjective):
-    """Penalty for inventory holding costs."""
-
-    def __init__(self, scale: float = 1.0):
-        self.scale = scale
-
-    def reward(self, quote: Quote, instruments: InstrumentSet,
-               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
-        return -self.scale * metrics.position_cost
-
-    def breakdown(self, quote: Quote, instruments: InstrumentSet,
-                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
-        return {'holding_cost_penalty': -self.scale * metrics.position_cost}
-
-class LostOpportunityCostPenalty(BaseObjective):
-    """Penalty for lost sales due to stockouts or missed fills."""
-
-    def __init__(self, scale: float = 1.0):
-        self.scale = scale
-
-    def reward(self, quote: Quote, instruments: InstrumentSet,
-               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
-        return -self.scale * metrics.lost_opportunity
-
-    def breakdown(self, quote: Quote, instruments: InstrumentSet,
-                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
-        return {'lost_opportunity_penalty': -self.scale * metrics.lost_opportunity}
-
-class InventoryRiskPenalty(BaseObjective):
-    """Quadratic inventory risk penalty (Avellaneda-Stoikov style).
-
-    Penalty = gamma * sigma^2 * q^2 / 2, where q is total position.
-    Encourages market makers to keep inventory near zero.
-    """
-
-    def __init__(self, gamma: float = 0.1, sigma: float = 1.0):
-        self.gamma = gamma
-        self.sigma = sigma
-
-    def reward(self, quote: Quote, instruments: InstrumentSet,
-               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
-        if obs.position is None: return 0.0
-        q = np.sum(obs.position)
-        return -inventory_penalty(q, self.gamma, self.sigma)
-
-    def breakdown(self, quote: Quote, instruments: InstrumentSet,
-                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
-        return {'inventory_risk_penalty': self.reward(quote, instruments, metrics, hidden, obs)}
-
-class SpreadCaptureReward(BaseObjective):
-    """Reward for capturing bid-ask spread in market making."""
-
-    def reward(self, quote: Quote, instruments: InstrumentSet,
-               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
-        return metrics.spread_capture
-
-    def breakdown(self, quote: Quote, instruments: InstrumentSet,
-                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
-        return {'spread_capture': metrics.spread_capture}
diff --git a/lab/outlet/observation.py b/lab/outlet/observation.py
deleted file mode 100644
index cffc71b..0000000
--- a/lab/outlet/observation.py
+++ /dev/null
@@ -1,92 +0,0 @@
-"""
-Observation construction with demand censoring.
-
-This module provides the ObservationBuilder that constructs agent observations
-from step data. The key invariant is that observations only contain censored
-data (fills) and never true demand, ensuring proper research conditions.
-
-The ObservationConfig controls what is included in observations:
-- Position visibility
-- Market/competitor visibility
-- Demand proxy method
-"""
-from __future__ import annotations
-from dataclasses import dataclass
-import numpy as np
-from .types import Quote, InstrumentSet, StepLogs, StepMetrics, MarketState, HiddenState, Observation
-
-@dataclass
-class ObservationConfig:
-    """Configuration for observation construction.
-
-    Attributes:
-        include_position: Include current position in observation
-        include_market: Include market/competitor state in observation
-        mask_true_demand: If True, observation excludes true demand (research mode)
-        demand_proxy: Method for demand proxy ('fills', 'exposures', 'weighted')
-        exposure_weights: Weights for weighted demand proxy
-    """
-    include_position: bool = True
-    include_market: bool = True
-    mask_true_demand: bool = True
-    demand_proxy: str = 'fills'
-    exposure_weights: dict[str, float] | None = None
-
-class DefaultObservationBuilder:
-    """Constructs censored observations for the agent.
-
-    Ensures the key research invariant: observations contain only
-    censored fills (realized sales), never true demand. True demand
-    is placed in the info dict for research analysis only.
-    """
-
-    def __init__(self, cfg: ObservationConfig | None = None):
-        self.cfg = cfg or ObservationConfig()
-
-    def build(self, quote: Quote, instruments: InstrumentSet, logs: StepLogs,
-              metrics: StepMetrics, market: MarketState | None,
-              hidden: HiddenState, mask_demand: bool, t: int) -> Observation:
-        n = instruments.n
-        cfg = self.cfg
-
-        # always show censored fills
-        fills = logs.censored_fills if logs.censored_fills is not None else np.zeros(n)
-
-        # compute exposures from logs
-        if logs.events:
-            exposures = np.zeros(n)
-            for e in logs.events:
-                if e.instrument_id is not None:
-                    exposures[e.instrument_id] += 1
-        else:
-            exposures = logs.aggregates.get('exposures', np.zeros(n))
-
-        # position - only if configured and available
-        position = None
-        if cfg.include_position and instruments.position is not None:
-            position = instruments.position.copy()
-
-        # market state - only if configured
-        obs_market = market if cfg.include_market else None
-
-        return Observation(
-            quotes=quote.prices.copy(),
-            position=position,
-            fills=fills,
-            exposures=exposures,
-            market=obs_market,
-            t=t
-        )
-
-    def make_space(self, n_instruments: int, include_market: bool = True) -> dict:
-        """Returns dict describing observation space for gym"""
-        space = {
-            'quotes': {'shape': (n_instruments,), 'low': 0, 'high': np.inf},
-            'fills': {'shape': (n_instruments,), 'low': 0, 'high': np.inf},
-            'exposures': {'shape': (n_instruments,), 'low': 0, 'high': np.inf},
-        }
-        if self.cfg.include_position:
-            space['position'] = {'shape': (n_instruments,), 'low': -np.inf, 'high': np.inf}
-        if include_market:
-            space['competitor_quotes'] = {'shape': (n_instruments,), 'low': 0, 'high': np.inf}
-        return space
diff --git a/lab/outlet/platform.py b/lab/outlet/platform.py
deleted file mode 100644
index eabb69a..0000000
--- a/lab/outlet/platform.py
+++ /dev/null
@@ -1,285 +0,0 @@
-"""
-Main simulation platform orchestrating the Quote-Control loop.
-
-The Platform class is the central coordinator that:
-1. Receives pricing actions (quotes) from the agent
-2. Generates arrivals via the ArrivalModel
-3. Processes executions via Mechanism and ExecutionModel
-4. Applies position censorship via PositionModel
-5. Computes metrics and reward via Objective
-6. Returns censored observations
-
-Example:
-    >>> from lab.config import make_retail_platform
-    >>> platform = make_retail_platform()
-    >>> result = platform.reset(seed=42)
-    >>> result = platform.step(platform.instruments.refs * 1.1)
-    >>> print(f"PnL: {result.metrics.pnl:.2f}")
-"""
-from __future__ import annotations
-from dataclasses import dataclass, field
-from typing import Any
-import numpy as np
-from .types import (Quote, Opportunity, Execution, InstrumentSet, StepLogs, StepMetrics,
-                    StepEvent, MarketState, HiddenState, Observation, StepResult)
-from .constants import LogLevel, EventType, Side
-from .protocols import Mechanism, ArrivalModel, ExecutionModel, PositionModel, MarketModel, ObservationBuilder, Objective
-from .stock import PositionModel as DefaultPositionModel, PositionConfig
-from .observation import DefaultObservationBuilder, ObservationConfig
-from .objectives.factory import retail_objective
-
-@dataclass
-class PlatformConfig:
-    """Configuration for the simulation platform.
-
-    Attributes:
-        n_instruments: Number of instruments in the simulation
-        max_steps: Maximum steps before episode terminates
-        dt: Time duration per step (affects arrival rates)
-        log_level: Verbosity of logging (NONE, AGG_ONLY, FULL)
-        mask_demand: If True, observations exclude true demand (research mode)
-        seed: Random seed for reproducibility
-    """
-    n_instruments: int = 10
-    max_steps: int = 1000
-    dt: float = 1.0
-    log_level: LogLevel = LogLevel.AGG_ONLY
-    mask_demand: bool = True
-    seed: int | None = None
-
-class Platform:
-    """Main simulation orchestrator implementing Quote -> Arrival -> Execution -> Position.
-
-    The Platform coordinates all components to simulate a pricing environment:
-    - Mechanism: validates quotes and determines execution logic
-    - ArrivalModel: generates demand opportunities
-    - ExecutionModel: computes acceptance probabilities
-    - PositionModel: manages inventory/position and censorship
-    - MarketModel: updates competitor/market state
-    - ObservationBuilder: constructs censored observations
-    - Objective: computes reward from metrics
-
-    Attributes:
-        instruments: The instrument set being priced
-        mechanism: Quote validation and execution mechanism
-        arrival: Demand arrival generator
-        execution: Acceptance probability model
-        position: Inventory/position manager
-        market: Competitor/market dynamics (optional)
-        obs_builder: Observation constructor
-        objective: Reward function
-        cfg: Platform configuration
-    """
-
-    def __init__(self, instruments: InstrumentSet, mechanism: Mechanism,
-                 arrival: ArrivalModel, execution: ExecutionModel,
-                 position: PositionModel | None = None,
-                 market: MarketModel | None = None,
-                 obs_builder: ObservationBuilder | None = None,
-                 objective: Objective | None = None,
-                 cfg: PlatformConfig | None = None):
-        self.instruments = instruments
-        self.mechanism = mechanism
-        self.arrival = arrival
-        self.execution = execution
-        self.position = position or DefaultPositionModel(PositionConfig())
-        self.market = market
-        self.obs_builder = obs_builder or DefaultObservationBuilder()
-        self.objective = objective or retail_objective()
-        self.cfg = cfg or PlatformConfig(n_instruments=instruments.n)
-
-        self._t: int = 0
-        self._rng: np.random.Generator = np.random.default_rng(self.cfg.seed)
-        self._quote: Quote | None = None
-        self._market_state: MarketState | None = None
-        self._hidden: HiddenState = HiddenState()
-        self._prev_prices: np.ndarray | None = None
-
-    def reset(self, seed: int | None = None) -> StepResult:
-        """Reset the platform to initial state.
-
-        Args:
-            seed: Random seed (overrides config seed if provided)
-
-        Returns:
-            Initial StepResult with zeroed metrics and initial observation
-        """
-        self._t = 0
-        self._rng = np.random.default_rng(seed or self.cfg.seed)
-        self._hidden = HiddenState()
-        self._prev_prices = self.instruments.refs.copy()
-
-        # reset position
-        self.position.reset(self.instruments, self._rng)
-        self.instruments.position = self.position.position
-
-        # initial quote at reference prices
-        self._quote = Quote(prices=self.instruments.refs.copy(), propensity=1.0,
-                            metadata={'prev_prices': self._prev_prices})
-        self._quote = self.mechanism.apply_quote(self._quote, self.instruments, self._rng)
-
-        # initial market state
-        if self.market:
-            self._market_state = self.market.step(0, self._quote, self._hidden, self._rng)
-
-        # build initial observation
-        logs = StepLogs(aggregates={'reset': True},
-                        true_demand=np.zeros(self.instruments.n),
-                        censored_fills=np.zeros(self.instruments.n))
-        metrics = StepMetrics()
-        obs = self.obs_builder.build(self._quote, self.instruments, logs, metrics,
-                                     self._market_state, self._hidden, self.cfg.mask_demand, 0)
-
-        return StepResult(obs=obs, reward=0.0, terminated=False, truncated=False,
-                          info={'true_demand': logs.true_demand}, metrics=metrics,
-                          logs=logs, hidden=self._hidden)
-
-    def step(self, action: np.ndarray, propensity: float = 1.0) -> StepResult:
-        """Execute one simulation step with the given pricing action.
-
-        The step proceeds as follows:
-        1. Apply quote constraints via mechanism
-        2. Update market/competitor state
-        3. Generate arrivals
-        4. Process arrivals -> executions with acceptance check
-        5. Apply position censorship to executions
-        6. Update position state
-        7. Compute metrics (PnL, costs, etc.)
-        8. Build logs with propensities
-        9. Construct censored observation
-        10. Compute reward
-
-        Args:
-            action: Price vector for all instruments
-            propensity: P(action | behavior policy) for OPE logging
-
-        Returns:
-            StepResult containing observation, reward, metrics, logs, and hidden state
-        """
-        self._t += 1
-        cfg = self.cfg
-
-        # 1. apply quote from action
-        self._quote = Quote(prices=action, propensity=propensity,
-                            metadata={'prev_prices': self._prev_prices})
-        self._quote = self.mechanism.apply_quote(self._quote, self.instruments, self._rng)
-        self._prev_prices = self._quote.prices.copy()
-        self._hidden.quote_history.append(self._quote.prices.copy())
-
-        # 2. update market/competitors
-        if self.market:
-            self._market_state = self.market.step(self._t, self._quote, self._hidden, self._rng)
-            self._hidden.market_history.append(self._market_state)
-
-        # 3. generate arrivals
-        opps = self.arrival.sample(self._t, cfg.dt, self.instruments,
-                                   self._market_state, self._hidden, self._rng)
-
-        # 4. process opportunities -> executions
-        executions: list[Execution] = []
-        events: list[StepEvent] = []
-        true_demand = np.zeros(self.instruments.n)
-
-        for opp in opps:
-            # log exposure
-            if cfg.log_level == LogLevel.FULL:
-                events.append(StepEvent(t=opp.t, type=EventType.EXPOSURE,
-                                        instrument_id=opp.instrument_id,
-                                        opportunity_id=opp.id,
-                                        price=float(self._quote.prices[opp.instrument_id]),
-                                        propensity=self._quote.propensity))
-
-            # check acceptance
-            prob = self.execution.prob(opp, self._quote, self.instruments,
-                                       self._market_state, self._rng)
-            if self._rng.random() < prob:
-                # create execution
-                exe = self.mechanism.process_opportunity(opp, self._quote, self.instruments,
-                                                         self._market_state, self._rng)
-                if exe:
-                    true_demand[exe.instrument_id] += exe.size_requested
-                    # apply position censorship
-                    exe = self.position.apply_execution(exe)
-                    executions.append(exe)
-                    if cfg.log_level == LogLevel.FULL:
-                        events.append(StepEvent(t=exe.t, type=EventType.EXECUTION,
-                                                instrument_id=exe.instrument_id,
-                                                opportunity_id=exe.opportunity_id,
-                                                price=exe.price, size=exe.size_filled,
-                                                propensity=exe.propensity))
-
-        # 5. update position state
-        self.position.step(self._t)
-        self.instruments.position = self.position.position
-
-        # 6. compute metrics
-        censored_fills = np.zeros(self.instruments.n)
-        revenue = 0.0
-        cost = 0.0
-        spread_capture = 0.0
-
-        for exe in executions:
-            censored_fills[exe.instrument_id] += exe.size_filled
-            if exe.side == Side.BUY:
-                revenue += exe.price * exe.size_filled
-                cost += self.instruments.costs[exe.instrument_id] * exe.size_filled
-            else:
-                revenue -= exe.price * exe.size_filled
-                cost -= self.instruments.costs[exe.instrument_id] * exe.size_filled
-            # spread capture for market making
-            if self._quote.spreads is not None and self._market_state and self._market_state.mid_prices is not None:
-                mid = self._market_state.mid_prices[exe.instrument_id]
-                if exe.side == Side.BUY:
-                    spread_capture += (exe.price - mid) * exe.size_filled
-                else:
-                    spread_capture += (mid - exe.price) * exe.size_filled
-
-        pnl = revenue - cost
-        units = float(np.sum(censored_fills))
-        lost = float(np.sum(true_demand - censored_fills))
-
-        # volatility
-        volatility = 0.0
-        if len(self._hidden.quote_history) > 1:
-            prev = self._hidden.quote_history[-2]
-            volatility = float(np.mean(np.abs(self._quote.prices - prev) / (prev + 1e-8)))
-
-        metrics = StepMetrics(
-            pnl=pnl, revenue=revenue, cost=cost, units_traded=units,
-            position_cost=self.position.holding_cost,
-            lost_opportunity=self.position.shortage_cost + lost * np.mean(self._quote.prices) * 0.1,
-            spread_capture=spread_capture, volatility=volatility,
-            conversion=units / (len(opps) + 1e-8),
-            per_instrument={'fills': censored_fills, 'demand': true_demand}
-        )
-
-        # 7. build logs
-        logs = StepLogs(
-            events=events if cfg.log_level == LogLevel.FULL else None,
-            executions=executions if cfg.log_level == LogLevel.FULL else None,
-            aggregates={'n_arrivals': len(opps), 'n_executions': len(executions),
-                        'exposures': np.bincount([o.instrument_id for o in opps],
-                                                 minlength=self.instruments.n).astype(float)},
-            true_demand=true_demand,
-            censored_fills=censored_fills
-        )
-
-        # 8. build observation
-        obs = self.obs_builder.build(self._quote, self.instruments, logs, metrics,
-                                     self._market_state, self._hidden, cfg.mask_demand, self._t)
-
-        # 9. compute reward
-        reward = self.objective.reward(self._quote, self.instruments, metrics, self._hidden, obs)
-        breakdown = self.objective.breakdown(self._quote, self.instruments, metrics, self._hidden, obs)
-        # print(f"Step {self._t}: Reward={reward:.2f}, Breakdown={breakdown}")
-
-
-        # 10. check termination
-        terminated = self._t >= cfg.max_steps
-        truncated = False
-
-        info = {'true_demand': true_demand, 'breakdown': self.objective.breakdown(
-            self._quote, self.instruments, metrics, self._hidden, obs)}
-
-        return StepResult(obs=obs, reward=reward, terminated=terminated, truncated=truncated,
-                          info=info, metrics=metrics, logs=logs, hidden=self._hidden)
diff --git a/lab/outlet/protocols.py b/lab/outlet/protocols.py
deleted file mode 100644
index 13bf967..0000000
--- a/lab/outlet/protocols.py
+++ /dev/null
@@ -1,297 +0,0 @@
-"""
-Protocol definitions for pluggable simulator components.
-
-This module defines the interfaces (Protocols) that allow swapping different
-implementations for each stage of the Quote -> Arrival -> Execution -> Position
-pipeline. All protocols use structural subtyping (duck typing).
-
-Protocols:
-    Mechanism: How quotes translate to executions (posted price, two-sided, auction)
-    ArrivalModel: How opportunities arrive (Poisson, Hawkes, sessions)
-    ExecutionModel: Acceptance probability given quote (elasticity, intensity)
-    PositionModel: Inventory/position management and censorship
-    MarketModel: Competitor/market dynamics
-    ObservationBuilder: Constructs agent observations with censoring
-    Objective: Computes reward from metrics
-"""
-from __future__ import annotations
-from typing import Protocol, Any, TYPE_CHECKING
-import numpy as np
-if TYPE_CHECKING:
-    from .types import (Quote, Opportunity, Execution, InstrumentSet, StepLogs,
-                        StepMetrics, HiddenState, Observation, MarketState)
-    from .constants import LogLevel
-
-class Mechanism(Protocol):
-    """Defines how quotes translate to executions.
-
-    The Mechanism is the core abstraction that differentiates pricing domains:
-    - PostedPrice: single price, buyer decides to purchase or not
-    - TwoSided: bid/ask spread, execution depends on distance from mid
-    - Auction: reserve price affects win probability and clearing price
-
-    Methods:
-        apply_quote: Enforce constraints and return valid quote
-        process_opportunity: Determine execution given opportunity and quote
-    """
-    def apply_quote(self, quote: Quote, instruments: InstrumentSet,
-                    rng: np.random.Generator) -> Quote:
-        """Apply mechanism-specific constraints to a quote.
-
-        Args:
-            quote: Raw quote from policy
-            instruments: Current instrument set with costs/refs
-            rng: Random generator for stochastic constraints
-
-        Returns:
-            Constrained quote satisfying mechanism rules (min margin, max delta, etc.)
-        """
-        ...
-
-    def process_opportunity(self, opp: Opportunity, quote: Quote,
-                            instruments: InstrumentSet, market: MarketState | None,
-                            rng: np.random.Generator) -> Execution | None:
-        """Process an opportunity against the current quote.
-
-        Args:
-            opp: Incoming opportunity (session, order, request)
-            quote: Current posted quote
-            instruments: Instrument set
-            market: Current market state (competitor prices, mid-prices)
-            rng: Random generator
-
-        Returns:
-            Execution if opportunity converts, None otherwise
-        """
-        ...
-
-class ArrivalModel(Protocol):
-    """Generates opportunities (demand arrivals) for each step.
-
-    Different arrival models capture different demand dynamics:
-    - Poisson: constant rate, memoryless
-    - Hawkes: self-exciting, clustered arrivals
-    - Session: retail browsing with multi-product views
-
-    Methods:
-        sample: Generate opportunities for a time interval
-    """
-    def sample(self, t: float, dt: float, instruments: InstrumentSet,
-               market: MarketState | None, hidden: HiddenState,
-               rng: np.random.Generator) -> list[Opportunity]:
-        """Sample opportunities for time interval [t, t+dt).
-
-        Args:
-            t: Current time
-            dt: Time interval length
-            instruments: Available instruments
-            market: Current market state
-            hidden: Hidden state (contains demand intensity, contamination)
-            rng: Random generator
-
-        Returns:
-            List of opportunities arriving in this interval
-        """
-        ...
-
-class ExecutionModel(Protocol):
-    """Computes acceptance/execution probability given quote and context.
-
-    Different models capture different demand responses:
-    - Elasticity: price sensitivity with competitor cross-effects
-    - Intensity: distance-based fill probability (market making)
-    - Logit: discrete choice model
-
-    Methods:
-        prob: Compute acceptance probability
-        uncensor: Estimate true demand from censored fills
-    """
-    def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet,
-             market: MarketState | None, rng: np.random.Generator) -> float:
-        """Compute probability that opportunity accepts the quote.
-
-        Args:
-            opp: Opportunity to evaluate
-            quote: Current quote
-            instruments: Instrument set
-            market: Market state (competitor prices affect cross-elasticity)
-            rng: Random generator
-
-        Returns:
-            Probability in [0, 1] that opportunity executes
-        """
-        ...
-
-    def uncensor(self, fills: np.ndarray, instruments: InstrumentSet,
-                 context: dict[str, Any] | None = None) -> np.ndarray:
-        """Estimate true demand from censored fills.
-
-        Used for demand estimation research under inventory censorship.
-
-        Args:
-            fills: Observed (censored) fill counts
-            instruments: Instrument set
-            context: Additional context (exposures, prices shown)
-
-        Returns:
-            Estimated true demand counts
-        """
-        ...
-
-class PositionModel(Protocol):
-    """Manages inventory (retail) or position (finance).
-
-    Handles:
-    - Position constraints and censorship
-    - Holding costs (retail) or inventory risk (finance)
-    - Replenishment and order receipt
-
-    Methods:
-        reset: Initialize position state
-        available: Query available capacity for a trade
-        apply_execution: Censor execution by available position
-        step: Process time-based updates (replenishment, holding cost)
-
-    Properties:
-        position: Current position vector
-        holding_cost: Cost incurred this step from holding position
-    """
-    def reset(self, instruments: InstrumentSet, rng: np.random.Generator) -> None:
-        """Initialize position state for new episode."""
-        ...
-
-    def available(self, instrument_id: int, side: Any) -> float:
-        """Query available capacity for a trade.
-
-        Args:
-            instrument_id: Which instrument
-            side: BUY or SELL
-
-        Returns:
-            Maximum tradeable size given current position
-        """
-        ...
-
-    def apply_execution(self, exe: Execution) -> Execution:
-        """Apply position constraints to an execution.
-
-        Args:
-            exe: Proposed execution with size_requested
-
-        Returns:
-            Censored execution with size_filled <= available capacity
-        """
-        ...
-
-    def step(self, t: float) -> None:
-        """Process time-based position updates.
-
-        Handles replenishment receipt, holding cost calculation, etc.
-        """
-        ...
-
-    @property
-    def position(self) -> np.ndarray:
-        """Current position vector (positive=long/inventory, negative=short)."""
-        ...
-
-    @property
-    def holding_cost(self) -> float:
-        """Holding cost incurred this step."""
-        ...
-
-class MarketModel(Protocol):
-    """Models external market dynamics and competitor behavior.
-
-    For retail: competitor price dynamics (static, reactive, stochastic)
-    For finance: mid-price process (GBM, mean-reverting)
-
-    Methods:
-        step: Update market state given agent's quotes
-    """
-    def step(self, t: float, self_quotes: Quote, hidden: HiddenState,
-             rng: np.random.Generator) -> MarketState:
-        """Update market state for this timestep.
-
-        Args:
-            t: Current time
-            self_quotes: Agent's current quotes (competitors may react)
-            hidden: Hidden state (regime info)
-            rng: Random generator
-
-        Returns:
-            Updated market state with competitor prices, mid-prices, volatility
-        """
-        ...
-
-class ObservationBuilder(Protocol):
-    """Constructs agent observations with appropriate censoring.
-
-    Critical for research: ensures agent only sees censored fills,
-    never true demand (which goes in info dict).
-
-    Methods:
-        build: Construct observation from step data
-    """
-    def build(self, quote: Quote, instruments: InstrumentSet, logs: StepLogs,
-              metrics: StepMetrics, market: MarketState | None,
-              hidden: HiddenState, mask_demand: bool, t: int) -> Observation:
-        """Build observation for agent.
-
-        Args:
-            quote: Current quote
-            instruments: Instrument set with positions
-            logs: Step logs with true_demand and censored_fills
-            metrics: Computed metrics
-            market: Market state
-            hidden: Hidden state (not included in obs)
-            mask_demand: If True, exclude true demand from observation
-            t: Current timestep
-
-        Returns:
-            Observation containing only observable quantities
-        """
-        ...
-
-class Objective(Protocol):
-    """Computes reward from step metrics.
-
-    Supports composite objectives with weighted terms:
-    - PnL (profit)
-    - Position costs (holding, inventory risk)
-    - Lost opportunity (stockouts)
-    - Volatility penalty (UX)
-    - Spread capture (market making)
-
-    Methods:
-        reward: Compute scalar reward
-        breakdown: Get per-term contribution for analysis
-    """
-    def reward(self, quote: Quote, instruments: InstrumentSet,
-               metrics: StepMetrics, hidden: HiddenState,
-               obs: Observation) -> float:
-        """Compute scalar reward for this step.
-
-        Args:
-            quote: Current quote
-            instruments: Instrument set
-            metrics: Step metrics (pnl, costs, etc.)
-            hidden: Hidden state
-            obs: Agent observation
-
-        Returns:
-            Scalar reward value
-        """
-        ...
-
-    def breakdown(self, quote: Quote, instruments: InstrumentSet,
-                  metrics: StepMetrics, hidden: HiddenState,
-                  obs: Observation) -> dict[str, float]:
-        """Get reward breakdown by component.
-
-        Useful for analyzing which terms dominate the reward.
-
-        Returns:
-            Dict mapping term names to their contributions
-        """
-        ...
diff --git a/lab/outlet/stock.py b/lab/outlet/stock.py
deleted file mode 100644
index b2c88a2..0000000
--- a/lab/outlet/stock.py
+++ /dev/null
@@ -1,151 +0,0 @@
-"""
-Inventory/position management and instrument factories.
-
-This module provides:
-- PositionConfig: Configuration for position constraints and costs
-- PositionModel: Manages inventory (retail) or position (finance)
-- make_instruments: Factory for creating instrument sets
-
-The PositionModel handles demand censorship by limiting executions
-to available inventory, computing holding costs, and managing replenishment.
-"""
-from __future__ import annotations
-from dataclasses import dataclass, field
-import numpy as np
-from .types import Instrument, InstrumentSet, Execution
-from .constants import Side, InstrumentType
-
-@dataclass
-class PositionConfig:
-    """Configuration for position/inventory management.
-
-    Attributes:
-        initial_position: Starting inventory (None = unlimited, float = same for all)
-        max_position: Maximum long position per instrument
-        min_position: Maximum short position (negative, for finance)
-        holding_cost_rate: Cost per unit per step for holding inventory
-        shortage_cost_rate: Opportunity cost rate for stockouts
-        lead_time: Steps until replenishment orders arrive
-    """
-    initial_position: np.ndarray | float | None = None
-    max_position: float = 1000.0
-    min_position: float = -1000.0
-    holding_cost_rate: float = 0.001
-    shortage_cost_rate: float = 0.05
-    lead_time: int = 0
-
-@dataclass
-class PositionModel:
-    """Manages inventory (retail) or position (finance) with censorship.
-
-    Key responsibilities:
-    - Track current position per instrument
-    - Censor executions when position is insufficient
-    - Compute holding costs per step
-    - Track shortage/stockout costs
-    - Handle replenishment orders with lead time
-
-    For retail: position is inventory (positive), selling reduces it
-    For finance: position can be positive (long) or negative (short)
-    """
-    cfg: PositionConfig
-    n: int = 0
-    _position: np.ndarray = field(default_factory=lambda: np.array([]))
-    _pending_orders: list[tuple[int, np.ndarray]] = field(default_factory=list)
-    _step_holding_cost: float = 0.0
-    _step_shortage_cost: float = 0.0
-
-    def reset(self, instruments: InstrumentSet, rng: np.random.Generator) -> None:
-        self.n = instruments.n
-        if self.cfg.initial_position is None:
-            self._position = np.full(self.n, np.inf)  # unlimited
-        elif isinstance(self.cfg.initial_position, (int, float)):
-            self._position = np.full(self.n, float(self.cfg.initial_position))
-        else:
-            self._position = self.cfg.initial_position.copy().astype(np.float64)
-        self._pending_orders = []
-        self._step_holding_cost = 0.0
-        self._step_shortage_cost = 0.0
-
-    def available(self, instrument_id: int, side: Side) -> float:
-        pos = self._position[instrument_id]
-        if np.isinf(pos): return np.inf
-        if side == Side.BUY:
-            return max(0, pos)  # can sell up to current inventory
-        else:
-            return max(0, self.cfg.max_position - pos)  # can buy up to max
-
-    def apply_execution(self, exe: Execution) -> Execution:
-        idx = int(exe.instrument_id)
-        avail = self.available(idx, exe.side)
-        filled = min(exe.size_requested, avail)
-        shortage = exe.size_requested - filled
-
-        if exe.side == Side.BUY:
-            self._position[idx] -= filled  # sold from inventory
-        else:
-            self._position[idx] += filled  # bought into inventory
-
-        if shortage > 0:
-            self._step_shortage_cost += shortage * exe.price * self.cfg.shortage_cost_rate
-
-        return Execution(
-            opportunity_id=exe.opportunity_id, instrument_id=exe.instrument_id,
-            side=exe.side, size_requested=exe.size_requested,
-            size_filled=filled, price=exe.price, propensity=exe.propensity, t=exe.t
-        )
-
-    def order(self, quantity: np.ndarray) -> None:
-        if self.cfg.lead_time > 0:
-            self._pending_orders.append((self.cfg.lead_time, quantity.copy()))
-        else:
-            self._position += quantity
-
-    def step(self, t: float) -> None:
-        # compute holding cost
-        pos = np.where(np.isinf(self._position), 0, self._position)
-        self._step_holding_cost = float(np.sum(np.abs(pos)) * self.cfg.holding_cost_rate)
-
-        # receive pending orders
-        new_pending = []
-        for (remaining, qty) in self._pending_orders:
-            if remaining <= 1:
-                self._position += qty
-            else:
-                new_pending.append((remaining - 1, qty))
-        self._pending_orders = new_pending
-
-    @property
-    def position(self) -> np.ndarray:
-        return np.where(np.isinf(self._position), -1, self._position)
-
-    @property
-    def holding_cost(self) -> float:
-        return self._step_holding_cost
-
-    @property
-    def shortage_cost(self) -> float:
-        return self._step_shortage_cost
-
-def make_instruments(n: int, cost_range: tuple[float, float] = (1.0, 10.0),
-                     margin_range: tuple[float, float] = (0.2, 0.5),
-                     inst_type: InstrumentType = InstrumentType.SKU,
-                     rng: np.random.Generator | None = None) -> InstrumentSet:
-    """Factory function to create a random instrument set.
-
-    Args:
-        n: Number of instruments to create
-        cost_range: (min, max) for uniform cost sampling
-        margin_range: (min, max) for uniform margin sampling
-        inst_type: Type of instruments (SKU, ASSET, etc.)
-        rng: Random generator (uses default if None)
-
-    Returns:
-        InstrumentSet with n instruments having random costs and margins
-    """
-    rng = rng or np.random.default_rng()
-    costs = rng.uniform(*cost_range, n)
-    margins = rng.uniform(*margin_range, n)
-    items = [Instrument(id=i, type=inst_type, cost_basis=c, reference_price=c*(1+m))
-             for i, (c, m) in enumerate(zip(costs, margins))]
-    return InstrumentSet(instruments=items)
diff --git a/lab/outlet/types.py b/lab/outlet/types.py
deleted file mode 100644
index db49117..0000000
--- a/lab/outlet/types.py
+++ /dev/null
@@ -1,318 +0,0 @@
-"""
-Core data types for the Quote-Control simulator.
-
-This module defines the fundamental data structures used throughout the platform:
-- Identifiers (InstrumentId, OpportunityId, AgentId)
-- Domain objects (Instrument, Quote, Opportunity, Execution)
-- Logging structures (StepEvent, StepLogs, StepMetrics)
-- State containers (MarketState, HiddenState, Observation, StepResult)
-
-All dataclasses are designed to be serializable and numpy-compatible.
-"""
-from __future__ import annotations
-from dataclasses import dataclass, field
-from typing import Any, NewType
-import numpy as np
-from .constants import Side, InstrumentType, OpportunityType, EventType
-
-InstrumentId = NewType('InstrumentId', int)  # unique instrument index
-OpportunityId = NewType('OpportunityId', str)  # unique opportunity/session ID
-AgentId = NewType('AgentId', str)  # unique agent/actor ID
-
-@dataclass
-class Instrument:
-    """Represents a priceable entity in the simulation.
-
-    An instrument can be a retail SKU, financial asset, loan product, or subscription.
-    The cost_basis represents the fundamental value (marginal cost for retail,
-    mid-price for assets, funding rate for loans).
-
-    Attributes:
-        id: Unique identifier for this instrument
-        type: Category of instrument (SKU, ASSET, LOAN, SUBSCRIPTION)
-        cost_basis: Fundamental cost or value (marginal cost, mid-price, funding rate)
-        reference_price: Base or fair price used for action scaling
-        attrs: Additional attributes (quality score, category, volatility, etc.)
-    """
-    id: InstrumentId
-    type: InstrumentType
-    cost_basis: float
-    reference_price: float
-    attrs: dict[str, Any] = field(default_factory=dict)
-
-@dataclass
-class InstrumentSet:
-    """Collection of instruments with optional position tracking.
-
-    Provides vectorized access to instrument properties for efficient computation.
-    Position can be positive (long/inventory) or negative (short) for financial assets.
-
-    Attributes:
-        instruments: List of Instrument objects
-        position: Current position per instrument (None = unlimited capacity)
-
-    Properties:
-        n: Number of instruments
-        costs: Vector of cost bases
-        refs: Vector of reference prices
-    """
-    instruments: list[Instrument]
-    position: np.ndarray | None = None
-
-    @property
-    def n(self) -> int: return len(self.instruments)
-    @property
-    def costs(self) -> np.ndarray: return np.array([i.cost_basis for i in self.instruments], np.float32)
-    @property
-    def refs(self) -> np.ndarray: return np.array([i.reference_price for i in self.instruments], np.float32)
-
-@dataclass
-class Quote:
-    """Price quote set by the policy - the action in the MDP.
-
-    Supports multiple quoting mechanisms:
-    - Posted price: only `prices` field used
-    - Two-sided: `prices` as mid, `spreads` for bid-ask width
-    - Auction: `prices` as reserve prices
-
-    The propensity field is critical for off-policy evaluation (OPE).
-
-    Attributes:
-        prices: Posted prices (retail) or mid-quotes (market making)
-        spreads: Bid-ask spread width for two-sided quoting (None for posted price)
-        propensity: P(this quote | behavior policy) for importance sampling
-        metadata: Additional info (prev_prices for delta constraints, etc.)
-
-    Properties:
-        bids: Computed bid prices (mid - spread/2)
-        asks: Computed ask prices (mid + spread/2)
-    """
-    prices: np.ndarray
-    spreads: np.ndarray | None = None
-    propensity: float = 1.0
-    metadata: dict[str, Any] = field(default_factory=dict)
-
-    @property
-    def bids(self) -> np.ndarray | None:
-        return self.prices - self.spreads/2 if self.spreads is not None else None
-    @property
-    def asks(self) -> np.ndarray | None:
-        return self.prices + self.spreads/2 if self.spreads is not None else None
-
-@dataclass
-class Opportunity:
-    """An arrival event that may result in a transaction.
-
-    Opportunities are the demand side of the simulation:
-    - Retail: browsing session with purchase intent
-    - Market making: incoming market order
-    - Lending: loan application
-
-    The context dict carries segment/type information used by execution models.
-
-    Attributes:
-        id: Unique identifier for this opportunity
-        type: Category (SESSION, MARKET_ORDER, REQUEST)
-        side: BUY or SELL intent
-        instrument_id: Which instrument the opportunity targets
-        size: Requested transaction size (units, shares, principal)
-        t: Arrival timestamp
-        context: Segment info (is_scraper, credit_score, urgency, etc.)
-    """
-    id: OpportunityId
-    type: OpportunityType
-    side: Side
-    instrument_id: InstrumentId
-    size: float = 1.0
-    t: float = 0.0
-    context: dict[str, Any] = field(default_factory=dict)
-
-@dataclass
-class Execution:
-    """A realized transaction after acceptance and position censorship.
-
-    The difference between size_requested and size_filled represents
-    censored demand due to inventory/position constraints.
-
-    Attributes:
-        opportunity_id: Links back to the originating Opportunity
-        instrument_id: Which instrument was traded
-        side: BUY or SELL
-        size_requested: Original requested size (true demand)
-        size_filled: Actual filled size after censorship
-        price: Execution price
-        propensity: Combined propensity for OPE (quote * acceptance)
-        t: Execution timestamp
-    """
-    opportunity_id: OpportunityId
-    instrument_id: InstrumentId
-    side: Side
-    size_requested: float
-    size_filled: float
-    price: float
-    propensity: float = 1.0
-    t: float = 0.0
-
-@dataclass
-class StepEvent:
-    """Generic logged event"""
-    t: float
-    type: EventType
-    instrument_id: InstrumentId | None = None
-    opportunity_id: OpportunityId | None = None
-    price: float | None = None
-    size: float | None = None
-    propensity: float = 1.0
-    metadata: dict[str, Any] = field(default_factory=dict)
-
-@dataclass
-class StepLogs:
-    """Container for all logging data from a simulation step.
-
-    Supports both detailed event logging (for OPE) and aggregate-only mode
-    (for fast simulation). The true_demand vs censored_fills distinction
-    is critical for research on demand estimation under censorship.
-
-    Attributes:
-        events: Detailed event log (None if LogLevel != FULL)
-        executions: List of executed transactions (None if LogLevel != FULL)
-        aggregates: Always-available aggregate statistics
-        true_demand: Oracle demand before censorship (for research, not in obs)
-        censored_fills: Realized fills after position constraints (observable)
-    """
-    events: list[StepEvent] | None = None
-    executions: list[Execution] | None = None
-    aggregates: dict[str, Any] = field(default_factory=dict)
-    true_demand: np.ndarray | None = None
-    censored_fills: np.ndarray | None = None
-
-@dataclass
-class StepMetrics:
-    """Computed metrics for a single simulation step.
-
-    Metrics are domain-aware: retail uses revenue/cost/holding_cost,
-    market making uses spread_capture and inventory risk.
-
-    Attributes:
-        pnl: Profit and loss (revenue - cost for retail, mark-to-market for finance)
-        revenue: Gross revenue from sales/executions
-        cost: Cost of goods sold or position acquisition cost
-        units_traded: Total units/shares transacted
-        position_cost: Holding cost (retail) or inventory risk penalty (finance)
-        lost_opportunity: Cost of stockouts or missed fills
-        spread_capture: Bid-ask spread captured (market making)
-        volatility: Price volatility metric for UX consideration
-        conversion: Fill rate (executions / opportunities)
-        per_instrument: Per-instrument breakdowns (fills, demand, etc.)
-    """
-    pnl: float = 0.0
-    revenue: float = 0.0
-    cost: float = 0.0
-    units_traded: float = 0.0
-    position_cost: float = 0.0
-    lost_opportunity: float = 0.0
-    spread_capture: float = 0.0
-    volatility: float = 0.0
-    conversion: float = 0.0
-    per_instrument: dict[str, np.ndarray] = field(default_factory=dict)
-
-@dataclass
-class MarketState:
-    """External market conditions and competitor state.
-
-    For retail: competitor_quotes drives cross-elasticity effects.
-    For finance: mid_prices and volatility drive execution dynamics.
-
-    Attributes:
-        competitor_quotes: Competitor posted prices (retail)
-        mid_prices: Market mid-prices for assets (finance)
-        volatility: Per-instrument volatility estimate
-        regime: Market regime identifier (normal, price_war, high_vol, etc.)
-        t: Timestamp of this market state
-    """
-    competitor_quotes: np.ndarray | None = None
-    mid_prices: np.ndarray | None = None
-    volatility: np.ndarray | None = None
-    regime: str = 'normal'
-    t: float = 0.0
-
-@dataclass
-class HiddenState:
-    """Internal simulator state not exposed to the agent.
-
-    Contains oracle information for research analysis and
-    history needed for non-stationary dynamics.
-
-    Attributes:
-        true_demand_intensity: Latent demand multiplier
-        contamination: Fraction of arrivals that are adversarial/scraper
-        regime: Current market/competitor regime
-        quote_history: History of agent quotes for volatility calculation
-        market_history: History of market states for analysis
-    """
-    true_demand_intensity: float = 1.0
-    contamination: float = 0.0
-    regime: str = 'normal'
-    quote_history: list[np.ndarray] = field(default_factory=list)
-    market_history: list[MarketState] = field(default_factory=list)
-
-@dataclass
-class Observation:
-    """Observable state provided to the agent - censored view only.
-
-    Critical invariant: Observation never contains true_demand, only
-    censored fills. This enforces the censorship research setting.
-
-    Attributes:
-        quotes: Current posted quotes (the agent's last action)
-        position: Current inventory/position state
-        fills: Censored execution counts per instrument
-        exposures: Opportunity exposure counts per instrument
-        market: Observable market state (competitor prices, volatility)
-        t: Current timestep
-        extra: Additional observable features
-
-    Methods:
-        to_flat: Flatten to numpy array for gym compatibility
-    """
-    quotes: np.ndarray
-    position: np.ndarray | None
-    fills: np.ndarray
-    exposures: np.ndarray
-    market: MarketState | None
-    t: int
-    extra: dict[str, Any] = field(default_factory=dict)
-
-    def to_flat(self) -> np.ndarray:
-        """Flatten observation to 1D numpy array for gym environments."""
-        parts = [self.quotes, self.fills, self.exposures]
-        if self.position is not None: parts.append(self.position)
-        if self.market and self.market.competitor_quotes is not None:
-            parts.append(self.market.competitor_quotes)
-        return np.concatenate([p.flatten() for p in parts])
-
-@dataclass
-class StepResult:
-    """Complete result from a simulation step.
-
-    Follows gymnasium convention for obs, reward, terminated, truncated, info.
-    Additionally provides metrics, logs, and hidden state for research.
-
-    Attributes:
-        obs: Observable state (censored)
-        reward: Scalar reward from objective function
-        terminated: Episode ended naturally (max_steps reached)
-        truncated: Episode ended early (bankruptcy, constraint violation)
-        info: Additional info dict (contains true_demand for research)
-        metrics: Computed metrics for this step
-        logs: Event logs and aggregates
-        hidden: Internal simulator state (oracle info)
-    """
-    obs: Observation
-    reward: float
-    terminated: bool
-    truncated: bool
-    info: dict[str, Any]
-    metrics: StepMetrics
-    logs: StepLogs
-    hidden: HiddenState
diff --git a/lab/population/__init__.py b/lab/population/__init__.py
deleted file mode 100644
index 081dbd0..0000000
--- a/lab/population/__init__.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from .arrivals import PoissonArrivalModel, HawkesArrivalModel, SessionArrivalModel
-from .execution import ElasticityExecutionModel, IntensityExecutionModel, LogitExecutionModel
-from .competitors import (StaticCompetitorModel, ReactiveCompetitorModel,
-                          StochasticCompetitorModel, GBMMarketModel)
-
-__all__ = [
-    'PoissonArrivalModel', 'HawkesArrivalModel', 'SessionArrivalModel',
-    'ElasticityExecutionModel', 'IntensityExecutionModel', 'LogitExecutionModel',
-    'StaticCompetitorModel', 'ReactiveCompetitorModel', 'StochasticCompetitorModel', 'GBMMarketModel',
-]
diff --git a/lab/population/arrivals.py b/lab/population/arrivals.py
deleted file mode 100644
index b7e7ed6..0000000
--- a/lab/population/arrivals.py
+++ /dev/null
@@ -1,168 +0,0 @@
-"""
-Arrival models for generating demand opportunities.
-
-This module provides different arrival processes:
-- PoissonArrivalModel: Constant-rate memoryless arrivals
-- HawkesArrivalModel: Self-exciting clustered arrivals (market orders)
-- SessionArrivalModel: Retail browsing sessions with multi-product views
-
-Each model implements the ArrivalModel protocol and generates Opportunity objects
-that flow through the execution pipeline.
-"""
-from __future__ import annotations
-from dataclasses import dataclass
-from typing import Callable
-import numpy as np
-from uuid import uuid4
-from ..outlet.types import Opportunity, InstrumentSet, MarketState, HiddenState
-from ..outlet.constants import Side, OpportunityType
-from ..outlet.math_util import poisson_arrivals, hawkes_intensity
-
-@dataclass
-class PoissonArrivalConfig:
-    """Configuration for Poisson arrival process.
-
-    Attributes:
-        base_rate: Expected arrivals per unit time (scaled by hidden.true_demand_intensity)
-        side_probs: Probability distribution over BUY/SELL sides
-    """
-    base_rate: float = 10.0
-    side_probs: dict[Side, float] = None
-
-    def __post_init__(self):
-        if self.side_probs is None:
-            self.side_probs = {Side.BUY: 1.0}
-
-class PoissonArrivalModel:
-    """Homogeneous Poisson arrival process.
-
-    Generates arrivals at a constant rate (modulated by demand intensity).
-    Suitable for stationary demand or as a baseline model.
-
-    The actual arrival count follows Poisson(rate * dt * intensity).
-    """
-
-    def __init__(self, cfg: PoissonArrivalConfig | None = None):
-        self.cfg = cfg or PoissonArrivalConfig()
-
-    def sample(self, t: float, dt: float, instruments: InstrumentSet,
-               market: MarketState | None, hidden: HiddenState,
-               rng: np.random.Generator) -> list[Opportunity]:
-        n_arrivals = poisson_arrivals(self.cfg.base_rate * hidden.true_demand_intensity, dt, rng)
-        opps = []
-        for _ in range(n_arrivals):
-            inst_id = rng.integers(0, instruments.n)
-            side = rng.choice(list(self.cfg.side_probs.keys()),
-                              p=list(self.cfg.side_probs.values()))
-            opps.append(Opportunity(
-                id=str(uuid4())[:8], type=OpportunityType.SESSION,
-                side=side, instrument_id=inst_id, size=1.0, t=t,
-                context={'segment': 'default'}
-            ))
-        return opps
-
-@dataclass
-class HawkesArrivalConfig:
-    """Configuration for Hawkes self-exciting process.
-
-    Attributes:
-        base_rate: Baseline arrival intensity
-        alpha: Excitation strength (how much each arrival increases intensity)
-        beta: Decay rate (how quickly excitation fades)
-        side_probs: Probability distribution over BUY/SELL sides
-    """
-    base_rate: float = 5.0
-    alpha: float = 0.5
-    beta: float = 1.0
-    side_probs: dict[Side, float] = None
-
-    def __post_init__(self):
-        if self.side_probs is None:
-            self.side_probs = {Side.BUY: 0.5, Side.SELL: 0.5}
-
-class HawkesArrivalModel:
-    """Self-exciting Hawkes point process for clustered arrivals.
-
-    Models order flow where arrivals cluster in time (momentum, herding).
-    Intensity: lambda(t) = base + alpha * sum(exp(-beta * (t - t_i)))
-
-    Used for market making scenarios where orders arrive in bursts.
-    """
-
-    def __init__(self, cfg: HawkesArrivalConfig | None = None):
-        self.cfg = cfg or HawkesArrivalConfig()
-        self._history: np.ndarray = np.array([])
-
-    def sample(self, t: float, dt: float, instruments: InstrumentSet,
-               market: MarketState | None, hidden: HiddenState,
-               rng: np.random.Generator) -> list[Opportunity]:
-        intensity = hawkes_intensity(
-            self.cfg.base_rate * hidden.true_demand_intensity,
-            self._history, self.cfg.alpha, self.cfg.beta, t
-        )
-        n_arrivals = poisson_arrivals(intensity, dt, rng)
-        opps = []
-        for i in range(n_arrivals):
-            arr_t = t + rng.uniform(0, dt)
-            self._history = np.append(self._history, arr_t)
-            inst_id = rng.integers(0, instruments.n)
-            side = rng.choice(list(self.cfg.side_probs.keys()),
-                              p=list(self.cfg.side_probs.values()))
-            opps.append(Opportunity(
-                id=str(uuid4())[:8], type=OpportunityType.MARKET_ORDER,
-                side=side, instrument_id=inst_id,
-                size=rng.exponential(1.0), t=arr_t,
-                context={'intensity': intensity}
-            ))
-        # decay old history
-        self._history = self._history[self._history > t - 10]
-        return opps
-
-@dataclass
-class SessionArrivalConfig:
-    """Configuration for retail session arrivals.
-
-    Attributes:
-        sessions_per_step: Number of browsing sessions per step
-        views_per_session: (min, max) product views per session
-        contamination: Fraction of sessions that are scrapers/bots
-    """
-    sessions_per_step: int = 20
-    views_per_session: tuple[int, int] = (1, 5)
-    contamination: float = 0.0
-
-class SessionArrivalModel:
-    """Retail browsing session model with multi-product views.
-
-    Each session views multiple products, generating one opportunity per view.
-    Scraper sessions (controlled by contamination) view more products
-    but convert at lower rates (handled by ExecutionModel).
-    """
-
-    def __init__(self, cfg: SessionArrivalConfig | None = None):
-        self.cfg = cfg or SessionArrivalConfig()
-
-    def sample(self, t: float, dt: float, instruments: InstrumentSet,
-               market: MarketState | None, hidden: HiddenState,
-               rng: np.random.Generator) -> list[Opportunity]:
-        n_sessions = self.cfg.sessions_per_step
-        contamination = hidden.contamination if hidden else self.cfg.contamination
-        opps = []
-
-        for _ in range(n_sessions):
-            is_scraper = rng.random() < contamination
-            n_views = rng.integers(*self.cfg.views_per_session)
-            sid = str(uuid4())[:8]
-
-            # scrapers view more products
-            if is_scraper:
-                n_views = min(instruments.n, n_views * 3)
-
-            viewed = rng.choice(instruments.n, size=min(n_views, instruments.n), replace=False)
-            for inst_id in viewed:
-                opps.append(Opportunity(
-                    id=f"{sid}-{inst_id}", type=OpportunityType.SESSION,
-                    side=Side.BUY, instrument_id=int(inst_id), size=1.0, t=t,
-                    context={'session_id': sid, 'is_scraper': is_scraper, 'n_views': n_views}
-                ))
-        return opps
diff --git a/lab/population/competitors.py b/lab/population/competitors.py
deleted file mode 100644
index 9417709..0000000
--- a/lab/population/competitors.py
+++ /dev/null
@@ -1,189 +0,0 @@
-"""
-Market and competitor models for external dynamics.
-
-This module provides models for competitor pricing (retail) and market dynamics (finance):
-- StaticCompetitorModel: Fixed competitor prices
-- ReactiveCompetitorModel: Competitor reacts to agent's prices, can trigger price wars
-- StochasticCompetitorModel: Random walk competitor prices
-- GBMMarketModel: Geometric Brownian Motion for asset mid-prices
-
-Each model implements the MarketModel protocol.
-"""
-from __future__ import annotations
-from dataclasses import dataclass
-import numpy as np
-from ..outlet.types import Quote, MarketState, HiddenState
-from ..outlet.math_util import clamp, ema
-
-@dataclass
-class StaticCompetitorConfig:
-    """Configuration for static competitor.
-
-    Attributes:
-        markup: Fixed percentage markup over reference prices
-    """
-    markup: float = 0.1
-
-class StaticCompetitorModel:
-    """Static competitor with fixed markup pricing.
-
-    Competitor prices = reference * (1 + markup).
-    Useful as a baseline or for testing without competitor dynamics.
-    """
-
-    def __init__(self, cfg: StaticCompetitorConfig | None = None, refs: np.ndarray | None = None):
-        self.cfg = cfg or StaticCompetitorConfig()
-        self.refs = refs
-
-    def step(self, t: float, self_quotes: Quote, hidden: HiddenState,
-             rng: np.random.Generator) -> MarketState:
-        refs = self.refs if self.refs is not None else self_quotes.prices
-        comp_prices = refs * (1 + self.cfg.markup)
-        return MarketState(competitor_quotes=comp_prices, regime='static', t=t)
-
-@dataclass
-class ReactiveCompetitorConfig:
-    """Configuration for reactive competitor.
-
-    Attributes:
-        follow_weight: Smoothing weight for price following (0=ignore, 1=instant)
-        band_pct: Maximum deviation from reference prices
-        war_threshold: Relative price diff that triggers price war
-        war_aggression: How much competitor cuts prices during war
-    """
-    follow_weight: float = 0.3
-    band_pct: float = 0.1
-    war_threshold: float = -0.15
-    war_aggression: float = 0.2
-
-class ReactiveCompetitorModel:
-    """Competitor that reacts to agent's prices with price war dynamics.
-
-    The competitor follows the agent's prices with smoothing.
-    If the agent undercuts significantly (beyond war_threshold),
-    a price war is triggered where the competitor becomes more aggressive.
-
-    This creates non-stationary dynamics that test policy robustness.
-    """
-
-    def __init__(self, cfg: ReactiveCompetitorConfig | None = None, refs: np.ndarray | None = None):
-        self.cfg = cfg or ReactiveCompetitorConfig()
-        self.refs = refs
-        self._prices: np.ndarray | None = None
-        self._in_war: bool = False
-
-    def step(self, t: float, self_quotes: Quote, hidden: HiddenState,
-             rng: np.random.Generator) -> MarketState:
-        refs = self.refs if self.refs is not None else self_quotes.prices
-        c = self.cfg
-
-        if self._prices is None:
-            self._prices = refs.copy()
-
-        # check for price war trigger
-        relative_diff = (self_quotes.prices - self._prices) / (self._prices + 1e-8)
-        if np.any(relative_diff < c.war_threshold):
-            self._in_war = True
-        elif np.all(relative_diff > -c.war_threshold / 2):
-            self._in_war = False
-
-        # update prices
-        if self._in_war:
-            target = self_quotes.prices * (1 - c.war_aggression)
-            hidden.regime = 'price_war'
-        else:
-            target = self_quotes.prices * (1 + c.follow_weight * 0.05)
-            hidden.regime = 'normal'
-
-        # follow with smoothing
-        new_prices = np.array([ema(old, new, c.follow_weight)
-                               for old, new in zip(self._prices, target)])
-
-        # stay within band
-        new_prices = clamp(new_prices, refs * (1 - c.band_pct), refs * (1 + c.band_pct))
-        self._prices = new_prices
-
-        return MarketState(competitor_quotes=new_prices, regime=hidden.regime, t=t)
-
-@dataclass
-class StochasticCompetitorConfig:
-    """Configuration for stochastic competitor.
-
-    Attributes:
-        drift: Price drift per step
-        volatility: Price volatility (std of random shocks)
-        mean_revert: Mean reversion strength toward reference
-    """
-    drift: float = 0.0
-    volatility: float = 0.02
-    mean_revert: float = 0.1
-
-class StochasticCompetitorModel:
-    """Ornstein-Uhlenbeck style stochastic competitor prices.
-
-    Prices follow: dP = drift + mean_revert*(ref - P) + volatility*P*dW
-
-    Provides non-stationary competitor dynamics independent of agent actions.
-    Useful for testing robustness to market noise.
-    """
-
-    def __init__(self, cfg: StochasticCompetitorConfig | None = None, refs: np.ndarray | None = None):
-        self.cfg = cfg or StochasticCompetitorConfig()
-        self.refs = refs
-        self._prices: np.ndarray | None = None
-
-    def step(self, t: float, self_quotes: Quote, hidden: HiddenState,
-             rng: np.random.Generator) -> MarketState:
-        refs = self.refs if self.refs is not None else self_quotes.prices
-        c = self.cfg
-
-        if self._prices is None:
-            self._prices = refs.copy()
-
-        # Ornstein-Uhlenbeck style dynamics
-        n = len(self._prices)
-        noise = rng.normal(0, c.volatility, n)
-        reversion = c.mean_revert * (refs - self._prices)
-        self._prices = self._prices + c.drift + reversion + noise * self._prices
-        self._prices = np.maximum(self._prices, refs * 0.5)
-
-        return MarketState(competitor_quotes=self._prices.copy(), regime='stochastic', t=t)
-
-@dataclass
-class GBMMarketConfig:
-    """Configuration for GBM market model.
-
-    Attributes:
-        mu: Price drift (expected return)
-        sigma: Price volatility
-        dt: Time step size
-    """
-    mu: float = 0.0
-    sigma: float = 0.1
-    dt: float = 1.0
-
-class GBMMarketModel:
-    """Geometric Brownian Motion model for asset mid-prices.
-
-    Standard Black-Scholes dynamics: dS = mu*S*dt + sigma*S*dW
-
-    Used for market making scenarios where the underlying asset price
-    follows a random walk. The agent quotes around this moving mid-price.
-    """
-
-    def __init__(self, cfg: GBMMarketConfig | None = None, initial: np.ndarray | None = None):
-        self.cfg = cfg or GBMMarketConfig()
-        self._mids = initial
-
-    def step(self, t: float, self_quotes: Quote, hidden: HiddenState,
-             rng: np.random.Generator) -> MarketState:
-        if self._mids is None:
-            self._mids = self_quotes.prices.copy()
-
-        c = self.cfg
-        n = len(self._mids)
-        z = rng.standard_normal(n)
-        self._mids = self._mids * np.exp((c.mu - 0.5*c.sigma**2)*c.dt + c.sigma*np.sqrt(c.dt)*z)
-
-        vol = np.full(n, c.sigma)
-        return MarketState(mid_prices=self._mids.copy(), volatility=vol, regime='gbm', t=t)
diff --git a/lab/population/execution.py b/lab/population/execution.py
deleted file mode 100644
index 97484b2..0000000
--- a/lab/population/execution.py
+++ /dev/null
@@ -1,174 +0,0 @@
-"""
-Execution models for computing acceptance/fill probabilities.
-
-This module provides different models for how opportunities convert to executions:
-- ElasticityExecutionModel: Price elasticity with competitor cross-effects (retail)
-- IntensityExecutionModel: Distance-based fill intensity (market making)
-- LogitExecutionModel: Discrete choice model
-
-Each model implements the ExecutionModel protocol.
-"""
-from __future__ import annotations
-from dataclasses import dataclass
-from typing import Any
-import numpy as np
-from ..outlet.types import Opportunity, Quote, InstrumentSet, MarketState
-from ..outlet.constants import Side
-from ..outlet.math_util import sigmoid, safe_log, intensity_decay, EPS
-
-@dataclass
-class ElasticityConfig:
-    """Configuration for price elasticity execution model.
-
-    Attributes:
-        base_prob: Baseline purchase probability at reference price
-        price_sensitivity: Own-price elasticity coefficient
-        cross_elasticity: Competitor price cross-elasticity
-        scraper_conversion: Multiplier for scraper conversion (typically << 1)
-    """
-    base_prob: float = 0.3
-    price_sensitivity: float = 2.0
-    cross_elasticity: float = 0.5
-    scraper_conversion: float = 0.01
-
-class ElasticityExecutionModel:
-    """Price elasticity model for retail dynamic pricing.
-
-    P(buy) = base_prob * exp(-sensitivity * log(price/ref)) * cross_effect * scraper_mult
-
-    Higher prices reduce purchase probability exponentially.
-    Competitor undercutting shifts demand away from the platform.
-    Scrapers convert at a much lower rate (reconnaissance, not purchase).
-    """
-
-    def __init__(self, cfg: ElasticityConfig | None = None):
-        self.cfg = cfg or ElasticityConfig()
-
-    def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet,
-             market: MarketState | None, rng: np.random.Generator) -> float:
-        idx = int(opp.instrument_id)
-        price = quote.prices[idx]
-        ref = instruments.refs[idx]
-
-        # base probability adjusted by price ratio
-        log_ratio = safe_log(price / ref)
-        prob = self.cfg.base_prob * np.exp(-self.cfg.price_sensitivity * log_ratio)
-
-        # cross-elasticity: competitor undercutting increases their share
-        if market and market.competitor_quotes is not None:
-            comp_price = market.competitor_quotes[idx]
-            if comp_price < price:
-                prob *= np.exp(-self.cfg.cross_elasticity * (price - comp_price) / ref)
-
-        # scrapers convert at much lower rate
-        if opp.context.get('is_scraper', False):
-            prob *= self.cfg.scraper_conversion
-
-        return float(np.clip(prob, 0, 1))
-
-    def uncensor(self, fills: np.ndarray, instruments: InstrumentSet,
-                 context: dict[str, Any] | None = None) -> np.ndarray:
-        # simple imputation: assume fills = prob * exposures, invert
-        exposures = context.get('exposures', fills) if context else fills
-        avg_prob = self.cfg.base_prob
-        return fills / (avg_prob + EPS)
-
-@dataclass
-class IntensityConfig:
-    """Configuration for intensity-based execution model.
-
-    Attributes:
-        base_intensity: Baseline fill intensity
-        kappa: Decay rate with distance from mid-price
-        vol_scale: Volatility multiplier for fill intensity
-    """
-    base_intensity: float = 1.0
-    kappa: float = 1.5
-    vol_scale: float = 0.5
-
-class IntensityExecutionModel:
-    """Avellaneda-Stoikov style fill intensity for market making.
-
-    Fill probability decays exponentially with distance from mid-price:
-    P(fill) = base * exp(-kappa * |quote - mid|) * (1 + vol_scale * sigma)
-
-    Tighter spreads (closer to mid) have higher fill probability.
-    Higher volatility increases fill probability (more aggressive traders).
-    """
-
-    def __init__(self, cfg: IntensityConfig | None = None):
-        self.cfg = cfg or IntensityConfig()
-
-    def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet,
-             market: MarketState | None, rng: np.random.Generator) -> float:
-        idx = int(opp.instrument_id)
-
-        # get mid price from market or use quote price
-        if market and market.mid_prices is not None:
-            mid = market.mid_prices[idx]
-        else:
-            mid = quote.prices[idx]
-
-        # compute distance from mid
-        if opp.side == Side.BUY:
-            exec_price = quote.asks[idx] if quote.asks is not None else quote.prices[idx]
-            distance = exec_price - mid
-        else:
-            exec_price = quote.bids[idx] if quote.bids is not None else quote.prices[idx]
-            distance = mid - exec_price
-
-        # intensity decays with distance
-        intensity = self.cfg.base_intensity * intensity_decay(abs(distance), self.cfg.kappa)
-
-        # volatility increases fill probability
-        if market and market.volatility is not None:
-            vol = market.volatility[idx]
-            intensity *= (1 + self.cfg.vol_scale * vol)
-
-        return float(np.clip(intensity, 0, 1))
-
-    def uncensor(self, fills: np.ndarray, instruments: InstrumentSet,
-                 context: dict[str, Any] | None = None) -> np.ndarray:
-        return fills  # market making doesn't have same censorship concept
-
-@dataclass
-class LogitConfig:
-    """Configuration for logit discrete choice model.
-
-    Attributes:
-        beta_0: Intercept (base utility)
-        beta_price: Price coefficient (typically negative)
-        beta_quality: Quality attribute coefficient
-    """
-    beta_0: float = 0.5
-    beta_price: float = -1.5
-    beta_quality: float = 0.3
-
-class LogitExecutionModel:
-    """Discrete choice logit model for purchase probability.
-
-    Utility: U = beta_0 + beta_price * (price/ref) + beta_quality * quality
-    P(buy) = sigmoid(U)
-
-    Provides a theoretically grounded demand model from economics literature.
-    """
-
-    def __init__(self, cfg: LogitConfig | None = None):
-        self.cfg = cfg or LogitConfig()
-
-    def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet,
-             market: MarketState | None, rng: np.random.Generator) -> float:
-        idx = int(opp.instrument_id)
-        price = quote.prices[idx]
-        ref = instruments.refs[idx]
-        quality = instruments.instruments[idx].attrs.get('quality', 0.5)
-
-        # utility
-        u = self.cfg.beta_0 + self.cfg.beta_price * (price / ref) + self.cfg.beta_quality * quality
-
-        # choice probability via sigmoid
-        return float(sigmoid(u))
-
-    def uncensor(self, fills: np.ndarray, instruments: InstrumentSet,
-                 context: dict[str, Any] | None = None) -> np.ndarray:
-        return fills / (self.cfg.beta_0 + EPS)
diff --git a/lab/run_example.py b/lab/run_example.py
deleted file mode 100644
index ebe0f18..0000000
--- a/lab/run_example.py
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/usr/bin/env python
-"""Example script demonstrating the Quote-Control platform"""
-import sys
-from pathlib import Path
-sys.path.insert(0, str(Path(__file__).parent.parent))
-
-import numpy as np
-from lab.config import make_retail_platform, make_market_making_platform
-from lab.experiments.eval import (rollout, compare_policies, fixed_price_policy,
-                                   cost_plus_margin_policy, random_walk_policy)
-
-def demo_retail():
-    print("=" * 60)
-    print("RETAIL DYNAMIC PRICING DEMO")
-    print("=" * 60)
-
-    platform = make_retail_platform()
-    print(f"Instruments: {platform.instruments.n}")
-    print(f"Reference prices: {platform.instruments.refs[:5].round(2)}...")
-
-    # compare policies
-    policies = {
-        'fixed': fixed_price_policy(platform.instruments.refs),
-        'cost_plus_30%': cost_plus_margin_policy(platform.instruments.costs, 0.3),
-        'cost_plus_50%': cost_plus_margin_policy(platform.instruments.costs, 0.5),
-        'random_walk': random_walk_policy(platform.instruments.refs, 0.03),
-    }
-
-    results = compare_policies(platform, policies, n_steps=100, n_runs=3)
-
-    print("\nPolicy Comparison (100 steps, 3 runs):")
-    print("-" * 50)
-    for name, r in sorted(results.items(), key=lambda x: -x[1]['mean_pnl']):
-        print(f"{name:20s} PnL={r['mean_pnl']:8.1f} +/- {r['std_reward']:6.1f}  "
-              f"conv={r['mean_conversion']:.3f}")
-
-def demo_market_making():
-    print("\n" + "=" * 60)
-    print("MARKET MAKING DEMO")
-    print("=" * 60)
-
-    platform = make_market_making_platform()
-    print(f"Instruments: {platform.instruments.n}")
-    print(f"Initial mids: {platform.instruments.refs.round(2)}")
-
-    # simple policy: quote at mid with fixed spread
-    def mm_policy(obs: np.ndarray, t: int):
-        mids = platform.instruments.refs  # would use obs in real policy
-        return mids, 1.0
-
-    result = rollout(platform, mm_policy, n_steps=200, seed=42)
-    print(f"\nRollout (200 steps):")
-    print(f"  Total PnL: {result.total_pnl:.2f}")
-    print(f"  Avg conversion: {result.avg_conversion:.3f}")
-    print(f"  Total spread capture: {sum(m.spread_capture for m in result.metrics):.2f}")
-
-if __name__ == '__main__':
-    demo_retail()
-    demo_market_making()

From 4c7d9362af4023904e6b10941e9a4675c956ecb5 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Fri, 30 Jan 2026 13:55:22 +0100
Subject: [PATCH 63/99] chore: envs for e2e

---
 tests/e2e/.env.example | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 tests/e2e/.env.example

diff --git a/tests/e2e/.env.example b/tests/e2e/.env.example
new file mode 100644
index 0000000..9e5dee5
--- /dev/null
+++ b/tests/e2e/.env.example
@@ -0,0 +1,7 @@
+WEB_URL=http://localhost:3000
+BACKEND_URL=http://localhost:5000
+PRICING_PROVIDER_URL=http://localhost:5001
+AIRFLOW_URL=http://localhost:8085
+AIRFLOW_USER=admin
+AIRFLOW_PASS=admin
+HEADLESS=true

From 26abff586499ad368fa4d27f6b76fb439e7466c3 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Fri, 30 Jan 2026 13:57:40 +0100
Subject: [PATCH 64/99] chore: fixing tests with seed determinism

---
 experiments/procesing/tests/test_demand.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/experiments/procesing/tests/test_demand.py b/experiments/procesing/tests/test_demand.py
index 18dce5d..d964da2 100644
--- a/experiments/procesing/tests/test_demand.py
+++ b/experiments/procesing/tests/test_demand.py
@@ -6,6 +6,7 @@ from procesing.steps import (
 )
 
 def test_compute_demand(pipeline_context):
+    random.seed(42)  # deterministic test
     step = ComputeDemandStep(context=pipeline_context)
 
     # Test with normal interaction data
@@ -26,6 +27,7 @@ def test_compute_demand(pipeline_context):
 
 
 def test_compute_demand_skewed(pipeline_context):
+    random.seed(42)  # deterministic test
     step = ComputeDemandStep(context=pipeline_context)
 
     # Test with normal interaction data

From 20132c084c0056de8a1d5f4019138cc2eba1829c Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Sun, 14 Dec 2025 17:30:01 +0100
Subject: [PATCH 65/99] initial environment definitions

---
 sim/rl/environment.py | 471 +++++-------------------------------------
 1 file changed, 50 insertions(+), 421 deletions(-)

diff --git a/sim/rl/environment.py b/sim/rl/environment.py
index 19f9ad4..803a4fd 100644
--- a/sim/rl/environment.py
+++ b/sim/rl/environment.py
@@ -2,450 +2,79 @@ import gymnasium as gym
 from gymnasium import spaces
 import numpy as np
 from dataclasses import dataclass
-import pandas as pd
-from typing import Callable, Optional, Dict, Any, List
 
-# "learner"  agent learning to optimize pricing
-# "agent"  part of environment creating demand signals that learner processes
+# here when we say "learner" we mean the agent that is learning to optimize the pricing and "agent" is part of the envrionment where the agent is creating demand that that "learner" is processing"
 
 @dataclass
 class BusinessLogicConstraints():
-    max_price_adjustment: float = 0.30
-    system_max_price: float = 500.0
-    system_min_price: float = 1.0
-    product_catelogue_size: int = 100
-    episode_length: int = 200
-    sessions_per_step: int = 250
-    agent_share: float = 0.25
-    agent_recon_multiplier: float = 6.0
-    agent_purchase_probability: float = 0.20
-    coi_strength: float = 0.25
-    coi_threshold: float = 4.0
-    coi_sigmoid_temp: float = 1.25
-    base_human_demand: float = 0.08
-    base_agent_demand: float = 0.05
-    human_price_elasticity: float = -1.2
-    agent_price_elasticity: float = -0.6
-    w_agent_loss: float = 1.0
-    w_volatility: float = 5.0
-    w_estimation_error: float = 0.25
-    seed: int = 7
-
-
-def _sigmoid(x: np.ndarray) -> np.ndarray:
-    return 1.0 / (1.0 + np.exp(-x))
-
-
-def simple_agent_detector(session_df: pd.DataFrame) -> pd.Series:
-    # baseline heuristic: high velocity + low conversion
-    v = session_df.get("interaction_velocity", pd.Series(0.0, index=session_df.index))
-    cr = session_df.get("conversion_rate", pd.Series(0.0, index=session_df.index))
-    total = session_df.get("total_interactions", pd.Series(0, index=session_df.index))
-    return (total >= 12) & (v >= 0.20) & (cr <= 0.01)
-
-
-class CommercePlatform:
-    def __init__(self, product_catelogue_size: int, max_price: float, min_price: float,
-                 constraints: BusinessLogicConstraints, agent_detector: Optional[Callable[[pd.DataFrame], pd.Series]] = None,
-                 use_defense: bool = False):
-        self.product_catelogue_size = product_catelogue_size
-        self.max_price = max_price
-        self.min_price = min_price
-        self.constraints = constraints
-        self.use_defense = use_defense
-        self.agent_detector = agent_detector
-        self.simulation_history: List[Dict[str, Any]] = []
-        self._rng = np.random.default_rng(constraints.seed)
-        self._popularity = self._rng.lognormal(mean=0.0, sigma=0.6, size=self.product_catelogue_size)
-        self._popularity = self._popularity / (self._popularity.mean() + 1e-12)
-        self._last_interaction_df: pd.DataFrame = pd.DataFrame()
-
-    def setup_true_demand(self, prices: np.ndarray) -> Dict[str, np.ndarray]:
-        # ground truth purchase propensities
-        p = np.clip(prices, self.min_price, self.max_price)
-        pn = p / self.max_price
-        human_prob = self.constraints.base_human_demand * (pn ** self.constraints.human_price_elasticity)
-        agent_prob = self.constraints.base_agent_demand * (pn ** self.constraints.agent_price_elasticity)
-        return {
-            "human_purchase_prob": np.clip(human_prob * self._popularity, 0.0, 0.95),
-            "agent_purchase_prob": np.clip(agent_prob * self._popularity, 0.0, 0.95)
-        }
-
-    def _session_markup_multiplier(self, signal_score: float) -> float:
-        # session-based COI markup based on demand signal expression
-        x = (signal_score - self.constraints.coi_threshold) / max(self.constraints.coi_sigmoid_temp, 1e-6)
-        return 1.0 + self.constraints.coi_strength * float(_sigmoid(np.array([x]))[0])
-
-    def _simulate_sessions(self, base_prices: np.ndarray) -> pd.DataFrame:
-        demand = self.setup_true_demand(base_prices)
-        human_pprob = demand["human_purchase_prob"]
-        agent_pprob = demand["agent_purchase_prob"]
-        events: List[Dict[str, Any]] = []
-        T = self.constraints.sessions_per_step
-        n_agent_sessions = int(round(T * self.constraints.agent_share))
-        n_human_sessions = T - n_agent_sessions
-
-        # human sessions: normal browse with possible purchase
-        for s in range(n_human_sessions):
-            session_id = f"h_{len(events)}_{s}"
-            k = int(self._rng.integers(1, 4))
-            prod_ids = self._rng.choice(self.product_catelogue_size, size=k, replace=False)
-            t = 0.0
-            inter_times = self._rng.gamma(shape=2.0, scale=3.0, size=3 * k)
-            signal_score = 0.0
-            purchased_any = False
-
-            for i, pid in enumerate(prod_ids):
-                t += float(inter_times[i])
-                price_shown = float(base_prices[pid])
-                events.append({
-                    "session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
-                    "action": "view", "t": t, "price_shown": price_shown, "is_purchase": 0,
-                    "price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
-                })
-                signal_score += 1.0
-
-                if self._rng.random() < 0.35:
-                    t += float(inter_times[i + k])
-                    events.append({
-                        "session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
-                        "action": "cart", "t": t, "price_shown": price_shown, "is_purchase": 0,
-                        "price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
-                    })
-                    signal_score += 2.0
-
-                if (not purchased_any) and (self._rng.random() < float(human_pprob[pid])):
-                    t += float(inter_times[i + 2 * k])
-                    mult = self._session_markup_multiplier(signal_score)
-                    price_paid = float(np.clip(base_prices[pid] * mult, self.min_price, self.max_price))
-                    events.append({
-                        "session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
-                        "action": "purchase", "t": t, "price_shown": float(base_prices[pid]), "is_purchase": 1,
-                        "price_paid": price_paid, "oracle_price_paid": price_paid, "signal_score": signal_score,
-                    })
-                    purchased_any = True
-
-        # agent sessions: split recon/purchase to circumvent COI
-        n_agent_ids = max(1, n_agent_sessions // 2)
-        for a in range(n_agent_ids):
-            agent_id = f"a_{a}"
-            recon_session_id = f"{agent_id}_recon"
-            t = 0.0
-            n_views = int(self._rng.poisson(lam=8) * self.constraints.agent_recon_multiplier) + 5
-            inter_times = self._rng.gamma(shape=2.0, scale=0.6, size=max(n_views, 1))
-            prod_ids = self._rng.integers(0, self.product_catelogue_size, size=n_views)
-            recon_signal = 0.0
-
-            for i, pid in enumerate(prod_ids):
-                t += float(inter_times[i])
-                events.append({
-                    "session_id": recon_session_id, "actor": "agent", "agent_id": agent_id, "product_id": int(pid),
-                    "action": "view", "t": t, "price_shown": float(base_prices[pid]), "is_purchase": 0,
-                    "price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
-                })
-                recon_signal += 1.0
-
-            # clean purchase session with minimal interactions
-            if self._rng.random() < self.constraints.agent_purchase_probability:
-                purchase_session_id = f"{agent_id}_clean"
-                pid = int(self._rng.integers(0, self.product_catelogue_size))
-                t2 = 0.0
-                clean_signal = 0.0
-                t2 += float(self._rng.gamma(shape=2.0, scale=0.7))
-                events.append({
-                    "session_id": purchase_session_id, "actor": "agent", "agent_id": agent_id, "product_id": pid,
-                    "action": "view", "t": t2, "price_shown": float(base_prices[pid]), "is_purchase": 0,
-                    "price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
-                })
-                clean_signal += 1.0
-
-                if self._rng.random() < float(agent_pprob[pid]):
-                    t2 += float(self._rng.gamma(shape=2.0, scale=0.7))
-                    obs_mult = self._session_markup_multiplier(clean_signal)
-                    obs_paid = float(np.clip(base_prices[pid] * obs_mult, self.min_price, self.max_price))
-                    oracle_mult = self._session_markup_multiplier(recon_signal)  # oracle links recon->purchase
-                    oracle_paid = float(np.clip(base_prices[pid] * oracle_mult, self.min_price, self.max_price))
-                    events.append({
-                        "session_id": purchase_session_id, "actor": "agent", "agent_id": agent_id, "product_id": pid,
-                        "action": "purchase", "t": t2, "price_shown": float(base_prices[pid]), "is_purchase": 1,
-                        "price_paid": obs_paid, "oracle_price_paid": oracle_paid, "signal_score": clean_signal,
-                    })
-
-        return pd.DataFrame(events)
-
-    def compute_interaction_features(self, interaction_df: pd.DataFrame) -> Dict[str, float]:
-        if interaction_df.empty:
-            return {"mean_sale_price": 0.0, "look_to_book": 0.0}
-        purchases = interaction_df[interaction_df["action"] == "purchase"]
-        mean_sale_price = float(purchases["price_paid"].mean()) if not purchases.empty else 0.0
-        views = float((interaction_df["action"] == "view").sum())
-        buys = float((interaction_df["action"] == "purchase").sum())
-        return {"mean_sale_price": mean_sale_price, "look_to_book": float(views / (buys + 1e-6))}
-
-    def _session_feature_table(self, df: pd.DataFrame) -> pd.DataFrame:
-        if df.empty:
-            return pd.DataFrame()
-        g = df.groupby("session_id", sort=False)
-        session_duration = g["t"].max() - g["t"].min()
-        total_interactions = g.size()
-        avg_time_between = g["t"].apply(lambda x: float(np.diff(np.sort(x.to_numpy())).mean()) if len(x) > 1 else 0.0)
-        interaction_velocity = total_interactions / (session_duration + 1e-6)
-        views = g.apply(lambda x: int((x["action"] == "view").sum()), include_groups=False)
-        cart_adds = g.apply(lambda x: int((x["action"] == "cart").sum()), include_groups=False)
-        purchases = g.apply(lambda x: int((x["action"] == "purchase").sum()), include_groups=False)
-        conversion_rate = purchases / (views + 1e-6)
-        is_agent = g["actor"].apply(lambda s: bool((s == "agent").any()), include_groups=False)
-
-        return pd.DataFrame({
-            "session_duration_sec": session_duration.astype(float),
-            "avg_time_between_events": avg_time_between.astype(float),
-            "total_interactions": total_interactions.astype(int),
-            "interaction_velocity": interaction_velocity.astype(float),
-            "item_views": views.astype(int),
-            "cart_adds": cart_adds.astype(int),
-            "purchases": purchases.astype(int),
-            "conversion_rate": conversion_rate.astype(float),
-            "is_agent": is_agent.astype(bool),
-        }).reset_index()
-
-    def demand_estimate(self, interaction_df: pd.DataFrame, exclude_sessions: Optional[pd.Series] = None) -> np.ndarray:
-        # proxy demand from weighted interaction events
-        if interaction_df.empty:
-            return np.zeros(self.product_catelogue_size, dtype=np.float32)
-        df = interaction_df
-        if exclude_sessions is not None:
-            bad_sessions = set(exclude_sessions.loc[exclude_sessions].index)
-            df = df[~df["session_id"].isin(bad_sessions)]
-        weights = {"view": 0.15, "cart": 0.75, "purchase": 2.5}
-        w = df["action"].map(weights).fillna(0.0).to_numpy(dtype=float)
-        prod = df["product_id"].to_numpy(dtype=int)
-        q_hat = np.zeros(self.product_catelogue_size, dtype=float)
-        np.add.at(q_hat, prod, w)
-        return q_hat.astype(np.float32)
-
-    def run_pricing_simulation(self, prices: np.ndarray) -> Dict[str, Any]:
-        interaction_df = self._simulate_sessions(prices)
-        self._last_interaction_df = interaction_df
-        session_df = self._session_feature_table(interaction_df)
-
-        predicted_agent_sessions = None
-        if (self.use_defense and self.agent_detector is not None and not session_df.empty):
-            predicted_agent_sessions = self.agent_detector(session_df.set_index("session_id"))
-
-        q_hat_naive = self.demand_estimate(interaction_df, exclude_sessions=None)
-        q_hat_defended = self.demand_estimate(interaction_df, exclude_sessions=predicted_agent_sessions) \
-            if predicted_agent_sessions is not None else q_hat_naive.copy()
-
-        true_human = np.zeros(self.product_catelogue_size, dtype=float)
-        true_agent = np.zeros(self.product_catelogue_size, dtype=float)
-        if not interaction_df.empty:
-            purchases = interaction_df[interaction_df["action"] == "purchase"]
-            if not purchases.empty:
-                for _, r in purchases.iterrows():
-                    if r["actor"] == "human":
-                        true_human[int(r["product_id"])] += 1.0
-                    else:
-                        true_agent[int(r["product_id"])] += 1.0
-
-        revenue_observed = float(interaction_df["price_paid"].sum()) if not interaction_df.empty else 0.0
-        revenue_oracle = float(interaction_df["oracle_price_paid"].sum()) if not interaction_df.empty else 0.0
-        agent_loss = max(0.0, revenue_oracle - revenue_observed)
-
-        eps = 1e-6
-        internal_error_naive = np.abs(true_human - q_hat_naive) / (true_human + eps)
-        internal_error_def = np.abs(true_human - q_hat_defended) / (true_human + eps)
-        interaction_features = self.compute_interaction_features(interaction_df)
-
-        summary = {
-            "prices": prices.copy(),
-            "interaction_df": interaction_df,
-            "session_df": session_df,
-            "q_hat_naive": q_hat_naive,
-            "q_hat_defended": q_hat_defended,
-            "true_human_demand": true_human.astype(np.float32),
-            "true_agent_purchases": true_agent.astype(np.float32),
-            "internal_error_naive": internal_error_naive.astype(np.float32),
-            "internal_error_defended": internal_error_def.astype(np.float32),
-            "interaction_features": interaction_features,
-            "revenue_observed": revenue_observed,
-            "revenue_oracle": revenue_oracle,
-            "agent_loss": agent_loss,
-            "predicted_agent_sessions": predicted_agent_sessions,
-        }
-        self.simulation_history.append(summary)
-        return summary
-
-    def get_interaction_data(self) -> np.ndarray:
-        if self._last_interaction_df.empty:
-            return np.array([], dtype=object)
-        return self._last_interaction_df.to_dict(orient="records")
+    max_price_adjustment : float = 0.3 # maximum adjustment of price
+    system_max_price : float = 500.0 # maximum price allowed in the system
+    product_catelogue_size : int = 100 # number of products in the catalogue
 
 
 class PHANTOMEnv(gym.Env):
-    metadata = {"render_modes": []}
-
-    def __init__(self, use_defense: bool = False):
-        super().__init__()
+    def __init__(self):
+        super(PHANTOMEnv, self).__init__()
         self.constraints = BusinessLogicConstraints()
-        self.action_space = spaces.Box(low=-self.constraints.max_price_adjustment,
-                                       high=self.constraints.max_price_adjustment,
-                                       shape=(self.constraints.product_catelogue_size,), dtype=np.float32)
+        self.action_space = spaces.Box(
+            low=-self.constraints.max_price_adjustment, high=self.constraints.max_price_adjustment,
+            shape=(1,), dtype=np.float32) #  we allow teh learner to adjust price by some BusinessLogicConstraints factor
+        # Example for using image as input:
         self.observation_space = spaces.Dict({
-            "elasticity": spaces.Dict({
-                "price": spaces.Box(
-                    low=np.full((self.constraints.product_catelogue_size,), self.constraints.system_min_price, dtype=np.float32),
-                    high=np.full((self.constraints.product_catelogue_size,), self.constraints.system_max_price, dtype=np.float32),
-                    dtype=np.float32),
-                "demand": spaces.Box(
-                    low=np.zeros((self.constraints.product_catelogue_size,), dtype=np.float32),
-                    high=np.full((self.constraints.product_catelogue_size,), 1e6, dtype=np.float32),
-                    dtype=np.float32),
+            'elasticity': spaces.Dict({
+                'price': spaces.Box(low=0, high=self.constraints.system_max_price,
+                                    shape=(self.constraints.product_catelogue_size,), dtype=np.float32),
+                'demand': spaces.Box(low=0, high=np.inf,
+                                     shape=(self.constraints.product_catelogue_size,), dtype=np.float32)
             })
         })
-        self.commerce_platform = CommercePlatform(
-            product_catelogue_size=self.constraints.product_catelogue_size,
-            max_price=self.constraints.system_max_price,
-            min_price=self.constraints.system_min_price,
-            constraints=self.constraints,
-            agent_detector=simple_agent_detector,
-            use_defense=use_defense)
-        self._rng = np.random.default_rng(self.constraints.seed)
-        self.t = 0
-        self._prev_prices: Optional[np.ndarray] = None
-        self.state: Dict[str, Any] = {}
 
-    def reset(self, seed: Optional[int] = None, options: Optional[dict] = None):
+    def reset(self, seed=None, options=None):
         super().reset(seed=seed)
-        if seed is not None:
-            self._rng = np.random.default_rng(seed)
-            self.commerce_platform._rng = np.random.default_rng(seed)
-        self.t = 0
-        init_prices = self._rng.uniform(low=60.0, high=140.0, size=(self.constraints.product_catelogue_size,)).astype(np.float32)
-        self._prev_prices = init_prices.copy()
+        # Initialize state
         self.state = {
-            "elasticity": {
-                "price": init_prices,
-                "demand": np.zeros((self.constraints.product_catelogue_size,), dtype=np.float32),
-            }
+            'price': 100.0,  # base price
+            'demand': 0.0
         }
         return self.state, {}
 
-    def step(self, action: np.ndarray):
-        self.t += 1
-        base_prices = self.state["elasticity"]["price"].astype(np.float32)
-        new_prices = np.clip(base_prices * (1.0 + action.astype(np.float32)),
-                           self.constraints.system_min_price,
-                           self.constraints.system_max_price).astype(np.float32)
-        result = self.commerce_platform.run_pricing_simulation(new_prices)
+    def step(self, action):
+        # Apply action
+        price_adjustment = action[0]
+        new_price = self.state['price'] * (1 + price_adjustment)
+        self.state['price'] = new_price
 
-        if self.commerce_platform.use_defense:
-            demand_est = result["q_hat_defended"]
-            internal_err = result["internal_error_defended"]
-        else:
-            demand_est = result["q_hat_naive"]
-            internal_err = result["internal_error_naive"]
+        # Simulate demand based on new price
+        demand = self.simulate_demand(new_price)
+        self.state['demand'] = demand
 
-        self.state["elasticity"]["price"] = new_prices
-        self.state["elasticity"]["demand"] = demand_est
+        # Calculate reward (e.g., revenue)
+        reward = new_price * demand
 
-        volatility = 0.0 if self._prev_prices is None else \
-            float(np.mean(np.abs((new_prices - self._prev_prices) / (self._prev_prices + 1e-6))))
-        self._prev_prices = new_prices.copy()
+        # Check if episode is done
+        done = self.state['price'] <= 0.0 or self.state['demand'] <= 0.0
 
-        revenue_observed = float(result["revenue_observed"])
-        agent_loss = float(result["agent_loss"])
-        err_mean = float(np.mean(internal_err))
-
-        reward = (revenue_observed
-                 - self.constraints.w_agent_loss * agent_loss
-                 - self.constraints.w_volatility * volatility
-                 - self.constraints.w_estimation_error * err_mean)
-
-        terminated = self.t >= self.constraints.episode_length
-        info = {
-            "t": self.t,
-            "revenue_observed": revenue_observed,
-            "revenue_oracle": float(result["revenue_oracle"]),
-            "agent_loss": agent_loss,
-            "ux_volatility": volatility,
-            "mean_internal_error": err_mean,
-            "look_to_book": float(result["interaction_features"].get("look_to_book", 0.0)),
-            "mean_sale_price": float(result["interaction_features"].get("mean_sale_price", 0.0)),
-            "true_human_purchases_total": float(np.sum(result["true_human_demand"])),
-            "true_agent_purchases_total": float(np.sum(result["true_agent_purchases"])),
-        }
-        return self.state, float(reward), terminated, False, info
 
+        return self.state, reward, done, False, {}
+    def simulate_demand(self, price):
+        # Simple linear demand model: demand decreases as price increases
+        base_demand = 200
+        price_sensitivity = 0.5
+        demand = max(0, base_demand - price_sensitivity * price)
+        return demand
 
 if __name__ == "__main__":
-    import matplotlib.pyplot as plt
-    from collections import defaultdict
+    env = PHANTOMEnv()
+    obs, _ = env.reset()
+    done = False
+    total_reward = 0
 
-    runs = {}
-    for use_defense in (False, True):
-        env = PHANTOMEnv(use_defense=use_defense)
-        obs, _ = env.reset(seed=42)
-        metrics = defaultdict(list)
-        total_reward = 0.0
-        done = False
+    while not done:
+        action = env.action_space.sample()  # Random action
+        obs, reward, done, _, _ = env.step(action)
+        total_reward += reward
+        print(f"Price: {obs['price']:.2f}, Demand: {obs['demand']:.2f}, Reward: {reward:.2f}")
+        if done:
+            break
 
-        while not done:
-            action = env.action_space.sample()
-            obs, reward, done, _, info = env.step(action)
-            total_reward += reward
-            p_mean = float(np.mean(obs["elasticity"]["price"]))
-            q_mean = float(np.mean(obs["elasticity"]["demand"]))
-            p_std = float(np.std(obs["elasticity"]["price"]))
-
-            metrics['t'].append(info['t'])
-            metrics['price_mean'].append(p_mean)
-            metrics['price_std'].append(p_std)
-            metrics['demand_mean'].append(q_mean)
-            metrics['revenue_observed'].append(info['revenue_observed'])
-            metrics['revenue_oracle'].append(info['revenue_oracle'])
-            metrics['agent_loss'].append(info['agent_loss'])
-            metrics['ux_volatility'].append(info['ux_volatility'])
-            metrics['look_to_book'].append(info['look_to_book'])
-            metrics['reward'].append(reward)
-            metrics['human_purchases'].append(info['true_human_purchases_total'])
-            metrics['agent_purchases'].append(info['true_agent_purchases_total'])
-
-            if info['t'] % 20 == 0 or done:
-                print(f"defense={'ON ' if use_defense else 'OFF'} t={info['t']:03d} p={p_mean:6.2f}±{p_std:4.2f} "
-                      f"q={q_mean:6.2f} rev={info['revenue_observed']:7.2f} oracle={info['revenue_oracle']:7.2f} "
-                      f"loss={info['agent_loss']:6.2f} ux={info['ux_volatility']:.3f} "
-                      f"ltb={info['look_to_book']:5.2f} r={reward:7.2f}")
-
-        runs[use_defense] = metrics
-        print(f"defense={'ON ' if use_defense else 'OFF'} total_reward={total_reward:.2f}\n")
-
-    fig, axes = plt.subplots(3, 3, figsize=(15, 12))
-    fig.suptitle('PHANTOM Environment: Defense OFF vs ON', fontsize=14, fontweight='bold')
-
-    plot_configs = [
-        ('price_mean', 'Mean Price', 'Price'),
-        ('demand_mean', 'Mean Demand Estimate', 'Demand'),
-        ('revenue_observed', 'Revenue (Observed)', 'Revenue'),
-        ('agent_loss', 'Agent Loss (Oracle - Observed)', 'Loss'),
-        ('ux_volatility', 'UX Volatility (Price Change)', 'Volatility'),
-        ('look_to_book', 'Look-to-Book Ratio', 'Ratio'),
-        ('reward', 'Step Reward', 'Reward'),
-        ('human_purchases', 'Human Purchases', 'Count'),
-        ('agent_purchases', 'Agent Purchases', 'Count'),
-    ]
-
-    for idx, (key, title, ylabel) in enumerate(plot_configs):
-        ax = axes[idx // 3, idx % 3]
-        for use_defense, label, color in [(False, 'No Defense', 'red'), (True, 'With Defense', 'blue')]:
-            m = runs[use_defense]
-            ax.plot(m['t'], m[key], label=label, color=color, alpha=0.7, linewidth=1.5)
-        ax.set_xlabel('Step')
-        ax.set_ylabel(ylabel)
-        ax.set_title(title, fontsize=10, fontweight='bold')
-        ax.legend(loc='best', fontsize=8)
-        ax.grid(True, alpha=0.3)
-
-    plt.tight_layout()
-    plt.savefig('phantom_env_comparison.png', dpi=150, bbox_inches='tight')
-    print("Plot saved to phantom_env_comparison.png")
-    plt.show()
+    print(f"Total Reward: {total_reward:.2f}")

From 7d09232e48072598e7bf7bab46749fba47f00720 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Sun, 14 Dec 2025 17:53:48 +0100
Subject: [PATCH 66/99] high level definition

---
 sim/rl/environment.py | 94 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 83 insertions(+), 11 deletions(-)

diff --git a/sim/rl/environment.py b/sim/rl/environment.py
index 803a4fd..a09438f 100644
--- a/sim/rl/environment.py
+++ b/sim/rl/environment.py
@@ -2,6 +2,7 @@ import gymnasium as gym
 from gymnasium import spaces
 import numpy as np
 from dataclasses import dataclass
+import pandas as pd
 
 # here when we say "learner" we mean the agent that is learning to optimize the pricing and "agent" is part of the envrionment where the agent is creating demand that that "learner" is processing"
 
@@ -9,17 +10,89 @@ from dataclasses import dataclass
 class BusinessLogicConstraints():
     max_price_adjustment : float = 0.3 # maximum adjustment of price
     system_max_price : float = 500.0 # maximum price allowed in the system
+    system_min_price : float = 1.0 # minimum price allowed in the system
     product_catelogue_size : int = 100 # number of products in the catalogue
 
 
+class CommercePlatform:
+    def __init__(self, product_catelogue_size: int, max_price: float, min_price: float):
+        self.product_catelogue_size = product_catelogue_size
+        self.max_price = max_price
+        self.min_price = min_price
+        self.simulation_history = []
+
+
+    def setup_true_demand(self,prices: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+        human_price_elasticity = -1.5  # Example elasticity value
+        base_demand = 100  # Base demand for products
+        demand = base_demand * (prices / self.max_price) ** human_price_elasticity
+
+        agent_price_elasticity = -2.0  # Example elasticity value for agents
+        agent_base_demand = 150  # Base demand for agents
+        agent_demand = agent_base_demand * (prices / self.max_price) ** agent_price_elasticity
+
+        return demand + agent_demand, agent_demand
+
+
+    def compute_interaction_features(self, interaction_data: np.ndarray) -> dict:
+        df = pd.DataFrame(interaction_data)
+        return {
+            'mean_sale_price': df[df['action'] == 'purchase']['price'].mean(),
+        }
+
+    def run_pricing_simulation(self, prices: np.ndarray) -> np.ndarray:
+        # Simulate demand based on prices
+
+        observed_demand, demand_from_agents = self.setup_true_demand(prices)
+        true_demand = observed_demand - demand_from_agents
+
+        interaction_data = self.get_interaction_data()
+        interaction_features = self.compute_interaction_features(interaction_data)
+        demand_estimates = self.demand_estimate(interaction_data)
+        internal_error = np.abs(true_demand - demand_estimates) / (true_demand + 1e-6)
+
+        self.simulation_history.append(
+            {
+                'prices': prices,
+                'true_demand': true_demand,
+                'demand_estimates': demand_estimates,
+                'internal_error': internal_error,
+                'interaction_data': interaction_data,
+                'interaction_features': interaction_features
+            })
+        return np.array(interaction_data)
+
+    def get_interaction_data(self) -> np.ndarray:
+        # Simulate interaction data
+        interaction_data = []
+        return np.array(interaction_data)
+
+
+    def demand_estimate(self, interactions : np.ndarray) -> np.ndarray:
+        demand_estimates = np.random.rand(self.product_catelogue_size) * 100  # Dummy demand estimates
+        return demand_estimates
+
+
+
+
+
+
+
+
+
 class PHANTOMEnv(gym.Env):
     def __init__(self):
         super(PHANTOMEnv, self).__init__()
         self.constraints = BusinessLogicConstraints()
         self.action_space = spaces.Box(
             low=-self.constraints.max_price_adjustment, high=self.constraints.max_price_adjustment,
-            shape=(1,), dtype=np.float32) #  we allow teh learner to adjust price by some BusinessLogicConstraints factor
+            shape=(self.constraints.product_catelogue_size,), dtype=np.float32) #  we allow teh learner to adjust price by some BusinessLogicConstraints factor
         # Example for using image as input:
+        self.commerce_platform = CommercePlatform(
+            product_catelogue_size=self.constraints.product_catelogue_size,
+            max_price=self.constraints.system_max_price,
+            min_price=self.constraints.system_min_price
+        )
         self.observation_space = spaces.Dict({
             'elasticity': spaces.Dict({
                 'price': spaces.Box(low=0, high=self.constraints.system_max_price,
@@ -29,24 +102,23 @@ class PHANTOMEnv(gym.Env):
             })
         })
 
-    def reset(self, seed=None, options=None):
+    def reset(self, seed :int, options) -> tuple[dict, dict]:
         super().reset(seed=seed)
         # Initialize state
         self.state = {
-            'price': 100.0,  # base price
-            'demand': 0.0
+            'elasticity': {
+                'price': np.full((self.constraints.product_catelogue_size,), 100.0, dtype=np.float32),
+                'demand': np.full((self.constraints.product_catelogue_size,), 50.0, dtype=np.float32)
+            }
         }
         return self.state, {}
 
     def step(self, action):
-        # Apply action
-        price_adjustment = action[0]
-        new_price = self.state['price'] * (1 + price_adjustment)
-        self.state['price'] = new_price
+        self.state['price'] = np.clip(self.state['price'] * (1 + action),
+                            self.constraints.system_min_price,
+                            self.constraints.system_max_price)
+
 
-        # Simulate demand based on new price
-        demand = self.simulate_demand(new_price)
-        self.state['demand'] = demand
 
         # Calculate reward (e.g., revenue)
         reward = new_price * demand

From 8a084584786bfbaff6b913ccf7d8af3c0804349d Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Sun, 14 Dec 2025 17:59:34 +0100
Subject: [PATCH 67/99] formulating the reward simply

---
 sim/rl/environment.py | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/sim/rl/environment.py b/sim/rl/environment.py
index a09438f..ca7159b 100644
--- a/sim/rl/environment.py
+++ b/sim/rl/environment.py
@@ -40,7 +40,7 @@ class CommercePlatform:
             'mean_sale_price': df[df['action'] == 'purchase']['price'].mean(),
         }
 
-    def run_pricing_simulation(self, prices: np.ndarray) -> np.ndarray:
+    def run_pricing_simulation(self, prices: np.ndarray) -> dict:
         # Simulate demand based on prices
 
         observed_demand, demand_from_agents = self.setup_true_demand(prices)
@@ -51,16 +51,17 @@ class CommercePlatform:
         demand_estimates = self.demand_estimate(interaction_data)
         internal_error = np.abs(true_demand - demand_estimates) / (true_demand + 1e-6)
 
-        self.simulation_history.append(
-            {
+
+        summary = {
                 'prices': prices,
                 'true_demand': true_demand,
                 'demand_estimates': demand_estimates,
                 'internal_error': internal_error,
                 'interaction_data': interaction_data,
                 'interaction_features': interaction_features
-            })
-        return np.array(interaction_data)
+            }
+        self.simulation_history.append(summary)
+        return summary
 
     def get_interaction_data(self) -> np.ndarray:
         # Simulate interaction data
@@ -118,10 +119,24 @@ class PHANTOMEnv(gym.Env):
                             self.constraints.system_min_price,
                             self.constraints.system_max_price)
 
+        result = self.commerce_platform.run_pricing_simulation(self.state['price'])
+        history = self.commerce_platform.simulation_history
+        self.state['demand'] = result['demand_estimates']
+
+
+
+        reward = sum(
+            self.state['price'] * self.state['demand'],
+            # performance historically, to take into account business kpi trends (using features from interaction data)
+            sum(
+                [-0.05 * i * history[-1]['internal_error'] for i in range(1, len(history))],
+            ) if len(history) > 1 else 0,
+            sum(
+                [0.1 * history[-1]['interaction_features']['mean_sale_price'] - 0.1 * history[i]['interaction_features']['mean_sale_price'] for i in range(len(history)-1)],
+            ) if len(history) > 1 else 0
+        )
 
 
-        # Calculate reward (e.g., revenue)
-        reward = new_price * demand
 
         # Check if episode is done
         done = self.state['price'] <= 0.0 or self.state['demand'] <= 0.0

From 201c98bcacd2420ed45c52a583383618b26d46dc Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Sun, 14 Dec 2025 18:59:02 +0100
Subject: [PATCH 68/99] improved implementation

---
 sim/rl/environment.py | 512 ++++++++++++++++++++++++++++++++----------
 1 file changed, 398 insertions(+), 114 deletions(-)

diff --git a/sim/rl/environment.py b/sim/rl/environment.py
index ca7159b..19f9ad4 100644
--- a/sim/rl/environment.py
+++ b/sim/rl/environment.py
@@ -3,165 +3,449 @@ from gymnasium import spaces
 import numpy as np
 from dataclasses import dataclass
 import pandas as pd
+from typing import Callable, Optional, Dict, Any, List
 
-# here when we say "learner" we mean the agent that is learning to optimize the pricing and "agent" is part of the envrionment where the agent is creating demand that that "learner" is processing"
+# "learner"  agent learning to optimize pricing
+# "agent"  part of environment creating demand signals that learner processes
 
 @dataclass
 class BusinessLogicConstraints():
-    max_price_adjustment : float = 0.3 # maximum adjustment of price
-    system_max_price : float = 500.0 # maximum price allowed in the system
-    system_min_price : float = 1.0 # minimum price allowed in the system
-    product_catelogue_size : int = 100 # number of products in the catalogue
+    max_price_adjustment: float = 0.30
+    system_max_price: float = 500.0
+    system_min_price: float = 1.0
+    product_catelogue_size: int = 100
+    episode_length: int = 200
+    sessions_per_step: int = 250
+    agent_share: float = 0.25
+    agent_recon_multiplier: float = 6.0
+    agent_purchase_probability: float = 0.20
+    coi_strength: float = 0.25
+    coi_threshold: float = 4.0
+    coi_sigmoid_temp: float = 1.25
+    base_human_demand: float = 0.08
+    base_agent_demand: float = 0.05
+    human_price_elasticity: float = -1.2
+    agent_price_elasticity: float = -0.6
+    w_agent_loss: float = 1.0
+    w_volatility: float = 5.0
+    w_estimation_error: float = 0.25
+    seed: int = 7
+
+
+def _sigmoid(x: np.ndarray) -> np.ndarray:
+    return 1.0 / (1.0 + np.exp(-x))
+
+
+def simple_agent_detector(session_df: pd.DataFrame) -> pd.Series:
+    # baseline heuristic: high velocity + low conversion
+    v = session_df.get("interaction_velocity", pd.Series(0.0, index=session_df.index))
+    cr = session_df.get("conversion_rate", pd.Series(0.0, index=session_df.index))
+    total = session_df.get("total_interactions", pd.Series(0, index=session_df.index))
+    return (total >= 12) & (v >= 0.20) & (cr <= 0.01)
 
 
 class CommercePlatform:
-    def __init__(self, product_catelogue_size: int, max_price: float, min_price: float):
+    def __init__(self, product_catelogue_size: int, max_price: float, min_price: float,
+                 constraints: BusinessLogicConstraints, agent_detector: Optional[Callable[[pd.DataFrame], pd.Series]] = None,
+                 use_defense: bool = False):
         self.product_catelogue_size = product_catelogue_size
         self.max_price = max_price
         self.min_price = min_price
-        self.simulation_history = []
+        self.constraints = constraints
+        self.use_defense = use_defense
+        self.agent_detector = agent_detector
+        self.simulation_history: List[Dict[str, Any]] = []
+        self._rng = np.random.default_rng(constraints.seed)
+        self._popularity = self._rng.lognormal(mean=0.0, sigma=0.6, size=self.product_catelogue_size)
+        self._popularity = self._popularity / (self._popularity.mean() + 1e-12)
+        self._last_interaction_df: pd.DataFrame = pd.DataFrame()
 
-
-    def setup_true_demand(self,prices: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
-        human_price_elasticity = -1.5  # Example elasticity value
-        base_demand = 100  # Base demand for products
-        demand = base_demand * (prices / self.max_price) ** human_price_elasticity
-
-        agent_price_elasticity = -2.0  # Example elasticity value for agents
-        agent_base_demand = 150  # Base demand for agents
-        agent_demand = agent_base_demand * (prices / self.max_price) ** agent_price_elasticity
-
-        return demand + agent_demand, agent_demand
-
-
-    def compute_interaction_features(self, interaction_data: np.ndarray) -> dict:
-        df = pd.DataFrame(interaction_data)
+    def setup_true_demand(self, prices: np.ndarray) -> Dict[str, np.ndarray]:
+        # ground truth purchase propensities
+        p = np.clip(prices, self.min_price, self.max_price)
+        pn = p / self.max_price
+        human_prob = self.constraints.base_human_demand * (pn ** self.constraints.human_price_elasticity)
+        agent_prob = self.constraints.base_agent_demand * (pn ** self.constraints.agent_price_elasticity)
         return {
-            'mean_sale_price': df[df['action'] == 'purchase']['price'].mean(),
+            "human_purchase_prob": np.clip(human_prob * self._popularity, 0.0, 0.95),
+            "agent_purchase_prob": np.clip(agent_prob * self._popularity, 0.0, 0.95)
         }
 
-    def run_pricing_simulation(self, prices: np.ndarray) -> dict:
-        # Simulate demand based on prices
+    def _session_markup_multiplier(self, signal_score: float) -> float:
+        # session-based COI markup based on demand signal expression
+        x = (signal_score - self.constraints.coi_threshold) / max(self.constraints.coi_sigmoid_temp, 1e-6)
+        return 1.0 + self.constraints.coi_strength * float(_sigmoid(np.array([x]))[0])
 
-        observed_demand, demand_from_agents = self.setup_true_demand(prices)
-        true_demand = observed_demand - demand_from_agents
+    def _simulate_sessions(self, base_prices: np.ndarray) -> pd.DataFrame:
+        demand = self.setup_true_demand(base_prices)
+        human_pprob = demand["human_purchase_prob"]
+        agent_pprob = demand["agent_purchase_prob"]
+        events: List[Dict[str, Any]] = []
+        T = self.constraints.sessions_per_step
+        n_agent_sessions = int(round(T * self.constraints.agent_share))
+        n_human_sessions = T - n_agent_sessions
 
-        interaction_data = self.get_interaction_data()
-        interaction_features = self.compute_interaction_features(interaction_data)
-        demand_estimates = self.demand_estimate(interaction_data)
-        internal_error = np.abs(true_demand - demand_estimates) / (true_demand + 1e-6)
+        # human sessions: normal browse with possible purchase
+        for s in range(n_human_sessions):
+            session_id = f"h_{len(events)}_{s}"
+            k = int(self._rng.integers(1, 4))
+            prod_ids = self._rng.choice(self.product_catelogue_size, size=k, replace=False)
+            t = 0.0
+            inter_times = self._rng.gamma(shape=2.0, scale=3.0, size=3 * k)
+            signal_score = 0.0
+            purchased_any = False
 
+            for i, pid in enumerate(prod_ids):
+                t += float(inter_times[i])
+                price_shown = float(base_prices[pid])
+                events.append({
+                    "session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
+                    "action": "view", "t": t, "price_shown": price_shown, "is_purchase": 0,
+                    "price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
+                })
+                signal_score += 1.0
+
+                if self._rng.random() < 0.35:
+                    t += float(inter_times[i + k])
+                    events.append({
+                        "session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
+                        "action": "cart", "t": t, "price_shown": price_shown, "is_purchase": 0,
+                        "price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
+                    })
+                    signal_score += 2.0
+
+                if (not purchased_any) and (self._rng.random() < float(human_pprob[pid])):
+                    t += float(inter_times[i + 2 * k])
+                    mult = self._session_markup_multiplier(signal_score)
+                    price_paid = float(np.clip(base_prices[pid] * mult, self.min_price, self.max_price))
+                    events.append({
+                        "session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
+                        "action": "purchase", "t": t, "price_shown": float(base_prices[pid]), "is_purchase": 1,
+                        "price_paid": price_paid, "oracle_price_paid": price_paid, "signal_score": signal_score,
+                    })
+                    purchased_any = True
+
+        # agent sessions: split recon/purchase to circumvent COI
+        n_agent_ids = max(1, n_agent_sessions // 2)
+        for a in range(n_agent_ids):
+            agent_id = f"a_{a}"
+            recon_session_id = f"{agent_id}_recon"
+            t = 0.0
+            n_views = int(self._rng.poisson(lam=8) * self.constraints.agent_recon_multiplier) + 5
+            inter_times = self._rng.gamma(shape=2.0, scale=0.6, size=max(n_views, 1))
+            prod_ids = self._rng.integers(0, self.product_catelogue_size, size=n_views)
+            recon_signal = 0.0
+
+            for i, pid in enumerate(prod_ids):
+                t += float(inter_times[i])
+                events.append({
+                    "session_id": recon_session_id, "actor": "agent", "agent_id": agent_id, "product_id": int(pid),
+                    "action": "view", "t": t, "price_shown": float(base_prices[pid]), "is_purchase": 0,
+                    "price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
+                })
+                recon_signal += 1.0
+
+            # clean purchase session with minimal interactions
+            if self._rng.random() < self.constraints.agent_purchase_probability:
+                purchase_session_id = f"{agent_id}_clean"
+                pid = int(self._rng.integers(0, self.product_catelogue_size))
+                t2 = 0.0
+                clean_signal = 0.0
+                t2 += float(self._rng.gamma(shape=2.0, scale=0.7))
+                events.append({
+                    "session_id": purchase_session_id, "actor": "agent", "agent_id": agent_id, "product_id": pid,
+                    "action": "view", "t": t2, "price_shown": float(base_prices[pid]), "is_purchase": 0,
+                    "price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
+                })
+                clean_signal += 1.0
+
+                if self._rng.random() < float(agent_pprob[pid]):
+                    t2 += float(self._rng.gamma(shape=2.0, scale=0.7))
+                    obs_mult = self._session_markup_multiplier(clean_signal)
+                    obs_paid = float(np.clip(base_prices[pid] * obs_mult, self.min_price, self.max_price))
+                    oracle_mult = self._session_markup_multiplier(recon_signal)  # oracle links recon->purchase
+                    oracle_paid = float(np.clip(base_prices[pid] * oracle_mult, self.min_price, self.max_price))
+                    events.append({
+                        "session_id": purchase_session_id, "actor": "agent", "agent_id": agent_id, "product_id": pid,
+                        "action": "purchase", "t": t2, "price_shown": float(base_prices[pid]), "is_purchase": 1,
+                        "price_paid": obs_paid, "oracle_price_paid": oracle_paid, "signal_score": clean_signal,
+                    })
+
+        return pd.DataFrame(events)
+
+    def compute_interaction_features(self, interaction_df: pd.DataFrame) -> Dict[str, float]:
+        if interaction_df.empty:
+            return {"mean_sale_price": 0.0, "look_to_book": 0.0}
+        purchases = interaction_df[interaction_df["action"] == "purchase"]
+        mean_sale_price = float(purchases["price_paid"].mean()) if not purchases.empty else 0.0
+        views = float((interaction_df["action"] == "view").sum())
+        buys = float((interaction_df["action"] == "purchase").sum())
+        return {"mean_sale_price": mean_sale_price, "look_to_book": float(views / (buys + 1e-6))}
+
+    def _session_feature_table(self, df: pd.DataFrame) -> pd.DataFrame:
+        if df.empty:
+            return pd.DataFrame()
+        g = df.groupby("session_id", sort=False)
+        session_duration = g["t"].max() - g["t"].min()
+        total_interactions = g.size()
+        avg_time_between = g["t"].apply(lambda x: float(np.diff(np.sort(x.to_numpy())).mean()) if len(x) > 1 else 0.0)
+        interaction_velocity = total_interactions / (session_duration + 1e-6)
+        views = g.apply(lambda x: int((x["action"] == "view").sum()), include_groups=False)
+        cart_adds = g.apply(lambda x: int((x["action"] == "cart").sum()), include_groups=False)
+        purchases = g.apply(lambda x: int((x["action"] == "purchase").sum()), include_groups=False)
+        conversion_rate = purchases / (views + 1e-6)
+        is_agent = g["actor"].apply(lambda s: bool((s == "agent").any()), include_groups=False)
+
+        return pd.DataFrame({
+            "session_duration_sec": session_duration.astype(float),
+            "avg_time_between_events": avg_time_between.astype(float),
+            "total_interactions": total_interactions.astype(int),
+            "interaction_velocity": interaction_velocity.astype(float),
+            "item_views": views.astype(int),
+            "cart_adds": cart_adds.astype(int),
+            "purchases": purchases.astype(int),
+            "conversion_rate": conversion_rate.astype(float),
+            "is_agent": is_agent.astype(bool),
+        }).reset_index()
+
+    def demand_estimate(self, interaction_df: pd.DataFrame, exclude_sessions: Optional[pd.Series] = None) -> np.ndarray:
+        # proxy demand from weighted interaction events
+        if interaction_df.empty:
+            return np.zeros(self.product_catelogue_size, dtype=np.float32)
+        df = interaction_df
+        if exclude_sessions is not None:
+            bad_sessions = set(exclude_sessions.loc[exclude_sessions].index)
+            df = df[~df["session_id"].isin(bad_sessions)]
+        weights = {"view": 0.15, "cart": 0.75, "purchase": 2.5}
+        w = df["action"].map(weights).fillna(0.0).to_numpy(dtype=float)
+        prod = df["product_id"].to_numpy(dtype=int)
+        q_hat = np.zeros(self.product_catelogue_size, dtype=float)
+        np.add.at(q_hat, prod, w)
+        return q_hat.astype(np.float32)
+
+    def run_pricing_simulation(self, prices: np.ndarray) -> Dict[str, Any]:
+        interaction_df = self._simulate_sessions(prices)
+        self._last_interaction_df = interaction_df
+        session_df = self._session_feature_table(interaction_df)
+
+        predicted_agent_sessions = None
+        if (self.use_defense and self.agent_detector is not None and not session_df.empty):
+            predicted_agent_sessions = self.agent_detector(session_df.set_index("session_id"))
+
+        q_hat_naive = self.demand_estimate(interaction_df, exclude_sessions=None)
+        q_hat_defended = self.demand_estimate(interaction_df, exclude_sessions=predicted_agent_sessions) \
+            if predicted_agent_sessions is not None else q_hat_naive.copy()
+
+        true_human = np.zeros(self.product_catelogue_size, dtype=float)
+        true_agent = np.zeros(self.product_catelogue_size, dtype=float)
+        if not interaction_df.empty:
+            purchases = interaction_df[interaction_df["action"] == "purchase"]
+            if not purchases.empty:
+                for _, r in purchases.iterrows():
+                    if r["actor"] == "human":
+                        true_human[int(r["product_id"])] += 1.0
+                    else:
+                        true_agent[int(r["product_id"])] += 1.0
+
+        revenue_observed = float(interaction_df["price_paid"].sum()) if not interaction_df.empty else 0.0
+        revenue_oracle = float(interaction_df["oracle_price_paid"].sum()) if not interaction_df.empty else 0.0
+        agent_loss = max(0.0, revenue_oracle - revenue_observed)
+
+        eps = 1e-6
+        internal_error_naive = np.abs(true_human - q_hat_naive) / (true_human + eps)
+        internal_error_def = np.abs(true_human - q_hat_defended) / (true_human + eps)
+        interaction_features = self.compute_interaction_features(interaction_df)
 
         summary = {
-                'prices': prices,
-                'true_demand': true_demand,
-                'demand_estimates': demand_estimates,
-                'internal_error': internal_error,
-                'interaction_data': interaction_data,
-                'interaction_features': interaction_features
-            }
+            "prices": prices.copy(),
+            "interaction_df": interaction_df,
+            "session_df": session_df,
+            "q_hat_naive": q_hat_naive,
+            "q_hat_defended": q_hat_defended,
+            "true_human_demand": true_human.astype(np.float32),
+            "true_agent_purchases": true_agent.astype(np.float32),
+            "internal_error_naive": internal_error_naive.astype(np.float32),
+            "internal_error_defended": internal_error_def.astype(np.float32),
+            "interaction_features": interaction_features,
+            "revenue_observed": revenue_observed,
+            "revenue_oracle": revenue_oracle,
+            "agent_loss": agent_loss,
+            "predicted_agent_sessions": predicted_agent_sessions,
+        }
         self.simulation_history.append(summary)
         return summary
 
     def get_interaction_data(self) -> np.ndarray:
-        # Simulate interaction data
-        interaction_data = []
-        return np.array(interaction_data)
-
-
-    def demand_estimate(self, interactions : np.ndarray) -> np.ndarray:
-        demand_estimates = np.random.rand(self.product_catelogue_size) * 100  # Dummy demand estimates
-        return demand_estimates
-
-
-
-
-
-
-
+        if self._last_interaction_df.empty:
+            return np.array([], dtype=object)
+        return self._last_interaction_df.to_dict(orient="records")
 
 
 class PHANTOMEnv(gym.Env):
-    def __init__(self):
-        super(PHANTOMEnv, self).__init__()
+    metadata = {"render_modes": []}
+
+    def __init__(self, use_defense: bool = False):
+        super().__init__()
         self.constraints = BusinessLogicConstraints()
-        self.action_space = spaces.Box(
-            low=-self.constraints.max_price_adjustment, high=self.constraints.max_price_adjustment,
-            shape=(self.constraints.product_catelogue_size,), dtype=np.float32) #  we allow teh learner to adjust price by some BusinessLogicConstraints factor
-        # Example for using image as input:
+        self.action_space = spaces.Box(low=-self.constraints.max_price_adjustment,
+                                       high=self.constraints.max_price_adjustment,
+                                       shape=(self.constraints.product_catelogue_size,), dtype=np.float32)
+        self.observation_space = spaces.Dict({
+            "elasticity": spaces.Dict({
+                "price": spaces.Box(
+                    low=np.full((self.constraints.product_catelogue_size,), self.constraints.system_min_price, dtype=np.float32),
+                    high=np.full((self.constraints.product_catelogue_size,), self.constraints.system_max_price, dtype=np.float32),
+                    dtype=np.float32),
+                "demand": spaces.Box(
+                    low=np.zeros((self.constraints.product_catelogue_size,), dtype=np.float32),
+                    high=np.full((self.constraints.product_catelogue_size,), 1e6, dtype=np.float32),
+                    dtype=np.float32),
+            })
+        })
         self.commerce_platform = CommercePlatform(
             product_catelogue_size=self.constraints.product_catelogue_size,
             max_price=self.constraints.system_max_price,
-            min_price=self.constraints.system_min_price
-        )
-        self.observation_space = spaces.Dict({
-            'elasticity': spaces.Dict({
-                'price': spaces.Box(low=0, high=self.constraints.system_max_price,
-                                    shape=(self.constraints.product_catelogue_size,), dtype=np.float32),
-                'demand': spaces.Box(low=0, high=np.inf,
-                                     shape=(self.constraints.product_catelogue_size,), dtype=np.float32)
-            })
-        })
+            min_price=self.constraints.system_min_price,
+            constraints=self.constraints,
+            agent_detector=simple_agent_detector,
+            use_defense=use_defense)
+        self._rng = np.random.default_rng(self.constraints.seed)
+        self.t = 0
+        self._prev_prices: Optional[np.ndarray] = None
+        self.state: Dict[str, Any] = {}
 
-    def reset(self, seed :int, options) -> tuple[dict, dict]:
+    def reset(self, seed: Optional[int] = None, options: Optional[dict] = None):
         super().reset(seed=seed)
-        # Initialize state
+        if seed is not None:
+            self._rng = np.random.default_rng(seed)
+            self.commerce_platform._rng = np.random.default_rng(seed)
+        self.t = 0
+        init_prices = self._rng.uniform(low=60.0, high=140.0, size=(self.constraints.product_catelogue_size,)).astype(np.float32)
+        self._prev_prices = init_prices.copy()
         self.state = {
-            'elasticity': {
-                'price': np.full((self.constraints.product_catelogue_size,), 100.0, dtype=np.float32),
-                'demand': np.full((self.constraints.product_catelogue_size,), 50.0, dtype=np.float32)
+            "elasticity": {
+                "price": init_prices,
+                "demand": np.zeros((self.constraints.product_catelogue_size,), dtype=np.float32),
             }
         }
         return self.state, {}
 
-    def step(self, action):
-        self.state['price'] = np.clip(self.state['price'] * (1 + action),
-                            self.constraints.system_min_price,
-                            self.constraints.system_max_price)
+    def step(self, action: np.ndarray):
+        self.t += 1
+        base_prices = self.state["elasticity"]["price"].astype(np.float32)
+        new_prices = np.clip(base_prices * (1.0 + action.astype(np.float32)),
+                           self.constraints.system_min_price,
+                           self.constraints.system_max_price).astype(np.float32)
+        result = self.commerce_platform.run_pricing_simulation(new_prices)
 
-        result = self.commerce_platform.run_pricing_simulation(self.state['price'])
-        history = self.commerce_platform.simulation_history
-        self.state['demand'] = result['demand_estimates']
+        if self.commerce_platform.use_defense:
+            demand_est = result["q_hat_defended"]
+            internal_err = result["internal_error_defended"]
+        else:
+            demand_est = result["q_hat_naive"]
+            internal_err = result["internal_error_naive"]
 
+        self.state["elasticity"]["price"] = new_prices
+        self.state["elasticity"]["demand"] = demand_est
 
+        volatility = 0.0 if self._prev_prices is None else \
+            float(np.mean(np.abs((new_prices - self._prev_prices) / (self._prev_prices + 1e-6))))
+        self._prev_prices = new_prices.copy()
 
-        reward = sum(
-            self.state['price'] * self.state['demand'],
-            # performance historically, to take into account business kpi trends (using features from interaction data)
-            sum(
-                [-0.05 * i * history[-1]['internal_error'] for i in range(1, len(history))],
-            ) if len(history) > 1 else 0,
-            sum(
-                [0.1 * history[-1]['interaction_features']['mean_sale_price'] - 0.1 * history[i]['interaction_features']['mean_sale_price'] for i in range(len(history)-1)],
-            ) if len(history) > 1 else 0
-        )
+        revenue_observed = float(result["revenue_observed"])
+        agent_loss = float(result["agent_loss"])
+        err_mean = float(np.mean(internal_err))
 
+        reward = (revenue_observed
+                 - self.constraints.w_agent_loss * agent_loss
+                 - self.constraints.w_volatility * volatility
+                 - self.constraints.w_estimation_error * err_mean)
 
+        terminated = self.t >= self.constraints.episode_length
+        info = {
+            "t": self.t,
+            "revenue_observed": revenue_observed,
+            "revenue_oracle": float(result["revenue_oracle"]),
+            "agent_loss": agent_loss,
+            "ux_volatility": volatility,
+            "mean_internal_error": err_mean,
+            "look_to_book": float(result["interaction_features"].get("look_to_book", 0.0)),
+            "mean_sale_price": float(result["interaction_features"].get("mean_sale_price", 0.0)),
+            "true_human_purchases_total": float(np.sum(result["true_human_demand"])),
+            "true_agent_purchases_total": float(np.sum(result["true_agent_purchases"])),
+        }
+        return self.state, float(reward), terminated, False, info
 
-        # Check if episode is done
-        done = self.state['price'] <= 0.0 or self.state['demand'] <= 0.0
-
-
-        return self.state, reward, done, False, {}
-    def simulate_demand(self, price):
-        # Simple linear demand model: demand decreases as price increases
-        base_demand = 200
-        price_sensitivity = 0.5
-        demand = max(0, base_demand - price_sensitivity * price)
-        return demand
 
 if __name__ == "__main__":
-    env = PHANTOMEnv()
-    obs, _ = env.reset()
-    done = False
-    total_reward = 0
+    import matplotlib.pyplot as plt
+    from collections import defaultdict
 
-    while not done:
-        action = env.action_space.sample()  # Random action
-        obs, reward, done, _, _ = env.step(action)
-        total_reward += reward
-        print(f"Price: {obs['price']:.2f}, Demand: {obs['demand']:.2f}, Reward: {reward:.2f}")
-        if done:
-            break
+    runs = {}
+    for use_defense in (False, True):
+        env = PHANTOMEnv(use_defense=use_defense)
+        obs, _ = env.reset(seed=42)
+        metrics = defaultdict(list)
+        total_reward = 0.0
+        done = False
 
-    print(f"Total Reward: {total_reward:.2f}")
+        while not done:
+            action = env.action_space.sample()
+            obs, reward, done, _, info = env.step(action)
+            total_reward += reward
+            p_mean = float(np.mean(obs["elasticity"]["price"]))
+            q_mean = float(np.mean(obs["elasticity"]["demand"]))
+            p_std = float(np.std(obs["elasticity"]["price"]))
+
+            metrics['t'].append(info['t'])
+            metrics['price_mean'].append(p_mean)
+            metrics['price_std'].append(p_std)
+            metrics['demand_mean'].append(q_mean)
+            metrics['revenue_observed'].append(info['revenue_observed'])
+            metrics['revenue_oracle'].append(info['revenue_oracle'])
+            metrics['agent_loss'].append(info['agent_loss'])
+            metrics['ux_volatility'].append(info['ux_volatility'])
+            metrics['look_to_book'].append(info['look_to_book'])
+            metrics['reward'].append(reward)
+            metrics['human_purchases'].append(info['true_human_purchases_total'])
+            metrics['agent_purchases'].append(info['true_agent_purchases_total'])
+
+            if info['t'] % 20 == 0 or done:
+                print(f"defense={'ON ' if use_defense else 'OFF'} t={info['t']:03d} p={p_mean:6.2f}±{p_std:4.2f} "
+                      f"q={q_mean:6.2f} rev={info['revenue_observed']:7.2f} oracle={info['revenue_oracle']:7.2f} "
+                      f"loss={info['agent_loss']:6.2f} ux={info['ux_volatility']:.3f} "
+                      f"ltb={info['look_to_book']:5.2f} r={reward:7.2f}")
+
+        runs[use_defense] = metrics
+        print(f"defense={'ON ' if use_defense else 'OFF'} total_reward={total_reward:.2f}\n")
+
+    fig, axes = plt.subplots(3, 3, figsize=(15, 12))
+    fig.suptitle('PHANTOM Environment: Defense OFF vs ON', fontsize=14, fontweight='bold')
+
+    plot_configs = [
+        ('price_mean', 'Mean Price', 'Price'),
+        ('demand_mean', 'Mean Demand Estimate', 'Demand'),
+        ('revenue_observed', 'Revenue (Observed)', 'Revenue'),
+        ('agent_loss', 'Agent Loss (Oracle - Observed)', 'Loss'),
+        ('ux_volatility', 'UX Volatility (Price Change)', 'Volatility'),
+        ('look_to_book', 'Look-to-Book Ratio', 'Ratio'),
+        ('reward', 'Step Reward', 'Reward'),
+        ('human_purchases', 'Human Purchases', 'Count'),
+        ('agent_purchases', 'Agent Purchases', 'Count'),
+    ]
+
+    for idx, (key, title, ylabel) in enumerate(plot_configs):
+        ax = axes[idx // 3, idx % 3]
+        for use_defense, label, color in [(False, 'No Defense', 'red'), (True, 'With Defense', 'blue')]:
+            m = runs[use_defense]
+            ax.plot(m['t'], m[key], label=label, color=color, alpha=0.7, linewidth=1.5)
+        ax.set_xlabel('Step')
+        ax.set_ylabel(ylabel)
+        ax.set_title(title, fontsize=10, fontweight='bold')
+        ax.legend(loc='best', fontsize=8)
+        ax.grid(True, alpha=0.3)
+
+    plt.tight_layout()
+    plt.savefig('phantom_env_comparison.png', dpi=150, bbox_inches='tight')
+    print("Plot saved to phantom_env_comparison.png")
+    plt.show()

From 3fa98f375df31eb23fe5a43116cf4c1535ce706d Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 17 Dec 2025 17:41:16 +0100
Subject: [PATCH 69/99] refactor to align more with research in the env sims

---
 sim/rl/engine.py      | 220 ++++++++++++++++++++++++++++++++++++
 sim/rl/environment.py | 255 ++++++++++--------------------------------
 sim/rl/train.py       | 149 ++++++++++++++++++++++++
 3 files changed, 431 insertions(+), 193 deletions(-)
 create mode 100644 sim/rl/engine.py
 create mode 100644 sim/rl/train.py

diff --git a/sim/rl/engine.py b/sim/rl/engine.py
new file mode 100644
index 0000000..6d913f3
--- /dev/null
+++ b/sim/rl/engine.py
@@ -0,0 +1,220 @@
+import numpy as np
+import pandas as pd
+from abc import ABC, abstractmethod
+from typing import Dict, Any
+from environment import BusinessLogicConstraints
+
+
+class BasePricingEngine(ABC):
+    """base interface for all pricing engines"""
+    def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
+        self.c = constraints  # business guardrails (bounds, max adjustment, catalogue size)
+        self.rng = np.random.default_rng(seed)  # engine-local RNG seeded from the argument
+        self.step_count = 0  # subclasses increment this on every compute_prices call
+
+    @abstractmethod
+    def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
+        """compute new prices given current state and observation from environment
+
+        args:
+            current_prices: current price vector [N]
+            observation: dict containing 'price', 'demand', and possibly interaction data
+
+        returns:
+            new_prices: updated price vector [N]
+        """
+        pass
+
+    def update(self, obs, reward, done, info):
+        """optional hook for feeding back (obs, reward, done, info); the default does nothing"""
+        # deliberately concrete: none of the engines below override it, and an abstract
+        # update() would make every one of them fail to instantiate with a TypeError
+        return None
+
+    def reset(self):
+        """reset engine state for new episode"""
+        self.step_count = 0
+
+
+class WildPricingEngine(BasePricingEngine):
+    """production-like pricing using online elasticity estimation via EWMA regression"""
+    def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
+        super().__init__(constraints, seed)
+        # per-product unit costs (unknown to customers; known to platform)
+        self.unit_cost = self.rng.uniform(8.0, 40.0, size=self.c.product_catelogue_size).astype(np.float32)
+        # online elasticity estimate (start moderately elastic)
+        self.e_hat = np.full((self.c.product_catelogue_size,), -1.3, dtype=np.float32)
+        # EWMA state for log-log regression
+        self.mu_logp = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        self.mu_logq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        self.cov_pq  = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        self.var_p   = np.ones(self.c.product_catelogue_size, dtype=np.float32)
+        # knobs typical in production
+        self.lr = 0.08  # fraction of the gap to the target price closed per step
+        self.ewma = 0.05  # weight of the newest observation in the running moments
+        self.eps_explore = 0.03  # per-step probability of perturbing the whole price vector
+        self.explore_scale = 0.03  # std-dev of the multiplicative exploration noise
+
+    def _safe_elasticity(self, e: np.ndarray) -> np.ndarray:
+        return np.clip(e, -5.0, -1.05)  # stay below -1 so e / (e + 1) in the markup is finite and positive
+
+    def reset(self):
+        super().reset()  # note: unit_cost is not redrawn here, only the estimator state below
+        self.e_hat = np.full((self.c.product_catelogue_size,), -1.3, dtype=np.float32)
+        self.mu_logp = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        self.mu_logq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        self.cov_pq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        self.var_p = np.ones(self.c.product_catelogue_size, dtype=np.float32)
+
+    def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
+        self.step_count += 1
+        # extract demand signal (from env observation) as proxy for sales; all zeros if the key is missing
+        demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32))
+        return self._update_from_demand(current_prices, demand)
+
+    def _update_from_demand(self, prices: np.ndarray, sold: np.ndarray) -> np.ndarray:
+        # log transforms: prices are floored at 1e-3, demand uses log(1 + q) so zero sales stay finite
+        logp = np.log(np.clip(prices, 1e-3, None)).astype(np.float32)
+        logq = np.log(sold + 1.0).astype(np.float32)
+        # EWMA moments for per-product regression: logq ≈ a + e*logp
+        a = self.ewma
+        dp = logp - self.mu_logp  # deviations are taken from the means *before* they are updated below
+        dq = logq - self.mu_logq
+        self.mu_logp = (1 - a) * self.mu_logp + a * logp
+        self.mu_logq = (1 - a) * self.mu_logq + a * logq
+        self.cov_pq = (1 - a) * self.cov_pq + a * (dp * dq)
+        self.var_p = (1 - a) * self.var_p + a * (dp * dp + 1e-6)
+        e_new = self.cov_pq / (self.var_p + 1e-6)  # regression slope = cov / var, guarded against zero variance
+        self.e_hat = self._safe_elasticity(0.9 * self.e_hat + 0.1 * e_new)  # blend 10% of the new slope, then clamp
+        # profit-optimal price for isoelastic demand (if e < -1)
+        e = self.e_hat
+        p_star = self.unit_cost * (e / (e + 1.0))
+        # smooth toward p_star
+        new_prices = (1 - self.lr) * prices + self.lr * p_star
+        # exploration (small random perturbations); one draw decides for all products at once
+        if self.rng.random() < self.eps_explore:
+            noise = self.rng.normal(0.0, self.explore_scale, size=new_prices.shape).astype(np.float32)
+            new_prices = new_prices * (1.0 + noise)
+        # apply business guardrails (max change + bounds)
+        max_adj = self.c.max_price_adjustment
+        ratio = np.clip(new_prices / (prices + 1e-6), 1 - max_adj, 1 + max_adj)  # cap the relative change per step
+        new_prices = prices * ratio
+        new_prices = np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)
+        return new_prices
+
+
+class StaticPricingEngine(BasePricingEngine):
+    """baseline: fixed prices throughout episode"""
+    def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
+        super().__init__(constraints, seed)
+        self.fixed_prices = None  # filled in by the first compute_prices call
+
+    def reset(self):
+        super().reset()
+        self.fixed_prices = None  # forget the snapshot so the next call captures a new one
+
+    def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
+        self.step_count += 1
+        if self.fixed_prices is None:
+            self.fixed_prices = current_prices.copy()  # snapshot the first prices seen; observation is ignored
+        return self.fixed_prices.copy()  # return a copy so callers cannot mutate the snapshot
+
+
+class SimpleDemandEngine(BasePricingEngine):
+    """demand-driven pricing: increase price when demand rises, decrease when it falls"""
+    def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
+        super().__init__(constraints, seed)
+        self.prev_demand = None  # demand vector from the previous call; None until the first call
+        self.lr = 0.05  # scales the relative demand change into a relative price change
+
+    def reset(self):
+        super().reset()
+        self.prev_demand = None
+
+    def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
+        self.step_count += 1
+        demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32))
+        if self.prev_demand is None:
+            self.prev_demand = demand.copy()
+            return current_prices.copy()  # first call only records demand; prices are left unchanged
+        # simple rule: if demand increases, raise price; if decreases, lower price
+        delta_d = demand - self.prev_demand
+        price_adj = self.lr * delta_d / (np.abs(self.prev_demand) + 1.0)  # signed relative change; +1 avoids dividing by zero
+        new_prices = current_prices * (1.0 + price_adj)
+        self.prev_demand = demand.copy()
+        # apply constraints
+        max_adj = self.c.max_price_adjustment
+        ratio = np.clip(new_prices / (current_prices + 1e-6), 1 - max_adj, 1 + max_adj)  # cap the relative change per step
+        new_prices = current_prices * ratio
+        return np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)
+
+
+class RandomWalkEngine(BasePricingEngine):
+    """random walk pricing with mean reversion"""
+    def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
+        super().__init__(constraints, seed)
+        self.target_price = None  # reversion anchor, captured from the first prices seen
+        self.volatility = 0.02  # std-dev of the per-step multiplicative noise
+
+    def reset(self):
+        super().reset()
+        self.target_price = None
+
+    def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
+        self.step_count += 1
+        if self.target_price is None:
+            self.target_price = current_prices.copy()
+        # random walk with mean reversion toward target (observation is not used)
+        noise = self.rng.normal(0.0, self.volatility, size=current_prices.shape).astype(np.float32)
+        reversion = 0.01 * (self.target_price - current_prices)  # additive pull of 1% of the gap to the anchor
+        new_prices = current_prices * (1.0 + noise) + reversion
+        # apply constraints
+        max_adj = self.c.max_price_adjustment
+        ratio = np.clip(new_prices / (current_prices + 1e-6), 1 - max_adj, 1 + max_adj)  # cap the relative change per step
+        new_prices = current_prices * ratio
+        return np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)
+
+
+class ThompsonSamplingEngine(BasePricingEngine):
+    """bayesian bandit approach per product treating price as discrete action"""
+    def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
+        super().__init__(constraints, seed)
+        self.n_price_levels = 5  # number of discrete price arms per product
+        self.alpha = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
+        self.beta = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
+        self.price_grid = None  # built lazily from the first prices passed to compute_prices
+        self.last_actions = None  # arm index chosen per product on the previous call
+
+    def reset(self):
+        super().reset()
+        self.alpha = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
+        self.beta = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
+        self.price_grid = None
+        self.last_actions = None
+
+    def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
+        self.step_count += 1
+        if self.price_grid is None:
+            # define price grid per product: hard-coded +/-30% band around the first prices seen
+            lo = current_prices * 0.7
+            hi = current_prices * 1.3
+            self.price_grid = np.linspace(lo, hi, self.n_price_levels).T  # transposed so rows are products, columns are levels
+        demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32))
+        # update beliefs based on last action: the current demand is credited to the arm chosen last call
+        if self.last_actions is not None:
+            for i in range(self.c.product_catelogue_size):
+                a = self.last_actions[i]
+                reward = demand[i]
+                if reward > 0.5:
+                    self.alpha[i, a] += reward  # success is weighted by the demand value itself
+                else:
+                    self.beta[i, a] += 1.0  # demand at or below 0.5 counts as a single failure
+        # thompson sampling: sample from posterior, pick best
+        new_prices = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        actions = np.zeros(self.c.product_catelogue_size, dtype=int)
+        for i in range(self.c.product_catelogue_size):
+            theta = self.rng.beta(self.alpha[i], self.beta[i]).astype(np.float32)  # one posterior draw per arm
+            actions[i] = int(np.argmax(theta))
+            new_prices[i] = self.price_grid[i, actions[i]]
+        self.last_actions = actions
+        return np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)  # no max_price_adjustment clamp here, unlike the other engines
diff --git a/sim/rl/environment.py b/sim/rl/environment.py
index 19f9ad4..fd725f8 100644
--- a/sim/rl/environment.py
+++ b/sim/rl/environment.py
@@ -1,5 +1,7 @@
+from sys import intern
 import gymnasium as gym
 from gymnasium import spaces
+from matplotlib import interactive
 import numpy as np
 from dataclasses import dataclass
 import pandas as pd
@@ -24,7 +26,7 @@ class BusinessLogicConstraints():
     coi_sigmoid_temp: float = 1.25
     base_human_demand: float = 0.08
     base_agent_demand: float = 0.05
-    human_price_elasticity: float = -1.2
+    human_price_elasticity: float = -1.2 # assumptions here
     agent_price_elasticity: float = -0.6
     w_agent_loss: float = 1.0
     w_volatility: float = 5.0
@@ -35,31 +37,25 @@ class BusinessLogicConstraints():
 def _sigmoid(x: np.ndarray) -> np.ndarray:
     return 1.0 / (1.0 + np.exp(-x))
 
-
-def simple_agent_detector(session_df: pd.DataFrame) -> pd.Series:
-    # baseline heuristic: high velocity + low conversion
-    v = session_df.get("interaction_velocity", pd.Series(0.0, index=session_df.index))
-    cr = session_df.get("conversion_rate", pd.Series(0.0, index=session_df.index))
-    total = session_df.get("total_interactions", pd.Series(0, index=session_df.index))
-    return (total >= 12) & (v >= 0.20) & (cr <= 0.01)
-
-
 class CommercePlatform:
-    def __init__(self, product_catelogue_size: int, max_price: float, min_price: float,
-                 constraints: BusinessLogicConstraints, agent_detector: Optional[Callable[[pd.DataFrame], pd.Series]] = None,
-                 use_defense: bool = False):
+    """
+    This is just an extension of the state management for the environment, it does not implement anything dynamic just helps us simulate demand.
+    """
+    def __init__(self,
+                 product_catelogue_size: int,
+                 max_price: float,
+                 min_price: float,
+                 constraints: BusinessLogicConstraints):
         self.product_catelogue_size = product_catelogue_size
+        self.product_supply = np.random.uniform(low=10, high=50, size=(self.product_catelogue_size,))
         self.max_price = max_price
         self.min_price = min_price
         self.constraints = constraints
-        self.use_defense = use_defense
-        self.agent_detector = agent_detector
         self.simulation_history: List[Dict[str, Any]] = []
         self._rng = np.random.default_rng(constraints.seed)
-        self._popularity = self._rng.lognormal(mean=0.0, sigma=0.6, size=self.product_catelogue_size)
-        self._popularity = self._popularity / (self._popularity.mean() + 1e-12)
         self._last_interaction_df: pd.DataFrame = pd.DataFrame()
 
+
     def setup_true_demand(self, prices: np.ndarray) -> Dict[str, np.ndarray]:
         # ground truth purchase propensities
         p = np.clip(prices, self.min_price, self.max_price)
@@ -67,14 +63,19 @@ class CommercePlatform:
         human_prob = self.constraints.base_human_demand * (pn ** self.constraints.human_price_elasticity)
         agent_prob = self.constraints.base_agent_demand * (pn ** self.constraints.agent_price_elasticity)
         return {
-            "human_purchase_prob": np.clip(human_prob * self._popularity, 0.0, 0.95),
-            "agent_purchase_prob": np.clip(agent_prob * self._popularity, 0.0, 0.95)
+            "human_purchase_prob": np.clip(human_prob, 0.0, 0.95),
+            "agent_purchase_prob": np.clip(agent_prob, 0.0, 0.95)
         }
 
-    def _session_markup_multiplier(self, signal_score: float) -> float:
-        # session-based COI markup based on demand signal expression
-        x = (signal_score - self.constraints.coi_threshold) / max(self.constraints.coi_sigmoid_temp, 1e-6)
-        return 1.0 + self.constraints.coi_strength * float(_sigmoid(np.array([x]))[0])
+    @staticmethod
+    def _load_behavioral_profile(actor : str, demand_forcing):
+        """Return the behavioural Markov chain for ``actor`` (not implemented yet).
+
+        The chain should hold averaged transition probabilities from the interaction data of our experiments,
+        e.g. search -> view_item_price_binN: 0.7 and view_item_price_binN -> add_to_cart: 0.2,
+        reweighted with ``demand_forcing`` (the per-product purchase probabilities).
+        """
+        raise NotImplementedError("behavioural profile loading is not implemented yet")
 
     def _simulate_sessions(self, base_prices: np.ndarray) -> pd.DataFrame:
         demand = self.setup_true_demand(base_prices)
@@ -84,94 +85,32 @@ class CommercePlatform:
         T = self.constraints.sessions_per_step
         n_agent_sessions = int(round(T * self.constraints.agent_share))
         n_human_sessions = T - n_agent_sessions
-
-        # human sessions: normal browse with possible purchase
-        for s in range(n_human_sessions):
-            session_id = f"h_{len(events)}_{s}"
-            k = int(self._rng.integers(1, 4))
-            prod_ids = self._rng.choice(self.product_catelogue_size, size=k, replace=False)
-            t = 0.0
-            inter_times = self._rng.gamma(shape=2.0, scale=3.0, size=3 * k)
-            signal_score = 0.0
-            purchased_any = False
-
-            for i, pid in enumerate(prod_ids):
-                t += float(inter_times[i])
-                price_shown = float(base_prices[pid])
-                events.append({
-                    "session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
-                    "action": "view", "t": t, "price_shown": price_shown, "is_purchase": 0,
-                    "price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
-                })
-                signal_score += 1.0
-
-                if self._rng.random() < 0.35:
-                    t += float(inter_times[i + k])
-                    events.append({
-                        "session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
-                        "action": "cart", "t": t, "price_shown": price_shown, "is_purchase": 0,
-                        "price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
-                    })
-                    signal_score += 2.0
-
-                if (not purchased_any) and (self._rng.random() < float(human_pprob[pid])):
-                    t += float(inter_times[i + 2 * k])
-                    mult = self._session_markup_multiplier(signal_score)
-                    price_paid = float(np.clip(base_prices[pid] * mult, self.min_price, self.max_price))
-                    events.append({
-                        "session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
-                        "action": "purchase", "t": t, "price_shown": float(base_prices[pid]), "is_purchase": 1,
-                        "price_paid": price_paid, "oracle_price_paid": price_paid, "signal_score": signal_score,
-                    })
-                    purchased_any = True
-
-        # agent sessions: split recon/purchase to circumvent COI
         n_agent_ids = max(1, n_agent_sessions // 2)
-        for a in range(n_agent_ids):
-            agent_id = f"a_{a}"
-            recon_session_id = f"{agent_id}_recon"
-            t = 0.0
-            n_views = int(self._rng.poisson(lam=8) * self.constraints.agent_recon_multiplier) + 5
-            inter_times = self._rng.gamma(shape=2.0, scale=0.6, size=max(n_views, 1))
-            prod_ids = self._rng.integers(0, self.product_catelogue_size, size=n_views)
-            recon_signal = 0.0
+        session_map = {
+            'humans': n_human_sessions,
+            'agents': n_agent_ids
+        }
+        pprob_map = {
+            'humans': human_pprob,
+            'agents': agent_pprob
+        }
+        joint_events = []
+        for actor, n_sessions in session_map.items():
+            bp = type(self)._load_behavioral_profile(actor, pprob_map[actor])  # class attribute, not a module-level name
+            counter = 0
+            events = []
+            while counter < n_sessions:
+                session_events = []
+                while len(session_events) == 0 or session_events[-1]['action'] != 'checkout':  # keep sampling until checkout ends the session
+                    interaction_event = bp.sample(self._rng)
+                    interaction_event['session_id'] = f'{actor}_{counter:06d}'
+                    # TODO any other assignments
+                    session_events.append(interaction_event)
+                events.extend(session_events)
+                counter += 1
+            joint_events.extend(events)
 
-            for i, pid in enumerate(prod_ids):
-                t += float(inter_times[i])
-                events.append({
-                    "session_id": recon_session_id, "actor": "agent", "agent_id": agent_id, "product_id": int(pid),
-                    "action": "view", "t": t, "price_shown": float(base_prices[pid]), "is_purchase": 0,
-                    "price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
-                })
-                recon_signal += 1.0
-
-            # clean purchase session with minimal interactions
-            if self._rng.random() < self.constraints.agent_purchase_probability:
-                purchase_session_id = f"{agent_id}_clean"
-                pid = int(self._rng.integers(0, self.product_catelogue_size))
-                t2 = 0.0
-                clean_signal = 0.0
-                t2 += float(self._rng.gamma(shape=2.0, scale=0.7))
-                events.append({
-                    "session_id": purchase_session_id, "actor": "agent", "agent_id": agent_id, "product_id": pid,
-                    "action": "view", "t": t2, "price_shown": float(base_prices[pid]), "is_purchase": 0,
-                    "price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
-                })
-                clean_signal += 1.0
-
-                if self._rng.random() < float(agent_pprob[pid]):
-                    t2 += float(self._rng.gamma(shape=2.0, scale=0.7))
-                    obs_mult = self._session_markup_multiplier(clean_signal)
-                    obs_paid = float(np.clip(base_prices[pid] * obs_mult, self.min_price, self.max_price))
-                    oracle_mult = self._session_markup_multiplier(recon_signal)  # oracle links recon->purchase
-                    oracle_paid = float(np.clip(base_prices[pid] * oracle_mult, self.min_price, self.max_price))
-                    events.append({
-                        "session_id": purchase_session_id, "actor": "agent", "agent_id": agent_id, "product_id": pid,
-                        "action": "purchase", "t": t2, "price_shown": float(base_prices[pid]), "is_purchase": 1,
-                        "price_paid": obs_paid, "oracle_price_paid": oracle_paid, "signal_score": clean_signal,
-                    })
-
-        return pd.DataFrame(events)
+        return pd.DataFrame(joint_events)
 
     def compute_interaction_features(self, interaction_df: pd.DataFrame) -> Dict[str, float]:
         if interaction_df.empty:
@@ -183,6 +122,7 @@ class CommercePlatform:
         return {"mean_sale_price": mean_sale_price, "look_to_book": float(views / (buys + 1e-6))}
 
     def _session_feature_table(self, df: pd.DataFrame) -> pd.DataFrame:
+        # TODO: adapt this
         if df.empty:
             return pd.DataFrame()
         g = df.groupby("session_id", sort=False)
@@ -208,73 +148,6 @@ class CommercePlatform:
             "is_agent": is_agent.astype(bool),
         }).reset_index()
 
-    def demand_estimate(self, interaction_df: pd.DataFrame, exclude_sessions: Optional[pd.Series] = None) -> np.ndarray:
-        # proxy demand from weighted interaction events
-        if interaction_df.empty:
-            return np.zeros(self.product_catelogue_size, dtype=np.float32)
-        df = interaction_df
-        if exclude_sessions is not None:
-            bad_sessions = set(exclude_sessions.loc[exclude_sessions].index)
-            df = df[~df["session_id"].isin(bad_sessions)]
-        weights = {"view": 0.15, "cart": 0.75, "purchase": 2.5}
-        w = df["action"].map(weights).fillna(0.0).to_numpy(dtype=float)
-        prod = df["product_id"].to_numpy(dtype=int)
-        q_hat = np.zeros(self.product_catelogue_size, dtype=float)
-        np.add.at(q_hat, prod, w)
-        return q_hat.astype(np.float32)
-
-    def run_pricing_simulation(self, prices: np.ndarray) -> Dict[str, Any]:
-        interaction_df = self._simulate_sessions(prices)
-        self._last_interaction_df = interaction_df
-        session_df = self._session_feature_table(interaction_df)
-
-        predicted_agent_sessions = None
-        if (self.use_defense and self.agent_detector is not None and not session_df.empty):
-            predicted_agent_sessions = self.agent_detector(session_df.set_index("session_id"))
-
-        q_hat_naive = self.demand_estimate(interaction_df, exclude_sessions=None)
-        q_hat_defended = self.demand_estimate(interaction_df, exclude_sessions=predicted_agent_sessions) \
-            if predicted_agent_sessions is not None else q_hat_naive.copy()
-
-        true_human = np.zeros(self.product_catelogue_size, dtype=float)
-        true_agent = np.zeros(self.product_catelogue_size, dtype=float)
-        if not interaction_df.empty:
-            purchases = interaction_df[interaction_df["action"] == "purchase"]
-            if not purchases.empty:
-                for _, r in purchases.iterrows():
-                    if r["actor"] == "human":
-                        true_human[int(r["product_id"])] += 1.0
-                    else:
-                        true_agent[int(r["product_id"])] += 1.0
-
-        revenue_observed = float(interaction_df["price_paid"].sum()) if not interaction_df.empty else 0.0
-        revenue_oracle = float(interaction_df["oracle_price_paid"].sum()) if not interaction_df.empty else 0.0
-        agent_loss = max(0.0, revenue_oracle - revenue_observed)
-
-        eps = 1e-6
-        internal_error_naive = np.abs(true_human - q_hat_naive) / (true_human + eps)
-        internal_error_def = np.abs(true_human - q_hat_defended) / (true_human + eps)
-        interaction_features = self.compute_interaction_features(interaction_df)
-
-        summary = {
-            "prices": prices.copy(),
-            "interaction_df": interaction_df,
-            "session_df": session_df,
-            "q_hat_naive": q_hat_naive,
-            "q_hat_defended": q_hat_defended,
-            "true_human_demand": true_human.astype(np.float32),
-            "true_agent_purchases": true_agent.astype(np.float32),
-            "internal_error_naive": internal_error_naive.astype(np.float32),
-            "internal_error_defended": internal_error_def.astype(np.float32),
-            "interaction_features": interaction_features,
-            "revenue_observed": revenue_observed,
-            "revenue_oracle": revenue_oracle,
-            "agent_loss": agent_loss,
-            "predicted_agent_sessions": predicted_agent_sessions,
-        }
-        self.simulation_history.append(summary)
-        return summary
-
     def get_interaction_data(self) -> np.ndarray:
         if self._last_interaction_df.empty:
             return np.array([], dtype=object)
@@ -284,7 +157,7 @@ class CommercePlatform:
 class PHANTOMEnv(gym.Env):
     metadata = {"render_modes": []}
 
-    def __init__(self, use_defense: bool = False):
+    def __init__(self, constraints):
         super().__init__()
         self.constraints = BusinessLogicConstraints()
         self.action_space = spaces.Box(low=-self.constraints.max_price_adjustment,
@@ -301,14 +174,13 @@ class PHANTOMEnv(gym.Env):
                     high=np.full((self.constraints.product_catelogue_size,), 1e6, dtype=np.float32),
                     dtype=np.float32),
             })
+            # TODO: define more features that we compute from the interaction data
         })
         self.commerce_platform = CommercePlatform(
             product_catelogue_size=self.constraints.product_catelogue_size,
             max_price=self.constraints.system_max_price,
             min_price=self.constraints.system_min_price,
-            constraints=self.constraints,
-            agent_detector=simple_agent_detector,
-            use_defense=use_defense)
+            constraints=self.constraints)
         self._rng = np.random.default_rng(self.constraints.seed)
         self.t = 0
         self._prev_prices: Optional[np.ndarray] = None
@@ -336,17 +208,13 @@ class PHANTOMEnv(gym.Env):
         new_prices = np.clip(base_prices * (1.0 + action.astype(np.float32)),
                            self.constraints.system_min_price,
                            self.constraints.system_max_price).astype(np.float32)
-        result = self.commerce_platform.run_pricing_simulation(new_prices)
-
-        if self.commerce_platform.use_defense:
-            demand_est = result["q_hat_defended"]
-            internal_err = result["internal_error_defended"]
-        else:
-            demand_est = result["q_hat_naive"]
-            internal_err = result["internal_error_naive"]
 
         self.state["elasticity"]["price"] = new_prices
-        self.state["elasticity"]["demand"] = demand_est
+        # TODO: use the commerce platform to simulate sessions
+        interactions_df = self.commerce_platform._simulate_sessions(new_prices)
+        result = self.commerce_platform.compute_interaction_features(interactions_df)
+        # TODO: implement COI computation to use in reward
+        COI = 0.0
 
         volatility = 0.0 if self._prev_prices is None else \
             float(np.mean(np.abs((new_prices - self._prev_prices) / (self._prev_prices + 1e-6))))
@@ -354,12 +222,13 @@ class PHANTOMEnv(gym.Env):
 
         revenue_observed = float(result["revenue_observed"])
         agent_loss = float(result["agent_loss"])
-        err_mean = float(np.mean(internal_err))
 
         reward = (revenue_observed
-                 - self.constraints.w_agent_loss * agent_loss
-                 - self.constraints.w_volatility * volatility
-                 - self.constraints.w_estimation_error * err_mean)
+                  - COI
+                  - self.constraints.w_agent_loss * agent_loss
+                  - self.constraints.w_volatility * volatility
+                  - self.constraints.w_estimation_error
+                  )
 
         terminated = self.t >= self.constraints.episode_length
         info = {
diff --git a/sim/rl/train.py b/sim/rl/train.py
new file mode 100644
index 0000000..41a87ab
--- /dev/null
+++ b/sim/rl/train.py
@@ -0,0 +1,149 @@
+import numpy as np
+import logging
+from pathlib import Path
+from typing import Dict, Type, Optional
+import pickle
+from torch import neg_
+from torch.utils.tensorboard import SummaryWriter
+from environment import PHANTOMEnv, FastTrainingConstraints, BusinessLogicConstraints
+from engine import (BasePricingEngine, WildPricingEngine, StaticPricingEngine,
+                   SimpleDemandEngine, RandomWalkEngine, ThompsonSamplingEngine)
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
+logger = logging.getLogger(__name__)
+
+
+
+"""
+Target training loop:
+have base prices p0 from env reset and run the env step, collect reward and metrics
+pass this to the pricing engine which computes the price action to take based on previous reward by learning
+the new action gets passed to the step
+so we alternate, step -> reward -> engine (produces price delta) -> step with price delta -> reward
+to make sure the reinforcement learning inside the engine can learn we need to have trajectory of prices
+CURRENT SOLUTION BELOW does not implement correct learning or updates.
+"""
+
+class EngineTrainer:
+    """wrapper to run pricing engines through episodes and collect metrics"""
+    def __init__(self, engine: BasePricingEngine, env: PHANTOMEnv,
+                 tb_writer: Optional[SummaryWriter] = None):
+        self.engine = engine
+        self.env = env
+        self.episode_metrics = []
+        self.tb_writer = tb_writer
+        self.global_step = 0
+
+    def train(self, n_episodes: int, seed: int = 42):
+
+        obs, _ = self.env.reset(seed=seed)
+        prices = None
+        for ep in range(n_episodes):
+            prices = self.engine.compute_prices(prices, obs
+            obs, reward, done, _, info = self.env.step(prices)
+            self.engine.update(obs, reward, done, info)
+        return self
+
+
+
+
+
+
+        return self.episode_metrics
+
+    def evaluate(self, n_episodes: int = 10, seed: int = 100) -> Dict:
+        """evaluate trained engine"""
+        results = {k: [] for k in ['total_reward', 'revenue_observed', 'revenue_oracle',
+                                   'agent_loss', 'ux_volatility', 'look_to_book']}
+        for ep in range(n_episodes):
+            metrics = self.run_episode(seed=seed + ep)
+            for k in results:                results[k].append(metrics[k])
+        return {k: (np.mean(v), np.std(v)) for k, v in results.items()}
+
+
+def make_env(fast: bool = True):
+    constraints = FastTrainingConstraints() if fast else BusinessLogicConstraints()
+    return PHANTOMEnv(constraints=constraints)
+
+
+def train_engine(engine_cls: Type[BasePricingEngine], env: PHANTOMEnv,
+                n_episodes: int, seed: int = 42,
+                tb_writer: Optional[SummaryWriter] = None) -> EngineTrainer:
+    constraints = env.constraints
+    engine = engine_cls(constraints=constraints, seed=seed)
+    trainer = EngineTrainer(engine, env, tb_writer=tb_writer)
+    trainer.train(n_episodes, seed=seed)
+    return trainer
+
+
+def save_trainer(trainer: EngineTrainer, path: Path):
+    """save engine state and metrics"""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with open(path, 'wb') as f:
+        pickle.dump({
+            'engine': trainer.engine,
+            'metrics': trainer.episode_metrics
+        }, f)
+    logger.info(f"Saved trainer to {path}")
+
+
+def load_trainer(path: Path, env: PHANTOMEnv,
+                 tb_writer: Optional[SummaryWriter] = None) -> EngineTrainer:
+    """load saved engine"""
+    with open(path, 'rb') as f:
+        data = pickle.load(f)
+    trainer = EngineTrainer(data['engine'], env, tb_writer=tb_writer)
+    trainer.episode_metrics = data['metrics']
+    return trainer
+
+
+if __name__ == "__main__":
+    base_dir = Path("./runs")
+    base_dir.mkdir(exist_ok=True)
+
+    engines = {
+        "Wild": WildPricingEngine,
+        "Static": StaticPricingEngine,
+#        "SimpleDemand": SimpleDemandEngine,
+        "RandomWalk": RandomWalkEngine,
+        "ThompsonSampling": ThompsonSamplingEngine,
+    }
+    defenses = [False, True]
+    n_train_episodes = 50
+    n_eval_episodes = 10
+    seed = 42
+    fast_mode = True
+
+    logger.info(f"Training config: {n_train_episodes} episodes per engine, fast_mode={fast_mode}")
+
+    trained_trainers = {}
+
+    for engine_name, engine_cls in engines.items():
+        for use_defense in defenses:
+            defense_label = "defense_on" if use_defense else "defense_off"
+            run_name = f"{engine_name}_{defense_label}"
+            log_dir = base_dir / run_name
+            log_dir.mkdir(parents=True, exist_ok=True)
+
+            logger.info(f"Training {engine_name} with defense={use_defense}")
+            logger.info(f"Log directory: {log_dir}")
+
+            env = make_env(fast=fast_mode)
+            tb_writer = SummaryWriter(log_dir=str(log_dir))
+            trainer = train_engine(engine_cls, env, n_train_episodes, seed, tb_writer=tb_writer)
+            tb_writer.close()
+
+            save_path = log_dir / "trainer.pkl"
+            save_trainer(trainer, save_path)
+
+            trained_trainers[run_name] = (trainer, env)
+
+    logger.info("Starting evaluation")
+
+    for run_name, (trainer, env) in trained_trainers.items():
+        logger.info(f"Evaluating {run_name}")
+        results = trainer.evaluate(n_episodes=n_eval_episodes, seed=seed + 1000)
+        for metric, (mean, std) in results.items():
+            logger.info(f"  {metric:20s}: {mean:10.2f} ± {std:6.2f}")
+
+    logger.info(f"Results saved to: {base_dir}")

From 6a06a8af4a01a898a1897cbc9b2560044e201ee1 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 17 Dec 2025 18:50:04 +0100
Subject: [PATCH 70/99] simple code cleanup

---
 sim/rl/engine.py | 7 +++++++
 sim/rl/train.py  | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/sim/rl/engine.py b/sim/rl/engine.py
index 6d913f3..e0caca8 100644
--- a/sim/rl/engine.py
+++ b/sim/rl/engine.py
@@ -1,3 +1,4 @@
+from os import kill
 import numpy as np
 import pandas as pd
 from abc import ABC, abstractmethod
@@ -5,6 +6,11 @@ from typing import Dict, Any
 from environment import BusinessLogicConstraints
 
 
+"""
+An engine should by default have its own demand-estimation mechanism built from the observations, which are the computed features.
+From these features we then follow the research structure of q -> p, using a mechanism that is testable and must be updatable.
+"""
+
 class BasePricingEngine(ABC):
     """base interface for all pricing engines"""
     def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
@@ -12,6 +18,7 @@ class BasePricingEngine(ABC):
         self.rng = np.random.default_rng(seed)
         self.step_count = 0
 
+
     @abstractmethod
     def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
         """compute new prices given current state and observation from environment
diff --git a/sim/rl/train.py b/sim/rl/train.py
index 41a87ab..ba257de 100644
--- a/sim/rl/train.py
+++ b/sim/rl/train.py
@@ -39,7 +39,7 @@ class EngineTrainer:
         obs, _ = self.env.reset(seed=seed)
         prices = None
         for ep in range(n_episodes):
-            prices = self.engine.compute_prices(prices, obs
+            prices = self.engine.compute_prices(prices, obs)
             obs, reward, done, _, info = self.env.step(prices)
             self.engine.update(obs, reward, done, info)
         return self

From ec4cf074e65478c32bea71a2f39e4eeda714a15f Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Fri, 9 Jan 2026 20:20:31 +0100
Subject: [PATCH 71/99] feature: MDP behavior mappers (unlinked)

---
 sim/rl/behavior_loader/loader.py |  63 ++++++++++++++
 sim/rl/behavior_loader/models.py | 137 +++++++++++++++++++++++++++++++
 2 files changed, 200 insertions(+)
 create mode 100644 sim/rl/behavior_loader/loader.py
 create mode 100644 sim/rl/behavior_loader/models.py

diff --git a/sim/rl/behavior_loader/loader.py b/sim/rl/behavior_loader/loader.py
new file mode 100644
index 0000000..99a1541
--- /dev/null
+++ b/sim/rl/behavior_loader/loader.py
@@ -0,0 +1,63 @@
+import os
+from pydantic import BaseModel as Base
+import json
+
+class PayloadModel(Base):
+    sessionId: str
+    experimentId: str | None
+    eventName: str
+    page: str | None
+    productId: str | None
+    metadata: dict
+    storeMode: str
+    userAgent: str
+    ts: str
+
+class ValueModel(Base):
+    payload: PayloadModel
+    encoding: str
+    isPayloadNull: bool
+    schemaId: int
+    size: int
+
+class InteractionModel(Base):
+    partitionID: int
+    offset: int
+    timestamp: int
+    compression: str
+    isTransactional: bool
+    headers: list
+    key: dict
+    value: ValueModel
+
+class Loader:
+    def __init__(self, src_dir: str):
+        self.src_dir = src_dir
+        self.entries = os.listdir(src_dir)
+        if not self.entries: raise ValueError("empty directory")
+        self.data = self._load_sessions()
+
+    def _is_admin_page(self, interaction: InteractionModel) -> bool:  # True only for pages under /admin/
+        page = interaction.value.payload.page  # declared `str | None` on PayloadModel
+        return bool(page and page.startswith("/admin/"))  # coerce None/"" to False so the -> bool contract holds
+
+    def _load_sessions(self) -> dict:  # returns {entry name: [InteractionModel, ...]} with /admin/ pages dropped
+        sessions = {}
+        for entry in self.entries:
+            with open(f"{self.src_dir}/{entry}/int.json", encoding="utf-8") as fh:  # closes the handle deterministically
+                raw = json.load(fh)
+            ints = [InteractionModel(**i) for i in raw]  # validate every raw record against the schema
+            sessions[entry] = [i for i in ints if not self._is_admin_page(i)]
+        return sessions
+
+    def get_data(self) -> dict:
+        return self.data
+
+    def get_entries(self) -> tuple[list[str], int]:
+        return self.entries, len(self.entries)
+
+if __name__ == "__main__":
+    DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
+    loader = Loader(DIR)
+    _, n = loader.get_entries()
+    print(f"Loaded {n} sessions from {DIR}")
diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py
new file mode 100644
index 0000000..f8e92b7
--- /dev/null
+++ b/sim/rl/behavior_loader/models.py
@@ -0,0 +1,137 @@
+from loader import Loader
+from collections import defaultdict
+from typing import Dict, List, Tuple, Set
+import numpy as np
+import graphviz
+
+DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
+
+class BehaviorModel:
+    def __init__(self, src_dir: str = DIR):
+        self.loader = Loader(src_dir)
+        self.data = self.loader.get_data()
+        self.entries, self.num_entries = self.loader.get_entries()
+        self.mdp = None
+
+    def _state_repr(self, evt) -> str:
+        p = evt.value.payload
+        return f"{p.page or 'unk'}|{p.productId or 'none'}|{p.eventName}"
+
+    def _extract_sessions(self):
+        # transform raw events into sequential state trajectories per session
+        trajectories = []
+        for sid, evts in self.data.items():
+            if len(evts) < 2: continue
+            states = [self._state_repr(e) for e in sorted(evts, key=lambda x: x.timestamp)]
+            trajectories.append(states)
+        return trajectories
+
+    def _calc_transitions(self, trajectories: List[List[str]]) -> Tuple[Dict, Set]:
+        trans = defaultdict(lambda: defaultdict(int))
+        states = set()
+        for traj in trajectories:
+            for i in range(len(traj) - 1):
+                s, s_next = traj[i], traj[i+1]
+                trans[s][s_next] += 1
+                states.update([s, s_next])
+        return trans, states
+
+    def _calc_rewards(self, trajectories: List[List[str]]) -> Dict:
+        # reward based on session progression depth
+        rwd = defaultdict(list)
+        for traj in trajectories:
+            n = len(traj)
+            for i, s in enumerate(traj):
+                rwd[s].append(i / n)
+        return rwd
+
+    def _normalize_trans(self, counts: Dict) -> Dict:  # {s: {s_next: count}} -> {s: {s_next: probability}}
+        totals = {s: sum(nxt.values()) for s, nxt in counts.items()}  # one row total per source state, not one per successor
+        return {s: {s_n: cnt/totals[s] for s_n, cnt in nxt.items()} for s, nxt in counts.items()}
+
+    def build_MDP(self) -> Dict:
+        trajs = self._extract_sessions()
+        trans_cnt, states = self._calc_transitions(trajs)
+        trans_prob = self._normalize_trans(trans_cnt)
+        state_rwd = self._calc_rewards(trajs)
+        state_val = {s: np.mean(r) for s, r in state_rwd.items()}
+
+        self.mdp = {
+            'states': sorted(list(states)),
+            'num_states': len(states),
+            'transitions': trans_prob,
+            'state_values': state_val,
+            'state_rewards': state_rwd,
+            'trans_counts': trans_cnt,
+        }
+        return self.mdp
+
+    def transition_prob(self, s: str, s_next: str) -> float:
+        if not self.mdp: raise ValueError("build MDP first")
+        return self.mdp['transitions'].get(s, {}).get(s_next, 0.0)
+
+    def state_value(self, s: str) -> float:
+        if not self.mdp: raise ValueError("build MDP first")
+        return self.mdp['state_values'].get(s, 0.0)
+
+    def sample_traj(self, start: str, max_len: int = 50) -> List[str]:
+        if not self.mdp: raise ValueError("build MDP first")
+        path = [start]
+        curr = start
+        for _ in range(max_len):
+            nxt = self.mdp['transitions'].get(curr, {})
+            if not nxt: break
+            curr = np.random.choice(list(nxt.keys()), p=list(nxt.values()))
+            path.append(curr)
+        return path
+
+def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "mdp_graph", fmt: str = "svg", view: bool = False):
+    """visualize MDP as directed graph using graphviz, aggregated by event type"""
+    if not model.mdp: raise ValueError("build MDP first")
+
+    # aggregate transitions by event type
+    evt_trans = defaultdict(lambda: defaultdict(float))
+    for s, trans in model.mdp['transitions'].items():
+        evt_src = s.split('|')[2]
+        for s_next, prob in trans.items():
+            evt_dst = s_next.split('|')[2]
+            evt_trans[evt_src][evt_dst] += prob
+
+    # normalize aggregated transitions
+    for evt_src in evt_trans:
+        total = sum(evt_trans[evt_src].values())
+        if total > 0:
+            for evt_dst in evt_trans[evt_src]:
+                evt_trans[evt_src][evt_dst] /= total
+
+    g = graphviz.Digraph(format=fmt)
+    g.attr(rankdir='LR', size='30')
+    g.attr('node', shape='circle', width='1', height='1')
+
+    # collect all event types
+    events = set(evt_trans.keys())
+    for trans in evt_trans.values():
+        events.update(trans.keys())
+
+    # add nodes for each event type
+    for evt in events:
+        g.node(evt)
+
+    # add edges above threshold
+    for evt_src in evt_trans:
+        for evt_dst, prob in evt_trans[evt_src].items():
+            if prob > threshold:
+                g.edge(evt_src, evt_dst, label=f'{prob:.2f}')
+
+    g.render(output, view=view, cleanup=True)
+    print(f"Saved MDP graph to {output}.{fmt}")
+    return g
+
+if __name__ == "__main__":
+    model = BehaviorModel(DIR)
+    mdp = model.build_MDP()
+    print(f"Built MDP: {mdp['num_states']} states, {sum(len(t) for t in mdp['transitions'].values())} transitions")
+    if not mdp['states']:
+        print("No states found")
+        exit(1)
+    visualize_mdp(model, threshold=0.05, output="mdp_viz", fmt="svg")

From 131323ef56984229063ce1efca763615f51cb5d0 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Sat, 10 Jan 2026 10:33:56 +0100
Subject: [PATCH 72/99] feature: dot exporter

---
 sim/rl/behavior_loader/models.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py
index f8e92b7..6e4201e 100644
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -85,7 +85,7 @@ class BehaviorModel:
             path.append(curr)
         return path
 
-def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "mdp_graph", fmt: str = "svg", view: bool = False):
+def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "mdp_graph", fmt: str = "svg", view: bool = False, export_dot: bool = False):
     """visualize MDP as directed graph using graphviz, aggregated by event type"""
     if not model.mdp: raise ValueError("build MDP first")
 
@@ -125,6 +125,13 @@ def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "
 
     g.render(output, view=view, cleanup=True)
     print(f"Saved MDP graph to {output}.{fmt}")
+
+    if export_dot:
+        dot_file = f"{output}.dot"
+        with open(dot_file, 'w') as f:
+            f.write(g.source)
+        print(f"Exported DOT source to {dot_file}")
+
     return g
 
 if __name__ == "__main__":
@@ -134,4 +141,4 @@ if __name__ == "__main__":
     if not mdp['states']:
         print("No states found")
         exit(1)
-    visualize_mdp(model, threshold=0.05, output="mdp_viz", fmt="svg")
+    visualize_mdp(model, threshold=0.05, output="mdp_viz", fmt="svg", export_dot=True)

From f9bf3de71eb691c9182c46f091ffe25b80d402de Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Sat, 10 Jan 2026 11:48:03 +0100
Subject: [PATCH 73/99] pdf rendering

---
 sim/rl/behavior_loader/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py
index 6e4201e..bce2429 100644
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -141,4 +141,4 @@ if __name__ == "__main__":
     if not mdp['states']:
         print("No states found")
         exit(1)
-    visualize_mdp(model, threshold=0.05, output="mdp_viz", fmt="svg", export_dot=True)
+    visualize_mdp(model, threshold=0.05, output="mdp_viz", fmt="pdf", export_dot=True)

From 8b429b7a8e5a7c1e671c62afa34b013358be4208 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Mon, 12 Jan 2026 10:09:55 +0100
Subject: [PATCH 74/99] chore: refactor to better map end to end

---
 backend/provider/app.py                 | 71 ++++++++++++-------------
 experiments/procesing/pricers/simple.py | 59 +++++++++++++++++---
 experiments/procesing/steps/session.py  |  1 +
 lib/model_registry.py                   | 46 ++++++++++++++++
 web/src/app/api/pricing/route.ts        | 37 +++++++------
 5 files changed, 153 insertions(+), 61 deletions(-)

diff --git a/backend/provider/app.py b/backend/provider/app.py
index fb72a9d..6f9a55d 100644
--- a/backend/provider/app.py
+++ b/backend/provider/app.py
@@ -47,53 +47,52 @@ def health() -> dict:
 
 @app.get("/api/{mode}/price/{productId}", response_model=PriceResponse)
 def get_price(mode: Literal['hotel', 'airline'], productId: str, sessionId: Optional[str] = Query(None), experimentId: Optional[str] = Query(None)):
+    """
+    THIS is the fast lookup service (mechanism).
+    Priority: session-keyed price > global optimal price > base price
+    """
     product = supabase.table(f'{mode}_products').select("metadata").eq('id', productId).execute().data[0]
     if not product: raise HTTPException(404, f"Product {productId} not found")
 
     metadata = product['metadata']
     base_price = metadata.get('base_price', 100.0)
 
-    # fetch pre-computed prices from registry
+    # PRIORITY 1: session-aware price (computed by Airflow worker)
+    if sessionId:
+        session_price = registry.get_session_price(sessionId, productId)
+        if session_price is not None:
+            return PriceResponse(
+                productId=productId,
+                price=session_price,
+                base_price=base_price,
+                markup=session_price/base_price,
+                elasticity=None,
+                model_version='session-aware'
+            )
+
+    # PRIORITY 2: global pre-computed prices (surge pricing)
     prices_df = registry.get_prices('latest')
-    elasticity_df = registry.get_elasticity('latest')
-
-    if prices_df is None:
-        # fallback: no pre-computed prices available
-        return PriceResponse(
-            productId=productId,
-            price=base_price,
-            base_price=base_price,
-            markup=1.0,
-            elasticity=None
-        )
-
-    # lookup pre-computed price for this product
-    product_price_row = prices_df[prices_df['productId'] == productId]
-    if product_price_row.empty:
-        # product not in pre-computed prices, fallback to base
-        return PriceResponse(
-            productId=productId,
-            price=base_price,
-            base_price=base_price,
-            markup=1.0,
-            elasticity=None
-        )
-
-    optimal_price = float(product_price_row['optimal_price'].iloc[0]) # TODO: use optimal_price everywhere as  aresult
-
-    # get elasticity if available
-    product_elasticity = None
-    if elasticity_df is not None:
-        product_elasticity_row = elasticity_df[elasticity_df['productId'] == productId]
-        if not product_elasticity_row.empty:
-            product_elasticity = float(product_elasticity_row['elasticity'].iloc[0])
+    if prices_df is not None:
+        product_price_row = prices_df[prices_df['productId'] == productId]
+        if not product_price_row.empty:
+            optimal_price = float(product_price_row['optimal_price'].iloc[0])
+            return PriceResponse(
+                productId=productId,
+                price=optimal_price,
+                base_price=base_price,
+                markup=optimal_price/base_price,
+                elasticity=None,
+                model_version='surge'
+            )
 
+    # PRIORITY 3: fallback to base price
     return PriceResponse(
         productId=productId,
-        price=optimal_price,
+        price=base_price,
         base_price=base_price,
-        markup=optimal_price/base_price,
-        elasticity=product_elasticity
+        markup=1.0,
+        elasticity=None,
+        model_version='base'
     )
 
 @app.get("/models")
diff --git a/experiments/procesing/pricers/simple.py b/experiments/procesing/pricers/simple.py
index 39be37a..6bdd1ca 100644
--- a/experiments/procesing/pricers/simple.py
+++ b/experiments/procesing/pricers/simple.py
@@ -3,6 +3,46 @@ import pandas as pd
 from procesing.pricers.base import PricingFunction
 
 
+def session_features_to_demand(session_features: pd.DataFrame) -> float:
+    """
+    Map session behavioral features to demand proxy.
+    THIS is the critical θ̂ → D transformation for rule-based pricing.
+
+    Logic:
+      - High velocity → agent behavior → price up (revenue recovery)
+      - High cart ratio → purchase intent → price up
+      - Low activity → score stays at the 1.0 baseline
+
+    Returns: demand proxy score (1-20 range, higher = more demand)
+    """
+    if session_features.empty:
+        return 1.0
+
+    feat = session_features.iloc[0]  # non-empty is guaranteed above; only the first row is scored
+
+    velocity = feat.get('interaction_velocity', 0)
+    cart_ratio = feat.get('cart_to_view_ratio', 0)
+    item_views = feat.get('item_views', 0)
+    cart_adds = feat.get('cart_adds', 0)
+
+    # baseline demand
+    demand = 1.0
+
+    # agent detection: high velocity → treat as high "demand" to price up
+    if velocity > 2.0:
+        demand += 10.0  # strong agent signal
+
+    # conversion intent: cart interaction → price up
+    if cart_ratio > 0.1 or cart_adds > 0:
+        demand += 5.0
+
+    # browsing depth: many views → interest signal
+    if item_views > 3:
+        demand += min(item_views, 5.0)  # contribution is capped at 5
+
+    return min(demand, 20.0)  # cap at 20
+
+
 class StaticPricer(PricingFunction):
     """Static pricing: always return fixed base prices"""
 
@@ -67,21 +107,24 @@ class SimpleSurgePricer(PricingFunction):
         self.surge_multiplier = surge_multiplier
         self.discount_multiplier = discount_multiplier
 
-    def fit(self, market_data : pd.DataFrame):
+    def fit(self, market_data: pd.DataFrame):
         """Extract base prices from product catalog or historical averages"""
         self.base_prices = market_data['base_price'].to_numpy() if 'base_price' in market_data.columns else market_data['price'].values
-        self.demand_history = market_data['demand'].to_numpy() if 'demand' in market_data.columns else np.zeros_like(self.base_prices)
+        return self
 
-    def predict(self) -> np.ndarray:
+    def predict(self, state_space) -> np.ndarray:
         """
         Adjust prices based on current demand using surge rules.
-        state_space.demand: demand counts per product
-        state_space.prices: current prices (fallback if base_prices not set)
+        state_space.demand: demand proxy per product (from session features)
+        state_space.prices: base prices
         """
-        current_prices = self.base_prices if self.base_prices is not None else np.ones_like(demand_vector) * 99.99
-        demand = self.demand_history if self.demand_history is not None else np.zeros_like(current_prices)
-        new_prices = current_prices.copy()
+        demand = np.asarray(state_space.demand) if state_space and hasattr(state_space, 'demand') else np.array([0])
+        base = np.asarray(state_space.prices) if state_space and hasattr(state_space, 'prices') else self.base_prices
 
+        if base is None:
+            base = np.ones(len(demand)) * 99.99
+
+        new_prices = base.copy()
         high_mask = demand >= self.high_threshold
         new_prices[high_mask] *= self.surge_multiplier
 
diff --git a/experiments/procesing/steps/session.py b/experiments/procesing/steps/session.py
index 4b950aa..ec6f27c 100644
--- a/experiments/procesing/steps/session.py
+++ b/experiments/procesing/steps/session.py
@@ -135,6 +135,7 @@ class ExtractSessionFeaturesStep(BaseContextStep):
     Vectorized session feature extraction - replaces O(n^2) per-row loop.
     Input: interactions_df
     Output: session-level feature matrix
+    THIS is our main mapping from tau (trajectory) to a feature vector theta - we need to do this very well. This is what will go into demand estimation.
     """
 
     def transform(self, X: pd.DataFrame) -> pd.DataFrame:
diff --git a/lib/model_registry.py b/lib/model_registry.py
index 92d7934..e833a1a 100755
--- a/lib/model_registry.py
+++ b/lib/model_registry.py
@@ -178,3 +178,49 @@ class ModelRegistry:
             return True
         except:
             return False
+
+    def set_session_prices(self, session_id: str, prices: Dict[str, float], ttl: int = 1800):
+        """
+        Store prices for a specific session.
+        THIS is the write path for session-aware pricing.
+
+        Args:
+            session_id: session identifier
+            prices: dict of {productId: price}
+            ttl: time-to-live in seconds (default 30min)
+        """
+        if not prices:
+            return
+
+        key = f"session:{session_id}:prices"
+        # use Redis hash for O(1) lookup per product
+        self.redis_client.hset(key, mapping={k: str(v) for k, v in prices.items()})
+        self.redis_client.expire(key, ttl)
+
+    def get_session_price(self, session_id: str, product_id: str) -> Optional[float]:
+        """
+        Lookup price for (sessionId, productId).
+        THIS is the read path for fast provider lookup.
+
+        Returns: price or None if not found
+        """
+        key = f"session:{session_id}:prices"
+        price_str = self.redis_client.hget(key, product_id)
+
+        if price_str is None:
+            return None
+
+        return float(price_str.decode('utf-8') if isinstance(price_str, bytes) else price_str)
+
+    def get_session_all_prices(self, session_id: str) -> Dict[str, float]:
+        """Get all prices for a session."""
+        key = f"session:{session_id}:prices"
+        prices_raw = self.redis_client.hgetall(key)
+
+        if not prices_raw:
+            return {}
+
+        return {
+            (k.decode('utf-8') if isinstance(k, bytes) else k): float(v.decode('utf-8') if isinstance(v, bytes) else v)
+            for k, v in prices_raw.items()
+        }
diff --git a/web/src/app/api/pricing/route.ts b/web/src/app/api/pricing/route.ts
index 1aec75b..6532131 100644
--- a/web/src/app/api/pricing/route.ts
+++ b/web/src/app/api/pricing/route.ts
@@ -30,6 +30,8 @@ export async function GET(req: NextRequest) {
     const providerUrl = process.env.PRICING_PROVIDER_URL || 'http://localhost:5001';
     try {
         const queryParams = new URLSearchParams();
+        // THIS is our entry point into dynamic pricing: we reference the context of the session and experiment and ask for a price to assign to the expressed trajectory.
+        // The whole pipeline gets triggered from here.
         if (sessionId) queryParams.append('sessionId', sessionId);
         if (experimentId) queryParams.append('experimentId', experimentId);
 
@@ -55,25 +57,26 @@ export async function GET(req: NextRequest) {
         price = Math.round(randomBase * 100) / 100;
     }
 
-    // log price to kafka for elasticity computation
+    // log price to kafka asynchronously (non-blocking)
     if (sessionId) {
         const backendUrl = process.env.BACKEND_URL || 'http://localhost:5000';
-        try {
-            await fetch(`${backendUrl}/api/kafka/price-log`, {
-                method: 'POST',
-                headers: { 'Content-Type': 'application/json' },
-                body: JSON.stringify({
-                    productId,
-                    price,
-                    sessionId,
-                    experimentId: experimentId || undefined,
-                    storeMode,
-                    ts: timestamp,
-                }),
-            });
-        } catch (err) {
-            console.error('[price-log-error]', err);
-        }
+        // fire and forget - don't await to avoid blocking response
+        fetch(`${backendUrl}/api/kafka/price-log`, {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({
+                productId,
+                price,
+                sessionId,
+                experimentId: experimentId || undefined,
+                storeMode,
+                ts: timestamp,
+            }),
+        }).catch(err => {
+            if (process.env.NODE_ENV === 'development') {
+                console.error('[price-log-error]', err);
+            }
+        });
     }
 
     if (process.env.NODE_ENV === 'development') {

From 62a4008c29f94b753089de69ba84bfb6eea8a0cc Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Mon, 12 Jan 2026 13:37:48 +0100
Subject: [PATCH 75/99] feat: integration of pipeline hooks into testing

---
 Makefile                                      |  2 ++
 backend/server/app.py                         |  6 ++++-
 docker-compose.yml                            |  2 ++
 .../airflow/dags/surge_pricing_pipeline.py    | 24 +++++++++++++++----
 experiments/procesing/pricers/simple.py       |  3 ++-
 tests/e2e/helpers/kafka.ts                    |  4 ++--
 tests/e2e/playwright.config.ts                |  4 ++--
 tests/e2e/scenarios/session-aware.spec.ts     | 21 ++++++++++------
 tests/e2e/scenarios/surge-pricing.spec.ts     | 11 +++++++--
 9 files changed, 58 insertions(+), 19 deletions(-)

diff --git a/Makefile b/Makefile
index 0c51bb3..879afb5 100644
--- a/Makefile
+++ b/Makefile
@@ -49,8 +49,10 @@ test.backend: $(VENV)
 test.e2e:
 	@cd tests/e2e && npm install
 	@cd tests/e2e && npx playwright install chromium
+	@test -f tests/e2e/.env || cp tests/e2e/.env.example tests/e2e/.env
 	@timeout 30 bash -c 'until curl -sf http://localhost:5000/health > /dev/null 2>&1; do sleep 1; done' || (echo "Backend not ready" && exit 1)
 	@timeout 30 bash -c 'until curl -sf http://localhost:3000 > /dev/null 2>&1; do sleep 1; done' || (echo "Web app not ready" && exit 1)
+	@timeout 30 bash -c 'until curl -sf http://localhost:8085/health > /dev/null 2>&1; do sleep 1; done' || (echo "Airflow not ready" && exit 1)
 	@cd tests/e2e && npm test
 
 .PHONY: test.all
diff --git a/backend/server/app.py b/backend/server/app.py
index d338408..f100811 100644
--- a/backend/server/app.py
+++ b/backend/server/app.py
@@ -198,12 +198,16 @@ def dump_logs(
             auto_offset_reset='earliest',
             enable_auto_commit=False,
             value_deserializer=lambda x: json.loads(x.decode('utf-8')),
-            consumer_timeout_ms=5000
+            consumer_timeout_ms=30000,
+            fetch_max_wait_ms=10000,
+            max_poll_records=1000
         )
 
         events = []
         for msg in consumer:
             events.append(msg.value)
+            if last_n and len(events) >= last_n * 2:
+                break
 
         consumer.close()
 
diff --git a/docker-compose.yml b/docker-compose.yml
index f72f415..561c393 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -144,6 +144,7 @@ services:
       - AIRFLOW__CORE__ENABLE_XCOM_PICKLING=true
       - AIRFLOW__WEBSERVER__EXPOSE_CONFIG=true
       - AIRFLOW__WEBSERVER__SECRET_KEY=${AIRFLOW_SECRET_KEY}
+      - AIRFLOW__API__AUTH_BACKENDS=airflow.api.auth.backend.basic_auth
       - KAFKA_HOST=kafka
       - KAFKA_PORT=29092
       - BACKEND_URL=http://backend:5000
@@ -180,6 +181,7 @@ services:
       - AIRFLOW__CORE__LOAD_EXAMPLES=false
       - AIRFLOW__CORE__ENABLE_XCOM_PICKLING=true
       - AIRFLOW__WEBSERVER__SECRET_KEY=${AIRFLOW_SECRET_KEY}
+      - AIRFLOW__API__AUTH_BACKENDS=airflow.api.auth.backend.basic_auth
       - KAFKA_HOST=kafka
       - KAFKA_PORT=29092
       - BACKEND_URL=http://backend:5000
diff --git a/experiments/airflow/dags/surge_pricing_pipeline.py b/experiments/airflow/dags/surge_pricing_pipeline.py
index b1d7c61..1a3b3d0 100644
--- a/experiments/airflow/dags/surge_pricing_pipeline.py
+++ b/experiments/airflow/dags/surge_pricing_pipeline.py
@@ -120,15 +120,31 @@ def apply_surge_pricing(**kwargs):
     # rename demand_score to demand for pricer compatibility
     data = product_features.rename(columns={'demand_score': 'demand'})
 
+    high_thresh = dag_conf.get('high_threshold', 10)
+    low_thresh = dag_conf.get('low_threshold', 2)
+    surge_mult = dag_conf.get('surge_multiplier', 1.2)
+    discount_mult = dag_conf.get('discount_multiplier', 0.9)
+
+    logging.info(f"Surge pricing config: high_thresh={high_thresh}, low_thresh={low_thresh}, surge_mult={surge_mult}, discount_mult={discount_mult}")
+    logging.info(f"Demand stats: min={data['demand'].min():.2f}, max={data['demand'].max():.2f}, mean={data['demand'].mean():.2f}")
+    logging.info(f"Products with high demand (>={high_thresh}): {(data['demand'] >= high_thresh).sum()}")
+    logging.info(f"Products with low demand (<={low_thresh}): {(data['demand'] <= low_thresh).sum()}")
+
     surge_pricer = SimpleSurgePricer(
-        high_threshold=dag_conf.get('high_threshold', 10),
-        low_threshold=dag_conf.get('low_threshold', 2),
-        surge_multiplier=dag_conf.get('surge_multiplier', 1.2),
-        discount_multiplier=dag_conf.get('discount_multiplier', 0.9)
+        high_threshold=high_thresh,
+        low_threshold=low_thresh,
+        surge_multiplier=surge_mult,
+        discount_multiplier=discount_mult
     )
     surge_pricer.fit(data)
     data['optimal_price'] = surge_pricer.predict()
 
+    base_avg = data['base_price'].mean()
+    optimal_avg = data['optimal_price'].mean()
+    price_change_pct = ((optimal_avg - base_avg) / base_avg) * 100
+
+    logging.info(f"Price adjustment: base_avg={base_avg:.2f}, optimal_avg={optimal_avg:.2f}, change={price_change_pct:+.1f}%")
+
     prices_df = data[['productId', 'price', 'base_price', 'optimal_price', 'demand']].rename(columns={
         'price': 'current_price',
         'demand': 'demand_score'
diff --git a/experiments/procesing/pricers/simple.py b/experiments/procesing/pricers/simple.py
index 6bdd1ca..1a03f9f 100644
--- a/experiments/procesing/pricers/simple.py
+++ b/experiments/procesing/pricers/simple.py
@@ -124,7 +124,8 @@ class SimpleSurgePricer(PricingFunction):
         if base is None:
             base = np.ones(len(demand)) * 99.99
 
-        new_prices = base.copy()
+        # ensure float dtype to allow multiplication by float multipliers
+        new_prices = base.astype(np.float64).copy()
         high_mask = demand >= self.high_threshold
         new_prices[high_mask] *= self.surge_multiplier
 
diff --git a/tests/e2e/helpers/kafka.ts b/tests/e2e/helpers/kafka.ts
index c0a95dd..18b977d 100644
--- a/tests/e2e/helpers/kafka.ts
+++ b/tests/e2e/helpers/kafka.ts
@@ -9,8 +9,8 @@ interface InteractionEvent {
 const dumpKafkaTopic = async (backendUrl: string, topic: string) => {
   const resp = await fetch(`${backendUrl}/api/kafka/dump?topic=${topic}`);
   if (!resp.ok) throw new Error(`Kafka dump failed: ${resp.status}`);
-  const { messages = [] } = await resp.json();
-  return messages as any[];
+  const { data = [] } = await resp.json();
+  return data as any[];
 };
 
 export const waitForInteractionEvent = async (
diff --git a/tests/e2e/playwright.config.ts b/tests/e2e/playwright.config.ts
index 54a5561..dc3c815 100644
--- a/tests/e2e/playwright.config.ts
+++ b/tests/e2e/playwright.config.ts
@@ -5,14 +5,14 @@ export default defineConfig({
   fullyParallel: true,
   forbidOnly: !!process.env.CI,
   retries: 0,
-  workers: 5,
+  workers: 1,
   reporter: 'list',
   use: {
     baseURL: process.env.WEB_URL || 'http://localhost:3000',
     trace: 'retain-on-failure',
     screenshot: 'only-on-failure',
   },
-  timeout: 60000,
+  timeout: 180000,
   expect: {
     timeout: 10000,
   },
diff --git a/tests/e2e/scenarios/session-aware.spec.ts b/tests/e2e/scenarios/session-aware.spec.ts
index b204984..5c27747 100644
--- a/tests/e2e/scenarios/session-aware.spec.ts
+++ b/tests/e2e/scenarios/session-aware.spec.ts
@@ -9,6 +9,7 @@ import {
   addToCart,
 } from '../helpers/interactions';
 import { getSessionEvents } from '../helpers/kafka';
+import { runSessionPricing } from '../helpers/airflow';
 
 test.describe('SessionAwarePricer E2E', () => {
   const STORE_TYPE = 'hotel';
@@ -23,6 +24,9 @@ test.describe('SessionAwarePricer E2E', () => {
     await page.waitForTimeout(1500);
 
     const productId2 = await humanLikeViewProduct(page, STORE_TYPE);
+
+    await runSessionPricing(STORE_TYPE);
+
     const secondPrice = await getPriceFromDOM(page);
     expect(await verifySessionConsistency(page, sessionId)).toBeTruthy();
 
@@ -40,11 +44,13 @@ test.describe('SessionAwarePricer E2E', () => {
     await rapidViewProductViaFlow(page, 8, 100, STORE_TYPE);
     expect(await verifySessionConsistency(page, sessionId)).toBeTruthy();
 
-    await page.waitForTimeout(2500);
+    await page.waitForTimeout(1000);
 
     const events = await getSessionEvents(backendUrl, sessionId);
     expect(events.length).toBeGreaterThanOrEqual(8);
 
+    await runSessionPricing(STORE_TYPE);
+
     await page.goto(`/products/${productId}`);
     await page.waitForLoadState('networkidle');
     const agentPrice = await getPriceFromDOM(page);
@@ -59,14 +65,12 @@ test.describe('SessionAwarePricer E2E', () => {
     const productId = await viewProductViaFlow(page, STORE_TYPE);
     const baselinePrice = await getPriceFromDOM(page);
 
-    const startTime = Date.now();
     await rapidViewProductViaFlow(page, 10, 80, STORE_TYPE);
-    const duration = (Date.now() - startTime) / 1000;
 
-    const eventsPerSec = 10 / duration;
-    expect(eventsPerSec).toBeGreaterThan(2.0);
+    const events = await getSessionEvents(backendUrl, sessionId);
+    expect(events.length).toBeGreaterThanOrEqual(10);
 
-    await page.waitForTimeout(2000);
+    await runSessionPricing(STORE_TYPE);
 
     await page.goto(`/products/${productId}`);
     await page.waitForLoadState('networkidle');
@@ -105,8 +109,11 @@ test.describe('SessionAwarePricer E2E', () => {
 
     await rapidViewProductViaFlow(page, 2, 150, STORE_TYPE);
 
-    await page.waitForTimeout(1500);
+    await page.waitForTimeout(1000);
     await humanLikeViewProduct(page, STORE_TYPE);
+
+    await runSessionPricing(STORE_TYPE);
+
     const finalPrice = await getPriceFromDOM(page);
 
     expect(Math.abs(finalPrice - baselinePrice) / baselinePrice).toBeLessThan(0.3);
diff --git a/tests/e2e/scenarios/surge-pricing.spec.ts b/tests/e2e/scenarios/surge-pricing.spec.ts
index e3e2f8d..26d29d3 100644
--- a/tests/e2e/scenarios/surge-pricing.spec.ts
+++ b/tests/e2e/scenarios/surge-pricing.spec.ts
@@ -7,6 +7,7 @@ import {
   verifySessionConsistency,
 } from '../helpers/interactions';
 import { waitForInteractionEvent, countProductViews } from '../helpers/kafka';
+import { runSurgePricing } from '../helpers/airflow';
 
 test.describe('SimpleSurgePricer E2E', () => {
   const STORE_TYPE = 'hotel';
@@ -29,7 +30,7 @@ test.describe('SimpleSurgePricer E2E', () => {
 
     await rapidViewProductViaFlow(page, 5, 200, STORE_TYPE);
 
-    await page.waitForTimeout(2000);
+    await page.waitForTimeout(1000);
 
     const evt = await waitForInteractionEvent(backendUrl, sessionId, 'view_item_page');
     expect(evt).not.toBeNull();
@@ -37,6 +38,8 @@ test.describe('SimpleSurgePricer E2E', () => {
     const viewCount = await countProductViews(backendUrl, productId);
     expect(viewCount).toBeGreaterThanOrEqual(5);
 
+    await runSurgePricing(STORE_TYPE, 3, 1);
+
     await page.goto(`/products/${productId}`);
     await page.waitForLoadState('networkidle');
     const surgedPrice = await getPriceFromDOM(page);
@@ -72,7 +75,9 @@ test.describe('SimpleSurgePricer E2E', () => {
 
     await rapidViewProductViaFlow(page, 5, 150, STORE_TYPE);
 
-    await page.waitForTimeout(1500);
+    await page.waitForTimeout(1000);
+
+    await runSurgePricing(STORE_TYPE, 3, 1);
 
     await page.goto(`/products/${productId}`);
     await page.waitForLoadState('networkidle');
@@ -81,6 +86,8 @@ test.describe('SimpleSurgePricer E2E', () => {
 
     await page.waitForTimeout(12000);
 
+    await runSurgePricing(STORE_TYPE, 3, 1);
+
     await page.goto(`/products/${productId}`);
     await page.waitForLoadState('networkidle');
     const decayedPrice = await getPriceFromDOM(page);

From e89cb263d49375f0b0a628810d41c86617ae5386 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Mon, 12 Jan 2026 20:59:09 +0100
Subject: [PATCH 76/99] planning

---
 .../airflow/dags/surge_pricing_factory.py     | 10 +++++++
 experiments/procesing/pricers/base.py         | 29 +++++++++----------
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/experiments/airflow/dags/surge_pricing_factory.py b/experiments/airflow/dags/surge_pricing_factory.py
index a886d5b..b61e65c 100644
--- a/experiments/airflow/dags/surge_pricing_factory.py
+++ b/experiments/airflow/dags/surge_pricing_factory.py
@@ -1,3 +1,4 @@
+from pandas.core.algorithms import factorize_array
 from airflow import DAG
 from airflow.operators.python import PythonOperator
 from airflow.utils.dates import days_ago
@@ -208,3 +209,12 @@ def create_surge_pricing_dag(store_mode: str) -> DAG:
 # instantiate DAGs for Airflow to discover
 dag_airline = create_surge_pricing_dag('airline')
 dag_hotel = create_surge_pricing_dag('hotel')
+
+# TODO: Refactor this factory from a surge pricing factory to a general pricing factory
+# We will do this by passing a pricing strategy class to the factory, since the generic pipeline is:
+# take all interaction data, group by sessionId and assign a new price vector to each session
+# in the grouping we get a subset of the interactions per sessionId and we can map that to some Features
+# we define a custom _get_features(interactions, ...) method in the strategy class
+# we then run only the inference which is the .predict(trajectory) per-session which will give us a new price vector
+# this we then publish for each sessionId group
+# this might include now deleting most of the pricers we have defined and starting with a super simple surge-pricing algorithm that is no-fit, only predict. This we can then test end-to-end and observe changes to prices according to a desired strategy - we have to define this one as a very short-term strategy because we run sessions that take only a few minutes.
diff --git a/experiments/procesing/pricers/base.py b/experiments/procesing/pricers/base.py
index 6569556..ecaabed 100644
--- a/experiments/procesing/pricers/base.py
+++ b/experiments/procesing/pricers/base.py
@@ -7,15 +7,6 @@ import pandas as pd
 class PricingFunction(ABC):
     """
     Abstract base for pricing functions.
-
-    Defines mapping: f(Q_t, P_t, S_t, H_t) -> P_{t+1}
-
-    Where:
-        Q_t ∈ R^n: demand vector at time t
-        P_t ∈ R^n: price vector at time t
-        S_t: session features (behavioral signals, interactions)
-        H_t = {Q_{t-k}, P_{t-k}, S_{t-k}}: historical state trajectory
-
     Objective:
         maximize E[R_T] = E[Σ P_t^T · Q_t]
         subject to:
@@ -28,10 +19,10 @@ class PricingFunction(ABC):
     def fit(self, *kwargs):
         """
         Offline training on historical data.
+        This is where we can think about some maximization of expected revenue
+        over historical trajectories to learn parameters of the pricing function.
+        (This, however, we cover more on the RL side of things.)
 
-        Args:
-            historical_data: DataFrame with elasticity, prices, demand signals
-            **kwargs: additional training parameters
         """
         pass
 
@@ -39,12 +30,18 @@ class PricingFunction(ABC):
     def predict(self, *kwargs) -> np.ndarray:
         """
         Generate optimal prices given current state.
+        This is an abstract method that transitions from τ -> P*
+        which is the mapping from the trajectory to optimal prices under
+        some subset of session grouping (so, per sessionId)
+        """
+        pass
 
-        Args:
-            state_space: StateSpace object containing Q_t, P_t, S_t, H_t
-
+    @abstractmethod
+    def _get_features(self, *kwargs) -> np.ndarray:
+        """
+        Extract features from trajectory for pricing decision.
         Returns:
-            P_{t+1}: price vector in R^n
+            np.ndarray of shape (n_products, n_features)
         """
         pass
 

From 3c141a4b6c5d13bc4078807c381082fbc7ce625d Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Mon, 12 Jan 2026 22:33:47 +0100
Subject: [PATCH 77/99] chore: better test consistency before agent

---
 docker-compose.yml                            | 20 ++++++++--
 experiments/procesing/pricers/elasticity.py   | 10 +++++
 .../procesing/pricers/session_aware.py        | 39 +++++++++++++++++++
 experiments/procesing/pricers/simple.py       | 23 +++++++++++
 4 files changed, 89 insertions(+), 3 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 561c393..ba2e8a3 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -112,11 +112,14 @@ services:
     depends_on:
       - postgres
     environment:
-      - AIRFLOW__CORE__EXECUTOR=SequentialExecutor
+      - AIRFLOW__CORE__EXECUTOR=LocalExecutor
       - AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow
       - AIRFLOW__CORE__FERNET_KEY=${AIRFLOW_FERNET_KEY}
       - AIRFLOW__CORE__LOAD_EXAMPLES=false
       - AIRFLOW__CORE__ENABLE_XCOM_PICKLING=true
+      - AIRFLOW__CORE__PARALLELISM=16
+      - AIRFLOW__CORE__DAG_CONCURRENCY=8
+      - AIRFLOW__CORE__MAX_ACTIVE_RUNS_PER_DAG=4
       - _AIRFLOW_DB_MIGRATE=true
       - _AIRFLOW_WWW_USER_CREATE=true
       - _AIRFLOW_WWW_USER_USERNAME=admin
@@ -136,12 +139,17 @@ services:
       - airflow-init
       - redis
     environment:
-      - AIRFLOW__CORE__EXECUTOR=SequentialExecutor
+      - AIRFLOW__CORE__EXECUTOR=LocalExecutor
       - AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow
       - AIRFLOW__CORE__FERNET_KEY=${AIRFLOW_FERNET_KEY}
       - AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION=true
       - AIRFLOW__CORE__LOAD_EXAMPLES=false
       - AIRFLOW__CORE__ENABLE_XCOM_PICKLING=true
+      - AIRFLOW__CORE__PARALLELISM=16
+      - AIRFLOW__CORE__DAG_CONCURRENCY=8
+      - AIRFLOW__CORE__MAX_ACTIVE_RUNS_PER_DAG=4
+      - AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL=30
+      - AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL=60
       - AIRFLOW__WEBSERVER__EXPOSE_CONFIG=true
       - AIRFLOW__WEBSERVER__SECRET_KEY=${AIRFLOW_SECRET_KEY}
       - AIRFLOW__API__AUTH_BACKENDS=airflow.api.auth.backend.basic_auth
@@ -174,12 +182,18 @@ services:
       redis:
         condition: service_started
     environment:
-      - AIRFLOW__CORE__EXECUTOR=SequentialExecutor
+      - AIRFLOW__CORE__EXECUTOR=LocalExecutor
       - AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow
       - AIRFLOW__CORE__FERNET_KEY=${AIRFLOW_FERNET_KEY}
       - AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION=true
       - AIRFLOW__CORE__LOAD_EXAMPLES=false
       - AIRFLOW__CORE__ENABLE_XCOM_PICKLING=true
+      - AIRFLOW__CORE__PARALLELISM=16
+      - AIRFLOW__CORE__DAG_CONCURRENCY=8
+      - AIRFLOW__CORE__MAX_ACTIVE_RUNS_PER_DAG=4
+      - AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL=30
+      - AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL=60
+      - AIRFLOW__SCHEDULER__PARSING_PROCESSES=2
       - AIRFLOW__WEBSERVER__SECRET_KEY=${AIRFLOW_SECRET_KEY}
       - AIRFLOW__API__AUTH_BACKENDS=airflow.api.auth.backend.basic_auth
       - KAFKA_HOST=kafka
diff --git a/experiments/procesing/pricers/elasticity.py b/experiments/procesing/pricers/elasticity.py
index b203159..3ce3b42 100644
--- a/experiments/procesing/pricers/elasticity.py
+++ b/experiments/procesing/pricers/elasticity.py
@@ -57,3 +57,13 @@ class ElasticityBasedPricer(PricingFunction):
         # enforce bounds
         prices = np.clip(prices, self.price_floor, self.price_ceil)
         return prices
+
+    def _get_features(self, state_space=None) -> np.ndarray:
+        """Stack elasticity, demand, and relative demand deviation as three columns, one row per product"""
+        if state_space is None or self.elasticity is None:
+            n = len(self.elasticity) if self.elasticity is not None else 0
+            return np.zeros((n, 3))  # fallback: all-zero features when state or elasticity is missing
+
+        demand = np.asarray(state_space.demand)
+        demand_dev = (demand - self.mean_demand) / (self.mean_demand + 1e-6)  # relative to mean_demand; 1e-6 guards a zero mean
+        return np.column_stack([self.elasticity, demand, demand_dev])
diff --git a/experiments/procesing/pricers/session_aware.py b/experiments/procesing/pricers/session_aware.py
index 40343a7..dbc859f 100644
--- a/experiments/procesing/pricers/session_aware.py
+++ b/experiments/procesing/pricers/session_aware.py
@@ -107,6 +107,36 @@ class SessionAwarePricer(PricingFunction):
 
         return prices
 
+    def _get_features(self, state_space=None) -> np.ndarray:
+        """Stack elasticity, demand, and three session-level signals as five columns, one row per product"""
+        if state_space is None or self.elasticity is None:
+            n = len(self.elasticity) if self.elasticity is not None else 0
+            return np.zeros((n, 5))  # fallback: all-zero features when state or elasticity is missing
+
+        demand = np.asarray(state_space.demand)
+        n_products = len(demand)
+
+        # session-level signals; each stays 0.0 when no session features are available
+        velocity = 0.0
+        view_depth = 0.0
+        cart_to_view = 0.0
+
+        if not state_space.session_features.empty:
+            sf = state_space.session_features.iloc[0]  # only the first row is read
+            velocity = sf.get('interaction_velocity', 0.0)
+            view_depth = sf.get('product_view_depth', 0.0)
+            cart_to_view = sf.get('cart_to_view_ratio', 0.0)
+
+        # session signals are scalars, so repeat each one across all n_products rows
+        features = np.column_stack([
+            self.elasticity,
+            demand,
+            np.full(n_products, velocity),
+            np.full(n_products, view_depth),
+            np.full(n_products, cart_to_view)
+        ])
+        return features
+
 
 class ProductSpecificSessionPricer(PricingFunction):
     """
@@ -170,3 +200,12 @@ class ProductSpecificSessionPricer(PricingFunction):
 
         prices = np.clip(base_prices, self.price_floor, self.price_ceil)
         return prices
+
+    def _get_features(self, state_space=None) -> np.ndarray:
+        """Stack elasticity and demand as two columns for product-specific pricing"""
+        if state_space is None or self.elasticity is None:
+            n = len(self.elasticity) if self.elasticity is not None else 0
+            return np.zeros((n, 2))  # fallback: all-zero features when state or elasticity is missing
+
+        demand = np.asarray(state_space.demand)
+        return np.column_stack([self.elasticity, demand])
diff --git a/experiments/procesing/pricers/simple.py b/experiments/procesing/pricers/simple.py
index 1a03f9f..d7fa699 100644
--- a/experiments/procesing/pricers/simple.py
+++ b/experiments/procesing/pricers/simple.py
@@ -65,6 +65,11 @@ class StaticPricer(PricingFunction):
             raise ValueError("Must call fit() or provide base_prices in constructor")
         return self.base_prices.copy()
 
+    def _get_features(self, state_space=None) -> np.ndarray:
+        """Static pricer uses no features; returns a zero-column array with one row per base price"""
+        n = len(self.base_prices) if self.base_prices is not None else 0  # state_space is not consulted
+        return np.zeros((n, 0))
+
 
 class RandomPricer(PricingFunction):
     """Random pricing within bounds (for baseline comparison)"""
@@ -87,6 +92,11 @@ class RandomPricer(PricingFunction):
             self.n_products = len(state_space.demand)
         return self.rng.uniform(self.price_min, self.price_max, size=self.n_products)
 
+    def _get_features(self, state_space=None) -> np.ndarray:
+        """Random pricer uses no features; returns a zero-column array with n_products rows"""
+        n = self.n_products if self.n_products else 0  # a falsy n_products (None or 0) yields zero rows
+        return np.zeros((n, 0))
+
 
 class SimpleSurgePricer(PricingFunction):
     """
@@ -133,3 +143,16 @@ class SimpleSurgePricer(PricingFunction):
         new_prices[low_mask] *= self.discount_multiplier
 
         return new_prices
+
+    def _get_features(self, state_space=None) -> np.ndarray:
+        """Stack demand and base price as two columns, one row per product"""
+        if state_space is None:
+            n = len(self.base_prices) if self.base_prices is not None else 0
+            return np.zeros((n, 2))  # no state: zeros sized by the stored base prices
+
+        demand = np.asarray(state_space.demand) if hasattr(state_space, 'demand') else np.array([0])  # missing demand -> single 0
+        base = np.asarray(state_space.prices) if hasattr(state_space, 'prices') else self.base_prices  # prefer prices from the state
+        if base is None:
+            base = np.ones(len(demand)) * 99.99  # last resort: flat 99.99 placeholder price
+
+        return np.column_stack([demand, base])

From 4c368d48f2a489595a9c8bc0375f348568305782 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Tue, 13 Jan 2026 15:05:33 +0100
Subject: [PATCH 78/99] chore: fixing visual bugs in cart

---
 web/src/app/cart/page.tsx | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/web/src/app/cart/page.tsx b/web/src/app/cart/page.tsx
index 30ac3f2..dbcb30b 100644
--- a/web/src/app/cart/page.tsx
+++ b/web/src/app/cart/page.tsx
@@ -32,7 +32,8 @@ export default function CartPage() {
                     {itemCount > 0 && (
                         <button
                             onClick={clearCart}
-                            className="text-sm text-red-600 hover:underline"
+                            className="text-sm hover:underline"
+                            style={{ color: 'var(--accent-warning)' }}
                         >
                             Clear cart
                         </button>
@@ -42,7 +43,7 @@ export default function CartPage() {
                 {itemCount === 0 ? (
                     <div className="text-center py-12">
                         <p className="text-gray-500 mb-4">Your cart is empty</p>
-                        <a href="/" className="text-blue-600 hover:underline">Browse our selection</a>
+                        <a href="/" className="hover:underline" style={{ color: 'var(--text-accent)' }}>Browse our selection</a>
                     </div>
                 ) : (
                     <>
@@ -54,15 +55,11 @@ export default function CartPage() {
                                 >
                                     <div className="flex-1">
                                         <div className="flex items-center gap-2 mb-1">
-                                            <span className="px-2 py-0.5 text-xs font-medium rounded bg-blue-100 text-blue-800">
-                                                {item.type}
-                                            </span>
                                             <h3 className="font-semibold">{item.name}</h3>
                                         </div>
 
                                         {item.type === 'hotel' && (
                                             <div className="text-sm text-gray-600">
-                                                <p>{String(item.metadata.roomType)}</p>
                                                 <p>{String(item.metadata.checkIn)} - {String(item.metadata.checkOut)}</p>
                                                 <p>{String(item.metadata.nights)} night{Number(item.metadata.nights) > 1 ? 's' : ''}</p>
                                             </div>
@@ -81,7 +78,8 @@ export default function CartPage() {
                                         <p className="text-xl font-bold mb-2">${item.price}</p>
                                         <button
                                             onClick={() => handleRemove(item.id, item.type)}
-                                            className="text-sm text-red-600 hover:underline"
+                                            className="text-sm hover:underline"
+                                            style={{ color: 'var(--accent-warning)' }}
                                         >
                                             Remove
                                         </button>
@@ -100,7 +98,7 @@ export default function CartPage() {
                                     dispatchInteraction('checkout_start', undefined, { total, itemCount });
                                     window.location.href = '/checkout';
                                 }}
-                                className="w-full py-3 bg-blue-600 hover:bg-blue-700 text-white rounded-lg font-medium transition-colors"
+                                className="btn-primary w-full"
                             >
                                 Proceed to Checkout
                             </button>

From 61dd621532fbe91eb5afccf01fbb87488e55978a Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Tue, 13 Jan 2026 15:09:52 +0100
Subject: [PATCH 79/99] chore: styling and title updates

---
 web/src/app/globals.css              | 3 +++
 web/src/app/layout.tsx               | 4 ++--
 web/src/components/ui/Navigation.tsx | 2 +-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/web/src/app/globals.css b/web/src/app/globals.css
index 4a5b0c9..457b974 100644
--- a/web/src/app/globals.css
+++ b/web/src/app/globals.css
@@ -8,6 +8,9 @@
   --bg-secondary: #f5f5f5;
   --text-primary: #333333;
   --text-secondary: #666666;
+  --accent-primary: #007aff;
+  --accent-primary-hover: #0051d5;
+  --accent-primary-light: #e6f2ff;
   --spacing-sm: 8px;
   --spacing-md: 16px;
   --spacing-lg: 32px;
diff --git a/web/src/app/layout.tsx b/web/src/app/layout.tsx
index e9f9b63..5ff49ae 100644
--- a/web/src/app/layout.tsx
+++ b/web/src/app/layout.tsx
@@ -15,8 +15,8 @@ const geistMono = Geist_Mono({
 });
 
 export const metadata: Metadata = {
-  title: "Create Next App",
-  description: "Generated by create next app",
+  title: "Travel Booking Platform",
+  description: "Book flights and hotels with dynamic pricing",
 };
 
 export default function RootLayout({
diff --git a/web/src/components/ui/Navigation.tsx b/web/src/components/ui/Navigation.tsx
index 9d9d4cf..6f0ecbb 100644
--- a/web/src/components/ui/Navigation.tsx
+++ b/web/src/components/ui/Navigation.tsx
@@ -20,7 +20,7 @@ const NavLink = ({ href, children }: { href: string; children: React.ReactNode }
       href={href}
       className={`px-4 py-2 rounded-md transition-colors ${
         isActive
-          ? 'bg-[var(--accent-primary)] font-semibold'
+          ? 'bg-[var(--accent-primary)] text-white font-semibold'
           : 'hover:bg-[var(--accent-primary-light)] text-[var(--text-primary)]'
       }`}
     >

From eb9506038096497a0377636ce082f1be2f9e6840 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Alves=20R=C3=B6sel?=
 <60182044+velocitatem@users.noreply.github.com>
Date: Tue, 13 Jan 2026 15:35:27 +0100
Subject: [PATCH 80/99] Pre run web refactors (#43)

* chore: refactor date utilities

* feat: improve images of hotel rooms

* fix: adding date utils
---
 web/src/components/feats/hotel/HotelCard.tsx  |  5 +-
 .../components/feats/hotel/HotelDetails.tsx   |  5 +-
 web/src/lib/airline-utils.ts                  | 24 +--------
 web/src/lib/date-utils.ts                     | 23 ++++++++
 web/src/lib/hotel-utils.ts                    | 52 +++++++++++--------
 5 files changed, 60 insertions(+), 49 deletions(-)
 create mode 100644 web/src/lib/date-utils.ts

diff --git a/web/src/components/feats/hotel/HotelCard.tsx b/web/src/components/feats/hotel/HotelCard.tsx
index 5bf234d..847e1b2 100644
--- a/web/src/components/feats/hotel/HotelCard.tsx
+++ b/web/src/components/feats/hotel/HotelCard.tsx
@@ -2,6 +2,7 @@
 
 import type { EventName } from '@/lib/events';
 import type { Hotel } from '@/lib/hotel-utils';
+import { getHotelImageUrl } from '@/lib/hotel-utils';
 import { useHoverTracking } from '@/hooks/useHoverTracking';
 import PriceDisplay from '@/components/ui/PriceDisplay';
 
@@ -47,8 +48,6 @@ export default function HotelCard({ hotel }: { hotel: Hotel }) {
         window.location.href = `/hotel/products/${hotel.id}`;
     };
 
-    const imageUrl = `https://images.unsplash.com/photo-1551882547-ff40c63fe5fa?w=400&h=300&fit=crop`;
-
     return (
         <div
             className="hotel-card cursor-pointer"
@@ -56,7 +55,7 @@ export default function HotelCard({ hotel }: { hotel: Hotel }) {
         >
             <div className="hotel-image relative overflow-hidden">
                 <img
-                    src={imageUrl}
+                    src={getHotelImageUrl(hotel.id, { w: 400, h: 300 })}
                     alt={hotel.name}
                     className="w-full h-full object-cover"
                     onError={(e) => {
diff --git a/web/src/components/feats/hotel/HotelDetails.tsx b/web/src/components/feats/hotel/HotelDetails.tsx
index 6cdbbdd..030769f 100644
--- a/web/src/components/feats/hotel/HotelDetails.tsx
+++ b/web/src/components/feats/hotel/HotelDetails.tsx
@@ -2,6 +2,7 @@
 
 import { useState, useEffect } from 'react';
 import type { Hotel } from '@/lib/hotel-utils';
+import { getHotelImageUrl } from '@/lib/hotel-utils';
 import PriceDisplay from '@/components/ui/PriceDisplay';
 
 interface HotelDetailsProps {
@@ -43,13 +44,11 @@ const PriceTotalDisplay = ({ productId, nights }: { productId: string; nights: n
 };
 
 export default function HotelDetails({ product, onAddToCart, addedToCart }: HotelDetailsProps) {
-  const imageUrl = `https://images.unsplash.com/photo-1566073771259-6a8506099945?w=800&h=600&fit=crop`;
-
   return (
     <div className="w-full flex flex-col lg:flex-row gap-12 py-8">
       <div className="w-full lg:w-1/2 rounded-lg aspect-[4/3] overflow-hidden shrink-0">
         <img
-          src={imageUrl}
+          src={getHotelImageUrl(product.id, { w: 800, h: 600 })}
           alt={product.name}
           className="w-full h-full object-cover"
           onError={(e) => {
diff --git a/web/src/lib/airline-utils.ts b/web/src/lib/airline-utils.ts
index 74a1916..b801e14 100644
--- a/web/src/lib/airline-utils.ts
+++ b/web/src/lib/airline-utils.ts
@@ -31,7 +31,7 @@ export interface Flight {
   availability: number;
 }
 
-const EPOCH = new Date(0);
+import { dateToDaysFromToday, dateToIndex, todayIndex } from './date-utils';
 
 export const transformProduct = (p: AirlineProduct): Flight => {
   const { id, flight_type, date_index, metadata, availability } = p;
@@ -52,24 +52,4 @@ export const transformProduct = (p: AirlineProduct): Flight => {
   };
 };
 
-// convert date string to days from today
-export const dateToDaysFromToday = (dateStr: string): number => {
-  const target = new Date(dateStr);
-  target.setHours(0, 0, 0, 0);
-  const today = new Date();
-  today.setHours(0, 0, 0, 0);
-  return Math.floor((target.getTime() - today.getTime()) / 86400000);
-};
-
-// convert date string to date_index (days since epoch)
-export const dateToIndex = (dateStr: string): number => {
-  const d = new Date(dateStr);
-  return Math.floor((d.getTime() - EPOCH.getTime()) / 86400000);
-};
-
-// get current date_index
-export const todayIndex = (): number => {
-  const now = new Date();
-  now.setHours(0, 0, 0, 0);
-  return Math.floor((now.getTime() - EPOCH.getTime()) / 86400000);
-};
+export { dateToDaysFromToday, dateToIndex, todayIndex };
diff --git a/web/src/lib/date-utils.ts b/web/src/lib/date-utils.ts
new file mode 100644
index 0000000..bad1a90
--- /dev/null
+++ b/web/src/lib/date-utils.ts
@@ -0,0 +1,23 @@
+const EPOCH = new Date(0);
+const MS_PER_DAY = 86400000;
+
+export const dateToDaysFromToday = (dateStr: string): number => {
+  const target = new Date(dateStr);
+  target.setHours(0, 0, 0, 0);
+  const today = new Date();
+  today.setHours(0, 0, 0, 0);
+  return Math.floor((target.getTime() - today.getTime()) / MS_PER_DAY);
+};
+
+export const dateToIndex = (dateStr: string): number => {
+  const d = new Date(dateStr);
+  return Math.floor((d.getTime() - EPOCH.getTime()) / MS_PER_DAY);
+};
+
+export const todayIndex = (): number => {
+  const now = new Date();
+  now.setHours(0, 0, 0, 0);
+  return Math.floor((now.getTime() - EPOCH.getTime()) / MS_PER_DAY);
+};
+
+export { EPOCH, MS_PER_DAY };
diff --git a/web/src/lib/hotel-utils.ts b/web/src/lib/hotel-utils.ts
index b59994a..e5ba5c2 100644
--- a/web/src/lib/hotel-utils.ts
+++ b/web/src/lib/hotel-utils.ts
@@ -25,7 +25,7 @@ export interface Hotel {
   nights: number;
 }
 
-const EPOCH = new Date(0);
+import { EPOCH, MS_PER_DAY, dateToDaysFromToday, dateToIndex, todayIndex } from './date-utils';
 
 export const transformProduct = (p: HotelProduct): Hotel => {
   const { id, room_type, date_index, metadata } = p;
@@ -37,14 +37,14 @@ export const transformProduct = (p: HotelProduct): Hotel => {
     // legacy: treat as offset from today
     const today = new Date();
     today.setHours(0, 0, 0, 0);
-    checkIn = new Date(today.getTime() + date_index * 86400000);
+    checkIn = new Date(today.getTime() + date_index * MS_PER_DAY);
   } else {
     // proper: days since epoch
-    checkIn = new Date(EPOCH.getTime() + date_index * 86400000);
+    checkIn = new Date(EPOCH.getTime() + date_index * MS_PER_DAY);
   }
 
   const nights = 1;
-  const checkOut = new Date(checkIn.getTime() + nights * 86400000);
+  const checkOut = new Date(checkIn.getTime() + nights * MS_PER_DAY);
 
   const formatOpts: Intl.DateTimeFormatOptions = {
     month: 'short',
@@ -65,24 +65,34 @@ export const transformProduct = (p: HotelProduct): Hotel => {
   };
 };
 
-// convert date string to days from today
-export const dateToDaysFromToday = (dateStr: string): number => {
-  const target = new Date(dateStr);
-  target.setHours(0, 0, 0, 0);
-  const today = new Date();
-  today.setHours(0, 0, 0, 0);
-  return Math.floor((target.getTime() - today.getTime()) / 86400000);
+const hotelImagePool = [
+  'photo-1566073771259-6a8506099945',
+  'photo-1551882547-ff40c63fe5fa',
+  'photo-1590490360182-c33d57733427',
+  'photo-1582719478250-c89cae4dc85b',
+  'photo-1596701062351-8c2c14d1fdd0',
+  'photo-1631049307264-da0ec9d70304',
+  'photo-1578683010236-d716f9a3f461',
+  'photo-1540518614846-7eded433c457',
+  'photo-1505693416388-ac5ce068fe85',
+  'photo-1522771739844-6a9f6d5f14af',
+  'photo-1562438668-bcf0ca6578f0',
+  'photo-1595576508898-0ad5c879a061',
+];
+
+const hashString = (s: string): number => {
+  let h = 0;
+  for (let i = 0; i < s.length; i++) {
+    h = ((h << 5) - h) + s.charCodeAt(i);
+    h = h & h;
+  }
+  return Math.abs(h);
 };
 
-// convert date string to date_index (days since epoch)
-export const dateToIndex = (dateStr: string): number => {
-  const d = new Date(dateStr);
-  return Math.floor((d.getTime() - EPOCH.getTime()) / 86400000);
+export const getHotelImageUrl = (hotelId: string, size: { w: number; h: number } = { w: 400, h: 300 }): string => {
+  const idx = hashString(hotelId) % hotelImagePool.length;
+  const photoId = hotelImagePool[idx];
+  return `https://images.unsplash.com/${photoId}?w=${size.w}&h=${size.h}&fit=crop`;
 };
 
-// get current date_index
-export const todayIndex = (): number => {
-  const now = new Date();
-  now.setHours(0, 0, 0, 0);
-  return Math.floor((now.getTime() - EPOCH.getTime()) / 86400000);
-};
+export { dateToDaysFromToday, dateToIndex, todayIndex };

From 7c330a19c698340341131561934653dc9e109d33 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Tue, 13 Jan 2026 15:36:20 +0100
Subject: [PATCH 81/99] feat: added a runner script for agent orchestration

---
 experiments/agents/run.py | 117 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 experiments/agents/run.py

diff --git a/experiments/agents/run.py b/experiments/agents/run.py
new file mode 100644
index 0000000..823c3d9
--- /dev/null
+++ b/experiments/agents/run.py
@@ -0,0 +1,117 @@
+from supabase import create_client, Client
+import os
+import random
+import asyncio
+import json
+from dotenv import load_dotenv
+
+from experiments.agents.agent import get_agent, AgentTypes
+from lib.kafka_client import get_interactions
+
+load_dotenv()
+
+RESULTS="/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
+
+client = create_client(
+    os.getenv("NEXT_PUBLIC_SUPABASE_URL"),
+    os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY")
+)
+def pick_random_task():
+    mode = 'hotel'
+    tasks = client.table("tasks").select("*").execute().data
+    if mode == 'hotel':
+        # drop all that have 'flight' in the description
+        tasks = [task for task in tasks if 'flight' not in task['task_description'].lower()]
+    return random.choice(tasks) if tasks else None
+
+def clear_kafka_data():
+    """Delete and recreate Kafka topics to clear all data"""
+    from kafka.admin import KafkaAdminClient, NewTopic
+    from kafka.errors import UnknownTopicOrPartitionError
+    import time
+
+    kafka_host = os.getenv('KAFKA_HOST', 'localhost')
+    kafka_port = os.getenv('KAFKA_PORT', '9092')
+    broker = f'{kafka_host}:{kafka_port}'
+
+    admin = KafkaAdminClient(bootstrap_servers=broker)
+    topics = ['user-interactions', 'price-logs']
+
+    try:
+        admin.delete_topics(topics, timeout_ms=5000)
+        print(f"Deleted topics: {topics}")
+        time.sleep(2)
+    except UnknownTopicOrPartitionError:
+        print("Topics don't exist, skipping delete")
+    except Exception as e:
+        print(f"Error deleting topics: {e}")
+
+    new_topics = [
+        NewTopic(name='user-interactions', num_partitions=3, replication_factor=1),
+        NewTopic(name='price-logs', num_partitions=3, replication_factor=1)
+    ]
+
+    try:
+        admin.create_topics(new_topics=new_topics, validate_only=False)
+        print(f"Recreated topics: {topics}")
+    except Exception as e:
+        print(f"Error creating topics: {e}")
+    finally:
+        admin.close()
+
+def create_new_experiment(task_id):
+    import uuid
+    subject_name = f"agent_{str(uuid.uuid4())[:8]}"
+    experiment = {
+        "subject_name": subject_name,
+        "xp_human_only": False,
+        "xp_market_mode": "hotel",
+        "xp_task_id": task_id,
+    }
+    response = client.table("experiments").insert(experiment).execute()
+    return response.data[0] if response.data else None
+
+if __name__ == "__main__":
+    clear_kafka_data()
+
+    task = pick_random_task()
+    if not task:
+        print("No tasks available")
+        exit(1)
+
+    experiment = create_new_experiment(task['id'])
+    exp_id = experiment['id']
+    exp_dir = f"{RESULTS}{exp_id}"
+    os.makedirs(exp_dir, exist_ok=True)
+
+    # construct experiment URL with uuid param
+    base_url = os.getenv('NEXT_PUBLIC_API_BASE', 'http://localhost:3000')
+    agent_url = f"{base_url}/start-task?uuid={exp_id}"
+
+    print(f"Created experiment {exp_id} for task {task['id']}")
+    print(f"Agent will interact with: {agent_url}")
+
+    # instantiate and run agent
+    agent = get_agent(
+        AgentTypes.GENERIC_BROWSER_USE_AGENT,
+        goal=task['task_description'],
+        url=agent_url,
+        timeout=300,
+        headless=True
+    )
+
+    result = asyncio.run(agent.act())
+    print(f"Agent result: {result}")
+
+    # export interaction and price data from kafka
+    interactions = get_interactions(topic='user-interactions', timeout_ms=3000)
+    prices = get_interactions(topic='price-logs', timeout_ms=3000)
+
+    with open(f"{exp_dir}/int.json", 'w') as f:
+        json.dump(interactions, f, indent=2)
+
+    with open(f"{exp_dir}/price.json", 'w') as f:
+        json.dump(prices, f, indent=2)
+
+    print(f"Experiment {exp_id} completed.")
+    print(f"Exported {len(interactions)} interactions and {len(prices)} price logs to {exp_dir}")

From 9cb2b0fc4431f1a10af457d8fc17e1bb6e706032 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Tue, 13 Jan 2026 15:37:06 +0100
Subject: [PATCH 82/99] feat: forgot airflow helper staging

---
 tests/e2e/helpers/airflow.ts | 61 ++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 tests/e2e/helpers/airflow.ts

diff --git a/tests/e2e/helpers/airflow.ts b/tests/e2e/helpers/airflow.ts
new file mode 100644
index 0000000..82d4a75
--- /dev/null
+++ b/tests/e2e/helpers/airflow.ts
@@ -0,0 +1,61 @@
+const AIRFLOW_URL = process.env.AIRFLOW_URL || 'http://localhost:8085';
+const AUTH = 'Basic ' + Buffer.from(`${process.env.AIRFLOW_USER || 'admin'}:${process.env.AIRFLOW_PASS || 'admin'}`).toString('base64');
+
+const req = (path: string, opts: any = {}) => {
+  const headers = { Authorization: AUTH, ...opts.headers };
+  return fetch(`${AIRFLOW_URL}${path}`, { ...opts, headers });
+};
+
+export const triggerDag = async (dagId: string, conf = {}) => {
+  const r = await req(`/api/v1/dags/${dagId}/dagRuns`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify({ conf }),
+  });
+  if (!r.ok) throw new Error(`Trigger DAG failed: ${r.status}`);
+  return (await r.json()).dag_run_id;
+};
+
+export const getDagStatus = async (dagId: string, runId: string) => {
+  const r = await req(`/api/v1/dags/${dagId}/dagRuns/${runId}`);
+  if (!r.ok) throw new Error(`Get status failed: ${r.status}`);
+  return (await r.json()).state;
+};
+
+export const cancelDag = async (dagId: string, runId: string) => {
+  const r = await req(`/api/v1/dags/${dagId}/dagRuns/${runId}`, {
+    method: 'PATCH',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify({ state: 'failed' }),
+  });
+  if (!r.ok) console.warn(`Failed to cancel DAG ${runId}: ${r.status}`);
+};
+
+export const waitForDag = async (dagId: string, runId: string, maxMs = 30000, pollMs = 1000) => {
+  const t0 = Date.now();
+  while (Date.now() - t0 < maxMs) {
+    const state = await getDagStatus(dagId, runId);
+    if (state === 'success') return;
+    if (state === 'failed') throw new Error(`DAG ${runId} failed`);
+    await new Promise(r => setTimeout(r, pollMs));
+  }
+  await cancelDag(dagId, runId);
+  throw new Error(`DAG ${runId} timeout`);
+};
+
+export const runDag = async (dagId: string, conf = {}, maxMs = 60000) => {
+  const runId = await triggerDag(dagId, conf);
+  await waitForDag(dagId, runId, maxMs);
+};
+
+export const runSessionPricing = (mode = 'hotel') =>
+  runDag('session_pricing_pipeline', { store_mode: mode, session_limit: 10 }, 90000);
+
+export const runSurgePricing = (mode = 'hotel', highThresh = 10, lowThresh = 2) =>
+  runDag('surge_pricing_pipeline', {
+    store_mode: mode,
+    high_threshold: highThresh,
+    low_threshold: lowThresh,
+    surge_multiplier: 1.2,
+    discount_multiplier: 0.9
+  }, 90000);

From af23d2f736a854fe2f835addc3112c31d3d38e30 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Tue, 13 Jan 2026 15:57:05 +0100
Subject: [PATCH 83/99] feat: introduction of agentic MDPs and KL divergence
 of > 2

---
 sim/rl/behavior_loader/loader.py | 20 +++++++
 sim/rl/behavior_loader/models.py | 89 ++++++++++++++++++++++++++++----
 2 files changed, 98 insertions(+), 11 deletions(-)

diff --git a/sim/rl/behavior_loader/loader.py b/sim/rl/behavior_loader/loader.py
index 99a1541..bd18442 100644
--- a/sim/rl/behavior_loader/loader.py
+++ b/sim/rl/behavior_loader/loader.py
@@ -56,7 +56,27 @@ class Loader:
     def get_entries(self) -> tuple[list[str], int]:
         return self.entries, len(self.entries)
 
+class AgentLoader(Loader):
+    """Loader for agent interaction data with simplified schema (direct PayloadModel format)"""
+
+    def _is_admin_page_simple(self, interaction: PayloadModel) -> bool:
+        return interaction.page and interaction.page.startswith("/admin/")
+
+    def _load_sessions(self) -> dict:
+        sessions = {}
+        for entry in self.entries:
+            int_path = f"{self.src_dir}/{entry}/int.json"
+            raw = json.load(open(int_path))
+            ints = [PayloadModel(**i) for i in raw]
+            sessions[entry] = [i for i in ints if not self._is_admin_page_simple(i)]
+        return sessions
+
 if __name__ == "__main__":
+    DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
+    loader = AgentLoader(DIR)
+    _, n = loader.get_entries()
+    print(f"Loaded {n} sessions from {DIR}")
+
     DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
     loader = Loader(DIR)
     _, n = loader.get_entries()
diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py
index bce2429..7254606 100644
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -1,10 +1,12 @@
-from loader import Loader
+from experiments.agents.base import Agent
+from loader import Loader, AgentLoader
 from collections import defaultdict
 from typing import Dict, List, Tuple, Set
 import numpy as np
 import graphviz
 
 DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
+AGENT_DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
 
 class BehaviorModel:
     def __init__(self, src_dir: str = DIR):
@@ -85,13 +87,32 @@ class BehaviorModel:
             path.append(curr)
         return path
 
-def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "mdp_graph", fmt: str = "svg", view: bool = False, export_dot: bool = False):
-    """visualize MDP as directed graph using graphviz, aggregated by event type"""
-    if not model.mdp: raise ValueError("build MDP first")
+class AgentBehaviorModel(BehaviorModel):
+    """behavior model for agent interaction data (simplified PayloadModel schema)"""
 
-    # aggregate transitions by event type
+    def __init__(self, src_dir: str = AGENT_DIR):
+        self.loader = AgentLoader(src_dir)
+        self.data = self.loader.get_data()
+        self.entries, self.num_entries = self.loader.get_entries()
+        self.mdp = None
+
+    def _state_repr(self, evt) -> str:
+        # direct access to PayloadModel fields (no .value.payload nesting)
+        return f"{evt.page or 'unk'}|{evt.productId or 'none'}|{evt.eventName}"
+
+    def _extract_sessions(self):
+        trajectories = []
+        for sid, evts in self.data.items():
+            if len(evts) < 2: continue
+            # sort by timestamp string (ISO format sorts lexicographically)
+            states = [self._state_repr(e) for e in sorted(evts, key=lambda x: x.ts)]
+            trajectories.append(states)
+        return trajectories
+
+def aggregate_event_transitions(mdp: Dict) -> Dict[str, Dict[str, float]]:
+    """aggregate state transitions by event type and normalize"""
     evt_trans = defaultdict(lambda: defaultdict(float))
-    for s, trans in model.mdp['transitions'].items():
+    for s, trans in mdp['transitions'].items():
         evt_src = s.split('|')[2]
         for s_next, prob in trans.items():
             evt_dst = s_next.split('|')[2]
@@ -103,6 +124,13 @@ def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "
         if total > 0:
             for evt_dst in evt_trans[evt_src]:
                 evt_trans[evt_src][evt_dst] /= total
+    return dict(evt_trans)
+
+def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "mdp_graph", fmt: str = "svg", view: bool = False, export_dot: bool = False):
+    """visualize MDP as directed graph using graphviz, aggregated by event type"""
+    if not model.mdp: raise ValueError("build MDP first")
+
+    evt_trans = aggregate_event_transitions(model.mdp)
 
     g = graphviz.Digraph(format=fmt)
     g.attr(rankdir='LR', size='30')
@@ -134,11 +162,50 @@ def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "
 
     return g
 
+
+def kl_divergence(p: Dict[str, float], q: Dict[str, float]) -> float:
+    """Compute KL divergence D_KL(P || Q) for discrete distributions P and Q."""
+    epsilon = 1e-10  # small constant to avoid log(0)
+    kl_div = 0.0
+    for key in p:
+        p_val = p[key] + epsilon
+        q_val = q.get(key, 0.0) + epsilon
+        kl_div += p_val * np.log(p_val / q_val)
+    return kl_div
+
 if __name__ == "__main__":
-    model = BehaviorModel(DIR)
-    mdp = model.build_MDP()
-    print(f"Built MDP: {mdp['num_states']} states, {sum(len(t) for t in mdp['transitions'].values())} transitions")
-    if not mdp['states']:
+    human_model = BehaviorModel(DIR)
+    human_mdp = human_model.build_MDP()
+    print(f"Built MDP: {human_mdp['num_states']} states, {sum(len(t) for t in human_mdp['transitions'].values())} transitions")
+    if not human_mdp['states']:
         print("No states found")
         exit(1)
-    visualize_mdp(model, threshold=0.05, output="mdp_viz", fmt="pdf", export_dot=True)
+    visualize_mdp(human_model, threshold=0.05, output="human_mdp_viz", fmt="pdf", export_dot=True)
+
+    agent_model = AgentBehaviorModel()
+    agent_mdp = agent_model.build_MDP()
+    print(f"AGENT... Built MDP: {agent_mdp['num_states']} states, {sum(len(t) for t in agent_mdp['transitions'].values())} transitions")
+    if not agent_mdp['states']:
+        print("No states found")
+        exit(1)
+    visualize_mdp(agent_model, threshold=0.05, output="agent_mdp_viz", fmt="pdf", export_dot=True)
+
+    # aggregate transitions by event type for both models
+    human_evt_trans = aggregate_event_transitions(human_mdp)
+    agent_evt_trans = aggregate_event_transitions(agent_mdp)
+
+    common_evts = set(human_evt_trans.keys()) & set(agent_evt_trans.keys())
+    if not common_evts: import sys; sys.exit("No common event types for KL divergence analysis")
+
+    kl_divs = []
+    for evt in common_evts:
+        kl = kl_divergence(human_evt_trans[evt], agent_evt_trans[evt])
+        kl_divs.append((evt, kl))
+
+    kl_divs.sort(key=lambda x: x[1], reverse=True)
+    avg_kl = np.mean([kl for _, kl in kl_divs])
+
+    print(f"Average KL divergence: {avg_kl:.4f}")
+    print(f"\nMost divergent event types:")
+    for evt, kl in kl_divs:
+        print(f"  {evt}: {kl:.4f}")

From 87a35fad2c9c0954de5332edf4a55b53ca6b7049 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Tue, 13 Jan 2026 16:42:50 +0100
Subject: [PATCH 84/99] feat: joint loader

---
 sim/rl/behavior_loader/loader.py | 47 ++++++++++++++++++++++++++------
 sim/rl/behavior_loader/models.py | 32 +++++++++++++++++++++-
 2 files changed, 70 insertions(+), 9 deletions(-)

diff --git a/sim/rl/behavior_loader/loader.py b/sim/rl/behavior_loader/loader.py
index bd18442..620576c 100644
--- a/sim/rl/behavior_loader/loader.py
+++ b/sim/rl/behavior_loader/loader.py
@@ -71,13 +71,44 @@ class AgentLoader(Loader):
             sessions[entry] = [i for i in ints if not self._is_admin_page_simple(i)]
         return sessions
 
-if __name__ == "__main__":
-    DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
-    loader = AgentLoader(DIR)
-    _, n = loader.get_entries()
-    print(f"Loaded {n} sessions from {DIR}")
+class JointLoader:
+    """Loader for combined human (Kafka) and agent (direct) data without discrimination"""
 
-    DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
-    loader = Loader(DIR)
+    def __init__(self, human_dir: str, agent_dir: str):
+        self.human_dir = human_dir
+        self.agent_dir = agent_dir
+        self.human_loader = Loader(human_dir)
+        self.agent_loader = AgentLoader(agent_dir)
+        self.data = self._load_joint_sessions()
+        self.entries = list(self.data.keys())
+
+    def _load_joint_sessions(self) -> dict:
+        sessions = {}
+        # load human sessions (unwrap from Kafka format to PayloadModel)
+        for sid, evts in self.human_loader.get_data().items():
+            sessions[f"human_{sid}"] = [evt.value.payload for evt in evts]
+        # load agent sessions (already PayloadModel)
+        for sid, evts in self.agent_loader.get_data().items():
+            sessions[f"agent_{sid}"] = evts
+        return sessions
+
+    def get_data(self) -> dict:
+        return self.data
+
+    def get_entries(self) -> tuple[list[str], int]:
+        return self.entries, len(self.entries)
+
+if __name__ == "__main__":
+    AGENT_DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
+    loader = AgentLoader(AGENT_DIR)
     _, n = loader.get_entries()
-    print(f"Loaded {n} sessions from {DIR}")
+    print(f"Loaded {n} agent sessions from {AGENT_DIR}")
+
+    HUMAN_DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
+    loader = Loader(HUMAN_DIR)
+    _, n = loader.get_entries()
+    print(f"Loaded {n} human sessions from {HUMAN_DIR}")
+
+    joint_loader = JointLoader(HUMAN_DIR, AGENT_DIR)
+    _, n = joint_loader.get_entries()
+    print(f"Loaded {n} total sessions (combined) from joint loader")
diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py
index 7254606..46ac99d 100644
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -1,5 +1,5 @@
 from experiments.agents.base import Agent
-from loader import Loader, AgentLoader
+from loader import Loader, AgentLoader, JointLoader
 from collections import defaultdict
 from typing import Dict, List, Tuple, Set
 import numpy as np
@@ -109,6 +109,28 @@ class AgentBehaviorModel(BehaviorModel):
             trajectories.append(states)
         return trajectories
 
+class JointBehaviorModel(BehaviorModel):
+    """behavior model for combined human+agent data (flat PayloadModel distribution)"""
+
+    def __init__(self, human_dir: str = DIR, agent_dir: str = AGENT_DIR):
+        self.loader = JointLoader(human_dir, agent_dir)
+        self.data = self.loader.get_data()
+        self.entries, self.num_entries = self.loader.get_entries()
+        self.mdp = None
+
+    def _state_repr(self, evt) -> str:
+        # direct access to PayloadModel fields (JointLoader unwraps to PayloadModel)
+        return f"{evt.page or 'unk'}|{evt.productId or 'none'}|{evt.eventName}"
+
+    def _extract_sessions(self):
+        trajectories = []
+        for sid, evts in self.data.items():
+            if len(evts) < 2: continue
+            # sort by timestamp string (ISO format sorts lexicographically)
+            states = [self._state_repr(e) for e in sorted(evts, key=lambda x: x.ts)]
+            trajectories.append(states)
+        return trajectories
+
 def aggregate_event_transitions(mdp: Dict) -> Dict[str, Dict[str, float]]:
     """aggregate state transitions by event type and normalize"""
     evt_trans = defaultdict(lambda: defaultdict(float))
@@ -209,3 +231,11 @@ if __name__ == "__main__":
     print(f"\nMost divergent event types:")
     for evt, kl in kl_divs:
         print(f"  {evt}: {kl:.4f}")
+
+    # build joint model (combined distribution)
+    print("\n=== Joint Model (Human + Agent Combined) ===")
+    joint_model = JointBehaviorModel()
+    joint_mdp = joint_model.build_MDP()
+    print(f"Built joint MDP: {joint_mdp['num_states']} states, {sum(len(t) for t in joint_mdp['transitions'].values())} transitions")
+    if joint_mdp['states']:
+        visualize_mdp(joint_model, threshold=0.05, output="joint_mdp_viz", fmt="pdf", export_dot=True)

From 82b54428b7494858597dfd91acaa7733378362aa Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Tue, 13 Jan 2026 16:46:17 +0100
Subject: [PATCH 85/99] chore: refactor the loader class

---
 sim/rl/behavior_loader/loader.py | 67 ++++++++++++--------------------
 1 file changed, 25 insertions(+), 42 deletions(-)

diff --git a/sim/rl/behavior_loader/loader.py b/sim/rl/behavior_loader/loader.py
index 620576c..3336956 100644
--- a/sim/rl/behavior_loader/loader.py
+++ b/sim/rl/behavior_loader/loader.py
@@ -1,6 +1,6 @@
 import os
-from pydantic import BaseModel as Base
 import json
+from pydantic import BaseModel as Base
 
 class PayloadModel(Base):
     sessionId: str
@@ -30,6 +30,9 @@ class InteractionModel(Base):
     key: dict
     value: ValueModel
 
+def _is_admin(page: str | None) -> bool:
+    return page is not None and page.startswith("/admin/")
+
 class Loader:
     def __init__(self, src_dir: str):
         self.src_dir = src_dir
@@ -37,17 +40,13 @@ class Loader:
         if not self.entries: raise ValueError("empty directory")
         self.data = self._load_sessions()
 
-    def _is_admin_page(self, interaction: InteractionModel) -> bool:
-        page = interaction.value.payload.page
-        return page and page.startswith("/admin/")
-
     def _load_sessions(self) -> dict:
         sessions = {}
         for entry in self.entries:
-            int_path = f"{self.src_dir}/{entry}/int.json"
-            raw = json.load(open(int_path))
+            with open(f"{self.src_dir}/{entry}/int.json") as f:
+                raw = json.load(f)
             ints = [InteractionModel(**i) for i in raw]
-            sessions[entry] = [i for i in ints if not self._is_admin_page(i)]
+            sessions[entry] = [i for i in ints if not _is_admin(i.value.payload.page)]
         return sessions
 
     def get_data(self) -> dict:
@@ -57,40 +56,29 @@ class Loader:
         return self.entries, len(self.entries)
 
 class AgentLoader(Loader):
-    """Loader for agent interaction data with simplified schema (direct PayloadModel format)"""
-
-    def _is_admin_page_simple(self, interaction: PayloadModel) -> bool:
-        return interaction.page and interaction.page.startswith("/admin/")
-
     def _load_sessions(self) -> dict:
         sessions = {}
         for entry in self.entries:
-            int_path = f"{self.src_dir}/{entry}/int.json"
-            raw = json.load(open(int_path))
+            with open(f"{self.src_dir}/{entry}/int.json") as f:
+                raw = json.load(f)
             ints = [PayloadModel(**i) for i in raw]
-            sessions[entry] = [i for i in ints if not self._is_admin_page_simple(i)]
+            sessions[entry] = [i for i in ints if not _is_admin(i.page)]
         return sessions
 
 class JointLoader:
-    """Loader for combined human (Kafka) and agent (direct) data without discrimination"""
-
     def __init__(self, human_dir: str, agent_dir: str):
-        self.human_dir = human_dir
-        self.agent_dir = agent_dir
         self.human_loader = Loader(human_dir)
         self.agent_loader = AgentLoader(agent_dir)
-        self.data = self._load_joint_sessions()
+        self.data = self._merge()
         self.entries = list(self.data.keys())
 
-    def _load_joint_sessions(self) -> dict:
-        sessions = {}
-        # load human sessions (unwrap from Kafka format to PayloadModel)
-        for sid, evts in self.human_loader.get_data().items():
-            sessions[f"human_{sid}"] = [evt.value.payload for evt in evts]
-        # load agent sessions (already PayloadModel)
-        for sid, evts in self.agent_loader.get_data().items():
-            sessions[f"agent_{sid}"] = evts
-        return sessions
+    def _merge(self) -> dict:
+        return {
+            **{f"human_{sid}": [e.value.payload for e in evts]
+               for sid, evts in self.human_loader.get_data().items()},
+            **{f"agent_{sid}": evts
+               for sid, evts in self.agent_loader.get_data().items()}
+        }
 
     def get_data(self) -> dict:
         return self.data
@@ -99,16 +87,11 @@ class JointLoader:
         return self.entries, len(self.entries)
 
 if __name__ == "__main__":
-    AGENT_DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
-    loader = AgentLoader(AGENT_DIR)
-    _, n = loader.get_entries()
-    print(f"Loaded {n} agent sessions from {AGENT_DIR}")
+    agent_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
+    human_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
 
-    HUMAN_DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
-    loader = Loader(HUMAN_DIR)
-    _, n = loader.get_entries()
-    print(f"Loaded {n} human sessions from {HUMAN_DIR}")
-
-    joint_loader = JointLoader(HUMAN_DIR, AGENT_DIR)
-    _, n = joint_loader.get_entries()
-    print(f"Loaded {n} total sessions (combined) from joint loader")
+    for name, cls, path in [("agent", AgentLoader, agent_dir),
+                             ("human", Loader, human_dir),
+                             ("joint", lambda d: JointLoader(human_dir, d), agent_dir)]:
+        ldr = cls(path) if name != "joint" else cls(agent_dir)
+        print(f"Loaded {len(ldr.get_entries()[0])} {name} sessions")

From e9cf5f07367e3ad85b94caaf038eb7a0e6f8d852 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Tue, 13 Jan 2026 16:51:00 +0100
Subject: [PATCH 86/99] refactor models computations

---
 sim/rl/behavior_loader/models.py | 186 ++++++++++++-------------------
 1 file changed, 69 insertions(+), 117 deletions(-)

diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py
index 46ac99d..84c2fe4 100644
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -1,16 +1,12 @@
-from experiments.agents.base import Agent
 from loader import Loader, AgentLoader, JointLoader
 from collections import defaultdict
 from typing import Dict, List, Tuple, Set
 import numpy as np
 import graphviz
 
-DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
-AGENT_DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
-
 class BehaviorModel:
-    def __init__(self, src_dir: str = DIR):
-        self.loader = Loader(src_dir)
+    def __init__(self, src_dir: str, loader_cls=Loader):
+        self.loader = loader_cls(src_dir)
         self.data = self.loader.get_data()
         self.entries, self.num_entries = self.loader.get_entries()
         self.mdp = None
@@ -19,50 +15,48 @@ class BehaviorModel:
         p = evt.value.payload
         return f"{p.page or 'unk'}|{p.productId or 'none'}|{p.eventName}"
 
-    def _extract_sessions(self):
-        # transform raw events into sequential state trajectories per session
-        trajectories = []
-        for sid, evts in self.data.items():
-            if len(evts) < 2: continue
-            states = [self._state_repr(e) for e in sorted(evts, key=lambda x: x.timestamp)]
-            trajectories.append(states)
-        return trajectories
+    def _sort_key(self, evt):
+        return evt.timestamp
 
-    def _calc_transitions(self, trajectories: List[List[str]]) -> Tuple[Dict, Set]:
-        trans = defaultdict(lambda: defaultdict(int))
-        states = set()
-        for traj in trajectories:
-            for i in range(len(traj) - 1):
-                s, s_next = traj[i], traj[i+1]
+    def _extract_sessions(self) -> List[List[str]]:
+        trajs = []
+        for evts in self.data.values():
+            if len(evts) < 2: continue
+            states = [self._state_repr(e) for e in sorted(evts, key=self._sort_key)]
+            trajs.append(states)
+        return trajs
+
+    def _calc_transitions(self, trajs: List[List[str]]) -> Tuple[Dict, Set]:
+        trans, states = defaultdict(lambda: defaultdict(int)), set()
+        for traj in trajs:
+            for s, s_next in zip(traj, traj[1:]):
                 trans[s][s_next] += 1
                 states.update([s, s_next])
         return trans, states
 
-    def _calc_rewards(self, trajectories: List[List[str]]) -> Dict:
-        # reward based on session progression depth
+    def _calc_rewards(self, trajs: List[List[str]]) -> Dict:
         rwd = defaultdict(list)
-        for traj in trajectories:
+        for traj in trajs:
             n = len(traj)
             for i, s in enumerate(traj):
                 rwd[s].append(i / n)
         return rwd
 
-    def _normalize_trans(self, counts: Dict) -> Dict:
+    def _normalize_trans(self, cnts: Dict) -> Dict:
         return {s: {s_n: cnt/sum(nxt.values()) for s_n, cnt in nxt.items()}
-                for s, nxt in counts.items()}
+                for s, nxt in cnts.items()}
 
     def build_MDP(self) -> Dict:
         trajs = self._extract_sessions()
         trans_cnt, states = self._calc_transitions(trajs)
         trans_prob = self._normalize_trans(trans_cnt)
         state_rwd = self._calc_rewards(trajs)
-        state_val = {s: np.mean(r) for s, r in state_rwd.items()}
 
         self.mdp = {
-            'states': sorted(list(states)),
+            'states': sorted(states),
             'num_states': len(states),
             'transitions': trans_prob,
-            'state_values': state_val,
+            'state_values': {s: np.mean(r) for s, r in state_rwd.items()},
             'state_rewards': state_rwd,
             'trans_counts': trans_cnt,
         }
@@ -78,8 +72,7 @@ class BehaviorModel:
 
     def sample_traj(self, start: str, max_len: int = 50) -> List[str]:
         if not self.mdp: raise ValueError("build MDP first")
-        path = [start]
-        curr = start
+        path, curr = [start], start
         for _ in range(max_len):
             nxt = self.mdp['transitions'].get(curr, {})
             if not nxt: break
@@ -88,154 +81,113 @@ class BehaviorModel:
         return path
 
 class AgentBehaviorModel(BehaviorModel):
-    """behavior model for agent interaction data (simplified PayloadModel schema)"""
-
-    def __init__(self, src_dir: str = AGENT_DIR):
-        self.loader = AgentLoader(src_dir)
-        self.data = self.loader.get_data()
-        self.entries, self.num_entries = self.loader.get_entries()
-        self.mdp = None
+    def __init__(self, src_dir: str):
+        super().__init__(src_dir, AgentLoader)
 
     def _state_repr(self, evt) -> str:
-        # direct access to PayloadModel fields (no .value.payload nesting)
         return f"{evt.page or 'unk'}|{evt.productId or 'none'}|{evt.eventName}"
 
-    def _extract_sessions(self):
-        trajectories = []
-        for sid, evts in self.data.items():
-            if len(evts) < 2: continue
-            # sort by timestamp string (ISO format sorts lexicographically)
-            states = [self._state_repr(e) for e in sorted(evts, key=lambda x: x.ts)]
-            trajectories.append(states)
-        return trajectories
+    def _sort_key(self, evt):
+        return evt.ts
 
 class JointBehaviorModel(BehaviorModel):
-    """behavior model for combined human+agent data (flat PayloadModel distribution)"""
-
-    def __init__(self, human_dir: str = DIR, agent_dir: str = AGENT_DIR):
+    def __init__(self, human_dir: str, agent_dir: str):
         self.loader = JointLoader(human_dir, agent_dir)
         self.data = self.loader.get_data()
         self.entries, self.num_entries = self.loader.get_entries()
         self.mdp = None
 
     def _state_repr(self, evt) -> str:
-        # direct access to PayloadModel fields (JointLoader unwraps to PayloadModel)
         return f"{evt.page or 'unk'}|{evt.productId or 'none'}|{evt.eventName}"
 
-    def _extract_sessions(self):
-        trajectories = []
-        for sid, evts in self.data.items():
-            if len(evts) < 2: continue
-            # sort by timestamp string (ISO format sorts lexicographically)
-            states = [self._state_repr(e) for e in sorted(evts, key=lambda x: x.ts)]
-            trajectories.append(states)
-        return trajectories
+    def _sort_key(self, evt):
+        return evt.ts
 
 def aggregate_event_transitions(mdp: Dict) -> Dict[str, Dict[str, float]]:
-    """aggregate state transitions by event type and normalize"""
     evt_trans = defaultdict(lambda: defaultdict(float))
     for s, trans in mdp['transitions'].items():
-        evt_src = s.split('|')[2]
+        src = s.split('|')[2]
         for s_next, prob in trans.items():
-            evt_dst = s_next.split('|')[2]
-            evt_trans[evt_src][evt_dst] += prob
+            dst = s_next.split('|')[2]
+            evt_trans[src][dst] += prob
 
-    # normalize aggregated transitions
-    for evt_src in evt_trans:
-        total = sum(evt_trans[evt_src].values())
+    for src in evt_trans:
+        total = sum(evt_trans[src].values())
         if total > 0:
-            for evt_dst in evt_trans[evt_src]:
-                evt_trans[evt_src][evt_dst] /= total
+            evt_trans[src] = {dst: p/total for dst, p in evt_trans[src].items()}
     return dict(evt_trans)
 
-def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "mdp_graph", fmt: str = "svg", view: bool = False, export_dot: bool = False):
-    """visualize MDP as directed graph using graphviz, aggregated by event type"""
+def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "mdp_graph",
+                  fmt: str = "svg", view: bool = False, export_dot: bool = False):
     if not model.mdp: raise ValueError("build MDP first")
 
     evt_trans = aggregate_event_transitions(model.mdp)
-
     g = graphviz.Digraph(format=fmt)
     g.attr(rankdir='LR', size='30')
     g.attr('node', shape='circle', width='1', height='1')
 
-    # collect all event types
-    events = set(evt_trans.keys())
-    for trans in evt_trans.values():
-        events.update(trans.keys())
-
-    # add nodes for each event type
+    events = set(evt_trans.keys()) | {e for trans in evt_trans.values() for e in trans.keys()}
     for evt in events:
         g.node(evt)
 
-    # add edges above threshold
-    for evt_src in evt_trans:
-        for evt_dst, prob in evt_trans[evt_src].items():
+    for src, dsts in evt_trans.items():
+        for dst, prob in dsts.items():
             if prob > threshold:
-                g.edge(evt_src, evt_dst, label=f'{prob:.2f}')
+                g.edge(src, dst, label=f'{prob:.2f}')
 
     g.render(output, view=view, cleanup=True)
     print(f"Saved MDP graph to {output}.{fmt}")
 
     if export_dot:
-        dot_file = f"{output}.dot"
-        with open(dot_file, 'w') as f:
+        with open(f"{output}.dot", 'w') as f:
             f.write(g.source)
-        print(f"Exported DOT source to {dot_file}")
+        print(f"Exported DOT source to {output}.dot")
 
     return g
 
-
 def kl_divergence(p: Dict[str, float], q: Dict[str, float]) -> float:
-    """Compute KL divergence D_KL(P || Q) for discrete distributions P and Q."""
-    epsilon = 1e-10  # small constant to avoid log(0)
-    kl_div = 0.0
-    for key in p:
-        p_val = p[key] + epsilon
-        q_val = q.get(key, 0.0) + epsilon
-        kl_div += p_val * np.log(p_val / q_val)
-    return kl_div
+    eps = 1e-10
+    return sum((p[k] + eps) * np.log((p[k] + eps) / (q.get(k, 0.0) + eps)) for k in p)
 
 if __name__ == "__main__":
-    human_model = BehaviorModel(DIR)
+    base_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments"
+    human_dir, agent_dir = f"{base_dir}/collected_data/", f"{base_dir}/agents/collected_data/"
+
+    human_model = BehaviorModel(human_dir)
     human_mdp = human_model.build_MDP()
-    print(f"Built MDP: {human_mdp['num_states']} states, {sum(len(t) for t in human_mdp['transitions'].values())} transitions")
+    print(f"Built MDP: {human_mdp['num_states']} states, "
+          f"{sum(len(t) for t in human_mdp['transitions'].values())} transitions")
     if not human_mdp['states']:
-        print("No states found")
-        exit(1)
+        exit("No states found")
     visualize_mdp(human_model, threshold=0.05, output="human_mdp_viz", fmt="pdf", export_dot=True)
 
-    agent_model = AgentBehaviorModel()
+    agent_model = AgentBehaviorModel(agent_dir)
     agent_mdp = agent_model.build_MDP()
-    print(f"AGENT... Built MDP: {agent_mdp['num_states']} states, {sum(len(t) for t in agent_mdp['transitions'].values())} transitions")
+    print(f"AGENT... Built MDP: {agent_mdp['num_states']} states, "
+          f"{sum(len(t) for t in agent_mdp['transitions'].values())} transitions")
     if not agent_mdp['states']:
-        print("No states found")
-        exit(1)
+        exit("No states found")
     visualize_mdp(agent_model, threshold=0.05, output="agent_mdp_viz", fmt="pdf", export_dot=True)
 
-    # aggregate transitions by event type for both models
-    human_evt_trans = aggregate_event_transitions(human_mdp)
-    agent_evt_trans = aggregate_event_transitions(agent_mdp)
+    human_evt = aggregate_event_transitions(human_mdp)
+    agent_evt = aggregate_event_transitions(agent_mdp)
+    common = set(human_evt.keys()) & set(agent_evt.keys())
 
-    common_evts = set(human_evt_trans.keys()) & set(agent_evt_trans.keys())
-    if not common_evts: import sys; sys.exit("No common event types for KL divergence analysis")
+    if not common:
+        exit("No common event types for KL divergence analysis")
 
-    kl_divs = []
-    for evt in common_evts:
-        kl = kl_divergence(human_evt_trans[evt], agent_evt_trans[evt])
-        kl_divs.append((evt, kl))
+    kl_divs = sorted([(e, kl_divergence(human_evt[e], agent_evt[e])) for e in common],
+                     key=lambda x: x[1], reverse=True)
 
-    kl_divs.sort(key=lambda x: x[1], reverse=True)
-    avg_kl = np.mean([kl for _, kl in kl_divs])
-
-    print(f"Average KL divergence: {avg_kl:.4f}")
-    print(f"\nMost divergent event types:")
+    print(f"Average KL divergence: {np.mean([kl for _, kl in kl_divs]):.4f}")
+    print("\nMost divergent event types:")
     for evt, kl in kl_divs:
         print(f"  {evt}: {kl:.4f}")
 
-    # build joint model (combined distribution)
     print("\n=== Joint Model (Human + Agent Combined) ===")
-    joint_model = JointBehaviorModel()
+    joint_model = JointBehaviorModel(human_dir, agent_dir)
     joint_mdp = joint_model.build_MDP()
-    print(f"Built joint MDP: {joint_mdp['num_states']} states, {sum(len(t) for t in joint_mdp['transitions'].values())} transitions")
+    print(f"Built joint MDP: {joint_mdp['num_states']} states, "
+          f"{sum(len(t) for t in joint_mdp['transitions'].values())} transitions")
     if joint_mdp['states']:
         visualize_mdp(joint_model, threshold=0.05, output="joint_mdp_viz", fmt="pdf", export_dot=True)

From 0ce12fbc3beb086f627b1269e470170f76c319d2 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Tue, 13 Jan 2026 19:50:36 +0100
Subject: [PATCH 87/99] chore: ignores

---
 .gitignore | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/.gitignore b/.gitignore
index 9db7742..ef6746f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,18 +5,22 @@
 **/.virtual_documents/
 **/session_*.svg
 **/*graph.svg
-paper/src/bib/auto
+**/auto/*.el
+*.old
+**/package-lock.json
+**/*.parquet
 
-# Airflow logs - exclude DAG run logs
+paper/src/auto/*
+paper/src/bib/auto
+docs/goals/*.md
+PHANTOM.wiki/
 experiments/airflow/logs/*
 experiments/airflow/logs/scheduler/
 experiments/airflow/logs/dag_processor_manager/
-experiments/collected_data/*
-
-paper/src/auto/*
-lib/
-docs/goals/*.md
-PHANTOM.wiki/
+experiments/collected_data/
+experiments/agents/collected_data/
+sim/rl/behavior_loader/*.dot
+sim/rl/behavior_loader/*.png
+sim/rl/behavior_loader/*.svg
+sim/rl/behavior_loader/*.pdf
 tests/e2e/node_modules/**
-**/auto/*.el
-*.old

From 7b2d80ac4c96f4583028e4a049265be459662a72 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Tue, 20 Jan 2026 21:00:47 +0100
Subject: [PATCH 88/99] feat: wip contaminator

---
 experiments/procesing/contaminator.py | 44 +++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 experiments/procesing/contaminator.py

diff --git a/experiments/procesing/contaminator.py b/experiments/procesing/contaminator.py
new file mode 100644
index 0000000..0a3651d
--- /dev/null
+++ b/experiments/procesing/contaminator.py
@@ -0,0 +1,44 @@
+import pandas as pd
+import random
+from sim.rl.behavior_loader import AgentBehaviorModel
+
+base_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments"
+human_dir, agent_dir = f"{base_dir}/collected_data/", f"{base_dir}/agents/collected_data/"
+
+
+
+def remap_schema(df : pd.DataFrame, mapping: dict, on: str = "event_type"):
+    df = df.copy()
+    df[on] = df[on].map(mapping).fillna(df[on])
+    return df
+
+
+def contaminate_dataset(df : pd.DataFrame, on : str = "event_type",
+                        contamination_rate: float = 0.1) -> pd.DataFrame:
+    model = AgentBehaviorModel(agent_dir)
+    target_df_schema = df[on].unique().tolist()
+    mapping = {
+        'view': 'view_page'
+        # TODO: define properly for the given dataset
+    }
+    OG_event_distribution = df[on].value_counts(normalize=True).to_dict()
+    # normalize to weights
+    OG_event_distribution = {k: v / sum(OG_event_distribution.values()) for k, v in OG_event_distribution.items()}
+    mapped_df = remap_schema(df, mapping, on=on)
+    N = len(df)
+    N_final = N / (1 - contamination_rate) # TODO: explain this in paper
+    N_contaminate = int(N_final - N)
+    start_event_types = random.choices(list(OG_event_distribution.keys()),
+                                    weights=list(OG_event_distribution.values()), k=N_contaminate)
+    # it makes sense
+    new_trajectories = []
+    for start_event in start_event_types:
+        # sample from og start
+        start = None # TODO: define start according to dataset (randomly sample with weights of event distribution)
+        trajectory = model.sample_trajectory(start) # TODO: explain this method in paper
+        new_trajectories.extend(trajectory)
+
+    # TODO: make sure the schema of the new trajectories conforms to the dataset
+    contaminate_df = pd.DataFrame(new_trajectories)
+    df = pd.concat([df, contaminate_df], ignore_index=True)
+    return df

From b2f0746c01585a4fc6189feed7b0244be4d5be3b Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 21 Jan 2026 11:11:49 +0100
Subject: [PATCH 89/99] chore: extra commenting

---
 experiments/procesing/contaminator.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/experiments/procesing/contaminator.py b/experiments/procesing/contaminator.py
index 0a3651d..da44c3d 100644
--- a/experiments/procesing/contaminator.py
+++ b/experiments/procesing/contaminator.py
@@ -1,9 +1,9 @@
 import pandas as pd
 import random
-from sim.rl.behavior_loader import AgentBehaviorModel
+from sim.rl.behavior_loader import AgentBehaviorModel # TODO: import this properly
 
 base_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments"
-human_dir, agent_dir = f"{base_dir}/collected_data/", f"{base_dir}/agents/collected_data/"
+agent_dir = f"{base_dir}/agents/collected_data/"
 
 
 
@@ -21,6 +21,7 @@ def contaminate_dataset(df : pd.DataFrame, on : str = "event_type",
         'view': 'view_page'
         # TODO: define properly for the given dataset
     }
+    # think about replacing with freqdist method from library
     OG_event_distribution = df[on].value_counts(normalize=True).to_dict()
     # normalize to weights
     OG_event_distribution = {k: v / sum(OG_event_distribution.values()) for k, v in OG_event_distribution.items()}

From 04907df393149c61ff3efd3ffcdfa37bed2d8db5 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 21 Jan 2026 11:27:03 +0100
Subject: [PATCH 90/99] feat: weak train scaffold

---
 experiments/ml/arch.py       | 117 +++--------------------------------
 experiments/ml/weak.train.py |  30 +++++++++
 2 files changed, 39 insertions(+), 108 deletions(-)
 create mode 100644 experiments/ml/weak.train.py

diff --git a/experiments/ml/arch.py b/experiments/ml/arch.py
index 4f36e18..a187959 100644
--- a/experiments/ml/arch.py
+++ b/experiments/ml/arch.py
@@ -12,111 +12,12 @@ TASK = 'classification'
 LABELS = ['human', 'agent']
 
 
-class BaseAgentClassifier(BaseEstimator, ClassifierMixin, ABC):
-    """Base class for tree-based agent detection classifiers with common logic"""
-
-    def __init__(self, context: Optional[PipelineContext] = None, n_estimators: int = 200,
-                 max_depth: int = 6, learning_rate: float = 0.05,
-                 early_stopping_rounds: int = 20):
-        self.context = context
-        self.n_estimators = n_estimators
-        self.max_depth = max_depth
-        self.learning_rate = learning_rate
-        self.early_stopping_rounds = early_stopping_rounds
-        self.model_ = None
-        self.feature_names_ = None
-
-    def _to_array(self, X):
-        """Convert pandas structures to numpy arrays"""
-        return X.values if isinstance(X, (pd.DataFrame, pd.Series)) else X
-
-    def _compute_pos_weight(self, y_arr):
-        """Calculate scale_pos_weight for class imbalance handling"""
-        n_neg, n_pos = (y_arr == 0).sum(), (y_arr == 1).sum()
-        return n_neg / n_pos if n_pos > 0 else 1.0
-
-    def _prepare_eval_set(self, eval_set):
-        """Convert eval_set to numpy arrays if needed"""
-        if not eval_set:
-            return None
-        X_val, y_val = eval_set[0]
-        return [(self._to_array(X_val), self._to_array(y_val))]
-
-    @abstractmethod
-    def _build_model(self, scale_pos: float):
-        """Build the underlying model instance (must be implemented by subclasses)"""
-        pass
-
-    @abstractmethod
-    def _fit_with_eval(self, X_arr, y_arr, eval_arr):
-        """Fit model with evaluation set (must be implemented by subclasses)"""
-        pass
-
-    def fit(self, X, y, eval_set=None):
-        X_arr, y_arr = self._to_array(X), self._to_array(y)
-
-        if isinstance(X, pd.DataFrame):
-            self.feature_names_ = X.columns.tolist()
-
-        scale_pos = self._compute_pos_weight(y_arr)
-        self.model_ = self._build_model(scale_pos)
-
-        eval_arr = self._prepare_eval_set(eval_set)
-        if eval_arr:
-            self._fit_with_eval(X_arr, y_arr, eval_arr)
-        else:
-            self.model_.fit(X_arr, y_arr)
-
-        return self
-
-    def predict(self, X):
-        return self.model_.predict(self._to_array(X))
-
-    def predict_proba(self, X):
-        return self.model_.predict_proba(self._to_array(X))
-
-    @property
-    def feature_importances_(self):
-        return self.model_.feature_importances_ if self.model_ else None
-
-
-class XGBoostAgentClassifier(BaseAgentClassifier):
-    """XGBoost binary classifier for agent detection with class imbalance handling"""
-
-    def _build_model(self, scale_pos: float):
-        return xgb.XGBClassifier(
-            n_estimators=self.n_estimators,
-            max_depth=self.max_depth,
-            learning_rate=self.learning_rate,
-            scale_pos_weight=scale_pos,
-            eval_metric='auc',
-            early_stopping_rounds=self.early_stopping_rounds,
-            random_state=42,
-            tree_method='hist',
-            enable_categorical=False
-        )
-
-    def _fit_with_eval(self, X_arr, y_arr, eval_arr):
-        self.model_.fit(X_arr, y_arr, eval_set=eval_arr, verbose=False)
-
-
-class LightGBMAgentClassifier(BaseAgentClassifier):
-    """LightGBM binary classifier for agent detection with class imbalance handling"""
-
-    def _build_model(self, scale_pos: float):
-        return lgb.LGBMClassifier(
-            n_estimators=self.n_estimators,
-            max_depth=self.max_depth,
-            learning_rate=self.learning_rate,
-            scale_pos_weight=scale_pos,
-            metric='auc',
-            random_state=42,
-            verbosity=-1
-        )
-
-    def _fit_with_eval(self, X_arr, y_arr, eval_arr):
-        self.model_.fit(
-            X_arr, y_arr,
-            eval_set=eval_arr,
-            callbacks=[lgb.early_stopping(self.early_stopping_rounds, verbose=False)]
-        )
+class WeakClassifier(BaseEstimator, ClassifierMixin, ABC):
+    # a simple contrastive machine learning model
+    # this model should learn to distinguish between human and agent behavior
+    # using a weakly supervised approach and contrastive learning + augmentation
+    #
+    def __init__(self, **kwargs):
+        super().__init__()
+        self.model = None
+        self.kwargs = kwargs
diff --git a/experiments/ml/weak.train.py b/experiments/ml/weak.train.py
new file mode 100644
index 0000000..36e11ee
--- /dev/null
+++ b/experiments/ml/weak.train.py
@@ -0,0 +1,30 @@
+from sim.rl.behavior_loader.loader import AgentLoader, Loader, JointLoader
+from sim.rl.behavior_loader.loader import PayloadModel
+from arch import WeakClassifier
+
+agent_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
+human_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
+
+def augment_trajectory(trajectory : list[PayloadModel], augmentation_rate: float = 0.1) -> list[PayloadModel]:
+    # augmentations possible:
+    # return a sub-trajectory window of the original trajectory
+    # insert random noise events
+    # shuffle a few events (find a few indices and swap them with i+1 neighbor)
+    # adjust metadata
+    return trajectory
+
+
+def train():
+    pass
+
+
+
+if __name__ == "__main__":
+    joint_loader = JointLoader(human_dir, agent_dir)
+    data = joint_loader.get_data()
+    entries, num_entries = joint_loader.get_entries()
+    print(f"Loaded {num_entries} entries")
+    # TODO: augment
+    # fit model
+    model = WeakClassifier()
+    model.fit(data)

From b05b510f7098778c84ea84636f7958ded3e558d3 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 21 Jan 2026 14:05:30 +0100
Subject: [PATCH 91/99] strong dataset gathering

---
 sim/strong_learner/data.py | 99 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)
 create mode 100644 sim/strong_learner/data.py

diff --git a/sim/strong_learner/data.py b/sim/strong_learner/data.py
new file mode 100644
index 0000000..80129aa
--- /dev/null
+++ b/sim/strong_learner/data.py
@@ -0,0 +1,99 @@
+import os, requests, py7zr
+import pandas as pd
+from typing import Generator
+try:
+    from sim.rl.behavior_loader.loader import PayloadModel, ValueModel, InteractionModel, Loader
+except ImportError:
+    from loader import PayloadModel, ValueModel, InteractionModel, Loader
+
+class YooChooseLoader(Loader):
+    URL = "https://s3-eu-west-1.amazonaws.com/yc-rdata/yoochoose-data.7z"
+    CLICK_COLS = ['session_id', 'ts', 'item_id', 'category']
+    BUY_COLS = ['session_id', 'ts', 'item_id', 'price', 'quantity']
+
+    def __init__(self, root_dir: str = "data/yoochoose", chunk_size: int = 500_000, max_sessions: int = 1000):
+        self.root = root_dir
+        self.chunk_size = chunk_size
+        self.max_sessions = max_sessions
+        self.click_path = f"{root_dir}/yoochoose-clicks.dat"
+        self.buy_path = f"{root_dir}/yoochoose-buys.dat"
+        if not os.path.exists(self.click_path): self._setup()
+        self.data = self._load_sessions(max_sessions)
+        self.entries = list(self.data.keys())
+
+    def _setup(self):
+        os.makedirs(self.root, exist_ok=True)
+        zip_path = f"{self.root}/temp.7z"
+        with requests.get(self.URL, stream=True) as r:
+            with open(zip_path, 'wb') as f:
+                for chunk in r.iter_content(8192): f.write(chunk)
+        with py7zr.SevenZipFile(zip_path, 'r') as z: z.extractall(self.root)
+        os.remove(zip_path)
+
+    def _make_interaction(self, sid: str, ts: str, item_id: str, event: str, page: str, meta: dict) -> InteractionModel:
+        payload = PayloadModel(
+            sessionId=sid, experimentId=None, eventName=event,
+            page=page, productId=item_id, metadata=meta,
+            storeMode="yoochoose", userAgent="dataset", ts=ts
+        )
+        return InteractionModel(
+            partitionID=0, offset=0, timestamp=0, compression="",
+            isTransactional=False, headers=[], key={},
+            value=ValueModel(payload=payload, encoding="json", isPayloadNull=False, schemaId=1, size=0)
+        )
+
+    def _parse_category(self, cat) -> str:
+        if pd.isna(cat) or cat == "0": return "unknown"
+        if cat == "S": return "special_offer"
+        try:
+            n = int(cat)
+            return f"category_{n}" if 1 <= n <= 12 else f"brand_{n}"
+        except: return str(cat)
+
+    def stream_clicks(self) -> Generator[InteractionModel, None, None]:
+        with pd.read_csv(self.click_path, names=self.CLICK_COLS, chunksize=self.chunk_size, header=None) as reader:
+            for chunk in reader:
+                for r in chunk.itertuples(index=False):
+                    yield self._make_interaction(
+                        str(r.session_id), r.ts, str(r.item_id),
+                        "view_item_page", self._parse_category(r.category), {}
+                    )
+
+    def stream_buys(self) -> Generator[InteractionModel, None, None]:
+        with pd.read_csv(self.buy_path, names=self.BUY_COLS, chunksize=self.chunk_size, header=None) as reader:
+            for chunk in reader:
+                for r in chunk.itertuples(index=False):
+                    yield self._make_interaction(
+                        str(r.session_id), r.ts, str(r.item_id),
+                        "purchase_complete", "/checkout", {"price": r.price, "quantity": r.quantity}
+                    )
+
+    def stream(self) -> Generator[InteractionModel, None, None]:
+        yield from self.stream_clicks()
+        yield from self.stream_buys()
+
+    def _load_sessions(self, max_sessions: int | None = None) -> dict:
+        sessions = {}
+        for interaction in self.stream():
+            sid = interaction.value.payload.sessionId
+            if sid not in sessions:
+                if max_sessions and len(sessions) >= max_sessions: continue
+                sessions[sid] = []
+            sessions[sid].append(interaction)
+        for sid in sessions: sessions[sid].sort(key=lambda x: x.value.payload.ts)
+        return sessions
+
+    def get_data(self) -> dict:
+        return self.data
+
+    def get_entries(self) -> tuple[list[str], int]:
+        return self.entries, len(self.entries)
+
+if __name__ == "__main__":
+    loader = YooChooseLoader(max_sessions=100)
+    views, purchases = 0, 0
+    for sid, evts in loader.get_data().items():
+        for e in evts:
+            if e.value.payload.eventName == "view_item_page": views += 1
+            elif e.value.payload.eventName == "purchase_complete": purchases += 1
+    print(f"Loaded {len(loader.entries)} sessions: {views} view_item_page, {purchases} purchase_complete")

From 440371dba40e39ce5159cb1edc21a899fcd0740b Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 21 Jan 2026 14:05:39 +0100
Subject: [PATCH 92/99] feat: initial feature engineering of trajectories

---
 sim/rl/behavior_loader/models.py | 49 +++++++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py
index 84c2fe4..4c6bf21 100644
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -1,4 +1,7 @@
-from loader import Loader, AgentLoader, JointLoader
+try:
+    from loader import Loader, AgentLoader, JointLoader
+except ImportError:
+    from sim.rl.behavior_loader.loader import Loader, AgentLoader, JointLoader
 from collections import defaultdict
 from typing import Dict, List, Tuple, Set
 import numpy as np
@@ -80,6 +83,50 @@ class BehaviorModel:
             path.append(curr)
         return path
 
+    def extract_trajectory_features(self, events: List, max_trans_dim: int = 50) -> np.ndarray:
+        """Return a float32 vector of max_trans_dim + 8 features describing `events` relative to self.mdp"""
+        if not self.mdp:
+            self.build_MDP()  # presumably populates self.mdp as a side effect -- TODO confirm
+
+        states = [self._state_repr(e) for e in sorted(events, key=self._sort_key)]
+        features = []
+
+        # transition histogram over the first max_trans_dim (state, next_state) pairs listed by the MDP
+        trans_counts = defaultdict(int)
+        for s, s_next in zip(states, states[1:]):
+            trans_counts[(s, s_next)] += 1
+        all_trans = [(s, t) for s in self.mdp['states'] for t in self.mdp['transitions'].get(s, {}).keys()]
+        trans_vec = [trans_counts.get(tr, 0) for tr in all_trans[:max_trans_dim]]  # NOTE(review): bin order follows mdp['states'] iteration order -- confirm it is stable
+        trans_vec = trans_vec + [0] * (max_trans_dim - len(trans_vec))  # zero-pad up to max_trans_dim entries
+        total_trans = sum(trans_counts.values()) or 1  # `or 1` avoids dividing by zero when there are no transitions
+        features.extend([v / total_trans for v in trans_vec])
+
+        # state coverage ratio: distinct visited states / mdp['num_states']
+        visited = set(states)
+        features.append(len(visited) / max(self.mdp['num_states'], 1))
+
+        # sum of -p*log(p) over consecutive state pairs, p from self.transition_prob, divided by the state count
+        if len(states) > 1:
+            trans_probs = [self.transition_prob(s, s_n) for s, s_n in zip(states, states[1:])]
+            entropy = -sum(p * np.log(p + 1e-10) for p in trans_probs if p > 0)
+            features.append(entropy / max(len(states), 1))
+        else:
+            features.append(0.0)
+
+        # trajectory length and unique state count (raw counts, not normalized)
+        features.append(len(states))
+        features.append(len(visited))
+
+        # mean/std/min/max of self.state_value along the trajectory (zeros for an empty trajectory)
+        vals = [self.state_value(s) for s in states]
+        if vals:
+            features.extend([np.mean(vals), np.std(vals), np.min(vals), np.max(vals)])
+        else:
+            features.extend([0.0, 0.0, 0.0, 0.0])
+
+        return np.array(features, dtype=np.float32)
+
+
 class AgentBehaviorModel(BehaviorModel):
     def __init__(self, src_dir: str):
         super().__init__(src_dir, AgentLoader)

From 00e3eff2fadbc4b6153220971c68729464b8b46b Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 21 Jan 2026 18:22:31 +0100
Subject: [PATCH 93/99] migrating weak learning

---
 experiments/ml/weak.train.py |  30 -----
 experiments/ml/weak_train.py | 246 +++++++++++++++++++++++++++++++++++
 2 files changed, 246 insertions(+), 30 deletions(-)
 delete mode 100644 experiments/ml/weak.train.py
 create mode 100644 experiments/ml/weak_train.py

diff --git a/experiments/ml/weak.train.py b/experiments/ml/weak.train.py
deleted file mode 100644
index 36e11ee..0000000
--- a/experiments/ml/weak.train.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from sim.rl.behavior_loader.loader import AgentLoader, Loader, JointLoader
-from sim.rl.behavior_loader.loader import PayloadModel
-from arch import WeakClassifier
-
-agent_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
-human_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
-
-def augment_trajectory(trajectory : list[PayloadModel], augmentation_rate: float = 0.1) -> list[PayloadModel]:
-    # augmentations possible:
-    # return a sub-trajectory window of the original trajectory
-    # insert random noise events
-    # shuffle a few events (find a few indices and swap them with i+1 neighbor)
-    # adjust metadata
-    return trajectory
-
-
-def train():
-    pass
-
-
-
-if __name__ == "__main__":
-    joint_loader = JointLoader(human_dir, agent_dir)
-    data = joint_loader.get_data()
-    entries, num_entries = joint_loader.get_entries()
-    print(f"Loaded {num_entries} entries")
-    # TODO: augment
-    # fit model
-    model = WeakClassifier()
-    model.fit(data)
diff --git a/experiments/ml/weak_train.py b/experiments/ml/weak_train.py
new file mode 100644
index 0000000..eb87a9c
--- /dev/null
+++ b/experiments/ml/weak_train.py
@@ -0,0 +1,246 @@
+import sys
+sys.path.insert(0, "/home/velocitatem/Documents/Projects/PHANTOM/sim/rl/behavior_loader")
+sys.path.insert(0, "/home/velocitatem/Documents/Projects/PHANTOM/experiments/ml")
+
+from sim.rl.behavior_loader.loader import AgentLoader, Loader, JointLoader, PayloadModel
+from sim.rl.behavior_loader.models import JointBehaviorModel
+from arch import ContrastiveWeakClassifier, contrastive_loss, featurize_trajectory
+from typing import List, Optional, Dict
+from datetime import datetime, timedelta
+from copy import deepcopy
+import numpy as np
+import random
+import torch
+from torch.utils.data import Dataset, DataLoader
+from torch.optim import Adam
+from torch.utils.tensorboard import SummaryWriter
+
+RUNS_DIR = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/ml/runs"
+agent_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
+human_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
+
+
+def _perturb_ts(evt: PayloadModel, jitter_ms: int = 500) -> PayloadModel:
+    """Return a deep copy of evt whose ISO-8601 ts is shifted by a random offset in [-jitter_ms, jitter_ms] ms"""
+    new_evt = deepcopy(evt)
+    try:
+        ts = datetime.fromisoformat(evt.ts.replace('Z', '+00:00'))
+        delta = timedelta(milliseconds=random.randint(-jitter_ms, jitter_ms))
+        new_evt.ts = (ts + delta).isoformat()
+    except (AttributeError, TypeError, ValueError, OverflowError):
+        pass  # best effort: a missing, non-string or unparseable ts leaves the copy unperturbed
+    return new_evt
+
+
+def augment_trajectory(trajectory: List[PayloadModel], rate: float = 0.1) -> List[PayloadModel]:
+    """Return one randomly chosen augmentation (window/shuffle/noise/drop) of trajectory; inputs shorter than 2 come back unchanged"""
+    if len(trajectory) < 2:
+        return trajectory
+
+    aug_type = random.choice(['window', 'shuffle', 'noise', 'drop'])  # `rate` only affects 'shuffle' and 'drop'
+
+    if aug_type == 'window':  # random contiguous sub-sequence (70-100% length, never fewer than 2 events)
+        min_len = max(2, int(len(trajectory) * 0.7))
+        sub_len = random.randint(min_len, len(trajectory))
+        start = random.randint(0, len(trajectory) - sub_len)
+        return trajectory[start:start + sub_len]
+
+    elif aug_type == 'shuffle':  # swap adjacent pairs with probability rate
+        result = list(trajectory)  # shallow copy so the caller's list is not reordered
+        for i in range(len(result) - 1):
+            if random.random() < rate:
+                result[i], result[i + 1] = result[i + 1], result[i]
+        return result
+
+    elif aug_type == 'drop':  # drop events with probability rate
+        result = [e for e in trajectory if random.random() > rate]
+        return result if len(result) >= 2 else trajectory[:2]  # fall back to the first two events if too few survive
+
+    elif aug_type == 'noise':  # perturb timestamps; events are copied by _perturb_ts, list order is kept
+        return [_perturb_ts(e, jitter_ms=500) for e in trajectory]
+
+    return trajectory  # not reached: every aug_type above returns
+
+
+class TripletDataset(Dataset):
+    """Lazily builds (anchor, augmented positive, opposite-class negative, label) samples from labelled sessions"""
+    def __init__(self, data: Dict[str, List[PayloadModel]], mdp: Optional[Dict], augment_fn, input_dim: int = 64, multiplier: int = 10):
+        self.sessions = list(data.items())
+        self.human_ids, self.agent_ids = [], []
+        for pos, (sid, _) in enumerate(self.sessions):  # split session indices by id prefix
+            if sid.startswith('human_'):
+                self.human_ids.append(pos)
+            elif sid.startswith('agent_'):
+                self.agent_ids.append(pos)
+        self.mdp = mdp
+        self.augment = augment_fn
+        self.input_dim, self.multiplier = input_dim, multiplier
+
+        if not (self.human_ids and self.agent_ids):  # each class needs at least one opposite-class session
+            raise ValueError(f"Need both human ({len(self.human_ids)}) and agent ({len(self.agent_ids)}) sessions")
+
+    def __len__(self) -> int:
+        return self.multiplier * len(self.sessions)
+
+    def _as_tensor(self, events) -> torch.Tensor:
+        """Featurize one event list and wrap the result as a float32 tensor"""
+        return torch.tensor(featurize_trajectory(events, self.mdp, self.input_dim), dtype=torch.float32)
+
+    def __getitem__(self, idx: int):
+        sid, events = self.sessions[idx % len(self.sessions)]  # indices wrap around the session list
+        is_human = sid.startswith('human_')
+        anchor = self._as_tensor(events)
+        positive = self._as_tensor(self.augment(events))  # augmented view of the same session
+        # the negative is a randomly drawn session from the other class
+        rivals = self.agent_ids if is_human else self.human_ids
+        negative = self._as_tensor(self.sessions[random.choice(rivals)][1])
+        label = torch.tensor(0 if is_human else 1, dtype=torch.long)  # 0=human, 1=agent
+        return anchor, positive, negative, label
+
+
+def train(epochs: int = 100, lr: float = 1e-3, batch_size: int = 4, input_dim: int = 64,
+          embed_dim: int = 32, margin: float = 0.3, verbose: bool = True, run_name: Optional[str] = None):
+    """Fit encoder + linear head on triplet loss + 0.5 * cross-entropy, logging mean epoch loss; returns (model, ref_mdp)"""
+    joint = JointLoader(human_dir, agent_dir)
+    data = joint.get_data()
+    if verbose:
+        print(f"Loaded {len(data)} sessions")
+
+    joint_model = JointBehaviorModel(human_dir, agent_dir)
+    ref_mdp = joint_model.build_MDP()
+
+    dataset = TripletDataset(data, ref_mdp, augment_trajectory, input_dim=input_dim)
+    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)
+
+    model = ContrastiveWeakClassifier(input_dim=input_dim, embed_dim=embed_dim, margin=margin)
+    model.to_device()
+
+    run_name = run_name or f"d{input_dim}_e{embed_dim}_lr{lr}_m{margin}_{datetime.now():%Y%m%d_%H%M%S}"
+    writer = SummaryWriter(f"{RUNS_DIR}/train/{run_name}")
+
+    optimizer = Adam(list(model.encoder.parameters()) + list(model.classifier.parameters()), lr=lr)
+    ce_loss_fn = torch.nn.CrossEntropyLoss()
+
+    best_loss = float('inf')  # only reported in the final printout; no checkpoint is written
+    for epoch in range(epochs):
+        model.encoder.train()
+        model.classifier.train()
+        total_loss, n_batches = 0.0, 0
+
+        for anchor, positive, negative, labels in loader:
+            anchor, positive, negative, labels = [t.to(model.device) for t in [anchor, positive, negative, labels]]
+            z_a, z_p, z_n = [model.encoder(t.unsqueeze(1)) for t in [anchor, positive, negative]]  # unsqueeze adds a length-1 sequence axis
+
+            trip_loss = contrastive_loss(z_a, z_p, z_n, margin=model.margin)
+            ce = ce_loss_fn(model.classifier(z_a), labels)  # supervises the head on anchor embeddings only
+            loss = trip_loss + 0.5 * ce
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+            total_loss += loss.item()
+            n_batches += 1
+
+        avg_loss = total_loss / max(n_batches, 1)  # max() guards the case where drop_last leaves no batch
+        writer.add_scalar('loss', avg_loss, epoch)
+
+        if verbose and (epoch + 1) % 10 == 0:
+            print(f"Epoch {epoch+1}/{epochs}: loss={avg_loss:.4f}")
+        if avg_loss < best_loss:
+            best_loss = avg_loss
+
+    writer.close()
+    if verbose:
+        print(f"Done. Best={best_loss:.4f} TB:{RUNS_DIR}/train/{run_name}")
+
+    return model, ref_mdp
+
+
+def evaluate_loocv(input_dim: int = 64, embed_dim: int = 32, epochs_per_fold: int = 50,
+                   lr: float = 1e-3, margin: float = 0.3, run_name: Optional[str] = None):
+    """Leave-one-out CV: train a fresh model per held-out session, log metrics; returns (accuracy, predictions, actuals)"""
+    joint = JointLoader(human_dir, agent_dir)
+    data = joint.get_data()
+    session_ids = list(data.keys())
+
+    joint_model = JointBehaviorModel(human_dir, agent_dir)
+    ref_mdp = joint_model.build_MDP()  # NOTE(review): built once from all sessions, including each fold's held-out one
+
+    run_name = run_name or f"loocv_d{input_dim}_e{embed_dim}_m{margin}_{datetime.now():%Y%m%d_%H%M%S}"
+    writer = SummaryWriter(f"{RUNS_DIR}/eval/{run_name}")
+
+    predictions, actuals = [], []
+
+    for fold_idx, test_sid in enumerate(session_ids):
+        train_data = {k: v for k, v in data.items() if k != test_sid}
+        test_events = data[test_sid]
+        test_label = 0 if test_sid.startswith('human_') else 1  # 0=human, 1=agent
+        n_human = sum(1 for k in train_data if k.startswith('human_'))
+        n_agent = sum(1 for k in train_data if k.startswith('agent_'))
+        if n_human == 0 or n_agent == 0:
+            continue  # triplets need at least one session of each class in the training split
+
+        try:
+            dataset = TripletDataset(train_data, ref_mdp, augment_trajectory, input_dim=input_dim, multiplier=5)
+            loader = DataLoader(dataset, batch_size=2, shuffle=True, drop_last=True)
+
+            model = ContrastiveWeakClassifier(input_dim=input_dim, embed_dim=embed_dim, margin=margin)
+            model.to_device()
+            optimizer = Adam(list(model.encoder.parameters()) + list(model.classifier.parameters()), lr=lr)
+
+            model.encoder.train()
+            model.classifier.train()
+            for _ in range(epochs_per_fold):
+                for anchor, positive, negative, labels in loader:
+                    z_a, z_p, z_n = [model.encoder(t.unsqueeze(1).to(model.device)) for t in [anchor, positive, negative]]
+                    ce = torch.nn.functional.cross_entropy(model.classifier(z_a), labels.to(model.device))
+                    loss = contrastive_loss(z_a, z_p, z_n, margin=margin) + 0.5 * ce  # same objective as train(): the head predict() relies on is fitted too
+                    optimizer.zero_grad()
+                    loss.backward()
+                    optimizer.step()
+
+            test_feat = featurize_trajectory(test_events, ref_mdp, input_dim)
+            pred = model.predict(test_feat.reshape(1, -1))[0]
+            predictions.append(pred)
+            actuals.append(test_label)
+            print(f"  {test_sid[:12]}...: pred={pred}, actual={test_label}, {'OK' if pred == test_label else 'MISS'}")
+
+        except Exception as e:  # best effort: a failing fold is reported and left out of the metrics
+            print(f"Error in fold {fold_idx} ({test_sid}): {e}")
+
+    if predictions:
+        acc = sum(p == a for p, a in zip(predictions, actuals)) / len(predictions)
+        tp = sum(1 for p, a in zip(predictions, actuals) if p == 1 and a == 1)  # agent is the positive class
+        fp = sum(1 for p, a in zip(predictions, actuals) if p == 1 and a == 0)
+        fn = sum(1 for p, a in zip(predictions, actuals) if p == 0 and a == 1)
+        prec, rec = tp / max(tp + fp, 1), tp / max(tp + fn, 1)
+        f1 = 2 * prec * rec / max(prec + rec, 1e-10)
+        writer.add_scalar('accuracy', acc, 0)
+        writer.add_scalar('f1', f1, 0)
+        writer.add_scalar('precision', prec, 0)
+        writer.add_scalar('recall', rec, 0)
+        writer.close()
+        print(f"\nAccuracy: {acc:.2%} F1: {f1:.3f} TB:{RUNS_DIR}/eval/{run_name}")
+        return acc, predictions, actuals
+    writer.close()
+    return 0.0, [], []
+
+
+if __name__ == "__main__":
+    import argparse
+    # CLI entry point: --mode selects a full training run or leave-one-out evaluation
+    cli = argparse.ArgumentParser()
+    cli.add_argument('--mode', choices=['train', 'eval'], default='train')
+    cli.add_argument('--epochs', type=int, default=100)
+    cli.add_argument('--lr', type=float, default=1e-3)
+    cli.add_argument('--margin', type=float, default=0.3)
+    cli.add_argument('--input-dim', type=int, default=64)
+    cli.add_argument('--embed-dim', type=int, default=32)
+    cli.add_argument('--run-name', type=str, default=None)
+    args = cli.parse_args()
+    shared = dict(lr=args.lr, input_dim=args.input_dim, embed_dim=args.embed_dim,
+                  margin=args.margin, run_name=args.run_name)
+    if args.mode == 'eval':
+        evaluate_loocv(epochs_per_fold=args.epochs, **shared)  # --epochs counts epochs per fold here
+    else:
+        model, mdp = train(epochs=args.epochs, **shared)

From ccc19f349385511e3b0e9f0cb3a9290c11095bb3 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 21 Jan 2026 18:22:39 +0100
Subject: [PATCH 94/99] adapting some architectures

---
 experiments/ml/__init__.py |  16 ++-
 experiments/ml/arch.py     | 242 +++++++++++++++++++++++++++++++++++--
 2 files changed, 247 insertions(+), 11 deletions(-)

diff --git a/experiments/ml/__init__.py b/experiments/ml/__init__.py
index 11b65df..c97eaa9 100644
--- a/experiments/ml/__init__.py
+++ b/experiments/ml/__init__.py
@@ -1,11 +1,21 @@
 from .evals import evaluate
 from .arch import (
     XGBoostAgentClassifier,
-    LightGBMAgentClassifier
+    LightGBMAgentClassifier,
+    ContrastiveWeakClassifier,
+    TrajectoryEncoder,
+    WeakClassifier,
+    contrastive_loss,
+    featurize_trajectory,
 )
 
-__all__ =[
+__all__ = [
     'evaluate',
     'XGBoostAgentClassifier',
-    'LightGBMAgentClassifier'
+    'LightGBMAgentClassifier',
+    'ContrastiveWeakClassifier',
+    'TrajectoryEncoder',
+    'WeakClassifier',
+    'contrastive_loss',
+    'featurize_trajectory',
 ]
diff --git a/experiments/ml/arch.py b/experiments/ml/arch.py
index a187959..4ceb2e0 100644
--- a/experiments/ml/arch.py
+++ b/experiments/ml/arch.py
@@ -1,23 +1,249 @@
 # sklearn compatible models for agent detection
 from sklearn.base import BaseEstimator, ClassifierMixin
-from procesing.context import PipelineContext
-from typing import Any, Optional, Tuple
+from typing import Any, Optional, Tuple, Dict, List
 from abc import ABC, abstractmethod
-import xgboost as xgb
-import lightgbm as lgb
+from collections import defaultdict
 import numpy as np
 import pandas as pd
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
 
 TASK = 'classification'
 LABELS = ['human', 'agent']
 
 
 class WeakClassifier(BaseEstimator, ClassifierMixin, ABC):
-    # a simple contrastive machine learning model
-    # this model should learn to distinguish between human and agent behavior
-    # using a weakly supervised approach and contrastive learning + augmentation
-    #
+    # a simple contrastive machine learning model learns to distinguish human/agent behavior
+    # using weakly supervised contrastive learning + augmentation
     def __init__(self, **kwargs):
         super().__init__()
         self.model = None
         self.kwargs = kwargs
+
+
+class TrajectoryEncoder(nn.Module):
+    """Bidirectional-LSTM encoder mapping an event sequence to a single unit-length embedding"""
+    def __init__(self, input_dim: int, embed_dim: int = 32, hidden_dim: int = 64):
+        super().__init__()
+        self.event_embed = nn.Linear(input_dim, hidden_dim)
+        self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)
+        self.proj = nn.Linear(2 * hidden_dim, embed_dim)  # input is forward + backward final state
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: (batch, seq_len, input_dim)
+        projected = torch.relu(self.event_embed(x))
+        final_hidden = self.lstm(projected)[1][0]  # h_n, the final hidden state of each direction
+        both_dirs = torch.cat((final_hidden[-2], final_hidden[-1]), dim=1)
+        return F.normalize(self.proj(both_dirs), dim=1)  # L2 normalized
+
+
+class ContrastiveWeakClassifier(WeakClassifier):
+    """Wraps a TrajectoryEncoder plus a 2-way linear head behind sklearn-style predict/predict_proba"""
+    def __init__(self, input_dim: int = 64, embed_dim: int = 32, margin: float = 1.0, **kwargs):
+        super().__init__(**kwargs)
+        self.input_dim = input_dim
+        self.embed_dim = embed_dim
+        self.margin = margin  # stored for training code to read; not used inside this class
+        self.encoder = TrajectoryEncoder(input_dim, embed_dim)
+        self.classifier = nn.Linear(embed_dim, 2)  # logits over the two classes
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self._fitted = False
+
+    def to_device(self):
+        self.encoder.to(self.device)
+        self.classifier.to(self.device)
+        return self
+
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
+        return self.encoder(x.to(self.device))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        emb = self.encode(x)
+        return self.classifier(emb)
+
+    def fit(self, X, y=None):  # sklearn interface only: sets the fitted flag; actual training lives in weak_train.py
+        self._fitted = True
+        return self
+
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        self.encoder.eval()
+        self.classifier.eval()
+        with torch.no_grad():
+            x = torch.tensor(X, dtype=torch.float32).unsqueeze(1).to(self.device)  # assumes X is (n_samples, input_dim); adds a length-1 sequence axis
+            logits = self.forward(x)
+            return torch.argmax(logits, dim=1).cpu().numpy()
+
+    def predict_proba(self, X: np.ndarray) -> np.ndarray:
+        self.encoder.eval()
+        self.classifier.eval()
+        with torch.no_grad():
+            x = torch.tensor(X, dtype=torch.float32).unsqueeze(1).to(self.device)  # same input handling as predict()
+            logits = self.forward(x)
+            return F.softmax(logits, dim=1).cpu().numpy()
+
+
+def contrastive_loss(anchor: torch.Tensor, positive: torch.Tensor, negative: torch.Tensor, margin: float = 0.3) -> torch.Tensor:
+    """Cosine-similarity triplet hinge: batch mean of max(0, sim(a, n) - sim(a, p) + margin)."""
+    gap = F.cosine_similarity(anchor, negative) - F.cosine_similarity(anchor, positive)
+    per_triplet = F.relu(gap + margin)  # zero once the positive beats the negative by at least margin
+    return per_triplet.mean()
+
+
+def nt_xent_loss(z_i: torch.Tensor, z_j: torch.Tensor, temperature: float = 0.5) -> torch.Tensor:
+    """SimCLR-style NT-Xent: cross-entropy over temperature-scaled cosine similarities of the 2N stacked views"""
+    n = z_i.size(0)
+    views = torch.cat([z_i, z_j], dim=0)  # (2N, embed_dim)
+    logits = F.cosine_similarity(views.unsqueeze(1), views.unsqueeze(0), dim=2) / temperature
+    self_pairs = torch.eye(2 * n, dtype=torch.bool, device=views.device)
+    logits = logits.masked_fill(self_pairs, -float('inf'))  # a view may not be matched with itself
+    idx = torch.arange(n, device=views.device)
+    targets = torch.cat([idx + n, idx])  # row k is paired with the other view of the same sample
+    return F.cross_entropy(logits, targets)
+
+
+# feature extraction utilities for trajectory -> feature vector
+def transition_histogram(events: List, state_fn, max_states: int = 50) -> np.ndarray:
+    """Return a float32 vector of length max_states: per-transition counts divided by the total transition count"""
+    if len(events) < 2:
+        return np.zeros(max_states, dtype=np.float32)  # same dtype as the non-empty path
+    states = [state_fn(e) for e in events]
+    trans_counts = defaultdict(int)
+    for s, s_next in zip(states, states[1:]):
+        trans_counts[(s, s_next)] += 1
+    total = sum(trans_counts.values())
+    hist = np.array(list(trans_counts.values())[:max_states], dtype=np.float32)  # bins follow first-occurrence order within this trajectory
+    hist = np.pad(hist, (0, max(0, max_states - len(hist))))  # zero-pad when fewer distinct transitions exist
+    return hist / (total + 1e-10)
+
+
+def temporal_signature(events: List, ts_fn) -> np.ndarray:
+    """Return float32 [mean, std + 1e-10, skew, count] of the gaps between sorted event times (zeros if < 2 events)"""
+    if len(events) < 2:
+        return np.zeros(4, dtype=np.float32)
+    times = sorted([ts_fn(e) for e in events])  # gaps are taken between times in ascending order, not event order
+    diffs = np.diff(times).astype(np.float32)
+    if len(diffs) == 0:
+        return np.zeros(4, dtype=np.float32)
+    mean_dt, std_dt = np.mean(diffs), np.std(diffs) + 1e-10  # epsilon keeps the skew division finite
+    skew = np.mean(((diffs - mean_dt) / std_dt) ** 3) if std_dt > 1e-8 else 0.0  # near-constant gaps report zero skew
+    return np.array([mean_dt, std_dt, skew, len(diffs)], dtype=np.float32)
+
+
+def state_coverage(events: List, state_fn, mdp_states: set) -> float:
+    """Share of the given MDP states that occur at least once in the trajectory (0.0 when no states are given)"""
+    if mdp_states:
+        seen = {state_fn(evt) for evt in events}
+        return len(seen & mdp_states) / len(mdp_states)
+    return 0.0
+
+
+def transition_entropy(events: List, state_fn) -> float:
+    """Shannon entropy (in nats) of the empirical distribution over consecutive state pairs; 0.0 if < 2 events"""
+    if len(events) < 2:
+        return 0.0
+    seq = [state_fn(evt) for evt in events]
+    pair_counts = defaultdict(int)
+    for pair in zip(seq, seq[1:]):
+        pair_counts[pair] += 1
+    n_pairs = sum(pair_counts.values())
+    # the 1e-10 keeps the log argument strictly positive
+    return -sum((c / n_pairs) * np.log(c / n_pairs + 1e-10) for c in pair_counts.values())
+
+
+def featurize_trajectory(events: List, mdp: Optional[Dict] = None, input_dim: int = 64) -> np.ndarray:
+    """Convert a trajectory to a float32 vector: 52 features, truncated or zero-padded to input_dim"""
+    def _state_repr(e):
+        return f"{getattr(e, 'page', None) or 'unk'}|{getattr(e, 'productId', None) or 'none'}|{e.eventName}"
+
+    def _ts_fn(e):
+        ts = getattr(e, 'ts', None)
+        if isinstance(ts, str):
+            from datetime import datetime
+            try:
+                return datetime.fromisoformat(ts.replace('Z', '+00:00')).timestamp()
+            except (ValueError, OverflowError, OSError):  # unparseable or out-of-range timestamp
+                return 0.0
+        return float(ts) if ts else 0.0  # numeric timestamps pass through; missing ones count as 0
+
+    feats = []
+    feats.extend(transition_histogram(events, _state_repr, max_states=40))  # 40 dims
+    feats.extend(temporal_signature(events, _ts_fn))  # 4 dims
+    mdp_states = set(mdp.get('states', [])) if mdp else set()
+    feats.append(state_coverage(events, _state_repr, mdp_states))  # 1 dim
+    feats.append(transition_entropy(events, _state_repr))  # 1 dim
+    feats.append(len(events))  # trajectory length
+    feats.append(len(set(_state_repr(e) for e in events)))  # unique states
+
+    # event type shares by substring match on eventName; the +1 in the denominator also covers empty input
+    event_names = [e.eventName for e in events]
+    feats.append(sum(1 for n in event_names if 'page' in n.lower()) / (len(events) + 1))
+    feats.append(sum(1 for n in event_names if 'hover' in n.lower()) / (len(events) + 1))
+    feats.append(sum(1 for n in event_names if 'cart' in n.lower()) / (len(events) + 1))
+    feats.append(sum(1 for n in event_names if 'purchase' in n.lower() or 'checkout' in n.lower()) / (len(events) + 1))
+
+    # pad/truncate to input_dim
+    feats = np.array(feats[:input_dim], dtype=np.float32)
+    if len(feats) < input_dim:
+        feats = np.pad(feats, (0, input_dim - len(feats)))
+    return feats
+
+
+# gradient boosting classifiers for comparison baselines
+class XGBoostAgentClassifier(BaseEstimator, ClassifierMixin):
+    """XGBoost classifier for human/agent detection from session features (xgboost is imported lazily in fit)"""
+    def __init__(self, n_estimators: int = 100, max_depth: int = 6, learning_rate: float = 0.1, **kwargs):
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.learning_rate = learning_rate
+        self.model = None  # set by fit(); predict*/ raise until then
+        self.kwargs = kwargs  # forwarded verbatim to xgb.XGBClassifier
+
+    def fit(self, X: np.ndarray, y: np.ndarray):
+        try:  # only the optional import is guarded, so errors raised while fitting are not relabelled
+            import xgboost as xgb
+        except ImportError as err:
+            raise ImportError("xgboost required for XGBoostAgentClassifier") from err
+        self.model = xgb.XGBClassifier(n_estimators=self.n_estimators, max_depth=self.max_depth,
+                                       learning_rate=self.learning_rate, **self.kwargs)
+        self.model.fit(X, y)
+        return self
+
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        if self.model is None:
+            raise ValueError("fit the model first")
+        return self.model.predict(X)
+
+    def predict_proba(self, X: np.ndarray) -> np.ndarray:
+        if self.model is None:
+            raise ValueError("fit the model first")
+        return self.model.predict_proba(X)
+
+
+class LightGBMAgentClassifier(BaseEstimator, ClassifierMixin):
+    """LightGBM classifier for human/agent detection from session features (lightgbm is imported lazily in fit)"""
+    def __init__(self, n_estimators: int = 100, max_depth: int = -1, learning_rate: float = 0.1, **kwargs):
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.learning_rate = learning_rate
+        self.model = None  # set by fit(); predict*/ raise until then
+        self.kwargs = kwargs  # forwarded verbatim to lgb.LGBMClassifier
+
+    def fit(self, X: np.ndarray, y: np.ndarray):
+        try:  # only the optional import is guarded, so errors raised while fitting are not relabelled
+            import lightgbm as lgb
+        except ImportError as err:
+            raise ImportError("lightgbm required for LightGBMAgentClassifier") from err
+        self.model = lgb.LGBMClassifier(n_estimators=self.n_estimators, max_depth=self.max_depth,
+                                        learning_rate=self.learning_rate, verbose=-1, **self.kwargs)
+        self.model.fit(X, y)
+        return self
+
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        if self.model is None:
+            raise ValueError("fit the model first")
+        return self.model.predict(X)
+
+    def predict_proba(self, X: np.ndarray) -> np.ndarray:
+        if self.model is None:
+            raise ValueError("fit the model first")
+        return self.model.predict_proba(X)

From 22a2c255bd23f8717275fe99a34a65253deed3c8 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 21 Jan 2026 19:11:54 +0100
Subject: [PATCH 95/99] chore: remove boilerplate

---
 web/src/app/page.tsx | 64 ++------------------------------------------
 1 file changed, 2 insertions(+), 62 deletions(-)

diff --git a/web/src/app/page.tsx b/web/src/app/page.tsx
index 295f8fd..c97c8ed 100644
--- a/web/src/app/page.tsx
+++ b/web/src/app/page.tsx
@@ -1,65 +1,5 @@
-import Image from "next/image";
+import { redirect } from 'next/navigation';
 
 export default function Home() {
-  return (
-    <div className="flex min-h-screen items-center justify-center bg-zinc-50 font-sans dark:bg-black">
-      <main className="flex min-h-screen w-full max-w-3xl flex-col items-center justify-between py-32 px-16 bg-white dark:bg-black sm:items-start">
-        <Image
-          className="dark:invert"
-          src="/next.svg"
-          alt="Next.js logo"
-          width={100}
-          height={20}
-          priority
-        />
-        <div className="flex flex-col items-center gap-6 text-center sm:items-start sm:text-left">
-          <h1 className="max-w-xs text-3xl font-semibold leading-10 tracking-tight text-black dark:text-zinc-50">
-            To get started, edit the page.tsx file.
-          </h1>
-          <p className="max-w-md text-lg leading-8 text-zinc-600 dark:text-zinc-400">
-            Looking for a starting point or more instructions? Head over to{" "}
-            <a
-              href="https://vercel.com/templates?framework=next.js&utm_source=create-next-app&utm_medium=appdir-template-tw&utm_campaign=create-next-app"
-              className="font-medium text-zinc-950 dark:text-zinc-50"
-            >
-              Templates
-            </a>{" "}
-            or the{" "}
-            <a
-              href="https://nextjs.org/learn?utm_source=create-next-app&utm_medium=appdir-template-tw&utm_campaign=create-next-app"
-              className="font-medium text-zinc-950 dark:text-zinc-50"
-            >
-              Learning
-            </a>{" "}
-            center.
-          </p>
-        </div>
-        <div className="flex flex-col gap-4 text-base font-medium sm:flex-row">
-          <a
-            className="flex h-12 w-full items-center justify-center gap-2 rounded-full bg-foreground px-5 text-background transition-colors hover:bg-[#383838] dark:hover:bg-[#ccc] md:w-[158px]"
-            href="https://vercel.com/new?utm_source=create-next-app&utm_medium=appdir-template-tw&utm_campaign=create-next-app"
-            target="_blank"
-            rel="noopener noreferrer"
-          >
-            <Image
-              className="dark:invert"
-              src="/vercel.svg"
-              alt="Vercel logomark"
-              width={16}
-              height={16}
-            />
-            Deploy Now
-          </a>
-          <a
-            className="flex h-12 w-full items-center justify-center rounded-full border border-solid border-black/[.08] px-5 transition-colors hover:border-transparent hover:bg-black/[.04] dark:border-white/[.145] dark:hover:bg-[#1a1a1a] md:w-[158px]"
-            href="https://nextjs.org/docs?utm_source=create-next-app&utm_medium=appdir-template-tw&utm_campaign=create-next-app"
-            target="_blank"
-            rel="noopener noreferrer"
-          >
-            Documentation
-          </a>
-        </div>
-      </main>
-    </div>
-  );
+  redirect('/hotel');
 }

From ee70f02a1f2feae8c52f02aeb8d61837a0ad1787 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 21 Jan 2026 19:12:11 +0100
Subject: [PATCH 96/99] chore: export repeated methods into lib

---
 lib/__init__.py     |  41 +++++++++++++++
 lib/config.py       |  65 +++++++++++++++++++++++
 lib/features.py     | 125 ++++++++++++++++++++++++++++++++++++++++++++
 lib/kafka_client.py |  54 +++++++++++++++++++
 lib/state.py        |  72 +++++++++++++++++++++++++
 5 files changed, 357 insertions(+)
 create mode 100644 lib/__init__.py
 create mode 100644 lib/config.py
 create mode 100644 lib/features.py
 create mode 100755 lib/kafka_client.py
 create mode 100644 lib/state.py

diff --git a/lib/__init__.py b/lib/__init__.py
new file mode 100644
index 0000000..7f8ec2d
--- /dev/null
+++ b/lib/__init__.py
@@ -0,0 +1,41 @@
+"""PHANTOM shared library
+Exports unified utilities for config (paths, service settings), state, and features; lib.kafka_client is not re-exported and must be imported directly
+"""
+from .config import (
+    PROJECT_ROOT, DATA_DIR, EXPERIMENTS_DIR,
+    AGENT_DATA_DIR, HUMAN_DATA_DIR, SIM_RUNS_DIR, MODEL_REGISTRY_DIR,
+    COLLECTED_DATA_DIR, NOTEBOOK_OUTPUT_DIR,
+    ensure_dir, get_data_path, get_experiments_path, get_sim_path,
+    KAFKA_HOST, KAFKA_PORT, KAFKA_BROKER,
+    REDIS_HOST, REDIS_PORT,
+    SUPABASE_URL, SUPABASE_ANON_KEY,
+    BACKEND_PORT, PROVIDER_PORT
+)
+from .state import (
+    make_state_repr, event_to_state, parse_state,
+    get_event_name, get_timestamp,
+    create_state_fn, create_event_name_fn, create_timestamp_fn
+)
+from .features import (
+    transition_histogram, temporal_signature, state_coverage, transition_entropy,
+    event_type_distribution, featurize_trajectory, parse_timestamp
+)
+
+__all__ = [
+    # config
+    'PROJECT_ROOT', 'DATA_DIR', 'EXPERIMENTS_DIR',
+    'AGENT_DATA_DIR', 'HUMAN_DATA_DIR', 'SIM_RUNS_DIR', 'MODEL_REGISTRY_DIR',
+    'COLLECTED_DATA_DIR', 'NOTEBOOK_OUTPUT_DIR',
+    'ensure_dir', 'get_data_path', 'get_experiments_path', 'get_sim_path',
+    'KAFKA_HOST', 'KAFKA_PORT', 'KAFKA_BROKER',
+    'REDIS_HOST', 'REDIS_PORT',
+    'SUPABASE_URL', 'SUPABASE_ANON_KEY',
+    'BACKEND_PORT', 'PROVIDER_PORT',
+    # state
+    'make_state_repr', 'event_to_state', 'parse_state',
+    'get_event_name', 'get_timestamp',
+    'create_state_fn', 'create_event_name_fn', 'create_timestamp_fn',
+    # features
+    'transition_histogram', 'temporal_signature', 'state_coverage', 'transition_entropy',
+    'event_type_distribution', 'featurize_trajectory', 'parse_timestamp',
+]
diff --git a/lib/config.py b/lib/config.py
new file mode 100644
index 0000000..a27ffd9
--- /dev/null
+++ b/lib/config.py
@@ -0,0 +1,65 @@
+"""Unified path configuration for PHANTOM project
+All hardcoded paths should reference this module
+Paths can be overridden via environment variables
+"""
+import os
+from pathlib import Path
+
+# project root (directory containing lib/, experiments/, sim/, web/, backend/)
+PROJECT_ROOT = Path(__file__).parent.parent.resolve()
+
+# data directories
+DATA_DIR = Path(os.getenv('PHANTOM_DATA_DIR', PROJECT_ROOT / 'data'))
+EXPERIMENTS_DIR = Path(os.getenv('PHANTOM_EXPERIMENTS_DIR', PROJECT_ROOT / 'experiments'))
+
+# agent/human interaction data
+AGENT_DATA_DIR = Path(os.getenv('PHANTOM_AGENT_DATA_DIR', DATA_DIR / 'agents'))
+HUMAN_DATA_DIR = Path(os.getenv('PHANTOM_HUMAN_DATA_DIR', DATA_DIR / 'humans'))
+
+# RL simulation runs
+SIM_RUNS_DIR = Path(os.getenv('PHANTOM_SIM_RUNS_DIR', PROJECT_ROOT / 'sim' / 'rl' / 'runs'))
+
+# model artifacts
+MODEL_REGISTRY_DIR = Path(os.getenv('PHANTOM_MODEL_REGISTRY_DIR', DATA_DIR / 'models'))
+
+# collected experiment data
+COLLECTED_DATA_DIR = Path(os.getenv('PHANTOM_COLLECTED_DATA_DIR', EXPERIMENTS_DIR / 'agents' / 'collected_data'))
+
+# notebook outputs
+NOTEBOOK_OUTPUT_DIR = Path(os.getenv('PHANTOM_NOTEBOOK_OUTPUT_DIR', EXPERIMENTS_DIR / 'notebooks' / 'outputs'))
+
+
+def ensure_dir(path: Path) -> Path:
+    """ensure directory exists, create if needed"""
+    path.mkdir(parents=True, exist_ok=True)
+    return path
+
+
+def get_data_path(*parts: str) -> Path:
+    """construct path relative to DATA_DIR"""
+    return DATA_DIR.joinpath(*parts)
+
+
+def get_experiments_path(*parts: str) -> Path:
+    """construct path relative to EXPERIMENTS_DIR"""
+    return EXPERIMENTS_DIR.joinpath(*parts)
+
+
+def get_sim_path(*parts: str) -> Path:
+    """construct path relative to SIM_RUNS_DIR"""
+    return SIM_RUNS_DIR.joinpath(*parts)
+
+
+# service configuration (read from process environment variables; this module does not load .env itself)
+KAFKA_HOST = os.getenv('KAFKA_HOST', 'localhost')
+KAFKA_PORT = os.getenv('KAFKA_PORT', '9092')
+KAFKA_BROKER = f"{KAFKA_HOST}:{KAFKA_PORT}"
+
+REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
+REDIS_PORT = int(os.getenv('REDIS_PORT', '6379'))
+
+SUPABASE_URL = os.getenv('NEXT_PUBLIC_SUPABASE_URL', '')
+SUPABASE_ANON_KEY = os.getenv('NEXT_PUBLIC_SUPABASE_ANON_KEY', '')
+
+BACKEND_PORT = int(os.getenv('BACKEND_PORT', '5000'))
+PROVIDER_PORT = int(os.getenv('PROVIDER_PORT', '5001'))
diff --git a/lib/features.py b/lib/features.py
new file mode 100644
index 0000000..f2d88f5
--- /dev/null
+++ b/lib/features.py
@@ -0,0 +1,125 @@
+"""Unified featurization utilities for trajectory -> feature vector conversion
+Used by both experiments/ml/ and sim/rl/ components
+"""
+import numpy as np
+from collections import defaultdict
+from typing import List, Dict, Callable, Optional, Any, Set
+from datetime import datetime
+
+
+def transition_histogram(events: List, state_fn: Callable, max_states: int = 50) -> np.ndarray:
+    """compute normalized histogram of state transitions in trajectory (float32 vector of length max_states; zeros if < 2 events)
+    events: list of event objects/dicts
+    state_fn: function mapping event -> state string
+    max_states: output length; bins are the distinct transitions in first-seen order (no fixed transition->index map), truncated/zero-padded; denominator counts all transitions incl. truncated ones
+    """
+    if len(events) < 2:
+        return np.zeros(max_states, dtype=np.float32)
+    states = [state_fn(e) for e in events]
+    trans_counts = defaultdict(int)
+    for s, s_next in zip(states, states[1:]):
+        trans_counts[(s, s_next)] += 1
+    total = sum(trans_counts.values())
+    hist = np.array(list(trans_counts.values())[:max_states], dtype=np.float32)
+    hist = np.pad(hist, (0, max(0, max_states - len(hist))))
+    return hist / (total + 1e-10)
+
+
+def temporal_signature(events: List, ts_fn: Callable) -> np.ndarray:
+    """extract temporal features: mean/std/skew of inter-event times plus count
+    events: list of event objects/dicts
+    ts_fn: function mapping event -> timestamp (float seconds)
+    returns: [mean_dt, std_dt, skew, n_intervals] array
+    """
+    if len(events) < 2:
+        return np.zeros(4, dtype=np.float32)
+    times = sorted([ts_fn(e) for e in events])
+    diffs = np.diff(times).astype(np.float32)
+    if len(diffs) == 0:
+        return np.zeros(4, dtype=np.float32)
+    mean_dt, std_dt = np.mean(diffs), np.std(diffs) + 1e-10
+    skew = np.mean(((diffs - mean_dt) / std_dt) ** 3) if std_dt > 1e-8 else 0.0
+    return np.array([mean_dt, std_dt, skew, len(diffs)], dtype=np.float32)
+
+
+def state_coverage(events: List, state_fn: Callable, mdp_states: Set[str]) -> float:
+    """fraction of MDP states visited by trajectory
+    events: list of event objects/dicts
+    state_fn: function mapping event -> state string
+    mdp_states: set of all possible MDP states
+    """
+    if not mdp_states:
+        return 0.0
+    visited = set(state_fn(e) for e in events)
+    return len(visited & mdp_states) / len(mdp_states)
+
+
+def transition_entropy(events: List, state_fn: Callable) -> float:
+    """compute entropy of transition distribution (randomness of navigation)
+    higher entropy = more random browsing pattern
+    """
+    if len(events) < 2:
+        return 0.0
+    states = [state_fn(e) for e in events]
+    trans_counts = defaultdict(int)
+    for s, s_next in zip(states, states[1:]):
+        trans_counts[(s, s_next)] += 1
+    total = sum(trans_counts.values())
+    probs = [c / total for c in trans_counts.values()]
+    return -sum(p * np.log(p + 1e-10) for p in probs)
+
+
+def event_type_distribution(events: List, event_name_fn: Callable) -> np.ndarray:
+    """compute proportions of different event type categories
+    returns: [page_view_ratio, hover_ratio, cart_ratio, purchase_ratio]
+    """
+    if not events:
+        return np.zeros(4, dtype=np.float32)
+    n = len(events)
+    names = [event_name_fn(e).lower() for e in events]
+    return np.array([
+        sum(1 for nm in names if 'page' in nm or 'view' in nm) / n,
+        sum(1 for nm in names if 'hover' in nm) / n,
+        sum(1 for nm in names if 'cart' in nm) / n,
+        sum(1 for nm in names if 'purchase' in nm or 'checkout' in nm) / n
+    ], dtype=np.float32)
+
+
+def featurize_trajectory(events: List, state_fn: Callable, ts_fn: Callable,
+                         event_name_fn: Callable, mdp_states: Optional[Set[str]] = None,
+                         output_dim: int = 64) -> np.ndarray:
+    """convert trajectory to fixed-dimension feature vector
+    events: list of event objects/dicts
+    state_fn: function mapping event -> state string
+    ts_fn: function mapping event -> timestamp (float)
+    event_name_fn: function mapping event -> event name string
+    mdp_states: optional set of all MDP states for coverage calculation
+    output_dim: desired output dimension (will pad/truncate)
+    """
+    feats = []
+    feats.extend(transition_histogram(events, state_fn, max_states=40))  # 40 dims
+    feats.extend(temporal_signature(events, ts_fn))  # 4 dims
+    feats.append(state_coverage(events, state_fn, mdp_states or set()))  # 1 dim
+    feats.append(transition_entropy(events, state_fn))  # 1 dim
+    feats.append(float(len(events)))  # trajectory length
+    feats.append(float(len(set(state_fn(e) for e in events))))  # unique states
+    feats.extend(event_type_distribution(events, event_name_fn))  # 4 dims
+
+    feats = np.array(feats[:output_dim], dtype=np.float32)
+    if len(feats) < output_dim:
+        feats = np.pad(feats, (0, output_dim - len(feats)))
+    return feats
+
+
+def parse_timestamp(ts: Any) -> float:
+    """parse timestamp (int/float seconds, ISO-8601 string, or datetime) to float seconds; returns 0.0 for None/unparseable input"""
+    if isinstance(ts, datetime):  # datetime objects previously fell through to 0.0
+        return ts.timestamp()
+    if isinstance(ts, (int, float)):
+        return float(ts)
+    if isinstance(ts, str):
+        try:
+            return datetime.fromisoformat(ts.replace('Z', '+00:00')).timestamp()
+        except ValueError:
+            return 0.0
+    return 0.0
diff --git a/lib/kafka_client.py b/lib/kafka_client.py
new file mode 100755
index 0000000..d61cd9e
--- /dev/null
+++ b/lib/kafka_client.py
@@ -0,0 +1,54 @@
+from kafka import KafkaConsumer
+import json
+import os
+from dotenv import load_dotenv
+load_dotenv()
+
+def get_interactions(
+    topic='user-interactions',
+    bootstrap_servers=None,
+    from_beginning=True,
+    max_records=None,
+    timeout_ms=5000
+):
+    """Consume interaction events from Kafka.
+
+    Args:
+        topic: Kafka topic name
+        bootstrap_servers: Kafka broker address (default from env)
+        from_beginning: Start from earliest offset if True
+        max_records: Max number of records to fetch (None = all available)
+        timeout_ms: Consumer poll timeout
+
+    Returns:
+        List of parsed interaction event dicts
+    """
+    if not bootstrap_servers:
+        host = os.getenv('KAFKA_HOST', 'localhost')
+        port = os.getenv('KAFKA_PORT', '9092')
+        bootstrap_servers = f'{host}:{port}'
+
+    consumer = KafkaConsumer(
+        topic,
+        bootstrap_servers=bootstrap_servers,
+        auto_offset_reset='earliest' if from_beginning else 'latest',
+        enable_auto_commit=False,
+        value_deserializer=lambda m: json.loads(m.decode('utf-8')),
+        consumer_timeout_ms=timeout_ms
+    )
+
+    events = []
+    try:
+        for msg in consumer:
+            events.append(msg.value)
+            if max_records and len(events) >= max_records:
+                break
+    finally:
+        consumer.close()
+
+    return events
+
+if __name__ == '__main__':
+    interactions = get_interactions(max_records=10)
+    for event in interactions:
+        print(event)
diff --git a/lib/state.py b/lib/state.py
new file mode 100644
index 0000000..cfb4251
--- /dev/null
+++ b/lib/state.py
@@ -0,0 +1,72 @@
+"""Unified state representation utilities for MDP state encoding
+Used by both experiments/ and sim/ components for consistent state handling
+"""
+from typing import Any, Callable
+
+
+def make_state_repr(page: str = None, product_id: str = None, event_name: str = None) -> str:
+    """create canonical state representation string from components
+    format: page|productId|eventName
+    """
+    p = page or 'unk'
+    pid = product_id or 'none'
+    en = event_name or 'unknown'
+    return f"{p}|{pid}|{en}"
+
+
+def event_to_state(evt: Any) -> str:
+    """convert event object/dict to state string
+    supports both object attributes and dict keys
+    """
+    if isinstance(evt, dict):
+        return make_state_repr(
+            page=evt.get('page'),
+            product_id=evt.get('productId'),
+            event_name=evt.get('eventName') or evt.get('event_type')
+        )
+    return make_state_repr(
+        page=getattr(evt, 'page', None),
+        product_id=getattr(evt, 'productId', None),
+        event_name=getattr(evt, 'eventName', None) or getattr(evt, 'event_type', None)
+    )
+
+
+def parse_state(state_str: str) -> dict:
+    """parse state string back to components
+    returns: {'page': str, 'productId': str, 'eventName': str}
+    """
+    parts = state_str.split('|')
+    return {
+        'page': parts[0] if len(parts) > 0 and parts[0] != 'unk' else None,
+        'productId': parts[1] if len(parts) > 1 and parts[1] != 'none' else None,
+        'eventName': parts[2] if len(parts) > 2 and parts[2] != 'unknown' else None
+    }
+
+
+def get_event_name(evt: Any) -> str:
+    """extract event name from event object/dict"""
+    if isinstance(evt, dict):
+        return evt.get('eventName') or evt.get('event_type') or ''
+    return getattr(evt, 'eventName', None) or getattr(evt, 'event_type', None) or ''
+
+
+def get_timestamp(evt: Any) -> Any:
+    """extract timestamp from event object/dict"""
+    if isinstance(evt, dict):
+        return evt.get('ts') or evt.get('timestamp')
+    return getattr(evt, 'ts', None) or getattr(evt, 'timestamp', None)
+
+
+def create_state_fn() -> Callable:
+    """factory for state representation function"""
+    return event_to_state
+
+
+def create_event_name_fn() -> Callable:
+    """factory for event name extraction function"""
+    return get_event_name
+
+
+def create_timestamp_fn() -> Callable:
+    """factory for timestamp extraction function (returns raw value, use features.parse_timestamp to convert)"""
+    return get_timestamp

From 0f5f8affab007789dcfad9aea52cf4f2791b41f1 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 21 Jan 2026 19:12:35 +0100
Subject: [PATCH 97/99] chore: make lib backwards compatible

---
 experiments/ml/arch.py           |  91 +++++-----------
 sim/rl/behavior_loader/models.py |  12 +++
 sim/rl/environment.py            | 175 +++++++++++++++----------------
 3 files changed, 126 insertions(+), 152 deletions(-)

diff --git a/experiments/ml/arch.py b/experiments/ml/arch.py
index 4ceb2e0..1fa4f96 100644
--- a/experiments/ml/arch.py
+++ b/experiments/ml/arch.py
@@ -8,6 +8,20 @@ import pandas as pd
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import sys
+from pathlib import Path
+
+# add project root (parent of lib/) to path so `lib.*` resolves; adding lib/ itself would not make the `lib` package importable
+sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+from lib.features import (
+    transition_histogram as _lib_transition_histogram,
+    temporal_signature as _lib_temporal_signature,
+    state_coverage as _lib_state_coverage,
+    transition_entropy as _lib_transition_entropy,
+    featurize_trajectory as _lib_featurize_trajectory,
+    parse_timestamp
+)
+from lib.state import event_to_state, get_event_name, get_timestamp
 
 TASK = 'classification'
 LABELS = ['human', 'agent']
@@ -101,91 +115,40 @@ def nt_xent_loss(z_i: torch.Tensor, z_j: torch.Tensor, temperature: float = 0.5)
     return F.cross_entropy(sim, labels)
 
 
-# feature extraction utilities for trajectory -> feature vector
+# feature extraction utilities - delegating to lib.features for unified implementation
+# these wrappers maintain backwards compatibility for existing imports
+
 def transition_histogram(events: List, state_fn, max_states: int = 50) -> np.ndarray:
     """Compute normalized histogram of state transitions in trajectory"""
-    if len(events) < 2:
-        return np.zeros(max_states)
-    states = [state_fn(e) for e in events]
-    trans_counts = defaultdict(int)
-    for s, s_next in zip(states, states[1:]):
-        trans_counts[(s, s_next)] += 1
-    total = sum(trans_counts.values())
-    hist = np.array(list(trans_counts.values())[:max_states], dtype=np.float32)
-    hist = np.pad(hist, (0, max(0, max_states - len(hist))))
-    return hist / (total + 1e-10)
+    return _lib_transition_histogram(events, state_fn, max_states)
 
 
 def temporal_signature(events: List, ts_fn) -> np.ndarray:
     """Extract temporal features: mean/std/skew of inter-event times"""
-    if len(events) < 2:
-        return np.zeros(4, dtype=np.float32)
-    times = sorted([ts_fn(e) for e in events])
-    diffs = np.diff(times).astype(np.float32)
-    if len(diffs) == 0:
-        return np.zeros(4, dtype=np.float32)
-    mean_dt, std_dt = np.mean(diffs), np.std(diffs) + 1e-10
-    skew = np.mean(((diffs - mean_dt) / std_dt) ** 3) if std_dt > 1e-8 else 0.0
-    return np.array([mean_dt, std_dt, skew, len(diffs)], dtype=np.float32)
+    return _lib_temporal_signature(events, ts_fn)
 
 
 def state_coverage(events: List, state_fn, mdp_states: set) -> float:
     """Fraction of MDP states visited by trajectory"""
-    if not mdp_states:
-        return 0.0
-    visited = set(state_fn(e) for e in events)
-    return len(visited & mdp_states) / len(mdp_states)
+    return _lib_state_coverage(events, state_fn, mdp_states)
 
 
 def transition_entropy(events: List, state_fn) -> float:
     """Compute entropy of transition distribution (randomness of navigation)"""
-    if len(events) < 2:
-        return 0.0
-    states = [state_fn(e) for e in events]
-    trans_counts = defaultdict(int)
-    for s, s_next in zip(states, states[1:]):
-        trans_counts[(s, s_next)] += 1
-    total = sum(trans_counts.values())
-    probs = [c / total for c in trans_counts.values()]
-    return -sum(p * np.log(p + 1e-10) for p in probs)
+    return _lib_transition_entropy(events, state_fn)
 
 
 def featurize_trajectory(events: List, mdp: Optional[Dict] = None, input_dim: int = 64) -> np.ndarray:
-    """Convert trajectory to fixed-dim feature vector"""
-    def _state_repr(e):
-        return f"{getattr(e, 'page', None) or 'unk'}|{getattr(e, 'productId', None) or 'none'}|{e.eventName}"
+    """Convert trajectory to fixed-dim feature vector - uses lib.features implementation"""
+    mdp_states = set(mdp.get('states', [])) if mdp else set()
 
     def _ts_fn(e):
-        ts = getattr(e, 'ts', None)
-        if isinstance(ts, str):
-            from datetime import datetime
-            try:
-                return datetime.fromisoformat(ts.replace('Z', '+00:00')).timestamp()
-            except:
-                return 0.0
-        return float(ts) if ts else 0.0
+        return parse_timestamp(get_timestamp(e))
 
-    feats = []
-    feats.extend(transition_histogram(events, _state_repr, max_states=40))  # 40 dims
-    feats.extend(temporal_signature(events, _ts_fn))  # 4 dims
-    mdp_states = set(mdp.get('states', [])) if mdp else set()
-    feats.append(state_coverage(events, _state_repr, mdp_states))  # 1 dim
-    feats.append(transition_entropy(events, _state_repr))  # 1 dim
-    feats.append(len(events))  # trajectory length
-    feats.append(len(set(_state_repr(e) for e in events)))  # unique states
+    def _event_name_fn(e):
+        return get_event_name(e)
 
-    # event type distribution (page_view, hover, cart, purchase indicators)
-    event_names = [e.eventName for e in events]
-    feats.append(sum(1 for n in event_names if 'page' in n.lower()) / (len(events) + 1))
-    feats.append(sum(1 for n in event_names if 'hover' in n.lower()) / (len(events) + 1))
-    feats.append(sum(1 for n in event_names if 'cart' in n.lower()) / (len(events) + 1))
-    feats.append(sum(1 for n in event_names if 'purchase' in n.lower() or 'checkout' in n.lower()) / (len(events) + 1))
-
-    # pad/truncate to input_dim
-    feats = np.array(feats[:input_dim], dtype=np.float32)
-    if len(feats) < input_dim:
-        feats = np.pad(feats, (0, input_dim - len(feats)))
-    return feats
+    return _lib_featurize_trajectory(events, event_to_state, _ts_fn, _event_name_fn, mdp_states, input_dim)
 
 
 # gradient boosting classifiers for comparison baselines
diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py
index 4c6bf21..3530724 100644
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -6,6 +6,18 @@ from collections import defaultdict
 from typing import Dict, List, Tuple, Set
 import numpy as np
 import graphviz
+import sys
+from pathlib import Path
+
+# import lib utilities for optional use - models keep their own _state_repr for backwards compat
+# with the specific event structure (evt.value.payload); the project root (not lib/) goes on sys.path so `lib.*` resolves
+sys.path.insert(0, str(Path(__file__).resolve().parents[3]))
+try:
+    from lib.state import make_state_repr as lib_make_state_repr
+    from lib.features import transition_histogram as lib_transition_histogram
+except ImportError:
+    lib_make_state_repr = None
+    lib_transition_histogram = None
 
 class BehaviorModel:
     def __init__(self, src_dir: str, loader_cls=Loader):
diff --git a/sim/rl/environment.py b/sim/rl/environment.py
index fd725f8..d9ccbcb 100644
--- a/sim/rl/environment.py
+++ b/sim/rl/environment.py
@@ -1,7 +1,5 @@
-from sys import intern
 import gymnasium as gym
 from gymnasium import spaces
-from matplotlib import interactive
 import numpy as np
 from dataclasses import dataclass
 import pandas as pd
@@ -15,7 +13,7 @@ class BusinessLogicConstraints():
     max_price_adjustment: float = 0.30
     system_max_price: float = 500.0
     system_min_price: float = 1.0
-    product_catelogue_size: int = 100
+    product_catalogue_size: int = 100
     episode_length: int = 200
     sessions_per_step: int = 250
     agent_share: float = 0.25
@@ -37,17 +35,42 @@ class BusinessLogicConstraints():
 def _sigmoid(x: np.ndarray) -> np.ndarray:
     return 1.0 / (1.0 + np.exp(-x))
 
+class BehavioralProfile:
+    """simple markov chain model for generating synthetic interaction events; actor 'agents' gets browse-heavier transition rows, any other value keeps the defaults"""
+    def __init__(self, actor: str, purchase_probs: np.ndarray):
+        self.actor = actor
+        self.purchase_probs = purchase_probs
+        self.states = ['view', 'cart', 'checkout']
+        # default transition rows: view->cart 0.3, view->view 0.6, view->exit 0.1, cart->checkout 0.5, cart->view 0.4, cart->exit 0.1, checkout->exit 1.0
+        self.trans = {'view': {'view': 0.6, 'cart': 0.3, 'exit': 0.1}, 'cart': {'checkout': 0.5, 'view': 0.4, 'exit': 0.1}, 'checkout': {'exit': 1.0}}
+        if actor == 'agents':  # agents browse more before purchasing (overrides the 'view' and 'cart' rows)
+            self.trans['view'] = {'view': 0.75, 'cart': 0.15, 'exit': 0.1}
+            self.trans['cart'] = {'checkout': 0.3, 'view': 0.6, 'exit': 0.1}
+
+    def sample(self, rng: np.random.Generator) -> Dict[str, Any]:
+        """sample single interaction event dict (action, product_idx, actor, t, price_paid); product_idx is uniform over len(purchase_probs), the probabilities themselves are not used"""
+        product_idx = rng.integers(0, len(self.purchase_probs))
+        state = 'view'  # every sample starts (and is reported) as 'view'
+        # NOTE(review): only one step is drawn from 'view' and next_state is not returned; 'view' has no 'checkout' edge, so price_paid is always 0.0 - confirm intended
+        trans = self.trans.get(state, {'exit': 1.0})
+        next_state = rng.choice(list(trans.keys()), p=list(trans.values()))
+        price_paid = 0.0 if next_state != 'checkout' else float(rng.uniform(50, 200))
+        return {'action': state, 'product_idx': product_idx, 'actor': 'agent' if self.actor == 'agents' else 'human', 't': 0.0, 'price_paid': price_paid}
+
+
+def _load_behavioral_profile(actor: str, demand_forcing: np.ndarray) -> BehavioralProfile:
+    """returns a behavioral profile for generating synthetic sessions
+    actor: 'humans' or 'agents'
+    demand_forcing: per-product purchase probabilities used to weight interactions
+    """
+    return BehavioralProfile(actor, demand_forcing)
+
+
 class CommercePlatform:
-    """
-    This is just an extension of the state management for the environment, it does not implement anything dynamic just helps us simulate demand.
-    """
-    def __init__(self,
-                 product_catelogue_size: int,
-                 max_price: float,
-                 min_price: float,
-                 constraints: BusinessLogicConstraints):
-        self.product_catelogue_size = product_catelogue_size
-        self.product_supply = np.random.uniform(low=10, high=50, size=(self.product_catelogue_size,))
+    """state management for the environment, simulates demand"""
+    def __init__(self, product_catalogue_size: int, max_price: float, min_price: float, constraints: BusinessLogicConstraints):
+        self.product_catalogue_size = product_catalogue_size
+        self.product_supply = np.random.uniform(low=10, high=50, size=(self.product_catalogue_size,))
         self.max_price = max_price
         self.min_price = min_price
         self.constraints = constraints
@@ -55,27 +78,12 @@ class CommercePlatform:
         self._rng = np.random.default_rng(constraints.seed)
         self._last_interaction_df: pd.DataFrame = pd.DataFrame()
 
-
     def setup_true_demand(self, prices: np.ndarray) -> Dict[str, np.ndarray]:
-        # ground truth purchase propensities
         p = np.clip(prices, self.min_price, self.max_price)
         pn = p / self.max_price
         human_prob = self.constraints.base_human_demand * (pn ** self.constraints.human_price_elasticity)
         agent_prob = self.constraints.base_agent_demand * (pn ** self.constraints.agent_price_elasticity)
-        return {
-            "human_purchase_prob": np.clip(human_prob, 0.0, 0.95),
-            "agent_purchase_prob": np.clip(agent_prob, 0.0, 0.95)
-        }
-
-    def _load_behavioral_profile(actor : str, demand_forcing):
-        """
-        This returns a markov chain with average weights which we get from interaction data of our experiments.
-        This defines transition probabilities between different events:
-        search -> view_item_price_binN: 0.7
-        view_item_price_binN -> add_to_cart: 0.2
-        we also must reweight with the demand_forcing vector or purchase probabilities per-product
-        """
-
+        return {"human_purchase_prob": np.clip(human_prob, 0.0, 0.95), "agent_purchase_prob": np.clip(agent_prob, 0.0, 0.95)}
 
     def _simulate_sessions(self, base_prices: np.ndarray) -> pd.DataFrame:
         demand = self.setup_true_demand(base_prices)
@@ -162,22 +170,22 @@ class PHANTOMEnv(gym.Env):
         self.constraints = BusinessLogicConstraints()
         self.action_space = spaces.Box(low=-self.constraints.max_price_adjustment,
                                        high=self.constraints.max_price_adjustment,
-                                       shape=(self.constraints.product_catelogue_size,), dtype=np.float32)
+                                       shape=(self.constraints.product_catalogue_size,), dtype=np.float32)
         self.observation_space = spaces.Dict({
             "elasticity": spaces.Dict({
                 "price": spaces.Box(
-                    low=np.full((self.constraints.product_catelogue_size,), self.constraints.system_min_price, dtype=np.float32),
-                    high=np.full((self.constraints.product_catelogue_size,), self.constraints.system_max_price, dtype=np.float32),
+                    low=np.full((self.constraints.product_catalogue_size,), self.constraints.system_min_price, dtype=np.float32),
+                    high=np.full((self.constraints.product_catalogue_size,), self.constraints.system_max_price, dtype=np.float32),
                     dtype=np.float32),
                 "demand": spaces.Box(
-                    low=np.zeros((self.constraints.product_catelogue_size,), dtype=np.float32),
-                    high=np.full((self.constraints.product_catelogue_size,), 1e6, dtype=np.float32),
+                    low=np.zeros((self.constraints.product_catalogue_size,), dtype=np.float32),
+                    high=np.full((self.constraints.product_catalogue_size,), 1e6, dtype=np.float32),
                     dtype=np.float32),
             })
             # TODO: define more features that we compute from the interaction data
         })
         self.commerce_platform = CommercePlatform(
-            product_catelogue_size=self.constraints.product_catelogue_size,
+            product_catalogue_size=self.constraints.product_catalogue_size,
             max_price=self.constraints.system_max_price,
             min_price=self.constraints.system_min_price,
             constraints=self.constraints)
@@ -192,12 +200,12 @@ class PHANTOMEnv(gym.Env):
             self._rng = np.random.default_rng(seed)
             self.commerce_platform._rng = np.random.default_rng(seed)
         self.t = 0
-        init_prices = self._rng.uniform(low=60.0, high=140.0, size=(self.constraints.product_catelogue_size,)).astype(np.float32)
+        init_prices = self._rng.uniform(low=60.0, high=140.0, size=(self.constraints.product_catalogue_size,)).astype(np.float32)
         self._prev_prices = init_prices.copy()
         self.state = {
             "elasticity": {
                 "price": init_prices,
-                "demand": np.zeros((self.constraints.product_catelogue_size,), dtype=np.float32),
+                "demand": np.zeros((self.constraints.product_catalogue_size,), dtype=np.float32),
             }
         }
         return self.state, {}
@@ -210,38 +218,35 @@ class PHANTOMEnv(gym.Env):
                            self.constraints.system_max_price).astype(np.float32)
 
         self.state["elasticity"]["price"] = new_prices
-        # TODO: use the commerce platform to simulate sessions
         interactions_df = self.commerce_platform._simulate_sessions(new_prices)
         result = self.commerce_platform.compute_interaction_features(interactions_df)
-        # TODO: implement COI computation to use in reward
-        COI = 0.0
+        COI = 0.0  # TODO: implement cost-of-information computation
 
         volatility = 0.0 if self._prev_prices is None else \
             float(np.mean(np.abs((new_prices - self._prev_prices) / (self._prev_prices + 1e-6))))
         self._prev_prices = new_prices.copy()
 
-        revenue_observed = float(result["revenue_observed"])
-        agent_loss = float(result["agent_loss"])
+        # extract metrics with safe defaults for incomplete simulation
+        revenue_observed = float(result.get("revenue_observed", result.get("mean_sale_price", 0.0)))
+        agent_loss = float(result.get("agent_loss", 0.0))
 
         reward = (revenue_observed
                   - COI
                   - self.constraints.w_agent_loss * agent_loss
                   - self.constraints.w_volatility * volatility
-                  - self.constraints.w_estimation_error
-                  )
+                  - self.constraints.w_estimation_error)
 
         terminated = self.t >= self.constraints.episode_length
         info = {
             "t": self.t,
             "revenue_observed": revenue_observed,
-            "revenue_oracle": float(result["revenue_oracle"]),
+            "revenue_oracle": float(result.get("revenue_oracle", revenue_observed)),
             "agent_loss": agent_loss,
             "ux_volatility": volatility,
-            "mean_internal_error": err_mean,
-            "look_to_book": float(result["interaction_features"].get("look_to_book", 0.0)),
-            "mean_sale_price": float(result["interaction_features"].get("mean_sale_price", 0.0)),
-            "true_human_purchases_total": float(np.sum(result["true_human_demand"])),
-            "true_agent_purchases_total": float(np.sum(result["true_agent_purchases"])),
+            "look_to_book": float(result.get("look_to_book", 0.0)),
+            "mean_sale_price": float(result.get("mean_sale_price", 0.0)),
+            "true_human_purchases_total": 0.0,  # TODO: track from simulation
+            "true_agent_purchases_total": 0.0,  # TODO: track from simulation
         }
         return self.state, float(reward), terminated, False, info
 
@@ -250,46 +255,43 @@ if __name__ == "__main__":
     import matplotlib.pyplot as plt
     from collections import defaultdict
 
-    runs = {}
-    for use_defense in (False, True):
-        env = PHANTOMEnv(use_defense=use_defense)
-        obs, _ = env.reset(seed=42)
-        metrics = defaultdict(list)
-        total_reward = 0.0
-        done = False
+    env = PHANTOMEnv(constraints=BusinessLogicConstraints())
+    obs, _ = env.reset(seed=42)
+    metrics = defaultdict(list)
+    total_reward = 0.0
+    done = False
 
-        while not done:
-            action = env.action_space.sample()
-            obs, reward, done, _, info = env.step(action)
-            total_reward += reward
-            p_mean = float(np.mean(obs["elasticity"]["price"]))
-            q_mean = float(np.mean(obs["elasticity"]["demand"]))
-            p_std = float(np.std(obs["elasticity"]["price"]))
+    while not done:
+        action = env.action_space.sample()
+        obs, reward, done, _, info = env.step(action)
+        total_reward += reward
+        p_mean = float(np.mean(obs["elasticity"]["price"]))
+        q_mean = float(np.mean(obs["elasticity"]["demand"]))
+        p_std = float(np.std(obs["elasticity"]["price"]))
 
-            metrics['t'].append(info['t'])
-            metrics['price_mean'].append(p_mean)
-            metrics['price_std'].append(p_std)
-            metrics['demand_mean'].append(q_mean)
-            metrics['revenue_observed'].append(info['revenue_observed'])
-            metrics['revenue_oracle'].append(info['revenue_oracle'])
-            metrics['agent_loss'].append(info['agent_loss'])
-            metrics['ux_volatility'].append(info['ux_volatility'])
-            metrics['look_to_book'].append(info['look_to_book'])
-            metrics['reward'].append(reward)
-            metrics['human_purchases'].append(info['true_human_purchases_total'])
-            metrics['agent_purchases'].append(info['true_agent_purchases_total'])
+        metrics['t'].append(info['t'])
+        metrics['price_mean'].append(p_mean)
+        metrics['price_std'].append(p_std)
+        metrics['demand_mean'].append(q_mean)
+        metrics['revenue_observed'].append(info['revenue_observed'])
+        metrics['revenue_oracle'].append(info['revenue_oracle'])
+        metrics['agent_loss'].append(info['agent_loss'])
+        metrics['ux_volatility'].append(info['ux_volatility'])
+        metrics['look_to_book'].append(info['look_to_book'])
+        metrics['reward'].append(reward)
+        metrics['human_purchases'].append(info['true_human_purchases_total'])
+        metrics['agent_purchases'].append(info['true_agent_purchases_total'])
 
-            if info['t'] % 20 == 0 or done:
-                print(f"defense={'ON ' if use_defense else 'OFF'} t={info['t']:03d} p={p_mean:6.2f}±{p_std:4.2f} "
-                      f"q={q_mean:6.2f} rev={info['revenue_observed']:7.2f} oracle={info['revenue_oracle']:7.2f} "
-                      f"loss={info['agent_loss']:6.2f} ux={info['ux_volatility']:.3f} "
-                      f"ltb={info['look_to_book']:5.2f} r={reward:7.2f}")
+        if info['t'] % 20 == 0 or done:
+            print(f"t={info['t']:03d} p={p_mean:6.2f}±{p_std:4.2f} q={q_mean:6.2f} "
+                  f"rev={info['revenue_observed']:7.2f} oracle={info['revenue_oracle']:7.2f} "
+                  f"loss={info['agent_loss']:6.2f} ux={info['ux_volatility']:.3f} "
+                  f"ltb={info['look_to_book']:5.2f} r={reward:7.2f}")
 
-        runs[use_defense] = metrics
-        print(f"defense={'ON ' if use_defense else 'OFF'} total_reward={total_reward:.2f}\n")
+    print(f"total_reward={total_reward:.2f}")
 
     fig, axes = plt.subplots(3, 3, figsize=(15, 12))
-    fig.suptitle('PHANTOM Environment: Defense OFF vs ON', fontsize=14, fontweight='bold')
+    fig.suptitle('PHANTOM Environment Run', fontsize=14, fontweight='bold')
 
     plot_configs = [
         ('price_mean', 'Mean Price', 'Price'),
@@ -305,13 +307,10 @@ if __name__ == "__main__":
 
     for idx, (key, title, ylabel) in enumerate(plot_configs):
         ax = axes[idx // 3, idx % 3]
-        for use_defense, label, color in [(False, 'No Defense', 'red'), (True, 'With Defense', 'blue')]:
-            m = runs[use_defense]
-            ax.plot(m['t'], m[key], label=label, color=color, alpha=0.7, linewidth=1.5)
+        ax.plot(metrics['t'], metrics[key], color='blue', alpha=0.7, linewidth=1.5)
         ax.set_xlabel('Step')
         ax.set_ylabel(ylabel)
         ax.set_title(title, fontsize=10, fontweight='bold')
-        ax.legend(loc='best', fontsize=8)
         ax.grid(True, alpha=0.3)
 
     plt.tight_layout()

From 72877439ca8133613f19173eb6b47099d68141dc Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 21 Jan 2026 19:12:56 +0100
Subject: [PATCH 98/99] feat: contaminator and training

---
 experiments/procesing/contaminator.py | 87 ++++++++++++++++----------
 sim/rl/train.py                       | 89 ++++++++++++++-------------
 2 files changed, 100 insertions(+), 76 deletions(-)

diff --git a/experiments/procesing/contaminator.py b/experiments/procesing/contaminator.py
index da44c3d..2f23b2b 100644
--- a/experiments/procesing/contaminator.py
+++ b/experiments/procesing/contaminator.py
@@ -1,45 +1,66 @@
 import pandas as pd
 import random
-from sim.rl.behavior_loader import AgentBehaviorModel # TODO: proper import this
+import os
+from pathlib import Path
 
-base_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments"
-agent_dir = f"{base_dir}/agents/collected_data/"
+# prefer the absolute package import; fall back to a sys.path insert when run standalone
+try:
+    from sim.rl.behavior_loader.models import AgentBehaviorModel
+except ImportError:
+    import sys
+    sys.path.insert(0, str(Path(__file__).parent.parent.parent / "sim" / "rl" / "behavior_loader"))
+    from models import AgentBehaviorModel
+
+# agent data dir: PHANTOM_AGENT_DATA_DIR env var if set, else a default under the project root
+PROJECT_ROOT = Path(__file__).parent.parent.parent
+AGENT_DATA_DIR = Path(os.getenv('PHANTOM_AGENT_DATA_DIR', PROJECT_ROOT / "experiments" / "agents" / "collected_data"))
 
 
-
-def remap_schema(df : pd.DataFrame, mapping: dict, on: str = "event_type"):
+def remap_schema(df: pd.DataFrame, mapping: dict, on: str = "event_type") -> pd.DataFrame:
+    """remap column values according to mapping dict, preserving unmapped values"""
     df = df.copy()
     df[on] = df[on].map(mapping).fillna(df[on])
     return df
 
 
-def contaminate_dataset(df : pd.DataFrame, on : str = "event_type",
-                        contamination_rate: float = 0.1) -> pd.DataFrame:
-    model = AgentBehaviorModel(agent_dir)
-    target_df_schema = df[on].unique().tolist()
-    mapping = {
-        'view': 'view_page'
-        # TODO: define properly for the given dataset
-    }
-    # think about replacing with freqdist method from library
-    OG_event_distribution = df[on].value_counts(normalize=True).to_dict()
-    # normalize to weights
-    OG_event_distribution = {k: v / sum(OG_event_distribution.values()) for k, v in OG_event_distribution.items()}
-    mapped_df = remap_schema(df, mapping, on=on)
-    N = len(df)
-    N_final = N / (1 - contamination_rate) # TODO: explain this in paper
-    N_contaminate = int(N_final - N)
-    start_event_types = random.choices(list(OG_event_distribution.keys()),
-                                    weights=list(OG_event_distribution.values()), k=N_contaminate)
-    # it makes sense
-    new_trajectories = []
-    for start_event in start_event_types:
-        # sample from og start
-        start = None # TODO: defin start accoding to dataset (randomly sample with weights of event distr)
-        trajectory = model.sample_trajectory(start) # TODO: explain this method in paper
-        new_trajectories.extend(trajectory)
+def contaminate_dataset(df: pd.DataFrame, on: str = "event_type",
+                        contamination_rate: float = 0.1,
+                        agent_data_dir: Path = None) -> pd.DataFrame:
+    """inject synthetic agent trajectories into a dataset
+    contamination_rate: target agent share used to size the number of seeded trajectories (0.1 aims for ~10%); must not be 1
+    """
+    data_dir = agent_data_dir or AGENT_DATA_DIR
+    model = AgentBehaviorModel(str(data_dir))
+    model.build_MDP()  # ensure MDP is built before sampling
 
-    # TODO: make sure the new trajctories schema conforms with dataset
-    contaminate_df = pd.DataFrame(new_trajectories)
-    df = pd.concat([df, contaminate_df], ignore_index=True)
+    # compute event distribution from original data
+    event_dist = df[on].value_counts(normalize=True).to_dict()
+    total = sum(event_dist.values())
+    event_dist = {k: v / total for k, v in event_dist.items()}
+
+    # number of trajectories to seed; each appends one row per state, so the realised agent share can differ from the target
+    N = len(df)
+    N_final = N / (1 - contamination_rate)
+    N_contaminate = int(N_final - N)
+
+    # sample start states weighted by original distribution
+    start_events = random.choices(list(event_dist.keys()), weights=list(event_dist.values()), k=N_contaminate)
+
+    # generate synthetic trajectories
+    new_rows = []
+    for start_event in start_events:
+        # sample trajectory from agent model, using a state that contains the event type
+        mdp_states = model.mdp.get('states', []) if model.mdp else []
+        matching_starts = [s for s in mdp_states if start_event in s]
+        if not matching_starts:
+            continue  # skip if no matching start state
+        start_state = random.choice(matching_starts)
+        trajectory = model.sample_traj(start_state, max_len=20)
+        for state in trajectory:
+            parts = state.split('|')  # page|productId|eventName format
+            new_rows.append({on: parts[-1] if parts else start_event, 'source': 'synthetic_agent'})
+
+    if new_rows:
+        contaminate_df = pd.DataFrame(new_rows)
+        df = pd.concat([df, contaminate_df], ignore_index=True)
     return df
diff --git a/sim/rl/train.py b/sim/rl/train.py
index ba257de..01e6809 100644
--- a/sim/rl/train.py
+++ b/sim/rl/train.py
@@ -3,15 +3,17 @@ import logging
 from pathlib import Path
 from typing import Dict, Type, Optional
 import pickle
-from torch import neg_
 from torch.utils.tensorboard import SummaryWriter
-from environment import PHANTOMEnv, FastTrainingConstraints, BusinessLogicConstraints
-from engine import (BasePricingEngine, WildPricingEngine, StaticPricingEngine,
-                   SimpleDemandEngine, RandomWalkEngine, ThompsonSamplingEngine)
+from environment import PHANTOMEnv, BusinessLogicConstraints
 
 logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
 logger = logging.getLogger(__name__)
 
+try:
+    from engine import (BasePricingEngine, WildPricingEngine, StaticPricingEngine,
+                       SimpleDemandEngine, RandomWalkEngine, ThompsonSamplingEngine)
+except ImportError:
+    BasePricingEngine = None  # engines not required for basic usage
 
 
 """
@@ -26,8 +28,7 @@ CURRENT SOLUTION BELOW does not implement correct learning or updates.
 
 class EngineTrainer:
     """wrapper to run pricing engines through episodes and collect metrics"""
-    def __init__(self, engine: BasePricingEngine, env: PHANTOMEnv,
-                 tb_writer: Optional[SummaryWriter] = None):
+    def __init__(self, engine, env: PHANTOMEnv, tb_writer: Optional[SummaryWriter] = None):
         self.engine = engine
         self.env = env
         self.episode_metrics = []
@@ -35,7 +36,6 @@ class EngineTrainer:
         self.global_step = 0
 
     def train(self, n_episodes: int, seed: int = 42):
-
         obs, _ = self.env.reset(seed=seed)
         prices = None
         for ep in range(n_episodes):
@@ -44,12 +44,21 @@ class EngineTrainer:
             self.engine.update(obs, reward, done, info)
         return self
 
-
-
-
-
-
-        return self.episode_metrics
+    def run_episode(self, seed: int = 42) -> Dict:
+        """run single evaluation episode and return metrics"""
+        obs, _ = self.env.reset(seed=seed)
+        self.engine.reset()
+        total_reward, prices = 0.0, None
+        ep_metrics = {'total_reward': 0.0}
+        done = False
+        while not done:
+            prices = self.engine.compute_prices(prices, obs) if prices is not None else obs["elasticity"]["price"]
+            obs, reward, done, _, info = self.env.step(prices)
+            total_reward += reward
+            for k, v in info.items():
+                ep_metrics[k] = v
+        ep_metrics['total_reward'] = total_reward
+        return ep_metrics
 
     def evaluate(self, n_episodes: int = 10, seed: int = 100) -> Dict:
         """evaluate trained engine"""
@@ -57,17 +66,16 @@ class EngineTrainer:
                                    'agent_loss', 'ux_volatility', 'look_to_book']}
         for ep in range(n_episodes):
             metrics = self.run_episode(seed=seed + ep)
-            for k in results:                results[k].append(metrics[k])
+            for k in results:
+                results[k].append(metrics.get(k, 0.0))
         return {k: (np.mean(v), np.std(v)) for k, v in results.items()}
 
 
-def make_env(fast: bool = True):
-    constraints = FastTrainingConstraints() if fast else BusinessLogicConstraints()
-    return PHANTOMEnv(constraints=constraints)
+def make_env():
+    return PHANTOMEnv(constraints=BusinessLogicConstraints())
 
 
-def train_engine(engine_cls: Type[BasePricingEngine], env: PHANTOMEnv,
-                n_episodes: int, seed: int = 42,
+def train_engine(engine_cls, env: PHANTOMEnv, n_episodes: int, seed: int = 42,
                 tb_writer: Optional[SummaryWriter] = None) -> EngineTrainer:
     constraints = env.constraints
     engine = engine_cls(constraints=constraints, seed=seed)
@@ -80,15 +88,11 @@ def save_trainer(trainer: EngineTrainer, path: Path):
     """save engine state and metrics"""
     path.parent.mkdir(parents=True, exist_ok=True)
     with open(path, 'wb') as f:
-        pickle.dump({
-            'engine': trainer.engine,
-            'metrics': trainer.episode_metrics
-        }, f)
+        pickle.dump({'engine': trainer.engine, 'metrics': trainer.episode_metrics}, f)
     logger.info(f"Saved trainer to {path}")
 
 
-def load_trainer(path: Path, env: PHANTOMEnv,
-                 tb_writer: Optional[SummaryWriter] = None) -> EngineTrainer:
+def load_trainer(path: Path, env: PHANTOMEnv, tb_writer: Optional[SummaryWriter] = None) -> EngineTrainer:
     """load saved engine"""
     with open(path, 'rb') as f:
         data = pickle.load(f)
@@ -98,45 +102,44 @@ def load_trainer(path: Path, env: PHANTOMEnv,
 
 
 if __name__ == "__main__":
+    if BasePricingEngine is None:
+        logger.error("Engines not available, cannot run training")
+        exit(1)
+
     base_dir = Path("./runs")
     base_dir.mkdir(exist_ok=True)
 
     engines = {
         "Wild": WildPricingEngine,
         "Static": StaticPricingEngine,
-#        "SimpleDemand": SimpleDemandEngine,
         "RandomWalk": RandomWalkEngine,
         "ThompsonSampling": ThompsonSamplingEngine,
     }
-    defenses = [False, True]
     n_train_episodes = 50
     n_eval_episodes = 10
     seed = 42
-    fast_mode = True
 
-    logger.info(f"Training config: {n_train_episodes} episodes per engine, fast_mode={fast_mode}")
+    logger.info(f"Training config: {n_train_episodes} episodes per engine")
 
     trained_trainers = {}
 
     for engine_name, engine_cls in engines.items():
-        for use_defense in defenses:
-            defense_label = "defense_on" if use_defense else "defense_off"
-            run_name = f"{engine_name}_{defense_label}"
-            log_dir = base_dir / run_name
-            log_dir.mkdir(parents=True, exist_ok=True)
+        run_name = engine_name
+        log_dir = base_dir / run_name
+        log_dir.mkdir(parents=True, exist_ok=True)
 
-            logger.info(f"Training {engine_name} with defense={use_defense}")
-            logger.info(f"Log directory: {log_dir}")
+        logger.info(f"Training {engine_name}")
+        logger.info(f"Log directory: {log_dir}")
 
-            env = make_env(fast=fast_mode)
-            tb_writer = SummaryWriter(log_dir=str(log_dir))
-            trainer = train_engine(engine_cls, env, n_train_episodes, seed, tb_writer=tb_writer)
-            tb_writer.close()
+        env = make_env()
+        tb_writer = SummaryWriter(log_dir=str(log_dir))
+        trainer = train_engine(engine_cls, env, n_train_episodes, seed, tb_writer=tb_writer)
+        tb_writer.close()
 
-            save_path = log_dir / "trainer.pkl"
-            save_trainer(trainer, save_path)
+        save_path = log_dir / "trainer.pkl"
+        save_trainer(trainer, save_path)
 
-            trained_trainers[run_name] = (trainer, env)
+        trained_trainers[run_name] = (trainer, env)
 
     logger.info("Starting evaluation")
 

From 13959e4b287e96a93340ea4adb0a92214c93659a Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Sat, 31 Jan 2026 10:13:07 +0100
Subject: [PATCH 99/99] chore: bug fixes

---
 engine/engine.py                 | 2 +-
 engine/studies/full_factorial.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/engine/engine.py b/engine/engine.py
index e304aeb..cacac7a 100644
--- a/engine/engine.py
+++ b/engine/engine.py
@@ -21,7 +21,7 @@ class MarketEngine():
     def act(self, prices):
         demand = generate_demand(prices, *self.demand)
         sample_n = lambda n, human: [sample_behavior(demand, human=human) for _ in range(n)]
-        human_t, agent_t = sample_n(100, True), sample_n(100, False)
+        human_t, agent_t = sample_n(self.Nhumans, True), sample_n(self.Nagents, False)
         trajectories = human_t + agent_t
         demand_estimate = estimate_demand(trajectories)
         return demand_estimate
diff --git a/engine/studies/full_factorial.py b/engine/studies/full_factorial.py
index 9b4d1eb..92210b2 100644
--- a/engine/studies/full_factorial.py
+++ b/engine/studies/full_factorial.py
@@ -56,7 +56,7 @@ def run_single(cfg: dict) -> dict:
         "id": cfg["id"],
         "config": cfg,
         "total_reward": total_reward,
-        "avg_reward": total_reward / steps,
+        "avg_reward": total_reward / steps if steps > 0 else 0.0,
         "steps": steps,
     }