Merge pull request #51 from velocitatem/feat-strong-learning-implementation-with-data-contamination

Feat strong learning implementation with data contamination
2026-07-15 17:43:36 +00:00 · 2026-01-31 10:15:09 +01:00
parent 72877439ca 13959e4b28
commit 9843c5deab
33 changed files with 2828 additions and 328 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -9,7 +9,8 @@
 *.old
 **/package-lock.json
 **/*.parquet

+**/_build/
 paper/src/auto/*
 paper/src/bib/auto
 docs/goals/*.md
@@ -24,3 +28,5 @@ sim/rl/behavior_loader/*.png
 sim/rl/behavior_loader/*.svg
 sim/rl/behavior_loader/*.pdf
 tests/e2e/node_modules/**
+lab/case/thesis/runs*/
+sim/case/thesis_simplified/runs*/
--- a/engine/engine.py
+++ b/engine/engine.py
@@ -0,0 +1,66 @@
+from sys import platform
+import numpy as np
+from .lib.demand import generate_demand, estimate_demand
+from .lib.behavior import sample_behavior
+from logging import INFO, getLogger
+logger = getLogger(__name__)
+logger.setLevel(INFO)
+
+
+
+class MarketEngine():
+    """Simulated market that turns posted prices into a behavior-based demand estimate.
+
+    The population of ``N`` sessions is split into agent and human sessions
+    according to ``alpha`` (the agent share). ``demand_sampling_function`` and
+    ``demand_distribution`` are forwarded to ``generate_demand`` on every ``act``.
+    """
+
+    def __init__(self,
+                 alpha = 0.5,
+                 N = 100,
+                 demand_distribution = (50, 10),
+                 demand_sampling_function = np.random.normal):
+        # Round the agent count and give humans the remainder so the two always
+        # add up to N. Truncating N*alpha and N*(1-alpha) independently could
+        # silently drop a session (N=100, alpha=0.29 truncated 28.99... to 28).
+        self.Nagents = int(round(N * alpha))
+        self.Nhumans = int(N) - self.Nagents
+        self.demand = (demand_sampling_function, demand_distribution)
+
+    def act(self, prices):
+        """Return the demand estimated from sampled human and agent sessions at ``prices``."""
+        demand = generate_demand(prices, *self.demand)
+
+        def sample_n(n, human):
+            return [sample_behavior(demand, human=human) for _ in range(n)]
+
+        # humans are sampled before agents; the order fixes the RNG draw sequence
+        human_t, agent_t = sample_n(self.Nhumans, True), sample_n(self.Nagents, False)
+        trajectories = human_t + agent_t
+        demand_estimate = estimate_demand(trajectories)
+        return demand_estimate
+
+    def measure(self):
+        # not implemented yet
+        pass
+
+class PricingEngine():
+    """Placeholder platform: ignores the observed demand and posts random prices."""
+
+    def __init__(self) -> None:
+        pass
+
+    def act(self, demand):
+        # `demand` is unused; ten prices are drawn uniformly from [25, 100)
+        n_prices = 10
+        price_floor, price_ceiling = 25, 100
+        return np.random.uniform(low=price_floor, high=price_ceiling, size=n_prices)
+
+
+
+class Limbo():
+    """Alternates turns between a platform and a market, feeding each the other's last output."""
+
+    def __init__(self, platform, market) -> None:
+        self.platform = platform
+        self.market = market
+        self.platform_turn = True  # the platform moves first
+        self.output = None  # last value returned by either side
+
+    def step(self):
+        """Let the side whose turn it is act on the previous output, print it, then switch turns."""
+        actor = self.platform if self.platform_turn else self.market
+        self.output = actor.act(self.output)
+        print(self.output)
+        self.platform_turn = not self.platform_turn
+
+if __name__ == "__main__":
+    # smoke run: ten alternating turns, starting with the platform
+    limbo = Limbo(PricingEngine(), MarketEngine())
+    for _ in range(10):
+        limbo.step()
--- a/engine/lib/init.py
+++ b/engine/lib/init.py
@@ -0,0 +1,3 @@
+from .demand import generate_demand, estimate_demand
+from .behavior import sample_behavior
+from .render import DashboardRenderer, style_axis
--- a/engine/lib/behavior.py
+++ b/engine/lib/behavior.py
@@ -0,0 +1,47 @@
+from sim.rl.behavior_loader.models import BehaviorModel, AgentBehaviorModel, aggregate_event_transitions
+import pandas as pd
+import numpy as np
+from .demand import generate_demand
+
+import os
+
+# Root of the collected behavior data. The default is the original author's
+# checkout; set PHANTOM_EXPERIMENTS_DIR to run on any other machine.
+base_dir = os.getenv("PHANTOM_EXPERIMENTS_DIR", "/home/velocitatem/Documents/Projects/PHANTOM/experiments")
+human_dir, agent_dir = f"{base_dir}/collected_data/", f"{base_dir}/agents/collected_data/"
+
+_cache = {}  # lazy cache for models and base pivots
+
+def _get_base_pivot(human: bool):
+    """Return the cached base transition table for humans or agents, building it on first use."""
+    key = 'human' if human else 'agent'
+    if key in _cache:
+        return _cache[key]
+
+    if human:
+        model = BehaviorModel(human_dir)
+    else:
+        model = AgentBehaviorModel(agent_dir)
+    # missing transitions are filled with 0.0
+    pivot = pd.DataFrame(aggregate_event_transitions(model.build_MDP())).fillna(0.0)
+    _cache[key] = pivot
+    return pivot
+
+def adjust_behavior_to_condition(condition, transition_matrix):
+    """Expand an NxN transition table to (N*P)x(N*P), one copy of every state per product.
+
+    Each base cell becomes a PxP block scaled by the outer product of the
+    sum-normalized ``condition`` vector with itself. Labels are
+    ``"<state>_product<p>"``.
+    """
+    weights = condition / np.sum(condition)
+    product_ids = range(len(condition))
+    block = np.outer(weights, weights)
+    expanded = np.kron(transition_matrix.values, block)
+
+    row_labels = [f"{r}_product{p}" for r in transition_matrix.index.tolist() for p in product_ids]
+    col_labels = [f"{c}_product{p}" for c in transition_matrix.columns.tolist() for p in product_ids]
+    return pd.DataFrame(expanded, index=row_labels, columns=col_labels)
+
+def sample_behavior(condition, human=True, max_len=40):
+    """Sample one session trajectory from the demand-adjusted transition table.
+
+    Starts from a uniformly chosen state and follows the row-normalized
+    transition weights until a state containing 'checkout' is reached or the
+    trajectory holds ``max_len`` states, whichever comes first.
+    """
+    base_pivot = _get_base_pivot(human)
+    adjusted_transitions = adjust_behavior_to_condition(condition, base_pivot)
+    states = adjusted_transitions.columns
+
+    trajectory = [np.random.choice(adjusted_transitions.index)]
+    # Stop at checkout or at the length cap. The previous `or 'checkout' in ...`
+    # test ignored checkout below the cap and kept looping past the cap while
+    # the last state was a checkout.
+    while len(trajectory) < max_len and 'checkout' not in trajectory[-1]:
+        weights = adjusted_transitions.loc[trajectory[-1]].values
+        total = np.sum(weights)
+        # rows with no outgoing mass fall back to a uniform draw (p=None)
+        trajectory.append(np.random.choice(states, p=weights / total if total > 0 else None))
+    return trajectory
+
+if __name__ == "__main__":
+    # sample and print one human and then one agent trajectory for the same three prices
+    for is_human in (True, False):
+        print(sample_behavior(generate_demand(np.array([10,20,30])), human=is_human))
--- a/engine/lib/demand.py
+++ b/engine/lib/demand.py
@@ -0,0 +1,45 @@
+import logging
+import numpy as np
+from logging import getLogger
+logger = getLogger(__name__)
+
+def generate_demand(prices, distribution_method = np.random.normal, distribution_params = (50.0, 10.0)):
+    """Draw one valuation per product and convert the surplus over price into percent shares.
+
+    ``distribution_method`` is called as ``distribution_method(*distribution_params, size=len(prices))``.
+    When no product has positive surplus the all-zero vector is returned as is.
+    """
+    # assumption 1: each product has an intrinsic valuation drawn from the given distribution
+    valuations = distribution_method(*distribution_params, size=len(prices))
+    # assumption 2: demand falls linearly with price and is floored at zero
+    surplus = np.maximum(0, valuations - prices)
+    total = np.sum(surplus)
+    if total > 0:
+        demand = surplus / total * 100  # normalize to percentage
+    else:
+        demand = surplus  # avoid dividing by zero
+    logger.info(f"Generated demand for prices {prices}: {demand} with valuations from distribution {distribution_params}")
+    return demand
+
+def estimate_demand(trajectories):
+    """Estimate per-product demand shares (percent) from the 'view_product' events in ``trajectories``.
+
+    Returns a dict mapping product id to its share of all counted views;
+    products that were never viewed are absent.
+    """
+    view_counts = {}
+    for trajectory in trajectories:
+        for event in trajectory:
+            if 'view_product' not in event:
+                continue
+            # the product id is the trailing "_product<k>" token of the event name
+            suffix = event.split('_')[-1]
+            product_id = int(suffix.replace('product', ''))
+            view_counts[product_id] = view_counts.get(product_id, 0) + 1
+    total_views = sum(view_counts.values())
+    # normalize to percentage
+    return {pid: (count / total_views) * 100 for pid, count in view_counts.items()}
+
+# Example usage
+if __name__ == "__main__":
+    np.random.seed(42)
+    prices = np.array([20.0, 35.0, 50.0, 65.0])
+    demand = generate_demand(prices)
+    print("Generated Demand:", demand)
+    from .behavior import sample_behavior
+    N, alphat = 200, 0.1
+    n_humans = int(N*(1 - alphat))
+    n_agents = int(N*alphat)
+    # humans are sampled before agents so the RNG draw order stays the same
+    trajectories = [sample_behavior(demand, human=True) for _ in range(n_humans)]
+    trajectories += [sample_behavior(demand, human=False) for _ in range(n_agents)]
+    demand_estimate = estimate_demand(trajectories)
+    print("Estimated Demand from Behavior:", demand_estimate)
+    # mean absolute gap between estimated and generated share, per product
+    abs_errors = [np.abs(demand_estimate.get(k, 0) - demand[k]) for k in range(len(prices))]
+    delta = np.mean(abs_errors)
+    print("Demand Delta:", delta)
--- a/engine/lib/render.py
+++ b/engine/lib/render.py
@@ -0,0 +1,126 @@
+"""rendering logic for PHANTOM environment dashboard"""
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib.gridspec import GridSpec
+
+
+def style_axis(ax, title: str = None, xlabel: str = None, ylabel: str = None):
+    """Hide the top and right spines of ``ax`` and apply whichever of title/xlabel/ylabel is truthy."""
+    for side in ('top', 'right'):
+        ax.spines[side].set_visible(False)
+    if title:
+        ax.set_title(title, fontsize=11, fontweight='bold', pad=8)
+    if xlabel:
+        ax.set_xlabel(xlabel, fontsize=9)
+    if ylabel:
+        ax.set_ylabel(ylabel, fontsize=9)
+
+
+class DashboardRenderer:
+    """stateful renderer for PHANTOM market dynamics visualization"""
+
+    def __init__(self):
+        # figure and grid are created lazily on the first render() call
+        self.fig = None
+        self.gs = None
+
+    def render(self, env) -> None:
+        """Redraw the whole dashboard from the histories stored on ``env``.
+
+        Reads ``env._step_count``, ``env.alpha``, ``env.n_products``, ``env.market``,
+        the ``_demand_history`` / ``_price_history`` / ``_revenue_history`` lists
+        and calls ``env._compute_elasticity()``.
+        """
+        if self.fig is None:
+            # first call: switch to interactive mode and open a non-blocking window
+            plt.ion()
+            self.fig = plt.figure(figsize=(14, 10))
+            self.gs = GridSpec(3, 3, figure=self.fig, hspace=0.35, wspace=0.3,
+                               left=0.07, right=0.95, top=0.92, bottom=0.08)
+            plt.show(block=False)
+
+        # every frame is rebuilt from scratch
+        self.fig.clear()
+        self.fig.suptitle(f'PHANTOM  Market Dynamics  [t={env._step_count}, a={env.alpha:.2f}]',
+                          fontsize=14, fontweight='bold')
+
+        # transpose so that rows are products and columns are steps
+        demand_mat = np.array(env._demand_history).T
+        price_mat = np.array(env._price_history).T
+        elasticity = env._compute_elasticity()
+
+        self._render_scatter(env)
+        self._render_elasticity_bar(env, elasticity)
+        self._render_session_pie(env)
+        self._render_price_heatmap(price_mat)
+        self._render_demand_heatmap(demand_mat)
+        self._render_correlation(env.n_products, price_mat, demand_mat)
+        self._render_revenue(env)
+
+        self.fig.canvas.draw_idle()
+        self.fig.canvas.flush_events()
+
+    def _render_scatter(self, env):
+        """Scatter of all recorded (price, demand) pairs, colored by product, with a linear fit."""
+        ax = self.fig.add_subplot(self.gs[0, 0])
+        prices_flat = np.array(env._price_history).flatten()
+        demands_flat = np.array(env._demand_history).flatten()
+        # product index for each flattened entry: 0..n_products-1 repeated once per recorded step
+        product_ids = np.tile(np.arange(env.n_products), len(env._price_history))
+        ax.scatter(prices_flat, demands_flat, c=product_ids, cmap='plasma', alpha=0.6, s=15, edgecolors='none')
+        if len(prices_flat) > 1:
+            # degree-1 least-squares trend line over the observed price range
+            z = np.polyfit(prices_flat, demands_flat, 1)
+            p_line = np.linspace(prices_flat.min(), prices_flat.max(), 50)
+            ax.plot(p_line, np.polyval(z, p_line), '--', lw=1.5, alpha=0.8)
+        style_axis(ax, "Price-Demand Relationship", "Price ($)", "Demand")
+
+    def _render_elasticity_bar(self, env, elasticity):
+        """Horizontal bar per product of the given elasticity values, with guides at 0 and -1."""
+        ax = self.fig.add_subplot(self.gs[0, 1])
+        ax.barh(range(env.n_products), elasticity, alpha=0.8)
+        ax.axvline(0, lw=0.8, alpha=0.5)
+        ax.axvline(-1, lw=1, ls='--', alpha=0.5)
+        ax.set_yticks(range(env.n_products))
+        ax.set_yticklabels([f'P{i}' for i in range(env.n_products)], fontsize=7)
+        style_axis(ax, "Price Elasticity", "(dQ/dP)(P/Q)", None)
+
+    def _render_session_pie(self, env):
+        """Pie chart of human vs agent session counts taken from ``env.market``."""
+        ax = self.fig.add_subplot(self.gs[0, 2])
+        n_h, n_a = env.market.Nhumans, env.market.Nagents
+        wedges, _ = ax.pie([n_h, n_a], startangle=90, wedgeprops={'linewidth': 2, 'edgecolor': 'white'})
+        ax.legend(wedges, [f'H ({n_h})', f'A ({n_a})'], loc='lower center', fontsize=8,
+                  frameon=False, bbox_to_anchor=(0.5, -0.05))
+        ax.set_title("Session Mix", fontsize=11, fontweight='bold')
+
+    def _render_price_heatmap(self, price_mat):
+        """Heatmap of prices with products on the y axis and steps on the x axis."""
+        ax = self.fig.add_subplot(self.gs[1, :2])
+        im = ax.imshow(price_mat, aspect='auto', cmap='viridis', origin='lower')
+        style_axis(ax, "Price Heatmap P(product, t)", "Step", "Product")
+        cbar = self.fig.colorbar(im, ax=ax, fraction=0.03, pad=0.02)
+        cbar.set_label('$', fontsize=8)
+
+    def _render_demand_heatmap(self, demand_mat):
+        """Heatmap of demand with products on the y axis and steps on the x axis."""
+        ax = self.fig.add_subplot(self.gs[1, 2])
+        im = ax.imshow(demand_mat, aspect='auto', cmap='Blues', origin='lower')
+        style_axis(ax, "Demand Q(product, t)", "Step", None)
+        self.fig.colorbar(im, ax=ax, fraction=0.046, pad=0.02)
+
+    def _render_correlation(self, n_products, price_mat, demand_mat):
+        """Price-vs-demand correlation matrix; drawn only once more than two steps are recorded."""
+        ax = self.fig.add_subplot(self.gs[2, 0])
+        if price_mat.shape[1] > 2:
+            # corrcoef stacks both inputs row-wise; the upper-right block holds the price-demand cross terms
+            corr = np.corrcoef(price_mat, demand_mat)[:n_products, n_products:]
+            im = ax.imshow(corr, cmap='RdBu', vmin=-1, vmax=1, aspect='auto')
+            ax.set_xticks(range(n_products))
+            ax.set_yticks(range(n_products))
+            ax.set_xticklabels([f'Q{i}' for i in range(n_products)], fontsize=6)
+            ax.set_yticklabels([f'P{i}' for i in range(n_products)], fontsize=6)
+            self.fig.colorbar(im, ax=ax, fraction=0.046, pad=0.02)
+        style_axis(ax, "Price-Demand Correlation", None, None)
+
+    def _render_revenue(self, env):
+        """Revenue over time (left axis) with the per-step standard deviation of demand (right axis)."""
+        ax = self.fig.add_subplot(self.gs[2, 1:])
+        n_steps = len(env._revenue_history)
+        demand_std = [np.std(d) for d in env._demand_history]
+        ax.fill_between(range(n_steps), env._revenue_history, alpha=0.3)
+        ax.plot(env._revenue_history, linewidth=2, label='Revenue')
+        ax.set_xlim(0, max(n_steps, 1))
+        ax.set_ylim(0, max(env._revenue_history) * 1.1 if env._revenue_history else 1)
+
+        ax2 = ax.twinx()
+        # NOTE(review): plotting assumes _demand_history and _revenue_history have equal length - confirm
+        ax2.plot(range(n_steps), demand_std, linewidth=2, ls='-', alpha=0.9, label='sigma(Demand)')
+        # min()/max() raise on an empty list, so at least one demand record is required here
+        d_min, d_max = min(demand_std), max(demand_std)
+        # pad the y range by 20% of the spread, or by 0.5 when the series is flat
+        margin = (d_max - d_min) * 0.2 if d_max > d_min else 0.5
+        ax2.set_ylim(max(0, d_min - margin), d_max + margin)
+        ax2.set_ylabel('Demand sigma', fontsize=9)
+
+        style_axis(ax, "Revenue & Demand Dispersion", "Step", "Revenue ($)")
+        ax.legend(loc='upper left', fontsize=7, frameon=False)
+        ax2.legend(loc='upper right', fontsize=7, frameon=False)
+
+    def close(self):
+        """Close the figure if one was opened; the next render() call creates a new one."""
+        if self.fig:
+            plt.close(self.fig)
+            self.fig = None
--- a/engine/studies/factors.py
+++ b/engine/studies/factors.py
@@ -0,0 +1,34 @@
+"""shared factor definitions for experimental designs"""
+import numpy as np
+from dataclasses import dataclass, field
+from typing import Callable, Any
+
+@dataclass
+class Factor:
+    """One experimental factor: a name plus the discrete levels it can take."""
+    name: str
+    # candidate values for this factor
+    levels: list
+    primary: bool = True  # full cross vs sampled
+
+# demand functions with compatible signatures
+def demand_linear(mu, sigma, size):
+    """Normal(mu, sigma) draws with negative values clipped to zero."""
+    draws = np.random.normal(mu, sigma, size)
+    return np.maximum(0, draws)
+def demand_uniform(mu, sigma, size):
+    """Uniform draws on [mu - sigma, mu + sigma)."""
+    low, high = mu - sigma, mu + sigma
+    return np.random.uniform(low, high, size)
+def demand_exponential(mu, sigma, size):
+    """Exponential draws with scale mu; sigma is accepted for signature compatibility and unused."""
+    return np.random.exponential(scale=mu, size=size)
+def demand_logistic(mu, sigma, size):
+    """Logistic draws with location mu and scale sigma."""
+    return np.random.logistic(loc=mu, scale=sigma, size=size)
+
+# lookup from the "demand_fn" factor level to the sampler it names;
+# every sampler takes (mu, sigma, size)
+DEMAND_FUNCTIONS = {
+    "linear": demand_linear,
+    "uniform": demand_uniform,
+    "exponential": demand_exponential,
+    "logistic": demand_logistic,
+}
+
+# Experimental factors. Crossing all six gives 4*4*4*3*3*3 = 1728 combinations;
+# the three primary factors alone give 4*4*4 = 64 cells.
+FACTORS = [
+    Factor("demand_fn", list(DEMAND_FUNCTIONS.keys()), primary=True),
+    Factor("alpha", [0.1, 0.3, 0.5, 0.7], primary=True),
+    Factor("n_products", [5, 15, 30, 50], primary=True),
+    Factor("demand_mu", [30.0, 50.0, 70.0], primary=False),
+    Factor("demand_sigma", [5.0, 10.0, 20.0], primary=False),
+    Factor("N", [100, 500, 1000], primary=False),
+]
+
+# number of seeds (0 .. SEEDS_PER_CONFIG-1) run for every factor combination
+SEEDS_PER_CONFIG = 5
--- a/engine/studies/full_factorial.py
+++ b/engine/studies/full_factorial.py
@@ -0,0 +1,89 @@
+"""full factorial design - all factor combinations"""
+import sys
+sys.path.insert(0, "..")
+import logging
+from itertools import product
+import json
+import hashlib
+from pathlib import Path
+from concurrent.futures import ProcessPoolExecutor
+from .factors import FACTORS, DEMAND_FUNCTIONS, SEEDS_PER_CONFIG
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)
+
+def generate_configs():
+    """generate all factor combinations with seeds
+
+    Each config is a dict of factor name -> level plus "seed" and an 8-character
+    "id" taken from the MD5 of the config's sorted-key JSON.
+    """
+    names = [f.name for f in FACTORS]
+    level_grid = product(*(f.levels for f in FACTORS))
+
+    configs = []
+    for combo in level_grid:
+        base = dict(zip(names, combo))
+        for seed in range(SEEDS_PER_CONFIG):
+            cfg = dict(base, seed=seed)
+            # the id is computed before it is stored, so it covers factors and seed only
+            digest = hashlib.md5(json.dumps(cfg, sort_keys=True).encode()).hexdigest()
+            cfg["id"] = digest[:8]
+            configs.append(cfg)
+    return configs
+
+def run_single(cfg: dict) -> dict:
+    """execute one experiment config, return metrics
+
+    Seeds the global numpy RNG with ``cfg["seed"]``, runs up to 100 random-action
+    steps and reports total and average reward.
+    """
+    from engine.wrapper import PHANTOM
+    import numpy as np
+
+    np.random.seed(cfg["seed"])
+    demand_fn = DEMAND_FUNCTIONS[cfg["demand_fn"]]
+
+    env = PHANTOM(n_products=cfg["n_products"], alpha=cfg["alpha"], N=cfg["N"])
+    # swap in the configured valuation sampler and its (mu, sigma) parameters
+    env.market.demand = (demand_fn, (cfg["demand_mu"], cfg["demand_sigma"]))
+
+    env.reset()
+    total_reward = 0.0
+    steps = 0
+    max_steps = 100
+    while steps < max_steps:
+        _, reward, term, _, _ = env.step(env.action_space.sample())
+        total_reward += reward
+        steps += 1
+        if term:
+            break
+
+    env.close()
+    avg_reward = total_reward / steps if steps > 0 else 0.0
+    return {
+        "id": cfg["id"],
+        "config": cfg,
+        "total_reward": total_reward,
+        "avg_reward": avg_reward,
+        "steps": steps,
+    }
+
+def run_study(max_workers: int = None, output: str = "results_full.jsonl"):
+    """Run every generated config in a process pool and write one JSON object per line to ``output``.
+
+    Results are written only after all runs have finished; the list of result
+    dicts is also returned.
+    """
+    configs = generate_configs()
+    log.info(f"full factorial: {len(configs)} configs ({len(configs)//SEEDS_PER_CONFIG} unique × {SEEDS_PER_CONFIG} seeds)")
+
+    results = []
+    with ProcessPoolExecutor(max_workers=max_workers) as ex:
+        for done, result in enumerate(ex.map(run_single, configs), start=1):
+            results.append(result)
+            if done % 100 == 0:
+                log.info(f"progress: {done}/{len(configs)}")
+
+    lines = [json.dumps(r) for r in results]
+    Path(output).write_text("\n".join(lines))
+    log.info(f"wrote {len(results)} results to {output}")
+    return results
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--workers", type=int, default=None)
+    parser.add_argument("--output", default="results_full.jsonl")
+    parser.add_argument("--dry-run", action="store_true", help="only show design size")
+    args = parser.parse_args()
+
+    # always report the design size; run the study unless --dry-run was given
+    configs = generate_configs()
+    factor_names = [f.name for f in FACTORS]
+    level_counts = [len(f.levels) for f in FACTORS]
+    log.info(f"design: {len(configs)} runs | factors: {factor_names} | levels: {level_counts}")
+
+    if not args.dry_run:
+        run_study(args.workers, args.output)
--- a/engine/studies/mixed_lh.py
+++ b/engine/studies/mixed_lh.py
@@ -0,0 +1,106 @@
+"""mixed design: full factorial on primary factors, latin hypercube on secondary"""
+import sys
+sys.path.insert(0, "..")
+import logging
+from itertools import product
+import json
+import hashlib
+from pathlib import Path
+from concurrent.futures import ProcessPoolExecutor
+import numpy as np
+from scipy.stats.qmc import LatinHypercube
+from factors import FACTORS, DEMAND_FUNCTIONS, SEEDS_PER_CONFIG
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)
+
+LH_SAMPLES = 10
+
+def generate_configs(lh_samples: int = LH_SAMPLES):
+    """Cross all primary factor levels and attach ``lh_samples`` Latin-hypercube draws of the secondary factors.
+
+    Every (primary cell, secondary draw) pair is repeated once per seed and
+    gets an 8-character "id" from the MD5 of its sorted-key JSON.
+    """
+    primary = [f for f in FACTORS if f.primary]
+    secondary = [f for f in FACTORS if not f.primary]
+
+    primary_grid = list(product(*[f.levels for f in primary]))
+    # one sampler shared by all cells, so each cell gets the next batch of points
+    sampler = LatinHypercube(d=len(secondary), seed=42)
+
+    configs = []
+    for p_combo in primary_grid:
+        primary_vals = {f.name: level for f, level in zip(primary, p_combo)}
+        for point in sampler.random(n=lh_samples):
+            base = dict(primary_vals)
+            # scale each unit-interval coordinate to a level index
+            for factor, u in zip(secondary, point):
+                base[factor.name] = factor.levels[int(u * len(factor.levels))]
+
+            for seed in range(SEEDS_PER_CONFIG):
+                cfg = dict(base, seed=seed)
+                digest = hashlib.md5(json.dumps(cfg, sort_keys=True).encode()).hexdigest()
+                cfg["id"] = digest[:8]
+                configs.append(cfg)
+    return configs
+
+def run_single(cfg: dict) -> dict:
+    """Run one config: seed the global numpy RNG, take up to 100 random-action steps, return metrics."""
+    from engine.wrapper import PHANTOM
+    import numpy as np
+
+    np.random.seed(cfg["seed"])
+    demand_fn = DEMAND_FUNCTIONS[cfg["demand_fn"]]
+
+    env = PHANTOM(n_products=cfg["n_products"], alpha=cfg["alpha"], N=cfg["N"])
+    # swap in the configured valuation sampler and its (mu, sigma) parameters
+    env.market.demand = (demand_fn, (cfg["demand_mu"], cfg["demand_sigma"]))
+
+    env.reset()
+    total_reward = 0.0
+    steps = 0
+    max_steps = 100
+    while steps < max_steps:
+        _, reward, term, _, _ = env.step(env.action_space.sample())
+        total_reward += reward
+        steps += 1
+        if term:
+            break
+
+    env.close()
+    avg_reward = total_reward / steps
+    return {
+        "id": cfg["id"],
+        "config": cfg,
+        "total_reward": total_reward,
+        "avg_reward": avg_reward,
+        "steps": steps,
+    }
+
+def run_study(max_workers: int = None, output: str = "results_mixed.jsonl", lh_samples: int = LH_SAMPLES):
+    """Run the mixed design in a process pool and write one JSON object per line to ``output``.
+
+    Results are written only after all runs have finished; the list of result
+    dicts is also returned.
+    """
+    configs = generate_configs(lh_samples)
+    n_primary_cells = int(np.prod([len(f.levels) for f in FACTORS if f.primary]))
+    log.info(f"mixed LH: {len(configs)} configs ({n_primary_cells} primary × {lh_samples} LH × {SEEDS_PER_CONFIG} seeds)")
+
+    results = []
+    with ProcessPoolExecutor(max_workers=max_workers) as ex:
+        for done, result in enumerate(ex.map(run_single, configs), start=1):
+            results.append(result)
+            if done % 100 == 0:
+                log.info(f"progress: {done}/{len(configs)}")
+
+    lines = [json.dumps(r) for r in results]
+    Path(output).write_text("\n".join(lines))
+    log.info(f"wrote {len(results)} results to {output}")
+    return results
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--workers", type=int, default=None)
+    parser.add_argument("--output", default="results_mixed.jsonl")
+    parser.add_argument("--lh-samples", type=int, default=10)
+    parser.add_argument("--dry-run", action="store_true", help="only show design size")
+    args = parser.parse_args()
+
+    # always report the design size; run the study unless --dry-run was given
+    primary_names = [f.name for f in FACTORS if f.primary]
+    secondary_names = [f.name for f in FACTORS if not f.primary]
+    configs = generate_configs(args.lh_samples)
+    log.info(f"design: {len(configs)} runs | primary: {primary_names} | secondary (LH): {secondary_names}")
+
+    if not args.dry_run:
+        run_study(args.workers, args.output, args.lh_samples)
--- a/engine/train.py
+++ b/engine/train.py
@@ -0,0 +1,45 @@
+from stable_baselines3 import SAC
+from stable_baselines3.common.callbacks import EvalCallback, BaseCallback
+from .wrapper import PHANTOM
+
+
+class RenderCallback(BaseCallback):
+    """Renders environment on every step for live visualization."""
+    def __init__(self, env: PHANTOM):
+        super().__init__()
+        # keep a direct handle on the environment whose render() is called each step
+        self.env = env
+
+    def _on_step(self) -> bool:
+        self.env.render()
+        # always True, so this callback never asks for training to stop
+        return True
+
+
+# NOTE: everything below runs at import time (there is no __main__ guard)
+# separate environments for training (with live dashboard) and for evaluation (no rendering)
+env = PHANTOM(n_products=10, alpha=0.3, render_mode="human")
+eval_env = PHANTOM(n_products=10, alpha=0.3, render_mode=None)
+
+# MultiInputPolicy is used because the observation space is a Dict
+model = SAC(
+    "MultiInputPolicy",
+    env,
+    verbose=1,
+    learning_rate=3e-4,
+    buffer_size=50000,
+    batch_size=256,
+    tau=0.005,
+    gamma=0.99,
+)
+
+render_cb = RenderCallback(env)
+eval_cb = EvalCallback(eval_env, eval_freq=1000, n_eval_episodes=5, verbose=1)
+
+model.learn(total_timesteps=50000, callback=[render_cb, eval_cb])
+model.save("phantom_sac")
+
+# test trained policy
+# a fresh environment replaces the training one; the earlier `env` is not closed here
+env = PHANTOM(n_products=10, alpha=0.3, render_mode="human")
+obs, _ = env.reset()
+for _ in range(100):
+    action, _ = model.predict(obs, deterministic=True)
+    obs, reward, term, trunc, _ = env.step(action)
+    env.render()
+    if term or trunc: break
+env.close()
--- a/engine/wrapper.py
+++ b/engine/wrapper.py
@@ -0,0 +1,118 @@
+import gymnasium as gym
+from gymnasium import spaces
+import numpy as np
+from .engine import Limbo, MarketEngine, PricingEngine
+from .lib.render import DashboardRenderer
+
+
+class PHANTOM(gym.Env):
+    """Gymnasium wrapper for the Limbo pricing-market simulation. Platform sets prices, market responds with demand."""
+    metadata = {"render_modes": ["human", "ansi"]}
+
+    def __init__(self,
+                 n_products: int = 10,
+                 alpha: float = 0.3,
+                 N: int = 100,
+                 price_bounds: tuple = (10.0, 150.0),
+                 lambda_coi: float = 0.1,
+                 render_mode: str = None):
+        super().__init__()
+        self.n_products = n_products
+        self.price_bounds = price_bounds
+        # stored only; nothing in this class reads lambda_coi yet
+        self.lambda_coi = lambda_coi
+        self.render_mode = render_mode
+        self.alpha = alpha
+        self.N = N
+
+        self.market = MarketEngine(alpha=alpha, N=N)
+        # the stub platform and the Limbo loop are constructed but step()/reset() call self.market directly
+        self._platform_stub = PricingEngine()
+        self._limbo = Limbo(self._platform_stub, self.market)
+
+        # action: one price per product within price_bounds
+        self.action_space = spaces.Box(
+            low=price_bounds[0], high=price_bounds[1],
+            shape=(n_products,), dtype=np.float32
+        )
+        # observation: demand vector bounded to [0, 100] plus the current prices
+        self.observation_space = spaces.Dict({
+            "demand": spaces.Box(low=0.0, high=100.0, shape=(n_products,), dtype=np.float32),
+            "prices": spaces.Box(low=price_bounds[0], high=price_bounds[1], shape=(n_products,), dtype=np.float32),
+        })
+
+        self._prices = None
+        self._demand = None
+        self._step_count = 0
+        self._demand_history = []
+        self._price_history = []
+        self._revenue_history = []
+        self._renderer = None
+
+    def _get_obs(self) -> dict:
+        """Build the observation dict; products missing from the demand dict count as 0.0."""
+        demand_arr = np.array([self._demand.get(i, 0.0) for i in range(self.n_products)], dtype=np.float32)
+        return {"demand": demand_arr, "prices": self._prices.astype(np.float32)}
+
+    def _compute_reward(self, prices: np.ndarray, demand: dict) -> float:
+        """Reward is the sum over products of price times demand."""
+        revenue = np.sum(prices * np.array([demand.get(i, 0.0) for i in range(self.n_products)]))
+        # TODO: implement supra-competitive price punishment
+        return float(revenue)
+
+    def _record_history(self):
+        """Append the current demand, prices and revenue to the history lists used for rendering."""
+        demand_arr = np.array([self._demand.get(i, 0.0) for i in range(self.n_products)])
+        self._demand_history.append(demand_arr)
+        self._price_history.append(self._prices.copy())
+        self._revenue_history.append(np.sum(self._prices * demand_arr))
+
+    def reset(self, seed=None, options=None):
+        """Draw random initial prices, query the market once and clear the histories."""
+        super().reset(seed=seed)
+        # NOTE(review): prices come from the global np.random, not self.np_random,
+        # so the `seed` argument does not determine them
+        self._prices = np.random.uniform(*self.price_bounds, size=self.n_products)
+        self._demand = self.market.act(self._prices)
+        self._step_count = 0
+        self._demand_history, self._price_history, self._revenue_history = [], [], []
+        self._record_history()
+        return self._get_obs(), {}
+
+    def step(self, action: np.ndarray):
+        """Apply the clipped action as prices, query the market and return the gymnasium 5-tuple."""
+        self._prices = np.clip(action, *self.price_bounds)
+        self._demand = self.market.act(self._prices)
+        self._step_count += 1
+        self._record_history()
+
+        reward = self._compute_reward(self._prices, self._demand)
+        # the episode ends after 100 steps; this is reported as terminated and truncated is always False
+        terminated = self._step_count >= 100
+
+        return self._get_obs(), reward, terminated, False, {"step": self._step_count}
+
+    def _compute_elasticity(self) -> np.ndarray:
+        """point elasticity: e = (dQ/dP) * (P/Q) via finite differences, clipped to [-5, 5]"""
+        if len(self._price_history) < 2:
+            return np.zeros(self.n_products)
+        p, q = np.array(self._price_history), np.array(self._demand_history)
+        dp, dq = np.diff(p, axis=0), np.diff(q, axis=0)
+        # price changes of 0.5 or less are ignored (their elasticity is set to 0)
+        valid = np.abs(dp) > 0.5
+        with np.errstate(divide='ignore', invalid='ignore'):
+            # Q is floored at 1.0 in the P/Q ratio
+            elasticity = np.where(valid, (dq / dp) * (p[:-1] / np.maximum(q[:-1], 1.0)), 0.0)
+            elasticity = np.nan_to_num(np.clip(elasticity, -5.0, 5.0), nan=0.0)
+        # average over steps, giving one value per product
+        return np.mean(elasticity, axis=0) if len(elasticity) > 0 else np.zeros(self.n_products)
+
+    def render(self):
+        """Draw the dashboard in "human" mode, return a status string in "ansi" mode, otherwise None."""
+        if self.render_mode == "human":
+            if self._renderer is None:
+                self._renderer = DashboardRenderer()
+            self._renderer.render(self)
+        elif self.render_mode == "ansi":
+            return f"step={self._step_count}, prices={self._prices}, demand={self._demand}"
+        return None
+
+    def close(self):
+        """Close the dashboard renderer if one was created."""
+        if self._renderer:
+            self._renderer.close()
+            self._renderer = None
+
+
+if __name__ == "__main__":
+    # random-policy smoke run with the live dashboard; stops once the env reports termination
+    env = PHANTOM(n_products=15, alpha=0.3, N=100, render_mode="human")
+    env.reset()
+    for _ in range(100):
+        _, _, term, _, _ = env.step(env.action_space.sample())
+        env.render()
+        if term:
+            break
+    env.close()
--- a/experiments/procesing/contaminator.py
+++ b/experiments/procesing/contaminator.py
@@ -1,7 +1,14 @@
-import pandas as pd
-import random
+from __future__ import annotations
+
 import os
+import random
 from pathlib import Path
+from types import SimpleNamespace
+
+import pandas as pd
+
+from lib.separability import estimate_alpha, load_artifacts, score_session
+

 # use relative import when in package context, fallback for standalone
 try:
@@ -15,6 +22,11 @@ except ImportError:
 PROJECT_ROOT = Path(__file__).parent.parent.parent
 AGENT_DATA_DIR = Path(os.getenv('PHANTOM_AGENT_DATA_DIR', PROJECT_ROOT / "experiments" / "agents" / "collected_data"))

+# separability scoring is optional: when load_artifacts() raises FileNotFoundError
+# the module still imports and SEPARABILITY_ARTIFACTS is None
+try:
+    SEPARABILITY_ARTIFACTS = load_artifacts()
+except FileNotFoundError:
+    SEPARABILITY_ARTIFACTS = None
+

 def remap_schema(df: pd.DataFrame, mapping: dict, on: str = "event_type") -> pd.DataFrame:
    """remap column values according to mapping dict, preserving unmapped values"""
@@ -23,6 +35,23 @@ def remap_schema(df: pd.DataFrame, mapping: dict, on: str = "event_type") -> pd.
    return df


+def _states_to_events(states: list[str]) -> list[SimpleNamespace]:
+    """Convert "page|productId|eventName" state strings into event records.
+
+    Each record carries ``eventName``, ``page`` (prefixed with "/"), ``productId``
+    and ``ts``, where ``ts`` is the state's position in the list as a float.
+    """
+    def to_event(position: int, state) -> SimpleNamespace:
+        # non-string states get placeholder page/product fields and their text as the event name
+        fields = state.split("|") if isinstance(state, str) else ["page", "product", str(state)]
+        return SimpleNamespace(
+            eventName=fields[2] if len(fields) > 2 else fields[-1],
+            page=f"/{fields[0]}" if fields else "/",
+            productId=fields[1] if len(fields) > 1 else "unknown",
+            ts=float(position),
+        )
+
+    return [to_event(position, state) for position, state in enumerate(states)]
+
 def contaminate_dataset(df: pd.DataFrame, on: str = "event_type",
                        contamination_rate: float = 0.1,
                        agent_data_dir: Path = None) -> pd.DataFrame:
@@ -48,6 +77,8 @@ def contaminate_dataset(df: pd.DataFrame, on: str = "event_type",

    # generate synthetic trajectories
    new_rows = []
+    alpha_estimates = []
+
    for start_event in start_events:
        # sample trajectory from agent model, using a state that contains the event type
        mdp_states = model.mdp.get('states', []) if model.mdp else []
@@ -56,11 +87,28 @@ def contaminate_dataset(df: pd.DataFrame, on: str = "event_type",
            continue  # skip if no matching start state
        start_state = random.choice(matching_starts)
        trajectory = model.sample_traj(start_state, max_len=20)
+        score_payload: list[SimpleNamespace] = []
+        score: dict[str, float] = {}
+        if SEPARABILITY_ARTIFACTS:
+            score_payload = _states_to_events(trajectory)
+            score = score_session(score_payload, SEPARABILITY_ARTIFACTS)
+            alpha_estimates.append(
+                estimate_alpha(score["prob_agent"], score["delta_h"], score["delta_a"], temperature=2.0)
+            )
+
        for state in trajectory:
-            parts = state.split('|')  # page|productId|eventName format
-            new_rows.append({on: parts[-1] if parts else start_event, 'source': 'synthetic_agent'})
+            parts = state.split('|') if isinstance(state, str) else [start_event]
+            new_rows.append({
+                on: parts[-1] if parts else start_event,
+                'source': 'synthetic_agent',
+                'prob_agent': score.get('prob_agent') if SEPARABILITY_ARTIFACTS and score_payload else None,
+                'delta_h': score.get('delta_h') if SEPARABILITY_ARTIFACTS and score_payload else None,
+                'delta_a': score.get('delta_a') if SEPARABILITY_ARTIFACTS and score_payload else None,
+            })

    if new_rows:
        contaminate_df = pd.DataFrame(new_rows)
        df = pd.concat([df, contaminate_df], ignore_index=True)
+        if alpha_estimates:
+            df['estimated_alpha'] = sum(alpha_estimates) / len(alpha_estimates)
    return df
--- a/experiments/procesing/tests/test_demand.py
+++ b/experiments/procesing/tests/test_demand.py
@@ -6,6 +6,7 @@ from procesing.steps import (
 )

 def test_compute_demand(pipeline_context):
+    random.seed(42)  # deterministic test
    step = ComputeDemandStep(context=pipeline_context)

    # Test with normal interaction data
@@ -26,6 +27,7 @@ def test_compute_demand(pipeline_context):


 def test_compute_demand_skewed(pipeline_context):
+    random.seed(42)  # deterministic test
    step = ComputeDemandStep(context=pipeline_context)

    # Test with normal interaction data
--- a/sim/case/init.py
+++ b/sim/case/init.py
@@ -0,0 +1,2 @@
+"""Case-specific simulations and experiments."""
+
--- a/sim/case/thesis_simplified/init.py
+++ b/sim/case/thesis_simplified/init.py
@@ -0,0 +1,2 @@
+"""Minimal thesis-aligned pricing simulation (self-contained)."""
+
--- a/sim/case/thesis_simplified/coi.py
+++ b/sim/case/thesis_simplified/coi.py
@@ -0,0 +1,125 @@
+"""Cost of Information (COI) computation for thesis pricing system.
+
+Core KPI: COI = E[p_shown] - p_min measures pricing power from information asymmetry.
+Theorem 1 shows COI erodes as agent queries increase: as N->inf, p^(1)->p_min.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Dict, List, TYPE_CHECKING
+import numpy as np
+
+if TYPE_CHECKING:
+    from .simplified import Session
+
+
+@dataclass(frozen=True)
+class COIWindow:
+    """Immutable COI summary for one window of sessions, computed from realized price exposures.
+
+    policy: E[p_shown] - cost, the definition-level KPI
+    agent: E[p^(1)] - cost where p^(1) is min price under agent querying
+    leak: max(policy - agent, 0), observable gap from reconnaissance
+    survival_ratio: agent/policy, fraction of pricing power retained
+    """
+    policy: float
+    agent: float
+    leak: float
+    survival_ratio: float
+    policy_by_product: np.ndarray  # per-product breakdown behind `policy`
+    agent_by_product: np.ndarray  # per-product breakdown behind `agent`
+    demand_weights: np.ndarray  # per-product weights used when averaging across products
+
+
+def aggregate_prices(sessions: List["Session"], mode: str = "all") -> Dict[int, List[float] | float]:
+    """Unified price aggregation across sessions, keyed by integer product index.
+
+    mode: "all" returns all prices per product, "min_per_session" returns min price per session per product,
+          "min_across" returns single min price per product; any other value behaves like "all"
+    """
+    if mode == "min_across":
+        mins: Dict[int, float] = {}
+        for s in sessions:
+            for e in s.events:
+                pidx, price = int(e.product_idx), float(e.price_seen)
+                mins[pidx] = min(mins.get(pidx, price), price)
+        return mins
+    if mode == "min_per_session":
+        result: Dict[int, List[float]] = {}
+        for s in sessions:
+            by_p: Dict[int, float] = {}  # lowest price this session saw for each product
+            for e in s.events:
+                pidx, price = int(e.product_idx), float(e.price_seen)
+                by_p[pidx] = min(by_p.get(pidx, price), price)
+            for pidx, pmin in by_p.items():
+                result.setdefault(pidx, []).append(pmin)
+        return result
+    prices: Dict[int, List[float]] = {}  # "all", also the fallback for unrecognised modes
+    for s in sessions:
+        for e in s.events:
+            # cast the key like the other modes do, so every mode returns plain int keys
+            prices.setdefault(int(e.product_idx), []).append(float(e.price_seen))
+    return prices
+
+
+def demand_weights_by_product(sessions: List["Session"], demand_mapping: Dict[str, float], n_products: int) -> np.ndarray:
+    """Normalised demand share per product; each session's demand is credited to the product of its first event."""
+    w = np.zeros(n_products, dtype=float)
+    first_pidx = {s.sid: (int(s.events[0].product_idx) if s.events else -1) for s in sessions}  # -1 marks "no events"
+    for sid, q in demand_mapping.items():
+        pidx = first_pidx.get(sid, -1)
+        if 0 <= pidx < n_products:  # skip unknown sids and out-of-range indices (a negative index would wrap around)
+            w[pidx] += float(q)
+    total = float(np.sum(w))
+    return (w / total) if total > 0 else w
+
+
+def compute_coi_window(sessions: List["Session"], costs: np.ndarray, demand_mapping: Dict[str, float] | None = None) -> COIWindow:
+    """Summarise policy-level and agent-realised COI for one window of sessions.
+
+    Per product: policy = mean price shown - cost, agent = lowest price any agent session saw - cost.
+    """
+    n = int(len(costs))
+    shown = aggregate_prices(sessions, mode="all")
+    agent_only = [s for s in sessions if s.actor == "A"]
+    agent_floor = aggregate_prices(agent_only, mode="min_across") if agent_only else {}
+
+    policy_by = np.zeros(n, dtype=float)
+    agent_by = np.zeros(n, dtype=float)
+    seen = np.array([(i in shown) for i in range(n)], dtype=bool)
+    agent_seen = np.array([(i in agent_floor) for i in range(n)], dtype=bool)
+
+    for pidx, exposures in shown.items():
+        if 0 <= pidx < n and exposures:
+            policy_by[pidx] = float(np.mean(exposures) - float(costs[pidx]))
+    for pidx, floor in agent_floor.items():
+        if 0 <= pidx < n:
+            agent_by[pidx] = float(floor - float(costs[pidx]))
+
+    untouched = seen & ~agent_seen  # products with no agent exposure keep their policy COI (no erosion)
+    agent_by[untouched] = policy_by[untouched]
+
+    demand_w = demand_weights_by_product(sessions, demand_mapping, n) if demand_mapping else np.zeros(n, dtype=float)
+
+    if float(np.sum(demand_w)) > 0:  # demand-weighted average across products
+        policy, agent = float(np.dot(demand_w, policy_by)), float(np.dot(demand_w, agent_by))
+    elif np.any(seen):  # no usable weights: plain mean over the products that were shown
+        policy, agent = float(np.mean(policy_by[seen])), float(np.mean(agent_by[seen]))
+    else:
+        policy, agent = 0.0, 0.0
+
+    leak = float(max(policy - agent, 0.0))
+    survival = float(np.clip(agent / policy, 0.0, 1.0)) if policy > 0 else 0.0
+
+    return COIWindow(policy=policy, agent=agent, leak=leak, survival_ratio=survival,
+                     policy_by_product=policy_by, agent_by_product=agent_by, demand_weights=demand_w)
+
+
+def coi_erosion(coi_policy: float, coi_agent: float, eps: float = 1e-9) -> float:
+    """Share of policy-level COI destroyed by agent queries, clipped to [0, 1].
+
+    erosion = 1 - COI_agent / (COI_policy + eps); 0.0 when COI_policy <= eps. As COI_agent -> 0, erosion -> 1.
+    """
+    if coi_policy <= eps:
+        return 0.0
+    retained = coi_agent / (coi_policy + eps)
+    return float(np.clip(1.0 - retained, 0.0, 1.0))
--- a/sim/case/thesis_simplified/experiments.py
+++ b/sim/case/thesis_simplified/experiments.py
@@ -0,0 +1,325 @@
+"""COI leakage experiments and policy comparisons.
+
+Demonstrates the core thesis contribution: COI erosion under agent contamination
+and recovery via robust pricing policies.
+
+Generates TensorBoard logs for:
+- COI erosion curves across contamination levels
+- Policy comparison (fixed vs adaptive vs RL)
+- Revenue/margin trade-offs
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Tuple
+import json
+import numpy as np
+
+try:
+    from torch.utils.tensorboard import SummaryWriter
+    HAS_TB = True
+except ImportError:
+    HAS_TB = False
+
+from .simplified_env import PricingEnv, EnvConfig, make_env
+from .simplified import System
+
+
+@dataclass
+class ExperimentResult:
+    """Aggregated metrics of one policy evaluated at one contamination level."""
+    name: str  # policy name (key of the policies dict)
+    alpha: float  # true contamination level the environment was configured with
+    reward_mean: float
+    reward_std: float
+    coi_erosion: float
+    alpha_error: float  # mean |alpha_true - alpha_est|
+    revenue: float
+    margin: float
+
+    def to_dict(self) -> dict:
+        return {k: getattr(self, k) for k in self.__dataclass_fields__}
+
+
+def theoretical_coi_erosion_curve(alphas: np.ndarray, n_sessions: int = 1000) -> np.ndarray:
+    """Closed-form COI erosion per contamination level (Theorem 1, order statistic model).
+
+    For N i.i.d. uniform queries on [p_min, p_max]:
+    E[p^(1)] = p_min + (p_max - p_min)/(N+1), so erosion = 1 - 2/(N+1), with N = max(1, int(alpha * n_sessions))
+    """
+    def _erosion(alpha: float) -> float:
+        n_agents = max(1, int(alpha * n_sessions))
+        return 1.0 - 2.0 / (n_agents + 1)
+
+    return np.array([_erosion(a) for a in alphas])
+
+
+def run_policy_episode(
+    env: PricingEnv,
+    policy_fn,
+    n_episodes: int = 10
+) -> Tuple[List[float], List[float], List[float], List[float]]:
+    """Roll out policy_fn for n_episodes; return pooled per-step (rewards, coi_erosions, alpha_errors, revenues)."""
+    rewards: List[float] = []
+    coi_erosions: List[float] = []
+    alpha_errors: List[float] = []
+    revenues: List[float] = []
+    for _ in range(n_episodes):
+        obs, info = env.reset()
+        finished = False
+        while not finished:
+            obs, reward, terminated, truncated, info = env.step(policy_fn(obs, env.n))
+            finished = terminated or truncated
+            rewards.append(reward)
+            if 'coi_erosion' in info:  # optional metrics are recorded only on steps that report them
+                coi_erosions.append(info['coi_erosion'])
+            if 'alpha_true' in info and 'alpha_est' in info:
+                alpha_errors.append(abs(info['alpha_true'] - info['alpha_est']))
+            if 'revenue' in info:
+                revenues.append(info['revenue'])
+    return rewards, coi_erosions, alpha_errors, revenues
+
+
+class PolicyRegistry:
+    """Stateless baseline policies; each maps (obs, n) to n float multipliers."""
+
+    @staticmethod
+    def fixed(obs: np.ndarray, n: int, margin: float = 0.15) -> np.ndarray:
+        return (1.0 + margin) * np.ones(n, dtype=np.float32)
+
+    @staticmethod
+    def random(obs: np.ndarray, n: int, rng: np.random.Generator = None) -> np.ndarray:
+        """Uniform draws between 0.7 and 1.3; a fresh unseeded generator is used when rng is omitted."""
+        return (rng or np.random.default_rng()).uniform(0.7, 1.3, n).astype(np.float32)
+
+    @staticmethod
+    def adaptive(obs: np.ndarray, n: int, base_margin: float = 0.15) -> np.ndarray:
+        """Shrink the margin linearly as the alpha estimate (obs[2n], else 0.2) grows: by 40% at alpha = 1."""
+        alpha_est = 0.2 if len(obs) <= 2 * n else obs[2 * n]
+        markup = base_margin * (1.0 - 0.4 * alpha_est)
+        return (1.0 + markup) * np.ones(n, dtype=np.float32)
+
+    @staticmethod
+    def aggressive(obs: np.ndarray, n: int) -> np.ndarray:
+        """Constant 1.4 multiplier regardless of the observation."""
+        return 1.4 * np.ones(n, dtype=np.float32)
+
+    @staticmethod
+    def defensive(obs: np.ndarray, n: int) -> np.ndarray:
+        """Constant 1.05 multiplier regardless of the observation."""
+        return 1.05 * np.ones(n, dtype=np.float32)
+
+    @staticmethod
+    def alpha_proportional(obs: np.ndarray, n: int, max_margin: float = 0.3) -> np.ndarray:
+        """Margin scales with (1 - alpha_est): max_margin at alpha 0, none at alpha 1."""
+        alpha_est = 0.2 if len(obs) <= 2 * n else obs[2 * n]
+        markup = max_margin * (1.0 - alpha_est)
+        return (1.0 + markup) * np.ones(n, dtype=np.float32)
+
+
+def run_contamination_sweep(
+    alphas: List[float],
+    policies: Dict[str, callable],
+    n_products: int = 10,
+    max_steps: int = 200,
+    n_episodes: int = 10,
+    seed: int = 42,
+    log_dir: str = None
+) -> Dict[str, List[ExperimentResult]]:
+    """Evaluate every policy at each contamination level; returns {policy name: [ExperimentResult per alpha]}."""
+
+    results = {name: [] for name in policies}
+    writer = SummaryWriter(Path(log_dir) / "sweep") if log_dir and HAS_TB else None
+
+    for alpha in alphas:
+        print(f"  alpha={alpha:.2f}", end=" ")
+        env_cfg = EnvConfig(
+            n_products=n_products, max_steps=max_steps,
+            alpha_true=alpha, reward_mode="robust", seed=seed)
+        env = make_env(env_cfg)  # one environment per alpha, shared by all policies
+
+        for name, policy_fn in policies.items():
+            rewards, coi_vals, alpha_errs, revenues = run_policy_episode(env, policy_fn, n_episodes)
+
+            result = ExperimentResult(
+                name=name, alpha=alpha,
+                reward_mean=float(np.mean(rewards)),
+                reward_std=float(np.std(rewards)),
+                coi_erosion=float(np.mean(coi_vals)) if coi_vals else 0.0,
+                alpha_error=float(np.mean(alpha_errs)) if alpha_errs else 0.0,
+                revenue=float(np.mean(revenues)) if revenues else 0.0,
+                margin=float(np.mean([policy_fn(np.zeros(3 * n_products + 3), n_products)]) - 1.0))
+
+            results[name].append(result)
+
+            if writer:
+                step = int(round(alpha * 100))  # round: 0.29 * 100 is 28.999999999999996 and would truncate to 28
+                writer.add_scalar(f'{name}/reward', result.reward_mean, step)
+                writer.add_scalar(f'{name}/coi_erosion', result.coi_erosion, step)
+                writer.add_scalar(f'{name}/alpha_error', result.alpha_error, step)
+                writer.add_scalar(f'{name}/revenue', result.revenue, step)
+
+        print("done")
+
+    # add theoretical curve
+    if writer:
+        theo = theoretical_coi_erosion_curve(np.array(alphas))
+        for a, e in zip(alphas, theo):
+            writer.add_scalar('theoretical/coi_erosion', e, int(round(a * 100)))
+        writer.close()
+
+    return results
+
+
+def run_coi_demonstration(log_dir: str = "sim/case/thesis_simplified/runs", seed: int = 42) -> Dict:
+    """Main COI demonstration: print the theoretical curve, sweep the baseline policies, save coi_demo_results.json."""
+    print("=== COI Leakage Demonstration ===\n")
+
+    Path(log_dir).mkdir(parents=True, exist_ok=True)
+    writer = SummaryWriter(Path(log_dir) / "coi_demo") if HAS_TB else None
+
+    # theoretical erosion curve
+    print("1. Theoretical COI erosion (Theorem 1)")
+    alphas = np.linspace(0.0, 0.6, 13)
+    theo_erosion = theoretical_coi_erosion_curve(alphas, n_sessions=1000)
+
+    for a, e in zip(alphas, theo_erosion):
+        print(f"   alpha={a:.2f} -> erosion={e:.3f}")
+        if writer:
+            writer.add_scalar('theory/coi_erosion', e, int(round(a * 100)))  # round, not truncate, the float product
+
+    # policy comparison
+    print("\n2. Policy comparison across contamination levels")
+    policies = {
+        'fixed': lambda obs, n: PolicyRegistry.fixed(obs, n),
+        'aggressive': PolicyRegistry.aggressive,
+        'defensive': PolicyRegistry.defensive,
+        'adaptive': PolicyRegistry.adaptive,
+        'alpha_proportional': PolicyRegistry.alpha_proportional,
+    }
+
+    sweep_alphas = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
+    results = run_contamination_sweep(
+        sweep_alphas, policies, n_products=10, max_steps=100,
+        n_episodes=5, seed=seed, log_dir=log_dir)
+
+    # summarize
+    print("\n3. Summary by policy")
+    for name, res_list in results.items():
+        avg_reward = np.mean([r.reward_mean for r in res_list])
+        avg_coi = np.mean([r.coi_erosion for r in res_list])
+        print(f"   {name:20s}: avg_reward={avg_reward:.2f}, avg_coi={avg_coi:.3f}")
+
+    # save results
+    output = {
+        'theoretical': {'alphas': alphas.tolist(), 'erosion': theo_erosion.tolist()},
+        'empirical': {name: [r.to_dict() for r in res_list] for name, res_list in results.items()}}
+
+    with open(Path(log_dir) / "coi_demo_results.json", 'w') as f:
+        json.dump(output, f, indent=2)
+
+    if writer:
+        writer.close()
+
+    print(f"\nResults saved to {log_dir}/coi_demo_results.json")
+    print(f"TensorBoard: tensorboard --logdir {log_dir}")
+
+    return output
+
+
+def run_reward_mode_comparison(log_dir: str = "sim/case/thesis_simplified/runs", seed: int = 42) -> Dict:
+    """Evaluate the adaptive policy under each reward mode at alpha=0.3 and write reward_mode_results.json."""
+    print("=== Reward Mode Comparison ===\n")
+
+    Path(log_dir).mkdir(parents=True, exist_ok=True)
+    writer = SummaryWriter(Path(log_dir) / "reward_modes") if HAS_TB else None
+
+    reward_modes = ["revenue", "profit", "robust", "coi_aware"]
+    alpha = 0.3  # moderate contamination
+
+    results = {}
+    for mode in reward_modes:
+        print(f"  mode={mode}", end=" ")
+        env_cfg = EnvConfig(
+            n_products=10, max_steps=200, alpha_true=alpha,
+            reward_mode=mode, seed=seed)
+        env = make_env(env_cfg)  # fresh environment per mode, same seed for every mode
+
+        rewards, coi_vals, _, revenues = run_policy_episode(
+            env, PolicyRegistry.adaptive, n_episodes=10)
+
+        results[mode] = {
+            'reward_mean': float(np.mean(rewards)),
+            'reward_std': float(np.std(rewards)),
+            'coi_erosion': float(np.mean(coi_vals)) if coi_vals else 0.0,
+            'revenue': float(np.mean(revenues)) if revenues else 0.0}
+
+        if writer:
+            for k, v in results[mode].items():
+                writer.add_scalar(f'{mode}/{k}', v, 0)  # a single point per metric, always logged at step 0
+
+        print(f"reward={results[mode]['reward_mean']:.2f}, coi={results[mode]['coi_erosion']:.3f}")
+
+    if writer:
+        writer.close()
+
+    with open(Path(log_dir) / "reward_mode_results.json", 'w') as f:
+        json.dump(results, f, indent=2)
+
+    return results
+
+
+def run_alpha_drift_experiment(log_dir: str = "sim/case/thesis_simplified/runs", seed: int = 42) -> Dict:
+    """Evaluate the adaptive policy at several alpha_drift rates; results are returned (and logged), not saved to JSON."""
+    print("=== Alpha Drift Experiment ===\n")
+
+    Path(log_dir).mkdir(parents=True, exist_ok=True)
+    writer = SummaryWriter(Path(log_dir) / "alpha_drift") if HAS_TB else None
+
+    drift_rates = [0.0, 0.01, 0.02, 0.05]
+    results = {}
+
+    for drift in drift_rates:
+        print(f"  drift={drift:.2f}", end=" ")
+        env_cfg = EnvConfig(
+            n_products=10, max_steps=200, alpha_true=0.2,
+            alpha_drift=drift, reward_mode="robust", seed=seed)
+        env = make_env(env_cfg)  # fresh environment per drift rate, same seed for every rate
+
+        rewards, coi_vals, alpha_errs, _ = run_policy_episode(
+            env, PolicyRegistry.adaptive, n_episodes=10)
+
+        results[f'drift_{drift}'] = {
+            'reward_mean': float(np.mean(rewards)),
+            'coi_erosion': float(np.mean(coi_vals)) if coi_vals else 0.0,
+            'alpha_tracking_error': float(np.mean(alpha_errs)) if alpha_errs else 0.0}
+
+        if writer:
+            for k, v in results[f'drift_{drift}'].items():
+                writer.add_scalar(f'drift_{drift}/{k}', v, 0)  # a single point per metric, always logged at step 0
+
+        print(f"reward={results[f'drift_{drift}']['reward_mean']:.2f}, "
+              f"alpha_err={results[f'drift_{drift}']['alpha_tracking_error']:.3f}")
+
+    if writer:
+        writer.close()
+
+    return results
+
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description="Run COI experiments")
+    parser.add_argument("--exp", type=str, default="coi", choices=["coi", "reward", "drift", "all"])
+    parser.add_argument("--log-dir", type=str, default="sim/case/thesis_simplified/runs")
+    parser.add_argument("--seed", type=int, default=42)
+    args = parser.parse_args()
+    # "all" selects every experiment; they always run in the order coi, reward, drift
+    if args.exp in ("coi", "all"):
+        run_coi_demonstration(args.log_dir, args.seed)
+
+    if args.exp in ("reward", "all"):
+        run_reward_mode_comparison(args.log_dir, args.seed)
+
+    if args.exp in ("drift", "all"):
+        run_alpha_drift_experiment(args.log_dir, args.seed)
--- a/sim/case/thesis_simplified/separability.py
+++ b/sim/case/thesis_simplified/separability.py
@@ -0,0 +1,72 @@
+"""Behavioral separability for human/agent detection.
+
+Computes divergence signals delta_H, delta_A from session trajectories using
+transition kernel estimation and KL divergence to prototype behavioral profiles.
+"""
+from __future__ import annotations
+from typing import Dict, List, Tuple, TYPE_CHECKING
+import numpy as np
+
+if TYPE_CHECKING:
+    from .simplified import Event, Session
+
+
+# prototype behavioral kernels for human vs agent sessions: state -> {next state: probability}, each row sums to 1
+TRANS_H = {
+    "start": {"view": 0.85, "end": 0.15},
+    "view": {"detail": 0.4, "cart": 0.3, "view": 0.2, "end": 0.1},
+    "detail": {"cart": 0.5, "view": 0.3, "end": 0.2},
+    "cart": {"purchase": 0.6, "view": 0.25, "end": 0.15},
+    "purchase": {"end": 1.0},
+}
+
+TRANS_A = {
+    "start": {"view": 0.95, "end": 0.05},
+    "view": {"detail": 0.6, "view": 0.25, "cart": 0.1, "end": 0.05},
+    "detail": {"view": 0.5, "cart": 0.15, "detail": 0.3, "end": 0.05},
+    "cart": {"view": 0.4, "purchase": 0.2, "end": 0.4},
+    "purchase": {"end": 1.0},
+}
+
+
+def kl_div(p: Dict[str, float], q: Dict[str, float], eps: float = 1e-10) -> float:
+    """Eps-smoothed KL divergence D_KL(p || q) over the union of both supports; a missing key counts as eps."""
+    terms = ((p.get(k, eps), q.get(k, eps)) for k in set(p) | set(q))
+    return sum(pk * np.log((pk + eps) / (qk + eps)) for pk, qk in terms)
+
+
+def build_kernel(events: List["Event"]) -> Dict[str, Dict[str, float]]:
+    """Estimate the empirical transition kernel T' of a trajectory, entering from the synthetic "start" state."""
+    actions = [e.action for e in events]
+    counts: Dict[str, Dict[str, int]] = {}
+    for src, dst in zip(["start"] + actions[:-1], actions):  # consecutive (previous, current) action pairs
+        row = counts.setdefault(src, {})
+        row[dst] = row.get(dst, 0) + 1
+    # turn each row of transition counts into probabilities
+    totals = {src: sum(row.values()) for src, row in counts.items()}
+    return {src: {dst: c / totals[src] for dst, c in row.items()} for src, row in counts.items() if totals[src] > 0}
+
+
+def compute_divergence(session: "Session") -> Tuple[float, float]:
+    """Return (delta_H, delta_A): the session kernel's mean per-state KL distance to each prototype.
+
+    delta_X = mean over observed source states s of KL(T'[s] || T_X[s]); a session with no events yields (0.5, 0.5).
+    """
+    kernel = build_kernel(session.events)
+    if not kernel:
+        return 0.5, 0.5
+    n_states = len(kernel)
+    delta_h = sum(kl_div(row, TRANS_H.get(s, {})) for s, row in kernel.items()) / n_states
+    delta_a = sum(kl_div(row, TRANS_A.get(s, {})) for s, row in kernel.items()) / n_states
+    return delta_h, delta_a
+
+
+def estimate_alpha(session: "Session", beta: float = 2.0) -> float:
+    """Per-session contamination estimate alpha_hat = sigma(beta*(delta_H - delta_A)).
+
+    For beta > 0, values above 0.5 mean the session is farther from the human prototype; 0.5 when delta_H + delta_A <= 0.
+    """
+    delta_h, delta_a = compute_divergence(session)
+    if (delta_h + delta_a) <= 0:
+        return 0.5
+    return 1.0 / (1.0 + np.exp(-beta * (delta_h - delta_a)))
--- a/sim/case/thesis_simplified/simplified.py
+++ b/sim/case/thesis_simplified/simplified.py
@@ -0,0 +1,219 @@
+"""Minimal implementation of thesis pricing system.
+
+Implements the core loop: prices -> sessions -> demand -> prices
+with behavioral separability and robust pricing objective.
+
+Objects:
+- Session trajectories tau_s from mixture of H/A behavioral profiles
+- Demand proxy q_hat via weighted action aggregation
+- COI leakage penalty for agent reconnaissance
+- Limbo: alternating price/demand history for trajectory analysis
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Dict, List, Tuple
+import numpy as np
+
+from .coi import COIWindow, compute_coi_window
+from .separability import TRANS_H, TRANS_A, kl_div, build_kernel, compute_divergence, estimate_alpha
+
+ACTION_WEIGHTS = {"add_to_cart": 0.8, "checkout": 0.9, "purchase": 1.0, "view": 0.15, "detail": 0.25, "hover": 0.3, "start": 0.05, "end": 0.0}  # NOTE(review): TRANS_H/TRANS_A emit "cart", not "add_to_cart", so cart events get the 0.1 lookup default - confirm intended
+
+
+@dataclass
+class Event:
+    action: str  # kernel state that produced the event, e.g. "view" or "purchase"
+    product_idx: int  # index of the product in the prices/costs arrays
+    price_seen: float  # price in effect when the action happened
+    ts: float  # session clock value at the time of the event
+
+
+@dataclass
+class Session:
+    sid: str
+    events: List[Event]
+    actor: str  # H or A (ground truth label)
+    theta: Dict[str, float] = field(default_factory=dict)  # behavioural parameters used when sampling (price_sens, base_conv)
+
+
+def compute_demand(session: Session) -> float:
+    """Demand proxy q_hat: total ACTION_WEIGHTS weight of the session's actions (0.1 for unlisted actions)."""
+    return sum(map(lambda e: ACTION_WEIGHTS.get(e.action, 0.1), session.events))
+
+
+def sample_trajectory(rng: np.random.Generator, trans: Dict, prices: np.ndarray, costs: np.ndarray, theta: Dict[str, float],
+                      is_agent: bool, session_noise: float = 0.02, surge: float = 0.08, max_mult: float = 1.8) -> Tuple[List[Event], int]:
+    """Sample one session's events for a single random product by walking the kernel; returns (events, product index)."""
+    pidx = int(rng.integers(0, len(prices)))
+    cost, base = float(costs[pidx]), float(prices[pidx]) * (1.0 + rng.normal(0.0, session_noise))
+    base = float(np.clip(base, cost * 1.01, float(prices[pidx]) * 2.0))  # noisy session price, kept within [1.01x cost, 2x posted price]
+    price, signal, state, t = base, 0.0, "start", 0.0
+    events = []
+
+    while state != "end" and len(events) < 30:  # stop at "end" or once 30 events were recorded
+        probs = trans.get(state, {"end": 1.0})  # a state missing from the kernel ends the session
+        nxt = rng.choice(list(probs.keys()), p=list(probs.values()))
+        if nxt == "purchase":  # purchase conversion check: buy with probability p_buy, otherwise the session ends
+            rel = max((price - cost) / (cost + 1e-6), 0.0)  # relative markup over cost, floored at 0
+            p_buy = float(np.clip(theta.get("base_conv", 0.2) * np.exp(-theta.get("price_sens", 2.0) * rel), 0.0, 1.0))
+            if rng.random() > p_buy:
+                nxt = "end"
+        state = nxt
+        if state not in {"start", "end"}:
+            events.append(Event(action=state, product_idx=pidx, price_seen=float(price), ts=t))
+            signal += float(ACTION_WEIGHTS.get(state, 0.1))
+            price = float(np.clip(base * (1.0 + surge * signal), cost * 1.01, base * max_mult))  # price rises with accumulated signal, capped at max_mult x base
+        t += max(0.2, rng.gamma(1.5, 0.8) if is_agent else rng.gamma(2.0, 1.2))  # clock advances by at least 0.2; agents use a different gamma draw
+    return events, pidx
+
+
+def put_prices_to_market(prices: np.ndarray, costs: np.ndarray, alpha: float = 0.2, n_sessions: int = 50,
+                         seed: int | None = None) -> Tuple[List[Session], Dict[str, float]]:
+    """Sample n_sessions sessions, each an agent with probability alpha; returns (sessions, demand mapping sid -> q_hat)."""
+    rng = np.random.default_rng(seed)
+    sessions, demand = [], {}
+    for i in range(n_sessions):
+        sid = f"s{i:04d}"  # ids restart at s0000 on every call
+        is_agent = rng.random() < alpha
+        trans = TRANS_A if is_agent else TRANS_H
+        theta = {"price_sens": rng.uniform(0.05, 0.2), "base_conv": 0.01} if is_agent else \
+                {"price_sens": rng.uniform(1.5, 4.0), "base_conv": rng.uniform(0.2, 0.5)}
+        events, _ = sample_trajectory(rng, trans, prices, costs=costs, theta=theta, is_agent=is_agent)
+        session = Session(sid=sid, events=events, actor="A" if is_agent else "H", theta=theta)
+        sessions.append(session)
+        demand[sid] = compute_demand(session)
+    return sessions, demand
+
+
+@dataclass
+class LimboUpdate:
+    utype: str  # "prices" or "demand"
+    data: np.ndarray | Dict[str, float]  # price vector for "prices", sid -> q_hat mapping for "demand"
+    t: int  # position of this update in the Limbo history (0-based counter)
+
+
+class Limbo:
+    """Append-only log of price and demand updates, each stamped with a running counter."""
+
+    def __init__(self):
+        self.history: List[LimboUpdate] = []
+        self._t = 0
+
+    def add_update(self, utype: str, data: np.ndarray | Dict[str, float]) -> Dict:
+        self.history.append(LimboUpdate(t=self._t, utype=utype, data=data))
+        self._t += 1
+        return {"action": "set_prices" if utype != "prices" else "observe_demand"}  # hint for the next move
+
+    def get_prices_history(self) -> List[np.ndarray]:
+        return [entry.data for entry in self.history if entry.utype == "prices"]
+
+    def get_demand_history(self) -> List[Dict[str, float]]:
+        return [entry.data for entry in self.history if entry.utype == "demand"]
+
+
+class System:
+    """Main pricing system implementing robust Stackelberg objective.
+
+    Loop: set prices p_t -> observe demand Q_hat(p_t) -> estimate contamination alpha from behavioral signals -> next prices.
+    """
+
+    def __init__(self, n_products: int = 10, costs: np.ndarray | None = None, lambda_coi: float = 0.5, seed: int | None = 42):
+        self.n = n_products
+        self.rng = np.random.default_rng(seed)
+        self.costs = costs if costs is not None else self.rng.uniform(10, 50, n_products)
+        self.refs = self.costs * (1 + self.rng.uniform(0.2, 0.5, n_products))  # reference prices: 20-50% above cost
+        self.lambda_coi = lambda_coi
+        self.limbo = Limbo()
+        self._alpha_est = 0.2
+        self._sessions: List[Session] = []  # every session observed so far
+        self._last_sessions: List[Session] = []  # sessions of the most recent observe_demand call only
+        self._last_coi: COIWindow | None = None
+
+    @property
+    def alpha(self) -> float:
+        return self._alpha_est
+
+    def _estimate_alpha_from_sessions(self) -> float:
+        if not self._sessions:
+            return self._alpha_est
+        return float(np.mean([estimate_alpha(s) for s in self._sessions[-50:]]))  # average over the 50 newest sessions
+
+    def _revenue_under_demand(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
+        agg = np.zeros(self.n)
+        newest = {s.sid: s for s in self._sessions}  # sids repeat every market round; the latest session per sid wins
+        for sid, q in demand.items():
+            sess = newest.get(sid)
+            if sess and sess.events:
+                agg[sess.events[0].product_idx] += q  # demand is credited to the product of the first event
+        return float(np.dot(prices, agg))
+
+    def _compute_coi_window(self, demand: Dict[str, float]) -> COIWindow:
+        if not self._last_sessions:
+            zeros = np.zeros(self.n, dtype=float)
+            return COIWindow(policy=0.0, agent=0.0, leak=0.0, survival_ratio=0.0,
+                             policy_by_product=zeros, agent_by_product=zeros, demand_weights=zeros)
+        return compute_coi_window(self._last_sessions, self.costs, demand_mapping=demand)
+
+    def _objective(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
+        """Robust objective: R(p,d) - lambda * COI_leak."""
+        profit = self._revenue_under_demand(prices, demand) - float(np.sum(self.costs))
+        self._last_coi = self._compute_coi_window(demand)
+        return profit - self.lambda_coi * self._last_coi.leak
+
+    def compute_prices(self, demand: Dict[str, float] | None = None) -> np.ndarray:
+        """Compute next prices via heuristic margin adjustment based on alpha estimate."""
+        self._alpha_est = self._estimate_alpha_from_sessions()
+        margin_scale = 1.0 - 0.5 * self._alpha_est  # defensive pricing under high contamination
+        margins = (self.refs - self.costs) * margin_scale
+        noise = self.rng.normal(0, 0.02, self.n) * self.costs
+        prices = np.clip(self.costs + margins + noise, self.costs * 1.02, self.refs * 1.3)
+        self.limbo.add_update("prices", prices)
+        return prices
+
+    def observe_demand(self, prices: np.ndarray, alpha_true: float = 0.2, n_sessions: int = 50) -> Dict[str, float]:
+        sessions, demand_map = put_prices_to_market(prices, costs=self.costs, alpha=alpha_true,
+                                                    n_sessions=n_sessions, seed=int(self.rng.integers(0, 10000)))
+        self._last_sessions = sessions
+        self._sessions.extend(sessions)
+        self.limbo.add_update("demand", demand_map)
+        return demand_map
+
+    def step(self, alpha_true: float = 0.2, n_sessions: int = 50) -> Tuple[np.ndarray, Dict[str, float], float, COIWindow]:
+        demand_hist = self.limbo.get_demand_history()
+        prices = self.compute_prices(demand_hist[-1] if demand_hist else None)
+        demand = self.observe_demand(prices, alpha_true, n_sessions)
+        reward = self._objective(prices, demand)
+        return prices, demand, reward, self._last_coi or self._compute_coi_window(demand)
+
+    def run(self, n_steps: int = 100, alpha_true: float = 0.2) -> Dict:
+        traj = {"prices": [], "demand": [], "rewards": [], "alpha_est": [], "alpha_true": alpha_true,
+                "coi_policy": [], "coi_agent": [], "coi_leak": [], "coi_survival": []}
+        for _ in range(n_steps):
+            p, d, r, coi = self.step(alpha_true)
+            traj["prices"].append(p); traj["demand"].append(d); traj["rewards"].append(r)
+            traj["alpha_est"].append(self._alpha_est)
+            traj["coi_policy"].append(coi.policy); traj["coi_agent"].append(coi.agent)
+            traj["coi_leak"].append(coi.leak); traj["coi_survival"].append(coi.survival_ratio)
+        return traj
+
+
+if __name__ == "__main__":
+    system = System(n_products=5, seed=42)  # named `system` so the stdlib module name `sys` is not shadowed
+    traj = system.run(n_steps=20, alpha_true=0.25)
+    print(f"avg reward: {np.mean(traj['rewards']):.2f}, final alpha_hat: {traj['alpha_est'][-1]:.3f}, "
+          f"COI_policy: {np.mean(traj['coi_policy']):.3f}, COI_agent: {np.mean(traj['coi_agent']):.3f}, leak: {np.mean(traj['coi_leak']):.3f}")
+
+    prices = np.array([20.0, 35.0, 50.0, 25.0, 40.0])
+    costs = np.array([15.0, 28.0, 40.0, 18.0, 30.0])
+    sessions, demand = put_prices_to_market(prices, costs=costs, alpha=0.3, n_sessions=20, seed=123)
+    print(f'sessions: {len(sessions)}, agents: {sum(1 for s in sessions if s.actor=="A")}')
+
+    for n in (1, 5, 10, 50, 100):
+        # closed form for the uniform order statistic model: erosion = 1 - 2/(N+1)
+        print(f'N={n:3d} agents -> COI erosion: {1.0 - 2.0/(n+1):.3f}')
+
+    events = [Event('view', 0, 20.0, 0.1), Event('detail', 0, 20.0, 0.5), Event('cart', 0, 20.0, 1.0), Event('purchase', 0, 20.0, 2.0)]
+    print(f'human-like session alpha_hat: {estimate_alpha(Session(sid="test", events=events, actor="H")):.3f}')
+
+    events_a = [Event('view', 0, 20.0, 0.1), Event('detail', 0, 20.0, 0.2), Event('view', 0, 20.0, 0.3), Event('detail', 0, 20.0, 0.4)]
+    print(f'agent-like session alpha_hat: {estimate_alpha(Session(sid="test2", events=events_a, actor="A")):.3f}')
--- a/sim/case/thesis_simplified/simplified_env.py
+++ b/sim/case/thesis_simplified/simplified_env.py
@@ -0,0 +1,249 @@
+"""Gymnasium-compatible RL environment for thesis pricing system.
+
+Wraps simplified.System with standard Gym interface for training pricing policies.
+Supports multiple reward modes and contamination scenarios.
+
+Action: price multipliers [0.5, 1.5] applied to reference prices
+Observation: [prices / refs, demand shares, alpha_est, alpha_true, margins over cost, t / max_steps]
+Reward: configurable objective (revenue, profit, robust, coi-aware)
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any, Dict, Tuple
+import numpy as np
+
+try:
+    import gymnasium as gym
+    from gymnasium import spaces
+    HAS_GYM = True
+except ImportError:
+    HAS_GYM = False
+
+from .simplified import System, Session, Event, Limbo, put_prices_to_market, compute_demand, estimate_alpha
+from .coi import COIWindow, compute_coi_window, coi_erosion
+
+
+@dataclass
+class EnvConfig:
+    """Configuration shared by PricingEnv and its sweep/adversarial variants."""
+    # Number of products; fixes the action length and the per-product observation slices.
+    n_products: int = 5
+    # Episode length: step() reports termination once this many steps were taken.
+    max_steps: int = 200
+    # Passed as n_sessions to System.observe_demand on every step.
+    sessions_per_step: int = 30
+    # Contamination level the environment starts each episode with.
+    alpha_true: float = 0.2
+    # Not referenced by the environments defined in this module.
+    alpha_drift: float = 0.0
+    # (low, high) clip range AdversarialEnv applies when it raises contamination.
+    alpha_bounds: Tuple[float, float] = (0.0, 0.6)
+    # Weight of the COI leak penalty in the "robust" and "coi_aware" rewards; also forwarded to System.
+    lambda_coi: float = 0.5
+    # Weight of the price-change (volatility) penalty in the "robust" and "coi_aware" rewards.
+    lambda_vol: float = 0.1
+    # Unknown values fall back to plain profit in PricingEnv._compute_reward.
+    reward_mode: str = "robust"  # revenue | profit | robust | coi_aware
+    # When True the reward is divided by the sum of the reference prices.
+    normalize_reward: bool = True
+    # Seed used by reset() whenever the caller does not pass one.
+    seed: int | None = 42
+
+
+def aggregate_purchases(sessions: list[Session], n_products: int, costs: np.ndarray) -> Tuple[np.ndarray, float, float]:
+    """Tally purchase events across sessions.
+
+    Only events whose ``action`` is ``"purchase"`` and whose ``product_idx``
+    lies in ``[0, n_products)`` are counted; everything else is ignored.
+
+    Returns:
+        (counts, revenue, cost): per-product purchase counts, the summed
+        ``price_seen`` of the counted events, and the summed unit cost of the
+        purchased products.
+    """
+    counts = np.zeros(n_products, dtype=float)
+    total_revenue = 0.0
+    total_cost = 0.0
+    bought = (ev for session in sessions for ev in session.events if ev.action == "purchase")
+    for ev in bought:
+        idx = ev.product_idx
+        if idx < 0 or idx >= n_products:
+            continue  # out-of-range product index: skip instead of wrapping around
+        counts[idx] += 1.0
+        total_revenue += float(ev.price_seen)
+        total_cost += float(costs[idx])
+    return counts, total_revenue, total_cost
+
+
+class PricingEnv(gym.Env if HAS_GYM else object):
+    """RL environment for dynamic pricing under agent contamination.
+
+    Platform sets prices p_t, market responds with mixture demand Q(p) = (1-alpha)*D_H + alpha*D_A.
+    Agent estimates contamination alpha_hat from behavioral signals.
+    Reward balances profit vs COI leakage.
+
+    Action: per-product multipliers, clipped to [0.5, 1.5], applied to the
+    system's reference prices.
+    Observation (length 3 * n_products + 3): prices / refs, demand shares,
+    [alpha_hat, alpha_true], margins over cost, and t / max_steps.
+
+    NOTE(review): the observation carries the true contamination level
+    (``self._alpha``) next to the estimate -- confirm the policy is meant to see it.
+    """
+    metadata = {"render_modes": ["human", "ansi"]}
+
+    def __init__(self, cfg: EnvConfig | None = None):
+        """Store the configuration and declare the action/observation spaces.
+
+        Raises:
+            ImportError: if gymnasium could not be imported.
+        """
+        if not HAS_GYM:
+            raise ImportError("gymnasium required")
+        self.cfg = cfg or EnvConfig()
+        self.n = self.cfg.n_products
+        self._sys: System | None = None  # built by reset()
+        self._t = 0
+        self._alpha = self.cfg.alpha_true
+        self._last_prices: np.ndarray | None = None
+        self._last_demand: Dict[str, float] | None = None
+        self._episode_rewards: list[float] = []
+        self._demand_agg = np.zeros(self.n)
+
+        self.action_space = spaces.Box(low=0.5, high=1.5, shape=(self.n,), dtype=np.float32)
+        obs_dim = self.n + self.n + 1 + 1 + self.n + 1  # prices + demand + alpha_hat + alpha + margins + t
+        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float32)
+
+    def _build_obs(self) -> np.ndarray:
+        """Assemble the float32 observation; all zeros before the first reset()."""
+        if self._sys is None:
+            return np.zeros(self.observation_space.shape[0], dtype=np.float32)
+        # Before the first step there are no posted prices yet, so fall back to the references.
+        prices = self._last_prices if self._last_prices is not None else self._sys.refs
+        return np.concatenate([
+            prices / (self._sys.refs + 1e-6),
+            self._demand_agg / (np.sum(self._demand_agg) + 1e-6),
+            [self._sys.alpha, self._alpha],
+            (prices - self._sys.costs) / (self._sys.costs + 1e-6),
+            [self._t / self.cfg.max_steps],
+        ]).astype(np.float32)
+
+    def _compute_reward(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
+        """Score one step according to ``cfg.reward_mode``.
+
+        Also refreshes ``self._demand_agg`` (per-product demand) as a side
+        effect, which feeds the next observation and ``render``.
+
+        Args:
+            prices: prices posted this step.
+            demand: mapping from session id to that session's demand quantity.
+
+        Returns:
+            The reward, divided by the sum of reference prices when
+            ``cfg.normalize_reward`` is set; 0.0 if the system is not built yet.
+        """
+        cfg, sys = self.cfg, self._sys
+        if sys is None:
+            return 0.0
+
+        # Index sessions by id once instead of rescanning the list for every
+        # demand entry. setdefault keeps the first session per id, which is
+        # what a linear first-match search would have returned.
+        # NOTE(review): this reads sys._sessions while the rest of the class
+        # reads sys._last_sessions -- confirm both attributes are intended.
+        by_sid: dict = {}
+        if demand:
+            for s in sys._sessions:
+                by_sid.setdefault(s.sid, s)
+
+        # aggregate demand per product (credited to the product of the session's first event)
+        agg = np.zeros(self.n)
+        for sid, q in demand.items():
+            sess = by_sid.get(sid)
+            if sess and sess.events:
+                agg[sess.events[0].product_idx] += q
+        self._demand_agg = agg
+
+        _, revenue, cost = aggregate_purchases(sys._last_sessions, self.n, sys.costs)
+        profit = revenue - cost
+
+        # Mean absolute price change relative to the reference prices; zero on the first step.
+        vol_penalty = 0.0
+        if self._last_prices is not None:
+            vol_penalty = cfg.lambda_vol * float(np.mean(np.abs(prices - self._last_prices) / (sys.refs + 1e-6)))
+
+        coi = compute_coi_window(sys._last_sessions, sys.costs, demand_mapping=demand)
+        leak = float(coi.leak)
+
+        reward_fns = {
+            "revenue": lambda: revenue,
+            "profit": lambda: profit,
+            "robust": lambda: profit - cfg.lambda_coi * leak - vol_penalty,
+            "coi_aware": lambda: profit - cfg.lambda_coi * (1 + 2 * sys.alpha) * leak - vol_penalty,
+        }
+        # Unknown reward modes fall back to plain profit.
+        r = reward_fns.get(cfg.reward_mode, lambda: profit)()
+        return float(r / (float(np.sum(sys.refs)) + 1e-6)) if cfg.normalize_reward else float(r)
+
+    def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
+        """Start a new episode with a freshly constructed System.
+
+        Args:
+            seed: seed for the new System; ``cfg.seed`` is used when omitted.
+            options: accepted for Gym API compatibility; not used here.
+
+        Returns:
+            (observation, info) where info holds the true and estimated alpha
+            plus copies of the system's cost and reference-price arrays.
+
+        NOTE(review): without an explicit seed every reset rebuilds the System
+        from ``cfg.seed``; whether that makes consecutive episodes repeat
+        depends on how System uses its seed -- confirm this is intended.
+        """
+        seed = seed if seed is not None else self.cfg.seed
+        self._sys = System(n_products=self.n, lambda_coi=self.cfg.lambda_coi, seed=seed)
+        self._t, self._alpha = 0, self.cfg.alpha_true
+        self._last_prices, self._last_demand = None, None
+        self._episode_rewards, self._demand_agg = [], np.zeros(self.n)
+        return self._build_obs(), {"alpha_true": self._alpha, "alpha_est": self._sys.alpha,
+                                   "costs": self._sys.costs.copy(), "refs": self._sys.refs.copy()}
+
+    def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]:
+        """Post prices derived from ``action`` and observe the market response.
+
+        Args:
+            action: per-product price multipliers; clipped to [0.5, 1.5].
+
+        Returns:
+            (observation, reward, terminated, truncated, info). ``terminated``
+            becomes True once ``cfg.max_steps`` steps were taken; ``truncated``
+            is always False.
+
+        Raises:
+            RuntimeError: if reset() has not been called yet.
+        """
+        if self._sys is None:
+            raise RuntimeError("call reset() first")
+
+        action = np.clip(action, 0.5, 1.5)
+        # Prices stay between 1% above cost and twice the reference price.
+        prices = np.clip(self._sys.refs * action.astype(np.float64), self._sys.costs * 1.01, self._sys.refs * 2.0)
+        demand = self._sys.observe_demand(prices, alpha_true=self._alpha, n_sessions=self.cfg.sessions_per_step)
+        self._sys.limbo.add_update("prices", prices)
+        self._sys._alpha_est = self._sys._estimate_alpha_from_sessions()
+
+        # The reward must be computed before _last_prices is overwritten:
+        # the volatility penalty compares against the previous step's prices.
+        reward = self._compute_reward(prices, demand)
+        self._episode_rewards.append(reward)
+        self._last_prices, self._last_demand = prices.copy(), demand
+        self._t += 1
+
+        # compute info metrics using shared helper
+        purchases, revenue, cost = aggregate_purchases(self._sys._last_sessions, self.n, self._sys.costs)
+        # Nominal agent count implied by alpha, not a count of observed agent sessions.
+        n_agents = int(self._alpha * self.cfg.sessions_per_step)
+        coi = compute_coi_window(self._sys._last_sessions, self._sys.costs, demand_mapping=demand)
+
+        info = {
+            "alpha_true": self._alpha, "alpha_est": self._sys.alpha,
+            "alpha_error": abs(self._alpha - self._sys.alpha),
+            "revenue": float(revenue), "profit": float(revenue - cost), "cost": float(cost),
+            "n_purchases": int(np.sum(purchases)),
+            "avg_margin": float(np.mean((prices - self._sys.costs) / self._sys.costs)),
+            "n_sessions": len(demand), "n_agents": n_agents, "price_std": float(np.std(prices)),
+            "coi_erosion": coi_erosion(coi.policy, coi.agent),
+            "coi_policy": float(coi.policy), "coi_agent": float(coi.agent),
+            "coi_leakage": float(coi.leak), "coi_survival": float(coi.survival_ratio),
+            "cumulative_reward": sum(self._episode_rewards), "step": self._t,
+        }
+        return self._build_obs(), reward, self._t >= self.cfg.max_steps, False, info
+
+    def render(self, mode: str = "human") -> str | None:
+        """Return a one-line status string, printing it when ``mode`` is "human".
+
+        Returns None before the first step of an episode.
+        """
+        if self._sys is None or self._last_prices is None:
+            return None
+        out = f"t={self._t}/{self.cfg.max_steps} | alpha_true={self._alpha:.3f} alpha_hat={self._sys.alpha:.3f} | " \
+              f"prices: {self._last_prices.round(1)} | demand: {self._demand_agg.round(2)} | " \
+              f"reward: {self._episode_rewards[-1] if self._episode_rewards else 0:.3f}"
+        if mode == "human":
+            print(out)
+        return out
+
+    def close(self) -> None:
+        """Nothing to release."""
+        pass
+
+
+class ContaminationSweepEnv(PricingEnv):
+    """PricingEnv whose contamination level is taken from a cyclic schedule.
+
+    Each reset writes the currently scheduled value into ``cfg.alpha_true``.
+    The schedule advances only when reset() is called with
+    ``options={"advance_schedule": True}`` and wraps around at the end.
+    """
+
+    def __init__(self, cfg: EnvConfig | None = None, alpha_schedule: list[float] | None = None):
+        super().__init__(cfg)
+        self._schedule_idx = 0
+        # A missing or empty schedule falls back to the default five levels.
+        self._schedule = alpha_schedule if alpha_schedule else [0.1, 0.2, 0.3, 0.4, 0.5]
+
+    def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
+        """Apply the scheduled alpha (optionally advancing first), then reset as usual."""
+        wants_advance = (options or {}).get("advance_schedule", False)
+        if wants_advance:
+            self._schedule_idx = (self._schedule_idx + 1) % len(self._schedule)
+        self.cfg.alpha_true = self._schedule[self._schedule_idx]
+        return super().reset(seed, options)
+
+
+class AdversarialEnv(PricingEnv):
+    """Environment with adversarial contamination dynamics.
+
+    Contamination increases when prices are predictable (agents exploit).
+    Predictability is ``1 / (s + 0.1)``, where ``s`` is the standard deviation
+    of each product's price over the last ``_WINDOW`` steps, averaged across
+    products. The increase only starts once more than ``_WINDOW`` price
+    vectors have been recorded in the current episode.
+    """
+
+    _WINDOW = 10  # number of most recent price vectors used to measure predictability
+
+    def __init__(self, cfg: EnvConfig | None = None, exploitation_rate: float = 0.02):
+        """
+        Args:
+            cfg: environment configuration; defaults apply when omitted.
+            exploitation_rate: scale of the per-step contamination increase.
+        """
+        super().__init__(cfg)
+        self._exploit_rate = exploitation_rate
+        self._price_history: list[np.ndarray] = []
+
+    def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]:
+        """Step the base environment, then raise contamination if prices were stable.
+
+        The returned observation is built before the contamination update, so
+        a raised alpha first shows up in the next step. ``info`` gains a
+        ``"predictability"`` entry (0.0 until the window is full).
+        """
+        obs, reward, term, trunc, info = super().step(action)
+        if self._last_prices is not None:
+            self._price_history.append(self._last_prices.copy())
+            # Only the newest _WINDOW + 1 entries are ever consulted; drop the
+            # rest so the history cannot grow without bound in long episodes.
+            del self._price_history[:-(self._WINDOW + 1)]
+        predictability = 0.0
+        if len(self._price_history) > self._WINDOW:
+            recent = np.asarray(self._price_history[-self._WINDOW:])  # rows: steps, columns: products
+            # Take the std along the time axis per product. Pooling every entry
+            # into one std would let price-level differences *between* products
+            # dominate, hiding whether prices actually moved over time.
+            temporal_std = float(np.mean(np.std(recent, axis=0)))
+            predictability = 1.0 / (temporal_std + 0.1)
+            bump = self._exploit_rate * predictability * self._sys.rng.random()
+            # float(...) keeps alpha a plain Python float rather than a NumPy scalar.
+            self._alpha = float(np.clip(self._alpha + bump, *self.cfg.alpha_bounds))
+        info["predictability"] = predictability
+        return obs, reward, term, trunc, info
+
+    def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
+        """Forget the recorded prices and reset the base environment."""
+        self._price_history = []
+        return super().reset(seed, options)
+
+
+def make_env(cfg: EnvConfig | None = None, env_type: str = "standard") -> PricingEnv:
+    """Build an environment by type name.
+
+    ``"sweep"`` gives a ContaminationSweepEnv, ``"adversarial"`` an
+    AdversarialEnv; any other value gives the plain PricingEnv.
+    """
+    if env_type == "sweep":
+        env_cls = ContaminationSweepEnv
+    elif env_type == "adversarial":
+        env_cls = AdversarialEnv
+    else:
+        env_cls = PricingEnv
+    return env_cls(cfg)
+
+
+# baseline policies
+def fixed_price_policy(refs, margin=0.0):
+    """Return the same multiplier ``1 + margin`` for every product.
+
+    Args:
+        refs: any sized collection with one entry per product; only its length is used.
+        margin: offset added to the neutral multiplier 1.0.
+    """
+    return np.ones(len(refs), dtype=np.float32) * (1.0 + margin)
+
+
+def random_policy(n, rng=None):
+    """Draw ``n`` float32 multipliers uniformly from [0.7, 1.3).
+
+    Args:
+        n: number of products.
+        rng: NumPy random generator to draw from; a fresh unseeded one is
+            created when omitted, so results are then not reproducible.
+    """
+    generator = rng if rng is not None else np.random.default_rng()
+    return generator.uniform(0.7, 1.3, n).astype(np.float32)
+
+
+def adaptive_policy(obs, n, base=0.1):
+    """Return a uniform markup that shrinks as the value at ``obs[2 * n]`` grows.
+
+    In PricingEnv observations index ``2 * n`` holds the estimated
+    contamination alpha_hat. The multiplier is
+    ``1 + base * (1 - 0.4 * obs[2 * n])`` for every product.
+    """
+    return np.ones(n, dtype=np.float32) * (1.0 + base * (1.0 - 0.4 * obs[2 * n]))
+
+if __name__ == "__main__":
+    # Demo rollout: drive a standard environment with the adaptive baseline
+    # and print a status line every tenth step.
+    cfg = EnvConfig(n_products=100, max_steps=100, alpha_true=0.25, reward_mode="robust")
+    env = make_env(cfg)
+    obs, info = env.reset()
+    print(f"initial: alpha={info['alpha_true']:.2f}")
+
+    total_reward, t, done = 0.0, 0, False
+    while t < cfg.max_steps and not done:
+        obs, reward, done, _, info = env.step(adaptive_policy(obs, cfg.n_products))
+        total_reward += reward
+        if t % 10 == 0:
+            env.render()
+        t += 1
+
+    print(f"\ntotal reward: {total_reward:.2f}, final alpha_hat: {info['alpha_est']:.3f}")
--- a/sim/case/thesis_simplified/summarize.py
+++ b/sim/case/thesis_simplified/summarize.py
@@ -0,0 +1,168 @@
+"""Summarize TensorBoard logs into comparison tables."""
+from __future__ import annotations
+import json
+import re
+from pathlib import Path
+from collections import defaultdict
+from dataclasses import dataclass
+import pandas as pd
+
+try:
+    from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
+    HAS_TB = True
+except ImportError:
+    HAS_TB = False
+
+
+@dataclass
+class RunInfo:
+    """Identity of one experiment run, as parsed from its directory name."""
+    # Algorithm or baseline label taken from the directory name (e.g. "ppo", "fixed").
+    algo: str
+    # Contamination level encoded after "_a" in the directory name.
+    alpha: float
+    # Reward mode suffix of the name; parse_run_name substitutes 'robust' when absent.
+    reward_mode: str
+    # Run directory; parse_run_name leaves an empty Path() that discover_runs overwrites.
+    path: Path
+
+
+def parse_run_name(name: str) -> RunInfo | None:
+    """Extract algo, alpha, reward_mode from run directory name.
+
+    Args:
+        name: directory name such as ``ppo_a0.20_robust`` or ``cmp_fixed_a0.20``.
+
+    Returns:
+        A RunInfo with an empty placeholder ``path``, or None when the name
+        does not follow the pattern or its alpha part is not a valid number.
+        A missing reward-mode suffix is reported as ``'robust'``.
+    """
+    # patterns: ppo_a0.20_robust, cmp_fixed_a0.20, sac_a0.90_robust
+    m = re.match(r'(cmp_)?(\w+)_a([\d.]+)_?(\w+)?', name)
+    if not m:
+        return None
+    _, algo, alpha_text, mode = m.groups()
+    try:
+        alpha = float(alpha_text)
+    except ValueError:
+        # "[\d.]+" also accepts text like "1.2.3" or "."; treat such a
+        # directory as "not a run" instead of aborting the whole scan.
+        return None
+    return RunInfo(algo=algo, alpha=alpha, reward_mode=mode or 'robust', path=Path())
+
+
+def load_tb_scalars(log_dir: Path, tags: list[str], reduce: str = 'last') -> dict[str, float]:
+    """Load scalar values from TensorBoard event files.
+
+    Args:
+        log_dir: directory containing the event files.
+        tags: scalar tags to read; tags that are absent or have no events are skipped.
+        reduce: how to collapse each series: 'last', 'mean', 'max' or 'min'.
+
+    Returns:
+        Mapping from tag to the reduced value. Empty when the tensorboard
+        package is not available.
+
+    Raises:
+        ValueError: if ``reduce`` is not one of the supported names.
+    """
+    if not HAS_TB:
+        return {}
+    reducers = {
+        'last': lambda vals: vals[-1],
+        'mean': lambda vals: sum(vals) / len(vals),
+        'max': max,
+        'min': min,
+    }
+    if reduce not in reducers:
+        # An unknown mode used to produce an empty result without any hint.
+        raise ValueError(f"unknown reduce mode {reduce!r}; expected one of {sorted(reducers)}")
+    collapse = reducers[reduce]
+
+    ea = EventAccumulator(str(log_dir))
+    ea.Reload()
+    # Fetch the list of available scalar tags once, not once per requested tag.
+    available = set(ea.Tags().get('scalars', []))
+    results = {}
+    for tag in tags:
+        if tag not in available:
+            continue
+        events = ea.Scalars(tag)
+        if not events:
+            continue
+        results[tag] = collapse([e.value for e in events])
+    return results
+
+
+def load_json_results(log_dir: Path) -> dict[str, float]:
+    """Return the parsed contents of ``log_dir/results.json``, or {} if the file is absent."""
+    results_file = log_dir / 'results.json'
+    if not results_file.exists():
+        return {}
+    with open(results_file) as fh:
+        parsed = json.load(fh)
+    return parsed
+
+
+def discover_runs(base_dir: Path) -> list[RunInfo]:
+    """Find all experiment runs in base directory.
+
+    Only immediate subdirectories whose names parse_run_name accepts are
+    returned, each with ``path`` set to that subdirectory. Results are in
+    sorted path order. A base directory that does not exist yields an empty
+    list, so callers can report "no runs" instead of crashing.
+    """
+    if not base_dir.is_dir():
+        return []
+    runs = []
+    # sorted(): iterdir() order is arbitrary; sorting keeps output reproducible.
+    for d in sorted(base_dir.iterdir()):
+        if not d.is_dir():
+            continue
+        info = parse_run_name(d.name)
+        if info:
+            info.path = d
+            runs.append(info)
+    return runs
+
+
+def build_tables(runs: list[RunInfo], metrics: list[str], reduce: str = 'last') -> dict[str, dict[str, pd.DataFrame]]:
+    """Build pivot tables: reward_mode -> metric -> DataFrame[alpha x algo].
+
+    For every run and metric the value is taken from the run's results.json
+    when it has a matching ``*_mean`` entry, and from the TensorBoard scalars
+    otherwise. Metrics with no value in any run get no table.
+
+    Args:
+        runs: runs to summarise (``path`` must point at the run directory).
+        metrics: short metric names, e.g. 'revenue', 'margin', 'erosion'.
+        reduce: reduction passed to load_tb_scalars for the TensorBoard fallback.
+    """
+    def _tb_tag(metric: str) -> str:
+        # TensorBoard groups scalars under economics/, coi/ and alpha/ prefixes.
+        if metric in ('revenue', 'profit', 'margin'):
+            return f'economics/{metric}'
+        if metric in ('erosion', 'leakage'):
+            return f'coi/{metric}'
+        return f'alpha/{metric}'
+
+    # results.json stores some metrics under longer names than the short
+    # names used here (see the keys produced by evaluate_policy in train.py);
+    # without this mapping those metrics could never be read from the JSON.
+    json_alias = {
+        'margin': 'avg_margin',
+        'erosion': 'coi_erosion',
+        'leakage': 'coi_leakage',
+        'estimation_error': 'alpha_error',
+    }
+
+    tb_tags = [_tb_tag(m) for m in metrics]
+    tag_map = dict(zip(tb_tags, metrics))
+
+    # collect data: {reward_mode: {metric: {(alpha, algo): value}}}
+    data = defaultdict(lambda: defaultdict(dict))
+    for run in runs:
+        # try json first (final eval metrics)
+        jm = load_json_results(run.path)
+        tb = load_tb_scalars(run.path, tb_tags, reduce)
+
+        for tag, metric in tag_map.items():
+            # The literal name is tried first so metrics already given in
+            # their results.json spelling keep working.
+            json_keys = (f'{metric}_mean', f'{json_alias.get(metric, metric)}_mean')
+            val = next((jm[k] for k in json_keys if k in jm), tb.get(tag))
+            if val is not None:
+                data[run.reward_mode][metric][(run.alpha, run.algo)] = val
+
+    # convert to DataFrames
+    tables = {}
+    for mode, metrics_data in data.items():
+        tables[mode] = {}
+        for metric, vals in metrics_data.items():
+            if not vals:
+                continue
+            alphas = sorted(set(a for a, _ in vals.keys()))
+            algos = sorted(set(al for _, al in vals.keys()))
+            # Cells without a run stay NaN.
+            df = pd.DataFrame(index=alphas, columns=algos, dtype=float)
+            for (a, al), v in vals.items():
+                df.loc[a, al] = v
+            df.index.name = 'alpha'
+            tables[mode][metric] = df
+    return tables
+
+
+def format_table(df: pd.DataFrame, fmt: str = '.3f') -> str:
+    """Render ``df`` as a markdown table, formatting floats with ``fmt``."""
+    rendered = df.to_markdown(floatfmt=fmt)
+    return rendered
+
+
+def summarize(base_dir: str = 'sim/case/thesis_simplified/runs',
+              metrics: list[str] | None = None,
+              reduce: str = 'last',
+              output: str | None = None) -> dict:
+    """Print (and optionally save) markdown comparison tables for all runs.
+
+    Args:
+        base_dir: directory whose subdirectories are the experiment runs.
+        metrics: metric names to tabulate; a default set is used when omitted.
+        reduce: reduction applied to TensorBoard scalar series.
+        output: if given, the markdown report is also written to this path.
+
+    Returns:
+        The tables built by build_tables, or {} when no runs were found.
+    """
+    base = Path(base_dir)
+    if not metrics:
+        metrics = ['revenue', 'profit', 'margin', 'erosion', 'leakage']
+
+    runs = discover_runs(base)
+    if not runs:
+        print(f"No runs found in {base}")
+        return {}
+    print(f"Found {len(runs)} runs")
+
+    tables = build_tables(runs, metrics, reduce)
+
+    # One top-level section per reward mode, one sub-section per metric.
+    lines = []
+    for mode, metric_tables in sorted(tables.items()):
+        lines.append(f"\n# Reward Mode: {mode}\n")
+        for metric, df in sorted(metric_tables.items()):
+            lines.extend([f"\n## {metric}\n", format_table(df), ""])
+
+    report = '\n'.join(lines)
+    print(report)
+
+    if output:
+        Path(output).write_text(report)
+        print(f"\nSaved to {output}")
+
+    return tables
+
+
+if __name__ == '__main__':
+    # Command-line entry point: parse the options and forward them to summarize().
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--dir', default='sim/case/thesis_simplified/runs')
+    parser.add_argument('--metrics', nargs='+', default=['revenue', 'profit', 'margin', 'erosion', 'leakage'])
+    parser.add_argument('--reduce', default='last', choices=['last', 'mean', 'max', 'min'])
+    parser.add_argument('--output', '-o', help='save markdown to file')
+    cli = parser.parse_args()
+    summarize(base_dir=cli.dir, metrics=cli.metrics, reduce=cli.reduce, output=cli.output)
--- a/sim/case/thesis_simplified/train.py
+++ b/sim/case/thesis_simplified/train.py
@@ -0,0 +1,336 @@
+"""RL training for thesis pricing system with thesis-aligned metrics.
+
+Trains pricing policies using stable-baselines3 with TensorBoard logging.
+Tracks COI erosion, alpha estimation error, and economic KPIs per thesis formulation.
+"""
+from __future__ import annotations
+import argparse
+import json
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from dataclasses import dataclass, asdict, field
+from pathlib import Path
+from typing import Dict, List, Callable, Any
+import numpy as np
+
+try:
+    from stable_baselines3 import PPO, SAC, A2C
+    from stable_baselines3.common.callbacks import BaseCallback, EvalCallback
+    from stable_baselines3.common.vec_env import DummyVecEnv
+    from stable_baselines3.common.monitor import Monitor
+    HAS_SB3 = True
+except ImportError:
+    HAS_SB3 = False
+
+try:
+    from torch.utils.tensorboard import SummaryWriter
+    HAS_TB = True
+except ImportError:
+    HAS_TB = False
+
+from .simplified_env import PricingEnv, EnvConfig, make_env, adaptive_policy, fixed_price_policy, random_policy
+
+
+@dataclass
+class EpisodeMetrics:
+    """Running sums of per-step info values over one episode."""
+    reward: float = 0.0
+    revenue: float = 0.0
+    profit: float = 0.0
+    coi_erosion: float = 0.0
+    coi_leakage: float = 0.0
+    alpha_error: float = 0.0
+    avg_margin: float = 0.0
+    n_agents: int = 0
+    steps: int = 0
+
+    def accumulate(self, info: Dict[str, Any]) -> None:
+        """Add one step's ``info`` values to the sums; missing keys count as 0."""
+        self.steps += 1
+        for attr in ('reward', 'revenue', 'profit', 'coi_erosion', 'coi_leakage', 'avg_margin', 'n_agents'):
+            setattr(self, attr, getattr(self, attr) + info.get(attr, 0))
+        # The estimation error is derived from the two alpha entries rather than read directly.
+        gap = info.get('alpha_true', 0) - info.get('alpha_est', 0)
+        self.alpha_error += abs(gap)
+
+    def normalized(self) -> Dict[str, float]:
+        """Return per-step averages of every sum except ``reward``.
+
+        The divisor is at least 1, so an empty episode yields zeros.
+        """
+        divisor = max(self.steps, 1)
+        averaged = ('revenue', 'profit', 'coi_erosion', 'coi_leakage', 'alpha_error', 'avg_margin', 'n_agents')
+        return {name: getattr(self, name) / divisor for name in averaged}
+
+
+@dataclass
+class ExperimentConfig:
+    """Settings for one training or baseline run.
+
+    When ``experiment_name`` is not given it is derived from the algorithm,
+    the contamination level (two decimals) and the reward mode.
+    """
+    algo: str = "ppo"
+    total_timesteps: int = 100_000
+    n_envs: int = 4
+    eval_freq: int = 5000
+    n_eval_episodes: int = 10
+    log_dir: str = "sim/case/thesis_simplified/runs"
+    seed: int = 42
+    n_products: int = 10
+    max_steps: int = 200
+    alpha_true: float = 0.2
+    reward_mode: str = "robust"
+    experiment_name: str | None = None
+
+    def __post_init__(self):
+        # An explicitly supplied name always wins.
+        if self.experiment_name is not None:
+            return
+        self.experiment_name = f"{self.algo}_a{self.alpha_true:.2f}_{self.reward_mode}"
+
+
+class Policy:
+    """Wraps a plain function so baselines expose the same ``predict`` call as trained models."""
+
+    def __init__(self, policy_fn: Callable[[np.ndarray, int], np.ndarray], name: str):
+        self.name = name
+        self._fn = policy_fn
+
+    def predict(self, obs: np.ndarray, deterministic: bool = True) -> tuple[np.ndarray, None]:
+        """Return ``(action, None)`` for a single observation.
+
+        The product count is recovered from the observation length, which is
+        3 * n + 3 for PricingEnv observations. ``deterministic`` is accepted
+        for interface compatibility and ignored.
+        """
+        n_products = (len(obs) - 3) // 3
+        action = self._fn(obs, n_products)
+        return action, None
+
+    @staticmethod
+    def fixed(margin: float = 0.15) -> "Policy":
+        """Constant multiplier ``1 + margin`` for every product."""
+        def _fixed(obs: np.ndarray, n: int) -> np.ndarray:
+            return fixed_price_policy(np.ones(n), margin)
+        return Policy(_fixed, f"fixed_{margin:.2f}")
+
+    @staticmethod
+    def adaptive(base_margin: float = 0.15) -> "Policy":
+        """Markup computed by ``adaptive_policy`` from the observation."""
+        def _adaptive(obs: np.ndarray, n: int) -> np.ndarray:
+            return adaptive_policy(obs, n, base_margin)
+        return Policy(_adaptive, f"adaptive_{base_margin:.2f}")
+
+    @staticmethod
+    def random() -> "Policy":
+        """Multipliers drawn by ``random_policy`` on every call."""
+        def _random(obs: np.ndarray, n: int) -> np.ndarray:
+            return random_policy(n)
+        return Policy(_random, "random")
+
+    @staticmethod
+    def myopic(greed: float = 0.3) -> "Policy":
+        """Uniform markup that grows with the mean of the demand slice, clipped to [0.5, 1.5]."""
+        def _fn(obs: np.ndarray, n: int) -> np.ndarray:
+            if len(obs) > 2*n:
+                demand_norm = obs[n:2*n]
+            else:
+                # Observation too short to hold a demand slice: use a neutral level.
+                demand_norm = np.ones(n) * 0.5
+            return np.ones(n, dtype=np.float32) * np.clip(1.0 + greed * (1 + np.mean(demand_norm)), 0.5, 1.5)
+        return Policy(_fn, f"myopic_{greed:.1f}")
+
+
+def log_metrics(writer: SummaryWriter | None, metrics: Dict[str, float], prefix: str, step: int) -> None:
+    """Write every metric as scalar ``prefix/<name>`` at ``step``; no-op without a writer."""
+    if writer is not None:
+        for name, value in metrics.items():
+            writer.add_scalar(f'{prefix}/{name}', value, step)
+
+
+class MetricsCallback(BaseCallback if HAS_SB3 else object):
+    """Training callback that mirrors per-step env ``info`` values to TensorBoard.
+
+    The base class falls back to ``object`` when stable-baselines3 is not
+    installed, so this module can still be imported and ``train`` can raise
+    its explanatory ImportError. The callback itself is only usable when
+    stable-baselines3 is present.
+    """
+
+    def __init__(self, writer: SummaryWriter | None, verbose: int = 0):
+        """
+        Args:
+            writer: TensorBoard writer, or None to disable logging.
+            verbose: verbosity level forwarded to the base callback.
+        """
+        super().__init__(verbose)
+        self._writer = writer
+
+    def _on_step(self) -> bool:
+        """Log the info dict of every environment; always lets training continue."""
+        if self._writer is None:
+            return True
+        t = self.num_timesteps
+        for info in self.locals.get('infos', []):
+            scalars = (
+                ('economics/revenue', info.get('revenue', 0)),
+                ('economics/profit', info.get('profit', 0)),
+                ('economics/margin', info.get('avg_margin', 0)),
+                ('coi/erosion', info.get('coi_erosion', 0)),
+                ('coi/leakage', info.get('coi_leakage', 0)),
+                ('alpha/estimation_error', abs(info.get('alpha_true', 0) - info.get('alpha_est', 0))),
+                ('agents/count', info.get('n_agents', 0)),
+            )
+            for tag, value in scalars:
+                self._writer.add_scalar(tag, value, t)
+        return True
+
+
+def make_vec_env(cfg: ExperimentConfig, n_envs: int = 1) -> DummyVecEnv:
+    """Create ``n_envs`` monitored pricing environments inside a DummyVecEnv.
+
+    Every environment is built from the same settings, including the same seed.
+    """
+    env_kwargs = dict(n_products=cfg.n_products, max_steps=cfg.max_steps,
+                      alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed)
+
+    def _factory():
+        # A fresh EnvConfig per environment, so no config object is shared.
+        return Monitor(make_env(EnvConfig(**env_kwargs)))
+
+    return DummyVecEnv([_factory] * n_envs)
+
+
+def run_episodes(policy: Policy | Any, env: PricingEnv, n_episodes: int) -> List[EpisodeMetrics]:
+    """Roll out ``policy`` deterministically for ``n_episodes`` and return one EpisodeMetrics each."""
+    collected = []
+    for _ in range(n_episodes):
+        obs, _ = env.reset()
+        episode = EpisodeMetrics()
+        while True:
+            action, _ = policy.predict(obs, deterministic=True)
+            obs, reward, term, trunc, info = env.step(action)
+            episode.accumulate(info)
+            # The step reward is returned separately from info, so add it here.
+            episode.reward += reward
+            if term or trunc:
+                break
+        collected.append(episode)
+    return collected
+
+
+def evaluate_policy(policy: Policy | Any, cfg: ExperimentConfig, n_episodes: int = 20) -> Dict[str, float]:
+    """Evaluate ``policy`` on a fresh environment seeded with ``cfg.seed + 999``.
+
+    Returns:
+        Mean and std of the episode reward, plus ``<name>_mean`` for the
+        per-step averages of revenue, profit, COI erosion/leakage, alpha error
+        and margin, each averaged over the episodes.
+    """
+    env_cfg = EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
+                        alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed + 999)
+    episodes = run_episodes(policy, make_env(env_cfg), n_episodes)
+
+    rewards = [ep.reward for ep in episodes]
+    per_step = [ep.normalized() for ep in episodes]
+    summary = {'reward_mean': np.mean(rewards), 'reward_std': np.std(rewards)}
+    for k in ['revenue', 'profit', 'coi_erosion', 'coi_leakage', 'alpha_error', 'avg_margin']:
+        summary[f'{k}_mean'] = np.mean([row[k] for row in per_step])
+    return summary
+
+
+def run_baseline(policy: Policy, vec_env: DummyVecEnv, total_steps: int, writer: SummaryWriter | None):
+    """Run a baseline policy on a vectorised env and log the same scalars as training.
+
+    The step counter advances by the number of environments per iteration.
+    Per-environment episode rewards are logged as ``rollout/ep_reward`` when
+    an episode ends.
+    """
+    obs = vec_env.reset()
+    n_envs = vec_env.num_envs
+    ep_rewards = np.zeros(n_envs)
+
+    for step in range(0, total_steps, n_envs):
+        actions = np.array([policy.predict(obs[i])[0] for i in range(n_envs)])
+        obs, rewards, dones, infos = vec_env.step(actions)
+        ep_rewards += rewards
+        for i, info in enumerate(infos):
+            if writer:
+                scalars = (
+                    ('economics/revenue', info.get('revenue', 0)),
+                    ('economics/profit', info.get('profit', 0)),
+                    ('economics/margin', info.get('avg_margin', 0)),
+                    ('coi/erosion', info.get('coi_erosion', 0)),
+                    ('coi/leakage', info.get('coi_leakage', 0)),
+                    ('alpha/estimation_error', abs(info.get('alpha_true', 0) - info.get('alpha_est', 0))),
+                    ('agents/count', info.get('n_agents', 0)),
+                )
+                for tag, value in scalars:
+                    writer.add_scalar(tag, value, step)
+            if not dones[i]:
+                continue
+            # Episode finished for env i: report its return and start a new sum.
+            if writer:
+                writer.add_scalar('rollout/ep_reward', ep_rewards[i], step)
+            ep_rewards[i] = 0
+
+
+def train(cfg: ExperimentConfig) -> Dict[str, Any]:
+    """Train an RL algorithm or run a baseline, then evaluate and persist the results.
+
+    Writes ``config.json`` and ``results.json`` (plus models and TensorBoard
+    logs where applicable) under ``cfg.log_dir / cfg.experiment_name``.
+
+    Args:
+        cfg: experiment settings; ``cfg.algo`` selects "ppo", "sac", "a2c" or
+            one of the baselines "fixed", "adaptive", "random", "myopic".
+
+    Returns:
+        ``{"path": <run directory>, "metrics": <final evaluation metrics>}``.
+
+    Raises:
+        ImportError: an RL algorithm was requested but stable-baselines3 is missing.
+        KeyError: ``cfg.algo`` is neither a known algorithm nor a baseline.
+    """
+    algo = cfg.algo.lower()
+    is_baseline = algo in ["fixed", "adaptive", "random", "myopic"]
+    if not HAS_SB3 and not is_baseline:
+        raise ImportError("stable-baselines3 required: pip install stable-baselines3[extra]")
+
+    log_path = Path(cfg.log_dir) / cfg.experiment_name
+    log_path.mkdir(parents=True, exist_ok=True)
+    with open(log_path / "config.json", "w") as f:
+        json.dump(asdict(cfg), f, indent=2)
+
+    writer = SummaryWriter(log_path) if HAS_TB else None
+    train_env = eval_env = None
+    # try/finally: the writer and environments are released even if training
+    # or evaluation raises part-way through.
+    try:
+        train_env = make_vec_env(cfg, cfg.n_envs)
+        eval_env = make_vec_env(cfg, 1)
+
+        if is_baseline:
+            policy = {"fixed": Policy.fixed, "adaptive": Policy.adaptive, "random": Policy.random, "myopic": Policy.myopic}[algo]()
+            run_baseline(policy, train_env, cfg.total_timesteps, writer)
+            final_metrics = evaluate_policy(policy, cfg)
+        else:
+            common = dict(verbose=1, seed=cfg.seed, tensorboard_log=str(log_path), device="auto")
+            # Constructors are wrapped in lambdas so only the selected model is built.
+            model = {
+                "ppo": lambda: PPO("MlpPolicy", train_env, learning_rate=3e-4, n_steps=2048, batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95, clip_range=0.2, ent_coef=0.01, **common),
+                "sac": lambda: SAC("MlpPolicy", train_env, learning_rate=1e-4, buffer_size=50_000, batch_size=512, tau=0.02, gamma=0.99, learning_starts=1000, ent_coef="auto_0.1", train_freq=4, **common),
+                "a2c": lambda: A2C("MlpPolicy", train_env, learning_rate=7e-4, n_steps=5, gamma=0.99, **common),
+            }[algo]()
+
+            cb = MetricsCallback(writer)
+            eval_cb = EvalCallback(eval_env, best_model_save_path=str(log_path / "best"), log_path=str(log_path),
+                                   eval_freq=cfg.eval_freq, n_eval_episodes=cfg.n_eval_episodes, deterministic=True)
+            model.learn(cfg.total_timesteps, callback=[cb, eval_cb], progress_bar=True)
+            model.save(log_path / "final_model")
+            final_metrics = evaluate_policy(model, cfg)
+
+        if writer:
+            log_metrics(writer, final_metrics, 'final', cfg.total_timesteps)
+    finally:
+        if writer:
+            writer.close()
+        for env in (train_env, eval_env):
+            if env is not None:
+                env.close()
+
+    with open(log_path / "results.json", "w") as f:
+        json.dump(final_metrics, f, indent=2)
+    return {"path": str(log_path), "metrics": final_metrics}
+
+
+def _train_alpha(args: tuple) -> tuple[str, Dict]:
+    """Worker for parallel sweep - must be top-level for pickling.
+
+    ``args`` is ``(cfg_dict, alpha)``. The dict is modified in place, so
+    callers pass a copy. Returns ``("alpha_<value>", metrics)``.
+    """
+    cfg_dict, alpha = args
+    cfg_dict.update(
+        alpha_true=alpha,
+        experiment_name=f"{cfg_dict['algo']}_a{alpha:.2f}_{cfg_dict['reward_mode']}",
+    )
+    print(f"[alpha={alpha:.2f}] starting")
+    outcome = train(ExperimentConfig(**cfg_dict))
+    print(f"[alpha={alpha:.2f}] done")
+    return f"alpha_{alpha:.2f}", outcome["metrics"]
+
+
+def run_sweep(cfg: ExperimentConfig, alphas: List[float] | None = None, max_workers: int | None = None) -> Dict[str, Dict]:
+    """Train one run per contamination level and save the combined metrics as JSON.
+
+    Args:
+        cfg: base settings; alpha and experiment name are overridden per run.
+        alphas: levels to sweep; defaults to 0.0 through 0.9 in steps of 0.1.
+        max_workers: 1 runs sequentially in this process; any other value is
+            passed to ProcessPoolExecutor.
+
+    Returns:
+        Mapping ``"alpha_<value>" -> metrics``; in parallel mode the entries
+        appear in completion order.
+    """
+    if not alphas:
+        alphas = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
+    cfg_dict = asdict(cfg)
+    results = {}
+
+    if max_workers == 1:  # sequential fallback
+        for a in alphas:
+            key, metrics = _train_alpha((cfg_dict.copy(), a))
+            results[key] = metrics
+    else:
+        with ProcessPoolExecutor(max_workers=max_workers) as pool:
+            # Each worker receives its own copy because the worker edits the dict.
+            pending = [pool.submit(_train_alpha, (cfg_dict.copy(), a)) for a in alphas]
+            for fut in as_completed(pending):
+                key, metrics = fut.result()
+                results[key] = metrics
+
+    summary_path = Path(cfg.log_dir) / f"sweep_{cfg.algo}_{cfg.reward_mode}.json"
+    with open(summary_path, "w") as f:
+        json.dump(results, f, indent=2)
+    print(f"\nSweep results saved to {summary_path}")
+    return results
+
+
+def _train_policy(args: tuple) -> tuple[str, Dict]:
+    """Worker for parallel policy comparison.
+
+    ``args`` is ``(cfg_dict, algo)``. The dict is modified in place, so
+    callers pass a copy. Returns ``(algo, metrics)``.
+    """
+    cfg_dict, algo = args
+    cfg_dict.update(
+        algo=algo,
+        experiment_name=f"cmp_{algo}_a{cfg_dict['alpha_true']:.2f}",
+    )
+    print(f"[{algo}] starting")
+    outcome = train(ExperimentConfig(**cfg_dict))
+    print(f"[{algo}] done")
+    return algo, outcome["metrics"]
+
+
+def compare_policies(cfg: ExperimentConfig, policies: List[str] | None = None, max_workers: int | None = None) -> Dict[str, Dict]:
+    """Run several policies under the same settings, save and print their metrics.
+
+    Args:
+        cfg: base settings; algo and experiment name are overridden per policy.
+        policies: algo names to compare; defaults to the four baselines.
+        max_workers: 1 runs sequentially in this process; any other value is
+            passed to ProcessPoolExecutor.
+
+    Returns:
+        Mapping ``algo -> metrics``; in parallel mode the entries appear in
+        completion order.
+    """
+    if not policies:
+        policies = ["fixed", "adaptive", "myopic", "random"]
+    cfg_dict = asdict(cfg)
+    results = {}
+
+    if max_workers == 1:
+        for p in policies:
+            algo, metrics = _train_policy((cfg_dict.copy(), p))
+            results[algo] = metrics
+    else:
+        with ProcessPoolExecutor(max_workers=max_workers) as pool:
+            # Each worker receives its own copy because the worker edits the dict.
+            pending = [pool.submit(_train_policy, (cfg_dict.copy(), p)) for p in policies]
+            for fut in as_completed(pending):
+                algo, metrics = fut.result()
+                results[algo] = metrics
+
+    cmp_path = Path(cfg.log_dir) / f"compare_a{cfg.alpha_true:.2f}.json"
+    with open(cmp_path, "w") as f:
+        json.dump(results, f, indent=2)
+    print(f"\nComparison saved to {cmp_path}")
+    for algo, m in results.items():
+        print(f"  {algo:12s}: reward={m['reward_mean']:.2f} coi_erosion={m['coi_erosion_mean']:.4f} alpha_err={m['alpha_error_mean']:.4f}")
+    return results
+
+
+def main():
+    """Command-line entry point: a single run, a contamination sweep, or a baseline comparison.
+
+    ``--sweep`` takes precedence over ``--compare`` when both are given.
+    """
+    ap = argparse.ArgumentParser(description="Train RL pricing policies")
+    ap.add_argument("--algo", default="ppo", choices=["ppo", "sac", "a2c", "fixed", "adaptive", "random", "myopic"])
+    ap.add_argument("--steps", type=int, default=100_000)
+    ap.add_argument("--alpha", type=float, default=0.2)
+    ap.add_argument("--reward-mode", default="robust", choices=["revenue", "profit", "robust", "coi_aware"])
+    ap.add_argument("--n-products", type=int, default=10)
+    ap.add_argument("--n-envs", type=int, default=4)
+    ap.add_argument("--seed", type=int, default=42)
+    ap.add_argument("--log-dir", default="sim/case/thesis_simplified/runs")
+    ap.add_argument("--sweep", action="store_true", help="run contamination sweep")
+    ap.add_argument("--compare", action="store_true", help="compare all baselines")
+    ap.add_argument("--workers", type=int, default=None, help="max parallel workers for sweep (None=auto, 1=sequential)")
+    ns = ap.parse_args()
+
+    cfg = ExperimentConfig(
+        algo=ns.algo,
+        total_timesteps=ns.steps,
+        alpha_true=ns.alpha,
+        reward_mode=ns.reward_mode,
+        n_products=ns.n_products,
+        n_envs=ns.n_envs,
+        seed=ns.seed,
+        log_dir=ns.log_dir,
+    )
+
+    if ns.sweep:
+        run_sweep(cfg, max_workers=ns.workers)
+        return
+    if ns.compare:
+        compare_policies(cfg, max_workers=ns.workers)
+        return
+
+    result = train(cfg)
+    print(f"\nTraining complete: {result['path']}")
+    print(f"Metrics: {json.dumps(result['metrics'], indent=2)}")
+
+
+# Run the command-line interface only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -19,6 +19,7 @@ except ImportError:
    lib_make_state_repr = None
    lib_transition_histogram = None

+
 class BehaviorModel:
    def __init__(self, src_dir: str, loader_cls=Loader):
        self.loader = loader_cls(src_dir)
@@ -206,6 +207,7 @@ def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "

 def kl_divergence(p: Dict[str, float], q: Dict[str, float]) -> float:
    eps = 1e-10
+    # KL(p || q) = sum over keys k of p: p[k] * log(p[k] / q[k]); eps is added to both terms, keys missing from q count as 0
    return sum((p[k] + eps) * np.log((p[k] + eps) / (q.get(k, 0.0) + eps)) for k in p)

 if __name__ == "__main__":
@@ -222,6 +224,7 @@ if __name__ == "__main__":

    agent_model = AgentBehaviorModel(agent_dir)
    agent_mdp = agent_model.build_MDP()
+
    print(f"AGENT... Built MDP: {agent_mdp['num_states']} states, "
          f"{sum(len(t) for t in agent_mdp['transitions'].values())} transitions")
    if not agent_mdp['states']:
@@ -230,6 +233,7 @@ if __name__ == "__main__":

    human_evt = aggregate_event_transitions(human_mdp)
    agent_evt = aggregate_event_transitions(agent_mdp)
+
    common = set(human_evt.keys()) & set(agent_evt.keys())

    if not common:
--- a/sim/rl/engine.py
+++ b/sim/rl/engine.py
@@ -3,8 +3,7 @@ import numpy as np
 import pandas as pd
 from abc import ABC, abstractmethod
 from typing import Dict, Any
-from environment import BusinessLogicConstraints
-
+from sim.rl.environment import BusinessLogicConstraints

 """
 An angine by default should have its own demand estimation mechanism from the observed observations whihc are the computer feature.
@@ -32,9 +31,12 @@ class BasePricingEngine(ABC):
        """
        pass

-    @abstractmethod
-    def update(obs, reward, done, info):
-        pass
+    def update(self, observation: Dict[str, Any], reward: float, done: bool, info: Dict[str, Any]) -> None:
+        """Default update: remember the latest observation, reward and info (`done` is ignored). Engines can override as needed."""
+        self.last_observation = observation
+        self.last_reward = reward
+        self.last_info = info
+



@@ -48,14 +50,14 @@ class WildPricingEngine(BasePricingEngine):
    def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
        super().__init__(constraints, seed)
        # per-product unit costs (unknown to customers; known to platform)
-        self.unit_cost = self.rng.uniform(8.0, 40.0, size=self.c.product_catelogue_size).astype(np.float32)
+        self.unit_cost = self.rng.uniform(8.0, 40.0, size=self.c.product_catalogue_size).astype(np.float32)
        # online elasticity estimate (start moderately elastic)
-        self.e_hat = np.full((self.c.product_catelogue_size,), -1.3, dtype=np.float32)
+        self.e_hat = np.full((self.c.product_catalogue_size,), -1.3, dtype=np.float32)
        # EWMA state for log-log regression
-        self.mu_logp = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
-        self.mu_logq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
-        self.cov_pq  = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
-        self.var_p   = np.ones(self.c.product_catelogue_size, dtype=np.float32)
+        self.mu_logp = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
+        self.mu_logq = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
+        self.cov_pq  = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
+        self.var_p   = np.ones(self.c.product_catalogue_size, dtype=np.float32)
        # knobs typical in production
        self.lr = 0.08
        self.ewma = 0.05
@@ -140,7 +142,7 @@ class SimpleDemandEngine(BasePricingEngine):

    def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
        self.step_count += 1
-        demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32))
+        demand = _extract_demand(observation, self.c.product_catalogue_size)
        if self.prev_demand is None:
            self.prev_demand = demand.copy()
            return current_prices.copy()
@@ -187,15 +189,15 @@ class ThompsonSamplingEngine(BasePricingEngine):
    def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
        super().__init__(constraints, seed)
        self.n_price_levels = 5
-        self.alpha = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
-        self.beta = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
+        self.alpha = np.ones((self.c.product_catalogue_size, self.n_price_levels), dtype=np.float32)
+        self.beta = np.ones((self.c.product_catalogue_size, self.n_price_levels), dtype=np.float32)
        self.price_grid = None
        self.last_actions = None

    def reset(self):
        super().reset()
-        self.alpha = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
-        self.beta = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
+        self.alpha = np.ones((self.c.product_catalogue_size, self.n_price_levels), dtype=np.float32)
+        self.beta = np.ones((self.c.product_catalogue_size, self.n_price_levels), dtype=np.float32)
        self.price_grid = None
        self.last_actions = None

@@ -206,10 +208,10 @@ class ThompsonSamplingEngine(BasePricingEngine):
            lo = current_prices * 0.7
            hi = current_prices * 1.3
            self.price_grid = np.linspace(lo, hi, self.n_price_levels).T
-        demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32))
+        demand = _extract_demand(observation, self.c.product_catalogue_size)
        # update beliefs based on last action
        if self.last_actions is not None:
-            for i in range(self.c.product_catelogue_size):
+            for i in range(self.c.product_catalogue_size):
                a = self.last_actions[i]
                reward = demand[i]
                if reward > 0.5:
@@ -217,11 +219,22 @@ class ThompsonSamplingEngine(BasePricingEngine):
                else:
                    self.beta[i, a] += 1.0
        # thompson sampling: sample from posterior, pick best
-        new_prices = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
-        actions = np.zeros(self.c.product_catelogue_size, dtype=int)
-        for i in range(self.c.product_catelogue_size):
+        new_prices = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
+        actions = np.zeros(self.c.product_catalogue_size, dtype=int)
+        for i in range(self.c.product_catalogue_size):
            theta = self.rng.beta(self.alpha[i], self.beta[i]).astype(np.float32)
            actions[i] = int(np.argmax(theta))
            new_prices[i] = self.price_grid[i, actions[i]]
        self.last_actions = actions
        return np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)
+
+
+def _extract_demand(observation: Dict[str, Any], n: int) -> np.ndarray:
+    """Return demand as float32: the nested ``elasticity`` entry first, then the top-level one, else ``n`` zeros."""
+    nested = observation["elasticity"] if "elasticity" in observation else None
+    found = nested.get("demand") if isinstance(nested, dict) else None
+    if found is None:
+        found = observation.get("demand")
+    if found is None:
+        return np.zeros(n, dtype=np.float32)
+    return np.asarray(found, dtype=np.float32)
--- a/sim/rl/environment.py
+++ b/sim/rl/environment.py
@@ -1,319 +1,244 @@
-import gymnasium as gym
-from gymnasium import spaces
-import numpy as np
+from __future__ import annotations
+
 from dataclasses import dataclass
-import pandas as pd
-from typing import Callable, Optional, Dict, Any, List
+from typing import Any, Dict, Optional, Tuple

-# "learner"  agent learning to optimize pricing
-# "agent"  part of environment creating demand signals that learner processes
+import numpy as np
+
+try:
+    import gymnasium as gym
+    from gymnasium import spaces
+except ImportError as e:
+    raise ImportError("sim.rl.environment requires gymnasium") from e
+
+from sim.case.thesis_simplified.coi import COIWindow, coi_erosion, compute_coi_window
+from sim.case.thesis_simplified.separability import estimate_alpha as estimate_session_alpha
+from sim.case.thesis_simplified.simplified import Limbo, Session, put_prices_to_market
+from sim.rl.thesis_core import aggregate_demand_by_product, aggregate_purchases, constrain_prices
+
+
+@dataclass(frozen=True)
+class BusinessLogicConstraints:
+    product_catalogue_size: int = 100
+    max_steps: int = 2000
+    sessions_per_step: int = 250

-@dataclass
-class BusinessLogicConstraints():
-    max_price_adjustment: float = 0.30
    system_max_price: float = 500.0
    system_min_price: float = 1.0
-    product_catalogue_size: int = 100
-    episode_length: int = 200
-    sessions_per_step: int = 250
-    agent_share: float = 0.25
-    agent_recon_multiplier: float = 6.0
-    agent_purchase_probability: float = 0.20
+    max_price_adjustment: float = 0.30
+    min_margin_pct: float = 0.05
+
+    agent_share: float = 0.2
+    alpha_drift: float = 0.0
+    alpha_bounds: tuple[float, float] = (0.0, 0.8)
+
    coi_strength: float = 0.25
-    coi_threshold: float = 4.0
-    coi_sigmoid_temp: float = 1.25
-    base_human_demand: float = 0.08
-    base_agent_demand: float = 0.05
-    human_price_elasticity: float = -1.2 # assumptions here
-    agent_price_elasticity: float = -0.6
-    w_agent_loss: float = 1.0
    w_volatility: float = 5.0
    w_estimation_error: float = 0.25
+
    seed: int = 7


-def _sigmoid(x: np.ndarray) -> np.ndarray:
-    return 1.0 / (1.0 + np.exp(-x))
-
-class BehavioralProfile:
-    """simple markov chain model for generating synthetic interaction events"""
-    def __init__(self, actor: str, purchase_probs: np.ndarray):
-        self.actor = actor
-        self.purchase_probs = purchase_probs
-        self.states = ['view', 'cart', 'checkout']
-        # transition matrix: view->cart 0.3, view->view 0.6, view->exit 0.1, cart->checkout 0.5, cart->view 0.4, cart->exit 0.1
-        self.trans = {'view': {'view': 0.6, 'cart': 0.3, 'exit': 0.1}, 'cart': {'checkout': 0.5, 'view': 0.4, 'exit': 0.1}, 'checkout': {'exit': 1.0}}
-        if actor == 'agents':  # agents browse more before purchasing
-            self.trans['view'] = {'view': 0.75, 'cart': 0.15, 'exit': 0.1}
-            self.trans['cart'] = {'checkout': 0.3, 'view': 0.6, 'exit': 0.1}
-
-    def sample(self, rng: np.random.Generator) -> Dict[str, Any]:
-        """sample single interaction event"""
-        product_idx = rng.integers(0, len(self.purchase_probs))
-        state = 'view'  # always start with view
-        # pick next state based on transition probs
-        trans = self.trans.get(state, {'exit': 1.0})
-        next_state = rng.choice(list(trans.keys()), p=list(trans.values()))
-        price_paid = 0.0 if next_state != 'checkout' else float(rng.uniform(50, 200))
-        return {'action': state, 'product_idx': product_idx, 'actor': 'agent' if self.actor == 'agents' else 'human', 't': 0.0, 'price_paid': price_paid}
-
-
-def _load_behavioral_profile(actor: str, demand_forcing: np.ndarray) -> BehavioralProfile:
-    """returns a behavioral profile for generating synthetic sessions
-    actor: 'humans' or 'agents'
-    demand_forcing: per-product purchase probabilities used to weight interactions
-    """
-    return BehavioralProfile(actor, demand_forcing)
-
-
-class CommercePlatform:
-    """state management for the environment, simulates demand"""
-    def __init__(self, product_catalogue_size: int, max_price: float, min_price: float, constraints: BusinessLogicConstraints):
-        self.product_catalogue_size = product_catalogue_size
-        self.product_supply = np.random.uniform(low=10, high=50, size=(self.product_catalogue_size,))
-        self.max_price = max_price
-        self.min_price = min_price
-        self.constraints = constraints
-        self.simulation_history: List[Dict[str, Any]] = []
-        self._rng = np.random.default_rng(constraints.seed)
-        self._last_interaction_df: pd.DataFrame = pd.DataFrame()
-
-    def setup_true_demand(self, prices: np.ndarray) -> Dict[str, np.ndarray]:
-        p = np.clip(prices, self.min_price, self.max_price)
-        pn = p / self.max_price
-        human_prob = self.constraints.base_human_demand * (pn ** self.constraints.human_price_elasticity)
-        agent_prob = self.constraints.base_agent_demand * (pn ** self.constraints.agent_price_elasticity)
-        return {"human_purchase_prob": np.clip(human_prob, 0.0, 0.95), "agent_purchase_prob": np.clip(agent_prob, 0.0, 0.95)}
-
-    def _simulate_sessions(self, base_prices: np.ndarray) -> pd.DataFrame:
-        demand = self.setup_true_demand(base_prices)
-        human_pprob = demand["human_purchase_prob"]
-        agent_pprob = demand["agent_purchase_prob"]
-        events: List[Dict[str, Any]] = []
-        T = self.constraints.sessions_per_step
-        n_agent_sessions = int(round(T * self.constraints.agent_share))
-        n_human_sessions = T - n_agent_sessions
-        n_agent_ids = max(1, n_agent_sessions // 2)
-        session_map = {
-            'humans': n_human_sessions,
-            'agents': n_agent_ids
-        }
-        pprob_map = {
-            'humans': human_pprob,
-            'agents': agent_pprob
-        }
-        joint_events = []
-        for actor, n_sessions in session_map.items():
-            bp = _load_behavioral_profile(actor, pprob_map[actor])
-            counter = 0
-            events = []
-            while counter < n_sessions:
-                session_events = []
-                while len(session_events) == 0 or session_events[-1]['action'] == 'checkout':
-                    interaction_event = bp.sample(self._rng)
-                    interaction_event['session_id'] = f'{actor}_{counter:06d}'
-                    # TODO any other assignments
-                    session_events.append(interaction_event)
-                events.extend(session_events)
-                counter += 1
-            joint_events.extend(events)
-
-        return pd.DataFrame(joint_events)
-
-    def compute_interaction_features(self, interaction_df: pd.DataFrame) -> Dict[str, float]:
-        if interaction_df.empty:
-            return {"mean_sale_price": 0.0, "look_to_book": 0.0}
-        purchases = interaction_df[interaction_df["action"] == "purchase"]
-        mean_sale_price = float(purchases["price_paid"].mean()) if not purchases.empty else 0.0
-        views = float((interaction_df["action"] == "view").sum())
-        buys = float((interaction_df["action"] == "purchase").sum())
-        return {"mean_sale_price": mean_sale_price, "look_to_book": float(views / (buys + 1e-6))}
-
-    def _session_feature_table(self, df: pd.DataFrame) -> pd.DataFrame:
-        # TODO: adapt this
-        if df.empty:
-            return pd.DataFrame()
-        g = df.groupby("session_id", sort=False)
-        session_duration = g["t"].max() - g["t"].min()
-        total_interactions = g.size()
-        avg_time_between = g["t"].apply(lambda x: float(np.diff(np.sort(x.to_numpy())).mean()) if len(x) > 1 else 0.0)
-        interaction_velocity = total_interactions / (session_duration + 1e-6)
-        views = g.apply(lambda x: int((x["action"] == "view").sum()), include_groups=False)
-        cart_adds = g.apply(lambda x: int((x["action"] == "cart").sum()), include_groups=False)
-        purchases = g.apply(lambda x: int((x["action"] == "purchase").sum()), include_groups=False)
-        conversion_rate = purchases / (views + 1e-6)
-        is_agent = g["actor"].apply(lambda s: bool((s == "agent").any()), include_groups=False)
-
-        return pd.DataFrame({
-            "session_duration_sec": session_duration.astype(float),
-            "avg_time_between_events": avg_time_between.astype(float),
-            "total_interactions": total_interactions.astype(int),
-            "interaction_velocity": interaction_velocity.astype(float),
-            "item_views": views.astype(int),
-            "cart_adds": cart_adds.astype(int),
-            "purchases": purchases.astype(int),
-            "conversion_rate": conversion_rate.astype(float),
-            "is_agent": is_agent.astype(bool),
-        }).reset_index()
-
-    def get_interaction_data(self) -> np.ndarray:
-        if self._last_interaction_df.empty:
-            return np.array([], dtype=object)
-        return self._last_interaction_df.to_dict(orient="records")
+def make_env(constraints: Optional[BusinessLogicConstraints] = None) -> "PHANTOMEnv":
+    return PHANTOMEnv(constraints=constraints or BusinessLogicConstraints())  # default constraints when none are given


 class PHANTOMEnv(gym.Env):
-    metadata = {"render_modes": []}
+    metadata = {"render_modes": ["human", "ansi"]}

-    def __init__(self, constraints):
+    def __init__(self, constraints: Optional[BusinessLogicConstraints] = None):
        super().__init__()
-        self.constraints = BusinessLogicConstraints()
-        self.action_space = spaces.Box(low=-self.constraints.max_price_adjustment,
-                                       high=self.constraints.max_price_adjustment,
-                                       shape=(self.constraints.product_catalogue_size,), dtype=np.float32)
-        self.observation_space = spaces.Dict({
-            "elasticity": spaces.Dict({
+        self.c = constraints or BusinessLogicConstraints()
+        self.n = int(self.c.product_catalogue_size)
+
+        self._rng = np.random.default_rng(self.c.seed)
+        self._t = 0
+        self._alpha_true = float(self.c.agent_share)
+        self._alpha_hat = float(self.c.agent_share)
+        self._costs = np.zeros(self.n, dtype=np.float32)
+        self._refs = np.zeros(self.n, dtype=np.float32)
+        self._prices: Optional[np.ndarray] = None
+        self._last_sessions: list[Session] = []
+        self._last_coi: COIWindow | None = None
+        self._limbo = Limbo()
+
+        self.action_space = spaces.Box(
+            low=np.full((self.n,), self.c.system_min_price, dtype=np.float32),
+            high=np.full((self.n,), self.c.system_max_price, dtype=np.float32),
+            dtype=np.float32,
+        )
+        self.observation_space = spaces.Dict(
+            {
+                "elasticity": spaces.Dict(
+                    {
                        "price": spaces.Box(
-                    low=np.full((self.constraints.product_catalogue_size,), self.constraints.system_min_price, dtype=np.float32),
-                    high=np.full((self.constraints.product_catalogue_size,), self.constraints.system_max_price, dtype=np.float32),
-                    dtype=np.float32),
+                            low=np.full((self.n,), self.c.system_min_price, dtype=np.float32),
+                            high=np.full((self.n,), self.c.system_max_price, dtype=np.float32),
+                            dtype=np.float32,
+                        ),
                        "demand": spaces.Box(
-                    low=np.zeros((self.constraints.product_catalogue_size,), dtype=np.float32),
-                    high=np.full((self.constraints.product_catalogue_size,), 1e6, dtype=np.float32),
-                    dtype=np.float32),
-            })
-            # TODO: define more features that we compute from the interaction data
-        })
-        self.commerce_platform = CommercePlatform(
-            product_catalogue_size=self.constraints.product_catalogue_size,
-            max_price=self.constraints.system_max_price,
-            min_price=self.constraints.system_min_price,
-            constraints=self.constraints)
-        self._rng = np.random.default_rng(self.constraints.seed)
-        self.t = 0
-        self._prev_prices: Optional[np.ndarray] = None
-        self.state: Dict[str, Any] = {}
+                            low=np.zeros((self.n,), dtype=np.float32),
+                            high=np.full((self.n,), 1e9, dtype=np.float32),
+                            dtype=np.float32,
+                        ),
+                    }
+                ),
+                "market": spaces.Dict(
+                    {
+                        "alpha_hat": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
+                        "revenue_rate": spaces.Box(low=0.0, high=1e12, shape=(1,), dtype=np.float32),
+                        "conversion_rate": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
+                        "price_volatility": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
+                    }
+                ),
+                "cost": spaces.Box(
+                    low=np.zeros((self.n,), dtype=np.float32),
+                    high=np.full((self.n,), self.c.system_max_price, dtype=np.float32),
+                    dtype=np.float32,
+                ),
+            }
+        )
+
+    def _reset_catalogue(self) -> None:  # redraw unit costs, then derive reference and current prices from them
+        unit_costs = self._rng.uniform(15.0, 60.0, size=self.n).astype(np.float32)
+        markups = self._rng.uniform(0.2, 0.6, size=self.n).astype(np.float32)  # drawn after costs: keeps the RNG stream order
+        self._costs, self._refs = unit_costs, (unit_costs * (1.0 + markups)).astype(np.float32)
+        self._prices = self._refs.copy()
+
+    def _observe_market(
+        self, prices: np.ndarray
+    ) -> tuple[list[Session], Dict[str, float], np.ndarray, np.ndarray, float, float, int]:
+        """Put `prices` to market with a seed drawn from the env RNG; return sessions, demand map and aggregates."""
+        sessions, demand_map = put_prices_to_market(
+            prices,
+            costs=self._costs,
+            alpha=self._alpha_true,
+            n_sessions=int(self.c.sessions_per_step),
+            seed=int(self._rng.integers(0, 2**31 - 1)),
+        )
+        demand_by_product = aggregate_demand_by_product(sessions, demand_map, self.n)
+        purchases, revenue, cost, n_agents = aggregate_purchases(sessions, self._costs, self.n)
+        return sessions, demand_map, demand_by_product, purchases, revenue, cost, n_agents  # conversion is derived by step()
+
+    def _update_alpha_hat(self, sessions: list[Session]) -> float:
+        """Blend the mean per-session alpha score into the running estimate (0.8 old / 0.2 new) and return it."""
+        per_session = [estimate_session_alpha(sess) for sess in sessions if sess.events]
+        if per_session:  # sessions without events contribute nothing; with no scores the estimate is unchanged
+            batch_mean = float(np.mean(per_session))
+            blended = 0.8 * self._alpha_hat + 0.2 * batch_mean
+            self._alpha_hat = float(np.clip(blended, 0.0, 1.0))
+        return self._alpha_hat
+
+    def _reward(self, prices: np.ndarray, revenue: float, cost: float, volatility: float) -> float:
+        """Profit minus weighted COI leakage, price volatility and alpha-estimation error (`prices` is not used)."""
+        gap = abs(self._alpha_hat - self._alpha_true)
+        leak = float(self._last_coi.leak) if self._last_coi else 0.0
+        return float(revenue - cost) - self.c.coi_strength * leak - self.c.w_volatility * volatility - self.c.w_estimation_error * gap
+
+    def _build_obs(
+        self,
+        prices: np.ndarray,
+        demand_by_product: np.ndarray,
+        revenue: float,
+        conversion: float,
+        volatility: float,
+    ) -> Dict[str, Any]:
+        """Assemble the float32 observation dict with its "elasticity", "market" and "cost" entries."""
+        elasticity = {"price": prices.astype(np.float32), "demand": demand_by_product.astype(np.float32)}
+        scalars = {
+            "alpha_hat": self._alpha_hat,
+            "revenue_rate": revenue,
+            "conversion_rate": conversion,
+            "price_volatility": volatility,
+        }
+        market = {key: np.array([val], dtype=np.float32) for key, val in scalars.items()}  # each scalar as shape (1,)
+        return {"elasticity": elasticity, "market": market, "cost": self._costs.astype(np.float32)}

    def reset(self, seed: Optional[int] = None, options: Optional[dict] = None):
        super().reset(seed=seed)
        if seed is not None:
            self._rng = np.random.default_rng(seed)
-            self.commerce_platform._rng = np.random.default_rng(seed)
-        self.t = 0
-        init_prices = self._rng.uniform(low=60.0, high=140.0, size=(self.constraints.product_catalogue_size,)).astype(np.float32)
-        self._prev_prices = init_prices.copy()
-        self.state = {
-            "elasticity": {
-                "price": init_prices,
-                "demand": np.zeros((self.constraints.product_catalogue_size,), dtype=np.float32),
-            }
-        }
-        return self.state, {}
+        self._t = 0
+        self._alpha_true = float(np.clip(self.c.agent_share, *self.c.alpha_bounds))
+        self._alpha_hat = float(self.c.agent_share)
+        self._reset_catalogue()
+        self._limbo = Limbo()
+        self._last_sessions = []
+        self._last_coi = None

-    def step(self, action: np.ndarray):
-        self.t += 1
-        base_prices = self.state["elasticity"]["price"].astype(np.float32)
-        new_prices = np.clip(base_prices * (1.0 + action.astype(np.float32)),
-                           self.constraints.system_min_price,
-                           self.constraints.system_max_price).astype(np.float32)
+        prices = self._prices if self._prices is not None else np.zeros(self.n, dtype=np.float32)
+        obs = self._build_obs(prices, np.zeros(self.n, dtype=np.float32), 0.0, 0.0, 0.0)
+        return obs, {"alpha_true": self._alpha_true}

-        self.state["elasticity"]["price"] = new_prices
-        interactions_df = self.commerce_platform._simulate_sessions(new_prices)
-        result = self.commerce_platform.compute_interaction_features(interactions_df)
-        COI = 0.0  # TODO: implement cost-of-information computation
+    def step(self, action: np.ndarray) -> Tuple[Dict[str, Any], float, bool, bool, Dict[str, Any]]:
+        if self._prices is None:
+            raise RuntimeError("reset() must be called before step()")

-        volatility = 0.0 if self._prev_prices is None else \
-            float(np.mean(np.abs((new_prices - self._prev_prices) / (self._prev_prices + 1e-6))))
-        self._prev_prices = new_prices.copy()
+        prev = self._prices
+        prices = constrain_prices(
+            prev,
+            np.asarray(action, dtype=np.float32),
+            costs=self._costs,
+            min_price=float(self.c.system_min_price),
+            max_price=float(self.c.system_max_price),
+            max_adjustment=float(self.c.max_price_adjustment),
+            min_margin_pct=float(self.c.min_margin_pct),
+        )
+        self._prices = prices
+        self._limbo.add_update("prices", prices)

-        # extract metrics with safe defaults for incomplete simulation
-        revenue_observed = float(result.get("revenue_observed", result.get("mean_sale_price", 0.0)))
-        agent_loss = float(result.get("agent_loss", 0.0))
+        sessions, demand_map, demand_by_product, purchases, revenue, cost, n_agents = self._observe_market(prices)
+        self._last_sessions = sessions
+        self._limbo.add_update("demand", demand_map)

-        reward = (revenue_observed
-                  - COI
-                  - self.constraints.w_agent_loss * agent_loss
-                  - self.constraints.w_volatility * volatility
-                  - self.constraints.w_estimation_error)
+        self._update_alpha_hat(self._last_sessions)
+        self._last_coi = compute_coi_window(self._last_sessions, self._costs, demand_mapping=demand_map)

-        terminated = self.t >= self.constraints.episode_length
+        self._alpha_true = float(np.clip(self._alpha_true + self.c.alpha_drift, *self.c.alpha_bounds))
+        volatility = float(np.std((prices - prev) / (prev + 1e-6)))
+        reward = float(self._reward(prices, revenue, cost, volatility))
+        conversion = float(np.sum(purchases) / max(len(self._last_sessions), 1))
+
+        self._t += 1
+        terminated = self._t >= int(self.c.max_steps)
+
+        obs = self._build_obs(prices, demand_by_product, revenue, conversion, min(volatility, 1.0))
        info = {
-            "t": self.t,
-            "revenue_observed": revenue_observed,
-            "revenue_oracle": float(result.get("revenue_oracle", revenue_observed)),
-            "agent_loss": agent_loss,
-            "ux_volatility": volatility,
-            "look_to_book": float(result.get("look_to_book", 0.0)),
-            "mean_sale_price": float(result.get("mean_sale_price", 0.0)),
-            "true_human_purchases_total": 0.0,  # TODO: track from simulation
-            "true_agent_purchases_total": 0.0,  # TODO: track from simulation
+            "step": self._t,
+            "reward": reward,
+            "revenue": float(revenue),
+            "profit": float(revenue - cost),
+            "n_sessions": int(self.c.sessions_per_step),
+            "n_agents": int(n_agents),
+            "alpha_true": float(self._alpha_true),
+            "alpha_hat": float(self._alpha_hat),
+            "alpha_error": float(abs(self._alpha_hat - self._alpha_true)),
+            "price_std": float(np.std(prices)),
+            "price_volatility": float(volatility),
        }
-        return self.state, float(reward), terminated, False, info
+        if self._last_coi is not None:
+            info.update(
+                {
+                    "coi_policy": float(self._last_coi.policy),
+                    "coi_agent": float(self._last_coi.agent),
+                    "coi_leakage": float(self._last_coi.leak),
+                    "coi_survival": float(self._last_coi.survival_ratio),
+                    "coi_erosion": float(coi_erosion(self._last_coi.policy, self._last_coi.agent)),
+                }
+            )
+        return obs, reward, terminated, False, info

+    def render(self, mode: str = "human") -> str | None:
+        """Build a one-line status string, print it when mode is "human", and return it (None if no prices are set)."""
+        if self._prices is None:
+            return None
+        fields = (f"t={self._t}/{self.c.max_steps}",
+                  f"alpha_true={self._alpha_true:.3f}", f"alpha_hat={self._alpha_hat:.3f}",
+                  f"price_std={float(np.std(self._prices)):.2f}")
+        summary = " ".join(fields)
+        if mode == "human":
+            print(summary)
+        return summary

-if __name__ == "__main__":
-    import matplotlib.pyplot as plt
-    from collections import defaultdict
-
-    env = PHANTOMEnv(constraints=BusinessLogicConstraints())
-    obs, _ = env.reset(seed=42)
-    metrics = defaultdict(list)
-    total_reward = 0.0
-    done = False
-
-    while not done:
-        action = env.action_space.sample()
-        obs, reward, done, _, info = env.step(action)
-        total_reward += reward
-        p_mean = float(np.mean(obs["elasticity"]["price"]))
-        q_mean = float(np.mean(obs["elasticity"]["demand"]))
-        p_std = float(np.std(obs["elasticity"]["price"]))
-
-        metrics['t'].append(info['t'])
-        metrics['price_mean'].append(p_mean)
-        metrics['price_std'].append(p_std)
-        metrics['demand_mean'].append(q_mean)
-        metrics['revenue_observed'].append(info['revenue_observed'])
-        metrics['revenue_oracle'].append(info['revenue_oracle'])
-        metrics['agent_loss'].append(info['agent_loss'])
-        metrics['ux_volatility'].append(info['ux_volatility'])
-        metrics['look_to_book'].append(info['look_to_book'])
-        metrics['reward'].append(reward)
-        metrics['human_purchases'].append(info['true_human_purchases_total'])
-        metrics['agent_purchases'].append(info['true_agent_purchases_total'])
-
-        if info['t'] % 20 == 0 or done:
-            print(f"t={info['t']:03d} p={p_mean:6.2f}±{p_std:4.2f} q={q_mean:6.2f} "
-                  f"rev={info['revenue_observed']:7.2f} oracle={info['revenue_oracle']:7.2f} "
-                  f"loss={info['agent_loss']:6.2f} ux={info['ux_volatility']:.3f} "
-                  f"ltb={info['look_to_book']:5.2f} r={reward:7.2f}")
-
-    print(f"total_reward={total_reward:.2f}")
-
-    fig, axes = plt.subplots(3, 3, figsize=(15, 12))
-    fig.suptitle('PHANTOM Environment Run', fontsize=14, fontweight='bold')
-
-    plot_configs = [
-        ('price_mean', 'Mean Price', 'Price'),
-        ('demand_mean', 'Mean Demand Estimate', 'Demand'),
-        ('revenue_observed', 'Revenue (Observed)', 'Revenue'),
-        ('agent_loss', 'Agent Loss (Oracle - Observed)', 'Loss'),
-        ('ux_volatility', 'UX Volatility (Price Change)', 'Volatility'),
-        ('look_to_book', 'Look-to-Book Ratio', 'Ratio'),
-        ('reward', 'Step Reward', 'Reward'),
-        ('human_purchases', 'Human Purchases', 'Count'),
-        ('agent_purchases', 'Agent Purchases', 'Count'),
-    ]
-
-    for idx, (key, title, ylabel) in enumerate(plot_configs):
-        ax = axes[idx // 3, idx % 3]
-        ax.plot(metrics['t'], metrics[key], color='blue', alpha=0.7, linewidth=1.5)
-        ax.set_xlabel('Step')
-        ax.set_ylabel(ylabel)
-        ax.set_title(title, fontsize=10, fontweight='bold')
-        ax.grid(True, alpha=0.3)
-
-    plt.tight_layout()
-    plt.savefig('phantom_env_comparison.png', dpi=150, bbox_inches='tight')
-    print("Plot saved to phantom_env_comparison.png")
-    plt.show()
+    def close(self) -> None:
+        """Intentionally a no-op."""
--- a/sim/rl/jax_core/init.py
+++ b/sim/rl/jax_core/init.py
@@ -0,0 +1,11 @@
+"""JAX-accelerated simulation core for PHANTOM environment."""
+from .transitions import TransitionData, compile_transitions, fallback_transitions, JAX_AVAILABLE
+from .simulation import SessionBatch, SimResult, sample_sessions, compute_metrics
+from .features import session_features, compute_session_transitions
+from .separability import compute_divergences, estimate_alpha_batch
+
+__all__ = [
+    "JAX_AVAILABLE", "TransitionData", "compile_transitions", "fallback_transitions",
+    "SessionBatch", "SimResult", "sample_sessions", "compute_metrics",
+    "session_features", "compute_session_transitions", "compute_divergences", "estimate_alpha_batch",
+]
--- a/sim/rl/jax_core/features.py
+++ b/sim/rl/jax_core/features.py
@@ -0,0 +1,69 @@
+"""Vectorized session feature extraction."""
+import numpy as np
+from .transitions import N_STATES, PURCHASE_IDX, CART_IDX
+from .simulation import SessionBatch
+
+try:
+    import jax.numpy as jnp
+    from jax import jit
+    JAX_AVAILABLE = True
+except ImportError:
+    jnp, JAX_AVAILABLE = np, False
+    def jit(f): return f
+
+@jit
+def extract_features(states, dwells, lengths):
+    """Per-session summary features, stacked column-wise into an (n_sess, 9) array."""
+    _, width = states.shape
+    in_session = jnp.arange(width)[None, :] < lengths[:, None]  # True for the first `lengths` steps of each row
+    def visits(state_idx):
+        return jnp.sum((states == state_idx) & in_session, axis=1).astype(jnp.float32)
+    views, learn, carts, purchases = visits(1), visits(2), visits(3), visits(4)
+    duration = jnp.sum(dwells * in_session, axis=1)
+    total = lengths.astype(jnp.float32)
+    # column order: duration, avg_dwell, total, velocity, views, carts, purchases, learn, conversion
+    columns = [duration, duration / (total + 1e-6), total, total / (duration + 1e-6), views, carts, purchases, learn, purchases / (views + 1e-6)]
+    return jnp.stack(columns, axis=1)
+
+def session_features(batch: SessionBatch) -> np.ndarray:
+    """Return the (n_sess, 9) feature matrix of a batch as a plain numpy array.
+
+    Columns: duration, avg_dwell, total, velocity, views, carts, purchases,
+    learn, conversion.
+    """
+    # When JAX is missing this module aliases jnp to numpy and jit to the
+    # identity, so extract_features already is the numpy implementation and
+    # evaluates the same formulas on the same arrays.
+    states, dwells, lengths = (jnp.array(a) for a in (batch.states, batch.dwells, batch.lengths))
+    return np.asarray(extract_features(states, dwells, lengths))
+
+@jit
+def session_transitions(states, lengths, n_states=N_STATES):
+    """Row-normalised empirical transition matrix for every session.
+
+    states is (n_sess, max_len) with -1 padding and lengths is (n_sess,); returns
+    (n_sess, n_states, n_states). Rows of states a session never left stay 0.
+    n_states must be a concrete Python int because it sizes the one-hot table.
+    """
+    src, dst = states[:, :-1], states[:, 1:]
+    # only the first lengths-1 steps are real transitions; -1 marks padding
+    in_session = jnp.arange(states.shape[1] - 1)[None, :] < (lengths[:, None] - 1)
+    valid = in_session & (src >= 0) & (dst >= 0)
+    eye = jnp.eye(n_states)
+    # clip keeps padded indices in range; the mask has to zero each step BEFORE the sum over time
+    src_1h = eye[jnp.clip(src, 0, n_states - 1)] * valid[..., None]
+    counts = jnp.einsum("nti,ntj->nij", src_1h, eye[jnp.clip(dst, 0, n_states - 1)])
+    return counts / (counts.sum(axis=-1, keepdims=True) + 1e-10)
+
+def compute_session_transitions(batch: SessionBatch) -> np.ndarray:
+    """Row-normalised transition frequencies per session, shape (n_sess, N_STATES, N_STATES)."""
+    if JAX_AVAILABLE:
+        return np.asarray(session_transitions(jnp.array(batch.states), jnp.array(batch.lengths)))
+    n_sess, _ = batch.states.shape
+    counts = np.zeros((n_sess, N_STATES, N_STATES), dtype=np.float32)
+    for i in range(n_sess):  # numpy path: tally consecutive state pairs one session at a time
+        row = batch.states[i]
+        for t in range(batch.lengths[i] - 1):
+            if row[t] >= 0 and row[t + 1] >= 0:  # -1 is padding, never a real state
+                counts[i, row[t], row[t + 1]] += 1
+    return counts / (counts.sum(axis=-1, keepdims=True) + 1e-10)
--- a/sim/rl/jax_core/separability.py
+++ b/sim/rl/jax_core/separability.py
@@ -0,0 +1,43 @@
+"""Vectorized KL divergence for separability scoring."""
+import numpy as np
+from typing import Tuple
+
+try:
+    import jax.numpy as jnp
+    from jax import jit
+    JAX_AVAILABLE = True
+except ImportError:
+    jnp, JAX_AVAILABLE = np, False
+    def jit(f): return f
+
+@jit
+def batch_kl(P, Q_human, Q_agent, eps=1e-10):
+    """KL(P||Q) summed over all rows. P:(n,s,s), Q_*:(s,s). Returns (delta_h, delta_a), each (n,)."""
+    smoothed = P + eps
+    p = smoothed / smoothed.sum(axis=-1, keepdims=True)  # P is renormalised after smoothing
+    def kl_from(Q):
+        # Q only has eps added; it is not renormalised
+        return jnp.sum(p * jnp.log(p / (Q[None] + eps)), axis=(1, 2))
+    return kl_from(Q_human), kl_from(Q_agent)
+
+def compute_divergences(session_trans: np.ndarray, ref_human: np.ndarray, ref_agent: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+    """Score every session against the human and agent prototype matrices.
+
+    session_trans is (n, s, s); ref_human and ref_agent are (s, s). Returns
+    (delta_h, delta_a): two (n,) numpy arrays holding KL(session || prototype).
+    """
+    # Without JAX this module aliases jnp to numpy and jit to the identity, so
+    # one batch_kl call covers both backends with the same arithmetic.
+    delta_h, delta_a = batch_kl(
+        jnp.array(session_trans),
+        jnp.array(ref_human), jnp.array(ref_agent),
+    )
+    return np.asarray(delta_h), np.asarray(delta_a)
+
+def estimate_alpha_batch(prob_agent: np.ndarray, delta_h: np.ndarray, delta_a: np.ndarray, temp: float = 1.0) -> np.ndarray:
+    """Blend classifier probs 50/50 with delta_a's share of total divergence; temp > 0 applies a sigmoid centred on 0.5."""
+    mass = np.asarray(delta_h + delta_a)
+    # Divide only where mass is non-negligible: np.where evaluated delta_a / mass everywhere and warned on 0/0.
+    ratio = np.divide(delta_a, mass, out=np.full(mass.shape, 0.5, dtype=np.result_type(mass, np.float32)), where=mass > 1e-8)
+    blended = 0.5 * prob_agent + 0.5 * ratio
+    return np.clip(blended if temp <= 0 else 1.0 / (1.0 + np.exp(-temp * (blended - 0.5))), 0.0, 1.0)
--- a/sim/rl/jax_core/simulation.py
+++ b/sim/rl/jax_core/simulation.py
@@ -0,0 +1,116 @@
+"""Vectorized Markov chain session sampling with JAX."""
+from typing import NamedTuple, Tuple
+import numpy as np
+from functools import partial
+
+try:
+    import jax, jax.numpy as jnp
+    from jax import lax
+    JAX_AVAILABLE = True
+except ImportError:
+    JAX_AVAILABLE = False
+
+from .transitions import TransitionData, N_STATES, TERM_IDX, PURCHASE_IDX, CART_IDX
+
+class SessionBatch(NamedTuple):
+    states: np.ndarray      # (n_sess, max_len) state index at each step; -1 marks padding
+    dwells: np.ndarray      # (n_sess, max_len) dwell time sampled at each step
+    products: np.ndarray    # (n_sess,) product index assigned to each session
+    actors: np.ndarray      # (n_sess,) actor type per session: 0=human, 1=agent
+    lengths: np.ndarray     # (n_sess,) actual session length, i.e. how many leading steps are real
+
+class SimResult(NamedTuple):  # field notes describe how compute_metrics in this module fills them
+    demand_human: np.ndarray  # per-product count of human sessions that purchased (float32)
+    demand_agent: np.ndarray  # per-product count of agent sessions that purchased (float32)
+    revenue: float  # sum of charged prices over all purchasing sessions
+    revenue_oracle: float  # the same purchases valued at base_price
+    agent_loss: float  # sum of (base_price - price) over agent purchases
+    coi: float  # max(0, mean human margin - 0.5 * mean human discount); 0.0 without human purchases
+    look_to_book: float  # count of state-1 (view_item_page) entries / (purchases + 1e-6)
+    mean_sale_price: float  # mean charged price over purchases, 0.0 when there are none
+    n_human_purchases: int  # number of purchasing human sessions
+    n_agent_purchases: int  # number of purchasing agent sessions
+    sessions: SessionBatch  # the batch the metrics were computed from
+
+if JAX_AVAILABLE:  # the jitted sampler only exists when the jax import above succeeded
+    @partial(jax.jit, static_argnums=(5,6,7))  # n_human, n_agent, max_steps are static: they fix array sizes and scan length
+    def _sample_sessions_jax(key, T_human, T_agent, dwell_human, dwell_agent, n_human, n_agent, max_steps):
+        n = n_human + n_agent
+        k1, k2, k3, k4 = jax.random.split(key, 4)  # only k3 is consumed (seeds the scan); k1/k2 are rebound inside step
+        actors = jnp.concatenate([jnp.zeros(n_human, dtype=jnp.int32), jnp.ones(n_agent, dtype=jnp.int32)])  # humans first, then agents
+        T = jnp.where(actors[:,None,None]==0, T_human[None], T_agent[None])  # (n,6,6) per-session transition matrix
+        dwell_p = jnp.where(actors[:,None,None]==0, dwell_human[None], dwell_agent[None])  # (n,6,2) per-session dwell params
+
+        def step(carry, _):  # advances all n sessions by one transition
+            s, active, k = carry
+            k, k1, k2 = jax.random.split(k, 3)  # fresh subkeys for the transition draw and the dwell draw
+            probs = T[jnp.arange(n), s]  # (n,6) row of each session's matrix for its current state
+            nxt = jax.random.categorical(k1, jnp.log(probs + 1e-10))  # epsilon keeps the log finite for zero probabilities
+            nxt = jnp.where(active, nxt, -1)  # sessions that already stopped emit -1 padding
+            shape = dwell_p[jnp.arange(n), s, 0]
+            scale = dwell_p[jnp.arange(n), s, 1]
+            dwell = jnp.maximum(0.3, jax.random.gamma(k2, shape) * scale)  # drawn from the params of current state s, floored at 0.3
+            still = active & (nxt != TERM_IDX) & (nxt >= 0)  # stays active until TERM_IDX is drawn
+            return (nxt, still, k), (nxt, dwell)  # NOTE(review): emits the NEXT state next to the dwell of the state being left — confirm alignment
+
+        init = (jnp.zeros(n, dtype=jnp.int32), jnp.ones(n, dtype=jnp.bool_), k3)  # every session starts active in state 0
+        _, (states, dwells) = lax.scan(step, init, None, length=max_steps)
+        states, dwells = states.T, dwells.T  # (n, max_steps)
+        is_term = (states == -1) | (states == TERM_IDX)
+        lengths = jnp.argmax(is_term, axis=1) + 1  # index of the first terminal/padding entry, counted inclusively
+        lengths = jnp.where(jnp.any(is_term, axis=1), lengths, max_steps)  # sessions with no terminal entry use the full horizon
+        return states, dwells, actors, lengths
+
+def sample_sessions(key, trans: TransitionData, n_human: int, n_agent: int, n_products: int, max_steps: int = 40) -> SessionBatch:
+    """Sample n_human + n_agent sessions. key: JAX PRNG key; in the numpy fallback an int seed or an indexable whose first entry is the seed."""
+    if JAX_AVAILABLE:
+        k1, k2 = jax.random.split(key)
+        states, dwells, actors, lengths = _sample_sessions_jax(k1, trans.human_T, trans.agent_T, trans.human_dwell, trans.agent_dwell, n_human, n_agent, max_steps)
+        products = jax.random.randint(k2, (n_human + n_agent,), 0, n_products)
+        return SessionBatch(np.asarray(states), np.asarray(dwells), np.asarray(products), np.asarray(actors), np.asarray(lengths))
+    # numpy fallback: a plain integer key used to be ignored (every run got seed 42); abs() because default_rng rejects negatives
+    seed = abs(int(key)) if isinstance(key, (int, np.integer)) else (int(key[0]) if hasattr(key, '__getitem__') else 42)
+    rng = np.random.default_rng(seed)
+    n = n_human + n_agent
+    actors = np.concatenate([np.zeros(n_human, dtype=np.int32), np.ones(n_agent, dtype=np.int32)])
+    products = rng.integers(0, n_products, size=n)
+    states, dwells = np.full((n, max_steps), -1, dtype=np.int32), np.zeros((n, max_steps), dtype=np.float32)
+    lengths = np.zeros(n, dtype=np.int32)
+    for i in range(n):
+        T, dp = (trans.human_T, trans.human_dwell) if actors[i] == 0 else (trans.agent_T, trans.agent_dwell)
+        s, t = 0, 0
+        while t < max_steps and s != TERM_IDX:  # the terminal state itself is never recorded
+            states[i, t] = s
+            dwells[i, t] = max(0.3, rng.gamma(dp[s, 0], dp[s, 1]))
+            s, t = rng.choice(N_STATES, p=T[s]), t + 1
+        lengths[i] = t
+    return SessionBatch(states, dwells, products, actors, lengths)
+
+def compute_metrics(batch: SessionBatch, prices: np.ndarray, unit_cost: np.ndarray, base_price: np.ndarray) -> SimResult:
+    """Aggregate one sampled batch into demand, revenue and diagnostic metrics.
+
+    A session counts as one purchase of its product if it ever reaches
+    PURCHASE_IDX. prices, unit_cost and base_price are indexed by product.
+    """
+    bought = np.any(batch.states == PURCHASE_IDX, axis=1)
+    by_human = bought & (batch.actors == 0)
+    by_agent = bought & (batch.actors == 1)
+    demand_h = np.bincount(batch.products[by_human], minlength=len(prices)).astype(np.float32)
+    demand_a = np.bincount(batch.products[by_agent], minlength=len(prices)).astype(np.float32)
+    sold, sold_h, sold_a = batch.products[bought], batch.products[by_human], batch.products[by_agent]
+    revenue = float(np.sum(prices[sold]))
+    revenue_oracle = float(np.sum(base_price[sold]))  # the same sales valued at base price
+    # agent loss: total discount below base price captured by agent purchases
+    agent_loss = float(np.sum(base_price[sold_a] - prices[sold_a]))
+    coi = 0.0
+    if len(sold_h) > 0:
+        # COI: mean human margin less half the mean discount, floored at zero
+        margin = float(np.mean(prices[sold_h] - unit_cost[sold_h]))
+        premium = float(np.mean(base_price[sold_h] - prices[sold_h]))
+        coi = max(0.0, margin - premium * 0.5)
+    n_purch = int(bought.sum())
+    views = float(np.sum(batch.states == 1))  # index 1 = view_item_page
+    look_to_book = views / (n_purch + 1e-6)  # epsilon avoids dividing by zero purchases
+    mean_sale = float(np.mean(prices[sold])) if n_purch > 0 else 0.0
+    return SimResult(demand_h, demand_a, revenue, revenue_oracle, agent_loss, coi, look_to_book, mean_sale,
+                     int(by_human.sum()), int(by_agent.sum()), batch)
--- a/sim/rl/jax_core/transitions.py
+++ b/sim/rl/jax_core/transitions.py
@@ -0,0 +1,47 @@
+"""Dense transition matrices for JAX Markov chain sampling."""
+from dataclasses import dataclass
+import numpy as np
+
+try:
+    import jax.numpy as jnp
+    JAX_AVAILABLE = True
+except ImportError:
+    jnp, JAX_AVAILABLE = np, False
+
+STATES = ["session_start", "view_item_page", "learn_more_about_item", "add_item_to_cart", "purchase_complete", "session_end"]
+S2I = {name: idx for idx, name in enumerate(STATES)}  # state name -> dense matrix index
+N_STATES, TERM_IDX, PURCHASE_IDX, CART_IDX = len(STATES), S2I["session_end"], S2I["purchase_complete"], S2I["add_item_to_cart"]
+
+@dataclass
+class TransitionData:
+    human_T: np.ndarray   # (6,6) human transition probabilities
+    agent_T: np.ndarray   # (6,6) agent transition probabilities
+    human_dwell: np.ndarray  # (6,2) per-state (shape, scale) dwell parameters, humans
+    agent_dwell: np.ndarray  # (6,2) per-state (shape, scale) dwell parameters, agents
+
+    def to_jax(self):
+        """Return a copy holding jax arrays, or self unchanged when JAX is unavailable."""
+        return TransitionData(jnp.array(self.human_T), jnp.array(self.agent_T), jnp.array(self.human_dwell), jnp.array(self.agent_dwell)) if JAX_AVAILABLE else self
+
+def dict_to_dense(d):
+    """Row-normalised (N_STATES, N_STATES) float32 matrix from {src: {dst: prob}}; names not in STATES are skipped."""
+    m = np.zeros((N_STATES, N_STATES), dtype=np.float32)
+    for src, dsts in d.items():
+        if (i := S2I.get(src)) is None: continue
+        for j, prob in ((S2I[name], value) for name, value in dsts.items() if name in S2I): m[i, j] = prob
+    m /= np.maximum(m.sum(1, keepdims=True), 1e-8)
+    m[TERM_IDX] = 0; m[m.sum(1) == 0, TERM_IDX] = 1.0  # terminal row is absorbing; a state left with no mass ends the session instead of being an invalid all-zero row
+    return m
+
+def compile_transitions(human_profile, agent_profile):
+    """Build TransitionData from each profile's .transitions and .dwell_params; states without dwell params get (2.0, 1.0)."""
+    to_dwell = lambda params: np.array([params.get(s, (2.0, 1.0)) for s in STATES], dtype=np.float32).reshape(N_STATES, 2)
+    return TransitionData(*(dict_to_dense(p.transitions) for p in (human_profile, agent_profile)), *(to_dwell(p.dwell_params) for p in (human_profile, agent_profile)))
+
+def fallback_transitions():
+    """Hard-coded human and agent chains, each with (2.0, 1.0) dwell parameters for every state."""
+    human = {"session_start": {"view_item_page": 0.85, "session_end": 0.15}, "view_item_page": {"learn_more_about_item": 0.4, "add_item_to_cart": 0.3, "view_item_page": 0.2, "session_end": 0.1},
+             "learn_more_about_item": {"add_item_to_cart": 0.5, "view_item_page": 0.3, "session_end": 0.2}, "add_item_to_cart": {"purchase_complete": 0.6, "view_item_page": 0.25, "session_end": 0.15}, "purchase_complete": {"session_end": 1.0}}
+    agent = {"session_start": {"view_item_page": 0.9, "session_end": 0.1}, "view_item_page": {"learn_more_about_item": 0.5, "add_item_to_cart": 0.25, "view_item_page": 0.15, "session_end": 0.1},
+             "learn_more_about_item": {"add_item_to_cart": 0.4, "view_item_page": 0.4, "session_end": 0.2}, "add_item_to_cart": {"purchase_complete": 0.5, "view_item_page": 0.3, "session_end": 0.2}, "purchase_complete": {"session_end": 1.0}}
+    return TransitionData(dict_to_dense(human), dict_to_dense(agent), *(np.tile(np.array([2.0, 1.0], dtype=np.float32), (N_STATES, 1)) for _ in range(2)))
--- a/sim/rl/train.py
+++ b/sim/rl/train.py
@@ -4,16 +4,17 @@ from pathlib import Path
 from typing import Dict, Type, Optional
 import pickle
 from torch.utils.tensorboard import SummaryWriter
-from environment import PHANTOMEnv, BusinessLogicConstraints
+from sim.rl.environment import PHANTOMEnv, BusinessLogicConstraints

 logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
 logger = logging.getLogger(__name__)

 try:
-    from engine import (BasePricingEngine, WildPricingEngine, StaticPricingEngine,
+    from sim.rl.engine import (BasePricingEngine, WildPricingEngine, StaticPricingEngine,
                       SimpleDemandEngine, RandomWalkEngine, ThompsonSamplingEngine)
-except ImportError:
+except ImportError as e:
    BasePricingEngine = None  # engines not required for basic usage
+    print(e)


 """
@@ -36,27 +37,49 @@ class EngineTrainer:
        self.global_step = 0

    def train(self, n_episodes: int, seed: int = 42):
-        obs, _ = self.env.reset(seed=seed)
-        prices = None
        for ep in range(n_episodes):
-            prices = self.engine.compute_prices(prices, obs)
-            obs, reward, done, _, info = self.env.step(prices)
+            obs, _ = self.env.reset(seed=seed + ep)
+            self.engine.reset()
+            done = False
+            prev_prices = obs["elasticity"]["price"]
+            episode_reward = 0.0
+            last_info: Dict[str, float] = {}
+            while not done:
+                action_prices = self.engine.compute_prices(prev_prices, obs)
+                obs, reward, done, _, info = self.env.step(action_prices)
                self.engine.update(obs, reward, done, info)
+                episode_reward += reward
+                prev_prices = obs["elasticity"]["price"]
+                last_info = info
+                if self.tb_writer:
+                    self.tb_writer.add_scalar("reward/step", reward, self.global_step)
+                    if "coi" in info:
+                        self.tb_writer.add_scalar("diagnostics/coi", info["coi"], self.global_step)
+                    if "alpha_hat" in info:
+                        self.tb_writer.add_scalar("diagnostics/alpha_hat", info["alpha_hat"], self.global_step)
+                self.global_step += 1
+            last_info = dict(last_info)
+            last_info.update({"episode_reward": episode_reward, "episode": ep})
+            self.episode_metrics.append(last_info)
+            if self.tb_writer:
+                self.tb_writer.add_scalar("reward/episode", episode_reward, ep)
        return self

    def run_episode(self, seed: int = 42) -> Dict:
        """run single evaluation episode and return metrics"""
        obs, _ = self.env.reset(seed=seed)
        self.engine.reset()
-        total_reward, prices = 0.0, None
+        total_reward = 0.0
+        prev_prices = obs["elasticity"]["price"]
        ep_metrics = {'total_reward': 0.0}
        done = False
        while not done:
-            prices = self.engine.compute_prices(prices, obs) if prices is not None else obs["elasticity"]["price"]
-            obs, reward, done, _, info = self.env.step(prices)
+            action_prices = self.engine.compute_prices(prev_prices, obs)
+            obs, reward, done, _, info = self.env.step(action_prices)
            total_reward += reward
            for k, v in info.items():
                ep_metrics[k] = v
+            prev_prices = obs["elasticity"]["price"]
        ep_metrics['total_reward'] = total_reward
        return ep_metrics

@@ -106,7 +129,7 @@ if __name__ == "__main__":
        logger.error("Engines not available, cannot run training")
        exit(1)

-    base_dir = Path("./runs")
+    base_dir = Path("./sim/rl/runs")
    base_dir.mkdir(exist_ok=True)

    engines = {
--- a/sim/strong_learner/data.py
+++ b/sim/strong_learner/data.py
@@ -1,4 +1,9 @@
-import os, requests, py7zr
+import os
+import requests
+try:
+    import py7zr  # type: ignore
+except ImportError:  # pragma: no cover - optional dependency
+    py7zr = None
 import pandas as pd
 from typing import Generator
 try:
@@ -22,12 +27,16 @@ class YooChooseLoader(Loader):
        self.entries = list(self.data.keys())

    def _setup(self):
+        if py7zr is None:
+            raise RuntimeError("py7zr is required to unpack YooChoose dataset. Install py7zr first.")
        os.makedirs(self.root, exist_ok=True)
        zip_path = f"{self.root}/temp.7z"
        with requests.get(self.URL, stream=True) as r:
            with open(zip_path, 'wb') as f:
-                for chunk in r.iter_content(8192): f.write(chunk)
-        with py7zr.SevenZipFile(zip_path, 'r') as z: z.extractall(self.root)
+                for chunk in r.iter_content(8192):
+                    f.write(chunk)
+        with py7zr.SevenZipFile(zip_path, 'r') as z:
+            z.extractall(self.root)
        os.remove(zip_path)

    def _make_interaction(self, sid: str, ts: str, item_id: str, event: str, page: str, meta: dict) -> InteractionModel:
--- a/tests/e2e/.env.example
+++ b/tests/e2e/.env.example
@@ -0,0 +1,7 @@
+WEB_URL=http://localhost:3000
+BACKEND_URL=http://localhost:5000
+PRICING_PROVIDER_URL=http://localhost:5001
+AIRFLOW_URL=http://localhost:8085
+AIRFLOW_USER=admin
+AIRFLOW_PASS=admin
+HEADLESS=true
				`@@ -0,0 +1,2 @@`
				`"""Case-specific simulations and experiments."""`
				`@@ -0,0 +1,2 @@`
				`"""Minimal thesis-aligned pricing simulation (self-contained)."""`