Merge pull request #51 from velocitatem/feat-strong-learning-implementation-with-data-contamination

Feat strong learning implementation with data contamination
2026-07-15 17:43:36 +00:00 · 2026-01-31 10:15:09 +01:00
parent 72877439ca 13959e4b28
commit 9843c5deab
33 changed files with 2828 additions and 328 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -9,7 +9,11 @@
 *.old
 **/package-lock.json
 **/*.parquet
 **/_build/
 paper/src/bib/auto
 =======
 **/_build/
 paper/src/auto/*
 paper/src/bib/auto
 docs/goals/*.md
@@ -24,3 +28,5 @@ sim/rl/behavior_loader/*.png
 sim/rl/behavior_loader/*.svg
 sim/rl/behavior_loader/*.pdf
 tests/e2e/node_modules/**
 lab/case/thesis/runs*/
 sim/case/thesis_simplified/runs*/
--- a/engine/engine.py
+++ b/engine/engine.py
@@ -0,0 +1,66 @@
 from sys import platform
 import numpy as np
 from .lib.demand import generate_demand, estimate_demand
 from .lib.behavior import sample_behavior
 from logging import INFO, getLogger
 logger = getLogger(__name__)
 logger.setLevel(INFO)
 class MarketEngine():
    def __init__(self,
                 alpha = 0.5,
                 N = 100,
                 demand_distribution = (50, 10),
                 demand_sampling_function = np.random.normal):
        self.Nagents = int(N*alpha)
        self.Nhumans = int(N*(1-alpha))
        self.demand = (demand_sampling_function, demand_distribution)
    def act(self, prices):
        demand = generate_demand(prices, *self.demand)
        sample_n = lambda n, human: [sample_behavior(demand, human=human) for _ in range(n)]
        human_t, agent_t = sample_n(self.Nhumans, True), sample_n(self.Nagents, False)
        trajectories = human_t + agent_t
        demand_estimate = estimate_demand(trajectories)
        return demand_estimate
    def measure(self):
        pass
 class PricingEngine():
    def __init__(self,
                 ) -> None:
        pass
    def act(self, demand):
        return np.random.uniform(low=25, high=100, size=10)
 class Limbo():
    def __init__(self,
                 platform,
                 market
                 ) -> None:
        self.platform_turn = True
        self.platform = platform
        self.market = market
        self.output = None
    def step(self):
        # we could code golf this a little bit
        if self.platform_turn:
            self.output = self.platform.act(self.output)
        else:
            self.output = self.market.act(self.output)
        print(self.output)
        self.platform_turn = not self.platform_turn
 if __name__ == "__main__":
    platform = PricingEngine()
    market = MarketEngine()
    limbo = Limbo(platform, market)
    for _ in range(10):
        limbo.step()
--- a/engine/lib/init.py
+++ b/engine/lib/init.py
@@ -0,0 +1,3 @@
 from .demand import generate_demand, estimate_demand
 from .behavior import sample_behavior
 from .render import DashboardRenderer, style_axis
--- a/engine/lib/behavior.py
+++ b/engine/lib/behavior.py
@@ -0,0 +1,47 @@
 from sim.rl.behavior_loader.models import BehaviorModel, AgentBehaviorModel, aggregate_event_transitions
 import pandas as pd
 import numpy as np
 from .demand import generate_demand
 base_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments"
 human_dir, agent_dir = f"{base_dir}/collected_data/", f"{base_dir}/agents/collected_data/"
 _cache = {}  # lazy cache for models and base pivots
 def _get_base_pivot(human: bool):
    key = 'human' if human else 'agent'
    if key not in _cache:
        model = BehaviorModel(human_dir) if human else AgentBehaviorModel(agent_dir)
        mdp = model.build_MDP()
        _cache[key] = pd.DataFrame(aggregate_event_transitions(mdp)).fillna(0.0)
    return _cache[key]
 def adjust_behavior_to_condition(condition, transition_matrix):
    # expand NxN transition matrix to (N*P)x(N*P) weighted by demand condition
    cond_norm = condition / np.sum(condition)
    n_products = len(condition)
    base_vals = transition_matrix.values
    base_cols, base_rows = transition_matrix.columns.tolist(), transition_matrix.index.tolist()
    # expand via kronecker-like tiling: each cell becomes a P*P block weighted by outer product of cond_norm
    expanded = np.kron(base_vals, np.outer(cond_norm, cond_norm))
    new_cols = [f"{c}_product{p}" for c in base_cols for p in range(n_products)]
    new_rows = [f"{r}_product{p}" for r in base_rows for p in range(n_products)]
    return pd.DataFrame(expanded, index=new_rows, columns=new_cols)
 def sample_behavior(condition, human=True, max_len=40):
    base_pivot = _get_base_pivot(human)
    adjusted_transitions = adjust_behavior_to_condition(condition, base_pivot)
    trajectory = [np.random.choice(adjusted_transitions.index)]
    while len(trajectory) < max_len or 'checkout' in trajectory[-1]:
        probs = adjusted_transitions.loc[trajectory[-1]].values
        sample = np.random.choice(adjusted_transitions.columns, p=probs/np.sum(probs) if np.sum(probs) > 0 else None)
        trajectory.append(sample)
    return trajectory
 if __name__ == "__main__":
    t=sample_behavior(generate_demand(np.array([10,20,30])), human=True)
    print(t)
    t=sample_behavior(generate_demand(np.array([10,20,30])), human=False)
    print(t)
--- a/engine/lib/demand.py
+++ b/engine/lib/demand.py
@@ -0,0 +1,45 @@
 import logging
 import numpy as np
 from logging import getLogger
 logger = getLogger(__name__)
 def generate_demand(prices, distribution_method = np.random.normal, distribution_params = (50.0, 10.0)):
    # assumption 1: each product has an intrinsic valuation drawn from a normal distribution centered at 50
    product_valuations = distribution_method(*distribution_params, size=len(prices))
    # assumption 2: demand decreases as price increases, following a simple linear model
    demand = np.maximum(0, product_valuations - prices) # demand cannot be negative
    total = np.sum(demand)
    demand = demand / total * 100 if total > 0 else demand  # normalize to percentage, avoid div by zero
    logger.info(f"Generated demand for prices {prices}: {demand} with valuations from distribution {distribution_params}")
    return demand
 def estimate_demand(trajectories):
    demand_estimate = {}
    for traj in trajectories:
        for event in traj:
            if 'view_product' in event:
                product_id = int(event.split('_')[-1].replace('product', ''))
                demand_estimate[product_id] = demand_estimate.get(product_id, 0) + 1
    total_views = sum(demand_estimate.values())
    for product_id in demand_estimate:
        demand_estimate[product_id] = (demand_estimate[product_id] / total_views) * 100  # normalize to percentage
    return demand_estimate
 # Example usage
 if __name__ == "__main__":
    np.random.seed(42)
    prices = np.array([20.0, 35.0, 50.0, 65.0])
    demand = generate_demand(prices)
    print("Generated Demand:", demand)
    from .behavior import sample_behavior
    N, alphat =200, 0.1
    trajectories = []
    for _ in range(int(N*(1 - alphat))):
        trajectories.append(sample_behavior(demand, human=True))
    for _ in range(int(N*alphat)):
        trajectories.append(sample_behavior(demand, human=False))
    demand_estimate = estimate_demand(trajectories)
    print("Estimated Demand from Behavior:", demand_estimate)
    delta = {k: demand_estimate.get(k, 0) - demand[i] for i, k in enumerate(range(len(prices)))}
    delta = np.mean([np.abs(v) for v in delta.values()])
    print("Demand Delta:", delta)
--- a/engine/lib/render.py
+++ b/engine/lib/render.py
@@ -0,0 +1,126 @@
 """rendering logic for PHANTOM environment dashboard"""
 import numpy as np
 import matplotlib.pyplot as plt
 from matplotlib.gridspec import GridSpec
 def style_axis(ax, title: str = None, xlabel: str = None, ylabel: str = None):
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    if title: ax.set_title(title, fontsize=11, fontweight='bold', pad=8)
    if xlabel: ax.set_xlabel(xlabel, fontsize=9)
    if ylabel: ax.set_ylabel(ylabel, fontsize=9)
 class DashboardRenderer:
    """stateful renderer for PHANTOM market dynamics visualization"""
    def __init__(self):
        self.fig = None
        self.gs = None
    def render(self, env) -> None:
        if self.fig is None:
            plt.ion()
            self.fig = plt.figure(figsize=(14, 10))
            self.gs = GridSpec(3, 3, figure=self.fig, hspace=0.35, wspace=0.3,
                               left=0.07, right=0.95, top=0.92, bottom=0.08)
            plt.show(block=False)
        self.fig.clear()
        self.fig.suptitle(f'PHANTOM  Market Dynamics  [t={env._step_count}, a={env.alpha:.2f}]',
                          fontsize=14, fontweight='bold')
        demand_mat = np.array(env._demand_history).T
        price_mat = np.array(env._price_history).T
        elasticity = env._compute_elasticity()
        self._render_scatter(env)
        self._render_elasticity_bar(env, elasticity)
        self._render_session_pie(env)
        self._render_price_heatmap(price_mat)
        self._render_demand_heatmap(demand_mat)
        self._render_correlation(env.n_products, price_mat, demand_mat)
        self._render_revenue(env)
        self.fig.canvas.draw_idle()
        self.fig.canvas.flush_events()
    def _render_scatter(self, env):
        ax = self.fig.add_subplot(self.gs[0, 0])
        prices_flat = np.array(env._price_history).flatten()
        demands_flat = np.array(env._demand_history).flatten()
        product_ids = np.tile(np.arange(env.n_products), len(env._price_history))
        ax.scatter(prices_flat, demands_flat, c=product_ids, cmap='plasma', alpha=0.6, s=15, edgecolors='none')
        if len(prices_flat) > 1:
            z = np.polyfit(prices_flat, demands_flat, 1)
            p_line = np.linspace(prices_flat.min(), prices_flat.max(), 50)
            ax.plot(p_line, np.polyval(z, p_line), '--', lw=1.5, alpha=0.8)
        style_axis(ax, "Price-Demand Relationship", "Price ($)", "Demand")
    def _render_elasticity_bar(self, env, elasticity):
        ax = self.fig.add_subplot(self.gs[0, 1])
        ax.barh(range(env.n_products), elasticity, alpha=0.8)
        ax.axvline(0, lw=0.8, alpha=0.5)
        ax.axvline(-1, lw=1, ls='--', alpha=0.5)
        ax.set_yticks(range(env.n_products))
        ax.set_yticklabels([f'P{i}' for i in range(env.n_products)], fontsize=7)
        style_axis(ax, "Price Elasticity", "(dQ/dP)(P/Q)", None)
    def _render_session_pie(self, env):
        ax = self.fig.add_subplot(self.gs[0, 2])
        n_h, n_a = env.market.Nhumans, env.market.Nagents
        wedges, _ = ax.pie([n_h, n_a], startangle=90, wedgeprops={'linewidth': 2, 'edgecolor': 'white'})
        ax.legend(wedges, [f'H ({n_h})', f'A ({n_a})'], loc='lower center', fontsize=8,
                  frameon=False, bbox_to_anchor=(0.5, -0.05))
        ax.set_title("Session Mix", fontsize=11, fontweight='bold')
    def _render_price_heatmap(self, price_mat):
        ax = self.fig.add_subplot(self.gs[1, :2])
        im = ax.imshow(price_mat, aspect='auto', cmap='viridis', origin='lower')
        style_axis(ax, "Price Heatmap P(product, t)", "Step", "Product")
        cbar = self.fig.colorbar(im, ax=ax, fraction=0.03, pad=0.02)
        cbar.set_label('$', fontsize=8)
    def _render_demand_heatmap(self, demand_mat):
        ax = self.fig.add_subplot(self.gs[1, 2])
        im = ax.imshow(demand_mat, aspect='auto', cmap='Blues', origin='lower')
        style_axis(ax, "Demand Q(product, t)", "Step", None)
        self.fig.colorbar(im, ax=ax, fraction=0.046, pad=0.02)
    def _render_correlation(self, n_products, price_mat, demand_mat):
        ax = self.fig.add_subplot(self.gs[2, 0])
        if price_mat.shape[1] > 2:
            corr = np.corrcoef(price_mat, demand_mat)[:n_products, n_products:]
            im = ax.imshow(corr, cmap='RdBu', vmin=-1, vmax=1, aspect='auto')
            ax.set_xticks(range(n_products))
            ax.set_yticks(range(n_products))
            ax.set_xticklabels([f'Q{i}' for i in range(n_products)], fontsize=6)
            ax.set_yticklabels([f'P{i}' for i in range(n_products)], fontsize=6)
            self.fig.colorbar(im, ax=ax, fraction=0.046, pad=0.02)
        style_axis(ax, "Price-Demand Correlation", None, None)
    def _render_revenue(self, env):
        ax = self.fig.add_subplot(self.gs[2, 1:])
        n_steps = len(env._revenue_history)
        demand_std = [np.std(d) for d in env._demand_history]
        ax.fill_between(range(n_steps), env._revenue_history, alpha=0.3)
        ax.plot(env._revenue_history, linewidth=2, label='Revenue')
        ax.set_xlim(0, max(n_steps, 1))
        ax.set_ylim(0, max(env._revenue_history) * 1.1 if env._revenue_history else 1)
        ax2 = ax.twinx()
        ax2.plot(range(n_steps), demand_std, linewidth=2, ls='-', alpha=0.9, label='sigma(Demand)')
        d_min, d_max = min(demand_std), max(demand_std)
        margin = (d_max - d_min) * 0.2 if d_max > d_min else 0.5
        ax2.set_ylim(max(0, d_min - margin), d_max + margin)
        ax2.set_ylabel('Demand sigma', fontsize=9)
        style_axis(ax, "Revenue & Demand Dispersion", "Step", "Revenue ($)")
        ax.legend(loc='upper left', fontsize=7, frameon=False)
        ax2.legend(loc='upper right', fontsize=7, frameon=False)
    def close(self):
        if self.fig:
            plt.close(self.fig)
            self.fig = None
--- a/engine/studies/factors.py
+++ b/engine/studies/factors.py
@@ -0,0 +1,34 @@
 """shared factor definitions for experimental designs"""
 import numpy as np
 from dataclasses import dataclass, field
 from typing import Callable, Any
@dataclass
 class Factor:
    name: str
    levels: list
    primary: bool = True  # full cross vs sampled
 # demand functions with compatible signatures
 def demand_linear(mu, sigma, size): return np.maximum(0, np.random.normal(mu, sigma, size))
 def demand_uniform(mu, sigma, size): return np.random.uniform(mu - sigma, mu + sigma, size)
 def demand_exponential(mu, sigma, size): return np.random.exponential(mu, size)
 def demand_logistic(mu, sigma, size): return np.random.logistic(mu, sigma, size)
 DEMAND_FUNCTIONS = {
    "linear": demand_linear,
    "uniform": demand_uniform,
    "exponential": demand_exponential,
    "logistic": demand_logistic,
 }
 FACTORS = [
    Factor("demand_fn", list(DEMAND_FUNCTIONS.keys()), primary=True),
    Factor("alpha", [0.1, 0.3, 0.5, 0.7], primary=True),
    Factor("n_products", [5, 15, 30, 50], primary=True),
    Factor("demand_mu", [30.0, 50.0, 70.0], primary=False),
    Factor("demand_sigma", [5.0, 10.0, 20.0], primary=False),
    Factor("N", [100, 500, 1000], primary=False),
 ]
 SEEDS_PER_CONFIG = 5
--- a/engine/studies/full_factorial.py
+++ b/engine/studies/full_factorial.py
@@ -0,0 +1,89 @@
 """full factorial design - all factor combinations"""
 import sys
 sys.path.insert(0, "..")
 import logging
 from itertools import product
 import json
 import hashlib
 from pathlib import Path
 from concurrent.futures import ProcessPoolExecutor
 from .factors import FACTORS, DEMAND_FUNCTIONS, SEEDS_PER_CONFIG
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 log = logging.getLogger(__name__)
 def generate_configs():
    """generate all factor combinations with seeds"""
    all_levels = [f.levels for f in FACTORS]
    names = [f.name for f in FACTORS]
    configs = []
    for combo in product(*all_levels):
        base = {names[i]: combo[i] for i in range(len(names))}
        for seed in range(SEEDS_PER_CONFIG):
            cfg = {**base, "seed": seed}
            cfg["id"] = hashlib.md5(json.dumps(cfg, sort_keys=True).encode()).hexdigest()[:8]
            configs.append(cfg)
    return configs
 def run_single(cfg: dict) -> dict:
    """execute one experiment config, return metrics"""
    from engine.wrapper import PHANTOM
    import numpy as np
    np.random.seed(cfg["seed"])
    demand_fn = DEMAND_FUNCTIONS[cfg["demand_fn"]]
    env = PHANTOM(
        n_products=cfg["n_products"],
        alpha=cfg["alpha"],
        N=cfg["N"],
    )
    env.market.demand = (demand_fn, (cfg["demand_mu"], cfg["demand_sigma"]))
    obs, _ = env.reset()
    total_reward, steps = 0.0, 0
    for _ in range(100):
        action = env.action_space.sample()
        obs, reward, term, trunc, _ = env.step(action)
        total_reward += reward
        steps += 1
        if term: break
    env.close()
    return {
        "id": cfg["id"],
        "config": cfg,
        "total_reward": total_reward,
        "avg_reward": total_reward / steps if steps > 0 else 0.0,
        "steps": steps,
    }
 def run_study(max_workers: int = None, output: str = "results_full.jsonl"):
    configs = generate_configs()
    log.info(f"full factorial: {len(configs)} configs ({len(configs)//SEEDS_PER_CONFIG} unique × {SEEDS_PER_CONFIG} seeds)")
    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        for i, result in enumerate(ex.map(run_single, configs)):
            results.append(result)
            if (i+1) % 100 == 0: log.info(f"progress: {i+1}/{len(configs)}")
    Path(output).write_text("\n".join(json.dumps(r) for r in results))
    log.info(f"wrote {len(results)} results to {output}")
    return results
 if __name__ == "__main__":
    import argparse
    p = argparse.ArgumentParser()
    p.add_argument("--workers", type=int, default=None)
    p.add_argument("--output", default="results_full.jsonl")
    p.add_argument("--dry-run", action="store_true", help="only show design size")
    args = p.parse_args()
    configs = generate_configs()
    log.info(f"design: {len(configs)} runs | factors: {[f.name for f in FACTORS]} | levels: {[len(f.levels) for f in FACTORS]}")
    if not args.dry_run:
        run_study(args.workers, args.output)
--- a/engine/studies/mixed_lh.py
+++ b/engine/studies/mixed_lh.py
@@ -0,0 +1,106 @@
 """mixed design: full factorial on primary factors, latin hypercube on secondary"""
 import sys
 sys.path.insert(0, "..")
 import logging
 from itertools import product
 import json
 import hashlib
 from pathlib import Path
 from concurrent.futures import ProcessPoolExecutor
 import numpy as np
 from scipy.stats.qmc import LatinHypercube
 from factors import FACTORS, DEMAND_FUNCTIONS, SEEDS_PER_CONFIG
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 log = logging.getLogger(__name__)
 LH_SAMPLES = 10
 def generate_configs(lh_samples: int = LH_SAMPLES):
    primary = [f for f in FACTORS if f.primary]
    secondary = [f for f in FACTORS if not f.primary]
    primary_grid = list(product(*[f.levels for f in primary]))
    lhs = LatinHypercube(d=len(secondary), seed=42)
    configs = []
    for p_combo in primary_grid:
        samples = lhs.random(n=lh_samples)
        for s in samples:
            sec_vals = {
                secondary[i].name: secondary[i].levels[int(s[i] * len(secondary[i].levels))]
                for i in range(len(secondary))
            }
            base = {primary[i].name: p_combo[i] for i in range(len(primary))}
            base.update(sec_vals)
            for seed in range(SEEDS_PER_CONFIG):
                cfg = {**base, "seed": seed}
                cfg["id"] = hashlib.md5(json.dumps(cfg, sort_keys=True).encode()).hexdigest()[:8]
                configs.append(cfg)
    return configs
 def run_single(cfg: dict) -> dict:
    from engine.wrapper import PHANTOM
    import numpy as np
    np.random.seed(cfg["seed"])
    demand_fn = DEMAND_FUNCTIONS[cfg["demand_fn"]]
    env = PHANTOM(
        n_products=cfg["n_products"],
        alpha=cfg["alpha"],
        N=cfg["N"],
    )
    env.market.demand = (demand_fn, (cfg["demand_mu"], cfg["demand_sigma"]))
    obs, _ = env.reset()
    total_reward, steps = 0.0, 0
    for _ in range(100):
        action = env.action_space.sample()
        obs, reward, term, trunc, _ = env.step(action)
        total_reward += reward
        steps += 1
        if term: break
    env.close()
    return {
        "id": cfg["id"],
        "config": cfg,
        "total_reward": total_reward,
        "avg_reward": total_reward / steps,
        "steps": steps,
    }
 def run_study(max_workers: int = None, output: str = "results_mixed.jsonl", lh_samples: int = LH_SAMPLES):
    configs = generate_configs(lh_samples)
    n_primary_cells = int(np.prod([len(f.levels) for f in FACTORS if f.primary]))
    log.info(f"mixed LH: {len(configs)} configs ({n_primary_cells} primary × {lh_samples} LH × {SEEDS_PER_CONFIG} seeds)")
    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        for i, result in enumerate(ex.map(run_single, configs)):
            results.append(result)
            if (i+1) % 100 == 0: log.info(f"progress: {i+1}/{len(configs)}")
    Path(output).write_text("\n".join(json.dumps(r) for r in results))
    log.info(f"wrote {len(results)} results to {output}")
    return results
 if __name__ == "__main__":
    import argparse
    p = argparse.ArgumentParser()
    p.add_argument("--workers", type=int, default=None)
    p.add_argument("--output", default="results_mixed.jsonl")
    p.add_argument("--lh-samples", type=int, default=10)
    p.add_argument("--dry-run", action="store_true", help="only show design size")
    args = p.parse_args()
    primary = [f for f in FACTORS if f.primary]
    secondary = [f for f in FACTORS if not f.primary]
    configs = generate_configs(args.lh_samples)
    log.info(f"design: {len(configs)} runs | primary: {[f.name for f in primary]} | secondary (LH): {[f.name for f in secondary]}")
    if not args.dry_run:
        run_study(args.workers, args.output, args.lh_samples)
--- a/engine/train.py
+++ b/engine/train.py
@@ -0,0 +1,45 @@
 from stable_baselines3 import SAC
 from stable_baselines3.common.callbacks import EvalCallback, BaseCallback
 from .wrapper import PHANTOM
 class RenderCallback(BaseCallback):
    """Renders environment on every step for live visualization."""
    def __init__(self, env: PHANTOM):
        super().__init__()
        self.env = env
    def _on_step(self) -> bool:
        self.env.render()
        return True
 env = PHANTOM(n_products=10, alpha=0.3, render_mode="human")
 eval_env = PHANTOM(n_products=10, alpha=0.3, render_mode=None)
 model = SAC(
    "MultiInputPolicy",
    env,
    verbose=1,
    learning_rate=3e-4,
    buffer_size=50000,
    batch_size=256,
    tau=0.005,
    gamma=0.99,
 )
 render_cb = RenderCallback(env)
 eval_cb = EvalCallback(eval_env, eval_freq=1000, n_eval_episodes=5, verbose=1)
 model.learn(total_timesteps=50000, callback=[render_cb, eval_cb])
 model.save("phantom_sac")
 # test trained policy
 env = PHANTOM(n_products=10, alpha=0.3, render_mode="human")
 obs, _ = env.reset()
 for _ in range(100):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, term, trunc, _ = env.step(action)
    env.render()
    if term or trunc: break
 env.close()
--- a/engine/wrapper.py
+++ b/engine/wrapper.py
@@ -0,0 +1,118 @@
 import gymnasium as gym
 from gymnasium import spaces
 import numpy as np
 from .engine import Limbo, MarketEngine, PricingEngine
 from .lib.render import DashboardRenderer
 class PHANTOM(gym.Env):
    """Gymnasium wrapper for the Limbo pricing-market simulation. Platform sets prices, market responds with demand."""
    metadata = {"render_modes": ["human", "ansi"]}
    def __init__(self,
                 n_products: int = 10,
                 alpha: float = 0.3,
                 N: int = 100,
                 price_bounds: tuple = (10.0, 150.0),
                 lambda_coi: float = 0.1,
                 render_mode: str = None):
        super().__init__()
        self.n_products = n_products
        self.price_bounds = price_bounds
        self.lambda_coi = lambda_coi
        self.render_mode = render_mode
        self.alpha = alpha
        self.N = N
        self.market = MarketEngine(alpha=alpha, N=N)
        self._platform_stub = PricingEngine()
        self._limbo = Limbo(self._platform_stub, self.market)
        self.action_space = spaces.Box(
            low=price_bounds[0], high=price_bounds[1],
            shape=(n_products,), dtype=np.float32
        )
        self.observation_space = spaces.Dict({
            "demand": spaces.Box(low=0.0, high=100.0, shape=(n_products,), dtype=np.float32),
            "prices": spaces.Box(low=price_bounds[0], high=price_bounds[1], shape=(n_products,), dtype=np.float32),
        })
        self._prices = None
        self._demand = None
        self._step_count = 0
        self._demand_history = []
        self._price_history = []
        self._revenue_history = []
        self._renderer = None
    def _get_obs(self) -> dict:
        demand_arr = np.array([self._demand.get(i, 0.0) for i in range(self.n_products)], dtype=np.float32)
        return {"demand": demand_arr, "prices": self._prices.astype(np.float32)}
    def _compute_reward(self, prices: np.ndarray, demand: dict) -> float:
        revenue = np.sum(prices * np.array([demand.get(i, 0.0) for i in range(self.n_products)]))
        # TODO: implement supra-competitive price punishment
        return float(revenue)
    def _record_history(self):
        demand_arr = np.array([self._demand.get(i, 0.0) for i in range(self.n_products)])
        self._demand_history.append(demand_arr)
        self._price_history.append(self._prices.copy())
        self._revenue_history.append(np.sum(self._prices * demand_arr))
    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self._prices = np.random.uniform(*self.price_bounds, size=self.n_products)
        self._demand = self.market.act(self._prices)
        self._step_count = 0
        self._demand_history, self._price_history, self._revenue_history = [], [], []
        self._record_history()
        return self._get_obs(), {}
    def step(self, action: np.ndarray):
        self._prices = np.clip(action, *self.price_bounds)
        self._demand = self.market.act(self._prices)
        self._step_count += 1
        self._record_history()
        reward = self._compute_reward(self._prices, self._demand)
        terminated = self._step_count >= 100
        return self._get_obs(), reward, terminated, False, {"step": self._step_count}
    def _compute_elasticity(self) -> np.ndarray:
        """point elasticity: e = (dQ/dP) * (P/Q) via finite differences, clipped to [-5, 5]"""
        if len(self._price_history) < 2:
            return np.zeros(self.n_products)
        p, q = np.array(self._price_history), np.array(self._demand_history)
        dp, dq = np.diff(p, axis=0), np.diff(q, axis=0)
        valid = np.abs(dp) > 0.5
        with np.errstate(divide='ignore', invalid='ignore'):
            elasticity = np.where(valid, (dq / dp) * (p[:-1] / np.maximum(q[:-1], 1.0)), 0.0)
            elasticity = np.nan_to_num(np.clip(elasticity, -5.0, 5.0), nan=0.0)
        return np.mean(elasticity, axis=0) if len(elasticity) > 0 else np.zeros(self.n_products)
    def render(self):
        if self.render_mode == "human":
            if self._renderer is None:
                self._renderer = DashboardRenderer()
            self._renderer.render(self)
        elif self.render_mode == "ansi":
            return f"step={self._step_count}, prices={self._prices}, demand={self._demand}"
        return None
    def close(self):
        if self._renderer:
            self._renderer.close()
            self._renderer = None
 if __name__ == "__main__":
    env = PHANTOM(n_products=15, alpha=0.3, N=100, render_mode="human")
    obs, _ = env.reset()
    for step in range(100):
        action = env.action_space.sample()
        obs, reward, term, trunc, info = env.step(action)
        env.render()
        if term: break
    env.close()
--- a/experiments/procesing/contaminator.py
+++ b/experiments/procesing/contaminator.py
@@ -1,7 +1,14 @@
-import pandas as pd
+from __future__ import annotations
-import random
+
 import os
 import random
 from pathlib import Path
 from types import SimpleNamespace
 import pandas as pd
 from lib.separability import estimate_alpha, load_artifacts, score_session
 # use relative import when in package context, fallback for standalone
 try:
@@ -15,6 +22,11 @@ except ImportError:
 PROJECT_ROOT = Path(__file__).parent.parent.parent
 AGENT_DATA_DIR = Path(os.getenv('PHANTOM_AGENT_DATA_DIR', PROJECT_ROOT / "experiments" / "agents" / "collected_data"))
 try:
    SEPARABILITY_ARTIFACTS = load_artifacts()
 except FileNotFoundError:
    SEPARABILITY_ARTIFACTS = None
 def remap_schema(df: pd.DataFrame, mapping: dict, on: str = "event_type") -> pd.DataFrame:
    """remap column values according to mapping dict, preserving unmapped values"""
@@ -23,6 +35,23 @@ def remap_schema(df: pd.DataFrame, mapping: dict, on: str = "event_type") -> pd.
    return df
 def _states_to_events(states: list[str]) -> list[SimpleNamespace]:
    events: list[SimpleNamespace] = []
    for idx, state in enumerate(states):
        parts = state.split("|") if isinstance(state, str) else ["page", "product", str(state)]
        page = f"/{parts[0]}" if parts else "/"
        product = parts[1] if len(parts) > 1 else "unknown"
        event_name = parts[2] if len(parts) > 2 else parts[-1]
        events.append(
            SimpleNamespace(
                eventName=event_name,
                page=page,
                productId=product,
                ts=float(idx),
            )
        )
    return events
 def contaminate_dataset(df: pd.DataFrame, on: str = "event_type",
                        contamination_rate: float = 0.1,
                        agent_data_dir: Path = None) -> pd.DataFrame:
@@ -48,6 +77,8 @@ def contaminate_dataset(df: pd.DataFrame, on: str = "event_type",
    # generate synthetic trajectories
    new_rows = []
    alpha_estimates = []
    for start_event in start_events:
        # sample trajectory from agent model, using a state that contains the event type
        mdp_states = model.mdp.get('states', []) if model.mdp else []
@@ -56,11 +87,28 @@ def contaminate_dataset(df: pd.DataFrame, on: str = "event_type",
            continue  # skip if no matching start state
        start_state = random.choice(matching_starts)
        trajectory = model.sample_traj(start_state, max_len=20)
        score_payload: list[SimpleNamespace] = []
        score: dict[str, float] = {}
        if SEPARABILITY_ARTIFACTS:
            score_payload = _states_to_events(trajectory)
            score = score_session(score_payload, SEPARABILITY_ARTIFACTS)
            alpha_estimates.append(
                estimate_alpha(score["prob_agent"], score["delta_h"], score["delta_a"], temperature=2.0)
            )
        for state in trajectory:
-            parts = state.split('|')  # page|productId|eventName format
+            parts = state.split('|') if isinstance(state, str) else [start_event]
-            new_rows.append({on: parts[-1] if parts else start_event, 'source': 'synthetic_agent'})
+            new_rows.append({
                on: parts[-1] if parts else start_event,
                'source': 'synthetic_agent',
                'prob_agent': score.get('prob_agent') if SEPARABILITY_ARTIFACTS and score_payload else None,
                'delta_h': score.get('delta_h') if SEPARABILITY_ARTIFACTS and score_payload else None,
                'delta_a': score.get('delta_a') if SEPARABILITY_ARTIFACTS and score_payload else None,
            })
    if new_rows:
        contaminate_df = pd.DataFrame(new_rows)
        df = pd.concat([df, contaminate_df], ignore_index=True)
        if alpha_estimates:
            df['estimated_alpha'] = sum(alpha_estimates) / len(alpha_estimates)
    return df
--- a/experiments/procesing/tests/test_demand.py
+++ b/experiments/procesing/tests/test_demand.py
@@ -6,6 +6,7 @@ from procesing.steps import (
 )
 def test_compute_demand(pipeline_context):
    random.seed(42)  # deterministic test
    step = ComputeDemandStep(context=pipeline_context)
    # Test with normal interaction data
@@ -26,6 +27,7 @@ def test_compute_demand(pipeline_context):
 def test_compute_demand_skewed(pipeline_context):
    random.seed(42)  # deterministic test
    step = ComputeDemandStep(context=pipeline_context)
    # Test with normal interaction data
--- a/sim/case/init.py
+++ b/sim/case/init.py
@@ -0,0 +1,2 @@
 """Case-specific simulations and experiments."""
--- a/sim/case/thesis_simplified/init.py
+++ b/sim/case/thesis_simplified/init.py
@@ -0,0 +1,2 @@
 """Minimal thesis-aligned pricing simulation (self-contained)."""
--- a/sim/case/thesis_simplified/coi.py
+++ b/sim/case/thesis_simplified/coi.py
@@ -0,0 +1,125 @@
 """Cost of Information (COI) computation for thesis pricing system.
 Core KPI: COI = E[p_shown] - p_min measures pricing power from information asymmetry.
 Theorem 1 shows COI erodes as agent queries increase: as N->inf, p^(1)->p_min.
 """
 from __future__ import annotations
 from dataclasses import dataclass
 from typing import Dict, List, TYPE_CHECKING
 import numpy as np
 if TYPE_CHECKING:
    from .simplified import Session
@dataclass(frozen=True)
 class COIWindow:
    """Windowed COI metrics computed from realized price exposures.
    policy: E[p_shown] - cost, the definition-level KPI
    agent: E[p^(1)] - cost where p^(1) is min price under agent querying
    leak: max(policy - agent, 0), observable gap from reconnaissance
    survival_ratio: agent/policy, fraction of pricing power retained
    """
    policy: float
    agent: float
    leak: float
    survival_ratio: float
    policy_by_product: np.ndarray
    agent_by_product: np.ndarray
    demand_weights: np.ndarray
 def aggregate_prices(sessions: List["Session"], mode: str = "all") -> Dict[int, List[float] | float]:
    """Unified price aggregation across sessions.
    mode: "all" returns all prices per product, "min_per_session" returns min price per session per product,
          "min_across" returns single min price per product
    """
    if mode == "min_across":
        mins: Dict[int, float] = {}
        for s in sessions:
            for e in s.events:
                pidx, price = int(e.product_idx), float(e.price_seen)
                mins[pidx] = min(mins.get(pidx, price), price)
        return mins
    elif mode == "min_per_session":
        result: Dict[int, List[float]] = {}
        for s in sessions:
            by_p: Dict[int, float] = {}
            for e in s.events:
                pidx, price = int(e.product_idx), float(e.price_seen)
                by_p[pidx] = min(by_p.get(pidx, price), price)
            for pidx, pmin in by_p.items():
                result.setdefault(pidx, []).append(pmin)
        return result
    else:  # "all"
        prices: Dict[int, List[float]] = {}
        for s in sessions:
            for e in s.events:
                prices.setdefault(e.product_idx, []).append(float(e.price_seen))
        return prices
 def demand_weights_by_product(sessions: List["Session"], demand_mapping: Dict[str, float], n_products: int) -> np.ndarray:
    """Compute demand-weighted importance per product."""
    w = np.zeros(n_products, dtype=float)
    sessions_by_id = {s.sid: s for s in sessions}
    for sid, q in demand_mapping.items():
        sess = sessions_by_id.get(sid)
        if sess and sess.events:
            w[int(sess.events[0].product_idx)] += float(q)
    total = float(np.sum(w))
    return (w / total) if total > 0 else w
 def compute_coi_window(sessions: List["Session"], costs: np.ndarray, demand_mapping: Dict[str, float] | None = None) -> COIWindow:
    """Compute COI metrics over session window.
    Aggregates price exposures and computes policy-level vs agent-realized COI.
    """
    n = int(len(costs))
    prices = aggregate_prices(sessions, mode="all")
    agent_sessions = [s for s in sessions if s.actor == "A"]
    agent_min = aggregate_prices(agent_sessions, mode="min_across") if agent_sessions else {}
    policy_by = np.zeros(n, dtype=float)
    agent_by = np.zeros(n, dtype=float)
    seen = np.array([(i in prices) for i in range(n)], dtype=bool)
    agent_seen = np.array([(i in agent_min) for i in range(n)], dtype=bool)
    for pidx, ps in prices.items():
        if 0 <= pidx < n and ps:
            policy_by[pidx] = float(np.mean(ps) - float(costs[pidx]))
    for pidx, pmin in agent_min.items():
        if 0 <= pidx < n:
            agent_by[pidx] = float(pmin - float(costs[pidx]))
    agent_by[seen & ~agent_seen] = policy_by[seen & ~agent_seen]  # no erosion if no agent exposure
    demand_w = demand_weights_by_product(sessions, demand_mapping, n) if demand_mapping else np.zeros(n, dtype=float)
    has_weights = float(np.sum(demand_w)) > 0
    if has_weights:
        policy, agent = float(np.dot(demand_w, policy_by)), float(np.dot(demand_w, agent_by))
    elif np.any(seen):
        policy, agent = float(np.mean(policy_by[seen])), float(np.mean(agent_by[seen]))
    else:
        policy, agent = 0.0, 0.0
    leak = float(max(policy - agent, 0.0))
    survival = float(np.clip(agent / policy, 0.0, 1.0)) if policy > 0 else 0.0
    return COIWindow(policy=policy, agent=agent, leak=leak, survival_ratio=survival,
                     policy_by_product=policy_by, agent_by_product=agent_by, demand_weights=demand_w)
 def coi_erosion(coi_policy: float, coi_agent: float, eps: float = 1e-9) -> float:
    """Thesis-consistent COI erosion: fraction of pricing power destroyed by agent queries.
    erosion = 1 - (COI_agent / COI_policy)
    When agents find low prices, COI_agent -> 0, erosion -> 1.
    """
    if coi_policy <= eps:
        return 0.0
    return float(np.clip(1.0 - (coi_agent / (coi_policy + eps)), 0.0, 1.0))
--- a/sim/case/thesis_simplified/experiments.py
+++ b/sim/case/thesis_simplified/experiments.py
@@ -0,0 +1,325 @@
 """COI leakage experiments and policy comparisons.
 Demonstrates the core thesis contribution: COI erosion under agent contamination
 and recovery via robust pricing policies.
 Generates TensorBoard logs for:
 - COI erosion curves across contamination levels
 - Policy comparison (fixed vs adaptive vs RL)
 - Revenue/margin trade-offs
 """
 from __future__ import annotations
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, List, Tuple
 import json
 import numpy as np
 try:
    from torch.utils.tensorboard import SummaryWriter
    HAS_TB = True
 except ImportError:
    HAS_TB = False
 from .simplified_env import PricingEnv, EnvConfig, make_env
 from .simplified import System
@dataclass
 class ExperimentResult:
    """Container for experiment metrics."""
    name: str
    alpha: float
    reward_mean: float
    reward_std: float
    coi_erosion: float
    alpha_error: float
    revenue: float
    margin: float
    def to_dict(self) -> dict:
        return {k: getattr(self, k) for k in self.__dataclass_fields__}
 def theoretical_coi_erosion_curve(alphas: np.ndarray, n_sessions: int = 1000) -> np.ndarray:
    """Theoretical COI erosion from Theorem 1 using order statistic model.
    For N i.i.d. uniform queries on [p_min, p_max]:
    E[p^(1)] = p_min + (p_max - p_min)/(N+1), so erosion = 1 - 2/(N+1)
    """
    erosions = []
    for a in alphas:
        n_agents = max(1, int(a * n_sessions))
        erosions.append(1.0 - 2.0 / (n_agents + 1))
    return np.array(erosions)
 def run_policy_episode(
    env: PricingEnv,
    policy_fn,
    n_episodes: int = 10
 ) -> Tuple[List[float], List[float], List[float], List[float]]:
    """Run policy and collect per-step metrics."""
    rewards, coi_erosions, alpha_errors, revenues = [], [], [], []
    for _ in range(n_episodes):
        obs, info = env.reset()
        done = False
        while not done:
            action = policy_fn(obs, env.n)
            obs, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            rewards.append(reward)
            if 'coi_erosion' in info:
                coi_erosions.append(info['coi_erosion'])
            if 'alpha_true' in info and 'alpha_est' in info:
                alpha_errors.append(abs(info['alpha_true'] - info['alpha_est']))
            if 'revenue' in info:
                revenues.append(info['revenue'])
    return rewards, coi_erosions, alpha_errors, revenues
 class PolicyRegistry:
    """Registry of baseline policies."""
    @staticmethod
    def fixed(obs: np.ndarray, n: int, margin: float = 0.15) -> np.ndarray:
        return np.ones(n, dtype=np.float32) * (1.0 + margin)
    @staticmethod
    def random(obs: np.ndarray, n: int, rng: np.random.Generator = None) -> np.ndarray:
        rng = rng or np.random.default_rng()
        return rng.uniform(0.7, 1.3, n).astype(np.float32)
    @staticmethod
    def adaptive(obs: np.ndarray, n: int, base_margin: float = 0.15) -> np.ndarray:
        """Reduce margins when alpha estimate is high."""
        alpha_est = obs[2 * n] if len(obs) > 2 * n else 0.2
        margin_scale = 1.0 - 0.4 * alpha_est
        return np.ones(n, dtype=np.float32) * (1.0 + base_margin * margin_scale)
    @staticmethod
    def aggressive(obs: np.ndarray, n: int) -> np.ndarray:
        """High margins, ignores contamination."""
        return np.ones(n, dtype=np.float32) * 1.4
    @staticmethod
    def defensive(obs: np.ndarray, n: int) -> np.ndarray:
        """Low margins, always cautious."""
        return np.ones(n, dtype=np.float32) * 1.05
    @staticmethod
    def alpha_proportional(obs: np.ndarray, n: int, max_margin: float = 0.3) -> np.ndarray:
        """Margin inversely proportional to estimated alpha."""
        alpha_est = obs[2 * n] if len(obs) > 2 * n else 0.2
        margin = max_margin * (1.0 - alpha_est)
        return np.ones(n, dtype=np.float32) * (1.0 + margin)
 def run_contamination_sweep(
    alphas: List[float],
    policies: Dict[str, callable],
    n_products: int = 10,
    max_steps: int = 200,
    n_episodes: int = 10,
    seed: int = 42,
    log_dir: str = None
 ) -> Dict[str, List[ExperimentResult]]:
    """Run policies across contamination levels."""
    results = {name: [] for name in policies}
    writer = SummaryWriter(Path(log_dir) / "sweep") if log_dir and HAS_TB else None
    for alpha in alphas:
        print(f"  alpha={alpha:.2f}", end=" ")
        env_cfg = EnvConfig(
            n_products=n_products, max_steps=max_steps,
            alpha_true=alpha, reward_mode="robust", seed=seed)
        env = make_env(env_cfg)
        for name, policy_fn in policies.items():
            rewards, coi_vals, alpha_errs, revenues = run_policy_episode(env, policy_fn, n_episodes)
            result = ExperimentResult(
                name=name, alpha=alpha,
                reward_mean=float(np.mean(rewards)),
                reward_std=float(np.std(rewards)),
                coi_erosion=float(np.mean(coi_vals)) if coi_vals else 0.0,
                alpha_error=float(np.mean(alpha_errs)) if alpha_errs else 0.0,
                revenue=float(np.mean(revenues)) if revenues else 0.0,
                margin=float(np.mean([policy_fn(np.zeros(3 * n_products + 3), n_products)]) - 1.0))
            results[name].append(result)
            if writer:
                step = int(alpha * 100)
                writer.add_scalar(f'{name}/reward', result.reward_mean, step)
                writer.add_scalar(f'{name}/coi_erosion', result.coi_erosion, step)
                writer.add_scalar(f'{name}/alpha_error', result.alpha_error, step)
                writer.add_scalar(f'{name}/revenue', result.revenue, step)
        print(f"done")
    # add theoretical curve
    if writer:
        theo = theoretical_coi_erosion_curve(np.array(alphas))
        for i, (a, e) in enumerate(zip(alphas, theo)):
            writer.add_scalar('theoretical/coi_erosion', e, int(a * 100))
        writer.close()
    return results
 def run_coi_demonstration(log_dir: str = "sim/case/thesis_simplified/runs", seed: int = 42) -> Dict:
    """Main COI demonstration experiment."""
    print("=== COI Leakage Demonstration ===\n")
    Path(log_dir).mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(Path(log_dir) / "coi_demo") if HAS_TB else None
    # theoretical erosion curve
    print("1. Theoretical COI erosion (Theorem 1)")
    alphas = np.linspace(0.0, 0.6, 13)
    theo_erosion = theoretical_coi_erosion_curve(alphas, n_sessions=1000)
    for a, e in zip(alphas, theo_erosion):
        print(f"   alpha={a:.2f} -> erosion={e:.3f}")
        if writer:
            writer.add_scalar('theory/coi_erosion', e, int(a * 100))
    # policy comparison
    print("\n2. Policy comparison across contamination levels")
    policies = {
        'fixed': lambda obs, n: PolicyRegistry.fixed(obs, n),
        'aggressive': PolicyRegistry.aggressive,
        'defensive': PolicyRegistry.defensive,
        'adaptive': PolicyRegistry.adaptive,
        'alpha_proportional': PolicyRegistry.alpha_proportional,
    }
    sweep_alphas = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
    results = run_contamination_sweep(
        sweep_alphas, policies, n_products=10, max_steps=100,
        n_episodes=5, seed=seed, log_dir=log_dir)
    # summarize
    print("\n3. Summary by policy")
    for name, res_list in results.items():
        avg_reward = np.mean([r.reward_mean for r in res_list])
        avg_coi = np.mean([r.coi_erosion for r in res_list])
        print(f"   {name:20s}: avg_reward={avg_reward:.2f}, avg_coi={avg_coi:.3f}")
    # save results
    output = {
        'theoretical': {'alphas': alphas.tolist(), 'erosion': theo_erosion.tolist()},
        'empirical': {name: [r.to_dict() for r in res_list] for name, res_list in results.items()}}
    with open(Path(log_dir) / "coi_demo_results.json", 'w') as f:
        json.dump(output, f, indent=2)
    if writer:
        writer.close()
    print(f"\nResults saved to {log_dir}/coi_demo_results.json")
    print(f"TensorBoard: tensorboard --logdir {log_dir}")
    return output
 def run_reward_mode_comparison(log_dir: str = "sim/case/thesis_simplified/runs", seed: int = 42) -> Dict:
    """Compare different reward modes."""
    print("=== Reward Mode Comparison ===\n")
    Path(log_dir).mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(Path(log_dir) / "reward_modes") if HAS_TB else None
    reward_modes = ["revenue", "profit", "robust", "coi_aware"]
    alpha = 0.3  # moderate contamination
    results = {}
    for mode in reward_modes:
        print(f"  mode={mode}", end=" ")
        env_cfg = EnvConfig(
            n_products=10, max_steps=200, alpha_true=alpha,
            reward_mode=mode, seed=seed)
        env = make_env(env_cfg)
        rewards, coi_vals, _, revenues = run_policy_episode(
            env, PolicyRegistry.adaptive, n_episodes=10)
        results[mode] = {
            'reward_mean': float(np.mean(rewards)),
            'reward_std': float(np.std(rewards)),
            'coi_erosion': float(np.mean(coi_vals)) if coi_vals else 0.0,
            'revenue': float(np.mean(revenues)) if revenues else 0.0}
        if writer:
            for k, v in results[mode].items():
                writer.add_scalar(f'{mode}/{k}', v, 0)
        print(f"reward={results[mode]['reward_mean']:.2f}, coi={results[mode]['coi_erosion']:.3f}")
    if writer:
        writer.close()
    with open(Path(log_dir) / "reward_mode_results.json", 'w') as f:
        json.dump(results, f, indent=2)
    return results
 def run_alpha_drift_experiment(log_dir: str = "sim/case/thesis_simplified/runs", seed: int = 42) -> Dict:
    """Test policy robustness under non-stationary contamination."""
    print("=== Alpha Drift Experiment ===\n")
    Path(log_dir).mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(Path(log_dir) / "alpha_drift") if HAS_TB else None
    drift_rates = [0.0, 0.01, 0.02, 0.05]
    results = {}
    for drift in drift_rates:
        print(f"  drift={drift:.2f}", end=" ")
        env_cfg = EnvConfig(
            n_products=10, max_steps=200, alpha_true=0.2,
            alpha_drift=drift, reward_mode="robust", seed=seed)
        env = make_env(env_cfg)
        rewards, coi_vals, alpha_errs, _ = run_policy_episode(
            env, PolicyRegistry.adaptive, n_episodes=10)
        results[f'drift_{drift}'] = {
            'reward_mean': float(np.mean(rewards)),
            'coi_erosion': float(np.mean(coi_vals)) if coi_vals else 0.0,
            'alpha_tracking_error': float(np.mean(alpha_errs)) if alpha_errs else 0.0}
        if writer:
            for k, v in results[f'drift_{drift}'].items():
                writer.add_scalar(f'drift_{drift}/{k}', v, 0)
        print(f"reward={results[f'drift_{drift}']['reward_mean']:.2f}, "
              f"alpha_err={results[f'drift_{drift}']['alpha_tracking_error']:.3f}")
    if writer:
        writer.close()
    return results
 if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Run COI experiments")
    parser.add_argument("--exp", type=str, default="coi", choices=["coi", "reward", "drift", "all"])
    parser.add_argument("--log-dir", type=str, default="sim/case/thesis_simplified/runs")
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()
    if args.exp == "coi" or args.exp == "all":
        run_coi_demonstration(args.log_dir, args.seed)
    if args.exp == "reward" or args.exp == "all":
        run_reward_mode_comparison(args.log_dir, args.seed)
    if args.exp == "drift" or args.exp == "all":
        run_alpha_drift_experiment(args.log_dir, args.seed)
--- a/sim/case/thesis_simplified/separability.py
+++ b/sim/case/thesis_simplified/separability.py
@@ -0,0 +1,72 @@
 """Behavioral separability for human/agent detection.
 Computes divergence signals delta_H, delta_A from session trajectories using
 transition kernel estimation and KL divergence to prototype behavioral profiles.
 """
 from __future__ import annotations
 from typing import Dict, List, Tuple, TYPE_CHECKING
 import numpy as np
 if TYPE_CHECKING:
    from .simplified import Event, Session
 # prototype behavioral kernels for human vs agent sessions
 TRANS_H = {
    "start": {"view": 0.85, "end": 0.15},
    "view": {"detail": 0.4, "cart": 0.3, "view": 0.2, "end": 0.1},
    "detail": {"cart": 0.5, "view": 0.3, "end": 0.2},
    "cart": {"purchase": 0.6, "view": 0.25, "end": 0.15},
    "purchase": {"end": 1.0},
 }
 TRANS_A = {
    "start": {"view": 0.95, "end": 0.05},
    "view": {"detail": 0.6, "view": 0.25, "cart": 0.1, "end": 0.05},
    "detail": {"view": 0.5, "cart": 0.15, "detail": 0.3, "end": 0.05},
    "cart": {"view": 0.4, "purchase": 0.2, "end": 0.4},
    "purchase": {"end": 1.0},
 }
 def kl_div(p: Dict[str, float], q: Dict[str, float], eps: float = 1e-10) -> float:
    """KL divergence D_KL(p || q) for discrete distributions."""
    keys = set(p.keys()) | set(q.keys())
    return sum(p.get(k, eps) * np.log((p.get(k, eps) + eps) / (q.get(k, eps) + eps)) for k in keys)
 def build_kernel(events: List["Event"]) -> Dict[str, Dict[str, float]]:
    """Build empirical transition kernel T' from trajectory events."""
    trans: Dict[str, Dict[str, int]] = {}
    prev = "start"
    for e in events:
        curr = e.action
        trans.setdefault(prev, {})
        trans[prev][curr] = trans[prev].get(curr, 0) + 1
        prev = curr
    return {s: {d: c / sum(dsts.values()) for d, c in dsts.items()} for s, dsts in trans.items() if sum(dsts.values()) > 0}
 def compute_divergence(session: "Session") -> Tuple[float, float]:
    """Compute divergence signals delta_H, delta_A for session.
    delta_H = mean KL(T' || T_H) across states, measures distance to human prototype
    delta_A = mean KL(T' || T_A) across states, measures distance to agent prototype
    """
    kernel = build_kernel(session.events)
    if not kernel:
        return 0.5, 0.5
    delta_h = sum(kl_div(kernel.get(s, {}), TRANS_H.get(s, {})) for s in kernel) / len(kernel)
    delta_a = sum(kl_div(kernel.get(s, {}), TRANS_A.get(s, {})) for s in kernel) / len(kernel)
    return delta_h, delta_a
 def estimate_alpha(session: "Session", beta: float = 2.0) -> float:
    """Per-session contamination estimate alpha_hat = sigma(beta*(delta_H - delta_A)).
    Returns probability session is agent-generated based on behavioral divergence.
    """
    dh, da = compute_divergence(session)
    if (dh + da) <= 0:
        return 0.5
    return 1.0 / (1.0 + np.exp(-beta * (dh - da)))
--- a/sim/case/thesis_simplified/simplified.py
+++ b/sim/case/thesis_simplified/simplified.py
@@ -0,0 +1,219 @@
 """Minimal implementation of thesis pricing system.
 Implements the core loop: prices -> sessions -> demand -> prices
 with behavioral separability and robust pricing objective.
 Objects:
 - Session trajectories tau_s from mixture of H/A behavioral profiles
 - Demand proxy q_hat via weighted action aggregation
 - COI leakage penalty for agent reconnaissance
 - Limbo: alternating price/demand history for trajectory analysis
 """
 from __future__ import annotations
 from dataclasses import dataclass, field
 from typing import Dict, List, Tuple
 import numpy as np
 from .coi import COIWindow, compute_coi_window
 from .separability import TRANS_H, TRANS_A, kl_div, build_kernel, compute_divergence, estimate_alpha
 ACTION_WEIGHTS = {"add_to_cart": 0.8, "checkout": 0.9, "purchase": 1.0, "view": 0.15, "detail": 0.25, "hover": 0.3, "start": 0.05, "end": 0.0}
@dataclass
 class Event:
    action: str
    product_idx: int
    price_seen: float
    ts: float
@dataclass
 class Session:
    sid: str
    events: List[Event]
    actor: str  # H or A (ground truth label)
    theta: Dict[str, float] = field(default_factory=dict)
 def compute_demand(session: Session) -> float:
    """Compute demand proxy q_hat = sum_k omega(a_k) for session."""
    return sum(ACTION_WEIGHTS.get(e.action, 0.1) for e in session.events)
 def sample_trajectory(rng: np.random.Generator, trans: Dict, prices: np.ndarray, costs: np.ndarray, theta: Dict[str, float],
                      is_agent: bool, session_noise: float = 0.02, surge: float = 0.08, max_mult: float = 1.8) -> Tuple[List[Event], int]:
    """Sample session trajectory from behavioral kernel."""
    pidx = int(rng.integers(0, len(prices)))
    cost, base = float(costs[pidx]), float(prices[pidx]) * (1.0 + rng.normal(0.0, session_noise))
    base = float(np.clip(base, cost * 1.01, float(prices[pidx]) * 2.0))
    price, signal, state, t = base, 0.0, "start", 0.0
    events = []
    while state != "end" and len(events) < 30:
        probs = trans.get(state, {"end": 1.0})
        nxt = rng.choice(list(probs.keys()), p=list(probs.values()))
        if nxt == "purchase":  # purchase conversion check
            rel = max((price - cost) / (cost + 1e-6), 0.0)
            p_buy = float(np.clip(theta.get("base_conv", 0.2) * np.exp(-theta.get("price_sens", 2.0) * rel), 0.0, 1.0))
            if rng.random() > p_buy:
                nxt = "end"
        state = nxt
        if state not in {"start", "end"}:
            events.append(Event(action=state, product_idx=pidx, price_seen=float(price), ts=t))
            signal += float(ACTION_WEIGHTS.get(state, 0.1))
            price = float(np.clip(base * (1.0 + surge * signal), cost * 1.01, base * max_mult))
        t += max(0.2, rng.gamma(1.5, 0.8) if is_agent else rng.gamma(2.0, 1.2))
    return events, pidx
 def put_prices_to_market(prices: np.ndarray, costs: np.ndarray, alpha: float = 0.2, n_sessions: int = 50,
                         seed: int | None = None) -> Tuple[List[Session], Dict[str, float]]:
    """Generate sessions from mixture model. Returns sessions and demand mapping sid -> q_hat."""
    rng = np.random.default_rng(seed)
    sessions, demand = [], {}
    for i in range(n_sessions):
        sid = f"s{i:04d}"
        is_agent = rng.random() < alpha
        trans = TRANS_A if is_agent else TRANS_H
        theta = {"price_sens": rng.uniform(0.05, 0.2), "base_conv": 0.01} if is_agent else \
                {"price_sens": rng.uniform(1.5, 4.0), "base_conv": rng.uniform(0.2, 0.5)}
        events, _ = sample_trajectory(rng, trans, prices, costs=costs, theta=theta, is_agent=is_agent)
        session = Session(sid=sid, events=events, actor="A" if is_agent else "H", theta=theta)
        sessions.append(session)
        demand[sid] = compute_demand(session)
    return sessions, demand
@dataclass
 class LimboUpdate:
    utype: str  # "prices" or "demand"
    data: np.ndarray | Dict[str, float]
    t: int
 class Limbo:
    """Historical trajectory of alternating price/demand observations."""
    def __init__(self):
        self.history: List[LimboUpdate] = []
        self._t = 0
    def add_update(self, utype: str, data: np.ndarray | Dict[str, float]) -> Dict:
        self.history.append(LimboUpdate(utype=utype, data=data, t=self._t))
        self._t += 1
        return {"action": "observe_demand" if utype == "prices" else "set_prices"}
    def get_prices_history(self) -> List[np.ndarray]:
        return [u.data for u in self.history if u.utype == "prices"]
    def get_demand_history(self) -> List[Dict[str, float]]:
        return [u.data for u in self.history if u.utype == "demand"]
 class System:
    """Main pricing system implementing robust Stackelberg objective.
    Manages the alternating loop: set prices p_t -> observe demand Q_hat(p_t) ->
    estimate contamination alpha from behavioral signals -> compute next prices.
    """
    def __init__(self, n_products: int = 10, costs: np.ndarray | None = None, lambda_coi: float = 0.5, seed: int | None = 42):
        self.n = n_products
        self.rng = np.random.default_rng(seed)
        self.costs = costs if costs is not None else self.rng.uniform(10, 50, n_products)
        self.refs = self.costs * (1 + self.rng.uniform(0.2, 0.5, n_products))
        self.lambda_coi = lambda_coi
        self.limbo = Limbo()
        self._alpha_est = 0.2
        self._sessions: List[Session] = []
        self._last_sessions: List[Session] = []
        self._last_coi: COIWindow | None = None
    @property
    def alpha(self) -> float:
        return self._alpha_est
    def _estimate_alpha_from_sessions(self) -> float:
        if not self._sessions:
            return self._alpha_est
        return float(np.mean([estimate_alpha(s) for s in self._sessions[-50:]]))
    def _revenue_under_demand(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
        agg = np.zeros(self.n)
        for sid, q in demand.items():
            sess = next((s for s in self._sessions if s.sid == sid), None)
            if sess and sess.events:
                agg[sess.events[0].product_idx] += q
        return float(np.dot(prices, agg))
    def _compute_coi_window(self, demand: Dict[str, float]) -> COIWindow:
        if not self._last_sessions:
            zeros = np.zeros(self.n, dtype=float)
            return COIWindow(policy=0.0, agent=0.0, leak=0.0, survival_ratio=0.0,
                             policy_by_product=zeros, agent_by_product=zeros, demand_weights=zeros)
        return compute_coi_window(self._last_sessions, self.costs, demand_mapping=demand)
    def _objective(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
        """Robust objective: R(p,d) - lambda * COI_leak."""
        profit = self._revenue_under_demand(prices, demand) - float(np.sum(self.costs))
        self._last_coi = self._compute_coi_window(demand)
        return profit - self.lambda_coi * self._last_coi.leak
    def compute_prices(self, demand: Dict[str, float] | None = None) -> np.ndarray:
        """Compute next prices via heuristic margin adjustment based on alpha estimate."""
        self._alpha_est = self._estimate_alpha_from_sessions()
        margin_scale = 1.0 - 0.5 * self._alpha_est  # defensive pricing under high contamination
        margins = (self.refs - self.costs) * margin_scale
        noise = self.rng.normal(0, 0.02, self.n) * self.costs
        prices = np.clip(self.costs + margins + noise, self.costs * 1.02, self.refs * 1.3)
        self.limbo.add_update("prices", prices)
        return prices
    def observe_demand(self, prices: np.ndarray, alpha_true: float = 0.2, n_sessions: int = 50) -> Dict[str, float]:
        sessions, demand_map = put_prices_to_market(prices, costs=self.costs, alpha=alpha_true,
                                                    n_sessions=n_sessions, seed=int(self.rng.integers(0, 10000)))
        self._last_sessions = sessions
        self._sessions.extend(sessions)
        self.limbo.add_update("demand", demand_map)
        return demand_map
    def step(self, alpha_true: float = 0.2, n_sessions: int = 50) -> Tuple[np.ndarray, Dict[str, float], float, COIWindow]:
        demand_hist = self.limbo.get_demand_history()
        prices = self.compute_prices(demand_hist[-1] if demand_hist else None)
        demand = self.observe_demand(prices, alpha_true, n_sessions)
        reward = self._objective(prices, demand)
        return prices, demand, reward, self._last_coi or self._compute_coi_window(demand)
    def run(self, n_steps: int = 100, alpha_true: float = 0.2) -> Dict:
        traj = {"prices": [], "demand": [], "rewards": [], "alpha_est": [], "alpha_true": alpha_true,
                "coi_policy": [], "coi_agent": [], "coi_leak": [], "coi_survival": []}
        for _ in range(n_steps):
            p, d, r, coi = self.step(alpha_true)
            traj["prices"].append(p); traj["demand"].append(d); traj["rewards"].append(r)
            traj["alpha_est"].append(self._alpha_est)
            traj["coi_policy"].append(coi.policy); traj["coi_agent"].append(coi.agent)
            traj["coi_leak"].append(coi.leak); traj["coi_survival"].append(coi.survival_ratio)
        return traj
 if __name__ == "__main__":
    sys = System(n_products=5, seed=42)
    traj = sys.run(n_steps=20, alpha_true=0.25)
    print(f"avg reward: {np.mean(traj['rewards']):.2f}, final alpha_hat: {traj['alpha_est'][-1]:.3f}, "
          f"COI_policy: {np.mean(traj['coi_policy']):.3f}, COI_agent: {np.mean(traj['coi_agent']):.3f}, leak: {np.mean(traj['coi_leak']):.3f}")
    prices = np.array([20.0, 35.0, 50.0, 25.0, 40.0])
    costs = np.array([15.0, 28.0, 40.0, 18.0, 30.0])
    sessions, demand = put_prices_to_market(prices, costs=costs, alpha=0.3, n_sessions=20, seed=123)
    print(f'sessions: {len(sessions)}, agents: {sum(1 for s in sessions if s.actor=="A")}')
    for n in [1, 5, 10, 50, 100]:
        # theoretical: erosion = 1 - 2/(N+1) for uniform order statistic
        print(f'N={n:3d} agents -> COI erosion: {1.0 - 2.0/(n+1):.3f}')
    events = [Event('view', 0, 20.0, 0.1), Event('detail', 0, 20.0, 0.5), Event('cart', 0, 20.0, 1.0), Event('purchase', 0, 20.0, 2.0)]
    print(f'human-like session alpha_hat: {estimate_alpha(Session(sid="test", events=events, actor="H")):.3f}')
    events_a = [Event('view', 0, 20.0, 0.1), Event('detail', 0, 20.0, 0.2), Event('view', 0, 20.0, 0.3), Event('detail', 0, 20.0, 0.4)]
    print(f'agent-like session alpha_hat: {estimate_alpha(Session(sid="test2", events=events_a, actor="A")):.3f}')
--- a/sim/case/thesis_simplified/simplified_env.py
+++ b/sim/case/thesis_simplified/simplified_env.py
@@ -0,0 +1,249 @@
 """Gymnasium-compatible RL environment for thesis pricing system.
 Wraps simplified.System with standard Gym interface for training pricing policies.
 Supports multiple reward modes and contamination scenarios.
 Action: price multipliers [0.5, 1.5] applied to reference prices
 Observation: [prices, demand_agg, alpha_est, margins, position_proxy]
 Reward: configurable objective (revenue, profit, robust, coi-aware)
 """
 from __future__ import annotations
 from dataclasses import dataclass
 from typing import Any, Dict, Tuple
 import numpy as np
 try:
    import gymnasium as gym
    from gymnasium import spaces
    HAS_GYM = True
 except ImportError:
    HAS_GYM = False
 from .simplified import System, Session, Event, Limbo, put_prices_to_market, compute_demand, estimate_alpha
 from .coi import COIWindow, compute_coi_window, coi_erosion
@dataclass
 class EnvConfig:
    n_products: int = 5
    max_steps: int = 200
    sessions_per_step: int = 30
    alpha_true: float = 0.2
    alpha_drift: float = 0.0
    alpha_bounds: Tuple[float, float] = (0.0, 0.6)
    lambda_coi: float = 0.5
    lambda_vol: float = 0.1
    reward_mode: str = "robust"  # revenue | profit | robust | coi_aware
    normalize_reward: bool = True
    seed: int | None = 42
 def aggregate_purchases(sessions: list[Session], n_products: int, costs: np.ndarray) -> Tuple[np.ndarray, float, float]:
    """Aggregate purchases from sessions, returns (counts, revenue, cost)."""
    purchases = np.zeros(n_products, dtype=float)
    revenue, cost = 0.0, 0.0
    for sess in sessions:
        for e in sess.events:
            if e.action == "purchase" and 0 <= e.product_idx < n_products:
                purchases[e.product_idx] += 1.0
                revenue += float(e.price_seen)
                cost += float(costs[e.product_idx])
    return purchases, revenue, cost
 class PricingEnv(gym.Env if HAS_GYM else object):
    """RL environment for dynamic pricing under agent contamination.
    Platform sets prices p_t, market responds with mixture demand Q(p) = (1-alpha)*D_H + alpha*D_A.
    Agent estimates contamination alpha_hat from behavioral signals.
    Reward balances profit vs COI leakage.
    """
    metadata = {"render_modes": ["human", "ansi"]}
    def __init__(self, cfg: EnvConfig | None = None):
        if not HAS_GYM:
            raise ImportError("gymnasium required")
        self.cfg = cfg or EnvConfig()
        self.n = self.cfg.n_products
        self._sys: System | None = None
        self._t = 0
        self._alpha = self.cfg.alpha_true
        self._last_prices: np.ndarray | None = None
        self._last_demand: Dict[str, float] | None = None
        self._episode_rewards: list[float] = []
        self._demand_agg = np.zeros(self.n)
        self.action_space = spaces.Box(low=0.5, high=1.5, shape=(self.n,), dtype=np.float32)
        obs_dim = self.n + self.n + 1 + 1 + self.n + 1  # prices + demand + alpha_hat + alpha + margins + t
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float32)
    def _build_obs(self) -> np.ndarray:
        if self._sys is None:
            return np.zeros(self.observation_space.shape[0], dtype=np.float32)
        prices = self._last_prices if self._last_prices is not None else self._sys.refs
        return np.concatenate([
            prices / (self._sys.refs + 1e-6),
            self._demand_agg / (np.sum(self._demand_agg) + 1e-6),
            [self._sys.alpha, self._alpha],
            (prices - self._sys.costs) / (self._sys.costs + 1e-6),
            [self._t / self.cfg.max_steps],
        ]).astype(np.float32)
    def _compute_reward(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
        cfg, sys = self.cfg, self._sys
        if sys is None:
            return 0.0
        # aggregate demand per product
        agg = np.zeros(self.n)
        for sid, q in demand.items():
            sess = next((s for s in sys._sessions if s.sid == sid), None)
            if sess and sess.events:
                agg[sess.events[0].product_idx] += q
        self._demand_agg = agg
        _, revenue, cost = aggregate_purchases(sys._last_sessions, self.n, sys.costs)
        profit = revenue - cost
        vol_penalty = 0.0
        if self._last_prices is not None:
            vol_penalty = cfg.lambda_vol * float(np.mean(np.abs(prices - self._last_prices) / (sys.refs + 1e-6)))
        coi = compute_coi_window(sys._last_sessions, sys.costs, demand_mapping=demand)
        leak = float(coi.leak)
        reward_fns = {
            "revenue": lambda: revenue,
            "profit": lambda: profit,
            "robust": lambda: profit - cfg.lambda_coi * leak - vol_penalty,
            "coi_aware": lambda: profit - cfg.lambda_coi * (1 + 2 * sys.alpha) * leak - vol_penalty,
        }
        r = reward_fns.get(cfg.reward_mode, lambda: profit)()
        return float(r / (float(np.sum(sys.refs)) + 1e-6)) if cfg.normalize_reward else float(r)
    def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
        seed = seed if seed is not None else self.cfg.seed
        self._sys = System(n_products=self.n, lambda_coi=self.cfg.lambda_coi, seed=seed)
        self._t, self._alpha = 0, self.cfg.alpha_true
        self._last_prices, self._last_demand = None, None
        self._episode_rewards, self._demand_agg = [], np.zeros(self.n)
        return self._build_obs(), {"alpha_true": self._alpha, "alpha_est": self._sys.alpha,
                                   "costs": self._sys.costs.copy(), "refs": self._sys.refs.copy()}
    def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]:
        if self._sys is None:
            raise RuntimeError("call reset() first")
        action = np.clip(action, 0.5, 1.5)
        prices = np.clip(self._sys.refs * action.astype(np.float64), self._sys.costs * 1.01, self._sys.refs * 2.0)
        demand = self._sys.observe_demand(prices, alpha_true=self._alpha, n_sessions=self.cfg.sessions_per_step)
        self._sys.limbo.add_update("prices", prices)
        self._sys._alpha_est = self._sys._estimate_alpha_from_sessions()
        reward = self._compute_reward(prices, demand)
        self._episode_rewards.append(reward)
        self._last_prices, self._last_demand = prices.copy(), demand
        self._t += 1
        # compute info metrics using shared helper
        purchases, revenue, cost = aggregate_purchases(self._sys._last_sessions, self.n, self._sys.costs)
        n_agents = int(self._alpha * self.cfg.sessions_per_step)
        coi = compute_coi_window(self._sys._last_sessions, self._sys.costs, demand_mapping=demand)
        info = {
            "alpha_true": self._alpha, "alpha_est": self._sys.alpha,
            "alpha_error": abs(self._alpha - self._sys.alpha),
            "revenue": float(revenue), "profit": float(revenue - cost), "cost": float(cost),
            "n_purchases": int(np.sum(purchases)),
            "avg_margin": float(np.mean((prices - self._sys.costs) / self._sys.costs)),
            "n_sessions": len(demand), "n_agents": n_agents, "price_std": float(np.std(prices)),
            "coi_erosion": coi_erosion(coi.policy, coi.agent),
            "coi_policy": float(coi.policy), "coi_agent": float(coi.agent),
            "coi_leakage": float(coi.leak), "coi_survival": float(coi.survival_ratio),
            "cumulative_reward": sum(self._episode_rewards), "step": self._t,
        }
        return self._build_obs(), reward, self._t >= self.cfg.max_steps, False, info
    def render(self, mode: str = "human") -> str | None:
        if self._sys is None or self._last_prices is None:
            return None
        out = f"t={self._t}/{self.cfg.max_steps} | alpha_true={self._alpha:.3f} alpha_hat={self._sys.alpha:.3f} | " \
              f"prices: {self._last_prices.round(1)} | demand: {self._demand_agg.round(2)} | " \
              f"reward: {self._episode_rewards[-1] if self._episode_rewards else 0:.3f}"
        if mode == "human":
            print(out)
        return out
    def close(self) -> None:
        pass
 class ContaminationSweepEnv(PricingEnv):
    """Environment that sweeps through contamination levels during training."""
    def __init__(self, cfg: EnvConfig | None = None, alpha_schedule: list[float] | None = None):
        super().__init__(cfg)
        self._schedule = alpha_schedule or [0.1, 0.2, 0.3, 0.4, 0.5]
        self._schedule_idx = 0
    def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
        if options and options.get("advance_schedule", False):
            self._schedule_idx = (self._schedule_idx + 1) % len(self._schedule)
        self.cfg.alpha_true = self._schedule[self._schedule_idx]
        return super().reset(seed, options)
 class AdversarialEnv(PricingEnv):
    """Environment with adversarial contamination dynamics.
    Contamination increases when prices are predictable (agents exploit).
    """
    def __init__(self, cfg: EnvConfig | None = None, exploitation_rate: float = 0.02):
        super().__init__(cfg)
        self._exploit_rate = exploitation_rate
        self._price_history: list[np.ndarray] = []
    def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]:
        obs, reward, term, trunc, info = super().step(action)
        if self._last_prices is not None:
            self._price_history.append(self._last_prices.copy())
        predictability = 0.0
        if len(self._price_history) > 10:
            predictability = 1.0 / (float(np.std(self._price_history[-10:])) + 0.1)
            self._alpha = np.clip(self._alpha + self._exploit_rate * predictability * self._sys.rng.random(), *self.cfg.alpha_bounds)
        info["predictability"] = predictability
        return obs, reward, term, trunc, info
    def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
        self._price_history = []
        return super().reset(seed, options)
 def make_env(cfg: EnvConfig | None = None, env_type: str = "standard") -> PricingEnv:
    return {"sweep": ContaminationSweepEnv, "adversarial": AdversarialEnv}.get(env_type, PricingEnv)(cfg)
 # baseline policies
 fixed_price_policy = lambda refs, margin=0.0: np.ones(len(refs), dtype=np.float32) * (1.0 + margin)
 random_policy = lambda n, rng=None: (rng or np.random.default_rng()).uniform(0.7, 1.3, n).astype(np.float32)
 adaptive_policy = lambda obs, n, base=0.1: np.ones(n, dtype=np.float32) * (1.0 + base * (1.0 - 0.4 * obs[2 * n]))
 if __name__ == "__main__":
    cfg = EnvConfig(n_products=100, max_steps=100, alpha_true=0.25, reward_mode="robust")
    env = make_env(cfg)
    obs, info = env.reset()
    print(f"initial: alpha={info['alpha_true']:.2f}")
    total_reward = 0.0
    for t in range(cfg.max_steps):
        action = adaptive_policy(obs, cfg.n_products)
        obs, reward, done, _, info = env.step(action)
        total_reward += reward
        if t % 10 == 0:
            env.render()
        if done:
            break
    print(f"\ntotal reward: {total_reward:.2f}, final alpha_hat: {info['alpha_est']:.3f}")
--- a/sim/case/thesis_simplified/summarize.py
+++ b/sim/case/thesis_simplified/summarize.py
@@ -0,0 +1,168 @@
 """Summarize TensorBoard logs into comparison tables."""
 from __future__ import annotations
 import json
 import re
 from pathlib import Path
 from collections import defaultdict
 from dataclasses import dataclass
 import pandas as pd
 try:
    from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
    HAS_TB = True
 except ImportError:
    HAS_TB = False
@dataclass
 class RunInfo:
    algo: str
    alpha: float
    reward_mode: str
    path: Path
 def parse_run_name(name: str) -> RunInfo | None:
    """Extract algo, alpha, reward_mode from run directory name."""
    # patterns: ppo_a0.20_robust, cmp_fixed_a0.20, sac_a0.90_robust
    m = re.match(r'(cmp_)?(\w+)_a([\d.]+)_?(\w+)?', name)
    if not m:
        return None
    prefix, algo, alpha, mode = m.groups()
    return RunInfo(algo=algo, alpha=float(alpha), reward_mode=mode or 'robust', path=Path())
 def load_tb_scalars(log_dir: Path, tags: list[str], reduce: str = 'last') -> dict[str, float]:
    """Load scalar values from TensorBoard event files."""
    if not HAS_TB:
        return {}
    ea = EventAccumulator(str(log_dir))
    ea.Reload()
    results = {}
    for tag in tags:
        if tag in ea.Tags().get('scalars', []):
            events = ea.Scalars(tag)
            if not events:
                continue
            vals = [e.value for e in events]
            if reduce == 'last':
                results[tag] = vals[-1]
            elif reduce == 'mean':
                results[tag] = sum(vals) / len(vals)
            elif reduce == 'max':
                results[tag] = max(vals)
            elif reduce == 'min':
                results[tag] = min(vals)
    return results
 def load_json_results(log_dir: Path) -> dict[str, float]:
    """Load metrics from results.json if available."""
    results_file = log_dir / 'results.json'
    if results_file.exists():
        with open(results_file) as f:
            return json.load(f)
    return {}
 def discover_runs(base_dir: Path) -> list[RunInfo]:
    """Find all experiment runs in base directory."""
    runs = []
    for d in base_dir.iterdir():
        if not d.is_dir():
            continue
        info = parse_run_name(d.name)
        if info:
            info.path = d
            runs.append(info)
    return runs
 def build_tables(runs: list[RunInfo], metrics: list[str], reduce: str = 'last') -> dict[str, dict[str, pd.DataFrame]]:
    """Build pivot tables: reward_mode -> metric -> DataFrame[alpha x algo]."""
    # collect data: {reward_mode: {metric: {(alpha, algo): value}}}
    data = defaultdict(lambda: defaultdict(dict))
    tb_tags = [f'economics/{m}' if m in ['revenue', 'profit', 'margin'] else f'coi/{m}' if m in ['erosion', 'leakage'] else f'alpha/{m}' for m in metrics]
    tag_map = dict(zip(tb_tags, metrics))
    for run in runs:
        # try json first (final eval metrics)
        jm = load_json_results(run.path)
        tb = load_tb_scalars(run.path, tb_tags, reduce)
        for tag, metric in tag_map.items():
            val = None
            json_key = f'{metric}_mean' if metric != 'reward' else 'reward_mean'
            if json_key in jm:
                val = jm[json_key]
            elif tag in tb:
                val = tb[tag]
            if val is not None:
                data[run.reward_mode][metric][(run.alpha, run.algo)] = val
    # convert to DataFrames
    tables = {}
    for mode, metrics_data in data.items():
        tables[mode] = {}
        for metric, vals in metrics_data.items():
            if not vals:
                continue
            alphas = sorted(set(a for a, _ in vals.keys()))
            algos = sorted(set(al for _, al in vals.keys()))
            df = pd.DataFrame(index=alphas, columns=algos, dtype=float)
            for (a, al), v in vals.items():
                df.loc[a, al] = v
            df.index.name = 'alpha'
            tables[mode][metric] = df
    return tables
 def format_table(df: pd.DataFrame, fmt: str = '.3f') -> str:
    """Format DataFrame as markdown table."""
    return df.to_markdown(floatfmt=fmt)
 def summarize(base_dir: str = 'sim/case/thesis_simplified/runs',
              metrics: list[str] | None = None,
              reduce: str = 'last',
              output: str | None = None) -> dict:
    """Generate summary tables from experiment runs."""
    base = Path(base_dir)
    metrics = metrics or ['revenue', 'profit', 'margin', 'erosion', 'leakage']
    runs = discover_runs(base)
    if not runs:
        print(f"No runs found in {base}")
        return {}
    print(f"Found {len(runs)} runs")
    tables = build_tables(runs, metrics, reduce)
    lines = []
    for mode, metric_tables in sorted(tables.items()):
        lines.append(f"\n# Reward Mode: {mode}\n")
        for metric, df in sorted(metric_tables.items()):
            lines.append(f"\n## {metric}\n")
            lines.append(format_table(df))
            lines.append("")
    report = '\n'.join(lines)
    print(report)
    if output:
        Path(output).write_text(report)
        print(f"\nSaved to {output}")
    return tables
 if __name__ == '__main__':
    import argparse
    p = argparse.ArgumentParser()
    p.add_argument('--dir', default='sim/case/thesis_simplified/runs')
    p.add_argument('--metrics', nargs='+', default=['revenue', 'profit', 'margin', 'erosion', 'leakage'])
    p.add_argument('--reduce', default='last', choices=['last', 'mean', 'max', 'min'])
    p.add_argument('--output', '-o', help='save markdown to file')
    args = p.parse_args()
    summarize(args.dir, args.metrics, args.reduce, args.output)
--- a/sim/case/thesis_simplified/train.py
+++ b/sim/case/thesis_simplified/train.py
@@ -0,0 +1,336 @@
 """RL training for thesis pricing system with thesis-aligned metrics.
 Trains pricing policies using stable-baselines3 with TensorBoard logging.
 Tracks COI erosion, alpha estimation error, and economic KPIs per thesis formulation.
 """
 from __future__ import annotations
 import argparse
 import json
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from dataclasses import dataclass, asdict, field
 from pathlib import Path
 from typing import Dict, List, Callable, Any
 import numpy as np
 try:
    from stable_baselines3 import PPO, SAC, A2C
    from stable_baselines3.common.callbacks import BaseCallback, EvalCallback
    from stable_baselines3.common.vec_env import DummyVecEnv
    from stable_baselines3.common.monitor import Monitor
    HAS_SB3 = True
 except ImportError:
    HAS_SB3 = False
 try:
    from torch.utils.tensorboard import SummaryWriter
    HAS_TB = True
 except ImportError:
    HAS_TB = False
 from .simplified_env import PricingEnv, EnvConfig, make_env, adaptive_policy, fixed_price_policy, random_policy
@dataclass
 class EpisodeMetrics:
    reward: float = 0.0
    revenue: float = 0.0
    profit: float = 0.0
    coi_erosion: float = 0.0
    coi_leakage: float = 0.0
    alpha_error: float = 0.0
    avg_margin: float = 0.0
    n_agents: int = 0
    steps: int = 0
    def accumulate(self, info: Dict[str, Any]) -> None:
        self.steps += 1
        self.reward += info.get('reward', 0)
        self.revenue += info.get('revenue', 0)
        self.profit += info.get('profit', 0)
        self.coi_erosion += info.get('coi_erosion', 0)
        self.coi_leakage += info.get('coi_leakage', 0)
        self.alpha_error += abs(info.get('alpha_true', 0) - info.get('alpha_est', 0))
        self.avg_margin += info.get('avg_margin', 0)
        self.n_agents += info.get('n_agents', 0)
    def normalized(self) -> Dict[str, float]:
        s = max(self.steps, 1)
        return {k: getattr(self, k) / s for k in ['revenue', 'profit', 'coi_erosion', 'coi_leakage', 'alpha_error', 'avg_margin', 'n_agents']}
@dataclass
 class ExperimentConfig:
    algo: str = "ppo"
    total_timesteps: int = 100_000
    n_envs: int = 4
    eval_freq: int = 5000
    n_eval_episodes: int = 10
    log_dir: str = "sim/case/thesis_simplified/runs"
    seed: int = 42
    n_products: int = 10
    max_steps: int = 200
    alpha_true: float = 0.2
    reward_mode: str = "robust"
    experiment_name: str | None = None
    def __post_init__(self):
        if self.experiment_name is None:
            self.experiment_name = f"{self.algo}_a{self.alpha_true:.2f}_{self.reward_mode}"
 class Policy:
    """Unified policy interface for baselines and trained models."""
    def __init__(self, policy_fn: Callable[[np.ndarray, int], np.ndarray], name: str):
        self._fn, self.name = policy_fn, name
    def predict(self, obs: np.ndarray, deterministic: bool = True) -> tuple[np.ndarray, None]:
        return self._fn(obs, (len(obs) - 3) // 3), None
    @staticmethod
    def fixed(margin: float = 0.15) -> "Policy":
        return Policy(lambda obs, n: fixed_price_policy(np.ones(n), margin), f"fixed_{margin:.2f}")
    @staticmethod
    def adaptive(base_margin: float = 0.15) -> "Policy":
        return Policy(lambda obs, n: adaptive_policy(obs, n, base_margin), f"adaptive_{base_margin:.2f}")
    @staticmethod
    def random() -> "Policy":
        return Policy(lambda obs, n: random_policy(n), "random")
    @staticmethod
    def myopic(greed: float = 0.3) -> "Policy":
        def _fn(obs: np.ndarray, n: int) -> np.ndarray:
            demand_norm = obs[n:2*n] if len(obs) > 2*n else np.ones(n) * 0.5
            return np.ones(n, dtype=np.float32) * np.clip(1.0 + greed * (1 + np.mean(demand_norm)), 0.5, 1.5)
        return Policy(_fn, f"myopic_{greed:.1f}")
 def log_metrics(writer: SummaryWriter | None, metrics: Dict[str, float], prefix: str, step: int) -> None:
    if writer is None:
        return
    for k, v in metrics.items():
        writer.add_scalar(f'{prefix}/{k}', v, step)
 class MetricsCallback(BaseCallback):
    def __init__(self, writer: SummaryWriter | None, verbose: int = 0):
        super().__init__(verbose)
        self._writer = writer
    def _on_step(self) -> bool:
        if self._writer is None:
            return True
        for info in self.locals.get('infos', []):
            t = self.num_timesteps
            self._writer.add_scalar('economics/revenue', info.get('revenue', 0), t)
            self._writer.add_scalar('economics/profit', info.get('profit', 0), t)
            self._writer.add_scalar('economics/margin', info.get('avg_margin', 0), t)
            self._writer.add_scalar('coi/erosion', info.get('coi_erosion', 0), t)
            self._writer.add_scalar('coi/leakage', info.get('coi_leakage', 0), t)
            self._writer.add_scalar('alpha/estimation_error', abs(info.get('alpha_true', 0) - info.get('alpha_est', 0)), t)
            self._writer.add_scalar('agents/count', info.get('n_agents', 0), t)
        return True
 def make_vec_env(cfg: ExperimentConfig, n_envs: int = 1) -> DummyVecEnv:
    def _make():
        return Monitor(make_env(EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
                                          alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed)))
    return DummyVecEnv([_make for _ in range(n_envs)])
 def run_episodes(policy: Policy | Any, env: PricingEnv, n_episodes: int) -> List[EpisodeMetrics]:
    """Run policy for n episodes and collect metrics."""
    metrics = []
    for _ in range(n_episodes):
        obs, _ = env.reset()
        ep, done = EpisodeMetrics(), False
        while not done:
            action, _ = policy.predict(obs, deterministic=True)
            obs, reward, term, trunc, info = env.step(action)
            done = term or trunc
            ep.accumulate(info)
            ep.reward += reward
        metrics.append(ep)
    return metrics
 def evaluate_policy(policy: Policy | Any, cfg: ExperimentConfig, n_episodes: int = 20) -> Dict[str, float]:
    env = make_env(EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
                             alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed + 999))
    metrics = run_episodes(policy, env, n_episodes)
    return {
        'reward_mean': np.mean([m.reward for m in metrics]), 'reward_std': np.std([m.reward for m in metrics]),
        **{f'{k}_mean': np.mean([m.normalized()[k] for m in metrics])
           for k in ['revenue', 'profit', 'coi_erosion', 'coi_leakage', 'alpha_error', 'avg_margin']},
    }
 def run_baseline(policy: Policy, vec_env: DummyVecEnv, total_steps: int, writer: SummaryWriter | None):
    obs, n_envs = vec_env.reset(), vec_env.num_envs
    ep_rewards = np.zeros(n_envs)
    for step in range(0, total_steps, n_envs):
        actions = np.array([policy.predict(obs[i])[0] for i in range(n_envs)])
        obs, rewards, dones, infos = vec_env.step(actions)
        ep_rewards += rewards
        for i, info in enumerate(infos):
            if writer:
                writer.add_scalar('economics/revenue', info.get('revenue', 0), step)
                writer.add_scalar('economics/profit', info.get('profit', 0), step)
                writer.add_scalar('economics/margin', info.get('avg_margin', 0), step)
                writer.add_scalar('coi/erosion', info.get('coi_erosion', 0), step)
                writer.add_scalar('coi/leakage', info.get('coi_leakage', 0), step)
                writer.add_scalar('alpha/estimation_error', abs(info.get('alpha_true', 0) - info.get('alpha_est', 0)), step)
                writer.add_scalar('agents/count', info.get('n_agents', 0), step)
            if dones[i]:
                if writer:
                    writer.add_scalar('rollout/ep_reward', ep_rewards[i], step)
                ep_rewards[i] = 0
 def train(cfg: ExperimentConfig) -> Dict[str, Any]:
    is_baseline = cfg.algo.lower() in ["fixed", "adaptive", "random", "myopic"]
    if not HAS_SB3 and not is_baseline:
        raise ImportError("stable-baselines3 required: pip install stable-baselines3[extra]")
    log_path = Path(cfg.log_dir) / cfg.experiment_name
    log_path.mkdir(parents=True, exist_ok=True)
    with open(log_path / "config.json", "w") as f:
        json.dump(asdict(cfg), f, indent=2)
    writer = SummaryWriter(log_path) if HAS_TB else None
    train_env, eval_env = make_vec_env(cfg, cfg.n_envs), make_vec_env(cfg, 1)
    if is_baseline:
        policy = {"fixed": Policy.fixed, "adaptive": Policy.adaptive, "random": Policy.random, "myopic": Policy.myopic}[cfg.algo.lower()]()
        run_baseline(policy, train_env, cfg.total_timesteps, writer)
        final_metrics = evaluate_policy(policy, cfg)
    else:
        algo_cls = {"ppo": PPO, "sac": SAC, "a2c": A2C}[cfg.algo.lower()]
        common = dict(verbose=1, seed=cfg.seed, tensorboard_log=str(log_path), device="auto")
        model = {
            "ppo": lambda: PPO("MlpPolicy", train_env, learning_rate=3e-4, n_steps=2048, batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95, clip_range=0.2, ent_coef=0.01, **common),
            "sac": lambda: SAC("MlpPolicy", train_env, learning_rate=1e-4, buffer_size=50_000, batch_size=512, tau=0.02, gamma=0.99, learning_starts=1000, ent_coef="auto_0.1", train_freq=4, **common),
            "a2c": lambda: A2C("MlpPolicy", train_env, learning_rate=7e-4, n_steps=5, gamma=0.99, **common),
        }[cfg.algo.lower()]()
        cb = MetricsCallback(writer)
        eval_cb = EvalCallback(eval_env, best_model_save_path=str(log_path / "best"), log_path=str(log_path),
                               eval_freq=cfg.eval_freq, n_eval_episodes=cfg.n_eval_episodes, deterministic=True)
        model.learn(cfg.total_timesteps, callback=[cb, eval_cb], progress_bar=True)
        model.save(log_path / "final_model")
        policy = model
        final_metrics = evaluate_policy(model, cfg)
    if writer:
        log_metrics(writer, final_metrics, 'final', cfg.total_timesteps)
        writer.close()
    train_env.close(); eval_env.close()
    with open(log_path / "results.json", "w") as f:
        json.dump(final_metrics, f, indent=2)
    return {"path": str(log_path), "metrics": final_metrics}
 def _train_alpha(args: tuple) -> tuple[str, Dict]:
    """Worker for parallel sweep - must be top-level for pickling."""
    cfg_dict, alpha = args
    cfg_dict["alpha_true"] = alpha
    cfg_dict["experiment_name"] = f"{cfg_dict['algo']}_a{alpha:.2f}_{cfg_dict['reward_mode']}"
    sweep_cfg = ExperimentConfig(**cfg_dict)
    print(f"[alpha={alpha:.2f}] starting")
    metrics = train(sweep_cfg)["metrics"]
    print(f"[alpha={alpha:.2f}] done")
    return f"alpha_{alpha:.2f}", metrics
 def run_sweep(cfg: ExperimentConfig, alphas: List[float] | None = None, max_workers: int | None = None) -> Dict[str, Dict]:
    alphas = alphas or [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    cfg_dict = asdict(cfg)
    if max_workers == 1:  # sequential fallback
        results = dict(_train_alpha((cfg_dict.copy(), a)) for a in alphas)
    else:
        with ProcessPoolExecutor(max_workers=max_workers) as pool:
            futures = {pool.submit(_train_alpha, (cfg_dict.copy(), a)): a for a in alphas}
            results = {}
            for fut in as_completed(futures):
                key, metrics = fut.result()
                results[key] = metrics
    summary_path = Path(cfg.log_dir) / f"sweep_{cfg.algo}_{cfg.reward_mode}.json"
    with open(summary_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nSweep results saved to {summary_path}")
    return results
 def _train_policy(args: tuple) -> tuple[str, Dict]:
    """Worker for parallel policy comparison."""
    cfg_dict, algo = args
    cfg_dict["algo"] = algo
    cfg_dict["experiment_name"] = f"cmp_{algo}_a{cfg_dict['alpha_true']:.2f}"
    cmp_cfg = ExperimentConfig(**cfg_dict)
    print(f"[{algo}] starting")
    metrics = train(cmp_cfg)["metrics"]
    print(f"[{algo}] done")
    return algo, metrics
 def compare_policies(cfg: ExperimentConfig, policies: List[str] | None = None, max_workers: int | None = None) -> Dict[str, Dict]:
    policies = policies or ["fixed", "adaptive", "myopic", "random"]
    cfg_dict = asdict(cfg)
    if max_workers == 1:
        results = dict(_train_policy((cfg_dict.copy(), p)) for p in policies)
    else:
        with ProcessPoolExecutor(max_workers=max_workers) as pool:
            futures = {pool.submit(_train_policy, (cfg_dict.copy(), p)): p for p in policies}
            results = {}
            for fut in as_completed(futures):
                algo, metrics = fut.result()
                results[algo] = metrics
    cmp_path = Path(cfg.log_dir) / f"compare_a{cfg.alpha_true:.2f}.json"
    with open(cmp_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nComparison saved to {cmp_path}")
    for algo, m in results.items():
        print(f"  {algo:12s}: reward={m['reward_mean']:.2f} coi_erosion={m['coi_erosion_mean']:.4f} alpha_err={m['alpha_error_mean']:.4f}")
    return results
 def main():
    parser = argparse.ArgumentParser(description="Train RL pricing policies")
    parser.add_argument("--algo", default="ppo", choices=["ppo", "sac", "a2c", "fixed", "adaptive", "random", "myopic"])
    parser.add_argument("--steps", type=int, default=100_000)
    parser.add_argument("--alpha", type=float, default=0.2)
    parser.add_argument("--reward-mode", default="robust", choices=["revenue", "profit", "robust", "coi_aware"])
    parser.add_argument("--n-products", type=int, default=10)
    parser.add_argument("--n-envs", type=int, default=4)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--log-dir", default="sim/case/thesis_simplified/runs")
    parser.add_argument("--sweep", action="store_true", help="run contamination sweep")
    parser.add_argument("--compare", action="store_true", help="compare all baselines")
    parser.add_argument("--workers", type=int, default=None, help="max parallel workers for sweep (None=auto, 1=sequential)")
    args = parser.parse_args()
    cfg = ExperimentConfig(algo=args.algo, total_timesteps=args.steps, alpha_true=args.alpha,
                           reward_mode=args.reward_mode, n_products=args.n_products,
                           n_envs=args.n_envs, seed=args.seed, log_dir=args.log_dir)
    if args.sweep:
        run_sweep(cfg, max_workers=args.workers)
    elif args.compare:
        compare_policies(cfg, max_workers=args.workers)
    else:
        result = train(cfg)
        print(f"\nTraining complete: {result['path']}")
        print(f"Metrics: {json.dumps(result['metrics'], indent=2)}")
 if __name__ == "__main__":
    main()
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -19,6 +19,7 @@ except ImportError:
    lib_make_state_repr = None
    lib_transition_histogram = None
 class BehaviorModel:
    def __init__(self, src_dir: str, loader_cls=Loader):
        self.loader = loader_cls(src_dir)
@@ -206,6 +207,7 @@ def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "
 def kl_divergence(p: Dict[str, float], q: Dict[str, float]) -> float:
    eps = 1e-10
    # p + log(p / q) summed over all keys in P
    return sum((p[k] + eps) * np.log((p[k] + eps) / (q.get(k, 0.0) + eps)) for k in p)
 if __name__ == "__main__":
@@ -222,6 +224,7 @@ if __name__ == "__main__":
    agent_model = AgentBehaviorModel(agent_dir)
    agent_mdp = agent_model.build_MDP()
    print(f"AGENT... Built MDP: {agent_mdp['num_states']} states, "
          f"{sum(len(t) for t in agent_mdp['transitions'].values())} transitions")
    if not agent_mdp['states']:
@@ -230,6 +233,7 @@ if __name__ == "__main__":
    human_evt = aggregate_event_transitions(human_mdp)
    agent_evt = aggregate_event_transitions(agent_mdp)
    common = set(human_evt.keys()) & set(agent_evt.keys())
    if not common:
--- a/sim/rl/engine.py
+++ b/sim/rl/engine.py
@@ -3,8 +3,7 @@ import numpy as np
 import pandas as pd
 from abc import ABC, abstractmethod
 from typing import Dict, Any
-from environment import BusinessLogicConstraints
+from sim.rl.environment import BusinessLogicConstraints
 """
 An angine by default should have its own demand estimation mechanism from the observed observations whihc are the computer feature.
@@ -32,9 +31,12 @@ class BasePricingEngine(ABC):
        """
        pass
-    @abstractmethod
+    def update(self, observation: Dict[str, Any], reward: float, done: bool, info: Dict[str, Any]) -> None:
-    def update(obs, reward, done, info):
+        """Default no-op update. Engines can override as needed."""
-        pass
+        self.last_observation = observation
        self.last_reward = reward
        self.last_info = info
@@ -48,14 +50,14 @@ class WildPricingEngine(BasePricingEngine):
    def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
        super().__init__(constraints, seed)
        # per-product unit costs (unknown to customers; known to platform)
-        self.unit_cost = self.rng.uniform(8.0, 40.0, size=self.c.product_catelogue_size).astype(np.float32)
+        self.unit_cost = self.rng.uniform(8.0, 40.0, size=self.c.product_catalogue_size).astype(np.float32)
        # online elasticity estimate (start moderately elastic)
-        self.e_hat = np.full((self.c.product_catelogue_size,), -1.3, dtype=np.float32)
+        self.e_hat = np.full((self.c.product_catalogue_size,), -1.3, dtype=np.float32)
        # EWMA state for log-log regression
-        self.mu_logp = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        self.mu_logp = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
-        self.mu_logq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        self.mu_logq = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
-        self.cov_pq  = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        self.cov_pq  = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
-        self.var_p   = np.ones(self.c.product_catelogue_size, dtype=np.float32)
+        self.var_p   = np.ones(self.c.product_catalogue_size, dtype=np.float32)
        # knobs typical in production
        self.lr = 0.08
        self.ewma = 0.05
@@ -140,7 +142,7 @@ class SimpleDemandEngine(BasePricingEngine):
    def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
        self.step_count += 1
-        demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32))
+        demand = _extract_demand(observation, self.c.product_catalogue_size)
        if self.prev_demand is None:
            self.prev_demand = demand.copy()
            return current_prices.copy()
@@ -187,15 +189,15 @@ class ThompsonSamplingEngine(BasePricingEngine):
    def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
        super().__init__(constraints, seed)
        self.n_price_levels = 5
-        self.alpha = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
+        self.alpha = np.ones((self.c.product_catalogue_size, self.n_price_levels), dtype=np.float32)
-        self.beta = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
+        self.beta = np.ones((self.c.product_catalogue_size, self.n_price_levels), dtype=np.float32)
        self.price_grid = None
        self.last_actions = None
    def reset(self):
        super().reset()
-        self.alpha = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
+        self.alpha = np.ones((self.c.product_catalogue_size, self.n_price_levels), dtype=np.float32)
-        self.beta = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
+        self.beta = np.ones((self.c.product_catalogue_size, self.n_price_levels), dtype=np.float32)
        self.price_grid = None
        self.last_actions = None
@@ -206,10 +208,10 @@ class ThompsonSamplingEngine(BasePricingEngine):
            lo = current_prices * 0.7
            hi = current_prices * 1.3
            self.price_grid = np.linspace(lo, hi, self.n_price_levels).T
-        demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32))
+        demand = _extract_demand(observation, self.c.product_catalogue_size)
        # update beliefs based on last action
        if self.last_actions is not None:
-            for i in range(self.c.product_catelogue_size):
+            for i in range(self.c.product_catalogue_size):
                a = self.last_actions[i]
                reward = demand[i]
                if reward > 0.5:
@@ -217,11 +219,22 @@ class ThompsonSamplingEngine(BasePricingEngine):
                else:
                    self.beta[i, a] += 1.0
        # thompson sampling: sample from posterior, pick best
-        new_prices = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
+        new_prices = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
-        actions = np.zeros(self.c.product_catelogue_size, dtype=int)
+        actions = np.zeros(self.c.product_catalogue_size, dtype=int)
-        for i in range(self.c.product_catelogue_size):
+        for i in range(self.c.product_catalogue_size):
            theta = self.rng.beta(self.alpha[i], self.beta[i]).astype(np.float32)
            actions[i] = int(np.argmax(theta))
            new_prices[i] = self.price_grid[i, actions[i]]
        self.last_actions = actions
        return np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)
 def _extract_demand(observation: Dict[str, Any], n: int) -> np.ndarray:
    if "elasticity" in observation and isinstance(observation["elasticity"], dict):
        d = observation["elasticity"].get("demand")
        if d is not None:
            return np.asarray(d, dtype=np.float32)
    d = observation.get("demand")
    if d is not None:
        return np.asarray(d, dtype=np.float32)
    return np.zeros(n, dtype=np.float32)
--- a/sim/rl/environment.py
+++ b/sim/rl/environment.py
@@ -1,319 +1,244 @@
-import gymnasium as gym
+from __future__ import annotations
-from gymnasium import spaces
+
 import numpy as np
 from dataclasses import dataclass
-import pandas as pd
+from typing import Any, Dict, Optional, Tuple
 from typing import Callable, Optional, Dict, Any, List
-# "learner"  agent learning to optimize pricing
+import numpy as np
-# "agent"  part of environment creating demand signals that learner processes
+
 try:
    import gymnasium as gym
    from gymnasium import spaces
 except ImportError as e:
    raise ImportError("sim.rl.environment requires gymnasium") from e
 from sim.case.thesis_simplified.coi import COIWindow, coi_erosion, compute_coi_window
 from sim.case.thesis_simplified.separability import estimate_alpha as estimate_session_alpha
 from sim.case.thesis_simplified.simplified import Limbo, Session, put_prices_to_market
 from sim.rl.thesis_core import aggregate_demand_by_product, aggregate_purchases, constrain_prices
@dataclass(frozen=True)
 class BusinessLogicConstraints:
    product_catalogue_size: int = 100
    max_steps: int = 2000
    sessions_per_step: int = 250
@dataclass
 class BusinessLogicConstraints():
    max_price_adjustment: float = 0.30
    system_max_price: float = 500.0
    system_min_price: float = 1.0
-    product_catalogue_size: int = 100
+    max_price_adjustment: float = 0.30
-    episode_length: int = 200
+    min_margin_pct: float = 0.05
-    sessions_per_step: int = 250
+
-    agent_share: float = 0.25
+    agent_share: float = 0.2
-    agent_recon_multiplier: float = 6.0
+    alpha_drift: float = 0.0
-    agent_purchase_probability: float = 0.20
+    alpha_bounds: tuple[float, float] = (0.0, 0.8)
    coi_strength: float = 0.25
    coi_threshold: float = 4.0
    coi_sigmoid_temp: float = 1.25
    base_human_demand: float = 0.08
    base_agent_demand: float = 0.05
    human_price_elasticity: float = -1.2 # assumptions here
    agent_price_elasticity: float = -0.6
    w_agent_loss: float = 1.0
    w_volatility: float = 5.0
    w_estimation_error: float = 0.25
    seed: int = 7
-def _sigmoid(x: np.ndarray) -> np.ndarray:
+def make_env(constraints: Optional[BusinessLogicConstraints] = None) -> "PHANTOMEnv":
-    return 1.0 / (1.0 + np.exp(-x))
+    return PHANTOMEnv(constraints=constraints or BusinessLogicConstraints())
 class BehavioralProfile:
    """simple markov chain model for generating synthetic interaction events"""
    def __init__(self, actor: str, purchase_probs: np.ndarray):
        self.actor = actor
        self.purchase_probs = purchase_probs
        self.states = ['view', 'cart', 'checkout']
        # transition matrix: view->cart 0.3, view->view 0.6, view->exit 0.1, cart->checkout 0.5, cart->view 0.4, cart->exit 0.1
        self.trans = {'view': {'view': 0.6, 'cart': 0.3, 'exit': 0.1}, 'cart': {'checkout': 0.5, 'view': 0.4, 'exit': 0.1}, 'checkout': {'exit': 1.0}}
        if actor == 'agents':  # agents browse more before purchasing
            self.trans['view'] = {'view': 0.75, 'cart': 0.15, 'exit': 0.1}
            self.trans['cart'] = {'checkout': 0.3, 'view': 0.6, 'exit': 0.1}
    def sample(self, rng: np.random.Generator) -> Dict[str, Any]:
        """sample single interaction event"""
        product_idx = rng.integers(0, len(self.purchase_probs))
        state = 'view'  # always start with view
        # pick next state based on transition probs
        trans = self.trans.get(state, {'exit': 1.0})
        next_state = rng.choice(list(trans.keys()), p=list(trans.values()))
        price_paid = 0.0 if next_state != 'checkout' else float(rng.uniform(50, 200))
        return {'action': state, 'product_idx': product_idx, 'actor': 'agent' if self.actor == 'agents' else 'human', 't': 0.0, 'price_paid': price_paid}
 def _load_behavioral_profile(actor: str, demand_forcing: np.ndarray) -> BehavioralProfile:
    """returns a behavioral profile for generating synthetic sessions
    actor: 'humans' or 'agents'
    demand_forcing: per-product purchase probabilities used to weight interactions
    """
    return BehavioralProfile(actor, demand_forcing)
 class CommercePlatform:
    """state management for the environment, simulates demand"""
    def __init__(self, product_catalogue_size: int, max_price: float, min_price: float, constraints: BusinessLogicConstraints):
        self.product_catalogue_size = product_catalogue_size
        self.product_supply = np.random.uniform(low=10, high=50, size=(self.product_catalogue_size,))
        self.max_price = max_price
        self.min_price = min_price
        self.constraints = constraints
        self.simulation_history: List[Dict[str, Any]] = []
        self._rng = np.random.default_rng(constraints.seed)
        self._last_interaction_df: pd.DataFrame = pd.DataFrame()
    def setup_true_demand(self, prices: np.ndarray) -> Dict[str, np.ndarray]:
        p = np.clip(prices, self.min_price, self.max_price)
        pn = p / self.max_price
        human_prob = self.constraints.base_human_demand * (pn ** self.constraints.human_price_elasticity)
        agent_prob = self.constraints.base_agent_demand * (pn ** self.constraints.agent_price_elasticity)
        return {"human_purchase_prob": np.clip(human_prob, 0.0, 0.95), "agent_purchase_prob": np.clip(agent_prob, 0.0, 0.95)}
    def _simulate_sessions(self, base_prices: np.ndarray) -> pd.DataFrame:
        demand = self.setup_true_demand(base_prices)
        human_pprob = demand["human_purchase_prob"]
        agent_pprob = demand["agent_purchase_prob"]
        events: List[Dict[str, Any]] = []
        T = self.constraints.sessions_per_step
        n_agent_sessions = int(round(T * self.constraints.agent_share))
        n_human_sessions = T - n_agent_sessions
        n_agent_ids = max(1, n_agent_sessions // 2)
        session_map = {
            'humans': n_human_sessions,
            'agents': n_agent_ids
        }
        pprob_map = {
            'humans': human_pprob,
            'agents': agent_pprob
        }
        joint_events = []
        for actor, n_sessions in session_map.items():
            bp = _load_behavioral_profile(actor, pprob_map[actor])
            counter = 0
            events = []
            while counter < n_sessions:
                session_events = []
                while len(session_events) == 0 or session_events[-1]['action'] == 'checkout':
                    interaction_event = bp.sample(self._rng)
                    interaction_event['session_id'] = f'{actor}_{counter:06d}'
                    # TODO any other assignments
                    session_events.append(interaction_event)
                events.extend(session_events)
                counter += 1
            joint_events.extend(events)
        return pd.DataFrame(joint_events)
    def compute_interaction_features(self, interaction_df: pd.DataFrame) -> Dict[str, float]:
        if interaction_df.empty:
            return {"mean_sale_price": 0.0, "look_to_book": 0.0}
        purchases = interaction_df[interaction_df["action"] == "purchase"]
        mean_sale_price = float(purchases["price_paid"].mean()) if not purchases.empty else 0.0
        views = float((interaction_df["action"] == "view").sum())
        buys = float((interaction_df["action"] == "purchase").sum())
        return {"mean_sale_price": mean_sale_price, "look_to_book": float(views / (buys + 1e-6))}
    def _session_feature_table(self, df: pd.DataFrame) -> pd.DataFrame:
        # TODO: adapt this
        if df.empty:
            return pd.DataFrame()
        g = df.groupby("session_id", sort=False)
        session_duration = g["t"].max() - g["t"].min()
        total_interactions = g.size()
        avg_time_between = g["t"].apply(lambda x: float(np.diff(np.sort(x.to_numpy())).mean()) if len(x) > 1 else 0.0)
        interaction_velocity = total_interactions / (session_duration + 1e-6)
        views = g.apply(lambda x: int((x["action"] == "view").sum()), include_groups=False)
        cart_adds = g.apply(lambda x: int((x["action"] == "cart").sum()), include_groups=False)
        purchases = g.apply(lambda x: int((x["action"] == "purchase").sum()), include_groups=False)
        conversion_rate = purchases / (views + 1e-6)
        is_agent = g["actor"].apply(lambda s: bool((s == "agent").any()), include_groups=False)
        return pd.DataFrame({
            "session_duration_sec": session_duration.astype(float),
            "avg_time_between_events": avg_time_between.astype(float),
            "total_interactions": total_interactions.astype(int),
            "interaction_velocity": interaction_velocity.astype(float),
            "item_views": views.astype(int),
            "cart_adds": cart_adds.astype(int),
            "purchases": purchases.astype(int),
            "conversion_rate": conversion_rate.astype(float),
            "is_agent": is_agent.astype(bool),
        }).reset_index()
    def get_interaction_data(self) -> np.ndarray:
        if self._last_interaction_df.empty:
            return np.array([], dtype=object)
        return self._last_interaction_df.to_dict(orient="records")
 class PHANTOMEnv(gym.Env):
-    metadata = {"render_modes": []}
+    metadata = {"render_modes": ["human", "ansi"]}
-    def __init__(self, constraints):
+    def __init__(self, constraints: Optional[BusinessLogicConstraints] = None):
        super().__init__()
-        self.constraints = BusinessLogicConstraints()
+        self.c = constraints or BusinessLogicConstraints()
-        self.action_space = spaces.Box(low=-self.constraints.max_price_adjustment,
+        self.n = int(self.c.product_catalogue_size)
-                                       high=self.constraints.max_price_adjustment,
+
-                                       shape=(self.constraints.product_catalogue_size,), dtype=np.float32)
+        self._rng = np.random.default_rng(self.c.seed)
-        self.observation_space = spaces.Dict({
+        self._t = 0
-            "elasticity": spaces.Dict({
+        self._alpha_true = float(self.c.agent_share)
-                "price": spaces.Box(
+        self._alpha_hat = float(self.c.agent_share)
-                    low=np.full((self.constraints.product_catalogue_size,), self.constraints.system_min_price, dtype=np.float32),
+        self._costs = np.zeros(self.n, dtype=np.float32)
-                    high=np.full((self.constraints.product_catalogue_size,), self.constraints.system_max_price, dtype=np.float32),
+        self._refs = np.zeros(self.n, dtype=np.float32)
-                    dtype=np.float32),
+        self._prices: Optional[np.ndarray] = None
-                "demand": spaces.Box(
+        self._last_sessions: list[Session] = []
-                    low=np.zeros((self.constraints.product_catalogue_size,), dtype=np.float32),
+        self._last_coi: COIWindow | None = None
-                    high=np.full((self.constraints.product_catalogue_size,), 1e6, dtype=np.float32),
+        self._limbo = Limbo()
-                    dtype=np.float32),
+
-            })
+        self.action_space = spaces.Box(
-            # TODO: define more features that we compute from the interaction data
+            low=np.full((self.n,), self.c.system_min_price, dtype=np.float32),
-        })
+            high=np.full((self.n,), self.c.system_max_price, dtype=np.float32),
-        self.commerce_platform = CommercePlatform(
+            dtype=np.float32,
-            product_catalogue_size=self.constraints.product_catalogue_size,
+        )
-            max_price=self.constraints.system_max_price,
+        self.observation_space = spaces.Dict(
-            min_price=self.constraints.system_min_price,
+            {
-            constraints=self.constraints)
+                "elasticity": spaces.Dict(
-        self._rng = np.random.default_rng(self.constraints.seed)
+                    {
-        self.t = 0
+                        "price": spaces.Box(
-        self._prev_prices: Optional[np.ndarray] = None
+                            low=np.full((self.n,), self.c.system_min_price, dtype=np.float32),
-        self.state: Dict[str, Any] = {}
+                            high=np.full((self.n,), self.c.system_max_price, dtype=np.float32),
                            dtype=np.float32,
                        ),
                        "demand": spaces.Box(
                            low=np.zeros((self.n,), dtype=np.float32),
                            high=np.full((self.n,), 1e9, dtype=np.float32),
                            dtype=np.float32,
                        ),
                    }
                ),
                "market": spaces.Dict(
                    {
                        "alpha_hat": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
                        "revenue_rate": spaces.Box(low=0.0, high=1e12, shape=(1,), dtype=np.float32),
                        "conversion_rate": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
                        "price_volatility": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
                    }
                ),
                "cost": spaces.Box(
                    low=np.zeros((self.n,), dtype=np.float32),
                    high=np.full((self.n,), self.c.system_max_price, dtype=np.float32),
                    dtype=np.float32,
                ),
            }
        )
    def _reset_catalogue(self) -> None:
        self._costs = self._rng.uniform(15.0, 60.0, size=self.n).astype(np.float32)
        margins = self._rng.uniform(0.2, 0.6, size=self.n).astype(np.float32)
        self._refs = (self._costs * (1.0 + margins)).astype(np.float32)
        self._prices = self._refs.copy()
    def _observe_market(
        self, prices: np.ndarray
    ) -> tuple[list[Session], Dict[str, float], np.ndarray, np.ndarray, float, float, int]:
        sessions, demand_map = put_prices_to_market(
            prices,
            costs=self._costs,
            alpha=self._alpha_true,
            n_sessions=int(self.c.sessions_per_step),
            seed=int(self._rng.integers(0, 2**31 - 1)),
        )
        demand_by_product = aggregate_demand_by_product(sessions, demand_map, self.n)
        purchases, revenue, cost, n_agents = aggregate_purchases(sessions, self._costs, self.n)
        conversion = float(np.sum(purchases) / max(len(sessions), 1))
        return sessions, demand_map, demand_by_product, purchases, revenue, cost, n_agents
    def _update_alpha_hat(self, sessions: list[Session]) -> float:
        scores = [estimate_session_alpha(s) for s in sessions if s.events]
        if not scores:
            return self._alpha_hat
        alpha_step = float(np.mean(scores))
        self._alpha_hat = 0.8 * self._alpha_hat + 0.2 * alpha_step
        self._alpha_hat = float(np.clip(self._alpha_hat, 0.0, 1.0))
        return self._alpha_hat
    def _reward(self, prices: np.ndarray, revenue: float, cost: float, volatility: float) -> float:
        profit = float(revenue - cost)
        coi_leak = float(self._last_coi.leak) if self._last_coi else 0.0
        alpha_err = abs(self._alpha_hat - self._alpha_true)
        return profit - self.c.coi_strength * coi_leak - self.c.w_volatility * volatility - self.c.w_estimation_error * alpha_err
    def _build_obs(
        self,
        prices: np.ndarray,
        demand_by_product: np.ndarray,
        revenue: float,
        conversion: float,
        volatility: float,
    ) -> Dict[str, Any]:
        return {
            "elasticity": {"price": prices.astype(np.float32), "demand": demand_by_product.astype(np.float32)},
            "market": {
                "alpha_hat": np.array([self._alpha_hat], dtype=np.float32),
                "revenue_rate": np.array([revenue], dtype=np.float32),
                "conversion_rate": np.array([conversion], dtype=np.float32),
                "price_volatility": np.array([volatility], dtype=np.float32),
            },
            "cost": self._costs.astype(np.float32),
        }
    def reset(self, seed: Optional[int] = None, options: Optional[dict] = None):
        super().reset(seed=seed)
        if seed is not None:
            self._rng = np.random.default_rng(seed)
-            self.commerce_platform._rng = np.random.default_rng(seed)
+        self._t = 0
-        self.t = 0
+        self._alpha_true = float(np.clip(self.c.agent_share, *self.c.alpha_bounds))
-        init_prices = self._rng.uniform(low=60.0, high=140.0, size=(self.constraints.product_catalogue_size,)).astype(np.float32)
+        self._alpha_hat = float(self.c.agent_share)
-        self._prev_prices = init_prices.copy()
+        self._reset_catalogue()
-        self.state = {
+        self._limbo = Limbo()
-            "elasticity": {
+        self._last_sessions = []
-                "price": init_prices,
+        self._last_coi = None
                "demand": np.zeros((self.constraints.product_catalogue_size,), dtype=np.float32),
            }
        }
        return self.state, {}
-    def step(self, action: np.ndarray):
+        prices = self._prices if self._prices is not None else np.zeros(self.n, dtype=np.float32)
-        self.t += 1
+        obs = self._build_obs(prices, np.zeros(self.n, dtype=np.float32), 0.0, 0.0, 0.0)
-        base_prices = self.state["elasticity"]["price"].astype(np.float32)
+        return obs, {"alpha_true": self._alpha_true}
        new_prices = np.clip(base_prices * (1.0 + action.astype(np.float32)),
                           self.constraints.system_min_price,
                           self.constraints.system_max_price).astype(np.float32)
-        self.state["elasticity"]["price"] = new_prices
+    def step(self, action: np.ndarray) -> Tuple[Dict[str, Any], float, bool, bool, Dict[str, Any]]:
-        interactions_df = self.commerce_platform._simulate_sessions(new_prices)
+        if self._prices is None:
-        result = self.commerce_platform.compute_interaction_features(interactions_df)
+            raise RuntimeError("reset() must be called before step()")
        COI = 0.0  # TODO: implement cost-of-information computation
-        volatility = 0.0 if self._prev_prices is None else \
+        prev = self._prices
-            float(np.mean(np.abs((new_prices - self._prev_prices) / (self._prev_prices + 1e-6))))
+        prices = constrain_prices(
-        self._prev_prices = new_prices.copy()
+            prev,
            np.asarray(action, dtype=np.float32),
            costs=self._costs,
            min_price=float(self.c.system_min_price),
            max_price=float(self.c.system_max_price),
            max_adjustment=float(self.c.max_price_adjustment),
            min_margin_pct=float(self.c.min_margin_pct),
        )
        self._prices = prices
        self._limbo.add_update("prices", prices)
-        # extract metrics with safe defaults for incomplete simulation
+        sessions, demand_map, demand_by_product, purchases, revenue, cost, n_agents = self._observe_market(prices)
-        revenue_observed = float(result.get("revenue_observed", result.get("mean_sale_price", 0.0)))
+        self._last_sessions = sessions
-        agent_loss = float(result.get("agent_loss", 0.0))
+        self._limbo.add_update("demand", demand_map)
-        reward = (revenue_observed
+        self._update_alpha_hat(self._last_sessions)
-                  - COI
+        self._last_coi = compute_coi_window(self._last_sessions, self._costs, demand_mapping=demand_map)
                  - self.constraints.w_agent_loss * agent_loss
                  - self.constraints.w_volatility * volatility
                  - self.constraints.w_estimation_error)
-        terminated = self.t >= self.constraints.episode_length
+        self._alpha_true = float(np.clip(self._alpha_true + self.c.alpha_drift, *self.c.alpha_bounds))
        volatility = float(np.std((prices - prev) / (prev + 1e-6)))
        reward = float(self._reward(prices, revenue, cost, volatility))
        conversion = float(np.sum(purchases) / max(len(self._last_sessions), 1))
        self._t += 1
        terminated = self._t >= int(self.c.max_steps)
        obs = self._build_obs(prices, demand_by_product, revenue, conversion, min(volatility, 1.0))
        info = {
-            "t": self.t,
+            "step": self._t,
-            "revenue_observed": revenue_observed,
+            "reward": reward,
-            "revenue_oracle": float(result.get("revenue_oracle", revenue_observed)),
+            "revenue": float(revenue),
-            "agent_loss": agent_loss,
+            "profit": float(revenue - cost),
-            "ux_volatility": volatility,
+            "n_sessions": int(self.c.sessions_per_step),
-            "look_to_book": float(result.get("look_to_book", 0.0)),
+            "n_agents": int(n_agents),
-            "mean_sale_price": float(result.get("mean_sale_price", 0.0)),
+            "alpha_true": float(self._alpha_true),
-            "true_human_purchases_total": 0.0,  # TODO: track from simulation
+            "alpha_hat": float(self._alpha_hat),
-            "true_agent_purchases_total": 0.0,  # TODO: track from simulation
+            "alpha_error": float(abs(self._alpha_hat - self._alpha_true)),
            "price_std": float(np.std(prices)),
            "price_volatility": float(volatility),
        }
-        return self.state, float(reward), terminated, False, info
+        if self._last_coi is not None:
            info.update(
                {
                    "coi_policy": float(self._last_coi.policy),
                    "coi_agent": float(self._last_coi.agent),
                    "coi_leakage": float(self._last_coi.leak),
                    "coi_survival": float(self._last_coi.survival_ratio),
                    "coi_erosion": float(coi_erosion(self._last_coi.policy, self._last_coi.agent)),
                }
            )
        return obs, reward, terminated, False, info
    def render(self, mode: str = "human") -> str | None:
        if self._prices is None:
            return None
        out = (
            f"t={self._t}/{self.c.max_steps} "
            f"alpha_true={self._alpha_true:.3f} alpha_hat={self._alpha_hat:.3f} "
            f"price_std={float(np.std(self._prices)):.2f}"
        )
        if mode == "human":
            print(out)
        return out
-if __name__ == "__main__":
+    def close(self) -> None:
-    import matplotlib.pyplot as plt
+        return
    from collections import defaultdict
    env = PHANTOMEnv(constraints=BusinessLogicConstraints())
    obs, _ = env.reset(seed=42)
    metrics = defaultdict(list)
    total_reward = 0.0
    done = False
    while not done:
        action = env.action_space.sample()
        obs, reward, done, _, info = env.step(action)
        total_reward += reward
        p_mean = float(np.mean(obs["elasticity"]["price"]))
        q_mean = float(np.mean(obs["elasticity"]["demand"]))
        p_std = float(np.std(obs["elasticity"]["price"]))
        metrics['t'].append(info['t'])
        metrics['price_mean'].append(p_mean)
        metrics['price_std'].append(p_std)
        metrics['demand_mean'].append(q_mean)
        metrics['revenue_observed'].append(info['revenue_observed'])
        metrics['revenue_oracle'].append(info['revenue_oracle'])
        metrics['agent_loss'].append(info['agent_loss'])
        metrics['ux_volatility'].append(info['ux_volatility'])
        metrics['look_to_book'].append(info['look_to_book'])
        metrics['reward'].append(reward)
        metrics['human_purchases'].append(info['true_human_purchases_total'])
        metrics['agent_purchases'].append(info['true_agent_purchases_total'])
        if info['t'] % 20 == 0 or done:
            print(f"t={info['t']:03d} p={p_mean:6.2f}±{p_std:4.2f} q={q_mean:6.2f} "
                  f"rev={info['revenue_observed']:7.2f} oracle={info['revenue_oracle']:7.2f} "
                  f"loss={info['agent_loss']:6.2f} ux={info['ux_volatility']:.3f} "
                  f"ltb={info['look_to_book']:5.2f} r={reward:7.2f}")
    print(f"total_reward={total_reward:.2f}")
    fig, axes = plt.subplots(3, 3, figsize=(15, 12))
    fig.suptitle('PHANTOM Environment Run', fontsize=14, fontweight='bold')
    plot_configs = [
        ('price_mean', 'Mean Price', 'Price'),
        ('demand_mean', 'Mean Demand Estimate', 'Demand'),
        ('revenue_observed', 'Revenue (Observed)', 'Revenue'),
        ('agent_loss', 'Agent Loss (Oracle - Observed)', 'Loss'),
        ('ux_volatility', 'UX Volatility (Price Change)', 'Volatility'),
        ('look_to_book', 'Look-to-Book Ratio', 'Ratio'),
        ('reward', 'Step Reward', 'Reward'),
        ('human_purchases', 'Human Purchases', 'Count'),
        ('agent_purchases', 'Agent Purchases', 'Count'),
    ]
    for idx, (key, title, ylabel) in enumerate(plot_configs):
        ax = axes[idx // 3, idx % 3]
        ax.plot(metrics['t'], metrics[key], color='blue', alpha=0.7, linewidth=1.5)
        ax.set_xlabel('Step')
        ax.set_ylabel(ylabel)
        ax.set_title(title, fontsize=10, fontweight='bold')
        ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('phantom_env_comparison.png', dpi=150, bbox_inches='tight')
    print("Plot saved to phantom_env_comparison.png")
    plt.show()
--- a/sim/rl/jax_core/init.py
+++ b/sim/rl/jax_core/init.py
@@ -0,0 +1,11 @@
 """JAX-accelerated simulation core for PHANTOM environment."""
 from .transitions import TransitionData, compile_transitions, fallback_transitions, JAX_AVAILABLE
 from .simulation import SessionBatch, SimResult, sample_sessions, compute_metrics
 from .features import session_features, compute_session_transitions
 from .separability import compute_divergences, estimate_alpha_batch
 __all__ = [
    "JAX_AVAILABLE", "TransitionData", "compile_transitions", "fallback_transitions",
    "SessionBatch", "SimResult", "sample_sessions", "compute_metrics",
    "session_features", "compute_session_transitions", "compute_divergences", "estimate_alpha_batch",
 ]
--- a/sim/rl/jax_core/features.py
+++ b/sim/rl/jax_core/features.py
@@ -0,0 +1,69 @@
 """Vectorized session feature extraction."""
 import numpy as np
 from .transitions import N_STATES, PURCHASE_IDX, CART_IDX
 from .simulation import SessionBatch
 try:
    import jax.numpy as jnp
    from jax import jit
    JAX_AVAILABLE = True
 except ImportError:
    jnp, JAX_AVAILABLE = np, False
    def jit(f): return f
@jit
 def extract_features(states, dwells, lengths):
    """Extract per-session features. Returns (n_sess, 9) array."""
    n, max_len = states.shape
    mask = jnp.arange(max_len)[None,:] < lengths[:,None]
    duration = jnp.sum(dwells * mask, axis=1)
    total = lengths.astype(jnp.float32)
    count = lambda idx: jnp.sum((states == idx) & mask, axis=1).astype(jnp.float32)
    views, learn, carts, purchases = count(1), count(2), count(3), count(4)
    velocity = total / (duration + 1e-6)
    conversion = purchases / (views + 1e-6)
    avg_dwell = duration / (total + 1e-6)
    return jnp.stack([duration, avg_dwell, total, velocity, views, carts, purchases, learn, conversion], axis=1)
 def session_features(batch: SessionBatch) -> np.ndarray:
    if JAX_AVAILABLE:
        return np.asarray(extract_features(jnp.array(batch.states), jnp.array(batch.dwells), jnp.array(batch.lengths)))
    # numpy fallback
    n, max_len = batch.states.shape
    mask = np.arange(max_len)[None,:] < batch.lengths[:,None]
    duration = np.sum(batch.dwells * mask, axis=1)
    total = batch.lengths.astype(np.float32)
    count = lambda idx: np.sum((batch.states == idx) & mask, axis=1).astype(np.float32)
    views, learn, carts, purchases = count(1), count(2), count(3), count(4)
    return np.stack([duration, duration/(total+1e-6), total, total/(duration+1e-6), views, carts, purchases, learn, purchases/(views+1e-6)], axis=1)
@jit
 def session_transitions(states, lengths, n_states=N_STATES):
    """Compute empirical transition counts per session. Returns (n_sess, n_states, n_states)."""
    n, max_len = states.shape
    mask = jnp.arange(max_len - 1)[None,:] < (lengths[:,None] - 1)
    src, dst = states[:, :-1], states[:, 1:]
    # handle -1 padding by clamping to valid range
    src_c, dst_c = jnp.clip(src, 0, n_states-1), jnp.clip(dst, 0, n_states-1)
    valid = mask & (src >= 0) & (dst >= 0)
    def per_session(i):
        s, d, v = src_c[i], dst_c[i], valid[i]
        trans = (jnp.eye(n_states)[s,:,None] * jnp.eye(n_states)[d,None,:]).sum(0) * v[:,None,None]
        return trans.sum(0)
    # vmap not ideal here, use manual loop for clarity
    trans = jnp.stack([per_session(i) for i in range(n)])
    row_sums = trans.sum(axis=-1, keepdims=True)
    return trans / (row_sums + 1e-10)
 def compute_session_transitions(batch: SessionBatch) -> np.ndarray:
    if JAX_AVAILABLE:
        return np.asarray(session_transitions(jnp.array(batch.states), jnp.array(batch.lengths)))
    # numpy fallback
    n, max_len = batch.states.shape
    trans = np.zeros((n, N_STATES, N_STATES), dtype=np.float32)
    for i in range(n):
        for t in range(batch.lengths[i] - 1):
            s, d = batch.states[i, t], batch.states[i, t+1]
            if s >= 0 and d >= 0: trans[i, s, d] += 1
    row_sums = trans.sum(axis=-1, keepdims=True)
    return trans / (row_sums + 1e-10)
--- a/sim/rl/jax_core/separability.py
+++ b/sim/rl/jax_core/separability.py
@@ -0,0 +1,43 @@
 """Vectorized KL divergence for separability scoring."""
 import numpy as np
 from typing import Tuple
 try:
    import jax.numpy as jnp
    from jax import jit
    JAX_AVAILABLE = True
 except ImportError:
    jnp, JAX_AVAILABLE = np, False
    def jit(f): return f
@jit
 def batch_kl(P, Q_human, Q_agent, eps=1e-10):
    """Compute KL(P||Q) for batched P. P:(n,s,s), Q:(s,s). Returns (delta_h, delta_a) each (n,)."""
    p = P + eps
    p = p / p.sum(axis=-1, keepdims=True)
    qh, qa = Q_human[None] + eps, Q_agent[None] + eps
    delta_h = jnp.sum(p * jnp.log(p / qh), axis=(1, 2))
    delta_a = jnp.sum(p * jnp.log(p / qa), axis=(1, 2))
    return delta_h, delta_a
 def compute_divergences(session_trans: np.ndarray, ref_human: np.ndarray, ref_agent: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Compute KL divergence of each session from human/agent prototypes."""
    if JAX_AVAILABLE:
        dh, da = batch_kl(jnp.array(session_trans), jnp.array(ref_human), jnp.array(ref_agent))
        return np.asarray(dh), np.asarray(da)
    # numpy fallback
    eps = 1e-10
    p = session_trans + eps
    p = p / p.sum(axis=-1, keepdims=True)
    qh, qa = ref_human[None] + eps, ref_agent[None] + eps
    delta_h = np.sum(p * np.log(p / qh), axis=(1, 2))
    delta_a = np.sum(p * np.log(p / qa), axis=(1, 2))
    return delta_h, delta_a
 def estimate_alpha_batch(prob_agent: np.ndarray, delta_h: np.ndarray, delta_a: np.ndarray, temp: float = 1.0) -> np.ndarray:
    """Vectorized alpha estimation from classifier probs and divergences."""
    mass = delta_h + delta_a
    ratio = np.where(mass > 1e-8, delta_a / mass, 0.5)
    blended = 0.5 * prob_agent + 0.5 * ratio
    if temp <= 0: return np.clip(blended, 0.0, 1.0)
    return np.clip(1.0 / (1.0 + np.exp(-temp * (blended - 0.5))), 0.0, 1.0)
--- a/sim/rl/jax_core/simulation.py
+++ b/sim/rl/jax_core/simulation.py
@@ -0,0 +1,116 @@
 """Vectorized Markov chain session sampling with JAX."""
 from typing import NamedTuple, Tuple
 import numpy as np
 from functools import partial
 try:
    import jax, jax.numpy as jnp
    from jax import lax
    JAX_AVAILABLE = True
 except ImportError:
    JAX_AVAILABLE = False
 from .transitions import TransitionData, N_STATES, TERM_IDX, PURCHASE_IDX, CART_IDX
 class SessionBatch(NamedTuple):
    states: np.ndarray      # (n_sess, max_len) state indices, -1=padding
    dwells: np.ndarray      # (n_sess, max_len) dwell times
    products: np.ndarray    # (n_sess,) product index per session
    actors: np.ndarray      # (n_sess,) 0=human, 1=agent
    lengths: np.ndarray     # (n_sess,) actual session length
 class SimResult(NamedTuple):
    demand_human: np.ndarray
    demand_agent: np.ndarray
    revenue: float
    revenue_oracle: float
    agent_loss: float
    coi: float
    look_to_book: float
    mean_sale_price: float
    n_human_purchases: int
    n_agent_purchases: int
    sessions: SessionBatch
 if JAX_AVAILABLE:
    @partial(jax.jit, static_argnums=(5,6,7))
    def _sample_sessions_jax(key, T_human, T_agent, dwell_human, dwell_agent, n_human, n_agent, max_steps):
        n = n_human + n_agent
        k1, k2, k3, k4 = jax.random.split(key, 4)
        actors = jnp.concatenate([jnp.zeros(n_human, dtype=jnp.int32), jnp.ones(n_agent, dtype=jnp.int32)])
        T = jnp.where(actors[:,None,None]==0, T_human[None], T_agent[None])  # (n,6,6)
        dwell_p = jnp.where(actors[:,None,None]==0, dwell_human[None], dwell_agent[None])  # (n,6,2)
        def step(carry, _):
            s, active, k = carry
            k, k1, k2 = jax.random.split(k, 3)
            probs = T[jnp.arange(n), s]  # (n,6)
            nxt = jax.random.categorical(k1, jnp.log(probs + 1e-10))
            nxt = jnp.where(active, nxt, -1)
            shape = dwell_p[jnp.arange(n), s, 0]
            scale = dwell_p[jnp.arange(n), s, 1]
            dwell = jnp.maximum(0.3, jax.random.gamma(k2, shape) * scale)
            still = active & (nxt != TERM_IDX) & (nxt >= 0)
            return (nxt, still, k), (nxt, dwell)
        init = (jnp.zeros(n, dtype=jnp.int32), jnp.ones(n, dtype=jnp.bool_), k3)
        _, (states, dwells) = lax.scan(step, init, None, length=max_steps)
        states, dwells = states.T, dwells.T  # (n, max_steps)
        is_term = (states == -1) | (states == TERM_IDX)
        lengths = jnp.argmax(is_term, axis=1) + 1
        lengths = jnp.where(jnp.any(is_term, axis=1), lengths, max_steps)
        return states, dwells, actors, lengths
 def sample_sessions(key, trans: TransitionData, n_human: int, n_agent: int, n_products: int, max_steps: int = 40) -> SessionBatch:
    if JAX_AVAILABLE:
        k1, k2 = jax.random.split(key)
        states, dwells, actors, lengths = _sample_sessions_jax(k1, trans.human_T, trans.agent_T, trans.human_dwell, trans.agent_dwell, n_human, n_agent, max_steps)
        products = jax.random.randint(k2, (n_human + n_agent,), 0, n_products)
        return SessionBatch(np.asarray(states), np.asarray(dwells), np.asarray(products), np.asarray(actors), np.asarray(lengths))
    # numpy fallback
    rng = np.random.default_rng(int(key[0]) if hasattr(key, '__getitem__') else 42)
    n = n_human + n_agent
    actors = np.concatenate([np.zeros(n_human, dtype=np.int32), np.ones(n_agent, dtype=np.int32)])
    products = rng.integers(0, n_products, size=n)
    states, dwells = np.full((n, max_steps), -1, dtype=np.int32), np.zeros((n, max_steps), dtype=np.float32)
    lengths = np.zeros(n, dtype=np.int32)
    for i in range(n):
        T = trans.human_T if actors[i] == 0 else trans.agent_T
        dp = trans.human_dwell if actors[i] == 0 else trans.agent_dwell
        s, t = 0, 0
        while t < max_steps and s != TERM_IDX:
            states[i, t] = s
            dwells[i, t] = max(0.3, rng.gamma(dp[s, 0], dp[s, 1]))
            s = rng.choice(N_STATES, p=T[s])
            t += 1
        lengths[i] = t
    return SessionBatch(states, dwells, products, actors, lengths)
 def compute_metrics(batch: SessionBatch, prices: np.ndarray, unit_cost: np.ndarray, base_price: np.ndarray) -> SimResult:
    purchased = np.any(batch.states == PURCHASE_IDX, axis=1)
    human_mask, agent_mask = batch.actors == 0, batch.actors == 1
    human_purch, agent_purch = purchased & human_mask, purchased & agent_mask
    demand_h = np.bincount(batch.products[human_purch], minlength=len(prices)).astype(np.float32)
    demand_a = np.bincount(batch.products[agent_purch], minlength=len(prices)).astype(np.float32)
    # revenue and oracle
    purch_products = batch.products[purchased]
    revenue = float(np.sum(prices[purch_products]))
    revenue_oracle = float(np.sum(base_price[purch_products]))
    # agent loss: base_price - price_paid for agent purchases (agents gaming the system)
    agent_products = batch.products[agent_purch]
    agent_loss = float(np.sum(base_price[agent_products] - prices[agent_products]))
    # COI: margin - expected_premium*0.5 for human purchases
    human_products = batch.products[human_purch]
    if len(human_products) > 0:
        margin = float(np.mean(prices[human_products] - unit_cost[human_products]))
        premium = float(np.mean(base_price[human_products] - prices[human_products]))
        coi = max(0.0, margin - premium * 0.5)
    else:
        coi = 0.0
    # look to book: views / purchases
    views = float(np.sum(batch.states == 1))  # view_item_page = index 1
    n_purch = int(purchased.sum())
    look_to_book = views / (n_purch + 1e-6)
    mean_sale = float(np.mean(prices[purch_products])) if n_purch > 0 else 0.0
    return SimResult(demand_h, demand_a, revenue, revenue_oracle, agent_loss, coi, look_to_book, mean_sale,
                     int(human_purch.sum()), int(agent_purch.sum()), batch)
--- a/sim/rl/jax_core/transitions.py
+++ b/sim/rl/jax_core/transitions.py
@@ -0,0 +1,47 @@
 """Dense transition matrices for JAX Markov chain sampling."""
 from dataclasses import dataclass
 import numpy as np
 try:
    import jax.numpy as jnp
    JAX_AVAILABLE = True
 except ImportError:
    jnp, JAX_AVAILABLE = np, False
 STATES = ["session_start", "view_item_page", "learn_more_about_item", "add_item_to_cart", "purchase_complete", "session_end"]
 S2I = {s: i for i, s in enumerate(STATES)}
 N_STATES, TERM_IDX, PURCHASE_IDX, CART_IDX = len(STATES), 5, 4, 3
@dataclass
 class TransitionData:
    human_T: np.ndarray   # (6,6) transition probs
    agent_T: np.ndarray   # (6,6)
    human_dwell: np.ndarray  # (6,2) shape,scale
    agent_dwell: np.ndarray  # (6,2)
    def to_jax(self):
        if not JAX_AVAILABLE: return self
        return TransitionData(*[jnp.array(x) for x in [self.human_T, self.agent_T, self.human_dwell, self.agent_dwell]])
 def dict_to_dense(d):
    m = np.zeros((N_STATES, N_STATES), dtype=np.float32)
    for src, dsts in d.items():
        if (i := S2I.get(src)) is not None:
            for dst, p in dsts.items():
                if (j := S2I.get(dst)) is not None: m[i,j] = p
    m /= np.maximum(m.sum(1, keepdims=True), 1e-8)
    m[TERM_IDX] = 0; m[TERM_IDX, TERM_IDX] = 1.0
    return m
 def compile_transitions(human_profile, agent_profile):
    def dwell_arr(params): return np.array([[params.get(s, (2.0, 1.0)) for s in STATES]], dtype=np.float32).reshape(N_STATES, 2)
    return TransitionData(dict_to_dense(human_profile.transitions), dict_to_dense(agent_profile.transitions),
                          dwell_arr(human_profile.dwell_params), dwell_arr(agent_profile.dwell_params))
 def fallback_transitions():
    H = {"session_start": {"view_item_page": .85, "session_end": .15}, "view_item_page": {"learn_more_about_item": .4, "add_item_to_cart": .3, "view_item_page": .2, "session_end": .1},
         "learn_more_about_item": {"add_item_to_cart": .5, "view_item_page": .3, "session_end": .2}, "add_item_to_cart": {"purchase_complete": .6, "view_item_page": .25, "session_end": .15}, "purchase_complete": {"session_end": 1.0}}
    A = {"session_start": {"view_item_page": .9, "session_end": .1}, "view_item_page": {"learn_more_about_item": .5, "add_item_to_cart": .25, "view_item_page": .15, "session_end": .1},
         "learn_more_about_item": {"add_item_to_cart": .4, "view_item_page": .4, "session_end": .2}, "add_item_to_cart": {"purchase_complete": .5, "view_item_page": .3, "session_end": .2}, "purchase_complete": {"session_end": 1.0}}
    dwell = np.full((N_STATES, 2), [2.0, 1.0], dtype=np.float32)
    return TransitionData(dict_to_dense(H), dict_to_dense(A), dwell.copy(), dwell.copy())
--- a/sim/rl/train.py
+++ b/sim/rl/train.py
@@ -4,16 +4,17 @@ from pathlib import Path
 from typing import Dict, Type, Optional
 import pickle
 from torch.utils.tensorboard import SummaryWriter
-from environment import PHANTOMEnv, BusinessLogicConstraints
+from sim.rl.environment import PHANTOMEnv, BusinessLogicConstraints
 logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
 logger = logging.getLogger(__name__)
 try:
-    from engine import (BasePricingEngine, WildPricingEngine, StaticPricingEngine,
+    from sim.rl.engine import (BasePricingEngine, WildPricingEngine, StaticPricingEngine,
                       SimpleDemandEngine, RandomWalkEngine, ThompsonSamplingEngine)
-except ImportError:
+except ImportError as e:
    BasePricingEngine = None  # engines not required for basic usage
    print(e)
 """
@@ -36,27 +37,49 @@ class EngineTrainer:
        self.global_step = 0
    def train(self, n_episodes: int, seed: int = 42):
        obs, _ = self.env.reset(seed=seed)
        prices = None
        for ep in range(n_episodes):
-            prices = self.engine.compute_prices(prices, obs)
+            obs, _ = self.env.reset(seed=seed + ep)
-            obs, reward, done, _, info = self.env.step(prices)
+            self.engine.reset()
-            self.engine.update(obs, reward, done, info)
+            done = False
            prev_prices = obs["elasticity"]["price"]
            episode_reward = 0.0
            last_info: Dict[str, float] = {}
            while not done:
                action_prices = self.engine.compute_prices(prev_prices, obs)
                obs, reward, done, _, info = self.env.step(action_prices)
                self.engine.update(obs, reward, done, info)
                episode_reward += reward
                prev_prices = obs["elasticity"]["price"]
                last_info = info
                if self.tb_writer:
                    self.tb_writer.add_scalar("reward/step", reward, self.global_step)
                    if "coi" in info:
                        self.tb_writer.add_scalar("diagnostics/coi", info["coi"], self.global_step)
                    if "alpha_hat" in info:
                        self.tb_writer.add_scalar("diagnostics/alpha_hat", info["alpha_hat"], self.global_step)
                self.global_step += 1
            last_info = dict(last_info)
            last_info.update({"episode_reward": episode_reward, "episode": ep})
            self.episode_metrics.append(last_info)
            if self.tb_writer:
                self.tb_writer.add_scalar("reward/episode", episode_reward, ep)
        return self
    def run_episode(self, seed: int = 42) -> Dict:
        """run single evaluation episode and return metrics"""
        obs, _ = self.env.reset(seed=seed)
        self.engine.reset()
-        total_reward, prices = 0.0, None
+        total_reward = 0.0
        prev_prices = obs["elasticity"]["price"]
        ep_metrics = {'total_reward': 0.0}
        done = False
        while not done:
-            prices = self.engine.compute_prices(prices, obs) if prices is not None else obs["elasticity"]["price"]
+            action_prices = self.engine.compute_prices(prev_prices, obs)
-            obs, reward, done, _, info = self.env.step(prices)
+            obs, reward, done, _, info = self.env.step(action_prices)
            total_reward += reward
            for k, v in info.items():
                ep_metrics[k] = v
            prev_prices = obs["elasticity"]["price"]
        ep_metrics['total_reward'] = total_reward
        return ep_metrics
@@ -106,7 +129,7 @@ if __name__ == "__main__":
        logger.error("Engines not available, cannot run training")
        exit(1)
-    base_dir = Path("./runs")
+    base_dir = Path("./sim/rl/runs")
    base_dir.mkdir(exist_ok=True)
    engines = {
--- a/sim/strong_learner/data.py
+++ b/sim/strong_learner/data.py
@@ -1,4 +1,9 @@
-import os, requests, py7zr
+import os
 import requests
 try:
    import py7zr  # type: ignore
 except ImportError:  # pragma: no cover - optional dependency
    py7zr = None
 import pandas as pd
 from typing import Generator
 try:
@@ -22,12 +27,16 @@ class YooChooseLoader(Loader):
        self.entries = list(self.data.keys())
    def _setup(self):
        if py7zr is None:
            raise RuntimeError("py7zr is required to unpack YooChoose dataset. Install py7zr first.")
        os.makedirs(self.root, exist_ok=True)
        zip_path = f"{self.root}/temp.7z"
        with requests.get(self.URL, stream=True) as r:
            with open(zip_path, 'wb') as f:
-                for chunk in r.iter_content(8192): f.write(chunk)
+                for chunk in r.iter_content(8192):
-        with py7zr.SevenZipFile(zip_path, 'r') as z: z.extractall(self.root)
+                    f.write(chunk)
        with py7zr.SevenZipFile(zip_path, 'r') as z:
            z.extractall(self.root)
        os.remove(zip_path)
    def _make_interaction(self, sid: str, ts: str, item_id: str, event: str, page: str, meta: dict) -> InteractionModel:
--- a/tests/e2e/.env.example
+++ b/tests/e2e/.env.example
@@ -0,0 +1,7 @@
 WEB_URL=http://localhost:3000
 BACKEND_URL=http://localhost:5000
 PRICING_PROVIDER_URL=http://localhost:5001
 AIRFLOW_URL=http://localhost:8085
 AIRFLOW_USER=admin
 AIRFLOW_PASS=admin
 HEADLESS=true
		`@@ -0,0 +1,2 @@`
							`"""Case-specific simulations and experiments."""`
		`@@ -0,0 +1,2 @@`
							`"""Minimal thesis-aligned pricing simulation (self-contained)."""`