class MarketEngine():
    """Simulated market that answers a price vector with a behaviour-derived demand estimate.

    The population is a mix of human and agent sessions. On every ``act`` call a
    latent demand is generated from the prices, one behaviour trajectory is
    sampled per session, and demand is re-estimated from those trajectories.
    """

    def __init__(self,
                 alpha = 0.5,
                 N = 100,
                 demand_distribution = (50, 10),
                 demand_sampling_function = np.random.normal):
        """
        Args:
            alpha: fraction of the ``N`` sessions that are agent sessions.
            N: total number of sessions simulated per ``act`` call.
            demand_distribution: positional parameters forwarded to
                ``generate_demand`` as its ``distribution_params``.
            demand_sampling_function: sampler forwarded to ``generate_demand``
                as its ``distribution_method``.
        """
        self.Nagents = int(N * alpha)
        # Derive humans as the remainder: int(N * (1 - alpha)) loses a session to
        # float truncation (e.g. alpha=0.9, N=100 gave 9 instead of 10).
        self.Nhumans = int(N) - self.Nagents
        self.demand = (demand_sampling_function, demand_distribution)

    def act(self, prices):
        """Return the demand estimated from one round of simulated sessions.

        Args:
            prices: per-product price vector passed to ``generate_demand``.

        Returns:
            The result of ``estimate_demand`` over all sampled trajectories.
        """
        demand = generate_demand(prices, *self.demand)
        # Sample exactly the configured population; the previous hard-coded
        # 100/100 split meant alpha and N never influenced the simulation.
        human_t = [sample_behavior(demand, human=True) for _ in range(self.Nhumans)]
        agent_t = [sample_behavior(demand, human=False) for _ in range(self.Nagents)]
        return estimate_demand(human_t + agent_t)

    def measure(self):
        """Placeholder; not implemented yet."""
        pass
def adjust_behavior_to_condition(condition, transition_matrix):
    """Expand an NxN transition matrix to (N*P)x(N*P), weighted by a demand condition.

    Args:
        condition: length-P sequence of non-negative per-product demand weights.
        transition_matrix: DataFrame of event-to-event transition weights.

    Returns:
        DataFrame whose labels are ``"<event>_product<p>"`` and whose values are
        the Kronecker product of the base matrix with the outer product of the
        normalised condition.
    """
    condition = np.asarray(condition, dtype=float)
    n_products = len(condition)
    total = np.sum(condition)
    if total > 0:
        cond_norm = condition / total
    else:
        # All-zero demand (every price above its valuation) used to divide by
        # zero and fill the matrix with NaN; weight products uniformly instead.
        cond_norm = np.full(n_products, 1.0 / n_products) if n_products else condition
    base_vals = transition_matrix.values
    base_cols, base_rows = transition_matrix.columns.tolist(), transition_matrix.index.tolist()

    # expand via kronecker-like tiling: each cell becomes a P*P block weighted by outer product of cond_norm
    expanded = np.kron(base_vals, np.outer(cond_norm, cond_norm))
    new_cols = [f"{c}_product{p}" for c in base_cols for p in range(n_products)]
    new_rows = [f"{r}_product{p}" for r in base_rows for p in range(n_products)]
    return pd.DataFrame(expanded, index=new_rows, columns=new_cols)


def sample_behavior(condition, human=True, max_len=40):
    """Sample one session trajectory from the demand-conditioned transition matrix.

    Args:
        condition: per-product demand weights (see ``adjust_behavior_to_condition``).
        human: use the human base matrix when True, the agent one otherwise.
        max_len: maximum number of states in the returned trajectory.

    Returns:
        List of state labels. The walk stops at ``max_len`` states, when a
        state containing ``'checkout'`` is reached, or when the current state
        has no outgoing row.
    """
    base_pivot = _get_base_pivot(human)
    transitions = adjust_behavior_to_condition(condition, base_pivot)
    states = transitions.columns

    trajectory = [np.random.choice(transitions.index)]
    # The old condition (`len < max_len or 'checkout' in last`) never stopped at
    # checkout and kept looping past max_len while the last state was a checkout.
    while len(trajectory) < max_len and 'checkout' not in trajectory[-1]:
        current = trajectory[-1]
        if current not in transitions.index:
            # Destination-only state: no outgoing row to sample from.
            break
        probs = transitions.loc[current].values
        mass = np.sum(probs)
        # A row without probability mass falls back to a uniform draw (p=None).
        trajectory.append(np.random.choice(states, p=probs / mass if mass > 0 else None))
    return trajectory
class DashboardRenderer:
    """stateful renderer for PHANTOM market dynamics visualization

    Owns one matplotlib figure that is created lazily on the first ``render``
    call and redrawn in place on every later call. The environment passed to
    ``render`` is read through ``_step_count``, ``alpha``, ``n_products``,
    ``market``, ``_price_history``, ``_demand_history``, ``_revenue_history``
    and ``_compute_elasticity()``.
    """

    def __init__(self):
        self.fig = None  # created on first render
        self.gs = None   # 3x3 GridSpec bound to self.fig

    def render(self, env) -> None:
        """Redraw the whole dashboard from the environment's recorded history."""
        # Without recorded steps the heatmaps would receive empty arrays and
        # imshow would raise, so skip drawing until there is something to show.
        if not env._price_history:
            return

        if self.fig is None:
            plt.ion()
            self.fig = plt.figure(figsize=(14, 10))
            self.gs = GridSpec(3, 3, figure=self.fig, hspace=0.35, wspace=0.3,
                               left=0.07, right=0.95, top=0.92, bottom=0.08)
            plt.show(block=False)

        self.fig.clear()
        self.fig.suptitle(f'PHANTOM Market Dynamics [t={env._step_count}, a={env.alpha:.2f}]',
                          fontsize=14, fontweight='bold')

        # Transpose so rows are products and columns are steps.
        demand_mat = np.array(env._demand_history).T
        price_mat = np.array(env._price_history).T
        elasticity = env._compute_elasticity()

        self._render_scatter(env)
        self._render_elasticity_bar(env, elasticity)
        self._render_session_pie(env)
        self._render_price_heatmap(price_mat)
        self._render_demand_heatmap(demand_mat)
        self._render_correlation(env.n_products, price_mat, demand_mat)
        self._render_revenue(env)

        self.fig.canvas.draw_idle()
        self.fig.canvas.flush_events()

    def _render_scatter(self, env):
        """Scatter every recorded (price, demand) pair, coloured by product, with a linear fit."""
        ax = self.fig.add_subplot(self.gs[0, 0])
        prices_flat = np.array(env._price_history).flatten()
        demands_flat = np.array(env._demand_history).flatten()
        product_ids = np.tile(np.arange(env.n_products), len(env._price_history))
        ax.scatter(prices_flat, demands_flat, c=product_ids, cmap='plasma', alpha=0.6, s=15, edgecolors='none')
        if len(prices_flat) > 1:
            z = np.polyfit(prices_flat, demands_flat, 1)
            p_line = np.linspace(prices_flat.min(), prices_flat.max(), 50)
            ax.plot(p_line, np.polyval(z, p_line), '--', lw=1.5, alpha=0.8)
        style_axis(ax, "Price-Demand Relationship", "Price ($)", "Demand")

    def _render_elasticity_bar(self, env, elasticity):
        """Horizontal bar per product, with guide lines at 0 and -1."""
        ax = self.fig.add_subplot(self.gs[0, 1])
        ax.barh(range(env.n_products), elasticity, alpha=0.8)
        ax.axvline(0, lw=0.8, alpha=0.5)
        ax.axvline(-1, lw=1, ls='--', alpha=0.5)
        ax.set_yticks(range(env.n_products))
        ax.set_yticklabels([f'P{i}' for i in range(env.n_products)], fontsize=7)
        style_axis(ax, "Price Elasticity", "(dQ/dP)(P/Q)", None)

    def _render_session_pie(self, env):
        """Pie chart of the human/agent session counts held by ``env.market``."""
        ax = self.fig.add_subplot(self.gs[0, 2])
        n_h, n_a = env.market.Nhumans, env.market.Nagents
        wedges, _ = ax.pie([n_h, n_a], startangle=90, wedgeprops={'linewidth': 2, 'edgecolor': 'white'})
        ax.legend(wedges, [f'H ({n_h})', f'A ({n_a})'], loc='lower center', fontsize=8,
                  frameon=False, bbox_to_anchor=(0.5, -0.05))
        ax.set_title("Session Mix", fontsize=11, fontweight='bold')

    def _render_price_heatmap(self, price_mat):
        """Heatmap of prices, products on the y axis and steps on the x axis."""
        ax = self.fig.add_subplot(self.gs[1, :2])
        im = ax.imshow(price_mat, aspect='auto', cmap='viridis', origin='lower')
        style_axis(ax, "Price Heatmap P(product, t)", "Step", "Product")
        cbar = self.fig.colorbar(im, ax=ax, fraction=0.03, pad=0.02)
        cbar.set_label('$', fontsize=8)

    def _render_demand_heatmap(self, demand_mat):
        """Heatmap of demand, products on the y axis and steps on the x axis."""
        ax = self.fig.add_subplot(self.gs[1, 2])
        im = ax.imshow(demand_mat, aspect='auto', cmap='Blues', origin='lower')
        style_axis(ax, "Demand Q(product, t)", "Step", None)
        self.fig.colorbar(im, ax=ax, fraction=0.046, pad=0.02)

    def _render_correlation(self, n_products, price_mat, demand_mat):
        """Price-vs-demand correlation block; drawn only once more than two steps exist."""
        ax = self.fig.add_subplot(self.gs[2, 0])
        if price_mat.shape[1] > 2:
            # corrcoef stacks both matrices; keep the price-rows x demand-columns block.
            corr = np.corrcoef(price_mat, demand_mat)[:n_products, n_products:]
            im = ax.imshow(corr, cmap='RdBu', vmin=-1, vmax=1, aspect='auto')
            ax.set_xticks(range(n_products))
            ax.set_yticks(range(n_products))
            ax.set_xticklabels([f'Q{i}' for i in range(n_products)], fontsize=6)
            ax.set_yticklabels([f'P{i}' for i in range(n_products)], fontsize=6)
            self.fig.colorbar(im, ax=ax, fraction=0.046, pad=0.02)
        style_axis(ax, "Price-Demand Correlation", None, None)

    def _render_revenue(self, env):
        """Revenue over time, with per-step demand standard deviation on a twin axis."""
        ax = self.fig.add_subplot(self.gs[2, 1:])
        revenue = env._revenue_history
        n_steps = len(revenue)
        demand_std = [np.std(d) for d in env._demand_history]
        ax.fill_between(range(n_steps), revenue, alpha=0.3)
        ax.plot(revenue, linewidth=2, label='Revenue')
        ax.set_xlim(0, max(n_steps, 1))
        # Fall back to a unit range when there is no positive revenue; the old
        # expression produced a degenerate (0, 0) limit for all-zero revenue.
        peak = max(revenue, default=0.0)
        ax.set_ylim(0, peak * 1.1 if peak > 0 else 1)

        ax2 = ax.twinx()
        ax2.plot(range(n_steps), demand_std, linewidth=2, ls='-', alpha=0.9, label='sigma(Demand)')
        if demand_std:  # min()/max() raise on an empty history
            d_min, d_max = min(demand_std), max(demand_std)
            margin = (d_max - d_min) * 0.2 if d_max > d_min else 0.5
            ax2.set_ylim(max(0, d_min - margin), d_max + margin)
        ax2.set_ylabel('Demand sigma', fontsize=9)

        style_axis(ax, "Revenue & Demand Dispersion", "Step", "Revenue ($)")
        ax.legend(loc='upper left', fontsize=7, frameon=False)
        ax2.legend(loc='upper right', fontsize=7, frameon=False)

    def close(self):
        """Close the figure, if any, and drop both the figure and its grid."""
        if self.fig:
            plt.close(self.fig)
            self.fig = None
            self.gs = None  # the grid belongs to the closed figure
def run_single(cfg: dict) -> dict:
    """execute one experiment config, return metrics

    Builds a PHANTOM environment from ``cfg``, drives it with random actions
    for at most 100 steps and reports the accumulated reward.

    Args:
        cfg: one entry produced by ``generate_configs``; must contain
            ``seed``, ``demand_fn``, ``n_products``, ``alpha``, ``N``,
            ``demand_mu``, ``demand_sigma`` and ``id``.

    Returns:
        dict with ``id``, ``config``, ``total_reward``, ``avg_reward`` and ``steps``.
    """
    from engine.wrapper import PHANTOM
    import numpy as np

    np.random.seed(cfg["seed"])
    demand_fn = DEMAND_FUNCTIONS[cfg["demand_fn"]]

    env = PHANTOM(
        n_products=cfg["n_products"],
        alpha=cfg["alpha"],
        N=cfg["N"],
    )
    try:
        env.market.demand = (demand_fn, (cfg["demand_mu"], cfg["demand_sigma"]))
        # Gymnasium spaces own their RNG; without this the sampled actions are
        # not reproducible from cfg["seed"] even though np.random is seeded.
        env.action_space.seed(cfg["seed"])

        obs, _ = env.reset()
        total_reward, steps = 0.0, 0

        for _ in range(100):
            action = env.action_space.sample()
            obs, reward, term, trunc, _ = env.step(action)
            total_reward += reward
            steps += 1
            if term or trunc:  # an episode can end by truncation as well
                break
    finally:
        # Release the environment (and any renderer) even if a step raises.
        env.close()

    return {
        "id": cfg["id"],
        "config": cfg,
        "total_reward": total_reward,
        "avg_reward": total_reward / steps,
        "steps": steps,
    }
def generate_configs(lh_samples: int = LH_SAMPLES):
    """Build the mixed design: full grid over primary factors, LH draws over secondary ones.

    For every primary-factor combination, ``lh_samples`` Latin-hypercube points
    are drawn; each coordinate in [0, 1) is mapped to one level of the matching
    secondary factor. Every resulting setting is repeated for
    ``SEEDS_PER_CONFIG`` seeds and tagged with an 8-character md5-based id.
    """
    primary, secondary = [], []
    for factor in FACTORS:
        (primary if factor.primary else secondary).append(factor)

    sampler = LatinHypercube(d=len(secondary), seed=42)

    configs = []
    for cell in product(*(factor.levels for factor in primary)):
        for point in sampler.random(n=lh_samples):
            # primary values first, then the sampled secondary levels
            base = {factor.name: level for factor, level in zip(primary, cell)}
            for factor, u in zip(secondary, point):
                base[factor.name] = factor.levels[int(u * len(factor.levels))]

            for seed in range(SEEDS_PER_CONFIG):
                cfg = {**base, "seed": seed}
                digest = hashlib.md5(json.dumps(cfg, sort_keys=True).encode()).hexdigest()
                cfg["id"] = digest[:8]
                configs.append(cfg)
    return configs
class RenderCallback(BaseCallback):
    """Renders the environment during training for live visualization.

    Args:
        env: environment whose ``render()`` is invoked.
        render_freq: render on every ``render_freq``-th callback call. The
            default of 1 keeps the original render-on-every-step behaviour;
            larger values reduce drawing overhead on long runs.

    Raises:
        ValueError: if ``render_freq`` is smaller than 1.
    """
    def __init__(self, env: PHANTOM, render_freq: int = 1):
        super().__init__()
        if render_freq < 1:
            raise ValueError("render_freq must be >= 1")
        self.env = env
        self.render_freq = render_freq

    def _on_step(self) -> bool:
        # n_calls is maintained by BaseCallback and counts callback invocations.
        if self.n_calls % self.render_freq == 0:
            self.env.render()
        return True  # never request early stopping
class PHANTOM(gym.Env):
    """Gymnasium wrapper for the Limbo pricing-market simulation. Platform sets prices, market responds with demand.

    Actions are per-product price vectors clipped to ``price_bounds``.
    Observations are a dict with the market's demand estimate (``"demand"``)
    and the prices in force (``"prices"``). The reward is the revenue
    ``sum(prices * demand)``.
    """
    metadata = {"render_modes": ["human", "ansi"]}

    def __init__(self,
                 n_products: int = 10,
                 alpha: float = 0.3,
                 N: int = 100,
                 price_bounds: tuple = (10.0, 150.0),
                 lambda_coi: float = 0.1,
                 render_mode: str = None,
                 max_steps: int = 100):
        """
        Args:
            n_products: number of priced products (length of action/observation vectors).
            alpha: agent share of the session population, forwarded to ``MarketEngine``.
            N: total sessions per market response, forwarded to ``MarketEngine``.
            price_bounds: (low, high) bounds used for the action space and for clipping.
            lambda_coi: stored for the planned penalty term; not used by the reward yet.
            render_mode: ``"human"``, ``"ansi"`` or ``None``.
            max_steps: episode length after which ``step`` reports ``terminated``.
        """
        super().__init__()
        self.n_products = n_products
        self.price_bounds = price_bounds
        self.lambda_coi = lambda_coi
        self.render_mode = render_mode
        self.alpha = alpha
        self.N = N
        self.max_steps = max_steps

        self.market = MarketEngine(alpha=alpha, N=N)
        self._platform_stub = PricingEngine()
        self._limbo = Limbo(self._platform_stub, self.market)

        self.action_space = spaces.Box(
            low=price_bounds[0], high=price_bounds[1],
            shape=(n_products,), dtype=np.float32
        )
        self.observation_space = spaces.Dict({
            "demand": spaces.Box(low=0.0, high=100.0, shape=(n_products,), dtype=np.float32),
            "prices": spaces.Box(low=price_bounds[0], high=price_bounds[1], shape=(n_products,), dtype=np.float32),
        })

        self._prices = None
        self._demand = None
        self._step_count = 0
        self._demand_history = []
        self._price_history = []
        self._revenue_history = []
        self._renderer = None

    def _demand_to_array(self, demand: dict) -> np.ndarray:
        """Densify a {product_index: demand} dict; missing products count as 0.0."""
        return np.array([demand.get(i, 0.0) for i in range(self.n_products)])

    def _get_obs(self) -> dict:
        """Current observation with both entries cast to float32."""
        demand_arr = self._demand_to_array(self._demand).astype(np.float32)
        return {"demand": demand_arr, "prices": self._prices.astype(np.float32)}

    def _compute_reward(self, prices: np.ndarray, demand: dict) -> float:
        """Revenue for one step: sum over products of price times demand."""
        revenue = np.sum(prices * self._demand_to_array(demand))
        # TODO: implement supra-competitive price punishment
        return float(revenue)

    def _record_history(self):
        """Append the current prices, demand and revenue to the history lists."""
        demand_arr = self._demand_to_array(self._demand)
        self._demand_history.append(demand_arr)
        self._price_history.append(self._prices.copy())
        self._revenue_history.append(np.sum(self._prices * demand_arr))

    def reset(self, seed=None, options=None):
        """Start a new episode with random initial prices.

        Initial prices and the market's sampling both draw from the global
        NumPy RNG, so a supplied ``seed`` is applied to that generator as well;
        otherwise ``reset(seed=...)`` would not influence the episode. With
        ``seed=None`` the global RNG state is left untouched.
        """
        super().reset(seed=seed)
        if seed is not None:
            np.random.seed(seed % (2 ** 32))  # np.random.seed accepts 32-bit values only
        self._prices = np.random.uniform(*self.price_bounds, size=self.n_products)
        self._demand = self.market.act(self._prices)
        self._step_count = 0
        self._demand_history, self._price_history, self._revenue_history = [], [], []
        self._record_history()
        return self._get_obs(), {}

    def step(self, action: np.ndarray):
        """Apply a price vector and return (obs, reward, terminated, truncated, info)."""
        self._prices = np.clip(action, *self.price_bounds)
        self._demand = self.market.act(self._prices)
        self._step_count += 1
        self._record_history()

        reward = self._compute_reward(self._prices, self._demand)
        terminated = self._step_count >= self.max_steps

        return self._get_obs(), reward, terminated, False, {"step": self._step_count}

    def _compute_elasticity(self) -> np.ndarray:
        """point elasticity: e = (dQ/dP) * (P/Q) via finite differences, clipped to [-5, 5]"""
        if len(self._price_history) < 2:
            return np.zeros(self.n_products)
        p, q = np.array(self._price_history), np.array(self._demand_history)
        dp, dq = np.diff(p, axis=0), np.diff(q, axis=0)
        # Ignore steps where the price barely moved; the ratio would be unstable.
        valid = np.abs(dp) > 0.5
        with np.errstate(divide='ignore', invalid='ignore'):
            elasticity = np.where(valid, (dq / dp) * (p[:-1] / np.maximum(q[:-1], 1.0)), 0.0)
        elasticity = np.nan_to_num(np.clip(elasticity, -5.0, 5.0), nan=0.0)
        return np.mean(elasticity, axis=0) if len(elasticity) > 0 else np.zeros(self.n_products)

    def render(self):
        """Draw the dashboard ("human") or return a one-line summary string ("ansi")."""
        if self.render_mode == "human":
            if self._renderer is None:
                self._renderer = DashboardRenderer()
            self._renderer.render(self)
        elif self.render_mode == "ansi":
            return f"step={self._step_count}, prices={self._prices}, demand={self._demand}"
        return None

    def close(self):
        """Close the dashboard renderer if one was created."""
        if self._renderer:
            self._renderer.close()
            self._renderer = None
PHANTOM(n_products=15, alpha=0.3, N=100, render_mode="human") + obs, _ = env.reset() + for step in range(100): + action = env.action_space.sample() + obs, reward, term, trunc, info = env.step(action) + env.render() + if term: break + env.close() diff --git a/lab/README.md b/lab/README.md new file mode 100644 index 0000000..b5226aa --- /dev/null +++ b/lab/README.md @@ -0,0 +1,75 @@ +# MOS (Money Operating System) + +Research-grade quote-control simulator for studying dynamic pricing and market making policies. +The system models pricing as a closed loop of **Quote → Arrival → Execution → Position**, enabling +controlled experimentation with demand models, inventory constraints, and reward shaping. + +## Core Loop + +1. **Quote** – the policy posts prices (one-sided or two-sided depending on the mechanism). +2. **Arrival** – a population model generates purchase opportunities or market orders. +3. **Execution** – an execution model decides whether an arrival converts at the quoted price. +4. **Position** – inventory/position limits censor fills and generate holding/shortage costs. +5. **Observation & Reward** – censored fills and aggregate metrics are exposed to the agent, while + objectives turn metrics into a scalar reward. + +Each stage is pluggable via light-weight protocols so you can swap in alternative mechanisms, +demand models, or objectives without rewriting the rest of the simulator. + +## Package Layout + +| Module | Purpose | +|-------------------|---------| +| `lab.outlet` | Core simulation engine, domain types, pricing mechanisms, objectives. | +| `lab.population` | Demand arrival models, execution probability models, competitor/market dynamics. | +| `lab.experiments` | Rollout utilities, baseline policies, and off-policy evaluation helpers. | +| `lab.config` | Convenience factories for preconfigured retail and market-making environments. 
| + +## Preconfigured Scenarios + +### Retail Dynamic Pricing +- Mechanism: posted prices with margin and delta constraints. +- Arrivals: browsing sessions with contamination support (scrapers). +- Execution: elasticity model with competitor cross-effects. +- Position: inventory tracking with holding and shortage costs. +- Market: reactive competitor that can trigger price wars. +- Objective: PnL minus volatility, holding cost, and lost opportunity penalties. + +```python +from lab.config import make_retail_platform +from lab.experiments import rollout, fixed_price_policy + +platform = make_retail_platform() +policy = fixed_price_policy(platform.instruments.refs) +result = rollout(platform, policy, n_steps=100) +print(result.total_pnl) +``` + +### Market Making +- Mechanism: two-sided quoting with bid/ask spreads. +- Arrivals: Hawkes order flow for clustered demand. +- Execution: Avellaneda–Stoikov style intensity model. +- Position: inventory risk limits and quadratic penalty objective. +- Market: geometric Brownian motion mid-price process. +- Objective: PnL plus spread capture minus inventory risk. + +```python +from lab.config import make_market_making_platform +from lab.experiments import rollout + +platform = make_market_making_platform() +mm_policy = lambda obs, t: (platform.instruments.refs, 1.0) +result = rollout(platform, mm_policy, n_steps=200, seed=42) +print(result.total_pnl) +``` + +## Extending the Simulator + +- Implement `lab.outlet.protocols.Mechanism` or `ArrivalModel` to introduce new pricing +domains or demand processes. +- Compose objectives with `lab.outlet.objectives.factory.make_composite` to study alternate +reward formulations. +- Use `lab.experiments.compare_policies` to benchmark candidate policies across multiple +random seeds. + +Comprehensive API documentation lives in `lab/docs` (build with `make html`). 
diff --git a/lab/__init__.py b/lab/__init__.py new file mode 100644 index 0000000..cc6df0c --- /dev/null +++ b/lab/__init__.py @@ -0,0 +1,27 @@ +""" +Quote-Control Simulator: Research-grade platform for dynamic pricing and market making + +The platform abstracts pricing as: Quote -> Arrival -> Execution -> Position +Supports multiple mechanisms: + - PostedPrice: retail dynamic pricing + - TwoSided: market making with bid-ask spreads + - Auction: reserve/shading for auction settings + +Example usage: + from lab.config import make_retail_platform + from lab.experiments import rollout, fixed_price_policy + + platform = make_retail_platform() + policy = fixed_price_policy(platform.instruments.refs) + result = rollout(platform, policy, n_steps=100) + print(f"Total PnL: {result.total_pnl:.2f}") +""" + +from .config import make_retail_platform, make_market_making_platform, RetailConfig, MarketMakingConfig +from .outlet import Platform, PlatformConfig, Quote, Observation, StepResult + +__all__ = [ + 'make_retail_platform', 'make_market_making_platform', + 'RetailConfig', 'MarketMakingConfig', + 'Platform', 'PlatformConfig', 'Quote', 'Observation', 'StepResult', +] diff --git a/lab/case/__init__.py b/lab/case/__init__.py new file mode 100644 index 0000000..44fbf8c --- /dev/null +++ b/lab/case/__init__.py @@ -0,0 +1,6 @@ +""" +Case studies implementing specific research scenarios. + +Available cases: +- thesis: PHANTOM thesis implementation with contaminated demand and DR-RL +""" diff --git a/lab/case/thesis/__init__.py b/lab/case/thesis/__init__.py new file mode 100644 index 0000000..31db465 --- /dev/null +++ b/lab/case/thesis/__init__.py @@ -0,0 +1,25 @@ +""" +Thesis-specific implementation of the PHANTOM pricing defense framework. 
+ +This module implements the mathematical models from the thesis: +- ContaminatedArrivalModel: Mixture demand Q(p) = (1-α)d_H + αd_A (Eq 3) +- HybridExecutionModel: Divergent H/A behavior with separability (Section 2.1) +- RobustStackelbergObjective: Maximin objective with COI penalty (Eq 23) +- COIMetrics: Cost of Information tracking (Definition 1) + +The platform configuration creates a research environment that directly +maps to the thesis mathematical framework for DR-RL experiments. +""" +from .arrivals import ContaminatedArrivalModel, ContaminatedArrivalConfig +from .execution import HybridExecutionModel, HybridExecutionConfig +from .objectives import RobustStackelbergObjective, COIObjective +from .platform import make_thesis_platform, ThesisConfig +from .metrics import COIMetrics, compute_coi, compute_separability + +__all__ = [ + 'ContaminatedArrivalModel', 'ContaminatedArrivalConfig', + 'HybridExecutionModel', 'HybridExecutionConfig', + 'RobustStackelbergObjective', 'COIObjective', + 'make_thesis_platform', 'ThesisConfig', + 'COIMetrics', 'compute_coi', 'compute_separability', +] diff --git a/lab/case/thesis/arrivals.py b/lab/case/thesis/arrivals.py new file mode 100644 index 0000000..909cab5 --- /dev/null +++ b/lab/case/thesis/arrivals.py @@ -0,0 +1,327 @@ +"""Contaminated arrivals using learned MDP kernels from behavior_loader. + +Implements thesis demand model (Section 3.1): +- Aggregate demand Q(p) = (1-α)E[d(p;θ_H)] + αE[d(p;θ_A)] + ε_t (Eq 3) +- Demand proxy q̂_{t,i} = Σ_s Σ_k ω(a_{s,k}) · 1[i_{s,k} = i] (Eq 2) +- Per-session separability via KL divergence Δ_H, Δ_A (Eq 20-21) + +The arrival model samples sessions from a mixture of human/agent behavioral profiles, +each session produces a trajectory τ_s and associated demand computation q(τ'). 
+""" +from __future__ import annotations +from dataclasses import dataclass, field +from types import SimpleNamespace +from typing import Dict, List, Tuple, Optional +import numpy as np +from ...outlet.types import Opportunity, InstrumentSet, MarketState, HiddenState +from ...outlet.constants import Side, OpportunityType +from ...outlet.math_util import poisson_arrivals + +try: + import sys + from pathlib import Path + sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) + from sim.rl.behavior_loader.models import ( + BehaviorModel, AgentBehaviorModel, aggregate_event_transitions, kl_divergence + ) + REAL_MDP = True +except ImportError: + REAL_MDP = False + kl_divergence = None + +EVENT_PAGE = {"session_start": "/", "view_item_page": "/products", "learn_more_about_item": "/products/details", + "add_item_to_cart": "/cart", "purchase_complete": "/checkout", "session_end": "/checkout/success"} +EVENT_CANON = {"page_view": "session_start", "hover_over_paragraph": "view_item_page", "hover_over_title": "view_item_page", + "view_item_page": "view_item_page", "learn_more_about_item": "learn_more_about_item", + "add_item_to_cart": "add_item_to_cart", "checkout_start": "purchase_complete", "remove_item": "view_item_page"} + +# action space partition A = A_nav ∪ A_cart ∪ A_filter ∪ A_dwell with signal weights ω (Table 1) +ACTION_WEIGHTS: Dict[str, float] = { + "add_item_to_cart": 0.8, "remove_item": 0.6, "checkout_start": 0.9, "purchase_complete": 1.0, # A_cart + "hover_over_title": 0.3, "hover_over_paragraph": 0.35, "hover_over_link": 0.25, # A_dwell + "page_view": 0.1, "session_start": 0.05, "view_item_page": 0.15, "learn_more_about_item": 0.2, # A_nav + "search": 0.05, "filter_date": 0.05, "filter_price": 0.08, "sort": 0.03, "session_end": 0.0, # A_filter +} + + +@dataclass +class SessionDemand: + """Per-session demand computation per thesis formulation (Section 3.1). + + Each session s ∈ S produces trajectory τ_s and demand proxy q̂. 
def compute_demand_proxy(events: List[Dict], n_products: int,
                         weights: Optional[Dict[str, float]] = None,
                         default_weight: float = 0.1) -> Dict[int, float]:
    """Compute the demand proxy q̂_i = Σ_k ω(a_k) · 1[i_k = i] per Eq 2.

    Args:
        events: trajectory rows; each row may carry "eventName" and "product_idx".
        n_products: number of products; the result has one entry per index in
            ``range(n_products)``.
        weights: action -> signal weight ω. Defaults to the module-level
            ACTION_WEIGHTS table.
        default_weight: weight applied to actions missing from ``weights``.

    Returns:
        Mapping product index -> accumulated signal weight. Rows without a
        product index, or with an index outside ``[0, n_products)``, are ignored.
    """
    table = ACTION_WEIGHTS if weights is None else weights
    q = {i: 0.0 for i in range(n_products)}
    for event in events:
        pidx = event.get("product_idx")
        # rows that reference no product, or an unknown one, carry no demand signal
        if pidx is None or not 0 <= pidx < n_products:
            continue
        q[pidx] += table.get(event.get("eventName", ""), default_weight)
    return q
class BehavioralProfile:
    """Markov session profile driven by a per-actor transition kernel (Section 3.5.2).

    The kernel T̂_Y is the MLE estimate P̂(s'|s) = N(s,s') / Σ_k N(s,k) (Eq 19) built by
    the behavior_loader models. The hard-coded FALLBACK_H / FALLBACK_A kernels are used
    when those models, or a data directory, are unavailable.

    Attributes:
        actor: "agents" selects the agent kernel and dwell times; anything else is human.
        pprobs: per-product purchase propensities, clipped to [0, 0.95].
        trans: transition kernel {state: {next_state: probability}}, private to the instance.
        dwell: (shape, scale) gamma parameters for the dwell time of each state.
    """
    STATES = ["session_start", "view_item_page", "learn_more_about_item", "add_item_to_cart",
              "purchase_complete", "session_end"]
    MAX_EVENTS = 40  # hard cap on recorded events per session
    # fallback kernels T̄_H, T̄_A when real data unavailable
    FALLBACK_H = {
        "session_start": {"view_item_page": 0.85, "session_end": 0.15},
        "view_item_page": {"learn_more_about_item": 0.4, "add_item_to_cart": 0.3,
                           "view_item_page": 0.2, "session_end": 0.1},
        "learn_more_about_item": {"add_item_to_cart": 0.5, "view_item_page": 0.3, "session_end": 0.2},
        "add_item_to_cart": {"purchase_complete": 0.6, "view_item_page": 0.25, "session_end": 0.15},
        "purchase_complete": {"session_end": 1.0},
    }
    FALLBACK_A = {
        "session_start": {"view_item_page": 0.95, "session_end": 0.05},
        "view_item_page": {"learn_more_about_item": 0.6, "view_item_page": 0.25,
                           "add_item_to_cart": 0.1, "session_end": 0.05},
        "learn_more_about_item": {"view_item_page": 0.5, "add_item_to_cart": 0.15,
                                  "learn_more_about_item": 0.3, "session_end": 0.05},
        "add_item_to_cart": {"view_item_page": 0.4, "purchase_complete": 0.2, "session_end": 0.4},
        "purchase_complete": {"session_end": 1.0},
    }

    def __init__(self, actor: str, pprobs: np.ndarray, data_dir: str = ""):
        self.actor, self.pprobs = actor, np.clip(pprobs, 0.0, 0.95)
        self.trans = self._load(data_dir)  # T̂_Y transition kernel
        self._ensure_terminal()
        self.dwell = {s: (1.2, 0.5) if actor == "agents" else (2.0, 1.2) for s in self.STATES}

    def _fallback_kernel(self) -> Dict:
        """Return a private deep copy of the fallback kernel for this actor.

        A shallow ``dict(...)`` copy would share the inner rows with the class
        constant, so later in-place edits would leak into every other profile.
        """
        import copy
        return copy.deepcopy(self.FALLBACK_A if self.actor == "agents" else self.FALLBACK_H)

    def _load(self, data_dir: str) -> Dict:
        """Load the learned kernel from ``data_dir``, falling back to the built-in one.

        Loading is best-effort: any failure inside the behavior_loader models is
        logged with its traceback and the fallback kernel is used instead.
        """
        import logging
        log = logging.getLogger(__name__)
        if not REAL_MDP or not data_dir:
            log.info("using fallback kernel for %s (no learned MDP available)", self.actor)
            return self._fallback_kernel()
        try:
            model_cls = AgentBehaviorModel if self.actor == "agents" else BehaviorModel
            mdp = model_cls(data_dir).build_MDP()
            raw = aggregate_event_transitions(mdp) if mdp.get("transitions") else {}
            if raw:
                return _canonicalize(raw)
        except Exception:
            log.warning("failed to load learned MDP for %s from %r; using fallback kernel",
                        self.actor, data_dir, exc_info=True)
            return self._fallback_kernel()
        log.info("learned MDP for %s has no transitions; using fallback kernel", self.actor)
        return self._fallback_kernel()

    def _ensure_terminal(self):
        """Guarantee the kernel can start a session and terminate after a purchase."""
        self.trans.setdefault("purchase_complete", {}).setdefault("session_end", 1.0)
        self.trans.setdefault("session_start",
                              {"view_item_page": 0.7, "learn_more_about_item": 0.2, "session_end": 0.1})

    def _tprobs(self, state: str, pidx: int) -> Dict[str, float]:
        """Return the normalized transition row for ``state``.

        For "add_item_to_cart" the purchase probability is replaced by an equal
        blend of the kernel value and the product's purchase propensity (scaled
        by 0.3 for agents), and the other transitions are rescaled to the rest.
        States missing from the kernel lead straight to "session_end".
        """
        probs = dict(self.trans.get(state, {"session_end": 1.0}))
        if state == "add_item_to_cart":
            base = probs.get("purchase_complete", 0.0)
            df = float(self.pprobs[pidx]) * (0.3 if self.actor == "agents" else 1.0)
            adj = float(np.clip(base * 0.5 + df * 0.5, 0.0, 0.95))
            rem = max(1e-6, 1.0 - adj)
            other = sum(v for k, v in probs.items() if k != "purchase_complete")
            # NOTE(review): if the row has no "purchase_complete" key, adj is never
            # inserted and a purchase stays impossible — confirm this is intended.
            probs = {k: (adj if k == "purchase_complete" else v * rem / max(other, 1e-6))
                     for k, v in probs.items()}
        total = sum(probs.values())
        return {k: v / total for k, v in probs.items()} if total > 0 else {"session_end": 1.0}

    def sample(self, rng: np.random.Generator, sid: str, prices: np.ndarray,
               costs: np.ndarray) -> Tuple[List[Dict], List[SimpleNamespace]]:
        """Sample one session trajectory for a single, randomly chosen product.

        Args:
            rng: random generator; it is consumed in a fixed order per step.
            sid: session id stamped on every event row.
            prices: per-product prices; the offered price is floored at 1.05 × cost.
            costs: per-product unit costs.

        Returns:
            ``(events, fevts)``: full event rows, and the lightweight namespace
            view (eventName, page, productId, ts) of the same events. The
            "session_start" and "session_end" states are not recorded.
        """
        events, fevts = [], []
        state, t, pidx = "session_start", 0.0, int(rng.integers(0, len(prices)))
        cost, cprice = float(costs[pidx]), max(float(prices[pidx]), float(costs[pidx]) * 1.05)

        while state != "session_end" and len(events) < self.MAX_EVENTS:
            if state != "session_start":
                row = {"session_id": sid, "actor": "agent" if self.actor == "agents" else "human",
                       "eventName": state, "product_idx": pidx, "productId": f"product-{pidx:04d}",
                       "price_offered": cprice, "price_paid": 0.0, "page": EVENT_PAGE.get(state, "/"),
                       "ts": t, "unit_cost": cost, "base_price": float(prices[pidx])}
                if state == "purchase_complete":
                    # paid price jitters around the offer but never drops below cost
                    row["price_paid"] = max(cprice * (1.0 + rng.normal(0.0, 0.015)), cost)
                events.append(row)
                fevts.append(SimpleNamespace(eventName=state, page=row["page"],
                                             productId=row["productId"], ts=t))

            probs = self._tprobs(state, pidx)
            # rng.choice yields numpy.str_; keep plain str in the emitted rows
            state = str(rng.choice(list(probs.keys()), p=list(probs.values())))
            sh, sc = self.dwell.get(state, (2.0, 1.0))
            t += max(0.3, rng.gamma(shape=sh, scale=sc))
        return events, fevts
class ContaminatedArrivalModel:
    """Mixture arrival model Q(p) = (1-α)E[d(p;θ_H)] + αE[d(p;θ_A)] + ε_t (Eq 3).

    Each step samples sessions from the human/agent behavioral profiles and, per
    session, computes the demand proxy q̂ (Eq 2) and the divergence signals
    Δ_H, Δ_A (Eq 20-21) used for separability.
    """

    def __init__(self, cfg: ContaminatedArrivalConfig | None = None):
        self.cfg = cfg or ContaminatedArrivalConfig()
        self._alpha = self.cfg.alpha_contamination
        self._scount = 0
        self._profiles: Dict[str, BehavioralProfile] = {}
        self._ref_kernels: Dict[str, Dict] = {}  # T̄_H, T̄_A reference kernels
        # NOTE(review): grows by one entry (with full trajectory) per session and is
        # never trimmed here — callers of long runs may want to drain it.
        self._session_demands: List[SessionDemand] = []

    @property
    def alpha(self) -> float:
        """Current contamination level α."""
        return self._alpha

    def _profile(self, actor: str, pprobs: np.ndarray) -> BehavioralProfile:
        """Return the cached profile for ``actor``, creating it on first use.

        On a cache hit the purchase propensities are refreshed from ``pprobs``;
        previously the values seen on the very first call were kept forever.
        """
        profile = self._profiles.get(actor)
        if profile is None:
            cfg = self.cfg
            ddir = cfg.agent_data_dir if actor == "agents" else cfg.human_data_dir
            if not ddir and cfg.use_real_behavior:
                base = Path(__file__).parent.parent.parent.parent / "experiments"
                ddir = str(base / ("agents/collected_data" if actor == "agents" else "collected_data"))
            profile = BehavioralProfile(actor, pprobs, ddir if cfg.use_real_behavior else "")
            self._profiles[actor] = profile
            self._ref_kernels[actor] = profile.trans  # cache T̄_Y for divergence
        else:
            # same clipping as BehavioralProfile.__init__
            profile.pprobs = np.clip(pprobs, 0.0, 0.95)
        return profile

    def get_ref_kernels(self) -> Tuple[Dict, Dict]:
        """Return reference transition kernels (T̄_H, T̄_A) for divergence computation.

        Falls back to the built-in kernels for any actor whose profile has not
        been created yet.
        """
        return (self._ref_kernels.get("humans", BehavioralProfile.FALLBACK_H),
                self._ref_kernels.get("agents", BehavioralProfile.FALLBACK_A))

    def get_session_demands(self) -> List[SessionDemand]:
        """Return the collected per-session demand records (the live list, not a copy)."""
        return self._session_demands

    def sample(self, t: float, dt: float, instruments: InstrumentSet,
               market: MarketState | None, hidden: HiddenState, rng: np.random.Generator) -> list[Opportunity]:
        """Sample arrivals for one step as a human/agent mixture (Eq 3).

        For each session s this computes:
        - the trajectory τ_s from the actor's behavioral profile,
        - the demand proxy q̂ via weighted action aggregation (Eq 2),
        - the divergence signals Δ_H, Δ_A (Eq 20-21),
        - the per-session contamination estimate α̂(τ') = σ(2(Δ_H − Δ_A)).

        Side effects: may drift ``self._alpha``, writes ``hidden.contamination``
        and appends one SessionDemand per session.

        Returns:
            One Opportunity per (session, viewed product).
        """
        cfg = self.cfg
        if cfg.alpha_drift != 0:
            self._alpha = float(np.clip(self._alpha + cfg.alpha_drift * rng.normal(), *cfg.alpha_bounds))
        hidden.contamination = self._alpha

        n_sess = poisson_arrivals(cfg.base_rate * hidden.true_demand_intensity, dt, rng)
        prices, costs = instruments.refs, instruments.costs
        margin = np.clip((prices - costs) / np.maximum(costs, 1e-3), -0.9, 2.0)
        hprob, aprob = 0.08 * np.exp(-1.2 * margin), 0.05 * np.exp(-0.6 * margin)

        # Build both profiles BEFORE reading the reference kernels. Otherwise the
        # first step measures divergence against the fallback kernels even when
        # learned kernels are loaded a few lines later.
        self._profile("humans", hprob)
        self._profile("agents", aprob)
        ref_h, ref_a = self.get_ref_kernels()

        opps = []
        for _ in range(n_sess):
            self._scount += 1
            sid = f"s{self._scount:06d}"
            is_agent = rng.random() < self._alpha
            actor, probs = ("agents", aprob) if is_agent else ("humans", hprob)
            profile = self._profile(actor, probs)
            events, fevts = profile.sample(rng, sid, prices, costs)

            # compute demand proxy q̂ per Eq 2
            q = compute_demand_proxy(events, instruments.n)

            # compute divergence signals Δ_H, Δ_A per Eq 20-21
            delta_h, delta_a = compute_session_divergence(events, ref_h, ref_a)
            # per-session contamination estimate α̂(τ') = σ(β(Δ_H - Δ_A)); 0.5 when there is no signal
            alpha_hat = 1.0 / (1.0 + np.exp(-2.0 * (delta_h - delta_a))) if (delta_h + delta_a) > 0 else 0.5

            theta = ({'price_sensitivity': rng.uniform(0.05, 0.2), 'base_conversion': 0.01, 'info_value': 1.0}
                     if is_agent else
                     {'price_sensitivity': rng.uniform(1.5, 4.0), 'base_conversion': rng.uniform(0.2, 0.5),
                      'info_value': 0.0})

            # store session demand for downstream analysis
            self._session_demands.append(SessionDemand(
                session_id=sid, q=q, trajectory=events, delta_h=delta_h, delta_a=delta_a,
                alpha_hat=alpha_hat, actor_class="A" if is_agent else "H", theta=theta))

            viewed = list({e["product_idx"] for e in events if "product_idx" in e})
            if not viewed:
                # empty trajectory: fall back to a random set of viewed products
                vr = cfg.agent_views_range if is_agent else cfg.human_views_range
                viewed = list(rng.choice(instruments.n, size=min(rng.integers(*vr), instruments.n), replace=False))

            for vi, iid in enumerate(viewed):
                opps.append(Opportunity(
                    id=f"{sid}-{iid}", type=OpportunityType.SESSION, side=Side.BUY,
                    instrument_id=int(iid), size=1.0, t=t + rng.uniform(0, dt),
                    context={'session_id': sid, 'actor_class': 'AGENT' if is_agent else 'HUMAN', 'is_agent': is_agent,
                             'reconnaissance_intent': is_agent, 'view_index': vi, 'total_views': len(viewed),
                             'theta': theta, 'trajectory_events': fevts, 'mdp_trajectory': events,
                             'demand_proxy': q, 'alpha_hat': alpha_hat, 'delta_h': delta_h, 'delta_a': delta_a}))
        return opps
class HybridExecutionModel:
    """Execution with divergent H/A behavior, keyed on the ground-truth ``is_agent`` label."""

    def __init__(self, cfg: HybridExecutionConfig | None = None):
        self.cfg = cfg or HybridExecutionConfig()

    def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet,
             market: MarketState | None, rng: np.random.Generator) -> float:
        """Return the fill probability of ``opp`` at the quoted price.

        Agents convert at ``agent_conversion`` scaled by their θ base conversion.
        Humans follow a logit choice model with price, quality and competitor
        terms. ``rng`` is accepted for interface compatibility and is not used.
        """
        cfg, idx = self.cfg, int(opp.instrument_id)
        price, ref = float(quote.prices[idx]), float(instruments.refs[idx])
        ctx = opp.context
        theta = ctx.get('theta', {})

        if ctx.get('is_agent', False):
            return float(cfg.agent_conversion * theta.get('base_conversion', 1.0))

        # human logit discrete choice
        sens = theta.get('price_sensitivity', cfg.human_elasticity)
        base = theta.get('base_conversion', cfg.human_base_prob)
        u_price = -sens * safe_log(price / (ref + EPS))
        quality = instruments.instruments[idx].attrs.get('quality', 0.5)
        u_quality = cfg.quality_weight * quality

        u_comp = 0.0
        if market and market.competitor_quotes is not None:
            cp = market.competitor_quotes[idx]
            if cp < price:
                # only an undercutting competitor hurts; guard the ratio like u_price does
                u_comp = -cfg.cross_elasticity * (price - cp) / (ref + EPS)

        utility = safe_log(base / (1 - base + EPS)) + u_price + u_quality + u_comp
        return float(sigmoid(utility))

    def uncensor(self, fills: np.ndarray, instruments: InstrumentSet,
                 context: dict[str, Any] | None = None) -> np.ndarray:
        """Scale observed fills by the expected human conversion rate.

        When ``context`` carries 'contamination', the human base probability is
        discounted by that agent fraction before dividing.
        """
        if context is None:
            return fills / (self.cfg.human_base_prob + EPS)
        agent_frac = context.get('contamination', 0.0)
        return fills / (self.cfg.human_base_prob * (1 - agent_frac) + EPS)
def compute_coi(quote: Quote, instruments: InstrumentSet, metrics: StepMetrics, contamination: float) -> COIMetrics:
    """Compute the Cost-of-Information metrics for one step.

    Args:
        quote: quoted prices (``quote.prices``, one per instrument).
        instruments: provides per-instrument ``costs`` and reference prices ``refs``.
        metrics: step metrics; ``revenue``, ``cost`` and ``units_traded`` are read.
        contamination: agent share α used to scale leakage and erosion.

    Returns:
        COIMetrics whose fields are plain Python floats.
    """
    prices, costs, refs = quote.prices, instruments.costs, instruments.refs
    coi_level = float(np.mean(prices - costs))
    # NOTE(review): the "theoretical max" is simply the mean unit cost here — confirm intent.
    theoretical_max = float(np.mean(costs))
    if metrics.units_traded > 0:
        realized_premium = float((metrics.revenue - metrics.cost) / metrics.units_traded)
    else:
        realized_premium = 0.0
    # guard the ratio against zero reference prices, as the other modules do
    price_var = float(np.var(prices / (refs + EPS)))
    coi_leakage = float(contamination * (coi_level + price_var))
    erosion_rate = float(contamination * coi_level / (theoretical_max + EPS))
    return COIMetrics(coi_level=coi_level, coi_leakage=coi_leakage, realized_premium=realized_premium,
                      theoretical_max=theoretical_max, erosion_rate=erosion_rate)
def compute_attribution(logs: StepLogs, metrics: StepMetrics, contamination: float = 0.2,
                        agent_propensity_threshold: float = 0.05) -> RevenueAttribution:
    """Split step revenue and conversions between humans and agents.

    Executions carry no actor label here, so this is a heuristic: an execution
    whose ``propensity`` is below ``agent_propensity_threshold`` is counted as
    an agent fill, everything else as a human fill.

    Args:
        logs: step logs; ``executions`` and ``aggregates['n_arrivals']`` are read.
        metrics: step metrics; only ``revenue`` is read.
        contamination: assumed agent share of arrivals, used to size the two
            populations for the conversion rates. The default 0.2 reproduces
            the previous hard-coded 0.8 / 0.2 split.
        agent_propensity_threshold: propensity cut-off below which a fill is
            attributed to an agent.

    Returns:
        RevenueAttribution; only ``total_revenue`` is set when there are no
        execution logs.
    """
    if logs.executions is None:
        return RevenueAttribution(total_revenue=metrics.revenue)

    human_rev, agent_rev, human_cnt, agent_cnt = 0.0, 0.0, 0, 0
    for exe in logs.executions:
        value = exe.price * exe.size_filled
        if exe.propensity < agent_propensity_threshold:
            agent_rev += value
            agent_cnt += 1
        else:
            human_rev += value
            human_cnt += 1

    total_exp = logs.aggregates.get('n_arrivals', 1)
    human_share, agent_share = 1.0 - contamination, contamination
    # an empty population has no conversion rate; avoid dividing a count by EPS alone
    human_conv = human_cnt / (total_exp * human_share + EPS) if human_share > 0 else 0.0
    agent_conv = agent_cnt / (total_exp * agent_share + EPS) if agent_share > 0 else 0.0
    return RevenueAttribution(
        total_revenue=metrics.revenue, human_revenue=human_rev, agent_revenue=agent_rev,
        human_conversion=human_conv, agent_conversion=agent_conv)
class COIObjective(BaseObjective):
    """Cost of Information penalty from Definition 1.

    COI(π) = E[P] - p_min

    The expected price premium over marginal cost represents the platform's
    pricing power. Agent reconnaissance erodes this by revealing the price
    distribution to buyers.

    Implemented as COI_leakage = α · InfoValue, where α is the contamination
    read from the hidden state and InfoValue is one of two surrogates (see
    ``use_revelation``).
    """

    def __init__(self, lambda_coi: float = 1.0, use_revelation: bool = False):
        """
        Args:
            lambda_coi: Weight on COI penalty
            use_revelation: If True, InfoValue is the mean absolute relative
                deviation of the quoted prices from the reference prices (a
                surrogate for price "surprise"); otherwise a constant 1.0 per
                unit of contamination (query tax).
        """
        self.lambda_coi = lambda_coi
        self.use_revelation = use_revelation

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return the (non-positive) penalty -λ · α · InfoValue as a plain float."""
        alpha = hidden.contamination
        if self.use_revelation:
            # revelation surrogate: prices far from reference reveal more about the policy
            deviation = np.abs(quote.prices - instruments.refs) / (instruments.refs + EPS)
            info_value = float(np.mean(deviation))
        else:
            # query-tax surrogate: each agent query incurs constant leakage
            info_value = 1.0
        return float(-self.lambda_coi * (alpha * info_value))

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return the penalty together with the contamination and mean relative margin."""
        margins = (quote.prices - instruments.costs) / (instruments.costs + EPS)
        return {
            'coi_penalty': self.reward(quote, instruments, metrics, hidden, obs),
            'contamination': hidden.contamination,
            'avg_margin': float(np.mean(margins)),
        }
class RobustStackelbergObjective(BaseObjective):
    """Implements the Maximin Objective from thesis Eq 23.

    π* = argmax_π min_{Q ∈ U_ε(P̂_N)} E_d~Q[R(p,d) - λ·COI(p)]

    The reward is profit minus five penalties:
      1. COI penalty for information leakage to agents (λ_coi · margin · α)
      2. price-volatility penalty
      3. inventory/position cost
      4. lost-opportunity (stockout) cost
      5. robustness penalty approximating the min over the ambiguity set U_ε
         by the worst-case contamination inside the ε-ball

    ``cfg.lambda_ux`` is accepted by the config but not used by this class.
    """

    # weight on metrics.lost_opportunity (stockouts)
    LOST_OPPORTUNITY_WEIGHT = 0.1

    def __init__(self, cfg: RobustObjectiveConfig | None = None):
        self.cfg = cfg or RobustObjectiveConfig()

    def _terms(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState) -> dict[str, float]:
        """Compute profit and every penalty once, so reward and breakdown cannot drift apart."""
        cfg = self.cfg
        alpha = hidden.contamination
        avg_margin = float(np.mean(quote.prices - instruments.costs))
        # worst-case contamination within the ε-ball, capped at 1
        worst_case_alpha = min(alpha + cfg.wasserstein_epsilon, 1.0)
        return {
            'profit': metrics.revenue - metrics.cost,
            # high margins + high contamination = high leakage
            'coi': cfg.lambda_coi * avg_margin * alpha,
            'volatility': cfg.lambda_volatility * metrics.volatility,
            'position': cfg.gamma_inventory * metrics.position_cost,
            'lost': self.LOST_OPPORTUNITY_WEIGHT * metrics.lost_opportunity,
            'robustness': cfg.wasserstein_epsilon * avg_margin * worst_case_alpha,
            'avg_margin': avg_margin,
        }

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return profit minus the COI, volatility, position, lost and robustness penalties."""
        c = self._terms(quote, instruments, metrics, hidden)
        return c['profit'] - c['coi'] - c['volatility'] - c['position'] - c['lost'] - c['robustness']

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return each reward component; penalties are reported as negative numbers."""
        c = self._terms(quote, instruments, metrics, hidden)
        return {
            'revenue': metrics.revenue,
            'cost': metrics.cost,
            'profit': c['profit'],
            'coi_penalty': -c['coi'],
            'volatility_penalty': -c['volatility'],
            'position_penalty': -c['position'],
            'lost_penalty': -c['lost'],
            'robustness_penalty': -c['robustness'],
            'contamination': hidden.contamination,
            'avg_margin_pct': c['avg_margin'] / (float(np.mean(instruments.costs)) + EPS),
        }
class AdaptiveObjective(BaseObjective):
    """Objective that adapts the COI weight to the contamination level.

    When contamination is low, focus on revenue maximization.
    When contamination is high, increase COI defense weight.
    """

    def __init__(self, base_lambda_coi: float = 0.3, max_lambda_coi: float = 2.0,
                 adaptation_rate: float = 2.0, midpoint: float = 0.3):
        """
        Args:
            base_lambda_coi: COI weight approached as contamination falls far below ``midpoint``.
            max_lambda_coi: COI weight approached as contamination rises far above ``midpoint``.
            adaptation_rate: steepness of the sigmoid transition.
            midpoint: contamination level at which the weight is halfway between
                base and max (previously hard-coded as 0.3).
        """
        self.base_lambda = base_lambda_coi
        self.max_lambda = max_lambda_coi
        self.rate = adaptation_rate
        self.midpoint = midpoint

    def _adaptive_lambda(self, alpha: float) -> float:
        """Return λ(α) = base + (max - base) · sigmoid(rate · (α - midpoint))."""
        from ...outlet.math_util import sigmoid
        scale = sigmoid(self.rate * (alpha - self.midpoint))
        return self.base_lambda + (self.max_lambda - self.base_lambda) * scale

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return profit minus the adaptively weighted COI penalty λ(α) · mean margin · α."""
        alpha = hidden.contamination
        lambda_coi = self._adaptive_lambda(alpha)

        profit = metrics.revenue - metrics.cost
        margins = quote.prices - instruments.costs
        coi_penalty = lambda_coi * float(np.mean(margins)) * alpha

        return profit - coi_penalty

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return profit, the current adaptive weight and the contamination level."""
        alpha = hidden.contamination
        return {
            'profit': metrics.revenue - metrics.cost,
            'adaptive_lambda': self._adaptive_lambda(alpha),
            'contamination': alpha,
        }
alpha = hidden.contamination + return { + 'profit': metrics.revenue - metrics.cost, + 'adaptive_lambda': self._adaptive_lambda(alpha), + 'contamination': alpha, + } + +def make_thesis_objective(lambda_coi: float = 0.5, lambda_ux: float = 0.1, + lambda_vol: float = 0.2) -> CompositeObjective: + """Create the standard thesis objective composition.""" + return CompositeObjective([ + (RobustStackelbergObjective(RobustObjectiveConfig( + lambda_coi=lambda_coi, lambda_ux=lambda_ux, lambda_volatility=lambda_vol)), 1.0), + ]) diff --git a/lab/case/thesis/platform.py b/lab/case/thesis/platform.py new file mode 100644 index 0000000..ec00da5 --- /dev/null +++ b/lab/case/thesis/platform.py @@ -0,0 +1,176 @@ +"""Thesis platform with real MDP behavioral models and separability scoring.""" +from __future__ import annotations +from dataclasses import dataclass +from pathlib import Path +import numpy as np +from ...outlet import (Platform, PlatformConfig, PositionModel, PositionConfig, + PostedPriceMechanism, make_instruments, InstrumentType, LogLevel) +from ...outlet.mechanisms.posted_price import PostedPriceConfig +from ...outlet.observation import DefaultObservationBuilder, ObservationConfig +from .arrivals import ContaminatedArrivalModel, ContaminatedArrivalConfig +from .execution import HybridExecutionModel, HybridExecutionConfig +from .objectives import RobustStackelbergObjective, RobustObjectiveConfig + + +@dataclass +class ThesisConfig: + # instruments + n_instruments: int = 10 + cost_range: tuple[float, float] = (5.0, 50.0) + margin_range: tuple[float, float] = (0.2, 0.5) + + # contamination (Section 3.1) + alpha_contamination: float = 0.2 + alpha_drift: float = 0.0 + alpha_bounds: tuple[float, float] = (0.0, 0.5) + + # objectives (Eq 23) + lambda_coi: float = 0.5 + lambda_ux: float = 0.1 + lambda_volatility: float = 0.2 + wasserstein_epsilon: float = 0.1 + + # arrivals + sessions_per_step: int = 30 + human_views_range: tuple[int, int] = (1, 4) + agent_views_range: 
def _resolve_data_dirs(cfg: ThesisConfig) -> tuple[str, str]:
    """Resolve the human and agent data directories for behavioral models.

    The repository-relative locations ``experiments/collected_data`` and
    ``experiments/agents/collected_data`` (four directory levels above this
    file) are used when:

    * the configured value is empty or ``None``, or
    * the configured directory does not exist on this machine but the
      repository-relative one does. The config defaults are absolute paths
      from one developer's checkout, so without this the fallback never
      applied on any other machine.

    In every other case the configured value is returned unchanged, including
    when neither directory exists, so downstream errors still name the path
    the user asked for.

    Args:
        cfg: Object exposing ``human_data_dir`` and ``agent_data_dir``.

    Returns:
        ``(human_dir, agent_dir)``.
    """
    base = Path(__file__).parent.parent.parent.parent / "experiments"

    def _pick(configured: str | None, fallback: Path) -> str:
        # Same result as the original ``configured or str(fallback)`` for
        # empty values.
        if not configured:
            return str(fallback)
        # Only override an explicit setting when it is unusable AND the
        # repository copy is actually present.
        if not Path(configured).expanduser().is_dir() and fallback.is_dir():
            return str(fallback)
        return configured

    human = _pick(cfg.human_data_dir, base / "collected_data")
    agent = _pick(cfg.agent_data_dir, base / "agents/collected_data")
    return human, agent
+ + Implements: + - Contaminated arrivals using learned MDP kernels from behavior_loader + - Hybrid execution with real separability scoring from lib.separability + - Robust Stackelberg objective (Eq 23) + """ + cfg = cfg or ThesisConfig() + rng = np.random.default_rng(cfg.seed) + human_dir, agent_dir = _resolve_data_dirs(cfg) + + instruments = make_instruments( + n=cfg.n_instruments, cost_range=cfg.cost_range, margin_range=cfg.margin_range, + inst_type=InstrumentType.SKU, rng=rng) + instruments.position = np.full(cfg.n_instruments, cfg.initial_inventory) + + arrival = ContaminatedArrivalModel(ContaminatedArrivalConfig( + base_rate=cfg.sessions_per_step, + alpha_contamination=cfg.alpha_contamination, + alpha_drift=cfg.alpha_drift, + alpha_bounds=cfg.alpha_bounds, + human_views_range=cfg.human_views_range, + agent_views_range=cfg.agent_views_range, + use_real_behavior=cfg.use_real_behavior, + human_data_dir=human_dir, + agent_data_dir=agent_dir, + )) + + execution = HybridExecutionModel(HybridExecutionConfig( + use_separability=cfg.use_separability, + )) + + mechanism = PostedPriceMechanism(PostedPriceConfig(max_delta_pct=0.15, min_margin_pct=0.05)) + position = PositionModel(PositionConfig(initial_position=cfg.initial_inventory, holding_cost_rate=cfg.holding_cost_rate)) + + market = None + objective = RobustStackelbergObjective(RobustObjectiveConfig( + lambda_coi=cfg.lambda_coi, lambda_ux=cfg.lambda_ux, + lambda_volatility=cfg.lambda_volatility, wasserstein_epsilon=cfg.wasserstein_epsilon)) + + obs_builder = DefaultObservationBuilder(ObservationConfig(mask_true_demand=True)) + platform_cfg = PlatformConfig(n_instruments=cfg.n_instruments, max_steps=cfg.max_steps, + seed=cfg.seed, log_level=cfg.log_level, mask_demand=True) + + return Platform(instruments=instruments, mechanism=mechanism, arrival=arrival, execution=execution, + position=position, market=market, obs_builder=obs_builder, objective=objective, cfg=platform_cfg) + + +@dataclass +class 
def make_ablation_platform(cfg: AblationConfig) -> Platform:
    """Build a thesis platform with the requested components switched off.

    Each ``disable_*`` flag on ``cfg`` overrides the matching setting:

    * ``disable_coi_penalty``   -> ``lambda_coi = 0.0``
    * ``disable_ux_penalty``    -> ``lambda_ux = 0.0``
    * ``disable_contamination`` -> ``alpha_contamination = 0.0``
    * ``disable_real_behavior`` -> ``use_real_behavior = False`` and
      ``use_separability = False``

    The overrides are applied to a copy, so ``cfg`` is left untouched and can
    be reused or compared across ablation runs. Previously the caller's
    object was mutated in place.

    Args:
        cfg: Ablation configuration.

    Returns:
        The platform produced by ``make_thesis_platform`` for the adjusted
        configuration.
    """
    from dataclasses import replace  # local: the module only imports `dataclass`

    overrides: dict[str, object] = {}
    if cfg.disable_coi_penalty:
        overrides['lambda_coi'] = 0.0
    if cfg.disable_ux_penalty:
        overrides['lambda_ux'] = 0.0
    if cfg.disable_contamination:
        overrides['alpha_contamination'] = 0.0
    if cfg.disable_real_behavior:
        overrides['use_real_behavior'] = False
        overrides['use_separability'] = False
    return make_thesis_platform(replace(cfg, **overrides))
def demo_basic_simulation():
    """Run one seeded episode with randomly perturbed reference prices.

    Prints the instrument setup, a progress row every 20 steps, and a final
    summary of total reward and COI. The price perturbation is drawn from a
    generator seeded with ``cfg.seed``, so repeated runs produce the same
    actions; the previous version used the unseeded global ``np.random``.
    """
    print("=" * 70)
    print("THESIS SIMULATION: Contaminated Dynamic Pricing (Real MDP Kernels)")
    print("=" * 70)

    cfg = ThesisConfig(n_instruments=5, alpha_contamination=0.3, lambda_coi=0.5,
                       max_steps=100, seed=42, use_real_behavior=True)
    platform = make_thesis_platform(cfg)
    rng = np.random.default_rng(cfg.seed)

    print(f"\nInstruments: {platform.instruments.n}")
    print(f"Reference prices: {platform.instruments.refs.round(2)}")
    print(f"Costs: {platform.instruments.costs.round(2)}")
    print(f"Initial contamination alpha={cfg.alpha_contamination}")
    print(f"Using real behavior: {cfg.use_real_behavior}")

    platform.reset(seed=cfg.seed)
    total_reward, coi_history = 0.0, []

    print(f"\n{'Step':>5} {'Reward':>10} {'PnL':>10} {'COI':>8} {'alpha':>6} {'Conv':>8}")
    print("-" * 55)

    for t in range(cfg.max_steps):
        # Quote each instrument between 95% and 115% of its reference price.
        action = platform.instruments.refs * rng.uniform(0.95, 1.15, size=platform.instruments.n)
        result = platform.step(action)
        total_reward += result.reward
        coi = compute_coi(platform._quote, platform.instruments, result.metrics, result.hidden.contamination)
        coi_history.append(coi.coi_level)

        if t % 20 == 0:
            print(f"{t:5d} {result.reward:10.2f} {result.metrics.pnl:10.2f} "
                  f"{coi.coi_level:8.2f} {result.hidden.contamination:6.2f} {result.metrics.conversion:8.3f}")

        # Same stop condition the rollout helper in lab.experiments.eval uses.
        if result.terminated or result.truncated:
            break

    print("-" * 55)
    print(f"Total Reward: {total_reward:.2f}")
    if coi_history:  # empty only if max_steps is 0
        print(f"Average COI: {np.mean(coi_history):.2f}")
        print(f"COI Trend: {coi_history[-1] - coi_history[0]:+.2f}")
def demo_session_analysis():
    """Analyze session-level behavior from MDP trajectories.

    Runs a 50-step episode with full logging at a fixed 10% markup over
    reference prices, counts human and agent sessions per step via
    ``compute_separability``, and prints the observed agent share next to the
    configured contamination. If no sessions were logged, the share lines are
    skipped instead of dividing by zero.
    """
    print("\n" + "=" * 70)
    print("EXPERIMENT: Session Analysis (Ground Truth)")
    print("=" * 70)

    from lab.outlet.constants import LogLevel
    cfg = ThesisConfig(n_instruments=5, alpha_contamination=0.3, max_steps=50,
                       log_level=LogLevel.FULL, seed=42, use_real_behavior=True)
    platform = make_thesis_platform(cfg)

    platform.reset(seed=cfg.seed)
    human_sessions, agent_sessions = 0, 0

    for _ in range(cfg.max_steps):
        action = platform.instruments.refs * 1.1
        result = platform.step(action)
        sep = compute_separability(result.logs, result.hidden.contamination)
        human_sessions += sep.n_human_sessions
        agent_sessions += sep.n_agent_sessions

    total = human_sessions + agent_sessions
    print(f"\nTotal sessions: {total}")
    if total == 0:
        print("No sessions were logged; contamination shares are undefined.")
        return
    print(f"Human sessions: {human_sessions} ({100*human_sessions/total:.1f}%)")
    print(f"Agent sessions: {agent_sessions} ({100*agent_sessions/total:.1f}%)")
    print(f"True contamination: {cfg.alpha_contamination:.1%}")
    print(f"Observed contamination: {agent_sessions/total:.1%}")
@dataclass
class RetailConfig:
    """Settings consumed by ``make_retail_platform`` (retail dynamic pricing)."""

    # Number of products to price.
    n_instruments: int = 10
    # (min, max) for random product costs.
    cost_range: tuple[float, float] = (5.0, 50.0)
    # (min, max) for random initial margins.
    margin_range: tuple[float, float] = (0.2, 0.5)
    # Starting inventory per product.
    initial_inventory: float = 100.0
    # Cost per unit per step for holding inventory.
    holding_cost_rate: float = 0.002
    # Number of browsing sessions per step.
    sessions_per_step: int = 30
    # Fraction of sessions that are scrapers.
    contamination: float = 0.1
    # Maximum episode length.
    max_steps: int = 500
    # Random seed for reproducibility; None leaves the run unseeded.
    seed: int | None = None
def make_market_making_platform(cfg: MarketMakingConfig | None = None) -> Platform:
    """Assemble the market-making platform described by ``cfg``.

    Wiring:
        - Mechanism: TwoSidedMechanism with a default TwoSidedConfig
        - Arrivals: HawkesArrivalModel with ``base_rate=cfg.base_arrival_rate``
        - Execution: IntensityExecutionModel with a default IntensityConfig
        - Position: starts flat, limited to [-500, 500], no holding cost
        - Market: GBMMarketModel driven by ``cfg.mu`` / ``cfg.sigma`` and
          started from the instruments' reference prices
        - Objective: ``market_making_objective(gamma=cfg.gamma, sigma=cfg.sigma)``

    Args:
        cfg: Scenario settings; a default ``MarketMakingConfig`` is used when
            None.

    Returns:
        The configured Platform instance.
    """
    if cfg is None:
        cfg = MarketMakingConfig()

    n_assets = cfg.n_instruments
    rng = np.random.default_rng(cfg.seed)

    # The +/-10% band around the initial mid goes in the slot the retail
    # factory uses for cost_range, with a zero margin range.
    mid_band = (cfg.initial_mid*0.9, cfg.initial_mid*1.1)
    zero_margin = (0.0, 0.0)
    assets = make_instruments(n_assets, mid_band, zero_margin, InstrumentType.ASSET, rng)
    assets.position = np.zeros(n_assets)

    # holding_cost_rate is zero: use inventory risk penalty instead
    flat_book = PositionConfig(
        initial_position=0.0, min_position=-500, max_position=500,
        holding_cost_rate=0.0)
    run_cfg = PlatformConfig(n_instruments=n_assets, max_steps=cfg.max_steps,
                             seed=cfg.seed, log_level=LogLevel.AGG_ONLY)

    return Platform(
        instruments=assets,
        mechanism=TwoSidedMechanism(TwoSidedConfig()),
        arrival=HawkesArrivalModel(HawkesArrivalConfig(base_rate=cfg.base_arrival_rate)),
        execution=IntensityExecutionModel(IntensityConfig()),
        position=PositionModel(flat_book),
        market=GBMMarketModel(GBMMarketConfig(mu=cfg.mu, sigma=cfg.sigma),
                              initial=assets.refs),
        objective=market_making_objective(gamma=cfg.gamma, sigma=cfg.sigma),
        cfg=run_cfg,
    )
+BUILDDIR = _build + +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/lab/docs/conf.py b/lab/docs/conf.py new file mode 100644 index 0000000..0e39351 --- /dev/null +++ b/lab/docs/conf.py @@ -0,0 +1,39 @@ +import os +import sys +sys.path.insert(0, os.path.abspath('../..')) + +project = 'Quote-Control Simulator' +copyright = '2025, PHANTOM Research' +author = 'PHANTOM Research' +release = '0.1.0' + +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'sphinx.ext.intersphinx', + 'sphinx.ext.autosummary', +] + +templates_path = ['_templates'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +html_theme = 'alabaster' +html_static_path = ['_static'] + +autodoc_default_options = { + 'members': True, + 'undoc-members': True, + 'show-inheritance': True, +} + +napoleon_google_docstring = True +napoleon_numpy_docstring = True +napoleon_include_init_with_doc = True + +intersphinx_mapping = { + 'python': ('https://docs.python.org/3', None), + 'numpy': ('https://numpy.org/doc/stable/', None), +} + +autosummary_generate = True diff --git a/lab/docs/index.rst b/lab/docs/index.rst new file mode 100644 index 0000000..bd36ecd --- /dev/null +++ b/lab/docs/index.rst @@ -0,0 +1,40 @@ +Quote-Control Simulator +======================= + +Research-grade platform for dynamic pricing and market making experiments. + +The platform abstracts pricing as: **Quote → Arrival → Execution → Position** + +Supports multiple mechanisms: + +* **PostedPrice**: retail dynamic pricing +* **TwoSided**: market making with bid-ask spreads +* **Auction**: reserve/shading for auction settings + +Quick Start +----------- + +.. 
code-block:: python + + from lab.config import make_retail_platform + from lab.experiments import rollout, fixed_price_policy + + platform = make_retail_platform() + policy = fixed_price_policy(platform.instruments.refs) + result = rollout(platform, policy, n_steps=100) + print(f"Total PnL: {result.total_pnl:.2f}") + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + system_overview + modules/outlet + modules/population + modules/experiments + +Indices +------- + +* :ref:`genindex` +* :ref:`modindex` diff --git a/lab/docs/modules/experiments.rst b/lab/docs/modules/experiments.rst new file mode 100644 index 0000000..c71ee36 --- /dev/null +++ b/lab/docs/modules/experiments.rst @@ -0,0 +1,14 @@ +Experiments +=========== + +Evaluation & OPE +---------------- + +.. automodule:: lab.experiments.eval + :members: + +Configuration +------------- + +.. automodule:: lab.config + :members: diff --git a/lab/docs/modules/outlet.rst b/lab/docs/modules/outlet.rst new file mode 100644 index 0000000..9f3b8c3 --- /dev/null +++ b/lab/docs/modules/outlet.rst @@ -0,0 +1,77 @@ +Outlet (Core Simulator) +======================= + +Types +----- + +.. automodule:: lab.outlet.types + :members: + +Constants +--------- + +.. automodule:: lab.outlet.constants + :members: + +Protocols +--------- + +.. automodule:: lab.outlet.protocols + :members: + +Platform +-------- + +.. automodule:: lab.outlet.platform + :members: + +Stock & Position +---------------- + +.. automodule:: lab.outlet.stock + :members: + +Observation +----------- + +.. automodule:: lab.outlet.observation + :members: + +Mechanisms +---------- + +Posted Price +~~~~~~~~~~~~ + +.. automodule:: lab.outlet.mechanisms.posted_price + :members: + +Two-Sided (Market Making) +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: lab.outlet.mechanisms.two_sided + :members: + +Auction +~~~~~~~ + +.. automodule:: lab.outlet.mechanisms.auction + :members: + +Objectives +---------- + +.. automodule:: lab.outlet.objectives.base + :members: + +.. 
automodule:: lab.outlet.objectives.penalties + :members: + +.. automodule:: lab.outlet.objectives.factory + :members: + +Math Utilities +-------------- + +.. automodule:: lab.outlet.math_util + :members: diff --git a/lab/docs/modules/population.rst b/lab/docs/modules/population.rst new file mode 100644 index 0000000..0b7ef75 --- /dev/null +++ b/lab/docs/modules/population.rst @@ -0,0 +1,20 @@ +Population Models +================= + +Arrival Models +-------------- + +.. automodule:: lab.population.arrivals + :members: + +Execution Models +---------------- + +.. automodule:: lab.population.execution + :members: + +Competitor / Market Models +-------------------------- + +.. automodule:: lab.population.competitors + :members: diff --git a/lab/docs/system_overview.rst b/lab/docs/system_overview.rst new file mode 100644 index 0000000..3fda8ad --- /dev/null +++ b/lab/docs/system_overview.rst @@ -0,0 +1,97 @@ +System Overview +=============== + +The simulator organises dynamic pricing and market-making experiments as a +closed loop with the following stages: + +* **Quote** – a policy or agent emits a :class:`lab.outlet.types.Quote`. The + quote is normalised and validated by a concrete + :class:`lab.outlet.protocols.Mechanism` implementation + (posted-price, two-sided, auction). +* **Arrival** – a :class:`lab.outlet.protocols.ArrivalModel` samples a stream of + :class:`lab.outlet.types.Opportunity` objects given the current time, + instrument catalogue, and market state. +* **Execution** – the :class:`lab.outlet.protocols.ExecutionModel` converts an + opportunity into a probabilistic fill using the active quote, optional + competitor prices, and demand-side context. +* **Position** – a :class:`lab.outlet.protocols.PositionModel` enforces + inventory or position constraints, censors oversized fills, and accrues + holding and shortage costs. 
+* **Observation & Reward** – the + :class:`lab.outlet.protocols.ObservationBuilder` constructs the censored view + exposed to the agent, while a :class:`lab.outlet.protocols.Objective` + transforms :class:`lab.outlet.types.StepMetrics` into a scalar reward with an + optional breakdown per term. + +These components are orchestrated by :class:`lab.outlet.platform.Platform`, +which manages internal hidden state, deterministic seeding, and logging. + +Component Matrix +---------------- + +=============================== ============================================== +Layer Responsibilities / Examples +=============================== ============================================== +Mechanisms Quote normalisation, execution semantics + (`posted_price`, `two_sided`, `auction`). +Population models Arrivals (:mod:`lab.population.arrivals`), + execution probability models + (:mod:`lab.population.execution`), and + competitor or market dynamics + (:mod:`lab.population.competitors`). +Position management Inventory limits, replenishment, holding and + shortage costs (:mod:`lab.outlet.stock`). +Observation & logging Censored observations and optional event logs + (:mod:`lab.outlet.observation`). +Objectives Reward composition utilities + (:mod:`lab.outlet.objectives`). +Experiments Rollout helpers, baseline policies, off-policy + evaluation (:mod:`lab.experiments.eval`). +=============================== ============================================== + +Preconfigured Platforms +----------------------- + +Two high-level factories in :mod:`lab.config` wire common combinations of the +building blocks: + +* **Retail dynamic pricing** – posted-price mechanism, session arrivals with + contamination, elasticity-based executions, reactive competitor model, and a + composite objective that penalises volatility, holding costs, and lost + opportunities. 
+* **Market making** – two-sided quoting, Hawkes order flow, intensity-based + executions, geometric Brownian motion mid-prices, and an objective combining + PnL, spread capture, and quadratic inventory risk. + +State & Reset Behaviour +----------------------- + +When you call :meth:`lab.outlet.platform.Platform.reset`, the platform resets +instrument positions, quotes, and hidden state, but component implementations +may maintain their own internal buffers that ``reset`` does not clear. For +reproducible experiments: + +* Instantiate fresh arrival/market models for each episode, or add explicit + ``reset`` methods if the model keeps history (for example, + :class:`lab.population.arrivals.HawkesArrivalModel` maintains an event + history, while :class:`lab.population.competitors.ReactiveCompetitorModel` + tracks prior competitor quotes). +* Seed randomness through the factory configuration (``RetailConfig.seed`` or + ``MarketMakingConfig.seed``) or pass a seed to ``Platform.reset`` for + deterministic rollouts. + +Extending the Platform +---------------------- + +To support a new domain: + +1. Create custom Mechanism/Arrival/Execution/Market/Observation components by + implementing the respective protocol in :mod:`lab.outlet.protocols`. +2. Compose a new objective with + :func:`lab.outlet.objectives.factory.make_composite` or write a bespoke + :class:`lab.outlet.objectives.base.BaseObjective`. +3. Wire everything together via :class:`lab.outlet.platform.Platform` directly + or expose a helper factory in :mod:`lab.config`. + +Use :func:`lab.experiments.rollout` and +:func:`lab.experiments.compare_policies` to benchmark candidate policies under +multiple random seeds, collecting per-step logs for analysis or OPE. 
@dataclass
class RolloutResult:
    """Results from a policy rollout.

    Attributes:
        rewards: Per-step rewards
        metrics: Per-step StepMetrics objects
        logs: Per-step StepLogs objects
        total_reward: Sum of rewards
        total_pnl: Sum of PnL from metrics
        avg_conversion: Average conversion rate (0.0 when no step was run)
    """
    rewards: list[float]
    metrics: list[Any]
    logs: list[StepLogs]
    total_reward: float
    total_pnl: float
    avg_conversion: float


def rollout(platform: Platform, policy: Policy, n_steps: int, seed: int | None = None) -> RolloutResult:
    """Execute a policy on the platform for up to ``n_steps`` steps.

    The platform is reset with ``seed`` first. The loop stops early as soon
    as a step reports ``terminated`` or ``truncated``.

    Args:
        platform: The simulation platform
        policy: Function (obs, t) -> (action, propensity)
        n_steps: Maximum number of steps to run
        seed: Random seed passed to ``platform.reset``

    Returns:
        RolloutResult with per-step rewards, metrics and logs plus their
        totals. When no step is executed (``n_steps <= 0``) the totals are 0
        and ``avg_conversion`` is 0.0 rather than NaN.
    """
    result = platform.reset(seed)
    rewards, metrics, logs = [], [], []

    for t in range(n_steps):
        action, propensity = policy(result.obs.to_flat(), t)
        result = platform.step(action, propensity)
        rewards.append(result.reward)
        metrics.append(result.metrics)
        logs.append(result.logs)
        if result.terminated or result.truncated:
            break

    # np.mean([]) would emit a RuntimeWarning and return NaN, which then
    # poisons any mean taken over several runs.
    avg_conversion = float(np.mean([m.conversion for m in metrics])) if metrics else 0.0

    return RolloutResult(
        rewards=rewards, metrics=metrics, logs=logs,
        total_reward=sum(rewards),
        total_pnl=sum(m.pnl for m in metrics),
        avg_conversion=avg_conversion,
    )
@dataclass
class OPEResult:
    """Results from off-policy evaluation.

    Attributes:
        ips_estimate: Inverse Propensity Scoring estimate
        snips_estimate: Self-normalized IPS estimate (more stable)
        n_samples: Number of samples used
        effective_samples: Effective sample size (accounts for variance)
    """
    ips_estimate: float
    snips_estimate: float
    n_samples: int
    effective_samples: float


def compute_ips(logs: list[StepLogs], rewards: list[float],
                target_policy: Policy, behavior_propensities: list[float] | None = None) -> OPEResult:
    """Compute IPS and SNIPS estimators for off-policy evaluation.

    Each step gets the importance weight ``1 / (behavior_propensity + 1e-8)``:
    the target policy is assumed deterministic (target propensity 1.0),
    because evaluating it would require reconstructing observations.

    Args:
        logs: Step logs. Only read when ``behavior_propensities`` is None; a
            step's propensity is then the mean ``propensity`` over its
            ``executions``, or 1.0 when it has none.
        rewards: Observed rewards from the behavior policy, one per step.
        target_policy: Policy to evaluate (currently unused, see above).
        behavior_propensities: Per-step propensities that override the ones
            in ``logs``. ``logs`` may be empty when this is given.

    Returns:
        OPEResult with IPS, SNIPS, sample count and effective sample size.
        With no samples both estimates are NaN and the effective sample size
        is 0.0.

    Raises:
        ValueError: If ``rewards`` is not one-dimensional or its length
            differs from the number of propensities.
    """
    if behavior_propensities is None:
        behavior_propensities = [
            float(np.mean([e.propensity for e in log.executions])) if log.executions else 1.0
            for log in logs
        ]

    propensities = np.asarray(behavior_propensities, dtype=float)
    reward_arr = np.asarray(rewards, dtype=float)
    if reward_arr.ndim != 1 or propensities.shape != reward_arr.shape:
        raise ValueError(
            f"need one propensity per reward: got propensities with shape "
            f"{propensities.shape} and rewards with shape {reward_arr.shape}"
        )

    n_samples = int(reward_arr.size)
    if n_samples == 0:
        # Nothing to average; report "undefined" without a 0/0 warning.
        return OPEResult(ips_estimate=float('nan'), snips_estimate=float('nan'),
                         n_samples=0, effective_samples=0.0)

    target_propensity = 1.0  # assume deterministic target
    weights = target_propensity / (propensities + 1e-8)
    weighted_sum = float(np.sum(weights * reward_arr))
    weight_sum = float(np.sum(weights))

    ips = weighted_sum / n_samples
    snips = weighted_sum / (weight_sum + 1e-8)
    ess = weight_sum ** 2 / (float(np.sum(weights ** 2)) + 1e-8)

    return OPEResult(ips_estimate=ips, snips_estimate=snips,
                     n_samples=n_samples, effective_samples=ess)
+ + Args: + platform: Simulation platform + policies: Dict mapping policy names to policy functions + n_steps: Steps per rollout + n_runs: Number of rollouts per policy (different seeds) + seed: Base random seed + + Returns: + Dict mapping policy names to result dicts with mean/std statistics + """ + results = {} + for name, policy in policies.items(): + run_results = [] + for i in range(n_runs): + r = rollout(platform, policy, n_steps, seed=seed + i) + run_results.append(r) + + results[name] = { + 'mean_reward': np.mean([r.total_reward for r in run_results]), + 'std_reward': np.std([r.total_reward for r in run_results]), + 'mean_pnl': np.mean([r.total_pnl for r in run_results]), + 'mean_conversion': np.mean([r.avg_conversion for r in run_results]), + } + return results diff --git a/lab/outlet/__init__.py b/lab/outlet/__init__.py new file mode 100644 index 0000000..11a8d76 --- /dev/null +++ b/lab/outlet/__init__.py @@ -0,0 +1,17 @@ +from .constants import Side, MechanismType, InstrumentType, OpportunityType, EventType, LogLevel +from .types import (Instrument, InstrumentSet, Quote, Opportunity, Execution, + StepEvent, StepLogs, StepMetrics, MarketState, HiddenState, Observation, StepResult) +from .stock import PositionModel, PositionConfig, make_instruments +from .platform import Platform, PlatformConfig +from .observation import DefaultObservationBuilder, ObservationConfig +from .mechanisms import PostedPriceMechanism, TwoSidedMechanism, AuctionMechanism + +__all__ = [ + 'Side', 'MechanismType', 'InstrumentType', 'OpportunityType', 'EventType', 'LogLevel', + 'Instrument', 'InstrumentSet', 'Quote', 'Opportunity', 'Execution', + 'StepEvent', 'StepLogs', 'StepMetrics', 'MarketState', 'HiddenState', 'Observation', 'StepResult', + 'PositionModel', 'PositionConfig', 'make_instruments', + 'Platform', 'PlatformConfig', + 'DefaultObservationBuilder', 'ObservationConfig', + 'PostedPriceMechanism', 'TwoSidedMechanism', 'AuctionMechanism', +] diff --git 
class Side(Enum):
    """Which side initiated a transaction."""
    BUY = auto()   # buyer-initiated: customer purchase, market buy order
    SELL = auto()  # seller-initiated: market sell order, short sale


class MechanismType(Enum):
    """How a quote is turned into executions."""
    POSTED_PRICE = auto()     # one posted price per instrument (retail dynamic pricing)
    TWO_SIDED_QUOTE = auto()  # bid-ask quoting (market making, liquidity provision)
    AUCTION = auto()          # reserve price or bid shading (ad auctions, marketplaces)


class InstrumentType(Enum):
    """Kind of instrument being priced."""
    SKU = auto()           # retail product with inventory constraints
    ASSET = auto()         # financial instrument with position limits
    LOAN = auto()          # credit product priced by interest rate
    SUBSCRIPTION = auto()  # recurring service with periodic fees


class OpportunityType(Enum):
    """Kind of arrival the platform can receive."""
    SESSION = auto()       # retail browsing session with potential purchase intent
    MARKET_ORDER = auto()  # financial market order arrival (buy or sell)
    REQUEST = auto()       # service or credit request requiring a quote response
class QuoteGymEnv:
    """Gymnasium-style wrapper around a Platform.

    Actions are per-instrument multipliers applied to the platform's
    reference prices; the declared action space is a Box over [0.5, 2.0].
    Observations are the platform observation flattened with ``to_flat()``
    and cast to float32.
    """

    def __init__(self, platform: Platform):
        """Wrap ``platform`` and declare the action/observation spaces.

        Raises:
            ImportError: If gymnasium could not be imported.
        """
        if not HAS_GYM:
            raise ImportError("gymnasium required for QuoteGymEnv")
        self.platform = platform
        self.n = platform.instruments.n
        self._last_result: StepResult | None = None

        # action space: price adjustments as multipliers [0.5, 2.0]
        self.action_space = spaces.Box(low=0.5, high=2.0, shape=(self.n,), dtype=np.float32)

        # quotes + fills + exposures + position, plus competitor quotes when
        # a market model is attached.
        # NOTE(review): assumes Observation.to_flat() has exactly this
        # layout -- confirm against the types module.
        obs_dim = self.n * 4
        if platform.market:
            obs_dim += self.n
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf,
                                            shape=(obs_dim,), dtype=np.float32)

    def reset(self, seed: int | None = None, options: dict | None = None) -> tuple[np.ndarray, dict]:
        """Reset the platform and return ``(observation, info)``.

        ``options`` is accepted for API compatibility and ignored.
        """
        result = self.platform.reset(seed)
        self._last_result = result
        return result.obs.to_flat().astype(np.float32), result.info

    def step(self, action: np.ndarray) -> tuple[np.ndarray, float, bool, bool, dict]:
        """Apply multipliers ``action`` to the reference prices and step.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
        """
        # convert action (multipliers) to absolute prices
        prices = self.platform.instruments.refs * action
        result = self.platform.step(prices)
        self._last_result = result
        return (result.obs.to_flat().astype(np.float32), result.reward,
                result.terminated, result.truncated, result.info)

    def render(self) -> None:
        """Print a one-line summary of the most recent step, if any."""
        if self._last_result is not None:
            m = self._last_result.metrics
            print(f"t={self.platform._t} pnl={m.pnl:.2f} units={m.units_traded:.0f} "
                  f"conv={m.conversion:.3f} vol={m.volatility:.3f}")

    def close(self) -> None:
        """No resources to release."""
        pass


def make_env(platform: Platform) -> QuoteGymEnv:
    """Convenience constructor for ``QuoteGymEnv``."""
    return QuoteGymEnv(platform)


if HAS_GYM:
    # Best-effort registration: a failure must not break importing the
    # module, but it is reported rather than silently swallowed, and
    # KeyboardInterrupt/SystemExit are no longer caught.
    # NOTE(review): the module docstring imports this module as
    # ``lab.outlet.gym_wrapper``; confirm the entry point resolves.
    try:
        gym.register(id='QuoteControl-v0', entry_point='outlet.gym_wrapper:QuoteGymEnv')
    except Exception as exc:
        import warnings
        warnings.warn(f"QuoteControl-v0 registration skipped: {exc}", RuntimeWarning)
+ +This module provides numerically stable implementations of common operations: +- safe_exp, safe_log: Avoid overflow/underflow +- softmax: Numerically stable softmax +- sigmoid, clamp: Standard transformations +- intensity_decay: Avellaneda-Stoikov fill intensity +- inventory_penalty: Quadratic inventory risk +- poisson_arrivals, hawkes_intensity: Arrival process helpers + +All functions accept both scalars and numpy arrays. +""" +import numpy as np + +EPS = 1e-8 # small constant to avoid division by zero +MAX_EXP = 700.0 # maximum safe exponent to avoid overflow + +def safe_exp(x: np.ndarray | float) -> np.ndarray | float: + return np.exp(np.clip(x, -MAX_EXP, MAX_EXP)) + +def safe_log(x: np.ndarray | float) -> np.ndarray | float: + return np.log(np.maximum(x, EPS)) + +def clamp(x: np.ndarray | float, lo: float, hi: float) -> np.ndarray | float: + return np.clip(x, lo, hi) + +def sigmoid(x: np.ndarray | float) -> np.ndarray | float: + return 1.0 / (1.0 + safe_exp(-x)) + +def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray: + x_max = np.max(x, axis=axis, keepdims=True) + exp_x = safe_exp(x - x_max) + return exp_x / (np.sum(exp_x, axis=axis, keepdims=True) + EPS) + +def geometric_series(base: float, ratio: float, n: int) -> np.ndarray: + return base * (ratio ** np.arange(n)) + +def ema(old: float, new: float, alpha: float = 0.1) -> float: + return alpha * new + (1 - alpha) * old + +def intensity_decay(distance: float, kappa: float = 1.0) -> float: + """Avellaneda-Stoikov style fill intensity decay with quote distance""" + return safe_exp(-kappa * distance) + +def inventory_penalty(q: float, gamma: float = 0.1, sigma: float = 1.0) -> float: + """Quadratic inventory risk penalty""" + return gamma * sigma**2 * q**2 / 2 + +def poisson_arrivals(rate: float, dt: float, rng: np.random.Generator) -> int: + return rng.poisson(rate * dt) + +def hawkes_intensity(base: float, history: np.ndarray, alpha: float, beta: float, t: float) -> float: + """Self-exciting Hawkes 
process intensity""" + if len(history) == 0: return base + decays = safe_exp(-beta * (t - history[history < t])) + return base + alpha * np.sum(decays) diff --git a/lab/outlet/mechanisms/__init__.py b/lab/outlet/mechanisms/__init__.py new file mode 100644 index 0000000..3c3c36e --- /dev/null +++ b/lab/outlet/mechanisms/__init__.py @@ -0,0 +1,5 @@ +from .posted_price import PostedPriceMechanism +from .two_sided import TwoSidedMechanism +from .auction import AuctionMechanism + +__all__ = ['PostedPriceMechanism', 'TwoSidedMechanism', 'AuctionMechanism'] diff --git a/lab/outlet/mechanisms/auction.py b/lab/outlet/mechanisms/auction.py new file mode 100644 index 0000000..2260aef --- /dev/null +++ b/lab/outlet/mechanisms/auction.py @@ -0,0 +1,73 @@ +""" +Auction mechanism for reserve pricing and bid shading. + +In this mechanism, the agent sets reserve prices that affect +win probability and clearing prices. Used for ad auctions, +marketplace auctions, and similar settings. +""" +from __future__ import annotations +from dataclasses import dataclass +import numpy as np +from ..types import Quote, Opportunity, Execution, InstrumentSet, MarketState +from ..constants import Side +from ..math_util import clamp, sigmoid + +@dataclass +class AuctionConfig: + """Configuration for auction mechanism. + + Attributes: + min_reserve: Minimum reserve price + max_reserve: Maximum reserve price + base_win_prob: Baseline win probability at reference reserve + sensitivity: How much higher reserves reduce win probability + """ + min_reserve: float = 0.0 + max_reserve: float = 100.0 + base_win_prob: float = 0.3 + sensitivity: float = 2.0 + +class AuctionMechanism: + """Auction mechanism for reserve pricing. 
+ + The agent sets reserve prices that affect: + - Win probability: higher reserves reduce chance of winning + - Clearing price: bounded between reserve and simulated max bid + + Win probability: base_prob * sigmoid(-sensitivity * (reserve - ref) / ref) + Clearing price: max(reserve, min(max_bid, reserve + random_increment)) + + Only BUY-side opportunities are processed (auction wins). + """ + + def __init__(self, cfg: AuctionConfig | None = None): + self.cfg = cfg or AuctionConfig() + + def apply_quote(self, quote: Quote, instruments: InstrumentSet, + rng: np.random.Generator) -> Quote: + reserves = clamp(quote.prices, self.cfg.min_reserve, self.cfg.max_reserve) + return Quote(prices=reserves, propensity=quote.propensity, metadata=quote.metadata) + + def process_opportunity(self, opp: Opportunity, quote: Quote, + instruments: InstrumentSet, market: MarketState | None, + rng: np.random.Generator) -> Execution | None: + if opp.side != Side.BUY: return None + idx = int(opp.instrument_id) + reserve = float(quote.prices[idx]) + ref = instruments.refs[idx] + + # win probability decreases with higher reserve + relative_reserve = (reserve - ref) / (ref + 1e-8) + win_prob = self.cfg.base_win_prob * sigmoid(-self.cfg.sensitivity * relative_reserve) + + if rng.random() > win_prob: return None + + # clearing price is between reserve and some max bid (simulated) + max_bid = ref * (1 + rng.exponential(0.2)) + clearing = max(reserve, min(max_bid, reserve + rng.exponential(0.1) * ref)) + + return Execution( + opportunity_id=opp.id, instrument_id=opp.instrument_id, + side=opp.side, size_requested=opp.size, size_filled=opp.size, + price=clearing, propensity=quote.propensity * win_prob, t=opp.t + ) diff --git a/lab/outlet/mechanisms/posted_price.py b/lab/outlet/mechanisms/posted_price.py new file mode 100644 index 0000000..92bac12 --- /dev/null +++ b/lab/outlet/mechanisms/posted_price.py @@ -0,0 +1,84 @@ +""" +Posted price mechanism for retail dynamic pricing. 
# --- posted_price.py ---

@dataclass
class PostedPriceConfig:
    """Configuration for posted price mechanism.

    Attributes:
        min_price: Absolute minimum price
        max_price: Absolute maximum price
        max_delta_pct: Maximum price change per step as fraction of previous
        min_margin_pct: Minimum margin over cost basis
        round_to: Price rounding granularity (None = no rounding)
    """
    min_price: float = 0.01
    max_price: float = 1000.0
    max_delta_pct: float = 0.2
    min_margin_pct: float = 0.05
    round_to: float | None = 0.01


class PostedPriceMechanism:
    """Posted price mechanism for retail dynamic pricing.

    ``apply_quote`` applies, in this order: the minimum margin over cost,
    the absolute [min_price, max_price] bounds, the per-step change limit
    (only when the quote metadata carries ``prev_prices``) and rounding.
    Later steps are not re-checked against earlier ones.

    Only BUY-side opportunities are processed (customers purchasing).
    """

    def __init__(self, cfg: PostedPriceConfig | None = None):
        self.cfg = cfg or PostedPriceConfig()

    def apply_quote(self, quote: Quote, instruments: InstrumentSet,
                    rng: np.random.Generator) -> Quote:
        """Return a new Quote whose prices satisfy the configured constraints.

        The result's metadata is the input metadata plus ``prev_prices``,
        an independent copy of the constrained prices.
        """
        c = self.cfg

        # enforce min margin, then absolute bounds
        margin_floor = instruments.costs * (1 + c.min_margin_pct)
        prices = np.maximum(quote.prices.copy(), margin_floor)
        prices = clamp(prices, c.min_price, c.max_price)

        # enforce max delta if we have history
        if 'prev_prices' in quote.metadata:
            prev = quote.metadata['prev_prices']
            band = prev * c.max_delta_pct
            prices = clamp(prices, prev - band, prev + band)

        # round prices
        if c.round_to:
            prices = np.round(prices / c.round_to) * c.round_to

        # store a copy so later in-place edits of the quote's prices cannot
        # rewrite the history used by the next delta check
        return Quote(prices=prices, propensity=quote.propensity,
                     metadata={**quote.metadata, 'prev_prices': prices.copy()})

    def process_opportunity(self, opp: Opportunity, quote: Quote,
                            instruments: InstrumentSet, market: MarketState | None,
                            rng: np.random.Generator) -> Execution | None:
        """Fill a BUY opportunity in full at the posted price; ignore others."""
        if opp.side != Side.BUY:
            return None  # posted price is buy-only
        posted = float(quote.prices[int(opp.instrument_id)])
        return Execution(
            opportunity_id=opp.id, instrument_id=opp.instrument_id,
            side=opp.side, size_requested=opp.size, size_filled=opp.size,
            price=posted, propensity=quote.propensity, t=opp.t
        )


# --- two_sided.py ---

@dataclass
class TwoSidedConfig:
    """Configuration for two-sided quoting mechanism.

    Attributes:
        min_spread: Minimum bid-ask spread
        max_spread: Maximum bid-ask spread
        min_price: Absolute minimum price
        max_price: Absolute maximum price
        fill_kappa: Intensity decay parameter (higher = faster decay with distance)
    """
    min_spread: float = 0.01
    max_spread: float = 0.5
    min_price: float = 0.01
    max_price: float = 10000.0
    fill_kappa: float = 1.5


class TwoSidedMechanism:
    """Two-sided quoting mechanism for market making.

    The agent posts bid and ask prices around a mid-point. An opportunity
    fills with probability ``exp(-fill_kappa * |price - mid|)``.

    Both BUY and SELL opportunities are processed:
    - BUY: customer buys at agent's ask price
    - SELL: customer sells at agent's bid price
    """

    def __init__(self, cfg: TwoSidedConfig | None = None):
        self.cfg = cfg or TwoSidedConfig()

    def apply_quote(self, quote: Quote, instruments: InstrumentSet,
                    rng: np.random.Generator) -> Quote:
        """Clamp mid prices and spreads, then rebuild them from bounded bid/ask.

        A missing ``quote.spreads`` defaults to 0.02 for every instrument.
        Because bids and asks are clipped to the price bounds afterwards,
        the returned spread can be narrower than ``min_spread`` near a bound.
        """
        c = self.cfg
        mids = clamp(quote.prices.copy(), c.min_price, c.max_price)
        if quote.spreads is not None:
            spreads = quote.spreads.copy()
        else:
            spreads = np.full_like(mids, 0.02)
        spreads = clamp(spreads, c.min_spread, c.max_spread)

        # ensure bids < asks
        half = spreads / 2
        bids = np.maximum(mids - half, c.min_price)
        asks = np.minimum(mids + half, c.max_price)

        return Quote(prices=(bids + asks) / 2, spreads=asks - bids,
                     propensity=quote.propensity, metadata=quote.metadata)

    def process_opportunity(self, opp: Opportunity, quote: Quote,
                            instruments: InstrumentSet, market: MarketState | None,
                            rng: np.random.Generator) -> Execution | None:
        """Probabilistically fill ``opp`` in full at the agent's ask or bid.

        The mid is the market mid price when available, otherwise the
        quote's own price. The returned propensity is the quote propensity
        multiplied by the fill probability.
        """
        idx = int(opp.instrument_id)
        if market and market.mid_prices is not None:
            mid = market.mid_prices[idx]
        else:
            mid = quote.prices[idx]

        if opp.side == Side.BUY:
            side_prices = quote.asks
        else:
            side_prices = quote.bids
        price = float(side_prices[idx]) if side_prices is not None else float(quote.prices[idx])

        # probabilistic fill based on distance from mid
        # NOTE(review): the absolute distance means a quote that crosses the
        # mid is penalised like an equally distant passive one -- confirm.
        fill_prob = intensity_decay(abs(price - mid), self.cfg.fill_kappa)
        if rng.random() > fill_prob:
            return None

        return Execution(
            opportunity_id=opp.id, instrument_id=opp.instrument_id,
            side=opp.side, size_requested=opp.size, size_filled=opp.size,
            price=price, propensity=quote.propensity * fill_prob, t=opp.t
        )
class BaseObjective(ABC):
    """Abstract base class for reward objectives.

    Subclasses must implement reward() and breakdown() methods.
    """

    @abstractmethod
    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return the scalar reward for one step."""
        ...

    @abstractmethod
    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return named reward components for one step."""
        ...


class CompositeObjective(BaseObjective):
    """Weighted sum of multiple objectives.

    Allows combining multiple reward terms (e.g., PnL - holding_cost - volatility).

    Args:
        objectives: List of (objective, weight) tuples
    """

    def __init__(self, objectives: list[tuple[BaseObjective, float]]):
        self.objectives = objectives

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return the weighted sum of the component rewards."""
        return sum(w * obj.reward(quote, instruments, metrics, hidden, obs)
                   for obj, w in self.objectives)

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return every component's breakdown scaled by its weight.

        Entries that share a key are summed, so two components of the same
        kind no longer overwrite each other.
        """
        bd: dict[str, float] = {}
        for obj, w in self.objectives:
            for k, v in obj.breakdown(quote, instruments, metrics, hidden, obs).items():
                bd[k] = bd.get(k, 0.0) + w * v
        return bd
# Maps the public objective names to the classes that implement them.
REGISTRY: dict[str, type[BaseObjective]] = {
    'pnl': PnLObjective,
    'volatility': VolatilityPenalty,
    'holding_cost': HoldingCostPenalty,
    'lost_opportunity': LostOpportunityCostPenalty,
    'inventory_risk': InventoryRiskPenalty,
    'spread_capture': SpreadCaptureReward,
}


def make_objective(name: str, **kwargs) -> BaseObjective:
    """Instantiate the objective registered under ``name``.

    Args:
        name: A key of ``REGISTRY``.
        **kwargs: Forwarded to the objective's constructor.

    Returns:
        The newly constructed objective.

    Raises:
        ValueError: If ``name`` is not registered.
    """
    try:
        objective_cls = REGISTRY[name]
    except KeyError:
        raise ValueError(f"Unknown objective: {name}. Available: {list(REGISTRY.keys())}") from None
    return objective_cls(**kwargs)


def make_composite(spec: list[tuple[str, float, dict]] | dict[str, float]) -> CompositeObjective:
    """Build a CompositeObjective from a specification.

    Args:
        spec: Either a dict ``{name: weight}`` (objectives built with their
            default arguments) or a list of ``(name, weight, kwargs)``
            tuples (``kwargs`` forwarded to the constructor).

    Returns:
        CompositeObjective holding the components in specification order.
    """
    if isinstance(spec, dict):
        components = [(make_objective(name), weight) for name, weight in spec.items()]
    else:
        components = [(make_objective(name, **kwargs), weight)
                      for name, weight, kwargs in spec]
    return CompositeObjective(components)


def retail_objective(volatility_weight: float = 0.1, holding_weight: float = 0.5,
                     stockout_weight: float = 0.3) -> CompositeObjective:
    """Default retail objective.

    Combines 'pnl' (weight 1.0) with 'volatility', 'holding_cost' and
    'lost_opportunity' at the given weights, each built with default
    arguments.
    """
    weights = {'pnl': 1.0}
    weights['volatility'] = volatility_weight
    weights['holding_cost'] = holding_weight
    weights['lost_opportunity'] = stockout_weight
    return make_composite(weights)


def market_making_objective(gamma: float = 0.1, sigma: float = 1.0) -> CompositeObjective:
    """Default market-making objective.

    Combines PnL (weight 1.0), spread capture (weight 0.5) and an
    inventory-risk term built from ``gamma`` and ``sigma`` (weight 1.0).
    """
    pnl = PnLObjective()
    spread = SpreadCaptureReward()
    risk = InventoryRiskPenalty(gamma=gamma, sigma=sigma)
    return CompositeObjective([(pnl, 1.0), (spread, 0.5), (risk, 1.0)])
class PnLObjective(BaseObjective):
    """Reward equal to the step's profit and loss."""

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return ``metrics.pnl``."""
        return metrics.pnl

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return pnl together with the revenue and cost reported in ``metrics``."""
        return dict(pnl=metrics.pnl, revenue=metrics.revenue, cost=metrics.cost)


class VolatilityPenalty(BaseObjective):
    """Negative reward proportional to ``metrics.volatility``."""

    def __init__(self, scale: float = 1.0):
        self.scale = scale

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return ``-scale * metrics.volatility``."""
        penalty = -self.scale * metrics.volatility
        return penalty

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return the penalty under the key 'volatility_penalty'."""
        penalty = -self.scale * metrics.volatility
        return {'volatility_penalty': penalty}


class HoldingCostPenalty(BaseObjective):
    """Negative reward proportional to ``metrics.position_cost``."""

    def __init__(self, scale: float = 1.0):
        self.scale = scale

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return ``-scale * metrics.position_cost``."""
        penalty = -self.scale * metrics.position_cost
        return penalty

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return the penalty under the key 'holding_cost_penalty'."""
        penalty = -self.scale * metrics.position_cost
        return {'holding_cost_penalty': penalty}


class LostOpportunityCostPenalty(BaseObjective):
    """Negative reward proportional to ``metrics.lost_opportunity``."""

    def __init__(self, scale: float = 1.0):
        self.scale = scale

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return ``-scale * metrics.lost_opportunity``."""
        penalty = -self.scale * metrics.lost_opportunity
        return penalty

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return the penalty under the key 'lost_opportunity_penalty'."""
        penalty = -self.scale * metrics.lost_opportunity
        return {'lost_opportunity_penalty': penalty}


class InventoryRiskPenalty(BaseObjective):
    """Quadratic inventory risk penalty (Avellaneda-Stoikov style).

    The penalty is ``inventory_penalty(q, gamma, sigma)`` where q is the sum
    of ``obs.position`` over all instruments; the reward is its negative.
    """

    def __init__(self, gamma: float = 0.1, sigma: float = 1.0):
        self.gamma = gamma
        self.sigma = sigma

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return the negated penalty, or 0.0 when no position is observed."""
        if obs.position is None:
            return 0.0
        total_position = np.sum(obs.position)
        return -inventory_penalty(total_position, self.gamma, self.sigma)

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return the reward under the key 'inventory_risk_penalty'."""
        value = self.reward(quote, instruments, metrics, hidden, obs)
        return {'inventory_risk_penalty': value}


class SpreadCaptureReward(BaseObjective):
    """Reward equal to ``metrics.spread_capture``."""

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return ``metrics.spread_capture``."""
        return metrics.spread_capture

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return the reward under the key 'spread_capture'."""
        return dict(spread_capture=metrics.spread_capture)
class DefaultObservationBuilder:
    """Builds the agent-facing Observation for each step.

    The observation is assembled from the quoted prices, the censored fills
    in the step logs, per-instrument exposure counts and, depending on the
    configuration, the current position and the market state.
    """

    def __init__(self, cfg: ObservationConfig | None = None):
        self.cfg = cfg or ObservationConfig()

    def build(self, quote: Quote, instruments: InstrumentSet, logs: StepLogs,
              metrics: StepMetrics, market: MarketState | None,
              hidden: HiddenState, mask_demand: bool, t: int) -> Observation:
        """Assemble the Observation for step ``t``.

        ``metrics``, ``hidden`` and ``mask_demand`` are accepted but not
        read here.
        """
        n = instruments.n

        # always show censored fills
        if logs.censored_fills is not None:
            fills = logs.censored_fills
        else:
            fills = np.zeros(n)

        # compute exposures from logs
        # NOTE(review): every event carrying an instrument_id is counted,
        # whatever its type -- confirm that this is the intended exposure
        # definition.
        if logs.events:
            exposures = np.zeros(n)
            for event in logs.events:
                if event.instrument_id is None:
                    continue
                exposures[event.instrument_id] += 1
        else:
            exposures = logs.aggregates.get('exposures', np.zeros(n))

        # position - only if configured and available
        show_position = self.cfg.include_position and instruments.position is not None
        position = instruments.position.copy() if show_position else None

        # market state - only if configured
        visible_market = market if self.cfg.include_market else None

        return Observation(
            quotes=quote.prices.copy(),
            position=position,
            fills=fills,
            exposures=exposures,
            market=visible_market,
            t=t
        )

    def make_space(self, n_instruments: int, include_market: bool = True) -> dict:
        """Returns dict describing observation space for gym

        Each entry maps a field name to its ``shape``, ``low`` and ``high``.
        'position' is present only when the config includes position;
        'competitor_quotes' only when ``include_market`` is true.
        """
        shape = (n_instruments,)

        def non_negative() -> dict:
            return {'shape': shape, 'low': 0, 'high': np.inf}

        space = {name: non_negative() for name in ('quotes', 'fills', 'exposures')}
        if self.cfg.include_position:
            space['position'] = {'shape': shape, 'low': -np.inf, 'high': np.inf}
        if include_market:
            space['competitor_quotes'] = non_negative()
        return space
class Platform:
    """Main simulation orchestrator implementing Quote -> Arrival -> Execution -> Position.

    The Platform coordinates all components to simulate a pricing environment:
    - Mechanism: validates quotes and determines execution logic
    - ArrivalModel: generates demand opportunities
    - ExecutionModel: computes acceptance probabilities
    - PositionModel: manages inventory/position and censorship
    - MarketModel: updates competitor/market state
    - ObservationBuilder: constructs censored observations
    - Objective: computes reward from metrics

    Attributes:
        instruments: The instrument set being priced
        mechanism: Quote validation and execution mechanism
        arrival: Demand arrival generator
        execution: Acceptance probability model
        position: Inventory/position manager
        market: Competitor/market dynamics (optional)
        obs_builder: Observation constructor
        objective: Reward function
        cfg: Platform configuration
    """

    def __init__(self, instruments: InstrumentSet, mechanism: Mechanism,
                 arrival: ArrivalModel, execution: ExecutionModel,
                 position: PositionModel | None = None,
                 market: MarketModel | None = None,
                 obs_builder: ObservationBuilder | None = None,
                 objective: Objective | None = None,
                 cfg: PlatformConfig | None = None):
        """Wire the components together, filling in defaults for omitted ones."""
        self.instruments = instruments
        self.mechanism = mechanism
        self.arrival = arrival
        self.execution = execution
        self.position = position or DefaultPositionModel(PositionConfig())
        self.market = market
        self.obs_builder = obs_builder or DefaultObservationBuilder()
        self.objective = objective or retail_objective()
        self.cfg = cfg or PlatformConfig(n_instruments=instruments.n)

        self._t: int = 0
        self._rng: np.random.Generator = np.random.default_rng(self.cfg.seed)
        self._quote: Quote | None = None
        self._market_state: MarketState | None = None
        self._hidden: HiddenState = HiddenState()
        self._prev_prices: np.ndarray | None = None

    def reset(self, seed: int | None = None) -> StepResult:
        """Reset the platform to initial state.

        Args:
            seed: Random seed (overrides config seed if provided). ``0`` is
                a valid seed; only ``None`` falls back to the config seed.

        Returns:
            Initial StepResult with zeroed metrics and initial observation
        """
        self._t = 0
        # Compare against None explicitly: `seed or cfg.seed` would discard
        # an explicit seed of 0 because 0 is falsy.
        effective_seed = seed if seed is not None else self.cfg.seed
        self._rng = np.random.default_rng(effective_seed)
        self._hidden = HiddenState()
        self._prev_prices = self.instruments.refs.copy()

        # reset position
        self.position.reset(self.instruments, self._rng)
        self.instruments.position = self.position.position

        # initial quote at reference prices
        self._quote = Quote(prices=self.instruments.refs.copy(), propensity=1.0,
                            metadata={'prev_prices': self._prev_prices})
        self._quote = self.mechanism.apply_quote(self._quote, self.instruments, self._rng)

        # initial market state
        if self.market:
            self._market_state = self.market.step(0, self._quote, self._hidden, self._rng)

        # build initial observation
        logs = StepLogs(aggregates={'reset': True},
                        true_demand=np.zeros(self.instruments.n),
                        censored_fills=np.zeros(self.instruments.n))
        metrics = StepMetrics()
        obs = self.obs_builder.build(self._quote, self.instruments, logs, metrics,
                                     self._market_state, self._hidden, self.cfg.mask_demand, 0)

        return StepResult(obs=obs, reward=0.0, terminated=False, truncated=False,
                          info={'true_demand': logs.true_demand}, metrics=metrics,
                          logs=logs, hidden=self._hidden)

    def step(self, action: np.ndarray, propensity: float = 1.0) -> StepResult:
        """Execute one simulation step with the given pricing action.

        The step proceeds as follows:
        1. Apply quote constraints via mechanism
        2. Update market/competitor state
        3. Generate arrivals
        4. Process arrivals -> executions (acceptance check, then position
           censorship applied to each execution)
        5. Update position state
        6. Compute metrics (PnL, costs, etc.)
        7. Build logs with propensities
        8. Construct censored observation
        9. Compute reward and its breakdown
        10. Check termination

        Args:
            action: Price vector for all instruments
            propensity: P(action | behavior policy) for OPE logging

        Returns:
            StepResult containing observation, reward, metrics, logs, and hidden state
        """
        self._t += 1
        cfg = self.cfg

        # 1. apply quote from action
        self._quote = Quote(prices=action, propensity=propensity,
                            metadata={'prev_prices': self._prev_prices})
        self._quote = self.mechanism.apply_quote(self._quote, self.instruments, self._rng)
        self._prev_prices = self._quote.prices.copy()
        self._hidden.quote_history.append(self._quote.prices.copy())

        # 2. update market/competitors
        if self.market:
            self._market_state = self.market.step(self._t, self._quote, self._hidden, self._rng)
            self._hidden.market_history.append(self._market_state)

        # 3. generate arrivals
        opps = self.arrival.sample(self._t, cfg.dt, self.instruments,
                                   self._market_state, self._hidden, self._rng)

        # 4. process opportunities -> executions
        executions: list[Execution] = []
        events: list[StepEvent] = []
        true_demand = np.zeros(self.instruments.n)

        for opp in opps:
            # log exposure
            if cfg.log_level == LogLevel.FULL:
                events.append(StepEvent(t=opp.t, type=EventType.EXPOSURE,
                                        instrument_id=opp.instrument_id,
                                        opportunity_id=opp.id,
                                        price=float(self._quote.prices[opp.instrument_id]),
                                        propensity=self._quote.propensity))

            # check acceptance
            prob = self.execution.prob(opp, self._quote, self.instruments,
                                       self._market_state, self._rng)
            if self._rng.random() < prob:
                # create execution
                exe = self.mechanism.process_opportunity(opp, self._quote, self.instruments,
                                                         self._market_state, self._rng)
                if exe:
                    # true demand is recorded before censorship
                    true_demand[exe.instrument_id] += exe.size_requested
                    # apply position censorship
                    exe = self.position.apply_execution(exe)
                    executions.append(exe)
                    if cfg.log_level == LogLevel.FULL:
                        events.append(StepEvent(t=exe.t, type=EventType.EXECUTION,
                                                instrument_id=exe.instrument_id,
                                                opportunity_id=exe.opportunity_id,
                                                price=exe.price, size=exe.size_filled,
                                                propensity=exe.propensity))

        # 5. update position state
        self.position.step(self._t)
        self.instruments.position = self.position.position

        # 6. compute metrics
        censored_fills = np.zeros(self.instruments.n)
        revenue = 0.0
        cost = 0.0
        spread_capture = 0.0

        for exe in executions:
            censored_fills[exe.instrument_id] += exe.size_filled
            if exe.side == Side.BUY:
                revenue += exe.price * exe.size_filled
                cost += self.instruments.costs[exe.instrument_id] * exe.size_filled
            else:
                revenue -= exe.price * exe.size_filled
                cost -= self.instruments.costs[exe.instrument_id] * exe.size_filled
            # spread capture for market making
            if self._quote.spreads is not None and self._market_state and self._market_state.mid_prices is not None:
                mid = self._market_state.mid_prices[exe.instrument_id]
                if exe.side == Side.BUY:
                    spread_capture += (exe.price - mid) * exe.size_filled
                else:
                    spread_capture += (mid - exe.price) * exe.size_filled

        pnl = revenue - cost
        units = float(np.sum(censored_fills))
        lost = float(np.sum(true_demand - censored_fills))

        # volatility: mean relative price change versus the previous quote
        volatility = 0.0
        if len(self._hidden.quote_history) > 1:
            prev = self._hidden.quote_history[-2]
            volatility = float(np.mean(np.abs(self._quote.prices - prev) / (prev + 1e-8)))

        metrics = StepMetrics(
            pnl=pnl, revenue=revenue, cost=cost, units_traded=units,
            position_cost=self.position.holding_cost,
            lost_opportunity=self.position.shortage_cost + lost * np.mean(self._quote.prices) * 0.1,
            spread_capture=spread_capture, volatility=volatility,
            # epsilon keeps the ratio finite when no opportunity arrived
            conversion=units / (len(opps) + 1e-8),
            per_instrument={'fills': censored_fills, 'demand': true_demand}
        )

        # 7. build logs
        logs = StepLogs(
            events=events if cfg.log_level == LogLevel.FULL else None,
            executions=executions if cfg.log_level == LogLevel.FULL else None,
            aggregates={'n_arrivals': len(opps), 'n_executions': len(executions),
                        'exposures': np.bincount([o.instrument_id for o in opps],
                                                 minlength=self.instruments.n).astype(float)},
            true_demand=true_demand,
            censored_fills=censored_fills
        )

        # 8. build observation
        obs = self.obs_builder.build(self._quote, self.instruments, logs, metrics,
                                     self._market_state, self._hidden, cfg.mask_demand, self._t)

        # 9. compute reward; the breakdown is computed once and reused in info
        reward = self.objective.reward(self._quote, self.instruments, metrics, self._hidden, obs)
        breakdown = self.objective.breakdown(self._quote, self.instruments, metrics, self._hidden, obs)

        # 10. check termination
        terminated = self._t >= cfg.max_steps
        truncated = False

        info = {'true_demand': true_demand, 'breakdown': breakdown}

        return StepResult(obs=obs, reward=reward, terminated=terminated, truncated=truncated,
                          info=info, metrics=metrics, logs=logs, hidden=self._hidden)
+ +Protocols: + Mechanism: How quotes translate to executions (posted price, two-sided, auction) + ArrivalModel: How opportunities arrive (Poisson, Hawkes, sessions) + ExecutionModel: Acceptance probability given quote (elasticity, intensity) + PositionModel: Inventory/position management and censorship + MarketModel: Competitor/market dynamics + ObservationBuilder: Constructs agent observations with censoring + Objective: Computes reward from metrics +""" +from __future__ import annotations +from typing import Protocol, Any, TYPE_CHECKING +import numpy as np +if TYPE_CHECKING: + from .types import (Quote, Opportunity, Execution, InstrumentSet, StepLogs, + StepMetrics, HiddenState, Observation, MarketState) + from .constants import LogLevel + +class Mechanism(Protocol): + """Defines how quotes translate to executions. + + The Mechanism is the core abstraction that differentiates pricing domains: + - PostedPrice: single price, buyer decides to purchase or not + - TwoSided: bid/ask spread, execution depends on distance from mid + - Auction: reserve price affects win probability and clearing price + + Methods: + apply_quote: Enforce constraints and return valid quote + process_opportunity: Determine execution given opportunity and quote + """ + def apply_quote(self, quote: Quote, instruments: InstrumentSet, + rng: np.random.Generator) -> Quote: + """Apply mechanism-specific constraints to a quote. + + Args: + quote: Raw quote from policy + instruments: Current instrument set with costs/refs + rng: Random generator for stochastic constraints + + Returns: + Constrained quote satisfying mechanism rules (min margin, max delta, etc.) + """ + ... + + def process_opportunity(self, opp: Opportunity, quote: Quote, + instruments: InstrumentSet, market: MarketState | None, + rng: np.random.Generator) -> Execution | None: + """Process an opportunity against the current quote. 
+ + Args: + opp: Incoming opportunity (session, order, request) + quote: Current posted quote + instruments: Instrument set + market: Current market state (competitor prices, mid-prices) + rng: Random generator + + Returns: + Execution if opportunity converts, None otherwise + """ + ... + +class ArrivalModel(Protocol): + """Generates opportunities (demand arrivals) for each step. + + Different arrival models capture different demand dynamics: + - Poisson: constant rate, memoryless + - Hawkes: self-exciting, clustered arrivals + - Session: retail browsing with multi-product views + + Methods: + sample: Generate opportunities for a time interval + """ + def sample(self, t: float, dt: float, instruments: InstrumentSet, + market: MarketState | None, hidden: HiddenState, + rng: np.random.Generator) -> list[Opportunity]: + """Sample opportunities for time interval [t, t+dt). + + Args: + t: Current time + dt: Time interval length + instruments: Available instruments + market: Current market state + hidden: Hidden state (contains demand intensity, contamination) + rng: Random generator + + Returns: + List of opportunities arriving in this interval + """ + ... + +class ExecutionModel(Protocol): + """Computes acceptance/execution probability given quote and context. + + Different models capture different demand responses: + - Elasticity: price sensitivity with competitor cross-effects + - Intensity: distance-based fill probability (market making) + - Logit: discrete choice model + + Methods: + prob: Compute acceptance probability + uncensor: Estimate true demand from censored fills + """ + def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet, + market: MarketState | None, rng: np.random.Generator) -> float: + """Compute probability that opportunity accepts the quote. 
class PositionModel(Protocol):
    """Manages inventory (retail) or position (finance).

    Handles:
    - Position constraints and censorship
    - Holding costs (retail) or inventory risk (finance)
    - Replenishment and order receipt

    Methods:
        reset: Initialize position state
        available: Query available capacity for a trade
        apply_execution: Censor execution by available position
        step: Process time-based updates (replenishment, holding cost)

    Properties:
        position: Current position vector
        holding_cost: Cost incurred this step from holding position
        shortage_cost: Cost incurred this step from unfilled demand
    """
    def reset(self, instruments: InstrumentSet, rng: np.random.Generator) -> None:
        """Initialize position state for new episode."""
        ...

    def available(self, instrument_id: int, side: Any) -> float:
        """Query available capacity for a trade.

        Args:
            instrument_id: Which instrument
            side: BUY or SELL

        Returns:
            Maximum tradeable size given current position
        """
        ...

    def apply_execution(self, exe: Execution) -> Execution:
        """Apply position constraints to an execution.

        Args:
            exe: Proposed execution with size_requested

        Returns:
            Censored execution with size_filled <= available capacity
        """
        ...

    def step(self, t: float) -> None:
        """Process time-based position updates.

        Handles replenishment receipt, holding cost calculation, etc.
        """
        ...

    @property
    def position(self) -> np.ndarray:
        """Current position vector (positive=long/inventory, negative=short)."""
        ...

    @property
    def holding_cost(self) -> float:
        """Holding cost incurred this step."""
        ...

    @property
    def shortage_cost(self) -> float:
        """Shortage (stockout) cost incurred this step.

        Declared here because the platform's step reads it when computing
        the lost-opportunity metric, so every position model must expose it.
        """
        ...
+ + Supports composite objectives with weighted terms: + - PnL (profit) + - Position costs (holding, inventory risk) + - Lost opportunity (stockouts) + - Volatility penalty (UX) + - Spread capture (market making) + + Methods: + reward: Compute scalar reward + breakdown: Get per-term contribution for analysis + """ + def reward(self, quote: Quote, instruments: InstrumentSet, + metrics: StepMetrics, hidden: HiddenState, + obs: Observation) -> float: + """Compute scalar reward for this step. + + Args: + quote: Current quote + instruments: Instrument set + metrics: Step metrics (pnl, costs, etc.) + hidden: Hidden state + obs: Agent observation + + Returns: + Scalar reward value + """ + ... + + def breakdown(self, quote: Quote, instruments: InstrumentSet, + metrics: StepMetrics, hidden: HiddenState, + obs: Observation) -> dict[str, float]: + """Get reward breakdown by component. + + Useful for analyzing which terms dominate the reward. + + Returns: + Dict mapping term names to their contributions + """ + ... diff --git a/lab/outlet/stock.py b/lab/outlet/stock.py new file mode 100644 index 0000000..b2c88a2 --- /dev/null +++ b/lab/outlet/stock.py @@ -0,0 +1,151 @@ +""" +Inventory/position management and instrument factories. + +This module provides: +- PositionConfig: Configuration for position constraints and costs +- PositionModel: Manages inventory (retail) or position (finance) +- make_instruments: Factory for creating instrument sets + +The PositionModel handles demand censorship by limiting executions +to available inventory, computing holding costs, and managing replenishment. +""" +from __future__ import annotations +from dataclasses import dataclass, field +import numpy as np +from .types import Instrument, InstrumentSet, Execution +from .constants import Side, InstrumentType + +@dataclass +class PositionConfig: + """Configuration for position/inventory management. 
@dataclass
class PositionModel:
    """Manages inventory (retail) or position (finance) with censorship.

    Key responsibilities:
    - Track current position per instrument
    - Censor executions when position is insufficient
    - Compute holding costs per step
    - Track shortage/stockout costs per step
    - Handle replenishment orders with lead time

    For retail: position is inventory (positive), selling reduces it
    For finance: position can be positive (long) or negative (short)

    Shortage cost is accrued while executions are applied and published by
    ``step``; ``shortage_cost`` therefore reports the most recently closed
    step, the same way ``holding_cost`` does.
    """
    cfg: PositionConfig
    n: int = 0
    _position: np.ndarray = field(default_factory=lambda: np.array([]))
    _pending_orders: list[tuple[int, np.ndarray]] = field(default_factory=list)
    _step_holding_cost: float = 0.0
    _step_shortage_cost: float = 0.0
    # shortage accrued by apply_execution since the last step() call
    _accrued_shortage_cost: float = 0.0

    def reset(self, instruments: InstrumentSet, rng: np.random.Generator) -> None:
        """Initialize the position vector and clear all per-step state.

        Args:
            instruments: Instrument set; only ``n`` is read
            rng: Random generator (unused by this implementation)
        """
        self.n = instruments.n
        initial = self.cfg.initial_position
        if initial is None:
            self._position = np.full(self.n, np.inf)  # unlimited
        elif np.ndim(initial) == 0:
            # any scalar (Python or NumPy) is broadcast to every instrument
            self._position = np.full(self.n, float(initial))
        else:
            # private float copy so the configured array is never mutated
            self._position = np.array(initial, dtype=np.float64)
        self._pending_orders = []
        self._step_holding_cost = 0.0
        self._step_shortage_cost = 0.0
        self._accrued_shortage_cost = 0.0

    def available(self, instrument_id: int, side: Side) -> float:
        """Return the maximum size that can be filled for this instrument/side.

        Args:
            instrument_id: Index of the instrument
            side: Side of the incoming opportunity

        Returns:
            ``inf`` for unlimited positions; otherwise current inventory for
            BUY, or the remaining room up to ``cfg.max_position`` for SELL
        """
        pos = self._position[instrument_id]
        if np.isinf(pos):
            return np.inf
        if side == Side.BUY:
            return max(0, pos)  # can sell up to current inventory
        return max(0, self.cfg.max_position - pos)  # can buy up to max

    def apply_execution(self, exe: Execution) -> Execution:
        """Censor an execution by available position and update the position.

        Args:
            exe: Proposed execution; ``size_requested`` is the uncensored size

        Returns:
            New Execution with ``size_filled`` capped at the available size
        """
        idx = int(exe.instrument_id)
        avail = self.available(idx, exe.side)
        filled = min(exe.size_requested, avail)
        shortage = exe.size_requested - filled

        if exe.side == Side.BUY:
            self._position[idx] -= filled  # sold from inventory
        else:
            self._position[idx] += filled  # bought into inventory

        if shortage > 0:
            # accrue only; step() publishes it and starts a new period
            self._accrued_shortage_cost += shortage * exe.price * self.cfg.shortage_cost_rate

        return Execution(
            opportunity_id=exe.opportunity_id, instrument_id=exe.instrument_id,
            side=exe.side, size_requested=exe.size_requested,
            size_filled=filled, price=exe.price, propensity=exe.propensity, t=exe.t
        )

    def order(self, quantity: np.ndarray) -> None:
        """Place a replenishment order.

        Args:
            quantity: Per-instrument quantity; added immediately when
                ``cfg.lead_time`` is 0, otherwise queued for that many steps
        """
        if self.cfg.lead_time > 0:
            self._pending_orders.append((self.cfg.lead_time, quantity.copy()))
        else:
            self._position += quantity

    def step(self, t: float) -> None:
        """Close the current step: publish costs and receive due orders.

        Args:
            t: Current time (unused by this implementation)
        """
        # holding cost ignores unlimited (infinite) positions
        pos = np.where(np.isinf(self._position), 0, self._position)
        self._step_holding_cost = float(np.sum(np.abs(pos)) * self.cfg.holding_cost_rate)

        # publish this step's shortage and start accruing afresh, so the
        # reported value does not grow over the whole episode
        self._step_shortage_cost = self._accrued_shortage_cost
        self._accrued_shortage_cost = 0.0

        # receive pending orders
        new_pending = []
        for (remaining, qty) in self._pending_orders:
            if remaining <= 1:
                self._position += qty
            else:
                new_pending.append((remaining - 1, qty))
        self._pending_orders = new_pending

    @property
    def position(self) -> np.ndarray:
        """Position vector with unlimited entries reported as the sentinel -1."""
        return np.where(np.isinf(self._position), -1, self._position)

    @property
    def holding_cost(self) -> float:
        """Holding cost computed by the most recent ``step`` call."""
        return self._step_holding_cost

    @property
    def shortage_cost(self) -> float:
        """Shortage cost of the step closed by the most recent ``step`` call."""
        return self._step_shortage_cost
a random instrument set. + + Args: + n: Number of instruments to create + cost_range: (min, max) for uniform cost sampling + margin_range: (min, max) for uniform margin sampling + inst_type: Type of instruments (SKU, ASSET, etc.) + rng: Random generator (uses default if None) + + Returns: + InstrumentSet with n instruments having random costs and margins + """ + rng = rng or np.random.default_rng() + costs = rng.uniform(*cost_range, n) + margins = rng.uniform(*margin_range, n) + items = [Instrument(id=i, type=inst_type, cost_basis=c, reference_price=c*(1+m)) + for i, (c, m) in enumerate(zip(costs, margins))] + return InstrumentSet(instruments=items) diff --git a/lab/outlet/types.py b/lab/outlet/types.py new file mode 100644 index 0000000..db49117 --- /dev/null +++ b/lab/outlet/types.py @@ -0,0 +1,318 @@ +""" +Core data types for the Quote-Control simulator. + +This module defines the fundamental data structures used throughout the platform: +- Identifiers (InstrumentId, OpportunityId, AgentId) +- Domain objects (Instrument, Quote, Opportunity, Execution) +- Logging structures (StepEvent, StepLogs, StepMetrics) +- State containers (MarketState, HiddenState, Observation, StepResult) + +All dataclasses are designed to be serializable and numpy-compatible. +""" +from __future__ import annotations +from dataclasses import dataclass, field +from typing import Any, NewType +import numpy as np +from .constants import Side, InstrumentType, OpportunityType, EventType + +InstrumentId = NewType('InstrumentId', int) # unique instrument index +OpportunityId = NewType('OpportunityId', str) # unique opportunity/session ID +AgentId = NewType('AgentId', str) # unique agent/actor ID + +@dataclass +class Instrument: + """Represents a priceable entity in the simulation. + + An instrument can be a retail SKU, financial asset, loan product, or subscription. + The cost_basis represents the fundamental value (marginal cost for retail, + mid-price for assets, funding rate for loans). 
@dataclass
class InstrumentSet:
    """A group of instruments plus an optional position vector.

    Exposes per-instrument properties as NumPy vectors. The vectors are
    rebuilt from ``instruments`` on every access.

    Attributes:
        instruments: List of Instrument objects
        position: Current position per instrument (None = unlimited capacity)

    Properties:
        n: Number of instruments
        costs: float32 vector of cost bases
        refs: float32 vector of reference prices
    """
    instruments: list[Instrument]
    position: np.ndarray | None = None

    def _gather(self, attr: str) -> np.ndarray:
        """Collect one attribute from every instrument into a float32 vector."""
        values = [getattr(inst, attr) for inst in self.instruments]
        return np.array(values, np.float32)

    @property
    def n(self) -> int:
        """Number of instruments in the set."""
        return len(self.instruments)

    @property
    def costs(self) -> np.ndarray:
        """Cost basis of each instrument, in list order."""
        return self._gather('cost_basis')

    @property
    def refs(self) -> np.ndarray:
        """Reference price of each instrument, in list order."""
        return self._gather('reference_price')
+ + Attributes: + prices: Posted prices (retail) or mid-quotes (market making) + spreads: Bid-ask spread width for two-sided quoting (None for posted price) + propensity: P(this quote | behavior policy) for importance sampling + metadata: Additional info (prev_prices for delta constraints, etc.) + + Properties: + bids: Computed bid prices (mid - spread/2) + asks: Computed ask prices (mid + spread/2) + """ + prices: np.ndarray + spreads: np.ndarray | None = None + propensity: float = 1.0 + metadata: dict[str, Any] = field(default_factory=dict) + + @property + def bids(self) -> np.ndarray | None: + return self.prices - self.spreads/2 if self.spreads is not None else None + @property + def asks(self) -> np.ndarray | None: + return self.prices + self.spreads/2 if self.spreads is not None else None + +@dataclass +class Opportunity: + """An arrival event that may result in a transaction. + + Opportunities are the demand side of the simulation: + - Retail: browsing session with purchase intent + - Market making: incoming market order + - Lending: loan application + + The context dict carries segment/type information used by execution models. + + Attributes: + id: Unique identifier for this opportunity + type: Category (SESSION, MARKET_ORDER, REQUEST) + side: BUY or SELL intent + instrument_id: Which instrument the opportunity targets + size: Requested transaction size (units, shares, principal) + t: Arrival timestamp + context: Segment info (is_scraper, credit_score, urgency, etc.) + """ + id: OpportunityId + type: OpportunityType + side: Side + instrument_id: InstrumentId + size: float = 1.0 + t: float = 0.0 + context: dict[str, Any] = field(default_factory=dict) + +@dataclass +class Execution: + """A realized transaction after acceptance and position censorship. + + The difference between size_requested and size_filled represents + censored demand due to inventory/position constraints. 
+ + Attributes: + opportunity_id: Links back to the originating Opportunity + instrument_id: Which instrument was traded + side: BUY or SELL + size_requested: Original requested size (true demand) + size_filled: Actual filled size after censorship + price: Execution price + propensity: Combined propensity for OPE (quote * acceptance) + t: Execution timestamp + """ + opportunity_id: OpportunityId + instrument_id: InstrumentId + side: Side + size_requested: float + size_filled: float + price: float + propensity: float = 1.0 + t: float = 0.0 + +@dataclass +class StepEvent: + """Generic logged event""" + t: float + type: EventType + instrument_id: InstrumentId | None = None + opportunity_id: OpportunityId | None = None + price: float | None = None + size: float | None = None + propensity: float = 1.0 + metadata: dict[str, Any] = field(default_factory=dict) + +@dataclass +class StepLogs: + """Container for all logging data from a simulation step. + + Supports both detailed event logging (for OPE) and aggregate-only mode + (for fast simulation). The true_demand vs censored_fills distinction + is critical for research on demand estimation under censorship. + + Attributes: + events: Detailed event log (None if LogLevel != FULL) + executions: List of executed transactions (None if LogLevel != FULL) + aggregates: Always-available aggregate statistics + true_demand: Oracle demand before censorship (for research, not in obs) + censored_fills: Realized fills after position constraints (observable) + """ + events: list[StepEvent] | None = None + executions: list[Execution] | None = None + aggregates: dict[str, Any] = field(default_factory=dict) + true_demand: np.ndarray | None = None + censored_fills: np.ndarray | None = None + +@dataclass +class StepMetrics: + """Computed metrics for a single simulation step. + + Metrics are domain-aware: retail uses revenue/cost/holding_cost, + market making uses spread_capture and inventory risk. 
+ + Attributes: + pnl: Profit and loss (revenue - cost for retail, mark-to-market for finance) + revenue: Gross revenue from sales/executions + cost: Cost of goods sold or position acquisition cost + units_traded: Total units/shares transacted + position_cost: Holding cost (retail) or inventory risk penalty (finance) + lost_opportunity: Cost of stockouts or missed fills + spread_capture: Bid-ask spread captured (market making) + volatility: Price volatility metric for UX consideration + conversion: Fill rate (executions / opportunities) + per_instrument: Per-instrument breakdowns (fills, demand, etc.) + """ + pnl: float = 0.0 + revenue: float = 0.0 + cost: float = 0.0 + units_traded: float = 0.0 + position_cost: float = 0.0 + lost_opportunity: float = 0.0 + spread_capture: float = 0.0 + volatility: float = 0.0 + conversion: float = 0.0 + per_instrument: dict[str, np.ndarray] = field(default_factory=dict) + +@dataclass +class MarketState: + """External market conditions and competitor state. + + For retail: competitor_quotes drives cross-elasticity effects. + For finance: mid_prices and volatility drive execution dynamics. + + Attributes: + competitor_quotes: Competitor posted prices (retail) + mid_prices: Market mid-prices for assets (finance) + volatility: Per-instrument volatility estimate + regime: Market regime identifier (normal, price_war, high_vol, etc.) + t: Timestamp of this market state + """ + competitor_quotes: np.ndarray | None = None + mid_prices: np.ndarray | None = None + volatility: np.ndarray | None = None + regime: str = 'normal' + t: float = 0.0 + +@dataclass +class HiddenState: + """Internal simulator state not exposed to the agent. + + Contains oracle information for research analysis and + history needed for non-stationary dynamics. 
@dataclass
class Observation:
    """Agent-facing view of the simulator state (censored quantities only).

    The observation carries realized fills, never the uncensored demand,
    so a policy can only learn from what it would see in production.

    Attributes:
        quotes: Quotes currently posted (the agent's last action)
        position: Inventory/position state, or None when not tracked
        fills: Censored execution counts per instrument
        exposures: Opportunity exposure counts per instrument
        market: Observable market state, or None
        t: Current timestep
        extra: Additional observable features

    Methods:
        to_flat: Flatten to a 1D numpy array for gym compatibility
    """
    quotes: np.ndarray
    position: np.ndarray | None
    fills: np.ndarray
    exposures: np.ndarray
    market: MarketState | None
    t: int
    extra: dict[str, Any] = field(default_factory=dict)

    def to_flat(self) -> np.ndarray:
        """Return quotes, fills, exposures, then any optional arrays, as one 1D vector.

        Optional pieces (position, competitor quotes) are appended in that
        order and only when present, so the output length depends on which
        of them are set.
        """
        competitor = self.market.competitor_quotes if self.market else None
        pieces = [self.quotes, self.fills, self.exposures]
        pieces.extend(arr for arr in (self.position, competitor) if arr is not None)
        return np.concatenate([arr.flatten() for arr in pieces])
+ + Attributes: + obs: Observable state (censored) + reward: Scalar reward from objective function + terminated: Episode ended naturally (max_steps reached) + truncated: Episode ended early (bankruptcy, constraint violation) + info: Additional info dict (contains true_demand for research) + metrics: Computed metrics for this step + logs: Event logs and aggregates + hidden: Internal simulator state (oracle info) + """ + obs: Observation + reward: float + terminated: bool + truncated: bool + info: dict[str, Any] + metrics: StepMetrics + logs: StepLogs + hidden: HiddenState diff --git a/lab/population/__init__.py b/lab/population/__init__.py new file mode 100644 index 0000000..081dbd0 --- /dev/null +++ b/lab/population/__init__.py @@ -0,0 +1,10 @@ +from .arrivals import PoissonArrivalModel, HawkesArrivalModel, SessionArrivalModel +from .execution import ElasticityExecutionModel, IntensityExecutionModel, LogitExecutionModel +from .competitors import (StaticCompetitorModel, ReactiveCompetitorModel, + StochasticCompetitorModel, GBMMarketModel) + +__all__ = [ + 'PoissonArrivalModel', 'HawkesArrivalModel', 'SessionArrivalModel', + 'ElasticityExecutionModel', 'IntensityExecutionModel', 'LogitExecutionModel', + 'StaticCompetitorModel', 'ReactiveCompetitorModel', 'StochasticCompetitorModel', 'GBMMarketModel', +] diff --git a/lab/population/arrivals.py b/lab/population/arrivals.py new file mode 100644 index 0000000..b7e7ed6 --- /dev/null +++ b/lab/population/arrivals.py @@ -0,0 +1,168 @@ +""" +Arrival models for generating demand opportunities. + +This module provides different arrival processes: +- PoissonArrivalModel: Constant-rate memoryless arrivals +- HawkesArrivalModel: Self-exciting clustered arrivals (market orders) +- SessionArrivalModel: Retail browsing sessions with multi-product views + +Each model implements the ArrivalModel protocol and generates Opportunity objects +that flow through the execution pipeline. 
@dataclass
class PoissonArrivalConfig:
    """Configuration for the Poisson arrival process.

    Attributes:
        base_rate: Expected arrivals per unit time (scaled by
            hidden.true_demand_intensity when sampling)
        side_probs: Probability of each Side; None means BUY-only
    """
    base_rate: float = 10.0
    side_probs: dict[Side, float] | None = None

    def __post_init__(self):
        # Filled in here rather than as a dict default so instances never
        # share one mutable mapping.
        if self.side_probs is None:
            self.side_probs = {Side.BUY: 1.0}


class PoissonArrivalModel:
    """Homogeneous Poisson arrival process.

    Generates arrivals at a constant rate (modulated by demand intensity).
    Suitable for stationary demand or as a baseline model.
    """

    def __init__(self, cfg: PoissonArrivalConfig | None = None):
        self.cfg = cfg or PoissonArrivalConfig()

    def sample(self, t: float, dt: float, instruments: InstrumentSet,
               market: MarketState | None, hidden: HiddenState,
               rng: np.random.Generator) -> list[Opportunity]:
        """Draw the opportunities arriving during one step.

        Args:
            t: Timestamp stamped on every generated opportunity.
            dt: Step length forwarded to ``poisson_arrivals``.
            instruments: Instrument universe; only ``instruments.n`` is read.
            market: Unused by this model.
            hidden: Supplies ``true_demand_intensity``, which scales the rate.
            rng: Source of all randomness except the opportunity ids.

        Returns:
            One unit-size SESSION opportunity per arrival, each on a
            uniformly chosen instrument.
        """
        rate = self.cfg.base_rate * hidden.true_demand_intensity
        n_arrivals = poisson_arrivals(rate, dt, rng)

        # Hoisted: the side distribution does not change between arrivals.
        sides = list(self.cfg.side_probs)
        probs = list(self.cfg.side_probs.values())

        opps = []
        for _ in range(n_arrivals):
            # Plain int, matching how other models index instruments.
            inst_id = int(rng.integers(0, instruments.n))
            # Sample an index instead of the members themselves: handing enum
            # members to rng.choice lets NumPy coerce str/int-backed enums to
            # NumPy scalars, so the result may no longer be a Side.
            side = sides[int(rng.choice(len(sides), p=probs))]
            opps.append(Opportunity(
                id=str(uuid4())[:8], type=OpportunityType.SESSION,
                side=side, instrument_id=inst_id, size=1.0, t=t,
                context={'segment': 'default'}
            ))
        return opps
class HawkesArrivalModel:
    """Self-exciting Hawkes point process for clustered arrivals.

    Models order flow where arrivals cluster in time (momentum, herding).
    Intensity: lambda(t) = base + alpha * sum(exp(-beta * (t - t_i)))

    Used for market making scenarios where orders arrive in bursts.
    """

    # Arrivals older than this many time units are dropped from the history.
    _HISTORY_WINDOW = 10.0

    def __init__(self, cfg: HawkesArrivalConfig | None = None):
        self.cfg = cfg or HawkesArrivalConfig()
        self._history: np.ndarray = np.array([])

    def sample(self, t: float, dt: float, instruments: InstrumentSet,
               market: MarketState | None, hidden: HiddenState,
               rng: np.random.Generator) -> list[Opportunity]:
        """Draw the market orders arriving during one step.

        The intensity is evaluated once, at ``t``, from the retained arrival
        history; arrivals generated in this call only influence later calls.

        Args:
            t: Start time of the step.
            dt: Step length; arrival times are uniform on [t, t + dt).
            instruments: Instrument universe; only ``instruments.n`` is read.
            market: Unused by this model.
            hidden: Supplies ``true_demand_intensity``, which scales the base rate.
            rng: Source of all randomness except the opportunity ids.

        Returns:
            One MARKET_ORDER opportunity per arrival, with exponentially
            distributed size.
        """
        intensity = hawkes_intensity(
            self.cfg.base_rate * hidden.true_demand_intensity,
            self._history, self.cfg.alpha, self.cfg.beta, t
        )
        n_arrivals = poisson_arrivals(intensity, dt, rng)

        # Hoisted: the side distribution does not change between arrivals.
        sides = list(self.cfg.side_probs)
        probs = list(self.cfg.side_probs.values())

        opps = []
        arrival_times = []
        for _ in range(n_arrivals):
            arr_t = t + rng.uniform(0, dt)
            arrival_times.append(arr_t)
            inst_id = int(rng.integers(0, instruments.n))
            # Index sampling keeps the result a genuine Side member (NumPy may
            # coerce str/int-backed enum members passed to rng.choice).
            side = sides[int(rng.choice(len(sides), p=probs))]
            opps.append(Opportunity(
                id=str(uuid4())[:8], type=OpportunityType.MARKET_ORDER,
                side=side, instrument_id=inst_id,
                size=rng.exponential(1.0), t=arr_t,
                context={'intensity': intensity}
            ))

        # One concatenation per step: np.append inside the loop copies the
        # whole history on every arrival.
        if arrival_times:
            self._history = np.concatenate([self._history, arrival_times])
        # Forget arrivals that fell out of the window.
        self._history = self._history[self._history > t - self._HISTORY_WINDOW]
        return opps
class SessionArrivalModel:
    """Retail browsing session model with multi-product views.

    Each session views multiple products, generating one opportunity per view.
    Scraper sessions (controlled by contamination) view more products
    but convert at lower rates (handled by ExecutionModel).
    """

    def __init__(self, cfg: SessionArrivalConfig | None = None):
        self.cfg = cfg or SessionArrivalConfig()

    def sample(self, t: float, dt: float, instruments: InstrumentSet,
               market: MarketState | None, hidden: HiddenState,
               rng: np.random.Generator) -> list[Opportunity]:
        """Generate one step's worth of browsing sessions.

        Args:
            t: Timestamp stamped on every generated opportunity.
            dt: Unused; the session count per step is fixed by the config.
            instruments: Instrument universe; only ``instruments.n`` is read.
            market: Unused by this model.
            hidden: Supplies the contamination rate; the config value is the
                fallback when ``hidden`` is None.
            rng: Source of all randomness except the session ids.

        Returns:
            One BUY-side SESSION opportunity per distinct product viewed.
            ``context['n_views']`` is the number of products actually viewed
            in that session.
        """
        n_sessions = self.cfg.sessions_per_step
        contamination = hidden.contamination if hidden is not None else self.cfg.contamination
        lo, hi = self.cfg.views_per_session
        opps = []

        for _ in range(n_sessions):
            is_scraper = rng.random() < contamination
            # Generator.integers draws from [lo, hi) and raises when
            # lo >= hi, so a degenerate range is treated as the constant lo.
            n_views = int(rng.integers(lo, hi)) if hi > lo else int(lo)
            sid = str(uuid4())[:8]

            # scrapers view more products
            if is_scraper:
                n_views *= 3

            # A session cannot view more distinct products than exist; clip
            # once so the logged n_views matches what was sampled.
            n_views = min(n_views, instruments.n)

            viewed = rng.choice(instruments.n, size=n_views, replace=False)
            for inst_id in viewed:
                opps.append(Opportunity(
                    id=f"{sid}-{inst_id}", type=OpportunityType.SESSION,
                    side=Side.BUY, instrument_id=int(inst_id), size=1.0, t=t,
                    context={'session_id': sid, 'is_scraper': is_scraper, 'n_views': n_views}
                ))
        return opps
+ +This module provides models for competitor pricing (retail) and market dynamics (finance): +- StaticCompetitorModel: Fixed competitor prices +- ReactiveCompetitorModel: Competitor reacts to agent's prices, can trigger price wars +- StochasticCompetitorModel: Random walk competitor prices +- GBMMarketModel: Geometric Brownian Motion for asset mid-prices + +Each model implements the MarketModel protocol. +""" +from __future__ import annotations +from dataclasses import dataclass +import numpy as np +from ..outlet.types import Quote, MarketState, HiddenState +from ..outlet.math_util import clamp, ema + +@dataclass +class StaticCompetitorConfig: + """Configuration for static competitor. + + Attributes: + markup: Fixed percentage markup over reference prices + """ + markup: float = 0.1 + +class StaticCompetitorModel: + """Static competitor with fixed markup pricing. + + Competitor prices = reference * (1 + markup). + Useful as a baseline or for testing without competitor dynamics. + """ + + def __init__(self, cfg: StaticCompetitorConfig | None = None, refs: np.ndarray | None = None): + self.cfg = cfg or StaticCompetitorConfig() + self.refs = refs + + def step(self, t: float, self_quotes: Quote, hidden: HiddenState, + rng: np.random.Generator) -> MarketState: + refs = self.refs if self.refs is not None else self_quotes.prices + comp_prices = refs * (1 + self.cfg.markup) + return MarketState(competitor_quotes=comp_prices, regime='static', t=t) + +@dataclass +class ReactiveCompetitorConfig: + """Configuration for reactive competitor. 
+ + Attributes: + follow_weight: Smoothing weight for price following (0=ignore, 1=instant) + band_pct: Maximum deviation from reference prices + war_threshold: Relative price diff that triggers price war + war_aggression: How much competitor cuts prices during war + """ + follow_weight: float = 0.3 + band_pct: float = 0.1 + war_threshold: float = -0.15 + war_aggression: float = 0.2 + +class ReactiveCompetitorModel: + """Competitor that reacts to agent's prices with price war dynamics. + + The competitor follows the agent's prices with smoothing. + If the agent undercuts significantly (beyond war_threshold), + a price war is triggered where the competitor becomes more aggressive. + + This creates non-stationary dynamics that test policy robustness. + """ + + def __init__(self, cfg: ReactiveCompetitorConfig | None = None, refs: np.ndarray | None = None): + self.cfg = cfg or ReactiveCompetitorConfig() + self.refs = refs + self._prices: np.ndarray | None = None + self._in_war: bool = False + + def step(self, t: float, self_quotes: Quote, hidden: HiddenState, + rng: np.random.Generator) -> MarketState: + refs = self.refs if self.refs is not None else self_quotes.prices + c = self.cfg + + if self._prices is None: + self._prices = refs.copy() + + # check for price war trigger + relative_diff = (self_quotes.prices - self._prices) / (self._prices + 1e-8) + if np.any(relative_diff < c.war_threshold): + self._in_war = True + elif np.all(relative_diff > -c.war_threshold / 2): + self._in_war = False + + # update prices + if self._in_war: + target = self_quotes.prices * (1 - c.war_aggression) + hidden.regime = 'price_war' + else: + target = self_quotes.prices * (1 + c.follow_weight * 0.05) + hidden.regime = 'normal' + + # follow with smoothing + new_prices = np.array([ema(old, new, c.follow_weight) + for old, new in zip(self._prices, target)]) + + # stay within band + new_prices = clamp(new_prices, refs * (1 - c.band_pct), refs * (1 + c.band_pct)) + self._prices = new_prices + 
+ return MarketState(competitor_quotes=new_prices, regime=hidden.regime, t=t) + +@dataclass +class StochasticCompetitorConfig: + """Configuration for stochastic competitor. + + Attributes: + drift: Price drift per step + volatility: Price volatility (std of random shocks) + mean_revert: Mean reversion strength toward reference + """ + drift: float = 0.0 + volatility: float = 0.02 + mean_revert: float = 0.1 + +class StochasticCompetitorModel: + """Ornstein-Uhlenbeck style stochastic competitor prices. + + Prices follow: dP = drift + mean_revert*(ref - P) + volatility*P*dW + + Provides non-stationary competitor dynamics independent of agent actions. + Useful for testing robustness to market noise. + """ + + def __init__(self, cfg: StochasticCompetitorConfig | None = None, refs: np.ndarray | None = None): + self.cfg = cfg or StochasticCompetitorConfig() + self.refs = refs + self._prices: np.ndarray | None = None + + def step(self, t: float, self_quotes: Quote, hidden: HiddenState, + rng: np.random.Generator) -> MarketState: + refs = self.refs if self.refs is not None else self_quotes.prices + c = self.cfg + + if self._prices is None: + self._prices = refs.copy() + + # Ornstein-Uhlenbeck style dynamics + n = len(self._prices) + noise = rng.normal(0, c.volatility, n) + reversion = c.mean_revert * (refs - self._prices) + self._prices = self._prices + c.drift + reversion + noise * self._prices + self._prices = np.maximum(self._prices, refs * 0.5) + + return MarketState(competitor_quotes=self._prices.copy(), regime='stochastic', t=t) + +@dataclass +class GBMMarketConfig: + """Configuration for GBM market model. + + Attributes: + mu: Price drift (expected return) + sigma: Price volatility + dt: Time step size + """ + mu: float = 0.0 + sigma: float = 0.1 + dt: float = 1.0 + +class GBMMarketModel: + """Geometric Brownian Motion model for asset mid-prices. 
+ + Standard Black-Scholes dynamics: dS = mu*S*dt + sigma*S*dW + + Used for market making scenarios where the underlying asset price + follows a random walk. The agent quotes around this moving mid-price. + """ + + def __init__(self, cfg: GBMMarketConfig | None = None, initial: np.ndarray | None = None): + self.cfg = cfg or GBMMarketConfig() + self._mids = initial + + def step(self, t: float, self_quotes: Quote, hidden: HiddenState, + rng: np.random.Generator) -> MarketState: + if self._mids is None: + self._mids = self_quotes.prices.copy() + + c = self.cfg + n = len(self._mids) + z = rng.standard_normal(n) + self._mids = self._mids * np.exp((c.mu - 0.5*c.sigma**2)*c.dt + c.sigma*np.sqrt(c.dt)*z) + + vol = np.full(n, c.sigma) + return MarketState(mid_prices=self._mids.copy(), volatility=vol, regime='gbm', t=t) diff --git a/lab/population/execution.py b/lab/population/execution.py new file mode 100644 index 0000000..97484b2 --- /dev/null +++ b/lab/population/execution.py @@ -0,0 +1,174 @@ +""" +Execution models for computing acceptance/fill probabilities. + +This module provides different models for how opportunities convert to executions: +- ElasticityExecutionModel: Price elasticity with competitor cross-effects (retail) +- IntensityExecutionModel: Distance-based fill intensity (market making) +- LogitExecutionModel: Discrete choice model + +Each model implements the ExecutionModel protocol. +""" +from __future__ import annotations +from dataclasses import dataclass +from typing import Any +import numpy as np +from ..outlet.types import Opportunity, Quote, InstrumentSet, MarketState +from ..outlet.constants import Side +from ..outlet.math_util import sigmoid, safe_log, intensity_decay, EPS + +@dataclass +class ElasticityConfig: + """Configuration for price elasticity execution model. 
+ + Attributes: + base_prob: Baseline purchase probability at reference price + price_sensitivity: Own-price elasticity coefficient + cross_elasticity: Competitor price cross-elasticity + scraper_conversion: Multiplier for scraper conversion (typically << 1) + """ + base_prob: float = 0.3 + price_sensitivity: float = 2.0 + cross_elasticity: float = 0.5 + scraper_conversion: float = 0.01 + +class ElasticityExecutionModel: + """Price elasticity model for retail dynamic pricing. + + P(buy) = base_prob * exp(-sensitivity * log(price/ref)) * cross_effect * scraper_mult + + Higher prices reduce purchase probability exponentially. + Competitor undercutting shifts demand away from the platform. + Scrapers convert at a much lower rate (reconnaissance, not purchase). + """ + + def __init__(self, cfg: ElasticityConfig | None = None): + self.cfg = cfg or ElasticityConfig() + + def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet, + market: MarketState | None, rng: np.random.Generator) -> float: + idx = int(opp.instrument_id) + price = quote.prices[idx] + ref = instruments.refs[idx] + + # base probability adjusted by price ratio + log_ratio = safe_log(price / ref) + prob = self.cfg.base_prob * np.exp(-self.cfg.price_sensitivity * log_ratio) + + # cross-elasticity: competitor undercutting increases their share + if market and market.competitor_quotes is not None: + comp_price = market.competitor_quotes[idx] + if comp_price < price: + prob *= np.exp(-self.cfg.cross_elasticity * (price - comp_price) / ref) + + # scrapers convert at much lower rate + if opp.context.get('is_scraper', False): + prob *= self.cfg.scraper_conversion + + return float(np.clip(prob, 0, 1)) + + def uncensor(self, fills: np.ndarray, instruments: InstrumentSet, + context: dict[str, Any] | None = None) -> np.ndarray: + # simple imputation: assume fills = prob * exposures, invert + exposures = context.get('exposures', fills) if context else fills + avg_prob = self.cfg.base_prob + return 
fills / (avg_prob + EPS) + +@dataclass +class IntensityConfig: + """Configuration for intensity-based execution model. + + Attributes: + base_intensity: Baseline fill intensity + kappa: Decay rate with distance from mid-price + vol_scale: Volatility multiplier for fill intensity + """ + base_intensity: float = 1.0 + kappa: float = 1.5 + vol_scale: float = 0.5 + +class IntensityExecutionModel: + """Avellaneda-Stoikov style fill intensity for market making. + + Fill probability decays exponentially with distance from mid-price: + P(fill) = base * exp(-kappa * |quote - mid|) * (1 + vol_scale * sigma) + + Tighter spreads (closer to mid) have higher fill probability. + Higher volatility increases fill probability (more aggressive traders). + """ + + def __init__(self, cfg: IntensityConfig | None = None): + self.cfg = cfg or IntensityConfig() + + def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet, + market: MarketState | None, rng: np.random.Generator) -> float: + idx = int(opp.instrument_id) + + # get mid price from market or use quote price + if market and market.mid_prices is not None: + mid = market.mid_prices[idx] + else: + mid = quote.prices[idx] + + # compute distance from mid + if opp.side == Side.BUY: + exec_price = quote.asks[idx] if quote.asks is not None else quote.prices[idx] + distance = exec_price - mid + else: + exec_price = quote.bids[idx] if quote.bids is not None else quote.prices[idx] + distance = mid - exec_price + + # intensity decays with distance + intensity = self.cfg.base_intensity * intensity_decay(abs(distance), self.cfg.kappa) + + # volatility increases fill probability + if market and market.volatility is not None: + vol = market.volatility[idx] + intensity *= (1 + self.cfg.vol_scale * vol) + + return float(np.clip(intensity, 0, 1)) + + def uncensor(self, fills: np.ndarray, instruments: InstrumentSet, + context: dict[str, Any] | None = None) -> np.ndarray: + return fills # market making doesn't have same censorship 
@dataclass
class LogitConfig:
    """Configuration for logit discrete choice model.

    Attributes:
        beta_0: Intercept (base utility)
        beta_price: Price coefficient (typically negative)
        beta_quality: Quality attribute coefficient
    """
    beta_0: float = 0.5
    beta_price: float = -1.5
    beta_quality: float = 0.3


class LogitExecutionModel:
    """Discrete choice logit model for purchase probability.

    Utility: U = beta_0 + beta_price * (price/ref) + beta_quality * quality
    P(buy) = sigmoid(U)
    """

    # Quality assumed when an instrument does not declare one.
    _DEFAULT_QUALITY = 0.5

    def __init__(self, cfg: LogitConfig | None = None):
        self.cfg = cfg or LogitConfig()

    def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet,
             market: MarketState | None, rng: np.random.Generator) -> float:
        """Return the purchase probability for one opportunity.

        Args:
            opp: Opportunity whose ``instrument_id`` selects the instrument.
            quote: Current quote; ``quote.prices`` is indexed by instrument.
            instruments: Supplies the reference price and the optional
                ``quality`` attribute of the instrument.
            market: Unused by this model.
            rng: Unused by this model.
        """
        idx = int(opp.instrument_id)
        price = quote.prices[idx]
        ref = instruments.refs[idx]
        quality = instruments.instruments[idx].attrs.get('quality', self._DEFAULT_QUALITY)

        # utility
        u = self.cfg.beta_0 + self.cfg.beta_price * (price / ref) + self.cfg.beta_quality * quality

        # choice probability via sigmoid
        return float(sigmoid(u))

    def uncensor(self, fills: np.ndarray, instruments: InstrumentSet,
                 context: dict[str, Any] | None = None) -> np.ndarray:
        """Impute demand from fills using the baseline purchase probability.

        The baseline is P(buy) at the reference price (price/ref == 1) and
        the default quality. Dividing by the raw intercept ``beta_0`` mixed a
        utility with a count and produced negative or exploding demand for
        ``beta_0 <= 0``; a probability in (0, 1) keeps the estimate
        non-negative and at least as large as the observed fills.
        """
        base_utility = (self.cfg.beta_0 + self.cfg.beta_price
                        + self.cfg.beta_quality * self._DEFAULT_QUALITY)
        base_prob = float(sigmoid(base_utility))
        return fills / (base_prob + EPS)
make_retail_platform() + print(f"Instruments: {platform.instruments.n}") + print(f"Reference prices: {platform.instruments.refs[:5].round(2)}...") + + # compare policies + policies = { + 'fixed': fixed_price_policy(platform.instruments.refs), + 'cost_plus_30%': cost_plus_margin_policy(platform.instruments.costs, 0.3), + 'cost_plus_50%': cost_plus_margin_policy(platform.instruments.costs, 0.5), + 'random_walk': random_walk_policy(platform.instruments.refs, 0.03), + } + + results = compare_policies(platform, policies, n_steps=100, n_runs=3) + + print("\nPolicy Comparison (100 steps, 3 runs):") + print("-" * 50) + for name, r in sorted(results.items(), key=lambda x: -x[1]['mean_pnl']): + print(f"{name:20s} PnL={r['mean_pnl']:8.1f} +/- {r['std_reward']:6.1f} " + f"conv={r['mean_conversion']:.3f}") + +def demo_market_making(): + print("\n" + "=" * 60) + print("MARKET MAKING DEMO") + print("=" * 60) + + platform = make_market_making_platform() + print(f"Instruments: {platform.instruments.n}") + print(f"Initial mids: {platform.instruments.refs.round(2)}") + + # simple policy: quote at mid with fixed spread + def mm_policy(obs: np.ndarray, t: int): + mids = platform.instruments.refs # would use obs in real policy + return mids, 1.0 + + result = rollout(platform, mm_policy, n_steps=200, seed=42) + print(f"\nRollout (200 steps):") + print(f" Total PnL: {result.total_pnl:.2f}") + print(f" Avg conversion: {result.avg_conversion:.3f}") + print(f" Total spread capture: {sum(m.spread_capture for m in result.metrics):.2f}") + +if __name__ == '__main__': + demo_retail() + demo_market_making() diff --git a/sim/case/__init__.py b/sim/case/__init__.py new file mode 100644 index 0000000..cb6c13c --- /dev/null +++ b/sim/case/__init__.py @@ -0,0 +1,2 @@ +"""Case-specific simulations and experiments.""" + diff --git a/sim/case/thesis_simplified/__init__.py b/sim/case/thesis_simplified/__init__.py new file mode 100644 index 0000000..6259958 --- /dev/null +++ 
b/sim/case/thesis_simplified/__init__.py @@ -0,0 +1,2 @@ +"""Minimal thesis-aligned pricing simulation (self-contained).""" + diff --git a/sim/case/thesis_simplified/coi.py b/sim/case/thesis_simplified/coi.py new file mode 100644 index 0000000..1657f65 --- /dev/null +++ b/sim/case/thesis_simplified/coi.py @@ -0,0 +1,125 @@ +"""Cost of Information (COI) computation for thesis pricing system. + +Core KPI: COI = E[p_shown] - p_min measures pricing power from information asymmetry. +Theorem 1 shows COI erodes as agent queries increase: as N->inf, p^(1)->p_min. +""" +from __future__ import annotations +from dataclasses import dataclass +from typing import Dict, List, TYPE_CHECKING +import numpy as np + +if TYPE_CHECKING: + from .simplified import Session + + +@dataclass(frozen=True) +class COIWindow: + """Windowed COI metrics computed from realized price exposures. + + policy: E[p_shown] - cost, the definition-level KPI + agent: E[p^(1)] - cost where p^(1) is min price under agent querying + leak: max(policy - agent, 0), observable gap from reconnaissance + survival_ratio: agent/policy, fraction of pricing power retained + """ + policy: float + agent: float + leak: float + survival_ratio: float + policy_by_product: np.ndarray + agent_by_product: np.ndarray + demand_weights: np.ndarray + + +def aggregate_prices(sessions: List["Session"], mode: str = "all") -> Dict[int, List[float] | float]: + """Unified price aggregation across sessions. 
def demand_weights_by_product(sessions: List["Session"], demand_mapping: Dict[str, float], n_products: int) -> np.ndarray:
    """Compute demand-weighted importance per product.

    Each session id in ``demand_mapping`` contributes its demand to the
    product of that session's first event. Unknown session ids, sessions
    without events, and product indices outside ``[0, n_products)`` are
    ignored.

    Returns:
        Array of length ``n_products`` summing to 1, or all zeros when no
        demand could be attributed.
    """
    w = np.zeros(n_products, dtype=float)
    sessions_by_id = {s.sid: s for s in sessions}
    for sid, q in demand_mapping.items():
        sess = sessions_by_id.get(sid)
        if sess is None or not sess.events:
            continue
        pidx = int(sess.events[0].product_idx)
        # Same bounds guard as the per-product COI loops: an unchecked index
        # raises IndexError when too large and silently wraps when negative.
        if 0 <= pidx < n_products:
            w[pidx] += float(q)
    total = float(np.sum(w))
    return (w / total) if total > 0 else w
def coi_erosion(coi_policy: float, coi_agent: float, eps: float = 1e-9) -> float:
    """Fraction of policy-level COI destroyed by agent querying.

    erosion = 1 - COI_agent / COI_policy, clipped to [0, 1].
    A policy COI at or below ``eps`` has nothing to erode, so the result
    is 0.0 in that case.
    """
    has_pricing_power = coi_policy > eps
    if not has_pricing_power:
        return 0.0
    # eps in the denominator mirrors the threshold used in the guard.
    retained = coi_agent / (coi_policy + eps)
    eroded = 1.0 - retained
    return float(np.clip(eroded, 0.0, 1.0))
def run_policy_episode(
    env: PricingEnv,
    policy_fn,
    n_episodes: int = 10
) -> Tuple[List[float], List[float], List[float], List[float]]:
    """Roll a policy out for several episodes and gather per-step metrics.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and ``n``.
        policy_fn: Callable ``(obs, n) -> action``.
        n_episodes: Number of episodes to run.

    Returns:
        ``(rewards, coi_erosions, alpha_errors, revenues)``, each a flat list
        over all steps of all episodes. A metric is recorded for a step only
        when the corresponding key(s) are present in that step's ``info``.
    """
    reward_log: List[float] = []
    erosion_log: List[float] = []
    alpha_err_log: List[float] = []
    revenue_log: List[float] = []

    for _episode in range(n_episodes):
        obs, info = env.reset()
        while True:
            obs, reward, terminated, truncated, info = env.step(policy_fn(obs, env.n))
            reward_log.append(reward)
            if 'coi_erosion' in info:
                erosion_log.append(info['coi_erosion'])
            if 'alpha_true' in info and 'alpha_est' in info:
                alpha_err_log.append(abs(info['alpha_true'] - info['alpha_est']))
            if 'revenue' in info:
                revenue_log.append(info['revenue'])
            # An episode ends on either termination or truncation.
            if terminated or truncated:
                break

    return reward_log, erosion_log, alpha_err_log, revenue_log
def run_contamination_sweep(
    alphas: List[float],
    policies: Dict[str, callable],
    n_products: int = 10,
    max_steps: int = 200,
    n_episodes: int = 10,
    seed: int = 42,
    log_dir: str = None
) -> Dict[str, List[ExperimentResult]]:
    """Run every policy at every contamination level and summarise the metrics.

    Args:
        alphas: Contamination levels (``alpha_true``) to evaluate.
        policies: Mapping ``name -> policy_fn(obs, n)``.
        n_products: Number of products in the environment.
        max_steps: Episode length.
        n_episodes: Episodes per (alpha, policy) pair.
        seed: Seed passed to the environment config.
        log_dir: When given (and TensorBoard is importable), scalars are
            written under ``<log_dir>/sweep``.

    Returns:
        ``{policy_name: [ExperimentResult, ...]}`` with one result per alpha,
        in the order of ``alphas``.
    """

    def _mean_or_zero(values) -> float:
        # Optional metrics may be absent from ``info``; report 0.0 then.
        return float(np.mean(values)) if values else 0.0

    def _tb_step(alpha: float) -> int:
        # round() rather than a bare int(): int(0.29 * 100) is 28, which would
        # log the point under the wrong contamination level.
        return int(round(alpha * 100))

    results = {name: [] for name in policies}
    writer = SummaryWriter(Path(log_dir) / "sweep") if log_dir and HAS_TB else None

    try:
        for alpha in alphas:
            print(f" alpha={alpha:.2f}", end=" ")
            env_cfg = EnvConfig(
                n_products=n_products, max_steps=max_steps,
                alpha_true=alpha, reward_mode="robust", seed=seed)
            env = make_env(env_cfg)

            for name, policy_fn in policies.items():
                rewards, coi_vals, alpha_errs, revenues = run_policy_episode(env, policy_fn, n_episodes)

                # Nominal margin: probe the policy with an all-zero observation.
                probe = policy_fn(np.zeros(3 * n_products + 3), n_products)

                result = ExperimentResult(
                    name=name, alpha=alpha,
                    reward_mean=float(np.mean(rewards)),
                    reward_std=float(np.std(rewards)),
                    coi_erosion=_mean_or_zero(coi_vals),
                    alpha_error=_mean_or_zero(alpha_errs),
                    revenue=_mean_or_zero(revenues),
                    margin=float(np.mean([probe]) - 1.0))

                results[name].append(result)

                if writer:
                    step = _tb_step(alpha)
                    writer.add_scalar(f'{name}/reward', result.reward_mean, step)
                    writer.add_scalar(f'{name}/coi_erosion', result.coi_erosion, step)
                    writer.add_scalar(f'{name}/alpha_error', result.alpha_error, step)
                    writer.add_scalar(f'{name}/revenue', result.revenue, step)

            print("done")

        # add theoretical curve
        if writer:
            theo = theoretical_coi_erosion_curve(np.array(alphas))
            for a, e in zip(alphas, theo):
                writer.add_scalar('theoretical/coi_erosion', e, _tb_step(a))
    finally:
        # Flush and release the event file even if an episode raised.
        if writer:
            writer.close()

    return results
def run_reward_mode_comparison(log_dir: str = "sim/case/thesis_simplified/runs", seed: int = 42) -> Dict:
    """Evaluate the adaptive baseline under each reward mode at alpha = 0.3.

    Writes one TensorBoard scalar per metric (when TensorBoard is available)
    and dumps the summary to ``<log_dir>/reward_mode_results.json``.

    Args:
        log_dir: Output directory; created if missing.
        seed: Seed passed to every environment config.

    Returns:
        ``{mode: {"reward_mean", "reward_std", "coi_erosion", "revenue"}}``.
    """
    print("=== Reward Mode Comparison ===\n")

    out_dir = Path(log_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(out_dir / "reward_modes") if HAS_TB else None

    contamination = 0.3  # moderate contamination
    results = {}

    for mode in ("revenue", "profit", "robust", "coi_aware"):
        print(f" mode={mode}", end=" ")
        env = make_env(EnvConfig(
            n_products=10, max_steps=200, alpha_true=contamination,
            reward_mode=mode, seed=seed))

        rewards, coi_vals, _, revenues = run_policy_episode(
            env, PolicyRegistry.adaptive, n_episodes=10)

        summary = {
            'reward_mean': float(np.mean(rewards)),
            'reward_std': float(np.std(rewards)),
            'coi_erosion': float(np.mean(coi_vals)) if coi_vals else 0.0,
            'revenue': float(np.mean(revenues)) if revenues else 0.0}
        results[mode] = summary

        if writer:
            for key, value in summary.items():
                writer.add_scalar(f'{mode}/{key}', value, 0)

        print(f"reward={summary['reward_mean']:.2f}, coi={summary['coi_erosion']:.3f}")

    if writer:
        writer.close()

    with open(out_dir / "reward_mode_results.json", 'w') as f:
        json.dump(results, f, indent=2)

    return results
def build_kernel(events: List["Event"]) -> Dict[str, Dict[str, float]]:
    """Build the empirical transition kernel T' from trajectory events.

    The walk starts in the implicit state ``"start"``; every event's
    ``action`` is the next state. Counts are normalised per source state so
    each row sums to 1.

    Args:
        events: Ordered events of one session; only ``.action`` is read.

    Returns:
        ``{source_state: {destination_state: probability}}``. Empty when
        ``events`` is empty.
    """
    counts: Dict[str, Dict[str, int]] = {}
    prev = "start"
    for e in events:
        row = counts.setdefault(prev, {})
        row[e.action] = row.get(e.action, 0) + 1
        prev = e.action

    kernel: Dict[str, Dict[str, float]] = {}
    for src, dsts in counts.items():
        # A row only exists once it holds a count, so the total is always
        # positive; compute it once per row instead of once per destination.
        total = sum(dsts.values())
        kernel[src] = {dst: c / total for dst, c in dsts.items()}
    return kernel
def estimate_alpha(session: "Session", beta: float = 2.0) -> float:
    """Estimate the probability that a session was generated by an agent.

    alpha_hat = sigma(beta * (delta_H - delta_A)), where delta_H and delta_A
    are the session's divergences from the human and agent prototypes.

    Args:
        session: Session whose events are scored.
        beta: Slope of the logistic link.

    Returns:
        A value in ``[0, 1]``; ``0.5`` when both divergences sum to <= 0.
    """
    delta_h, delta_a = compute_divergence(session)
    # No divergence signal at all: stay neutral.
    if delta_h + delta_a <= 0:
        return 0.5
    logit = beta * (delta_h - delta_a)
    return 1.0 / (1.0 + np.exp(-logit))
def compute_demand(session: Session) -> float:
    """Return the demand proxy q_hat: the summed action weights of a session.

    Actions missing from ``ACTION_WEIGHTS`` contribute a weight of 0.1.
    """
    q_hat = 0
    for event in session.events:
        q_hat += ACTION_WEIGHTS.get(event.action, 0.1)
    return q_hat
def put_prices_to_market(prices: np.ndarray, costs: np.ndarray, alpha: float = 0.2, n_sessions: int = 50,
                         seed: int | None = None) -> Tuple[List[Session], Dict[str, float]]:
    """Simulate a batch of sessions drawn from the human/agent mixture.

    Each session is an agent with probability ``alpha``. Agents get low price
    sensitivity and a fixed 0.01 base conversion; humans get higher, randomly
    drawn values for both.

    Args:
        prices: Posted price per product.
        costs: Unit cost per product.
        alpha: Probability that a session is agent-generated.
        n_sessions: Number of sessions to generate.
        seed: Seed for the local random generator.

    Returns:
        ``(sessions, demand)`` where ``demand`` maps session id -> q_hat.
    """
    rng = np.random.default_rng(seed)
    sessions: List[Session] = []
    demand: Dict[str, float] = {}

    for idx in range(n_sessions):
        # Draw order matters for reproducibility: actor first, then theta.
        is_agent = rng.random() < alpha
        if is_agent:
            actor, kernel = "A", TRANS_A
            theta = {"price_sens": rng.uniform(0.05, 0.2), "base_conv": 0.01}
        else:
            actor, kernel = "H", TRANS_H
            theta = {"price_sens": rng.uniform(1.5, 4.0), "base_conv": rng.uniform(0.2, 0.5)}

        events, _pidx = sample_trajectory(rng, kernel, prices, costs=costs, theta=theta, is_agent=is_agent)
        session = Session(sid=f"s{idx:04d}", events=events, actor=actor, theta=theta)
        sessions.append(session)
        demand[session.sid] = compute_demand(session)

    return sessions, demand
class System:
    """Main pricing system implementing the robust Stackelberg objective.

    Manages the alternating loop: set prices p_t -> observe demand Q_hat(p_t) ->
    estimate contamination alpha from behavioral signals -> compute next prices.

    Args:
        n_products: Number of products.
        costs: Unit costs; drawn uniformly from [10, 50) when omitted.
        lambda_coi: Weight of the COI leak penalty in the objective.
        seed: Seed for the system's random generator.
    """

    def __init__(self, n_products: int = 10, costs: np.ndarray | None = None, lambda_coi: float = 0.5, seed: int | None = 42):
        self.n = n_products
        self.rng = np.random.default_rng(seed)
        self.costs = costs if costs is not None else self.rng.uniform(10, 50, n_products)
        # Reference prices sit 20-50% above cost.
        self.refs = self.costs * (1 + self.rng.uniform(0.2, 0.5, n_products))
        self.lambda_coi = lambda_coi
        self.limbo = Limbo()
        self._alpha_est = 0.2
        self._sessions: List[Session] = []       # full history, oldest first
        self._last_sessions: List[Session] = []  # sessions of the latest market round
        self._last_coi: COIWindow | None = None

    @property
    def alpha(self) -> float:
        """Current contamination estimate alpha_hat."""
        return self._alpha_est

    def _estimate_alpha_from_sessions(self) -> float:
        """Average the per-session alpha estimates over the 50 newest sessions."""
        if not self._sessions:
            return self._alpha_est
        return float(np.mean([estimate_alpha(s) for s in self._sessions[-50:]]))

    def _resolve_sessions(self, sids) -> Dict[str, Session]:
        """Map each session id to the most recent session carrying it.

        Session ids restart at ``s0000`` on every market round, so an id is not
        unique across the history. The latest round is consulted first; ids not
        found there fall back to the history, where the newest entry wins.
        """
        wanted = set(sids)
        resolved = {s.sid: s for s in self._last_sessions if s.sid in wanted}
        missing = wanted - resolved.keys()
        if missing:
            for s in self._sessions:  # oldest -> newest, later entries overwrite
                if s.sid in missing:
                    resolved[s.sid] = s
        return resolved

    def _revenue_under_demand(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
        """Return ``prices . q`` with each session's demand credited to its first product."""
        by_sid = self._resolve_sessions(demand)
        agg = np.zeros(self.n)
        for sid, q in demand.items():
            sess = by_sid.get(sid)
            if sess is not None and sess.events:
                agg[sess.events[0].product_idx] += q
        return float(np.dot(prices, agg))

    def _compute_coi_window(self, demand: Dict[str, float]) -> COIWindow:
        """Compute the COI window for the latest round (all zeros before any round)."""
        if not self._last_sessions:
            zeros = np.zeros(self.n, dtype=float)
            return COIWindow(policy=0.0, agent=0.0, leak=0.0, survival_ratio=0.0,
                             policy_by_product=zeros, agent_by_product=zeros, demand_weights=zeros)
        return compute_coi_window(self._last_sessions, self.costs, demand_mapping=demand)

    def _objective(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
        """Robust objective: R(p,d) - lambda * COI_leak. Also refreshes ``_last_coi``."""
        profit = self._revenue_under_demand(prices, demand) - float(np.sum(self.costs))
        self._last_coi = self._compute_coi_window(demand)
        return profit - self.lambda_coi * self._last_coi.leak

    def compute_prices(self, demand: Dict[str, float] | None = None) -> np.ndarray:
        """Compute next prices via heuristic margin adjustment based on the alpha estimate.

        ``demand`` is accepted for interface symmetry but not used by the heuristic.
        The resulting prices are recorded in the limbo history.
        """
        self._alpha_est = self._estimate_alpha_from_sessions()
        margin_scale = 1.0 - 0.5 * self._alpha_est  # defensive pricing under high contamination
        margins = (self.refs - self.costs) * margin_scale
        noise = self.rng.normal(0, 0.02, self.n) * self.costs
        prices = np.clip(self.costs + margins + noise, self.costs * 1.02, self.refs * 1.3)
        self.limbo.add_update("prices", prices)
        return prices

    def observe_demand(self, prices: np.ndarray, alpha_true: float = 0.2, n_sessions: int = 50) -> Dict[str, float]:
        """Simulate one market round at ``prices`` and record sessions and demand."""
        sessions, demand_map = put_prices_to_market(prices, costs=self.costs, alpha=alpha_true,
                                                    n_sessions=n_sessions, seed=int(self.rng.integers(0, 10000)))
        self._last_sessions = sessions
        self._sessions.extend(sessions)
        self.limbo.add_update("demand", demand_map)
        return demand_map

    def step(self, alpha_true: float = 0.2, n_sessions: int = 50) -> Tuple[np.ndarray, Dict[str, float], float, COIWindow]:
        """Run one price -> demand -> reward iteration.

        Returns:
            ``(prices, demand, reward, coi_window)``.
        """
        demand_hist = self.limbo.get_demand_history()
        prices = self.compute_prices(demand_hist[-1] if demand_hist else None)
        demand = self.observe_demand(prices, alpha_true, n_sessions)
        reward = self._objective(prices, demand)
        return prices, demand, reward, self._last_coi or self._compute_coi_window(demand)

    def run(self, n_steps: int = 100, alpha_true: float = 0.2) -> Dict:
        """Run ``n_steps`` iterations and return the collected trajectory as a dict of lists."""
        traj = {"prices": [], "demand": [], "rewards": [], "alpha_est": [], "alpha_true": alpha_true,
                "coi_policy": [], "coi_agent": [], "coi_leak": [], "coi_survival": []}
        for _ in range(n_steps):
            p, d, r, coi = self.step(alpha_true)
            traj["prices"].append(p)
            traj["demand"].append(d)
            traj["rewards"].append(r)
            traj["alpha_est"].append(self._alpha_est)
            traj["coi_policy"].append(coi.policy)
            traj["coi_agent"].append(coi.agent)
            traj["coi_leak"].append(coi.leak)
            traj["coi_survival"].append(coi.survival_ratio)
        return traj
class PricingEnv(gym.Env if HAS_GYM else object):
    """RL environment for dynamic pricing under agent contamination.

    Platform sets prices p_t, market responds with mixture demand Q(p) = (1-alpha)*D_H + alpha*D_A.
    Agent estimates contamination alpha_hat from behavioral signals.
    Reward balances profit vs COI leakage.

    Action: per-product multipliers in [0.5, 1.5] applied to reference prices.
    Observation (length 3n + 3): normalised prices, normalised aggregated
    demand, [alpha_hat, alpha_true], relative margins, [t / max_steps].
    """
    metadata = {"render_modes": ["human", "ansi"]}

    def __init__(self, cfg: EnvConfig | None = None):
        if not HAS_GYM:
            raise ImportError("gymnasium required")
        self.cfg = cfg or EnvConfig()
        self.n = self.cfg.n_products
        self._sys: System | None = None
        self._t = 0
        self._alpha = self.cfg.alpha_true
        self._last_prices: np.ndarray | None = None
        self._last_demand: Dict[str, float] | None = None
        self._episode_rewards: list[float] = []
        self._demand_agg = np.zeros(self.n)

        self.action_space = spaces.Box(low=0.5, high=1.5, shape=(self.n,), dtype=np.float32)
        obs_dim = self.n + self.n + 1 + 1 + self.n + 1  # prices + demand + alpha_hat + alpha + margins + t
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float32)

    def _build_obs(self) -> np.ndarray:
        """Assemble the observation vector (all zeros before the first reset)."""
        if self._sys is None:
            return np.zeros(self.observation_space.shape[0], dtype=np.float32)
        prices = self._last_prices if self._last_prices is not None else self._sys.refs
        return np.concatenate([
            prices / (self._sys.refs + 1e-6),
            self._demand_agg / (np.sum(self._demand_agg) + 1e-6),
            [self._sys.alpha, self._alpha],
            (prices - self._sys.costs) / (self._sys.costs + 1e-6),
            [self._t / self.cfg.max_steps],
        ]).astype(np.float32)

    def _aggregate_demand(self, demand: Dict[str, float]) -> np.ndarray:
        """Sum session demand per product, crediting each session's first product.

        Session ids restart at ``s0000`` every step, so an id must be resolved
        against the latest batch first; a scan of the full history from the
        front would return the session of the very first step. Ids absent from
        the latest batch fall back to the history, newest entry winning.
        """
        system = self._sys
        by_sid = {s.sid: s for s in system._last_sessions}
        missing = demand.keys() - by_sid.keys()
        if missing:
            for s in system._sessions:  # oldest -> newest, later entries overwrite
                if s.sid in missing:
                    by_sid[s.sid] = s
        agg = np.zeros(self.n)
        for sid, q in demand.items():
            sess = by_sid.get(sid)
            if sess is not None and sess.events:
                agg[sess.events[0].product_idx] += q
        return agg

    def _compute_reward(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
        """Compute the configured reward and refresh ``_demand_agg``.

        Unknown ``reward_mode`` values fall back to plain profit. With
        ``normalize_reward`` the result is divided by the sum of reference prices.
        """
        cfg, system = self.cfg, self._sys
        if system is None:
            return 0.0

        # aggregate demand per product
        self._demand_agg = self._aggregate_demand(demand)

        _, revenue, cost = aggregate_purchases(system._last_sessions, self.n, system.costs)
        profit = revenue - cost

        # Penalise price jumps relative to the previous step.
        vol_penalty = 0.0
        if self._last_prices is not None:
            vol_penalty = cfg.lambda_vol * float(np.mean(np.abs(prices - self._last_prices) / (system.refs + 1e-6)))

        coi = compute_coi_window(system._last_sessions, system.costs, demand_mapping=demand)
        leak = float(coi.leak)

        reward_fns = {
            "revenue": lambda: revenue,
            "profit": lambda: profit,
            "robust": lambda: profit - cfg.lambda_coi * leak - vol_penalty,
            "coi_aware": lambda: profit - cfg.lambda_coi * (1 + 2 * system.alpha) * leak - vol_penalty,
        }
        r = reward_fns.get(cfg.reward_mode, lambda: profit)()
        return float(r / (float(np.sum(system.refs)) + 1e-6)) if cfg.normalize_reward else float(r)

    def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
        """Start a new episode with a fresh ``System``; ``seed`` defaults to ``cfg.seed``."""
        seed = seed if seed is not None else self.cfg.seed
        self._sys = System(n_products=self.n, lambda_coi=self.cfg.lambda_coi, seed=seed)
        self._t, self._alpha = 0, self.cfg.alpha_true
        self._last_prices, self._last_demand = None, None
        self._episode_rewards, self._demand_agg = [], np.zeros(self.n)
        return self._build_obs(), {"alpha_true": self._alpha, "alpha_est": self._sys.alpha,
                                   "costs": self._sys.costs.copy(), "refs": self._sys.refs.copy()}

    def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]:
        """Apply price multipliers, simulate one market round and return the Gym 5-tuple.

        Raises:
            RuntimeError: If called before ``reset()``.
        """
        if self._sys is None:
            raise RuntimeError("call reset() first")

        action = np.clip(action, 0.5, 1.5)
        prices = np.clip(self._sys.refs * action.astype(np.float64), self._sys.costs * 1.01, self._sys.refs * 2.0)
        demand = self._sys.observe_demand(prices, alpha_true=self._alpha, n_sessions=self.cfg.sessions_per_step)
        self._sys.limbo.add_update("prices", prices)
        self._sys._alpha_est = self._sys._estimate_alpha_from_sessions()

        reward = self._compute_reward(prices, demand)
        self._episode_rewards.append(reward)
        self._last_prices, self._last_demand = prices.copy(), demand
        self._t += 1

        # compute info metrics using shared helper
        purchases, revenue, cost = aggregate_purchases(self._sys._last_sessions, self.n, self._sys.costs)
        # Expected agent count from alpha, not the realised number of agent sessions.
        n_agents = int(self._alpha * self.cfg.sessions_per_step)
        coi = compute_coi_window(self._sys._last_sessions, self._sys.costs, demand_mapping=demand)

        info = {
            "alpha_true": self._alpha, "alpha_est": self._sys.alpha,
            "alpha_error": abs(self._alpha - self._sys.alpha),
            "revenue": float(revenue), "profit": float(revenue - cost), "cost": float(cost),
            "n_purchases": int(np.sum(purchases)),
            "avg_margin": float(np.mean((prices - self._sys.costs) / self._sys.costs)),
            "n_sessions": len(demand), "n_agents": n_agents, "price_std": float(np.std(prices)),
            "coi_erosion": coi_erosion(coi.policy, coi.agent),
            "coi_policy": float(coi.policy), "coi_agent": float(coi.agent),
            "coi_leakage": float(coi.leak), "coi_survival": float(coi.survival_ratio),
            "cumulative_reward": sum(self._episode_rewards), "step": self._t,
        }
        return self._build_obs(), reward, self._t >= self.cfg.max_steps, False, info

    def render(self, mode: str = "human") -> str | None:
        """Return a one-line status string; also print it when ``mode == "human"``."""
        if self._sys is None or self._last_prices is None:
            return None
        out = f"t={self._t}/{self.cfg.max_steps} | alpha_true={self._alpha:.3f} alpha_hat={self._sys.alpha:.3f} | " \
              f"prices: {self._last_prices.round(1)} | demand: {self._demand_agg.round(2)} | " \
              f"reward: {self._episode_rewards[-1] if self._episode_rewards else 0:.3f}"
        if mode == "human":
            print(out)
        return out

    def close(self) -> None:
        """No resources to release."""
        pass
def make_env(cfg: EnvConfig | None = None, env_type: str = "standard") -> PricingEnv:
    """Build an environment of the requested flavour.

    ``"sweep"`` gives ``ContaminationSweepEnv``, ``"adversarial"`` gives
    ``AdversarialEnv``; any other value gives the plain ``PricingEnv``.
    """
    if env_type == "sweep":
        env_cls = ContaminationSweepEnv
    elif env_type == "adversarial":
        env_cls = AdversarialEnv
    else:
        env_cls = PricingEnv
    return env_cls(cfg)
done: + break + + print(f"\ntotal reward: {total_reward:.2f}, final alpha_hat: {info['alpha_est']:.3f}") diff --git a/sim/case/thesis_simplified/summarize.py b/sim/case/thesis_simplified/summarize.py new file mode 100644 index 0000000..10406aa --- /dev/null +++ b/sim/case/thesis_simplified/summarize.py @@ -0,0 +1,168 @@ +"""Summarize TensorBoard logs into comparison tables.""" +from __future__ import annotations +import json +import re +from pathlib import Path +from collections import defaultdict +from dataclasses import dataclass +import pandas as pd + +try: + from tensorboard.backend.event_processing.event_accumulator import EventAccumulator + HAS_TB = True +except ImportError: + HAS_TB = False + + +@dataclass +class RunInfo: + algo: str + alpha: float + reward_mode: str + path: Path + + +def parse_run_name(name: str) -> RunInfo | None: + """Extract algo, alpha, reward_mode from run directory name.""" + # patterns: ppo_a0.20_robust, cmp_fixed_a0.20, sac_a0.90_robust + m = re.match(r'(cmp_)?(\w+)_a([\d.]+)_?(\w+)?', name) + if not m: + return None + prefix, algo, alpha, mode = m.groups() + return RunInfo(algo=algo, alpha=float(alpha), reward_mode=mode or 'robust', path=Path()) + + +def load_tb_scalars(log_dir: Path, tags: list[str], reduce: str = 'last') -> dict[str, float]: + """Load scalar values from TensorBoard event files.""" + if not HAS_TB: + return {} + ea = EventAccumulator(str(log_dir)) + ea.Reload() + results = {} + for tag in tags: + if tag in ea.Tags().get('scalars', []): + events = ea.Scalars(tag) + if not events: + continue + vals = [e.value for e in events] + if reduce == 'last': + results[tag] = vals[-1] + elif reduce == 'mean': + results[tag] = sum(vals) / len(vals) + elif reduce == 'max': + results[tag] = max(vals) + elif reduce == 'min': + results[tag] = min(vals) + return results + + +def load_json_results(log_dir: Path) -> dict[str, float]: + """Load metrics from results.json if available.""" + results_file = log_dir / 'results.json' + if 
# results.json keys that do not follow the plain "<metric>_mean" pattern.
_JSON_KEY_ALIASES = {
    'margin': 'avg_margin_mean',
    'erosion': 'coi_erosion_mean',
    'leakage': 'coi_leakage_mean',
    'estimation_error': 'alpha_error_mean',
}


def _tb_tag(metric: str) -> str:
    """Map a short metric name to the TensorBoard tag it is looked up under."""
    if metric in ('revenue', 'profit', 'margin'):
        return f'economics/{metric}'
    if metric in ('erosion', 'leakage'):
        return f'coi/{metric}'
    return f'alpha/{metric}'


def _json_value(jm: dict, metric: str):
    """Return the results.json value for ``metric``, or None when absent."""
    for key in (f'{metric}_mean', _JSON_KEY_ALIASES.get(metric)):
        if key is not None and key in jm:
            return jm[key]
    return None


def build_tables(runs: list[RunInfo], metrics: list[str], reduce: str = 'last') -> dict[str, dict[str, pd.DataFrame]]:
    """Build pivot tables: reward_mode -> metric -> DataFrame[alpha x algo].

    For each run the value from ``results.json`` is preferred; the TensorBoard
    scalar (reduced with ``reduce``) is used only when the JSON has no entry.

    Args:
        runs: Discovered runs; ``path``, ``alpha``, ``algo`` and ``reward_mode``
            are read from each.
        metrics: Short metric names such as ``'revenue'`` or ``'erosion'``.
        reduce: Reduction passed through to ``load_tb_scalars``.

    Returns:
        Nested dict of DataFrames indexed by alpha with one column per algo.
        (alpha, algo) cells with no data are NaN; metrics with no data at all
        are omitted.
    """
    # collect data: {reward_mode: {metric: {(alpha, algo): value}}}
    data = defaultdict(lambda: defaultdict(dict))

    tb_tags = [_tb_tag(m) for m in metrics]
    tag_map = dict(zip(tb_tags, metrics))

    for run in runs:
        jm = load_json_results(run.path)
        tb = load_tb_scalars(run.path, tb_tags, reduce)

        for tag, metric in tag_map.items():
            # Final-eval JSON first. The alias table is what lets margin,
            # erosion and leakage resolve here; without it they fell back to
            # training-time TensorBoard values.
            val = _json_value(jm, metric)
            if val is None:
                val = tb.get(tag)
            if val is not None:
                data[run.reward_mode][metric][(run.alpha, run.algo)] = val

    # convert to DataFrames
    tables = {}
    for mode, metrics_data in data.items():
        tables[mode] = {}
        for metric, vals in metrics_data.items():
            if not vals:
                continue
            alphas = sorted({a for a, _ in vals})
            algos = sorted({al for _, al in vals})
            df = pd.DataFrame(index=alphas, columns=algos, dtype=float)
            for (a, al), v in vals.items():
                df.loc[a, al] = v
            df.index.name = 'alpha'
            tables[mode][metric] = df
    return tables
list[str] | None = None, + reduce: str = 'last', + output: str | None = None) -> dict: + """Generate summary tables from experiment runs.""" + base = Path(base_dir) + metrics = metrics or ['revenue', 'profit', 'margin', 'erosion', 'leakage'] + + runs = discover_runs(base) + if not runs: + print(f"No runs found in {base}") + return {} + + print(f"Found {len(runs)} runs") + tables = build_tables(runs, metrics, reduce) + + lines = [] + for mode, metric_tables in sorted(tables.items()): + lines.append(f"\n# Reward Mode: {mode}\n") + for metric, df in sorted(metric_tables.items()): + lines.append(f"\n## {metric}\n") + lines.append(format_table(df)) + lines.append("") + + report = '\n'.join(lines) + print(report) + + if output: + Path(output).write_text(report) + print(f"\nSaved to {output}") + + return tables + + +if __name__ == '__main__': + import argparse + p = argparse.ArgumentParser() + p.add_argument('--dir', default='sim/case/thesis_simplified/runs') + p.add_argument('--metrics', nargs='+', default=['revenue', 'profit', 'margin', 'erosion', 'leakage']) + p.add_argument('--reduce', default='last', choices=['last', 'mean', 'max', 'min']) + p.add_argument('--output', '-o', help='save markdown to file') + args = p.parse_args() + summarize(args.dir, args.metrics, args.reduce, args.output) diff --git a/sim/case/thesis_simplified/train.py b/sim/case/thesis_simplified/train.py new file mode 100644 index 0000000..a405c44 --- /dev/null +++ b/sim/case/thesis_simplified/train.py @@ -0,0 +1,336 @@ +"""RL training for thesis pricing system with thesis-aligned metrics. + +Trains pricing policies using stable-baselines3 with TensorBoard logging. +Tracks COI erosion, alpha estimation error, and economic KPIs per thesis formulation. 
@dataclass
class EpisodeMetrics:
    """Running totals of per-step ``info`` values over one episode."""

    reward: float = 0.0
    revenue: float = 0.0
    profit: float = 0.0
    coi_erosion: float = 0.0
    coi_leakage: float = 0.0
    alpha_error: float = 0.0
    avg_margin: float = 0.0
    n_agents: int = 0
    steps: int = 0

    def accumulate(self, info: Dict[str, Any]) -> None:
        """Add one step's ``info`` values to the totals; missing keys count as 0."""
        read = info.get
        self.steps += 1
        for field_name in ('reward', 'revenue', 'profit', 'coi_erosion',
                           'coi_leakage', 'avg_margin', 'n_agents'):
            setattr(self, field_name, getattr(self, field_name) + read(field_name, 0))
        # alpha error is derived from the true/estimated pair, not read directly
        self.alpha_error += abs(read('alpha_true', 0) - read('alpha_est', 0))

    def normalized(self) -> Dict[str, float]:
        """Return per-step averages of every total except ``reward``.

        An episode with zero steps divides by 1, so all averages are 0.
        """
        denom = self.steps if self.steps > 1 else 1
        averaged = {}
        for field_name in ('revenue', 'profit', 'coi_erosion', 'coi_leakage',
                           'alpha_error', 'avg_margin', 'n_agents'):
            averaged[field_name] = getattr(self, field_name) / denom
        return averaged
class Policy:
    """Unified policy interface for baselines and trained models.

    ``predict`` returns ``(action, None)`` so a ``Policy`` can be used wherever
    a trained model's ``predict(obs, deterministic=...)`` is called.
    """

    def __init__(self, policy_fn: Callable[[np.ndarray, int], np.ndarray], name: str):
        """Wrap ``policy_fn(obs, n_products) -> action`` under a display ``name``."""
        self._fn, self.name = policy_fn, name

    def predict(self, obs: np.ndarray, deterministic: bool = True) -> tuple[np.ndarray, None]:
        """Return ``(action, None)`` for a single flat observation.

        ``deterministic`` is accepted for call compatibility and ignored.
        """
        # NOTE(review): assumes the observation holds 3 * n + 3 entries for
        # n products - confirm against the env's observation builder.
        n_products = (len(obs) - 3) // 3
        return self._fn(obs, n_products), None

    @staticmethod
    def fixed(margin: float = 0.15) -> "Policy":
        """Constant-markup baseline built on ``fixed_price_policy``."""
        return Policy(lambda obs, n: fixed_price_policy(np.ones(n), margin), f"fixed_{margin:.2f}")

    @staticmethod
    def adaptive(base_margin: float = 0.15) -> "Policy":
        """Observation-dependent baseline built on ``adaptive_policy``."""
        return Policy(lambda obs, n: adaptive_policy(obs, n, base_margin), f"adaptive_{base_margin:.2f}")

    @staticmethod
    def random() -> "Policy":
        """Uniform-random baseline built on ``random_policy``."""
        return Policy(lambda obs, n: random_policy(n), "random")

    @staticmethod
    def myopic(greed: float = 0.3) -> "Policy":
        """Baseline that raises one shared multiplier with mean observed demand.

        The multiplier is ``1 + greed * (1 + mean(obs[n:2n]))`` clipped to
        [0.5, 1.5]; when the observation is too short to hold that slice, a
        demand level of 0.5 is assumed.
        """
        def _fn(obs: np.ndarray, n: int) -> np.ndarray:
            demand_norm = obs[n:2*n] if len(obs) > 2*n else np.ones(n) * 0.5
            multiplier = np.clip(1.0 + greed * (1 + np.mean(demand_norm)), 0.5, 1.5)
            # np.full pins float32: a float64 observation makes `multiplier` a
            # NumPy float64 scalar, which would promote a float32 array to
            # float64 under NumPy >= 2.
            return np.full(n, multiplier, dtype=np.float32)
        return Policy(_fn, f"myopic_{greed:.1f}")
def make_vec_env(cfg: ExperimentConfig, n_envs: int = 1) -> DummyVecEnv:
    """Build a ``DummyVecEnv`` of ``n_envs`` monitored pricing environments.

    Args:
        cfg: Experiment settings copied into each ``EnvConfig``.
        n_envs: Number of environment copies.

    Returns:
        The vectorised environment; sub-env ``i`` is configured with seed
        ``cfg.seed + i``.
    """
    def _factory(rank: int):
        # rank is bound per factory call, so each closure keeps its own value
        def _make():
            # Offset the seed per copy so the copies do not share one seed.
            # NOTE(review): presumably EnvConfig.seed initialises the env RNG -
            # confirm against PricingEnv.
            env_cfg = EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
                                alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode,
                                seed=cfg.seed + rank)
            return Monitor(make_env(env_cfg))
        return _make

    return DummyVecEnv([_factory(rank) for rank in range(n_envs)])
def train(cfg: ExperimentConfig) -> Dict[str, Any]:
    """Train (or, for baselines, roll out) one policy and persist its artefacts.

    Writes ``config.json`` and ``results.json`` under
    ``<cfg.log_dir>/<cfg.experiment_name>``. Learned models additionally get a
    ``best`` checkpoint directory from the evaluation callback and a
    ``final_model`` file.

    Args:
        cfg: Experiment settings. ``cfg.algo`` selects a scripted baseline
            ("fixed", "adaptive", "random", "myopic") or a learner
            ("ppo", "sac", "a2c"); matching is case-insensitive.

    Returns:
        ``{"path": <run directory as str>, "metrics": <evaluate_policy output>}``.

    Raises:
        ImportError: A learner was requested but stable-baselines3 is missing.
        KeyError: ``cfg.algo`` is not one of the names above.
    """
    algo = cfg.algo.lower()
    is_baseline = algo in ("fixed", "adaptive", "random", "myopic")
    if not HAS_SB3 and not is_baseline:
        raise ImportError("stable-baselines3 required: pip install stable-baselines3[extra]")

    log_path = Path(cfg.log_dir) / cfg.experiment_name
    log_path.mkdir(parents=True, exist_ok=True)
    with open(log_path / "config.json", "w") as f:
        json.dump(asdict(cfg), f, indent=2)

    writer = SummaryWriter(log_path) if HAS_TB else None
    train_env = eval_env = None
    try:
        train_env = make_vec_env(cfg, cfg.n_envs)
        eval_env = make_vec_env(cfg, 1)

        if is_baseline:
            policy = {"fixed": Policy.fixed, "adaptive": Policy.adaptive,
                      "random": Policy.random, "myopic": Policy.myopic}[algo]()
            run_baseline(policy, train_env, cfg.total_timesteps, writer)
            final_metrics = evaluate_policy(policy, cfg)
        else:
            common = dict(verbose=1, seed=cfg.seed, tensorboard_log=str(log_path), device="auto")
            # Builders are lazy so only the selected algorithm is constructed.
            builders = {
                "ppo": lambda: PPO("MlpPolicy", train_env, learning_rate=3e-4, n_steps=2048, batch_size=64,
                                   n_epochs=10, gamma=0.99, gae_lambda=0.95, clip_range=0.2, ent_coef=0.01,
                                   **common),
                "sac": lambda: SAC("MlpPolicy", train_env, learning_rate=1e-4, buffer_size=50_000, batch_size=512,
                                   tau=0.02, gamma=0.99, learning_starts=1000, ent_coef="auto_0.1", train_freq=4,
                                   **common),
                "a2c": lambda: A2C("MlpPolicy", train_env, learning_rate=7e-4, n_steps=5, gamma=0.99, **common),
            }
            model = builders[algo]()

            cb = MetricsCallback(writer)
            # EvalCallback counts vectorised steps, each worth n_envs timesteps;
            # divide so cfg.eval_freq stays expressed in environment timesteps.
            eval_every = max(cfg.eval_freq // max(cfg.n_envs, 1), 1)
            eval_cb = EvalCallback(eval_env, best_model_save_path=str(log_path / "best"), log_path=str(log_path),
                                   eval_freq=eval_every, n_eval_episodes=cfg.n_eval_episodes, deterministic=True)
            model.learn(cfg.total_timesteps, callback=[cb, eval_cb], progress_bar=True)
            model.save(log_path / "final_model")
            final_metrics = evaluate_policy(model, cfg)

        if writer:
            log_metrics(writer, final_metrics, 'final', cfg.total_timesteps)
    finally:
        # Release the writer and environments even when training raises.
        if writer:
            writer.close()
        for env in (train_env, eval_env):
            if env is not None:
                env.close()

    with open(log_path / "results.json", "w") as f:
        json.dump(final_metrics, f, indent=2)
    return {"path": str(log_path), "metrics": final_metrics}
def compare_policies(cfg: ExperimentConfig, policies: List[str] | None = None, max_workers: int | None = None) -> Dict[str, Dict]:
    """Run each baseline policy under the same config and save a comparison.

    Args:
        cfg: Base experiment settings; ``algo`` and ``experiment_name`` are
            overridden per policy by the worker.
        policies: Policy names to compare; defaults to the four baselines.
        max_workers: Process-pool size. ``1`` runs sequentially in this
            process; ``None`` lets the executor choose.

    Returns:
        Mapping of policy name to its metrics, in the order of ``policies``.
        The same mapping is written to ``compare_a<alpha>.json`` in
        ``cfg.log_dir``.
    """
    policies = policies or ["fixed", "adaptive", "myopic", "random"]
    cfg_dict = asdict(cfg)

    if max_workers == 1:
        finished = dict(_train_policy((cfg_dict.copy(), p)) for p in policies)
    else:
        with ProcessPoolExecutor(max_workers=max_workers) as pool:
            futures = [pool.submit(_train_policy, (cfg_dict.copy(), p)) for p in policies]
            finished = dict(fut.result() for fut in as_completed(futures))

    # Re-key in request order: as_completed yields in completion order, which
    # would make the JSON file and the printout differ from run to run.
    results = {p: finished[p] for p in policies}

    cmp_path = Path(cfg.log_dir) / f"compare_a{cfg.alpha_true:.2f}.json"
    with open(cmp_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nComparison saved to {cmp_path}")
    for algo, m in results.items():
        print(f"  {algo:12s}: reward={m['reward_mean']:.2f}  coi_erosion={m['coi_erosion_mean']:.4f}  alpha_err={m['alpha_error_mean']:.4f}")
    return results
"adaptive", "random", "myopic"]) + parser.add_argument("--steps", type=int, default=100_000) + parser.add_argument("--alpha", type=float, default=0.2) + parser.add_argument("--reward-mode", default="robust", choices=["revenue", "profit", "robust", "coi_aware"]) + parser.add_argument("--n-products", type=int, default=10) + parser.add_argument("--n-envs", type=int, default=4) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--log-dir", default="sim/case/thesis_simplified/runs") + parser.add_argument("--sweep", action="store_true", help="run contamination sweep") + parser.add_argument("--compare", action="store_true", help="compare all baselines") + parser.add_argument("--workers", type=int, default=None, help="max parallel workers for sweep (None=auto, 1=sequential)") + args = parser.parse_args() + + cfg = ExperimentConfig(algo=args.algo, total_timesteps=args.steps, alpha_true=args.alpha, + reward_mode=args.reward_mode, n_products=args.n_products, + n_envs=args.n_envs, seed=args.seed, log_dir=args.log_dir) + + if args.sweep: + run_sweep(cfg, max_workers=args.workers) + elif args.compare: + compare_policies(cfg, max_workers=args.workers) + else: + result = train(cfg) + print(f"\nTraining complete: {result['path']}") + print(f"Metrics: {json.dumps(result['metrics'], indent=2)}") + + +if __name__ == "__main__": + main() diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py index 33f83f4..8cc0214 100644 --- a/sim/rl/behavior_loader/models.py +++ b/sim/rl/behavior_loader/models.py @@ -226,6 +226,7 @@ if __name__ == "__main__": agent_model = AgentBehaviorModel(agent_dir) agent_mdp = agent_model.build_MDP() + print(agent_mdp) print(f"AGENT... 
Built MDP: {agent_mdp['num_states']} states, " f"{sum(len(t) for t in agent_mdp['transitions'].values())} transitions") if not agent_mdp['states']: @@ -234,6 +235,9 @@ if __name__ == "__main__": human_evt = aggregate_event_transitions(human_mdp) agent_evt = aggregate_event_transitions(agent_mdp) + print(agent_evt) + + common = set(human_evt.keys()) & set(agent_evt.keys()) if not common: diff --git a/sim/rl/engine.py b/sim/rl/engine.py index ab751e3..ec4d871 100644 --- a/sim/rl/engine.py +++ b/sim/rl/engine.py @@ -76,8 +76,7 @@ class WildPricingEngine(BasePricingEngine): def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray: self.step_count += 1 - # extract demand signal (from env observation) as proxy for sales - demand = observation.get('demand', np.zeros(self.c.product_catalogue_size, dtype=np.float32)) + demand = _extract_demand(observation, self.c.product_catalogue_size) return self._update_from_demand(current_prices, demand) def _update_from_demand(self, prices: np.ndarray, sold: np.ndarray) -> np.ndarray: @@ -141,7 +140,7 @@ class SimpleDemandEngine(BasePricingEngine): def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray: self.step_count += 1 - demand = observation.get('demand', np.zeros(self.c.product_catalogue_size, dtype=np.float32)) + demand = _extract_demand(observation, self.c.product_catalogue_size) if self.prev_demand is None: self.prev_demand = demand.copy() return current_prices.copy() @@ -207,7 +206,7 @@ class ThompsonSamplingEngine(BasePricingEngine): lo = current_prices * 0.7 hi = current_prices * 1.3 self.price_grid = np.linspace(lo, hi, self.n_price_levels).T - demand = observation.get('demand', np.zeros(self.c.product_catalogue_size, dtype=np.float32)) + demand = _extract_demand(observation, self.c.product_catalogue_size) # update beliefs based on last action if self.last_actions is not None: for i in range(self.c.product_catalogue_size): @@ -226,3 +225,14 @@ 
def _extract_demand(observation: Dict[str, Any], n: int) -> np.ndarray:
    """Pull the demand vector out of an observation as float32.

    Lookup order: ``observation["elasticity"]["demand"]`` (only when
    ``elasticity`` is a dict), then ``observation["demand"]``. When neither
    holds a non-None value, a zero vector of length ``n`` is returned.
    """
    candidates = []
    nested = observation.get("elasticity")
    if isinstance(nested, dict):
        candidates.append(nested.get("demand"))
    candidates.append(observation.get("demand"))

    for raw in candidates:
        if raw is not None:
            return np.asarray(raw, dtype=np.float32)
    return np.zeros(n, dtype=np.float32)
sim.case.thesis_simplified.separability import estimate_alpha as estimate_session_alpha +from sim.case.thesis_simplified.simplified import Limbo, Session, put_prices_to_market +from sim.rl.thesis_core import aggregate_demand_by_product, aggregate_purchases, constrain_prices + + +@dataclass(frozen=True) +class BusinessLogicConstraints: + product_catalogue_size: int = 100 + max_steps: int = 2000 + sessions_per_step: int = 250 -base_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments" -human_dir, agent_dir = f"{base_dir}/collected_data/", f"{base_dir}/agents/collected_data/" -@dataclass -class BusinessLogicConstraints(): - max_price_adjustment: float = 0.30 system_max_price: float = 500.0 system_min_price: float = 1.0 - product_catalogue_size: int = 100 - episode_length: int = 2000 - sessions_per_step: int = 250 + max_price_adjustment: float = 0.30 + min_margin_pct: float = 0.05 + agent_share: float = 0.2 - agent_recon_multiplier: float = 6.0 - agent_purchase_probability: float = 0.20 + alpha_drift: float = 0.0 + alpha_bounds: tuple[float, float] = (0.0, 0.8) + coi_strength: float = 0.25 - coi_threshold: float = 4.0 - coi_sigmoid_temp: float = 1.25 - base_human_demand: float = 0.08 - base_agent_demand: float = 0.05 - human_price_elasticity: float = -1.2 # assumptions here - agent_price_elasticity: float = -0.6 - w_agent_loss: float = 1.0 w_volatility: float = 5.0 w_estimation_error: float = 0.25 + seed: int = 7 -def _sigmoid(x: np.ndarray) -> np.ndarray: - return 1.0 / (1.0 + np.exp(-x)) - -EVENT_PAGE_MAP = { - "session_start": "/", - "page_view": "/", - "view_item_page": "/products", - "learn_more_about_item": "/products/details", - "add_item_to_cart": "/cart", - "checkout_start": "/checkout", - "purchase_complete": "/checkout", - "session_end": "/checkout/success", -} - -# map real collected event names to canonical simulation states -EVENT_CANONICAL_MAP = { - "page_view": "session_start", - "hover_over_paragraph": "view_item_page", - "hover_over_title": 
"view_item_page", - "view_item_page": "view_item_page", - "learn_more_about_item": "learn_more_about_item", - "add_item_to_cart": "add_item_to_cart", - "checkout_start": "purchase_complete", - "remove_item": "view_item_page", -} - - -def _canonicalize_transitions(raw_trans: Dict[str, Dict[str, float]]) -> Dict[str, Dict[str, float]]: - """Map real event transition names to canonical simulation states.""" - canonical: Dict[str, Dict[str, float]] = {} - for src, dsts in raw_trans.items(): - src_canon = EVENT_CANONICAL_MAP.get(src, src) - if src_canon not in canonical: - canonical[src_canon] = {} - for dst, prob in dsts.items(): - dst_canon = EVENT_CANONICAL_MAP.get(dst, dst) - canonical[src_canon][dst_canon] = canonical[src_canon].get(dst_canon, 0.0) + prob - # re-normalize after aggregation - for src in canonical: - total = sum(canonical[src].values()) - if total > 0: - canonical[src] = {k: v / total for k, v in canonical[src].items()} - return canonical - - -class BehavioralProfile: - """Synthetic Markov profile used to generate interaction sessions. 
- Uses aggregate_event_transitions from models.py to build transition kernels from real data.""" - - def __init__(self, actor: str, purchase_probs: np.ndarray): - self.actor = actor - self.purchase_probs = np.clip(purchase_probs, 0.0, 0.95) - self.states = [ - "session_start", - "view_item_page", - "learn_more_about_item", - "add_item_to_cart", - "purchase_complete", - "session_end", - ] - model = AgentBehaviorModel(agent_dir) if actor == "agents" else BehaviorModel(human_dir) - mdp = model.build_MDP() - raw_trans = aggregate_event_transitions(mdp) if mdp.get("transitions") else {} - self.transitions = _canonicalize_transitions(raw_trans) if raw_trans else self._fallback_transitions() - self._ensure_terminal_states() - self.dwell_params = self._extract_dwell_params(mdp) - - def _ensure_terminal_states(self): - # guarantee purchase_complete leads to session_end and session_start exists - if "purchase_complete" not in self.transitions: - self.transitions["purchase_complete"] = {"session_end": 1.0} - elif "session_end" not in self.transitions.get("purchase_complete", {}): - self.transitions["purchase_complete"]["session_end"] = 1.0 - total = sum(self.transitions["purchase_complete"].values()) - self.transitions["purchase_complete"] = {k: v/total for k, v in self.transitions["purchase_complete"].items()} - if "session_start" not in self.transitions: - self.transitions["session_start"] = {"view_item_page": 0.7, "learn_more_about_item": 0.2, "session_end": 0.1} - - def _fallback_transitions(self) -> Dict[str, Dict[str, float]]: - return { - "session_start": {"view_item_page": 0.85, "session_end": 0.15}, - "view_item_page": {"learn_more_about_item": 0.4, "add_item_to_cart": 0.3, "view_item_page": 0.2, "session_end": 0.1}, - "learn_more_about_item": {"add_item_to_cart": 0.5, "view_item_page": 0.3, "session_end": 0.2}, - "add_item_to_cart": {"purchase_complete": 0.6, "view_item_page": 0.25, "session_end": 0.15}, - "purchase_complete": {"session_end": 1.0}, - } - - def 
_extract_dwell_params(self, mdp: Dict) -> Dict[str, Tuple[float, float]]: - state_vals = mdp.get("state_values", {}) - params = {} - for state in self.states: - # try canonical and raw state names - val = state_vals.get(state, 0.5) - for raw, canon in EVENT_CANONICAL_MAP.items(): - if canon == state and raw in state_vals: - val = state_vals[raw] - break - shape = 1.5 + val * 2.0 - scale = 0.8 + (1.0 - val) * 1.2 - params[state] = (shape, scale) - return params - - def _transition_probs(self, state: str, product_idx: int) -> Dict[str, float]: - probs = dict(self.transitions.get(state, {"session_end": 1.0})) - if state == "add_item_to_cart": - base = probs.get("purchase_complete", 0.0) - demand_factor = float(self.purchase_probs[int(product_idx)]) - if self.actor == "agents": - demand_factor *= 0.7 - adjusted = np.clip(base * 0.5 + demand_factor * 0.5, 0.0, 0.95) - remainder = max(1e-6, 1.0 - adjusted) - other_total = sum(v for k, v in probs.items() if k != "purchase_complete") - scale = remainder / max(other_total, 1e-6) - for key in probs: - if key == "purchase_complete": - probs[key] = adjusted - else: - probs[key] = probs[key] * scale - total = sum(probs.values()) - if total <= 0: - return {"session_end": 1.0} - return {state: val / total for state, val in probs.items()} - - def sample_session( - self, - rng: np.random.Generator, - session_id: str, - prices: np.ndarray, - unit_cost: np.ndarray, - ) -> Tuple[List[Dict[str, Any]], List[SimpleNamespace]]: - """Generate a single session trajectory respecting business constraints.""" - events: List[Dict[str, Any]] = [] - feature_events: List[SimpleNamespace] = [] - state = "session_start" - t = 0.0 - product_idx = int(rng.integers(0, len(prices))) - product_id = f"product-{product_idx:04d}" - - - # enforce price >= cost constraint (lipschitz bound on pricing) - # This is a sort of last resort to not let an pricing learner go rogue - cost = float(unit_cost[product_idx]) - constrained_price = 
max(float(prices[product_idx]), cost * 1.05) # 5% min margin - - while state != "session_end" and len(events) < 40: - if state != "session_start": - row = { - "session_id": session_id, - "actor": "agent" if self.actor == "agents" else "human", - "eventName": state, - "product_idx": product_idx, - "productId": product_id, - "price_offered": constrained_price, - "price_paid": 0.0, - "page": EVENT_PAGE_MAP.get(state, "/"), - "ts": t, - "unit_cost": cost, - "base_price": float(prices[product_idx]), - } - if state == "purchase_complete": - noise = float(rng.normal(0.0, 0.015)) - row["price_paid"] = max(constrained_price * (1.0 + noise), cost) - events.append(row) - feature_events.append( - SimpleNamespace( - eventName=row["eventName"], - page=row["page"], - productId=row["productId"], - ts=row["ts"], - ) - ) - - transitions = self._transition_probs(state, product_idx) - next_state = rng.choice(list(transitions.keys()), p=list(transitions.values())) - shape, scale = self.dwell_params.get(state, (2.0, 1.0)) - dwell = max(0.3, rng.gamma(shape=shape, scale=scale)) - t += dwell - state = next_state - - return events, feature_events - - -def _load_behavioral_profile(actor: str, demand_forcing: np.ndarray) -> BehavioralProfile: - """returns a behavioral profile for generating synthetic sessions - actor: 'humans' or 'agents' - demand_forcing: per-product purchase probabilities used to weight interactions - """ - return BehavioralProfile(actor, demand_forcing) - - -class CommercePlatform: - """state management for the environment, simulates demand""" - def __init__(self, product_catalogue_size: int, max_price: float, min_price: float, constraints: BusinessLogicConstraints): - self.product_catalogue_size = product_catalogue_size - self.max_price = max_price - self.min_price = min_price - self.constraints = constraints - self.simulation_history: List[Dict[str, Any]] = [] - self._rng = np.random.default_rng(constraints.seed) - self._last_interaction_df: pd.DataFrame = 
pd.DataFrame() - self.unit_cost = np.random.uniform(low=15.0, high=60.0, size=(self.product_catalogue_size,)).astype(np.float32) - self.base_price = np.random.uniform(low=60.0, high=140.0, size=(self.product_catalogue_size,)).astype(np.float32) - self.alpha_hat = constraints.agent_share - try: - self.separability_artifacts = load_artifacts() - except FileNotFoundError: - self.separability_artifacts = None - - def setup_true_demand(self, prices: np.ndarray) -> Dict[str, np.ndarray]: - p = np.clip(prices, self.min_price, self.max_price) - cost = np.clip(self.unit_cost, self.min_price * 0.2, self.max_price) - margin = np.clip((p - cost) / np.maximum(cost, 1e-3), -0.9, 2.0) - # isoelastic demand approximation - human_prob = self.constraints.base_human_demand * np.exp(self.constraints.human_price_elasticity * margin) - agent_prob = self.constraints.base_agent_demand * np.exp(self.constraints.agent_price_elasticity * margin) - return { - "human_purchase_prob": np.clip(human_prob, 0.0, 0.95), - "agent_purchase_prob": np.clip(agent_prob, 0.0, 0.95), - } - - def _simulate_sessions(self, prices: np.ndarray) -> Tuple[pd.DataFrame, Dict[str, Any]]: - demand = self.setup_true_demand(prices) - T = self.constraints.sessions_per_step - effective_share = float(np.clip(self.alpha_hat, 0.0, 0.95)) - n_agent_sessions = max(1, int(round(T * effective_share))) - n_human_sessions = max(1, T - n_agent_sessions) - - session_map = { - "humans": n_human_sessions, - "agents": n_agent_sessions, - } - pprob_map = { - "humans": demand["human_purchase_prob"], - "agents": demand["agent_purchase_prob"], - } - - rows: List[Dict[str, Any]] = [] - session_scores: List[Dict[str, float]] = [] - demand_human = np.zeros_like(prices, dtype=np.float32) - demand_agent = np.zeros_like(prices, dtype=np.float32) - - for actor, n_sessions in session_map.items(): - profile = _load_behavioral_profile(actor, pprob_map[actor]) - for idx in range(n_sessions): - session_id = f"{actor}_{idx:06d}" - session_rows, 
feature_events = profile.sample_session( - self._rng, session_id, prices, self.unit_cost - ) - rows.extend(session_rows) - if session_rows: - df_session = pd.DataFrame(session_rows) - purchases = df_session[df_session["eventName"] == "purchase_complete"] - if not purchases.empty: - counts = purchases.groupby("product_idx").size() - if actor == "agents": - demand_agent[counts.index.to_numpy(dtype=int)] += counts.to_numpy(dtype=np.float32) - else: - demand_human[counts.index.to_numpy(dtype=int)] += counts.to_numpy(dtype=np.float32) - if self.separability_artifacts and feature_events: - score = score_session(feature_events, self.separability_artifacts) - session_scores.append(score) - - interactions_df = pd.DataFrame(rows) - diagnostics = { - "alpha_hat": float(self.alpha_hat), - "session_scores": session_scores, - "demand_human": demand_human, - "demand_agent": demand_agent, - } - - if session_scores: - alphas = [ - estimate_alpha(s["prob_agent"], s["delta_h"], s["delta_a"], temperature=2.0) - for s in session_scores - ] - mean_alpha = float(np.mean(alphas)) - # exponential moving average for stability - self.alpha_hat = 0.7 * self.alpha_hat + 0.3 * mean_alpha - diagnostics.update( - { - "alpha_hat": float(self.alpha_hat), - "delta_h_mean": float(np.mean([s["delta_h"] for s in session_scores])), - "delta_a_mean": float(np.mean([s["delta_a"] for s in session_scores])), - "prob_agent_mean": float(np.mean([s["prob_agent"] for s in session_scores])), - } - ) - - self._last_interaction_df = interactions_df - return interactions_df, diagnostics - - def compute_interaction_features(self, interaction_df: pd.DataFrame) -> Dict[str, float]: - if interaction_df.empty: - return { - "revenue_observed": 0.0, - "revenue_oracle": 0.0, - "agent_loss": 0.0, - "true_human_purchases": 0.0, - "true_agent_purchases": 0.0, - "mean_sale_price": 0.0, - "look_to_book": 0.0, - "coi": 0.0, - "expected_premium": 0.0, - } - - purchases = interaction_df[interaction_df["eventName"] == 
"purchase_complete"] - human_purchases = purchases[purchases["actor"] == "human"] - agent_purchases = purchases[purchases["actor"] == "agent"] - - revenue_observed = float(purchases["price_paid"].sum()) - revenue_oracle = float(purchases["base_price"].sum()) - agent_loss = float((agent_purchases["base_price"] - agent_purchases["price_paid"]).sum()) - - mean_sale_price = float(purchases["price_paid"].mean()) if not purchases.empty else 0.0 - views = float((interaction_df["eventName"] == "view_item_page").sum()) - look_to_book = float(views / (len(purchases) + 1e-6)) - true_human = float(len(human_purchases)) - true_agent = float(len(agent_purchases)) - - human_prices = human_purchases["price_offered"] if not human_purchases.empty else pd.Series(dtype=float) - human_costs = human_purchases["unit_cost"] if not human_purchases.empty else pd.Series(dtype=float) - human_base = human_purchases["base_price"] if not human_purchases.empty else pd.Series(dtype=float) - coi = 0.0 - if not human_prices.empty and not human_costs.empty: - # COI = E[P] - p_min where p_min is cost, accounting for expected premium (base - realized) - margin = human_prices.mean() - human_costs.mean() - expected_premium = human_base.mean() - human_prices.mean() if not human_base.empty else 0.0 - coi = float(np.maximum(0.0, margin - expected_premium * 0.5)) - - return { - "revenue_observed": revenue_observed, - "revenue_oracle": revenue_oracle, - "agent_loss": agent_loss, - "true_human_purchases": true_human, - "true_agent_purchases": true_agent, - "mean_sale_price": mean_sale_price, - "look_to_book": look_to_book, - "coi": coi, - "expected_premium": float(expected_premium) if not human_base.empty else 0.0, - } - - def _session_feature_table(self, df: pd.DataFrame) -> pd.DataFrame: - """Extract per-session behavioral features for separability analysis.""" - if df.empty: - return pd.DataFrame() - g = df.groupby("session_id", sort=False) - session_duration = g["ts"].max() - g["ts"].min() - 
total_interactions = g.size() - avg_time_between = g["ts"].apply(lambda x: float(np.diff(np.sort(x.to_numpy())).mean()) if len(x) > 1 else 0.0) - interaction_velocity = total_interactions / (session_duration + 1e-6) - views = g.apply(lambda x: int((x["eventName"] == "view_item_page").sum()), include_groups=False) - cart_adds = g.apply(lambda x: int((x["eventName"] == "add_item_to_cart").sum()), include_groups=False) - purchases = g.apply(lambda x: int((x["eventName"] == "purchase_complete").sum()), include_groups=False) - learn_more = g.apply(lambda x: int((x["eventName"] == "learn_more_about_item").sum()), include_groups=False) - conversion_rate = purchases / (views + 1e-6) - is_agent = g["actor"].apply(lambda s: bool((s == "agent").any()), include_groups=False) - # price sensitivity features - price_variance = g["price_offered"].var().fillna(0.0) - avg_price_seen = g["price_offered"].mean().fillna(0.0) - products_viewed = g["product_idx"].nunique() - - return pd.DataFrame({ - "session_duration_sec": session_duration.astype(float), - "avg_time_between_events": avg_time_between.astype(float), - "total_interactions": total_interactions.astype(int), - "interaction_velocity": interaction_velocity.astype(float), - "item_views": views.astype(int), - "cart_adds": cart_adds.astype(int), - "purchases": purchases.astype(int), - "learn_more_clicks": learn_more.astype(int), - "conversion_rate": conversion_rate.astype(float), - "price_variance": price_variance.astype(float), - "avg_price_seen": avg_price_seen.astype(float), - "products_viewed": products_viewed.astype(int), - "is_agent": is_agent.astype(bool), - }).reset_index() - - def get_interaction_data(self) -> np.ndarray: - if self._last_interaction_df.empty: - return np.array([], dtype=object) - return self._last_interaction_df.to_dict(orient="records") +def make_env(constraints: Optional[BusinessLogicConstraints] = None) -> "PHANTOMEnv": + return PHANTOMEnv(constraints=constraints or BusinessLogicConstraints()) class 
PHANTOMEnv(gym.Env): - metadata = {"render_modes": []} + metadata = {"render_modes": ["human", "ansi"]} - def __init__(self, constraints: Optional[BusinessLogicConstraints] = None, use_jax: bool = True): + def __init__(self, constraints: Optional[BusinessLogicConstraints] = None): super().__init__() - self.constraints = constraints if isinstance(constraints, BusinessLogicConstraints) else BusinessLogicConstraints() - self.use_jax = use_jax and JAX_AVAILABLE - self.action_space = spaces.Box(low=-self.constraints.max_price_adjustment, - high=self.constraints.max_price_adjustment, - shape=(self.constraints.product_catalogue_size,), dtype=np.float32) - n_products = self.constraints.product_catalogue_size - self.observation_space = spaces.Dict({ - "elasticity": spaces.Dict({ - "price": spaces.Box( - low=np.full((n_products,), self.constraints.system_min_price, dtype=np.float32), - high=np.full((n_products,), self.constraints.system_max_price, dtype=np.float32), - dtype=np.float32), - "demand": spaces.Box( - low=np.zeros((n_products,), dtype=np.float32), - high=np.full((n_products,), 1e6, dtype=np.float32), - dtype=np.float32), - }), - "market": spaces.Dict({ - "alpha_hat": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32), - "revenue_rate": spaces.Box(low=0.0, high=1e6, shape=(1,), dtype=np.float32), - "conversion_rate": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32), - "price_volatility": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32), - }), - "cost": spaces.Box(low=0.0, high=self.constraints.system_max_price, shape=(n_products,), dtype=np.float32), - }) - self.commerce_platform = CommercePlatform( - product_catalogue_size=self.constraints.product_catalogue_size, - max_price=self.constraints.system_max_price, - min_price=self.constraints.system_min_price, - constraints=self.constraints) - self._rng = np.random.default_rng(self.constraints.seed) - self.t = 0 - self._prev_prices: Optional[np.ndarray] = None - self.state: Dict[str, Any] 
= {} - self._jax_key = None - self._jax_trans = None - if self.use_jax: - self._jax_key = jax.random.PRNGKey(self.constraints.seed) - self._init_jax_transitions() + self.c = constraints or BusinessLogicConstraints() + self.n = int(self.c.product_catalogue_size) - def _init_jax_transitions(self): - try: - human_profile = _load_behavioral_profile("humans", np.ones(self.constraints.product_catalogue_size) * 0.1) - agent_profile = _load_behavioral_profile("agents", np.ones(self.constraints.product_catalogue_size) * 0.1) - self._jax_trans = compile_transitions(human_profile, agent_profile).to_jax() - except Exception: - self._jax_trans = fallback_transitions().to_jax() + self._rng = np.random.default_rng(self.c.seed) + self._t = 0 + self._alpha_true = float(self.c.agent_share) + self._alpha_hat = float(self.c.agent_share) + self._costs = np.zeros(self.n, dtype=np.float32) + self._refs = np.zeros(self.n, dtype=np.float32) + self._prices: Optional[np.ndarray] = None + self._last_sessions: list[Session] = [] + self._last_coi: COIWindow | None = None + self._limbo = Limbo() + + self.action_space = spaces.Box( + low=np.full((self.n,), self.c.system_min_price, dtype=np.float32), + high=np.full((self.n,), self.c.system_max_price, dtype=np.float32), + dtype=np.float32, + ) + self.observation_space = spaces.Dict( + { + "elasticity": spaces.Dict( + { + "price": spaces.Box( + low=np.full((self.n,), self.c.system_min_price, dtype=np.float32), + high=np.full((self.n,), self.c.system_max_price, dtype=np.float32), + dtype=np.float32, + ), + "demand": spaces.Box( + low=np.zeros((self.n,), dtype=np.float32), + high=np.full((self.n,), 1e9, dtype=np.float32), + dtype=np.float32, + ), + } + ), + "market": spaces.Dict( + { + "alpha_hat": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32), + "revenue_rate": spaces.Box(low=0.0, high=1e12, shape=(1,), dtype=np.float32), + "conversion_rate": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32), + "price_volatility": 
spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32), + } + ), + "cost": spaces.Box( + low=np.zeros((self.n,), dtype=np.float32), + high=np.full((self.n,), self.c.system_max_price, dtype=np.float32), + dtype=np.float32, + ), + } + ) + + def _reset_catalogue(self) -> None: + self._costs = self._rng.uniform(15.0, 60.0, size=self.n).astype(np.float32) + margins = self._rng.uniform(0.2, 0.6, size=self.n).astype(np.float32) + self._refs = (self._costs * (1.0 + margins)).astype(np.float32) + self._prices = self._refs.copy() + + def _observe_market( + self, prices: np.ndarray + ) -> tuple[list[Session], Dict[str, float], np.ndarray, np.ndarray, float, float, int]: + sessions, demand_map = put_prices_to_market( + prices, + costs=self._costs, + alpha=self._alpha_true, + n_sessions=int(self.c.sessions_per_step), + seed=int(self._rng.integers(0, 2**31 - 1)), + ) + demand_by_product = aggregate_demand_by_product(sessions, demand_map, self.n) + purchases, revenue, cost, n_agents = aggregate_purchases(sessions, self._costs, self.n) + conversion = float(np.sum(purchases) / max(len(sessions), 1)) + return sessions, demand_map, demand_by_product, purchases, revenue, cost, n_agents + + def _update_alpha_hat(self, sessions: list[Session]) -> float: + scores = [estimate_session_alpha(s) for s in sessions if s.events] + if not scores: + return self._alpha_hat + alpha_step = float(np.mean(scores)) + self._alpha_hat = 0.8 * self._alpha_hat + 0.2 * alpha_step + self._alpha_hat = float(np.clip(self._alpha_hat, 0.0, 1.0)) + return self._alpha_hat + + def _reward(self, prices: np.ndarray, revenue: float, cost: float, volatility: float) -> float: + profit = float(revenue - cost) + coi_leak = float(self._last_coi.leak) if self._last_coi else 0.0 + alpha_err = abs(self._alpha_hat - self._alpha_true) + return profit - self.c.coi_strength * coi_leak - self.c.w_volatility * volatility - self.c.w_estimation_error * alpha_err + + def _build_obs( + self, + prices: np.ndarray, + 
demand_by_product: np.ndarray, + revenue: float, + conversion: float, + volatility: float, + ) -> Dict[str, Any]: + return { + "elasticity": {"price": prices.astype(np.float32), "demand": demand_by_product.astype(np.float32)}, + "market": { + "alpha_hat": np.array([self._alpha_hat], dtype=np.float32), + "revenue_rate": np.array([revenue], dtype=np.float32), + "conversion_rate": np.array([conversion], dtype=np.float32), + "price_volatility": np.array([volatility], dtype=np.float32), + }, + "cost": self._costs.astype(np.float32), + } def reset(self, seed: Optional[int] = None, options: Optional[dict] = None): super().reset(seed=seed) if seed is not None: self._rng = np.random.default_rng(seed) - self.commerce_platform._rng = np.random.default_rng(seed) - if self.use_jax: - self._jax_key = jax.random.PRNGKey(seed) - self.commerce_platform.alpha_hat = self.constraints.agent_share - self.t = 0 - init_prices = self._rng.uniform( - low=60.0, - high=140.0, - size=(self.constraints.product_catalogue_size,), - ).astype(np.float32) - self.commerce_platform.unit_cost = self._rng.uniform( - low=15.0, - high=60.0, - size=(self.constraints.product_catalogue_size,), - ).astype(np.float32) - self.commerce_platform.base_price = init_prices.copy() - self._prev_prices = init_prices.copy() - self.state = { - "elasticity": { - "price": init_prices, - "demand": np.zeros((self.constraints.product_catalogue_size,), dtype=np.float32), - }, - "market": { - "alpha_hat": np.array([self.constraints.agent_share], dtype=np.float32), - "revenue_rate": np.array([0.0], dtype=np.float32), - "conversion_rate": np.array([0.0], dtype=np.float32), - "price_volatility": np.array([0.0], dtype=np.float32), - }, - "cost": self.commerce_platform.unit_cost.astype(np.float32), - } - return self.state, {} + self._t = 0 + self._alpha_true = float(np.clip(self.c.agent_share, *self.c.alpha_bounds)) + self._alpha_hat = float(self.c.agent_share) + self._reset_catalogue() + self._limbo = Limbo() + self._last_sessions 
= [] + self._last_coi = None - def _step_jax(self, new_prices: np.ndarray) -> Tuple[Dict, Dict]: - self._jax_key, subkey = jax.random.split(self._jax_key) - alpha = float(np.clip(self.commerce_platform.alpha_hat, 0.0, 0.95)) - n_agent = max(1, int(self.constraints.sessions_per_step * alpha)) - n_human = max(1, self.constraints.sessions_per_step - n_agent) - batch = sample_sessions(subkey, self._jax_trans, n_human, n_agent, len(new_prices)) - sim = compute_metrics(batch, new_prices, self.commerce_platform.unit_cost, self.commerce_platform.base_price) - result = {"revenue_observed": sim.revenue, "revenue_oracle": sim.revenue_oracle, - "agent_loss": sim.agent_loss, "coi": sim.coi, "look_to_book": sim.look_to_book, - "mean_sale_price": sim.mean_sale_price, "true_human_purchases": sim.n_human_purchases, - "true_agent_purchases": sim.n_agent_purchases} - diagnostics = {"demand_human": sim.demand_human, "demand_agent": sim.demand_agent, "alpha_hat": alpha} - return result, diagnostics + prices = self._prices if self._prices is not None else np.zeros(self.n, dtype=np.float32) + obs = self._build_obs(prices, np.zeros(self.n, dtype=np.float32), 0.0, 0.0, 0.0) + return obs, {"alpha_true": self._alpha_true} - def step(self, action: np.ndarray): - self.t += 1 - base_prices = self.state["elasticity"]["price"].astype(np.float32) - new_prices = np.clip(base_prices * (1.0 + action.astype(np.float32)), - self.constraints.system_min_price, - self.constraints.system_max_price).astype(np.float32) + def step(self, action: np.ndarray) -> Tuple[Dict[str, Any], float, bool, bool, Dict[str, Any]]: + if self._prices is None: + raise RuntimeError("reset() must be called before step()") - self.state["elasticity"]["price"] = new_prices - if self.use_jax: - result, diagnostics = self._step_jax(new_prices) - else: - interactions_df, diagnostics = self.commerce_platform._simulate_sessions(new_prices) - result = self.commerce_platform.compute_interaction_features(interactions_df) - COI = 
float(result.get("coi", 0.0)) - - demand_vector = diagnostics.get("demand_human", np.zeros_like(new_prices)) + diagnostics.get( - "demand_agent", np.zeros_like(new_prices) + prev = self._prices + prices = constrain_prices( + prev, + np.asarray(action, dtype=np.float32), + costs=self._costs, + min_price=float(self.c.system_min_price), + max_price=float(self.c.system_max_price), + max_adjustment=float(self.c.max_price_adjustment), + min_margin_pct=float(self.c.min_margin_pct), ) - self.state["elasticity"]["demand"] = demand_vector.astype(np.float32) + self._prices = prices + self._limbo.add_update("prices", prices) - volatility = 0.0 if self._prev_prices is None else \ - float(np.mean(np.abs((new_prices - self._prev_prices) / (self._prev_prices + 1e-6)))) - self._prev_prices = new_prices.copy() + sessions, demand_map, demand_by_product, purchases, revenue, cost, n_agents = self._observe_market(prices) + self._last_sessions = sessions + self._limbo.add_update("demand", demand_map) - # update market observation features - total_demand = float(np.sum(demand_vector)) - total_purchases = float(result.get("true_human_purchases", 0.0) + result.get("true_agent_purchases", 0.0)) - conv_rate = total_purchases / max(total_demand, 1.0) - self.state["market"] = { - "alpha_hat": np.array([float(diagnostics.get("alpha_hat", self.commerce_platform.alpha_hat))], dtype=np.float32), - "revenue_rate": np.array([float(result.get("revenue_observed", 0.0))], dtype=np.float32), - "conversion_rate": np.array([float(np.clip(conv_rate, 0.0, 1.0))], dtype=np.float32), - "price_volatility": np.array([float(volatility)], dtype=np.float32), - } - self.state["cost"] = self.commerce_platform.unit_cost.astype(np.float32) + self._update_alpha_hat(self._last_sessions) + self._last_coi = compute_coi_window(self._last_sessions, self._costs, demand_mapping=demand_map) - # extract metrics with safe defaults for incomplete simulation - revenue_observed = float(result.get("revenue_observed", 0.0)) - 
agent_loss = float(result.get("agent_loss", 0.0)) + self._alpha_true = float(np.clip(self._alpha_true + self.c.alpha_drift, *self.c.alpha_bounds)) + volatility = float(np.std((prices - prev) / (prev + 1e-6))) + reward = float(self._reward(prices, revenue, cost, volatility)) + conversion = float(np.sum(purchases) / max(len(self._last_sessions), 1)) - reward = (revenue_observed - - COI - - self.constraints.w_agent_loss * agent_loss - - self.constraints.w_volatility * volatility - - self.constraints.w_estimation_error) + self._t += 1 + terminated = self._t >= int(self.c.max_steps) - terminated = self.t >= self.constraints.episode_length + obs = self._build_obs(prices, demand_by_product, revenue, conversion, min(volatility, 1.0)) info = { - "t": self.t, - "revenue_observed": revenue_observed, - "revenue_oracle": float(result.get("revenue_oracle", revenue_observed)), - "agent_loss": agent_loss, - "ux_volatility": volatility, - "look_to_book": float(result.get("look_to_book", 0.0)), - "mean_sale_price": float(result.get("mean_sale_price", 0.0)), - "true_human_purchases_total": float(result.get("true_human_purchases", 0.0)), - "true_agent_purchases_total": float(result.get("true_agent_purchases", 0.0)), - "coi": COI, - "alpha_hat": diagnostics.get("alpha_hat", self.commerce_platform.alpha_hat), - "mean_human_demand": float(np.mean(diagnostics.get("demand_human", np.zeros_like(new_prices)))), - "mean_agent_demand": float(np.mean(diagnostics.get("demand_agent", np.zeros_like(new_prices)))), + "step": self._t, + "reward": reward, + "revenue": float(revenue), + "profit": float(revenue - cost), + "n_sessions": int(self.c.sessions_per_step), + "n_agents": int(n_agents), + "alpha_true": float(self._alpha_true), + "alpha_hat": float(self._alpha_hat), + "alpha_error": float(abs(self._alpha_hat - self._alpha_true)), + "price_std": float(np.std(prices)), + "price_volatility": float(volatility), } - if "delta_h_mean" in diagnostics: + if self._last_coi is not None: info.update( { - 
"delta_h_mean": diagnostics["delta_h_mean"], - "delta_a_mean": diagnostics["delta_a_mean"], - "prob_agent_mean": diagnostics["prob_agent_mean"], + "coi_policy": float(self._last_coi.policy), + "coi_agent": float(self._last_coi.agent), + "coi_leakage": float(self._last_coi.leak), + "coi_survival": float(self._last_coi.survival_ratio), + "coi_erosion": float(coi_erosion(self._last_coi.policy, self._last_coi.agent)), } ) - return self.state, float(reward), terminated, False, info + return obs, reward, terminated, False, info + def render(self, mode: str = "human") -> str | None: + if self._prices is None: + return None + out = ( + f"t={self._t}/{self.c.max_steps} " + f"alpha_true={self._alpha_true:.3f} alpha_hat={self._alpha_hat:.3f} " + f"price_std={float(np.std(self._prices)):.2f}" + ) + if mode == "human": + print(out) + return out -if __name__ == "__main__": - import matplotlib.pyplot as plt - from collections import defaultdict - - env = PHANTOMEnv(constraints=BusinessLogicConstraints()) - obs, _ = env.reset(seed=42) - metrics = defaultdict(list) - total_reward = 0.0 - done = False - - while not done: - action = env.action_space.sample() - obs, reward, done, _, info = env.step(action) - total_reward += reward - p_mean = float(np.mean(obs["elasticity"]["price"])) - q_mean = float(np.mean(obs["elasticity"]["demand"])) - p_std = float(np.std(obs["elasticity"]["price"])) - - metrics['t'].append(info['t']) - metrics['price_mean'].append(p_mean) - metrics['price_std'].append(p_std) - metrics['demand_mean'].append(q_mean) - metrics['revenue_observed'].append(info['revenue_observed']) - metrics['revenue_oracle'].append(info['revenue_oracle']) - metrics['agent_loss'].append(info['agent_loss']) - metrics['ux_volatility'].append(info['ux_volatility']) - metrics['look_to_book'].append(info['look_to_book']) - metrics['reward'].append(reward) - metrics['human_purchases'].append(info['true_human_purchases_total']) - 
metrics['agent_purchases'].append(info['true_agent_purchases_total']) - metrics['coi'].append(info.get('coi', 0.0)) - metrics['alpha_hat'].append(info.get('alpha_hat', env.commerce_platform.alpha_hat)) - metrics['mean_human_demand'].append(info.get('mean_human_demand', 0.0)) - metrics['mean_agent_demand'].append(info.get('mean_agent_demand', 0.0)) - metrics['delta_h_mean'].append(info.get('delta_h_mean', 0.0)) - metrics['delta_a_mean'].append(info.get('delta_a_mean', 0.0)) - metrics['prob_agent_mean'].append(info.get('prob_agent_mean', 0.0)) - - if info['t'] % 20 == 0 or done: - print(f"t={info['t']:03d} p={p_mean:6.2f}±{p_std:4.2f} q={q_mean:6.2f} " - f"rev={info['revenue_observed']:7.2f} oracle={info['revenue_oracle']:7.2f} " - f"loss={info['agent_loss']:6.2f} ux={info['ux_volatility']:.3f} " - f"coi={info.get('coi', 0.0):6.2f} alpha={info.get('alpha_hat', 0.0):4.2f} " - f"ltb={info['look_to_book']:5.2f} r={reward:7.2f}") - - print(f"total_reward={total_reward:.2f}") - - fig, axes = plt.subplots(3, 4, figsize=(18, 12)) - fig.suptitle('PHANTOM Environment Run', fontsize=14, fontweight='bold') - - plot_configs = [ - ('price_mean', 'Mean Price', 'Price'), - ('demand_mean', 'Mean Demand (All)', 'Demand'), - ('mean_human_demand', 'Mean Human Demand', 'Count'), - ('mean_agent_demand', 'Mean Agent Demand', 'Count'), - ('revenue_observed', 'Revenue (Observed)', 'Revenue'), - ('agent_loss', 'Agent Loss (Oracle - Observed)', 'Loss'), - ('coi', 'Cost of Information', 'COI'), - ('alpha_hat', 'Estimated α̂', 'alpha'), - ('ux_volatility', 'UX Volatility (Price Change)', 'Volatility'), - ('look_to_book', 'Look-to-Book Ratio', 'Ratio'), - ('reward', 'Step Reward', 'Reward'), - ('prob_agent_mean', 'Avg Agent Probability', 'Probability'), - ] - - for idx, (key, title, ylabel) in enumerate(plot_configs): - ax = axes[idx // 4, idx % 4] - ax.plot(metrics['t'], metrics[key], color='blue', alpha=0.7, linewidth=1.5) - ax.set_xlabel('Step') - ax.set_ylabel(ylabel) - ax.set_title(title, 
fontsize=10, fontweight='bold') - ax.grid(True, alpha=0.3) - - plt.tight_layout() - plt.savefig('phantom_env_comparison.png', dpi=150, bbox_inches='tight') - print("Plot saved to phantom_env_comparison.png") - plt.show() + def close(self) -> None: + return