Merge pull request #50 from velocitatem/new-simulation-environment-development

New simulation environment development
This commit is contained in:
Daniel Alves Rösel
2026-01-30 13:19:53 +01:00
committed by GitHub
65 changed files with 6747 additions and 644 deletions

3
.gitignore vendored
View File

@@ -9,6 +9,7 @@
*.old
**/package-lock.json
**/*.parquet
**/_build/
paper/src/bib/auto
experiments/airflow/logs/*
@@ -21,3 +22,5 @@ sim/rl/behavior_loader/*.png
sim/rl/behavior_loader/*.svg
sim/rl/behavior_loader/*.pdf
tests/e2e/node_modules/**
lab/case/thesis/runs*/
sim/case/thesis_simplified/runs*/

66
engine/engine.py Normal file
View File

@@ -0,0 +1,66 @@
from sys import platform
import numpy as np
from .lib.demand import generate_demand, estimate_demand
from .lib.behavior import sample_behavior
from logging import INFO, getLogger
logger = getLogger(__name__)
logger.setLevel(INFO)
class MarketEngine():
    """Market side of the simulation: maps posted prices to an estimated demand.

    A population of ``N`` sessions is split into agents (share ``alpha``) and
    humans (the remainder). Each ``act`` call samples one behaviour trajectory
    per session and re-estimates demand from the pooled trajectories.
    """

    def __init__(self,
                 alpha = 0.5,
                 N = 100,
                 demand_distribution = (50, 10),
                 demand_sampling_function = np.random.normal):
        """Store the population split and the demand sampling configuration.

        Args:
            alpha: fraction of the ``N`` sessions that are agents.
            N: total number of sessions per market step.
            demand_distribution: parameters forwarded to the sampling function.
            demand_sampling_function: callable used to draw product valuations.
        """
        # Round rather than truncate, and derive humans as the remainder, so the
        # two groups always add up to N (100 * 0.29 is 28.999... in floating point).
        self.Nagents = int(round(N * alpha))
        self.Nhumans = int(N) - self.Nagents
        self.demand = (demand_sampling_function, demand_distribution)

    def act(self, prices):
        """Return the demand estimate observed for ``prices``.

        Ground-truth demand is generated from the prices, then one trajectory is
        sampled for every human and every agent session, so that ``alpha`` and
        ``N`` actually shape the observed signal.
        """
        demand = generate_demand(prices, *self.demand)
        human_t = [sample_behavior(demand, human=True) for _ in range(self.Nhumans)]
        agent_t = [sample_behavior(demand, human=False) for _ in range(self.Nagents)]
        return estimate_demand(human_t + agent_t)

    def measure(self):
        """Placeholder for market-level measurements; currently a no-op."""
        pass
class PricingEngine():
    """Stub platform: proposes random prices and ignores the demand it is shown."""

    def __init__(self) -> None:
        pass

    def act(self, demand):
        """Draw 10 prices uniformly from [25, 100); ``demand`` is not used."""
        prices = np.random.uniform(25, 100, 10)
        return prices
class Limbo():
    """Alternates turns between a platform and a market, feeding each the other's output."""

    def __init__(self,
                 platform,
                 market
                 ) -> None:
        self.platform = platform
        self.market = market
        self.platform_turn = True  # the platform always moves first
        self.output = None  # last value produced by whichever side just acted

    def step(self):
        """Let the side whose turn it is act on the previous output, then swap turns."""
        actor = self.platform if self.platform_turn else self.market
        self.output = actor.act(self.output)
        print(self.output)
        self.platform_turn = not self.platform_turn
if __name__ == "__main__":
    # Smoke run: ten alternating platform/market turns.
    platform, market = PricingEngine(), MarketEngine()
    limbo = Limbo(platform, market)
    turns = 10
    for _ in range(turns):
        limbo.step()

3
engine/lib/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
from .demand import generate_demand, estimate_demand
from .behavior import sample_behavior
from .render import DashboardRenderer, style_axis

47
engine/lib/behavior.py Normal file
View File

@@ -0,0 +1,47 @@
from sim.rl.behavior_loader.models import BehaviorModel, AgentBehaviorModel, aggregate_event_transitions
import pandas as pd
import numpy as np
from .demand import generate_demand
base_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments"
human_dir, agent_dir = f"{base_dir}/collected_data/", f"{base_dir}/agents/collected_data/"
_cache = {} # lazy cache for models and base pivots
def _get_base_pivot(human: bool):
    """Return the cached base transition table for humans or agents.

    On first use the matching behaviour model is built from its data directory,
    its MDP is aggregated into event transitions, and the resulting DataFrame
    (missing cells filled with 0.0) is stored in ``_cache``.
    """
    key = 'human' if human else 'agent'
    try:
        return _cache[key]
    except KeyError:
        pass
    model = BehaviorModel(human_dir) if human else AgentBehaviorModel(agent_dir)
    transitions = aggregate_event_transitions(model.build_MDP())
    pivot = pd.DataFrame(transitions).fillna(0.0)
    _cache[key] = pivot
    return pivot
def adjust_behavior_to_condition(condition, transition_matrix):
    """Expand an event transition table to an (event, product) transition table.

    Each cell of the NxN ``transition_matrix`` becomes a PxP block equal to the
    cell value times the outer product of the normalised ``condition`` weights
    (a Kronecker product), giving an (N*P)x(N*P) DataFrame whose labels are
    ``"<event>_product<p>"``.

    Args:
        condition: per-product demand weights (length P), array-like.
        transition_matrix: pandas DataFrame of base transition values.

    Returns:
        pandas DataFrame of shape (N*P, N*P).
    """
    weights = np.asarray(condition, dtype=float)
    n_products = len(weights)
    total = weights.sum()
    if total > 0:
        cond_norm = weights / total
    elif n_products:
        # Demand can legitimately be all zeros (every price above its valuation).
        # Dividing by zero would fill the matrix with NaN, so treat products as
        # equally likely instead.
        cond_norm = np.full(n_products, 1.0 / n_products)
    else:
        cond_norm = weights

    base_vals = transition_matrix.values
    base_cols, base_rows = transition_matrix.columns.tolist(), transition_matrix.index.tolist()
    # each base cell becomes a P*P block weighted by the outer product of cond_norm
    expanded = np.kron(base_vals, np.outer(cond_norm, cond_norm))
    new_cols = [f"{c}_product{p}" for c in base_cols for p in range(n_products)]
    new_rows = [f"{r}_product{p}" for r in base_rows for p in range(n_products)]
    return pd.DataFrame(expanded, index=new_rows, columns=new_cols)
def sample_behavior(condition, human=True, max_len=40):
    """Sample one event trajectory from the demand-adjusted transition table.

    The walk starts from a uniformly chosen row label and repeatedly samples the
    next state from the current row. It stops as soon as a state containing
    ``'checkout'`` is reached or the trajectory holds ``max_len`` states.

    Args:
        condition: per-product demand weights used to expand the base table.
        human: use the human base table if True, otherwise the agent one.
        max_len: maximum number of states in the returned trajectory.

    Returns:
        list of state labels of the form ``"<event>_product<p>"``.
    """
    base_pivot = _get_base_pivot(human)
    adjusted_transitions = adjust_behavior_to_condition(condition, base_pivot)
    states = adjusted_transitions.columns
    trajectory = [np.random.choice(adjusted_transitions.index)]
    # The original condition used `or 'checkout' in ...`, which never stopped at
    # checkout and kept sampling beyond max_len while the last state was a checkout.
    while len(trajectory) < max_len and 'checkout' not in trajectory[-1]:
        probs = adjusted_transitions.loc[trajectory[-1]].values
        mass = np.sum(probs)
        # a row with no outgoing mass falls back to a uniform draw (p=None)
        p = probs / mass if mass > 0 else None
        trajectory.append(np.random.choice(states, p=p))
    return trajectory
if __name__ == "__main__":
    # Print one sampled trajectory for a human session, then one for an agent session.
    for is_human in (True, False):
        t = sample_behavior(generate_demand(np.array([10, 20, 30])), human=is_human)
        print(t)

45
engine/lib/demand.py Normal file
View File

@@ -0,0 +1,45 @@
import logging
import numpy as np
from logging import getLogger
logger = getLogger(__name__)
def generate_demand(prices, distribution_method = np.random.normal, distribution_params = (50.0, 10.0)):
    """Generate a per-product demand vector, in percent, for the given prices.

    Args:
        prices: array of product prices.
        distribution_method: callable ``f(*distribution_params, size=n)`` that
            draws one valuation per product.
        distribution_params: positional parameters for ``distribution_method``.

    Returns:
        Array of non-negative demands that sums to 100, or all zeros when no
        valuation exceeds its price.
    """
    # assumption 1: each product has an intrinsic valuation drawn from the configured distribution
    product_valuations = distribution_method(*distribution_params, size=len(prices))
    # assumption 2: demand decreases linearly with price and cannot be negative
    demand = np.maximum(0, product_valuations - prices)
    total = np.sum(demand)
    demand = demand / total * 100 if total > 0 else demand  # normalize to percentage, avoid div by zero
    # Lazy %-style arguments: the arrays are only formatted when INFO logging is
    # enabled, which matters because this runs on every environment step.
    logger.info(
        "Generated demand for prices %s: %s with valuations from distribution %s",
        prices, demand, distribution_params,
    )
    return demand
def estimate_demand(trajectories):
    """Estimate per-product demand shares (percent) from product-view events.

    Every event whose label contains ``'view_product'`` counts as one view of the
    product whose index is encoded in the label's last ``_``-separated token
    (``product<idx>``). Counts are normalised so that they sum to 100.

    Returns:
        dict mapping product index to its share of views; empty when no view
        events were seen.
    """
    views = {}
    for event in (ev for traj in trajectories for ev in traj):
        if 'view_product' not in event:
            continue
        product_id = int(event.rsplit('_', 1)[-1].replace('product', ''))
        views[product_id] = views.get(product_id, 0) + 1
    total_views = sum(views.values())
    return {pid: (count / total_views) * 100 for pid, count in views.items()}
# Example usage: compare generated demand with the demand recovered from sampled behaviour
if __name__ == "__main__":
    np.random.seed(42)
    prices = np.array([20.0, 35.0, 50.0, 65.0])
    demand = generate_demand(prices)
    print("Generated Demand:", demand)
    from .behavior import sample_behavior
    N, alphat = 200, 0.1
    n_human, n_agent = int(N*(1 - alphat)), int(N*alphat)
    # humans are sampled first, then agents, so the random stream is consumed in a fixed order
    trajectories = [sample_behavior(demand, human=True) for _ in range(n_human)]
    trajectories += [sample_behavior(demand, human=False) for _ in range(n_agent)]
    demand_estimate = estimate_demand(trajectories)
    print("Estimated Demand from Behavior:", demand_estimate)
    # mean absolute gap between the estimated and the generated share per product
    delta = np.mean([np.abs(demand_estimate.get(k, 0) - demand[k]) for k in range(len(prices))])
    print("Demand Delta:", delta)

126
engine/lib/render.py Normal file
View File

@@ -0,0 +1,126 @@
"""rendering logic for PHANTOM environment dashboard"""
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
def style_axis(ax, title: str = None, xlabel: str = None, ylabel: str = None):
    """Hide the top/right spines of ``ax`` and set whichever labels are given.

    Falsy ``title`` / ``xlabel`` / ``ylabel`` values leave that label untouched.
    """
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    if title:
        ax.set_title(title, fontsize=11, fontweight='bold', pad=8)
    if xlabel:
        ax.set_xlabel(xlabel, fontsize=9)
    if ylabel:
        ax.set_ylabel(ylabel, fontsize=9)
class DashboardRenderer:
    """stateful renderer for PHANTOM market dynamics visualization

    Keeps a single matplotlib figure alive between calls; every ``render`` clears
    it and redraws seven panels on a 3x3 grid from the environment's history.
    The environment is read through ``_step_count``, ``alpha``, ``n_products``,
    ``_price_history``, ``_demand_history``, ``_revenue_history``,
    ``market.Nhumans`` / ``market.Nagents`` and ``_compute_elasticity()``.
    """

    def __init__(self):
        # figure and grid are created lazily on the first render() call
        self.fig = None
        self.gs = None

    def render(self, env) -> None:
        """Redraw the whole dashboard for the current state of ``env``."""
        if self.fig is None:
            # first call: switch to interactive mode and open a non-blocking window
            plt.ion()
            self.fig = plt.figure(figsize=(14, 10))
            self.gs = GridSpec(3, 3, figure=self.fig, hspace=0.35, wspace=0.3,
                               left=0.07, right=0.95, top=0.92, bottom=0.08)
            plt.show(block=False)
        # start from an empty figure; each panel below re-adds its own axes
        self.fig.clear()
        self.fig.suptitle(f'PHANTOM Market Dynamics [t={env._step_count}, a={env.alpha:.2f}]',
                          fontsize=14, fontweight='bold')
        # histories are lists of per-step vectors; transpose so rows index products
        demand_mat = np.array(env._demand_history).T
        price_mat = np.array(env._price_history).T
        elasticity = env._compute_elasticity()
        self._render_scatter(env)
        self._render_elasticity_bar(env, elasticity)
        self._render_session_pie(env)
        self._render_price_heatmap(price_mat)
        self._render_demand_heatmap(demand_mat)
        self._render_correlation(env.n_products, price_mat, demand_mat)
        self._render_revenue(env)
        self.fig.canvas.draw_idle()
        self.fig.canvas.flush_events()

    def _render_scatter(self, env):
        """Grid cell [0, 0]: price vs. demand scatter with a linear fit."""
        ax = self.fig.add_subplot(self.gs[0, 0])
        prices_flat = np.array(env._price_history).flatten()
        demands_flat = np.array(env._demand_history).flatten()
        # product index for every flattened point, used only for colouring
        product_ids = np.tile(np.arange(env.n_products), len(env._price_history))
        ax.scatter(prices_flat, demands_flat, c=product_ids, cmap='plasma', alpha=0.6, s=15, edgecolors='none')
        if len(prices_flat) > 1:
            # degree-1 polynomial fit drawn across the observed price range
            z = np.polyfit(prices_flat, demands_flat, 1)
            p_line = np.linspace(prices_flat.min(), prices_flat.max(), 50)
            ax.plot(p_line, np.polyval(z, p_line), '--', lw=1.5, alpha=0.8)
        style_axis(ax, "Price-Demand Relationship", "Price ($)", "Demand")

    def _render_elasticity_bar(self, env, elasticity):
        """Grid cell [0, 1]: horizontal bar per product of the given elasticity values."""
        ax = self.fig.add_subplot(self.gs[0, 1])
        ax.barh(range(env.n_products), elasticity, alpha=0.8)
        # reference lines at 0 and at -1
        ax.axvline(0, lw=0.8, alpha=0.5)
        ax.axvline(-1, lw=1, ls='--', alpha=0.5)
        ax.set_yticks(range(env.n_products))
        ax.set_yticklabels([f'P{i}' for i in range(env.n_products)], fontsize=7)
        style_axis(ax, "Price Elasticity", "(dQ/dP)(P/Q)", None)

    def _render_session_pie(self, env):
        """Grid cell [0, 2]: pie of human vs. agent session counts from ``env.market``."""
        ax = self.fig.add_subplot(self.gs[0, 2])
        n_h, n_a = env.market.Nhumans, env.market.Nagents
        wedges, _ = ax.pie([n_h, n_a], startangle=90, wedgeprops={'linewidth': 2, 'edgecolor': 'white'})
        ax.legend(wedges, [f'H ({n_h})', f'A ({n_a})'], loc='lower center', fontsize=8,
                  frameon=False, bbox_to_anchor=(0.5, -0.05))
        ax.set_title("Session Mix", fontsize=11, fontweight='bold')

    def _render_price_heatmap(self, price_mat):
        """Grid row 1, columns 0-1: heatmap of ``price_mat`` (rows = products, columns = steps)."""
        ax = self.fig.add_subplot(self.gs[1, :2])
        im = ax.imshow(price_mat, aspect='auto', cmap='viridis', origin='lower')
        style_axis(ax, "Price Heatmap P(product, t)", "Step", "Product")
        cbar = self.fig.colorbar(im, ax=ax, fraction=0.03, pad=0.02)
        cbar.set_label('$', fontsize=8)

    def _render_demand_heatmap(self, demand_mat):
        """Grid cell [1, 2]: heatmap of ``demand_mat`` (rows = products, columns = steps)."""
        ax = self.fig.add_subplot(self.gs[1, 2])
        im = ax.imshow(demand_mat, aspect='auto', cmap='Blues', origin='lower')
        style_axis(ax, "Demand Q(product, t)", "Step", None)
        self.fig.colorbar(im, ax=ax, fraction=0.046, pad=0.02)

    def _render_correlation(self, n_products, price_mat, demand_mat):
        """Grid cell [2, 0]: price/demand cross-correlation; left empty until 3 steps are recorded."""
        ax = self.fig.add_subplot(self.gs[2, 0])
        if price_mat.shape[1] > 2:
            # corrcoef stacks both matrices; keep only the price-rows x demand-columns quadrant
            corr = np.corrcoef(price_mat, demand_mat)[:n_products, n_products:]
            im = ax.imshow(corr, cmap='RdBu', vmin=-1, vmax=1, aspect='auto')
            ax.set_xticks(range(n_products))
            ax.set_yticks(range(n_products))
            ax.set_xticklabels([f'Q{i}' for i in range(n_products)], fontsize=6)
            ax.set_yticklabels([f'P{i}' for i in range(n_products)], fontsize=6)
            self.fig.colorbar(im, ax=ax, fraction=0.046, pad=0.02)
        style_axis(ax, "Price-Demand Correlation", None, None)

    def _render_revenue(self, env):
        """Grid row 2, columns 1-2: revenue per step with demand standard deviation on a twin axis."""
        ax = self.fig.add_subplot(self.gs[2, 1:])
        n_steps = len(env._revenue_history)
        # spread of demand across products at each step
        demand_std = [np.std(d) for d in env._demand_history]
        ax.fill_between(range(n_steps), env._revenue_history, alpha=0.3)
        ax.plot(env._revenue_history, linewidth=2, label='Revenue')
        ax.set_xlim(0, max(n_steps, 1))
        ax.set_ylim(0, max(env._revenue_history) * 1.1 if env._revenue_history else 1)
        ax2 = ax.twinx()
        ax2.plot(range(n_steps), demand_std, linewidth=2, ls='-', alpha=0.9, label='sigma(Demand)')
        # NOTE(review): min()/max() raise on an empty history — assumes at least one recorded step
        d_min, d_max = min(demand_std), max(demand_std)
        # pad the y-range by 20% of the spread, or by 0.5 when the series is flat
        margin = (d_max - d_min) * 0.2 if d_max > d_min else 0.5
        ax2.set_ylim(max(0, d_min - margin), d_max + margin)
        ax2.set_ylabel('Demand sigma', fontsize=9)
        style_axis(ax, "Revenue & Demand Dispersion", "Step", "Revenue ($)")
        ax.legend(loc='upper left', fontsize=7, frameon=False)
        ax2.legend(loc='upper right', fontsize=7, frameon=False)

    def close(self):
        """Close the figure, if one was opened, and forget it."""
        if self.fig:
            plt.close(self.fig)
            self.fig = None

34
engine/studies/factors.py Normal file
View File

@@ -0,0 +1,34 @@
"""shared factor definitions for experimental designs"""
import numpy as np
from dataclasses import dataclass, field
from typing import Callable, Any
@dataclass
class Factor:
    """One experimental factor: a name plus the levels it can take.

    ``primary`` marks factors that are fully crossed; non-primary factors are
    sampled instead.
    """

    name: str
    levels: list
    primary: bool = True
# demand functions with compatible signatures
def demand_linear(mu, sigma, size):
    """Draw ``size`` normal(mu, sigma) samples and clamp negatives to zero."""
    draws = np.random.normal(loc=mu, scale=sigma, size=size)
    return np.maximum(0, draws)
def demand_uniform(mu, sigma, size):
    """Draw ``size`` samples uniformly from [mu - sigma, mu + sigma)."""
    low, high = mu - sigma, mu + sigma
    return np.random.uniform(low=low, high=high, size=size)
def demand_exponential(mu, sigma, size):
    """Draw ``size`` exponential samples with scale ``mu``.

    ``sigma`` is accepted only to keep the signature compatible with the other
    demand functions; it is not used.
    """
    return np.random.exponential(scale=mu, size=size)
def demand_logistic(mu, sigma, size):
    """Draw ``size`` logistic samples with location ``mu`` and scale ``sigma``."""
    return np.random.logistic(loc=mu, scale=sigma, size=size)
# registry: factor level name -> valuation sampler
DEMAND_FUNCTIONS = dict(
    linear=demand_linear,
    uniform=demand_uniform,
    exponential=demand_exponential,
    logistic=demand_logistic,
)

# primary factors are fully crossed; the rest are flagged primary=False
FACTORS = [
    Factor("demand_fn", list(DEMAND_FUNCTIONS), primary=True),
    Factor("alpha", [0.1, 0.3, 0.5, 0.7], primary=True),
    Factor("n_products", [5, 15, 30, 50], primary=True),
    Factor("demand_mu", [30.0, 50.0, 70.0], primary=False),
    Factor("demand_sigma", [5.0, 10.0, 20.0], primary=False),
    Factor("N", [100, 500, 1000], primary=False),
]

# number of seeds replicated for every configuration
SEEDS_PER_CONFIG = 5

View File

@@ -0,0 +1,89 @@
"""full factorial design - all factor combinations"""
import sys
sys.path.insert(0, "..")
import logging
from itertools import product
import json
import hashlib
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
from .factors import FACTORS, DEMAND_FUNCTIONS, SEEDS_PER_CONFIG
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)
def generate_configs():
    """generate all factor combinations with seeds

    Every combination of factor levels is replicated ``SEEDS_PER_CONFIG`` times;
    each config gets an 8-character id taken from the MD5 of its sorted JSON form.
    """
    names = [f.name for f in FACTORS]
    configs = []
    for combo in product(*(f.levels for f in FACTORS)):
        base = dict(zip(names, combo))
        for seed in range(SEEDS_PER_CONFIG):
            cfg = dict(base, seed=seed)
            # the id is computed before the "id" key is added, so it depends only on factors + seed
            digest = hashlib.md5(json.dumps(cfg, sort_keys=True).encode()).hexdigest()
            cfg["id"] = digest[:8]
            configs.append(cfg)
    return configs
def run_single(cfg: dict) -> dict:
    """execute one experiment config, return metrics

    Builds a PHANTOM environment from ``cfg``, overrides the market's demand
    sampler, and rolls out up to 100 uniformly random actions.

    Args:
        cfg: config with keys ``id``, ``seed``, ``demand_fn``, ``n_products``,
            ``alpha``, ``N``, ``demand_mu`` and ``demand_sigma``.

    Returns:
        dict with the config id, the config itself, total and average reward,
        and the number of steps taken.
    """
    from engine.wrapper import PHANTOM
    import numpy as np
    np.random.seed(cfg["seed"])
    demand_fn = DEMAND_FUNCTIONS[cfg["demand_fn"]]
    env = PHANTOM(
        n_products=cfg["n_products"],
        alpha=cfg["alpha"],
        N=cfg["N"],
    )
    try:
        # action_space.sample() draws from the space's own generator, which
        # np.random.seed does not touch; seed it so a config is reproducible.
        env.action_space.seed(cfg["seed"])
        env.market.demand = (demand_fn, (cfg["demand_mu"], cfg["demand_sigma"]))
        obs, _ = env.reset()
        total_reward, steps = 0.0, 0
        for _ in range(100):
            action = env.action_space.sample()
            obs, reward, term, trunc, _ = env.step(action)
            total_reward += reward
            steps += 1
            if term or trunc:
                break
    finally:
        # release the environment even when a step raises
        env.close()
    return {
        "id": cfg["id"],
        "config": cfg,
        "total_reward": total_reward,
        "avg_reward": total_reward / steps,
        "steps": steps,
    }
def run_study(max_workers: int = None, output: str = "results_full.jsonl"):
    """Run every full-factorial config in a process pool and write the results to ``output``.

    Results are written as one JSON object per line, in config order, and are
    also returned as a list.
    """
    configs = generate_configs()
    log.info(f"full factorial: {len(configs)} configs ({len(configs)//SEEDS_PER_CONFIG} unique × {SEEDS_PER_CONFIG} seeds)")
    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        for i, result in enumerate(ex.map(run_single, configs)):
            results.append(result)
            finished = i + 1
            if finished % 100 != 0:
                continue
            log.info(f"progress: {i+1}/{len(configs)}")
    serialized = [json.dumps(r) for r in results]
    Path(output).write_text("\n".join(serialized))
    log.info(f"wrote {len(results)} results to {output}")
    return results
if __name__ == "__main__":
    # CLI: report the design size and, unless --dry-run is given, execute the study.
    import argparse
    p = argparse.ArgumentParser()
    p.add_argument("--workers", type=int, default=None)
    p.add_argument("--output", default="results_full.jsonl")
    p.add_argument("--dry-run", action="store_true", help="only show design size")
    args = p.parse_args()
    configs = generate_configs()
    log.info(f"design: {len(configs)} runs | factors: {[f.name for f in FACTORS]} | levels: {[len(f.levels) for f in FACTORS]}")
    if args.dry_run:
        pass
    else:
        run_study(args.workers, args.output)

106
engine/studies/mixed_lh.py Normal file
View File

@@ -0,0 +1,106 @@
"""mixed design: full factorial on primary factors, latin hypercube on secondary"""
import sys
sys.path.insert(0, "..")
import logging
from itertools import product
import json
import hashlib
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
import numpy as np
from scipy.stats.qmc import LatinHypercube
from factors import FACTORS, DEMAND_FUNCTIONS, SEEDS_PER_CONFIG
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)
# default number of Latin-hypercube samples drawn per primary cell
LH_SAMPLES = 10


def generate_configs(lh_samples: int = LH_SAMPLES):
    """Build configs: full cross of primary factors, Latin-hypercube draws for secondary ones.

    For each primary combination, ``lh_samples`` points are drawn from one shared
    sampler (seed 42). Each coordinate in [0, 1) is mapped to a level index of
    the matching secondary factor. Every resulting config is replicated
    ``SEEDS_PER_CONFIG`` times and tagged with an 8-character MD5-based id.
    """
    primary = [f for f in FACTORS if f.primary]
    secondary = [f for f in FACTORS if not f.primary]
    primary_names = [f.name for f in primary]
    lhs = LatinHypercube(d=len(secondary), seed=42)
    configs = []
    for p_combo in product(*[f.levels for f in primary]):
        for point in lhs.random(n=lh_samples):
            base = dict(zip(primary_names, p_combo))
            for factor, u in zip(secondary, point):
                # scale the unit-interval coordinate to a level index
                base[factor.name] = factor.levels[int(u * len(factor.levels))]
            for seed in range(SEEDS_PER_CONFIG):
                cfg = dict(base, seed=seed)
                digest = hashlib.md5(json.dumps(cfg, sort_keys=True).encode()).hexdigest()
                cfg["id"] = digest[:8]
                configs.append(cfg)
    return configs
def run_single(cfg: dict) -> dict:
    """Execute one experiment config and return its metrics.

    Builds a PHANTOM environment from ``cfg``, overrides the market's demand
    sampler, and rolls out up to 100 uniformly random actions.

    Args:
        cfg: config with keys ``id``, ``seed``, ``demand_fn``, ``n_products``,
            ``alpha``, ``N``, ``demand_mu`` and ``demand_sigma``.

    Returns:
        dict with the config id, the config itself, total and average reward,
        and the number of steps taken.
    """
    from engine.wrapper import PHANTOM
    import numpy as np
    np.random.seed(cfg["seed"])
    demand_fn = DEMAND_FUNCTIONS[cfg["demand_fn"]]
    env = PHANTOM(
        n_products=cfg["n_products"],
        alpha=cfg["alpha"],
        N=cfg["N"],
    )
    try:
        # action_space.sample() draws from the space's own generator, which
        # np.random.seed does not touch; seed it so a config is reproducible.
        env.action_space.seed(cfg["seed"])
        env.market.demand = (demand_fn, (cfg["demand_mu"], cfg["demand_sigma"]))
        obs, _ = env.reset()
        total_reward, steps = 0.0, 0
        for _ in range(100):
            action = env.action_space.sample()
            obs, reward, term, trunc, _ = env.step(action)
            total_reward += reward
            steps += 1
            if term or trunc:
                break
    finally:
        # release the environment even when a step raises
        env.close()
    return {
        "id": cfg["id"],
        "config": cfg,
        "total_reward": total_reward,
        "avg_reward": total_reward / steps,
        "steps": steps,
    }
def run_study(max_workers: int = None, output: str = "results_mixed.jsonl", lh_samples: int = LH_SAMPLES):
    """Run every mixed-design config in a process pool and write the results to ``output``.

    Results are written as one JSON object per line, in config order, and are
    also returned as a list.
    """
    configs = generate_configs(lh_samples)
    primary_sizes = [len(f.levels) for f in FACTORS if f.primary]
    n_primary_cells = int(np.prod(primary_sizes))
    log.info(f"mixed LH: {len(configs)} configs ({n_primary_cells} primary × {lh_samples} LH × {SEEDS_PER_CONFIG} seeds)")
    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        for i, result in enumerate(ex.map(run_single, configs)):
            results.append(result)
            finished = i + 1
            if finished % 100 != 0:
                continue
            log.info(f"progress: {i+1}/{len(configs)}")
    serialized = [json.dumps(r) for r in results]
    Path(output).write_text("\n".join(serialized))
    log.info(f"wrote {len(results)} results to {output}")
    return results
if __name__ == "__main__":
    # CLI: report the design size and, unless --dry-run is given, execute the study.
    import argparse
    p = argparse.ArgumentParser()
    p.add_argument("--workers", type=int, default=None)
    p.add_argument("--output", default="results_mixed.jsonl")
    p.add_argument("--lh-samples", type=int, default=10)
    p.add_argument("--dry-run", action="store_true", help="only show design size")
    args = p.parse_args()
    primary, secondary = [], []
    for f in FACTORS:
        (primary if f.primary else secondary).append(f)
    configs = generate_configs(args.lh_samples)
    log.info(f"design: {len(configs)} runs | primary: {[f.name for f in primary]} | secondary (LH): {[f.name for f in secondary]}")
    if args.dry_run:
        pass
    else:
        run_study(args.workers, args.output, args.lh_samples)

45
engine/train.py Normal file
View File

@@ -0,0 +1,45 @@
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import EvalCallback, BaseCallback
from .wrapper import PHANTOM
class RenderCallback(BaseCallback):
    """Callback that redraws the given environment after every training step."""

    def __init__(self, env: PHANTOM):
        super().__init__()
        # environment whose render() is invoked on each step
        self.env = env

    def _on_step(self) -> bool:
        """Render the environment; always returns True."""
        self.env.render()
        return True
# NOTE(review): this module trains at import time (there is no __main__ guard).
env = PHANTOM(n_products=10, alpha=0.3, render_mode="human")
eval_env = PHANTOM(n_products=10, alpha=0.3, render_mode=None)
try:
    model = SAC(
        "MultiInputPolicy",
        env,
        verbose=1,
        learning_rate=3e-4,
        buffer_size=50000,
        batch_size=256,
        tau=0.005,
        gamma=0.99,
    )
    render_cb = RenderCallback(env)
    eval_cb = EvalCallback(eval_env, eval_freq=1000, n_eval_episodes=5, verbose=1)
    model.learn(total_timesteps=50000, callback=[render_cb, eval_cb])
    model.save("phantom_sac")
finally:
    # The training env owns a live dashboard figure; previously it was rebound
    # below without being closed, and eval_env was never closed at all.
    env.close()
    eval_env.close()

# test trained policy on a fresh environment
env = PHANTOM(n_products=10, alpha=0.3, render_mode="human")
try:
    obs, _ = env.reset()
    for _ in range(100):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, term, trunc, _ = env.step(action)
        env.render()
        if term or trunc:
            break
finally:
    env.close()

118
engine/wrapper.py Normal file
View File

@@ -0,0 +1,118 @@
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from .engine import Limbo, MarketEngine, PricingEngine
from .lib.render import DashboardRenderer
class PHANTOM(gym.Env):
    """Gymnasium wrapper for the Limbo pricing-market simulation. Platform sets prices, market responds with demand.

    Actions are per-product prices inside ``price_bounds``. Observations are a
    dict with the market's demand estimate (percent per product) and the
    prices in force. The reward is the revenue ``sum(prices * demand)``. An
    episode terminates after 100 steps.
    """

    metadata = {"render_modes": ["human", "ansi"]}

    def __init__(self,
                 n_products: int = 10,
                 alpha: float = 0.3,
                 N: int = 100,
                 price_bounds: tuple = (10.0, 150.0),
                 lambda_coi: float = 0.1,
                 render_mode: str = None):
        """Create the environment.

        Args:
            n_products: number of products priced on every step.
            alpha: agent share of the market population (passed to MarketEngine).
            N: market population size (passed to MarketEngine).
            price_bounds: (low, high) bounds applied to every price.
            lambda_coi: stored for the reward; not used by the current reward.
            render_mode: ``"human"``, ``"ansi"`` or None.
        """
        super().__init__()
        self.n_products = n_products
        self.price_bounds = price_bounds
        self.lambda_coi = lambda_coi
        self.render_mode = render_mode
        self.alpha = alpha
        self.N = N
        self.market = MarketEngine(alpha=alpha, N=N)
        self._platform_stub = PricingEngine()
        self._limbo = Limbo(self._platform_stub, self.market)
        self.action_space = spaces.Box(
            low=price_bounds[0], high=price_bounds[1],
            shape=(n_products,), dtype=np.float32
        )
        self.observation_space = spaces.Dict({
            "demand": spaces.Box(low=0.0, high=100.0, shape=(n_products,), dtype=np.float32),
            "prices": spaces.Box(low=price_bounds[0], high=price_bounds[1], shape=(n_products,), dtype=np.float32),
        })
        self._prices = None
        self._demand = None
        self._step_count = 0
        self._demand_history = []
        self._price_history = []
        self._revenue_history = []
        self._renderer = None

    def _demand_array(self, demand: dict = None) -> np.ndarray:
        """Turn a {product index: demand} dict into a dense vector; missing products count as 0."""
        source = self._demand if demand is None else demand
        return np.array([source.get(i, 0.0) for i in range(self.n_products)])

    def _get_obs(self) -> dict:
        """Build the observation dict from the current demand and prices (both float32)."""
        return {
            "demand": self._demand_array().astype(np.float32),
            "prices": self._prices.astype(np.float32),
        }

    def _compute_reward(self, prices: np.ndarray, demand: dict) -> float:
        """Return revenue: the sum over products of price times demand."""
        revenue = np.sum(prices * self._demand_array(demand))
        # TODO: implement supra-competitive price punishment
        return float(revenue)

    def _record_history(self):
        """Append the current demand, prices and revenue to the histories used for rendering."""
        demand_arr = self._demand_array()
        self._demand_history.append(demand_arr)
        self._price_history.append(self._prices.copy())
        self._revenue_history.append(np.sum(self._prices * demand_arr))

    def reset(self, seed=None, options=None):
        """Start a new episode with random initial prices.

        When ``seed`` is given, the global NumPy generator is seeded with it:
        the initial prices and all market sampling draw from ``np.random``, so
        this is what makes ``reset(seed=...)`` reproducible. Without a seed the
        global generator is left untouched.

        Returns:
            (observation, info) with an empty info dict.
        """
        super().reset(seed=seed)
        if seed is not None:
            # np.random.seed only accepts values in [0, 2**32)
            np.random.seed(seed % (2 ** 32))
        self._prices = np.random.uniform(*self.price_bounds, size=self.n_products)
        self._demand = self.market.act(self._prices)
        self._step_count = 0
        self._demand_history, self._price_history, self._revenue_history = [], [], []
        self._record_history()
        return self._get_obs(), {}

    def step(self, action: np.ndarray):
        """Apply ``action`` as prices (clipped to the bounds) and observe the market response.

        Returns:
            (observation, reward, terminated, truncated, info); ``truncated`` is
            always False and ``terminated`` becomes True at step 100.
        """
        self._prices = np.clip(action, *self.price_bounds)
        self._demand = self.market.act(self._prices)
        self._step_count += 1
        self._record_history()
        reward = self._compute_reward(self._prices, self._demand)
        terminated = self._step_count >= 100
        return self._get_obs(), reward, terminated, False, {"step": self._step_count}

    def _compute_elasticity(self) -> np.ndarray:
        """point elasticity: e = (dQ/dP) * (P/Q) via finite differences, clipped to [-5, 5]"""
        if len(self._price_history) < 2:
            return np.zeros(self.n_products)
        p, q = np.array(self._price_history), np.array(self._demand_history)
        dp, dq = np.diff(p, axis=0), np.diff(q, axis=0)
        # ignore steps where the price barely moved; the ratio would be dominated by noise
        valid = np.abs(dp) > 0.5
        with np.errstate(divide='ignore', invalid='ignore'):
            elasticity = np.where(valid, (dq / dp) * (p[:-1] / np.maximum(q[:-1], 1.0)), 0.0)
        elasticity = np.nan_to_num(np.clip(elasticity, -5.0, 5.0), nan=0.0)
        return np.mean(elasticity, axis=0) if len(elasticity) > 0 else np.zeros(self.n_products)

    def render(self):
        """Render according to ``render_mode``: draw the dashboard, return a text line, or do nothing."""
        if self.render_mode == "human":
            if self._renderer is None:
                self._renderer = DashboardRenderer()
            self._renderer.render(self)
        elif self.render_mode == "ansi":
            return f"step={self._step_count}, prices={self._prices}, demand={self._demand}"
        return None

    def close(self):
        """Close the dashboard renderer if one was created."""
        if self._renderer:
            self._renderer.close()
            self._renderer = None
if __name__ == "__main__":
    # Demo: random policy on a 15-product market with live rendering.
    env = PHANTOM(n_products=15, alpha=0.3, N=100, render_mode="human")
    try:
        obs, _ = env.reset()
        for step in range(100):
            action = env.action_space.sample()
            obs, reward, term, trunc, info = env.step(action)
            env.render()
            if term or trunc:
                break
    finally:
        # close the dashboard window even if a step or render call fails
        env.close()

75
lab/README.md Normal file
View File

@@ -0,0 +1,75 @@
# MOS (Money Operating System)
Research-grade quote-control simulator for studying dynamic pricing and market making policies.
The system models pricing as a closed loop of **Quote → Arrival → Execution → Position**, enabling
controlled experimentation with demand models, inventory constraints, and reward shaping.
## Core Loop
1. **Quote** — the policy posts prices (one-sided or two-sided depending on the mechanism).
2. **Arrival** — a population model generates purchase opportunities or market orders.
3. **Execution** — an execution model decides whether an arrival converts at the quoted price.
4. **Position** — inventory/position limits censor fills and generate holding/shortage costs.
5. **Observation & Reward** — censored fills and aggregate metrics are exposed to the agent, while
objectives turn metrics into a scalar reward.
Each stage is pluggable via light-weight protocols so you can swap in alternative mechanisms,
demand models, or objectives without rewriting the rest of the simulator.
## Package Layout
| Module | Purpose |
|-------------------|---------|
| `lab.outlet` | Core simulation engine, domain types, pricing mechanisms, objectives. |
| `lab.population` | Demand arrival models, execution probability models, competitor/market dynamics. |
| `lab.experiments` | Rollout utilities, baseline policies, and off-policy evaluation helpers. |
| `lab.config` | Convenience factories for preconfigured retail and market-making environments. |
## Preconfigured Scenarios
### Retail Dynamic Pricing
- Mechanism: posted prices with margin and delta constraints.
- Arrivals: browsing sessions with contamination support (scrapers).
- Execution: elasticity model with competitor cross-effects.
- Position: inventory tracking with holding and shortage costs.
- Market: reactive competitor that can trigger price wars.
- Objective: PnL minus volatility, holding cost, and lost opportunity penalties.
```python
from lab.config import make_retail_platform
from lab.experiments import rollout, fixed_price_policy
platform = make_retail_platform()
policy = fixed_price_policy(platform.instruments.refs)
result = rollout(platform, policy, n_steps=100)
print(result.total_pnl)
```
### Market Making
- Mechanism: two-sided quoting with bid/ask spreads.
- Arrivals: Hawkes order flow for clustered demand.
- Execution: Avellaneda–Stoikov-style intensity model.
- Position: inventory risk limits and quadratic penalty objective.
- Market: geometric Brownian motion mid-price process.
- Objective: PnL plus spread capture minus inventory risk.
```python
from lab.config import make_market_making_platform
from lab.experiments import rollout
platform = make_market_making_platform()
mm_policy = lambda obs, t: (platform.instruments.refs, 1.0)
result = rollout(platform, mm_policy, n_steps=200, seed=42)
print(result.total_pnl)
```
## Extending the Simulator
- Implement `lab.outlet.protocols.Mechanism` or `ArrivalModel` to introduce new pricing
domains or demand processes.
- Compose objectives with `lab.outlet.objectives.factory.make_composite` to study alternate
reward formulations.
- Use `lab.experiments.compare_policies` to benchmark candidate policies across multiple
random seeds.
Comprehensive API documentation lives in `lab/docs` (build with `make html`).

27
lab/__init__.py Normal file
View File

@@ -0,0 +1,27 @@
"""
Quote-Control Simulator: Research-grade platform for dynamic pricing and market making
The platform abstracts pricing as: Quote -> Arrival -> Execution -> Position
Supports multiple mechanisms:
- PostedPrice: retail dynamic pricing
- TwoSided: market making with bid-ask spreads
- Auction: reserve/shading for auction settings
Example usage:
from lab.config import make_retail_platform
from lab.experiments import rollout, fixed_price_policy
platform = make_retail_platform()
policy = fixed_price_policy(platform.instruments.refs)
result = rollout(platform, policy, n_steps=100)
print(f"Total PnL: {result.total_pnl:.2f}")
"""
from .config import make_retail_platform, make_market_making_platform, RetailConfig, MarketMakingConfig
from .outlet import Platform, PlatformConfig, Quote, Observation, StepResult
__all__ = [
'make_retail_platform', 'make_market_making_platform',
'RetailConfig', 'MarketMakingConfig',
'Platform', 'PlatformConfig', 'Quote', 'Observation', 'StepResult',
]

6
lab/case/__init__.py Normal file
View File

@@ -0,0 +1,6 @@
"""
Case studies implementing specific research scenarios.
Available cases:
- thesis: PHANTOM thesis implementation with contaminated demand and DR-RL
"""

View File

@@ -0,0 +1,25 @@
"""
Thesis-specific implementation of the PHANTOM pricing defense framework.
This module implements the mathematical models from the thesis:
- ContaminatedArrivalModel: Mixture demand Q(p) = (1-α)d_H + αd_A (Eq 3)
- HybridExecutionModel: Divergent H/A behavior with separability (Section 2.1)
- RobustStackelbergObjective: Maximin objective with COI penalty (Eq 23)
- COIMetrics: Cost of Information tracking (Definition 1)
The platform configuration creates a research environment that directly
maps to the thesis mathematical framework for DR-RL experiments.
"""
from .arrivals import ContaminatedArrivalModel, ContaminatedArrivalConfig
from .execution import HybridExecutionModel, HybridExecutionConfig
from .objectives import RobustStackelbergObjective, COIObjective
from .platform import make_thesis_platform, ThesisConfig
from .metrics import COIMetrics, compute_coi, compute_separability
__all__ = [
'ContaminatedArrivalModel', 'ContaminatedArrivalConfig',
'HybridExecutionModel', 'HybridExecutionConfig',
'RobustStackelbergObjective', 'COIObjective',
'make_thesis_platform', 'ThesisConfig',
'COIMetrics', 'compute_coi', 'compute_separability',
]

327
lab/case/thesis/arrivals.py Normal file
View File

@@ -0,0 +1,327 @@
"""Contaminated arrivals using learned MDP kernels from behavior_loader.
Implements thesis demand model (Section 3.1):
- Aggregate demand Q(p) = (1-α)E[d(p;θ_H)] + αE[d(p;θ_A)] + ε_t (Eq 3)
- Demand proxy q̂_{t,i} = Σ_s Σ_k ω(a_{s,k}) · 1[i_{s,k} = i] (Eq 2)
- Per-session separability via KL divergence Δ_H, Δ_A (Eq 20-21)
The arrival model samples sessions from a mixture of human/agent behavioral profiles,
each session produces a trajectory τ_s and associated demand computation q(τ').
"""
from __future__ import annotations
from dataclasses import dataclass, field
from types import SimpleNamespace
from typing import Dict, List, Tuple, Optional
import numpy as np
from ...outlet.types import Opportunity, InstrumentSet, MarketState, HiddenState
from ...outlet.constants import Side, OpportunityType
from ...outlet.math_util import poisson_arrivals
try:
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
from sim.rl.behavior_loader.models import (
BehaviorModel, AgentBehaviorModel, aggregate_event_transitions, kl_divergence
)
REAL_MDP = True
except ImportError:
REAL_MDP = False
kl_divergence = None
# canonical event name -> page path
EVENT_PAGE = {
    "session_start": "/",
    "view_item_page": "/products",
    "learn_more_about_item": "/products/details",
    "add_item_to_cart": "/cart",
    "purchase_complete": "/checkout",
    "session_end": "/checkout/success",
}

# raw event name -> canonical event name
EVENT_CANON = {
    "page_view": "session_start",
    "hover_over_paragraph": "view_item_page",
    "hover_over_title": "view_item_page",
    "view_item_page": "view_item_page",
    "learn_more_about_item": "learn_more_about_item",
    "add_item_to_cart": "add_item_to_cart",
    "checkout_start": "purchase_complete",
    "remove_item": "view_item_page",
}

# action space partition A = A_nav ∪ A_cart ∪ A_filter ∪ A_dwell with signal weights ω (Table 1)
ACTION_WEIGHTS: Dict[str, float] = {
    # A_cart
    "add_item_to_cart": 0.8,
    "remove_item": 0.6,
    "checkout_start": 0.9,
    "purchase_complete": 1.0,
    # A_dwell
    "hover_over_title": 0.3,
    "hover_over_paragraph": 0.35,
    "hover_over_link": 0.25,
    # A_nav
    "page_view": 0.1,
    "session_start": 0.05,
    "view_item_page": 0.15,
    "learn_more_about_item": 0.2,
    # A_filter
    "search": 0.05,
    "filter_date": 0.05,
    "filter_price": 0.08,
    "sort": 0.03,
    "session_end": 0.0,
}
@dataclass
class SessionDemand:
    """Record of one simulated session and the signals derived from it (Section 3.1).

    Bundles the session trajectory τ_s with its demand proxy q̂, the divergence
    signals Δ_H and Δ_A, and the contamination estimate α̂(τ') built from them.
    """

    session_id: str
    q: Dict[int, float]  # demand proxy q̂_i keyed by product index (Eq 2)
    trajectory: List[Dict]  # τ_s = (e_{s,1}, ..., e_{s,L_s})
    delta_h: float = 0.0  # D_KL(T̂' || T̄_H) (Eq 20)
    delta_a: float = 0.0  # D_KL(T̂' || T̄_A) (Eq 21)
    alpha_hat: float = 0.0  # per-session contamination estimate
    actor_class: str = "H"  # ground-truth label Y_s ∈ {H, A}
    theta: Dict[str, float] = field(default_factory=dict)  # fresh dict per instance
def compute_demand_proxy(events: List[Dict], n_products: int) -> Dict[int, float]:
    """Accumulate the demand proxy q̂_{t,i} = Σ_k ω(a_{s,k}) · 1[i_{s,k} = i] (Eq 2).

    Args:
        events: Session events; each may carry "eventName" and "product_idx".
        n_products: Number of products; the result has one entry per index.

    Returns:
        Mapping from product index to summed action weight. Events without a
        product index, or with one outside [0, n_products), are ignored.
    """
    proxy = dict.fromkeys(range(n_products), 0.0)
    for event in events:
        action = event.get("eventName", "")
        product = event.get("product_idx")
        if product is None or not (0 <= product < n_products):
            continue
        # actions missing from the weight table contribute 0.1
        proxy[product] += ACTION_WEIGHTS.get(action, 0.1)
    return proxy
def compute_session_divergence(events: List[Dict], ref_h: Dict, ref_a: Dict) -> Tuple[float, float]:
    """Compute the divergence signals Δ_H and Δ_A for one trajectory (Eq 20-21).

    Builds the empirical transition kernel of the trajectory (starting from
    "session_start") and averages, over its source states, the KL divergence
    of each row against the matching row of the reference kernels.

    Args:
        events: Session events; "eventName" names the state entered.
        ref_h: Reference human kernel, state -> {next_state: probability}.
        ref_a: Reference agent kernel, same layout.

    Returns:
        (Δ_H, Δ_A); (0.0, 0.0) when there are no events or no KL
        implementation is available.
    """
    if not events:
        return 0.0, 0.0
    if kl_divergence is None:
        return 0.0, 0.0

    # count observed transitions source -> target
    counts: Dict[str, Dict[str, int]] = {}
    source = "session_start"
    for event in events:
        target = event.get("eventName", "session_end")
        row = counts.setdefault(source, {})
        row[target] = row.get(target, 0) + 1
        source = target

    # row-normalise counts into probabilities
    kernel = {}
    for state, row in counts.items():
        n_out = sum(row.values())
        kernel[state] = {dst: c / n_out for dst, c in row.items()} if n_out > 0 else {}

    n_rows = max(len(kernel), 1)
    delta_h = sum(kl_divergence(row, ref_h.get(state, {})) for state, row in kernel.items()) / n_rows
    delta_a = sum(kl_divergence(row, ref_a.get(state, {})) for state, row in kernel.items()) / n_rows
    return delta_h, delta_a
def _canonicalize(raw: Dict) -> Dict:
out = {}
for src, dsts in raw.items():
sc = EVENT_CANON.get(src, src)
out.setdefault(sc, {})
for dst, p in dsts.items():
dc = EVENT_CANON.get(dst, dst)
out[sc][dc] = out[sc].get(dc, 0.0) + p
return {s: {k: v/sum(d.values()) for k, v in d.items()} for s, d in out.items() if sum(d.values()) > 0}
class BehavioralProfile:
    """Markov behavioural profile for one actor class (Section 3.5.2).

    Transition kernel T̂_Y estimated via MLE: P̂(s'|s) = N(s,s') / Σ_k N(s,k) (Eq 19).
    When no learned kernel can be loaded, a built-in fallback kernel is used.

    Attributes:
        actor: "agents" selects the agent kernels; any other value the human ones.
        pprobs: Per-product purchase probabilities, clipped to [0, 0.95].
        trans: Transition kernel, state -> {next_state: probability}.
        dwell: Gamma (shape, scale) dwell-time parameters per state.
    """

    STATES = ["session_start", "view_item_page", "learn_more_about_item", "add_item_to_cart",
              "purchase_complete", "session_end"]
    # fallback kernels T̄_H, T̄_A when real data unavailable
    FALLBACK_H = {
        "session_start": {"view_item_page": 0.85, "session_end": 0.15},
        "view_item_page": {"learn_more_about_item": 0.4, "add_item_to_cart": 0.3, "view_item_page": 0.2, "session_end": 0.1},
        "learn_more_about_item": {"add_item_to_cart": 0.5, "view_item_page": 0.3, "session_end": 0.2},
        "add_item_to_cart": {"purchase_complete": 0.6, "view_item_page": 0.25, "session_end": 0.15},
        "purchase_complete": {"session_end": 1.0},
    }
    FALLBACK_A = {
        "session_start": {"view_item_page": 0.95, "session_end": 0.05},
        "view_item_page": {"learn_more_about_item": 0.6, "view_item_page": 0.25, "add_item_to_cart": 0.1, "session_end": 0.05},
        "learn_more_about_item": {"view_item_page": 0.5, "add_item_to_cart": 0.15, "learn_more_about_item": 0.3, "session_end": 0.05},
        "add_item_to_cart": {"view_item_page": 0.4, "purchase_complete": 0.2, "session_end": 0.4},
        "purchase_complete": {"session_end": 1.0},
    }
    # Cap on events emitted per session.
    MAX_EVENTS = 40
    # Cap on transitions per session. Steps taken in "session_start" emit no
    # event, so the event cap alone cannot bound a kernel that lets that state
    # loop onto itself.
    MAX_TRANSITIONS = 1000

    def __init__(self, actor: str, pprobs: np.ndarray, data_dir: str = ""):
        """Build the profile, loading a learned kernel from data_dir when possible."""
        self.actor, self.pprobs = actor, np.clip(pprobs, 0.0, 0.95)
        self.trans = self._load(data_dir)  # T̂_Y transition kernel
        self._ensure_terminal()
        self.dwell = {s: (1.2, 0.5) if actor == "agents" else (2.0, 1.2) for s in self.STATES}

    def _fallback(self) -> Dict:
        """Return a private copy of the fallback kernel for this actor.

        Rows are copied too, so later edits to self.trans never write into the
        class-level FALLBACK_* tables shared by every instance.
        """
        source = self.FALLBACK_A if self.actor == "agents" else self.FALLBACK_H
        return {state: dict(row) for state, row in source.items()}

    def _load(self, data_dir: str) -> Dict:
        """Load the learned kernel from data_dir, or fall back to the built-in one."""
        if not data_dir or not REAL_MDP:
            print("using fallback")
            return self._fallback()
        try:
            mdp = (AgentBehaviorModel if self.actor == "agents" else BehaviorModel)(data_dir).build_MDP()
            raw = aggregate_event_transitions(mdp) if mdp.get("transitions") else {}
            return _canonicalize(raw) if raw else self._fallback()
        except Exception:
            # best effort: any failure while loading real data degrades to the fallback kernel
            print("using fallback")
            return self._fallback()

    def _ensure_terminal(self):
        """Guarantee the kernel has an exit from purchase and an entry row."""
        self.trans.setdefault("purchase_complete", {}).setdefault("session_end", 1.0)
        self.trans.setdefault("session_start", {"view_item_page": 0.7, "learn_more_about_item": 0.2, "session_end": 0.1})

    def _tprobs(self, state: str, pidx: int) -> Dict[str, float]:
        """Return normalised next-state probabilities for state.

        In "add_item_to_cart" the purchase probability is an equal blend of the
        kernel's value and the product's purchase probability (scaled by 0.3 for
        agents); the remaining mass is spread over the other destinations in
        proportion to their kernel weights. States missing from the kernel go
        straight to "session_end".
        """
        probs = dict(self.trans.get(state, {"session_end": 1.0}))
        if state == "add_item_to_cart":
            base = probs.get("purchase_complete", 0.0)
            df = float(self.pprobs[pidx]) * (0.3 if self.actor == "agents" else 1.0)
            adj = np.clip(base * 0.5 + df * 0.5, 0.0, 0.95)
            rem = max(1e-6, 1.0 - adj)
            other = sum(v for k, v in probs.items() if k != "purchase_complete")
            probs = {k: (adj if k == "purchase_complete" else v * rem / max(other, 1e-6)) for k, v in probs.items()}
        total = sum(probs.values())
        return {k: v / total for k, v in probs.items()} if total > 0 else {"session_end": 1.0}

    def sample(self, rng: np.random.Generator, sid: str, prices: np.ndarray, costs: np.ndarray) -> Tuple[List[Dict], List[SimpleNamespace]]:
        """Simulate one session as a walk over the transition kernel.

        Args:
            rng: Random generator driving every draw.
            sid: Session identifier written into each event.
            prices: Per-product prices; one product is picked uniformly.
            costs: Per-product unit costs; the offered price is floored at 1.05x cost.

        Returns:
            (events, fevts): full event dicts and matching light-weight
            namespaces (eventName, page, productId, ts). The start state
            itself is never emitted.
        """
        events, fevts = [], []
        state, t, pidx = "session_start", 0.0, int(rng.integers(0, len(prices)))
        cost, cprice = float(costs[pidx]), max(float(prices[pidx]), float(costs[pidx]) * 1.05)
        steps = 0
        while state != "session_end" and len(events) < self.MAX_EVENTS and steps < self.MAX_TRANSITIONS:
            steps += 1
            if state != "session_start":
                row = {"session_id": sid, "actor": "agent" if self.actor == "agents" else "human",
                       "eventName": state, "product_idx": pidx, "productId": f"product-{pidx:04d}",
                       "price_offered": cprice, "price_paid": 0.0, "page": EVENT_PAGE.get(state, "/"),
                       "ts": t, "unit_cost": cost, "base_price": float(prices[pidx])}
                if state == "purchase_complete":
                    # paid price jitters around the offer but never drops below cost
                    row["price_paid"] = max(cprice * (1.0 + rng.normal(0.0, 0.015)), cost)
                events.append(row)
                fevts.append(SimpleNamespace(eventName=state, page=row["page"], productId=row["productId"], ts=t))
            probs = self._tprobs(state, pidx)
            # rng.choice yields a NumPy string; keep plain str in the emitted rows
            state = str(rng.choice(list(probs.keys()), p=list(probs.values())))
            sh, sc = self.dwell.get(state, (2.0, 1.0))
            t += max(0.3, rng.gamma(shape=sh, scale=sc))
        return events, fevts
@dataclass
class ContaminatedArrivalConfig:
    """Settings for ContaminatedArrivalModel."""

    # arrival intensity and contamination level
    base_rate: float = 20.0
    alpha_contamination: float = 0.2
    alpha_drift: float = 0.0  # 0 disables the random walk on alpha
    alpha_bounds: tuple[float, float] = (0.0, 0.5)

    # number of products viewed when a trajectory touches none
    human_views_range: tuple[int, int] = (1, 4)
    agent_views_range: tuple[int, int] = (3, 10)
    agent_systematic: bool = True

    # learned behaviour data; empty directories fall back to repo-relative paths
    use_real_behavior: bool = True
    human_data_dir: str = ""
    agent_data_dir: str = ""
class ContaminatedArrivalModel:
    """Mixture model Q(p) = (1-α)E[d(p;θ_H)] + αE[d(p;θ_A)] + ε_t (Eq 3).

    Samples sessions from human/agent behavioral profiles, computes per-session
    demand proxy q̂ and divergence signals Δ_H, Δ_A for separability.
    """

    def __init__(self, cfg: ContaminatedArrivalConfig | None = None):
        """Store the config (defaults if omitted) and reset all per-run state."""
        self.cfg = cfg or ContaminatedArrivalConfig()
        self._alpha = self.cfg.alpha_contamination
        self._scount = 0
        self._profiles: Dict[str, BehavioralProfile] = {}
        self._ref_kernels: Dict[str, Dict] = {}  # T̄_H, T̄_A reference kernels
        self._session_demands: List[SessionDemand] = []  # collected session demands

    @property
    def alpha(self) -> float:
        """Current contamination level α."""
        return self._alpha

    def _profile(self, actor: str, pprobs: np.ndarray) -> BehavioralProfile:
        """Return the profile for actor, creating it on first use.

        The kernel is loaded once per actor, but the purchase probabilities are
        refreshed on every call so they follow the prices of the current step
        instead of staying frozen at the values seen when the profile was built.
        """
        profile = self._profiles.get(actor)
        if profile is None:
            use_real = self.cfg.use_real_behavior
            ddir = self.cfg.agent_data_dir if actor == "agents" else self.cfg.human_data_dir
            if not ddir and use_real:
                base = Path(__file__).parent.parent.parent.parent / "experiments"
                ddir = str(base / ("agents/collected_data" if actor == "agents" else "collected_data"))
            profile = BehavioralProfile(actor, pprobs, ddir if use_real else "")
            self._profiles[actor] = profile
            self._ref_kernels[actor] = profile.trans  # cache T̄_Y for divergence
        else:
            # same clipping BehavioralProfile applies at construction
            profile.pprobs = np.clip(pprobs, 0.0, 0.95)
        return profile

    def get_ref_kernels(self) -> Tuple[Dict, Dict]:
        """Return reference transition kernels T̄_H, T̄_A for divergence computation."""
        return (self._ref_kernels.get("humans", BehavioralProfile.FALLBACK_H),
                self._ref_kernels.get("agents", BehavioralProfile.FALLBACK_A))

    def get_session_demands(self) -> List[SessionDemand]:
        """Return collected session demands for downstream analysis."""
        return self._session_demands

    def sample(self, t: float, dt: float, instruments: InstrumentSet,
               market: MarketState | None, hidden: HiddenState, rng: np.random.Generator) -> list[Opportunity]:
        """Sample arrivals as per Eq 3: mixture of human/agent demand distributions.

        For each session s, computes:
        - Trajectory τ_s from behavioral profile sampling
        - Demand proxy q̂ via weighted action aggregation (Eq 2)
        - Divergence signals Δ_H, Δ_A for separability (Eq 20-21)
        - Per-session contamination estimate α̂(τ')

        Side effects: updates hidden.contamination, appends one SessionDemand
        per session, and advances the session counter.

        Returns:
            One Opportunity per (session, viewed product).
        """
        cfg = self.cfg
        if cfg.alpha_drift != 0:
            self._alpha = float(np.clip(self._alpha + cfg.alpha_drift * rng.normal(), *cfg.alpha_bounds))
        hidden.contamination = self._alpha
        n_sess = poisson_arrivals(cfg.base_rate * hidden.true_demand_intensity, dt, rng)
        prices, costs = instruments.refs, instruments.costs
        margin = np.clip((prices - costs) / np.maximum(costs, 1e-3), -0.9, 2.0)
        hprob, aprob = 0.08 * np.exp(-1.2 * margin), 0.05 * np.exp(-0.6 * margin)
        # Make sure both profiles exist before reading the reference kernels;
        # otherwise the first step would score sessions against the fallback
        # kernels while later steps use the loaded ones.
        self._profile("humans", hprob)
        self._profile("agents", aprob)
        ref_h, ref_a = self.get_ref_kernels()
        opps = []
        for _ in range(n_sess):
            self._scount += 1
            sid = f"s{self._scount:06d}"
            is_agent = rng.random() < self._alpha
            actor, probs = ("agents", aprob) if is_agent else ("humans", hprob)
            profile = self._profile(actor, probs)
            events, fevts = profile.sample(rng, sid, prices, costs)
            # compute demand proxy q̂ per Eq 2
            q = compute_demand_proxy(events, instruments.n)
            # compute divergence signals Δ_H, Δ_A per Eq 20-21
            delta_h, delta_a = compute_session_divergence(events, ref_h, ref_a)
            # per-session contamination estimate α̂(τ') = σ(β(Δ_H - Δ_A)), β = 2;
            # 0.5 when there is no divergence signal at all
            alpha_hat = 1.0 / (1.0 + np.exp(-2.0 * (delta_h - delta_a))) if (delta_h + delta_a) > 0 else 0.5
            theta = ({'price_sensitivity': rng.uniform(0.05, 0.2), 'base_conversion': 0.01, 'info_value': 1.0} if is_agent
                     else {'price_sensitivity': rng.uniform(1.5, 4.0), 'base_conversion': rng.uniform(0.2, 0.5), 'info_value': 0.0})
            # store session demand for downstream analysis
            self._session_demands.append(SessionDemand(
                session_id=sid, q=q, trajectory=events, delta_h=delta_h, delta_a=delta_a,
                alpha_hat=alpha_hat, actor_class="A" if is_agent else "H", theta=theta))
            viewed = list({e["product_idx"] for e in events if "product_idx" in e})
            if not viewed:
                # trajectory touched no product: draw a random set of views instead
                vr = cfg.agent_views_range if is_agent else cfg.human_views_range
                viewed = list(rng.choice(instruments.n, size=min(rng.integers(*vr), instruments.n), replace=False))
            for vi, iid in enumerate(viewed):
                opps.append(Opportunity(
                    id=f"{sid}-{iid}", type=OpportunityType.SESSION, side=Side.BUY,
                    instrument_id=int(iid), size=1.0, t=t + rng.uniform(0, dt),
                    context={'session_id': sid, 'actor_class': 'AGENT' if is_agent else 'HUMAN', 'is_agent': is_agent,
                             'reconnaissance_intent': is_agent, 'view_index': vi, 'total_views': len(viewed),
                             'theta': theta, 'trajectory_events': fevts, 'mdp_trajectory': events,
                             'demand_proxy': q, 'alpha_hat': alpha_hat, 'delta_h': delta_h, 'delta_a': delta_a}))
        return opps
@dataclass
class AdversarialArrivalConfig:
    """Settings for AdversarialArrivalModel."""

    base_rate: float = 5.0  # rate passed to the Poisson arrival draw
    n_parallel_agents: int = 3  # agents spawned per query group
    query_all_products: bool = True  # False: each agent queries one random product
class AdversarialArrivalModel:
    """Adversarial coordination (Theorem 1): as N->inf, COI->0.

    Every arrival is a query group of several agents, each querying either all
    products or one random product.
    """

    def __init__(self, cfg: AdversarialArrivalConfig | None = None):
        """Store the config (defaults if omitted) and reset the group counter."""
        self.cfg = cfg or AdversarialArrivalConfig()
        self._qcount = 0

    def sample(self, t: float, dt: float, instruments: InstrumentSet,
               market: MarketState | None, hidden: HiddenState, rng: np.random.Generator) -> list[Opportunity]:
        """Return one agent Opportunity per (query group, agent, product) in this step."""
        cfg = self.cfg
        opps = []
        n_groups = poisson_arrivals(cfg.base_rate, dt, rng)
        for _ in range(n_groups):
            self._qcount += 1
            for ai in range(cfg.n_parallel_agents):
                sid = f"adv{self._qcount:06d}-{ai}"
                if cfg.query_all_products:
                    prods = np.arange(instruments.n)
                else:
                    prods = rng.choice(instruments.n, size=1)
                opps.extend(
                    Opportunity(
                        id=f"{sid}-{iid}", type=OpportunityType.SESSION, side=Side.BUY,
                        instrument_id=int(iid), size=1.0, t=t,
                        context={'session_id': sid, 'actor_class': 'AGENT', 'is_agent': True, 'adversarial': True,
                                 'agent_index': ai, 'query_group': self._qcount,
                                 'theta': {'price_sensitivity': 0.0, 'base_conversion': 0.0, 'info_value': 1.0}})
                    for iid in prods)
        return opps

View File

@@ -0,0 +1,91 @@
"""Execution models with divergent H/A behavior using ground truth labels."""
from __future__ import annotations
from dataclasses import dataclass
from typing import Any, Dict
import numpy as np
from ...outlet.types import Opportunity, Quote, InstrumentSet, MarketState
from ...outlet.math_util import sigmoid, safe_log, EPS
@dataclass
class HybridExecutionConfig:
    """Settings for HybridExecutionModel."""

    # human logit parameters (used when the session's theta gives no value)
    human_base_prob: float = 0.3
    human_elasticity: float = 2.5
    # agent conversion scale
    agent_conversion: float = 0.01
    # utility weights
    cross_elasticity: float = 0.4
    quality_weight: float = 0.2
    use_separability: bool = False
class HybridExecutionModel:
    """Execution with divergent H/A behavior using ground truth labels."""

    def __init__(self, cfg: HybridExecutionConfig | None = None):
        """Store the config, using defaults when none is given."""
        self.cfg = cfg or HybridExecutionConfig()

    def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet,
             market: MarketState | None, rng: np.random.Generator) -> float:
        """Return the probability that the opportunity converts at the quoted price.

        Agents (context['is_agent']) convert at agent_conversion scaled by their
        theta 'base_conversion'. Humans follow a logit choice model whose utility
        sums a base log-odds term, a price term relative to the reference price,
        a quality term, and a penalty when a competitor quotes below our price.
        """
        cfg, idx = self.cfg, int(opp.instrument_id)
        price, ref = float(quote.prices[idx]), float(instruments.refs[idx])
        ctx = opp.context
        theta = ctx.get('theta', {})
        is_agent = ctx.get('is_agent', False)
        if is_agent:
            return cfg.agent_conversion * theta.get('base_conversion', 1.0)
        # human logit discrete choice
        sens = theta.get('price_sensitivity', cfg.human_elasticity)
        base = theta.get('base_conversion', cfg.human_base_prob)
        u_price = -sens * safe_log(price / (ref + EPS))
        quality = instruments.instruments[idx].attrs.get('quality', 0.5)
        u_quality = cfg.quality_weight * quality
        u_comp = 0.0
        if market and market.competitor_quotes is not None:
            cp = market.competitor_quotes[idx]
            if cp < price:
                # guard the division like the price term above so a zero reference price cannot raise
                u_comp = -cfg.cross_elasticity * (price - cp) / (ref + EPS)
        utility = safe_log(base / (1 - base + EPS)) + u_price + u_quality + u_comp
        return float(sigmoid(utility))

    def uncensor(self, fills: np.ndarray, instruments: InstrumentSet, context: dict[str, Any] | None = None) -> np.ndarray:
        """Scale fills up by the expected human conversion rate.

        When a context is given, the rate is reduced by its 'contamination'
        fraction (0.0 if absent) before dividing.
        """
        if context is None:
            return fills / (self.cfg.human_base_prob + EPS)
        agent_frac = context.get('contamination', 0.0)
        return fills / (self.cfg.human_base_prob * (1 - agent_frac) + EPS)
@dataclass
class SeparableExecutionConfig:
    """Funnel step probabilities for humans and agents.

    A funnel left unset (or passed as any falsy value) is replaced by the
    built-in default in __post_init__.
    """

    human_funnel: Dict[str, float] = None
    agent_funnel: Dict[str, float] = None

    def __post_init__(self):
        """Fill in default funnels where none was supplied."""
        if not self.human_funnel:
            self.human_funnel = {'view_to_detail': 0.4, 'detail_to_cart': 0.3, 'cart_to_purchase': 0.6}
        if not self.agent_funnel:
            self.agent_funnel = {'view_to_detail': 0.8, 'detail_to_cart': 0.05, 'cart_to_purchase': 0.1}
class SeparableExecutionModel:
    """Execution with Markov funnel kernels using ground truth labels."""

    def __init__(self, cfg: SeparableExecutionConfig | None = None):
        """Store the config, using defaults when none is given."""
        self.cfg = cfg or SeparableExecutionConfig()

    def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet,
             market: MarketState | None, rng: np.random.Generator) -> float:
        """Return the end-to-end funnel conversion probability, clipped to [0, 1].

        The three funnel step probabilities are multiplied together; for humans
        the product is further scaled by exp(-0.5 * (price / ref - 1)).
        """
        agent = opp.context.get('is_agent', False)
        funnel = self.cfg.agent_funnel if agent else self.cfg.human_funnel
        conv = funnel['view_to_detail'] * funnel['detail_to_cart'] * funnel['cart_to_purchase']
        if agent:
            return float(np.clip(conv, 0, 1))
        idx = int(opp.instrument_id)
        price_ratio = quote.prices[idx] / (instruments.refs[idx] + EPS)
        conv *= np.exp(-0.5 * (price_ratio - 1.0))
        return float(np.clip(conv, 0, 1))

    def uncensor(self, fills: np.ndarray, instruments: InstrumentSet, context: dict[str, Any] | None = None) -> np.ndarray:
        """Scale fills up by the expected human funnel conversion."""
        funnel = self.cfg.human_funnel
        exp_conv = funnel['view_to_detail'] * funnel['detail_to_cart'] * funnel['cart_to_purchase']
        return fills / (exp_conv + EPS)

102
lab/case/thesis/metrics.py Normal file
View File

@@ -0,0 +1,102 @@
"""Thesis metrics for COI and behavioral analysis using ground truth labels."""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Dict
import numpy as np
from ...outlet.types import StepLogs, StepMetrics, Quote, InstrumentSet
from ...outlet.math_util import safe_log, EPS
@dataclass
class COIMetrics:
    """Cost-of-information figures computed for one step."""

    coi_level: float = 0.0
    coi_leakage: float = 0.0
    realized_premium: float = 0.0
    theoretical_max: float = 0.0
    erosion_rate: float = 0.0

    def to_dict(self) -> dict[str, float]:
        """Return the five metrics as a plain dict, in declaration order."""
        return {
            'coi_level': self.coi_level,
            'coi_leakage': self.coi_leakage,
            'realized_premium': self.realized_premium,
            'theoretical_max': self.theoretical_max,
            'erosion_rate': self.erosion_rate,
        }
def compute_coi(quote: Quote, instruments: InstrumentSet, metrics: StepMetrics, contamination: float) -> COIMetrics:
    """Compute cost-of-information metrics for one step.

    - coi_level: mean absolute margin (price - cost).
    - theoretical_max: mean unit cost.
    - realized_premium: profit per traded unit (0.0 when nothing traded).
    - coi_leakage: contamination * (coi_level + variance of price / ref).
    - erosion_rate: contamination * coi_level / theoretical_max.
    """
    costs = instruments.costs
    level = float(np.mean(quote.prices - costs))
    ceiling = float(np.mean(costs))
    if metrics.units_traded > 0:
        premium = (metrics.revenue - metrics.cost) / metrics.units_traded
    else:
        premium = 0.0
    dispersion = float(np.var(quote.prices / instruments.refs))
    return COIMetrics(
        coi_level=level,
        coi_leakage=contamination * (level + dispersion),
        realized_premium=premium,
        theoretical_max=ceiling,
        erosion_rate=contamination * level / (ceiling + EPS),
    )
@dataclass
class SeparabilityMetrics:
    """Session-level separability summary for one step."""

    classification_accuracy: float = 0.0
    estimated_alpha: float = 0.0  # share of sessions labelled as agents
    n_human_sessions: int = 0
    n_agent_sessions: int = 0
def compute_separability(logs: StepLogs, true_alpha: float) -> SeparabilityMetrics:
    """Compute separability using ground truth labels only.

    Sessions are keyed by the event's 'session_id' metadata (falling back to
    its opportunity id); the first event seen for a session fixes its label.
    With no events logged, only estimated_alpha is set, to true_alpha.
    """
    events = logs.events
    if events is None or len(events) == 0:
        return SeparabilityMetrics(estimated_alpha=true_alpha)

    labels: Dict[str, bool] = {}
    for evt in events:
        sid = evt.metadata.get('session_id', evt.opportunity_id)
        if sid in labels:
            continue
        labels[sid] = evt.metadata.get('is_agent', False)

    agents = sum(1 for flag in labels.values() if flag)
    humans = len(labels) - agents
    share = agents / len(labels) if labels else 0.0
    return SeparabilityMetrics(
        classification_accuracy=1.0,  # ground truth is always correct
        estimated_alpha=share,
        n_human_sessions=humans,
        n_agent_sessions=agents)
@dataclass
class RevenueAttribution:
    """Revenue and conversion split between humans and agents."""

    total_revenue: float = 0.0
    # revenue by attributed actor class
    human_revenue: float = 0.0
    agent_revenue: float = 0.0
    # conversions per expected arrival of each class
    human_conversion: float = 0.0
    agent_conversion: float = 0.0
def compute_attribution(logs: StepLogs, metrics: StepMetrics, contamination: float = 0.2,
                        agent_propensity_cutoff: float = 0.05) -> RevenueAttribution:
    """Split executed revenue and conversions between humans and agents.

    An execution is attributed to an agent when its propensity is below
    agent_propensity_cutoff, otherwise to a human. Conversion rates divide
    the execution counts by each class's expected share of arrivals.

    Args:
        logs: Step logs; 'n_arrivals' is read from logs.aggregates (1 if absent).
        metrics: Step metrics supplying the total revenue.
        contamination: Agent share of arrivals used for the denominators. The
            default 0.2 reproduces the previously hard-coded 80/20 split; pass
            the run's actual contamination when it differs.
        agent_propensity_cutoff: Propensity below which an execution counts as
            an agent's.

    Returns:
        RevenueAttribution; only total_revenue is set when no executions were logged.
    """
    if logs.executions is None:
        return RevenueAttribution(total_revenue=metrics.revenue)
    human_rev, agent_rev, human_cnt, agent_cnt = 0.0, 0.0, 0, 0
    for exe in logs.executions:
        if exe.propensity < agent_propensity_cutoff:
            agent_rev += exe.price * exe.size_filled
            agent_cnt += 1
        else:
            human_rev += exe.price * exe.size_filled
            human_cnt += 1
    total_exp = logs.aggregates.get('n_arrivals', 1)
    return RevenueAttribution(
        total_revenue=metrics.revenue, human_revenue=human_rev, agent_revenue=agent_rev,
        human_conversion=human_cnt / (total_exp * (1 - contamination) + EPS),
        agent_conversion=agent_cnt / (total_exp * contamination + EPS))
def order_statistic_erosion(n_agents: int, price_variance: float) -> float:
    """COI erosion from Theorem 1: as N->inf, min(p_1..p_N)->p_min.

    Returns 0.0 for a single agent or when log(n_agents) is below 1; otherwise
    the shift sigma * (sqrt(2 log n) - (log log n + log 4π) / (2 sqrt(2 log n))),
    expressed as a fraction of 2 * sigma and capped at 1.0.
    """
    if n_agents <= 1:
        return 0.0
    sigma = np.sqrt(price_variance)
    log_n = safe_log(n_agents)
    if log_n < 1:
        return 0.0
    leading = np.sqrt(2 * log_n)
    correction = (safe_log(log_n) + safe_log(4 * np.pi)) / (2 * np.sqrt(2 * log_n) + EPS)
    shift = sigma * (leading - correction)
    return float(min(shift / (sigma * 2 + EPS), 1.0))

View File

@@ -0,0 +1,228 @@
"""
Thesis-specific objectives implementing robust pricing under contamination.
Implements the Maximin objective from Eq 23:
π* = argmax_π min_{Q ∈ U_ε} E_d~Q[R(p,d) - λ·COI(p)]
Key components:
- COIObjective: Cost of Information penalty (Definition 1)
- RobustStackelbergObjective: Full maximin objective with Wasserstein robustness
- UXPenalty: User experience degradation from volatility
"""
from __future__ import annotations
from dataclasses import dataclass
import numpy as np
from ...outlet.objectives.base import BaseObjective, CompositeObjective
from ...outlet.types import Quote, InstrumentSet, StepMetrics, HiddenState, Observation
from ...outlet.math_util import safe_log, EPS
class COIObjective(BaseObjective):
    """Cost of Information penalty from Definition 1.

    COI(π) = E[P] - p_min

    The expected price premium over marginal cost represents the platform's
    pricing power. Agent reconnaissance erodes this by revealing the price
    distribution to buyers.

    Implemented as COI_leakage = α · InfoValue, where α is the contamination
    read from the hidden state.
    """

    def __init__(self, lambda_coi: float = 1.0, use_revelation: bool = False):
        """
        Args:
            lambda_coi: Weight on COI penalty
            use_revelation: If True, the info value is the mean relative
                deviation of quoted prices from reference prices; otherwise
                it is the constant 1.0.
        """
        self.lambda_coi = lambda_coi
        self.use_revelation = use_revelation

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return -lambda_coi * α * InfoValue."""
        if self.use_revelation:
            # revelation surrogate: prices far from reference reveal more about the policy
            relative_dev = np.abs(quote.prices - instruments.refs) / (instruments.refs + EPS)
            info_value = np.mean(relative_dev)
        else:
            # query-tax surrogate: each agent query incurs constant leakage
            info_value = 1.0
        return -self.lambda_coi * (hidden.contamination * info_value)

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return the penalty with the contamination and mean relative margin."""
        relative_margins = (quote.prices - instruments.costs) / (instruments.costs + EPS)
        penalty = self.reward(quote, instruments, metrics, hidden, obs)
        return {
            'coi_penalty': penalty,
            'contamination': hidden.contamination,
            'avg_margin': float(np.mean(relative_margins)),
        }
@dataclass
class RobustObjectiveConfig:
    """Weights for the robust Stackelberg objective.

    Attributes:
        lambda_coi: Weight on COI penalty (λ in Eq 23)
        lambda_ux: Weight on UX penalty
        lambda_volatility: Weight on price volatility penalty
        gamma_inventory: Inventory risk aversion
        wasserstein_epsilon: Ambiguity set radius (ε in Eq 21)
    """

    # penalty weights
    lambda_coi: float = 0.5
    lambda_ux: float = 0.1
    lambda_volatility: float = 0.2
    gamma_inventory: float = 0.1
    # robustness radius
    wasserstein_epsilon: float = 0.1
class RobustStackelbergObjective(BaseObjective):
    """Implements the Maximin Objective from thesis Eq 23.

    π* = argmax_π min_{Q ∈ U_ε(P̂_N)} E_d~Q[R(p,d) - λ·COI(p)]

    The reward is step profit minus five penalties:
    1. COI: λ_coi · average margin · contamination
    2. Volatility: λ_vol · price volatility
    3. Position: γ · position cost
    4. Lost opportunity: 0.1 · lost opportunity
    5. Robustness: ε · average margin · min(contamination + ε, 1)

    The min over the ambiguity set U_ε is approximated by the robustness term,
    which charges the margin at a worst-case contamination inside the ε-ball.
    """

    def __init__(self, cfg: RobustObjectiveConfig | None = None):
        """Store the config, using defaults when none is given."""
        self.cfg = cfg or RobustObjectiveConfig()

    @staticmethod
    def _avg_margin(quote: Quote, instruments: InstrumentSet) -> float:
        """Mean absolute margin (price - cost) across instruments."""
        return float(np.mean(quote.prices - instruments.costs))

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return profit minus the COI, volatility, position, lost and robustness penalties."""
        cfg = self.cfg
        alpha = hidden.contamination
        avg_margin = self._avg_margin(quote, instruments)

        profit = metrics.revenue - metrics.cost
        # high margins + high contamination = high leakage
        coi_penalty = cfg.lambda_coi * avg_margin * alpha
        volatility_penalty = cfg.lambda_volatility * metrics.volatility
        position_penalty = cfg.gamma_inventory * metrics.position_cost
        lost_penalty = 0.1 * metrics.lost_opportunity
        # worst-case contamination within the ε-ball
        worst_case_alpha = min(alpha + cfg.wasserstein_epsilon, 1.0)
        robustness_penalty = cfg.wasserstein_epsilon * avg_margin * worst_case_alpha

        return profit - coi_penalty - volatility_penalty - position_penalty - lost_penalty - robustness_penalty

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return each reward component (penalties as negative numbers) plus diagnostics."""
        cfg = self.cfg
        alpha = hidden.contamination
        avg_margin = self._avg_margin(quote, instruments)
        worst_case_alpha = min(alpha + cfg.wasserstein_epsilon, 1.0)
        mean_cost = float(np.mean(instruments.costs))
        return {
            'revenue': metrics.revenue,
            'cost': metrics.cost,
            'profit': metrics.revenue - metrics.cost,
            'coi_penalty': -cfg.lambda_coi * avg_margin * alpha,
            'volatility_penalty': -cfg.lambda_volatility * metrics.volatility,
            'position_penalty': -cfg.gamma_inventory * metrics.position_cost,
            'lost_penalty': -0.1 * metrics.lost_opportunity,
            'robustness_penalty': -cfg.wasserstein_epsilon * avg_margin * worst_case_alpha,
            'contamination': alpha,
            'avg_margin_pct': avg_margin / (mean_cost + EPS),
        }
class UXPenalty(BaseObjective):
    """User experience penalty from price volatility.

    Volatility up to max_acceptable_volatility is free; beyond it the penalty
    grows with the square of the excess, scaled by scale.
    """

    def __init__(self, scale: float = 1.0, max_acceptable_volatility: float = 0.1):
        """Store the penalty scale and the volatility tolerated without penalty."""
        self.scale = scale
        self.max_vol = max_acceptable_volatility

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return -scale * (excess volatility)^2."""
        overshoot = max(0, metrics.volatility - self.max_vol)
        squared = overshoot ** 2
        return -self.scale * squared

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return the penalty together with the raw volatility."""
        penalty = self.reward(quote, instruments, metrics, hidden, obs)
        return {'ux_penalty': penalty, 'volatility': metrics.volatility}
class AdaptiveObjective(BaseObjective):
    """Objective whose COI weight grows with contamination.

    At low contamination the weight stays near base_lambda_coi, so the reward
    is close to plain profit; at high contamination it approaches max_lambda_coi.
    """

    def __init__(self, base_lambda_coi: float = 0.3, max_lambda_coi: float = 2.0,
                 adaptation_rate: float = 2.0):
        """Store the lower and upper COI weights and the sigmoid steepness."""
        self.base_lambda = base_lambda_coi
        self.max_lambda = max_lambda_coi
        self.rate = adaptation_rate

    def _adaptive_lambda(self, alpha: float) -> float:
        """Return λ(α) = base + (max - base) * sigmoid(rate * (α - 0.3))."""
        from ...outlet.math_util import sigmoid
        span = self.max_lambda - self.base_lambda
        return self.base_lambda + span * sigmoid(self.rate * (alpha - 0.3))

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return profit minus λ(α) * mean margin * α."""
        alpha = hidden.contamination
        weight = self._adaptive_lambda(alpha)
        profit = metrics.revenue - metrics.cost
        mean_margin = float(np.mean(quote.prices - instruments.costs))
        return profit - weight * mean_margin * alpha

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return profit, the current adaptive weight and the contamination."""
        alpha = hidden.contamination
        profit = metrics.revenue - metrics.cost
        return {
            'profit': profit,
            'adaptive_lambda': self._adaptive_lambda(alpha),
            'contamination': alpha,
        }
def make_thesis_objective(lambda_coi: float = 0.5, lambda_ux: float = 0.1,
                          lambda_vol: float = 0.2) -> CompositeObjective:
    """Create the standard thesis objective composition.

    Wraps a single RobustStackelbergObjective, built from the given weights,
    in a CompositeObjective with weight 1.0.
    """
    robust_cfg = RobustObjectiveConfig(
        lambda_coi=lambda_coi, lambda_ux=lambda_ux, lambda_volatility=lambda_vol)
    robust = RobustStackelbergObjective(robust_cfg)
    return CompositeObjective([(robust, 1.0)])

176
lab/case/thesis/platform.py Normal file
View File

@@ -0,0 +1,176 @@
"""Thesis platform with real MDP behavioral models and separability scoring."""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
import numpy as np
from ...outlet import (Platform, PlatformConfig, PositionModel, PositionConfig,
PostedPriceMechanism, make_instruments, InstrumentType, LogLevel)
from ...outlet.mechanisms.posted_price import PostedPriceConfig
from ...outlet.observation import DefaultObservationBuilder, ObservationConfig
from .arrivals import ContaminatedArrivalModel, ContaminatedArrivalConfig
from .execution import HybridExecutionModel, HybridExecutionConfig
from .objectives import RobustStackelbergObjective, RobustObjectiveConfig
@dataclass
class ThesisConfig:
    """Top-level settings for the thesis platform.

    The behavioural data directories default to empty strings, which
    _resolve_data_dirs resolves relative to the repository's experiments/
    folder; set them explicitly to load data from elsewhere.
    """

    # instruments
    n_instruments: int = 10
    cost_range: tuple[float, float] = (5.0, 50.0)
    margin_range: tuple[float, float] = (0.2, 0.5)
    # contamination (Section 3.1)
    alpha_contamination: float = 0.2
    alpha_drift: float = 0.0
    alpha_bounds: tuple[float, float] = (0.0, 0.5)
    # objectives (Eq 23)
    lambda_coi: float = 0.5
    lambda_ux: float = 0.1
    lambda_volatility: float = 0.2
    wasserstein_epsilon: float = 0.1
    # arrivals
    sessions_per_step: int = 30
    human_views_range: tuple[int, int] = (1, 4)
    agent_views_range: tuple[int, int] = (3, 10)
    # inventory
    initial_inventory: float = 100.0
    holding_cost_rate: float = 0.002
    # real behavioral models (from sim.rl)
    use_real_behavior: bool = True
    use_separability: bool = False  # disabled until classifier trained
    # empty -> resolved against the repository checkout rather than one developer's home directory
    human_data_dir: str = ""
    agent_data_dir: str = ""
    # simulation
    max_steps: int = 500
    seed: int | None = 24
    log_level: LogLevel = LogLevel.AGG_ONLY
def _resolve_data_dirs(cfg: ThesisConfig) -> tuple[str, str]:
"""Resolve data directories for behavioral models."""
base = Path(__file__).parent.parent.parent.parent / "experiments"
human = cfg.human_data_dir or str(base / "collected_data")
agent = cfg.agent_data_dir or str(base / "agents/collected_data")
return human, agent
def make_thesis_platform(cfg: ThesisConfig | None = None) -> Platform:
    """Assemble the thesis Platform from a ThesisConfig (defaults if omitted).

    Components wired together:
    - SKU instruments with every position set to the initial inventory
    - ContaminatedArrivalModel fed with the resolved behaviour data directories
    - HybridExecutionModel carrying the use_separability flag
    - PostedPriceMechanism (max 15% price move, min 5% margin)
    - RobustStackelbergObjective (Eq 23)
    - Observations and platform config with true demand masked; no market model
    """
    cfg = cfg or ThesisConfig()
    rng = np.random.default_rng(cfg.seed)
    human_dir, agent_dir = _resolve_data_dirs(cfg)

    instruments = make_instruments(
        n=cfg.n_instruments, cost_range=cfg.cost_range, margin_range=cfg.margin_range,
        inst_type=InstrumentType.SKU, rng=rng)
    instruments.position = np.full(cfg.n_instruments, cfg.initial_inventory)

    arrival_cfg = ContaminatedArrivalConfig(
        base_rate=cfg.sessions_per_step,
        alpha_contamination=cfg.alpha_contamination,
        alpha_drift=cfg.alpha_drift,
        alpha_bounds=cfg.alpha_bounds,
        human_views_range=cfg.human_views_range,
        agent_views_range=cfg.agent_views_range,
        use_real_behavior=cfg.use_real_behavior,
        human_data_dir=human_dir,
        agent_data_dir=agent_dir,
    )
    execution_cfg = HybridExecutionConfig(use_separability=cfg.use_separability)
    mechanism_cfg = PostedPriceConfig(max_delta_pct=0.15, min_margin_pct=0.05)
    position_cfg = PositionConfig(initial_position=cfg.initial_inventory,
                                  holding_cost_rate=cfg.holding_cost_rate)
    objective_cfg = RobustObjectiveConfig(
        lambda_coi=cfg.lambda_coi, lambda_ux=cfg.lambda_ux,
        lambda_volatility=cfg.lambda_volatility, wasserstein_epsilon=cfg.wasserstein_epsilon)
    platform_cfg = PlatformConfig(n_instruments=cfg.n_instruments, max_steps=cfg.max_steps,
                                  seed=cfg.seed, log_level=cfg.log_level, mask_demand=True)

    return Platform(
        instruments=instruments,
        mechanism=PostedPriceMechanism(mechanism_cfg),
        arrival=ContaminatedArrivalModel(arrival_cfg),
        execution=HybridExecutionModel(execution_cfg),
        position=PositionModel(position_cfg),
        market=None,
        obs_builder=DefaultObservationBuilder(ObservationConfig(mask_true_demand=True)),
        objective=RobustStackelbergObjective(objective_cfg),
        cfg=platform_cfg)
@dataclass
class AblationConfig(ThesisConfig):
    """ThesisConfig plus switches that turn individual components off."""

    disable_coi_penalty: bool = False  # sets lambda_coi to 0
    disable_ux_penalty: bool = False  # sets lambda_ux to 0
    disable_contamination: bool = False  # sets alpha_contamination to 0
    disable_real_behavior: bool = False  # turns off real behaviour and separability
def make_ablation_platform(cfg: AblationConfig) -> Platform:
    """Build a thesis platform with the components disabled by ``cfg`` switched off.

    The ablation switches are translated into overrides of the base settings:

    - ``disable_coi_penalty``   -> ``lambda_coi = 0.0``
    - ``disable_ux_penalty``    -> ``lambda_ux = 0.0``
    - ``disable_contamination`` -> ``alpha_contamination = 0.0``
    - ``disable_real_behavior`` -> ``use_real_behavior = False`` and
      ``use_separability = False``

    Args:
        cfg: Ablation configuration.  It is NOT modified; overrides are applied
            to a copy so the same config object can be reused across runs.

    Returns:
        The platform produced by ``make_thesis_platform`` for the effective
        (overridden) configuration.
    """
    from dataclasses import replace

    overrides: dict[str, object] = {}
    if cfg.disable_coi_penalty:
        overrides['lambda_coi'] = 0.0
    if cfg.disable_ux_penalty:
        overrides['lambda_ux'] = 0.0
    if cfg.disable_contamination:
        overrides['alpha_contamination'] = 0.0
    if cfg.disable_real_behavior:
        overrides['use_real_behavior'] = False
        overrides['use_separability'] = False
    # Copy instead of mutating the caller's config: the previous in-place
    # assignment leaked the ablation into every later use of ``cfg``.
    effective = replace(cfg, **overrides) if overrides else cfg
    return make_thesis_platform(effective)
def sweep_contamination(alpha_values: list[float], base_cfg: ThesisConfig | None = None,
                        n_steps: int = 100, seed: int = 42) -> dict[float, dict]:
    """Test performance across contamination levels (Theorem 1 validation).

    For every value in ``alpha_values`` a fresh platform is built from
    ``base_cfg`` with ``alpha_contamination`` replaced, and a fixed-price
    policy (quoting the platform's reference prices) is rolled out on it.

    Args:
        alpha_values: Contamination levels to evaluate.  Duplicate values
            overwrite each other in the result dict.
        base_cfg: Template configuration (defaults to ``ThesisConfig()``).
            It is not modified.  Subclasses of ``ThesisConfig`` are accepted.
        n_steps: Number of steps per rollout.
        seed: Seed passed to each rollout.

    Returns:
        Dict mapping each alpha to ``total_reward``, ``total_pnl``,
        ``avg_conversion`` and ``final_contamination``.
    """
    from dataclasses import replace
    from ...experiments.eval import rollout, fixed_price_policy
    results = {}
    base_cfg = base_cfg or ThesisConfig()
    for alpha in alpha_values:
        # ``replace`` copies every dataclass field and keeps the concrete type;
        # rebuilding via ``ThesisConfig(**base_cfg.__dict__)`` raised TypeError
        # for subclasses that add fields (e.g. AblationConfig).
        cfg = replace(base_cfg, alpha_contamination=alpha)
        platform = make_thesis_platform(cfg)
        policy = fixed_price_policy(platform.instruments.refs)
        result = rollout(platform, policy, n_steps, seed=seed)
        results[alpha] = {
            'total_reward': result.total_reward,
            'total_pnl': result.total_pnl,
            'avg_conversion': result.avg_conversion,
            # read from the platform's hidden state after the rollout
            'final_contamination': platform._hidden.contamination,
        }
    return results
def sweep_behavior_modes(base_cfg: ThesisConfig | None = None, n_steps: int = 100, seed: int = 42) -> dict[str, dict]:
    """Compare real vs synthetic behavioral models.

    Three variants of ``base_cfg`` are evaluated with a fixed-price policy:

    - ``real_mdp``:        ``use_real_behavior=True``,  ``use_separability=True``
    - ``synthetic``:       ``use_real_behavior=False``, ``use_separability=False``
    - ``real_mdp_no_sep``: ``use_real_behavior=True``,  ``use_separability=False``

    Args:
        base_cfg: Template configuration (defaults to ``ThesisConfig()``).
            It is not modified.  Subclasses of ``ThesisConfig`` are accepted.
        n_steps: Number of steps per rollout.
        seed: Seed passed to each rollout.

    Returns:
        Dict mapping mode name to ``total_reward``, ``total_pnl`` and
        ``avg_conversion``.
    """
    from dataclasses import replace
    from ...experiments.eval import rollout, fixed_price_policy
    base_cfg = base_cfg or ThesisConfig()
    # ``replace`` keeps the concrete config type; the former
    # ``ThesisConfig(**base_cfg.__dict__)`` failed for subclasses with extra fields.
    modes = {
        'real_mdp': replace(base_cfg, use_real_behavior=True, use_separability=True),
        'synthetic': replace(base_cfg, use_real_behavior=False, use_separability=False),
        'real_mdp_no_sep': replace(base_cfg, use_real_behavior=True, use_separability=False),
    }
    results = {}
    for name, cfg in modes.items():
        platform = make_thesis_platform(cfg)
        policy = fixed_price_policy(platform.instruments.refs)
        result = rollout(platform, policy, n_steps, seed=seed)
        results[name] = {
            'total_reward': result.total_reward,
            'total_pnl': result.total_pnl,
            'avg_conversion': result.avg_conversion,
        }
    return results

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python
"""Thesis simulation experiments with real MDP behavioral models."""
from __future__ import annotations
import sys
from pathlib import Path
if __name__ == '__main__':
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
from lab.case.thesis.platform import make_thesis_platform, ThesisConfig
from lab.case.thesis.metrics import compute_coi, compute_separability
from lab.experiments.eval import compare_policies
import numpy as np
def demo_basic_simulation():
    """Run one episode with randomly perturbed prices and print a COI report.

    Builds a 5-instrument thesis platform, quotes each step at the reference
    prices scaled by a random factor in [0.95, 1.15), and prints reward, PnL,
    COI level, contamination and conversion every 20 steps, followed by
    episode totals.  The price perturbations come from a generator seeded with
    ``cfg.seed`` so repeated runs draw the same actions.
    """
    print("=" * 70)
    print("THESIS SIMULATION: Contaminated Dynamic Pricing (Real MDP Kernels)")
    print("=" * 70)
    cfg = ThesisConfig(n_instruments=5, alpha_contamination=0.3, lambda_coi=0.5,
                       max_steps=100, seed=42, use_real_behavior=True)
    platform = make_thesis_platform(cfg)
    # Dedicated seeded generator: the global ``np.random`` state used before
    # made the action sequence differ between runs despite ``seed=42``.
    rng = np.random.default_rng(cfg.seed)
    print(f"\nInstruments: {platform.instruments.n}")
    print(f"Reference prices: {platform.instruments.refs.round(2)}")
    print(f"Costs: {platform.instruments.costs.round(2)}")
    print(f"Initial contamination alpha={cfg.alpha_contamination}")
    print(f"Using real behavior: {cfg.use_real_behavior}")
    platform.reset(seed=cfg.seed)
    total_reward, coi_history = 0, []
    print(f"\n{'Step':>5} {'Reward':>10} {'PnL':>10} {'COI':>8} {'alpha':>6} {'Conv':>8}")
    print("-" * 55)
    for t in range(cfg.max_steps):
        action = platform.instruments.refs * rng.uniform(0.95, 1.15, size=platform.instruments.n)
        result = platform.step(action)
        total_reward += result.reward
        coi = compute_coi(platform._quote, platform.instruments, result.metrics, result.hidden.contamination)
        coi_history.append(coi.coi_level)
        if t % 20 == 0:
            print(f"{t:5d} {result.reward:10.2f} {result.metrics.pnl:10.2f} "
                  f"{coi.coi_level:8.2f} {result.hidden.contamination:6.2f} {result.metrics.conversion:8.3f}")
        # Do not keep stepping an episode the platform has already ended.
        if result.terminated or result.truncated:
            break
    print("-" * 55)
    print(f"Total Reward: {total_reward:.2f}")
    print(f"Average COI: {np.mean(coi_history):.2f}")
    print(f"COI Trend: {coi_history[-1] - coi_history[0]:+.2f}")
def demo_contamination_sweep():
    """Sweep 20 contamination levels in [0, 1) and print a results table.

    After the table, the Pearson correlation between the contamination level
    and the total reward is printed as a one-number trend summary.
    """
    print("\n" + "=" * 70)
    print("EXPERIMENT: COI Erosion vs Contamination (Theorem 1)")
    print("=" * 70)
    from lab.case.thesis.platform import sweep_contamination
    n_levels = 20
    alpha_values = [step / n_levels for step in range(n_levels)]
    results = sweep_contamination(alpha_values, n_steps=100, seed=42)
    print(f"\n{'alpha':>6} {'Reward':>12} {'PnL':>12} {'Conv':>10}")
    print("-" * 45)
    ordered = sorted(results.items())
    for alpha, row in ordered:
        print(f"{alpha:6.2f} {row['total_reward']:12.2f} {row['total_pnl']:12.2f} {row['avg_conversion']:10.3f}")
    # alpha_values is already ascending, so it lines up with the sorted rows
    rewards = [row['total_reward'] for _, row in ordered]
    pairs = np.array(list(zip(alpha_values, rewards)))
    alphas_col, rewards_col = pairs[:, 0], pairs[:, 1]
    trend = np.corrcoef(alphas_col, rewards_col)[0, 1]
    print(f"Trend (alpha~reward correlation): {trend:.3f}")
def demo_policy_comparison():
    """Benchmark four hand-written pricing policies on one contaminated platform.

    Policies: quote at reference prices (``fixed``), at 1.3x (``aggressive``),
    at 1.05x (``conservative``), and at a markup that grows with the conversion
    computed from the observation (``adaptive``).  Results are printed sorted
    by mean reward, best first.
    """
    print("\n" + "=" * 70)
    print("EXPERIMENT: Policy Comparison under Contamination")
    print("=" * 70)
    cfg = ThesisConfig(n_instruments=5, alpha_contamination=0.25, max_steps=100, seed=42)
    platform = make_thesis_platform(cfg)

    def _scaled(factor):
        # policy quoting the current reference prices times a constant factor
        def policy(obs, t):
            return platform.instruments.refs * factor, 1.0
        return policy

    def fixed_policy(obs, t):
        return platform.instruments.refs.copy(), 1.0

    def adaptive_policy(obs, t):
        k = platform.instruments.n
        # second and third length-k slices of the flat observation
        fills = obs[k:2*k]
        exp = obs[2*k:3*k]
        conv = np.sum(fills) / (np.sum(exp) + 1e-8)
        return platform.instruments.refs * (1.0 + 0.2 * conv), 1.0

    policies = {
        'fixed': fixed_policy,
        'aggressive': _scaled(1.3),
        'conservative': _scaled(1.05),
        'adaptive': adaptive_policy,
    }
    results = compare_policies(platform, policies, n_steps=100, n_runs=3, seed=42)
    print(f"\n{'Policy':>15} {'Reward':>12} {'Std':>10} {'PnL':>12} {'Conv':>10}")
    print("-" * 65)
    ranked = sorted(results.items(), key=lambda item: -item[1]['mean_reward'])
    for name, stats in ranked:
        print(f"{name:>15} {stats['mean_reward']:12.2f} {stats['std_reward']:10.2f} "
              f"{stats['mean_pnl']:12.2f} {stats['mean_conversion']:10.3f}")
def demo_session_analysis():
    """Analyze session-level behavior from MDP trajectories.

    Runs one episode with full event logging at a constant 1.1x markup over the
    reference prices, accumulates the human/agent session counts reported by
    ``compute_separability`` for each step, and prints the observed agent share
    next to the configured contamination level.
    """
    print("\n" + "=" * 70)
    print("EXPERIMENT: Session Analysis (Ground Truth)")
    print("=" * 70)
    from lab.outlet.constants import LogLevel
    cfg = ThesisConfig(n_instruments=5, alpha_contamination=0.3, max_steps=50,
                       log_level=LogLevel.FULL, seed=42, use_real_behavior=True)
    platform = make_thesis_platform(cfg)
    platform.reset(seed=cfg.seed)
    human_sessions, agent_sessions = 0, 0
    for t in range(cfg.max_steps):
        action = platform.instruments.refs * 1.1
        result = platform.step(action)
        sep = compute_separability(result.logs, result.hidden.contamination)
        human_sessions += sep.n_human_sessions
        agent_sessions += sep.n_agent_sessions
        # Do not keep stepping an episode the platform has already ended.
        if result.terminated or result.truncated:
            break
    total = human_sessions + agent_sessions
    # Avoid ZeroDivisionError when no session was logged at all; the shares
    # then print as 0.
    denom = total or 1
    print(f"\nTotal sessions: {total}")
    print(f"Human sessions: {human_sessions} ({100*human_sessions/denom:.1f}%)")
    print(f"Agent sessions: {agent_sessions} ({100*agent_sessions/denom:.1f}%)")
    print(f"True contamination: {cfg.alpha_contamination:.1%}")
    print(f"Observed contamination: {agent_sessions/denom:.1%}")
if __name__ == '__main__':
    # Script entry point: only the basic simulation and the contamination
    # sweep run by default; the two demos below are currently disabled.
    demo_basic_simulation()
    demo_contamination_sweep()
    # demo_policy_comparison()
    # demo_session_analysis()

156
lab/config.py Normal file
View File

@@ -0,0 +1,156 @@
"""
Configuration and factory functions for creating pre-configured platforms.
This module provides:
- RetailConfig, MarketMakingConfig: Configuration dataclasses
- make_retail_platform: Factory for retail dynamic pricing scenarios
- make_market_making_platform: Factory for market making scenarios
Example:
>>> from lab.config import make_retail_platform, RetailConfig
>>> platform = make_retail_platform(RetailConfig(n_instruments=5))
>>> result = platform.reset(seed=42)
"""
from __future__ import annotations
from dataclasses import dataclass
import numpy as np
from .outlet import (Platform, PlatformConfig, PositionModel, PositionConfig,
PostedPriceMechanism, TwoSidedMechanism, make_instruments,
InstrumentType, LogLevel)
from .outlet.mechanisms.posted_price import PostedPriceConfig
from .outlet.mechanisms.two_sided import TwoSidedConfig
from .population import (SessionArrivalModel, PoissonArrivalModel, HawkesArrivalModel,
ElasticityExecutionModel, IntensityExecutionModel,
ReactiveCompetitorModel, GBMMarketModel)
from .population.arrivals import SessionArrivalConfig, PoissonArrivalConfig, HawkesArrivalConfig
from .population.execution import ElasticityConfig, IntensityConfig
from .population.competitors import ReactiveCompetitorConfig, GBMMarketConfig
from .outlet.objectives.factory import retail_objective, market_making_objective
@dataclass
class RetailConfig:
    """Settings for the retail dynamic-pricing scenario.

    Attributes:
        n_instruments: How many products are priced.
        cost_range: ``(min, max)`` bounds for randomly drawn product costs.
        margin_range: ``(min, max)`` bounds for randomly drawn initial margins.
        initial_inventory: Inventory each product starts with.
        holding_cost_rate: Holding cost per unit per step.
        sessions_per_step: Browsing sessions generated each step.
        contamination: Share of sessions that are scrapers.
        max_steps: Upper bound on episode length.
        seed: Random seed for reproducibility (``None`` leaves it unseeded).
    """
    n_instruments: int = 10
    cost_range: tuple[float, float] = (5.0, 50.0)
    margin_range: tuple[float, float] = (0.2, 0.5)
    initial_inventory: float = 100.0
    holding_cost_rate: float = 0.002
    sessions_per_step: int = 30
    contamination: float = 0.1
    max_steps: int = 500
    seed: int | None = None
def make_retail_platform(cfg: RetailConfig | None = None) -> Platform:
    """Assemble a ready-to-use retail dynamic-pricing platform.

    Wiring:
        - Mechanism: PostedPriceMechanism (one price per product)
        - Arrivals: SessionArrivalModel (browsing sessions with views)
        - Execution: ElasticityExecutionModel (price sensitivity)
        - Market: ReactiveCompetitorModel (can trigger price wars)
        - Objective: ``retail_objective()``
          (PnL - holding_cost - volatility - lost_opportunity)

    Args:
        cfg: Scenario settings; a default ``RetailConfig`` is used when omitted.

    Returns:
        The configured Platform instance.
    """
    if not cfg:
        cfg = RetailConfig()
    n = cfg.n_instruments

    # Instruments are drawn from a generator seeded with cfg.seed and start
    # with the same inventory level each.
    rng = np.random.default_rng(cfg.seed)
    instruments = make_instruments(n=n, cost_range=cfg.cost_range,
                                   margin_range=cfg.margin_range,
                                   inst_type=InstrumentType.SKU, rng=rng)
    instruments.position = np.full(n, cfg.initial_inventory)

    arrival_cfg = SessionArrivalConfig(sessions_per_step=cfg.sessions_per_step,
                                       contamination=cfg.contamination)
    position_cfg = PositionConfig(initial_position=cfg.initial_inventory,
                                  holding_cost_rate=cfg.holding_cost_rate)
    platform_cfg = PlatformConfig(n_instruments=n, max_steps=cfg.max_steps,
                                  seed=cfg.seed, log_level=LogLevel.AGG_ONLY)

    return Platform(
        instruments=instruments,
        mechanism=PostedPriceMechanism(PostedPriceConfig()),
        arrival=SessionArrivalModel(arrival_cfg),
        execution=ElasticityExecutionModel(ElasticityConfig()),
        position=PositionModel(position_cfg),
        market=ReactiveCompetitorModel(ReactiveCompetitorConfig(), refs=instruments.refs),
        objective=retail_objective(),
        cfg=platform_cfg,
    )
@dataclass
class MarketMakingConfig:
    """Settings for the market-making scenario.

    Attributes:
        n_instruments: How many assets are quoted.
        initial_mid: Starting mid-price of the assets.
        mu: Drift of the price process (expected return).
        sigma: Volatility of the price process.
        gamma: Inventory risk-aversion parameter.
        base_arrival_rate: Baseline order arrival rate (Hawkes baseline).
        max_steps: Upper bound on episode length.
        seed: Random seed for reproducibility (``None`` leaves it unseeded).
    """
    n_instruments: int = 5
    initial_mid: float = 100.0
    mu: float = 0.0
    sigma: float = 0.02
    gamma: float = 0.1
    base_arrival_rate: float = 20.0
    max_steps: int = 1000
    seed: int | None = None
def make_market_making_platform(cfg: MarketMakingConfig | None = None) -> Platform:
    """Assemble a ready-to-use market-making platform.

    Wiring:
        - Mechanism: TwoSidedMechanism (bid-ask spread quoting)
        - Arrivals: HawkesArrivalModel (clustered order flow)
        - Execution: IntensityExecutionModel (distance-based fills)
        - Market: GBMMarketModel (geometric Brownian motion mid-prices)
        - Objective: ``market_making_objective``
          (PnL + spread_capture - inventory_risk)

    Args:
        cfg: Scenario settings; a default ``MarketMakingConfig`` is used when omitted.

    Returns:
        The configured Platform instance.
    """
    if not cfg:
        cfg = MarketMakingConfig()
    n = cfg.n_instruments

    # Costs are drawn within +/-10% of the initial mid with zero margin;
    # every asset starts flat (zero position).
    rng = np.random.default_rng(cfg.seed)
    mid_band = (cfg.initial_mid*0.9, cfg.initial_mid*1.1)
    instruments = make_instruments(n=n, cost_range=mid_band, margin_range=(0.0, 0.0),
                                   inst_type=InstrumentType.ASSET, rng=rng)
    instruments.position = np.zeros(n)

    # Holding cost is zero here: the objective's inventory-risk penalty is
    # used instead.
    position_cfg = PositionConfig(initial_position=0.0, min_position=-500,
                                  max_position=500, holding_cost_rate=0.0)
    market_cfg = GBMMarketConfig(mu=cfg.mu, sigma=cfg.sigma)
    platform_cfg = PlatformConfig(n_instruments=n, max_steps=cfg.max_steps,
                                  seed=cfg.seed, log_level=LogLevel.AGG_ONLY)

    return Platform(
        instruments=instruments,
        mechanism=TwoSidedMechanism(TwoSidedConfig()),
        arrival=HawkesArrivalModel(HawkesArrivalConfig(base_rate=cfg.base_arrival_rate)),
        execution=IntensityExecutionModel(IntensityConfig()),
        position=PositionModel(position_cfg),
        market=GBMMarketModel(market_cfg, initial=instruments.refs),
        objective=market_making_objective(gamma=cfg.gamma, sigma=cfg.sigma),
        cfg=platform_cfg,
    )

12
lab/docs/Makefile Normal file
View File

@@ -0,0 +1,12 @@
# Makefile for building the Sphinx documentation in this directory.
# Variables assigned with ?= can be overridden from the environment or the
# command line.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# First (default) target: print sphinx-build's list of available builders.
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all: any other target name ($@) is forwarded to sphinx-build via -M.
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

39
lab/docs/conf.py Normal file
View File

@@ -0,0 +1,39 @@
# Sphinx configuration for the Quote-Control Simulator documentation.
import os
import sys
# Make the directory two levels up importable so autodoc can load the
# documented modules.
# NOTE(review): the relative path is resolved against the current working
# directory; presumably the build runs from lab/docs -- confirm.
sys.path.insert(0, os.path.abspath('../..'))
# -- Project information ------------------------------------------------------
project = 'Quote-Control Simulator'
copyright = '2025, PHANTOM Research'
author = 'PHANTOM Research'
release = '0.1.0'
# -- General configuration ----------------------------------------------------
# Sphinx extensions enabled for this build.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.napoleon',
    'sphinx.ext.viewcode',
    'sphinx.ext.intersphinx',
    'sphinx.ext.autosummary',
]
templates_path = ['_templates']
# Paths ignored when Sphinx looks for source files.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# -- HTML output --------------------------------------------------------------
html_theme = 'alabaster'
html_static_path = ['_static']
# -- Extension settings -------------------------------------------------------
# Defaults applied to every autodoc directive.
autodoc_default_options = {
    'members': True,
    'undoc-members': True,
    'show-inheritance': True,
}
# Accept both Google- and NumPy-style docstrings.
napoleon_google_docstring = True
napoleon_numpy_docstring = True
napoleon_include_init_with_doc = True
# External inventories used to resolve cross-references.
intersphinx_mapping = {
    'python': ('https://docs.python.org/3', None),
    'numpy': ('https://numpy.org/doc/stable/', None),
}
autosummary_generate = True

40
lab/docs/index.rst Normal file
View File

@@ -0,0 +1,40 @@
Quote-Control Simulator
=======================
Research-grade platform for dynamic pricing and market making experiments.
The platform abstracts pricing as: **Quote → Arrival → Execution → Position**
Supports multiple mechanisms:
* **PostedPrice**: retail dynamic pricing
* **TwoSided**: market making with bid-ask spreads
* **Auction**: reserve/shading for auction settings
Quick Start
-----------
.. code-block:: python
from lab.config import make_retail_platform
from lab.experiments import rollout, fixed_price_policy
platform = make_retail_platform()
policy = fixed_price_policy(platform.instruments.refs)
result = rollout(platform, policy, n_steps=100)
print(f"Total PnL: {result.total_pnl:.2f}")
.. toctree::
:maxdepth: 2
:caption: Contents:
system_overview
modules/outlet
modules/population
modules/experiments
Indices
-------
* :ref:`genindex`
* :ref:`modindex`

View File

@@ -0,0 +1,14 @@
Experiments
===========
Evaluation & OPE
----------------
.. automodule:: lab.experiments.eval
:members:
Configuration
-------------
.. automodule:: lab.config
:members:

View File

@@ -0,0 +1,77 @@
Outlet (Core Simulator)
=======================
Types
-----
.. automodule:: lab.outlet.types
:members:
Constants
---------
.. automodule:: lab.outlet.constants
:members:
Protocols
---------
.. automodule:: lab.outlet.protocols
:members:
Platform
--------
.. automodule:: lab.outlet.platform
:members:
Stock & Position
----------------
.. automodule:: lab.outlet.stock
:members:
Observation
-----------
.. automodule:: lab.outlet.observation
:members:
Mechanisms
----------
Posted Price
~~~~~~~~~~~~
.. automodule:: lab.outlet.mechanisms.posted_price
:members:
Two-Sided (Market Making)
~~~~~~~~~~~~~~~~~~~~~~~~~
.. automodule:: lab.outlet.mechanisms.two_sided
:members:
Auction
~~~~~~~
.. automodule:: lab.outlet.mechanisms.auction
:members:
Objectives
----------
.. automodule:: lab.outlet.objectives.base
:members:
.. automodule:: lab.outlet.objectives.penalties
:members:
.. automodule:: lab.outlet.objectives.factory
:members:
Math Utilities
--------------
.. automodule:: lab.outlet.math_util
:members:

View File

@@ -0,0 +1,20 @@
Population Models
=================
Arrival Models
--------------
.. automodule:: lab.population.arrivals
:members:
Execution Models
----------------
.. automodule:: lab.population.execution
:members:
Competitor / Market Models
--------------------------
.. automodule:: lab.population.competitors
:members:

View File

@@ -0,0 +1,97 @@
System Overview
===============
The simulator organises dynamic pricing and market-making experiments as a
closed loop with the following stages:
* **Quote**: a policy or agent emits a :class:`lab.outlet.types.Quote`. The
quote is normalised and validated by a concrete
:class:`lab.outlet.protocols.Mechanism` implementation
(posted-price, two-sided, auction).
* **Arrival**: a :class:`lab.outlet.protocols.ArrivalModel` samples a stream of
:class:`lab.outlet.types.Opportunity` objects given the current time,
instrument catalogue, and market state.
* **Execution**: the :class:`lab.outlet.protocols.ExecutionModel` converts an
opportunity into a probabilistic fill using the active quote, optional
competitor prices, and demand-side context.
* **Position**: a :class:`lab.outlet.protocols.PositionModel` enforces
inventory or position constraints, censors oversized fills, and accrues
holding and shortage costs.
* **Observation & Reward**: the
:class:`lab.outlet.protocols.ObservationBuilder` constructs the censored view
exposed to the agent, while a :class:`lab.outlet.protocols.Objective`
transforms :class:`lab.outlet.types.StepMetrics` into a scalar reward with an
optional breakdown per term.
These components are orchestrated by :class:`lab.outlet.platform.Platform`,
which manages internal hidden state, deterministic seeding, and logging.
Component Matrix
----------------
=============================== ==============================================
Layer Responsibilities / Examples
=============================== ==============================================
Mechanisms Quote normalisation, execution semantics
(`posted_price`, `two_sided`, `auction`).
Population models Arrivals (:mod:`lab.population.arrivals`),
execution probability models
(:mod:`lab.population.execution`), and
competitor or market dynamics
(:mod:`lab.population.competitors`).
Position management Inventory limits, replenishment, holding and
shortage costs (:mod:`lab.outlet.stock`).
Observation & logging Censored observations and optional event logs
(:mod:`lab.outlet.observation`).
Objectives Reward composition utilities
(:mod:`lab.outlet.objectives`).
Experiments Rollout helpers, baseline policies, off-policy
evaluation (:mod:`lab.experiments.eval`).
=============================== ==============================================
Preconfigured Platforms
-----------------------
Two high-level factories in :mod:`lab.config` wire common combinations of the
building blocks:
* **Retail dynamic pricing**: posted-price mechanism, session arrivals with
contamination, elasticity-based executions, reactive competitor model, and a
composite objective that penalises volatility, holding costs, and lost
opportunities.
* **Market making**: two-sided quoting, Hawkes order flow, intensity-based
executions, geometric Brownian motion mid-prices, and an objective combining
PnL, spread capture, and quadratic inventory risk.
State & Reset Behaviour
-----------------------
When you call :meth:`lab.outlet.platform.Platform.reset`, the platform resets
instrument positions, quotes, and hidden state, but component implementations
may maintain their own internal buffers. For reproducible experiments:
* Use freshly instantiated arrival/market models per episode, or add explicit
``reset`` methods if the model keeps history (for example,
:class:`lab.population.arrivals.HawkesArrivalModel` maintains an event
history, while :class:`lab.population.competitors.ReactiveCompetitorModel`
tracks prior competitor quotes).
* Seed randomness through the factory configuration (``RetailConfig.seed`` or
``MarketMakingConfig.seed``) or pass a seed to ``Platform.reset`` for
deterministic rollouts.
Extending the Platform
----------------------
To support a new domain:
1. Create custom Mechanism/Arrival/Execution/Market/Observation components by
implementing the respective protocol in :mod:`lab.outlet.protocols`.
2. Compose a new objective with
:func:`lab.outlet.objectives.factory.make_composite` or write a bespoke
:class:`lab.outlet.objectives.base.BaseObjective`.
3. Wire everything together via :class:`lab.outlet.platform.Platform` directly
or expose a helper factory in :mod:`lab.config`.
Use :func:`lab.experiments.rollout` and
:func:`lab.experiments.compare_policies` to benchmark candidate policies under
multiple random seeds, collecting per-step logs for analysis or OPE.

View File

@@ -0,0 +1,7 @@
from .eval import (rollout, RolloutResult, compare_policies, compute_ips, OPEResult,
fixed_price_policy, cost_plus_margin_policy, random_walk_policy, epsilon_greedy_policy)
__all__ = [
'rollout', 'RolloutResult', 'compare_policies', 'compute_ips', 'OPEResult',
'fixed_price_policy', 'cost_plus_margin_policy', 'random_walk_policy', 'epsilon_greedy_policy',
]

213
lab/experiments/eval.py Normal file
View File

@@ -0,0 +1,213 @@
"""
Evaluation utilities for policy testing and off-policy evaluation.
This module provides:
- rollout: Run a policy on the platform for multiple steps
- compare_policies: Compare multiple policies with statistics
- Baseline policies: fixed_price, cost_plus_margin, random_walk, epsilon_greedy
- OPE estimators: IPS and SNIPS for off-policy evaluation
Example:
>>> from lab.config import make_retail_platform
>>> from lab.experiments.eval import rollout, fixed_price_policy
>>> platform = make_retail_platform()
>>> policy = fixed_price_policy(platform.instruments.refs)
>>> result = rollout(platform, policy, n_steps=100)
>>> print(f"Total PnL: {result.total_pnl:.2f}")
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Callable, Any
import numpy as np
from ..outlet.platform import Platform
from ..outlet.types import StepResult, StepLogs, Quote
# Policy signature: takes (observation_flat, timestep) -> (action_prices, propensity)
Policy = Callable[[np.ndarray, int], tuple[np.ndarray, float]]
@dataclass
class RolloutResult:
    """Outcome of running one policy rollout.

    Attributes:
        rewards: Reward obtained at each step.
        metrics: StepMetrics object recorded at each step.
        logs: StepLogs object recorded at each step.
        total_reward: Sum of the per-step rewards.
        total_pnl: Sum of the per-step PnL taken from the metrics.
        avg_conversion: Mean of the per-step conversion rates.
    """
    rewards: list[float]
    metrics: list[Any]
    logs: list[StepLogs]
    total_reward: float
    total_pnl: float
    avg_conversion: float
def rollout(platform: Platform, policy: Policy, n_steps: int, seed: int | None = None) -> RolloutResult:
    """Execute a policy on the platform for up to ``n_steps`` steps.

    The platform is reset first; each step feeds the flattened observation of
    the previous result to the policy and applies the returned action.  The
    loop stops early once a step reports ``terminated`` or ``truncated``.

    Args:
        platform: The simulation platform.
        policy: Function ``(obs, t) -> (action, propensity)``.
        n_steps: Maximum number of steps to run.
        seed: Random seed passed to ``platform.reset``.

    Returns:
        RolloutResult with per-step rewards, metrics and logs plus summary
        statistics.  If no step was executed (``n_steps <= 0``) the totals and
        ``avg_conversion`` are 0.
    """
    result = platform.reset(seed)
    rewards, metrics, logs = [], [], []
    for t in range(n_steps):
        action, propensity = policy(result.obs.to_flat(), t)
        result = platform.step(action, propensity)
        rewards.append(result.reward)
        metrics.append(result.metrics)
        logs.append(result.logs)
        if result.terminated or result.truncated:
            break
    # np.mean([]) returns nan and emits a RuntimeWarning; report 0.0 for an
    # empty rollout instead so downstream averages are not poisoned.
    avg_conversion = float(np.mean([m.conversion for m in metrics])) if metrics else 0.0
    return RolloutResult(
        rewards=rewards, metrics=metrics, logs=logs,
        total_reward=sum(rewards),
        total_pnl=sum(m.pnl for m in metrics),
        avg_conversion=avg_conversion,
    )
# Baseline policies for comparison
def fixed_price_policy(refs: np.ndarray) -> Policy:
    """Build a policy that quotes the reference prices at every step.

    Each call returns a fresh copy of ``refs`` together with propensity 1.0;
    the observation and the timestep are ignored.
    """
    def _quote_refs(obs: np.ndarray, t: int) -> tuple[np.ndarray, float]:
        quote = refs.copy()
        return quote, 1.0

    return _quote_refs
def cost_plus_margin_policy(costs: np.ndarray, margin: float = 0.3) -> Policy:
    """Build a policy that always quotes ``costs * (1 + margin)``.

    The marked-up prices are computed once when the policy is created; each
    call returns a copy of them with propensity 1.0.
    """
    markup = 1 + margin
    prices = costs * markup

    def _quote_markup(obs: np.ndarray, t: int) -> tuple[np.ndarray, float]:
        quote = prices.copy()
        return quote, 1.0

    return _quote_markup
def random_walk_policy(refs: np.ndarray, volatility: float = 0.05,
                       rng: np.random.Generator | None = None) -> Policy:
    """Build a policy whose quotes follow a multiplicative random walk.

    Starting from ``refs``, every call multiplies the current prices by
    ``1 + N(0, volatility)`` noise (one draw per instrument) and clips the
    result to ``[0.5 * refs, 2.0 * refs]``.  The walk state persists between
    calls; the returned propensity is always 1.0.
    """
    if not rng:
        rng = np.random.default_rng()
    state = {'prices': refs.copy()}

    def _walk(obs: np.ndarray, t: int) -> tuple[np.ndarray, float]:
        current = state['prices']
        shock = rng.normal(0, volatility, len(current))
        stepped = current * (1 + shock)
        state['prices'] = np.clip(stepped, refs * 0.5, refs * 2.0)
        return state['prices'].copy(), 1.0

    return _walk
def epsilon_greedy_policy(base_policy: Policy, refs: np.ndarray,
                          epsilon: float = 0.1, rng: np.random.Generator | None = None) -> Policy:
    """Wrap ``base_policy`` with epsilon-greedy exploration.

    With probability ``epsilon`` the wrapper quotes ``refs`` scaled by
    independent uniform factors in [0.8, 1.2) and reports propensity
    ``epsilon / len(refs)``.  Otherwise it returns the base policy's action
    with propensity ``1 - epsilon`` (the base policy's own propensity is
    discarded).
    """
    if not rng:
        rng = np.random.default_rng()

    def _wrapped(obs: np.ndarray, t: int) -> tuple[np.ndarray, float]:
        explore = rng.random() < epsilon
        if not explore:
            greedy_action, _ = base_policy(obs, t)
            return greedy_action, 1 - epsilon
        n_refs = len(refs)
        scale = rng.uniform(0.8, 1.2, n_refs)
        return refs * scale, epsilon / n_refs

    return _wrapped
# Off-Policy Evaluation (OPE)
@dataclass
class OPEResult:
    """Outcome of an off-policy evaluation.

    Attributes:
        ips_estimate: Inverse Propensity Scoring (IPS) estimate.
        snips_estimate: Self-normalized IPS estimate (more stable).
        n_samples: How many samples went into the estimates.
        effective_samples: Effective sample size (accounts for weight variance).
    """
    ips_estimate: float
    snips_estimate: float
    n_samples: int
    effective_samples: float
def compute_ips(logs: list[StepLogs], rewards: list[float],
                target_policy: Policy, behavior_propensities: list[float] | None = None) -> OPEResult:
    """Compute IPS and SNIPS estimators for off-policy evaluation.

    Uses logged propensities to estimate the expected reward under a target
    policy from data collected under a behavior policy.  The target policy is
    assumed deterministic: its propensity is fixed at 1.0 for every step, so
    each importance weight is ``1 / (behavior_propensity + 1e-8)``.

    Args:
        logs: Step logs.  Only read when ``behavior_propensities`` is None; the
            per-step propensity is then the mean propensity of the step's
            executions, or 1.0 for a step without executions.
        rewards: Observed rewards from the behavior policy, one per step.
        target_policy: Policy to evaluate (currently unused, see above).
        behavior_propensities: Per-step propensities overriding the logs.

    Returns:
        OPEResult with IPS, SNIPS estimates and sample statistics.  For empty
        input all estimates are 0 and ``n_samples`` is 0.

    Raises:
        ValueError: If the number of propensities differs from the number of
            rewards.
    """
    if behavior_propensities is None:
        behavior_propensities = [
            np.mean([e.propensity for e in log.executions]) if log.executions else 1.0
            for log in logs
        ]
    propensities = np.asarray(behavior_propensities, dtype=float)
    reward_arr = np.asarray(rewards, dtype=float)
    # The weights used to be built by zipping logs with the propensities, which
    # silently truncated to the shorter sequence; require matching lengths.
    if propensities.shape != reward_arr.shape:
        raise ValueError(
            f"got {propensities.size} propensities for {reward_arr.size} rewards")
    n_samples = len(reward_arr)
    if n_samples == 0:
        # Nothing to average: avoid the 0/0 division of the plain IPS formula.
        return OPEResult(ips_estimate=0.0, snips_estimate=0.0,
                         n_samples=0, effective_samples=0.0)
    # target propensity would need obs reconstruction - simplified here
    target_propensity = 1.0  # assume deterministic target
    weights = target_propensity / (propensities + 1e-8)
    weighted_sum = np.sum(weights * reward_arr)
    weight_sum = np.sum(weights)
    # IPS estimate
    ips = weighted_sum / n_samples
    # SNIPS (self-normalized)
    snips = weighted_sum / (weight_sum + 1e-8)
    # effective sample size
    ess = (weight_sum ** 2) / (np.sum(weights ** 2) + 1e-8)
    return OPEResult(ips_estimate=float(ips), snips_estimate=float(snips),
                     n_samples=n_samples, effective_samples=float(ess))
def compare_policies(platform: Platform, policies: dict[str, Policy],
                     n_steps: int = 100, n_runs: int = 5, seed: int = 42) -> dict[str, dict]:
    """Roll out several policies repeatedly and summarise their performance.

    Every policy is run ``n_runs`` times on the same platform, using seeds
    ``seed, seed + 1, ...`` so all policies see the same seed sequence.

    Args:
        platform: Simulation platform.
        policies: Mapping from policy name to policy function.
        n_steps: Steps per rollout.
        n_runs: Rollouts per policy.
        seed: Base random seed.

    Returns:
        Mapping from policy name to a dict with ``mean_reward``,
        ``std_reward``, ``mean_pnl`` and ``mean_conversion``.
    """
    summary = {}
    for label, policy in policies.items():
        runs = [rollout(platform, policy, n_steps, seed=seed + k) for k in range(n_runs)]
        totals = [run.total_reward for run in runs]
        summary[label] = {
            'mean_reward': np.mean(totals),
            'std_reward': np.std(totals),
            'mean_pnl': np.mean([run.total_pnl for run in runs]),
            'mean_conversion': np.mean([run.avg_conversion for run in runs]),
        }
    return summary

17
lab/outlet/__init__.py Normal file
View File

@@ -0,0 +1,17 @@
from .constants import Side, MechanismType, InstrumentType, OpportunityType, EventType, LogLevel
from .types import (Instrument, InstrumentSet, Quote, Opportunity, Execution,
StepEvent, StepLogs, StepMetrics, MarketState, HiddenState, Observation, StepResult)
from .stock import PositionModel, PositionConfig, make_instruments
from .platform import Platform, PlatformConfig
from .observation import DefaultObservationBuilder, ObservationConfig
from .mechanisms import PostedPriceMechanism, TwoSidedMechanism, AuctionMechanism
__all__ = [
'Side', 'MechanismType', 'InstrumentType', 'OpportunityType', 'EventType', 'LogLevel',
'Instrument', 'InstrumentSet', 'Quote', 'Opportunity', 'Execution',
'StepEvent', 'StepLogs', 'StepMetrics', 'MarketState', 'HiddenState', 'Observation', 'StepResult',
'PositionModel', 'PositionConfig', 'make_instruments',
'Platform', 'PlatformConfig',
'DefaultObservationBuilder', 'ObservationConfig',
'PostedPriceMechanism', 'TwoSidedMechanism', 'AuctionMechanism',
]

83
lab/outlet/constants.py Normal file
View File

@@ -0,0 +1,83 @@
"""
Constants and enumerations for the Quote-Control simulator.
This module defines the core enums used throughout the platform to ensure
type safety and consistent semantics across different pricing mechanisms.
"""
from enum import Enum, auto
class Side(Enum):
    """Which side initiated a transaction.

    Attributes:
        BUY: Initiated by the buyer (customer purchase, market buy order).
        SELL: Initiated by the seller (market sell order, short sale).
    """
    BUY = auto()
    SELL = auto()
class MechanismType(Enum):
    """Kind of pricing mechanism, i.e. how quotes turn into executions.

    Attributes:
        POSTED_PRICE: One posted price per instrument (retail dynamic pricing).
        TWO_SIDED_QUOTE: Bid-ask quoting (market making, liquidity provision).
        AUCTION: Reserve price or bid shading (ad auctions, marketplaces).
    """
    POSTED_PRICE = auto()
    TWO_SIDED_QUOTE = auto()
    AUCTION = auto()
class InstrumentType(Enum):
    """Kind of instrument that is being priced.

    Attributes:
        SKU: Retail product subject to inventory constraints.
        ASSET: Financial instrument subject to position limits.
        LOAN: Credit product priced through its interest rate.
        SUBSCRIPTION: Recurring service charged periodically.
    """
    SKU = auto()
    ASSET = auto()
    LOAN = auto()
    SUBSCRIPTION = auto()
class OpportunityType(Enum):
    """Kind of opportunity produced by an arrival.

    Attributes:
        SESSION: Retail browsing session that may lead to a purchase.
        MARKET_ORDER: Arrival of a financial market order (buy or sell).
        REQUEST: Service or credit request that needs a quote in response.
    """
    SESSION = auto()
    MARKET_ORDER = auto()
    REQUEST = auto()
class EventType(Enum):
    """Kind of event recorded in the simulation log.

    Attributes:
        ARRIVAL: A new opportunity entered the system.
        EXPOSURE: A quote was shown to an arrival.
        EXECUTION: A transaction was executed.
        ABANDON: An opportunity left without an execution.
        CANCEL: A pending order was cancelled.
    """
    ARRIVAL = auto()
    EXPOSURE = auto()
    EXECUTION = auto()
    ABANDON = auto()
    CANCEL = auto()
class LogLevel(Enum):
    """How much is logged per simulation step.

    Attributes:
        NONE: Nothing is logged (fastest execution).
        AGG_ONLY: Only per-step aggregate statistics.
        FULL: Event-level logs including propensities for OPE.
    """
    NONE = auto()
    AGG_ONLY = auto()
    FULL = auto()

86
lab/outlet/gym_wrapper.py Normal file
View File

@@ -0,0 +1,86 @@
"""
Gymnasium-compatible wrapper for the Quote-Control platform.
Provides a standard Gym interface for RL training:
- observation_space: Box space with flattened observation
- action_space: Box space with price multipliers [0.5, 2.0]
- reset(), step(), render(), close() methods
Example:
>>> from lab.config import make_retail_platform
>>> from lab.outlet.gym_wrapper import QuoteGymEnv
>>> env = QuoteGymEnv(make_retail_platform())
>>> obs, info = env.reset()
>>> obs, reward, done, truncated, info = env.step(env.action_space.sample())
"""
from __future__ import annotations
from typing import Any
import numpy as np
try:
import gymnasium as gym
from gymnasium import spaces
HAS_GYM = True
except ImportError:
HAS_GYM = False
from .platform import Platform, PlatformConfig
from .types import Quote, InstrumentSet, StepResult
class QuoteGymEnv:
    """Gym-style adapter around a :class:`Platform`.

    Actions are per-instrument multipliers that are applied to the platform's
    reference prices; the declared action space is ``Box(0.5, 2.0, (n,))``.
    Observations are whatever ``result.obs.to_flat()`` returns, cast to
    ``float32``.

    NOTE(review): the declared observation width assumes the flattened
    observation holds four per-instrument blocks (quotes, fills, exposures,
    position) plus one more when a market model is attached -- confirm against
    ``Observation.to_flat``.
    """

    def __init__(self, platform: Platform):
        """Wrap ``platform`` and declare the action/observation spaces.

        Raises:
            ImportError: if gymnasium could not be imported.
        """
        if not HAS_GYM:
            raise ImportError("gymnasium required for QuoteGymEnv")

        self.platform = platform
        self.n = platform.instruments.n
        self._last_result: StepResult | None = None

        # One price multiplier per instrument, bounded to [0.5, 2.0].
        self.action_space = spaces.Box(low=0.5, high=2.0, shape=(self.n,), dtype=np.float32)

        # quotes + fills + exposures + position, plus competitor quotes
        # when the platform has a market model.
        obs_dim = self.n * 4
        if platform.market:
            obs_dim += self.n
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float32
        )

    def reset(self, seed: int | None = None, options: dict | None = None) -> tuple[np.ndarray, dict]:
        """Reset the platform and return ``(observation, info)``.

        ``options`` is accepted for interface compatibility and is not used.
        """
        initial = self.platform.reset(seed)
        self._last_result = initial
        flat_obs = initial.obs.to_flat().astype(np.float32)
        return flat_obs, initial.info

    def step(self, action: np.ndarray) -> tuple[np.ndarray, float, bool, bool, dict]:
        """Apply ``action`` and return ``(obs, reward, terminated, truncated, info)``."""
        # Multipliers become absolute prices by scaling the reference prices.
        absolute_prices = self.platform.instruments.refs * action
        outcome = self.platform.step(absolute_prices)
        self._last_result = outcome
        flat_obs = outcome.obs.to_flat().astype(np.float32)
        return flat_obs, outcome.reward, outcome.terminated, outcome.truncated, outcome.info

    def render(self) -> None:
        """Print a one-line summary of the most recent step, if there is one."""
        if not self._last_result:
            return
        m = self._last_result.metrics
        print(f"t={self.platform._t} pnl={m.pnl:.2f} units={m.units_traded:.0f} "
              f"conv={m.conversion:.3f} vol={m.volatility:.3f}")

    def close(self) -> None:
        """No resources to release."""
        pass
def make_env(platform: Platform) -> QuoteGymEnv:
    """Convenience constructor: wrap ``platform`` in a :class:`QuoteGymEnv`."""
    env = QuoteGymEnv(platform)
    return env
if HAS_GYM:
    # Register the environment id with gymnasium when it is available.
    # The entry point is built from this module's actual import path
    # (__name__) instead of a hard-coded 'outlet.gym_wrapper', so it stays
    # importable whether the package is loaded as ``lab.outlet`` or ``outlet``.
    # QuoteGymEnv requires a ``platform`` argument, so users of the registered
    # id have to supply it, e.g. gym.make('QuoteControl-v0', platform=...).
    try:
        gym.register(id='QuoteControl-v0', entry_point=f'{__name__}:QuoteGymEnv')
    except Exception:
        # Best effort: registration problems must not break importing this
        # module. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        pass

57
lab/outlet/math_util.py Normal file
View File

@@ -0,0 +1,57 @@
"""
Numerical utilities for stable computation.
This module provides numerically stable implementations of common operations:
- safe_exp, safe_log: Avoid overflow/underflow
- softmax: Numerically stable softmax
- sigmoid, clamp: Standard transformations
- intensity_decay: Avellaneda-Stoikov fill intensity
- inventory_penalty: Quadratic inventory risk
- poisson_arrivals, hawkes_intensity: Arrival process helpers
All functions accept both scalars and numpy arrays.
"""
import numpy as np
# Floor/offset used to keep logarithms and divisions away from zero.
EPS = 1e-8
# Largest exponent magnitude passed to np.exp, to avoid overflow.
MAX_EXP = 700.0
def safe_exp(x: np.ndarray | float) -> np.ndarray | float:
    """Return ``exp(x)`` after clipping ``x`` to ``[-MAX_EXP, MAX_EXP]``."""
    bounded = np.clip(x, -MAX_EXP, MAX_EXP)
    return np.exp(bounded)
def safe_log(x: np.ndarray | float) -> np.ndarray | float:
    """Return ``log(x)`` with ``x`` floored at ``EPS`` first."""
    floored = np.maximum(x, EPS)
    return np.log(floored)
def clamp(x: np.ndarray | float, lo: float, hi: float) -> np.ndarray | float:
return np.clip(x, lo, hi)
def sigmoid(x: np.ndarray | float) -> np.ndarray | float:
    """Logistic function ``1 / (1 + exp(-x))`` using the clipped exponential."""
    denominator = 1.0 + safe_exp(-x)
    return 1.0 / denominator
def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
    """Softmax along ``axis``, shifted by the per-axis maximum for stability.

    ``EPS`` is added to the normalising sum, so the outputs add up to
    marginally less than one.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    weights = safe_exp(shifted)
    normaliser = np.sum(weights, axis=axis, keepdims=True) + EPS
    return weights / normaliser
def geometric_series(base: float, ratio: float, n: int) -> np.ndarray:
    """Return the ``n`` terms ``base * ratio**k`` for ``k = 0 .. n-1``."""
    exponents = np.arange(n)
    return base * np.power(ratio, exponents)
def ema(old: float, new: float, alpha: float = 0.1) -> float:
    """One exponential-moving-average update: weight ``alpha`` on ``new``."""
    fresh_part = alpha * new
    carried_part = (1 - alpha) * old
    return fresh_part + carried_part
def intensity_decay(distance: float, kappa: float = 1.0) -> float:
    """Exponential decay ``exp(-kappa * distance)`` of fill intensity with
    quote distance (Avellaneda-Stoikov style)."""
    exponent = -kappa * distance
    return safe_exp(exponent)
def inventory_penalty(q: float, gamma: float = 0.1, sigma: float = 1.0) -> float:
    """Quadratic inventory risk: ``gamma * sigma^2 * q^2 / 2``."""
    variance = sigma ** 2
    squared_position = q ** 2
    return gamma * variance * squared_position / 2
def poisson_arrivals(rate: float, dt: float, rng: np.random.Generator) -> int:
    """Draw an arrival count from a Poisson distribution with mean ``rate * dt``."""
    expected_count = rate * dt
    return rng.poisson(expected_count)
def hawkes_intensity(base: float, history: np.ndarray, alpha: float, beta: float, t: float) -> float:
    """Intensity of a self-exciting (Hawkes) process at time ``t``.

    Returns ``base`` plus ``alpha`` times the sum of ``exp(-beta * (t - s))``
    over the event times ``s`` in ``history`` that are strictly before ``t``.
    An empty ``history`` yields ``base``.
    """
    if len(history) == 0:
        return base
    earlier_events = history[history < t]
    decays = safe_exp(-beta * (t - earlier_events))
    return base + alpha * np.sum(decays)

View File

@@ -0,0 +1,5 @@
from .posted_price import PostedPriceMechanism
from .two_sided import TwoSidedMechanism
from .auction import AuctionMechanism
__all__ = ['PostedPriceMechanism', 'TwoSidedMechanism', 'AuctionMechanism']

View File

@@ -0,0 +1,73 @@
"""
Auction mechanism for reserve pricing and bid shading.
In this mechanism, the agent sets reserve prices that affect
win probability and clearing prices. Used for ad auctions,
marketplace auctions, and similar settings.
"""
from __future__ import annotations
from dataclasses import dataclass
import numpy as np
from ..types import Quote, Opportunity, Execution, InstrumentSet, MarketState
from ..constants import Side
from ..math_util import clamp, sigmoid
@dataclass
class AuctionConfig:
    """Tunable parameters of the auction mechanism."""

    # Lower bound applied to reserve prices.
    min_reserve: float = 0.0
    # Upper bound applied to reserve prices.
    max_reserve: float = 100.0
    # Scale of the win probability (multiplies the sigmoid term).
    base_win_prob: float = 0.3
    # Strength with which a higher reserve lowers the win probability.
    sensitivity: float = 2.0
class AuctionMechanism:
    """Reserve-price auction mechanism.

    The quoted prices are treated as reserve prices:
      - win probability = base_win_prob * sigmoid(-sensitivity * (reserve - ref) / ref)
      - clearing price  = max(reserve, min(max_bid, reserve + random increment))

    Only BUY-side opportunities can produce an execution.
    """

    def __init__(self, cfg: AuctionConfig | None = None):
        """Use ``cfg`` if given, otherwise a default :class:`AuctionConfig`."""
        self.cfg = cfg or AuctionConfig()

    def apply_quote(self, quote: Quote, instruments: InstrumentSet,
                    rng: np.random.Generator) -> Quote:
        """Return a copy of ``quote`` whose prices are clamped to the
        configured ``[min_reserve, max_reserve]`` range."""
        bounded = clamp(quote.prices, self.cfg.min_reserve, self.cfg.max_reserve)
        return Quote(prices=bounded, propensity=quote.propensity, metadata=quote.metadata)

    def process_opportunity(self, opp: Opportunity, quote: Quote,
                            instruments: InstrumentSet, market: MarketState | None,
                            rng: np.random.Generator) -> Execution | None:
        """Simulate the auction for one opportunity.

        Returns:
            An :class:`Execution` for the full requested size at the clearing
            price when the auction is won; ``None`` for non-BUY opportunities
            or when the win draw fails.
        """
        if opp.side != Side.BUY:
            return None

        idx = int(opp.instrument_id)
        reserve = float(quote.prices[idx])
        ref = instruments.refs[idx]

        # The further the reserve sits above the reference, the lower the
        # chance of winning. The 1e-8 guards against a zero reference price.
        relative_reserve = (reserve - ref) / (ref + 1e-8)
        win_prob = self.cfg.base_win_prob * sigmoid(-self.cfg.sensitivity * relative_reserve)
        lost = rng.random() > win_prob
        if lost:
            return None

        # Simulated highest bid, then a random increment over the reserve;
        # the clearing price never falls below the reserve.
        max_bid = ref * (1 + rng.exponential(0.2))
        bumped_reserve = reserve + rng.exponential(0.1) * ref
        clearing = max(reserve, min(max_bid, bumped_reserve))

        return Execution(
            opportunity_id=opp.id, instrument_id=opp.instrument_id,
            side=opp.side, size_requested=opp.size, size_filled=opp.size,
            price=clearing, propensity=quote.propensity * win_prob, t=opp.t
        )

View File

@@ -0,0 +1,84 @@
"""
Posted price mechanism for retail dynamic pricing.
In this mechanism, the agent posts a single price per instrument.
Buyers decide whether to purchase based on the posted price.
This is the standard e-commerce dynamic pricing model.
"""
from __future__ import annotations
from dataclasses import dataclass
import numpy as np
from ..types import Quote, Opportunity, Execution, InstrumentSet, MarketState
from ..constants import Side
from ..math_util import clamp
@dataclass
class PostedPriceConfig:
    """Tunable constraints of the posted-price mechanism."""

    # Absolute lower bound on any posted price.
    min_price: float = 0.01
    # Absolute upper bound on any posted price.
    max_price: float = 1000.0
    # Largest per-step price move, as a fraction of the previous price.
    max_delta_pct: float = 0.2
    # Minimum markup over the cost basis, as a fraction of cost.
    min_margin_pct: float = 0.05
    # Rounding granularity; None (or 0) disables rounding.
    round_to: float | None = 0.01
class PostedPriceMechanism:
    """Posted price mechanism for retail dynamic pricing.

    The agent posts a single price per product. ``apply_quote`` enforces the
    constraints in this order, so a later step can override an earlier one:
      1. margin floor: price >= cost * (1 + min_margin_pct)
      2. absolute bounds: price within [min_price, max_price]
      3. per-step change limit: within max_delta_pct of the previous price
         (only when the quote metadata carries 'prev_prices')
      4. rounding to the round_to granularity

    Only BUY-side opportunities are processed (customers purchasing).
    """

    def __init__(self, cfg: PostedPriceConfig | None = None):
        """Use ``cfg`` if given, otherwise a default :class:`PostedPriceConfig`."""
        self.cfg = cfg or PostedPriceConfig()

    def apply_quote(self, quote: Quote, instruments: InstrumentSet,
                    rng: np.random.Generator) -> Quote:
        """Return a new quote with all pricing constraints applied.

        Args:
            quote: Requested quote; its price array is copied, not mutated.
            instruments: Supplies the per-instrument ``costs``.
            rng: Unused here; part of the mechanism interface.

        Returns:
            A :class:`Quote` with the constrained prices, the same propensity,
            and metadata whose 'prev_prices' entry is set to those prices.
        """
        c = self.cfg
        prices = quote.prices.copy()

        # 1. enforce the minimum margin over cost
        min_prices = instruments.costs * (1 + c.min_margin_pct)
        prices = np.maximum(prices, min_prices)

        # 2. enforce the absolute bounds
        prices = clamp(prices, c.min_price, c.max_price)

        # 3. limit the change relative to the previous prices, if known
        if 'prev_prices' in quote.metadata:
            prev = quote.metadata['prev_prices']
            max_change = prev * c.max_delta_pct
            prices = clamp(prices, prev - max_change, prev + max_change)

        # 4. round to the configured granularity (skipped for None or 0)
        if c.round_to:
            prices = np.round(prices / c.round_to) * c.round_to

        return Quote(prices=prices, propensity=quote.propensity,
                     metadata={**quote.metadata, 'prev_prices': prices})

    def process_opportunity(self, opp: Opportunity, quote: Quote,
                            instruments: InstrumentSet, market: MarketState | None,
                            rng: np.random.Generator) -> Execution | None:
        """Turn a BUY opportunity into a full-size execution at the posted price.

        Returns ``None`` for any non-BUY opportunity; acceptance is not
        decided here.
        """
        if opp.side != Side.BUY:
            return None  # posted price is buy-only
        idx = int(opp.instrument_id)
        price = float(quote.prices[idx])
        return Execution(
            opportunity_id=opp.id, instrument_id=opp.instrument_id,
            side=opp.side, size_requested=opp.size, size_filled=opp.size,
            price=price, propensity=quote.propensity, t=opp.t
        )

View File

@@ -0,0 +1,89 @@
"""
Two-sided quoting mechanism for market making.
In this mechanism, the agent posts both bid and ask prices.
Execution depends on the distance from the market mid-price.
This models liquidity provision in financial markets.
"""
from __future__ import annotations
from dataclasses import dataclass
import numpy as np
from ..types import Quote, Opportunity, Execution, InstrumentSet, MarketState
from ..constants import Side
from ..math_util import clamp, intensity_decay
@dataclass
class TwoSidedConfig:
    """Tunable constraints of the two-sided quoting mechanism."""

    # Narrowest allowed bid-ask spread.
    min_spread: float = 0.01
    # Widest allowed bid-ask spread.
    max_spread: float = 0.5
    # Absolute lower bound on prices.
    min_price: float = 0.01
    # Absolute upper bound on prices.
    max_price: float = 10000.0
    # Fill-intensity decay rate (higher = faster decay with distance).
    fill_kappa: float = 1.5
class TwoSidedMechanism:
    """Bid/ask quoting mechanism for market making.

    The agent quotes a mid price and a spread per instrument. The fill
    probability is ``intensity_decay(|distance from mid|, fill_kappa)``.

    Both sides are handled:
      - BUY: the customer buys at the agent's ask.
      - SELL: the customer sells at the agent's bid.
    """

    def __init__(self, cfg: TwoSidedConfig | None = None):
        """Use ``cfg`` if given, otherwise a default :class:`TwoSidedConfig`."""
        self.cfg = cfg or TwoSidedConfig()

    def apply_quote(self, quote: Quote, instruments: InstrumentSet,
                    rng: np.random.Generator) -> Quote:
        """Return a quote with mid prices and spreads brought inside bounds.

        A missing spread vector defaults to 0.02 for every instrument.
        """
        c = self.cfg
        mids = quote.prices.copy()
        if quote.spreads is not None:
            spreads = quote.spreads.copy()
        else:
            spreads = np.full_like(mids, 0.02)

        mids = clamp(mids, c.min_price, c.max_price)
        spreads = clamp(spreads, c.min_spread, c.max_spread)

        # Build bid/ask around the mid, clip them to the price bounds, then
        # recompute spread and mid from the clipped pair.
        half_spread = spreads / 2
        bids = np.maximum(mids - half_spread, c.min_price)
        asks = np.minimum(mids + half_spread, c.max_price)
        spreads = asks - bids
        mids = (bids + asks) / 2

        return Quote(prices=mids, spreads=spreads, propensity=quote.propensity,
                     metadata=quote.metadata)

    def process_opportunity(self, opp: Opportunity, quote: Quote,
                            instruments: InstrumentSet, market: MarketState | None,
                            rng: np.random.Generator) -> Execution | None:
        """Probabilistically fill one opportunity at the agent's ask or bid.

        The reference mid is the market mid when a market state with mid
        prices is supplied, otherwise the quote's own price.

        Returns:
            A full-size :class:`Execution`, or ``None`` if the fill draw fails.
        """
        idx = int(opp.instrument_id)
        if market and market.mid_prices is not None:
            mid = market.mid_prices[idx]
        else:
            mid = quote.prices[idx]

        if opp.side == Side.BUY:
            # Customer buys at the ask; fall back to the quoted price.
            if quote.asks is not None:
                price = float(quote.asks[idx])
            else:
                price = float(quote.prices[idx])
            distance = price - mid
        else:
            # Customer sells at the bid; fall back to the quoted price.
            if quote.bids is not None:
                price = float(quote.bids[idx])
            else:
                price = float(quote.prices[idx])
            distance = mid - price

        # Fill probability decays with the absolute distance from the mid.
        fill_prob = intensity_decay(abs(distance), self.cfg.fill_kappa)
        missed = rng.random() > fill_prob
        if missed:
            return None

        return Execution(
            opportunity_id=opp.id, instrument_id=opp.instrument_id,
            side=opp.side, size_requested=opp.size, size_filled=opp.size,
            price=price, propensity=quote.propensity * fill_prob, t=opp.t
        )

View File

@@ -0,0 +1,11 @@
from .base import BaseObjective, CompositeObjective
from .penalties import (PnLObjective, VolatilityPenalty, HoldingCostPenalty,
LostOpportunityCostPenalty, InventoryRiskPenalty, SpreadCaptureReward)
from .factory import make_objective, make_composite, retail_objective, market_making_objective
__all__ = [
'BaseObjective', 'CompositeObjective',
'PnLObjective', 'VolatilityPenalty', 'HoldingCostPenalty',
'LostOpportunityCostPenalty', 'InventoryRiskPenalty', 'SpreadCaptureReward',
'make_objective', 'make_composite', 'retail_objective', 'market_making_objective',
]

View File

@@ -0,0 +1,48 @@
"""
Base classes for reward objectives.
Objectives compute scalar rewards from step metrics. The CompositeObjective
allows combining multiple objectives with weights for multi-objective optimization.
"""
from __future__ import annotations
from abc import ABC, abstractmethod
from ..types import Quote, InstrumentSet, StepMetrics, HiddenState, Observation
class BaseObjective(ABC):
    """Interface every reward objective implements.

    Concrete objectives must provide both ``reward`` and ``breakdown``.
    """

    @abstractmethod
    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return the scalar reward for one step."""

    @abstractmethod
    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return the named components behind the step reward."""
class CompositeObjective(BaseObjective):
    """Weighted sum of multiple objectives.

    Allows combining several reward terms (e.g. PnL, holding cost,
    volatility) into one objective.

    Args:
        objectives: List of (objective, weight) tuples.
    """

    def __init__(self, objectives: list[tuple[BaseObjective, float]]):
        self.objectives = objectives

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return the sum of ``weight * objective.reward(...)`` over all components."""
        return sum(w * obj.reward(quote, instruments, metrics, hidden, obs)
                   for obj, w in self.objectives)

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return every component's breakdown entries, scaled by its weight.

        When several components emit the same key (for example the same
        objective type added twice), their weighted values are summed, so no
        contribution that ``reward`` counts is silently dropped.
        """
        bd: dict[str, float] = {}
        for obj, w in self.objectives:
            parts = obj.breakdown(quote, instruments, metrics, hidden, obs)
            for k, v in parts.items():
                # Accumulate rather than overwrite on duplicate keys.
                bd[k] = bd.get(k, 0.0) + w * v
        return bd

View File

@@ -0,0 +1,82 @@
"""
Factory functions for creating objectives.
Provides:
- make_objective: Create single objective by name
- make_composite: Create weighted combination of objectives
- retail_objective: Default objective for retail pricing
- market_making_objective: Default objective for market making
"""
from __future__ import annotations
from .base import BaseObjective, CompositeObjective
from .penalties import (PnLObjective, VolatilityPenalty, HoldingCostPenalty,
LostOpportunityCostPenalty, InventoryRiskPenalty, SpreadCaptureReward)
# Lookup table from objective name to the class that implements it;
# make_objective() resolves names through this mapping.
REGISTRY: dict[str, type[BaseObjective]] = dict(
    pnl=PnLObjective,
    volatility=VolatilityPenalty,
    holding_cost=HoldingCostPenalty,
    lost_opportunity=LostOpportunityCostPenalty,
    inventory_risk=InventoryRiskPenalty,
    spread_capture=SpreadCaptureReward,
)
def make_objective(name: str, **kwargs) -> BaseObjective:
    """Instantiate the objective registered under ``name``.

    Args:
        name: Key in ``REGISTRY`` (pnl, volatility, holding_cost,
            lost_opportunity, inventory_risk, spread_capture).
        **kwargs: Forwarded unchanged to the objective's constructor.

    Returns:
        A new objective instance.

    Raises:
        ValueError: if ``name`` is not a registered objective.
    """
    objective_cls = REGISTRY.get(name)
    if objective_cls is None:
        raise ValueError(f"Unknown objective: {name}. Available: {list(REGISTRY.keys())}")
    return objective_cls(**kwargs)
def make_composite(spec: list[tuple[str, float, dict]] | dict[str, float]) -> CompositeObjective:
    """Build a :class:`CompositeObjective` from a specification.

    Args:
        spec: Either
            - a dict ``{name: weight}``; each objective is built with its
              default constructor arguments, or
            - a list of ``(name, weight, kwargs)`` tuples, where ``kwargs``
              is passed to that objective's constructor.

    Returns:
        The composite with components in the order given by ``spec``.
    """
    if isinstance(spec, dict):
        weighted = [(make_objective(name), weight) for name, weight in spec.items()]
    else:
        weighted = [(make_objective(name, **kwargs), weight) for name, weight, kwargs in spec]
    return CompositeObjective(weighted)
def retail_objective(volatility_weight: float = 0.1, holding_weight: float = 0.5,
                     stockout_weight: float = 0.3) -> CompositeObjective:
    """Default composite objective for retail dynamic pricing.

    Combines 'pnl' (weight 1.0) with the 'volatility', 'holding_cost' and
    'lost_opportunity' components at the given weights. The penalty
    components already return negative values, so positive weights subtract
    from the reward.
    """
    weights = {
        'pnl': 1.0,
        'volatility': volatility_weight,
        'holding_cost': holding_weight,
        'lost_opportunity': stockout_weight,
    }
    return make_composite(weights)
def market_making_objective(gamma: float = 0.1, sigma: float = 1.0) -> CompositeObjective:
    """Default composite objective for market making.

    Components and weights: PnL (1.0), spread capture (0.5) and the
    inventory-risk penalty built from ``gamma`` and ``sigma`` (1.0).
    """
    components = [
        (PnLObjective(), 1.0),
        (SpreadCaptureReward(), 0.5),
        (InventoryRiskPenalty(gamma=gamma, sigma=sigma), 1.0),
    ]
    return CompositeObjective(components)

View File

@@ -0,0 +1,101 @@
"""
Standard objective components and penalties.
This module provides common reward terms:
- PnLObjective: Basic profit and loss
- VolatilityPenalty: Penalize price volatility for UX
- HoldingCostPenalty: Inventory holding cost
- LostOpportunityCostPenalty: Stockout/missed fill cost
- InventoryRiskPenalty: Quadratic inventory risk (market making)
- SpreadCaptureReward: Bid-ask spread capture (market making)
"""
from __future__ import annotations
import numpy as np
from .base import BaseObjective
from ..types import Quote, InstrumentSet, StepMetrics, HiddenState, Observation
from ..math_util import inventory_penalty
class PnLObjective(BaseObjective):
    """Reward equal to the step's profit and loss (``metrics.pnl``)."""

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return ``metrics.pnl`` unchanged."""
        pnl = metrics.pnl
        return pnl

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Report PnL together with the revenue and cost metrics."""
        return dict(pnl=metrics.pnl, revenue=metrics.revenue, cost=metrics.cost)
class VolatilityPenalty(BaseObjective):
    """Negative reward proportional to ``metrics.volatility``."""

    def __init__(self, scale: float = 1.0):
        # Multiplier applied to the volatility metric.
        self.scale = scale

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return ``-scale * metrics.volatility``."""
        penalty = -self.scale * metrics.volatility
        return penalty

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Report the penalty under the key 'volatility_penalty'."""
        penalty = -self.scale * metrics.volatility
        return {'volatility_penalty': penalty}
class HoldingCostPenalty(BaseObjective):
    """Negative reward proportional to ``metrics.position_cost``."""

    def __init__(self, scale: float = 1.0):
        # Multiplier applied to the position-cost metric.
        self.scale = scale

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return ``-scale * metrics.position_cost``."""
        penalty = -self.scale * metrics.position_cost
        return penalty

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Report the penalty under the key 'holding_cost_penalty'."""
        penalty = -self.scale * metrics.position_cost
        return {'holding_cost_penalty': penalty}
class LostOpportunityCostPenalty(BaseObjective):
    """Negative reward proportional to ``metrics.lost_opportunity``
    (lost sales from stockouts or missed fills)."""

    def __init__(self, scale: float = 1.0):
        # Multiplier applied to the lost-opportunity metric.
        self.scale = scale

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return ``-scale * metrics.lost_opportunity``."""
        penalty = -self.scale * metrics.lost_opportunity
        return penalty

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Report the penalty under the key 'lost_opportunity_penalty'."""
        penalty = -self.scale * metrics.lost_opportunity
        return {'lost_opportunity_penalty': penalty}
class InventoryRiskPenalty(BaseObjective):
    """Quadratic inventory-risk penalty (Avellaneda-Stoikov style).

    The penalty is ``gamma * sigma^2 * q^2 / 2`` with ``q`` the sum of the
    observed position vector, which pushes a market maker towards a flat book.
    """

    def __init__(self, gamma: float = 0.1, sigma: float = 1.0):
        self.gamma = gamma
        self.sigma = sigma

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return the negated penalty, or 0.0 when the position is not observed."""
        if obs.position is None:
            return 0.0
        net_position = np.sum(obs.position)
        return -inventory_penalty(net_position, self.gamma, self.sigma)

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Report the reward under the key 'inventory_risk_penalty'."""
        value = self.reward(quote, instruments, metrics, hidden, obs)
        return {'inventory_risk_penalty': value}
class SpreadCaptureReward(BaseObjective):
    """Reward equal to the step's captured bid-ask spread
    (``metrics.spread_capture``)."""

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return ``metrics.spread_capture`` unchanged."""
        captured = metrics.spread_capture
        return captured

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Report the value under the key 'spread_capture'."""
        return dict(spread_capture=metrics.spread_capture)

92
lab/outlet/observation.py Normal file
View File

@@ -0,0 +1,92 @@
"""
Observation construction with demand censoring.
This module provides the ObservationBuilder that constructs agent observations
from step data. The key invariant is that observations only contain censored
data (fills) and never true demand, ensuring proper research conditions.
The ObservationConfig controls what is included in observations:
- Position visibility
- Market/competitor visibility
- Demand proxy method
"""
from __future__ import annotations
from dataclasses import dataclass
import numpy as np
from .types import Quote, InstrumentSet, StepLogs, StepMetrics, MarketState, HiddenState, Observation
@dataclass
class ObservationConfig:
    """Switches that control what goes into an agent observation."""

    # Include the current position in the observation.
    include_position: bool = True
    # Include the market/competitor state in the observation.
    include_market: bool = True
    # Research mode: keep true demand out of the observation.
    mask_true_demand: bool = True
    # Demand proxy method: 'fills', 'exposures' or 'weighted'.
    demand_proxy: str = 'fills'
    # Weights for the 'weighted' demand proxy.
    exposure_weights: dict[str, float] | None = None
class DefaultObservationBuilder:
    """Constructs censored observations for the agent.

    The observation carries the censored fills from the step logs and never
    reads ``logs.true_demand``.
    """

    def __init__(self, cfg: ObservationConfig | None = None):
        """Use ``cfg`` if given, otherwise a default :class:`ObservationConfig`."""
        self.cfg = cfg or ObservationConfig()

    def build(self, quote: Quote, instruments: InstrumentSet, logs: StepLogs,
              metrics: StepMetrics, market: MarketState | None,
              hidden: HiddenState, mask_demand: bool, t: int) -> Observation:
        """Assemble the observation for step ``t``.

        Args:
            quote: Quote in effect; its prices are copied into the observation.
            instruments: Supplies the instrument count and current position.
            logs: Step logs providing censored fills, events and aggregates.
            metrics: Unused here; part of the builder interface.
            market: Market state, forwarded only if ``cfg.include_market``.
            hidden: Unused here; part of the builder interface.
            mask_demand: Unused here; true demand is never included.
            t: Step index stored on the observation.

        Returns:
            The :class:`Observation` with quotes, position, fills, exposures,
            market state and ``t``.
        """
        # Local import: this module's header does not import the constants.
        from .constants import EventType

        n = instruments.n
        cfg = self.cfg

        # always show censored fills
        fills = logs.censored_fills if logs.censored_fills is not None else np.zeros(n)

        # Exposures: count EXPOSURE events per instrument when event-level
        # logs exist. Other event types are deliberately not counted, since
        # counting them would inflate the exposure figures.
        if logs.events:
            exposures = np.zeros(n)
            for event in logs.events:
                if event.type == EventType.EXPOSURE and event.instrument_id is not None:
                    exposures[int(event.instrument_id)] += 1
        else:
            exposures = logs.aggregates.get('exposures', np.zeros(n))

        # position - only if configured and available
        position = None
        if cfg.include_position and instruments.position is not None:
            position = instruments.position.copy()

        # market state - only if configured
        obs_market = market if cfg.include_market else None

        return Observation(
            quotes=quote.prices.copy(),
            position=position,
            fills=fills,
            exposures=exposures,
            market=obs_market,
            t=t
        )

    def make_space(self, n_instruments: int, include_market: bool = True) -> dict:
        """Return a dict describing the observation space for gym.

        Each entry maps a field name to its 'shape', 'low' and 'high'.
        'position' is present only if ``cfg.include_position``;
        'competitor_quotes' only if ``include_market``.
        """
        space = {
            'quotes': {'shape': (n_instruments,), 'low': 0, 'high': np.inf},
            'fills': {'shape': (n_instruments,), 'low': 0, 'high': np.inf},
            'exposures': {'shape': (n_instruments,), 'low': 0, 'high': np.inf},
        }
        if self.cfg.include_position:
            space['position'] = {'shape': (n_instruments,), 'low': -np.inf, 'high': np.inf}
        if include_market:
            space['competitor_quotes'] = {'shape': (n_instruments,), 'low': 0, 'high': np.inf}
        return space

285
lab/outlet/platform.py Normal file
View File

@@ -0,0 +1,285 @@
"""
Main simulation platform orchestrating the Quote-Control loop.
The Platform class is the central coordinator that:
1. Receives pricing actions (quotes) from the agent
2. Generates arrivals via the ArrivalModel
3. Processes executions via Mechanism and ExecutionModel
4. Applies position censorship via PositionModel
5. Computes metrics and reward via Objective
6. Returns censored observations
Example:
>>> from lab.config import make_retail_platform
>>> platform = make_retail_platform()
>>> result = platform.reset(seed=42)
>>> result = platform.step(platform.instruments.refs * 1.1)
>>> print(f"PnL: {result.metrics.pnl:.2f}")
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
import numpy as np
from .types import (Quote, Opportunity, Execution, InstrumentSet, StepLogs, StepMetrics,
StepEvent, MarketState, HiddenState, Observation, StepResult)
from .constants import LogLevel, EventType, Side
from .protocols import Mechanism, ArrivalModel, ExecutionModel, PositionModel, MarketModel, ObservationBuilder, Objective
from .stock import PositionModel as DefaultPositionModel, PositionConfig
from .observation import DefaultObservationBuilder, ObservationConfig
from .objectives.factory import retail_objective
@dataclass
class PlatformConfig:
    """Settings of the simulation platform."""

    # Number of instruments in the simulation.
    n_instruments: int = 10
    # Maximum number of steps in an episode.
    max_steps: int = 1000
    # Time duration per step (passed to the arrival model).
    dt: float = 1.0
    # Logging verbosity (NONE, AGG_ONLY, FULL).
    log_level: LogLevel = LogLevel.AGG_ONLY
    # Research mode flag forwarded to the observation builder.
    mask_demand: bool = True
    # Random seed for reproducibility; None leaves the generator unseeded.
    seed: int | None = None
class Platform:
"""Main simulation orchestrator implementing Quote -> Arrival -> Execution -> Position.
The Platform coordinates all components to simulate a pricing environment:
- Mechanism: validates quotes and determines execution logic
- ArrivalModel: generates demand opportunities
- ExecutionModel: computes acceptance probabilities
- PositionModel: manages inventory/position and censorship
- MarketModel: updates competitor/market state
- ObservationBuilder: constructs censored observations
- Objective: computes reward from metrics
Attributes:
instruments: The instrument set being priced
mechanism: Quote validation and execution mechanism
arrival: Demand arrival generator
execution: Acceptance probability model
position: Inventory/position manager
market: Competitor/market dynamics (optional)
obs_builder: Observation constructor
objective: Reward function
cfg: Platform configuration
"""
def __init__(self, instruments: InstrumentSet, mechanism: Mechanism,
             arrival: ArrivalModel, execution: ExecutionModel,
             position: PositionModel | None = None,
             market: MarketModel | None = None,
             obs_builder: ObservationBuilder | None = None,
             objective: Objective | None = None,
             cfg: PlatformConfig | None = None):
    """Wire the simulation components together.

    Optional components fall back to defaults: a default position model,
    the default observation builder, the retail objective, and a config
    sized to ``instruments.n``. ``market`` stays ``None`` when not given.
    """
    # Required components.
    self.instruments = instruments
    self.mechanism = mechanism
    self.arrival = arrival
    self.execution = execution

    # Optional components with defaults.
    self.position = position if position else DefaultPositionModel(PositionConfig())
    self.market = market
    self.obs_builder = obs_builder if obs_builder else DefaultObservationBuilder()
    self.objective = objective if objective else retail_objective()
    self.cfg = cfg if cfg else PlatformConfig(n_instruments=instruments.n)

    # Mutable episode state; reset() re-initialises most of it.
    self._t: int = 0
    self._rng: np.random.Generator = np.random.default_rng(self.cfg.seed)
    self._quote: Quote | None = None
    self._market_state: MarketState | None = None
    self._hidden: HiddenState = HiddenState()
    self._prev_prices: np.ndarray | None = None
def reset(self, seed: int | None = None) -> StepResult:
    """Reset the platform to its initial state.

    Args:
        seed: Random seed. Any non-None value, including 0, overrides the
            config seed; None falls back to ``cfg.seed``.

    Returns:
        Initial StepResult with zeroed metrics, zero reward and the initial
        observation built from a quote at the reference prices.
    """
    self._t = 0
    # Compare against None explicitly: `seed or cfg.seed` would treat an
    # explicit seed of 0 as "no seed" and silently fall back to the config.
    effective_seed = seed if seed is not None else self.cfg.seed
    self._rng = np.random.default_rng(effective_seed)
    self._hidden = HiddenState()
    self._prev_prices = self.instruments.refs.copy()

    # reset position and expose it on the instrument set
    self.position.reset(self.instruments, self._rng)
    self.instruments.position = self.position.position

    # initial quote at reference prices, passed through the mechanism
    self._quote = Quote(prices=self.instruments.refs.copy(), propensity=1.0,
                        metadata={'prev_prices': self._prev_prices})
    self._quote = self.mechanism.apply_quote(self._quote, self.instruments, self._rng)

    # initial market state (only when a market model is attached)
    if self.market:
        self._market_state = self.market.step(0, self._quote, self._hidden, self._rng)

    # build initial observation from empty logs and default metrics
    logs = StepLogs(aggregates={'reset': True},
                    true_demand=np.zeros(self.instruments.n),
                    censored_fills=np.zeros(self.instruments.n))
    metrics = StepMetrics()
    obs = self.obs_builder.build(self._quote, self.instruments, logs, metrics,
                                 self._market_state, self._hidden, self.cfg.mask_demand, 0)

    return StepResult(obs=obs, reward=0.0, terminated=False, truncated=False,
                      info={'true_demand': logs.true_demand}, metrics=metrics,
                      logs=logs, hidden=self._hidden)
def step(self, action: np.ndarray, propensity: float = 1.0) -> StepResult:
    """Execute one simulation step with the given pricing action.

    The step proceeds as follows:
        1. Apply quote constraints via mechanism
        2. Update market/competitor state
        3. Generate arrivals
        4. Process arrivals -> executions with acceptance check
        5. Apply position censorship to executions
        6. Update position state
        7. Compute metrics (PnL, costs, etc.)
        8. Build logs with propensities
        9. Construct censored observation
        10. Compute reward and check termination

    Args:
        action: Price vector for all instruments
        propensity: P(action | behavior policy) for OPE logging

    Returns:
        StepResult containing observation, reward, metrics, logs, and hidden state
    """
    self._t += 1
    cfg = self.cfg
    n = self.instruments.n
    full_log = cfg.log_level == LogLevel.FULL

    # 1. apply quote from action
    self._quote = Quote(prices=action, propensity=propensity,
                        metadata={'prev_prices': self._prev_prices})
    self._quote = self.mechanism.apply_quote(self._quote, self.instruments, self._rng)
    self._prev_prices = self._quote.prices.copy()
    self._hidden.quote_history.append(self._quote.prices.copy())

    # 2. update market/competitors
    if self.market:
        self._market_state = self.market.step(self._t, self._quote, self._hidden, self._rng)
        self._hidden.market_history.append(self._market_state)

    # 3. generate arrivals
    opps = self.arrival.sample(self._t, cfg.dt, self.instruments,
                               self._market_state, self._hidden, self._rng)

    # 4./5. process opportunities -> executions, censored by position
    executions: list[Execution] = []
    events: list[StepEvent] = []
    true_demand = np.zeros(n)
    for opp in opps:
        if full_log:
            events.append(StepEvent(t=opp.t, type=EventType.EXPOSURE,
                                    instrument_id=opp.instrument_id,
                                    opportunity_id=opp.id,
                                    price=float(self._quote.prices[opp.instrument_id]),
                                    propensity=self._quote.propensity))
        # acceptance check: one uniform draw against the model's probability
        prob = self.execution.prob(opp, self._quote, self.instruments,
                                   self._market_state, self._rng)
        if self._rng.random() >= prob:
            continue
        exe = self.mechanism.process_opportunity(opp, self._quote, self.instruments,
                                                 self._market_state, self._rng)
        if not exe:
            continue
        # demand is recorded before censorship so the oracle value is preserved
        true_demand[exe.instrument_id] += exe.size_requested
        exe = self.position.apply_execution(exe)
        executions.append(exe)
        if full_log:
            events.append(StepEvent(t=exe.t, type=EventType.EXECUTION,
                                    instrument_id=exe.instrument_id,
                                    opportunity_id=exe.opportunity_id,
                                    price=exe.price, size=exe.size_filled,
                                    propensity=exe.propensity))

    # 6. update position state
    self.position.step(self._t)
    self.instruments.position = self.position.position

    # 7. compute metrics
    censored_fills = np.zeros(n)
    revenue = 0.0
    cost = 0.0
    spread_capture = 0.0
    # `instruments.costs` builds a new array on every access; read it once.
    costs = self.instruments.costs
    # Spread capture only applies to two-sided quotes with known mid-prices;
    # none of these conditions change inside the loop.
    mids = None
    if self._quote.spreads is not None and self._market_state:
        mids = self._market_state.mid_prices
    for exe in executions:
        censored_fills[exe.instrument_id] += exe.size_filled
        if exe.side == Side.BUY:
            revenue += exe.price * exe.size_filled
            cost += costs[exe.instrument_id] * exe.size_filled
        else:
            revenue -= exe.price * exe.size_filled
            cost -= costs[exe.instrument_id] * exe.size_filled
        if mids is not None:
            mid = mids[exe.instrument_id]
            if exe.side == Side.BUY:
                spread_capture += (exe.price - mid) * exe.size_filled
            else:
                spread_capture += (mid - exe.price) * exe.size_filled
    pnl = revenue - cost
    units = float(np.sum(censored_fills))
    lost = float(np.sum(true_demand - censored_fills))

    # volatility: mean relative price change versus the previous step's quote
    volatility = 0.0
    if len(self._hidden.quote_history) > 1:
        prev = self._hidden.quote_history[-2]
        volatility = float(np.mean(np.abs(self._quote.prices - prev) / (prev + 1e-8)))

    metrics = StepMetrics(
        pnl=pnl, revenue=revenue, cost=cost, units_traded=units,
        position_cost=self.position.holding_cost,
        # censored units are valued at 10% of the mean quoted price (hard-coded weight)
        lost_opportunity=self.position.shortage_cost + lost * np.mean(self._quote.prices) * 0.1,
        spread_capture=spread_capture, volatility=volatility,
        # NOTE(review): this is filled units per opportunity, not an execution count
        conversion=units / (len(opps) + 1e-8),
        per_instrument={'fills': censored_fills, 'demand': true_demand}
    )

    # 8. build logs
    logs = StepLogs(
        events=events if full_log else None,
        executions=executions if full_log else None,
        aggregates={'n_arrivals': len(opps), 'n_executions': len(executions),
                    'exposures': np.bincount([o.instrument_id for o in opps],
                                             minlength=n).astype(float)},
        true_demand=true_demand,
        censored_fills=censored_fills
    )

    # 9. build observation
    obs = self.obs_builder.build(self._quote, self.instruments, logs, metrics,
                                 self._market_state, self._hidden, cfg.mask_demand, self._t)

    # 10. compute reward; the breakdown is computed once and reused in `info`
    reward = self.objective.reward(self._quote, self.instruments, metrics, self._hidden, obs)
    breakdown = self.objective.breakdown(self._quote, self.instruments, metrics, self._hidden, obs)

    # termination: the episode ends once max_steps is reached
    terminated = self._t >= cfg.max_steps
    truncated = False
    info = {'true_demand': true_demand, 'breakdown': breakdown}
    return StepResult(obs=obs, reward=reward, terminated=terminated, truncated=truncated,
                      info=info, metrics=metrics, logs=logs, hidden=self._hidden)

297
lab/outlet/protocols.py Normal file
View File

@@ -0,0 +1,297 @@
"""
Protocol definitions for pluggable simulator components.
This module defines the interfaces (Protocols) that allow swapping different
implementations for each stage of the Quote -> Arrival -> Execution -> Position
pipeline. All protocols use structural subtyping (duck typing).
Protocols:
Mechanism: How quotes translate to executions (posted price, two-sided, auction)
ArrivalModel: How opportunities arrive (Poisson, Hawkes, sessions)
ExecutionModel: Acceptance probability given quote (elasticity, intensity)
PositionModel: Inventory/position management and censorship
MarketModel: Competitor/market dynamics
ObservationBuilder: Constructs agent observations with censoring
Objective: Computes reward from metrics
"""
from __future__ import annotations
from typing import Protocol, Any, TYPE_CHECKING
import numpy as np
if TYPE_CHECKING:
from .types import (Quote, Opportunity, Execution, InstrumentSet, StepLogs,
StepMetrics, HiddenState, Observation, MarketState)
from .constants import LogLevel
class Mechanism(Protocol):
    """Structural interface for turning a posted quote into executions.

    The mechanism is what separates one pricing domain from another:
        - PostedPrice: a single price; the buyer decides to purchase or not
        - TwoSided: bid/ask spread; execution depends on distance from mid
        - Auction: a reserve price affects win probability and clearing price

    Methods:
        apply_quote: Enforce the mechanism's constraints on a raw quote
        process_opportunity: Decide whether an opportunity becomes an execution
    """

    def apply_quote(self, quote: Quote, instruments: InstrumentSet,
                    rng: np.random.Generator) -> Quote:
        """Return a version of ``quote`` that satisfies the mechanism's rules.

        Args:
            quote: Raw quote coming from the policy
            instruments: Current instrument set (costs and reference prices)
            rng: Random generator, for constraints that are stochastic

        Returns:
            The constrained quote (min margin, max delta, etc.)
        """
        ...

    def process_opportunity(self, opp: Opportunity, quote: Quote,
                            instruments: InstrumentSet, market: MarketState | None,
                            rng: np.random.Generator) -> Execution | None:
        """Match one opportunity against the current quote.

        Args:
            opp: Incoming opportunity (session, order, request)
            quote: Currently posted quote
            instruments: Instrument set
            market: Current market state (competitor prices, mid-prices), if any
            rng: Random generator

        Returns:
            An Execution when the opportunity converts, otherwise None
        """
        ...
class ArrivalModel(Protocol):
    """Structural interface for demand arrival processes.

    Each implementation captures a different demand dynamic:
        - Poisson: constant rate, memoryless
        - Hawkes: self-exciting, clustered arrivals
        - Session: retail browsing with multi-product views

    Methods:
        sample: Produce the opportunities arriving in one time interval
    """

    def sample(self, t: float, dt: float, instruments: InstrumentSet,
               market: MarketState | None, hidden: HiddenState,
               rng: np.random.Generator) -> list[Opportunity]:
        """Draw the opportunities that arrive in the interval [t, t+dt).

        Args:
            t: Current time
            dt: Length of the interval
            instruments: Instruments that can be targeted
            market: Current market state, if any
            hidden: Hidden state (demand intensity, contamination)
            rng: Random generator

        Returns:
            Opportunities arriving in this interval
        """
        ...
class ExecutionModel(Protocol):
    """Structural interface for acceptance (execution) probability models.

    Implementations encode different demand responses:
        - Elasticity: price sensitivity with competitor cross-effects
        - Intensity: distance-based fill probability (market making)
        - Logit: discrete choice model

    Methods:
        prob: Acceptance probability for one opportunity
        uncensor: Estimate of true demand recovered from censored fills
    """

    def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet,
             market: MarketState | None, rng: np.random.Generator) -> float:
        """Probability that ``opp`` accepts the current quote.

        Args:
            opp: Opportunity being evaluated
            quote: Current quote
            instruments: Instrument set
            market: Market state (competitor prices drive cross-elasticity)
            rng: Random generator

        Returns:
            A probability in [0, 1]
        """
        ...

    def uncensor(self, fills: np.ndarray, instruments: InstrumentSet,
                 context: dict[str, Any] | None = None) -> np.ndarray:
        """Estimate true demand from censored fills.

        Intended for demand-estimation research under inventory censorship.

        Args:
            fills: Observed (censored) fill counts
            instruments: Instrument set
            context: Optional extra context (exposures, prices shown)

        Returns:
            Estimated true demand counts
        """
        ...
class PositionModel(Protocol):
    """Manages inventory (retail) or position (finance).

    Handles:
        - Position constraints and censorship
        - Holding costs (retail) or inventory risk (finance)
        - Replenishment and order receipt

    Methods:
        reset: Initialize position state
        available: Query available capacity for a trade
        apply_execution: Censor execution by available position
        step: Process time-based updates (replenishment, holding cost)

    Properties:
        position: Current position vector
        holding_cost: Cost incurred this step from holding position
        shortage_cost: Cost incurred this step from unfilled demand
    """

    def reset(self, instruments: InstrumentSet, rng: np.random.Generator) -> None:
        """Initialize position state for a new episode."""
        ...

    def available(self, instrument_id: int, side: Any) -> float:
        """Query available capacity for a trade.

        Args:
            instrument_id: Which instrument
            side: BUY or SELL

        Returns:
            Maximum tradeable size given current position
        """
        ...

    def apply_execution(self, exe: Execution) -> Execution:
        """Apply position constraints to an execution.

        Args:
            exe: Proposed execution with size_requested

        Returns:
            Censored execution with size_filled <= available capacity
        """
        ...

    def step(self, t: float) -> None:
        """Process time-based position updates.

        Handles replenishment receipt, holding cost calculation, etc.
        """
        ...

    @property
    def position(self) -> np.ndarray:
        """Current position vector (positive=long/inventory, negative=short)."""
        ...

    @property
    def holding_cost(self) -> float:
        """Holding cost incurred this step."""
        ...

    # Declared because the platform's step reads `position.shortage_cost`
    # when it assembles the lost-opportunity metric; without it the protocol
    # under-specifies what a position model must provide.
    @property
    def shortage_cost(self) -> float:
        """Shortage (stockout) cost incurred this step."""
        ...
class MarketModel(Protocol):
    """Structural interface for external market and competitor dynamics.

    Retail: competitor price dynamics (static, reactive, stochastic).
    Finance: a mid-price process (GBM, mean-reverting).

    Methods:
        step: Advance the market state given the agent's quotes
    """

    def step(self, t: float, self_quotes: Quote, hidden: HiddenState,
             rng: np.random.Generator) -> MarketState:
        """Advance the market by one timestep.

        Args:
            t: Current time
            self_quotes: The agent's current quotes (competitors may react)
            hidden: Hidden state (regime info)
            rng: Random generator

        Returns:
            The new market state (competitor prices, mid-prices, volatility)
        """
        ...
class ObservationBuilder(Protocol):
    """Structural interface for building the agent's (censored) observation.

    Research-critical: the agent is meant to see censored fills only, never
    true demand (which travels in the info dict instead).

    Methods:
        build: Assemble an observation from one step's data
    """

    def build(self, quote: Quote, instruments: InstrumentSet, logs: StepLogs,
              metrics: StepMetrics, market: MarketState | None,
              hidden: HiddenState, mask_demand: bool, t: int) -> Observation:
        """Assemble the observation handed to the agent.

        Args:
            quote: Current quote
            instruments: Instrument set, including positions
            logs: Step logs carrying true_demand and censored_fills
            metrics: Metrics computed for the step
            market: Market state, if any
            hidden: Hidden state (not to be included in the observation)
            mask_demand: If True, exclude true demand from the observation
            t: Current timestep

        Returns:
            An Observation holding only observable quantities
        """
        ...
class Objective(Protocol):
    """Structural interface for reward functions over step metrics.

    Composite objectives with weighted terms are supported:
        - PnL (profit)
        - Position costs (holding, inventory risk)
        - Lost opportunity (stockouts)
        - Volatility penalty (UX)
        - Spread capture (market making)

    Methods:
        reward: Scalar reward for the step
        breakdown: Per-term contributions, for analysis
    """

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState,
               obs: Observation) -> float:
        """Scalar reward for this step.

        Args:
            quote: Current quote
            instruments: Instrument set
            metrics: Step metrics (pnl, costs, etc.)
            hidden: Hidden state
            obs: Agent observation

        Returns:
            The reward value
        """
        ...

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState,
                  obs: Observation) -> dict[str, float]:
        """Reward split by component.

        Helps to see which terms dominate the reward.

        Returns:
            Mapping from term name to its contribution
        """
        ...

151
lab/outlet/stock.py Normal file
View File

@@ -0,0 +1,151 @@
"""
Inventory/position management and instrument factories.
This module provides:
- PositionConfig: Configuration for position constraints and costs
- PositionModel: Manages inventory (retail) or position (finance)
- make_instruments: Factory for creating instrument sets
The PositionModel handles demand censorship by limiting executions
to available inventory, computing holding costs, and managing replenishment.
"""
from __future__ import annotations
from dataclasses import dataclass, field
import numpy as np
from .types import Instrument, InstrumentSet, Execution
from .constants import Side, InstrumentType
@dataclass
class PositionConfig:
    """Settings for position/inventory management.

    Attributes:
        initial_position: Starting inventory (None = unlimited, a single
            number = same level for every instrument, array = per instrument)
        max_position: Upper bound on the long position per instrument
        min_position: Lower bound, i.e. the largest short position
            (negative, for finance)
        holding_cost_rate: Cost per unit per step for holding inventory
        shortage_cost_rate: Opportunity cost rate applied to stockouts
        lead_time: Number of steps before replenishment orders arrive
    """

    # inventory at episode start
    initial_position: np.ndarray | float | None = None
    # position bounds
    max_position: float = 1000.0
    min_position: float = -1000.0
    # cost rates
    holding_cost_rate: float = 0.001
    shortage_cost_rate: float = 0.05
    # replenishment delay, in steps
    lead_time: int = 0
@dataclass
class PositionModel:
    """Manages inventory (retail) or position (finance) with censorship.

    Key responsibilities:
        - Track current position per instrument
        - Censor executions when position is insufficient
        - Compute holding costs per step
        - Track shortage/stockout costs per step
        - Handle replenishment orders with lead time

    For retail: position is inventory (positive), selling reduces it.
    For finance: position can be positive (long) or negative (short).

    Shortage cost is a per-step quantity: it accumulates over the executions
    applied since the previous ``step()`` call and reads 0 for a step that
    had no executions.
    """

    cfg: PositionConfig
    n: int = 0
    _position: np.ndarray = field(default_factory=lambda: np.array([]))
    _pending_orders: list[tuple[int, np.ndarray]] = field(default_factory=list)
    _step_holding_cost: float = 0.0
    _step_shortage_cost: float = 0.0
    # True once an execution has been applied since the last step() boundary
    _shortage_window_open: bool = False

    def reset(self, instruments: InstrumentSet, rng: np.random.Generator) -> None:
        """Initialise the position vector and clear per-episode state.

        Args:
            instruments: Instrument set; only its size ``n`` is read
            rng: Random generator (unused here)

        Raises:
            ValueError: If ``cfg.initial_position`` is a vector whose shape
                is not ``(n,)``.
        """
        self.n = instruments.n
        initial = self.cfg.initial_position
        if initial is None:
            self._position = np.full(self.n, np.inf)  # unlimited
        elif np.ndim(initial) == 0:
            # any scalar, including NumPy scalars, is broadcast to all instruments
            self._position = np.full(self.n, float(initial))
        else:
            # np.array copies, so the configured vector is never mutated
            position = np.array(initial, dtype=np.float64)
            if position.shape != (self.n,):
                raise ValueError(
                    f"initial_position has shape {position.shape}, expected ({self.n},)")
            self._position = position
        self._pending_orders = []
        self._step_holding_cost = 0.0
        self._step_shortage_cost = 0.0
        self._shortage_window_open = False

    def available(self, instrument_id: int, side: Side) -> float:
        """Return the largest size that can trade for this instrument and side.

        BUY is served out of current inventory; any other side is bounded by
        the room left below ``cfg.max_position``. Unlimited inventory yields
        ``inf`` for either side.
        """
        pos = self._position[instrument_id]
        if np.isinf(pos):
            return np.inf
        if side == Side.BUY:
            return max(0, pos)  # can sell up to current inventory
        return max(0, self.cfg.max_position - pos)  # can buy up to max

    def apply_execution(self, exe: Execution) -> Execution:
        """Censor ``exe`` by available position and update the position.

        Args:
            exe: Proposed execution; ``size_requested`` is the true demand

        Returns:
            A new Execution whose ``size_filled`` is capped at the available
            capacity. Any unfilled remainder adds to this step's shortage cost.
        """
        idx = int(exe.instrument_id)
        filled = min(exe.size_requested, self.available(idx, exe.side))
        shortage = exe.size_requested - filled
        if exe.side == Side.BUY:
            self._position[idx] -= filled  # sold from inventory
        else:
            self._position[idx] += filled  # bought into inventory
        if not self._shortage_window_open:
            # first execution since the last step(): start a fresh per-step
            # total instead of carrying earlier steps' shortage forward
            self._step_shortage_cost = 0.0
            self._shortage_window_open = True
        if shortage > 0:
            self._step_shortage_cost += shortage * exe.price * self.cfg.shortage_cost_rate
        return Execution(
            opportunity_id=exe.opportunity_id, instrument_id=exe.instrument_id,
            side=exe.side, size_requested=exe.size_requested,
            size_filled=filled, price=exe.price, propensity=exe.propensity, t=exe.t
        )

    def order(self, quantity: np.ndarray) -> None:
        """Place a replenishment order.

        With ``cfg.lead_time > 0`` the order is queued and received by a later
        ``step()``; otherwise it is added to the position immediately.
        """
        if self.cfg.lead_time > 0:
            self._pending_orders.append(
                (self.cfg.lead_time, np.array(quantity, dtype=np.float64)))
        else:
            self._position += quantity

    def step(self, t: float) -> None:
        """Close the current step: holding cost, order receipt, shortage window."""
        # holding cost is charged on the position before orders are received;
        # unlimited (inf) positions are treated as 0 for costing
        pos = np.where(np.isinf(self._position), 0, self._position)
        self._step_holding_cost = float(np.sum(np.abs(pos)) * self.cfg.holding_cost_rate)
        # receive pending orders whose lead time has elapsed
        still_pending = []
        for remaining, qty in self._pending_orders:
            if remaining <= 1:
                self._position += qty
            else:
                still_pending.append((remaining - 1, qty))
        self._pending_orders = still_pending
        # a step without executions has no shortage
        if not self._shortage_window_open:
            self._step_shortage_cost = 0.0
        self._shortage_window_open = False

    @property
    def position(self) -> np.ndarray:
        """Position vector; unlimited entries are reported as -1."""
        return np.where(np.isinf(self._position), -1, self._position)

    @property
    def holding_cost(self) -> float:
        """Holding cost computed by the most recent ``step()``."""
        return self._step_holding_cost

    @property
    def shortage_cost(self) -> float:
        """Shortage cost accumulated over the current step's executions."""
        return self._step_shortage_cost
def make_instruments(n: int, cost_range: tuple[float, float] = (1.0, 10.0),
                     margin_range: tuple[float, float] = (0.2, 0.5),
                     inst_type: InstrumentType = InstrumentType.SKU,
                     rng: np.random.Generator | None = None) -> InstrumentSet:
    """Build a random instrument set.

    Costs and margins are drawn uniformly; each reference price is the cost
    marked up by the sampled margin, ``cost * (1 + margin)``.

    Args:
        n: Number of instruments to create
        cost_range: (min, max) for uniform cost sampling
        margin_range: (min, max) for uniform margin sampling
        inst_type: Type assigned to every instrument (SKU, ASSET, etc.)
        rng: Random generator (a fresh default generator if not given)

    Returns:
        InstrumentSet with n instruments, ids 0..n-1
    """
    if not rng:
        rng = np.random.default_rng()
    # costs are drawn before margins
    sampled_costs = rng.uniform(*cost_range, n)
    sampled_margins = rng.uniform(*margin_range, n)
    catalogue = []
    for idx, (unit_cost, margin) in enumerate(zip(sampled_costs, sampled_margins)):
        catalogue.append(Instrument(id=idx, type=inst_type, cost_basis=unit_cost,
                                    reference_price=unit_cost*(1+margin)))
    return InstrumentSet(instruments=catalogue)

318
lab/outlet/types.py Normal file
View File

@@ -0,0 +1,318 @@
"""
Core data types for the Quote-Control simulator.
This module defines the fundamental data structures used throughout the platform:
- Identifiers (InstrumentId, OpportunityId, AgentId)
- Domain objects (Instrument, Quote, Opportunity, Execution)
- Logging structures (StepEvent, StepLogs, StepMetrics)
- State containers (MarketState, HiddenState, Observation, StepResult)
All dataclasses are designed to be serializable and numpy-compatible.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, NewType
import numpy as np
from .constants import Side, InstrumentType, OpportunityType, EventType
# Distinct identifier types for static type checking; at runtime a NewType
# call returns its argument unchanged, so these are plain int / str values.
InstrumentId = NewType('InstrumentId', int) # unique instrument index
OpportunityId = NewType('OpportunityId', str) # unique opportunity/session ID
AgentId = NewType('AgentId', str) # unique agent/actor ID
@dataclass
class Instrument:
    """A single priceable entity in the simulation.

    This may be a retail SKU, a financial asset, a loan product or a
    subscription. ``cost_basis`` is the fundamental value: marginal cost for
    retail, mid-price for assets, funding rate for loans.

    Attributes:
        id: Unique identifier of the instrument
        type: Instrument category (SKU, ASSET, LOAN, SUBSCRIPTION)
        cost_basis: Fundamental cost or value
        reference_price: Base or fair price used for action scaling
        attrs: Free-form extra attributes (quality score, category, volatility, ...)
    """

    # identity
    id: InstrumentId
    type: InstrumentType
    # economics
    cost_basis: float
    reference_price: float
    # optional extras; each instance gets its own dict
    attrs: dict[str, Any] = field(default_factory=dict)
@dataclass
class InstrumentSet:
    """A collection of instruments with optional position tracking.

    Exposes instrument properties as vectors for efficient computation.
    Position may be positive (long/inventory) or negative (short) for
    financial assets.

    Attributes:
        instruments: List of Instrument objects
        position: Current position per instrument (None = unlimited capacity)

    Properties:
        n: Number of instruments
        costs: float32 vector of cost bases
        refs: float32 vector of reference prices
    """

    instruments: list[Instrument]
    position: np.ndarray | None = None

    @property
    def n(self) -> int:
        """Number of instruments in the set."""
        return len(self.instruments)

    @property
    def costs(self) -> np.ndarray:
        """Cost bases as a float32 vector, rebuilt on each access."""
        values = [inst.cost_basis for inst in self.instruments]
        return np.array(values, np.float32)

    @property
    def refs(self) -> np.ndarray:
        """Reference prices as a float32 vector, rebuilt on each access."""
        values = [inst.reference_price for inst in self.instruments]
        return np.array(values, np.float32)
@dataclass
class Quote:
    """The price quote chosen by the policy, i.e. the action in the MDP.

    Several quoting mechanisms share this type:
        - Posted price: only ``prices`` is used
        - Two-sided: ``prices`` is the mid, ``spreads`` the bid-ask width
        - Auction: ``prices`` holds reserve prices

    ``propensity`` is what off-policy evaluation (OPE) relies on.

    Attributes:
        prices: Posted prices (retail) or mid-quotes (market making)
        spreads: Bid-ask width for two-sided quoting (None for posted price)
        propensity: P(this quote | behavior policy) for importance sampling
        metadata: Extra info (prev_prices for delta constraints, etc.)

    Properties:
        bids: mid - spread/2, or None without spreads
        asks: mid + spread/2, or None without spreads
    """

    prices: np.ndarray
    spreads: np.ndarray | None = None
    propensity: float = 1.0
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def bids(self) -> np.ndarray | None:
        """Bid prices, half a spread below ``prices``."""
        if self.spreads is None:
            return None
        return self.prices - self.spreads/2

    @property
    def asks(self) -> np.ndarray | None:
        """Ask prices, half a spread above ``prices``."""
        if self.spreads is None:
            return None
        return self.prices + self.spreads/2
@dataclass
class Opportunity:
    """An arrival that may turn into a transaction.

    Opportunities form the demand side of the simulation:
        - Retail: a browsing session with purchase intent
        - Market making: an incoming market order
        - Lending: a loan application

    ``context`` carries the segment/type information that execution models use.

    Attributes:
        id: Unique identifier of the opportunity
        type: Category (SESSION, MARKET_ORDER, REQUEST)
        side: BUY or SELL intent
        instrument_id: Instrument the opportunity targets
        size: Requested transaction size (units, shares, principal)
        t: Arrival timestamp
        context: Segment info (is_scraper, credit_score, urgency, etc.)
    """

    # required: what arrived and for which instrument
    id: OpportunityId
    type: OpportunityType
    side: Side
    instrument_id: InstrumentId
    # optional: size, timing and segment details
    size: float = 1.0
    t: float = 0.0
    context: dict[str, Any] = field(default_factory=dict)
@dataclass
class Execution:
    """A realized transaction after acceptance and position censorship.

    The gap between ``size_requested`` and ``size_filled`` is the demand that
    was censored by inventory/position constraints.

    Attributes:
        opportunity_id: Links back to the originating Opportunity
        instrument_id: Instrument that was traded
        side: BUY or SELL
        size_requested: Originally requested size (true demand)
        size_filled: Size actually filled after censorship
        price: Execution price
        propensity: Combined propensity for OPE (quote * acceptance)
        t: Execution timestamp
    """

    # provenance
    opportunity_id: OpportunityId
    instrument_id: InstrumentId
    side: Side
    # requested versus filled quantity
    size_requested: float
    size_filled: float
    price: float
    # logging extras
    propensity: float = 1.0
    t: float = 0.0
@dataclass
class StepEvent:
    """Generic logged event.

    Only the timestamp and the event type are required; the remaining fields
    are filled in as far as they apply to the event being logged.
    """

    t: float
    type: EventType
    # optional references and values
    instrument_id: InstrumentId | None = None
    opportunity_id: OpportunityId | None = None
    price: float | None = None
    size: float | None = None
    propensity: float = 1.0
    metadata: dict[str, Any] = field(default_factory=dict)
@dataclass
class StepLogs:
    """All logging data produced by one simulation step.

    Two modes are supported: detailed event logging (for OPE) and an
    aggregate-only mode (for fast simulation). Keeping ``true_demand`` apart
    from ``censored_fills`` is what enables research on demand estimation
    under censorship.

    Attributes:
        events: Detailed event log (None if LogLevel != FULL)
        executions: Executed transactions (None if LogLevel != FULL)
        aggregates: Aggregate statistics, always available
        true_demand: Oracle demand before censorship (for research, not in obs)
        censored_fills: Realized fills after position constraints (observable)
    """

    # detailed logs
    events: list[StepEvent] | None = None
    executions: list[Execution] | None = None
    # aggregate statistics
    aggregates: dict[str, Any] = field(default_factory=dict)
    # demand before and after censorship
    true_demand: np.ndarray | None = None
    censored_fills: np.ndarray | None = None
@dataclass
class StepMetrics:
    """Metrics computed for a single simulation step.

    The metrics are domain-aware: retail reads revenue/cost/holding cost,
    market making reads spread capture and inventory risk.

    Attributes:
        pnl: Profit and loss (revenue - cost for retail, mark-to-market for finance)
        revenue: Gross revenue from sales/executions
        cost: Cost of goods sold or position acquisition cost
        units_traded: Total units/shares transacted
        position_cost: Holding cost (retail) or inventory risk penalty (finance)
        lost_opportunity: Cost of stockouts or missed fills
        spread_capture: Bid-ask spread captured (market making)
        volatility: Price volatility metric for UX consideration
        conversion: Fill rate (executions / opportunities)
        per_instrument: Per-instrument breakdowns (fills, demand, etc.)
    """

    # profit terms
    pnl: float = 0.0
    revenue: float = 0.0
    cost: float = 0.0
    units_traded: float = 0.0
    # cost and penalty terms
    position_cost: float = 0.0
    lost_opportunity: float = 0.0
    # market-making and UX terms
    spread_capture: float = 0.0
    volatility: float = 0.0
    conversion: float = 0.0
    # vector breakdowns keyed by name
    per_instrument: dict[str, np.ndarray] = field(default_factory=dict)
@dataclass
class MarketState:
    """External market conditions and competitor state.

    Retail: ``competitor_quotes`` drives cross-elasticity effects.
    Finance: ``mid_prices`` and ``volatility`` drive execution dynamics.

    Attributes:
        competitor_quotes: Competitor posted prices (retail)
        mid_prices: Market mid-prices for assets (finance)
        volatility: Per-instrument volatility estimate
        regime: Market regime identifier (normal, price_war, high_vol, etc.)
        t: Timestamp of this market state
    """

    # per-instrument vectors; None when not applicable
    competitor_quotes: np.ndarray | None = None
    mid_prices: np.ndarray | None = None
    volatility: np.ndarray | None = None
    # regime label and timestamp
    regime: str = 'normal'
    t: float = 0.0
@dataclass
class HiddenState:
    """Internal simulator state that is not exposed to the agent.

    Holds oracle information for research analysis plus the history needed
    for non-stationary dynamics.

    Attributes:
        true_demand_intensity: Latent demand multiplier
        contamination: Fraction of arrivals that are adversarial/scraper
        regime: Current market/competitor regime
        quote_history: History of agent quotes for volatility calculation
        market_history: History of market states for analysis
    """

    # latent scalars
    true_demand_intensity: float = 1.0
    contamination: float = 0.0
    regime: str = 'normal'
    # histories; each instance starts with its own empty lists
    quote_history: list[np.ndarray] = field(default_factory=list)
    market_history: list[MarketState] = field(default_factory=list)
@dataclass
class Observation:
    """Observable state handed to the agent: the censored view only.

    Critical invariant: an Observation never contains true_demand, only
    censored fills. This is what enforces the censorship research setting.

    Attributes:
        quotes: Current posted quotes (the agent's last action)
        position: Current inventory/position state
        fills: Censored execution counts per instrument
        exposures: Opportunity exposure counts per instrument
        market: Observable market state (competitor prices, volatility)
        t: Current timestep
        extra: Additional observable features

    Methods:
        to_flat: Flatten to a numpy array for gym compatibility
    """

    quotes: np.ndarray
    position: np.ndarray | None
    fills: np.ndarray
    exposures: np.ndarray
    market: MarketState | None
    t: int
    extra: dict[str, Any] = field(default_factory=dict)

    def to_flat(self) -> np.ndarray:
        """Flatten the observation into one 1D array for gym environments.

        Order: quotes, fills, exposures, then position (if present), then
        competitor quotes (if a market with competitor quotes is present).
        """
        chunks = [self.quotes, self.fills, self.exposures]
        if self.position is not None:
            chunks.append(self.position)
        competitor = self.market.competitor_quotes if self.market else None
        if competitor is not None:
            chunks.append(competitor)
        return np.concatenate([chunk.ravel() for chunk in chunks])
@dataclass
class StepResult:
    """Everything returned by one simulation step.

    The first five fields follow the gymnasium convention (obs, reward,
    terminated, truncated, info); metrics, logs and hidden state are added
    for research use.

    Attributes:
        obs: Observable state (censored)
        reward: Scalar reward from the objective function
        terminated: Episode ended naturally (max_steps reached)
        truncated: Episode ended early (bankruptcy, constraint violation)
        info: Additional info dict (contains true_demand for research)
        metrics: Metrics computed for this step
        logs: Event logs and aggregates
        hidden: Internal simulator state (oracle info)
    """

    # gymnasium-style fields
    obs: Observation
    reward: float
    terminated: bool
    truncated: bool
    info: dict[str, Any]
    # research extras
    metrics: StepMetrics
    logs: StepLogs
    hidden: HiddenState

View File

@@ -0,0 +1,10 @@
from .arrivals import PoissonArrivalModel, HawkesArrivalModel, SessionArrivalModel
from .execution import ElasticityExecutionModel, IntensityExecutionModel, LogitExecutionModel
from .competitors import (StaticCompetitorModel, ReactiveCompetitorModel,
StochasticCompetitorModel, GBMMarketModel)
# Public API of this package: one row per source module imported above
# (arrival models, execution models, competitor/market models).
__all__ = [
'PoissonArrivalModel', 'HawkesArrivalModel', 'SessionArrivalModel',
'ElasticityExecutionModel', 'IntensityExecutionModel', 'LogitExecutionModel',
'StaticCompetitorModel', 'ReactiveCompetitorModel', 'StochasticCompetitorModel', 'GBMMarketModel',
]

168
lab/population/arrivals.py Normal file
View File

@@ -0,0 +1,168 @@
"""
Arrival models for generating demand opportunities.
This module provides different arrival processes:
- PoissonArrivalModel: Constant-rate memoryless arrivals
- HawkesArrivalModel: Self-exciting clustered arrivals (market orders)
- SessionArrivalModel: Retail browsing sessions with multi-product views
Each model implements the ArrivalModel protocol and generates Opportunity objects
that flow through the execution pipeline.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Callable
import numpy as np
from uuid import uuid4
from ..outlet.types import Opportunity, InstrumentSet, MarketState, HiddenState
from ..outlet.constants import Side, OpportunityType
from ..outlet.math_util import poisson_arrivals, hawkes_intensity
@dataclass
class PoissonArrivalConfig:
    """Configuration for Poisson arrival process.

    Attributes:
        base_rate: Expected arrivals per unit time (scaled by
            hidden.true_demand_intensity)
        side_probs: Probability distribution over BUY/SELL sides. ``None``
            (the default) is replaced by ``{Side.BUY: 1.0}`` after init.
    """

    base_rate: float = 10.0
    # Annotated as optional because None is the accepted "use the default"
    # marker; a mutable dict cannot be used directly as a dataclass default.
    side_probs: dict[Side, float] | None = None

    def __post_init__(self) -> None:
        """Fill in the buy-only default when no distribution was given."""
        if self.side_probs is None:
            self.side_probs = {Side.BUY: 1.0}
class PoissonArrivalModel:
    """Homogeneous Poisson arrival process.

    Generates arrivals at a constant rate (modulated by demand intensity).
    Suitable for stationary demand or as a baseline model.
    The arrival count is obtained from ``poisson_arrivals`` with rate
    ``base_rate * hidden.true_demand_intensity`` over the interval ``dt``.
    """

    def __init__(self, cfg: PoissonArrivalConfig | None = None):
        """Store the config, falling back to the default config."""
        self.cfg = cfg or PoissonArrivalConfig()

    def sample(self, t: float, dt: float, instruments: InstrumentSet,
               market: MarketState | None, hidden: HiddenState,
               rng: np.random.Generator) -> list[Opportunity]:
        """Sample the opportunities arriving in this interval.

        Args:
            t: Current time; stamped on every opportunity
            dt: Interval length
            instruments: Instrument set; targets are drawn uniformly from it
            market: Market state (unused by this model)
            hidden: Hidden state; its demand intensity scales the rate
            rng: Random generator

        Returns:
            One unit-size SESSION opportunity per arrival
        """
        rate = self.cfg.base_rate * hidden.true_demand_intensity
        n_arrivals = poisson_arrivals(rate, dt, rng)
        # the side distribution does not change between arrivals
        sides = list(self.cfg.side_probs)
        probs = list(self.cfg.side_probs.values())
        opps = []
        for _ in range(n_arrivals):
            # plain int rather than a NumPy integer, so the id stays serializable
            inst_id = int(rng.integers(0, instruments.n))
            # Draw an index, not the member: NumPy may coerce enum members
            # with an int/str mixin into plain scalars when it builds the
            # candidate array, and the result would no longer be a Side.
            side = sides[rng.choice(len(sides), p=probs)]
            # NOTE(review): ids come from uuid4, so they are not reproducible
            # under a fixed rng seed and are truncated to 8 characters.
            opps.append(Opportunity(
                id=str(uuid4())[:8], type=OpportunityType.SESSION,
                side=side, instrument_id=inst_id, size=1.0, t=t,
                context={'segment': 'default'}
            ))
        return opps
@dataclass
class HawkesArrivalConfig:
    """Configuration for the Hawkes self-exciting process.

    Attributes:
        base_rate: Baseline arrival intensity
        alpha: Excitation strength (how much each arrival increases intensity)
        beta: Decay rate (how quickly excitation fades)
        side_probs: Probability distribution over BUY/SELL sides. ``None``
            (the default) is replaced in ``__post_init__`` by an even
            BUY/SELL split.
    """
    base_rate: float = 5.0
    alpha: float = 0.5
    beta: float = 1.0
    # None is a sentinel: each instance builds its own dict in __post_init__,
    # so no mutable default is shared between instances.
    side_probs: dict[Side, float] | None = None

    def __post_init__(self):
        if self.side_probs is None:
            self.side_probs = {Side.BUY: 0.5, Side.SELL: 0.5}
class HawkesArrivalModel:
    """Self-exciting Hawkes point process for clustered arrivals.

    Models order flow where arrivals cluster in time (momentum, herding).
    Intensity: lambda(t) = base + alpha * sum(exp(-beta * (t - t_i))),
    evaluated by ``hawkes_intensity`` over the retained arrival history.
    Used for market making scenarios where orders arrive in bursts.
    """

    # Arrivals older than this many time units are dropped from the history
    # that feeds the intensity computation.
    HISTORY_WINDOW = 10

    def __init__(self, cfg: HawkesArrivalConfig | None = None):
        self.cfg = cfg or HawkesArrivalConfig()
        self._history: np.ndarray = np.array([])

    def sample(self, t: float, dt: float, instruments: InstrumentSet,
               market: MarketState | None, hidden: HiddenState,
               rng: np.random.Generator) -> list[Opportunity]:
        """Generate the market-order opportunities arriving during one step.

        The intensity is evaluated once at ``t``; the arrival count is drawn
        from it and each arrival receives a uniform timestamp in
        ``[t, t + dt)``. New timestamps are appended to the internal history
        and entries older than ``HISTORY_WINDOW`` are pruned.

        Args:
            t: Start time of the step.
            dt: Step length.
            instruments: Instrument set; only ``instruments.n`` is read.
            market: Unused by this model.
            hidden: Supplies ``true_demand_intensity``, which scales the base rate.
            rng: Random generator used for all draws.

        Returns:
            One ``Opportunity`` per arrival, with exponentially distributed size.
        """
        intensity = hawkes_intensity(
            self.cfg.base_rate * hidden.true_demand_intensity,
            self._history, self.cfg.alpha, self.cfg.beta, t
        )
        n_arrivals = poisson_arrivals(intensity, dt, rng)

        # Loop invariants: the side distribution does not change within a step.
        sides = list(self.cfg.side_probs)
        probs = list(self.cfg.side_probs.values())

        opps = []
        arrival_times = []
        for _ in range(n_arrivals):
            arr_t = t + rng.uniform(0, dt)
            arrival_times.append(arr_t)
            inst_id = int(rng.integers(0, instruments.n))
            # Sample an index rather than the members themselves so numpy
            # cannot coerce enum members into plain numpy scalars.
            side = sides[rng.choice(len(sides), p=probs)]
            opps.append(Opportunity(
                id=str(uuid4())[:8], type=OpportunityType.MARKET_ORDER,
                side=side, instrument_id=inst_id,
                size=rng.exponential(1.0), t=arr_t,
                context={'intensity': intensity}
            ))

        # Extend the history once per step instead of reallocating the array
        # for every single arrival.
        if arrival_times:
            self._history = np.concatenate((self._history, arrival_times))
        # decay old history
        self._history = self._history[self._history > t - self.HISTORY_WINDOW]
        return opps
@dataclass
class SessionArrivalConfig:
    """Settings for the retail browsing-session arrival model.

    Attributes:
        sessions_per_step: How many browsing sessions are generated each step.
        views_per_session: (min, max) pair bounding the product views drawn
            for a session.
        contamination: Share of sessions that are scrapers/bots.
    """
    # Fixed session count per step.
    sessions_per_step: int = 20
    # Bounds handed to the integer sampler for the per-session view count.
    views_per_session: tuple[int, int] = (1, 5)
    # 0.0 means no scraper sessions.
    contamination: float = 0.0
class SessionArrivalModel:
    """Retail browsing session model with multi-product views.

    Each session views multiple products, generating one opportunity per view.
    Scraper sessions (controlled by contamination) view more products
    but convert at lower rates (handled by ExecutionModel).
    """

    def __init__(self, cfg: SessionArrivalConfig | None = None):
        self.cfg = cfg or SessionArrivalConfig()

    def sample(self, t: float, dt: float, instruments: InstrumentSet,
               market: MarketState | None, hidden: HiddenState,
               rng: np.random.Generator) -> list[Opportunity]:
        """Generate one BUY opportunity per product view for this step.

        Args:
            t: Timestamp stamped on every generated opportunity.
            dt: Unused; the session count per step is fixed by the config.
            instruments: Instrument set; only ``instruments.n`` is read.
            market: Unused by this model.
            hidden: If truthy, its ``contamination`` overrides the config value.
            rng: Random generator used for all draws.

        Returns:
            Opportunities whose ``context`` carries the session id, the
            scraper flag and the number of products the session viewed.
        """
        n_sessions = self.cfg.sessions_per_step
        contamination = hidden.contamination if hidden else self.cfg.contamination
        min_views, max_views = self.cfg.views_per_session

        opps = []
        for _ in range(n_sessions):
            is_scraper = bool(rng.random() < contamination)
            # endpoint=True: the config documents (min, max), so max itself
            # must be reachable and min == max must be a valid setting.
            n_views = int(rng.integers(min_views, max_views, endpoint=True))
            sid = str(uuid4())[:8]
            # scrapers view more products
            if is_scraper:
                n_views *= 3
            # A session cannot view more distinct products than exist; clamp
            # here so context['n_views'] matches the views actually generated.
            n_views = min(n_views, instruments.n)
            viewed = rng.choice(instruments.n, size=n_views, replace=False)
            for inst_id in viewed:
                opps.append(Opportunity(
                    id=f"{sid}-{inst_id}", type=OpportunityType.SESSION,
                    side=Side.BUY, instrument_id=int(inst_id), size=1.0, t=t,
                    context={'session_id': sid, 'is_scraper': is_scraper, 'n_views': n_views}
                ))
        return opps

View File

@@ -0,0 +1,189 @@
"""
Market and competitor models for external dynamics.
This module provides models for competitor pricing (retail) and market dynamics (finance):
- StaticCompetitorModel: Fixed competitor prices
- ReactiveCompetitorModel: Competitor reacts to agent's prices, can trigger price wars
- StochasticCompetitorModel: Random walk competitor prices
- GBMMarketModel: Geometric Brownian Motion for asset mid-prices
Each model implements the MarketModel protocol.
"""
from __future__ import annotations
from dataclasses import dataclass
import numpy as np
from ..outlet.types import Quote, MarketState, HiddenState
from ..outlet.math_util import clamp, ema
@dataclass
class StaticCompetitorConfig:
    """Settings for the static (fixed-markup) competitor.

    Attributes:
        markup: Fractional markup applied on top of reference prices
            (0.1 means the competitor quotes 10% above reference).
    """
    markup: float = 0.1
class StaticCompetitorModel:
    """Competitor that always quotes a fixed markup over reference prices.

    Competitor prices = reference * (1 + markup).
    Handy as a baseline, or when tests should run without competitor dynamics.
    """

    def __init__(self, cfg: StaticCompetitorConfig | None = None, refs: np.ndarray | None = None):
        self.refs = refs
        self.cfg = cfg or StaticCompetitorConfig()

    def step(self, t: float, self_quotes: Quote, hidden: HiddenState,
             rng: np.random.Generator) -> MarketState:
        """Return the competitor state for time ``t``.

        When no reference prices were given at construction, the agent's own
        quoted prices serve as the reference. ``hidden`` and ``rng`` are not
        used by this model.
        """
        base = self_quotes.prices if self.refs is None else self.refs
        competitor = base * (1 + self.cfg.markup)
        return MarketState(competitor_quotes=competitor, regime='static', t=t)
@dataclass
class ReactiveCompetitorConfig:
    """Settings for the reactive (price-following) competitor.

    Attributes:
        follow_weight: Smoothing weight used when following prices
            (0=ignore, 1=instant).
        band_pct: Largest allowed deviation from reference prices.
        war_threshold: Relative price difference below which a price war starts.
        war_aggression: Fraction by which the competitor undercuts during a war.
    """
    follow_weight: float = 0.3
    band_pct: float = 0.1
    # Negative: the agent must be priced this far *below* the competitor.
    war_threshold: float = -0.15
    war_aggression: float = 0.2
class ReactiveCompetitorModel:
    """Competitor that tracks the agent's prices and can start a price war.

    In normal mode the competitor smoothly follows the agent's quotes.
    When the agent undercuts by more than ``war_threshold`` on any product,
    the competitor switches to war mode and targets prices below the agent's.
    The resulting dynamics are non-stationary, which tests policy robustness.
    """

    def __init__(self, cfg: ReactiveCompetitorConfig | None = None, refs: np.ndarray | None = None):
        self.cfg = cfg or ReactiveCompetitorConfig()
        self.refs = refs
        # Competitor's current prices; initialised lazily on the first step.
        self._prices: np.ndarray | None = None
        self._in_war: bool = False

    def step(self, t: float, self_quotes: Quote, hidden: HiddenState,
             rng: np.random.Generator) -> MarketState:
        """Advance the competitor by one step and return its market state.

        Side effect: writes the current regime ('price_war' or 'normal') to
        ``hidden.regime``. ``rng`` is not used by this model.
        """
        cfg = self.cfg
        refs = self_quotes.prices if self.refs is None else self.refs
        if self._prices is None:
            self._prices = refs.copy()

        # War state has hysteresis: it starts when any product is undercut
        # beyond the threshold and ends only once every product clears the
        # (sign-flipped, halved) threshold.
        gap = (self_quotes.prices - self._prices) / (self._prices + 1e-8)
        if np.any(gap < cfg.war_threshold):
            self._in_war = True
        elif np.all(gap > -cfg.war_threshold / 2):
            self._in_war = False

        # Pick the price target for the current regime.
        if self._in_war:
            target = self_quotes.prices * (1 - cfg.war_aggression)
            hidden.regime = 'price_war'
        else:
            target = self_quotes.prices * (1 + cfg.follow_weight * 0.05)
            hidden.regime = 'normal'

        # Move towards the target with per-product smoothing ...
        smoothed = []
        for previous, wanted in zip(self._prices, target):
            smoothed.append(ema(previous, wanted, cfg.follow_weight))
        new_prices = np.array(smoothed)
        # ... but never leave the band around the reference prices.
        lower = refs * (1 - cfg.band_pct)
        upper = refs * (1 + cfg.band_pct)
        new_prices = clamp(new_prices, lower, upper)

        self._prices = new_prices
        return MarketState(competitor_quotes=new_prices, regime=hidden.regime, t=t)
@dataclass
class StochasticCompetitorConfig:
    """Settings for the stochastic competitor.

    Attributes:
        drift: Constant amount added to prices every step.
        volatility: Standard deviation of the random price shocks.
        mean_revert: Strength of the pull back toward reference prices.
    """
    drift: float = 0.0
    volatility: float = 0.02
    mean_revert: float = 0.1
class StochasticCompetitorModel:
    """Ornstein-Uhlenbeck style stochastic competitor prices.

    Per step: P <- P + drift + mean_revert*(ref - P) + noise*P,
    with noise ~ Normal(0, volatility), floored at half the reference price.
    The dynamics do not depend on the agent's actions once reference prices
    are fixed, which makes the model useful for testing robustness to noise.
    """

    def __init__(self, cfg: StochasticCompetitorConfig | None = None, refs: np.ndarray | None = None):
        self.cfg = cfg or StochasticCompetitorConfig()
        self.refs = refs
        # Competitor's current prices; initialised lazily on the first step.
        self._prices: np.ndarray | None = None

    def step(self, t: float, self_quotes: Quote, hidden: HiddenState,
             rng: np.random.Generator) -> MarketState:
        """Advance prices by one step and return a copy in a ``MarketState``.

        When no reference prices were given at construction, the agent's own
        quoted prices serve as the reference. ``hidden`` is not used.
        """
        cfg = self.cfg
        refs = self_quotes.prices if self.refs is None else self.refs
        if self._prices is None:
            self._prices = refs.copy()

        current = self._prices
        shocks = rng.normal(0, cfg.volatility, len(current))
        pull = cfg.mean_revert * (refs - current)
        stepped = current + cfg.drift + pull + shocks * current
        # Floor: prices never fall below 50% of the reference.
        self._prices = np.maximum(stepped, refs * 0.5)
        return MarketState(competitor_quotes=self._prices.copy(), regime='stochastic', t=t)
@dataclass
class GBMMarketConfig:
    """Settings for the GBM market model.

    Attributes:
        mu: Drift of the price process (expected return).
        sigma: Volatility of the price process.
        dt: Size of one time step.
    """
    mu: float = 0.0
    sigma: float = 0.1
    dt: float = 1.0
class GBMMarketModel:
    """Geometric Brownian Motion model for asset mid-prices.

    Standard Black-Scholes dynamics: dS = mu*S*dt + sigma*S*dW, advanced with
    the exact log-normal update
    S <- S * exp((mu - sigma^2/2)*dt + sigma*sqrt(dt)*z), z ~ N(0, 1).
    Intended for market making, where the agent quotes around a moving mid.
    """

    def __init__(self, cfg: GBMMarketConfig | None = None, initial: np.ndarray | None = None):
        self.cfg = cfg or GBMMarketConfig()
        # Current mid-prices; taken from the first quote when not provided.
        self._mids = initial

    def step(self, t: float, self_quotes: Quote, hidden: HiddenState,
             rng: np.random.Generator) -> MarketState:
        """Advance the mid-prices by one step of length ``cfg.dt``.

        Returns a ``MarketState`` carrying a copy of the new mids and a
        constant per-instrument volatility equal to ``cfg.sigma``.
        ``hidden`` is not used by this model.
        """
        if self._mids is None:
            self._mids = self_quotes.prices.copy()

        cfg = self.cfg
        count = len(self._mids)
        shocks = rng.standard_normal(count)
        log_return = (cfg.mu - 0.5*cfg.sigma**2)*cfg.dt + cfg.sigma*np.sqrt(cfg.dt)*shocks
        self._mids = self._mids * np.exp(log_return)
        return MarketState(mid_prices=self._mids.copy(),
                           volatility=np.full(count, cfg.sigma),
                           regime='gbm', t=t)

174
lab/population/execution.py Normal file
View File

@@ -0,0 +1,174 @@
"""
Execution models for computing acceptance/fill probabilities.
This module provides different models for how opportunities convert to executions:
- ElasticityExecutionModel: Price elasticity with competitor cross-effects (retail)
- IntensityExecutionModel: Distance-based fill intensity (market making)
- LogitExecutionModel: Discrete choice model
Each model implements the ExecutionModel protocol.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
import numpy as np
from ..outlet.types import Opportunity, Quote, InstrumentSet, MarketState
from ..outlet.constants import Side
from ..outlet.math_util import sigmoid, safe_log, intensity_decay, EPS
@dataclass
class ElasticityConfig:
    """Settings for the price-elasticity execution model.

    Attributes:
        base_prob: Purchase probability when the price equals the reference.
        price_sensitivity: Coefficient of own-price elasticity.
        cross_elasticity: Coefficient of the competitor-price cross effect.
        scraper_conversion: Factor applied to scraper sessions (typically << 1).
    """
    base_prob: float = 0.3
    price_sensitivity: float = 2.0
    cross_elasticity: float = 0.5
    scraper_conversion: float = 0.01
class ElasticityExecutionModel:
    """Price elasticity model for retail dynamic pricing.

    P(buy) = base_prob * exp(-sensitivity * log(price/ref)) * cross_effect * scraper_mult

    Higher prices reduce purchase probability exponentially.
    Competitor undercutting shifts demand away from the platform.
    Scrapers convert at a much lower rate (reconnaissance, not purchase).
    """

    def __init__(self, cfg: ElasticityConfig | None = None):
        self.cfg = cfg or ElasticityConfig()

    def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet,
             market: MarketState | None, rng: np.random.Generator) -> float:
        """Return the purchase probability for one opportunity, clipped to [0, 1].

        Args:
            opp: Opportunity; ``instrument_id`` selects the product and
                ``context['is_scraper']`` (default False) marks scraper sessions.
            quote: Supplies the platform's price via ``quote.prices``.
            instruments: Supplies the reference price via ``instruments.refs``.
            market: Optional market state; competitor quotes, when present,
                drive the cross-price effect.
            rng: Not used by this model.
        """
        idx = int(opp.instrument_id)
        price = quote.prices[idx]
        ref = instruments.refs[idx]
        # base probability adjusted by price ratio
        log_ratio = safe_log(price / ref)
        prob = self.cfg.base_prob * np.exp(-self.cfg.price_sensitivity * log_ratio)
        # cross-elasticity: competitor undercutting increases their share
        if market and market.competitor_quotes is not None:
            comp_price = market.competitor_quotes[idx]
            if comp_price < price:
                prob *= np.exp(-self.cfg.cross_elasticity * (price - comp_price) / ref)
        # scrapers convert at much lower rate
        if opp.context.get('is_scraper', False):
            prob *= self.cfg.scraper_conversion
        return float(np.clip(prob, 0, 1))

    def uncensor(self, fills: np.ndarray, instruments: InstrumentSet,
                 context: dict[str, Any] | None = None) -> np.ndarray:
        """Scale observed fills up by the baseline purchase probability.

        Simple imputation: assumes fills = base_prob * exposures and inverts
        that relation. ``instruments`` and ``context`` are accepted for
        interface compatibility and do not influence the result.
        """
        return fills / (self.cfg.base_prob + EPS)
@dataclass
class IntensityConfig:
    """Settings for the intensity-based execution model.

    Attributes:
        base_intensity: Fill intensity for a quote exactly at the mid-price.
        kappa: How fast intensity decays with distance from the mid-price.
        vol_scale: Multiplier applied to volatility when boosting intensity.
    """
    base_intensity: float = 1.0
    kappa: float = 1.5
    vol_scale: float = 0.5
class IntensityExecutionModel:
    """Avellaneda-Stoikov style fill intensity for market making.

    The fill probability shrinks as the quote moves away from the mid-price:
    P(fill) = base * decay(|quote - mid|, kappa) * (1 + vol_scale * sigma),
    clipped to [0, 1]. Tighter quotes fill more often, and higher volatility
    raises the fill probability.
    """

    def __init__(self, cfg: IntensityConfig | None = None):
        self.cfg = cfg or IntensityConfig()

    def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet,
             market: MarketState | None, rng: np.random.Generator) -> float:
        """Return the fill probability for one opportunity.

        BUY opportunities execute against the ask and all other sides against
        the bid; when the relevant side of the book is missing, ``quote.prices``
        is used instead. ``instruments`` and ``rng`` are not used.
        """
        idx = int(opp.instrument_id)

        # Mid comes from the market when available, else from the quote itself.
        if market and market.mid_prices is not None:
            mid = market.mid_prices[idx]
        else:
            mid = quote.prices[idx]

        # Signed distance of the execution price from the mid.
        buying = opp.side == Side.BUY
        book = quote.asks if buying else quote.bids
        exec_price = quote.prices[idx] if book is None else book[idx]
        distance = (exec_price - mid) if buying else (mid - exec_price)

        # Only the magnitude of the distance enters the decay.
        decay = intensity_decay(abs(distance), self.cfg.kappa)
        intensity = self.cfg.base_intensity * decay

        # Volatility boost, applied only when the market reports volatility.
        if market and market.volatility is not None:
            sigma = market.volatility[idx]
            intensity *= (1 + self.cfg.vol_scale * sigma)

        return float(np.clip(intensity, 0, 1))

    def uncensor(self, fills: np.ndarray, instruments: InstrumentSet,
                 context: dict[str, Any] | None = None) -> np.ndarray:
        """Return ``fills`` unchanged; no censorship correction is applied here."""
        return fills
@dataclass
class LogitConfig:
    """Settings for the logit discrete-choice model.

    Attributes:
        beta_0: Intercept term of the utility (base utility).
        beta_price: Coefficient on the price ratio (typically negative).
        beta_quality: Coefficient on the quality attribute.
    """
    beta_0: float = 0.5
    beta_price: float = -1.5
    beta_quality: float = 0.3
class LogitExecutionModel:
    """Discrete choice logit model for purchase probability.

    Utility: U = beta_0 + beta_price * (price/ref) + beta_quality * quality
    P(buy) = sigmoid(U)

    Provides a theoretically grounded demand model from economics literature.
    """

    # Quality assumed for an instrument that has no 'quality' attribute; also
    # used for the baseline probability in ``uncensor``.
    DEFAULT_QUALITY = 0.5

    def __init__(self, cfg: LogitConfig | None = None):
        self.cfg = cfg or LogitConfig()

    def prob(self, opp: Opportunity, quote: Quote, instruments: InstrumentSet,
             market: MarketState | None, rng: np.random.Generator) -> float:
        """Return the purchase probability ``sigmoid(U)`` for one opportunity.

        Price and reference come from ``quote.prices`` and ``instruments.refs``;
        quality is read from the instrument's ``attrs`` and falls back to
        ``DEFAULT_QUALITY``. ``market`` and ``rng`` are not used.
        """
        idx = int(opp.instrument_id)
        price = quote.prices[idx]
        ref = instruments.refs[idx]
        quality = instruments.instruments[idx].attrs.get('quality', self.DEFAULT_QUALITY)
        # utility
        u = self.cfg.beta_0 + self.cfg.beta_price * (price / ref) + self.cfg.beta_quality * quality
        # choice probability via sigmoid
        return float(sigmoid(u))

    def uncensor(self, fills: np.ndarray, instruments: InstrumentSet,
                 context: dict[str, Any] | None = None) -> np.ndarray:
        """Scale observed fills up by the baseline purchase probability.

        The baseline is the model's own probability at the reference price
        (price/ref == 1) with ``DEFAULT_QUALITY``. Dividing by the raw
        intercept ``beta_0`` instead would treat a utility as a probability
        and yield negative or exploding values for intercepts <= 0.

        NOTE(review): assumes a single scalar baseline is acceptable for all
        instruments; per-instrument quality is not applied here because the
        alignment of ``fills`` with instruments is not visible from this class.
        """
        c = self.cfg
        baseline_utility = c.beta_0 + c.beta_price * 1.0 + c.beta_quality * self.DEFAULT_QUALITY
        baseline_prob = float(sigmoid(baseline_utility))
        return fills / (baseline_prob + EPS)

59
lab/run_example.py Normal file
View File

@@ -0,0 +1,59 @@
#!/usr/bin/env python
"""Example script demonstrating the Quote-Control platform"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import numpy as np
from lab.config import make_retail_platform, make_market_making_platform
from lab.experiments.eval import (rollout, compare_policies, fixed_price_policy,
cost_plus_margin_policy, random_walk_policy)
def demo_retail():
    """Print a comparison of baseline pricing policies on the retail platform."""
    rule = "=" * 60
    print(rule)
    print("RETAIL DYNAMIC PRICING DEMO")
    print(rule)

    platform = make_retail_platform()
    print(f"Instruments: {platform.instruments.n}")
    print(f"Reference prices: {platform.instruments.refs[:5].round(2)}...")

    # Candidate policies, built from the platform's reference prices and costs.
    refs = platform.instruments.refs
    costs = platform.instruments.costs
    policies = {
        'fixed': fixed_price_policy(refs),
        'cost_plus_30%': cost_plus_margin_policy(costs, 0.3),
        'cost_plus_50%': cost_plus_margin_policy(costs, 0.5),
        'random_walk': random_walk_policy(refs, 0.03),
    }
    results = compare_policies(platform, policies, n_steps=100, n_runs=3)

    print("\nPolicy Comparison (100 steps, 3 runs):")
    print("-" * 50)
    # Best mean PnL first.
    ranked = sorted(results.items(), key=lambda x: -x[1]['mean_pnl'])
    for name, r in ranked:
        print(f"{name:20s} PnL={r['mean_pnl']:8.1f} +/- {r['std_reward']:6.1f} "
              f"conv={r['mean_conversion']:.3f}")
def demo_market_making():
    """Run one market-making rollout with a fixed-spread policy and print its metrics."""
    rule = "=" * 60
    print("\n" + rule)
    print("MARKET MAKING DEMO")
    print(rule)

    platform = make_market_making_platform()
    print(f"Instruments: {platform.instruments.n}")
    print(f"Initial mids: {platform.instruments.refs.round(2)}")

    def mm_policy(obs: np.ndarray, t: int):
        # Quotes at the reference mids with a constant second value of 1.0;
        # the observation is ignored (a real policy would use it).
        return platform.instruments.refs, 1.0

    result = rollout(platform, mm_policy, n_steps=200, seed=42)
    spread_total = sum(m.spread_capture for m in result.metrics)

    print(f"\nRollout (200 steps):")
    print(f" Total PnL: {result.total_pnl:.2f}")
    print(f" Avg conversion: {result.avg_conversion:.3f}")
    print(f" Total spread capture: {spread_total:.2f}")
if __name__ == '__main__':
    # Run the demos in order: retail first, then market making.
    for demo in (demo_retail, demo_market_making):
        demo()

2
sim/case/__init__.py Normal file
View File

@@ -0,0 +1,2 @@
"""Case-specific simulations and experiments."""

View File

@@ -0,0 +1,2 @@
"""Minimal thesis-aligned pricing simulation (self-contained)."""

View File

@@ -0,0 +1,125 @@
"""Cost of Information (COI) computation for thesis pricing system.
Core KPI: COI = E[p_shown] - p_min measures pricing power from information asymmetry.
Theorem 1 shows COI erodes as agent queries increase: as N->inf, p^(1)->p_min.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, List, TYPE_CHECKING
import numpy as np
if TYPE_CHECKING:
from .simplified import Session
@dataclass(frozen=True)
class COIWindow:
    """Immutable bundle of COI metrics for one window of realized price exposures.

    policy: E[p_shown] - cost, the definition-level KPI
    agent: E[p^(1)] - cost, where p^(1) is the minimum price under agent querying
    leak: max(policy - agent, 0), the observable gap caused by reconnaissance
    survival_ratio: agent/policy, the fraction of pricing power retained
    policy_by_product / agent_by_product: the per-product values behind
        ``policy`` and ``agent``
    demand_weights: per-product weights used to aggregate the per-product values
    """
    policy: float
    agent: float
    leak: float
    survival_ratio: float
    policy_by_product: np.ndarray
    agent_by_product: np.ndarray
    demand_weights: np.ndarray
def aggregate_prices(sessions: List["Session"], mode: str = "all") -> Dict[int, List[float] | float]:
    """Unified price aggregation across sessions.

    Args:
        sessions: Sessions whose ``events`` expose ``product_idx`` and ``price_seen``.
        mode: "all" returns every price seen per product,
            "min_per_session" returns one minimum per session per product,
            "min_across" returns a single minimum price per product.
            Any other value behaves like "all".

    Returns:
        Dict keyed by product index (always a plain ``int``); values are
        lists of floats, or a single float for "min_across".
    """
    if mode == "min_across":
        mins: Dict[int, float] = {}
        for s in sessions:
            for e in s.events:
                pidx, price = int(e.product_idx), float(e.price_seen)
                mins[pidx] = min(mins.get(pidx, price), price)
        return mins

    if mode == "min_per_session":
        result: Dict[int, List[float]] = {}
        for s in sessions:
            by_p: Dict[int, float] = {}
            for e in s.events:
                pidx, price = int(e.product_idx), float(e.price_seen)
                by_p[pidx] = min(by_p.get(pidx, price), price)
            for pidx, pmin in by_p.items():
                result.setdefault(pidx, []).append(pmin)
        return result

    # "all": normalise the key to int like the other modes so callers never
    # see numpy integer keys.
    prices: Dict[int, List[float]] = {}
    for s in sessions:
        for e in s.events:
            prices.setdefault(int(e.product_idx), []).append(float(e.price_seen))
    return prices
def demand_weights_by_product(sessions: List["Session"], demand_mapping: Dict[str, float], n_products: int) -> np.ndarray:
    """Compute demand-weighted importance per product.

    Each session's demand is credited to the product of its first event.
    Unknown session ids, sessions without events, and product indices outside
    ``[0, n_products)`` are ignored.

    Args:
        sessions: Sessions exposing ``sid`` and ``events``.
        demand_mapping: Demand quantity per session id.
        n_products: Length of the returned weight vector.

    Returns:
        Weights normalised to sum to 1, or all zeros when no demand was credited.
    """
    w = np.zeros(n_products, dtype=float)
    sessions_by_id = {s.sid: s for s in sessions}
    for sid, q in demand_mapping.items():
        sess = sessions_by_id.get(sid)
        if not (sess and sess.events):
            continue
        pidx = int(sess.events[0].product_idx)
        # Bounds guard: a negative index would silently wrap around to the
        # end of the array and an index >= n_products would raise.
        if 0 <= pidx < n_products:
            w[pidx] += float(q)
    total = float(np.sum(w))
    return (w / total) if total > 0 else w
def compute_coi_window(sessions: List["Session"], costs: np.ndarray, demand_mapping: Dict[str, float] | None = None) -> COIWindow:
    """Compute COI metrics for a window of sessions.

    Policy COI uses the mean price shown per product; agent COI uses the
    lowest price any agent session (``actor == "A"``) saw. Products are
    combined with demand weights when a non-empty ``demand_mapping`` yields
    positive weight, otherwise with a plain mean over the products seen.
    """
    n_products = int(len(costs))

    shown = aggregate_prices(sessions, mode="all")
    agent_sessions = [s for s in sessions if s.actor == "A"]
    if agent_sessions:
        agent_floor = aggregate_prices(agent_sessions, mode="min_across")
    else:
        agent_floor = {}

    policy_by = np.zeros(n_products, dtype=float)
    agent_by = np.zeros(n_products, dtype=float)
    seen = np.array([(i in shown) for i in range(n_products)], dtype=bool)
    agent_seen = np.array([(i in agent_floor) for i in range(n_products)], dtype=bool)

    # Per-product margins over cost; out-of-range product indices are skipped.
    for pidx, observed in shown.items():
        if 0 <= pidx < n_products and observed:
            policy_by[pidx] = float(np.mean(observed) - float(costs[pidx]))
    for pidx, lowest in agent_floor.items():
        if 0 <= pidx < n_products:
            agent_by[pidx] = float(lowest - float(costs[pidx]))

    # Products no agent looked at keep their full policy COI (no erosion).
    unprobed = seen & ~agent_seen
    agent_by[unprobed] = policy_by[unprobed]

    if demand_mapping:
        demand_w = demand_weights_by_product(sessions, demand_mapping, n_products)
    else:
        demand_w = np.zeros(n_products, dtype=float)

    if float(np.sum(demand_w)) > 0:
        policy = float(np.dot(demand_w, policy_by))
        agent = float(np.dot(demand_w, agent_by))
    elif np.any(seen):
        policy = float(np.mean(policy_by[seen]))
        agent = float(np.mean(agent_by[seen]))
    else:
        policy = 0.0
        agent = 0.0

    leak = float(max(policy - agent, 0.0))
    if policy > 0:
        survival = float(np.clip(agent / policy, 0.0, 1.0))
    else:
        survival = 0.0

    return COIWindow(policy=policy, agent=agent, leak=leak, survival_ratio=survival,
                     policy_by_product=policy_by, agent_by_product=agent_by, demand_weights=demand_w)
def coi_erosion(coi_policy: float, coi_agent: float, eps: float = 1e-9) -> float:
    """Fraction of pricing power destroyed by agent queries, in [0, 1].

    erosion = 1 - (COI_agent / COI_policy)

    As agents find lower prices COI_agent -> 0 and erosion -> 1. A policy COI
    at or below ``eps`` is treated as having nothing to erode and yields 0.0.
    """
    if coi_policy <= eps:
        return 0.0
    retained = coi_agent / (coi_policy + eps)
    eroded = 1.0 - retained
    return float(np.clip(eroded, 0.0, 1.0))

View File

@@ -0,0 +1,325 @@
"""COI leakage experiments and policy comparisons.
Demonstrates the core thesis contribution: COI erosion under agent contamination
and recovery via robust pricing policies.
Generates TensorBoard logs for:
- COI erosion curves across contamination levels
- Policy comparison (fixed vs adaptive vs RL)
- Revenue/margin trade-offs
"""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Tuple
import json
import numpy as np
try:
from torch.utils.tensorboard import SummaryWriter
HAS_TB = True
except ImportError:
HAS_TB = False
from .simplified_env import PricingEnv, EnvConfig, make_env
from .simplified import System
@dataclass
class ExperimentResult:
    """Metrics recorded for one (policy, contamination level) experiment run."""
    name: str
    alpha: float
    reward_mean: float
    reward_std: float
    coi_erosion: float
    alpha_error: float
    revenue: float
    margin: float

    def to_dict(self) -> dict:
        """Return the fields as a plain dict, in declaration order."""
        out = {}
        for field_name in self.__dataclass_fields__:
            out[field_name] = getattr(self, field_name)
        return out
def theoretical_coi_erosion_curve(alphas: np.ndarray, n_sessions: int = 1000) -> np.ndarray:
    """Theoretical COI erosion from Theorem 1 under the order-statistic model.

    For N i.i.d. uniform queries on [p_min, p_max]:
    E[p^(1)] = p_min + (p_max - p_min)/(N+1), so erosion = 1 - 2/(N+1).

    N is ``int(alpha * n_sessions)`` with a floor of one query, so alpha = 0
    maps to zero erosion.
    """
    return np.array([
        1.0 - 2.0 / (max(1, int(alpha * n_sessions)) + 1)
        for alpha in alphas
    ])
def run_policy_episode(
    env: PricingEnv,
    policy_fn,
    n_episodes: int = 10
) -> Tuple[List[float], List[float], List[float], List[float]]:
    """Roll a policy out for several episodes and gather per-step metrics.

    Returns four lists pooled over all steps of all episodes: rewards,
    COI erosion values, absolute alpha estimation errors, and revenues.
    An ``info``-derived value is recorded only on steps where its key(s)
    are present in the ``info`` dict returned by ``env.step``.
    """
    rewards: List[float] = []
    coi_erosions: List[float] = []
    alpha_errors: List[float] = []
    revenues: List[float] = []

    for _ in range(n_episodes):
        obs, _reset_info = env.reset()
        while True:
            obs, reward, terminated, truncated, info = env.step(policy_fn(obs, env.n))
            rewards.append(reward)
            if 'coi_erosion' in info:
                coi_erosions.append(info['coi_erosion'])
            if 'alpha_true' in info and 'alpha_est' in info:
                alpha_errors.append(abs(info['alpha_true'] - info['alpha_est']))
            if 'revenue' in info:
                revenues.append(info['revenue'])
            if terminated or truncated:
                break

    return rewards, coi_erosions, alpha_errors, revenues
class PolicyRegistry:
    """Registry of baseline policies.

    Every policy takes ``(obs, n)`` and returns a float32 array of ``n``
    action values. Policies that adapt to contamination read the alpha
    estimate from ``obs[2 * n]``.
    NOTE(review): the values look like per-product price multipliers —
    confirm against PricingEnv.
    """

    @staticmethod
    def _alpha_estimate(obs: np.ndarray, n: int, default: float = 0.2) -> float:
        """Return ``obs[2 * n]`` as a Python float, or ``default`` if ``obs`` is too short.

        Converting to a Python float keeps the returned action float32
        whatever the dtype of ``obs``; a numpy float64 scalar would promote
        the result to float64 under NumPy 2 promotion rules.
        """
        return float(obs[2 * n]) if len(obs) > 2 * n else default

    @staticmethod
    def fixed(obs: np.ndarray, n: int, margin: float = 0.15) -> np.ndarray:
        """Constant margin, independent of the observation."""
        return np.ones(n, dtype=np.float32) * (1.0 + margin)

    @staticmethod
    def random(obs: np.ndarray, n: int, rng: np.random.Generator | None = None) -> np.ndarray:
        """Uniform draws in [0.7, 1.3); a fresh generator is created when none is given."""
        rng = rng or np.random.default_rng()
        return rng.uniform(0.7, 1.3, n).astype(np.float32)

    @staticmethod
    def adaptive(obs: np.ndarray, n: int, base_margin: float = 0.15) -> np.ndarray:
        """Reduce margins when alpha estimate is high."""
        alpha_est = PolicyRegistry._alpha_estimate(obs, n)
        margin_scale = 1.0 - 0.4 * alpha_est
        return np.ones(n, dtype=np.float32) * (1.0 + base_margin * margin_scale)

    @staticmethod
    def aggressive(obs: np.ndarray, n: int) -> np.ndarray:
        """High margins, ignores contamination."""
        return np.ones(n, dtype=np.float32) * 1.4

    @staticmethod
    def defensive(obs: np.ndarray, n: int) -> np.ndarray:
        """Low margins, always cautious."""
        return np.ones(n, dtype=np.float32) * 1.05

    @staticmethod
    def alpha_proportional(obs: np.ndarray, n: int, max_margin: float = 0.3) -> np.ndarray:
        """Margin shrinks linearly with the estimated alpha: max_margin * (1 - alpha)."""
        alpha_est = PolicyRegistry._alpha_estimate(obs, n)
        margin = max_margin * (1.0 - alpha_est)
        return np.ones(n, dtype=np.float32) * (1.0 + margin)
def run_contamination_sweep(
    alphas: List[float],
    policies: Dict[str, callable],
    n_products: int = 10,
    max_steps: int = 200,
    n_episodes: int = 10,
    seed: int = 42,
    log_dir: str | None = None
) -> Dict[str, List[ExperimentResult]]:
    """Run policies across contamination levels.

    For every alpha one environment is built and each policy is evaluated on
    it for ``n_episodes`` episodes. When ``log_dir`` is given and TensorBoard
    is available, per-policy scalars and the theoretical erosion curve are
    logged under ``<log_dir>/sweep`` with step ``round(alpha * 100)``.

    Returns:
        Mapping of policy name to one ``ExperimentResult`` per alpha, in the
        order of ``alphas``.
    """
    results = {name: [] for name in policies}
    writer = SummaryWriter(Path(log_dir) / "sweep") if log_dir and HAS_TB else None
    try:
        for alpha in alphas:
            print(f" alpha={alpha:.2f}", end=" ")
            env_cfg = EnvConfig(
                n_products=n_products, max_steps=max_steps,
                alpha_true=alpha, reward_mode="robust", seed=seed)
            env = make_env(env_cfg)
            # round() rather than truncation: e.g. 0.29 * 100 evaluates to
            # 28.999..., which int() alone would log at step 28.
            step = int(round(alpha * 100))
            for name, policy_fn in policies.items():
                rewards, coi_vals, alpha_errs, revenues = run_policy_episode(env, policy_fn, n_episodes)
                result = ExperimentResult(
                    name=name, alpha=alpha,
                    reward_mean=float(np.mean(rewards)),
                    reward_std=float(np.std(rewards)),
                    coi_erosion=float(np.mean(coi_vals)) if coi_vals else 0.0,
                    alpha_error=float(np.mean(alpha_errs)) if alpha_errs else 0.0,
                    revenue=float(np.mean(revenues)) if revenues else 0.0,
                    # Nominal margin: the policy's mean output on an all-zero
                    # observation, minus 1.
                    margin=float(np.mean([policy_fn(np.zeros(3 * n_products + 3), n_products)]) - 1.0))
                results[name].append(result)
                if writer:
                    writer.add_scalar(f'{name}/reward', result.reward_mean, step)
                    writer.add_scalar(f'{name}/coi_erosion', result.coi_erosion, step)
                    writer.add_scalar(f'{name}/alpha_error', result.alpha_error, step)
                    writer.add_scalar(f'{name}/revenue', result.revenue, step)
            print("done")
        # add theoretical curve
        if writer:
            theo = theoretical_coi_erosion_curve(np.array(alphas))
            for a, e in zip(alphas, theo):
                writer.add_scalar('theoretical/coi_erosion', e, int(round(a * 100)))
    finally:
        # Close the writer even when an episode raises, so scalars logged so
        # far are flushed.
        if writer:
            writer.close()
    return results
def run_coi_demonstration(log_dir: str = "sim/case/thesis_simplified/runs", seed: int = 42) -> Dict:
    """Main COI demonstration experiment.

    Prints the theoretical erosion curve, sweeps the baseline policies across
    contamination levels, prints a per-policy summary and writes everything
    to ``<log_dir>/coi_demo_results.json``. Scalars are also logged to
    TensorBoard when it is available.

    Returns:
        Dict with 'theoretical' (alphas, erosion) and 'empirical'
        (per-policy lists of result dicts).
    """
    print("=== COI Leakage Demonstration ===\n")
    Path(log_dir).mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(Path(log_dir) / "coi_demo") if HAS_TB else None
    try:
        # theoretical erosion curve
        print("1. Theoretical COI erosion (Theorem 1)")
        alphas = np.linspace(0.0, 0.6, 13)
        theo_erosion = theoretical_coi_erosion_curve(alphas, n_sessions=1000)
        for a, e in zip(alphas, theo_erosion):
            print(f" alpha={a:.2f} -> erosion={e:.3f}")
            if writer:
                # round(): linspace values such as 0.35 may sit just below
                # the exact decimal, and truncation would shift the step.
                writer.add_scalar('theory/coi_erosion', e, int(round(a * 100)))

        # policy comparison
        print("\n2. Policy comparison across contamination levels")
        policies = {
            'fixed': PolicyRegistry.fixed,
            'aggressive': PolicyRegistry.aggressive,
            'defensive': PolicyRegistry.defensive,
            'adaptive': PolicyRegistry.adaptive,
            'alpha_proportional': PolicyRegistry.alpha_proportional,
        }
        sweep_alphas = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
        results = run_contamination_sweep(
            sweep_alphas, policies, n_products=10, max_steps=100,
            n_episodes=5, seed=seed, log_dir=log_dir)

        # summarize
        print("\n3. Summary by policy")
        for name, res_list in results.items():
            avg_reward = np.mean([r.reward_mean for r in res_list])
            avg_coi = np.mean([r.coi_erosion for r in res_list])
            print(f" {name:20s}: avg_reward={avg_reward:.2f}, avg_coi={avg_coi:.3f}")

        # save results
        output = {
            'theoretical': {'alphas': alphas.tolist(), 'erosion': theo_erosion.tolist()},
            'empirical': {name: [r.to_dict() for r in res_list] for name, res_list in results.items()}}
        with open(Path(log_dir) / "coi_demo_results.json", 'w') as f:
            json.dump(output, f, indent=2)
    finally:
        # Close the writer even when the sweep or the JSON dump fails.
        if writer:
            writer.close()

    print(f"\nResults saved to {log_dir}/coi_demo_results.json")
    print(f"TensorBoard: tensorboard --logdir {log_dir}")
    return output
def run_reward_mode_comparison(log_dir: str = "sim/case/thesis_simplified/runs", seed: int = 42) -> Dict:
    """Compare different reward modes.

    Evaluates the adaptive policy at alpha = 0.3 under each reward mode,
    logs the metrics to TensorBoard when available and writes them to
    ``<log_dir>/reward_mode_results.json``.

    Returns:
        Mapping of reward mode to its metric dict.
    """
    print("=== Reward Mode Comparison ===\n")
    Path(log_dir).mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(Path(log_dir) / "reward_modes") if HAS_TB else None
    reward_modes = ["revenue", "profit", "robust", "coi_aware"]
    alpha = 0.3  # moderate contamination
    results = {}
    try:
        for mode in reward_modes:
            print(f" mode={mode}", end=" ")
            env_cfg = EnvConfig(
                n_products=10, max_steps=200, alpha_true=alpha,
                reward_mode=mode, seed=seed)
            env = make_env(env_cfg)
            rewards, coi_vals, _, revenues = run_policy_episode(
                env, PolicyRegistry.adaptive, n_episodes=10)
            metrics = {
                'reward_mean': float(np.mean(rewards)),
                'reward_std': float(np.std(rewards)),
                'coi_erosion': float(np.mean(coi_vals)) if coi_vals else 0.0,
                'revenue': float(np.mean(revenues)) if revenues else 0.0}
            results[mode] = metrics
            if writer:
                for k, v in metrics.items():
                    writer.add_scalar(f'{mode}/{k}', v, 0)
            print(f"reward={metrics['reward_mean']:.2f}, coi={metrics['coi_erosion']:.3f}")
    finally:
        # Close the writer even when an episode raises.
        if writer:
            writer.close()
    with open(Path(log_dir) / "reward_mode_results.json", 'w') as f:
        json.dump(results, f, indent=2)
    return results
def run_alpha_drift_experiment(log_dir: str = "sim/case/thesis_simplified/runs", seed: int = 42) -> Dict:
    """Test policy robustness under non-stationary contamination.

    Evaluates the adaptive policy for several alpha drift rates, starting
    from alpha = 0.2, and logs the metrics to TensorBoard when available.

    Returns:
        Mapping of ``'drift_<rate>'`` to its metric dict.
    """
    print("=== Alpha Drift Experiment ===\n")
    Path(log_dir).mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(Path(log_dir) / "alpha_drift") if HAS_TB else None
    drift_rates = [0.0, 0.01, 0.02, 0.05]
    results = {}
    try:
        for drift in drift_rates:
            print(f" drift={drift:.2f}", end=" ")
            env_cfg = EnvConfig(
                n_products=10, max_steps=200, alpha_true=0.2,
                alpha_drift=drift, reward_mode="robust", seed=seed)
            env = make_env(env_cfg)
            rewards, coi_vals, alpha_errs, _ = run_policy_episode(
                env, PolicyRegistry.adaptive, n_episodes=10)
            key = f'drift_{drift}'
            metrics = {
                'reward_mean': float(np.mean(rewards)),
                'coi_erosion': float(np.mean(coi_vals)) if coi_vals else 0.0,
                'alpha_tracking_error': float(np.mean(alpha_errs)) if alpha_errs else 0.0}
            results[key] = metrics
            if writer:
                for k, v in metrics.items():
                    writer.add_scalar(f'{key}/{k}', v, 0)
            print(f"reward={metrics['reward_mean']:.2f}, "
                  f"alpha_err={metrics['alpha_tracking_error']:.3f}")
    finally:
        # Close the writer even when an episode raises.
        if writer:
            writer.close()
    return results
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(description="Run COI experiments")
    cli.add_argument("--exp", type=str, default="coi", choices=["coi", "reward", "drift", "all"])
    cli.add_argument("--log-dir", type=str, default="sim/case/thesis_simplified/runs")
    cli.add_argument("--seed", type=int, default=42)
    opts = cli.parse_args()

    # Experiments in execution order; "all" selects every one of them.
    experiments = (
        ("coi", run_coi_demonstration),
        ("reward", run_reward_mode_comparison),
        ("drift", run_alpha_drift_experiment),
    )
    for label, runner in experiments:
        if opts.exp in (label, "all"):
            runner(opts.log_dir, opts.seed)

View File

@@ -0,0 +1,72 @@
"""Behavioral separability for human/agent detection.
Computes divergence signals delta_H, delta_A from session trajectories using
transition kernel estimation and KL divergence to prototype behavioral profiles.
"""
from __future__ import annotations
from typing import Dict, List, Tuple, TYPE_CHECKING
import numpy as np
if TYPE_CHECKING:
from .simplified import Event, Session
# Prototype behavioral kernels for human (TRANS_H) vs agent (TRANS_A) sessions.
# Each kernel maps a current state to {next state: transition probability};
# the probabilities in every row sum to 1. "start" is the implicit initial
# state and "end" terminates a session.
TRANS_H = {
    "start": {"view": 0.85, "end": 0.15},
    "view": {"detail": 0.4, "cart": 0.3, "view": 0.2, "end": 0.1},
    "detail": {"cart": 0.5, "view": 0.3, "end": 0.2},
    "cart": {"purchase": 0.6, "view": 0.25, "end": 0.15},
    "purchase": {"end": 1.0},
}
# Compared with TRANS_H the agent rows put more mass on view/detail browsing
# and less on cart -> purchase.
TRANS_A = {
    "start": {"view": 0.95, "end": 0.05},
    "view": {"detail": 0.6, "view": 0.25, "cart": 0.1, "end": 0.05},
    "detail": {"view": 0.5, "cart": 0.15, "detail": 0.3, "end": 0.05},
    "cart": {"view": 0.4, "purchase": 0.2, "end": 0.4},
    "purchase": {"end": 1.0},
}
def kl_div(p: Dict[str, float], q: Dict[str, float], eps: float = 1e-10) -> float:
    """Return the smoothed KL divergence D_KL(p || q) of two discrete distributions.

    The sum runs over the union of both key sets. A key missing from either
    mapping is treated as having probability ``eps``, and ``eps`` is added to
    numerator and denominator inside the logarithm to avoid log(0) and
    division by zero.

    Args:
        p: Mapping outcome -> probability (left argument of the divergence).
        q: Mapping outcome -> probability (reference distribution).
        eps: Smoothing constant.

    Returns:
        The accumulated divergence (``0`` when both mappings are empty).
    """
    total = 0
    for outcome in set(p.keys()) | set(q.keys()):
        p_k = p.get(outcome, eps)
        q_k = q.get(outcome, eps)
        total += p_k * np.log((p_k + eps) / (q_k + eps))
    return total
def build_kernel(events: List["Event"]) -> Dict[str, Dict[str, float]]:
    """Estimate the empirical transition kernel T' of one trajectory.

    The walk starts in the implicit state ``"start"``; every event contributes
    one observed transition from the previous action to ``event.action``.
    No transition into ``"end"`` is added after the last event.

    Args:
        events: Ordered events of a session; only ``.action`` is read.

    Returns:
        Mapping state -> {next state: relative frequency}; empty for no events.
    """
    counts: Dict[str, Dict[str, int]] = {}
    src = "start"
    for event in events:
        dst = event.action
        row = counts.setdefault(src, {})
        row[dst] = row.get(dst, 0) + 1
        src = dst

    kernel = {}
    for state, row in counts.items():
        total = sum(row.values())
        if total > 0:
            kernel[state] = {dst: n / total for dst, n in row.items()}
    return kernel
def compute_divergence(session: "Session") -> Tuple[float, float]:
    """Compute the divergence signals (delta_H, delta_A) of a session.

    delta_H is the mean of KL(T' || T_H) over the states present in the
    session's empirical kernel T'; delta_A is the same against T_A. States
    absent from a prototype are compared with an empty row.

    Returns:
        ``(delta_H, delta_A)``, or ``(0.5, 0.5)`` when the session yields no
        transitions.
    """
    kernel = build_kernel(session.events)
    if not kernel:
        return 0.5, 0.5

    n_states = len(kernel)
    total_h = 0
    total_a = 0
    for state, row in kernel.items():
        total_h += kl_div(row, TRANS_H.get(state, {}))
        total_a += kl_div(row, TRANS_A.get(state, {}))
    return total_h / n_states, total_a / n_states
def estimate_alpha(session: "Session", beta: float = 2.0) -> float:
    """Per-session contamination estimate alpha_hat = sigma(beta * (delta_H - delta_A)).

    A session further from the human prototype than from the agent prototype
    (delta_H > delta_A) scores above 0.5.

    Args:
        session: Session whose events are scored.
        beta: Steepness of the logistic squashing.

    Returns:
        Value in [0, 1]; 0.5 when the two divergences do not sum to a
        positive number.
    """
    delta_h, delta_a = compute_divergence(session)
    if (delta_h + delta_a) <= 0:
        return 0.5
    logit = beta * (delta_h - delta_a)
    return 1.0 / (1.0 + np.exp(-logit))

View File

@@ -0,0 +1,219 @@
"""Minimal implementation of thesis pricing system.
Implements the core loop: prices -> sessions -> demand -> prices
with behavioral separability and robust pricing objective.
Objects:
- Session trajectories tau_s from mixture of H/A behavioral profiles
- Demand proxy q_hat via weighted action aggregation
- COI leakage penalty for agent reconnaissance
- Limbo: alternating price/demand history for trajectory analysis
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Dict, List, Tuple
import numpy as np
from .coi import COIWindow, compute_coi_window
from .separability import TRANS_H, TRANS_A, kl_div, build_kernel, compute_divergence, estimate_alpha
# Demand-proxy weight omega(a) per action; actions not listed here fall back
# to 0.1 at the lookup sites in this module.
# NOTE(review): the prototype kernels in separability.py emit the action
# "cart", which is not a key here ("add_to_cart" is), so "cart" events get the
# 0.1 fallback -- confirm whether "cart" was meant to carry the 0.8 weight.
ACTION_WEIGHTS = {"add_to_cart": 0.8, "checkout": 0.9, "purchase": 1.0, "view": 0.15, "detail": 0.25, "hover": 0.3, "start": 0.05, "end": 0.0}
@dataclass
class Event:
    """Single step of a session trajectory."""
    action: str  # state entered at this step (e.g. "view", "detail", "cart", "purchase")
    product_idx: int  # index of the product the session is looking at
    price_seen: float  # price shown to the session when the action happened
    ts: float  # session-relative timestamp of the action
@dataclass
class Session:
    """One simulated visitor session and its ground-truth label."""
    sid: str  # session identifier
    events: List[Event]  # ordered trajectory of the session
    actor: str  # H or A (ground truth label)
    theta: Dict[str, float] = field(default_factory=dict)  # behavior parameters, e.g. "price_sens", "base_conv"
def compute_demand(session: Session) -> float:
    """Return the demand proxy q_hat of a session.

    q_hat is the sum of the action weights omega(a_k) over all events; an
    action without an entry in ``ACTION_WEIGHTS`` counts as 0.1.
    """
    q_hat = 0
    for event in session.events:
        q_hat += ACTION_WEIGHTS.get(event.action, 0.1)
    return q_hat
def sample_trajectory(rng: np.random.Generator, trans: Dict, prices: np.ndarray, costs: np.ndarray, theta: Dict[str, float],
                      is_agent: bool, session_noise: float = 0.02, surge: float = 0.08, max_mult: float = 1.8) -> Tuple[List[Event], int]:
    """Sample one session trajectory from a behavioral kernel.

    Args:
        rng: Random generator; all randomness of the session is drawn from it.
        trans: Transition kernel, state -> {next state: probability}.
        prices: Posted price per product.
        costs: Unit cost per product.
        theta: Behavior parameters; "base_conv" (default 0.2) and "price_sens"
            (default 2.0) drive the purchase acceptance probability.
        is_agent: Selects the inter-event time distribution.
        session_noise: Std-dev of the multiplicative noise on the base price.
        surge: Relative price increase per unit of accumulated action weight.
        max_mult: Upper cap of the surged price as a multiple of the base price.

    Returns:
        ``(events, product_idx)``: the recorded events (at most 30) and the
        index of the single product the session looked at.
    """
    # each session looks at exactly one uniformly chosen product
    pidx = int(rng.integers(0, len(prices)))
    cost, base = float(costs[pidx]), float(prices[pidx]) * (1.0 + rng.normal(0.0, session_noise))
    # keep the session's base price above cost and below twice the posted price
    base = float(np.clip(base, cost * 1.01, float(prices[pidx]) * 2.0))
    price, signal, state, t = base, 0.0, "start", 0.0
    events = []
    while state != "end" and len(events) < 30:
        # states without a kernel row terminate the session
        probs = trans.get(state, {"end": 1.0})
        nxt = rng.choice(list(probs.keys()), p=list(probs.values()))
        if nxt == "purchase":  # purchase conversion check
            # relative markup over cost, floored at zero
            rel = max((price - cost) / (cost + 1e-6), 0.0)
            p_buy = float(np.clip(theta.get("base_conv", 0.2) * np.exp(-theta.get("price_sens", 2.0) * rel), 0.0, 1.0))
            # a rejected purchase ends the session instead
            if rng.random() > p_buy:
                nxt = "end"
        state = nxt
        if state not in {"start", "end"}:
            events.append(Event(action=state, product_idx=pidx, price_seen=float(price), ts=t))
            # accumulated interest raises the price shown on later steps
            signal += float(ACTION_WEIGHTS.get(state, 0.1))
            price = float(np.clip(base * (1.0 + surge * signal), cost * 1.01, base * max_mult))
            # gamma-distributed dwell time, at least 0.2; agents use the shorter-scale distribution
            t += max(0.2, rng.gamma(1.5, 0.8) if is_agent else rng.gamma(2.0, 1.2))
    return events, pidx
def put_prices_to_market(prices: np.ndarray, costs: np.ndarray, alpha: float = 0.2, n_sessions: int = 50,
                         seed: int | None = None) -> Tuple[List[Session], Dict[str, float]]:
    """Simulate a batch of sessions from the human/agent mixture.

    Each session is an agent with probability ``alpha``. Agents follow
    ``TRANS_A`` with low price sensitivity and a 1% base conversion; humans
    follow ``TRANS_H`` with sampled price sensitivity and base conversion.

    Args:
        prices: Posted price per product.
        costs: Unit cost per product.
        alpha: Probability that a session is agent-generated.
        n_sessions: Number of sessions to generate.
        seed: Seed of the generator used for the whole batch.

    Returns:
        ``(sessions, demand)`` where ``demand`` maps sid -> q_hat. Session ids
        are ``s0000``, ``s0001``, ... and restart on every call.
    """
    rng = np.random.default_rng(seed)
    sessions: List[Session] = []
    demand: Dict[str, float] = {}
    for index in range(n_sessions):
        is_agent = rng.random() < alpha
        if is_agent:
            kernel, label = TRANS_A, "A"
            theta = {"price_sens": rng.uniform(0.05, 0.2), "base_conv": 0.01}
        else:
            kernel, label = TRANS_H, "H"
            theta = {"price_sens": rng.uniform(1.5, 4.0), "base_conv": rng.uniform(0.2, 0.5)}
        events, _ = sample_trajectory(rng, kernel, prices, costs=costs, theta=theta, is_agent=is_agent)
        session = Session(sid=f"s{index:04d}", events=events, actor=label, theta=theta)
        sessions.append(session)
        demand[session.sid] = compute_demand(session)
    return sessions, demand
@dataclass
class LimboUpdate:
    """One entry of the Limbo history."""
    utype: str  # "prices" or "demand"
    data: np.ndarray | Dict[str, float]  # price vector for "prices", sid -> q_hat mapping for "demand"
    t: int  # position of the update in the history (0-based counter)
class Limbo:
    """Append-only log of price and demand observations.

    Every update is stamped with a running counter so the interleaving of
    price and demand updates can be reconstructed later.
    """

    def __init__(self):
        self.history: List[LimboUpdate] = []
        self._t = 0

    def add_update(self, utype: str, data: np.ndarray | Dict[str, float]) -> Dict:
        """Record an update and return the action expected next.

        After a ``"prices"`` update the next action is ``"observe_demand"``;
        after any other update it is ``"set_prices"``.
        """
        entry = LimboUpdate(utype=utype, data=data, t=self._t)
        self.history.append(entry)
        self._t += 1
        if utype == "prices":
            next_action = "observe_demand"
        else:
            next_action = "set_prices"
        return {"action": next_action}

    def _payloads(self, utype: str) -> list:
        """Return the data of all updates of the given type, oldest first."""
        selected = []
        for update in self.history:
            if update.utype == utype:
                selected.append(update.data)
        return selected

    def get_prices_history(self) -> List[np.ndarray]:
        """Return every recorded price vector in insertion order."""
        return self._payloads("prices")

    def get_demand_history(self) -> List[Dict[str, float]]:
        """Return every recorded demand mapping in insertion order."""
        return self._payloads("demand")
class System:
    """Main pricing system implementing robust Stackelberg objective.

    Manages the alternating loop: set prices p_t -> observe demand Q_hat(p_t) ->
    estimate contamination alpha from behavioral signals -> compute next prices.

    Attributes:
        n: Number of products.
        rng: Generator driving costs, reference prices, price noise and market seeds.
        costs: Unit cost per product.
        refs: Reference price per product (cost plus a 20-50% markup).
        lambda_coi: Weight of the COI leakage penalty in the objective.
        limbo: History of price/demand updates.
    """

    def __init__(self, n_products: int = 10, costs: np.ndarray | None = None, lambda_coi: float = 0.5, seed: int | None = 42):
        self.n = n_products
        self.rng = np.random.default_rng(seed)
        self.costs = costs if costs is not None else self.rng.uniform(10, 50, n_products)
        self.refs = self.costs * (1 + self.rng.uniform(0.2, 0.5, n_products))
        self.lambda_coi = lambda_coi
        self.limbo = Limbo()
        self._alpha_est = 0.2
        # every session ever observed (grows with each observe_demand call)
        self._sessions: List[Session] = []
        # sessions of the most recent observe_demand call only
        self._last_sessions: List[Session] = []
        self._last_coi: COIWindow | None = None

    @property
    def alpha(self) -> float:
        """Current contamination estimate alpha_hat."""
        return self._alpha_est

    def _estimate_alpha_from_sessions(self) -> float:
        """Average the per-session estimates over the 50 most recent sessions."""
        if not self._sessions:
            return self._alpha_est
        return float(np.mean([estimate_alpha(s) for s in self._sessions[-50:]]))

    def _revenue_under_demand(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
        """Return the price-weighted demand proxy sum_j p_j * q_j.

        Each session's q_hat is attributed to the product of its first event;
        sessions without events contribute nothing.
        """
        # Session ids restart at s0000 on every market call, so an id is only
        # unique within one batch. Resolve ids against the batch that produced
        # the demand map; searching the full history would return a stale
        # session from the first step.
        by_sid = {s.sid: s for s in (self._last_sessions or self._sessions)}
        agg = np.zeros(self.n)
        for sid, q in demand.items():
            sess = by_sid.get(sid)
            if sess is not None and sess.events:
                agg[sess.events[0].product_idx] += q
        return float(np.dot(prices, agg))

    def _compute_coi_window(self, demand: Dict[str, float]) -> COIWindow:
        """Compute the COI window of the latest batch (all zeros before any batch)."""
        if not self._last_sessions:
            zeros = np.zeros(self.n, dtype=float)
            return COIWindow(policy=0.0, agent=0.0, leak=0.0, survival_ratio=0.0,
                             policy_by_product=zeros, agent_by_product=zeros, demand_weights=zeros)
        return compute_coi_window(self._last_sessions, self.costs, demand_mapping=demand)

    def _objective(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
        """Robust objective: R(p,d) - lambda * COI_leak.

        The sum of all unit costs is subtracted from the revenue proxy once,
        independent of the demand. Also refreshes ``self._last_coi``.
        """
        profit = self._revenue_under_demand(prices, demand) - float(np.sum(self.costs))
        self._last_coi = self._compute_coi_window(demand)
        return profit - self.lambda_coi * self._last_coi.leak

    def compute_prices(self, demand: Dict[str, float] | None = None) -> np.ndarray:
        """Compute next prices via heuristic margin adjustment based on alpha estimate.

        ``demand`` is accepted for interface symmetry but is not used by the
        heuristic. The resulting prices are recorded in the limbo history.
        """
        self._alpha_est = self._estimate_alpha_from_sessions()
        margin_scale = 1.0 - 0.5 * self._alpha_est  # defensive pricing under high contamination
        margins = (self.refs - self.costs) * margin_scale
        noise = self.rng.normal(0, 0.02, self.n) * self.costs
        prices = np.clip(self.costs + margins + noise, self.costs * 1.02, self.refs * 1.3)
        self.limbo.add_update("prices", prices)
        return prices

    def observe_demand(self, prices: np.ndarray, alpha_true: float = 0.2, n_sessions: int = 50) -> Dict[str, float]:
        """Simulate one market batch at ``prices`` and record its demand map.

        Returns:
            Mapping sid -> q_hat for the new batch.
        """
        sessions, demand_map = put_prices_to_market(prices, costs=self.costs, alpha=alpha_true,
                                                    n_sessions=n_sessions, seed=int(self.rng.integers(0, 10000)))
        self._last_sessions = sessions
        self._sessions.extend(sessions)
        self.limbo.add_update("demand", demand_map)
        return demand_map

    def step(self, alpha_true: float = 0.2, n_sessions: int = 50) -> Tuple[np.ndarray, Dict[str, float], float, COIWindow]:
        """Run one price -> demand -> reward iteration.

        Returns:
            ``(prices, demand, reward, coi_window)``.
        """
        demand_hist = self.limbo.get_demand_history()
        prices = self.compute_prices(demand_hist[-1] if demand_hist else None)
        demand = self.observe_demand(prices, alpha_true, n_sessions)
        reward = self._objective(prices, demand)
        # _objective stores the window; test for None explicitly rather than
        # relying on the truthiness of the window object
        coi = self._last_coi if self._last_coi is not None else self._compute_coi_window(demand)
        return prices, demand, reward, coi

    def run(self, n_steps: int = 100, alpha_true: float = 0.2) -> Dict:
        """Run ``n_steps`` iterations and collect per-step series.

        Returns:
            Dict with list-valued series ("prices", "demand", "rewards",
            "alpha_est", "coi_policy", "coi_agent", "coi_leak",
            "coi_survival") plus the scalar "alpha_true".
        """
        traj = {"prices": [], "demand": [], "rewards": [], "alpha_est": [], "alpha_true": alpha_true,
                "coi_policy": [], "coi_agent": [], "coi_leak": [], "coi_survival": []}
        for _ in range(n_steps):
            p, d, r, coi = self.step(alpha_true)
            traj["prices"].append(p)
            traj["demand"].append(d)
            traj["rewards"].append(r)
            traj["alpha_est"].append(self._alpha_est)
            traj["coi_policy"].append(coi.policy)
            traj["coi_agent"].append(coi.agent)
            traj["coi_leak"].append(coi.leak)
            traj["coi_survival"].append(coi.survival_ratio)
        return traj
if __name__ == "__main__":
    # 1) closed-loop run of the heuristic pricing system
    system = System(n_products=5, seed=42)
    traj = system.run(n_steps=20, alpha_true=0.25)
    print(f"avg reward: {np.mean(traj['rewards']):.2f}, final alpha_hat: {traj['alpha_est'][-1]:.3f}, "
          f"COI_policy: {np.mean(traj['coi_policy']):.3f}, COI_agent: {np.mean(traj['coi_agent']):.3f}, leak: {np.mean(traj['coi_leak']):.3f}")

    # 2) one market batch at fixed prices
    prices = np.array([20.0, 35.0, 50.0, 25.0, 40.0])
    costs = np.array([15.0, 28.0, 40.0, 18.0, 30.0])
    sessions, demand = put_prices_to_market(prices, costs=costs, alpha=0.3, n_sessions=20, seed=123)
    print(f'sessions: {len(sessions)}, agents: {sum(1 for s in sessions if s.actor=="A")}')

    # 3) theoretical: erosion = 1 - 2/(N+1) for uniform order statistic
    for n in [1, 5, 10, 50, 100]:
        print(f'N={n:3d} agents -> COI erosion: {1.0 - 2.0/(n+1):.3f}')

    # 4) alpha_hat on two hand-written sessions
    human_steps = (('view', 0.1), ('detail', 0.5), ('cart', 1.0), ('purchase', 2.0))
    events = [Event(action, 0, 20.0, ts) for action, ts in human_steps]
    print(f'human-like session alpha_hat: {estimate_alpha(Session(sid="test", events=events, actor="H")):.3f}')
    agent_steps = (('view', 0.1), ('detail', 0.2), ('view', 0.3), ('detail', 0.4))
    events_a = [Event(action, 0, 20.0, ts) for action, ts in agent_steps]
    print(f'agent-like session alpha_hat: {estimate_alpha(Session(sid="test2", events=events_a, actor="A")):.3f}')

View File

@@ -0,0 +1,249 @@
"""Gymnasium-compatible RL environment for thesis pricing system.
Wraps simplified.System with standard Gym interface for training pricing policies.
Supports multiple reward modes and contamination scenarios.
Action: price multipliers [0.5, 1.5] applied to reference prices
Observation: [prices / refs, demand shares, alpha_est, alpha_true, margins, t / max_steps]
Reward: configurable objective (revenue, profit, robust, coi-aware)
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Any, Dict, Tuple
import numpy as np
try:
import gymnasium as gym
from gymnasium import spaces
HAS_GYM = True
except ImportError:
HAS_GYM = False
from .simplified import System, Session, Event, Limbo, put_prices_to_market, compute_demand, estimate_alpha
from .coi import COIWindow, compute_coi_window, coi_erosion
@dataclass
class EnvConfig:
    """Configuration of a pricing environment."""
    n_products: int = 5  # number of products / action dimensions
    max_steps: int = 200  # episode terminates once this many steps were taken
    sessions_per_step: int = 30  # market sessions simulated per environment step
    alpha_true: float = 0.2  # true contamination level at the start of an episode
    alpha_drift: float = 0.0  # NOTE(review): not read by the environments in this module -- confirm where drift is applied
    alpha_bounds: Tuple[float, float] = (0.0, 0.6)  # clipping range used when the contamination level is updated
    lambda_coi: float = 0.5  # weight of the COI leakage penalty
    lambda_vol: float = 0.1  # weight of the price-volatility penalty
    reward_mode: str = "robust"  # revenue | profit | robust | coi_aware
    normalize_reward: bool = True  # divide the reward by the sum of reference prices
    seed: int | None = 42  # default seed used by reset() when none is passed
def aggregate_purchases(sessions: list[Session], n_products: int, costs: np.ndarray) -> Tuple[np.ndarray, float, float]:
    """Tally completed purchases over a batch of sessions.

    Only events with action ``"purchase"`` and a product index inside
    ``[0, n_products)`` are counted.

    Args:
        sessions: Sessions whose events are scanned.
        n_products: Number of products (length of the returned count vector).
        costs: Unit cost per product.

    Returns:
        ``(counts, revenue, cost)``: purchases per product, the sum of the
        prices seen at purchase time, and the sum of the purchased units' costs.
    """
    counts = np.zeros(n_products, dtype=float)
    total_revenue = 0.0
    total_cost = 0.0
    for session in sessions:
        for event in session.events:
            if event.action != "purchase":
                continue
            idx = event.product_idx
            if not (0 <= idx < n_products):
                continue
            counts[idx] += 1.0
            total_revenue += float(event.price_seen)
            total_cost += float(costs[idx])
    return counts, total_revenue, total_cost
class PricingEnv(gym.Env if HAS_GYM else object):
    """RL environment for dynamic pricing under agent contamination.

    Platform sets prices p_t, market responds with mixture demand Q(p) = (1-alpha)*D_H + alpha*D_A.
    Agent estimates contamination alpha_hat from behavioral signals.
    Reward balances profit vs COI leakage.

    Action: one price multiplier per product, clipped to [0.5, 1.5] and applied
    to the reference prices.
    Observation (length 3 * n_products + 3): prices / refs, demand shares,
    alpha_hat, alpha_true, relative margins, t / max_steps.
    """
    metadata = {"render_modes": ["human", "ansi"]}

    def __init__(self, cfg: EnvConfig | None = None):
        """Create the environment; raises ImportError when gymnasium is missing."""
        if not HAS_GYM:
            raise ImportError("gymnasium required")
        self.cfg = cfg or EnvConfig()
        self.n = self.cfg.n_products
        self._sys: System | None = None
        self._t = 0
        self._alpha = self.cfg.alpha_true
        self._last_prices: np.ndarray | None = None
        self._last_demand: Dict[str, float] | None = None
        self._episode_rewards: list[float] = []
        self._demand_agg = np.zeros(self.n)
        self.action_space = spaces.Box(low=0.5, high=1.5, shape=(self.n,), dtype=np.float32)
        obs_dim = self.n + self.n + 1 + 1 + self.n + 1  # prices + demand + alpha_hat + alpha + margins + t
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float32)

    def _build_obs(self) -> np.ndarray:
        """Assemble the float32 observation vector (all zeros before reset)."""
        if self._sys is None:
            return np.zeros(self.observation_space.shape[0], dtype=np.float32)
        # before the first step the reference prices stand in for posted prices
        prices = self._last_prices if self._last_prices is not None else self._sys.refs
        return np.concatenate([
            prices / (self._sys.refs + 1e-6),
            self._demand_agg / (np.sum(self._demand_agg) + 1e-6),
            [self._sys.alpha, self._alpha],
            (prices - self._sys.costs) / (self._sys.costs + 1e-6),
            [self._t / self.cfg.max_steps],
        ]).astype(np.float32)

    def _compute_reward(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
        """Compute the step reward for the configured ``reward_mode``.

        Also refreshes ``self._demand_agg`` with the per-product demand proxy.
        An unknown reward mode falls back to plain profit.
        """
        cfg, system = self.cfg, self._sys
        if system is None:
            return 0.0
        # aggregate demand per product
        # Session ids restart at s0000 on every market call, so they must be
        # resolved against the batch that produced `demand`; a search through
        # the whole session history would hit a stale session of an earlier step.
        by_sid = {s.sid: s for s in system._last_sessions}
        agg = np.zeros(self.n)
        for sid, q in demand.items():
            sess = by_sid.get(sid)
            if sess is not None and sess.events:
                agg[sess.events[0].product_idx] += q
        self._demand_agg = agg
        _, revenue, cost = aggregate_purchases(system._last_sessions, self.n, system.costs)
        profit = revenue - cost
        # penalize the mean relative price change against the previous step
        vol_penalty = 0.0
        if self._last_prices is not None:
            vol_penalty = cfg.lambda_vol * float(np.mean(np.abs(prices - self._last_prices) / (system.refs + 1e-6)))
        coi = compute_coi_window(system._last_sessions, system.costs, demand_mapping=demand)
        leak = float(coi.leak)
        reward_fns = {
            "revenue": lambda: revenue,
            "profit": lambda: profit,
            "robust": lambda: profit - cfg.lambda_coi * leak - vol_penalty,
            "coi_aware": lambda: profit - cfg.lambda_coi * (1 + 2 * system.alpha) * leak - vol_penalty,
        }
        r = reward_fns.get(cfg.reward_mode, lambda: profit)()
        return float(r / (float(np.sum(system.refs)) + 1e-6)) if cfg.normalize_reward else float(r)

    def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
        """Start a new episode with a freshly built ``System``.

        When ``seed`` is None the configured ``cfg.seed`` is used, so repeated
        resets without an explicit seed rebuild the system from the same seed.

        Returns:
            ``(observation, info)`` with the initial alpha values, costs and
            reference prices in ``info``.
        """
        seed = seed if seed is not None else self.cfg.seed
        # let gymnasium seed its own np_random as the Env API expects
        super().reset(seed=seed)
        self._sys = System(n_products=self.n, lambda_coi=self.cfg.lambda_coi, seed=seed)
        self._t, self._alpha = 0, self.cfg.alpha_true
        self._last_prices, self._last_demand = None, None
        self._episode_rewards, self._demand_agg = [], np.zeros(self.n)
        return self._build_obs(), {"alpha_true": self._alpha, "alpha_est": self._sys.alpha,
                                   "costs": self._sys.costs.copy(), "refs": self._sys.refs.copy()}

    def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]:
        """Post prices, simulate one market batch and return the transition.

        Returns:
            ``(obs, reward, terminated, truncated, info)``; ``terminated`` is
            True once ``max_steps`` steps were taken, ``truncated`` is always False.

        Raises:
            RuntimeError: If called before ``reset()``.
        """
        if self._sys is None:
            raise RuntimeError("call reset() first")
        action = np.clip(action, 0.5, 1.5)
        prices = np.clip(self._sys.refs * action.astype(np.float64), self._sys.costs * 1.01, self._sys.refs * 2.0)
        # log the posted prices before the market responds so the limbo history
        # alternates prices -> demand, the same order System.step produces
        self._sys.limbo.add_update("prices", prices)
        demand = self._sys.observe_demand(prices, alpha_true=self._alpha, n_sessions=self.cfg.sessions_per_step)
        self._sys._alpha_est = self._sys._estimate_alpha_from_sessions()
        reward = self._compute_reward(prices, demand)
        self._episode_rewards.append(reward)
        self._last_prices, self._last_demand = prices.copy(), demand
        self._t += 1
        # compute info metrics using shared helper
        purchases, revenue, cost = aggregate_purchases(self._sys._last_sessions, self.n, self._sys.costs)
        # expected agent count implied by alpha, not the realized number of agent sessions
        n_agents = int(self._alpha * self.cfg.sessions_per_step)
        coi = compute_coi_window(self._sys._last_sessions, self._sys.costs, demand_mapping=demand)
        info = {
            "alpha_true": self._alpha, "alpha_est": self._sys.alpha,
            "alpha_error": abs(self._alpha - self._sys.alpha),
            "revenue": float(revenue), "profit": float(revenue - cost), "cost": float(cost),
            "n_purchases": int(np.sum(purchases)),
            "avg_margin": float(np.mean((prices - self._sys.costs) / self._sys.costs)),
            "n_sessions": len(demand), "n_agents": n_agents, "price_std": float(np.std(prices)),
            "coi_erosion": coi_erosion(coi.policy, coi.agent),
            "coi_policy": float(coi.policy), "coi_agent": float(coi.agent),
            "coi_leakage": float(coi.leak), "coi_survival": float(coi.survival_ratio),
            "cumulative_reward": sum(self._episode_rewards), "step": self._t,
        }
        return self._build_obs(), reward, self._t >= self.cfg.max_steps, False, info

    def render(self, mode: str = "human") -> str | None:
        """Return a one-line status string (also printed in "human" mode).

        Returns None before the first step.
        """
        if self._sys is None or self._last_prices is None:
            return None
        out = f"t={self._t}/{self.cfg.max_steps} | alpha_true={self._alpha:.3f} alpha_hat={self._sys.alpha:.3f} | " \
              f"prices: {self._last_prices.round(1)} | demand: {self._demand_agg.round(2)} | " \
              f"reward: {self._episode_rewards[-1] if self._episode_rewards else 0:.3f}"
        if mode == "human":
            print(out)
        return out

    def close(self) -> None:
        """Nothing to release."""
        pass
class ContaminationSweepEnv(PricingEnv):
    """Environment that sweeps through contamination levels during training.

    The active level is taken from ``alpha_schedule``; it advances (cyclically)
    only when ``reset`` is called with ``options={"advance_schedule": True}``.
    """

    def __init__(self, cfg: EnvConfig | None = None, alpha_schedule: list[float] | None = None):
        from dataclasses import replace

        super().__init__(cfg)
        # reset() rewrites cfg.alpha_true; work on a private copy so a config
        # object supplied by the caller is never mutated
        self.cfg = replace(self.cfg)
        self._schedule = alpha_schedule or [0.1, 0.2, 0.3, 0.4, 0.5]
        self._schedule_idx = 0

    def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
        """Reset the episode at the currently scheduled contamination level."""
        if options and options.get("advance_schedule", False):
            self._schedule_idx = (self._schedule_idx + 1) % len(self._schedule)
        self.cfg.alpha_true = self._schedule[self._schedule_idx]
        return super().reset(seed, options)
class AdversarialEnv(PricingEnv):
    """Environment with adversarial contamination dynamics.

    Contamination increases when prices are predictable (agents exploit).
    """

    def __init__(self, cfg: EnvConfig | None = None, exploitation_rate: float = 0.02):
        super().__init__(cfg)
        # scale of the per-step contamination increase
        self._exploit_rate = exploitation_rate
        # posted price vectors of the current episode
        self._price_history: list[np.ndarray] = []

    def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]:
        """Take a normal step, then raise alpha according to price predictability.

        The returned observation and ``info["alpha_true"]`` were computed by the
        parent step, i.e. before the contamination level is updated here.
        """
        obs, reward, term, trunc, info = super().step(action)
        if self._last_prices is not None:
            self._price_history.append(self._last_prices.copy())
        predictability = 0.0
        if len(self._price_history) > 10:
            # inverse of the std over all entries of the last 10 price vectors
            predictability = 1.0 / (float(np.std(self._price_history[-10:])) + 0.1)
            # random-sized increase, kept inside cfg.alpha_bounds
            self._alpha = np.clip(self._alpha + self._exploit_rate * predictability * self._sys.rng.random(), *self.cfg.alpha_bounds)
        info["predictability"] = predictability
        return obs, reward, term, trunc, info

    def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
        """Clear the price history and reset the underlying environment."""
        self._price_history = []
        return super().reset(seed, options)
def make_env(cfg: EnvConfig | None = None, env_type: str = "standard") -> PricingEnv:
    """Build an environment of the requested type.

    ``"sweep"`` yields a ``ContaminationSweepEnv``, ``"adversarial"`` an
    ``AdversarialEnv``; any other value yields a plain ``PricingEnv``.
    """
    if env_type == "sweep":
        env_cls = ContaminationSweepEnv
    elif env_type == "adversarial":
        env_cls = AdversarialEnv
    else:
        env_cls = PricingEnv
    return env_cls(cfg)
# baseline policies
def fixed_price_policy(refs, margin=0.0):
    """Return the constant multiplier ``1 + margin`` for every entry of ``refs``."""
    return np.ones(len(refs), dtype=np.float32) * (1.0 + margin)


def random_policy(n, rng=None):
    """Return ``n`` multipliers drawn uniformly from [0.7, 1.3) as float32.

    A fresh default generator is created when ``rng`` is not supplied.
    """
    generator = rng or np.random.default_rng()
    return generator.uniform(0.7, 1.3, n).astype(np.float32)


def adaptive_policy(obs, n, base=0.1):
    """Return a uniform multiplier that shrinks as the estimated contamination grows.

    ``obs[2 * n]`` is read as the contamination estimate; the multiplier is
    ``1 + base * (1 - 0.4 * alpha_hat)`` for every product.
    """
    return np.ones(n, dtype=np.float32) * (1.0 + base * (1.0 - 0.4 * obs[2 * n]))
if __name__ == "__main__":
    # smoke run: adaptive baseline on a 100-product environment
    cfg = EnvConfig(n_products=100, max_steps=100, alpha_true=0.25, reward_mode="robust")
    env = make_env(cfg)
    obs, info = env.reset()
    print(f"initial: alpha={info['alpha_true']:.2f}")
    total_reward = 0.0
    for step_idx in range(cfg.max_steps):
        multipliers = adaptive_policy(obs, cfg.n_products)
        obs, reward, done, _truncated, info = env.step(multipliers)
        total_reward += reward
        # print a status line every tenth step
        if step_idx % 10 == 0:
            env.render()
        if done:
            break
    print(f"\ntotal reward: {total_reward:.2f}, final alpha_hat: {info['alpha_est']:.3f}")

View File

@@ -0,0 +1,168 @@
"""Summarize TensorBoard logs into comparison tables."""
from __future__ import annotations
import json
import re
from pathlib import Path
from collections import defaultdict
from dataclasses import dataclass
import pandas as pd
try:
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
HAS_TB = True
except ImportError:
HAS_TB = False
@dataclass
class RunInfo:
    """Identity of one experiment run, parsed from its directory name."""
    algo: str  # algorithm / baseline name, e.g. "ppo" or "fixed"
    alpha: float  # contamination level encoded in the run name
    reward_mode: str  # reward mode suffix of the run name ("robust" when absent)
    path: Path  # run directory containing the logs
def parse_run_name(name: str) -> RunInfo | None:
    """Extract algo, alpha, reward_mode from run directory name.

    Recognized patterns: ``ppo_a0.20_robust``, ``cmp_fixed_a0.20``,
    ``sac_a0.90_robust``. The optional ``cmp_`` prefix is dropped and a missing
    mode suffix defaults to ``"robust"``.

    Args:
        name: Directory name (not a full path).

    Returns:
        A ``RunInfo`` with an empty ``path``, or None when the name does not
        follow the pattern or its alpha part is not a valid number.
    """
    m = re.match(r'(cmp_)?(\w+)_a([\d.]+)_?(\w+)?', name)
    if not m:
        return None
    _, algo, alpha_text, mode = m.groups()
    # [\d.]+ also captures text such as "." or "1.2.3"; treat those names as
    # "not a run" instead of aborting the whole discovery with a ValueError
    try:
        alpha = float(alpha_text)
    except ValueError:
        return None
    return RunInfo(algo=algo, alpha=alpha, reward_mode=mode or 'robust', path=Path())
def load_tb_scalars(log_dir: Path, tags: list[str], reduce: str = 'last') -> dict[str, float]:
    """Load scalar values from TensorBoard event files.

    Args:
        log_dir: Directory holding the event files of one run.
        tags: Scalar tags to read.
        reduce: How a tag's series is collapsed to one value: ``'last'``,
            ``'mean'``, ``'max'`` or ``'min'``. Any other value yields no entries.

    Returns:
        Mapping tag -> reduced value for every requested tag that exists and
        has at least one event; empty when TensorBoard is not installed.
    """
    if not HAS_TB:
        return {}
    reducers = {
        'last': lambda vals: vals[-1],
        'mean': lambda vals: sum(vals) / len(vals),
        'max': max,
        'min': min,
    }
    reducer = reducers.get(reduce)

    accumulator = EventAccumulator(str(log_dir))
    accumulator.Reload()
    available = accumulator.Tags().get('scalars', [])

    results = {}
    for tag in tags:
        if tag not in available:
            continue
        events = accumulator.Scalars(tag)
        if not events or reducer is None:
            continue
        results[tag] = reducer([e.value for e in events])
    return results
def load_json_results(log_dir: Path) -> dict[str, float]:
    """Return the parsed ``results.json`` of a run directory.

    Returns:
        The decoded JSON content, or an empty dict when the file is absent.
    """
    candidate = log_dir / 'results.json'
    if not candidate.exists():
        return {}
    with open(candidate) as handle:
        return json.load(handle)
def discover_runs(base_dir: Path) -> list[RunInfo]:
    """Find all experiment runs in base directory.

    Only direct sub-directories whose name parses as a run are returned; they
    are visited in sorted order so the result does not depend on the
    filesystem's listing order.

    Args:
        base_dir: Directory containing one sub-directory per run.

    Returns:
        Parsed runs with ``path`` set; empty when ``base_dir`` is missing or
        is not a directory.
    """
    # a missing log directory simply means "no runs yet", not an error
    if not base_dir.is_dir():
        return []
    runs = []
    for d in sorted(base_dir.iterdir()):
        if not d.is_dir():
            continue
        info = parse_run_name(d.name)
        if info:
            info.path = d
            runs.append(info)
    return runs
def build_tables(runs: list[RunInfo], metrics: list[str], reduce: str = 'last') -> dict[str, dict[str, pd.DataFrame]]:
    """Build pivot tables: reward_mode -> metric -> DataFrame[alpha x algo].

    For every run and metric the value is taken from ``results.json`` when a
    matching key exists, otherwise from the run's TensorBoard scalars.

    Args:
        runs: Runs to include (``path`` must point at the run directory).
        metrics: Metric names; ``revenue``/``profit``/``margin`` map to the
            ``economics/`` tag group, ``erosion``/``leakage`` to ``coi/``, and
            everything else to ``alpha/``.
        reduce: Reduction applied to TensorBoard series.

    Returns:
        Nested mapping of DataFrames indexed by alpha with one column per
        algorithm; combinations without data stay NaN.
    """
    # collect data: {reward_mode: {metric: {(alpha, algo): value}}}
    data = defaultdict(lambda: defaultdict(dict))
    tb_tags = [f'economics/{m}' if m in ['revenue', 'profit', 'margin'] else f'coi/{m}' if m in ['erosion', 'leakage'] else f'alpha/{m}' for m in metrics]
    tag_map = dict(zip(tb_tags, metrics))
    # Evaluation summaries name some metrics differently from the TensorBoard
    # tags (e.g. "coi_erosion_mean" for coi/erosion), so "<metric>_mean" alone
    # can never match them. Try the plain key first, then the alias.
    # NOTE(review): assumes results.json stores the evaluation summary keys --
    # confirm against the code that writes results.json.
    json_aliases = {
        'margin': 'avg_margin_mean',
        'erosion': 'coi_erosion_mean',
        'leakage': 'coi_leakage_mean',
        'estimation_error': 'alpha_error_mean',
    }
    for run in runs:
        # try json first (final eval metrics)
        jm = load_json_results(run.path)
        tb = load_tb_scalars(run.path, tb_tags, reduce)
        for tag, metric in tag_map.items():
            candidates = [f'{metric}_mean']
            if metric in json_aliases:
                candidates.append(json_aliases[metric])
            for json_key in candidates:
                if json_key in jm:
                    val = jm[json_key]
                    break
            else:
                val = tb.get(tag)
            if val is not None:
                data[run.reward_mode][metric][(run.alpha, run.algo)] = val
    # convert to DataFrames
    tables = {}
    for mode, metrics_data in data.items():
        tables[mode] = {}
        for metric, vals in metrics_data.items():
            if not vals:
                continue
            alphas = sorted(set(a for a, _ in vals.keys()))
            algos = sorted(set(al for _, al in vals.keys()))
            df = pd.DataFrame(index=alphas, columns=algos, dtype=float)
            for (a, al), v in vals.items():
                df.loc[a, al] = v
            df.index.name = 'alpha'
            tables[mode][metric] = df
    return tables
def format_table(df: pd.DataFrame, fmt: str = '.3f') -> str:
    """Format DataFrame as markdown table.

    Args:
        df: Table to render.
        fmt: Float format forwarded as ``floatfmt``.

    Returns:
        The markdown text produced by ``DataFrame.to_markdown`` (pandas
        delegates this to the optional ``tabulate`` package).
    """
    return df.to_markdown(floatfmt=fmt)
def summarize(base_dir: str = 'sim/case/thesis_simplified/runs',
              metrics: list[str] | None = None,
              reduce: str = 'last',
              output: str | None = None) -> dict:
    """Print (and optionally save) markdown comparison tables for all runs.

    Args:
        base_dir: Directory containing one sub-directory per run.
        metrics: Metrics to tabulate; defaults to revenue, profit, margin,
            erosion and leakage.
        reduce: Reduction applied to TensorBoard series.
        output: Optional path of a file that receives the markdown report.

    Returns:
        The tables from ``build_tables``, or an empty dict when no run was found.
    """
    base = Path(base_dir)
    if not metrics:
        metrics = ['revenue', 'profit', 'margin', 'erosion', 'leakage']

    runs = discover_runs(base)
    if not runs:
        print(f"No runs found in {base}")
        return {}
    print(f"Found {len(runs)} runs")

    tables = build_tables(runs, metrics, reduce)
    lines = []
    for mode in sorted(tables):
        lines.append(f"\n# Reward Mode: {mode}\n")
        metric_tables = tables[mode]
        for metric in sorted(metric_tables):
            lines.extend([f"\n## {metric}\n", format_table(metric_tables[metric]), ""])

    report = '\n'.join(lines)
    print(report)
    if output:
        Path(output).write_text(report)
        print(f"\nSaved to {output}")
    return tables
if __name__ == '__main__':
    import argparse

    # command-line front end for summarize()
    cli = argparse.ArgumentParser()
    cli.add_argument('--dir', default='sim/case/thesis_simplified/runs')
    cli.add_argument('--metrics', nargs='+', default=['revenue', 'profit', 'margin', 'erosion', 'leakage'])
    cli.add_argument('--reduce', default='last', choices=['last', 'mean', 'max', 'min'])
    cli.add_argument('--output', '-o', help='save markdown to file')
    args = cli.parse_args()
    summarize(args.dir, args.metrics, args.reduce, args.output)

View File

@@ -0,0 +1,336 @@
"""RL training for thesis pricing system with thesis-aligned metrics.
Trains pricing policies using stable-baselines3 with TensorBoard logging.
Tracks COI erosion, alpha estimation error, and economic KPIs per thesis formulation.
"""
from __future__ import annotations
import argparse
import json
from concurrent.futures import ProcessPoolExecutor, as_completed
from dataclasses import dataclass, asdict, field
from pathlib import Path
from typing import Dict, List, Callable, Any
import numpy as np
try:
from stable_baselines3 import PPO, SAC, A2C
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
HAS_SB3 = True
except ImportError:
HAS_SB3 = False
try:
from torch.utils.tensorboard import SummaryWriter
HAS_TB = True
except ImportError:
HAS_TB = False
from .simplified_env import PricingEnv, EnvConfig, make_env, adaptive_policy, fixed_price_policy, random_policy
@dataclass
class EpisodeMetrics:
    """Running sums of per-step metrics for a single episode."""
    reward: float = 0.0
    revenue: float = 0.0
    profit: float = 0.0
    coi_erosion: float = 0.0
    coi_leakage: float = 0.0
    alpha_error: float = 0.0
    avg_margin: float = 0.0
    n_agents: int = 0
    steps: int = 0

    def accumulate(self, info: Dict[str, Any]) -> None:
        """Add one step's ``info`` dict to the running sums.

        Missing keys count as 0. The alpha error is recomputed from
        ``alpha_true`` and ``alpha_est``.
        """
        self.steps += 1
        # fields that are summed straight from the info key of the same name
        for name in ('reward', 'revenue', 'profit', 'coi_erosion', 'coi_leakage'):
            setattr(self, name, getattr(self, name) + info.get(name, 0))
        self.alpha_error += abs(info.get('alpha_true', 0) - info.get('alpha_est', 0))
        self.avg_margin += info.get('avg_margin', 0)
        self.n_agents += info.get('n_agents', 0)

    def normalized(self) -> Dict[str, float]:
        """Return per-step averages of every summed metric except ``reward``.

        An episode without steps divides by 1 instead of 0.
        """
        denom = max(self.steps, 1)
        averaged = {}
        for name in ['revenue', 'profit', 'coi_erosion', 'coi_leakage', 'alpha_error', 'avg_margin', 'n_agents']:
            averaged[name] = getattr(self, name) / denom
        return averaged
@dataclass
class ExperimentConfig:
    """Settings of one training or baseline experiment."""
    algo: str = "ppo"
    total_timesteps: int = 100_000
    n_envs: int = 4
    eval_freq: int = 5000
    n_eval_episodes: int = 10
    log_dir: str = "sim/case/thesis_simplified/runs"
    seed: int = 42
    n_products: int = 10
    max_steps: int = 200
    alpha_true: float = 0.2
    reward_mode: str = "robust"
    experiment_name: str | None = None

    def __post_init__(self):
        """Derive ``<algo>_a<alpha>_<reward_mode>`` as the name unless one was given."""
        if self.experiment_name is not None:
            return
        self.experiment_name = f"{self.algo}_a{self.alpha_true:.2f}_{self.reward_mode}"
class Policy:
    """Adapter that gives baseline price rules a model-like ``predict`` method."""

    def __init__(self, policy_fn: Callable[[np.ndarray, int], np.ndarray], name: str):
        self._fn = policy_fn
        self.name = name

    def predict(self, obs: np.ndarray, deterministic: bool = True) -> tuple[np.ndarray, None]:
        """Return ``(action, None)`` for one observation.

        The number of products is recovered from the observation length,
        which is ``3 * n + 3``. ``deterministic`` is accepted but not used.
        """
        n_products = (len(obs) - 3) // 3
        action = self._fn(obs, n_products)
        return action, None

    @staticmethod
    def fixed(margin: float = 0.15) -> "Policy":
        """Constant multiplier ``1 + margin`` on every product."""
        def _rule(obs: np.ndarray, n: int) -> np.ndarray:
            return fixed_price_policy(np.ones(n), margin)
        return Policy(_rule, f"fixed_{margin:.2f}")

    @staticmethod
    def adaptive(base_margin: float = 0.15) -> "Policy":
        """Margin that is reduced according to the observed contamination estimate."""
        def _rule(obs: np.ndarray, n: int) -> np.ndarray:
            return adaptive_policy(obs, n, base_margin)
        return Policy(_rule, f"adaptive_{base_margin:.2f}")

    @staticmethod
    def random() -> "Policy":
        """Independent random multipliers on every call."""
        def _rule(obs: np.ndarray, n: int) -> np.ndarray:
            return random_policy(n)
        return Policy(_rule, "random")

    @staticmethod
    def myopic(greed: float = 0.3) -> "Policy":
        """Uniform multiplier that grows with the mean observed demand share."""
        def _rule(obs: np.ndarray, n: int) -> np.ndarray:
            # demand shares occupy obs[n:2n]; fall back to 0.5 for short observations
            if len(obs) > 2 * n:
                demand_norm = obs[n:2 * n]
            else:
                demand_norm = np.ones(n) * 0.5
            multiplier = np.clip(1.0 + greed * (1 + np.mean(demand_norm)), 0.5, 1.5)
            return np.ones(n, dtype=np.float32) * multiplier
        return Policy(_rule, f"myopic_{greed:.1f}")
def log_metrics(writer: SummaryWriter | None, metrics: Dict[str, float], prefix: str, step: int) -> None:
    """Write every metric as the scalar ``<prefix>/<name>`` at ``step``.

    Does nothing when ``writer`` is None.
    """
    if writer is not None:
        for name, value in metrics.items():
            writer.add_scalar(f'{prefix}/{name}', value, step)
class MetricsCallback(BaseCallback):
    """Training callback that mirrors environment info metrics to TensorBoard."""

    def __init__(self, writer: SummaryWriter | None, verbose: int = 0):
        super().__init__(verbose)
        self._writer = writer

    def _on_step(self) -> bool:
        """Log the metrics of every info dict of the current step.

        Always returns True, so training is never interrupted by this callback.
        """
        writer = self._writer
        if writer is not None:
            for info in self.locals.get('infos', []):
                step = self.num_timesteps
                scalars = (
                    ('economics/revenue', info.get('revenue', 0)),
                    ('economics/profit', info.get('profit', 0)),
                    ('economics/margin', info.get('avg_margin', 0)),
                    ('coi/erosion', info.get('coi_erosion', 0)),
                    ('coi/leakage', info.get('coi_leakage', 0)),
                    ('alpha/estimation_error', abs(info.get('alpha_true', 0) - info.get('alpha_est', 0))),
                    ('agents/count', info.get('n_agents', 0)),
                )
                for tag, value in scalars:
                    writer.add_scalar(tag, value, step)
        return True
def make_vec_env(cfg: ExperimentConfig, n_envs: int = 1) -> DummyVecEnv:
    """Build a ``DummyVecEnv`` of ``n_envs`` monitored pricing environments.

    Every sub-environment is created from the same settings, including the
    same seed.
    """
    def _make():
        env_cfg = EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
                            alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed)
        return Monitor(make_env(env_cfg))

    factories = [_make] * n_envs
    return DummyVecEnv(factories)
def run_episodes(policy: Policy | Any, env: PricingEnv, n_episodes: int) -> List[EpisodeMetrics]:
    """Roll out ``policy`` for ``n_episodes`` episodes and collect their metrics.

    Args:
        policy: Object with ``predict(obs, deterministic=True) -> (action, state)``.
        env: Environment following the five-tuple ``step`` convention.
        n_episodes: Number of episodes to run.

    Returns:
        One ``EpisodeMetrics`` per episode, with ``reward`` holding the summed
        step rewards.
    """
    collected = []
    for _ in range(n_episodes):
        obs, _reset_info = env.reset()
        episode = EpisodeMetrics()
        finished = False
        while not finished:
            action, _state = policy.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = env.step(action)
            finished = terminated or truncated
            episode.accumulate(info)
            # the step reward is not part of info, so it is added separately
            episode.reward += reward
        collected.append(episode)
    return collected
def evaluate_policy(policy: Policy | Any, cfg: ExperimentConfig, n_episodes: int = 20) -> Dict[str, float]:
    """Evaluate ``policy`` on a fresh environment seeded with ``cfg.seed + 999``.

    Returns:
        ``reward_mean`` / ``reward_std`` over episodes plus ``<metric>_mean``
        for revenue, profit, coi_erosion, coi_leakage, alpha_error and
        avg_margin, each the mean over episodes of the per-step average.
    """
    env_cfg = EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
                        alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed + 999)
    episodes = run_episodes(policy, make_env(env_cfg), n_episodes)

    rewards = [ep.reward for ep in episodes]
    per_step = [ep.normalized() for ep in episodes]
    summary = {'reward_mean': np.mean(rewards), 'reward_std': np.std(rewards)}
    for key in ['revenue', 'profit', 'coi_erosion', 'coi_leakage', 'alpha_error', 'avg_margin']:
        summary[f'{key}_mean'] = np.mean([row[key] for row in per_step])
    return summary
def run_baseline(policy: Policy, vec_env: DummyVecEnv, total_steps: int, writer: SummaryWriter | None):
    """Roll a non-learning policy through ``vec_env`` and log its diagnostics.

    The loop advances in strides of ``vec_env.num_envs`` until ``total_steps``
    is reached. On every stride the per-env ``info`` dicts are written to
    ``writer`` (when one is given), and the accumulated episode reward is
    logged and reset whenever an environment reports ``done``.
    """
    obs = vec_env.reset()
    n_envs = vec_env.num_envs
    ep_rewards = np.zeros(n_envs)
    for step in range(0, total_steps, n_envs):
        # The policy is queried once per sub-environment.
        actions = np.array([policy.predict(obs[env_idx])[0] for env_idx in range(n_envs)])
        obs, rewards, dones, infos = vec_env.step(actions)
        ep_rewards += rewards
        for env_idx, info in enumerate(infos):
            if writer:
                alpha_gap = abs(info.get('alpha_true', 0) - info.get('alpha_est', 0))
                for tag, value in (
                    ('economics/revenue', info.get('revenue', 0)),
                    ('economics/profit', info.get('profit', 0)),
                    ('economics/margin', info.get('avg_margin', 0)),
                    ('coi/erosion', info.get('coi_erosion', 0)),
                    ('coi/leakage', info.get('coi_leakage', 0)),
                    ('alpha/estimation_error', alpha_gap),
                    ('agents/count', info.get('n_agents', 0)),
                ):
                    writer.add_scalar(tag, value, step)
            if not dones[env_idx]:
                continue
            # Episode finished: report its return and start a new tally.
            if writer:
                writer.add_scalar('rollout/ep_reward', ep_rewards[env_idx], step)
            ep_rewards[env_idx] = 0
def train(cfg: ExperimentConfig) -> Dict[str, Any]:
    """Train (or roll out) the policy selected by ``cfg.algo`` and save results.

    Baseline algorithms ("fixed", "adaptive", "random", "myopic") are rolled
    out with ``run_baseline``; "ppo", "sac" and "a2c" are trained with the
    corresponding stable-baselines3 class. In both cases the policy is then
    evaluated, and ``config.json`` / ``results.json`` are written under
    ``<cfg.log_dir>/<cfg.experiment_name>``.

    Args:
        cfg: Experiment configuration.

    Returns:
        ``{"path": <log directory>, "metrics": <final evaluation metrics>}``.

    Raises:
        ImportError: If a learning algorithm is requested without
            stable-baselines3 being available.
        KeyError: If ``cfg.algo`` is neither a baseline nor a known algorithm.
    """
    algo = cfg.algo.lower()
    is_baseline = algo in ("fixed", "adaptive", "random", "myopic")
    if not HAS_SB3 and not is_baseline:
        raise ImportError("stable-baselines3 required: pip install stable-baselines3[extra]")

    log_path = Path(cfg.log_dir) / cfg.experiment_name
    log_path.mkdir(parents=True, exist_ok=True)
    with open(log_path / "config.json", "w") as f:
        json.dump(asdict(cfg), f, indent=2)

    writer = SummaryWriter(log_path) if HAS_TB else None
    train_env = eval_env = None
    try:
        train_env = make_vec_env(cfg, cfg.n_envs)
        eval_env = make_vec_env(cfg, 1)
        if is_baseline:
            factories = {"fixed": Policy.fixed, "adaptive": Policy.adaptive,
                         "random": Policy.random, "myopic": Policy.myopic}
            policy = factories[algo]()
            run_baseline(policy, train_env, cfg.total_timesteps, writer)
            final_metrics = evaluate_policy(policy, cfg)
        else:
            common = dict(verbose=1, seed=cfg.seed, tensorboard_log=str(log_path), device="auto")
            # Lazy builders so only the requested model is instantiated.
            builders = {
                "ppo": lambda: PPO("MlpPolicy", train_env, learning_rate=3e-4, n_steps=2048, batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95, clip_range=0.2, ent_coef=0.01, **common),
                "sac": lambda: SAC("MlpPolicy", train_env, learning_rate=1e-4, buffer_size=50_000, batch_size=512, tau=0.02, gamma=0.99, learning_starts=1000, ent_coef="auto_0.1", train_freq=4, **common),
                "a2c": lambda: A2C("MlpPolicy", train_env, learning_rate=7e-4, n_steps=5, gamma=0.99, **common),
            }
            model = builders[algo]()
            cb = MetricsCallback(writer)
            eval_cb = EvalCallback(eval_env, best_model_save_path=str(log_path / "best"), log_path=str(log_path),
                                   eval_freq=cfg.eval_freq, n_eval_episodes=cfg.n_eval_episodes, deterministic=True)
            model.learn(cfg.total_timesteps, callback=[cb, eval_cb], progress_bar=True)
            model.save(log_path / "final_model")
            final_metrics = evaluate_policy(model, cfg)
        if writer:
            log_metrics(writer, final_metrics, 'final', cfg.total_timesteps)
    finally:
        # Always release the writer and environments, including on failure.
        if writer:
            writer.close()
        for env in (train_env, eval_env):
            if env is not None:
                env.close()

    with open(log_path / "results.json", "w") as f:
        json.dump(final_metrics, f, indent=2)
    return {"path": str(log_path), "metrics": final_metrics}
def _train_alpha(args: tuple) -> tuple[str, Dict]:
    """Sweep worker: train one configuration at a given contamination level.

    Kept at module top level so it can be pickled for process pools.
    ``args`` is ``(cfg_dict, alpha)``; ``cfg_dict`` is updated in place with
    the alpha value and a derived experiment name before training.

    Returns:
        ``("alpha_<alpha>", metrics)`` for the trained configuration.
    """
    cfg_dict, alpha = args
    label = f"{alpha:.2f}"
    cfg_dict.update(
        alpha_true=alpha,
        experiment_name=f"{cfg_dict['algo']}_a{label}_{cfg_dict['reward_mode']}",
    )
    print(f"[alpha={label}] starting")
    outcome = train(ExperimentConfig(**cfg_dict))
    print(f"[alpha={label}] done")
    return f"alpha_{label}", outcome["metrics"]
def run_sweep(cfg: ExperimentConfig, alphas: List[float] | None = None, max_workers: int | None = None) -> Dict[str, Dict]:
    """Train one configuration per contamination level and save a summary.

    Args:
        cfg: Base experiment configuration; ``alpha_true`` and
            ``experiment_name`` are overridden per run.
        alphas: Contamination levels to sweep. Defaults to 0.0 … 0.9 in steps
            of 0.1 when omitted or empty.
        max_workers: Process-pool size. ``1`` runs sequentially in-process;
            ``None`` lets ``ProcessPoolExecutor`` choose.

    Returns:
        Mapping ``"alpha_<value>" -> metrics``, ordered like ``alphas`` no
        matter which worker finishes first, so the summary JSON is stable
        across runs and identical between sequential and parallel modes.
    """
    alphas = alphas or [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    cfg_dict = asdict(cfg)
    # Each job gets its own copy because the worker mutates the dict.
    jobs = [(cfg_dict.copy(), a) for a in alphas]
    if max_workers == 1:  # sequential fallback
        results = dict(map(_train_alpha, jobs))
    else:
        with ProcessPoolExecutor(max_workers=max_workers) as pool:
            # Executor.map yields results in submission order.
            results = dict(pool.map(_train_alpha, jobs))
    summary_path = Path(cfg.log_dir) / f"sweep_{cfg.algo}_{cfg.reward_mode}.json"
    summary_path.parent.mkdir(parents=True, exist_ok=True)
    with open(summary_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nSweep results saved to {summary_path}")
    return results
def _train_policy(args: tuple) -> tuple[str, Dict]:
    """Comparison worker: train/evaluate one algorithm on a shared config.

    ``args`` is ``(cfg_dict, algo)``; ``cfg_dict`` is updated in place with the
    algorithm name and a derived experiment name before training.

    Returns:
        ``(algo, metrics)`` for the trained configuration.
    """
    cfg_dict, algo = args
    cfg_dict.update(
        algo=algo,
        experiment_name=f"cmp_{algo}_a{cfg_dict['alpha_true']:.2f}",
    )
    print(f"[{algo}] starting")
    outcome = train(ExperimentConfig(**cfg_dict))
    print(f"[{algo}] done")
    return algo, outcome["metrics"]
def compare_policies(cfg: ExperimentConfig, policies: List[str] | None = None, max_workers: int | None = None) -> Dict[str, Dict]:
    """Train/evaluate several algorithms on the same config and save a summary.

    Args:
        cfg: Base experiment configuration; ``algo`` and ``experiment_name``
            are overridden per run.
        policies: Algorithm names to compare. Defaults to the four baselines
            when omitted or empty.
        max_workers: Process-pool size. ``1`` runs sequentially in-process;
            ``None`` lets ``ProcessPoolExecutor`` choose.

    Returns:
        Mapping ``algo -> metrics``, ordered like ``policies`` no matter which
        worker finishes first, so the saved JSON and printed table are stable.
    """
    policies = policies or ["fixed", "adaptive", "myopic", "random"]
    cfg_dict = asdict(cfg)
    # Each job gets its own copy because the worker mutates the dict.
    jobs = [(cfg_dict.copy(), p) for p in policies]
    if max_workers == 1:
        results = dict(map(_train_policy, jobs))
    else:
        with ProcessPoolExecutor(max_workers=max_workers) as pool:
            # Executor.map yields results in submission order.
            results = dict(pool.map(_train_policy, jobs))
    cmp_path = Path(cfg.log_dir) / f"compare_a{cfg.alpha_true:.2f}.json"
    cmp_path.parent.mkdir(parents=True, exist_ok=True)
    with open(cmp_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nComparison saved to {cmp_path}")
    for algo, m in results.items():
        print(f"  {algo:12s}: reward={m['reward_mean']:.2f} coi_erosion={m['coi_erosion_mean']:.4f} alpha_err={m['alpha_error_mean']:.4f}")
    return results
def main():
    """Command-line entry point: parse options and run one of three modes.

    ``--sweep`` runs the contamination sweep, ``--compare`` runs the baseline
    comparison, and otherwise a single training run is executed and its
    metrics are printed. ``--sweep`` takes precedence if both flags are given.
    """
    cli = argparse.ArgumentParser(description="Train RL pricing policies")
    cli.add_argument("--algo", default="ppo", choices=["ppo", "sac", "a2c", "fixed", "adaptive", "random", "myopic"])
    cli.add_argument("--steps", type=int, default=100_000)
    cli.add_argument("--alpha", type=float, default=0.2)
    cli.add_argument("--reward-mode", default="robust", choices=["revenue", "profit", "robust", "coi_aware"])
    cli.add_argument("--n-products", type=int, default=10)
    cli.add_argument("--n-envs", type=int, default=4)
    cli.add_argument("--seed", type=int, default=42)
    cli.add_argument("--log-dir", default="sim/case/thesis_simplified/runs")
    cli.add_argument("--sweep", action="store_true", help="run contamination sweep")
    cli.add_argument("--compare", action="store_true", help="compare all baselines")
    cli.add_argument("--workers", type=int, default=None, help="max parallel workers for sweep (None=auto, 1=sequential)")
    opts = cli.parse_args()

    cfg = ExperimentConfig(
        algo=opts.algo,
        total_timesteps=opts.steps,
        alpha_true=opts.alpha,
        reward_mode=opts.reward_mode,
        n_products=opts.n_products,
        n_envs=opts.n_envs,
        seed=opts.seed,
        log_dir=opts.log_dir,
    )

    if opts.sweep:
        run_sweep(cfg, max_workers=opts.workers)
        return
    if opts.compare:
        compare_policies(cfg, max_workers=opts.workers)
        return

    result = train(cfg)
    print(f"\nTraining complete: {result['path']}")
    print(f"Metrics: {json.dumps(result['metrics'], indent=2)}")
# Run the CLI only when this file is executed as a script, not on import
# (worker processes re-import the module and must not re-run main()).
if __name__ == "__main__":
    main()

View File

@@ -226,6 +226,7 @@ if __name__ == "__main__":
agent_model = AgentBehaviorModel(agent_dir)
agent_mdp = agent_model.build_MDP()
print(agent_mdp)
print(f"AGENT... Built MDP: {agent_mdp['num_states']} states, "
f"{sum(len(t) for t in agent_mdp['transitions'].values())} transitions")
if not agent_mdp['states']:
@@ -234,6 +235,9 @@ if __name__ == "__main__":
human_evt = aggregate_event_transitions(human_mdp)
agent_evt = aggregate_event_transitions(agent_mdp)
print(agent_evt)
common = set(human_evt.keys()) & set(agent_evt.keys())
if not common:

View File

@@ -76,8 +76,7 @@ class WildPricingEngine(BasePricingEngine):
def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
self.step_count += 1
# extract demand signal (from env observation) as proxy for sales
demand = observation.get('demand', np.zeros(self.c.product_catalogue_size, dtype=np.float32))
demand = _extract_demand(observation, self.c.product_catalogue_size)
return self._update_from_demand(current_prices, demand)
def _update_from_demand(self, prices: np.ndarray, sold: np.ndarray) -> np.ndarray:
@@ -141,7 +140,7 @@ class SimpleDemandEngine(BasePricingEngine):
def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
self.step_count += 1
demand = observation.get('demand', np.zeros(self.c.product_catalogue_size, dtype=np.float32))
demand = _extract_demand(observation, self.c.product_catalogue_size)
if self.prev_demand is None:
self.prev_demand = demand.copy()
return current_prices.copy()
@@ -207,7 +206,7 @@ class ThompsonSamplingEngine(BasePricingEngine):
lo = current_prices * 0.7
hi = current_prices * 1.3
self.price_grid = np.linspace(lo, hi, self.n_price_levels).T
demand = observation.get('demand', np.zeros(self.c.product_catalogue_size, dtype=np.float32))
demand = _extract_demand(observation, self.c.product_catalogue_size)
# update beliefs based on last action
if self.last_actions is not None:
for i in range(self.c.product_catalogue_size):
@@ -226,3 +225,14 @@ class ThompsonSamplingEngine(BasePricingEngine):
new_prices[i] = self.price_grid[i, actions[i]]
self.last_actions = actions
return np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)
def _extract_demand(observation: Dict[str, Any], n: int) -> np.ndarray:
    """Pull the per-product demand vector out of an observation.

    Lookup order: ``observation["elasticity"]["demand"]`` (only when
    ``"elasticity"`` maps to a dict), then ``observation["demand"]``. If
    neither yields a non-``None`` value, a zero vector of length ``n`` is
    returned. The result is always ``float32``.
    """
    candidates = []
    nested = observation.get("elasticity")
    if isinstance(nested, dict):
        candidates.append(nested.get("demand"))
    candidates.append(observation.get("demand"))

    for candidate in candidates:
        if candidate is not None:
            return np.asarray(candidate, dtype=np.float32)
    return np.zeros(n, dtype=np.float32)

View File

@@ -1,682 +1,244 @@
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from dataclasses import dataclass
import pandas as pd
from types import SimpleNamespace
from typing import Optional, Dict, Any, List, Tuple
from __future__ import annotations
from lib.separability import load_artifacts, score_session, estimate_alpha
from sim.rl.behavior_loader.models import AgentBehaviorModel, BehaviorModel, aggregate_event_transitions
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple
import numpy as np
try:
import jax
from sim.rl.jax_core import JAX_AVAILABLE, compile_transitions, fallback_transitions, sample_sessions, compute_metrics
from sim.rl.jax_core import session_features, compute_session_transitions, compute_divergences, estimate_alpha_batch
except ImportError:
JAX_AVAILABLE = False
import gymnasium as gym
from gymnasium import spaces
except ImportError as e:
raise ImportError("sim.rl.environment requires gymnasium") from e
# "learner" agent learning to optimize pricing
# "agent" part of environment creating demand signals that learner processes
from sim.case.thesis_simplified.coi import COIWindow, coi_erosion, compute_coi_window
from sim.case.thesis_simplified.separability import estimate_alpha as estimate_session_alpha
from sim.case.thesis_simplified.simplified import Limbo, Session, put_prices_to_market
from sim.rl.thesis_core import aggregate_demand_by_product, aggregate_purchases, constrain_prices
@dataclass(frozen=True)
class BusinessLogicConstraints:
    """Immutable (frozen) bundle of simulation sizing parameters.

    NOTE(review): a second class with the same name appears further down in
    this diff view; confirm which definition is the live one.
    """
    # Number of products in the catalogue (sizes the price/demand vectors).
    product_catalogue_size: int = 100
    # Presumably the episode step limit -- TODO confirm against the env's step().
    max_steps: int = 2000
    # Number of simulated sessions generated per environment step.
    sessions_per_step: int = 250
base_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments"
human_dir, agent_dir = f"{base_dir}/collected_data/", f"{base_dir}/agents/collected_data/"
@dataclass
class BusinessLogicConstraints:
    """Tunable parameters of the pricing simulation.

    The field order (and therefore the positional constructor order) is
    unchanged; the redundant second declaration of ``max_price_adjustment``
    has been dropped, which leaves the generated dataclass identical.
    """
    # --- pricing bounds -------------------------------------------------
    max_price_adjustment: float = 0.30  # symmetric bound on relative price actions
    system_max_price: float = 500.0
    system_min_price: float = 1.0
    # --- simulation sizing ----------------------------------------------
    product_catalogue_size: int = 100
    episode_length: int = 2000
    sessions_per_step: int = 250
    min_margin_pct: float = 0.05
    # --- agent population (contamination) -------------------------------
    agent_share: float = 0.2  # initial / true share of agent sessions
    agent_recon_multiplier: float = 6.0
    agent_purchase_probability: float = 0.20
    alpha_drift: float = 0.0
    alpha_bounds: tuple[float, float] = (0.0, 0.8)
    # --- cost-of-information (COI) --------------------------------------
    coi_strength: float = 0.25  # weight of the COI leak term in the reward
    coi_threshold: float = 4.0
    coi_sigmoid_temp: float = 1.25
    # --- demand model ---------------------------------------------------
    base_human_demand: float = 0.08
    base_agent_demand: float = 0.05
    human_price_elasticity: float = -1.2 # assumptions here
    agent_price_elasticity: float = -0.6
    # --- reward weights -------------------------------------------------
    w_agent_loss: float = 1.0
    w_volatility: float = 5.0
    w_estimation_error: float = 0.25
    # Seed for the simulation's random generators.
    seed: int = 7
def _sigmoid(x: np.ndarray) -> np.ndarray:
    """Return the logistic function ``1 / (1 + exp(-x))`` element-wise.

    Evaluated through the identity ``sigmoid(x) = 0.5 * (1 + tanh(x / 2))``.
    ``tanh`` saturates cleanly, whereas ``exp(-x)`` overflows (with a
    ``RuntimeWarning``) for large negative ``x``.
    """
    return 0.5 * (1.0 + np.tanh(0.5 * x))
# Page path recorded in the "page" field of a generated event, keyed by event
# name. Events not listed here fall back to "/" at the lookup site.
EVENT_PAGE_MAP = {
    "session_start": "/",
    "page_view": "/",
    "view_item_page": "/products",
    "learn_more_about_item": "/products/details",
    "add_item_to_cart": "/cart",
    "checkout_start": "/checkout",
    "purchase_complete": "/checkout",
    "session_end": "/checkout/success",
}
# Map real collected event names to canonical simulation states. Several raw
# events collapse onto the same canonical state (e.g. both hover events and
# "remove_item" become "view_item_page"); names not listed are kept as-is by
# the code that consults this table.
EVENT_CANONICAL_MAP = {
    "page_view": "session_start",
    "hover_over_paragraph": "view_item_page",
    "hover_over_title": "view_item_page",
    "view_item_page": "view_item_page",
    "learn_more_about_item": "learn_more_about_item",
    "add_item_to_cart": "add_item_to_cart",
    "checkout_start": "purchase_complete",
    "remove_item": "view_item_page",
}
def _canonicalize_transitions(raw_trans: Dict[str, Dict[str, float]]) -> Dict[str, Dict[str, float]]:
    """Re-key a transition table onto canonical simulation states.

    Source and destination names are translated through
    ``EVENT_CANONICAL_MAP`` (unknown names pass through unchanged).
    Probabilities of edges that collapse onto the same canonical pair are
    summed, and each row with positive mass is rescaled to sum to 1.
    """
    merged: Dict[str, Dict[str, float]] = {}
    for raw_src, raw_dsts in raw_trans.items():
        row = merged.setdefault(EVENT_CANONICAL_MAP.get(raw_src, raw_src), {})
        for raw_dst, weight in raw_dsts.items():
            dst_key = EVENT_CANONICAL_MAP.get(raw_dst, raw_dst)
            row[dst_key] = row.get(dst_key, 0.0) + weight

    # Merging can leave rows whose mass is not 1; renormalise those rows.
    result: Dict[str, Dict[str, float]] = {}
    for src_key, row in merged.items():
        mass = sum(row.values())
        if mass > 0:
            result[src_key] = {dst: w / mass for dst, w in row.items()}
        else:
            result[src_key] = row
    return result
class BehavioralProfile:
    """Synthetic Markov profile used to generate interaction sessions.

    The transition kernel comes from ``aggregate_event_transitions`` applied
    to the MDP built by the actor's behaviour model, and is mapped onto the
    canonical simulation states. A hard-coded fallback kernel is used when the
    model yields no transitions.
    """

    def __init__(self, actor: str, purchase_probs: np.ndarray):
        """Build the profile for ``actor``.

        Args:
            actor: ``"agents"`` selects the agent behaviour model; any other
                value selects the human model.
            purchase_probs: Per-product purchase probabilities (clipped to
                ``[0, 0.95]``) used to modulate the purchase transition.
        """
        self.actor = actor
        self.purchase_probs = np.clip(purchase_probs, 0.0, 0.95)
        self.states = [
            "session_start",
            "view_item_page",
            "learn_more_about_item",
            "add_item_to_cart",
            "purchase_complete",
            "session_end",
        ]
        model = AgentBehaviorModel(agent_dir) if actor == "agents" else BehaviorModel(human_dir)
        mdp = model.build_MDP()
        raw_trans = aggregate_event_transitions(mdp) if mdp.get("transitions") else {}
        self.transitions = _canonicalize_transitions(raw_trans) if raw_trans else self._fallback_transitions()
        self._ensure_terminal_states()
        self.dwell_params = self._extract_dwell_params(mdp)

    def _ensure_terminal_states(self):
        """Guarantee the kernel can start a session and terminate after a purchase."""
        # purchase_complete must be able to reach session_end.
        if "purchase_complete" not in self.transitions:
            self.transitions["purchase_complete"] = {"session_end": 1.0}
        elif "session_end" not in self.transitions.get("purchase_complete", {}):
            self.transitions["purchase_complete"]["session_end"] = 1.0
            total = sum(self.transitions["purchase_complete"].values())
            self.transitions["purchase_complete"] = {
                k: v / total for k, v in self.transitions["purchase_complete"].items()
            }
        # session_start must exist because every sampled session begins there.
        if "session_start" not in self.transitions:
            self.transitions["session_start"] = {
                "view_item_page": 0.7,
                "learn_more_about_item": 0.2,
                "session_end": 0.1,
            }

    def _fallback_transitions(self) -> Dict[str, Dict[str, float]]:
        """Return the hand-written kernel used when no empirical one is available."""
        return {
            "session_start": {"view_item_page": 0.85, "session_end": 0.15},
            "view_item_page": {"learn_more_about_item": 0.4, "add_item_to_cart": 0.3, "view_item_page": 0.2, "session_end": 0.1},
            "learn_more_about_item": {"add_item_to_cart": 0.5, "view_item_page": 0.3, "session_end": 0.2},
            "add_item_to_cart": {"purchase_complete": 0.6, "view_item_page": 0.25, "session_end": 0.15},
            "purchase_complete": {"session_end": 1.0},
        }

    def _extract_dwell_params(self, mdp: Dict) -> Dict[str, Tuple[float, float]]:
        """Derive gamma ``(shape, scale)`` dwell-time parameters per state.

        Reads ``mdp["state_values"]``; a state's value defaults to 0.5 and is
        overridden by the first raw event (in ``EVENT_CANONICAL_MAP`` order)
        that maps onto that state and has a value.
        """
        state_vals = mdp.get("state_values", {})
        params = {}
        for state in self.states:
            # Try the canonical name first, then any raw name mapping onto it.
            val = state_vals.get(state, 0.5)
            for raw, canon in EVENT_CANONICAL_MAP.items():
                if canon == state and raw in state_vals:
                    val = state_vals[raw]
                    break
            shape = 1.5 + val * 2.0
            scale = 0.8 + (1.0 - val) * 1.2
            params[state] = (shape, scale)
        return params

    def _transition_probs(self, state: str, product_idx: int) -> Dict[str, float]:
        """Return the normalised next-state distribution for ``state``.

        Unknown states go straight to ``session_end``. For
        ``add_item_to_cart`` the purchase probability is an equal blend of the
        kernel's own value and the product's demand-driven probability
        (scaled by 0.7 for agents); the other edges are rescaled to fill the
        remainder. The blended value is applied even when the kernel has no
        ``purchase_complete`` edge, so demand forcing always takes effect.
        """
        probs = dict(self.transitions.get(state, {"session_end": 1.0}))
        if state == "add_item_to_cart":
            base = probs.get("purchase_complete", 0.0)
            demand_factor = float(self.purchase_probs[int(product_idx)])
            if self.actor == "agents":
                demand_factor *= 0.7
            adjusted = np.clip(base * 0.5 + demand_factor * 0.5, 0.0, 0.95)
            remainder = max(1e-6, 1.0 - adjusted)
            other_total = sum(v for k, v in probs.items() if k != "purchase_complete")
            scale = remainder / max(other_total, 1e-6)
            for key in probs:
                if key != "purchase_complete":
                    probs[key] = probs[key] * scale
            # Assigning an existing key keeps its position, so the sampling
            # order of an empirical kernel is unchanged; a missing edge is
            # appended instead of being silently dropped.
            probs["purchase_complete"] = adjusted
        total = sum(probs.values())
        if total <= 0:
            return {"session_end": 1.0}
        return {name: val / total for name, val in probs.items()}

    def sample_session(
        self,
        rng: np.random.Generator,
        session_id: str,
        prices: np.ndarray,
        unit_cost: np.ndarray,
    ) -> Tuple[List[Dict[str, Any]], List[SimpleNamespace]]:
        """Generate a single session trajectory respecting business constraints.

        One product is drawn uniformly for the whole session. The walk starts
        at ``session_start`` (which is not emitted) and stops at
        ``session_end`` or after 40 emitted events.

        Returns:
            ``(events, feature_events)``: the full event rows, and the reduced
            per-event namespaces (``eventName``, ``page``, ``productId``,
            ``ts``) for the same events.
        """
        events: List[Dict[str, Any]] = []
        feature_events: List[SimpleNamespace] = []
        state = "session_start"
        t = 0.0
        product_idx = int(rng.integers(0, len(prices)))
        product_id = f"product-{product_idx:04d}"
        # Enforce a price floor of cost + 5% as a last resort, so a pricing
        # learner cannot push offers below cost.
        cost = float(unit_cost[product_idx])
        constrained_price = max(float(prices[product_idx]), cost * 1.05)
        while state != "session_end" and len(events) < 40:
            if state != "session_start":
                row = {
                    "session_id": session_id,
                    "actor": "agent" if self.actor == "agents" else "human",
                    "eventName": state,
                    "product_idx": product_idx,
                    "productId": product_id,
                    "price_offered": constrained_price,
                    "price_paid": 0.0,
                    "page": EVENT_PAGE_MAP.get(state, "/"),
                    "ts": t,
                    "unit_cost": cost,
                    "base_price": float(prices[product_idx]),
                }
                if state == "purchase_complete":
                    # Small multiplicative noise on the paid price, floored at cost.
                    noise = float(rng.normal(0.0, 0.015))
                    row["price_paid"] = max(constrained_price * (1.0 + noise), cost)
                events.append(row)
                feature_events.append(
                    SimpleNamespace(
                        eventName=row["eventName"],
                        page=row["page"],
                        productId=row["productId"],
                        ts=row["ts"],
                    )
                )
            transitions = self._transition_probs(state, product_idx)
            next_state = rng.choice(list(transitions.keys()), p=list(transitions.values()))
            # Dwell time in the current state: gamma-distributed, at least 0.3.
            shape, scale = self.dwell_params.get(state, (2.0, 1.0))
            dwell = max(0.3, rng.gamma(shape=shape, scale=scale))
            t += dwell
            state = next_state
        return events, feature_events
def _load_behavioral_profile(actor: str, demand_forcing: np.ndarray) -> BehavioralProfile:
    """Create the behavioural profile used to generate synthetic sessions.

    Args:
        actor: ``'humans'`` or ``'agents'``.
        demand_forcing: Per-product purchase probabilities used to weight
            interactions.
    """
    profile = BehavioralProfile(actor=actor, purchase_probs=demand_forcing)
    return profile
class CommercePlatform:
    """State management for the environment; simulates demand.

    Holds the product catalogue (unit costs, base prices), generates synthetic
    human/agent sessions for a price vector, and keeps a running estimate
    ``alpha_hat`` of the agent share.
    """

    def __init__(self, product_catalogue_size: int, max_price: float, min_price: float, constraints: BusinessLogicConstraints):
        """Initialise the catalogue and load separability artifacts if present.

        The catalogue is drawn from the generator seeded with
        ``constraints.seed`` (not the global NumPy state), so a given seed
        always produces the same costs and base prices.
        """
        self.product_catalogue_size = product_catalogue_size
        self.max_price = max_price
        self.min_price = min_price
        self.constraints = constraints
        self.simulation_history: List[Dict[str, Any]] = []
        self._rng = np.random.default_rng(constraints.seed)
        self._last_interaction_df: pd.DataFrame = pd.DataFrame()
        self.unit_cost = self._rng.uniform(low=15.0, high=60.0, size=(self.product_catalogue_size,)).astype(np.float32)
        self.base_price = self._rng.uniform(low=60.0, high=140.0, size=(self.product_catalogue_size,)).astype(np.float32)
        self.alpha_hat = constraints.agent_share
        try:
            self.separability_artifacts = load_artifacts()
        except FileNotFoundError:
            # Without artifacts sessions are not scored and alpha_hat stays fixed.
            self.separability_artifacts = None

    def setup_true_demand(self, prices: np.ndarray) -> Dict[str, np.ndarray]:
        """Return per-product purchase probabilities for humans and agents.

        Probabilities decay exponentially in the relative margin
        ``(price - cost) / cost`` with each actor's elasticity and are clipped
        to ``[0, 0.95]``.
        """
        p = np.clip(prices, self.min_price, self.max_price)
        cost = np.clip(self.unit_cost, self.min_price * 0.2, self.max_price)
        margin = np.clip((p - cost) / np.maximum(cost, 1e-3), -0.9, 2.0)
        # isoelastic demand approximation
        human_prob = self.constraints.base_human_demand * np.exp(self.constraints.human_price_elasticity * margin)
        agent_prob = self.constraints.base_agent_demand * np.exp(self.constraints.agent_price_elasticity * margin)
        return {
            "human_purchase_prob": np.clip(human_prob, 0.0, 0.95),
            "agent_purchase_prob": np.clip(agent_prob, 0.0, 0.95),
        }

    def _simulate_sessions(self, prices: np.ndarray) -> Tuple[pd.DataFrame, Dict[str, Any]]:
        """Generate one step's sessions and update ``alpha_hat``.

        Returns:
            ``(interactions_df, diagnostics)``: every emitted event row, and a
            dict with ``alpha_hat``, the raw session scores and the realised
            per-product purchase counts for each actor type.
        """
        demand = self.setup_true_demand(prices)
        T = self.constraints.sessions_per_step
        effective_share = float(np.clip(self.alpha_hat, 0.0, 0.95))
        # At least one session of each type is always generated.
        n_agent_sessions = max(1, int(round(T * effective_share)))
        n_human_sessions = max(1, T - n_agent_sessions)
        session_map = {
            "humans": n_human_sessions,
            "agents": n_agent_sessions,
        }
        pprob_map = {
            "humans": demand["human_purchase_prob"],
            "agents": demand["agent_purchase_prob"],
        }
        rows: List[Dict[str, Any]] = []
        session_scores: List[Dict[str, float]] = []
        demand_human = np.zeros_like(prices, dtype=np.float32)
        demand_agent = np.zeros_like(prices, dtype=np.float32)
        for actor, n_sessions in session_map.items():
            profile = _load_behavioral_profile(actor, pprob_map[actor])
            purchase_counts = demand_agent if actor == "agents" else demand_human
            for idx in range(n_sessions):
                session_id = f"{actor}_{idx:06d}"
                session_rows, feature_events = profile.sample_session(
                    self._rng, session_id, prices, self.unit_cost
                )
                rows.extend(session_rows)
                # Tally purchases directly from the rows; building a DataFrame
                # per session only to count them is needlessly expensive.
                for row in session_rows:
                    if row["eventName"] == "purchase_complete":
                        purchase_counts[int(row["product_idx"])] += 1.0
                if self.separability_artifacts and feature_events:
                    session_scores.append(score_session(feature_events, self.separability_artifacts))
        interactions_df = pd.DataFrame(rows)
        diagnostics = {
            "alpha_hat": float(self.alpha_hat),
            "session_scores": session_scores,
            "demand_human": demand_human,
            "demand_agent": demand_agent,
        }
        if session_scores:
            alphas = [
                estimate_alpha(s["prob_agent"], s["delta_h"], s["delta_a"], temperature=2.0)
                for s in session_scores
            ]
            mean_alpha = float(np.mean(alphas))
            # exponential moving average for stability
            self.alpha_hat = 0.7 * self.alpha_hat + 0.3 * mean_alpha
            diagnostics.update(
                {
                    "alpha_hat": float(self.alpha_hat),
                    "delta_h_mean": float(np.mean([s["delta_h"] for s in session_scores])),
                    "delta_a_mean": float(np.mean([s["delta_a"] for s in session_scores])),
                    "prob_agent_mean": float(np.mean([s["prob_agent"] for s in session_scores])),
                }
            )
        self._last_interaction_df = interactions_df
        return interactions_df, diagnostics

    def compute_interaction_features(self, interaction_df: pd.DataFrame) -> Dict[str, float]:
        """Summarise one step's interactions into scalar economic features.

        Returns a dict with observed/oracle revenue, agent loss, purchase
        counts per actor, mean sale price, look-to-book ratio, ``coi`` and
        ``expected_premium``. All values are 0.0 for an empty frame.
        """
        if interaction_df.empty:
            return {
                "revenue_observed": 0.0,
                "revenue_oracle": 0.0,
                "agent_loss": 0.0,
                "true_human_purchases": 0.0,
                "true_agent_purchases": 0.0,
                "mean_sale_price": 0.0,
                "look_to_book": 0.0,
                "coi": 0.0,
                "expected_premium": 0.0,
            }
        purchases = interaction_df[interaction_df["eventName"] == "purchase_complete"]
        human_purchases = purchases[purchases["actor"] == "human"]
        agent_purchases = purchases[purchases["actor"] == "agent"]
        revenue_observed = float(purchases["price_paid"].sum())
        revenue_oracle = float(purchases["base_price"].sum())
        agent_loss = float((agent_purchases["base_price"] - agent_purchases["price_paid"]).sum())
        mean_sale_price = float(purchases["price_paid"].mean()) if not purchases.empty else 0.0
        views = float((interaction_df["eventName"] == "view_item_page").sum())
        look_to_book = float(views / (len(purchases) + 1e-6))

        # Both default to 0.0 so they are always bound, including when no
        # human purchased anything this step.
        coi = 0.0
        expected_premium = 0.0
        if not human_purchases.empty:
            # COI = E[P] - p_min where p_min is cost, accounting for the
            # expected premium (base - realized).
            mean_offered = human_purchases["price_offered"].mean()
            margin = mean_offered - human_purchases["unit_cost"].mean()
            expected_premium = float(human_purchases["base_price"].mean() - mean_offered)
            coi = float(np.maximum(0.0, margin - expected_premium * 0.5))
        return {
            "revenue_observed": revenue_observed,
            "revenue_oracle": revenue_oracle,
            "agent_loss": agent_loss,
            "true_human_purchases": float(len(human_purchases)),
            "true_agent_purchases": float(len(agent_purchases)),
            "mean_sale_price": mean_sale_price,
            "look_to_book": look_to_book,
            "coi": coi,
            "expected_premium": expected_premium,
        }

    def _session_feature_table(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract per-session behavioral features for separability analysis."""
        if df.empty:
            return pd.DataFrame()
        g = df.groupby("session_id", sort=False)
        session_duration = g["ts"].max() - g["ts"].min()
        total_interactions = g.size()
        avg_time_between = g["ts"].apply(lambda x: float(np.diff(np.sort(x.to_numpy())).mean()) if len(x) > 1 else 0.0)
        interaction_velocity = total_interactions / (session_duration + 1e-6)
        views = g.apply(lambda x: int((x["eventName"] == "view_item_page").sum()), include_groups=False)
        cart_adds = g.apply(lambda x: int((x["eventName"] == "add_item_to_cart").sum()), include_groups=False)
        purchases = g.apply(lambda x: int((x["eventName"] == "purchase_complete").sum()), include_groups=False)
        learn_more = g.apply(lambda x: int((x["eventName"] == "learn_more_about_item").sum()), include_groups=False)
        conversion_rate = purchases / (views + 1e-6)
        is_agent = g["actor"].apply(lambda s: bool((s == "agent").any()), include_groups=False)
        # price sensitivity features
        price_variance = g["price_offered"].var().fillna(0.0)
        avg_price_seen = g["price_offered"].mean().fillna(0.0)
        products_viewed = g["product_idx"].nunique()
        return pd.DataFrame({
            "session_duration_sec": session_duration.astype(float),
            "avg_time_between_events": avg_time_between.astype(float),
            "total_interactions": total_interactions.astype(int),
            "interaction_velocity": interaction_velocity.astype(float),
            "item_views": views.astype(int),
            "cart_adds": cart_adds.astype(int),
            "purchases": purchases.astype(int),
            "learn_more_clicks": learn_more.astype(int),
            "conversion_rate": conversion_rate.astype(float),
            "price_variance": price_variance.astype(float),
            "avg_price_seen": avg_price_seen.astype(float),
            "products_viewed": products_viewed.astype(int),
            "is_agent": is_agent.astype(bool),
        }).reset_index()

    def get_interaction_data(self) -> np.ndarray:
        """Return the most recent step's interaction rows.

        NOTE(review): despite the annotation, a non-empty result is a list of
        row dicts (``DataFrame.to_dict(orient="records")``); only the empty
        case is an (empty, object-dtype) ndarray. Kept as-is for callers.
        """
        if self._last_interaction_df.empty:
            return np.array([], dtype=object)
        return self._last_interaction_df.to_dict(orient="records")
def make_env(constraints: Optional[BusinessLogicConstraints] = None) -> "PHANTOMEnv":
    """Construct a ``PHANTOMEnv``, using default constraints when none (or a falsy value) is given."""
    if not constraints:
        constraints = BusinessLogicConstraints()
    return PHANTOMEnv(constraints=constraints)
class PHANTOMEnv(gym.Env):
metadata = {"render_modes": []}
metadata = {"render_modes": ["human", "ansi"]}
def __init__(self, constraints: Optional[BusinessLogicConstraints] = None, use_jax: bool = True):
def __init__(self, constraints: Optional[BusinessLogicConstraints] = None):
super().__init__()
self.constraints = constraints if isinstance(constraints, BusinessLogicConstraints) else BusinessLogicConstraints()
self.use_jax = use_jax and JAX_AVAILABLE
self.action_space = spaces.Box(low=-self.constraints.max_price_adjustment,
high=self.constraints.max_price_adjustment,
shape=(self.constraints.product_catalogue_size,), dtype=np.float32)
n_products = self.constraints.product_catalogue_size
self.observation_space = spaces.Dict({
"elasticity": spaces.Dict({
"price": spaces.Box(
low=np.full((n_products,), self.constraints.system_min_price, dtype=np.float32),
high=np.full((n_products,), self.constraints.system_max_price, dtype=np.float32),
dtype=np.float32),
"demand": spaces.Box(
low=np.zeros((n_products,), dtype=np.float32),
high=np.full((n_products,), 1e6, dtype=np.float32),
dtype=np.float32),
}),
"market": spaces.Dict({
"alpha_hat": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
"revenue_rate": spaces.Box(low=0.0, high=1e6, shape=(1,), dtype=np.float32),
"conversion_rate": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
"price_volatility": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
}),
"cost": spaces.Box(low=0.0, high=self.constraints.system_max_price, shape=(n_products,), dtype=np.float32),
})
self.commerce_platform = CommercePlatform(
product_catalogue_size=self.constraints.product_catalogue_size,
max_price=self.constraints.system_max_price,
min_price=self.constraints.system_min_price,
constraints=self.constraints)
self._rng = np.random.default_rng(self.constraints.seed)
self.t = 0
self._prev_prices: Optional[np.ndarray] = None
self.state: Dict[str, Any] = {}
self._jax_key = None
self._jax_trans = None
if self.use_jax:
self._jax_key = jax.random.PRNGKey(self.constraints.seed)
self._init_jax_transitions()
self.c = constraints or BusinessLogicConstraints()
self.n = int(self.c.product_catalogue_size)
def _init_jax_transitions(self):
try:
human_profile = _load_behavioral_profile("humans", np.ones(self.constraints.product_catalogue_size) * 0.1)
agent_profile = _load_behavioral_profile("agents", np.ones(self.constraints.product_catalogue_size) * 0.1)
self._jax_trans = compile_transitions(human_profile, agent_profile).to_jax()
except Exception:
self._jax_trans = fallback_transitions().to_jax()
self._rng = np.random.default_rng(self.c.seed)
self._t = 0
self._alpha_true = float(self.c.agent_share)
self._alpha_hat = float(self.c.agent_share)
self._costs = np.zeros(self.n, dtype=np.float32)
self._refs = np.zeros(self.n, dtype=np.float32)
self._prices: Optional[np.ndarray] = None
self._last_sessions: list[Session] = []
self._last_coi: COIWindow | None = None
self._limbo = Limbo()
self.action_space = spaces.Box(
low=np.full((self.n,), self.c.system_min_price, dtype=np.float32),
high=np.full((self.n,), self.c.system_max_price, dtype=np.float32),
dtype=np.float32,
)
self.observation_space = spaces.Dict(
{
"elasticity": spaces.Dict(
{
"price": spaces.Box(
low=np.full((self.n,), self.c.system_min_price, dtype=np.float32),
high=np.full((self.n,), self.c.system_max_price, dtype=np.float32),
dtype=np.float32,
),
"demand": spaces.Box(
low=np.zeros((self.n,), dtype=np.float32),
high=np.full((self.n,), 1e9, dtype=np.float32),
dtype=np.float32,
),
}
),
"market": spaces.Dict(
{
"alpha_hat": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
"revenue_rate": spaces.Box(low=0.0, high=1e12, shape=(1,), dtype=np.float32),
"conversion_rate": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
"price_volatility": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
}
),
"cost": spaces.Box(
low=np.zeros((self.n,), dtype=np.float32),
high=np.full((self.n,), self.c.system_max_price, dtype=np.float32),
dtype=np.float32,
),
}
)
def _reset_catalogue(self) -> None:
    """Draw a fresh catalogue: unit costs, reference prices and current prices.

    Costs are uniform on [15, 60); reference prices are cost plus a uniform
    20-60% markup; current prices start as a copy of the reference prices.
    All arrays are ``float32`` of length ``self.n``.
    """
    n_products = self.n
    unit_costs = self._rng.uniform(15.0, 60.0, size=n_products).astype(np.float32)
    self._costs = unit_costs
    markup = self._rng.uniform(0.2, 0.6, size=n_products).astype(np.float32)
    reference = (self._costs * (1.0 + markup)).astype(np.float32)
    self._refs = reference
    # Independent copy so later price updates never alias the references.
    self._prices = self._refs.copy()
def _observe_market(
    self, prices: np.ndarray
) -> tuple[list[Session], Dict[str, float], np.ndarray, np.ndarray, float, float, int]:
    """Expose ``prices`` to the simulated market and aggregate the outcome.

    A fresh integer seed is drawn from ``self._rng`` for each call, so
    successive steps see different but reproducible session samples.

    Returns:
        ``(sessions, demand_map, demand_by_product, purchases, revenue, cost,
        n_agents)`` as produced by ``put_prices_to_market``,
        ``aggregate_demand_by_product`` and ``aggregate_purchases``.
    """
    sessions, demand_map = put_prices_to_market(
        prices,
        costs=self._costs,
        alpha=self._alpha_true,
        n_sessions=int(self.c.sessions_per_step),
        seed=int(self._rng.integers(0, 2**31 - 1)),
    )
    demand_by_product = aggregate_demand_by_product(sessions, demand_map, self.n)
    purchases, revenue, cost, n_agents = aggregate_purchases(sessions, self._costs, self.n)
    # The conversion rate that used to be computed here was never returned or
    # stored, so the dead computation has been removed.
    return sessions, demand_map, demand_by_product, purchases, revenue, cost, n_agents
def _update_alpha_hat(self, sessions: list[Session]) -> float:
scores = [estimate_session_alpha(s) for s in sessions if s.events]
if not scores:
return self._alpha_hat
alpha_step = float(np.mean(scores))
self._alpha_hat = 0.8 * self._alpha_hat + 0.2 * alpha_step
self._alpha_hat = float(np.clip(self._alpha_hat, 0.0, 1.0))
return self._alpha_hat
def _reward(self, prices: np.ndarray, revenue: float, cost: float, volatility: float) -> float:
profit = float(revenue - cost)
coi_leak = float(self._last_coi.leak) if self._last_coi else 0.0
alpha_err = abs(self._alpha_hat - self._alpha_true)
return profit - self.c.coi_strength * coi_leak - self.c.w_volatility * volatility - self.c.w_estimation_error * alpha_err
def _build_obs(
self,
prices: np.ndarray,
demand_by_product: np.ndarray,
revenue: float,
conversion: float,
volatility: float,
) -> Dict[str, Any]:
return {
"elasticity": {"price": prices.astype(np.float32), "demand": demand_by_product.astype(np.float32)},
"market": {
"alpha_hat": np.array([self._alpha_hat], dtype=np.float32),
"revenue_rate": np.array([revenue], dtype=np.float32),
"conversion_rate": np.array([conversion], dtype=np.float32),
"price_volatility": np.array([volatility], dtype=np.float32),
},
"cost": self._costs.astype(np.float32),
}
# Reset the environment and return an (observation, info) pair.
# NOTE(review): indentation is not visible in this span and it reads as two revisions of
# reset() interleaved. Everything up to `return self.state, {}` works on self.constraints /
# self.commerce_platform / self.t; the statements after that return work on self.c /
# self._t / self._limbo and are unreachable as written. Confirm which revision is intended.
def reset(self, seed: Optional[int] = None, options: Optional[dict] = None):
# Forward the seed to the parent class; `options` is not read anywhere in this span.
super().reset(seed=seed)
# Re-seed only when a seed is supplied.
# NOTE(review): presumably the platform RNG and JAX key lines are nested under this check
# (PRNGKey is given `seed` directly) — confirm nesting.
if seed is not None:
self._rng = np.random.default_rng(seed)
self.commerce_platform._rng = np.random.default_rng(seed)
if self.use_jax:
self._jax_key = jax.random.PRNGKey(seed)
# Start the platform's agent-share estimate from the configured agent share.
self.commerce_platform.alpha_hat = self.constraints.agent_share
self.t = 0
# Initial prices: uniform in [60, 140), one per catalogue product, float32.
init_prices = self._rng.uniform(
low=60.0,
high=140.0,
size=(self.constraints.product_catalogue_size,),
).astype(np.float32)
# Unit costs: uniform in [15, 60), one per catalogue product, float32.
self.commerce_platform.unit_cost = self._rng.uniform(
low=15.0,
high=60.0,
size=(self.constraints.product_catalogue_size,),
).astype(np.float32)
# Copies are stored so later changes to one array do not alias the others.
self.commerce_platform.base_price = init_prices.copy()
self._prev_prices = init_prices.copy()
# Initial observation: zero demand and zeroed market metrics except alpha_hat.
self.state = {
"elasticity": {
"price": init_prices,
"demand": np.zeros((self.constraints.product_catalogue_size,), dtype=np.float32),
},
"market": {
"alpha_hat": np.array([self.constraints.agent_share], dtype=np.float32),
"revenue_rate": np.array([0.0], dtype=np.float32),
"conversion_rate": np.array([0.0], dtype=np.float32),
"price_volatility": np.array([0.0], dtype=np.float32),
},
"cost": self.commerce_platform.unit_cost.astype(np.float32),
}
return self.state, {}
# NOTE(review): the statements below follow a return and use the self.c / self._t naming;
# they look like the other revision's reset body — confirm.
self._t = 0
# The true agent share is clipped into self.c.alpha_bounds; the estimate starts unclipped.
self._alpha_true = float(np.clip(self.c.agent_share, *self.c.alpha_bounds))
self._alpha_hat = float(self.c.agent_share)
self._reset_catalogue()
self._limbo = Limbo()
self._last_sessions = []
self._last_coi = None
# Run one simulation step through the JAX session sampler and return (result, diagnostics).
# NOTE(review): indentation is not visible here; the three statements after
# `return result, diagnostics` are unreachable as written and look like the tail of a
# different method (they build an observation and return alpha_true) — confirm.
def _step_jax(self, new_prices: np.ndarray) -> Tuple[Dict, Dict]:
# Split the stored key so each call samples with a fresh subkey.
self._jax_key, subkey = jax.random.split(self._jax_key)
# Agent share is clipped to [0, 0.95] before it is turned into session counts.
alpha = float(np.clip(self.commerce_platform.alpha_hat, 0.0, 0.95))
# Both counts are floored at 1, so their sum can exceed sessions_per_step.
n_agent = max(1, int(self.constraints.sessions_per_step * alpha))
n_human = max(1, self.constraints.sessions_per_step - n_agent)
batch = sample_sessions(subkey, self._jax_trans, n_human, n_agent, len(new_prices))
sim = compute_metrics(batch, new_prices, self.commerce_platform.unit_cost, self.commerce_platform.base_price)
# Re-key the metrics object's fields into the dictionary shape read by step().
result = {"revenue_observed": sim.revenue, "revenue_oracle": sim.revenue_oracle,
"agent_loss": sim.agent_loss, "coi": sim.coi, "look_to_book": sim.look_to_book,
"mean_sale_price": sim.mean_sale_price, "true_human_purchases": sim.n_human_purchases,
"true_agent_purchases": sim.n_agent_purchases}
# "alpha_hat" here is the clipped value used for this step, not the stored estimate.
diagnostics = {"demand_human": sim.demand_human, "demand_agent": sim.demand_agent, "alpha_hat": alpha}
return result, diagnostics
# NOTE(review): unreachable as written; falls back to zero prices when self._prices is None
# and builds an all-zero-metrics observation.
prices = self._prices if self._prices is not None else np.zeros(self.n, dtype=np.float32)
obs = self._build_obs(prices, np.zeros(self.n, dtype=np.float32), 0.0, 0.0, 0.0)
return obs, {"alpha_true": self._alpha_true}
# NOTE(review): this definition is immediately followed by a second `def step`, which
# would replace it as written; only the price update is visible here — confirm which
# revision is intended.
def step(self, action: np.ndarray):
# Advance the step counter.
self.t += 1
# Prices currently held in the observation state.
base_prices = self.state["elasticity"]["price"].astype(np.float32)
# The action is applied as a relative adjustment (price * (1 + action)) and the result is
# clipped to the system price bounds.
new_prices = np.clip(base_prices * (1.0 + action.astype(np.float32)),
self.constraints.system_min_price,
self.constraints.system_max_price).astype(np.float32)
# Apply a pricing action, observe the market and return
# (observation, reward, terminated, truncated, info).
# NOTE(review): indentation is not visible and this span reads as two revisions of step()
# interleaved: one uses self.state / self.constraints / self.commerce_platform / `result` /
# `new_prices`, the other uses self._prices / self.c / self._limbo / `prices`. `new_prices`
# is not assigned anywhere inside this definition. Confirm the intended revision before
# changing any statement here.
def step(self, action: np.ndarray) -> Tuple[Dict[str, Any], float, bool, bool, Dict[str, Any]]:
# Guard: prices are only available after reset().
if self._prices is None:
raise RuntimeError("reset() must be called before step()")
self.state["elasticity"]["price"] = new_prices
# Choose the simulation backend: JAX sampler or the platform's session simulator.
if self.use_jax:
result, diagnostics = self._step_jax(new_prices)
else:
interactions_df, diagnostics = self.commerce_platform._simulate_sessions(new_prices)
result = self.commerce_platform.compute_interaction_features(interactions_df)
COI = float(result.get("coi", 0.0))
# NOTE(review): the parenthesis opened by `diagnostics.get(` below is never closed before
# the next statement starts — a line appears to be missing here.
demand_vector = diagnostics.get("demand_human", np.zeros_like(new_prices)) + diagnostics.get(
"demand_agent", np.zeros_like(new_prices)
prev = self._prices
# Turn the raw action into admissible prices via constrain_prices, passing the configured
# price bounds, maximum adjustment and minimum margin.
prices = constrain_prices(
prev,
np.asarray(action, dtype=np.float32),
costs=self._costs,
min_price=float(self.c.system_min_price),
max_price=float(self.c.system_max_price),
max_adjustment=float(self.c.max_price_adjustment),
min_margin_pct=float(self.c.min_margin_pct),
)
self.state["elasticity"]["demand"] = demand_vector.astype(np.float32)
self._prices = prices
self._limbo.add_update("prices", prices)
# Volatility here is the mean absolute relative price change versus the previous step
# (1e-6 guards the division); it is reassigned further down as a standard deviation.
volatility = 0.0 if self._prev_prices is None else \
float(np.mean(np.abs((new_prices - self._prev_prices) / (self._prev_prices + 1e-6))))
self._prev_prices = new_prices.copy()
sessions, demand_map, demand_by_product, purchases, revenue, cost, n_agents = self._observe_market(prices)
self._last_sessions = sessions
self._limbo.add_update("demand", demand_map)
# update market observation features
total_demand = float(np.sum(demand_vector))
total_purchases = float(result.get("true_human_purchases", 0.0) + result.get("true_agent_purchases", 0.0))
# Denominator is floored at 1.0 to avoid dividing by zero demand.
conv_rate = total_purchases / max(total_demand, 1.0)
self.state["market"] = {
"alpha_hat": np.array([float(diagnostics.get("alpha_hat", self.commerce_platform.alpha_hat))], dtype=np.float32),
"revenue_rate": np.array([float(result.get("revenue_observed", 0.0))], dtype=np.float32),
"conversion_rate": np.array([float(np.clip(conv_rate, 0.0, 1.0))], dtype=np.float32),
"price_volatility": np.array([float(volatility)], dtype=np.float32),
}
self.state["cost"] = self.commerce_platform.unit_cost.astype(np.float32)
# Refresh the agent-share estimate and the COI summary from this step's sessions.
self._update_alpha_hat(self._last_sessions)
self._last_coi = compute_coi_window(self._last_sessions, self._costs, demand_mapping=demand_map)
# extract metrics with safe defaults for incomplete simulation
revenue_observed = float(result.get("revenue_observed", 0.0))
agent_loss = float(result.get("agent_loss", 0.0))
# The true agent share drifts by alpha_drift each step and stays within alpha_bounds.
self._alpha_true = float(np.clip(self._alpha_true + self.c.alpha_drift, *self.c.alpha_bounds))
# Second volatility definition: standard deviation of the relative price change.
volatility = float(np.std((prices - prev) / (prev + 1e-6)))
reward = float(self._reward(prices, revenue, cost, volatility))
conversion = float(np.sum(purchases) / max(len(self._last_sessions), 1))
# NOTE(review): reward is assigned a second time here, so as written this value replaces
# the one from self._reward. The last term subtracts the bare weight w_estimation_error
# without multiplying it by an error quantity — confirm.
reward = (revenue_observed
- COI
- self.constraints.w_agent_loss * agent_loss
- self.constraints.w_volatility * volatility
- self.constraints.w_estimation_error)
self._t += 1
# NOTE(review): `terminated` is assigned twice with different counters and limits
# (self._t vs self.c.max_steps, then self.t vs self.constraints.episode_length).
terminated = self._t >= int(self.c.max_steps)
terminated = self.t >= self.constraints.episode_length
# Volatility is capped at 1.0 when passed into the observation.
obs = self._build_obs(prices, demand_by_product, revenue, conversion, min(volatility, 1.0))
# NOTE(review): the key "alpha_hat" appears twice in this literal; in a Python dict literal
# the later entry wins.
info = {
"t": self.t,
"revenue_observed": revenue_observed,
"revenue_oracle": float(result.get("revenue_oracle", revenue_observed)),
"agent_loss": agent_loss,
"ux_volatility": volatility,
"look_to_book": float(result.get("look_to_book", 0.0)),
"mean_sale_price": float(result.get("mean_sale_price", 0.0)),
"true_human_purchases_total": float(result.get("true_human_purchases", 0.0)),
"true_agent_purchases_total": float(result.get("true_agent_purchases", 0.0)),
"coi": COI,
"alpha_hat": diagnostics.get("alpha_hat", self.commerce_platform.alpha_hat),
"mean_human_demand": float(np.mean(diagnostics.get("demand_human", np.zeros_like(new_prices)))),
"mean_agent_demand": float(np.mean(diagnostics.get("demand_agent", np.zeros_like(new_prices)))),
"step": self._t,
"reward": reward,
"revenue": float(revenue),
"profit": float(revenue - cost),
"n_sessions": int(self.c.sessions_per_step),
"n_agents": int(n_agents),
"alpha_true": float(self._alpha_true),
"alpha_hat": float(self._alpha_hat),
"alpha_error": float(abs(self._alpha_hat - self._alpha_true)),
"price_std": float(np.std(prices)),
"price_volatility": float(volatility),
}
# NOTE(review): two `if` headers in a row and a single info.update mixing diagnostics keys
# with COI fields — presumably one condition/key set per revision; confirm.
if "delta_h_mean" in diagnostics:
if self._last_coi is not None:
info.update(
{
"delta_h_mean": diagnostics["delta_h_mean"],
"delta_a_mean": diagnostics["delta_a_mean"],
"prob_agent_mean": diagnostics["prob_agent_mean"],
"coi_policy": float(self._last_coi.policy),
"coi_agent": float(self._last_coi.agent),
"coi_leakage": float(self._last_coi.leak),
"coi_survival": float(self._last_coi.survival_ratio),
"coi_erosion": float(coi_erosion(self._last_coi.policy, self._last_coi.agent)),
}
)
# NOTE(review): two returns in sequence; the second is unreachable as written. Both pass
# False as the fourth element.
return self.state, float(reward), terminated, False, info
return obs, reward, terminated, False, info
def render(self, mode: str = "human") -> str | None:
    """Summarise the current step as one line of text.

    Returns ``None`` while ``self._prices`` is ``None``.  Otherwise the
    summary (step counter, true and estimated alpha, price standard
    deviation) is printed when ``mode == "human"`` and returned for every
    mode.
    """
    if self._prices is None:
        return None
    pieces = (
        f"t={self._t}/{self.c.max_steps} ",
        f"alpha_true={self._alpha_true:.3f} alpha_hat={self._alpha_hat:.3f} ",
        f"price_std={float(np.std(self._prices)):.2f}",
    )
    summary = "".join(pieces)
    if mode == "human":
        print(summary)
    return summary
# Script entry point: run one episode with randomly sampled actions, collect per-step
# metrics, print progress, and save/show a 3x4 grid of metric plots.
# NOTE(review): indentation is not visible in this span, so loop nesting is inferred from
# the statements themselves. The script reads info keys such as 't', 'revenue_observed' and
# 'true_human_purchases_total' and the attribute env.commerce_platform — confirm these match
# the environment revision in use.
if __name__ == "__main__":
import matplotlib.pyplot as plt
from collections import defaultdict
env = PHANTOMEnv(constraints=BusinessLogicConstraints())
# Fixed seed for the demo episode.
obs, _ = env.reset(seed=42)
# metric name -> list of per-step values
metrics = defaultdict(list)
total_reward = 0.0
done = False
while not done:
action = env.action_space.sample()
# `done` takes the third element returned by step(); the fourth element is discarded.
obs, reward, done, _, info = env.step(action)
total_reward += reward
# Catalogue-wide summaries of the observed price and demand vectors.
p_mean = float(np.mean(obs["elasticity"]["price"]))
q_mean = float(np.mean(obs["elasticity"]["demand"]))
p_std = float(np.std(obs["elasticity"]["price"]))
metrics['t'].append(info['t'])
metrics['price_mean'].append(p_mean)
metrics['price_std'].append(p_std)
metrics['demand_mean'].append(q_mean)
metrics['revenue_observed'].append(info['revenue_observed'])
metrics['revenue_oracle'].append(info['revenue_oracle'])
metrics['agent_loss'].append(info['agent_loss'])
metrics['ux_volatility'].append(info['ux_volatility'])
metrics['look_to_book'].append(info['look_to_book'])
metrics['reward'].append(reward)
metrics['human_purchases'].append(info['true_human_purchases_total'])
metrics['agent_purchases'].append(info['true_agent_purchases_total'])
# Optional keys fall back to defaults so every metric list gets one value per step.
metrics['coi'].append(info.get('coi', 0.0))
metrics['alpha_hat'].append(info.get('alpha_hat', env.commerce_platform.alpha_hat))
metrics['mean_human_demand'].append(info.get('mean_human_demand', 0.0))
metrics['mean_agent_demand'].append(info.get('mean_agent_demand', 0.0))
metrics['delta_h_mean'].append(info.get('delta_h_mean', 0.0))
metrics['delta_a_mean'].append(info.get('delta_a_mean', 0.0))
metrics['prob_agent_mean'].append(info.get('prob_agent_mean', 0.0))
# Progress line every 20 steps and on the final step.
if info['t'] % 20 == 0 or done:
print(f"t={info['t']:03d} p={p_mean:6.2f}±{p_std:4.2f} q={q_mean:6.2f} "
f"rev={info['revenue_observed']:7.2f} oracle={info['revenue_oracle']:7.2f} "
f"loss={info['agent_loss']:6.2f} ux={info['ux_volatility']:.3f} "
f"coi={info.get('coi', 0.0):6.2f} alpha={info.get('alpha_hat', 0.0):4.2f} "
f"ltb={info['look_to_book']:5.2f} r={reward:7.2f}")
print(f"total_reward={total_reward:.2f}")
# 3 rows x 4 columns: one subplot for each of the 12 entries in plot_configs.
fig, axes = plt.subplots(3, 4, figsize=(18, 12))
fig.suptitle('PHANTOM Environment Run', fontsize=14, fontweight='bold')
# (metrics key, subplot title, y-axis label)
plot_configs = [
('price_mean', 'Mean Price', 'Price'),
('demand_mean', 'Mean Demand (All)', 'Demand'),
('mean_human_demand', 'Mean Human Demand', 'Count'),
('mean_agent_demand', 'Mean Agent Demand', 'Count'),
('revenue_observed', 'Revenue (Observed)', 'Revenue'),
('agent_loss', 'Agent Loss (Oracle - Observed)', 'Loss'),
('coi', 'Cost of Information', 'COI'),
('alpha_hat', 'Estimated α̂', 'alpha'),
('ux_volatility', 'UX Volatility (Price Change)', 'Volatility'),
('look_to_book', 'Look-to-Book Ratio', 'Ratio'),
('reward', 'Step Reward', 'Reward'),
('prob_agent_mean', 'Avg Agent Probability', 'Probability'),
]
for idx, (key, title, ylabel) in enumerate(plot_configs):
# Row-major placement in the grid: row = idx // 4, column = idx % 4.
ax = axes[idx // 4, idx % 4]
ax.plot(metrics['t'], metrics[key], color='blue', alpha=0.7, linewidth=1.5)
ax.set_xlabel('Step')
ax.set_ylabel(ylabel)
ax.set_title(title, fontsize=10, fontweight='bold')
ax.grid(True, alpha=0.3)
plt.tight_layout()
# The figure is written to disk before the interactive window is opened.
plt.savefig('phantom_env_comparison.png', dpi=150, bbox_inches='tight')
print("Plot saved to phantom_env_comparison.png")
plt.show()
def close(self) -> None:
    """No-op: returns immediately without touching any state."""
    return None