Merge pull request #51 from velocitatem/feat-strong-learning-implementation-with-data-contamination

Feat strong learning implementation with data contamination
This commit is contained in:
Daniel Alves Rösel
2026-01-31 10:15:09 +01:00
committed by GitHub
33 changed files with 2828 additions and 328 deletions

6
.gitignore vendored
View File

@@ -9,7 +9,11 @@
*.old *.old
**/package-lock.json **/package-lock.json
**/*.parquet **/*.parquet
**/_build/
paper/src/bib/auto
=======
**/_build/
paper/src/auto/* paper/src/auto/*
paper/src/bib/auto paper/src/bib/auto
docs/goals/*.md docs/goals/*.md
@@ -24,3 +28,5 @@ sim/rl/behavior_loader/*.png
sim/rl/behavior_loader/*.svg sim/rl/behavior_loader/*.svg
sim/rl/behavior_loader/*.pdf sim/rl/behavior_loader/*.pdf
tests/e2e/node_modules/** tests/e2e/node_modules/**
lab/case/thesis/runs*/
sim/case/thesis_simplified/runs*/

66
engine/engine.py Normal file
View File

@@ -0,0 +1,66 @@
from sys import platform
import numpy as np
from .lib.demand import generate_demand, estimate_demand
from .lib.behavior import sample_behavior
from logging import INFO, getLogger
logger = getLogger(__name__)
logger.setLevel(INFO)
class MarketEngine():
def __init__(self,
alpha = 0.5,
N = 100,
demand_distribution = (50, 10),
demand_sampling_function = np.random.normal):
self.Nagents = int(N*alpha)
self.Nhumans = int(N*(1-alpha))
self.demand = (demand_sampling_function, demand_distribution)
def act(self, prices):
demand = generate_demand(prices, *self.demand)
sample_n = lambda n, human: [sample_behavior(demand, human=human) for _ in range(n)]
human_t, agent_t = sample_n(self.Nhumans, True), sample_n(self.Nagents, False)
trajectories = human_t + agent_t
demand_estimate = estimate_demand(trajectories)
return demand_estimate
def measure(self):
pass
class PricingEngine():
def __init__(self,
) -> None:
pass
def act(self, demand):
return np.random.uniform(low=25, high=100, size=10)
class Limbo():
def __init__(self,
platform,
market
) -> None:
self.platform_turn = True
self.platform = platform
self.market = market
self.output = None
def step(self):
# we could code golf this a little bit
if self.platform_turn:
self.output = self.platform.act(self.output)
else:
self.output = self.market.act(self.output)
print(self.output)
self.platform_turn = not self.platform_turn
if __name__ == "__main__":
platform = PricingEngine()
market = MarketEngine()
limbo = Limbo(platform, market)
for _ in range(10):
limbo.step()

3
engine/lib/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
from .demand import generate_demand, estimate_demand
from .behavior import sample_behavior
from .render import DashboardRenderer, style_axis

47
engine/lib/behavior.py Normal file
View File

@@ -0,0 +1,47 @@
from sim.rl.behavior_loader.models import BehaviorModel, AgentBehaviorModel, aggregate_event_transitions
import pandas as pd
import numpy as np
from .demand import generate_demand
base_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments"
human_dir, agent_dir = f"{base_dir}/collected_data/", f"{base_dir}/agents/collected_data/"
_cache = {} # lazy cache for models and base pivots
def _get_base_pivot(human: bool):
key = 'human' if human else 'agent'
if key not in _cache:
model = BehaviorModel(human_dir) if human else AgentBehaviorModel(agent_dir)
mdp = model.build_MDP()
_cache[key] = pd.DataFrame(aggregate_event_transitions(mdp)).fillna(0.0)
return _cache[key]
def adjust_behavior_to_condition(condition, transition_matrix):
# expand NxN transition matrix to (N*P)x(N*P) weighted by demand condition
cond_norm = condition / np.sum(condition)
n_products = len(condition)
base_vals = transition_matrix.values
base_cols, base_rows = transition_matrix.columns.tolist(), transition_matrix.index.tolist()
# expand via kronecker-like tiling: each cell becomes a P*P block weighted by outer product of cond_norm
expanded = np.kron(base_vals, np.outer(cond_norm, cond_norm))
new_cols = [f"{c}_product{p}" for c in base_cols for p in range(n_products)]
new_rows = [f"{r}_product{p}" for r in base_rows for p in range(n_products)]
return pd.DataFrame(expanded, index=new_rows, columns=new_cols)
def sample_behavior(condition, human=True, max_len=40):
base_pivot = _get_base_pivot(human)
adjusted_transitions = adjust_behavior_to_condition(condition, base_pivot)
trajectory = [np.random.choice(adjusted_transitions.index)]
while len(trajectory) < max_len or 'checkout' in trajectory[-1]:
probs = adjusted_transitions.loc[trajectory[-1]].values
sample = np.random.choice(adjusted_transitions.columns, p=probs/np.sum(probs) if np.sum(probs) > 0 else None)
trajectory.append(sample)
return trajectory
if __name__ == "__main__":
t=sample_behavior(generate_demand(np.array([10,20,30])), human=True)
print(t)
t=sample_behavior(generate_demand(np.array([10,20,30])), human=False)
print(t)

45
engine/lib/demand.py Normal file
View File

@@ -0,0 +1,45 @@
import logging
import numpy as np
from logging import getLogger
logger = getLogger(__name__)
def generate_demand(prices, distribution_method = np.random.normal, distribution_params = (50.0, 10.0)):
# assumption 1: each product has an intrinsic valuation drawn from a normal distribution centered at 50
product_valuations = distribution_method(*distribution_params, size=len(prices))
# assumption 2: demand decreases as price increases, following a simple linear model
demand = np.maximum(0, product_valuations - prices) # demand cannot be negative
total = np.sum(demand)
demand = demand / total * 100 if total > 0 else demand # normalize to percentage, avoid div by zero
logger.info(f"Generated demand for prices {prices}: {demand} with valuations from distribution {distribution_params}")
return demand
def estimate_demand(trajectories):
demand_estimate = {}
for traj in trajectories:
for event in traj:
if 'view_product' in event:
product_id = int(event.split('_')[-1].replace('product', ''))
demand_estimate[product_id] = demand_estimate.get(product_id, 0) + 1
total_views = sum(demand_estimate.values())
for product_id in demand_estimate:
demand_estimate[product_id] = (demand_estimate[product_id] / total_views) * 100 # normalize to percentage
return demand_estimate
# Example usage
if __name__ == "__main__":
np.random.seed(42)
prices = np.array([20.0, 35.0, 50.0, 65.0])
demand = generate_demand(prices)
print("Generated Demand:", demand)
from .behavior import sample_behavior
N, alphat =200, 0.1
trajectories = []
for _ in range(int(N*(1 - alphat))):
trajectories.append(sample_behavior(demand, human=True))
for _ in range(int(N*alphat)):
trajectories.append(sample_behavior(demand, human=False))
demand_estimate = estimate_demand(trajectories)
print("Estimated Demand from Behavior:", demand_estimate)
delta = {k: demand_estimate.get(k, 0) - demand[i] for i, k in enumerate(range(len(prices)))}
delta = np.mean([np.abs(v) for v in delta.values()])
print("Demand Delta:", delta)

126
engine/lib/render.py Normal file
View File

@@ -0,0 +1,126 @@
"""rendering logic for PHANTOM environment dashboard"""
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
def style_axis(ax, title: str = None, xlabel: str = None, ylabel: str = None):
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
if title: ax.set_title(title, fontsize=11, fontweight='bold', pad=8)
if xlabel: ax.set_xlabel(xlabel, fontsize=9)
if ylabel: ax.set_ylabel(ylabel, fontsize=9)
class DashboardRenderer:
"""stateful renderer for PHANTOM market dynamics visualization"""
def __init__(self):
self.fig = None
self.gs = None
def render(self, env) -> None:
if self.fig is None:
plt.ion()
self.fig = plt.figure(figsize=(14, 10))
self.gs = GridSpec(3, 3, figure=self.fig, hspace=0.35, wspace=0.3,
left=0.07, right=0.95, top=0.92, bottom=0.08)
plt.show(block=False)
self.fig.clear()
self.fig.suptitle(f'PHANTOM Market Dynamics [t={env._step_count}, a={env.alpha:.2f}]',
fontsize=14, fontweight='bold')
demand_mat = np.array(env._demand_history).T
price_mat = np.array(env._price_history).T
elasticity = env._compute_elasticity()
self._render_scatter(env)
self._render_elasticity_bar(env, elasticity)
self._render_session_pie(env)
self._render_price_heatmap(price_mat)
self._render_demand_heatmap(demand_mat)
self._render_correlation(env.n_products, price_mat, demand_mat)
self._render_revenue(env)
self.fig.canvas.draw_idle()
self.fig.canvas.flush_events()
def _render_scatter(self, env):
ax = self.fig.add_subplot(self.gs[0, 0])
prices_flat = np.array(env._price_history).flatten()
demands_flat = np.array(env._demand_history).flatten()
product_ids = np.tile(np.arange(env.n_products), len(env._price_history))
ax.scatter(prices_flat, demands_flat, c=product_ids, cmap='plasma', alpha=0.6, s=15, edgecolors='none')
if len(prices_flat) > 1:
z = np.polyfit(prices_flat, demands_flat, 1)
p_line = np.linspace(prices_flat.min(), prices_flat.max(), 50)
ax.plot(p_line, np.polyval(z, p_line), '--', lw=1.5, alpha=0.8)
style_axis(ax, "Price-Demand Relationship", "Price ($)", "Demand")
def _render_elasticity_bar(self, env, elasticity):
ax = self.fig.add_subplot(self.gs[0, 1])
ax.barh(range(env.n_products), elasticity, alpha=0.8)
ax.axvline(0, lw=0.8, alpha=0.5)
ax.axvline(-1, lw=1, ls='--', alpha=0.5)
ax.set_yticks(range(env.n_products))
ax.set_yticklabels([f'P{i}' for i in range(env.n_products)], fontsize=7)
style_axis(ax, "Price Elasticity", "(dQ/dP)(P/Q)", None)
def _render_session_pie(self, env):
ax = self.fig.add_subplot(self.gs[0, 2])
n_h, n_a = env.market.Nhumans, env.market.Nagents
wedges, _ = ax.pie([n_h, n_a], startangle=90, wedgeprops={'linewidth': 2, 'edgecolor': 'white'})
ax.legend(wedges, [f'H ({n_h})', f'A ({n_a})'], loc='lower center', fontsize=8,
frameon=False, bbox_to_anchor=(0.5, -0.05))
ax.set_title("Session Mix", fontsize=11, fontweight='bold')
def _render_price_heatmap(self, price_mat):
ax = self.fig.add_subplot(self.gs[1, :2])
im = ax.imshow(price_mat, aspect='auto', cmap='viridis', origin='lower')
style_axis(ax, "Price Heatmap P(product, t)", "Step", "Product")
cbar = self.fig.colorbar(im, ax=ax, fraction=0.03, pad=0.02)
cbar.set_label('$', fontsize=8)
def _render_demand_heatmap(self, demand_mat):
ax = self.fig.add_subplot(self.gs[1, 2])
im = ax.imshow(demand_mat, aspect='auto', cmap='Blues', origin='lower')
style_axis(ax, "Demand Q(product, t)", "Step", None)
self.fig.colorbar(im, ax=ax, fraction=0.046, pad=0.02)
def _render_correlation(self, n_products, price_mat, demand_mat):
ax = self.fig.add_subplot(self.gs[2, 0])
if price_mat.shape[1] > 2:
corr = np.corrcoef(price_mat, demand_mat)[:n_products, n_products:]
im = ax.imshow(corr, cmap='RdBu', vmin=-1, vmax=1, aspect='auto')
ax.set_xticks(range(n_products))
ax.set_yticks(range(n_products))
ax.set_xticklabels([f'Q{i}' for i in range(n_products)], fontsize=6)
ax.set_yticklabels([f'P{i}' for i in range(n_products)], fontsize=6)
self.fig.colorbar(im, ax=ax, fraction=0.046, pad=0.02)
style_axis(ax, "Price-Demand Correlation", None, None)
def _render_revenue(self, env):
ax = self.fig.add_subplot(self.gs[2, 1:])
n_steps = len(env._revenue_history)
demand_std = [np.std(d) for d in env._demand_history]
ax.fill_between(range(n_steps), env._revenue_history, alpha=0.3)
ax.plot(env._revenue_history, linewidth=2, label='Revenue')
ax.set_xlim(0, max(n_steps, 1))
ax.set_ylim(0, max(env._revenue_history) * 1.1 if env._revenue_history else 1)
ax2 = ax.twinx()
ax2.plot(range(n_steps), demand_std, linewidth=2, ls='-', alpha=0.9, label='sigma(Demand)')
d_min, d_max = min(demand_std), max(demand_std)
margin = (d_max - d_min) * 0.2 if d_max > d_min else 0.5
ax2.set_ylim(max(0, d_min - margin), d_max + margin)
ax2.set_ylabel('Demand sigma', fontsize=9)
style_axis(ax, "Revenue & Demand Dispersion", "Step", "Revenue ($)")
ax.legend(loc='upper left', fontsize=7, frameon=False)
ax2.legend(loc='upper right', fontsize=7, frameon=False)
def close(self):
if self.fig:
plt.close(self.fig)
self.fig = None

34
engine/studies/factors.py Normal file
View File

@@ -0,0 +1,34 @@
"""shared factor definitions for experimental designs"""
import numpy as np
from dataclasses import dataclass, field
from typing import Callable, Any
@dataclass
class Factor:
name: str
levels: list
primary: bool = True # full cross vs sampled
# demand functions with compatible signatures
def demand_linear(mu, sigma, size): return np.maximum(0, np.random.normal(mu, sigma, size))
def demand_uniform(mu, sigma, size): return np.random.uniform(mu - sigma, mu + sigma, size)
def demand_exponential(mu, sigma, size): return np.random.exponential(mu, size)
def demand_logistic(mu, sigma, size): return np.random.logistic(mu, sigma, size)
DEMAND_FUNCTIONS = {
"linear": demand_linear,
"uniform": demand_uniform,
"exponential": demand_exponential,
"logistic": demand_logistic,
}
FACTORS = [
Factor("demand_fn", list(DEMAND_FUNCTIONS.keys()), primary=True),
Factor("alpha", [0.1, 0.3, 0.5, 0.7], primary=True),
Factor("n_products", [5, 15, 30, 50], primary=True),
Factor("demand_mu", [30.0, 50.0, 70.0], primary=False),
Factor("demand_sigma", [5.0, 10.0, 20.0], primary=False),
Factor("N", [100, 500, 1000], primary=False),
]
SEEDS_PER_CONFIG = 5

View File

@@ -0,0 +1,89 @@
"""full factorial design - all factor combinations"""
import sys
sys.path.insert(0, "..")
import logging
from itertools import product
import json
import hashlib
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
from .factors import FACTORS, DEMAND_FUNCTIONS, SEEDS_PER_CONFIG
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)
def generate_configs():
"""generate all factor combinations with seeds"""
all_levels = [f.levels for f in FACTORS]
names = [f.name for f in FACTORS]
configs = []
for combo in product(*all_levels):
base = {names[i]: combo[i] for i in range(len(names))}
for seed in range(SEEDS_PER_CONFIG):
cfg = {**base, "seed": seed}
cfg["id"] = hashlib.md5(json.dumps(cfg, sort_keys=True).encode()).hexdigest()[:8]
configs.append(cfg)
return configs
def run_single(cfg: dict) -> dict:
"""execute one experiment config, return metrics"""
from engine.wrapper import PHANTOM
import numpy as np
np.random.seed(cfg["seed"])
demand_fn = DEMAND_FUNCTIONS[cfg["demand_fn"]]
env = PHANTOM(
n_products=cfg["n_products"],
alpha=cfg["alpha"],
N=cfg["N"],
)
env.market.demand = (demand_fn, (cfg["demand_mu"], cfg["demand_sigma"]))
obs, _ = env.reset()
total_reward, steps = 0.0, 0
for _ in range(100):
action = env.action_space.sample()
obs, reward, term, trunc, _ = env.step(action)
total_reward += reward
steps += 1
if term: break
env.close()
return {
"id": cfg["id"],
"config": cfg,
"total_reward": total_reward,
"avg_reward": total_reward / steps if steps > 0 else 0.0,
"steps": steps,
}
def run_study(max_workers: int = None, output: str = "results_full.jsonl"):
configs = generate_configs()
log.info(f"full factorial: {len(configs)} configs ({len(configs)//SEEDS_PER_CONFIG} unique × {SEEDS_PER_CONFIG} seeds)")
results = []
with ProcessPoolExecutor(max_workers=max_workers) as ex:
for i, result in enumerate(ex.map(run_single, configs)):
results.append(result)
if (i+1) % 100 == 0: log.info(f"progress: {i+1}/{len(configs)}")
Path(output).write_text("\n".join(json.dumps(r) for r in results))
log.info(f"wrote {len(results)} results to {output}")
return results
if __name__ == "__main__":
import argparse
p = argparse.ArgumentParser()
p.add_argument("--workers", type=int, default=None)
p.add_argument("--output", default="results_full.jsonl")
p.add_argument("--dry-run", action="store_true", help="only show design size")
args = p.parse_args()
configs = generate_configs()
log.info(f"design: {len(configs)} runs | factors: {[f.name for f in FACTORS]} | levels: {[len(f.levels) for f in FACTORS]}")
if not args.dry_run:
run_study(args.workers, args.output)

106
engine/studies/mixed_lh.py Normal file
View File

@@ -0,0 +1,106 @@
"""mixed design: full factorial on primary factors, latin hypercube on secondary"""
import sys
sys.path.insert(0, "..")
import logging
from itertools import product
import json
import hashlib
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
import numpy as np
from scipy.stats.qmc import LatinHypercube
from factors import FACTORS, DEMAND_FUNCTIONS, SEEDS_PER_CONFIG
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)
LH_SAMPLES = 10
def generate_configs(lh_samples: int = LH_SAMPLES):
primary = [f for f in FACTORS if f.primary]
secondary = [f for f in FACTORS if not f.primary]
primary_grid = list(product(*[f.levels for f in primary]))
lhs = LatinHypercube(d=len(secondary), seed=42)
configs = []
for p_combo in primary_grid:
samples = lhs.random(n=lh_samples)
for s in samples:
sec_vals = {
secondary[i].name: secondary[i].levels[int(s[i] * len(secondary[i].levels))]
for i in range(len(secondary))
}
base = {primary[i].name: p_combo[i] for i in range(len(primary))}
base.update(sec_vals)
for seed in range(SEEDS_PER_CONFIG):
cfg = {**base, "seed": seed}
cfg["id"] = hashlib.md5(json.dumps(cfg, sort_keys=True).encode()).hexdigest()[:8]
configs.append(cfg)
return configs
def run_single(cfg: dict) -> dict:
from engine.wrapper import PHANTOM
import numpy as np
np.random.seed(cfg["seed"])
demand_fn = DEMAND_FUNCTIONS[cfg["demand_fn"]]
env = PHANTOM(
n_products=cfg["n_products"],
alpha=cfg["alpha"],
N=cfg["N"],
)
env.market.demand = (demand_fn, (cfg["demand_mu"], cfg["demand_sigma"]))
obs, _ = env.reset()
total_reward, steps = 0.0, 0
for _ in range(100):
action = env.action_space.sample()
obs, reward, term, trunc, _ = env.step(action)
total_reward += reward
steps += 1
if term: break
env.close()
return {
"id": cfg["id"],
"config": cfg,
"total_reward": total_reward,
"avg_reward": total_reward / steps,
"steps": steps,
}
def run_study(max_workers: int = None, output: str = "results_mixed.jsonl", lh_samples: int = LH_SAMPLES):
configs = generate_configs(lh_samples)
n_primary_cells = int(np.prod([len(f.levels) for f in FACTORS if f.primary]))
log.info(f"mixed LH: {len(configs)} configs ({n_primary_cells} primary × {lh_samples} LH × {SEEDS_PER_CONFIG} seeds)")
results = []
with ProcessPoolExecutor(max_workers=max_workers) as ex:
for i, result in enumerate(ex.map(run_single, configs)):
results.append(result)
if (i+1) % 100 == 0: log.info(f"progress: {i+1}/{len(configs)}")
Path(output).write_text("\n".join(json.dumps(r) for r in results))
log.info(f"wrote {len(results)} results to {output}")
return results
if __name__ == "__main__":
import argparse
p = argparse.ArgumentParser()
p.add_argument("--workers", type=int, default=None)
p.add_argument("--output", default="results_mixed.jsonl")
p.add_argument("--lh-samples", type=int, default=10)
p.add_argument("--dry-run", action="store_true", help="only show design size")
args = p.parse_args()
primary = [f for f in FACTORS if f.primary]
secondary = [f for f in FACTORS if not f.primary]
configs = generate_configs(args.lh_samples)
log.info(f"design: {len(configs)} runs | primary: {[f.name for f in primary]} | secondary (LH): {[f.name for f in secondary]}")
if not args.dry_run:
run_study(args.workers, args.output, args.lh_samples)

45
engine/train.py Normal file
View File

@@ -0,0 +1,45 @@
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import EvalCallback, BaseCallback
from .wrapper import PHANTOM
class RenderCallback(BaseCallback):
"""Renders environment on every step for live visualization."""
def __init__(self, env: PHANTOM):
super().__init__()
self.env = env
def _on_step(self) -> bool:
self.env.render()
return True
env = PHANTOM(n_products=10, alpha=0.3, render_mode="human")
eval_env = PHANTOM(n_products=10, alpha=0.3, render_mode=None)
model = SAC(
"MultiInputPolicy",
env,
verbose=1,
learning_rate=3e-4,
buffer_size=50000,
batch_size=256,
tau=0.005,
gamma=0.99,
)
render_cb = RenderCallback(env)
eval_cb = EvalCallback(eval_env, eval_freq=1000, n_eval_episodes=5, verbose=1)
model.learn(total_timesteps=50000, callback=[render_cb, eval_cb])
model.save("phantom_sac")
# test trained policy
env = PHANTOM(n_products=10, alpha=0.3, render_mode="human")
obs, _ = env.reset()
for _ in range(100):
action, _ = model.predict(obs, deterministic=True)
obs, reward, term, trunc, _ = env.step(action)
env.render()
if term or trunc: break
env.close()

118
engine/wrapper.py Normal file
View File

@@ -0,0 +1,118 @@
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from .engine import Limbo, MarketEngine, PricingEngine
from .lib.render import DashboardRenderer
class PHANTOM(gym.Env):
"""Gymnasium wrapper for the Limbo pricing-market simulation. Platform sets prices, market responds with demand."""
metadata = {"render_modes": ["human", "ansi"]}
def __init__(self,
n_products: int = 10,
alpha: float = 0.3,
N: int = 100,
price_bounds: tuple = (10.0, 150.0),
lambda_coi: float = 0.1,
render_mode: str = None):
super().__init__()
self.n_products = n_products
self.price_bounds = price_bounds
self.lambda_coi = lambda_coi
self.render_mode = render_mode
self.alpha = alpha
self.N = N
self.market = MarketEngine(alpha=alpha, N=N)
self._platform_stub = PricingEngine()
self._limbo = Limbo(self._platform_stub, self.market)
self.action_space = spaces.Box(
low=price_bounds[0], high=price_bounds[1],
shape=(n_products,), dtype=np.float32
)
self.observation_space = spaces.Dict({
"demand": spaces.Box(low=0.0, high=100.0, shape=(n_products,), dtype=np.float32),
"prices": spaces.Box(low=price_bounds[0], high=price_bounds[1], shape=(n_products,), dtype=np.float32),
})
self._prices = None
self._demand = None
self._step_count = 0
self._demand_history = []
self._price_history = []
self._revenue_history = []
self._renderer = None
def _get_obs(self) -> dict:
demand_arr = np.array([self._demand.get(i, 0.0) for i in range(self.n_products)], dtype=np.float32)
return {"demand": demand_arr, "prices": self._prices.astype(np.float32)}
def _compute_reward(self, prices: np.ndarray, demand: dict) -> float:
revenue = np.sum(prices * np.array([demand.get(i, 0.0) for i in range(self.n_products)]))
# TODO: implement supra-competitive price punishment
return float(revenue)
def _record_history(self):
demand_arr = np.array([self._demand.get(i, 0.0) for i in range(self.n_products)])
self._demand_history.append(demand_arr)
self._price_history.append(self._prices.copy())
self._revenue_history.append(np.sum(self._prices * demand_arr))
def reset(self, seed=None, options=None):
super().reset(seed=seed)
self._prices = np.random.uniform(*self.price_bounds, size=self.n_products)
self._demand = self.market.act(self._prices)
self._step_count = 0
self._demand_history, self._price_history, self._revenue_history = [], [], []
self._record_history()
return self._get_obs(), {}
def step(self, action: np.ndarray):
self._prices = np.clip(action, *self.price_bounds)
self._demand = self.market.act(self._prices)
self._step_count += 1
self._record_history()
reward = self._compute_reward(self._prices, self._demand)
terminated = self._step_count >= 100
return self._get_obs(), reward, terminated, False, {"step": self._step_count}
def _compute_elasticity(self) -> np.ndarray:
"""point elasticity: e = (dQ/dP) * (P/Q) via finite differences, clipped to [-5, 5]"""
if len(self._price_history) < 2:
return np.zeros(self.n_products)
p, q = np.array(self._price_history), np.array(self._demand_history)
dp, dq = np.diff(p, axis=0), np.diff(q, axis=0)
valid = np.abs(dp) > 0.5
with np.errstate(divide='ignore', invalid='ignore'):
elasticity = np.where(valid, (dq / dp) * (p[:-1] / np.maximum(q[:-1], 1.0)), 0.0)
elasticity = np.nan_to_num(np.clip(elasticity, -5.0, 5.0), nan=0.0)
return np.mean(elasticity, axis=0) if len(elasticity) > 0 else np.zeros(self.n_products)
def render(self):
if self.render_mode == "human":
if self._renderer is None:
self._renderer = DashboardRenderer()
self._renderer.render(self)
elif self.render_mode == "ansi":
return f"step={self._step_count}, prices={self._prices}, demand={self._demand}"
return None
def close(self):
if self._renderer:
self._renderer.close()
self._renderer = None
if __name__ == "__main__":
env = PHANTOM(n_products=15, alpha=0.3, N=100, render_mode="human")
obs, _ = env.reset()
for step in range(100):
action = env.action_space.sample()
obs, reward, term, trunc, info = env.step(action)
env.render()
if term: break
env.close()

View File

@@ -1,7 +1,14 @@
import pandas as pd from __future__ import annotations
import random
import os import os
import random
from pathlib import Path from pathlib import Path
from types import SimpleNamespace
import pandas as pd
from lib.separability import estimate_alpha, load_artifacts, score_session
# use relative import when in package context, fallback for standalone # use relative import when in package context, fallback for standalone
try: try:
@@ -15,6 +22,11 @@ except ImportError:
PROJECT_ROOT = Path(__file__).parent.parent.parent PROJECT_ROOT = Path(__file__).parent.parent.parent
AGENT_DATA_DIR = Path(os.getenv('PHANTOM_AGENT_DATA_DIR', PROJECT_ROOT / "experiments" / "agents" / "collected_data")) AGENT_DATA_DIR = Path(os.getenv('PHANTOM_AGENT_DATA_DIR', PROJECT_ROOT / "experiments" / "agents" / "collected_data"))
try:
SEPARABILITY_ARTIFACTS = load_artifacts()
except FileNotFoundError:
SEPARABILITY_ARTIFACTS = None
def remap_schema(df: pd.DataFrame, mapping: dict, on: str = "event_type") -> pd.DataFrame: def remap_schema(df: pd.DataFrame, mapping: dict, on: str = "event_type") -> pd.DataFrame:
"""remap column values according to mapping dict, preserving unmapped values""" """remap column values according to mapping dict, preserving unmapped values"""
@@ -23,6 +35,23 @@ def remap_schema(df: pd.DataFrame, mapping: dict, on: str = "event_type") -> pd.
return df return df
def _states_to_events(states: list[str]) -> list[SimpleNamespace]:
events: list[SimpleNamespace] = []
for idx, state in enumerate(states):
parts = state.split("|") if isinstance(state, str) else ["page", "product", str(state)]
page = f"/{parts[0]}" if parts else "/"
product = parts[1] if len(parts) > 1 else "unknown"
event_name = parts[2] if len(parts) > 2 else parts[-1]
events.append(
SimpleNamespace(
eventName=event_name,
page=page,
productId=product,
ts=float(idx),
)
)
return events
def contaminate_dataset(df: pd.DataFrame, on: str = "event_type", def contaminate_dataset(df: pd.DataFrame, on: str = "event_type",
contamination_rate: float = 0.1, contamination_rate: float = 0.1,
agent_data_dir: Path = None) -> pd.DataFrame: agent_data_dir: Path = None) -> pd.DataFrame:
@@ -48,6 +77,8 @@ def contaminate_dataset(df: pd.DataFrame, on: str = "event_type",
# generate synthetic trajectories # generate synthetic trajectories
new_rows = [] new_rows = []
alpha_estimates = []
for start_event in start_events: for start_event in start_events:
# sample trajectory from agent model, using a state that contains the event type # sample trajectory from agent model, using a state that contains the event type
mdp_states = model.mdp.get('states', []) if model.mdp else [] mdp_states = model.mdp.get('states', []) if model.mdp else []
@@ -56,11 +87,28 @@ def contaminate_dataset(df: pd.DataFrame, on: str = "event_type",
continue # skip if no matching start state continue # skip if no matching start state
start_state = random.choice(matching_starts) start_state = random.choice(matching_starts)
trajectory = model.sample_traj(start_state, max_len=20) trajectory = model.sample_traj(start_state, max_len=20)
score_payload: list[SimpleNamespace] = []
score: dict[str, float] = {}
if SEPARABILITY_ARTIFACTS:
score_payload = _states_to_events(trajectory)
score = score_session(score_payload, SEPARABILITY_ARTIFACTS)
alpha_estimates.append(
estimate_alpha(score["prob_agent"], score["delta_h"], score["delta_a"], temperature=2.0)
)
for state in trajectory: for state in trajectory:
parts = state.split('|') # page|productId|eventName format parts = state.split('|') if isinstance(state, str) else [start_event]
new_rows.append({on: parts[-1] if parts else start_event, 'source': 'synthetic_agent'}) new_rows.append({
on: parts[-1] if parts else start_event,
'source': 'synthetic_agent',
'prob_agent': score.get('prob_agent') if SEPARABILITY_ARTIFACTS and score_payload else None,
'delta_h': score.get('delta_h') if SEPARABILITY_ARTIFACTS and score_payload else None,
'delta_a': score.get('delta_a') if SEPARABILITY_ARTIFACTS and score_payload else None,
})
if new_rows: if new_rows:
contaminate_df = pd.DataFrame(new_rows) contaminate_df = pd.DataFrame(new_rows)
df = pd.concat([df, contaminate_df], ignore_index=True) df = pd.concat([df, contaminate_df], ignore_index=True)
if alpha_estimates:
df['estimated_alpha'] = sum(alpha_estimates) / len(alpha_estimates)
return df return df

View File

@@ -6,6 +6,7 @@ from procesing.steps import (
) )
def test_compute_demand(pipeline_context): def test_compute_demand(pipeline_context):
random.seed(42) # deterministic test
step = ComputeDemandStep(context=pipeline_context) step = ComputeDemandStep(context=pipeline_context)
# Test with normal interaction data # Test with normal interaction data
@@ -26,6 +27,7 @@ def test_compute_demand(pipeline_context):
def test_compute_demand_skewed(pipeline_context): def test_compute_demand_skewed(pipeline_context):
random.seed(42) # deterministic test
step = ComputeDemandStep(context=pipeline_context) step = ComputeDemandStep(context=pipeline_context)
# Test with normal interaction data # Test with normal interaction data

2
sim/case/__init__.py Normal file
View File

@@ -0,0 +1,2 @@
"""Case-specific simulations and experiments."""

View File

@@ -0,0 +1,2 @@
"""Minimal thesis-aligned pricing simulation (self-contained)."""

View File

@@ -0,0 +1,125 @@
"""Cost of Information (COI) computation for thesis pricing system.
Core KPI: COI = E[p_shown] - p_min measures pricing power from information asymmetry.
Theorem 1 shows COI erodes as agent queries increase: as N->inf, p^(1)->p_min.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, List, TYPE_CHECKING
import numpy as np
if TYPE_CHECKING:
from .simplified import Session
@dataclass(frozen=True)
class COIWindow:
"""Windowed COI metrics computed from realized price exposures.
policy: E[p_shown] - cost, the definition-level KPI
agent: E[p^(1)] - cost where p^(1) is min price under agent querying
leak: max(policy - agent, 0), observable gap from reconnaissance
survival_ratio: agent/policy, fraction of pricing power retained
"""
policy: float
agent: float
leak: float
survival_ratio: float
policy_by_product: np.ndarray
agent_by_product: np.ndarray
demand_weights: np.ndarray
def aggregate_prices(sessions: List["Session"], mode: str = "all") -> Dict[int, List[float] | float]:
"""Unified price aggregation across sessions.
mode: "all" returns all prices per product, "min_per_session" returns min price per session per product,
"min_across" returns single min price per product
"""
if mode == "min_across":
mins: Dict[int, float] = {}
for s in sessions:
for e in s.events:
pidx, price = int(e.product_idx), float(e.price_seen)
mins[pidx] = min(mins.get(pidx, price), price)
return mins
elif mode == "min_per_session":
result: Dict[int, List[float]] = {}
for s in sessions:
by_p: Dict[int, float] = {}
for e in s.events:
pidx, price = int(e.product_idx), float(e.price_seen)
by_p[pidx] = min(by_p.get(pidx, price), price)
for pidx, pmin in by_p.items():
result.setdefault(pidx, []).append(pmin)
return result
else: # "all"
prices: Dict[int, List[float]] = {}
for s in sessions:
for e in s.events:
prices.setdefault(e.product_idx, []).append(float(e.price_seen))
return prices
def demand_weights_by_product(sessions: List["Session"], demand_mapping: Dict[str, float], n_products: int) -> np.ndarray:
"""Compute demand-weighted importance per product."""
w = np.zeros(n_products, dtype=float)
sessions_by_id = {s.sid: s for s in sessions}
for sid, q in demand_mapping.items():
sess = sessions_by_id.get(sid)
if sess and sess.events:
w[int(sess.events[0].product_idx)] += float(q)
total = float(np.sum(w))
return (w / total) if total > 0 else w
def compute_coi_window(sessions: List["Session"], costs: np.ndarray, demand_mapping: Dict[str, float] | None = None) -> COIWindow:
"""Compute COI metrics over session window.
Aggregates price exposures and computes policy-level vs agent-realized COI.
"""
n = int(len(costs))
prices = aggregate_prices(sessions, mode="all")
agent_sessions = [s for s in sessions if s.actor == "A"]
agent_min = aggregate_prices(agent_sessions, mode="min_across") if agent_sessions else {}
policy_by = np.zeros(n, dtype=float)
agent_by = np.zeros(n, dtype=float)
seen = np.array([(i in prices) for i in range(n)], dtype=bool)
agent_seen = np.array([(i in agent_min) for i in range(n)], dtype=bool)
for pidx, ps in prices.items():
if 0 <= pidx < n and ps:
policy_by[pidx] = float(np.mean(ps) - float(costs[pidx]))
for pidx, pmin in agent_min.items():
if 0 <= pidx < n:
agent_by[pidx] = float(pmin - float(costs[pidx]))
agent_by[seen & ~agent_seen] = policy_by[seen & ~agent_seen] # no erosion if no agent exposure
demand_w = demand_weights_by_product(sessions, demand_mapping, n) if demand_mapping else np.zeros(n, dtype=float)
has_weights = float(np.sum(demand_w)) > 0
if has_weights:
policy, agent = float(np.dot(demand_w, policy_by)), float(np.dot(demand_w, agent_by))
elif np.any(seen):
policy, agent = float(np.mean(policy_by[seen])), float(np.mean(agent_by[seen]))
else:
policy, agent = 0.0, 0.0
leak = float(max(policy - agent, 0.0))
survival = float(np.clip(agent / policy, 0.0, 1.0)) if policy > 0 else 0.0
return COIWindow(policy=policy, agent=agent, leak=leak, survival_ratio=survival,
policy_by_product=policy_by, agent_by_product=agent_by, demand_weights=demand_w)
def coi_erosion(coi_policy: float, coi_agent: float, eps: float = 1e-9) -> float:
"""Thesis-consistent COI erosion: fraction of pricing power destroyed by agent queries.
erosion = 1 - (COI_agent / COI_policy)
When agents find low prices, COI_agent -> 0, erosion -> 1.
"""
if coi_policy <= eps:
return 0.0
return float(np.clip(1.0 - (coi_agent / (coi_policy + eps)), 0.0, 1.0))

View File

@@ -0,0 +1,325 @@
"""COI leakage experiments and policy comparisons.
Demonstrates the core thesis contribution: COI erosion under agent contamination
and recovery via robust pricing policies.
Generates TensorBoard logs for:
- COI erosion curves across contamination levels
- Policy comparison (fixed vs adaptive vs RL)
- Revenue/margin trade-offs
"""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Tuple
import json
import numpy as np
try:
from torch.utils.tensorboard import SummaryWriter
HAS_TB = True
except ImportError:
HAS_TB = False
from .simplified_env import PricingEnv, EnvConfig, make_env
from .simplified import System
@dataclass
class ExperimentResult:
"""Container for experiment metrics."""
name: str
alpha: float
reward_mean: float
reward_std: float
coi_erosion: float
alpha_error: float
revenue: float
margin: float
def to_dict(self) -> dict:
return {k: getattr(self, k) for k in self.__dataclass_fields__}
def theoretical_coi_erosion_curve(alphas: np.ndarray, n_sessions: int = 1000) -> np.ndarray:
"""Theoretical COI erosion from Theorem 1 using order statistic model.
For N i.i.d. uniform queries on [p_min, p_max]:
E[p^(1)] = p_min + (p_max - p_min)/(N+1), so erosion = 1 - 2/(N+1)
"""
erosions = []
for a in alphas:
n_agents = max(1, int(a * n_sessions))
erosions.append(1.0 - 2.0 / (n_agents + 1))
return np.array(erosions)
def run_policy_episode(
env: PricingEnv,
policy_fn,
n_episodes: int = 10
) -> Tuple[List[float], List[float], List[float], List[float]]:
"""Run policy and collect per-step metrics."""
rewards, coi_erosions, alpha_errors, revenues = [], [], [], []
for _ in range(n_episodes):
obs, info = env.reset()
done = False
while not done:
action = policy_fn(obs, env.n)
obs, reward, terminated, truncated, info = env.step(action)
done = terminated or truncated
rewards.append(reward)
if 'coi_erosion' in info:
coi_erosions.append(info['coi_erosion'])
if 'alpha_true' in info and 'alpha_est' in info:
alpha_errors.append(abs(info['alpha_true'] - info['alpha_est']))
if 'revenue' in info:
revenues.append(info['revenue'])
return rewards, coi_erosions, alpha_errors, revenues
class PolicyRegistry:
"""Registry of baseline policies."""
@staticmethod
def fixed(obs: np.ndarray, n: int, margin: float = 0.15) -> np.ndarray:
return np.ones(n, dtype=np.float32) * (1.0 + margin)
@staticmethod
def random(obs: np.ndarray, n: int, rng: np.random.Generator = None) -> np.ndarray:
rng = rng or np.random.default_rng()
return rng.uniform(0.7, 1.3, n).astype(np.float32)
@staticmethod
def adaptive(obs: np.ndarray, n: int, base_margin: float = 0.15) -> np.ndarray:
"""Reduce margins when alpha estimate is high."""
alpha_est = obs[2 * n] if len(obs) > 2 * n else 0.2
margin_scale = 1.0 - 0.4 * alpha_est
return np.ones(n, dtype=np.float32) * (1.0 + base_margin * margin_scale)
@staticmethod
def aggressive(obs: np.ndarray, n: int) -> np.ndarray:
"""High margins, ignores contamination."""
return np.ones(n, dtype=np.float32) * 1.4
@staticmethod
def defensive(obs: np.ndarray, n: int) -> np.ndarray:
"""Low margins, always cautious."""
return np.ones(n, dtype=np.float32) * 1.05
@staticmethod
def alpha_proportional(obs: np.ndarray, n: int, max_margin: float = 0.3) -> np.ndarray:
"""Margin inversely proportional to estimated alpha."""
alpha_est = obs[2 * n] if len(obs) > 2 * n else 0.2
margin = max_margin * (1.0 - alpha_est)
return np.ones(n, dtype=np.float32) * (1.0 + margin)
def run_contamination_sweep(
alphas: List[float],
policies: Dict[str, callable],
n_products: int = 10,
max_steps: int = 200,
n_episodes: int = 10,
seed: int = 42,
log_dir: str = None
) -> Dict[str, List[ExperimentResult]]:
"""Run policies across contamination levels."""
results = {name: [] for name in policies}
writer = SummaryWriter(Path(log_dir) / "sweep") if log_dir and HAS_TB else None
for alpha in alphas:
print(f" alpha={alpha:.2f}", end=" ")
env_cfg = EnvConfig(
n_products=n_products, max_steps=max_steps,
alpha_true=alpha, reward_mode="robust", seed=seed)
env = make_env(env_cfg)
for name, policy_fn in policies.items():
rewards, coi_vals, alpha_errs, revenues = run_policy_episode(env, policy_fn, n_episodes)
result = ExperimentResult(
name=name, alpha=alpha,
reward_mean=float(np.mean(rewards)),
reward_std=float(np.std(rewards)),
coi_erosion=float(np.mean(coi_vals)) if coi_vals else 0.0,
alpha_error=float(np.mean(alpha_errs)) if alpha_errs else 0.0,
revenue=float(np.mean(revenues)) if revenues else 0.0,
margin=float(np.mean([policy_fn(np.zeros(3 * n_products + 3), n_products)]) - 1.0))
results[name].append(result)
if writer:
step = int(alpha * 100)
writer.add_scalar(f'{name}/reward', result.reward_mean, step)
writer.add_scalar(f'{name}/coi_erosion', result.coi_erosion, step)
writer.add_scalar(f'{name}/alpha_error', result.alpha_error, step)
writer.add_scalar(f'{name}/revenue', result.revenue, step)
print(f"done")
# add theoretical curve
if writer:
theo = theoretical_coi_erosion_curve(np.array(alphas))
for i, (a, e) in enumerate(zip(alphas, theo)):
writer.add_scalar('theoretical/coi_erosion', e, int(a * 100))
writer.close()
return results
def run_coi_demonstration(log_dir: str = "sim/case/thesis_simplified/runs", seed: int = 42) -> Dict:
"""Main COI demonstration experiment."""
print("=== COI Leakage Demonstration ===\n")
Path(log_dir).mkdir(parents=True, exist_ok=True)
writer = SummaryWriter(Path(log_dir) / "coi_demo") if HAS_TB else None
# theoretical erosion curve
print("1. Theoretical COI erosion (Theorem 1)")
alphas = np.linspace(0.0, 0.6, 13)
theo_erosion = theoretical_coi_erosion_curve(alphas, n_sessions=1000)
for a, e in zip(alphas, theo_erosion):
print(f" alpha={a:.2f} -> erosion={e:.3f}")
if writer:
writer.add_scalar('theory/coi_erosion', e, int(a * 100))
# policy comparison
print("\n2. Policy comparison across contamination levels")
policies = {
'fixed': lambda obs, n: PolicyRegistry.fixed(obs, n),
'aggressive': PolicyRegistry.aggressive,
'defensive': PolicyRegistry.defensive,
'adaptive': PolicyRegistry.adaptive,
'alpha_proportional': PolicyRegistry.alpha_proportional,
}
sweep_alphas = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
results = run_contamination_sweep(
sweep_alphas, policies, n_products=10, max_steps=100,
n_episodes=5, seed=seed, log_dir=log_dir)
# summarize
print("\n3. Summary by policy")
for name, res_list in results.items():
avg_reward = np.mean([r.reward_mean for r in res_list])
avg_coi = np.mean([r.coi_erosion for r in res_list])
print(f" {name:20s}: avg_reward={avg_reward:.2f}, avg_coi={avg_coi:.3f}")
# save results
output = {
'theoretical': {'alphas': alphas.tolist(), 'erosion': theo_erosion.tolist()},
'empirical': {name: [r.to_dict() for r in res_list] for name, res_list in results.items()}}
with open(Path(log_dir) / "coi_demo_results.json", 'w') as f:
json.dump(output, f, indent=2)
if writer:
writer.close()
print(f"\nResults saved to {log_dir}/coi_demo_results.json")
print(f"TensorBoard: tensorboard --logdir {log_dir}")
return output
def run_reward_mode_comparison(log_dir: str = "sim/case/thesis_simplified/runs", seed: int = 42) -> Dict:
"""Compare different reward modes."""
print("=== Reward Mode Comparison ===\n")
Path(log_dir).mkdir(parents=True, exist_ok=True)
writer = SummaryWriter(Path(log_dir) / "reward_modes") if HAS_TB else None
reward_modes = ["revenue", "profit", "robust", "coi_aware"]
alpha = 0.3 # moderate contamination
results = {}
for mode in reward_modes:
print(f" mode={mode}", end=" ")
env_cfg = EnvConfig(
n_products=10, max_steps=200, alpha_true=alpha,
reward_mode=mode, seed=seed)
env = make_env(env_cfg)
rewards, coi_vals, _, revenues = run_policy_episode(
env, PolicyRegistry.adaptive, n_episodes=10)
results[mode] = {
'reward_mean': float(np.mean(rewards)),
'reward_std': float(np.std(rewards)),
'coi_erosion': float(np.mean(coi_vals)) if coi_vals else 0.0,
'revenue': float(np.mean(revenues)) if revenues else 0.0}
if writer:
for k, v in results[mode].items():
writer.add_scalar(f'{mode}/{k}', v, 0)
print(f"reward={results[mode]['reward_mean']:.2f}, coi={results[mode]['coi_erosion']:.3f}")
if writer:
writer.close()
with open(Path(log_dir) / "reward_mode_results.json", 'w') as f:
json.dump(results, f, indent=2)
return results
def run_alpha_drift_experiment(log_dir: str = "sim/case/thesis_simplified/runs", seed: int = 42) -> Dict:
"""Test policy robustness under non-stationary contamination."""
print("=== Alpha Drift Experiment ===\n")
Path(log_dir).mkdir(parents=True, exist_ok=True)
writer = SummaryWriter(Path(log_dir) / "alpha_drift") if HAS_TB else None
drift_rates = [0.0, 0.01, 0.02, 0.05]
results = {}
for drift in drift_rates:
print(f" drift={drift:.2f}", end=" ")
env_cfg = EnvConfig(
n_products=10, max_steps=200, alpha_true=0.2,
alpha_drift=drift, reward_mode="robust", seed=seed)
env = make_env(env_cfg)
rewards, coi_vals, alpha_errs, _ = run_policy_episode(
env, PolicyRegistry.adaptive, n_episodes=10)
results[f'drift_{drift}'] = {
'reward_mean': float(np.mean(rewards)),
'coi_erosion': float(np.mean(coi_vals)) if coi_vals else 0.0,
'alpha_tracking_error': float(np.mean(alpha_errs)) if alpha_errs else 0.0}
if writer:
for k, v in results[f'drift_{drift}'].items():
writer.add_scalar(f'drift_{drift}/{k}', v, 0)
print(f"reward={results[f'drift_{drift}']['reward_mean']:.2f}, "
f"alpha_err={results[f'drift_{drift}']['alpha_tracking_error']:.3f}")
if writer:
writer.close()
return results
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Run COI experiments")
parser.add_argument("--exp", type=str, default="coi", choices=["coi", "reward", "drift", "all"])
parser.add_argument("--log-dir", type=str, default="sim/case/thesis_simplified/runs")
parser.add_argument("--seed", type=int, default=42)
args = parser.parse_args()
if args.exp == "coi" or args.exp == "all":
run_coi_demonstration(args.log_dir, args.seed)
if args.exp == "reward" or args.exp == "all":
run_reward_mode_comparison(args.log_dir, args.seed)
if args.exp == "drift" or args.exp == "all":
run_alpha_drift_experiment(args.log_dir, args.seed)

View File

@@ -0,0 +1,72 @@
"""Behavioral separability for human/agent detection.
Computes divergence signals delta_H, delta_A from session trajectories using
transition kernel estimation and KL divergence to prototype behavioral profiles.
"""
from __future__ import annotations
from typing import Dict, List, Tuple, TYPE_CHECKING
import numpy as np
if TYPE_CHECKING:
from .simplified import Event, Session
# prototype behavioral kernels for human vs agent sessions
TRANS_H = {
"start": {"view": 0.85, "end": 0.15},
"view": {"detail": 0.4, "cart": 0.3, "view": 0.2, "end": 0.1},
"detail": {"cart": 0.5, "view": 0.3, "end": 0.2},
"cart": {"purchase": 0.6, "view": 0.25, "end": 0.15},
"purchase": {"end": 1.0},
}
TRANS_A = {
"start": {"view": 0.95, "end": 0.05},
"view": {"detail": 0.6, "view": 0.25, "cart": 0.1, "end": 0.05},
"detail": {"view": 0.5, "cart": 0.15, "detail": 0.3, "end": 0.05},
"cart": {"view": 0.4, "purchase": 0.2, "end": 0.4},
"purchase": {"end": 1.0},
}
def kl_div(p: Dict[str, float], q: Dict[str, float], eps: float = 1e-10) -> float:
"""KL divergence D_KL(p || q) for discrete distributions."""
keys = set(p.keys()) | set(q.keys())
return sum(p.get(k, eps) * np.log((p.get(k, eps) + eps) / (q.get(k, eps) + eps)) for k in keys)
def build_kernel(events: List["Event"]) -> Dict[str, Dict[str, float]]:
"""Build empirical transition kernel T' from trajectory events."""
trans: Dict[str, Dict[str, int]] = {}
prev = "start"
for e in events:
curr = e.action
trans.setdefault(prev, {})
trans[prev][curr] = trans[prev].get(curr, 0) + 1
prev = curr
return {s: {d: c / sum(dsts.values()) for d, c in dsts.items()} for s, dsts in trans.items() if sum(dsts.values()) > 0}
def compute_divergence(session: "Session") -> Tuple[float, float]:
"""Compute divergence signals delta_H, delta_A for session.
delta_H = mean KL(T' || T_H) across states, measures distance to human prototype
delta_A = mean KL(T' || T_A) across states, measures distance to agent prototype
"""
kernel = build_kernel(session.events)
if not kernel:
return 0.5, 0.5
delta_h = sum(kl_div(kernel.get(s, {}), TRANS_H.get(s, {})) for s in kernel) / len(kernel)
delta_a = sum(kl_div(kernel.get(s, {}), TRANS_A.get(s, {})) for s in kernel) / len(kernel)
return delta_h, delta_a
def estimate_alpha(session: "Session", beta: float = 2.0) -> float:
"""Per-session contamination estimate alpha_hat = sigma(beta*(delta_H - delta_A)).
Returns probability session is agent-generated based on behavioral divergence.
"""
dh, da = compute_divergence(session)
if (dh + da) <= 0:
return 0.5
return 1.0 / (1.0 + np.exp(-beta * (dh - da)))

View File

@@ -0,0 +1,219 @@
"""Minimal implementation of thesis pricing system.
Implements the core loop: prices -> sessions -> demand -> prices
with behavioral separability and robust pricing objective.
Objects:
- Session trajectories tau_s from mixture of H/A behavioral profiles
- Demand proxy q_hat via weighted action aggregation
- COI leakage penalty for agent reconnaissance
- Limbo: alternating price/demand history for trajectory analysis
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Dict, List, Tuple
import numpy as np
from .coi import COIWindow, compute_coi_window
from .separability import TRANS_H, TRANS_A, kl_div, build_kernel, compute_divergence, estimate_alpha
ACTION_WEIGHTS = {"add_to_cart": 0.8, "checkout": 0.9, "purchase": 1.0, "view": 0.15, "detail": 0.25, "hover": 0.3, "start": 0.05, "end": 0.0}
@dataclass
class Event:
action: str
product_idx: int
price_seen: float
ts: float
@dataclass
class Session:
sid: str
events: List[Event]
actor: str # H or A (ground truth label)
theta: Dict[str, float] = field(default_factory=dict)
def compute_demand(session: Session) -> float:
"""Compute demand proxy q_hat = sum_k omega(a_k) for session."""
return sum(ACTION_WEIGHTS.get(e.action, 0.1) for e in session.events)
def sample_trajectory(rng: np.random.Generator, trans: Dict, prices: np.ndarray, costs: np.ndarray, theta: Dict[str, float],
is_agent: bool, session_noise: float = 0.02, surge: float = 0.08, max_mult: float = 1.8) -> Tuple[List[Event], int]:
"""Sample session trajectory from behavioral kernel."""
pidx = int(rng.integers(0, len(prices)))
cost, base = float(costs[pidx]), float(prices[pidx]) * (1.0 + rng.normal(0.0, session_noise))
base = float(np.clip(base, cost * 1.01, float(prices[pidx]) * 2.0))
price, signal, state, t = base, 0.0, "start", 0.0
events = []
while state != "end" and len(events) < 30:
probs = trans.get(state, {"end": 1.0})
nxt = rng.choice(list(probs.keys()), p=list(probs.values()))
if nxt == "purchase": # purchase conversion check
rel = max((price - cost) / (cost + 1e-6), 0.0)
p_buy = float(np.clip(theta.get("base_conv", 0.2) * np.exp(-theta.get("price_sens", 2.0) * rel), 0.0, 1.0))
if rng.random() > p_buy:
nxt = "end"
state = nxt
if state not in {"start", "end"}:
events.append(Event(action=state, product_idx=pidx, price_seen=float(price), ts=t))
signal += float(ACTION_WEIGHTS.get(state, 0.1))
price = float(np.clip(base * (1.0 + surge * signal), cost * 1.01, base * max_mult))
t += max(0.2, rng.gamma(1.5, 0.8) if is_agent else rng.gamma(2.0, 1.2))
return events, pidx
def put_prices_to_market(prices: np.ndarray, costs: np.ndarray, alpha: float = 0.2, n_sessions: int = 50,
seed: int | None = None) -> Tuple[List[Session], Dict[str, float]]:
"""Generate sessions from mixture model. Returns sessions and demand mapping sid -> q_hat."""
rng = np.random.default_rng(seed)
sessions, demand = [], {}
for i in range(n_sessions):
sid = f"s{i:04d}"
is_agent = rng.random() < alpha
trans = TRANS_A if is_agent else TRANS_H
theta = {"price_sens": rng.uniform(0.05, 0.2), "base_conv": 0.01} if is_agent else \
{"price_sens": rng.uniform(1.5, 4.0), "base_conv": rng.uniform(0.2, 0.5)}
events, _ = sample_trajectory(rng, trans, prices, costs=costs, theta=theta, is_agent=is_agent)
session = Session(sid=sid, events=events, actor="A" if is_agent else "H", theta=theta)
sessions.append(session)
demand[sid] = compute_demand(session)
return sessions, demand
@dataclass
class LimboUpdate:
utype: str # "prices" or "demand"
data: np.ndarray | Dict[str, float]
t: int
class Limbo:
"""Historical trajectory of alternating price/demand observations."""
def __init__(self):
self.history: List[LimboUpdate] = []
self._t = 0
def add_update(self, utype: str, data: np.ndarray | Dict[str, float]) -> Dict:
self.history.append(LimboUpdate(utype=utype, data=data, t=self._t))
self._t += 1
return {"action": "observe_demand" if utype == "prices" else "set_prices"}
def get_prices_history(self) -> List[np.ndarray]:
return [u.data for u in self.history if u.utype == "prices"]
def get_demand_history(self) -> List[Dict[str, float]]:
return [u.data for u in self.history if u.utype == "demand"]
class System:
"""Main pricing system implementing robust Stackelberg objective.
Manages the alternating loop: set prices p_t -> observe demand Q_hat(p_t) ->
estimate contamination alpha from behavioral signals -> compute next prices.
"""
def __init__(self, n_products: int = 10, costs: np.ndarray | None = None, lambda_coi: float = 0.5, seed: int | None = 42):
self.n = n_products
self.rng = np.random.default_rng(seed)
self.costs = costs if costs is not None else self.rng.uniform(10, 50, n_products)
self.refs = self.costs * (1 + self.rng.uniform(0.2, 0.5, n_products))
self.lambda_coi = lambda_coi
self.limbo = Limbo()
self._alpha_est = 0.2
self._sessions: List[Session] = []
self._last_sessions: List[Session] = []
self._last_coi: COIWindow | None = None
@property
def alpha(self) -> float:
return self._alpha_est
def _estimate_alpha_from_sessions(self) -> float:
if not self._sessions:
return self._alpha_est
return float(np.mean([estimate_alpha(s) for s in self._sessions[-50:]]))
def _revenue_under_demand(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
agg = np.zeros(self.n)
for sid, q in demand.items():
sess = next((s for s in self._sessions if s.sid == sid), None)
if sess and sess.events:
agg[sess.events[0].product_idx] += q
return float(np.dot(prices, agg))
def _compute_coi_window(self, demand: Dict[str, float]) -> COIWindow:
if not self._last_sessions:
zeros = np.zeros(self.n, dtype=float)
return COIWindow(policy=0.0, agent=0.0, leak=0.0, survival_ratio=0.0,
policy_by_product=zeros, agent_by_product=zeros, demand_weights=zeros)
return compute_coi_window(self._last_sessions, self.costs, demand_mapping=demand)
def _objective(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
"""Robust objective: R(p,d) - lambda * COI_leak."""
profit = self._revenue_under_demand(prices, demand) - float(np.sum(self.costs))
self._last_coi = self._compute_coi_window(demand)
return profit - self.lambda_coi * self._last_coi.leak
def compute_prices(self, demand: Dict[str, float] | None = None) -> np.ndarray:
"""Compute next prices via heuristic margin adjustment based on alpha estimate."""
self._alpha_est = self._estimate_alpha_from_sessions()
margin_scale = 1.0 - 0.5 * self._alpha_est # defensive pricing under high contamination
margins = (self.refs - self.costs) * margin_scale
noise = self.rng.normal(0, 0.02, self.n) * self.costs
prices = np.clip(self.costs + margins + noise, self.costs * 1.02, self.refs * 1.3)
self.limbo.add_update("prices", prices)
return prices
def observe_demand(self, prices: np.ndarray, alpha_true: float = 0.2, n_sessions: int = 50) -> Dict[str, float]:
sessions, demand_map = put_prices_to_market(prices, costs=self.costs, alpha=alpha_true,
n_sessions=n_sessions, seed=int(self.rng.integers(0, 10000)))
self._last_sessions = sessions
self._sessions.extend(sessions)
self.limbo.add_update("demand", demand_map)
return demand_map
def step(self, alpha_true: float = 0.2, n_sessions: int = 50) -> Tuple[np.ndarray, Dict[str, float], float, COIWindow]:
demand_hist = self.limbo.get_demand_history()
prices = self.compute_prices(demand_hist[-1] if demand_hist else None)
demand = self.observe_demand(prices, alpha_true, n_sessions)
reward = self._objective(prices, demand)
return prices, demand, reward, self._last_coi or self._compute_coi_window(demand)
def run(self, n_steps: int = 100, alpha_true: float = 0.2) -> Dict:
traj = {"prices": [], "demand": [], "rewards": [], "alpha_est": [], "alpha_true": alpha_true,
"coi_policy": [], "coi_agent": [], "coi_leak": [], "coi_survival": []}
for _ in range(n_steps):
p, d, r, coi = self.step(alpha_true)
traj["prices"].append(p); traj["demand"].append(d); traj["rewards"].append(r)
traj["alpha_est"].append(self._alpha_est)
traj["coi_policy"].append(coi.policy); traj["coi_agent"].append(coi.agent)
traj["coi_leak"].append(coi.leak); traj["coi_survival"].append(coi.survival_ratio)
return traj
if __name__ == "__main__":
sys = System(n_products=5, seed=42)
traj = sys.run(n_steps=20, alpha_true=0.25)
print(f"avg reward: {np.mean(traj['rewards']):.2f}, final alpha_hat: {traj['alpha_est'][-1]:.3f}, "
f"COI_policy: {np.mean(traj['coi_policy']):.3f}, COI_agent: {np.mean(traj['coi_agent']):.3f}, leak: {np.mean(traj['coi_leak']):.3f}")
prices = np.array([20.0, 35.0, 50.0, 25.0, 40.0])
costs = np.array([15.0, 28.0, 40.0, 18.0, 30.0])
sessions, demand = put_prices_to_market(prices, costs=costs, alpha=0.3, n_sessions=20, seed=123)
print(f'sessions: {len(sessions)}, agents: {sum(1 for s in sessions if s.actor=="A")}')
for n in [1, 5, 10, 50, 100]:
# theoretical: erosion = 1 - 2/(N+1) for uniform order statistic
print(f'N={n:3d} agents -> COI erosion: {1.0 - 2.0/(n+1):.3f}')
events = [Event('view', 0, 20.0, 0.1), Event('detail', 0, 20.0, 0.5), Event('cart', 0, 20.0, 1.0), Event('purchase', 0, 20.0, 2.0)]
print(f'human-like session alpha_hat: {estimate_alpha(Session(sid="test", events=events, actor="H")):.3f}')
events_a = [Event('view', 0, 20.0, 0.1), Event('detail', 0, 20.0, 0.2), Event('view', 0, 20.0, 0.3), Event('detail', 0, 20.0, 0.4)]
print(f'agent-like session alpha_hat: {estimate_alpha(Session(sid="test2", events=events_a, actor="A")):.3f}')

View File

@@ -0,0 +1,249 @@
"""Gymnasium-compatible RL environment for thesis pricing system.
Wraps simplified.System with standard Gym interface for training pricing policies.
Supports multiple reward modes and contamination scenarios.
Action: price multipliers [0.5, 1.5] applied to reference prices
Observation: [prices, demand_agg, alpha_est, margins, position_proxy]
Reward: configurable objective (revenue, profit, robust, coi-aware)
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Any, Dict, Tuple
import numpy as np
try:
import gymnasium as gym
from gymnasium import spaces
HAS_GYM = True
except ImportError:
HAS_GYM = False
from .simplified import System, Session, Event, Limbo, put_prices_to_market, compute_demand, estimate_alpha
from .coi import COIWindow, compute_coi_window, coi_erosion
@dataclass
class EnvConfig:
n_products: int = 5
max_steps: int = 200
sessions_per_step: int = 30
alpha_true: float = 0.2
alpha_drift: float = 0.0
alpha_bounds: Tuple[float, float] = (0.0, 0.6)
lambda_coi: float = 0.5
lambda_vol: float = 0.1
reward_mode: str = "robust" # revenue | profit | robust | coi_aware
normalize_reward: bool = True
seed: int | None = 42
def aggregate_purchases(sessions: list[Session], n_products: int, costs: np.ndarray) -> Tuple[np.ndarray, float, float]:
"""Aggregate purchases from sessions, returns (counts, revenue, cost)."""
purchases = np.zeros(n_products, dtype=float)
revenue, cost = 0.0, 0.0
for sess in sessions:
for e in sess.events:
if e.action == "purchase" and 0 <= e.product_idx < n_products:
purchases[e.product_idx] += 1.0
revenue += float(e.price_seen)
cost += float(costs[e.product_idx])
return purchases, revenue, cost
class PricingEnv(gym.Env if HAS_GYM else object):
"""RL environment for dynamic pricing under agent contamination.
Platform sets prices p_t, market responds with mixture demand Q(p) = (1-alpha)*D_H + alpha*D_A.
Agent estimates contamination alpha_hat from behavioral signals.
Reward balances profit vs COI leakage.
"""
metadata = {"render_modes": ["human", "ansi"]}
def __init__(self, cfg: EnvConfig | None = None):
if not HAS_GYM:
raise ImportError("gymnasium required")
self.cfg = cfg or EnvConfig()
self.n = self.cfg.n_products
self._sys: System | None = None
self._t = 0
self._alpha = self.cfg.alpha_true
self._last_prices: np.ndarray | None = None
self._last_demand: Dict[str, float] | None = None
self._episode_rewards: list[float] = []
self._demand_agg = np.zeros(self.n)
self.action_space = spaces.Box(low=0.5, high=1.5, shape=(self.n,), dtype=np.float32)
obs_dim = self.n + self.n + 1 + 1 + self.n + 1 # prices + demand + alpha_hat + alpha + margins + t
self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float32)
def _build_obs(self) -> np.ndarray:
if self._sys is None:
return np.zeros(self.observation_space.shape[0], dtype=np.float32)
prices = self._last_prices if self._last_prices is not None else self._sys.refs
return np.concatenate([
prices / (self._sys.refs + 1e-6),
self._demand_agg / (np.sum(self._demand_agg) + 1e-6),
[self._sys.alpha, self._alpha],
(prices - self._sys.costs) / (self._sys.costs + 1e-6),
[self._t / self.cfg.max_steps],
]).astype(np.float32)
def _compute_reward(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
cfg, sys = self.cfg, self._sys
if sys is None:
return 0.0
# aggregate demand per product
agg = np.zeros(self.n)
for sid, q in demand.items():
sess = next((s for s in sys._sessions if s.sid == sid), None)
if sess and sess.events:
agg[sess.events[0].product_idx] += q
self._demand_agg = agg
_, revenue, cost = aggregate_purchases(sys._last_sessions, self.n, sys.costs)
profit = revenue - cost
vol_penalty = 0.0
if self._last_prices is not None:
vol_penalty = cfg.lambda_vol * float(np.mean(np.abs(prices - self._last_prices) / (sys.refs + 1e-6)))
coi = compute_coi_window(sys._last_sessions, sys.costs, demand_mapping=demand)
leak = float(coi.leak)
reward_fns = {
"revenue": lambda: revenue,
"profit": lambda: profit,
"robust": lambda: profit - cfg.lambda_coi * leak - vol_penalty,
"coi_aware": lambda: profit - cfg.lambda_coi * (1 + 2 * sys.alpha) * leak - vol_penalty,
}
r = reward_fns.get(cfg.reward_mode, lambda: profit)()
return float(r / (float(np.sum(sys.refs)) + 1e-6)) if cfg.normalize_reward else float(r)
def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
seed = seed if seed is not None else self.cfg.seed
self._sys = System(n_products=self.n, lambda_coi=self.cfg.lambda_coi, seed=seed)
self._t, self._alpha = 0, self.cfg.alpha_true
self._last_prices, self._last_demand = None, None
self._episode_rewards, self._demand_agg = [], np.zeros(self.n)
return self._build_obs(), {"alpha_true": self._alpha, "alpha_est": self._sys.alpha,
"costs": self._sys.costs.copy(), "refs": self._sys.refs.copy()}
def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]:
if self._sys is None:
raise RuntimeError("call reset() first")
action = np.clip(action, 0.5, 1.5)
prices = np.clip(self._sys.refs * action.astype(np.float64), self._sys.costs * 1.01, self._sys.refs * 2.0)
demand = self._sys.observe_demand(prices, alpha_true=self._alpha, n_sessions=self.cfg.sessions_per_step)
self._sys.limbo.add_update("prices", prices)
self._sys._alpha_est = self._sys._estimate_alpha_from_sessions()
reward = self._compute_reward(prices, demand)
self._episode_rewards.append(reward)
self._last_prices, self._last_demand = prices.copy(), demand
self._t += 1
# compute info metrics using shared helper
purchases, revenue, cost = aggregate_purchases(self._sys._last_sessions, self.n, self._sys.costs)
n_agents = int(self._alpha * self.cfg.sessions_per_step)
coi = compute_coi_window(self._sys._last_sessions, self._sys.costs, demand_mapping=demand)
info = {
"alpha_true": self._alpha, "alpha_est": self._sys.alpha,
"alpha_error": abs(self._alpha - self._sys.alpha),
"revenue": float(revenue), "profit": float(revenue - cost), "cost": float(cost),
"n_purchases": int(np.sum(purchases)),
"avg_margin": float(np.mean((prices - self._sys.costs) / self._sys.costs)),
"n_sessions": len(demand), "n_agents": n_agents, "price_std": float(np.std(prices)),
"coi_erosion": coi_erosion(coi.policy, coi.agent),
"coi_policy": float(coi.policy), "coi_agent": float(coi.agent),
"coi_leakage": float(coi.leak), "coi_survival": float(coi.survival_ratio),
"cumulative_reward": sum(self._episode_rewards), "step": self._t,
}
return self._build_obs(), reward, self._t >= self.cfg.max_steps, False, info
def render(self, mode: str = "human") -> str | None:
if self._sys is None or self._last_prices is None:
return None
out = f"t={self._t}/{self.cfg.max_steps} | alpha_true={self._alpha:.3f} alpha_hat={self._sys.alpha:.3f} | " \
f"prices: {self._last_prices.round(1)} | demand: {self._demand_agg.round(2)} | " \
f"reward: {self._episode_rewards[-1] if self._episode_rewards else 0:.3f}"
if mode == "human":
print(out)
return out
def close(self) -> None:
pass
class ContaminationSweepEnv(PricingEnv):
"""Environment that sweeps through contamination levels during training."""
def __init__(self, cfg: EnvConfig | None = None, alpha_schedule: list[float] | None = None):
super().__init__(cfg)
self._schedule = alpha_schedule or [0.1, 0.2, 0.3, 0.4, 0.5]
self._schedule_idx = 0
def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
if options and options.get("advance_schedule", False):
self._schedule_idx = (self._schedule_idx + 1) % len(self._schedule)
self.cfg.alpha_true = self._schedule[self._schedule_idx]
return super().reset(seed, options)
class AdversarialEnv(PricingEnv):
"""Environment with adversarial contamination dynamics.
Contamination increases when prices are predictable (agents exploit).
"""
def __init__(self, cfg: EnvConfig | None = None, exploitation_rate: float = 0.02):
super().__init__(cfg)
self._exploit_rate = exploitation_rate
self._price_history: list[np.ndarray] = []
def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]:
obs, reward, term, trunc, info = super().step(action)
if self._last_prices is not None:
self._price_history.append(self._last_prices.copy())
predictability = 0.0
if len(self._price_history) > 10:
predictability = 1.0 / (float(np.std(self._price_history[-10:])) + 0.1)
self._alpha = np.clip(self._alpha + self._exploit_rate * predictability * self._sys.rng.random(), *self.cfg.alpha_bounds)
info["predictability"] = predictability
return obs, reward, term, trunc, info
def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
self._price_history = []
return super().reset(seed, options)
def make_env(cfg: EnvConfig | None = None, env_type: str = "standard") -> PricingEnv:
return {"sweep": ContaminationSweepEnv, "adversarial": AdversarialEnv}.get(env_type, PricingEnv)(cfg)
# baseline policies
fixed_price_policy = lambda refs, margin=0.0: np.ones(len(refs), dtype=np.float32) * (1.0 + margin)
random_policy = lambda n, rng=None: (rng or np.random.default_rng()).uniform(0.7, 1.3, n).astype(np.float32)
adaptive_policy = lambda obs, n, base=0.1: np.ones(n, dtype=np.float32) * (1.0 + base * (1.0 - 0.4 * obs[2 * n]))
if __name__ == "__main__":
cfg = EnvConfig(n_products=100, max_steps=100, alpha_true=0.25, reward_mode="robust")
env = make_env(cfg)
obs, info = env.reset()
print(f"initial: alpha={info['alpha_true']:.2f}")
total_reward = 0.0
for t in range(cfg.max_steps):
action = adaptive_policy(obs, cfg.n_products)
obs, reward, done, _, info = env.step(action)
total_reward += reward
if t % 10 == 0:
env.render()
if done:
break
print(f"\ntotal reward: {total_reward:.2f}, final alpha_hat: {info['alpha_est']:.3f}")

View File

@@ -0,0 +1,168 @@
"""Summarize TensorBoard logs into comparison tables."""
from __future__ import annotations
import json
import re
from pathlib import Path
from collections import defaultdict
from dataclasses import dataclass
import pandas as pd
try:
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
HAS_TB = True
except ImportError:
HAS_TB = False
@dataclass
class RunInfo:
algo: str
alpha: float
reward_mode: str
path: Path
def parse_run_name(name: str) -> RunInfo | None:
"""Extract algo, alpha, reward_mode from run directory name."""
# patterns: ppo_a0.20_robust, cmp_fixed_a0.20, sac_a0.90_robust
m = re.match(r'(cmp_)?(\w+)_a([\d.]+)_?(\w+)?', name)
if not m:
return None
prefix, algo, alpha, mode = m.groups()
return RunInfo(algo=algo, alpha=float(alpha), reward_mode=mode or 'robust', path=Path())
def load_tb_scalars(log_dir: Path, tags: list[str], reduce: str = 'last') -> dict[str, float]:
"""Load scalar values from TensorBoard event files."""
if not HAS_TB:
return {}
ea = EventAccumulator(str(log_dir))
ea.Reload()
results = {}
for tag in tags:
if tag in ea.Tags().get('scalars', []):
events = ea.Scalars(tag)
if not events:
continue
vals = [e.value for e in events]
if reduce == 'last':
results[tag] = vals[-1]
elif reduce == 'mean':
results[tag] = sum(vals) / len(vals)
elif reduce == 'max':
results[tag] = max(vals)
elif reduce == 'min':
results[tag] = min(vals)
return results
def load_json_results(log_dir: Path) -> dict[str, float]:
"""Load metrics from results.json if available."""
results_file = log_dir / 'results.json'
if results_file.exists():
with open(results_file) as f:
return json.load(f)
return {}
def discover_runs(base_dir: Path) -> list[RunInfo]:
"""Find all experiment runs in base directory."""
runs = []
for d in base_dir.iterdir():
if not d.is_dir():
continue
info = parse_run_name(d.name)
if info:
info.path = d
runs.append(info)
return runs
def build_tables(runs: list[RunInfo], metrics: list[str], reduce: str = 'last') -> dict[str, dict[str, pd.DataFrame]]:
"""Build pivot tables: reward_mode -> metric -> DataFrame[alpha x algo]."""
# collect data: {reward_mode: {metric: {(alpha, algo): value}}}
data = defaultdict(lambda: defaultdict(dict))
tb_tags = [f'economics/{m}' if m in ['revenue', 'profit', 'margin'] else f'coi/{m}' if m in ['erosion', 'leakage'] else f'alpha/{m}' for m in metrics]
tag_map = dict(zip(tb_tags, metrics))
for run in runs:
# try json first (final eval metrics)
jm = load_json_results(run.path)
tb = load_tb_scalars(run.path, tb_tags, reduce)
for tag, metric in tag_map.items():
val = None
json_key = f'{metric}_mean' if metric != 'reward' else 'reward_mean'
if json_key in jm:
val = jm[json_key]
elif tag in tb:
val = tb[tag]
if val is not None:
data[run.reward_mode][metric][(run.alpha, run.algo)] = val
# convert to DataFrames
tables = {}
for mode, metrics_data in data.items():
tables[mode] = {}
for metric, vals in metrics_data.items():
if not vals:
continue
alphas = sorted(set(a for a, _ in vals.keys()))
algos = sorted(set(al for _, al in vals.keys()))
df = pd.DataFrame(index=alphas, columns=algos, dtype=float)
for (a, al), v in vals.items():
df.loc[a, al] = v
df.index.name = 'alpha'
tables[mode][metric] = df
return tables
def format_table(df: pd.DataFrame, fmt: str = '.3f') -> str:
"""Format DataFrame as markdown table."""
return df.to_markdown(floatfmt=fmt)
def summarize(base_dir: str = 'sim/case/thesis_simplified/runs',
metrics: list[str] | None = None,
reduce: str = 'last',
output: str | None = None) -> dict:
"""Generate summary tables from experiment runs."""
base = Path(base_dir)
metrics = metrics or ['revenue', 'profit', 'margin', 'erosion', 'leakage']
runs = discover_runs(base)
if not runs:
print(f"No runs found in {base}")
return {}
print(f"Found {len(runs)} runs")
tables = build_tables(runs, metrics, reduce)
lines = []
for mode, metric_tables in sorted(tables.items()):
lines.append(f"\n# Reward Mode: {mode}\n")
for metric, df in sorted(metric_tables.items()):
lines.append(f"\n## {metric}\n")
lines.append(format_table(df))
lines.append("")
report = '\n'.join(lines)
print(report)
if output:
Path(output).write_text(report)
print(f"\nSaved to {output}")
return tables
if __name__ == '__main__':
import argparse
p = argparse.ArgumentParser()
p.add_argument('--dir', default='sim/case/thesis_simplified/runs')
p.add_argument('--metrics', nargs='+', default=['revenue', 'profit', 'margin', 'erosion', 'leakage'])
p.add_argument('--reduce', default='last', choices=['last', 'mean', 'max', 'min'])
p.add_argument('--output', '-o', help='save markdown to file')
args = p.parse_args()
summarize(args.dir, args.metrics, args.reduce, args.output)

View File

@@ -0,0 +1,336 @@
"""RL training for thesis pricing system with thesis-aligned metrics.
Trains pricing policies using stable-baselines3 with TensorBoard logging.
Tracks COI erosion, alpha estimation error, and economic KPIs per thesis formulation.
"""
from __future__ import annotations
import argparse
import json
from concurrent.futures import ProcessPoolExecutor, as_completed
from dataclasses import dataclass, asdict, field
from pathlib import Path
from typing import Dict, List, Callable, Any
import numpy as np
try:
from stable_baselines3 import PPO, SAC, A2C
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
HAS_SB3 = True
except ImportError:
HAS_SB3 = False
try:
from torch.utils.tensorboard import SummaryWriter
HAS_TB = True
except ImportError:
HAS_TB = False
from .simplified_env import PricingEnv, EnvConfig, make_env, adaptive_policy, fixed_price_policy, random_policy
@dataclass
class EpisodeMetrics:
reward: float = 0.0
revenue: float = 0.0
profit: float = 0.0
coi_erosion: float = 0.0
coi_leakage: float = 0.0
alpha_error: float = 0.0
avg_margin: float = 0.0
n_agents: int = 0
steps: int = 0
def accumulate(self, info: Dict[str, Any]) -> None:
self.steps += 1
self.reward += info.get('reward', 0)
self.revenue += info.get('revenue', 0)
self.profit += info.get('profit', 0)
self.coi_erosion += info.get('coi_erosion', 0)
self.coi_leakage += info.get('coi_leakage', 0)
self.alpha_error += abs(info.get('alpha_true', 0) - info.get('alpha_est', 0))
self.avg_margin += info.get('avg_margin', 0)
self.n_agents += info.get('n_agents', 0)
def normalized(self) -> Dict[str, float]:
s = max(self.steps, 1)
return {k: getattr(self, k) / s for k in ['revenue', 'profit', 'coi_erosion', 'coi_leakage', 'alpha_error', 'avg_margin', 'n_agents']}
@dataclass
class ExperimentConfig:
algo: str = "ppo"
total_timesteps: int = 100_000
n_envs: int = 4
eval_freq: int = 5000
n_eval_episodes: int = 10
log_dir: str = "sim/case/thesis_simplified/runs"
seed: int = 42
n_products: int = 10
max_steps: int = 200
alpha_true: float = 0.2
reward_mode: str = "robust"
experiment_name: str | None = None
def __post_init__(self):
if self.experiment_name is None:
self.experiment_name = f"{self.algo}_a{self.alpha_true:.2f}_{self.reward_mode}"
class Policy:
"""Unified policy interface for baselines and trained models."""
def __init__(self, policy_fn: Callable[[np.ndarray, int], np.ndarray], name: str):
self._fn, self.name = policy_fn, name
def predict(self, obs: np.ndarray, deterministic: bool = True) -> tuple[np.ndarray, None]:
return self._fn(obs, (len(obs) - 3) // 3), None
@staticmethod
def fixed(margin: float = 0.15) -> "Policy":
return Policy(lambda obs, n: fixed_price_policy(np.ones(n), margin), f"fixed_{margin:.2f}")
@staticmethod
def adaptive(base_margin: float = 0.15) -> "Policy":
return Policy(lambda obs, n: adaptive_policy(obs, n, base_margin), f"adaptive_{base_margin:.2f}")
@staticmethod
def random() -> "Policy":
return Policy(lambda obs, n: random_policy(n), "random")
@staticmethod
def myopic(greed: float = 0.3) -> "Policy":
def _fn(obs: np.ndarray, n: int) -> np.ndarray:
demand_norm = obs[n:2*n] if len(obs) > 2*n else np.ones(n) * 0.5
return np.ones(n, dtype=np.float32) * np.clip(1.0 + greed * (1 + np.mean(demand_norm)), 0.5, 1.5)
return Policy(_fn, f"myopic_{greed:.1f}")
def log_metrics(writer: SummaryWriter | None, metrics: Dict[str, float], prefix: str, step: int) -> None:
if writer is None:
return
for k, v in metrics.items():
writer.add_scalar(f'{prefix}/{k}', v, step)
class MetricsCallback(BaseCallback):
def __init__(self, writer: SummaryWriter | None, verbose: int = 0):
super().__init__(verbose)
self._writer = writer
def _on_step(self) -> bool:
if self._writer is None:
return True
for info in self.locals.get('infos', []):
t = self.num_timesteps
self._writer.add_scalar('economics/revenue', info.get('revenue', 0), t)
self._writer.add_scalar('economics/profit', info.get('profit', 0), t)
self._writer.add_scalar('economics/margin', info.get('avg_margin', 0), t)
self._writer.add_scalar('coi/erosion', info.get('coi_erosion', 0), t)
self._writer.add_scalar('coi/leakage', info.get('coi_leakage', 0), t)
self._writer.add_scalar('alpha/estimation_error', abs(info.get('alpha_true', 0) - info.get('alpha_est', 0)), t)
self._writer.add_scalar('agents/count', info.get('n_agents', 0), t)
return True
def make_vec_env(cfg: ExperimentConfig, n_envs: int = 1) -> DummyVecEnv:
def _make():
return Monitor(make_env(EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed)))
return DummyVecEnv([_make for _ in range(n_envs)])
def run_episodes(policy: Policy | Any, env: PricingEnv, n_episodes: int) -> List[EpisodeMetrics]:
"""Run policy for n episodes and collect metrics."""
metrics = []
for _ in range(n_episodes):
obs, _ = env.reset()
ep, done = EpisodeMetrics(), False
while not done:
action, _ = policy.predict(obs, deterministic=True)
obs, reward, term, trunc, info = env.step(action)
done = term or trunc
ep.accumulate(info)
ep.reward += reward
metrics.append(ep)
return metrics
def evaluate_policy(policy: Policy | Any, cfg: ExperimentConfig, n_episodes: int = 20) -> Dict[str, float]:
env = make_env(EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed + 999))
metrics = run_episodes(policy, env, n_episodes)
return {
'reward_mean': np.mean([m.reward for m in metrics]), 'reward_std': np.std([m.reward for m in metrics]),
**{f'{k}_mean': np.mean([m.normalized()[k] for m in metrics])
for k in ['revenue', 'profit', 'coi_erosion', 'coi_leakage', 'alpha_error', 'avg_margin']},
}
def run_baseline(policy: Policy, vec_env: DummyVecEnv, total_steps: int, writer: SummaryWriter | None):
obs, n_envs = vec_env.reset(), vec_env.num_envs
ep_rewards = np.zeros(n_envs)
for step in range(0, total_steps, n_envs):
actions = np.array([policy.predict(obs[i])[0] for i in range(n_envs)])
obs, rewards, dones, infos = vec_env.step(actions)
ep_rewards += rewards
for i, info in enumerate(infos):
if writer:
writer.add_scalar('economics/revenue', info.get('revenue', 0), step)
writer.add_scalar('economics/profit', info.get('profit', 0), step)
writer.add_scalar('economics/margin', info.get('avg_margin', 0), step)
writer.add_scalar('coi/erosion', info.get('coi_erosion', 0), step)
writer.add_scalar('coi/leakage', info.get('coi_leakage', 0), step)
writer.add_scalar('alpha/estimation_error', abs(info.get('alpha_true', 0) - info.get('alpha_est', 0)), step)
writer.add_scalar('agents/count', info.get('n_agents', 0), step)
if dones[i]:
if writer:
writer.add_scalar('rollout/ep_reward', ep_rewards[i], step)
ep_rewards[i] = 0
def train(cfg: ExperimentConfig) -> Dict[str, Any]:
is_baseline = cfg.algo.lower() in ["fixed", "adaptive", "random", "myopic"]
if not HAS_SB3 and not is_baseline:
raise ImportError("stable-baselines3 required: pip install stable-baselines3[extra]")
log_path = Path(cfg.log_dir) / cfg.experiment_name
log_path.mkdir(parents=True, exist_ok=True)
with open(log_path / "config.json", "w") as f:
json.dump(asdict(cfg), f, indent=2)
writer = SummaryWriter(log_path) if HAS_TB else None
train_env, eval_env = make_vec_env(cfg, cfg.n_envs), make_vec_env(cfg, 1)
if is_baseline:
policy = {"fixed": Policy.fixed, "adaptive": Policy.adaptive, "random": Policy.random, "myopic": Policy.myopic}[cfg.algo.lower()]()
run_baseline(policy, train_env, cfg.total_timesteps, writer)
final_metrics = evaluate_policy(policy, cfg)
else:
algo_cls = {"ppo": PPO, "sac": SAC, "a2c": A2C}[cfg.algo.lower()]
common = dict(verbose=1, seed=cfg.seed, tensorboard_log=str(log_path), device="auto")
model = {
"ppo": lambda: PPO("MlpPolicy", train_env, learning_rate=3e-4, n_steps=2048, batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95, clip_range=0.2, ent_coef=0.01, **common),
"sac": lambda: SAC("MlpPolicy", train_env, learning_rate=1e-4, buffer_size=50_000, batch_size=512, tau=0.02, gamma=0.99, learning_starts=1000, ent_coef="auto_0.1", train_freq=4, **common),
"a2c": lambda: A2C("MlpPolicy", train_env, learning_rate=7e-4, n_steps=5, gamma=0.99, **common),
}[cfg.algo.lower()]()
cb = MetricsCallback(writer)
eval_cb = EvalCallback(eval_env, best_model_save_path=str(log_path / "best"), log_path=str(log_path),
eval_freq=cfg.eval_freq, n_eval_episodes=cfg.n_eval_episodes, deterministic=True)
model.learn(cfg.total_timesteps, callback=[cb, eval_cb], progress_bar=True)
model.save(log_path / "final_model")
policy = model
final_metrics = evaluate_policy(model, cfg)
if writer:
log_metrics(writer, final_metrics, 'final', cfg.total_timesteps)
writer.close()
train_env.close(); eval_env.close()
with open(log_path / "results.json", "w") as f:
json.dump(final_metrics, f, indent=2)
return {"path": str(log_path), "metrics": final_metrics}
def _train_alpha(args: tuple) -> tuple[str, Dict]:
"""Worker for parallel sweep - must be top-level for pickling."""
cfg_dict, alpha = args
cfg_dict["alpha_true"] = alpha
cfg_dict["experiment_name"] = f"{cfg_dict['algo']}_a{alpha:.2f}_{cfg_dict['reward_mode']}"
sweep_cfg = ExperimentConfig(**cfg_dict)
print(f"[alpha={alpha:.2f}] starting")
metrics = train(sweep_cfg)["metrics"]
print(f"[alpha={alpha:.2f}] done")
return f"alpha_{alpha:.2f}", metrics
def run_sweep(cfg: ExperimentConfig, alphas: List[float] | None = None, max_workers: int | None = None) -> Dict[str, Dict]:
alphas = alphas or [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
cfg_dict = asdict(cfg)
if max_workers == 1: # sequential fallback
results = dict(_train_alpha((cfg_dict.copy(), a)) for a in alphas)
else:
with ProcessPoolExecutor(max_workers=max_workers) as pool:
futures = {pool.submit(_train_alpha, (cfg_dict.copy(), a)): a for a in alphas}
results = {}
for fut in as_completed(futures):
key, metrics = fut.result()
results[key] = metrics
summary_path = Path(cfg.log_dir) / f"sweep_{cfg.algo}_{cfg.reward_mode}.json"
with open(summary_path, "w") as f:
json.dump(results, f, indent=2)
print(f"\nSweep results saved to {summary_path}")
return results
def _train_policy(args: tuple) -> tuple[str, Dict]:
"""Worker for parallel policy comparison."""
cfg_dict, algo = args
cfg_dict["algo"] = algo
cfg_dict["experiment_name"] = f"cmp_{algo}_a{cfg_dict['alpha_true']:.2f}"
cmp_cfg = ExperimentConfig(**cfg_dict)
print(f"[{algo}] starting")
metrics = train(cmp_cfg)["metrics"]
print(f"[{algo}] done")
return algo, metrics
def compare_policies(cfg: ExperimentConfig, policies: List[str] | None = None, max_workers: int | None = None) -> Dict[str, Dict]:
policies = policies or ["fixed", "adaptive", "myopic", "random"]
cfg_dict = asdict(cfg)
if max_workers == 1:
results = dict(_train_policy((cfg_dict.copy(), p)) for p in policies)
else:
with ProcessPoolExecutor(max_workers=max_workers) as pool:
futures = {pool.submit(_train_policy, (cfg_dict.copy(), p)): p for p in policies}
results = {}
for fut in as_completed(futures):
algo, metrics = fut.result()
results[algo] = metrics
cmp_path = Path(cfg.log_dir) / f"compare_a{cfg.alpha_true:.2f}.json"
with open(cmp_path, "w") as f:
json.dump(results, f, indent=2)
print(f"\nComparison saved to {cmp_path}")
for algo, m in results.items():
print(f" {algo:12s}: reward={m['reward_mean']:.2f} coi_erosion={m['coi_erosion_mean']:.4f} alpha_err={m['alpha_error_mean']:.4f}")
return results
def main():
parser = argparse.ArgumentParser(description="Train RL pricing policies")
parser.add_argument("--algo", default="ppo", choices=["ppo", "sac", "a2c", "fixed", "adaptive", "random", "myopic"])
parser.add_argument("--steps", type=int, default=100_000)
parser.add_argument("--alpha", type=float, default=0.2)
parser.add_argument("--reward-mode", default="robust", choices=["revenue", "profit", "robust", "coi_aware"])
parser.add_argument("--n-products", type=int, default=10)
parser.add_argument("--n-envs", type=int, default=4)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--log-dir", default="sim/case/thesis_simplified/runs")
parser.add_argument("--sweep", action="store_true", help="run contamination sweep")
parser.add_argument("--compare", action="store_true", help="compare all baselines")
parser.add_argument("--workers", type=int, default=None, help="max parallel workers for sweep (None=auto, 1=sequential)")
args = parser.parse_args()
cfg = ExperimentConfig(algo=args.algo, total_timesteps=args.steps, alpha_true=args.alpha,
reward_mode=args.reward_mode, n_products=args.n_products,
n_envs=args.n_envs, seed=args.seed, log_dir=args.log_dir)
if args.sweep:
run_sweep(cfg, max_workers=args.workers)
elif args.compare:
compare_policies(cfg, max_workers=args.workers)
else:
result = train(cfg)
print(f"\nTraining complete: {result['path']}")
print(f"Metrics: {json.dumps(result['metrics'], indent=2)}")
if __name__ == "__main__":
main()

View File

@@ -19,6 +19,7 @@ except ImportError:
lib_make_state_repr = None lib_make_state_repr = None
lib_transition_histogram = None lib_transition_histogram = None
class BehaviorModel: class BehaviorModel:
def __init__(self, src_dir: str, loader_cls=Loader): def __init__(self, src_dir: str, loader_cls=Loader):
self.loader = loader_cls(src_dir) self.loader = loader_cls(src_dir)
@@ -206,6 +207,7 @@ def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "
def kl_divergence(p: Dict[str, float], q: Dict[str, float]) -> float: def kl_divergence(p: Dict[str, float], q: Dict[str, float]) -> float:
eps = 1e-10 eps = 1e-10
# p + log(p / q) summed over all keys in P
return sum((p[k] + eps) * np.log((p[k] + eps) / (q.get(k, 0.0) + eps)) for k in p) return sum((p[k] + eps) * np.log((p[k] + eps) / (q.get(k, 0.0) + eps)) for k in p)
if __name__ == "__main__": if __name__ == "__main__":
@@ -222,6 +224,7 @@ if __name__ == "__main__":
agent_model = AgentBehaviorModel(agent_dir) agent_model = AgentBehaviorModel(agent_dir)
agent_mdp = agent_model.build_MDP() agent_mdp = agent_model.build_MDP()
print(f"AGENT... Built MDP: {agent_mdp['num_states']} states, " print(f"AGENT... Built MDP: {agent_mdp['num_states']} states, "
f"{sum(len(t) for t in agent_mdp['transitions'].values())} transitions") f"{sum(len(t) for t in agent_mdp['transitions'].values())} transitions")
if not agent_mdp['states']: if not agent_mdp['states']:
@@ -230,6 +233,7 @@ if __name__ == "__main__":
human_evt = aggregate_event_transitions(human_mdp) human_evt = aggregate_event_transitions(human_mdp)
agent_evt = aggregate_event_transitions(agent_mdp) agent_evt = aggregate_event_transitions(agent_mdp)
common = set(human_evt.keys()) & set(agent_evt.keys()) common = set(human_evt.keys()) & set(agent_evt.keys())
if not common: if not common:

View File

@@ -3,8 +3,7 @@ import numpy as np
import pandas as pd import pandas as pd
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Dict, Any from typing import Dict, Any
from environment import BusinessLogicConstraints from sim.rl.environment import BusinessLogicConstraints
""" """
An angine by default should have its own demand estimation mechanism from the observed observations whihc are the computer feature. An angine by default should have its own demand estimation mechanism from the observed observations whihc are the computer feature.
@@ -32,9 +31,12 @@ class BasePricingEngine(ABC):
""" """
pass pass
@abstractmethod def update(self, observation: Dict[str, Any], reward: float, done: bool, info: Dict[str, Any]) -> None:
def update(obs, reward, done, info): """Default no-op update. Engines can override as needed."""
pass self.last_observation = observation
self.last_reward = reward
self.last_info = info
@@ -48,14 +50,14 @@ class WildPricingEngine(BasePricingEngine):
def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0): def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
super().__init__(constraints, seed) super().__init__(constraints, seed)
# per-product unit costs (unknown to customers; known to platform) # per-product unit costs (unknown to customers; known to platform)
self.unit_cost = self.rng.uniform(8.0, 40.0, size=self.c.product_catelogue_size).astype(np.float32) self.unit_cost = self.rng.uniform(8.0, 40.0, size=self.c.product_catalogue_size).astype(np.float32)
# online elasticity estimate (start moderately elastic) # online elasticity estimate (start moderately elastic)
self.e_hat = np.full((self.c.product_catelogue_size,), -1.3, dtype=np.float32) self.e_hat = np.full((self.c.product_catalogue_size,), -1.3, dtype=np.float32)
# EWMA state for log-log regression # EWMA state for log-log regression
self.mu_logp = np.zeros(self.c.product_catelogue_size, dtype=np.float32) self.mu_logp = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
self.mu_logq = np.zeros(self.c.product_catelogue_size, dtype=np.float32) self.mu_logq = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
self.cov_pq = np.zeros(self.c.product_catelogue_size, dtype=np.float32) self.cov_pq = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
self.var_p = np.ones(self.c.product_catelogue_size, dtype=np.float32) self.var_p = np.ones(self.c.product_catalogue_size, dtype=np.float32)
# knobs typical in production # knobs typical in production
self.lr = 0.08 self.lr = 0.08
self.ewma = 0.05 self.ewma = 0.05
@@ -140,7 +142,7 @@ class SimpleDemandEngine(BasePricingEngine):
def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray: def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
self.step_count += 1 self.step_count += 1
demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32)) demand = _extract_demand(observation, self.c.product_catalogue_size)
if self.prev_demand is None: if self.prev_demand is None:
self.prev_demand = demand.copy() self.prev_demand = demand.copy()
return current_prices.copy() return current_prices.copy()
@@ -187,15 +189,15 @@ class ThompsonSamplingEngine(BasePricingEngine):
def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0): def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
super().__init__(constraints, seed) super().__init__(constraints, seed)
self.n_price_levels = 5 self.n_price_levels = 5
self.alpha = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32) self.alpha = np.ones((self.c.product_catalogue_size, self.n_price_levels), dtype=np.float32)
self.beta = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32) self.beta = np.ones((self.c.product_catalogue_size, self.n_price_levels), dtype=np.float32)
self.price_grid = None self.price_grid = None
self.last_actions = None self.last_actions = None
def reset(self): def reset(self):
super().reset() super().reset()
self.alpha = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32) self.alpha = np.ones((self.c.product_catalogue_size, self.n_price_levels), dtype=np.float32)
self.beta = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32) self.beta = np.ones((self.c.product_catalogue_size, self.n_price_levels), dtype=np.float32)
self.price_grid = None self.price_grid = None
self.last_actions = None self.last_actions = None
@@ -206,10 +208,10 @@ class ThompsonSamplingEngine(BasePricingEngine):
lo = current_prices * 0.7 lo = current_prices * 0.7
hi = current_prices * 1.3 hi = current_prices * 1.3
self.price_grid = np.linspace(lo, hi, self.n_price_levels).T self.price_grid = np.linspace(lo, hi, self.n_price_levels).T
demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32)) demand = _extract_demand(observation, self.c.product_catalogue_size)
# update beliefs based on last action # update beliefs based on last action
if self.last_actions is not None: if self.last_actions is not None:
for i in range(self.c.product_catelogue_size): for i in range(self.c.product_catalogue_size):
a = self.last_actions[i] a = self.last_actions[i]
reward = demand[i] reward = demand[i]
if reward > 0.5: if reward > 0.5:
@@ -217,11 +219,22 @@ class ThompsonSamplingEngine(BasePricingEngine):
else: else:
self.beta[i, a] += 1.0 self.beta[i, a] += 1.0
# thompson sampling: sample from posterior, pick best # thompson sampling: sample from posterior, pick best
new_prices = np.zeros(self.c.product_catelogue_size, dtype=np.float32) new_prices = np.zeros(self.c.product_catalogue_size, dtype=np.float32)
actions = np.zeros(self.c.product_catelogue_size, dtype=int) actions = np.zeros(self.c.product_catalogue_size, dtype=int)
for i in range(self.c.product_catelogue_size): for i in range(self.c.product_catalogue_size):
theta = self.rng.beta(self.alpha[i], self.beta[i]).astype(np.float32) theta = self.rng.beta(self.alpha[i], self.beta[i]).astype(np.float32)
actions[i] = int(np.argmax(theta)) actions[i] = int(np.argmax(theta))
new_prices[i] = self.price_grid[i, actions[i]] new_prices[i] = self.price_grid[i, actions[i]]
self.last_actions = actions self.last_actions = actions
return np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32) return np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)
def _extract_demand(observation: Dict[str, Any], n: int) -> np.ndarray:
if "elasticity" in observation and isinstance(observation["elasticity"], dict):
d = observation["elasticity"].get("demand")
if d is not None:
return np.asarray(d, dtype=np.float32)
d = observation.get("demand")
if d is not None:
return np.asarray(d, dtype=np.float32)
return np.zeros(n, dtype=np.float32)

View File

@@ -1,319 +1,244 @@
import gymnasium as gym from __future__ import annotations
from gymnasium import spaces
import numpy as np
from dataclasses import dataclass from dataclasses import dataclass
import pandas as pd from typing import Any, Dict, Optional, Tuple
from typing import Callable, Optional, Dict, Any, List
# "learner" agent learning to optimize pricing import numpy as np
# "agent" part of environment creating demand signals that learner processes
try:
import gymnasium as gym
from gymnasium import spaces
except ImportError as e:
raise ImportError("sim.rl.environment requires gymnasium") from e
from sim.case.thesis_simplified.coi import COIWindow, coi_erosion, compute_coi_window
from sim.case.thesis_simplified.separability import estimate_alpha as estimate_session_alpha
from sim.case.thesis_simplified.simplified import Limbo, Session, put_prices_to_market
from sim.rl.thesis_core import aggregate_demand_by_product, aggregate_purchases, constrain_prices
@dataclass(frozen=True)
class BusinessLogicConstraints:
product_catalogue_size: int = 100
max_steps: int = 2000
sessions_per_step: int = 250
@dataclass
class BusinessLogicConstraints():
max_price_adjustment: float = 0.30
system_max_price: float = 500.0 system_max_price: float = 500.0
system_min_price: float = 1.0 system_min_price: float = 1.0
product_catalogue_size: int = 100 max_price_adjustment: float = 0.30
episode_length: int = 200 min_margin_pct: float = 0.05
sessions_per_step: int = 250
agent_share: float = 0.25 agent_share: float = 0.2
agent_recon_multiplier: float = 6.0 alpha_drift: float = 0.0
agent_purchase_probability: float = 0.20 alpha_bounds: tuple[float, float] = (0.0, 0.8)
coi_strength: float = 0.25 coi_strength: float = 0.25
coi_threshold: float = 4.0
coi_sigmoid_temp: float = 1.25
base_human_demand: float = 0.08
base_agent_demand: float = 0.05
human_price_elasticity: float = -1.2 # assumptions here
agent_price_elasticity: float = -0.6
w_agent_loss: float = 1.0
w_volatility: float = 5.0 w_volatility: float = 5.0
w_estimation_error: float = 0.25 w_estimation_error: float = 0.25
seed: int = 7 seed: int = 7
def _sigmoid(x: np.ndarray) -> np.ndarray: def make_env(constraints: Optional[BusinessLogicConstraints] = None) -> "PHANTOMEnv":
return 1.0 / (1.0 + np.exp(-x)) return PHANTOMEnv(constraints=constraints or BusinessLogicConstraints())
class BehavioralProfile:
"""simple markov chain model for generating synthetic interaction events"""
def __init__(self, actor: str, purchase_probs: np.ndarray):
self.actor = actor
self.purchase_probs = purchase_probs
self.states = ['view', 'cart', 'checkout']
# transition matrix: view->cart 0.3, view->view 0.6, view->exit 0.1, cart->checkout 0.5, cart->view 0.4, cart->exit 0.1
self.trans = {'view': {'view': 0.6, 'cart': 0.3, 'exit': 0.1}, 'cart': {'checkout': 0.5, 'view': 0.4, 'exit': 0.1}, 'checkout': {'exit': 1.0}}
if actor == 'agents': # agents browse more before purchasing
self.trans['view'] = {'view': 0.75, 'cart': 0.15, 'exit': 0.1}
self.trans['cart'] = {'checkout': 0.3, 'view': 0.6, 'exit': 0.1}
def sample(self, rng: np.random.Generator) -> Dict[str, Any]:
"""sample single interaction event"""
product_idx = rng.integers(0, len(self.purchase_probs))
state = 'view' # always start with view
# pick next state based on transition probs
trans = self.trans.get(state, {'exit': 1.0})
next_state = rng.choice(list(trans.keys()), p=list(trans.values()))
price_paid = 0.0 if next_state != 'checkout' else float(rng.uniform(50, 200))
return {'action': state, 'product_idx': product_idx, 'actor': 'agent' if self.actor == 'agents' else 'human', 't': 0.0, 'price_paid': price_paid}
def _load_behavioral_profile(actor: str, demand_forcing: np.ndarray) -> BehavioralProfile:
"""returns a behavioral profile for generating synthetic sessions
actor: 'humans' or 'agents'
demand_forcing: per-product purchase probabilities used to weight interactions
"""
return BehavioralProfile(actor, demand_forcing)
class CommercePlatform:
"""state management for the environment, simulates demand"""
def __init__(self, product_catalogue_size: int, max_price: float, min_price: float, constraints: BusinessLogicConstraints):
self.product_catalogue_size = product_catalogue_size
self.product_supply = np.random.uniform(low=10, high=50, size=(self.product_catalogue_size,))
self.max_price = max_price
self.min_price = min_price
self.constraints = constraints
self.simulation_history: List[Dict[str, Any]] = []
self._rng = np.random.default_rng(constraints.seed)
self._last_interaction_df: pd.DataFrame = pd.DataFrame()
def setup_true_demand(self, prices: np.ndarray) -> Dict[str, np.ndarray]:
p = np.clip(prices, self.min_price, self.max_price)
pn = p / self.max_price
human_prob = self.constraints.base_human_demand * (pn ** self.constraints.human_price_elasticity)
agent_prob = self.constraints.base_agent_demand * (pn ** self.constraints.agent_price_elasticity)
return {"human_purchase_prob": np.clip(human_prob, 0.0, 0.95), "agent_purchase_prob": np.clip(agent_prob, 0.0, 0.95)}
def _simulate_sessions(self, base_prices: np.ndarray) -> pd.DataFrame:
demand = self.setup_true_demand(base_prices)
human_pprob = demand["human_purchase_prob"]
agent_pprob = demand["agent_purchase_prob"]
events: List[Dict[str, Any]] = []
T = self.constraints.sessions_per_step
n_agent_sessions = int(round(T * self.constraints.agent_share))
n_human_sessions = T - n_agent_sessions
n_agent_ids = max(1, n_agent_sessions // 2)
session_map = {
'humans': n_human_sessions,
'agents': n_agent_ids
}
pprob_map = {
'humans': human_pprob,
'agents': agent_pprob
}
joint_events = []
for actor, n_sessions in session_map.items():
bp = _load_behavioral_profile(actor, pprob_map[actor])
counter = 0
events = []
while counter < n_sessions:
session_events = []
while len(session_events) == 0 or session_events[-1]['action'] == 'checkout':
interaction_event = bp.sample(self._rng)
interaction_event['session_id'] = f'{actor}_{counter:06d}'
# TODO any other assignments
session_events.append(interaction_event)
events.extend(session_events)
counter += 1
joint_events.extend(events)
return pd.DataFrame(joint_events)
def compute_interaction_features(self, interaction_df: pd.DataFrame) -> Dict[str, float]:
if interaction_df.empty:
return {"mean_sale_price": 0.0, "look_to_book": 0.0}
purchases = interaction_df[interaction_df["action"] == "purchase"]
mean_sale_price = float(purchases["price_paid"].mean()) if not purchases.empty else 0.0
views = float((interaction_df["action"] == "view").sum())
buys = float((interaction_df["action"] == "purchase").sum())
return {"mean_sale_price": mean_sale_price, "look_to_book": float(views / (buys + 1e-6))}
def _session_feature_table(self, df: pd.DataFrame) -> pd.DataFrame:
# TODO: adapt this
if df.empty:
return pd.DataFrame()
g = df.groupby("session_id", sort=False)
session_duration = g["t"].max() - g["t"].min()
total_interactions = g.size()
avg_time_between = g["t"].apply(lambda x: float(np.diff(np.sort(x.to_numpy())).mean()) if len(x) > 1 else 0.0)
interaction_velocity = total_interactions / (session_duration + 1e-6)
views = g.apply(lambda x: int((x["action"] == "view").sum()), include_groups=False)
cart_adds = g.apply(lambda x: int((x["action"] == "cart").sum()), include_groups=False)
purchases = g.apply(lambda x: int((x["action"] == "purchase").sum()), include_groups=False)
conversion_rate = purchases / (views + 1e-6)
is_agent = g["actor"].apply(lambda s: bool((s == "agent").any()), include_groups=False)
return pd.DataFrame({
"session_duration_sec": session_duration.astype(float),
"avg_time_between_events": avg_time_between.astype(float),
"total_interactions": total_interactions.astype(int),
"interaction_velocity": interaction_velocity.astype(float),
"item_views": views.astype(int),
"cart_adds": cart_adds.astype(int),
"purchases": purchases.astype(int),
"conversion_rate": conversion_rate.astype(float),
"is_agent": is_agent.astype(bool),
}).reset_index()
def get_interaction_data(self) -> np.ndarray:
if self._last_interaction_df.empty:
return np.array([], dtype=object)
return self._last_interaction_df.to_dict(orient="records")
class PHANTOMEnv(gym.Env): class PHANTOMEnv(gym.Env):
metadata = {"render_modes": []} metadata = {"render_modes": ["human", "ansi"]}
def __init__(self, constraints): def __init__(self, constraints: Optional[BusinessLogicConstraints] = None):
super().__init__() super().__init__()
self.constraints = BusinessLogicConstraints() self.c = constraints or BusinessLogicConstraints()
self.action_space = spaces.Box(low=-self.constraints.max_price_adjustment, self.n = int(self.c.product_catalogue_size)
high=self.constraints.max_price_adjustment,
shape=(self.constraints.product_catalogue_size,), dtype=np.float32) self._rng = np.random.default_rng(self.c.seed)
self.observation_space = spaces.Dict({ self._t = 0
"elasticity": spaces.Dict({ self._alpha_true = float(self.c.agent_share)
"price": spaces.Box( self._alpha_hat = float(self.c.agent_share)
low=np.full((self.constraints.product_catalogue_size,), self.constraints.system_min_price, dtype=np.float32), self._costs = np.zeros(self.n, dtype=np.float32)
high=np.full((self.constraints.product_catalogue_size,), self.constraints.system_max_price, dtype=np.float32), self._refs = np.zeros(self.n, dtype=np.float32)
dtype=np.float32), self._prices: Optional[np.ndarray] = None
"demand": spaces.Box( self._last_sessions: list[Session] = []
low=np.zeros((self.constraints.product_catalogue_size,), dtype=np.float32), self._last_coi: COIWindow | None = None
high=np.full((self.constraints.product_catalogue_size,), 1e6, dtype=np.float32), self._limbo = Limbo()
dtype=np.float32),
}) self.action_space = spaces.Box(
# TODO: define more features that we compute from the interaction data low=np.full((self.n,), self.c.system_min_price, dtype=np.float32),
}) high=np.full((self.n,), self.c.system_max_price, dtype=np.float32),
self.commerce_platform = CommercePlatform( dtype=np.float32,
product_catalogue_size=self.constraints.product_catalogue_size, )
max_price=self.constraints.system_max_price, self.observation_space = spaces.Dict(
min_price=self.constraints.system_min_price, {
constraints=self.constraints) "elasticity": spaces.Dict(
self._rng = np.random.default_rng(self.constraints.seed) {
self.t = 0 "price": spaces.Box(
self._prev_prices: Optional[np.ndarray] = None low=np.full((self.n,), self.c.system_min_price, dtype=np.float32),
self.state: Dict[str, Any] = {} high=np.full((self.n,), self.c.system_max_price, dtype=np.float32),
dtype=np.float32,
),
"demand": spaces.Box(
low=np.zeros((self.n,), dtype=np.float32),
high=np.full((self.n,), 1e9, dtype=np.float32),
dtype=np.float32,
),
}
),
"market": spaces.Dict(
{
"alpha_hat": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
"revenue_rate": spaces.Box(low=0.0, high=1e12, shape=(1,), dtype=np.float32),
"conversion_rate": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
"price_volatility": spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32),
}
),
"cost": spaces.Box(
low=np.zeros((self.n,), dtype=np.float32),
high=np.full((self.n,), self.c.system_max_price, dtype=np.float32),
dtype=np.float32,
),
}
)
def _reset_catalogue(self) -> None:
self._costs = self._rng.uniform(15.0, 60.0, size=self.n).astype(np.float32)
margins = self._rng.uniform(0.2, 0.6, size=self.n).astype(np.float32)
self._refs = (self._costs * (1.0 + margins)).astype(np.float32)
self._prices = self._refs.copy()
def _observe_market(
self, prices: np.ndarray
) -> tuple[list[Session], Dict[str, float], np.ndarray, np.ndarray, float, float, int]:
sessions, demand_map = put_prices_to_market(
prices,
costs=self._costs,
alpha=self._alpha_true,
n_sessions=int(self.c.sessions_per_step),
seed=int(self._rng.integers(0, 2**31 - 1)),
)
demand_by_product = aggregate_demand_by_product(sessions, demand_map, self.n)
purchases, revenue, cost, n_agents = aggregate_purchases(sessions, self._costs, self.n)
conversion = float(np.sum(purchases) / max(len(sessions), 1))
return sessions, demand_map, demand_by_product, purchases, revenue, cost, n_agents
def _update_alpha_hat(self, sessions: list[Session]) -> float:
scores = [estimate_session_alpha(s) for s in sessions if s.events]
if not scores:
return self._alpha_hat
alpha_step = float(np.mean(scores))
self._alpha_hat = 0.8 * self._alpha_hat + 0.2 * alpha_step
self._alpha_hat = float(np.clip(self._alpha_hat, 0.0, 1.0))
return self._alpha_hat
def _reward(self, prices: np.ndarray, revenue: float, cost: float, volatility: float) -> float:
profit = float(revenue - cost)
coi_leak = float(self._last_coi.leak) if self._last_coi else 0.0
alpha_err = abs(self._alpha_hat - self._alpha_true)
return profit - self.c.coi_strength * coi_leak - self.c.w_volatility * volatility - self.c.w_estimation_error * alpha_err
def _build_obs(
self,
prices: np.ndarray,
demand_by_product: np.ndarray,
revenue: float,
conversion: float,
volatility: float,
) -> Dict[str, Any]:
return {
"elasticity": {"price": prices.astype(np.float32), "demand": demand_by_product.astype(np.float32)},
"market": {
"alpha_hat": np.array([self._alpha_hat], dtype=np.float32),
"revenue_rate": np.array([revenue], dtype=np.float32),
"conversion_rate": np.array([conversion], dtype=np.float32),
"price_volatility": np.array([volatility], dtype=np.float32),
},
"cost": self._costs.astype(np.float32),
}
def reset(self, seed: Optional[int] = None, options: Optional[dict] = None): def reset(self, seed: Optional[int] = None, options: Optional[dict] = None):
super().reset(seed=seed) super().reset(seed=seed)
if seed is not None: if seed is not None:
self._rng = np.random.default_rng(seed) self._rng = np.random.default_rng(seed)
self.commerce_platform._rng = np.random.default_rng(seed) self._t = 0
self.t = 0 self._alpha_true = float(np.clip(self.c.agent_share, *self.c.alpha_bounds))
init_prices = self._rng.uniform(low=60.0, high=140.0, size=(self.constraints.product_catalogue_size,)).astype(np.float32) self._alpha_hat = float(self.c.agent_share)
self._prev_prices = init_prices.copy() self._reset_catalogue()
self.state = { self._limbo = Limbo()
"elasticity": { self._last_sessions = []
"price": init_prices, self._last_coi = None
"demand": np.zeros((self.constraints.product_catalogue_size,), dtype=np.float32),
}
}
return self.state, {}
def step(self, action: np.ndarray): prices = self._prices if self._prices is not None else np.zeros(self.n, dtype=np.float32)
self.t += 1 obs = self._build_obs(prices, np.zeros(self.n, dtype=np.float32), 0.0, 0.0, 0.0)
base_prices = self.state["elasticity"]["price"].astype(np.float32) return obs, {"alpha_true": self._alpha_true}
new_prices = np.clip(base_prices * (1.0 + action.astype(np.float32)),
self.constraints.system_min_price,
self.constraints.system_max_price).astype(np.float32)
self.state["elasticity"]["price"] = new_prices def step(self, action: np.ndarray) -> Tuple[Dict[str, Any], float, bool, bool, Dict[str, Any]]:
interactions_df = self.commerce_platform._simulate_sessions(new_prices) if self._prices is None:
result = self.commerce_platform.compute_interaction_features(interactions_df) raise RuntimeError("reset() must be called before step()")
COI = 0.0 # TODO: implement cost-of-information computation
volatility = 0.0 if self._prev_prices is None else \ prev = self._prices
float(np.mean(np.abs((new_prices - self._prev_prices) / (self._prev_prices + 1e-6)))) prices = constrain_prices(
self._prev_prices = new_prices.copy() prev,
np.asarray(action, dtype=np.float32),
costs=self._costs,
min_price=float(self.c.system_min_price),
max_price=float(self.c.system_max_price),
max_adjustment=float(self.c.max_price_adjustment),
min_margin_pct=float(self.c.min_margin_pct),
)
self._prices = prices
self._limbo.add_update("prices", prices)
# extract metrics with safe defaults for incomplete simulation sessions, demand_map, demand_by_product, purchases, revenue, cost, n_agents = self._observe_market(prices)
revenue_observed = float(result.get("revenue_observed", result.get("mean_sale_price", 0.0))) self._last_sessions = sessions
agent_loss = float(result.get("agent_loss", 0.0)) self._limbo.add_update("demand", demand_map)
reward = (revenue_observed self._update_alpha_hat(self._last_sessions)
- COI self._last_coi = compute_coi_window(self._last_sessions, self._costs, demand_mapping=demand_map)
- self.constraints.w_agent_loss * agent_loss
- self.constraints.w_volatility * volatility
- self.constraints.w_estimation_error)
terminated = self.t >= self.constraints.episode_length self._alpha_true = float(np.clip(self._alpha_true + self.c.alpha_drift, *self.c.alpha_bounds))
volatility = float(np.std((prices - prev) / (prev + 1e-6)))
reward = float(self._reward(prices, revenue, cost, volatility))
conversion = float(np.sum(purchases) / max(len(self._last_sessions), 1))
self._t += 1
terminated = self._t >= int(self.c.max_steps)
obs = self._build_obs(prices, demand_by_product, revenue, conversion, min(volatility, 1.0))
info = { info = {
"t": self.t, "step": self._t,
"revenue_observed": revenue_observed, "reward": reward,
"revenue_oracle": float(result.get("revenue_oracle", revenue_observed)), "revenue": float(revenue),
"agent_loss": agent_loss, "profit": float(revenue - cost),
"ux_volatility": volatility, "n_sessions": int(self.c.sessions_per_step),
"look_to_book": float(result.get("look_to_book", 0.0)), "n_agents": int(n_agents),
"mean_sale_price": float(result.get("mean_sale_price", 0.0)), "alpha_true": float(self._alpha_true),
"true_human_purchases_total": 0.0, # TODO: track from simulation "alpha_hat": float(self._alpha_hat),
"true_agent_purchases_total": 0.0, # TODO: track from simulation "alpha_error": float(abs(self._alpha_hat - self._alpha_true)),
"price_std": float(np.std(prices)),
"price_volatility": float(volatility),
} }
return self.state, float(reward), terminated, False, info if self._last_coi is not None:
info.update(
{
"coi_policy": float(self._last_coi.policy),
"coi_agent": float(self._last_coi.agent),
"coi_leakage": float(self._last_coi.leak),
"coi_survival": float(self._last_coi.survival_ratio),
"coi_erosion": float(coi_erosion(self._last_coi.policy, self._last_coi.agent)),
}
)
return obs, reward, terminated, False, info
def render(self, mode: str = "human") -> str | None:
if self._prices is None:
return None
out = (
f"t={self._t}/{self.c.max_steps} "
f"alpha_true={self._alpha_true:.3f} alpha_hat={self._alpha_hat:.3f} "
f"price_std={float(np.std(self._prices)):.2f}"
)
if mode == "human":
print(out)
return out
if __name__ == "__main__": def close(self) -> None:
import matplotlib.pyplot as plt return
from collections import defaultdict
env = PHANTOMEnv(constraints=BusinessLogicConstraints())
obs, _ = env.reset(seed=42)
metrics = defaultdict(list)
total_reward = 0.0
done = False
while not done:
action = env.action_space.sample()
obs, reward, done, _, info = env.step(action)
total_reward += reward
p_mean = float(np.mean(obs["elasticity"]["price"]))
q_mean = float(np.mean(obs["elasticity"]["demand"]))
p_std = float(np.std(obs["elasticity"]["price"]))
metrics['t'].append(info['t'])
metrics['price_mean'].append(p_mean)
metrics['price_std'].append(p_std)
metrics['demand_mean'].append(q_mean)
metrics['revenue_observed'].append(info['revenue_observed'])
metrics['revenue_oracle'].append(info['revenue_oracle'])
metrics['agent_loss'].append(info['agent_loss'])
metrics['ux_volatility'].append(info['ux_volatility'])
metrics['look_to_book'].append(info['look_to_book'])
metrics['reward'].append(reward)
metrics['human_purchases'].append(info['true_human_purchases_total'])
metrics['agent_purchases'].append(info['true_agent_purchases_total'])
if info['t'] % 20 == 0 or done:
print(f"t={info['t']:03d} p={p_mean:6.2f}±{p_std:4.2f} q={q_mean:6.2f} "
f"rev={info['revenue_observed']:7.2f} oracle={info['revenue_oracle']:7.2f} "
f"loss={info['agent_loss']:6.2f} ux={info['ux_volatility']:.3f} "
f"ltb={info['look_to_book']:5.2f} r={reward:7.2f}")
print(f"total_reward={total_reward:.2f}")
fig, axes = plt.subplots(3, 3, figsize=(15, 12))
fig.suptitle('PHANTOM Environment Run', fontsize=14, fontweight='bold')
plot_configs = [
('price_mean', 'Mean Price', 'Price'),
('demand_mean', 'Mean Demand Estimate', 'Demand'),
('revenue_observed', 'Revenue (Observed)', 'Revenue'),
('agent_loss', 'Agent Loss (Oracle - Observed)', 'Loss'),
('ux_volatility', 'UX Volatility (Price Change)', 'Volatility'),
('look_to_book', 'Look-to-Book Ratio', 'Ratio'),
('reward', 'Step Reward', 'Reward'),
('human_purchases', 'Human Purchases', 'Count'),
('agent_purchases', 'Agent Purchases', 'Count'),
]
for idx, (key, title, ylabel) in enumerate(plot_configs):
ax = axes[idx // 3, idx % 3]
ax.plot(metrics['t'], metrics[key], color='blue', alpha=0.7, linewidth=1.5)
ax.set_xlabel('Step')
ax.set_ylabel(ylabel)
ax.set_title(title, fontsize=10, fontweight='bold')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('phantom_env_comparison.png', dpi=150, bbox_inches='tight')
print("Plot saved to phantom_env_comparison.png")
plt.show()

View File

@@ -0,0 +1,11 @@
"""JAX-accelerated simulation core for PHANTOM environment."""
from .transitions import TransitionData, compile_transitions, fallback_transitions, JAX_AVAILABLE
from .simulation import SessionBatch, SimResult, sample_sessions, compute_metrics
from .features import session_features, compute_session_transitions
from .separability import compute_divergences, estimate_alpha_batch
__all__ = [
"JAX_AVAILABLE", "TransitionData", "compile_transitions", "fallback_transitions",
"SessionBatch", "SimResult", "sample_sessions", "compute_metrics",
"session_features", "compute_session_transitions", "compute_divergences", "estimate_alpha_batch",
]

View File

@@ -0,0 +1,69 @@
"""Vectorized session feature extraction."""
import numpy as np
from .transitions import N_STATES, PURCHASE_IDX, CART_IDX
from .simulation import SessionBatch
try:
import jax.numpy as jnp
from jax import jit
JAX_AVAILABLE = True
except ImportError:
jnp, JAX_AVAILABLE = np, False
def jit(f): return f
@jit
def extract_features(states, dwells, lengths):
"""Extract per-session features. Returns (n_sess, 9) array."""
n, max_len = states.shape
mask = jnp.arange(max_len)[None,:] < lengths[:,None]
duration = jnp.sum(dwells * mask, axis=1)
total = lengths.astype(jnp.float32)
count = lambda idx: jnp.sum((states == idx) & mask, axis=1).astype(jnp.float32)
views, learn, carts, purchases = count(1), count(2), count(3), count(4)
velocity = total / (duration + 1e-6)
conversion = purchases / (views + 1e-6)
avg_dwell = duration / (total + 1e-6)
return jnp.stack([duration, avg_dwell, total, velocity, views, carts, purchases, learn, conversion], axis=1)
def session_features(batch: SessionBatch) -> np.ndarray:
if JAX_AVAILABLE:
return np.asarray(extract_features(jnp.array(batch.states), jnp.array(batch.dwells), jnp.array(batch.lengths)))
# numpy fallback
n, max_len = batch.states.shape
mask = np.arange(max_len)[None,:] < batch.lengths[:,None]
duration = np.sum(batch.dwells * mask, axis=1)
total = batch.lengths.astype(np.float32)
count = lambda idx: np.sum((batch.states == idx) & mask, axis=1).astype(np.float32)
views, learn, carts, purchases = count(1), count(2), count(3), count(4)
return np.stack([duration, duration/(total+1e-6), total, total/(duration+1e-6), views, carts, purchases, learn, purchases/(views+1e-6)], axis=1)
@jit
def session_transitions(states, lengths, n_states=N_STATES):
"""Compute empirical transition counts per session. Returns (n_sess, n_states, n_states)."""
n, max_len = states.shape
mask = jnp.arange(max_len - 1)[None,:] < (lengths[:,None] - 1)
src, dst = states[:, :-1], states[:, 1:]
# handle -1 padding by clamping to valid range
src_c, dst_c = jnp.clip(src, 0, n_states-1), jnp.clip(dst, 0, n_states-1)
valid = mask & (src >= 0) & (dst >= 0)
def per_session(i):
s, d, v = src_c[i], dst_c[i], valid[i]
trans = (jnp.eye(n_states)[s,:,None] * jnp.eye(n_states)[d,None,:]).sum(0) * v[:,None,None]
return trans.sum(0)
# vmap not ideal here, use manual loop for clarity
trans = jnp.stack([per_session(i) for i in range(n)])
row_sums = trans.sum(axis=-1, keepdims=True)
return trans / (row_sums + 1e-10)
def compute_session_transitions(batch: SessionBatch) -> np.ndarray:
if JAX_AVAILABLE:
return np.asarray(session_transitions(jnp.array(batch.states), jnp.array(batch.lengths)))
# numpy fallback
n, max_len = batch.states.shape
trans = np.zeros((n, N_STATES, N_STATES), dtype=np.float32)
for i in range(n):
for t in range(batch.lengths[i] - 1):
s, d = batch.states[i, t], batch.states[i, t+1]
if s >= 0 and d >= 0: trans[i, s, d] += 1
row_sums = trans.sum(axis=-1, keepdims=True)
return trans / (row_sums + 1e-10)

View File

@@ -0,0 +1,43 @@
"""Vectorized KL divergence for separability scoring."""
import numpy as np
from typing import Tuple
try:
import jax.numpy as jnp
from jax import jit
JAX_AVAILABLE = True
except ImportError:
jnp, JAX_AVAILABLE = np, False
def jit(f): return f
@jit
def batch_kl(P, Q_human, Q_agent, eps=1e-10):
"""Compute KL(P||Q) for batched P. P:(n,s,s), Q:(s,s). Returns (delta_h, delta_a) each (n,)."""
p = P + eps
p = p / p.sum(axis=-1, keepdims=True)
qh, qa = Q_human[None] + eps, Q_agent[None] + eps
delta_h = jnp.sum(p * jnp.log(p / qh), axis=(1, 2))
delta_a = jnp.sum(p * jnp.log(p / qa), axis=(1, 2))
return delta_h, delta_a
def compute_divergences(session_trans: np.ndarray, ref_human: np.ndarray, ref_agent: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
"""Compute KL divergence of each session from human/agent prototypes."""
if JAX_AVAILABLE:
dh, da = batch_kl(jnp.array(session_trans), jnp.array(ref_human), jnp.array(ref_agent))
return np.asarray(dh), np.asarray(da)
# numpy fallback
eps = 1e-10
p = session_trans + eps
p = p / p.sum(axis=-1, keepdims=True)
qh, qa = ref_human[None] + eps, ref_agent[None] + eps
delta_h = np.sum(p * np.log(p / qh), axis=(1, 2))
delta_a = np.sum(p * np.log(p / qa), axis=(1, 2))
return delta_h, delta_a
def estimate_alpha_batch(prob_agent: np.ndarray, delta_h: np.ndarray, delta_a: np.ndarray, temp: float = 1.0) -> np.ndarray:
"""Vectorized alpha estimation from classifier probs and divergences."""
mass = delta_h + delta_a
ratio = np.where(mass > 1e-8, delta_a / mass, 0.5)
blended = 0.5 * prob_agent + 0.5 * ratio
if temp <= 0: return np.clip(blended, 0.0, 1.0)
return np.clip(1.0 / (1.0 + np.exp(-temp * (blended - 0.5))), 0.0, 1.0)

View File

@@ -0,0 +1,116 @@
"""Vectorized Markov chain session sampling with JAX."""
from typing import NamedTuple, Tuple
import numpy as np
from functools import partial
try:
import jax, jax.numpy as jnp
from jax import lax
JAX_AVAILABLE = True
except ImportError:
JAX_AVAILABLE = False
from .transitions import TransitionData, N_STATES, TERM_IDX, PURCHASE_IDX, CART_IDX
class SessionBatch(NamedTuple):
states: np.ndarray # (n_sess, max_len) state indices, -1=padding
dwells: np.ndarray # (n_sess, max_len) dwell times
products: np.ndarray # (n_sess,) product index per session
actors: np.ndarray # (n_sess,) 0=human, 1=agent
lengths: np.ndarray # (n_sess,) actual session length
class SimResult(NamedTuple):
demand_human: np.ndarray
demand_agent: np.ndarray
revenue: float
revenue_oracle: float
agent_loss: float
coi: float
look_to_book: float
mean_sale_price: float
n_human_purchases: int
n_agent_purchases: int
sessions: SessionBatch
if JAX_AVAILABLE:
@partial(jax.jit, static_argnums=(5,6,7))
def _sample_sessions_jax(key, T_human, T_agent, dwell_human, dwell_agent, n_human, n_agent, max_steps):
n = n_human + n_agent
k1, k2, k3, k4 = jax.random.split(key, 4)
actors = jnp.concatenate([jnp.zeros(n_human, dtype=jnp.int32), jnp.ones(n_agent, dtype=jnp.int32)])
T = jnp.where(actors[:,None,None]==0, T_human[None], T_agent[None]) # (n,6,6)
dwell_p = jnp.where(actors[:,None,None]==0, dwell_human[None], dwell_agent[None]) # (n,6,2)
def step(carry, _):
s, active, k = carry
k, k1, k2 = jax.random.split(k, 3)
probs = T[jnp.arange(n), s] # (n,6)
nxt = jax.random.categorical(k1, jnp.log(probs + 1e-10))
nxt = jnp.where(active, nxt, -1)
shape = dwell_p[jnp.arange(n), s, 0]
scale = dwell_p[jnp.arange(n), s, 1]
dwell = jnp.maximum(0.3, jax.random.gamma(k2, shape) * scale)
still = active & (nxt != TERM_IDX) & (nxt >= 0)
return (nxt, still, k), (nxt, dwell)
init = (jnp.zeros(n, dtype=jnp.int32), jnp.ones(n, dtype=jnp.bool_), k3)
_, (states, dwells) = lax.scan(step, init, None, length=max_steps)
states, dwells = states.T, dwells.T # (n, max_steps)
is_term = (states == -1) | (states == TERM_IDX)
lengths = jnp.argmax(is_term, axis=1) + 1
lengths = jnp.where(jnp.any(is_term, axis=1), lengths, max_steps)
return states, dwells, actors, lengths
def sample_sessions(key, trans: TransitionData, n_human: int, n_agent: int, n_products: int, max_steps: int = 40) -> SessionBatch:
if JAX_AVAILABLE:
k1, k2 = jax.random.split(key)
states, dwells, actors, lengths = _sample_sessions_jax(k1, trans.human_T, trans.agent_T, trans.human_dwell, trans.agent_dwell, n_human, n_agent, max_steps)
products = jax.random.randint(k2, (n_human + n_agent,), 0, n_products)
return SessionBatch(np.asarray(states), np.asarray(dwells), np.asarray(products), np.asarray(actors), np.asarray(lengths))
# numpy fallback
rng = np.random.default_rng(int(key[0]) if hasattr(key, '__getitem__') else 42)
n = n_human + n_agent
actors = np.concatenate([np.zeros(n_human, dtype=np.int32), np.ones(n_agent, dtype=np.int32)])
products = rng.integers(0, n_products, size=n)
states, dwells = np.full((n, max_steps), -1, dtype=np.int32), np.zeros((n, max_steps), dtype=np.float32)
lengths = np.zeros(n, dtype=np.int32)
for i in range(n):
T = trans.human_T if actors[i] == 0 else trans.agent_T
dp = trans.human_dwell if actors[i] == 0 else trans.agent_dwell
s, t = 0, 0
while t < max_steps and s != TERM_IDX:
states[i, t] = s
dwells[i, t] = max(0.3, rng.gamma(dp[s, 0], dp[s, 1]))
s = rng.choice(N_STATES, p=T[s])
t += 1
lengths[i] = t
return SessionBatch(states, dwells, products, actors, lengths)
def compute_metrics(batch: SessionBatch, prices: np.ndarray, unit_cost: np.ndarray, base_price: np.ndarray) -> SimResult:
purchased = np.any(batch.states == PURCHASE_IDX, axis=1)
human_mask, agent_mask = batch.actors == 0, batch.actors == 1
human_purch, agent_purch = purchased & human_mask, purchased & agent_mask
demand_h = np.bincount(batch.products[human_purch], minlength=len(prices)).astype(np.float32)
demand_a = np.bincount(batch.products[agent_purch], minlength=len(prices)).astype(np.float32)
# revenue and oracle
purch_products = batch.products[purchased]
revenue = float(np.sum(prices[purch_products]))
revenue_oracle = float(np.sum(base_price[purch_products]))
# agent loss: base_price - price_paid for agent purchases (agents gaming the system)
agent_products = batch.products[agent_purch]
agent_loss = float(np.sum(base_price[agent_products] - prices[agent_products]))
# COI: margin - expected_premium*0.5 for human purchases
human_products = batch.products[human_purch]
if len(human_products) > 0:
margin = float(np.mean(prices[human_products] - unit_cost[human_products]))
premium = float(np.mean(base_price[human_products] - prices[human_products]))
coi = max(0.0, margin - premium * 0.5)
else:
coi = 0.0
# look to book: views / purchases
views = float(np.sum(batch.states == 1)) # view_item_page = index 1
n_purch = int(purchased.sum())
look_to_book = views / (n_purch + 1e-6)
mean_sale = float(np.mean(prices[purch_products])) if n_purch > 0 else 0.0
return SimResult(demand_h, demand_a, revenue, revenue_oracle, agent_loss, coi, look_to_book, mean_sale,
int(human_purch.sum()), int(agent_purch.sum()), batch)

View File

@@ -0,0 +1,47 @@
"""Dense transition matrices for JAX Markov chain sampling."""
from dataclasses import dataclass
import numpy as np
try:
import jax.numpy as jnp
JAX_AVAILABLE = True
except ImportError:
jnp, JAX_AVAILABLE = np, False
STATES = ["session_start", "view_item_page", "learn_more_about_item", "add_item_to_cart", "purchase_complete", "session_end"]
S2I = {s: i for i, s in enumerate(STATES)}
N_STATES, TERM_IDX, PURCHASE_IDX, CART_IDX = len(STATES), 5, 4, 3
@dataclass
class TransitionData:
human_T: np.ndarray # (6,6) transition probs
agent_T: np.ndarray # (6,6)
human_dwell: np.ndarray # (6,2) shape,scale
agent_dwell: np.ndarray # (6,2)
def to_jax(self):
if not JAX_AVAILABLE: return self
return TransitionData(*[jnp.array(x) for x in [self.human_T, self.agent_T, self.human_dwell, self.agent_dwell]])
def dict_to_dense(d):
m = np.zeros((N_STATES, N_STATES), dtype=np.float32)
for src, dsts in d.items():
if (i := S2I.get(src)) is not None:
for dst, p in dsts.items():
if (j := S2I.get(dst)) is not None: m[i,j] = p
m /= np.maximum(m.sum(1, keepdims=True), 1e-8)
m[TERM_IDX] = 0; m[TERM_IDX, TERM_IDX] = 1.0
return m
def compile_transitions(human_profile, agent_profile):
def dwell_arr(params): return np.array([[params.get(s, (2.0, 1.0)) for s in STATES]], dtype=np.float32).reshape(N_STATES, 2)
return TransitionData(dict_to_dense(human_profile.transitions), dict_to_dense(agent_profile.transitions),
dwell_arr(human_profile.dwell_params), dwell_arr(agent_profile.dwell_params))
def fallback_transitions():
H = {"session_start": {"view_item_page": .85, "session_end": .15}, "view_item_page": {"learn_more_about_item": .4, "add_item_to_cart": .3, "view_item_page": .2, "session_end": .1},
"learn_more_about_item": {"add_item_to_cart": .5, "view_item_page": .3, "session_end": .2}, "add_item_to_cart": {"purchase_complete": .6, "view_item_page": .25, "session_end": .15}, "purchase_complete": {"session_end": 1.0}}
A = {"session_start": {"view_item_page": .9, "session_end": .1}, "view_item_page": {"learn_more_about_item": .5, "add_item_to_cart": .25, "view_item_page": .15, "session_end": .1},
"learn_more_about_item": {"add_item_to_cart": .4, "view_item_page": .4, "session_end": .2}, "add_item_to_cart": {"purchase_complete": .5, "view_item_page": .3, "session_end": .2}, "purchase_complete": {"session_end": 1.0}}
dwell = np.full((N_STATES, 2), [2.0, 1.0], dtype=np.float32)
return TransitionData(dict_to_dense(H), dict_to_dense(A), dwell.copy(), dwell.copy())

View File

@@ -4,16 +4,17 @@ from pathlib import Path
from typing import Dict, Type, Optional from typing import Dict, Type, Optional
import pickle import pickle
from torch.utils.tensorboard import SummaryWriter from torch.utils.tensorboard import SummaryWriter
from environment import PHANTOMEnv, BusinessLogicConstraints from sim.rl.environment import PHANTOMEnv, BusinessLogicConstraints
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s') logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
try: try:
from engine import (BasePricingEngine, WildPricingEngine, StaticPricingEngine, from sim.rl.engine import (BasePricingEngine, WildPricingEngine, StaticPricingEngine,
SimpleDemandEngine, RandomWalkEngine, ThompsonSamplingEngine) SimpleDemandEngine, RandomWalkEngine, ThompsonSamplingEngine)
except ImportError: except ImportError as e:
BasePricingEngine = None # engines not required for basic usage BasePricingEngine = None # engines not required for basic usage
print(e)
""" """
@@ -36,27 +37,49 @@ class EngineTrainer:
self.global_step = 0 self.global_step = 0
def train(self, n_episodes: int, seed: int = 42): def train(self, n_episodes: int, seed: int = 42):
obs, _ = self.env.reset(seed=seed)
prices = None
for ep in range(n_episodes): for ep in range(n_episodes):
prices = self.engine.compute_prices(prices, obs) obs, _ = self.env.reset(seed=seed + ep)
obs, reward, done, _, info = self.env.step(prices) self.engine.reset()
self.engine.update(obs, reward, done, info) done = False
prev_prices = obs["elasticity"]["price"]
episode_reward = 0.0
last_info: Dict[str, float] = {}
while not done:
action_prices = self.engine.compute_prices(prev_prices, obs)
obs, reward, done, _, info = self.env.step(action_prices)
self.engine.update(obs, reward, done, info)
episode_reward += reward
prev_prices = obs["elasticity"]["price"]
last_info = info
if self.tb_writer:
self.tb_writer.add_scalar("reward/step", reward, self.global_step)
if "coi" in info:
self.tb_writer.add_scalar("diagnostics/coi", info["coi"], self.global_step)
if "alpha_hat" in info:
self.tb_writer.add_scalar("diagnostics/alpha_hat", info["alpha_hat"], self.global_step)
self.global_step += 1
last_info = dict(last_info)
last_info.update({"episode_reward": episode_reward, "episode": ep})
self.episode_metrics.append(last_info)
if self.tb_writer:
self.tb_writer.add_scalar("reward/episode", episode_reward, ep)
return self return self
def run_episode(self, seed: int = 42) -> Dict: def run_episode(self, seed: int = 42) -> Dict:
"""run single evaluation episode and return metrics""" """run single evaluation episode and return metrics"""
obs, _ = self.env.reset(seed=seed) obs, _ = self.env.reset(seed=seed)
self.engine.reset() self.engine.reset()
total_reward, prices = 0.0, None total_reward = 0.0
prev_prices = obs["elasticity"]["price"]
ep_metrics = {'total_reward': 0.0} ep_metrics = {'total_reward': 0.0}
done = False done = False
while not done: while not done:
prices = self.engine.compute_prices(prices, obs) if prices is not None else obs["elasticity"]["price"] action_prices = self.engine.compute_prices(prev_prices, obs)
obs, reward, done, _, info = self.env.step(prices) obs, reward, done, _, info = self.env.step(action_prices)
total_reward += reward total_reward += reward
for k, v in info.items(): for k, v in info.items():
ep_metrics[k] = v ep_metrics[k] = v
prev_prices = obs["elasticity"]["price"]
ep_metrics['total_reward'] = total_reward ep_metrics['total_reward'] = total_reward
return ep_metrics return ep_metrics
@@ -106,7 +129,7 @@ if __name__ == "__main__":
logger.error("Engines not available, cannot run training") logger.error("Engines not available, cannot run training")
exit(1) exit(1)
base_dir = Path("./runs") base_dir = Path("./sim/rl/runs")
base_dir.mkdir(exist_ok=True) base_dir.mkdir(exist_ok=True)
engines = { engines = {

View File

@@ -1,4 +1,9 @@
import os, requests, py7zr import os
import requests
try:
import py7zr # type: ignore
except ImportError: # pragma: no cover - optional dependency
py7zr = None
import pandas as pd import pandas as pd
from typing import Generator from typing import Generator
try: try:
@@ -22,12 +27,16 @@ class YooChooseLoader(Loader):
self.entries = list(self.data.keys()) self.entries = list(self.data.keys())
def _setup(self): def _setup(self):
if py7zr is None:
raise RuntimeError("py7zr is required to unpack YooChoose dataset. Install py7zr first.")
os.makedirs(self.root, exist_ok=True) os.makedirs(self.root, exist_ok=True)
zip_path = f"{self.root}/temp.7z" zip_path = f"{self.root}/temp.7z"
with requests.get(self.URL, stream=True) as r: with requests.get(self.URL, stream=True) as r:
with open(zip_path, 'wb') as f: with open(zip_path, 'wb') as f:
for chunk in r.iter_content(8192): f.write(chunk) for chunk in r.iter_content(8192):
with py7zr.SevenZipFile(zip_path, 'r') as z: z.extractall(self.root) f.write(chunk)
with py7zr.SevenZipFile(zip_path, 'r') as z:
z.extractall(self.root)
os.remove(zip_path) os.remove(zip_path)
def _make_interaction(self, sid: str, ts: str, item_id: str, event: str, page: str, meta: dict) -> InteractionModel: def _make_interaction(self, sid: str, ts: str, item_id: str, event: str, page: str, meta: dict) -> InteractionModel:

7
tests/e2e/.env.example Normal file
View File

@@ -0,0 +1,7 @@
WEB_URL=http://localhost:3000
BACKEND_URL=http://localhost:5000
PRICING_PROVIDER_URL=http://localhost:5001
AIRFLOW_URL=http://localhost:8085
AIRFLOW_USER=admin
AIRFLOW_PASS=admin
HEADLESS=true