chore: migrating thesis case definition

This commit is contained in:
2026-01-26 13:19:55 +01:00
parent 98a9a3738c
commit cd6c3d6006
11 changed files with 741 additions and 12 deletions

View File

@@ -0,0 +1,2 @@
"""Minimal thesis-aligned pricing simulation (self-contained)."""

View File

@@ -0,0 +1,125 @@
"""Cost of Information (COI) computation for thesis pricing system.
Core KPI: COI = E[p_shown] - p_min measures pricing power from information asymmetry.
Theorem 1 shows COI erodes as agent queries increase: as N->inf, p^(1)->p_min.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, List, TYPE_CHECKING
import numpy as np
if TYPE_CHECKING:
from .simplified import Session
@dataclass(frozen=True)
class COIWindow:
"""Windowed COI metrics computed from realized price exposures.
policy: E[p_shown] - cost, the definition-level KPI
agent: E[p^(1)] - cost where p^(1) is min price under agent querying
leak: max(policy - agent, 0), observable gap from reconnaissance
survival_ratio: agent/policy, fraction of pricing power retained
"""
policy: float
agent: float
leak: float
survival_ratio: float
policy_by_product: np.ndarray
agent_by_product: np.ndarray
demand_weights: np.ndarray
def aggregate_prices(sessions: List["Session"], mode: str = "all") -> Dict[int, List[float] | float]:
"""Unified price aggregation across sessions.
mode: "all" returns all prices per product, "min_per_session" returns min price per session per product,
"min_across" returns single min price per product
"""
if mode == "min_across":
mins: Dict[int, float] = {}
for s in sessions:
for e in s.events:
pidx, price = int(e.product_idx), float(e.price_seen)
mins[pidx] = min(mins.get(pidx, price), price)
return mins
elif mode == "min_per_session":
result: Dict[int, List[float]] = {}
for s in sessions:
by_p: Dict[int, float] = {}
for e in s.events:
pidx, price = int(e.product_idx), float(e.price_seen)
by_p[pidx] = min(by_p.get(pidx, price), price)
for pidx, pmin in by_p.items():
result.setdefault(pidx, []).append(pmin)
return result
else: # "all"
prices: Dict[int, List[float]] = {}
for s in sessions:
for e in s.events:
prices.setdefault(e.product_idx, []).append(float(e.price_seen))
return prices
def demand_weights_by_product(sessions: List["Session"], demand_mapping: Dict[str, float], n_products: int) -> np.ndarray:
"""Compute demand-weighted importance per product."""
w = np.zeros(n_products, dtype=float)
sessions_by_id = {s.sid: s for s in sessions}
for sid, q in demand_mapping.items():
sess = sessions_by_id.get(sid)
if sess and sess.events:
w[int(sess.events[0].product_idx)] += float(q)
total = float(np.sum(w))
return (w / total) if total > 0 else w
def compute_coi_window(sessions: List["Session"], costs: np.ndarray, demand_mapping: Dict[str, float] | None = None) -> COIWindow:
"""Compute COI metrics over session window.
Aggregates price exposures and computes policy-level vs agent-realized COI.
"""
n = int(len(costs))
prices = aggregate_prices(sessions, mode="all")
agent_sessions = [s for s in sessions if s.actor == "A"]
agent_min = aggregate_prices(agent_sessions, mode="min_across") if agent_sessions else {}
policy_by = np.zeros(n, dtype=float)
agent_by = np.zeros(n, dtype=float)
seen = np.array([(i in prices) for i in range(n)], dtype=bool)
agent_seen = np.array([(i in agent_min) for i in range(n)], dtype=bool)
for pidx, ps in prices.items():
if 0 <= pidx < n and ps:
policy_by[pidx] = float(np.mean(ps) - float(costs[pidx]))
for pidx, pmin in agent_min.items():
if 0 <= pidx < n:
agent_by[pidx] = float(pmin - float(costs[pidx]))
agent_by[seen & ~agent_seen] = policy_by[seen & ~agent_seen] # no erosion if no agent exposure
demand_w = demand_weights_by_product(sessions, demand_mapping, n) if demand_mapping else np.zeros(n, dtype=float)
has_weights = float(np.sum(demand_w)) > 0
if has_weights:
policy, agent = float(np.dot(demand_w, policy_by)), float(np.dot(demand_w, agent_by))
elif np.any(seen):
policy, agent = float(np.mean(policy_by[seen])), float(np.mean(agent_by[seen]))
else:
policy, agent = 0.0, 0.0
leak = float(max(policy - agent, 0.0))
survival = float(np.clip(agent / policy, 0.0, 1.0)) if policy > 0 else 0.0
return COIWindow(policy=policy, agent=agent, leak=leak, survival_ratio=survival,
policy_by_product=policy_by, agent_by_product=agent_by, demand_weights=demand_w)
def coi_erosion(coi_policy: float, coi_agent: float, eps: float = 1e-9) -> float:
"""Thesis-consistent COI erosion: fraction of pricing power destroyed by agent queries.
erosion = 1 - (COI_agent / COI_policy)
When agents find low prices, COI_agent -> 0, erosion -> 1.
"""
if coi_policy <= eps:
return 0.0
return float(np.clip(1.0 - (coi_agent / (coi_policy + eps)), 0.0, 1.0))

View File

@@ -0,0 +1,325 @@
"""COI leakage experiments and policy comparisons.
Demonstrates the core thesis contribution: COI erosion under agent contamination
and recovery via robust pricing policies.
Generates TensorBoard logs for:
- COI erosion curves across contamination levels
- Policy comparison (fixed vs adaptive vs RL)
- Revenue/margin trade-offs
"""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Tuple
import json
import numpy as np
try:
from torch.utils.tensorboard import SummaryWriter
HAS_TB = True
except ImportError:
HAS_TB = False
from .simplified_env import PricingEnv, EnvConfig, make_env
from .simplified import System
@dataclass
class ExperimentResult:
"""Container for experiment metrics."""
name: str
alpha: float
reward_mean: float
reward_std: float
coi_erosion: float
alpha_error: float
revenue: float
margin: float
def to_dict(self) -> dict:
return {k: getattr(self, k) for k in self.__dataclass_fields__}
def theoretical_coi_erosion_curve(alphas: np.ndarray, n_sessions: int = 1000) -> np.ndarray:
"""Theoretical COI erosion from Theorem 1 using order statistic model.
For N i.i.d. uniform queries on [p_min, p_max]:
E[p^(1)] = p_min + (p_max - p_min)/(N+1), so erosion = 1 - 2/(N+1)
"""
erosions = []
for a in alphas:
n_agents = max(1, int(a * n_sessions))
erosions.append(1.0 - 2.0 / (n_agents + 1))
return np.array(erosions)
def run_policy_episode(
env: PricingEnv,
policy_fn,
n_episodes: int = 10
) -> Tuple[List[float], List[float], List[float], List[float]]:
"""Run policy and collect per-step metrics."""
rewards, coi_erosions, alpha_errors, revenues = [], [], [], []
for _ in range(n_episodes):
obs, info = env.reset()
done = False
while not done:
action = policy_fn(obs, env.n)
obs, reward, terminated, truncated, info = env.step(action)
done = terminated or truncated
rewards.append(reward)
if 'coi_erosion' in info:
coi_erosions.append(info['coi_erosion'])
if 'alpha_true' in info and 'alpha_est' in info:
alpha_errors.append(abs(info['alpha_true'] - info['alpha_est']))
if 'revenue' in info:
revenues.append(info['revenue'])
return rewards, coi_erosions, alpha_errors, revenues
class PolicyRegistry:
"""Registry of baseline policies."""
@staticmethod
def fixed(obs: np.ndarray, n: int, margin: float = 0.15) -> np.ndarray:
return np.ones(n, dtype=np.float32) * (1.0 + margin)
@staticmethod
def random(obs: np.ndarray, n: int, rng: np.random.Generator = None) -> np.ndarray:
rng = rng or np.random.default_rng()
return rng.uniform(0.7, 1.3, n).astype(np.float32)
@staticmethod
def adaptive(obs: np.ndarray, n: int, base_margin: float = 0.15) -> np.ndarray:
"""Reduce margins when alpha estimate is high."""
alpha_est = obs[2 * n] if len(obs) > 2 * n else 0.2
margin_scale = 1.0 - 0.4 * alpha_est
return np.ones(n, dtype=np.float32) * (1.0 + base_margin * margin_scale)
@staticmethod
def aggressive(obs: np.ndarray, n: int) -> np.ndarray:
"""High margins, ignores contamination."""
return np.ones(n, dtype=np.float32) * 1.4
@staticmethod
def defensive(obs: np.ndarray, n: int) -> np.ndarray:
"""Low margins, always cautious."""
return np.ones(n, dtype=np.float32) * 1.05
@staticmethod
def alpha_proportional(obs: np.ndarray, n: int, max_margin: float = 0.3) -> np.ndarray:
"""Margin inversely proportional to estimated alpha."""
alpha_est = obs[2 * n] if len(obs) > 2 * n else 0.2
margin = max_margin * (1.0 - alpha_est)
return np.ones(n, dtype=np.float32) * (1.0 + margin)
def run_contamination_sweep(
alphas: List[float],
policies: Dict[str, callable],
n_products: int = 10,
max_steps: int = 200,
n_episodes: int = 10,
seed: int = 42,
log_dir: str = None
) -> Dict[str, List[ExperimentResult]]:
"""Run policies across contamination levels."""
results = {name: [] for name in policies}
writer = SummaryWriter(Path(log_dir) / "sweep") if log_dir and HAS_TB else None
for alpha in alphas:
print(f" alpha={alpha:.2f}", end=" ")
env_cfg = EnvConfig(
n_products=n_products, max_steps=max_steps,
alpha_true=alpha, reward_mode="robust", seed=seed)
env = make_env(env_cfg)
for name, policy_fn in policies.items():
rewards, coi_vals, alpha_errs, revenues = run_policy_episode(env, policy_fn, n_episodes)
result = ExperimentResult(
name=name, alpha=alpha,
reward_mean=float(np.mean(rewards)),
reward_std=float(np.std(rewards)),
coi_erosion=float(np.mean(coi_vals)) if coi_vals else 0.0,
alpha_error=float(np.mean(alpha_errs)) if alpha_errs else 0.0,
revenue=float(np.mean(revenues)) if revenues else 0.0,
margin=float(np.mean([policy_fn(np.zeros(3 * n_products + 3), n_products)]) - 1.0))
results[name].append(result)
if writer:
step = int(alpha * 100)
writer.add_scalar(f'{name}/reward', result.reward_mean, step)
writer.add_scalar(f'{name}/coi_erosion', result.coi_erosion, step)
writer.add_scalar(f'{name}/alpha_error', result.alpha_error, step)
writer.add_scalar(f'{name}/revenue', result.revenue, step)
print(f"done")
# add theoretical curve
if writer:
theo = theoretical_coi_erosion_curve(np.array(alphas))
for i, (a, e) in enumerate(zip(alphas, theo)):
writer.add_scalar('theoretical/coi_erosion', e, int(a * 100))
writer.close()
return results
def run_coi_demonstration(log_dir: str = "sim/case/thesis_simplified/runs", seed: int = 42) -> Dict:
"""Main COI demonstration experiment."""
print("=== COI Leakage Demonstration ===\n")
Path(log_dir).mkdir(parents=True, exist_ok=True)
writer = SummaryWriter(Path(log_dir) / "coi_demo") if HAS_TB else None
# theoretical erosion curve
print("1. Theoretical COI erosion (Theorem 1)")
alphas = np.linspace(0.0, 0.6, 13)
theo_erosion = theoretical_coi_erosion_curve(alphas, n_sessions=1000)
for a, e in zip(alphas, theo_erosion):
print(f" alpha={a:.2f} -> erosion={e:.3f}")
if writer:
writer.add_scalar('theory/coi_erosion', e, int(a * 100))
# policy comparison
print("\n2. Policy comparison across contamination levels")
policies = {
'fixed': lambda obs, n: PolicyRegistry.fixed(obs, n),
'aggressive': PolicyRegistry.aggressive,
'defensive': PolicyRegistry.defensive,
'adaptive': PolicyRegistry.adaptive,
'alpha_proportional': PolicyRegistry.alpha_proportional,
}
sweep_alphas = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
results = run_contamination_sweep(
sweep_alphas, policies, n_products=10, max_steps=100,
n_episodes=5, seed=seed, log_dir=log_dir)
# summarize
print("\n3. Summary by policy")
for name, res_list in results.items():
avg_reward = np.mean([r.reward_mean for r in res_list])
avg_coi = np.mean([r.coi_erosion for r in res_list])
print(f" {name:20s}: avg_reward={avg_reward:.2f}, avg_coi={avg_coi:.3f}")
# save results
output = {
'theoretical': {'alphas': alphas.tolist(), 'erosion': theo_erosion.tolist()},
'empirical': {name: [r.to_dict() for r in res_list] for name, res_list in results.items()}}
with open(Path(log_dir) / "coi_demo_results.json", 'w') as f:
json.dump(output, f, indent=2)
if writer:
writer.close()
print(f"\nResults saved to {log_dir}/coi_demo_results.json")
print(f"TensorBoard: tensorboard --logdir {log_dir}")
return output
def run_reward_mode_comparison(log_dir: str = "sim/case/thesis_simplified/runs", seed: int = 42) -> Dict:
"""Compare different reward modes."""
print("=== Reward Mode Comparison ===\n")
Path(log_dir).mkdir(parents=True, exist_ok=True)
writer = SummaryWriter(Path(log_dir) / "reward_modes") if HAS_TB else None
reward_modes = ["revenue", "profit", "robust", "coi_aware"]
alpha = 0.3 # moderate contamination
results = {}
for mode in reward_modes:
print(f" mode={mode}", end=" ")
env_cfg = EnvConfig(
n_products=10, max_steps=200, alpha_true=alpha,
reward_mode=mode, seed=seed)
env = make_env(env_cfg)
rewards, coi_vals, _, revenues = run_policy_episode(
env, PolicyRegistry.adaptive, n_episodes=10)
results[mode] = {
'reward_mean': float(np.mean(rewards)),
'reward_std': float(np.std(rewards)),
'coi_erosion': float(np.mean(coi_vals)) if coi_vals else 0.0,
'revenue': float(np.mean(revenues)) if revenues else 0.0}
if writer:
for k, v in results[mode].items():
writer.add_scalar(f'{mode}/{k}', v, 0)
print(f"reward={results[mode]['reward_mean']:.2f}, coi={results[mode]['coi_erosion']:.3f}")
if writer:
writer.close()
with open(Path(log_dir) / "reward_mode_results.json", 'w') as f:
json.dump(results, f, indent=2)
return results
def run_alpha_drift_experiment(log_dir: str = "sim/case/thesis_simplified/runs", seed: int = 42) -> Dict:
"""Test policy robustness under non-stationary contamination."""
print("=== Alpha Drift Experiment ===\n")
Path(log_dir).mkdir(parents=True, exist_ok=True)
writer = SummaryWriter(Path(log_dir) / "alpha_drift") if HAS_TB else None
drift_rates = [0.0, 0.01, 0.02, 0.05]
results = {}
for drift in drift_rates:
print(f" drift={drift:.2f}", end=" ")
env_cfg = EnvConfig(
n_products=10, max_steps=200, alpha_true=0.2,
alpha_drift=drift, reward_mode="robust", seed=seed)
env = make_env(env_cfg)
rewards, coi_vals, alpha_errs, _ = run_policy_episode(
env, PolicyRegistry.adaptive, n_episodes=10)
results[f'drift_{drift}'] = {
'reward_mean': float(np.mean(rewards)),
'coi_erosion': float(np.mean(coi_vals)) if coi_vals else 0.0,
'alpha_tracking_error': float(np.mean(alpha_errs)) if alpha_errs else 0.0}
if writer:
for k, v in results[f'drift_{drift}'].items():
writer.add_scalar(f'drift_{drift}/{k}', v, 0)
print(f"reward={results[f'drift_{drift}']['reward_mean']:.2f}, "
f"alpha_err={results[f'drift_{drift}']['alpha_tracking_error']:.3f}")
if writer:
writer.close()
return results
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Run COI experiments")
parser.add_argument("--exp", type=str, default="coi", choices=["coi", "reward", "drift", "all"])
parser.add_argument("--log-dir", type=str, default="sim/case/thesis_simplified/runs")
parser.add_argument("--seed", type=int, default=42)
args = parser.parse_args()
if args.exp == "coi" or args.exp == "all":
run_coi_demonstration(args.log_dir, args.seed)
if args.exp == "reward" or args.exp == "all":
run_reward_mode_comparison(args.log_dir, args.seed)
if args.exp == "drift" or args.exp == "all":
run_alpha_drift_experiment(args.log_dir, args.seed)

View File

@@ -0,0 +1,72 @@
"""Behavioral separability for human/agent detection.
Computes divergence signals delta_H, delta_A from session trajectories using
transition kernel estimation and KL divergence to prototype behavioral profiles.
"""
from __future__ import annotations
from typing import Dict, List, Tuple, TYPE_CHECKING
import numpy as np
if TYPE_CHECKING:
from .simplified import Event, Session
# prototype behavioral kernels for human vs agent sessions
TRANS_H = {
"start": {"view": 0.85, "end": 0.15},
"view": {"detail": 0.4, "cart": 0.3, "view": 0.2, "end": 0.1},
"detail": {"cart": 0.5, "view": 0.3, "end": 0.2},
"cart": {"purchase": 0.6, "view": 0.25, "end": 0.15},
"purchase": {"end": 1.0},
}
TRANS_A = {
"start": {"view": 0.95, "end": 0.05},
"view": {"detail": 0.6, "view": 0.25, "cart": 0.1, "end": 0.05},
"detail": {"view": 0.5, "cart": 0.15, "detail": 0.3, "end": 0.05},
"cart": {"view": 0.4, "purchase": 0.2, "end": 0.4},
"purchase": {"end": 1.0},
}
def kl_div(p: Dict[str, float], q: Dict[str, float], eps: float = 1e-10) -> float:
"""KL divergence D_KL(p || q) for discrete distributions."""
keys = set(p.keys()) | set(q.keys())
return sum(p.get(k, eps) * np.log((p.get(k, eps) + eps) / (q.get(k, eps) + eps)) for k in keys)
def build_kernel(events: List["Event"]) -> Dict[str, Dict[str, float]]:
"""Build empirical transition kernel T' from trajectory events."""
trans: Dict[str, Dict[str, int]] = {}
prev = "start"
for e in events:
curr = e.action
trans.setdefault(prev, {})
trans[prev][curr] = trans[prev].get(curr, 0) + 1
prev = curr
return {s: {d: c / sum(dsts.values()) for d, c in dsts.items()} for s, dsts in trans.items() if sum(dsts.values()) > 0}
def compute_divergence(session: "Session") -> Tuple[float, float]:
"""Compute divergence signals delta_H, delta_A for session.
delta_H = mean KL(T' || T_H) across states, measures distance to human prototype
delta_A = mean KL(T' || T_A) across states, measures distance to agent prototype
"""
kernel = build_kernel(session.events)
if not kernel:
return 0.5, 0.5
delta_h = sum(kl_div(kernel.get(s, {}), TRANS_H.get(s, {})) for s in kernel) / len(kernel)
delta_a = sum(kl_div(kernel.get(s, {}), TRANS_A.get(s, {})) for s in kernel) / len(kernel)
return delta_h, delta_a
def estimate_alpha(session: "Session", beta: float = 2.0) -> float:
"""Per-session contamination estimate alpha_hat = sigma(beta*(delta_H - delta_A)).
Returns probability session is agent-generated based on behavioral divergence.
"""
dh, da = compute_divergence(session)
if (dh + da) <= 0:
return 0.5
return 1.0 / (1.0 + np.exp(-beta * (dh - da)))

View File

@@ -0,0 +1,219 @@
"""Minimal implementation of thesis pricing system.
Implements the core loop: prices -> sessions -> demand -> prices
with behavioral separability and robust pricing objective.
Objects:
- Session trajectories tau_s from mixture of H/A behavioral profiles
- Demand proxy q_hat via weighted action aggregation
- COI leakage penalty for agent reconnaissance
- Limbo: alternating price/demand history for trajectory analysis
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Dict, List, Tuple
import numpy as np
from .coi import COIWindow, compute_coi_window
from .separability import TRANS_H, TRANS_A, kl_div, build_kernel, compute_divergence, estimate_alpha
ACTION_WEIGHTS = {"add_to_cart": 0.8, "checkout": 0.9, "purchase": 1.0, "view": 0.15, "detail": 0.25, "hover": 0.3, "start": 0.05, "end": 0.0}
@dataclass
class Event:
action: str
product_idx: int
price_seen: float
ts: float
@dataclass
class Session:
sid: str
events: List[Event]
actor: str # H or A (ground truth label)
theta: Dict[str, float] = field(default_factory=dict)
def compute_demand(session: Session) -> float:
"""Compute demand proxy q_hat = sum_k omega(a_k) for session."""
return sum(ACTION_WEIGHTS.get(e.action, 0.1) for e in session.events)
def sample_trajectory(rng: np.random.Generator, trans: Dict, prices: np.ndarray, costs: np.ndarray, theta: Dict[str, float],
is_agent: bool, session_noise: float = 0.02, surge: float = 0.08, max_mult: float = 1.8) -> Tuple[List[Event], int]:
"""Sample session trajectory from behavioral kernel."""
pidx = int(rng.integers(0, len(prices)))
cost, base = float(costs[pidx]), float(prices[pidx]) * (1.0 + rng.normal(0.0, session_noise))
base = float(np.clip(base, cost * 1.01, float(prices[pidx]) * 2.0))
price, signal, state, t = base, 0.0, "start", 0.0
events = []
while state != "end" and len(events) < 30:
probs = trans.get(state, {"end": 1.0})
nxt = rng.choice(list(probs.keys()), p=list(probs.values()))
if nxt == "purchase": # purchase conversion check
rel = max((price - cost) / (cost + 1e-6), 0.0)
p_buy = float(np.clip(theta.get("base_conv", 0.2) * np.exp(-theta.get("price_sens", 2.0) * rel), 0.0, 1.0))
if rng.random() > p_buy:
nxt = "end"
state = nxt
if state not in {"start", "end"}:
events.append(Event(action=state, product_idx=pidx, price_seen=float(price), ts=t))
signal += float(ACTION_WEIGHTS.get(state, 0.1))
price = float(np.clip(base * (1.0 + surge * signal), cost * 1.01, base * max_mult))
t += max(0.2, rng.gamma(1.5, 0.8) if is_agent else rng.gamma(2.0, 1.2))
return events, pidx
def put_prices_to_market(prices: np.ndarray, costs: np.ndarray, alpha: float = 0.2, n_sessions: int = 50,
seed: int | None = None) -> Tuple[List[Session], Dict[str, float]]:
"""Generate sessions from mixture model. Returns sessions and demand mapping sid -> q_hat."""
rng = np.random.default_rng(seed)
sessions, demand = [], {}
for i in range(n_sessions):
sid = f"s{i:04d}"
is_agent = rng.random() < alpha
trans = TRANS_A if is_agent else TRANS_H
theta = {"price_sens": rng.uniform(0.05, 0.2), "base_conv": 0.01} if is_agent else \
{"price_sens": rng.uniform(1.5, 4.0), "base_conv": rng.uniform(0.2, 0.5)}
events, _ = sample_trajectory(rng, trans, prices, costs=costs, theta=theta, is_agent=is_agent)
session = Session(sid=sid, events=events, actor="A" if is_agent else "H", theta=theta)
sessions.append(session)
demand[sid] = compute_demand(session)
return sessions, demand
@dataclass
class LimboUpdate:
utype: str # "prices" or "demand"
data: np.ndarray | Dict[str, float]
t: int
class Limbo:
"""Historical trajectory of alternating price/demand observations."""
def __init__(self):
self.history: List[LimboUpdate] = []
self._t = 0
def add_update(self, utype: str, data: np.ndarray | Dict[str, float]) -> Dict:
self.history.append(LimboUpdate(utype=utype, data=data, t=self._t))
self._t += 1
return {"action": "observe_demand" if utype == "prices" else "set_prices"}
def get_prices_history(self) -> List[np.ndarray]:
return [u.data for u in self.history if u.utype == "prices"]
def get_demand_history(self) -> List[Dict[str, float]]:
return [u.data for u in self.history if u.utype == "demand"]
class System:
"""Main pricing system implementing robust Stackelberg objective.
Manages the alternating loop: set prices p_t -> observe demand Q_hat(p_t) ->
estimate contamination alpha from behavioral signals -> compute next prices.
"""
def __init__(self, n_products: int = 10, costs: np.ndarray | None = None, lambda_coi: float = 0.5, seed: int | None = 42):
self.n = n_products
self.rng = np.random.default_rng(seed)
self.costs = costs if costs is not None else self.rng.uniform(10, 50, n_products)
self.refs = self.costs * (1 + self.rng.uniform(0.2, 0.5, n_products))
self.lambda_coi = lambda_coi
self.limbo = Limbo()
self._alpha_est = 0.2
self._sessions: List[Session] = []
self._last_sessions: List[Session] = []
self._last_coi: COIWindow | None = None
@property
def alpha(self) -> float:
return self._alpha_est
def _estimate_alpha_from_sessions(self) -> float:
if not self._sessions:
return self._alpha_est
return float(np.mean([estimate_alpha(s) for s in self._sessions[-50:]]))
def _revenue_under_demand(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
agg = np.zeros(self.n)
for sid, q in demand.items():
sess = next((s for s in self._sessions if s.sid == sid), None)
if sess and sess.events:
agg[sess.events[0].product_idx] += q
return float(np.dot(prices, agg))
def _compute_coi_window(self, demand: Dict[str, float]) -> COIWindow:
if not self._last_sessions:
zeros = np.zeros(self.n, dtype=float)
return COIWindow(policy=0.0, agent=0.0, leak=0.0, survival_ratio=0.0,
policy_by_product=zeros, agent_by_product=zeros, demand_weights=zeros)
return compute_coi_window(self._last_sessions, self.costs, demand_mapping=demand)
def _objective(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
"""Robust objective: R(p,d) - lambda * COI_leak."""
profit = self._revenue_under_demand(prices, demand) - float(np.sum(self.costs))
self._last_coi = self._compute_coi_window(demand)
return profit - self.lambda_coi * self._last_coi.leak
def compute_prices(self, demand: Dict[str, float] | None = None) -> np.ndarray:
"""Compute next prices via heuristic margin adjustment based on alpha estimate."""
self._alpha_est = self._estimate_alpha_from_sessions()
margin_scale = 1.0 - 0.5 * self._alpha_est # defensive pricing under high contamination
margins = (self.refs - self.costs) * margin_scale
noise = self.rng.normal(0, 0.02, self.n) * self.costs
prices = np.clip(self.costs + margins + noise, self.costs * 1.02, self.refs * 1.3)
self.limbo.add_update("prices", prices)
return prices
def observe_demand(self, prices: np.ndarray, alpha_true: float = 0.2, n_sessions: int = 50) -> Dict[str, float]:
sessions, demand_map = put_prices_to_market(prices, costs=self.costs, alpha=alpha_true,
n_sessions=n_sessions, seed=int(self.rng.integers(0, 10000)))
self._last_sessions = sessions
self._sessions.extend(sessions)
self.limbo.add_update("demand", demand_map)
return demand_map
def step(self, alpha_true: float = 0.2, n_sessions: int = 50) -> Tuple[np.ndarray, Dict[str, float], float, COIWindow]:
demand_hist = self.limbo.get_demand_history()
prices = self.compute_prices(demand_hist[-1] if demand_hist else None)
demand = self.observe_demand(prices, alpha_true, n_sessions)
reward = self._objective(prices, demand)
return prices, demand, reward, self._last_coi or self._compute_coi_window(demand)
def run(self, n_steps: int = 100, alpha_true: float = 0.2) -> Dict:
traj = {"prices": [], "demand": [], "rewards": [], "alpha_est": [], "alpha_true": alpha_true,
"coi_policy": [], "coi_agent": [], "coi_leak": [], "coi_survival": []}
for _ in range(n_steps):
p, d, r, coi = self.step(alpha_true)
traj["prices"].append(p); traj["demand"].append(d); traj["rewards"].append(r)
traj["alpha_est"].append(self._alpha_est)
traj["coi_policy"].append(coi.policy); traj["coi_agent"].append(coi.agent)
traj["coi_leak"].append(coi.leak); traj["coi_survival"].append(coi.survival_ratio)
return traj
if __name__ == "__main__":
sys = System(n_products=5, seed=42)
traj = sys.run(n_steps=20, alpha_true=0.25)
print(f"avg reward: {np.mean(traj['rewards']):.2f}, final alpha_hat: {traj['alpha_est'][-1]:.3f}, "
f"COI_policy: {np.mean(traj['coi_policy']):.3f}, COI_agent: {np.mean(traj['coi_agent']):.3f}, leak: {np.mean(traj['coi_leak']):.3f}")
prices = np.array([20.0, 35.0, 50.0, 25.0, 40.0])
costs = np.array([15.0, 28.0, 40.0, 18.0, 30.0])
sessions, demand = put_prices_to_market(prices, costs=costs, alpha=0.3, n_sessions=20, seed=123)
print(f'sessions: {len(sessions)}, agents: {sum(1 for s in sessions if s.actor=="A")}')
for n in [1, 5, 10, 50, 100]:
# theoretical: erosion = 1 - 2/(N+1) for uniform order statistic
print(f'N={n:3d} agents -> COI erosion: {1.0 - 2.0/(n+1):.3f}')
events = [Event('view', 0, 20.0, 0.1), Event('detail', 0, 20.0, 0.5), Event('cart', 0, 20.0, 1.0), Event('purchase', 0, 20.0, 2.0)]
print(f'human-like session alpha_hat: {estimate_alpha(Session(sid="test", events=events, actor="H")):.3f}')
events_a = [Event('view', 0, 20.0, 0.1), Event('detail', 0, 20.0, 0.2), Event('view', 0, 20.0, 0.3), Event('detail', 0, 20.0, 0.4)]
print(f'agent-like session alpha_hat: {estimate_alpha(Session(sid="test2", events=events_a, actor="A")):.3f}')

View File

@@ -0,0 +1,249 @@
"""Gymnasium-compatible RL environment for thesis pricing system.
Wraps simplified.System with standard Gym interface for training pricing policies.
Supports multiple reward modes and contamination scenarios.
Action: price multipliers [0.5, 1.5] applied to reference prices
Observation: [prices, demand_agg, alpha_est, margins, position_proxy]
Reward: configurable objective (revenue, profit, robust, coi-aware)
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Any, Dict, Tuple
import numpy as np
try:
import gymnasium as gym
from gymnasium import spaces
HAS_GYM = True
except ImportError:
HAS_GYM = False
from .simplified import System, Session, Event, Limbo, put_prices_to_market, compute_demand, estimate_alpha
from .coi import COIWindow, compute_coi_window, coi_erosion
@dataclass
class EnvConfig:
n_products: int = 5
max_steps: int = 200
sessions_per_step: int = 30
alpha_true: float = 0.2
alpha_drift: float = 0.0
alpha_bounds: Tuple[float, float] = (0.0, 0.6)
lambda_coi: float = 0.5
lambda_vol: float = 0.1
reward_mode: str = "robust" # revenue | profit | robust | coi_aware
normalize_reward: bool = True
seed: int | None = 42
def aggregate_purchases(sessions: list[Session], n_products: int, costs: np.ndarray) -> Tuple[np.ndarray, float, float]:
"""Aggregate purchases from sessions, returns (counts, revenue, cost)."""
purchases = np.zeros(n_products, dtype=float)
revenue, cost = 0.0, 0.0
for sess in sessions:
for e in sess.events:
if e.action == "purchase" and 0 <= e.product_idx < n_products:
purchases[e.product_idx] += 1.0
revenue += float(e.price_seen)
cost += float(costs[e.product_idx])
return purchases, revenue, cost
class PricingEnv(gym.Env if HAS_GYM else object):
"""RL environment for dynamic pricing under agent contamination.
Platform sets prices p_t, market responds with mixture demand Q(p) = (1-alpha)*D_H + alpha*D_A.
Agent estimates contamination alpha_hat from behavioral signals.
Reward balances profit vs COI leakage.
"""
metadata = {"render_modes": ["human", "ansi"]}
def __init__(self, cfg: EnvConfig | None = None):
if not HAS_GYM:
raise ImportError("gymnasium required")
self.cfg = cfg or EnvConfig()
self.n = self.cfg.n_products
self._sys: System | None = None
self._t = 0
self._alpha = self.cfg.alpha_true
self._last_prices: np.ndarray | None = None
self._last_demand: Dict[str, float] | None = None
self._episode_rewards: list[float] = []
self._demand_agg = np.zeros(self.n)
self.action_space = spaces.Box(low=0.5, high=1.5, shape=(self.n,), dtype=np.float32)
obs_dim = self.n + self.n + 1 + 1 + self.n + 1 # prices + demand + alpha_hat + alpha + margins + t
self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float32)
def _build_obs(self) -> np.ndarray:
if self._sys is None:
return np.zeros(self.observation_space.shape[0], dtype=np.float32)
prices = self._last_prices if self._last_prices is not None else self._sys.refs
return np.concatenate([
prices / (self._sys.refs + 1e-6),
self._demand_agg / (np.sum(self._demand_agg) + 1e-6),
[self._sys.alpha, self._alpha],
(prices - self._sys.costs) / (self._sys.costs + 1e-6),
[self._t / self.cfg.max_steps],
]).astype(np.float32)
def _compute_reward(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
cfg, sys = self.cfg, self._sys
if sys is None:
return 0.0
# aggregate demand per product
agg = np.zeros(self.n)
for sid, q in demand.items():
sess = next((s for s in sys._sessions if s.sid == sid), None)
if sess and sess.events:
agg[sess.events[0].product_idx] += q
self._demand_agg = agg
_, revenue, cost = aggregate_purchases(sys._last_sessions, self.n, sys.costs)
profit = revenue - cost
vol_penalty = 0.0
if self._last_prices is not None:
vol_penalty = cfg.lambda_vol * float(np.mean(np.abs(prices - self._last_prices) / (sys.refs + 1e-6)))
coi = compute_coi_window(sys._last_sessions, sys.costs, demand_mapping=demand)
leak = float(coi.leak)
reward_fns = {
"revenue": lambda: revenue,
"profit": lambda: profit,
"robust": lambda: profit - cfg.lambda_coi * leak - vol_penalty,
"coi_aware": lambda: profit - cfg.lambda_coi * (1 + 2 * sys.alpha) * leak - vol_penalty,
}
r = reward_fns.get(cfg.reward_mode, lambda: profit)()
return float(r / (float(np.sum(sys.refs)) + 1e-6)) if cfg.normalize_reward else float(r)
def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
seed = seed if seed is not None else self.cfg.seed
self._sys = System(n_products=self.n, lambda_coi=self.cfg.lambda_coi, seed=seed)
self._t, self._alpha = 0, self.cfg.alpha_true
self._last_prices, self._last_demand = None, None
self._episode_rewards, self._demand_agg = [], np.zeros(self.n)
return self._build_obs(), {"alpha_true": self._alpha, "alpha_est": self._sys.alpha,
"costs": self._sys.costs.copy(), "refs": self._sys.refs.copy()}
def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]:
if self._sys is None:
raise RuntimeError("call reset() first")
action = np.clip(action, 0.5, 1.5)
prices = np.clip(self._sys.refs * action.astype(np.float64), self._sys.costs * 1.01, self._sys.refs * 2.0)
demand = self._sys.observe_demand(prices, alpha_true=self._alpha, n_sessions=self.cfg.sessions_per_step)
self._sys.limbo.add_update("prices", prices)
self._sys._alpha_est = self._sys._estimate_alpha_from_sessions()
reward = self._compute_reward(prices, demand)
self._episode_rewards.append(reward)
self._last_prices, self._last_demand = prices.copy(), demand
self._t += 1
# compute info metrics using shared helper
purchases, revenue, cost = aggregate_purchases(self._sys._last_sessions, self.n, self._sys.costs)
n_agents = int(self._alpha * self.cfg.sessions_per_step)
coi = compute_coi_window(self._sys._last_sessions, self._sys.costs, demand_mapping=demand)
info = {
"alpha_true": self._alpha, "alpha_est": self._sys.alpha,
"alpha_error": abs(self._alpha - self._sys.alpha),
"revenue": float(revenue), "profit": float(revenue - cost), "cost": float(cost),
"n_purchases": int(np.sum(purchases)),
"avg_margin": float(np.mean((prices - self._sys.costs) / self._sys.costs)),
"n_sessions": len(demand), "n_agents": n_agents, "price_std": float(np.std(prices)),
"coi_erosion": coi_erosion(coi.policy, coi.agent),
"coi_policy": float(coi.policy), "coi_agent": float(coi.agent),
"coi_leakage": float(coi.leak), "coi_survival": float(coi.survival_ratio),
"cumulative_reward": sum(self._episode_rewards), "step": self._t,
}
return self._build_obs(), reward, self._t >= self.cfg.max_steps, False, info
def render(self, mode: str = "human") -> str | None:
if self._sys is None or self._last_prices is None:
return None
out = f"t={self._t}/{self.cfg.max_steps} | alpha_true={self._alpha:.3f} alpha_hat={self._sys.alpha:.3f} | " \
f"prices: {self._last_prices.round(1)} | demand: {self._demand_agg.round(2)} | " \
f"reward: {self._episode_rewards[-1] if self._episode_rewards else 0:.3f}"
if mode == "human":
print(out)
return out
def close(self) -> None:
pass
class ContaminationSweepEnv(PricingEnv):
"""Environment that sweeps through contamination levels during training."""
def __init__(self, cfg: EnvConfig | None = None, alpha_schedule: list[float] | None = None):
super().__init__(cfg)
self._schedule = alpha_schedule or [0.1, 0.2, 0.3, 0.4, 0.5]
self._schedule_idx = 0
def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
if options and options.get("advance_schedule", False):
self._schedule_idx = (self._schedule_idx + 1) % len(self._schedule)
self.cfg.alpha_true = self._schedule[self._schedule_idx]
return super().reset(seed, options)
class AdversarialEnv(PricingEnv):
"""Environment with adversarial contamination dynamics.
Contamination increases when prices are predictable (agents exploit).
"""
def __init__(self, cfg: EnvConfig | None = None, exploitation_rate: float = 0.02):
super().__init__(cfg)
self._exploit_rate = exploitation_rate
self._price_history: list[np.ndarray] = []
def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]:
obs, reward, term, trunc, info = super().step(action)
if self._last_prices is not None:
self._price_history.append(self._last_prices.copy())
predictability = 0.0
if len(self._price_history) > 10:
predictability = 1.0 / (float(np.std(self._price_history[-10:])) + 0.1)
self._alpha = np.clip(self._alpha + self._exploit_rate * predictability * self._sys.rng.random(), *self.cfg.alpha_bounds)
info["predictability"] = predictability
return obs, reward, term, trunc, info
def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
self._price_history = []
return super().reset(seed, options)
def make_env(cfg: EnvConfig | None = None, env_type: str = "standard") -> PricingEnv:
return {"sweep": ContaminationSweepEnv, "adversarial": AdversarialEnv}.get(env_type, PricingEnv)(cfg)
# baseline policies
fixed_price_policy = lambda refs, margin=0.0: np.ones(len(refs), dtype=np.float32) * (1.0 + margin)
random_policy = lambda n, rng=None: (rng or np.random.default_rng()).uniform(0.7, 1.3, n).astype(np.float32)
adaptive_policy = lambda obs, n, base=0.1: np.ones(n, dtype=np.float32) * (1.0 + base * (1.0 - 0.4 * obs[2 * n]))
if __name__ == "__main__":
cfg = EnvConfig(n_products=100, max_steps=100, alpha_true=0.25, reward_mode="robust")
env = make_env(cfg)
obs, info = env.reset()
print(f"initial: alpha={info['alpha_true']:.2f}")
total_reward = 0.0
for t in range(cfg.max_steps):
action = adaptive_policy(obs, cfg.n_products)
obs, reward, done, _, info = env.step(action)
total_reward += reward
if t % 10 == 0:
env.render()
if done:
break
print(f"\ntotal reward: {total_reward:.2f}, final alpha_hat: {info['alpha_est']:.3f}")

View File

@@ -0,0 +1,168 @@
"""Summarize TensorBoard logs into comparison tables."""
from __future__ import annotations
import json
import re
from pathlib import Path
from collections import defaultdict
from dataclasses import dataclass
import pandas as pd
try:
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
HAS_TB = True
except ImportError:
HAS_TB = False
@dataclass
class RunInfo:
algo: str
alpha: float
reward_mode: str
path: Path
def parse_run_name(name: str) -> RunInfo | None:
"""Extract algo, alpha, reward_mode from run directory name."""
# patterns: ppo_a0.20_robust, cmp_fixed_a0.20, sac_a0.90_robust
m = re.match(r'(cmp_)?(\w+)_a([\d.]+)_?(\w+)?', name)
if not m:
return None
prefix, algo, alpha, mode = m.groups()
return RunInfo(algo=algo, alpha=float(alpha), reward_mode=mode or 'robust', path=Path())
def load_tb_scalars(log_dir: Path, tags: list[str], reduce: str = 'last') -> dict[str, float]:
"""Load scalar values from TensorBoard event files."""
if not HAS_TB:
return {}
ea = EventAccumulator(str(log_dir))
ea.Reload()
results = {}
for tag in tags:
if tag in ea.Tags().get('scalars', []):
events = ea.Scalars(tag)
if not events:
continue
vals = [e.value for e in events]
if reduce == 'last':
results[tag] = vals[-1]
elif reduce == 'mean':
results[tag] = sum(vals) / len(vals)
elif reduce == 'max':
results[tag] = max(vals)
elif reduce == 'min':
results[tag] = min(vals)
return results
def load_json_results(log_dir: Path) -> dict[str, float]:
"""Load metrics from results.json if available."""
results_file = log_dir / 'results.json'
if results_file.exists():
with open(results_file) as f:
return json.load(f)
return {}
def discover_runs(base_dir: Path) -> list[RunInfo]:
"""Find all experiment runs in base directory."""
runs = []
for d in base_dir.iterdir():
if not d.is_dir():
continue
info = parse_run_name(d.name)
if info:
info.path = d
runs.append(info)
return runs
def build_tables(runs: list[RunInfo], metrics: list[str], reduce: str = 'last') -> dict[str, dict[str, pd.DataFrame]]:
"""Build pivot tables: reward_mode -> metric -> DataFrame[alpha x algo]."""
# collect data: {reward_mode: {metric: {(alpha, algo): value}}}
data = defaultdict(lambda: defaultdict(dict))
tb_tags = [f'economics/{m}' if m in ['revenue', 'profit', 'margin'] else f'coi/{m}' if m in ['erosion', 'leakage'] else f'alpha/{m}' for m in metrics]
tag_map = dict(zip(tb_tags, metrics))
for run in runs:
# try json first (final eval metrics)
jm = load_json_results(run.path)
tb = load_tb_scalars(run.path, tb_tags, reduce)
for tag, metric in tag_map.items():
val = None
json_key = f'{metric}_mean' if metric != 'reward' else 'reward_mean'
if json_key in jm:
val = jm[json_key]
elif tag in tb:
val = tb[tag]
if val is not None:
data[run.reward_mode][metric][(run.alpha, run.algo)] = val
# convert to DataFrames
tables = {}
for mode, metrics_data in data.items():
tables[mode] = {}
for metric, vals in metrics_data.items():
if not vals:
continue
alphas = sorted(set(a for a, _ in vals.keys()))
algos = sorted(set(al for _, al in vals.keys()))
df = pd.DataFrame(index=alphas, columns=algos, dtype=float)
for (a, al), v in vals.items():
df.loc[a, al] = v
df.index.name = 'alpha'
tables[mode][metric] = df
return tables
def format_table(df: pd.DataFrame, fmt: str = '.3f') -> str:
"""Format DataFrame as markdown table."""
return df.to_markdown(floatfmt=fmt)
def summarize(base_dir: str = 'sim/case/thesis_simplified/runs',
metrics: list[str] | None = None,
reduce: str = 'last',
output: str | None = None) -> dict:
"""Generate summary tables from experiment runs."""
base = Path(base_dir)
metrics = metrics or ['revenue', 'profit', 'margin', 'erosion', 'leakage']
runs = discover_runs(base)
if not runs:
print(f"No runs found in {base}")
return {}
print(f"Found {len(runs)} runs")
tables = build_tables(runs, metrics, reduce)
lines = []
for mode, metric_tables in sorted(tables.items()):
lines.append(f"\n# Reward Mode: {mode}\n")
for metric, df in sorted(metric_tables.items()):
lines.append(f"\n## {metric}\n")
lines.append(format_table(df))
lines.append("")
report = '\n'.join(lines)
print(report)
if output:
Path(output).write_text(report)
print(f"\nSaved to {output}")
return tables
if __name__ == '__main__':
import argparse
p = argparse.ArgumentParser()
p.add_argument('--dir', default='sim/case/thesis_simplified/runs')
p.add_argument('--metrics', nargs='+', default=['revenue', 'profit', 'margin', 'erosion', 'leakage'])
p.add_argument('--reduce', default='last', choices=['last', 'mean', 'max', 'min'])
p.add_argument('--output', '-o', help='save markdown to file')
args = p.parse_args()
summarize(args.dir, args.metrics, args.reduce, args.output)

View File

@@ -0,0 +1,336 @@
"""RL training for thesis pricing system with thesis-aligned metrics.
Trains pricing policies using stable-baselines3 with TensorBoard logging.
Tracks COI erosion, alpha estimation error, and economic KPIs per thesis formulation.
"""
from __future__ import annotations
import argparse
import json
from concurrent.futures import ProcessPoolExecutor, as_completed
from dataclasses import dataclass, asdict, field
from pathlib import Path
from typing import Dict, List, Callable, Any
import numpy as np
try:
from stable_baselines3 import PPO, SAC, A2C
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
HAS_SB3 = True
except ImportError:
HAS_SB3 = False
try:
from torch.utils.tensorboard import SummaryWriter
HAS_TB = True
except ImportError:
HAS_TB = False
from .simplified_env import PricingEnv, EnvConfig, make_env, adaptive_policy, fixed_price_policy, random_policy
@dataclass
class EpisodeMetrics:
reward: float = 0.0
revenue: float = 0.0
profit: float = 0.0
coi_erosion: float = 0.0
coi_leakage: float = 0.0
alpha_error: float = 0.0
avg_margin: float = 0.0
n_agents: int = 0
steps: int = 0
def accumulate(self, info: Dict[str, Any]) -> None:
self.steps += 1
self.reward += info.get('reward', 0)
self.revenue += info.get('revenue', 0)
self.profit += info.get('profit', 0)
self.coi_erosion += info.get('coi_erosion', 0)
self.coi_leakage += info.get('coi_leakage', 0)
self.alpha_error += abs(info.get('alpha_true', 0) - info.get('alpha_est', 0))
self.avg_margin += info.get('avg_margin', 0)
self.n_agents += info.get('n_agents', 0)
def normalized(self) -> Dict[str, float]:
s = max(self.steps, 1)
return {k: getattr(self, k) / s for k in ['revenue', 'profit', 'coi_erosion', 'coi_leakage', 'alpha_error', 'avg_margin', 'n_agents']}
@dataclass
class ExperimentConfig:
algo: str = "ppo"
total_timesteps: int = 100_000
n_envs: int = 4
eval_freq: int = 5000
n_eval_episodes: int = 10
log_dir: str = "sim/case/thesis_simplified/runs"
seed: int = 42
n_products: int = 10
max_steps: int = 200
alpha_true: float = 0.2
reward_mode: str = "robust"
experiment_name: str | None = None
def __post_init__(self):
if self.experiment_name is None:
self.experiment_name = f"{self.algo}_a{self.alpha_true:.2f}_{self.reward_mode}"
class Policy:
"""Unified policy interface for baselines and trained models."""
def __init__(self, policy_fn: Callable[[np.ndarray, int], np.ndarray], name: str):
self._fn, self.name = policy_fn, name
def predict(self, obs: np.ndarray, deterministic: bool = True) -> tuple[np.ndarray, None]:
return self._fn(obs, (len(obs) - 3) // 3), None
@staticmethod
def fixed(margin: float = 0.15) -> "Policy":
return Policy(lambda obs, n: fixed_price_policy(np.ones(n), margin), f"fixed_{margin:.2f}")
@staticmethod
def adaptive(base_margin: float = 0.15) -> "Policy":
return Policy(lambda obs, n: adaptive_policy(obs, n, base_margin), f"adaptive_{base_margin:.2f}")
@staticmethod
def random() -> "Policy":
return Policy(lambda obs, n: random_policy(n), "random")
@staticmethod
def myopic(greed: float = 0.3) -> "Policy":
def _fn(obs: np.ndarray, n: int) -> np.ndarray:
demand_norm = obs[n:2*n] if len(obs) > 2*n else np.ones(n) * 0.5
return np.ones(n, dtype=np.float32) * np.clip(1.0 + greed * (1 + np.mean(demand_norm)), 0.5, 1.5)
return Policy(_fn, f"myopic_{greed:.1f}")
def log_metrics(writer: SummaryWriter | None, metrics: Dict[str, float], prefix: str, step: int) -> None:
if writer is None:
return
for k, v in metrics.items():
writer.add_scalar(f'{prefix}/{k}', v, step)
class MetricsCallback(BaseCallback):
def __init__(self, writer: SummaryWriter | None, verbose: int = 0):
super().__init__(verbose)
self._writer = writer
def _on_step(self) -> bool:
if self._writer is None:
return True
for info in self.locals.get('infos', []):
t = self.num_timesteps
self._writer.add_scalar('economics/revenue', info.get('revenue', 0), t)
self._writer.add_scalar('economics/profit', info.get('profit', 0), t)
self._writer.add_scalar('economics/margin', info.get('avg_margin', 0), t)
self._writer.add_scalar('coi/erosion', info.get('coi_erosion', 0), t)
self._writer.add_scalar('coi/leakage', info.get('coi_leakage', 0), t)
self._writer.add_scalar('alpha/estimation_error', abs(info.get('alpha_true', 0) - info.get('alpha_est', 0)), t)
self._writer.add_scalar('agents/count', info.get('n_agents', 0), t)
return True
def make_vec_env(cfg: ExperimentConfig, n_envs: int = 1) -> DummyVecEnv:
def _make():
return Monitor(make_env(EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed)))
return DummyVecEnv([_make for _ in range(n_envs)])
def run_episodes(policy: Policy | Any, env: PricingEnv, n_episodes: int) -> List[EpisodeMetrics]:
"""Run policy for n episodes and collect metrics."""
metrics = []
for _ in range(n_episodes):
obs, _ = env.reset()
ep, done = EpisodeMetrics(), False
while not done:
action, _ = policy.predict(obs, deterministic=True)
obs, reward, term, trunc, info = env.step(action)
done = term or trunc
ep.accumulate(info)
ep.reward += reward
metrics.append(ep)
return metrics
def evaluate_policy(policy: Policy | Any, cfg: ExperimentConfig, n_episodes: int = 20) -> Dict[str, float]:
env = make_env(EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed + 999))
metrics = run_episodes(policy, env, n_episodes)
return {
'reward_mean': np.mean([m.reward for m in metrics]), 'reward_std': np.std([m.reward for m in metrics]),
**{f'{k}_mean': np.mean([m.normalized()[k] for m in metrics])
for k in ['revenue', 'profit', 'coi_erosion', 'coi_leakage', 'alpha_error', 'avg_margin']},
}
def run_baseline(policy: Policy, vec_env: DummyVecEnv, total_steps: int, writer: SummaryWriter | None):
obs, n_envs = vec_env.reset(), vec_env.num_envs
ep_rewards = np.zeros(n_envs)
for step in range(0, total_steps, n_envs):
actions = np.array([policy.predict(obs[i])[0] for i in range(n_envs)])
obs, rewards, dones, infos = vec_env.step(actions)
ep_rewards += rewards
for i, info in enumerate(infos):
if writer:
writer.add_scalar('economics/revenue', info.get('revenue', 0), step)
writer.add_scalar('economics/profit', info.get('profit', 0), step)
writer.add_scalar('economics/margin', info.get('avg_margin', 0), step)
writer.add_scalar('coi/erosion', info.get('coi_erosion', 0), step)
writer.add_scalar('coi/leakage', info.get('coi_leakage', 0), step)
writer.add_scalar('alpha/estimation_error', abs(info.get('alpha_true', 0) - info.get('alpha_est', 0)), step)
writer.add_scalar('agents/count', info.get('n_agents', 0), step)
if dones[i]:
if writer:
writer.add_scalar('rollout/ep_reward', ep_rewards[i], step)
ep_rewards[i] = 0
def train(cfg: ExperimentConfig) -> Dict[str, Any]:
is_baseline = cfg.algo.lower() in ["fixed", "adaptive", "random", "myopic"]
if not HAS_SB3 and not is_baseline:
raise ImportError("stable-baselines3 required: pip install stable-baselines3[extra]")
log_path = Path(cfg.log_dir) / cfg.experiment_name
log_path.mkdir(parents=True, exist_ok=True)
with open(log_path / "config.json", "w") as f:
json.dump(asdict(cfg), f, indent=2)
writer = SummaryWriter(log_path) if HAS_TB else None
train_env, eval_env = make_vec_env(cfg, cfg.n_envs), make_vec_env(cfg, 1)
if is_baseline:
policy = {"fixed": Policy.fixed, "adaptive": Policy.adaptive, "random": Policy.random, "myopic": Policy.myopic}[cfg.algo.lower()]()
run_baseline(policy, train_env, cfg.total_timesteps, writer)
final_metrics = evaluate_policy(policy, cfg)
else:
algo_cls = {"ppo": PPO, "sac": SAC, "a2c": A2C}[cfg.algo.lower()]
common = dict(verbose=1, seed=cfg.seed, tensorboard_log=str(log_path), device="auto")
model = {
"ppo": lambda: PPO("MlpPolicy", train_env, learning_rate=3e-4, n_steps=2048, batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95, clip_range=0.2, ent_coef=0.01, **common),
"sac": lambda: SAC("MlpPolicy", train_env, learning_rate=1e-4, buffer_size=50_000, batch_size=512, tau=0.02, gamma=0.99, learning_starts=1000, ent_coef="auto_0.1", train_freq=4, **common),
"a2c": lambda: A2C("MlpPolicy", train_env, learning_rate=7e-4, n_steps=5, gamma=0.99, **common),
}[cfg.algo.lower()]()
cb = MetricsCallback(writer)
eval_cb = EvalCallback(eval_env, best_model_save_path=str(log_path / "best"), log_path=str(log_path),
eval_freq=cfg.eval_freq, n_eval_episodes=cfg.n_eval_episodes, deterministic=True)
model.learn(cfg.total_timesteps, callback=[cb, eval_cb], progress_bar=True)
model.save(log_path / "final_model")
policy = model
final_metrics = evaluate_policy(model, cfg)
if writer:
log_metrics(writer, final_metrics, 'final', cfg.total_timesteps)
writer.close()
train_env.close(); eval_env.close()
with open(log_path / "results.json", "w") as f:
json.dump(final_metrics, f, indent=2)
return {"path": str(log_path), "metrics": final_metrics}
def _train_alpha(args: tuple) -> tuple[str, Dict]:
"""Worker for parallel sweep - must be top-level for pickling."""
cfg_dict, alpha = args
cfg_dict["alpha_true"] = alpha
cfg_dict["experiment_name"] = f"{cfg_dict['algo']}_a{alpha:.2f}_{cfg_dict['reward_mode']}"
sweep_cfg = ExperimentConfig(**cfg_dict)
print(f"[alpha={alpha:.2f}] starting")
metrics = train(sweep_cfg)["metrics"]
print(f"[alpha={alpha:.2f}] done")
return f"alpha_{alpha:.2f}", metrics
def run_sweep(cfg: ExperimentConfig, alphas: List[float] | None = None, max_workers: int | None = None) -> Dict[str, Dict]:
alphas = alphas or [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
cfg_dict = asdict(cfg)
if max_workers == 1: # sequential fallback
results = dict(_train_alpha((cfg_dict.copy(), a)) for a in alphas)
else:
with ProcessPoolExecutor(max_workers=max_workers) as pool:
futures = {pool.submit(_train_alpha, (cfg_dict.copy(), a)): a for a in alphas}
results = {}
for fut in as_completed(futures):
key, metrics = fut.result()
results[key] = metrics
summary_path = Path(cfg.log_dir) / f"sweep_{cfg.algo}_{cfg.reward_mode}.json"
with open(summary_path, "w") as f:
json.dump(results, f, indent=2)
print(f"\nSweep results saved to {summary_path}")
return results
def _train_policy(args: tuple) -> tuple[str, Dict]:
"""Worker for parallel policy comparison."""
cfg_dict, algo = args
cfg_dict["algo"] = algo
cfg_dict["experiment_name"] = f"cmp_{algo}_a{cfg_dict['alpha_true']:.2f}"
cmp_cfg = ExperimentConfig(**cfg_dict)
print(f"[{algo}] starting")
metrics = train(cmp_cfg)["metrics"]
print(f"[{algo}] done")
return algo, metrics
def compare_policies(cfg: ExperimentConfig, policies: List[str] | None = None, max_workers: int | None = None) -> Dict[str, Dict]:
policies = policies or ["fixed", "adaptive", "myopic", "random"]
cfg_dict = asdict(cfg)
if max_workers == 1:
results = dict(_train_policy((cfg_dict.copy(), p)) for p in policies)
else:
with ProcessPoolExecutor(max_workers=max_workers) as pool:
futures = {pool.submit(_train_policy, (cfg_dict.copy(), p)): p for p in policies}
results = {}
for fut in as_completed(futures):
algo, metrics = fut.result()
results[algo] = metrics
cmp_path = Path(cfg.log_dir) / f"compare_a{cfg.alpha_true:.2f}.json"
with open(cmp_path, "w") as f:
json.dump(results, f, indent=2)
print(f"\nComparison saved to {cmp_path}")
for algo, m in results.items():
print(f" {algo:12s}: reward={m['reward_mean']:.2f} coi_erosion={m['coi_erosion_mean']:.4f} alpha_err={m['alpha_error_mean']:.4f}")
return results
def main():
parser = argparse.ArgumentParser(description="Train RL pricing policies")
parser.add_argument("--algo", default="ppo", choices=["ppo", "sac", "a2c", "fixed", "adaptive", "random", "myopic"])
parser.add_argument("--steps", type=int, default=100_000)
parser.add_argument("--alpha", type=float, default=0.2)
parser.add_argument("--reward-mode", default="robust", choices=["revenue", "profit", "robust", "coi_aware"])
parser.add_argument("--n-products", type=int, default=10)
parser.add_argument("--n-envs", type=int, default=4)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--log-dir", default="sim/case/thesis_simplified/runs")
parser.add_argument("--sweep", action="store_true", help="run contamination sweep")
parser.add_argument("--compare", action="store_true", help="compare all baselines")
parser.add_argument("--workers", type=int, default=None, help="max parallel workers for sweep (None=auto, 1=sequential)")
args = parser.parse_args()
cfg = ExperimentConfig(algo=args.algo, total_timesteps=args.steps, alpha_true=args.alpha,
reward_mode=args.reward_mode, n_products=args.n_products,
n_envs=args.n_envs, seed=args.seed, log_dir=args.log_dir)
if args.sweep:
run_sweep(cfg, max_workers=args.workers)
elif args.compare:
compare_policies(cfg, max_workers=args.workers)
else:
result = train(cfg)
print(f"\nTraining complete: {result['path']}")
print(f"Metrics: {json.dumps(result['metrics'], indent=2)}")
if __name__ == "__main__":
main()