chore: refactor session mapping

This commit is contained in:
2026-01-24 14:21:35 +01:00
parent c5eae17924
commit bae51daa1c
2 changed files with 241 additions and 377 deletions

View File

@@ -93,15 +93,15 @@ def sample_trajectory(rng: np.random.Generator, trans: Dict, prices: np.ndarray,
def put_prices_to_market(prices: np.ndarray, alpha: float = 0.2, n_sessions: int = 50, def put_prices_to_market(prices: np.ndarray, alpha: float = 0.2, n_sessions: int = 50,
seed: int | None = None) -> Tuple[Dict[str, float], Dict[str, str]]: seed: int | None = None) -> Tuple[List[Session], Dict[str, float]]:
"""Generate sessions from mixture model Q(p) = (1-α)E[d_H] + αE[d_A] (Eq 3). """Generate sessions from mixture model Q(p) = (1-α)E[d_H] + αE[d_A] (Eq 3).
Returns: Returns:
sessions: list of Session objects with events and product attribution
demand_mapping: session_id -> demand proxy q̂ demand_mapping: session_id -> demand proxy q̂
hidden_labels: session_id -> actor class (H or A)
""" """
rng = np.random.default_rng(seed) rng = np.random.default_rng(seed)
demand_mapping, hidden_labels = {}, {} sessions, demand_mapping = [], {}
for i in range(n_sessions): for i in range(n_sessions):
sid = f"s{i:04d}" sid = f"s{i:04d}"
@@ -110,10 +110,10 @@ def put_prices_to_market(prices: np.ndarray, alpha: float = 0.2, n_sessions: int
theta = {"price_sens": rng.uniform(0.05, 0.2), "base_conv": 0.01} if is_agent else {"price_sens": rng.uniform(1.5, 4.0), "base_conv": rng.uniform(0.2, 0.5)} theta = {"price_sens": rng.uniform(0.05, 0.2), "base_conv": 0.01} if is_agent else {"price_sens": rng.uniform(1.5, 4.0), "base_conv": rng.uniform(0.2, 0.5)}
events, _ = sample_trajectory(rng, trans, prices, is_agent) events, _ = sample_trajectory(rng, trans, prices, is_agent)
session = Session(sid=sid, events=events, actor="A" if is_agent else "H", theta=theta) session = Session(sid=sid, events=events, actor="A" if is_agent else "H", theta=theta)
sessions.append(session)
demand_mapping[sid] = compute_demand(session) demand_mapping[sid] = compute_demand(session)
hidden_labels[sid] = session.actor
return demand_mapping, hidden_labels return sessions, demand_mapping
@dataclass @dataclass
@@ -190,9 +190,16 @@ class System:
agg_demand[pidx] += q agg_demand[pidx] += q
return float(np.dot(prices, agg_demand)) return float(np.dot(prices, agg_demand))
def _coi_leakage(self, prices: np.ndarray) -> float: def _coi_leakage(self, prices: np.ndarray, n_agents: int = 1) -> float:
"""COI_leak = α · InfoValue (query-tax surrogate).""" """COI leakage tied to Theorem 1: erosion from order statistic collapse.
return self._alpha_est * 1.0
As N agents query, min(p_1..p_N) → p_min and COI → 0.
Leakage = erosion_rate × margin_at_risk
"""
price_std = float(np.std(prices))
erosion = coi_erosion(max(1, n_agents), price_std)
margin_at_risk = float(np.mean(prices - self.costs))
return erosion * margin_at_risk
def _objective(self, prices: np.ndarray, demand: Dict[str, float]) -> float: def _objective(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
"""Robust objective: R(p,d) - λ·COI_leak (Eq 23 simplified).""" """Robust objective: R(p,d) - λ·COI_leak (Eq 23 simplified)."""
@@ -223,13 +230,8 @@ class System:
def observe_demand(self, prices: np.ndarray, alpha_true: float = 0.2, n_sessions: int = 50) -> Dict[str, float]: def observe_demand(self, prices: np.ndarray, alpha_true: float = 0.2, n_sessions: int = 50) -> Dict[str, float]:
"""Observe market response to prices.""" """Observe market response to prices."""
demand_map, labels = put_prices_to_market(prices, alpha=alpha_true, n_sessions=n_sessions, seed=int(self.rng.integers(0, 10000))) sessions, demand_map = put_prices_to_market(prices, alpha=alpha_true, n_sessions=n_sessions, seed=int(self.rng.integers(0, 10000)))
self._sessions.extend(sessions) # store actual sessions for correct product attribution
# reconstruct sessions for α estimation
for sid, actor in labels.items():
events, _ = sample_trajectory(self.rng, TRANS_A if actor == "A" else TRANS_H, prices, actor == "A")
self._sessions.append(Session(sid=sid, events=events, actor=actor))
self.limbo.add_update("demand", demand_map) self.limbo.add_update("demand", demand_map)
return demand_map return demand_map
@@ -269,8 +271,8 @@ if __name__ == "__main__":
print(f"avg reward: {np.mean(traj['rewards']):.2f}, final α̂: {traj['alpha_est'][-1]:.3f}") print(f"avg reward: {np.mean(traj['rewards']):.2f}, final α̂: {traj['alpha_est'][-1]:.3f}")
prices = np.array([20.0, 35.0, 50.0, 25.0, 40.0]) prices = np.array([20.0, 35.0, 50.0, 25.0, 40.0])
demand, labels = put_prices_to_market(prices, alpha=0.3, n_sessions=20, seed=123) sessions, demand = put_prices_to_market(prices, alpha=0.3, n_sessions=20, seed=123)
print(f'sessions: {len(demand)}, agents: {sum(1 for l in labels.values() if l=="A")}') print(f'sessions: {len(sessions)}, agents: {sum(1 for s in sessions if s.actor=="A")}')
for n in [1, 5, 10, 50, 100]: for n in [1, 5, 10, 50, 100]:
ero = coi_erosion(n, price_std=5.0) ero = coi_erosion(n, price_std=5.0)

View File

@@ -1,15 +1,17 @@
"""RL training for thesis pricing system with COI tracking. """RL training for thesis pricing system with thesis-aligned metrics.
Trains pricing policies using stable-baselines3 with TensorBoard logging. Trains pricing policies using stable-baselines3 with TensorBoard logging.
Demonstrates COI leakage under different contamination levels and policies. Tracks COI erosion, alpha estimation error, and economic KPIs per thesis formulation.
Usage: Usage:
python -m lab.case.thesis.train --algo ppo --alpha 0.3 --steps 100000 python -m lab.case.thesis.train --algo ppo --alpha 0.3 --steps 100000
python -m lab.case.thesis.train --algo adaptive --sweep # run alpha sweep
tensorboard --logdir lab/case/thesis/runs tensorboard --logdir lab/case/thesis/runs
""" """
from __future__ import annotations from __future__ import annotations
import argparse import argparse
from dataclasses import dataclass, field import json
from dataclasses import dataclass, asdict
from pathlib import Path from pathlib import Path
from typing import Dict, List, Callable, Any from typing import Dict, List, Callable, Any
import numpy as np import numpy as np
@@ -17,9 +19,8 @@ import numpy as np
try: try:
from stable_baselines3 import PPO, SAC, A2C from stable_baselines3 import PPO, SAC, A2C
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback from stable_baselines3.common.callbacks import BaseCallback, EvalCallback
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.logger import configure
HAS_SB3 = True HAS_SB3 = True
except ImportError: except ImportError:
HAS_SB3 = False HAS_SB3 = False
@@ -34,322 +35,203 @@ from .simplified_env import PricingEnv, EnvConfig, make_env, adaptive_policy, fi
from .simplified import coi_erosion from .simplified import coi_erosion
class BaselinePolicy: # thesis-aligned KPIs tracked per episode
"""Wrapper to make baseline policies compatible with SB3 interface.""" @dataclass
class EpisodeMetrics:
def __init__(self, policy_fn, name: str): reward: float = 0.0
self.policy_fn = policy_fn revenue: float = 0.0
self.name = name profit: float = 0.0
self.num_timesteps = 0 coi_erosion: float = 0.0 # theorem 1: order statistic erosion
coi_leakage: float = 0.0 # per-step leakage penalty
def predict(self, obs, deterministic: bool = True): alpha_error: float = 0.0 # |α - α̂|
n = (len(obs) - 3) // 3 # infer n_products from obs shape avg_margin: float = 0.0
action = self.policy_fn(obs, n) n_agents: int = 0
return action, None steps: int = 0
def learn(self, total_timesteps: int, callback=None, progress_bar: bool = False):
self.num_timesteps = total_timesteps
return self
def save(self, path):
pass # no-op for baselines
@staticmethod
def load(path):
raise NotImplementedError("baselines cannot be loaded")
def myopic_policy(obs: np.ndarray, n: int, greed: float = 0.3) -> np.ndarray:
"""Myopic: maximize immediate margin, ignore alpha and future COI erosion.
Greedy short-term optimizer that sets high prices when demand looks good,
completely ignoring the alpha estimate and long-term consequences.
"""
demand_norm = obs[n:2*n] if len(obs) > 2*n else np.ones(n) * 0.5
avg_demand = np.mean(demand_norm)
multiplier = 1.0 + greed * (1 + avg_demand)
return np.ones(n, dtype=np.float32) * np.clip(multiplier, 0.5, 1.5)
def random_myopic_policy(obs: np.ndarray, n: int) -> np.ndarray:
"""Random myopic: iid random prices each step, no state awareness.
Represents worst-case baseline where pricing has no strategy at all.
"""
return np.random.uniform(0.8, 1.4, n).astype(np.float32)
@dataclass @dataclass
class TrainConfig: class ExperimentConfig:
"""Training configuration.""" """Full experiment specification for reproducibility."""
algo: str = "ppo" # ppo | sac | a2c algo: str = "ppo"
total_timesteps: int = 100_000 total_timesteps: int = 100_000
n_envs: int = 4 n_envs: int = 4
eval_freq: int = 5000 eval_freq: int = 5000
n_eval_episodes: int = 10 n_eval_episodes: int = 10
log_dir: str = "lab/case/thesis/runs" log_dir: str = "lab/case/thesis/runs"
seed: int = 42 seed: int = 42
# env config
n_products: int = 10 n_products: int = 10
max_steps: int = 200 max_steps: int = 200
alpha_true: float = 0.2 alpha_true: float = 0.2
reward_mode: str = "robust" reward_mode: str = "robust"
# baseline sweep experiment_name: str | None = None
run_baselines: bool = True
alpha_sweep: List[float] = field(default_factory=lambda: [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]) def __post_init__(self):
if self.experiment_name is None:
self.experiment_name = f"{self.algo}_a{self.alpha_true:.2f}_{self.reward_mode}"
class COICallback(BaseCallback): # unified policy interface wrapping all baselines
"""Custom callback for tracking COI metrics in TensorBoard.""" class Policy:
"""Unified policy interface for baselines and trained models."""
def __init__(self, writer: Any = None, verbose: int = 0): def __init__(self, policy_fn: Callable[[np.ndarray, int], np.ndarray], name: str):
self._fn = policy_fn
self.name = name
def predict(self, obs: np.ndarray, deterministic: bool = True) -> tuple[np.ndarray, None]:
n = (len(obs) - 3) // 3
return self._fn(obs, n), None
@staticmethod
def fixed(margin: float = 0.15) -> "Policy":
return Policy(lambda obs, n: fixed_price_policy(np.ones(n), margin), f"fixed_{margin:.2f}")
@staticmethod
def adaptive(base_margin: float = 0.15) -> "Policy":
return Policy(lambda obs, n: adaptive_policy(obs, n, base_margin), f"adaptive_{base_margin:.2f}")
@staticmethod
def random() -> "Policy":
return Policy(lambda obs, n: random_policy(n), "random")
@staticmethod
def myopic(greed: float = 0.3) -> "Policy":
"""Myopic: maximize immediate margin, ignore alpha."""
def _fn(obs: np.ndarray, n: int) -> np.ndarray:
demand_norm = obs[n:2*n] if len(obs) > 2*n else np.ones(n) * 0.5
mult = 1.0 + greed * (1 + np.mean(demand_norm))
return np.ones(n, dtype=np.float32) * np.clip(mult, 0.5, 1.5)
return Policy(_fn, f"myopic_{greed:.1f}")
class MetricsCallback(BaseCallback):
"""Tracks thesis-aligned metrics during RL training."""
def __init__(self, writer: SummaryWriter | None, verbose: int = 0):
super().__init__(verbose) super().__init__(verbose)
self._writer = writer self._writer = writer
self._episode_coi_leak = [] self._ep = EpisodeMetrics()
self._episode_alpha_err = [] self._buffer: List[EpisodeMetrics] = []
self._episode_revenues = []
self._episode_margins = []
def _on_step(self) -> bool: def _on_step(self) -> bool:
infos = self.locals.get('infos', []) for info in self.locals.get('infos', []):
for info in infos: self._ep.steps += 1
if 'alpha_true' in info and 'alpha_est' in info: self._ep.reward += info.get('reward', 0)
self._episode_alpha_err.append(abs(info['alpha_true'] - info['alpha_est'])) self._ep.revenue += info.get('revenue', 0)
if 'coi_erosion' in info: self._ep.profit += info.get('profit', 0)
self._episode_coi_leak.append(info['coi_erosion']) self._ep.coi_erosion += info.get('coi_erosion', 0)
if 'revenue' in info: self._ep.coi_leakage += info.get('coi_leakage', 0)
self._episode_revenues.append(info['revenue']) self._ep.alpha_error += abs(info.get('alpha_true', 0) - info.get('alpha_est', 0))
if 'avg_margin' in info: self._ep.avg_margin += info.get('avg_margin', 0)
self._episode_margins.append(info['avg_margin']) self._ep.n_agents += info.get('n_agents', 0)
return True return True
def _on_rollout_end(self) -> None: def _on_rollout_end(self) -> None:
if self._writer is None: if self._ep.steps == 0 or self._writer is None:
return return
step = self.num_timesteps s, step = self._ep.steps, self.num_timesteps
if self._episode_coi_leak: self._writer.add_scalar('economics/revenue', self._ep.revenue / s, step)
self._writer.add_scalar('coi/erosion_mean', np.mean(self._episode_coi_leak), step) self._writer.add_scalar('economics/profit', self._ep.profit / s, step)
self._writer.add_scalar('coi/erosion_max', np.max(self._episode_coi_leak), step) self._writer.add_scalar('economics/margin', self._ep.avg_margin / s, step)
if self._episode_alpha_err: self._writer.add_scalar('coi/erosion', self._ep.coi_erosion / s, step)
self._writer.add_scalar('alpha/estimation_error', np.mean(self._episode_alpha_err), step) self._writer.add_scalar('coi/leakage', self._ep.coi_leakage / s, step)
if self._episode_revenues: self._writer.add_scalar('alpha/estimation_error', self._ep.alpha_error / s, step)
self._writer.add_scalar('economics/revenue_mean', np.mean(self._episode_revenues), step) self._writer.add_scalar('agents/count', self._ep.n_agents / s, step)
if self._episode_margins: self._buffer.append(self._ep)
self._writer.add_scalar('economics/margin_mean', np.mean(self._episode_margins), step) self._ep = EpisodeMetrics()
self._episode_coi_leak.clear()
self._episode_alpha_err.clear()
self._episode_revenues.clear()
self._episode_margins.clear()
def run_baseline_with_logging(model: BaselinePolicy, vec_env, total_timesteps: int, writer: Any) -> None: def make_vec_env(cfg: ExperimentConfig, n_envs: int = 1) -> DummyVecEnv:
"""Run baseline policy and log metrics identically to RL training."""
n_envs = vec_env.num_envs
obs = vec_env.reset()
step = 0
episode_rewards, episode_coi, episode_alpha_err, episode_revenues = [], [], [], []
ep_rewards = np.zeros(n_envs)
while step < total_timesteps:
actions = np.array([model.predict(obs[i])[0] for i in range(n_envs)])
obs, rewards, dones, infos = vec_env.step(actions)
step += n_envs
ep_rewards += rewards
for i, info in enumerate(infos):
if 'coi_erosion' in info:
episode_coi.append(info['coi_erosion'])
if 'alpha_true' in info and 'alpha_est' in info:
episode_alpha_err.append(abs(info['alpha_true'] - info['alpha_est']))
if 'revenue' in info:
episode_revenues.append(info['revenue'])
if dones[i]:
episode_rewards.append(ep_rewards[i])
ep_rewards[i] = 0.0
if writer and len(episode_rewards) >= 5 and step % 1000 < n_envs:
writer.add_scalar('rollout/ep_rew_mean', np.mean(episode_rewards[-10:]), step)
if episode_coi:
writer.add_scalar('coi/erosion_mean', np.mean(episode_coi[-100:]), step)
if episode_alpha_err:
writer.add_scalar('alpha/estimation_error', np.mean(episode_alpha_err[-100:]), step)
if episode_revenues:
writer.add_scalar('economics/revenue_mean', np.mean(episode_revenues[-100:]), step)
if step % 10000 < n_envs:
print(f" step {step}/{total_timesteps}, avg_reward={np.mean(episode_rewards[-20:]) if episode_rewards else 0:.2f}")
def make_vec_env(cfg: TrainConfig, n_envs: int = 1) -> DummyVecEnv:
"""Create vectorized environment."""
def _make(): def _make():
env_cfg = EnvConfig( env_cfg = EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
n_products=cfg.n_products, max_steps=cfg.max_steps, alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed)
alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed) return Monitor(make_env(env_cfg))
env = make_env(env_cfg)
return Monitor(env)
return DummyVecEnv([_make for _ in range(n_envs)]) return DummyVecEnv([_make for _ in range(n_envs)])
def run_baseline( def evaluate_policy(policy: Policy | Any, cfg: ExperimentConfig, n_episodes: int = 20) -> Dict[str, float]:
policy_fn: Callable[[np.ndarray, int], np.ndarray], """Evaluate policy and return thesis-aligned metrics."""
env: PricingEnv, env_cfg = EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
n_episodes: int = 20, alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed + 999)
name: str = "baseline" env = make_env(env_cfg)
) -> Dict[str, float]: metrics = []
"""Evaluate baseline policy and collect metrics."""
episode_rewards, episode_coi, episode_alpha_err = [], [], []
for _ in range(n_episodes): for _ in range(n_episodes):
obs, info = env.reset() obs, _ = env.reset()
done, ep_reward, ep_coi, ep_alpha_err = False, 0.0, [], [] ep = EpisodeMetrics()
done = False
while not done: while not done:
action = policy_fn(obs, env.n) action, _ = policy.predict(obs, deterministic=True)
obs, reward, terminated, truncated, info = env.step(action) obs, reward, term, trunc, info = env.step(action)
done = terminated or truncated done = term or trunc
ep_reward += reward ep.reward += reward
if 'coi_erosion' in info: ep.revenue += info.get('revenue', 0)
ep_coi.append(info['coi_erosion']) ep.profit += info.get('profit', 0)
if 'alpha_true' in info and 'alpha_est' in info: ep.coi_erosion += info.get('coi_erosion', 0)
ep_alpha_err.append(abs(info['alpha_true'] - info['alpha_est'])) ep.coi_leakage += info.get('coi_leakage', 0)
ep.alpha_error += abs(info['alpha_true'] - info['alpha_est'])
episode_rewards.append(ep_reward) ep.avg_margin += info.get('avg_margin', 0)
if ep_coi: ep.steps += 1
episode_coi.append(np.mean(ep_coi)) metrics.append(ep)
if ep_alpha_err:
episode_alpha_err.append(np.mean(ep_alpha_err))
n = len(metrics)
return { return {
f'{name}/reward_mean': np.mean(episode_rewards), 'reward_mean': np.mean([m.reward for m in metrics]),
f'{name}/reward_std': np.std(episode_rewards), 'reward_std': np.std([m.reward for m in metrics]),
f'{name}/coi_erosion': np.mean(episode_coi) if episode_coi else 0.0, 'revenue_mean': np.mean([m.revenue / m.steps for m in metrics]),
f'{name}/alpha_error': np.mean(episode_alpha_err) if episode_alpha_err else 0.0, 'profit_mean': np.mean([m.profit / m.steps for m in metrics]),
'coi_erosion_mean': np.mean([m.coi_erosion / m.steps for m in metrics]),
'coi_leakage_mean': np.mean([m.coi_leakage / m.steps for m in metrics]),
'alpha_error_mean': np.mean([m.alpha_error / m.steps for m in metrics]),
'margin_mean': np.mean([m.avg_margin / m.steps for m in metrics]),
} }
def run_coi_demonstration(writer: Any, cfg: TrainConfig) -> Dict[str, Dict[str, float]]: def train(cfg: ExperimentConfig) -> Dict[str, Any]:
"""Demonstrate COI leakage across contamination levels.""" """Train RL agent or evaluate baseline policy."""
results = {} is_baseline = cfg.algo.lower() in ["fixed", "adaptive", "random", "myopic"]
for alpha in cfg.alpha_sweep:
env_cfg = EnvConfig(
n_products=cfg.n_products, max_steps=cfg.max_steps,
alpha_true=alpha, reward_mode=cfg.reward_mode, seed=cfg.seed)
env = make_env(env_cfg)
# run fixed policy
fixed_metrics = run_baseline(
lambda obs, n: fixed_price_policy(np.ones(n), margin=0.15),
env, n_episodes=10, name=f"fixed_alpha{alpha:.1f}")
# run adaptive policy
adaptive_metrics = run_baseline(
lambda obs, n: adaptive_policy(obs, n, base_margin=0.15),
env, n_episodes=10, name=f"adaptive_alpha{alpha:.1f}")
# theoretical erosion
n_agents = int(alpha * cfg.max_steps * 30) # rough agent count
theo_erosion = coi_erosion(max(1, n_agents), price_std=5.0)
results[f'alpha_{alpha:.1f}'] = {
'fixed_reward': fixed_metrics[f"fixed_alpha{alpha:.1f}/reward_mean"],
'adaptive_reward': adaptive_metrics[f"adaptive_alpha{alpha:.1f}/reward_mean"],
'fixed_coi': fixed_metrics[f"fixed_alpha{alpha:.1f}/coi_erosion"],
'adaptive_coi': adaptive_metrics[f"adaptive_alpha{alpha:.1f}/coi_erosion"],
'theoretical_erosion': theo_erosion,
}
if writer:
writer.add_scalar(f'baseline/fixed_reward', fixed_metrics[f"fixed_alpha{alpha:.1f}/reward_mean"], int(alpha * 100))
writer.add_scalar(f'baseline/adaptive_reward', adaptive_metrics[f"adaptive_alpha{alpha:.1f}/reward_mean"], int(alpha * 100))
writer.add_scalar(f'baseline/coi_erosion_fixed', fixed_metrics[f"fixed_alpha{alpha:.1f}/coi_erosion"], int(alpha * 100))
writer.add_scalar(f'baseline/coi_erosion_adaptive', adaptive_metrics[f"adaptive_alpha{alpha:.1f}/coi_erosion"], int(alpha * 100))
writer.add_scalar(f'baseline/theoretical_erosion', theo_erosion, int(alpha * 100))
return results
def train_rl(cfg: TrainConfig) -> Dict[str, Any]:
"""Train RL agent or baseline policy with TensorBoard logging."""
is_baseline = cfg.algo.lower() in ["myopic", "random_myopic", "fixed", "adaptive"]
if not HAS_SB3 and not is_baseline: if not HAS_SB3 and not is_baseline:
raise ImportError("stable-baselines3 required: pip install stable-baselines3[extra]") raise ImportError("stable-baselines3 required: pip install stable-baselines3[extra]")
log_path = Path(cfg.log_dir) / f"{cfg.algo}_alpha{cfg.alpha_true:.1f}_{cfg.reward_mode}" log_path = Path(cfg.log_dir) / cfg.experiment_name
log_path.mkdir(parents=True, exist_ok=True) log_path.mkdir(parents=True, exist_ok=True)
with open(log_path / "config.json", "w") as f:
json.dump(asdict(cfg), f, indent=2)
writer = SummaryWriter(log_path) if HAS_TB else None writer = SummaryWriter(log_path) if HAS_TB else None
train_env = make_vec_env(cfg, cfg.n_envs)
# baseline demonstration eval_env = make_vec_env(cfg, 1)
if False and cfg.run_baselines:
print("Running baseline demonstrations...")
baseline_results = run_coi_demonstration(writer, cfg)
for k, v in baseline_results.items():
print(f" {k}: reward_fixed={v['fixed_reward']:.2f}, reward_adapt={v['adaptive_reward']:.2f}, "
f"coi_fixed={v['fixed_coi']:.3f}, coi_adapt={v['adaptive_coi']:.3f}, theo={v['theoretical_erosion']:.3f}")
# create envs
train_env = make_vec_env(cfg, n_envs=cfg.n_envs)
eval_env = make_vec_env(cfg, n_envs=1)
# select algorithm
algo_name = cfg.algo.lower()
if is_baseline: if is_baseline:
# baseline policies wrapped for compatibility policy_map = {"fixed": Policy.fixed(), "adaptive": Policy.adaptive(),
policy_map = { "random": Policy.random(), "myopic": Policy.myopic()}
"myopic": lambda obs, n: myopic_policy(obs, n, greed=0.3), policy = policy_map[cfg.algo.lower()]
"random_myopic": random_myopic_policy, run_baseline(policy, train_env, cfg.total_timesteps, writer)
"fixed": lambda obs, n: fixed_price_policy(np.ones(n), margin=0.15), final_metrics = evaluate_policy(policy, cfg)
"adaptive": lambda obs, n: adaptive_policy(obs, n, base_margin=0.15),
}
model = BaselinePolicy(policy_map[algo_name], algo_name)
else: else:
if not HAS_SB3: algo_cls = {"ppo": PPO, "sac": SAC, "a2c": A2C}.get(cfg.algo.lower())
raise ImportError("stable-baselines3 required for RL algos")
algo_cls = {"ppo": PPO, "sac": SAC, "a2c": A2C}.get(algo_name)
if algo_cls is None: if algo_cls is None:
raise ValueError(f"unknown algo: {cfg.algo}") raise ValueError(f"unknown algo: {cfg.algo}")
common = dict(verbose=1, seed=cfg.seed, tensorboard_log=str(log_path), device="auto")
common_kwargs = dict( if cfg.algo.lower() == "ppo":
verbose=1, seed=cfg.seed, tensorboard_log=str(log_path), model = PPO("MlpPolicy", train_env, learning_rate=3e-4, n_steps=2048,
device="auto" batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95,
) clip_range=0.2, ent_coef=0.01, **common)
if algo_name == "ppo": elif cfg.algo.lower() == "sac":
model = PPO( model = SAC("MlpPolicy", train_env, learning_rate=3e-4, buffer_size=100_000,
"MlpPolicy", train_env, learning_rate=3e-4, n_steps=2048, batch_size=256, tau=0.005, gamma=0.99, **common)
batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95,
clip_range=0.2, ent_coef=0.01, **common_kwargs)
elif algo_name == "sac":
model = SAC(
"MlpPolicy", train_env, learning_rate=3e-4, buffer_size=100_000,
batch_size=256, tau=0.005, gamma=0.99, train_freq=1,
gradient_steps=1, ent_coef="auto", **common_kwargs)
else: else:
model = A2C( model = A2C("MlpPolicy", train_env, learning_rate=7e-4, n_steps=5, gamma=0.99, **common)
"MlpPolicy", train_env, learning_rate=7e-4, n_steps=5,
gamma=0.99, gae_lambda=1.0, ent_coef=0.01, **common_kwargs)
print(f"\nRunning {cfg.algo.upper()} for {cfg.total_timesteps} steps...") cb = MetricsCallback(writer)
print(f" alpha_true={cfg.alpha_true}, reward_mode={cfg.reward_mode}") eval_cb = EvalCallback(eval_env, best_model_save_path=str(log_path / "best"),
print(f" logs: {log_path}") log_path=str(log_path), eval_freq=cfg.eval_freq,
n_eval_episodes=cfg.n_eval_episodes, deterministic=True)
if is_baseline: model.learn(cfg.total_timesteps, callback=[cb, eval_cb], progress_bar=True)
# run baseline through env manually with logging
run_baseline_with_logging(model, train_env, cfg.total_timesteps, writer)
else:
coi_cb = COICallback(writer=writer, verbose=1)
eval_cb = EvalCallback(
eval_env, best_model_save_path=str(log_path / "best"),
log_path=str(log_path), eval_freq=cfg.eval_freq,
n_eval_episodes=cfg.n_eval_episodes, deterministic=True)
model.learn(total_timesteps=cfg.total_timesteps, callback=[coi_cb, eval_cb], progress_bar=True)
model.save(log_path / "final_model") model.save(log_path / "final_model")
policy = model
final_metrics = evaluate_policy(model, cfg)
# final evaluation
final_metrics = evaluate_trained_model(model, cfg)
if writer: if writer:
for k, v in final_metrics.items(): for k, v in final_metrics.items():
writer.add_scalar(f'final/{k}', v, cfg.total_timesteps) writer.add_scalar(f'final/{k}', v, cfg.total_timesteps)
@@ -357,117 +239,97 @@ def train_rl(cfg: TrainConfig) -> Dict[str, Any]:
train_env.close() train_env.close()
eval_env.close() eval_env.close()
with open(log_path / "results.json", "w") as f:
return {"model_path": str(log_path / "final_model"), "metrics": final_metrics} json.dump(final_metrics, f, indent=2)
return {"path": str(log_path), "metrics": final_metrics}
def evaluate_trained_model(model: Any, cfg: TrainConfig, n_episodes: int = 20) -> Dict[str, float]: def run_baseline(policy: Policy, vec_env: DummyVecEnv, total_steps: int, writer: SummaryWriter | None):
"""Evaluate trained model.""" """Run baseline policy through environment with logging."""
env_cfg = EnvConfig( obs = vec_env.reset()
n_products=cfg.n_products, max_steps=cfg.max_steps, n_envs = vec_env.num_envs
alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed + 1000) ep_rewards = np.zeros(n_envs)
env = make_env(env_cfg) all_rewards, coi_buf, alpha_buf = [], [], []
episode_rewards, episode_coi = [], [] for step in range(0, total_steps, n_envs):
for _ in range(n_episodes): actions = np.array([policy.predict(obs[i])[0] for i in range(n_envs)])
obs, _ = env.reset() obs, rewards, dones, infos = vec_env.step(actions)
done, ep_reward, ep_coi = False, 0.0, [] ep_rewards += rewards
while not done: for i, info in enumerate(infos):
action, _ = model.predict(obs, deterministic=True) coi_buf.append(info.get('coi_erosion', 0))
obs, reward, terminated, truncated, info = env.step(action) alpha_buf.append(abs(info.get('alpha_true', 0) - info.get('alpha_est', 0)))
done = terminated or truncated if dones[i]:
ep_reward += reward all_rewards.append(ep_rewards[i])
if 'coi_erosion' in info: ep_rewards[i] = 0
ep_coi.append(info['coi_erosion']) if writer and step % 1000 < n_envs and all_rewards:
episode_rewards.append(ep_reward) writer.add_scalar('rollout/ep_rew_mean', np.mean(all_rewards[-20:]), step)
if ep_coi: writer.add_scalar('coi/erosion', np.mean(coi_buf[-100:]), step)
episode_coi.append(np.mean(ep_coi)) writer.add_scalar('alpha/estimation_error', np.mean(alpha_buf[-100:]), step)
return {
'reward_mean': np.mean(episode_rewards),
'reward_std': np.std(episode_rewards),
'coi_erosion_mean': np.mean(episode_coi) if episode_coi else 0.0,
}
def compare_policies(cfg: TrainConfig, model_paths: List[str] = None) -> None: def run_sweep(cfg: ExperimentConfig, alphas: List[float] | None = None) -> Dict[str, Dict]:
"""Compare trained models against baselines.""" """Run experiment across contamination levels for scientific comparison."""
if model_paths and not HAS_SB3: alphas = alphas or [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
raise ImportError("stable-baselines3 required for loading trained models")
writer = SummaryWriter(Path(cfg.log_dir) / "comparison") if HAS_TB else None
env_cfg = EnvConfig(
n_products=cfg.n_products, max_steps=cfg.max_steps,
alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed)
env = make_env(env_cfg)
results = {} results = {}
for alpha in alphas:
sweep_cfg = ExperimentConfig(**{**asdict(cfg), "alpha_true": alpha,
"experiment_name": f"{cfg.algo}_a{alpha:.2f}_{cfg.reward_mode}"})
print(f"\n=== α={alpha:.2f} ===")
out = train(sweep_cfg)
results[f"alpha_{alpha:.2f}"] = out["metrics"]
summary_path = Path(cfg.log_dir) / f"sweep_{cfg.algo}_{cfg.reward_mode}.json"
with open(summary_path, "w") as f:
json.dump(results, f, indent=2)
print(f"\nSweep results saved to {summary_path}")
return results
# all baseline policies
baselines = {
'random': lambda obs, n: random_policy(n),
'fixed': lambda obs, n: fixed_price_policy(np.ones(n), 0.15),
'adaptive': lambda obs, n: adaptive_policy(obs, n, 0.15),
'myopic': lambda obs, n: myopic_policy(obs, n, 0.3),
'random_myopic': random_myopic_policy,
}
for name, policy_fn in baselines.items():
results[name] = run_baseline(policy_fn, env, n_episodes=20, name=name)
# trained models def compare_policies(cfg: ExperimentConfig, policies: List[str] | None = None) -> Dict[str, Dict]:
if model_paths: """Compare multiple policies at same contamination level."""
for path in model_paths: policies = policies or ["fixed", "adaptive", "myopic", "random"]
name = Path(path).parent.name results = {}
model = PPO.load(path) # assume PPO, could detect for algo in policies:
metrics = evaluate_trained_model(model, cfg) cmp_cfg = ExperimentConfig(**{**asdict(cfg), "algo": algo,
results[name] = {f'{name}/{k}': v for k, v in metrics.items()} "experiment_name": f"cmp_{algo}_a{cfg.alpha_true:.2f}"})
print(f"\n=== {algo} ===")
print("\n=== Policy Comparison ===") out = train(cmp_cfg)
for name, metrics in results.items(): results[algo] = out["metrics"]
reward_key = [k for k in metrics if 'reward_mean' in k][0] cmp_path = Path(cfg.log_dir) / f"compare_a{cfg.alpha_true:.2f}.json"
coi_key = [k for k in metrics if 'coi' in k][0] if any('coi' in k for k in metrics) else None with open(cmp_path, "w") as f:
print(f"{name:20s}: reward={metrics[reward_key]:.2f}", end="") json.dump(results, f, indent=2)
if coi_key: print(f"\nComparison saved to {cmp_path}")
print(f", coi={metrics[coi_key]:.3f}") for algo, m in results.items():
else: print(f" {algo:12s}: reward={m['reward_mean']:.2f} coi_erosion={m['coi_erosion_mean']:.4f} "
print() f"alpha_err={m['alpha_error_mean']:.4f}")
return results
if writer:
for k, v in metrics.items():
writer.add_scalar(f'comparison/{k}', v, 0)
if writer:
writer.close()
def main(): def main():
parser = argparse.ArgumentParser(description="Train RL pricing policies") parser = argparse.ArgumentParser(description="Train RL pricing policies")
parser.add_argument("--algo", type=str, default="ppo", parser.add_argument("--algo", default="ppo", choices=["ppo", "sac", "a2c", "fixed", "adaptive", "random", "myopic"])
choices=["ppo", "sac", "a2c", "myopic", "random_myopic", "fixed", "adaptive"]) parser.add_argument("--steps", type=int, default=100_000)
parser.add_argument("--steps", type=int, default=100_000, help="total training steps") parser.add_argument("--alpha", type=float, default=0.2)
parser.add_argument("--alpha", type=float, default=0.2, help="true contamination level") parser.add_argument("--reward-mode", default="robust", choices=["revenue", "profit", "robust", "coi_aware"])
parser.add_argument("--reward-mode", type=str, default="robust", choices=["revenue", "profit", "robust", "coi_aware"])
parser.add_argument("--n-products", type=int, default=10) parser.add_argument("--n-products", type=int, default=10)
parser.add_argument("--n-envs", type=int, default=4) parser.add_argument("--n-envs", type=int, default=4)
parser.add_argument("--seed", type=int, default=42) parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--log-dir", type=str, default="lab/case/thesis/runs") parser.add_argument("--log-dir", default="lab/case/thesis/runs")
parser.add_argument("--no-baselines", action="store_true", help="skip baseline runs") parser.add_argument("--sweep", action="store_true", help="run contamination sweep")
parser.add_argument("--compare", nargs="*", help="compare model paths") parser.add_argument("--compare", action="store_true", help="compare all baselines")
args = parser.parse_args() args = parser.parse_args()
cfg = TrainConfig( cfg = ExperimentConfig(algo=args.algo, total_timesteps=args.steps, alpha_true=args.alpha,
algo=args.algo, total_timesteps=args.steps, alpha_true=args.alpha, reward_mode=args.reward_mode, n_products=args.n_products,
reward_mode=args.reward_mode, n_products=args.n_products, n_envs=args.n_envs, seed=args.seed, log_dir=args.log_dir)
n_envs=args.n_envs, seed=args.seed, log_dir=args.log_dir,
run_baselines=not args.no_baselines)
if args.compare is not None: if args.sweep:
compare_policies(cfg, args.compare if args.compare else None) run_sweep(cfg)
elif args.compare:
compare_policies(cfg)
else: else:
result = train_rl(cfg) result = train(cfg)
print(f"\nTraining complete. Model saved to: {result['model_path']}") print(f"\nTraining complete: {result['path']}")
print(f"Final metrics: {result['metrics']}") print(f"Metrics: {json.dumps(result['metrics'], indent=2)}")
if __name__ == "__main__": if __name__ == "__main__":