preliminary improved runs

This commit is contained in:
2026-01-24 23:51:57 +01:00
parent 4033e73ba1
commit 1224841a82
3 changed files with 279 additions and 664 deletions

View File

@@ -1,11 +1,11 @@
"""Minimal implementation of thesis pricing system. """Minimal implementation of thesis pricing system.
Implements the core loop: prices -> sessions -> demand -> prices Implements the core loop: prices -> sessions -> demand -> prices
with behavioral separability and robust pricing objective (Eq 23). with behavioral separability and robust pricing objective.
Objects: Objects:
- Session trajectories τ_s from mixture of H/A behavioral profiles - Session trajectories tau_s from mixture of H/A behavioral profiles
- Demand proxy q̂ via weighted action aggregation (Eq 2) - Demand proxy q_hat via weighted action aggregation
- COI leakage penalty for agent reconnaissance - COI leakage penalty for agent reconnaissance
- Limbo: alternating price/demand history for trajectory analysis - Limbo: alternating price/demand history for trajectory analysis
""" """
@@ -14,11 +14,10 @@ from dataclasses import dataclass, field
from typing import Dict, List, Tuple from typing import Dict, List, Tuple
import numpy as np import numpy as np
from .coi import COIWindow, compute_coi_window, coi_erosion
from .separability import TRANS_H, TRANS_A, kl_div, build_kernel, compute_divergence, estimate_alpha
ACTION_WEIGHTS = {"add_to_cart": 0.8, "checkout": 0.9, "purchase": 1.0, "view": 0.15, "detail": 0.25, "hover": 0.3, "start": 0.05, "end": 0.0} ACTION_WEIGHTS = {"add_to_cart": 0.8, "checkout": 0.9, "purchase": 1.0, "view": 0.15, "detail": 0.25, "hover": 0.3, "start": 0.05, "end": 0.0}
TRANS_H = {"start": {"view": 0.85, "end": 0.15}, "view": {"detail": 0.4, "cart": 0.3, "view": 0.2, "end": 0.1},
"detail": {"cart": 0.5, "view": 0.3, "end": 0.2}, "cart": {"purchase": 0.6, "view": 0.25, "end": 0.15}, "purchase": {"end": 1.0}}
TRANS_A = {"start": {"view": 0.95, "end": 0.05}, "view": {"detail": 0.6, "view": 0.25, "cart": 0.1, "end": 0.05},
"detail": {"view": 0.5, "cart": 0.15, "detail": 0.3, "end": 0.05}, "cart": {"view": 0.4, "purchase": 0.2, "end": 0.4}, "purchase": {"end": 1.0}}
@dataclass @dataclass
@@ -38,235 +37,52 @@ class Session:
def compute_demand(session: Session) -> float: def compute_demand(session: Session) -> float:
"""Compute demand proxy q̂ = Σ_k ω(a_k) for session (Eq 2).""" """Compute demand proxy q_hat = sum_k omega(a_k) for session."""
return sum(ACTION_WEIGHTS.get(e.action, 0.1) for e in session.events) return sum(ACTION_WEIGHTS.get(e.action, 0.1) for e in session.events)
def kl_div(p: Dict[str, float], q: Dict[str, float]) -> float: def sample_trajectory(rng: np.random.Generator, trans: Dict, prices: np.ndarray, costs: np.ndarray, theta: Dict[str, float],
"""KL divergence D_KL(p || q) for transition kernels.""" is_agent: bool, session_noise: float = 0.02, surge: float = 0.08, max_mult: float = 1.8) -> Tuple[List[Event], int]:
eps = 1e-10
keys = set(p.keys()) | set(q.keys())
return sum(p.get(k, eps) * np.log((p.get(k, eps) + eps) / (q.get(k, eps) + eps)) for k in keys)
def build_kernel(events: List[Event]) -> Dict[str, Dict[str, float]]:
"""Build empirical transition kernel from trajectory."""
trans: Dict[str, Dict[str, int]] = {}
prev = "start"
for e in events:
curr = e.action
trans.setdefault(prev, {})
trans[prev][curr] = trans[prev].get(curr, 0) + 1
prev = curr
kernel = {}
for s, dsts in trans.items():
total = sum(dsts.values())
kernel[s] = {d: c / total for d, c in dsts.items()} if total > 0 else {}
return kernel
def compute_divergence(session: Session) -> Tuple[float, float]:
"""Compute Δ_H, Δ_A divergence signals (Eq 20-21)."""
kernel = build_kernel(session.events)
delta_h = sum(kl_div(kernel.get(s, {}), TRANS_H.get(s, {})) for s in kernel) / max(len(kernel), 1)
delta_a = sum(kl_div(kernel.get(s, {}), TRANS_A.get(s, {})) for s in kernel) / max(len(kernel), 1)
return delta_h, delta_a
def estimate_alpha(session: Session, beta: float = 2.0) -> float:
"""Per-session contamination estimate α̂(τ') = σ(β(Δ_H - Δ_A))."""
dh, da = compute_divergence(session)
return 1.0 / (1.0 + np.exp(-beta * (dh - da))) if (dh + da) > 0 else 0.5
@dataclass(frozen=True)
class COIWindow:
"""Windowed COI metrics computed from realized price exposures.
COI_policy is the definition-level KPI: E[p_shown] - p_min.
COI_agent is the theorem-level object: E[p^(1)] - p_min, where p^(1) is the minimum price realized under agent querying.
In this simplified simulator, p^(1) is approximated as the minimum price exposed to any agent in the window (per product).
Leak is the observable gap between them.
"""
policy: float
agent: float
leak: float
survival_ratio: float
policy_by_product: np.ndarray
agent_by_product: np.ndarray
demand_weights: np.ndarray
def _prices_by_product(sessions: List[Session]) -> Dict[int, List[float]]:
prices: Dict[int, List[float]] = {}
for s in sessions:
for e in s.events:
prices.setdefault(e.product_idx, []).append(float(e.price_seen))
return prices
def _min_session_prices_by_product(sessions: List[Session]) -> Dict[int, List[float]]:
mins: Dict[int, List[float]] = {}
for s in sessions:
by_p: Dict[int, float] = {}
for e in s.events:
pidx = int(e.product_idx)
price = float(e.price_seen)
by_p[pidx] = price if pidx not in by_p else min(by_p[pidx], price)
for pidx, pmin in by_p.items():
mins.setdefault(pidx, []).append(pmin)
return mins
def _min_price_across_sessions_by_product(sessions: List[Session]) -> Dict[int, float]:
mins: Dict[int, float] = {}
for s in sessions:
for e in s.events:
pidx = int(e.product_idx)
price = float(e.price_seen)
mins[pidx] = price if pidx not in mins else min(mins[pidx], price)
return mins
def _demand_weights_by_product(
sessions: List[Session],
demand_mapping: Dict[str, float],
n_products: int,
) -> np.ndarray:
w = np.zeros(n_products, dtype=float)
sessions_by_id = {s.sid: s for s in sessions}
for sid, q in demand_mapping.items():
sess = sessions_by_id.get(sid)
if not sess or not sess.events:
continue
pidx = int(sess.events[0].product_idx)
w[pidx] += float(q)
s = float(np.sum(w))
return (w / s) if s > 0 else w
def compute_coi_window(
sessions: List[Session],
costs: np.ndarray,
demand_mapping: Dict[str, float] | None = None,
) -> COIWindow:
n_products = int(len(costs))
prices = _prices_by_product(sessions)
agent_min_across = _min_price_across_sessions_by_product([s for s in sessions if s.actor == "A"])
policy_by_product = np.zeros(n_products, dtype=float)
agent_by_product = np.zeros(n_products, dtype=float)
seen = np.array([(i in prices) for i in range(n_products)], dtype=bool)
agent_seen = np.array([(i in agent_min_across) for i in range(n_products)], dtype=bool)
for pidx, ps in prices.items():
if 0 <= pidx < n_products and ps:
policy_by_product[pidx] = float(np.mean(ps) - float(costs[pidx]))
for pidx, pmin in agent_min_across.items():
if 0 <= pidx < n_products:
agent_by_product[pidx] = float(pmin - float(costs[pidx]))
# If no agent exposure exists for a product in the window, there is no realized erosion for that product.
agent_by_product[seen & ~agent_seen] = policy_by_product[seen & ~agent_seen]
demand_weights = (
_demand_weights_by_product(sessions, demand_mapping, n_products)
if demand_mapping is not None
else np.zeros(n_products, dtype=float)
)
has_weights = float(np.sum(demand_weights)) > 0
if has_weights:
policy = float(np.dot(demand_weights, policy_by_product))
agent = float(np.dot(demand_weights, agent_by_product))
else:
if not bool(np.any(seen)):
policy = 0.0
agent = 0.0
else:
policy = float(np.mean(policy_by_product[seen]))
agent = float(np.mean(agent_by_product[seen]))
leak = float(max(policy - agent, 0.0))
survival_ratio = float(np.clip(agent / policy, 0.0, 1.0)) if policy > 0 else 0.0
return COIWindow(
policy=policy,
agent=agent,
leak=leak,
survival_ratio=survival_ratio,
policy_by_product=policy_by_product,
agent_by_product=agent_by_product,
demand_weights=demand_weights,
)
def sample_trajectory(
rng: np.random.Generator,
trans: Dict,
prices: np.ndarray,
costs: np.ndarray,
theta: Dict[str, float],
is_agent: bool,
session_price_noise: float = 0.02,
surge: float = 0.08,
max_markup_mult: float = 1.8,
) -> Tuple[List[Event], int]:
"""Sample session trajectory from behavioral kernel.""" """Sample session trajectory from behavioral kernel."""
state, t, pidx = "start", 0.0, int(rng.integers(0, len(prices))) pidx = int(rng.integers(0, len(prices)))
cost = float(costs[pidx]) cost, base = float(costs[pidx]), float(prices[pidx]) * (1.0 + rng.normal(0.0, session_noise))
base_price = float(prices[pidx]) * float(1.0 + rng.normal(0.0, session_price_noise)) base = float(np.clip(base, cost * 1.01, float(prices[pidx]) * 2.0))
base_price = float(np.clip(base_price, cost * 1.01, float(prices[pidx]) * 2.0)) price, signal, state, t = base, 0.0, "start", 0.0
current_price = base_price
signal = 0.0
events = [] events = []
# TODO: instead of this very controlled setup implement same session samplin as in models.py
while state != "end" and len(events) < 30: while state != "end" and len(events) < 30:
probs = trans.get(state, {"end": 1.0}) probs = trans.get(state, {"end": 1.0})
nxt = rng.choice(list(probs.keys()), p=list(probs.values())) nxt = rng.choice(list(probs.keys()), p=list(probs.values()))
if nxt == "purchase": # purchase conversion check
if nxt == "purchase": rel = max((price - cost) / (cost + 1e-6), 0.0)
price_sens = float(theta.get("price_sens", 2.0)) p_buy = float(np.clip(theta.get("base_conv", 0.2) * np.exp(-theta.get("price_sens", 2.0) * rel), 0.0, 1.0))
base_conv = float(theta.get("base_conv", 0.2))
rel = max((current_price - cost) / (cost + 1e-6), 0.0)
p_buy = float(np.clip(base_conv * np.exp(-price_sens * rel), 0.0, 1.0))
if rng.random() > p_buy: if rng.random() > p_buy:
nxt = "end" nxt = "end"
state = nxt state = nxt
if state not in {"start", "end"}: if state not in {"start", "end"}:
events.append(Event(action=state, product_idx=pidx, price_seen=float(current_price), ts=t)) events.append(Event(action=state, product_idx=pidx, price_seen=float(price), ts=t))
signal += float(ACTION_WEIGHTS.get(state, 0.1)) signal += float(ACTION_WEIGHTS.get(state, 0.1))
current_price = float(np.clip(base_price * (1.0 + surge * signal), cost * 1.01, base_price * max_markup_mult)) price = float(np.clip(base * (1.0 + surge * signal), cost * 1.01, base * max_mult))
t += max(0.2, rng.gamma(1.5, 0.8) if is_agent else rng.gamma(2.0, 1.2)) t += max(0.2, rng.gamma(1.5, 0.8) if is_agent else rng.gamma(2.0, 1.2))
return events, pidx return events, pidx
def put_prices_to_market(prices: np.ndarray, costs: np.ndarray, alpha: float = 0.2, n_sessions: int = 50, def put_prices_to_market(prices: np.ndarray, costs: np.ndarray, alpha: float = 0.2, n_sessions: int = 50,
seed: int | None = None) -> Tuple[List[Session], Dict[str, float]]: seed: int | None = None) -> Tuple[List[Session], Dict[str, float]]:
"""Generate sessions from mixture model """Generate sessions from mixture model. Returns sessions and demand mapping sid -> q_hat."""
Returns:
sessions: list of Session objects with events and product attribution
demand_mapping: session_id -> demand proxy q̂
"""
rng = np.random.default_rng(seed) rng = np.random.default_rng(seed)
sessions, demand_mapping = [], {} sessions, demand = [], {}
for i in range(n_sessions): for i in range(n_sessions):
sid = f"s{i:04d}" sid = f"s{i:04d}"
is_agent = rng.random() < alpha is_agent = rng.random() < alpha
trans = TRANS_A if is_agent else TRANS_H trans = TRANS_A if is_agent else TRANS_H
theta = {"price_sens": rng.uniform(0.05, 0.2), "base_conv": 0.01} if is_agent else {"price_sens": rng.uniform(1.5, 4.0), "base_conv": rng.uniform(0.2, 0.5)} theta = {"price_sens": rng.uniform(0.05, 0.2), "base_conv": 0.01} if is_agent else \
{"price_sens": rng.uniform(1.5, 4.0), "base_conv": rng.uniform(0.2, 0.5)}
events, _ = sample_trajectory(rng, trans, prices, costs=costs, theta=theta, is_agent=is_agent) events, _ = sample_trajectory(rng, trans, prices, costs=costs, theta=theta, is_agent=is_agent)
session = Session(sid=sid, events=events, actor="A" if is_agent else "H", theta=theta) session = Session(sid=sid, events=events, actor="A" if is_agent else "H", theta=theta)
sessions.append(session) sessions.append(session)
demand_mapping[sid] = compute_demand(session) demand[sid] = compute_demand(session)
return sessions, demand
return sessions, demand_mapping
@dataclass @dataclass
@@ -286,13 +102,7 @@ class Limbo:
def add_update(self, utype: str, data: np.ndarray | Dict[str, float]) -> Dict: def add_update(self, utype: str, data: np.ndarray | Dict[str, float]) -> Dict:
self.history.append(LimboUpdate(utype=utype, data=data, t=self._t)) self.history.append(LimboUpdate(utype=utype, data=data, t=self._t))
self._t += 1 self._t += 1
return self.on_update(utype) return {"action": "observe_demand" if utype == "prices" else "set_prices"}
def on_update(self, utype: str) -> Dict:
"""React to update: after prices -> return observed demand; after demand -> signal price update needed."""
if utype == "prices":
return {"action": "observe_demand", "msg": "awaiting market response"}
return {"action": "set_prices", "msg": "demand observed, update prices"}
def get_prices_history(self) -> List[np.ndarray]: def get_prices_history(self) -> List[np.ndarray]:
return [u.data for u in self.history if u.utype == "prices"] return [u.data for u in self.history if u.utype == "prices"]
@@ -304,21 +114,18 @@ class Limbo:
class System: class System:
"""Main pricing system implementing robust Stackelberg objective. """Main pricing system implementing robust Stackelberg objective.
Manages the alternating loop: Manages the alternating loop: set prices p_t -> observe demand Q_hat(p_t) ->
1. Set prices p_t estimate contamination alpha from behavioral signals -> compute next prices.
2. Observe demand response Q̂(p_t)
3. Estimate contamination α from behavioral signals
4. Compute next prices via robust objective (Eq 23)
""" """
def __init__(self, n_products: int = 10, costs: np.ndarray | None = None, lambda_coi: float = 0.5, seed: int | None = 42): def __init__(self, n_products: int = 10, costs: np.ndarray | None = None, lambda_coi: float = 0.5, seed: int | None = 42):
self.n = n_products self.n = n_products
self.rng = np.random.default_rng(seed) self.rng = np.random.default_rng(seed)
self.costs = costs if costs is not None else self.rng.uniform(10, 50, n_products) self.costs = costs if costs is not None else self.rng.uniform(10, 50, n_products)
self.refs = self.costs * (1 + self.rng.uniform(0.2, 0.5, n_products)) # base prices with margin self.refs = self.costs * (1 + self.rng.uniform(0.2, 0.5, n_products))
self.lambda_coi = lambda_coi self.lambda_coi = lambda_coi
self.limbo = Limbo() self.limbo = Limbo()
self._alpha_est = 0.2 # current contamination estimate self._alpha_est = 0.2
self._sessions: List[Session] = [] self._sessions: List[Session] = []
self._last_sessions: List[Session] = [] self._last_sessions: List[Session] = []
self._last_coi: COIWindow | None = None self._last_coi: COIWindow | None = None
@@ -328,127 +135,73 @@ class System:
return self._alpha_est return self._alpha_est
def _estimate_alpha_from_sessions(self) -> float: def _estimate_alpha_from_sessions(self) -> float:
"""Aggregate per-session α̂ estimates."""
if not self._sessions: if not self._sessions:
return self._alpha_est return self._alpha_est
alphas = [estimate_alpha(s) for s in self._sessions[-50:]] # use recent sessions return float(np.mean([estimate_alpha(s) for s in self._sessions[-50:]]))
return float(np.mean(alphas))
def _revenue_under_demand(self, prices: np.ndarray, demand: Dict[str, float]) -> float: def _revenue_under_demand(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
"""Compute expected revenue R(p, d) from demand proxy.""" agg = np.zeros(self.n)
agg_demand = np.zeros(self.n)
for sid, q in demand.items(): for sid, q in demand.items():
if self._sessions:
sess = next((s for s in self._sessions if s.sid == sid), None) sess = next((s for s in self._sessions if s.sid == sid), None)
if sess and sess.events: if sess and sess.events:
pidx = sess.events[0].product_idx agg[sess.events[0].product_idx] += q
agg_demand[pidx] += q return float(np.dot(prices, agg))
return float(np.dot(prices, agg_demand))
def _compute_coi_window(self, demand: Dict[str, float]) -> COIWindow: def _compute_coi_window(self, demand: Dict[str, float]) -> COIWindow:
if not self._last_sessions: if not self._last_sessions:
zeros = np.zeros(self.n, dtype=float) zeros = np.zeros(self.n, dtype=float)
return COIWindow( return COIWindow(policy=0.0, agent=0.0, leak=0.0, survival_ratio=0.0,
policy=0.0, policy_by_product=zeros, agent_by_product=zeros, demand_weights=zeros)
agent=0.0,
leak=0.0,
survival_ratio=0.0,
policy_by_product=zeros,
agent_by_product=zeros,
demand_weights=zeros,
)
return compute_coi_window(self._last_sessions, self.costs, demand_mapping=demand) return compute_coi_window(self._last_sessions, self.costs, demand_mapping=demand)
def _objective(self, prices: np.ndarray, demand: Dict[str, float]) -> float: def _objective(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
"""Robust objective: R(p,d) - λ·COI_leak (Eq 23 simplified).""" """Robust objective: R(p,d) - lambda * COI_leak."""
revenue = self._revenue_under_demand(prices, demand) profit = self._revenue_under_demand(prices, demand) - float(np.sum(self.costs))
cost = float(np.sum(self.costs)) # fixed cost approximation
profit = revenue - cost
self._last_coi = self._compute_coi_window(demand) self._last_coi = self._compute_coi_window(demand)
return profit - self.lambda_coi * self._last_coi.leak return profit - self.lambda_coi * self._last_coi.leak
def compute_prices(self, demand: Dict[str, float] | None = None) -> np.ndarray: def compute_prices(self, demand: Dict[str, float] | None = None) -> np.ndarray:
"""Compute next prices via simple gradient-like update on robust objective. """Compute next prices via heuristic margin adjustment based on alpha estimate."""
In a full implementation this would be replaced by DR-RL policy output.
Here we use a heuristic: adjust margins based on α estimate.
"""
self._alpha_est = self._estimate_alpha_from_sessions() self._alpha_est = self._estimate_alpha_from_sessions()
margin_scale = 1.0 - 0.5 * self._alpha_est # defensive pricing under high contamination
# base margin adjustment: higher α -> lower margins (defensive pricing)
margin_scale = 1.0 - 0.5 * self._alpha_est # reduce margins under high contamination
margins = (self.refs - self.costs) * margin_scale margins = (self.refs - self.costs) * margin_scale
# add small noise for exploration
noise = self.rng.normal(0, 0.02, self.n) * self.costs noise = self.rng.normal(0, 0.02, self.n) * self.costs
prices = np.clip(self.costs + margins + noise, self.costs * 1.02, self.refs * 1.3) prices = np.clip(self.costs + margins + noise, self.costs * 1.02, self.refs * 1.3)
self.limbo.add_update("prices", prices) self.limbo.add_update("prices", prices)
return prices return prices
def observe_demand(self, prices: np.ndarray, alpha_true: float = 0.2, n_sessions: int = 50) -> Dict[str, float]: def observe_demand(self, prices: np.ndarray, alpha_true: float = 0.2, n_sessions: int = 50) -> Dict[str, float]:
"""Observe market response to prices.""" sessions, demand_map = put_prices_to_market(prices, costs=self.costs, alpha=alpha_true,
sessions, demand_map = put_prices_to_market(prices, costs=self.costs, alpha=alpha_true, n_sessions=n_sessions, seed=int(self.rng.integers(0, 10000))) n_sessions=n_sessions, seed=int(self.rng.integers(0, 10000)))
self._last_sessions = sessions self._last_sessions = sessions
self._sessions.extend(sessions) # store actual sessions for correct product attribution self._sessions.extend(sessions)
self.limbo.add_update("demand", demand_map) self.limbo.add_update("demand", demand_map)
return demand_map return demand_map
def step(self, alpha_true: float = 0.2, n_sessions: int = 50) -> Tuple[np.ndarray, Dict[str, float], float, COIWindow]: def step(self, alpha_true: float = 0.2, n_sessions: int = 50) -> Tuple[np.ndarray, Dict[str, float], float, COIWindow]:
"""Single simulation step: prices -> demand -> reward."""
demand_hist = self.limbo.get_demand_history() demand_hist = self.limbo.get_demand_history()
prices = self.compute_prices(demand_hist[-1] if demand_hist else None) prices = self.compute_prices(demand_hist[-1] if demand_hist else None)
demand = self.observe_demand(prices, alpha_true, n_sessions) demand = self.observe_demand(prices, alpha_true, n_sessions)
reward = self._objective(prices, demand) reward = self._objective(prices, demand)
coi = self._last_coi or self._compute_coi_window(demand) return prices, demand, reward, self._last_coi or self._compute_coi_window(demand)
return prices, demand, reward, coi
def run(self, n_steps: int = 100, alpha_true: float = 0.2) -> Dict: def run(self, n_steps: int = 100, alpha_true: float = 0.2) -> Dict:
"""Run simulation for n_steps, return trajectory.""" traj = {"prices": [], "demand": [], "rewards": [], "alpha_est": [], "alpha_true": alpha_true,
trajectory = { "coi_policy": [], "coi_agent": [], "coi_leak": [], "coi_survival": []}
"prices": [],
"demand": [],
"rewards": [],
"alpha_est": [],
"alpha_true": alpha_true,
"coi_policy": [],
"coi_agent": [],
"coi_leak": [],
"coi_survival": [],
}
for _ in range(n_steps): for _ in range(n_steps):
p, d, r, coi = self.step(alpha_true) p, d, r, coi = self.step(alpha_true)
trajectory["prices"].append(p) traj["prices"].append(p); traj["demand"].append(d); traj["rewards"].append(r)
trajectory["demand"].append(d) traj["alpha_est"].append(self._alpha_est)
trajectory["rewards"].append(r) traj["coi_policy"].append(coi.policy); traj["coi_agent"].append(coi.agent)
trajectory["alpha_est"].append(self._alpha_est) traj["coi_leak"].append(coi.leak); traj["coi_survival"].append(coi.survival_ratio)
trajectory["coi_policy"].append(coi.policy) return traj
trajectory["coi_agent"].append(coi.agent)
trajectory["coi_leak"].append(coi.leak)
trajectory["coi_survival"].append(coi.survival_ratio)
return trajectory
def coi_erosion(n_agents: int, price_std: float) -> float:
"""COI erosion from Theorem 1: as N->inf, min(p_1..p_N)->p_min."""
if n_agents <= 1:
return 0.0
log_n = np.log(n_agents)
shift = price_std * (np.sqrt(2 * log_n) - (np.log(log_n) + np.log(4 * np.pi)) / (2 * np.sqrt(2 * log_n) + 1e-6))
return float(min(shift / (price_std * 2 + 1e-6), 1.0))
if __name__ == "__main__": if __name__ == "__main__":
# quick demo
sys = System(n_products=5, seed=42) sys = System(n_products=5, seed=42)
traj = sys.run(n_steps=20, alpha_true=0.25) traj = sys.run(n_steps=20, alpha_true=0.25)
print( print(f"avg reward: {np.mean(traj['rewards']):.2f}, final alpha_hat: {traj['alpha_est'][-1]:.3f}, "
f"avg reward: {np.mean(traj['rewards']):.2f}, " f"COI_policy: {np.mean(traj['coi_policy']):.3f}, COI_agent: {np.mean(traj['coi_agent']):.3f}, leak: {np.mean(traj['coi_leak']):.3f}")
f"final α̂: {traj['alpha_est'][-1]:.3f}, "
f"COI_policy: {np.mean(traj['coi_policy']):.3f}, "
f"COI_agent: {np.mean(traj['coi_agent']):.3f}, "
f"leak: {np.mean(traj['coi_leak']):.3f}"
)
prices = np.array([20.0, 35.0, 50.0, 25.0, 40.0]) prices = np.array([20.0, 35.0, 50.0, 25.0, 40.0])
costs = np.array([15.0, 28.0, 40.0, 18.0, 30.0]) costs = np.array([15.0, 28.0, 40.0, 18.0, 30.0])
@@ -456,16 +209,10 @@ if __name__ == "__main__":
print(f'sessions: {len(sessions)}, agents: {sum(1 for s in sessions if s.actor=="A")}') print(f'sessions: {len(sessions)}, agents: {sum(1 for s in sessions if s.actor=="A")}')
for n in [1, 5, 10, 50, 100]: for n in [1, 5, 10, 50, 100]:
ero = coi_erosion(n, price_std=5.0) print(f'N={n:3d} agents -> COI erosion: {coi_erosion(n, price_std=5.0):.3f}')
print(f'N={n:3d} agents -> COI erosion: {ero:.3f}')
# test separability events = [Event('view', 0, 20.0, 0.1), Event('detail', 0, 20.0, 0.5), Event('cart', 0, 20.0, 1.0), Event('purchase', 0, 20.0, 2.0)]
events = [Event('view', 0, 20.0, 0.1), Event('detail', 0, 20.0, 0.5), Event('cart', 0, 20.0, 1.0), print(f'human-like session alpha_hat: {estimate_alpha(Session(sid="test", events=events, actor="H")):.3f}')
Event('purchase', 0, 20.0, 2.0)]
sess_h = Session(sid='test', events=events, actor='H')
print(f'human-like session α̂: {estimate_alpha(sess_h):.3f}')
events_a = [Event('view', 0, 20.0, 0.1), Event('detail', 0, 20.0, 0.2), Event('view', 0, 20.0, 0.3), events_a = [Event('view', 0, 20.0, 0.1), Event('detail', 0, 20.0, 0.2), Event('view', 0, 20.0, 0.3), Event('detail', 0, 20.0, 0.4)]
Event('detail', 0, 20.0, 0.4)] print(f'agent-like session alpha_hat: {estimate_alpha(Session(sid="test2", events=events_a, actor="A")):.3f}')
sess_a = Session(sid='test2', events=events_a, actor='A')
print(f'agent-like session α̂: {estimate_alpha(sess_a):.3f}')

View File

@@ -19,58 +19,45 @@ try:
except ImportError: except ImportError:
HAS_GYM = False HAS_GYM = False
from .simplified import ( from .simplified import System, Session, Event, Limbo, put_prices_to_market, compute_demand, estimate_alpha
System, from .coi import COIWindow, compute_coi_window, coi_erosion
Session,
Event,
Limbo,
put_prices_to_market,
compute_coi_window,
compute_demand,
estimate_alpha,
coi_erosion,
TRANS_H,
TRANS_A,
)
@dataclass @dataclass
class EnvConfig: class EnvConfig:
"""Configuration for pricing environment."""
n_products: int = 5 n_products: int = 5
max_steps: int = 200 max_steps: int = 200
sessions_per_step: int = 30 sessions_per_step: int = 30
alpha_true: float = 0.2 # true contamination level alpha_true: float = 0.2
alpha_drift: float = 0.0 # per-step drift in α alpha_drift: float = 0.0
alpha_bounds: Tuple[float, float] = (0.0, 0.6) alpha_bounds: Tuple[float, float] = (0.0, 0.6)
lambda_coi: float = 0.5 # COI penalty weight lambda_coi: float = 0.5
lambda_vol: float = 0.1 # volatility penalty weight lambda_vol: float = 0.1
reward_mode: str = "robust" # revenue | profit | robust | coi_aware reward_mode: str = "robust" # revenue | profit | robust | coi_aware
normalize_reward: bool = True normalize_reward: bool = True
seed: int | None = 42 seed: int | None = 42
def aggregate_purchases(sessions: list[Session], n_products: int, costs: np.ndarray) -> Tuple[np.ndarray, float, float]:
"""Aggregate purchases from sessions, returns (counts, revenue, cost)."""
purchases = np.zeros(n_products, dtype=float)
revenue, cost = 0.0, 0.0
for sess in sessions:
for e in sess.events:
if e.action == "purchase" and 0 <= e.product_idx < n_products:
purchases[e.product_idx] += 1.0
revenue += float(e.price_seen)
cost += float(costs[e.product_idx])
return purchases, revenue, cost
class PricingEnv(gym.Env if HAS_GYM else object): class PricingEnv(gym.Env if HAS_GYM else object):
"""RL environment for dynamic pricing under agent contamination. """RL environment for dynamic pricing under agent contamination.
Implements the thesis formulation where: Platform sets prices p_t, market responds with mixture demand Q(p) = (1-alpha)*D_H + alpha*D_A.
- Platform sets prices p_t Agent estimates contamination alpha_hat from behavioral signals.
- Market responds with mixture demand Q(p) = (1-α)D_H + αD_A Reward balances profit vs COI leakage.
- Agent estimates contamination α̂ from behavioral signals
- Reward balances profit vs COI leakage
Observation space (normalized):
[0:n] - current prices / ref_prices
[n:2n] - aggregated demand per product
[2n] - estimated contamination α̂
[2n+1] - true contamination α (if observable, else 0)
[2n+2:3n+2] - current margins (prices - costs) / costs
[3n+2] - step / max_steps
Action space:
price multipliers in [0.5, 1.5] applied to reference prices
""" """
metadata = {"render_modes": ["human", "ansi"]} metadata = {"render_modes": ["human", "ansi"]}
def __init__(self, cfg: EnvConfig | None = None): def __init__(self, cfg: EnvConfig | None = None):
@@ -86,34 +73,23 @@ class PricingEnv(gym.Env if HAS_GYM else object):
self._episode_rewards: list[float] = [] self._episode_rewards: list[float] = []
self._demand_agg = np.zeros(self.n) self._demand_agg = np.zeros(self.n)
# gymnasium spaces
self.action_space = spaces.Box(low=0.5, high=1.5, shape=(self.n,), dtype=np.float32) self.action_space = spaces.Box(low=0.5, high=1.5, shape=(self.n,), dtype=np.float32)
obs_dim = self.n + self.n + 1 + 1 + self.n + 1 # prices + demand + α̂ + α + margins + t obs_dim = self.n + self.n + 1 + 1 + self.n + 1 # prices + demand + alpha_hat + alpha + margins + t
self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float32) self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float32)
def _build_obs(self) -> np.ndarray: def _build_obs(self) -> np.ndarray:
"""Construct observation vector."""
if self._sys is None: if self._sys is None:
return np.zeros(self.observation_space.shape[0], dtype=np.float32) return np.zeros(self.observation_space.shape[0], dtype=np.float32)
prices = self._last_prices if self._last_prices is not None else self._sys.refs prices = self._last_prices if self._last_prices is not None else self._sys.refs
price_ratio = prices / (self._sys.refs + 1e-6) return np.concatenate([
demand_norm = self._demand_agg / (np.sum(self._demand_agg) + 1e-6) prices / (self._sys.refs + 1e-6),
margins = (prices - self._sys.costs) / (self._sys.costs + 1e-6) self._demand_agg / (np.sum(self._demand_agg) + 1e-6),
t_norm = self._t / self.cfg.max_steps [self._sys.alpha, self._alpha],
(prices - self._sys.costs) / (self._sys.costs + 1e-6),
obs = np.concatenate([ [self._t / self.cfg.max_steps],
price_ratio, # [0:n] ]).astype(np.float32)
demand_norm, # [n:2n]
[self._sys.alpha], # [2n] estimated α̂
[self._alpha], # [2n+1] true α
margins, # [2n+2:3n+2]
[t_norm], # [3n+2]
])
return obs.astype(np.float32)
def _compute_reward(self, prices: np.ndarray, demand: Dict[str, float]) -> float: def _compute_reward(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
"""Compute reward based on configured mode."""
cfg, sys = self.cfg, self._sys cfg, sys = self.cfg, self._sys
if sys is None: if sys is None:
return 0.0 return 0.0
@@ -123,159 +99,77 @@ class PricingEnv(gym.Env if HAS_GYM else object):
for sid, q in demand.items(): for sid, q in demand.items():
sess = next((s for s in sys._sessions if s.sid == sid), None) sess = next((s for s in sys._sessions if s.sid == sid), None)
if sess and sess.events: if sess and sess.events:
pidx = sess.events[0].product_idx agg[sess.events[0].product_idx] += q
agg[pidx] += q
self._demand_agg = agg self._demand_agg = agg
revenue = 0.0 _, revenue, cost = aggregate_purchases(sys._last_sessions, self.n, sys.costs)
cost = 0.0 profit = revenue - cost
purchases = np.zeros(self.n, dtype=float)
for sess in sys._last_sessions:
for e in sess.events:
if e.action != "purchase":
continue
pidx = int(e.product_idx)
if 0 <= pidx < self.n:
purchases[pidx] += 1.0
revenue += float(e.price_seen)
cost += float(sys.costs[pidx])
profit = float(revenue - cost)
# volatility penalty (price changes)
vol_penalty = 0.0 vol_penalty = 0.0
if self._last_prices is not None: if self._last_prices is not None:
price_change = np.abs(prices - self._last_prices) / (sys.refs + 1e-6) vol_penalty = cfg.lambda_vol * float(np.mean(np.abs(prices - self._last_prices) / (sys.refs + 1e-6)))
vol_penalty = cfg.lambda_vol * float(np.mean(price_change))
coi = compute_coi_window(sys._last_sessions, sys.costs, demand_mapping=demand) coi = compute_coi_window(sys._last_sessions, sys.costs, demand_mapping=demand)
coi_leak = float(coi.leak) leak = float(coi.leak)
if cfg.reward_mode == "revenue": reward_fns = {
r = revenue "revenue": lambda: revenue,
elif cfg.reward_mode == "profit": "profit": lambda: profit,
r = profit "robust": lambda: profit - cfg.lambda_coi * leak - vol_penalty,
elif cfg.reward_mode == "robust": "coi_aware": lambda: profit - cfg.lambda_coi * (1 + 2 * sys.alpha) * leak - vol_penalty,
# robust objective: profit - λ_coi * COI_leak - λ_vol * volatility }
r = profit - cfg.lambda_coi * coi_leak - vol_penalty r = reward_fns.get(cfg.reward_mode, lambda: profit)()
elif cfg.reward_mode == "coi_aware": return float(r / (float(np.sum(sys.refs)) + 1e-6)) if cfg.normalize_reward else float(r)
# adaptive: heavier penalty at high contamination
adaptive_lambda = cfg.lambda_coi * (1 + 2 * sys.alpha)
r = profit - adaptive_lambda * coi_leak - vol_penalty
else:
r = profit
if cfg.normalize_reward:
r = r / (float(np.sum(sys.refs)) + 1e-6) # normalize by potential revenue
return float(r)
def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]: def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
"""Reset environment to initial state."""
seed = seed if seed is not None else self.cfg.seed seed = seed if seed is not None else self.cfg.seed
self._sys = System(n_products=self.n, lambda_coi=self.cfg.lambda_coi, seed=seed) self._sys = System(n_products=self.n, lambda_coi=self.cfg.lambda_coi, seed=seed)
self._t = 0 self._t, self._alpha = 0, self.cfg.alpha_true
self._alpha = self.cfg.alpha_true self._last_prices, self._last_demand = None, None
self._last_prices = None self._episode_rewards, self._demand_agg = [], np.zeros(self.n)
self._last_demand = None return self._build_obs(), {"alpha_true": self._alpha, "alpha_est": self._sys.alpha,
self._episode_rewards = []
self._demand_agg = np.zeros(self.n)
info = {"alpha_true": self._alpha, "alpha_est": self._sys.alpha,
"costs": self._sys.costs.copy(), "refs": self._sys.refs.copy()} "costs": self._sys.costs.copy(), "refs": self._sys.refs.copy()}
return self._build_obs(), info
def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]: def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]:
"""Execute one environment step.
Args:
action: price multipliers in [0.5, 1.5]
Returns:
obs, reward, terminated, truncated, info
"""
if self._sys is None: if self._sys is None:
raise RuntimeError("call reset() first") raise RuntimeError("call reset() first")
# convert action to prices
action = np.clip(action, 0.5, 1.5) action = np.clip(action, 0.5, 1.5)
prices = self._sys.refs * action.astype(np.float64) prices = np.clip(self._sys.refs * action.astype(np.float64), self._sys.costs * 1.01, self._sys.refs * 2.0)
prices = np.clip(prices, self._sys.costs * 1.01, self._sys.refs * 2.0)
# # drift contamination
# if self.cfg.alpha_drift != 0:
# self._alpha = np.clip(
# self._alpha + self.cfg.alpha_drift * self._sys.rng.normal(),
# *self.cfg.alpha_bounds)
# observe demand
demand = self._sys.observe_demand(prices, alpha_true=self._alpha, n_sessions=self.cfg.sessions_per_step) demand = self._sys.observe_demand(prices, alpha_true=self._alpha, n_sessions=self.cfg.sessions_per_step)
self._sys.limbo.add_update("prices", prices) self._sys.limbo.add_update("prices", prices)
# update α estimate
self._sys._alpha_est = self._sys._estimate_alpha_from_sessions() self._sys._alpha_est = self._sys._estimate_alpha_from_sessions()
reward = self._compute_reward(prices, demand) reward = self._compute_reward(prices, demand)
self._episode_rewards.append(reward) self._episode_rewards.append(reward)
self._last_prices, self._last_demand = prices.copy(), demand
self._last_prices = prices.copy()
self._last_demand = demand
self._t += 1 self._t += 1
terminated = self._t >= self.cfg.max_steps # compute info metrics using shared helper
truncated = False purchases, revenue, cost = aggregate_purchases(self._sys._last_sessions, self.n, self._sys.costs)
# compute metrics for tracking
revenue = 0.0
cost = 0.0
n_purchases = 0
for sess in self._sys._last_sessions:
for e in sess.events:
if e.action != "purchase":
continue
n_purchases += 1
revenue += float(e.price_seen)
cost += float(self._sys.costs[int(e.product_idx)])
profit = float(revenue - cost)
n_agents = int(self._alpha * self.cfg.sessions_per_step) n_agents = int(self._alpha * self.cfg.sessions_per_step)
price_std = float(np.std(prices))
coi = compute_coi_window(self._sys._last_sessions, self._sys.costs, demand_mapping=demand) coi = compute_coi_window(self._sys._last_sessions, self._sys.costs, demand_mapping=demand)
info = { info = {
"alpha_true": self._alpha, "alpha_true": self._alpha, "alpha_est": self._sys.alpha,
"alpha_est": self._sys.alpha,
"alpha_error": abs(self._alpha - self._sys.alpha), "alpha_error": abs(self._alpha - self._sys.alpha),
"revenue": float(revenue), "revenue": float(revenue), "profit": float(revenue - cost), "cost": float(cost),
"profit": float(profit), "n_purchases": int(np.sum(purchases)),
"cost": float(cost),
"n_purchases": int(n_purchases),
"avg_margin": float(np.mean((prices - self._sys.costs) / self._sys.costs)), "avg_margin": float(np.mean((prices - self._sys.costs) / self._sys.costs)),
"n_sessions": len(demand), "n_sessions": len(demand), "n_agents": n_agents, "price_std": float(np.std(prices)),
"n_agents": n_agents, "coi_erosion": coi_erosion(max(1, n_agents), float(np.std(prices))),
"price_std": price_std, "coi_policy": float(coi.policy), "coi_agent": float(coi.agent),
"coi_erosion": coi_erosion(max(1, n_agents), price_std), "coi_leakage": float(coi.leak), "coi_survival": float(coi.survival_ratio),
"coi_policy": float(coi.policy), "cumulative_reward": sum(self._episode_rewards), "step": self._t,
"coi_agent": float(coi.agent),
"coi_leakage": float(coi.leak),
"coi_survival": float(coi.survival_ratio),
"cumulative_reward": sum(self._episode_rewards),
"step": self._t,
} }
return self._build_obs(), reward, self._t >= self.cfg.max_steps, False, info
return self._build_obs(), reward, terminated, truncated, info
def render(self, mode: str = "human") -> str | None: def render(self, mode: str = "human") -> str | None:
"""Render environment state."""
if self._sys is None or self._last_prices is None: if self._sys is None or self._last_prices is None:
return None return None
out = f"t={self._t}/{self.cfg.max_steps} | alpha_true={self._alpha:.3f} alpha_hat={self._sys.alpha:.3f} | " \
lines = [ f"prices: {self._last_prices.round(1)} | demand: {self._demand_agg.round(2)} | " \
f"t={self._t}/{self.cfg.max_steps}", f"reward: {self._episode_rewards[-1] if self._episode_rewards else 0:.3f}"
f"α_true={self._alpha:.3f} α̂={self._sys.alpha:.3f}",
f"prices: {self._last_prices.round(1)}",
f"demand: {self._demand_agg.round(2)}",
f"reward: {self._episode_rewards[-1] if self._episode_rewards else 0:.3f}",
]
out = " | ".join(lines)
if mode == "human": if mode == "human":
print(out) print(out)
return out return out
@@ -285,10 +179,7 @@ class PricingEnv(gym.Env if HAS_GYM else object):
class ContaminationSweepEnv(PricingEnv): class ContaminationSweepEnv(PricingEnv):
"""Environment that sweeps through contamination levels during training. """Environment that sweeps through contamination levels during training."""
Useful for curriculum learning: start with low α, gradually increase.
"""
def __init__(self, cfg: EnvConfig | None = None, alpha_schedule: list[float] | None = None): def __init__(self, cfg: EnvConfig | None = None, alpha_schedule: list[float] | None = None):
super().__init__(cfg) super().__init__(cfg)
@@ -296,7 +187,6 @@ class ContaminationSweepEnv(PricingEnv):
self._schedule_idx = 0 self._schedule_idx = 0
def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]: def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
# advance schedule on reset
if options and options.get("advance_schedule", False): if options and options.get("advance_schedule", False):
self._schedule_idx = (self._schedule_idx + 1) % len(self._schedule) self._schedule_idx = (self._schedule_idx + 1) % len(self._schedule)
self.cfg.alpha_true = self._schedule[self._schedule_idx] self.cfg.alpha_true = self._schedule[self._schedule_idx]
@@ -306,8 +196,7 @@ class ContaminationSweepEnv(PricingEnv):
class AdversarialEnv(PricingEnv): class AdversarialEnv(PricingEnv):
"""Environment with adversarial contamination dynamics. """Environment with adversarial contamination dynamics.
The contamination level responds to pricing policy: if prices are too predictable, Contamination increases when prices are predictable (agents exploit).
agents learn to exploit and α increases.
""" """
def __init__(self, cfg: EnvConfig | None = None, exploitation_rate: float = 0.02): def __init__(self, cfg: EnvConfig | None = None, exploitation_rate: float = 0.02):
@@ -317,20 +206,13 @@ class AdversarialEnv(PricingEnv):
def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]: def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]:
obs, reward, term, trunc, info = super().step(action) obs, reward, term, trunc, info = super().step(action)
# track price history for predictability
if self._last_prices is not None: if self._last_prices is not None:
self._price_history.append(self._last_prices.copy()) self._price_history.append(self._last_prices.copy())
predictability = 0.0
# increase α if prices are predictable (low variance over recent history)
if len(self._price_history) > 10: if len(self._price_history) > 10:
recent = np.array(self._price_history[-10:]) predictability = 1.0 / (float(np.std(self._price_history[-10:])) + 0.1)
predictability = 1.0 / (float(np.std(recent)) + 0.1) self._alpha = np.clip(self._alpha + self._exploit_rate * predictability * self._sys.rng.random(), *self.cfg.alpha_bounds)
self._alpha = np.clip( info["predictability"] = predictability
self._alpha + self._exploit_rate * predictability * self._sys.rng.random(),
*self.cfg.alpha_bounds)
info["predictability"] = predictability if len(self._price_history) > 10 else 0.0
return obs, reward, term, trunc, info return obs, reward, term, trunc, info
def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]: def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
@@ -339,39 +221,20 @@ class AdversarialEnv(PricingEnv):
def make_env(cfg: EnvConfig | None = None, env_type: str = "standard") -> PricingEnv: def make_env(cfg: EnvConfig | None = None, env_type: str = "standard") -> PricingEnv:
"""Factory for creating pricing environments.""" return {"sweep": ContaminationSweepEnv, "adversarial": AdversarialEnv}.get(env_type, PricingEnv)(cfg)
if env_type == "sweep":
return ContaminationSweepEnv(cfg)
elif env_type == "adversarial":
return AdversarialEnv(cfg)
return PricingEnv(cfg)
# simple baseline policies for benchmarking # baseline policies
def fixed_price_policy(refs: np.ndarray, margin: float = 0.0) -> np.ndarray: fixed_price_policy = lambda refs, margin=0.0: np.ones(len(refs), dtype=np.float32) * (1.0 + margin)
"""Fixed markup policy: always return ref * (1 + margin).""" random_policy = lambda n, rng=None: (rng or np.random.default_rng()).uniform(0.7, 1.3, n).astype(np.float32)
return np.ones(len(refs), dtype=np.float32) * (1.0 + margin) adaptive_policy = lambda obs, n, base=0.1: np.ones(n, dtype=np.float32) * (1.0 + base * (1.0 - 0.4 * obs[2 * n]))
def random_policy(n: int, rng: np.random.Generator | None = None) -> np.ndarray:
"""Random policy for exploration baseline."""
rng = rng or np.random.default_rng()
return rng.uniform(0.7, 1.3, n).astype(np.float32)
def adaptive_policy(obs: np.ndarray, n: int, base_margin: float = 0.1) -> np.ndarray:
"""Simple adaptive policy: reduce margins when α̂ is high."""
alpha_est = obs[2 * n] # α̂ is at position 2n in observation
margin_scale = 1.0 - 0.4 * alpha_est # defensive when α̂ high
return np.ones(n, dtype=np.float32) * (1.0 + base_margin * margin_scale)
if __name__ == "__main__": if __name__ == "__main__":
# demo run
cfg = EnvConfig(n_products=100, max_steps=100, alpha_true=0.25, reward_mode="robust") cfg = EnvConfig(n_products=100, max_steps=100, alpha_true=0.25, reward_mode="robust")
env = make_env(cfg) env = make_env(cfg)
obs, info = env.reset() obs, info = env.reset()
print(f"initial: α={info['alpha_true']:.2f}") print(f"initial: alpha={info['alpha_true']:.2f}")
total_reward = 0.0 total_reward = 0.0
for t in range(cfg.max_steps): for t in range(cfg.max_steps):
@@ -383,4 +246,4 @@ if __name__ == "__main__":
if done: if done:
break break
print(f"\ntotal reward: {total_reward:.2f}, final α̂: {info['alpha_est']:.3f}") print(f"\ntotal reward: {total_reward:.2f}, final alpha_hat: {info['alpha_est']:.3f}")

View File

@@ -6,7 +6,8 @@ Tracks COI erosion, alpha estimation error, and economic KPIs per thesis formula
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import json import json
from dataclasses import dataclass, asdict from concurrent.futures import ProcessPoolExecutor, as_completed
from dataclasses import dataclass, asdict, field
from pathlib import Path from pathlib import Path
from typing import Dict, List, Callable, Any from typing import Dict, List, Callable, Any
import numpy as np import numpy as np
@@ -27,10 +28,9 @@ except ImportError:
HAS_TB = False HAS_TB = False
from .simplified_env import PricingEnv, EnvConfig, make_env, adaptive_policy, fixed_price_policy, random_policy from .simplified_env import PricingEnv, EnvConfig, make_env, adaptive_policy, fixed_price_policy, random_policy
from .simplified import coi_erosion from .coi import coi_erosion
# thesis-aligned KPIs tracked per episode
@dataclass @dataclass
class EpisodeMetrics: class EpisodeMetrics:
reward: float = 0.0 reward: float = 0.0
@@ -43,10 +43,24 @@ class EpisodeMetrics:
n_agents: int = 0 n_agents: int = 0
steps: int = 0 steps: int = 0
def accumulate(self, info: Dict[str, Any]) -> None:
self.steps += 1
self.reward += info.get('reward', 0)
self.revenue += info.get('revenue', 0)
self.profit += info.get('profit', 0)
self.coi_erosion += info.get('coi_erosion', 0)
self.coi_leakage += info.get('coi_leakage', 0)
self.alpha_error += abs(info.get('alpha_true', 0) - info.get('alpha_est', 0))
self.avg_margin += info.get('avg_margin', 0)
self.n_agents += info.get('n_agents', 0)
def normalized(self) -> Dict[str, float]:
s = max(self.steps, 1)
return {k: getattr(self, k) / s for k in ['revenue', 'profit', 'coi_erosion', 'coi_leakage', 'alpha_error', 'avg_margin', 'n_agents']}
@dataclass @dataclass
class ExperimentConfig: class ExperimentConfig:
"""Full experiment specification for reproducibility."""
algo: str = "ppo" algo: str = "ppo"
total_timesteps: int = 100_000 total_timesteps: int = 100_000
n_envs: int = 4 n_envs: int = 4
@@ -65,17 +79,14 @@ class ExperimentConfig:
self.experiment_name = f"{self.algo}_a{self.alpha_true:.2f}_{self.reward_mode}" self.experiment_name = f"{self.algo}_a{self.alpha_true:.2f}_{self.reward_mode}"
# unified policy interface wrapping all baselines
class Policy: class Policy:
"""Unified policy interface for baselines and trained models.""" """Unified policy interface for baselines and trained models."""
def __init__(self, policy_fn: Callable[[np.ndarray, int], np.ndarray], name: str): def __init__(self, policy_fn: Callable[[np.ndarray, int], np.ndarray], name: str):
self._fn = policy_fn self._fn, self.name = policy_fn, name
self.name = name
def predict(self, obs: np.ndarray, deterministic: bool = True) -> tuple[np.ndarray, None]: def predict(self, obs: np.ndarray, deterministic: bool = True) -> tuple[np.ndarray, None]:
n = (len(obs) - 3) // 3 return self._fn(obs, (len(obs) - 3) // 3), None
return self._fn(obs, n), None
@staticmethod @staticmethod
def fixed(margin: float = 0.15) -> "Policy": def fixed(margin: float = 0.15) -> "Policy":
@@ -91,99 +102,97 @@ class Policy:
@staticmethod @staticmethod
def myopic(greed: float = 0.3) -> "Policy": def myopic(greed: float = 0.3) -> "Policy":
"""Myopic: maximize immediate margin, ignore alpha."""
def _fn(obs: np.ndarray, n: int) -> np.ndarray: def _fn(obs: np.ndarray, n: int) -> np.ndarray:
demand_norm = obs[n:2*n] if len(obs) > 2*n else np.ones(n) * 0.5 demand_norm = obs[n:2*n] if len(obs) > 2*n else np.ones(n) * 0.5
mult = 1.0 + greed * (1 + np.mean(demand_norm)) return np.ones(n, dtype=np.float32) * np.clip(1.0 + greed * (1 + np.mean(demand_norm)), 0.5, 1.5)
return np.ones(n, dtype=np.float32) * np.clip(mult, 0.5, 1.5)
return Policy(_fn, f"myopic_{greed:.1f}") return Policy(_fn, f"myopic_{greed:.1f}")
class MetricsCallback(BaseCallback): def log_metrics(writer: SummaryWriter | None, metrics: Dict[str, float], prefix: str, step: int) -> None:
"""Tracks thesis-aligned metrics during RL training.""" if writer is None:
return
for k, v in metrics.items():
writer.add_scalar(f'{prefix}/{k}', v, step)
class MetricsCallback(BaseCallback):
def __init__(self, writer: SummaryWriter | None, verbose: int = 0): def __init__(self, writer: SummaryWriter | None, verbose: int = 0):
super().__init__(verbose) super().__init__(verbose)
self._writer = writer self._writer = writer
self._ep = EpisodeMetrics()
self._buffer: List[EpisodeMetrics] = []
def _on_step(self) -> bool: def _on_step(self) -> bool:
for info in self.locals.get('infos', []): if self._writer is None:
self._ep.steps += 1 return True
self._ep.reward += info.get('reward', 0) for info in self.locals.get('infos', []):
self._ep.revenue += info.get('revenue', 0) t = self.num_timesteps
self._ep.profit += info.get('profit', 0) self._writer.add_scalar('economics/revenue', info.get('revenue', 0), t)
self._ep.coi_erosion += info.get('coi_erosion', 0) self._writer.add_scalar('economics/profit', info.get('profit', 0), t)
self._ep.coi_leakage += info.get('coi_leakage', 0) self._writer.add_scalar('economics/margin', info.get('avg_margin', 0), t)
self._ep.alpha_error += abs(info.get('alpha_true', 0) - info.get('alpha_est', 0)) self._writer.add_scalar('coi/erosion', info.get('coi_erosion', 0), t)
self._ep.avg_margin += info.get('avg_margin', 0) self._writer.add_scalar('coi/leakage', info.get('coi_leakage', 0), t)
self._ep.n_agents += info.get('n_agents', 0) self._writer.add_scalar('alpha/estimation_error', abs(info.get('alpha_true', 0) - info.get('alpha_est', 0)), t)
self._writer.add_scalar('agents/count', info.get('n_agents', 0), t)
return True return True
def _on_rollout_end(self) -> None:
if self._ep.steps == 0 or self._writer is None:
return
s, step = self._ep.steps, self.num_timesteps
self._writer.add_scalar('economics/revenue', self._ep.revenue / s, step)
self._writer.add_scalar('economics/profit', self._ep.profit / s, step)
self._writer.add_scalar('economics/margin', self._ep.avg_margin / s, step)
self._writer.add_scalar('coi/erosion', self._ep.coi_erosion / s, step)
self._writer.add_scalar('coi/leakage', self._ep.coi_leakage / s, step)
self._writer.add_scalar('alpha/estimation_error', self._ep.alpha_error / s, step)
self._writer.add_scalar('agents/count', self._ep.n_agents / s, step)
self._buffer.append(self._ep)
self._ep = EpisodeMetrics()
def make_vec_env(cfg: ExperimentConfig, n_envs: int = 1) -> DummyVecEnv: def make_vec_env(cfg: ExperimentConfig, n_envs: int = 1) -> DummyVecEnv:
def _make(): def _make():
env_cfg = EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps, return Monitor(make_env(EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed) alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed)))
return Monitor(make_env(env_cfg))
return DummyVecEnv([_make for _ in range(n_envs)]) return DummyVecEnv([_make for _ in range(n_envs)])
def evaluate_policy(policy: Policy | Any, cfg: ExperimentConfig, n_episodes: int = 20) -> Dict[str, float]: def run_episodes(policy: Policy | Any, env: PricingEnv, n_episodes: int) -> List[EpisodeMetrics]:
"""Evaluate policy and return thesis-aligned metrics.""" """Run policy for n episodes and collect metrics."""
env_cfg = EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed + 999)
env = make_env(env_cfg)
metrics = [] metrics = []
for _ in range(n_episodes): for _ in range(n_episodes):
obs, _ = env.reset() obs, _ = env.reset()
ep = EpisodeMetrics() ep, done = EpisodeMetrics(), False
done = False
while not done: while not done:
action, _ = policy.predict(obs, deterministic=True) action, _ = policy.predict(obs, deterministic=True)
obs, reward, term, trunc, info = env.step(action) obs, reward, term, trunc, info = env.step(action)
done = term or trunc done = term or trunc
ep.accumulate(info)
ep.reward += reward ep.reward += reward
ep.revenue += info.get('revenue', 0)
ep.profit += info.get('profit', 0)
ep.coi_erosion += info.get('coi_erosion', 0)
ep.coi_leakage += info.get('coi_leakage', 0)
ep.alpha_error += abs(info['alpha_true'] - info['alpha_est'])
ep.avg_margin += info.get('avg_margin', 0)
ep.steps += 1
metrics.append(ep) metrics.append(ep)
return metrics
n = len(metrics)
def evaluate_policy(policy: Policy | Any, cfg: ExperimentConfig, n_episodes: int = 20) -> Dict[str, float]:
env = make_env(EnvConfig(n_products=cfg.n_products, max_steps=cfg.max_steps,
alpha_true=cfg.alpha_true, reward_mode=cfg.reward_mode, seed=cfg.seed + 999))
metrics = run_episodes(policy, env, n_episodes)
return { return {
'reward_mean': np.mean([m.reward for m in metrics]), 'reward_mean': np.mean([m.reward for m in metrics]), 'reward_std': np.std([m.reward for m in metrics]),
'reward_std': np.std([m.reward for m in metrics]), **{f'{k}_mean': np.mean([m.normalized()[k] for m in metrics])
'revenue_mean': np.mean([m.revenue / m.steps for m in metrics]), for k in ['revenue', 'profit', 'coi_erosion', 'coi_leakage', 'alpha_error', 'avg_margin']},
'profit_mean': np.mean([m.profit / m.steps for m in metrics]),
'coi_erosion_mean': np.mean([m.coi_erosion / m.steps for m in metrics]),
'coi_leakage_mean': np.mean([m.coi_leakage / m.steps for m in metrics]),
'alpha_error_mean': np.mean([m.alpha_error / m.steps for m in metrics]),
'margin_mean': np.mean([m.avg_margin / m.steps for m in metrics]),
} }
def run_baseline(policy: Policy, vec_env: DummyVecEnv, total_steps: int, writer: SummaryWriter | None):
obs, n_envs = vec_env.reset(), vec_env.num_envs
ep_rewards = np.zeros(n_envs)
for step in range(0, total_steps, n_envs):
actions = np.array([policy.predict(obs[i])[0] for i in range(n_envs)])
obs, rewards, dones, infos = vec_env.step(actions)
ep_rewards += rewards
for i, info in enumerate(infos):
if writer:
writer.add_scalar('economics/revenue', info.get('revenue', 0), step)
writer.add_scalar('economics/profit', info.get('profit', 0), step)
writer.add_scalar('economics/margin', info.get('avg_margin', 0), step)
writer.add_scalar('coi/erosion', info.get('coi_erosion', 0), step)
writer.add_scalar('coi/leakage', info.get('coi_leakage', 0), step)
writer.add_scalar('alpha/estimation_error', abs(info.get('alpha_true', 0) - info.get('alpha_est', 0)), step)
writer.add_scalar('agents/count', info.get('n_agents', 0), step)
if dones[i]:
if writer:
writer.add_scalar('rollout/ep_reward', ep_rewards[i], step)
ep_rewards[i] = 0
def train(cfg: ExperimentConfig) -> Dict[str, Any]: def train(cfg: ExperimentConfig) -> Dict[str, Any]:
"""Train RL agent or evaluate baseline policy."""
is_baseline = cfg.algo.lower() in ["fixed", "adaptive", "random", "myopic"] is_baseline = cfg.algo.lower() in ["fixed", "adaptive", "random", "myopic"]
if not HAS_SB3 and not is_baseline: if not HAS_SB3 and not is_baseline:
raise ImportError("stable-baselines3 required: pip install stable-baselines3[extra]") raise ImportError("stable-baselines3 required: pip install stable-baselines3[extra]")
@@ -194,85 +203,65 @@ def train(cfg: ExperimentConfig) -> Dict[str, Any]:
json.dump(asdict(cfg), f, indent=2) json.dump(asdict(cfg), f, indent=2)
writer = SummaryWriter(log_path) if HAS_TB else None writer = SummaryWriter(log_path) if HAS_TB else None
train_env = make_vec_env(cfg, cfg.n_envs) train_env, eval_env = make_vec_env(cfg, cfg.n_envs), make_vec_env(cfg, 1)
eval_env = make_vec_env(cfg, 1)
if is_baseline: if is_baseline:
policy_map = {"fixed": Policy.fixed(), "adaptive": Policy.adaptive(), policy = {"fixed": Policy.fixed, "adaptive": Policy.adaptive, "random": Policy.random, "myopic": Policy.myopic}[cfg.algo.lower()]()
"random": Policy.random(), "myopic": Policy.myopic()}
policy = policy_map[cfg.algo.lower()]
run_baseline(policy, train_env, cfg.total_timesteps, writer) run_baseline(policy, train_env, cfg.total_timesteps, writer)
final_metrics = evaluate_policy(policy, cfg) final_metrics = evaluate_policy(policy, cfg)
else: else:
algo_cls = {"ppo": PPO, "sac": SAC, "a2c": A2C}.get(cfg.algo.lower()) algo_cls = {"ppo": PPO, "sac": SAC, "a2c": A2C}[cfg.algo.lower()]
if algo_cls is None:
raise ValueError(f"unknown algo: {cfg.algo}")
common = dict(verbose=1, seed=cfg.seed, tensorboard_log=str(log_path), device="auto") common = dict(verbose=1, seed=cfg.seed, tensorboard_log=str(log_path), device="auto")
# TODO: setup hyper parameter passing to train different variations (no free lunch) model = {
if cfg.algo.lower() == "ppo": "ppo": lambda: PPO("MlpPolicy", train_env, learning_rate=3e-4, n_steps=2048, batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95, clip_range=0.2, ent_coef=0.01, **common),
model = PPO("MlpPolicy", train_env, learning_rate=3e-4, n_steps=2048, "sac": lambda: SAC("MlpPolicy", train_env, learning_rate=3e-4, buffer_size=100_000, batch_size=256, tau=0.005, gamma=0.99, **common),
batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95, "a2c": lambda: A2C("MlpPolicy", train_env, learning_rate=7e-4, n_steps=5, gamma=0.99, **common),
clip_range=0.2, ent_coef=0.01, **common) }[cfg.algo.lower()]()
elif cfg.algo.lower() == "sac":
model = SAC("MlpPolicy", train_env, learning_rate=3e-4, buffer_size=100_000,
batch_size=256, tau=0.005, gamma=0.99, **common)
else:
model = A2C("MlpPolicy", train_env, learning_rate=7e-4, n_steps=5, gamma=0.99, **common)
cb = MetricsCallback(writer) cb = MetricsCallback(writer)
eval_cb = EvalCallback(eval_env, best_model_save_path=str(log_path / "best"), eval_cb = EvalCallback(eval_env, best_model_save_path=str(log_path / "best"), log_path=str(log_path),
log_path=str(log_path), eval_freq=cfg.eval_freq, eval_freq=cfg.eval_freq, n_eval_episodes=cfg.n_eval_episodes, deterministic=True)
n_eval_episodes=cfg.n_eval_episodes, deterministic=True)
model.learn(cfg.total_timesteps, callback=[cb, eval_cb], progress_bar=True) model.learn(cfg.total_timesteps, callback=[cb, eval_cb], progress_bar=True)
model.save(log_path / "final_model") model.save(log_path / "final_model")
policy = model policy = model
final_metrics = evaluate_policy(model, cfg) final_metrics = evaluate_policy(model, cfg)
if writer: if writer:
for k, v in final_metrics.items(): log_metrics(writer, final_metrics, 'final', cfg.total_timesteps)
writer.add_scalar(f'final/{k}', v, cfg.total_timesteps)
writer.close() writer.close()
train_env.close() train_env.close(); eval_env.close()
eval_env.close()
with open(log_path / "results.json", "w") as f: with open(log_path / "results.json", "w") as f:
json.dump(final_metrics, f, indent=2) json.dump(final_metrics, f, indent=2)
return {"path": str(log_path), "metrics": final_metrics} return {"path": str(log_path), "metrics": final_metrics}
def run_baseline(policy: Policy, vec_env: DummyVecEnv, total_steps: int, writer: SummaryWriter | None): def _train_alpha(args: tuple) -> tuple[str, Dict]:
"""Run baseline policy through environment with logging.""" """Worker for parallel sweep - must be top-level for pickling."""
obs = vec_env.reset() cfg_dict, alpha = args
n_envs = vec_env.num_envs cfg_dict["alpha_true"] = alpha
ep_rewards = np.zeros(n_envs) cfg_dict["experiment_name"] = f"{cfg_dict['algo']}_a{alpha:.2f}_{cfg_dict['reward_mode']}"
all_rewards, coi_buf, alpha_buf = [], [], [] sweep_cfg = ExperimentConfig(**cfg_dict)
print(f"[alpha={alpha:.2f}] starting")
for step in range(0, total_steps, n_envs): metrics = train(sweep_cfg)["metrics"]
actions = np.array([policy.predict(obs[i])[0] for i in range(n_envs)]) print(f"[alpha={alpha:.2f}] done")
obs, rewards, dones, infos = vec_env.step(actions) return f"alpha_{alpha:.2f}", metrics
ep_rewards += rewards
for i, info in enumerate(infos):
coi_buf.append(info.get('coi_erosion', 0))
alpha_buf.append(abs(info.get('alpha_true', 0) - info.get('alpha_est', 0)))
if dones[i]:
all_rewards.append(ep_rewards[i])
ep_rewards[i] = 0
if writer and step % 1000 < n_envs and all_rewards:
writer.add_scalar('rollout/ep_rew_mean', np.mean(all_rewards[-20:]), step)
writer.add_scalar('coi/erosion', np.mean(coi_buf[-100:]), step)
writer.add_scalar('alpha/estimation_error', np.mean(alpha_buf[-100:]), step)
def run_sweep(cfg: ExperimentConfig, alphas: List[float] | None = None) -> Dict[str, Dict]: def run_sweep(cfg: ExperimentConfig, alphas: List[float] | None = None, max_workers: int | None = None) -> Dict[str, Dict]:
"""Run experiment across contamination levels for scientific comparison.""" alphas = alphas or [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
alphas = alphas or [0.0, 0.1, 0.2, 0.3, 0.4, 0.5] cfg_dict = asdict(cfg)
if max_workers == 1: # sequential fallback
results = dict(_train_alpha((cfg_dict.copy(), a)) for a in alphas)
else:
with ProcessPoolExecutor(max_workers=max_workers) as pool:
futures = {pool.submit(_train_alpha, (cfg_dict.copy(), a)): a for a in alphas}
results = {} results = {}
for alpha in alphas: for fut in as_completed(futures):
sweep_cfg = ExperimentConfig(**{**asdict(cfg), "alpha_true": alpha, key, metrics = fut.result()
"experiment_name": f"{cfg.algo}_a{alpha:.2f}_{cfg.reward_mode}"}) results[key] = metrics
print(f"\n=== α={alpha:.2f} ===")
out = train(sweep_cfg)
results[f"alpha_{alpha:.2f}"] = out["metrics"]
summary_path = Path(cfg.log_dir) / f"sweep_{cfg.algo}_{cfg.reward_mode}.json" summary_path = Path(cfg.log_dir) / f"sweep_{cfg.algo}_{cfg.reward_mode}.json"
with open(summary_path, "w") as f: with open(summary_path, "w") as f:
json.dump(results, f, indent=2) json.dump(results, f, indent=2)
@@ -280,23 +269,38 @@ def run_sweep(cfg: ExperimentConfig, alphas: List[float] | None = None) -> Dict[
return results return results
def compare_policies(cfg: ExperimentConfig, policies: List[str] | None = None) -> Dict[str, Dict]: def _train_policy(args: tuple) -> tuple[str, Dict]:
"""Compare multiple policies at same contamination level.""" """Worker for parallel policy comparison."""
cfg_dict, algo = args
cfg_dict["algo"] = algo
cfg_dict["experiment_name"] = f"cmp_{algo}_a{cfg_dict['alpha_true']:.2f}"
cmp_cfg = ExperimentConfig(**cfg_dict)
print(f"[{algo}] starting")
metrics = train(cmp_cfg)["metrics"]
print(f"[{algo}] done")
return algo, metrics
def compare_policies(cfg: ExperimentConfig, policies: List[str] | None = None, max_workers: int | None = None) -> Dict[str, Dict]:
policies = policies or ["fixed", "adaptive", "myopic", "random"] policies = policies or ["fixed", "adaptive", "myopic", "random"]
cfg_dict = asdict(cfg)
if max_workers == 1:
results = dict(_train_policy((cfg_dict.copy(), p)) for p in policies)
else:
with ProcessPoolExecutor(max_workers=max_workers) as pool:
futures = {pool.submit(_train_policy, (cfg_dict.copy(), p)): p for p in policies}
results = {} results = {}
for algo in policies: for fut in as_completed(futures):
cmp_cfg = ExperimentConfig(**{**asdict(cfg), "algo": algo, algo, metrics = fut.result()
"experiment_name": f"cmp_{algo}_a{cfg.alpha_true:.2f}"}) results[algo] = metrics
print(f"\n=== {algo} ===")
out = train(cmp_cfg)
results[algo] = out["metrics"]
cmp_path = Path(cfg.log_dir) / f"compare_a{cfg.alpha_true:.2f}.json" cmp_path = Path(cfg.log_dir) / f"compare_a{cfg.alpha_true:.2f}.json"
with open(cmp_path, "w") as f: with open(cmp_path, "w") as f:
json.dump(results, f, indent=2) json.dump(results, f, indent=2)
print(f"\nComparison saved to {cmp_path}") print(f"\nComparison saved to {cmp_path}")
for algo, m in results.items(): for algo, m in results.items():
print(f" {algo:12s}: reward={m['reward_mean']:.2f} coi_erosion={m['coi_erosion_mean']:.4f} " print(f" {algo:12s}: reward={m['reward_mean']:.2f} coi_erosion={m['coi_erosion_mean']:.4f} alpha_err={m['alpha_error_mean']:.4f}")
f"alpha_err={m['alpha_error_mean']:.4f}")
return results return results
@@ -312,6 +316,7 @@ def main():
parser.add_argument("--log-dir", default="lab/case/thesis/runs") parser.add_argument("--log-dir", default="lab/case/thesis/runs")
parser.add_argument("--sweep", action="store_true", help="run contamination sweep") parser.add_argument("--sweep", action="store_true", help="run contamination sweep")
parser.add_argument("--compare", action="store_true", help="compare all baselines") parser.add_argument("--compare", action="store_true", help="compare all baselines")
parser.add_argument("--workers", type=int, default=None, help="max parallel workers for sweep (None=auto, 1=sequential)")
args = parser.parse_args() args = parser.parse_args()
cfg = ExperimentConfig(algo=args.algo, total_timesteps=args.steps, alpha_true=args.alpha, cfg = ExperimentConfig(algo=args.algo, total_timesteps=args.steps, alpha_true=args.alpha,
@@ -319,9 +324,9 @@ def main():
n_envs=args.n_envs, seed=args.seed, log_dir=args.log_dir) n_envs=args.n_envs, seed=args.seed, log_dir=args.log_dir)
if args.sweep: if args.sweep:
run_sweep(cfg) run_sweep(cfg, max_workers=args.workers)
elif args.compare: elif args.compare:
compare_policies(cfg) compare_policies(cfg, max_workers=args.workers)
else: else:
result = train(cfg) result = train(cfg)
print(f"\nTraining complete: {result['path']}") print(f"\nTraining complete: {result['path']}")