diff --git a/engine/backends/common.py b/engine/backends/common.py index 45f03e7..f754342 100644 --- a/engine/backends/common.py +++ b/engine/backends/common.py @@ -54,6 +54,9 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]: coi_levels: list[float] = [] coi_leakages: list[float] = [] volatilities: list[float] = [] + upward_volatilities: list[float] = [] + supra_shares: list[float] = [] + supra_penalties: list[float] = [] agent_probs: list[float] = [] for _ in range(int(episodes)): @@ -65,6 +68,9 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]: ep_coi = 0.0 ep_coi_leakage = 0.0 ep_volatility = 0.0 + ep_upward_volatility = 0.0 + ep_supra_share = 0.0 + ep_supra_penalty = 0.0 ep_agent_prob = 0.0 steps = 0 @@ -78,6 +84,15 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]: ep_coi += float(econ.get("coi_level", 0.0)) ep_coi_leakage += float(econ.get("coi_leakage", 0.0)) ep_volatility += float(econ.get("volatility", 0.0)) + ep_upward_volatility += float( + info.get("upward_volatility", econ.get("upward_volatility", 0.0)) + ) + ep_supra_share += float( + info.get("supra_share", econ.get("supra_share", 0.0)) + ) + ep_supra_penalty += float( + info.get("supra_penalty", econ.get("supra_penalty", 0.0)) + ) ep_agent_prob += float(econ.get("agent_prob", info.get("agent_prob", 0.0))) steps += 1 @@ -88,6 +103,9 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]: coi_levels.append(ep_coi / denom) coi_leakages.append(ep_coi_leakage / denom) volatilities.append(ep_volatility / denom) + upward_volatilities.append(ep_upward_volatility / denom) + supra_shares.append(ep_supra_share / denom) + supra_penalties.append(ep_supra_penalty / denom) agent_probs.append(ep_agent_prob / denom) return { @@ -99,6 +117,13 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]: "eval/coi_level_mean": float(np.mean(coi_levels)) if coi_levels else 0.0, "eval/coi_leakage_mean": float(np.mean(coi_leakages)) if coi_leakages else 0.0, "eval/volatility_mean": float(np.mean(volatilities)) if volatilities else 0.0, + "eval/upward_volatility_mean": ( + float(np.mean(upward_volatilities)) if upward_volatilities else 0.0 + ), + "eval/supra_share_mean": float(np.mean(supra_shares)) if supra_shares else 0.0, + "eval/supra_penalty_mean": ( + float(np.mean(supra_penalties)) if supra_penalties else 0.0 + ), "eval/agent_prob_mean": float(np.mean(agent_probs)) if agent_probs else 0.0, } diff --git a/engine/lib/wrappers.py b/engine/lib/wrappers.py index 4cfd706..dcb4fd1 100644 --- a/engine/lib/wrappers.py +++ b/engine/lib/wrappers.py @@ -71,6 +71,10 @@ class EconomicMetricsWrapper(gym.Wrapper): "coi_penalty", "ux_penalty", "volatility", + "upward_volatility", + "supra_penalty", + "supra_share", + "competitive_anchor", "profit", "cost_floor", "reward_revenue", diff --git a/engine/sweeps/ppo_supra_guard.yaml b/engine/sweeps/ppo_supra_guard.yaml new file mode 100644 index 0000000..05131be --- /dev/null +++ b/engine/sweeps/ppo_supra_guard.yaml @@ -0,0 +1,53 @@ +method: random +metric: + name: eval/supra_share_mean + goal: minimize +run_cap: 256 +command: + - ${env} + - python + - -m + - engine.train +parameters: + algo: + value: ppo + seed: + values: [42, 1337, 7777] + alpha: + values: [0.1, 0.2, 0.3, 0.4, 0.6] + n_products: + values: [25, 50] + N: + value: 100 + no_robust: + values: [false, true] + lambda_coi: + values: [0.05, 0.15, 0.3] + robust_radius: + values: [0.1, 0.2, 0.3] + robust_points: + value: 7 + robust_rollouts: + value: 1 + eta_ux: + values: [0.05, 0.15, 0.3, 0.5, 0.75] + reward_profit_weight: + value: 1.0 + total_timesteps: + value: 100000 + eval_episodes: + value: 10 + eval_freq: + value: 1000 + log_freq: + value: 100 + hist_freq: + value: 500 + learning_rate: + value: 0.0003 + batch_size: + value: 256 + n_steps: + value: 2048 + device: + value: cpu diff --git a/engine/wrapper.py b/engine/wrapper.py index 2786780..1748617 100644 --- a/engine/wrapper.py +++ b/engine/wrapper.py @@ -216,18 +216,27 @@ class PHANTOM(gym.Env): coi_penalty = self.lambda_coi * coi_leakage * info_budget if len(self._price_history) > 0: - volatility = float( - np.mean( - np.abs(prices - self._price_history[-1]) - / np.maximum(self.baseline_prices, 1.0) - ) - ) + prev_prices = np.asarray(self._price_history[-1], dtype=float) + rel_change = (prices - prev_prices) / np.maximum(prev_prices, 1.0) + volatility = float(np.mean(np.abs(rel_change))) + upward_volatility = float(np.mean(np.clip(rel_change, 0.0, None))) else: volatility = 0.0 - ux_penalty = self.eta_ux * info_budget * volatility + upward_volatility = 0.0 + ux_penalty = self.eta_ux * info_budget * (volatility + 0.5 * upward_volatility) + + competitive_anchor = float( + np.clip(float(self.human_params[0]) * 1.2, *self.price_bounds) + ) + price_ratio = prices / max(competitive_anchor, 1.0) + supra_excess = np.clip(price_ratio - 1.0, 0.0, None) + supra_penalty = ( + 0.5 * self.eta_ux * info_budget * float(np.mean(np.square(supra_excess))) + ) + supra_share = float(np.mean(supra_excess > 0.0)) reward_revenue = self.reward_profit_weight * profit - reward = reward_revenue - coi_penalty - ux_penalty + reward = reward_revenue - coi_penalty - ux_penalty - supra_penalty return reward, { "revenue": revenue, @@ -240,6 +249,10 @@ class PHANTOM(gym.Env): "coi_info_budget": info_budget, "ux_penalty": ux_penalty, "volatility": volatility, + "upward_volatility": upward_volatility, + "supra_penalty": supra_penalty, + "supra_share": supra_share, + "competitive_anchor": competitive_anchor, "reward_revenue": reward_revenue, "reward_total": reward, }