fix: supra reward adjustment and sweep

This commit is contained in:
2026-03-16 15:58:05 +01:00
parent 43b952cf2b
commit 3439775fbd
4 changed files with 103 additions and 8 deletions

View File

@@ -54,6 +54,9 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]:
coi_levels: list[float] = [] coi_levels: list[float] = []
coi_leakages: list[float] = [] coi_leakages: list[float] = []
volatilities: list[float] = [] volatilities: list[float] = []
upward_volatilities: list[float] = []
supra_shares: list[float] = []
supra_penalties: list[float] = []
agent_probs: list[float] = [] agent_probs: list[float] = []
for _ in range(int(episodes)): for _ in range(int(episodes)):
@@ -65,6 +68,9 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]:
ep_coi = 0.0 ep_coi = 0.0
ep_coi_leakage = 0.0 ep_coi_leakage = 0.0
ep_volatility = 0.0 ep_volatility = 0.0
ep_upward_volatility = 0.0
ep_supra_share = 0.0
ep_supra_penalty = 0.0
ep_agent_prob = 0.0 ep_agent_prob = 0.0
steps = 0 steps = 0
@@ -78,6 +84,15 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]:
ep_coi += float(econ.get("coi_level", 0.0)) ep_coi += float(econ.get("coi_level", 0.0))
ep_coi_leakage += float(econ.get("coi_leakage", 0.0)) ep_coi_leakage += float(econ.get("coi_leakage", 0.0))
ep_volatility += float(econ.get("volatility", 0.0)) ep_volatility += float(econ.get("volatility", 0.0))
ep_upward_volatility += float(
info.get("upward_volatility", econ.get("upward_volatility", 0.0))
)
ep_supra_share += float(
info.get("supra_share", econ.get("supra_share", 0.0))
)
ep_supra_penalty += float(
info.get("supra_penalty", econ.get("supra_penalty", 0.0))
)
ep_agent_prob += float(econ.get("agent_prob", info.get("agent_prob", 0.0))) ep_agent_prob += float(econ.get("agent_prob", info.get("agent_prob", 0.0)))
steps += 1 steps += 1
@@ -88,6 +103,9 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]:
coi_levels.append(ep_coi / denom) coi_levels.append(ep_coi / denom)
coi_leakages.append(ep_coi_leakage / denom) coi_leakages.append(ep_coi_leakage / denom)
volatilities.append(ep_volatility / denom) volatilities.append(ep_volatility / denom)
upward_volatilities.append(ep_upward_volatility / denom)
supra_shares.append(ep_supra_share / denom)
supra_penalties.append(ep_supra_penalty / denom)
agent_probs.append(ep_agent_prob / denom) agent_probs.append(ep_agent_prob / denom)
return { return {
@@ -99,6 +117,13 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]:
"eval/coi_level_mean": float(np.mean(coi_levels)) if coi_levels else 0.0, "eval/coi_level_mean": float(np.mean(coi_levels)) if coi_levels else 0.0,
"eval/coi_leakage_mean": float(np.mean(coi_leakages)) if coi_leakages else 0.0, "eval/coi_leakage_mean": float(np.mean(coi_leakages)) if coi_leakages else 0.0,
"eval/volatility_mean": float(np.mean(volatilities)) if volatilities else 0.0, "eval/volatility_mean": float(np.mean(volatilities)) if volatilities else 0.0,
"eval/upward_volatility_mean": (
float(np.mean(upward_volatilities)) if upward_volatilities else 0.0
),
"eval/supra_share_mean": float(np.mean(supra_shares)) if supra_shares else 0.0,
"eval/supra_penalty_mean": (
float(np.mean(supra_penalties)) if supra_penalties else 0.0
),
"eval/agent_prob_mean": float(np.mean(agent_probs)) if agent_probs else 0.0, "eval/agent_prob_mean": float(np.mean(agent_probs)) if agent_probs else 0.0,
} }

View File

@@ -71,6 +71,10 @@ class EconomicMetricsWrapper(gym.Wrapper):
"coi_penalty", "coi_penalty",
"ux_penalty", "ux_penalty",
"volatility", "volatility",
"upward_volatility",
"supra_penalty",
"supra_share",
"competitive_anchor",
"profit", "profit",
"cost_floor", "cost_floor",
"reward_revenue", "reward_revenue",

View File

@@ -0,0 +1,53 @@
# Hyperparameter sweep targeting the supra-competitive pricing share.
# The layout appears to follow the Weights & Biases sweep schema
# (method / metric / run_cap / command / parameters) — confirm against the
# launcher actually used.

# Random search: each run samples one value per parameter listed below.
method: random

# Objective: the mean share of prices above the competitive anchor, as
# reported by the evaluation loop under this key; lower is better.
metric:
  name: eval/supra_share_mean
  goal: minimize

# Upper bound on the number of runs. The full cross product of the `values`
# lists below is 3 * 5 * 2 * 2 * 3 * 3 * 5 = 2700 combinations, so only a
# random subset is explored.
run_cap: 256

# NOTE(review): the command has no `${args}` macro, so the sampled parameters
# are presumably read from the run config inside `engine.train` rather than
# from the command line — confirm.
command:
  - ${env}
  - python
  - -m
  - engine.train

parameters:
  # --- fixed algorithm choice -------------------------------------------
  algo:
    value: ppo

  # --- swept dimensions -------------------------------------------------
  seed:
    values: [42, 1337, 7777]
  alpha:
    values: [0.1, 0.2, 0.3, 0.4, 0.6]
  n_products:
    values: [25, 50]
  N:
    value: 100
  # NOTE(review): when no_robust is true the robust_* settings below are
  # presumably ignored, which would make those samples redundant — confirm.
  no_robust:
    values: [false, true]
  # Weight of the COI leakage penalty in the reward.
  lambda_coi:
    values: [0.05, 0.15, 0.3]
  robust_radius:
    values: [0.1, 0.2, 0.3]
  robust_points:
    value: 7
  robust_rollouts:
    value: 1
  # Weight of the UX (volatility) penalty; the supra penalty in the reward
  # is scaled by the same coefficient.
  eta_ux:
    values: [0.05, 0.15, 0.3, 0.5, 0.75]
  reward_profit_weight:
    value: 1.0

  # --- training / evaluation schedule (fixed) ---------------------------
  total_timesteps:
    value: 100000
  eval_episodes:
    value: 10
  eval_freq:
    value: 1000
  log_freq:
    value: 100
  hist_freq:
    value: 500

  # --- optimiser settings (fixed) ---------------------------------------
  learning_rate:
    value: 0.0003
  batch_size:
    value: 256
  n_steps:
    value: 2048
  device:
    value: cpu

View File

@@ -216,18 +216,27 @@ class PHANTOM(gym.Env):
coi_penalty = self.lambda_coi * coi_leakage * info_budget coi_penalty = self.lambda_coi * coi_leakage * info_budget
if len(self._price_history) > 0: if len(self._price_history) > 0:
volatility = float( prev_prices = np.asarray(self._price_history[-1], dtype=float)
np.mean( rel_change = (prices - prev_prices) / np.maximum(prev_prices, 1.0)
np.abs(prices - self._price_history[-1]) volatility = float(np.mean(np.abs(rel_change)))
/ np.maximum(self.baseline_prices, 1.0) upward_volatility = float(np.mean(np.clip(rel_change, 0.0, None)))
)
)
else: else:
volatility = 0.0 volatility = 0.0
ux_penalty = self.eta_ux * info_budget * volatility upward_volatility = 0.0
ux_penalty = self.eta_ux * info_budget * (volatility + 0.5 * upward_volatility)
competitive_anchor = float(
np.clip(float(self.human_params[0]) * 1.2, *self.price_bounds)
)
price_ratio = prices / max(competitive_anchor, 1.0)
supra_excess = np.clip(price_ratio - 1.0, 0.0, None)
supra_penalty = (
0.5 * self.eta_ux * info_budget * float(np.mean(np.square(supra_excess)))
)
supra_share = float(np.mean(supra_excess > 0.0))
reward_revenue = self.reward_profit_weight * profit reward_revenue = self.reward_profit_weight * profit
reward = reward_revenue - coi_penalty - ux_penalty reward = reward_revenue - coi_penalty - ux_penalty - supra_penalty
return reward, { return reward, {
"revenue": revenue, "revenue": revenue,
@@ -240,6 +249,10 @@ class PHANTOM(gym.Env):
"coi_info_budget": info_budget, "coi_info_budget": info_budget,
"ux_penalty": ux_penalty, "ux_penalty": ux_penalty,
"volatility": volatility, "volatility": volatility,
"upward_volatility": upward_volatility,
"supra_penalty": supra_penalty,
"supra_share": supra_share,
"competitive_anchor": competitive_anchor,
"reward_revenue": reward_revenue, "reward_revenue": reward_revenue,
"reward_total": reward, "reward_total": reward,
} }