feature: telemetry logging

This commit is contained in:
2026-03-10 14:23:17 +01:00
parent be03b2d4d5
commit 4c7d911043
14 changed files with 454 additions and 104 deletions

View File

@@ -47,8 +47,10 @@ class PHANTOM(gym.Env):
coi_window: int = 10,
robust_radius: float = 0.0,
robust_points: int = 5,
robust_rollouts: int = 1,
info_value: float = 1.0,
eta_ux: float = 0.5,
reward_profit_weight: float = 1.0,
action_levels: int = 9,
action_scale_low: float = 0.9,
action_scale_high: float = 1.1,
@@ -75,8 +77,10 @@ class PHANTOM(gym.Env):
self.agent_params = agent_params
self.robust_radius = max(0.0, float(robust_radius))
self.robust_points = max(1, int(robust_points))
self.robust_rollouts = max(1, int(robust_rollouts))
self.info_value = float(info_value)
self.eta_ux = float(eta_ux)
self.reward_profit_weight = float(reward_profit_weight)
self.action_levels = max(2, int(action_levels))
self._action_scales = np.linspace(
float(action_scale_low), float(action_scale_high), self.action_levels
@@ -105,6 +109,12 @@ class PHANTOM(gym.Env):
shape=(n_products,),
dtype=np.float32,
),
"signals": spaces.Box(
low=np.array([0.0, 0.0, 0.0, 0.0], dtype=np.float32),
high=np.array([1.0, 1.0, 1.0, 1.0], dtype=np.float32),
shape=(4,),
dtype=np.float32,
),
}
)
@@ -119,6 +129,8 @@ class PHANTOM(gym.Env):
self._trajectories = [] # session trajectories for agent prob calculation
self.baseline_prices = np.full(self.n_products, self.price_bounds[0])
self._low_margin_streak = 0 # consecutive steps below margin_floor
self._last_agent_prob = float(self.alpha)
self._last_alpha_adv = float(self.alpha)
# load behavioral models for agent probability estimation
try:
@@ -131,7 +143,20 @@ class PHANTOM(gym.Env):
demand_arr = np.array(
[self._demand.get(i, 0.0) for i in range(self.n_products)], dtype=np.float32
)
return {"demand": demand_arr, "prices": self._prices.astype(np.float32)}
signals = np.array(
[
float(np.clip(self._last_agent_prob, 0.0, 1.0)),
float(np.clip(self._last_alpha_adv, 0.0, 1.0)),
float(np.clip(self.nominal_alpha, 0.0, 1.0)),
float(np.clip(self.robust_radius, 0.0, 1.0)),
],
dtype=np.float32,
)
return {
"demand": demand_arr,
"prices": self._prices.astype(np.float32),
"signals": signals,
}
def _set_market_mix(self, alpha: float):
alpha = float(np.clip(alpha, 0.0, 1.0))
@@ -179,15 +204,15 @@ class PHANTOM(gym.Env):
[demand.get(i, 0.0) for i in range(self.n_products)], dtype=float
)
revenue = float(np.dot(prices, demand_arr))
floor_cost = float(np.dot(self.baseline_prices, demand_arr))
profit = revenue - floor_cost
purchases = extract_purchases(trajectories)
coi_mix = compute_uplift_coi(prices, purchases, self.baseline_prices)
# multiplicative penalty so COI term scales with revenue magnitude
coi_leakage = float(agent_prob * self.info_value)
discount = float(np.clip(1.0 - self.lambda_coi * coi_leakage, 0.0, 1.0))
coi_penalty = revenue * (1.0 - discount) # absolute penalty in revenue units
info_budget = max(floor_cost, 1.0)
coi_penalty = self.lambda_coi * coi_leakage * info_budget
# calculate UX penalty based on price volatility
if len(self._price_history) > 0:
volatility = float(
np.mean(
@@ -197,19 +222,24 @@ class PHANTOM(gym.Env):
)
else:
volatility = 0.0
ux_penalty = self.eta_ux * revenue * volatility
ux_penalty = self.eta_ux * info_budget * volatility
reward = revenue * discount - ux_penalty
reward_revenue = self.reward_profit_weight * profit
reward = reward_revenue - coi_penalty - ux_penalty
return reward, {
"revenue": revenue,
"cost_floor": floor_cost,
"profit": profit,
"coi_mix": float(coi_mix),
"coi_base": 0.0,
"coi_leakage": coi_leakage,
"coi_penalty": coi_penalty,
"coi_discount": discount,
"coi_info_budget": info_budget,
"ux_penalty": ux_penalty,
"volatility": volatility,
"reward_revenue": reward_revenue,
"reward_total": reward,
}
def _alpha_candidates(self) -> np.ndarray:
@@ -219,35 +249,26 @@ class PHANTOM(gym.Env):
hi = min(1.0, self.nominal_alpha + self.robust_radius)
return np.linspace(lo, hi, self.robust_points)
def _evaluate_candidate(self, alpha: float, prices: np.ndarray) -> float:
    """Score one candidate market mix by its mean reward over several rollouts.

    The market mix is set to ``alpha`` once, then the market is sampled
    ``self.robust_rollouts`` times at ``prices``. Each sample is scored with
    ``self._compute_reward`` and the scores are averaged.

    Args:
        alpha: candidate mix value, passed to ``self._set_market_mix``.
        prices: price vector handed to ``self.market.act`` on every rollout.

    Returns:
        Mean rollout reward as a plain float; ``0.0`` if no rollout ran.

    Note:
        Only the scalar score is returned. Demand, trajectories and agent
        probability from these rollouts are discarded, so the caller must
        sample the market again for the outcome it actually uses.
    """
    self._set_market_mix(alpha)
    rewards = []
    for _ in range(self.robust_rollouts):
        demand = self.market.act(prices)
        # copy: the market may reuse or overwrite its trajectory buffer
        trajectories = list(self.market.last_trajectories)
        agent_prob = self._compute_agent_prob(trajectories)
        reward, _metrics = self._compute_reward(
            prices, demand, agent_prob, trajectories
        )
        rewards.append(float(reward))
    # guard keeps np.mean from warning / returning NaN on an empty list
    return float(np.mean(rewards)) if rewards else 0.0
def _select_adversarial_alpha(self, prices: np.ndarray) -> float:
    """Inner robust step: evaluate candidates and pick the worst-case alpha.

    Every value from ``self._alpha_candidates()`` is scored with
    ``self._evaluate_candidate`` at the given ``prices``; the candidate with
    the lowest score is returned.

    Args:
        prices: price vector forwarded unchanged to each candidate evaluation.

    Returns:
        The candidate alpha (as a float) with the minimum score. On ties the
        earliest candidate wins, since ``min`` keeps the first minimal item.
    """
    candidates = self._alpha_candidates()
    evaluations = [
        (float(alpha), self._evaluate_candidate(float(alpha), prices))
        for alpha in candidates
    ]
    # min over alpha in Wasserstein interval; element 1 of each pair is the score
    best_alpha, _ = min(evaluations, key=lambda pair: pair[1])
    return best_alpha
def _record_history(self):
demand_arr = np.array(
@@ -270,19 +291,24 @@ class PHANTOM(gym.Env):
self._low_margin_streak = 0
self._demand_history, self._price_history, self._revenue_history = [], [], []
self._trajectories = list(getattr(self.market, "last_trajectories", []))
self._last_agent_prob = float(self.nominal_alpha)
self._last_alpha_adv = float(self.nominal_alpha)
self._record_history()
return self._get_obs(), {}
def step(self, action):
self._prices = self._decode_action(action)
# inner robust step returns worst-case outcome directly, no re-sampling
alpha_adv, self._demand, trajectories, agent_prob = (
self._select_adversarial_alpha(self._prices)
)
alpha_adv = self._select_adversarial_alpha(self._prices)
self._set_market_mix(alpha_adv)
self._platform_stub.set_prices(self._prices)
self._step_count += 1
self._demand = self.market.act(self._prices)
trajectories = list(self.market.last_trajectories)
agent_prob = self._compute_agent_prob(trajectories)
self._trajectories.extend(trajectories)
self._last_agent_prob = float(agent_prob)
self._last_alpha_adv = float(alpha_adv)
reward, metrics = self._compute_reward(
self._prices, self._demand, agent_prob, trajectories
@@ -304,7 +330,9 @@ class PHANTOM(gym.Env):
"step": self._step_count,
"agent_prob": agent_prob,
"alpha_adv": float(alpha_adv),
"alpha_nominal": float(self.nominal_alpha),
"wasserstein_radius": float(self.robust_radius),
"robust_rollouts": int(self.robust_rollouts),
**metrics,
"raw_revenue": np.sum(
self._prices