chore: cleaning some code

2026-07-16 01:53:37 +00:00 · 2026-02-28 23:30:16 +01:00
parent 233ce3be34
commit 803e3a2972
6 changed files with 81 additions and 30 deletions
--- a/engine/wrapper.py
+++ b/engine/wrapper.py
@@ -48,6 +48,7 @@ class PHANTOM(gym.Env):
        robust_radius: float = 0.0,
        robust_points: int = 5,
        info_value: float = 1.0,
+        eta_ux: float = 0.5,
        action_levels: int = 9,
        action_scale_low: float = 0.9,
        action_scale_high: float = 1.1,
@@ -75,6 +76,7 @@ class PHANTOM(gym.Env):
        self.robust_radius = max(0.0, float(robust_radius))
        self.robust_points = max(1, int(robust_points))
        self.info_value = float(info_value)
+        self.eta_ux = float(eta_ux)
        self.action_levels = max(2, int(action_levels))
        self._action_scales = np.linspace(
            float(action_scale_low), float(action_scale_high), self.action_levels
@@ -179,11 +181,26 @@ class PHANTOM(gym.Env):
        revenue = float(np.dot(prices, demand_arr))
        purchases = extract_purchases(trajectories)
        coi_mix = compute_uplift_coi(prices, purchases, self.baseline_prices)
+
        # multiplicative penalty so COI term scales with revenue magnitude
        coi_leakage = float(agent_prob * self.info_value)
        discount = float(np.clip(1.0 - self.lambda_coi * coi_leakage, 0.0, 1.0))
        coi_penalty = revenue * (1.0 - discount)  # absolute penalty in revenue units
-        reward = revenue * discount
+
+        # calculate UX penalty based on price volatility
+        if len(self._price_history) > 0:
+            volatility = float(
+                np.mean(
+                    np.abs(prices - self._price_history[-1])
+                    / np.maximum(self.baseline_prices, 1.0)
+                )
+            )
+        else:
+            volatility = 0.0
+        ux_penalty = self.eta_ux * revenue * volatility
+
+        reward = revenue * discount - ux_penalty
+
        return reward, {
            "revenue": revenue,
            "coi_mix": float(coi_mix),
@@ -191,6 +208,8 @@ class PHANTOM(gym.Env):
            "coi_leakage": coi_leakage,
            "coi_penalty": coi_penalty,
            "coi_discount": discount,
+            "ux_penalty": ux_penalty,
+            "volatility": volatility,
        }

    def _alpha_candidates(self) -> np.ndarray:
@@ -200,27 +219,34 @@ class PHANTOM(gym.Env):
        hi = min(1.0, self.nominal_alpha + self.robust_radius)
        return np.linspace(lo, hi, self.robust_points)

+    def _evaluate_candidate(
+        self, alpha: float, prices: np.ndarray
+    ) -> tuple[float, dict, list, float]:  # (reward, demand, trajectories, agent_prob)
+        self._set_market_mix(alpha)  # presumably reconfigures self.market for this alpha
+        demand = self.market.act(prices)
+        trajectories = list(self.market.last_trajectories)  # fresh list taken right after act()
+        agent_prob = self._compute_agent_prob(trajectories)
+        reward, _ = self._compute_reward(prices, demand, agent_prob, trajectories)
+        return reward, demand, trajectories, agent_prob  # reward's info dict is dropped
+
    def _select_adversarial_alpha(
        self, prices: np.ndarray
    ) -> tuple[float, dict, list, float]:
-        """inner robust step: pick worst-case alpha and return its outcome directly to avoid double-sampling"""
+        """inner robust step: evaluate candidates and pick worst-case alpha"""
        candidates = self._alpha_candidates()
-        best_alpha, worst_reward = float(candidates[0]), np.inf
-        best_demand, best_trajectories, best_agent_prob = None, [], 0.0
-        for alpha in candidates:
-            self._set_market_mix(float(alpha))
-            demand = self.market.act(prices)
-            trajectories = list(self.market.last_trajectories)
-            agent_prob = self._compute_agent_prob(trajectories)
-            reward, _ = self._compute_reward(prices, demand, agent_prob, trajectories)
-            if reward < worst_reward:
-                worst_reward = reward
-                best_alpha, best_demand, best_trajectories, best_agent_prob = (
-                    float(alpha),
-                    demand,
-                    trajectories,
-                    agent_prob,
-                )
+        evaluations = [
+            (alpha, *self._evaluate_candidate(float(alpha), prices))
+            for alpha in candidates
+        ]
+
+        # min over alpha in Wasserstein interval
+        best_eval = min(evaluations, key=lambda x: x[1])  # index 1 is reward
+
+        best_alpha = best_eval[0]
+        best_demand = best_eval[2]
+        best_trajectories = best_eval[3]
+        best_agent_prob = best_eval[4]
+
        return best_alpha, best_demand, best_trajectories, best_agent_prob

    def _record_history(self):