From 9b133cddfd67a30a4649ad6bcb93acf225d9e845 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Sun, 15 Feb 2026 17:15:25 +0100
Subject: [PATCH] introduce penalized sessions to episodes

---
 engine/lib/__init__.py |  8 +++++-
 engine/wrapper.py      | 58 ++++++++++++++++++++++++++++++++----------
 2 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/engine/lib/__init__.py b/engine/lib/__init__.py
index 2a56747..874db63 100644
--- a/engine/lib/__init__.py
+++ b/engine/lib/__init__.py
@@ -3,6 +3,12 @@ from .behavior import sample_behavior, get_transition_models, trajectory_to_even
 from .render import DashboardRenderer, style_axis
 from .wrappers import EconomicMetricsWrapper
 from .callbacks import MetricsCallback, EvalMetricsCallback
-from .providers import ProviderBenchmark, ProviderResult, BenchmarkConfig
+from .providers import (
+    ProviderBenchmark,
+    ProviderResult,
+    BenchmarkConfig,
+    RandomBaseline,
+    SurgeBaseline,
+)
 from .coi import compute_uplift_coi, extract_purchases, compute_agent_probability
 from .discrete import EventQTable
diff --git a/engine/wrapper.py b/engine/wrapper.py
index 22e958b..3e37d9a 100644
--- a/engine/wrapper.py
+++ b/engine/wrapper.py
@@ -51,6 +51,9 @@ class PHANTOM(gym.Env):
         action_levels: int = 9,
         action_scale_low: float = 0.9,
         action_scale_high: float = 1.1,
+        max_steps: int = 100,  # episode ends once the step count reaches this
+        margin_floor: float = 0.05,  # steps with avg margin below this count as low
+        margin_floor_patience: int = 5,  # consecutive low steps before terminating
         render_mode: str = None,
     ):
         super().__init__()
@@ -58,6 +61,11 @@ class PHANTOM(gym.Env):
         self.price_bounds = price_bounds
         self.lambda_coi = lambda_coi
         self.coi_window = coi_window
+        self.max_steps = max(1, int(max_steps))  # clamp: at least one step
+        self.margin_floor = float(
+            margin_floor
+        )  # terminate once avg margin is below this for margin_floor_patience steps
+        self.margin_floor_patience = max(1, int(margin_floor_patience))  # clamp: >= 1
         self.render_mode = render_mode
         self.alpha = float(alpha)
         self.nominal_alpha = float(alpha)
@@ -108,6 +116,7 @@ class PHANTOM(gym.Env):
         self._initial_episode_prices = None
         self._trajectories = []  # session trajectories for agent prob calculation
         self.baseline_prices = np.full(self.n_products, self.price_bounds[0])
+        self._low_margin_streak = 0  # steps in a row under margin_floor (see step())
 
         # load behavioral models for agent probability estimation
         try:
@@ -170,14 +179,18 @@ class PHANTOM(gym.Env):
         revenue = float(np.dot(prices, demand_arr))
         purchases = extract_purchases(trajectories)
         coi_mix = compute_uplift_coi(prices, purchases, self.baseline_prices)
+        # COI acts as a discount factor on revenue, so the penalty scales with revenue
         coi_leakage = float(agent_prob * self.info_value)
-        coi_penalty = float(self.lambda_coi * coi_leakage)
-        return float(revenue - coi_penalty), {
+        discount = float(np.clip(1.0 - self.lambda_coi * coi_leakage, 0.0, 1.0))
+        coi_penalty = revenue * (1.0 - discount)  # absolute penalty in revenue units
+        reward = revenue * discount  # equals revenue - coi_penalty
+        return reward, {
             "revenue": revenue,
             "coi_mix": float(coi_mix),
             "coi_base": 0.0,
             "coi_leakage": coi_leakage,
             "coi_penalty": coi_penalty,
+            "coi_discount": discount,  # revenue multiplier, clipped to [0, 1]
         }
 
     def _alpha_candidates(self) -> np.ndarray:
@@ -187,21 +200,28 @@ class PHANTOM(gym.Env):
         hi = min(1.0, self.nominal_alpha + self.robust_radius)
         return np.linspace(lo, hi, self.robust_points)
 
-    def _select_adversarial_alpha(self, prices: np.ndarray) -> float:
+    def _select_adversarial_alpha(
+        self, prices: np.ndarray
+    ) -> tuple[float, dict, list, float]:
+        """Sample each candidate alpha once and return the lowest-reward outcome."""
         candidates = self._alpha_candidates()
-        if len(candidates) == 1:
-            return float(candidates[0])
         best_alpha, worst_reward = float(candidates[0]), np.inf
+        best_demand, best_trajectories, best_agent_prob = None, [], 0.0
         for alpha in candidates:
             self._set_market_mix(float(alpha))
             demand = self.market.act(prices)
-            trajectories = self.market.last_trajectories
+            trajectories = list(self.market.last_trajectories)
             agent_prob = self._compute_agent_prob(trajectories)
             reward, _ = self._compute_reward(prices, demand, agent_prob, trajectories)
-            if reward < worst_reward:
+            if best_demand is None or reward < worst_reward:  # seed even if NaN/inf
                 worst_reward = reward
-                best_alpha = float(alpha)
-        return best_alpha
+                best_alpha, best_demand, best_trajectories, best_agent_prob = (
+                    float(alpha),
+                    demand,
+                    trajectories,
+                    agent_prob,
+                )
+        return best_alpha, best_demand, best_trajectories, best_agent_prob
 
     def _record_history(self):
         demand_arr = np.array(
@@ -221,6 +241,7 @@ class PHANTOM(gym.Env):
         self._demand = self._limbo.step()
         self._initial_episode_prices = self._prices.copy()
         self._step_count = 0
+        self._low_margin_streak = 0  # cleared together with the step counter
         self._demand_history, self._price_history, self._revenue_history = [], [], []
         self._trajectories = list(getattr(self.market, "last_trajectories", []))
         self._record_history()
@@ -228,21 +249,30 @@ class PHANTOM(gym.Env):
 
     def step(self, action):
         self._prices = self._decode_action(action)
-        alpha_adv = self._select_adversarial_alpha(self._prices)
+        # reuse the robust step's worst-case sample instead of re-sampling via _limbo
+        alpha_adv, self._demand, trajectories, agent_prob = (
+            self._select_adversarial_alpha(self._prices)
+        )
         self._set_market_mix(alpha_adv)
         self._platform_stub.set_prices(self._prices)
-        self._limbo.step()
-        self._demand = self._limbo.step()
-        trajectories = getattr(self.market, "last_trajectories", [])
         self._step_count += 1
         self._trajectories.extend(trajectories)
 
-        agent_prob = self._compute_agent_prob(trajectories)
         reward, metrics = self._compute_reward(
             self._prices, self._demand, agent_prob, trajectories
         )
         self._record_history()
-        terminated = self._step_count >= 100
+
+        # margin = (mean price - price_bounds[0]) / mean price; stop if it stays low
+        avg_margin = float(np.mean(self._prices) - self.price_bounds[0]) / max(
+            float(np.mean(self._prices)), 1e-6
+        )
+        if avg_margin < self.margin_floor:
+            self._low_margin_streak += 1
+        else:
+            self._low_margin_streak = 0
+        margin_collapsed = self._low_margin_streak >= self.margin_floor_patience
+        terminated = self._step_count >= self.max_steps or margin_collapsed
 
         info = {
             "step": self._step_count,