introduce penalized sessions to episodes

This commit is contained in:
2026-02-15 17:15:25 +01:00
parent ded7290935
commit 9b133cddfd
2 changed files with 51 additions and 15 deletions

View File

@@ -3,6 +3,12 @@ from .behavior import sample_behavior, get_transition_models, trajectory_to_even
from .render import DashboardRenderer, style_axis from .render import DashboardRenderer, style_axis
from .wrappers import EconomicMetricsWrapper from .wrappers import EconomicMetricsWrapper
from .callbacks import MetricsCallback, EvalMetricsCallback from .callbacks import MetricsCallback, EvalMetricsCallback
from .providers import ProviderBenchmark, ProviderResult, BenchmarkConfig from .providers import (
ProviderBenchmark,
ProviderResult,
BenchmarkConfig,
RandomBaseline,
SurgeBaseline,
)
from .coi import compute_uplift_coi, extract_purchases, compute_agent_probability from .coi import compute_uplift_coi, extract_purchases, compute_agent_probability
from .discrete import EventQTable from .discrete import EventQTable

View File

@@ -51,6 +51,9 @@ class PHANTOM(gym.Env):
action_levels: int = 9, action_levels: int = 9,
action_scale_low: float = 0.9, action_scale_low: float = 0.9,
action_scale_high: float = 1.1, action_scale_high: float = 1.1,
max_steps: int = 100,
margin_floor: float = 0.05,
margin_floor_patience: int = 5,
render_mode: str = None, render_mode: str = None,
): ):
super().__init__() super().__init__()
@@ -58,6 +61,11 @@ class PHANTOM(gym.Env):
self.price_bounds = price_bounds self.price_bounds = price_bounds
self.lambda_coi = lambda_coi self.lambda_coi = lambda_coi
self.coi_window = coi_window self.coi_window = coi_window
self.max_steps = max(1, int(max_steps))
self.margin_floor = float(
margin_floor
) # terminate if avg margin stays below this for patience steps
self.margin_floor_patience = max(1, int(margin_floor_patience))
self.render_mode = render_mode self.render_mode = render_mode
self.alpha = float(alpha) self.alpha = float(alpha)
self.nominal_alpha = float(alpha) self.nominal_alpha = float(alpha)
@@ -108,6 +116,7 @@ class PHANTOM(gym.Env):
self._initial_episode_prices = None self._initial_episode_prices = None
self._trajectories = [] # session trajectories for agent prob calculation self._trajectories = [] # session trajectories for agent prob calculation
self.baseline_prices = np.full(self.n_products, self.price_bounds[0]) self.baseline_prices = np.full(self.n_products, self.price_bounds[0])
self._low_margin_streak = 0 # consecutive steps below margin_floor
# load behavioral models for agent probability estimation # load behavioral models for agent probability estimation
try: try:
@@ -170,14 +179,18 @@ class PHANTOM(gym.Env):
revenue = float(np.dot(prices, demand_arr)) revenue = float(np.dot(prices, demand_arr))
purchases = extract_purchases(trajectories) purchases = extract_purchases(trajectories)
coi_mix = compute_uplift_coi(prices, purchases, self.baseline_prices) coi_mix = compute_uplift_coi(prices, purchases, self.baseline_prices)
# multiplicative penalty so COI term scales with revenue magnitude
coi_leakage = float(agent_prob * self.info_value) coi_leakage = float(agent_prob * self.info_value)
coi_penalty = float(self.lambda_coi * coi_leakage) discount = float(np.clip(1.0 - self.lambda_coi * coi_leakage, 0.0, 1.0))
return float(revenue - coi_penalty), { coi_penalty = revenue * (1.0 - discount) # absolute penalty in revenue units
reward = revenue * discount
return reward, {
"revenue": revenue, "revenue": revenue,
"coi_mix": float(coi_mix), "coi_mix": float(coi_mix),
"coi_base": 0.0, "coi_base": 0.0,
"coi_leakage": coi_leakage, "coi_leakage": coi_leakage,
"coi_penalty": coi_penalty, "coi_penalty": coi_penalty,
"coi_discount": discount,
} }
def _alpha_candidates(self) -> np.ndarray: def _alpha_candidates(self) -> np.ndarray:
@@ -187,21 +200,28 @@ class PHANTOM(gym.Env):
hi = min(1.0, self.nominal_alpha + self.robust_radius) hi = min(1.0, self.nominal_alpha + self.robust_radius)
return np.linspace(lo, hi, self.robust_points) return np.linspace(lo, hi, self.robust_points)
def _select_adversarial_alpha(self, prices: np.ndarray) -> float: def _select_adversarial_alpha(
self, prices: np.ndarray
) -> tuple[float, dict, list, float]:
"""inner robust step: pick worst-case alpha and return its outcome directly to avoid double-sampling"""
candidates = self._alpha_candidates() candidates = self._alpha_candidates()
if len(candidates) == 1:
return float(candidates[0])
best_alpha, worst_reward = float(candidates[0]), np.inf best_alpha, worst_reward = float(candidates[0]), np.inf
best_demand, best_trajectories, best_agent_prob = None, [], 0.0
for alpha in candidates: for alpha in candidates:
self._set_market_mix(float(alpha)) self._set_market_mix(float(alpha))
demand = self.market.act(prices) demand = self.market.act(prices)
trajectories = self.market.last_trajectories trajectories = list(self.market.last_trajectories)
agent_prob = self._compute_agent_prob(trajectories) agent_prob = self._compute_agent_prob(trajectories)
reward, _ = self._compute_reward(prices, demand, agent_prob, trajectories) reward, _ = self._compute_reward(prices, demand, agent_prob, trajectories)
if reward < worst_reward: if reward < worst_reward:
worst_reward = reward worst_reward = reward
best_alpha = float(alpha) best_alpha, best_demand, best_trajectories, best_agent_prob = (
return best_alpha float(alpha),
demand,
trajectories,
agent_prob,
)
return best_alpha, best_demand, best_trajectories, best_agent_prob
def _record_history(self): def _record_history(self):
demand_arr = np.array( demand_arr = np.array(
@@ -221,6 +241,7 @@ class PHANTOM(gym.Env):
self._demand = self._limbo.step() self._demand = self._limbo.step()
self._initial_episode_prices = self._prices.copy() self._initial_episode_prices = self._prices.copy()
self._step_count = 0 self._step_count = 0
self._low_margin_streak = 0
self._demand_history, self._price_history, self._revenue_history = [], [], [] self._demand_history, self._price_history, self._revenue_history = [], [], []
self._trajectories = list(getattr(self.market, "last_trajectories", [])) self._trajectories = list(getattr(self.market, "last_trajectories", []))
self._record_history() self._record_history()
@@ -228,21 +249,30 @@ class PHANTOM(gym.Env):
def step(self, action): def step(self, action):
self._prices = self._decode_action(action) self._prices = self._decode_action(action)
alpha_adv = self._select_adversarial_alpha(self._prices) # inner robust step returns worst-case outcome directly, no re-sampling
alpha_adv, self._demand, trajectories, agent_prob = (
self._select_adversarial_alpha(self._prices)
)
self._set_market_mix(alpha_adv) self._set_market_mix(alpha_adv)
self._platform_stub.set_prices(self._prices) self._platform_stub.set_prices(self._prices)
self._limbo.step()
self._demand = self._limbo.step()
trajectories = getattr(self.market, "last_trajectories", [])
self._step_count += 1 self._step_count += 1
self._trajectories.extend(trajectories) self._trajectories.extend(trajectories)
agent_prob = self._compute_agent_prob(trajectories)
reward, metrics = self._compute_reward( reward, metrics = self._compute_reward(
self._prices, self._demand, agent_prob, trajectories self._prices, self._demand, agent_prob, trajectories
) )
self._record_history() self._record_history()
terminated = self._step_count >= 100
# soft early termination when margin collapses for too long
avg_margin = float(np.mean(self._prices) - self.price_bounds[0]) / max(
float(np.mean(self._prices)), 1e-6
)
if avg_margin < self.margin_floor:
self._low_margin_streak += 1
else:
self._low_margin_streak = 0
margin_collapsed = self._low_margin_streak >= self.margin_floor_patience
terminated = self._step_count >= self.max_steps or margin_collapsed
info = { info = {
"step": self._step_count, "step": self._step_count,