minor refactors to codebase to implement DRO

This commit is contained in:
2026-02-14 14:53:30 +01:00
parent 895eea5674
commit bc6c481d03
6 changed files with 195 additions and 75 deletions

1
.gitignore vendored
View File

@@ -30,3 +30,4 @@ sim/rl/behavior_loader/*.pdf
tests/e2e/node_modules/** tests/e2e/node_modules/**
lab/case/thesis/runs*/ lab/case/thesis/runs*/
sim/case/thesis_simplified/runs*/ sim/case/thesis_simplified/runs*/
PHANTOM_web/*

View File

@@ -19,15 +19,18 @@ class MarketEngine:
agent_params: tuple, agent_params: tuple,
demand_distribution=np.random.normal, demand_distribution=np.random.normal,
noise_std: float = 1.0, noise_std: float = 1.0,
action_weights: dict | None = None,
): ):
# no defaults for D_H, D_A - force explicit experiment design # no defaults for D_H, D_A - force explicit experiment design
self.alpha = alpha self.alpha = alpha
self.N = int(N)
self.Nagents = int(N * alpha) self.Nagents = int(N * alpha)
self.Nhumans = int(N * (1 - alpha)) self.Nhumans = int(N * (1 - alpha))
self.human_params = human_params self.human_params = human_params
self.agent_params = agent_params self.agent_params = agent_params
self.noise_std = noise_std self.noise_std = noise_std
self.demand_dist = demand_distribution self.demand_dist = demand_distribution
self.action_weights = action_weights
def act(self, prices): def act(self, prices):
# generate separate demands d() per actor type # generate separate demands d() per actor type
@@ -48,7 +51,7 @@ class MarketEngine:
agent_t = [sample_behavior(demand_a, human=False) for _ in range(self.Nagents)] agent_t = [sample_behavior(demand_a, human=False) for _ in range(self.Nagents)]
# store trajectories for agent probability calculation # store trajectories for agent probability calculation
self.last_trajectories = human_t + agent_t self.last_trajectories = human_t + agent_t
return estimate_demand(self.last_trajectories) return estimate_demand(self.last_trajectories, self.action_weights)
def measure(self): def measure(self):
pass pass
@@ -72,13 +75,16 @@ class Limbo:
self.output = None self.output = None
def step(self): def step(self):
# we could code golf this a little bit
if self.platform_turn: if self.platform_turn:
self.output = self.platform.act(self.output) self.output = self.platform.act(self.output)
else: else:
self.output = self.market.act(self.output) self.output = self.market.act(self.output)
print(self.output)
self.platform_turn = not self.platform_turn self.platform_turn = not self.platform_turn
return self.output
def reset(self):
self.platform_turn = True
self.output = None
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -1,4 +1,4 @@
from .demand import estimate_demand, generate_demand_for_actor from .demand import estimate_demand, estimate_weighted_demand, generate_demand_for_actor
from .behavior import sample_behavior, get_transition_models, trajectory_to_events from .behavior import sample_behavior, get_transition_models, trajectory_to_events
from .render import DashboardRenderer, style_axis from .render import DashboardRenderer, style_axis
from .wrappers import EconomicMetricsWrapper from .wrappers import EconomicMetricsWrapper

View File

@@ -1,9 +1,23 @@
import logging
import numpy as np import numpy as np
from logging import getLogger
logger = getLogger(__name__)
def generate_demand_for_actor(prices: np.ndarray, params: tuple, noise_std: float = 1.0, distribution_method=np.random.normal) -> np.ndarray: CATEGORY_WEIGHTS = {"cart": 4.0, "dwell": 2.0, "nav": 1.0, "filter": 0.5}
ACTION_CATEGORIES = {
"cart": {"add_item", "add_to_cart", "remove", "checkout", "purchase"},
"dwell": {"hover_title", "hover_paragraph", "hover_link"},
"nav": {"page_view", "view_item", "view", "learn_more"},
"filter": {"search", "filter_date", "filter_price", "sort"},
}
DEFAULT_ACTION_WEIGHTS = {
a: CATEGORY_WEIGHTS[c] for c, actions in ACTION_CATEGORIES.items() for a in actions
}
def generate_demand_for_actor(
prices: np.ndarray,
params: tuple,
noise_std: float = 1.0,
distribution_method=np.random.normal,
) -> np.ndarray:
"""d(p;0) = max(0, valuation - price) + epsi for single actor type """d(p;0) = max(0, valuation - price) + epsi for single actor type
params: (mean, std) for valuation distribution D_H or D_A""" params: (mean, std) for valuation distribution D_H or D_A"""
val = distribution_method(*params, size=len(prices)) val = distribution_method(*params, size=len(prices))
@@ -13,17 +27,50 @@ def generate_demand_for_actor(prices: np.ndarray, params: tuple, noise_std: floa
return demand / total * 100 if total > 0 else demand return demand / total * 100 if total > 0 else demand
def estimate_demand(trajectories): def estimate_demand(trajectories, action_weights=None):
demand_estimate = {} return estimate_weighted_demand(trajectories, action_weights)
def _parse_event_state(state: str):
if "_product" not in state:
return state, None
action, raw_pid = state.rsplit("_product", 1)
return action, int(raw_pid) if raw_pid.isdigit() else None
def _weight_for_action(action: str, action_weights: dict) -> float:
if action in action_weights:
return action_weights[action]
if action.startswith("hover"):
return CATEGORY_WEIGHTS["dwell"]
if action.startswith("filter") or action in {"search", "sort"}:
return CATEGORY_WEIGHTS["filter"]
if action.startswith("add") or action in {"checkout", "purchase", "remove"}:
return CATEGORY_WEIGHTS["cart"]
return CATEGORY_WEIGHTS["nav"]
def estimate_weighted_demand(trajectories, action_weights=None):
action_weights = (
DEFAULT_ACTION_WEIGHTS if action_weights is None else action_weights
)
scores = {}
for traj in trajectories: for traj in trajectories:
for event in traj: for state in traj:
if 'view_product' in event: action, product_id = _parse_event_state(state)
product_id = int(event.split('_')[-1].replace('product', '')) if product_id is None:
demand_estimate[product_id] = demand_estimate.get(product_id, 0) + 1 continue
total_views = sum(demand_estimate.values()) w = _weight_for_action(action, action_weights)
for product_id in demand_estimate: if w <= 0:
demand_estimate[product_id] = (demand_estimate[product_id] / total_views) * 100 # normalize to percentage continue
return demand_estimate scores[product_id] = scores.get(product_id, 0.0) + w
total = sum(scores.values())
return (
{pid: (score / total) * 100 for pid, score in scores.items()}
if total > 0
else {}
)
# Example usage # Example usage
if __name__ == "__main__": if __name__ == "__main__":
@@ -36,6 +83,7 @@ if __name__ == "__main__":
print("Human Demand:", demand_h) print("Human Demand:", demand_h)
print("Agent Demand:", demand_a) print("Agent Demand:", demand_a)
from .behavior import sample_behavior from .behavior import sample_behavior
N, alpha = 200, 0.3 N, alpha = 200, 0.3
n_h, n_a = int(N * (1 - alpha)), int(N * alpha) n_h, n_a = int(N * (1 - alpha)), int(N * alpha)
human_t = [sample_behavior(demand_h, human=True) for _ in range(n_h)] human_t = [sample_behavior(demand_h, human=True) for _ in range(n_h)]

View File

@@ -6,11 +6,26 @@ from .lib import EconomicMetricsWrapper, MetricsCallback
wandb.init( wandb.init(
project="phantom-pricing", project="phantom-pricing",
config={"alpha": 0.3, "n_products": 10, "total_timesteps": 50000} config={
"alpha": 0.3,
"n_products": 10,
"total_timesteps": 50000,
"robust_radius": 0.15,
"robust_points": 5,
"lambda_coi": 0.2,
},
) )
env = EconomicMetricsWrapper(PHANTOM(n_products=10, alpha=0.3, render_mode=None)) env_kwargs = {
eval_env = EconomicMetricsWrapper(PHANTOM(n_products=10, alpha=0.3, render_mode=None)) "n_products": 10,
"alpha": 0.3,
"lambda_coi": 0.2,
"robust_radius": 0.15,
"robust_points": 5,
"render_mode": None,
}
env = EconomicMetricsWrapper(PHANTOM(**env_kwargs))
eval_env = EconomicMetricsWrapper(PHANTOM(**env_kwargs))
model = SAC( model = SAC(
"MultiInputPolicy", "MultiInputPolicy",
@@ -31,11 +46,12 @@ model.save("phantom_sac")
wandb.finish() wandb.finish()
# test trained policy # test trained policy
env = PHANTOM(n_products=10, alpha=0.3, render_mode=None) env = PHANTOM(**env_kwargs)
obs, _ = env.reset() obs, _ = env.reset()
for _ in range(100): for _ in range(100):
action, _ = model.predict(obs, deterministic=True) action, _ = model.predict(obs, deterministic=True)
obs, reward, term, trunc, _ = env.step(action) obs, reward, term, trunc, _ = env.step(action)
env.render() env.render()
if term or trunc: break if term or trunc:
break
env.close() env.close()

View File

@@ -12,11 +12,23 @@ from .lib.behavior import get_transition_models, trajectory_to_events
from .lib.wrappers import EconomicMetricsWrapper from .lib.wrappers import EconomicMetricsWrapper
class _ActionPricingEngine(PricingEngine):
def __init__(self, n_products: int, price_bounds: tuple):
self._prices = np.full(n_products, price_bounds[0], dtype=float)
def set_prices(self, prices: np.ndarray):
self._prices = np.asarray(prices, dtype=float)
def act(self, _):
return self._prices
class PHANTOM(gym.Env): class PHANTOM(gym.Env):
"""Gymnasium wrapper for Limbo pricing-market simulation implementing thesis COI framework """Gymnasium wrapper for Limbo pricing-market simulation implementing thesis COI framework
reward = R(p,d) - λ·COI_leak(p,τ') per thesis Section on DR-RL reward = R(p,d) - λ·COI_leak(p,τ') per thesis Section on DR-RL
COI_leak uses behavioral divergence to estimate agent probability f(τ') COI_leak uses behavioral divergence to estimate agent probability f(τ')
robust inner step: min over alpha in Wasserstein interval around nominal alpha
""" """
metadata = {"render_modes": ["human", "ansi"]} metadata = {"render_modes": ["human", "ansi"]}
@@ -32,6 +44,9 @@ class PHANTOM(gym.Env):
price_bounds: tuple = (10.0, 150.0), price_bounds: tuple = (10.0, 150.0),
lambda_coi: float = 0.1, lambda_coi: float = 0.1,
coi_window: int = 10, coi_window: int = 10,
robust_radius: float = 0.0,
robust_points: int = 5,
info_value: float = 1.0,
render_mode: str = None, render_mode: str = None,
): ):
super().__init__() super().__init__()
@@ -40,10 +55,14 @@ class PHANTOM(gym.Env):
self.lambda_coi = lambda_coi self.lambda_coi = lambda_coi
self.coi_window = coi_window self.coi_window = coi_window
self.render_mode = render_mode self.render_mode = render_mode
self.alpha = alpha self.alpha = float(alpha)
self.nominal_alpha = float(alpha)
self.N = N self.N = N
self.human_params = human_params self.human_params = human_params
self.agent_params = agent_params self.agent_params = agent_params
self.robust_radius = max(0.0, float(robust_radius))
self.robust_points = max(1, int(robust_points))
self.info_value = float(info_value)
self.market = MarketEngine( self.market = MarketEngine(
alpha=alpha, alpha=alpha,
@@ -52,8 +71,9 @@ class PHANTOM(gym.Env):
agent_params=agent_params, agent_params=agent_params,
noise_std=noise_std, noise_std=noise_std,
) )
self._platform_stub = PricingEngine() self._platform_stub = _ActionPricingEngine(n_products, price_bounds)
self._limbo = Limbo(self._platform_stub, self.market) self._limbo = Limbo(self._platform_stub, self.market)
self._set_market_mix(self.nominal_alpha)
self.action_space = spaces.Box( self.action_space = spaces.Box(
low=price_bounds[0], low=price_bounds[0],
@@ -99,53 +119,72 @@ class PHANTOM(gym.Env):
) )
return {"demand": demand_arr, "prices": self._prices.astype(np.float32)} return {"demand": demand_arr, "prices": self._prices.astype(np.float32)}
def _compute_agent_prob(self) -> float: def _set_market_mix(self, alpha: float):
"""estimate agent probability from accumulated trajectories using KL divergence""" alpha = float(np.clip(alpha, 0.0, 1.0))
if ( n_agents = int(self.N * alpha)
not self._trajectories self.alpha = alpha
or self._human_trans is None self.market.alpha = alpha
or self._agent_trans is None self.market.Nagents = n_agents
): self.market.Nhumans = self.N - n_agents
return self.alpha # fallback to contamination level
# aggregate all trajectories from this episode def _compute_agent_prob(self, trajectories=None) -> float:
all_events = [] trajectories = (
for traj in self._trajectories: self.market.last_trajectories if trajectories is None else trajectories
all_events.extend(trajectory_to_events(traj))
if len(all_events) < 2:
return self.alpha
return compute_agent_probability(
all_events, self._human_trans, self._agent_trans
) )
if not trajectories or self._human_trans is None or self._agent_trans is None:
return float(self.market.alpha)
probs = []
for traj in trajectories:
events = trajectory_to_events(traj)
if len(events) < 2:
continue
probs.append(
compute_agent_probability(events, self._human_trans, self._agent_trans)
)
return float(np.mean(probs)) if probs else float(self.market.alpha)
def _compute_reward(self, prices: np.ndarray, demand: dict) -> tuple[float, dict]: def _compute_reward(
revenue = sum(prices[i] * demand.get(i, 0.0) for i in range(self.n_products)) self, prices: np.ndarray, demand: dict, agent_prob: float, trajectories: list
) -> tuple[float, dict]:
trajs_mix = self.market.last_trajectories demand_arr = np.array(
purchases_mix = extract_purchases(trajs_mix) [demand.get(i, 0.0) for i in range(self.n_products)], dtype=float
coi_mix = compute_uplift_coi(prices, purchases_mix, self.baseline_prices) )
revenue = float(np.dot(prices, demand_arr))
old_state = (self.market.alpha, self.market.Nagents, self.market.Nhumans) purchases = extract_purchases(trajectories)
self.market.alpha, self.market.Nagents, self.market.Nhumans = 0.0, 0, self.N coi_mix = compute_uplift_coi(prices, purchases, self.baseline_prices)
self.market.act(prices) coi_leakage = float(agent_prob * self.info_value)
purchases_base = extract_purchases(self.market.last_trajectories) coi_penalty = float(self.lambda_coi * coi_leakage)
coi_base = compute_uplift_coi(prices, purchases_base, self.baseline_prices)
self.market.alpha, self.market.Nagents, self.market.Nhumans = old_state
coi_leakage = max(0.0, coi_base - coi_mix)
coi_penalty = max(self.lambda_coi * coi_leakage, 1000) / 1000
coi_penalty *= revenue
return float(revenue - coi_penalty), { return float(revenue - coi_penalty), {
"revenue": float(revenue), "revenue": revenue,
"coi_mix": float(coi_mix), "coi_mix": float(coi_mix),
"coi_base": float(coi_base), "coi_base": 0.0,
"coi_leakage": float(coi_leakage), "coi_leakage": coi_leakage,
"coi_penalty": float(coi_penalty), "coi_penalty": coi_penalty,
} }
def _alpha_candidates(self) -> np.ndarray:
if self.robust_radius <= 0.0 or self.robust_points == 1:
return np.array([self.nominal_alpha], dtype=float)
lo = max(0.0, self.nominal_alpha - self.robust_radius)
hi = min(1.0, self.nominal_alpha + self.robust_radius)
return np.linspace(lo, hi, self.robust_points)
def _select_adversarial_alpha(self, prices: np.ndarray) -> float:
candidates = self._alpha_candidates()
if len(candidates) == 1:
return float(candidates[0])
best_alpha, worst_reward = float(candidates[0]), np.inf
for alpha in candidates:
self._set_market_mix(float(alpha))
demand = self.market.act(prices)
trajectories = self.market.last_trajectories
agent_prob = self._compute_agent_prob(trajectories)
reward, _ = self._compute_reward(prices, demand, agent_prob, trajectories)
if reward < worst_reward:
worst_reward = reward
best_alpha = float(alpha)
return best_alpha
def _record_history(self): def _record_history(self):
demand_arr = np.array( demand_arr = np.array(
[self._demand.get(i, 0.0) for i in range(self.n_products)] [self._demand.get(i, 0.0) for i in range(self.n_products)]
@@ -156,32 +195,42 @@ class PHANTOM(gym.Env):
def reset(self, seed=None, options=None): def reset(self, seed=None, options=None):
super().reset(seed=seed) super().reset(seed=seed)
self._set_market_mix(self.nominal_alpha)
self._limbo.reset()
self._prices = np.random.uniform(*self.price_bounds, size=self.n_products) self._prices = np.random.uniform(*self.price_bounds, size=self.n_products)
self._platform_stub.set_prices(self._prices)
self._limbo.step()
self._demand = self._limbo.step()
self._initial_episode_prices = self._prices.copy() self._initial_episode_prices = self._prices.copy()
self._demand = self.market.act(self._prices)
self._step_count = 0 self._step_count = 0
self._demand_history, self._price_history, self._revenue_history = [], [], [] self._demand_history, self._price_history, self._revenue_history = [], [], []
self._trajectories = [] self._trajectories = list(getattr(self.market, "last_trajectories", []))
self._record_history() self._record_history()
return self._get_obs(), {} return self._get_obs(), {}
def step(self, action: np.ndarray): def step(self, action: np.ndarray):
self._prices = np.clip(action, *self.price_bounds) self._prices = np.clip(action, *self.price_bounds)
self._demand = self.market.act(self._prices) alpha_adv = self._select_adversarial_alpha(self._prices)
self._set_market_mix(alpha_adv)
self._platform_stub.set_prices(self._prices)
self._limbo.step()
self._demand = self._limbo.step()
trajectories = getattr(self.market, "last_trajectories", [])
self._step_count += 1 self._step_count += 1
self._trajectories.extend(trajectories)
agent_prob = self._compute_agent_prob(trajectories)
reward, metrics = self._compute_reward(
self._prices, self._demand, agent_prob, trajectories
)
self._record_history() self._record_history()
# capture trajectories generated by market for agent prob estimation
if hasattr(self.market, "last_trajectories"):
self._trajectories.extend(self.market.last_trajectories)
agent_prob = self._compute_agent_prob()
reward, metrics = self._compute_reward(self._prices, self._demand)
terminated = self._step_count >= 100 terminated = self._step_count >= 100
info = { info = {
"step": self._step_count, "step": self._step_count,
"agent_prob": agent_prob, "agent_prob": agent_prob,
"alpha_adv": float(alpha_adv),
"wasserstein_radius": float(self.robust_radius),
**metrics, **metrics,
"raw_revenue": np.sum( "raw_revenue": np.sum(
self._prices self._prices