refactor to align moer with research in the env sims

This commit is contained in:
2025-12-17 17:41:16 +01:00
parent 201c98bcac
commit 3fa98f375d
3 changed files with 431 additions and 193 deletions

220
sim/rl/engine.py Normal file
View File

@@ -0,0 +1,220 @@
import numpy as np
import pandas as pd
from abc import ABC, abstractmethod
from typing import Dict, Any
from environment import BusinessLogicConstraints
class BasePricingEngine(ABC):
"""base interface for all pricing engines"""
def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
self.c = constraints
self.rng = np.random.default_rng(seed)
self.step_count = 0
@abstractmethod
def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
"""compute new prices given current state and observation from environment
args:
current_prices: current price vector [N]
observation: dict containing 'price', 'demand', and possibly interaction data
returns:
new_prices: updated price vector [N]
"""
pass
@abstractmethod
def update(obs, reward, done, info):
pass
def reset(self):
"""reset engine state for new episode"""
self.step_count = 0
class WildPricingEngine(BasePricingEngine):
"""production-like pricing using online elasticity estimation via EWMA regression"""
def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
super().__init__(constraints, seed)
# per-product unit costs (unknown to customers; known to platform)
self.unit_cost = self.rng.uniform(8.0, 40.0, size=self.c.product_catelogue_size).astype(np.float32)
# online elasticity estimate (start moderately elastic)
self.e_hat = np.full((self.c.product_catelogue_size,), -1.3, dtype=np.float32)
# EWMA state for log-log regression
self.mu_logp = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
self.mu_logq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
self.cov_pq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
self.var_p = np.ones(self.c.product_catelogue_size, dtype=np.float32)
# knobs typical in production
self.lr = 0.08
self.ewma = 0.05
self.eps_explore = 0.03
self.explore_scale = 0.03
def _safe_elasticity(self, e: np.ndarray) -> np.ndarray:
return np.clip(e, -5.0, -1.05)
def reset(self):
super().reset()
self.e_hat = np.full((self.c.product_catelogue_size,), -1.3, dtype=np.float32)
self.mu_logp = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
self.mu_logq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
self.cov_pq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
self.var_p = np.ones(self.c.product_catelogue_size, dtype=np.float32)
def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
self.step_count += 1
# extract demand signal (from env observation) as proxy for sales
demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32))
return self._update_from_demand(current_prices, demand)
def _update_from_demand(self, prices: np.ndarray, sold: np.ndarray) -> np.ndarray:
# log transforms (add 1 to handle zeros)
logp = np.log(np.clip(prices, 1e-3, None)).astype(np.float32)
logq = np.log(sold + 1.0).astype(np.float32)
# EWMA moments for per-product regression: logq ≈ a + e*logp
a = self.ewma
dp = logp - self.mu_logp
dq = logq - self.mu_logq
self.mu_logp = (1 - a) * self.mu_logp + a * logp
self.mu_logq = (1 - a) * self.mu_logq + a * logq
self.cov_pq = (1 - a) * self.cov_pq + a * (dp * dq)
self.var_p = (1 - a) * self.var_p + a * (dp * dp + 1e-6)
e_new = self.cov_pq / (self.var_p + 1e-6)
self.e_hat = self._safe_elasticity(0.9 * self.e_hat + 0.1 * e_new)
# profit-optimal price for isoelastic demand (if e < -1)
e = self.e_hat
p_star = self.unit_cost * (e / (e + 1.0))
# smooth toward p_star
new_prices = (1 - self.lr) * prices + self.lr * p_star
# exploration (small random perturbations)
if self.rng.random() < self.eps_explore:
noise = self.rng.normal(0.0, self.explore_scale, size=new_prices.shape).astype(np.float32)
new_prices = new_prices * (1.0 + noise)
# apply business guardrails (max change + bounds)
max_adj = self.c.max_price_adjustment
ratio = np.clip(new_prices / (prices + 1e-6), 1 - max_adj, 1 + max_adj)
new_prices = prices * ratio
new_prices = np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)
return new_prices
class StaticPricingEngine(BasePricingEngine):
"""baseline: fixed prices throughout episode"""
def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
super().__init__(constraints, seed)
self.fixed_prices = None
def reset(self):
super().reset()
self.fixed_prices = None
def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
self.step_count += 1
if self.fixed_prices is None:
self.fixed_prices = current_prices.copy()
return self.fixed_prices.copy()
class SimpleDemandEngine(BasePricingEngine):
"""demand-driven pricing: increase price when demand rises, decrease when it falls"""
def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
super().__init__(constraints, seed)
self.prev_demand = None
self.lr = 0.05
def reset(self):
super().reset()
self.prev_demand = None
def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
self.step_count += 1
demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32))
if self.prev_demand is None:
self.prev_demand = demand.copy()
return current_prices.copy()
# simple rule: if demand increases, raise price; if decreases, lower price
delta_d = demand - self.prev_demand
price_adj = self.lr * np.sign(delta_d) * np.abs(delta_d) / (np.abs(self.prev_demand) + 1.0)
new_prices = current_prices * (1.0 + price_adj)
self.prev_demand = demand.copy()
# apply constraints
max_adj = self.c.max_price_adjustment
ratio = np.clip(new_prices / (current_prices + 1e-6), 1 - max_adj, 1 + max_adj)
new_prices = current_prices * ratio
return np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)
class RandomWalkEngine(BasePricingEngine):
"""random walk pricing with mean reversion"""
def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
super().__init__(constraints, seed)
self.target_price = None
self.volatility = 0.02
def reset(self):
super().reset()
self.target_price = None
def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
self.step_count += 1
if self.target_price is None:
self.target_price = current_prices.copy()
# random walk with mean reversion toward target
noise = self.rng.normal(0.0, self.volatility, size=current_prices.shape).astype(np.float32)
reversion = 0.01 * (self.target_price - current_prices)
new_prices = current_prices * (1.0 + noise) + reversion
# apply constraints
max_adj = self.c.max_price_adjustment
ratio = np.clip(new_prices / (current_prices + 1e-6), 1 - max_adj, 1 + max_adj)
new_prices = current_prices * ratio
return np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)
class ThompsonSamplingEngine(BasePricingEngine):
"""bayesian bandit approach per product treating price as discrete action"""
def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
super().__init__(constraints, seed)
self.n_price_levels = 5
self.alpha = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
self.beta = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
self.price_grid = None
self.last_actions = None
def reset(self):
super().reset()
self.alpha = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
self.beta = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
self.price_grid = None
self.last_actions = None
def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
self.step_count += 1
if self.price_grid is None:
# define price grid per product
lo = current_prices * 0.7
hi = current_prices * 1.3
self.price_grid = np.linspace(lo, hi, self.n_price_levels).T
demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32))
# update beliefs based on last action
if self.last_actions is not None:
for i in range(self.c.product_catelogue_size):
a = self.last_actions[i]
reward = demand[i]
if reward > 0.5:
self.alpha[i, a] += reward
else:
self.beta[i, a] += 1.0
# thompson sampling: sample from posterior, pick best
new_prices = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
actions = np.zeros(self.c.product_catelogue_size, dtype=int)
for i in range(self.c.product_catelogue_size):
theta = self.rng.beta(self.alpha[i], self.beta[i]).astype(np.float32)
actions[i] = int(np.argmax(theta))
new_prices[i] = self.price_grid[i, actions[i]]
self.last_actions = actions
return np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)

View File

@@ -1,5 +1,7 @@
from sys import intern
import gymnasium as gym import gymnasium as gym
from gymnasium import spaces from gymnasium import spaces
from matplotlib import interactive
import numpy as np import numpy as np
from dataclasses import dataclass from dataclasses import dataclass
import pandas as pd import pandas as pd
@@ -24,7 +26,7 @@ class BusinessLogicConstraints():
coi_sigmoid_temp: float = 1.25 coi_sigmoid_temp: float = 1.25
base_human_demand: float = 0.08 base_human_demand: float = 0.08
base_agent_demand: float = 0.05 base_agent_demand: float = 0.05
human_price_elasticity: float = -1.2 human_price_elasticity: float = -1.2 # assumptions here
agent_price_elasticity: float = -0.6 agent_price_elasticity: float = -0.6
w_agent_loss: float = 1.0 w_agent_loss: float = 1.0
w_volatility: float = 5.0 w_volatility: float = 5.0
@@ -35,31 +37,25 @@ class BusinessLogicConstraints():
def _sigmoid(x: np.ndarray) -> np.ndarray: def _sigmoid(x: np.ndarray) -> np.ndarray:
return 1.0 / (1.0 + np.exp(-x)) return 1.0 / (1.0 + np.exp(-x))
def simple_agent_detector(session_df: pd.DataFrame) -> pd.Series:
# baseline heuristic: high velocity + low conversion
v = session_df.get("interaction_velocity", pd.Series(0.0, index=session_df.index))
cr = session_df.get("conversion_rate", pd.Series(0.0, index=session_df.index))
total = session_df.get("total_interactions", pd.Series(0, index=session_df.index))
return (total >= 12) & (v >= 0.20) & (cr <= 0.01)
class CommercePlatform: class CommercePlatform:
def __init__(self, product_catelogue_size: int, max_price: float, min_price: float, """
constraints: BusinessLogicConstraints, agent_detector: Optional[Callable[[pd.DataFrame], pd.Series]] = None, This is just an extension of the state management for the environment, it does not implement anything dynamic just helps us simulate demand.
use_defense: bool = False): """
def __init__(self,
product_catelogue_size: int,
max_price: float,
min_price: float,
constraints: BusinessLogicConstraints):
self.product_catelogue_size = product_catelogue_size self.product_catelogue_size = product_catelogue_size
self.product_supply = np.random.uniform(low=10, high=50, size=(self.product_catelogue_size,))
self.max_price = max_price self.max_price = max_price
self.min_price = min_price self.min_price = min_price
self.constraints = constraints self.constraints = constraints
self.use_defense = use_defense
self.agent_detector = agent_detector
self.simulation_history: List[Dict[str, Any]] = [] self.simulation_history: List[Dict[str, Any]] = []
self._rng = np.random.default_rng(constraints.seed) self._rng = np.random.default_rng(constraints.seed)
self._popularity = self._rng.lognormal(mean=0.0, sigma=0.6, size=self.product_catelogue_size)
self._popularity = self._popularity / (self._popularity.mean() + 1e-12)
self._last_interaction_df: pd.DataFrame = pd.DataFrame() self._last_interaction_df: pd.DataFrame = pd.DataFrame()
def setup_true_demand(self, prices: np.ndarray) -> Dict[str, np.ndarray]: def setup_true_demand(self, prices: np.ndarray) -> Dict[str, np.ndarray]:
# ground truth purchase propensities # ground truth purchase propensities
p = np.clip(prices, self.min_price, self.max_price) p = np.clip(prices, self.min_price, self.max_price)
@@ -67,14 +63,19 @@ class CommercePlatform:
human_prob = self.constraints.base_human_demand * (pn ** self.constraints.human_price_elasticity) human_prob = self.constraints.base_human_demand * (pn ** self.constraints.human_price_elasticity)
agent_prob = self.constraints.base_agent_demand * (pn ** self.constraints.agent_price_elasticity) agent_prob = self.constraints.base_agent_demand * (pn ** self.constraints.agent_price_elasticity)
return { return {
"human_purchase_prob": np.clip(human_prob * self._popularity, 0.0, 0.95), "human_purchase_prob": np.clip(human_prob, 0.0, 0.95),
"agent_purchase_prob": np.clip(agent_prob * self._popularity, 0.0, 0.95) "agent_purchase_prob": np.clip(agent_prob, 0.0, 0.95)
} }
def _session_markup_multiplier(self, signal_score: float) -> float: def _load_behavioral_profile(actor : str, demand_forcing):
# session-based COI markup based on demand signal expression """
x = (signal_score - self.constraints.coi_threshold) / max(self.constraints.coi_sigmoid_temp, 1e-6) This returns a markov chain with average weights which we get from interaction data of our experiments.
return 1.0 + self.constraints.coi_strength * float(_sigmoid(np.array([x]))[0]) This defines transition probabilities between different events:
search -> view_item_price_binN: 0.7
view_item_price_binN -> add_to_cart: 0.2
we also must reweight with the demand_forcing vector or purchase probabilities per-product
"""
def _simulate_sessions(self, base_prices: np.ndarray) -> pd.DataFrame: def _simulate_sessions(self, base_prices: np.ndarray) -> pd.DataFrame:
demand = self.setup_true_demand(base_prices) demand = self.setup_true_demand(base_prices)
@@ -84,94 +85,32 @@ class CommercePlatform:
T = self.constraints.sessions_per_step T = self.constraints.sessions_per_step
n_agent_sessions = int(round(T * self.constraints.agent_share)) n_agent_sessions = int(round(T * self.constraints.agent_share))
n_human_sessions = T - n_agent_sessions n_human_sessions = T - n_agent_sessions
# human sessions: normal browse with possible purchase
for s in range(n_human_sessions):
session_id = f"h_{len(events)}_{s}"
k = int(self._rng.integers(1, 4))
prod_ids = self._rng.choice(self.product_catelogue_size, size=k, replace=False)
t = 0.0
inter_times = self._rng.gamma(shape=2.0, scale=3.0, size=3 * k)
signal_score = 0.0
purchased_any = False
for i, pid in enumerate(prod_ids):
t += float(inter_times[i])
price_shown = float(base_prices[pid])
events.append({
"session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
"action": "view", "t": t, "price_shown": price_shown, "is_purchase": 0,
"price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
})
signal_score += 1.0
if self._rng.random() < 0.35:
t += float(inter_times[i + k])
events.append({
"session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
"action": "cart", "t": t, "price_shown": price_shown, "is_purchase": 0,
"price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
})
signal_score += 2.0
if (not purchased_any) and (self._rng.random() < float(human_pprob[pid])):
t += float(inter_times[i + 2 * k])
mult = self._session_markup_multiplier(signal_score)
price_paid = float(np.clip(base_prices[pid] * mult, self.min_price, self.max_price))
events.append({
"session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
"action": "purchase", "t": t, "price_shown": float(base_prices[pid]), "is_purchase": 1,
"price_paid": price_paid, "oracle_price_paid": price_paid, "signal_score": signal_score,
})
purchased_any = True
# agent sessions: split recon/purchase to circumvent COI
n_agent_ids = max(1, n_agent_sessions // 2) n_agent_ids = max(1, n_agent_sessions // 2)
for a in range(n_agent_ids): session_map = {
agent_id = f"a_{a}" 'humans': n_human_sessions,
recon_session_id = f"{agent_id}_recon" 'agents': n_agent_ids
t = 0.0 }
n_views = int(self._rng.poisson(lam=8) * self.constraints.agent_recon_multiplier) + 5 pprob_map = {
inter_times = self._rng.gamma(shape=2.0, scale=0.6, size=max(n_views, 1)) 'humans': human_pprob,
prod_ids = self._rng.integers(0, self.product_catelogue_size, size=n_views) 'agents': agent_pprob
recon_signal = 0.0 }
joint_events = []
for actor, n_sessions in session_map.items():
bp = _load_behavioral_profile(actor, pprob_map[actor])
counter = 0
events = []
while counter < n_sessions:
session_events = []
while len(session_events) == 0 or session_events[-1]['action'] == 'checkout':
interaction_event = bp.sample(self._rng)
interaction_event['session_id'] = f'{actor}_{counter:06d}'
# TODO any other assignments
session_events.append(interaction_event)
events.extend(session_events)
counter += 1
joint_events.extend(events)
for i, pid in enumerate(prod_ids): return pd.DataFrame(joint_events)
t += float(inter_times[i])
events.append({
"session_id": recon_session_id, "actor": "agent", "agent_id": agent_id, "product_id": int(pid),
"action": "view", "t": t, "price_shown": float(base_prices[pid]), "is_purchase": 0,
"price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
})
recon_signal += 1.0
# clean purchase session with minimal interactions
if self._rng.random() < self.constraints.agent_purchase_probability:
purchase_session_id = f"{agent_id}_clean"
pid = int(self._rng.integers(0, self.product_catelogue_size))
t2 = 0.0
clean_signal = 0.0
t2 += float(self._rng.gamma(shape=2.0, scale=0.7))
events.append({
"session_id": purchase_session_id, "actor": "agent", "agent_id": agent_id, "product_id": pid,
"action": "view", "t": t2, "price_shown": float(base_prices[pid]), "is_purchase": 0,
"price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
})
clean_signal += 1.0
if self._rng.random() < float(agent_pprob[pid]):
t2 += float(self._rng.gamma(shape=2.0, scale=0.7))
obs_mult = self._session_markup_multiplier(clean_signal)
obs_paid = float(np.clip(base_prices[pid] * obs_mult, self.min_price, self.max_price))
oracle_mult = self._session_markup_multiplier(recon_signal) # oracle links recon->purchase
oracle_paid = float(np.clip(base_prices[pid] * oracle_mult, self.min_price, self.max_price))
events.append({
"session_id": purchase_session_id, "actor": "agent", "agent_id": agent_id, "product_id": pid,
"action": "purchase", "t": t2, "price_shown": float(base_prices[pid]), "is_purchase": 1,
"price_paid": obs_paid, "oracle_price_paid": oracle_paid, "signal_score": clean_signal,
})
return pd.DataFrame(events)
def compute_interaction_features(self, interaction_df: pd.DataFrame) -> Dict[str, float]: def compute_interaction_features(self, interaction_df: pd.DataFrame) -> Dict[str, float]:
if interaction_df.empty: if interaction_df.empty:
@@ -183,6 +122,7 @@ class CommercePlatform:
return {"mean_sale_price": mean_sale_price, "look_to_book": float(views / (buys + 1e-6))} return {"mean_sale_price": mean_sale_price, "look_to_book": float(views / (buys + 1e-6))}
def _session_feature_table(self, df: pd.DataFrame) -> pd.DataFrame: def _session_feature_table(self, df: pd.DataFrame) -> pd.DataFrame:
# TODO: adapt this
if df.empty: if df.empty:
return pd.DataFrame() return pd.DataFrame()
g = df.groupby("session_id", sort=False) g = df.groupby("session_id", sort=False)
@@ -208,73 +148,6 @@ class CommercePlatform:
"is_agent": is_agent.astype(bool), "is_agent": is_agent.astype(bool),
}).reset_index() }).reset_index()
def demand_estimate(self, interaction_df: pd.DataFrame, exclude_sessions: Optional[pd.Series] = None) -> np.ndarray:
# proxy demand from weighted interaction events
if interaction_df.empty:
return np.zeros(self.product_catelogue_size, dtype=np.float32)
df = interaction_df
if exclude_sessions is not None:
bad_sessions = set(exclude_sessions.loc[exclude_sessions].index)
df = df[~df["session_id"].isin(bad_sessions)]
weights = {"view": 0.15, "cart": 0.75, "purchase": 2.5}
w = df["action"].map(weights).fillna(0.0).to_numpy(dtype=float)
prod = df["product_id"].to_numpy(dtype=int)
q_hat = np.zeros(self.product_catelogue_size, dtype=float)
np.add.at(q_hat, prod, w)
return q_hat.astype(np.float32)
def run_pricing_simulation(self, prices: np.ndarray) -> Dict[str, Any]:
interaction_df = self._simulate_sessions(prices)
self._last_interaction_df = interaction_df
session_df = self._session_feature_table(interaction_df)
predicted_agent_sessions = None
if (self.use_defense and self.agent_detector is not None and not session_df.empty):
predicted_agent_sessions = self.agent_detector(session_df.set_index("session_id"))
q_hat_naive = self.demand_estimate(interaction_df, exclude_sessions=None)
q_hat_defended = self.demand_estimate(interaction_df, exclude_sessions=predicted_agent_sessions) \
if predicted_agent_sessions is not None else q_hat_naive.copy()
true_human = np.zeros(self.product_catelogue_size, dtype=float)
true_agent = np.zeros(self.product_catelogue_size, dtype=float)
if not interaction_df.empty:
purchases = interaction_df[interaction_df["action"] == "purchase"]
if not purchases.empty:
for _, r in purchases.iterrows():
if r["actor"] == "human":
true_human[int(r["product_id"])] += 1.0
else:
true_agent[int(r["product_id"])] += 1.0
revenue_observed = float(interaction_df["price_paid"].sum()) if not interaction_df.empty else 0.0
revenue_oracle = float(interaction_df["oracle_price_paid"].sum()) if not interaction_df.empty else 0.0
agent_loss = max(0.0, revenue_oracle - revenue_observed)
eps = 1e-6
internal_error_naive = np.abs(true_human - q_hat_naive) / (true_human + eps)
internal_error_def = np.abs(true_human - q_hat_defended) / (true_human + eps)
interaction_features = self.compute_interaction_features(interaction_df)
summary = {
"prices": prices.copy(),
"interaction_df": interaction_df,
"session_df": session_df,
"q_hat_naive": q_hat_naive,
"q_hat_defended": q_hat_defended,
"true_human_demand": true_human.astype(np.float32),
"true_agent_purchases": true_agent.astype(np.float32),
"internal_error_naive": internal_error_naive.astype(np.float32),
"internal_error_defended": internal_error_def.astype(np.float32),
"interaction_features": interaction_features,
"revenue_observed": revenue_observed,
"revenue_oracle": revenue_oracle,
"agent_loss": agent_loss,
"predicted_agent_sessions": predicted_agent_sessions,
}
self.simulation_history.append(summary)
return summary
def get_interaction_data(self) -> np.ndarray: def get_interaction_data(self) -> np.ndarray:
if self._last_interaction_df.empty: if self._last_interaction_df.empty:
return np.array([], dtype=object) return np.array([], dtype=object)
@@ -284,7 +157,7 @@ class CommercePlatform:
class PHANTOMEnv(gym.Env): class PHANTOMEnv(gym.Env):
metadata = {"render_modes": []} metadata = {"render_modes": []}
def __init__(self, use_defense: bool = False): def __init__(self, constraints):
super().__init__() super().__init__()
self.constraints = BusinessLogicConstraints() self.constraints = BusinessLogicConstraints()
self.action_space = spaces.Box(low=-self.constraints.max_price_adjustment, self.action_space = spaces.Box(low=-self.constraints.max_price_adjustment,
@@ -301,14 +174,13 @@ class PHANTOMEnv(gym.Env):
high=np.full((self.constraints.product_catelogue_size,), 1e6, dtype=np.float32), high=np.full((self.constraints.product_catelogue_size,), 1e6, dtype=np.float32),
dtype=np.float32), dtype=np.float32),
}) })
# TODO: define more features that we compute from the interaction data
}) })
self.commerce_platform = CommercePlatform( self.commerce_platform = CommercePlatform(
product_catelogue_size=self.constraints.product_catelogue_size, product_catelogue_size=self.constraints.product_catelogue_size,
max_price=self.constraints.system_max_price, max_price=self.constraints.system_max_price,
min_price=self.constraints.system_min_price, min_price=self.constraints.system_min_price,
constraints=self.constraints, constraints=self.constraints)
agent_detector=simple_agent_detector,
use_defense=use_defense)
self._rng = np.random.default_rng(self.constraints.seed) self._rng = np.random.default_rng(self.constraints.seed)
self.t = 0 self.t = 0
self._prev_prices: Optional[np.ndarray] = None self._prev_prices: Optional[np.ndarray] = None
@@ -336,17 +208,13 @@ class PHANTOMEnv(gym.Env):
new_prices = np.clip(base_prices * (1.0 + action.astype(np.float32)), new_prices = np.clip(base_prices * (1.0 + action.astype(np.float32)),
self.constraints.system_min_price, self.constraints.system_min_price,
self.constraints.system_max_price).astype(np.float32) self.constraints.system_max_price).astype(np.float32)
result = self.commerce_platform.run_pricing_simulation(new_prices)
if self.commerce_platform.use_defense:
demand_est = result["q_hat_defended"]
internal_err = result["internal_error_defended"]
else:
demand_est = result["q_hat_naive"]
internal_err = result["internal_error_naive"]
self.state["elasticity"]["price"] = new_prices self.state["elasticity"]["price"] = new_prices
self.state["elasticity"]["demand"] = demand_est # TODO: use the commerce platform to simulate sessions
interactions_df = self.commerce_platform._simulate_sessions(new_prices)
result = self.commerce_platform.compute_interaction_features(interactions_df)
# TODO: implement COI computation to use in reward
COI = 0.0
volatility = 0.0 if self._prev_prices is None else \ volatility = 0.0 if self._prev_prices is None else \
float(np.mean(np.abs((new_prices - self._prev_prices) / (self._prev_prices + 1e-6)))) float(np.mean(np.abs((new_prices - self._prev_prices) / (self._prev_prices + 1e-6))))
@@ -354,12 +222,13 @@ class PHANTOMEnv(gym.Env):
revenue_observed = float(result["revenue_observed"]) revenue_observed = float(result["revenue_observed"])
agent_loss = float(result["agent_loss"]) agent_loss = float(result["agent_loss"])
err_mean = float(np.mean(internal_err))
reward = (revenue_observed reward = (revenue_observed
- COI
- self.constraints.w_agent_loss * agent_loss - self.constraints.w_agent_loss * agent_loss
- self.constraints.w_volatility * volatility - self.constraints.w_volatility * volatility
- self.constraints.w_estimation_error * err_mean) - self.constraints.w_estimation_error
)
terminated = self.t >= self.constraints.episode_length terminated = self.t >= self.constraints.episode_length
info = { info = {

149
sim/rl/train.py Normal file
View File

@@ -0,0 +1,149 @@
import numpy as np
import logging
from pathlib import Path
from typing import Dict, Type, Optional
import pickle
from torch import neg_
from torch.utils.tensorboard import SummaryWriter
from environment import PHANTOMEnv, FastTrainingConstraints, BusinessLogicConstraints
from engine import (BasePricingEngine, WildPricingEngine, StaticPricingEngine,
SimpleDemandEngine, RandomWalkEngine, ThompsonSamplingEngine)
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
logger = logging.getLogger(__name__)
"""
Target training loop:
have base prices p0 from env reset and run the env step, collect reward and metrics
pass this to the pricing engine which computes the price action to take based on previous reward by learning
the new action gets passed to the step
so we alternate, step -> reward -> engine (produces price delta) -> step with price delta -> reward
to make sure the reinforcement learning inside the engine can learn we need to have trajectory of prices
CURRENT SOLUTION BELOW does not implement correct learning or updates.
"""
class EngineTrainer:
"""wrapper to run pricing engines through episodes and collect metrics"""
def __init__(self, engine: BasePricingEngine, env: PHANTOMEnv,
tb_writer: Optional[SummaryWriter] = None):
self.engine = engine
self.env = env
self.episode_metrics = []
self.tb_writer = tb_writer
self.global_step = 0
def train(self, n_episodes: int, seed: int = 42):
obs, _ = self.env.reset(seed=seed)
prices = None
for ep in range(n_episodes):
prices = self.engine.compute_prices(prices, obs
obs, reward, done, _, info = self.env.step(prices)
self.engine.update(obs, reward, done, info)
return self
return self.episode_metrics
def evaluate(self, n_episodes: int = 10, seed: int = 100) -> Dict:
"""evaluate trained engine"""
results = {k: [] for k in ['total_reward', 'revenue_observed', 'revenue_oracle',
'agent_loss', 'ux_volatility', 'look_to_book']}
for ep in range(n_episodes):
metrics = self.run_episode(seed=seed + ep)
for k in results: results[k].append(metrics[k])
return {k: (np.mean(v), np.std(v)) for k, v in results.items()}
def make_env(fast: bool = True):
constraints = FastTrainingConstraints() if fast else BusinessLogicConstraints()
return PHANTOMEnv(constraints=constraints)
def train_engine(engine_cls: Type[BasePricingEngine], env: PHANTOMEnv,
n_episodes: int, seed: int = 42,
tb_writer: Optional[SummaryWriter] = None) -> EngineTrainer:
constraints = env.constraints
engine = engine_cls(constraints=constraints, seed=seed)
trainer = EngineTrainer(engine, env, tb_writer=tb_writer)
trainer.train(n_episodes, seed=seed)
return trainer
def save_trainer(trainer: EngineTrainer, path: Path):
"""save engine state and metrics"""
path.parent.mkdir(parents=True, exist_ok=True)
with open(path, 'wb') as f:
pickle.dump({
'engine': trainer.engine,
'metrics': trainer.episode_metrics
}, f)
logger.info(f"Saved trainer to {path}")
def load_trainer(path: Path, env: PHANTOMEnv,
tb_writer: Optional[SummaryWriter] = None) -> EngineTrainer:
"""load saved engine"""
with open(path, 'rb') as f:
data = pickle.load(f)
trainer = EngineTrainer(data['engine'], env, tb_writer=tb_writer)
trainer.episode_metrics = data['metrics']
return trainer
if __name__ == "__main__":
base_dir = Path("./runs")
base_dir.mkdir(exist_ok=True)
engines = {
"Wild": WildPricingEngine,
"Static": StaticPricingEngine,
# "SimpleDemand": SimpleDemandEngine,
"RandomWalk": RandomWalkEngine,
"ThompsonSampling": ThompsonSamplingEngine,
}
defenses = [False, True]
n_train_episodes = 50
n_eval_episodes = 10
seed = 42
fast_mode = True
logger.info(f"Training config: {n_train_episodes} episodes per engine, fast_mode={fast_mode}")
trained_trainers = {}
for engine_name, engine_cls in engines.items():
for use_defense in defenses:
defense_label = "defense_on" if use_defense else "defense_off"
run_name = f"{engine_name}_{defense_label}"
log_dir = base_dir / run_name
log_dir.mkdir(parents=True, exist_ok=True)
logger.info(f"Training {engine_name} with defense={use_defense}")
logger.info(f"Log directory: {log_dir}")
env = make_env(fast=fast_mode)
tb_writer = SummaryWriter(log_dir=str(log_dir))
trainer = train_engine(engine_cls, env, n_train_episodes, seed, tb_writer=tb_writer)
tb_writer.close()
save_path = log_dir / "trainer.pkl"
save_trainer(trainer, save_path)
trained_trainers[run_name] = (trainer, env)
logger.info("Starting evaluation")
for run_name, (trainer, env) in trained_trainers.items():
logger.info(f"Evaluating {run_name}")
results = trainer.evaluate(n_episodes=n_eval_episodes, seed=seed + 1000)
for metric, (mean, std) in results.items():
logger.info(f" {metric:20s}: {mean:10.2f} ± {std:6.2f}")
logger.info(f"Results saved to: {base_dir}")