mirror of
https://github.com/velocitatem/PHANTOM.git
synced 2026-05-31 08:33:36 +00:00
refactor to align moer with research in the env sims
This commit is contained in:
220
sim/rl/engine.py
Normal file
220
sim/rl/engine.py
Normal file
@@ -0,0 +1,220 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Dict, Any
|
||||||
|
from environment import BusinessLogicConstraints
|
||||||
|
|
||||||
|
|
||||||
|
class BasePricingEngine(ABC):
|
||||||
|
"""base interface for all pricing engines"""
|
||||||
|
def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
|
||||||
|
self.c = constraints
|
||||||
|
self.rng = np.random.default_rng(seed)
|
||||||
|
self.step_count = 0
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
|
||||||
|
"""compute new prices given current state and observation from environment
|
||||||
|
|
||||||
|
args:
|
||||||
|
current_prices: current price vector [N]
|
||||||
|
observation: dict containing 'price', 'demand', and possibly interaction data
|
||||||
|
|
||||||
|
returns:
|
||||||
|
new_prices: updated price vector [N]
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def update(obs, reward, done, info):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
"""reset engine state for new episode"""
|
||||||
|
self.step_count = 0
|
||||||
|
|
||||||
|
|
||||||
|
class WildPricingEngine(BasePricingEngine):
|
||||||
|
"""production-like pricing using online elasticity estimation via EWMA regression"""
|
||||||
|
def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
|
||||||
|
super().__init__(constraints, seed)
|
||||||
|
# per-product unit costs (unknown to customers; known to platform)
|
||||||
|
self.unit_cost = self.rng.uniform(8.0, 40.0, size=self.c.product_catelogue_size).astype(np.float32)
|
||||||
|
# online elasticity estimate (start moderately elastic)
|
||||||
|
self.e_hat = np.full((self.c.product_catelogue_size,), -1.3, dtype=np.float32)
|
||||||
|
# EWMA state for log-log regression
|
||||||
|
self.mu_logp = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
|
||||||
|
self.mu_logq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
|
||||||
|
self.cov_pq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
|
||||||
|
self.var_p = np.ones(self.c.product_catelogue_size, dtype=np.float32)
|
||||||
|
# knobs typical in production
|
||||||
|
self.lr = 0.08
|
||||||
|
self.ewma = 0.05
|
||||||
|
self.eps_explore = 0.03
|
||||||
|
self.explore_scale = 0.03
|
||||||
|
|
||||||
|
def _safe_elasticity(self, e: np.ndarray) -> np.ndarray:
|
||||||
|
return np.clip(e, -5.0, -1.05)
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
super().reset()
|
||||||
|
self.e_hat = np.full((self.c.product_catelogue_size,), -1.3, dtype=np.float32)
|
||||||
|
self.mu_logp = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
|
||||||
|
self.mu_logq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
|
||||||
|
self.cov_pq = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
|
||||||
|
self.var_p = np.ones(self.c.product_catelogue_size, dtype=np.float32)
|
||||||
|
|
||||||
|
def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
|
||||||
|
self.step_count += 1
|
||||||
|
# extract demand signal (from env observation) as proxy for sales
|
||||||
|
demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32))
|
||||||
|
return self._update_from_demand(current_prices, demand)
|
||||||
|
|
||||||
|
def _update_from_demand(self, prices: np.ndarray, sold: np.ndarray) -> np.ndarray:
|
||||||
|
# log transforms (add 1 to handle zeros)
|
||||||
|
logp = np.log(np.clip(prices, 1e-3, None)).astype(np.float32)
|
||||||
|
logq = np.log(sold + 1.0).astype(np.float32)
|
||||||
|
# EWMA moments for per-product regression: logq ≈ a + e*logp
|
||||||
|
a = self.ewma
|
||||||
|
dp = logp - self.mu_logp
|
||||||
|
dq = logq - self.mu_logq
|
||||||
|
self.mu_logp = (1 - a) * self.mu_logp + a * logp
|
||||||
|
self.mu_logq = (1 - a) * self.mu_logq + a * logq
|
||||||
|
self.cov_pq = (1 - a) * self.cov_pq + a * (dp * dq)
|
||||||
|
self.var_p = (1 - a) * self.var_p + a * (dp * dp + 1e-6)
|
||||||
|
e_new = self.cov_pq / (self.var_p + 1e-6)
|
||||||
|
self.e_hat = self._safe_elasticity(0.9 * self.e_hat + 0.1 * e_new)
|
||||||
|
# profit-optimal price for isoelastic demand (if e < -1)
|
||||||
|
e = self.e_hat
|
||||||
|
p_star = self.unit_cost * (e / (e + 1.0))
|
||||||
|
# smooth toward p_star
|
||||||
|
new_prices = (1 - self.lr) * prices + self.lr * p_star
|
||||||
|
# exploration (small random perturbations)
|
||||||
|
if self.rng.random() < self.eps_explore:
|
||||||
|
noise = self.rng.normal(0.0, self.explore_scale, size=new_prices.shape).astype(np.float32)
|
||||||
|
new_prices = new_prices * (1.0 + noise)
|
||||||
|
# apply business guardrails (max change + bounds)
|
||||||
|
max_adj = self.c.max_price_adjustment
|
||||||
|
ratio = np.clip(new_prices / (prices + 1e-6), 1 - max_adj, 1 + max_adj)
|
||||||
|
new_prices = prices * ratio
|
||||||
|
new_prices = np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)
|
||||||
|
return new_prices
|
||||||
|
|
||||||
|
|
||||||
|
class StaticPricingEngine(BasePricingEngine):
|
||||||
|
"""baseline: fixed prices throughout episode"""
|
||||||
|
def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
|
||||||
|
super().__init__(constraints, seed)
|
||||||
|
self.fixed_prices = None
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
super().reset()
|
||||||
|
self.fixed_prices = None
|
||||||
|
|
||||||
|
def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
|
||||||
|
self.step_count += 1
|
||||||
|
if self.fixed_prices is None:
|
||||||
|
self.fixed_prices = current_prices.copy()
|
||||||
|
return self.fixed_prices.copy()
|
||||||
|
|
||||||
|
|
||||||
|
class SimpleDemandEngine(BasePricingEngine):
|
||||||
|
"""demand-driven pricing: increase price when demand rises, decrease when it falls"""
|
||||||
|
def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
|
||||||
|
super().__init__(constraints, seed)
|
||||||
|
self.prev_demand = None
|
||||||
|
self.lr = 0.05
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
super().reset()
|
||||||
|
self.prev_demand = None
|
||||||
|
|
||||||
|
def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
|
||||||
|
self.step_count += 1
|
||||||
|
demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32))
|
||||||
|
if self.prev_demand is None:
|
||||||
|
self.prev_demand = demand.copy()
|
||||||
|
return current_prices.copy()
|
||||||
|
# simple rule: if demand increases, raise price; if decreases, lower price
|
||||||
|
delta_d = demand - self.prev_demand
|
||||||
|
price_adj = self.lr * np.sign(delta_d) * np.abs(delta_d) / (np.abs(self.prev_demand) + 1.0)
|
||||||
|
new_prices = current_prices * (1.0 + price_adj)
|
||||||
|
self.prev_demand = demand.copy()
|
||||||
|
# apply constraints
|
||||||
|
max_adj = self.c.max_price_adjustment
|
||||||
|
ratio = np.clip(new_prices / (current_prices + 1e-6), 1 - max_adj, 1 + max_adj)
|
||||||
|
new_prices = current_prices * ratio
|
||||||
|
return np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)
|
||||||
|
|
||||||
|
|
||||||
|
class RandomWalkEngine(BasePricingEngine):
|
||||||
|
"""random walk pricing with mean reversion"""
|
||||||
|
def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
|
||||||
|
super().__init__(constraints, seed)
|
||||||
|
self.target_price = None
|
||||||
|
self.volatility = 0.02
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
super().reset()
|
||||||
|
self.target_price = None
|
||||||
|
|
||||||
|
def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
|
||||||
|
self.step_count += 1
|
||||||
|
if self.target_price is None:
|
||||||
|
self.target_price = current_prices.copy()
|
||||||
|
# random walk with mean reversion toward target
|
||||||
|
noise = self.rng.normal(0.0, self.volatility, size=current_prices.shape).astype(np.float32)
|
||||||
|
reversion = 0.01 * (self.target_price - current_prices)
|
||||||
|
new_prices = current_prices * (1.0 + noise) + reversion
|
||||||
|
# apply constraints
|
||||||
|
max_adj = self.c.max_price_adjustment
|
||||||
|
ratio = np.clip(new_prices / (current_prices + 1e-6), 1 - max_adj, 1 + max_adj)
|
||||||
|
new_prices = current_prices * ratio
|
||||||
|
return np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)
|
||||||
|
|
||||||
|
|
||||||
|
class ThompsonSamplingEngine(BasePricingEngine):
|
||||||
|
"""bayesian bandit approach per product treating price as discrete action"""
|
||||||
|
def __init__(self, constraints: BusinessLogicConstraints, seed: int = 0):
|
||||||
|
super().__init__(constraints, seed)
|
||||||
|
self.n_price_levels = 5
|
||||||
|
self.alpha = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
|
||||||
|
self.beta = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
|
||||||
|
self.price_grid = None
|
||||||
|
self.last_actions = None
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
super().reset()
|
||||||
|
self.alpha = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
|
||||||
|
self.beta = np.ones((self.c.product_catelogue_size, self.n_price_levels), dtype=np.float32)
|
||||||
|
self.price_grid = None
|
||||||
|
self.last_actions = None
|
||||||
|
|
||||||
|
def compute_prices(self, current_prices: np.ndarray, observation: Dict[str, Any]) -> np.ndarray:
|
||||||
|
self.step_count += 1
|
||||||
|
if self.price_grid is None:
|
||||||
|
# define price grid per product
|
||||||
|
lo = current_prices * 0.7
|
||||||
|
hi = current_prices * 1.3
|
||||||
|
self.price_grid = np.linspace(lo, hi, self.n_price_levels).T
|
||||||
|
demand = observation.get('demand', np.zeros(self.c.product_catelogue_size, dtype=np.float32))
|
||||||
|
# update beliefs based on last action
|
||||||
|
if self.last_actions is not None:
|
||||||
|
for i in range(self.c.product_catelogue_size):
|
||||||
|
a = self.last_actions[i]
|
||||||
|
reward = demand[i]
|
||||||
|
if reward > 0.5:
|
||||||
|
self.alpha[i, a] += reward
|
||||||
|
else:
|
||||||
|
self.beta[i, a] += 1.0
|
||||||
|
# thompson sampling: sample from posterior, pick best
|
||||||
|
new_prices = np.zeros(self.c.product_catelogue_size, dtype=np.float32)
|
||||||
|
actions = np.zeros(self.c.product_catelogue_size, dtype=int)
|
||||||
|
for i in range(self.c.product_catelogue_size):
|
||||||
|
theta = self.rng.beta(self.alpha[i], self.beta[i]).astype(np.float32)
|
||||||
|
actions[i] = int(np.argmax(theta))
|
||||||
|
new_prices[i] = self.price_grid[i, actions[i]]
|
||||||
|
self.last_actions = actions
|
||||||
|
return np.clip(new_prices, self.c.system_min_price, self.c.system_max_price).astype(np.float32)
|
||||||
@@ -1,5 +1,7 @@
|
|||||||
|
from sys import intern
|
||||||
import gymnasium as gym
|
import gymnasium as gym
|
||||||
from gymnasium import spaces
|
from gymnasium import spaces
|
||||||
|
from matplotlib import interactive
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@@ -24,7 +26,7 @@ class BusinessLogicConstraints():
|
|||||||
coi_sigmoid_temp: float = 1.25
|
coi_sigmoid_temp: float = 1.25
|
||||||
base_human_demand: float = 0.08
|
base_human_demand: float = 0.08
|
||||||
base_agent_demand: float = 0.05
|
base_agent_demand: float = 0.05
|
||||||
human_price_elasticity: float = -1.2
|
human_price_elasticity: float = -1.2 # assumptions here
|
||||||
agent_price_elasticity: float = -0.6
|
agent_price_elasticity: float = -0.6
|
||||||
w_agent_loss: float = 1.0
|
w_agent_loss: float = 1.0
|
||||||
w_volatility: float = 5.0
|
w_volatility: float = 5.0
|
||||||
@@ -35,31 +37,25 @@ class BusinessLogicConstraints():
|
|||||||
def _sigmoid(x: np.ndarray) -> np.ndarray:
|
def _sigmoid(x: np.ndarray) -> np.ndarray:
|
||||||
return 1.0 / (1.0 + np.exp(-x))
|
return 1.0 / (1.0 + np.exp(-x))
|
||||||
|
|
||||||
|
|
||||||
def simple_agent_detector(session_df: pd.DataFrame) -> pd.Series:
|
|
||||||
# baseline heuristic: high velocity + low conversion
|
|
||||||
v = session_df.get("interaction_velocity", pd.Series(0.0, index=session_df.index))
|
|
||||||
cr = session_df.get("conversion_rate", pd.Series(0.0, index=session_df.index))
|
|
||||||
total = session_df.get("total_interactions", pd.Series(0, index=session_df.index))
|
|
||||||
return (total >= 12) & (v >= 0.20) & (cr <= 0.01)
|
|
||||||
|
|
||||||
|
|
||||||
class CommercePlatform:
|
class CommercePlatform:
|
||||||
def __init__(self, product_catelogue_size: int, max_price: float, min_price: float,
|
"""
|
||||||
constraints: BusinessLogicConstraints, agent_detector: Optional[Callable[[pd.DataFrame], pd.Series]] = None,
|
This is just an extension of the state management for the environment, it does not implement anything dynamic just helps us simulate demand.
|
||||||
use_defense: bool = False):
|
"""
|
||||||
|
def __init__(self,
|
||||||
|
product_catelogue_size: int,
|
||||||
|
max_price: float,
|
||||||
|
min_price: float,
|
||||||
|
constraints: BusinessLogicConstraints):
|
||||||
self.product_catelogue_size = product_catelogue_size
|
self.product_catelogue_size = product_catelogue_size
|
||||||
|
self.product_supply = np.random.uniform(low=10, high=50, size=(self.product_catelogue_size,))
|
||||||
self.max_price = max_price
|
self.max_price = max_price
|
||||||
self.min_price = min_price
|
self.min_price = min_price
|
||||||
self.constraints = constraints
|
self.constraints = constraints
|
||||||
self.use_defense = use_defense
|
|
||||||
self.agent_detector = agent_detector
|
|
||||||
self.simulation_history: List[Dict[str, Any]] = []
|
self.simulation_history: List[Dict[str, Any]] = []
|
||||||
self._rng = np.random.default_rng(constraints.seed)
|
self._rng = np.random.default_rng(constraints.seed)
|
||||||
self._popularity = self._rng.lognormal(mean=0.0, sigma=0.6, size=self.product_catelogue_size)
|
|
||||||
self._popularity = self._popularity / (self._popularity.mean() + 1e-12)
|
|
||||||
self._last_interaction_df: pd.DataFrame = pd.DataFrame()
|
self._last_interaction_df: pd.DataFrame = pd.DataFrame()
|
||||||
|
|
||||||
|
|
||||||
def setup_true_demand(self, prices: np.ndarray) -> Dict[str, np.ndarray]:
|
def setup_true_demand(self, prices: np.ndarray) -> Dict[str, np.ndarray]:
|
||||||
# ground truth purchase propensities
|
# ground truth purchase propensities
|
||||||
p = np.clip(prices, self.min_price, self.max_price)
|
p = np.clip(prices, self.min_price, self.max_price)
|
||||||
@@ -67,14 +63,19 @@ class CommercePlatform:
|
|||||||
human_prob = self.constraints.base_human_demand * (pn ** self.constraints.human_price_elasticity)
|
human_prob = self.constraints.base_human_demand * (pn ** self.constraints.human_price_elasticity)
|
||||||
agent_prob = self.constraints.base_agent_demand * (pn ** self.constraints.agent_price_elasticity)
|
agent_prob = self.constraints.base_agent_demand * (pn ** self.constraints.agent_price_elasticity)
|
||||||
return {
|
return {
|
||||||
"human_purchase_prob": np.clip(human_prob * self._popularity, 0.0, 0.95),
|
"human_purchase_prob": np.clip(human_prob, 0.0, 0.95),
|
||||||
"agent_purchase_prob": np.clip(agent_prob * self._popularity, 0.0, 0.95)
|
"agent_purchase_prob": np.clip(agent_prob, 0.0, 0.95)
|
||||||
}
|
}
|
||||||
|
|
||||||
def _session_markup_multiplier(self, signal_score: float) -> float:
|
def _load_behavioral_profile(actor : str, demand_forcing):
|
||||||
# session-based COI markup based on demand signal expression
|
"""
|
||||||
x = (signal_score - self.constraints.coi_threshold) / max(self.constraints.coi_sigmoid_temp, 1e-6)
|
This returns a markov chain with average weights which we get from interaction data of our experiments.
|
||||||
return 1.0 + self.constraints.coi_strength * float(_sigmoid(np.array([x]))[0])
|
This defines transition probabilities between different events:
|
||||||
|
search -> view_item_price_binN: 0.7
|
||||||
|
view_item_price_binN -> add_to_cart: 0.2
|
||||||
|
we also must reweight with the demand_forcing vector or purchase probabilities per-product
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
def _simulate_sessions(self, base_prices: np.ndarray) -> pd.DataFrame:
|
def _simulate_sessions(self, base_prices: np.ndarray) -> pd.DataFrame:
|
||||||
demand = self.setup_true_demand(base_prices)
|
demand = self.setup_true_demand(base_prices)
|
||||||
@@ -84,94 +85,32 @@ class CommercePlatform:
|
|||||||
T = self.constraints.sessions_per_step
|
T = self.constraints.sessions_per_step
|
||||||
n_agent_sessions = int(round(T * self.constraints.agent_share))
|
n_agent_sessions = int(round(T * self.constraints.agent_share))
|
||||||
n_human_sessions = T - n_agent_sessions
|
n_human_sessions = T - n_agent_sessions
|
||||||
|
|
||||||
# human sessions: normal browse with possible purchase
|
|
||||||
for s in range(n_human_sessions):
|
|
||||||
session_id = f"h_{len(events)}_{s}"
|
|
||||||
k = int(self._rng.integers(1, 4))
|
|
||||||
prod_ids = self._rng.choice(self.product_catelogue_size, size=k, replace=False)
|
|
||||||
t = 0.0
|
|
||||||
inter_times = self._rng.gamma(shape=2.0, scale=3.0, size=3 * k)
|
|
||||||
signal_score = 0.0
|
|
||||||
purchased_any = False
|
|
||||||
|
|
||||||
for i, pid in enumerate(prod_ids):
|
|
||||||
t += float(inter_times[i])
|
|
||||||
price_shown = float(base_prices[pid])
|
|
||||||
events.append({
|
|
||||||
"session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
|
|
||||||
"action": "view", "t": t, "price_shown": price_shown, "is_purchase": 0,
|
|
||||||
"price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
|
|
||||||
})
|
|
||||||
signal_score += 1.0
|
|
||||||
|
|
||||||
if self._rng.random() < 0.35:
|
|
||||||
t += float(inter_times[i + k])
|
|
||||||
events.append({
|
|
||||||
"session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
|
|
||||||
"action": "cart", "t": t, "price_shown": price_shown, "is_purchase": 0,
|
|
||||||
"price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
|
|
||||||
})
|
|
||||||
signal_score += 2.0
|
|
||||||
|
|
||||||
if (not purchased_any) and (self._rng.random() < float(human_pprob[pid])):
|
|
||||||
t += float(inter_times[i + 2 * k])
|
|
||||||
mult = self._session_markup_multiplier(signal_score)
|
|
||||||
price_paid = float(np.clip(base_prices[pid] * mult, self.min_price, self.max_price))
|
|
||||||
events.append({
|
|
||||||
"session_id": session_id, "actor": "human", "agent_id": None, "product_id": int(pid),
|
|
||||||
"action": "purchase", "t": t, "price_shown": float(base_prices[pid]), "is_purchase": 1,
|
|
||||||
"price_paid": price_paid, "oracle_price_paid": price_paid, "signal_score": signal_score,
|
|
||||||
})
|
|
||||||
purchased_any = True
|
|
||||||
|
|
||||||
# agent sessions: split recon/purchase to circumvent COI
|
|
||||||
n_agent_ids = max(1, n_agent_sessions // 2)
|
n_agent_ids = max(1, n_agent_sessions // 2)
|
||||||
for a in range(n_agent_ids):
|
session_map = {
|
||||||
agent_id = f"a_{a}"
|
'humans': n_human_sessions,
|
||||||
recon_session_id = f"{agent_id}_recon"
|
'agents': n_agent_ids
|
||||||
t = 0.0
|
}
|
||||||
n_views = int(self._rng.poisson(lam=8) * self.constraints.agent_recon_multiplier) + 5
|
pprob_map = {
|
||||||
inter_times = self._rng.gamma(shape=2.0, scale=0.6, size=max(n_views, 1))
|
'humans': human_pprob,
|
||||||
prod_ids = self._rng.integers(0, self.product_catelogue_size, size=n_views)
|
'agents': agent_pprob
|
||||||
recon_signal = 0.0
|
}
|
||||||
|
joint_events = []
|
||||||
|
for actor, n_sessions in session_map.items():
|
||||||
|
bp = _load_behavioral_profile(actor, pprob_map[actor])
|
||||||
|
counter = 0
|
||||||
|
events = []
|
||||||
|
while counter < n_sessions:
|
||||||
|
session_events = []
|
||||||
|
while len(session_events) == 0 or session_events[-1]['action'] == 'checkout':
|
||||||
|
interaction_event = bp.sample(self._rng)
|
||||||
|
interaction_event['session_id'] = f'{actor}_{counter:06d}'
|
||||||
|
# TODO any other assignments
|
||||||
|
session_events.append(interaction_event)
|
||||||
|
events.extend(session_events)
|
||||||
|
counter += 1
|
||||||
|
joint_events.extend(events)
|
||||||
|
|
||||||
for i, pid in enumerate(prod_ids):
|
return pd.DataFrame(joint_events)
|
||||||
t += float(inter_times[i])
|
|
||||||
events.append({
|
|
||||||
"session_id": recon_session_id, "actor": "agent", "agent_id": agent_id, "product_id": int(pid),
|
|
||||||
"action": "view", "t": t, "price_shown": float(base_prices[pid]), "is_purchase": 0,
|
|
||||||
"price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
|
|
||||||
})
|
|
||||||
recon_signal += 1.0
|
|
||||||
|
|
||||||
# clean purchase session with minimal interactions
|
|
||||||
if self._rng.random() < self.constraints.agent_purchase_probability:
|
|
||||||
purchase_session_id = f"{agent_id}_clean"
|
|
||||||
pid = int(self._rng.integers(0, self.product_catelogue_size))
|
|
||||||
t2 = 0.0
|
|
||||||
clean_signal = 0.0
|
|
||||||
t2 += float(self._rng.gamma(shape=2.0, scale=0.7))
|
|
||||||
events.append({
|
|
||||||
"session_id": purchase_session_id, "actor": "agent", "agent_id": agent_id, "product_id": pid,
|
|
||||||
"action": "view", "t": t2, "price_shown": float(base_prices[pid]), "is_purchase": 0,
|
|
||||||
"price_paid": 0.0, "oracle_price_paid": 0.0, "signal_score": 0.0,
|
|
||||||
})
|
|
||||||
clean_signal += 1.0
|
|
||||||
|
|
||||||
if self._rng.random() < float(agent_pprob[pid]):
|
|
||||||
t2 += float(self._rng.gamma(shape=2.0, scale=0.7))
|
|
||||||
obs_mult = self._session_markup_multiplier(clean_signal)
|
|
||||||
obs_paid = float(np.clip(base_prices[pid] * obs_mult, self.min_price, self.max_price))
|
|
||||||
oracle_mult = self._session_markup_multiplier(recon_signal) # oracle links recon->purchase
|
|
||||||
oracle_paid = float(np.clip(base_prices[pid] * oracle_mult, self.min_price, self.max_price))
|
|
||||||
events.append({
|
|
||||||
"session_id": purchase_session_id, "actor": "agent", "agent_id": agent_id, "product_id": pid,
|
|
||||||
"action": "purchase", "t": t2, "price_shown": float(base_prices[pid]), "is_purchase": 1,
|
|
||||||
"price_paid": obs_paid, "oracle_price_paid": oracle_paid, "signal_score": clean_signal,
|
|
||||||
})
|
|
||||||
|
|
||||||
return pd.DataFrame(events)
|
|
||||||
|
|
||||||
def compute_interaction_features(self, interaction_df: pd.DataFrame) -> Dict[str, float]:
|
def compute_interaction_features(self, interaction_df: pd.DataFrame) -> Dict[str, float]:
|
||||||
if interaction_df.empty:
|
if interaction_df.empty:
|
||||||
@@ -183,6 +122,7 @@ class CommercePlatform:
|
|||||||
return {"mean_sale_price": mean_sale_price, "look_to_book": float(views / (buys + 1e-6))}
|
return {"mean_sale_price": mean_sale_price, "look_to_book": float(views / (buys + 1e-6))}
|
||||||
|
|
||||||
def _session_feature_table(self, df: pd.DataFrame) -> pd.DataFrame:
|
def _session_feature_table(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
# TODO: adapt this
|
||||||
if df.empty:
|
if df.empty:
|
||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
g = df.groupby("session_id", sort=False)
|
g = df.groupby("session_id", sort=False)
|
||||||
@@ -208,73 +148,6 @@ class CommercePlatform:
|
|||||||
"is_agent": is_agent.astype(bool),
|
"is_agent": is_agent.astype(bool),
|
||||||
}).reset_index()
|
}).reset_index()
|
||||||
|
|
||||||
def demand_estimate(self, interaction_df: pd.DataFrame, exclude_sessions: Optional[pd.Series] = None) -> np.ndarray:
|
|
||||||
# proxy demand from weighted interaction events
|
|
||||||
if interaction_df.empty:
|
|
||||||
return np.zeros(self.product_catelogue_size, dtype=np.float32)
|
|
||||||
df = interaction_df
|
|
||||||
if exclude_sessions is not None:
|
|
||||||
bad_sessions = set(exclude_sessions.loc[exclude_sessions].index)
|
|
||||||
df = df[~df["session_id"].isin(bad_sessions)]
|
|
||||||
weights = {"view": 0.15, "cart": 0.75, "purchase": 2.5}
|
|
||||||
w = df["action"].map(weights).fillna(0.0).to_numpy(dtype=float)
|
|
||||||
prod = df["product_id"].to_numpy(dtype=int)
|
|
||||||
q_hat = np.zeros(self.product_catelogue_size, dtype=float)
|
|
||||||
np.add.at(q_hat, prod, w)
|
|
||||||
return q_hat.astype(np.float32)
|
|
||||||
|
|
||||||
def run_pricing_simulation(self, prices: np.ndarray) -> Dict[str, Any]:
|
|
||||||
interaction_df = self._simulate_sessions(prices)
|
|
||||||
self._last_interaction_df = interaction_df
|
|
||||||
session_df = self._session_feature_table(interaction_df)
|
|
||||||
|
|
||||||
predicted_agent_sessions = None
|
|
||||||
if (self.use_defense and self.agent_detector is not None and not session_df.empty):
|
|
||||||
predicted_agent_sessions = self.agent_detector(session_df.set_index("session_id"))
|
|
||||||
|
|
||||||
q_hat_naive = self.demand_estimate(interaction_df, exclude_sessions=None)
|
|
||||||
q_hat_defended = self.demand_estimate(interaction_df, exclude_sessions=predicted_agent_sessions) \
|
|
||||||
if predicted_agent_sessions is not None else q_hat_naive.copy()
|
|
||||||
|
|
||||||
true_human = np.zeros(self.product_catelogue_size, dtype=float)
|
|
||||||
true_agent = np.zeros(self.product_catelogue_size, dtype=float)
|
|
||||||
if not interaction_df.empty:
|
|
||||||
purchases = interaction_df[interaction_df["action"] == "purchase"]
|
|
||||||
if not purchases.empty:
|
|
||||||
for _, r in purchases.iterrows():
|
|
||||||
if r["actor"] == "human":
|
|
||||||
true_human[int(r["product_id"])] += 1.0
|
|
||||||
else:
|
|
||||||
true_agent[int(r["product_id"])] += 1.0
|
|
||||||
|
|
||||||
revenue_observed = float(interaction_df["price_paid"].sum()) if not interaction_df.empty else 0.0
|
|
||||||
revenue_oracle = float(interaction_df["oracle_price_paid"].sum()) if not interaction_df.empty else 0.0
|
|
||||||
agent_loss = max(0.0, revenue_oracle - revenue_observed)
|
|
||||||
|
|
||||||
eps = 1e-6
|
|
||||||
internal_error_naive = np.abs(true_human - q_hat_naive) / (true_human + eps)
|
|
||||||
internal_error_def = np.abs(true_human - q_hat_defended) / (true_human + eps)
|
|
||||||
interaction_features = self.compute_interaction_features(interaction_df)
|
|
||||||
|
|
||||||
summary = {
|
|
||||||
"prices": prices.copy(),
|
|
||||||
"interaction_df": interaction_df,
|
|
||||||
"session_df": session_df,
|
|
||||||
"q_hat_naive": q_hat_naive,
|
|
||||||
"q_hat_defended": q_hat_defended,
|
|
||||||
"true_human_demand": true_human.astype(np.float32),
|
|
||||||
"true_agent_purchases": true_agent.astype(np.float32),
|
|
||||||
"internal_error_naive": internal_error_naive.astype(np.float32),
|
|
||||||
"internal_error_defended": internal_error_def.astype(np.float32),
|
|
||||||
"interaction_features": interaction_features,
|
|
||||||
"revenue_observed": revenue_observed,
|
|
||||||
"revenue_oracle": revenue_oracle,
|
|
||||||
"agent_loss": agent_loss,
|
|
||||||
"predicted_agent_sessions": predicted_agent_sessions,
|
|
||||||
}
|
|
||||||
self.simulation_history.append(summary)
|
|
||||||
return summary
|
|
||||||
|
|
||||||
def get_interaction_data(self) -> np.ndarray:
|
def get_interaction_data(self) -> np.ndarray:
|
||||||
if self._last_interaction_df.empty:
|
if self._last_interaction_df.empty:
|
||||||
return np.array([], dtype=object)
|
return np.array([], dtype=object)
|
||||||
@@ -284,7 +157,7 @@ class CommercePlatform:
|
|||||||
class PHANTOMEnv(gym.Env):
|
class PHANTOMEnv(gym.Env):
|
||||||
metadata = {"render_modes": []}
|
metadata = {"render_modes": []}
|
||||||
|
|
||||||
def __init__(self, use_defense: bool = False):
|
def __init__(self, constraints):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.constraints = BusinessLogicConstraints()
|
self.constraints = BusinessLogicConstraints()
|
||||||
self.action_space = spaces.Box(low=-self.constraints.max_price_adjustment,
|
self.action_space = spaces.Box(low=-self.constraints.max_price_adjustment,
|
||||||
@@ -301,14 +174,13 @@ class PHANTOMEnv(gym.Env):
|
|||||||
high=np.full((self.constraints.product_catelogue_size,), 1e6, dtype=np.float32),
|
high=np.full((self.constraints.product_catelogue_size,), 1e6, dtype=np.float32),
|
||||||
dtype=np.float32),
|
dtype=np.float32),
|
||||||
})
|
})
|
||||||
|
# TODO: define more features that we compute from the interaction data
|
||||||
})
|
})
|
||||||
self.commerce_platform = CommercePlatform(
|
self.commerce_platform = CommercePlatform(
|
||||||
product_catelogue_size=self.constraints.product_catelogue_size,
|
product_catelogue_size=self.constraints.product_catelogue_size,
|
||||||
max_price=self.constraints.system_max_price,
|
max_price=self.constraints.system_max_price,
|
||||||
min_price=self.constraints.system_min_price,
|
min_price=self.constraints.system_min_price,
|
||||||
constraints=self.constraints,
|
constraints=self.constraints)
|
||||||
agent_detector=simple_agent_detector,
|
|
||||||
use_defense=use_defense)
|
|
||||||
self._rng = np.random.default_rng(self.constraints.seed)
|
self._rng = np.random.default_rng(self.constraints.seed)
|
||||||
self.t = 0
|
self.t = 0
|
||||||
self._prev_prices: Optional[np.ndarray] = None
|
self._prev_prices: Optional[np.ndarray] = None
|
||||||
@@ -336,17 +208,13 @@ class PHANTOMEnv(gym.Env):
|
|||||||
new_prices = np.clip(base_prices * (1.0 + action.astype(np.float32)),
|
new_prices = np.clip(base_prices * (1.0 + action.astype(np.float32)),
|
||||||
self.constraints.system_min_price,
|
self.constraints.system_min_price,
|
||||||
self.constraints.system_max_price).astype(np.float32)
|
self.constraints.system_max_price).astype(np.float32)
|
||||||
result = self.commerce_platform.run_pricing_simulation(new_prices)
|
|
||||||
|
|
||||||
if self.commerce_platform.use_defense:
|
|
||||||
demand_est = result["q_hat_defended"]
|
|
||||||
internal_err = result["internal_error_defended"]
|
|
||||||
else:
|
|
||||||
demand_est = result["q_hat_naive"]
|
|
||||||
internal_err = result["internal_error_naive"]
|
|
||||||
|
|
||||||
self.state["elasticity"]["price"] = new_prices
|
self.state["elasticity"]["price"] = new_prices
|
||||||
self.state["elasticity"]["demand"] = demand_est
|
# TODO: use the commerce platform to simulate sessions
|
||||||
|
interactions_df = self.commerce_platform._simulate_sessions(new_prices)
|
||||||
|
result = self.commerce_platform.compute_interaction_features(interactions_df)
|
||||||
|
# TODO: implement COI computation to use in reward
|
||||||
|
COI = 0.0
|
||||||
|
|
||||||
volatility = 0.0 if self._prev_prices is None else \
|
volatility = 0.0 if self._prev_prices is None else \
|
||||||
float(np.mean(np.abs((new_prices - self._prev_prices) / (self._prev_prices + 1e-6))))
|
float(np.mean(np.abs((new_prices - self._prev_prices) / (self._prev_prices + 1e-6))))
|
||||||
@@ -354,12 +222,13 @@ class PHANTOMEnv(gym.Env):
|
|||||||
|
|
||||||
revenue_observed = float(result["revenue_observed"])
|
revenue_observed = float(result["revenue_observed"])
|
||||||
agent_loss = float(result["agent_loss"])
|
agent_loss = float(result["agent_loss"])
|
||||||
err_mean = float(np.mean(internal_err))
|
|
||||||
|
|
||||||
reward = (revenue_observed
|
reward = (revenue_observed
|
||||||
|
- COI
|
||||||
- self.constraints.w_agent_loss * agent_loss
|
- self.constraints.w_agent_loss * agent_loss
|
||||||
- self.constraints.w_volatility * volatility
|
- self.constraints.w_volatility * volatility
|
||||||
- self.constraints.w_estimation_error * err_mean)
|
- self.constraints.w_estimation_error
|
||||||
|
)
|
||||||
|
|
||||||
terminated = self.t >= self.constraints.episode_length
|
terminated = self.t >= self.constraints.episode_length
|
||||||
info = {
|
info = {
|
||||||
|
|||||||
149
sim/rl/train.py
Normal file
149
sim/rl/train.py
Normal file
@@ -0,0 +1,149 @@
|
|||||||
|
import numpy as np
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Type, Optional
|
||||||
|
import pickle
|
||||||
|
from torch import neg_
|
||||||
|
from torch.utils.tensorboard import SummaryWriter
|
||||||
|
from environment import PHANTOMEnv, FastTrainingConstraints, BusinessLogicConstraints
|
||||||
|
from engine import (BasePricingEngine, WildPricingEngine, StaticPricingEngine,
|
||||||
|
SimpleDemandEngine, RandomWalkEngine, ThompsonSamplingEngine)
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Target training loop:
|
||||||
|
have base prices p0 from env reset and run the env step, collect reward and metrics
|
||||||
|
pass this to the pricing engine which computes the price action to take based on previous reward by learning
|
||||||
|
the new action gets passed to the step
|
||||||
|
so we alternate, step -> reward -> engine (produces price delta) -> step with price delta -> reward
|
||||||
|
to make sure the reinforcement learning inside the engine can learn we need to have trajectory of prices
|
||||||
|
CURRENT SOLUTION BELOW does not implement correct learning or updates.
|
||||||
|
"""
|
||||||
|
|
||||||
|
class EngineTrainer:
|
||||||
|
"""wrapper to run pricing engines through episodes and collect metrics"""
|
||||||
|
def __init__(self, engine: BasePricingEngine, env: PHANTOMEnv,
|
||||||
|
tb_writer: Optional[SummaryWriter] = None):
|
||||||
|
self.engine = engine
|
||||||
|
self.env = env
|
||||||
|
self.episode_metrics = []
|
||||||
|
self.tb_writer = tb_writer
|
||||||
|
self.global_step = 0
|
||||||
|
|
||||||
|
def train(self, n_episodes: int, seed: int = 42):
|
||||||
|
|
||||||
|
obs, _ = self.env.reset(seed=seed)
|
||||||
|
prices = None
|
||||||
|
for ep in range(n_episodes):
|
||||||
|
prices = self.engine.compute_prices(prices, obs
|
||||||
|
obs, reward, done, _, info = self.env.step(prices)
|
||||||
|
self.engine.update(obs, reward, done, info)
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
return self.episode_metrics
|
||||||
|
|
||||||
|
def evaluate(self, n_episodes: int = 10, seed: int = 100) -> Dict:
|
||||||
|
"""evaluate trained engine"""
|
||||||
|
results = {k: [] for k in ['total_reward', 'revenue_observed', 'revenue_oracle',
|
||||||
|
'agent_loss', 'ux_volatility', 'look_to_book']}
|
||||||
|
for ep in range(n_episodes):
|
||||||
|
metrics = self.run_episode(seed=seed + ep)
|
||||||
|
for k in results: results[k].append(metrics[k])
|
||||||
|
return {k: (np.mean(v), np.std(v)) for k, v in results.items()}
|
||||||
|
|
||||||
|
|
||||||
|
def make_env(fast: bool = True):
|
||||||
|
constraints = FastTrainingConstraints() if fast else BusinessLogicConstraints()
|
||||||
|
return PHANTOMEnv(constraints=constraints)
|
||||||
|
|
||||||
|
|
||||||
|
def train_engine(engine_cls: Type[BasePricingEngine], env: PHANTOMEnv,
|
||||||
|
n_episodes: int, seed: int = 42,
|
||||||
|
tb_writer: Optional[SummaryWriter] = None) -> EngineTrainer:
|
||||||
|
constraints = env.constraints
|
||||||
|
engine = engine_cls(constraints=constraints, seed=seed)
|
||||||
|
trainer = EngineTrainer(engine, env, tb_writer=tb_writer)
|
||||||
|
trainer.train(n_episodes, seed=seed)
|
||||||
|
return trainer
|
||||||
|
|
||||||
|
|
||||||
|
def save_trainer(trainer: EngineTrainer, path: Path):
|
||||||
|
"""save engine state and metrics"""
|
||||||
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
with open(path, 'wb') as f:
|
||||||
|
pickle.dump({
|
||||||
|
'engine': trainer.engine,
|
||||||
|
'metrics': trainer.episode_metrics
|
||||||
|
}, f)
|
||||||
|
logger.info(f"Saved trainer to {path}")
|
||||||
|
|
||||||
|
|
||||||
|
def load_trainer(path: Path, env: PHANTOMEnv,
|
||||||
|
tb_writer: Optional[SummaryWriter] = None) -> EngineTrainer:
|
||||||
|
"""load saved engine"""
|
||||||
|
with open(path, 'rb') as f:
|
||||||
|
data = pickle.load(f)
|
||||||
|
trainer = EngineTrainer(data['engine'], env, tb_writer=tb_writer)
|
||||||
|
trainer.episode_metrics = data['metrics']
|
||||||
|
return trainer
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
base_dir = Path("./runs")
|
||||||
|
base_dir.mkdir(exist_ok=True)
|
||||||
|
|
||||||
|
engines = {
|
||||||
|
"Wild": WildPricingEngine,
|
||||||
|
"Static": StaticPricingEngine,
|
||||||
|
# "SimpleDemand": SimpleDemandEngine,
|
||||||
|
"RandomWalk": RandomWalkEngine,
|
||||||
|
"ThompsonSampling": ThompsonSamplingEngine,
|
||||||
|
}
|
||||||
|
defenses = [False, True]
|
||||||
|
n_train_episodes = 50
|
||||||
|
n_eval_episodes = 10
|
||||||
|
seed = 42
|
||||||
|
fast_mode = True
|
||||||
|
|
||||||
|
logger.info(f"Training config: {n_train_episodes} episodes per engine, fast_mode={fast_mode}")
|
||||||
|
|
||||||
|
trained_trainers = {}
|
||||||
|
|
||||||
|
for engine_name, engine_cls in engines.items():
|
||||||
|
for use_defense in defenses:
|
||||||
|
defense_label = "defense_on" if use_defense else "defense_off"
|
||||||
|
run_name = f"{engine_name}_{defense_label}"
|
||||||
|
log_dir = base_dir / run_name
|
||||||
|
log_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
logger.info(f"Training {engine_name} with defense={use_defense}")
|
||||||
|
logger.info(f"Log directory: {log_dir}")
|
||||||
|
|
||||||
|
env = make_env(fast=fast_mode)
|
||||||
|
tb_writer = SummaryWriter(log_dir=str(log_dir))
|
||||||
|
trainer = train_engine(engine_cls, env, n_train_episodes, seed, tb_writer=tb_writer)
|
||||||
|
tb_writer.close()
|
||||||
|
|
||||||
|
save_path = log_dir / "trainer.pkl"
|
||||||
|
save_trainer(trainer, save_path)
|
||||||
|
|
||||||
|
trained_trainers[run_name] = (trainer, env)
|
||||||
|
|
||||||
|
logger.info("Starting evaluation")
|
||||||
|
|
||||||
|
for run_name, (trainer, env) in trained_trainers.items():
|
||||||
|
logger.info(f"Evaluating {run_name}")
|
||||||
|
results = trainer.evaluate(n_episodes=n_eval_episodes, seed=seed + 1000)
|
||||||
|
for metric, (mean, std) in results.items():
|
||||||
|
logger.info(f" {metric:20s}: {mean:10.2f} ± {std:6.2f}")
|
||||||
|
|
||||||
|
logger.info(f"Results saved to: {base_dir}")
|
||||||
Reference in New Issue
Block a user