From c4fd1352c9e054035ca6af4f7c03ecc7dc9cbd6a Mon Sep 17 00:00:00 2001 From: Daniel Rosel Date: Mon, 2 Feb 2026 11:18:37 +0100 Subject: [PATCH] naoice COI implementation --- engine/engine.py | 58 +++++++++----- engine/lib/__init__.py | 3 +- engine/lib/behavior.py | 53 +++++++++++-- engine/wrapper.py | 173 +++++++++++++++++++++++++++++++---------- paper/src/main.tex | 2 +- 5 files changed, 221 insertions(+), 68 deletions(-) diff --git a/engine/engine.py b/engine/engine.py index a4d568d..000f03f 100644 --- a/engine/engine.py +++ b/engine/engine.py @@ -3,20 +3,23 @@ import numpy as np from .lib.demand import generate_demand_for_actor, estimate_demand from .lib.behavior import sample_behavior from logging import INFO, getLogger + logger = getLogger(__name__) logger.setLevel(INFO) -class MarketEngine(): +class MarketEngine: """implements separate demand distributions for humans and agents per Section 3.1.1""" - def __init__(self, - alpha: float, - N: int, - human_params: tuple, - agent_params: tuple, - demand_distribution = np.random.normal, - noise_std: float = 1.0): + def __init__( + self, + alpha: float, + N: int, + human_params: tuple, + agent_params: tuple, + demand_distribution=np.random.normal, + noise_std: float = 1.0, + ): # no defaults for D_H, D_A - force explicit experiment design self.alpha = alpha self.Nagents = int(N * alpha) @@ -28,31 +31,41 @@ class MarketEngine(): def act(self, prices): # generate separate demands d() per actor type - demand_h = generate_demand_for_actor(prices, self.human_params, self.noise_std, distribution_method = self.demand_dist) - demand_a = generate_demand_for_actor(prices, self.agent_params, self.noise_std, distribution_method = self.demand_dist) + demand_h = generate_demand_for_actor( + prices, + self.human_params, + self.noise_std, + distribution_method=self.demand_dist, + ) + demand_a = generate_demand_for_actor( + prices, + self.agent_params, + self.noise_std, + distribution_method=self.demand_dist, + ) # sample behavior trajectories from each demand distribution human_t = [sample_behavior(demand_h, human=True) for _ in range(self.Nhumans)] agent_t = [sample_behavior(demand_a, human=False) for _ in range(self.Nagents)] - return estimate_demand(human_t + agent_t) + # store trajectories for agent probability calculation + self.last_trajectories = human_t + agent_t + return estimate_demand(self.last_trajectories) def measure(self): pass -class PricingEngine(): - def __init__(self, - ) -> None: + +class PricingEngine: + def __init__( + self, + ) -> None: pass def act(self, demand): return np.random.uniform(low=25, high=100, size=10) - -class Limbo(): - def __init__(self, - platform, - market - ) -> None: +class Limbo: + def __init__(self, platform, market) -> None: self.platform_turn = True self.platform = platform self.market = market @@ -67,9 +80,12 @@ class Limbo(): print(self.output) self.platform_turn = not self.platform_turn + if __name__ == "__main__": platform = PricingEngine() - market = MarketEngine(alpha=0.3, N=100, human_params=(50, 10), agent_params=(45, 15)) + market = MarketEngine( + alpha=0.3, N=100, human_params=(50, 10), agent_params=(45, 15) + ) limbo = Limbo(platform, market) for _ in range(10): limbo.step() diff --git a/engine/lib/__init__.py b/engine/lib/__init__.py index d120204..0546a18 100644 --- a/engine/lib/__init__.py +++ b/engine/lib/__init__.py @@ -1,6 +1,7 @@ from .demand import estimate_demand, generate_demand_for_actor -from .behavior import sample_behavior +from .behavior import sample_behavior, get_transition_models, trajectory_to_events from .render import DashboardRenderer, style_axis from .wrappers import EconomicMetricsWrapper from .callbacks import MetricsCallback, EvalMetricsCallback from .providers import ProviderBenchmark, ProviderResult, BenchmarkConfig +from .coi import compute_coi_leakage, compute_erosion_metrics, compute_agent_probability diff --git a/engine/lib/behavior.py b/engine/lib/behavior.py index 0f8c486..34faad2 100644 --- a/engine/lib/behavior.py +++ b/engine/lib/behavior.py @@ -1,3 +1,8 @@ +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parents[2])) + from sim.rl.behavior_loader.models import ( BehaviorModel, AgentBehaviorModel, @@ -7,11 +12,9 @@ import pandas as pd import numpy as np from .demand import generate_demand_for_actor -base_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments" -human_dir, agent_dir = ( - f"{base_dir}/collected_data/", - f"{base_dir}/agents/collected_data/", -) +base_dir = Path(__file__).parents[2] / "experiments" +human_dir = str(base_dir / "collected_data") +agent_dir = str(base_dir / "agents" / "collected_data") _cache = {} # lazy cache for models and base pivots @@ -25,6 +28,46 @@ def _get_base_pivot(human: bool): return _cache[key] +def get_transition_models(): + """load human and agent transition models for agent probability calculation + + returns: + tuple: (human_transitions, agent_transitions) as dicts of event->event->prob + """ + human_model = BehaviorModel(human_dir) + agent_model = AgentBehaviorModel(agent_dir) + + human_mdp = human_model.build_MDP() + agent_mdp = agent_model.build_MDP() + + human_trans = aggregate_event_transitions(human_mdp) + agent_trans = aggregate_event_transitions(agent_mdp) + + return human_trans, agent_trans + + +def trajectory_to_events(trajectory: list) -> list: + """extract event names from trajectory for KL divergence calculation + + trajectories are in format 'eventName_product0', extract just eventName + + args: + trajectory: list like ['view_product0', 'add_to_cart_product1', 'checkout_product1'] + + returns: + list: event names like ['view', 'add_to_cart', 'checkout'] + """ + events = [] + for state in trajectory: + # state format from sample_behavior: 'eventName_productX' + if "_product" in state: + event = state.rsplit("_product", 1)[0] + else: + event = state + events.append(event) + return events + + def adjust_behavior_to_condition(condition, transition_matrix): # expand NxN transition matrix to (N*P)x(N*P) weighted by demand condition cond_norm = condition / np.sum(condition) diff --git a/engine/wrapper.py b/engine/wrapper.py index 1dee8f5..e435aeb 100644 --- a/engine/wrapper.py +++ b/engine/wrapper.py @@ -3,30 +3,42 @@ from gymnasium import spaces import numpy as np from .engine import Limbo, MarketEngine, PricingEngine from .lib.render import DashboardRenderer -from .lib.coi import compute_coi_proxy +from .lib.coi import ( + compute_coi_leakage, + compute_erosion_metrics, + compute_agent_probability, +) +from .lib.behavior import get_transition_models, trajectory_to_events from .lib.wrappers import EconomicMetricsWrapper class PHANTOM(gym.Env): - """Gymnasium wrapper for the Limbo pricing-market simulation. Platform sets prices, market responds with demand.""" + """Gymnasium wrapper for Limbo pricing-market simulation implementing thesis COI framework + + reward = R(p,d) - λ·COI_leak(p,τ') per thesis Section on DR-RL + COI_leak uses behavioral divergence to estimate agent probability f(τ') + """ + metadata = {"render_modes": ["human", "ansi"]} - def __init__(self, - n_products: int = 10, - alpha: float = 0.3, - N: int = 100, - human_params: tuple = (50.0, 10.0), - agent_params: tuple = (45.0, 15.0), - noise_std: float = 1.0, - price_bounds: tuple = (10.0, 150.0), - lambda_coi: float = 0.1, - coi_window: int = 10, - render_mode: str = None): + def __init__( + self, + n_products: int = 10, + alpha: float = 0.3, + N: int = 100, + human_params: tuple = (50.0, 10.0), + agent_params: tuple = (45.0, 15.0), + noise_std: float = 1.0, + price_bounds: tuple = (10.0, 150.0), + lambda_coi: float = 0.1, + coi_window: int = 10, + render_mode: str = None, + ): super().__init__() self.n_products = n_products self.price_bounds = price_bounds self.lambda_coi = lambda_coi - self.coi_window = coi_window # K steps for rolling COI calculation + self.coi_window = coi_window self.render_mode = render_mode self.alpha = alpha self.N = N @@ -34,20 +46,34 @@ class PHANTOM(gym.Env): self.agent_params = agent_params self.market = MarketEngine( - alpha=alpha, N=N, - human_params=human_params, agent_params=agent_params, noise_std=noise_std + alpha=alpha, + N=N, + human_params=human_params, + agent_params=agent_params, + noise_std=noise_std, ) self._platform_stub = PricingEngine() self._limbo = Limbo(self._platform_stub, self.market) self.action_space = spaces.Box( - low=price_bounds[0], high=price_bounds[1], - shape=(n_products,), dtype=np.float32 + low=price_bounds[0], + high=price_bounds[1], + shape=(n_products,), + dtype=np.float32, + ) + self.observation_space = spaces.Dict( + { + "demand": spaces.Box( + low=0.0, high=100.0, shape=(n_products,), dtype=np.float32 + ), + "prices": spaces.Box( + low=price_bounds[0], + high=price_bounds[1], + shape=(n_products,), + dtype=np.float32, + ), + } ) - self.observation_space = spaces.Dict({ - "demand": spaces.Box(low=0.0, high=100.0, shape=(n_products,), dtype=np.float32), - "prices": spaces.Box(low=price_bounds[0], high=price_bounds[1], shape=(n_products,), dtype=np.float32), - }) self._prices = None self._demand = None @@ -56,25 +82,61 @@ class PHANTOM(gym.Env): self._price_history = [] self._revenue_history = [] self._renderer = None - self._initial_episode_prices = None # prices at episode start for COI calc + self._initial_episode_prices = None + self._trajectories = [] # session trajectories for agent prob calculation + + # load behavioral models for agent probability estimation + try: + self._human_trans, self._agent_trans = get_transition_models() + except Exception: + # fallback if behavioral data unavailable + self._human_trans, self._agent_trans = None, None def _get_obs(self) -> dict: - demand_arr = np.array([self._demand.get(i, 0.0) for i in range(self.n_products)], dtype=np.float32) + demand_arr = np.array( + [self._demand.get(i, 0.0) for i in range(self.n_products)], dtype=np.float32 + ) return {"demand": demand_arr, "prices": self._prices.astype(np.float32)} - def _compute_coi_proxy(self): - return compute_coi_proxy( - self._price_history, self._demand_history, self._initial_episode_prices, - self._prices, self.price_bounds, self.alpha, self.coi_window + def _compute_agent_prob(self) -> float: + """estimate agent probability from accumulated trajectories using KL divergence""" + if ( + not self._trajectories + or self._human_trans is None + or self._agent_trans is None + ): + return self.alpha # fallback to contamination level + + # aggregate all trajectories from this episode + all_events = [] + for traj in self._trajectories: + all_events.extend(trajectory_to_events(traj)) + + if len(all_events) < 2: + return self.alpha + + return compute_agent_probability( + all_events, self._human_trans, self._agent_trans ) def _compute_reward(self, prices: np.ndarray, demand: dict) -> float: - revenue = np.sum(prices * np.array([demand.get(i, 0.0) for i in range(self.n_products)])) - coi_penalty = self.lambda_coi * self._compute_coi_proxy() + revenue = np.sum( + prices * np.array([demand.get(i, 0.0) for i in range(self.n_products)]) + ) + + # compute agent probability from behavioral trajectories + agent_prob = self._compute_agent_prob() + + # COI leakage: minimal implementation per thesis + coi_leakage = compute_coi_leakage(prices, agent_prob) + coi_penalty = self.lambda_coi * coi_leakage + return float(revenue - coi_penalty) def _record_history(self): - demand_arr = np.array([self._demand.get(i, 0.0) for i in range(self.n_products)]) + demand_arr = np.array( + [self._demand.get(i, 0.0) for i in range(self.n_products)] + ) self._demand_history.append(demand_arr) self._price_history.append(self._prices.copy()) self._revenue_history.append(np.sum(self._prices * demand_arr)) @@ -82,10 +144,11 @@ class PHANTOM(gym.Env): def reset(self, seed=None, options=None): super().reset(seed=seed) self._prices = np.random.uniform(*self.price_bounds, size=self.n_products) - self._initial_episode_prices = self._prices.copy() # snapshot for COI calculation + self._initial_episode_prices = self._prices.copy() self._demand = self.market.act(self._prices) self._step_count = 0 self._demand_history, self._price_history, self._revenue_history = [], [], [] + self._trajectories = [] self._record_history() return self._get_obs(), {} @@ -95,15 +158,36 @@ class PHANTOM(gym.Env): self._step_count += 1 self._record_history() - coi_proxy = self._compute_coi_proxy() + # capture trajectories generated by market for agent prob estimation + if hasattr(self.market, "last_trajectories"): + self._trajectories.extend(self.market.last_trajectories) + + agent_prob = self._compute_agent_prob() + coi_leakage = compute_coi_leakage(self._prices, agent_prob) reward = self._compute_reward(self._prices, self._demand) terminated = self._step_count >= 100 + # legacy erosion metrics for comparison + erosion = compute_erosion_metrics( + self._price_history, + self._demand_history, + self._initial_episode_prices, + self._prices, + self.price_bounds, + self.alpha, + self.coi_window, + ) + info = { "step": self._step_count, - "coi_proxy": coi_proxy, - "coi_penalty": self.lambda_coi * coi_proxy, - "raw_revenue": np.sum(self._prices * np.array([self._demand.get(i, 0.0) for i in range(self.n_products)])), + "agent_prob": agent_prob, + "coi_leakage": coi_leakage, + "coi_penalty": self.lambda_coi * coi_leakage, + "erosion_metrics": erosion, + "raw_revenue": np.sum( + self._prices + * np.array([self._demand.get(i, 0.0) for i in range(self.n_products)]) + ), } return self._get_obs(), reward, terminated, False, info @@ -114,10 +198,16 @@ class PHANTOM(gym.Env): p, q = np.array(self._price_history), np.array(self._demand_history) dp, dq = np.diff(p, axis=0), np.diff(q, axis=0) valid = np.abs(dp) > 0.5 - with np.errstate(divide='ignore', invalid='ignore'): - elasticity = np.where(valid, (dq / dp) * (p[:-1] / np.maximum(q[:-1], 1.0)), 0.0) + with np.errstate(divide="ignore", invalid="ignore"): + elasticity = np.where( + valid, (dq / dp) * (p[:-1] / np.maximum(q[:-1], 1.0)), 0.0 + ) elasticity = np.nan_to_num(np.clip(elasticity, -5.0, 5.0), nan=0.0) - return np.mean(elasticity, axis=0) if len(elasticity) > 0 else np.zeros(self.n_products) + return ( + np.mean(elasticity, axis=0) + if len(elasticity) > 0 + else np.zeros(self.n_products) + ) def render(self): if self.render_mode == "human": @@ -125,7 +215,9 @@ class PHANTOM(gym.Env): self._renderer = DashboardRenderer() self._renderer.render(self) elif self.render_mode == "ansi": - return f"step={self._step_count}, prices={self._prices}, demand={self._demand}" + return ( + f"step={self._step_count}, prices={self._prices}, demand={self._demand}" + ) return None def close(self): @@ -140,6 +232,7 @@ if __name__ == "__main__": class RandomPolicy: """Minimal SB3-compatible random policy for baseline testing.""" + def __init__(self, env): self.env = env self.num_timesteps = 0 diff --git a/paper/src/main.tex b/paper/src/main.tex index 88260b9..3680ac8 100644 --- a/paper/src/main.tex +++ b/paper/src/main.tex @@ -27,7 +27,7 @@ These behavioral signals serve as inputs for a Distributionally Robust Reinforce \noindent\textbf{Keywords:} Dynamic Pricing, LLM Agents, Adversarial Machine Learning, E-commerce, Behavioral Detection, Reinforcement Learning \vspace{1em} -\noindent\textbf{Acknowledgments:} Eugene Bykovets, PhD - ETH for helping with problem formulation. This research was supported by the TPU Research Cloud program. +\noindent\textbf{Acknowledgments:} This research was supported by the TPU Research Cloud program. \clearpage \input{chapters/01-intro}