win: reformulated and re-inspired from library

2026-06-01 00:53:36 +00:00 · 2026-01-23 17:16:32 +01:00
parent b0a1647956
commit 28669ea4c3
2 changed files with 626 additions and 0 deletions
--- a/lab/case/thesis/simplified_env.py
+++ b/lab/case/thesis/simplified_env.py
@@ -0,0 +1,338 @@
+"""Gymnasium-compatible RL environment for thesis pricing system.
+
+Wraps simplified.System with standard Gym interface for training pricing policies.
+Supports multiple reward modes and contamination scenarios.
+
+Action: price multipliers [0.5, 1.5] applied to reference prices
+Observation: [price ratios, demand shares, alpha_est, alpha_true, margins, step / max_steps]
+Reward: configurable objective (revenue, profit, robust, coi-aware)
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any, Dict, Tuple
+import numpy as np
+
+try:
+    import gymnasium as gym
+    from gymnasium import spaces
+    HAS_GYM = True
+except ImportError:
+    HAS_GYM = False
+
+from .simplified import (System, Session, Event, Limbo, put_prices_to_market,
+                         compute_demand, estimate_alpha, coi_erosion, TRANS_H, TRANS_A)
+
+
+@dataclass
+class EnvConfig:
+    """Configuration for pricing environment.
+
+    Plain value container: no validation is performed on any field here.
+    """
+    # number of products; sizes the action vector and the per-product
+    # segments of the observation
+    n_products: int = 5
+    # the episode is reported as finished once this many steps have run
+    max_steps: int = 200
+    # forwarded as ``n_sessions`` to ``System.observe_demand`` on every step
+    sessions_per_step: int = 30
+    alpha_true: float = 0.2           # true contamination level
+    # scale of the per-step Gaussian perturbation of α; 0 disables drift
+    alpha_drift: float = 0.0          # per-step drift in α
+    # (low, high) range α is clipped to whenever it is perturbed
+    alpha_bounds: Tuple[float, float] = (0.0, 0.6)
+    lambda_coi: float = 0.5           # COI penalty weight
+    lambda_vol: float = 0.1           # volatility penalty weight
+    # any value outside the four listed modes is rewarded like "profit"
+    reward_mode: str = "robust"       # revenue | profit | robust | coi_aware
+    # when True the reward is divided by the sum of reference prices
+    normalize_reward: bool = True
+    # used by reset() whenever the caller does not pass an explicit seed
+    seed: int | None = 42
+
+
+class PricingEnv:
+    """RL environment for dynamic pricing under agent contamination.
+
+    Implements the thesis formulation where:
+    - Platform sets prices p_t
+    - Market responds with mixture demand Q(p) = (1-α)D_H + αD_A
+    - Agent estimates contamination α̂ from behavioral signals
+    - Reward balances profit vs COI leakage
+
+    Observation space (normalized):
+        [0:n]     - current prices / ref_prices
+        [n:2n]    - per-product share of the last step's aggregated demand
+        [2n]      - estimated contamination α̂
+        [2n+1]    - true contamination α (always included by this class)
+        [2n+2:3n+2] - current margins (prices - costs) / costs
+        [3n+2]    - step / max_steps
+
+    Action space:
+        price multipliers in [0.5, 1.5] applied to reference prices
+
+    The class exposes reset/step/render/close in the Gymnasium call shape but
+    does not subclass ``gym.Env``.
+    """
+
+    metadata = {"render_modes": ["human", "ansi"]}
+
+    def __init__(self, cfg: EnvConfig | None = None):
+        """Create the environment; the simulator itself is built in reset().
+
+        Args:
+            cfg: environment configuration; a default EnvConfig is used when
+                omitted.
+
+        Raises:
+            ImportError: if gymnasium could not be imported at module load.
+        """
+        if not HAS_GYM:
+            raise ImportError("gymnasium required")
+        self.cfg = cfg or EnvConfig()
+        self.n = self.cfg.n_products
+        self._sys: System | None = None
+        self._t = 0
+        self._alpha = self.cfg.alpha_true
+        self._last_prices: np.ndarray | None = None
+        self._last_demand: Dict[str, float] | None = None
+        self._episode_rewards: list[float] = []
+        self._demand_agg = np.zeros(self.n)
+
+        # gymnasium spaces
+        self.action_space = spaces.Box(low=0.5, high=1.5, shape=(self.n,), dtype=np.float32)
+        obs_dim = self.n + self.n + 1 + 1 + self.n + 1  # prices + demand + α̂ + α + margins + t
+        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float32)
+
+    def _build_obs(self) -> np.ndarray:
+        """Construct observation vector.
+
+        Returns an all-zero float32 vector before the first reset(); otherwise
+        the layout described in the class docstring.
+        """
+        if self._sys is None:
+            return np.zeros(self.observation_space.shape[0], dtype=np.float32)
+
+        # before the first step no prices were set, so reference prices stand in
+        prices = self._last_prices if self._last_prices is not None else self._sys.refs
+        price_ratio = prices / (self._sys.refs + 1e-6)
+        # share of total demand per product; the epsilon keeps the all-zero case finite
+        demand_norm = self._demand_agg / (np.sum(self._demand_agg) + 1e-6)
+        margins = (prices - self._sys.costs) / (self._sys.costs + 1e-6)
+        t_norm = self._t / self.cfg.max_steps
+
+        obs = np.concatenate([
+            price_ratio,                          # [0:n]
+            demand_norm,                          # [n:2n]
+            [self._sys.alpha],                    # [2n] estimated α̂
+            [self._alpha],                        # [2n+1] true α
+            margins,                              # [2n+2:3n+2]
+            [t_norm],                             # [3n+2]
+        ])
+        return obs.astype(np.float32)
+
+    def _compute_reward(self, prices: np.ndarray, demand: Dict[str, float]) -> float:
+        """Compute reward based on configured mode.
+
+        Side effect: stores the per-product aggregated demand in
+        ``self._demand_agg`` (read later by step(), render() and _build_obs()).
+
+        Args:
+            prices: prices charged this step, one per product.
+            demand: mapping of session id to demanded quantity.
+
+        Returns:
+            The scalar reward; 0.0 when the simulator has not been created.
+        """
+        cfg, sys = self.cfg, self._sys
+        if sys is None:
+            return 0.0
+
+        # Index sessions by id once instead of scanning the session list for
+        # every demand entry. setdefault keeps the FIRST session seen for a
+        # repeated sid, which is what a front-to-back search would return.
+        first_by_sid: Dict[Any, Any] = {}
+        for candidate in sys._sessions:
+            first_by_sid.setdefault(candidate.sid, candidate)
+
+        # aggregate demand per product; a session is attributed to the product
+        # of its first event, sessions without events are ignored
+        agg = np.zeros(self.n)
+        for sid, q in demand.items():
+            sess = first_by_sid.get(sid)
+            if sess and sess.events:
+                agg[sess.events[0].product_idx] += q
+        self._demand_agg = agg
+
+        revenue = float(np.dot(prices, agg))
+        # NOTE(review): demand is clipped to [0, 1] before costing, so cost
+        # saturates at one unit per product while revenue does not — confirm
+        # this is the intended simplification.
+        cost = float(np.dot(sys.costs, np.clip(agg, 0, 1)))  # simplified cost model
+        profit = revenue - cost
+
+        # volatility penalty (price changes relative to reference prices)
+        vol_penalty = 0.0
+        if self._last_prices is not None:
+            price_change = np.abs(prices - self._last_prices) / (sys.refs + 1e-6)
+            vol_penalty = cfg.lambda_vol * float(np.mean(price_change))
+
+        # COI leakage penalty
+        avg_margin = float(np.mean(prices - sys.costs))
+        coi_leak = sys.alpha * avg_margin
+
+        if cfg.reward_mode == "revenue":
+            r = revenue
+        elif cfg.reward_mode == "profit":
+            r = profit
+        elif cfg.reward_mode == "robust":
+            # robust objective: profit - λ_coi * COI_leak - λ_vol * volatility
+            r = profit - cfg.lambda_coi * coi_leak - vol_penalty
+        elif cfg.reward_mode == "coi_aware":
+            # adaptive: heavier penalty at high contamination
+            adaptive_lambda = cfg.lambda_coi * (1 + 2 * sys.alpha)
+            r = profit - adaptive_lambda * coi_leak - vol_penalty
+        else:
+            # unrecognised modes fall back to plain profit
+            r = profit
+
+        if cfg.normalize_reward:
+            r = r / (float(np.sum(sys.refs)) + 1e-6)  # normalize by potential revenue
+
+        return float(r)
+
+    def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
+        """Reset environment to initial state.
+
+        Args:
+            seed: seed for the freshly built System; ``cfg.seed`` is used when
+                None, so resets without an explicit seed reuse the same seed.
+            options: accepted for interface compatibility; not read here.
+
+        Returns:
+            (observation, info) where info carries the true and estimated α and
+            copies of the cost and reference-price vectors.
+        """
+        seed = seed if seed is not None else self.cfg.seed
+        self._sys = System(n_products=self.n, lambda_coi=self.cfg.lambda_coi, seed=seed)
+        self._t = 0
+        self._alpha = self.cfg.alpha_true
+        self._last_prices = None
+        self._last_demand = None
+        self._episode_rewards = []
+        self._demand_agg = np.zeros(self.n)
+
+        info = {"alpha_true": self._alpha, "alpha_est": self._sys.alpha,
+                "costs": self._sys.costs.copy(), "refs": self._sys.refs.copy()}
+        return self._build_obs(), info
+
+    def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]:
+        """Execute one environment step.
+
+        Args:
+            action: price multipliers in [0.5, 1.5]
+
+        Returns:
+            obs, reward, terminated, truncated, info
+
+        Raises:
+            RuntimeError: if reset() has not been called yet.
+        """
+        if self._sys is None:
+            raise RuntimeError("call reset() first")
+
+        # convert action to prices; out-of-range multipliers are clipped, and
+        # prices are kept between 1.01 * cost and 2 * reference price
+        action = np.clip(action, 0.5, 1.5)
+        prices = self._sys.refs * action.astype(np.float64)
+        prices = np.clip(prices, self._sys.costs * 1.01, self._sys.refs * 2.0)
+
+        # drift contamination: Gaussian perturbation scaled by alpha_drift
+        if self.cfg.alpha_drift != 0:
+            self._alpha = np.clip(
+                self._alpha + self.cfg.alpha_drift * self._sys.rng.normal(),
+                *self.cfg.alpha_bounds)
+
+        # observe demand
+        demand = self._sys.observe_demand(prices, alpha_true=self._alpha, n_sessions=self.cfg.sessions_per_step)
+        self._sys.limbo.add_update("prices", prices)
+
+        # update α estimate
+        self._sys._alpha_est = self._sys._estimate_alpha_from_sessions()
+
+        # must run before `info` is built: it refreshes self._demand_agg
+        reward = self._compute_reward(prices, demand)
+        self._episode_rewards.append(reward)
+
+        self._last_prices = prices.copy()
+        self._last_demand = demand
+        self._t += 1
+
+        # the step budget is reported through `terminated`; `truncated` is never set
+        terminated = self._t >= self.cfg.max_steps
+        truncated = False
+
+        info = {
+            "alpha_true": self._alpha,
+            "alpha_est": self._sys.alpha,
+            "revenue": float(np.dot(prices, self._demand_agg)),
+            "avg_margin": float(np.mean((prices - self._sys.costs) / self._sys.costs)),
+            "n_sessions": len(demand),
+            "coi_erosion": coi_erosion(int(self._alpha * self.cfg.sessions_per_step), float(np.std(prices))),
+        }
+
+        return self._build_obs(), reward, terminated, truncated, info
+
+    def render(self, mode: str = "human") -> str | None:
+        """Render environment state.
+
+        Returns the one-line summary (also printed when ``mode`` is "human"),
+        or None when no step has been taken yet.
+        """
+        if self._sys is None or self._last_prices is None:
+            return None
+
+        lines = [
+            f"t={self._t}/{self.cfg.max_steps}",
+            f"α_true={self._alpha:.3f} α̂={self._sys.alpha:.3f}",
+            f"prices: {self._last_prices.round(1)}",
+            f"demand: {self._demand_agg.round(2)}",
+            f"reward: {self._episode_rewards[-1] if self._episode_rewards else 0:.3f}",
+        ]
+        out = " | ".join(lines)
+        if mode == "human":
+            print(out)
+        return out
+
+    def close(self) -> None:
+        """No resources to release."""
+        pass
+
+
+class ContaminationSweepEnv(PricingEnv):
+    """Environment that sweeps through contamination levels during training.
+
+    Useful for curriculum learning: start with low α, gradually increase.
+
+    Each reset() sets the true contamination to the current schedule entry.
+    Passing ``options={"advance_schedule": True}`` to reset() first moves to
+    the next entry, wrapping around at the end of the schedule.
+    """
+
+    def __init__(self, cfg: EnvConfig | None = None, alpha_schedule: list[float] | None = None):
+        """Create the sweep environment.
+
+        Args:
+            cfg: environment configuration (default EnvConfig when omitted).
+            alpha_schedule: contamination levels to cycle through; a falsy
+                value selects the built-in 0.1 … 0.5 schedule.
+        """
+        import copy  # function-scope import: only needed here
+
+        super().__init__(cfg)
+        # reset() overwrites cfg.alpha_true, so work on a private shallow copy;
+        # otherwise the schedule would leak into the caller's config object and
+        # into any other environment built from it
+        self.cfg = copy.copy(self.cfg)
+        self._schedule = alpha_schedule or [0.1, 0.2, 0.3, 0.4, 0.5]
+        self._schedule_idx = 0
+
+    def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
+        """Apply the current schedule entry as α, then reset the base env."""
+        # advance schedule on reset (only when explicitly requested)
+        if options and options.get("advance_schedule", False):
+            self._schedule_idx = (self._schedule_idx + 1) % len(self._schedule)
+        # the base reset() reads cfg.alpha_true to initialise the true α
+        self.cfg.alpha_true = self._schedule[self._schedule_idx]
+        return super().reset(seed, options)
+
+
+class AdversarialEnv(PricingEnv):
+    """Environment with adversarial contamination dynamics.
+
+    The contamination level responds to pricing policy: if prices are too predictable,
+    agents learn to exploit and α increases.
+    """
+
+    # number of most recent price vectors inspected for predictability
+    _HISTORY_WINDOW = 10
+
+    def __init__(self, cfg: EnvConfig | None = None, exploitation_rate: float = 0.02):
+        """Create the adversarial environment.
+
+        Args:
+            cfg: environment configuration (default EnvConfig when omitted).
+            exploitation_rate: scale of the per-step increase of α.
+        """
+        super().__init__(cfg)
+        self._exploit_rate = exploitation_rate
+        self._price_history: list[np.ndarray] = []
+
+    def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, dict]:
+        """Run the base step, then raise α according to price predictability.
+
+        The returned observation and ``info["alpha_true"]`` were produced by the
+        base step, i.e. before α is raised here. ``info["predictability"]`` is
+        0.0 until more than ``_HISTORY_WINDOW`` price vectors were recorded.
+        """
+        obs, reward, term, trunc, info = super().step(action)
+
+        # track price history for predictability
+        if self._last_prices is not None:
+            self._price_history.append(self._last_prices.copy())
+            # Only the last window is ever read; keep window + 1 entries so the
+            # `> window` gate below still behaves the same, and drop the rest
+            # instead of letting the list grow for the whole episode.
+            del self._price_history[:-(self._HISTORY_WINDOW + 1)]
+
+        # increase α if prices are predictable (low variance over recent history)
+        predictability = 0.0
+        if len(self._price_history) > self._HISTORY_WINDOW:
+            recent = np.array(self._price_history[-self._HISTORY_WINDOW:])
+            # NOTE(review): np.std over the whole (window, n) array also picks up
+            # price differences BETWEEN products, not only variation over time —
+            # confirm this is the intended predictability measure.
+            predictability = 1.0 / (float(np.std(recent)) + 0.1)
+            self._alpha = np.clip(
+                self._alpha + self._exploit_rate * predictability * self._sys.rng.random(),
+                *self.cfg.alpha_bounds)
+
+        info["predictability"] = predictability
+        return obs, reward, term, trunc, info
+
+    def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
+        """Clear the recorded price history, then reset the base env."""
+        self._price_history = []
+        return super().reset(seed, options)
+
+
+def make_env(cfg: EnvConfig | None = None, env_type: str = "standard") -> PricingEnv:
+    """Build a pricing environment of the requested flavour.
+
+    "sweep" gives a ContaminationSweepEnv, "adversarial" an AdversarialEnv;
+    every other value (including the default "standard") gives a PricingEnv.
+    """
+    env_cls = PricingEnv
+    if env_type == "sweep":
+        env_cls = ContaminationSweepEnv
+    elif env_type == "adversarial":
+        env_cls = AdversarialEnv
+    return env_cls(cfg)
+
+
+# simple baseline policies for benchmarking
+def fixed_price_policy(refs: np.ndarray, margin: float = 0.0) -> np.ndarray:
+    """Constant-markup baseline.
+
+    Returns one price multiplier ``1 + margin`` per entry of ``refs``; only the
+    length of ``refs`` is used, the reference prices themselves are not read.
+    """
+    multiplier = 1.0 + margin
+    unit_action = np.ones(len(refs), dtype=np.float32)
+    return unit_action * multiplier
+
+
+def random_policy(n: int, rng: np.random.Generator | None = None) -> np.ndarray:
+    """Exploration baseline: n multipliers drawn uniformly from [0.7, 1.3).
+
+    A fresh, unseeded generator is created when ``rng`` is not supplied.
+    """
+    generator = rng or np.random.default_rng()
+    draws = generator.uniform(0.7, 1.3, n)
+    return draws.astype(np.float32)
+
+
+def adaptive_policy(obs: np.ndarray, n: int, base_margin: float = 0.1) -> np.ndarray:
+    """Contamination-aware baseline: shrink the markup as α̂ grows.
+
+    Reads α̂ from slot ``2n`` of the observation and returns n identical
+    multipliers ``1 + base_margin * (1 - 0.4 * α̂)``.
+    """
+    contamination = obs[2 * n]  # α̂ sits right after the price and demand segments
+    damping = 1.0 - 0.4 * contamination  # more defensive the higher α̂ is
+    markup = 1.0 + base_margin * damping
+    return np.ones(n, dtype=np.float32) * markup
+
+
+if __name__ == "__main__":
+    # smoke-test episode: drive a standard env with the adaptive baseline and
+    # print a status line every tenth step
+    demo_cfg = EnvConfig(n_products=100, max_steps=100, alpha_true=0.25, reward_mode="robust")
+    demo_env = make_env(demo_cfg)
+    obs, info = demo_env.reset()
+    print(f"initial: α={info['alpha_true']:.2f}")
+
+    total_reward = 0.0
+    for step_idx in range(demo_cfg.max_steps):
+        multipliers = adaptive_policy(obs, demo_cfg.n_products)
+        obs, step_reward, finished, _, info = demo_env.step(multipliers)
+        total_reward += step_reward
+        if step_idx % 10 == 0:
+            demo_env.render()
+        if finished:
+            break
+
+    print(f"\ntotal reward: {total_reward:.2f}, final α̂: {info['alpha_est']:.3f}")