adding naive jax and libraries and make adjustments

2026-07-16 01:53:37 +00:00 · 2026-02-17 14:48:18 +01:00
parent 66c4a0cd1d
commit 802f31b4a1
17 changed files with 2331 additions and 6 deletions
--- a/engine/lib/callbacks.py
+++ b/engine/lib/callbacks.py
@@ -0,0 +1,119 @@
+"""Training callbacks for W&B/TensorBoard logging - reads from info dict."""
+
+from stable_baselines3.common.callbacks import BaseCallback, EvalCallback
+import numpy as np
+
+try:
+    import wandb
+
+    HAS_WANDB = True
+except ImportError:
+    HAS_WANDB = False
+
+
+class MetricsCallback(BaseCallback):
+    """Forward per-step economics found in the env ``info`` dicts to W&B.
+
+    On every step each entry of ``self.locals["infos"]`` that contains an
+    ``"economics"`` dict is logged, and its revenue is buffered. The buffer is
+    summarised (mean / sum) and cleared when the rollout ends. Nothing is
+    logged when wandb is not importable or no run is active.
+    """
+
+    # (key in info["economics"], W&B metric name) for the optional COI terms
+    _OPTIONAL_COI_METRICS = (
+        ("coi_mix", "coi/mix"),
+        ("coi_base", "coi/base"),
+        ("coi_leakage", "coi/leakage"),
+        ("coi_penalty", "coi/penalty"),
+    )
+
+    # (key in info, W&B metric name) for the histogram panels
+    _HISTOGRAM_METRICS = (
+        ("prices", "distributions/prices"),
+        ("demand", "distributions/demand"),
+    )
+
+    def __init__(
+        self, log_histograms: bool = True, log_freq: int = 100, verbose: int = 0
+    ):
+        """Store logging options.
+
+        args:
+            log_histograms: whether price/demand histograms are logged at all
+            log_freq: histograms are logged when ``num_timesteps`` is a multiple of this
+            verbose: forwarded to ``BaseCallback``
+        """
+        super().__init__(verbose)
+        self.log_histograms = log_histograms
+        self.log_freq = log_freq
+        self._episode_revenues: list[float] = []
+
+    @staticmethod
+    def _wandb_ready() -> bool:
+        # short-circuit keeps us from touching `wandb` when the import failed
+        return HAS_WANDB and wandb.run is not None
+
+    def _log_economics(self, econ: dict, step: int) -> None:
+        # one W&B record per info dict: required metrics first, optional ones after
+        record = {
+            "economics/revenue": econ["revenue"],
+            "economics/margin": econ["margin"],
+            "coi/level": econ["coi_level"],
+            "economics/regret": econ["regret"],
+        }
+        for source_key, metric_name in self._OPTIONAL_COI_METRICS:
+            if source_key in econ:
+                record[metric_name] = econ[source_key]
+        wandb.log(record, step=step)
+        self._episode_revenues.append(econ["revenue"])
+
+    def _on_step(self) -> bool:
+        if not self._wandb_ready():
+            return True
+
+        step = self.num_timesteps
+        infos = self.locals.get("infos", [])
+
+        for info in infos:
+            if "economics" in info:
+                self._log_economics(info["economics"], step)
+
+        # histograms only when enabled and the step count hits a log_freq multiple
+        if self.log_histograms and step % self.log_freq == 0:
+            for info in infos:
+                for source_key, metric_name in self._HISTOGRAM_METRICS:
+                    if source_key in info:
+                        wandb.log(
+                            {metric_name: wandb.Histogram(info[source_key])},
+                            step=step,
+                        )
+
+        return True
+
+    def _on_rollout_end(self) -> None:
+        if not self._wandb_ready():
+            return
+        if not self._episode_revenues:
+            return
+
+        buffered = self._episode_revenues
+        summary = {
+            "episode/mean_revenue": np.mean(buffered),
+            "episode/total_revenue": np.sum(buffered),
+        }
+        wandb.log(summary, step=self.num_timesteps)
+        self._episode_revenues = []
+
+
+class EvalMetricsCallback(EvalCallback):
+    """Deterministic evaluation - true performance without exploration noise.
+
+    Extends SB3's ``EvalCallback``: after each evaluation round the mean reward
+    and the mean of the revenues seen during that round are logged to W&B
+    (when a run is active).
+    """
+
+    def __init__(
+        self, eval_env, eval_freq: int = 1000, n_eval_episodes: int = 5, **kwargs
+    ):
+        """Forward all arguments to ``EvalCallback`` and set up the revenue buffer."""
+        super().__init__(
+            eval_env, eval_freq=eval_freq, n_eval_episodes=n_eval_episodes, **kwargs
+        )
+        # revenues collected by _log_success_callback during the current eval round
+        self._eval_revenues: list[float] = []
+
+    def _on_step(self) -> bool:
+        # the parent runs the evaluation (and thereby fills _eval_revenues)
+        result = super()._on_step()
+
+        # same trigger condition as the parent; eval_freq <= 0 disables evaluation
+        # and must not reach the modulo (ZeroDivisionError for eval_freq == 0)
+        evaluated = self.eval_freq > 0 and self.n_calls % self.eval_freq == 0
+        if not evaluated:
+            return result
+
+        wandb_ready = HAS_WANDB and wandb.run is not None
+        if wandb_ready and hasattr(self, "last_mean_reward"):
+            wandb.log(
+                {
+                    "eval/mean_reward": self.last_mean_reward,
+                    "eval/mean_revenue": np.mean(self._eval_revenues)
+                    if self._eval_revenues
+                    else 0,
+                },
+                step=self.num_timesteps,
+            )
+
+        # always drop the buffer after an eval round, even without wandb,
+        # so it cannot grow without bound or leak into the next round
+        self._eval_revenues = []
+
+        return result
+
+    def _log_success_callback(self, locals_: dict, globals_: dict) -> None:
+        # evaluate_policy invokes this hook after every evaluation *step*, so the
+        # buffered values are per-step revenues, not per-episode totals.
+        # keep the parent's bookkeeping (is_success buffer -> eval/success_rate)
+        super()._log_success_callback(locals_, globals_)
+
+        info = locals_.get("info", {})
+        if "economics" in info:
+            self._eval_revenues.append(info["economics"]["revenue"])
--- a/engine/lib/coi.py
+++ b/engine/lib/coi.py
@@ -0,0 +1,76 @@
+import numpy as np
+from typing import Dict
+
+
+def compute_agent_probability(
+    trajectory: list, human_transitions: Dict, agent_transitions: Dict
+) -> float:
+    """estimate agent probability via KL divergence between trajectory transitions and reference models
+
+    compares empirical trajectory transition distribution to human/agent prototypes
+
+    args:
+        trajectory: list of state/event strings from session
+        human_transitions: reference transition dict from human MDP (event->event->prob)
+        agent_transitions: reference transition dict from agent MDP (event->event->prob)
+
+    returns:
+        agent probability in [0, 1] via softmax over KL divergences;
+        0.0 when the trajectory has fewer than two events,
+        0.5 when both references fit equally well
+    """
+    if len(trajectory) < 2:
+        return 0.0  # insufficient data, assume human
+
+    # build empirical transition counts from consecutive event pairs
+    trans_counts: Dict = {}
+    for s, s_next in zip(trajectory[:-1], trajectory[1:]):
+        row = trans_counts.setdefault(s, {})
+        row[s_next] = row.get(s_next, 0) + 1
+
+    # normalize each source state's counts to probabilities
+    empirical = {}
+    for s, nxt in trans_counts.items():
+        total = sum(nxt.values())
+        empirical[s] = {s_n: cnt / total for s_n, cnt in nxt.items()}
+
+    # compute KL divergence to each prototype
+    def kl_div(p_dist: Dict, q_dist: Dict) -> float:
+        eps = 1e-10  # smoothing so transitions missing from the reference stay finite
+        # aggregate over all source states in empirical dist
+        kl = 0.0
+        for s in p_dist:
+            if s not in q_dist:
+                # NOTE(review): source states unknown to the reference add no
+                # penalty, so a reference covering fewer states is not punished
+                continue  # skip states not in reference
+            p_trans, q_trans = p_dist[s], q_dist[s]
+            for k in p_trans:
+                p_val = p_trans[k] + eps
+                q_val = q_trans.get(k, 0.0) + eps
+                kl += p_val * np.log(p_val / q_val)
+        return kl
+
+    kl_human = kl_div(empirical, human_transitions)
+    kl_agent = kl_div(empirical, agent_transitions)
+
+    # convert to probability via softmax (lower KL = higher prob):
+    #   agent_prob = exp(-kl_agent) / (exp(-kl_human) + exp(-kl_agent))
+    #              = 1 / (1 + exp(kl_agent - kl_human))
+    # evaluated on the KL *difference* so large divergences neither underflow
+    # to 0/0 nor get distorted by an additive epsilon in the denominator
+    # (a single unseen transition already costs ~23 nats with eps=1e-10)
+    gap = float(kl_agent - kl_human)
+    if gap >= 0.0:
+        decay = float(np.exp(-gap))  # in (0, 1], cannot overflow
+        return decay / (1.0 + decay)
+    return 1.0 / (1.0 + float(np.exp(gap)))  # exp(gap) in (0, 1)
+
+
+def extract_purchases(trajectories: list) -> Dict[int, int]:
+    """count purchases per product id from session trajectories
+
+    a trajectory counts as a purchase when its final event contains both
+    "checkout" and "_product"; the product id is the integer that follows
+    the last "_product" in that event
+
+    args:
+        trajectories: list of event-string lists, empty ones are ignored
+
+    returns:
+        dict mapping product id -> number of trajectories ending in its checkout
+    """
+    counts: Dict[int, int] = {}
+    for events in trajectories:
+        if not events:
+            continue
+        final_event = events[-1]
+        if "checkout" not in final_event or "_product" not in final_event:
+            continue
+        # text after the last "_product" marker is the numeric id
+        _, _, id_text = final_event.rpartition("_product")
+        product_id = int(id_text)
+        counts[product_id] = counts.get(product_id, 0) + 1
+    return counts
+
+
+def compute_uplift_coi(
+    prices: np.ndarray, purchases: Dict[int, int], baseline_prices: np.ndarray
+) -> float:
+    """sum the positive price uplift over baseline, weighted by purchase count
+
+    args:
+        prices: current prices, indexed by product id
+        purchases: product id -> purchase count
+        baseline_prices: reference prices, indexed by product id
+
+    returns:
+        sum over purchased products of max(0, price - baseline) * count
+    """
+    # TODO: consider view-weighted fractional purchase for denser signal
+    uplift_total = 0
+    for product_id, count in purchases.items():
+        uplift = prices[product_id] - baseline_prices[product_id]
+        # discounts below baseline contribute nothing
+        uplift_total += max(0.0, uplift) * count
+    return float(uplift_total)
--- a/engine/lib/discrete.py
+++ b/engine/lib/discrete.py
@@ -0,0 +1,70 @@
+from collections import defaultdict
+import gymnasium as gym
+from gymnasium import spaces
+import numpy as np
+
+
+class DiscretePriceActionWrapper(gym.ActionWrapper):
+    """Expose a discrete action space that rescales the env's current prices.
+
+    Action ``i`` selects the i-th multiplier from an evenly spaced grid between
+    ``min_scale`` and ``max_scale``; the wrapped env receives the current price
+    vector times that multiplier, clipped to the env's price bounds.
+    """
+
+    def __init__(
+        self,
+        env: gym.Env,
+        n_levels: int = 9,
+        min_scale: float = 0.8,
+        max_scale: float = 1.2,
+    ):
+        super().__init__(env)
+        # multiplier grid, one entry per discrete action
+        self.scales = np.linspace(min_scale, max_scale, n_levels, dtype=np.float32)
+        self.action_space = spaces.Discrete(n_levels)
+
+    def action(self, action: int):
+        """Translate a discrete action index into a clipped float32 price vector."""
+        multiplier = float(self.scales[int(action)])
+        base_env = self.env.unwrapped
+        current_prices = np.asarray(base_env._prices, dtype=np.float32)
+        lower, upper = base_env.price_bounds
+        proposed = current_prices * multiplier
+        return np.clip(proposed, lower, upper).astype(np.float32)
+
+
+class EventQTable:
+    """Tabular Q-learning over a coarse discretisation of the observation.
+
+    The observation is read as ``n_products`` demand values followed by
+    ``n_products`` prices. A state is the tuple of bin indices of
+    (mean demand, demand std, mean price); unseen states start with all-zero
+    Q-values.
+    """
+
+    def __init__(
+        self,
+        n_actions: int,
+        n_products: int,
+        price_bounds: tuple,
+        lr: float = 0.1,
+        gamma: float = 0.99,
+        n_bins: int = 6,
+    ):
+        self.n_actions = int(n_actions)
+        self.n_products = int(n_products)
+        self.lr = float(lr)
+        self.gamma = float(gamma)
+        # state tuple -> float32 vector of action values, created lazily
+        self.q = defaultdict(lambda: np.zeros(self.n_actions, dtype=np.float32))
+        price_lo, price_hi = price_bounds
+        # interior bin edges only: the outer edges are dropped so values outside
+        # the range fall into the first/last bin
+        demand_edges = np.linspace(0.0, 100.0, n_bins + 1)
+        price_edges = np.linspace(price_lo, price_hi, n_bins + 1)
+        self.demand_bins = demand_edges[1:-1]
+        self.price_bins = price_edges[1:-1]
+
+    def encode(self, obs: np.ndarray) -> tuple:
+        """Map an observation to its (demand-mean, demand-std, price-mean) bin tuple."""
+        vec = np.asarray(obs, dtype=np.float32)
+        k = self.n_products
+        demand = vec[:k]
+        price = vec[k : 2 * k]
+
+        # empty slices fall back to 0.0 instead of producing NaN
+        if demand.size:
+            demand_mean = float(np.mean(demand))
+            demand_std = float(np.std(demand))
+        else:
+            demand_mean = 0.0
+            demand_std = 0.0
+        price_mean = float(np.mean(price)) if price.size else 0.0
+
+        demand_mean_bin = int(np.digitize(demand_mean, self.demand_bins))
+        demand_std_bin = int(np.digitize(demand_std, self.demand_bins))
+        price_mean_bin = int(np.digitize(price_mean, self.price_bins))
+        return (demand_mean_bin, demand_std_bin, price_mean_bin)
+
+    def act(self, obs: np.ndarray, eps: float = 0.0) -> tuple[int, tuple]:
+        """Epsilon-greedy action; returns ``(action, encoded_state)``."""
+        state = self.encode(obs)
+        explore = np.random.random() < eps
+        if explore:
+            chosen = int(np.random.randint(self.n_actions))
+        else:
+            chosen = int(np.argmax(self.q[state]))
+        return chosen, state
+
+    def update(self, s: tuple, a: int, r: float, s2: tuple, done: bool):
+        """One Q-learning step for the transition (s, a, r, s2)."""
+        # no bootstrap from the next state once the episode is over
+        bootstrap = 0.0 if done else self.gamma * float(np.max(self.q[s2]))
+        target = r + bootstrap
+        values = self.q[s]
+        values[a] += self.lr * (target - values[a])
+
+    def predict(self, obs: np.ndarray, deterministic: bool = True):
+        """SB3-style ``predict``: returns ``(action, None)``; eps is 0.05 when not deterministic."""
+        eps = 0.0 if deterministic else 0.05
+        chosen, _ = self.act(obs, eps)
+        return chosen, None
--- a/engine/lib/providers.py
+++ b/engine/lib/providers.py
@@ -0,0 +1,182 @@
+"""Provider benchmarking - compare pricing strategies across contamination levels."""
+
+from dataclasses import dataclass, field
+from typing import Callable, Any
+import numpy as np
+import pandas as pd
+
+try:
+    import wandb
+
+    HAS_WANDB = True
+except ImportError:
+    HAS_WANDB = False
+
+
+class RandomBaseline:
+    """lower-bound baseline: picks an action index uniformly at random, ignoring the observation"""
+
+    def __init__(self, n_actions: int):
+        self.n = n_actions  # number of discrete actions to sample from
+
+    def __call__(self, obs):
+        # draws from numpy's global RNG; obs is intentionally unused
+        sampled = np.random.randint(self.n)
+        return int(sampled)
+
+    def predict(self, obs, **kw):
+        # SB3-style interface: (action, state) with no recurrent state
+        action = self(obs)
+        return action, None
+
+
+class SurgeBaseline:
+    """heuristic surge pricing: boost price when demand is above threshold, discount when below.
+    matches the naive pricing rule from thesis Section 3.3.2
+
+    the first half of the observation is read as per-product demand; its mean
+    is compared against the two thresholds"""
+
+    def __init__(
+        self, n_actions: int, high_threshold: float = 60.0, low_threshold: float = 30.0
+    ):
+        self.n = n_actions
+        self.mid = n_actions // 2  # identity action (scale ~1.0)
+        self.high_t = high_threshold
+        self.low_t = low_threshold
+
+    def __call__(self, obs):
+        values = np.asarray(obs, dtype=np.float32)
+        half = len(values) // 2
+        if half > 0:
+            mean_demand = float(np.mean(values[:half]))
+        else:
+            mean_demand = 0.0  # nothing to average -> treated as zero demand
+
+        # the surge check runs first, so it wins if both thresholds match
+        if mean_demand >= self.high_t:
+            surge_action = self.mid + 2  # two levels above identity
+            return min(surge_action, self.n - 1)
+        if mean_demand <= self.low_t:
+            discount_action = self.mid - 2  # two levels below identity
+            return max(discount_action, 0)
+        return self.mid  # hold
+
+    def predict(self, obs, **kw):
+        # SB3-style interface: (action, state) with no recurrent state
+        action = self(obs)
+        return action, None
+
+
+@dataclass
+class ProviderResult:
+    """Single benchmark result for one provider at one alpha level."""
+
+    # key of the provider in the benchmark's providers dict
+    name: str
+    # contamination level the env was built with for this run
+    alpha: float
+    # sum of the per-episode revenues
+    total_revenue: float
+    # mean of the per-episode revenues
+    mean_revenue: float
+    # mean of the per-step "coi_level" values over all episodes
+    coi_level: float
+    # coi_level relative to the same provider's value at alpha=0, in percent
+    coi_preserved_pct: float  # vs alpha=0 baseline
+    # mean of the per-step "margin" values over all episodes
+    margin_integrity: float
+    # NOTE(review): the benchmark currently always stores 0.0 here
+    regret: float
+    # number of episodes that were run
+    episodes: int
+
+
+@dataclass
+class BenchmarkConfig:
+    """Configuration for provider benchmark runs."""
+
+    # episodes run per (provider, alpha) combination
+    n_episodes: int = 100
+    # contamination levels to sweep; default_factory avoids a shared mutable default
+    alpha_range: list[float] = field(default_factory=lambda: [0.0, 0.1, 0.3, 0.5])
+    # NOTE(review): not read anywhere in the visible benchmark code - confirm intended use
+    baseline_name: str = "fixed"
+
+
+class ProviderBenchmark:
+    """Compare pricing providers to prove margin preservation across contamination levels.
+
+    Usage:
+        def env_factory(alpha):
+            return EconomicMetricsWrapper(PHANTOM(alpha=alpha))
+
+        providers = {
+            "fixed": lambda obs: np.ones(10) * 50,
+            "learned": model.predict,
+        }
+
+        benchmark = ProviderBenchmark(env_factory, providers)
+        results = benchmark.run()
+        print(benchmark.summary_table())
+
+    Notes:
+        - ``coi_preserved_pct`` is relative to the provider's own result at
+          ``alpha == 0.0``. Alphas processed before 0.0 (or a range without 0.0)
+          have no baseline yet and report 100%.
+        - ``run()`` appends to ``self.results``; calling it twice keeps both sweeps.
+        - One env is built per alpha and shared by all providers; it is not
+          closed here because the factory may hand out a shared instance.
+    """
+
+    def __init__(
+        self,
+        env_factory: Callable[[float], Any],
+        providers: dict[str, Callable],
+        config: BenchmarkConfig | None = None,
+    ):
+        self.env_factory = env_factory  # fn(alpha) -> wrapped env
+        self.providers = providers  # {name: fn(obs) -> action}
+        self.config = config or BenchmarkConfig()
+        self.results: list[ProviderResult] = []
+
+    def _rollout(
+        self, env: Any, policy_fn: Callable
+    ) -> tuple[list[float], list[float], list[float]]:
+        """Run n_episodes with one policy; return (episode revenues, step COI levels, step margins)."""
+        revenues: list[float] = []
+        coi_levels: list[float] = []
+        margins: list[float] = []
+
+        for _ in range(self.config.n_episodes):
+            obs, _ = env.reset()
+            episode_revenue = 0.0
+            done = False
+
+            while not done:
+                action = policy_fn(obs)
+                # handle sb3 model.predict returning tuple
+                if isinstance(action, tuple):
+                    action = action[0]
+                obs, reward, term, trunc, info = env.step(action)
+                done = term or trunc
+
+                # steps without an "economics" entry count as zeros
+                econ = info.get("economics", {})
+                episode_revenue += econ.get("revenue", 0)
+                coi_levels.append(econ.get("coi_level", 0))
+                margins.append(econ.get("margin", 0))
+
+            revenues.append(episode_revenue)
+
+        return revenues, coi_levels, margins
+
+    def run(self) -> list[ProviderResult]:
+        """Run benchmark across all providers and alpha levels.
+
+        Returns:
+            ``self.results`` with one ``ProviderResult`` appended per
+            (alpha, provider) pair, in sweep order.
+        """
+        baseline_coi: dict[str, float] = {}  # {provider: coi at alpha=0}
+
+        for alpha in self.config.alpha_range:
+            env = self.env_factory(alpha)
+
+            for name, policy_fn in self.providers.items():
+                revenues, coi_levels, margins = self._rollout(env, policy_fn)
+
+                # plain floats keep the result serialisable and match the
+                # ProviderResult annotations (np.mean returns a numpy scalar)
+                mean_coi = float(np.mean(coi_levels)) if coi_levels else 0.0
+                if alpha == 0.0:
+                    baseline_coi[name] = mean_coi
+
+                base = baseline_coi.get(name, mean_coi)
+                coi_preserved = mean_coi / base if base > 0 else 1.0
+
+                result = ProviderResult(
+                    name=name,
+                    alpha=alpha,
+                    total_revenue=float(np.sum(revenues)),
+                    # guard: np.mean([]) is NaN (with a warning) when n_episodes == 0
+                    mean_revenue=float(np.mean(revenues)) if revenues else 0.0,
+                    coi_level=mean_coi,
+                    coi_preserved_pct=coi_preserved * 100,
+                    margin_integrity=float(np.mean(margins)) if margins else 0.0,
+                    regret=0.0,  # compute vs optimal if known
+                    episodes=self.config.n_episodes,
+                )
+                self.results.append(result)
+
+                # log to wandb if available
+                if HAS_WANDB and wandb.run is not None:
+                    wandb.log(
+                        {
+                            f"benchmark/{name}/revenue": result.mean_revenue,
+                            f"benchmark/{name}/coi_preserved": result.coi_preserved_pct,
+                            f"benchmark/{name}/margin": result.margin_integrity,
+                            "benchmark/alpha": alpha,
+                        }
+                    )
+
+        return self.results
+
+    def to_dataframe(self) -> pd.DataFrame:
+        """Convert results to pandas DataFrame (one row per ProviderResult)."""
+        return pd.DataFrame([vars(r) for r in self.results])
+
+    def summary_table(self) -> pd.DataFrame:
+        """Pivot table: providers x alpha with revenue/COI metrics.
+
+        Returns an empty DataFrame when no results have been collected yet,
+        instead of failing on the missing pivot columns.
+        """
+        if not self.results:
+            return pd.DataFrame()
+        df = self.to_dataframe()
+        return df.pivot_table(
+            index="name",
+            columns="alpha",
+            values=["mean_revenue", "coi_preserved_pct", "margin_integrity"],
+            aggfunc="mean",
+        )
--- a/engine/lib/wrappers.py
+++ b/engine/lib/wrappers.py
@@ -0,0 +1,77 @@
+"""Economic metrics wrapper - calculates thesis-aligned KPIs and injects into info dict."""
+
+import gymnasium as gym
+import numpy as np
+
+
+class EconomicMetricsWrapper(gym.Wrapper):
+    """Calculates thesis-aligned economic metrics per step, injects into info.
+
+    Metrics follow thesis definitions:
+    - COI level: E[P] - p_min (Definition 1)
+    - Margin: (avg_price - p_min) / avg_price
+    - Regret: 1 - (revenue / baseline_revenue)
+
+    Reads ``_prices`` and ``_demand`` from the unwrapped env after every step
+    and writes ``info["economics"]``, ``info["prices"]`` and ``info["demand"]``.
+    """
+
+    def __init__(
+        self, env: gym.Env, p_min: float = 10.0, baseline_revenue: float | None = None
+    ):
+        """Wrap ``env``.
+
+        Args:
+            env: environment to wrap
+            p_min: price floor used for the COI level and margin
+            baseline_revenue: per-step reference revenue for regret; regret
+                stays 0.0 when this is None or not positive
+        """
+        super().__init__(env)
+        self.p_min = p_min
+        self.baseline_revenue = baseline_revenue
+        self._price_history: list[np.ndarray] = []
+        self._revenue_history: list[float] = []
+
+    def reset(self, **kwargs):
+        """Reset the wrapped env and clear the per-episode histories."""
+        obs, info = self.env.reset(**kwargs)
+        self._price_history = []
+        self._revenue_history = []
+        return obs, info
+
+    def step(self, action):
+        """Step the wrapped env and attach the economic metrics to ``info``."""
+        obs, reward, terminated, truncated, info = self.env.step(action)
+
+        # extract from unwrapped env; coerce to an array so arithmetic and
+        # .copy() behave the same whether the env stores a list or an ndarray
+        base_env = self.env.unwrapped
+        prices = np.asarray(base_env._prices)
+        demand_dict = base_env._demand
+        # products missing from the demand mapping count as zero demand
+        demand = np.array([demand_dict.get(i, 0.0) for i in range(len(prices))])
+
+        # core calculations
+        revenue = float(np.sum(prices * demand))
+        avg_price = float(np.mean(prices))
+        # denominator floor avoids division by zero for an all-zero price vector
+        margin = (avg_price - self.p_min) / max(avg_price, 1e-6)
+        coi_level = avg_price - self.p_min  # E[P] - p_min per thesis Def 1
+
+        self._price_history.append(prices.copy())
+        self._revenue_history.append(revenue)
+
+        # regret vs baseline (golden path)
+        regret = 0.0
+        if self.baseline_revenue and self.baseline_revenue > 0:
+            regret = 1.0 - (revenue / self.baseline_revenue)
+
+        # inject structured metrics into info
+        info["economics"] = {
+            "revenue": revenue,
+            "margin": margin,
+            "coi_level": coi_level,
+            "regret": regret,
+        }
+        # pass through COI components the env already reported at top level
+        for key in ("coi_mix", "coi_base", "coi_leakage", "coi_penalty"):
+            if key in info:
+                info["economics"][key] = info[key]
+        info["prices"] = prices.copy()
+        info["demand"] = demand.copy()
+
+        return obs, reward, terminated, truncated, info
+
+    @property
+    def episode_revenue(self) -> float:
+        """Total revenue accumulated since the last reset."""
+        return sum(self._revenue_history)
+
+    @property
+    def episode_mean_price(self) -> float:
+        """Mean over steps of the per-step mean price; 0.0 before the first step."""
+        if not self._price_history:
+            return 0.0
+        return float(np.mean([np.mean(p) for p in self._price_history]))