# PHANTOM/lab/case/thesis/objectives.py

"""
Thesis-specific objectives implementing robust pricing under contamination.

Implements the Maximin objective from Eq 23:
π* = argmax_π min_{Q ∈ U_ε} E_d~Q[R(p,d) - λ·COI(p)]

Key components:
- COIObjective: Cost of Information penalty (Definition 1)
- RobustStackelbergObjective: Full maximin objective with Wasserstein robustness
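- UXPenalty: User experience degradation from volatility
- AdaptiveObjective: Profit objective whose COI weight adapts to contamination
- make_thesis_objective: Factory for the standard thesis objective composition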
"""
from __future__ import annotations
from dataclasses import dataclass
import numpy as np
from ...outlet.objectives.base import BaseObjective, CompositeObjective
from ...outlet.types import Quote, InstrumentSet, StepMetrics, HiddenState, Observation
from ...outlet.math_util import safe_log, EPS

class COIObjective(BaseObjective):
    """Cost-of-Information (COI) leakage penalty, after thesis Definition 1.

    Definition 1 gives COI(π) = E[P] - p_min: the expected price premium
    over marginal cost, i.e. the platform's pricing power, which agent
    reconnaissance erodes by revealing the price distribution to buyers.

    What this class actually scores per step is a leakage surrogate:

        leakage = hidden.contamination * info_value
        reward  = -lambda_coi * leakage

    where ``hidden.contamination`` stands in for the estimated agent
    probability f(τ') and ``info_value`` depends on ``use_revelation``
    (see ``_info_value``).
    """

    def __init__(self, lambda_coi: float = 1.0, use_revelation: bool = False):
        """
        Args:
            lambda_coi: Weight on COI penalty
            use_revelation: If True, the info value is the mean relative
                deviation of quoted prices from reference prices (a
                surrogate for the surprise -log(π(p))). If False, the info
                value is the constant 1.0.
        """
        self.use_revelation = use_revelation
        self.lambda_coi = lambda_coi

    def _info_value(self, quote: Quote, instruments: InstrumentSet):
        """Return the per-step information value used by ``reward``."""
        if not self.use_revelation:
            # query-tax surrogate: each agent query incurs constant leakage
            return 1.0
        # revelation surrogate: prices far from the reference count as
        # more revealing; EPS keeps the division finite for zero refs
        relative_gap = np.abs(quote.prices - instruments.refs) / (instruments.refs + EPS)
        return np.mean(relative_gap)

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return the (non-positive for non-negative inputs) COI reward.

        Computes ``-lambda_coi * (hidden.contamination * info_value)``.
        ``metrics`` and ``obs`` are accepted for interface compatibility and
        are not read.
        """
        leakage = hidden.contamination * self._info_value(quote, instruments)
        return -self.lambda_coi * leakage

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return diagnostic components for this objective.

        Keys:
            coi_penalty: value of ``reward`` for the same inputs
            contamination: ``hidden.contamination`` as given
            avg_margin: mean of (price - cost) / (cost + EPS)
        """
        contamination = hidden.contamination
        relative_margins = (quote.prices - instruments.costs) / (instruments.costs + EPS)
        penalty = self.reward(quote, instruments, metrics, hidden, obs)
        return {
            'coi_penalty': penalty,
            'contamination': contamination,
            'avg_margin': float(np.mean(relative_margins)),
        }

@dataclass
class RobustObjectiveConfig:
    """Tunable weights for the robust Stackelberg objective.

    All fields are plain floats with defaults, so the config can be built
    with no arguments, positionally (in the order listed), or by keyword.

    Attributes:
        lambda_coi: λ in Eq 23, the weight on the COI penalty.
        lambda_ux: Weight on the UX penalty.
        lambda_volatility: Weight on the price volatility penalty.
        gamma_inventory: Inventory risk aversion.
        wasserstein_epsilon: ε in Eq 21, the ambiguity set radius.
    """
    # --- penalty weights ---
    lambda_coi: float = 0.5
    lambda_ux: float = 0.1
    lambda_volatility: float = 0.2
    # --- risk / robustness ---
    gamma_inventory: float = 0.1
    wasserstein_epsilon: float = 0.1

class RobustStackelbergObjective(BaseObjective):
    """Implements the Maximin Objective from thesis Eq 23.

    π* = argmax_π min_{Q ∈ U_ε(P̂_N)} E_d~Q[R(p,d) - λ·COI(p)]

    The per-step reward is profit minus five penalties:
    1. COI penalty: lambda_coi * avg_margin * contamination
    2. Volatility (UX) penalty: lambda_volatility * metrics.volatility
    3. Position penalty: gamma_inventory * metrics.position_cost
    4. Lost-opportunity penalty: LOST_OPPORTUNITY_WEIGHT * metrics.lost_opportunity
    5. Robustness penalty: wasserstein_epsilon * avg_margin * worst-case
       contamination, where worst-case = min(contamination + ε, 1.0)

    The min over the ambiguity set U_ε is approximated by term 5, which
    penalizes high-contamination scenarios more heavily.

    NOTE(review): ``cfg.lambda_ux`` is not read anywhere in this class; the
    UX effect is carried solely by ``cfg.lambda_volatility``. Confirm whether
    that is intended.
    """

    # Weight on metrics.lost_opportunity (stockouts). Held in one place so
    # reward() and breakdown() cannot drift apart; subclasses may override.
    LOST_OPPORTUNITY_WEIGHT = 0.1

    def __init__(self, cfg: RobustObjectiveConfig | None = None):
        """
        Args:
            cfg: Objective weights; a default RobustObjectiveConfig is used
                when omitted.
        """
        self.cfg = cfg or RobustObjectiveConfig()

    def _penalty_terms(self, quote: Quote, instruments: InstrumentSet,
                       metrics: StepMetrics,
                       hidden: HiddenState) -> tuple[float, dict[str, float]]:
        """Compute the average absolute margin and the five penalty terms.

        Returns:
            (avg_margin, penalties) where ``penalties`` maps the breakdown key
            of each term to its non-negated magnitude, in the order in which
            ``reward`` subtracts them.
        """
        cfg = self.cfg
        alpha = hidden.contamination
        avg_margin = float(np.mean(quote.prices - instruments.costs))

        # robust adjustment: approximate the adversarial distribution Q by
        # the worst-case contamination within the ε-ball, capped at 1.0
        worst_case_alpha = min(alpha + cfg.wasserstein_epsilon, 1.0)

        penalties = {
            # high margins + high contamination = high leakage
            'coi_penalty': cfg.lambda_coi * avg_margin * alpha,
            # price volatility harms legitimate users
            'volatility_penalty': cfg.lambda_volatility * metrics.volatility,
            'position_penalty': cfg.gamma_inventory * metrics.position_cost,
            'lost_penalty': self.LOST_OPPORTUNITY_WEIGHT * metrics.lost_opportunity,
            'robustness_penalty': cfg.wasserstein_epsilon * avg_margin * worst_case_alpha,
        }
        return avg_margin, penalties

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return profit minus all penalty terms for this step.

        Profit is ``metrics.revenue - metrics.cost``. ``obs`` is accepted for
        interface compatibility and is not read.
        """
        _, penalties = self._penalty_terms(quote, instruments, metrics, hidden)

        total = metrics.revenue - metrics.cost
        # subtract one term at a time, in insertion order, so the floating
        # point result matches a single left-to-right subtraction chain
        for magnitude in penalties.values():
            total = total - magnitude
        return total

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return the components of ``reward`` plus diagnostics.

        Penalty entries are reported with a negative sign (their contribution
        to the reward). ``avg_margin_pct`` is the average absolute margin
        divided by the mean cost (+EPS to avoid division by zero).
        """
        avg_margin, penalties = self._penalty_terms(quote, instruments, metrics, hidden)

        report = {
            'revenue': metrics.revenue,
            'cost': metrics.cost,
            'profit': metrics.revenue - metrics.cost,
        }
        report.update({key: -magnitude for key, magnitude in penalties.items()})
        report['contamination'] = hidden.contamination
        report['avg_margin_pct'] = avg_margin / (float(np.mean(instruments.costs)) + EPS)
        return report

class UXPenalty(BaseObjective):
    """Penalty on price volatility above an acceptable threshold.

    High price volatility degrades the experience of legitimate human users.
    This term keeps the defense against agent reconnaissance from harming
    real customers: volatility up to ``max_acceptable_volatility`` is free,
    and anything beyond it is charged quadratically.
    """

    def __init__(self, scale: float = 1.0, max_acceptable_volatility: float = 0.1):
        """
        Args:
            scale: Multiplier on the squared excess volatility
            max_acceptable_volatility: Threshold below which no penalty applies
        """
        self.max_vol = max_acceptable_volatility
        self.scale = scale

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return ``-scale * excess**2`` where excess is volatility above the threshold.

        Only ``metrics.volatility`` is read; the other arguments are accepted
        for interface compatibility.
        """
        overshoot = metrics.volatility - self.max_vol
        # keep the overshoot only when it is strictly positive; anything
        # else (including non-comparable values) collapses to 0
        if not overshoot > 0:
            overshoot = 0
        return -self.scale * (overshoot ** 2)

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return the penalty (``ux_penalty``) alongside the raw ``volatility``."""
        penalty = self.reward(quote, instruments, metrics, hidden, obs)
        return {'ux_penalty': penalty, 'volatility': metrics.volatility}

class AdaptiveObjective(BaseObjective):
    """Objective that adapts weights based on estimated contamination.

    When contamination is low, focus on revenue maximization.
    When contamination is high, increase COI defense weight.

    The COI weight follows a sigmoid in contamination α:

        λ(α) = base + (max - base) * sigmoid(rate * (α - midpoint))

    and the reward is ``profit - λ(α) * mean(price - cost) * α``.
    """

    def __init__(self, base_lambda_coi: float = 0.3, max_lambda_coi: float = 2.0,
                 adaptation_rate: float = 2.0, midpoint: float = 0.3):
        """
        Args:
            base_lambda_coi: COI weight approached as the sigmoid goes to 0
            max_lambda_coi: COI weight approached as the sigmoid goes to 1
            adaptation_rate: Steepness of the sigmoid transition
            midpoint: Contamination level at which the sigmoid is centred.
                Defaults to 0.3, the value previously hard-coded here.
        """
        self.base_lambda = base_lambda_coi
        self.max_lambda = max_lambda_coi
        self.rate = adaptation_rate
        self.midpoint = midpoint

    def _adaptive_lambda(self, alpha: float) -> float:
        """Return the contamination-dependent COI weight λ(α)."""
        # sigmoid scaling: λ(α) = base + (max-base) * sigmoid(rate*(α-midpoint))
        # imported at call time, as in the original code
        from ...outlet.math_util import sigmoid
        scale = sigmoid(self.rate * (alpha - self.midpoint))
        return self.base_lambda + (self.max_lambda - self.base_lambda) * scale

    def _coi_penalty(self, quote: Quote, instruments: InstrumentSet, alpha: float) -> float:
        """Return λ(α) * mean(price - cost) * α, the magnitude subtracted from profit."""
        margins = quote.prices - instruments.costs
        return self._adaptive_lambda(alpha) * float(np.mean(margins)) * alpha

    def reward(self, quote: Quote, instruments: InstrumentSet,
               metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> float:
        """Return profit minus the adaptively weighted COI penalty.

        Profit is ``metrics.revenue - metrics.cost``. ``obs`` is accepted for
        interface compatibility and is not read.
        """
        profit = metrics.revenue - metrics.cost
        return profit - self._coi_penalty(quote, instruments, hidden.contamination)

    def breakdown(self, quote: Quote, instruments: InstrumentSet,
                  metrics: StepMetrics, hidden: HiddenState, obs: Observation) -> dict[str, float]:
        """Return the components of ``reward`` plus diagnostics.

        ``coi_penalty`` is reported with a negative sign (its contribution to
        the reward), so ``profit + coi_penalty`` reconciles with ``reward``.
        """
        alpha = hidden.contamination
        return {
            'profit': metrics.revenue - metrics.cost,
            'coi_penalty': -self._coi_penalty(quote, instruments, alpha),
            'adaptive_lambda': self._adaptive_lambda(alpha),
            'contamination': alpha,
        }

def make_thesis_objective(lambda_coi: float = 0.5, lambda_ux: float = 0.1,
                          lambda_vol: float = 0.2) -> CompositeObjective:
    """Build the standard thesis objective composition.

    Wraps a single RobustStackelbergObjective in a CompositeObjective. The
    remaining config fields (gamma_inventory, wasserstein_epsilon) keep
    their RobustObjectiveConfig defaults.

    Args:
        lambda_coi: Forwarded as RobustObjectiveConfig.lambda_coi
        lambda_ux: Forwarded as RobustObjectiveConfig.lambda_ux
        lambda_vol: Forwarded as RobustObjectiveConfig.lambda_volatility

    Returns:
        A CompositeObjective holding one (objective, 1.0) pair; the 1.0 is
        presumably the component weight — confirm against CompositeObjective.
    """
    config = RobustObjectiveConfig(
        lambda_coi=lambda_coi,
        lambda_ux=lambda_ux,
        lambda_volatility=lambda_vol,
    )
    components = [(RobustStackelbergObjective(config), 1.0)]
    return CompositeObjective(components)