% -*- TeX-master: t -*-
\documentclass[12pt,letterpaper]{article}

\input{preamble}

\begin{document}

\begin{titlepage}
    \centering
    \includegraphics[width=\textwidth]{graphics/banner.png}\\[0.8cm]
    \LARGE\textbf{PHANTOM: Pricing Heuristics Against Non-human Transaction Orchestration Mechanisms}\\[0.5cm]
    \Large\textbf{Daniel Rösel}\\
    \large\textit{Bachelor of Computer Science \& Artificial Intelligence}\\[0.5cm]
    \Large\textit{Supervised by:}\\
    \Large\textbf{Alberto Martín Izquierdo}\\
    \large\textit{IE University, Madrid, Spain}\\[1cm]
    \large\today
\end{titlepage}

\begin{abstract}
\noindent
Large language model (LLM) agents are spreading in e-commerce; one consequence is intermediaries that can separate information gathering from transaction execution. This thesis studies dynamic pricing when agents survey in isolated sessions and thereby weaken the \emph{Cost of Information} (COI), the premium platforms typically extract once demand signals are expressed.

We formalize the phenomenon and prove a Cost of Information theorem: as independent, utility-maximizing agents saturate price queries, the platform's sustainable margin goes to zero, so ordinary dynamic pricing is incentive-incompatible in the limit.

The defensive design combines behavioral signals with distributionally robust optimization (DRO). We implement a controlled storefront on a hybrid batch-streaming architecture and show that human and agent sessions induce different transition kernels. Kullback--Leibler divergence to class prototypes yields session scores that feed a distributionally robust reinforcement learning (DR-RL) policy, posed as a Stackelberg game with a Wasserstein ambiguity set over demand so the learner does not collapse to a single empirical demand curve under shifting contamination.

Factorial training on TPUs confirms the expected short-run revenue hit from contamination and shows that the robust objective recovers COI and equilibrium structure in harder regimes (higher contamination, larger catalogs), while accounting for UX to prevent supra-competitive pricing. Code and an interaction dataset are released for work on agent-mediated traffic.
\end{abstract}

\noindent\textbf{Keywords:} Dynamic Pricing, LLM Agents, Adversarial Machine Learning, E-commerce, Behavioral Detection, Reinforcement Learning

\vspace{0.5em}
\noindent\textbf{Project page:} \url{https://velocitatem.github.io/PHANTOM/}

\clearpage
\tableofcontents
\clearpage
\input{chapters/01-intro}
\input{chapters/02-literature-review}
\input{chapters/03-methodology}
\input{chapters/04-results}
\input{chapters/05-discussion}
\input{chapters/06-conclusion}

\input{chapters/acknowledgements}

\printbibliography

\clearpage
\appendix
\section{Terminology}
\begin{description}
\item[Agent $A$] A non-human actor, typically an LLM-driven system that executes web actions toward a goal.
\item[Human $H$] A human participant interacting with the platform to complete a task.
\item[Actor Class $Y$] A latent class parameter describing whether a session is generated by a human or an agent profile.
\item[Platform] A web interface exposing purchasable items and their offered prices.
\item[Session $s$] A bounded interaction record tied to one actor and one session identifier.
\item[Event $e_{s,k}$] A single interaction tuple in a session, including action, item target, and timestamp.
\item[Trajectory $\tau_s$] The ordered sequence of events generated within a session.
\item[Demand Proxy $\hat{q}_{t,i}$] A weighted aggregate of observed actions used as an operational substitute for latent demand.
\item[Action Weight Function $\omega(a)$] A mapping from action type to signal strength in the demand proxy.
\item[True Demand $d(p\mid Y,\theta)$] The latent purchase response as a function of price, actor class, and latent type.
\item[Contamination $\alpha$] The proportion of agent-generated traffic in the session mixture.
\item[Non-stationary Noise $\epsilon_t$] Time-varying residual variation not explained by the actor mixture.
\item[Pricing Policy $\pi(\tau)$] A function mapping observed interaction history to an offered price.
\item[Cost of Information (COI)] The expected premium above the minimum viable price induced by the pricing policy.
\item[COI Leakage] A per-quote penalty term modeling information revealed to reconnaissance behavior.
\item[First-Order Statistic $p_{(1)}$] The minimum observed price among multiple independent queries.
\item[Transition Kernel $\mathcal{T}$] A Markov transition matrix over behavioral states or actions.
\item[Distinguishability] The degree to which human and agent sessions can be distinguished from behavior alone.
\item[KL Divergence $D_{\mathrm{KL}}$] A relative-entropy measure used to compare session transition structure against class prototypes.
\item[Divergence Scores $\Delta_H,\Delta_A$] Session-level distances to human and agent transition centroids.
\item[Weak Agent Probability $f(\tau)$] A session-level score estimating the likelihood that a trajectory is agent-generated.
\item[Contamination Generator $\mathcal{G}(\alpha)$] A simulator component that injects synthetic agent trajectories to reach a target mixture level.
\item[Stackelberg Game] A leader-follower formulation where the platform sets prices and demand responds.
\item[Ambiguity Set $\mathcal{U}_{\epsilon}$] A set of plausible demand distributions considered under distributional uncertainty.
\item[Wasserstein Ball] A distance-bounded neighborhood around an empirical distribution used in robust optimization.
\item[DR-RL] Distributionally Robust Reinforcement Learning for policies trained against worst-case distributional shifts.
\item[Nominal Contamination $\alpha_0$] The baseline contamination level around which robust candidates are evaluated.
\item[Robustness Radius $\epsilon_\alpha$] The local interval width used for inner minimization over contamination scenarios.
\item[Query-Tax Surrogate] A constant leakage proxy assigning fixed penalty to suspected reconnaissance queries.
\item[Revelation Surrogate] A leakage proxy based on $-\log\pi(p\mid\tau)$ to penalize highly informative quotes.
\item[Limbo Stack] The alternating game-history buffer that stores leader price moves and follower demand responses.
\item[UX Index] A bounded user-experience metric tracked to evaluate policy side effects on legitimate users.
\item[Look-to-Book Ratio] The ratio of search-like interactions to completed purchases, used as an operational contamination indicator.
\item[Hybrid Kappa-Lambda Architecture] A data design combining streaming ingestion with offline and batch learning loops.
\item[MDP / POMDP] Sequential decision models with full observability (MDP) or partial observability (POMDP).
\item[Behavioral Model] A model predicting what action is likely to follow from prior actions.
\item[LLM] Large Language Model served through an inference provider with tool-use capability.
\item[TPU] Tensor Processing Unit, a specialized accelerator architecture developed by Google.
\end{description}

\section{Aggregate Compute Budget Derivation}
\label{app:compute_budget}

The claimed peak throughput of approximately 160\,PFLOPS follows from multiplying the per-chip BF16 peak (from official Google Cloud TPU documentation) by the number of chips in each allocation tier and summing across generations.

\begin{table}[ht]
\centering
\caption{Per-generation contribution to aggregate BF16 throughput.}
\label{tab:compute_derivation}
\begin{tabular}{@{}lrrr@{}}
\toprule
\textbf{TPU Gen.} & \textbf{Chips} & \textbf{Peak BF16/chip (TFLOPS)} & \textbf{Subtotal (TFLOPS)} \\
\midrule
v6e (Trillium) & 128 & 918 & $128 \times 918 = 117{,}504$ \\
v5e            & 128 & 197 & $128 \times 197 = 25{,}216$  \\
v4             &  64 & 275 & $64  \times 275 = 17{,}600$  \\
\midrule
\textbf{Total} & \textbf{320} & & $\mathbf{160{,}320}$ \\
\bottomrule
\end{tabular}
\end{table}

Converting to petaFLOPS: $160{,}320\;\text{TFLOPS} = 160.32\;\text{PFLOPS} \approx 160\;\text{PFLOPS}$. This is the theoretical peak under sustained BF16 arithmetic; realized throughput depends on memory bandwidth utilization and inter-chip communication overhead, but the figure serves as a useful upper bound for provisioning decisions.


\section{KL divergence when the reference has zeros}
\label{app:kl_zeros}

The textbook definition $D_{\mathrm{KL}}(P\parallel Q)=\sum_k P(k)\log(P(k)/Q(k))$ is not usable as-is when our empirical reference assigns $Q(k)=0$ to an outcome $k$ that the session distribution still visits: if $P(k)>0$ and $Q(k)=0$, that term diverges to $+\infty$. With only 29 sessions, the estimated transition rows are very sparse, so such zero entries are common.

In code we apply the standard fix: add a small floor $\varepsilon$ to both the numerator and the denominator inside the logarithm so that neither is exactly zero, which turns the sum into a finite, smoothed surrogate rather than a literal KL divergence computed on raw counts. We also skip source states that do not appear at all in the reference kernel, because the reference provides no row to compare against. This keeps the pipeline running and the divergence scores on a comparable scale, at the cost that the resulting number is a regularized approximation of the KL divergence, not an exact information-theoretic quantity. This is acceptable here because we only use the gap between the human-anchored and agent-anchored scores as a weak separability signal.


\section{Expanding the Intuition of Information Value in the Reward}
\label{app:revelation_log}

Leakage is $\text{COI}_{\text{leak}} = f(\tau')\cdot\text{InfoValue}$. The query-tax form fixes $\text{InfoValue}=c>0$. The revelation form sets $\text{InfoValue}(p,\tau')=-\log\pi(p\mid\tau')$, with $\pi(\cdot\mid\tau')$ the policy distribution over quoted prices in context $\tau'$ (discretized as in the engine).

For an outcome with probability $q$, the quantity $-\log q$ is its \emph{surprisal}. For independent events, $-\log\prod_i q_i=\sum_i(-\log q_i)$. The revelation term is the surprisal under $X\sim\pi(\cdot\mid\tau')$, multiplied by $f(\tau')$. In practice we use $\max\{\pi,\varepsilon\}$ in place of $\pi$ so the log stays finite (in the same spirit as Appendix~\ref{app:kl_zeros}).


% \input{../build/concatenated_code}

\end{document}