Paper lit review (#45)

* chore: updating apa citation and fixing citation in-text and parent * fixing in lit review * adjusting citations and improving schema * chore: fixed formatting and adjusting other components * refined abstract * one page fitting * constrainative proposals * fix: syntax of transition probs * refined lit review and sources * research Objectives * adding logo graphics * chore: fixing citation completeness * updating with newly built algorithm * lit review document setup
2026-07-15 17:43:36 +00:00 · 2026-01-26 13:04:32 +01:00
parent a9d73ccce5
commit b5f19e04b7
9 changed files with 375 additions and 77 deletions
--- a/2
+++ b/2
@@ -73,7 +73,7 @@ stats.lines:
 	@find . \( -path '*/node_modules' -o -path '*/.venv' -o -path '*/venv' \) -prune -o \
 	\( -name "*.ts" -o -name "*.py" \) -type f -print0 | xargs -0 cat | wc -l

-.PHONY wordcount
+.PHONY: wordcount
 wordcount:
 	@echo "Counting words in main text (excluding appendix)..."
 	@texcount -nosub -total -sum -1 \
--- a/paper/src/auto/main.el
+++ b/paper/src/auto/main.el
@@ -12,11 +12,6 @@
    "preamble"
    "chapters/01-intro"
    "chapters/02-literature-review"
-    "chapters/03-methodology"
-    "chapters/04-results"
-    "chapters/05-discussion"
-    "chapters/06-conclusion"
-    "../build/concatenated_code"
    "article"
    "art12"))
 :latex)
--- a/paper/src/bib/references.bib
+++ b/paper/src/bib/references.bib
@@ -26,7 +26,7 @@
 	file = {PDF:/home/velocitatem/Zotero/storage/Q7J5EBEJ/3447815.pdf:application/pdf},
 }

-@phdthesis{salassa_politecnico_nodate,
+@phdthesis{salassa_politecnico_2024,
 	title = {Algorithmic {Pricing} in the digital age: {Ethical} considerations on its economic and social implications, and an analysis of possible solutions to overcome its critical issues},
 	abstract = {Algorithmic pricing is an emerging business practice that uses computational algorithms to determine
 the prices of products and services based on a number of dynamic factors. The aim of this thesis is to
@@ -50,6 +50,8 @@ laws, for fair and non-discriminatory use.},
 	urldate = {2025-11-12},
 	school = {Politecnico di Torino},
 	author = {Salassa, Fabio and Pautassi, Paolo},
+	month = apr,
+	year = {2024},
 	file = {PDF:/home/velocitatem/Zotero/storage/L95WYQ8B/m-api-06aad998-d926-0d59-5593-82fdce5a678b.pdf:application/pdf},
 }

@@ -62,11 +64,12 @@ laws, for fair and non-discriminatory use.},
 	file = {PDF:/home/velocitatem/Zotero/storage/IZD3C5SR/m-api-26f6207c-cc89-4aed-29b6-34629f18fe9b.pdf:application/pdf},
 }

-@article{shahidi_coasean_nodate,
+@article{shahidi_coasean_2025,
 	title = {The {Coasean} {Singularity}? {Demand}, {Supply}, and {Market} {Design} with {AI} {Agents}},
 	abstract = {AI agents—autonomous systems that perceive, reason, and act on behalf of human principals—are poised to transform digital markets by dramatically reducing transaction costs. This chapter evaluates the economic implications of this transition, adopting a consumer-oriented view of agents as market participants that can search, negotiate, and transact directly. From the demand side, agent adoption reflects derived demand: users trade off decision quality against effort reduction, with outcomes mediated by agent capability and task context. On the supply side, firms will design, integrate, and monetize agents, with outcomes hinging on whether agents operate within or across platforms. At the market level, agents create efficiency gains from lower search, communication, and contracting costs, but also introduce frictions such as congestion and price obfuscation. By lowering the costs of preference elicitation, contract enforcement, and identity verification, agents expand the feasible set of market designs but also raise novel regulatory challenges. While the net welfare effects remain an empirical question, the rapid onset of AI-mediated transactions presents a unique opportunity for economic research to inform real-world policy and market design.},
 	language = {en},
 	author = {Shahidi, Peyman and Rusak, Gili and Manning, Benjamin S and Fradkin, Andrey and Horton, John J},
+	year = {2025},
 	file = {PDF:/home/velocitatem/Zotero/storage/TQCAPJDP/Shahidi et al. - The Coasean Singularity Demand, Supply, and Market Design with AI Agents.pdf:application/pdf},
 }

@@ -84,10 +87,14 @@ laws, for fair and non-discriminatory use.},
 	file = {PDF:/home/velocitatem/Zotero/storage/ZLJQ4DQ9/Byrnes - 2025 - Intro to Brain-Like-AGI Safety.pdf:application/pdf},
 }

-@article{shannon_mathematical_nodate,
+@article{shannon_mathematical_1948,
 	title = {A {Mathematical} {Theory} of {Communication}},
+	volume = {27},
 	language = {en},
+	journal = {Bell System Technical Journal},
 	author = {Shannon, C E},
+	month = oct,
+	year = {1948},
 	file = {PDF:/home/velocitatem/Zotero/storage/FJRFRWK2/Shannon - A Mathematical Theory of Communication.pdf:application/pdf},
 }

@@ -96,11 +103,13 @@ laws, for fair and non-discriminatory use.},
 	file = {PDF:/home/velocitatem/Zotero/storage/D3QRGY9Z/order_stats.pdf:application/pdf},
 }

-@article{devine_nonlinear_nodate,
+@article{devine_nonlinear_2017,
 	title = {Nonlinear {Pricing} with {Costly} {Information} {Acquisition}},
 	abstract = {This paper examines a nonlinear pricing model where the firm can choose to acquire costly information prior to offering contract menus to consumers; such as paying a consultant or investing in machine learning technologies. Information provides the firm with a signal about consumers types, whose accuracy increases as the firm acquires larger amounts of information. We show that the firm chooses to acquire information, only if it can purchase a sufficient amount that could alter its initial prior beliefs. Relative to standard settings where firms cannot acquire information, we identify how information acquisition changes optimal contract offers, equilibrium profits, information rents, and welfare. A better-informed firm increases its expected profits, but it can also increase expected utility when the cost of information is intermediate. Our results recommend balanced online privacy laws.},
 	language = {en},
 	author = {Devine, Brett R and Munoz-Garcia, Felix},
+	month = nov,
+	year = {2017},
 	file = {PDF:/home/velocitatem/Zotero/storage/GQ28KVBF/Devine and Munoz-Garcia - Nonlinear Pricing with Costly Information Acquisition.pdf:application/pdf},
 }

@@ -202,10 +211,11 @@ laws, for fair and non-discriminatory use.},
 	file = {PDF:/home/velocitatem/Zotero/storage/U7A5Q78V/Karten et al. - 2025 - LLM Economist Large Population Models and Mechanism Design in Multi-Agent Generative Simulacra.pdf:application/pdf},
 }

-@techreport{mullapudi_reinforcement_nodate,
+@techreport{mullapudi_reinforcement_2025,
 	title = {A {Reinforcement} {Learning} {Approach} to {Dynamic} {Pricing}},
 	abstract = {Dynamic pricing represents a critical strategic challenge in modern e-commerce, where firms must navigate fluctuating demand, inventory constraints, and aggressive competitor actions. Traditional static and heuristic-based pricing models often fail to capture the complex, non-linear dynamics of competitive digital markets, leading to suboptimal profitability. This paper proposes a model-free reinforcement learning (RL) framework to address this challenge. Specifically, we design, implement, and evaluate a Q-learning agent capable of learning an optimal, state-dependent pricing policy. The agent is trained and evaluated within a simulated market environment constructed from the publicly available "Retail Price Optimization" dataset from Kaggle, which provides a rich feature set including historical sales, product characteristics, seasonality, and, crucially, competitor pricing data. The problem is formulated as a Markov Decision Process (MDP), where the agent's state incorporates its price position relative to competitors, competitor price trends, and seasonal factors. The agent's performance is benchmarked against three baseline strategies: static pricing, a reactive "follow-the-leader" heuristic, and random pricing. The results demonstrate that the Q-learning agent achieves a substantial increase in total cumulative profit over the evaluation period, outperforming all baselines by learning a nuanced policy that strategically balances price adjustments in response to market conditions. This work provides a practical and reproducible blueprint for applying reinforcement learning to optimize pricing decisions in a simulated yet realistic competitive retail environment, highlighting the potential of RL to automate complex strategic decision-making.},
 	author = {Mullapudi, Pavan},
+	year = {2025},
 	note = {Publication Title: International Journal on Science and Technology (IJSAT) IJSAT25049558
 Volume: 16
 Issue: 4},
@@ -294,10 +304,11 @@ Issue: 4},
 	file = {PDF:/home/velocitatem/Zotero/storage/S8635QX6/varian95a.pdf:application/pdf},
 }

-@book{russell_artificial_nodate,
+@book{russell_artificial_2021,
 	title = {Artificial {Intelligence} {A} {Modern} {Approach} {Fourth} {Edition} {Global} {Edition}},
 	isbn = {978-1-292-40117-1},
 	author = {Russell, Stuart and Norvig, Peter},
+	year = {2021},
 	file = {PDF:/home/velocitatem/Zotero/storage/6B8W8S27/efdd4d1d4c2087fe1cbe03d9ced67f34.pdf:application/pdf},
 }

@@ -312,10 +323,11 @@ Volume: 21},
 	file = {PDF:/home/velocitatem/Zotero/storage/N9JNXFJW/live-1333-2265-jair.pdf:application/pdf},
 }

-@techreport{shoham_multiagent_nodate,
+@techreport{shoham_multiagent_2009,
 	title = {Multiagent {Systems}: {Algorithmic}, {Game}-{Theoretic}, and {Logical} {Foundations}},
 	url = {http://www.masfoundations.org},
 	author = {Shoham, Yoav and Leyton-Brown, Kevin},
+	year = {2009},
 	keywords = {algorithms, auctions, communication, competition, cooperation, distributed problem solving, game theory, learning, logic, mechanism design, social choice},
 	file = {PDF:/home/velocitatem/Zotero/storage/QZVYS7V9/shoham09a.pdf:application/pdf},
 }
@@ -331,11 +343,13 @@ Volume: 21},
 	file = {PDF:/home/velocitatem/Zotero/storage/H8IS64AW/2411.13768v2.pdf:application/pdf},
 }

-@techreport{xie_osworld_nodate,
+@techreport{xie_osworld_2024,
 	title = {{OSWORLD}: {Benchmarking} {Multimodal} {Agents} for {Open}-{Ended} {Tasks} in {Real} {Computer} {Environments}},
 	url = {https://os-world.github.io},
 	abstract = {Autonomous agents that accomplish complex computer tasks with minimal human interventions have the potential to transform human-computer interaction, significantly enhancing accessibility and productivity. However, existing benchmarks either lack an interactive environment or are limited to environments specific to certain applications or domains, failing to reflect the diverse and complex nature of real-world computer use, thereby limiting the scope of tasks and agent scalability. To address this issue, we introduce OSWORLD, the first-of-its-kind scalable, real computer environment for multimodal agents, supporting task setup, execution-based evaluation, and interactive learning across various operating systems such as Ubuntu, Windows, and macOS. OSWORLD can serve as a unified, integrated computer environment for assessing open-ended computer tasks that involve arbitrary applications. Building upon OSWORLD, we create a benchmark of 369 computer tasks involving real web and desktop apps in open domains, OS file I/O, and workflows spanning multiple applications. Each task example is derived from real-world computer use cases and includes a detailed initial state setup configuration and a custom execution-based evaluation script for reliable, reproducible evaluation. Extensive evaluation of state-of-the-art LLM/VLM-based agents on OSWORLD reveals significant deficiencies in their ability to serve as computer assistants. While humans can accomplish over 72.36\% of the tasks, the best model achieves only 12.24\% success, primarily struggling with GUI grounding and operational knowledge. Comprehensive analysis using OSWORLD provides valuable insights for developing multimodal generalist agents that were not possible with previous benchmarks. Our code, environment, baseline models, and data are publicly available at https://os-world.github.io.},
 	author = {Xie, Tianbao and Zhang, Danyang and Chen, Jixuan and Li, Xiaochuan and Zhao, Siheng and Cao, Ruisheng and Jing Hua, Toh and Cheng, Zhoujun and Shin, Dongchan and Lei, Fangyu and Liu, Yitao and Xu, Yiheng and Zhou, Shuyan and Savarese, Silvio and Xiong, Caiming and Zhong, Victor and Yu, Tao},
+	month = may,
+	year = {2024},
 	note = {arXiv: 2404.07972v2},
 	file = {PDF:/home/velocitatem/Zotero/storage/LLRKXIC7/full-text.pdf:application/pdf},
 }
@@ -364,17 +378,21 @@ Volume: 21},
 	file = {PDF:/home/velocitatem/Zotero/storage/QNXZJLRM/S2444883425000038.pdf:application/pdf},
 }

-@misc{ghaffary_amazon_nodate,
+@misc{ghaffary_amazon_2025,
 	title = {Amazon {Sues} to {Stop} {Perplexity} {From} {Using} {AI} {Tool} to {Buy} {Stuff}},
 	url = {https://www.bloomberg.com/news/articles/2025-11-04/amazon-demands-perplexity-stop-ai-agent-from-making-purchases},
 	author = {Ghaffary, Shirin and Day, Matt},
+	month = nov,
+	year = {2025},
 	file = {PDF:/home/velocitatem/Zotero/storage/IQL6FPWE/Amazon Sues to Stop Perplexity From Using AI Tool to Buy Stuff - Bloomberg.pdf:application/pdf},
 }

-@techreport{besbes_dynamic_nodate,
+@techreport{besbes_dynamic_2007,
 	title = {Dynamic {Pricing} {Without} {Knowing} the {Demand} {Function}: {Risk} {Bounds} and {Near}-{Optimal} {Algorithms}},
 	abstract = {We consider a single product revenue management problem where, given an initial inventory, the objective is to dynamically adjust prices over a finite sales horizon to maximize expected revenues. Realized demand is observed over time, but the underlying functional relationship between price and mean demand rate that governs these observations (otherwise known as the demand function or demand curve), is not known. We consider two instances of this problem: i.) a setting where the demand function is assumed to belong to a known parametric family with unknown parameter values; and ii.) a setting where the demand function is assumed to belong to a broad class of functions that need not admit any parametric representation. In each case we develop policies that learn the demand function "on the fly," and optimize prices based on that. The performance of these algorithms is measured in terms of the regret: the revenue loss relative to the maximal revenues that can be extracted when the demand function is known prior to the start of the selling season. We derive lower bounds on the regret that hold for any admissible pricing policy, and then show that our proposed algorithms achieve a regret that is "close" to this lower bound. The magnitude of the regret can be interpreted as the economic value of prior knowledge on the demand function; manifested as the revenue loss due to model uncertainty.},
 	author = {Besbes, Omar and Zeevi, Assaf},
+	month = dec,
+	year = {2007},
 	note = {Publication Title: Operations Research},
 	keywords = {learning, asymptotic analysis, estimation, exploration-exploitation, pricing, Revenue management, value of information},
 	file = {PDF:/home/velocitatem/Zotero/storage/SBAIB4V2/Dp_wo_demand_risk_ob_az_posted.pdf:application/pdf},
@@ -423,3 +441,124 @@ Volume: 21},
 	keywords = {Computer Science - Computation and Language},
 	file = {PDF:/home/velocitatem/Zotero/storage/3Z2XK4QC/Ganie - 2025 - Uncertainty in Authorship Why Perfect AI Detection Is Mathematically Impossible.pdf:application/pdf},
 }
+
+@article{shi_distributionally_2024,
+	title = {Distributionally {Robust} {Model}-{Based} {Offline} {Reinforcement} {Learning} with {Near}-{Optimal} {Sample} {Complexity}},
+	abstract = {This paper concerns the central issues of model robustness and sample efficiency in offline reinforcement learning (RL), which aims to learn to perform decision making from history data without active exploration. Due to uncertainties and variabilities of the environment, it is critical to learn a robust policy—with as few samples as possible—that performs well even when the deployed environment deviates from the nominal one used to collect the history dataset. We consider a distributionally robust formulation of offline RL, focusing on tabular robust Markov decision processes with an uncertainty set specified by the Kullback-Leibler divergence in both finite-horizon and infinite-horizon settings. To combat with sample scarcity, a model-based algorithm that combines distributionally robust value iteration with the principle of pessimism in the face of uncertainty is proposed, by penalizing the robust value estimates with a carefully designed data-driven penalty term. Under a mild and tailored assumption of the history dataset that measures distribution shift without requiring full coverage of the state-action space, we establish the finite-sample complexity of the proposed algorithms. We further develop an information-theoretic lower bound, which suggests that learning RMDPs is at least as hard as the standard MDPs when the uncertainty level is sufficient small, and corroborates the tightness of our upper bound up to polynomial factors of the (effective) horizon length for a range of uncertainty levels. To the best our knowledge, this provides the first provably near-optimal robust offline RL algorithm that learns under model uncertainty and partial coverage.},
+	language = {en},
+	author = {Shi, Laixi and Chi, Yuejie},
+	month = jun,
+	year = {2024},
+	file = {PDF:/home/velocitatem/Zotero/storage/K56G4EIP/Shi and Chi - Distributionally Robust Model-Based Oﬄine Reinforcement Learning with Near-Optimal Sample Complexity.pdf:application/pdf},
+}
+
+@article{dutting_mechanism_2025,
+	title = {Mechanism {Design} for {Large} {Language} {Models} ({Extended} {Abstract})},
+	abstract = {We investigate auction mechanisms for AI-generated content, focusing on applications like ad creative generation. In our model, agents’ preferences over stochastically generated content are encoded as large language models (LLMs). We propose an auction format that operates on a token-by-token basis, and allows LLM agents to influence content creation through single dimensional bids. We formulate two desirable incentive properties and prove their equivalence to a monotonicity condition on output aggregation. This equivalence enables a second-price rule design, even absent explicit agent valuation functions. Our design is supported by demonstrations on a publicly available LLM.},
+	language = {en},
+	author = {Dütting, Paul and Mirrokni, Vahab and Paes Leme, Renato and Xu, Haifeng and Zuo, Song},
+	year = {2025},
+	file = {PDF:/home/velocitatem/Zotero/storage/2ABDEYDN/Dütting et al. - Mechanism Design for Large Language Models (Extended Abstract).pdf:application/pdf},
+}
+
+@misc{fcmi_machine_2025,
+	title = {Machine {Speed} {Markets}: {AI} {Agent} {Market} {Strategy} \& {Growth}},
+	shorttitle = {Machine {Speed} {Markets}},
+	url = {https://www.360strategy.co.uk/post/machine-speed-markets-ai-agents},
+	abstract = {Recent research by NBER economists suggests these AI agents in particular, could drive a "Coasean singularity," a point where transaction costs fall towards zero, radically reshaping how markets function. In essence, tasks like finding information, negotiating deals, and enforcing contracts which are traditionally costly frictions in commerce, may become nearly instantaneous and costless.},
+	language = {en},
+	urldate = {2026-01-20},
+	journal = {360 Strategy},
+	author = {Evans, Mark},
+	month = nov,
+	year = {2025},
+	file = {Snapshot:/home/velocitatem/Zotero/storage/Z22P9JJH/machine-speed-markets-ai-agents.html:text/html},
+}
+
+@article{coase_nature_1937,
+	title = {The {Nature} of the {Firm}},
+	volume = {4},
+	issn = {1468-0335},
+	url = {https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1468-0335.1937.tb00002.x},
+	doi = {10.1111/j.1468-0335.1937.tb00002.x},
+	language = {en},
+	number = {16},
+	urldate = {2026-01-20},
+	journal = {Economica},
+	author = {Coase, R. H.},
+	year = {1937},
+	pages = {386--405},
+	file = {Full Text PDF:/home/velocitatem/Zotero/storage/TABLLPEU/Coase - 1937 - The Nature of the Firm.pdf:application/pdf;Snapshot:/home/velocitatem/Zotero/storage/Q5RFW9LJ/j.1468-0335.1937.tb00002.html:text/html},
+}
+
+@misc{fish_algorithmic_2025,
+	title = {Algorithmic {Collusion} by {Large} {Language} {Models}},
+	url = {http://arxiv.org/abs/2404.00806},
+	doi = {10.48550/arXiv.2404.00806},
+	abstract = {The rise of algorithmic pricing raises concerns of algorithmic collusion. We conduct experiments with algorithmic pricing agents based on Large Language Models (LLMs). We find that LLM-based pricing agents quickly and autonomously reach supracompetitive prices and profits in oligopoly settings and that variation in seemingly innocuous phrases in LLM instructions (“prompts”) may substantially influence the degree of supracompetitive pricing. Off-path analysis using novel techniques uncovers price-war concerns as contributing to these phenomena. Our results extend to auction settings. Our findings uncover unique challenges to any future regulation of LLM-based pricing agents, and AI-based pricing agents more broadly.},
+	language = {en},
+	urldate = {2026-01-20},
+	publisher = {arXiv},
+	author = {Fish, Sara and Gonczarowski, Yannai A. and Shorrer, Ran I.},
+	month = sep,
+	year = {2025},
+	note = {arXiv:2404.00806 [econ]},
+	keywords = {Computer Science - Computer Science and Game Theory, Computer Science - Artificial Intelligence, Economics - General Economics},
+	file = {PDF:/home/velocitatem/Zotero/storage/QHWVISCZ/Fish et al. - 2025 - Algorithmic Collusion by Large Language Models.pdf:application/pdf},
+}
+
+@misc{hardt_strategic_2015,
+	title = {Strategic {Classification}},
+	url = {http://arxiv.org/abs/1506.06980},
+	doi = {10.48550/arXiv.1506.06980},
+	abstract = {Machine learning relies on the assumption that unseen test instances of a classification problem follow the same distribution as observed training data. However, this principle can break down when machine learning is used to make important decisions about the welfare (employment, education, health) of strategic individuals. Knowing information about the classifier, such individuals may manipulate their attributes in order to obtain a better classification outcome. As a result of this behavior—often referred to as gaming—the performance of the classifier may deteriorate sharply. Indeed, gaming is a well-known obstacle for using machine learning methods in practice; in financial policy-making, the problem is widely known as Goodhart’s law. In this paper, we formalize the problem, and pursue algorithms for learning classifiers that are robust to gaming.},
+	language = {en},
+	urldate = {2026-01-20},
+	publisher = {arXiv},
+	author = {Hardt, Moritz and Megiddo, Nimrod and Papadimitriou, Christos and Wootters, Mary},
+	month = nov,
+	year = {2015},
+	note = {arXiv:1506.06980 [cs]},
+	keywords = {Computer Science - Machine Learning},
+	file = {PDF:/home/velocitatem/Zotero/storage/HNCDYGWS/Hardt et al. - 2015 - Strategic Classification.pdf:application/pdf},
+}
+
+@misc{liu_contextual_2024,
+	title = {Contextual {Dynamic} {Pricing} with {Strategic} {Buyers}},
+	url = {http://arxiv.org/abs/2307.04055},
+	doi = {10.48550/arXiv.2307.04055},
+	abstract = {Personalized pricing, which involves tailoring prices based on individual characteristics, is commonly used by firms to implement a consumer-specific pricing policy. In this process, buyers can also strategically manipulate their feature data to obtain a lower price, incurring certain manipulation costs. Such strategic behavior can hinder firms from maximizing their profits. In this paper, we study the contextual dynamic pricing problem with strategic buyers. The seller does not observe the buyer's true feature, but a manipulated feature according to buyers' strategic behavior. In addition, the seller does not observe the buyers' valuation of the product, but only a binary response indicating whether a sale happens or not. Recognizing these challenges, we propose a strategic dynamic pricing policy that incorporates the buyers' strategic behavior into the online learning to maximize the seller's cumulative revenue. We first prove that existing non-strategic pricing policies that neglect the buyers' strategic behavior result in a linear $\Omega(T)$ regret with $T$ the total time horizon, indicating that these policies are not better than a random pricing policy. We then establish that our proposed policy achieves a sublinear regret upper bound of $O(\sqrt{T})$. Importantly, our policy is not a mere amalgamation of existing dynamic pricing policies and strategic behavior handling algorithms. Our policy can also accommodate the scenario when the marginal cost of manipulation is unknown in advance. To account for it, we simultaneously estimate the valuation parameter and the cost parameter in the online pricing policy, which is shown to also achieve an $O(\sqrt{T})$ regret bound. Extensive experiments support our theoretical developments and demonstrate the superior performance of our policy compared to other pricing policies that are unaware of the strategic behaviors.},
+	language = {en},
+	urldate = {2026-01-20},
+	publisher = {arXiv},
+	author = {Liu, Pangpang and Yang, Zhuoran and Wang, Zhaoran and Sun, Will Wei},
+	month = jun,
+	year = {2024},
+	note = {arXiv:2307.04055 [stat]},
+	keywords = {Computer Science - Machine Learning, Statistics - Machine Learning, Computer Science - Computer Science and Game Theory, Computer Science - Artificial Intelligence},
+	file = {PDF:/home/velocitatem/Zotero/storage/MVJNULK3/Liu et al. - 2024 - Contextual Dynamic Pricing with Strategic Buyers.pdf:application/pdf},
+}
+
+@techreport{dhir_http_2025,
+	type = {Internet {Draft}},
+	title = {{HTTP} {Agent} {Profile} ({HAP}): {Authenticated} and {Monetized} {Agent} {Traffic} on the {Web}},
+	shorttitle = {{HTTP} {Agent} {Profile} ({HAP})},
+	url = {https://datatracker.ietf.org/doc/draft-dhir-http-agent-profile},
+	abstract = {Autonomous agents such as LLM-powered crawlers, browser-integrated assistants, and task-oriented bots are rapidly becoming first-class HTTP clients on the Web. Today’s infrastructure largely assumes a human behind a browser and monetizes content through advertising and coarse subscriptions. Automated agents consume content at scale without rendering pages or viewing ads, exacerbating bot-mitigation arms races and economic misalignment between content providers and AI systems. This document describes an HTTP Agent Profile (HAP) that enables: (1) cryptographic authentication of agent traffic using HTTP Message Signatures; (2) clear separation between human and agent traffic using privacy-preserving human tokens; and (3) protocol-level value exchange for agents via HTTP status code 402 ("Payment Required") and pluggable micropayment mechanisms. The profile reuses existing HTTP features and is designed for incremental deployment via reverse proxies, CDNs, and agent libraries.},
+	number = {draft-dhir-http-agent-profile-00},
+	urldate = {2026-01-20},
+	institution = {Internet Engineering Task Force},
+	author = {Dhir, Sanat},
+	month = nov,
+	year = {2025},
+	note = {Num Pages: 13},
+}
+
+@misc{noauthor_amazoncom_2026,
+	title = {Amazon.com {Services} {LLC} v. {Perplexity} {AI}, {Inc}},
+	language = {en},
+	month = jan,
+	year = {2026},
+	note = {No. 3:25-cv-09514-MMC},
+	file = {PDF:/home/velocitatem/Zotero/storage/4JWZSTXJ/Posner - UNITED STATES DISTRICT COURT NORTHERN DISTRICT OF CALIFORNIA SAN FRANCISCO DIVISION.pdf:application/pdf},
+}
--- a/paper/src/chapters/01-intro.tex
+++ b/paper/src/chapters/01-intro.tex
@@ -14,17 +14,26 @@ This research effort touches a large variety of domains, spanning behavioral eco

 \subsection{Motivation and Market Context}

-The current innovation boom in generative artificial intelligence and its applications to knowledge-based work tasks has brought many competing technologies for browser-use automation, with benchmarks and evaluations \cite{xia_evaluation-driven_2025} motivating the development of capabilities focused on commercial research, understanding, and transaction execution \cite{xie_osworld_nodate}. The ``AI Agent'' market is forecasted to grow from around USD 5-8 billion in 2025 to USD 42-52 billion by 2030. This surge reflects adoption in e-commerce, customer service, and enterprise automation, where agents handle interactions previously done by humans, raising the question of how these systems should be designed for future robustness as well as how to maintain a competitive edge in the analytical components of e-commerce platforms \cite{markntel_advisors_global_2025}.
+The current innovation boom in generative artificial intelligence and its applications to knowledge-based work tasks has brought many competing technologies for browser-use automation, with benchmarks and evaluations \parencite{xia_evaluation-driven_2025} motivating the development of capabilities focused on commercial research, understanding, and transaction execution \parencite{xie_osworld_2024}. The ``AI Agent'' market is forecasted to grow from around USD 5-8 billion in 2025 to USD 42-52 billion by 2030. This surge reflects adoption in e-commerce, customer service, and enterprise automation, where agents handle interactions previously done by humans, raising the question of how these systems should be designed for future robustness as well as how to maintain a competitive edge in the analytical components of e-commerce platforms \parencite{markntel_advisors_global_2025}.

-The key stakeholders affected by the threat of increasing agent-driven traffic include online businesses and platform operators (especially in bot-heavy sectors like retail, travel, and financial services), their security, fraud, and engineering teams, end users whose accounts and data are exposed and whose experience degrades, regulators and legal stakeholders responding to breaches and fraud, and the attackers or bot operators driving the automation \cite{imperva_rapid_2025}.
+The key stakeholders affected by the threat of increasing agent-driven traffic include online businesses and platform operators (especially in bot-heavy sectors like retail, travel, and financial services), their security, fraud, and engineering teams, end users whose accounts and data are exposed and whose experience degrades, regulators and legal stakeholders responding to breaches and fraud, and the attackers or bot operators driving the automation \parencite{imperva_rapid_2025}.

-The industry has already seen legal action in cases like Amazon against Perplexity \cite{ghaffary_amazon_nodate}, stemming from the difficulty of identifying traffic from hybrid systems like the Commet browser. This paper explores such systems to better understand what the interaction data looks like and what it means for dynamic pricing and recommendation systems downstream. This observed impact indicates a need for prevention of secondary negative effects on the ``legacy'' systems which power modern revenue sources for many companies. Dynamic pricing algorithms rely on directly translating demand features $q$ to new price assignments $\hat{p}$ across a catalogue of products of size $N$. This opens opportunities to design a \textit{tabula rasa} of digital market mechanisms that will shape the future of commerce in the age of artificial intelligence.
+The industry has already seen legal action in cases like Amazon against Perplexity \parencite{ghaffary_amazon_2025}, stemming from the difficulty of identifying traffic from hybrid systems like the Comet browser. This paper explores such systems to better understand what the interaction data looks like and what it means for dynamic pricing and recommendation systems downstream. This observed impact indicates a need for prevention of secondary negative effects on the ``legacy'' systems which power modern revenue sources for many companies. Dynamic pricing algorithms rely on directly translating demand features $q$ to new price assignments $\hat{p}$ across a catalogue of products of size $N$. This opens opportunities to design a \textit{tabula rasa} of digital market mechanisms that will shape the future of commerce in the age of artificial intelligence.

 \subsection{Solution Space Overview}
-Dynamic pricing systems, as presented in \cite{mueller_low-rank_2019}, often deal with sparse low-rank data of demand signals which, combined with contamination from agents, creates complex interactions that impact pricing. To further complicate the problem, certain commercial settings such as the one presented in \cite{amjad_censored_2017} must address the true demand of products under censored observations. This provides a formulation for handling demand in our case with multiple kinds of commercial mediators: $\hat{q} \gets q_A + q_H$ where $q_A$ represents the distribution of demand generated by agentic mediators and $q_H$ represents that of true human demand, these are two distinct populations with divergent objective functions.
+Dynamic pricing systems, as presented by \textcite{mueller_low-rank_2019}, often deal with sparse low-rank data of demand signals which, combined with contamination from agents, creates complex interactions that impact pricing. To further complicate the problem, certain commercial settings such as the one presented by \textcite{amjad_censored_2017} must address the true demand of products under censored observations. This provides a formulation for handling demand in our case with multiple kinds of commercial mediators: $\hat{q} \gets q_A + q_H$ where $q_A$ represents the distribution of demand generated by agentic mediators and $q_H$ represents that of true human demand, these are two distinct populations with divergent objective functions.

 We formally define interaction data as coming from some actor which can either be an agent ($A$) or human ($H$). For purposes of this research, an agent is an algorithmic loop with the ability to access a web platform and perform actions such as clicks, scrolls, and input field fills. The loop terminates when the internal large language model judges the provided task definition as complete. A detailed breakdown can be found in \cref{algagent-loop}.

+\subsection{Research Questions}
+
+This work addresses three core research questions:
+\begin{enumerate}
+    \item[\textbf{RQ1}] \textit{Separability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting?
+    \item[\textbf{RQ2}] \textit{Theoretical Impact}: What is the formal relationship between agent contamination levels and the erosion of pricing power in dynamic pricing systems?
+    \item[\textbf{RQ3}] \textit{Robust Mitigation}: How can pricing policies be constructed to maintain margin integrity under unknown and non-stationary levels of agent contamination?
+\end{enumerate}
+

 \begin{algorithm}[t]
 \DontPrintSemicolon
@@ -54,4 +63,4 @@ Extract final result $r$ from terminal state\;
 \end{algorithm}


-The previously described goal of separability allows us to formulate a task which entails taking raw interaction data for either actor and creating a composite demand estimate $\hat{q}$. We propose a robust optimization objective defined in our methodology, transforming the pricing problem into a form of Distributionally Robust Optimization \cite{kuhn_distributionally_2025} where the learner must guard against adversarial contamination in observed demand distributors. In this setting we must learn to make decision that perform under the assumption of not having a single estimated probability distribution but under an ambiguity set of any distribution, of which we have limited information. In our case as stated is a mixture of distributions with a parameter which is unknown and non-stationary.
+The previously described goal of separability allows us to formulate a task which entails taking raw interaction data for either actor and creating a composite demand estimate $\hat{q}$. We propose a robust optimization objective defined in our methodology, transforming the pricing problem into a form of Distributionally Robust Optimization \parencite{kuhn_distributionally_2025} where the learner must guard against adversarial contamination in observed demand distributions. In this setting we must learn to make decisions that perform well not under a single estimated probability distribution but under an ambiguity set of distributions about which we have limited information. In our case, as stated, this is a mixture of distributions with a parameter which is unknown and non-stationary.
--- a/paper/src/chapters/02-literature-review.tex
+++ b/paper/src/chapters/02-literature-review.tex
@@ -1,28 +1,29 @@
 \section{Literature Review}

-To better understand all wedges of the work, we must start by exploring the nature of agents and agentic computer use and web automation, complementing that with economic reasoning and strategic interaction. The final surface to cover, leads us to data-driven dynamic pricing under uncertainty. The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on. The introduction of these mediating actor entities into economic systems, is further creating a threat of false-name bidding \cite{yokoo_effect_2004}, which prior research has explored in a trading context. Other research on pseudonyms in dynamic systems, demonstrate whitewashing in AI agents which can ignore defensive mechanisms by re-entry with different identities \cite{feldman_free-riding_2004}. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for separating non-human reconnaissance from genuine human demand expression and integrating that separation into pricing heuristics without degrading legitimate user experience (in our research tracked by the user-experience index). This gap, is what our contribution aims to address, particularly for the aforementioned stakeholder groups.
+To better understand all wedges of the current works, we must start by exploring the nature of agents, agentic computer use and web automation, complementing that with economic reasoning and strategic interaction. The final surface to cover leads us to data-driven dynamic pricing under uncertainty. The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on. The former case of agents shopping is currently pending legal action in the case of \textcite{noauthor_amazoncom_2026}, which is being treated as a violation of the Computer Fraud and Abuse Act. The introduction of these mediating actor entities into economic systems is further creating a threat of false-name bidding \parencite{yokoo_effect_2004}, which prior research has explored in a trading context. Other research on pseudonyms in dynamic systems demonstrates whitewashing in AI agents which can ignore defensive mechanisms by re-entry with different identities \parencite{feldman_free-riding_2004}. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for separating non-human reconnaissance from genuine human demand expression and integrating that separation into pricing heuristics without degrading legitimate user experience (in our research tracked by the user-experience index). This gap is what our contribution aims to address, particularly for the aforementioned stakeholder groups.

 \subsection{Agent Taxonomy and Definitions}

-An agent in the context of artificial intelligence is generally defined by anything that can reason and act upon observations of its environments (collected through some sensory inputs) and carry out actions through effectors. Moreover, a rational agent is an entity that is capable of perceiving the world around them and taking actions to advance specified goals. This definition by \cite{russell_artificial_nodate} is further developed in an economic context by \cite{parkes_economic_2015}, suggesting AI research attempts to construct a synthetic \textit{homo economicus}, which may also be termed \textit{machina economicus}.
-A specific class or taxon of this \textit{machina economicus}, the Large Language Model (LLM) agent, is defined as an autonomous system capable of achieving goals and adapting post-training, often without needing explicit code or fundamental model changes. \cite{xia_evaluation-driven_2025}
+An agent in the context of artificial intelligence is generally defined by anything that can reason and act upon observations of its environments (collected through some sensory inputs) and carry out actions through effectors. Moreover, a rational agent is an entity that is capable of perceiving the world around them and taking actions to advance specified goals. This definition by \textcite{russell_artificial_2021} is further developed in an economic context by \textcite{parkes_economic_2015}, suggesting AI research attempts to construct a synthetic \textit{homo economicus}, which may also be termed \textit{machina economicus}.
+A specific class or taxon of this \textit{machina economicus}, the Large Language Model (LLM) agent, is defined as an autonomous system capable of achieving goals and adapting post-training, often without needing explicit code or fundamental model changes \parencite{xia_evaluation-driven_2025}.

-We must however acknowledge the current SOTA as presented by OSWORLD simulations in \cite{xie_osworld_nodate} have demonstrated that multi-modal tasks across desktop and web interaction modes, have a top-performing score of only 12.24\% success, whereas humans have a higher 72\% success rate. This weakness matters for this research because it clarifies the near-term threat model: practical exploitation does not require a fully competent ``computer assistant'', only enough automation to perform high-volume reconnaissance actions (search/filter/open product pages, probe availability/price boundaries) that can contaminate behavioral signals. With the expected growth of these capabilities, this threat only becomes more perilous to revenue management systems.
+We must however acknowledge that the current SOTA, as presented in the OSWORLD simulations by \textcite{xie_osworld_2024}, has demonstrated that multi-modal tasks across desktop and web interaction modes have a top-performing score of only 12.24\% success, whereas humans have a higher 72\% success rate; this is linked to the lack of grounding of these agents and their inability to handle unexpected errors. This weakness matters for this research because it clarifies the near-term threat model: practical exploitation does not require a fully competent ``computer assistant'', only enough automation to perform high-volume reconnaissance actions (search/filter/open product pages, probe availability/price boundaries) that can contaminate behavioral signals. With the expected growth of these capabilities, this threat only becomes more perilous to revenue management systems.

-We model an agent session as producing some events with lower in-session conversion levels relative to humans, this we state in our assumption that $P(\text{purchase} \vert A) \ll P(\text{purchase} \vert H)$ but with a potentially higher volatility in $\hat{q}$, which we observe through the look-to-book metrics in our simulation.
+We model an agent session as producing some events with lower in-session conversion levels relative to humans, this we state in our assumption that $P(\text{purchase} \vert A) < P(\text{purchase} \vert H)$ but with a potentially higher volatility in $\hat{q}$, which we observe through the look-to-book metrics in our simulation.

 \subsection{Economic Agents: From Homo Economicus to Machina Economicus}

-Existing behavioral economic models tend to be criticized for the assumption of rational behavior, as is embodied in the term of homo economicus. The definition of a machina economicus by \cite{parkes_economic_2015} is quite appropriate for our case, particularly because these assumptions of rationality have been argued to be a very adequate reference for AI research by \cite{varian_economic_1995}. For modeling this behavior, the trajectories of these agents can be formally defined to be partially observable Markov decision processes. \cite{xie_osworld_nodate} Agents are however not to be confused with web-bots which have previously been known as automated software applications or scrapers which are set with a purpose of carrying out specific tasks on the internet, without a higher level of internal judgement. \cite{imperva_rapid_2025} In our research, we refer to this actor simply as an Agent belonging to the distribution $A$.
+Existing behavioral economic models tend to be criticized for the assumption of rational behavior, as is embodied in the term of homo economicus. The definition of a machina economicus by \textcite{parkes_economic_2015} is quite appropriate for our case, particularly because these assumptions of rationality have been argued to be a very adequate reference for AI research by \textcite{varian_economic_1995} due to its expected utility maximizing nature. For modeling this behavior, the trajectories of these agents can be formally defined to be partially observable Markov decision processes \parencite{xie_osworld_2024}. Agents are however not to be confused with web-bots which have previously been known as automated software applications or scrapers which are set with a purpose of carrying out specific tasks on the internet, without a higher level of internal judgement \parencite{imperva_rapid_2025}. In our research, we refer to this actor simply as an Agent belonging to the distribution $A$.

-This economic framing also helps separate two related but distinct phenomena of agents as buyers (changing market demand composition), and agents as information gatherers (changing the observed interactions used by pricing/recommendation systems). The thesis focuses on the second, where information acquisition strategically precedes purchase execution. We do not however dismiss the proposed expectation that existing economic systems serving humans, will not be populated by AIs across multiple channels and with various possibly misaligned goals as stated by \cite{parkes_economic_2015}.
+This economic framing also helps separate two related but distinct phenomena of agents as buyers (changing market demand composition), and agents as information gatherers (changing the observed interactions used by pricing/recommendation systems). The thesis focuses on the second, where information acquisition strategically precedes purchase execution. We do not however dismiss the proposed expectation that existing economic systems serving humans will be populated by AIs across multiple channels and with various possibly misaligned goals as stated by \textcite{parkes_economic_2015}.

+A HAP (HTTP Agent Profile) protocol has been developed as an internet draft by \textcite{dhir_http_2025} in an effort to separate agentic and human internet traffic; however, majority adoption by both sellers and agent providers would be required for the implementation of such a solution.

 \subsection{Problem Evidence and Market Impact}

-The statistical issue of contamination in dynamic pricing systems that observe demand features as a means to update prices has been documented in various previous contexts. The airline industry (which has accounted for 24\% of observed disruptions) has seen malicious activity with a measureable impact on skewing key performance indicators by behavior visible in the look-to-book metrics. Excessive reconnaissance traffic inflates search volume without corresponding completed bookings, thereby skewing demand forecasts and disrupting dynamic pricing models. Demand proxies have also been observed to cause significant threat to inventory management by creating artificial scarcity that distorts the demand-supply relationships in the enterprise model. Censored demand as shown in \cite{amjad_censored_2017} can also be observed in low-bias demand under-estimation caused by a distortion effect coming from non-human traffic data. \cite{imperva_rapid_2025}
+The statistical issue of contamination in dynamic pricing systems that observe demand features as a means to update prices has been documented in various previous contexts. The airline industry (which has accounted for 24\% of observed disruptions) has seen malicious activity with a measurable impact on skewing key performance indicators by behavior visible in the look-to-book metrics. Excessive reconnaissance traffic inflates search volume without corresponding completed bookings, thereby skewing demand forecasts and disrupting dynamic pricing models. Demand proxies have also been observed to cause significant threat to inventory management by creating artificial scarcity that distorts the demand-supply relationships in the enterprise model. Censored demand as shown by \textcite{amjad_censored_2017} can also be observed in low-bias demand under-estimation caused by a distortion effect coming from non-human traffic data \parencite{imperva_rapid_2025}.

-When dynamic pricing algorithms operate on highly contaminated or noisy data, the risk grows significantly in creating inaccurate price inferences. The emergent mitigation driven by un-informed reward and regret signals might lead to price suppression for sales continuity which results in harming margins and resulting in a revenue loss. System that poorly fit undesired behavior might result in price gouging, which calls for strong guardrails while preserving targeted business strategy. \cite{mullapudi_reinforcement_nodate}
+When dynamic pricing algorithms operate on highly contaminated or noisy data, the risk grows significantly in creating inaccurate price inferences. The emergent mitigation driven by un-informed reward and regret signals might lead to price suppression for sales continuity, which harms margins and results in a revenue loss. Systems that poorly fit undesired behavior might result in price gouging, which calls for strong guardrails while preserving targeted business strategy \parencite{mullapudi_reinforcement_2025}.


 %Documented instances of agent-driven market disruptions - Quantitative evidence of pricing manipulation - Case studies from affected industries
@@ -30,17 +31,41 @@ When dynamic pricing algorithms operate on highly contaminated or noisy data, th
 \subsection{Theoretical Foundations: Economic Parallels}


+Early hints of price exploration appear in the standard English auction examined by \textcite{varian_economic_1995}, where prices are explored in a sequential manner, which leads to a marginally different cost to the bidder than the reservation price of the seller. This is a setting in which there is no cost incurred by the buyer for their actions or exploring prices in the market. They propose that any agent responsible for the pricing of a good must be immune to dynamic strategies which might extract private information from a market. A key take-away which relates to the Vickrey auction mechanism (also called a \textit{direct mechanism}) suggests that not only would defenses against such exploitation be necessary, but the construction of a mechanism in which revelation of the true willingness to pay is the dominant strategy for commerce.

-Early hints of exploration of prices in a standard English auction explored in \cite{varian_economic_1995} which hints at exploration of prices in a sequential manner, which leads to a marginally different cost to the bidder than the reservation price of the seller. This is a setting in which there is no cost incured by the buyer for their actions or exploring prices in the market. They propose that any agent responsable for the pricing of a good must be imune to dynamic strategies which might extract private information from a market. A key take-away which relates to the Vickery auction mechanism (also called a \textit{direct mechanism}) suggests that not only would defenses against such exploitation be necessary, but the construction of a mechanism in which revelation of the true willingness to pay is the dominant strategy for commerce.
+Like in classical revenue-maximizing auctions \parencite{roughgarden_cs364a_2013} we assume that the human actor in our system has a private valuation $v$ which we formally draw from intrinsically defined distributions. The important note here is that the agent proxy does not have a mechanism to convey this private information into the demand data which directly impacts the pricing systems.

-Like in classical revenue-maximizing auctions \cite{roughgarden_cs364a_2013} we assume that the human actor in our system has a private valuation $v$ which we formally draw from later defined distributions. The important note here is that the agent proxy does not have a mechanism to convey this private information into the demand data which directly impacts the pricing systems.
+The key component of this mediation between agents and commercial platforms lies in the transaction costs related to information gathering and negotiation. As proposed by \textcite{shahidi_coasean_2025} these costs are bound to collapse towards zero (which we demonstrate mathematically), calling for a re-evaluation of the boundaries between firms and markets. As argued by \textcite{coase_nature_1937}, market participation, and the time associated with that participation, is a critical part of the Coasean transaction cost logic which includes the discovery of relevant pricing within a given market. This process of price discovery without the presence of AI Agents can be time consuming and resource intensive. To build on top of this work we provide a proof of optimal conditions theorised by Coase as an extension to AI-mediated markets.

 % Economic foundations: relating the problem to options pricing theory. Cost of Information (COI) concept and its relevance

-% Link Coasean Singularity and other economic market theory and highlight specific information of supra competitive pricing.


 \subsection{Landscape of Existing Work}

-Previous efforts in adversarial computer use LLM agents, show how multi-faceted the whole problem is
-Here we can show a market visualization (venn-like-diagram)
+Explorations of the algorithmic collusion by LLMs \parencite{fish_algorithmic_2025} have demonstrated a cross-model tendency of market division with a strong sensitivity to instructions provided in the ``system prompt''. If a dynamic pricing algorithm which is trained to respond to market signals learns to coordinate with competitor agents (or becomes manipulated by those agents), the market equilibrium is under threat of destabilization. This is particularly true for Q-learning pricing learners as demonstrated by \textcite{calvano_artificial_2018}.
+
+Our effort to combat contamination stems from research by \textcite{hardt_strategic_2015} on strategic classification, in conjunction with \textcite{liu_contextual_2024} who demonstrate a linear regret if contamination is ignored. The strategic classification adversarial effect comes from an effort to manipulate some representative features used in a learning pipeline, which can result in lower prices on loans or lower prices from dynamic pricing algorithms.
+
+To bridge the gap between detection and robust pricing, we look at work in Distributionally Robust Optimization (DRO). As defined by \textcite{kuhn_wasserstein_2024}, DRO provides a framework for decision-making under ambiguity, where the true data distribution is unknown but lies within a ``Wasserstein ball'' of a target distribution. In our context, the ``ambiguity set'' represents the uncertainty introduced by agentic reconnaissance. By optimizing for the worst-case distribution within this set, pricing mechanisms can become resilient to the distributional shifts such as the ones caused by non-human actors, effectively robustifying the revenue function against the contamination described in our problem statement.
+
+In order to create an environment in which prices can be tested against a demand estimate generated by some behavioral model, we take inspiration from the architecture proposed by \textcite{ie_recsim_2019} in the RecSim platform built for recommendation systems. By modeling the distinct user behavior as POMDPs we can generate faithful interactions which allow us to generalize past the constraint, which is also present in recommendation systems, of rarely having enough experience with individual actor's interactions for good recommendations without generalization. The key inspiration comes from the user choice modeling which we translate to a user transition model for each distinct actor type (agent or human). We further consider the possibility of modeling our quantitative research platform using dynamic Bayesian networks for the sake of tractability within the system. The contribution of RecSim enables researchers to better understand learning algorithms in fixed environments, a gap we identify as needing to be bridged within the space of dynamic pricing.
+
+We also acknowledge the difficulty in similarly affected fields such as authorship, where \textcite{ganie_uncertainty_2025} demonstrate the theoretical limits of the distributional divergence between text authored by a human or large language model. Their approach of computing the divergence between two distributions demonstrates purely theoretically that no classifier can outperform random guessing on their particular task. This is yet another factor to take into consideration when exploring the potential mitigation strategies.
+
+The setting of our work is quite complex and covers a wide range of topics, each with its own set of issues that further complicate the task at hand. There is however promise in the field of reinforcement learning and adversarial robustness to combat these problems. We can summarize the characteristics learned from the review of our environment as:
+\begin{enumerate*}[label=(\roman*), itemjoin={{; }}, itemjoin*={{; and }}]
+\item non-stationary demand with temporal noise $\epsilon_t$
+\item contaminated behavioral signals from mixed human-agent traffic with unknown mixing ratio $\alpha$
+\item partial observability where only demand proxies $\hat{q}$ are available, not true demand $d(\cdot)$
+\item strategic actors capable of feature manipulation to influence pricing outcomes
+\item information asymmetry with private valuations $v$ drawn from unknown distributions
+\item session-based interactions modeled as POMDPs with trajectories $\tau_s$
+\item low conversion probability for agents: $P(\text{purchase} \mid A) < P(\text{purchase} \mid H)$
+\item distributional uncertainty requiring robust optimization within Wasserstein ambiguity sets
+\item potential for adversarial exploitation through false-name bidding and identity whitewashing.
+\end{enumerate*}
+
+
+%Previous efforts in adversarial computer use .LLM agents, show how multi-faceted the whole problem is
+%Here we can show a market visualization (venn-like-diagram)
--- a/paper/src/chapters/03-methodology.tex
+++ b/paper/src/chapters/03-methodology.tex
@@ -19,13 +19,15 @@ where:

 The platform does not directly observe the true underlying demand function $d(p)$. Instead, it observes a behavioral proxy $\hat{q}_t$, which is a composite signal derived from the mixture of actor types. We define the demand proxy for product $i$ at epoch $t$ as a weighted aggregation of events:
 \begin{equation}
+\label{eq:qhat}
 \hat{q}_{t,i} = \sum_{s \in \mathcal{S}_t} \sum_{k=1}^{L_s} \omega(a_{s,k}) \cdot \mathbb{1}[i_{s,k} = i]
 \end{equation}
 where $\omega: \mathcal{A} \to \mathbb{R}_+$ assigns weights to actions based on their signal strength regarding willingness to pay.

 \subsubsection{Actor Types and Demand Curves}
-We formalize the heterogeneity of actors by introducing a type space $\Theta$. An actor of class $Y_s$ is further parameterized by a type $\theta \sim \mathcal{D}_{Y}$. This type determines the actor's demand response function $d(p; \theta)$, sampled from a distribution of possible demand curves. The total observed demand is a stochastic process governed by the mixture:
+We formalize the heterogeneity of actors by introducing a type space $\Theta$. An actor of class $Y_s$ is further parameterized by a type $\theta \sim \mathcal{D}_{Y}$. This type determines the actor's demand response function $d(p; \theta)$, sampled from a distribution of possible demand curves. The total observed demand is a stochastic process governed by the naively defined mixture:
 \begin{equation}
+\label{eq:mixture_demand}
 Q(p) = (1-\alpha) \cdot \mathbb{E}_{\theta \sim \mathcal{D}_H}[d(p; \theta)] + \alpha \cdot \mathbb{E}_{\theta \sim \mathcal{D}_A}[d(p; \theta)] + \epsilon_t
 \end{equation}
 where $\alpha \in [0, 1]$ represents the contamination parameter (proportion of agents) and $\epsilon_t$ is non-stationary market noise.
@@ -164,7 +166,6 @@ The experimentation begins with the design of goals, with careful consideration

 The purpose of this effort to gather data on interactions, is the first half of our research. With this collected data on behavioral characteristics, enhanced by our feature augmentation, we can create distribution separation into two bins $y \in \{A,H\}$ with a certain probability $p$ dependent on the session-specific features. To address the second loop of our system, we use this gained capability of discrimination to enhance the learner design involved in our surrogate dynamic pricing task which simulates an independent dynamic pricing scenario under which we can train a more controlled policy with the ability to account for true demand signals under conditions of contamination from non-human actors.

-
 Our approach can be well summarized by a three-stage division, first we intend to observe and \textit{vectorize} the behavioral interaction data from our experiments, we then develop the separability which helps us deepen the semantic understanding of the behavioral patterns. Finally we use our newly gained learner to leverage a defensive mechanism within the simulation stage of a controlled dynamic pricing loop.

 \begin{figure}[ht]
@@ -174,19 +175,79 @@ Our approach can be well summarized by a three-stage division, first we intend t
  \caption{Overview of the Dynamic Pricing Tasks.}
 \end{figure}

+Our web platform (developed along similar lines to RecSim by \textcite{ie_recsim_2019}) allows us to set up a controlled environment in which we assign tasks to human and agentic actors, which are then carried out. Each actor gets a browser-assigned experiment identifier which persists across possibly multiple session identifiers. We then group by experiment and extract all the session interactions (trajectories), which follow the schema formalized below.

-Study methodology and approach. Data acquisition strategy. Defined objectives and success criteria. Observable metrics and KPIs.
+\subsubsection{Interaction Schema}
+
+We extend the basic event tuple $e_{s,k}$ to capture the full observational signal available to the platform. An interaction event is defined as the extended tuple:
+\begin{equation}
+e_{s,k} = \left( a_{s,k}, \, i_{s,k}, \, t_{s,k}, \, \mu_{s,k}, \, \delta_{s,k} \right)
+\end{equation}
+where $\mu_{s,k} \in \mathcal{M}$ is a metadata record containing action-specific context (e.g., price observed, filter parameters, element text), and $\delta_{s,k} \in \mathbb{R}_+$ is the dwell time in milliseconds for attention-based actions.
+
+A session $s$ is itself a structured record:
+\begin{equation}
+s = \left( \text{sid}, \, \text{eid}, \, t_0, \, \phi, \, \mathcal{U}, \, \tau_s \right)
+\end{equation}
+where $\text{sid}$ is a unique session identifier (UUID), $\text{eid}$ optionally links to an experiment, $t_0$ is the session start timestamp, $\phi \in \{\texttt{hotel}, \texttt{airline}\}$ denotes the platform mode, $\mathcal{U}$ is the user-agent string, and $\tau_s$ is the trajectory of events.
+
+The action space $\mathcal{A}$ is partitioned into four semantic categories based on the behavioral signal each action conveys:
+
+\begin{table}[ht]
+\centering
+\caption{Action space partition $\mathcal{A} = \mathcal{A}_{\text{nav}} \cup \mathcal{A}_{\text{cart}} \cup \mathcal{A}_{\text{filter}} \cup \mathcal{A}_{\text{dwell}}$ with signal interpretation.}
+\label{tab:action_space}
+\begin{tabular}{@{}llll@{}}
+\toprule
+\textbf{Category} & \textbf{Actions} & \textbf{Signal} & $\boldsymbol{\omega}$ \\
+\midrule
+$\mathcal{A}_{\text{cart}}$ & \texttt{add\_item}, \texttt{remove}, \texttt{checkout}, \texttt{purchase} & Purchase intent & High \\
+$\mathcal{A}_{\text{dwell}}$ & \texttt{hover\_title}, \texttt{hover\_paragraph}, \texttt{hover\_link} & Sustained attention & Medium \\
+$\mathcal{A}_{\text{nav}}$ & \texttt{page\_view}, \texttt{view\_item}, \texttt{learn\_more} & Discovery & Low \\
+$\mathcal{A}_{\text{filter}}$ & \texttt{search}, \texttt{filter\_date}, \texttt{filter\_price}, \texttt{sort} & Preference refinement & Lowest \\
+\bottomrule
+\end{tabular}
+\end{table}
+
+This partition enables the weight function $\omega$ from Eq.~\ref{eq:qhat} to assign category-specific signal strengths, with $\omega(\mathcal{A}_{\text{cart}}) > \omega(\mathcal{A}_{\text{dwell}}) > \omega(\mathcal{A}_{\text{nav}}) > \omega(\mathcal{A}_{\text{filter}})$ reflecting decreasing commitment.
+
+The metadata record $\mu$ varies by action type. For product views, $\mu$ contains the observed price $p_{\text{obs}}$ and product attributes. For dwell events, $\mu$ includes the element text and accumulated hover duration. This heterogeneous structure is captured via a schema-on-read approach in our Kafka ingestion pipeline, where events are validated against type-specific schemas before storage.
+
+In addition to behavioral events, the platform logs price observations to a separate Kafka topic. Each price query generates a record $(i, p, \text{sid}, \phi, t)$ associating the product, displayed price, requesting session, platform mode, and timestamp. This dual-stream architecture enables joint analysis of price exposure and behavioral response.


 \subsection{Generative Contamination and Separability}

-To develop a robust pricing agent, we require a simulation environment capable of generating realistic, contaminated interaction data. We achieve this by learning from our Phantom platform data using a two-stage approach.
+To develop a robust pricing learner, we require a simulation environment capable of generating realistic, contaminated interaction data. We achieve this by learning from our Phantom platform data using a two-stage approach.



 \subsubsection{GOFAI-Based Separability}
 We employ Good Old-Fashioned AI (GOFAI) heuristics to generate initial weak labels for separability. We define a set of rule-based predicates $\phi_j: \tau \to \{0, 1\}$ to partition the dataset $\mathcal{D}$ into high-confidence sets $\mathcal{D}_H$ and $\mathcal{D}_A$. We construct distinct MDPs per each behavioral profile of humans and agents and from those we establish $D_{KL}$. From initial findings we compute a KL divergence of $\approx 2.0236$ across transition probabilities between states which can be seen in \ref{fig:human_mdp_viz} and \ref{fig:agent_mdp_viz}.

+\begin{definition}[Kullback-Leibler Divergence for Transition Distributions]
+Let $P_e$ and $Q_e$ be categorical distributions over destination states following event $e$, derived from human and agent trajectories respectively. The KL divergence between these distributions is:
+\begin{equation}
+  D_{\mathrm{KL}}(P_e \parallel Q_e) = \sum_{k \in \mathcal{S}_e} P_e(k) \log \frac{P_e(k)}{Q_e(k)}
+\end{equation}
+where $\mathcal{S}_e$ denotes the set of destination events that follow $e$ in the human trajectories.
+\end{definition}
+
+To obtain this statistic we aggregate state transitions by their triggering event $e$ and treat the normalized outgoing probabilities as the categorical distributions $P_e$ (human) and $Q_e$ (agent). The computation intersects the event labels observed in both datasets, then iterates over each label and accumulates the log-ratio score. In practice this is implemented exactly as in the definition above: for each destination $k$ we multiply the human probability by the log of the probability ratio and add the result to the running sum. Large contributions (including the case where $Q_e(k)$ is near zero) point to intents, such as rapid checkout or repeated navigation, that the agent policy fails to reproduce and that therefore drive the contamination analysis.
+
+With this divergence we train a contrastive learning method to estimate a weak probability $f(\cdot) \to [0,1]$ of a given trajectory being generated by an agent, which we can use as leverage for a weighted sum. This is a first attempt at a more informed separability.
+
+
+\subsubsection{Transition Probability Estimation}
+\label{sec:tpe}
+
+
+  For both subsets, we model the session dynamics as a Markov Decision Process (MDP) and estimate the transition kernel $\mathcal{T}$. For each respective actor type we define $\hat{\mathcal{T}}_A$ and $\hat{\mathcal{T}}_H$, which are the general transition kernels subject to clustering into $\hat{\mathcal{T}}_y^i$, where $i$ ranges over the behavioral clusters of $\hat{\mathcal{T}}_y$. This is done to avoid lumping all actor behavior together and allows for more intra-class penalization. The probability of transitioning to state $s'$ given state $s$ is estimated via maximum likelihood:
+\begin{equation}
+    \hat{P}(s' \mid s) = \frac{N(s, s')}{\sum_{k \in \mathcal{S}} N(s, k)}
+\end{equation}
+where $N(s, s')$ is the count of observed transitions. This allows us to construct a \textit{Contamination Generator} $\mathcal{G}(\alpha)$. Given a clean trajectory dataset, $\mathcal{G}$ injects synthetic agent trajectories sampled from the learned transition matrix $\hat{P}_A$ until the effective mixing ratio reaches $\alpha$. From these transition probabilities we can observe an important feature which contributes to a differentiating assumption: the mouse behavior of an agent is almost non-existent and is therefore not utilized as a distinguishing factor, neither in the prior separability nor in any feature engineering.
+
 \begin{figure}[ht]
    \centering
    \includegraphics[width=0.8\textwidth]{chapters/mdp_human.pdf}
@@ -201,30 +262,42 @@ We employ Good Old-Fashioned AI (GOFAI) heuristics to generate initial weak labe
    \label{fig:agent_mdp_viz}
  \end{figure}

-\subsubsection{Transition Probability Estimation}
-For both subsets, we model the session dynamics as a Markov Decision Process (MDP) and estimate the transition kernel $\mathcal{T}$. The probability of transitioning to state $s'$ given state $s$ is estimated via maximum likelihood:
-\begin{equation}
-    \hat{P}(s' \mid s) = \frac{N(s, s')}{\sum_{k \in \mathcal{S}} N(s, k)}
-\end{equation}
-where $N(s, s')$ is the count of observed transitions. This allows us to construct a \textit{Contamination Generator} $\mathcal{G}(\alpha)$. Given a clean trajectory dataset, $\mathcal{G}$ injects synthetic agent trajectories sampled from the learned transition matrix $\hat{P}_A$ until the effective mixing ratio reaches $\alpha$.
+
+\subsection{Stronger Classification}
+We re-map the current event schema semantically to the event schema of another dataset. Our contaminated dataset is then used in another classifier, where we can now also apply better feature engineering on other features while assigning correct labels to the entire dataset, so that the new dataset can be contaminated with $\mathcal{G}$ under some different contamination ratio $\alpha$.
+
+This new classifier can then be used in the reinforcement learning reward structure.
+

 \subsection{Distributionally Robust Reinforcement Learning (DR-RL)}

-We formulate the pricing problem as a Stackelberg Game where the Platform (Leader) sets prices $p_t$ and the Aggregate Demand (Follower) responds. However, the exact mixing parameter $\alpha$ and the demand distribution shift are non-stationary and unknown in online settings. Relying on a simple error term $\epsilon$ is insufficient. Instead, we adopt a Distributionally Robust Optimization (DRO) objective.
+We formulate the pricing problem as a Stackelberg Game where the Platform (Leader) sets prices $p_t$ and the Aggregate Demand (Follower) responds. However, the exact mixing parameter $\alpha$ and the demand distribution shift are non-stationary and unknown in online settings. Relying on a simple error term $\epsilon$ is insufficient. Instead, we adopt a Distributionally Robust Optimization (DRO) objective. We now formulate the entire dependency chain starting from the trajectory $\tau^\prime$, a new trajectory observed by the platform and generated by an unknown actor type (sampled over a behavioral profile defined in Section~\ref{sec:tpe}). As part of the dynamic pricing we need a mapping of demand parameterized by a price and a trajectory, $\hat{Q}(p, \tau^\prime)$. For an observed trajectory we compute a new $\hat{\mathcal{T}}^\prime$ and, using baseline controlled observations of both $\bar{\mathcal{T}}_H$ and $\bar{\mathcal{T}}_A$, we can compute the following at inference time:
+
+\begin{align}
+  \label{eq:delta_H}
+  \Delta_H &= D_{KL}(\hat{\mathcal{T}}^\prime \parallel \bar{\mathcal{T}}_H) \\
+  \label{eq:delta_A}
+  \Delta_A &= D_{KL}(\hat{\mathcal{T}}^\prime \parallel \bar{\mathcal{T}}_A)
+\end{align}
+
+This creates two centroid-like heuristics which can, at per-session granularity, guide our mixing parameter $\alpha$.

 \subsubsection{Ambiguity Set Construction}
 We define an ambiguity set $\mathcal{U}_\epsilon(\hat{P}_N)$ centered around our empirical reference distribution $\hat{P}_N$ (derived from the generator $\mathcal{G}$). We utilize the Wasserstein distance metric to define the set of plausible demand distributions the agent might face:
 \begin{equation}
 \mathcal{U}_\epsilon(\hat{P}_N) = \left\{ Q \in \mathcal{P}(\Xi) : W_p(Q, \hat{P}_N) \le \epsilon \right\}
 \end{equation}
-This set captures all distributions that are statistically close to our observed training data but allows for adversarial shifts (e.g., sudden bot spikes).
+This set captures all distributions that are statistically close to our observed training data but allows for adversarial shifts.

 \subsubsection{The Min-Max Objective}
 The robust policy $\pi^*$ is obtained by solving the maximin problem:
 \begin{equation}
+\label{eq:robust_policy}
 \pi^* = \arg \max_{\pi} \min_{Q \in \mathcal{U}_\epsilon} \mathbb{E}_{d \sim Q} \left[ R(p, d) - \lambda \cdot \text{COI}(p) \right]
 \end{equation}
-where $R(p, d)$ is the revenue function and $\lambda$ weighs the penalty for information leakage (COI).
+where $R(p, d)$ is the revenue function and $\lambda$ weighs the penalty for information leakage (COI). We previously defined $\text{COI}$; however, to properly connect this concept to the reward structure we need to define a parametrized version, $\text{COI}(p)$, which informs us of the leakage of said structure.
+
+Another possible formulation of the optimal policy would be to adjust the ambiguity set dynamically based on the live-computed divergence, using a radius $\epsilon(\Delta_H)$ to resize the ball around our estimator according to each behavioral signal emitted through a given trajectory. We state this as a possibility but do not pursue it, since the literature suggests that Wasserstein methods do not require absolute continuity and are better with ``black swans'' \parencite{kuhn_wasserstein_2024}.

 \subsubsection{Actor Implementation}
 In our simulation, the "Follower" is implemented as a set of Actors. Each Actor is initialized with a type $\theta$ which samples a specific demand curve $d(p; \theta)$ from the latent distribution. This formalization ensures that our DR-RL agent does not overfit to a single deterministic demand function but learns a policy robust to the distributional uncertainty defined by $\mathcal{U}_\epsilon$.
@@ -242,6 +315,47 @@ As part of our reward engineering we think about the UX factor ($UX \in [0,1]$)

 We also need to think about a policy like taxation to the agents Strategy-Proof Mechanism Design, specifically the Vickrey-Clarke-Groves (VCG) payment rule. We link and prove that this would create an incentive for the dominant strategy to become truth-telling.

+\subsubsection{Pricing Mechanism Summary}
+
+We now present the complete pricing mechanism that integrates the behavioral separability, contamination estimation, and robust optimization components developed in the preceding sections. Algorithm~\ref{alg:phantom_loop_clean} formalizes the defensive pricing loop as a Stackelberg game where the platform (leader) sets prices and the aggregate demand (follower) responds through observed session trajectories.
+
+\begin{algorithm}[t]
+\caption{PHANTOM defensive pricing loop (bachelor-thesis level)}
+\label{alg:phantom_loop_clean}
+\DontPrintSemicolon
+\SetKwInOut{Input}{Input}\SetKwInOut{Output}{Output}
+
+\Input{catalog size \(N\); costs \(c\); reference prices \(p^{ref}\); behavior models \(\bar T_H,\bar T_A\);
+action weights \(\omega\); penalty \(\lambda\); horizon \(T\); sessions per step \(M\)}
+\Output{price/demand trajectory \(\{(p_t,\hat Q_t,\hat\alpha_t)\}_{t=0}^{T-1}\)}
+
+Initialize contamination estimate \(\hat\alpha \leftarrow 0.2\)\;
+
+\For{\(t \leftarrow 0\) \KwTo \(T-1\)}{
+
+  set \(p_t \leftarrow \pi(\cdot) \) %c + (1 - \kappa \hat\alpha)\,(p^{ref}-c)\)\;
+  and clip \(p_t\) to a feasible range (e.g., near cost up to a max margin)\;
+
+
+  \(\hat Q_t \leftarrow 0\), \(\mathcal S_t \leftarrow \emptyset\); \tcp{Observe sessions and compute demand proxy (Eq.~\ref{eq:qhat})}
+  \For{\(m \leftarrow 1\) \KwTo \(M\)}{
+    sample a session trajectory \(\tau_m\) using \(\bar T_H\) or \(\bar T_A\)\;
+    \(\hat Q_t \leftarrow \hat Q_t + \sum_{k}\omega(a_{m,k})\)\;
+    \(\mathcal S_t \leftarrow \mathcal S_t \cup \{\tau_m\}\)\;
+  }
+
+  \tcp{Estimate contamination from behavioral separability}
+  compute \(\hat\alpha \leftarrow \frac{1}{M}\sum_{\tau\in\mathcal S_t} \Big[\sigma\big(\beta(\Delta_H(\tau)-\Delta_A(\tau))\big)\Big]\)\;
+
+  compute \(J_t \leftarrow \text{Revenue}(p_t,\hat Q_t) - \lambda\cdot \text{COILeak}(\hat\alpha)\)\;
+}
+\end{algorithm}
+
+
+The algorithm operates in discrete epochs indexed by $t$. At each epoch, the platform publishes prices (leader move), observes the resulting session trajectories (follower response), and updates its contamination estimate based on behavioral divergence from the learned human and agent transition kernels $\bar{\mathcal{T}}_H$ and $\bar{\mathcal{T}}_A$. The history buffer $\mathcal{L}$ (termed ``Limbo'' in our implementation) enforces the alternating Stackelberg structure by maintaining the temporal sequence of price publications and demand observations.
+
+%The defensive price update in Line 24 implements a contamination-aware margin shrinkage: as the estimated agent contamination $\hat{\alpha}_t$ increases, the margin $(p^{\mathrm{ref}} - c)$ is proportionally reduced by factor $\kappa \in [0,1]$, with projection $\Pi_{\mathcal{P}}$ ensuring prices remain within the feasible set $\mathcal{P}$. In subsequent experiments, this heuristic update is replaced by the DR-RL policy $\pi^*$ from Eq.~\ref{eq:robust_policy}, which optimizes against the Wasserstein ambiguity set $\mathcal{U}_\epsilon$ rather than relying on a fixed margin adjustment rule.
+
 \section{Heuristics as part of neuro-inspired steering systems}

 Steve Burns, superior culliculus (face heuristics) we create this sort of part of the 'brain' + amortized inference.
--- a/paper/src/graphics/SST.png
+++ b/paper/src/graphics/SST.png
--- a/paper/src/main.tex
+++ b/paper/src/main.tex
@@ -1,41 +1,41 @@
 % -*- TeX-master: t -*-
 \documentclass[12pt,letterpaper]{article}

-\pagestyle{plain}
-
 \input{preamble}

 \begin{document}

-\title{Adversarially Distributionally Robust Optimization and Reinforcement Learning for Informed Dynamic Pricing under Strategic Demand Contamination}
-
-\author{
-  Daniel Rösel\thanks{Primary author and student researcher. Email: daniel@alves.world} \\
-  IE University, Madrid, Spain \\[1em]
-  Alberto Martín Izquierdo\thanks{Thesis advisor. Email: amartini@faculty.ie.edu} \\
-  IE University, Madrid, Spain
-}
-
-\date{\today}
-
-\maketitle
+\begin{titlepage}
+    \centering
+    \includegraphics[width=0.3\textwidth]{graphics/SST.png}\\[1cm]
+    \LARGE\textbf{PHANTOM: Pricing Heuristics Against Non-human Transaction Orchestration Mechanisms}\\[0.5cm]
+    \Large\textbf{Daniel Rösel}\\
+    \large\textit{Bachelor of Computer Science \& Artificial Intelligence}\\[0.5cm]
+    \Large\textit{Supervised by:}\\
+    \Large\textbf{Alberto Martín Izquierdo}\\
+    \large\textit{IE University, Madrid, Spain}\\[1cm]
+    \large\today
+\end{titlepage}

 \begin{abstract}
-The primary objective of this thesis is to develop and validate pricing heuristics that protect e-commerce platforms from systematic exploitation by Large Language Model (LLM) agents within dynamic pricing environments. As AI agents increasingly mediate consumer transactions, they enable users to circumvent the Cost of Information (the price premium accumulated through demand signal expression) by conducting reconnaissance in isolated sessions before executing purchases through clean sessions at base prices. This research will make an anticipatory contribution by adapting recommendation system methodologies to distinguish between genuine human browsing behavior and agent-orchestrated information gathering, thereby enabling pricing systems to maintain margin integrity without degrading the user experience for legitimate customers or getting rid of leads generated by LLMs.
+With the accelerated growth of Large Language Model (LLM) agents in e-commerce, a novel adversarial dynamic emerges in digital markets. This paper addresses the vulnerability of dynamic pricing systems to AI intermediaries that decouple the information-gathering stage from transaction execution. By conducting reconnaissance in isolated sessions, agents circumvent the ``Cost of Information'' (COI), defined as the price premium typically accumulated through demand-expression estimators.
+We formally define this phenomenon and derive the Cost of Information Theorem, proving that as the saturation of independent, utility-maximizing agents increases, the platform’s ability to sustain a COI converges to zero, rendering standard dynamic pricing mechanisms incentive-incompatible.
+To respond to this threat, we propose a defensive framework which integrates behavioral economics with Adversarially Distributionally Robust Optimization (DRO). We introduce a custom e-commerce research platform built on a hybrid Kappa-Lambda architecture, designed to capture and simulate high-fidelity controlled interaction trajectories. We further demonstrate through modeling that human and agent behaviors exhibit distinct transition probability kernels, enabling the construction of discriminative models based on Kullback-Leibler divergence.
+These behavioral signals serve as inputs for a Distributionally Robust Reinforcement Learning (DR-RL) agent. We formulate the pricing problem as a Stackelberg game where the learner optimizes against an ambiguity set of demand distributions defined by the Wasserstein distance. This approach allows the pricing policy to remain robust against non-stationary contamination without overfitting to deterministic demand curves. The research validates a mechanism for preserving margin integrity and market equilibrium in an agent-mediated economy, while minimizing degradation to the legitimate human user experience (UX).
 \end{abstract}

+\noindent\textbf{Keywords:} Dynamic Pricing, LLM Agents, Adversarial Machine Learning, E-commerce, Behavioral Detection, Reinforcement Learning

+\vspace{1em}
+\noindent\textbf{Acknowledgments:} Eugene Bykovets, PhD - ETH for helping with problem formulation. This research was supported by the TPU Research Cloud program.
+
+\clearpage
 \input{chapters/01-intro}
 \input{chapters/02-literature-review}
-\input{chapters/03-methodology}
-\input{chapters/04-results}
-\input{chapters/05-discussion}
-\input{chapters/06-conclusion}
-
-
-\section*{Acknowledgments}
-Eugene Bykovets, PhD - ETH for helping with problem formulation.
-Research supported with Cloud TPUs from Google's TPU Research Cloud (TRC).
+% \input{chapters/03-methodology}
+% \input{chapters/04-results}
+% \input{chapters/05-discussion}
+% \input{chapters/06-conclusion}

 \printbibliography

@@ -46,6 +46,6 @@ Research supported with Cloud TPUs from Google's TPU Research Cloud (TRC).
 \item[Agent $A$] An actor of non-human nature, powered by an LLM.
 \item[Human $H$] An individual human with some job to be done.
 \end{description}
-\input{../build/concatenated_code}
+% \input{../build/concatenated_code}

 \end{document}
--- a/paper/src/preamble.tex
+++ b/paper/src/preamble.tex
@@ -1,6 +1,11 @@
+% Encoding
+\usepackage[utf8]{inputenc}
+
 % Math packages (load before fonts to avoid conflicts)
 \usepackage{amsmath}
 \usepackage{amsthm}
+\usepackage{appendix}
+\usepackage[inline]{enumitem}

 % Define theorem environments
 \newtheorem{theorem}{Theorem}
@@ -28,7 +33,8 @@
 \usepackage{xcolor}
 \usepackage[ruled,vlined]{algorithm2e}
 \usepackage{cleveref}
-
+\usepackage{adjustbox}
+\usetikzlibrary{trees}
 % Configure cleveref for algorithm2e
 \crefname{algocf}{Algorithm}{Algorithms}

@@ -49,6 +55,16 @@
    literate={·}{{\textperiodcentered}}1 {−}{{\textminus}}1 {—}{{---}}1 {–}{{--}}1
 }

-% Use biblatex instead of natbib (acmart default)
-\usepackage[backend=bibtex,style=numeric]{biblatex}
+% Use biblatex with authoryear style for in-text citations like (Author, Year)
+\usepackage[backend=bibtex,style=authoryear,natbib=true,maxcitenames=2]{biblatex}
 \addbibresource{bib/references.bib}
+
+% Page headers (SciTech format)
+\usepackage{fancyhdr}
+\setlength{\headheight}{14.5pt}
+\addtolength{\topmargin}{-2.5pt}
+\pagestyle{fancy}
+\fancyhf{}
+\fancyhead[L]{PHANTOM}
+\fancyhead[R]{\thepage}
+\renewcommand{\headrulewidth}{0pt}