Merge pull request #57 from velocitatem/first-last-todos

First last todos
2026-07-15 17:43:36 +00:00 · 2026-04-10 15:03:20 +04:00
parent e62e842faa d36a34ead9
commit d8907eb353
87 changed files with 2573 additions and 1411 deletions
--- a/.github/workflows/latex.yml
+++ b/.github/workflows/latex.yml
@@ -9,6 +9,12 @@ on:
    paths:
      - 'paper/**'
      - '.github/**'
+  workflow_dispatch:
+    inputs:
+      skip_mirrors:
+        description: Skip Codex mirror generation (avoids API quota use)
+        type: boolean
+        default: false
 jobs:
  build:
    runs-on: ubuntu-latest
@@ -24,8 +30,10 @@ jobs:
      - name: Prepare appendix code snapshot
        run: bash paper/concat_code.sh

+      # Repo variable SKIP_CODEX_MIRRORS=true skips on push/PR; workflow_dispatch can set skip_mirrors.
      - name: Generate mirrors with Codex
-        if: ${{ env.OPENAI_API_KEY != '' }}
+        if: ${{ env.OPENAI_API_KEY != '' && vars.SKIP_CODEX_MIRRORS != 'true' && (github.event_name != 'workflow_dispatch' || github.event.inputs.skip_mirrors != 'true') }}
+        continue-on-error: true
        uses: openai/codex-action@v1
        with:
          openai-api-key: ${{ env.OPENAI_API_KEY }}
--- a/.gitignore
+++ b/.gitignore
@@ -34,6 +34,9 @@ dist/
 **/*.parquet
 **/_build/

+# mkdocs output (run make docs.platform locally or rely on CI)
+docs/documentation/
+
 # paper build artifacts
 paper/src/bib/auto
 paper/src/auto/*
--- a/45
+++ b/45
@@ -44,7 +44,7 @@ SWEEP_ENV_LOAD = set -a; [ -f "$(SWEEP_ENV_FILE)" ] && . "$(SWEEP_ENV_FILE)" ||

 .PHONY: help
 help:
-	@echo "pdf.build pdf.watch pdf.clean pdf.genpop pdf.genpop.watch pdf.arxiv | test.backend test.e2e test.all | web.dev | install | train | benchmark | benchmark.simple | benchmark.agent | train.agent | train.bootstrap | stats.lines | manim.render manim.render.all"
+	@echo "pdf.build pdf.watch pdf.clean pdf.genpop pdf.genpop.watch pdf.summary pdf.summary.watch pdf.arxiv | test.backend test.e2e test.all | web.dev | install | train | benchmark | benchmark.simple | benchmark.agent | train.agent | train.bootstrap | stats.lines | docs.platform | manim.defense manim.defense.hq manim.render manim.render.full manim.render.poster manim.render.appendix manim.render.all"
 	@echo "backend.server backend.provider backend.worker | platform.up platform.down platform.logs | docker.train.publish"
 	@echo "data.pull data.push data.whoclicked.publish | study.margin-erosion study.margin-erosion.quick study.margin-erosion.plot"
 	@echo "tpu.ray.bootstrap tpu.ray.deps tpu.ray.verify tpu.ray.teardown"
@@ -102,6 +102,14 @@ pdf.genpop.watch:
 pdf.arxiv:
 	@bash scripts/nx_paper.sh build-arxiv

+.PHONY: pdf.summary
+pdf.summary:
+	@bash scripts/nx_paper.sh build-summary
+
+.PHONY: pdf.summary.watch
+pdf.summary.watch:
+	@bash scripts/nx_paper.sh watch-summary
+
 .PHONY: test.backend
 test.backend:
 	@$(NX) run research:test
@@ -186,6 +194,19 @@ study.margin-erosion:
 study.margin-erosion.quick:
 	python -m engine.studies.margin_erosion_alpha --quick

+DOCS_VENV ?= docs/.venv
+DOCS_MKDOCS := $(DOCS_VENV)/bin/mkdocs
+DOCS_PIP := $(DOCS_VENV)/bin/pip
+
+# Build the MkDocs site configured in docs/mkdocs.yml using the docs venv.
+.PHONY: docs.platform
+docs.platform: $(DOCS_MKDOCS)
+	$(DOCS_MKDOCS) build -f docs/mkdocs.yml
+
+# The venv is keyed on the mkdocs executable, not on the venv directory:
+# `python3 -m venv` creates the directory before pip runs, so a failed install
+# would otherwise leave a directory that looks "done" and is never repaired.
+# Depending on docs/requirements.txt re-runs the install when the pins change.
+$(DOCS_MKDOCS): docs/requirements.txt
+	python3 -m venv $(DOCS_VENV)
+	$(DOCS_PIP) install --upgrade pip
+	$(DOCS_PIP) install -r docs/requirements.txt
+	@touch $@  # pip may be a no-op; bump mtime so the target is newer than requirements.txt
+
 .PHONY: wordcount
 wordcount:
 	@$(NX) run paper:wordcount
@@ -232,12 +253,28 @@ test:
 count-lines:
 	@$(NX) run research:stats

-all:
-	@$(NX) run paper:build
+# Default artifact set for this repo: thesis PDF (same as pdf).
+all: pdf
+
+.PHONY: manim.defense manim.defense.hq manim.render manim.render.full manim.render.poster manim.render.appendix manim.render.all
+# Main defense reel (paper/defense/manim/render_defense); uses paper/defense/.venv when present
+manim.defense:
+	@cd paper/defense/manim && ./render_defense full
+
+manim.defense.hq:
+	@cd paper/defense/manim && ./render_defense full --quality qh

-.PHONY: manim.render manim.render.all
 manim.render:
 	@$(NX) run manim:render

+manim.render.full:
+	@$(NX) run manim:render-full
+
+manim.render.poster:
+	@$(NX) run manim:render-poster
+
+manim.render.appendix:
+	@$(NX) run manim:render-appendix
+
 manim.render.all:
 	@$(NX) run manim:render-all
--- a/README.md
+++ b/README.md
@@ -6,10 +6,12 @@

 Agent-aware dynamic pricing research platform for studying how automated transaction orchestration changes pricing power, and for testing defenses that recover margin while protecting legitimate user experience.

-[![Build PDF](https://github.com/velocitatem/PHANTOM/actions/workflows/latex.yml/badge.svg)](https://github.com/velocitatem/PHANTOM/actions/workflows/latex.yml)
-[![Paper](https://img.shields.io/badge/Paper-PDF-red?logo=adobe-acrobat-reader)](https://pub-d5b94a3c29fd40c6b3881946e463fdb7.r2.dev/thesis-latest.pdf)
-[![Dataset on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/dataset-on-hf-sm.svg)](https://huggingface.co/datasets/velocitatem/whoclickedit)
-[![TPU Research Cloud](https://img.shields.io/badge/TPU%20Research%20Cloud-TRC%20supported-4285F4?logo=googlecloud&logoColor=white)](https://sites.research.google/trc/faq/)
+<p>
+  <a href="https://github.com/velocitatem/PHANTOM/actions/workflows/latex.yml"><img src="https://github.com/velocitatem/PHANTOM/actions/workflows/latex.yml/badge.svg" alt="Build PDF" style="vertical-align: middle;" /></a>
+  <a href="https://pub-d5b94a3c29fd40c6b3881946e463fdb7.r2.dev/thesis-latest.pdf"><img src="https://img.shields.io/badge/Paper-PDF-red?logo=adobe-acrobat-reader" alt="Paper PDF" style="vertical-align: middle;" /></a>
+  <a href="https://huggingface.co/datasets/velocitatem/whoclickedit"><img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/dataset-on-hf-sm.svg" alt="Dataset on Hugging Face" style="vertical-align: middle; position: relative; top: 1px;" /></a>
+  <a href="https://sites.research.google/trc/faq/"><img src="https://img.shields.io/badge/TPU%20Research%20Cloud-TRC%20supported-4285F4?logo=googlecloud&logoColor=white" alt="TPU Research Cloud" style="vertical-align: middle;" /></a>
+</p>

 **Live demos:** [Hotel](https://phantom-hotel.vercel.app) | [Airline](https://phantom-airline.vercel.app) | [Academic page](https://velocitatem.github.io/PHANTOM/)

@@ -140,7 +142,10 @@ flowchart LR
 | `experiments/` | Data processing, ETL ideas, and analysis assets |
 | `docker/` | Dockerfiles for platform services |
 | `tests/e2e/` | Playwright end-to-end tests |
-| `docs/` | Academic project page source |
+| `docs/` | Academic project page (GitHub Pages root) + MkDocs config |
+| `docs/src/` | Markdown sources for the operator documentation site |
+| `docs/documentation/` | MkDocs build output (gitignored; run `make docs.platform`; served at `/documentation/` on Pages) |
+| `SETUP.md` | Unified operator guide: stack, kernels, RL training, thesis refs by chapter |

 ## Operational notes

@@ -149,6 +154,11 @@ flowchart LR
 - Research commands (`make train`, `make benchmark*`, `make train.agent`) auto-load `.env.sweep`.
 - Paper builds call `paper/concat_code.sh` before compilation to flatten code into the appendix.

+## Operator documentation
+
+- Full setup guide (platform + research): [`SETUP.md`](SETUP.md)
+- Hosted operator docs (after `make docs.platform`): […/PHANTOM/documentation/](https://velocitatem.github.io/PHANTOM/documentation/) on GitHub Pages
+
 ## Research artifacts

 - Thesis PDF: `thesis-latest.pdf` or [hosted PDF](https://pub-d5b94a3c29fd40c6b3881946e463fdb7.r2.dev/thesis-latest.pdf)
--- a/SETUP.md
+++ b/SETUP.md
@@ -0,0 +1,298 @@
+# PHANTOM: setup for operators and partners
+
+This guide walks a team from **business context** (what you sell, how you price, what traffic you worry about) through a **running PHANTOM stack**, **behavioral kernels and contamination**, and **RL training / benchmarking**. The math lives in the thesis PDF; here we tie operations to that math without re-deriving it. References to the thesis use **chapter numbers** only (build the PDF locally if you need line-level citations).
+
+**Thesis (PDF):** [thesis-latest.pdf](https://pub-d5b94a3c29fd40c6b3881946e463fdb7.r2.dev/thesis-latest.pdf)
+
+---
+
+## 1. Who this is for / prerequisites
+
+**Audience:** Engineers and researchers who run Docker, a Next.js app, and Python tooling; product or risk stakeholders who define experiment goals and acceptable UX tradeoffs.
+
+**Skills:** Docker Compose, Node/npm, Python 3.8+, basic Kafka/Redis mental model.
+
+**Decide up front:**
+
+- **Vertical vs demo:** The repo ships `hotel` and `airline` storefront modes (`STORE_MODE`). Anything beyond that is custom integration work.
+- **Data residency:** Event streams and training artifacts default to paths under the repo (overridable via `PHANTOM_*` env vars in `lib/config.py`). Decide where logs and models may live before you point production-like traffic at the stack.
+- **Experiment governance:** Who may run human vs agent sessions, how sessions are labeled or weak-labeled for research, and retention policy for interaction logs.
+
+### Theoretical implications
+
+The formal model assumes each session is generated by a latent **actor class** $Y \in \lbrace H,A \rbrace$ (human vs agent). Your deployment choices implicitly assert **which sessions are valid for estimating human vs agent behavior** and whether experimental conditions are stable. If you mix exploratory QA traffic with labeled experiments without recording that fact, you blur the empirical partitions $D_H$ and $D_A$ that the methodology needs for transition kernels and contamination studies. See the **Introduction** (research questions) and **Methodology**, Problem Formalization, in the thesis PDF.
+
+---
+
+## 2. Business fit framing
+
+**What PHANTOM is for:** Studying how **automated browsing and transaction orchestration** interact with **session-based pricing**: behavior generates a demand proxy $\hat{q}$; pricing policies map interaction history to prices; **Cost of Information (COI)** is the premium the platform can sustain above a floor when information is scarce. Agent-mediated **reconnaissance in one session** and **purchase in another** undermines that asymmetry; the thesis proves a **COI erosion** mechanism under many independent price queries.
+
+**What you must supply:**
+
+- A **product catalog** path: defaults assume Supabase-backed product data (`NEXT_PUBLIC_SUPABASE_URL`, `NEXT_PUBLIC_SUPABASE_ANON_KEY`).
+- A plan for **interaction and price events** reaching the ingestion path (backend → Kafka) or an adapter you maintain.
+- Clear **experiment goals:** e.g. compare human vs agent KPIs under the same task, measure margin under varying contamination $\alpha$.
+
+### Theoretical implications
+
+Aggregate demand in the thesis is a **mixture** over human and agent types with contamination $\alpha$ plus noise $\epsilon_t$; see the mixture demand discussion in **Chapter 3 (Methodology)**. COI is defined as $\mathbb{E}[P]-\underline{p}$; the **COI framework** and theorem in the same chapter explain why saturated agent querying collapses extractable premium. Your business scenario determines which **actions** enter $\hat{q}$ and how interpretable $\alpha$ is for your traffic.
+
+---
+
+## 3. Environment and secrets
+
+**Bootstrap files (from repo root):**
+
+```bash
+npm install
+cp .env.example .env
+cp .env.sweep.example .env.sweep
+```
+
+**Core `.env` (platform + web + docker):** See `[.env.example](.env.example)`. You must also set the variables called out in `[README.md](README.md)` for a full stack: `NEXT_PUBLIC_SUPABASE_URL`, `NEXT_PUBLIC_SUPABASE_ANON_KEY`, `AIRFLOW_FERNET_KEY`, `AIRFLOW_SECRET_KEY` (and provider ports per your compose file).
+
+**Training / sweeps (`.env.sweep`):** Used by `make train`, `make benchmark`, sweep agents. Typically `WANDB_API_KEY`, optional `WANDB_ENTITY` / `WANDB_PROJECT`, `GITHUB_TOKEN` for bootstrap flows, `SWEEP_ID` for W&B sweep workers. See `[.env.sweep.example](.env.sweep.example)`.
+
+**Security:** Never commit real `.env` or `.env.sweep` files. Rotate keys if they leak.
+
+### Theoretical implications
+
+Splitting **online platform credentials** (ingestion, catalog, Kafka) from **offline training credentials** (W&B, cloud TPUs, GitHub tokens for workers) mirrors the **hybrid Kappa–Lambda** data loop in the thesis: streaming observation vs batch / long-running training jobs. That split is named in the **Terminology** appendix of the thesis PDF.
+
+---
+
+## 4. Bring-up (commands)
+
+Aligned with `[README.md](README.md)`:
+
+```bash
+npm install
+cp .env.example .env
+cp .env.sweep.example .env.sweep
+# edit .env: Supabase, Airflow keys, etc.
+
+make platform.up
+make web.dev
+```
+
+**Sanity checks:**
+
+
+| Endpoint                                                      | Role                              |
+| ------------------------------------------------------------- | --------------------------------- |
+| `http://localhost:3000`                                       | Next.js storefront                |
+| `http://localhost:5000/health`                                | Backend ingest API                |
+| `http://localhost:5001/health`                                | Pricing provider                  |
+| `http://localhost:8085`                                       | Airflow UI (default compose port) |
+| `http://localhost:8084` or configured `REDPANDA_CONSOLE_PORT` | Kafka console (see your `.env`)   |
+
+
+**Optional tests:** `make test.backend` (with venv/tooling as in Makefile); `make test.e2e` requires backend, web, and Airflow up per README.
+
+### Theoretical implications
+
+A correctly wired stack logs **trajectories** $\tau_s$ (sequences of events) and **price exposure** together. **Chapter 3** defines events $e_{s,k}=(a,i,t)$ and proxies $\hat{q}$ from weighted actions—without joint logging of behavior and quotes, you cannot recover the objects the theory reasons about (Problem Formalization).
+
+---
+
+## 5. Service map
+
+```mermaid
+flowchart LR
+  U[Human / Agent Browser] --> W[Next.js Web App]
+  W -->|Price requests| P[Pricing Provider]
+  W -->|Interaction events| B[Backend Ingest API]
+  B --> K[Kafka]
+  K --> A[Airflow + Worker Jobs]
+  A --> R[Redis Model Registry]
+  P -->|Session/global prices| W
+  E[Research Engine + Experiments] --> A
+  E --> R
+```
+
+
+
+**Ports (typical; confirm in `docker-compose` and `.env`):** `BACKEND_PORT` (5000), `PROVIDER_PORT` (5001), `KAFKA_PORT`, `REDIS_PORT`, Airflow `AIRFLOW_WEBSERVER_PORT` (8085 default), Redpanda console.
+
+### Theoretical implications
+
+The platform **observes** behavioral proxies and quoted prices, not the latent demand curve $d(p\mid\theta)$. The distinction between $\hat{q}$ and true demand is explicit in **Chapter 3**. Misattributing proxy noise to “true” elasticity breaks both estimation and any causal story about COI.
+
+---
+
+## 6. Tailoring to your business
+
+**Storefront mode:** `STORE_MODE=hotel` or `airline` (see `[web/src/lib/config.ts](web/src/lib/config.ts)` and env). This switches catalog and UI, not the core ingestion pattern.
+
+**API base / environment:** `NEXT_PUBLIC_API_BASE`, `NEXT_PUBLIC_APP_ENV` (validated in `config.ts`).
+
+**Paths for data and runs:** Override with `PHANTOM_DATA_DIR`, `PHANTOM_SIM_RUNS_DIR`, `PHANTOM_MODEL_REGISTRY_DIR`, `PHANTOM_COLLECTED_DATA_DIR`, etc. (`[lib/config.py](lib/config.py)`).
+
+**Scope:** A new vertical (custom product ontology, checkout rules, pricing rules) means **new UI, events, and possibly new reward features** in the engine. Budget engineering time; the repo is a research platform, not a turnkey SaaS skin for arbitrary catalogs without code changes.
+
+### Theoretical implications
+
+Transition kernels $\hat{\mathcal{T}}_H,\hat{\mathcal{T}}_A$ are estimated on a **finite action / state space** derived from your instrumentation. Changing catalog depth or event taxonomy changes the MDP state space; old kernel estimates are not portable. See the transition kernel discussion in **Chapter 3**.
+
+---
+
+## 7. Data collection and experiments
+
+**Flow:** Browser → backend → **Kafka** → downstream consumers (Airflow DAGs, notebooks, ETL under `experiments/`). Ensure **session identity**, **item identifiers**, and **action types** are consistent enough to build trajectories.
+
+**Weak labels:** The thesis discusses partitioning data into human vs agent subsets for MLE transition counts. In production you may only have heuristic labels—document bias explicitly.
+
+### Theoretical implications
+
+Distinguishability (sub-question SQ1 in the **Introduction**) asks whether $H$ vs $A$ is identifiable from behavior alone. Your labeling and experimental design determine whether $\Delta_H,\Delta_A$ and $f(\tau)$ are meaningful or dominated by noise. Symbols appear in the **Terminology** appendix ($\Delta_H,\Delta_A$, $f(\tau)$, contamination generator $\mathcal{G}(\alpha)$).
+
+---
+
+## 8. Transition kernels and agent scoring (theory → practice)
+
+**Theory:** Sessions yield trajectories $\tau_s$. For each actor class $y \in \lbrace H,A \rbrace$, the thesis estimates a **Markov transition kernel** by counting transitions and normalizing (MLE):
+
+$$
+\hat{P}(s' \mid s) = \frac{N(s,s')}{\sum_k N(s,k)}
+$$
+
+Human and agent prototypes $\hat{\mathcal{T}}_H,\hat{\mathcal{T}}_A$ support comparing an empirical kernel from a partial trajectory to prototypes (e.g. KL-style divergences $\Delta_H,\Delta_A$) and mapping to a **weak agent probability** $f(\tau)$. See **Chapter 3** and the **Terminology** appendix.
+
+**Code:** `[engine/lib/coi.py](engine/lib/coi.py)` (`compute_agent_probability`: empirical transition counts vs human/agent reference dicts, KL-style terms, mapped via `[lib/agent_probability.py](lib/agent_probability.py)`).
+
+**Optional narrative:** `[blog/02-behavioral-fingerprinting.md](blog/02-behavioral-fingerprinting.md)` walks a concrete study design (not required for operators).
+
+### Theoretical implications
+
+If reference kernels are fit on **stale** or **mislabeled** partitions, $\Delta_H-\Delta_A$ is not interpretable as distinguishability. Ground claims in SQ1 (**Introduction**) and the kernel subsection of **Chapter 3**.
+
+---
+
+## 9. Contamination generator $\mathcal{G}(\alpha)$
+
+**Theory:** Given clean trajectories, $\mathcal{G}(\alpha)$ injects synthetic agent trajectories until the effective mixture reaches contamination $\alpha\in[0,1]$, defining training scenarios for robust policies (**Chapter 3**). Catalog-scale block expansion of kernels is discussed there with validation caveats—treat large product spaces as **research-grade** until your team signs off.
+
+**Code:** `[engine/engine.py](engine/engine.py)` — `MarketEngine` mixes human/agent demand, uses `get_adjusted_transitions` / `sample_behavior_from_transitions`, and `alpha` when combining actor types and building demand proxies (`estimate_demand`). This is the **simulator** path, not a drop-in replacement for your production database.
+
+### Theoretical implications
+
+$\alpha$ in mixture $Q(p)$ is **agentic demand contribution** in the formal model, not necessarily “bot share of page views” unless your instrumentation equates them. Mismeasuring $\alpha$ biases robust objectives tied to a fixed contamination level.
+
+---
+
+## 10. Training and evaluation — local workflow
+
+**Environment:** Python venv via Nx (`make install` / `nx run research:install`). Training commands load `.env.sweep`.
+
+```bash
+make train LOCAL_TRAIN_ARGS='--algo ppo --total-timesteps 50000'
+make benchmark LOCAL_BENCHMARK_ARGS='--tiers static,surge,linear,qtable,ppo --alpha-values 0.0,0.3 --episodes 3 --no-wandb'
+make benchmark.simple
+```
+
+Entrypoints: `[engine/train.py](engine/train.py)`, `[engine/benchmark.py](engine/benchmark.py)`, `[engine/spec.py](engine/spec.py)` (Nx wraps these—see `project.json` / research targets).
+
+**Artifacts:** `[lib/config.py](lib/config.py)` — `PHANTOM_SIM_RUNS_DIR` (default `sim/rl/runs`), `PHANTOM_MODEL_REGISTRY_DIR`, etc.
+
+**TensorBoard (optional):** `[docker-compose.yml](docker-compose.yml)` includes `tensorboard-rl` on host port **6007** (`./sim/rl/runs`) and `tensorboard-ml` on **6006** (`./experiments/ml/runs`).
+
+### Theoretical implications
+
+Local runs instantiate the **offline defense gym**: policies trained on simulator-induced distributions approximate the DR-RL narrative in **Chapter 3**, but hyperparameters ($\lambda$ on COI leakage, $\eta$ on UX, robust radius) change the effective ambiguity set. Cross-check `engine/` against the thesis before claiming figure-for-figure replication.
+
+---
+
+## 11. Training and evaluation — remote / scaled deployment
+
+For **research at scale** (cloud quota and secrets required):
+
+
+| Mechanism                                   | Role                                                                                                                      |
+| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
+| `[submit_ray_job.sh](submit_ray_job.sh)`    | Ray jobs with `.env` injected; `RAY_MODE=single|distributed|benchmark|sweep`. Set the script’s `ROOT` to your clone path. |
+| `make tpu.ray.bootstrap` / `tpu.ray.*`      | TPU Ray bootstrap (`TPU_CONF`, e.g. `tpu_orchestration/configs/v4_spot_us.conf`).                                         |
+| `make train.agent` / `make benchmark.agent` | W&B sweeps: `SWEEP_ID` in `.env.sweep`.                                                                                   |
+| `make train.bootstrap`                      | Worker bootstrap: `REPO_URL`, `SWEEP_ID`, `GITHUB_TOKEN`.                                                                 |
+| `make docker.train.publish`                 | Trainer image (`TRAIN_IMAGE_REF` in Makefile).                                                                            |
+
+
+See `submit_ray_job.sh` for env vars (`WANDB_*`, `PHANTOM_*` TPU toggles).
+
+### Theoretical implications
+
+Distributed training does not change the **definitions** of the Stackelberg game or Wasserstein ambiguity; it changes compute and variance of empirical estimates. Align random seeds and data protocol across nodes or split results explicitly—otherwise you mix distributions in a way a single empirical law $\hat{P}_N$ in the thesis does not describe.
+
+---
+
+## 12. Evaluation, artifacts, and audit trail
+
+**Benchmarks:** `make benchmark*` sweeps tiers and $\alpha$; CLI includes robustness knobs (see default `BENCHMARK_ARGS` in `submit_ray_job.sh`: `--robust-radius`, `--lambda-coi`, `--eta-ux`, etc.).
+
+**Audit trail:** Store `git` SHA, CLI argv, non-secret `.env.sweep` keys, and W&B run IDs with published tables. For scientific claims, cite **Chapters 4–5 (Results, Discussion)** in the thesis PDF.
+
+### Theoretical implications
+
+Evaluation quality equals **simulator fidelity** plus **contamination modeling**. Separate theorem statements (assumption-based) from empirical curves (`engine`-dependent).
+
+---
+
+## 13. Operational suggestions
+
+- **Staging:** Non-production namespaces; separate Kafka topics and Supabase projects where possible.
+- **Rate limits / abuse:** Protect ingest endpoints; respect participant privacy.
+- **Human vs agent sessions:** Comparable cohorts; record experimental condition in metadata.
+- **Contracts:** `tests/e2e/` encodes minimal flows—use when APIs change.
+
+### Theoretical implications
+
+Non-stationary noise $\epsilon_t$ and drifting $\alpha$ confound benchmark interpretation. **Chapter 3** discusses mixture identification: isolate treatments when possible and document confounders when not.
+
+---
+
+## 14. Roadmap and gaps
+
+**In repo:** Local dockerized stack, demo verticals, engine benchmarks, documented env and paths.
+
+**Usually custom:** Production catalog without Supabase, identity/fraud layers, legal review of logging, Kafka/Airflow SLAs, hardening the pricing provider for real money.
+
+**Thesis vs code:** The PDF is the **spec**; not every robustness term or large-catalog kernel construction is production-verified—see caveats in **Chapter 3**.
+
+### Theoretical implications
+
+Theorems in the thesis can be **stronger** than what observational firm logs support. The COI result assumes a clean experimental reading of the pricing policy; live market data may only support weaker claims.
+
+---
+
+## 15. Theory and thesis cross-references (quick index)
+
+Use the **PDF table of contents** with these anchors:
+
+
+| Topic                                                                      | Thesis location                                       |
+| -------------------------------------------------------------------------- | ----------------------------------------------------- |
+| Research questions (margin, distinguishability, contamination, mitigation) | **Introduction**                                      |
+| Sessions, events, $\hat{q}$, mixture $Q(p)$, $\alpha$                      | **Chapter 3** — Problem Formalization, mixture demand |
+| COI definition and erosion theorem                                         | **Chapter 3** — COI framework                         |
+| Transition kernels, MLE, $\mathcal{G}(\alpha)$                             | **Chapter 3**                                         |
+| DR-RL, ambiguity sets, Stackelberg                                         | **Chapter 3**                                         |
+| Symbol glossary (COI leakage, $f(\tau)$, UX, surrogates)                   | **Appendix — Terminology**                            |
+| Empirical results and limitations                                          | **Chapters 4–5**                                      |
+
+
+---
+
+## 16. Quick file index (code)
+
+
+| File                                                                               | Role                                               |
+| ---------------------------------------------------------------------------------- | -------------------------------------------------- |
+| `[engine/lib/coi.py](engine/lib/coi.py)`                                           | KL-style trajectory comparison; agent probability. |
+| `[engine/engine.py](engine/engine.py)`                                             | `MarketEngine`, mixture, demand proxy path.        |
+| `[lib/agent_probability.py](lib/agent_probability.py)`                             | Divergence → probability score.                    |
+| `[lib/config.py](lib/config.py)`                                                   | Paths and ports for artifacts.                     |
+| `[engine/train.py](engine/train.py)`, `[engine/benchmark.py](engine/benchmark.py)` | CLI entrypoints.                                   |
+| `[tpu_orchestration/](tpu_orchestration/)`                                         | TPU configs and helpers.                           |
+
+
+Many offline benchmarks run without a storefront once the research Python environment is installed; connecting production trajectories to kernel estimation still requires aligned instrumentation.
--- a/docs/index.html
+++ b/docs/index.html
@@ -183,6 +183,14 @@
          </div>
          <i class="fas fa-external-link-alt"></i>
        </a>
+        <a href="documentation/" class="work-item">
+          <div class="work-info">
+            <h5>Documentation</h5>
+            <p>Operator setup, configuration, architecture, and research pipeline (MkDocs).</p>
+            <span class="work-venue">Platform</span>
+          </div>
+          <i class="fas fa-book"></i>
+        </a>
        <a href="https://github.com/velocitatem/p4p" class="work-item" target="_blank">
          <div class="work-info">
            <h5>P4P Interaction Layer</h5>
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -0,0 +1,53 @@
+site_name: PHANTOM Platform
+site_description: Operator and research documentation for the PHANTOM dynamic pricing research platform.
+site_url: https://velocitatem.github.io/PHANTOM/documentation/
+site_author: Daniel Rösel
+
+repo_url: https://github.com/velocitatem/PHANTOM
+repo_name: velocitatem/PHANTOM
+
+docs_dir: src
+site_dir: documentation
+strict: true
+
+theme:
+  name: material
+  palette:
+    - scheme: default
+      primary: indigo
+      toggle:
+        icon: material/brightness-7
+        name: Switch to dark mode
+    - scheme: slate
+      primary: indigo
+      toggle:
+        icon: material/brightness-4
+        name: Switch to light mode
+  features:
+    - navigation.instant
+    - navigation.tracking
+    - content.code.copy
+    - search.suggest
+    - search.highlight
+
+nav:
+  - Home: index.md
+  - Setup: platform-setup.md
+  - Business overview: business.md
+  - Architecture: architecture.md
+  - Configuration: configuration.md
+  - Glossary: glossary.md
+  - Roadmap & implementation notes: roadmap.md
+
+markdown_extensions:
+  - pymdownx.snippets:
+      base_path:
+        - ..
+  - pymdownx.superfences
+  - admonition
+  - tables
+  - toc:
+      permalink: true
+
+plugins:
+  - search
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -0,0 +1 @@
+mkdocs-material>=9.5,<10
--- a/docs/src/architecture.md
+++ b/docs/src/architecture.md
@@ -0,0 +1,30 @@
+# Architecture
+
+## System map
+
+```mermaid
+flowchart LR
+  U[Human / Agent Browser] --> W[Next.js Web App]
+  W -->|Price requests| P[Pricing Provider]
+  W -->|Interaction events| B[Backend Ingest API]
+  B --> K[Kafka]
+  K --> A[Airflow + Worker Jobs]
+  A --> R[Redis Model Registry]
+  P -->|Session/global prices| W
+  E[Research Engine + Experiments] --> A
+  E --> R
+```
+
+
+
+## Event and training path (conceptual)
+
+1. **Online:** The browser emits events; the backend publishes to **Kafka**; schedulers and workers consume for ETL and model registry updates.
+2. **Offline:** Notebooks and scripts under `experiments/` transform logs; **`engine/`** runs simulations, training, and benchmarks; artifacts land under paths from [`lib/config.py`](https://github.com/velocitatem/PHANTOM/blob/main/lib/config.py).
+3. **Feedback:** Trained or rule-based policies surface through the **pricing provider** to the web app.
+
+## Where to read more
+
+- Ports and health checks: [README](https://github.com/velocitatem/PHANTOM/blob/main/README.md) and [Configuration](configuration.md).
+- Formal notation for sessions, $\hat{q}$, and mixture demand: **Chapter 3 (Methodology)** in the thesis PDF.
+
--- a/docs/src/business.md
+++ b/docs/src/business.md
@@ -0,0 +1,21 @@
+# Business overview
+
+PHANTOM targets **platform operators and researchers** who need to:
+
+1. **Observe** session-level behavior and price quotes together (trajectories and policies—not just clicks).
+2. **Separate** human-driven demand signals from agent-mediated reconnaissance where possible (distinguishability and contamination $\alpha$ in the thesis).
+3. **Evaluate** pricing policies that remain useful when **Cost of Information (COI)** is under pressure from automated querying (formal COI framework and theorem in the thesis PDF).
+
+## What this product is not
+
+- A drop-in fraud API that returns “bot score” for every request without your event schema.
+- A certified compliance guarantee for regulated pricing: it is a **research stack** with configurable experiments.
+- A hosted SaaS: you run the stack (or adapt components) under your infrastructure policy.
+
+## Self-service story (ideal path)
+
+A team connects their **catalog** (today: Supabase-backed flows in this repo), streams **interaction events** through the ingest path, runs **labeled or weak-labeled** human vs agent sessions, estimates **behavioral kernels**, varies **contamination** in simulation, and **trains or benchmarks** robust policies via `engine/`. Steps and caveats are in [Setup](platform-setup.md) (same content as root `SETUP.md`).
+
+## Thesis link
+
+Problem statement, contributions, and research questions: **Introduction** and abstract in the [thesis PDF](https://pub-d5b94a3c29fd40c6b3881946e463fdb7.r2.dev/thesis-latest.pdf).
--- a/docs/src/configuration.md
+++ b/docs/src/configuration.md
@@ -0,0 +1,63 @@
+# Configuration reference
+
+This page condenses tables from [`README.md`](https://github.com/velocitatem/PHANTOM/blob/main/README.md) and points to code. Authoritative env templates: [`.env.example`](https://github.com/velocitatem/PHANTOM/blob/main/.env.example), [`.env.sweep.example`](https://github.com/velocitatem/PHANTOM/blob/main/.env.sweep.example).
+
+## Core runtime (`.env`)
+
+
+| Variable                        | Purpose                        | Typical value           |
+| ------------------------------- | ------------------------------ | ----------------------- |
+| `STORE_MODE`                    | Web mode (`hotel` / `airline`) | `hotel`                 |
+| `BACKEND_PORT`                  | Backend API                    | `5000`                  |
+| `PROVIDER_PORT`                 | Pricing provider               | `5001`                  |
+| `KAFKA_HOST`                    | Kafka broker host              | `localhost`             |
+| `KAFKA_PORT`                    | Kafka port                     | `9092`                  |
+| `REDIS_PORT`                    | Redis port                     | `6377`                  |
+| `REDPANDA_CONSOLE_PORT`         | Kafka UI                       | `8084` (see compose)    |
+| `NEXT_PUBLIC_SUPABASE_URL`      | Catalog / data                 | required for full stack |
+| `NEXT_PUBLIC_SUPABASE_ANON_KEY` | Catalog / data                 | required                |
+| `AIRFLOW_FERNET_KEY`            | Airflow                        | required                |
+| `AIRFLOW_SECRET_KEY`            | Airflow web                    | required                |
+
+
+Web client validation: [`web/src/lib/config.ts`](https://github.com/velocitatem/PHANTOM/blob/main/web/src/lib/config.ts).
+
+## Training / sweeps (`.env.sweep`)
+
+
+| Variable        | Purpose                                         |
+| --------------- | ----------------------------------------------- |
+| `WANDB_API_KEY` | Weights & Biases                                |
+| `WANDB_ENTITY`  | Optional override                               |
+| `WANDB_PROJECT` | Project name (default `capstone`)               |
+| `GITHUB_TOKEN`  | Bootstrap / workers                             |
+| `SWEEP_ID`      | Sweep agents (`train.agent`, `benchmark.agent`) |
+
+
+## Path overrides (`PHANTOM_*`)
+
+Defined in [`lib/config.py`](https://github.com/velocitatem/PHANTOM/blob/main/lib/config.py):
+
+
+| Variable                     | Default (conceptual)                |
+| ---------------------------- | ----------------------------------- |
+| `PHANTOM_DATA_DIR`           | `data/`                             |
+| `PHANTOM_EXPERIMENTS_DIR`    | `experiments/`                      |
+| `PHANTOM_SIM_RUNS_DIR`       | `sim/rl/runs`                       |
+| `PHANTOM_MODEL_REGISTRY_DIR` | `data/models`                       |
+| `PHANTOM_COLLECTED_DATA_DIR` | `experiments/agents/collected_data` |
+
+
+## Makefile entrypoints
+
+
+| Goal             | Command                                     |
+| ---------------- | ------------------------------------------- |
+| Platform up/down | `make platform.up` / `make platform.down`   |
+| Web dev          | `make web.dev`                              |
+| Train            | `make train` (+ `LOCAL_TRAIN_ARGS`)         |
+| Benchmark        | `make benchmark` (+ `LOCAL_BENCHMARK_ARGS`) |
+| Docs site        | `make docs.platform`                        |
+
+
+See `make help` for the full list.
--- a/docs/src/glossary.md
+++ b/docs/src/glossary.md
@@ -0,0 +1,17 @@
+# Glossary
+
+Short definitions point to the thesis **Terminology** appendix in the [PDF](https://pub-d5b94a3c29fd40c6b3881946e463fdb7.r2.dev/thesis-latest.pdf) for full precision.
+
+| Term | Meaning (operational) |
+| --- | --- |
+| **COI (Cost of Information)** | Expected price premium above a floor under the platform’s policy; thesis KPI for pricing power. |
+| **Trajectory \(\tau_s\)** | Ordered session events used as the behavioral record. |
+| **Demand proxy \(\hat{q}\)** | Weighted aggregation of actions—what the platform observes instead of true demand. |
+| **Contamination \(\alpha\)** | Agent share in the mixture demand model (thesis); not automatically “% of bots” in raw logs. |
+| **Transition kernel \(\hat{\mathcal{T}}\)** | MLE Markov model over behavioral states / events for class \(H\) or \(A\). |
+| **\(\Delta_H,\Delta_A\)** | Divergence scores vs human/agent prototypes (thesis notation). |
+| **\(f(\tau)\)** | Weak agent probability from trajectory (implementation: `engine/lib/coi.py`). |
+| **\(\mathcal{G}(\alpha)\)** | Contamination generator: synthetic agent trajectories to reach mixture level \(\alpha\). |
+| **DR-RL** | Distributionally robust reinforcement learning training narrative in the thesis. |
+| **Ambiguity set / Wasserstein** | Robust optimization neighborhood around an empirical demand law. |
+| **Kappa–Lambda architecture** | Thesis term for streaming (online) vs batch/offline learning loops. |
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -0,0 +1,21 @@
+# PHANTOM platform documentation
+
+Welcome. This site mirrors the **operator and research** documentation for the PHANTOM repository: a research platform for studying **dynamic pricing** under **LLM-mediated browsing and transaction orchestration**, with ties to the academic thesis.
+
+## Start here
+
+| Document | Audience |
+| --- | --- |
+| [Setup](platform-setup.md) | Full walkthrough: Docker/web/ingest, kernels, contamination, RL training, and audit—content from `SETUP.md` in the repo. |
+| [Configuration reference](configuration.md) | Env vars, paths, and Makefile entrypoints in one place. |
+| [Roadmap & implementation notes](roadmap.md) | What is turnkey vs research-grade; thesis vs code. |
+
+## Canonical sources in the repo
+
+- Thesis PDF: [thesis-latest.pdf](https://pub-d5b94a3c29fd40c6b3881946e463fdb7.r2.dev/thesis-latest.pdf)
+- Root onboarding: single file [`SETUP.md`](https://github.com/velocitatem/PHANTOM/blob/main/SETUP.md) (included on this site via snippets—edit that file to change content).
+- Quick start and command tables: [`README.md`](https://github.com/velocitatem/PHANTOM/blob/main/README.md)
+
+## Academic project page
+
+The research landing page (figures, abstract, links) is the site root on GitHub Pages: [velocitatem.github.io/PHANTOM/](https://velocitatem.github.io/PHANTOM/). Open **Documentation** in the Project Links menu there to return to this subsite.
--- a/docs/src/platform-setup.md
+++ b/docs/src/platform-setup.md
@@ -0,0 +1,5 @@
+# Setup
+
+The content below is included from the repository root file `SETUP.md` (single source of truth: platform bring-up, kernels, contamination, RL training, and thesis pointers by chapter).
+
+--8<-- "SETUP.md"
--- a/docs/src/roadmap.md
+++ b/docs/src/roadmap.md
@@ -0,0 +1,26 @@
+# Roadmap & implementation notes
+
+This page is the **honesty pass** from the documentation plan: what clients can expect today versus what remains research-heavy.
+
+## Turnkey in this repository
+
+- **Local stack:** Docker Compose services for backend, Kafka, Redis, Airflow, pricing provider, etc.; Next.js via `make web.dev` (see [Platform setup](platform-setup.md)).
+- **Demo verticals:** `hotel` and `airline` storefront modes.
+- **Engine:** Benchmarks and training entrypoints (`make train`, `make benchmark`), KL-based agent scoring in [`engine/lib/coi.py`](https://github.com/velocitatem/PHANTOM/blob/main/engine/lib/coi.py), simulator mixing in [`engine/engine.py`](https://github.com/velocitatem/PHANTOM/blob/main/engine/engine.py).
+- **Orchestration hooks:** Ray/TPU scripts (`submit_ray_job.sh`, `make tpu.ray.*`), W&B sweep agents, Docker trainer publish target.
+
+## Usually requires custom engineering
+
+- **Non-Supabase catalog** or checkout flows without adapting the web + backend contracts.
+- **Production SLAs** on Kafka, schema registry, or PII boundaries for your jurisdiction.
+- **Tight coupling** to a legacy pricing engine without mapping its API to the provider abstraction.
+
+## Thesis vs code
+
+- The **thesis** states theorems and constructions (COI erosion, kernels, $\mathcal{G}(\alpha)$, DR-RL).
+- The **codebase** implements a **subset** of that story for experiments: verify CLI flags and simulator assumptions before claiming 1:1 equivalence with every equation.
+- **Catalog-scale kernel expansion** is discussed in **Chapter 3** with explicit validation caveats—do not assume row-stochasticity and Markov structure are automatically preserved at full product cardinality without review.
+
+## Suggested client messaging
+
+Position PHANTOM as a **reproducible research and evaluation stack** for agent-aware pricing, with a path to custom integration—not as a black-box “turn on anti-agent pricing” product without data and engineering investment.
--- a/docs/static/images/banner.svg
+++ b/docs/static/images/banner.svg
@@ -41,7 +41,7 @@

        <!-- Markers p and E[P] -->
        <line x1="150" y1="340" x2="150" y2="160" stroke="#E37862" stroke-width="2" stroke-dasharray="6,4"/>
-        <text x="150" y="375" font-family="Georgia" font-style="italic" font-size="22" fill="#E37862" text-anchor="middle">p</text>
+        <text x="150" y="375" font-family="Georgia" font-style="italic" font-size="22" fill="#E37862" text-anchor="middle"><tspan text-decoration="underline">p</tspan></text>

        <line x1="260" y1="340" x2="260" y2="160" stroke="#85B589" stroke-width="2" stroke-dasharray="6,4"/>
        <text x="260" y="375" font-family="Georgia" font-style="italic" font-size="22" fill="#85B589" text-anchor="middle">E[P]</text>
@@ -49,7 +49,7 @@
        <!-- COI Annotation -->
        <line x1="150" y1="150" x2="260" y2="150" stroke="#E37862" stroke-width="2" marker-start="url(#arrow)" marker-end="url(#arrow)"/>
        <text x="310" y="138" font-size="16" fill="#E37862" text-anchor="middle">average information rent</text>
-        <text x="310" y="118" font-family="Georgia" font-style="italic" font-size="22" fill="#E37862" font-weight="bold" text-anchor="middle">COI := E[P] - p</text>
+        <text x="310" y="118" font-family="Georgia" font-style="italic" font-size="22" fill="#E37862" font-weight="bold" text-anchor="middle">COI = E[P] - <tspan text-decoration="underline">p</tspan></text>
    </g>

    <!-- Bottom: Agent Saturation -->
--- a/engine/lib/coi.py
+++ b/engine/lib/coi.py
@@ -1,12 +1,15 @@
 import numpy as np
 from typing import Dict

+from lib.agent_probability import DEFAULT_AGENT_PRIOR, estimate_agent_probability
+

 def compute_agent_probability(
    trajectory: list,
    human_transitions: Dict,
    agent_transitions: Dict,
    temperature: float = 1.0,
+    prior_agent: float = DEFAULT_AGENT_PRIOR,
 ) -> float:
    """estimate agent probability via KL divergence between trajectory transitions and reference models

@@ -18,10 +21,10 @@ def compute_agent_probability(
        agent_transitions: reference transition dict from agent MDP (event->event->prob)

    returns:
-        agent probability in [0, 1] via softmax over KL divergences
+        agent probability in [0, 1] via sigma((delta_h - delta_a) / T)
    """
    if len(trajectory) < 2:
-        return 0.0  # insufficient data, assume human
+        return float(prior_agent)

    # build empirical transition distribution from trajectory
    trans_counts = {}
@@ -54,11 +57,12 @@ def compute_agent_probability(
    kl_human = kl_div(empirical, human_transitions)
    kl_agent = kl_div(empirical, agent_transitions)

-    # convert to probability via softmax (lower KL = higher prob)
-    t = float(max(temperature, 1e-6))
-    exp_h = np.exp(-kl_human / t)
-    exp_a = np.exp(-kl_agent / t)
-    return float(exp_a / (exp_h + exp_a + 1e-10))
+    return estimate_agent_probability(
+        delta_h=kl_human,
+        delta_a=kl_agent,
+        temperature=temperature,
+        prior_agent=prior_agent,
+    )


 def extract_purchases(trajectories: list) -> Dict[int, int]:
--- a/lib/separability.py
+++ b/lib/separability.py
@@ -7,10 +7,9 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, Iterable, List, Sequence

-import joblib
 import numpy as np

-from experiments.ml.arch import featurize_trajectory
+from lib.agent_probability import DEFAULT_AGENT_PRIOR, estimate_agent_probability


 DEFAULT_ARTIFACT_DIR = Path("data/separability")
@@ -18,11 +17,7 @@ DEFAULT_ARTIFACT_DIR = Path("data/separability")

@dataclass
 class SeparabilityArtifacts:
-    scaler: object
-    classifier: object
-    states: List[str]
    event_transitions: Dict[str, Dict[str, float]]
-    feature_dim: int


 def _normalize_events(raw_events: Sequence[object]) -> List[object]:
@@ -36,7 +31,9 @@ def _normalize_events(raw_events: Sequence[object]) -> List[object]:
    return events


-def _event_transition_distribution(events: Sequence[object]) -> Dict[str, Dict[str, float]]:
+def _event_transition_distribution(
+    events: Sequence[object],
+) -> Dict[str, Dict[str, float]]:
    counts: Dict[str, Dict[str, int]] = {}
    for src_evt, dst_evt in zip(events, events[1:]):
        src_name = getattr(src_evt, "eventName", "unknown")
@@ -47,11 +44,15 @@ def _event_transition_distribution(events: Sequence[object]) -> Dict[str, Dict[s
    distribution: Dict[str, Dict[str, float]] = {}
    for src, dsts in counts.items():
        total = float(sum(dsts.values()))
-        distribution[src] = {dst: val / total for dst, val in dsts.items()} if total else {}
+        distribution[src] = (
+            {dst: val / total for dst, val in dsts.items()} if total else {}
+        )
    return distribution


-def _kl_divergence(p: Dict[str, Dict[str, float]], q: Dict[str, Dict[str, float]]) -> float:
+def _kl_divergence(
+    p: Dict[str, Dict[str, float]], q: Dict[str, Dict[str, float]]
+) -> float:
    eps = 1e-10
    total = 0.0
    for src, dsts in p.items():
@@ -61,28 +62,28 @@ def _kl_divergence(p: Dict[str, Dict[str, float]], q: Dict[str, Dict[str, float]
    return float(total)


-def load_artifacts(artifact_dir: Path | str = DEFAULT_ARTIFACT_DIR) -> SeparabilityArtifacts:
+def load_artifacts(
+    artifact_dir: Path | str = DEFAULT_ARTIFACT_DIR,
+) -> SeparabilityArtifacts:
    artifact_dir = Path(artifact_dir)
-    scaler_path = artifact_dir / "scaler.joblib"
-    model_path = artifact_dir / "classifier.joblib"
    metadata_path = artifact_dir / "metadata.json"

-    if not (scaler_path.exists() and model_path.exists() and metadata_path.exists()):
+    if not metadata_path.exists():
        raise FileNotFoundError(
-            f"Separability artifacts not found in {artifact_dir}. Run sim.strong_learner.train first."
+            f"Separability metadata not found in {artifact_dir}. Provide metadata.json with event transitions."
        )

-    scaler = joblib.load(scaler_path)
-    classifier = joblib.load(model_path)
    with open(metadata_path, "r", encoding="utf-8") as fin:
        metadata = json.load(fin)

+    transitions = metadata.get("event_transitions")
+    if not isinstance(transitions, dict):
+        raise ValueError(
+            "metadata.json must contain an 'event_transitions' object with 'human' and 'agent' kernels"
+        )
+
    return SeparabilityArtifacts(
-        scaler=scaler,
-        classifier=classifier,
-        states=list(metadata["reference_states"]),
-        event_transitions=metadata["event_transitions"],
-        feature_dim=int(metadata["feature_dim"]),
+        event_transitions=transitions,
    )


@@ -92,37 +93,44 @@ def score_session(
 ) -> dict:
    events = _normalize_events(raw_events)
    if not events:
-        return {"prob_agent": 0.0, "delta_h": 0.0, "delta_a": 0.0}
-
-    reference_mdp = {"states": artifacts.states}
-    features = featurize_trajectory(events, mdp=reference_mdp, input_dim=artifacts.feature_dim)
-    scaled = artifacts.scaler.transform(features.reshape(1, -1))
-    prob_agent = float(artifacts.classifier.predict_proba(scaled)[0, 1])
+        return {
+            "prob_agent": float(DEFAULT_AGENT_PRIOR),
+            "delta_h": 0.0,
+            "delta_a": 0.0,
+            "gap": 0.0,
+        }

    session_dist = _event_transition_distribution(events)
    delta_h = _kl_divergence(session_dist, artifacts.event_transitions.get("human", {}))
    delta_a = _kl_divergence(session_dist, artifacts.event_transitions.get("agent", {}))
+    gap = float(delta_h - delta_a)
+    prob_agent = estimate_agent_probability(delta_h=delta_h, delta_a=delta_a)

    return {
        "prob_agent": prob_agent,
        "delta_h": delta_h,
        "delta_a": delta_a,
+        "gap": gap,
    }


-def estimate_alpha(prob_agent: float, delta_h: float, delta_a: float, temperature: float = 1.0) -> float:
-    divergence_mass = delta_h + delta_a
-    if divergence_mass <= 1e-8:
-        return float(prob_agent)
-
-    ratio = delta_a / divergence_mass
-    blended = 0.5 * prob_agent + 0.5 * ratio
-    if temperature <= 0:
-        return float(np.clip(blended, 0.0, 1.0))
-
-    scaled = 1.0 / (1.0 + np.exp(-temperature * (blended - 0.5)))
-    return float(np.clip(scaled, 0.0, 1.0))
+def estimate_alpha(
+    prob_agent: float,
+    delta_h: float,
+    delta_a: float,
+    temperature: float = 1.0,
+    prior_agent: float = DEFAULT_AGENT_PRIOR,
+) -> float:
+    _ = prob_agent
+    return estimate_agent_probability(
+        delta_h=delta_h,
+        delta_a=delta_a,
+        temperature=temperature,
+        prior_agent=prior_agent,
+    )


-def score_sessions(raw_sessions: Iterable[Sequence[object]], artifacts: SeparabilityArtifacts) -> List[dict]:
+def score_sessions(
+    raw_sessions: Iterable[Sequence[object]], artifacts: SeparabilityArtifacts
+) -> List[dict]:
    return [score_session(events, artifacts) for events in raw_sessions]
--- a/paper/.latexmkrc
+++ b/paper/.latexmkrc
@@ -1,6 +1,26 @@
 $pdf_mode = 1;
 $pdflatex = 'pdflatex -synctex=1 -interaction=nonstopmode -file-line-error %O %S';
-$bibtex_use = 2;                       # run bibtex when needed
-$bibtex   = 'bibtex %O %B';
+$bibtex_use = 2;                       # run biber when biblatex .bcf changes
+# biber cwd is paper/build; scripts/nx_paper.sh symlinks ../build/bib -> ../src/bib so
+# datasources log as bib/references.bib and latexmk's -e check works from paper/src
+$biber    = 'biber %O %S';
+
+# Stale latexmk db: biblatex uses biber + .bcf, but the fdb can keep a "bibtex" rule after a bad
+# run. Then biber never runs and citations stay undefined. Read whole fdb (small) so the rule
+# line is never missed after a long dependency list.
+for my $job (qw(main main-genpop summary)) {
+    my $bcf = "../build/$job.bcf";
+    my $bbl = "../build/$job.bbl";
+    my $fdb = "../build/$job.fdb_latexmk";
+    next unless -e $fdb && -e $bcf;
+    my $drop = !-e $bbl;
+    if ( !$drop && open my $fh, '<', $fdb ) {
+        local $/;
+        my $body = <$fh>;
+        close $fh;
+        $drop = 1 if defined $body && $body =~ /\["bibtex $job"\]/;
+    }
+    unlink $fdb if $drop;
+}
 $pdf_previewer = 'zathura %O %S';
 $clean_ext = 'synctex.gz bbl bcf run.xml fls fdb_latexmk glg glo gls ist blg lof lot out toc';
--- a/paper/defense/manim/requirements.txt
+++ b/paper/defense/manim/requirements.txt
@@ -1,2 +0,0 @@
-manim>=0.18,<1
-numpy>=1.24
--- a/paper/project.json
+++ b/paper/project.json
@@ -44,6 +44,23 @@
        "command": "bash scripts/nx_paper.sh build-arxiv",
        "cwd": "."
      }
+    },
+    "build-summary": {
+      "executor": "nx:run-commands",
+      "outputs": [
+        "{projectRoot}/build/summary.pdf"
+      ],
+      "options": {
+        "command": "bash scripts/nx_paper.sh build-summary",
+        "cwd": "."
+      }
+    },
+    "watch-summary": {
+      "executor": "nx:run-commands",
+      "options": {
+        "command": "bash scripts/nx_paper.sh watch-summary",
+        "cwd": "."
+      }
    }
  },
  "tags": [
--- a/paper/src/auto/main.el
+++ b/paper/src/auto/main.el
@@ -16,7 +16,13 @@
    "chapters/04-results"
    "chapters/05-discussion"
    "chapters/06-conclusion"
+    "chapters/acknowledgements"
    "article"
-    "art12"))
+    "art12")
+   (LaTeX-add-labels
+    "app:compute_budget"
+    "tab:compute_derivation"
+    "app:kl_zeros"
+    "app:revelation_log"))
 :latex)

--- a/paper/src/chapters/01-intro.tex
+++ b/paper/src/chapters/01-intro.tex
@@ -7,18 +7,19 @@
 %% \end{figure}

 \section{Introduction}
+\label{sec:introduction}

-In this paper we present an exploration and defense against the presence of new commercial entities in digitally powered platforms, preserving market equilibrium in the age of AI. This research establishes the following contributions: definition and formalization of non-human transactors in e-commerce platforms, development of a testing-ground for capturing the behavioral essence of these transactors across a large variety of digital systems, construction of a discriminative model (to prove distinguishability) as a strong learner for downstream mitigation of contamination by non-human entities, translation of such learned distinguishability into existing dynamic pricing machine learning loops, and finally establishment of a high-level KPI-affecting causal effect and cost-saving framework for the future of internet commerce in the presence of such non-human learners.
+In this paper we present an exploration and defense against the presence of new commercial entities in digitally powered platforms, preserving market equilibrium in the age of AI. This research establishes the following contributions: definition and formalization of non-human transactors in e-commerce platforms, development of a testing-ground for capturing the behavioral essence of these transactors across a large variety of digital systems, construction of a discriminative model (to prove distinguishability) as a guiding teacher for downstream mitigation of contamination by non-human entities, translation of such learned distinguishability into existing dynamic pricing machine learning loops, and finally establishment of a high-level KPI-affecting causal effect and cost-saving framework for the future of internet commerce in the presence of such non-human learners.

-This research effort touches a large variety of domains, spanning behavioral economics for understanding the rationality of behavior as theorized by the concept of homo economicus, agent-based modeling to translate our learned distinguishability into disjoint dynamic pricing systems, reinforcement learning which serves as the SOTA for price-learners, and dynamic pricing and market equilibrium theory to understand the risks of possible supra-competitive pricing phenomena in cases of adversarial pricing systems driving the market out of equilibrium. \footnote{Given the rapid evolution of the field we acknowledge all developments with a cutoff set at the date of March 1st 2026.}
+This research effort touches a large variety of domains, spanning behavioral economics for understanding the rationality of behavior as theorized by the concept of homo economicus, agent-based modeling to translate our learned distinguishability into disjoint dynamic pricing systems, reinforcement learning which serves as the SOTA for price-learners, and dynamic pricing and market equilibrium theory to understand the risks of possible supra-competitive pricing phenomena in cases of adversarial pricing systems driving the market out of equilibrium. \footnote{Given the rapid evolution of the field we acknowledge all developments with a knowledge cutoff set at the date of March 1st 2026.}

 \subsection{Motivation and Market Context}

 The current innovation boom in generative artificial intelligence and its applications to knowledge-based work tasks has brought many competing technologies for browser-use automation, with benchmarks and evaluations \parencite{xia_evaluation-driven_2025} motivating the development of capabilities focused on commercial research, understanding, and transaction execution \parencite{xie_osworld_2024}. The ``AI Agent'' market is forecasted to grow from around USD 5-8 billion in 2025 to USD 42-52 billion by 2030. This surge reflects adoption in e-commerce, customer service, and enterprise automation, where agents handle interactions previously done by humans, raising the question of how these systems should be designed for future robustness as well as how to maintain a competitive edge in the analytical components of e-commerce platforms \parencite{markntel_advisors_global_2025}.

-The key stakeholders affected by the threat of increasing agent-driven traffic include online businesses and platform operators (especially in bot-heavy sectors like retail, travel, and financial services), their security, fraud, and engineering teams, end users whose accounts and data are exposed and whose experience degrades, regulators and legal stakeholders responding to breaches and fraud, and the attackers or bot operators driving the automation \parencite{imperva_rapid_2025}.
+The key stakeholders affected by the threat of increasing agent-driven traffic include online businesses and platform operators (especially in bot-heavy sectors like retail, travel, and financial services), their security, fraud, and engineering teams, end users whose accounts and data are exposed and whose user experience degrades, regulators and legal stakeholders responding to breaches and fraud, and the attackers or bot operators driving the automation \parencite{imperva_rapid_2025}.

-The industry has already seen legal action in cases like Amazon against Perplexity \parencite{ghaffary_amazon_2025}, stemming from the difficulty of identifying traffic from hybrid systems like the Commet browser. This paper explores such systems to better understand what the interaction data looks like and what it means for dynamic pricing and recommendation systems downstream. This observed impact indicates a need for prevention of secondary negative effects on the ``legacy'' systems which power modern revenue sources for many companies. Dynamic pricing algorithms rely on directly translating demand features $q$ to new price assignments $\hat{p}$ across a catalogue of products of size $N$. This opens opportunities to design a \textit{tabula rasa} of digital market mechanisms that will shape the future of commerce in the age of artificial intelligence.
+The industry has already seen legal action in cases like Amazon against Perplexity \parencite{ghaffary_amazon_2025}, stemming from the difficulty of identifying traffic from hybrid systems like the Comet browser. This paper explores such systems to better understand what the interaction data looks like and what it means for dynamic pricing and recommendation systems downstream. This observed impact indicates a need for prevention of secondary negative effects on the ``legacy'' systems which power modern revenue sources for many companies. Dynamic pricing algorithms rely on directly translating demand features $q$ to new price assignments $\hat{p}$ across a catalogue of products of size $N$. Our exploration of this field opens opportunities to design a \textit{tabula rasa} of digital market mechanisms that will shape the future of commerce in the age of artificial intelligence.

 \subsection{Solution Space Overview}
 Dynamic pricing systems, as presented by \textcite{mueller_low-rank_2019}, often deal with sparse low-rank data of demand signals which, combined with contamination from agents, creates complex interactions that impact pricing. To further complicate the problem, certain commercial settings such as the one presented by \textcite{amjad_censored_2017} must address the true demand of products under censored observations. This provides a formulation for handling demand in our case with multiple kinds of commercial mediators: $\hat{q} \gets q_A + q_H$ where $q_A$ represents the distribution of demand generated by agentic mediators and $q_H$ represents that of true human demand, these are two distinct populations with divergent objective functions.
@@ -26,13 +27,14 @@ Dynamic pricing systems, as presented by \textcite{mueller_low-rank_2019}, often
 We formally define interaction data as coming from some actor which can either be an agent ($A$) or human ($H$). For purposes of this research, an agent is an algorithmic loop with the ability to access a web platform and perform actions such as clicks, scrolls, and input field fills. The loop terminates when the internal large language model judges the provided task definition as complete. A detailed breakdown can be found in \cref{algagent-loop}.

 \subsection{Research Questions}
+\label{sec:research_questions}

-This dissertation is organized around one main research question and three supporting sub-questions:
+This dissertation is organized around one main research question and three supporting pillar questions:
 \begin{enumerate}
    \item[\textbf{Main RQ}] How can dynamic pricing systems preserve margin integrity when transaction orchestration is increasingly mediated by non-human agents?
-    \item[\textbf{SQ1}] \textit{Distinguishability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting?
-    \item[\textbf{SQ2}] \textit{Theoretical Impact}: What is the formal relationship between agent contamination levels and the erosion of pricing power in dynamic pricing systems?
-    \item[\textbf{SQ3}] \textit{Robust Mitigation}: How can pricing policies be constructed to maintain margin integrity under unknown and non-stationary levels of agent contamination?
+    \item[\textbf{SQ1}] \hypertarget{sq1}{}\textit{Distinguishability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting?
+    \item[\textbf{SQ2}] \hypertarget{sq2}{}\textit{Theoretical Impact}: What is the formal relationship between agent contamination levels and the erosion of pricing power in dynamic pricing systems?
+    \item[\textbf{SQ3}] \hypertarget{sq3}{}\textit{Robust Mitigation}: How can pricing policies be constructed to maintain margin integrity under unknown and non-stationary levels of agent contamination?
 \end{enumerate}


@@ -64,4 +66,6 @@ Extract final result $r$ from terminal state\;
 \end{algorithm}


-The previously described goal of distinguishability allows us to formulate a task which entails taking raw interaction data for either actor and creating a composite demand estimate $\hat{q}$. We propose a robust optimization objective defined in our methodology, transforming the pricing problem into a form of Distributionally Robust Optimization \parencite{kuhn_distributionally_2025} where the learner must guard against adversarial contamination in observed demand distributors. In this setting we must learn to make decision that perform under the assumption of not having a single estimated probability distribution but under an ambiguity set of any distribution, of which we have limited information. In our case as stated is a mixture of distributions with a parameter which is unknown and non-stationary.
+The previously described goal of distinguishability allows us to formulate a task which entails taking raw interaction data for either actor and creating a composite demand estimate $\hat{q}$. We propose a robust optimization objective defined in our methodology, transforming the pricing problem into a form of distributionally robust optimization \parencite{kuhn_distributionally_2025} in which the learner guards against adversarial contamination in observed demand \emph{distributions}. The decision rule (in the policy) must perform when the data-generating mechanism is not a single known distribution but any member of an ambiguity set described only partially. Here that mechanism is a mixture whose weight and components need not be stationary.
+
+Our work's contributions are best understood as a dependency chain centered around dynamic pricing. The work begins with a formal account of why a non-human mediator threatens pricing power, then we construct a platform capable of generating the interaction data needed for our study of that threat. On top of that \textit{substrate} we build behavioral models to determine whether human and agent traffic can be separated. The resulting contamination estimate is then translated into the pricing core itself, where it serves as a key signal for robust control under distributional uncertainty. The breadth of the thesis is therefore a consequence of the problem structure: the theoretical, behavioral, systems, and control components are not separate projects, but successive requirements of a single argument.
--- a/paper/src/chapters/02-literature-review.tex
+++ b/paper/src/chapters/02-literature-review.tex
@@ -1,15 +1,16 @@
 \section{Literature Review}
+\label{sec:literature_review}

-To better understand all wedges of the current works, we must start by exploring the nature of agents, agentic computer use and web automation, complementing that with economic reasoning and strategic interaction. The final surface to cover, leads us to data-driven dynamic pricing under uncertainty. The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on. This latter case of agents shopping is currently pending legal action in the case of \textcite{noauthor_amazoncom_2026} which is currently being treated as a violation of the Computer Fraud and Abuse Act. The introduction of these mediating actor entities into economic systems, is further creating a threat of false-name bidding \parencite{yokoo_effect_2004}, which prior research has explored in a trading context. Other research on pseudonyms in dynamic systems, demonstrate whitewashing in AI agents which can ignore defensive mechanisms by re-entry with different identities \parencite{feldman_free-riding_2004}. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for distinguishing non-human reconnaissance from genuine human demand expression and integrating that distinguishability into pricing heuristics without degrading legitimate user experience (in our research tracked by the user-experience index). This gap, is what our contribution aims to address, particularly for the aforementioned stakeholder groups.
+To situate the work we review agents and agentic computer use, web automation, economic reasoning, and strategic interaction, then turn to data-driven dynamic pricing under uncertainty. The main technical risk is not ``agents buying things'' in isolation but agents reshaping the behavioral and demand signals on which downstream pricing depends. Related litigation is already underway---for example \textcite{noauthor_amazoncom_2026} under the Computer Fraud and Abuse Act. Mediating actors surface classic concerns such as false-name bidding \parencite{yokoo_effect_2004} or pseudonymous re-entry which can whitewash reputation and weaken defenses \parencite{feldman_free-riding_2004}. Dynamic pricing assumes demand proxies are behaviorally meaningful, whereas classical bot detection targets security and access control. The gap we target is a principled way to separate non-human reconnaissance from genuine human demand expression and to fold that signal into pricing without degrading legitimate users (we track harm with a user-experience index), for the stakeholders named in the introduction.

 \subsection{Agent Taxonomy and Definitions}

 An agent in the context of artificial intelligence is generally defined by anything that can reason and act upon observations of its environments (collected through some sensory inputs) and carry out actions through effectors. Moreover, a rational agent is an entity that is capable of perceiving the world around them and taking actions to advance specified goals. This definition by \textcite{russell_artificial_2021} is further developed in an economic context by \textcite{parkes_economic_2015}, suggesting AI research attempts to construct a synthetic \textit{homo economicus}, which may also be termed \textit{machina economicus}.
 A specific class or taxon of this \textit{machina economicus}, the Large Language Model (LLM) agent, is defined as an autonomous system capable of achieving goals and adapting post-training, often without needing explicit code or fundamental model changes \parencite{xia_evaluation-driven_2025}.

-We must however acknowledge the current SOTA as presented by OSWORLD simulations by \textcite{xie_osworld_2024} have demonstrated that multi-modal tasks across desktop and web interaction modes, have a top-performing score of only 12.24\% success, whereas humans have a higher 72\% success rate; this is linked to the lack of grounding of these agents and their inability of handling unexpected errors. This weakness matters for this research because it clarifies the near-term threat model: practical exploitation does not require a fully competent ``computer assistant'', only enough automation to perform high-volume reconnaissance actions (search/filter/open product pages, probe availability/price boundaries) that can contaminate behavioral signals. With the expected growth of these capabilities, this threat only becomes more perilous to revenue management systems.
+We must however acknowledge that OSWORLD simulations by \textcite{xie_osworld_2024} report a top success rate of only 12.24\% on multi-modal desktop and web tasks, versus about 72\% for humans, reflecting limited grounding and brittle recovery from unexpected errors. This weakness matters for this research because it clarifies the near-term threat model: practical exploitation does not require a fully competent ``computer assistant'', only enough automation to perform high-volume reconnaissance actions (search/filter/open product pages, probe availability/price boundaries) that can contaminate behavioral signals. With the expected growth of these capabilities, this threat only becomes more perilous to revenue management systems.

-We model an agent session as producing some events with lower in-session conversion levels relative to humans, this we state in our assumption that $P(\text{purchase} \vert A) < P(\text{purchase} \vert H)$ but with a potentially higher volatility in $\hat{q}$, which we observe through the look-to-book metrics in our simulation.
+We model agent sessions as producing lower in-session conversion than humans, i.e.\ $P(\text{purchase} \vert A) < P(\text{purchase} \vert H)$, with potentially higher volatility in $\hat{q}$, which we proxy with look-to-book metrics in the simulator.

 \subsection{Economic Agents: From Homo Economicus to Machina Economicus}

@@ -21,9 +22,9 @@ A HAP (HTTP Agent Profile) protocol has been developed as an internet draft by \

 \subsection{Problem Evidence and Market Impact}

-The statistical issue of contamination in dynamic pricing systems that observe demand features as a means to update prices has been documented in various previous contexts. The airline industry (which has accounted for 24\% of observed disruptions) has seen malicious activity with a measureable impact on skewing key performance indicators by behavior visible in the look-to-book metrics. Excessive reconnaissance traffic inflates search volume without corresponding completed bookings, thereby skewing demand forecasts and disrupting dynamic pricing models. Demand proxies have also been observed to cause significant threat to inventory management by creating artificial scarcity that distorts the demand-supply relationships in the enterprise model. Censored demand as shown by \textcite{amjad_censored_2017} can also be observed in low-bias demand under-estimation caused by a distortion effect coming from non-human traffic data \parencite{imperva_rapid_2025}.
+Contamination in dynamic pricing systems that observe demand features to update prices appears across several industries. Aviation (about 24\% of observed disruptions in one industry survey) illustrates how malicious or scripted traffic can skew KPIs visible in look-to-book metrics. Excessive reconnaissance traffic inflates search volume without corresponding completed bookings, thereby skewing demand forecasts and disrupting dynamic pricing models. Demand proxies have also been observed to cause significant threat to inventory management by creating artificial scarcity that distorts the demand-supply relationships in the enterprise model. Censored demand as shown by \textcite{amjad_censored_2017} can also be observed in low-bias demand under-estimation caused by a distortion effect coming from non-human traffic data \parencite{imperva_rapid_2025}.

-When dynamic pricing algorithms operate on highly contaminated or noisy data, the risk grows significantly in creating inaccurate price inferences. The emergent mitigation driven by un-informed reward and regret signals might lead to price suppression for sales continuity which results in harming margins and resulting in a revenue loss. System that poorly fit undesired behavior might result in price gouging, which calls for strong guardrails while preserving targeted business strategy \parencite{mullapudi_reinforcement_2025}.
+When dynamic pricing algorithms train on highly contaminated or noisy data, the risk of mis-inference rises and revenue is threatened. Mis-specified reward and regret signals can push prices down to preserve volume, eroding margins, while a poor fit to legitimate demand can produce the opposite failure mode, price gouging. Both failure modes call for guardrails that preserve commercial intent \parencite{mullapudi_reinforcement_2025}.


 %Documented instances of agent-driven market disruptions - Quantitative evidence of pricing manipulation - Case studies from affected industries
@@ -31,11 +32,11 @@ When dynamic pricing algorithms operate on highly contaminated or noisy data, th
 \subsection{Theoretical Foundations: Economic Parallels}


-Early hints of exploration of prices in a standard English auction explored by \textcite{varian_economic_1995} which hints at exploration of prices in a sequential manner, which leads to a marginally different cost to the bidder than the reservation price of the seller. This is a setting in which there is no cost incured by the buyer for their actions or exploring prices in the market. They propose that any agent responsable for the pricing of a good must be imune to dynamic strategies which might extract private information from a market. A key take-away which relates to the Vickery auction mechanism (also called a \textit{direct mechanism}) suggests that not only would defenses against such exploitation be necessary, but the construction of a mechanism in which revelation of the true willingness to pay is the dominant strategy for commerce.
+\textcite{varian_economic_1995} studies sequential exploration of prices in an English auction: the bidder's cost can differ slightly from the seller's reservation price. In that setting the buyer incurs no separate cost for searching or exploring prices. The paper argues that any party \emph{responsible} for pricing must be immune to dynamic strategies that extract private information. The link to the Vickrey (second-price) auction, a \textit{direct mechanism}, is that defenses against exploitation may need to be paired with mechanisms in which truthful revelation of willingness to pay is incentive-compatible.

 Like in classical revenue-maximizing auctions \parencite{roughgarden_cs364a_2013} we assume that the human actor in our system has a private valuation $v$ which we formally draw from intrinsically defined distributions. The important note here is that the agent proxy does not have a mechanism to convey this private information into the demand data which directly impacts the pricing systems.

-The key component of this mediation between agents and commercial platforms lays in the transaction costs related to information gathering and negotiation. As proposed by \textcite{shahidi_coasean_2025} these costs are bound to collapse towards zero (which we demonstrate mathematically), calling for a re-evaluation of the boundaries between firms and markets. As argued by \textcite{coase_nature_1937}, the market participation and time associated with that participation, is critical part of the Coasean transaction cost logic which includes the discovery or relevant pricing within a given market. This process of price discovery without the presence of AI Agents can be time consuming and resource intensive. To build on top of this work we provide a proof of optimal conditions theorised by Coaes as an extension to AI-mediated markets.
+The mediation between agents and commercial platforms turns on transaction costs of information gathering and negotiation. \textcite{shahidi_coasean_2025} argue these costs tend toward zero (we give a complementary formal result in Section~\ref{sec:methodology}). \textcite{coase_nature_1937} treats search and participation time as central to Coasean transaction costs, including discovery of relevant prices. Price discovery without AI intermediaries is already costly. We extend this classical Coasean logic to AI-mediated markets.

 % Economic foundations: relating the problem to options pricing theory. Cost of Information (COI) concept and its relevance

@@ -43,13 +44,13 @@ The key component of this mediation between agents and commercial platforms lays

 \subsection{Landscape of Existing Work}

-Explorations of the algorithmic collusion by LLMs \parencite{fish_algorithmic_2025} has demonstrated a cross-model tendency of market division with a strong sensitivity to instructions provided in the ``system prompt''. If a dynamic pricing algorithm which is trained to respond to market signals learns to coordinate with competitor agents (or become manipulated by those agents), the market equilibrium is under threat of destabilization. This is particularly true for Q-learning pricing learners as demonstrated by \textcite{calvano_artificial_2018}.
+Work on algorithmic collusion by LLMs \parencite{fish_algorithmic_2025} reports cross-model sensitivity to instructions in the ``system prompt,'' including tendencies toward market division. If a dynamic pricing algorithm which is trained to respond to market signals learns to coordinate with competitor agents (or become manipulated by those agents), the market equilibrium is under threat of destabilization. This is particularly true for Q-learning pricing learners as demonstrated by \textcite{calvano_artificial_2018}.

 Our effort to combat contamination stems from research by \textcite{hardt_strategic_2015} on strategic classification, in conjunction with \textcite{liu_contextual_2024} who demonstrate a linear regret if contamination is ignored. The strategic classification adversarial effect comes from an effort to manipulate some representative features used in a learning pipeline, which can result in lower prices on loans or lower prices from dynamic pricing algorithms.

 To bridge the gap between detection and robust pricing, we look at work in Distributionally Robust Optimization (DRO). As defined by \textcite{kuhn_wasserstein_2024}, DRO provides a framework for decision-making under ambiguity, where the true data distribution is unknown but lies within a ``Wasserstein ball'' of a target distribution. In our context, the ``ambiguity set'' represents the uncertainty introduced by agentic reconnaissance. By optimizing for the worst-case distribution within this set, pricing mechanisms can become resilient to the distributional shifts such as the ones caused by non-human actors, effectively robustifying the revenue function against the contamination described in our problem statement.

-In order to create an environment in which prices can be tested against a demand estimate generated by some behavioral model, we take inspiration from the architecture proposed by \textcite{ie_recsim_2019} in the RecSim platform built for recommendation systems. By modeling the distinct user behavior as POMDPs we can generate faithful interactions which allow us to generalize, past the constraint which is also present in recommendation systems, of rarely having enough experience with individual actor's interactions for good recommendations without generalization. The key inspiration comes from the user choice modeling which we translate to a user transition model for each distinct actor type (agent or human). We further consider the possibility of modeling our quantitative research platform using dynamic Bayesian networks for the sake of tractability within the system. The contribution or RecSim enables researchers to better understand learning algorithms in fixed environments, a gap we identify as needing to be bridged within the space of dynamic pricing.
+To build an environment where prices face a demand estimate from a behavioral model, we draw on RecSim \parencite{ie_recsim_2019}. Modeling user behavior as partially observable Markov decision processes yields synthetic interactions that generalize past the usual cold-start limit of per-user data. We translate RecSim-style user choice modeling into per-class transition models (human versus agent). Dynamic Bayesian networks remain a tractability option for the full platform. RecSim's main contribution is a sandbox for recommender learners; we adapt that idea to dynamic pricing, producing a simulator for pricing under contaminated demand.
 % TODO: mention https://github.com/meta-pytorch/OpenEnv/tree/main/envs/browsergym_env

 We also acknowledge the difficulty in similarly affected fields such as authorship, where \textcite{ganie_uncertainty_2025} demonstrate the theoretical limits of the distributional divergence between text authored by a human or large language model. Their approach of computing the divergence between two distributions demonstrates purely theoretically that no classifier can outperform random guessing on their particular task. This is yet another factor to take into consideration when exploring the potential mitigation strategies.
--- a/paper/src/chapters/03-methodology.tex
+++ b/paper/src/chapters/03-methodology.tex
@@ -1,13 +1,14 @@
 \section{Methodology}
+\label{sec:methodology}

 % Extra notes and clarifications: we observed some humans and get their transition probabilities between event types
 % We modify behavioral profiles of transition matrices with price elasticity matrices generated by sample valuations of a distributing.

-This section details the theoretical and practical framework developed to address dynamic pricing under the influence of non-human actors. We begin by formalizing the problem environment and the nature of the actors. We then derive the \textit{Cost of Information} (COI) theorem, proving the erosion of pricing power in the limit of agent saturation. Following this, we outline our generative contamination strategy using GOFAI-driven distinguishability and transition probability learning. Finally, we formulate the robust control problem as a Stackelberg game solved via Distributionally Robust Reinforcement Learning (DR-RL) with constructed ambiguity sets.
+This section presents the theoretical and practical framework developed to address dynamic pricing under the influence of non-human actors. We begin by formalizing the problem environment and the nature of the actors. We then derive the \textit{Cost of Information} (COI) theorem, proving the erosion of pricing power in the limit of agent saturation. Following this, we outline our generative contamination strategy using GOFAI-driven distinguishability and transition probability learning. Finally, we formulate the robust control problem as a Stackelberg game solved via Distributionally Robust Reinforcement Learning (DR-RL) with constructed ambiguity sets.

 \subsection{Problem Formalization}

-We define a commercial environment where the platform interacts with a stream of sessions. Let $\mathcal{S}$ denote the set of all sessions. Each session $s \in \mathcal{S}$ is generated by an actor belonging to a latent class $\theta_s \in \{H, A\}$, where $H$ denotes Human and $A$ denotes Agent.
+We define a commercial environment where the platform interacts with a stream of sessions. Let $\mathcal{S}$ denote the set of all sessions. Each session $s \in \mathcal{S}$ is generated by an actor belonging to a latent class $Y_s \in \{H, A\}$, where $H$ denotes Human and $A$ denotes Agent.

 Each session produces a trajectory of observable events $\tau_s = (e_{s,1}, \ldots, e_{s,L_s})$. An event $e_{s,k}$ is a tuple defined as:
 \begin{equation}
@@ -20,12 +21,12 @@ where:
    \item $t_{s,k} \in \mathbb{R}_+$ is the continuous timestamp.
 \end{itemize}

-The platform does not directly observe the true underlying demand function $d(p)$. Instead, it observes a behavioral proxy $\hat{q}_t$, which is a composite signal derived from the mixture of actor types. We define the demand proxy for product $i$ at epoch $t$ as a weighted aggregation of events:
+The platform does not directly observe the true underlying demand function $d(p)$. Instead, it observes a behavioral proxy $\hat{q}_t$, which is a composite signal derived from the mixture of actor types; both $d(p)$ and $\hat{q}_t$ take values in $\mathbb{R}^{+}$. We define the demand proxy for product $i$ at epoch $t$ as a weighted aggregation of events:
 \begin{equation}
 \label{eq:qhat}
-\hat{q}_{t,i} = \sum_{s \in \mathcal{S}_t} \sum_{k=1}^{L_s} \omega(a_{s,k}) \cdot \mathbb{1}[i_{s,k} = i]
+\hat{q}_{t,i} = \sum_{s \in \mathcal{S}_t} \sum_{k=1}^{L_s} \omega(a_{s,k}) \cdot \mathbf{1}[i_{s,k} = i]
 \end{equation}
-where $\omega: \mathcal{A} \to \mathbb{R}_+$ assigns weights to actions based on their signal strength regarding willingness to pay.
+where $\omega: \mathcal{A} \to \mathbb{R}^+$ assigns weights to actions based on their signal strength regarding willingness to pay.

 In the current engine implementation, we use the normalized variant of this proxy for each step:
 \begin{equation}
@@ -34,20 +35,21 @@ In the current engine implementation, we use the normalized variant of this prox
 with fixed category-level weights (cart, dwell, nav, filter) following the same rank order from Table~\ref{tab:action_space}. This keeps the signal dense and directly usable in the simulator.

 \subsubsection{Actor Types and Demand Curves}
-We formalize the heterogeneity of actors by introducing a type space $\Theta$. An actor of class $Y_s$ is further parameterized by a type $\theta \sim \mathcal{D}_{Y}$. This type determines the actor's demand response function $d(p; \theta)$, sampled from a distribution of possible demand curves. The total observed demand is a stochastic process governed by the naively defined mixture:
+We formalize the heterogeneity of actors by introducing a type space $\Theta$. An actor of class $Y_s$ is further parameterized by a type $\theta \sim \mathcal{D}_{Y_s}$. This type determines the actor's demand response function $d\!\left(p \mid Y_s,\theta\right)$, sampled from a distribution of possible demand curves. In compact form, demand remains price-dependent as $d(p\mid Y=y)$. The total observed demand is a stochastic process governed by the naively defined mixture:
 \begin{equation}
 \label{eq:mixture_demand}
-Q(p) = (1-\alpha) \cdot \mathbb{E}_{\theta \sim \mathcal{D}_H}[d(p; \theta)] + \alpha \cdot \mathbb{E}_{\theta \sim \mathcal{D}_A}[d(p; \theta)] + \epsilon_t
+Q(p) = (1-\alpha) \cdot \mathbb{E}_{\theta \sim \mathcal{D}_H}[d(p\mid Y=H,\theta)] + \alpha \cdot \mathbb{E}_{\theta \sim \mathcal{D}_A}[d(p\mid Y=A,\theta)] + \epsilon_t
 \end{equation}
 where $\alpha \in [0, 1]$ represents the contamination parameter (proportion of agents) and $\epsilon_t$ is non-stationary market noise.
+We note that because both the contamination parameter $\alpha$ and the noise term $\epsilon_t$ may be non-stationary, it can be difficult in online environments to attribute an observed shift in demand to its source, whether market noise or agent traffic specifically.
 Accounting for behavioral and market variation, we also treat $\epsilon_t$ as absorbing serving-path variability from LLM infrastructure (e.g., batch-size-dependent inference behavior under changing load), which appears stochastic at the request level even under greedy decoding \parencite{horace_he_and_thinking_machines_lab_defeating_2025}.



 \subsection{Cost of Information (COI) Framework}

-The platform's pricing power comes from information asymmetry: users who express strong interest signals pay more than the base price. We quantify this markup as the \textit{Cost of Information} (COI), which represents the average premium extracted above marginal cost. COI measures the revenue at risk when information asymmetry collapses.
-A top-level view in the current AI discourse is that sufficiently large productivity gains can induce vertical deflation through cost compression and supply expansion \parencite{rachitsky_marc_2026}. Our contribution is narrower and mechanism-level: even under long-run deflation, platform revenue still depends on short-run information costs to the user. We formalize that rent as the Cost of Information (COI) and study how agentic reconnaissance accelerates its erosion.
+The platform's pricing power comes from information asymmetry: users who express strong interest signals pay more than the base price. We quantify this markup as the \textit{Cost of Information} (COI), which represents the average premium extracted above marginal cost. We call it a cost because it is viewed from the perspective of the user interacting with the platform, who is the one incurring it. COI measures the revenue at risk when information asymmetry collapses.
+A top-level view in the current AI discourse is that sufficiently large productivity gains can induce vertical deflation (vertical supply chain price decrease) through cost compression and supply expansion \parencite{rachitsky_marc_2026}. Our contribution is narrower and mechanism-level: even under long-run deflation, platform revenue still depends on short-run information costs to the user. We formalize that rent as the Cost of Information (COI) and study how agentic reconnaissance accelerates its erosion.

 \begin{definition}[Cost of Information]
 Let $\pi(\tau)$ be a pricing policy mapping interaction histories to prices. The COI is defined as:
@@ -88,13 +90,14 @@ where $\mathbb{E}[P]$ is the expected price charged by the policy and $\underlin
        \draw[<->, thick, red] (\pmin, 2.0) -- (\mean, 2.0) node[midway, above] {COI};

    \end{tikzpicture}
-    \caption{Illustration of the Cost of Information (COI). The COI is defined as the difference between the expected price $\mathbb{E}[p]$ realized by the policy and the minimum viable price $\underline{p}$.}
+    \caption{Illustration of the Cost of Information (COI). The COI is defined as the difference between the expected price $\mathbb{E}[P]$ realized by the policy and the minimum viable price $\underline{p}$. The abstraction we assume is that the reservation price $\underline{p}$ already has some innate margin and would always result in at least a break-even transaction.}
    \label{fig:coi_illustration}
 \end{figure}

 We now formally demonstrate that standard dynamic pricing mechanisms are not incentive-compatible with high-frequency agentic traffic. As the number of independent competitive agents $N$ querying the system grows, the platform's ability to sustain a COI vanishes.

-  A fundamental assumption for our claim lies in the alignment of the AI agent through its prompt which has been demonstrated by \cite{fish_algorithmic_2025} to cause strong collusive behavior under linguistic nudges. This assumption can be generalized to the human user asking the agent to research products with a minimizing objective.
+\paragraph{Assumption Scope}
+The theorem and core experiments in this thesis assume a non-collusive independent-session setting: each agent queries prices independently and does not share sampled quotes across agents. Collusive coordination is outside the current proof scope and is treated as an extension scenario.

 \begin{theorem}[COI Erosion in the Limit]
 Let $N$ be the number of independent, utility-maximizing agents querying the platform. Let $p_{(1)}$ be the first order statistic (minimum) of the prices offered to these agents. As $N \to \infty$, the Cost of Information converges to 0.
@@ -126,7 +129,7 @@ Since the integrand vanishes as $N \to \infty$ for all $t > \underline{p}$, the
 \end{proof}


-This result naively proves that standard pricing policies $\pi$ fail to extract surplus in the presence of large-scale agentic search, necessitating a robust counter-mechanism.
+This result implies that standard pricing policies $\pi$ cannot extract the same surplus under large-scale agentic search without additional structure, which motivates the robust control layer below.

 % The DRO objective creates a lower bound on COI extraction, effectively guaranteeing a minimum margin even in the presence of adversarial agents. we need to prove this and demonstrate that in a theorem.

@@ -135,22 +138,22 @@ This result naively proves that standard pricing policies $\pi$ fail to extract

 \subsection{System Architecture: Hybrid Kappa-Lambda Architecture}

-In order for our research to have grounding in interactions we built a robust e-commerce web-platform. We initially conducted a survey of the leading platforms of airlines and hotel booking sites to identify the specific interface patterns that effectively manage complex travel data. Our analysis revealed a clear industry standard: while both sectors rely on tabbed service selection and left-sidebar filtering to streamline navigation, they diverge in result presentation: airlines utilize visual date-price bars and multi-step wizards to optimize for logistical transparency, whereas hotel platforms leverage image-led cards and scarcity triggers to drive emotional engagement and urgency. Our web framework defines a highly agnostic boilerplate which can be seeded with any data-modality with an easy-to-tailor pattern, which we leverage to define a \texttt{hotel} and \texttt{airline} mode. Both modes are then individually deployed via an environment level argument which adjusts the proxy routing with a custom middleware inside next.js to render only the desired mode. The purpose of this was to create a baseline adaptable to any use-case or desired commercial application.
+To ground our research in real interactions, we built a robust e-commerce web platform. Following big-data terminology, Kappa here denotes stream processing and Lambda denotes batch processing. We initially conducted a survey of the leading airline and hotel booking platforms to identify the specific interface patterns that effectively manage complex travel data. To better understand the playing field, we collected design artifacts across various airlines and hotels. While both sectors rely on tabbed service selection and left-sidebar filtering to streamline navigation, they diverge in result presentation: airlines utilize visual date-price bars and multi-step wizards to optimize for logistical transparency, whereas hotel platforms leverage image-led cards and scarcity triggers to drive emotional engagement and urgency. Our web framework defines a highly agnostic boilerplate which can be seeded with any data-modality with an easy-to-tailor pattern, which we leverage to define a \texttt{hotel} and \texttt{airline} mode. Both modes are then individually deployed via an environment-level argument which adjusts the proxy routing with custom middleware in Next.js to render only the desired mode. The purpose of this was to create a baseline adaptable to any use-case or desired commercial application.

-The architecture of this platform begins with the deployed web-apps posting interaction data to our backend which processes them and stores each ingested interaction into a kafka cluster. This serves as our data reservoir tracking and associating each interaction with its session and importantly with which experiment it belongs to. Not only do we track the behavioral interactions, but our pricing provider micro-service, once called by the frontend reports the observed/queried price-product into kafka. This kafka cluster is subscribed to by our pipeline which is configured on a schedule in Airflow, with the possibility of manual trigger. The final stage of the pricing pipeline, submits computed dynamic pricing results into a redis database for quick updates which is then read by the pricing provider and displayed on the webapp. This is a very generic end-to-end mechanism which is applicable to a variety of different e-commerce tasks. We intentionally put emphasis on the development of this infrastructure to establish a reproducible framework for interaction and to minimize any noise.
+The architecture begins with deployed web applications posting interaction data to a backend that stores each record in Apache Kafka. Kafka acts as the reservoir linking sessions to experiments. Behavioral events and, separately, price observations from the pricing-provider microservice (invoked by the frontend) land in Kafka topics. A scheduled Airflow pipeline (with manual triggers) consumes the stream and the final pricing stage writes vectors to Redis for low-latency reads by the provider and display in the client. This design pattern allows us to generalize to other commercial settings, where Kafka is used for durability and replay, Redis for serving and quick queries. We invested in this stack to keep runs reproducible and to limit extraneous variance so the same skeleton applies across e-commerce settings.

-\paragraph{Public Web Artifact} We transition the Kappa like architecture of the data collection to a Lambda architecture for actual learning in a surrogate environment. This allows us to move faster on data which is provided and helps us create a feedback loop for production deployment. To support further research in this intersection of fields we release P4P \footnote{\url{https://github.com/velocitatem/p4p}} as a public repository providing the interaction layer of the PHANTOM framework. This provides a configurable storefront which can be tailored to any commercial setting with a standardized session-level event tracking. We document the API adapters or what the framework expects in terms of schemas for pricing providers and log ingestion servicse. The repository is intended for controlled experimentation and method replication rather than production commerce deployment.
+\paragraph{Public Web Artifact} We transition the Kappa-like architecture of the data collection to a Lambda architecture for actual learning in a surrogate environment. This allows us to move faster on data which is provided and helps us create a feedback loop for production deployment. To support further research in this intersection of fields we release P4P\footnote{\url{https://github.com/velocitatem/p4p}} as a public repository providing the interaction layer of the PHANTOM framework. This provides a configurable storefront which can be tailored to any commercial setting with standardized session-level event tracking. We document the API adapters and expected schemas for pricing providers and log ingestion services. The repository is intended for controlled experimentation and method replication rather than production commerce deployment.

-\paragraph{Public Dataset} For reproducibility of the behavioral analysis and distinguishability experiments, we also release the interaction dataset used in this thesis as \textit{WhoClickedIt}. The dataset is hosted on Hugging Face \footnote{\url{https://huggingface.co/datasets/velocitatem/whoclickedit}} and is distributed as one flattened event sheet (\texttt{whoclicked.csv}) with explicit labels (\texttt{actor\_type}, \texttt{is\_agent}, and \texttt{record\_type}). The associated dataset card specifies the schema, collection process, and known limitations; a full copy is included in Appendix~\ref{app:whoclicked_card}.
+\paragraph{Public Dataset} For reproducibility of the behavioral analysis and distinguishability experiments, we also release the interaction dataset used in this thesis as \textit{WhoClickedIt}. The dataset is hosted on Hugging Face \footnote{\url{https://huggingface.co/datasets/velocitatem/whoclickedit}} and is distributed as one flattened event sheet (\texttt{whoclicked.csv}) with explicit labels (\texttt{actor\_type}, \texttt{is\_agent}, and \texttt{record\_type}). The dataset card on that page documents the schema, collection process, and known limitations.


 \subsubsection{DevOps Principles}

-Reproducible results are key to quality research platforms, this is taken into mind when deploying and working with our research platform. From a deployment standpoint the platform can be deployed across a large variety of providers and can be run locally. When developing a new interaction modality apart from the ones that come out of the box, a simple template pattern can be followed. The middleware of the framework is designed to properly render the chosen modality from environmental variables, thus deployment of different or parallel version of the software can be easily parametrized.
+Reproducibility guided deployment choices: the stack runs locally or on common cloud providers. New interaction modalities follow a small template; middleware reads environment variables so parallel deployments (e.g.\ staging versus production-like experiments) differ only in configuration, not in forked codebases.

 \subsubsection{Online Dynamic Pricing}

-In order to collect data from actors under correct conditions we replicate a naive and simple dynamic pricing algorithm which runs in the background during the experiments.
+To expose participants to state-dependent prices without over-constraining the study, we run a transparent surge--discount heuristic in the background during data collection.
 The dynamic pricing done is handled by a pipeline which computes a demand estimate on a per-product basis of a specific window of the data, defined by the period $T$ which by default is 5 minutes. This dynamic pricing pipeline computes a demand estimate vector $\hat{q} \in \mathbb{R}^N$ by a weighted sum of interactions for each product, it additionally computes a price elasticity vector $\hat{\epsilon}$ in the same dimensions as our demand. The final features matrix is of the size $N \times 2$ which we translate to a new price vector $\hat{p} \in \mathbb{R}^N$.


@@ -158,14 +161,14 @@ The transformation that governs this dynamic pricing is a very simple surge-base

 \begin{equation}
 \hat{p}_i = \begin{cases}
-p_{0,i} \cdot \lambda_{\text{surge}} & \text{if } \hat{q}_i \geq \theta_{\text{high}} \\
-p_{0,i} \cdot \lambda_{\text{disc}} & \text{if } \hat{q}_i \leq \theta_{\text{low}} \\
-p_{0,i} & \text{otherwise}
+ p_{0,i} \cdot \lambda_{\text{surge}} & \text{if } \hat{q}_i \geq \varrho_{\text{high}} \\
+ p_{0,i} \cdot \lambda_{\text{disc}} & \text{if } \hat{q}_i \leq \varrho_{\text{low}} \\
+ p_{0,i} & \text{otherwise}
 \end{cases}
 \quad \forall i \in \{1, \ldots, N\}
 \end{equation}

-where $p_0 \in \mathbb{R}^N$ is the base price vector (which is seeded into our database distinctly for each mode of the commerce platform), $\theta_{\text{high}}, \theta_{\text{low}} \in \mathbb{R}$ are demand thresholds defining surge and discount regions, and $\lambda_{\text{surge}}, \lambda_{\text{disc}} \in \mathbb{R}^+$ are multiplicative factors with typical values $\lambda_{\text{surge}} = 1.2$ and $\lambda_{\text{disc}} = 0.9$. This piecewise function enables rapid price adjustment in response to observed demand without requiring complex elasticity estimation or historical calibration, allowing us to expose actors within our experiments to a system with a dynamic component of pricing.
+where $p_0 \in \mathbb{R}^N$ is the base price vector (which is seeded into our database distinctly for each mode of the commerce platform), $\varrho_{\text{high}}, \varrho_{\text{low}} \in \mathbb{R}$ are demand thresholds defining surge and discount regions, and $\lambda_{\text{surge}}, \lambda_{\text{disc}} \in \mathbb{R}^+$ are multiplicative factors with typical values $\lambda_{\text{surge}} = 1.2$ and $\lambda_{\text{disc}} = 0.9$. This piecewise function enables rapid price adjustment in response to observed demand without requiring complex elasticity estimation or historical calibration, allowing us to expose actors within our experiments to a system with a dynamic component of pricing.

 % For our offline experimental setting, we generalize a master value function that can encompass different demand estimation and pricing strategies.
 %
@@ -177,19 +180,32 @@ where $p_0 \in \mathbb{R}^N$ is the base price vector (which is seeded into our

 \subsection{Experimental Design}

-We start from a practical constraint: we do not have access to proprietary production data. Because of that, we design our own fictional platform that still represents how commercial platforms work in the real world. The design comes from a survey of hotel and airline websites, where we extracted common interface components and used them as a high-level template for dynamic pricing environments.
+% We start from a practical constraint: we do not have access to proprietary production data. Because of that, we design our own fictional platform that still represents how commercial platforms work in the real world. The design comes from a survey of hotel and airline websites, where we extracted common interface components and used them as a high-level template for dynamic pricing environments.
+For the platform developed for our experiments, we draw on the surveyed websites to build an \textit{average} representation of the most commonly expected interfaces, extracting common components and designing a high-level template for dynamic pricing environments.

-The interface is organized as a product catalog where each product belongs to a time-bounded price vector (for example, a daily pricing period). During each period we collect interaction data by instrumenting UI components and predefined action templates that are still customizable. This gives us control without losing realism.
+
+The interface is organized as a product catalog where each product belongs to a time-bounded price vector (for example, a daily pricing period). During each period we collect interaction data by instrumenting UI components and predefined action templates that are still customizable. This yields controlled variation while holding the interface itself constant.

 Since users act with motivations, we define a pool of tasks (jobs to be done) and assign tasks randomly to participants.
-The task pool is stored as a structured table with fields \texttt{id}, \texttt{created\_at}, \texttt{task\_name}, \texttt{task\_description}, and \texttt{task\_def\_of\_done}. We formulate the tasks as compact jobs-to-be-done rather than as strict click scripts, because the target is to elicit realistic browsing and comparison behavior which can capture nuance of different people. In hotel mode the assigned tasks include \textit{Cheapest Room}, \textit{Cheapest Room w/ View}, \textit{MultiStep Cheapest Room}, \textit{The Digital Nomad (Executive)}, and \textit{The 3-Way Tradeoff (Desk + Quiet + Flexible)}. These prompts deliberately require critical thought in search, inspection of room details, comparison of amenities or images, return visits to the listing page, and a final booking decision which create a degree of cognitive load. In airline mode we use \textit{Last-Minute One-Way Flight}, where the actor must urgently travel to LAX from either SEA or JFK within the next 1--3 days, inspect at least a small set of candidate itineraries, and then book a reasonable earliest departure.
-A representative task is to find the cheapest feasible catalog item under explicit constraints while removing strict financial limits so we avoid trivial optimization behavior. Participants are also randomly assigned to one experimental platform mode (hotel or airline). Once assigned, they are dropped into the experiment with an actor ID. Under each experiment ID, we can observe multiple sessions across time and gather long interaction traces for the same actor.
+We discuss limitations and choices made in this experimental design in Section~\ref{sec:limitations_risks}.
+The task pool is stored as a structured table with fields \texttt{id}, \texttt{created\_at}, \texttt{task\_name}, \texttt{task\_description}, and \texttt{task\_def\_of\_done}. We formulate the tasks as compact jobs-to-be-done rather than as rigid instructions, because the target is to elicit realistic browsing and comparison behavior that captures the nuances of different people. In hotel mode the assigned tasks include \textit{Cheapest Room}, \textit{Cheapest Room w/ View}, \textit{MultiStep Cheapest Room}, \textit{The Digital Nomad (Executive)}, and \textit{The 3-Way Tradeoff (Desk + Quiet + Flexible)}. These prompts deliberately require critical thought in search, inspection of room details, comparison of amenities or images, return visits to the listing page, and a final booking decision, which together create a degree of cognitive load. In airline mode we use \textit{Last-Minute One-Way Flight} or \textit{Family/Work Emergency Travel}, where the actor must urgently travel to LAX from either SEA or JFK within the next 1 to 3 days, inspect at least a small set of candidate itineraries, and then book a reasonable earliest departure. Figure~\ref{fig:exp_design_tree} summarizes the assignment tree.
+
+\begin{figure}[ht]
+  \centering
+  \resizebox{0.88\columnwidth}{!}{%
+    \input{chapters/figures/experiment_design_tree.tex}
+  }
+  \caption{Experimental design decision tree for participant assignment.}
+  \label{fig:exp_design_tree}
+\end{figure}
+
+A representative task is to find the cheapest feasible catalog item under explicit constraints while removing strict financial limits so we avoid trivial optimization behavior. Participants are also randomly assigned to one experimental platform mode (hotel or airline). Once assigned, they are dropped into the experiment with an actor ID. Under each experiment ID, we can observe multiple sessions across time and gather long interaction traces for the same actor. This mitigates the small number of participants by allowing each individual to contribute broad interaction data.

 The human data collection involved 13 participants, all of whom provided explicit informed consent prior to their session. Participants had an average age of 21 years and were recruited from a university population. Alongside the 13 human sessions we ran 16 agent sessions of equivalent task scope, yielding 29 labeled trajectories in total (45\% human, 55\% agent). Each participant was assigned a single platform mode and a single task drawn from the pool, and completed the session independently without guidance on navigation or pricing strategy.

 To evaluate quality and realism of the setup, we store both structured event logs and full interaction transcripts. This lets us combine quantitative analysis with transcript-level qualitative findings. The result is an isolated system where we can control the interaction process while preserving realistic behavior.

-Operationally, goals and experiment runs are tracked in PostgreSQL (goal table, run table, and assignment mapping). This data-acquisition phase is the first half of the methodology and is intentionally a disconnected component that feeds the later contributions. The second half uses collected behavioral traces to distinguish classes $\theta \in \{A,H\}$ with session-conditioned probability estimates, then injects those estimates into the pricing learner.
+Operationally, goals and experiment runs are tracked in PostgreSQL (goal table, run table, and assignment mapping). This data-acquisition phase is the first half of the methodology and is intentionally a disconnected component that feeds the later contributions. The second half uses collected behavioral traces to distinguish classes $Y \in \{A,H\}$ with session-conditioned probability estimates, then injects those estimates into the pricing learner.

 Our process follows three stages: (1) observe and \textit{vectorize} behavioral interactions, (2) learn distinguishability to characterize human versus agent patterns, and (3) use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator.

@@ -215,16 +231,15 @@ Our web platform (developed in similar spirit to RecSim \parencite{ie_recsim_201

 To speak to realism, user interviews reported that the platform architecture mirrored standard booking interfaces and reduced the cognitive load required to learn the system. One participant described the flow as ``intuitive'' and close to a ``normal'' transaction, suggesting observed behavior was primarily driven by pricing treatment rather than interface novelty.

-The dynamic pricing mechanism elicited immediate behavioral adjustments. Participants were sensitive to price volatility: sudden boosts triggered urgency and faster booking attempts, while large listing-to-final discrepancies triggered deeper comparison behavior. This is comforting because the controlled setup still produces commercially relevant interaction data.
+The dynamic pricing mechanism elicited immediate behavioral adjustments. Participants were sensitive to price volatility: sudden boosts triggered urgency and faster booking attempts, while large listing-to-final discrepancies triggered deeper comparison behavior. The responses match what one expects from live e-commerce experiences, such as reactions to volatility, which supports external validity despite the lab setting.


-\subsubsection{Design of Training Factorial Study}
+\subsubsection{Design of Training Sweeps}

-The simulator has multiple configurable factors. We design a multi-factor study across five axes derived from the sweep configurations: (1) RL algorithm (\texttt{ppo}, \texttt{a2c}, \texttt{dqn}, \texttt{qtable}; 4 levels), (2) contamination ratio $\alpha$ sampled from $[0.1, 0.6]$ at four representative levels, (3) robustness radius $\epsilon_\alpha \in \{0.0, 0.15, 0.3\}$ (3 levels), (4) COI penalty weight $\lambda_\text{coi}$ at two reference levels, and (5) pricing action granularity (two discretization settings for \texttt{action\_levels}); giving a grid of $4\times4\times3\times2\times2 = 192$ configurations. Statistical power for the behavioral comparisons is determined by a two-sample test over per-session KL divergence scores; a formal power analysis with minimum detectable effect size at $n_H=13$, $n_A=16$ is reported in the results.
-% Power analysis plan: apply a two-sample Mann-Whitney U (or permutation test) on per-session (delta_H - delta_A) divergence scores comparing the human and agent groups. Compute minimum detectable effect size at alpha=0.05, power=0.8, given n_H=13 and n_A=16. Bootstrap confidence intervals on mean KL are a cleaner complement given the non-normality of divergence distributions.
+The simulator has multiple configurable factors. Training runs are driven by Weights \& Biases sweep definitions versioned with the codebase, mixing random and grid schedules rather than a single full factorial. For the contamination ratio $\alpha$, exploratory sweeps draw $\alpha$ uniformly on $[0.1,0.6]$, while other sweeps use the narrower interval $[0.1,0.5]$. Grid sweeps fix explicit level sets, for example $\alpha\in\{0.1,0.2,0.3,0.4,0.6,0.8\}$ (six levels, including $0.8$ beyond the typical exploratory upper endpoint) or five levels $\{0.1,0.2,0.3,0.4,0.6\}$. Auxiliary schedules also include $\alpha=0$ alongside positive values. Robustness radius $\epsilon_\alpha$, COI penalty $\lambda_\text{coi}$, RL algorithm (\texttt{ppo}, \texttt{a2c}, \texttt{dqn}, \texttt{qtable}), and the discretization of the price action grid vary by sweep. Broad random search may use uniform $\epsilon_\alpha\in[0,0.3]$ and $\lambda_\text{coi}\in[0.05,0.6]$; tighter grids may fix $\epsilon_\alpha=0.2$ and restrict $\lambda_\text{coi}$ to $\{0.15,0.30\}$. Behavioral distinguishability is assessed with a two-sample Mann--Whitney test on per-session divergence gap scores at cohort sizes $n_H=13$ and $n_A=16$.
 While this scale is generally expensive for reinforcement learning, we execute it on a large TPU cluster to make the sweep tractable.

-Our training budget is provisioned through TPU Research Cloud and spans 384 chips across TPU v4, v5e, and v6e generations, with a spot-heavy allocation plus an on-demand reserve. At peak BF16 throughput this corresponds to approximately 160\,PFLOPS of aggregate compute (derivation in Appendix~\ref{app:compute_budget}), which makes repeated seeds, ablations, and sensitivity sweeps feasible within practical wall-clock limits. We allocate v6e capacity to the highest-intensity policy training jobs, use v5e for wider hyperparameter exploration where throughput-per-dollar is favorable, and reserve on-demand v4 capacity for runs that should not be interrupted.
+Our training budget is provisioned through TPU Research Cloud and spans 384 chips across TPU v4, v5e, and v6e generations, with a spot-heavy allocation plus an on-demand reserve. At peak BF16 throughput this corresponds to approximately 160\,PFLOPS of aggregate compute (derivation in Appendix~\ref{app:compute_budget}), which makes repeated seeds, ablations, and sensitivity sweeps feasible within practical wall-clock limits. We allocate v6e capacity to the highest-intensity policy training jobs, use v5e for wider hyperparameter exploration, and reserve on-demand v4 capacity for runs that should not be interrupted.

 \begin{table}[ht]
 \centering
@@ -259,12 +274,13 @@ v4 & 64 (32 + 32) & us-central2-b & 32 Spot + 32 On-demand \\
 \end{tabular}
 \end{table}

-For connections from Madrid, we prioritize the europe-west4 allocation for latency-sensitive runs with the benefit of having the most grouped chips within a single region. This regional grouping is important for the deployment of our Kubernetes cluster which cannot span multiple regions. All sweep metadata, model checkpoints, and reward traces are logged in Weights \& Biases. % TODO: cite this (from bib)
-Hardware specifications are from the official Google Cloud TPU documentation \parencite{noauthor_tpu_2026,noauthor_tpu_2025-1,noauthor_tpu_2025}.
+For connections from Madrid, we prioritize the europe-west4 allocation for the sake of latency and the benefit of having the most grouped chips within a single region. This regional grouping is important for the deployment of our Kubernetes cluster which cannot span multiple regions. All sweep metadata, model checkpoints, and reward traces are logged in Weights \& Biases. Hardware specifications are from the official Google Cloud TPU documentation \parencite{noauthor_tpu_2026,noauthor_tpu_2025-1,noauthor_tpu_2025}.
+% TODO: cite this (from bib)

-Design of training processes: we build docker image with the fact in mind of different caching over layers in order to most speed up docker re-building and such we place the most volatile steps towards the end of the image building. What is means in practice is that any dependency installations are isolated so edits to source code do no trigger rebuilds. Only if we update our entry point of training a sweep, Docker will also rebuild the source-code copy stage.

-Due to the preemptive nature of the current demand of TPU chips we sttle for running our on demeaned as the primary source of compute. The on demand TPU pod of 32 chips spread across 4 virtual hosts creates a relatively unique parallelization setup. Despite our desire to use a traditional approach of clustering and perhaps deploying SLURM jobs of our sweep agent, the lack of predictability in provisioning each instance of a compute resource makes this an high friction layer we do not want to add.
+Training images follow Docker layer-caching principles, with the most stable steps placed in the lowest layers to maximize cache reuse. Dependency layers are kept separate from the copy of the application source, so code edits do not invalidate the cached dependency layers; only changes to the training entrypoint or to the dependencies themselves force a full rebuild.
+
+TPU capacity is scarce and often preemptible, so we rely primarily on on-demand pods for workloads that must finish without interruption. A typical reservation is a 32-chip pod across four worker VMs. That layout already gives enough parallelism for our sweep driver without adding a separate cluster scheduler. We considered SLURM-style job arrays, but fluctuating provisioning times would have added operational overhead with little benefit for our workload, so orchestration stays in the container and Ray layer described below.

 \subsubsection{Interaction Schema}

@@ -272,7 +288,7 @@ We extend the basic event tuple $e_{s,k}$ to capture the full observational sign
 \begin{equation}
 e_{s,k} = \left( a_{s,k}, \, i_{s,k}, \, t_{s,k}, \, \mu_{s,k}, \, \delta_{s,k} \right)
 \end{equation}
-where $\mu_{s,k} \in \mathcal{M}$ is a metadata record containing action-specific context (e.g., price observed, filter parameters, element text), and $\delta_{s,k} \in \mathbb{R}_+$ is the dwell time in milliseconds for attention-based actions.
+where $\mu_{s,k} \in \mathcal{M}$ is a metadata record containing action-specific context (e.g., price observed, filter parameters, element text), and $\delta_{s,k} \in \mathbb{R}^+$ is the dwell time in milliseconds for attention-based actions.

 A session $s$ is itself a structured record:
 \begin{equation}
@@ -299,8 +315,7 @@ $\mathcal{A}_{\text{filter}}$ & \texttt{search}, \texttt{filter\_date}, \texttt{
 \end{table}

 This partition enables the weight function $\omega$ from Eq.~\ref{eq:qhat} to assign category-specific signal strengths, with $\omega(\mathcal{A}_{\text{cart}}) > \omega(\mathcal{A}_{\text{dwell}}) > \omega(\mathcal{A}_{\text{nav}}) > \omega(\mathcal{A}_{\text{filter}})$ reflecting decreasing commitment.
-Its important to acknowledge that this creates a very blatant assumption in the weighting, we do motivate the scale of each weight by the per-category observed divergence between each behavioral profile.
-In the simulator baseline this order is encoded with a compact fixed scale: cart $=4.0$, dwell $=2.0$, nav $=1.0$, filter $=0.5$. Unknown actions are mapped by prefix heuristics to the nearest category.
+The ordering cart $>$ dwell $>$ nav $>$ filter is a deliberate simplification: we set it from early data by ranking categories by KL divergence between human and agent transition rows and then spacing weights in powers of two. The simulator encodes cart $=4.0$, dwell $=2.0$, nav $=1.0$, and filter $=0.5$. Unknown actions are mapped by prefix to the nearest category (or are discarded).

 The metadata record $\mu$ varies by action type. For product views, $\mu$ contains the observed price $p_{\text{obs}}$ and product attributes. For dwell events, $\mu$ includes the element text and accumulated hover duration. This heterogeneous structure is captured via a schema-on-read approach in our Kafka ingestion pipeline, where events are validated against type-specific schemas before storage.

@@ -315,11 +330,11 @@ To train a robust pricing learner, we need a simulator that can generate realist


 \subsubsection{Ground-Truth Distinguishability}
-Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels $\theta_s \in \{H, A\}$ are available as ground truth rather than as the output of a heuristic classifier. We therefore estimate separate transition kernels directly from each labeled partition $\mathcal{D}_H$ and $\mathcal{D}_A$, treating the resulting $\hat{\mathcal{T}}_H$ and $\hat{\mathcal{T}}_A$ as the ground-truth behavioral profiles for each class. We then ask a direct methodological question: are the kernels distinguishable enough to justify downstream pricing control that depends on that distinguishability?
+Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels $Y_s \in \{H, A\}$ are available as ground truth rather than as the output of a heuristic classifier. We therefore estimate separate transition kernels directly from each labeled partition $\mathcal{D}_H$ and $\mathcal{D}_A$, treating the resulting $\hat{\mathcal{T}}_H$ and $\hat{\mathcal{T}}_A$ as the ground-truth behavioral profiles for each class. We then ask a direct methodological question: are the kernels distinguishable enough to justify downstream pricing control that depends on that distinguishability?

-To answer this, we compute per-session KL divergence scores against both class-level centroids. For each session $s$ in either partition, we fit a session-level event transition kernel $\hat{\mathcal{T}}_s$ from that session's trajectory alone, then compute its average KL divergence to the human centroid ($\Delta_{H,s}$) and to the agent centroid ($\Delta_{A,s}$). The per-session distinguishability score is the gap $\Delta_{H,s} - \Delta_{A,s}$: a negative value indicates proximity to human behavior, a positive value indicates proximity to agent behavior.
+For each session $s$ we fit a session-level transition kernel $\hat{\mathcal{T}}_s$, then compute its average KL divergence to the human centroid ($\Delta_{H,s}$) and to the agent centroid ($\Delta_{A,s}$). The distinguishability score is the gap $\Delta_{H,s} - \Delta_{A,s}$: a negative value indicates human-like behavior, a positive value indicates agent-like behavior. KL is used because it compares full categorical rows, not single features.

-The normality assumption cannot be made for KL divergence distributions, which are right-skewed and bounded below by zero, so we do not use a Student's $t$-test. Instead we apply a Mann-Whitney $U$ test \parencite{mann_test_1947} on the per-session gap scores between the two groups. The Mann-Whitney test is a rank-based nonparametric test that compares the stochastic ordering of two independent samples without distributional assumptions, making it appropriate for small samples drawn from skewed populations. We report $U$, the exact two-sided $p$-value, and group-level descriptive statistics for the gap scores.
+KL divergences are right-skewed and bounded below by zero, so the resulting gap scores cannot be assumed to be normally distributed; we therefore test cohort differences with a Mann--Whitney $U$ test \parencite{mann_test_1947} rather than a $t$-test. We report $U$, the two-sided $p$-value, and descriptive statistics for each group.

 \begin{definition}[Kullback-Leibler Divergence for Transition Distributions]
 Let $P_e$ and $Q_e$ be categorical distributions over destination states following event $e$, derived from human and agent trajectories respectively. The KL divergence between these distributions is:
@@ -328,21 +343,22 @@ Let $P_e$ and $Q_e$ be categorical distributions over destination states followi
 \end{equation}
 where $\mathcal{S}_e$ denotes the set of destination events that follow $e$ in the human trajectories.
 \end{definition}
+We exploit KL asymmetry so that ``distance from human-like'' is explicit in the score, not only distance from agents.

 To obtain this statistic, we aggregate transitions by triggering event $e$ and treat normalized outgoing probabilities as categorical distributions $P_e$ (human) and $Q_e$ (agent). We intersect shared event labels, then accumulate log-ratio contributions over shared destinations. Large contributions, including near-zero $Q_e(k)$ cases, identify transitions where one actor class is difficult to mimic.

-With these divergence features we train a contrastive model to estimate a weak agent probability $f(\tau)\in[0,1]$, which we later use as a weighting and control signal.
+With these divergence features we compute a weak agent probability $f(\tau')\in[0,1]$ directly from divergence gaps, which we later use as a weighting and control signal.


 \subsubsection{Transition Probability Estimation}
 \label{sec:tpe}


-For both subsets, we model session dynamics as an MDP and estimate transition kernel $\mathcal{T}$. For each actor type we estimate global kernels $\hat{\mathcal{T}}_A$ and $\hat{\mathcal{T}}_H$, then cluster into behavioral sub-kernels $\hat{\mathcal{T}}_y^i$ to avoid collapsing all behavior into one average profile. Transition probabilities are estimated by maximum likelihood:
+For both subsets, we model session dynamics as a Markov decision process and estimate transition kernel $\mathcal{T}$. For each actor type we estimate global kernels $\hat{\mathcal{T}}_A$ and $\hat{\mathcal{T}}_H$, then cluster into behavioral sub-kernels $\hat{\mathcal{T}}_y^i$ to avoid collapsing all behavior into one average profile. Transition probabilities are estimated by maximum likelihood:
 \begin{equation}
    \hat{P}(s' \mid s) = \frac{N(s, s')}{\sum_{k \in \mathcal{S}} N(s, k)}
 \end{equation}
-where $N(s, s')$ is the observed transition count. This allows us to construct a \textit{Contamination Generator} $\mathcal{G}(\alpha)$. Given a clean trajectory dataset, $\mathcal{G}$ injects synthetic agent trajectories sampled from $\hat{\mathcal{T}}_A$ until the effective mixing ratio reaches $\alpha$. The properties of an MDP such as ... should be preserved by the operation described below.
+where $N(s, s')$ is the observed transition count. This allows us to construct a \textit{Contamination Generator} $\mathcal{G}(\alpha)$. Given a clean trajectory dataset, $\mathcal{G}$ injects synthetic agent trajectories sampled from $\hat{\mathcal{T}}_A$ until the effective mixing ratio reaches $\alpha$. The properties of an MDP such as a discrete state space, nonnegative transition mass, and row-stochasticity ($\sum_{s'}\hat{P}(s'\mid s)=1$ for visited states) should be preserved by the operation described below.

 To scale this to catalog-level pricing, we expand the base event transition matrix from $T\times T$ into product-specific transitions using the current demand condition. In practice, we normalize the demand vector across products and use it to weight how much transition mass each product pair receives. Concretely, each cell of the base matrix becomes an $N\times N$ block (for $N$ products), so the transition matrix grows from $T\times T$ to $(T\cdot N)\times(T\cdot N)$. Finally, we add $C$ generic states (homepage, login, checkout terminal states), which gives the full kernel size $(T\cdot N + C)\times(T\cdot N + C)$.
 % The validity of this demand-weighted block expansion is still subject to formal proof: it needs to be shown that the resulting matrix retains row-stochasticity (rows summing to 1) and that the weighting by the demand vector preserves the Markov property for the expanded state space. In the engine source this is the target of ongoing validation before the expansion is relied on for behavioral generation at scale.
@@ -364,7 +380,8 @@ To scale this to catalog-level pricing, we expand the base event transition matr

 \subsection{Distributionally Robust Reinforcement Learning (DR-RL)}

-We formulate pricing as a Stackelberg game: the platform (leader) sets prices $p_t$, and the population (follower) responds through trajectories and demand. A useful intuition is that the platform behaves like a distorted mirror at a 45-degree angle: what it mirrors is population demand into an estimated demand proxy, and that proxy drives revenue.
+We formulate pricing as a Stackelberg game in which the platform (leader) sets prices $p_t$, and the population (follower) responds through trajectories and demand. A useful intuition is that the platform behaves like a distorted mirror at a 45-degree angle: what it mirrors is population demand into an estimated demand proxy, and that proxy drives revenue.
+% TODO: add canonical Stackelberg citation.

 Because contamination level $\alpha$ and demand shift are non-stationary online, a simple error term is not enough. We therefore use a Distributionally Robust Optimization objective. Let $\tau'$ be a newly observed trajectory generated by an unknown actor profile (sampled from the behavioral models in Section~\ref{sec:tpe}). We need a demand mapping conditioned on price and trajectory, $\hat{Q}(p,\tau')$. For each $\tau'$, we compute $\hat{\mathcal{T}}'$ and compare it with controlled baselines $\bar{\mathcal{T}}_H$ and $\bar{\mathcal{T}}_A$:

@@ -375,9 +392,38 @@ Because contamination level $\alpha$ and demand shift are non-stationary online,
  \Delta_A &= D_{KL}(\hat{\mathcal{T}}^\prime \parallel \bar{\mathcal{T}}_A)
 \end{align}

-This yields two centroid-like heuristics that act as a session-level agent score in the engine. On a per-customer or use-case basis a similar study should be done in order to obtain ground truth behavior models for humans and agents and their specific interaction with a given products website.
+Following the intuition highlighted above, we define the gap score from these two divergences:
+\begin{equation}
+g(\tau') = \Delta_H(\tau') - \Delta_A(\tau').
+\end{equation}
+Positive values indicate trajectories farther from the human centroid and closer to the agent centroid.

-In implementation, we maintain an alternating game-history stack (our \textit{Limbo} stack) and execute it explicitly every epoch with exactly two transitions: first the platform publishes a price vector (leader move), then the market responds with trajectory-derived demand (follower move).
+We map this gap to a weak agent probability using a temperature-controlled logistic map:
+\begin{equation}
+f(\tau') = P(Y=A\mid\tau') = \operatorname{softmax}\!\left(-\frac{\Delta_A}{T},-\frac{\Delta_H}{T}\right)_A = \sigma\left(\frac{\Delta_H-\Delta_A}{T}\right), \quad T>0.
+\end{equation}
+The session-level control signal injected into pricing is then
+\begin{equation}
+\hat{\alpha}(\tau') = f(\tau').
+\end{equation}
+
+\begin{figure}[ht]
+  \centering
+  \input{chapters/figures/sigmoid_softmax_gap.tex}
+  \caption{Logistic mapping from the gap $\Delta_H-\Delta_A$ to the weak agent probability $f(\tau')$. Markers indicate the contrasts $\Delta_H<\Delta_A$ and $\Delta_H>\Delta_A$.}
+  \label{fig:sigmoid_softmax_gap}
+\end{figure}
+
+This turns distinguishability into an operational control input in the engine. On a per-customer or use-case basis, a similar data collection and fitting process should be repeated to obtain domain-specific behavior kernels.
+
+In implementation we keep an alternating game-history buffer, which we call the \textit{Limbo}, and advance it each epoch with two transitions: first the platform publishes a price vector (leader move), then the environment returns trajectory-derived demand (follower move).
+
+To avoid notation drift, we separate two COI objects used for different purposes:
+\begin{align}
+\text{COI}_{\text{level}}(\pi) &= \mathbb{E}[P]-\underline{p}\\
+\text{COI}_{\text{leak}}(p,\tau') &= f(\tau')\cdot \text{InfoValue}(p,\tau')
+\end{align}
+where $\text{COI}_{\text{level}}$ is evaluated at policy level and $\text{COI}_{\text{leak}}$ is evaluated per observed quote during training. The information-value term $\text{InfoValue}$ is specified below, where we discuss the reward structure.

 % Mention discretized action space and the clipping and over shotting in continuous action spaces
 % Also talk about catastrophic economics, we add termination on bankrupcy or zero demand so market collaps
@@ -398,7 +444,7 @@ and we evaluate a small fixed grid in $\mathcal{A}_{\epsilon_\alpha}(\alpha_0)$


 \subsubsection{Environment Setup for Dynamic Pricing}
-The complete pricing-demand-trajectory loop is illustrated in Figure~\ref{fig:oracle_flow}. The Oracle maps historical price and demand state to a new price vector, which is exposed to a distribution of demand curves. Each product generates trajectories weighted by behavioral kernels $\tau_\theta$, producing a full transition matrix $\tau'$ over sessions. Sampled trajectories $\{\tau_k\}$ are aggregated through the demand proxy function $Q(\cdot)$ to yield the next demand vector, which feeds back into the Oracle.
+The complete pricing-demand-trajectory loop is illustrated in Figure~\ref{fig:oracle_flow}. The Oracle maps historical price and demand state to a new price vector, which is exposed to a distribution of demand curves. Each product generates trajectories weighted by behavioral kernels $\tau_Y$, producing a full transition matrix $\tau'$ over sessions. Sampled trajectories $\{\tau_k\}$ are aggregated through the demand proxy function $Q(\cdot)$ to yield the next demand vector, which feeds back into the Oracle.

 \begin{figure}[ht]
 \centering
@@ -414,7 +460,7 @@ p_N
 \end{pmatrix}
 \underrightarrow{d_i \sim \mathcal{N}_{\vec{p}}}
 \begin{pmatrix}d_0\\ d_1\\ \cdots \\ d_N\end{pmatrix}
-\underrightarrow{\vec{d}\otimes \tau_\theta}
+\underrightarrow{\vec{d}\otimes \tau_Y}
 \begin{bmatrix}
 0.01 & 0.02 & \cdots & 0.3 \\
 0.41 & 0.24 & \cdots & 0.0 \\
@@ -434,7 +480,7 @@ p_N
 \end{aligned}
 $}%
 }
-\caption{Oracle-based pricing loop: historical price and demand state map to a new price vector; each product samples demand curves from $\mathcal{N}_{\vec{p}}$; trajectories are generated via the Kronecker product $\vec{d}\otimes\tau_\theta$ into transition matrix $\tau'$; sampled trajectories $\{\tau_k\}$ aggregate through proxy $Q(\cdot)$ to yield updated demand $\vec{\hat{q}}$, closing the feedback loop.}
+\caption{Oracle-based pricing loop: historical price and demand state map to a new price vector; each product samples demand curves from $\mathcal{N}_{\vec{p}}$; trajectories are generated via the Kronecker product $\vec{d}\otimes\tau_Y$ into transition matrix $\tau'$; sampled trajectories $\{\tau_k\}$ aggregate through proxy $Q(\cdot)$ to yield updated demand $\vec{\hat{q}}$, closing the feedback loop.}
 \label{fig:oracle_flow}
 \end{figure}

@@ -442,39 +488,45 @@ $}%
 The robust policy $\pi^*$ is obtained by solving the maximin problem:
 \begin{equation}
 \label{eq:robust_policy}
-\pi^* = \arg \max_{\pi} \min_{Q \in \mathcal{U}_\epsilon} \mathbb{E}_{d \sim Q} \left[ R(p, d) - \lambda \cdot \text{COI}_{\text{leak}}(p,\tau') \right]
+\pi^* = \arg \max_{\pi} \min_{Q \in \mathcal{U}_\epsilon} \mathbb{E}_{d \sim Q} \left[ R(p, d) - \lambda \cdot \text{COI}_{\text{leak}}(p,\tau') - \eta_{\text{ux}} \cdot \text{UX}(\tau', p) \right]
 \end{equation}
-where $R(p, d)$ is the revenue function and $\lambda$ weighs the information-leakage penalty.
+where $R(p, d)$ is the revenue function, $\lambda$ weighs the information-leakage penalty, $\eta_{\text{ux}}$ weighs the user-experience penalty, and $\text{UX}(\tau', p)\in[0,1]$. Note that $p$ depends directly on $\pi$, since the price is the action the policy selects.
+Note also that the reward does not subtract COI itself but rather the leakage of COI, which is defined below.
+

 In practice, we parameterize this with a session-level leakage term:
 \begin{equation}
 \text{COI}_{\text{leak}}(p,\tau') = f(\tau')\cdot \text{InfoValue}(p,\tau')
 \end{equation}
-where $f(\tau')$ is the weak agent probability and $\text{InfoValue}$ is implemented either as a constant query-tax surrogate or as a revelation surrogate $-\log\pi(p\mid\tau')$.
+where $f(\tau')$ is the weak agent probability and $\text{InfoValue}$ is implemented either as a constant query-tax surrogate or as a revelation surrogate $-\log\pi(p\mid\tau')$. The latter is the surprisal of the price under the policy's price-setting distribution. Essentially, we proxy the leakage term by the surprisal of the price our policy sets, weighted by the contamination estimate. Appendix~\ref{app:revelation_log} expands on why the logarithm is used in the revelation surrogate.
+
+The inner minimization selects the contamination candidate that makes the penalized reward smallest, so the outer policy update faces the worst plausible leakage scenario inside the ambiguity set rather than an average case.

 For the baseline engine reported here, we intentionally use the constant query-tax surrogate to keep the mechanism minimal:
 \begin{equation}
-r_t = R(p_t,\tilde q_t) - \lambda\,f(\tau_t')\,c_{\text{info}}
+\label{eq:baseline_step_reward}
+r_t = R\!\left(p_t,\hat{Q}_t\right) - \lambda\,f(\tau_t')\,c_{\text{info}} - \eta_{\text{ux}}\,\text{UX}(\tau_t', p_t)
 \end{equation}
-with fixed $c_{\text{info}}>0$.
+with fixed $c_{\text{info}}>0$, matching the leakage term $\text{COI}_{\text{leak}}=f(\tau_t')\,c_{\text{info}}$ and the user-experience penalty already introduced in~\eqref{eq:robust_policy}.


 Another possible extension is to adapt the ambiguity radius online, e.g., $\epsilon(\Delta_H)$, so the Wasserstein ball changes with live divergence. We keep this as future work and retain a fixed-radius setup because Wasserstein ambiguity already handles heavy-tail and ``black swan'' behavior without absolute continuity assumptions \parencite{kuhn_wasserstein_2024}.

 \subsubsection{Actor Implementation}
-In our simulation, the ``follower'' is implemented as a set of Actors. Each Actor is initialized with a type $\theta$ which samples a specific demand curve $d(p; \theta)$ from the latent distribution. This formalization ensures that our DR-RL agent does not overfit to a single deterministic demand function but learns a policy robust to the distributional uncertainty defined by $\mathcal{U}_\epsilon$.
+In our simulation, the ``follower'' is implemented as a set of Actors. Each Actor is initialized with a class $Y$ and a latent type $\theta \sim \mathcal{D}_Y$, which samples a specific demand curve $d\!\left(p\mid Y,\theta\right)$ from the latent distribution. This formalization ensures that our DR-RL agent does not overfit to a single deterministic demand function but learns a policy robust to the distributional uncertainty defined by $\mathcal{U}_\epsilon$.

 Practical implementation of browser agents is a strongly evolving field with near-weekly releases of SOTA architectures. In this thesis implementation we abstract that layer into trajectory generators learned from observed human/agent transition kernels.


-As part of reward engineering, we keep a UX factor ($UX\in[0,1]$) as an auxiliary evaluation axis. In the current baseline it is not injected into the core reward; it is tracked separately to compare policy trade-offs.
+As part of reward engineering, we keep a UX factor ($\text{UX}\in[0,1]$), which enters the reward in~\eqref{eq:baseline_step_reward} and also serves as an evaluation axis. In code, the UX index is implemented as a volatility penalty on relative price changes, with an extra upward-volatility component weighted by $0.5$ and scaled by $\eta_{\text{ux}}$ and an information-budget term. We also keep a separate supra-competitive penalty tied to persistent price excess above a competitive anchor, which punishes high-price behavior even when volatility is low.
+We measure volatility as mean absolute relative price movement, $v_t=\frac{1}{N}\sum_{i=1}^N\bigl|(p_{t,i}-p_{t-1,i})/\max(p_{t-1,i},1)\bigr|$.

 \begin{figure}[ht]
  \centering
  \resizebox{0.5\columnwidth}{!}{%
    \input{chapters/balance_figure.tex}
  }
-  \caption{Introducing the UX index allows us to better distinguish the kind of impact different methods have and allows us to compare them on this Pareto-like scale.}
+  \caption{Introducing the UX index allows us to better distinguish the kind of impact different methods have and allows us to compare them on this Pareto-efficiency-like scale.}
 \end{figure}

 We also consider taxation-like overlays for agent traffic under strategy-proof mechanism design (e.g., Vickrey-Clarke-Groves style rules). This remains an extension path and is not part of the main implementation in this thesis.
@@ -511,13 +563,13 @@ We now present the complete pricing mechanism that integrates the behavioral dis
 \end{algorithm}


-The algorithm operates in discrete epochs indexed by $t$. At each epoch, the platform applies one discrete multiplicative price action, the environment samples a batch of sessions, and demand is recomputed from weighted events. Robustness is implemented as an inner minimization over a small local grid of contamination candidates around nominal $\alpha_0$, matching the current engine implementation. The history buffer $\mathcal{L}$ (``Limbo'' in our implementation) enforces the alternating Stackelberg structure by preserving the temporal sequence of price publications and demand observations.
+The algorithm operates in discrete epochs indexed by $t$. At each epoch, the platform applies one discrete multiplicative price action, the environment samples a batch of sessions, and demand is recomputed from weighted events. Robustness is implemented as an inner minimization over a small local grid of contamination candidates around nominal $\alpha_0$, matching the current engine implementation. The history buffer $\mathcal{L}$ enforces the alternating Stackelberg structure by preserving the temporal sequence of price publications and demand observations.

 %The defensive price update in Line 24 implements contamination-aware margin shrinkage: as estimated contamination $\hat{\alpha}_t$ rises, the margin $(p^{\mathrm{ref}} - c)$ is reduced by factor $\kappa\in[0,1]$, with projection $\Pi_{\mathcal{P}}$ ensuring feasibility. In subsequent experiments this heuristic rule is replaced by DR-RL policy $\pi^*$ from Eq.~\ref{eq:robust_policy}.

 \subsection{Parallelization Strategy}

-To avoid preemption of compute mid-training we settle on using a v4 generation, 40 chip compute node with 5 parallel workers. The login node creates an orchestration node with Ray \parencite{moritz_ray_2018} and we distribute ray compute nodes per each other worker.
+To reduce mid-job preemption we standardize on a TPU v4 allocation with 40 chips and five workers. A head process launches Ray \parencite{moritz_ray_2018} and attaches workers across the remaining hosts.

 \subsubsection{Computational Cost Analysis of the Simulation Step}
 The per-step cost of Algorithm~\ref{alg:phantom_loop_clean} is not uniform across its components. To inform hardware provisioning and to identify where algorithmic improvements are most impactful, we profile the hot path of the engine using Python's \texttt{cProfile} instrumentation over 20 environment steps under two configurations: a baseline with the robustness inner loop disabled ($K=1$, $\epsilon_\alpha=0$) and a standard robust setting ($K=5$, $\epsilon_\alpha=0.2$). Both runs use $M=10$ sessions per market call and $N=3$ products.
@@ -526,7 +578,7 @@ The baseline achieves approximately 26 steps per second. Enabling the robustness

 \begin{table}[ht]
 \centering
-\caption{Per-step profiling results (20 steps, $M=10$ sessions, $N=3$ products). Self-time measures time spent inside the function excluding callees; cumulative time includes the full call subtree.}
+\caption{Per-step profiling results (20 steps, $M=10$ sessions, $N=3$ products). Self-time measures time spent inside the function excluding callees and cumulative time includes the full call subtree.}
 \label{tab:profile_results}
 \begingroup
 \small
--- a/paper/src/chapters/04-results.tex
+++ b/paper/src/chapters/04-results.tex
@@ -1,8 +1,14 @@
 \section{Results}
+\label{sec:results}
+
+% The gap we target is not detection for its own sake but whether behavioral signals can support pricing decisions once agent traffic is present. This section follows the supporting questions in \cref{sec:research_questions}: we first establish session-level distinguishability (behavioral evidence and a rank test), then estimate how contamination shifts revenue in a controlled sweep, and finally compare robust and baseline policies under factorial training with COI and revenue readouts. The ordering is deliberate---each stage feeds the next so that separability, contamination effects, and policy outcomes form one connected line of evidence.
+
+In our work, the gap we target is not detection for its own sake. Our aim is to understand which behavioral signals can support pricing decisions once agent traffic is present. We now piece together the path laid out in \cref{sec:research_questions}: we first establish distinguishability (behavioral evidence and a rank test), then estimate how contamination shifts revenue in an adversarial environment, and finally compare robust and baseline pricing under factorial training.
+
 \begin{figure}[ht]
    \centering
    \input{chapters/figures/supra/supra.tex}
-    \caption{Evolution of price distributions over experiment steps. The heatmap illustrates the density of price offerings. This is an early baseline simulation which demonstrates supra-competitive price-setting in deep learning agents such as SAC as can be clearly seen by the high density at the highest available price.}
+    \caption{Evolution of price distributions over experiment steps. The heatmap illustrates the density of price offerings. This is an early baseline simulation which demonstrates supra-competitive price-setting in deep learning agents such as Soft Actor-Critic (SAC), as can be clearly seen from the high density at the highest available price.}
    \label{fig:supra_heatmap}
 \end{figure}

@@ -40,7 +46,26 @@ We report two preliminary stages before the full factorial interpretation. First

 \subsubsection{The Impact of Contamination on Revenue}

-A linear fit test on run-level data ($n=95$) shows a strong negative association between contamination and mean revenue. The fitted model mapping $\alpha \to \text{revenue}$ result in $t(93)=-8.2148$, $p=1.20\times 10^{-12}$, $R^2=0.4205$, and a 95\% confidence interval for the slope of $[-75{,}288.76,\,-45{,}975.13]$. In practical terms, a $+0.1$ increase in $\alpha$ corresponds to an average decrease of about $6{,}063$ revenue units within our environment.
+The contamination--revenue slope is estimated on a controlled cohort (single sweep, baseline policy, $n_{\text{products}}=100$, $n=95$). In this setting, contamination $\alpha$ is set exogenously by the experiment, so the slope identifies the within-sweep causal effect of contamination on revenue under fixed policy and environment settings. These results address our second research question \hyperlink{sq2}{\textbf{SQ2}} (\textit{Theoretical Impact}) from \cref{sec:research_questions}.
+
+\begin{table}[ht]
+\centering
+\caption{Slope verification table for contamination versus revenue.}
+\label{tab:contamination_slope_table}
+\begin{tabular}{@{}lrrrrr@{}}
+\toprule
+Term & Coef. & Std. Err. & $t$ & $p>|t|$ & 95\% CI \\
+\midrule
+Intercept & 348,823.41 & 784.29 & 444.77 & $<10^{-99}$ & $[347{,}264.96,\,350{,}381.86]$ \\
+$\alpha$ & $-90{,}140.53$ & 1,466.90 & $-61.45$ & $4.27\times10^{-77}$ & $[-93{,}053.38,\,-87{,}227.68]$ \\
+\midrule
+HC1 robust check ($\alpha$) & $-90{,}140.53$ & 2,185.22 & $-41.25$ & $1.42\times10^{-61}$ & -- \\
+\bottomrule
+\end{tabular}
+\end{table}
+
+Interpreted on the contamination grid, a $+0.1$ increase in $\alpha$ corresponds to an average revenue decrease of about $9{,}014$ units, and the robust check preserves both direction and significance.
+% TODO: add a compact proposal note for re-running tests with statsmodels in the appendix methodology notes.

 \subsubsection{Large Scale Factorial Training}

@@ -54,32 +79,33 @@ In our complete training runs we logged $\approx 180$ days of net compute time.

 \begin{figure}[ht]
    \centering
-    \input{chapters/figures/results/includes/final/final_focus_revenue_by_alpha.tex}
+    \input{chapters/figures/results/includes/final_focus_revenue_by_alpha.tex}
    \caption{Revenue curves by contamination for the final cohort. The baseline remains above the defended curve in most cells, but the gap narrows in the high-contamination region.}
    \label{fig:final_focus_revenue_by_alpha}
 \end{figure}
-% TODO: we need a similar plot which shows the COI preserved (what we gain across teh multiple conatmination leves, showing that the robust method has better COI optimization.)

 \begin{figure}[ht]
    \centering
-    \input{chapters/figures/results/includes/final/final_focus_revenue_delta.tex}
-    \caption{Defended-minus-baseline revenue delta over contamination for the final cohort. The strongest high-contamination deviation begins at $\alpha=0.7$, followed by recovery toward near parity by $\alpha=1.0$.}
-    \label{fig:final_focus_revenue_delta}
+    \input{chapters/figures/results/includes/final_focus_coi_by_alpha.tex}
+    \caption{COI level curves by contamination for the final cohort. The shaded band marks the per-$\alpha$ gap between defended and baseline policies.}
+    \label{fig:final_focus_coi_by_alpha}
 \end{figure}

 \begin{figure}[ht]
    \centering
-    \input{chapters/figures/results/includes/final/final_focus_risk_deltas.tex}
-    \caption{Defended-minus-baseline leakage and volatility deltas for the final cohort. Leakage remains lower for the defended policy across the full contamination range.}
-    \label{fig:final_focus_risk_deltas}
+    \input{chapters/figures/results/includes/final_focus_coi_preservation_grid.tex}
+    \caption{COI preservation by product count at the contamination endpoints ($\alpha=0.0$ and $\alpha=1.0$). Bars report defended-minus-baseline mean COI level, with the zero line separating preservation from erosion.}
+    \label{fig:final_focus_coi_preservation_grid}
 \end{figure}

+
+
 \subsection{Interpretation and Insights}
-The Mann-Whitney result ($p<0.001$) confirms that per-session divergence gaps distinguish the two actor classes with near-zero overlap in rank ordering. This is the condition required for distinguishability to act as a useful control signal in the pricing loop rather than just an auxiliary classifier score.
+The Mann--Whitney result ($p<0.001$) confirms that per-session divergence gaps distinguish the two actor classes with near-zero overlap in rank ordering. This is the condition required for distinguishability to act as a useful control signal in the pricing loop rather than just an auxiliary classifier score. This result bears directly on our first pillar \hyperlink{sq1}{\textbf{SQ1}} (\textit{Distinguishability}) from \cref{sec:research_questions}.

 The first calibration and paired benchmark runs additionally confirm three practical points aligned with the thesis. First, the control loop is reproducible end-to-end (training, evaluation, artifact generation) across algorithms and contamination levels. Second, policy class materially changes price trajectories and resulting COI/revenue profiles under identical environment settings. Third, objective improvements from robustness are regime-dependent in the current baseline, which is consistent with the thesis claim that contamination-aware pricing needs explicit calibration rather than a one-size-fits-all penalty.

-We also note that maximizing revenue in isolation can favor aggressive high-price behavior; even in these early runs, the non-robust aggregate shows slightly higher mean COI and margin. For this reason, all subsequent reporting in this thesis is interpreted on a multi-metric basis (objective, revenue, COI, and stability), and not by revenue alone.
+We also note that maximizing revenue in isolation can favor aggressive high-price behavior. Even in our early runs, the non-robust aggregate shows slightly higher mean COI and margin. For this reason, all subsequent reporting in this thesis is interpreted on a multi-metric basis (objective, revenue, COI, and stability), and not by revenue alone. This speaks directly to our third pillar \hyperlink{sq3}{\textbf{SQ3}} (\textit{Robust Mitigation}) from \cref{sec:research_questions}.


 \subsection{Anomalies}
--- a/paper/src/chapters/05-discussion.tex
+++ b/paper/src/chapters/05-discussion.tex
@@ -1,19 +1,26 @@
 \section{Discussion}
+\label{sec:discussion}

+% TODO: GDPR here


 \subsection{Transition to Agentic Market Microstructure}

-Our analysis of the interaction dynamics between the platform and non-human actors suggests that the current static pricing models are insufficient for an agent-mediated economy. If we assume a transition toward a direct revelation mechanism, where actors must reveal their true valuation of a good through bidding dynamics, we inevitably introduce significant stochasticity into the pricing system. Unlike traditional e-commerce where prices are relatively sticky, such a mechanism implies a high volatility characteristic of financial equity markets (without the fungability however).
-
-However, ecommerce commodities differ fundamentally from financial securities: they possess a hard floor defined by unit economics and reservation prices. The market might react enthusiastically to an iPhone priced at \$1, such a transaction is not permissible. The platform must establish an initial valuation anchor ($P_{0}$) defined by the marginal cost plus a target margin, around which the market price is permitted to fluctuate. We float the introduction of GenAI Agents as Institutional Market Makers. As the arms race for greater autonomy of agnetic systems grows, the commercial viability of AI agents has the potential to disseminate into every-day users directly interacting with them rather than e-commerce platforms. This is also under the assumption of expected transactional capabilities being given to AI Agents.
+Our analysis of the interaction dynamics between the platform and non-human actors suggests that the current static pricing models are insufficient for an agent-mediated economy. If we assume a transition toward a direct revelation mechanism, where actors must reveal their true valuation of a good through bidding dynamics, we inevitably introduce significant stochasticity into the pricing system. Unlike traditional e-commerce where prices are relatively sticky, such a mechanism implies a high volatility characteristic of financial equity markets (without the fungibility however).

+However, e-commerce commodities differ fundamentally from financial securities: they possess a hard floor defined by unit economics and reservation prices. The market might react enthusiastically to an iPhone priced at \$1. Such a transaction is not permissible. The platform must establish an initial valuation anchor ($P_0$) defined by the marginal cost plus a target margin, around which the market price is permitted to fluctuate.

+We float the introduction of GenAI Agents as Institutional Market Makers. As the arms race for greater autonomy of agentic systems grows, the commercial viability of AI agents has the potential to disseminate into everyday users directly interacting with them rather than e-commerce platforms. This is also under the assumption of expected transactional capabilities being given to AI Agents.

 \subsection{Risk Assessment and Limitations}
+\label{sec:limitations_risks}

-This technology does not come without a more bitter side, ethical concerns do arise from the idea of deploying black-box like solutions to set prices based on a behavioral attributes. Approaches like universal behavioral profile modeling (UBPM) used in recommendation systems is very broadly utilized.
+Behavior-based pricing raises predictable ethics questions when models are opaque: a behavioral profile can become a basis for price discrimination or exclusion if deployed without governance. Universal behavioral profile modeling (UBPM) in recommendation already shows how fine-grained traces enable strong personalization. The same machinery applied to prices needs guardrails.

-With a system like this there is potential for strong drift given the rapid advance of agentic systems and user preference. Our intent behind adding the UX term into the reward shaping process was to further address the risk of degraded user experience. Looking deeper at the underlying methodology, reinforcement learning does not come without it's complications such as reward hacking and often the lack of intepretability which is quite critical in systems that have a strong impact on the revenue of a company.
+
+We balance human and agent sessions near one-to-one so cohorts are comparable despite different population sizes. The row-level dataset still contains thousands of events.
+
+% Rapid change in agent capabilities and user expectations induces model drift; the UX term in reward shaping was included partly to penalize policies that sacrifice legitimate users for short-run revenue. Reinforcement learning adds its own risks---reward hacking and limited interpretability---which matter when policies touch live revenue; deployment would require monitoring and constraints beyond what we exercised here.
+With the exponential growth in the capability of agents as well as in user expectations, a degree of model drift is expected in this setting. The computational requirements for continuous extraction of margin, as demonstrated by our work, are dictated by the persistent speed of the market. Reinforcement learning that sacrifices legitimate user experience for short-run revenue does not hold up in the long run. Reward hacking, to which pricing algorithms are not impervious due to their limited interpretability, is a significant risk for a company if live revenue is in play. Deployment requires consistent monitoring and constraints beyond what was done as an exercise in this work.

 % \subsection{Implications of Findings} Interpretation of results and altenrative scenarios with broader market implications.
--- a/paper/src/chapters/06-conclusion.tex
+++ b/paper/src/chapters/06-conclusion.tex
@@ -1,24 +1,27 @@
 \section{Conclusion}
+\label{sec:conclusion}

-Our research has explored how reinforcement learning works within pricing systems and environments which are substantially disrupted by an adversarial participant. Our findings include the optimization for our newly introduced metrics.
+This thesis examined reinforcement-learning policies for dynamic pricing when a fraction of traffic is orchestrated by non-human agents intent on extracting information before purchase. We introduced COI-oriented metrics, a behavioral distinguishability layer, and a distributionally robust training loop; empirical runs show where robustness helps and where it must be tuned.

 \subsection{Summary of contributions}
-The contribution was not without the advice of many experienced experts in the field. We thank Marco Casalaina VP Products, Core AI and AI Futurist at Microsoft for the initial critical discussion on the topic of dynamic pricing systems and the spark which has lead to this work. Eugene Bykovets, PhD pointing out the parallels in blockchain systems and the complexity of anonymous interaction and understanding of intent. Importantly, the contributions of Alberto Martín Izquierdo, my academic advisor for the support over and for taking on the challenge of this ambitious work. Many breakthroughs were thanks to numerous discussions with my peers on the topics covered here.
-A thanks to the head of innovation at Amadeus for insight into the industry split on the topic of collapsing margins. Finally we acknowledge the power and use of generative AI technologies for in depth research, rapid prototyping and surfacing of key topics and niches.
+Our work has yielded a broad set of dependencies which we carefully orchestrated to give us measurable results. To give a clear picture we outline the specific contributions of each stage of our work. The theoretical component formalizes why agent-mediated reconnaissance erodes pricing power, the behavioral component establishes that such contamination is detectable from interaction traces alone, the control component translates that distinguishability into a robust pricing mechanism, and the systems component provides the controlled experimental environment required to observe, test, and reproduce these effects.

-Now we very explicitly mention what we contribute in this paper:
 \begin{itemize}
-    \item TPU-accelerated parallelization of the behavioral simulation and reinforcement learning pipeline, making large-scale factorial sweeps tractable.
+    \item TPU-accelerated parallelization of the behavioral simulation and reinforcement learning pipeline, making large factorial sweeps tractable.
    \item Formalization of non-human transaction orchestration in e-commerce as a distinct source of contamination in dynamic pricing systems.
-    \item Definition of the Cost of Information (COI) as a mechanism-level quantity for pricing power, together with a theorem showing its erosion under increasing agent saturation.
-    \item Design and implementation of a controlled e-commerce research platform, built on a hybrid Kappa-Lambda architecture, for collecting and replaying high-fidelity interaction trajectories.
-    \item Construction and empirical validation of a behavioral distinguishability framework that distinguishes human and agent sessions from interaction signals alone using transition kernels and KL-based divergence.
-    \item Development of a generative contamination mechanism that injects learned agent behavior into the pricing environment for controlled robustness experiments.
-    \item Translation of behavioral distinguishability into a defensive pricing mechanism through a distributionally robust reinforcement learning formulation of pricing under non-stationary contamination.
-    \item Empirical evidence that agent contamination reduces revenue and that robustness is condition-dependent, requiring explicit calibration rather than a one-size-fits-all penalty.
-    \item Release of a reusable public experimental artifact for reproducing and extending research on dynamic pricing under agent-mediated traffic.
+    \item Definition of the Cost of Information (COI) as a mechanism-level quantity for pricing power, together with a theorem on its erosion under increasing agent saturation.
+    \item Design and implementation of a controlled e-commerce research platform on a hybrid Kappa--Lambda architecture for collecting and replaying high-fidelity interaction trajectories.
+    \item Construction and empirical validation of a behavioral distinguishability framework that separates human and agent sessions from interaction signals alone using transition kernels and KL-based divergence.
+    \item A generative contamination mechanism that injects learned agent behavior into the pricing environment for controlled robustness experiments.
+    \item Translation of distinguishability scores into defensive pricing via distributionally robust reinforcement learning under non-stationary contamination.
+    \item Evidence that contamination depresses revenue and that robustness gains are regime-dependent, so penalties and radii need calibration rather than a single default.
+    \item Release of a public experimental artifact (code and dataset) for reproducing and extending work on agent-mediated traffic.
 \end{itemize}

-\subsection{Future Works and Next Steps}
+\subsection{Limitations and future work}

-During the eights months of research dedicated to this work, a plethora of opportunities and industry gaps was identified, sadly a majority of which could not be addressed directly.
+Several constraints are intentional and could be relaxed later. Action weights in the demand proxy are currently derived from simple divergence rankings; learning them from data is an obvious next step. We propose to jointly learn the demand proxy, policy, and simulator parameters instead of treating them modularly. Another avenue we could not cover in this work is incorporating Bayesian methods to better capture demand uncertainty and propagate that uncertainty into reward systems.
+The Stackelberg interface assumes a clean alternation between platform move and market response. Richer histories (multi-agent, multi-platform) would need a less rigid state definition. Non-perishable catalog supply in the simulator widens the sim-to-real gap for inventory-constrained domains. Within-session contamination is modeled as stable; time-varying $\alpha$ inside a session would better match some attack patterns.
+
+Before any deployment, human baselines should grow beyond the convenience sample used here, catalog scaling laws should be re-checked when transition matrices grow with SKU count, and the full pipeline should be re-validated under production traffic volumes, governance constraints, and product mixes.
+We conclude our work with enthusiasm for future developments in the field of agent-mediated commerce; we are excited to provide the foundations for these developments and hope to see future work in a similar spirit.
--- a/paper/src/chapters/acknowledgements.tex
+++ b/paper/src/chapters/acknowledgements.tex
@@ -1,3 +1,7 @@
-\section{Acknowledgements}
+\section*{Acknowledgements}

-Eugene Bykovets, PhD - ETH
+This research was supported by the TPU Research Cloud program, which provided access to Google Cloud Tensor Processing Unit (TPU) accelerators, including TPU v4, v5e, and v6e.
+
+I am grateful to Marco Casalaina (VP of Product, Core AI, Microsoft) for an early conversation on dynamic pricing that helped frame the problem. Eugene Bykovets (Ph.D.) pointed out useful parallels with blockchain systems and the difficulty of inferring intent under pseudonymity. Alberto Mart\'{i}n Izquierdo supervised this work and accepted an unusually wide brief. Several peers contributed through discussion of the topics covered here. The head of innovation at Amadeus offered industry perspective on margin compression under automation.
+
+Generative tools were used for literature search, prototyping, and drafting support; all claims, experiments, and final wording remain the author's responsibility.
--- a/paper/src/chapters/figures/experiment_design_tree.tex
+++ b/paper/src/chapters/figures/experiment_design_tree.tex
@@ -0,0 +1,36 @@
+% Horizontal tree: level distance must exceed ~half parent + half child width or nodes overlap (resizebox does not fix that).
+\begin{tikzpicture}[
+  grow=right,
+  level distance=30mm,
+  sibling distance=23mm,
+  decision/.style={
+    rectangle,
+    draw,
+    rounded corners=1.5pt,
+    align=center,
+    inner sep=1.2pt,
+    minimum width=14mm,
+    minimum height=4.8mm,
+    font=\scriptsize,
+  },
+  leaf/.style={
+    rectangle,
+    draw,
+    align=center,
+    inner sep=1.2pt,
+    text width=19mm,
+    minimum height=4mm,
+    font=\scriptsize,
+  },
+  edge from parent/.style={draw, -{Latex[length=1.2mm]}},
+]
+\node[decision] {Participant}
+  child {
+    node[decision] {Platform: Hotel}
+      child {node[leaf] {Task sampled\\from hotel pool}}
+  }
+  child {
+    node[decision] {Platform: Airline}
+      child {node[leaf] {Task sampled\\from airline pool}}
+  };
+\end{tikzpicture}
--- a/paper/src/chapters/figures/results/generated/final/final_focus_alpha_deltas.csv
+++ b/paper/src/chapters/figures/results/generated/final/final_focus_alpha_deltas.csv
@@ -3,7 +3,7 @@ alpha,revenue_delta,revenue_delta_pct,reward_delta,reward_delta_pct,volatility_d
 0.1,-14962.041501283413,-4.410637208586118,-14303.760282736213,-4.531344436782669,0.0011858665298920962,0.0,-0.004133727080174038
 0.2,-16153.416666167905,-4.826514761457546,-15398.621298776357,-4.9418165571901715,0.00200624274016295,0.0,-0.0033201883450373615
 0.3,-17294.9275360335,-5.382423616385397,-16544.91845114401,-5.533399709364953,-0.0011022484400295268,0.0,-0.0029151149203366505
-0.4,-19661.294346174283,-6.250307313590199,-18728.35578200908,-6.3953153560217535,3.582812967113658e-05,0.0,-0.0038123361988749577
+0.4,-19543.8750398212,-6.215299839915013,-18613.487687777204,-6.35858461426586,-2.7530592947980215e-05,0.0,-0.0038561140856475523
 0.5,-16411.03168918495,-5.3630681206030015,-15638.77510066732,-5.4888928630525315,0.00015428950526953644,0.0,-0.00439661338956944
 0.6,-14729.668247641937,-5.069964928178309,-13912.22417824401,-5.148827377884945,-0.002735776807082743,0.0,-0.004310129386364658
 0.7,-21160.81910514756,-7.351404104505076,-20171.762105623755,-7.525169314210056,-0.0008903632602569461,0.0,-0.0026198461183787186
--- a/paper/src/chapters/figures/results/generated/final/final_focus_alpha_mode_summary.csv
+++ b/paper/src/chapters/figures/results/generated/final/final_focus_alpha_mode_summary.csv
@@ -7,7 +7,7 @@ alpha,mode,runs,revenue_mean,reward_mean,supra_mean,volatility_mean,coi_leakage_
 0.2,defended,35,318527.35122792586,296199.77820822067,0.0,0.07048630468445288,0.11265850300394666,137.2758153292305
 0.3,baseline,30,321322.30327214615,299000.9636054795,0.0,0.07085669473747759,0.11527347603412934,136.4452630715689
 0.3,defended,44,304027.37573611265,282456.0451543355,0.0,0.06975444629744806,0.11235836111379269,136.4704115371568
-0.4,baseline,33,314565.2423109539,292844.914432166,0.0,0.07031811881503117,0.11300307992768284,136.72547178046122
+0.4,baseline,33,314447.8230046008,292730.04633793415,0.0,0.07038147753765028,0.11304685781445543,136.70817144219887
 0.4,defended,38,294903.9479647796,274116.55865015695,0.0,0.0703539469447023,0.10919074372880788,136.75671002806396
 0.5,baseline,33,306000.80625751516,284916.7489847879,0.0,0.06938663916591635,0.11118137138243217,136.9528780620641
 0.5,defended,35,289589.7745683302,269277.9738841206,0.0,0.06954092867118589,0.10678475799286273,136.65018588845163
--- a/paper/src/chapters/figures/results/generated/final/final_focus_coi_preservation_grid.csv
+++ b/paper/src/chapters/figures/results/generated/final/final_focus_coi_preservation_grid.csv
@@ -0,0 +1,45 @@
+alpha,n_products,baseline_runs,defended_runs,baseline_coi_level_mean,defended_coi_level_mean,coi_preserved,coi_preserved_pct
+0.0,5.0,9,10,137.060822623968,136.18680853180368,-0.874014092164316,-0.6376833842316922
+0.0,25.0,9,2,137.114858903596,136.13793579187393,-0.9769231117220727,-0.7124852255501622
+0.0,50.0,9,11,137.16224858153575,136.92415566181484,-0.23809291972091273,-0.17358487643878118
+0.0,100.0,9,12,135.86629045322655,137.3609873086303,1.4946968554037596,1.1001234010420895
+0.1,5.0,3,6,136.59581715538818,135.6308466787041,-0.9649704766840728,-0.7064421859904723
+0.1,25.0,11,8,135.9860669350444,136.43616365263273,0.45009671758833747,0.33098737814318313
+0.1,50.0,10,11,136.28362874897243,136.92880179422633,0.6451730452538982,0.4734046570203046
+0.1,100.0,8,8,137.35578496752095,137.53394777402949,0.17816280650853855,0.12970899372797937
+0.2,5.0,8,9,135.55116314329388,137.30311388107864,1.7519507377847674,1.2924645551973204
+0.2,25.0,10,9,137.01587649612287,137.22137163685403,0.20549514073115915,0.1499790724887083
+0.2,50.0,4,8,137.45096138958434,137.1307018163465,-0.32025957323784837,-0.2329991511155169
+0.2,100.0,9,9,137.50780776750915,137.43195025898902,-0.07585750852013007,-0.0551659645744523
+0.3,5.0,6,6,134.95569459599133,134.21855668602896,-0.7371379099623709,-0.5462073402453271
+0.3,25.0,9,16,136.38346021911525,136.32131251342705,-0.06214770568820427,-0.04556835967378819
+0.3,50.0,8,6,136.97414077213367,136.88041560990786,-0.09372516222580884,-0.06842544271310845
+0.3,100.0,7,16,137.19706520314455,137.31020460277784,0.11313939963329744,0.08246488324351146
+0.4,5.0,8,11,135.6494813257779,136.5487738152141,0.899292489436192,0.6629531352769695
+0.4,25.0,7,9,136.38451372914378,136.10614648175604,-0.27836724738773455,-0.20410473284420322
+0.4,50.0,7,10,137.12976275807247,136.98838321468799,-0.14137954338448822,-0.10309909427460566
+0.4,100.0,11,8,137.4158065068933,137.4849148270489,0.06910832015560686,0.050291390715769026
+0.5,5.0,7,19,135.91101413475477,136.145621134976,0.2346070002212457,0.1726180925915501
+0.5,25.0,8,7,137.0972914279529,137.35620682163616,0.2589153936832531,0.18885522170896996
+0.5,50.0,8,1,137.0714841014652,135.66696334266234,-1.404520758802846,-1.0246629837050352
+0.5,100.0,10,8,137.4717672869487,137.35366167964338,-0.11810560730532416,-0.08591262746975456
+0.6,5.0,8,13,133.13626070539635,136.09936023073067,2.9630995253343144,2.225614201296411
+0.6,25.0,5,10,136.0741624588533,136.26219778039936,0.18803532154606728,0.13818591137970535
+0.6,50.0,8,10,135.09036188289087,136.05846380616936,0.968101923278482,0.7166328595060871
+0.6,100.0,7,8,137.29304001584052,137.07512338179083,-0.2179166340496863,-0.15872372993164377
+0.7,5.0,7,7,136.0533783988379,135.14350016006424,-0.9098782387736719,-0.6687656341075052
+0.7,25.0,8,11,137.12781750399415,136.8176582131797,-0.3101592908144539,-0.2261826203172962
+0.7,50.0,14,11,137.06965735909125,136.7028634119364,-0.3667939471548607,-0.26759674914335285
+0.7,100.0,11,11,137.48279078937205,137.09121810549402,-0.39157268387802446,-0.28481578067317975
+0.8,5.0,4,7,135.3095773096514,136.59715728802078,1.2875799783693935,0.9515808148766959
+0.8,25.0,12,13,136.93488398652164,135.73319876476054,-1.201685221761096,-0.8775596011600497
+0.8,50.0,6,8,136.4704324290659,136.86568018140107,0.39524775233516607,0.289621528487943
+0.8,100.0,4,11,137.519864039095,137.4763376137669,-0.04352642532811046,-0.03165100957032396
+0.9,5.0,5,5,134.77024204025943,136.6651608019597,1.8949187617002679,1.4060364758669837
+0.9,25.0,9,13,136.7554042236364,136.06108143100832,-0.6943227926280713,-0.507711411164888
+0.9,50.0,10,12,136.08715955450202,137.07569864767092,0.988539093168896,0.7264014447836223
+0.9,100.0,11,9,137.57053132642514,137.30115968842037,-0.2693716380047704,-0.19580620602940735
+1.0,5.0,5,7,136.43177888041947,135.92674388998284,-0.5050349904366271,-0.37017401266847305
+1.0,25.0,11,9,136.7037183889911,136.22617845471228,-0.47753993427880914,-0.34932475861407586
+1.0,50.0,11,5,136.93074105866745,137.05826644845806,0.12752538979060546,0.09313130769953819
+1.0,100.0,8,9,136.4880191421812,137.41913068956546,0.9311115473842619,0.682192879079234
--- a/paper/src/chapters/figures/results/generated/final/final_focus_headline_summary.json
+++ b/paper/src/chapters/figures/results/generated/final/final_focus_headline_summary.json
@@ -1,11 +1,14 @@
 {
-  "bundle": "engine/studies/results/wandb_sweep_bundles/bundle_20260317_093826",
+  "bundle": "/home/velocitatem/Documents/Projects/PHANTOM/engine/studies/results/wandb_sweep_bundles/bundle_20260317_122818",
  "focus_cohort": "max_alpha_coverage",
+  "focus_sweep_id": "i88nw811",
+  "focus_run_count": 768,
+  "git_commit": "ace52e8e14e0f7fa96ab5eb113c0c898b0bce1a0",
  "alpha_cells": 11,
  "alpha_min": 0.0,
  "alpha_max": 1.0,
-  "mean_revenue_delta_pct": -4.787221975639986,
-  "mean_reward_delta_pct": -4.91730667541704,
+  "mean_revenue_delta_pct": -4.784039478033151,
+  "mean_reward_delta_pct": -4.913967517075595,
  "zone_summary": [
    {
      "zone": "high_alpha_0_7_plus",
@@ -18,10 +21,10 @@
    {
      "zone": "low_alpha_below_0_7",
      "alpha_cells": 7,
-      "revenue_delta_pct_mean": -5.201949225367208,
-      "reward_delta_pct_mean": -5.324947138914036,
-      "coi_leakage_delta_mean": -0.0037041938968711296,
-      "volatility_delta_mean": 0.00011102505536893832
+      "revenue_delta_pct_mean": -5.196948157699325,
+      "reward_delta_pct_mean": -5.319699890091765,
+      "coi_leakage_delta_mean": -0.003710447880695786,
+      "volatility_delta_mean": 0.00010197380928049306
    }
  ]
 }
--- a/paper/src/chapters/figures/results/generated/final/final_focus_zone_summary.csv
+++ b/paper/src/chapters/figures/results/generated/final/final_focus_zone_summary.csv
@@ -1,3 +1,3 @@
 zone,alpha_cells,revenue_delta_pct_mean,reward_delta_pct_mean,coi_leakage_delta_mean,volatility_delta_mean
 high_alpha_0_7_plus,4,-4.0614492886173466,-4.2039358642972955,-0.0018236753956396637,0.00026289072427068336
-low_alpha_below_0_7,7,-5.201949225367208,-5.324947138914036,-0.0037041938968711296,0.00011102505536893832
+low_alpha_below_0_7,7,-5.196948157699325,-5.319699890091765,-0.003710447880695786,0.00010197380928049306
--- a/paper/src/chapters/figures/results/generated/final/plots/final_focus_coi_by_alpha.pdf
+++ b/paper/src/chapters/figures/results/generated/final/plots/final_focus_coi_by_alpha.pdf
--- a/paper/src/chapters/figures/results/generated/final/plots/final_focus_coi_preservation_grid.pdf
+++ b/paper/src/chapters/figures/results/generated/final/plots/final_focus_coi_preservation_grid.pdf
--- a/paper/src/chapters/figures/results/generated/final/plots/final_focus_revenue_by_alpha.pdf
+++ b/paper/src/chapters/figures/results/generated/final/plots/final_focus_revenue_by_alpha.pdf
--- a/paper/src/chapters/figures/results/generated/final/plots/final_focus_revenue_delta.pdf
+++ b/paper/src/chapters/figures/results/generated/final/plots/final_focus_revenue_delta.pdf
--- a/paper/src/chapters/figures/results/generated/final/plots/final_focus_risk_deltas.pdf
+++ b/paper/src/chapters/figures/results/generated/final/plots/final_focus_risk_deltas.pdf
--- a/paper/src/chapters/figures/results/generated/final/revenue_alpha_diagnostics.json
+++ b/paper/src/chapters/figures/results/generated/final/revenue_alpha_diagnostics.json
@@ -0,0 +1,24 @@
+{
+  "normality": {
+    "test": "jarque_bera",
+    "available": true,
+    "statistic": 362.38850707984324,
+    "p_value": 2.0339278125496517e-79
+  },
+  "heteroskedasticity": {
+    "test": "breusch_pagan",
+    "available": true,
+    "lm_stat": 6.0366025380616275,
+    "df": 1,
+    "p_value": 0.014012224810767138
+  },
+  "influence": {
+    "max_leverage": 0.03769234230180875,
+    "mean_leverage": 0.021052631578947392,
+    "high_leverage_threshold": 0.042105263157894736,
+    "high_leverage_count": 0,
+    "max_cooks_distance": 0.29121755538277183,
+    "high_cooks_threshold": 0.042105263157894736,
+    "high_cooks_count": 6
+  }
+}
--- a/paper/src/chapters/figures/results/generated/final/revenue_alpha_filtered.csv
+++ b/paper/src/chapters/figures/results/generated/final/revenue_alpha_filtered.csv
@@ -0,0 +1,96 @@
+sweep_id,sweep_full_id,run_id,run_name,state,run_url,created_at,runtime,downloaded_files,history_rows,selected_for_clone,download_error,alpha,n_products,eta_ux,lambda_coi,baseline_mode,no_robust,study_mode,eval_revenue_mean,eval_reward_mean,eval_stress_revenue_worst,eval_stress_reward_worst,eval_supra_share_mean,eval_supra_penalty_mean,eval_volatility_mean,eval_upward_volatility_mean,eval_coi_level_mean,eval_coi_leakage_mean,objective_score,mode
+i88nw811,lusiana/capstone_tpu/i88nw811,0yph6ddt,sweep/ppo/sb3/cpu/default/a0.7/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/0yph6ddt,2026-03-15T13:48:47Z,7579.766959963,0,0,0,,0.7,100.0,0.0,0.05,True,True,baseline,285875.15518050164,266287.2051805016,274356.50146499986,255620.24146499988,0.0,0.0,0.0711188680417482,0.0,137.42722406640746,0.1099719716550294,255620.24146499988,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,bjwmxlf4,sweep/ppo/sb3/cpu/default/a0.9/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/bjwmxlf4,2026-03-15T13:48:49Z,7514.003863569,0,0,0,,0.9,100.0,0.0,0.05,True,True,baseline,267194.6114143838,248902.78141438385,258791.60782635584,241079.0878263559,0.0,0.0,0.0706779448814682,0.0,137.4716591479769,0.1060063717489262,241079.0878263559,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,afod7srx,sweep/ppo/sb3/cpu/default/a0/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/afod7srx,2026-03-15T13:48:55Z,8428.923550896,0,0,0,,0.0,100.0,0.0,0.15,True,True,baseline,331626.71399641165,307929.2839964116,301903.22363424243,278909.22363424255,0.0,0.0,0.0699106903089938,0.0,134.44341240328637,0.1239456985672444,278909.22363424255,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,czbwbw4o,sweep/ppo/sb3/cpu/default/a0.3/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/czbwbw4o,2026-03-15T13:48:55Z,8019.834460958,0,0,0,,0.3,100.0,0.0,0.05,True,True,baseline,325062.60932028474,302657.9893202848,313580.73955351143,292103.1195535114,0.0,0.0,0.0700934793925504,0.0,137.30226556155992,0.1156304945350146,292103.1195535114,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,spncr5i5,sweep/ppo/sb3/cpu/default/a0.4/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/spncr5i5,2026-03-15T13:48:57Z,7984.536208498,0,0,0,,0.4,100.0,0.0,0.3,True,True,baseline,313890.156459866,292317.566459866,301905.6061551721,281189.2661551722,0.0,0.0,0.0700585666613017,0.0,137.27393385978286,0.1140225013120235,281189.2661551722,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,9utcbgal,sweep/ppo/sb3/cpu/default/a0.6/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/9utcbgal,2026-03-15T13:48:58Z,7794.573495005,0,0,0,,0.6,100.0,0.0,0.3,True,True,baseline,296881.4938150014,276559.4338150014,282693.0664052287,263321.0864052287,0.0,0.0,0.0689497793839256,0.0,137.65459475595475,0.1116745762120893,263321.0864052287,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,6uhc0zfi,sweep/ppo/sb3/cpu/default/a0.1/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/6uhc0zfi,2026-03-15T13:48:59Z,8739.343652451,5,5000,1,,0.1,100.0,0.0,0.3,True,True,baseline,345607.36851277394,321934.388512774,330271.9018417394,307619.2418417394,0.0,0.0,0.0688978199434404,0.0,137.65927138408344,0.1180576040723697,307619.2418417394,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,mid9h16o,sweep/ppo/sb3/cpu/default/a0.3/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/mid9h16o,2026-03-15T13:48:59Z,7934.709025792,0,0,0,,0.3,100.0,0.0,0.15,True,True,baseline,321120.1030044527,298922.9430044526,312002.2572538445,290604.6972538445,0.0,0.0,0.0725338635316591,0.0,136.9642983472208,0.1152504371251349,290604.6972538445,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,hm8geh95,sweep/ppo/sb3/cpu/default/a0.3/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/hm8geh95,2026-03-15T13:49:01Z,8324.170881475,0,0,0,,0.3,100.0,0.0,0.05,True,True,baseline,321120.1030044527,298922.9430044526,312002.2572538445,290604.6972538445,0.0,0.0,0.0725338635316591,0.0,136.9642983472208,0.1152504371251349,290604.6972538445,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,2k3bx48e,sweep/ppo/sb3/cpu/default/a0.7/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/2k3bx48e,2026-03-15T13:49:03Z,7579.046562713,0,0,0,,0.7,100.0,0.0,0.3,True,True,baseline,288003.5379862045,268208.7279862045,274205.49798255006,255466.81798255,0.0,0.0,0.0732015803628115,0.0,137.25851714050424,0.1065894678006264,255466.81798255,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,mlcllxuf,sweep/ppo/sb3/cpu/default/a0.3/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/mlcllxuf,2026-03-15T15:28:13Z,8048.447950291,0,0,0,,0.3,100.0,0.0,0.05,True,True,baseline,325062.60932028474,302657.9893202848,313580.73955351143,292103.1195535114,0.0,0.0,0.0700934793925504,0.0,137.30226556155992,0.1156304945350146,292103.1195535114,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,gsx5p3xl,sweep/ppo/sb3/cpu/default/a0.7/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/gsx5p3xl,2026-03-15T15:29:00Z,7666.062008427,0,0,0,,0.7,100.0,0.0,0.3,True,True,baseline,286859.8032779717,267231.9932779717,273198.5349293896,254530.3349293896,0.0,0.0,0.0694378534785247,0.0,137.6169536272908,0.1086813731317916,254530.3349293896,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,dh2sidg0,sweep/ppo/sb3/cpu/default/a0.8/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/dh2sidg0,2026-03-15T15:31:51Z,7450.114589126,0,0,0,,0.8,100.0,0.0,0.3,True,True,baseline,277537.1135308166,258574.23353081665,260525.6140973399,242761.4740973399,0.0,0.0,0.0691119185711536,0.0,137.63850710873982,0.1055234893030045,242761.4740973399,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,izb1xfjn,sweep/ppo/sb3/cpu/default/a0.4/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/izb1xfjn,2026-03-15T15:38:35Z,8138.431632101,0,0,0,,0.4,100.0,0.0,0.05,True,True,baseline,313890.156459866,292317.566459866,301905.6061551721,281189.2661551722,0.0,0.0,0.0700585666613017,0.0,137.27393385978286,0.1140225013120235,281189.2661551722,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,h5v0bjkk,sweep/ppo/sb3/cpu/default/a1/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/h5v0bjkk,2026-03-15T15:53:08Z,7430.137394885,0,0,0,,1.0,100.0,0.0,0.05,True,True,baseline,258250.4083985968,240558.37839859675,257579.27605596423,239906.35605596425,0.0,0.0,0.0710781742010645,0.0,137.43891114039735,0.1034797519569495,239906.35605596425,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,oo9x7mtj,sweep/ppo/sb3/cpu/default/a0/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/oo9x7mtj,2026-03-15T17:08:57Z,8434.676111878,0,0,0,,0.0,100.0,0.0,0.15,True,True,baseline,331626.71399641165,307929.2839964116,301903.22363424243,278909.22363424255,0.0,0.0,0.0699106903089938,0.0,134.44341240328637,0.1239456985672444,278909.22363424255,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,2tnqjvsr,sweep/ppo/sb3/cpu/default/a0.2/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/2tnqjvsr,2026-03-15T17:10:41Z,8326.316856098,0,0,0,,0.2,100.0,0.0,0.3,True,True,baseline,333463.32883383776,310606.38883383776,322375.37087837915,300349.6308783791,0.0,0.0,0.0694238399850746,0.0,137.6206723870474,0.1176551945750585,300349.6308783791,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,uwl4b1t4,sweep/ppo/sb3/cpu/default/a0.6/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/uwl4b1t4,2026-03-15T17:11:41Z,7730.138244902,0,0,0,,0.6,100.0,0.0,0.15,True,True,baseline,293934.0132863448,273673.5532863448,278235.2158621181,259045.3158621181,0.0,0.0,0.0702286844227449,0.0,137.02187396075487,0.1108792101893818,259045.3158621181,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,mq08631s,sweep/ppo/sb3/cpu/default/a0.7/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/mq08631s,2026-03-15T17:11:46Z,7830.903683379,0,0,0,,0.7,100.0,0.0,0.3,True,True,baseline,286859.8032779717,267231.9932779717,273198.5349293896,254530.3349293896,0.0,0.0,0.0694378534785247,0.0,137.6169536272908,0.1086813731317916,254530.3349293896,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,oenf81vs,sweep/ppo/sb3/cpu/default/a0.9/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/oenf81vs,2026-03-15T17:14:03Z,7571.420325966,0,0,0,,0.9,100.0,0.0,0.15,True,True,baseline,268129.28805568966,249777.98805568964,259354.03651639624,241657.8165163962,0.0,0.0,0.0692141212557269,0.0,137.56737533812094,0.1028102128114812,241657.8165163962,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,imvig8ea,sweep/ppo/sb3/cpu/default/a0.9/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/imvig8ea,2026-03-15T17:26:17Z,7548.356923917,0,0,0,,0.9,100.0,0.0,0.05,True,True,baseline,269095.26288012683,250709.3028801269,257985.06236888352,240343.2023688835,0.0,0.0,0.0687681637998595,0.0,137.63174822647662,0.1040919495927453,240343.2023688835,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,kc46mwot,sweep/ppo/sb3/cpu/default/a0.9/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/kc46mwot,2026-03-15T17:36:54Z,7402.437478922,0,0,0,,0.9,100.0,0.0,0.3,True,True,baseline,269095.26288012683,250709.3028801269,257985.06236888352,240343.2023688835,0.0,0.0,0.0687681637998595,0.0,137.63174822647662,0.1040919495927453,240343.2023688835,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,6c5g20m0,sweep/ppo/sb3/cpu/default/a0.4/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/6c5g20m0,2026-03-15T17:39:15Z,7987.751960449,0,0,0,,0.4,100.0,0.0,0.05,True,True,baseline,314792.9405088838,293199.96050888376,304000.02795477153,283160.5079547715,0.0,0.0,0.0706474903672308,0.0,137.54347765167836,0.1134114537317883,283160.5079547715,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,zmfirgme,sweep/ppo/sb3/cpu/default/a0.6/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/zmfirgme,2026-03-15T17:39:38Z,7729.43292327,0,0,0,,0.6,100.0,0.0,0.3,True,True,baseline,296881.4938150014,276559.4338150014,282693.0664052287,263321.0864052287,0.0,0.0,0.0689497793839256,0.0,137.65459475595475,0.1116745762120893,263321.0864052287,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,5w978f6n,sweep/ppo/sb3/cpu/default/a0.2/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/5w978f6n,2026-03-15T17:42:23Z,8196.563842857,0,0,0,,0.2,100.0,0.0,0.3,True,True,baseline,328662.28105387173,305848.95105387166,316489.4913151873,294621.8913151873,0.0,0.0,0.0726481757500429,0.0,136.60489081120323,0.115056283050696,294621.8913151873,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,v6yuq532,sweep/ppo/sb3/cpu/default/a0.3/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/v6yuq532,2026-03-15T18:27:32Z,8171.524047551,0,0,0,,0.3,100.0,0.0,0.3,True,True,baseline,325536.3728999571,303203.77289995714,311530.19009115506,290169.93009115505,0.0,0.0,0.0690101249418158,0.0,137.57976469566975,0.115140125484157,290169.93009115505,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,wzs4h708,sweep/ppo/sb3/cpu/default/a1/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/wzs4h708,2026-03-15T18:44:40Z,7213.500579862,0,0,0,,1.0,100.0,0.0,0.3,True,True,baseline,258250.4083985968,240558.37839859675,257579.27605596423,239906.35605596425,0.0,0.0,0.0710781742010645,0.0,137.43891114039735,0.1034797519569495,239906.35605596425,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,drjegsa8,sweep/ppo/sb3/cpu/default/a0.8/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/drjegsa8,2026-03-15T18:53:51Z,7642.750902648,0,0,0,,0.8,100.0,0.0,0.05,True,True,baseline,278042.9708277731,258987.21082777312,265119.53279206343,246979.39279206347,0.0,0.0,0.069699479796535,0.0,137.47635104131075,0.1063946886684759,246979.39279206347,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,np3fvzwt,sweep/ppo/sb3/cpu/default/a0.9/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/np3fvzwt,2026-03-15T18:57:50Z,7300.325366337,0,0,0,,0.9,100.0,0.0,0.3,True,True,baseline,269095.26288012683,250709.3028801269,257985.06236888352,240343.2023688835,0.0,0.0,0.0687681637998595,0.0,137.63174822647662,0.1040919495927453,240343.2023688835,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,kk0sqa97,sweep/ppo/sb3/cpu/default/a0.1/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/kk0sqa97,2026-03-15T19:06:17Z,8525.177181009,0,0,0,,0.1,100.0,0.0,0.3,True,True,baseline,341404.1205957663,317885.0305957663,329505.50925893825,306817.3492589383,0.0,0.0,0.0685274095002656,0.0,137.33021724658855,0.1206998447923596,306817.3492589383,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,i0rpx1kf,sweep/ppo/sb3/cpu/default/a0.2/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/i0rpx1kf,2026-03-15T19:20:36Z,8356.73493734,0,0,0,,0.2,100.0,0.0,0.05,True,True,baseline,333463.32883383776,310606.38883383776,322375.37087837915,300349.6308783791,0.0,0.0,0.0694238399850746,0.0,137.6206723870474,0.1176551945750585,300349.6308783791,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,lqmaq5g2,sweep/ppo/sb3/cpu/default/a1/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/lqmaq5g2,2026-03-15T20:02:28Z,7470.274064026,0,0,0,,1.0,100.0,0.0,0.05,True,True,baseline,246584.29279154172,229303.12279154177,244564.78814724492,227386.888147245,0.0,0.0,0.0692074374069363,0.0,135.2844805658817,0.1093837602765936,227386.888147245,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,2umearxm,sweep/ppo/sb3/cpu/default/a0.5/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/2umearxm,2026-03-15T20:09:56Z,7829.406313163,0,0,0,,0.5,100.0,0.0,0.3,True,True,baseline,303325.5596877454,282520.29968774534,291965.65710567136,271937.69710567134,0.0,0.0,0.0686525035124021,0.0,137.57073544790862,0.1132342695408356,271937.69710567134,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,k7pirqxy,sweep/ppo/sb3/cpu/default/a1/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/k7pirqxy,2026-03-15T20:33:53Z,7216.626889631,0,0,0,,1.0,100.0,0.0,0.15,True,True,baseline,254537.24517731377,236935.99517731369,254471.2696855663,236912.16968556636,0.0,0.0,0.0703905833083271,0.0,136.6143424312229,0.1038838810036006,236912.16968556636,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,algnjce4,sweep/ppo/sb3/cpu/default/a0.6/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/algnjce4,2026-03-15T20:54:24Z,7739.30650029,0,0,0,,0.6,100.0,0.0,0.05,True,True,baseline,296881.4938150014,276559.4338150014,282693.0664052287,263321.0864052287,0.0,0.0,0.0689497793839256,0.0,137.65459475595475,0.1116745762120893,263321.0864052287,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,vqe2dmcq,sweep/ppo/sb3/cpu/default/a0.4/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/vqe2dmcq,2026-03-15T21:08:22Z,7815.774646473,0,0,0,,0.4,100.0,0.0,0.05,True,True,baseline,316543.04043212667,294899.01043212664,299980.59649797506,279386.7564979751,0.0,0.0,0.067603468946279,0.0,137.7846896269947,0.1128739206843639,279386.7564979751,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,0xlvpawh,sweep/ppo/sb3/cpu/default/a0.3/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/0xlvpawh,2026-03-15T21:16:04Z,7997.68392245,0,0,0,,0.3,100.0,0.0,0.15,True,True,baseline,325062.60932028474,302657.9893202848,313580.73955351143,292103.1195535114,0.0,0.0,0.0700934793925504,0.0,137.30226556155992,0.1156304945350146,292103.1195535114,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,bofuxayn,sweep/ppo/sb3/cpu/default/a0.7/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/bofuxayn,2026-03-15T21:18:05Z,7486.102336723,0,0,0,,0.7,100.0,0.0,0.05,True,True,baseline,285875.15518050164,266287.2051805016,274356.50146499986,255620.24146499988,0.0,0.0,0.0711188680417482,0.0,137.42722406640746,0.1099719716550294,255620.24146499988,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,rujnezt7,sweep/ppo/sb3/cpu/default/a0.5/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/rujnezt7,2026-03-15T21:20:23Z,7936.01356938,0,0,0,,0.5,100.0,0.0,0.15,True,True,baseline,305342.590984541,284402.02098454104,287794.11179162114,267934.8717916211,0.0,0.0,0.0698329564541014,0.0,137.34875112178105,0.1110975441706762,267934.8717916211,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,f9e6wtv0,sweep/ppo/sb3/cpu/default/a0.7/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/f9e6wtv0,2026-03-15T22:07:04Z,8030.825365422,0,0,0,,0.7,100.0,0.0,0.05,True,True,baseline,288003.5379862045,268208.7279862045,274205.49798255006,255466.81798255,0.0,0.0,0.0732015803628115,0.0,137.25851714050424,0.1065894678006264,255466.81798255,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,r8hsz3ko,sweep/ppo/sb3/cpu/default/a0.7/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/r8hsz3ko,2026-03-15T22:13:06Z,7691.998775531,0,0,0,,0.7,100.0,0.0,0.3,True,True,baseline,286859.8032779717,267231.9932779717,273198.5349293896,254530.3349293896,0.0,0.0,0.0694378534785247,0.0,137.6169536272908,0.1086813731317916,254530.3349293896,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,yukg46hv,sweep/ppo/sb3/cpu/default/a1/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/yukg46hv,2026-03-15T23:03:27Z,7094.861108483,0,0,0,,1.0,100.0,0.0,0.15,True,True,baseline,254537.24517731377,236935.99517731369,254471.2696855663,236912.16968556636,0.0,0.0,0.0703905833083271,0.0,136.6143424312229,0.1038838810036006,236912.16968556636,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,e5tciezz,sweep/ppo/sb3/cpu/default/a0.7/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/e5tciezz,2026-03-16T00:16:08Z,7569.145925588,0,0,0,,0.7,100.0,0.0,0.05,True,True,baseline,285875.15518050164,266287.2051805016,274356.50146499986,255620.24146499988,0.0,0.0,0.0711188680417482,0.0,137.42722406640746,0.1099719716550294,255620.24146499988,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,1rop5sf9,sweep/ppo/sb3/cpu/default/a0.3/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/1rop5sf9,2026-03-16T00:21:00Z,8354.617713686,0,0,0,,0.3,100.0,0.0,0.05,True,True,baseline,321120.1030044527,298922.9430044526,312002.2572538445,290604.6972538445,0.0,0.0,0.0725338635316591,0.0,136.9642983472208,0.1152504371251349,290604.6972538445,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,7muxpseb,sweep/ppo/sb3/cpu/default/a0.2/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/7muxpseb,2026-03-16T00:21:21Z,8514.602541985,0,0,0,,0.2,100.0,0.0,0.05,True,True,baseline,333463.32883383776,310606.38883383776,322375.37087837915,300349.6308783791,0.0,0.0,0.0694238399850746,0.0,137.6206723870474,0.1176551945750585,300349.6308783791,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,304dyypp,sweep/ppo/sb3/cpu/default/a0.4/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/304dyypp,2026-03-16T00:37:04Z,7949.736292204,0,0,0,,0.4,100.0,0.0,0.3,True,True,baseline,313890.156459866,292317.566459866,301905.6061551721,281189.2661551722,0.0,0.0,0.0700585666613017,0.0,137.27393385978286,0.1140225013120235,281189.2661551722,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,zbw7nmeo,sweep/ppo/sb3/cpu/default/a0.1/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/zbw7nmeo,2026-03-16T00:53:02Z,8423.598177489,0,0,0,,0.1,100.0,0.0,0.05,True,True,baseline,340941.7898046945,317438.6698046944,328185.5337341634,305593.15373416344,0.0,0.0,0.0709483560344898,0.0,137.21682561970587,0.1186714838821206,305593.15373416344,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,oxu7rm37,sweep/ppo/sb3/cpu/default/a0.9/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/oxu7rm37,2026-03-16T00:53:31Z,7464.830361968,0,0,0,,0.9,100.0,0.0,0.3,True,True,baseline,268129.28805568966,249777.98805568964,259354.03651639624,241657.8165163962,0.0,0.0,0.0692141212557269,0.0,137.56737533812094,0.1028102128114812,241657.8165163962,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,m78p26vk,sweep/ppo/sb3/cpu/default/a0/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/m78p26vk,2026-03-16T00:56:58Z,8717.289024041,5,1004,1,,0.0,100.0,0.0,0.15,True,True,baseline,348861.1454509751,324713.0754509751,335967.6160126648,312660.3160126648,0.0,0.0,0.0674835742466741,0.0,136.8813175598437,0.118985751213389,312660.3160126648,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,65zzmszh,sweep/ppo/sb3/cpu/default/a1/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/65zzmszh,2026-03-16T01:14:03Z,7326.553384609,0,0,0,,1.0,100.0,0.0,0.3,True,True,baseline,246584.29279154172,229303.12279154177,244564.78814724492,227386.888147245,0.0,0.0,0.0692074374069363,0.0,135.2844805658817,0.1093837602765936,227386.888147245,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,47xraqt6,sweep/ppo/sb3/cpu/default/a0.9/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/47xraqt6,2026-03-16T01:22:01Z,7299.814264453,0,0,0,,0.9,100.0,0.0,0.3,True,True,baseline,269095.26288012683,250709.3028801269,257985.06236888352,240343.2023688835,0.0,0.0,0.0687681637998595,0.0,137.63174822647662,0.1040919495927453,240343.2023688835,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,mibyt0bf,sweep/ppo/sb3/cpu/default/a0.9/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/mibyt0bf,2026-03-16T01:34:44Z,7541.153639959,0,0,0,,0.9,100.0,0.0,0.3,True,True,baseline,267194.6114143838,248902.78141438385,258791.60782635584,241079.0878263559,0.0,0.0,0.0706779448814682,0.0,137.4716591479769,0.1060063717489262,241079.0878263559,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,8ww25eu1,sweep/ppo/sb3/cpu/default/a0.4/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/8ww25eu1,2026-03-16T01:45:51Z,8003.812511886,0,0,0,,0.4,100.0,0.0,0.3,True,True,baseline,313890.156459866,292317.566459866,301905.6061551721,281189.2661551722,0.0,0.0,0.0700585666613017,0.0,137.27393385978286,0.1140225013120235,281189.2661551722,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,cxdz0iyj,sweep/ppo/sb3/cpu/default/a0.6/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/cxdz0iyj,2026-03-16T01:50:19Z,7623.493600288,0,0,0,,0.6,100.0,0.0,0.3,True,True,baseline,293934.0132863448,273673.5532863448,278235.2158621181,259045.3158621181,0.0,0.0,0.0702286844227449,0.0,137.02187396075487,0.1108792101893818,259045.3158621181,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,1aeqr4sw,sweep/ppo/sb3/cpu/default/a1/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/1aeqr4sw,2026-03-16T01:58:10Z,7156.375097998,0,0,0,,1.0,100.0,0.0,0.3,True,True,baseline,254537.24517731377,236935.99517731369,254471.2696855663,236912.16968556636,0.0,0.0,0.0703905833083271,0.0,136.6143424312229,0.1038838810036006,236912.16968556636,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,7sgqchvk,sweep/ppo/sb3/cpu/default/a0.9/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/7sgqchvk,2026-03-16T02:09:14Z,7268.202978965,0,0,0,,0.9,100.0,0.0,0.15,True,True,baseline,267194.6114143838,248902.78141438385,258791.60782635584,241079.0878263559,0.0,0.0,0.0706779448814682,0.0,137.4716591479769,0.1060063717489262,241079.0878263559,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,3s777ena,sweep/ppo/sb3/cpu/default/a0.5/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/3s777ena,2026-03-16T02:14:54Z,7762.769931002,0,0,0,,0.5,100.0,0.0,0.05,True,True,baseline,303325.5596877454,282520.29968774534,291965.65710567136,271937.69710567134,0.0,0.0,0.0686525035124021,0.0,137.57073544790862,0.1132342695408356,271937.69710567134,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,oxsvuh5p,sweep/ppo/sb3/cpu/default/a0.1/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/oxsvuh5p,2026-03-16T02:27:01Z,8529.692612353,0,0,0,,0.1,100.0,0.0,0.15,True,True,baseline,340941.7898046945,317438.6698046944,328185.5337341634,305593.15373416344,0.0,0.0,0.0709483560344898,0.0,137.21682561970587,0.1186714838821206,305593.15373416344,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,4unnwl9l,sweep/ppo/sb3/cpu/default/a0.7/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/4unnwl9l,2026-03-16T02:34:01Z,7780.065361146,0,0,0,,0.7,100.0,0.0,0.15,True,True,baseline,286859.8032779717,267231.9932779717,273198.5349293896,254530.3349293896,0.0,0.0,0.0694378534785247,0.0,137.6169536272908,0.1086813731317916,254530.3349293896,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,qlfu6ts4,sweep/ppo/sb3/cpu/default/a0.1/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/qlfu6ts4,2026-03-16T02:46:52Z,8357.276406226,0,0,0,,0.1,100.0,0.0,0.3,True,True,baseline,340941.7898046945,317438.6698046944,328185.5337341634,305593.15373416344,0.0,0.0,0.0709483560344898,0.0,137.21682561970587,0.1186714838821206,305593.15373416344,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,ya2bb56z,sweep/ppo/sb3/cpu/default/a1/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/ya2bb56z,2026-03-16T03:04:37Z,7161.126998896,0,0,0,,1.0,100.0,0.0,0.15,True,True,baseline,254537.24517731377,236935.99517731369,254471.2696855663,236912.16968556636,0.0,0.0,0.0703905833083271,0.0,136.6143424312229,0.1038838810036006,236912.16968556636,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,9hrjmcaf,sweep/ppo/sb3/cpu/default/a0.1/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/9hrjmcaf,2026-03-16T03:13:29Z,8543.819880598,5,1004,1,,0.1,100.0,0.0,0.15,True,True,baseline,345607.36851277394,321934.388512774,330271.9018417394,307619.2418417394,0.0,0.0,0.0688978199434404,0.0,137.65927138408344,0.1180576040723697,307619.2418417394,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,bdz7jpg9,sweep/ppo/sb3/cpu/default/a0.4/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/bdz7jpg9,2026-03-16T03:19:29Z,8156.512730959,0,0,0,,0.4,100.0,0.0,0.15,True,True,baseline,313890.156459866,292317.566459866,301905.6061551721,281189.2661551722,0.0,0.0,0.0700585666613017,0.0,137.27393385978286,0.1140225013120235,281189.2661551722,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,4e8bw9fr,sweep/ppo/sb3/cpu/default/a0.4/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/4e8bw9fr,2026-03-16T03:23:44Z,7900.988162577,0,0,0,,0.4,100.0,0.0,0.3,True,True,baseline,313890.156459866,292317.566459866,301905.6061551721,281189.2661551722,0.0,0.0,0.0700585666613017,0.0,137.27393385978286,0.1140225013120235,281189.2661551722,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,rudposqg,sweep/ppo/sb3/cpu/default/a0.8/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/rudposqg,2026-03-16T04:16:36Z,7803.944972672,0,0,0,,0.8,100.0,0.0,0.15,True,True,baseline,277186.5585556976,258169.5585556976,260819.58418764165,242908.9641876417,0.0,0.0,0.0684627361221973,0.0,137.3260908975896,0.1077409453905398,242908.9641876417,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,r24xwwl9,sweep/ppo/sb3/cpu/default/a0.1/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/r24xwwl9,2026-03-16T04:43:43Z,8571.635566955,0,0,0,,0.1,100.0,0.0,0.15,True,True,baseline,340941.7898046945,317438.6698046944,328185.5337341634,305593.15373416344,0.0,0.0,0.0709483560344898,0.0,137.21682561970587,0.1186714838821206,305593.15373416344,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,34c0wzgt,sweep/ppo/sb3/cpu/default/a0.5/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/34c0wzgt,2026-03-16T04:43:54Z,7912.776898111,0,0,0,,0.5,100.0,0.0,0.05,True,True,baseline,306631.1127310434,285624.6727310434,292140.0218133485,272205.32181334845,0.0,0.0,0.0706121906603894,0.0,137.48236407441985,0.112886126809283,272205.32181334845,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,7bvonhab,sweep/ppo/sb3/cpu/default/a0.2/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/7bvonhab,2026-03-16T04:59:24Z,8276.510250338,0,0,0,,0.2,100.0,0.0,0.15,True,True,baseline,333463.32883383776,310606.38883383776,322375.37087837915,300349.6308783791,0.0,0.0,0.0694238399850746,0.0,137.6206723870474,0.1176551945750585,300349.6308783791,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,4f7j1z4p,sweep/ppo/sb3/cpu/default/a0/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/4f7j1z4p,2026-03-16T05:37:06Z,8672.519975981,5,1004,1,,0.0,100.0,0.0,0.3,True,True,baseline,352771.72255003714,328513.3625500371,337718.8770159761,314393.4970159762,0.0,0.0,0.0709252720738168,0.0,137.49769422651883,0.1192149910017191,314393.4970159762,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,c33cyjv9,sweep/ppo/sb3/cpu/default/a0.4/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/c33cyjv9,2026-03-16T05:38:08Z,8164.154912737,0,0,0,,0.4,100.0,0.0,0.15,True,True,baseline,314792.9405088838,293199.96050888376,304000.02795477153,283160.5079547715,0.0,0.0,0.0706474903672308,0.0,137.54347765167836,0.1134114537317883,283160.5079547715,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,i0pylqm1,sweep/ppo/sb3/cpu/default/a0.6/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/i0pylqm1,2026-03-16T05:54:46Z,7692.357589996,0,0,0,,0.6,100.0,0.0,0.15,True,True,baseline,293934.0132863448,273673.5532863448,278235.2158621181,259045.3158621181,0.0,0.0,0.0702286844227449,0.0,137.02187396075487,0.1108792101893818,259045.3158621181,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,p1lrhc1t,sweep/ppo/sb3/cpu/default/a0.5/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/p1lrhc1t,2026-03-16T06:06:24Z,7906.656203638,0,0,0,,0.5,100.0,0.0,0.15,True,True,baseline,304711.516143744,283789.716143744,290536.18598250934,270609.3259825093,0.0,0.0,0.0700712626186499,0.0,137.43043602946972,0.1112796769387625,270609.3259825093,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,lkhtnobk,sweep/ppo/sb3/cpu/default/a0.9/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/lkhtnobk,2026-03-16T06:25:11Z,7304.77470818,0,0,0,,0.9,100.0,0.0,0.3,True,True,baseline,269095.26288012683,250709.3028801269,257985.06236888352,240343.2023688835,0.0,0.0,0.0687681637998595,0.0,137.63174822647662,0.1040919495927453,240343.2023688835,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,dvf0av6p,sweep/ppo/sb3/cpu/default/a0/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/dvf0av6p,2026-03-16T06:34:22Z,8568.236301103,0,0,0,,0.0,100.0,0.0,0.3,True,True,baseline,331626.71399641165,307929.2839964116,301903.22363424243,278909.22363424255,0.0,0.0,0.0699106903089938,0.0,134.44341240328637,0.1239456985672444,278909.22363424255,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,k6dz4he1,sweep/ppo/sb3/cpu/default/a0/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/k6dz4he1,2026-03-16T06:38:33Z,8384.405275426,0,0,0,,0.0,100.0,0.0,0.05,True,True,baseline,331626.71399641165,307929.2839964116,301903.22363424243,278909.22363424255,0.0,0.0,0.0699106903089938,0.0,134.44341240328637,0.1239456985672444,278909.22363424255,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,3afj9zm5,sweep/ppo/sb3/cpu/default/a0.4/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/3afj9zm5,2026-03-16T06:51:33Z,7947.433015786,0,0,0,,0.4,100.0,0.0,0.3,True,True,baseline,313890.156459866,292317.566459866,301905.6061551721,281189.2661551722,0.0,0.0,0.0700585666613017,0.0,137.27393385978286,0.1140225013120235,281189.2661551722,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,lvlojvjv,sweep/ppo/sb3/cpu/default/a0.5/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/lvlojvjv,2026-03-16T07:17:09Z,8072.460782252,0,0,0,,0.5,100.0,0.0,0.05,True,True,baseline,305342.590984541,284402.02098454104,287794.11179162114,267934.8717916211,0.0,0.0,0.0698329564541014,0.0,137.34875112178105,0.1110975441706762,267934.8717916211,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,e6xtq7h5,sweep/ppo/sb3/cpu/default/a0.5/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/e6xtq7h5,2026-03-16T07:20:29Z,8062.476629606,0,0,0,,0.5,100.0,0.0,0.05,True,True,baseline,306631.1127310434,285624.6727310434,292140.0218133485,272205.32181334845,0.0,0.0,0.0706121906603894,0.0,137.48236407441985,0.112886126809283,272205.32181334845,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,6yrs8xci,sweep/ppo/sb3/cpu/default/a0.6/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/6yrs8xci,2026-03-16T07:50:01Z,7609.609823102,0,0,0,,0.6,100.0,0.0,0.15,True,True,baseline,293934.0132863448,273673.5532863448,278235.2158621181,259045.3158621181,0.0,0.0,0.0702286844227449,0.0,137.02187396075487,0.1108792101893818,259045.3158621181,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,16l3qjpm,sweep/ppo/sb3/cpu/default/a0/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/16l3qjpm,2026-03-16T07:50:41Z,8443.503878801,5,1004,1,,0.0,100.0,0.0,0.15,True,True,baseline,348861.1454509751,324713.0754509751,335967.6160126648,312660.3160126648,0.0,0.0,0.0674835742466741,0.0,136.8813175598437,0.118985751213389,312660.3160126648,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,rg98ht1b,sweep/ppo/sb3/cpu/default/a0/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/rg98ht1b,2026-03-16T07:55:36Z,8843.938343818,5,1004,1,,0.0,100.0,0.0,0.05,True,True,baseline,348861.1454509751,324713.0754509751,335967.6160126648,312660.3160126648,0.0,0.0,0.0674835742466741,0.0,136.8813175598437,0.118985751213389,312660.3160126648,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,mxd3i6wr,sweep/ppo/sb3/cpu/default/a0.2/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/mxd3i6wr,2026-03-16T07:58:03Z,8393.28184472,0,0,0,,0.2,100.0,0.0,0.15,True,True,baseline,333463.32883383776,310606.38883383776,322375.37087837915,300349.6308783791,0.0,0.0,0.0694238399850746,0.0,137.6206723870474,0.1176551945750585,300349.6308783791,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,0xvyhpg2,sweep/ppo/sb3/cpu/default/a0.9/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/0xvyhpg2,2026-03-16T08:01:43Z,7441.092473369,0,0,0,,0.9,100.0,0.0,0.05,True,True,baseline,268129.28805568966,249777.98805568964,259354.03651639624,241657.8165163962,0.0,0.0,0.0692141212557269,0.0,137.56737533812094,0.1028102128114812,241657.8165163962,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,eull6lat,sweep/ppo/sb3/cpu/default/a0.2/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/eull6lat,2026-03-16T08:03:08Z,8338.76018915,0,0,0,,0.2,100.0,0.0,0.05,True,True,baseline,333463.32883383776,310606.38883383776,322375.37087837915,300349.6308783791,0.0,0.0,0.0694238399850746,0.0,137.6206723870474,0.1176551945750585,300349.6308783791,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,5zekml75,sweep/ppo/sb3/cpu/default/a0.8/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/5zekml75,2026-03-16T08:06:29Z,7265.4990034,0,0,0,,0.8,100.0,0.0,0.15,True,True,baseline,277537.1135308166,258574.23353081665,260525.6140973399,242761.4740973399,0.0,0.0,0.0691119185711536,0.0,137.63850710873982,0.1055234893030045,242761.4740973399,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,fed0y4px,sweep/ppo/sb3/cpu/default/a0.7/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/fed0y4px,2026-03-16T08:13:55Z,7800.555020283,0,0,0,,0.7,100.0,0.0,0.05,True,True,baseline,286859.8032779717,267231.9932779717,273198.5349293896,254530.3349293896,0.0,0.0,0.0694378534785247,0.0,137.6169536272908,0.1086813731317916,254530.3349293896,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,byifn20j,sweep/ppo/sb3/cpu/default/a0.4/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/byifn20j,2026-03-16T08:20:55Z,8108.199462596,0,0,0,,0.4,100.0,0.0,0.3,True,True,baseline,316543.04043212667,294899.01043212664,299980.59649797506,279386.7564979751,0.0,0.0,0.067603468946279,0.0,137.7846896269947,0.1128739206843639,279386.7564979751,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,35rb8529,sweep/ppo/sb3/cpu/default/a0.5/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/35rb8529,2026-03-16T08:24:52Z,7749.649896228,0,0,0,,0.5,100.0,0.0,0.05,True,True,baseline,304711.516143744,283789.716143744,290536.18598250934,270609.3259825093,0.0,0.0,0.0700712626186499,0.0,137.43043602946972,0.1112796769387625,270609.3259825093,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,foinu2r1,sweep/ppo/sb3/cpu/default/a0.5/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/foinu2r1,2026-03-16T08:51:50Z,7924.351691656,0,0,0,,0.5,100.0,0.0,0.05,True,True,baseline,306631.1127310434,285624.6727310434,292140.0218133485,272205.32181334845,0.0,0.0,0.0706121906603894,0.0,137.48236407441985,0.112886126809283,272205.32181334845,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,nsg7m2ud,sweep/ppo/sb3/cpu/default/a0.5/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/nsg7m2ud,2026-03-16T09:06:10Z,7732.794663489,0,0,0,,0.5,100.0,0.0,0.3,True,True,baseline,303325.5596877454,282520.29968774534,291965.65710567136,271937.69710567134,0.0,0.0,0.0686525035124021,0.0,137.57073544790862,0.1132342695408356,271937.69710567134,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,gpririem,sweep/ppo/sb3/cpu/default/a0.2/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/gpririem,2026-03-16T09:20:57Z,8532.119121611,0,0,0,,0.2,100.0,0.0,0.3,True,True,baseline,333463.32883383776,310606.38883383776,322375.37087837915,300349.6308783791,0.0,0.0,0.0694238399850746,0.0,137.6206723870474,0.1176551945750585,300349.6308783791,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,9bmbalnk,sweep/ppo/sb3/cpu/default/a0.7/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/9bmbalnk,2026-03-16T10:05:49Z,7576.93090345,0,0,0,,0.7,100.0,0.0,0.15,True,True,baseline,285875.15518050164,266287.2051805016,274356.50146499986,255620.24146499988,0.0,0.0,0.0711188680417482,0.0,137.42722406640746,0.1099719716550294,255620.24146499988,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,9ma76sch,sweep/ppo/sb3/cpu/default/a0.1/baseline/s1337,finished,https://wandb.ai/lusiana/capstone_tpu/runs/9ma76sch,2026-03-16T10:23:59Z,8544.8427845,0,0,0,,0.1,100.0,0.0,0.3,True,True,baseline,341404.1205957663,317885.0305957663,329505.50925893825,306817.3492589383,0.0,0.0,0.0685274095002656,0.0,137.33021724658855,0.1206998447923596,306817.3492589383,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,cvrztiyb,sweep/ppo/sb3/cpu/default/a0.2/baseline/s42,finished,https://wandb.ai/lusiana/capstone_tpu/runs/cvrztiyb,2026-03-16T10:27:26Z,8353.396268583,0,0,0,,0.2,100.0,0.0,0.3,True,True,baseline,333463.32883383776,310606.38883383776,322375.37087837915,300349.6308783791,0.0,0.0,0.0694238399850746,0.0,137.6206723870474,0.1176551945750585,300349.6308783791,baseline
+i88nw811,lusiana/capstone_tpu/i88nw811,7z9spcc6,sweep/ppo/sb3/cpu/default/a0/baseline/s7777,finished,https://wandb.ai/lusiana/capstone_tpu/runs/7z9spcc6,2026-03-16T10:29:46Z,8444.449882423,5,1004,1,,0.0,100.0,0.0,0.3,True,True,baseline,348861.1454509751,324713.0754509751,335967.6160126648,312660.3160126648,0.0,0.0,0.0674835742466741,0.0,136.8813175598437,0.118985751213389,312660.3160126648,baseline
--- a/paper/src/chapters/figures/results/generated/final/revenue_alpha_fixed_effects.json
+++ b/paper/src/chapters/figures/results/generated/final/revenue_alpha_fixed_effects.json
@@ -0,0 +1,31 @@
+{
+  "n": 95,
+  "k": 2,
+  "dof": 93,
+  "df_t": 93,
+  "cov_type": "hc1",
+  "clusters": null,
+  "r2": 0.9759651432807543,
+  "adj_r2": 0.9757067039611925,
+  "sse": 1872600419.7223544,
+  "coefficients": [
+    {
+      "name": "intercept",
+      "coef": 348823.4131652292,
+      "std_error": 1383.3660823209932,
+      "t_stat": 252.15553397115096,
+      "p_value": 0.0,
+      "ci95_low": 346076.3222890517,
+      "ci95_high": 351570.5040414067
+    },
+    {
+      "name": "alpha",
+      "coef": -90140.52744561416,
+      "std_error": 2185.134882447838,
+      "t_stat": -41.25169945785529,
+      "p_value": 0.0,
+      "ci95_low": -94479.77225976942,
+      "ci95_high": -85801.2826314589
+    }
+  ]
+}
--- a/paper/src/chapters/figures/results/generated/final/revenue_alpha_per_sweep.csv
+++ b/paper/src/chapters/figures/results/generated/final/revenue_alpha_per_sweep.csv
@@ -0,0 +1,2 @@
+sweep_id,n,alpha_coef,alpha_std_error,alpha_t_stat,alpha_p_value,alpha_ci95_low,alpha_ci95_high,r2
+i88nw811,95,-90140.52744561416,2185.134882447838,-41.25169945785529,0.0,-94479.77225976942,-85801.2826314589,0.9759651432807543
--- a/paper/src/chapters/figures/results/generated/final/revenue_alpha_sample_accounting.json
+++ b/paper/src/chapters/figures/results/generated/final/revenue_alpha_sample_accounting.json
@@ -0,0 +1,37 @@
+{
+  "bundle_dir": "/home/velocitatem/Documents/Projects/PHANTOM/engine/studies/results/wandb_sweep_bundles/bundle_20260317_122818",
+  "git_commit": "e62e842faad79b143f5555d187075e85c8926363",
+  "cohort_name": "original_n95_baseline_n100",
+  "filters": {
+    "sweep_id": [
+      "i88nw811"
+    ],
+    "mode": "baseline",
+    "n_products": 100.0,
+    "eta_ux": 0.0,
+    "lambda_coi": null,
+    "alpha_min": 0.0,
+    "alpha_max": 1.0
+  },
+  "n_rows": 95,
+  "n_sweeps": 1,
+  "alpha_unique": [
+    0.0,
+    0.1,
+    0.2,
+    0.3,
+    0.4,
+    0.5,
+    0.6,
+    0.7,
+    0.8,
+    0.9,
+    1.0
+  ],
+  "rows_by_sweep": {
+    "i88nw811": 95
+  },
+  "rows_by_mode": {
+    "baseline": 95
+  }
+}
--- a/paper/src/chapters/figures/results/generated/final/revenue_alpha_simple_ols.json
+++ b/paper/src/chapters/figures/results/generated/final/revenue_alpha_simple_ols.json
@@ -0,0 +1,31 @@
+{
+  "n": 95,
+  "k": 2,
+  "dof": 93,
+  "df_t": 93,
+  "cov_type": "iid",
+  "clusters": null,
+  "r2": 0.9759651432807543,
+  "adj_r2": 0.9757067039611925,
+  "sse": 1872600419.7223544,
+  "coefficients": [
+    {
+      "name": "intercept",
+      "coef": 348823.4131652292,
+      "std_error": 860.7176431608721,
+      "t_stat": 405.2704344298337,
+      "p_value": 0.0,
+      "ci95_low": 347114.1985078009,
+      "ci95_high": 350532.6278226575
+    },
+    {
+      "name": "alpha",
+      "coef": -90140.52744561416,
+      "std_error": 1466.838282353916,
+      "t_stat": -61.452259959401054,
+      "p_value": 0.0,
+      "ci95_low": -93053.37756806448,
+      "ci95_high": -87227.67732316385
+    }
+  ]
+}
--- a/paper/src/chapters/figures/results/generated/legacy/first_sweep_headline_summary.json
+++ b/paper/src/chapters/figures/results/generated/legacy/first_sweep_headline_summary.json
@@ -1,10 +0,0 @@
-{
-  "runs": 340,
-  "tiers": 5,
-  "alphas": 6,
-  "status": "ok",
-  "mean_tier_revenue_robust": 190714.62212212436,
-  "mean_tier_revenue_no_robust": 197371.17216609977,
-  "mean_tier_revenue_delta": -6656.5500439754105,
-  "mean_tier_revenue_delta_pct": -3.3726050116242514
-}
--- a/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_alpha_deltas.csv
+++ b/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_alpha_deltas.csv
@@ -1,31 +0,0 @@
-tier,alpha,runs_robust,runs_no_robust,eval_revenue_mean_delta,eval_revenue_mean_delta_pct,eval_reward_mean_delta,eval_reward_mean_delta_pct,eval_coi_level_mean_delta,eval_coi_level_mean_delta_pct,eval_margin_mean_delta,eval_margin_mean_delta_pct,objective_score_delta,objective_score_delta_pct,train_alpha_adv_delta,train_alpha_adv_delta_pct
-dqn,0.0,5.0,2.0,-31308.987414117495,-8.73651226889534,-1909.7427407095092,-0.5742991901121623,-2.8982436567700063,-2.1108702433020436,-0.001972064237093285,-0.2116777198290971,-1909.7427407095092,-0.5742991901121623,,
-dqn,0.1,8.0,4.0,-7723.542755668925,-2.2789188721535494,-74239.37371836061,-21.063854618469847,1.7435833801418141,1.2859365583872486,0.0011891962142838164,0.1278074871971924,-74239.37371836061,-21.063854618469847,0.17619791666666657,176.19791666666694
-dqn,0.25,7.0,3.0,-12344.82818986749,-3.7035466052614323,93154.03627578515,36.06691230407512,0.03214544949867104,0.023426184113378143,1.763733457238459e-05,0.001893256490383175,93154.03627578515,36.06691230407512,0.14530952380952394,58.12380952380958
-dqn,0.4,5.0,10.0,-7816.300706216833,-2.4694340725162824,-42362.74668471434,-13.411888482380219,0.6251272343707797,0.4579446603861758,0.0002750615520492605,0.02953644634355915,-42362.74668471434,-13.411888482380219,0.09856666666666747,24.64166666666691
-dqn,0.6,5.0,4.0,-16150.011887742497,-5.347485987139731,-28508.74710866122,-10.151356300001888,-0.63306323164079,-0.46056970247177387,-0.00034537433455417155,-0.0370668515552649,-28508.74710866122,-10.151356300001888,0.1361999999999981,22.699999999999644
-dqn,0.8,7.0,6.0,-18191.8826663699,-6.440527544692988,-55296.94441124235,-20.19273590083627,-0.796733634735034,-0.579832425016392,-0.0006423984775592029,-0.0689476165584585,-55296.94441124235,-20.19273590083627,0.1532857142857158,19.160714285714512
-linear,0.0,9.0,8.0,-14967.67388588126,-4.273413942959129,-20107.23171681742,-6.60039931288617,-0.06127790826209889,-0.04564810574240612,-7.607744079518586e-05,-0.008177885913528719,-20107.23171681742,-6.60039931288617,,
-linear,0.1,3.0,5.0,-24531.399901538738,-7.171831328305365,-96669.7835552101,-26.44920711447249,-0.3680976907859872,-0.2733723058172187,-0.0002515287835096469,-0.02702956778346356,-96669.7835552101,-26.44920711447249,,
-linear,0.25,6.0,9.0,-14840.859479571285,-4.520682292638562,-26510.179456423968,-8.033117756667396,-0.13734776448131925,-0.10212641096230607,-9.41162442338328e-05,-0.010115001392981545,-26510.179456423968,-8.033117756667396,,
-linear,0.4,4.0,11.0,-17196.7642560167,-5.486915251242723,-74520.10209817477,-25.042311510043184,0.12217076984330788,0.09098828726103136,0.00010713887099822461,0.011516865671259795,-74520.10209817477,-25.042311510043184,,
-linear,0.6,5.0,3.0,-14284.06615788641,-4.854766876637072,38417.71856593515,14.088596762512362,0.24251461234271687,0.1806530855220358,0.0002606811969937395,0.028024824619509187,38417.71856593515,14.088596762512362,,
-linear,0.8,4.0,11.0,-10840.488575784548,-3.933600919557566,15749.581078662042,6.447651726824251,0.028051260535562506,0.020876236575910773,5.361882659971062e-05,0.005763158099097226,15749.581078662042,6.447651726824251,,
-qtable,0.0,9.0,8.0,-18644.457288398524,-8.15323701554329,32993.42568058451,20.675688115613053,10.369779227648095,10.682768960780463,0.018566897519637582,2.0803084179092814,32993.42568058451,20.675688115613053,0.11839814814814797,
-qtable,0.1,6.0,5.0,-12549.400855549495,-4.616991193742389,-37207.79701261924,-15.336047254435487,0.0884057957559321,0.07703761042583206,-0.01127789819771663,-1.2272540823820444,-37207.79701261924,-15.336047254435487,0.07577777777777787,75.77777777777803
-qtable,0.25,6.0,5.0,-1534.3527429780224,-0.5456640130847226,18433.43663451099,7.304472653867784,-0.5776125938941306,-0.45734160960552755,-0.003316338490628068,-0.3584028328803385,18433.43663451099,7.304472653867784,0.1181458333333334,47.258333333333354
-qtable,0.4,8.0,6.0,-15146.258176090778,-5.274860187729517,-37364.22587794208,-13.005651205148677,0.4611471727478005,0.3629050099230144,0.0071046453227539,0.7751478467862876,-37364.22587794208,-13.005651205148677,0.11010416666666772,27.52604166666698
-qtable,0.6,6.0,6.0,-9577.578548656049,-3.9322693501816666,-19088.152339068736,-9.571307395166029,0.9081750157567683,0.7495917946306662,0.0015520804425310786,0.16838348372043557,-19088.152339068736,-9.571307395166029,0.16983333333333228,28.305555555555333
-qtable,0.8,5.0,2.0,-52751.680936846446,-19.699089872409548,-16508.209313987172,-7.589601869470744,-15.022454081083623,-11.215398490282094,-0.007791824761087751,-0.8384414846099099,-16508.209313987172,-7.589601869470744,0.11120000000000174,13.900000000000245
-static,0.0,5.0,6.0,-4782.871053113384,-5.233544525848519,14411.4689779756,25.538141347978577,1.307060701942973,1.8731997380823568,0.002537468952847566,0.2911381045328444,14411.4689779756,25.538141347978577,,
-static,0.1,8.0,5.0,1629.4524528499896,1.880088900553112,-5347.078589385725,-8.14812684380662,0.3600324838305795,0.5019134064795009,-4.6492644957929485e-05,-0.005316014641356001,-5347.078589385725,-8.14812684380662,,
-static,0.25,5.0,6.0,-9938.662276761897,-10.398087633377964,-23616.087243780566,-27.701108621456626,-3.0513860773271233,-4.099238223547561,-0.003519771479853273,-0.40113716461596144,-23616.087243780566,-27.701108621456626,,
-static,0.4,3.0,4.0,1850.8400595222774,2.1912497828943436,15058.659457798465,23.67199439061036,3.669612467486587,5.430169778169349,0.006763447803564415,0.7804393835882188,15058.659457798465,23.67199439061036,,
-static,0.6,6.0,5.0,1038.893948415236,1.2765037688226162,-6062.864079504681,-9.363144945348399,-1.712609061865976,-2.3996341009364213,-0.0042285583442709385,-0.48362088973179423,-6062.864079504681,-9.363144945348399,,
-static,0.8,3.0,7.0,2696.6340631967323,3.6826150812750567,149.22406835677975,0.27280281303997084,0.8491716126507072,1.2427748744725668,0.0032786525965587954,0.3777595573932637,149.22406835677975,0.27280281303997084,,
-surge,0.0,6.0,6.0,-606.73760243367,-5.066579306500225,-244.17585425326251,-5.525800641331023,0.014874931199557295,0.09186560988877175,0.0019308940532419272,0.4471794260021321,-244.17585425326251,-5.525800641331023,,
-surge,0.1,2.0,5.0,169.78743573408792,1.446343107913299,-1012.7706974660168,-20.02053666691211,-0.14459518037699226,-0.864651254901582,-0.0018650458785858248,-0.4260349899970559,-1012.7706974660168,-20.02053666691211,,
-surge,0.25,10.0,7.0,-128.20993816584632,-1.1276930411162496,-81.21373487263281,-1.7081453033360994,0.3008506477195141,1.839047728806548,0.0030750148302954305,0.7102446987902812,-81.21373487263281,-1.7081453033360994,,
-surge,0.4,6.0,6.0,-473.03722764431404,-4.297928307550563,28.557452243338048,0.6755106104955642,-0.5027452173053764,-3.072002360121898,-0.005581380442163164,-1.288152985482699,28.557452243338048,0.6755106104955642,,
-surge,0.6,2.0,5.0,307.79436325796996,3.0356727142643067,2060.57396030564,63.382050333909866,0.2339650444065704,1.438519400758399,0.001302270025389629,0.30077697380833807,2060.57396030564,63.382050333909866,,
-surge,0.8,3.0,3.0,423.15386247993047,4.372210191290083,1117.0942083304312,34.86182570616373,0.8971464536957541,5.327339899805159,0.007068630716831503,1.6094191039618562,1117.0942083304312,34.86182570616373,,
--- a/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_alpha_mode_summary.csv
+++ b/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_alpha_mode_summary.csv
@@ -1,61 +0,0 @@
-tier,alpha,mode,runs,eval_revenue_mean_mean,eval_revenue_mean_std,eval_reward_mean_mean,eval_reward_mean_std,eval_coi_level_mean_mean,eval_coi_level_mean_std,eval_margin_mean_mean,eval_margin_mean_std,objective_score_mean,objective_score_std,train_alpha_adv_mean,train_alpha_adv_std
-dqn,0.0,no_robust,2,358369.40933039243,3531.782519351935,332534.46523867303,114183.5587841961,137.30089123035202,0.8184776440325546,0.9316352418598786,0.0006839003676302996,332534.46523867303,114183.5587841961,,
-dqn,0.0,robust,5,327060.42191627494,24311.17412598574,330624.7224979635,62834.39223547943,134.40264757358202,6.160000643680792,0.9296631776227853,0.004262039730140749,330624.7224979635,62834.39223547943,0.17835000000000004,0.08829347371125472
-dqn,0.1,no_robust,4,338912.58043645386,19584.736810155388,352449.13650924934,34076.74819101191,135.58860029055563,3.4055508991301524,0.9304589585186211,0.0023438665484978773,352449.13650924934,34076.74819101191,0.0999999999999998,0.0
-dqn,0.1,robust,8,331189.03768078494,8060.912085646968,278209.7627908887,57861.69545853692,137.33218367069745,0.43113256118808096,0.931648154732905,0.000296560958972609,278209.7627908887,57861.69545853692,0.2761979166666664,0.09826648189130198
-dqn,0.25,no_robust,3,333324.4996115304,6101.717861804452,258281.15112936878,46772.05216097596,137.2201692904545,0.9866477887862672,0.9315871706751672,0.0006356053229300815,258281.15112936878,46772.05216097596,0.25,0.0
-dqn,0.25,robust,7,320979.6714216629,7345.8761269427705,351435.18740515393,40320.63699261721,137.25231473995316,0.3527287960309152,0.9316048080097395,0.0002575240668471541,351435.18740515393,40320.63699261721,0.39530952380952394,0.073021206240698
-dqn,0.4,no_robust,10,316521.94295076875,3631.1820920182718,315859.66987697606,59129.03566963754,136.50715652926755,0.5085743959240285,0.931261495881483,0.00031280530251053175,315859.66987697606,59129.03566963754,0.3999999999999993,0.0
-dqn,0.4,robust,5,308705.6422445519,10654.571556448245,273496.9231922617,68868.59270778317,137.13228376363833,0.9543108715306617,0.9315365574335323,0.0006302636717132419,273496.9231922617,68868.59270778317,0.49856666666666677,0.05745573175159429
-dqn,0.6,no_robust,4,302011.2988903938,2354.1141598720183,280836.828756133,58683.00124997926,137.4522093492651,0.4692723362517602,0.9317606434396914,0.0003317518021682495,280836.828756133,58683.00124997926,0.600000000000001,0.0
-dqn,0.6,robust,5,285861.2870026513,10386.571631344234,252328.08164747176,59388.56063758225,136.8191461176243,1.0629203361893034,0.9314152691051373,0.0005692783702932289,252328.08164747176,59388.56063758225,0.7361999999999991,0.07108625433623189
-dqn,0.8,no_robust,6,282459.51189759385,2625.018247527438,273845.72691287595,66378.16690732416,137.4075681801531,0.29728950101826707,0.9317196295169007,0.00022799290978965786,273845.72691287595,66378.16690732416,0.7999999999999985,0.0
-dqn,0.8,robust,7,264267.62923122395,6771.288971321149,218548.7825016336,50043.2009443344,136.61083454541807,1.2319662937254596,0.9310772310393415,0.0010118564779437284,218548.7825016336,50043.2009443344,0.9532857142857143,0.04709817507333055
-linear,0.0,no_robust,8,350250.9723061577,3156.286820918861,304636.59490360576,71682.88027353655,134.2397614654424,0.32611787466946035,0.9302824910938235,0.00024020749661685483,304636.59490360576,71682.88027353655,,
-linear,0.0,robust,9,335283.29842027643,7707.594869976611,284529.36318678834,55524.58819004573,134.1784835571803,0.4477314164684001,0.9302064136530284,0.00034781034181738526,284529.36318678834,55524.58819004573,,
-linear,0.1,no_robust,5,342052.1032713031,2576.546352056584,365492.17954557994,44890.93522299766,134.65068807375954,0.2181027640393531,0.930569018064469,0.00014058935916940913,365492.17954557994,44890.93522299766,,
-linear,0.1,robust,3,317520.7033697644,4796.580459456527,268822.39599036984,39256.421140635124,134.28259038297355,0.24570499109363475,0.9303174892809594,0.00018817899183709092,268822.39599036984,39256.421140635124,,
-linear,0.25,no_robust,9,328288.0441241802,2178.525494145428,330011.0898339667,38591.36053388808,134.48799697074742,0.2199303973026469,0.9304619997297959,0.00015341642413402035,330011.0898339667,38591.36053388808,,
-linear,0.25,robust,6,313447.18464460893,11811.426711620714,303500.9103775427,63358.917144214036,134.3506492062661,0.2947034403278951,0.9303678834855621,0.00021446628431268986,303500.9103775427,63358.917144214036,,
-linear,0.4,no_robust,11,313414.0672597746,1982.9537556159262,297576.7714904776,69396.90446617964,134.2708754290745,0.3062093691351849,0.9302780292522507,0.00023067974755288992,297576.7714904776,69396.90446617964,,
-linear,0.4,robust,4,296217.3030037579,5109.898340355844,223056.66939230284,38293.73688466607,134.3930461989178,0.12347753686382154,0.9303851681232489,7.324605809708878e-05,223056.66939230284,38293.73688466607,,
-linear,0.6,no_robust,3,294227.64307441004,2081.9176570448135,272686.62176604365,66672.50905805513,134.24327165069943,0.30764332256042104,0.9301795837547151,0.00020453921786790446,272686.62176604365,66672.50905805513,,
-linear,0.6,robust,5,279943.5769165236,9866.031719660255,311104.3403319788,28363.930707781863,134.48578626304214,0.21280262186464388,0.9304402649517088,0.00020533894868120649,311104.3403319788,28363.930707781863,,
-linear,0.8,no_robust,11,275586.89347174135,1618.038877505867,244268.4832547461,56201.44465269986,134.36933631960773,0.2845660213184439,0.9303723007028001,0.00017640716421186918,244268.4832547461,56201.44465269986,,
-linear,0.8,robust,4,264746.4048959568,7976.6279174956235,260018.06433340814,57942.49882730146,134.3973875801433,0.31511916357643405,0.9304259195293998,0.00023606570471334208,260018.06433340814,57942.49882730146,,
-qtable,0.0,no_robust,8,228675.52179404112,103199.70453252994,159575.94976328663,95848.81008103945,97.07014413321637,33.0637115678536,0.8925069648229078,0.04890522141482132,159575.94976328663,95848.81008103945,0.0,0.0
-qtable,0.0,robust,9,210031.0645056426,84361.3834579348,192569.37544387113,116824.7880426837,107.43992336086447,21.41128645838254,0.9110738623425454,0.019188350719133364,192569.37544387113,116824.7880426837,0.11839814814814797,0.061909456985161225
-qtable,0.1,no_robust,5,271809.0706466638,14898.209045050968,242616.60384397948,49181.45526408063,114.75666919996793,3.461383158930426,0.9189538140159812,0.002294693249439748,242616.60384397948,49181.45526408063,0.0999999999999998,0.0
-qtable,0.1,robust,6,259259.66979111428,102995.29934229614,205408.80683136024,94155.1845420674,114.84507499572386,36.206421837506966,0.9076759158182646,0.048591979839360346,205408.80683136024,94155.1845420674,0.17577777777777767,0.06720562696899951
-qtable,0.25,no_robust,5,281190.01916657295,70274.10208723843,252358.2126733039,129868.46825082717,126.29784427276161,15.368804047323954,0.9253103453385114,0.009044883517550522,252358.2126733039,129868.46825082717,0.25,0.0
-qtable,0.25,robust,6,279655.6664235949,93056.2549557545,270791.6493078149,116021.46257259768,125.72023167886748,26.760714047253796,0.9219940068478834,0.022785695882060884,270791.6493078149,116021.46257259768,0.3681458333333334,0.08845114686619042
-qtable,0.4,no_robust,6,287140.4669895195,32698.16434426399,287292.23388022534,83855.95000252876,127.07104066863859,9.200301166154173,0.9165535777734913,0.01306001923887748,287292.23388022534,83855.95000252876,0.3999999999999993,0.0
-qtable,0.4,robust,8,271994.2088134287,79259.3185780895,249928.00800228326,88265.30801790548,127.53218784138639,23.406428094683015,0.9236582230962452,0.020073747007871224,249928.00800228326,88265.30801790548,0.510104166666667,0.09294655989347765
-qtable,0.6,no_robust,6,243563.64469828535,67006.60707045678,199430.98211127534,79119.52886604435,121.15594411011905,17.91243944823949,0.9217533740470492,0.011558797825966702,199430.98211127534,79119.52886604435,0.600000000000001,0.0
-qtable,0.6,robust,6,233986.0661496293,43155.478617087436,180342.8297722066,48117.79957836251,122.06411912587582,12.160951090203252,0.9233054544895802,0.006840854872863436,180342.8297722066,48117.79957836251,0.7698333333333333,0.09107066853090896
-qtable,0.8,no_robust,2,267787.4017455507,1552.038101264713,217510.87340156303,45358.788584678456,133.9448981157492,0.47346860040111405,0.9293224278749692,0.0002998116010539045,217510.87340156303,45358.788584678456,0.7999999999999985,0.0
-qtable,0.8,robust,5,215035.72080870424,32869.73253165852,201002.66408757586,63247.67956376057,118.92244403466557,8.586916805142152,0.9215306031138815,0.004644709320891907,201002.66408757586,63247.67956376057,0.9112000000000002,0.07381653307732307
-static,0.0,no_robust,6,91388.75248869567,13415.65534300268,56431.15832748852,8525.098185703384,69.77689967440658,3.670744870085874,0.8715688236409825,0.005831496806767582,56431.15832748852,8525.098185703384,,
-static,0.0,robust,5,86605.88143558228,7614.909395960895,70842.62730546412,8033.737230392738,71.08396037634955,3.6802889678420283,0.8741062925938301,0.005083911544334936,70842.62730546412,8033.737230392738,,
-static,0.1,no_robust,5,86668.90445290186,8037.955688932984,65623.40881389238,19329.448262530004,71.73199185012882,4.199046495412734,0.874577067494122,0.006610505646022198,65623.40881389238,19329.448262530004,,
-static,0.1,robust,8,88298.35690575185,9576.838833058617,60276.33022450666,13359.490452744656,72.0920243339594,6.7706096714767865,0.8745305748491641,0.010083585815241344,60276.33022450666,13359.490452744656,,
-static,0.25,no_robust,6,95581.63603909909,8345.698435455577,85253.22060752509,13111.526873622026,74.43788116042678,2.1078820386097368,0.8774483618896327,0.0037254791853004897,85253.22060752509,13111.526873622026,,
-static,0.25,robust,5,85642.97376233719,9472.880627242153,61637.13336374452,15937.429780623212,71.38649508309966,4.0264905454627264,0.8739285904097794,0.005323853359397925,61637.13336374452,15937.429780623212,,
-static,0.4,no_robust,4,84465.04245981346,12101.831388745604,63613.81812329075,7778.361846092061,67.5782271530322,3.9088888968092,0.8666205147756862,0.007149121199217965,63613.81812329075,7778.361846092061,,
-static,0.4,robust,3,86315.88251933573,8642.748496122398,78672.47758108922,17823.74997200773,71.24783962051879,2.790416943786253,0.8733839625792507,0.005990544453538607,78672.47758108922,17823.74997200773,,
-static,0.6,no_robust,5,81385.88962988024,12343.523894997037,64752.43216774836,23486.779472906223,71.36959177224794,5.100226704959064,0.874353948320141,0.007787250295491337,64752.43216774836,23486.779472906223,,
-static,0.6,robust,6,82424.78357829548,9831.886701625144,58689.56808824368,12672.506035553573,69.65698271038197,3.484982360048201,0.8701253899758701,0.005917711231889304,58689.56808824368,12672.506035553573,,
-static,0.8,no_robust,7,73226.06364450825,4447.877985963851,54700.340767716196,14406.881298569717,68.32867561883204,3.68262917356943,0.8679204886788817,0.007467501164611224,54700.340767716196,14406.881298569717,,
-static,0.8,robust,3,75922.69770770498,5046.089536162847,54849.564836072976,22780.98012221352,69.17784723148274,1.5268167784698885,0.8711991412754405,0.0033278715575433297,54849.564836072976,22780.98012221352,,
-surge,0.0,no_robust,6,11975.290738176132,411.4052900076416,4418.832131346071,896.5828048394391,16.192056219479124,0.8040364003224534,0.4317940274006973,0.008271862690929055,4418.832131346071,896.5828048394391,,
-surge,0.0,robust,6,11368.553135742462,623.8217438159004,4174.6562770928085,639.9963040241264,16.20693115067868,0.9853827520149101,0.4337249214539392,0.010371668289035135,4174.6562770928085,639.9963040241264,,
-surge,0.1,no_robust,5,11739.084232858655,332.778792718381,5058.659087494994,1110.8409258976824,16.722948073839394,0.6578121995950104,0.4377682402562083,0.005683401047550787,5058.659087494994,1110.8409258976824,,
-surge,0.1,robust,2,11908.871668592743,81.41250285550258,4045.8883900289775,784.7169500268457,16.5783528934624,0.4088194924856508,0.4359031943776225,0.004531137621699143,4045.8883900289775,784.7169500268457,,
-surge,0.25,no_robust,7,11369.223138855004,236.1121240061105,4754.4980344481255,1038.0550037539617,16.359045119223275,0.3945156775653057,0.4329514652531622,0.0038762110261952457,4754.4980344481255,1038.0550037539617,,
-surge,0.25,robust,10,11241.013200689158,684.503587066406,4673.284299575493,1187.78635131025,16.65989576694279,1.0515950311117155,0.4360264800834576,0.009701952962125513,4673.284299575493,1187.78635131025,,
-surge,0.4,no_robust,6,11006.168409400554,364.6584583108646,4227.535704048808,1414.7964077877168,16.365391636138824,0.9138430058543858,0.4332855262584901,0.008024003783434592,4227.535704048808,1414.7964077877168,,
-surge,0.4,robust,6,10533.13118175624,526.0758051960169,4256.093156292146,783.7965507386594,15.862646418833448,0.7732699435426456,0.42770414581632693,0.008967505611725135,4256.093156292146,783.7965507386594,,
-surge,0.6,no_robust,5,10139.2472848498,97.448078425168,3251.037082975553,742.2100315641153,16.26429537781848,0.4432465691073604,0.4329686574409998,0.004121820888165019,3251.037082975553,742.2100315641153,,
-surge,0.6,robust,2,10447.04164810777,524.0029334247373,5311.611043281193,1808.6200710093085,16.49826042222505,0.6088756908260344,0.43427092746638946,0.007817511630542989,5311.611043281193,1808.6200710093085,,
-surge,0.8,no_robust,3,9678.259826640971,272.83530913170915,3204.3479815026553,556.8799617962688,16.840420745981802,0.4589959822922529,0.43920385308157944,0.004953937449529005,3204.3479815026553,556.8799617962688,,
-surge,0.8,robust,3,10101.413689120902,526.8318040489241,4321.442189833087,1284.166148011517,17.737567199677557,0.6586775330563983,0.44627248379841095,0.004644261847052545,4321.442189833087,1284.166148011517,,
--- a/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_mode_summary.csv
+++ b/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_mode_summary.csv
@@ -1,11 +0,0 @@
-tier,mode,runs,eval_revenue_mean_mean,eval_revenue_mean_std,eval_reward_mean_mean,eval_reward_mean_std,eval_coi_level_mean_mean,eval_coi_level_mean_std,eval_margin_mean_mean,eval_margin_mean_std,objective_score_mean,objective_score_std,train_alpha_adv_mean,train_alpha_adv_std
-dqn,no_robust,29,315185.66674813855,23538.781000060844,302576.8036266896,62951.88633145167,136.82560356086017,1.3692652218935986,0.9313739013618878,0.0009314135057224836,302576.8036266896,62951.88633145167,0.45740740740740693,0.2368477698794438
-dqn,robust,37,306875.13950902375,27585.74444520695,283724.7169827867,69843.05611741856,136.68837571992978,2.3797541654948753,0.9312171495138941,0.0016512408492580111,283724.7169827867,69843.05611741856,0.5058198198198196,0.28324483129860284
-linear,no_robust,47,315501.15296155965,27105.014861872147,298149.1730416604,67664.7308344108,134.36884359609928,0.29743647613433244,0.9303607531364,0.0002152647006739543,298149.1730416604,67664.7308344108,,
-linear,robust,31,306269.9232239004,26399.875293394463,279872.824370329,54401.104602086416,134.32737693008372,0.31909212993628877,0.9303375215162144,0.00025000448833182963,279872.824370329,54401.104602086416,,
-qtable,no_robust,32,259818.72178238883,67188.58622318009,222088.83510765125,94450.12569617687,116.84641954166946,22.42810298937963,0.9140582213134033,0.02778864370791322,222088.83510765125,94450.12569617687,0.29218749999999993,0.2559326319498438
-qtable,robust,40,244470.50673219413,78666.30912808319,216920.53697298188,93983.50987622296,118.94013969887506,23.1428303249914,0.9178608956089163,0.023827311253270544,216920.53697298188,93983.50987622296,0.4396239583333334,0.29521865862482416
-static,no_robust,33,85228.452028227,12041.415672002751,64828.579890468536,17681.280330831738,70.58818912317687,4.204964531595236,0.8721419294578765,0.007107262779462876,64828.579890468536,17681.280330831738,,
-static,robust,30,84963.18577955024,8926.291379160475,63243.76603076817,14880.924342692271,70.94358095957392,4.363134562111469,0.8730306888410219,0.006660289247744752,63243.76603076817,14880.924342692271,,
-surge,no_robust,32,11121.867310184698,809.9895800277001,4260.038064073964,1160.4282377968032,16.416108827015794,0.641203520341943,0.43413855082681374,0.006214799767130059,4260.038064073964,1160.4282377968032,,
-surge,robust,29,10994.355365953365,750.5115890942825,4448.160863178768,1000.7519971246122,16.495943148858906,0.9823026347466668,0.4347587896392907,0.009698591291108968,4448.160863178768,1000.7519971246122,,
--- a/paper/src/chapters/figures/results/generated/legacy/first_sweep_top_configs.csv
+++ b/paper/src/chapters/figures/results/generated/legacy/first_sweep_top_configs.csv
@@ -1,26 +0,0 @@
-Name,tier,alpha,mode,objective/score,eval/revenue_mean,eval/reward_mean,eval/coi_level_mean,lambda_coi,robust_radius,learning_rate,batch_size,n_steps,total_timesteps
-eager-sweep-244,dqn,0.0,no_robust,413274.4339549909,355872.06196128257,413274.4339549909,136.722140138007,0.2,0.1,0.0003,256,4096,15000
-efficient-sweep-319,linear,0.0,no_robust,410094.0151741567,353309.5198146561,410094.0151741567,134.55152038805429,0.4,0.1,0.001,128,4096,15000
-swept-sweep-422,linear,0.0,no_robust,403130.32747386186,347611.2815474988,403130.32747386186,133.8559785775022,0.4,0.3,0.0001,512,1024,15000
-decent-sweep-478,linear,0.1,no_robust,400452.36418713134,345284.5750647792,400452.36418713134,134.73082941975588,0.1,0.2,0.001,128,1024,50000
-eternal-sweep-339,linear,0.1,no_robust,399628.4231731644,344154.38525771734,399628.4231731644,134.89479277649667,0.4,0.1,0.0001,256,1024,50000
-ethereal-sweep-21,dqn,0.1,no_robust,398492.807245857,343580.6802427996,398492.807245857,136.67160732585188,0.1,0.2,0.001,512,2048,50000
-dark-sweep-418,linear,0.1,no_robust,394615.3720658343,339749.76272695075,394615.3720658343,134.39233246711,0.2,0.1,0.0003,256,1024,50000
-wandering-sweep-122,dqn,0.0,robust,394061.3617726404,339512.43434806296,394061.3617726404,137.6864755964331,0.1,0.3,0.0001,256,2048,30000
-laced-sweep-132,dqn,0.1,robust,389274.54998495104,335600.5979215904,389274.54998495104,137.36888574027677,0.4,0.2,0.001,256,2048,30000
-rich-sweep-53,qtable,0.0,robust,388601.2626147048,335630.6853337664,388601.2626147048,133.4414069888203,0.2,0.1,0.0001,512,1024,50000
-faithful-sweep-430,qtable,0.25,no_robust,387035.6970938766,333255.5771210341,387035.6970938766,137.4906091183188,0.1,0.2,0.0003,128,1024,15000
-dark-sweep-280,qtable,0.25,no_robust,386318.8845004527,332220.0316564078,386318.8845004527,137.26992450099925,0.4,0.1,0.0001,256,1024,50000
-chocolate-sweep-383,linear,0.25,no_robust,383989.49015403807,331071.7003244704,383989.49015403807,134.60590742050857,0.1,0.2,0.001,512,1024,30000
-dry-sweep-263,dqn,0.0,robust,383372.6880637367,330436.0312615148,383372.6880637367,137.40558130223476,0.1,0.3,0.001,128,1024,50000
-different-sweep-143,qtable,0.0,robust,383278.4198015018,330546.16800945485,383278.4198015018,135.9021538079678,0.1,0.3,0.001,256,2048,30000
-woven-sweep-139,dqn,0.25,robust,382788.1296637251,329427.735752473,382788.1296637251,136.8968339394894,0.1,0.1,0.001,512,1024,15000
-dark-sweep-215,dqn,0.25,robust,382358.2401374872,329330.0097603144,382358.2401374872,137.64528612332785,0.2,0.1,0.0001,512,4096,30000
-charmed-sweep-136,linear,0.25,no_robust,382249.5728044314,329646.2053260979,382249.5728044314,134.46825608007862,0.4,0.1,0.0001,256,2048,15000
-light-sweep-308,linear,0.0,robust,381939.1275250679,329628.9436641051,381939.1275250679,133.6209821974879,0.2,0.2,0.001,128,4096,30000
-treasured-sweep-325,linear,0.25,robust,381322.0104772589,328353.58675398555,381322.0104772589,134.8950293943581,0.1,0.1,0.0001,512,2048,15000
-fine-sweep-202,dqn,0.25,robust,378751.33572275366,326518.9068184018,378751.33572275366,137.2900973301052,0.1,0.2,0.0001,512,2048,30000
-treasured-sweep-380,linear,0.25,no_robust,377898.0979419424,325869.1953595453,377898.0979419424,134.54118723889738,0.4,0.3,0.001,128,1024,50000
-pretty-sweep-49,qtable,0.25,robust,377318.4766808995,325282.0152823859,377318.4766808995,137.19609012644068,0.4,0.1,0.0001,128,4096,50000
-desert-sweep-253,linear,0.25,robust,376808.6335063269,325146.3478714648,376808.6335063269,134.48396340732663,0.2,0.1,0.0003,256,1024,30000
-jolly-sweep-133,qtable,0.4,no_robust,376419.57394710975,323709.24588324485,376419.57394710975,137.8349363778071,0.1,0.3,0.0001,128,2048,50000
--- a/paper/src/chapters/figures/results/generated/legacy/plots/first_sweep_tier_revenue.pdf
+++ b/paper/src/chapters/figures/results/generated/legacy/plots/first_sweep_tier_revenue.pdf
--- a/paper/src/chapters/figures/results/generated/legacy/plots/ppo_alpha_curves.pdf
+++ b/paper/src/chapters/figures/results/generated/legacy/plots/ppo_alpha_curves.pdf
--- a/paper/src/chapters/figures/results/generated/legacy/plots/ppo_tradeoff_scatter.pdf
+++ b/paper/src/chapters/figures/results/generated/legacy/plots/ppo_tradeoff_scatter.pdf
--- a/paper/src/chapters/figures/results/generated/legacy/ppo_alpha_deltas.csv
+++ b/paper/src/chapters/figures/results/generated/legacy/ppo_alpha_deltas.csv
@@ -1,7 +0,0 @@
-alpha,runs_robust,runs_no_robust,eval_revenue_mean_robust,eval_revenue_mean_no_robust,eval_revenue_mean_delta,eval_revenue_mean_delta_pct,eval_reward_mean_robust,eval_reward_mean_no_robust,eval_reward_mean_delta,eval_reward_mean_delta_pct,eval_coi_level_mean_robust,eval_coi_level_mean_no_robust,eval_coi_level_mean_delta,eval_coi_level_mean_delta_pct,eval_coi_leakage_mean_robust,eval_coi_leakage_mean_no_robust,eval_coi_leakage_mean_delta,eval_coi_leakage_mean_delta_pct,eval_volatility_mean_robust,eval_volatility_mean_no_robust,eval_volatility_mean_delta,eval_volatility_mean_delta_pct,eval_margin_mean_robust,eval_margin_mean_no_robust,eval_margin_mean_delta,eval_margin_mean_delta_pct,train_alpha_adv_robust,train_alpha_adv_no_robust,train_alpha_adv_delta,train_alpha_adv_delta_pct,train_coi_penalty_robust,train_coi_penalty_no_robust,train_coi_penalty_delta,train_coi_penalty_delta_pct,train_ux_penalty_robust,train_ux_penalty_no_robust,train_ux_penalty_delta,train_ux_penalty_delta_pct,train_agent_prob_robust,train_agent_prob_no_robust,train_agent_prob_delta,train_agent_prob_delta_pct
-0.0,4.0,4.0,3379.9042994670963,3565.2912010160844,-185.38690154898813,-5.199768857482219,313527.4707462,331300.229069,-17772.758322799986,-5.364547550342456,137.08358925982625,137.28764358955686,-0.2040543297306101,-0.14863269875959326,0.1146626165658294,0.11861133504329742,-0.003948718477468013,-3.3291240470622716,0.06687153537785637,0.06445662162531288,0.0024149137525434905,3.746572022625408,0.9315273502623671,0.9317078361627993,-0.00018048590043218127,-0.019371512552207898,0.18958333333333333,,,,5.553200113221484,,,,61.35134238638615,66.58479574844135,-5.233453362055201,-7.859832418540847,0.12778212146468534,0.11615891320235115,0.011623208262334192,10.00629907933654
-0.1,4.0,4.0,3307.028238366196,3458.002436284769,-150.97419791857283,-4.365936713473732,306772.49146475,321215.477968,-14442.986503249966,-4.4963544704059375,137.1182041122497,136.82757579763506,0.29062831461465066,0.21240478238427865,0.1128546052304944,0.11704917861668755,-0.004194573386193154,-3.5835991638433753,0.0685405649303561,0.06737596899527175,0.0011645959350843477,1.728503430007924,0.9315331673960889,0.9313276818191593,0.00020548557692967595,0.0220637248243606,0.2818749999999999,0.1,0.18187499999999987,181.87499999999986,5.079528726095333,,,,52.44772950699336,53.288869747139515,-0.841140240146153,-1.578453895039319,0.11644381911386253,0.11765277436070229,-0.0012089552468397546,-1.0275620387270383
-0.25,4.0,4.0,3134.3438215278165,3300.5539051855053,-166.21008365768876,-5.035823938416998,290691.4771835,306522.90003785,-15831.422854350007,-5.16484179563586,136.89990884669214,136.71752459667877,0.18238425001337077,0.1334022471160229,0.11113957413522965,0.1139905600539111,-0.0028509859186814507,-2.50107194607439,0.06427159998376095,0.06846858821082077,-0.004196988227059828,-6.12980103246314,0.9314501501825461,0.9313053225630614,0.0001448276194846443,0.015551035302371268,0.44833333333333336,0.25,0.19833333333333336,79.33333333333334,4.7183804755060255,,,,49.04307009982127,55.2030005738411,-6.159930474019831,-11.158687770568074,0.10998505830218755,0.11684259343269415,-0.0068575351305066035,-5.869037077182653
-0.4,4.0,4.0,2983.852437569374,3180.7872854626567,-196.9348478932825,-6.191386918369099,276545.26309355,295433.5405797,-18888.277486150037,-6.393409986248494,136.19210761854086,136.5783021470118,-0.38619452847095204,-0.2827641890402586,0.10875560547061063,0.11189234314151972,-0.0031367376709090927,-2.8033532794480807,0.07452230347799255,0.07104688223410768,0.003475421243884863,4.891729425132195,0.9307282962514367,0.9310542820602117,-0.0003259858087749645,-0.03501254599824534,0.5999999999999999,0.4000000000000001,0.1999999999999998,49.999999999999936,4.174996403604185,,,,47.99794119802058,50.794260008988424,-2.796318810967847,-5.505186630286606,0.10222958892923095,0.11161526349272373,-0.009385674563492777,-8.408952565976458
-0.6,4.0,4.0,2789.0434220430398,2982.2460998252786,-193.20267778223888,-6.4784283830083,258688.11700405,277051.95613675,-18363.8391327,-6.628301560749781,136.86774320500828,136.81931587629953,0.04842732870875466,0.035395096371142916,0.10501047827147733,0.10802266412956946,-0.0030121858580921257,-2.788475809557069,0.06914180963767007,0.06698591531512615,0.0021558943225439137,3.2184292957732996,0.9314130089130337,0.9313849217310588,2.8087181974889575e-05,0.003015636319588161,0.7733333333333334,0.5999999999999999,0.17333333333333356,28.888888888888935,4.178300996512875,,,,39.928062615509425,47.86860429278531,-7.940541677275881,-16.588203885594947,0.11297979438696983,0.1162670925925253,-0.0032872982055554695,-2.827367686122743
-0.8,4.0,4.0,2586.098242115281,2841.1305915063504,-255.03234939106915,-8.97643882169642,239765.24959855,264140.55002745,-24375.300428900024,-9.228155399224729,136.5038826686135,137.28163778418497,-0.7777551155714661,-0.5665397995864124,0.10253056902792507,0.1031498585902154,-0.0006192895622903344,-0.6003784888844036,0.07325665736408164,0.06592454978099352,0.007332107583088124,11.1219683827132,0.9311235469993302,0.9316596013994161,-0.0005360544000858614,-0.05753758124541101,1.0,0.8000000000000002,0.19999999999999984,24.99999999999998,3.5384100686094007,,,,37.14414699970415,37.43809775029793,-0.29395075059377973,-0.7851647606519765,0.09990322635678014,0.10432800196112454,-0.0044247756043444,-4.241215705437541
--- a/paper/src/chapters/figures/results/generated/legacy/ppo_alpha_mode_summary.csv
+++ b/paper/src/chapters/figures/results/generated/legacy/ppo_alpha_mode_summary.csv
@@ -1,13 +0,0 @@
-alpha,mode,runs,eval_revenue_mean_mean,eval_revenue_mean_std,eval_reward_mean_mean,eval_reward_mean_std,eval_coi_level_mean_mean,eval_coi_level_mean_std,eval_coi_leakage_mean_mean,eval_coi_leakage_mean_std,eval_volatility_mean_mean,eval_volatility_mean_std,eval_margin_mean_mean,eval_margin_mean_std,train_alpha_adv_mean,train_alpha_adv_std,train_coi_penalty_mean,train_coi_penalty_std,train_ux_penalty_mean,train_ux_penalty_std,train_agent_prob_mean,train_agent_prob_std
-0.0,no_robust,4,3565.2912010160844,52.219179508209216,331300.229069,5038.96659004527,137.28764358955686,0.6434240315013728,0.11861133504329742,0.004019332768284657,0.06445662162531288,0.004080405219050139,0.9317078361627993,0.00038018051704976865,,,,,66.58479574844135,32.282270089830455,0.11615891320235115,0.016558627227281013
-0.0,robust,4,3379.9042994670963,54.727408939657735,313527.4707462,5408.058196552377,137.08358925982625,1.047386315387148,0.1146626165658294,0.0025627354157035497,0.06687153537785637,0.008577061675868377,0.9315273502623671,0.0007274203134899985,0.18958333333333333,0.02083333333333336,5.553200113221484,0.45981481828856186,61.35134238638615,30.27964905193963,0.12778212146468534,0.027929667978205217
-0.1,no_robust,4,3458.002436284769,60.75923217871363,321215.477968,6016.373193216596,136.82757579763506,1.1899102161551907,0.11704917861668755,0.0021220259908233973,0.06737596899527175,0.006801136773079149,0.9313276818191593,0.0008352263172197586,0.1,0.0,,,53.288869747139515,18.480340945815023,0.11765277436070229,0.017544197575138736
-0.1,robust,4,3307.028238366196,35.58495715224888,306772.49146475,3488.2690530060245,137.1182041122497,0.8582218376452346,0.1128546052304944,0.0005963155492967403,0.0685405649303561,0.0050673362512629015,0.9315331673960889,0.0005217376436765336,0.2818749999999999,0.03624999999999999,5.079528726095333,0.6109585102054891,52.44772950699336,29.0263361696475,0.11644381911386253,0.021152545180088765
-0.25,no_robust,4,3300.5539051855053,50.460978662647115,306522.90003785,4860.668937531515,136.71752459667877,0.7410676951244369,0.1139905600539111,0.003319948537321803,0.06846858821082077,0.008614994548315848,0.9313053225630614,0.0004919872662680591,0.25,0.0,,,55.2030005738411,26.88247558235345,0.11684259343269415,0.013462146346772591
-0.25,robust,4,3134.3438215278165,64.06834403659167,290691.4771835,6331.196493752059,136.89990884669214,1.3796663751798552,0.11113957413522965,0.0015044942041406348,0.06427159998376095,0.0042331619171274894,0.9314501501825461,0.0008939739741734515,0.44833333333333336,0.0033333333333333518,4.7183804755060255,0.4538389380858333,49.04307009982127,28.20484665432831,0.10998505830218755,0.010731404693185651
-0.4,no_robust,4,3180.7872854626567,71.87564776824694,295433.5405797,7035.374110540269,136.5783021470118,1.7095219574599192,0.11189234314151972,0.0013821115134030936,0.07104688223410768,0.005766138692685495,0.9310542820602117,0.0013989725050689828,0.4000000000000001,0.0,,,50.794260008988424,24.836708377642946,0.11161526349272373,0.005787749200301594
-0.4,robust,4,2983.852437569374,45.51290575912758,276545.26309355,4555.1725323898245,136.19210761854086,1.5546063667946701,0.10875560547061063,0.001118798290958954,0.07452230347799255,0.0040446395928049874,0.9307282962514367,0.0013558080014763189,0.5999999999999999,0.0,4.174996403604185,0.12189448324552496,47.99794119802058,33.51782503281748,0.10222958892923095,0.0031686467591609474
-0.6,no_robust,4,2982.2460998252786,39.93674476199945,277051.95613675,3931.02017169463,136.81931587629953,1.1995405806950865,0.10802266412956946,0.000405835985606262,0.06698591531512615,0.002805894772223563,0.9313849217310588,0.0008100530228792662,0.5999999999999999,0.0,,,47.86860429278531,23.830502772642472,0.1162670925925253,0.028676813474186293
-0.6,robust,4,2789.0434220430398,35.297482315631626,258688.11700405,3420.6735023624556,136.86774320500828,0.7097303238857778,0.10501047827147733,0.0008273121554488608,0.06914180963767007,0.009066158371268139,0.9314130089130337,0.0005024421703994162,0.7733333333333334,0.053333333333333385,4.178300996512875,0.5865970573865015,39.928062615509425,30.25078643153115,0.11297979438696983,0.0274101056520461
-0.8,no_robust,4,2841.1305915063504,21.84043179776092,264140.55002745,2073.353315114627,137.28163778418497,0.6288968799501957,0.1031498585902154,0.0012877581835795701,0.06592454978099352,0.00340700896766341,0.9316596013994161,0.00038430108058413553,0.8000000000000002,0.0,,,37.43809775029793,32.01740090550489,0.10432800196112454,0.018337841526911584
-0.8,robust,4,2586.098242115281,48.05539265296157,239765.24959855,4681.6472175597555,136.5038826686135,1.0611320896043694,0.10253056902792507,0.002587472569909977,0.07325665736408164,0.0015359324114246234,0.9311235469993302,0.0006145440308596868,1.0,0.0,3.5384100686094007,0.391972726035734,37.14414699970415,25.614063825315505,0.09990322635678014,0.010269342031085898
--- a/paper/src/chapters/figures/results/generated/legacy/ppo_headline_summary.json
+++ b/paper/src/chapters/figures/results/generated/legacy/ppo_headline_summary.json
@@ -1,7 +0,0 @@
-{
-  "status": "ok",
-  "revenue_delta": -191.29017636530716,
-  "revenue_delta_pct": -5.938226273545598,
-  "coi_leakage_delta": -0.002960415145605702,
-  "coi_leakage_delta_pct": -2.6404147469510946
-}
--- a/paper/src/chapters/figures/results/generated/legacy/ppo_overall_mode_summary.csv
+++ b/paper/src/chapters/figures/results/generated/legacy/ppo_overall_mode_summary.csv
@@ -1,3 +0,0 @@
-mode,runs,eval_revenue_mean_mean,eval_revenue_mean_std,eval_reward_mean_mean,eval_reward_mean_std,eval_coi_level_mean_mean,eval_coi_level_mean_std,eval_coi_leakage_mean_mean,eval_coi_leakage_mean_std,eval_volatility_mean_mean,eval_volatility_mean_std,eval_margin_mean_mean,eval_margin_mean_std,train_alpha_adv_mean,train_alpha_adv_std,train_coi_penalty_mean,train_coi_penalty_std,train_ux_penalty_mean,train_ux_penalty_std,train_agent_prob_mean,train_agent_prob_std
-no_robust,24,3221.335253213441,262.46595166337727,299277.442303125,24382.561944761477,136.9186666318945,1.0038463876967063,0.11211932326253345,0.005805494533542669,0.06737642102693879,0.005402738047823369,0.9314066076226178,0.0007436370959663933,0.43,0.2546411303445653,,,51.86293802024894,25.340287421525442,0.11381077317368686,0.016664235359362907
-robust,24,3030.0450768481337,288.262657026656,280998.34484843333,26820.020161880373,136.77757261848845,1.06224696086916,0.10915890811692774,0.004616462637659704,0.06943407846195294,0.006435789449278624,0.9312959200008004,0.0007858424519830652,0.5488541666666666,0.2860373751485706,4.540469463924883,0.7906156355346259,47.985382134405825,27.407657819442747,0.11155393475895271,0.01943348418653492
--- a/paper/src/chapters/figures/results/generated/legacy/ppo_pairwise_win_rates.csv
+++ b/paper/src/chapters/figures/results/generated/legacy/ppo_pairwise_win_rates.csv
@@ -1,25 +0,0 @@
-alpha,metric,direction,wins,ties,total_pairs,win_probability
-0.0,eval/revenue_mean,higher,0,0,16,0.0
-0.0,eval/reward_mean,higher,0,0,16,0.0
-0.0,eval/coi_leakage_mean,lower,14,0,16,0.875
-0.0,eval/volatility_mean,lower,8,0,16,0.5
-0.1,eval/revenue_mean,higher,0,0,16,0.0
-0.1,eval/reward_mean,higher,0,0,16,0.0
-0.1,eval/coi_leakage_mean,lower,16,0,16,1.0
-0.1,eval/volatility_mean,lower,8,0,16,0.5
-0.25,eval/revenue_mean,higher,0,0,16,0.0
-0.25,eval/reward_mean,higher,0,0,16,0.0
-0.25,eval/coi_leakage_mean,lower,12,0,16,0.75
-0.25,eval/volatility_mean,lower,11,0,16,0.6875
-0.4,eval/revenue_mean,higher,0,0,16,0.0
-0.4,eval/reward_mean,higher,0,0,16,0.0
-0.4,eval/coi_leakage_mean,lower,16,0,16,1.0
-0.4,eval/volatility_mean,lower,6,0,16,0.375
-0.6,eval/revenue_mean,higher,0,0,16,0.0
-0.6,eval/reward_mean,higher,0,0,16,0.0
-0.6,eval/coi_leakage_mean,lower,16,0,16,1.0
-0.6,eval/volatility_mean,lower,7,0,16,0.4375
-0.8,eval/revenue_mean,higher,0,0,16,0.0
-0.8,eval/reward_mean,higher,0,0,16,0.0
-0.8,eval/coi_leakage_mean,lower,11,0,16,0.6875
-0.8,eval/volatility_mean,lower,0,0,16,0.0
--- a/paper/src/chapters/figures/results/includes/final_focus_coi_by_alpha.tex
+++ b/paper/src/chapters/figures/results/includes/final_focus_coi_by_alpha.tex
@@ -1 +1 @@
-\includegraphics[width=0.98\linewidth]{chapters/figures/results/generated/legacy/plots/ppo_alpha_curves.pdf}
+\includegraphics[width=0.98\linewidth]{chapters/figures/results/generated/final/plots/final_focus_coi_by_alpha.pdf}
--- a/paper/src/chapters/figures/results/includes/final_focus_coi_preservation_grid.tex
+++ b/paper/src/chapters/figures/results/includes/final_focus_coi_preservation_grid.tex
@@ -1 +1 @@
-\includegraphics[width=0.98\linewidth]{chapters/figures/results/generated/legacy/plots/ppo_delta_curves.pdf}
+\includegraphics[width=0.98\linewidth]{chapters/figures/results/generated/final/plots/final_focus_coi_preservation_grid.pdf}
--- a/paper/src/chapters/figures/results/includes/final/final_focus_revenue_by_alpha.tex
+++ b/paper/src/chapters/figures/results/includes/final/final_focus_revenue_by_alpha.tex
--- a/paper/src/chapters/figures/results/includes/final/final_focus_revenue_delta.tex
+++ b/paper/src/chapters/figures/results/includes/final/final_focus_revenue_delta.tex
--- a/paper/src/chapters/figures/results/includes/final/final_focus_risk_deltas.tex
+++ b/paper/src/chapters/figures/results/includes/final/final_focus_risk_deltas.tex
--- a/paper/src/chapters/figures/results/includes/legacy/first_sweep_tier_revenue.tex
+++ b/paper/src/chapters/figures/results/includes/legacy/first_sweep_tier_revenue.tex
@@ -1 +0,0 @@
-\includegraphics[width=0.99\linewidth]{chapters/figures/results/generated/legacy/plots/first_sweep_tier_revenue.pdf}
--- a/paper/src/chapters/figures/results/includes/legacy/ppo_tradeoff_scatter.tex
+++ b/paper/src/chapters/figures/results/includes/legacy/ppo_tradeoff_scatter.tex
@@ -1 +0,0 @@
-\includegraphics[width=0.88\linewidth]{chapters/figures/results/generated/legacy/plots/ppo_tradeoff_scatter.pdf}
--- a/paper/src/chapters/figures/results/plot_results.py
+++ b/paper/src/chapters/figures/results/plot_results.py
@@ -1,313 +0,0 @@
-from __future__ import annotations
-
-import argparse
-from pathlib import Path
-
-import matplotlib
-
-matplotlib.use("Agg")
-import matplotlib.pyplot as plt
-from matplotlib.ticker import FuncFormatter
-import numpy as np
-import pandas as pd
-
-from process_first_sweep import run as run_first_sweep
-from process_ppo_benchmark import run as run_ppo_benchmark
-
-
-def _output_dir() -> Path:
-    return Path(__file__).resolve().parent / "generated" / "legacy"
-
-
-def _plot_dir() -> Path:
-    return _output_dir() / "plots"
-
-
-def _configure_style() -> None:
-    plt.rcParams.update(
-        {
-            "font.family": "serif",
-            "font.size": 10,
-            "axes.titlesize": 10,
-            "axes.labelsize": 9,
-            "legend.fontsize": 8,
-            "xtick.labelsize": 8,
-            "ytick.labelsize": 8,
-            "figure.dpi": 220,
-            "savefig.dpi": 320,
-            "axes.spines.top": False,
-            "axes.spines.right": False,
-            "axes.grid": True,
-            "grid.alpha": 0.22,
-        }
-    )
-
-
-def _fmt_thousands(value: float, _: int) -> str:
-    return f"{int(value):,}"
-
-
-def _load_csv(path: Path) -> pd.DataFrame:
-    if not path.exists():
-        raise FileNotFoundError(f"Missing required input: {path}")
-    return pd.read_csv(path)
-
-
-def _plot_ppo_alpha_curves(alpha_mode: pd.DataFrame, out_dir: Path) -> Path:
-    fig, axes = plt.subplots(2, 2, figsize=(9.3, 6.4), constrained_layout=True)
-    robust_color = "#C44E52"
-    baseline_color = "#4C72B0"
-    mode_colors = {"robust": robust_color, "no_robust": baseline_color}
-    mode_labels = {"robust": "Robust", "no_robust": "Non-robust"}
-
-    panels = [
-        ("eval_revenue_mean", "Mean Episode Revenue", "Revenue"),
-        ("eval_reward_mean", "Mean Episode Reward", "Reward"),
-        ("eval_coi_leakage_mean", "Mean COI Leakage", "COI Leakage"),
-        ("eval_volatility_mean", "Mean Price Volatility", "Volatility"),
-    ]
-
-    for ax, (metric_prefix, title, ylabel) in zip(axes.flat, panels):
-        mean_col = f"{metric_prefix}_mean"
-        std_col = f"{metric_prefix}_std"
-        for mode in ("no_robust", "robust"):
-            sub = alpha_mode[alpha_mode["mode"] == mode].sort_values("alpha")
-            if sub.empty:
-                continue
-            x = sub["alpha"].to_numpy(dtype=float)
-            y = sub[mean_col].to_numpy(dtype=float)
-            ax.plot(
-                x,
-                y,
-                marker="o",
-                linewidth=1.8,
-                markersize=4,
-                color=mode_colors[mode],
-                label=mode_labels[mode],
-            )
-            if std_col in sub.columns:
-                sigma = sub[std_col].fillna(0.0).to_numpy(dtype=float)
-                ax.fill_between(
-                    x,
-                    y - sigma,
-                    y + sigma,
-                    color=mode_colors[mode],
-                    alpha=0.14,
-                    linewidth=0,
-                )
-
-        ax.set_title(title)
-        ax.set_xlabel(r"Contamination $\alpha$")
-        ax.set_ylabel(ylabel)
-        ax.set_xticks(sorted(alpha_mode["alpha"].unique()))
-        if metric_prefix in {"eval_revenue_mean", "eval_reward_mean"}:
-            ax.yaxis.set_major_formatter(FuncFormatter(_fmt_thousands))
-
-    handles, labels = axes.flat[0].get_legend_handles_labels()
-    fig.legend(handles, labels, ncol=2, loc="upper center", bbox_to_anchor=(0.5, 1.02))
-
-    out_path = out_dir / "ppo_alpha_curves.pdf"
-    fig.savefig(out_path, bbox_inches="tight")
-    plt.close(fig)
-    return out_path
-
-
-def _plot_ppo_delta_curves(deltas: pd.DataFrame, out_dir: Path) -> Path:
-    fig, axes = plt.subplots(2, 1, figsize=(8.6, 6.0), constrained_layout=True)
-    deltas = deltas.sort_values("alpha")
-    x = deltas["alpha"].to_numpy(dtype=float)
-
-    top_metrics = [
-        ("eval_revenue_mean_delta_pct", "Revenue", "#4C72B0"),
-        ("eval_reward_mean_delta_pct", "Reward", "#8172B3"),
-    ]
-    for col, label, color in top_metrics:
-        axes[0].plot(
-            x,
-            deltas[col].to_numpy(dtype=float),
-            marker="o",
-            linewidth=1.8,
-            markersize=4,
-            color=color,
-            label=label,
-        )
-    axes[0].axhline(0.0, color="#444444", linewidth=1.0, linestyle="--")
-    axes[0].set_title("Robust Minus Non-robust Delta by Contamination")
-    axes[0].set_ylabel("Delta (%)")
-    axes[0].set_xlabel(r"Contamination $\alpha$")
-    axes[0].set_xticks(x)
-    axes[0].legend(loc="lower left")
-
-    bottom_metrics = [
-        ("eval_coi_leakage_mean_delta_pct", "COI Leakage", "#55A868"),
-        ("eval_volatility_mean_delta_pct", "Volatility", "#DD8452"),
-    ]
-    for col, label, color in bottom_metrics:
-        axes[1].plot(
-            x,
-            deltas[col].to_numpy(dtype=float),
-            marker="o",
-            linewidth=1.8,
-            markersize=4,
-            color=color,
-            label=label,
-        )
-    axes[1].axhline(0.0, color="#444444", linewidth=1.0, linestyle="--")
-    axes[1].set_ylabel("Delta (%)")
-    axes[1].set_xlabel(r"Contamination $\alpha$")
-    axes[1].set_xticks(x)
-    axes[1].legend(loc="lower left")
-
-    out_path = out_dir / "ppo_delta_curves.pdf"
-    fig.savefig(out_path, bbox_inches="tight")
-    plt.close(fig)
-    return out_path
-
-
-def _plot_ppo_tradeoff_scatter(deltas: pd.DataFrame, out_dir: Path) -> Path:
-    fig, ax = plt.subplots(figsize=(6.4, 5.2), constrained_layout=True)
-    data = deltas.sort_values("alpha")
-    x = data["eval_coi_leakage_mean_delta_pct"].to_numpy(dtype=float)
-    y = data["eval_revenue_mean_delta_pct"].to_numpy(dtype=float)
-    alphas = data["alpha"].to_numpy(dtype=float)
-
-    scatter = ax.scatter(
-        x,
-        y,
-        c=alphas,
-        cmap="viridis",
-        s=72,
-        edgecolor="#222222",
-        linewidth=0.5,
-    )
-    for x_i, y_i, alpha in zip(x, y, alphas):
-        ax.annotate(
-            rf"$\alpha={alpha:.2f}$",
-            (x_i, y_i),
-            textcoords="offset points",
-            xytext=(5, 4),
-            fontsize=8,
-        )
-
-    ax.axhline(0.0, color="#555555", linewidth=1.0, linestyle="--")
-    ax.axvline(0.0, color="#555555", linewidth=1.0, linestyle="--")
-    ax.set_xlabel("COI Leakage Delta (%)")
-    ax.set_ylabel("Revenue Delta (%)")
-    ax.set_title("PPO Robust Tradeoff Frontier")
-    cbar = fig.colorbar(scatter, ax=ax)
-    cbar.set_label(r"Contamination $\alpha$")
-
-    out_path = out_dir / "ppo_tradeoff_scatter.pdf"
-    fig.savefig(out_path, bbox_inches="tight")
-    plt.close(fig)
-    return out_path
-
-
-def _plot_first_sweep_tier_revenue(tier_mode: pd.DataFrame, out_dir: Path) -> Path:
-    pivot = (
-        tier_mode.pivot(index="tier", columns="mode", values="eval_revenue_mean_mean")
-        .dropna(subset=["robust", "no_robust"], how="any")
-        .copy()
-    )
-    if pivot.empty:
-        raise ValueError("First sweep tier summary missing robust/non-robust pairs")
-
-    order = sorted(pivot.index.tolist())
-    pivot = pivot.loc[order]
-    delta_pct = 100.0 * (pivot["robust"] - pivot["no_robust"]) / pivot["no_robust"]
-
-    fig, axes = plt.subplots(1, 2, figsize=(10.2, 4.3), constrained_layout=True)
-    x = np.arange(len(order))
-    width = 0.36
-
-    axes[0].bar(
-        x - width / 2,
-        pivot["no_robust"].to_numpy(dtype=float),
-        width=width,
-        label="Non-robust",
-        color="#4C72B0",
-    )
-    axes[0].bar(
-        x + width / 2,
-        pivot["robust"].to_numpy(dtype=float),
-        width=width,
-        label="Robust",
-        color="#C44E52",
-    )
-    axes[0].set_xticks(x)
-    axes[0].set_xticklabels(order, rotation=20)
-    axes[0].set_ylabel("Mean Revenue")
-    axes[0].set_yscale("log")
-    axes[0].yaxis.set_major_formatter(FuncFormatter(_fmt_thousands))
-    axes[0].set_title("First Sweep Tier Revenue (log scale)")
-    axes[0].legend()
-
-    axes[1].bar(x, delta_pct.to_numpy(dtype=float), color="#55A868", width=0.55)
-    axes[1].axhline(0.0, color="#444444", linewidth=1.0, linestyle="--")
-    axes[1].set_xticks(x)
-    axes[1].set_xticklabels(order, rotation=20)
-    axes[1].set_ylabel("Revenue Delta (%)")
-    axes[1].set_title("Robust Minus Non-robust by Tier")
-
-    out_path = out_dir / "first_sweep_tier_revenue.pdf"
-    fig.savefig(out_path, bbox_inches="tight")
-    plt.close(fig)
-    return out_path
-
-
-def build_plots(data_dir: Path, out_dir: Path) -> list[Path]:
-    alpha_mode = _load_csv(data_dir / "ppo_alpha_mode_summary.csv")
-    deltas = _load_csv(data_dir / "ppo_alpha_deltas.csv")
-    tier_mode = _load_csv(data_dir / "first_sweep_tier_mode_summary.csv")
-
-    out_dir.mkdir(parents=True, exist_ok=True)
-    paths = [
-        _plot_ppo_alpha_curves(alpha_mode, out_dir),
-        _plot_ppo_delta_curves(deltas, out_dir),
-        _plot_ppo_tradeoff_scatter(deltas, out_dir),
-        _plot_first_sweep_tier_revenue(tier_mode, out_dir),
-    ]
-    return paths
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Create paper-ready plots from result CSVs"
-    )
-    parser.add_argument("--data-dir", type=Path, default=_output_dir())
-    parser.add_argument("--plot-dir", type=Path, default=_plot_dir())
-    parser.add_argument(
-        "--refresh-data",
-        action="store_true",
-        help="Regenerate processed CSVs before plotting",
-    )
-    args = parser.parse_args()
-
-    _configure_style()
-
-    if bool(args.refresh_data):
-        run_ppo_benchmark(
-            input_path=Path(__file__).resolve().parents[5]
-            / "tpu_orchestration"
-            / "results"
-            / "ppo_benchmark.csv",
-            output_dir=args.data_dir,
-            include_non_finished=False,
-        )
-        run_first_sweep(
-            input_path=Path(__file__).resolve().parents[5]
-            / "tpu_orchestration"
-            / "results"
-            / "first_sweep.csv",
-            output_dir=args.data_dir,
-            include_non_finished=False,
-            top_n=25,
-        )
-
-    outputs = build_plots(data_dir=args.data_dir, out_dir=args.plot_dir)
-    for path in outputs:
-        print(path)
-
-
-if __name__ == "__main__":
-    main()
--- a/paper/src/chapters/figures/results/process_all_results.py
+++ b/paper/src/chapters/figures/results/process_all_results.py
@@ -1,51 +0,0 @@
-from __future__ import annotations
-
-import argparse
-from pathlib import Path
-
-from process_first_sweep import run as run_first_sweep
-from process_ppo_benchmark import run as run_ppo_benchmark
-
-
-def _default_output_dir() -> Path:
-    return Path(__file__).resolve().parent / "generated" / "legacy"
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Process all result CSV exports for paper figures"
-    )
-    parser.add_argument("--output-dir", type=Path, default=_default_output_dir())
-    parser.add_argument("--include-non-finished", action="store_true")
-    parser.add_argument("--top-n", type=int, default=25)
-    args = parser.parse_args()
-
-    written: list[Path] = []
-    written.extend(
-        run_ppo_benchmark(
-            input_path=Path(__file__).resolve().parents[5]
-            / "tpu_orchestration"
-            / "results"
-            / "ppo_benchmark.csv",
-            output_dir=args.output_dir,
-            include_non_finished=bool(args.include_non_finished),
-        )
-    )
-    written.extend(
-        run_first_sweep(
-            input_path=Path(__file__).resolve().parents[5]
-            / "tpu_orchestration"
-            / "results"
-            / "first_sweep.csv",
-            output_dir=args.output_dir,
-            include_non_finished=bool(args.include_non_finished),
-            top_n=int(args.top_n),
-        )
-    )
-
-    for path in written:
-        print(path)
-
-
-if __name__ == "__main__":
-    main()
--- a/paper/src/chapters/figures/results/process_final_sweeps.py
+++ b/paper/src/chapters/figures/results/process_final_sweeps.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 import argparse
 import json
 from pathlib import Path
+import subprocess
 from typing import Any

 import matplotlib
@@ -37,6 +38,20 @@ def _default_plot_dir(output_dir: Path) -> Path:
    return output_dir / "plots"


+def _git_commit() -> str:
+    """Return the HEAD commit hash of the project checkout, or ``"unknown"``.
+
+    Runs ``git rev-parse HEAD`` with ``_project_root()`` as the working
+    directory. Any failure (git not installed, not a repository, non-zero
+    exit status, ...) is swallowed on purpose: the hash is provenance
+    metadata only and must never abort result processing.
+    """
+    command = ["git", "rev-parse", "HEAD"]
+    try:
+        completed = subprocess.run(
+            command,
+            cwd=_project_root(),
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+    except Exception:
+        # Best-effort lookup: fall back to a sentinel instead of raising.
+        return "unknown"
+    else:
+        return completed.stdout.strip()
+
+
 def _truthy(value: Any) -> bool:
    if isinstance(value, bool):
        return value
@@ -195,6 +210,48 @@ def _zone_summary(alpha_deltas: pd.DataFrame) -> pd.DataFrame:
    )


+def _alpha_product_coi_preservation(runs: pd.DataFrame) -> pd.DataFrame:
+    """Compare defended vs. baseline mean COI level per (alpha, n_products) cell.
+
+    Runs are averaged per ``(alpha, n_products, mode)``; a cell is reported only
+    when both a ``"baseline"`` and a ``"defended"`` group exist for it.
+
+    Args:
+        runs: Per-run frame providing the ``alpha``, ``n_products``, ``mode``,
+            ``run_id`` and ``eval_coi_level_mean`` columns.
+
+    Returns:
+        One row per paired cell, sorted by ``alpha`` then ``n_products``, with
+        run counts, both means, their difference (``coi_preserved``) and the
+        difference as a percentage of the baseline mean (``0.0`` when the
+        baseline mean is exactly zero). When no cell has both modes the frame
+        is empty but still carries every column.
+    """
+    columns = [
+        "alpha",
+        "n_products",
+        "baseline_runs",
+        "defended_runs",
+        "baseline_coi_level_mean",
+        "defended_coi_level_mean",
+        "coi_preserved",
+        "coi_preserved_pct",
+    ]
+    grouped = (
+        runs.groupby(["alpha", "n_products", "mode"], as_index=False)
+        .agg(
+            runs=("run_id", "size"),
+            coi_level_mean=("eval_coi_level_mean", "mean"),
+        )
+        .sort_values(["alpha", "n_products", "mode"])
+        .reset_index(drop=True)
+    )
+
+    rows: list[dict[str, float | int]] = []
+    for (alpha, n_products), group in grouped.groupby(
+        ["alpha", "n_products"], sort=True
+    ):
+        defended = group[group["mode"] == "defended"]
+        baseline = group[group["mode"] == "baseline"]
+        if defended.empty or baseline.empty:
+            # Unpaired cell: there is nothing to compare against.
+            continue
+
+        d_coi = float(defended["coi_level_mean"].iloc[0])
+        b_coi = float(baseline["coi_level_mean"].iloc[0])
+        rows.append(
+            {
+                "alpha": float(alpha),
+                "n_products": float(n_products),
+                "baseline_runs": int(baseline["runs"].iloc[0]),
+                "defended_runs": int(defended["runs"].iloc[0]),
+                "baseline_coi_level_mean": b_coi,
+                "defended_coi_level_mean": d_coi,
+                "coi_preserved": d_coi - b_coi,
+                "coi_preserved_pct": 0.0
+                if b_coi == 0.0
+                else 100.0 * (d_coi - b_coi) / b_coi,
+            }
+        )
+
+    # Passing the columns explicitly keeps the schema when ``rows`` is empty;
+    # otherwise the frame has no columns and sort_values raises KeyError.
+    return (
+        pd.DataFrame(rows, columns=columns)
+        .sort_values(["alpha", "n_products"])
+        .reset_index(drop=True)
+    )
+
+
 def _save_plot(fig: plt.Figure, path: Path) -> Path:
    path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(path, bbox_inches="tight")
@@ -202,6 +259,61 @@ def _save_plot(fig: plt.Figure, path: Path) -> Path:
    return path


+def _smoothed_curve(
+    x: np.ndarray,
+    y: np.ndarray,
+    *,
+    window: int = 5,
+    points: int = 320,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Smooth ``y`` over ``x`` and resample it onto a dense, even grid.
+
+    Non-finite pairs are dropped, the rest are sorted by ``x`` and samples
+    sharing an ``x`` are averaged. With fewer than three points left the
+    cleaned data is returned unchanged. Otherwise ``y`` is convolved with a
+    normalised Gaussian-shaped kernel of odd width (at least 3, at most the
+    number of points) using edge padding, then linearly interpolated onto
+    ``max(points, n)`` evenly spaced ``x`` values.
+
+    Args:
+        x: Sample positions.
+        y: Sample values, same length as ``x``.
+        window: Requested kernel width; raised to 3 and to the next odd number.
+        points: Minimum number of samples in the returned curve.
+
+    Returns:
+        ``(x_out, y_out)`` arrays of equal length.
+    """
+    xs = np.asarray(x, dtype=float)
+    ys = np.asarray(y, dtype=float)
+    finite = np.isfinite(xs) & np.isfinite(ys)
+    xs, ys = xs[finite], ys[finite]
+    if xs.size == 0:
+        return xs, ys
+
+    ascending = np.argsort(xs)
+    xs, ys = xs[ascending], ys[ascending]
+
+    if np.unique(xs).size != xs.size:
+        # Collapse repeated x positions to the mean of their y values.
+        merged = (
+            pd.DataFrame({"x": xs, "y": ys})
+            .groupby("x", as_index=False)
+            .agg(y=("y", "mean"))
+            .sort_values("x")
+        )
+        xs = merged["x"].to_numpy(dtype=float)
+        ys = merged["y"].to_numpy(dtype=float)
+
+    if xs.size < 3:
+        return xs, ys
+
+    width = int(max(3, window))
+    width |= 1  # force an odd width so the kernel has a centre tap
+    if width > xs.size:
+        # Largest odd width that still fits inside the data.
+        width = xs.size - 1 + xs.size % 2
+    if width < 3:
+        return xs, ys
+
+    half = width // 2
+    taps = np.arange(-half, half + 1, dtype=float)
+    sigma = max(width / 3.0, 1.0)
+    weights = np.exp(-0.5 * (taps / sigma) ** 2)
+    weights = weights / np.sum(weights)
+    # Edge padding keeps the smoothed curve the same length as the input.
+    padded = np.pad(ys, (half, half), mode="edge")
+    smoothed = np.convolve(padded, weights, mode="valid")
+
+    n_points = max(int(points), xs.size)
+    x_dense = np.linspace(float(np.min(xs)), float(np.max(xs)), n_points)
+    y_dense = np.interp(x_dense, xs, smoothed)
+    return x_dense, y_dense
+
+
 def _plot_focus_revenue_by_alpha(alpha_mode: pd.DataFrame, out_path: Path) -> Path:
    fig, ax = plt.subplots(figsize=(7.8, 4.8), constrained_layout=True)
    for mode, color, label in (
@@ -220,7 +332,6 @@ def _plot_focus_revenue_by_alpha(alpha_mode: pd.DataFrame, out_path: Path) -> Pa
            color=color,
            label=label,
        )
-    ax.axvline(0.7, color="#666666", linewidth=1.0, linestyle="--")
    ax.set_xlabel(r"Contamination $\alpha$")
    ax.set_ylabel("Mean episode revenue")
    ax.set_title("Final Cohort Revenue Curves")
@@ -228,6 +339,147 @@ def _plot_focus_revenue_by_alpha(alpha_mode: pd.DataFrame, out_path: Path) -> Pa
    return _save_plot(fig, out_path)


+def _plot_focus_coi_by_alpha(alpha_mode: pd.DataFrame, out_path: Path) -> Path:
+    """Draw baseline and defended mean COI level against contamination alpha.
+
+    Each mode is drawn as a smoothed line through ``coi_level_mean`` with the
+    raw per-alpha points overlaid. Where both modes have a value for the same
+    alpha, the band between the two smoothed curves is shaded and labelled
+    "Gap". The figure is handed to ``_save_plot`` together with ``out_path``
+    and that call's result is returned.
+    """
+    series_styles = (
+        ("baseline", "#4C72B0", "Baseline"),
+        ("defended", "#C44E52", "Defended"),
+    )
+    fig, ax = plt.subplots(figsize=(7.8, 4.8), constrained_layout=True)
+    for mode, color, label in series_styles:
+        rows = alpha_mode[alpha_mode["mode"] == mode].sort_values("alpha")
+        if rows.empty:
+            continue
+        alphas = rows["alpha"].to_numpy(dtype=float)
+        levels = rows["coi_level_mean"].to_numpy(dtype=float)
+        curve_x, curve_y = _smoothed_curve(alphas, levels)
+        # Line first, markers second, so the raw points sit on top of the curve.
+        ax.plot(curve_x, curve_y, linewidth=1.9, color=color, label=label)
+        ax.scatter(
+            alphas,
+            levels,
+            s=18,
+            color=color,
+            edgecolor="#FFFFFF",
+            linewidth=0.45,
+            zorder=3,
+        )
+
+    by_mode = alpha_mode.pivot_table(
+        index="alpha",
+        columns="mode",
+        values="coi_level_mean",
+        aggfunc="mean",
+    ).sort_index()
+    both_modes = {"baseline", "defended"}.issubset(set(by_mode.columns))
+    if both_modes:
+        # Only alphas observed under both modes can bound the shaded gap.
+        by_mode = by_mode.dropna(subset=["baseline", "defended"], how="any")
+    if both_modes and not by_mode.empty:
+        shared_alpha = by_mode.index.to_numpy(dtype=float)
+        baseline_levels = by_mode["baseline"].to_numpy(dtype=float)
+        defended_levels = by_mode["defended"].to_numpy(dtype=float)
+        fill_x, lower = _smoothed_curve(shared_alpha, baseline_levels)
+        _, upper = _smoothed_curve(shared_alpha, defended_levels)
+        ax.fill_between(
+            fill_x,
+            lower,
+            upper,
+            color="#55A868",
+            alpha=0.12,
+            label="Gap",
+        )
+
+    ax.set_xlabel(r"Contamination $\alpha$")
+    ax.set_ylabel("Mean COI level")
+    ax.set_title("Final Cohort COI Curves")
+    ax.legend(loc="lower left")
+    return _save_plot(fig, out_path)
+
+
+def _plot_focus_coi_preservation_grid(
+    coi_preservation: pd.DataFrame, out_path: Path
+) -> Path:
+    """Plot COI preservation per product count as grouped bars, one per alpha.
+
+    The alphas shown are those within 1e-9 of 0.0 and 1.0. If fewer than two
+    of them are present, the smallest and largest available alpha are used
+    instead. Each bar is annotated with its signed value and the y-axis is
+    symmetric around zero.
+
+    Args:
+        coi_preservation: Frame with ``alpha``, ``n_products`` and
+            ``coi_preserved`` columns.
+        out_path: Destination passed on to ``_save_plot``.
+
+    Returns:
+        The result of ``_save_plot`` for the finished figure.
+
+    Raises:
+        ValueError: If the input is empty or has no rows at the chosen alphas.
+    """
+    if coi_preservation.empty:
+        raise ValueError("COI preservation grid requires at least one paired cell")
+
+    alpha_levels = sorted(coi_preservation["alpha"].dropna().unique().tolist())
+    endpoint_targets = (0.0, 1.0)
+    endpoint_levels = [
+        alpha
+        for target in endpoint_targets
+        for alpha in alpha_levels
+        if np.isclose(alpha, target, atol=1e-9)
+    ]
+    if len(endpoint_levels) < 2 and alpha_levels:
+        # The sweep lacks one of the nominal endpoints: use the observed extremes.
+        endpoint_levels = [alpha_levels[0], alpha_levels[-1]]
+    endpoint_levels = sorted(set(endpoint_levels))
+
+    data = coi_preservation[coi_preservation["alpha"].isin(endpoint_levels)].copy()
+    if data.empty:
+        raise ValueError(
+            "COI preservation grid has no rows for selected alpha endpoints"
+        )
+
+    alpha_levels = sorted(data["alpha"].dropna().unique().tolist())
+    product_levels = sorted(data["n_products"].dropna().unique().tolist())
+
+    bars = data.pivot_table(
+        index="n_products",
+        columns="alpha",
+        values="coi_preserved",
+        aggfunc="mean",
+    ).reindex(index=product_levels, columns=alpha_levels)
+
+    x = np.arange(len(product_levels), dtype=float)
+    n_alpha = max(len(alpha_levels), 1)
+    bar_width = min(0.78 / n_alpha, 0.35)
+    # Centre each group of bars on its product-count tick.
+    offsets = (np.arange(n_alpha, dtype=float) - (n_alpha - 1) / 2.0) * bar_width
+    palette = ["#4C72B0", "#C44E52", "#55A868", "#8172B3"]
+
+    fig, ax = plt.subplots(figsize=(7.8, 5.0), constrained_layout=True)
+    for idx, alpha in enumerate(alpha_levels):
+        values = bars[alpha].to_numpy(dtype=float)
+        mask = np.isfinite(values)
+        if not np.any(mask):
+            continue
+        xpos = x[mask] + offsets[idx]
+        v = values[mask]
+        ax.bar(
+            xpos,
+            v,
+            width=bar_width * 0.96,
+            color=palette[idx % len(palette)],
+            label=rf"$\alpha={alpha:.1f}$",
+        )
+        for x_i, y_i in zip(xpos, v):
+            # Put the label just beyond the bar tip, on the side it points to.
+            ax.text(
+                float(x_i),
+                float(y_i) + (0.035 if y_i >= 0 else -0.035),
+                f"{y_i:+.2f}",
+                ha="center",
+                va="bottom" if y_i >= 0 else "top",
+                fontsize=7,
+            )
+
+    valid = bars.to_numpy(dtype=float)
+    valid = valid[np.isfinite(valid)]
+    max_abs = float(np.max(np.abs(valid))) if valid.size else 1.0
+    max_abs = max(max_abs * 1.22, 0.4)
+    ax.set_ylim(-max_abs, max_abs)
+
+    ax.axhline(0.0, color="#444444", linewidth=1.0, linestyle="--")
+    ax.set_xticks(x)
+    ax.set_xticklabels([f"{int(v)}" for v in product_levels])
+    ax.set_xlabel("Product count")
+    ax.set_ylabel("COI preserved (defended minus baseline)")
+    # Name the alphas actually plotted: the fallback may pick levels other
+    # than 0.0 and 1.0, and a hard-coded title would then mislabel the figure.
+    alpha_text = " vs ".join(rf"$\alpha={alpha:.1f}$" for alpha in alpha_levels)
+    ax.set_title(f"COI Preservation by Product Count at {alpha_text}")
+    ax.legend(loc="upper right")
+    ax.grid(axis="y", alpha=0.22)
+    return _save_plot(fig, out_path)
+
+
 def _plot_focus_revenue_delta(alpha_deltas: pd.DataFrame, out_path: Path) -> Path:
    fig, ax = plt.subplots(figsize=(7.8, 4.8), constrained_layout=True)
    x = alpha_deltas["alpha"].to_numpy(dtype=float)
@@ -235,7 +487,6 @@ def _plot_focus_revenue_delta(alpha_deltas: pd.DataFrame, out_path: Path) -> Pat
    ax.plot(x, y, marker="o", linewidth=2.0, markersize=4, color="#C44E52")
    ax.fill_between(x, y, 0.0, color="#C44E52", alpha=0.12)
    ax.axhline(0.0, color="#444444", linewidth=1.0, linestyle="--")
-    ax.axvline(0.7, color="#666666", linewidth=1.0, linestyle="--")
    high = alpha_deltas[alpha_deltas["alpha"] >= 0.7]
    if not high.empty:
        best = high.reindex(
@@ -283,7 +534,6 @@ def _plot_focus_risk_deltas(alpha_deltas: pd.DataFrame, out_path: Path) -> Path:
        label="Volatility delta",
    )
    ax.axhline(0.0, color="#444444", linewidth=1.0, linestyle="--")
-    ax.axvline(0.7, color="#666666", linewidth=1.0, linestyle="--")
    ax.set_xlabel(r"Contamination $\alpha$")
    ax.set_ylabel("Defended minus baseline")
    ax.set_title("Leakage and Stability Deltas (Final Cohort)")
@@ -297,13 +547,21 @@ def _write_include(path: Path, figure_rel_path: str, width: str) -> Path:
    return path


-def run(bundle_dir: Path, output_dir: Path, plot_dir: Path) -> list[Path]:
+def run(
+    bundle_dir: Path,
+    output_dir: Path,
+    plot_dir: Path,
+    focus_sweep_id: str | None = None,
+) -> list[Path]:
    all_runs = _load_runs(bundle_dir)
-    focus_id = _focus_sweep(all_runs)
+    focus_id = str(focus_sweep_id) if focus_sweep_id else _focus_sweep(all_runs)
+    if focus_id not in set(all_runs["sweep_id"].astype(str).unique()):
+        raise ValueError(f"Requested focus sweep_id not found: {focus_id}")
    focus_runs = all_runs[all_runs["sweep_id"] == focus_id].copy()
    alpha_mode = _alpha_mode_summary(focus_runs)
    deltas = _alpha_deltas(alpha_mode)
    zones = _zone_summary(deltas)
+    coi_preservation = _alpha_product_coi_preservation(focus_runs)

    output_dir.mkdir(parents=True, exist_ok=True)
    plot_dir.mkdir(parents=True, exist_ok=True)
@@ -321,9 +579,16 @@ def run(bundle_dir: Path, output_dir: Path, plot_dir: Path) -> list[Path]:
    zones.to_csv(zone_path, index=False)
    written.append(zone_path)

+    coi_grid_path = output_dir / "final_focus_coi_preservation_grid.csv"
+    coi_preservation.to_csv(coi_grid_path, index=False)
+    written.append(coi_grid_path)
+
    headline = {
        "bundle": str(bundle_dir),
        "focus_cohort": "max_alpha_coverage",
+        "focus_sweep_id": focus_id,
+        "focus_run_count": int(len(focus_runs)),
+        "git_commit": _git_commit(),
        "alpha_cells": int(deltas["alpha"].nunique()) if not deltas.empty else 0,
        "alpha_min": float(deltas["alpha"].min()) if not deltas.empty else None,
        "alpha_max": float(deltas["alpha"].max()) if not deltas.empty else None,
@@ -345,6 +610,18 @@ def run(bundle_dir: Path, output_dir: Path, plot_dir: Path) -> list[Path]:
            plot_dir / "final_focus_revenue_by_alpha.pdf",
        )
    )
+    written.append(
+        _plot_focus_coi_by_alpha(
+            alpha_mode,
+            plot_dir / "final_focus_coi_by_alpha.pdf",
+        )
+    )
+    written.append(
+        _plot_focus_coi_preservation_grid(
+            coi_preservation,
+            plot_dir / "final_focus_coi_preservation_grid.pdf",
+        )
+    )
    written.append(
        _plot_focus_revenue_delta(
            deltas,
@@ -358,7 +635,7 @@ def run(bundle_dir: Path, output_dir: Path, plot_dir: Path) -> list[Path]:
        )
    )

-    include_dir = Path(__file__).resolve().parent / "includes" / "final"
+    include_dir = Path(__file__).resolve().parent / "includes"
    written.append(
        _write_include(
            include_dir / "final_focus_revenue_by_alpha.tex",
@@ -366,6 +643,20 @@ def run(bundle_dir: Path, output_dir: Path, plot_dir: Path) -> list[Path]:
            "0.98\\linewidth",
        )
    )
+    written.append(
+        _write_include(
+            include_dir / "final_focus_coi_by_alpha.tex",
+            "chapters/figures/results/generated/final/plots/final_focus_coi_by_alpha.pdf",
+            "0.98\\linewidth",
+        )
+    )
+    written.append(
+        _write_include(
+            include_dir / "final_focus_coi_preservation_grid.tex",
+            "chapters/figures/results/generated/final/plots/final_focus_coi_preservation_grid.pdf",
+            "0.98\\linewidth",
+        )
+    )
    written.append(
        _write_include(
            include_dir / "final_focus_revenue_delta.tex",
@@ -390,6 +681,7 @@ def main() -> None:
    parser.add_argument("--bundle-dir", type=Path, default=_default_bundle_dir())
    parser.add_argument("--output-dir", type=Path, default=_default_output_dir())
    parser.add_argument("--plot-dir", type=Path, default=None)
+    parser.add_argument("--focus-sweep-id", type=str, default=None)
    args = parser.parse_args()

    _configure_style()
@@ -399,7 +691,10 @@ def main() -> None:
        else _default_plot_dir(args.output_dir)
    )
    outputs = run(
-        bundle_dir=args.bundle_dir, output_dir=args.output_dir, plot_dir=plot_dir
+        bundle_dir=args.bundle_dir,
+        output_dir=args.output_dir,
+        plot_dir=plot_dir,
+        focus_sweep_id=args.focus_sweep_id,
    )
    for path in outputs:
        print(path)
--- a/paper/src/chapters/figures/results/process_first_sweep.py
+++ b/paper/src/chapters/figures/results/process_first_sweep.py
@@ -1,272 +0,0 @@
-from __future__ import annotations
-
-import argparse
-import json
-from pathlib import Path
-from typing import Iterable
-
-import numpy as np
-import pandas as pd
-
-
-def _project_root() -> Path:
-    return Path(__file__).resolve().parents[5]
-
-
-def _default_input() -> Path:
-    return _project_root() / "tpu_orchestration" / "results" / "first_sweep.csv"
-
-
-def _default_output_dir() -> Path:
-    return Path(__file__).resolve().parent / "generated" / "legacy"
-
-
-def _sanitize(key: str) -> str:
-    return key.replace("/", "_").replace("-", "_")
-
-
-def _coerce_numeric(frame: pd.DataFrame, columns: Iterable[str]) -> None:
-    for column in columns:
-        if column in frame.columns:
-            frame[column] = pd.to_numeric(frame[column], errors="coerce")
-
-
-def _extract_alpha(frame: pd.DataFrame) -> pd.Series:
-    if "study/alpha" in frame.columns:
-        return pd.to_numeric(frame["study/alpha"], errors="coerce")
-    if "alpha" in frame.columns:
-        return pd.to_numeric(frame["alpha"], errors="coerce")
-    return pd.Series(np.nan, index=frame.index, dtype=float)
-
-
-def _extract_mode(frame: pd.DataFrame) -> pd.Series:
-    if "study/mode" in frame.columns:
-        return frame["study/mode"].astype(str).str.strip().str.lower()
-    if "study/no_robust" in frame.columns:
-        no_robust = pd.to_numeric(frame["study/no_robust"], errors="coerce").fillna(0.0)
-        return pd.Series(
-            np.where(no_robust > 0.5, "no_robust", "robust"),
-            index=frame.index,
-            dtype="object",
-        )
-    if "no_robust" in frame.columns:
-        no_robust = (
-            frame["no_robust"].astype(str).str.lower().isin({"1", "true", "yes"})
-        )
-        return pd.Series(
-            np.where(no_robust, "no_robust", "robust"),
-            index=frame.index,
-            dtype="object",
-        )
-    return pd.Series("", index=frame.index, dtype="object")
-
-
-def _extract_tier(frame: pd.DataFrame) -> pd.Series:
-    for column in ("tiers", "runtime/backend", "algo", "run.backend", "run.algo"):
-        if column in frame.columns:
-            tier = frame[column].astype(str).str.strip().str.lower()
-            if tier.notna().any():
-                return tier
-    return pd.Series("unknown", index=frame.index, dtype="object")
-
-
-def _prepare_frame(frame: pd.DataFrame, include_non_finished: bool) -> pd.DataFrame:
-    data = frame.copy()
-    if not include_non_finished and "State" in data.columns:
-        data = data[data["State"].astype(str).str.lower() == "finished"].copy()
-
-    data["alpha"] = _extract_alpha(data)
-    data["mode"] = _extract_mode(data)
-    data["tier"] = _extract_tier(data)
-    data = data[data["mode"].isin({"robust", "no_robust"})]
-    data = data[data["alpha"].notna()]
-
-    _coerce_numeric(
-        data,
-        [
-            "eval/revenue_mean",
-            "eval/reward_mean",
-            "eval/coi_level_mean",
-            "eval/coi_leakage_mean",
-            "eval/margin_mean",
-            "eval/volatility_mean",
-            "objective/score",
-            "train/alpha_adv",
-            "lambda_coi",
-            "robust_radius",
-            "learning_rate",
-            "batch_size",
-            "n_steps",
-            "total_timesteps",
-        ],
-    )
-    return data.sort_values(["tier", "alpha", "mode"]).reset_index(drop=True)
-
-
-def _group_summary(
-    frame: pd.DataFrame, by: list[str], metrics: list[str]
-) -> pd.DataFrame:
-    agg_spec: dict[str, tuple[str, str]] = {"runs": ("mode", "size")}
-    for metric in metrics:
-        safe = _sanitize(metric)
-        agg_spec[f"{safe}_mean"] = (metric, "mean")
-        agg_spec[f"{safe}_std"] = (metric, "std")
-    return frame.groupby(by, as_index=False).agg(**agg_spec).sort_values(by)
-
-
-def _tier_alpha_deltas(summary: pd.DataFrame, metrics: list[str]) -> pd.DataFrame:
-    rows: list[dict[str, float | str]] = []
-    for (tier, alpha), group in summary.groupby(["tier", "alpha"], sort=True):
-        robust = group[group["mode"] == "robust"]
-        no_robust = group[group["mode"] == "no_robust"]
-        if robust.empty or no_robust.empty:
-            continue
-
-        row: dict[str, float | str] = {
-            "tier": str(tier),
-            "alpha": float(alpha),
-            "runs_robust": float(robust["runs"].iloc[0]),
-            "runs_no_robust": float(no_robust["runs"].iloc[0]),
-        }
-        for metric in metrics:
-            safe = _sanitize(metric)
-            robust_value = float(robust[f"{safe}_mean"].iloc[0])
-            no_robust_value = float(no_robust[f"{safe}_mean"].iloc[0])
-            delta = robust_value - no_robust_value
-            row[f"{safe}_delta"] = delta
-            row[f"{safe}_delta_pct"] = (
-                np.nan if no_robust_value == 0 else 100.0 * delta / no_robust_value
-            )
-        rows.append(row)
-
-    return pd.DataFrame(rows)
-
-
-def _top_runs(frame: pd.DataFrame, n: int) -> pd.DataFrame:
-    rank_metric = "objective/score"
-    if rank_metric not in frame.columns or frame[rank_metric].notna().sum() == 0:
-        rank_metric = "eval/reward_mean"
-
-    keep = [
-        "Name",
-        "tier",
-        "alpha",
-        "mode",
-        rank_metric,
-        "eval/revenue_mean",
-        "eval/reward_mean",
-        "eval/coi_level_mean",
-        "eval/coi_leakage_mean",
-        "lambda_coi",
-        "robust_radius",
-        "learning_rate",
-        "batch_size",
-        "n_steps",
-        "total_timesteps",
-    ]
-    present = [column for column in keep if column in frame.columns]
-    ranked = frame[present].copy().sort_values(rank_metric, ascending=False)
-    return ranked.head(max(1, int(n))).reset_index(drop=True)
-
-
-def _headline_json(
-    frame: pd.DataFrame, tier_mode: pd.DataFrame
-) -> dict[str, float | str]:
-    out: dict[str, float | str] = {
-        "runs": int(len(frame)),
-        "tiers": int(frame["tier"].nunique()),
-        "alphas": int(frame["alpha"].nunique()),
-    }
-
-    robust_rows = tier_mode[tier_mode["mode"] == "robust"]
-    no_robust_rows = tier_mode[tier_mode["mode"] == "no_robust"]
-    if robust_rows.empty or no_robust_rows.empty:
-        out["status"] = "incomplete_modes"
-        return out
-
-    robust_mean = robust_rows["eval_revenue_mean_mean"].mean()
-    no_robust_mean = no_robust_rows["eval_revenue_mean_mean"].mean()
-    out.update(
-        {
-            "status": "ok",
-            "mean_tier_revenue_robust": float(robust_mean),
-            "mean_tier_revenue_no_robust": float(no_robust_mean),
-            "mean_tier_revenue_delta": float(robust_mean - no_robust_mean),
-            "mean_tier_revenue_delta_pct": float(
-                100.0 * (robust_mean - no_robust_mean) / no_robust_mean
-            )
-            if no_robust_mean
-            else np.nan,
-        }
-    )
-    return out
-
-
-def run(
-    input_path: Path, output_dir: Path, include_non_finished: bool, top_n: int
-) -> list[Path]:
-    output_dir.mkdir(parents=True, exist_ok=True)
-    raw = pd.read_csv(input_path)
-    frame = _prepare_frame(raw, include_non_finished=include_non_finished)
-
-    metrics = [
-        metric
-        for metric in (
-            "eval/revenue_mean",
-            "eval/reward_mean",
-            "eval/coi_level_mean",
-            "eval/coi_leakage_mean",
-            "eval/margin_mean",
-            "eval/volatility_mean",
-            "objective/score",
-            "train/alpha_adv",
-        )
-        if metric in frame.columns
-    ]
-
-    tier_mode = _group_summary(frame, ["tier", "mode"], metrics)
-    tier_alpha_mode = _group_summary(frame, ["tier", "alpha", "mode"], metrics)
-    deltas = _tier_alpha_deltas(tier_alpha_mode, metrics)
-    top_configs = _top_runs(frame, n=top_n)
-    headline = _headline_json(frame, tier_mode)
-
-    outputs = {
-        "first_sweep_tier_mode_summary.csv": tier_mode,
-        "first_sweep_tier_alpha_mode_summary.csv": tier_alpha_mode,
-        "first_sweep_tier_alpha_deltas.csv": deltas,
-        "first_sweep_top_configs.csv": top_configs,
-    }
-    written_paths: list[Path] = []
-    for filename, table in outputs.items():
-        path = output_dir / filename
-        table.to_csv(path, index=False)
-        written_paths.append(path)
-
-    headline_path = output_dir / "first_sweep_headline_summary.json"
-    headline_path.write_text(json.dumps(headline, indent=2))
-    written_paths.append(headline_path)
-    return written_paths
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Process first sweep CSV for paper tables"
-    )
-    parser.add_argument("--input", type=Path, default=_default_input())
-    parser.add_argument("--output-dir", type=Path, default=_default_output_dir())
-    parser.add_argument("--include-non-finished", action="store_true")
-    parser.add_argument("--top-n", type=int, default=25)
-    args = parser.parse_args()
-
-    written = run(
-        input_path=args.input,
-        output_dir=args.output_dir,
-        include_non_finished=bool(args.include_non_finished),
-        top_n=int(args.top_n),
-    )
-    for path in written:
-        print(path)
-
-
-if __name__ == "__main__":
-    main()
--- a/paper/src/chapters/figures/results/process_ppo_benchmark.py
+++ b/paper/src/chapters/figures/results/process_ppo_benchmark.py
@@ -1,277 +0,0 @@
-from __future__ import annotations
-
-import argparse
-import json
-from pathlib import Path
-from typing import Iterable
-
-import numpy as np
-import pandas as pd
-
-
-def _project_root() -> Path:
-    return Path(__file__).resolve().parents[5]
-
-
-def _default_input() -> Path:
-    return _project_root() / "tpu_orchestration" / "results" / "ppo_benchmark.csv"
-
-
-def _default_output_dir() -> Path:
-    return Path(__file__).resolve().parent / "generated" / "legacy"
-
-
-def _sanitize(key: str) -> str:
-    return key.replace("/", "_").replace("-", "_")
-
-
-def _coerce_numeric(frame: pd.DataFrame, columns: Iterable[str]) -> None:
-    for column in columns:
-        if column in frame.columns:
-            frame[column] = pd.to_numeric(frame[column], errors="coerce")
-
-
-def _extract_alpha(frame: pd.DataFrame) -> pd.Series:
-    if "study/alpha" in frame.columns:
-        return pd.to_numeric(frame["study/alpha"], errors="coerce")
-    if "alpha" in frame.columns:
-        return pd.to_numeric(frame["alpha"], errors="coerce")
-    return pd.Series(np.nan, index=frame.index, dtype=float)
-
-
-def _extract_mode(frame: pd.DataFrame) -> pd.Series:
-    if "study/mode" in frame.columns:
-        return frame["study/mode"].astype(str).str.strip().str.lower()
-    if "study/no_robust" in frame.columns:
-        no_robust = pd.to_numeric(frame["study/no_robust"], errors="coerce").fillna(0.0)
-        return pd.Series(
-            np.where(no_robust > 0.5, "no_robust", "robust"),
-            index=frame.index,
-            dtype="object",
-        )
-    if "no_robust" in frame.columns:
-        no_robust = (
-            frame["no_robust"].astype(str).str.lower().isin({"1", "true", "yes"})
-        )
-        return pd.Series(
-            np.where(no_robust, "no_robust", "robust"),
-            index=frame.index,
-            dtype="object",
-        )
-    return pd.Series("", index=frame.index, dtype="object")
-
-
-def _prepare_frame(frame: pd.DataFrame, include_non_finished: bool) -> pd.DataFrame:
-    data = frame.copy()
-    if not include_non_finished and "State" in data.columns:
-        data = data[data["State"].astype(str).str.lower() == "finished"].copy()
-
-    data["alpha"] = _extract_alpha(data)
-    data["mode"] = _extract_mode(data)
-    data = data[data["mode"].isin({"robust", "no_robust"})]
-    data = data[data["alpha"].notna()]
-
-    numeric_cols = [
-        "eval/revenue_mean",
-        "eval/reward_mean",
-        "eval/coi_level_mean",
-        "eval/coi_leakage_mean",
-        "eval/volatility_mean",
-        "eval/margin_mean",
-        "train/alpha_adv",
-        "train/coi_penalty",
-        "train/ux_penalty",
-        "train/agent_prob",
-    ]
-    _coerce_numeric(data, numeric_cols)
-    return data.sort_values(["alpha", "mode"]).reset_index(drop=True)
-
-
-def _summary_by_alpha_mode(frame: pd.DataFrame, metrics: list[str]) -> pd.DataFrame:
-    agg_spec: dict[str, tuple[str, str]] = {"runs": ("mode", "size")}
-    for metric in metrics:
-        safe = _sanitize(metric)
-        agg_spec[f"{safe}_mean"] = (metric, "mean")
-        agg_spec[f"{safe}_std"] = (metric, "std")
-
-    return (
-        frame.groupby(["alpha", "mode"], as_index=False)
-        .agg(**agg_spec)
-        .sort_values(["alpha", "mode"])
-        .reset_index(drop=True)
-    )
-
-
-def _delta_by_alpha(summary: pd.DataFrame, metrics: list[str]) -> pd.DataFrame:
-    rows: list[dict[str, float]] = []
-    for alpha, alpha_group in summary.groupby("alpha", sort=True):
-        robust = alpha_group[alpha_group["mode"] == "robust"]
-        no_robust = alpha_group[alpha_group["mode"] == "no_robust"]
-        if robust.empty or no_robust.empty:
-            continue
-
-        row: dict[str, float] = {
-            "alpha": float(alpha),
-            "runs_robust": float(robust["runs"].iloc[0]),
-            "runs_no_robust": float(no_robust["runs"].iloc[0]),
-        }
-        for metric in metrics:
-            safe = _sanitize(metric)
-            robust_value = float(robust[f"{safe}_mean"].iloc[0])
-            no_robust_value = float(no_robust[f"{safe}_mean"].iloc[0])
-            delta = robust_value - no_robust_value
-            row[f"{safe}_robust"] = robust_value
-            row[f"{safe}_no_robust"] = no_robust_value
-            row[f"{safe}_delta"] = delta
-            row[f"{safe}_delta_pct"] = (
-                np.nan if no_robust_value == 0 else 100.0 * delta / no_robust_value
-            )
-        rows.append(row)
-
-    return pd.DataFrame(rows)
-
-
-def _pairwise_win_rates(frame: pd.DataFrame) -> pd.DataFrame:
-    rules = {
-        "eval/revenue_mean": "higher",
-        "eval/reward_mean": "higher",
-        "eval/coi_leakage_mean": "lower",
-        "eval/volatility_mean": "lower",
-    }
-    rows: list[dict[str, float]] = []
-    for alpha, alpha_group in frame.groupby("alpha", sort=True):
-        robust = alpha_group[alpha_group["mode"] == "robust"]
-        no_robust = alpha_group[alpha_group["mode"] == "no_robust"]
-        if robust.empty or no_robust.empty:
-            continue
-
-        for metric, direction in rules.items():
-            if metric not in frame.columns:
-                continue
-            robust_values = robust[metric].dropna().to_numpy(dtype=float)
-            no_robust_values = no_robust[metric].dropna().to_numpy(dtype=float)
-            if robust_values.size == 0 or no_robust_values.size == 0:
-                continue
-
-            if direction == "higher":
-                wins = (robust_values[:, None] > no_robust_values[None, :]).sum()
-            else:
-                wins = (robust_values[:, None] < no_robust_values[None, :]).sum()
-            ties = (robust_values[:, None] == no_robust_values[None, :]).sum()
-            total = robust_values.size * no_robust_values.size
-            win_prob = (wins + 0.5 * ties) / total
-            rows.append(
-                {
-                    "alpha": float(alpha),
-                    "metric": metric,
-                    "direction": direction,
-                    "wins": int(wins),
-                    "ties": int(ties),
-                    "total_pairs": int(total),
-                    "win_probability": float(win_prob),
-                }
-            )
-    return pd.DataFrame(rows)
-
-
-def _overall_mode_summary(frame: pd.DataFrame, metrics: list[str]) -> pd.DataFrame:
-    agg_spec: dict[str, tuple[str, str]] = {"runs": ("mode", "size")}
-    for metric in metrics:
-        safe = _sanitize(metric)
-        agg_spec[f"{safe}_mean"] = (metric, "mean")
-        agg_spec[f"{safe}_std"] = (metric, "std")
-    return frame.groupby("mode", as_index=False).agg(**agg_spec).sort_values("mode")
-
-
-def _headline_json(overall: pd.DataFrame) -> dict[str, float | str]:
-    if {"robust", "no_robust"} - set(overall["mode"].tolist()):
-        return {"status": "incomplete_modes"}
-
-    robust = overall[overall["mode"] == "robust"].iloc[0]
-    no_robust = overall[overall["mode"] == "no_robust"].iloc[0]
-
-    revenue_delta = float(
-        robust["eval_revenue_mean_mean"] - no_robust["eval_revenue_mean_mean"]
-    )
-    leakage_delta = float(
-        robust["eval_coi_leakage_mean_mean"] - no_robust["eval_coi_leakage_mean_mean"]
-    )
-    return {
-        "status": "ok",
-        "revenue_delta": revenue_delta,
-        "revenue_delta_pct": float(
-            100.0 * revenue_delta / no_robust["eval_revenue_mean_mean"]
-        ),
-        "coi_leakage_delta": leakage_delta,
-        "coi_leakage_delta_pct": float(
-            100.0 * leakage_delta / no_robust["eval_coi_leakage_mean_mean"]
-        ),
-    }
-
-
-def run(input_path: Path, output_dir: Path, include_non_finished: bool) -> list[Path]:
-    output_dir.mkdir(parents=True, exist_ok=True)
-    raw = pd.read_csv(input_path)
-    frame = _prepare_frame(raw, include_non_finished=include_non_finished)
-
-    metrics = [
-        metric
-        for metric in (
-            "eval/revenue_mean",
-            "eval/reward_mean",
-            "eval/coi_level_mean",
-            "eval/coi_leakage_mean",
-            "eval/volatility_mean",
-            "eval/margin_mean",
-            "train/alpha_adv",
-            "train/coi_penalty",
-            "train/ux_penalty",
-            "train/agent_prob",
-        )
-        if metric in frame.columns
-    ]
-
-    alpha_mode = _summary_by_alpha_mode(frame, metrics)
-    deltas = _delta_by_alpha(alpha_mode, metrics)
-    win_rates = _pairwise_win_rates(frame)
-    overall = _overall_mode_summary(frame, metrics)
-    headline = _headline_json(overall)
-
-    outputs = {
-        "ppo_alpha_mode_summary.csv": alpha_mode,
-        "ppo_alpha_deltas.csv": deltas,
-        "ppo_pairwise_win_rates.csv": win_rates,
-        "ppo_overall_mode_summary.csv": overall,
-    }
-    written_paths: list[Path] = []
-    for filename, table in outputs.items():
-        path = output_dir / filename
-        table.to_csv(path, index=False)
-        written_paths.append(path)
-
-    headline_path = output_dir / "ppo_headline_summary.json"
-    headline_path.write_text(json.dumps(headline, indent=2))
-    written_paths.append(headline_path)
-    return written_paths
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Process PPO benchmark CSV for paper tables"
-    )
-    parser.add_argument("--input", type=Path, default=_default_input())
-    parser.add_argument("--output-dir", type=Path, default=_default_output_dir())
-    parser.add_argument("--include-non-finished", action="store_true")
-    args = parser.parse_args()
-
-    written = run(
-        input_path=args.input,
-        output_dir=args.output_dir,
-        include_non_finished=bool(args.include_non_finished),
-    )
-    for path in written:
-        print(path)
-
-
-if __name__ == "__main__":
-    main()
--- a/paper/src/chapters/figures/results/revenue_alpha_analysis.py
+++ b/paper/src/chapters/figures/results/revenue_alpha_analysis.py
@@ -0,0 +1,454 @@
+from __future__ import annotations
+
+import argparse
+import json
+import subprocess
+from pathlib import Path
+from typing import Iterable
+
+import numpy as np
+import pandas as pd
+from scipy import stats
+
+
+def _project_root() -> Path:
+    return Path(__file__).resolve().parents[5]
+
+
+def _default_bundle_dir() -> Path:
+    base = _project_root() / "engine" / "studies" / "results" / "wandb_sweep_bundles"
+    bundles = sorted(
+        [path for path in base.glob("bundle_*") if path.is_dir()],
+        key=lambda path: path.stat().st_mtime,
+        reverse=True,
+    )
+    if not bundles:
+        raise FileNotFoundError(f"No sweep bundle directories found in {base}")
+    return bundles[0]
+
+
+def _bundle_dir_from_id(bundle_id: str) -> Path:
+    token = str(bundle_id).strip()
+    name = token if token.startswith("bundle_") else f"bundle_{token}"
+    path = (
+        _project_root()
+        / "engine"
+        / "studies"
+        / "results"
+        / "wandb_sweep_bundles"
+        / name
+    )
+    if not path.exists():
+        raise FileNotFoundError(f"Bundle not found: {path}")
+    return path
+
+
+def _default_output_dir() -> Path:
+    return Path(__file__).resolve().parent / "generated" / "final"
+
+
+def _truthy(value: object) -> bool:
+    if isinstance(value, bool):
+        return value
+    if value is None:
+        return False
+    return str(value).strip().lower() in {"1", "true", "yes", "on"}
+
+
+def _mode_of(row: pd.Series) -> str:  # classify one run row as "baseline" or "defended"
+    mode_hint = str(row.get("study_mode", "")).strip().lower()
+    if mode_hint in {"baseline", "no_robust"}:  # an explicit study_mode label is checked first
+        return "baseline"
+    if mode_hint in {"defended", "robust"}:
+        return "defended"
+    if _truthy(row.get("baseline_mode")) or _truthy(row.get("no_robust")):
+        return "baseline"  # fallback: either boolean-like flag column marks a baseline run
+    return "defended"  # default when neither the label nor the flags say baseline
+
+
+def _coerce_numeric(frame: pd.DataFrame, columns: Iterable[str]) -> None:
+    for column in columns:
+        if column in frame.columns:
+            frame[column] = pd.to_numeric(frame[column], errors="coerce")
+
+
+def _load_runs(bundle_dir: Path) -> pd.DataFrame:
+    path = bundle_dir / "runs_finished.csv"
+    if not path.exists():
+        raise FileNotFoundError(f"Missing required file: {path}")
+    frame = pd.read_csv(path)
+    frame["mode"] = frame.apply(_mode_of, axis=1)
+    _coerce_numeric(
+        frame,
+        [
+            "alpha",
+            "n_products",
+            "eta_ux",
+            "lambda_coi",
+            "eval_revenue_mean",
+        ],
+    )
+    frame = frame[frame["mode"].isin({"baseline", "defended"})].copy()
+    return frame
+
+
+def _get_git_commit() -> str:
+    try:
+        result = subprocess.run(
+            ["git", "rev-parse", "HEAD"],
+            check=True,
+            text=True,
+            capture_output=True,
+            cwd=_project_root(),
+        )
+    except Exception:
+        return "unknown"
+    return result.stdout.strip()
+
+
+def _apply_filters(frame: pd.DataFrame, args: argparse.Namespace) -> pd.DataFrame:
+    data = frame.copy()
+    if args.sweep_id:
+        allowed = {str(value) for value in args.sweep_id}
+        data = data[data["sweep_id"].astype(str).isin(allowed)]
+    if args.mode != "all":
+        data = data[data["mode"] == args.mode]
+    if args.n_products is not None:
+        data = data[data["n_products"] == float(args.n_products)]
+    if args.eta_ux is not None:
+        data = data[data["eta_ux"] == float(args.eta_ux)]
+    if args.lambda_coi is not None:
+        data = data[data["lambda_coi"] == float(args.lambda_coi)]
+    data = data[data["alpha"].notna() & data["eval_revenue_mean"].notna()]
+    data = data[data["alpha"] >= float(args.alpha_min)]
+    data = data[data["alpha"] <= float(args.alpha_max)]
+    return data.reset_index(drop=True)
+
+
+def _design_matrix(
+    frame: pd.DataFrame,
+    *,
+    include_sweep_fixed_effects: bool,
+) -> tuple[np.ndarray, np.ndarray, list[str]]:  # returns (X, y, names of X's columns)
+    y = frame["eval_revenue_mean"].to_numpy(dtype=float)  # response variable
+    x_alpha = frame["alpha"].to_numpy(dtype=float)  # regressor of interest
+    columns = ["intercept", "alpha"]
+    blocks = [np.ones_like(x_alpha), x_alpha]  # one row per regressor; transposed below
+    if include_sweep_fixed_effects:
+        dummies = pd.get_dummies(
+            frame["sweep_id"].astype(str), prefix="sweep", drop_first=True
+        )  # drop_first omits one sweep level so the dummies are not collinear with the intercept
+        if not dummies.empty:  # nothing to add when no dummy column remains, e.g. a single sweep
+            blocks.append(dummies.to_numpy(dtype=float).T)  # transposed to (n_dummies, n_rows)
+            columns.extend(dummies.columns.tolist())
+    X = np.vstack(blocks).T  # final shape (n_rows, n_columns)
+    return X, y, columns
+
+
+def _covariance_hc1(X: np.ndarray, residuals: np.ndarray) -> np.ndarray:  # robust sandwich covariance
+    n, k = X.shape  # n observations, k regressors
+    xtx_inv = np.linalg.pinv(X.T @ X)  # pseudo-inverse tolerates a rank-deficient X'X
+    xr = X * residuals[:, None]  # each row of X scaled by its own residual
+    meat = xr.T @ xr  # sum over rows of u_i^2 * x_i x_i'
+    scale = float(n) / max(n - k, 1)  # n / (n - k) correction, denominator floored at 1
+    return scale * (xtx_inv @ meat @ xtx_inv)  # bread @ meat @ bread
+
+
+def _covariance_cluster(
+    X: np.ndarray, residuals: np.ndarray, groups: pd.Series
+) -> tuple[np.ndarray, int]:
+    xtx_inv = np.linalg.pinv(X.T @ X)
+    unique = pd.Series(groups).astype(str).dropna().unique().tolist()
+    g = len(unique)
+    n, k = X.shape
+    if g <= 1:
+        return _covariance_hc1(X, residuals), g
+    meat = np.zeros((k, k), dtype=float)
+    for value in unique:
+        mask = pd.Series(groups).astype(str).to_numpy() == value
+        Xg = X[mask]
+        ug = residuals[mask]
+        xu = Xg.T @ ug
+        meat += np.outer(xu, xu)
+    c = (g / (g - 1.0)) * ((n - 1.0) / max(n - k, 1.0))
+    return c * (xtx_inv @ meat @ xtx_inv), g
+
+
+def _fit_ols(
+    X: np.ndarray,
+    y: np.ndarray,
+    columns: list[str],
+    *,
+    cov_type: str,
+    groups: pd.Series | None = None,
+) -> dict[str, object]:
+    n, k = X.shape
+    beta, _, _, _ = np.linalg.lstsq(X, y, rcond=None)
+    fitted = X @ beta
+    residuals = y - fitted
+    dof = max(n - k, 1)
+    sse = float(np.sum(residuals**2))
+    y_centered = y - float(np.mean(y))
+    sst = float(np.sum(y_centered**2))
+    r2 = float(1.0 - sse / sst) if sst > 0 else 0.0
+    adj_r2 = float(1.0 - (1.0 - r2) * ((n - 1.0) / max(n - k, 1.0)))
+
+    if cov_type == "iid":
+        sigma2 = sse / dof
+        cov = sigma2 * np.linalg.pinv(X.T @ X)
+        df_t = dof
+        clusters = None
+    elif cov_type == "hc1":
+        cov = _covariance_hc1(X, residuals)
+        df_t = dof
+        clusters = None
+    elif cov_type == "cluster":
+        if groups is None:
+            raise ValueError("groups are required when cov_type='cluster'")
+        cov, clusters = _covariance_cluster(X, residuals, groups)
+        df_t = max(clusters - 1, 1)
+    else:
+        raise ValueError(f"Unsupported cov_type: {cov_type}")
+
+    se = np.sqrt(np.clip(np.diag(cov), 0.0, np.inf))
+    t_stats = np.divide(beta, se, out=np.zeros_like(beta), where=se > 0)
+    p_values = 2.0 * (1.0 - stats.t.cdf(np.abs(t_stats), df=df_t))
+    t_crit = float(stats.t.ppf(0.975, df=df_t))
+    ci_low = beta - t_crit * se
+    ci_high = beta + t_crit * se
+
+    coef_rows = []
+    for idx, name in enumerate(columns):
+        coef_rows.append(
+            {
+                "name": name,
+                "coef": float(beta[idx]),
+                "std_error": float(se[idx]),
+                "t_stat": float(t_stats[idx]),
+                "p_value": float(p_values[idx]),
+                "ci95_low": float(ci_low[idx]),
+                "ci95_high": float(ci_high[idx]),
+            }
+        )
+
+    return {
+        "n": int(n),
+        "k": int(k),
+        "dof": int(dof),
+        "df_t": int(df_t),
+        "cov_type": cov_type,
+        "clusters": int(clusters) if clusters is not None else None,
+        "r2": r2,
+        "adj_r2": adj_r2,
+        "sse": sse,
+        "coefficients": coef_rows,
+        "residuals": residuals,
+        "fitted": fitted,
+        "beta": beta,
+    }
+
+
+def _diagnostics(
+    X: np.ndarray, y: np.ndarray, fit: dict[str, object]
+) -> dict[str, object]:
+    residuals = np.asarray(fit["residuals"], dtype=float)
+    n, k = X.shape
+    if residuals.size < 8:
+        normality = {"test": "jarque_bera", "available": False}
+    else:
+        jb_stat, jb_p = stats.jarque_bera(residuals)
+        normality = {
+            "test": "jarque_bera",
+            "available": True,
+            "statistic": float(jb_stat),
+            "p_value": float(jb_p),
+        }
+
+    if k <= 1:
+        hetero = {"test": "breusch_pagan", "available": False}
+    else:
+        u2 = residuals**2
+        aux = _fit_ols(X, u2, [f"x{i}" for i in range(k)], cov_type="iid")
+        lm = float(len(u2) * float(aux["r2"]))
+        df_bp = k - 1
+        p_bp = float(1.0 - stats.chi2.cdf(lm, df_bp))
+        hetero = {
+            "test": "breusch_pagan",
+            "available": True,
+            "lm_stat": lm,
+            "df": int(df_bp),
+            "p_value": p_bp,
+        }
+
+    xtx_inv = np.linalg.pinv(X.T @ X)
+    leverages = np.sum((X @ xtx_inv) * X, axis=1)
+    mse = float(np.sum(residuals**2) / max(n - k, 1))
+    if mse <= 0:
+        cooks = np.zeros(n, dtype=float)
+    else:
+        denom = np.clip((1.0 - leverages) ** 2, 1e-10, np.inf)
+        cooks = ((residuals**2) / (k * mse)) * (leverages / denom)
+
+    return {
+        "normality": normality,
+        "heteroskedasticity": hetero,
+        "influence": {
+            "max_leverage": float(np.max(leverages)) if leverages.size else 0.0,
+            "mean_leverage": float(np.mean(leverages)) if leverages.size else 0.0,
+            "high_leverage_threshold": float(2.0 * k / max(n, 1)),
+            "high_leverage_count": int(np.sum(leverages > (2.0 * k / max(n, 1)))),
+            "max_cooks_distance": float(np.max(cooks)) if cooks.size else 0.0,
+            "high_cooks_threshold": float(4.0 / max(n, 1)),
+            "high_cooks_count": int(np.sum(cooks > (4.0 / max(n, 1)))),
+        },
+    }
+
+
+def run(args: argparse.Namespace) -> list[Path]:
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    runs = _load_runs(Path(args.bundle_dir))
+    filtered = _apply_filters(runs, args)
+    if len(filtered) < 3:
+        raise ValueError("Filtered cohort must contain at least 3 rows")
+    if filtered["alpha"].nunique() < 2:
+        raise ValueError("Filtered cohort must contain at least 2 unique alpha values")
+
+    filtered_csv = output_dir / "revenue_alpha_filtered.csv"
+    filtered.to_csv(filtered_csv, index=False)
+
+    sample_accounting = {
+        "bundle_dir": str(Path(args.bundle_dir)),
+        "git_commit": _get_git_commit(),
+        "cohort_name": str(args.cohort_name),
+        "filters": {
+            "sweep_id": args.sweep_id,
+            "mode": args.mode,
+            "n_products": args.n_products,
+            "eta_ux": args.eta_ux,
+            "lambda_coi": args.lambda_coi,
+            "alpha_min": args.alpha_min,
+            "alpha_max": args.alpha_max,
+        },
+        "n_rows": int(len(filtered)),
+        "n_sweeps": int(filtered["sweep_id"].nunique()),
+        "alpha_unique": sorted(
+            float(v) for v in filtered["alpha"].dropna().unique().tolist()
+        ),
+        "rows_by_sweep": filtered.groupby("sweep_id").size().astype(int).to_dict(),
+        "rows_by_mode": filtered.groupby("mode").size().astype(int).to_dict(),
+    }
+    sample_path = output_dir / "revenue_alpha_sample_accounting.json"
+    sample_path.write_text(json.dumps(sample_accounting, indent=2) + "\n")
+
+    X_simple, y, cols_simple = _design_matrix(
+        filtered, include_sweep_fixed_effects=False
+    )
+    fit_simple = _fit_ols(X_simple, y, cols_simple, cov_type="iid")
+    simple_path = output_dir / "revenue_alpha_simple_ols.json"
+    simple_path.write_text(
+        json.dumps(
+            {
+                k: v
+                for k, v in fit_simple.items()
+                if k not in {"residuals", "fitted", "beta"}
+            },
+            indent=2,
+        )
+        + "\n"
+    )
+
+    X_fe, y_fe, cols_fe = _design_matrix(filtered, include_sweep_fixed_effects=True)
+    cov_type = "cluster" if filtered["sweep_id"].nunique() > 1 else "hc1"
+    fit_fe = _fit_ols(
+        X_fe, y_fe, cols_fe, cov_type=cov_type, groups=filtered["sweep_id"]
+    )
+    fe_path = output_dir / "revenue_alpha_fixed_effects.json"
+    fe_path.write_text(
+        json.dumps(
+            {
+                k: v
+                for k, v in fit_fe.items()
+                if k not in {"residuals", "fitted", "beta"}
+            },
+            indent=2,
+        )
+        + "\n"
+    )
+
+    per_sweep_rows: list[dict[str, float | str | int]] = []
+    for sweep_id, group in filtered.groupby("sweep_id"):
+        if len(group) < 3 or group["alpha"].nunique() < 2:
+            continue
+        X_sw, y_sw, cols_sw = _design_matrix(group, include_sweep_fixed_effects=False)
+        fit_sw = _fit_ols(X_sw, y_sw, cols_sw, cov_type="hc1")
+        alpha_row = next(
+            row for row in fit_sw["coefficients"] if row["name"] == "alpha"
+        )
+        per_sweep_rows.append(
+            {
+                "sweep_id": str(sweep_id),
+                "n": int(fit_sw["n"]),
+                "alpha_coef": float(alpha_row["coef"]),
+                "alpha_std_error": float(alpha_row["std_error"]),
+                "alpha_t_stat": float(alpha_row["t_stat"]),
+                "alpha_p_value": float(alpha_row["p_value"]),
+                "alpha_ci95_low": float(alpha_row["ci95_low"]),
+                "alpha_ci95_high": float(alpha_row["ci95_high"]),
+                "r2": float(fit_sw["r2"]),
+            }
+        )
+    per_sweep_frame = pd.DataFrame(per_sweep_rows)
+    if not per_sweep_frame.empty:
+        per_sweep_frame = per_sweep_frame.sort_values("sweep_id").reset_index(drop=True)
+    per_sweep_path = output_dir / "revenue_alpha_per_sweep.csv"
+    per_sweep_frame.to_csv(per_sweep_path, index=False)
+
+    fit_for_diagnostics = fit_fe if cov_type == "cluster" else fit_simple
+    X_for_diagnostics = X_fe if cov_type == "cluster" else X_simple
+    diagnostics = _diagnostics(X_for_diagnostics, y, fit_for_diagnostics)
+    diagnostics_path = output_dir / "revenue_alpha_diagnostics.json"
+    diagnostics_path.write_text(json.dumps(diagnostics, indent=2) + "\n")
+
+    return [
+        filtered_csv,
+        sample_path,
+        simple_path,
+        fe_path,
+        per_sweep_path,
+        diagnostics_path,
+    ]
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Reproducible contamination-vs-revenue analysis from a sweep bundle"
+    )
+    parser.add_argument("--bundle-dir", type=Path, default=None)
+    parser.add_argument("--bundle-id", type=str, default=None)
+    parser.add_argument("--output-dir", type=Path, default=_default_output_dir())
+    parser.add_argument("--cohort-name", type=str, default="custom")
+    parser.add_argument("--sweep-id", action="append", default=[])
+    parser.add_argument(
+        "--mode", choices=["all", "baseline", "defended"], default="all"
+    )
+    parser.add_argument("--n-products", type=float, default=None)
+    parser.add_argument("--eta-ux", type=float, default=None)
+    parser.add_argument("--lambda-coi", type=float, default=None)
+    parser.add_argument("--alpha-min", type=float, default=0.0)
+    parser.add_argument("--alpha-max", type=float, default=1.0)
+    args = parser.parse_args()
+
+    if args.bundle_id:
+        args.bundle_dir = _bundle_dir_from_id(args.bundle_id)
+    elif args.bundle_dir is None:
+        args.bundle_dir = _default_bundle_dir()
+
+    outputs = run(args)
+    for path in outputs:
+        print(path)
+
+
+if __name__ == "__main__":
+    main()
--- a/paper/src/chapters/figures/sigmoid_softmax_gap.tex
+++ b/paper/src/chapters/figures/sigmoid_softmax_gap.tex
@@ -0,0 +1,38 @@
+\begin{tikzpicture}
+\begin{axis}[
+  width=8.8cm,
+  height=5.2cm,
+  xmin=-4.6,
+  xmax=4.6,
+  ymin=-0.02,
+  ymax=1.06,
+  axis lines=left,
+  xlabel={$\Delta_H - \Delta_A$},
+  xlabel style={yshift=-1.5pt},
+  ylabel={$f(\tau')$},
+  xtick={-4,-2,0,2,4},
+  ytick={0,0.5,1},
+  tick label style={font=\small},
+  label style={font=\small},
+  line width=0.6pt,
+  clip=false,
+  enlarge x limits=false,
+]
+\addplot[
+  thick,
+  domain=-4.6:4.6,
+  samples=201,
+  smooth,
+] {1/(1+exp(-x))};
+\draw[dashed, line width=0.45pt, black!38]
+  (axis cs:-2.15,0) -- (axis cs:-2.15,{1/(1+exp(2.15))});
+\draw[dashed, line width=0.45pt, black!38]
+  (axis cs:2.15,0) -- (axis cs:2.15,{1/(1+exp(-2.15))});
+\addplot[only marks, mark=*, mark size=2.2pt, forget plot, draw=black!55, fill=black!55]
+  coordinates {(-2.15, {1/(1+exp(2.15))})};
+\addplot[only marks, mark=*, mark size=2.2pt, forget plot, draw=black, fill=black]
+  coordinates {(2.15, {1/(1+exp(-2.15))})};
+\node[font=\footnotesize, anchor=south, inner sep=11pt] at (axis cs:-2.15,{1/(1+exp(2.15))}) {$\Delta_H<\Delta_A$};
+\node[font=\footnotesize, anchor=south, inner sep=6pt] at (axis cs:2.15,{1/(1+exp(-2.15))}) {$\Delta_H>\Delta_A$};
+\end{axis}
+\end{tikzpicture}
--- a/paper/src/chapters/hero_architecture_figure.tex
+++ b/paper/src/chapters/hero_architecture_figure.tex
@@ -20,11 +20,10 @@
    bA/.style={rectangle, rounded corners=3pt, draw=heroAmberBorder, fill=heroAmber,
               line width=0.9pt, align=center, minimum height=0.85cm},
    bY/.style={rectangle, rounded corners=3pt, draw=heroGrayBorder,  fill=heroGray,
-               line width=0.9pt, align=center, minimum height=0.82cm},
+               line width=0.9pt, align=center, minimum height=0.85cm},
    pill/.style={ellipse, draw=black!50, fill=black!4, line width=0.75pt,
                 align=center, minimum width=1.6cm, minimum height=0.68cm},
    arr/.style={->, draw=black!80, line width=0.88pt},
-    bidir/.style={<->, draw=black!80, line width=0.88pt},
    darr/.style={->, draw=black!60, line width=0.80pt, densely dashed},
    crossA/.style={->, draw=heroAmberBorder!90!black, line width=1.15pt, dash pattern=on 3.5pt off 2pt},
    crossG/.style={->, draw=heroGreenBorder!90!black, line width=1.15pt, dash pattern=on 3.5pt off 2pt},
@@ -55,7 +54,8 @@
 \draw[arr]   (human.east)    -- (web.west);
 \draw[arr]   (agent.east)    -- (web.west);
 \draw[arr]   (web.east)      -- (provider.west);
-\draw[bidir] (provider.east) -- (redis.west);
+% single arrow: bidir on a short edge stacks two tips and reads as a messy cross
+\draw[arr] (provider.east) -- (redis.west);

 % web/provider -> kafka
 \draw[arr] (web.south)      -- (kBehav.north)
@@ -63,9 +63,9 @@
 \draw[arr] (provider.south) -- (kQuotes.north)
    node[midway, right, lbl] {$(i,p,\mathrm{sid},\phi,t)$};

-% kafka -> worker (straight south)
-\draw[arr] (kBehav.south)  -- (worker.north);
-\draw[arr] (kQuotes.south) -- (worker.north);
+% kafka -> worker: behavior stream vertical; price quotes L-shaped so both meet worker without a diagonal across the panel
+\draw[arr] (kBehav.south) -- (worker.north);
+\draw[arr, rounded corners=3pt] (kQuotes.south) -- (7.5, 5.35) -| (worker.north);

 % worker -> registry
 \draw[arr] (worker.east) -- (registry.west);
@@ -79,36 +79,37 @@
    -- (provider.north);

 %% ============================================================
-%%  Panel B   x: 11.6–20.4    y: 2.2–10.0
+%%  Panel B   x: 11.6–20.0    y: 2.2–10.0
 %% ============================================================
-\draw[panel] (11.6,2.2) rectangle (19.8,10.0);
+\draw[panel] (11.6,2.2) rectangle (20.0,10.0);
 \node[anchor=west, font=\small\bfseries] at (11.85,9.72) {(b) Distinguishability layer};

-\node[bG, minimum width=2.4cm]  (session) at (14.0, 8.9)  {Session prefix\\$\tau'$};
-\node[bB, minimum width=2.4cm]  (empKern) at (13.65,7.45) {Empirical kernel\\$\hat T'$};
-\node[bY, minimum width=2.4cm]  (weakLab) at (17.55,8.9) {Weak labels\\$\mathcal{D}_H,\mathcal{D}_A$};
-\node[bY, minimum width=2.2cm]  (protoH)  at (12.8, 5.9)  {Prototype\\$\bar T_H$};
-\node[bA, minimum width=2.4cm]  (kldist)  at (15.55,5.9)  {KL distances\\$\Delta_H,\Delta_A$};
-\node[bY, minimum width=2.2cm]  (protoA)  at (18.3, 5.9)  {Prototype\\$\bar T_A$};
-\node[bB, minimum width=2.9cm]  (calHead) at (13.55,4.25) {Contrastive\\calibration head};
-\node[bG, minimum width=2.55cm] (score)   at (17.75,4.25) {Session score\\$f(\tau'),\hat\alpha(\tau')$};
+% x positions shifted +0.3 from the original layout (between left-heavy and +0.55 which hugged the right edge)
+\node[bG, minimum width=2.4cm]  (session) at (14.3, 8.9)  {Session prefix\\$\tau'$};
+\node[bB, minimum width=2.4cm]  (empKern) at (13.95,7.45) {Empirical kernel\\$\hat T'$};
+\node[bY, minimum width=2.4cm]  (weakLab) at (17.85,8.9) {Weak labels\\$\mathcal{D}_H,\mathcal{D}_A$};
+\node[bY, minimum width=2.2cm]  (protoH)  at (13.1, 5.9)  {Prototype\\$\bar T_H$};
+\node[bA, minimum width=2.4cm]  (kldist)  at (15.85,5.9)  {KL distances\\$\Delta_H,\Delta_A$};
+\node[bY, minimum width=2.2cm]  (protoA)  at (18.6, 5.9)  {Prototype\\$\bar T_A$};
+\node[bB, minimum width=2.9cm]  (calHead) at (13.85,4.25) {Contrastive\\calibration head};
+\node[bG, minimum width=2.55cm] (score)   at (18.05,4.25) {Session score\\$f(\tau'),\hat\alpha(\tau')$};

-\node[lbl] at (15.55, 3.15) {$\hat\alpha(\tau')=\sigma\!\left(\beta(\Delta_H-\Delta_A)\right)$};
+\node[lbl] at (15.85, 3.15) {$\hat\alpha(\tau')=\sigma\!\left(\beta(\Delta_H-\Delta_A)\right)$};

 \draw[arr, rounded corners=4pt] (session.south)  -- (empKern.north);
-\draw[arr, rounded corners=4pt] (empKern.south) -- (13.65, 6.8) -| (protoH.north);
-\draw[arr, rounded corners=4pt] (weakLab.south) -- (17.55, 6.8) -| (protoA.north);
+\draw[arr, rounded corners=4pt] (empKern.south) -- (13.95, 6.8) -| (protoH.north);
+\draw[arr, rounded corners=4pt] (weakLab.south) -- (17.85, 6.8) -| (protoA.north);
 % weak labels -> protoH: go south then hard-left below weakLab
-\draw[arr, rounded corners=4pt] (weakLab.south) -- (17.55,6.8) -| (protoH.north east);
+\draw[arr, rounded corners=4pt] (weakLab.south) -- (17.85,6.8) -| (protoH.north);
 \draw[arr] (protoH.east)    -- (kldist.west);
 \draw[arr] (protoA.west)    -- (kldist.east);
-\draw[arr] (kldist.south)   -- (calHead.north east);
+\draw[arr, rounded corners=4pt] (kldist.south)  -- (calHead.north);
 \draw[arr] (calHead.east)   -- (score.west);

 %% ============================================================
-%%  Panel C   x: 20.8–31.0    y: 2.2–10.0
+%%  Panel C   x: 20.4–31.0    y: 2.2–10.0
 %% ============================================================
-\draw[panel] (20.8,2.2) rectangle (31.0,10.0);
+\draw[panel] (20.4,2.2) rectangle (31.0,10.0);
 \node[anchor=west, font=\small\bfseries] at (21.05,9.72) {(c) Distributionally robust control};

 \node[bB, minimum width=3.1cm] (state)    at (23.15, 8.9)
@@ -129,13 +130,13 @@
 \draw[arr, rounded corners=4pt] (ambSet.south) -- (23.15, 6.6) -| ([xshift=-2cm]contScen.north);
 \draw[arr, rounded corners=4pt] (innerMin.south) -- (28.55, 6.6) -| ([xshift=2cm]contScen.north);
 \draw[arr] (contScen.south)   -- (reward.north);
-\draw[arr, rounded corners=6pt] (reward.south) -- (25.9, 3.7) -| (policy.north);
+% join reward to policy along policy.north y so the last segment never approaches north from below (avoids upward arrowhead on top edge)
+\draw[arr, rounded corners=4pt] (reward.south) -- (reward.south |- policy.north) -- (policy.north);
 \draw[arr] (policy.east)      -- (publish.west);
-% market response: up the right edge of panel C, entirely inside, rounded
+% market response: up the right edge, then left into state summary from the east
 \draw[arrG, rounded corners=6pt] (publish.east) -- (30.6, 3.05)
-    -- (30.6, 9.8)
-    -- node[midway, lbl] {market response} (state.north |- 0, 9.8)
-    -- (state.north);
+    -- (30.6, 8.9)
+    -- node[midway, above, lbl] {market response} (state.east);

 %% ============================================================
 %%  Cross-panel connectors – gutter at y = 1.0..2.2
@@ -152,8 +153,8 @@
 % 2. Score -> State  (depth y=1.45)
 \draw[crossG, rounded corners=6pt]
    (score.south) -- (score.south |- 0, 1.45)
-    -- node[pos=0.5, lbl] {contamination signal} (20.6, 1.45)
-    -- (20.6, 8.9)
+    -- node[pos=0.5, lbl] {contamination signal} (20.2, 1.45)
+    -- (20.2, 8.9)
    -- (state.west);

 % 3. Publish -> Provider  (depth y=1.05, deepest)
--- a/paper/src/chapters/mdp_agent.pdf
+++ b/paper/src/chapters/mdp_agent.pdf
--- a/paper/src/chapters/mdp_human.pdf
+++ b/paper/src/chapters/mdp_human.pdf
--- a/paper/src/main-genpop.tex
+++ b/paper/src/main-genpop.tex
@@ -84,4 +84,18 @@ v4             &  64 & 275 & $64  \times 275 = 17{,}600$  \\

 Converting to petaFLOPS: 160,320 TFLOPS equals approximately 160 PFLOPS. This is the theoretical peak under sustained arithmetic operations; realized throughput depends on memory bandwidth utilization and inter-chip communication overhead, but the figure serves as a useful upper bound for provisioning decisions.

+\section{KL divergence when the reference has zeros}
+\label{app:kl_zeros}
+
+The textbook definition $D_{\mathrm{KL}}(P\parallel Q)=\sum_k P(k)\log(P(k)/Q(k))$ is not usable as-is when our empirical reference puts $Q(k)=0$ somewhere the session distribution still visits: if $P(k)>0$ and $Q(k)=0$, that term wants to blow up to infinity. With only 29 sessions the estimated transition rows are incredibly sparse, so ``never seen in the prototype'' happens a lot.
+
+In code we do the boring fix: add a tiny floor $\varepsilon$ to both the numerator and denominator inside the log so nothing is exactly zero, which turns the sum into a finite, smoothed surrogate rather than a literal KL to raw counts. We also skip source states that do not exist at all in the reference kernel, because there is nowhere honest to compare against. This keeps the pipeline running and the divergence scores on a comparable scale, at the cost that the number is regularized KL-ish behavior, not a purist information-theoretic quantity---which is acceptable here because we only use the gap between human-anchored and agent-anchored scores as a weak separability signal, not as a calibrated physical constant.
+
+\section{Why the logarithm appears in the revelation surrogate}
+\label{app:revelation_log}
+
+$\text{COI}_{\text{leak}} = f(\tau')\cdot\text{InfoValue}$. Either $\text{InfoValue}=c>0$ (query-tax) or $\text{InfoValue}=-\log\pi(p\mid\tau')$ (revelation), with $\pi(\cdot\mid\tau')$ the policy over quoted prices in context $\tau'$.
+
+For probability $q$, $-\log q$ is surprisal; for independent events, $-\log\prod_i q_i=\sum_i(-\log q_i)$. The revelation surrogate is that surprisal under $\pi(\cdot\mid\tau')$, scaled by $f(\tau')$. Use $\max\{\pi,\varepsilon\}$ so the term stays finite (cf.\ Appendix~\ref{app:kl_zeros}).
+
 \end{document}
--- a/paper/src/main.tex
+++ b/paper/src/main.tex
@@ -18,20 +18,23 @@
 \end{titlepage}

 \begin{abstract}
-With accelerated growth of Lager Language Model agents in e-commerce a novel adversarial dynamic to digital markets emerges. This paper address the vulnerability of dynamic pricing systems to AI intermediaries that decouple the information gather stages from the transaction execution. By conducing reconnaissance isolates sessions, agents circumvent the ``Cost of Information'' (COI) defined as the accumulated price premium typically thought demand expression estimators.
-We formally define this phenomenon and derive the Cost of Information Theorem, proving that as the saturation of independent, utility-maximizing agents increases, the platform’s ability to sustain a COI converges to zero, rendering standard dynamic pricing mechanisms incentive-incompatible.
-To respond to this threat we propose a defensive framework which integrates behavioral economics with Adversarially Distributionally Robust Optimization (DRO). We introduce a custom e-commerce research platform built on hybrid Kappa-Lambda architecture, designed to capture and simulate high-fidelity controlled interaction trajectories. We further demonstrate through modeling that human and agent behaviors exhibit distinct transition probability kernels, enabling the construction of discriminative models based on Kullback-Leibler divergence.
-These behavioral signals serve as inputs for a Distributionally Robust Reinforcement Learning (DR-RL) agent. We formulate the pricing problem as a Stackelberg game where the learner optimizes against an ambiguity set of demand distributions defined by the Wasserstein distance. This approach allows the pricing policy to remain robust against non-stationary contamination without overfitting to deterministic demand curves. The research validates a mechanism for preserving margin integrity and market equilibrium in an agent-mediated economy, while minimizing degradation to the legitimate human user experience (UX).
+\noindent
+Large language model (LLM) agents are spreading in e-commerce; one consequence is intermediaries that can separate information gathering from transaction execution. This thesis studies dynamic pricing when agents survey in isolated sessions and thereby weaken the \emph{Cost of Information} (COI), the premium platforms typically extract once demand signals are expressed.
+
+We formalize the phenomenon and prove a Cost of Information theorem: as independent, utility-maximizing agents saturate price queries, the platform's sustainable margin goes to zero, so ordinary dynamic pricing is incentive-incompatible in the limit.
+
+The defensive design combines behavioral signals with distributionally robust optimization (DRO). We implement a controlled storefront on a hybrid batch-streaming architecture and show that human and agent sessions induce different transition kernels. Kullback--Leibler divergence to class prototypes yields session scores that feed a distributionally robust reinforcement learning (DR-RL) policy, posed as a Stackelberg game with a Wasserstein ambiguity set over demand so the learner does not collapse to a single empirical demand curve under shifting contamination.
+
+Factorial training on TPUs shows the expected short-run revenue hit from contamination and that the robust objective recovers COI and equilibrium structure in harder regimes (higher contamination, larger catalogs), accounting for UX to prevent supra-competitive pricing. Code and an interaction dataset are released for work on agent-mediated traffic.
 \end{abstract}

 \noindent\textbf{Keywords:} Dynamic Pricing, LLM Agents, Adversarial Machine Learning, E-commerce, Behavioral Detection, Reinforcement Learning

-\vspace{1em}
-\noindent\textbf{Acknowledgments:} This research was supported by the TPU Research Cloud program, which provided access to Google Cloud TPU accelerators (including TPU v4, v5e, and v6e).
-
 \vspace{0.5em}
 \noindent\textbf{Project page:} \url{https://velocitatem.github.io/PHANTOM/}

+\clearpage
+\tableofcontents
 \clearpage
 \input{chapters/01-intro}
 \input{chapters/02-literature-review}
@@ -40,6 +43,8 @@ These behavioral signals serve as inputs for a Distributionally Robust Reinforce
 \input{chapters/05-discussion}
 \input{chapters/06-conclusion}

+\input{chapters/acknowledgements}
+
 \printbibliography

 \clearpage
@@ -48,14 +53,14 @@ These behavioral signals serve as inputs for a Distributionally Robust Reinforce
 \begin{description}
 \item[Agent $A$] A non-human actor, typically an LLM-driven system that executes web actions toward a goal.
 \item[Human $H$] A human participant interacting with the platform to complete a task.
-\item[Actor Type $\theta$] A latent class parameter describing whether a session is generated by a human or an agent profile.
+\item[Actor Class $Y$] A latent class parameter describing whether a session is generated by a human or an agent profile.
 \item[Platform] A web interface exposing purchasable items and their offered prices.
 \item[Session $s$] A bounded interaction record tied to one actor and one session identifier.
 \item[Event $e_{s,k}$] A single interaction tuple in a session, including action, item target, and timestamp.
 \item[Trajectory $\tau_s$] The ordered sequence of events generated within a session.
 \item[Demand Proxy $\hat{q}_{t,i}$] A weighted aggregate of observed actions used as an operational substitute for latent demand.
 \item[Action Weight Function $\omega(a)$] A mapping from action type to signal strength in the demand proxy.
-\item[True Demand $d(p;\theta)$] The latent purchase response as a function of price and actor type.
+\item[True Demand $d(p\mid Y,\theta)$] The latent purchase response as a function of price, actor class, and latent type.
 \item[Contamination $\alpha$] The proportion of agent-generated traffic in the session mixture.
 \item[Non-stationary Noise $\epsilon_t$] Time-varying residual variation not explained by the actor mixture.
 \item[Pricing Policy $\pi(\tau)$] A function mapping observed interaction history to an offered price.
@@ -110,29 +115,23 @@ v4             &  64 & 275 & $64  \times 275 = 17{,}600$  \\

 Converting to petaFLOPS: $160{,}320\;\text{TFLOPS} = 160.32\;\text{PFLOPS} \approx 160\;\text{PFLOPS}$. This is the theoretical peak under sustained BF16 arithmetic; realized throughput depends on memory bandwidth utilization and inter-chip communication overhead, but the figure serves as a useful upper bound for provisioning decisions.

-\section{Slope-Test Verification: Revenue vs. Contamination}
-\label{app:alpha_revenue_slope}

-This appendix provides a compact verification of the slope result reported in the main results section. Using the same run-level pairs $x_i=\texttt{study/alpha}_i$ and $y_i=\texttt{eval/revenue\_mean}_i$ ($n=95$), we re-checked the ordinary least squares slope test in Python with standard test routines (SciPy two-sided $t$ test for the slope).

-\[
-\widehat{y}=326{,}878.57-60{,}631.95\,x,
-\]
-\[
-t(93)=-8.2148,\qquad p=1.2038\times 10^{-12},\qquad R^2=0.4205,\qquad 95\%\,\text{CI}_{\beta_1}=[-75{,}288.76,\,-45{,}975.13].
-\]
+\section{KL divergence when the reference has zeros}
+\label{app:kl_zeros}

-The Python verification reproduces the reported coefficients and inference values, confirming that the slope-test results are correct under standard methods.
+The textbook definition $D_{\mathrm{KL}}(P\parallel Q)=\sum_k P(k)\log(P(k)/Q(k))$ is not usable as-is when our empirical reference puts $Q(k)=0$ somewhere the session distribution still visits: if $P(k)>0$ and $Q(k)=0$, that term diverges to $+\infty$. With only 29 sessions the estimated transition rows are very sparse.

-\section{whoclickedit Dataset Card}
-\label{app:whoclicked_card}
+In code we apply the basic fix: add a tiny floor $\varepsilon$ to both the numerator and denominator inside the log so nothing is exactly zero, which turns the sum into a finite, smoothed surrogate rather than a literal KL divergence on raw counts. We also skip source states that do not exist at all in the reference kernel, because there is no reference row to compare against. This keeps the pipeline running and the divergence scores on a comparable scale, at the cost that the number is a regularized, KL-like score rather than an exact information-theoretic quantity; this is acceptable here because we only use the gap between human-anchored and agent-anchored scores as a weak separability signal.

-For transparency and reproducibility, this appendix includes the full dataset card used for the public release of the \texttt{whoclickedit} dataset.

-\lstinputlisting[
-  caption={whoclickedit dataset card (README snapshot)},
-  label={lst:whoclicked_dataset_card}
-]{chapters/auto/whoclicked_dataset_card.md}
+\section{Expanding the Intuition of Information Value in the Reward}
+\label{app:revelation_log}
+
+Leakage is $\text{COI}_{\text{leak}} = f(\tau')\cdot\text{InfoValue}$. The query-tax form fixes $\text{InfoValue}=c>0$. The revelation form sets $\text{InfoValue}(p,\tau')=-\log\pi(p\mid\tau')$, with $\pi(\cdot\mid\tau')$ the policy distribution over quoted prices in context $\tau'$ (discretized as in the engine).
+
+For an outcome with probability $q$, the quantity $-\log q$ is its \emph{surprisal}. For independent events, $-\log\prod_i q_i=\sum_i(-\log q_i)$. The revelation term is the surprisal under $X\sim\pi(\cdot\mid\tau')$, multiplied by $f(\tau')$. In practice we use $\max\{\pi,\varepsilon\}$ in place of $\pi$ so the log stays finite (in the same spirit as Appendix~\ref{app:kl_zeros}).
+

 % \input{../build/concatenated_code}

--- a/paper/src/mirrors/cais2026/main.tex
+++ b/paper/src/mirrors/cais2026/main.tex
@@ -300,9 +300,9 @@ where $W_p$ is the $p$-Wasserstein distance and $\epsilon > 0$ is the ambiguity
 The platform seeks a policy $\pi^*$ that maximizes worst-case revenue over the ambiguity set while penalizing information leakage to suspected agents:
 \begin{equation}
 \label{eq:robust_policy}
-\pi^* = \arg \max_{\pi} \min_{Q \in \mathcal{U}_\epsilon} \; \mathbb{E}_{d \sim Q} \left[ R(p, d) - \lambda \cdot \text{COI}_{\text{leak}}(p, \tau') - \eta \cdot \text{UX}(\tau', p) \right]
+\pi^* = \arg \max_{\pi} \min_{Q \in \mathcal{U}_\epsilon} \; \mathbb{E}_{d \sim Q} \left[ R(p, d) - \lambda \cdot \text{COI}_{\text{leak}}(p, \tau') - \eta_{\text{ux}} \cdot \text{UX}(\tau', p) \right]
 \end{equation}
-where $R(p, d) = p \cdot d$ is the revenue function.
+where $R(p, d) = p \cdot d$ is the revenue function, $\lambda$ scales COI leakage, and $\eta_{\text{ux}}$ scales the UX penalty with $\text{UX}(\tau', p)\in[0,1]$.

 \begin{definition}[COI Leakage]
 The per-query information leakage cost is:
--- a/paper/src/mirrors/genpop/03-methodology.tex
+++ b/paper/src/mirrors/genpop/03-methodology.tex
@@ -83,7 +83,7 @@ In order for our research to have grounding in interactions we built a robust e-

 The architecture of this platform begins with the deployed web-apps posting interaction data to our backend which processes them and stores each ingested interaction into a kafka cluster. This serves as our data reservoir tracking and associating each interaction with its session and importantly with which experiment it belongs to. Not only do we track the behavioral interactions, but our pricing provider micro-service, once called by the frontend reports the observed/queried price-product into kafka. This kafka cluster is subscribed to by our pipeline which is configured on a schedule in Airflow, with the possibility of manual trigger. The final stage of the pricing pipeline, submits computed dynamic pricing results into a redis database for quick updates which is then read by the pricing provider and displayed on the webapp. This is a very generic end-to-end mechanism which is applicable to a variety of different e-commerce tasks. We intentionally put emphasis on the development of this infrastructure to establish a reproducible framework for interaction and to minimize any noise.

-\paragraph{Public Web Artifact} We transition the Kappa like architecture of the data collection to a Lambda architecture for actual learning in a surrogate environment. This allows us to move faster on data which is provided and helps us create a feedback loop for production deployment. To support further research in this intersection of fields we release P4P \footnote{\url{https://github.com/velocitatem/p4p}} as a public repository providing the interaction layer of the PHANTOM framework. This provides a configurable storefront which can be tailored to any commercial setting with a standardized session-level event tracking. We document the API adapters or what the framework expects in terms of schemas for pricing providers and log ingestion servicse. The repository is intended for controlled experimentation and method replication rather than production commerce deployment.
+\paragraph{Public Web Artifact} We transition the Kappa-like architecture of the data collection to a Lambda architecture for actual learning in a surrogate environment. This allows us to move faster on data which is provided and helps us create a feedback loop for production deployment. To support further research in this intersection of fields we release P4P \footnote{\url{https://github.com/velocitatem/p4p}} as a public repository providing the interaction layer of the PHANTOM framework. This provides a configurable storefront which can be tailored to any commercial setting with a standardized session-level event tracking. We document the API adapters or what the framework expects in terms of schemas for pricing providers and log ingestion services. The repository is intended for controlled experimentation and method replication rather than production commerce deployment.

 \subsubsection{DevOps Principles}

@@ -130,9 +130,9 @@ To speak to realism, user interviews reported that the platform architecture mir

 The dynamic pricing mechanism elicited immediate behavioral adjustments. Participants were sensitive to price volatility: sudden boosts triggered urgency and faster booking attempts, while large listing-to-final discrepancies triggered deeper comparison behavior. This is comforting because the controlled setup still produces commercially relevant interaction data.

-\subsubsection{Design of Training Factorial Study}
+\subsubsection{Design of Training Sweeps}

-The simulator has multiple configurable factors. We design a multi-factor study across five axes derived from the sweep configurations: (1) RL algorithm (PPO, A2C, DQN, Q-table; 4 levels), (2) contamination ratio sampled at four representative levels between 0.1 and 0.6, (3) robustness radius (3 levels), (4) COI penalty weight at two reference levels, and (5) pricing action granularity (two discretization settings for action levels); giving a grid of 192 configurations. Statistical power for the behavioral comparisons is determined by a two-sample test over per-session divergence scores.
+The simulator has multiple configurable factors. Training runs are driven by Weights \& Biases sweep definitions versioned with the codebase, mixing random and grid schedules rather than a single full factorial. For the contamination ratio $\alpha$, exploratory sweeps draw $\alpha$ uniformly on $[0.1,0.6]$; some sweeps use the narrower interval $[0.1,0.5]$. Grid sweeps fix explicit level sets, for example $\alpha\in\{0.1,0.2,0.3,0.4,0.6,0.8\}$ (six levels, including $0.8$ beyond the typical exploratory upper endpoint) or five levels $\{0.1,0.2,0.3,0.4,0.6\}$. Auxiliary schedules also include $\alpha=0$ alongside positive values. Robustness radius $\epsilon_\alpha$, COI penalty $\lambda_\text{coi}$, RL algorithm (\texttt{ppo}, \texttt{a2c}, \texttt{dqn}, \texttt{qtable}), and the discretization of the price action grid vary by sweep. Broad random search may use uniform $\epsilon_\alpha\in[0,0.3]$ and $\lambda_\text{coi}\in[0.05,0.6]$; tighter grids may fix $\epsilon_\alpha=0.2$ and restrict $\lambda_\text{coi}$ to $\{0.15,0.30\}$. Behavioral distinguishability is assessed with a two-sample Mann--Whitney test on per-session divergence gap scores at cohort sizes $n_H=13$ and $n_A=16$.

 While this scale is generally expensive for reinforcement learning, we execute it on a large TPU cluster to make the sweep tractable.

--- a/paper/src/mirrors/genpop/05-discussion.tex
+++ b/paper/src/mirrors/genpop/05-discussion.tex
@@ -2,9 +2,11 @@

 \subsection{Transition to Agentic Market Microstructure}

-Our analysis of the interaction dynamics between the platform and non-human actors suggests that the current static pricing models are insufficient for an agent-mediated economy. If we assume a transition toward a direct revelation mechanism, where actors must reveal their true valuation of a good through bidding dynamics, we inevitably introduce significant stochasticity into the pricing system. Unlike traditional e-commerce where prices are relatively sticky, such a mechanism implies a high volatility characteristic of financial equity markets (without the fungability however).
+Our analysis of the interaction dynamics between the platform and non-human actors suggests that the current static pricing models are insufficient for an agent-mediated economy. If we assume a transition toward a direct revelation mechanism, where actors must reveal their true valuation of a good through bidding dynamics, we inevitably introduce significant stochasticity into the pricing system. Unlike traditional e-commerce where prices are relatively sticky, such a mechanism implies a high volatility characteristic of financial equity markets (without the fungibility however).

-However, ecommerce commodities differ fundamentally from financial securities: they possess a hard floor defined by unit economics and reservation prices. The market might react enthusiastically to an iPhone priced at \$1, such a transaction is not permissible. The platform must establish an initial valuation anchor defined by the marginal cost plus a target margin, around which the market price is permitted to fluctuate. We float the introduction of GenAI Agents as Institutional Market Makers. As the arms race for greater autonomy of agnetic systems grows, the commercial viability of AI agents has the potential to disseminate into every-day users directly interacting with them rather than e-commerce platforms. This is also under the assumption of expected transactional capabilities being given to AI Agents.
+However, e-commerce commodities differ fundamentally from financial securities: they possess a hard floor defined by unit economics and reservation prices. Although the market might react enthusiastically to an iPhone priced at \$1, such a transaction is not permissible. The platform must establish an initial valuation anchor ($P_0$) defined by the marginal cost plus a target margin, around which the market price is permitted to fluctuate.
+
+We float the introduction of GenAI Agents as Institutional Market Makers. As the arms race for greater autonomy of agentic systems grows, the commercial viability of AI agents has the potential to disseminate into everyday users directly interacting with them rather than e-commerce platforms. This is also under the assumption of expected transactional capabilities being given to AI Agents.

 \subsection{Risk Assessment and Limitations}

--- a/paper/src/preamble.tex
+++ b/paper/src/preamble.tex
@@ -25,6 +25,7 @@
 \usepackage{graphicx}
 \usepackage{hyperref}
 \usepackage{booktabs}
+\usepackage[american]{babel}
 \usepackage{csquotes}
 \usepackage{subcaption}
 \usepackage{siunitx}
@@ -57,8 +58,8 @@
    literate={·}{{\textperiodcentered}}1 {−}{{\textminus}}1 {—}{{---}}1 {–}{{--}}1
 }

-% Use biblatex with authoryear style for in-text citations like (Author, Year)
-\usepackage[backend=bibtex,style=authoryear,natbib=true,maxcitenames=2]{biblatex}
+% APA 7-style references and citations (requires biber)
+\usepackage[style=apa,backend=biber]{biblatex}
 \addbibresource{bib/references.bib}

 % Page headers (SciTech format)
--- a/paper/src/summary.tex
+++ b/paper/src/summary.tex
@@ -0,0 +1,110 @@
+% -*- TeX-master: t -*-
+% Two-page summary: one self-contained source file (no \input chapters).
+\documentclass[10pt,letterpaper]{article}
+
+\input{preamble}
+
+\begin{document}
+\singlespacing
+\setlength{\parskip}{0.35em}
+\setlength{\parindent}{0pt}
+\small
+\fancyhead[L]{}
+
+\begin{center}
+  {\small\url{https://velocitatem.github.io/PHANTOM/}}\\[0.65em]
+  {\large\bfseries PHANTOM: Pricing Heuristics Against Non-human\\[0.15em] Transaction Orchestration Mechanisms}\\[0.55em]
+  {\normalsize Daniel Rösel\footnote{Bachelor of Computer Science \& Artificial Intelligence @ IE University, Madrid}}\\[0.55em]
+  {\small Supervised by Alberto Martín Izquierdo}\\[0.35em]
+  {\small \today}
+\end{center}
+
+\vspace{0.75em}
+
+Large language model (LLM) agents are spreading in e-commerce; one consequence is intermediaries that can separate information gathering from transaction execution.
+This thesis studies dynamic pricing when agents reconnoitre in isolated sessions and thereby weaken the \emph{Cost of Information} (COI), the premium platforms typically extract once demand signals are expressed.
+The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on \parencite{xia_evaluation-driven_2025}.
+Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control.
+The missing bridge is a principled framework for distinguishing non-human reconnaissance from genuine human demand expression and integrating that distinguishability into pricing heuristics without degrading legitimate user experience (in our research tracked by the user-experience index).
+This economic framing also helps separate two related but distinct phenomena of agents as buyers (changing market demand composition), and agents as information gatherers (changing the observed interactions used by pricing/recommendation systems).
+The thesis focuses on the second, where information acquisition strategically precedes purchase execution.
+Our effort to combat contamination stems from research by \textcite{hardt_strategic_2015} on strategic classification, in conjunction with \textcite{liu_contextual_2024} who demonstrate a linear regret if contamination is ignored.
+To bridge the gap between detection and robust pricing, we look at work in Distributionally Robust Optimization (DRO): by optimizing for the worst-case distribution within this set, pricing mechanisms can become resilient to the distributional shifts such as the ones caused by non-human actors \parencite{kuhn_wasserstein_2024}.
+In order to create an environment in which prices can be tested against a demand estimate generated by some behavioral model, we take inspiration from the architecture proposed by \textcite{ie_recsim_2019} in the RecSim platform built for recommendation systems.
+The key component of this mediation between agents and commercial platforms lies in the transaction costs related to information gathering and negotiation.
+As proposed by \textcite{shahidi_coasean_2025} these costs are bound to collapse towards zero (which we demonstrate mathematically), calling for a re-evaluation of the boundaries between firms and markets.
+
+\vspace{0.5em}
+In this paper we present an exploration and defense against the presence of new commercial entities in digitally powered platforms, preserving market equilibrium in the age of AI.
+We formally define interaction data as coming from some actor which can either be an agent ($A$) or human ($H$).
+Dynamic pricing algorithms rely on directly translating demand features $q$ to new price assignments $\hat{p}$ across a catalogue of products of size $N$.
+This opens opportunities to design a \textit{tabula rasa} of digital market mechanisms that will shape the future of commerce in the age of artificial intelligence.
+We propose a robust optimization objective defined in our methodology, transforming the pricing problem into a form of Distributionally Robust Optimization \parencite{kuhn_distributionally_2025} where the learner must guard against adversarial contamination in observed demand distributions.
+For purposes of this research, an agent is an algorithmic loop with the ability to access a web platform and perform actions such as clicks, scrolls, and input field fills.
+
+\vspace{0.5em}
+The platform does not directly observe the true underlying demand function $d(p)$, where both $d$ and our proxy $\hat{q}$ take values in $\mathbb{R}^{+}$.
+Instead, it observes a behavioral proxy $\hat{q}_t$, which is a composite signal derived from the mixture of actor types.
+The total observed demand is a stochastic process governed by the naively defined mixture $Q(p) = (1-\alpha) \cdot \mathbb{E}_{\theta \sim \mathcal{D}_H}[d(p\mid Y=H,\theta)] + \alpha \cdot \mathbb{E}_{\theta \sim \mathcal{D}_A}[d(p\mid Y=A,\theta)] + \epsilon_t$ where $\alpha \in [0, 1]$ represents the contamination parameter (proportion of agents) and $\epsilon_t$ is non-stationary market noise.
+The platform's pricing power comes from information asymmetry: users who express strong interest signals pay more than the base price.
+We quantify this markup as the \textit{Cost of Information} (COI), which represents the average premium extracted above marginal cost.
+We formally demonstrate that standard dynamic pricing mechanisms are not incentive-compatible with high-frequency agentic traffic.
+As the number of independent competitive agents $N$ querying the system grows, the platform's ability to sustain a COI vanishes.
+
+\vspace{0.5em}
+In order for our research to have grounding in interactions we built a robust e-commerce web-platform.
+The architecture of this platform begins with the deployed web-apps posting interaction data to our backend which processes them and stores each ingested interaction into a Kafka cluster.
+This serves as our data reservoir tracking and associating each interaction with its session and importantly with which experiment it belongs to.
+Not only do we track the behavioral interactions, but our pricing provider micro-service, once called by the frontend, reports the observed/queried price-product pair into Kafka.
+This Kafka cluster is subscribed to by our pipeline which is configured on a schedule in Airflow, with the possibility of manual trigger.
+The final stage of the pricing pipeline submits computed dynamic pricing results to a Redis database for quick updates, which is then read by the pricing provider and displayed on the webapp.
+This is a very generic end-to-end mechanism which is applicable to a variety of different e-commerce tasks.
+We intentionally put emphasis on the development of this infrastructure to establish a reproducible framework for interaction and to minimize any noise.
+In addition to behavioral events, the platform logs price observations to a separate Kafka topic.
+Each price query generates a record $(i, p, \text{sid}, \phi, t)$ associating the product, displayed price, requesting session, platform mode, and timestamp.
+This dual-stream architecture enables joint analysis of price exposure and behavioral response.
+We transition the Kappa-like architecture of the data collection to a Lambda architecture for actual learning in a surrogate environment.
+This allows us to move faster on data which is provided and helps us create a feedback loop for production deployment.
+Operationally, goals and experiment runs are tracked in PostgreSQL (goal table, run table, and assignment mapping).
+This data-acquisition phase is the first half of the methodology and is intentionally a disconnected component that feeds the later contributions.
+The second half uses collected behavioral traces to distinguish classes $Y \in \{A,H\}$ with session-conditioned probability estimates, then injects those estimates into the pricing learner.
+Our process follows three stages: (1) observe and \textit{vectorize} behavioral interactions, (2) learn distinguishability to characterize human versus agent patterns, and (3) use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator.
+Our web platform (developed in similar spirit to RecSim \parencite{ie_recsim_2019}) gives us a controlled environment where tasks are assigned to human and agentic actors and then executed.
+
+\vspace{0.5em}
+Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels $Y_s \in \{H, A\}$ are available as ground truth rather than as the output of a heuristic classifier.
+We therefore estimate separate transition kernels directly from each labeled partition $\mathcal{D}_H$ and $\mathcal{D}_A$, treating the resulting $\hat{\mathcal{T}}_H$ and $\hat{\mathcal{T}}_A$ as the ground-truth behavioral profiles for each class.
+This allows us to construct a \textit{Contamination Generator} $\mathcal{G}(\alpha)$.
+We formulate pricing as a Stackelberg game: the platform (leader) sets prices $p_t$, and the population (follower) responds through trajectories and demand.
+Because contamination level $\alpha$ and demand shift are non-stationary online, a simple error term is not enough.
+We therefore use a Distributionally Robust Optimization objective.
+We define an ambiguity set $\mathcal{U}_\epsilon(\hat{P}_N)$ centered around our empirical reference distribution $\hat{P}_N$ (derived from the generator $\mathcal{G}$).
+We utilize the Wasserstein distance metric to define the set of plausible demand distributions the agent might face.
+The robust policy $\pi^*$ is obtained by solving the maximin problem $\pi^* = \arg \max_{\pi} \min_{Q \in \mathcal{U}_\epsilon} \mathbb{E}_{d \sim Q} \left[ R(p, d) - \lambda \cdot \text{COI}_{\text{leak}}(p,\tau') - \eta_{\text{ux}} \cdot \text{UX}(\tau', p) \right]$ where $R(p, d)$ is the revenue function, $\lambda$ weighs the information-leakage penalty, and $\eta_{\text{ux}}$ weighs the UX term.
+In practice, we parameterize this with a session-level leakage term $\text{COI}_{\text{leak}}(p,\tau') = f(\tau')\cdot \text{InfoValue}(p,\tau')$ where $f(\tau')$ is the weak agent probability.
+As part of reward engineering, we keep a UX factor ($UX\in[0,1]$) as an auxiliary evaluation axis.
+Our training budget is provisioned through TPU Research Cloud and spans 320 chips across TPU v4, v5e, and v6e generations, with a spot-heavy allocation plus an on-demand reserve.
+At peak BF16 throughput this corresponds to approximately $160$\,PFLOPS of aggregate compute.
+
+\vspace{0.5em}
+The sign structure is consistent with the theoretical expectation: human sessions produce negative gap scores (closer to the human centroid, far from the agent centroid) while agent sessions produce positive gap scores (closer to the agent centroid).
+The two-sided test result ($p<0.001$) at $n_H=13$, $n_A=16$ indicates strong rank distinction between groups, providing evidence that the transition kernels are distinguishable enough to justify their use as a control signal in downstream pricing.
+Interpreted on the contamination grid, a $+0.1$ increase in $\alpha$ corresponds to an average revenue decrease of about $9{,}014$ units, and the robust check preserves both direction and significance.
+The ability to extract COI is greater in the presence of robustness within the training loop; empirical evidence shows that agent contamination reduces revenue and that robustness is condition-dependent, requiring explicit calibration rather than a one-size-fits-all penalty.
+
+\vspace{0.5em}
+Our analysis of the interaction dynamics between the platform and non-human actors suggests that the current static pricing models are insufficient for an agent-mediated economy.
+This technology does not come without a more bitter side: ethical concerns do arise from the idea of deploying black-box-like solutions to set prices based on behavioral attributes.
+
+\vspace{0.5em}
+Contributions include formalization of non-human transaction orchestration in e-commerce as a distinct source of contamination, definition of COI together with a theorem showing its erosion under increasing agent saturation, a controlled e-commerce research platform built on a hybrid Kappa-Lambda architecture, empirical validation of behavioral distinguishability, translation of distinguishability into a distributionally robust reinforcement learning formulation, and release of a reusable public experimental artifact.
+
+\vspace{0.65em}
+\noindent\textbf{Acknowledgments.}\quad
+This research was supported by the TPU Research Cloud program, which provided access to Google Cloud TPU accelerators (including TPU v4, v5e, and v6e).
+Eugene Bykovets, PhD---ETH.
+
+\renewcommand*{\bibfont}{\footnotesize}
+\printbibliography[title={References}]
+
+\end{document}
--- a/scripts/nx_paper.sh
+++ b/scripts/nx_paper.sh
@@ -4,16 +4,58 @@ set -euo pipefail

 cmd="${1:-}"

+# Regenerate the MDP figures via models.py and copy the human/agent PDFs into the
+# paper's chapters directory. All paths are derived from this script's own location,
+# so the result does not depend on the caller's working directory.
+sync_mdp_figures() {
+  local script_dir project_root sim_dir chapters_dir
+  script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+  project_root="$(cd "$script_dir/.." && pwd)"
+  sim_dir="$project_root/sim/rl/behavior_loader"
+  chapters_dir="$project_root/paper/src/chapters"
+
+  printf '%s\n' 'Refreshing MDP figures for paper...'
+  # Subshell keeps the cd local to this step; models.py runs from its own directory.
+  (
+    cd "$sim_dir"
+    python models.py
+  )
+
+  # models.py is expected to have written these PDFs into $sim_dir; cp fails otherwise.
+  cp "$sim_dir/human_mdp_viz.pdf" "$chapters_dir/mdp_human.pdf"
+  cp "$sim_dir/agent_mdp_viz.pdf" "$chapters_dir/mdp_agent.pdf"
+}
+
+# Biber runs with cwd paper/build; \addbibresource{bib/references.bib} must resolve there.
+# Symlink makes biber log 'bib/references.bib' (not ../src/...) so latexmk's post-check passes.
+# Uses paths relative to the current directory: callers cd into paper/src first.
+link_build_bib() {
+  # -f/-n replace an existing bib symlink rather than creating a link inside it.
+  # The target ../src/bib is resolved relative to ../build, i.e. paper/src/bib.
+  ln -sfn ../src/bib ../build/bib
+}
+
+# Biblatex uses biber; a stale latexmk fdb can still record ["bibtex <job>"], so latexmk skips
+# biber, main.bbl is missing or wrong, and every citation stays undefined. Drop only that case.
+# Paths are relative to the current directory: callers cd into paper/src first.
+drop_stale_latexmk_bibtex_fdb() {
+  local job fdb tag
+  # Check the fdb file of every jobname this script builds.
+  for job in main main-genpop summary; do
+    fdb="../build/${job}.fdb_latexmk"
+    tag=$(printf '["bibtex %s"]' "$job")
+    # -F matches the tag literally (it contains '[' and '"'); -q only sets the exit status.
+    if [[ -f "$fdb" ]] && grep -Fq "$tag" "$fdb"; then
+      rm -f "$fdb"
+    fi
+  done
+}
+
 case "$cmd" in
  build)
    mkdir -p paper/build
+    sync_mdp_figures
    bash paper/concat_code.sh
    cd paper/src
+    link_build_bib
+    drop_stale_latexmk_bibtex_fdb
    latexmk -pdf -jobname=main -f -interaction=nonstopmode -file-line-error -r ../.latexmkrc -outdir=../build main.tex
    ;;
  watch)
    mkdir -p paper/build
+    sync_mdp_figures
    cd paper/src
+    link_build_bib
+    drop_stale_latexmk_bibtex_fdb
    latexmk -pvc -pdf -jobname=main -f -interaction=nonstopmode -file-line-error -r ../.latexmkrc -outdir=../build main.tex
    ;;
  clean)
@@ -33,12 +75,18 @@ case "$cmd" in
    ;;
  build-genpop)
    mkdir -p paper/build
+    sync_mdp_figures
    cd paper/src
+    link_build_bib
+    drop_stale_latexmk_bibtex_fdb
    latexmk -pdf -jobname=main-genpop -f -interaction=nonstopmode -file-line-error -r ../.latexmkrc -outdir=../build main-genpop.tex
    ;;
  watch-genpop)
    mkdir -p paper/build
+    sync_mdp_figures
    cd paper/src
+    link_build_bib
+    drop_stale_latexmk_bibtex_fdb
    latexmk -pvc -pdf -jobname=main-genpop -f -interaction=nonstopmode -file-line-error -r ../.latexmkrc -outdir=../build main-genpop.tex
    ;;
  build-arxiv)
@@ -50,6 +98,20 @@ case "$cmd" in
    pdflatex -interaction=nonstopmode -file-line-error main.tex
    cp main.pdf ../../../build/main-arxiv.pdf
    ;;
+  build-summary)
+    mkdir -p paper/build
+    cd paper/src
+    link_build_bib
+    drop_stale_latexmk_bibtex_fdb
+    latexmk -pdf -jobname=summary -f -interaction=nonstopmode -file-line-error -r ../.latexmkrc -outdir=../build summary.tex
+    ;;
+  watch-summary)
+    mkdir -p paper/build
+    cd paper/src
+    link_build_bib
+    drop_stale_latexmk_bibtex_fdb
+    latexmk -pvc -pdf -jobname=summary -f -interaction=nonstopmode -file-line-error -r ../.latexmkrc -outdir=../build summary.tex
+    ;;
  *)
    printf '%s\n' "Unknown paper command: $cmd" >&2
    exit 1
--- a/sim/case/thesis_simplified/separability.py
+++ b/sim/case/thesis_simplified/separability.py
@@ -3,10 +3,13 @@
 Computes divergence signals delta_H, delta_A from session trajectories using
 transition kernel estimation and KL divergence to prototype behavioral profiles.
 """
+
 from __future__ import annotations
 from typing import Dict, List, Tuple, TYPE_CHECKING
 import numpy as np

+from lib.agent_probability import DEFAULT_AGENT_PRIOR, estimate_agent_probability
+
 if TYPE_CHECKING:
    from .simplified import Event, Session

@@ -32,7 +35,10 @@ TRANS_A = {
 def kl_div(p: Dict[str, float], q: Dict[str, float], eps: float = 1e-10) -> float:
    """KL divergence D_KL(p || q) for discrete distributions."""
    keys = set(p.keys()) | set(q.keys())
-    return sum(p.get(k, eps) * np.log((p.get(k, eps) + eps) / (q.get(k, eps) + eps)) for k in keys)
+    return sum(
+        p.get(k, eps) * np.log((p.get(k, eps) + eps) / (q.get(k, eps) + eps))
+        for k in keys
+    )


 def build_kernel(events: List["Event"]) -> Dict[str, Dict[str, float]]:
@@ -44,7 +50,11 @@ def build_kernel(events: List["Event"]) -> Dict[str, Dict[str, float]]:
        trans.setdefault(prev, {})
        trans[prev][curr] = trans[prev].get(curr, 0) + 1
        prev = curr
-    return {s: {d: c / sum(dsts.values()) for d, c in dsts.items()} for s, dsts in trans.items() if sum(dsts.values()) > 0}
+    return {
+        s: {d: c / sum(dsts.values()) for d, c in dsts.items()}
+        for s, dsts in trans.items()
+        if sum(dsts.values()) > 0
+    }


 def compute_divergence(session: "Session") -> Tuple[float, float]:
@@ -55,18 +65,35 @@ def compute_divergence(session: "Session") -> Tuple[float, float]:
    """
    kernel = build_kernel(session.events)
    if not kernel:
-        return 0.5, 0.5
-    delta_h = sum(kl_div(kernel.get(s, {}), TRANS_H.get(s, {})) for s in kernel) / len(kernel)
-    delta_a = sum(kl_div(kernel.get(s, {}), TRANS_A.get(s, {})) for s in kernel) / len(kernel)
+        return 0.0, 0.0
+    delta_h = sum(kl_div(kernel.get(s, {}), TRANS_H.get(s, {})) for s in kernel) / len(
+        kernel
+    )
+    delta_a = sum(kl_div(kernel.get(s, {}), TRANS_A.get(s, {})) for s in kernel) / len(
+        kernel
+    )
    return delta_h, delta_a


-def estimate_alpha(session: "Session", beta: float = 2.0) -> float:
-    """Per-session contamination estimate alpha_hat = sigma(beta*(delta_H - delta_A)).
+def estimate_alpha(
+    session: "Session",
+    beta: float = 2.0,
+    prior_agent: float = DEFAULT_AGENT_PRIOR,
+) -> float:
+    """Per-session contamination estimate alpha_hat = sigma((delta_H - delta_A) / T).
+
+    T is the temperature handed to estimate_agent_probability: 1 / beta when
+    beta > 0, otherwise 1.0. When the two divergences sum to <= 0 (e.g. a
+    session with no transitions), prior_agent is returned unchanged.
+    NOTE(review): the exact mapping lives in lib.agent_probability, which is
+    not visible here -- confirm the formula above against that helper.

     Returns probability session is agent-generated based on behavioral divergence.
     """
     dh, da = compute_divergence(session)
     if (dh + da) <= 0:
-        return 0.5
-    return 1.0 / (1.0 + np.exp(-beta * (dh - da)))
+        # No divergence signal: fall back to the prior instead of a fixed 0.5.
+        return float(prior_agent)
+    if beta <= 0:
+        # 1 / beta is undefined or negative here, so use a neutral temperature.
+        return estimate_agent_probability(
+            dh, da, temperature=1.0, prior_agent=prior_agent
+        )
+    return estimate_agent_probability(
+        delta_h=dh,
+        delta_a=da,
+        temperature=1.0 / beta,
+        prior_agent=prior_agent,
+    )
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -3,7 +3,7 @@ try:
 except ImportError:
    from sim.rl.behavior_loader.loader import Loader, AgentLoader, JointLoader
 from collections import defaultdict
-from typing import Dict, List, Tuple, Set
+from typing import Dict, List, Optional, Set, Tuple
 import numpy as np
 import graphviz
 import sys
@@ -195,6 +195,110 @@ def aggregate_event_transitions(mdp: Dict) -> Dict[str, Dict[str, float]]:
    return dict(evt_trans)


+def _resolve_event_order(
+    evt_trans: Dict[str, Dict[str, float]],
+    event_order: Optional[List[str]] = None,
+) -> List[str]:
+    """Return the event ordering used to lay out the graph.
+
+    Args:
+        evt_trans: Mapping of source event -> {destination event: value}.
+        event_order: Optional preferred ordering. Duplicates are dropped,
+            keeping the first occurrence.
+
+    Returns:
+        The de-duplicated ``event_order`` followed by any observed events it
+        does not mention (sorted). Entries of ``event_order`` that were never
+        observed are kept. With no (or an empty) ``event_order``, all observed
+        events in sorted order.
+    """
+    # Observed events are every source plus every destination.
+    observed = set(evt_trans.keys()) | {
+        dst for transitions in evt_trans.values() for dst in transitions
+    }
+    if event_order:
+        # dict.fromkeys de-duplicates while preserving insertion order.
+        ordered = list(dict.fromkeys(event_order))
+        missing = sorted(observed - set(ordered))
+        return ordered + missing
+    return sorted(observed)
+
+
+def _compass_from_angle(angle_rad: float) -> str:
+    """Map an angle in radians to the nearest of eight compass port names.
+
+    0 maps to "e" and the names advance every pi/4 in the order
+    e, ne, n, nw, w, sw, s, se (so pi/2 maps to "n").
+    """
+    ports = ("e", "ne", "n", "nw", "w", "sw", "s", "se")
+    # Fold the angle into [0, 2*pi) so negative inputs index correctly.
+    normalized = (angle_rad + (2 * np.pi)) % (2 * np.pi)
+    step = np.pi / 4
+    # Round to the closest port; the modulo wraps values near 2*pi back to "e".
+    idx = int(np.round(normalized / step)) % len(ports)
+    return ports[idx]
+
+
+def _edge_ports(
+    src: str,
+    dst: str,
+    positions: Dict[str, Tuple[float, float]],
+    has_reverse: bool,
+) -> Tuple[str, str]:
+    """Pick compass ports for an edge from ``src`` to ``dst``.
+
+    Args:
+        src: Source node name; must be a key of ``positions``.
+        dst: Destination node name; must be a key of ``positions``.
+        positions: Node name -> (x, y) coordinates.
+        has_reverse: Whether the opposite edge (dst -> src) is also drawn.
+
+    Returns:
+        ``(tail_port, head_port)``: the port facing along the edge direction
+        and the port facing the opposite way.
+
+    Raises:
+        KeyError: If ``src`` or ``dst`` is missing from ``positions``.
+    """
+    src_x, src_y = positions[src]
+    dst_x, dst_y = positions[dst]
+    angle = float(np.arctan2(dst_y - src_y, dst_x - src_x))
+
+    if has_reverse:
+        # Tilt the two directions of a bidirectional pair opposite ways; the
+        # name comparison makes the choice deterministic.
+        # NOTE(review): the bend (pi/10) is smaller than half a port step
+        # (pi/8), so for an edge whose angle is an exact multiple of pi/4 both
+        # directions still resolve to the same ports -- confirm this is intended.
+        bend = np.pi / 10
+        angle += bend if src < dst else -bend
+
+    tail_port = _compass_from_angle(angle)
+    head_port = _compass_from_angle(angle + np.pi)
+    return tail_port, head_port
+
+
+def _edge_style(prob: float) -> Dict[str, str]:
+    """Build edge attributes whose visual weight grows with ``prob``.
+
+    The color is picked from four gray levels at the thresholds 0.75, 0.50
+    and 0.25 (darkest for the highest probabilities); pen width and arrow
+    size scale linearly with ``prob``. All values are returned as strings.
+    """
+    if prob >= 0.75:
+        edge_color = "#111827"
+    elif prob >= 0.50:
+        edge_color = "#374151"
+    elif prob >= 0.25:
+        edge_color = "#6b7280"
+    else:
+        edge_color = "#9ca3af"
+    return {
+        "color": edge_color,
+        "fontcolor": "#111827",
+        "fontsize": "10",
+        # Linear ramps: 0.90..4.50 for penwidth and 0.55..1.10 for arrowsize
+        # as prob goes from 0 to 1.
+        "penwidth": f"{0.9 + 3.6 * prob:.2f}",
+        "arrowsize": f"{0.55 + 0.55 * prob:.2f}",
+    }
+
+
+def _format_node_label(evt: str) -> str:
+    """Wrap an underscore-separated event name onto multiple label lines.
+
+    Tokens are joined with "_" until adding the next one would push the line
+    past 16 characters; the name then continues on a new line and the
+    underscore at the break is dropped. A name without underscores is returned
+    unchanged, and a single token longer than the limit is never split.
+    """
+    max_line_len = 16
+    tokens = evt.split("_")
+    if len(tokens) == 1:
+        return evt
+
+    lines: List[str] = []
+    curr = ""
+    for token in tokens:
+        # The separator only counts when the token continues a non-empty line.
+        piece = token if not curr else f"_{token}"
+        if curr and len(curr) + len(piece) > max_line_len:
+            lines.append(curr)
+            curr = token
+        else:
+            curr = f"{curr}{piece}" if curr else token
+    if curr:
+        lines.append(curr)
+    return "\n".join(lines)
+
+
+def _compute_flow_positions(
+    events: List[str],
+    layout_radius: float,
+) -> Dict[str, Tuple[float, float]]:
+    """Balanced grid layout for paper-friendly diagrams.
+
+    Events are placed row by row on a near-square grid centered on the
+    origin; the first row gets the largest y value.
+
+    Args:
+        events: Event names in the order they should fill the grid.
+        layout_radius: Scales the grid spacing; the spacing never drops below
+            3.6 horizontally and 3.2 vertically.
+
+    Returns:
+        Mapping of event name -> (x, y). Empty when ``events`` is empty.
+    """
+    if not events:
+        return {}
+
+    num_events = len(events)
+    # Near-square grid: ceil(sqrt(n)) columns, then as many rows as needed.
+    cols = int(np.ceil(np.sqrt(num_events)))
+    rows = int(np.ceil(num_events / cols))
+    x_step = max(layout_radius * 1.10, 3.6)
+    y_step = max(layout_radius * 0.95, 3.2)
+
+    positions: Dict[str, Tuple[float, float]] = {}
+    for idx, evt in enumerate(events):
+        row = idx // cols
+        col = idx % cols
+        # Offset by half the grid extent so the layout is centered on (0, 0).
+        x = (col - (cols - 1) / 2.0) * x_step
+        y = ((rows - 1) / 2.0 - row) * y_step
+        positions[evt] = (float(x), float(y))
+
+    return positions
+
+
 def visualize_mdp(
    model: BehaviorModel,
    threshold: float = 0.05,
@@ -202,25 +306,91 @@ def visualize_mdp(
    fmt: str = "svg",
    view: bool = False,
    export_dot: bool = False,
+    event_order: Optional[List[str]] = None,
+    layout_radius: float = 10.0,
+    node_diameter: float = 1.8,
+    label_threshold: float = 0.08,
+    drop_isolated_nodes: bool = False,
 ):
    if not model.mdp:
        raise ValueError("build MDP first")

    evt_trans = aggregate_event_transitions(model.mdp)
-    g = graphviz.Digraph(format=fmt)
-    g.attr(rankdir="LR", size="30")
-    g.attr("node", shape="circle", width="1", height="1")
+    ordered_events = _resolve_event_order(evt_trans, event_order=event_order)

-    events = set(evt_trans.keys()) | {
-        e for trans in evt_trans.values() for e in trans.keys()
+    edges = [
+        (src, dst, prob)
+        for src, dsts in evt_trans.items()
+        for dst, prob in dsts.items()
+        if prob > threshold
+    ]
+    if drop_isolated_nodes:
+        connected = {src for src, _, _ in edges} | {dst for _, dst, _ in edges}
+        ordered_events = [evt for evt in ordered_events if evt in connected]
+
+    positions = _compute_flow_positions(ordered_events, layout_radius=layout_radius)
+
+    g = graphviz.Digraph(format=fmt, engine="neato")
+    g.attr(
+        overlap="false",
+        splines="true",
+        outputorder="edgesfirst",
+        pad="0.5",
+        sep="+9",
+        esep="+4",
+        bgcolor="white",
+        dpi="180",
+    )
+    g.attr(
+        "node",
+        shape="circle",
+        fixedsize="true",
+        width=f"{node_diameter:.2f}",
+        height=f"{node_diameter:.2f}",
+        fontsize="11",
+        fontname="Helvetica",
+        style="filled",
+        fillcolor="white",
+        color="#374151",
+        fontcolor="#111827",
+        penwidth="1.8",
+        peripheries="1",
+    )
+    g.attr(
+        "edge",
+        fontname="Helvetica",
+    )
+
+    for evt in ordered_events:
+        x, y = positions[evt]
+        g.node(evt, label=_format_node_label(evt), pos=f"{x:.2f},{y:.2f}!", pin="true")
+
+    edge_set = {
+        (src, dst) for src, dst, _ in edges if src in positions and dst in positions
    }
-    for evt in events:
-        g.node(evt)

-    for src, dsts in evt_trans.items():
-        for dst, prob in dsts.items():
-            if prob > threshold:
-                g.edge(src, dst, label=f"{prob:.2f}")
+    for src, dst, prob in sorted(edges, key=lambda row: row[2]):
+        if src not in positions or dst not in positions:
+            continue
+
+        edge_attrs: Dict[str, str] = _edge_style(prob)
+
+        if src == dst:
+            # pick a loop port away from the main flow
+            sx, sy = positions[src]
+            loop_port = "n" if sy <= 0 else "s"
+            edge_attrs.update({"tailport": loop_port, "headport": loop_port})
+        else:
+            has_reverse = (dst, src) in edge_set
+            tail_port, head_port = _edge_ports(src, dst, positions, has_reverse)
+            edge_attrs.update({"tailport": tail_port, "headport": head_port})
+            if has_reverse:
+                edge_attrs["constraint"] = "false"
+
+        if prob >= label_threshold or src == dst:
+            edge_attrs["label"] = f" {prob:.2f} "
+
+        g.edge(src, dst, **edge_attrs)

    g.render(output, view=view, cleanup=True)
    print(f"Saved MDP graph to {output}.{fmt}")
@@ -342,11 +512,6 @@ if __name__ == "__main__":
        f"Built MDP: {human_mdp['num_states']} states, "
        f"{sum(len(t) for t in human_mdp['transitions'].values())} transitions"
    )
-    if not human_mdp["states"]:
-        exit("No states found")
-    visualize_mdp(
-        human_model, threshold=0.05, output="human_mdp_viz", fmt="pdf", export_dot=True
-    )

    agent_model = AgentBehaviorModel(agent_dir)
    agent_mdp = agent_model.build_MDP()
@@ -355,14 +520,36 @@ if __name__ == "__main__":
        f"AGENT... Built MDP: {agent_mdp['num_states']} states, "
        f"{sum(len(t) for t in agent_mdp['transitions'].values())} transitions"
    )
-    if not agent_mdp["states"]:
-        exit("No states found")
-    visualize_mdp(
-        agent_model, threshold=0.05, output="agent_mdp_viz", fmt="pdf", export_dot=True
-    )

    human_evt = aggregate_event_transitions(human_mdp)
    agent_evt = aggregate_event_transitions(agent_mdp)
+    canonical_events = sorted(
+        (set(human_evt.keys()) | {e for tr in human_evt.values() for e in tr.keys()})
+        | (set(agent_evt.keys()) | {e for tr in agent_evt.values() for e in tr.keys()})
+    )
+
+    if not human_mdp["states"]:
+        exit("No states found")
+    visualize_mdp(
+        human_model,
+        threshold=0.05,
+        output="human_mdp_viz",
+        fmt="pdf",
+        export_dot=True,
+        event_order=canonical_events,
+    )
+
+    if not agent_mdp["states"]:
+        exit("No states found")
+    visualize_mdp(
+        agent_model,
+        threshold=0.05,
+        output="agent_mdp_viz",
+        fmt="pdf",
+        export_dot=True,
+        event_order=canonical_events,
+        drop_isolated_nodes=True,
+    )

    common = set(human_evt.keys()) & set(agent_evt.keys())

@@ -394,6 +581,7 @@ if __name__ == "__main__":
            output="joint_mdp_viz",
            fmt="pdf",
            export_dot=True,
+            event_order=canonical_events,
        )

    inter_class_avg = float(np.mean([kl for _, kl in kl_divs]))
--- a/sim/rl/jax_core/separability.py
+++ b/sim/rl/jax_core/separability.py
@@ -1,14 +1,24 @@
 """Vectorized KL divergence for separability scoring."""
+
 import numpy as np
 from typing import Tuple

+from lib.agent_probability import (
+    DEFAULT_AGENT_PRIOR,
+    estimate_agent_probability_batch,
+)
+
 try:
    import jax.numpy as jnp
    from jax import jit
+
    JAX_AVAILABLE = True
 except ImportError:
    jnp, JAX_AVAILABLE = np, False
-    def jit(f): return f
+
+    def jit(f):
+        return f
+

@jit
 def batch_kl(P, Q_human, Q_agent, eps=1e-10):
@@ -20,10 +30,15 @@ def batch_kl(P, Q_human, Q_agent, eps=1e-10):
    delta_a = jnp.sum(p * jnp.log(p / qa), axis=(1, 2))
    return delta_h, delta_a

-def compute_divergences(session_trans: np.ndarray, ref_human: np.ndarray, ref_agent: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+
+def compute_divergences(
+    session_trans: np.ndarray, ref_human: np.ndarray, ref_agent: np.ndarray
+) -> Tuple[np.ndarray, np.ndarray]:
    """Compute KL divergence of each session from human/agent prototypes."""
    if JAX_AVAILABLE:
-        dh, da = batch_kl(jnp.array(session_trans), jnp.array(ref_human), jnp.array(ref_agent))
+        dh, da = batch_kl(
+            jnp.array(session_trans), jnp.array(ref_human), jnp.array(ref_agent)
+        )
        return np.asarray(dh), np.asarray(da)
    # numpy fallback
    eps = 1e-10
@@ -34,10 +49,19 @@ def compute_divergences(session_trans: np.ndarray, ref_human: np.ndarray, ref_ag
    delta_a = np.sum(p * np.log(p / qa), axis=(1, 2))
    return delta_h, delta_a

-def estimate_alpha_batch(prob_agent: np.ndarray, delta_h: np.ndarray, delta_a: np.ndarray, temp: float = 1.0) -> np.ndarray:
-    """Vectorized alpha estimation from classifier probs and divergences."""
-    mass = delta_h + delta_a
-    ratio = np.where(mass > 1e-8, delta_a / mass, 0.5)
-    blended = 0.5 * prob_agent + 0.5 * ratio
-    if temp <= 0: return np.clip(blended, 0.0, 1.0)
-    return np.clip(1.0 / (1.0 + np.exp(-temp * (blended - 0.5))), 0.0, 1.0)
+
+def estimate_alpha_batch(
+    prob_agent: np.ndarray,
+    delta_h: np.ndarray,
+    delta_a: np.ndarray,
+    temp: float = 1.0,
+    prior_agent: float = DEFAULT_AGENT_PRIOR,
+) -> np.ndarray:
+    """Vectorized alpha estimation using divergence gap mapping.
+
+    Args:
+        prob_agent: Accepted but ignored; the body discards it.
+            NOTE(review): presumably kept so existing callers keep working --
+            confirm.
+        delta_h: Per-session divergences from the human prototype.
+        delta_a: Per-session divergences from the agent prototype.
+        temp: Forwarded as ``temperature`` to the shared estimator.
+        prior_agent: Forwarded unchanged to the shared estimator.
+
+    Returns:
+        Whatever ``estimate_agent_probability_batch`` returns for the inputs
+        converted to float arrays.
+    """
+    # Explicitly mark the classifier probabilities as unused.
+    _ = prob_agent
+    return estimate_agent_probability_batch(
+        delta_h=np.asarray(delta_h, dtype=float),
+        delta_a=np.asarray(delta_a, dtype=float),
+        temperature=temp,
+        prior_agent=prior_agent,
+    )
				`@@ -1 +0,0 @@`
				`\includegraphics[width=0.99\linewidth]{chapters/figures/results/generated/legacy/plots/first_sweep_tier_revenue.pdf}`
				`@@ -1 +0,0 @@`
				`\includegraphics[width=0.88\linewidth]{chapters/figures/results/generated/legacy/plots/ppo_tradeoff_scatter.pdf}`