Merge pull request #55 from velocitatem/optimizing-runs

Enhance TPU orchestration and parallelization with benchmarks
2026-07-16 01:53:37 +00:00 · 2026-03-23 15:15:35 +01:00
parent ae6cffe825 ae2860a0ee
commit 128911decc
123 changed files with 7644 additions and 2152 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 .env.*
 !.env.*.example
 **/.venv
 **/.venv-ray
 # python build/cache artifacts
 **/__pycache__
--- a/.rayignore
+++ b/.rayignore
@@ -0,0 +1,35 @@
 # Virtual environments
 .venv
 .venv*
 venv
 venv*
 **/.venv
 **/venv
 **/node_modules
 node_modules/
 # Python caches
 __pycache__/
 *.pyc
 .ruff_cache/
 .pytest_cache/
 # Git
 .git/
 # Large data and logs
 data/
 experiments/
 wandb/
 dumplogs*
 *.zip
 *.pdf
 *.log
 *.dot
 # Other large dirs
 PHANTOM_web/
 web/
 docs/
 paper/
 .nx/
--- a/58
+++ b/58
@@ -11,6 +11,7 @@ PYTEST    := $(VENV)/bin/pytest
 NX        := npx nx
 SWEEP_ENV_FILE ?= .env.sweep
 TPU_CONF ?= tpu_orchestration/configs/v4_spot_us.conf
 WANDB_ENTITY ?=
 WANDB_PROJECT ?= capstone
@@ -21,6 +22,14 @@ SIMPLE_BENCHMARK_ARGS ?= --tiers qtable,ppo,dqn,a2c --alpha-values 0.0,0.15,0.3,
 BENCHMARK_AGENT_ARGS ?=
 AGENT_COUNT ?= 0
 WHOCLICKED_REPO ?= velocitatem/whoclickedit
 WHOCLICKED_CSV ?= experiments/exports/whoclicked.csv
 WHOCLICKED_CARD ?= experiments/exports/whoclicked_dataset_card.md
 WHOCLICKED_CSV_PATH_IN_REPO ?= whoclicked.csv
 WHOCLICKED_CARD_PATH_IN_REPO ?= README.md
 WHOCLICKED_DATASET_MESSAGE ?= Update flattened whoclickedit dataset
 WHOCLICKED_CARD_MESSAGE ?= Update dataset card for whoclickedit
 REPO_URL ?=
 BRANCH ?= main
 WORKDIR ?= $(HOME)/PHANTOM-agent
@@ -35,8 +44,10 @@ SWEEP_ENV_LOAD = set -a; [ -f "$(SWEEP_ENV_FILE)" ] && . "$(SWEEP_ENV_FILE)" ||
 .PHONY: help
 help:
-	@echo "pdf.build pdf.watch pdf.clean pdf.genpop pdf.genpop.watch pdf.arxiv | test.backend test.e2e test.all | web.dev | install | train | benchmark | benchmark.simple | benchmark.agent | train.agent | train.bootstrap | stats.lines"
+	@echo "pdf.build pdf.watch pdf.clean pdf.genpop pdf.genpop.watch pdf.arxiv | test.backend test.e2e test.all | web.dev | install | train | benchmark | benchmark.simple | benchmark.agent | train.agent | train.bootstrap | stats.lines | manim.render manim.render.all"
 	@echo "backend.server backend.provider backend.worker | platform.up platform.down platform.logs | docker.train.publish"
 	@echo "data.pull data.push data.whoclicked.publish | study.margin-erosion study.margin-erosion.quick study.margin-erosion.plot"
 	@echo "tpu.ray.bootstrap tpu.ray.deps tpu.ray.verify tpu.ray.teardown"
 	@echo ""
 	@echo "Build general public version:"
 	@echo "  make pdf.genpop"
@@ -56,6 +67,12 @@ help:
 	@echo "Bootstrap private repo worker from anywhere:"
 	@echo "  make train.bootstrap REPO_URL=https://github.com/org/repo.git BRANCH=main SWEEP_ID=entity/project/id"
 	@echo ""
 	@echo "Bootstrap Ray on TPU slice from config:"
 	@echo "  make tpu.ray.bootstrap TPU_CONF=tpu_orchestration/configs/v4_spot_us.conf"
 	@echo ""
 	@echo "Publish whoclickedit dataset + card:"
 	@echo "  make data.whoclicked.publish HF_TOKEN=... WHOCLICKED_REPO=velocitatem/whoclickedit"
 	@echo ""
 	@echo "Config source: $(SWEEP_ENV_FILE) (auto-loaded)"
 $(BUILDDIR):
@@ -133,10 +150,42 @@ train.agent:
 train.bootstrap:
 	@WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" REPO_URL="$(REPO_URL)" BRANCH="$(BRANCH)" WORKDIR="$(WORKDIR)" SWEEP_ID="$(SWEEP_ID)" AGENT_COUNT="$(AGENT_COUNT)" AGENT_LOOP="$(AGENT_LOOP)" RETRY_SECONDS="$(RETRY_SECONDS)" $(NX) run research:train-bootstrap
 # Ray-on-TPU lifecycle targets. Each tpu.ray.<step> forwards TPU_CONF and
 # SWEEP_ENV_FILE through the environment to the Nx target research:tpu-ray-<step>.
 tpu_ray_targets := tpu.ray.bootstrap tpu.ray.deps tpu.ray.verify tpu.ray.teardown
 .PHONY: $(tpu_ray_targets)
 # Static pattern rule: $* is the part after "tpu.ray." (bootstrap, deps, verify, teardown).
 $(tpu_ray_targets): tpu.ray.%:
 	@TPU_CONF="$(TPU_CONF)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" $(NX) run research:tpu-ray-$*
 # Dataset sync: data.<action> runs scripts/hf_data.py with <action> (pull or push).
 .PHONY: data.pull data.push
 data.pull data.push: data.%:
 	python scripts/hf_data.py $*
 # Publish target for the whoclickedit dataset: HF_TOKEN and every WHOCLICKED_* knob
 # are handed to the Nx target research:whoclicked-publish via the environment.
 .PHONY: data.whoclicked.publish
 data.whoclicked.publish:
 	@HF_TOKEN="$(HF_TOKEN)" \
 	WHOCLICKED_REPO="$(WHOCLICKED_REPO)" \
 	WHOCLICKED_CSV="$(WHOCLICKED_CSV)" \
 	WHOCLICKED_CARD="$(WHOCLICKED_CARD)" \
 	WHOCLICKED_CSV_PATH_IN_REPO="$(WHOCLICKED_CSV_PATH_IN_REPO)" \
 	WHOCLICKED_CARD_PATH_IN_REPO="$(WHOCLICKED_CARD_PATH_IN_REPO)" \
 	WHOCLICKED_DATASET_MESSAGE="$(WHOCLICKED_DATASET_MESSAGE)" \
 	WHOCLICKED_CARD_MESSAGE="$(WHOCLICKED_CARD_MESSAGE)" \
 	$(NX) run research:whoclicked-publish
 .PHONY: stats.lines study.margin-erosion study.margin-erosion.quick wordcount
 # Runs the Nx target research:stats.
 stats.lines:
 	@$(NX) run research:stats
 # Runs the engine.studies.margin_erosion_alpha module; the .quick target only
 # appends --quick (selected by matching the target name against %.quick).
 study.margin-erosion study.margin-erosion.quick:
 	python -m engine.studies.margin_erosion_alpha$(if $(filter %.quick,$@), --quick)
 # Runs the Nx target paper:wordcount.
 wordcount:
 	@$(NX) run paper:wordcount
@@ -185,3 +234,10 @@ count-lines:
 all:
 	@$(NX) run paper:build
 # Manim targets, both delegated to Nx:
 #   manim.render     -> manim:render
 #   manim.render.all -> manim:render-all
 .PHONY: manim.render
 manim.render:
 	@$(NX) run manim:render
 .PHONY: manim.render.all
 manim.render.all:
 	@$(NX) run manim:render-all
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@
 ### PHANTOM
 [![Dataset on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/dataset-on-hf-sm.svg)](https://huggingface.co/datasets/velocitatem/whoclickedit)
 [![Build PDF](https://github.com/velocitatem/PHANTOM/actions/workflows/latex.yml/badge.svg)](https://github.com/velocitatem/PHANTOM/actions/workflows/latex.yml)
 [![Paper](https://img.shields.io/badge/Paper-PDF-red?logo=adobe-acrobat-reader)](https://pub-d5b94a3c29fd40c6b3881946e463fdb7.r2.dev/thesis-latest.pdf)
 [![TPU Research Cloud](https://img.shields.io/badge/TPU%20Research%20Cloud-TRC%20supported-4285F4?logo=googlecloud&logoColor=white)](https://sites.research.google/trc/faq/)
--- a/TPUS/README.md
+++ b/TPUS/README.md
@@ -1,6 +0,0 @@
 64 spot Cloud TPU v6e chips in zone europe-west4-a
 32 spot Cloud TPU v4 chips in zone us-central2-b
 64 spot Cloud TPU v5e chips in zone us-central1-a
 64 spot Cloud TPU v6e chips in zone us-east1-d
 32 on-demand Cloud TPU v4 chips in zone us-central2-b
 64 spot Cloud TPU v5e chips in zone europe-west4-b
--- a/TPUS/v4_32_spot_uscentral2b.sh
+++ b/TPUS/v4_32_spot_uscentral2b.sh
@@ -1,22 +0,0 @@
 # Provision 32 spot Cloud TPU v4 chips in zone us-central2-b.
 # A direct TPU VM create is attempted first; only when it fails is a
 # queued-resource request submitted for the same node id.
 export PROJECT_ID=phantom-trc
 export ZONE=us-central2-b
 export ACCELERATOR_TYPE=v4-32
 export RUNTIME_VERSION=v2-alpha-tpuv4
 export TPU_NAME=tpu-v4-32-uc2b-spot
 export QR_NAME=TPUv4s32spotUC2B
 if ! gcloud compute tpus tpu-vm create "${TPU_NAME}" \
       --project="${PROJECT_ID}" --zone="${ZONE}" \
       --accelerator-type="${ACCELERATOR_TYPE}" \
       --version="${RUNTIME_VERSION}" --spot
 then
       gcloud compute tpus queued-resources create "${QR_NAME}" \
             --project="${PROJECT_ID}" --zone="${ZONE}" \
             --node-id="${TPU_NAME}" \
             --accelerator-type="${ACCELERATOR_TYPE}" \
             --runtime-version="${RUNTIME_VERSION}" --spot
 fi
--- a/TPUS/v4_uscentral2b.sh
+++ b/TPUS/v4_uscentral2b.sh
@@ -1,13 +0,0 @@
 # Request 32 on-demand Cloud TPU v4 chips in zone us-central2-b as a queued resource.
 export PROJECT_ID=phantom-trc
 export QR_NAME=TPUlong
 # TPU_NAME is consumed by --node-id below but was never assigned in this script,
 # so the request went out with an empty node id unless the caller had exported it.
 # A caller-provided TPU_NAME still wins; otherwise fall back to a default that
 # follows the naming used by the sibling spot script (tpu-v4-32-uc2b-spot).
 export TPU_NAME="${TPU_NAME:-tpu-v4-32-uc2b}"
 export ZONE=us-central2-b
 export ACCELERATOR_TYPE=v4-32
 export RUNTIME_VERSION=v2-alpha-tpuv4
 # Direct (non-queued) create, kept disabled as in the original script:
 #gcloud compute tpus tpu-vm create ${TPU_NAME}     --zone=${ZONE}     --project=${PROJECT_ID}     --accelerator-type=${ACCELERATOR_TYPE}     --version=${RUNTIME_VERSION}
 gcloud compute tpus queued-resources create "${QR_NAME}" \
       --project="${PROJECT_ID}" \
       --zone="${ZONE}" \
       --node-id="${TPU_NAME}" \
       --accelerator-type="${ACCELERATOR_TYPE}" \
       --runtime-version="${RUNTIME_VERSION}"
--- a/TPUS/v5e_64_spot_europewest4b.sh
+++ b/TPUS/v5e_64_spot_europewest4b.sh
@@ -1,22 +0,0 @@
 # Provision 64 spot Cloud TPU v5e chips in zone europe-west4-b.
 # A direct TPU VM create is attempted first; only when it fails is a
 # queued-resource request submitted for the same node id.
 export PROJECT_ID=phantom-trc
 export ZONE=europe-west4-b
 export ACCELERATOR_TYPE=v5e-64
 export RUNTIME_VERSION=v2-alpha-tpuv5-lite
 export TPU_NAME=tpu-v5e-64-ew4b
 export QR_NAME=TPUv5e64spotEW4B
 if ! gcloud compute tpus tpu-vm create "${TPU_NAME}" \
       --project="${PROJECT_ID}" --zone="${ZONE}" \
       --accelerator-type="${ACCELERATOR_TYPE}" \
       --version="${RUNTIME_VERSION}" --spot
 then
       gcloud compute tpus queued-resources create "${QR_NAME}" \
             --project="${PROJECT_ID}" --zone="${ZONE}" \
             --node-id="${TPU_NAME}" \
             --accelerator-type="${ACCELERATOR_TYPE}" \
             --runtime-version="${RUNTIME_VERSION}" --spot
 fi
--- a/TPUS/v5e_64_spot_uscentral1a.sh
+++ b/TPUS/v5e_64_spot_uscentral1a.sh
@@ -1,22 +0,0 @@
 # Provision 64 spot Cloud TPU v5e chips in zone us-central1-a.
 # A direct TPU VM create is attempted first; only when it fails is a
 # queued-resource request submitted for the same node id.
 export PROJECT_ID=phantom-trc
 export ZONE=us-central1-a
 export ACCELERATOR_TYPE=v5e-64
 export RUNTIME_VERSION=v2-alpha-tpuv5-lite
 export TPU_NAME=tpu-v5e-64-uc1a
 export QR_NAME=TPUv5e64spotUC1A
 if ! gcloud compute tpus tpu-vm create "${TPU_NAME}" \
       --project="${PROJECT_ID}" --zone="${ZONE}" \
       --accelerator-type="${ACCELERATOR_TYPE}" \
       --version="${RUNTIME_VERSION}" --spot
 then
       gcloud compute tpus queued-resources create "${QR_NAME}" \
             --project="${PROJECT_ID}" --zone="${ZONE}" \
             --node-id="${TPU_NAME}" \
             --accelerator-type="${ACCELERATOR_TYPE}" \
             --runtime-version="${RUNTIME_VERSION}" --spot
 fi
--- a/TPUS/v6e_64_spot_europewest4a.sh
+++ b/TPUS/v6e_64_spot_europewest4a.sh
@@ -1,22 +0,0 @@
 # Provision 64 spot Cloud TPU v6e chips in zone europe-west4-a.
 # A direct TPU VM create is attempted first; only when it fails is a
 # queued-resource request submitted for the same node id.
 export PROJECT_ID=phantom-trc
 export ZONE=europe-west4-a
 export ACCELERATOR_TYPE=v6e-64
 export RUNTIME_VERSION=v2-alpha-tpuv6e
 export TPU_NAME=tpu-v6e-64-ew4a
 export QR_NAME=TPUv6e64spotEW4A
 if ! gcloud compute tpus tpu-vm create "${TPU_NAME}" \
       --project="${PROJECT_ID}" --zone="${ZONE}" \
       --accelerator-type="${ACCELERATOR_TYPE}" \
       --version="${RUNTIME_VERSION}" --spot
 then
       gcloud compute tpus queued-resources create "${QR_NAME}" \
             --project="${PROJECT_ID}" --zone="${ZONE}" \
             --node-id="${TPU_NAME}" \
             --accelerator-type="${ACCELERATOR_TYPE}" \
             --runtime-version="${RUNTIME_VERSION}" --spot
 fi
--- a/TPUS/v6e_64_spot_useast1d.sh
+++ b/TPUS/v6e_64_spot_useast1d.sh
@@ -1,22 +0,0 @@
 # Provision 64 spot Cloud TPU v6e chips in zone us-east1-d.
 # A direct TPU VM create is attempted first; only when it fails is a
 # queued-resource request submitted for the same node id.
 export PROJECT_ID=phantom-trc
 export ZONE=us-east1-d
 export ACCELERATOR_TYPE=v6e-64
 export RUNTIME_VERSION=v2-alpha-tpuv6e
 export TPU_NAME=tpu-v6e-64-ue1d
 export QR_NAME=TPUv6e64spotUE1D
 if ! gcloud compute tpus tpu-vm create "${TPU_NAME}" \
       --project="${PROJECT_ID}" --zone="${ZONE}" \
       --accelerator-type="${ACCELERATOR_TYPE}" \
       --version="${RUNTIME_VERSION}" --spot
 then
       gcloud compute tpus queued-resources create "${QR_NAME}" \
             --project="${PROJECT_ID}" --zone="${ZONE}" \
             --node-id="${TPU_NAME}" \
             --accelerator-type="${ACCELERATOR_TYPE}" \
             --runtime-version="${RUNTIME_VERSION}" --spot
 fi
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,4 +1,23 @@
 services:
  # TPU watchdog container, built from docker/TPUWatchdog.dockerfile.
  tpu-watchdogs:
    container_name: "PHANTOM-tpu-watchdogs"
    build:
      dockerfile: docker/TPUWatchdog.dockerfile
      context: .
    user: "${UID:-1000}:${GID:-1000}"
    restart: unless-stopped
    # Mapping form of the environment; every value is interpolated from the host
    # environment except the two fixed in-container paths.
    environment:
      HF_TOKEN: "${HF_TOKEN}"
      WANDB_API_KEY: "${WANDB_API_KEY}"
      GITHUB_TOKEN: "${GITHUB_TOKEN}"
      GOOGLE_APPLICATION_CREDENTIALS: "/secrets/gcp-sa.json"
      GCP_ACCOUNT: "${GCP_ACCOUNT:-}"
      WATCHDOG_CONFIG_PATTERN: "${WATCHDOG_CONFIG_PATTERN:-v[46]*.conf}"
      CLOUDSDK_CONFIG: "/.config/gcloud"
    volumes:
      # Host gcloud config, mounted at the path CLOUDSDK_CONFIG points to.
      - ~/.config/gcloud:/.config/gcloud:rw
      # Credential file, mounted read-only at the GOOGLE_APPLICATION_CREDENTIALS path.
      - ./secrets/gcp-sa.json:/secrets/gcp-sa.json:ro
  tensorboard-rl:
    image: tensorflow/tensorflow:latest
    container_name: "PHANTOM-tensorboard-rl"
--- a/docker/TPUWatchdog.dockerfile
+++ b/docker/TPUWatchdog.dockerfile
@@ -0,0 +1,112 @@
 FROM google/cloud-sdk:slim
 # jq parses the credential JSON in the entrypoint. tmux is installed too, but the
 # entrypoint defined later in this file starts the watchdogs as plain background
 # bash jobs rather than tmux sessions.
 # NOTE(review): drop tmux from the image if nothing else relies on it.
 # --no-install-recommends keeps optional recommended packages out of the image.
 RUN apt-get update && \
    apt-get install -y --no-install-recommends tmux jq && \
    rm -rf /var/lib/apt/lists/*
 WORKDIR /app
 # Copy the orchestration scripts and configs
 COPY tpu_orchestration/ /app/tpu_orchestration/
 # Mark both orchestration scripts executable in a single layer
 RUN chmod +x /app/tpu_orchestration/watchdog.sh \
              /app/tpu_orchestration/tpu_startup.sh
 # Create an entrypoint script that launches a watchdog for each config.
 # The heredoc delimiter is quoted ('EOF'), so the script text is copied verbatim
 # and nothing in it is expanded at image build time.
 # At container start the script:
 #   1. exits with status 1 when HF_TOKEN is empty; only warns when WANDB_API_KEY is empty;
 #   2. when GOOGLE_APPLICATION_CREDENTIALS names an existing file, reads its "type" field:
 #        service_account -> gcloud auth activate-service-account; PROJECT_ID, if empty,
 #                           is taken from the key's project_id field;
 #        authorized_user -> exports CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE; if the token
 #                           check fails, falls back to the mounted gcloud config and
 #                           exits 1 when no account is active there;
 #        anything else   -> prints a warning and relies on the mounted gcloud config;
 #   3. sets the gcloud project when PROJECT_ID is non-empty;
 #   4. expands WATCHDOG_CONFIG_PATTERN (default "*.conf") under
 #      /app/tpu_orchestration/configs, exits 1 when nothing matches, then starts one
 #      watchdog.sh per config in the background, each delayed 15s more than the
 #      previous one, and waits on all of them.
 COPY <<-'EOF' /app/entrypoint.sh
 #!/bin/bash
 set -e
 # Make sure required variables are set
 if [ -z "$HF_TOKEN" ]; then
    echo "Error: HF_TOKEN environment variable is required."
    exit 1
 fi
 if [ -z "$WANDB_API_KEY" ]; then
    echo "Warning: WANDB_API_KEY environment variable is not set. Wandb logging may fail on TPUs."
 fi
 # Authenticate gcloud if credentials are provided
 if [ -n "$GOOGLE_APPLICATION_CREDENTIALS" ] && [ -f "$GOOGLE_APPLICATION_CREDENTIALS" ]; then
    CRED_TYPE=$(jq -r '.type' "$GOOGLE_APPLICATION_CREDENTIALS" 2>/dev/null || echo "unknown")
    if [ "$CRED_TYPE" = "service_account" ]; then
        echo "Authenticating gcloud using service account key..."
        gcloud auth activate-service-account --key-file="$GOOGLE_APPLICATION_CREDENTIALS"
        if [ -z "$PROJECT_ID" ]; then
            PROJECT_ID=$(jq -r '.project_id // empty' "$GOOGLE_APPLICATION_CREDENTIALS")
        fi
    elif [ "$CRED_TYPE" = "authorized_user" ]; then
        echo "Using authorized_user credentials via credential file override..."
        export CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE="$GOOGLE_APPLICATION_CREDENTIALS"
        if gcloud auth print-access-token >/dev/null 2>&1; then
            ACTIVE_ACCOUNT=$(gcloud config get-value account 2>/dev/null || true)
            if [ -z "$ACTIVE_ACCOUNT" ] || [ "$ACTIVE_ACCOUNT" = "(unset)" ]; then
                ACTIVE_ACCOUNT=$(jq -r '.account // empty' "$GOOGLE_APPLICATION_CREDENTIALS")
            fi
            if [ -n "$ACTIVE_ACCOUNT" ] && [ "$ACTIVE_ACCOUNT" != "(unset)" ]; then
                echo "Using gcloud account: $ACTIVE_ACCOUNT"
            else
                echo "Using gcloud credential override from $GOOGLE_APPLICATION_CREDENTIALS"
            fi
        else
            echo "Warning: credential file override token check failed. Falling back to mounted gcloud config."
            unset CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE
            if [ -n "$GCP_ACCOUNT" ]; then
                gcloud config set account "$GCP_ACCOUNT" >/dev/null 2>&1 || true
            fi
            ACTIVE_ACCOUNT=$(gcloud config get-value account 2>/dev/null || true)
            if [ -z "$ACTIVE_ACCOUNT" ] || [ "$ACTIVE_ACCOUNT" = "(unset)" ]; then
                echo "Error: no active gcloud account available. Run 'gcloud auth login' on host and mount ~/.config/gcloud, or use a service account key."
                exit 1
            fi
            echo "Using gcloud account: $ACTIVE_ACCOUNT"
        fi
    else
        echo "Warning: unsupported credential file type '$CRED_TYPE'. Falling back to mounted gcloud config."
    fi
 else
    echo "Note: Assuming gcloud config is mounted from host."
 fi
 if [ -n "$PROJECT_ID" ]; then
    gcloud config set project "$PROJECT_ID"
    echo "Set project to $PROJECT_ID"
 fi
 # Run the watchdogs in the background using bash instead of tmux
 # Tmux needs a TTY to attach properly which we might not have in docker
 # Stagger startups by 15s to prevent simultaneous TPU creation quota hits
 CONFIG_PATTERN=${WATCHDOG_CONFIG_PATTERN:-"*.conf"}
 shopt -s nullglob
 CONFIGS=(/app/tpu_orchestration/configs/$CONFIG_PATTERN)
 if [ ${#CONFIGS[@]} -eq 0 ]; then
    echo "Error: no watchdog configs matched pattern '$CONFIG_PATTERN'."
    exit 1
 fi
 echo "Using watchdog config pattern: $CONFIG_PATTERN"
 DELAY=0
 for conf in "${CONFIGS[@]}"; do
    echo "Starting watchdog for $(basename "$conf" .conf) (delay: ${DELAY}s)"
    (sleep $DELAY && /app/tpu_orchestration/watchdog.sh "$conf") &
    DELAY=$((DELAY + 15))
 done
 echo "All watchdogs queued with staggered startup."
 # Keep the container running
 wait
 EOF
 RUN chmod +x /app/entrypoint.sh
 CMD ["/app/entrypoint.sh"]
--- a/docs/index.html
+++ b/docs/index.html
@@ -272,12 +272,12 @@
                    </span>
                    <span class="link-block">
-                      <a href="goals/goals.csv" target="_blank"
+                      <a href="https://huggingface.co/datasets/velocitatem/whoclickedit" target="_blank"
                      class="external-link button is-normal is-rounded is-dark">
                      <span class="icon">
-                        <i class="fas fa-list"></i>
+                        <i class="fas fa-database"></i>
                      </span>
-                      <span>Goal Set</span>
+                      <span>Dataset</span>
                    </a>
                  </span>
--- a/engine/init.py
+++ b/engine/init.py
--- a/engine/backends/common.py
+++ b/engine/backends/common.py
@@ -15,6 +15,10 @@ def make_env(cfg: Mapping[str, Any]):
        n_products=int(cfg["n_products"]),
        alpha=float(cfg["alpha"]),
        N=int(cfg["N"]),
        agent_params=(
            float(cfg.get("agent_mu", 45.0)),
            float(cfg.get("agent_std", 15.0)),
        ),
        price_bounds=(float(cfg["price_low"]), float(cfg["price_high"])),
        lambda_coi=float(cfg["lambda_coi"]),
        robust_radius=float(cfg["robust_radius"]),
@@ -50,6 +54,9 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]:
    coi_levels: list[float] = []
    coi_leakages: list[float] = []
    volatilities: list[float] = []
    upward_volatilities: list[float] = []
    supra_shares: list[float] = []
    supra_penalties: list[float] = []
    agent_probs: list[float] = []
    for _ in range(int(episodes)):
@@ -61,6 +68,9 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]:
        ep_coi = 0.0
        ep_coi_leakage = 0.0
        ep_volatility = 0.0
        ep_upward_volatility = 0.0
        ep_supra_share = 0.0
        ep_supra_penalty = 0.0
        ep_agent_prob = 0.0
        steps = 0
@@ -74,6 +84,15 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]:
            ep_coi += float(econ.get("coi_level", 0.0))
            ep_coi_leakage += float(econ.get("coi_leakage", 0.0))
            ep_volatility += float(econ.get("volatility", 0.0))
            ep_upward_volatility += float(
                info.get("upward_volatility", econ.get("upward_volatility", 0.0))
            )
            ep_supra_share += float(
                info.get("supra_share", econ.get("supra_share", 0.0))
            )
            ep_supra_penalty += float(
                info.get("supra_penalty", econ.get("supra_penalty", 0.0))
            )
            ep_agent_prob += float(econ.get("agent_prob", info.get("agent_prob", 0.0)))
            steps += 1
@@ -84,6 +103,9 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]:
        coi_levels.append(ep_coi / denom)
        coi_leakages.append(ep_coi_leakage / denom)
        volatilities.append(ep_volatility / denom)
        upward_volatilities.append(ep_upward_volatility / denom)
        supra_shares.append(ep_supra_share / denom)
        supra_penalties.append(ep_supra_penalty / denom)
        agent_probs.append(ep_agent_prob / denom)
    return {
@@ -95,6 +117,13 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]:
        "eval/coi_level_mean": float(np.mean(coi_levels)) if coi_levels else 0.0,
        "eval/coi_leakage_mean": float(np.mean(coi_leakages)) if coi_leakages else 0.0,
        "eval/volatility_mean": float(np.mean(volatilities)) if volatilities else 0.0,
        "eval/upward_volatility_mean": (
            float(np.mean(upward_volatilities)) if upward_volatilities else 0.0
        ),
        "eval/supra_share_mean": float(np.mean(supra_shares)) if supra_shares else 0.0,
        "eval/supra_penalty_mean": (
            float(np.mean(supra_penalties)) if supra_penalties else 0.0
        ),
        "eval/agent_prob_mean": float(np.mean(agent_probs)) if agent_probs else 0.0,
    }
@@ -128,15 +157,15 @@ def evaluate(
        shifted_env.close()
        shifted_rows.append((tag, alpha, shifted_metrics))
-    metrics["eval/robust_alpha_low"] = low_alpha
+    metrics["eval/stress_alpha_low"] = low_alpha
-    metrics["eval/robust_alpha_high"] = high_alpha
+    metrics["eval/stress_alpha_high"] = high_alpha
-    metrics["eval/robust_reward_worst"] = float(
+    metrics["eval/stress_reward_worst"] = float(
        min(row[2]["eval/reward_mean"] for row in shifted_rows)
    )
-    metrics["eval/robust_revenue_worst"] = float(
+    metrics["eval/stress_revenue_worst"] = float(
        min(row[2]["eval/revenue_mean"] for row in shifted_rows)
    )
-    metrics["eval/robust_coi_leakage_worst"] = float(
+    metrics["eval/stress_coi_leakage_worst"] = float(
        max(row[2]["eval/coi_leakage_mean"] for row in shifted_rows)
    )
    for tag, alpha, shifted_metrics in shifted_rows:
--- a/engine/backends/qtable.py
+++ b/engine/backends/qtable.py
@@ -80,7 +80,11 @@ def train_qtable(
                "train/global_step": int(steps),
            }
            if wandb_live:
                try:
                    wandb.log(dict(event), step=step_offset + int(steps))
                except Exception:
                    wandb_live = False
                    train_events.append(event)
            else:
                train_events.append(event)
            if console_progress:
@@ -113,7 +117,11 @@ def train_qtable(
            "train/global_step": int(steps),
        }
        if wandb_live:
            try:
                wandb.log(dict(tail_event), step=step_offset + int(steps))
            except Exception:
                wandb_live = False
                train_events.append(tail_event)
        else:
            train_events.append(tail_event)
--- a/engine/backends/sb3.py
+++ b/engine/backends/sb3.py
@@ -1,10 +1,12 @@
 from __future__ import annotations
 import json
 import os
 from pathlib import Path
 from typing import Any, Mapping
-from ..lib.callbacks import MetricsCallback
+from ..lib.callbacks import EvalMetricsCallback, MetricsCallback
 from ..wandb_checkpoint import checkpoint_artifact_name, log_checkpoint_file
 from .common import evaluate, make_env
@@ -117,7 +119,6 @@ def build_model(cfg: Mapping[str, Any], env: Any):
 def train_sb3(cfg: Mapping[str, Any]) -> tuple[object, dict[str, Any]]:
    try:
        from stable_baselines3.common.callbacks import EvalCallback
        from stable_baselines3.common.monitor import Monitor
    except ImportError as exc:
        raise ImportError("stable-baselines3 is required for SB3 models") from exc
@@ -144,20 +145,20 @@ def train_sb3(cfg: Mapping[str, Any]) -> tuple[object, dict[str, Any]]:
        pass
    metrics_callback = MetricsCallback(
-        log_histograms=False,
+        log_histograms=True,
        log_freq=int(cfg["log_freq"]),
        hist_freq=int(cfg.get("hist_freq", 500)),
        step_offset=int(cfg.get("wandb_step_offset", 0)),
    )
-    callbacks = [metrics_callback]
+    eval_callback = EvalMetricsCallback(
    callbacks.append(
        EvalCallback(
        eval_env,
        eval_freq=int(cfg["eval_freq"]),
        n_eval_episodes=int(cfg["eval_episodes"]),
        step_offset=int(cfg.get("wandb_step_offset", 0)),
        deterministic=True,
        verbose=0,
    )
-    )
+    callbacks = [metrics_callback, eval_callback]
    target_steps = int(cfg["total_timesteps"])
    remaining_steps = max(0, target_steps - int(getattr(model, "num_timesteps", 0)))
@@ -173,6 +174,29 @@ def train_sb3(cfg: Mapping[str, Any]) -> tuple[object, dict[str, Any]]:
    model_path = model_dir / f"phantom_{cfg['algo']}"
    model.save(str(model_path))
    artifact_name = checkpoint_artifact_name(
        cfg,
        backend="sb3",
        sweep_id=os.getenv("WANDB_SWEEP_ID"),
    )
    artifact_logged = False
    try:
        artifact_logged = bool(
            log_checkpoint_file(
                artifact_name,
                file_path=model_path.with_suffix(".zip"),
                artifact_file_name="model.zip",
                metadata={
                    "algo": str(cfg.get("algo", "ppo")),
                    "backend": "sb3",
                    "seed": int(cfg.get("seed", 0)),
                    "step": int(getattr(model, "num_timesteps", 0)),
                },
            )
        )
    except Exception:
        artifact_logged = False
    metrics: dict[str, Any] = evaluate(
        model,
        eval_env,
@@ -181,7 +205,12 @@ def train_sb3(cfg: Mapping[str, Any]) -> tuple[object, dict[str, Any]]:
    )
    metrics["train/global_step"] = int(model.num_timesteps)
    metrics["model/path"] = str(model_path.with_suffix(".zip"))
-    metrics["_train_events"] = list(metrics_callback.events)
+    metrics["model/artifact_name"] = str(artifact_name)
    metrics["model/artifact_logged"] = float(artifact_logged)
    metrics["_train_events"] = sorted(
        [*metrics_callback.events, *eval_callback.events],
        key=lambda event: int(event.get("train/global_step", 0)),
    )
    env.close()
    eval_env.close()
--- a/engine/benchmark.py
+++ b/engine/benchmark.py
@@ -1,12 +1,32 @@
 from __future__ import annotations
 import os
 import subprocess
 import sys
 import argparse
 import json
 import logging
-import os
+from datetime import datetime, timezone
 from datetime import datetime, UTC
 from pathlib import Path
 # Clear stale TPU lock files on startup, only on hosts that expose /dev/accel0.
 # Best effort: any ordinary failure (e.g. `rm` cannot be spawned) is ignored, and a
 # non-zero exit from `rm` does not raise because check=False. Catching Exception
 # instead of using a bare `except:` lets KeyboardInterrupt and SystemExit
 # propagate instead of being silently swallowed at import time.
 if os.path.exists("/dev/accel0"):
    try:
        subprocess.run(
            ["rm", "-f", "/tmp/.libtpu_lockfile", "/tmp/libtpu_lockfile"],
            stderr=subprocess.DEVNULL,
            check=False,
        )
    except Exception:
        pass
 try:
    import jax
    jax.config.update("jax_threefry_partitionable", True)
 except ImportError:
    pass
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
@@ -25,6 +45,10 @@ def _log(message: str) -> None:
    logger.info(message)
 def _wandb_run_active() -> bool:
    """Return True when HAS_WANDB is truthy and ``wandb.run`` is not None."""
    if not HAS_WANDB:
        return False
    # getattr with a default tolerates a wandb object that has no `run` attribute.
    current_run = getattr(wandb, "run", None)
    return current_run is not None
 def _parse_list(raw: str) -> list[str]:
    return [x.strip().lower() for x in str(raw).split(",") if x.strip()]
@@ -41,6 +65,10 @@ def _truthy(value: str | bool | None) -> bool:
    return str(value).strip().lower() in {"1", "true", "yes", "on"}
 def _mode_label_from_baseline(is_baseline: bool) -> str:
    return "baseline" if bool(is_baseline) else "defended"
 def _action(policy, obs: np.ndarray):
    out = policy.predict(obs, deterministic=True)
    action = out[0] if isinstance(out, tuple) else out
@@ -146,7 +174,7 @@ def _log_train_events(
    alpha: float,
    step_offset: int,
 ) -> int:
-    if not (HAS_WANDB and wandb.run is not None):
+    if not _wandb_run_active():
        return int(step_offset)
    if not events:
        return int(step_offset)
@@ -167,11 +195,14 @@ def _log_train_events(
                "run.kind": "benchmark",
                "runtime/backend": tier_name,
                "study/mode": mode_label,
-                "study/no_robust": float(mode_label == "no_robust"),
+                "study/baseline_mode": float(mode_label == "baseline"),
                "study/alpha": float(alpha),
            }
        )
        try:
            wandb.log(payload, step=cursor + rel_step)
        except Exception:
            return int(step_offset)
    max_rel = max(max(1, int(evt.get("train/global_step", 0))) for evt in ordered)
    return cursor + max_rel + 1
@@ -183,6 +214,7 @@ def run_benchmark(
    n_episodes: int,
    mode_label: str,
    step_cursor_start: int = 0,
    eval_alpha_values: list[float] | None = None,
 ):
    from .backends.common import make_env
@@ -219,14 +251,22 @@ def run_benchmark(
                "dqn",
            }:
                wandb_step_cursor += max(1, int(cfg.get("total_timesteps", 1))) + 1
-            env = make_env({**cfg, "alpha": float(alpha)})
+            eval_targets = (
                [float(value) for value in eval_alpha_values]
                if eval_alpha_values
                else [float(alpha)]
            )
            for eval_alpha in eval_targets:
                env = make_env({**cfg, "alpha": float(eval_alpha)})
                eps = [_run_eval_episode(env, policy) for _ in range(int(n_episodes))]
                env.close()
                row = {
                    "tier": tier_name,
                    "mode": mode_label,
-                "alpha": float(alpha),
+                    "alpha": float(eval_alpha),
                    "train_alpha": float(alpha),
                    "eval_alpha": float(eval_alpha),
                    "episodes": int(n_episodes),
                    "mean_reward": float(np.mean([e["reward"] for e in eps])),
                    "mean_revenue": float(np.mean([e["revenue"] for e in eps])),
@@ -237,7 +277,8 @@ def run_benchmark(
                row["objective_score"] = row["mean_reward"]
                rows.append(row)
                _log(
-                f"[{run_index}/{total_runs}] alpha={float(alpha):.2f} tier={tier_name}: "
+                    f"[{run_index}/{total_runs}] train_alpha={float(alpha):.2f} "
                    f"eval_alpha={float(eval_alpha):.2f} tier={tier_name}: "
                    f"reward={row['mean_reward']:.3f} revenue={row['mean_revenue']:.3f} "
                    f"coi={row['mean_coi']:.4f} score={row['objective_score']:.3f}"
                )
@@ -246,25 +287,32 @@ def run_benchmark(
                step_means = []
                for step in range(max_len):
                    vals = [
-                    e["price_trace"][step] for e in eps if step < len(e["price_trace"])
+                        e["price_trace"][step]
                        for e in eps
                        if step < len(e["price_trace"])
                    ]
                    step_means.append(float(np.mean(vals)) if vals else np.nan)
                traces.append(
                    {
                        "tier": tier_name,
-                    "alpha": float(alpha),
+                        "alpha": float(eval_alpha),
                        "train_alpha": float(alpha),
                        "eval_alpha": float(eval_alpha),
                        "mean_price_trace": step_means,
                    }
                )
-            if HAS_WANDB and wandb.run is not None:
+                if _wandb_run_active():
                    try:
                        wandb.log(
                            {
                                "run.kind": "benchmark",
                                "runtime/backend": tier_name,
                                "study/mode": mode_label,
-                        "study/no_robust": float(mode_label == "no_robust"),
+                                "study/baseline_mode": float(mode_label == "baseline"),
-                        "study/alpha": float(alpha),
+                                "study/alpha": float(eval_alpha),
                                "study/train_alpha": float(alpha),
                                "study/eval_alpha": float(eval_alpha),
                                "eval/reward_mean": row["mean_reward"],
                                "eval/revenue_mean": row["mean_revenue"],
                                "eval/margin_mean": row["mean_margin"],
@@ -274,6 +322,8 @@ def run_benchmark(
                            },
                            step=wandb_step_cursor,
                        )
                    except Exception:
                        pass
                    wandb_step_cursor += 1
    return pd.DataFrame(rows), traces, int(wandb_step_cursor)
@@ -358,7 +408,7 @@ def _run_with_args(args, compare_robust_override: bool | None = None):
        if compare_robust_override is not None
        else _truthy(os.environ.get("PHANTOM_BENCHMARK_COMPARE_ROBUST"))
    )
-    robust_modes = [False, True] if compare_robust else [bool(args.no_robust)]
+    baseline_modes = [False, True] if compare_robust else [bool(args.no_robust)]
    base_overrides = {
        "seed": args.seed,
@@ -369,6 +419,7 @@ def _run_with_args(args, compare_robust_override: bool | None = None):
        "robust_radius": args.robust_radius,
        "robust_points": args.robust_points,
        "robust_rollouts": args.robust_rollouts,
        "margin_floor": args.margin_floor,
        "eta_ux": args.eta_ux,
        "reward_profit_weight": args.reward_profit_weight,
        "price_low": args.price_low,
@@ -385,12 +436,20 @@ def _run_with_args(args, compare_robust_override: bool | None = None):
    }
    tiers = _parse_list(args.tiers)
    alpha_values = _parse_float_list(args.alpha_values)
    eval_alpha_values = (
        _parse_float_list(args.eval_alpha_values)
        if str(getattr(args, "eval_alpha_values", "")).strip()
        else []
    )
    _log(
        "starting run "
        + json.dumps(
            {
                "tiers": tiers,
                "alpha_values": alpha_values,
                "eval_alpha_values": (
                    eval_alpha_values if eval_alpha_values else alpha_values
                ),
                "episodes": int(args.episodes),
                "total_timesteps": int(args.total_timesteps),
                "device": str(args.device),
@@ -401,14 +460,14 @@ def _run_with_args(args, compare_robust_override: bool | None = None):
    all_frames: list[pd.DataFrame] = []
    all_traces: list[dict] = []
    wandb_step_cursor = 0
-    for no_robust in robust_modes:
+    for baseline_mode in baseline_modes:
        overrides = dict(base_overrides)
-        overrides["no_robust"] = bool(no_robust)
+        overrides["baseline_mode"] = bool(baseline_mode)
        cfg = TrainSpec.from_flat(
            {k: v for k, v in overrides.items() if v is not None}
        ).to_flat_dict()
        cfg["linear_warmup_steps"] = int(args.linear_warmup_steps)
-        mode_label = "no_robust" if no_robust else "robust"
+        mode_label = _mode_label_from_baseline(bool(baseline_mode))
        _log(f"mode={mode_label}: begin")
        df_mode, traces_mode, wandb_step_cursor = run_benchmark(
            cfg,
@@ -417,6 +476,7 @@ def _run_with_args(args, compare_robust_override: bool | None = None):
            args.episodes,
            mode_label=mode_label,
            step_cursor_start=wandb_step_cursor,
            eval_alpha_values=eval_alpha_values,
        )
        _log(f"mode={mode_label}: complete ({len(df_mode)} rows)")
        for trace in traces_mode:
@@ -429,7 +489,7 @@ def _run_with_args(args, compare_robust_override: bool | None = None):
    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
-    stamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
+    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    csv_path = out_dir / f"benchmark_{stamp}.csv"
    trace_path = out_dir / f"benchmark_traces_{stamp}.json"
    df.to_csv(csv_path, index=False)
@@ -445,7 +505,7 @@ def _run_with_args(args, compare_robust_override: bool | None = None):
            + json.dumps(
                {
                    "tier": best["tier"],
-                    "mode": best.get("mode", "robust"),
+                    "mode": best.get("mode", "defended"),
                    "alpha": float(best["alpha"]),
                    "objective_score": float(best["objective_score"]),
                    "mean_revenue": float(best["mean_revenue"]),
@@ -466,6 +526,7 @@ def run_cli(raw_args: list[str] | None = None):
    parser.add_argument("--project", default="capstone")
    parser.add_argument("--tiers", default="static,surge,linear,qtable,ppo")
    parser.add_argument("--alpha-values", default="0.0,0.3,0.6")
    parser.add_argument("--eval-alpha-values", default="")
    parser.add_argument("--episodes", type=int, default=10)
    parser.add_argument("--output-dir", default="engine/studies/results")
    parser.add_argument("--seed", type=int, default=42)
@@ -476,6 +537,7 @@ def run_cli(raw_args: list[str] | None = None):
    parser.add_argument("--robust-radius", type=float, default=0.15)
    parser.add_argument("--robust-points", type=int, default=5)
    parser.add_argument("--robust-rollouts", type=int, default=1)
    parser.add_argument("--margin-floor", type=float, default=0.85)
    parser.add_argument("--eta-ux", type=float, default=0.5)
    parser.add_argument("--reward-profit-weight", type=float, default=1.0)
    parser.add_argument("--price-low", type=float, default=10.0)
@@ -509,35 +571,47 @@ def run_cli(raw_args: list[str] | None = None):
                key_to_attr = {
                    "tiers": "tiers",
                    "alpha_values": "alpha_values",
                    "eval_alpha_values": "eval_alpha_values",
                    "episodes": "episodes",
                    "total_timesteps": "total_timesteps",
                    "lambda_coi": "lambda_coi",
                    "robust_radius": "robust_radius",
                    "robust_points": "robust_points",
                    "robust_rollouts": "robust_rollouts",
                    "ambiguity_radius": "robust_radius",
                    "ambiguity_points": "robust_points",
                    "ambiguity_rollouts": "robust_rollouts",
                    "eta_ux": "eta_ux",
                    "reward_profit_weight": "reward_profit_weight",
                    "learning_rate": "learning_rate",
                    "batch_size": "batch_size",
                    "n_steps": "n_steps",
                    "baseline_mode": "no_robust",
                    "no_robust": "no_robust",
                    "margin_floor": "margin_floor",
                    "device": "device",
                }
                for key in (
                    "tiers",
                    "alpha_values",
                    "eval_alpha_values",
                    "episodes",
                    "total_timesteps",
                    "lambda_coi",
                    "robust_radius",
                    "robust_points",
                    "robust_rollouts",
                    "ambiguity_radius",
                    "ambiguity_points",
                    "ambiguity_rollouts",
                    "eta_ux",
                    "reward_profit_weight",
                    "learning_rate",
                    "batch_size",
                    "n_steps",
                    "baseline_mode",
                    "no_robust",
                    "margin_floor",
                    "device",
                ):
                    if key in wandb.config:
@@ -560,18 +634,18 @@ def run_cli(raw_args: list[str] | None = None):
    tiers = _parse_list(args.tiers)
    alpha_values = _parse_float_list(args.alpha_values)
-    run_stamp = datetime.now(UTC).strftime("%m%d-%H%M%S")
+    run_stamp = datetime.now(timezone.utc).strftime("%m%d-%H%M%S")
    compare_enabled = _truthy(os.environ.get("PHANTOM_BENCHMARK_COMPARE_ROBUST"))
-    compare_tag = "robust-compare" if compare_enabled else "single-mode"
+    compare_tag = "defended-compare" if compare_enabled else "single-mode"
    modes = (
-        [("no_robust", True), ("robust", False)]
+        [("baseline", True), ("defended", False)]
        if compare_enabled
-        else [("no_robust" if bool(args.no_robust) else "robust", bool(args.no_robust))]
+        else [(_mode_label_from_baseline(bool(args.no_robust)), bool(args.no_robust))]
    )
    run_idx = 0
    for tier in tiers:
-        for mode_label, no_robust in modes:
+        for mode_label, baseline_mode in modes:
            for alpha in alpha_values:
                run_idx += 1
                alpha_token = (
@@ -580,7 +654,7 @@ def run_cli(raw_args: list[str] | None = None):
                tier_args = argparse.Namespace(**vars(args))
                tier_args.tiers = tier
                tier_args.alpha_values = str(float(alpha))
-                tier_args.no_robust = bool(no_robust)
+                tier_args.no_robust = bool(baseline_mode)
                run = wandb.init(
                    project=args.project,
                    name=(
@@ -597,16 +671,19 @@ def run_cli(raw_args: list[str] | None = None):
                        "run.kind": "benchmark",
                        "runtime/backend": tier,
                        "study/mode": mode_label,
-                        "study/no_robust": float(no_robust),
+                        "study/baseline_mode": float(baseline_mode),
                        "study/alpha": float(alpha),
                        "tiers": tier,
                        "alpha_values": str(float(alpha)),
                        "eval_alpha_values": args.eval_alpha_values,
                        "episodes": args.episodes,
                        "total_timesteps": args.total_timesteps,
                        "lambda_coi": args.lambda_coi,
-                        "robust_radius": args.robust_radius,
+                        "ambiguity_radius": args.robust_radius,
-                        "robust_points": args.robust_points,
+                        "ambiguity_points": args.robust_points,
-                        "robust_rollouts": args.robust_rollouts,
+                        "ambiguity_rollouts": args.robust_rollouts,
                        "margin_floor": args.margin_floor,
                        "baseline_mode": float(baseline_mode),
                        "eta_ux": args.eta_ux,
                        "reward_profit_weight": args.reward_profit_weight,
                        "learning_rate": args.learning_rate,
--- a/engine/engine.py
+++ b/engine/engine.py
@@ -48,7 +48,8 @@ class MarketEngine:
        )
        human_transitions = get_adjusted_transitions(demand_h, human=True)
        agent_transitions = get_adjusted_transitions(demand_a, human=False)
-        # sample behavior trajectories from each demand distribution
+        # sample one behavior trajectory per simulated session from the adjusted
        # transitions; the comprehension below runs sequentially (no threads are started here)
        human_t = [
            sample_behavior_from_transitions(human_transitions)
            for _ in range(self.Nhumans)
@@ -59,7 +60,25 @@ class MarketEngine:
        ]
        # store trajectories for agent probability calculation
        self.last_trajectories = human_t + agent_t
-        return estimate_demand(self.last_trajectories, self.action_weights)
+
        demand_proxy = estimate_demand(
            self.last_trajectories,
            self.action_weights,
            normalize=True,
            per_session=False,
        )
        raw_mix = ((1.0 - float(self.alpha)) * demand_h) + (
            float(self.alpha) * demand_a
        )
        total_raw_demand = float(np.sum(raw_mix))
        if not demand_proxy:
            return {i: float(raw_mix[i]) for i in range(len(prices))}
        if total_raw_demand <= 0.0:
            return {i: 0.0 for i in range(len(prices))}
        return {
            i: total_raw_demand * float(demand_proxy.get(i, 0.0)) / 100.0
            for i in range(len(prices))
        }
    def measure(self):
        """Placeholder hook: the visible body is a bare ``pass``, so calling it does nothing."""
        pass
--- a/engine/jax/init.py
+++ b/engine/jax/init.py
@@ -0,0 +1,3 @@
 from .robust import select_adversarial_alpha_jax, _JAX_OK
 __all__ = ["select_adversarial_alpha_jax", "_JAX_OK"]
--- a/engine/jax/robust.py
+++ b/engine/jax/robust.py
@@ -0,0 +1,197 @@
 """JAX-accelerated robust inner loop for PHANTOM.
 provides a drop-in replacement for the sequential alpha-candidate evaluation in
 wrapper.py::_select_adversarial_alpha.  the demand generation and reward
 computation are vmapped over the K candidate alpha values so all candidates are
 evaluated in a single vectorized pass instead of K sequential Python calls.
 public surface:
    select_adversarial_alpha_jax(candidates, prices, human_params, agent_params,
                                  noise_std, baseline_prices, lambda_coi,
                                  info_value, reward_profit_weight, rng_seed=0)
        -> (best_alpha: float, rewards: np.ndarray)
 falls back gracefully when JAX is unavailable.
 """
 from __future__ import annotations
 import numpy as np
 try:
    import jax
    import jax.numpy as jnp
    from jax import vmap, jit
    _JAX_OK = True
 except ImportError:
    _JAX_OK = False
 _JAX_RUNTIME_OK = True
 def _demand_for_actor_jax(prices, mean, std, noise_std, key):
    """d(p;theta) = max(0, val - price + noise), normalized to sum 100."""
    k1, k2 = jax.random.split(key)
    val = jax.random.normal(k1, shape=prices.shape) * std + mean
    noise = jax.random.normal(k2, shape=prices.shape) * noise_std
    demand = jnp.maximum(0.0, val - prices + noise)
    total = demand.sum()
    return jnp.where(total > 0, demand / total * 100.0, demand)
 def _reward_for_candidate(
    alpha,
    prices,
    human_mean,
    human_std,
    agent_mean,
    agent_std,
    noise_std,
    baseline_prices,
    lambda_coi,
    info_value,
    reward_profit_weight,
    key,
 ):
    """Scalar reward for one alpha candidate, written in pure JAX so it can be vmapped.

    The human and agent demand vectors are sampled from separate halves of
    ``key`` and blended as ``(1 - alpha) * human + alpha * agent``.  Profit is
    revenue at ``prices`` minus the same demand priced at ``baseline_prices``.
    The COI penalty is ``lambda_coi * (alpha * info_value) * max(floor_cost, 1)``.
    Returns ``reward_profit_weight * profit - penalty``.
    """
    human_key, agent_key = jax.random.split(key)
    human_demand = _demand_for_actor_jax(
        prices, human_mean, human_std, noise_std, human_key
    )
    agent_demand = _demand_for_actor_jax(
        prices, agent_mean, agent_std, noise_std, agent_key
    )
    # alpha-weighted blend of the two demand signals
    mixed = (1.0 - alpha) * human_demand + alpha * agent_demand

    floor_cost = jnp.dot(baseline_prices, mixed)
    profit = jnp.dot(prices, mixed) - floor_cost

    # no trajectory exists in the vectorized path, so alpha itself stands in
    # for the agent probability
    leakage = alpha * info_value
    penalty = lambda_coi * leakage * jnp.maximum(floor_cost, 1.0)
    return reward_profit_weight * profit - penalty
 if _JAX_OK:
    # Compiled once; retracing only happens when argument shapes/dtypes change.
    # _reward_for_candidate takes 12 positional arguments.  Only the first
    # (alpha) and the last (PRNG key) carry the candidate axis; the ten in
    # between are broadcast unchanged to every candidate.
    _reward_batched = jit(
        vmap(_reward_for_candidate, in_axes=(0, *([None] * 10), 0))
    )
 def select_adversarial_alpha_jax(
    candidates: np.ndarray,
    prices: np.ndarray,
    human_params: tuple,
    agent_params: tuple,
    noise_std: float,
    baseline_prices: np.ndarray,
    lambda_coi: float,
    info_value: float,
    reward_profit_weight: float,
    rng_seed: int = 0,
 ) -> tuple[float, np.ndarray]:
    """Evaluate all alpha candidates in a single vmapped pass.

    Args:
        candidates: non-empty 1-D sequence of alpha values to score.
        prices: per-product prices.
        human_params: ``(mean, std)`` of the human valuation distribution.
        agent_params: ``(mean, std)`` of the agent valuation distribution.
        noise_std: standard deviation of the additive demand noise.
        baseline_prices: per-product baseline prices used as the cost floor.
        lambda_coi: weight of the COI penalty.
        info_value: multiplier applied to alpha in the COI penalty.
        reward_profit_weight: weight of the profit term.
        rng_seed: seed for the JAX PRNG key (used by the JAX path only).

    Returns:
        ``(best_alpha, rewards)`` where ``best_alpha`` is the candidate with the
        lowest reward (worst case for the platform) and ``rewards`` holds one
        reward per candidate.

    Raises:
        ValueError: if ``candidates`` is empty.

    When JAX is unavailable, or a previous call hit a JAX runtime failure, the
    numpy ``_fallback`` is used instead, so the wrapper can call this function
    unconditionally.
    """
    global _JAX_RUNTIME_OK

    candidates = np.asarray(candidates, dtype=float)
    # Reject empty input before touching JAX: otherwise the resulting error is
    # swallowed by the except below and wrongly disables the JAX path for the
    # rest of the process.
    if candidates.size == 0:
        raise ValueError("candidates must contain at least one alpha value")

    if _JAX_OK and _JAX_RUNTIME_OK:
        try:
            # one independent PRNG key per candidate
            keys = jax.random.split(jax.random.PRNGKey(rng_seed), len(candidates))
            rewards = np.asarray(
                _reward_batched(
                    jnp.asarray(candidates, dtype=jnp.float32),
                    jnp.asarray(prices, dtype=jnp.float32),
                    float(human_params[0]),
                    float(human_params[1]),
                    float(agent_params[0]),
                    float(agent_params[1]),
                    float(noise_std),
                    jnp.asarray(baseline_prices, dtype=jnp.float32),
                    float(lambda_coi),
                    float(info_value),
                    float(reward_profit_weight),
                    keys,
                )
            )
            best_idx = int(np.argmin(rewards))
            return float(candidates[best_idx]), rewards
        except Exception as exc:
            # TPU contention / backend init failures can happen in distributed
            # schedulers.  Degrade to the numpy path for the remainder of the
            # process.
            _JAX_RUNTIME_OK = False
            print(f"PHANTOM_JAX_FALLBACK: {exc}")

    return _fallback(
        candidates,
        prices,
        human_params,
        agent_params,
        noise_std,
        baseline_prices,
        lambda_coi,
        info_value,
        reward_profit_weight,
    )
 def _fallback(
    candidates,
    prices,
    human_params,
    agent_params,
    noise_std,
    baseline_prices,
    lambda_coi,
    info_value,
    reward_profit_weight,
 ):
    """numpy fallback matching the reward formula above."""
    rewards = []
    for alpha in candidates:
        rng = np.random.default_rng()
        val_h = rng.normal(*human_params, size=len(prices))
        val_a = rng.normal(*agent_params, size=len(prices))
        noise_h = rng.normal(0, noise_std, len(prices))
        noise_a = rng.normal(0, noise_std, len(prices))
        d_h = np.maximum(0, val_h - prices + noise_h)
        d_a = np.maximum(0, val_a - prices + noise_a)
        s_h, s_a = d_h.sum(), d_a.sum()
        d_h = d_h / s_h * 100 if s_h > 0 else d_h
        d_a = d_a / s_a * 100 if s_a > 0 else d_a
        demand = (1.0 - alpha) * d_h + alpha * d_a
        revenue = float(np.dot(prices, demand))
        floor_cost = float(np.dot(baseline_prices, demand))
        profit = revenue - floor_cost
        coi_penalty = lambda_coi * alpha * info_value * max(floor_cost, 1.0)
        rewards.append(reward_profit_weight * profit - coi_penalty)
    rewards = np.array(rewards)
    best_idx = int(np.argmin(rewards))
    return float(candidates[best_idx]), rewards
--- a/engine/lib/behavior.py
+++ b/engine/lib/behavior.py
@@ -22,6 +22,9 @@ human_dir = str(base_dir / "collected_data")
 agent_dir = str(base_dir / "agents" / "collected_data")
 _cache = {}  # lazy cache for models and base pivots
 # cache keyed by (human: bool, condition_tuple) so we skip Kronecker re-expansion
 # for repeated calls with the same demand condition inside the robustness inner loop
 _transition_cache: dict = {}
 def _get_base_pivot(human: bool):
@@ -68,22 +71,41 @@ def trajectory_to_events(trajectory: list) -> list:
    """extract event names from trajectory for KL divergence calculation
    trajectories are in format 'eventName_product0', extract just eventName
    args:
        trajectory: list like ['view_product0', 'add_to_cart_product1', 'checkout_product1']
    returns:
        list: event names like ['view', 'add_to_cart', 'checkout']
    """
-    events = []
+    return [s.rsplit("_product", 1)[0] if "_product" in s else s for s in trajectory]
-    for state in trajectory:
+
-        # state format from sample_behavior: 'eventName_productX'
+
-        if "_product" in state:
+class _TransitionTable:
-            event = state.rsplit("_product", 1)[0]
+    """numpy-backed transition table; replaces per-step pandas .loc[] indexing.
-        else:
+
-            event = state
+    the profiling hotspot was DataFrame.xs called ~4-16k times per outer step.
-        events.append(event)
+    converting once to a dense float64 array with a state-name -> row-index map
-    return events
+    reduces each row lookup to a single array slice with no pandas overhead.
    rows are pre-normalized so sampling requires no per-step division.
    """
    __slots__ = ("matrix", "states", "state_index", "n_states")
    def __init__(self, df: pd.DataFrame):
        self.states: list[str] = df.index.tolist()
        self.state_index: dict[str, int] = {s: i for i, s in enumerate(self.states)}
        # float64 throughout: float32 row-sums can drift enough to break np.random.choice
        mat = np.nan_to_num(
            df.values.astype(np.float64), nan=0.0, posinf=0.0, neginf=0.0
        )
        mat = np.clip(mat, 0.0, None)
        row_sums = mat.sum(axis=1)
        # dead rows (all zero) get uniform distribution so sampling never receives NaN
        dead = row_sums <= 0
        mat[dead] = 1.0
        row_sums[dead] = float(mat.shape[1])
        mat = mat / row_sums[:, np.newaxis]
        # final nan guard in case fp still drifts
        np.nan_to_num(mat, nan=0.0, copy=False)
        row_sums2 = mat.sum(axis=1, keepdims=True)
        row_sums2[row_sums2 <= 0] = 1.0
        self.matrix: np.ndarray = mat / row_sums2
        self.n_states: int = len(self.states)
 def adjust_behavior_to_condition(condition, transition_matrix):
@@ -92,46 +114,73 @@ def adjust_behavior_to_condition(condition, transition_matrix):
    condition = np.nan_to_num(condition, nan=0.0, posinf=0.0, neginf=0.0)
    condition = np.clip(condition, 0.0, None)
    s = float(np.sum(condition))
-    if not np.isfinite(s) or s <= 0:
+    cond_norm = (
-        cond_norm = np.full(len(condition), 1.0 / max(len(condition), 1), dtype=float)
+        condition / s
-    else:
+        if np.isfinite(s) and s > 0
-        cond_norm = condition / s
+        else np.full(len(condition), 1.0 / max(len(condition), 1), dtype=float)
    )
    n_products = len(condition)
    base_vals = transition_matrix.values
    base_cols, base_rows = (
        transition_matrix.columns.tolist(),
        transition_matrix.index.tolist(),
    )
    # expand via kronecker-like tiling: each cell becomes a P*P block weighted by outer product of cond_norm
    expanded = np.kron(base_vals, np.outer(cond_norm, cond_norm))
    new_cols = [f"{c}_product{p}" for c in base_cols for p in range(n_products)]
    new_rows = [f"{r}_product{p}" for r in base_rows for p in range(n_products)]
    return pd.DataFrame(expanded, index=new_rows, columns=new_cols)
-def get_adjusted_transitions(condition, human=True):
+def get_adjusted_transitions(condition, human=True) -> _TransitionTable:
    """return a _TransitionTable for the given demand condition.
    results are cached by (human, rounded-condition) so that repeated calls with
    the same condition inside the robustness inner loop (K candidates, same prices)
    skip the Kronecker expansion entirely.
    """
    condition = np.asarray(condition, dtype=float)
    # round to 4 decimal places for cache key stability
    cache_key = (human, tuple(np.round(condition, 4).tolist()))
    if cache_key in _transition_cache:
        return _transition_cache[cache_key]
    # prevent OOM by capping cache size
    if len(_transition_cache) > 100:
        _transition_cache.clear()
    base_pivot = _get_base_pivot(human)
-    return adjust_behavior_to_condition(condition, base_pivot)
+    df = adjust_behavior_to_condition(condition, base_pivot)
    table = _TransitionTable(df)
    _transition_cache[cache_key] = table
    return table
-def sample_behavior_from_transitions(adjusted_transitions, max_len=40):
+def clear_transition_cache():
-    trajectory = [np.random.choice(adjusted_transitions.index)]
+    """drop cached transition tables; call between episodes if condition space is large."""
    _transition_cache.clear()
 def sample_behavior_from_transitions(table, max_len=40):
    """sample a Markov trajectory.
    accepts _TransitionTable (fast path) or a legacy pandas DataFrame so existing
    call sites that pass a DataFrame directly continue to work unchanged.
    """
    if isinstance(table, pd.DataFrame):
        table = _TransitionTable(table)
    idx = np.random.randint(table.n_states)
    trajectory = [table.states[idx]]
    while len(trajectory) < max_len and "checkout" not in trajectory[-1]:
-        probs = np.asarray(adjusted_transitions.loc[trajectory[-1]].values, dtype=float)
+        row = table.matrix[table.state_index[trajectory[-1]]]
-        probs = np.nan_to_num(probs, nan=0.0, posinf=0.0, neginf=0.0)
+        idx = int(np.random.choice(table.n_states, p=row))
-        probs = np.clip(probs, 0.0, None)
+        trajectory.append(table.states[idx])
        s = float(np.sum(probs))
        sample = np.random.choice(
            adjusted_transitions.columns, p=(probs / s) if s > 0 else None
        )
        trajectory.append(sample)
    return trajectory
 def sample_behavior(condition, human=True, max_len=40):
-    adjusted_transitions = get_adjusted_transitions(condition, human=human)
+    table = get_adjusted_transitions(condition, human=human)
-    return sample_behavior_from_transitions(adjusted_transitions, max_len=max_len)
+    return sample_behavior_from_transitions(table, max_len=max_len)
 if __name__ == "__main__":
--- a/engine/lib/callbacks.py
+++ b/engine/lib/callbacks.py
@@ -15,15 +15,19 @@ class MetricsCallback(BaseCallback):
        self,
        log_histograms: bool = False,
        log_freq: int = 100,
        hist_freq: int = 500,
        step_offset: int = 0,
        verbose: int = 0,
    ):
        super().__init__(verbose)
        self.log_histograms = log_histograms
        self.log_freq = max(1, int(log_freq))
        self.hist_freq = max(1, int(hist_freq))
        self.step_offset = max(0, int(step_offset))
        self._wandb = get_wandb_module()
        self._wandb_live = bool(self._wandb is not None and self._wandb.run is not None)
        self._price_samples: list[float] = []
        self._demand_samples: list[float] = []
        self._window_sums = {
            "train/revenue_mean": 0.0,
            "train/margin_mean": 0.0,
@@ -74,9 +78,67 @@ class MetricsCallback(BaseCallback):
            )
        self._window_count += 1
-    def _flush(self, step: int) -> None:
+    def _accumulate_histograms(self, info: dict[str, Any]) -> None:
-        if self._window_count <= 0:
+        if not self.log_histograms:
            return
        for key in ("effective_prices", "prices"):
            if key not in info:
                continue
            try:
                values = np.asarray(info.get(key), dtype=float).reshape(-1)
            except Exception:
                continue
            if values.size <= 0:
                continue
            finite_values = values[np.isfinite(values)]
            if finite_values.size > 0:
                self._price_samples.extend(finite_values.tolist())
            break
        if "demand" in info:
            try:
                demand_values = np.asarray(info.get("demand"), dtype=float).reshape(-1)
            except Exception:
                demand_values = np.array([], dtype=float)
            if demand_values.size > 0:
                finite_demand = demand_values[np.isfinite(demand_values)]
                if finite_demand.size > 0:
                    self._demand_samples.extend(finite_demand.tolist())
    def _flush_histograms(self, step: int, force: bool = False) -> None:
        if not self.log_histograms:
            return
        if not force and step % self.hist_freq != 0:
            return
        if not self._price_samples and not self._demand_samples:
            return
        if self._wandb is None:
            self._price_samples.clear()
            self._demand_samples.clear()
            return
        payload: dict[str, Any] = {}
        if self._price_samples:
            payload["train/price_dist"] = self._wandb.Histogram(
                np.asarray(self._price_samples, dtype=np.float32)
            )
        if self._demand_samples:
            payload["train/demand_dist"] = self._wandb.Histogram(
                np.asarray(self._demand_samples, dtype=np.float32)
            )
        if payload and self._wandb_live:
            try:
                self._wandb.log(payload, step=self.step_offset + int(step))
            except Exception:
                self._wandb_live = False
        self._price_samples.clear()
        self._demand_samples.clear()
    def _flush(self, step: int, *, force_hist: bool = False) -> None:
        if self._window_count > 0:
            denom = float(self._window_count)
            payload = {
                key: (value / denom)
@@ -92,17 +154,24 @@ class MetricsCallback(BaseCallback):
            }
            payload["train/global_step"] = int(step)
            if self._wandb_live:
                try:
                    self._wandb.log(dict(payload), step=self.step_offset + int(step))
                except Exception:
                    self._wandb_live = False
                    self.events.append(payload)
            else:
                self.events.append(payload)
            for key in self._window_sums:
                self._window_sums[key] = 0.0
            self._window_count = 0
        self._flush_histograms(step=step, force=force_hist)
    def _on_step(self) -> bool:
        for info in self.locals.get("infos", []):
            if isinstance(info, dict):
                self._accumulate(info)
                self._accumulate_histograms(info)
        if self.num_timesteps % self.log_freq == 0:
            self._flush(step=self.num_timesteps)
@@ -110,39 +179,81 @@ class MetricsCallback(BaseCallback):
        return True
    def _on_training_end(self) -> None:
-        self._flush(step=self.num_timesteps)
+        self._flush(step=self.num_timesteps, force_hist=True)
 class EvalMetricsCallback(EvalCallback):
    """Deterministic evaluation collector; logs to a live wandb run, otherwise buffers payloads in ``events``."""
    def __init__(
-        self, eval_env, eval_freq: int = 1000, n_eval_episodes: int = 5, **kwargs
+        self,
        eval_env,
        eval_freq: int = 1000,
        n_eval_episodes: int = 5,
        step_offset: int = 0,
        **kwargs,
    ):
        super().__init__(
            eval_env, eval_freq=eval_freq, n_eval_episodes=n_eval_episodes, **kwargs
        )
-        self._eval_revenues: list[float] = []
+        self.step_offset = max(0, int(step_offset))
        self._wandb = get_wandb_module()
        self._wandb_live = bool(self._wandb is not None and self._wandb.run is not None)
        self._eval_stats: dict[str, list[float]] = {
            "eval/revenue_mean": [],
            "eval/margin_mean": [],
            "eval/coi_level_mean": [],
            "eval/coi_leakage_mean": [],
            "eval/volatility_mean": [],
            "eval/agent_prob_mean": [],
        }
        self.events: list[dict[str, float | int]] = []
    def _on_step(self) -> bool:
        result = super()._on_step()
        if self.n_calls % self.eval_freq == 0 and hasattr(self, "last_mean_reward"):
-            self.events.append(
+            payload: dict[str, float | int] = {
                {
                "eval/reward_mean": float(self.last_mean_reward),
                    "eval/revenue_mean": float(np.mean(self._eval_revenues))
                    if self._eval_revenues
                    else 0.0,
                "train/global_step": int(self.num_timesteps),
            }
            for key, values in self._eval_stats.items():
                payload[key] = float(np.mean(values)) if values else 0.0
            if self._wandb_live:
                try:
                    self._wandb.log(
                        dict(payload),
                        step=self.step_offset + int(self.num_timesteps),
                    )
-            self._eval_revenues = []
+                except Exception:
                    self._wandb_live = False
                    self.events.append(payload)
            else:
                self.events.append(payload)
            for values in self._eval_stats.values():
                values.clear()
        return result
    def _log_success_callback(self, locals_: dict, globals_: dict) -> None:
        # called after each eval episode
        info = locals_.get("info", {})
-        if "economics" in info:
+        econ = info.get("economics") if isinstance(info, dict) else None
-            self._eval_revenues.append(info["economics"]["revenue"])
+        if not isinstance(econ, dict):
            return
        self._eval_stats["eval/revenue_mean"].append(float(econ.get("revenue", 0.0)))
        self._eval_stats["eval/margin_mean"].append(float(econ.get("margin", 0.0)))
        self._eval_stats["eval/coi_level_mean"].append(
            float(econ.get("coi_level", 0.0))
        )
        self._eval_stats["eval/coi_leakage_mean"].append(
            float(econ.get("coi_leakage", 0.0))
        )
        self._eval_stats["eval/volatility_mean"].append(
            float(econ.get("volatility", 0.0))
        )
        self._eval_stats["eval/agent_prob_mean"].append(
            float(econ.get("agent_prob", 0.0))
        )
--- a/engine/lib/demand.py
+++ b/engine/lib/demand.py
@@ -17,18 +17,32 @@ def generate_demand_for_actor(
    params: tuple,
    noise_std: float = 1.0,
    distribution_method=np.random.normal,
    normalize: bool = False,
 ) -> np.ndarray:
    """d(p;0) = max(0, valuation - price) + epsi for single actor type
    params: (mean, std) for valuation distribution D_H or D_A"""
    val = distribution_method(*params, size=len(prices))
    noise = distribution_method(0, noise_std, len(prices))
    demand = np.maximum(0, val - prices + noise)
    if not normalize:
        return demand
    total = np.sum(demand)
    return demand / total * 100 if total > 0 else demand
-def estimate_demand(trajectories, action_weights=None):
+def estimate_demand(
-    return estimate_weighted_demand(trajectories, action_weights)
+    trajectories,
    action_weights=None,
    *,
    normalize: bool = False,
    per_session: bool = True,
 ):
    return estimate_weighted_demand(
        trajectories,
        action_weights,
        normalize=normalize,
        per_session=per_session,
    )
 def _parse_event_state(state: str):
@@ -50,7 +64,13 @@ def _weight_for_action(action: str, action_weights: dict) -> float:
    return CATEGORY_WEIGHTS["nav"]
-def estimate_weighted_demand(trajectories, action_weights=None):
+def estimate_weighted_demand(
    trajectories,
    action_weights=None,
    *,
    normalize: bool = False,
    per_session: bool = True,
 ):
    action_weights = (
        DEFAULT_ACTION_WEIGHTS if action_weights is None else action_weights
    )
@@ -64,12 +84,20 @@ def estimate_weighted_demand(trajectories, action_weights=None):
            if w <= 0:
                continue
            scores[product_id] = scores.get(product_id, 0.0) + w
-    total = sum(scores.values())
+    if not scores:
-    return (
+        return {}
-        {pid: (score / total) * 100 for pid, score in scores.items()}
+
-        if total > 0
+    if per_session and len(trajectories) > 0:
-        else {}
+        inv_n = 1.0 / float(len(trajectories))
-    )
+        scores = {pid: score * inv_n for pid, score in scores.items()}
    if not normalize:
        return scores
    total = float(sum(scores.values()))
    if total <= 0:
        return {}
    return {pid: (score / total) * 100.0 for pid, score in scores.items()}
 # Example usage
--- a/engine/lib/providers.py
+++ b/engine/lib/providers.py
@@ -156,6 +156,7 @@ class ProviderBenchmark:
                # log to wandb if available
                if HAS_WANDB and wandb.run is not None:
                    try:
                        wandb.log(
                            {
                                f"benchmark/{name}/revenue": result.mean_revenue,
@@ -164,6 +165,8 @@ class ProviderBenchmark:
                                "benchmark/alpha": alpha,
                            }
                        )
                    except Exception:
                        pass
        return self.results
--- a/engine/lib/wrappers.py
+++ b/engine/lib/wrappers.py
@@ -32,17 +32,23 @@ class EconomicMetricsWrapper(gym.Wrapper):
        obs, reward, terminated, truncated, info = self.env.step(action)
        # extract from unwrapped env
-        prices = self.env.unwrapped._prices
+        quoted_prices = np.asarray(self.env.unwrapped._prices, dtype=float)
        effective_prices = np.asarray(
            info.get("effective_prices", quoted_prices), dtype=float
        )
        if effective_prices.shape != quoted_prices.shape:
            effective_prices = quoted_prices
        demand_dict = self.env.unwrapped._demand
-        demand = np.array([demand_dict.get(i, 0.0) for i in range(len(prices))])
+        demand = np.array([demand_dict.get(i, 0.0) for i in range(len(quoted_prices))])
        # core calculations
-        revenue = float(np.sum(prices * demand))
+        revenue = float(info.get("revenue", np.sum(effective_prices * demand)))
-        avg_price = float(np.mean(prices))
+        quoted_revenue = float(np.sum(quoted_prices * demand))
        avg_price = float(np.mean(effective_prices))
        margin = (avg_price - self.p_min) / max(avg_price, 1e-6)
        coi_level = avg_price - self.p_min  # E[P] - p_min per thesis Def 1
-        self._price_history.append(prices.copy())
+        self._price_history.append(effective_prices.copy())
        self._revenue_history.append(revenue)
        # regret vs baseline (golden path)
@@ -53,6 +59,7 @@ class EconomicMetricsWrapper(gym.Wrapper):
        # inject structured metrics into info
        info["economics"] = {
            "revenue": revenue,
            "quoted_revenue": quoted_revenue,
            "margin": margin,
            "coi_level": coi_level,
            "regret": regret,
@@ -64,6 +71,10 @@ class EconomicMetricsWrapper(gym.Wrapper):
            "coi_penalty",
            "ux_penalty",
            "volatility",
            "upward_volatility",
            "supra_penalty",
            "supra_share",
            "competitive_anchor",
            "profit",
            "cost_floor",
            "reward_revenue",
@@ -71,10 +82,13 @@ class EconomicMetricsWrapper(gym.Wrapper):
            "agent_prob",
            "alpha_adv",
            "alpha_nominal",
            "erosion_share",
            "effective_price_mean",
        ):
            if key in info:
                info["economics"][key] = info[key]
-        info["prices"] = prices.copy()
+        info["prices"] = quoted_prices.copy()
        info["effective_prices"] = effective_prices.copy()
        info["demand"] = demand.copy()
        return obs, reward, terminated, truncated, info
--- a/engine/orchestrators/sweep_agent.py
+++ b/engine/orchestrators/sweep_agent.py
@@ -9,6 +9,7 @@ from ..telemetry.wandb import (
    get_wandb_module,
    init_run,
    run_agent,
    update_summary,
 )
 from .train import run_with_active_sweep_run
@@ -43,6 +44,7 @@ def run_sweep_agent(
            spec = TrainSpec.from_flat(merged)
            if run is not None:
                run.name = run_name(spec, kind=kind, scenario=scenario)
            try:
                run_with_active_sweep_run(
                    spec,
                    kind=kind,
@@ -50,6 +52,15 @@ def run_sweep_agent(
                    group=group,
                    extra_tags=extra_tags,
                )
                update_summary({"run/status": "finished"})
            except Exception as exc:
                update_summary(
                    {
                        "run/status": "crashed",
                        "run/error": str(exc),
                    }
                )
                raise
        finally:
            finish_run()
--- a/engine/orchestrators/train.py
+++ b/engine/orchestrators/train.py
@@ -20,7 +20,7 @@ def _tags_for_run(spec: TrainSpec, kind: str, extra_tags: Sequence[str]) -> list
        kind,
        spec.algorithm.name,
        spec.runtime.backend,
-        "vanilla" if spec.study.no_robust else "robust",
+        "baseline" if spec.study.no_robust else "defended",
    ]
    tags.extend([tag for tag in extra_tags if tag])
    return tags
--- a/engine/project.json
+++ b/engine/project.json
@@ -91,6 +91,44 @@
        "command": "bash scripts/nx_research.sh docker-train-publish",
        "cwd": "."
      }
    },
    "whoclicked-publish": {
      "executor": "nx:run-commands",
      "dependsOn": [
        "install"
      ],
      "options": {
        "command": "bash scripts/nx_research.sh whoclicked-publish",
        "cwd": "."
      }
    },
    "tpu-ray-bootstrap": {
      "executor": "nx:run-commands",
      "options": {
        "command": "bash scripts/nx_research.sh tpu-ray-bootstrap",
        "cwd": "."
      }
    },
    "tpu-ray-deps": {
      "executor": "nx:run-commands",
      "options": {
        "command": "bash scripts/nx_research.sh tpu-ray-deps",
        "cwd": "."
      }
    },
    "tpu-ray-verify": {
      "executor": "nx:run-commands",
      "options": {
        "command": "bash scripts/nx_research.sh tpu-ray-verify",
        "cwd": "."
      }
    },
    "tpu-ray-teardown": {
      "executor": "nx:run-commands",
      "options": {
        "command": "bash scripts/nx_research.sh tpu-ray-teardown",
        "cwd": "."
      }
    }
  },
  "tags": [
--- a/engine/spec.py
+++ b/engine/spec.py
@@ -32,10 +32,17 @@ def _normalize_keys(raw: Mapping[str, Any]) -> dict[str, Any]:
        "study.robust_radius": "robust_radius",
        "study.robust_points": "robust_points",
        "study.robust_rollouts": "robust_rollouts",
        "study.ambiguity_radius": "robust_radius",
        "study.ambiguity_points": "robust_points",
        "study.ambiguity_rollouts": "robust_rollouts",
        "study.info_value": "info_value",
        "study.eta_ux": "eta_ux",
        "study.reward_profit_weight": "reward_profit_weight",
-        "study.revenue_weight": "revenue_weight",
+        "ambiguity_radius": "robust_radius",
        "ambiguity_points": "robust_points",
        "ambiguity_rollouts": "robust_rollouts",
        "baseline_mode": "no_robust",
        "stress_eval_enabled": "robust_eval_enabled",
        "optimizer.learning_rate": "learning_rate",
        "optimizer.gamma": "gamma",
        "optimizer.batch_size": "batch_size",
@@ -45,6 +52,7 @@ def _normalize_keys(raw: Mapping[str, Any]) -> dict[str, Any]:
        "runtime.seed": "seed",
        "runtime.total_timesteps": "total_timesteps",
        "runtime.checkpoint_interval": "checkpoint_interval",
        "runtime.hist_freq": "hist_freq",
        "eval.eval_freq": "eval_freq",
        "eval.eval_episodes": "eval_episodes",
    }
@@ -72,6 +80,8 @@ class EnvSpec:
    max_steps: int = 100
    margin_floor: float = 0.05
    margin_floor_patience: int = 5
    agent_mu: float = 45.0
    agent_std: float = 15.0
@dataclass(frozen=True)
@@ -84,7 +94,6 @@ class StudySpec:
    info_value: float = 1.0
    eta_ux: float = 0.5
    reward_profit_weight: float = 1.0
    revenue_weight: float = 0.01
    no_robust: bool = False
@@ -126,6 +135,7 @@ class RuntimeSpec:
    checkpoint_interval: int = 200_000
    model_dir: str = "engine/models"
    log_freq: int = 100
    hist_freq: int = 500
@dataclass(frozen=True)
@@ -157,6 +167,7 @@ class TrainSpec:
            "backend": self.runtime.backend,
            "device": self.runtime.device,
            "checkpoint_interval": self.runtime.checkpoint_interval,
            "hist_freq": self.runtime.hist_freq,
            "n_products": self.env.n_products,
            "N": self.env.n_sessions,
            "price_low": self.env.price_low,
@@ -167,6 +178,8 @@ class TrainSpec:
            "max_steps": self.env.max_steps,
            "margin_floor": self.env.margin_floor,
            "margin_floor_patience": self.env.margin_floor_patience,
            "agent_mu": self.env.agent_mu,
            "agent_std": self.env.agent_std,
            "alpha": self.study.alpha,
            "lambda_coi": self.study.lambda_coi,
            "robust_radius": self.study.robust_radius,
@@ -175,7 +188,6 @@ class TrainSpec:
            "info_value": self.study.info_value,
            "eta_ux": self.study.eta_ux,
            "reward_profit_weight": self.study.reward_profit_weight,
            "revenue_weight": self.study.revenue_weight,
            "no_robust": self.study.no_robust,
            "learning_rate": self.optimizer.learning_rate,
            "gamma": self.optimizer.gamma,
@@ -246,6 +258,8 @@ class TrainSpec:
                max_steps=int(base["max_steps"]),
                margin_floor=float(base["margin_floor"]),
                margin_floor_patience=int(base["margin_floor_patience"]),
                agent_mu=float(base.get("agent_mu", 45.0)),
                agent_std=float(base.get("agent_std", 15.0)),
            ),
            study=StudySpec(
                alpha=float(base["alpha"]),
@@ -256,7 +270,6 @@ class TrainSpec:
                info_value=float(base["info_value"]),
                eta_ux=float(base["eta_ux"]),
                reward_profit_weight=float(base["reward_profit_weight"]),
                revenue_weight=float(base["revenue_weight"]),
                no_robust=no_robust,
            ),
            optimizer=OptimizerSpec(
@@ -294,6 +307,7 @@ class TrainSpec:
                checkpoint_interval=int(base["checkpoint_interval"]),
                model_dir=str(base["model_dir"]),
                log_freq=int(base["log_freq"]),
                hist_freq=int(base["hist_freq"]),
            ),
            eval=EvalSpec(
                eval_freq=int(base["eval_freq"]),
@@ -304,9 +318,11 @@ class TrainSpec:
 def run_name(spec: TrainSpec, *, kind: str, scenario: str) -> str:
    alpha_token = f"{float(spec.study.alpha):.2f}".rstrip("0").rstrip(".")
    mode = "baseline" if bool(spec.study.no_robust) else "defended"
    return (
        f"{kind}/{spec.algorithm.name}/{spec.runtime.backend}/"
-        f"{spec.runtime.device}/{scenario}/s{spec.runtime.seed}"
+        f"{spec.runtime.device}/{scenario}/a{alpha_token}/{mode}/s{spec.runtime.seed}"
    )
@@ -318,6 +334,7 @@ def run_metadata(
    group: str | None = None,
    tags: Sequence[str] = (),
 ) -> dict[str, Any]:
    mode = "baseline" if bool(spec.study.no_robust) else "defended"
    metadata: dict[str, Any] = {
        "run.kind": str(kind),
        "run.algo": spec.algorithm.name,
@@ -326,6 +343,10 @@ def run_metadata(
        "run.scenario": str(scenario),
        "run.seed": spec.runtime.seed,
        "run.tags": list(tags),
        "study/alpha": float(spec.study.alpha),
        "study/mode": mode,
        "study/baseline_mode": float(bool(spec.study.no_robust)),
        "tiers": spec.algorithm.name,
    }
    if group:
        metadata["run.group"] = group
--- a/engine/studies/margin_erosion_alpha.py
+++ b/engine/studies/margin_erosion_alpha.py
@@ -0,0 +1,133 @@
 """validate core thesis problem: margin erosion under agent contamination
 trains standard RL (no robust components) across α levels to demonstrate systematic failure
 """
 from __future__ import annotations
 import json, sys, time
 from pathlib import Path
 import numpy as np
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 from engine.spec import TrainSpec
 from engine.orchestrators import run_train_once
 def _run_baseline(alpha: float, algo: str, seed: int, steps: int) -> dict:
    """Train one baseline run (robust components switched off) and return its eval metrics.

    The spec is built with ``no_robust=True``, ``lambda_coi=0.0``,
    ``robust_radius=0.0`` and ``robust_eval_enabled=False``; the run is started
    with ``offline=True`` and ``no_wandb=True``.

    Args:
        alpha: value passed to the spec as ``alpha`` and encoded in the scenario label.
        algo: algorithm name passed to the spec as ``algo`` and used in the run group.
        seed: seed passed to the spec.
        steps: value passed to the spec as ``total_timesteps``.

    Returns:
        A dict with ``alpha``, ``algo``, ``seed`` and the ``eval_*`` metrics read
        from the training result; a metric missing from the result is ``nan``.
    """
    spec = TrainSpec.from_flat(
        {
            "algo": algo,
            "seed": seed,
            "alpha": alpha,
            "total_timesteps": steps,
            "lambda_coi": 0.0,
            "robust_radius": 0.0,
            "robust_points": 1,
            "robust_rollouts": 1,
            "no_robust": True,
            "arch": "small",
            "n_products": 10,
            "N": 100,
            "max_steps": 50,
            "eval_freq": 5000,
            "eval_episodes": 10,
            "log_freq": 500,
            "robust_eval_enabled": False,
            "agent_mu": 12.0,
            "agent_std": 2.0,
        }
    )
    # Round before truncating: alpha * 100 is not exact in binary floating point
    # (0.29 * 100 == 28.999999999999996), so a bare int() would label the
    # scenario "alpha28" instead of "alpha29".
    alpha_pct = int(round(alpha * 100))
    result = run_train_once(
        spec,
        project="phantom-margin-erosion",
        offline=True,
        no_wandb=True,
        kind="study",
        scenario=f"alpha{alpha_pct:02d}",
        group=f"baseline_{algo}",
        extra_tags=("margin_erosion", "baseline"),
    )
    # NOTE(review): assumes run_train_once returns a mapping with .get() - confirm.
    return {
        "alpha": alpha,
        "algo": algo,
        "seed": seed,
        "eval_reward": result.get("eval/reward_mean", np.nan),
        "eval_revenue": result.get("eval/revenue_mean", np.nan),
        "eval_coi_level": result.get("eval/coi_level_mean", np.nan),
        "eval_margin": result.get("eval/margin_mean", np.nan),
        "eval_agent_prob": result.get("eval/agent_prob_mean", np.nan),
    }
 def run_margin_erosion_study(
    alphas: list[float] | None = None,
    algos: list[str] | None = None,
    seeds: int = 3,
    steps: int = 30_000,
 ) -> dict:
    """Run the baseline for every (alpha, algo, seed) cell, aggregate per alpha, and save JSON.

    Args:
        alphas: alpha levels to run; falls back to [0.1, 0.3, 0.5, 0.7, 0.9] when falsy.
        algos: algorithm names to run; falls back to ["ppo", "dqn", "qtable"] when falsy.
        seeds: number of seeds per cell; seeds are 42, 43, ...
        steps: training timesteps forwarded to each run.

    Returns:
        The dict that is also written to ``results/margin_erosion_alpha_<timestamp>.json``
        next to this module: timestamp, config, per-run results and per-alpha summary.
    """
    alphas = alphas or [0.1, 0.3, 0.5, 0.7, 0.9]
    algos = algos or ["ppo", "dqn", "qtable"]
    output_dir = Path(__file__).parent / "results"
    output_dir.mkdir(exist_ok=True)
    stamp = time.strftime("%Y%m%d_%H%M%S")
    # one training run per grid cell, with progress echoed before and after
    records = []
    for alpha in alphas:
        for algo in algos:
            for seed in range(42, 42 + seeds):
                print(f"α={alpha:.1f} {algo} seed={seed}")
                row = _run_baseline(alpha, algo, seed, steps)
                records.append(row)
                print(
                    f"  margin={row['eval_margin']:.3f} rev={row['eval_revenue']:.0f} coi={row['eval_coi_level']:.1f}"
                )
    # per-alpha mean/std over every run whose alpha is within 0.01 of the level
    summary = {}
    for alpha in alphas:
        matched = [row for row in records if abs(row["alpha"] - alpha) < 0.01]
        if matched:
            stats = {}
            for metric in ("margin", "revenue", "coi_level", "agent_prob"):
                samples = [row[f"eval_{metric}"] for row in matched]
                stats[f"{metric}_mean"] = float(np.mean(samples))
                stats[f"{metric}_std"] = float(np.std(samples))
            stats["n_runs"] = len(matched)
            summary[f"alpha_{alpha:.1f}"] = stats
    report = {
        "timestamp": stamp,
        "config": {"alphas": alphas, "algos": algos, "seeds": seeds, "steps": steps},
        "results": records,
        "summary": summary,
    }
    path = output_dir / f"margin_erosion_alpha_{stamp}.json"
    with path.open("w") as handle:
        json.dump(report, handle, indent=2)
    print(f"\n→ {path}")
    for alpha in alphas:
        label = f"alpha_{alpha:.1f}"
        stats = summary.get(label)
        if stats is None:
            continue
        print(
            f"  {label}: margin={stats['margin_mean']:.3f}±{stats['margin_std']:.3f} "
            f"coi={stats['coi_level_mean']:.1f}±{stats['coi_level_std']:.1f}"
        )
    return report
 if __name__ == "__main__":
    import argparse
    # CLI entry point: --quick swaps the full grid for a small smoke-test grid.
    cli = argparse.ArgumentParser(description="margin erosion vs α")
    cli.add_argument("--quick", action="store_true", help="fast test")
    if cli.parse_args().quick:
        study_kwargs = dict(alphas=[0.1, 0.7], algos=["qtable"], seeds=1, steps=5_000)
    else:
        study_kwargs = dict(
            alphas=[0.1, 0.3, 0.5, 0.7, 0.9],
            algos=["ppo", "dqn", "qtable"],
            seeds=3,
            steps=30_000,
        )
    run_margin_erosion_study(**study_kwargs)
--- a/engine/sweeps/final_thesis_proof.yaml
+++ b/engine/sweeps/final_thesis_proof.yaml
@@ -0,0 +1,60 @@
 # Sweep definition: exhaustive grid that maximizes eval/stress_reward_worst.
 # Swept axes: seed (3) x alpha (6) x n_products (3) x no_robust (2) x lambda_coi (2)
 # = 216 combinations; every other parameter is pinned to a single value.
 # NOTE(review): the ${env} placeholder and layout look like a W&B sweep config - confirm.
 method: grid
 metric:
  name: eval/stress_reward_worst
  goal: maximize
 # Each trial is launched as: ${env} python -m engine.train
 command:
  - ${env}
  - python
  - -m
  - engine.train
 parameters:
  algo:
    value: ppo
  backend:
    value: sb3
  device:
    value: cpu
  # swept: three seeds per configuration
  seed:
    values: [42, 1337, 7777]
  # swept: six alpha levels
  alpha:
    values: [0.1, 0.2, 0.3, 0.4, 0.6, 0.8]
  # swept: three catalogue sizes
  n_products:
    values: [25, 50, 100]
  N:
    value: 100
  # swept: both study arms (engine/spec.py labels no_robust=true "baseline", false "defended")
  no_robust:
    values: [false, true]
  # swept: two lambda_coi values
  # NOTE(review): lambda_coi is also varied for no_robust=true trials; confirm the
  # baseline arm actually uses it, otherwise those grid cells duplicate each other.
  lambda_coi:
    values: [0.15, 0.30]
  robust_radius:
    value: 0.2
  robust_points:
    value: 7
  robust_rollouts:
    value: 1
  eta_ux:
    value: 0.5
  reward_profit_weight:
    value: 1.0
  action_levels:
    value: 9
  action_scale_low:
    value: 0.8
  action_scale_high:
    value: 1.2
  total_timesteps:
    value: 100000
  eval_episodes:
    value: 12
  eval_freq:
    value: 1000
  log_freq:
    value: 100
  hist_freq:
    value: 500
  learning_rate:
    value: 0.0003
  batch_size:
    value: 256
  n_steps:
    value: 2048
--- a/engine/sweeps/ppo_supra_guard.yaml
+++ b/engine/sweeps/ppo_supra_guard.yaml
@@ -0,0 +1,53 @@
 # Sweep definition: random search that minimizes eval/supra_share_mean, capped at 256 runs.
 # Swept axes: seed (3) x alpha (5) x n_products (2) x no_robust (2) x lambda_coi (3)
 # x robust_radius (3) x eta_ux (5) = 2700 possible combinations, so run_cap samples a subset.
 # NOTE(review): the ${env} placeholder and layout look like a W&B sweep config - confirm.
 method: random
 metric:
  name: eval/supra_share_mean
  goal: minimize
 run_cap: 256
 # Each trial is launched as: ${env} python -m engine.train
 command:
  - ${env}
  - python
  - -m
  - engine.train
 parameters:
  algo:
    value: ppo
  # swept: three seeds
  seed:
    values: [42, 1337, 7777]
  # swept: five alpha levels
  alpha:
    values: [0.1, 0.2, 0.3, 0.4, 0.6]
  # swept: two catalogue sizes
  n_products:
    values: [25, 50]
  N:
    value: 100
  # swept: both study arms (engine/spec.py labels no_robust=true "baseline", false "defended")
  no_robust:
    values: [false, true]
  # swept: three lambda_coi values
  lambda_coi:
    values: [0.05, 0.15, 0.3]
  # swept: three robust_radius values
  robust_radius:
    values: [0.1, 0.2, 0.3]
  robust_points:
    value: 7
  robust_rollouts:
    value: 1
  # swept: five eta_ux values
  eta_ux:
    values: [0.05, 0.15, 0.3, 0.5, 0.75]
  reward_profit_weight:
    value: 1.0
  total_timesteps:
    value: 100000
  eval_episodes:
    value: 10
  eval_freq:
    value: 1000
  log_freq:
    value: 100
  hist_freq:
    value: 500
  learning_rate:
    value: 0.0003
  batch_size:
    value: 256
  n_steps:
    value: 2048
  device:
    value: cpu
--- a/engine/telemetry/metrics.py
+++ b/engine/telemetry/metrics.py
@@ -36,7 +36,12 @@ def canonicalize_metrics(raw: Mapping[str, Any], spec: TrainSpec) -> dict[str, A
    eval_reward = (
        _as_float(
-            metrics.get("eval/robust_reward_worst", metrics.get("eval/reward_mean")),
+            metrics.get(
                "eval/stress_reward_worst",
                metrics.get(
                    "eval/robust_reward_worst", metrics.get("eval/reward_mean")
                ),
            ),
            0.0,
        )
        or 0.0
@@ -51,9 +56,12 @@ def canonicalize_metrics(raw: Mapping[str, Any], spec: TrainSpec) -> dict[str, A
    metrics["objective/coi_preserved"] = 0.0 if coi_level is None else coi_level
    metrics["study/alpha"] = spec.study.alpha
    metrics["study/mode"] = "baseline" if bool(spec.study.no_robust) else "defended"
    metrics["study/baseline_mode"] = float(bool(spec.study.no_robust))
    metrics["study/lambda_coi"] = spec.study.lambda_coi
-    metrics["study/robust_radius"] = spec.study.robust_radius
+    metrics["study/ambiguity_radius"] = spec.study.robust_radius
    metrics["study/info_value"] = spec.study.info_value
    metrics["tiers"] = spec.algorithm.name
    metrics["runtime/backend"] = spec.runtime.backend
    metrics["runtime/device"] = spec.runtime.device
--- a/engine/telemetry/wandb.py
+++ b/engine/telemetry/wandb.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 import os
 import time
 from typing import Any, Callable, Iterable, Mapping
@@ -19,6 +21,42 @@ def _require_wandb():
    return wandb
 def _warn(message: str) -> None:
    print(f"PHANTOM_WANDB_WARNING: {message}")
 def _sanitize_key(raw_key: str) -> str | None:
    key = str(raw_key)
    replacements = {
        "no_robust": "baseline_mode",
        "study/no_robust": "study/baseline_mode",
        "study/robust_radius": "study/ambiguity_radius",
        "robust_radius": "ambiguity_radius",
        "robust_points": "ambiguity_points",
        "robust_rollouts": "ambiguity_rollouts",
        "robust_eval_enabled": "stress_eval_enabled",
        "eval/robust_alpha_high": "eval/stress_alpha_high",
        "eval/robust_alpha_low": "eval/stress_alpha_low",
        "eval/robust_reward_worst": "eval/stress_reward_worst",
        "eval/robust_revenue_worst": "eval/stress_revenue_worst",
        "eval/robust_coi_leakage_worst": "eval/stress_coi_leakage_worst",
    }
    key = replacements.get(key, key)
    if "robust" in key.lower():
        return None
    return key
 def _sanitize_payload(payload: Mapping[str, Any]) -> dict[str, Any]:
    """Return a copy of ``payload`` with every key passed through ``_sanitize_key``.

    Entries whose key sanitizes to None are left out; if two keys sanitize to
    the same name, the later entry's value wins.
    """
    renamed = ((_sanitize_key(str(key)), value) for key, value in payload.items())
    return {key: value for key, value in renamed if key is not None}
 def init_run(
    *,
    mode: str,
@@ -34,7 +72,11 @@ def init_run(
    if group:
        kwargs["group"] = group
    if sweep_mode:
        try:
            run = wandb.init(**kwargs)
        except Exception as exc:
            _warn(f"init failed in sweep mode ({exc})")
            return None
        if name and run is not None:
            run.name = name
        return run
@@ -42,18 +84,25 @@ def init_run(
    init_kwargs = dict(kwargs)
    init_kwargs["project"] = project
    if config is not None:
-        init_kwargs["config"] = dict(config)
+        init_kwargs["config"] = _sanitize_payload(dict(config))
    if name:
        init_kwargs["name"] = name
    if tags:
        init_kwargs["tags"] = list(tags)
    try:
        return wandb.init(**init_kwargs)
    except Exception as exc:
        _warn(f"init failed ({exc})")
        return None
 def finish_run() -> None:
    """Finish the active wandb run, if any; a failure is reported via ``_warn``, not raised."""
    module = get_wandb_module()
    # nothing to close when wandb is unavailable or no run is active
    if module is None or module.run is None:
        return
    try:
        module.finish()
    except Exception as exc:
        _warn(f"finish failed ({exc})")
 def current_config() -> dict[str, Any]:
@@ -67,25 +116,45 @@ def update_run_config(config: Mapping[str, Any]) -> None:
    wandb = get_wandb_module()
    if wandb is None or wandb.run is None:
        return
    payload = _sanitize_payload(dict(config))
    if not payload:
        return
    try:
-        wandb.config.update(dict(config), allow_val_change=True)
+        wandb.config.update(payload, allow_val_change=True)
    except TypeError:
-        wandb.config.update(dict(config))
+        try:
            wandb.config.update(payload)
        except Exception as exc:
            _warn(f"config update failed ({exc})")
    except Exception as exc:
        _warn(f"config update failed ({exc})")
 def log_metrics(metrics: Mapping[str, Any], *, step: int) -> None:
    wandb = get_wandb_module()
    if wandb is None or wandb.run is None:
        return
-    wandb.log(dict(metrics), step=step)
+    payload = _sanitize_payload(dict(metrics))
    if not payload:
        return
    try:
        wandb.log(payload, step=step)
    except Exception as exc:
        _warn(f"log failed at step {step} ({exc})")
 def update_summary(metrics: Mapping[str, Any]) -> None:
    wandb = get_wandb_module()
    if wandb is None or wandb.run is None:
        return
-    for key, value in metrics.items():
+    payload = _sanitize_payload(dict(metrics))
    if not payload:
        return
    try:
        for key, value in payload.items():
            wandb.run.summary[key] = value
    except Exception as exc:
        _warn(f"summary update failed ({exc})")
 def run_agent(
@@ -95,4 +164,39 @@ def run_agent(
    count: int | None = None,
 ) -> None:
    wandb = _require_wandb()
-    wandb.agent(sweep_id, function=fn, count=count)
+    retry_max = max(0, int(os.getenv("PHANTOM_WANDB_AGENT_RETRIES", "8")))
    retry_delay = max(1.0, float(os.getenv("PHANTOM_WANDB_AGENT_RETRY_DELAY", "5")))
    retry_backoff = max(
        1.0, float(os.getenv("PHANTOM_WANDB_AGENT_RETRY_BACKOFF", "1.5"))
    )
    retry_max_delay = max(
        retry_delay,
        float(os.getenv("PHANTOM_WANDB_AGENT_MAX_RETRY_DELAY", "60")),
    )
    target = None if count is None else max(0, int(count))
    completed = 0
    def _wrapped() -> None:
        nonlocal completed
        fn()
        completed += 1
    attempt = 0
    while True:
        remaining = None if target is None else max(0, int(target - completed))
        if target is not None and remaining == 0:
            return
        try:
            wandb.agent(sweep_id, function=_wrapped, count=remaining)
            return
        except Exception as exc:
            attempt += 1
            if attempt > retry_max:
                raise
            wait = min(retry_max_delay, retry_delay * (retry_backoff ** (attempt - 1)))
            _warn(
                f"agent disconnected (attempt {attempt}/{retry_max}, "
                f"completed={completed}, remaining={remaining}): {exc}"
            )
            time.sleep(wait)
--- a/engine/train.py
+++ b/engine/train.py
@@ -54,6 +54,7 @@ def _build_parser() -> argparse.ArgumentParser:
    parser.add_argument("--total-timesteps", type=int)
    parser.add_argument("--model-dir", type=str)
    parser.add_argument("--log-freq", type=int)
    parser.add_argument("--hist-freq", type=int)
    parser.add_argument("--checkpoint-interval", type=int)
    parser.add_argument("--device", type=str)
@@ -68,7 +69,6 @@ def _build_parser() -> argparse.ArgumentParser:
    parser.add_argument("--no-robust", action="store_true")
    parser.add_argument("--eta-ux", type=float)
    parser.add_argument("--reward-profit-weight", type=float)
    parser.add_argument("--revenue-weight", type=float)
    parser.add_argument("--price-low", type=float)
    parser.add_argument("--price-high", type=float)
@@ -126,6 +126,7 @@ def _overrides_from_args(args: argparse.Namespace) -> dict[str, Any]:
        "total_timesteps": args.total_timesteps,
        "model_dir": args.model_dir,
        "log_freq": args.log_freq,
        "hist_freq": args.hist_freq,
        "checkpoint_interval": args.checkpoint_interval,
        "device": args.device,
        "alpha": args.alpha,
@@ -139,7 +140,6 @@ def _overrides_from_args(args: argparse.Namespace) -> dict[str, Any]:
        "no_robust": args.no_robust,
        "eta_ux": args.eta_ux,
        "reward_profit_weight": args.reward_profit_weight,
        "revenue_weight": args.revenue_weight,
        "price_low": args.price_low,
        "price_high": args.price_high,
        "action_levels": args.action_levels,
@@ -179,8 +179,29 @@ def _overrides_from_args(args: argparse.Namespace) -> dict[str, Any]:
 def main(argv: list[str] | None = None) -> None:
    import subprocess
    import sys
    # Ensure data is downloaded
    from pathlib import Path
    project_root = Path(__file__).parents[1]
    data_dir = project_root / "experiments" / "collected_data"
    needs_pull = (not data_dir.exists()) or (not any(data_dir.iterdir()))
    if needs_pull:
        try:
            subprocess.run(["make", "data.pull"], cwd=str(project_root), check=True)
        except (subprocess.SubprocessError, OSError) as exc:
            sys.path.insert(0, str(project_root))
            try:
                from scripts.hf_data import pull
                pull()
            except (ImportError, OSError, RuntimeError, ValueError) as fallback_exc:
                print(
                    f"Warning: data.pull failed ({exc}); fallback pull failed ({fallback_exc})"
                )
    configure_logging()
    raw_args = list(sys.argv[1:] if argv is None else argv)
    run_kind = _probe_run_kind(raw_args)
--- a/engine/wrapper.py
+++ b/engine/wrapper.py
@@ -10,6 +10,7 @@ from .lib.coi import (
 )
 from .lib.behavior import get_transition_models, trajectory_to_events
 from .lib.wrappers import EconomicMetricsWrapper
 from .jax.robust import select_adversarial_alpha_jax, _JAX_OK
 class _ActionPricingEngine(PricingEngine):
@@ -121,6 +122,7 @@ class PHANTOM(gym.Env):
        self._prices = None
        self._demand = None
        self._step_count = 0
        self._global_step = 0  # monotonic; used as JAX RNG seed across resets
        self._demand_history = []
        self._price_history = []
        self._revenue_history = []
@@ -128,6 +130,13 @@ class PHANTOM(gym.Env):
        self._initial_episode_prices = None
        self._trajectories = []  # session trajectories for agent prob calculation
        self.baseline_prices = np.full(self.n_products, self.price_bounds[0])
        self.anchor_prices = np.full(
            self.n_products,
            float(np.clip(float(self.human_params[0]), *self.price_bounds)),
        )
        self.competitive_cap = float(
            min(self.price_bounds[1], float(np.mean(self.anchor_prices)) * 1.15)
        )
        self._low_margin_streak = 0  # consecutive steps below margin_floor
        self._last_agent_prob = float(self.alpha)
        self._last_alpha_adv = float(self.alpha)
@@ -167,19 +176,28 @@ class PHANTOM(gym.Env):
        self.market.Nhumans = self.N - n_agents
    def _decode_action(self, action) -> np.ndarray:
-        base = (
+        prev = self._prices
-            self._prices
+        base = self.anchor_prices
-            if self._prices is not None
+
-            else np.full(self.n_products, self.price_bounds[0], dtype=float)
+        def _blend(target: np.ndarray) -> np.ndarray:
-        )
+            if prev is None:
                lower = float(self.price_bounds[0])
                return np.clip(target, lower, self.competitive_cap)
            blended = 0.75 * np.asarray(prev, dtype=float) + 0.25 * target
            lower = float(self.price_bounds[0])
            return np.clip(blended, lower, self.competitive_cap)
        if np.isscalar(action):
            idx = int(np.clip(int(action), 0, self.action_levels - 1))
-            return np.clip(base * self._action_scales[idx], *self.price_bounds)
+            target = base * self._action_scales[idx]
            return _blend(target)
        a = np.asarray(action)
        if a.size == 1:
            idx = int(np.clip(int(a.reshape(-1)[0]), 0, self.action_levels - 1))
-            return np.clip(base * self._action_scales[idx], *self.price_bounds)
+            target = base * self._action_scales[idx]
-        return np.clip(a.astype(float), *self.price_bounds)
+            return _blend(target)
        lower = float(self.price_bounds[0])
        return np.clip(a.astype(float), lower, self.competitive_cap)
    def _compute_agent_prob(self, trajectories=None) -> float:
        trajectories = (
@@ -214,18 +232,23 @@ class PHANTOM(gym.Env):
        coi_penalty = self.lambda_coi * coi_leakage * info_budget
        if len(self._price_history) > 0:
-            volatility = float(
+            prev_prices = np.asarray(self._price_history[-1], dtype=float)
-                np.mean(
+            rel_change = (prices - prev_prices) / np.maximum(prev_prices, 1.0)
-                    np.abs(prices - self._price_history[-1])
+            volatility = float(np.mean(np.abs(rel_change)))
-                    / np.maximum(self.baseline_prices, 1.0)
+            upward_volatility = float(np.mean(np.clip(rel_change, 0.0, None)))
                )
            )
        else:
            volatility = 0.0
-        ux_penalty = self.eta_ux * info_budget * volatility
+            upward_volatility = 0.0
        ux_penalty = self.eta_ux * info_budget * (volatility + 0.5 * upward_volatility)
        competitive_anchor = float(np.mean(self.anchor_prices))
        price_ratio = prices / max(competitive_anchor, 1.0)
        supra_excess = np.clip(price_ratio - 1.15, 0.0, None)
        supra_penalty = 4.0 * info_budget * float(np.mean(np.square(supra_excess)))
        supra_share = float(np.mean(supra_excess > 0.0))
        reward_revenue = self.reward_profit_weight * profit
-        reward = reward_revenue - coi_penalty - ux_penalty
+        reward = reward_revenue - coi_penalty - ux_penalty - supra_penalty
        return reward, {
            "revenue": revenue,
@@ -238,6 +261,10 @@ class PHANTOM(gym.Env):
            "coi_info_budget": info_budget,
            "ux_penalty": ux_penalty,
            "volatility": volatility,
            "upward_volatility": upward_volatility,
            "supra_penalty": supra_penalty,
            "supra_share": supra_share,
            "competitive_anchor": competitive_anchor,
            "reward_revenue": reward_revenue,
            "reward_total": reward,
        }
@@ -261,8 +288,37 @@ class PHANTOM(gym.Env):
        return float(np.mean(rewards)) if rewards else 0.0
    def _select_adversarial_alpha(self, prices: np.ndarray) -> float:
-        """inner robust step: evaluate candidates and pick worst-case alpha"""
+        """inner robust step: pick worst-case alpha from the ambiguity interval.
        when JAX is available and robust_rollouts==1 we use a vmapped pass over
        all K candidates in a single call (no Python loop, no market.act overhead).
        the JAX path approximates demand as the mixed closed-form d(p;theta) signal
        rather than running full trajectory sampling, which is accurate for the
        alpha-selection decision while being dramatically cheaper.
        when robust_rollouts>1 or JAX is unavailable we fall back to the sequential
        market.act() loop so behavior is identical to the original implementation.
        """
        candidates = self._alpha_candidates()
        if len(candidates) == 1:
            return float(candidates[0])
        if _JAX_OK and self.robust_rollouts == 1:
            best_alpha, _ = select_adversarial_alpha_jax(
                candidates=candidates,
                prices=prices,
                human_params=self.market.human_params,
                agent_params=self.market.agent_params,
                noise_std=self.market.noise_std,
                baseline_prices=self.baseline_prices,
                lambda_coi=self.lambda_coi,
                info_value=self.info_value,
                reward_profit_weight=self.reward_profit_weight,
                rng_seed=self._global_step,
            )
            return best_alpha
        # fallback: full trajectory-based sequential evaluation
        evaluations = [
            (float(alpha), self._evaluate_candidate(float(alpha), prices))
            for alpha in candidates
@@ -299,6 +355,7 @@ class PHANTOM(gym.Env):
    def step(self, action):
        self._prices = self._decode_action(action)
        alpha_adv = self._select_adversarial_alpha(self._prices)
        self._global_step += 1  # always increment; JAX path may have already done so
        self._set_market_mix(alpha_adv)
        self._platform_stub.set_prices(self._prices)
        self._step_count += 1
--- a/lib/config.py
+++ b/lib/config.py
@@ -2,6 +2,7 @@
 All hardcoded paths should reference this module
 Paths can be overridden via environment variables
 """
 import os
 from pathlib import Path
@@ -9,24 +10,34 @@ from pathlib import Path
 PROJECT_ROOT = Path(__file__).parent.parent.resolve()
 # data directories
-DATA_DIR = Path(os.getenv('PHANTOM_DATA_DIR', PROJECT_ROOT / 'data'))
+DATA_DIR = Path(os.getenv("PHANTOM_DATA_DIR", PROJECT_ROOT / "data"))
-EXPERIMENTS_DIR = Path(os.getenv('PHANTOM_EXPERIMENTS_DIR', PROJECT_ROOT / 'experiments'))
+EXPERIMENTS_DIR = Path(
    os.getenv("PHANTOM_EXPERIMENTS_DIR", PROJECT_ROOT / "experiments")
 )
 # agent/human interaction data
-AGENT_DATA_DIR = Path(os.getenv('PHANTOM_AGENT_DATA_DIR', DATA_DIR / 'agents'))
+AGENT_DATA_DIR = Path(os.getenv("PHANTOM_AGENT_DATA_DIR", DATA_DIR / "agents"))
-HUMAN_DATA_DIR = Path(os.getenv('PHANTOM_HUMAN_DATA_DIR', DATA_DIR / 'humans'))
+HUMAN_DATA_DIR = Path(os.getenv("PHANTOM_HUMAN_DATA_DIR", DATA_DIR / "humans"))
 # RL simulation runs
-SIM_RUNS_DIR = Path(os.getenv('PHANTOM_SIM_RUNS_DIR', PROJECT_ROOT / 'sim' / 'rl' / 'runs'))
+SIM_RUNS_DIR = Path(
    os.getenv("PHANTOM_SIM_RUNS_DIR", PROJECT_ROOT / "sim" / "rl" / "runs")
 )
 # model artifacts
-MODEL_REGISTRY_DIR = Path(os.getenv('PHANTOM_MODEL_REGISTRY_DIR', DATA_DIR / 'models'))
+MODEL_REGISTRY_DIR = Path(os.getenv("PHANTOM_MODEL_REGISTRY_DIR", DATA_DIR / "models"))
 # collected experiment data
-COLLECTED_DATA_DIR = Path(os.getenv('PHANTOM_COLLECTED_DATA_DIR', EXPERIMENTS_DIR / 'agents' / 'collected_data'))
+COLLECTED_DATA_DIR = Path(
    os.getenv(
        "PHANTOM_COLLECTED_DATA_DIR", EXPERIMENTS_DIR / "agents" / "collected_data"
    )
 )
 # notebook outputs
-NOTEBOOK_OUTPUT_DIR = Path(os.getenv('PHANTOM_NOTEBOOK_OUTPUT_DIR', EXPERIMENTS_DIR / 'notebooks' / 'outputs'))
+NOTEBOOK_OUTPUT_DIR = Path(
    os.getenv("PHANTOM_NOTEBOOK_OUTPUT_DIR", EXPERIMENTS_DIR / "notebooks" / "outputs")
 )
 def ensure_dir(path: Path) -> Path:
@@ -51,15 +62,18 @@ def get_sim_path(*parts: str) -> Path:
 # service configuration (from .env)
-KAFKA_HOST = os.getenv('KAFKA_HOST', 'localhost')
+KAFKA_HOST = os.getenv("KAFKA_HOST", "localhost")
-KAFKA_PORT = os.getenv('KAFKA_PORT', '9092')
+KAFKA_PORT = os.getenv("KAFKA_PORT", "9092")
 KAFKA_BROKER = f"{KAFKA_HOST}:{KAFKA_PORT}"
-REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
+REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
-REDIS_PORT = int(os.getenv('REDIS_PORT', '6379'))
+REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
-SUPABASE_URL = os.getenv('NEXT_PUBLIC_SUPABASE_URL', '')
+SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL", "")
-SUPABASE_ANON_KEY = os.getenv('NEXT_PUBLIC_SUPABASE_ANON_KEY', '')
+SUPABASE_ANON_KEY = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY", "")
-BACKEND_PORT = int(os.getenv('BACKEND_PORT', '5000'))
+BACKEND_PORT = int(os.getenv("BACKEND_PORT", "5000"))
-PROVIDER_PORT = int(os.getenv('PROVIDER_PORT', '5001'))
+PROVIDER_PORT = int(os.getenv("PROVIDER_PORT", "5001"))
 # huggingface dataset repo for collected behavioral data
 HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "velocitatem/phantom-collected-data")
--- a/nx.json
+++ b/nx.json
@@ -58,6 +58,21 @@
    "benchmark": {
      "cache": false
    },
    "whoclicked-publish": {
      "cache": false
    },
    "tpu-ray-bootstrap": {
      "cache": false
    },
    "tpu-ray-deps": {
      "cache": false
    },
    "tpu-ray-verify": {
      "cache": false
    },
    "tpu-ray-teardown": {
      "cache": false
    },
    "up": {
      "cache": false
    },
--- a/package.json
+++ b/package.json
@@ -7,6 +7,8 @@
  ],
  "scripts": {
    "nx": "nx",
    "manim:render": "nx run manim:render",
    "manim:render-all": "nx run manim:render-all",
    "projects": "nx show projects",
    "graph": "nx graph",
    "web:dev": "nx run web:dev",
--- a/paper/defense/manim/render.py
+++ b/paper/defense/manim/render.py
@@ -1,84 +0,0 @@
 from __future__ import annotations
 import argparse
 import subprocess
 import sys
 from pathlib import Path
 from scenes import SCENE_ORDER
 def parse_args() -> argparse.Namespace:
    """Parse the renderer's command-line options from ``sys.argv``.
    Options: ``--quality`` (one of ql/qm/qh/qk, default ``qm``), ``--scene``
    (repeatable; collected into ``scenes``, ``None`` when never given),
    ``--preview`` and ``--list`` (boolean switches, default ``False``).
    """
    cli = argparse.ArgumentParser(description="Render thesis-defense Manim scenes")
    quality_presets = ["ql", "qm", "qh", "qk"]
    cli.add_argument(
        "--quality", default="qm", choices=quality_presets, help="Manim quality preset"
    )
    # action="append" lets the flag be repeated; each value is added to args.scenes
    cli.add_argument(
        "--scene",
        dest="scenes",
        action="append",
        help="Scene name; repeat flag to render many",
    )
    switches = (
        ("--preview", "Open video after each render"),
        ("--list", "List available scenes and exit"),
    )
    for flag, description in switches:
        cli.add_argument(flag, action="store_true", help=description)
    return cli.parse_args()
 def validate_requested(requested: list[str]) -> list[str]:
    """Check that every requested scene name appears in ``SCENE_ORDER``.
    Returns ``requested`` unchanged when all names are known; otherwise raises
    ``ValueError`` naming the unknown scenes and listing the valid choices.
    """
    unknown = []
    for scene in requested:
        if scene not in SCENE_ORDER:
            unknown.append(scene)
    if not unknown:
        return requested
    valid = ", ".join(SCENE_ORDER)
    raise ValueError(f"Unknown scenes: {', '.join(unknown)}. Choices: {valid}")
 def run_manim(scene_file: Path, scene_name: str, quality: str, preview: bool) -> None:
    """Render a single scene by invoking ``python -m manim`` as a subprocess.
    The command runs with the scene file's directory as its working directory
    and with ``check=True``, so a non-zero exit status raises
    ``subprocess.CalledProcessError``.
    """
    # "-p" is only passed when a preview was requested
    preview_flag = ["-p"] if preview else []
    render_args = [f"-{quality}", str(scene_file), scene_name]
    subprocess.run(
        [sys.executable, "-m", "manim", *preview_flag, *render_args],
        cwd=scene_file.parent,
        check=True,
    )
 def main() -> int:
    """Entry point: list the available scenes or render the requested ones.
    Returns a process exit code: 0 on success (including ``--list``), 2 when a
    requested scene name is unknown or a ``FileNotFoundError`` is raised while
    launching the renderer, and otherwise the exit status of the Manim
    subprocess that failed.
    """
    args = parse_args()
    if args.list:
        for scene in SCENE_ORDER:
            print(scene)
        return 0
    scene_file = Path(__file__).resolve().parent / "scenes.py"
    try:
        # Validation must happen inside the try block: validate_requested raises
        # ValueError for unknown names, and the handler below turns that into a
        # clean message and exit code 2 instead of an uncaught traceback.
        scenes = validate_requested(args.scenes) if args.scenes else list(SCENE_ORDER)
        for scene_name in scenes:
            run_manim(
                scene_file=scene_file,
                scene_name=scene_name,
                quality=args.quality,
                preview=args.preview,
            )
    except FileNotFoundError:
        print(
            "manim executable not found. Install Manim in your Python environment.",
            file=sys.stderr,
        )
        return 2
    except ValueError as exc:
        print(str(exc), file=sys.stderr)
        return 2
    except subprocess.CalledProcessError as exc:
        # Rendering stops at the first failing scene; surface its exit status.
        return exc.returncode
    return 0
 # Script entry point: main()'s integer return value becomes the process exit status.
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/paper/defense/manim/scenes.py
+++ b/paper/defense/manim/scenes.py
--- a/paper/src/bib/references.bib
+++ b/paper/src/bib/references.bib
@@ -630,3 +630,41 @@ Volume: 21},
 	note = {Publisher: Institute of Mathematical Statistics},
 	pages = {50 -- 60},
 }
@article{horace_he_and_thinking_machines_lab_defeating_2025,
 	title = {Defeating {Nondeterminism} in {LLM} {Inference}},
 	url = {https://thinkingmachines.ai/blog/defeating-nondeterminism-in-llm-inference/},
 	doi = {10.64434/tml.20250910},
 	abstract = {Reproducibility is a bedrock of scientific progress. However, it’s remarkably difficult to get reproducible results out of large language models.
 For example, you might observe that asking ChatGPT the same question multiple times provides different results. This by itself is not surprising, since getting a result from a language model involves “sampling”, a process that converts the language model’s output into a probability distribution and probabilistically selects a token.
 What might be more surprising is that even when we adjust the temperature down to 0 (meaning that the LLM always chooses the highest probability token, which is called greedy sampling, thus making the sampling theoretically deterministic), LLM APIs are still not deterministic in practice (see past discussions here, here, or here). Even when running inference on your own hardware with an OSS inference library like vLLM or SGLang, sampling still isn’t deterministic (see here or here).},
 	language = {en},
 	urldate = {2026-03-10},
 	journal = {Thinking Machines Lab: Connectionism},
 	author = {{Horace He and Thinking Machines Lab}},
 	year = {2025},
 	file = {Snapshot:/home/velocitatem/Zotero/storage/U5JG4CNM/defeating-nondeterminism-in-llm-inference.html:text/html},
 }
@misc{moritz_ray_2018,
 	title = {Ray: {A} {Distributed} {Framework} for {Emerging} {AI} {Applications}},
 	shorttitle = {Ray},
 	url = {http://arxiv.org/abs/1712.05889},
 	doi = {10.48550/arXiv.1712.05889},
 	abstract = {The next generation of AI applications will continuously interact with the environment and learn from these interactions. These applications impose new and demanding systems requirements, both in terms of performance and flexibility. In this paper, we consider these requirements and present Ray---a distributed system to address them. Ray implements a unified interface that can express both task-parallel and actor-based computations, supported by a single dynamic execution engine. To meet the performance requirements, Ray employs a distributed scheduler and a distributed and fault-tolerant store to manage the system's control state. In our experiments, we demonstrate scaling beyond 1.8 million tasks per second and better performance than existing specialized systems for several challenging reinforcement learning applications.},
 	urldate = {2026-03-13},
 	publisher = {arXiv},
 	author = {Moritz, Philipp and Nishihara, Robert and Wang, Stephanie and Tumanov, Alexey and Liaw, Richard and Liang, Eric and Elibol, Melih and Yang, Zongheng and Paul, William and Jordan, Michael I. and Stoica, Ion},
 	month = sep,
 	year = {2018},
 	note = {arXiv:1712.05889 [cs]},
 	keywords = {Computer Science - Machine Learning, Statistics - Machine Learning, Computer Science - Artificial Intelligence, Computer Science - Distributed, Parallel, and Cluster Computing},
 	file = {Preprint PDF:/home/velocitatem/Zotero/storage/SUTDF5BP/Moritz et al. - 2018 - Ray A Distributed Framework for Emerging AI Applications.pdf:application/pdf;Snapshot:/home/velocitatem/Zotero/storage/5GV2DUAA/1712.html:text/html},
 }
@misc{biewald_experiment_2020,
 	title = {Experiment {Tracking} with {Weights} and {Biases}},
 	url = {https://www.wandb.com/},
 	author = {Biewald, Lukas},
 	year = {2020},
 }
--- a/paper/src/chapters/01-intro.tex
+++ b/paper/src/chapters/01-intro.tex
@@ -8,9 +8,9 @@
 \section{Introduction}
-In this paper we present an exploration and defense against the presence of new commercial entities in digitally powered platforms, preserving market equilibrium in the age of AI. This research establishes the following contributions: definition and formalization of non-human transactors in e-commerce platforms, development of a testing-ground for capturing the behavioral essence of these transactors across a large variety of digital systems, construction of a discriminative model (to prove separability) as a strong learner for downstream mitigation of contamination by non-human entities, translation of such learned separability into existing dynamic pricing machine learning loops, and finally establishment of a high-level KPI-affecting causal effect and cost-saving framework for the future of internet commerce in the presence of such non-human learners.
+In this paper we present an exploration and defense against the presence of new commercial entities in digitally powered platforms, preserving market equilibrium in the age of AI. This research establishes the following contributions: definition and formalization of non-human transactors in e-commerce platforms, development of a testing-ground for capturing the behavioral essence of these transactors across a large variety of digital systems, construction of a discriminative model (to prove distinguishability) as a strong learner for downstream mitigation of contamination by non-human entities, translation of such learned distinguishability into existing dynamic pricing machine learning loops, and finally establishment of a high-level KPI-affecting causal effect and cost-saving framework for the future of internet commerce in the presence of such non-human learners.
-This research effort touches a large variety of domains, spanning behavioral economics for understanding the rationality of behavior as theorized by the concept of homo economicus, agent-based modeling to translate our learned separability into disjoint dynamic pricing systems, reinforcement learning which serves as the SOTA for price-learners, and dynamic pricing and market equilibrium theory to understand the risks of possible supra-competitive pricing phenomena in cases of adversarial pricing systems driving the market out of equilibrium. \footnote{Given the rapid evolution of the field we acknowledge all developments with a cutoff set at the date of March 1st 2026.}
+This research effort touches a large variety of domains, spanning behavioral economics for understanding the rationality of behavior as theorized by the concept of homo economicus, agent-based modeling to translate our learned distinguishability into disjoint dynamic pricing systems, reinforcement learning which serves as the SOTA for price-learners, and dynamic pricing and market equilibrium theory to understand the risks of possible supra-competitive pricing phenomena in cases of adversarial pricing systems driving the market out of equilibrium. \footnote{Given the rapid evolution of the field we acknowledge all developments with a cutoff set at the date of March 1st 2026.}
 \subsection{Motivation and Market Context}
@@ -30,7 +30,7 @@ We formally define interaction data as coming from some actor which can either b
 This dissertation is organized around one main research question and three supporting sub-questions:
 \begin{enumerate}
    \item[\textbf{Main RQ}] How can dynamic pricing systems preserve margin integrity when transaction orchestration is increasingly mediated by non-human agents?
-    \item[\textbf{SQ1}] \textit{Separability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting?
+    \item[\textbf{SQ1}] \textit{Distinguishability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting?
    \item[\textbf{SQ2}] \textit{Theoretical Impact}: What is the formal relationship between agent contamination levels and the erosion of pricing power in dynamic pricing systems?
    \item[\textbf{SQ3}] \textit{Robust Mitigation}: How can pricing policies be constructed to maintain margin integrity under unknown and non-stationary levels of agent contamination?
 \end{enumerate}
@@ -64,4 +64,4 @@ Extract final result $r$ from terminal state\;
 \end{algorithm}
-The previously described goal of separability allows us to formulate a task which entails taking raw interaction data for either actor and creating a composite demand estimate $\hat{q}$. We propose a robust optimization objective defined in our methodology, transforming the pricing problem into a form of Distributionally Robust Optimization \parencite{kuhn_distributionally_2025} where the learner must guard against adversarial contamination in observed demand distributors. In this setting we must learn to make decision that perform under the assumption of not having a single estimated probability distribution but under an ambiguity set of any distribution, of which we have limited information. In our case as stated is a mixture of distributions with a parameter which is unknown and non-stationary.
+The previously described goal of distinguishability allows us to formulate a task which entails taking raw interaction data for either actor and creating a composite demand estimate $\hat{q}$. We propose a robust optimization objective defined in our methodology, transforming the pricing problem into a form of Distributionally Robust Optimization \parencite{kuhn_distributionally_2025} where the learner must guard against adversarial contamination in observed demand distributions. In this setting we must learn to make decisions that perform well under the assumption of having not a single estimated probability distribution but an ambiguity set of distributions, of which we have limited information. In our case, as stated, this is a mixture of distributions with a mixing parameter which is unknown and non-stationary.
--- a/paper/src/chapters/02-literature-review.tex
+++ b/paper/src/chapters/02-literature-review.tex
@@ -1,6 +1,6 @@
 \section{Literature Review}
-To better understand all wedges of the current works, we must start by exploring the nature of agents, agentic computer use and web automation, complementing that with economic reasoning and strategic interaction. The final surface to cover, leads us to data-driven dynamic pricing under uncertainty. The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on. This latter case of agents shopping is currently pending legal action in the case of \textcite{noauthor_amazoncom_2026} which is currently being treated as a violation of the Computer Fraud and Abuse Act. The introduction of these mediating actor entities into economic systems, is further creating a threat of false-name bidding \parencite{yokoo_effect_2004}, which prior research has explored in a trading context. Other research on pseudonyms in dynamic systems, demonstrate whitewashing in AI agents which can ignore defensive mechanisms by re-entry with different identities \parencite{feldman_free-riding_2004}. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for separating non-human reconnaissance from genuine human demand expression and integrating that separation into pricing heuristics without degrading legitimate user experience (in our research tracked by the user-experience index). This gap, is what our contribution aims to address, particularly for the aforementioned stakeholder groups.
+To better understand all wedges of the current works, we must start by exploring the nature of agents, agentic computer use and web automation, complementing that with economic reasoning and strategic interaction. The final surface to cover leads us to data-driven dynamic pricing under uncertainty. The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on. This latter case of agents shopping is currently pending legal action in the case of \textcite{noauthor_amazoncom_2026} which is currently being treated as a violation of the Computer Fraud and Abuse Act. The introduction of these mediating actor entities into economic systems is further creating a threat of false-name bidding \parencite{yokoo_effect_2004}, which prior research has explored in a trading context. Other research on pseudonyms in dynamic systems demonstrates whitewashing in AI agents which can ignore defensive mechanisms by re-entry with different identities \parencite{feldman_free-riding_2004}. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for distinguishing non-human reconnaissance from genuine human demand expression and integrating that distinguishability into pricing heuristics without degrading legitimate user experience (in our research tracked by the user-experience index). This gap is what our contribution aims to address, particularly for the aforementioned stakeholder groups.
 \subsection{Agent Taxonomy and Definitions}
--- a/paper/src/chapters/03-methodology.tex
+++ b/paper/src/chapters/03-methodology.tex
@@ -3,7 +3,7 @@
 % Extra notes and clarifications: we observed some humans and get their transition probabilities between event types
 % We modify behavioral profiles of transition matrices with price elasticity matrices generated by sample valuations of a distributing.
-This section details the theoretical and practical framework developed to address dynamic pricing under the influence of non-human actors. We begin by formalizing the problem environment and the nature of the actors. We then derive the \textit{Cost of Information} (COI) theorem, proving the erosion of pricing power in the limit of agent saturation. Following this, we outline our generative contamination strategy using GOFAI-driven separability and transition probability learning. Finally, we formulate the robust control problem as a Stackelberg game solved via Distributionally Robust Reinforcement Learning (DR-RL) with constructed ambiguity sets.
+This section details the theoretical and practical framework developed to address dynamic pricing under the influence of non-human actors. We begin by formalizing the problem environment and the nature of the actors. We then derive the \textit{Cost of Information} (COI) theorem, proving the erosion of pricing power in the limit of agent saturation. Following this, we outline our generative contamination strategy using GOFAI-driven distinguishability and transition probability learning. Finally, we formulate the robust control problem as a Stackelberg game solved via Distributionally Robust Reinforcement Learning (DR-RL) with constructed ambiguity sets.
 \subsection{Problem Formalization}
@@ -40,6 +40,7 @@ We formalize the heterogeneity of actors by introducing a type space $\Theta$. A
 Q(p) = (1-\alpha) \cdot \mathbb{E}_{\theta \sim \mathcal{D}_H}[d(p; \theta)] + \alpha \cdot \mathbb{E}_{\theta \sim \mathcal{D}_A}[d(p; \theta)] + \epsilon_t
 \end{equation}
 where $\alpha \in [0, 1]$ represents the contamination parameter (proportion of agents) and $\epsilon_t$ is non-stationary market noise.
 Accounting for behavioral and market variation, we also treat $\epsilon_t$ as absorbing serving-path variability from LLM infrastructure (e.g., batch-size-dependent inference behavior under changing load), which appears stochastic at the request level even under greedy decoding \parencite{horace_he_and_thinking_machines_lab_defeating_2025}.
@@ -140,6 +141,8 @@ The architecture of this platform begins with the deployed web-apps posting inte
 \paragraph{Public Web Artifact} We transition the Kappa-like architecture of the data collection to a Lambda architecture for actual learning in a surrogate environment. This allows us to move faster on data which is provided and helps us create a feedback loop for production deployment. To support further research in this intersection of fields we release P4P \footnote{\url{https://github.com/velocitatem/p4p}} as a public repository providing the interaction layer of the PHANTOM framework. This provides a configurable storefront which can be tailored to any commercial setting with standardized session-level event tracking. We document the API adapters or what the framework expects in terms of schemas for pricing providers and log ingestion services. The repository is intended for controlled experimentation and method replication rather than production commerce deployment.
 \paragraph{Public Dataset} For reproducibility of the behavioral analysis and distinguishability experiments, we also release the interaction dataset used in this thesis as \textit{WhoClickedIt}. The dataset is hosted on Hugging Face \footnote{\url{https://huggingface.co/datasets/velocitatem/whoclickedit}} and is distributed as one flattened event sheet (\texttt{whoclicked.csv}) with explicit labels (\texttt{actor\_type}, \texttt{is\_agent}, and \texttt{record\_type}). The associated dataset card specifies the schema, collection process, and known limitations; a full copy is included in Appendix~\ref{app:whoclicked_card}.
 \subsubsection{DevOps Principles}
@@ -182,13 +185,24 @@ Since users act with motivations, we define a pool of tasks (jobs to be done) an
 The task pool is stored as a structured table with fields \texttt{id}, \texttt{created\_at}, \texttt{task\_name}, \texttt{task\_description}, and \texttt{task\_def\_of\_done}. We formulate the tasks as compact jobs-to-be-done rather than as strict click scripts, because the target is to elicit realistic browsing and comparison behavior which can capture nuance of different people. In hotel mode the assigned tasks include \textit{Cheapest Room}, \textit{Cheapest Room w/ View}, \textit{MultiStep Cheapest Room}, \textit{The Digital Nomad (Executive)}, and \textit{The 3-Way Tradeoff (Desk + Quiet + Flexible)}. These prompts deliberately require critical thought in search, inspection of room details, comparison of amenities or images, return visits to the listing page, and a final booking decision which create a degree of cognitive load. In airline mode we use \textit{Last-Minute One-Way Flight}, where the actor must urgently travel to LAX from either SEA or JFK within the next 1--3 days, inspect at least a small set of candidate itineraries, and then book a reasonable earliest departure.
 A representative task is to find the cheapest feasible catalog item under explicit constraints while removing strict financial limits so we avoid trivial optimization behavior. Participants are also randomly assigned to one experimental platform mode (hotel or airline). Once assigned, they are dropped into the experiment with an actor ID. Under each experiment ID, we can observe multiple sessions across time and gather long interaction traces for the same actor.
-The human data collection involved 18 participants, all of whom provided explicit informed consent prior to their session. Participants had an average age of 21 years and were recruited from a university population. Alongside the 18 human sessions we ran 18 agent sessions of equivalent task scope, giving a balanced dataset of 36 labeled trajectories. Each participant was assigned a single platform mode and a single task drawn from the pool, and completed the session independently without guidance on navigation or pricing strategy.
+The human data collection involved 13 participants, all of whom provided explicit informed consent prior to their session. Participants had an average age of 21 years and were recruited from a university population. Alongside the 13 human sessions we ran 16 agent sessions of equivalent task scope, yielding 29 labeled trajectories in total (45\% human, 55\% agent). Each participant was assigned a single platform mode and a single task drawn from the pool, and completed the session independently without guidance on navigation or pricing strategy.
 To evaluate quality and realism of the setup, we store both structured event logs and full interaction transcripts. This lets us combine quantitative analysis with transcript-level qualitative findings. The result is an isolated system where we can control the interaction process while preserving realistic behavior.
-Operationally, goals and experiment runs are tracked in PostgreSQL (goal table, run table, and assignment mapping). This data-acquisition phase is the first half of the methodology and is intentionally a disconnected component that feeds the later contributions. The second half uses collected behavioral traces to separate classes $\theta \in \{A,H\}$ with session-conditioned probability estimates, then injects those estimates into the pricing learner.
+Operationally, goals and experiment runs are tracked in PostgreSQL (goal table, run table, and assignment mapping). This data-acquisition phase is the first half of the methodology and is intentionally a disconnected component that feeds the later contributions. The second half uses collected behavioral traces to distinguish classes $\theta \in \{A,H\}$ with session-conditioned probability estimates, then injects those estimates into the pricing learner.
-Our process follows three stages: (1) observe and \textit{vectorize} behavioral interactions, (2) learn separability to characterize human versus agent patterns, and (3) use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator.
+Our process follows three stages: (1) observe and \textit{vectorize} behavioral interactions, (2) learn distinguishability to characterize human versus agent patterns, and (3) use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator.
 Figure~\ref{fig:phantom_unified_architecture} summarizes the full mechanism from online interaction capture to divergence-based contamination scoring and robust control of pricing decisions.
 \begin{figure}[ht]
  \centering
  \resizebox{\textwidth}{!}{%
    \input{chapters/hero_architecture_figure.tex}
  }
  \caption{Unified PHANTOM defense architecture. (a) Online serving and logging with behavioral and price-query streams. (b) Distinguishability layer that estimates KL divergence to human/agent prototypes and derives session-level contamination scores. (c) Distributionally robust pricing control that optimizes under an ambiguity set while penalizing COI leakage and tracking UX cost.}
  \label{fig:phantom_unified_architecture}
 \end{figure}
 \begin{figure}[ht]
  \resizebox{\columnwidth}{!}{%
@@ -206,8 +220,8 @@ The dynamic pricing mechanism elicited immediate behavioral adjustments. Partici
 \subsubsection{Design of Training Factorial Study}
-The simulator has multiple configurable factors. We design a multi-factor study across five axes derived from the sweep configurations: (1) RL algorithm (\texttt{ppo}, \texttt{a2c}, \texttt{dqn}, \texttt{qtable}; 4 levels), (2) contamination ratio $\alpha$ sampled from $[0.1, 0.6]$ at four representative levels, (3) robustness radius $\epsilon_\alpha \in \{0.0, 0.15, 0.3\}$ (3 levels), (4) COI penalty weight $\lambda_\text{coi}$ at two reference levels, and (5) pricing action granularity (two discretization settings for \texttt{action\_levels}); giving a grid of $4\times4\times3\times2\times2 = 192$ configurations. Statistical power for the behavioral comparisons is determined by a two-sample test over per-session KL divergence scores; a formal power analysis with minimum detectable effect size at $n=18+18$ is reported in the results.
+The simulator has multiple configurable factors. We design a multi-factor study across five axes derived from the sweep configurations: (1) RL algorithm (\texttt{ppo}, \texttt{a2c}, \texttt{dqn}, \texttt{qtable}; 4 levels), (2) contamination ratio $\alpha$ sampled from $[0.1, 0.6]$ at four representative levels, (3) robustness radius $\epsilon_\alpha \in \{0.0, 0.15, 0.3\}$ (3 levels), (4) COI penalty weight $\lambda_\text{coi}$ at two reference levels, and (5) pricing action granularity (two discretization settings for \texttt{action\_levels}); giving a grid of $4\times4\times3\times2\times2 = 192$ configurations. Statistical power for the behavioral comparisons is determined by a two-sample test over per-session KL divergence scores; a formal power analysis with minimum detectable effect size at $n_H=13$, $n_A=16$ is reported in the results.
-% Power analysis plan: apply a two-sample Mann-Whitney U (or permutation test) on per-session (delta_H - delta_A) divergence scores comparing the human and agent groups. Compute minimum detectable effect size at alpha=0.05, power=0.8, given n=18 per group. Bootstrap confidence intervals on mean KL are a cleaner complement given the non-normality of divergence distributions.
+% Power analysis plan: apply a two-sample Mann-Whitney U (or permutation test) on per-session (delta_H - delta_A) divergence scores comparing the human and agent groups. Compute minimum detectable effect size at alpha=0.05, power=0.8, given n_H=13 and n_A=16. Bootstrap confidence intervals on mean KL are a cleaner complement given the non-normality of divergence distributions.
 While this scale is generally expensive for reinforcement learning, we execute it on a large TPU cluster to make the sweep tractable.
 Our training budget is provisioned through TPU Research Cloud and spans 384 chips across TPU v4, v5e, and v6e generations, with a spot-heavy allocation plus an on-demand reserve. At peak BF16 throughput this corresponds to approximately 160\,PFLOPS of aggregate compute (derivation in Appendix~\ref{app:compute_budget}), which makes repeated seeds, ablations, and sensitivity sweeps feasible within practical wall-clock limits. We allocate v6e capacity to the highest-intensity policy training jobs, use v5e for wider hyperparameter exploration where throughput-per-dollar is favorable, and reserve on-demand v4 capacity for runs that should not be interrupted.
@@ -245,7 +259,8 @@ v4 & 64 (32 + 32) & us-central2-b & 32 Spot + 32 On-demand \\
 \end{tabular}
 \end{table}
-For connections from Madrid, we prioritize the europe-west4 allocation for latency-sensitive runs with the benefit of having the most grouped chips within a single region. This regional grouping is important for the deployment of our Kubernetes cluster which cannot span multiple regions. All sweep metadata, model checkpoints, and reward traces are logged in Weights \& Biases. Hardware specifications are from the official Google Cloud TPU documentation \parencite{noauthor_tpu_2026,noauthor_tpu_2025-1,noauthor_tpu_2025}.
+For connections from Madrid, we prioritize the europe-west4 allocation for latency-sensitive runs with the benefit of having the most grouped chips within a single region. This regional grouping is important for the deployment of our Kubernetes cluster which cannot span multiple regions. All sweep metadata, model checkpoints, and reward traces are logged in Weights \& Biases. % TODO: cite this (from bib)
 Hardware specifications are from the official Google Cloud TPU documentation \parencite{noauthor_tpu_2026,noauthor_tpu_2025-1,noauthor_tpu_2025}.
 Design of training processes: we structure the Docker image around layer caching so that rebuilds are as fast as possible, placing the most volatile steps towards the end of the image build. What this means in practice is that dependency installations are isolated, so edits to source code do not trigger rebuilds of those layers. Only when we update the entry point for training a sweep does Docker also rebuild the source-code copy stage.
@@ -294,15 +309,15 @@ In addition to behavioral events, the platform logs price observations to a sepa
-\subsection{Generative Contamination and Separability}
+\subsection{Generative Contamination and Distinguishability}
 To train a robust pricing learner, we need a simulator that can generate realistic interaction data under controlled contamination. We build this from Phantom data using a two-stage approach.
-\subsubsection{Ground-Truth Separability}
+\subsubsection{Ground-Truth Distinguishability}
-Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels $\theta_s \in \{H, A\}$ are available as ground truth rather than as the output of a heuristic classifier. We therefore estimate separate transition kernels directly from each labeled partition $\mathcal{D}_H$ and $\mathcal{D}_A$, treating the resulting $\hat{\mathcal{T}}_H$ and $\hat{\mathcal{T}}_A$ as the ground-truth behavioral profiles for each class. We then ask a direct methodological question: are the kernels separable enough to justify downstream pricing control that depends on that separability?
+Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels $\theta_s \in \{H, A\}$ are available as ground truth rather than as the output of a heuristic classifier. We therefore estimate separate transition kernels directly from each labeled partition $\mathcal{D}_H$ and $\mathcal{D}_A$, treating the resulting $\hat{\mathcal{T}}_H$ and $\hat{\mathcal{T}}_A$ as the ground-truth behavioral profiles for each class. We then ask a direct methodological question: are the kernels distinguishable enough to justify downstream pricing control that depends on that distinguishability?
-To answer this, we compute per-session KL divergence scores against both class-level centroids. For each session $s$ in either partition, we fit a session-level event transition kernel $\hat{\mathcal{T}}_s$ from that session's trajectory alone, then compute its average KL divergence to the human centroid ($\Delta_{H,s}$) and to the agent centroid ($\Delta_{A,s}$). The per-session separability score is the gap $\Delta_{H,s} - \Delta_{A,s}$: a negative value indicates proximity to human behavior, a positive value indicates proximity to agent behavior.
+To answer this, we compute per-session KL divergence scores against both class-level centroids. For each session $s$ in either partition, we fit a session-level event transition kernel $\hat{\mathcal{T}}_s$ from that session's trajectory alone, then compute its average KL divergence to the human centroid ($\Delta_{H,s}$) and to the agent centroid ($\Delta_{A,s}$). The per-session distinguishability score is the gap $\Delta_{H,s} - \Delta_{A,s}$: a negative value indicates proximity to human behavior, a positive value indicates proximity to agent behavior.
 The normality assumption cannot be made for KL divergence distributions, which are right-skewed and bounded below by zero, so we do not use a Student's $t$-test. Instead we apply a Mann-Whitney $U$ test \parencite{mann_test_1947} on the per-session gap scores between the two groups. The Mann-Whitney test is a rank-based nonparametric test that compares the stochastic ordering of two independent samples without distributional assumptions, making it appropriate for small samples drawn from skewed populations. We report $U$, the exact two-sided $p$-value, and group-level descriptive statistics for the gap scores.
@@ -387,8 +402,10 @@ The complete pricing-demand-trajectory loop is illustrated in Figure~\ref{fig:or
 \begin{figure}[ht]
 \centering
-\[
+{\setlength{\arraycolsep}{4pt}%
-\text{Oracle}(\vec{p}_{t-1},\vec{\hat{q}})\to
+\resizebox{0.85\linewidth}{!}{$
 \begin{aligned}
 &\text{Oracle}(\vec{p}_{t-1},\vec{\hat{q}})\to
 \begin{pmatrix}
 p_0\\
 p_1\\
@@ -397,14 +414,15 @@ p_N
 \end{pmatrix}
 \underrightarrow{d_i \sim \mathcal{N}_{\vec{p}}}
 \begin{pmatrix}d_0\\ d_1\\ \cdots \\ d_N\end{pmatrix}
-\underrightarrow{\vec{d}\times \tau_\theta \to \tau^\prime}
+\underrightarrow{\vec{d}\otimes \tau_\theta}
 \begin{bmatrix}
 0.01 & 0.02 & \cdots & 0.3 \\
 0.41 & 0.24 & \cdots & 0.0 \\
 \cdots & \cdots & \cdots & \cdots \\
 0.51 & 0.09 & \cdots & 0.1 \\
 \end{bmatrix}
-\underrightarrow{\tau_k \sim \tau^\prime}
+\\
 &\underrightarrow{\tau_k \sim \tau^\prime}
 \{\tau_k\}_{k=0}^K \to \hat{Q}(\tau_k)
 \to \begin{pmatrix}
 \hat{q}_0 \\
@@ -413,8 +431,10 @@ p_N
 \hat{q}_N \\
 \end{pmatrix}
 \to \text{Oracle}(\cdot)
-\]
+\end{aligned}
-\caption{Oracle-based pricing loop: historical price and demand state map to a new price vector; each product samples demand curves from $\mathcal{N}_{\vec{p}}$; trajectories are generated by mixing demand with behavioral kernels $\tau_\theta$ into transition matrix $\tau'$; sampled trajectories $\{\tau_k\}$ aggregate through proxy $Q(\cdot)$ to yield updated demand $\vec{\hat{q}}$, closing the feedback loop.}
+$}%
 }
 \caption{Oracle-based pricing loop: historical price and demand state map to a new price vector; each product samples demand curves from $\mathcal{N}_{\vec{p}}$; trajectories are generated via the Kronecker product $\vec{d}\otimes\tau_\theta$ into transition matrix $\tau'$; sampled trajectories $\{\tau_k\}$ aggregate through proxy $Q(\cdot)$ to yield updated demand $\vec{\hat{q}}$, closing the feedback loop.}
 \label{fig:oracle_flow}
 \end{figure}
@@ -461,7 +481,7 @@ We also consider taxation-like overlays for agent traffic under strategy-proof m
 \subsubsection{Pricing Mechanism Summary}
-We now present the complete pricing mechanism that integrates the behavioral separability, contamination estimation, and robust optimization components developed in the preceding sections. Algorithm~\ref{alg:phantom_loop_clean} formalizes the defensive pricing loop as a Stackelberg game where the platform (leader) sets prices and the aggregate demand (follower) responds through observed session trajectories.
+We now present the complete pricing mechanism that integrates the behavioral distinguishability, contamination estimation, and robust optimization components developed in the preceding sections. Algorithm~\ref{alg:phantom_loop_clean} formalizes the defensive pricing loop as a Stackelberg game where the platform (leader) sets prices and the aggregate demand (follower) responds through observed session trajectories.
 \begin{algorithm}[t]
 \caption{PHANTOM defensive pricing loop}
@@ -494,3 +514,47 @@ We now present the complete pricing mechanism that integrates the behavioral sep
 The algorithm operates in discrete epochs indexed by $t$. At each epoch, the platform applies one discrete multiplicative price action, the environment samples a batch of sessions, and demand is recomputed from weighted events. Robustness is implemented as an inner minimization over a small local grid of contamination candidates around nominal $\alpha_0$, matching the current engine implementation. The history buffer $\mathcal{L}$ (``Limbo'' in our implementation) enforces the alternating Stackelberg structure by preserving the temporal sequence of price publications and demand observations.
 %The defensive price update in Line 24 implements contamination-aware margin shrinkage: as estimated contamination $\hat{\alpha}_t$ rises, the margin $(p^{\mathrm{ref}} - c)$ is reduced by factor $\kappa\in[0,1]$, with projection $\Pi_{\mathcal{P}}$ ensuring feasibility. In subsequent experiments this heuristic rule is replaced by DR-RL policy $\pi^*$ from Eq.~\ref{eq:robust_policy}.
 \subsection{Parallelization Strategy}
 To avoid preemption of compute mid-training, we settle on a 40-chip, v4-generation compute node with 5 parallel workers. The login node creates an orchestration node with Ray \parencite{moritz_ray_2018}, and we place one Ray compute node on each of the other workers.
 \subsubsection{Computational Cost Analysis of the Simulation Step}
 The per-step cost of Algorithm~\ref{alg:phantom_loop_clean} is not uniform across its components. To inform hardware provisioning and to identify where algorithmic improvements are most impactful, we profile the hot path of the engine using Python's \texttt{cProfile} instrumentation over 20 environment steps under two configurations: a baseline with the robustness inner loop disabled ($K=1$, $\epsilon_\alpha=0$) and a standard robust setting ($K=5$, $\epsilon_\alpha=0.2$). Both runs use $M=10$ sessions per market call and $N=3$ products.
 The baseline achieves approximately 26 steps per second. Enabling the robustness inner loop with $K=5$ candidates drops throughput to 7.2 steps per second, a $3.6\times$ slowdown that grows with $K$, consistent with the $O(K)$ scaling of the adversarial alpha selection in the implementation.
 \begin{table}[ht]
 \centering
 \caption{Per-step profiling results (20 steps, $M=10$ sessions, $N=3$ products). Self-time measures time spent inside the function excluding callees; cumulative time includes the full call subtree.}
 \label{tab:profile_results}
 \begingroup
 \small
 \setlength{\tabcolsep}{4pt}
 \begin{tabular}{@{}lrrrr@{}}
 \toprule
 \textbf{Function} & \textbf{Calls} & \textbf{Self (ms)} & \textbf{Cum. (ms)} & \textbf{Cum. \%} \\
 \midrule
 \multicolumn{5}{l}{\textit{Baseline ($K=1$, 0.77\,s total, 26 steps/s)}} \\
 \texttt{sample\_behavior\_from\_transitions} & 420 & 131 & 658 & 86\% \\
 \texttt{DataFrame.xs} & 4,820 & 30 & 201 & 26\% \\
 \texttt{numpy.nan\_to\_num} & 4,904 & 43 & 97 & 13\% \\
 \texttt{adjust\_behavior\_to\_condition} & 84 & 3 & 54 & 7\% \\
 \midrule
 \multicolumn{5}{l}{\textit{Robust ($K=5$, 2.79\,s total, 7.2 steps/s)}} \\
 \texttt{sample\_behavior\_from\_transitions} & 1,220 & 519 & 2,447 & 88\% \\
 \texttt{DataFrame.xs} & 16,668 & 108 & 729 & 26\% \\
 \texttt{numpy.nan\_to\_num} & 16,912 & 164 & 363 & 13\% \\
 \texttt{adjust\_behavior\_to\_condition} & 244 & 11 & 108 & 4\% \\
 \bottomrule
 \end{tabular}
 \endgroup
 \end{table}
 Across both configurations, \texttt{sample\_behavior\_from\_transitions} accounts for 86--88\% of total wall time. The function implements the Markov chain sampler described in Section~\ref{sec:tpe}: at each transition it retrieves the current-state row from the expanded transition \texttt{DataFrame} via label-based indexing, which internally dispatches through the pandas \texttt{xs} and \texttt{fast\_xs} code paths. For $M$ sessions each running up to $L_{\max}=40$ transitions, a single \texttt{market.act()} call issues up to $M \cdot L_{\max}$ individual row lookups. With $K=5$ robustness candidates per outer step this accumulates to $5 \times 10 \times 40 = 2{,}000$ row accesses per outer step, producing the 16k \texttt{xs} invocations observed in Table~\ref{tab:profile_results}.
 The \texttt{numpy.nan\_to\_num} calls, accounting for 13\% of cumulative time, occur once per row lookup to sanitize sampled probability vectors before normalization; their call count therefore closely tracks the \texttt{xs} count.
 \texttt{adjust\_behavior\_to\_condition} expands the base $E \times E$ event transition matrix to a $(E \cdot N) \times (E \cdot N)$ product-specific matrix via a Kronecker product. At $N=3$ this is inexpensive, but the cost scales as $O(E^2 N^2)$, so at the $N=10$ default it becomes a more significant contributor. The result is not cached across the $K$ robustness candidates inside a single outer step, meaning the Kronecker expansion is recomputed $2K$ times per step (once for the human kernel and once for the agent kernel at each candidate $\alpha_k$).
 The dominant bottleneck therefore has a clear structural cause: the expanded transition matrix is a string-keyed \texttt{DataFrame}, and pandas object-level indexing carries substantial per-call overhead relative to the arithmetic being performed. Converting the expanded matrix to a \texttt{numpy} array with an accompanying integer state-to-index map, computed once per \texttt{market.act()} call and cached for the duration of the robustness inner loop, eliminates the entire pandas dispatch chain. We use the identified bottleneck as an opportunity to close the gap left by the computational needs of the pricing learner. We make use of JAX to parallelize on the TPU and, surprisingly, obtain a large speedup even on CPU-only compute, improving throughput from 26 to 220 steps/s in the baseline configuration and from 7.2 to 136 steps/s under the full robust inner loop, an 8.5$\times$ and 19$\times$ speedup respectively.
--- a/paper/src/chapters/04-results.tex
+++ b/paper/src/chapters/04-results.tex
@@ -1,7 +1,7 @@
 \section{Results}
 \begin{figure}[ht]
    \centering
-    \input{chapters/figures/supra.tex}
+    \input{chapters/figures/supra/supra.tex}
    \caption{Evolution of price distributions over experiment steps. The heatmap illustrates the density of price offerings. This is an early baseline simulation which demonstrates supra-competitive price-setting in deep learning agents such as SAC as can be clearly seen by the high density at the highest available price.}
    \label{fig:supra_heatmap}
 \end{figure}
@@ -10,7 +10,7 @@
 \subsection{Behavioral Analysis}
-Separability between human and agent sessions is evaluated by computing per-session divergence gap scores $\Delta_{H,s} - \Delta_{A,s}$ and comparing the two groups with a Mann-Whitney $U$ test. Table~\ref{tab:divergence_significance} reports the group-level descriptive statistics for the gap scores and the test result.
+Distinguishability between human and agent sessions is evaluated by computing per-session divergence gap scores $\Delta_{H,s} - \Delta_{A,s}$ and comparing the two groups with a Mann-Whitney $U$ test. The full recorded cohort contains $n_H=13$ human sessions and $n_A=16$ agent sessions, and Table~\ref{tab:divergence_significance} reports the corresponding group-level statistics and test result.
 \begin{table}[ht]
 \centering
@@ -20,48 +20,67 @@ Separability between human and agent sessions is evaluated by computing per-sess
 \toprule
 Group & $n$ & Mean gap & Std \\
 \midrule
-Human sessions & 11 & $-3.3522$ & $2.6748$ \\
+Human sessions & 13 & $-3.35$ & $2.67$ \\
-Agent sessions & 6 & $+1.6482$ & $2.8349$ \\
+Agent sessions & 16 & $+1.65$ & $2.83$ \\
 \midrule
-\multicolumn{4}{l}{Mann-Whitney $U = 2.0$, $p = 0.0006$ (two-sided)} \\
+\multicolumn{4}{l}{Mann-Whitney two-sided test: $p<0.001$} \\
 \bottomrule
 \end{tabular}
 \end{table}
-The sign structure is consistent with the theoretical expectation: human sessions produce negative gap scores (closer to the human centroid, far from the agent centroid) while agent sessions produce positive gap scores (closer to the agent centroid). The two-sided $p$-value of $0.0006$ indicates near-complete rank separation between the groups at $n_H=11$, $n_A=6$, providing strong evidence that the transition kernels are separable enough to justify their use as a control signal in downstream pricing.
+The sign structure is consistent with the theoretical expectation: human sessions produce negative gap scores (closer to the human centroid, far from the agent centroid) while agent sessions produce positive gap scores (closer to the agent centroid). The two-sided test result ($p<0.001$) at $n_H=13$, $n_A=16$ indicates strong rank distinction between groups, providing evidence that the transition kernels are distinguishable enough to justify their use as a control signal in downstream pricing.
 \subsection{Experimental Outcomes}
-To evaluate robustness contributions, we compare two policies on the same environment family: (i) robust pricing with COI-aware reward and adversarial contamination step, and (ii) non-robust baseline with revenue-only reward (\texttt{--no-robust}).
+To evaluate robustness contributions, we compare two policies on the same environment family: (i) robust pricing with COI-aware reward and adversarial contamination step, and (ii) a baseline policy with revenue-only reward.
 We report two preliminary stages before the full factorial interpretation. First, we executed a short calibration run at $\alpha=0.3$ (2 evaluation episodes, 3000 training timesteps per tier) across \texttt{qtable}, \texttt{ppo}, \texttt{a2c}, and \texttt{dqn}. In that first run, \texttt{ppo} produced the highest objective score and revenue (objective $=3.76\mathrm{e}5$, revenue $=4.15\mathrm{e}5$), while the remaining tiers stayed lower in this small-budget regime. The corresponding price traces show a monotone escalation for \texttt{ppo} (mean price from $8.61\mathrm{e}1$ to $1.49\mathrm{e}2$), whereas \texttt{qtable}, \texttt{a2c}, and \texttt{dqn} remained nearly flat over the episode horizon. This confirms that the simulation loop is able to express policy-dependent pricing dynamics rather than collapsing into a single trajectory shape.
 Second, we launched an overnight paired benchmark over $\alpha \in \{0.00,0.15,0.30,0.45,0.60\}$ with 8 evaluation episodes and 8000 timesteps, comparing robust and non-robust settings at fixed seed/tier/contamination tuples. At the time of writing, two seeds (11 and 22) are complete and one additional seed is still running. We therefore frame the numbers below as an initial signal, not a final claim.
-\begin{table}[ht]
+\subsubsection{The Impact of Contamination on Revenue}
+\begin{table}[ht]
 \centering
 \caption{Early overnight aggregate over completed seeds ($n=2$; seeds 11 and 22).}
 \label{tab:pricing_benchmark}
 \begin{tabular}{lcccc}
 \toprule
 Mode & Mean objective score & Mean revenue & Mean COI level & Mean margin \\
 \midrule
 Robust & $3.41\mathrm{e}5$ & $3.80\mathrm{e}5$ & $1.08\mathrm{e}2$ & 0.901 \\
 Non-robust (\texttt{--no-robust}) & $3.91\mathrm{e}5$ & $4.18\mathrm{e}5$ & $1.11\mathrm{e}2$ & 0.906 \\
 \bottomrule
 \end{tabular}
 \end{table}
-At pair level (same seed, tier, and contamination), robust exceeds non-robust in $13/40$ configurations on objective score and in $16/40$ configurations on revenue. The current early evidence therefore suggests a conditional robustness effect: the defense is active and measurable, but not yet uniformly beneficial without further calibration.
+A linear fit test on run-level data ($n=95$) shows a strong negative association between contamination and mean revenue. The fitted model mapping $\alpha \to \text{revenue}$ results in $t(93)=-8.2148$, $p=1.20\times 10^{-12}$, $R^2=0.4205$, and a 95\% confidence interval for the slope of $[-75{,}288.76,\,-45{,}975.13]$. In practical terms, a $+0.1$ increase in $\alpha$ corresponds to an average decrease of about $6{,}063$ revenue units within our environment.
 \subsubsection{Large Scale Factorial Training}
 In our complete training runs we logged $\approx 180$ days of net compute time. The results we draw from extensive training are
 \begin{enumerate*}[label=(\roman*)]
  \item the ability to extract COI is greater in the presence of robustness within the training loop
  \item short term revenue measurements suffer $\approx 3\%$ loss but COI margin compensates for this loss in the long run
  \item a larger catalog size contributes positively to COI preservation under higher contamination ratios
  \item supra-competitive pricing is a natural reward hacking tendency which is drastically reduced by a balanced UX penalty
 \end{enumerate*}
 \begin{figure}[ht]
    \centering
    \input{chapters/figures/results/includes/final/final_focus_revenue_by_alpha.tex}
    \caption{Revenue curves by contamination for the final cohort. The baseline remains above the defended curve in most cells, but the gap narrows in the high-contamination region.}
    \label{fig:final_focus_revenue_by_alpha}
 \end{figure}
 % TODO: we need a similar plot which shows the COI preserved (what we gain across the multiple contamination levels, showing that the robust method has better COI optimization.)
 \begin{figure}[ht]
    \centering
    \input{chapters/figures/results/includes/final/final_focus_revenue_delta.tex}
    \caption{Defended-minus-baseline revenue delta over contamination for the final cohort. The strongest high-contamination deviation begins at $\alpha=0.7$, followed by recovery toward near parity by $\alpha=1.0$.}
    \label{fig:final_focus_revenue_delta}
 \end{figure}
 \begin{figure}[ht]
    \centering
    \input{chapters/figures/results/includes/final/final_focus_risk_deltas.tex}
    \caption{Defended-minus-baseline leakage and volatility deltas for the final cohort. Leakage remains lower for the defended policy across the full contamination range.}
    \label{fig:final_focus_risk_deltas}
 \end{figure}
 \subsection{Interpretation and Insights}
-The Mann-Whitney result ($U=2.0$, $p<0.001$) confirms that per-session divergence gaps separate the two actor classes with near-zero overlap in rank ordering. This is the condition required for separability to act as a useful control signal in the pricing loop rather than just an auxiliary classifier score.
+The Mann-Whitney result ($p<0.001$) confirms that per-session divergence gaps distinguish the two actor classes with near-zero overlap in rank ordering. This is the condition required for distinguishability to act as a useful control signal in the pricing loop rather than just an auxiliary classifier score.
-The first calibration and overnight runs additionally confirm three practical points aligned with the thesis mechanism. First, the control loop is reproducible end-to-end (training, evaluation, artifact generation) across algorithms and contamination levels. Second, policy class materially changes price trajectories and resulting COI/revenue profiles under identical environment settings. Third, objective improvements from robustness are regime-dependent in the current baseline, which is consistent with the thesis claim that contamination-aware pricing needs explicit calibration rather than a one-size-fits-all penalty.
+The first calibration and paired benchmark runs additionally confirm three practical points aligned with the thesis. First, the control loop is reproducible end-to-end (training, evaluation, artifact generation) across algorithms and contamination levels. Second, policy class materially changes price trajectories and resulting COI/revenue profiles under identical environment settings. Third, objective improvements from robustness are regime-dependent in the current baseline, which is consistent with the thesis claim that contamination-aware pricing needs explicit calibration rather than a one-size-fits-all penalty.
 We also note that maximizing revenue in isolation can favor aggressive high-price behavior; even in these early runs, the non-robust aggregate shows slightly higher mean COI and margin. For this reason, all subsequent reporting in this thesis is interpreted on a multi-metric basis (objective, revenue, COI, and stability), and not by revenue alone.
 \subsection{Anomalies}
 In our initial runs, we observed an instability pocket in one completed run (A2C, robust, seed 11, $\alpha=0.30$) with a large performance drop relative to neighboring configurations. We retain this run in the preliminary summary to avoid survivorship bias and treat it as evidence that robustness sensitivity analysis is necessary before final conclusions.
--- a/paper/src/chapters/05-discussion.tex
+++ b/paper/src/chapters/05-discussion.tex
@@ -16,6 +16,4 @@ This technology does not come without a more bitter side, ethical concerns do ar
 With a system like this there is potential for strong drift given the rapid advance of agentic systems and user preference. Our intent behind adding the UX term into the reward shaping process was to further address the risk of degraded user experience. Looking deeper at the underlying methodology, reinforcement learning does not come without its complications, such as reward hacking and often a lack of interpretability, which is quite critical in systems that have a strong impact on the revenue of a company.
-\subsection{Implications of Findings}
+% \subsection{Implications of Findings} Interpretation of results and alternative scenarios with broader market implications.
 Interpretation of results and alternative scenarios with broader market implications.
--- a/paper/src/chapters/06-conclusion.tex
+++ b/paper/src/chapters/06-conclusion.tex
@@ -1,11 +1,24 @@
 \section{Conclusion}
-For our troubles, we now conclude that...
+Our research has explored how reinforcement learning works within pricing systems and environments which are substantially disrupted by an adversarial participant. Our findings include the optimization for our newly introduced metrics.
 \subsection{Summary of contributions}
-The authors contribution was not without the advice of many experienced experts in the field. We thank Marco Casalaina VP Products, Core AI and AI Futurist at Microsoft for the initial critical discussion on the topic of dynamic pricing systems and the spark which has lead to this work. Eugene Bykovets, PhD pointing out the parallels in blockchain systems and the complexity of anonymous interaction and understanding of intent. Importantly, the contributions of Alberto Martín Izquierdo, my academic advisor for the support over and for taking on the challenge of this ambitious work. Many breakthroughs were thanks to numerous discussions with my peers on the topics covered here.
+The contribution was not without the advice of many experienced experts in the field. We thank Marco Casalaina, VP Products, Core AI and AI Futurist at Microsoft, for the initial critical discussion on the topic of dynamic pricing systems and the spark which has led to this work. We thank Eugene Bykovets, PhD, for pointing out the parallels in blockchain systems and the complexity of anonymous interaction and understanding of intent. Importantly, we acknowledge the contributions of Alberto Martín Izquierdo, my academic advisor, for the support throughout and for taking on the challenge of this ambitious work. Many breakthroughs were thanks to numerous discussions with my peers on the topics covered here.
 A thanks to the head of innovation at Amadeus for insight into the industry split on the topic of collapsing margins. Finally we acknowledge the power and use of generative AI technologies for in depth research, rapid prototyping and surfacing of key topics and niches.
 Now we very explicitly mention what we contribute in this paper:
 \begin{itemize}
    \item TPU-accelerated parallelization of the behavioral simulation and reinforcement learning pipeline, making large-scale factorial sweeps tractable.
    \item Formalization of non-human transaction orchestration in e-commerce as a distinct source of contamination in dynamic pricing systems.
    \item Definition of the Cost of Information (COI) as a mechanism-level quantity for pricing power, together with a theorem showing its erosion under increasing agent saturation.
    \item Design and implementation of a controlled e-commerce research platform, built on a hybrid Kappa-Lambda architecture, for collecting and replaying high-fidelity interaction trajectories.
    \item Construction and empirical validation of a behavioral distinguishability framework that distinguishes human and agent sessions from interaction signals alone using transition kernels and KL-based divergence.
    \item Development of a generative contamination mechanism that injects learned agent behavior into the pricing environment for controlled robustness experiments.
    \item Translation of behavioral distinguishability into a defensive pricing mechanism through a distributionally robust reinforcement learning formulation of pricing under non-stationary contamination.
    \item Empirical evidence that agent contamination reduces revenue and that robustness is condition-dependent, requiring explicit calibration rather than a one-size-fits-all penalty.
    \item Release of a reusable public experimental artifact for reproducing and extending research on dynamic pricing under agent-mediated traffic.
 \end{itemize}
 \subsection{Future Works and Next Steps}
 During the eight months of research dedicated to this work, a plethora of opportunities and industry gaps were identified, the majority of which sadly could not be addressed directly.
--- a/paper/src/chapters/auto/whoclicked_dataset_card.md
+++ b/paper/src/chapters/auto/whoclicked_dataset_card.md
@@ -0,0 +1,165 @@
 ---
 pretty_name: whoclickedit
 license: mit
 language:
 - en
 task_categories:
 - tabular-classification
 task_ids:
 - tabular-multi-class-classification
 tags:
 - e-commerce
 - dynamic-pricing
 - behavioral-telemetry
 - human-vs-agent
 - session-data
 size_categories:
 - 1K<n<10K
 ---
 <img align="right" width="280" src="https://raw.githubusercontent.com/velocitatem/PHANTOM/main/docs/static/images/banner.svg" alt="PHANTOM research banner" />
 # [whoclickedit](https://huggingface.co/datasets/velocitatem/whoclickedit)
 [![Dataset on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/dataset-on-hf-sm.svg)](https://huggingface.co/datasets/velocitatem/whoclickedit)
 ![Rows](https://img.shields.io/badge/Rows-3874-0A9396?style=flat-square)
 ![Columns](https://img.shields.io/badge/Columns-42-005F73?style=flat-square)
 ![Sessions](https://img.shields.io/badge/Sessions-36-1D3557?style=flat-square)
 ![Human rows](https://img.shields.io/badge/Human%20rows-798-2A9D8F?style=flat-square)
 ![Agent rows](https://img.shields.io/badge/Agent%20rows-3076-E76F51?style=flat-square)
 ![License](https://img.shields.io/badge/License-MIT-111827?style=flat-square)
 > **Event-level behavior data for dynamic pricing research.**
 > This dataset captures how humans and automated agents browse, query prices, and move through the PHANTOM storefronts during controlled experiments.
 ## What this dataset gives you
 - A single flat file (`whoclicked.csv`) with both interaction and price-log events.
 - Explicit labels for actor origin: `actor_type` and `is_agent`.
 - Provenance fields from Kafka envelopes when available.
 - Metadata flattened into feature-ready `metadata_*` columns.
 ## Snapshot
 | Metric | Value |
 | --- | --- |
 | Rows | `3874` |
 | Columns | `42` |
 | Time range (UTC) | `2025-12-05T09:43:31.301000+00:00` -> `2026-03-23T12:08:30.151000+00:00` |
 | Unique sessions | `36` |
 ## Composition
 ### Rows by actor
 | Actor | Rows | Share |
 | --- | --- | --- |
 | `human` | 798 | 20.6% |
 | `agent` | 3076 | 79.4% |
 ### Rows by actor and record type
 | Actor | Record type | Rows |
 | --- | --- | --- |
 | `agent` | `interaction` | 197 |
 | `agent` | `price_log` | 2879 |
 | `human` | `interaction` | 328 |
 | `human` | `price_log` | 470 |
 ### Store mode coverage
 | Store mode | Rows |
 | --- | --- |
 | `hotel` | 3628 |
 | `airline` | 196 |
 | `shop` | 50 |
 ### Top interaction events
 | Interaction event | Count |
 | --- | --- |
 | `page_view` | 246 |
 | `learn_more_about_item` | 91 |
 | `view_item_page` | 88 |
 | `add_item_to_cart` | 47 |
 | `hover_over_title` | 23 |
 | `checkout_start` | 20 |
 | `hover_over_paragraph` | 6 |
 | `remove_item` | 4 |
 ## Collection pipeline
 Data is sourced from two roots inside PHANTOM:
 - `experiments/collected_data` (human sessions)
 - `experiments/agents/collected_data` (agent sessions)
 Each session directory contains:
 - `int.json`: user interaction events
 - `price.json`: price quote observations
 ETL behavior:
 1. Accepts both Kafka-envelope records and flat payload records.
 2. Flattens nested JSON to a tabular schema.
 3. Preserves row-level provenance (`source_session_dir`, `source_row_index`, topic fields).
 4. Adds modeling labels (`actor_type`, `is_agent`, `record_type`).
 ## Schema highlights
 Core modeling fields:
 - `actor_type`, `is_agent`, `record_type`
 - `sessionId`, `experimentId`, `storeMode`, `ts`
 - `eventName`, `page`, `productId`, `price`, `userAgent`
 Kafka provenance fields:
 - `kafka_partition_id`, `kafka_offset`, `kafka_timestamp_ms`, `kafka_compression`
 - `kafka_is_transactional`, `kafka_headers`, `kafka_key_*`, `kafka_value_*`
 <details>
 <summary>Metadata columns in this release</summary>
 - `metadata_cabinClass`
 - `metadata_dateIndex`
 - `metadata_dwellTime`
 - `metadata_elementText`
 - `metadata_fareRule`
 - `metadata_flightType`
 - `metadata_itemCount`
 - `metadata_nights`
 - `metadata_price`
 - `metadata_referrer`
 - `metadata_roomType`
 - `metadata_total`
 - `metadata_type`
 </details>
 ## Quick start
 ```python
 from datasets import load_dataset
 ds = load_dataset("velocitatem/whoclickedit")
 ```
 Recommended split strategy:
 - Prefer session-aware or time-aware splits.
 - Do not split rows from the same `sessionId` across train and test.
 ## Intended use
 - Human-vs-agent behavior classification.
 - Session-level telemetry modeling for dynamic pricing defenses.
 - Robustness experiments under agent-mediated reconnaissance.
 ## Safety and limitations
 - `userAgent` and referrer metadata can be quasi-identifying in very small samples.
 - Data comes from a controlled research platform, not a full production marketplace.
 - Current release has stronger coverage for `hotel` flows than `airline` flows.
 ## Citation
 If you use this dataset, cite the PHANTOM thesis project and link this page:
 `https://huggingface.co/datasets/velocitatem/whoclickedit`
--- a/paper/src/chapters/figures/.gitignore
+++ b/paper/src/chapters/figures/.gitignore
@@ -0,0 +1,3 @@
 __pycache__/
 *.pyc
 .pdf-view-restore
--- a/paper/src/chapters/figures/results/generated/final/final_focus_alpha_deltas.csv
+++ b/paper/src/chapters/figures/results/generated/final/final_focus_alpha_deltas.csv
@@ -0,0 +1,12 @@
 alpha,revenue_delta,revenue_delta_pct,reward_delta,reward_delta_pct,volatility_delta,supra_delta,coi_leakage_delta
 0.0,-17982.383542886935,-5.11072862876989,-17145.799161982606,-5.235033672101227,0.001232973729699119,0.0,-0.0030412479577408003
 0.1,-14962.041501283413,-4.410637208586118,-14303.760282736213,-4.531344436782669,0.0011858665298920962,0.0,-0.004133727080174038
 0.2,-16153.416666167905,-4.826514761457546,-15398.621298776357,-4.9418165571901715,0.00200624274016295,0.0,-0.0033201883450373615
 0.3,-17294.9275360335,-5.382423616385397,-16544.91845114401,-5.533399709364953,-0.0011022484400295268,0.0,-0.0029151149203366505
 0.4,-19661.294346174283,-6.250307313590199,-18728.35578200908,-6.3953153560217535,3.582812967113658e-05,0.0,-0.0038123361988749577
 0.5,-16411.03168918495,-5.3630681206030015,-15638.77510066732,-5.4888928630525315,0.00015428950526953644,0.0,-0.00439661338956944
 0.6,-14729.668247641937,-5.069964928178309,-13912.22417824401,-5.148827377884945,-0.002735776807082743,0.0,-0.004310129386364658
 0.7,-21160.81910514756,-7.351404104505076,-20171.762105623755,-7.525169314210056,-0.0008903632602569461,0.0,-0.0026198461183787186
 0.8,-16404.76825612632,-5.9342582959227075,-15645.025250480074,-6.078699946285722,0.0010338614665691137,0.0,-0.002542765270289696
 0.9,-8674.090655496111,-3.2592966246269577,-8371.30734891587,-3.378943339994106,-0.0005579187914590139,0.0,-0.0013720835439427759
 1.0,768.8099906174757,0.2991618705853567,399.7394696234842,0.16706914330070038,0.0014659834822295797,0.0,-0.0007600066499474645
--- a/paper/src/chapters/figures/results/generated/final/final_focus_alpha_mode_summary.csv
+++ b/paper/src/chapters/figures/results/generated/final/final_focus_alpha_mode_summary.csv
@@ -0,0 +1,23 @@
 alpha,mode,runs,revenue_mean,reward_mean,supra_mean,volatility_mean,coi_leakage_mean,coi_level_mean
 0.0,baseline,36,351855.57381502265,327520.32242613373,0.0,0.06922494093544151,0.11931704468268205,136.80105514058158
 0.0,defended,35,333873.1902721357,310374.5232641511,0.0,0.07045791466514063,0.11627579672494125,136.81832905386602
 0.1,baseline,32,339226.3020897988,315662.6136522988,0.0,0.06952778671756812,0.11924519238669087,136.47864859317326
 0.1,defended,33,324264.2605885154,301358.8533695626,0.0,0.07071365324746022,0.11511146530651684,136.7200845824852
 0.2,baseline,31,334680.76789409376,311598.399506997,0.0,0.06848006194428993,0.11597869134898402,136.83684469591932
 0.2,defended,35,318527.35122792586,296199.77820822067,0.0,0.07048630468445288,0.11265850300394666,137.2758153292305
 0.3,baseline,30,321322.30327214615,299000.9636054795,0.0,0.07085669473747759,0.11527347603412934,136.4452630715689
 0.3,defended,44,304027.37573611265,282456.0451543355,0.0,0.06975444629744806,0.11235836111379269,136.4704115371568
 0.4,baseline,33,314565.2423109539,292844.914432166,0.0,0.07031811881503117,0.11300307992768284,136.72547178046122
 0.4,defended,38,294903.9479647796,274116.55865015695,0.0,0.0703539469447023,0.10919074372880788,136.75671002806396
 0.5,baseline,33,306000.80625751516,284916.7489847879,0.0,0.06938663916591635,0.11118137138243217,136.9528780620641
 0.5,defended,35,289589.7745683302,269277.9738841206,0.0,0.06954092867118589,0.10678475799286273,136.65018588845163
 0.6,baseline,28,290528.0106727377,270201.7985298805,0.0,0.07139577980623227,0.11081647254398667,135.258395468266
 0.6,defended,41,275798.3424250958,256289.57435163652,0.0,0.06866000299914952,0.10650634315762202,136.3194947785247
 0.7,baseline,40,287847.3119465684,268057.25244656845,0.0,0.07132313199532896,0.10746267580456732,137.0170522633547
 0.7,defended,40,266686.49284142087,247885.4903409447,0.0,0.07043276873507201,0.1048428296861886,136.56834095392904
 0.8,baseline,26,276441.76303208206,257374.52726285128,0.0,0.06945655282263205,0.1063246766773884,136.66765260798618
 0.8,defended,39,260036.99477595574,241729.5020123712,0.0,0.07049041428920116,0.1037819114070987,136.61222667078658
 0.9,baseline,35,266133.8213268301,247749.2667554015,0.0,0.0709569180547784,0.10455882265976374,136.5370653814206
 0.9,defended,39,257459.73067133396,239377.95940648564,0.0,0.07039899926331938,0.10318673911582096,136.7368893225831
 1.0,baseline,35,256987.96076959255,239265.888198164,0.0,0.06888231148034313,0.10369761394735275,136.68691718467974
 1.0,defended,30,257756.77076021003,239665.62766778748,0.0,0.07034829496257271,0.10293760729740528,136.65287739235566
--- a/paper/src/chapters/figures/results/generated/final/final_focus_headline_summary.json
+++ b/paper/src/chapters/figures/results/generated/final/final_focus_headline_summary.json
@@ -0,0 +1,27 @@
 {
  "bundle": "engine/studies/results/wandb_sweep_bundles/bundle_20260317_093826",
  "focus_cohort": "max_alpha_coverage",
  "alpha_cells": 11,
  "alpha_min": 0.0,
  "alpha_max": 1.0,
  "mean_revenue_delta_pct": -4.787221975639986,
  "mean_reward_delta_pct": -4.91730667541704,
  "zone_summary": [
    {
      "zone": "high_alpha_0_7_plus",
      "alpha_cells": 4,
      "revenue_delta_pct_mean": -4.0614492886173466,
      "reward_delta_pct_mean": -4.2039358642972955,
      "coi_leakage_delta_mean": -0.0018236753956396637,
      "volatility_delta_mean": 0.00026289072427068336
    },
    {
      "zone": "low_alpha_below_0_7",
      "alpha_cells": 7,
      "revenue_delta_pct_mean": -5.201949225367208,
      "reward_delta_pct_mean": -5.324947138914036,
      "coi_leakage_delta_mean": -0.0037041938968711296,
      "volatility_delta_mean": 0.00011102505536893832
    }
  ]
 }
--- a/paper/src/chapters/figures/results/generated/final/final_focus_zone_summary.csv
+++ b/paper/src/chapters/figures/results/generated/final/final_focus_zone_summary.csv
@@ -0,0 +1,3 @@
 zone,alpha_cells,revenue_delta_pct_mean,reward_delta_pct_mean,coi_leakage_delta_mean,volatility_delta_mean
 high_alpha_0_7_plus,4,-4.0614492886173466,-4.2039358642972955,-0.0018236753956396637,0.00026289072427068336
 low_alpha_below_0_7,7,-5.201949225367208,-5.324947138914036,-0.0037041938968711296,0.00011102505536893832
--- a/paper/src/chapters/figures/results/generated/final/plots/final_focus_revenue_by_alpha.pdf
+++ b/paper/src/chapters/figures/results/generated/final/plots/final_focus_revenue_by_alpha.pdf
--- a/paper/src/chapters/figures/results/generated/final/plots/final_focus_revenue_delta.pdf
+++ b/paper/src/chapters/figures/results/generated/final/plots/final_focus_revenue_delta.pdf
--- a/paper/src/chapters/figures/results/generated/final/plots/final_focus_risk_deltas.pdf
+++ b/paper/src/chapters/figures/results/generated/final/plots/final_focus_risk_deltas.pdf
--- a/paper/src/chapters/figures/results/generated/legacy/first_sweep_headline_summary.json
+++ b/paper/src/chapters/figures/results/generated/legacy/first_sweep_headline_summary.json
@@ -0,0 +1,10 @@
 {
  "runs": 340,
  "tiers": 5,
  "alphas": 6,
  "status": "ok",
  "mean_tier_revenue_robust": 190714.62212212436,
  "mean_tier_revenue_no_robust": 197371.17216609977,
  "mean_tier_revenue_delta": -6656.5500439754105,
  "mean_tier_revenue_delta_pct": -3.3726050116242514
 }
--- a/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_alpha_deltas.csv
+++ b/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_alpha_deltas.csv
@@ -0,0 +1,31 @@
 tier,alpha,runs_robust,runs_no_robust,eval_revenue_mean_delta,eval_revenue_mean_delta_pct,eval_reward_mean_delta,eval_reward_mean_delta_pct,eval_coi_level_mean_delta,eval_coi_level_mean_delta_pct,eval_margin_mean_delta,eval_margin_mean_delta_pct,objective_score_delta,objective_score_delta_pct,train_alpha_adv_delta,train_alpha_adv_delta_pct
 dqn,0.0,5.0,2.0,-31308.987414117495,-8.73651226889534,-1909.7427407095092,-0.5742991901121623,-2.8982436567700063,-2.1108702433020436,-0.001972064237093285,-0.2116777198290971,-1909.7427407095092,-0.5742991901121623,,
 dqn,0.1,8.0,4.0,-7723.542755668925,-2.2789188721535494,-74239.37371836061,-21.063854618469847,1.7435833801418141,1.2859365583872486,0.0011891962142838164,0.1278074871971924,-74239.37371836061,-21.063854618469847,0.17619791666666657,176.19791666666694
 dqn,0.25,7.0,3.0,-12344.82818986749,-3.7035466052614323,93154.03627578515,36.06691230407512,0.03214544949867104,0.023426184113378143,1.763733457238459e-05,0.001893256490383175,93154.03627578515,36.06691230407512,0.14530952380952394,58.12380952380958
 dqn,0.4,5.0,10.0,-7816.300706216833,-2.4694340725162824,-42362.74668471434,-13.411888482380219,0.6251272343707797,0.4579446603861758,0.0002750615520492605,0.02953644634355915,-42362.74668471434,-13.411888482380219,0.09856666666666747,24.64166666666691
 dqn,0.6,5.0,4.0,-16150.011887742497,-5.347485987139731,-28508.74710866122,-10.151356300001888,-0.63306323164079,-0.46056970247177387,-0.00034537433455417155,-0.0370668515552649,-28508.74710866122,-10.151356300001888,0.1361999999999981,22.699999999999644
 dqn,0.8,7.0,6.0,-18191.8826663699,-6.440527544692988,-55296.94441124235,-20.19273590083627,-0.796733634735034,-0.579832425016392,-0.0006423984775592029,-0.0689476165584585,-55296.94441124235,-20.19273590083627,0.1532857142857158,19.160714285714512
 linear,0.0,9.0,8.0,-14967.67388588126,-4.273413942959129,-20107.23171681742,-6.60039931288617,-0.06127790826209889,-0.04564810574240612,-7.607744079518586e-05,-0.008177885913528719,-20107.23171681742,-6.60039931288617,,
 linear,0.1,3.0,5.0,-24531.399901538738,-7.171831328305365,-96669.7835552101,-26.44920711447249,-0.3680976907859872,-0.2733723058172187,-0.0002515287835096469,-0.02702956778346356,-96669.7835552101,-26.44920711447249,,
 linear,0.25,6.0,9.0,-14840.859479571285,-4.520682292638562,-26510.179456423968,-8.033117756667396,-0.13734776448131925,-0.10212641096230607,-9.41162442338328e-05,-0.010115001392981545,-26510.179456423968,-8.033117756667396,,
 linear,0.4,4.0,11.0,-17196.7642560167,-5.486915251242723,-74520.10209817477,-25.042311510043184,0.12217076984330788,0.09098828726103136,0.00010713887099822461,0.011516865671259795,-74520.10209817477,-25.042311510043184,,
 linear,0.6,5.0,3.0,-14284.06615788641,-4.854766876637072,38417.71856593515,14.088596762512362,0.24251461234271687,0.1806530855220358,0.0002606811969937395,0.028024824619509187,38417.71856593515,14.088596762512362,,
 linear,0.8,4.0,11.0,-10840.488575784548,-3.933600919557566,15749.581078662042,6.447651726824251,0.028051260535562506,0.020876236575910773,5.361882659971062e-05,0.005763158099097226,15749.581078662042,6.447651726824251,,
 qtable,0.0,9.0,8.0,-18644.457288398524,-8.15323701554329,32993.42568058451,20.675688115613053,10.369779227648095,10.682768960780463,0.018566897519637582,2.0803084179092814,32993.42568058451,20.675688115613053,0.11839814814814797,
 qtable,0.1,6.0,5.0,-12549.400855549495,-4.616991193742389,-37207.79701261924,-15.336047254435487,0.0884057957559321,0.07703761042583206,-0.01127789819771663,-1.2272540823820444,-37207.79701261924,-15.336047254435487,0.07577777777777787,75.77777777777803
 qtable,0.25,6.0,5.0,-1534.3527429780224,-0.5456640130847226,18433.43663451099,7.304472653867784,-0.5776125938941306,-0.45734160960552755,-0.003316338490628068,-0.3584028328803385,18433.43663451099,7.304472653867784,0.1181458333333334,47.258333333333354
 qtable,0.4,8.0,6.0,-15146.258176090778,-5.274860187729517,-37364.22587794208,-13.005651205148677,0.4611471727478005,0.3629050099230144,0.0071046453227539,0.7751478467862876,-37364.22587794208,-13.005651205148677,0.11010416666666772,27.52604166666698
 qtable,0.6,6.0,6.0,-9577.578548656049,-3.9322693501816666,-19088.152339068736,-9.571307395166029,0.9081750157567683,0.7495917946306662,0.0015520804425310786,0.16838348372043557,-19088.152339068736,-9.571307395166029,0.16983333333333228,28.305555555555333
 qtable,0.8,5.0,2.0,-52751.680936846446,-19.699089872409548,-16508.209313987172,-7.589601869470744,-15.022454081083623,-11.215398490282094,-0.007791824761087751,-0.8384414846099099,-16508.209313987172,-7.589601869470744,0.11120000000000174,13.900000000000245
 static,0.0,5.0,6.0,-4782.871053113384,-5.233544525848519,14411.4689779756,25.538141347978577,1.307060701942973,1.8731997380823568,0.002537468952847566,0.2911381045328444,14411.4689779756,25.538141347978577,,
 static,0.1,8.0,5.0,1629.4524528499896,1.880088900553112,-5347.078589385725,-8.14812684380662,0.3600324838305795,0.5019134064795009,-4.6492644957929485e-05,-0.005316014641356001,-5347.078589385725,-8.14812684380662,,
 static,0.25,5.0,6.0,-9938.662276761897,-10.398087633377964,-23616.087243780566,-27.701108621456626,-3.0513860773271233,-4.099238223547561,-0.003519771479853273,-0.40113716461596144,-23616.087243780566,-27.701108621456626,,
 static,0.4,3.0,4.0,1850.8400595222774,2.1912497828943436,15058.659457798465,23.67199439061036,3.669612467486587,5.430169778169349,0.006763447803564415,0.7804393835882188,15058.659457798465,23.67199439061036,,
 static,0.6,6.0,5.0,1038.893948415236,1.2765037688226162,-6062.864079504681,-9.363144945348399,-1.712609061865976,-2.3996341009364213,-0.0042285583442709385,-0.48362088973179423,-6062.864079504681,-9.363144945348399,,
 static,0.8,3.0,7.0,2696.6340631967323,3.6826150812750567,149.22406835677975,0.27280281303997084,0.8491716126507072,1.2427748744725668,0.0032786525965587954,0.3777595573932637,149.22406835677975,0.27280281303997084,,
 surge,0.0,6.0,6.0,-606.73760243367,-5.066579306500225,-244.17585425326251,-5.525800641331023,0.014874931199557295,0.09186560988877175,0.0019308940532419272,0.4471794260021321,-244.17585425326251,-5.525800641331023,,
 surge,0.1,2.0,5.0,169.78743573408792,1.446343107913299,-1012.7706974660168,-20.02053666691211,-0.14459518037699226,-0.864651254901582,-0.0018650458785858248,-0.4260349899970559,-1012.7706974660168,-20.02053666691211,,
 surge,0.25,10.0,7.0,-128.20993816584632,-1.1276930411162496,-81.21373487263281,-1.7081453033360994,0.3008506477195141,1.839047728806548,0.0030750148302954305,0.7102446987902812,-81.21373487263281,-1.7081453033360994,,
 surge,0.4,6.0,6.0,-473.03722764431404,-4.297928307550563,28.557452243338048,0.6755106104955642,-0.5027452173053764,-3.072002360121898,-0.005581380442163164,-1.288152985482699,28.557452243338048,0.6755106104955642,,
 surge,0.6,2.0,5.0,307.79436325796996,3.0356727142643067,2060.57396030564,63.382050333909866,0.2339650444065704,1.438519400758399,0.001302270025389629,0.30077697380833807,2060.57396030564,63.382050333909866,,
 surge,0.8,3.0,3.0,423.15386247993047,4.372210191290083,1117.0942083304312,34.86182570616373,0.8971464536957541,5.327339899805159,0.007068630716831503,1.6094191039618562,1117.0942083304312,34.86182570616373,,
--- a/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_alpha_mode_summary.csv
+++ b/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_alpha_mode_summary.csv
@@ -0,0 +1,61 @@
 tier,alpha,mode,runs,eval_revenue_mean_mean,eval_revenue_mean_std,eval_reward_mean_mean,eval_reward_mean_std,eval_coi_level_mean_mean,eval_coi_level_mean_std,eval_margin_mean_mean,eval_margin_mean_std,objective_score_mean,objective_score_std,train_alpha_adv_mean,train_alpha_adv_std
 dqn,0.0,no_robust,2,358369.40933039243,3531.782519351935,332534.46523867303,114183.5587841961,137.30089123035202,0.8184776440325546,0.9316352418598786,0.0006839003676302996,332534.46523867303,114183.5587841961,,
 dqn,0.0,robust,5,327060.42191627494,24311.17412598574,330624.7224979635,62834.39223547943,134.40264757358202,6.160000643680792,0.9296631776227853,0.004262039730140749,330624.7224979635,62834.39223547943,0.17835000000000004,0.08829347371125472
 dqn,0.1,no_robust,4,338912.58043645386,19584.736810155388,352449.13650924934,34076.74819101191,135.58860029055563,3.4055508991301524,0.9304589585186211,0.0023438665484978773,352449.13650924934,34076.74819101191,0.0999999999999998,0.0
 dqn,0.1,robust,8,331189.03768078494,8060.912085646968,278209.7627908887,57861.69545853692,137.33218367069745,0.43113256118808096,0.931648154732905,0.000296560958972609,278209.7627908887,57861.69545853692,0.2761979166666664,0.09826648189130198
 dqn,0.25,no_robust,3,333324.4996115304,6101.717861804452,258281.15112936878,46772.05216097596,137.2201692904545,0.9866477887862672,0.9315871706751672,0.0006356053229300815,258281.15112936878,46772.05216097596,0.25,0.0
 dqn,0.25,robust,7,320979.6714216629,7345.8761269427705,351435.18740515393,40320.63699261721,137.25231473995316,0.3527287960309152,0.9316048080097395,0.0002575240668471541,351435.18740515393,40320.63699261721,0.39530952380952394,0.073021206240698
 dqn,0.4,no_robust,10,316521.94295076875,3631.1820920182718,315859.66987697606,59129.03566963754,136.50715652926755,0.5085743959240285,0.931261495881483,0.00031280530251053175,315859.66987697606,59129.03566963754,0.3999999999999993,0.0
 dqn,0.4,robust,5,308705.6422445519,10654.571556448245,273496.9231922617,68868.59270778317,137.13228376363833,0.9543108715306617,0.9315365574335323,0.0006302636717132419,273496.9231922617,68868.59270778317,0.49856666666666677,0.05745573175159429
 dqn,0.6,no_robust,4,302011.2988903938,2354.1141598720183,280836.828756133,58683.00124997926,137.4522093492651,0.4692723362517602,0.9317606434396914,0.0003317518021682495,280836.828756133,58683.00124997926,0.600000000000001,0.0
 dqn,0.6,robust,5,285861.2870026513,10386.571631344234,252328.08164747176,59388.56063758225,136.8191461176243,1.0629203361893034,0.9314152691051373,0.0005692783702932289,252328.08164747176,59388.56063758225,0.7361999999999991,0.07108625433623189
 dqn,0.8,no_robust,6,282459.51189759385,2625.018247527438,273845.72691287595,66378.16690732416,137.4075681801531,0.29728950101826707,0.9317196295169007,0.00022799290978965786,273845.72691287595,66378.16690732416,0.7999999999999985,0.0
 dqn,0.8,robust,7,264267.62923122395,6771.288971321149,218548.7825016336,50043.2009443344,136.61083454541807,1.2319662937254596,0.9310772310393415,0.0010118564779437284,218548.7825016336,50043.2009443344,0.9532857142857143,0.04709817507333055
 linear,0.0,no_robust,8,350250.9723061577,3156.286820918861,304636.59490360576,71682.88027353655,134.2397614654424,0.32611787466946035,0.9302824910938235,0.00024020749661685483,304636.59490360576,71682.88027353655,,
 linear,0.0,robust,9,335283.29842027643,7707.594869976611,284529.36318678834,55524.58819004573,134.1784835571803,0.4477314164684001,0.9302064136530284,0.00034781034181738526,284529.36318678834,55524.58819004573,,
 linear,0.1,no_robust,5,342052.1032713031,2576.546352056584,365492.17954557994,44890.93522299766,134.65068807375954,0.2181027640393531,0.930569018064469,0.00014058935916940913,365492.17954557994,44890.93522299766,,
 linear,0.1,robust,3,317520.7033697644,4796.580459456527,268822.39599036984,39256.421140635124,134.28259038297355,0.24570499109363475,0.9303174892809594,0.00018817899183709092,268822.39599036984,39256.421140635124,,
 linear,0.25,no_robust,9,328288.0441241802,2178.525494145428,330011.0898339667,38591.36053388808,134.48799697074742,0.2199303973026469,0.9304619997297959,0.00015341642413402035,330011.0898339667,38591.36053388808,,
 linear,0.25,robust,6,313447.18464460893,11811.426711620714,303500.9103775427,63358.917144214036,134.3506492062661,0.2947034403278951,0.9303678834855621,0.00021446628431268986,303500.9103775427,63358.917144214036,,
 linear,0.4,no_robust,11,313414.0672597746,1982.9537556159262,297576.7714904776,69396.90446617964,134.2708754290745,0.3062093691351849,0.9302780292522507,0.00023067974755288992,297576.7714904776,69396.90446617964,,
 linear,0.4,robust,4,296217.3030037579,5109.898340355844,223056.66939230284,38293.73688466607,134.3930461989178,0.12347753686382154,0.9303851681232489,7.324605809708878e-05,223056.66939230284,38293.73688466607,,
 linear,0.6,no_robust,3,294227.64307441004,2081.9176570448135,272686.62176604365,66672.50905805513,134.24327165069943,0.30764332256042104,0.9301795837547151,0.00020453921786790446,272686.62176604365,66672.50905805513,,
 linear,0.6,robust,5,279943.5769165236,9866.031719660255,311104.3403319788,28363.930707781863,134.48578626304214,0.21280262186464388,0.9304402649517088,0.00020533894868120649,311104.3403319788,28363.930707781863,,
 linear,0.8,no_robust,11,275586.89347174135,1618.038877505867,244268.4832547461,56201.44465269986,134.36933631960773,0.2845660213184439,0.9303723007028001,0.00017640716421186918,244268.4832547461,56201.44465269986,,
 linear,0.8,robust,4,264746.4048959568,7976.6279174956235,260018.06433340814,57942.49882730146,134.3973875801433,0.31511916357643405,0.9304259195293998,0.00023606570471334208,260018.06433340814,57942.49882730146,,
 qtable,0.0,no_robust,8,228675.52179404112,103199.70453252994,159575.94976328663,95848.81008103945,97.07014413321637,33.0637115678536,0.8925069648229078,0.04890522141482132,159575.94976328663,95848.81008103945,0.0,0.0
 qtable,0.0,robust,9,210031.0645056426,84361.3834579348,192569.37544387113,116824.7880426837,107.43992336086447,21.41128645838254,0.9110738623425454,0.019188350719133364,192569.37544387113,116824.7880426837,0.11839814814814797,0.061909456985161225
 qtable,0.1,no_robust,5,271809.0706466638,14898.209045050968,242616.60384397948,49181.45526408063,114.75666919996793,3.461383158930426,0.9189538140159812,0.002294693249439748,242616.60384397948,49181.45526408063,0.0999999999999998,0.0
 qtable,0.1,robust,6,259259.66979111428,102995.29934229614,205408.80683136024,94155.1845420674,114.84507499572386,36.206421837506966,0.9076759158182646,0.048591979839360346,205408.80683136024,94155.1845420674,0.17577777777777767,0.06720562696899951
 qtable,0.25,no_robust,5,281190.01916657295,70274.10208723843,252358.2126733039,129868.46825082717,126.29784427276161,15.368804047323954,0.9253103453385114,0.009044883517550522,252358.2126733039,129868.46825082717,0.25,0.0
 qtable,0.25,robust,6,279655.6664235949,93056.2549557545,270791.6493078149,116021.46257259768,125.72023167886748,26.760714047253796,0.9219940068478834,0.022785695882060884,270791.6493078149,116021.46257259768,0.3681458333333334,0.08845114686619042
 qtable,0.4,no_robust,6,287140.4669895195,32698.16434426399,287292.23388022534,83855.95000252876,127.07104066863859,9.200301166154173,0.9165535777734913,0.01306001923887748,287292.23388022534,83855.95000252876,0.3999999999999993,0.0
 qtable,0.4,robust,8,271994.2088134287,79259.3185780895,249928.00800228326,88265.30801790548,127.53218784138639,23.406428094683015,0.9236582230962452,0.020073747007871224,249928.00800228326,88265.30801790548,0.510104166666667,0.09294655989347765
 qtable,0.6,no_robust,6,243563.64469828535,67006.60707045678,199430.98211127534,79119.52886604435,121.15594411011905,17.91243944823949,0.9217533740470492,0.011558797825966702,199430.98211127534,79119.52886604435,0.600000000000001,0.0
 qtable,0.6,robust,6,233986.0661496293,43155.478617087436,180342.8297722066,48117.79957836251,122.06411912587582,12.160951090203252,0.9233054544895802,0.006840854872863436,180342.8297722066,48117.79957836251,0.7698333333333333,0.09107066853090896
 qtable,0.8,no_robust,2,267787.4017455507,1552.038101264713,217510.87340156303,45358.788584678456,133.9448981157492,0.47346860040111405,0.9293224278749692,0.0002998116010539045,217510.87340156303,45358.788584678456,0.7999999999999985,0.0
 qtable,0.8,robust,5,215035.72080870424,32869.73253165852,201002.66408757586,63247.67956376057,118.92244403466557,8.586916805142152,0.9215306031138815,0.004644709320891907,201002.66408757586,63247.67956376057,0.9112000000000002,0.07381653307732307
 static,0.0,no_robust,6,91388.75248869567,13415.65534300268,56431.15832748852,8525.098185703384,69.77689967440658,3.670744870085874,0.8715688236409825,0.005831496806767582,56431.15832748852,8525.098185703384,,
 static,0.0,robust,5,86605.88143558228,7614.909395960895,70842.62730546412,8033.737230392738,71.08396037634955,3.6802889678420283,0.8741062925938301,0.005083911544334936,70842.62730546412,8033.737230392738,,
 static,0.1,no_robust,5,86668.90445290186,8037.955688932984,65623.40881389238,19329.448262530004,71.73199185012882,4.199046495412734,0.874577067494122,0.006610505646022198,65623.40881389238,19329.448262530004,,
 static,0.1,robust,8,88298.35690575185,9576.838833058617,60276.33022450666,13359.490452744656,72.0920243339594,6.7706096714767865,0.8745305748491641,0.010083585815241344,60276.33022450666,13359.490452744656,,
 static,0.25,no_robust,6,95581.63603909909,8345.698435455577,85253.22060752509,13111.526873622026,74.43788116042678,2.1078820386097368,0.8774483618896327,0.0037254791853004897,85253.22060752509,13111.526873622026,,
 static,0.25,robust,5,85642.97376233719,9472.880627242153,61637.13336374452,15937.429780623212,71.38649508309966,4.0264905454627264,0.8739285904097794,0.005323853359397925,61637.13336374452,15937.429780623212,,
 static,0.4,no_robust,4,84465.04245981346,12101.831388745604,63613.81812329075,7778.361846092061,67.5782271530322,3.9088888968092,0.8666205147756862,0.007149121199217965,63613.81812329075,7778.361846092061,,
 static,0.4,robust,3,86315.88251933573,8642.748496122398,78672.47758108922,17823.74997200773,71.24783962051879,2.790416943786253,0.8733839625792507,0.005990544453538607,78672.47758108922,17823.74997200773,,
 static,0.6,no_robust,5,81385.88962988024,12343.523894997037,64752.43216774836,23486.779472906223,71.36959177224794,5.100226704959064,0.874353948320141,0.007787250295491337,64752.43216774836,23486.779472906223,,
 static,0.6,robust,6,82424.78357829548,9831.886701625144,58689.56808824368,12672.506035553573,69.65698271038197,3.484982360048201,0.8701253899758701,0.005917711231889304,58689.56808824368,12672.506035553573,,
 static,0.8,no_robust,7,73226.06364450825,4447.877985963851,54700.340767716196,14406.881298569717,68.32867561883204,3.68262917356943,0.8679204886788817,0.007467501164611224,54700.340767716196,14406.881298569717,,
 static,0.8,robust,3,75922.69770770498,5046.089536162847,54849.564836072976,22780.98012221352,69.17784723148274,1.5268167784698885,0.8711991412754405,0.0033278715575433297,54849.564836072976,22780.98012221352,,
 surge,0.0,no_robust,6,11975.290738176132,411.4052900076416,4418.832131346071,896.5828048394391,16.192056219479124,0.8040364003224534,0.4317940274006973,0.008271862690929055,4418.832131346071,896.5828048394391,,
 surge,0.0,robust,6,11368.553135742462,623.8217438159004,4174.6562770928085,639.9963040241264,16.20693115067868,0.9853827520149101,0.4337249214539392,0.010371668289035135,4174.6562770928085,639.9963040241264,,
 surge,0.1,no_robust,5,11739.084232858655,332.778792718381,5058.659087494994,1110.8409258976824,16.722948073839394,0.6578121995950104,0.4377682402562083,0.005683401047550787,5058.659087494994,1110.8409258976824,,
 surge,0.1,robust,2,11908.871668592743,81.41250285550258,4045.8883900289775,784.7169500268457,16.5783528934624,0.4088194924856508,0.4359031943776225,0.004531137621699143,4045.8883900289775,784.7169500268457,,
 surge,0.25,no_robust,7,11369.223138855004,236.1121240061105,4754.4980344481255,1038.0550037539617,16.359045119223275,0.3945156775653057,0.4329514652531622,0.0038762110261952457,4754.4980344481255,1038.0550037539617,,
 surge,0.25,robust,10,11241.013200689158,684.503587066406,4673.284299575493,1187.78635131025,16.65989576694279,1.0515950311117155,0.4360264800834576,0.009701952962125513,4673.284299575493,1187.78635131025,,
 surge,0.4,no_robust,6,11006.168409400554,364.6584583108646,4227.535704048808,1414.7964077877168,16.365391636138824,0.9138430058543858,0.4332855262584901,0.008024003783434592,4227.535704048808,1414.7964077877168,,
 surge,0.4,robust,6,10533.13118175624,526.0758051960169,4256.093156292146,783.7965507386594,15.862646418833448,0.7732699435426456,0.42770414581632693,0.008967505611725135,4256.093156292146,783.7965507386594,,
 surge,0.6,no_robust,5,10139.2472848498,97.448078425168,3251.037082975553,742.2100315641153,16.26429537781848,0.4432465691073604,0.4329686574409998,0.004121820888165019,3251.037082975553,742.2100315641153,,
 surge,0.6,robust,2,10447.04164810777,524.0029334247373,5311.611043281193,1808.6200710093085,16.49826042222505,0.6088756908260344,0.43427092746638946,0.007817511630542989,5311.611043281193,1808.6200710093085,,
 surge,0.8,no_robust,3,9678.259826640971,272.83530913170915,3204.3479815026553,556.8799617962688,16.840420745981802,0.4589959822922529,0.43920385308157944,0.004953937449529005,3204.3479815026553,556.8799617962688,,
 surge,0.8,robust,3,10101.413689120902,526.8318040489241,4321.442189833087,1284.166148011517,17.737567199677557,0.6586775330563983,0.44627248379841095,0.004644261847052545,4321.442189833087,1284.166148011517,,
--- a/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_mode_summary.csv
+++ b/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_mode_summary.csv
@@ -0,0 +1,11 @@
 tier,mode,runs,eval_revenue_mean_mean,eval_revenue_mean_std,eval_reward_mean_mean,eval_reward_mean_std,eval_coi_level_mean_mean,eval_coi_level_mean_std,eval_margin_mean_mean,eval_margin_mean_std,objective_score_mean,objective_score_std,train_alpha_adv_mean,train_alpha_adv_std
 dqn,no_robust,29,315185.66674813855,23538.781000060844,302576.8036266896,62951.88633145167,136.82560356086017,1.3692652218935986,0.9313739013618878,0.0009314135057224836,302576.8036266896,62951.88633145167,0.45740740740740693,0.2368477698794438
 dqn,robust,37,306875.13950902375,27585.74444520695,283724.7169827867,69843.05611741856,136.68837571992978,2.3797541654948753,0.9312171495138941,0.0016512408492580111,283724.7169827867,69843.05611741856,0.5058198198198196,0.28324483129860284
 linear,no_robust,47,315501.15296155965,27105.014861872147,298149.1730416604,67664.7308344108,134.36884359609928,0.29743647613433244,0.9303607531364,0.0002152647006739543,298149.1730416604,67664.7308344108,,
 linear,robust,31,306269.9232239004,26399.875293394463,279872.824370329,54401.104602086416,134.32737693008372,0.31909212993628877,0.9303375215162144,0.00025000448833182963,279872.824370329,54401.104602086416,,
 qtable,no_robust,32,259818.72178238883,67188.58622318009,222088.83510765125,94450.12569617687,116.84641954166946,22.42810298937963,0.9140582213134033,0.02778864370791322,222088.83510765125,94450.12569617687,0.29218749999999993,0.2559326319498438
 qtable,robust,40,244470.50673219413,78666.30912808319,216920.53697298188,93983.50987622296,118.94013969887506,23.1428303249914,0.9178608956089163,0.023827311253270544,216920.53697298188,93983.50987622296,0.4396239583333334,0.29521865862482416
 static,no_robust,33,85228.452028227,12041.415672002751,64828.579890468536,17681.280330831738,70.58818912317687,4.204964531595236,0.8721419294578765,0.007107262779462876,64828.579890468536,17681.280330831738,,
 static,robust,30,84963.18577955024,8926.291379160475,63243.76603076817,14880.924342692271,70.94358095957392,4.363134562111469,0.8730306888410219,0.006660289247744752,63243.76603076817,14880.924342692271,,
 surge,no_robust,32,11121.867310184698,809.9895800277001,4260.038064073964,1160.4282377968032,16.416108827015794,0.641203520341943,0.43413855082681374,0.006214799767130059,4260.038064073964,1160.4282377968032,,
 surge,robust,29,10994.355365953365,750.5115890942825,4448.160863178768,1000.7519971246122,16.495943148858906,0.9823026347466668,0.4347587896392907,0.009698591291108968,4448.160863178768,1000.7519971246122,,
--- a/paper/src/chapters/figures/results/generated/legacy/first_sweep_top_configs.csv
+++ b/paper/src/chapters/figures/results/generated/legacy/first_sweep_top_configs.csv
@@ -0,0 +1,26 @@
 Name,tier,alpha,mode,objective/score,eval/revenue_mean,eval/reward_mean,eval/coi_level_mean,lambda_coi,robust_radius,learning_rate,batch_size,n_steps,total_timesteps
 eager-sweep-244,dqn,0.0,no_robust,413274.4339549909,355872.06196128257,413274.4339549909,136.722140138007,0.2,0.1,0.0003,256,4096,15000
 efficient-sweep-319,linear,0.0,no_robust,410094.0151741567,353309.5198146561,410094.0151741567,134.55152038805429,0.4,0.1,0.001,128,4096,15000
 swept-sweep-422,linear,0.0,no_robust,403130.32747386186,347611.2815474988,403130.32747386186,133.8559785775022,0.4,0.3,0.0001,512,1024,15000
 decent-sweep-478,linear,0.1,no_robust,400452.36418713134,345284.5750647792,400452.36418713134,134.73082941975588,0.1,0.2,0.001,128,1024,50000
 eternal-sweep-339,linear,0.1,no_robust,399628.4231731644,344154.38525771734,399628.4231731644,134.89479277649667,0.4,0.1,0.0001,256,1024,50000
 ethereal-sweep-21,dqn,0.1,no_robust,398492.807245857,343580.6802427996,398492.807245857,136.67160732585188,0.1,0.2,0.001,512,2048,50000
 dark-sweep-418,linear,0.1,no_robust,394615.3720658343,339749.76272695075,394615.3720658343,134.39233246711,0.2,0.1,0.0003,256,1024,50000
 wandering-sweep-122,dqn,0.0,robust,394061.3617726404,339512.43434806296,394061.3617726404,137.6864755964331,0.1,0.3,0.0001,256,2048,30000
 laced-sweep-132,dqn,0.1,robust,389274.54998495104,335600.5979215904,389274.54998495104,137.36888574027677,0.4,0.2,0.001,256,2048,30000
 rich-sweep-53,qtable,0.0,robust,388601.2626147048,335630.6853337664,388601.2626147048,133.4414069888203,0.2,0.1,0.0001,512,1024,50000
 faithful-sweep-430,qtable,0.25,no_robust,387035.6970938766,333255.5771210341,387035.6970938766,137.4906091183188,0.1,0.2,0.0003,128,1024,15000
 dark-sweep-280,qtable,0.25,no_robust,386318.8845004527,332220.0316564078,386318.8845004527,137.26992450099925,0.4,0.1,0.0001,256,1024,50000
 chocolate-sweep-383,linear,0.25,no_robust,383989.49015403807,331071.7003244704,383989.49015403807,134.60590742050857,0.1,0.2,0.001,512,1024,30000
 dry-sweep-263,dqn,0.0,robust,383372.6880637367,330436.0312615148,383372.6880637367,137.40558130223476,0.1,0.3,0.001,128,1024,50000
 different-sweep-143,qtable,0.0,robust,383278.4198015018,330546.16800945485,383278.4198015018,135.9021538079678,0.1,0.3,0.001,256,2048,30000
 woven-sweep-139,dqn,0.25,robust,382788.1296637251,329427.735752473,382788.1296637251,136.8968339394894,0.1,0.1,0.001,512,1024,15000
 dark-sweep-215,dqn,0.25,robust,382358.2401374872,329330.0097603144,382358.2401374872,137.64528612332785,0.2,0.1,0.0001,512,4096,30000
 charmed-sweep-136,linear,0.25,no_robust,382249.5728044314,329646.2053260979,382249.5728044314,134.46825608007862,0.4,0.1,0.0001,256,2048,15000
 light-sweep-308,linear,0.0,robust,381939.1275250679,329628.9436641051,381939.1275250679,133.6209821974879,0.2,0.2,0.001,128,4096,30000
 treasured-sweep-325,linear,0.25,robust,381322.0104772589,328353.58675398555,381322.0104772589,134.8950293943581,0.1,0.1,0.0001,512,2048,15000
 fine-sweep-202,dqn,0.25,robust,378751.33572275366,326518.9068184018,378751.33572275366,137.2900973301052,0.1,0.2,0.0001,512,2048,30000
 treasured-sweep-380,linear,0.25,no_robust,377898.0979419424,325869.1953595453,377898.0979419424,134.54118723889738,0.4,0.3,0.001,128,1024,50000
 pretty-sweep-49,qtable,0.25,robust,377318.4766808995,325282.0152823859,377318.4766808995,137.19609012644068,0.4,0.1,0.0001,128,4096,50000
 desert-sweep-253,linear,0.25,robust,376808.6335063269,325146.3478714648,376808.6335063269,134.48396340732663,0.2,0.1,0.0003,256,1024,30000
 jolly-sweep-133,qtable,0.4,no_robust,376419.57394710975,323709.24588324485,376419.57394710975,137.8349363778071,0.1,0.3,0.0001,128,2048,50000
--- a/paper/src/chapters/figures/results/generated/legacy/plots/first_sweep_tier_revenue.pdf
+++ b/paper/src/chapters/figures/results/generated/legacy/plots/first_sweep_tier_revenue.pdf
--- a/paper/src/chapters/figures/results/generated/legacy/plots/ppo_alpha_curves.pdf
+++ b/paper/src/chapters/figures/results/generated/legacy/plots/ppo_alpha_curves.pdf
--- a/paper/src/chapters/figures/results/generated/legacy/plots/ppo_delta_curves.pdf
+++ b/paper/src/chapters/figures/results/generated/legacy/plots/ppo_delta_curves.pdf
--- a/paper/src/chapters/figures/results/generated/legacy/plots/ppo_tradeoff_scatter.pdf
+++ b/paper/src/chapters/figures/results/generated/legacy/plots/ppo_tradeoff_scatter.pdf
--- a/paper/src/chapters/figures/results/generated/legacy/ppo_alpha_deltas.csv
+++ b/paper/src/chapters/figures/results/generated/legacy/ppo_alpha_deltas.csv
@@ -0,0 +1,7 @@
 alpha,runs_robust,runs_no_robust,eval_revenue_mean_robust,eval_revenue_mean_no_robust,eval_revenue_mean_delta,eval_revenue_mean_delta_pct,eval_reward_mean_robust,eval_reward_mean_no_robust,eval_reward_mean_delta,eval_reward_mean_delta_pct,eval_coi_level_mean_robust,eval_coi_level_mean_no_robust,eval_coi_level_mean_delta,eval_coi_level_mean_delta_pct,eval_coi_leakage_mean_robust,eval_coi_leakage_mean_no_robust,eval_coi_leakage_mean_delta,eval_coi_leakage_mean_delta_pct,eval_volatility_mean_robust,eval_volatility_mean_no_robust,eval_volatility_mean_delta,eval_volatility_mean_delta_pct,eval_margin_mean_robust,eval_margin_mean_no_robust,eval_margin_mean_delta,eval_margin_mean_delta_pct,train_alpha_adv_robust,train_alpha_adv_no_robust,train_alpha_adv_delta,train_alpha_adv_delta_pct,train_coi_penalty_robust,train_coi_penalty_no_robust,train_coi_penalty_delta,train_coi_penalty_delta_pct,train_ux_penalty_robust,train_ux_penalty_no_robust,train_ux_penalty_delta,train_ux_penalty_delta_pct,train_agent_prob_robust,train_agent_prob_no_robust,train_agent_prob_delta,train_agent_prob_delta_pct
 0.0,4.0,4.0,3379.9042994670963,3565.2912010160844,-185.38690154898813,-5.199768857482219,313527.4707462,331300.229069,-17772.758322799986,-5.364547550342456,137.08358925982625,137.28764358955686,-0.2040543297306101,-0.14863269875959326,0.1146626165658294,0.11861133504329742,-0.003948718477468013,-3.3291240470622716,0.06687153537785637,0.06445662162531288,0.0024149137525434905,3.746572022625408,0.9315273502623671,0.9317078361627993,-0.00018048590043218127,-0.019371512552207898,0.18958333333333333,,,,5.553200113221484,,,,61.35134238638615,66.58479574844135,-5.233453362055201,-7.859832418540847,0.12778212146468534,0.11615891320235115,0.011623208262334192,10.00629907933654
 0.1,4.0,4.0,3307.028238366196,3458.002436284769,-150.97419791857283,-4.365936713473732,306772.49146475,321215.477968,-14442.986503249966,-4.4963544704059375,137.1182041122497,136.82757579763506,0.29062831461465066,0.21240478238427865,0.1128546052304944,0.11704917861668755,-0.004194573386193154,-3.5835991638433753,0.0685405649303561,0.06737596899527175,0.0011645959350843477,1.728503430007924,0.9315331673960889,0.9313276818191593,0.00020548557692967595,0.0220637248243606,0.2818749999999999,0.1,0.18187499999999987,181.87499999999986,5.079528726095333,,,,52.44772950699336,53.288869747139515,-0.841140240146153,-1.578453895039319,0.11644381911386253,0.11765277436070229,-0.0012089552468397546,-1.0275620387270383
 0.25,4.0,4.0,3134.3438215278165,3300.5539051855053,-166.21008365768876,-5.035823938416998,290691.4771835,306522.90003785,-15831.422854350007,-5.16484179563586,136.89990884669214,136.71752459667877,0.18238425001337077,0.1334022471160229,0.11113957413522965,0.1139905600539111,-0.0028509859186814507,-2.50107194607439,0.06427159998376095,0.06846858821082077,-0.004196988227059828,-6.12980103246314,0.9314501501825461,0.9313053225630614,0.0001448276194846443,0.015551035302371268,0.44833333333333336,0.25,0.19833333333333336,79.33333333333334,4.7183804755060255,,,,49.04307009982127,55.2030005738411,-6.159930474019831,-11.158687770568074,0.10998505830218755,0.11684259343269415,-0.0068575351305066035,-5.869037077182653
 0.4,4.0,4.0,2983.852437569374,3180.7872854626567,-196.9348478932825,-6.191386918369099,276545.26309355,295433.5405797,-18888.277486150037,-6.393409986248494,136.19210761854086,136.5783021470118,-0.38619452847095204,-0.2827641890402586,0.10875560547061063,0.11189234314151972,-0.0031367376709090927,-2.8033532794480807,0.07452230347799255,0.07104688223410768,0.003475421243884863,4.891729425132195,0.9307282962514367,0.9310542820602117,-0.0003259858087749645,-0.03501254599824534,0.5999999999999999,0.4000000000000001,0.1999999999999998,49.999999999999936,4.174996403604185,,,,47.99794119802058,50.794260008988424,-2.796318810967847,-5.505186630286606,0.10222958892923095,0.11161526349272373,-0.009385674563492777,-8.408952565976458
 0.6,4.0,4.0,2789.0434220430398,2982.2460998252786,-193.20267778223888,-6.4784283830083,258688.11700405,277051.95613675,-18363.8391327,-6.628301560749781,136.86774320500828,136.81931587629953,0.04842732870875466,0.035395096371142916,0.10501047827147733,0.10802266412956946,-0.0030121858580921257,-2.788475809557069,0.06914180963767007,0.06698591531512615,0.0021558943225439137,3.2184292957732996,0.9314130089130337,0.9313849217310588,2.8087181974889575e-05,0.003015636319588161,0.7733333333333334,0.5999999999999999,0.17333333333333356,28.888888888888935,4.178300996512875,,,,39.928062615509425,47.86860429278531,-7.940541677275881,-16.588203885594947,0.11297979438696983,0.1162670925925253,-0.0032872982055554695,-2.827367686122743
 0.8,4.0,4.0,2586.098242115281,2841.1305915063504,-255.03234939106915,-8.97643882169642,239765.24959855,264140.55002745,-24375.300428900024,-9.228155399224729,136.5038826686135,137.28163778418497,-0.7777551155714661,-0.5665397995864124,0.10253056902792507,0.1031498585902154,-0.0006192895622903344,-0.6003784888844036,0.07325665736408164,0.06592454978099352,0.007332107583088124,11.1219683827132,0.9311235469993302,0.9316596013994161,-0.0005360544000858614,-0.05753758124541101,1.0,0.8000000000000002,0.19999999999999984,24.99999999999998,3.5384100686094007,,,,37.14414699970415,37.43809775029793,-0.29395075059377973,-0.7851647606519765,0.09990322635678014,0.10432800196112454,-0.0044247756043444,-4.241215705437541
--- a/paper/src/chapters/figures/results/generated/legacy/ppo_alpha_mode_summary.csv
+++ b/paper/src/chapters/figures/results/generated/legacy/ppo_alpha_mode_summary.csv
@@ -0,0 +1,13 @@
 alpha,mode,runs,eval_revenue_mean_mean,eval_revenue_mean_std,eval_reward_mean_mean,eval_reward_mean_std,eval_coi_level_mean_mean,eval_coi_level_mean_std,eval_coi_leakage_mean_mean,eval_coi_leakage_mean_std,eval_volatility_mean_mean,eval_volatility_mean_std,eval_margin_mean_mean,eval_margin_mean_std,train_alpha_adv_mean,train_alpha_adv_std,train_coi_penalty_mean,train_coi_penalty_std,train_ux_penalty_mean,train_ux_penalty_std,train_agent_prob_mean,train_agent_prob_std
 0.0,no_robust,4,3565.2912010160844,52.219179508209216,331300.229069,5038.96659004527,137.28764358955686,0.6434240315013728,0.11861133504329742,0.004019332768284657,0.06445662162531288,0.004080405219050139,0.9317078361627993,0.00038018051704976865,,,,,66.58479574844135,32.282270089830455,0.11615891320235115,0.016558627227281013
 0.0,robust,4,3379.9042994670963,54.727408939657735,313527.4707462,5408.058196552377,137.08358925982625,1.047386315387148,0.1146626165658294,0.0025627354157035497,0.06687153537785637,0.008577061675868377,0.9315273502623671,0.0007274203134899985,0.18958333333333333,0.02083333333333336,5.553200113221484,0.45981481828856186,61.35134238638615,30.27964905193963,0.12778212146468534,0.027929667978205217
 0.1,no_robust,4,3458.002436284769,60.75923217871363,321215.477968,6016.373193216596,136.82757579763506,1.1899102161551907,0.11704917861668755,0.0021220259908233973,0.06737596899527175,0.006801136773079149,0.9313276818191593,0.0008352263172197586,0.1,0.0,,,53.288869747139515,18.480340945815023,0.11765277436070229,0.017544197575138736
 0.1,robust,4,3307.028238366196,35.58495715224888,306772.49146475,3488.2690530060245,137.1182041122497,0.8582218376452346,0.1128546052304944,0.0005963155492967403,0.0685405649303561,0.0050673362512629015,0.9315331673960889,0.0005217376436765336,0.2818749999999999,0.03624999999999999,5.079528726095333,0.6109585102054891,52.44772950699336,29.0263361696475,0.11644381911386253,0.021152545180088765
 0.25,no_robust,4,3300.5539051855053,50.460978662647115,306522.90003785,4860.668937531515,136.71752459667877,0.7410676951244369,0.1139905600539111,0.003319948537321803,0.06846858821082077,0.008614994548315848,0.9313053225630614,0.0004919872662680591,0.25,0.0,,,55.2030005738411,26.88247558235345,0.11684259343269415,0.013462146346772591
 0.25,robust,4,3134.3438215278165,64.06834403659167,290691.4771835,6331.196493752059,136.89990884669214,1.3796663751798552,0.11113957413522965,0.0015044942041406348,0.06427159998376095,0.0042331619171274894,0.9314501501825461,0.0008939739741734515,0.44833333333333336,0.0033333333333333518,4.7183804755060255,0.4538389380858333,49.04307009982127,28.20484665432831,0.10998505830218755,0.010731404693185651
 0.4,no_robust,4,3180.7872854626567,71.87564776824694,295433.5405797,7035.374110540269,136.5783021470118,1.7095219574599192,0.11189234314151972,0.0013821115134030936,0.07104688223410768,0.005766138692685495,0.9310542820602117,0.0013989725050689828,0.4000000000000001,0.0,,,50.794260008988424,24.836708377642946,0.11161526349272373,0.005787749200301594
 0.4,robust,4,2983.852437569374,45.51290575912758,276545.26309355,4555.1725323898245,136.19210761854086,1.5546063667946701,0.10875560547061063,0.001118798290958954,0.07452230347799255,0.0040446395928049874,0.9307282962514367,0.0013558080014763189,0.5999999999999999,0.0,4.174996403604185,0.12189448324552496,47.99794119802058,33.51782503281748,0.10222958892923095,0.0031686467591609474
 0.6,no_robust,4,2982.2460998252786,39.93674476199945,277051.95613675,3931.02017169463,136.81931587629953,1.1995405806950865,0.10802266412956946,0.000405835985606262,0.06698591531512615,0.002805894772223563,0.9313849217310588,0.0008100530228792662,0.5999999999999999,0.0,,,47.86860429278531,23.830502772642472,0.1162670925925253,0.028676813474186293
 0.6,robust,4,2789.0434220430398,35.297482315631626,258688.11700405,3420.6735023624556,136.86774320500828,0.7097303238857778,0.10501047827147733,0.0008273121554488608,0.06914180963767007,0.009066158371268139,0.9314130089130337,0.0005024421703994162,0.7733333333333334,0.053333333333333385,4.178300996512875,0.5865970573865015,39.928062615509425,30.25078643153115,0.11297979438696983,0.0274101056520461
 0.8,no_robust,4,2841.1305915063504,21.84043179776092,264140.55002745,2073.353315114627,137.28163778418497,0.6288968799501957,0.1031498585902154,0.0012877581835795701,0.06592454978099352,0.00340700896766341,0.9316596013994161,0.00038430108058413553,0.8000000000000002,0.0,,,37.43809775029793,32.01740090550489,0.10432800196112454,0.018337841526911584
 0.8,robust,4,2586.098242115281,48.05539265296157,239765.24959855,4681.6472175597555,136.5038826686135,1.0611320896043694,0.10253056902792507,0.002587472569909977,0.07325665736408164,0.0015359324114246234,0.9311235469993302,0.0006145440308596868,1.0,0.0,3.5384100686094007,0.391972726035734,37.14414699970415,25.614063825315505,0.09990322635678014,0.010269342031085898
--- a/paper/src/chapters/figures/results/generated/legacy/ppo_headline_summary.json
+++ b/paper/src/chapters/figures/results/generated/legacy/ppo_headline_summary.json
@@ -0,0 +1,7 @@
 {
  "status": "ok",
  "revenue_delta": -191.29017636530716,
  "revenue_delta_pct": -5.938226273545598,
  "coi_leakage_delta": -0.002960415145605702,
  "coi_leakage_delta_pct": -2.6404147469510946
 }
--- a/paper/src/chapters/figures/results/generated/legacy/ppo_overall_mode_summary.csv
+++ b/paper/src/chapters/figures/results/generated/legacy/ppo_overall_mode_summary.csv
@@ -0,0 +1,3 @@
 mode,runs,eval_revenue_mean_mean,eval_revenue_mean_std,eval_reward_mean_mean,eval_reward_mean_std,eval_coi_level_mean_mean,eval_coi_level_mean_std,eval_coi_leakage_mean_mean,eval_coi_leakage_mean_std,eval_volatility_mean_mean,eval_volatility_mean_std,eval_margin_mean_mean,eval_margin_mean_std,train_alpha_adv_mean,train_alpha_adv_std,train_coi_penalty_mean,train_coi_penalty_std,train_ux_penalty_mean,train_ux_penalty_std,train_agent_prob_mean,train_agent_prob_std
 no_robust,24,3221.335253213441,262.46595166337727,299277.442303125,24382.561944761477,136.9186666318945,1.0038463876967063,0.11211932326253345,0.005805494533542669,0.06737642102693879,0.005402738047823369,0.9314066076226178,0.0007436370959663933,0.43,0.2546411303445653,,,51.86293802024894,25.340287421525442,0.11381077317368686,0.016664235359362907
 robust,24,3030.0450768481337,288.262657026656,280998.34484843333,26820.020161880373,136.77757261848845,1.06224696086916,0.10915890811692774,0.004616462637659704,0.06943407846195294,0.006435789449278624,0.9312959200008004,0.0007858424519830652,0.5488541666666666,0.2860373751485706,4.540469463924883,0.7906156355346259,47.985382134405825,27.407657819442747,0.11155393475895271,0.01943348418653492
--- a/paper/src/chapters/figures/results/generated/legacy/ppo_pairwise_win_rates.csv
+++ b/paper/src/chapters/figures/results/generated/legacy/ppo_pairwise_win_rates.csv
@@ -0,0 +1,25 @@
 alpha,metric,direction,wins,ties,total_pairs,win_probability
 0.0,eval/revenue_mean,higher,0,0,16,0.0
 0.0,eval/reward_mean,higher,0,0,16,0.0
 0.0,eval/coi_leakage_mean,lower,14,0,16,0.875
 0.0,eval/volatility_mean,lower,8,0,16,0.5
 0.1,eval/revenue_mean,higher,0,0,16,0.0
 0.1,eval/reward_mean,higher,0,0,16,0.0
 0.1,eval/coi_leakage_mean,lower,16,0,16,1.0
 0.1,eval/volatility_mean,lower,8,0,16,0.5
 0.25,eval/revenue_mean,higher,0,0,16,0.0
 0.25,eval/reward_mean,higher,0,0,16,0.0
 0.25,eval/coi_leakage_mean,lower,12,0,16,0.75
 0.25,eval/volatility_mean,lower,11,0,16,0.6875
 0.4,eval/revenue_mean,higher,0,0,16,0.0
 0.4,eval/reward_mean,higher,0,0,16,0.0
 0.4,eval/coi_leakage_mean,lower,16,0,16,1.0
 0.4,eval/volatility_mean,lower,6,0,16,0.375
 0.6,eval/revenue_mean,higher,0,0,16,0.0
 0.6,eval/reward_mean,higher,0,0,16,0.0
 0.6,eval/coi_leakage_mean,lower,16,0,16,1.0
 0.6,eval/volatility_mean,lower,7,0,16,0.4375
 0.8,eval/revenue_mean,higher,0,0,16,0.0
 0.8,eval/reward_mean,higher,0,0,16,0.0
 0.8,eval/coi_leakage_mean,lower,11,0,16,0.6875
 0.8,eval/volatility_mean,lower,0,0,16,0.0
--- a/paper/src/chapters/figures/results/includes/final/final_focus_revenue_by_alpha.tex
+++ b/paper/src/chapters/figures/results/includes/final/final_focus_revenue_by_alpha.tex
@@ -0,0 +1 @@
 \includegraphics[width=0.98\linewidth]{chapters/figures/results/generated/final/plots/final_focus_revenue_by_alpha.pdf}
--- a/paper/src/chapters/figures/results/includes/final/final_focus_revenue_delta.tex
+++ b/paper/src/chapters/figures/results/includes/final/final_focus_revenue_delta.tex
@@ -0,0 +1 @@
 \includegraphics[width=0.95\linewidth]{chapters/figures/results/generated/final/plots/final_focus_revenue_delta.pdf}
--- a/paper/src/chapters/figures/results/includes/final/final_focus_risk_deltas.tex
+++ b/paper/src/chapters/figures/results/includes/final/final_focus_risk_deltas.tex
@@ -0,0 +1 @@
 \includegraphics[width=0.95\linewidth]{chapters/figures/results/generated/final/plots/final_focus_risk_deltas.pdf}
--- a/paper/src/chapters/figures/results/includes/legacy/first_sweep_tier_revenue.tex
+++ b/paper/src/chapters/figures/results/includes/legacy/first_sweep_tier_revenue.tex
@@ -0,0 +1 @@
 \includegraphics[width=0.99\linewidth]{chapters/figures/results/generated/legacy/plots/first_sweep_tier_revenue.pdf}
--- a/paper/src/chapters/figures/results/includes/legacy/ppo_alpha_curves.tex
+++ b/paper/src/chapters/figures/results/includes/legacy/ppo_alpha_curves.tex
@@ -0,0 +1 @@
 \includegraphics[width=0.98\linewidth]{chapters/figures/results/generated/legacy/plots/ppo_alpha_curves.pdf}
--- a/paper/src/chapters/figures/results/includes/legacy/ppo_delta_curves.tex
+++ b/paper/src/chapters/figures/results/includes/legacy/ppo_delta_curves.tex
@@ -0,0 +1 @@
 \includegraphics[width=0.98\linewidth]{chapters/figures/results/generated/legacy/plots/ppo_delta_curves.pdf}
--- a/paper/src/chapters/figures/results/includes/legacy/ppo_tradeoff_scatter.tex
+++ b/paper/src/chapters/figures/results/includes/legacy/ppo_tradeoff_scatter.tex
@@ -0,0 +1 @@
 \includegraphics[width=0.88\linewidth]{chapters/figures/results/generated/legacy/plots/ppo_tradeoff_scatter.pdf}
--- a/paper/src/chapters/figures/results/plot_results.py
+++ b/paper/src/chapters/figures/results/plot_results.py
@@ -0,0 +1,313 @@
 from __future__ import annotations
 import argparse
 from pathlib import Path
 import matplotlib
 matplotlib.use("Agg")
 import matplotlib.pyplot as plt
 from matplotlib.ticker import FuncFormatter
 import numpy as np
 import pandas as pd
 from process_first_sweep import run as run_first_sweep
 from process_ppo_benchmark import run as run_ppo_benchmark
 def _output_dir() -> Path:
    return Path(__file__).resolve().parent / "generated" / "legacy"
 def _plot_dir() -> Path:
    """Return the ``plots`` subdirectory of the default output directory."""
    base = _output_dir()
    return base.joinpath("plots")
 def _configure_style() -> None:
    """Install the matplotlib rcParams shared by every figure in this script."""
    # Typeface and text sizes.
    text_settings = {
        "font.family": "serif",
        "font.size": 10,
        "axes.titlesize": 10,
        "axes.labelsize": 9,
        "legend.fontsize": 8,
        "xtick.labelsize": 8,
        "ytick.labelsize": 8,
    }
    # On-screen and saved-file resolution.
    resolution_settings = {"figure.dpi": 220, "savefig.dpi": 320}
    # Open top/right spines with a faint grid.
    frame_settings = {
        "axes.spines.top": False,
        "axes.spines.right": False,
        "axes.grid": True,
        "grid.alpha": 0.22,
    }
    plt.rcParams.update({**text_settings, **resolution_settings, **frame_settings})
 def _fmt_thousands(value: float, _: int) -> str:
    return f"{int(value):,}"
 def _load_csv(path: Path) -> pd.DataFrame:
    if not path.exists():
        raise FileNotFoundError(f"Missing required input: {path}")
    return pd.read_csv(path)
 def _plot_ppo_alpha_curves(alpha_mode: pd.DataFrame, out_dir: Path) -> Path:
    """Plot four evaluation metrics against alpha for both modes and save a PDF.

    Each of the 2x2 panels draws the ``<metric>_mean`` column for the
    ``no_robust`` and ``robust`` rows of ``alpha_mode``. When the matching
    ``<metric>_std`` column exists, a band of plus/minus that value is shaded
    around the line (missing std values count as zero).

    Returns:
        Path of the written ``ppo_alpha_curves.pdf`` inside ``out_dir``.
    """
    # Insertion order fixes the drawing (and therefore legend) order.
    appearance = {
        "no_robust": {"color": "#4C72B0", "label": "Non-robust"},
        "robust": {"color": "#C44E52", "label": "Robust"},
    }
    panel_specs = [
        ("eval_revenue_mean", "Mean Episode Revenue", "Revenue"),
        ("eval_reward_mean", "Mean Episode Reward", "Reward"),
        ("eval_coi_leakage_mean", "Mean COI Leakage", "COI Leakage"),
        ("eval_volatility_mean", "Mean Price Volatility", "Volatility"),
    ]
    thousands_panels = {"eval_revenue_mean", "eval_reward_mean"}
    fig, axes = plt.subplots(2, 2, figsize=(9.3, 6.4), constrained_layout=True)
    for ax, (prefix, title, ylabel) in zip(axes.flat, panel_specs):
        band_col = f"{prefix}_std"
        for mode, look in appearance.items():
            rows = alpha_mode.loc[alpha_mode["mode"] == mode].sort_values("alpha")
            if rows.empty:
                continue
            xs = rows["alpha"].to_numpy(dtype=float)
            ys = rows[f"{prefix}_mean"].to_numpy(dtype=float)
            ax.plot(xs, ys, marker="o", linewidth=1.8, markersize=4, **look)
            if band_col not in rows.columns:
                continue
            spread = rows[band_col].fillna(0.0).to_numpy(dtype=float)
            ax.fill_between(
                xs,
                ys - spread,
                ys + spread,
                color=look["color"],
                alpha=0.14,
                linewidth=0,
            )
        ax.set_title(title)
        ax.set_xlabel(r"Contamination $\alpha$")
        ax.set_ylabel(ylabel)
        ax.set_xticks(sorted(alpha_mode["alpha"].unique()))
        if prefix in thousands_panels:
            ax.yaxis.set_major_formatter(FuncFormatter(_fmt_thousands))
    # One shared legend, taken from the first panel's artists.
    legend_handles, legend_labels = axes.flat[0].get_legend_handles_labels()
    fig.legend(
        legend_handles,
        legend_labels,
        ncol=2,
        loc="upper center",
        bbox_to_anchor=(0.5, 1.02),
    )
    target = out_dir / "ppo_alpha_curves.pdf"
    fig.savefig(target, bbox_inches="tight")
    plt.close(fig)
    return target
 def _plot_ppo_delta_curves(deltas: pd.DataFrame, out_dir: Path) -> Path:
    """Plot percentage deltas against alpha in two stacked panels and save a PDF.

    The top panel shows the revenue and reward ``*_delta_pct`` columns, the
    bottom panel the COI-leakage and volatility ones. Both panels carry a
    dashed zero reference line.

    Returns:
        Path of the written ``ppo_delta_curves.pdf`` inside ``out_dir``.
    """
    ordered = deltas.sort_values("alpha")
    alphas = ordered["alpha"].to_numpy(dtype=float)
    # One tuple of (column, legend label, colour) series per panel, top first.
    panel_series = (
        (
            ("eval_revenue_mean_delta_pct", "Revenue", "#4C72B0"),
            ("eval_reward_mean_delta_pct", "Reward", "#8172B3"),
        ),
        (
            ("eval_coi_leakage_mean_delta_pct", "COI Leakage", "#55A868"),
            ("eval_volatility_mean_delta_pct", "Volatility", "#DD8452"),
        ),
    )
    line_look = {"marker": "o", "linewidth": 1.8, "markersize": 4}
    fig, axes = plt.subplots(2, 1, figsize=(8.6, 6.0), constrained_layout=True)
    for ax, series in zip(axes, panel_series):
        for column, label, color in series:
            ax.plot(
                alphas,
                ordered[column].to_numpy(dtype=float),
                color=color,
                label=label,
                **line_look,
            )
        ax.axhline(0.0, color="#444444", linewidth=1.0, linestyle="--")
        ax.set_ylabel("Delta (%)")
        ax.set_xlabel(r"Contamination $\alpha$")
        ax.set_xticks(alphas)
        ax.legend(loc="lower left")
    # Only the top panel is titled.
    axes[0].set_title("Robust Minus Non-robust Delta by Contamination")
    target = out_dir / "ppo_delta_curves.pdf"
    fig.savefig(target, bbox_inches="tight")
    plt.close(fig)
    return target
 def _plot_ppo_tradeoff_scatter(deltas: pd.DataFrame, out_dir: Path) -> Path:
    """Scatter revenue delta (%) against COI-leakage delta (%), coloured by alpha.

    Every point is annotated with its alpha value, and dashed lines mark zero
    on both axes.

    Returns:
        Path of the written ``ppo_tradeoff_scatter.pdf`` inside ``out_dir``.
    """
    ordered = deltas.sort_values("alpha")
    leakage_pct = ordered["eval_coi_leakage_mean_delta_pct"].to_numpy(dtype=float)
    revenue_pct = ordered["eval_revenue_mean_delta_pct"].to_numpy(dtype=float)
    alpha_values = ordered["alpha"].to_numpy(dtype=float)
    fig, ax = plt.subplots(figsize=(6.4, 5.2), constrained_layout=True)
    points = ax.scatter(
        leakage_pct,
        revenue_pct,
        c=alpha_values,
        cmap="viridis",
        s=72,
        edgecolor="#222222",
        linewidth=0.5,
    )
    for idx in range(len(alpha_values)):
        ax.annotate(
            rf"$\alpha={alpha_values[idx]:.2f}$",
            (leakage_pct[idx], revenue_pct[idx]),
            textcoords="offset points",
            xytext=(5, 4),
            fontsize=8,
        )
    # Zero reference lines: horizontal first, then vertical.
    for draw_reference in (ax.axhline, ax.axvline):
        draw_reference(0.0, color="#555555", linewidth=1.0, linestyle="--")
    ax.set_xlabel("COI Leakage Delta (%)")
    ax.set_ylabel("Revenue Delta (%)")
    ax.set_title("PPO Robust Tradeoff Frontier")
    colorbar = fig.colorbar(points, ax=ax)
    colorbar.set_label(r"Contamination $\alpha$")
    target = out_dir / "ppo_tradeoff_scatter.pdf"
    fig.savefig(target, bbox_inches="tight")
    plt.close(fig)
    return target
 def _plot_first_sweep_tier_revenue(tier_mode: pd.DataFrame, out_dir: Path) -> Path:
    pivot = (
        tier_mode.pivot(index="tier", columns="mode", values="eval_revenue_mean_mean")
        .dropna(subset=["robust", "no_robust"], how="any")
        .copy()
    )
    if pivot.empty:
        raise ValueError("First sweep tier summary missing robust/non-robust pairs")
    order = sorted(pivot.index.tolist())
    pivot = pivot.loc[order]
    delta_pct = 100.0 * (pivot["robust"] - pivot["no_robust"]) / pivot["no_robust"]
    fig, axes = plt.subplots(1, 2, figsize=(10.2, 4.3), constrained_layout=True)
    x = np.arange(len(order))
    width = 0.36
    axes[0].bar(
        x - width / 2,
        pivot["no_robust"].to_numpy(dtype=float),
        width=width,
        label="Non-robust",
        color="#4C72B0",
    )
    axes[0].bar(
        x + width / 2,
        pivot["robust"].to_numpy(dtype=float),
        width=width,
        label="Robust",
        color="#C44E52",
    )
    axes[0].set_xticks(x)
    axes[0].set_xticklabels(order, rotation=20)
    axes[0].set_ylabel("Mean Revenue")
    axes[0].set_yscale("log")
    axes[0].yaxis.set_major_formatter(FuncFormatter(_fmt_thousands))
    axes[0].set_title("First Sweep Tier Revenue (log scale)")
    axes[0].legend()
    axes[1].bar(x, delta_pct.to_numpy(dtype=float), color="#55A868", width=0.55)
    axes[1].axhline(0.0, color="#444444", linewidth=1.0, linestyle="--")
    axes[1].set_xticks(x)
    axes[1].set_xticklabels(order, rotation=20)
    axes[1].set_ylabel("Revenue Delta (%)")
    axes[1].set_title("Robust Minus Non-robust by Tier")
    out_path = out_dir / "first_sweep_tier_revenue.pdf"
    fig.savefig(out_path, bbox_inches="tight")
    plt.close(fig)
    return out_path
 def build_plots(data_dir: Path, out_dir: Path) -> list[Path]:
    """Load the three summary CSVs from ``data_dir`` and render all four PDFs.

    All inputs are read before ``out_dir`` is created, so a missing CSV raises
    without leaving an output directory behind.

    Returns:
        Paths of the written figures, in the order they were produced.
    """
    alpha_mode, deltas, tier_mode = (
        _load_csv(data_dir / filename)
        for filename in (
            "ppo_alpha_mode_summary.csv",
            "ppo_alpha_deltas.csv",
            "first_sweep_tier_mode_summary.csv",
        )
    )
    out_dir.mkdir(parents=True, exist_ok=True)
    written: list[Path] = []
    written.append(_plot_ppo_alpha_curves(alpha_mode, out_dir))
    written.append(_plot_ppo_delta_curves(deltas, out_dir))
    written.append(_plot_ppo_tradeoff_scatter(deltas, out_dir))
    written.append(_plot_first_sweep_tier_revenue(tier_mode, out_dir))
    return written
 def main() -> None:
    """Command-line entry point: optionally refresh the CSVs, then plot.

    Flags:
        --data-dir: directory holding the processed CSVs (default: ``_output_dir()``).
        --plot-dir: directory receiving the PDFs (default: ``_plot_dir()``).
        --refresh-data: re-run both processing steps from the raw CSVs under
            ``tpu_orchestration/results`` before plotting.

    The path of every written figure is printed, one per line.
    """
    cli = argparse.ArgumentParser(
        description="Create paper-ready plots from result CSVs"
    )
    cli.add_argument("--data-dir", type=Path, default=_output_dir())
    cli.add_argument("--plot-dir", type=Path, default=_plot_dir())
    cli.add_argument(
        "--refresh-data",
        action="store_true",
        help="Regenerate processed CSVs before plotting",
    )
    options = cli.parse_args()
    _configure_style()
    if options.refresh_data:
        # Raw exports live five directory levels above this script's folder.
        results_root = (
            Path(__file__).resolve().parents[5] / "tpu_orchestration" / "results"
        )
        run_ppo_benchmark(
            input_path=results_root / "ppo_benchmark.csv",
            output_dir=options.data_dir,
            include_non_finished=False,
        )
        run_first_sweep(
            input_path=results_root / "first_sweep.csv",
            output_dir=options.data_dir,
            include_non_finished=False,
            top_n=25,
        )
    for written in build_plots(data_dir=options.data_dir, out_dir=options.plot_dir):
        print(written)
 # Run the CLI only when this file is executed as a script, not when imported.
 if __name__ == "__main__":
    main()
--- a/paper/src/chapters/figures/results/plot_wandb_export.py
+++ b/paper/src/chapters/figures/results/plot_wandb_export.py
@@ -0,0 +1,658 @@
 from __future__ import annotations
 import argparse
 from pathlib import Path
 from typing import Iterable
 import matplotlib
 matplotlib.use("Agg")
 import matplotlib.pyplot as plt
 from matplotlib.ticker import FuncFormatter
 import numpy as np
 import pandas as pd
 def _load_tikzplotlib():
    """Try to import tikzplotlib, installing compatibility shims along the way.

    Returns a ``(module, error)`` pair: ``(tikzplotlib, None)`` when the import
    succeeds, otherwise ``(None, exc)`` where ``exc`` is the exception raised
    by the final import attempt. Every shim is best-effort; failures while
    patching are swallowed so that the import attempt itself decides the
    outcome.
    """
    def _patch_webcolors() -> None:
        """Add ``webcolors.CSS3_HEX_TO_NAMES`` when the installed version lacks it."""
        try:
            import webcolors
            if hasattr(webcolors, "CSS3_HEX_TO_NAMES"):
                return
            # Fall back to the plain string "css3" when no CSS3 constant exists.
            css3 = getattr(webcolors, "CSS3", "css3")
            webcolors.CSS3_HEX_TO_NAMES = {
                webcolors.name_to_hex(name, spec=css3): name
                for name in webcolors.names(spec=css3)
            }
        except Exception:
            # webcolors missing or incompatible: leave it untouched.
            return
    _patch_webcolors()
    try:
        from matplotlib.legend import Legend
        # Expose ``_ncol`` as a read-only alias of ``_ncols`` when only the
        # latter exists on the Legend class.
        if not hasattr(Legend, "_ncol") and hasattr(Legend, "_ncols"):
            Legend._ncol = property(lambda self: self._ncols)
    except Exception:
        pass
    # First attempt: import with only the shims above applied.
    try:
        import tikzplotlib as module
        return module, None
    except Exception:
        pass
    # Second attempt: alias ``backend_pgf.common_texification`` to
    # ``_tex_escape`` if it is missing, re-apply the webcolors patch, retry.
    try:
        from matplotlib.backends import backend_pgf
        if not hasattr(backend_pgf, "common_texification") and hasattr(
            backend_pgf, "_tex_escape"
        ):
            backend_pgf.common_texification = backend_pgf._tex_escape
        _patch_webcolors()
        import tikzplotlib as module
        return module, None
    except Exception as exc:
        return None, exc
 # Resolved once at import time; TIKZPLOTLIB is None when the import failed and
 # TIKZPLOTLIB_IMPORT_ERROR then holds the exception from the last attempt.
 TIKZPLOTLIB, TIKZPLOTLIB_IMPORT_ERROR = _load_tikzplotlib()
 def _default_output_dir() -> Path:
    return Path(__file__).resolve().parent / "generated" / "wandb"
 def _default_plot_dir(output_dir: Path) -> Path:
    return output_dir / "plots"
 def _sanitize(key: str) -> str:
    return key.replace("/", "_").replace("-", "_")
 def _configure_style() -> None:
    """Install the matplotlib rcParams shared by every figure in this script."""
    # Typeface and text sizes.
    text_settings = {
        "font.family": "serif",
        "font.size": 10,
        "axes.titlesize": 10,
        "axes.labelsize": 9,
        "legend.fontsize": 8,
        "xtick.labelsize": 8,
        "ytick.labelsize": 8,
    }
    # On-screen and saved-file resolution.
    resolution_settings = {"figure.dpi": 220, "savefig.dpi": 320}
    # Open top/right spines with a faint grid.
    frame_settings = {
        "axes.spines.top": False,
        "axes.spines.right": False,
        "axes.grid": True,
        "grid.alpha": 0.22,
    }
    plt.rcParams.update({**text_settings, **resolution_settings, **frame_settings})
 def _fmt_thousands(value: float, _: int) -> str:
    return f"{int(value):,}"
 def _coerce_numeric(frame: pd.DataFrame, columns: Iterable[str]) -> None:
    for column in columns:
        if column in frame.columns:
            frame[column] = pd.to_numeric(frame[column], errors="coerce")
 def _extract_alpha(frame: pd.DataFrame) -> pd.Series:
    if "study/alpha" in frame.columns:
        return pd.to_numeric(frame["study/alpha"], errors="coerce")
    if "alpha" in frame.columns:
        return pd.to_numeric(frame["alpha"], errors="coerce")
    return pd.Series(np.nan, index=frame.index, dtype=float)
 def _extract_mode(frame: pd.DataFrame) -> pd.Series:
    if "study/mode" in frame.columns:
        mode = frame["study/mode"].astype(str).str.strip().str.lower()
        mapping = {
            "baseline": "baseline",
            "no_robust": "baseline",
            "defended": "defended",
            "robust": "defended",
        }
        return mode.map(mapping).fillna("")
    if "study/no_robust" in frame.columns:
        no_robust = pd.to_numeric(frame["study/no_robust"], errors="coerce").fillna(0.0)
        return pd.Series(
            np.where(no_robust > 0.5, "baseline", "defended"),
            index=frame.index,
            dtype="object",
        )
    if "no_robust" in frame.columns:
        no_robust = (
            frame["no_robust"].astype(str).str.lower().isin({"1", "true", "yes"})
        )
        return pd.Series(
            np.where(no_robust, "baseline", "defended"),
            index=frame.index,
            dtype="object",
        )
    return pd.Series("", index=frame.index, dtype="object")
 def _prepare_frame(frame: pd.DataFrame, include_non_finished: bool) -> pd.DataFrame:
    """Normalise a raw export into the rows and columns the summaries need.

    Steps, applied to a copy of ``frame``:

    1. Unless ``include_non_finished`` is set, keep only rows whose ``State``
       column (when present) equals ``finished`` case-insensitively.
    2. Add ``alpha`` and ``mode`` columns via ``_extract_alpha`` and
       ``_extract_mode``.
    3. Drop rows whose mode is neither ``baseline`` nor ``defended`` or whose
       alpha is missing.
    4. Coerce the known metric and parameter columns to numeric.

    Returns:
        The filtered frame sorted by ``alpha`` then ``mode`` with a fresh index.
    """
    data = frame.copy()
    if not include_non_finished and "State" in data.columns:
        data = data[data["State"].astype(str).str.lower() == "finished"].copy()
    data["alpha"] = _extract_alpha(data)
    data["mode"] = _extract_mode(data)
    usable = data["mode"].isin({"baseline", "defended"}) & data["alpha"].notna()
    # Take an explicit copy so the in-place coercion below writes into an
    # independent frame instead of a filtered slice (the State filter above
    # already does the same).
    data = data.loc[usable].copy()
    _coerce_numeric(
        data,
        [
            "eval/revenue_mean",
            "eval/reward_mean",
            "eval/coi_level_mean",
            "eval/coi_leakage_mean",
            "eval/volatility_mean",
            "eval/revenue_std",
            "eval/reward_std",
            "eval/margin_mean",
            "train/agent_prob",
            "train/alpha_adv",
            "lambda_coi",
            "ambiguity_radius",
            "n_products",
        ],
    )
    return data.sort_values(["alpha", "mode"]).reset_index(drop=True)
 def _summary_by_alpha_mode(frame: pd.DataFrame, metrics: list[str]) -> pd.DataFrame:
    """Aggregate ``frame`` per (alpha, mode) group.

    The result has a ``runs`` column with the group size plus, for each metric,
    ``<sanitized>_mean`` and ``<sanitized>_std`` columns. Rows are sorted by
    alpha then mode and re-indexed from zero.
    """
    group_keys = ["alpha", "mode"]
    named_aggs: dict[str, tuple[str, str]] = {"runs": ("mode", "size")}
    for metric in metrics:
        prefix = _sanitize(metric)
        named_aggs.update(
            {
                f"{prefix}_mean": (metric, "mean"),
                f"{prefix}_std": (metric, "std"),
            }
        )
    aggregated = frame.groupby(group_keys, as_index=False).agg(**named_aggs)
    return aggregated.sort_values(group_keys).reset_index(drop=True)
 def _delta_by_alpha(summary: pd.DataFrame, metrics: list[str]) -> pd.DataFrame:
    rows: list[dict[str, float]] = []
    for alpha, alpha_group in summary.groupby("alpha", sort=True):
        defended = alpha_group[alpha_group["mode"] == "defended"]
        baseline = alpha_group[alpha_group["mode"] == "baseline"]
        if defended.empty or baseline.empty:
            continue
        row: dict[str, float] = {
            "alpha": float(alpha),
            "runs_defended": float(defended["runs"].iloc[0]),
            "runs_baseline": float(baseline["runs"].iloc[0]),
        }
        for metric in metrics:
            safe = _sanitize(metric)
            defended_value = float(defended[f"{safe}_mean"].iloc[0])
            baseline_value = float(baseline[f"{safe}_mean"].iloc[0])
            delta = defended_value - baseline_value
            row[f"{safe}_defended"] = defended_value
            row[f"{safe}_baseline"] = baseline_value
            row[f"{safe}_delta"] = delta
            row[f"{safe}_delta_pct"] = (
                np.nan if baseline_value == 0 else 100.0 * delta / baseline_value
            )
        rows.append(row)
    return pd.DataFrame(rows)
 def _summary_by_parameter(
    frame: pd.DataFrame, parameter: str, metrics: list[str]
 ) -> pd.DataFrame:
    """Aggregate the defended rows of ``frame`` per (alpha, ``parameter``) group.

    Rows with a missing ``parameter`` value are excluded. The result has a
    ``runs`` column with the group size plus ``<sanitized>_mean`` and
    ``<sanitized>_std`` columns per metric, sorted by alpha then ``parameter``.
    """
    keep = (frame["mode"] == "defended") & frame[parameter].notna()
    subset = frame.loc[keep].copy()
    group_keys = ["alpha", parameter]
    named_aggs: dict[str, tuple[str, str]] = {"runs": ("mode", "size")}
    for metric in metrics:
        prefix = _sanitize(metric)
        named_aggs.update(
            {
                f"{prefix}_mean": (metric, "mean"),
                f"{prefix}_std": (metric, "std"),
            }
        )
    aggregated = subset.groupby(group_keys, as_index=False).agg(**named_aggs)
    return aggregated.sort_values(group_keys).reset_index(drop=True)
 def _save_table(frame: pd.DataFrame, path: Path) -> Path:
    path.parent.mkdir(parents=True, exist_ok=True)
    frame.to_csv(path, index=False)
    return path
 def _save_figure(fig: plt.Figure, pdf_path: Path, export_tikz: bool) -> list[Path]:
    """Save ``fig`` as PDF (and optionally as TikZ) and always close it.

    Args:
        fig: figure to write.
        pdf_path: destination of the PDF; parent directories are created.
        export_tikz: when true, also write ``<pdf stem>.tikz.tex`` next to the
            PDF using tikzplotlib.

    Returns:
        The written paths: the PDF first, then the TikZ file if exported.

    Raises:
        RuntimeError: if ``export_tikz`` is set but tikzplotlib failed to
            import. The PDF has already been written at that point.

    The figure is closed on every exit path, including failures, so an error
    during export does not leave the figure open.
    """
    try:
        pdf_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(pdf_path, bbox_inches="tight")
        written = [pdf_path]
        if export_tikz:
            if TIKZPLOTLIB is None:
                raise RuntimeError(
                    "tikzplotlib import failed. Install/upgrade tikzplotlib and matplotlib-compatible dependencies. "
                    f"Original error: {TIKZPLOTLIB_IMPORT_ERROR}"
                )
            # Best-effort shims: add attribute aliases to legends and lines
            # when only the differently named counterpart exists. Failures
            # here are deliberately ignored; the save call below decides.
            try:
                from matplotlib.legend import Legend
                from matplotlib.lines import Line2D
                for legend in fig.findobj(Legend):
                    if not hasattr(legend, "_ncol") and hasattr(legend, "_ncols"):
                        setattr(legend, "_ncol", legend._ncols)
                    if not hasattr(legend, "legendHandles") and hasattr(
                        legend, "legend_handles"
                    ):
                        setattr(legend, "legendHandles", legend.legend_handles)
                for line in fig.findobj(Line2D):
                    if hasattr(line, "_us_dashSeq"):
                        continue
                    if not hasattr(line, "_dash_pattern"):
                        continue
                    dash_pattern = getattr(line, "_dash_pattern")
                    # Only a two-item tuple is split into offset and sequence.
                    if not isinstance(dash_pattern, tuple) or len(dash_pattern) != 2:
                        continue
                    setattr(line, "_us_dashOffset", dash_pattern[0])
                    setattr(line, "_us_dashSeq", dash_pattern[1])
            except Exception:
                pass
            tikz_path = pdf_path.with_suffix(".tikz.tex")
            TIKZPLOTLIB.save(str(tikz_path), figure=fig)
            written.append(tikz_path)
        return written
    finally:
        plt.close(fig)
 def _plot_alpha_curves(
    alpha_mode: pd.DataFrame, out_dir: Path, export_tikz: bool
 ) -> list[Path]:
    """Plot four evaluation metrics against alpha for both modes.

    Each of the 2x2 panels draws the ``<metric>_mean`` column of the
    ``baseline`` and ``defended`` rows with a shaded band of plus/minus the
    ``<metric>_std`` column (missing std values count as zero).

    Returns:
        The paths written by ``_save_figure`` for ``wandb_alpha_curves.pdf``.
    """
    # Insertion order fixes the drawing (and therefore legend) order.
    appearance = {
        "baseline": {"color": "#4C72B0", "label": "Baseline"},
        "defended": {"color": "#C44E52", "label": "Defended"},
    }
    panel_specs = [
        ("eval_revenue_mean", "Mean Episode Revenue", "Revenue"),
        ("eval_reward_mean", "Mean Episode Reward", "Reward"),
        ("eval_coi_leakage_mean", "Mean COI Leakage", "COI Leakage"),
        ("eval_volatility_mean", "Mean Price Volatility", "Volatility"),
    ]
    thousands_panels = {"eval_revenue_mean", "eval_reward_mean"}
    fig, axes = plt.subplots(2, 2, figsize=(9.3, 6.4), constrained_layout=True)
    for ax, (prefix, title, ylabel) in zip(axes.flat, panel_specs):
        for mode, look in appearance.items():
            rows = alpha_mode.loc[alpha_mode["mode"] == mode].sort_values("alpha")
            if rows.empty:
                continue
            xs = rows["alpha"].to_numpy(dtype=float)
            ys = rows[f"{prefix}_mean"].to_numpy(dtype=float)
            ax.plot(xs, ys, marker="o", linewidth=1.8, markersize=4, **look)
            spread = rows[f"{prefix}_std"].fillna(0.0).to_numpy(dtype=float)
            ax.fill_between(
                xs,
                ys - spread,
                ys + spread,
                color=look["color"],
                alpha=0.14,
                linewidth=0,
            )
        ax.set_title(title)
        ax.set_xlabel(r"Contamination $\alpha$")
        ax.set_ylabel(ylabel)
        ax.set_xticks(sorted(alpha_mode["alpha"].unique()))
        if prefix in thousands_panels:
            ax.yaxis.set_major_formatter(FuncFormatter(_fmt_thousands))
    # One shared legend, taken from the first panel's artists.
    legend_handles, legend_labels = axes.flat[0].get_legend_handles_labels()
    fig.legend(
        legend_handles,
        legend_labels,
        ncol=2,
        loc="upper center",
        bbox_to_anchor=(0.5, 1.02),
    )
    target = out_dir / "wandb_alpha_curves.pdf"
    return _save_figure(fig, target, export_tikz)
 def _plot_delta_curves(
    deltas: pd.DataFrame, out_dir: Path, export_tikz: bool
 ) -> list[Path]:
    """Plot percentage deltas against alpha in two stacked panels.

    The top panel shows the revenue and reward ``*_delta_pct`` columns, the
    bottom panel the COI-leakage and volatility ones. Both panels carry a
    dashed zero reference line.

    Returns:
        The paths written by ``_save_figure`` for ``wandb_delta_curves.pdf``.
    """
    ordered = deltas.sort_values("alpha")
    alphas = ordered["alpha"].to_numpy(dtype=float)
    # One tuple of (column, legend label, colour) series per panel, top first.
    panel_series = (
        (
            ("eval_revenue_mean_delta_pct", "Revenue", "#4C72B0"),
            ("eval_reward_mean_delta_pct", "Reward", "#8172B3"),
        ),
        (
            ("eval_coi_leakage_mean_delta_pct", "COI Leakage", "#55A868"),
            ("eval_volatility_mean_delta_pct", "Volatility", "#DD8452"),
        ),
    )
    line_look = {"marker": "o", "linewidth": 1.8, "markersize": 4}
    fig, axes = plt.subplots(2, 1, figsize=(8.6, 6.0), constrained_layout=True)
    for ax, series in zip(axes, panel_series):
        for column, label, color in series:
            ax.plot(
                alphas,
                ordered[column].to_numpy(dtype=float),
                color=color,
                label=label,
                **line_look,
            )
        ax.axhline(0.0, color="#444444", linewidth=1.0, linestyle="--")
        ax.set_ylabel("Delta (%)")
        ax.set_xlabel(r"Contamination $\alpha$")
        ax.set_xticks(alphas)
        ax.legend(loc="lower left")
    # Only the top panel is titled.
    axes[0].set_title("Defended Minus Baseline Delta by Contamination")
    target = out_dir / "wandb_delta_curves.pdf"
    return _save_figure(fig, target, export_tikz)
 def _plot_tradeoff_scatter(
    deltas: pd.DataFrame, out_dir: Path, export_tikz: bool
 ) -> list[Path]:
    """Scatter revenue delta (%) against COI-leakage delta (%), coloured by alpha.

    Every point is annotated with its alpha value, and dashed lines mark zero
    on both axes.

    Returns:
        The paths written by ``_save_figure`` for ``wandb_tradeoff_scatter.pdf``.
    """
    ordered = deltas.sort_values("alpha")
    leakage_pct = ordered["eval_coi_leakage_mean_delta_pct"].to_numpy(dtype=float)
    revenue_pct = ordered["eval_revenue_mean_delta_pct"].to_numpy(dtype=float)
    alpha_values = ordered["alpha"].to_numpy(dtype=float)
    fig, ax = plt.subplots(figsize=(6.4, 5.2), constrained_layout=True)
    points = ax.scatter(
        leakage_pct,
        revenue_pct,
        c=alpha_values,
        cmap="viridis",
        s=72,
        edgecolor="#222222",
        linewidth=0.5,
    )
    for idx in range(len(alpha_values)):
        ax.annotate(
            rf"$\alpha={alpha_values[idx]:.2f}$",
            (leakage_pct[idx], revenue_pct[idx]),
            textcoords="offset points",
            xytext=(5, 4),
            fontsize=8,
        )
    # Zero reference lines: horizontal first, then vertical.
    for draw_reference in (ax.axhline, ax.axvline):
        draw_reference(0.0, color="#555555", linewidth=1.0, linestyle="--")
    ax.set_xlabel("COI Leakage Delta (%)")
    ax.set_ylabel("Revenue Delta (%)")
    ax.set_title("Defended Tradeoff Frontier")
    colorbar = fig.colorbar(points, ax=ax)
    colorbar.set_label(r"Contamination $\alpha$")
    target = out_dir / "wandb_tradeoff_scatter.pdf"
    return _save_figure(fig, target, export_tikz)
 def _plot_reward_robustness(
    alpha_mode: pd.DataFrame, out_dir: Path, export_tikz: bool
 ) -> list[Path]:
    """Plot the ``eval_reward_mean_std`` column against alpha for both modes.

    Missing std values are drawn as zero.

    Returns:
        The paths written by ``_save_figure`` for ``wandb_reward_robustness.pdf``.
    """
    # Insertion order fixes the drawing (and therefore legend) order.
    appearance = {
        "baseline": {"color": "#4C72B0", "label": "Baseline"},
        "defended": {"color": "#C44E52", "label": "Defended"},
    }
    fig, ax = plt.subplots(figsize=(7.6, 4.5), constrained_layout=True)
    for mode, look in appearance.items():
        rows = alpha_mode.loc[alpha_mode["mode"] == mode].sort_values("alpha")
        xs = rows["alpha"].to_numpy(dtype=float)
        ys = rows["eval_reward_mean_std"].fillna(0.0).to_numpy(dtype=float)
        ax.plot(xs, ys, marker="o", linewidth=1.8, markersize=4, **look)
    ax.set_title("Reward Robustness Across Contamination")
    ax.set_xlabel(r"Contamination $\alpha$")
    ax.set_ylabel("Reward Std Across Runs")
    ax.set_xticks(sorted(alpha_mode["alpha"].unique()))
    ax.yaxis.set_major_formatter(FuncFormatter(_fmt_thousands))
    ax.legend(loc="upper left")
    target = out_dir / "wandb_reward_robustness.pdf"
    return _save_figure(fig, target, export_tikz)
 def _plot_parameter_sensitivity(
    summary: pd.DataFrame,
    parameter: str,
    out_name: str,
    out_dir: Path,
    export_tikz: bool,
 ) -> list[Path]:
    """Plot revenue and COI leakage against alpha, one line per ``parameter`` value.

    Each distinct non-null value of ``summary[parameter]`` gets its own viridis
    shade; every line has a band of plus/minus the metric's std column
    (missing std values count as zero).

    Returns:
        The paths written by ``_save_figure`` for ``<out_name>.pdf``.
    """
    levels = sorted(summary[parameter].dropna().unique())
    colormap = plt.get_cmap("viridis")
    # Sample the colormap evenly between 0.1 and 0.9, one shade per level.
    shades = [colormap(position) for position in np.linspace(0.1, 0.9, len(levels))]
    panel_specs = (
        ("eval_revenue_mean", "Revenue"),
        ("eval_coi_leakage_mean", "COI Leakage"),
    )
    fig, axes = plt.subplots(1, 2, figsize=(10.0, 4.2), constrained_layout=True)
    for ax, (prefix, ylabel) in zip(axes, panel_specs):
        for level, shade in zip(levels, shades):
            rows = summary.loc[summary[parameter] == level].sort_values("alpha")
            if rows.empty:
                continue
            xs = rows["alpha"].to_numpy(dtype=float)
            ys = rows[f"{prefix}_mean"].to_numpy(dtype=float)
            spread = rows[f"{prefix}_std"].fillna(0.0).to_numpy(dtype=float)
            ax.plot(
                xs,
                ys,
                marker="o",
                linewidth=1.6,
                markersize=3.6,
                color=shade,
                label=f"{parameter}={level:.2f}",
            )
            ax.fill_between(
                xs, ys - spread, ys + spread, color=shade, alpha=0.10, linewidth=0
            )
        ax.set_xlabel(r"Contamination $\alpha$")
        ax.set_ylabel(ylabel)
        ax.set_xticks(sorted(summary["alpha"].unique()))
        if prefix == "eval_revenue_mean":
            ax.yaxis.set_major_formatter(FuncFormatter(_fmt_thousands))
    left_axis, right_axis = axes[0], axes[1]
    left_axis.set_title(f"{parameter} Sensitivity (Defended)")
    right_axis.set_title("Leakage Side-Effect")
    legend_handles, legend_labels = left_axis.get_legend_handles_labels()
    fig.legend(
        legend_handles,
        legend_labels,
        ncol=max(1, len(levels) // 2),
        loc="upper center",
        bbox_to_anchor=(0.5, 1.06),
    )
    target = out_dir / f"{out_name}.pdf"
    return _save_figure(fig, target, export_tikz)
 def _plot_delta_summary(
    deltas: pd.DataFrame, out_dir: Path, export_tikz: bool
 ) -> list[Path]:
    """Render three side-by-side bar charts of percentage deltas per alpha.

    Rows are ordered by ``alpha`` and drawn at evenly spaced positions, with
    the alpha value (one decimal) as the tick label. The panels show the
    revenue, reward and COI-leakage ``*_delta_pct`` columns, each with a
    dashed zero line. The figure is saved as ``wandb_delta_summary.pdf``
    through ``_save_figure``, whose return value is passed back.
    """
    ordered = deltas.sort_values("alpha")
    positions = np.arange(len(ordered))
    tick_text = [
        f"{alpha:.1f}" for alpha in ordered["alpha"].to_numpy(dtype=float)
    ]
    fig, axes = plt.subplots(1, 3, figsize=(11.0, 3.8), constrained_layout=True)
    bar_specs = (
        ("eval_revenue_mean_delta_pct", "Revenue Delta (%)", "#4C72B0"),
        ("eval_reward_mean_delta_pct", "Reward Delta (%)", "#8172B3"),
        ("eval_coi_leakage_mean_delta_pct", "COI Leakage Delta (%)", "#55A868"),
    )
    for axis, spec in zip(axes, bar_specs):
        column, heading, fill = spec
        heights = ordered[column].to_numpy(dtype=float)
        axis.bar(positions, heights, color=fill, alpha=0.85)
        axis.axhline(0.0, color="#444444", linewidth=1.0, linestyle="--")
        axis.set_xticks(positions)
        axis.set_xticklabels(tick_text)
        axis.set_xlabel(r"$\alpha$")
        axis.set_title(heading)
    target = out_dir / "wandb_delta_summary.pdf"
    return _save_figure(fig, target, export_tikz)
 def build_artifacts(
    input_path: Path,
    output_dir: Path,
    plot_dir: Path,
    include_non_finished: bool,
    export_tikz: bool,
 ) -> list[Path]:
    """Read a W&B export CSV and write every summary table and figure.

    Args:
        input_path: CSV file loaded with ``pd.read_csv``.
        output_dir: Directory that receives the four summary CSV tables.
        plot_dir: Directory handed to every plotting helper.
        include_non_finished: Forwarded to ``_prepare_frame``.
        export_tikz: Forwarded to every plotting helper.

    Returns:
        The paths returned by the table and plot helpers, tables first, in
        the order they were produced.
    """
    raw = pd.read_csv(input_path)
    frame = _prepare_frame(raw, include_non_finished=include_non_finished)
    # Only metrics that actually exist in the prepared frame are summarised.
    metrics = [
        metric
        for metric in (
            "eval/revenue_mean",
            "eval/reward_mean",
            "eval/coi_level_mean",
            "eval/coi_leakage_mean",
            "eval/volatility_mean",
            "eval/margin_mean",
            "train/agent_prob",
            "train/alpha_adv",
        )
        if metric in frame.columns
    ]
    alpha_mode = _summary_by_alpha_mode(frame, metrics)
    deltas = _delta_by_alpha(alpha_mode, metrics)
    lambda_summary = _summary_by_parameter(frame, "lambda_coi", metrics)
    radius_summary = _summary_by_parameter(frame, "ambiguity_radius", metrics)
    output_dir.mkdir(parents=True, exist_ok=True)
    plot_dir.mkdir(parents=True, exist_ok=True)
    written: list[Path] = []
    # Tables.
    written.append(_save_table(alpha_mode, output_dir / "wandb_alpha_mode_summary.csv"))
    written.append(_save_table(deltas, output_dir / "wandb_alpha_deltas.csv"))
    written.append(
        _save_table(lambda_summary, output_dir / "wandb_lambda_alpha_summary.csv")
    )
    written.append(
        _save_table(radius_summary, output_dir / "wandb_radius_alpha_summary.csv")
    )
    # Figures; each helper returns a list of paths, hence extend().
    written.extend(_plot_alpha_curves(alpha_mode, plot_dir, export_tikz))
    written.extend(_plot_delta_curves(deltas, plot_dir, export_tikz))
    written.extend(_plot_tradeoff_scatter(deltas, plot_dir, export_tikz))
    written.extend(_plot_reward_robustness(alpha_mode, plot_dir, export_tikz))
    written.extend(
        _plot_parameter_sensitivity(
            summary=lambda_summary,
            parameter="lambda_coi",
            out_name="wandb_lambda_sensitivity",
            out_dir=plot_dir,
            export_tikz=export_tikz,
        )
    )
    written.extend(
        _plot_parameter_sensitivity(
            summary=radius_summary,
            parameter="ambiguity_radius",
            out_name="wandb_radius_sensitivity",
            out_dir=plot_dir,
            export_tikz=export_tikz,
        )
    )
    written.extend(_plot_delta_summary(deltas, plot_dir, export_tikz))
    return written
 def main() -> None:
    """Command-line entry point: parse options, build artifacts, print paths.

    ``--plot-dir`` falls back to ``_default_plot_dir(--output-dir)`` when it
    is not given. Every path returned by ``build_artifacts`` is printed on
    its own line.
    """
    cli = argparse.ArgumentParser(
        description="Generate W&B sweep visualizations for PHANTOM results"
    )
    cli.add_argument(
        "--input", type=Path, required=True, help="Path to W&B export CSV"
    )
    cli.add_argument("--output-dir", type=Path, default=_default_output_dir())
    cli.add_argument("--plot-dir", type=Path, default=None)
    cli.add_argument("--include-non-finished", action="store_true")
    cli.add_argument(
        "--export-tikz",
        action="store_true",
        help="Export matplotlib figures to TikZ via tikzplotlib",
    )
    options = cli.parse_args()
    _configure_style()
    plot_dir = options.plot_dir
    if plot_dir is None:
        plot_dir = _default_plot_dir(options.output_dir)
    produced = build_artifacts(
        input_path=options.input,
        output_dir=options.output_dir,
        plot_dir=plot_dir,
        include_non_finished=bool(options.include_non_finished),
        export_tikz=bool(options.export_tikz),
    )
    for artifact in produced:
        print(artifact)
 # Run the CLI only when executed as a script, not when imported.
 if __name__ == "__main__":
    main()
--- a/paper/src/chapters/figures/results/process_all_results.py
+++ b/paper/src/chapters/figures/results/process_all_results.py
@@ -0,0 +1,51 @@
 from __future__ import annotations
 import argparse
 from pathlib import Path
 from process_first_sweep import run as run_first_sweep
 from process_ppo_benchmark import run as run_ppo_benchmark
 def _default_output_dir() -> Path:
    return Path(__file__).resolve().parent / "generated" / "legacy"
 def main() -> None:
    """Run the PPO-benchmark and first-sweep processors and print their outputs.

    Both inputs are read from ``tpu_orchestration/results`` under the sixth
    ancestor directory of this file. The PPO benchmark is processed first,
    and all returned paths are printed one per line in that order.
    """
    cli = argparse.ArgumentParser(
        description="Process all result CSV exports for paper figures"
    )
    cli.add_argument("--output-dir", type=Path, default=_default_output_dir())
    cli.add_argument("--include-non-finished", action="store_true")
    cli.add_argument("--top-n", type=int, default=25)
    options = cli.parse_args()
    # Both CSV exports live in the same results directory.
    results_dir = Path(__file__).resolve().parents[5] / "tpu_orchestration" / "results"
    keep_unfinished = bool(options.include_non_finished)
    benchmark_paths = run_ppo_benchmark(
        input_path=results_dir / "ppo_benchmark.csv",
        output_dir=options.output_dir,
        include_non_finished=keep_unfinished,
    )
    sweep_paths = run_first_sweep(
        input_path=results_dir / "first_sweep.csv",
        output_dir=options.output_dir,
        include_non_finished=keep_unfinished,
        top_n=int(options.top_n),
    )
    for artifact in [*benchmark_paths, *sweep_paths]:
        print(artifact)
 # Run the CLI only when executed as a script, not when imported.
 if __name__ == "__main__":
    main()
--- a/paper/src/chapters/figures/results/process_final_sweeps.py
+++ b/paper/src/chapters/figures/results/process_final_sweeps.py
@@ -0,0 +1,409 @@
 from __future__ import annotations
 import argparse
 import json
 from pathlib import Path
 from typing import Any
 import matplotlib
 matplotlib.use("Agg")
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 def _project_root() -> Path:
    """Return the sixth ancestor directory of this file's resolved path."""
    module_path = Path(__file__).resolve()
    return module_path.parents[5]
 def _default_bundle_dir() -> Path:
    """Return the most recently modified ``bundle_*`` directory.

    Candidates are the directories matching ``bundle_*`` under
    ``engine/studies/results/wandb_sweep_bundles`` in the project root.

    Raises:
        FileNotFoundError: If no such directory exists.
    """
    base = _project_root() / "engine" / "studies" / "results" / "wandb_sweep_bundles"
    candidates = [entry for entry in base.glob("bundle_*") if entry.is_dir()]
    # Newest modification time first.
    candidates.sort(key=lambda entry: entry.stat().st_mtime, reverse=True)
    if len(candidates) == 0:
        raise FileNotFoundError(f"No sweep bundle directories found in {base}")
    return candidates[0]
 def _default_output_dir() -> Path:
    return Path(__file__).resolve().parent / "generated" / "final"
 def _default_plot_dir(output_dir: Path) -> Path:
    return output_dir / "plots"
 def _truthy(value: Any) -> bool:
    if isinstance(value, bool):
        return value
    if value is None:
        return False
    return str(value).strip().lower() in {"1", "true", "yes", "on"}
 def _mode_of(row: pd.Series) -> str:
    """Classify one run row as ``"baseline"`` or ``"defended"``.

    An explicit ``study_mode`` value wins. Otherwise the ``baseline_mode``
    and ``no_robust`` flags are consulted, and the row counts as defended
    when neither is truthy.
    """
    hint = str(row.get("study_mode", "")).strip().lower()
    aliases = {
        "baseline": "baseline",
        "no_robust": "baseline",
        "defended": "defended",
        "robust": "defended",
    }
    if hint in aliases:
        return aliases[hint]
    flagged = any(_truthy(row.get(key)) for key in ("baseline_mode", "no_robust"))
    return "baseline" if flagged else "defended"
 def _coerce_numeric(frame: pd.DataFrame, columns: list[str]) -> None:
    for column in columns:
        if column in frame.columns:
            frame[column] = pd.to_numeric(frame[column], errors="coerce")
 def _configure_style() -> None:
    """Apply the figure style (serif font, small labels, light grid) to rcParams."""
    overrides = {
        "font.family": "serif",
        "font.size": 10,
        "axes.titlesize": 10,
        "axes.labelsize": 9,
        "legend.fontsize": 8,
        "xtick.labelsize": 8,
        "ytick.labelsize": 8,
        "figure.dpi": 220,
        "savefig.dpi": 320,
        "axes.spines.top": False,
        "axes.spines.right": False,
        "axes.grid": True,
        "grid.alpha": 0.22,
    }
    plt.rcParams.update(overrides)
 def _load_runs(bundle_dir: Path) -> pd.DataFrame:
    """Load ``runs_finished.csv`` from a bundle and normalise it.

    A ``mode`` column is added by applying ``_mode_of`` to every row, and the
    known metric columns are coerced to numeric.

    Raises:
        FileNotFoundError: If the CSV is missing from ``bundle_dir``.
    """
    csv_path = bundle_dir / "runs_finished.csv"
    if not csv_path.exists():
        raise FileNotFoundError(f"Missing required file: {csv_path}")
    runs = pd.read_csv(csv_path)
    runs["mode"] = runs.apply(_mode_of, axis=1)
    numeric_columns = [
        "alpha",
        "n_products",
        "eval_revenue_mean",
        "eval_reward_mean",
        "eval_supra_share_mean",
        "eval_volatility_mean",
        "eval_coi_level_mean",
        "eval_coi_leakage_mean",
        "objective_score",
    ]
    _coerce_numeric(runs, numeric_columns)
    return runs
 def _focus_sweep(runs: pd.DataFrame) -> str:
    coverage = (
        runs.groupby("sweep_id", as_index=False)
        .agg(
            n_alpha=("alpha", lambda s: int(pd.Series(s).dropna().nunique())),
            max_alpha=("alpha", "max"),
            run_count=("run_id", "size"),
        )
        .sort_values(
            ["n_alpha", "max_alpha", "run_count"], ascending=[False, False, False]
        )
    )
    if coverage.empty:
        raise ValueError("No sweep rows available in runs_finished.csv")
    return str(coverage.iloc[0]["sweep_id"])
 def _alpha_mode_summary(runs: pd.DataFrame) -> pd.DataFrame:
    return (
        runs.groupby(["alpha", "mode"], as_index=False)
        .agg(
            runs=("run_id", "size"),
            revenue_mean=("eval_revenue_mean", "mean"),
            reward_mean=("eval_reward_mean", "mean"),
            supra_mean=("eval_supra_share_mean", "mean"),
            volatility_mean=("eval_volatility_mean", "mean"),
            coi_leakage_mean=("eval_coi_leakage_mean", "mean"),
            coi_level_mean=("eval_coi_level_mean", "mean"),
        )
        .sort_values(["alpha", "mode"])
        .reset_index(drop=True)
    )
 def _alpha_deltas(alpha_mode: pd.DataFrame) -> pd.DataFrame:
    rows: list[dict[str, float]] = []
    for alpha, group in alpha_mode.groupby("alpha", sort=True):
        defended = group[group["mode"] == "defended"]
        baseline = group[group["mode"] == "baseline"]
        if defended.empty or baseline.empty:
            continue
        d_rev = float(defended["revenue_mean"].iloc[0])
        b_rev = float(baseline["revenue_mean"].iloc[0])
        d_reward = float(defended["reward_mean"].iloc[0])
        b_reward = float(baseline["reward_mean"].iloc[0])
        d_vol = float(defended["volatility_mean"].iloc[0])
        b_vol = float(baseline["volatility_mean"].iloc[0])
        d_supra = float(defended["supra_mean"].iloc[0])
        b_supra = float(baseline["supra_mean"].iloc[0])
        d_coi_leak = float(defended["coi_leakage_mean"].iloc[0])
        b_coi_leak = float(baseline["coi_leakage_mean"].iloc[0])
        rows.append(
            {
                "alpha": float(alpha),
                "revenue_delta": d_rev - b_rev,
                "revenue_delta_pct": 0.0
                if b_rev == 0.0
                else 100.0 * (d_rev - b_rev) / b_rev,
                "reward_delta": d_reward - b_reward,
                "reward_delta_pct": 0.0
                if b_reward == 0.0
                else 100.0 * (d_reward - b_reward) / b_reward,
                "volatility_delta": d_vol - b_vol,
                "supra_delta": d_supra - b_supra,
                "coi_leakage_delta": d_coi_leak - b_coi_leak,
            }
        )
    return pd.DataFrame(rows).sort_values("alpha").reset_index(drop=True)
 def _zone_summary(alpha_deltas: pd.DataFrame) -> pd.DataFrame:
    if alpha_deltas.empty:
        return pd.DataFrame()
    data = alpha_deltas.copy()
    data["zone"] = np.where(
        data["alpha"] >= 0.7, "high_alpha_0_7_plus", "low_alpha_below_0_7"
    )
    return (
        data.groupby("zone", as_index=False)
        .agg(
            alpha_cells=("alpha", "size"),
            revenue_delta_pct_mean=("revenue_delta_pct", "mean"),
            reward_delta_pct_mean=("reward_delta_pct", "mean"),
            coi_leakage_delta_mean=("coi_leakage_delta", "mean"),
            volatility_delta_mean=("volatility_delta", "mean"),
        )
        .sort_values("zone")
    )
 def _save_plot(fig: plt.Figure, path: Path) -> Path:
    """Save ``fig`` to ``path`` with a tight bounding box, close it, return ``path``.

    The parent directory is created first if it does not exist.
    """
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    fig.savefig(path, bbox_inches="tight")
    plt.close(fig)
    return path
 def _plot_focus_revenue_by_alpha(alpha_mode: pd.DataFrame, out_path: Path) -> Path:
    """Plot mean revenue against alpha with one line per mode.

    A mode with no rows is skipped. A dashed vertical line is drawn at
    alpha = 0.7. The figure is written through ``_save_plot`` and its return
    value is passed back.
    """
    fig, ax = plt.subplots(figsize=(7.8, 4.8), constrained_layout=True)
    for mode, color, label in (
        ("baseline", "#4C72B0", "Baseline"),
        ("defended", "#C44E52", "Defended"),
    ):
        sub = alpha_mode[alpha_mode["mode"] == mode].sort_values("alpha")
        if sub.empty:
            continue
        ax.plot(
            sub["alpha"],
            sub["revenue_mean"],
            marker="o",
            linewidth=1.9,
            markersize=4,
            color=color,
            label=label,
        )
    # Same 0.7 threshold the zone summary uses to split low from high alpha.
    ax.axvline(0.7, color="#666666", linewidth=1.0, linestyle="--")
    ax.set_xlabel(r"Contamination $\alpha$")
    ax.set_ylabel("Mean episode revenue")
    ax.set_title("Final Cohort Revenue Curves")
    ax.legend(loc="lower left")
    return _save_plot(fig, out_path)
 def _plot_focus_revenue_delta(alpha_deltas: pd.DataFrame, out_path: Path) -> Path:
    """Plot the percentage revenue delta against alpha and mark the high-alpha extreme.

    The area between the curve and zero is shaded. Among rows with
    ``alpha >= 0.7``, the one with the largest absolute delta is highlighted
    and annotated. The figure is written through ``_save_plot``.
    """
    fig, ax = plt.subplots(figsize=(7.8, 4.8), constrained_layout=True)
    x = alpha_deltas["alpha"].to_numpy(dtype=float)
    y = alpha_deltas["revenue_delta_pct"].to_numpy(dtype=float)
    ax.plot(x, y, marker="o", linewidth=2.0, markersize=4, color="#C44E52")
    ax.fill_between(x, y, 0.0, color="#C44E52", alpha=0.12)
    ax.axhline(0.0, color="#444444", linewidth=1.0, linestyle="--")
    ax.axvline(0.7, color="#666666", linewidth=1.0, linestyle="--")
    high = alpha_deltas[alpha_deltas["alpha"] >= 0.7]
    if not high.empty:
        # Rank by absolute value, so the "peak" can be a negative delta.
        best = high.reindex(
            high["revenue_delta_pct"].abs().sort_values(ascending=False).index
        ).iloc[0]
        ax.scatter(
            [best["alpha"]],
            [best["revenue_delta_pct"]],
            color="#1f77b4",
            s=45,
            zorder=3,
        )
        ax.annotate(
            f"high-alpha peak {best['revenue_delta_pct']:.2f}%",
            (float(best["alpha"]), float(best["revenue_delta_pct"])),
            textcoords="offset points",
            xytext=(6, 6),
            fontsize=8,
        )
    ax.set_xlabel(r"Contamination $\alpha$")
    ax.set_ylabel("Defended minus baseline revenue (%)")
    ax.set_title("Revenue Delta by Contamination (Final Cohort)")
    return _save_plot(fig, out_path)
 def _plot_focus_risk_deltas(alpha_deltas: pd.DataFrame, out_path: Path) -> Path:
    """Plot the COI-leakage and volatility deltas against alpha on one axis.

    Dashed reference lines are drawn at delta = 0 and alpha = 0.7. The figure
    is written through ``_save_plot`` and its return value is passed back.
    """
    fig, ax = plt.subplots(figsize=(7.8, 4.8), constrained_layout=True)
    x = alpha_deltas["alpha"].to_numpy(dtype=float)
    # Both series share the y-axis; no rescaling is applied to either.
    ax.plot(
        x,
        alpha_deltas["coi_leakage_delta"].to_numpy(dtype=float),
        marker="o",
        linewidth=1.8,
        markersize=4,
        color="#55A868",
        label="COI leakage delta",
    )
    ax.plot(
        x,
        alpha_deltas["volatility_delta"].to_numpy(dtype=float),
        marker="s",
        linewidth=1.8,
        markersize=3.8,
        color="#8172B3",
        label="Volatility delta",
    )
    ax.axhline(0.0, color="#444444", linewidth=1.0, linestyle="--")
    ax.axvline(0.7, color="#666666", linewidth=1.0, linestyle="--")
    ax.set_xlabel(r"Contamination $\alpha$")
    ax.set_ylabel("Defended minus baseline")
    ax.set_title("Leakage and Stability Deltas (Final Cohort)")
    ax.legend(loc="lower left")
    return _save_plot(fig, out_path)
 def _write_include(path: Path, figure_rel_path: str, width: str) -> Path:
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(f"\\includegraphics[width={width}]{{{figure_rel_path}}}\n")
    return path
 def run(bundle_dir: Path, output_dir: Path, plot_dir: Path) -> list[Path]:
    """Generate the final-cohort tables, headline JSON, plots and LaTeX includes.

    Args:
        bundle_dir: Sweep bundle directory passed to ``_load_runs``.
        output_dir: Directory for the CSV tables and the headline JSON.
        plot_dir: Directory for the three PDF figures.

    Returns:
        Every path written, in the order it was produced.
    """
    all_runs = _load_runs(bundle_dir)
    # Restrict the analysis to the single sweep chosen by _focus_sweep.
    focus_id = _focus_sweep(all_runs)
    focus_runs = all_runs[all_runs["sweep_id"] == focus_id].copy()
    alpha_mode = _alpha_mode_summary(focus_runs)
    deltas = _alpha_deltas(alpha_mode)
    zones = _zone_summary(deltas)
    output_dir.mkdir(parents=True, exist_ok=True)
    plot_dir.mkdir(parents=True, exist_ok=True)
    written: list[Path] = []
    alpha_mode_path = output_dir / "final_focus_alpha_mode_summary.csv"
    alpha_mode.to_csv(alpha_mode_path, index=False)
    written.append(alpha_mode_path)
    delta_path = output_dir / "final_focus_alpha_deltas.csv"
    deltas.to_csv(delta_path, index=False)
    written.append(delta_path)
    zone_path = output_dir / "final_focus_zone_summary.csv"
    zones.to_csv(zone_path, index=False)
    written.append(zone_path)
    # Headline figures fall back to 0 / None when there are no delta rows.
    headline = {
        "bundle": str(bundle_dir),
        "focus_cohort": "max_alpha_coverage",
        "alpha_cells": int(deltas["alpha"].nunique()) if not deltas.empty else 0,
        "alpha_min": float(deltas["alpha"].min()) if not deltas.empty else None,
        "alpha_max": float(deltas["alpha"].max()) if not deltas.empty else None,
        "mean_revenue_delta_pct": float(deltas["revenue_delta_pct"].mean())
        if not deltas.empty
        else None,
        "mean_reward_delta_pct": float(deltas["reward_delta_pct"].mean())
        if not deltas.empty
        else None,
        "zone_summary": zones.to_dict(orient="records"),
    }
    headline_path = output_dir / "final_focus_headline_summary.json"
    headline_path.write_text(json.dumps(headline, indent=2) + "\n")
    written.append(headline_path)
    written.append(
        _plot_focus_revenue_by_alpha(
            alpha_mode,
            plot_dir / "final_focus_revenue_by_alpha.pdf",
        )
    )
    written.append(
        _plot_focus_revenue_delta(
            deltas,
            plot_dir / "final_focus_revenue_delta.pdf",
        )
    )
    written.append(
        _plot_focus_risk_deltas(
            deltas,
            plot_dir / "final_focus_risk_deltas.pdf",
        )
    )
    # The include files always live next to this module, and the figure paths
    # inside them are fixed strings.
    # NOTE(review): those strings point at generated/final/plots regardless of
    # the output_dir / plot_dir arguments — confirm that is intended.
    include_dir = Path(__file__).resolve().parent / "includes" / "final"
    written.append(
        _write_include(
            include_dir / "final_focus_revenue_by_alpha.tex",
            "chapters/figures/results/generated/final/plots/final_focus_revenue_by_alpha.pdf",
            "0.98\\linewidth",
        )
    )
    written.append(
        _write_include(
            include_dir / "final_focus_revenue_delta.tex",
            "chapters/figures/results/generated/final/plots/final_focus_revenue_delta.pdf",
            "0.95\\linewidth",
        )
    )
    written.append(
        _write_include(
            include_dir / "final_focus_risk_deltas.tex",
            "chapters/figures/results/generated/final/plots/final_focus_risk_deltas.pdf",
            "0.95\\linewidth",
        )
    )
    return written
 def main() -> None:
    """Command-line entry point for the final-sweep figures and tables.

    ``--bundle-dir`` defaults to the newest bundle found by
    ``_default_bundle_dir`` and ``--plot-dir`` to
    ``_default_plot_dir(--output-dir)``. Every path returned by ``run`` is
    printed on its own line.

    Raises:
        FileNotFoundError: If ``--bundle-dir`` is omitted and no default
            bundle directory exists.
    """
    parser = argparse.ArgumentParser(
        description="Generate final paper figures/tables from the final sweep cohort"
    )
    # The default bundle is resolved after parsing. Looking it up here would
    # raise FileNotFoundError on machines without a default bundle even when
    # --bundle-dir is supplied, and would also break --help.
    parser.add_argument(
        "--bundle-dir",
        type=Path,
        default=None,
        help="Sweep bundle directory (default: newest bundle_* directory)",
    )
    parser.add_argument("--output-dir", type=Path, default=_default_output_dir())
    parser.add_argument("--plot-dir", type=Path, default=None)
    args = parser.parse_args()
    _configure_style()
    bundle_dir = (
        args.bundle_dir if args.bundle_dir is not None else _default_bundle_dir()
    )
    plot_dir = (
        args.plot_dir
        if args.plot_dir is not None
        else _default_plot_dir(args.output_dir)
    )
    outputs = run(
        bundle_dir=bundle_dir, output_dir=args.output_dir, plot_dir=plot_dir
    )
    for path in outputs:
        print(path)
 # Run the CLI only when executed as a script, not when imported.
 if __name__ == "__main__":
    main()
--- a/paper/src/chapters/figures/results/process_first_sweep.py
+++ b/paper/src/chapters/figures/results/process_first_sweep.py
@@ -0,0 +1,272 @@
 from __future__ import annotations
 import argparse
 import json
 from pathlib import Path
 from typing import Iterable
 import numpy as np
 import pandas as pd
 def _project_root() -> Path:
    """Return the sixth ancestor directory of this file's resolved path."""
    module_path = Path(__file__).resolve()
    return module_path.parents[5]
 def _default_input() -> Path:
    """Return ``tpu_orchestration/results/first_sweep.csv`` under the project root."""
    results_dir = _project_root() / "tpu_orchestration" / "results"
    return results_dir / "first_sweep.csv"
 def _default_output_dir() -> Path:
    return Path(__file__).resolve().parent / "generated" / "legacy"
 def _sanitize(key: str) -> str:
    return key.replace("/", "_").replace("-", "_")
 def _coerce_numeric(frame: pd.DataFrame, columns: Iterable[str]) -> None:
    for column in columns:
        if column in frame.columns:
            frame[column] = pd.to_numeric(frame[column], errors="coerce")
 def _extract_alpha(frame: pd.DataFrame) -> pd.Series:
    if "study/alpha" in frame.columns:
        return pd.to_numeric(frame["study/alpha"], errors="coerce")
    if "alpha" in frame.columns:
        return pd.to_numeric(frame["alpha"], errors="coerce")
    return pd.Series(np.nan, index=frame.index, dtype=float)
 def _extract_mode(frame: pd.DataFrame) -> pd.Series:
    if "study/mode" in frame.columns:
        return frame["study/mode"].astype(str).str.strip().str.lower()
    if "study/no_robust" in frame.columns:
        no_robust = pd.to_numeric(frame["study/no_robust"], errors="coerce").fillna(0.0)
        return pd.Series(
            np.where(no_robust > 0.5, "no_robust", "robust"),
            index=frame.index,
            dtype="object",
        )
    if "no_robust" in frame.columns:
        no_robust = (
            frame["no_robust"].astype(str).str.lower().isin({"1", "true", "yes"})
        )
        return pd.Series(
            np.where(no_robust, "no_robust", "robust"),
            index=frame.index,
            dtype="object",
        )
    return pd.Series("", index=frame.index, dtype="object")
 def _extract_tier(frame: pd.DataFrame) -> pd.Series:
    for column in ("tiers", "runtime/backend", "algo", "run.backend", "run.algo"):
        if column in frame.columns:
            tier = frame[column].astype(str).str.strip().str.lower()
            if tier.notna().any():
                return tier
    return pd.Series("unknown", index=frame.index, dtype="object")
 def _prepare_frame(frame: pd.DataFrame, include_non_finished: bool) -> pd.DataFrame:
    """Normalise a raw sweep export into the frame used by the summaries.

    Args:
        frame: Raw export; it is copied, never modified.
        include_non_finished: When false and a ``State`` column exists, only
            rows whose state is ``finished`` (case-insensitive) are kept.

    Returns:
        Rows with a recognised mode (``robust`` / ``no_robust``) and a
        non-missing alpha, with ``alpha``, ``mode`` and ``tier`` columns
        added, the known metric and hyperparameter columns coerced to
        numeric, sorted by tier, alpha and mode with a fresh index.
    """
    data = frame.copy()
    if not include_non_finished and "State" in data.columns:
        data = data[data["State"].astype(str).str.lower() == "finished"].copy()
    data["alpha"] = _extract_alpha(data)
    data["mode"] = _extract_mode(data)
    data["tier"] = _extract_tier(data)
    usable = data["mode"].isin({"robust", "no_robust"}) & data["alpha"].notna()
    # Copy the filtered rows before assigning columns below; writing into an
    # un-copied boolean slice triggers pandas' chained-assignment warning.
    data = data.loc[usable].copy()
    _coerce_numeric(
        data,
        [
            "eval/revenue_mean",
            "eval/reward_mean",
            "eval/coi_level_mean",
            "eval/coi_leakage_mean",
            "eval/margin_mean",
            "eval/volatility_mean",
            "objective/score",
            "train/alpha_adv",
            "lambda_coi",
            "robust_radius",
            "learning_rate",
            "batch_size",
            "n_steps",
            "total_timesteps",
        ],
    )
    return data.sort_values(["tier", "alpha", "mode"]).reset_index(drop=True)
 def _group_summary(
    frame: pd.DataFrame, by: list[str], metrics: list[str]
 ) -> pd.DataFrame:
    """Group ``frame`` by ``by`` and report the run count plus mean/std per metric.

    Output columns are named ``<sanitized metric>_mean`` and
    ``<sanitized metric>_std``; the result is sorted by the grouping keys.
    """
    spec: dict[str, tuple[str, str]] = {"runs": ("mode", "size")}
    for metric in metrics:
        stem = _sanitize(metric)
        spec.update(
            {
                f"{stem}_mean": (metric, "mean"),
                f"{stem}_std": (metric, "std"),
            }
        )
    grouped = frame.groupby(by, as_index=False).agg(**spec)
    return grouped.sort_values(by)
 def _tier_alpha_deltas(summary: pd.DataFrame, metrics: list[str]) -> pd.DataFrame:
    """Compute robust-minus-no_robust deltas for each ``(tier, alpha)`` cell.

    Cells lacking either mode are skipped. For every metric both the absolute
    delta and the delta as a percentage of the no_robust mean are reported;
    the percentage is NaN when that mean is zero.
    """
    records: list[dict[str, float | str]] = []
    for (tier, alpha), cell in summary.groupby(["tier", "alpha"], sort=True):
        treated = cell.loc[cell["mode"] == "robust"]
        control = cell.loc[cell["mode"] == "no_robust"]
        if treated.empty or control.empty:
            continue
        record: dict[str, float | str] = {
            "tier": str(tier),
            "alpha": float(alpha),
            "runs_robust": float(treated["runs"].iloc[0]),
            "runs_no_robust": float(control["runs"].iloc[0]),
        }
        for metric in metrics:
            stem = _sanitize(metric)
            mean_column = f"{stem}_mean"
            treated_mean = float(treated[mean_column].iloc[0])
            control_mean = float(control[mean_column].iloc[0])
            gap = treated_mean - control_mean
            record[f"{stem}_delta"] = gap
            if control_mean == 0:
                record[f"{stem}_delta_pct"] = np.nan
            else:
                record[f"{stem}_delta_pct"] = 100.0 * gap / control_mean
        records.append(record)
    return pd.DataFrame(records)
 def _top_runs(frame: pd.DataFrame, n: int) -> pd.DataFrame:
    rank_metric = "objective/score"
    if rank_metric not in frame.columns or frame[rank_metric].notna().sum() == 0:
        rank_metric = "eval/reward_mean"
    keep = [
        "Name",
        "tier",
        "alpha",
        "mode",
        rank_metric,
        "eval/revenue_mean",
        "eval/reward_mean",
        "eval/coi_level_mean",
        "eval/coi_leakage_mean",
        "lambda_coi",
        "robust_radius",
        "learning_rate",
        "batch_size",
        "n_steps",
        "total_timesteps",
    ]
    present = [column for column in keep if column in frame.columns]
    ranked = frame[present].copy().sort_values(rank_metric, ascending=False)
    return ranked.head(max(1, int(n))).reset_index(drop=True)
 def _headline_json(
    frame: pd.DataFrame, tier_mode: pd.DataFrame
 ) -> dict[str, float | str]:
    out: dict[str, float | str] = {
        "runs": int(len(frame)),
        "tiers": int(frame["tier"].nunique()),
        "alphas": int(frame["alpha"].nunique()),
    }
    robust_rows = tier_mode[tier_mode["mode"] == "robust"]
    no_robust_rows = tier_mode[tier_mode["mode"] == "no_robust"]
    if robust_rows.empty or no_robust_rows.empty:
        out["status"] = "incomplete_modes"
        return out
    robust_mean = robust_rows["eval_revenue_mean_mean"].mean()
    no_robust_mean = no_robust_rows["eval_revenue_mean_mean"].mean()
    out.update(
        {
            "status": "ok",
            "mean_tier_revenue_robust": float(robust_mean),
            "mean_tier_revenue_no_robust": float(no_robust_mean),
            "mean_tier_revenue_delta": float(robust_mean - no_robust_mean),
            "mean_tier_revenue_delta_pct": float(
                100.0 * (robust_mean - no_robust_mean) / no_robust_mean
            )
            if no_robust_mean
            else np.nan,
        }
    )
    return out
 def run(
    input_path: Path, output_dir: Path, include_non_finished: bool, top_n: int
 ) -> list[Path]:
    """Process the first-sweep CSV into summary tables and a headline JSON.

    Args:
        input_path: CSV export loaded with ``pd.read_csv``.
        output_dir: Directory for all outputs; created if missing.
        include_non_finished: Forwarded to ``_prepare_frame``.
        top_n: Number of rows requested from ``_top_runs``.

    Returns:
        Paths of the four CSV tables followed by the headline JSON.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    raw = pd.read_csv(input_path)
    frame = _prepare_frame(raw, include_non_finished=include_non_finished)
    # Only metrics that actually exist in the prepared frame are summarised.
    metrics = [
        metric
        for metric in (
            "eval/revenue_mean",
            "eval/reward_mean",
            "eval/coi_level_mean",
            "eval/coi_leakage_mean",
            "eval/margin_mean",
            "eval/volatility_mean",
            "objective/score",
            "train/alpha_adv",
        )
        if metric in frame.columns
    ]
    tier_mode = _group_summary(frame, ["tier", "mode"], metrics)
    tier_alpha_mode = _group_summary(frame, ["tier", "alpha", "mode"], metrics)
    deltas = _tier_alpha_deltas(tier_alpha_mode, metrics)
    top_configs = _top_runs(frame, n=top_n)
    headline = _headline_json(frame, tier_mode)
    # File name -> table; the insertion order fixes the order of the result.
    outputs = {
        "first_sweep_tier_mode_summary.csv": tier_mode,
        "first_sweep_tier_alpha_mode_summary.csv": tier_alpha_mode,
        "first_sweep_tier_alpha_deltas.csv": deltas,
        "first_sweep_top_configs.csv": top_configs,
    }
    written_paths: list[Path] = []
    for filename, table in outputs.items():
        path = output_dir / filename
        table.to_csv(path, index=False)
        written_paths.append(path)
    headline_path = output_dir / "first_sweep_headline_summary.json"
    headline_path.write_text(json.dumps(headline, indent=2))
    written_paths.append(headline_path)
    return written_paths
 def main() -> None:
    """Command-line entry point: process the first sweep and print the outputs.

    Every path returned by ``run`` is printed on its own line.
    """
    cli = argparse.ArgumentParser(
        description="Process first sweep CSV for paper tables"
    )
    cli.add_argument("--input", type=Path, default=_default_input())
    cli.add_argument("--output-dir", type=Path, default=_default_output_dir())
    cli.add_argument("--include-non-finished", action="store_true")
    cli.add_argument("--top-n", type=int, default=25)
    options = cli.parse_args()
    produced = run(
        input_path=options.input,
        output_dir=options.output_dir,
        include_non_finished=bool(options.include_non_finished),
        top_n=int(options.top_n),
    )
    for artifact in produced:
        print(artifact)
 # Run the CLI only when executed as a script, not when imported.
 if __name__ == "__main__":
    main()
--- a/paper/src/chapters/figures/results/process_ppo_benchmark.py
+++ b/paper/src/chapters/figures/results/process_ppo_benchmark.py
@@ -0,0 +1,277 @@
 from __future__ import annotations
 import argparse
 import json
 from pathlib import Path
 from typing import Iterable
 import numpy as np
 import pandas as pd
 def _project_root() -> Path:
    return Path(__file__).resolve().parents[5]
 def _default_input() -> Path:
    """Return the default benchmark CSV path under the project root."""
    return _project_root().joinpath(
        "tpu_orchestration", "results", "ppo_benchmark.csv"
    )
 def _default_output_dir() -> Path:
    return Path(__file__).resolve().parent / "generated" / "legacy"
 def _sanitize(key: str) -> str:
    return key.replace("/", "_").replace("-", "_")
 def _coerce_numeric(frame: pd.DataFrame, columns: Iterable[str]) -> None:
    for column in columns:
        if column in frame.columns:
            frame[column] = pd.to_numeric(frame[column], errors="coerce")
 def _extract_alpha(frame: pd.DataFrame) -> pd.Series:
    if "study/alpha" in frame.columns:
        return pd.to_numeric(frame["study/alpha"], errors="coerce")
    if "alpha" in frame.columns:
        return pd.to_numeric(frame["alpha"], errors="coerce")
    return pd.Series(np.nan, index=frame.index, dtype=float)
 def _extract_mode(frame: pd.DataFrame) -> pd.Series:
    if "study/mode" in frame.columns:
        return frame["study/mode"].astype(str).str.strip().str.lower()
    if "study/no_robust" in frame.columns:
        no_robust = pd.to_numeric(frame["study/no_robust"], errors="coerce").fillna(0.0)
        return pd.Series(
            np.where(no_robust > 0.5, "no_robust", "robust"),
            index=frame.index,
            dtype="object",
        )
    if "no_robust" in frame.columns:
        no_robust = (
            frame["no_robust"].astype(str).str.lower().isin({"1", "true", "yes"})
        )
        return pd.Series(
            np.where(no_robust, "no_robust", "robust"),
            index=frame.index,
            dtype="object",
        )
    return pd.Series("", index=frame.index, dtype="object")
 def _prepare_frame(frame: pd.DataFrame, include_non_finished: bool) -> pd.DataFrame:
    """Filter and normalise the raw benchmark export for aggregation.

    Args:
        frame: Raw run table as read from the CSV. It is not modified.
        include_non_finished: When false and a ``State`` column exists, only
            rows whose state equals ``finished`` (case-insensitive) are kept.

    Returns:
        A new frame with derived ``alpha`` and ``mode`` columns, restricted to
        rows whose mode is ``robust``/``no_robust`` and whose alpha is not
        NaN, with the known metric columns coerced to numeric, sorted by
        ``alpha`` then ``mode`` and re-indexed from zero.
    """
    data = frame.copy()
    if not include_non_finished and "State" in data.columns:
        data = data[data["State"].astype(str).str.lower() == "finished"].copy()
    data["alpha"] = _extract_alpha(data)
    data["mode"] = _extract_mode(data)
    keep = data["mode"].isin({"robust", "no_robust"}) & data["alpha"].notna()
    # Copy the filtered rows explicitly: _coerce_numeric assigns columns in
    # place, and doing that on a boolean-mask slice raises
    # SettingWithCopyWarning on pandas versions without copy-on-write.
    data = data[keep].copy()
    numeric_cols = [
        "eval/revenue_mean",
        "eval/reward_mean",
        "eval/coi_level_mean",
        "eval/coi_leakage_mean",
        "eval/volatility_mean",
        "eval/margin_mean",
        "train/alpha_adv",
        "train/coi_penalty",
        "train/ux_penalty",
        "train/agent_prob",
    ]
    _coerce_numeric(data, numeric_cols)
    return data.sort_values(["alpha", "mode"]).reset_index(drop=True)
 def _summary_by_alpha_mode(frame: pd.DataFrame, metrics: list[str]) -> pd.DataFrame:
    """Aggregate run counts and per-metric mean/std for each (alpha, mode) pair.

    Output columns are ``alpha``, ``mode``, ``runs`` and, for every metric,
    ``<sanitized>_mean`` and ``<sanitized>_std``. Rows are sorted by alpha
    then mode with a fresh zero-based index.
    """
    group_keys = ["alpha", "mode"]
    named_aggs: dict[str, tuple[str, str]] = {"runs": ("mode", "size")}
    for metric in metrics:
        stem = _sanitize(metric)
        named_aggs.update(
            {
                f"{stem}_mean": (metric, "mean"),
                f"{stem}_std": (metric, "std"),
            }
        )
    aggregated = frame.groupby(group_keys, as_index=False).agg(**named_aggs)
    return aggregated.sort_values(group_keys).reset_index(drop=True)
 def _delta_by_alpha(summary: pd.DataFrame, metrics: list[str]) -> pd.DataFrame:
    """Compare robust against non-robust means at every alpha level.

    ``summary`` is expected to carry ``alpha``, ``mode``, ``runs`` and
    ``<sanitized>_mean`` columns. Alpha levels lacking either mode are
    skipped. For each metric the output row holds the robust mean, the
    non-robust mean, their difference (robust minus non-robust) and that
    difference as a percentage of the non-robust mean (NaN when that mean
    is zero).
    """
    records: list[dict[str, float]] = []
    for alpha_value, level in summary.groupby("alpha", sort=True):
        robust_part = level[level["mode"] == "robust"]
        baseline_part = level[level["mode"] == "no_robust"]
        if robust_part.empty or baseline_part.empty:
            continue

        record: dict[str, float] = {
            "alpha": float(alpha_value),
            "runs_robust": float(robust_part["runs"].iloc[0]),
            "runs_no_robust": float(baseline_part["runs"].iloc[0]),
        }
        for metric in metrics:
            stem = _sanitize(metric)
            mean_column = f"{stem}_mean"
            with_robust = float(robust_part[mean_column].iloc[0])
            without_robust = float(baseline_part[mean_column].iloc[0])
            gap = with_robust - without_robust
            if without_robust == 0:
                gap_pct = np.nan
            else:
                gap_pct = 100.0 * gap / without_robust
            record[f"{stem}_robust"] = with_robust
            record[f"{stem}_no_robust"] = without_robust
            record[f"{stem}_delta"] = gap
            record[f"{stem}_delta_pct"] = gap_pct
        records.append(record)
    return pd.DataFrame(records)
 def _pairwise_win_rates(frame: pd.DataFrame) -> pd.DataFrame:
    rules = {
        "eval/revenue_mean": "higher",
        "eval/reward_mean": "higher",
        "eval/coi_leakage_mean": "lower",
        "eval/volatility_mean": "lower",
    }
    rows: list[dict[str, float]] = []
    for alpha, alpha_group in frame.groupby("alpha", sort=True):
        robust = alpha_group[alpha_group["mode"] == "robust"]
        no_robust = alpha_group[alpha_group["mode"] == "no_robust"]
        if robust.empty or no_robust.empty:
            continue
        for metric, direction in rules.items():
            if metric not in frame.columns:
                continue
            robust_values = robust[metric].dropna().to_numpy(dtype=float)
            no_robust_values = no_robust[metric].dropna().to_numpy(dtype=float)
            if robust_values.size == 0 or no_robust_values.size == 0:
                continue
            if direction == "higher":
                wins = (robust_values[:, None] > no_robust_values[None, :]).sum()
            else:
                wins = (robust_values[:, None] < no_robust_values[None, :]).sum()
            ties = (robust_values[:, None] == no_robust_values[None, :]).sum()
            total = robust_values.size * no_robust_values.size
            win_prob = (wins + 0.5 * ties) / total
            rows.append(
                {
                    "alpha": float(alpha),
                    "metric": metric,
                    "direction": direction,
                    "wins": int(wins),
                    "ties": int(ties),
                    "total_pairs": int(total),
                    "win_probability": float(win_prob),
                }
            )
    return pd.DataFrame(rows)
 def _overall_mode_summary(frame: pd.DataFrame, metrics: list[str]) -> pd.DataFrame:
    """Aggregate run counts and per-metric mean/std for each mode across all alphas.

    Output columns are ``mode``, ``runs`` and, for every metric,
    ``<sanitized>_mean`` and ``<sanitized>_std``; rows are sorted by mode.
    """
    named_aggs: dict[str, tuple[str, str]] = {"runs": ("mode", "size")}
    for metric in metrics:
        stem = _sanitize(metric)
        for statistic in ("mean", "std"):
            named_aggs[f"{stem}_{statistic}"] = (metric, statistic)
    per_mode = frame.groupby("mode", as_index=False).agg(**named_aggs)
    return per_mode.sort_values("mode")
 def _headline_json(overall: pd.DataFrame) -> dict[str, float | str]:
    if {"robust", "no_robust"} - set(overall["mode"].tolist()):
        return {"status": "incomplete_modes"}
    robust = overall[overall["mode"] == "robust"].iloc[0]
    no_robust = overall[overall["mode"] == "no_robust"].iloc[0]
    revenue_delta = float(
        robust["eval_revenue_mean_mean"] - no_robust["eval_revenue_mean_mean"]
    )
    leakage_delta = float(
        robust["eval_coi_leakage_mean_mean"] - no_robust["eval_coi_leakage_mean_mean"]
    )
    return {
        "status": "ok",
        "revenue_delta": revenue_delta,
        "revenue_delta_pct": float(
            100.0 * revenue_delta / no_robust["eval_revenue_mean_mean"]
        ),
        "coi_leakage_delta": leakage_delta,
        "coi_leakage_delta_pct": float(
            100.0 * leakage_delta / no_robust["eval_coi_leakage_mean_mean"]
        ),
    }
 def run(input_path: Path, output_dir: Path, include_non_finished: bool) -> list[Path]:
    """Read the PPO benchmark CSV and write the paper's summary tables.

    Args:
        input_path: CSV file holding one row per run.
        output_dir: Destination directory; created (with parents) if missing.
        include_non_finished: Forwarded to ``_prepare_frame``.

    Returns:
        Paths of the four CSV tables followed by the headline JSON file, in
        the order they were written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    frame = _prepare_frame(
        pd.read_csv(input_path), include_non_finished=include_non_finished
    )

    # Only metrics actually present in the export are aggregated.
    candidate_metrics = (
        "eval/revenue_mean",
        "eval/reward_mean",
        "eval/coi_level_mean",
        "eval/coi_leakage_mean",
        "eval/volatility_mean",
        "eval/margin_mean",
        "train/alpha_adv",
        "train/coi_penalty",
        "train/ux_penalty",
        "train/agent_prob",
    )
    metrics = [name for name in candidate_metrics if name in frame.columns]

    # Build every table (and the headline) before touching the filesystem.
    alpha_mode = _summary_by_alpha_mode(frame, metrics)
    deltas = _delta_by_alpha(alpha_mode, metrics)
    win_rates = _pairwise_win_rates(frame)
    overall = _overall_mode_summary(frame, metrics)
    headline = _headline_json(overall)

    tables = {
        "ppo_alpha_mode_summary.csv": alpha_mode,
        "ppo_alpha_deltas.csv": deltas,
        "ppo_pairwise_win_rates.csv": win_rates,
        "ppo_overall_mode_summary.csv": overall,
    }
    written_paths: list[Path] = []
    for filename, table in tables.items():
        destination = output_dir / filename
        table.to_csv(destination, index=False)
        written_paths.append(destination)

    headline_path = output_dir / "ppo_headline_summary.json"
    headline_path.write_text(json.dumps(headline, indent=2))
    written_paths.append(headline_path)
    return written_paths
 def main() -> None:
    """Command-line entry point for the PPO benchmark table generator.

    Parses ``--input``, ``--output-dir`` and ``--include-non-finished``,
    forwards them to ``run`` and prints every path it returns, one per line.
    """
    cli = argparse.ArgumentParser(
        description="Process PPO benchmark CSV for paper tables"
    )
    cli.add_argument("--input", type=Path, default=_default_input())
    cli.add_argument("--output-dir", type=Path, default=_default_output_dir())
    cli.add_argument("--include-non-finished", action="store_true")
    options = cli.parse_args()
    produced = run(
        input_path=options.input,
        output_dir=options.output_dir,
        include_non_finished=bool(options.include_non_finished),
    )
    # Emit the generated artifact paths so callers can pick them up from stdout.
    for produced_path in produced:
        print(produced_path)
 # Run the command-line entry point only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/paper/src/chapters/figures/supra/process_supra.py
+++ b/paper/src/chapters/figures/supra/process_supra.py
--- a/paper/src/chapters/figures/supra/supra.csv
+++ b/paper/src/chapters/figures/supra/supra.csv
--- a/paper/src/chapters/figures/supra/supra.tex
+++ b/paper/src/chapters/figures/supra/supra.tex
@@ -21,7 +21,7 @@
        surf,
        shader=flat,
        mesh/check=false % Disable check to rely on empty lines
-    ] table [col sep=comma, x=step, y=price, z=density] {chapters/figures/supra_data.csv};
+    ] table [col sep=comma, x=step, y=price, z=density] {chapters/figures/supra/supra_data.csv};
    \end{axis}
 \end{tikzpicture}
--- a/paper/src/chapters/figures/supra/supra_data.csv
+++ b/paper/src/chapters/figures/supra/supra_data.csv
@@ -4038,4 +4038,3 @@ step,price,density
 4000,146.51098761558535,0.0
 4000,147.9065925693512,0.0
 4000,149.30219752311706,10.0
--- a/paper/src/chapters/hero_architecture_figure.tex
+++ b/paper/src/chapters/hero_architecture_figure.tex
@@ -0,0 +1,166 @@
 \definecolor{heroBlue}{RGB}{212, 228, 255}
 \definecolor{heroBlueBorder}{RGB}{64, 103, 178}
 \definecolor{heroGreen}{RGB}{214, 238, 216}
 \definecolor{heroGreenBorder}{RGB}{48, 133, 66}
 \definecolor{heroAmber}{RGB}{246, 230, 202}
 \definecolor{heroAmberBorder}{RGB}{166, 121, 51}
 \definecolor{heroGray}{RGB}{236, 236, 236}
 \definecolor{heroGrayBorder}{RGB}{120, 120, 120}
 % Panels occupy y = 2.2 .. 10.0
 % Cross-panel connector gutter lives at y = 1.0 .. 2.2  (clearly below all nodes)
 \begin{tikzpicture}[
    >=Stealth,
    font=\small,
    panel/.style={draw=black!65, dashed, rounded corners=4pt, line width=0.85pt},
    bB/.style={rectangle, rounded corners=3pt, draw=heroBlueBorder,  fill=heroBlue,
               line width=0.9pt, align=center, minimum height=0.85cm},
    bG/.style={rectangle, rounded corners=3pt, draw=heroGreenBorder, fill=heroGreen,
               line width=0.9pt, align=center, minimum height=0.85cm},
    bA/.style={rectangle, rounded corners=3pt, draw=heroAmberBorder, fill=heroAmber,
               line width=0.9pt, align=center, minimum height=0.85cm},
    bY/.style={rectangle, rounded corners=3pt, draw=heroGrayBorder,  fill=heroGray,
               line width=0.9pt, align=center, minimum height=0.82cm},
    pill/.style={ellipse, draw=black!50, fill=black!4, line width=0.75pt,
                 align=center, minimum width=1.6cm, minimum height=0.68cm},
    arr/.style={->, draw=black!80, line width=0.88pt},
    bidir/.style={<->, draw=black!80, line width=0.88pt},
    darr/.style={->, draw=black!60, line width=0.80pt, densely dashed},
    crossA/.style={->, draw=heroAmberBorder!90!black, line width=1.15pt, dash pattern=on 3.5pt off 2pt},
    crossG/.style={->, draw=heroGreenBorder!90!black, line width=1.15pt, dash pattern=on 3.5pt off 2pt},
    arrG/.style={->, draw=heroGreenBorder!90!black, line width=1.15pt},
    lbl/.style={font=\scriptsize, align=center, fill=white, inner sep=1.5pt, text=black}
 ]
 %% ============================================================
 %%  Panel A   x: 0.2–11.2    y: 2.2–10.0
 %% ============================================================
 \draw[panel] (0.2,2.2) rectangle (11.2,10.0);
 \node[anchor=west, font=\small\bfseries] at (0.45,9.72) {(a) Online platform and data plane};
 \node[pill] (human) at (1.3, 8.55) {Human};
 \node[pill] (agent) at (1.3, 7.45) {Agent};
 \node[bB, minimum width=2.75cm] (web)      at (4.2,  8.0)  {Next.js\\Web App};
 \node[bB, minimum width=2.75cm] (provider) at (7.35, 8.0)  {Pricing\\Provider};
 \node[bY, minimum width=1.85cm] (redis)    at (9.85, 8.0)  {Redis};
 \node[bG, minimum width=3.1cm]  (kBehav)  at (4.0,  6.2)  {Kafka stream\\Behavior events};
 \node[bG, minimum width=3.0cm]  (kQuotes) at (7.5,  6.2)  {Kafka stream\\Price quotes};
 \node[bA, minimum width=3.1cm]  (worker)   at (4.0, 4.4)  {Worker / ETL\\Feature jobs};
 \node[bA, minimum width=2.65cm] (registry) at (8.45, 4.4)  {Model\\Registry};
 % service row
 \draw[arr]   (human.east)    -- (web.west);
 \draw[arr]   (agent.east)    -- (web.west);
 \draw[arr]   (web.east)      -- (provider.west);
 \draw[bidir] (provider.east) -- (redis.west);
 % web/provider -> kafka
 \draw[arr] (web.south)      -- (kBehav.north)
    node[midway, left, lbl] {$e=(a,i,t,\mu,\delta)$};
 \draw[arr] (provider.south) -- (kQuotes.north)
    node[midway, right, lbl] {$(i,p,\mathrm{sid},\phi,t)$};
 % kafka -> worker (straight south)
 \draw[arr] (kBehav.south)  -- (worker.north);
 \draw[arr] (kQuotes.south) -- (worker.north);
 % worker -> registry
 \draw[arr] (worker.east) -- (registry.west);
 % model refresh: registry east -> goes right to x=11.0, north to y=9.2, left to provider
 % this keeps it entirely inside panel A with no crossing of nodes
 \draw[crossA, rounded corners=6pt]
    (registry.east) -- (11.0, 4.4)
    -- (11.0, 9.2)
    -- node[midway, lbl] {model refresh} (provider.north |- 0, 9.2)
    -- (provider.north);
 %% ============================================================
 %%  Panel B   x: 11.6–19.8    y: 2.2–10.0
 %% ============================================================
 \draw[panel] (11.6,2.2) rectangle (19.8,10.0);
 \node[anchor=west, font=\small\bfseries] at (11.85,9.72) {(b) Distinguishability layer};
 \node[bG, minimum width=2.4cm]  (session) at (14.0, 8.9)  {Session prefix\\$\tau'$};
 \node[bB, minimum width=2.4cm]  (empKern) at (13.65,7.45) {Empirical kernel\\$\hat T'$};
 \node[bY, minimum width=2.4cm]  (weakLab) at (17.55,8.9) {Weak labels\\$\mathcal{D}_H,\mathcal{D}_A$};
 \node[bY, minimum width=2.2cm]  (protoH)  at (12.8, 5.9)  {Prototype\\$\bar T_H$};
 \node[bA, minimum width=2.4cm]  (kldist)  at (15.55,5.9)  {KL distances\\$\Delta_H,\Delta_A$};
 \node[bY, minimum width=2.2cm]  (protoA)  at (18.3, 5.9)  {Prototype\\$\bar T_A$};
 \node[bB, minimum width=2.9cm]  (calHead) at (13.55,4.25) {Contrastive\\calibration head};
 \node[bG, minimum width=2.55cm] (score)   at (17.75,4.25) {Session score\\$f(\tau'),\hat\alpha(\tau')$};
 \node[lbl] at (15.55, 3.15) {$\hat\alpha(\tau')=\sigma\!\left(\beta(\Delta_H-\Delta_A)\right)$};
 \draw[arr, rounded corners=4pt] (session.south)  -- (empKern.north);
 \draw[arr, rounded corners=4pt] (empKern.south) -- (13.65, 6.8) -| (protoH.north);
 \draw[arr, rounded corners=4pt] (weakLab.south) -- (17.55, 6.8) -| (protoA.north);
 % weak labels -> protoH: go south then hard-left below weakLab
 \draw[arr, rounded corners=4pt] (weakLab.south) -- (17.55,6.8) -| (protoH.north east);
 \draw[arr] (protoH.east)    -- (kldist.west);
 \draw[arr] (protoA.west)    -- (kldist.east);
 \draw[arr] (kldist.south)   -- (calHead.north east);
 \draw[arr] (calHead.east)   -- (score.west);
 %% ============================================================
 %%  Panel C   x: 20.8–31.0    y: 2.2–10.0
 %% ============================================================
 \draw[panel] (20.8,2.2) rectangle (31.0,10.0);
 \node[anchor=west, font=\small\bfseries] at (21.05,9.72) {(c) Distributionally robust control};
 \node[bB, minimum width=3.1cm] (state)    at (23.15, 8.9)
    {State summary\\$[p_{t-1},\hat q_{t-1},f(\tau')]$};
 \node[bY, minimum width=2.9cm] (ambSet)   at (23.15, 7.45) {Ambiguity set\\$\mathcal U_\epsilon(\hat P_N)$};
 \node[bG, minimum width=2.9cm] (innerMin) at (28.55, 7.45) {Inner minimisation\\$\min_{Q\in\mathcal U_\epsilon}$};
 \node[bY, minimum width=8.2cm] (contScen) at (25.9,  5.9)
    {Contamination scenarios $\;\alpha_k\in\mathcal A_{\epsilon_\alpha}(\alpha_0)$};
 \node[bA, minimum width=8.8cm] (reward)   at (25.9,  4.45)
    {$r_t = R(p_t,\hat q_t) - \lambda\,\mathrm{COI}_{\mathrm{leak}}(p_t,\tau_t') - \eta\,UX_t$};
 \node[bB, minimum width=2.85cm] (policy)  at (22.75, 3.05) {Robust policy $\pi^*$};
 \node[bG, minimum width=2.85cm] (publish) at (29.05, 3.05) {Publish price\\vector $p_t$};
 \node[lbl] at (25.9, 2.55) {$\pi^*=\arg\max_\pi\min_{Q\in\mathcal U_\epsilon}\mathbb{E}[r_t]$};
 \draw[arr] (state.south)      -- (ambSet.north);
 \draw[arr] (ambSet.east)      -- (innerMin.west);
 \draw[arr, rounded corners=4pt] (ambSet.south) -- (23.15, 6.6) -| ([xshift=-2cm]contScen.north);
 \draw[arr, rounded corners=4pt] (innerMin.south) -- (28.55, 6.6) -| ([xshift=2cm]contScen.north);
 \draw[arr] (contScen.south)   -- (reward.north);
 \draw[arr, rounded corners=6pt] (reward.south) -- (25.9, 3.7) -| (policy.north);
 \draw[arr] (policy.east)      -- (publish.west);
 % market response: up the right edge of panel C, entirely inside, rounded
 \draw[arrG, rounded corners=6pt] (publish.east) -- (30.6, 3.05)
    -- (30.6, 9.8)
    -- node[midway, lbl] {market response} (state.north |- 0, 9.8)
    -- (state.north);
 %% ============================================================
 %%  Cross-panel connectors – gutter at y = 1.0..2.2
 %%  Three separate depths: 1.85, 1.45, 1.05  (no overlaps)
 %% ============================================================
 % 1. Worker -> Session  (depth y=1.85, shallowest)
 \draw[crossA, rounded corners=6pt]
    (worker.south) -- (worker.south |- 0, 1.85)
    -- node[pos=0.5, lbl] {offline extraction} (11.4, 1.85)
    -- (11.4, 8.9)
    -- (session.west);
 % 2. Score -> State  (depth y=1.45)
 \draw[crossG, rounded corners=6pt]
    (score.south) -- (score.south |- 0, 1.45)
    -- node[pos=0.5, lbl] {contamination signal} (20.6, 1.45)
    -- (20.6, 8.9)
    -- (state.west);
 % 3. Publish -> Provider  (depth y=1.05, deepest)
 \draw[crossG, rounded corners=3pt]
    (publish.south) -- (publish.south |- 0, 1.05)
    -- node[pos=0.4, lbl] {serve online} (5.8, 1.05)
    -- (5.8, 7.7)
    -- ([yshift=-0.3cm]provider.west);
 \end{tikzpicture}
--- a/paper/src/chapters/slacberger.tex
+++ b/paper/src/chapters/slacberger.tex
@@ -62,7 +62,7 @@ We propose a robust optimization objective. The platform seeks a pricing policy
 Here:
 \begin{itemize}
    \item The first term, $p_t \cdot \hat{q}_t(p_t | \theta=H)$, represents the revenue generated strictly from the estimated human segment.
-    \item $\mathcal{L}_{detect}$ is a penalty term for failing to separate distributions (the cost of confusion).
+    \item $\mathcal{L}_{detect}$ is a penalty term for failing to distinguish distributions (the cost of confusion).
    \item $\lambda$ is a hyperparameter balancing revenue exploitation vs. robust detection.
 \end{itemize}
--- a/paper/src/graphics/banner.png
+++ b/paper/src/graphics/banner.png
--- a/paper/src/main-genpop.tex
+++ b/paper/src/main-genpop.tex
@@ -57,7 +57,7 @@ These behavioral signals serve as inputs for a Distributionally Robust Reinforce
 \item[Trajectory] Defined as a series of unspecified length, collecting data on states of some object over time.
 \item[Cost of Information (COI)] The average premium extracted above marginal cost due to information asymmetry.
 \item[Contamination Ratio] The proportion of agent sessions versus human sessions in the system.
-\item[Separability] The ability to distinguish between human and agent behavioral patterns.
+\item[Distinguishability] The ability to distinguish between human and agent behavioral patterns.
 \end{description}
 \section{Aggregate Compute Budget Derivation}
--- a/paper/src/main.tex
+++ b/paper/src/main.tex
@@ -29,6 +29,9 @@ These behavioral signals serve as inputs for a Distributionally Robust Reinforce
 \vspace{1em}
 \noindent\textbf{Acknowledgments:} This research was supported by the TPU Research Cloud program, which provided access to Google Cloud TPU accelerators (including TPU v4, v5e, and v6e).
 \vspace{0.5em}
 \noindent\textbf{Project page:} \url{https://velocitatem.github.io/PHANTOM/}
 \clearpage
 \input{chapters/01-intro}
 \input{chapters/02-literature-review}
@@ -43,15 +46,44 @@ These behavioral signals serve as inputs for a Distributionally Robust Reinforce
 \appendix
 \section{Terminology}
 \begin{description}
-\item[Agent $A$] An actor of non-human nature, powered by an LLM.
+\item[Agent $A$] A non-human actor, typically an LLM-driven system that executes web actions toward a goal.
-\item[Human $H$] An individual human with some job to be done.
+\item[Human $H$] A human participant interacting with the platform to complete a task.
-\item[Actor $\theta$] Defines a type of class which is either Agent or Human and has the capability to carry out actions on a web platform.
+\item[Actor Type $\theta$] A latent class parameter describing whether a session is generated by a human or an agent profile.
-\item[Platform] Any web-based platform which serves an interface to a collection of items that can be purchased, each at some price $p_i$.
+\item[Platform] A web interface exposing purchasable items and their offered prices.
-\item[Behavioral Model] A mathematical model predicting what action comes after a series of prior actions.
+\item[Session $s$] A bounded interaction record tied to one actor and one session identifier.
-\item[LLM] Large Language Model served by some provider with the abstracted capability of tool calling.
+\item[Event $e_{s,k}$] A single interaction tuple in a session, including action, item target, and timestamp.
-\item[TPU] Tensor Processing Unit which is a unique kind of chip architecture developed by Google.
+\item[Trajectory $\tau_s$] The ordered sequence of events generated within a session.
-\item[Trajectory] Defined as a series of unspecified length, collecting data on states of some object over time.
+\item[Demand Proxy $\hat{q}_{t,i}$] A weighted aggregate of observed actions used as an operational substitute for latent demand.
-% TODO: maybe define other things in a similar succient manner
+\item[Action Weight Function $\omega(a)$] A mapping from action type to signal strength in the demand proxy.
 \item[True Demand $d(p;\theta)$] The latent purchase response as a function of price and actor type.
 \item[Contamination $\alpha$] The proportion of agent-generated traffic in the session mixture.
 \item[Non-stationary Noise $\epsilon_t$] Time-varying residual variation not explained by the actor mixture.
 \item[Pricing Policy $\pi(\tau)$] A function mapping observed interaction history to an offered price.
 \item[Cost of Information (COI)] The expected premium above the minimum viable price induced by the pricing policy.
 \item[COI Leakage] A per-quote penalty term modeling information revealed to reconnaissance behavior.
 \item[First-Order Statistic $p_{(1)}$] The minimum observed price among multiple independent queries.
 \item[Transition Kernel $\mathcal{T}$] A Markov transition matrix over behavioral states or actions.
 \item[Distinguishability] The degree to which human and agent sessions can be distinguished from behavior alone.
 \item[KL Divergence $D_{KL}$] A relative-entropy measure used to compare session transition structure against class prototypes.
 \item[Divergence Scores $\Delta_H,\Delta_A$] Session-level distances to human and agent transition centroids.
 \item[Weak Agent Probability $f(\tau)$] A session-level score estimating the likelihood that a trajectory is agent-generated.
 \item[Contamination Generator $\mathcal{G}(\alpha)$] A simulator component that injects synthetic agent trajectories to reach a target mixture level.
 \item[Stackelberg Game] A leader-follower formulation where the platform sets prices and demand responds.
 \item[Ambiguity Set $\mathcal{U}_{\epsilon}$] A set of plausible demand distributions considered under distributional uncertainty.
 \item[Wasserstein Ball] A distance-bounded neighborhood around an empirical distribution used in robust optimization.
 \item[DR-RL] Distributionally Robust Reinforcement Learning for policies trained against worst-case distributional shifts.
 \item[Nominal Contamination $\alpha_0$] The baseline contamination level around which robust candidates are evaluated.
 \item[Robustness Radius $\epsilon_\alpha$] The local interval width used for inner minimization over contamination scenarios.
 \item[Query-Tax Surrogate] A constant leakage proxy assigning fixed penalty to suspected reconnaissance queries.
 \item[Revelation Surrogate] A leakage proxy based on $-\log\pi(p\mid\tau)$ to penalize highly informative quotes.
 \item[Limbo Stack] The alternating game-history buffer that stores leader price moves and follower demand responses.
 \item[UX Index] A bounded user-experience metric tracked to evaluate policy side effects on legitimate users.
 \item[Look-to-Book Ratio] The ratio of search-like interactions to completed purchases, used as an operational contamination indicator.
 \item[Hybrid Kappa-Lambda Architecture] A data design combining streaming ingestion with offline and batch learning loops.
 \item[MDP / POMDP] Sequential decision models with full observability (MDP) or partial observability (POMDP).
 \item[Behavioral Model] A model predicting what action is likely to follow from prior actions.
 \item[LLM] Large Language Model served through an inference provider with tool-use capability.
 \item[TPU] Tensor Processing Unit, a specialized accelerator architecture developed by Google.
 \end{description}
 \section{Aggregate Compute Budget Derivation}
@@ -78,6 +110,30 @@ v4             &  64 & 275 & $64  \times 275 = 17{,}600$  \\
 Converting to petaFLOPS: $160{,}320\;\text{TFLOPS} = 160.32\;\text{PFLOPS} \approx 160\;\text{PFLOPS}$. This is the theoretical peak under sustained BF16 arithmetic; realized throughput depends on memory bandwidth utilization and inter-chip communication overhead, but the figure serves as a useful upper bound for provisioning decisions.
 \section{Slope-Test Verification: Revenue vs. Contamination}
 \label{app:alpha_revenue_slope}
 This appendix provides a compact verification of the slope result reported in the main results section. Using the same run-level pairs $x_i=\texttt{study/alpha}_i$ and $y_i=\texttt{eval/revenue\_mean}_i$ ($n=95$), we re-ran the ordinary least squares fit in Python and tested the slope with standard routines (SciPy's two-sided $t$-test for the slope).
 \[
 \widehat{y}=326{,}878.57-60{,}631.95\,x,
 \]
 \[
 t(93)=-8.2148,\qquad p=1.2038\times 10^{-12},\qquad R^2=0.4205,\qquad 95\%\,\text{CI}_{\beta_1}=[-75{,}288.76,\,-45{,}975.13].
 \]
 The Python verification reproduces the reported coefficients and inference values, confirming that the slope-test results are correct under standard methods.
 \section{whoclickedit Dataset Card}
 \label{app:whoclicked_card}
 For transparency and reproducibility, this appendix includes the full dataset card used for the public release of the \texttt{whoclickedit} dataset.
 \lstinputlisting[
  caption={whoclickedit dataset card (README snapshot)},
  label={lst:whoclicked_dataset_card}
 ]{chapters/auto/whoclicked_dataset_card.md}
 % \input{../build/concatenated_code}
 \end{document}
--- a/paper/src/mirrors/cais2026/main.tex
+++ b/paper/src/mirrors/cais2026/main.tex
@@ -41,7 +41,7 @@
 \begin{abstract}
 Dynamic pricing pipelines in e-commerce consume behavioral demand signals to set prices, but the growing presence of LLM-powered agents introduces a novel contamination vector: these agents decouple information gathering from transaction execution across isolated sessions, eroding the platform's pricing power.
-We present PHANTOM, a modular compound system that addresses this threat end-to-end. The system is composed of five orchestrated components: (1)~a configurable e-commerce research platform with dual-stream Kafka ingestion for behavioral and price-exposure events, (2)~a GOFAI-based weak labeling stage that partitions sessions into human and agent classes using rule-based predicates, (3)~a transition-kernel estimator that learns separable Markov models for each actor type and constructs a Contamination Generator for controlled simulation, (4)~a Distributionally Robust Reinforcement Learning policy that optimizes pricing under a Wasserstein ambiguity set conditioned on per-session divergence signals, and (5)~an Airflow-orchestrated pipeline that connects online data collection to offline policy training via Redis-backed model serving.
+We present PHANTOM, a modular compound system that addresses this threat end-to-end. The system is composed of five orchestrated components: (1)~a configurable e-commerce research platform with dual-stream Kafka ingestion for behavioral and price-exposure events, (2)~a GOFAI-based weak labeling stage that partitions sessions into human and agent classes using rule-based predicates, (3)~a transition-kernel estimator that learns distinguishable Markov models for each actor type and constructs a Contamination Generator for controlled simulation, (4)~a Distributionally Robust Reinforcement Learning policy that optimizes pricing under a Wasserstein ambiguity set conditioned on per-session divergence signals, and (5)~an Airflow-orchestrated pipeline that connects online data collection to offline policy training via Redis-backed model serving.
 We formally derive the Cost of Information Theorem, proving that standard pricing mechanisms become incentive-incompatible as agent query volume grows. The system architecture, interaction schema, and factorial experiment harness are designed for reproducibility and are released as open artifacts. We evaluate system-level tradeoffs between revenue protection, information leakage, and user-experience degradation through a three-objective reward structure.
 \end{abstract}
@@ -58,7 +58,7 @@ The current innovation boom in generative artificial intelligence and its applic
 The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on. Dynamic pricing algorithms rely on directly translating demand features $q$ to new price assignments $\hat{p}$ across a catalogue of products of size $N$. When agent-driven reconnaissance traffic contaminates these demand signals, the pricing pipeline produces biased estimates that erode margins. This is not a single-model failure but a \textit{compound system} failure: the data ingestion, demand estimation, policy optimization, and model serving stages each propagate and amplify the contamination.
-Existing work treats bot detection and dynamic pricing as separate concerns. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for separating non-human reconnaissance from genuine human demand expression and integrating that separation into pricing heuristics without degrading legitimate user experience. This gap is what our contribution aims to address.
+Existing work treats bot detection and dynamic pricing as separate concerns. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for distinguishing non-human reconnaissance from genuine human demand expression and integrating that distinguishability into pricing heuristics without degrading legitimate user experience. This gap is what our contribution aims to address.
 \subsection{System-Level Contributions}
@@ -78,7 +78,7 @@ We frame our contribution along the four CAIS pillars---architectural patterns,
 This work addresses three core research questions:
 \begin{enumerate}
-    \item[\textbf{RQ1}] \textit{Separability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting?
+    \item[\textbf{RQ1}] \textit{Distinguishability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting?
    \item[\textbf{RQ2}] \textit{Theoretical Impact}: What is the formal relationship between agent contamination levels and the erosion of pricing power in dynamic pricing systems?
    \item[\textbf{RQ3}] \textit{Robust Mitigation}: How can pricing policies be constructed to maintain margin integrity under unknown and non-stationary levels of agent contamination?
 \end{enumerate}
@@ -115,7 +115,7 @@ Each price query generates a record $(i, p, \text{sid}, \phi, t)$ associating th
 \subsection{Offline Loop: Policy Training}
-The Kafka cluster is subscribed to by our pipeline which is configured on a schedule in Airflow, with the possibility of manual trigger. The offline loop consumes collected trajectories, performs weak labeling and transition-kernel estimation (Section~\ref{sec:separability}), trains the DR-RL policy (Section~\ref{sec:drrl}) in a simulator, and pushes the resulting policy to Redis for the pricing provider to read.
+Our pipeline subscribes to the Kafka cluster and runs on a schedule configured in Airflow, with the possibility of a manual trigger. The offline loop consumes collected trajectories, performs weak labeling and transition-kernel estimation (Section~\ref{sec:distinguishability}), trains the DR-RL policy (Section~\ref{sec:drrl}) in a simulator, and pushes the resulting policy to Redis for the pricing provider to read.
 \subsection{Online Dynamic Pricing (Baseline)}
@@ -165,7 +165,7 @@ The metadata record $\mu$ varies by action type. This heterogeneous structure is
 %% ====================================================================
 \section{Methodology: Pipeline Components}
-This section details the theoretical and practical framework behind each pipeline component. We formalize the problem environment, derive the \textit{Cost of Information} (COI) theorem that motivates the system design, describe the separability and contamination modules, and formulate the robust pricing policy.
+This section details the theoretical and practical framework behind each pipeline component. We formalize the problem environment, derive the \textit{Cost of Information} (COI) theorem that motivates the system design, describe the distinguishability and contamination modules, and formulate the robust pricing policy.
 \subsection{Problem Formalization}
@@ -225,15 +225,15 @@ Since the integrand vanishes as $N \to \infty$ for all $t > \underline{p}$, the
 This result is the theoretical motivation for the system design: it proves that standard pricing policies $\pi$ fail to extract surplus in the presence of large-scale agentic search, necessitating a contamination-aware component in the pipeline.
-\subsection{Module: Separability and Contamination Generation}
+\subsection{Module: Distinguishability and Contamination Generation}
-\label{sec:separability}
+\label{sec:distinguishability}
 To train a robust pricing learner, we need a simulator that can generate realistic interaction data under controlled contamination. We build this from collected data using a two-stage approach.
 \subsubsection{GOFAI-Based Weak Labeling.}
-We use Good Old-Fashioned AI (GOFAI) heuristics to generate weak labels for separability. A set of rule-based predicates $\phi_j: \tau \to \{0,1\}$ partitions dataset $\mathcal{D}$ into high-confidence sets $\mathcal{D}_H$ and $\mathcal{D}_A$. We then estimate separate transition models for both groups and ask a direct methodological question: are the kernels separable enough to justify downstream pricing control that depends on that separability?
+We use Good Old-Fashioned AI (GOFAI) heuristics to generate weak labels for distinguishability. A set of rule-based predicates $\phi_j: \tau \to \{0,1\}$ partitions dataset $\mathcal{D}$ into high-confidence sets $\mathcal{D}_H$ and $\mathcal{D}_A$. We then estimate separate transition models for both groups and ask a direct methodological question: are the kernels distinguishable enough to justify downstream pricing control that depends on that distinguishability?
-To answer this, we compute average KL divergence between transition probability matrices. This statistic gives global separability and event-level diagnostics at the same time. In our balanced dataset (50\% human, 50\% agent), the average divergence is approximately $1.8$.
+To answer this, we compute average KL divergence between transition probability matrices. This statistic gives global distinguishability and event-level diagnostics at the same time. In our recorded dataset (13 human sessions, 16 agent sessions; 45\%/55\%), the average divergence is approximately $1.8$.
 \begin{definition}[KL Divergence for Transition Distributions]
 Let $P_e$ and $Q_e$ be categorical distributions over destination states following event $e$, derived from human and agent trajectories respectively. The KL divergence between these distributions is:
@@ -243,7 +243,7 @@ Let $P_e$ and $Q_e$ be categorical distributions over destination states followi
 where $\mathcal{S}_e$ denotes the set of destination events that follow $e$ in the human trajectories.
 \end{definition}
-With these divergence features we train a contrastive model to estimate a weak agent probability $f(\tau)\in[0,1]$, which serves as the interface between the separability module and the downstream pricing policy.
+With these divergence features we train a contrastive model to estimate a weak agent probability $f(\tau)\in[0,1]$, which serves as the interface between the distinguishability module and the downstream pricing policy.
 \subsubsection{Transition-Kernel Estimation and Contamination Generator.}
 \label{sec:tpe}
@@ -282,12 +282,12 @@ Given a newly observed partial trajectory $\tau'$, we compute its empirical tran
  \Delta_A(\tau') &= D_{KL}(\hat{\mathcal{T}}^\prime \parallel \bar{\mathcal{T}}_A)
 \end{align}
-These divergence statistics serve as the operational connector between the separability module and the pricing policy. We define the per-session contamination estimate as:
+These divergence statistics serve as the operational connector between the distinguishability module and the pricing policy. We define the per-session contamination estimate as:
 \begin{equation}
 \label{eq:alpha_hat}
    \hat{\alpha}(\tau') = \sigma\big(\beta(\Delta_H(\tau') - \Delta_A(\tau'))\big)
 \end{equation}
-where $\sigma$ is the logistic function and $\beta > 0$ is a temperature parameter. This maps separability directly into a scalar control input for the pricing objective.
+where $\sigma$ is the logistic function and $\beta > 0$ is a temperature parameter. This maps distinguishability directly into a scalar control input for the pricing objective.
 \subsubsection{Ambiguity Set Construction.}
 Because the contamination level $\alpha$ and demand shift are non-stationary, a point estimate of the demand distribution is insufficient. Let $\hat{P}_N$ denote the empirical reference distribution induced by the Contamination Generator $\mathcal{G}(\alpha)$. We define the Wasserstein ambiguity set:
@@ -344,7 +344,7 @@ The simulator has multiple configurable factors, including valuation distributio
 Our training budget spans 384 TPU chips across v4, v5e, and v6e generations, distributed across Europe and U.S. regions with a spot-heavy mix and an on-demand reserve. At peak BF16 throughput this corresponds to roughly 160 PFLOPS of aggregate compute. We allocate v6e capacity to the heaviest policy training, use v5e for broad hyperparameter sweeps, and reserve on-demand v4 quota for runs that should not be preempted \parencite{noauthor_tpu_2026,noauthor_tpu_2025-1,noauthor_tpu_2025}.
-Our process follows three stages: (1)~observe and \textit{vectorize} behavioral interactions, (2)~learn separability to characterize human versus agent patterns, and (3)~use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator.
+Our process follows three stages: (1)~observe and \textit{vectorize} behavioral interactions, (2)~learn distinguishability to characterize human versus agent patterns, and (3)~use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator.
 Operationally, goals and experiment runs are tracked in PostgreSQL (goal table, run table, and assignment mapping). This data-acquisition phase is intentionally a disconnected component that feeds the later contributions.
@@ -375,7 +375,7 @@ Initialize contamination estimate $\hat\alpha \leftarrow 0.2$\;
    $\mathcal S_t \leftarrow \mathcal S_t \cup \{\tau_m\}$\;
  }
-  \tcp{Estimate contamination from separability module}
+  \tcp{Estimate contamination from distinguishability module}
  compute $\hat\alpha \leftarrow \frac{1}{M}\sum_{\tau\in\mathcal S_t} \Big[\sigma\big(\beta(\Delta_H(\tau)-\Delta_A(\tau))\big)\Big]$\;
  compute $J_t \leftarrow \text{Revenue}(p_t,\hat Q_t) - \lambda\cdot \text{COILeak}(\hat\alpha) - \eta\cdot \text{UX}(\hat\alpha)$\;
@@ -430,7 +430,7 @@ We formally defined the Cost of Information and proved that as the saturation of
 The system architecture, interaction schema, configurable e-commerce testbed, and factorial experiment harness are designed for reproducibility and released as open artifacts. This is a very generic end-to-end mechanism which is applicable to a variety of different e-commerce tasks. We intentionally put emphasis on the development of this infrastructure to establish a reproducible framework for interaction and to minimize any noise.
-Future work includes full factorial evaluation of the DR-RL policy across contamination levels, online adaptation of the ambiguity radius $\epsilon$ as a function of live divergence estimates, extension to multi-agent market maker settings, and integration of the HAP protocol~\cite{dhir_http_2025} as an additional signal source for the separability module.
+Future work includes full factorial evaluation of the DR-RL policy across contamination levels, online adaptation of the ambiguity radius $\epsilon$ as a function of live divergence estimates, extension to multi-agent market maker settings, and integration of the HAP protocol~\cite{dhir_http_2025} as an additional signal source for the distinguishability module.
 %% ====================================================================
--- a/paper/src/mirrors/genpop/01-intro.tex
+++ b/paper/src/mirrors/genpop/01-intro.tex
@@ -2,9 +2,9 @@
 \section{Introduction}
-In this paper we present an exploration and defense against the presence of new commercial entities in digitally powered platforms, preserving market equilibrium in the age of AI. This research establishes the following contributions: definition and formalization of non-human transactors in e-commerce platforms, development of a testing-ground for capturing the behavioral essence of these transactors across a large variety of digital systems, construction of a discriminative model (to prove separability) as a strong learner for downstream mitigation of contamination by non-human entities, translation of such learned separability into existing dynamic pricing machine learning loops, and finally establishment of a high-level KPI-affecting causal effect and cost-saving framework for the future of internet commerce in the presence of such non-human learners.
+In this paper we present an exploration and defense against the presence of new commercial entities in digitally powered platforms, preserving market equilibrium in the age of AI. This research establishes the following contributions: definition and formalization of non-human transactors in e-commerce platforms, development of a testing-ground for capturing the behavioral essence of these transactors across a large variety of digital systems, construction of a discriminative model (to prove distinguishability) as a strong learner for downstream mitigation of contamination by non-human entities, translation of such learned distinguishability into existing dynamic pricing machine learning loops, and finally establishment of a high-level KPI-affecting causal effect and cost-saving framework for the future of internet commerce in the presence of such non-human learners.
-This research effort touches a large variety of domains, spanning behavioral economics for understanding the rationality of behavior as theorized by the concept of homo economicus, agent-based modeling to translate our learned separability into disjoint dynamic pricing systems, reinforcement learning which serves as the SOTA for price-learners, and dynamic pricing and market equilibrium theory to understand the risks of possible supra-competitive pricing phenomena in cases of adversarial pricing systems driving the market out of equilibrium. \footnote{Given the rapid evolution of the field we acknowledge all developments with a cutoff set at the date of March 1st 2026.}
+This research effort touches a large variety of domains, spanning behavioral economics for understanding the rationality of behavior as theorized by the concept of homo economicus, agent-based modeling to translate our learned distinguishability into disjoint dynamic pricing systems, reinforcement learning which serves as the SOTA for price-learners, and dynamic pricing and market equilibrium theory to understand the risks of possible supra-competitive pricing phenomena in cases of adversarial pricing systems driving the market out of equilibrium. \footnote{Given the rapid evolution of the field we acknowledge all developments with a cutoff set at the date of March 1st 2026.}
 \subsection{Motivation and Market Context}
@@ -25,7 +25,7 @@ We formally define interaction data as coming from some actor which can either b
 This dissertation is organized around one main research question and three supporting sub-questions:
 \begin{enumerate}
    \item[\textbf{Main RQ}] How can dynamic pricing systems preserve margin integrity when transaction orchestration is increasingly mediated by non-human agents?
-    \item[\textbf{SQ1}] \textit{Separability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting?
+    \item[\textbf{SQ1}] \textit{Distinguishability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting?
    \item[\textbf{SQ2}] \textit{Theoretical Impact}: What is the formal relationship between agent contamination levels and the erosion of pricing power in dynamic pricing systems?
    \item[\textbf{SQ3}] \textit{Robust Mitigation}: How can pricing policies be constructed to maintain margin integrity under unknown and non-stationary levels of agent contamination?
 \end{enumerate}
@@ -59,4 +59,4 @@ Extract final result from terminal state\;
 \end{algorithm}
-The previously described goal of separability allows us to formulate a task which entails taking raw interaction data for either actor and creating a composite demand estimate. We propose a robust optimization objective defined in our methodology, transforming the pricing problem into a form of Distributionally Robust Optimization \parencite{kuhn_distributionally_2025} where the learner must guard against adversarial contamination in observed demand distributors. In this setting we must learn to make decision that perform under the assumption of not having a single estimated probability distribution but under an ambiguity set of any distribution, of which we have limited information. In our case as stated is a mixture of distributions with a parameter which is unknown and non-stationary.
+The previously described goal of distinguishability allows us to formulate a task which entails taking raw interaction data for either actor and creating a composite demand estimate. We propose a robust optimization objective defined in our methodology, transforming the pricing problem into a form of Distributionally Robust Optimization \parencite{kuhn_distributionally_2025} where the learner must guard against adversarial contamination in observed demand distributions. In this setting we must learn to make decisions that perform well not under a single estimated probability distribution but under an ambiguity set of distributions about which we have limited information. In our case, as stated, this is a mixture of distributions with a parameter that is unknown and non-stationary.
--- a/paper/src/mirrors/genpop/02-literature-review.tex
+++ b/paper/src/mirrors/genpop/02-literature-review.tex
@@ -1,6 +1,6 @@
 \section{Literature Review}
-To better understand all wedges of the current works, we must start by exploring the nature of agents, agentic computer use and web automation, complementing that with economic reasoning and strategic interaction. The final surface to cover, leads us to data-driven dynamic pricing under uncertainty. The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on. This latter case of agents shopping is currently pending legal action in the case of \textcite{noauthor_amazoncom_2026} which is currently being treated as a violation of the Computer Fraud and Abuse Act. The introduction of these mediating actor entities into economic systems, is further creating a threat of false-name bidding \parencite{yokoo_effect_2004}, which prior research has explored in a trading context. Other research on pseudonyms in dynamic systems, demonstrate whitewashing in AI agents which can ignore defensive mechanisms by re-entry with different identities \parencite{feldman_free-riding_2004}. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for separating non-human reconnaissance from genuine human demand expression and integrating that separation into pricing heuristics without degrading legitimate user experience (in our research tracked by the user-experience index). This gap, is what our contribution aims to address, particularly for the aforementioned stakeholder groups.
+To better understand all wedges of the current works, we must start by exploring the nature of agents, agentic computer use and web automation, complementing that with economic reasoning and strategic interaction. The final surface to cover leads us to data-driven dynamic pricing under uncertainty. The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on. This latter case of agents shopping is currently pending legal action in the case of \textcite{noauthor_amazoncom_2026} which is currently being treated as a violation of the Computer Fraud and Abuse Act. The introduction of these mediating actor entities into economic systems is further creating a threat of false-name bidding \parencite{yokoo_effect_2004}, which prior research has explored in a trading context. Other research on pseudonyms in dynamic systems demonstrates whitewashing in AI agents which can ignore defensive mechanisms by re-entry with different identities \parencite{feldman_free-riding_2004}. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for distinguishing non-human reconnaissance from genuine human demand expression and integrating that distinguishability into pricing heuristics without degrading legitimate user experience (in our research tracked by the user-experience index). This gap is what our contribution aims to address, particularly for the aforementioned stakeholder groups.
 \subsection{Agent Taxonomy and Definitions}
--- a/paper/src/mirrors/genpop/03-methodology.tex
+++ b/paper/src/mirrors/genpop/03-methodology.tex
@@ -1,6 +1,6 @@
 \section{Methodology}
-This section details the theoretical and practical framework developed to address dynamic pricing under the influence of non-human actors. We begin by formalizing the problem environment and the nature of the actors. We then derive the \textit{Cost of Information} (COI) theorem, proving the erosion of pricing power in the limit of agent saturation. Following this, we outline our generative contamination strategy using GOFAI-driven separability and transition probability learning. Finally, we formulate the robust control problem as a Stackelberg game solved via Distributionally Robust Reinforcement Learning (DR-RL) with constructed ambiguity sets.
+This section details the theoretical and practical framework developed to address dynamic pricing under the influence of non-human actors. We begin by formalizing the problem environment and the nature of the actors. We then derive the \textit{Cost of Information} (COI) theorem, proving the erosion of pricing power in the limit of agent saturation. Following this, we outline our generative contamination strategy using GOFAI-driven distinguishability and transition probability learning. Finally, we formulate the robust control problem as a Stackelberg game solved via Distributionally Robust Reinforcement Learning (DR-RL) with constructed ambiguity sets.
 \subsection{Problem Formalization}
@@ -109,13 +109,13 @@ Since users act with motivations, we define a pool of tasks (jobs to be done) an
 A representative task is to find the cheapest feasible catalog item under explicit constraints while removing strict financial limits so we avoid trivial optimization behavior. Participants are also randomly assigned to one experimental platform mode (hotel or airline). Once assigned, they are dropped into the experiment with an actor ID. Under each experiment ID, we can observe multiple sessions across time and gather long interaction traces for the same actor.
-The human data collection involved 18 participants, all of whom provided explicit informed consent prior to their session. Participants had an average age of 21 years and were recruited from a university population. Alongside the 18 human sessions we ran 18 agent sessions of equivalent task scope, giving a balanced dataset of 36 labeled trajectories. Each participant was assigned a single platform mode and a single task drawn from the pool, and completed the session independently without guidance on navigation or pricing strategy.
+The human data collection involved 13 participants, all of whom provided explicit informed consent prior to their session. Participants had an average age of 21 years and were recruited from a university population. Alongside the 13 human sessions we ran 16 agent sessions of equivalent task scope, yielding 29 labeled trajectories in total (45\% human, 55\% agent). Each participant was assigned a single platform mode and a single task drawn from the pool, and completed the session independently without guidance on navigation or pricing strategy.
 To evaluate quality and realism of the setup, we store both structured event logs and full interaction transcripts. This lets us combine quantitative analysis with transcript-level qualitative findings. The result is an isolated system where we can control the interaction process while preserving realistic behavior.
-Operationally, goals and experiment runs are tracked in PostgreSQL. This data-acquisition phase is the first half of the methodology and is intentionally a disconnected component that feeds the later contributions. The second half uses collected behavioral traces to separate classes (agent vs human) with session-conditioned probability estimates, then injects those estimates into the pricing learner.
+Operationally, goals and experiment runs are tracked in PostgreSQL. This data-acquisition phase is the first half of the methodology and is intentionally a disconnected component that feeds the later contributions. The second half uses collected behavioral traces to distinguish classes (agent vs human) with session-conditioned probability estimates, then injects those estimates into the pricing learner.
-Our process follows three stages: (1) observe and vectorize behavioral interactions, (2) learn separability to characterize human versus agent patterns, and (3) use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator.
+Our process follows three stages: (1) observe and vectorize behavioral interactions, (2) learn distinguishability to characterize human versus agent patterns, and (3) use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator.
 \begin{figure}[ht]
  \resizebox{\columnwidth}{!}{%
@@ -209,15 +209,15 @@ In the simulator baseline this order is encoded with a compact fixed scale: cart
 In addition to behavioral events, the platform logs price observations to a separate Kafka topic. Each price query generates a record associating the product, displayed price, requesting session, platform mode, and timestamp. This dual-stream architecture enables joint analysis of price exposure and behavioral response.
-\subsection{Generative Contamination and Separability}
+\subsection{Generative Contamination and Distinguishability}
 To train a robust pricing learner, we need a simulator that can generate realistic interaction data under controlled contamination. We build this from Phantom data using a two-stage approach.
-\subsubsection{Ground-Truth Separability}
+\subsubsection{Ground-Truth Distinguishability}
-Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels (human or agent) are available as ground truth rather than as the output of a heuristic classifier. We therefore estimate separate transition kernels directly from each labeled partition, treating the resulting human and agent kernels as the ground-truth behavioral profiles for each class. We then ask a direct methodological question: are the kernels separable enough to justify downstream pricing control that depends on that separability?
+Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels (human or agent) are available as ground truth rather than as the output of a heuristic classifier. We therefore estimate separate transition kernels directly from each labeled partition, treating the resulting human and agent kernels as the ground-truth behavioral profiles for each class. We then ask a direct methodological question: are the kernels distinguishable enough to justify downstream pricing control that depends on that distinguishability?
-To answer this, we compute per-session divergence scores against both class-level centroids. For each session in either partition, we fit a session-level event transition kernel from that session's trajectory alone, then compute its average divergence to the human centroid and to the agent centroid. The per-session separability score is the gap between these two divergences: a negative value indicates proximity to human behavior, a positive value indicates proximity to agent behavior.
+To answer this, we compute per-session divergence scores against both class-level centroids. For each session in either partition, we fit a session-level event transition kernel from that session's trajectory alone, then compute its average divergence to the human centroid and to the agent centroid. The per-session distinguishability score is the gap between these two divergences: a negative value indicates proximity to human behavior, a positive value indicates proximity to agent behavior.
 We cannot assume normal distributions for divergence scores, which are right-skewed and bounded below by zero, so we do not use a Student's t-test. Instead we apply a Mann-Whitney U test \parencite{mann_test_1947} on the per-session gap scores between the two groups. The Mann-Whitney test is a rank-based nonparametric test that compares the ordering of two independent samples without distributional assumptions, making it appropriate for small samples drawn from skewed populations.
@@ -305,7 +305,7 @@ We also consider taxation-like overlays for agent traffic under strategy-proof m
 \subsubsection{Pricing Mechanism Summary}
-We now present the complete pricing mechanism that integrates the behavioral separability, contamination estimation, and robust optimization components developed in the preceding sections. The defensive pricing loop algorithm formalizes the process as a Stackelberg game where the platform (leader) sets prices and the aggregate demand (follower) responds through observed session trajectories.
+We now present the complete pricing mechanism that integrates the behavioral distinguishability, contamination estimation, and robust optimization components developed in the preceding sections. The defensive pricing loop algorithm formalizes the process as a Stackelberg game where the platform (leader) sets prices and the aggregate demand (follower) responds through observed session trajectories.
 \begin{algorithm}[t]
 \caption{PHANTOM defensive pricing loop}
--- a/paper/src/mirrors/genpop/04-results.tex
+++ b/paper/src/mirrors/genpop/04-results.tex
@@ -1,14 +1,14 @@
 \section{Results}
 \begin{figure}[ht]
    \centering
-    \input{chapters/figures/supra.tex}
+    \input{chapters/figures/supra/supra.tex}
    \caption{Evolution of price distributions over experiment steps. The heatmap illustrates the density of price offerings. This is an early baseline simulation which demonstrates supra-competitive price-setting in deep learning agents such as SAC as can be clearly seen by the high density at the highest available price.}
    \label{fig:supra_heatmap}
 \end{figure}
 \subsection{Behavioral Analysis}
-Separability between human and agent sessions is evaluated by computing per-session divergence gap scores (how much closer each session is to the human baseline versus the agent baseline) and comparing the two groups with a Mann-Whitney U test. The table below reports the group-level descriptive statistics for the gap scores and the test result.
+Distinguishability between human and agent sessions is evaluated by computing per-session divergence gap scores (how much closer each session is to the human baseline versus the agent baseline) and comparing the two groups with a Mann-Whitney U test. The full recorded cohort contains 13 human sessions and 16 agent sessions, and the table below reports the corresponding group-level statistics and test result.
 \begin{table}[ht]
 \centering
@@ -18,19 +18,19 @@ Separability between human and agent sessions is evaluated by computing per-sess
 \toprule
 Group & n & Mean gap & Std \\
 \midrule
-Human sessions & 11 & $-3.3522$ & $2.6748$ \\
+Human sessions & 13 & $-3.35$ & $2.67$ \\
-Agent sessions & 6 & $+1.6482$ & $2.8349$ \\
+Agent sessions & 16 & $+1.65$ & $2.83$ \\
 \midrule
-\multicolumn{4}{l}{Mann-Whitney $U = 2.0$, $p = 0.0006$ (two-sided)} \\
+\multicolumn{4}{l}{Mann-Whitney two-sided test: $p<0.001$} \\
 \bottomrule
 \end{tabular}
 \end{table}
-The sign structure is consistent with the theoretical expectation: human sessions produce negative gap scores (closer to the human centroid, far from the agent centroid) while agent sessions produce positive gap scores (closer to the agent centroid). The two-sided p-value of 0.0006 (which means there is only a 0.06\% chance this pattern occurred by random luck) indicates near-complete rank separation between the groups at n=11 humans and n=6 agents, providing strong evidence that the transition kernels are separable enough to justify their use as a control signal in downstream pricing.
+The sign structure is consistent with the theoretical expectation: human sessions produce negative gap scores (closer to the human centroid, far from the agent centroid) while agent sessions produce positive gap scores (closer to the agent centroid). The two-sided test result ($p<0.001$) at $n=13$ humans and $n=16$ agents indicates strong rank distinction between groups, providing evidence that the transition kernels are distinguishable enough to justify their use as a control signal in downstream pricing.
 \subsection{Experimental Outcomes}
-To evaluate robustness contributions, we compare two policies on the same environment family: (i) robust pricing with COI-aware reward and adversarial contamination step, and (ii) non-robust baseline with revenue-only reward (no-robust flag).
+To evaluate robustness contributions, we compare two policies on the same environment family: (i) robust pricing with COI-aware reward and adversarial contamination step, and (ii) a baseline policy with revenue-only reward.
 \begin{table}[ht]
 \centering
@@ -41,7 +41,7 @@ To evaluate robustness contributions, we compare two policies on the same enviro
 Policy & Eval reward & Eval revenue & COI leakage & Margin collapse rate \\
 \midrule
 Robust policy & \textit{TBD} & \textit{TBD} & \textit{TBD} & \textit{TBD} \\
-Non-robust baseline (\texttt{--no-robust}) & \textit{TBD} & \textit{TBD} & \textit{TBD} & \textit{TBD} \\
+Baseline policy & \textit{TBD} & \textit{TBD} & \textit{TBD} & \textit{TBD} \\
 \bottomrule
 \end{tabular}
 \end{table}
@@ -50,6 +50,6 @@ This comparison isolates the effect of robustness terms from model capacity and
 \subsection{Interpretation and Insights}
-The Mann-Whitney result (U=2.0, p less than 0.001) confirms that per-session divergence gaps separate the two actor classes with near-zero overlap in rank ordering. This is the condition required for separability to act as a useful control signal in the pricing loop rather than just an auxiliary classifier score.
+The Mann-Whitney result ($p < 0.001$) confirms that per-session divergence gaps distinguish the two actor classes with near-zero overlap in rank ordering. This is the condition required for distinguishability to act as a useful control signal in the pricing loop rather than just an auxiliary classifier score.
 \subsection{Anomalies}
--- a/Show More
+++ b/Show More
		`@@ -0,0 +1,3 @@`
							`from .robust import select_adversarial_alpha_jax, _JAX_OK`

							`__all__ = ["select_adversarial_alpha_jax", "_JAX_OK"]`
		`@@ -0,0 +1 @@`
							`\includegraphics[width=0.98\linewidth]{chapters/figures/results/generated/final/plots/final_focus_revenue_by_alpha.pdf}`
		`@@ -0,0 +1 @@`
							`\includegraphics[width=0.95\linewidth]{chapters/figures/results/generated/final/plots/final_focus_revenue_delta.pdf}`
		`@@ -0,0 +1 @@`
							`\includegraphics[width=0.99\linewidth]{chapters/figures/results/generated/legacy/plots/first_sweep_tier_revenue.pdf}`
		`@@ -0,0 +1 @@`
							`\includegraphics[width=0.88\linewidth]{chapters/figures/results/generated/legacy/plots/ppo_tradeoff_scatter.pdf}`