Merge pull request #54 from velocitatem/baseline-comparisons

Baseline comparisons
2026-07-16 01:53:37 +00:00 · 2026-03-10 15:16:30 +01:00
parent 8f20359c8c 43bcad2a98
commit ae6cffe825
101 changed files with 7339 additions and 3806 deletions
--- a/.env.sweep.example
+++ b/.env.sweep.example
@@ -3,7 +3,7 @@
 # Required for wandb runs and sweep agent workers.
 WANDB_API_KEY=
 WANDB_ENTITY=
-WANDB_PROJECT=phantom-pricing
+WANDB_PROJECT=capstone
 # Required for private repo bootstrap workers.
 GITHUB_TOKEN=
@@ -16,3 +16,9 @@ GITHUB_TOKEN=
 # AGENT_COUNT=0
 # AGENT_LOOP=1
 # RETRY_SECONDS=20
 # Optional local benchmark defaults.
 # LOCAL_BENCHMARK_ARGS=--tiers static,surge,linear,qtable,ppo --alpha-values 0.0,0.3 --episodes 3 --total-timesteps 3000 --max-steps 40 --device cpu
 # SIMPLE_BENCHMARK_ARGS=--tiers qtable,ppo,dqn,a2c --alpha-values 0.0,0.15,0.3,0.45,0.6 --episodes 8 --total-timesteps 8000 --max-steps 40 --device cpu
 # PHANTOM_BENCHMARK_COMPARE_ROBUST=1
 # BENCHMARK_AGENT_ARGS=--tiers static,surge,linear,qtable,ppo --alpha-values 0.0,0.3,0.6 --episodes 5
--- a/.github/workflows/latex.yml
+++ b/.github/workflows/latex.yml
@@ -12,32 +12,92 @@ on:
 jobs:
  build:
    runs-on: ubuntu-latest
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
      R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
      R2_ENDPOINT: ${{ secrets.R2_ENDPOINT }}
      R2_BUCKET_NAME: ${{ secrets.R2_BUCKET_NAME }}
    steps:
      - uses: actions/checkout@v4
-      - name: Compile LaTeX document
+
      - name: Prepare appendix code snapshot
        run: bash paper/concat_code.sh
      - name: Generate mirrors with Codex
        if: ${{ env.OPENAI_API_KEY != '' }}
        uses: openai/codex-action@v1
        with:
          openai-api-key: ${{ env.OPENAI_API_KEY }}
          sandbox: workspace-write
          safety-strategy: drop-sudo
          working-directory: .
          prompt: |
            Read and follow the mirror instructions in `paper/src/mirrors/genpop/INSTRUCTIONS.md`.
            Source chapters are in `paper/src/chapters/`:
            - 01-intro.tex
            - 02-literature-review.tex
            - 03-methodology.tex
            - 04-results.tex
            - 05-discussion.tex
            - 06-conclusion.tex
            Update `paper/src/mirrors/genpop/*.tex` so they mirror the thesis for a general audience according to the instruction file.
            Keep LaTeX valid and preserve citation commands and section order.
            Then create or update `paper/src/main-mirror-genpop.tex` by using `paper/src/main.tex` as the base and replacing chapter inputs from `chapters/...` to `mirrors/genpop/...`.
            Do not change any other project files.
      # Build the list of LaTeX root documents once (main.tex plus every
      # main-mirror-*.tex present in paper/src), then publish it as the
      # multi-line `root_files` step output and echo the same list to the log.
      - name: Compute LaTeX roots
        id: roots
        run: |
          roots=(main.tex)
          for file in paper/src/main-mirror-*.tex; do
            # An unmatched glob stays literal, so keep only real files.
            if [ -f "$file" ]; then
              roots+=("$(basename "$file")")
            fi
          done
          {
            echo "root_files<<EOF"
            printf '%s\n' "${roots[@]}"
            echo "EOF"
          } >> "$GITHUB_OUTPUT"
          echo "Compiling roots:"
          printf '%s\n' "${roots[@]}"
      - name: Compile LaTeX documents
        uses: xu-cheng/latex-action@v3
        with:
-          root_file: main.tex
+          root_file: ${{ steps.roots.outputs.root_files }}
          working_directory: paper/src
-          args: -pdf -f -interaction=nonstopmode -file-line-error -outdir=../build
+          args: -pdf -f -interaction=nonstopmode -file-line-error -r ../.latexmkrc -outdir=../build
-          pre_compile: bash ../concat_code.sh
+
-      - name: Upload PDF
+      - name: Upload PDF artifacts
        uses: actions/upload-artifact@v4
        with:
          name: thesis-pdf
-          path: paper/build/main.pdf
+          path: |
            paper/build/main.pdf
            paper/build/main-mirror-*.pdf
      # Expose today's date (YYYY-MM-DD) as the `date` step output.
      - name: Get current date
        id: date
        # Quote the output-file path so the redirect is safe from word splitting.
        run: echo "date=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"
      - name: Upload to Cloudflare R2
        if: ${{ env.R2_ACCESS_KEY_ID != '' && env.R2_SECRET_ACCESS_KEY != '' && env.R2_ENDPOINT != '' && env.R2_BUCKET_NAME != '' }}
        env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
+          AWS_ACCESS_KEY_ID: ${{ env.R2_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
+          AWS_SECRET_ACCESS_KEY: ${{ env.R2_SECRET_ACCESS_KEY }}
-          AWS_ENDPOINT_URL: ${{ secrets.R2_ENDPOINT }}
+          AWS_ENDPOINT_URL: ${{ env.R2_ENDPOINT }}
          DATE: ${{ steps.date.outputs.date }}
-          BUCKET_NAME: ${{ secrets.R2_BUCKET_NAME }}
+          BUCKET_NAME: ${{ env.R2_BUCKET_NAME }}
        run: |
          pip install boto3
          python3 << 'EOF'
@@ -71,4 +131,49 @@ jobs:
              ExtraArgs={'ContentType': 'application/pdf'}
          )
          print(f"Uploaded thesis-latest.pdf")
          # Upload every generated mirror PDF (main-mirror-<name>.pdf) under four
          # object keys: flat dated/latest names, then dated/latest names under
          # the mirrors/<name>/ prefix.
          build_dir = 'paper/build'
          prefix, suffix = 'main-mirror-', '.pdf'
          for filename in os.listdir(build_dir):
              if not (filename.startswith(prefix) and filename.endswith(suffix)):
                  continue
              mirror_name = filename[len(prefix):-len(suffix)]
              source_path = os.path.join(build_dir, filename)
              # Key order matches the upload order of the original script.
              object_keys = (
                  f"thesis-{mirror_name}-{date}.pdf",
                  f"thesis-{mirror_name}-latest.pdf",
                  f"mirrors/{mirror_name}/thesis-{date}.pdf",
                  f"mirrors/{mirror_name}/thesis-latest.pdf",
              )
              for object_key in object_keys:
                  s3.upload_file(
                      source_path,
                      bucket,
                      object_key,
                      ExtraArgs={'ContentType': 'application/pdf'}
                  )
                  print(f"Uploaded {object_key}")
          EOF
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,9 @@ phantom.egg-info/
 .nextstep
 .ignore-gitlogue
 .cloudflare
 .nx/
 node_modules/
 dist/
 # generated svg/graphics
 **/session_*.svg
@@ -36,10 +39,11 @@ paper/src/auto/*
 paper/src/bib/auto
 paper/template/*
 paper/build-cais/
 paper/defense/manim/media/
 paper/defense/manim/.manim/
 paper/src/main.pdf
 paper/src/main-blx.bib
 paper/src/svg-inkscape/
 paper/src/mirrors/
 paper/variations/
 paper/src/graphics/test_*.png
 thesis-latest.pdf
@@ -66,6 +70,7 @@ sim/case/thesis_simplified/runs*/
 # model binaries
 engine/models/*.zip
 engine/studies/results/*
 *.zip
 # wandb local state
--- a/Makefile
+++ b/Makefile
@@ -8,13 +8,17 @@ VENV      := .venv
 PYTHON    := $(VENV)/bin/python
 PIP       := $(VENV)/bin/pip
 PYTEST    := $(VENV)/bin/pytest
 NX        := npx nx
 SWEEP_ENV_FILE ?= .env.sweep
 WANDB_ENTITY ?=
-WANDB_PROJECT ?= phantom-pricing
+WANDB_PROJECT ?= capstone
 SWEEP_ID ?=
 LOCAL_TRAIN_ARGS ?= --algo ppo --total-timesteps 50000
 LOCAL_BENCHMARK_ARGS ?= --tiers static,surge,linear,qtable,ppo --alpha-values 0.0,0.3 --episodes 3 --total-timesteps 3000 --max-steps 40 --device cpu
 SIMPLE_BENCHMARK_ARGS ?= --tiers qtable,ppo,dqn,a2c --alpha-values 0.0,0.15,0.3,0.45,0.6 --episodes 8 --total-timesteps 8000 --max-steps 40 --device cpu
 BENCHMARK_AGENT_ARGS ?=
 AGENT_COUNT ?= 0
 REPO_URL ?=
@@ -24,10 +28,6 @@ AGENT_LOOP ?= 1
 RETRY_SECONDS ?= 20
 TRAIN_IMAGE_REF := us-central1-docker.pkg.dev/phantom-trc/phantom/phantom-trainer
 TPU_NAME ?=
 TPU_ZONE ?= us-central2-b
 TPU_PROJECT ?= phantom-trc
 TPU_REPO_DIR ?= /tmp/PHANTOM
 SWEEP_ENV_LOAD = set -a; [ -f "$(SWEEP_ENV_FILE)" ] && . "$(SWEEP_ENV_FILE)" || true; set +a
@@ -35,12 +35,21 @@ SWEEP_ENV_LOAD = set -a; [ -f "$(SWEEP_ENV_FILE)" ] && . "$(SWEEP_ENV_FILE)" ||
 .PHONY: help
 help:
-	@echo "pdf.build pdf.watch pdf.clean | test.backend test.e2e test.all | web.dev | install | train | train.agent | train.bootstrap | train.tpu.pod | train.tpu.vm | train.tpu.vm.sweep | stats.lines"
+	@echo "pdf.build pdf.watch pdf.clean pdf.genpop pdf.genpop.watch pdf.arxiv | test.backend test.e2e test.all | web.dev | install | train | benchmark | benchmark.simple | benchmark.agent | train.agent | train.bootstrap | stats.lines"
-	@echo "docker.train.publish"
+	@echo "backend.server backend.provider backend.worker | platform.up platform.down platform.logs | docker.train.publish"
 	@echo ""
 	@echo "Build general public version:"
 	@echo "  make pdf.genpop"
 	@echo ""
 	@echo "Local wandb run:"
 	@echo "  make train LOCAL_TRAIN_ARGS='--algo ppo --total-timesteps 50000'"
 	@echo ""
 	@echo "Local benchmark run:"
 	@echo "  make benchmark LOCAL_BENCHMARK_ARGS='--tiers static,surge,linear --alpha-values 0.0,0.3 --episodes 3 --no-wandb'"
 	@echo ""
 	@echo "Simple benchmark run (.env.sweep defaults, robust+no_robust compare by default):"
 	@echo "  make benchmark.simple"
 	@echo ""
 	@echo "Local sweep agent from this repo:"
 	@echo "  make train.agent SWEEP_ID=entity/project/id AGENT_COUNT=5"
 	@echo ""
@@ -53,166 +62,126 @@ $(BUILDDIR):
 	mkdir -p paper/$(BUILDDIR)
 .PHONY: pdf.build
-pdf.build: $(BUILDDIR)
+pdf.build:
-	@bash paper/concat_code.sh
+	@$(NX) run paper:build
 	@cd $(SRCDIR) && \
 	$(LATEXMK) -pdf -jobname=$(JOBNAME) -f \
 		-interaction=nonstopmode -file-line-error \
 		-r ../.latexmkrc \
 		-outdir=../$(BUILDDIR) $(TEX)
 .PHONY: pdf.watch
-pdf.watch: $(BUILDDIR)
+pdf.watch:
-	@cd $(SRCDIR) && \
+	@$(NX) run paper:watch
 	$(LATEXMK) -pvc -pdf -jobname=$(JOBNAME) -f \
 		-interaction=nonstopmode -file-line-error \
 		-r ../.latexmkrc \
 		-outdir=../$(BUILDDIR) $(TEX)
 .PHONY: pdf.clean
 pdf.clean:
-	@cd $(SRCDIR) && \
+	@$(NX) run paper:clean
-	$(LATEXMK) -C -jobname=$(JOBNAME) -outdir=../$(BUILDDIR) || true
+
-	rm -rf paper/$(BUILDDIR)/*
+.PHONY: pdf.genpop
 pdf.genpop:
 	@bash scripts/nx_paper.sh build-genpop
 .PHONY: pdf.genpop.watch
 pdf.genpop.watch:
 	@bash scripts/nx_paper.sh watch-genpop
 .PHONY: pdf.arxiv
 pdf.arxiv:
 	@bash scripts/nx_paper.sh build-arxiv
 .PHONY: test.backend
-test.backend: $(VENV)
+test.backend:
-	$(PYTEST) -v
+	@$(NX) run research:test
 .PHONY: test.e2e
 test.e2e:
-	@cd tests/e2e && npm install
+	@$(NX) run e2e:test
 	@cd tests/e2e && npx playwright install chromium
 	@test -f tests/e2e/.env || cp tests/e2e/.env.example tests/e2e/.env
 	@timeout 30 bash -c 'until curl -sf http://localhost:5000/health > /dev/null 2>&1; do sleep 1; done' || (echo "Backend not ready" && exit 1)
 	@timeout 30 bash -c 'until curl -sf http://localhost:3000 > /dev/null 2>&1; do sleep 1; done' || (echo "Web app not ready" && exit 1)
 	@timeout 30 bash -c 'until curl -sf http://localhost:8085/health > /dev/null 2>&1; do sleep 1; done' || (echo "Airflow not ready" && exit 1)
 	@cd tests/e2e && npm test
 .PHONY: test.all
-test.all: test.backend test.e2e
+test.all:
 	@$(NX) run-many -t test --projects=research,e2e --parallel=1
 .PHONY: web.dev
 web.dev:
-	@cd web && npm install && npm run dev
+	@$(NX) run web:dev
 $(VENV):
 	python3 -m venv $(VENV)
 	$(PIP) install --upgrade pip
 .PHONY: install
-install: $(VENV)
+install:
-	$(PIP) install -r requirements.txt
+	@$(NX) run research:install
 .PHONY: train
-train: install
+train:
-	@$(SWEEP_ENV_LOAD); test -n "$$WANDB_API_KEY" || (echo "WANDB_API_KEY required — set it in $(SWEEP_ENV_FILE)" && exit 1)
+	@WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" LOCAL_TRAIN_ARGS="$(LOCAL_TRAIN_ARGS)" $(NX) run research:train
-	@$(SWEEP_ENV_LOAD); WANDB_API_KEY="$$WANDB_API_KEY" WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" \
+
-		$(PYTHON) -m engine.train $(LOCAL_TRAIN_ARGS)
+.PHONY: benchmark
 benchmark:
 	@WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" LOCAL_BENCHMARK_ARGS="$(LOCAL_BENCHMARK_ARGS)" $(NX) run research:benchmark
 .PHONY: benchmark.simple
 benchmark.simple:
 	@WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" SIMPLE_BENCHMARK_ARGS="$(SIMPLE_BENCHMARK_ARGS)" PHANTOM_BENCHMARK_COMPARE_ROBUST="$(PHANTOM_BENCHMARK_COMPARE_ROBUST)" $(NX) run research:benchmark-simple
 .PHONY: benchmark.agent
 benchmark.agent:
 	@WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" SWEEP_ID="$(SWEEP_ID)" AGENT_COUNT="$(AGENT_COUNT)" BENCHMARK_AGENT_ARGS="$(BENCHMARK_AGENT_ARGS)" $(NX) run research:benchmark-agent
 .PHONY: train.agent
-train.agent: install
+train.agent:
-	@$(SWEEP_ENV_LOAD); test -n "$$WANDB_API_KEY" || (echo "WANDB_API_KEY required — set it in $(SWEEP_ENV_FILE)" && exit 1)
+	@WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" SWEEP_ID="$(SWEEP_ID)" AGENT_COUNT="$(AGENT_COUNT)" $(NX) run research:train-agent
 	@test -n "$(SWEEP_ID)" || (echo "SWEEP_ID required, e.g. SWEEP_ID=entity/project/id" && exit 1)
 	@$(SWEEP_ENV_LOAD); WANDB_API_KEY="$$WANDB_API_KEY" WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" \
 		$(PYTHON) -m engine.train --sweep-agent --sweep-id "$(SWEEP_ID)" \
 		$(if $(filter-out 0,$(AGENT_COUNT)),--count $(AGENT_COUNT),)
 .PHONY: train.bootstrap
 train.bootstrap:
-	@$(SWEEP_ENV_LOAD); test -n "$$WANDB_API_KEY" || (echo "WANDB_API_KEY required — set it in $(SWEEP_ENV_FILE)" && exit 1)
+	@WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" REPO_URL="$(REPO_URL)" BRANCH="$(BRANCH)" WORKDIR="$(WORKDIR)" SWEEP_ID="$(SWEEP_ID)" AGENT_COUNT="$(AGENT_COUNT)" AGENT_LOOP="$(AGENT_LOOP)" RETRY_SECONDS="$(RETRY_SECONDS)" $(NX) run research:train-bootstrap
 	@$(SWEEP_ENV_LOAD); test -n "$$GITHUB_TOKEN" || (echo "GITHUB_TOKEN required — set it in $(SWEEP_ENV_FILE)" && exit 1)
 	@test -n "$(REPO_URL)" || (echo "REPO_URL required, e.g. REPO_URL=https://github.com/org/repo.git" && exit 1)
 	@test -n "$(SWEEP_ID)" || (echo "SWEEP_ID required, e.g. SWEEP_ID=entity/project/id" && exit 1)
 	@$(SWEEP_ENV_LOAD); \
 		WANDB_API_KEY="$$WANDB_API_KEY" \
 		WANDB_ENTITY="$(WANDB_ENTITY)" \
 		WANDB_PROJECT="$(WANDB_PROJECT)" \
 		GITHUB_TOKEN="$$GITHUB_TOKEN" \
 		REPO_URL="$(REPO_URL)" \
 		BRANCH="$(BRANCH)" \
 		WORKDIR="$(WORKDIR)" \
 		SWEEP_ID="$(SWEEP_ID)" \
 		AGENT_COUNT="$(AGENT_COUNT)" \
 		AGENT_LOOP="$(AGENT_LOOP)" \
 		RETRY_SECONDS="$(RETRY_SECONDS)" \
 		bash scripts/wandb_agent_bootstrap.sh
 .PHONY: stats.lines
 stats.lines:
-	@find . \( -path '*/node_modules' -o -path '*/.venv' -o -path '*/venv' \) -prune -o \
+	@$(NX) run research:stats
 	\( -name "*.ts" -o -name "*.py" \) -type f -print0 | xargs -0 cat | wc -l
 .PHONY: wordcount
 wordcount:
-	@echo "Counting words in main text (excluding appendix)..."
+	@$(NX) run paper:wordcount
 	@texcount -nosub -total -sum -1 \
 		$(SRCDIR)/chapters/01-intro.tex \
 		$(SRCDIR)/chapters/02-literature-review.tex \
 		$(SRCDIR)/chapters/03-methodology.tex \
 		$(SRCDIR)/chapters/04-results.tex \
 		$(SRCDIR)/chapters/05-discussion.tex \
 		$(SRCDIR)/chapters/06-conclusion.tex
 .PHONY: docker.train.publish
 docker.train.publish:
-	docker build -f docker/Trainer.dockerfile --target gpu -t $(TRAIN_IMAGE_REF):gpu-latest .
+	@TRAIN_IMAGE_REF="$(TRAIN_IMAGE_REF)" $(NX) run research:docker-train-publish
 	docker push $(TRAIN_IMAGE_REF):gpu-latest
 	docker build -f docker/Trainer.dockerfile --target tpu -t $(TRAIN_IMAGE_REF):tpu-latest .
 	docker push $(TRAIN_IMAGE_REF):tpu-latest
-.PHONY: train.tpu.pod
+.PHONY: backend.server backend.provider backend.worker platform.up platform.down platform.logs
-train.tpu.pod:
+backend.server:
-	@test -n "$(TPU_NAME)"  || (echo "TPU_NAME required, e.g. TPU_NAME=TPUlong" && exit 1)
+	@$(NX) run backend-server:dev
 	@test -n "$(SWEEP_ID)"  || (echo "SWEEP_ID required, e.g. SWEEP_ID=entity/project/id" && exit 1)
 	@$(SWEEP_ENV_LOAD); test -n "$$WANDB_API_KEY" || (echo "WANDB_API_KEY required — set it in $(SWEEP_ENV_FILE)" && exit 1)
 	gcloud compute tpus tpu-vm scp scripts/tpu_pod_run.sh $(TPU_NAME):/tmp/tpu_pod_run.sh \
 		--zone=$(TPU_ZONE) --project=$(TPU_PROJECT) --worker=all
 	@$(SWEEP_ENV_LOAD); \
 		gcloud compute tpus tpu-vm ssh $(TPU_NAME) \
 		--zone=$(TPU_ZONE) --project=$(TPU_PROJECT) --worker=all \
 		--command="WANDB_API_KEY='$$WANDB_API_KEY' SWEEP_ID='$(SWEEP_ID)' AGENT_COUNT='$(AGENT_COUNT)' sh /tmp/tpu_pod_run.sh"
-.PHONY: train.tpu.vm.prepare
+backend.provider:
-train.tpu.vm.prepare:
+	@$(NX) run pricing-provider:dev
 	@test -n "$(TPU_NAME)"  || (echo "TPU_NAME required, e.g. TPU_NAME=TPUlong" && exit 1)
 	TPU_NAME="$(TPU_NAME)" TPU_ZONE="$(TPU_ZONE)" TPU_PROJECT="$(TPU_PROJECT)" \
 		LOCAL_REPO_DIR="$(CURDIR)" REMOTE_REPO_DIR="$(TPU_REPO_DIR)" \
 		sh scripts/tpu_sync_repo.sh
 	gcloud compute tpus tpu-vm scp scripts/tpu_vm_train.sh $(TPU_NAME):/tmp/tpu_vm_train.sh \
 		--zone=$(TPU_ZONE) --project=$(TPU_PROJECT) --worker=all
-.PHONY: train.tpu.vm.run
+backend.worker:
-train.tpu.vm.run:
+	@$(NX) run backend-worker:dev
 	@test -n "$(TPU_NAME)"  || (echo "TPU_NAME required, e.g. TPU_NAME=TPUlong" && exit 1)
 	@test -n "$(LOCAL_TRAIN_ARGS)" || (echo "LOCAL_TRAIN_ARGS required, e.g. --algo ppo --jax --total-timesteps 200000" && exit 1)
 	@$(SWEEP_ENV_LOAD); \
 		gcloud compute tpus tpu-vm ssh $(TPU_NAME) \
 		--zone=$(TPU_ZONE) --project=$(TPU_PROJECT) --worker=all \
 		--command="REPO_DIR='$(TPU_REPO_DIR)' TRAIN_ARGS='$(LOCAL_TRAIN_ARGS)' WANDB_API_KEY='$$WANDB_API_KEY' sh /tmp/tpu_vm_train.sh"
-.PHONY: train.tpu.vm
+platform.up:
-train.tpu.vm: train.tpu.vm.prepare train.tpu.vm.run
+	@$(NX) run platform:up
-.PHONY: train.tpu.vm.sweep
+platform.down:
-train.tpu.vm.sweep:
+	@$(NX) run platform:down
-	@test -n "$(TPU_NAME)"  || (echo "TPU_NAME required, e.g. TPU_NAME=TPUlong" && exit 1)
+
-	@test -n "$(SWEEP_ID)"  || (echo "SWEEP_ID required, e.g. SWEEP_ID=lusiana/phantom-pricing/abc123" && exit 1)
+platform.logs:
-	@$(SWEEP_ENV_LOAD); test -n "$$WANDB_API_KEY" || (echo "WANDB_API_KEY required — set it in $(SWEEP_ENV_FILE)" && exit 1)
+	@$(NX) run platform:logs
 	@$(SWEEP_ENV_LOAD); WANDB_API_KEY="$$WANDB_API_KEY" \
 		python3 scripts/tpu_vm_sweep_agent.py \
 		--sweep-id "$(SWEEP_ID)" \
 		--tpu-name "$(TPU_NAME)" \
 		--tpu-zone "$(TPU_ZONE)" \
 		--tpu-project "$(TPU_PROJECT)" \
 		--tpu-repo-dir "$(TPU_REPO_DIR)" \
 		$(if $(filter-out 0,$(AGENT_COUNT)),--count $(AGENT_COUNT),)
 .PHONY: pdf clean watch run.webapp test count-lines all
-pdf: pdf.build
+pdf:
-clean: pdf.clean
+	@$(NX) run paper:build
-watch: pdf.watch
+
-run.webapp: web.dev
+clean:
-test: test.backend
+	@$(NX) run paper:clean
-count-lines: stats.lines
+
-all: pdf.build
+watch:
 	@$(NX) run paper:watch
 run.webapp:
 	@$(NX) run web:dev
 test:
 	@$(NX) run research:test
 count-lines:
 	@$(NX) run research:stats
 all:
 	@$(NX) run paper:build
--- a/backend/project.json
+++ b/backend/project.json
@@ -0,0 +1,33 @@
 {
  "$schema": "../node_modules/nx/schemas/project-schema.json",
  "name": "platform",
  "projectType": "application",
  "sourceRoot": "backend",
  "targets": {
    "up": {
      "executor": "nx:run-commands",
      "options": {
        "command": "docker compose up -d",
        "cwd": "."
      }
    },
    "down": {
      "executor": "nx:run-commands",
      "options": {
        "command": "docker compose down",
        "cwd": "."
      }
    },
    "logs": {
      "executor": "nx:run-commands",
      "options": {
        "command": "docker compose logs --tail=100 -f",
        "cwd": "."
      }
    }
  },
  "tags": [
    "scope:platform",
    "type:infra"
  ]
 }
--- a/backend/provider/project.json
+++ b/backend/provider/project.json
@@ -0,0 +1,39 @@
 {
  "$schema": "../../node_modules/nx/schemas/project-schema.json",
  "name": "pricing-provider",
  "projectType": "application",
  "sourceRoot": "backend/provider",
  "targets": {
    "install": {
      "executor": "nx:run-commands",
      "options": {
        "command": "bash -lc '[ -x ../../.venv/bin/python ] || python3 -m venv ../../.venv; ../../.venv/bin/python -m ensurepip --upgrade; ../../.venv/bin/python -m pip install -r requirements.txt'",
        "cwd": "backend/provider"
      }
    },
    "dev": {
      "executor": "nx:run-commands",
      "dependsOn": [
        "install"
      ],
      "options": {
        "command": "../../.venv/bin/uvicorn app:app --host 0.0.0.0 --port ${PROVIDER_PORT:-5001} --reload",
        "cwd": "backend/provider"
      }
    },
    "start": {
      "executor": "nx:run-commands",
      "dependsOn": [
        "install"
      ],
      "options": {
        "command": "../../.venv/bin/uvicorn app:app --host 0.0.0.0 --port ${PROVIDER_PORT:-5001}",
        "cwd": "backend/provider"
      }
    }
  },
  "tags": [
    "scope:backend",
    "type:provider"
  ]
 }
--- a/backend/server/project.json
+++ b/backend/server/project.json
@@ -0,0 +1,39 @@
 {
  "$schema": "../../node_modules/nx/schemas/project-schema.json",
  "name": "backend-server",
  "projectType": "application",
  "sourceRoot": "backend/server",
  "targets": {
    "install": {
      "executor": "nx:run-commands",
      "options": {
        "command": "bash -lc '[ -x ../../.venv/bin/python ] || python3 -m venv ../../.venv; ../../.venv/bin/python -m ensurepip --upgrade; ../../.venv/bin/python -m pip install -r requirements.txt'",
        "cwd": "backend/server"
      }
    },
    "dev": {
      "executor": "nx:run-commands",
      "dependsOn": [
        "install"
      ],
      "options": {
        "command": "../../.venv/bin/uvicorn app:app --host 0.0.0.0 --port ${BACKEND_PORT:-5000} --reload",
        "cwd": "backend/server"
      }
    },
    "start": {
      "executor": "nx:run-commands",
      "dependsOn": [
        "install"
      ],
      "options": {
        "command": "../../.venv/bin/uvicorn app:app --host 0.0.0.0 --port ${BACKEND_PORT:-5000}",
        "cwd": "backend/server"
      }
    }
  },
  "tags": [
    "scope:backend",
    "type:api"
  ]
 }
--- a/backend/server/requirements.txt
+++ b/backend/server/requirements.txt
@@ -1,6 +1,6 @@
-fastapi==0.104.1
+fastapi>=0.135,<0.136
-uvicorn[standard]==0.24.0
+uvicorn[standard]>=0.41,<0.42
-kafka-python==2.0.2
+kafka-python>=2.3,<2.4
-pydantic==2.5.0
+pydantic>=2.12,<3
-python-dotenv==1.0.0
+python-dotenv>=1.0,<2
-supabase==2.9.1
+supabase>=2.28,<3
--- a/backend/worker/project.json
+++ b/backend/worker/project.json
@@ -0,0 +1,39 @@
 {
  "$schema": "../../node_modules/nx/schemas/project-schema.json",
  "name": "backend-worker",
  "projectType": "application",
  "sourceRoot": "backend/worker",
  "targets": {
    "install": {
      "executor": "nx:run-commands",
      "options": {
        "command": "bash -lc '[ -x ../../.venv/bin/python ] || python3 -m venv ../../.venv; ../../.venv/bin/python -m ensurepip --upgrade; ../../.venv/bin/python -m pip install -r requirements.txt'",
        "cwd": "backend/worker"
      }
    },
    "dev": {
      "executor": "nx:run-commands",
      "dependsOn": [
        "install"
      ],
      "options": {
        "command": "../../.venv/bin/celery -A main:app worker --loglevel=info",
        "cwd": "backend/worker"
      }
    },
    "start": {
      "executor": "nx:run-commands",
      "dependsOn": [
        "install"
      ],
      "options": {
        "command": "../../.venv/bin/python main.py",
        "cwd": "backend/worker"
      }
    }
  },
  "tags": [
    "scope:backend",
    "type:worker"
  ]
 }
--- a/backend/worker/requirements.txt
+++ b/backend/worker/requirements.txt
@@ -0,0 +1,3 @@
 celery>=5.3,<6
 python-dotenv>=1.0.0
 redis>=5.0.0
--- a/docker/Trainer.dockerfile
+++ b/docker/Trainer.dockerfile
@@ -7,36 +7,9 @@ WORKDIR /app
 COPY docker/trainer.requirements.txt /tmp/requirements.txt
 RUN pip install --no-cache-dir -r /tmp/requirements.txt
 # Optional for JAX-on-GPU workflows.
 ARG INSTALL_JAX_GPU=false
 RUN if [ "${INSTALL_JAX_GPU}" = "true" ]; then \
      pip install --no-cache-dir "jax[cuda12]==0.4.30" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html; \
    fi
 COPY --chmod=755 docker/trainer-agent-entrypoint.sh /usr/local/bin/trainer-agent-entrypoint
 COPY engine /app/engine
-ENV PYTHONPATH=/app \
-    XLA_PYTHON_CLIENT_PREALLOCATE=false
+ENV PYTHONPATH=/app
 ENTRYPOINT ["/usr/local/bin/trainer-agent-entrypoint"]
 FROM python:3.11-slim AS tpu
 WORKDIR /app
 COPY docker/trainer.requirements.txt /tmp/requirements.txt
 RUN pip install --no-cache-dir -r /tmp/requirements.txt
 RUN pip install --no-cache-dir "jax[tpu]==0.4.30" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
 COPY --chmod=755 docker/trainer-agent-entrypoint.sh /usr/local/bin/trainer-agent-entrypoint
 COPY engine /app/engine
 ENV PYTHONPATH=/app \
    PHANTOM_USE_JAX=1 \
    PHANTOM_DEFAULT_AGENT_ARGS="--jax" \
    XLA_PYTHON_CLIENT_PREALLOCATE=false
 ENTRYPOINT ["/usr/local/bin/trainer-agent-entrypoint"]
--- a/docker/trainer.requirements.txt
+++ b/docker/trainer.requirements.txt
@@ -5,9 +5,3 @@ gymnasium>=0.29.0
 stable-baselines3>=2.2.0
 tensorboard>=2.15.0
 wandb>=0.17.0
 tensorflow-probability==0.24.0
 flax==0.10.7
 optax==0.2.7
 distrax==0.1.5
 orbax-checkpoint==0.11.32
 chex==0.1.90
--- a/docs/index.html
+++ b/docs/index.html
@@ -17,8 +17,8 @@
  <meta property="og:site_name" content="PHANTOM Research">
  <meta property="og:title" content="PHANTOM: Pricing Heuristics Against Non-human Transaction Orchestration Mechanisms">
  <meta property="og:description" content="Developing pricing heuristics to protect e-commerce platforms from systematic exploitation by LLM agents in dynamic pricing environments through behavioral signature detection.">
-  <meta property="og:url" content="TODO">
+  <meta property="og:url" content="https://velocitatem.github.io/PHANTOM/">
-  <meta property="og:image" content="TODO">
+  <meta property="og:image" content="https://raw.githubusercontent.com/velocitatem/PHANTOM/main/docs/static/images/carousel1.jpg">
  <meta property="og:image:width" content="1200">
  <meta property="og:image:height" content="630">
  <meta property="og:image:alt" content="PHANTOM Research Preview">
@@ -30,17 +30,12 @@
  <!-- Twitter -->
  <meta name="twitter:card" content="summary_large_image">
-  <!-- TODO: Replace with your lab/institution Twitter handle -->
+  <meta name="twitter:site" content="@velocitatem">
-  <meta name="twitter:site" content="@YOUR_TWITTER_HANDLE">
+  <meta name="twitter:creator" content="@velocitatem">
-  <!-- TODO: Replace with first author's Twitter handle -->
+  <meta name="twitter:title" content="PHANTOM: Pricing Heuristics Against Non-human Transaction Orchestration Mechanisms">
-  <meta name="twitter:creator" content="@AUTHOR_TWITTER_HANDLE">
+  <meta name="twitter:description" content="A thesis project on defending dynamic pricing against LLM-driven reconnaissance and transaction orchestration.">
-  <!-- TODO: Same as paper title above -->
+  <meta name="twitter:image" content="https://raw.githubusercontent.com/velocitatem/PHANTOM/main/docs/static/images/carousel1.jpg">
-  <meta name="twitter:title" content="PAPER_TITLE">
+  <meta name="twitter:image:alt" content="PHANTOM research visual">
  <!-- TODO: Same as description above -->
  <meta name="twitter:description" content="BRIEF_DESCRIPTION_OF_YOUR_RESEARCH_CONTRIBUTION_AND_FINDINGS">
  <!-- TODO: Same as social preview image above -->
  <meta name="twitter:image" content="https://YOUR_DOMAIN.com/static/images/social_preview.png">
  <meta name="twitter:image:alt" content="PAPER_TITLE - Research Preview">
  <!-- Academic/Research Specific -->
  <meta name="citation_title" content="Pricing Heuristics Against Non-human Transaction Orchestration Mechanisms">
@@ -103,50 +98,42 @@
  {
    "@context": "https://schema.org",
    "@type": "ScholarlyArticle",
-    "headline": "PAPER_TITLE",
+    "headline": "PHANTOM: Pricing Heuristics Against Non-human Transaction Orchestration Mechanisms",
-    "description": "BRIEF_DESCRIPTION_OF_YOUR_RESEARCH_CONTRIBUTION_AND_FINDINGS",
+    "description": "Research on preserving dynamic pricing integrity under LLM-mediated reconnaissance and purchasing behavior.",
    "author": [
      {
        "@type": "Person",
-        "name": "FIRST_AUTHOR_NAME",
+        "name": "Daniel Rösel",
        "affiliation": {
          "@type": "Organization",
-          "name": "INSTITUTION_NAME"
+          "name": "IE University"
        }
      },
      {
        "@type": "Person",
        "name": "SECOND_AUTHOR_NAME",
        "affiliation": {
          "@type": "Organization",
          "name": "INSTITUTION_NAME"
        }
      }
    ],
-    "datePublished": "2024-01-01",
+    "datePublished": "2025-01-01",
    "publisher": {
      "@type": "Organization",
-      "name": "CONFERENCE_OR_JOURNAL_NAME"
+      "name": "IE University"
    },
-    "url": "https://YOUR_DOMAIN.com/YOUR_PROJECT_PAGE",
+    "url": "https://velocitatem.github.io/PHANTOM/",
-    "image": "https://YOUR_DOMAIN.com/static/images/social_preview.png",
+    "image": "https://raw.githubusercontent.com/velocitatem/PHANTOM/main/docs/static/images/carousel1.jpg",
-    "keywords": ["KEYWORD1", "KEYWORD2", "KEYWORD3", "machine learning", "computer vision"],
+    "keywords": ["dynamic pricing", "llm agents", "e-commerce", "distributionally robust optimization", "reinforcement learning"],
-    "abstract": "FULL_ABSTRACT_TEXT_HERE",
+    "abstract": "This thesis formalizes Cost of Information erosion under agentic reconnaissance, learns separable human and agent behavior kernels, and trains contamination-aware robust pricing policies.",
-    "citation": "BIBTEX_CITATION_HERE",
+    "citation": "Rösel, Daniel. PHANTOM: Pricing Heuristics Against Non-human Transaction Orchestration Mechanisms. IE University, 2025.",
    "isAccessibleForFree": true,
    "license": "https://creativecommons.org/licenses/by/4.0/",
    "mainEntity": {
      "@type": "WebPage",
-      "@id": "https://YOUR_DOMAIN.com/YOUR_PROJECT_PAGE"
+      "@id": "https://velocitatem.github.io/PHANTOM/"
    },
    "about": [
      {
        "@type": "Thing",
-        "name": "RESEARCH_AREA_1"
+        "name": "Dynamic Pricing"
      },
      {
        "@type": "Thing",
-        "name": "RESEARCH_AREA_2"
+        "name": "Agent Behavior Modeling"
      }
    ]
  }
@@ -158,8 +145,7 @@
    "@context": "https://schema.org",
    "@type": "Organization",
    "name": "IE University",
-    "url": "https://www.ie.edu",
-    "logo": "TODO"
+    "url": "https://www.ie.edu"
  }
  </script>
 </head>
@@ -173,45 +159,72 @@
  <!-- More Works Dropdown -->
  <div class="more-works-container">
-    <button class="more-works-btn" onclick="toggleMoreWorks()" title="View More Works from Our Lab">
+    <button class="more-works-btn" onclick="toggleMoreWorks()" title="View project links and artifacts">
      <i class="fas fa-flask"></i>
-      More Works
+      Project Links
      <i class="fas fa-chevron-down dropdown-arrow"></i>
    </button>
    <div class="more-works-dropdown" id="moreWorksDropdown">
      <div class="dropdown-header">
-        <h4>More Works from Our Lab</h4>
+        <h4>Project Links</h4>
        <button class="close-btn" onclick="toggleMoreWorks()">
          <i class="fas fa-times"></i>
        </button>
      </div>
      <div class="works-list">
-        <!-- TODO: Replace with your lab's related works -->
+        <a href="https://pub-d5b94a3c29fd40c6b3881946e463fdb7.r2.dev/thesis-latest.pdf" class="work-item" target="_blank">
-        <a href="https://arxiv.org/abs/PAPER_ID_1" class="work-item" target="_blank">
          <div class="work-info">
-            <!-- TODO: Replace with actual paper title -->
+            <h5>Thesis PDF</h5>
-            <h5>Paper Title 1</h5>
+            <p>Latest public build of the full thesis document.</p>
-            <!-- TODO: Replace with brief description -->
+            <span class="work-venue">IE University, 2025</span>
-            <p>Brief description of the work and its main contribution.</p>
-            <!-- TODO: Replace with venue and year -->
-            <span class="work-venue">Conference/Journal 2024</span>
          </div>
          <i class="fas fa-external-link-alt"></i>
        </a>
-        <!-- TODO: Add more related works or remove extra items -->
+        <a href="https://github.com/velocitatem/PHANTOM" class="work-item" target="_blank">
-        <a href="https://arxiv.org/abs/PAPER_ID_2" class="work-item" target="_blank">
          <div class="work-info">
-            <h5>Paper Title 2</h5>
+            <h5>PHANTOM Repository</h5>
-            <p>Brief description of the work and its main contribution.</p>
+            <p>Monorepo with paper source, platform code, and experiments.</p>
-            <span class="work-venue">Conference/Journal 2023</span>
+            <span class="work-venue">Open Source</span>
          </div>
          <i class="fas fa-external-link-alt"></i>
        </a>
-        <a href="https://arxiv.org/abs/PAPER_ID_3" class="work-item" target="_blank">
+        <a href="https://github.com/velocitatem/p4p" class="work-item" target="_blank">
          <div class="work-info">
-            <h5>Paper Title 3</h5>
+            <h5>P4P Interaction Layer</h5>
-            <p>Brief description of the work and its main contribution.</p>
+            <p>Reusable storefront and logging layer released for replication.</p>
-            <span class="work-venue">Conference/Journal 2023</span>
+            <span class="work-venue">Public Artifact</span>
          </div>
          <i class="fas fa-external-link-alt"></i>
        </a>
        <a href="https://phantom-hotel.vercel.app" class="work-item" target="_blank">
          <div class="work-info">
            <h5>Hotel Mode Demo</h5>
            <p>Public deployment of the hotel-style experiment interface.</p>
            <span class="work-venue">Live Demo</span>
          </div>
          <i class="fas fa-external-link-alt"></i>
        </a>
        <a href="https://phantom-airline.vercel.app" class="work-item" target="_blank">
          <div class="work-info">
            <h5>Airline Mode Demo</h5>
            <p>Public deployment of the airline-style experiment interface.</p>
            <span class="work-venue">Live Demo</span>
          </div>
          <i class="fas fa-external-link-alt"></i>
        </a>
        <a href="https://blog.alves.world/series/phantom" class="work-item" target="_blank">
          <div class="work-info">
            <h5>Blog Series</h5>
            <p>Behind-the-scenes posts covering thesis process, tooling, and insights.</p>
            <span class="work-venue">To Boldly Code</span>
          </div>
          <i class="fas fa-external-link-alt"></i>
        </a>
        <a href="goals/README.md" class="work-item" target="_blank">
          <div class="work-info">
            <h5>Goal Library</h5>
            <p>Task definitions used to assign actor objectives in experiments.</p>
            <span class="work-venue">Experiment Design</span>
          </div>
          <i class="fas fa-external-link-alt"></i>
        </a>
@@ -238,6 +251,16 @@
                  <div class="column has-text-centered">
                    <div class="publication-links">
                      <span class="link-block">
                        <a href="https://blog.alves.world/series/phantom" target="_blank"
                        class="external-link button is-normal is-rounded is-dark">
                        <span class="icon">
                          <i class="fas fa-blog"></i>
                        </span>
                        <span>Blog Series</span>
                      </a>
                    </span>
                    <span class="link-block">
                        <a href="https://pub-d5b94a3c29fd40c6b3881946e463fdb7.r2.dev/thesis-latest.pdf" target="_blank"
                        class="external-link button is-normal is-rounded is-dark">
@@ -248,14 +271,13 @@
                      </a>
                    </span>
                    <!-- Link to the goal set CSV (goals/goals.csv) -->
                    <span class="link-block">
-                      <a href="static/pdfs/supplementary_material.pdf" target="_blank"
+                      <a href="goals/goals.csv" target="_blank"
                      class="external-link button is-normal is-rounded is-dark">
                      <span class="icon">
-                        <i class="fas fa-file-pdf"></i>
+                        <i class="fas fa-list"></i>
                      </span>
-                      <span>Supplementary</span>
+                      <span>Goal Set</span>
                    </a>
                  </span>
@@ -269,14 +291,23 @@
                  </a>
                </span>
                <!-- Links to the live hotel and airline demos -->
                <span class="link-block">
-                  <a href="https://arxiv.org/abs/<ARXIV PAPER ID>" target="_blank"
+                  <a href="https://phantom-hotel.vercel.app" target="_blank"
                  class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
-                    <i class="ai ai-arxiv"></i>
+                    <i class="fas fa-globe"></i>
                  </span>
-                  <span>arXiv</span>
+                  <span>Hotel Demo</span>
                </a>
              </span>
              <span class="link-block">
                <a href="https://phantom-airline.vercel.app" target="_blank"
                class="external-link button is-normal is-rounded is-dark">
                <span class="icon">
                  <i class="fas fa-plane"></i>
                </span>
                <span>Airline Demo</span>
              </a>
            </span>
            </div>
@@ -284,27 +315,19 @@
        </div>
      </div>
    </div>
  </div>
 </section>
 <!-- Teaser video-->
 <section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="hero-body">
-      <!-- TODO: Replace with your teaser video -->
+      <div class="publication-banner">
-      <video poster="" id="tree" autoplay controls muted loop height="100%" preload="metadata">
+        <img src="static/images/banner.svg" alt="PHANTOM teaser diagram connecting vulnerability, behavioral signal, and robust control" width="1920" height="1080" decoding="async" style="display:block; width:100%; height:auto;" onerror="this.onerror=null;this.src='static/images/carousel2.jpg';"/>
-        <!-- TODO: Add your video file path here -->
+      </div>
-        <source src="static/videos/banner_video.mp4" type="video/mp4">
-      </video>
      <!-- TODO: Replace with your video description -->
      <h2 class="subtitle has-text-centered">
        Aliquam vitae elit ullamcorper tellus egestas pellentesque. Ut lacus tellus, maximus vel lectus at, placerat pretium mi. Maecenas dignissim tincidunt vestibulum. Sed consequat hendrerit nisl ut maximus.
      </h2>
    </div>
  </div>
 </section>
-<!-- End teaser video -->
+
 <!-- Paper abstract -->
 <section class="section hero is-light">
@@ -314,10 +337,10 @@
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p>
-            This research establishes the following contributions: definition and formalization of non-human transactors in e-commerce platforms, development of a testing-ground for capturing the behavioral essence of these transactors across a large variety of digital systems, construction of a discriminative model to prove separability as a strong learner for downstream mitigation of contamination by non-human entities, translation of such learned separability into existing dynamic pricing machine learning loops, and establishment of a high-level KPI-affecting causal effect and cost-saving framework for the future of internet commerce in the presence of such non-human learners.
+            When you shop online, prices often change based on how much interest you show — the more you browse, the more the site learns about your intent and may raise prices accordingly. This works because stores assume that a curious, engaged shopper is more likely to buy. But AI assistants are now doing the shopping research on behalf of users: they browse in one session to gather price information and then let the user purchase in a fresh session at the lower, unadjusted price. The store never sees the connection between the two, so it never gets to factor in that genuine intent — and loses the revenue it would have earned.
          </p>
          <p>
-            This work develops behavioral signature models using recommendation system techniques to profile session-level interaction, temporal engagement, and cross-session correlation. The AI Agent market is forecasted to grow from around USD 5-8 billion in 2025 to USD 42-52 billion by 2030, raising the question of how these systems should be designed for future robustness and how to maintain a competitive edge in the analytical components of e-commerce platforms.
+            PHANTOM studies this problem and builds defenses against it. We created a realistic fake store (in hotel and airline modes) where both real people and AI agents were given shopping tasks, and we recorded every click, scroll, and page visit. By comparing how humans and AI agents move through a site, we found clear patterns that tell them apart. We then used those patterns to build a smarter pricing system that can recognize when it is likely talking to an AI scout and adjust its strategy accordingly — protecting the store's margins without making things worse for genuine shoppers.
          </p>
        </div>
      </div>
@@ -326,97 +349,90 @@
 </section>
 <!-- End paper abstract -->
 <section class="section">
  <div class="container is-max-desktop">
    <div class="content has-text-justified">
      <h2 class="title is-3 has-text-centered">Project Scope</h2>
      <p>
        The current thesis revision extends both theory and implementation. The main research question is how a pricing system can preserve margin integrity when browsing and purchasing are increasingly orchestrated by AI agents.
      </p>
      <ul>
        <li>Formal contribution: a Cost of Information erosion theorem showing why price-query saturation can collapse dynamic pricing power.</li>
        <li>System contribution: a hybrid online/offline stack (Next.js storefront, pricing provider, Kafka event streams, Airflow ETL, Redis serving layer).</li>
        <li>Modeling contribution: class-specific transition kernels for human and agent behavior, with KL-divergence based separability scores.</li>
        <li>Control contribution: a contamination-aware DR-RL pricing policy trained under distributional uncertainty using Wasserstein-style robustness.</li>
      </ul>
      <p>
        Controlled trials currently include balanced human and agent sessions with goal-driven tasks across hotel and airline interfaces. Early separability results are strong (Mann-Whitney U=2.0, p=0.0006), while robust pricing gains remain regime-dependent and are being calibrated in larger sweeps.
      </p>
    </div>
  </div>
 </section>
 <!-- Image carousel -->
 <!--
 <section class="hero is-small">
  <div class="hero-body">
    <div class="container">
      <div id="results-carousel" class="carousel results-carousel">
       <div class="item">
        [TODO: Replace with your research result images]
        <img src="static/images/carousel1.jpg" alt="First research result visualization" loading="lazy"/>
        [TODO: Replace with description of this result]
        <h2 class="subtitle has-text-centered">
-          First image description.
+          Early simulator traces showing how policy choice can push prices toward aggressive high-end regimes.
        </h2>
      </div>
      <div class="item">
        [Your image here]
        <img src="static/images/carousel2.jpg" alt="Second research result visualization" loading="lazy"/>
        <h2 class="subtitle has-text-centered">
-          Second image description.
+          Human and agent behavior diverge at the transition-kernel level, enabling usable session-level separability.
        </h2>
      </div>
      <div class="item">
        [Your image here]
        <img src="static/images/carousel3.jpg" alt="Third research result visualization" loading="lazy"/>
        <h2 class="subtitle has-text-centered">
-         Third image description.
+         End-to-end architecture linking web interactions, pricing queries, event streams, and model updates.
       </h2>
     </div>
     <div class="item">
      [Your image here]
      <img src="static/images/carousel4.jpg" alt="Fourth research result visualization" loading="lazy"/>
      <h2 class="subtitle has-text-centered">
-        Fourth image description.
+        Contamination-aware evaluation compares robust and non-robust pricing behavior across alpha sweeps.
      </h2>
    </div>
  </div>
 </div>
 </div>
 </section>
 -->
 <!-- End image carousel -->
 <!-- Youtube video -->
 <section class="hero is-small is-light">
  <div class="hero-body">
    <div class="container">
      <!-- Paper video. -->
      <h2 class="title is-3">Video Presentation</h2>
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
          <div class="publication-video">
            <!-- TODO: Replace with your YouTube video ID -->
            <iframe src="https://www.youtube.com/embed/JkaxUblCGz0" frameborder="0" allow="autoplay; encrypted-media" allowfullscreen></iframe>
          </div>
        </div>
      </div>
    </div>
  </div>
 </section>
 <!-- End youtube video -->
 <!-- Video carousel -->
 <section class="hero is-small">
  <div class="hero-body">
    <div class="container">
-      <h2 class="title is-3">Another Carousel</h2>
+      <h2 class="title is-3">Defense Scenes</h2>
-      <div id="results-carousel" class="carousel results-carousel">
+      <div id="videos-carousel" class="carousel results-carousel">
        <div class="item item-video1">
          <!-- TODO: Add poster image for better preview -->
          <video poster="" id="video1" controls muted loop height="100%" preload="metadata">
-            <!-- Your video file here -->
+            <source src="static/videos/COIFirstPrinciplesScene.mp4" type="video/mp4">
            <source src="static/videos/carousel1.mp4" type="video/mp4">
          </video>
          <h2 class="subtitle has-text-centered">COI from first principles.</h2>
        </div>
        <div class="item item-video2">
          <!-- TODO: Add poster image for better preview -->
          <video poster="" id="video2" controls muted loop height="100%" preload="metadata">
-            <!-- Your video file here -->
+            <source src="static/videos/BehaviorKernelConstructionScene.mp4" type="video/mp4">
            <source src="static/videos/carousel2.mp4" type="video/mp4">
          </video>
          <h2 class="subtitle has-text-centered">Behavioral kernel construction: learning how humans and agents differ.</h2>
        </div>
        <div class="item item-video3">
          <!-- TODO: Add poster image for better preview -->
          <video poster="" id="video3" controls muted loop height="100%" preload="metadata">
-            <!-- Your video file here -->
+            <source src="static/videos/RobustControlScene.mp4" type="video/mp4">
            <source src="static/videos/carousel3.mp4" type="video/mp4">
          </video>
          <h2 class="subtitle has-text-centered">Distributionally robust control loop.</h2>
        </div>
      </div>
    </div>
@@ -433,9 +449,9 @@
 <section class="hero is-small is-light">
  <div class="hero-body">
      <div class="container">
-      <h2 class="title">Poster</h2>
+      <h2 class="title">Full Thesis</h2>
-      <iframe  src="https://pub-d5b94a3c29fd40c6b3881946e463fdb7.r2.dev/thesis-latest.pdf" width="100%" height="550">
+      <iframe title="PHANTOM thesis PDF" src="https://pub-d5b94a3c29fd40c6b3881946e463fdb7.r2.dev/thesis-latest.pdf" width="100%" height="550">
          </iframe>
      </div>
@@ -457,7 +473,7 @@
      </div>
      <pre id="bibtex-code"><code>@thesis{Rosel2025PHANTOM,
  title={Pricing Heuristics Against Non-human Transaction Orchestration Mechanisms},
-  author={R{\"o}sel, Daniel},
+  author={Rösel, Daniel},
  school={IE University},
  year={2025},
  address={Madrid, Spain},
--- a/docs/static/images/banner.svg
+++ b/docs/static/images/banner.svg
@@ -0,0 +1,246 @@
 <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 1920 1080" width="1920" height="1080" style="background-color: #FAFAFA; font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;">
    <defs>
        <!-- Soft Drop Shadow for Panels -->
        <filter id="shadow" x="-10%" y="-10%" width="130%" height="130%">
            <feDropShadow dx="2" dy="4" stdDeviation="6" flood-color="#000000" flood-opacity="0.06"/>
        </filter>
        <filter id="light-shadow" x="-5%" y="-5%" width="110%" height="110%">
            <feDropShadow dx="1" dy="2" stdDeviation="2" flood-color="#000000" flood-opacity="0.04"/>
        </filter>
        <!-- Arrowhead Marker -->
        <marker id="arrow" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="6" markerHeight="6" orient="auto-start-reverse">
            <path d="M 0 0 L 10 5 L 0 10 z" fill="#888888" />
        </marker>
        <marker id="arrow-dark" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="6" markerHeight="6" orient="auto-start-reverse">
            <path d="M 0 0 L 10 5 L 0 10 z" fill="#555555" />
        </marker>
    </defs>
    <!-- COLUMN DIVIDERS -->
    <line x1="640" y1="60" x2="640" y2="1020" stroke="#EAEAEA" stroke-width="2" stroke-dasharray="10,10"/>
    <line x1="1280" y1="60" x2="1280" y2="1020" stroke="#EAEAEA" stroke-width="2" stroke-dasharray="10,10"/>
    <!-- ========================================================= -->
    <!-- COLUMN 1: THE THREAT (COI & SATURATION)                   -->
    <!-- ========================================================= -->
    <text x="60" y="80" font-family="Georgia, serif" font-size="28" font-weight="bold" fill="#333333">1. The Vulnerability</text>
    <line x1="60" y1="100" x2="580" y2="100" stroke="#DDDDDD" stroke-width="2"/>
    <!-- Top: COI Bell Curve -->
    <g transform="translate(60, 130)">
        <text x="0" y="30" font-size="24" font-weight="bold" fill="#444">Cost of Information from First Principles</text>
        <text x="0" y="70" font-family="Georgia, serif" font-style="italic" font-size="22" fill="#8C7A6B">P ~ π(τ)</text>
        <text x="0" y="105" font-family="Georgia, serif" font-style="italic" font-size="22" fill="#8C7A6B"><tspan text-decoration="underline">p</tspan> = reservation price</text>
        <text x="0" y="140" font-family="Georgia, serif" font-style="italic" font-size="22" fill="#8C7A6B">M = P - <tspan text-decoration="underline">p</tspan></text>
        <!-- Bell Curve -->
        <path d="M 40 340 C 140 340, 160 160, 260 160 C 360 160, 380 340, 480 340" stroke="#3AB09E" stroke-width="5" fill="none"/>
        <line x1="40" y1="340" x2="500" y2="340" stroke="#333" stroke-width="2"/>
        <!-- Markers p and E[P] -->
        <line x1="150" y1="340" x2="150" y2="160" stroke="#E37862" stroke-width="2" stroke-dasharray="6,4"/>
        <text x="150" y="375" font-family="Georgia" font-style="italic" font-size="22" fill="#E37862" text-anchor="middle">p</text>
        <line x1="260" y1="340" x2="260" y2="160" stroke="#85B589" stroke-width="2" stroke-dasharray="6,4"/>
        <text x="260" y="375" font-family="Georgia" font-style="italic" font-size="22" fill="#85B589" text-anchor="middle">E[P]</text>
        <!-- COI Annotation -->
        <line x1="150" y1="150" x2="260" y2="150" stroke="#E37862" stroke-width="2" marker-start="url(#arrow)" marker-end="url(#arrow)"/>
        <text x="310" y="138" font-size="16" fill="#E37862" text-anchor="middle">average information rent</text>
        <text x="310" y="118" font-family="Georgia" font-style="italic" font-size="22" fill="#E37862" font-weight="bold" text-anchor="middle">COI := E[P] - p</text>
    </g>
    <!-- Bottom: Agent Saturation -->
    <g transform="translate(60, 580)">
        <text x="0" y="30" font-size="24" font-weight="bold" fill="#444">Why COI Erodes with Agent Saturation</text>
        <text x="0" y="75" font-family="Georgia, serif" font-style="italic" font-size="22" fill="#8C7A6B">p<tspan font-size="14" dy="5">(1)</tspan><tspan dy="-5"> = min(p</tspan><tspan font-size="14" dy="5">1</tspan><tspan dy="-5">, ..., p</tspan><tspan font-size="14" dy="5">N</tspan><tspan dy="-5">)</tspan></text>
        <text x="0" y="115" font-family="Georgia, serif" font-style="italic" font-size="22" fill="#8C7A6B">P(p<tspan font-size="14" dy="5">(1)</tspan><tspan dy="-5"> > t) = [1 - F(t)]</tspan><tspan font-size="14" dy="-10">N</tspan></text>
        <!-- Erosion Graph -->
        <rect x="120" y="150" width="280" height="230" fill="#FFFFFF" filter="url(#shadow)" rx="8"/>
        <line x1="140" y1="350" x2="380" y2="350" stroke="#333" stroke-width="2"/>
        <line x1="140" y1="350" x2="140" y2="170" stroke="#333" stroke-width="2"/>
        <text x="260" y="375" font-size="16" font-style="italic" fill="#555" text-anchor="middle">F(t)</text>
        <text x="120" y="260" font-size="16" font-style="italic" fill="#555" text-anchor="middle" transform="rotate(-90 120 260)">[1 - F(t)]^N</text>
        <!-- Curves -->
        <path d="M 140 170 C 220 250, 300 320, 380 350" stroke="#4EA5D9" stroke-width="3" fill="none"/>
        <text x="390" y="220" font-size="16" fill="#4EA5D9" font-weight="bold">N=1</text>
        <path d="M 140 170 C 180 260, 240 330, 380 350" stroke="#85B589" stroke-width="3" fill="none"/>
        <text x="390" y="250" font-size="16" fill="#85B589" font-weight="bold">N=4</text>
        <path d="M 140 170 C 150 290, 180 340, 380 350" stroke="#E37862" stroke-width="3" fill="none"/>
        <text x="390" y="280" font-size="16" fill="#E37862" font-weight="bold">N=16</text>
        <text x="260" y="420" font-size="20" fill="#555" text-anchor="middle">As independent query count grows,</text>
        <text x="260" y="445" font-size="20" fill="#E37862" font-weight="bold" text-anchor="middle">realizable markup collapses.</text>
    </g>
    <!-- ========================================================= -->
    <!-- COLUMN 2: THE BEHAVIORAL SIGNAL                           -->
    <!-- ========================================================= -->
    <text x="700" y="80" font-family="Georgia, serif" font-size="28" font-weight="bold" fill="#333333">2. The Behavioral Signals</text>
    <line x1="700" y1="100" x2="1220" y2="100" stroke="#DDDDDD" stroke-width="2"/>
    <!-- Top: Transition Kernels -->
    <g transform="translate(700, 130)">
        <text x="0" y="30" font-size="24" font-weight="bold" fill="#444">From Session Paths to Transition Kernels</text>
        <text x="0" y="75" font-size="20" fill="#85B589" font-weight="bold">human: start → view → detail → cart → purchase</text>
        <text x="0" y="115" font-size="20" fill="#E37862" font-weight="bold">agent: start → view → detail → view → detail</text>
        <text x="0" y="170" font-family="Georgia, serif" font-style="italic" font-size="24" fill="#8C7A6B">
            P&#770;(s'|s) = <tspan font-size="18" dy="-12">N(s,s')</tspan> / <tspan font-size="18" dy="12">Σ N(s,k)</tspan>
        </text>
        <!-- Matrix Representation -->
        <rect x="0" y="220" width="500" height="180" fill="#FFFFFF" filter="url(#shadow)" rx="8"/>
        <text x="125" y="250" font-size="16" fill="#4EA5D9" text-anchor="middle">transition counts N(s,s')</text>
        <text x="375" y="250" font-size="16" fill="#85B589" text-anchor="middle">normalized kernel T</text>
        <!-- Matrix 1 -->
        <g transform="translate(45, 270)">
            <rect x="-6" y="-8" width="172" height="128" rx="6" fill="none" stroke="#DDDDDD" stroke-width="1.5"/>
            <path d="M 10 0 L 0 0 L 0 110 L 10 110 M 150 0 L 160 0 L 160 110 L 150 110" stroke="#A0A0A0" stroke-width="2.5" fill="none"/>
            <text x="80" y="20" font-family="monospace" font-size="14" fill="#555" text-anchor="middle" textLength="142" lengthAdjust="spacingAndGlyphs">0.00 8.00 0.00 0.00</text>
            <text x="80" y="50" font-family="monospace" font-size="14" fill="#555" text-anchor="middle" textLength="142" lengthAdjust="spacingAndGlyphs">0.00 2.00 5.00 1.00</text>
            <text x="80" y="80" font-family="monospace" font-size="14" fill="#555" text-anchor="middle" textLength="142" lengthAdjust="spacingAndGlyphs">0.00 3.00 2.00 4.00</text>
            <text x="80" y="110" font-family="monospace" font-size="14" fill="#555" text-anchor="middle" textLength="142" lengthAdjust="spacingAndGlyphs">0.00 1.00 0.00 6.00</text>
        </g>
        <!-- Arrow -->
        <line x1="225" y1="320" x2="265" y2="320" stroke="#999" stroke-width="3" marker-end="url(#arrow-dark)"/>
        <!-- Matrix 2 -->
        <g transform="translate(295, 270)">
            <rect x="-6" y="-8" width="172" height="128" rx="6" fill="none" stroke="#DDDDDD" stroke-width="1.5"/>
            <path d="M 10 0 L 0 0 L 0 110 L 10 110 M 150 0 L 160 0 L 160 110 L 150 110" stroke="#A0A0A0" stroke-width="2.5" fill="none"/>
            <text x="80" y="20" font-family="monospace" font-size="14" fill="#333" text-anchor="middle" textLength="142" lengthAdjust="spacingAndGlyphs">0.00 1.00 0.00 0.00</text>
            <text x="80" y="50" font-family="monospace" font-size="14" fill="#333" text-anchor="middle" textLength="142" lengthAdjust="spacingAndGlyphs">0.00 0.25 0.62 0.13</text>
            <text x="80" y="80" font-family="monospace" font-size="14" fill="#333" text-anchor="middle" textLength="142" lengthAdjust="spacingAndGlyphs">0.00 0.33 0.22 0.45</text>
            <text x="80" y="110" font-family="monospace" font-size="14" fill="#333" text-anchor="middle" textLength="142" lengthAdjust="spacingAndGlyphs">0.00 0.14 0.00 0.86</text>
        </g>
        <text x="250" y="440" font-size="18" fill="#777" text-anchor="middle">Kernel shape is the compact behavioral signature used downstream.</text>
    </g>
    <!-- Bottom: Separability Distributions -->
    <g transform="translate(700, 600)">
        <text x="0" y="30" font-size="24" font-weight="bold" fill="#444">Separability into a Control Signal</text>
        <text x="0" y="75" font-family="Georgia, serif" font-style="italic" font-size="22" fill="#8C7A6B">Δ<tspan font-size="16" dy="5">H</tspan><tspan dy="-5"> = D</tspan><tspan font-size="16" dy="5">KL</tspan><tspan dy="-5">(T&#770;' || T&#772;</tspan><tspan font-size="16" dy="5">H</tspan><tspan dy="-5">)</tspan></text>
        <text x="0" y="115" font-family="Georgia, serif" font-style="italic" font-size="22" fill="#8C7A6B">Δ<tspan font-size="16" dy="5">A</tspan><tspan dy="-5"> = D</tspan><tspan font-size="16" dy="5">KL</tspan><tspan dy="-5">(T&#770;' || T&#772;</tspan><tspan font-size="16" dy="5">A</tspan><tspan dy="-5">)</tspan></text>
        <text x="0" y="155" font-family="Georgia, serif" font-style="italic" font-size="24" fill="#8C7A6B">g = Δ<tspan font-size="16" dy="5">H</tspan><tspan dy="-5"> - Δ</tspan><tspan font-size="16" dy="5">A</tspan></text>
        <!-- Curves -->
        <g transform="translate(80, 160)">
            <line x1="0" y1="200" x2="360" y2="200" stroke="#333" stroke-width="2"/>
            <text x="180" y="235" font-family="Georgia, serif" font-style="italic" font-size="22" text-anchor="middle">g = Δ<tspan font-size="16" dy="5">H</tspan><tspan dy="-5"> - Δ</tspan><tspan font-size="16" dy="5">A</tspan></text>
            <!-- Human Curve -->
            <path d="M 0 200 C 50 200, 80 40, 130 40 C 180 40, 210 200, 260 200" stroke="#4EA5D9" stroke-width="5" fill="none"/>
            <text x="70" y="110" font-size="22" fill="#4EA5D9" font-weight="bold">human</text>
            <!-- Agent Curve -->
            <path d="M 100 200 C 150 200, 180 40, 230 40 C 280 40, 310 200, 360 200" stroke="#E37862" stroke-width="5" fill="none"/>
            <text x="290" y="110" font-size="22" fill="#E37862" font-weight="bold">agent</text>
            <!-- Decision Boundary -->
            <line x1="180" y1="200" x2="180" y2="10" stroke="#999" stroke-width="2" stroke-dasharray="8,5"/>
            <text x="180" y="-5" font-size="16" fill="#777" text-anchor="middle">decision boundary</text>
            <circle cx="210" cy="200" r="6" fill="#ECA233"/>
            <text x="210" y="180" font-family="Georgia" font-style="italic" font-size="20" fill="#ECA233" text-anchor="middle">g_obs</text>
            <text x="180" y="280" font-size="18" fill="#555" text-anchor="middle">Positive gap shifts score toward agent traffic.</text>
        </g>
    </g>
    <!-- ========================================================= -->
    <!-- COLUMN 3: THE SOLUTION (CONTAMINATION & DR-RL)            -->
    <!-- ========================================================= -->
    <text x="1340" y="80" font-family="Georgia, serif" font-size="28" font-weight="bold" fill="#333333">3. Robust Control &amp; Contamination</text>
    <line x1="1340" y1="100" x2="1860" y2="100" stroke="#DDDDDD" stroke-width="2"/>
    <!-- Top: Contamination Generator -->
    <g transform="translate(1340, 130)">
        <text x="0" y="30" font-size="24" font-weight="bold" fill="#444">Contamination Generator G(α)</text>
        <!-- Boxes -->
        <rect x="20" y="70" width="200" height="50" fill="#D0E5E0" filter="url(#shadow)" rx="6"/>
        <text x="120" y="100" font-size="18" fill="#222" text-anchor="middle">labeled human sessions</text>
        <rect x="280" y="70" width="200" height="50" fill="#EAD0C8" filter="url(#shadow)" rx="6"/>
        <text x="380" y="100" font-size="18" fill="#222" text-anchor="middle">synthetic agent sessions</text>
        <!-- Arrows -->
        <line x1="120" y1="130" x2="200" y2="180" stroke="#888" stroke-width="3" marker-end="url(#arrow-dark)"/>
        <line x1="380" y1="130" x2="300" y2="180" stroke="#888" stroke-width="3" marker-end="url(#arrow-dark)"/>
        <!-- Mixed Batch -->
        <rect x="150" y="190" width="200" height="50" fill="#F4E9CD" filter="url(#shadow)" rx="6"/>
        <text x="250" y="220" font-size="18" fill="#222" text-anchor="middle">mixed batch for training</text>
        <!-- Alpha Bar -->
        <text x="250" y="275" font-family="Georgia, serif" font-size="20" fill="#555" text-anchor="middle">alpha = 0.33</text>
        <rect x="50" y="290" width="268" height="30" fill="#4EA5D9"/>
        <rect x="318" y="290" width="132" height="30" fill="#E37862"/>
        <text x="184" y="340" font-size="18" fill="#4EA5D9" text-anchor="middle">human share (1-α)</text>
        <text x="384" y="340" font-size="18" fill="#E37862" text-anchor="middle">agent share (α)</text>
    </g>
    <!-- Bottom: Distributionally Robust Control -->
    <g transform="translate(1340, 600)">
        <text x="0" y="30" font-size="24" font-weight="bold" fill="#444">Distributionally Robust Control Layer</text>
        <text x="0" y="80" font-family="Georgia, serif" font-style="italic" font-size="22" fill="#8C7A6B">
            π* = arg max<tspan font-size="16" dy="5">π</tspan> min<tspan font-size="16" dy="0">Q ∈ U<tspan font-size="12" dy="5">ε</tspan></tspan>
            <tspan dy="-10"> E</tspan><tspan font-size="16" dy="5">d ~ Q</tspan>
            <tspan dy="-5">[ R(p,d) - λ COI</tspan><tspan font-size="16" dy="5">leak</tspan><tspan dy="-5">(p,τ') ]</tspan>
        </text>
        <!-- Ambiguity Ball -->
        <g transform="translate(140, 260)">
            <line x1="-130" y1="0" x2="130" y2="0" stroke="#CCC" stroke-width="2"/>
            <line x1="0" y1="-130" x2="0" y2="130" stroke="#CCC" stroke-width="2"/>
            <circle cx="0" cy="0" r="110" stroke="#C4A45B" stroke-width="4" fill="rgba(196,164,91,0.06)"/>
            <text x="-95" y="-120" font-family="Georgia" font-style="italic" font-size="24" fill="#C4A45B">U<tspan font-size="16" dy="5">ε</tspan></text>
            <!-- Points -->
            <circle cx="0" cy="0" r="7" fill="#4EA5D9"/>
            <text x="12" y="24" font-family="Georgia" font-style="italic" font-size="22" fill="#4EA5D9">P&#770;<tspan font-size="14" dy="5">N</tspan></text>
            <circle cx="-60" cy="-40" r="7" fill="#E37862"/>
            <text x="-140" y="-50" font-family="Georgia" font-style="italic" font-size="18" fill="#E37862">worst-case Q*</text>
            <circle cx="50" cy="-70" r="6" fill="#85B589"/>
            <circle cx="70" cy="50" r="6" fill="#85B589"/>
            <circle cx="-40" cy="80" r="6" fill="#85B589"/>
        </g>
        <!-- Process Steps -->
        <g transform="translate(320, 140)">
            <rect x="0" y="0" width="220" height="45" fill="#FDEFEF" filter="url(#light-shadow)" rx="6"/>
            <text x="110" y="28" font-size="16" fill="#E37862" font-weight="bold" text-anchor="middle">inner min picks Q*</text>
            <line x1="110" y1="55" x2="110" y2="85" stroke="#999" stroke-width="2" marker-end="url(#arrow-dark)"/>
            <rect x="0" y="95" width="220" height="45" fill="#F4E9CD" filter="url(#light-shadow)" rx="6"/>
            <text x="110" y="123" font-size="16" fill="#9E8033" font-weight="bold" text-anchor="middle">sample demand from Q*</text>
            <line x1="110" y1="150" x2="110" y2="180" stroke="#999" stroke-width="2" marker-end="url(#arrow-dark)"/>
            <rect x="0" y="190" width="220" height="45" fill="#E6F2ED" filter="url(#light-shadow)" rx="6"/>
            <text x="110" y="218" font-size="16" fill="#428062" font-weight="bold" text-anchor="middle">outer max updates policy</text>
        </g>
        <text x="250" y="440" font-size="18" fill="#555" text-anchor="middle">Reward is evaluated on demand drawn from Q*, then used for the policy step.</text>
    </g>
 </svg>
--- a/docs/static/videos/BehaviorKernelConstructionScene.mp4
+++ b/docs/static/videos/BehaviorKernelConstructionScene.mp4
--- a/docs/static/videos/COIFirstPrinciplesScene.mp4
+++ b/docs/static/videos/COIFirstPrinciplesScene.mp4
--- a/docs/static/videos/COIOrderStatisticProofScene.mp4
+++ b/docs/static/videos/COIOrderStatisticProofScene.mp4
--- a/docs/static/videos/CardMarketAnalogyScene.mp4
+++ b/docs/static/videos/CardMarketAnalogyScene.mp4
--- a/docs/static/videos/ContaminationGeneratorScene.mp4
+++ b/docs/static/videos/ContaminationGeneratorScene.mp4
--- a/docs/static/videos/DefenseOpening.mp4
+++ b/docs/static/videos/DefenseOpening.mp4
--- a/docs/static/videos/ObjectiveAndResultsScene.mp4
+++ b/docs/static/videos/ObjectiveAndResultsScene.mp4
--- a/docs/static/videos/RobustControlScene.mp4
+++ b/docs/static/videos/RobustControlScene.mp4
--- a/docs/static/videos/SeparabilitySignalScene.mp4
+++ b/docs/static/videos/SeparabilitySignalScene.mp4
--- a/docs/static/videos/SystemLoopScene.mp4
+++ b/docs/static/videos/SystemLoopScene.mp4
--- a/docs/static/videos/TakeawayScene.mp4
+++ b/docs/static/videos/TakeawayScene.mp4
--- a/engine/backends/init.py
+++ b/engine/backends/init.py
@@ -0,0 +1 @@
 # Names advertised as the public surface of the backends package.
 # NOTE(review): this hunk adds only this line, so no import binding these names is
 # visible here — confirm callers import them from the submodules directly.
 __all__ = ["evaluate", "make_env", "train_qtable", "train_sb3"]
--- a/engine/backends/common.py
+++ b/engine/backends/common.py
@@ -0,0 +1,152 @@
 from __future__ import annotations
 from typing import Any, Mapping
 import numpy as np
 def make_env(cfg: Mapping[str, Any]):
    """Build the PHANTOM environment described by ``cfg``.

    Keys read with ``cfg[...]`` are mandatory (a missing one raises ``KeyError``);
    keys read with ``cfg.get`` fall back to the defaults written below. The raw
    env is wrapped in ``EconomicMetricsWrapper`` and then ``FlattenObservation``.
    """
    from gymnasium.wrappers import FlattenObservation
    from ..lib.wrappers import EconomicMetricsWrapper
    from ..wrapper import PHANTOM
    # Coerce every setting up front so the constructor only sees plain ints/floats.
    settings = {
        "n_products": int(cfg["n_products"]),
        "alpha": float(cfg["alpha"]),
        "N": int(cfg["N"]),
        "price_bounds": (float(cfg["price_low"]), float(cfg["price_high"])),
        "lambda_coi": float(cfg["lambda_coi"]),
        "robust_radius": float(cfg["robust_radius"]),
        "robust_points": int(cfg["robust_points"]),
        "robust_rollouts": int(cfg.get("robust_rollouts", 1)),
        "info_value": float(cfg["info_value"]),
        "eta_ux": float(cfg.get("eta_ux", 0.5)),
        "reward_profit_weight": float(cfg.get("reward_profit_weight", 1.0)),
        "action_levels": int(cfg["action_levels"]),
        "action_scale_low": float(cfg["action_scale_low"]),
        "action_scale_high": float(cfg["action_scale_high"]),
        "max_steps": int(cfg.get("max_steps", 100)),
        "margin_floor": float(cfg.get("margin_floor", 0.05)),
        "margin_floor_patience": int(cfg.get("margin_floor_patience", 5)),
        "render_mode": None,
    }
    core_env = PHANTOM(**settings)
    return FlattenObservation(EconomicMetricsWrapper(core_env))
 def _action(agent: Any, obs: Any, deterministic: bool = True):
    out = agent.predict(obs, deterministic=deterministic)
    action = out[0] if isinstance(out, tuple) else out
    if isinstance(action, np.ndarray) and action.size == 1:
        return int(action.reshape(-1)[0])
    return action
 def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]:
    """Roll out ``episodes`` deterministic episodes and summarise them.

    Reward and revenue are summed per episode; margin, COI level, COI leakage,
    volatility and agent probability are averaged over the steps of each
    episode. The returned ``eval/*`` values aggregate those per-episode numbers
    across episodes (0.0 when no episode was run).
    """
    per_step_keys = ("margin", "coi_level", "coi_leakage", "volatility", "agent_prob")
    episode_rewards: list[float] = []
    episode_revenues: list[float] = []
    episode_averages: dict[str, list[float]] = {key: [] for key in per_step_keys}
    for _ in range(int(episodes)):
        obs, _ = env.reset()
        reward_sum = 0.0
        revenue_sum = 0.0
        sums = dict.fromkeys(per_step_keys, 0.0)
        n_steps = 0
        finished = False
        while not finished:
            obs, reward, term, trunc, info = env.step(_action(agent, obs, True))
            finished = bool(term or trunc)
            econ = info.get("economics", {})
            reward_sum += float(reward)
            # Revenue and agent_prob fall back to the top-level info entry.
            revenue_sum += float(econ.get("revenue", info.get("revenue", 0.0)))
            sums["margin"] += float(econ.get("margin", 0.0))
            sums["coi_level"] += float(econ.get("coi_level", 0.0))
            sums["coi_leakage"] += float(econ.get("coi_leakage", 0.0))
            sums["volatility"] += float(econ.get("volatility", 0.0))
            sums["agent_prob"] += float(
                econ.get("agent_prob", info.get("agent_prob", 0.0))
            )
            n_steps += 1
        episode_rewards.append(reward_sum)
        episode_revenues.append(revenue_sum)
        divisor = max(n_steps, 1)
        for key in per_step_keys:
            episode_averages[key].append(sums[key] / divisor)

    def _mean(values: list[float]) -> float:
        return float(np.mean(values)) if values else 0.0

    def _std(values: list[float]) -> float:
        return float(np.std(values)) if values else 0.0

    return {
        "eval/reward_mean": _mean(episode_rewards),
        "eval/reward_std": _std(episode_rewards),
        "eval/revenue_mean": _mean(episode_revenues),
        "eval/revenue_std": _std(episode_revenues),
        "eval/margin_mean": _mean(episode_averages["margin"]),
        "eval/coi_level_mean": _mean(episode_averages["coi_level"]),
        "eval/coi_leakage_mean": _mean(episode_averages["coi_leakage"]),
        "eval/volatility_mean": _mean(episode_averages["volatility"]),
        "eval/agent_prob_mean": _mean(episode_averages["agent_prob"]),
    }
 def evaluate(
    agent: Any,
    env: Any,
    episodes: int,
    cfg: Mapping[str, Any] | None = None,
 ) -> dict[str, float]:
    """Evaluate ``agent`` on ``env`` and, optionally, under shifted contamination.

    Args:
        agent: Object exposing ``predict(obs, deterministic=...)``.
        env: Environment used for the base evaluation.
        episodes: Number of episodes for the base evaluation.
        cfg: Training config. When ``None`` or when ``robust_eval_enabled`` is
            falsy, only the base metrics are returned.

    Returns:
        The base ``eval/*`` metrics, plus (when the robust evaluation runs)
        worst-case reward/revenue/COI-leakage across three freshly built envs at
        ``alpha - r``, ``alpha`` and ``alpha + r`` (each clipped to ``[0, 1]``)
        and the per-shift reward/revenue/leakage means.
    """
    metrics = _evaluate_env(agent, env, episodes)
    if cfg is None or not bool(cfg.get("robust_eval_enabled", True)):
        return metrics
    nominal_alpha = float(cfg.get("alpha", 0.0))
    # The shift radius never drops below 0.15, whatever robust_radius says.
    eval_radius = max(float(cfg.get("robust_radius", 0.0)), 0.15)
    low_alpha = float(np.clip(nominal_alpha - eval_radius, 0.0, 1.0))
    high_alpha = float(np.clip(nominal_alpha + eval_radius, 0.0, 1.0))
    # Shifted runs use half the episode budget (rounded up, at least one).
    shifted_episodes = max(1, int(np.ceil(int(episodes) / 2)))
    shifted_rows = []
    for tag, alpha in (
        ("low", low_alpha),
        ("nominal", nominal_alpha),
        ("high", high_alpha),
    ):
        eval_cfg = dict(cfg)
        eval_cfg["alpha"] = float(alpha)
        shifted_env = make_env(eval_cfg)
        try:
            shifted_metrics = _evaluate_env(agent, shifted_env, shifted_episodes)
        finally:
            # Release the temporary env even when the rollout raises.
            shifted_env.close()
        shifted_rows.append((tag, alpha, shifted_metrics))
    metrics["eval/robust_alpha_low"] = low_alpha
    metrics["eval/robust_alpha_high"] = high_alpha
    metrics["eval/robust_reward_worst"] = float(
        min(row[2]["eval/reward_mean"] for row in shifted_rows)
    )
    metrics["eval/robust_revenue_worst"] = float(
        min(row[2]["eval/revenue_mean"] for row in shifted_rows)
    )
    # For leakage, "worst" means the largest value.
    metrics["eval/robust_coi_leakage_worst"] = float(
        max(row[2]["eval/coi_leakage_mean"] for row in shifted_rows)
    )
    for tag, alpha, shifted_metrics in shifted_rows:
        metrics[f"eval/{tag}_alpha"] = float(alpha)
        metrics[f"eval/{tag}_reward_mean"] = float(shifted_metrics["eval/reward_mean"])
        metrics[f"eval/{tag}_revenue_mean"] = float(
            shifted_metrics["eval/revenue_mean"]
        )
        metrics[f"eval/{tag}_coi_leakage_mean"] = float(
            shifted_metrics["eval/coi_leakage_mean"]
        )
    return metrics
--- a/engine/backends/qtable.py
+++ b/engine/backends/qtable.py
@@ -0,0 +1,131 @@
 from __future__ import annotations
 import logging
 import time
 from typing import Any, Mapping
 import numpy as np
 from .common import evaluate, make_env
 from ..telemetry.wandb import get_wandb_module
 logger = logging.getLogger(__name__)
 def _interval_event(
    sums: Mapping[str, float],
    count: int,
    epsilon: float,
    steps: int,
 ) -> dict[str, float | int]:
    """Average the running interval sums into one training-log event."""
    denom = float(count)
    return {
        "train/reward_mean": sums["reward"] / denom,
        "train/revenue_mean": sums["revenue"] / denom,
        "train/agent_prob": sums["agent_prob"] / denom,
        "train/alpha_adv": sums["alpha_adv"] / denom,
        "train/coi_leakage": sums["coi_leakage"] / denom,
        "train/epsilon": float(epsilon),
        "train/global_step": int(steps),
    }
 def train_qtable(
    cfg: Mapping[str, Any],
 ) -> tuple[object, dict[str, Any]]:
    """Train an ``EventQTable`` agent for ``cfg["total_timesteps"]`` steps.

    Every ``log_freq`` steps (default 100) the averages of the last interval are
    emitted as one event: logged straight to wandb when a run is active,
    otherwise collected and returned under ``metrics["_train_events"]``. A final
    partial interval is flushed the same way. Epsilon is multiplied by
    ``eps_decay`` after every step and floored at ``eps_end``.

    Returns:
        ``(agent, metrics)`` where ``metrics`` holds the whole-run training
        means, the ``evaluate`` results and the collected ``_train_events``.

    Raises:
        KeyError: if a mandatory config key is missing.
    """
    from ..lib.discrete import EventQTable
    np.random.seed(int(cfg["seed"]))
    env = make_env(cfg)
    eval_env = make_env(cfg)
    try:
        agent = EventQTable(
            env.action_space.n,
            int(cfg["n_products"]),
            (float(cfg["price_low"]), float(cfg["price_high"])),
            lr=float(cfg["q_lr"]),
            gamma=float(cfg["gamma"]),
            n_bins=int(cfg["q_bins"]),
        )
        total_reward = 0.0
        total_revenue = 0.0
        steps = 0
        epsilon = float(cfg["eps_start"])
        log_freq = max(1, int(cfg.get("log_freq", 100)))
        console_progress = bool(cfg.get("console_progress", False))
        obs, _ = env.reset(seed=int(cfg["seed"]))
        started_at = time.perf_counter()
        wandb = get_wandb_module()
        wandb_live = bool(wandb is not None and wandb.run is not None)
        step_offset = max(0, int(cfg.get("wandb_step_offset", 0)))
        interval_sums = {
            "reward": 0.0,
            "revenue": 0.0,
            "agent_prob": 0.0,
            "alpha_adv": 0.0,
            "coi_leakage": 0.0,
        }
        interval_count = 0
        train_events: list[dict[str, float | int]] = []
        for _ in range(int(cfg["total_timesteps"])):
            action, state = agent.act(obs, epsilon)
            nxt, reward, term, trunc, info = env.step(action)
            done = bool(term or trunc)
            agent.update(state, action, float(reward), agent.encode(nxt), done)
            total_reward += float(reward)
            revenue = float(info.get("economics", {}).get("revenue", 0.0))
            total_revenue += revenue
            steps += 1
            interval_sums["reward"] += float(reward)
            interval_sums["revenue"] += revenue
            # NOTE(review): these three are read from the top-level info dict,
            # while revenue comes from info["economics"] — confirm that is intended.
            interval_sums["agent_prob"] += float(info.get("agent_prob", 0.0))
            interval_sums["alpha_adv"] += float(info.get("alpha_adv", 0.0))
            interval_sums["coi_leakage"] += float(info.get("coi_leakage", 0.0))
            interval_count += 1
            if steps % log_freq == 0 and interval_count > 0:
                event = _interval_event(interval_sums, interval_count, epsilon, steps)
                if wandb_live:
                    wandb.log(dict(event), step=step_offset + int(steps))
                else:
                    train_events.append(event)
                if console_progress:
                    elapsed = max(time.perf_counter() - started_at, 1e-6)
                    speed = steps / elapsed
                    logger.info(
                        "step=%d/%d reward=%.3f revenue=%.3f eps=%.4f speed=%.1f steps/s",
                        steps,
                        int(cfg["total_timesteps"]),
                        event["train/reward_mean"],
                        event["train/revenue_mean"],
                        event["train/epsilon"],
                        speed,
                    )
                interval_sums = {key: 0.0 for key in interval_sums}
                interval_count = 0
            epsilon = max(float(cfg["eps_end"]), epsilon * float(cfg["eps_decay"]))
            obs = env.reset()[0] if done else nxt
        # Flush whatever is left when total_timesteps is not a multiple of log_freq.
        if interval_count > 0:
            tail_event = _interval_event(interval_sums, interval_count, epsilon, steps)
            if wandb_live:
                wandb.log(dict(tail_event), step=step_offset + int(steps))
            else:
                train_events.append(tail_event)
        metrics: dict[str, Any] = {
            "train/reward_mean": total_reward / max(steps, 1),
            "train/revenue_mean": total_revenue / max(steps, 1),
            "train/epsilon": float(epsilon),
            "train/global_step": int(cfg["total_timesteps"]),
        }
        metrics.update(evaluate(agent, eval_env, int(cfg["eval_episodes"]), cfg=cfg))
        metrics["_train_events"] = train_events
    finally:
        # Close both envs even if training or evaluation raised.
        env.close()
        eval_env.close()
    return agent, metrics
--- a/engine/backends/sb3.py
+++ b/engine/backends/sb3.py
@@ -0,0 +1,188 @@
 from __future__ import annotations
 import json
 from pathlib import Path
 from typing import Any, Mapping
 from ..lib.callbacks import MetricsCallback
 from .common import evaluate, make_env
 def _net_arch(name: Any) -> list[int]:
    presets = {
        "tiny": [32, 32],
        "small": [64, 64],
        "medium": [128, 128],
        "large": [256, 256],
    }
    if isinstance(name, (list, tuple)):
        return [int(v) for v in name]
    raw = str(name).lower().strip()
    if raw in presets:
        return presets[raw]
    if "x" in raw:
        try:
            parsed = [int(v) for v in raw.split("x") if v]
            return parsed if parsed else presets["small"]
        except ValueError:
            return presets["small"]
    return presets["small"]
 def _activation(name: Any):
    try:
        import torch.nn as nn
    except ImportError:
        return None
    return {
        "relu": nn.ReLU,
        "tanh": nn.Tanh,
        "elu": nn.ELU,
        "leaky_relu": nn.LeakyReLU,
    }.get(str(name).lower().strip(), nn.ReLU)
 def _policy_kwargs(cfg: Mapping[str, Any]) -> dict[str, Any]:
    """Assemble ``policy_kwargs`` from ``cfg["arch"]`` and ``cfg["activation"]``.

    ``activation_fn`` is only included when ``_activation`` returns a class.
    """
    layers = _net_arch(cfg.get("arch", "small"))
    act_cls = _activation(cfg.get("activation", "relu"))
    if act_cls is None:
        return {"net_arch": layers}
    return {"net_arch": layers, "activation_fn": act_cls}
 def build_model(cfg: Mapping[str, Any], env: Any):
    """Instantiate the Stable-Baselines3 model named by ``cfg["algo"]``.

    Supports ``ppo``, ``a2c`` and ``dqn`` with an ``MlpPolicy``.

    Raises:
        ImportError: if stable-baselines3 is not installed.
        ValueError: for ``sac`` or any other unsupported algorithm name.
        KeyError: if a hyperparameter required by the chosen algorithm is missing.
    """
    try:
        from stable_baselines3 import A2C, DQN, PPO
    except ImportError as exc:
        raise ImportError("stable-baselines3 is required for SB3 algorithms") from exc
    algo = str(cfg["algo"])
    # Constructor arguments that every supported algorithm receives unchanged.
    shared = {
        "policy_kwargs": _policy_kwargs(cfg),
        "device": str(cfg.get("device", "auto")),
        "seed": int(cfg["seed"]),
        "verbose": 1,
    }
    if algo == "sac":
        raise ValueError("sac is not supported with the discrete core env")
    if algo == "ppo":
        return PPO(
            "MlpPolicy",
            env,
            learning_rate=float(cfg["learning_rate"]),
            n_steps=int(cfg["n_steps"]),
            batch_size=int(cfg["batch_size"]),
            n_epochs=int(cfg["n_epochs"]),
            gamma=float(cfg["gamma"]),
            gae_lambda=float(cfg["gae_lambda"]),
            clip_range=float(cfg["clip_range"]),
            ent_coef=float(cfg["ent_coef"]),
            **shared,
        )
    if algo == "a2c":
        return A2C(
            "MlpPolicy",
            env,
            learning_rate=float(cfg["learning_rate"]),
            # A2C uses a 32x shorter rollout than the configured n_steps, minimum 5.
            n_steps=max(5, int(cfg["n_steps"]) // 32),
            gamma=float(cfg["gamma"]),
            gae_lambda=float(cfg["gae_lambda"]),
            ent_coef=float(cfg["ent_coef"]),
            **shared,
        )
    if algo == "dqn":
        return DQN(
            "MlpPolicy",
            env,
            learning_rate=float(cfg["learning_rate"]),
            buffer_size=int(cfg["buffer_size"]),
            batch_size=int(cfg["batch_size"]),
            gamma=float(cfg["gamma"]),
            train_freq=int(cfg["train_freq"]),
            learning_starts=int(cfg["learning_starts"]),
            target_update_interval=int(cfg["target_update_interval"]),
            exploration_fraction=float(cfg["exploration_fraction"]),
            exploration_final_eps=float(cfg["exploration_final_eps"]),
            **shared,
        )
    raise ValueError(f"unsupported algo '{algo}'")
 def train_sb3(cfg: Mapping[str, Any]) -> tuple[object, dict[str, Any]]:
    """Train, save and evaluate a Stable-Baselines3 model described by ``cfg``.

    The model is trained for ``total_timesteps`` minus any steps it already
    reports, saved under ``model_dir`` as ``phantom_<algo>``, then evaluated
    with ``evaluate``.

    Returns:
        ``(model, metrics)``; ``metrics`` holds the evaluation results,
        ``train/global_step``, ``model/path`` and the callback's ``_train_events``.

    Raises:
        ImportError: if stable-baselines3 is not installed.
        KeyError: if a mandatory config key is missing.
    """
    try:
        from stable_baselines3.common.callbacks import EvalCallback
        from stable_baselines3.common.monitor import Monitor
    except ImportError as exc:
        raise ImportError("stable-baselines3 is required for SB3 models") from exc
    env = Monitor(make_env(cfg))
    eval_env = Monitor(make_env(cfg))
    try:
        model = build_model(cfg, env)
        try:
            import torch
            print(
                "PHANTOM_DEVICE: "
                + json.dumps(
                    {
                        "requested": str(cfg.get("device", "auto")),
                        "torch_cuda_available": bool(torch.cuda.is_available()),
                        "torch_device_count": int(torch.cuda.device_count()),
                        "sb3_device": str(getattr(model, "device", "unknown")),
                    }
                )
            )
        except Exception:
            # Deliberate best-effort: the device banner is diagnostic only and
            # must never abort a training run.
            pass
        metrics_callback = MetricsCallback(
            log_histograms=False,
            log_freq=int(cfg["log_freq"]),
            step_offset=int(cfg.get("wandb_step_offset", 0)),
        )
        callbacks = [metrics_callback]
        callbacks.append(
            EvalCallback(
                eval_env,
                eval_freq=int(cfg["eval_freq"]),
                n_eval_episodes=int(cfg["eval_episodes"]),
                deterministic=True,
                verbose=0,
            )
        )
        target_steps = int(cfg["total_timesteps"])
        # Only train for the steps the model has not already taken.
        remaining_steps = max(0, target_steps - int(getattr(model, "num_timesteps", 0)))
        if remaining_steps > 0:
            model.learn(
                total_timesteps=remaining_steps,
                callback=callbacks,
                reset_num_timesteps=False,
            )
        model_dir = Path(str(cfg["model_dir"]))
        model_dir.mkdir(parents=True, exist_ok=True)
        model_path = model_dir / f"phantom_{cfg['algo']}"
        model.save(str(model_path))
        metrics: dict[str, Any] = evaluate(
            model,
            eval_env,
            int(cfg["eval_episodes"]),
            cfg=cfg,
        )
        metrics["train/global_step"] = int(model.num_timesteps)
        # Reported with a ".zip" suffix; presumably what model.save writes — confirm.
        metrics["model/path"] = str(model_path.with_suffix(".zip"))
        metrics["_train_events"] = list(metrics_callback.events)
    finally:
        # Close both envs even if building, training, saving or evaluating raised.
        env.close()
        eval_env.close()
    return model, metrics
--- a/engine/benchmark.py
+++ b/engine/benchmark.py
@@ -0,0 +1,625 @@
 from __future__ import annotations
 import argparse
 import json
 import logging
 import os
 from datetime import datetime, UTC
 from pathlib import Path
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 from .lib.tiers import LinearElasticityPolicy, StaticPolicy, SurgePolicy
 from .logging_utils import configure_logging
 from .spec import TrainSpec
 from .telemetry.wandb import get_wandb_module
 # Optional Weights & Biases handle; HAS_WANDB is False when get_wandb_module()
 # returns None, and every wandb call in this module is guarded by it.
 wandb = get_wandb_module()
 HAS_WANDB = wandb is not None
 logger = logging.getLogger(__name__)
 def _log(message: str) -> None:
    """Emit ``message`` at INFO level through this module's logger."""
    logger.info(message)
 def _parse_list(raw: str) -> list[str]:
    return [x.strip().lower() for x in str(raw).split(",") if x.strip()]
 def _parse_float_list(raw: str) -> list[float]:
    return [float(x.strip()) for x in str(raw).split(",") if x.strip()]
 def _truthy(value: str | bool | None) -> bool:
    if isinstance(value, bool):
        return value
    if value is None:
        return False
    return str(value).strip().lower() in {"1", "true", "yes", "on"}
 def _action(policy, obs: np.ndarray):
    out = policy.predict(obs, deterministic=True)
    action = out[0] if isinstance(out, tuple) else out
    if isinstance(action, np.ndarray) and action.size == 1:
        return int(action.reshape(-1)[0])
    return int(action)
 def _run_eval_episode(env, policy) -> dict:
    """Play one episode with ``policy`` and return its summary.

    Returns a dict with the summed ``reward`` and ``revenue``, the per-step
    averages ``mean_margin`` and ``mean_coi``, and ``price_trace`` — the mean of
    ``info["prices"]`` for every step that reported prices.
    """
    totals = {"reward": 0.0, "revenue": 0.0, "margin": 0.0, "coi": 0.0}
    mean_prices: list[float] = []
    n_steps = 0
    obs, _ = env.reset()
    finished = False
    while not finished:
        obs, reward, term, trunc, info = env.step(_action(policy, obs))
        finished = bool(term or trunc)
        econ = info.get("economics", {})
        totals["reward"] += float(reward)
        totals["revenue"] += float(econ.get("revenue", 0.0))
        totals["margin"] += float(econ.get("margin", 0.0))
        totals["coi"] += float(econ.get("coi_level", 0.0))
        step_prices = np.asarray(info.get("prices", []), dtype=np.float32)
        if step_prices.size:
            mean_prices.append(float(np.mean(step_prices)))
        n_steps += 1
    # Guard the averages against a zero-length episode.
    divisor = max(n_steps, 1)
    return {
        "reward": totals["reward"],
        "revenue": totals["revenue"],
        "mean_margin": totals["margin"] / divisor,
        "mean_coi": totals["coi"] / divisor,
        "price_trace": mean_prices,
    }
 def _build_tier(name: str, cfg: dict, alpha: float, *, step_offset: int = 0):
    """Build (and, where applicable, train) the policy for one benchmark tier.

    Args:
        name: Tier name; ``static``, ``surge``, ``linear``, ``qtable``, ``ppo``,
            ``a2c`` or ``dqn`` (case-insensitive, surrounding spaces ignored).
        cfg: Base config; it is copied, never mutated.
        alpha: Contamination level written into the copied config.
        step_offset: Stored as ``wandb_step_offset`` in the copied config.

    Returns:
        ``(policy, train_events)``. The rule-based tiers return an empty event
        list; the learned tiers return the trainer's ``_train_events`` when
        that is a list.

    Raises:
        ValueError: for an unknown tier name.
    """
    from .backends.common import make_env
    tier = name.lower().strip()
    run_cfg = dict(cfg)
    run_cfg["alpha"] = float(alpha)
    run_cfg["wandb_step_offset"] = int(step_offset)
    if tier == "static":
        return StaticPolicy(int(run_cfg["action_levels"])), []
    if tier == "surge":
        return (
            SurgePolicy(
                n_actions=int(run_cfg["action_levels"]),
                n_products=int(run_cfg["n_products"]),
            ),
            [],
        )
    if tier == "linear":
        warmup_env = make_env(run_cfg)
        try:
            policy = LinearElasticityPolicy(
                n_actions=int(run_cfg["action_levels"]),
                n_products=int(run_cfg["n_products"]),
                price_low=float(run_cfg["price_low"]),
                price_high=float(run_cfg["price_high"]),
            )
            policy.fit(
                warmup_env,
                warmup_steps=int(run_cfg.get("linear_warmup_steps", 800)),
                seed=int(run_cfg["seed"]),
            )
        finally:
            # The warm-up env is only needed for fitting; release it even on failure.
            warmup_env.close()
        return policy, []
    if tier == "qtable":
        from .backends.qtable import train_qtable
        # Benchmark runs always print Q-table progress, whatever cfg says.
        run_cfg["console_progress"] = True
        agent, metrics = train_qtable(run_cfg)
        events = metrics.get("_train_events", [])
        return agent, events if isinstance(events, list) else []
    if tier in {"ppo", "a2c", "dqn"}:
        from .backends.sb3 import train_sb3
        run_cfg["algo"] = tier
        agent, metrics = train_sb3(run_cfg)
        events = metrics.get("_train_events", [])
        return agent, events if isinstance(events, list) else []
    raise ValueError(f"unsupported tier '{name}'")
 def _log_train_events(
    events: list[dict],
    *,
    tier_name: str,
    mode_label: str,
    alpha: float,
    step_offset: int,
 ) -> int:
    """Replay buffered training events into the active wandb run.

    Events are logged in ascending ``train/global_step`` order at
    ``step_offset + max(1, global_step)``, each tagged with the benchmark
    metadata. Returns the next free step cursor, or ``step_offset`` unchanged
    when there is no active run or nothing to log.
    """
    base = int(step_offset)
    wandb_active = HAS_WANDB and wandb.run is not None
    if not wandb_active or not events:
        return base

    def _global_step(evt: dict) -> int:
        return int(evt.get("train/global_step", 0))

    ordered = sorted(
        (evt for evt in events if isinstance(evt, dict)),
        key=_global_step,
    )
    if not ordered:
        return base
    tags = {
        "run.kind": "benchmark",
        "runtime/backend": tier_name,
        "study/mode": mode_label,
        "study/no_robust": float(mode_label == "no_robust"),
        "study/alpha": float(alpha),
    }
    furthest = 0
    for evt in ordered:
        # Steps below 1 are clamped so every event lands after the offset.
        rel_step = max(1, _global_step(evt))
        wandb.log({**evt, **tags}, step=base + rel_step)
        furthest = max(furthest, rel_step)
    return base + furthest + 1
 def run_benchmark(
    cfg: dict,
    tiers: list[str],
    alpha_values: list[float],
    n_episodes: int,
    mode_label: str,
    step_cursor_start: int = 0,
 ):
    """Train and evaluate every tier at every contamination level.

    For each ``(alpha, tier)`` pair the tier is built via ``_build_tier``, its
    buffered training events are replayed to wandb, and ``n_episodes``
    evaluation episodes are run on a fresh env.

    Returns:
        ``(rows_df, traces, next_step_cursor)``: a DataFrame with one summary
        row per pair, a list of per-pair mean price traces (each carrying
        ``tier``, ``mode`` and ``alpha``), and the wandb step cursor to use next.
    """
    from .backends.common import make_env
    rows: list[dict] = []
    traces: list[dict] = []
    total_runs = max(1, len(alpha_values) * len(tiers))
    run_index = 0
    wandb_step_cursor = int(step_cursor_start)
    for alpha in alpha_values:
        for tier_name in tiers:
            run_index += 1
            _log(
                f"[{run_index}/{total_runs}] alpha={float(alpha):.2f} tier={tier_name}: training"
            )
            policy, train_events = _build_tier(
                tier_name,
                cfg,
                alpha,
                step_offset=wandb_step_cursor,
            )
            prev_cursor = int(wandb_step_cursor)
            wandb_step_cursor = _log_train_events(
                train_events,
                tier_name=tier_name,
                mode_label=mode_label,
                alpha=float(alpha),
                step_offset=wandb_step_cursor,
            )
            # Learned tiers may have logged live during training (no buffered
            # events); still advance the cursor past their training steps.
            if wandb_step_cursor == prev_cursor and tier_name in {
                "qtable",
                "ppo",
                "a2c",
                "dqn",
            }:
                wandb_step_cursor += max(1, int(cfg.get("total_timesteps", 1))) + 1
            env = make_env({**cfg, "alpha": float(alpha)})
            try:
                eps = [_run_eval_episode(env, policy) for _ in range(int(n_episodes))]
            finally:
                # Release the eval env even when a rollout raises.
                env.close()
            row = {
                "tier": tier_name,
                "mode": mode_label,
                "alpha": float(alpha),
                "episodes": int(n_episodes),
                "mean_reward": float(np.mean([e["reward"] for e in eps])),
                "mean_revenue": float(np.mean([e["revenue"] for e in eps])),
                "mean_margin": float(np.mean([e["mean_margin"] for e in eps])),
                "mean_coi": float(np.mean([e["mean_coi"] for e in eps])),
                "std_revenue": float(np.std([e["revenue"] for e in eps])),
            }
            row["objective_score"] = row["mean_reward"]
            rows.append(row)
            _log(
                f"[{run_index}/{total_runs}] alpha={float(alpha):.2f} tier={tier_name}: "
                f"reward={row['mean_reward']:.3f} revenue={row['mean_revenue']:.3f} "
                f"coi={row['mean_coi']:.4f} score={row['objective_score']:.3f}"
            )
            # Average the price at each step over the episodes that reached it.
            max_len = max((len(e["price_trace"]) for e in eps), default=0)
            step_means = []
            for step in range(max_len):
                vals = [
                    e["price_trace"][step] for e in eps if step < len(e["price_trace"])
                ]
                step_means.append(float(np.mean(vals)) if vals else np.nan)
            traces.append(
                {
                    "tier": tier_name,
                    # Carried so the price-trace plot can label lines per mode,
                    # matching the "mode" column already present in the rows.
                    "mode": mode_label,
                    "alpha": float(alpha),
                    "mean_price_trace": step_means,
                }
            )
            if HAS_WANDB and wandb.run is not None:
                wandb.log(
                    {
                        "run.kind": "benchmark",
                        "runtime/backend": tier_name,
                        "study/mode": mode_label,
                        "study/no_robust": float(mode_label == "no_robust"),
                        "study/alpha": float(alpha),
                        "eval/reward_mean": row["mean_reward"],
                        "eval/revenue_mean": row["mean_revenue"],
                        "eval/margin_mean": row["mean_margin"],
                        "eval/coi_level_mean": row["mean_coi"],
                        "objective/score": row["objective_score"],
                        "objective/coi_preserved": row["mean_coi"],
                    },
                    step=wandb_step_cursor,
                )
                wandb_step_cursor += 1
    return pd.DataFrame(rows), traces, int(wandb_step_cursor)
 def _plot_outputs(df: pd.DataFrame, traces: list[dict], out_dir: Path, stamp: str):
    """Render the three benchmark figures and return their paths.

    Writes revenue-vs-alpha, COI-vs-alpha and a price-trace plot (for the
    smallest alpha present in ``df``) as PNG files in ``out_dir``.

    Returns:
        ``(revenue_path, coi_path, price_trace_path)``.
    """

    def _metric_vs_alpha(
        column: str, marker: str, ylabel: str, title: str, filename: str
    ) -> Path:
        # One line per (tier, mode) pair when a mode column exists, else per tier.
        fig = plt.figure(figsize=(11, 4.5))
        if "mode" in df.columns:
            pairs = sorted(df[["tier", "mode"]].drop_duplicates().values.tolist())
            for tier, mode in pairs:
                selected = (df["tier"] == tier) & (df["mode"] == mode)
                curve = df[selected].sort_values("alpha")
                plt.plot(
                    curve["alpha"],
                    curve[column],
                    marker=marker,
                    label=f"{tier}:{mode}",
                )
        else:
            for tier in sorted(df["tier"].unique()):
                curve = df[df["tier"] == tier].sort_values("alpha")
                plt.plot(curve["alpha"], curve[column], marker=marker, label=tier)
        plt.xlabel("contamination alpha")
        plt.ylabel(ylabel)
        plt.title(title)
        plt.grid(alpha=0.3)
        plt.legend()
        fig.tight_layout()
        target = out_dir / filename
        fig.savefig(target, dpi=220)
        plt.close(fig)
        return target

    rev_path = _metric_vs_alpha(
        "mean_revenue",
        "o",
        "mean episode revenue",
        "Revenue under contamination",
        f"benchmark_revenue_{stamp}.png",
    )
    coi_path = _metric_vs_alpha(
        "mean_coi",
        "s",
        "mean COI level",
        "COI preservation",
        f"benchmark_coi_{stamp}.png",
    )
    # The price-trace figure only shows runs at the lowest alpha in the table.
    focus_alpha = 0.0 if df.empty else float(df["alpha"].min())
    trace_fig = plt.figure(figsize=(11, 4.5))
    for item in traces:
        if not abs(float(item["alpha"]) - focus_alpha) < 1e-9:
            continue
        series = item["mean_price_trace"]
        mode = item.get("mode")
        label = str(item["tier"]) if mode is None else f"{item['tier']}:{mode}"
        plt.plot(
            np.arange(len(series)),
            np.asarray(series, dtype=np.float32),
            label=label,
        )
    plt.xlabel("step")
    plt.ylabel("mean price")
    plt.title(f"Price evolution (alpha={focus_alpha:.2f})")
    plt.grid(alpha=0.3)
    plt.legend()
    trace_fig.tight_layout()
    price_path = out_dir / f"benchmark_price_trace_{stamp}.png"
    trace_fig.savefig(price_path, dpi=220)
    plt.close(trace_fig)
    return rev_path, coi_path, price_path
 def _run_with_args(args, compare_robust_override: bool | None = None):
    """Run the benchmark for each requested robustness mode and persist artifacts.

    Args:
        args: Namespace carrying the CLI options read below (tiers,
            alpha_values, episodes, output_dir and the training overrides).
        compare_robust_override: When not ``None`` it decides whether both
            robustness modes are run; otherwise the
            ``PHANTOM_BENCHMARK_COMPARE_ROBUST`` environment variable decides.

    Writes a CSV, a JSON trace dump and three PNG plots into
    ``args.output_dir`` and logs their locations. Returns ``None``.
    """
    if compare_robust_override is None:
        compare_robust = _truthy(os.environ.get("PHANTOM_BENCHMARK_COMPARE_ROBUST"))
    else:
        compare_robust = bool(compare_robust_override)
    if compare_robust:
        robust_modes = [False, True]
    else:
        robust_modes = [bool(args.no_robust)]
    # Every override key equals the attribute name on ``args``.
    override_fields = (
        "seed",
        "total_timesteps",
        "n_products",
        "N",
        "lambda_coi",
        "robust_radius",
        "robust_points",
        "robust_rollouts",
        "eta_ux",
        "reward_profit_weight",
        "price_low",
        "price_high",
        "action_levels",
        "action_scale_low",
        "action_scale_high",
        "max_steps",
        "learning_rate",
        "batch_size",
        "n_steps",
        "linear_warmup_steps",
        "device",
    )
    base_overrides = {field: getattr(args, field) for field in override_fields}
    tiers = _parse_list(args.tiers)
    alpha_values = _parse_float_list(args.alpha_values)
    run_summary = {
        "tiers": tiers,
        "alpha_values": alpha_values,
        "episodes": int(args.episodes),
        "total_timesteps": int(args.total_timesteps),
        "device": str(args.device),
    }
    _log("starting run " + json.dumps(run_summary))
    frames: list[pd.DataFrame] = []
    traces: list[dict] = []
    wandb_step_cursor = 0
    for no_robust in robust_modes:
        mode_overrides = {**base_overrides, "no_robust": bool(no_robust)}
        # Unset (None) options are dropped before the spec is built.
        explicit = {
            name: value for name, value in mode_overrides.items() if value is not None
        }
        cfg = TrainSpec.from_flat(explicit).to_flat_dict()
        cfg["linear_warmup_steps"] = int(args.linear_warmup_steps)
        mode_label = "no_robust" if no_robust else "robust"
        _log(f"mode={mode_label}: begin")
        # The step cursor is threaded through so successive modes keep counting up.
        df_mode, traces_mode, wandb_step_cursor = run_benchmark(
            cfg,
            tiers,
            alpha_values,
            args.episodes,
            mode_label=mode_label,
            step_cursor_start=wandb_step_cursor,
        )
        _log(f"mode={mode_label}: complete ({len(df_mode)} rows)")
        for trace in traces_mode:
            trace["mode"] = mode_label
        frames.append(df_mode)
        traces.extend(traces_mode)
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame()
    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
    csv_path = out_dir / f"benchmark_{stamp}.csv"
    trace_path = out_dir / f"benchmark_traces_{stamp}.json"
    df.to_csv(csv_path, index=False)
    trace_path.write_text(json.dumps(traces, indent=2))
    rev_path, coi_path, price_path = _plot_outputs(df, traces, out_dir, stamp)
    _log(f"artifacts written in {out_dir}")
    if not df.empty:
        # Report the row with the highest objective score.
        best = df.iloc[int(df["objective_score"].idxmax())]
        best_summary = {
            "tier": best["tier"],
            "mode": best.get("mode", "robust"),
            "alpha": float(best["alpha"]),
            "objective_score": float(best["objective_score"]),
            "mean_revenue": float(best["mean_revenue"]),
            "mean_coi": float(best["mean_coi"]),
        }
        _log("BEST_TIER=" + json.dumps(best_summary))
    _log(f"BENCHMARK_CSV={csv_path}")
    _log(f"BENCHMARK_TRACES={trace_path}")
    _log(f"BENCHMARK_PLOT_REVENUE={rev_path}")
    _log(f"BENCHMARK_PLOT_COI={coi_path}")
    _log(f"BENCHMARK_PLOT_PRICE={price_path}")
 def run_cli(raw_args: list[str] | None = None):
    """Parse benchmark CLI options and dispatch to one of three execution paths.

    * ``--sweep-agent``: attach to an existing W&B sweep and run the benchmark
      once per sweep trial, copying swept values onto the parsed arguments.
    * ``--no-wandb`` (or wandb not importable): run once without W&B.
    * otherwise: start one W&B run per (tier, mode, alpha) combination.

    Args:
        raw_args: Argument list handed to ``argparse``; ``None`` lets argparse
            read the process arguments.

    Raises:
        ValueError: ``--sweep-agent`` was requested without wandb or without
            ``--sweep-id``.
    """
    configure_logging()
    parser = argparse.ArgumentParser(description="PHANTOM benchmark orchestrator")
    # Run selection and output.
    parser.add_argument("--project", default="capstone")
    parser.add_argument("--tiers", default="static,surge,linear,qtable,ppo")
    parser.add_argument("--alpha-values", default="0.0,0.3,0.6")
    parser.add_argument("--episodes", type=int, default=10)
    parser.add_argument("--output-dir", default="engine/studies/results")
    # Training / environment overrides.
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--total-timesteps", type=int, default=25_000)
    parser.add_argument("--n-products", type=int, default=10)
    parser.add_argument("--N", type=int, default=100)
    parser.add_argument("--lambda-coi", type=float, default=0.2)
    parser.add_argument("--robust-radius", type=float, default=0.15)
    parser.add_argument("--robust-points", type=int, default=5)
    parser.add_argument("--robust-rollouts", type=int, default=1)
    parser.add_argument("--eta-ux", type=float, default=0.5)
    parser.add_argument("--reward-profit-weight", type=float, default=1.0)
    parser.add_argument("--price-low", type=float, default=10.0)
    parser.add_argument("--price-high", type=float, default=150.0)
    parser.add_argument("--action-levels", type=int, default=9)
    parser.add_argument("--action-scale-low", type=float, default=0.8)
    parser.add_argument("--action-scale-high", type=float, default=1.2)
    parser.add_argument("--max-steps", type=int, default=100)
    parser.add_argument("--learning-rate", type=float, default=3e-4)
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--n-steps", type=int, default=2048)
    parser.add_argument("--linear-warmup-steps", type=int, default=800)
    parser.add_argument("--device", type=str, default="auto")
    # Mode switches.
    parser.add_argument("--no-robust", action="store_true")
    parser.add_argument("--no-wandb", action="store_true")
    parser.add_argument("--offline", action="store_true")
    parser.add_argument("--sweep-agent", action="store_true")
    parser.add_argument("--sweep-id", type=str)
    parser.add_argument("--count", type=int, default=0)
    args = parser.parse_args(raw_args)
    wandb_mode = "offline" if args.offline else "online"
    wandb_disabled = args.no_wandb or not HAS_WANDB
    if args.sweep_agent:
        if wandb_disabled:
            raise ValueError("sweep agent requires wandb")
        if not args.sweep_id:
            raise ValueError("--sweep-id is required with --sweep-agent")
        # Sweep config keys that map onto an identically named attribute of ``args``.
        sweep_keys = (
            "tiers",
            "alpha_values",
            "episodes",
            "total_timesteps",
            "lambda_coi",
            "robust_radius",
            "robust_points",
            "robust_rollouts",
            "eta_ux",
            "reward_profit_weight",
            "learning_rate",
            "batch_size",
            "n_steps",
            "no_robust",
            "device",
        )
        def _sweep_run():
            run = wandb.init(mode=wandb_mode)
            try:
                for key in sweep_keys:
                    if key in wandb.config:
                        setattr(args, key, wandb.config[key])
                _run_with_args(args)
            finally:
                if run is not None:
                    wandb.finish()
        # A non-positive --count means "no limit" for the agent.
        agent_count = args.count if args.count > 0 else None
        wandb.agent(args.sweep_id, function=_sweep_run, count=agent_count)
        return
    if wandb_disabled:
        _run_with_args(args)
        return
    tiers = _parse_list(args.tiers)
    alpha_values = _parse_float_list(args.alpha_values)
    run_stamp = datetime.now(UTC).strftime("%m%d-%H%M%S")
    compare_enabled = _truthy(os.environ.get("PHANTOM_BENCHMARK_COMPARE_ROBUST"))
    if compare_enabled:
        compare_tag = "robust-compare"
        modes = [("no_robust", True), ("robust", False)]
    else:
        compare_tag = "single-mode"
        single_no_robust = bool(args.no_robust)
        modes = [("no_robust" if single_no_robust else "robust", single_no_robust)]
    # One W&B run per (tier, mode, alpha), numbered from 1 in iteration order.
    combos = [
        (tier, mode_label, no_robust, alpha)
        for tier in tiers
        for mode_label, no_robust in modes
        for alpha in alpha_values
    ]
    for run_idx, (tier, mode_label, no_robust, alpha) in enumerate(combos, start=1):
        # e.g. 0.30 -> "0p3", 0.00 -> "0"
        alpha_token = f"{float(alpha):.2f}".rstrip("0").rstrip(".").replace(".", "p")
        alpha_text = str(float(alpha))
        tier_args = argparse.Namespace(**vars(args))
        tier_args.tiers = tier
        tier_args.alpha_values = alpha_text
        tier_args.no_robust = bool(no_robust)
        run_name = f"benchmark-{tier}-{mode_label}-a{alpha_token}-{run_stamp}-{run_idx}"
        run_tags = [
            "benchmark",
            compare_tag,
            f"backend:{tier}",
            f"mode:{mode_label}",
            f"alpha:{alpha_token}",
        ]
        run_config = {
            "run.kind": "benchmark",
            "runtime/backend": tier,
            "study/mode": mode_label,
            "study/no_robust": float(no_robust),
            "study/alpha": float(alpha),
            "tiers": tier,
            "alpha_values": alpha_text,
            "episodes": args.episodes,
            "total_timesteps": args.total_timesteps,
            "lambda_coi": args.lambda_coi,
            "robust_radius": args.robust_radius,
            "robust_points": args.robust_points,
            "robust_rollouts": args.robust_rollouts,
            "eta_ux": args.eta_ux,
            "reward_profit_weight": args.reward_profit_weight,
            "learning_rate": args.learning_rate,
            "device": args.device,
        }
        run = wandb.init(
            project=args.project,
            name=run_name,
            tags=run_tags,
            config=run_config,
            mode=wandb_mode,
        )
        try:
            # The mode is already fixed per run, so the env-driven comparison is disabled.
            _run_with_args(tier_args, compare_robust_override=False)
        finally:
            if run is not None:
                wandb.finish()
 # Script entry point: run the CLI with no explicit argument list.
 if __name__ == "__main__":
    run_cli()
--- a/engine/engine.py
+++ b/engine/engine.py
@@ -1,7 +1,7 @@
 from sys import platform
 import numpy as np
 from .lib.demand import generate_demand_for_actor, estimate_demand
-from .lib.behavior import sample_behavior
+from .lib.behavior import get_adjusted_transitions, sample_behavior_from_transitions
 from logging import INFO, getLogger
 logger = getLogger(__name__)
@@ -46,9 +46,17 @@ class MarketEngine:
            self.noise_std,
            distribution_method=self.demand_dist,
        )
        human_transitions = get_adjusted_transitions(demand_h, human=True)
        agent_transitions = get_adjusted_transitions(demand_a, human=False)
        # sample behavior trajectories from each demand distribution
-        human_t = [sample_behavior(demand_h, human=True) for _ in range(self.Nhumans)]
+        human_t = [
-        agent_t = [sample_behavior(demand_a, human=False) for _ in range(self.Nagents)]
+            sample_behavior_from_transitions(human_transitions)
            for _ in range(self.Nhumans)
        ]
        agent_t = [
            sample_behavior_from_transitions(agent_transitions)
            for _ in range(self.Nagents)
        ]
        # store trajectories for agent probability calculation
        self.last_trajectories = human_t + agent_t
        return estimate_demand(self.last_trajectories, self.action_weights)
--- a/engine/jax/init.py
+++ b/engine/jax/init.py
@@ -1,13 +0,0 @@
 """JAX-compatible training and environment modules for PHANTOM."""
 from __future__ import annotations
 # Probe for JAX once at import time; downstream code checks JAX_AVAILABLE.
 try:
    import jax  # noqa: F401
    import jax.numpy as jnp  # noqa: F401
 except ImportError:
    JAX_AVAILABLE = False
 else:
    JAX_AVAILABLE = True
 __all__ = ["JAX_AVAILABLE"]
--- a/engine/jax/checkpoint.py
+++ b/engine/jax/checkpoint.py
@@ -1,49 +0,0 @@
 """Orbax checkpoint helpers for JAX training runs."""
 from __future__ import annotations
 from pathlib import Path
 from typing import Any
 try:
    import orbax.checkpoint as ocp
    HAS_ORBAX = True
 except ImportError:
    HAS_ORBAX = False
 def _require_orbax() -> None:
    """Raise ``ImportError`` with install guidance unless Orbax was imported."""
    if HAS_ORBAX:
        return
    raise ImportError(
        "orbax-checkpoint is required for checkpoint support. "
        "Install engine/jax/requirements.txt first."
    )
 def create_manager(directory: str | Path, max_to_keep: int = 5):
    """Create the checkpoint directory and return an Orbax ``CheckpointManager`` for it.

    Args:
        directory: Checkpoint root; created (with parents) when missing.
        max_to_keep: Number of checkpoints to retain; values below 1 are
            raised to 1.

    Raises:
        ImportError: Orbax is not installed.
    """
    _require_orbax()
    checkpoint_dir = Path(directory)
    checkpoint_dir.mkdir(parents=True, exist_ok=True)
    keep = max(1, int(max_to_keep))
    manager_options = ocp.CheckpointManagerOptions(max_to_keep=keep, create=True)
    checkpointer = ocp.PyTreeCheckpointer()
    return ocp.CheckpointManager(
        checkpoint_dir.as_posix(), checkpointer, manager_options
    )
 def save(manager, *, step: int, payload: Any) -> bool:
    """Save ``payload`` under ``step`` via ``manager.save`` and return its result as a bool.

    Raises:
        ImportError: Orbax is not installed.
    """
    _require_orbax()
    outcome = manager.save(int(step), payload)
    return bool(outcome)
 def latest_step(manager) -> int | None:
    """Return whatever ``manager.latest_step()`` reports.

    Raises:
        ImportError: Orbax is not installed.
    """
    _require_orbax()
    newest = manager.latest_step()
    return newest
 def restore(manager, *, target: Any, step: int | None = None) -> Any:
    """Restore a checkpoint into the structure of ``target``.

    Args:
        manager: Checkpoint manager to read from.
        target: Passed to ``manager.restore`` as ``items``; also the return
            value when there is nothing to restore.
        step: Step to restore; ``None`` selects ``manager.latest_step()``.

    Raises:
        ImportError: Orbax is not installed.
    """
    _require_orbax()
    if step is None:
        chosen_step = manager.latest_step()
    else:
        chosen_step = int(step)
    # No explicit step and no saved checkpoint: hand the target back untouched.
    if chosen_step is None:
        return target
    return manager.restore(chosen_step, items=target)
--- a/engine/jax/env.py
+++ b/engine/jax/env.py
@@ -1,287 +0,0 @@
 """JAX-native PHANTOM environment with robust contamination step."""
 from __future__ import annotations
 from typing import NamedTuple
 try:
    import jax
    import jax.numpy as jnp
 except ImportError as exc:  # pragma: no cover
    raise ImportError("engine.jax.env requires JAX") from exc
 from .primitives import (
    _sample_sessions_jax,
    agent_probability_from_kl,
    batch_kl,
    compute_session_transitions,
    load_transition_data,
    purchase_flags,
    reward_with_coi_penalty,
    revenue_from_demand,
    weighted_demand,
 )
 class EnvParams(NamedTuple):
    """Static configuration of the JAX environment, built by ``make_env_params``."""
    # Number of products; length of the price and demand vectors.
    n_products: int
    # Session count forwarded to ``_sample_sessions_jax``.
    n_sessions: int
    # Episode ends once the step counter reaches this value.
    max_episode_steps: int
    # Per-session step limit forwarded to ``_sample_sessions_jax``.
    max_session_steps: int
    # Price bounds: clip range in ``_decode_action`` and sampling range in ``reset_env``.
    price_low: float
    price_high: float
    # Both forwarded to ``reward_with_coi_penalty``.
    lambda_coi: float
    info_value: float
    # Half-width used by ``make_env_params`` to span ``alpha_candidates``.
    robust_radius: float
    # Episode ends after ``margin_floor_patience`` consecutive steps with
    # average margin below ``margin_floor``.
    margin_floor: float
    margin_floor_patience: int
    # Multiplicative price scale for each discrete action index.
    action_scales: jax.Array
    # Nominal contamination; initial value of the state's agent-prob/alpha fields.
    alpha_nominal: float
    # Contamination levels evaluated every step; the lowest-reward one is kept.
    alpha_candidates: jax.Array
    # Transition kernels and per-state metadata taken from the transition data.
    human_T: jax.Array
    agent_T: jax.Array
    terminal_mask: jax.Array
    purchase_mask: jax.Array
    event_weights: jax.Array
    start_idx: int
    term_idx: int
 class EnvState(NamedTuple):
    """Per-episode state carried between ``reset_env`` and ``step_env`` calls."""
    # Current price vector (one entry per product).
    prices: jax.Array
    # Demand vector from the most recent step (zeros after reset).
    demand: jax.Array
    # Number of steps taken so far in the episode.
    step_count: jax.Array
    # Consecutive steps with average margin below the configured floor.
    low_margin_streak: jax.Array
    # Mean agent probability of the candidate selected in the last step.
    last_agent_prob: jax.Array
    # Contamination level of the candidate selected in the last step.
    last_alpha_adv: jax.Array
 class CandidateEval(NamedTuple):
    """Outcome of evaluating one contamination candidate (see ``_evaluate_candidate``)."""
    # reward, leakage and discount are the three values returned by
    # ``reward_with_coi_penalty``.
    reward: jax.Array
    # Result of ``revenue_from_demand`` for the evaluated prices.
    revenue: jax.Array
    # Result of ``weighted_demand`` for the sampled sessions.
    demand: jax.Array
    # Mean of the per-session agent probabilities.
    agent_prob: jax.Array
    leakage: jax.Array
    discount: jax.Array
    # Sum of the purchase flags over the sampled sessions.
    n_purchases: jax.Array
    # Sum of the actor indicators over the sampled sessions.
    n_agents: jax.Array
 def make_env_params(
    *,
    n_products: int,
    alpha: float,
    n_sessions: int,
    lambda_coi: float,
    robust_radius: float,
    robust_points: int,
    info_value: float,
    action_levels: int,
    action_scale_low: float,
    action_scale_high: float,
    price_low: float,
    price_high: float,
    max_episode_steps: int,
    max_session_steps: int = 40,
    margin_floor: float = 0.05,
    margin_floor_patience: int = 5,
    prefer_behavior_data: bool = True,
 ) -> EnvParams:
    """Assemble an ``EnvParams`` tuple from scalar settings and loaded transition data.

    ``alpha_candidates`` is a ``robust_points``-long linspace over
    ``[alpha - robust_radius, alpha + robust_radius]`` clipped to ``[0, 1]``;
    with a non-positive radius or fewer than two points it is just ``[alpha]``.
    ``action_scales`` is a linspace of ``action_levels`` values between the two
    scale bounds.
    """
    transition = load_transition_data(prefer_data=prefer_behavior_data).to_jax()
    single_candidate = robust_radius <= 0.0 or robust_points <= 1
    if not single_candidate:
        band_lo = max(0.0, float(alpha) - float(robust_radius))
        band_hi = min(1.0, float(alpha) + float(robust_radius))
        alpha_candidates = jnp.linspace(
            band_lo, band_hi, int(robust_points), dtype=jnp.float32
        )
    else:
        alpha_candidates = jnp.asarray([float(alpha)], dtype=jnp.float32)
    scale_grid = jnp.linspace(
        float(action_scale_low),
        float(action_scale_high),
        int(action_levels),
        dtype=jnp.float32,
    )
    # Scalars are coerced to plain Python types; arrays go through jnp.asarray.
    return EnvParams(
        n_products=int(n_products),
        n_sessions=int(n_sessions),
        max_episode_steps=int(max_episode_steps),
        max_session_steps=int(max_session_steps),
        price_low=float(price_low),
        price_high=float(price_high),
        lambda_coi=float(lambda_coi),
        info_value=float(info_value),
        robust_radius=float(robust_radius),
        margin_floor=float(margin_floor),
        margin_floor_patience=int(margin_floor_patience),
        action_scales=scale_grid,
        alpha_nominal=float(alpha),
        alpha_candidates=alpha_candidates,
        human_T=jnp.asarray(transition.human_T),
        agent_T=jnp.asarray(transition.agent_T),
        terminal_mask=jnp.asarray(transition.terminal_mask),
        purchase_mask=jnp.asarray(transition.purchase_mask),
        event_weights=jnp.asarray(transition.event_weights),
        start_idx=int(transition.start_idx),
        term_idx=int(transition.term_idx),
    )
 def _flatten_obs(demand: jax.Array, prices: jax.Array) -> jax.Array:
    return jnp.concatenate([demand.astype(jnp.float32), prices.astype(jnp.float32)])
 def _decode_action(
    prices: jax.Array, action: jax.Array, params: EnvParams
 ) -> jax.Array:
    idx = jnp.clip(action.astype(jnp.int32), 0, params.action_scales.shape[0] - 1)
    scale = params.action_scales[idx]
    next_prices = prices * scale
    return jnp.clip(next_prices, params.price_low, params.price_high)
 def _evaluate_candidate(
    key: jax.Array,
    alpha_candidate: jax.Array,
    prices: jax.Array,
    params: EnvParams,
 ) -> CandidateEval:
    """Sample sessions at one contamination level and score the given prices.

    Sessions are drawn with ``_sample_sessions_jax``; the result bundles the
    COI-penalised reward, revenue, demand, mean agent probability and the
    purchase/agent counts for that sample.
    """
    n_states = int(params.human_T.shape[0])
    states, products, actors, lengths = _sample_sessions_jax(
        key,
        params.human_T,
        params.agent_T,
        params.terminal_mask,
        params.start_idx,
        params.term_idx,
        alpha_candidate,
        params.n_products,
        params.n_sessions,
        params.max_session_steps,
        n_states,
    )
    # Separability: per-session KL terms against both kernels -> agent probability.
    empirical = compute_session_transitions(states, lengths, n_states)
    kl_human, kl_agent = batch_kl(empirical, params.human_T, params.agent_T)
    mean_agent_prob = jnp.mean(agent_probability_from_kl(kl_human, kl_agent))
    demand = weighted_demand(states, products, params.n_products, params.event_weights)
    revenue = revenue_from_demand(prices, demand)
    reward, leakage, discount = reward_with_coi_penalty(
        revenue,
        mean_agent_prob,
        params.lambda_coi,
        params.info_value,
    )
    bought = purchase_flags(states, params.purchase_mask)
    purchase_total = jnp.sum(bought.astype(jnp.float32))
    agent_total = jnp.sum(actors.astype(jnp.float32))
    return CandidateEval(
        reward=reward,
        revenue=revenue,
        demand=demand,
        agent_prob=mean_agent_prob,
        leakage=leakage,
        discount=discount,
        n_purchases=purchase_total,
        n_agents=agent_total,
    )
 def reset_env(key: jax.Array, params: EnvParams) -> tuple[jax.Array, EnvState]:
    """Start an episode with uniformly random prices and zero demand.

    Returns:
        ``(observation, state)``; counters start at zero and both
        ``last_agent_prob`` and ``last_alpha_adv`` start at ``alpha_nominal``.
    """
    vector_shape = (params.n_products,)
    initial_prices = jax.random.uniform(
        key,
        shape=vector_shape,
        minval=params.price_low,
        maxval=params.price_high,
    )
    initial_demand = jnp.zeros(vector_shape, dtype=jnp.float32)
    zero_i32 = jnp.asarray(0, dtype=jnp.int32)
    nominal_alpha = jnp.asarray(params.alpha_nominal, dtype=jnp.float32)
    initial_state = EnvState(
        prices=initial_prices,
        demand=initial_demand,
        step_count=zero_i32,
        low_margin_streak=zero_i32,
        last_agent_prob=nominal_alpha,
        last_alpha_adv=nominal_alpha,
    )
    return _flatten_obs(initial_demand, initial_prices), initial_state
 def step_env(
    key: jax.Array,
    state: EnvState,
    action: jax.Array,
    params: EnvParams,
 ) -> tuple[jax.Array, EnvState, jax.Array, jax.Array, dict[str, jax.Array]]:
    """Advance the environment one step under the worst-case contamination candidate.

    The action rescales prices, every entry of ``params.alpha_candidates`` is
    evaluated with its own PRNG key, and the candidate with the lowest reward
    supplies the step's outcome.

    Returns:
        ``(obs, next_state, reward, done, info)``. ``done`` is set when the
        step limit is reached or the average margin has stayed below
        ``margin_floor`` for ``margin_floor_patience`` consecutive steps.
    """
    prices = _decode_action(state.prices, action, params)
    candidate_alphas = params.alpha_candidates
    def _score(cand_key, cand_alpha):
        return _evaluate_candidate(cand_key, cand_alpha, prices, params)
    candidate_keys = jax.random.split(key, candidate_alphas.shape[0])
    evals = jax.vmap(_score, in_axes=(0, 0))(candidate_keys, candidate_alphas)
    # Adversarial choice: keep the candidate that minimises reward.
    worst = jnp.argmin(evals.reward)
    demand = evals.demand[worst]
    reward = evals.reward[worst]
    agent_prob = evals.agent_prob[worst]
    alpha_adv = candidate_alphas[worst]
    step_count = state.step_count + 1
    # Margin is measured relative to price_low; the floor on avg_price avoids
    # dividing by zero.
    avg_price = jnp.maximum(jnp.mean(prices), 1e-6)
    avg_margin = (avg_price - params.price_low) / avg_price
    below_floor = avg_margin < params.margin_floor
    next_streak = jnp.where(below_floor, state.low_margin_streak + 1, 0)
    timed_out = step_count >= params.max_episode_steps
    margin_collapsed = next_streak >= params.margin_floor_patience
    done = timed_out | margin_collapsed
    next_state = EnvState(
        prices=prices,
        demand=demand,
        step_count=step_count,
        low_margin_streak=next_streak,
        last_agent_prob=agent_prob,
        last_alpha_adv=alpha_adv,
    )
    info = {
        "revenue": evals.revenue[worst],
        "agent_prob": agent_prob,
        "alpha_adv": alpha_adv,
        "coi_leakage": evals.leakage[worst],
        "coi_discount": evals.discount[worst],
        "n_purchases": evals.n_purchases[worst],
        "n_agents": evals.n_agents[worst],
        "avg_margin": avg_margin,
    }
    return _flatten_obs(demand, prices), next_state, reward, done, info
 class PHANTOMJAXEnv:
    """Object-style wrapper over ``reset_env``/``step_env`` with default parameters.

    Every method accepts an optional ``params`` that replaces the instance's
    own parameters for that call only.
    """
    def __init__(self, params: EnvParams):
        self.params = params
    def reset(self, key: jax.Array, params: EnvParams | None = None):
        """Return ``reset_env(key, params)`` using the effective parameters."""
        active = params if params is not None else self.params
        return reset_env(key, active)
    def step(
        self,
        key: jax.Array,
        state: EnvState,
        action: jax.Array,
        params: EnvParams | None = None,
    ):
        """Return ``step_env(key, state, action, params)`` using the effective parameters."""
        active = params if params is not None else self.params
        return step_env(key, state, action, active)
    def action_space_n(self, params: EnvParams | None = None) -> int:
        """Number of discrete actions (length of ``action_scales``)."""
        active = params if params is not None else self.params
        n_actions = active.action_scales.shape[0]
        return int(n_actions)
    def observation_dim(self, params: EnvParams | None = None) -> int:
        """Observation length: twice ``n_products``."""
        active = params if params is not None else self.params
        return int(active.n_products * 2)
--- a/engine/jax/primitives.py
+++ b/engine/jax/primitives.py
@@ -1,495 +0,0 @@
 """JAX-compatible primitives for PHANTOM session simulation and separability."""
 from __future__ import annotations
 from dataclasses import dataclass
 from functools import partial
 from typing import Mapping, Sequence
 import numpy as np
 try:
    import jax
    import jax.numpy as jnp
    JAX_AVAILABLE = True
 except ImportError:
    jax = None  # type: ignore[assignment]
    jnp = np  # type: ignore[assignment]
    JAX_AVAILABLE = False
 # Event names tried, in order, when choosing the start state of a session.
 STATE_START_KEYS = ("session_start", "start")
 # Substrings that mark an event as terminal (matched with `in`, see _is_terminal).
 TERMINAL_EVENT_TOKENS = (
    "session_end",
    "end",
    "purchase_complete",
    "checkout_start",
    "checkout",
 )
 # Substrings that mark an event as a purchase (matched with `in`, see _is_purchase).
 PURCHASE_EVENT_TOKENS = (
    "purchase_complete",
    "purchase",
    "checkout_start",
    "checkout",
 )
 # Weight assigned to every action of a category.
 CATEGORY_WEIGHTS = {"cart": 4.0, "dwell": 2.0, "nav": 1.0, "filter": 0.5}
 # Known action names grouped by category.
 ACTION_CATEGORIES = {
    "cart": {"add_item", "add_to_cart", "remove", "checkout", "purchase"},
    "dwell": {
        "hover_title",
        "hover_paragraph",
        "hover_link",
        "hover_over_title",
        "hover_over_paragraph",
        "hover_over_link",
        "hover_over_button",
    },
    "nav": {
        "page_view",
        "view_item",
        "view",
        "learn_more",
        "learn_more_about_item",
        "view_item_page",
        "session_start",
    },
    "filter": {
        "search",
        "filter_date",
        "filter_price",
        "sort",
        "filter_for_date",
        "filter_for_price",
        "filter_for_amenities",
        "sort_change",
    },
 }
 # Flattened lookup: action name -> weight of the category it is listed under.
 DEFAULT_ACTION_WEIGHTS = {
    action: CATEGORY_WEIGHTS[group]
    for group, actions in ACTION_CATEGORIES.items()
    for action in actions
 }
@dataclass(frozen=True)
 class TransitionData:
    """Dense transition kernels and per-state metadata."""
    # Row-normalised state-to-state kernels for human and agent sessions.
    human_T: np.ndarray
    agent_T: np.ndarray
    # Per-state boolean flags: terminal state / purchase state.
    terminal_mask: np.ndarray
    purchase_mask: np.ndarray
    # Per-state event weight (see _event_weight).
    event_weights: np.ndarray
    # State names; a state's index is its position in this tuple.
    event_names: tuple[str, ...]
    # Index of the start state and of the synthetic "__terminal__" state.
    start_idx: int
    term_idx: int
    def to_jax(self) -> "TransitionData":
        """Return a copy whose array fields went through ``jnp.asarray``.

        Returns ``self`` unchanged when JAX is not available.
        """
        if not JAX_AVAILABLE:
            return self
        return TransitionData(
            human_T=jnp.asarray(self.human_T),
            agent_T=jnp.asarray(self.agent_T),
            terminal_mask=jnp.asarray(self.terminal_mask),
            purchase_mask=jnp.asarray(self.purchase_mask),
            event_weights=jnp.asarray(self.event_weights),
            event_names=self.event_names,
            start_idx=int(self.start_idx),
            term_idx=int(self.term_idx),
        )
@dataclass(frozen=True)
 class SessionBatch:
    """Immutable container for a batch of sampled sessions.

    NOTE(review): the fields presumably mirror the
    ``(states, products, actors, lengths)`` tuple returned by
    ``_sample_sessions_jax`` -- confirm shapes and dtypes against the producer.
    """
    states: np.ndarray
    products: np.ndarray
    actors: np.ndarray
    lengths: np.ndarray
 def _event_weight(name: str) -> float:
    """Return the weight of event ``name``.

    Lookup order: exact match in ``DEFAULT_ACTION_WEIGHTS``; then name-based
    rules for dwell, filter and cart events; then 0.0 for names containing a
    terminal token; everything else is weighted as navigation.
    """
    listed = DEFAULT_ACTION_WEIGHTS.get(name)
    if listed is not None:
        return float(listed)
    filter_names = {"search", "sort", "sort_change"}
    cart_names = {
        "checkout",
        "checkout_start",
        "purchase",
        "remove_item",
        "purchase_complete",
    }
    if name.startswith("hover"):
        category = "dwell"
    elif name.startswith("filter") or name in filter_names:
        category = "filter"
    elif name.startswith("add") or name in cart_names:
        category = "cart"
    elif any(token in name for token in TERMINAL_EVENT_TOKENS):
        return 0.0
    else:
        # NOTE(review): the synthetic "__terminal__" state matches none of the
        # rules above and therefore gets the nav weight -- confirm intended.
        category = "nav"
    return float(CATEGORY_WEIGHTS[category])
 def _is_terminal(name: str) -> bool:
    """True when ``name`` contains any terminal token as a substring."""
    for token in TERMINAL_EVENT_TOKENS:
        if token in name:
            return True
    return False
 def _is_purchase(name: str) -> bool:
    """True when ``name`` contains any purchase token as a substring."""
    for token in PURCHASE_EVENT_TOKENS:
        if token in name:
            return True
    return False
 def _collect_events(*transitions: Mapping[str, Mapping[str, float]]) -> tuple[str, ...]:
    names: set[str] = set()
    for trans in transitions:
        for src, dsts in trans.items():
            names.add(src)
            names.update(dsts.keys())
    names.discard("__terminal__")
    return tuple(sorted(names))
 def _normalize_rows(matrix: np.ndarray, term_idx: int) -> np.ndarray:
    row_sums = matrix.sum(axis=1, keepdims=True)
    dead_rows = np.isclose(row_sums.squeeze(-1), 0.0)
    if np.any(dead_rows):
        matrix[dead_rows] = 0.0
        matrix[dead_rows, term_idx] = 1.0
        row_sums = matrix.sum(axis=1, keepdims=True)
    return matrix / np.maximum(row_sums, 1e-8)
 def _dense_from_dict(
    transitions: Mapping[str, Mapping[str, float]],
    event_to_idx: Mapping[str, int],
    term_idx: int,
 ) -> np.ndarray:
    """Convert a nested ``{src: {dst: prob}}`` table into a row-normalised float32 matrix.

    Sources or destinations missing from ``event_to_idx`` are ignored; repeated
    entries accumulate. Normalisation is delegated to ``_normalize_rows``.
    """
    size = len(event_to_idx)
    dense = np.zeros((size, size), dtype=np.float32)
    for source, destinations in transitions.items():
        row = event_to_idx.get(source)
        if row is None:
            continue
        for target, prob in destinations.items():
            col = event_to_idx.get(target)
            if col is not None:
                dense[row, col] += float(prob)
    return _normalize_rows(dense, term_idx)
 def compile_transition_data(
    human_transitions: Mapping[str, Mapping[str, float]],
    agent_transitions: Mapping[str, Mapping[str, float]],
 ) -> TransitionData:
    """Compile two nested transition tables into dense kernels with shared state indexing.

    The state space is the sorted union of event names plus a trailing
    ``"__terminal__"`` state. Every terminal state is made absorbing
    (self-loop with probability 1) in both kernels. When no event names are
    found the built-in fallback data is returned instead.
    """
    observed = _collect_events(human_transitions, agent_transitions)
    if not observed:
        return fallback_transition_data()
    event_names = (*observed, "__terminal__")
    term_idx = len(event_names) - 1
    event_to_idx = {name: pos for pos, name in enumerate(event_names)}
    human_T = _dense_from_dict(human_transitions, event_to_idx, term_idx)
    agent_T = _dense_from_dict(agent_transitions, event_to_idx, term_idx)
    terminal_mask = np.array(list(map(_is_terminal, event_names)), dtype=bool)
    purchase_mask = np.array(list(map(_is_purchase, event_names)), dtype=bool)
    event_weights = np.array(list(map(_event_weight, event_names)), dtype=np.float32)
    # "__terminal__" matches no terminal token, so it is flagged explicitly.
    terminal_mask[term_idx] = True
    absorbing = np.flatnonzero(terminal_mask)
    for kernel in (human_T, agent_T):
        kernel[absorbing] = 0.0
        kernel[absorbing, absorbing] = 1.0
    # First start key present wins; index 0 is the default.
    start_idx = next(
        (int(event_to_idx[key]) for key in STATE_START_KEYS if key in event_to_idx),
        0,
    )
    return TransitionData(
        human_T=human_T,
        agent_T=agent_T,
        terminal_mask=terminal_mask,
        purchase_mask=purchase_mask,
        event_weights=event_weights,
        event_names=event_names,
        start_idx=start_idx,
        term_idx=term_idx,
    )
 def fallback_transition_data() -> TransitionData:
    """Compile the hard-coded human and agent transition tables.

    Used whenever behaviour data is not requested or cannot be loaded.
    """
    human_table = {
        "session_start": dict(page_view=0.80, view_item_page=0.15, session_end=0.05),
        "page_view": dict(view_item_page=0.55, search=0.25, session_end=0.20),
        "view_item_page": dict(
            learn_more_about_item=0.40, add_item_to_cart=0.28, session_end=0.32
        ),
        "learn_more_about_item": dict(
            add_item_to_cart=0.50, view_item_page=0.30, session_end=0.20
        ),
        "add_item_to_cart": dict(
            checkout_start=0.58, view_item_page=0.24, session_end=0.18
        ),
        "checkout_start": dict(purchase_complete=0.70, session_end=0.30),
        "purchase_complete": dict(session_end=1.0),
    }
    agent_table = {
        "session_start": dict(page_view=0.90, view_item_page=0.08, session_end=0.02),
        "page_view": dict(view_item_page=0.40, search=0.35, session_end=0.25),
        "view_item_page": dict(
            learn_more_about_item=0.55, add_item_to_cart=0.15, session_end=0.30
        ),
        "learn_more_about_item": dict(
            view_item_page=0.45, add_item_to_cart=0.20, session_end=0.35
        ),
        "add_item_to_cart": dict(
            checkout_start=0.42, view_item_page=0.28, session_end=0.30
        ),
        "checkout_start": dict(purchase_complete=0.52, session_end=0.48),
        "purchase_complete": dict(session_end=1.0),
    }
    return compile_transition_data(human_table, agent_table)
 def load_transition_data(prefer_data: bool = True) -> TransitionData:
    """Return transition data, preferring the data-driven models.

    Args:
        prefer_data: When False, skip the data-driven models and return the
            hard-coded fallback immediately.

    Returns:
        TransitionData compiled from ``get_transition_models()`` when that
        import and call succeed, otherwise the hard-coded fallback.

    The fallback stays best-effort (no exception escapes), but the failure is
    now logged so a run does not silently switch to synthetic transitions.
    """
    if not prefer_data:
        return fallback_transition_data()
    try:
        from ..lib.behavior import get_transition_models
        human_trans, agent_trans = get_transition_models()
        return compile_transition_data(human_trans, agent_trans)
    except Exception as exc:
        # Local import keeps this block self-contained.
        import logging
        logging.getLogger(__name__).warning(
            "Falling back to built-in transition data (%s: %s)",
            type(exc).__name__,
            exc,
        )
        return fallback_transition_data()
 if JAX_AVAILABLE:
    # Positions 8, 9, 10 (n_sessions, max_steps, n_states) are static: they
    # fix array shapes and the scan length, so a new value triggers a retrace.
    @partial(jax.jit, static_argnums=(8, 9, 10))
    def _sample_sessions_jax(
        key: jax.Array,
        human_T: jax.Array,
        agent_T: jax.Array,
        terminal_mask: jax.Array,
        start_idx: int,
        term_idx: int,
        alpha: float,
        n_products: int,
        n_sessions: int,
        max_steps: int,
        n_states: int,
    ) -> tuple[jax.Array, jax.Array, jax.Array, jax.Array]:
        """Sample a batch of sessions by stepping a per-actor Markov chain.

        Each session draws an actor flag (1 with probability ``alpha``, else 0)
        and a product id in ``[0, n_products)``. Sessions with flag 0 step with
        rows of ``human_T``; the others use ``agent_T``.

        Returns:
            ``(states, products, actors, lengths)`` where ``states`` has one row
            per session and ``max_steps`` columns, holding the emitted state
            index per step and -1 once the session is no longer active;
            ``lengths`` counts the non-negative entries of each row.
        """
        # Independent streams for actor flags, product ids and the step loop.
        k_actor, k_product, k_step = jax.random.split(key, 3)
        start_idx_i32 = jnp.asarray(start_idx, dtype=jnp.int32)
        term_idx_i32 = jnp.asarray(term_idx, dtype=jnp.int32)
        actor_draw = jax.random.uniform(k_actor, (n_sessions,))
        actors = (actor_draw < alpha).astype(jnp.int32)
        products = jax.random.randint(
            k_product, (n_sessions,), 0, n_products, dtype=jnp.int32
        )
        active_init = jnp.ones((n_sessions,), dtype=jnp.bool_)
        state_init = jnp.full((n_sessions,), start_idx_i32, dtype=jnp.int32)
        def _scan_step(carry, _):
            # carry = (current state per session, still-active mask, rng key)
            states, active, rng = carry
            rng, k = jax.random.split(rng)
            probs_h = human_T[states]
            probs_a = agent_T[states]
            probs = jnp.where(actors[:, None] == 0, probs_h, probs_a)
            # The 1e-10 offset keeps log() finite for zero-probability entries.
            next_state = jax.random.categorical(k, jnp.log(probs + 1e-10), axis=-1)
            # Inactive sessions are pinned to term_idx and emit -1 padding.
            next_state = jnp.where(active, next_state, term_idx_i32)
            emitted = jnp.where(active, next_state, -1)
            is_terminal = terminal_mask[jnp.clip(next_state, 0, n_states - 1)]
            # The terminal state itself is emitted; the session goes inactive after it.
            next_active = active & (~is_terminal)
            carry_states = jnp.where(next_active, next_state, term_idx_i32)
            return (carry_states, next_active, rng), emitted
        _, state_t = jax.lax.scan(
            _scan_step, (state_init, active_init, k_step), None, length=max_steps
        )
        # scan stacks per-step outputs on the leading axis; transpose to session-major.
        states = state_t.T
        lengths = jnp.sum(states >= 0, axis=1, dtype=jnp.int32)
        return states, products, actors, lengths
 def sample_sessions(
    key,
    transition_data: TransitionData,
    alpha: float,
    n_products: int,
    n_sessions: int,
    max_steps: int,
 ) -> SessionBatch:
    """Sample ``n_sessions`` sessions of at most ``max_steps`` events each.

    With JAX available this delegates to the jitted ``_sample_sessions_jax``.
    Otherwise a NumPy loop walks each session: the actor flag is 1 with
    probability ``alpha`` and selects ``agent_T`` (flag 0 selects ``human_T``),
    and the walk stops at the first state flagged in ``terminal_mask``.

    Args:
        key: JAX PRNG key on the JAX path; on the NumPy path any integer or
            array of integers, used as the RNG seed.
        transition_data: Transition matrices, terminal mask and start index.
        alpha: Probability that a session is flagged as an agent.
        n_products: Product ids are drawn from ``[0, n_products)``.
        n_sessions: Number of sessions to sample.
        max_steps: Maximum number of emitted events per session.

    Returns:
        SessionBatch with ``states`` (padded with -1 after termination),
        ``products``, ``actors`` and ``lengths``.
    """
    if JAX_AVAILABLE:
        td = transition_data.to_jax()
        states, products, actors, lengths = _sample_sessions_jax(
            key,
            td.human_T,
            td.agent_T,
            td.terminal_mask,
            int(td.start_idx),
            int(td.term_idx),
            float(alpha),
            int(n_products),
            int(n_sessions),
            int(max_steps),
            int(td.human_T.shape[0]),
        )
        return SessionBatch(
            states=states, products=products, actors=actors, lengths=lengths
        )
    # Seed from every word of the key. Using only the first element collapsed
    # all multi-word keys sharing a leading word (e.g. [0, seed]) onto one
    # stream. A scalar key contributes a single word, as before.
    seed_words = [int(word) for word in np.asarray(key).reshape(-1)]
    rng = np.random.default_rng(seed_words)
    n_states = transition_data.human_T.shape[0]
    products = rng.integers(0, n_products, size=n_sessions, dtype=np.int32)
    actors = (rng.random(size=n_sessions) < alpha).astype(np.int32)
    states = np.full((n_sessions, max_steps), -1, dtype=np.int32)
    lengths = np.zeros((n_sessions,), dtype=np.int32)
    for i in range(n_sessions):
        current = int(transition_data.start_idx)
        mat = transition_data.agent_T if actors[i] == 1 else transition_data.human_T
        for t in range(max_steps):
            nxt = int(rng.choice(n_states, p=mat[current]))
            states[i, t] = nxt
            if transition_data.terminal_mask[nxt]:
                # The terminal event counts towards the session length.
                lengths[i] = t + 1
                break
            current = nxt
        else:
            # No terminal state reached: the session used every step.
            lengths[i] = max_steps
    return SessionBatch(
        states=states, products=products, actors=actors, lengths=lengths
    )
 if JAX_AVAILABLE:
    @partial(jax.jit, static_argnums=(2,))
    def compute_session_transitions(states, lengths, n_states: int):
        """Per-session empirical transition matrices, row-normalised.

        Counts each consecutive pair ``states[:, t] -> states[:, t + 1]`` whose
        entries are both non-negative and with ``t < lengths - 1``, then
        divides every row by its sum plus 1e-10.
        """
        origin, target = states[:, :-1], states[:, 1:]
        step = jnp.arange(origin.shape[1])[None, :]
        within_session = step < (lengths[:, None] - 1)
        keep = (origin >= 0) & (target >= 0) & within_session
        # Clip so padded (-1) entries index safely; `keep` zeroes them out.
        origin_oh = jax.nn.one_hot(jnp.clip(origin, 0, n_states - 1), n_states)
        target_oh = jax.nn.one_hot(jnp.clip(target, 0, n_states - 1), n_states)
        counts = jnp.einsum(
            "nti,ntj,nt->nij", origin_oh, target_oh, keep.astype(jnp.float32)
        )
        return counts / (jnp.sum(counts, axis=-1, keepdims=True) + 1e-10)
 else:
    def compute_session_transitions(states, lengths, n_states: int):
        """NumPy counterpart: count valid consecutive pairs, then row-normalise."""
        n_rows = states.shape[0]
        counts = np.zeros((n_rows, n_states, n_states), dtype=np.float32)
        for row in range(n_rows):
            n_pairs = max(int(lengths[row]) - 1, 0)
            for step in range(n_pairs):
                origin, target = int(states[row, step]), int(states[row, step + 1])
                if origin < 0 or target < 0:
                    continue
                counts[row, origin, target] += 1.0
        return counts / (counts.sum(axis=-1, keepdims=True) + 1e-10)
 def batch_kl(P, Q_human, Q_agent, eps: float = 1e-10):
    """Summed KL-style divergence of each matrix in ``P`` from two references.

    ``P`` is smoothed by ``eps`` and renormalised along its last axis; each
    reference gets ``eps`` added and a leading broadcast axis (it is not
    renormalised). Returns ``(delta_h, delta_a)``, each summed over axes 1 and 2.
    """
    smoothed = P + eps
    p = smoothed / jnp.sum(smoothed, axis=-1, keepdims=True)
    def _divergence(reference):
        q = reference[None, ...] + eps
        return jnp.sum(p * jnp.log(p / q), axis=(1, 2))
    return _divergence(Q_human), _divergence(Q_agent)
 # Rebind to the jitted version when JAX is importable; otherwise keep the plain function.
 if JAX_AVAILABLE:
    batch_kl = jax.jit(batch_kl)
 def agent_probability_from_kl(delta_h, delta_a, temperature: float = 1.0):
    """Two-way softmax over negative KL values: probability of "agent".

    Equals ``exp(-delta_a / t) / (exp(-delta_h / t) + exp(-delta_a / t))``,
    evaluated as a logistic of the KL difference. The two-exponential form
    underflows to 0/0 when both divergences are large and then reports ~0
    regardless of which model is closer; the difference form does not.

    Args:
        delta_h: Divergence from the human reference (scalar or array).
        delta_a: Divergence from the agent reference, same shape.
        temperature: Softmax temperature, floored at 1e-6.

    Returns:
        Values in [0, 1]; 0.5 when both divergences are equal.
    """
    t = jnp.maximum(float(temperature), 1e-6)
    # exp() may overflow to inf when the agent model is far worse; 1/(1+inf) is 0.
    return 1.0 / (1.0 + jnp.exp((delta_a - delta_h) / t))
 def estimate_alpha_from_kl(delta_h, delta_a, beta: float = 2.0):
    """Logistic of ``beta * (delta_h - delta_a)``.

    Returns 0.5 when both divergences are equal and rises towards 1 as
    ``delta_h`` exceeds ``delta_a``.
    """
    scaled_gap = beta * (delta_h - delta_a)
    denominator = 1.0 + jnp.exp(-scaled_gap)
    return 1.0 / denominator
 def weighted_demand(states, products, n_products: int, event_weights):
    """Aggregate event weights per product and scale the result to sum to 100.

    Each session's score is the sum of ``event_weights`` over its non-negative
    states; scores are added into the slot given by ``products``. When the
    total is not positive the unscaled per-product totals are returned.
    """
    n_events = event_weights.shape[0]
    is_real = states >= 0
    # Clip so padding (-1) indexes safely; the mask removes its contribution.
    safe_states = jnp.clip(states, 0, n_events - 1)
    session_score = jnp.sum(event_weights[safe_states] * is_real, axis=1)
    raw = jnp.zeros((n_products,), dtype=jnp.float32).at[products].add(session_score)
    grand_total = jnp.sum(raw)
    share = (raw / grand_total) * 100.0
    return jnp.where(grand_total > 0.0, share, raw)
 # Jit when JAX is importable; n_products (position 2) is static because it sizes the output array.
 if JAX_AVAILABLE:
    weighted_demand = jax.jit(weighted_demand, static_argnums=(2,))
 def purchase_flags(states, purchase_mask):
    """Return, per session row, whether any non-negative state is flagged in ``purchase_mask``."""
    is_real = states >= 0
    # Negative padding is clipped to index 0 for lookup, then masked out.
    safe_states = jnp.clip(states, 0, purchase_mask.shape[0] - 1)
    return jnp.any(purchase_mask[safe_states] & is_real, axis=1)
 # Rebind to the jitted version when JAX is importable.
 if JAX_AVAILABLE:
    purchase_flags = jax.jit(purchase_flags)
 def revenue_from_demand(prices, demand):
    """Revenue as the dot product of ``prices`` and ``demand``."""
    revenue = jnp.dot(prices, demand)
    return revenue
 # Rebind to the jitted version when JAX is importable.
 if JAX_AVAILABLE:
    revenue_from_demand = jax.jit(revenue_from_demand)
 def reward_with_coi_penalty(
    revenue, agent_prob: float, lambda_coi: float, info_value: float
 ):
    """Discount revenue by a leakage penalty.

    ``leakage = agent_prob * info_value``; the discount factor is
    ``1 - lambda_coi * leakage`` clipped to [0, 1].

    Returns:
        ``(discounted revenue, leakage, discount factor)``.
    """
    leak = agent_prob * info_value
    keep_fraction = jnp.clip(1.0 - lambda_coi * leak, 0.0, 1.0)
    net_revenue = revenue * keep_fraction
    return net_revenue, leak, keep_fraction
 # Rebind to the jitted version when JAX is importable.
 if JAX_AVAILABLE:
    reward_with_coi_penalty = jax.jit(reward_with_coi_penalty)
--- a/engine/jax/requirements.txt
+++ b/engine/jax/requirements.txt
@@ -1,5 +0,0 @@
 flax==0.10.7
 optax==0.2.7
 distrax==0.1.5
 orbax-checkpoint==0.11.32
 chex==0.1.90
--- a/engine/jax/train.py
+++ b/engine/jax/train.py
--- a/engine/lib/init.py
+++ b/engine/lib/init.py
@@ -1,14 +1,38 @@
-from .demand import estimate_demand, estimate_weighted_demand, generate_demand_for_actor
+from __future__ import annotations
-from .behavior import sample_behavior, get_transition_models, trajectory_to_events
+
-from .render import DashboardRenderer, style_axis
+from importlib import import_module
-from .wrappers import EconomicMetricsWrapper
+
-from .callbacks import MetricsCallback, EvalMetricsCallback, CheckpointArtifactCallback
+_EXPORTS: dict[str, tuple[str, str]] = {
-from .providers import (
+    "estimate_demand": (".demand", "estimate_demand"),
-    ProviderBenchmark,
+    "estimate_weighted_demand": (".demand", "estimate_weighted_demand"),
-    ProviderResult,
+    "generate_demand_for_actor": (".demand", "generate_demand_for_actor"),
-    BenchmarkConfig,
+    "sample_behavior": (".behavior", "sample_behavior"),
-    RandomBaseline,
+    "get_transition_models": (".behavior", "get_transition_models"),
-    SurgeBaseline,
+    "trajectory_to_events": (".behavior", "trajectory_to_events"),
-)
+    "DashboardRenderer": (".render", "DashboardRenderer"),
-from .coi import compute_uplift_coi, extract_purchases, compute_agent_probability
+    "style_axis": (".render", "style_axis"),
-from .discrete import EventQTable
+    "EconomicMetricsWrapper": (".wrappers", "EconomicMetricsWrapper"),
    "MetricsCallback": (".callbacks", "MetricsCallback"),
    "EvalMetricsCallback": (".callbacks", "EvalMetricsCallback"),
    "ProviderBenchmark": (".providers", "ProviderBenchmark"),
    "ProviderResult": (".providers", "ProviderResult"),
    "BenchmarkConfig": (".providers", "BenchmarkConfig"),
    "RandomBaseline": (".providers", "RandomBaseline"),
    "SurgeBaseline": (".providers", "SurgeBaseline"),
    "compute_uplift_coi": (".coi", "compute_uplift_coi"),
    "extract_purchases": (".coi", "extract_purchases"),
    "compute_agent_probability": (".coi", "compute_agent_probability"),
    "EventQTable": (".discrete", "EventQTable"),
 }
 __all__ = sorted(_EXPORTS)
 def __getattr__(name: str):
    """Resolve a lazily exported name on first access.

    Looks ``name`` up in ``_EXPORTS``, imports the owning submodule relative
    to this package, and stores the resolved attribute in module globals so
    later lookups find it directly.

    Raises:
        AttributeError: if ``name`` is not listed in ``_EXPORTS``.
    """
    target = _EXPORTS.get(name)
    if target is None:
        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
    module_name, attr_name = target
    resolved = getattr(import_module(module_name, package=__name__), attr_name)
    globals()[name] = resolved
    return resolved
--- a/engine/lib/behavior.py
+++ b/engine/lib/behavior.py
@@ -110,10 +110,12 @@ def adjust_behavior_to_condition(condition, transition_matrix):
    return pd.DataFrame(expanded, index=new_rows, columns=new_cols)
-def sample_behavior(condition, human=True, max_len=40):
+def get_adjusted_transitions(condition, human=True):
    base_pivot = _get_base_pivot(human)
-    adjusted_transitions = adjust_behavior_to_condition(condition, base_pivot)
+    return adjust_behavior_to_condition(condition, base_pivot)
 def sample_behavior_from_transitions(adjusted_transitions, max_len=40):
    trajectory = [np.random.choice(adjusted_transitions.index)]
    while len(trajectory) < max_len and "checkout" not in trajectory[-1]:
        probs = np.asarray(adjusted_transitions.loc[trajectory[-1]].values, dtype=float)
@@ -127,6 +129,11 @@ def sample_behavior(condition, human=True, max_len=40):
    return trajectory
 def sample_behavior(condition, human=True, max_len=40):
    """Sample one trajectory from the transitions adjusted to ``condition``.

    Builds the adjusted transition table for the chosen actor type and hands
    it to ``sample_behavior_from_transitions`` with the same ``max_len``.
    """
    return sample_behavior_from_transitions(
        get_adjusted_transitions(condition, human=human), max_len=max_len
    )
 # Manual smoke run: sample one human trajectory for a three-element demand input and print it.
 if __name__ == "__main__":
    t = sample_behavior(generate_demand_for_actor(np.array([10, 20, 30])), human=True)
    print(t)
--- a/engine/lib/callbacks.py
+++ b/engine/lib/callbacks.py
@@ -1,150 +1,120 @@
-"""Training callbacks for W&B/TensorBoard logging - reads from info dict."""
+"""Training callbacks with algorithm-agnostic metric extraction."""
-from pathlib import Path
+from typing import Any
 from stable_baselines3.common.callbacks import BaseCallback, EvalCallback
 import numpy as np
-from ..wandb_checkpoint import checkpoint_artifact_name, log_checkpoint_file
+from ..telemetry.wandb import get_wandb_module
 try:
    import wandb
    HAS_WANDB = True
 except ImportError:
    HAS_WANDB = False
 class MetricsCallback(BaseCallback):
-    """Training metrics logger - reads info['economics'], logs to W&B."""
+    """Collects interval train metrics from env info dictionaries."""
    def __init__(
-        self, log_histograms: bool = True, log_freq: int = 100, verbose: int = 0
+        self,
        log_histograms: bool = False,
        log_freq: int = 100,
        step_offset: int = 0,
        verbose: int = 0,
    ):
        super().__init__(verbose)
        self.log_histograms = log_histograms
-        self.log_freq = log_freq
+        self.log_freq = max(1, int(log_freq))
-        self._episode_revenues: list[float] = []
+        self.step_offset = max(0, int(step_offset))
-
+        self._wandb = get_wandb_module()
-    def _on_step(self) -> bool:
+        self._wandb_live = bool(self._wandb is not None and self._wandb.run is not None)
-        if not HAS_WANDB or wandb.run is None:
+        self._window_sums = {
-            return True
+            "train/revenue_mean": 0.0,
-
+            "train/margin_mean": 0.0,
-        for info in self.locals.get("infos", []):
+            "train/coi_level_mean": 0.0,
-            if "economics" not in info:
+            "train/regret_mean": 0.0,
-                continue
+            "train/profit_mean": 0.0,
-
+            "train/agent_prob": 0.0,
-            econ = info["economics"]
+            "train/alpha_adv": 0.0,
-            t = self.num_timesteps
+            "train/ux_penalty": 0.0,
-
+            "train/volatility": 0.0,
-            payload = {
+            "train/coi_mix": 0.0,
-                "economics/revenue": econ["revenue"],
+            "train/coi_base": 0.0,
-                "economics/margin": econ["margin"],
+            "train/coi_leakage": 0.0,
-                "coi/level": econ["coi_level"],
+            "train/coi_penalty": 0.0,
                "economics/regret": econ["regret"],
        }
        self._window_count = 0
        self.events: list[dict[str, Any]] = []
    def _accumulate(self, info: dict[str, Any]) -> None:
        econ = info.get("economics")
        if not isinstance(econ, dict):
            return
        self._window_sums["train/revenue_mean"] += float(econ.get("revenue", 0.0))
        self._window_sums["train/margin_mean"] += float(econ.get("margin", 0.0))
        self._window_sums["train/coi_level_mean"] += float(econ.get("coi_level", 0.0))
        self._window_sums["train/regret_mean"] += float(econ.get("regret", 0.0))
        if "profit" in econ:
            self._window_sums["train/profit_mean"] += float(econ.get("profit", 0.0))
        if "agent_prob" in econ:
            self._window_sums["train/agent_prob"] += float(econ.get("agent_prob", 0.0))
        if "alpha_adv" in econ:
            self._window_sums["train/alpha_adv"] += float(econ.get("alpha_adv", 0.0))
        if "ux_penalty" in econ:
            self._window_sums["train/ux_penalty"] += float(econ.get("ux_penalty", 0.0))
        if "volatility" in econ:
            self._window_sums["train/volatility"] += float(econ.get("volatility", 0.0))
        if "coi_mix" in econ:
-                payload["coi/mix"] = econ["coi_mix"]
+            self._window_sums["train/coi_mix"] += float(econ.get("coi_mix", 0.0))
        if "coi_base" in econ:
-                payload["coi/base"] = econ["coi_base"]
+            self._window_sums["train/coi_base"] += float(econ.get("coi_base", 0.0))
        if "coi_leakage" in econ:
-                payload["coi/leakage"] = econ["coi_leakage"]
+            self._window_sums["train/coi_leakage"] += float(
                econ.get("coi_leakage", 0.0)
            )
        if "coi_penalty" in econ:
-                payload["coi/penalty"] = econ["coi_penalty"]
+            self._window_sums["train/coi_penalty"] += float(
-            wandb.log(payload, step=t)
+                econ.get("coi_penalty", 0.0)
            self._episode_revenues.append(econ["revenue"])
        # histograms at log_freq intervals
        if self.log_histograms and self.num_timesteps % self.log_freq == 0:
            for info in self.locals.get("infos", []):
                if "prices" in info:
                    wandb.log(
                        {"distributions/prices": wandb.Histogram(info["prices"])},
                        step=self.num_timesteps,
                    )
                if "demand" in info:
                    wandb.log(
                        {"distributions/demand": wandb.Histogram(info["demand"])},
                        step=self.num_timesteps,
            )
        self._window_count += 1
-        return True
+    def _flush(self, step: int) -> None:
-
+        if self._window_count <= 0:
    def _on_rollout_end(self) -> None:
        if not HAS_WANDB or wandb.run is None or not self._episode_revenues:
            return
-        wandb.log(
+        denom = float(self._window_count)
-            {
+        payload = {
-                "episode/mean_revenue": np.mean(self._episode_revenues),
+            key: (value / denom)
-                "episode/total_revenue": np.sum(self._episode_revenues),
+            for key, value in self._window_sums.items()
-            },
+            if value != 0.0
-            step=self.num_timesteps,
+            or key
-        )
+            in {
-        self._episode_revenues = []
+                "train/revenue_mean",
-
+                "train/margin_mean",
-
+                "train/coi_level_mean",
-class CheckpointArtifactCallback(BaseCallback):
+                "train/regret_mean",
    """Periodic SB3 checkpoint uploader backed by W&B artifacts."""
    def __init__(self, cfg: dict, interval: int = 10_000, verbose: int = 0):
        super().__init__(verbose)
        self.cfg = dict(cfg)
        self.interval = max(1, int(interval))
        self.model_dir = Path(str(self.cfg.get("model_dir", "engine/models")))
        self.model_dir.mkdir(parents=True, exist_ok=True)
        self._next_checkpoint = self.interval
        self._last_saved_step = -1
    def _artifact_name(self) -> str:
        sweep_id = (
            getattr(wandb.run, "sweep_id", None)
            if HAS_WANDB and wandb.run is not None
            else None
        )
        return checkpoint_artifact_name(self.cfg, backend="sb3", sweep_id=sweep_id)
    def _checkpoint_file(self) -> Path:
        algo = str(self.cfg.get("algo", "model"))
        base = self.model_dir / f"phantom_{algo}_checkpoint"
        self.model.save(str(base))
        return base.with_suffix(".zip")
    def _save_checkpoint(self) -> None:
        if not HAS_WANDB or wandb.run is None:
            return
        step = int(self.num_timesteps)
        if step <= self._last_saved_step:
            return
        checkpoint_path = self._checkpoint_file()
        metadata = {
            "step": step,
            "algo": str(self.cfg.get("algo", "unknown")),
            "sweep_id": getattr(wandb.run, "sweep_id", None),
            }
-        saved = log_checkpoint_file(
+        }
-            self._artifact_name(),
+        payload["train/global_step"] = int(step)
-            file_path=checkpoint_path,
+        if self._wandb_live:
-            artifact_file_name=checkpoint_path.name,
+            self._wandb.log(dict(payload), step=self.step_offset + int(step))
-            metadata=metadata,
+        else:
-        )
+            self.events.append(payload)
-        if saved:
+        for key in self._window_sums:
-            self._last_saved_step = step
+            self._window_sums[key] = 0.0
        self._window_count = 0
    def _on_step(self) -> bool:
-        if self.num_timesteps < self._next_checkpoint:
+        for info in self.locals.get("infos", []):
-            return True
+            if isinstance(info, dict):
-        self._save_checkpoint()
+                self._accumulate(info)
-        while self._next_checkpoint <= self.num_timesteps:
+
-            self._next_checkpoint += self.interval
+        if self.num_timesteps % self.log_freq == 0:
            self._flush(step=self.num_timesteps)
        return True
    def _on_training_end(self) -> None:
-        self._save_checkpoint()
+        self._flush(step=self.num_timesteps)
 class EvalMetricsCallback(EvalCallback):
-    """Deterministic evaluation - true performance without exploration noise."""
+    """Deterministic evaluation collector detached from logging backends."""
    def __init__(
        self, eval_env, eval_freq: int = 1000, n_eval_episodes: int = 5, **kwargs
@@ -153,23 +123,19 @@ class EvalMetricsCallback(EvalCallback):
            eval_env, eval_freq=eval_freq, n_eval_episodes=n_eval_episodes, **kwargs
        )
        self._eval_revenues: list[float] = []
        self.events: list[dict[str, float | int]] = []
    def _on_step(self) -> bool:
        result = super()._on_step()
        if not HAS_WANDB or wandb.run is None:
            return result
        # log eval metrics after evaluation runs
        if self.n_calls % self.eval_freq == 0 and hasattr(self, "last_mean_reward"):
-            wandb.log(
+            self.events.append(
                {
-                    "eval/mean_reward": self.last_mean_reward,
+                    "eval/reward_mean": float(self.last_mean_reward),
-                    "eval/mean_revenue": np.mean(self._eval_revenues)
+                    "eval/revenue_mean": float(np.mean(self._eval_revenues))
                    if self._eval_revenues
-                    else 0,
+                    else 0.0,
-                },
+                    "train/global_step": int(self.num_timesteps),
-                step=self.num_timesteps,
+                }
            )
            self._eval_revenues = []
--- a/engine/lib/coi.py
+++ b/engine/lib/coi.py
@@ -3,7 +3,10 @@ from typing import Dict
 def compute_agent_probability(
-    trajectory: list, human_transitions: Dict, agent_transitions: Dict
+    trajectory: list,
    human_transitions: Dict,
    agent_transitions: Dict,
    temperature: float = 1.0,
 ) -> float:
    """estimate agent probability via KL divergence between trajectory transitions and reference models
@@ -52,9 +55,9 @@ def compute_agent_probability(
    kl_agent = kl_div(empirical, agent_transitions)
    # convert to probability via softmax (lower KL = higher prob)
-    # agent_prob = exp(-kl_agent) / (exp(-kl_human) + exp(-kl_agent))
+    t = float(max(temperature, 1e-6))
-    exp_h = np.exp(-kl_human)
+    exp_h = np.exp(-kl_human / t)
-    exp_a = np.exp(-kl_agent)
+    exp_a = np.exp(-kl_agent / t)
    return float(exp_a / (exp_h + exp_a + 1e-10))
--- a/engine/lib/render.py
+++ b/engine/lib/render.py
@@ -1,15 +1,19 @@
 """rendering logic for PHANTOM environment dashboard"""
 import numpy as np
 import matplotlib.pyplot as plt
 from matplotlib.gridspec import GridSpec
 def style_axis(ax, title: str = None, xlabel: str = None, ylabel: str = None):
-    ax.spines['top'].set_visible(False)
+    ax.spines["top"].set_visible(False)
-    ax.spines['right'].set_visible(False)
+    ax.spines["right"].set_visible(False)
-    if title: ax.set_title(title, fontsize=11, fontweight='bold', pad=8)
+    if title:
-    if xlabel: ax.set_xlabel(xlabel, fontsize=9)
+        ax.set_title(title, fontsize=11, fontweight="bold", pad=8)
-    if ylabel: ax.set_ylabel(ylabel, fontsize=9)
+    if xlabel:
        ax.set_xlabel(xlabel, fontsize=9)
    if ylabel:
        ax.set_ylabel(ylabel, fontsize=9)
 class DashboardRenderer:
@@ -23,13 +27,25 @@ class DashboardRenderer:
        if self.fig is None:
            plt.ion()
            self.fig = plt.figure(figsize=(14, 10))
-            self.gs = GridSpec(3, 3, figure=self.fig, hspace=0.35, wspace=0.3,
+            self.gs = GridSpec(
-                               left=0.07, right=0.95, top=0.92, bottom=0.08)
+                3,
                3,
                figure=self.fig,
                hspace=0.35,
                wspace=0.3,
                left=0.07,
                right=0.95,
                top=0.92,
                bottom=0.08,
            )
            plt.show(block=False)
        self.fig.clear()
-        self.fig.suptitle(f'PHANTOM  Market Dynamics  [t={env._step_count}, a={env.alpha:.2f}]',
+        self.fig.suptitle(
-                          fontsize=14, fontweight='bold')
+            f"PHANTOM  Market Dynamics  [t={env._step_count}, a={env.alpha:.2f}]",
            fontsize=14,
            fontweight="bold",
        )
        demand_mat = np.array(env._demand_history).T
        price_mat = np.array(env._price_history).T
@@ -51,40 +67,56 @@ class DashboardRenderer:
        prices_flat = np.array(env._price_history).flatten()
        demands_flat = np.array(env._demand_history).flatten()
        product_ids = np.tile(np.arange(env.n_products), len(env._price_history))
-        ax.scatter(prices_flat, demands_flat, c=product_ids, cmap='plasma', alpha=0.6, s=15, edgecolors='none')
+        ax.scatter(
            prices_flat,
            demands_flat,
            c=product_ids,
            cmap="plasma",
            alpha=0.6,
            s=15,
            edgecolors="none",
        )
        if len(prices_flat) > 1:
            z = np.polyfit(prices_flat, demands_flat, 1)
            p_line = np.linspace(prices_flat.min(), prices_flat.max(), 50)
-            ax.plot(p_line, np.polyval(z, p_line), '--', lw=1.5, alpha=0.8)
+            ax.plot(p_line, np.polyval(z, p_line), "--", lw=1.5, alpha=0.8)
        style_axis(ax, "Price-Demand Relationship", "Price ($)", "Demand")
    def _render_elasticity_bar(self, env, elasticity):
        ax = self.fig.add_subplot(self.gs[0, 1])
        ax.barh(range(env.n_products), elasticity, alpha=0.8)
        ax.axvline(0, lw=0.8, alpha=0.5)
-        ax.axvline(-1, lw=1, ls='--', alpha=0.5)
+        ax.axvline(-1, lw=1, ls="--", alpha=0.5)
        ax.set_yticks(range(env.n_products))
-        ax.set_yticklabels([f'P{i}' for i in range(env.n_products)], fontsize=7)
+        ax.set_yticklabels([f"P{i}" for i in range(env.n_products)], fontsize=7)
        style_axis(ax, "Price Elasticity", "(dQ/dP)(P/Q)", None)
    def _render_session_pie(self, env):
        ax = self.fig.add_subplot(self.gs[0, 2])
        n_h, n_a = env.market.Nhumans, env.market.Nagents
-        wedges, _ = ax.pie([n_h, n_a], startangle=90, wedgeprops={'linewidth': 2, 'edgecolor': 'white'})
+        wedges, _ = ax.pie(
-        ax.legend(wedges, [f'H ({n_h})', f'A ({n_a})'], loc='lower center', fontsize=8,
+            [n_h, n_a], startangle=90, wedgeprops={"linewidth": 2, "edgecolor": "white"}
-                  frameon=False, bbox_to_anchor=(0.5, -0.05))
+        )
-        ax.set_title("Session Mix", fontsize=11, fontweight='bold')
+        ax.legend(
            wedges,
            [f"H ({n_h})", f"A ({n_a})"],
            loc="lower center",
            fontsize=8,
            frameon=False,
            bbox_to_anchor=(0.5, -0.05),
        )
        ax.set_title("Session Mix", fontsize=11, fontweight="bold")
    def _render_price_heatmap(self, price_mat):
        ax = self.fig.add_subplot(self.gs[1, :2])
-        im = ax.imshow(price_mat, aspect='auto', cmap='viridis', origin='lower')
+        im = ax.imshow(price_mat, aspect="auto", cmap="viridis", origin="lower")
        style_axis(ax, "Price Heatmap P(product, t)", "Step", "Product")
        cbar = self.fig.colorbar(im, ax=ax, fraction=0.03, pad=0.02)
-        cbar.set_label('$', fontsize=8)
+        cbar.set_label("$", fontsize=8)
    def _render_demand_heatmap(self, demand_mat):
        ax = self.fig.add_subplot(self.gs[1, 2])
-        im = ax.imshow(demand_mat, aspect='auto', cmap='Blues', origin='lower')
+        im = ax.imshow(demand_mat, aspect="auto", cmap="Blues", origin="lower")
        style_axis(ax, "Demand Q(product, t)", "Step", None)
        self.fig.colorbar(im, ax=ax, fraction=0.046, pad=0.02)
@@ -92,11 +124,11 @@ class DashboardRenderer:
        ax = self.fig.add_subplot(self.gs[2, 0])
        if price_mat.shape[1] > 2:
            corr = np.corrcoef(price_mat, demand_mat)[:n_products, n_products:]
-            im = ax.imshow(corr, cmap='RdBu', vmin=-1, vmax=1, aspect='auto')
+            im = ax.imshow(corr, cmap="RdBu", vmin=-1, vmax=1, aspect="auto")
            ax.set_xticks(range(n_products))
            ax.set_yticks(range(n_products))
-            ax.set_xticklabels([f'Q{i}' for i in range(n_products)], fontsize=6)
+            ax.set_xticklabels([f"Q{i}" for i in range(n_products)], fontsize=6)
-            ax.set_yticklabels([f'P{i}' for i in range(n_products)], fontsize=6)
+            ax.set_yticklabels([f"P{i}" for i in range(n_products)], fontsize=6)
            self.fig.colorbar(im, ax=ax, fraction=0.046, pad=0.02)
        style_axis(ax, "Price-Demand Correlation", None, None)
@@ -105,20 +137,27 @@ class DashboardRenderer:
        n_steps = len(env._revenue_history)
        demand_std = [np.std(d) for d in env._demand_history]
        ax.fill_between(range(n_steps), env._revenue_history, alpha=0.3)
-        ax.plot(env._revenue_history, linewidth=2, label='Revenue')
+        ax.plot(env._revenue_history, linewidth=2, label="Revenue")
        ax.set_xlim(0, max(n_steps, 1))
        ax.set_ylim(0, max(env._revenue_history) * 1.1 if env._revenue_history else 1)
        ax2 = ax.twinx()
-        ax2.plot(range(n_steps), demand_std, linewidth=2, ls='-', alpha=0.9, label='sigma(Demand)')
+        ax2.plot(
            range(n_steps),
            demand_std,
            linewidth=2,
            ls="-",
            alpha=0.9,
            label="sigma(Demand)",
        )
        d_min, d_max = min(demand_std), max(demand_std)
        margin = (d_max - d_min) * 0.2 if d_max > d_min else 0.5
        ax2.set_ylim(max(0, d_min - margin), d_max + margin)
-        ax2.set_ylabel('Demand sigma', fontsize=9)
+        ax2.set_ylabel("Demand sigma", fontsize=9)
        style_axis(ax, "Revenue & Demand Dispersion", "Step", "Revenue ($)")
-        ax.legend(loc='upper left', fontsize=7, frameon=False)
+        ax.legend(loc="upper left", fontsize=7, frameon=False)
-        ax2.legend(loc='upper right', fontsize=7, frameon=False)
+        ax2.legend(loc="upper right", fontsize=7, frameon=False)
    def close(self):
        if self.fig:
--- a/engine/lib/tiers.py
+++ b/engine/lib/tiers.py
@@ -0,0 +1,101 @@
 from __future__ import annotations
 from dataclasses import dataclass
 from typing import Protocol
 import numpy as np
 class PolicyLike(Protocol):
    """Structural type for objects exposing ``predict(obs, deterministic=True)``."""
    def predict(self, obs: np.ndarray, deterministic: bool = True): ...
 class StaticPolicy:
    """Policy that always returns one fixed action: the middle index, floored at 0."""
    def __init__(self, n_actions: int):
        midpoint = n_actions // 2
        self._action = int(midpoint if midpoint > 0 else 0)
    def predict(self, obs: np.ndarray, deterministic: bool = True):
        """Ignore ``obs`` and return ``(fixed action, None)``."""
        return (self._action, None)
 class SurgePolicy:
    """Threshold rule on mean demand read from the first ``n_products`` observation entries.

    Mean demand at or above ``high_threshold`` moves two actions above the
    middle index (capped at the last action); at or below ``low_threshold``
    it moves two below (floored at 0); otherwise the middle action is used.
    """
    def __init__(
        self,
        n_actions: int,
        n_products: int,
        high_threshold: float = 60.0,
        low_threshold: float = 30.0,
    ):
        self.n_actions, self.n_products = int(n_actions), int(n_products)
        self.mid = self.n_actions // 2
        self.high_t, self.low_t = float(high_threshold), float(low_threshold)
    def predict(self, obs: np.ndarray, deterministic: bool = True):
        """Return ``(action, None)`` for the observation; an empty demand slice counts as 0.0."""
        window = np.asarray(obs, dtype=np.float32)[: self.n_products]
        level = float(np.mean(window)) if window.size > 0 else 0.0
        if level >= self.high_t:
            choice = min(self.mid + 2, self.n_actions - 1)
        elif level <= self.low_t:
            choice = max(self.mid - 2, 0)
        else:
            choice = self.mid
        return choice, None
@dataclass
class LinearElasticityPolicy:
    """Baseline that steers the mean price toward a revenue-maximizing target.

    ``fit`` estimates a linear demand curve ``demand = intercept + slope * price``
    from random-action rollouts and sets the target price to the maximizer of
    ``price * demand`` (``-intercept / (2 * slope)``), clipped to
    ``[price_low, price_high]``. ``predict`` then picks the action whose
    multiplier is closest to ``target_price / current_mean_price``.

    Until ``fit`` succeeds, the target is the midpoint of the price range.
    """
    n_actions: int
    n_products: int
    price_low: float
    price_high: float
    # Multiplier range spanned by the discrete actions. The defaults reproduce
    # the previously hard-coded 0.8..1.2 grid; pass the environment's own
    # action_scale_low / action_scale_high when they differ.
    action_scale_low: float = 0.8
    action_scale_high: float = 1.2
    def __post_init__(self):
        self.n_actions = int(self.n_actions)
        self.n_products = int(self.n_products)
        self.price_low = float(self.price_low)
        self.price_high = float(self.price_high)
        self.action_scale_low = float(self.action_scale_low)
        self.action_scale_high = float(self.action_scale_high)
        self._target_price = self._midpoint_price()
        self._action_scales = np.linspace(
            self.action_scale_low, self.action_scale_high, self.n_actions
        )
    def _midpoint_price(self) -> float:
        """Fallback target: the middle of the configured price range."""
        return 0.5 * (self.price_low + self.price_high)
    def fit(self, env, warmup_steps: int = 800, seed: int = 42):
        """Estimate the target price from random-action rollouts in ``env``.

        Args:
            env: environment whose ``reset`` returns ``(obs, info)`` and whose
                ``step`` returns a 5-tuple ending in an ``info`` dict; the
                ``"prices"`` and ``"demand"`` entries of ``info`` are read.
            warmup_steps: number of random steps to take (at least 10 are run).
            seed: seeds both the action sampler and the first ``env.reset``.

        Returns:
            ``self``, so the call can be chained.
        """
        rng = np.random.default_rng(int(seed))
        env.reset(seed=int(seed))
        prices: list[float] = []
        demands: list[float] = []
        for _ in range(int(max(10, warmup_steps))):
            action = int(rng.integers(0, self.n_actions))
            _, _, term, trunc, info = env.step(action)
            p = np.asarray(info.get("prices", []), dtype=np.float32)
            d = np.asarray(info.get("demand", []), dtype=np.float32)
            if p.size > 0 and d.size > 0:
                prices.append(float(np.mean(p)))
                demands.append(float(np.mean(d)))
            if term or trunc:
                env.reset()
        # Every path that cannot produce a usable estimate keeps the midpoint.
        self._target_price = self._midpoint_price()
        if len(prices) < 8:
            return self
        xs = np.asarray(prices)
        ys = np.asarray(demands)
        # With zero price spread the slope is unidentifiable; skip the
        # rank-deficient fit instead of relying on its arbitrary output.
        if float(xs.max() - xs.min()) <= 0.0:
            return self
        slope, intercept = np.polyfit(xs, ys, 1)
        # Only a downward-sloping demand curve has an interior revenue maximum.
        if slope < -1e-6:
            p_star = -intercept / (2.0 * slope)
            self._target_price = float(np.clip(p_star, self.price_low, self.price_high))
        return self
    def predict(self, obs: np.ndarray, deterministic: bool = True):
        """Return ``(action, None)`` moving the mean price toward the target.

        Observation entries ``[n_products : 2 * n_products]`` are read as the
        current prices; if that slice is empty the scale defaults to 1.
        """
        obs_arr = np.asarray(obs, dtype=np.float32)
        cur_prices = obs_arr[self.n_products : 2 * self.n_products]
        cur_mean = (
            float(np.mean(cur_prices)) if cur_prices.size > 0 else self._target_price
        )
        # Guard the division against a zero or negative mean price.
        scale = self._target_price / max(cur_mean, 1e-6)
        # argmin already yields a valid index into the action grid.
        return int(np.argmin(np.abs(self._action_scales - scale))), None
--- a/engine/lib/wrappers.py
+++ b/engine/lib/wrappers.py
@@ -35,7 +35,6 @@ class EconomicMetricsWrapper(gym.Wrapper):
        prices = self.env.unwrapped._prices
        demand_dict = self.env.unwrapped._demand
        demand = np.array([demand_dict.get(i, 0.0) for i in range(len(prices))])
        alpha = self.env.unwrapped.alpha
        # core calculations
        revenue = float(np.sum(prices * demand))
@@ -58,7 +57,21 @@ class EconomicMetricsWrapper(gym.Wrapper):
            "coi_level": coi_level,
            "regret": regret,
        }
-        for key in ("coi_mix", "coi_base", "coi_leakage", "coi_penalty"):
+        for key in (
            "coi_mix",
            "coi_base",
            "coi_leakage",
            "coi_penalty",
            "ux_penalty",
            "volatility",
            "profit",
            "cost_floor",
            "reward_revenue",
            "reward_total",
            "agent_prob",
            "alpha_adv",
            "alpha_nominal",
        ):
            if key in info:
                info["economics"][key] = info[key]
        info["prices"] = prices.copy()
--- a/engine/logging_utils.py
+++ b/engine/logging_utils.py
@@ -0,0 +1,33 @@
 from __future__ import annotations
 import logging
 import os
 import sys
 _CONFIGURED = False
 def _resolve_level(raw: str | None) -> int:
    name = str(raw or os.environ.get("PHANTOM_LOG_LEVEL", "INFO")).upper().strip()
    return int(getattr(logging, name, logging.INFO))
 def configure_logging(level: str | None = None) -> None:
    """Set up the ``"engine"`` logger once per process.

    The first call sets the logger's level (resolved via ``_resolve_level``),
    disables propagation, and attaches a stdout handler unless the logger
    already has handlers. Later calls return immediately, so a different
    ``level`` passed afterwards has no effect.
    """
    global _CONFIGURED
    if not _CONFIGURED:
        engine_log = logging.getLogger("engine")
        engine_log.setLevel(_resolve_level(level))
        engine_log.propagate = False
        # Respect handlers installed by someone else; only add ours if none exist.
        if not engine_log.handlers:
            stdout_handler = logging.StreamHandler(stream=sys.stdout)
            line_format = logging.Formatter(
                "%(asctime)s %(levelname)s [%(name)s] %(message)s"
            )
            stdout_handler.setFormatter(line_format)
            engine_log.addHandler(stdout_handler)
        _CONFIGURED = True
--- a/engine/orchestrators/init.py
+++ b/engine/orchestrators/init.py
@@ -0,0 +1,5 @@
 from .benchmark import run_benchmark_cli
 from .sweep_agent import run_sweep_agent
 from .train import run_train_once
 __all__ = ["run_benchmark_cli", "run_sweep_agent", "run_train_once"]
--- a/engine/orchestrators/benchmark.py
+++ b/engine/orchestrators/benchmark.py
@@ -0,0 +1,7 @@
 from __future__ import annotations
 def run_benchmark_cli(raw_args: list[str] | None = None) -> None:
    """Forward ``raw_args`` unchanged to the parent package's ``benchmark.run_cli``."""
    # The import happens at call time rather than module import time
    # (presumably to keep importing the orchestrators package light — confirm).
    from ..benchmark import run_cli
    run_cli(raw_args)
--- a/engine/orchestrators/sweep_agent.py
+++ b/engine/orchestrators/sweep_agent.py
@@ -0,0 +1,60 @@
 from __future__ import annotations
 from typing import Any, Mapping, Sequence
 from ..spec import TrainSpec, run_name
 from ..telemetry.wandb import (
    current_config,
    finish_run,
    get_wandb_module,
    init_run,
    run_agent,
 )
 from .train import run_with_active_sweep_run
 def run_sweep_agent(
    *,
    project: str,
    sweep_id: str,
    count: int,
    offline: bool,
    no_wandb: bool,
    base_overrides: Mapping[str, Any],
    kind: str,
    scenario: str,
    group: str | None,
    extra_tags: Sequence[str],
 ) -> None:
    """Run a sweep agent that executes one training run per trial.

    For every trial a run is initialised, the sweep's current config is layered
    over ``base_overrides`` to build a ``TrainSpec``, the run is renamed from
    that spec, and training is executed inside the active run. The run is
    always finished, even when the trial raises.

    Args:
        count: maximum number of trials; a non-positive value is passed on as
            ``None``.

    Raises:
        ValueError: if ``no_wandb`` is set or ``sweep_id`` is empty.
        ImportError: if ``get_wandb_module()`` returns ``None``.
    """
    if no_wandb:
        raise ValueError("sweep agent requires wandb")
    if not sweep_id:
        raise ValueError("--sweep-id is required with --sweep-agent")
    if get_wandb_module() is None:
        raise ImportError("wandb is required for sweep runs")
    wandb_mode = "online"
    if offline:
        wandb_mode = "offline"
    trial_limit = count if count > 0 else None
    def _sweep_trial() -> None:
        active_run = init_run(
            mode=wandb_mode, project=project, group=group, sweep_mode=True
        )
        try:
            # Sweep-provided values win over the caller's base overrides.
            flat_config = dict(base_overrides)
            flat_config.update(current_config())
            trial_spec = TrainSpec.from_flat(flat_config)
            if active_run is not None:
                active_run.name = run_name(trial_spec, kind=kind, scenario=scenario)
            run_with_active_sweep_run(
                trial_spec,
                kind=kind,
                scenario=scenario,
                group=group,
                extra_tags=extra_tags,
            )
        finally:
            finish_run()
    run_agent(sweep_id, _sweep_trial, count=trial_limit)
--- a/engine/orchestrators/train.py
+++ b/engine/orchestrators/train.py
@@ -0,0 +1,124 @@
 from __future__ import annotations
 import json
 from typing import Any, Sequence
 from ..spec import TrainSpec, run_metadata, run_name
 from ..telemetry.wandb import (
    finish_run,
    get_wandb_module,
    init_run,
    log_metrics,
    update_run_config,
    update_summary,
 )
 from ..train_core import run_train
 def _tags_for_run(spec: TrainSpec, kind: str, extra_tags: Sequence[str]) -> list[str]:
    tags = [
        kind,
        spec.algorithm.name,
        spec.runtime.backend,
        "vanilla" if spec.study.no_robust else "robust",
    ]
    tags.extend([tag for tag in extra_tags if tag])
    return tags
 def _print_local_metrics(metrics: dict[str, Any]) -> None:
    print(json.dumps(metrics, indent=2))
    print("PHANTOM_METRICS:" + json.dumps(metrics))
 def _log_train_events(events: list[dict[str, Any]], log_freq: int) -> None:
    if not events:
        return
    period = max(1, int(log_freq))
    last_logged_step = -period
    for event in sorted(
        [evt for evt in events if isinstance(evt, dict)],
        key=lambda evt: int(evt.get("train/global_step", 0)),
    ):
        step = int(event.get("train/global_step", 0))
        if step <= 0 or (step - last_logged_step) < period:
            continue
        log_metrics(event, step=step)
        last_logged_step = step
 def _train_and_report(spec: TrainSpec) -> dict[str, Any]:
    """Train ``spec`` and report the outcome to the currently active run.

    Throttled training events are logged first, then the final metrics at
    their ``"train/global_step"`` (falling back to the configured total
    timesteps), and finally the run summary is updated.
    """
    result = run_train(spec)
    _log_train_events(result.events, spec.runtime.log_freq)
    metrics = result.metrics
    step = int(metrics.get("train/global_step", spec.runtime.total_timesteps))
    log_metrics(metrics, step=step)
    update_summary(metrics)
    return metrics
 def run_train_once(
    spec: TrainSpec,
    *,
    project: str,
    offline: bool,
    no_wandb: bool,
    kind: str,
    scenario: str,
    group: str | None,
    extra_tags: Sequence[str],
 ) -> dict[str, Any]:
    """Run a single training job, with or without experiment tracking.

    When ``no_wandb`` is set or the wandb module is unavailable, training runs
    locally and the metrics are printed. Otherwise a run is initialised with
    the flattened spec plus run metadata as its config, training is reported
    to it, and the run is always finished, even if training raises.

    Returns:
        The final metrics dict produced by training.
    """
    # Skip the wandb lookup entirely when tracking is explicitly disabled.
    if no_wandb or get_wandb_module() is None:
        result = run_train(spec)
        _print_local_metrics(result.metrics)
        return result.metrics
    mode = "offline" if offline else "online"
    tags = _tags_for_run(spec, kind, extra_tags)
    metadata = run_metadata(
        spec,
        kind=kind,
        scenario=scenario,
        group=group,
        tags=tags,
    )
    config = spec.to_flat_dict()
    config.update(metadata)
    name = run_name(spec, kind=kind, scenario=scenario)
    init_run(
        mode=mode,
        project=project,
        config=config,
        name=name,
        tags=tags,
        group=group,
        sweep_mode=False,
    )
    try:
        return _train_and_report(spec)
    finally:
        finish_run()
 def run_with_active_sweep_run(
    spec: TrainSpec,
    *,
    kind: str,
    scenario: str,
    group: str | None,
    extra_tags: Sequence[str],
 ) -> dict[str, Any]:
    """Train inside a run that the caller has already initialised.

    The active run's config is updated with the flattened spec plus run
    metadata before training. This function neither starts nor finishes the
    run; that remains the caller's responsibility.

    Returns:
        The final metrics dict produced by training.
    """
    tags = _tags_for_run(spec, kind, extra_tags)
    metadata = run_metadata(
        spec,
        kind=kind,
        scenario=scenario,
        group=group,
        tags=tags,
    )
    update_run_config({**spec.to_flat_dict(), **metadata})
    return _train_and_report(spec)
--- a/engine/project.json
+++ b/engine/project.json
@@ -0,0 +1,100 @@
 {
  "$schema": "../node_modules/nx/schemas/project-schema.json",
  "name": "research",
  "projectType": "application",
  "sourceRoot": "engine",
  "targets": {
    "install": {
      "executor": "nx:run-commands",
      "options": {
        "command": "bash scripts/nx_research.sh install",
        "cwd": "."
      }
    },
    "test": {
      "executor": "nx:run-commands",
      "dependsOn": [
        "install"
      ],
      "options": {
        "command": ".venv/bin/pytest -v",
        "cwd": "."
      }
    },
    "train": {
      "executor": "nx:run-commands",
      "dependsOn": [
        "install"
      ],
      "options": {
        "command": "bash scripts/nx_research.sh train",
        "cwd": "."
      }
    },
    "benchmark": {
      "executor": "nx:run-commands",
      "dependsOn": [
        "install"
      ],
      "options": {
        "command": "bash scripts/nx_research.sh benchmark",
        "cwd": "."
      }
    },
    "benchmark-simple": {
      "executor": "nx:run-commands",
      "dependsOn": [
        "install"
      ],
      "options": {
        "command": "bash scripts/nx_research.sh benchmark-simple",
        "cwd": "."
      }
    },
    "benchmark-agent": {
      "executor": "nx:run-commands",
      "dependsOn": [
        "install"
      ],
      "options": {
        "command": "bash scripts/nx_research.sh benchmark-agent",
        "cwd": "."
      }
    },
    "train-agent": {
      "executor": "nx:run-commands",
      "dependsOn": [
        "install"
      ],
      "options": {
        "command": "bash scripts/nx_research.sh train-agent",
        "cwd": "."
      }
    },
    "train-bootstrap": {
      "executor": "nx:run-commands",
      "options": {
        "command": "bash scripts/nx_research.sh train-bootstrap",
        "cwd": "."
      }
    },
    "stats": {
      "executor": "nx:run-commands",
      "options": {
        "command": "bash scripts/nx_research.sh stats",
        "cwd": "."
      }
    },
    "docker-train-publish": {
      "executor": "nx:run-commands",
      "options": {
        "command": "bash scripts/nx_research.sh docker-train-publish",
        "cwd": "."
      }
    }
  },
  "tags": [
    "scope:research",
    "type:python"
  ]
 }
--- a/engine/spec.py
+++ b/engine/spec.py
@@ -0,0 +1,332 @@
 from __future__ import annotations
 from dataclasses import dataclass, field
 import os
 from typing import Any, Mapping, Sequence
 def _truthy(value: str | bool | None) -> bool:
    if isinstance(value, bool):
        return value
    if value is None:
        return False
    return str(value).strip().lower() in {"1", "true", "yes", "on"}
 def _normalize_keys(raw: Mapping[str, Any]) -> dict[str, Any]:
    alias_map = {
        "algorithm": "algo",
        "algorithm.name": "algo",
        "env.n_products": "n_products",
        "env.action_levels": "action_levels",
        "env.action_scale_low": "action_scale_low",
        "env.action_scale_high": "action_scale_high",
        "env.price_low": "price_low",
        "env.price_high": "price_high",
        "env.max_steps": "max_steps",
        "env.margin_floor": "margin_floor",
        "env.margin_floor_patience": "margin_floor_patience",
        "env.n_sessions": "N",
        "study.alpha": "alpha",
        "study.lambda_coi": "lambda_coi",
        "study.robust_radius": "robust_radius",
        "study.robust_points": "robust_points",
        "study.robust_rollouts": "robust_rollouts",
        "study.info_value": "info_value",
        "study.eta_ux": "eta_ux",
        "study.reward_profit_weight": "reward_profit_weight",
        "study.revenue_weight": "revenue_weight",
        "optimizer.learning_rate": "learning_rate",
        "optimizer.gamma": "gamma",
        "optimizer.batch_size": "batch_size",
        "optimizer.n_steps": "n_steps",
        "runtime.backend": "backend",
        "runtime.device": "device",
        "runtime.seed": "seed",
        "runtime.total_timesteps": "total_timesteps",
        "runtime.checkpoint_interval": "checkpoint_interval",
        "eval.eval_freq": "eval_freq",
        "eval.eval_episodes": "eval_episodes",
    }
    normalized: dict[str, Any] = {}
    for key, value in raw.items():
        canonical = alias_map.get(str(key), str(key))
        normalized[canonical] = value
    return normalized
@dataclass(frozen=True)
 class AlgorithmSpec:
    """Immutable choice of learning algorithm for a run."""
    # Algorithm identifier; TrainSpec.from_flat stores it lower-cased and stripped.
    name: str = "ppo"
@dataclass(frozen=True)
 class EnvSpec:
    """Immutable environment settings.

    TrainSpec.to_flat_dict exports each field under its own name, except
    ``n_sessions``, which uses the flat key ``"N"``.
    """
    n_products: int = 10
    # Exported and parsed under the flat key "N".
    n_sessions: int = 100
    price_low: float = 10.0
    price_high: float = 150.0
    action_levels: int = 9
    action_scale_low: float = 0.8
    action_scale_high: float = 1.2
    max_steps: int = 100
    margin_floor: float = 0.05
    margin_floor_patience: int = 5
@dataclass(frozen=True)
 class StudySpec:
    """Immutable study-level settings (contamination, robustness, reward weights)."""
    alpha: float = 0.3
    lambda_coi: float = 0.2
    robust_radius: float = 0.15
    robust_points: int = 5
    robust_rollouts: int = 1
    info_value: float = 1.0
    eta_ux: float = 0.5
    reward_profit_weight: float = 1.0
    revenue_weight: float = 0.01
    # When truthy, TrainSpec.from_flat forces lambda_coi=0.0, robust_radius=0.0,
    # robust_points=1 and robust_rollouts=1 regardless of the supplied values.
    no_robust: bool = False
@dataclass(frozen=True)
 class OptimizerSpec:
    """Immutable bag of optimizer and algorithm hyperparameters.

    NOTE(review): which fields a given algorithm actually reads is decided by
    the training code, which is not visible here — confirm before relying on
    any single field.
    """
    learning_rate: float = 3e-4
    gamma: float = 0.99
    buffer_size: int = 50_000
    batch_size: int = 256
    tau: float = 0.005
    train_freq: int = 1
    learning_starts: int = 1_000
    target_update_interval: int = 1_000
    exploration_fraction: float = 0.2
    exploration_final_eps: float = 0.05
    n_steps: int = 2_048
    n_epochs: int = 10
    gae_lambda: float = 0.95
    clip_range: float = 0.2
    ent_coef: float = 0.0
    q_lr: float = 0.1
    q_bins: int = 6
    eps_start: float = 1.0
    eps_end: float = 0.05
    eps_decay: float = 0.9995
    arch: str = "small"
    activation: str = "relu"
    vf_coef: float = 0.5
    max_grad_norm: float = 0.5
@dataclass(frozen=True)
 class RuntimeSpec:
    """Immutable runtime settings: tracking project, backend, device, seed, budget."""
    project: str = "capstone"
    # TrainSpec.from_flat currently stores "sb3" whatever value is requested.
    backend: str = "sb3"
    device: str = "auto"
    seed: int = 42
    total_timesteps: int = 50_000
    checkpoint_interval: int = 200_000
    model_dir: str = "engine/models"
    # Passed to the orchestrator's event logger as the minimum step gap between
    # logged training events.
    log_freq: int = 100
@dataclass(frozen=True)
 class EvalSpec:
    """Immutable evaluation settings."""
    eval_freq: int = 1_000
    eval_episodes: int = 5
    # Parsed with _truthy in TrainSpec.from_flat, so string flags are accepted.
    robust_eval_enabled: bool = True
@dataclass(frozen=True)
class TrainSpec:
    """Immutable, fully resolved description of one training run.

    Groups the algorithm, environment, study, optimizer, runtime and evaluation
    sub-specs. ``to_flat_dict`` exports them as a single flat mapping and
    ``from_flat`` rebuilds a spec from such a mapping.
    """
    algorithm: AlgorithmSpec = field(default_factory=AlgorithmSpec)
    env: EnvSpec = field(default_factory=EnvSpec)
    study: StudySpec = field(default_factory=StudySpec)
    optimizer: OptimizerSpec = field(default_factory=OptimizerSpec)
    runtime: RuntimeSpec = field(default_factory=RuntimeSpec)
    eval: EvalSpec = field(default_factory=EvalSpec)
    def to_flat_dict(self) -> dict[str, Any]:
        """Export every setting under its flat key (``n_sessions`` becomes ``"N"``)."""
        return {
            "project": self.runtime.project,
            "algo": self.algorithm.name,
            "seed": self.runtime.seed,
            "total_timesteps": self.runtime.total_timesteps,
            "eval_episodes": self.eval.eval_episodes,
            "eval_freq": self.eval.eval_freq,
            "log_freq": self.runtime.log_freq,
            "model_dir": self.runtime.model_dir,
            "backend": self.runtime.backend,
            "device": self.runtime.device,
            "checkpoint_interval": self.runtime.checkpoint_interval,
            "n_products": self.env.n_products,
            "N": self.env.n_sessions,
            "price_low": self.env.price_low,
            "price_high": self.env.price_high,
            "action_levels": self.env.action_levels,
            "action_scale_low": self.env.action_scale_low,
            "action_scale_high": self.env.action_scale_high,
            "max_steps": self.env.max_steps,
            "margin_floor": self.env.margin_floor,
            "margin_floor_patience": self.env.margin_floor_patience,
            "alpha": self.study.alpha,
            "lambda_coi": self.study.lambda_coi,
            "robust_radius": self.study.robust_radius,
            "robust_points": self.study.robust_points,
            "robust_rollouts": self.study.robust_rollouts,
            "info_value": self.study.info_value,
            "eta_ux": self.study.eta_ux,
            "reward_profit_weight": self.study.reward_profit_weight,
            "revenue_weight": self.study.revenue_weight,
            "no_robust": self.study.no_robust,
            "learning_rate": self.optimizer.learning_rate,
            "gamma": self.optimizer.gamma,
            "buffer_size": self.optimizer.buffer_size,
            "batch_size": self.optimizer.batch_size,
            "tau": self.optimizer.tau,
            "train_freq": self.optimizer.train_freq,
            "learning_starts": self.optimizer.learning_starts,
            "target_update_interval": self.optimizer.target_update_interval,
            "exploration_fraction": self.optimizer.exploration_fraction,
            "exploration_final_eps": self.optimizer.exploration_final_eps,
            "n_steps": self.optimizer.n_steps,
            "n_epochs": self.optimizer.n_epochs,
            "gae_lambda": self.optimizer.gae_lambda,
            "clip_range": self.optimizer.clip_range,
            "ent_coef": self.optimizer.ent_coef,
            "q_lr": self.optimizer.q_lr,
            "q_bins": self.optimizer.q_bins,
            "eps_start": self.optimizer.eps_start,
            "eps_end": self.optimizer.eps_end,
            "eps_decay": self.optimizer.eps_decay,
            "arch": self.optimizer.arch,
            "activation": self.optimizer.activation,
            "vf_coef": self.optimizer.vf_coef,
            "max_grad_norm": self.optimizer.max_grad_norm,
            "robust_eval_enabled": self.eval.robust_eval_enabled,
        }
    @classmethod
    def from_flat(
        cls,
        raw: Mapping[str, Any] | None = None,
        *,
        env_vars: Mapping[str, str] | None = None,
    ) -> "TrainSpec":
        """Build a spec from a flat (possibly dotted/aliased) config mapping.

        Defaults are taken from ``cls()``; entries of ``raw`` whose value is
        not ``None`` override them after key normalization.

        Args:
            raw: flat overrides; ``None`` means no overrides.
            env_vars: environment mapping consulted for ``PHANTOM_DEVICE``;
                defaults to ``os.environ``.

        Device precedence is: explicit ``device`` in ``raw``, then a non-empty
        ``PHANTOM_DEVICE``, then the dataclass default. The backend is always
        stored as ``"sb3"``. A truthy ``no_robust`` forces ``lambda_coi`` and
        ``robust_radius`` to 0.0 and ``robust_points``/``robust_rollouts`` to 1.
        """
        base = cls().to_flat_dict()
        incoming = _normalize_keys(raw or {})
        base.update({k: v for k, v in incoming.items() if v is not None})
        runtime_env = os.environ if env_vars is None else env_vars
        # `base` always holds the default device, so a plain
        # base.get("device", <env fallback>) never reached the environment.
        # Consult PHANTOM_DEVICE whenever the caller did not pick a device.
        if incoming.get("device") is None:
            base["device"] = runtime_env.get("PHANTOM_DEVICE") or base["device"]
        base["device"] = str(base["device"])
        # Every requested backend, including "auto", resolves to "sb3".
        backend = "sb3"
        no_robust = _truthy(base.get("no_robust"))
        if no_robust:
            base["lambda_coi"] = 0.0
            base["robust_radius"] = 0.0
            base["robust_points"] = 1
            base["robust_rollouts"] = 1
        return cls(
            algorithm=AlgorithmSpec(name=str(base["algo"]).lower().strip()),
            env=EnvSpec(
                n_products=int(base["n_products"]),
                n_sessions=int(base["N"]),
                price_low=float(base["price_low"]),
                price_high=float(base["price_high"]),
                action_levels=int(base["action_levels"]),
                action_scale_low=float(base["action_scale_low"]),
                action_scale_high=float(base["action_scale_high"]),
                max_steps=int(base["max_steps"]),
                margin_floor=float(base["margin_floor"]),
                margin_floor_patience=int(base["margin_floor_patience"]),
            ),
            study=StudySpec(
                alpha=float(base["alpha"]),
                lambda_coi=float(base["lambda_coi"]),
                robust_radius=float(base["robust_radius"]),
                robust_points=int(base["robust_points"]),
                robust_rollouts=int(base["robust_rollouts"]),
                info_value=float(base["info_value"]),
                eta_ux=float(base["eta_ux"]),
                reward_profit_weight=float(base["reward_profit_weight"]),
                revenue_weight=float(base["revenue_weight"]),
                no_robust=no_robust,
            ),
            optimizer=OptimizerSpec(
                learning_rate=float(base["learning_rate"]),
                gamma=float(base["gamma"]),
                buffer_size=int(base["buffer_size"]),
                batch_size=int(base["batch_size"]),
                tau=float(base["tau"]),
                train_freq=int(base["train_freq"]),
                learning_starts=int(base["learning_starts"]),
                target_update_interval=int(base["target_update_interval"]),
                exploration_fraction=float(base["exploration_fraction"]),
                exploration_final_eps=float(base["exploration_final_eps"]),
                n_steps=int(base["n_steps"]),
                n_epochs=int(base["n_epochs"]),
                gae_lambda=float(base["gae_lambda"]),
                clip_range=float(base["clip_range"]),
                ent_coef=float(base["ent_coef"]),
                q_lr=float(base["q_lr"]),
                q_bins=int(base["q_bins"]),
                eps_start=float(base["eps_start"]),
                eps_end=float(base["eps_end"]),
                eps_decay=float(base["eps_decay"]),
                arch=str(base["arch"]),
                activation=str(base["activation"]),
                vf_coef=float(base["vf_coef"]),
                max_grad_norm=float(base["max_grad_norm"]),
            ),
            runtime=RuntimeSpec(
                project=str(base["project"]),
                backend=backend,
                device=str(base["device"]),
                seed=int(base["seed"]),
                total_timesteps=int(base["total_timesteps"]),
                checkpoint_interval=int(base["checkpoint_interval"]),
                model_dir=str(base["model_dir"]),
                log_freq=int(base["log_freq"]),
            ),
            eval=EvalSpec(
                eval_freq=int(base["eval_freq"]),
                eval_episodes=int(base["eval_episodes"]),
                robust_eval_enabled=_truthy(base.get("robust_eval_enabled", True)),
            ),
        )
 def run_name(spec: TrainSpec, *, kind: str, scenario: str) -> str:
    """Compose the slash-separated run name.

    Layout: ``kind/algorithm/backend/device/scenario/s<seed>``.
    """
    runtime = spec.runtime
    segments = (
        f"{kind}",
        f"{spec.algorithm.name}",
        f"{runtime.backend}",
        f"{runtime.device}",
        f"{scenario}",
        f"s{runtime.seed}",
    )
    return "/".join(segments)
 def run_metadata(
    spec: TrainSpec,
    *,
    kind: str,
    scenario: str,
    group: str | None = None,
    tags: Sequence[str] = (),
 ) -> dict[str, Any]:
    """Describe a run as a dict of ``run.*`` keys.

    Always contains kind, algorithm, backend, device, scenario, seed and a
    list copy of ``tags``; ``"run.group"`` is added last, and only when
    ``group`` is truthy.
    """
    runtime = spec.runtime
    record: dict[str, Any] = {
        "run.kind": str(kind),
        "run.algo": spec.algorithm.name,
        "run.backend": runtime.backend,
        "run.device": runtime.device,
        "run.scenario": str(scenario),
        "run.seed": runtime.seed,
        "run.tags": list(tags),
    }
    return {**record, "run.group": group} if group else record
--- a/engine/studies/factors.py
+++ b/engine/studies/factors.py
@@ -1,7 +1,6 @@
 """shared factor definitions for experimental designs"""
 import numpy as np
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from typing import Callable, Any
@dataclass
 class Factor:
--- a/engine/studies/full_factorial.py
+++ b/engine/studies/full_factorial.py
@@ -1,5 +1,7 @@
 """full factorial design - all factor combinations"""
 import sys
 sys.path.insert(0, "..")
 import logging
 from itertools import product
@@ -12,6 +14,7 @@ from .factors import FACTORS, DEMAND_FUNCTIONS, SEEDS_PER_CONFIG
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 log = logging.getLogger(__name__)
 def generate_configs():
    """generate all factor combinations with seeds"""
    all_levels = [f.levels for f in FACTORS]
@@ -22,10 +25,13 @@ def generate_configs():
        base = {names[i]: combo[i] for i in range(len(names))}
        for seed in range(SEEDS_PER_CONFIG):
            cfg = {**base, "seed": seed}
-            cfg["id"] = hashlib.md5(json.dumps(cfg, sort_keys=True).encode()).hexdigest()[:8]
+            cfg["id"] = hashlib.md5(
                json.dumps(cfg, sort_keys=True).encode()
            ).hexdigest()[:8]
            configs.append(cfg)
    return configs
 def run_single(cfg: dict) -> dict:
    """execute one experiment config, return metrics"""
    from engine.wrapper import PHANTOM
@@ -49,7 +55,8 @@ def run_single(cfg: dict) -> dict:
        obs, reward, term, trunc, _ = env.step(action)
        total_reward += reward
        steps += 1
-        if term: break
+        if term:
            break
    env.close()
    return {
@@ -60,22 +67,28 @@ def run_single(cfg: dict) -> dict:
        "steps": steps,
    }
 def run_study(max_workers: int = None, output: str = "results_full.jsonl"):
    configs = generate_configs()
-    log.info(f"full factorial: {len(configs)} configs ({len(configs)//SEEDS_PER_CONFIG} unique × {SEEDS_PER_CONFIG} seeds)")
+    log.info(
        f"full factorial: {len(configs)} configs ({len(configs) // SEEDS_PER_CONFIG} unique × {SEEDS_PER_CONFIG} seeds)"
    )
    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        for i, result in enumerate(ex.map(run_single, configs)):
            results.append(result)
-            if (i+1) % 100 == 0: log.info(f"progress: {i+1}/{len(configs)}")
+            if (i + 1) % 100 == 0:
                log.info(f"progress: {i + 1}/{len(configs)}")
    Path(output).write_text("\n".join(json.dumps(r) for r in results))
    log.info(f"wrote {len(results)} results to {output}")
    return results
 if __name__ == "__main__":
    import argparse
    p = argparse.ArgumentParser()
    p.add_argument("--workers", type=int, default=None)
    p.add_argument("--output", default="results_full.jsonl")
@@ -83,7 +96,9 @@ if __name__ == "__main__":
    args = p.parse_args()
    configs = generate_configs()
-    log.info(f"design: {len(configs)} runs | factors: {[f.name for f in FACTORS]} | levels: {[len(f.levels) for f in FACTORS]}")
+    log.info(
        f"design: {len(configs)} runs | factors: {[f.name for f in FACTORS]} | levels: {[len(f.levels) for f in FACTORS]}"
    )
    if not args.dry_run:
        run_study(args.workers, args.output)
--- a/engine/studies/local_comparison.py
+++ b/engine/studies/local_comparison.py
@@ -0,0 +1,136 @@
 import sys
 import numpy as np
 import pandas as pd
 from pathlib import Path
 import matplotlib.pyplot as plt
 from gymnasium.wrappers import FlattenObservation
 from stable_baselines3 import PPO
 # Add parent directory to path to allow importing engine
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 from engine.wrapper import PHANTOM
 from engine.lib.wrappers import EconomicMetricsWrapper
 from engine.lib.providers import (
    ProviderBenchmark,
    BenchmarkConfig,
    RandomBaseline,
    SurgeBaseline,
 )
 def env_factory(alpha: float):
    """Build a wrapped PHANTOM environment for evaluation at one alpha level.

    The environment is wrapped with ``EconomicMetricsWrapper`` and then
    ``FlattenObservation``.
    """
    settings = dict(
        # n_products=8 matches the pretrained model's expectation of Box(16,)
        n_products=8,
        alpha=alpha,
        N=100,
        # action_levels=9 matches the trained PPO model
        action_levels=9,
        action_scale_low=0.8,
        action_scale_high=1.2,
        max_steps=20,  # short episodes keep the simulation fast
        robust_points=1,  # disable expensive adversarial lookaheads
        render_mode=None,
    )
    return FlattenObservation(EconomicMetricsWrapper(PHANTOM(**settings)))
 def _plot_metric_by_alpha(position, df, names, column, marker, title, ylabel):
    """Draw one subplot of ``column`` against alpha, one line per provider name."""
    plt.subplot(1, 2, position)
    for name in names:
        provider_data = df[df["name"] == name]
        plt.plot(
            provider_data["alpha"],
            provider_data[column],
            marker=marker,
            label=name,
            linewidth=2,
        )
    plt.title(title)
    plt.xlabel("Contamination Level (α)")
    plt.ylabel(ylabel)
    plt.grid(True, linestyle="--", alpha=0.7)
    plt.legend()
 def main():
    """Benchmark baseline strategies against the pre-trained PPO model.

    Loads ``models/phantom_ppo.zip`` (returning early with a message if it is
    missing), runs the provider benchmark across the configured alpha levels,
    prints the summary table, and writes the raw results as CSV and a
    two-panel plot into a ``results`` directory next to this script.
    """
    print("Loading pre-trained Robust RL model...")
    model_path = Path(__file__).parent.parent / "models" / "phantom_ppo.zip"
    if not model_path.exists():
        print(f"Error: Model not found at {model_path}")
        print("Please ensure you have a trained model before running this script.")
        return
    rl_model = PPO.load(model_path)
    # The action space is Discrete(9). Index 4 is the middle (1.0 scale).
    n_actions = 9
    mid_action = n_actions // 2
    providers = {
        "Static (Base)": lambda obs: mid_action,
        "Random": RandomBaseline(n_actions),
        "Heuristic Surge": SurgeBaseline(
            n_actions, high_threshold=60.0, low_threshold=30.0
        ),
        "Robust RL (PPO)": lambda obs: rl_model.predict(obs, deterministic=True)[0],
    }
    config = BenchmarkConfig(
        n_episodes=10,  # Lower episodes to run faster
        alpha_range=[0.0, 0.5, 1.0],  # Fewer alpha levels
        baseline_name="Static (Base)",
    )
    print(f"\nStarting benchmark across alpha levels: {config.alpha_range}")
    print(
        f"Testing {len(providers)} strategies for {config.n_episodes} episodes each...\n"
    )
    benchmark = ProviderBenchmark(env_factory, providers, config)
    # The return value is not needed here; results are read back from the
    # benchmark object below.
    benchmark.run()
    # 1. Print tabular results
    df = benchmark.to_dataframe()
    summary = benchmark.summary_table()
    print("\n--- Benchmark Summary Table ---")
    print(summary)
    # 2. Save results to CSV for thesis inclusion
    out_dir = Path(__file__).parent / "results"
    out_dir.mkdir(exist_ok=True)
    csv_path = out_dir / "provider_comparison.csv"
    df.to_csv(csv_path, index=False)
    print(f"\nSaved raw results to {csv_path}")
    # 3. Plot the degradation of COI / Revenue as alpha increases
    fig = plt.figure(figsize=(12, 5))
    names = list(providers.keys())
    _plot_metric_by_alpha(
        1,
        df,
        names,
        column="mean_revenue",
        marker="o",
        title="Revenue under Agent Contamination",
        ylabel="Mean Episode Revenue ($)",
    )
    _plot_metric_by_alpha(
        2,
        df,
        names,
        column="coi_preserved_pct",
        marker="s",
        title="Cost of Information (COI) Preservation",
        ylabel="COI Preserved (%)",
    )
    plt.tight_layout()
    plot_path = out_dir / "alpha_degradation_plot.png"
    plt.savefig(plot_path, dpi=300)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
    print(f"Saved visualization to {plot_path}")
 if __name__ == "__main__":
    main()
--- a/engine/studies/mixed_lh.py
+++ b/engine/studies/mixed_lh.py
@@ -1,5 +1,7 @@
 """mixed design: full factorial on primary factors, latin hypercube on secondary"""
 import sys
 sys.path.insert(0, "..")
 import logging
 from itertools import product
@@ -16,6 +18,7 @@ log = logging.getLogger(__name__)
 LH_SAMPLES = 10
 def generate_configs(lh_samples: int = LH_SAMPLES):
    primary = [f for f in FACTORS if f.primary]
    secondary = [f for f in FACTORS if not f.primary]
@@ -28,7 +31,9 @@ def generate_configs(lh_samples: int = LH_SAMPLES):
        samples = lhs.random(n=lh_samples)
        for s in samples:
            sec_vals = {
-                secondary[i].name: secondary[i].levels[int(s[i] * len(secondary[i].levels))]
+                secondary[i].name: secondary[i].levels[
                    int(s[i] * len(secondary[i].levels))
                ]
                for i in range(len(secondary))
            }
            base = {primary[i].name: p_combo[i] for i in range(len(primary))}
@@ -36,10 +41,13 @@ def generate_configs(lh_samples: int = LH_SAMPLES):
            for seed in range(SEEDS_PER_CONFIG):
                cfg = {**base, "seed": seed}
-                cfg["id"] = hashlib.md5(json.dumps(cfg, sort_keys=True).encode()).hexdigest()[:8]
+                cfg["id"] = hashlib.md5(
                    json.dumps(cfg, sort_keys=True).encode()
                ).hexdigest()[:8]
                configs.append(cfg)
    return configs
 def run_single(cfg: dict) -> dict:
    from engine.wrapper import PHANTOM
    import numpy as np
@@ -62,7 +70,8 @@ def run_single(cfg: dict) -> dict:
        obs, reward, term, trunc, _ = env.step(action)
        total_reward += reward
        steps += 1
-        if term: break
+        if term:
            break
    env.close()
    return {
@@ -73,23 +82,33 @@ def run_single(cfg: dict) -> dict:
        "steps": steps,
    }
-def run_study(max_workers: int = None, output: str = "results_mixed.jsonl", lh_samples: int = LH_SAMPLES):
+
 def run_study(
    max_workers: int = None,
    output: str = "results_mixed.jsonl",
    lh_samples: int = LH_SAMPLES,
 ):
    configs = generate_configs(lh_samples)
    n_primary_cells = int(np.prod([len(f.levels) for f in FACTORS if f.primary]))
-    log.info(f"mixed LH: {len(configs)} configs ({n_primary_cells} primary × {lh_samples} LH × {SEEDS_PER_CONFIG} seeds)")
+    log.info(
        f"mixed LH: {len(configs)} configs ({n_primary_cells} primary × {lh_samples} LH × {SEEDS_PER_CONFIG} seeds)"
    )
    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        for i, result in enumerate(ex.map(run_single, configs)):
            results.append(result)
-            if (i+1) % 100 == 0: log.info(f"progress: {i+1}/{len(configs)}")
+            if (i + 1) % 100 == 0:
                log.info(f"progress: {i + 1}/{len(configs)}")
    Path(output).write_text("\n".join(json.dumps(r) for r in results))
    log.info(f"wrote {len(results)} results to {output}")
    return results
 if __name__ == "__main__":
    import argparse
    p = argparse.ArgumentParser()
    p.add_argument("--workers", type=int, default=None)
    p.add_argument("--output", default="results_mixed.jsonl")
@@ -100,7 +119,9 @@ if __name__ == "__main__":
    primary = [f for f in FACTORS if f.primary]
    secondary = [f for f in FACTORS if not f.primary]
    configs = generate_configs(args.lh_samples)
-    log.info(f"design: {len(configs)} runs | primary: {[f.name for f in primary]} | secondary (LH): {[f.name for f in secondary]}")
+    log.info(
        f"design: {len(configs)} runs | primary: {[f.name for f in primary]} | secondary (LH): {[f.name for f in secondary]}"
    )
    if not args.dry_run:
        run_study(args.workers, args.output, args.lh_samples)
--- a/engine/sweeps/model_mix.yaml
+++ b/engine/sweeps/model_mix.yaml
@@ -1,6 +1,6 @@
 method: random
 metric:
-  name: sweep/score
+  name: objective/score
  goal: maximize
 command:
  - ${env}
--- a/engine/sweeps/models_only.yaml
+++ b/engine/sweeps/models_only.yaml
@@ -1,6 +1,6 @@
 method: grid
 metric:
-  name: sweep/score
+  name: objective/score
  goal: maximize
 run_cap: 4
 command:
--- a/engine/sweeps/sac_tune.yaml
+++ b/engine/sweeps/sac_tune.yaml
@@ -1,6 +1,6 @@
 method: bayes
 metric:
-  name: sweep/score
+  name: objective/score
  goal: maximize
 command:
  - ${env}
--- a/engine/sweeps/small_arch_compare.yaml
+++ b/engine/sweeps/small_arch_compare.yaml
@@ -1,6 +1,6 @@
 method: random
 metric:
-  name: sweep/score
+  name: objective/score
  goal: maximize
 command:
  - ${env}
--- a/engine/sweeps/tpu_jax.yaml
+++ b/engine/sweeps/tpu_jax.yaml
@@ -1,93 +0,0 @@
 method: bayes
 metric:
  name: sweep/score
  goal: maximize
 command:
  - ${env}
  - python
  - -m
  - engine.train
 parameters:
  # fixed: always use JAX backend so TPU chips are actually exercised
  use_jax:
    value: true
  # all four algos have JAX implementations
  algo:
    values: [ppo, a2c, dqn, qtable]
  total_timesteps:
    values: [50000, 80000, 120000]
  checkpoint_interval:
    value: 200000
  seed:
    values: [13, 42, 77]
  n_products:
    values: [8, 10, 12]
  # COI framework parameters -- primary research variables
  alpha:
    distribution: uniform
    min: 0.1
    max: 0.6
  lambda_coi:
    distribution: uniform
    min: 0.05
    max: 0.6
  robust_radius:
    distribution: uniform
    min: 0.0
    max: 0.3
  robust_points:
    values: [3, 5, 7]
  info_value:
    distribution: uniform
    min: 0.5
    max: 2.0
  revenue_weight:
    values: [0.005, 0.01, 0.02]
  # shared hyperparameters
  learning_rate:
    distribution: log_uniform_values
    min: 1.0e-5
    max: 1.0e-3
  gamma:
    values: [0.97, 0.99, 0.995]
  # JAX parallelism -- key lever for TPU throughput
  jax_num_envs:
    values: [8, 16, 32]
  jax_num_steps:
    values: [64, 128, 256]
  jax_num_minibatches:
    values: [2, 4, 8]
  jax_update_epochs:
    values: [2, 4, 8]
  # PPO/A2C specific
  gae_lambda:
    values: [0.9, 0.95, 0.98]
  clip_range:
    values: [0.1, 0.2, 0.3]
  ent_coef:
    values: [0.0, 0.005, 0.01]
  # DQN specific
  buffer_size:
    values: [20000, 50000, 100000]
  batch_size:
    values: [128, 256, 512]
  learning_starts:
    values: [500, 1000, 3000]
  exploration_fraction:
    values: [0.1, 0.2, 0.3]
  exploration_final_eps:
    values: [0.01, 0.03, 0.05]
  # QTable specific
  q_lr:
    values: [0.03, 0.05, 0.1, 0.2]
  eps_end:
    values: [0.02, 0.05, 0.1]
  eps_decay:
    values: [0.999, 0.9995, 0.9999]
  # action space
  action_levels:
    values: [7, 9, 11]
  action_scale_low:
    values: [0.75, 0.8, 0.85]
  action_scale_high:
    values: [1.15, 1.2, 1.25]
--- a/engine/sweeps/tpu_pod.yaml
+++ b/engine/sweeps/tpu_pod.yaml
@@ -1,64 +0,0 @@
 method: bayes
 metric:
  name: sweep/score
  goal: maximize
 command:
  - ${env}
  - python
  - -m
  - engine.train
 parameters:
  use_jax:
    value: true
  # pmap requires all workers to compile the same computation graph shape,
  # so structural params are fixed -- only research/scalar params are swept
  algo:
    values: [ppo, a2c]
  jax_num_envs:
    value: 32
  jax_num_steps:
    value: 128
  jax_num_minibatches:
    value: 4
  jax_update_epochs:
    value: 4
  total_timesteps:
    value: 100000
  checkpoint_interval:
    value: 200000
  n_products:
    value: 10
  action_levels:
    value: 9
  # research parameters -- primary sweep targets
  alpha:
    distribution: uniform
    min: 0.1
    max: 0.6
  lambda_coi:
    distribution: uniform
    min: 0.05
    max: 0.6
  robust_radius:
    distribution: uniform
    min: 0.0
    max: 0.3
  info_value:
    distribution: uniform
    min: 0.5
    max: 2.0
  revenue_weight:
    values: [0.005, 0.01, 0.02]
  # training hyperparameters
  learning_rate:
    distribution: log_uniform_values
    min: 1.0e-5
    max: 1.0e-3
  gamma:
    values: [0.97, 0.99, 0.995]
  gae_lambda:
    values: [0.9, 0.95, 0.98]
  clip_range:
    values: [0.1, 0.2, 0.3]
  ent_coef:
    values: [0.0, 0.005, 0.01]
--- a/engine/telemetry/init.py
+++ b/engine/telemetry/init.py
@@ -0,0 +1,23 @@
 from .metrics import canonicalize_metrics
 from .wandb import (
    current_config,
    finish_run,
    get_wandb_module,
    init_run,
    log_metrics,
    run_agent,
    update_run_config,
    update_summary,
 )
 __all__ = [
    "canonicalize_metrics",
    "current_config",
    "finish_run",
    "get_wandb_module",
    "init_run",
    "log_metrics",
    "run_agent",
    "update_run_config",
    "update_summary",
 ]
--- a/engine/telemetry/metrics.py
+++ b/engine/telemetry/metrics.py
@@ -0,0 +1,62 @@
 from __future__ import annotations
 from typing import Any, Mapping
 from ..spec import TrainSpec
 _ALIASES = {
    "train/reward": "train/reward_mean",
    "train/revenue": "train/revenue_mean",
    "train/dqn_loss": "train/loss",
    "eval/reward": "eval/reward_mean",
    "eval/revenue": "eval/revenue_mean",
    "train/steps_per_second": "runtime/steps_per_second",
 }
 def _as_float(value: Any, default: float | None = None) -> float | None:
    if value is None:
        return default
    try:
        return float(value)
    except (TypeError, ValueError):
        return default
 def canonicalize_metrics(raw: Mapping[str, Any], spec: TrainSpec) -> dict[str, Any]:
    """Re-key ``raw`` to canonical metric names and attach derived fields.

    Keys listed in ``_ALIASES`` are renamed to their canonical form; every
    other key is kept as ``str(key)``. The result additionally carries
    ``objective/*``, ``study/*`` and ``runtime/*`` entries read from ``spec``.
    """
    metrics: dict[str, Any] = {}
    for key, value in raw.items():
        name = str(key)
        canonical = _ALIASES.get(name, name)
        # A renamed entry never replaces a value already stored under the
        # canonical name; an entry already using that name always writes.
        if canonical != key and canonical in metrics:
            continue
        metrics[canonical] = value

    metrics.setdefault("train/global_step", spec.runtime.total_timesteps)

    # The score prefers the worst-case robust reward whenever that key exists,
    # otherwise the mean eval reward; anything falsy or unparsable becomes 0.0.
    if "eval/robust_reward_worst" in metrics:
        score_source = metrics["eval/robust_reward_worst"]
    else:
        score_source = metrics.get("eval/reward_mean")
    score = _as_float(score_source, 0.0)
    metrics["objective/score"] = score if score else 0.0

    margin_mean = _as_float(metrics.get("eval/margin_mean"))
    if margin_mean is not None:
        metrics["objective/constraint_margin"] = margin_mean - spec.env.margin_floor

    coi_level = _as_float(metrics.get("eval/coi_level_mean"))
    metrics["objective/coi_preserved"] = coi_level if coi_level is not None else 0.0

    metrics.update(
        {
            "study/alpha": spec.study.alpha,
            "study/lambda_coi": spec.study.lambda_coi,
            "study/robust_radius": spec.study.robust_radius,
            "study/info_value": spec.study.info_value,
            "runtime/backend": spec.runtime.backend,
            "runtime/device": spec.runtime.device,
            "runtime/seed": spec.runtime.seed,
        }
    )
    return metrics
--- a/engine/telemetry/wandb.py
+++ b/engine/telemetry/wandb.py
@@ -0,0 +1,98 @@
 from __future__ import annotations
 from typing import Any, Callable, Iterable, Mapping
 def get_wandb_module():
    """Return the imported ``wandb`` module, or ``None`` if the import fails."""
    try:
        import wandb
    except ImportError:
        return None
    return wandb
 def _require_wandb():
    """Return the ``wandb`` module or raise ``ImportError`` when it is unavailable."""
    module = get_wandb_module()
    if module is not None:
        return module
    raise ImportError("wandb is required for this workflow")
 def init_run(
    *,
    mode: str,
    project: str | None = None,
    config: Mapping[str, Any] | None = None,
    name: str | None = None,
    tags: Iterable[str] | None = None,
    group: str | None = None,
    sweep_mode: bool = False,
 ):
    """Start a W&B run and return whatever ``wandb.init`` returns.

    In sweep mode only ``mode`` and ``group`` are forwarded to ``wandb.init``;
    ``name`` is assigned to the returned run afterwards, and ``project``,
    ``config`` and ``tags`` are not used. Otherwise ``project`` is always
    forwarded and ``config``/``name``/``tags`` are forwarded when provided.

    Raises:
        ImportError: if ``wandb`` cannot be imported.
    """
    wandb = _require_wandb()
    init_kwargs: dict[str, Any] = {"mode": mode}
    if group:
        init_kwargs["group"] = group

    if not sweep_mode:
        init_kwargs["project"] = project
        if config is not None:
            init_kwargs["config"] = dict(config)
        if name:
            init_kwargs["name"] = name
        if tags:
            init_kwargs["tags"] = list(tags)
        return wandb.init(**init_kwargs)

    run = wandb.init(**init_kwargs)
    if name and run is not None:
        run.name = name
    return run
 def finish_run() -> None:
    """Call ``wandb.finish()`` when wandb is importable and a run is active."""
    wandb = get_wandb_module()
    if wandb is None or wandb.run is None:
        return
    wandb.finish()
 def current_config() -> dict[str, Any]:
    """Return the active run's ``wandb.config`` as a plain dict.

    An empty dict is returned when wandb is not importable or no run is active.
    """
    wandb = get_wandb_module()
    snapshot: dict[str, Any] = {}
    if wandb is not None and wandb.run is not None:
        for key in wandb.config.keys():
            snapshot[key] = wandb.config[key]
    return snapshot
 def update_run_config(config: Mapping[str, Any]) -> None:
    """Merge ``config`` into ``wandb.config`` for the active run.

    Does nothing when wandb is not importable or no run is active.
    """
    wandb = get_wandb_module()
    if wandb is not None and wandb.run is not None:
        try:
            wandb.config.update(dict(config), allow_val_change=True)
        except TypeError:
            # Retry without the keyword; presumably covers config objects whose
            # update() does not accept allow_val_change -- TODO confirm.
            wandb.config.update(dict(config))
 def log_metrics(metrics: Mapping[str, Any], *, step: int) -> None:
    """Log a copy of ``metrics`` at ``step`` if a W&B run is active; else no-op."""
    wandb = get_wandb_module()
    if wandb is not None and wandb.run is not None:
        wandb.log(dict(metrics), step=step)
 def update_summary(metrics: Mapping[str, Any]) -> None:
    """Copy each ``metrics`` entry into the active run's summary; else no-op."""
    wandb = get_wandb_module()
    if wandb is not None and wandb.run is not None:
        for key, value in metrics.items():
            wandb.run.summary[key] = value
 def run_agent(
    sweep_id: str,
    fn: Callable[[], None],
    *,
    count: int | None = None,
 ) -> None:
    """Hand ``fn`` to ``wandb.agent`` for sweep ``sweep_id``.

    ``count`` is forwarded unchanged as the agent's ``count`` argument.

    Raises:
        ImportError: if ``wandb`` cannot be imported.
    """
    _require_wandb().agent(sweep_id, function=fn, count=count)
--- a/engine/train.py
+++ b/engine/train.py
@@ -1,512 +1,133 @@
 from __future__ import annotations
 import argparse
-import json
+from typing import Any
 import os
 from pathlib import Path
 import numpy as np
-from .wandb_checkpoint import checkpoint_artifact_name, download_latest_checkpoint
+from .logging_utils import configure_logging
-
+from .orchestrators import run_benchmark_cli, run_sweep_agent, run_train_once
-try:
+from .spec import TrainSpec
    import wandb as _wandb
    if hasattr(_wandb, "init") and callable(_wandb.init):
        wandb = _wandb
        HAS_WANDB = True
    else:
        wandb = None
        HAS_WANDB = False
 except ImportError:
    wandb = None
    HAS_WANDB = False
 try:
    from stable_baselines3 import PPO, A2C, DQN
    from stable_baselines3.common.callbacks import EvalCallback
    from stable_baselines3.common.monitor import Monitor
    HAS_SB3 = True
 except ImportError:
    HAS_SB3 = False
 from .jax import JAX_AVAILABLE
-DEFAULT_CFG = {
+def _parse_tags(raw: str | None) -> list[str]:
-    "project": "phantom-pricing",
+    if raw is None:
-    "algo": "ppo",
+        return []
-    "seed": 42,
+    return [piece.strip() for piece in str(raw).split(",") if piece.strip()]
    "total_timesteps": 50_000,
    "eval_episodes": 5,
    "eval_freq": 1_000,
    "log_freq": 100,
    "revenue_weight": 0.01,
    "n_products": 10,
    "N": 100,
    "alpha": 0.3,
    "lambda_coi": 0.2,
    "robust_radius": 0.15,
    "robust_points": 5,
    "info_value": 1.0,
    "price_low": 10.0,
    "price_high": 150.0,
    "action_levels": 9,
    "action_scale_low": 0.8,
    "action_scale_high": 1.2,
    "learning_rate": 3e-4,
    "gamma": 0.99,
    "buffer_size": 50_000,
    "batch_size": 256,
    "tau": 0.005,
    "train_freq": 1,
    "learning_starts": 1_000,
    "target_update_interval": 1_000,
    "exploration_fraction": 0.2,
    "exploration_final_eps": 0.05,
    "n_steps": 2_048,
    "n_epochs": 10,
    "gae_lambda": 0.95,
    "clip_range": 0.2,
    "ent_coef": 0.0,
    "q_lr": 0.1,
    "eps_start": 1.0,
    "eps_end": 0.05,
    "eps_decay": 0.9995,
    "model_dir": "engine/models",
    "arch": "small",
    "activation": "relu",
    "q_bins": 6,
    "max_steps": 100,
    "margin_floor": 0.05,
    "margin_floor_patience": 5,
    "use_jax": False,
    "jax_num_envs": 16,
    "jax_num_steps": 128,
    "jax_num_minibatches": 4,
    "jax_update_epochs": 4,
    "jax_anneal_lr": True,
    "checkpoint_interval": 200_000,
 }
-def _truthy(value: str | bool | None) -> bool:
+def _probe_run_kind(argv: list[str]) -> str:
-    if isinstance(value, bool): return value
+    probe = argparse.ArgumentParser(add_help=False)
-    if value is None: return False
+    probe.add_argument("--run-kind", choices=["train", "benchmark"])
-    return str(value).strip().lower() in {"1", "true", "yes", "on"}
+    probe.add_argument("--run-mode", choices=["train", "benchmark"])
    args, _ = probe.parse_known_args(argv)
    return str(args.run_kind or args.run_mode or "train")
-def _cfg(raw: dict | None = None) -> dict:
+def _strip_run_kind(argv: list[str]) -> list[str]:
-    cfg = dict(DEFAULT_CFG)
+    stripped: list[str] = []
-    if raw:
+    skip_next = False
-        cfg.update({k: v for k, v in raw.items() if v is not None})
+    for item in argv:
-    cfg["algo"] = str(cfg["algo"]).lower()
+        if skip_next:
-    cfg["use_jax"] = _truthy(cfg.get("use_jax")) or _truthy(
+            skip_next = False
-        os.environ.get("PHANTOM_USE_JAX")
+            continue
-    )
+        if item in {"--run-kind", "--run-mode"}:
-    return cfg
+            skip_next = True
            continue
        if item.startswith("--run-kind=") or item.startswith("--run-mode="):
            continue
        stripped.append(item)
    return stripped
-def _wandb_cfg_dict() -> dict:
+def _build_parser() -> argparse.ArgumentParser:
-    return (
+    parser = argparse.ArgumentParser(description="PHANTOM unified training entrypoint")
-        {k: wandb.config[k] for k in wandb.config.keys()}
+    parser.add_argument("--run-kind", choices=["train", "benchmark"], default="train")
-        if HAS_WANDB and wandb.run
+    parser.add_argument("--run-mode", choices=["train", "benchmark"])
-        else {}
+
-    )
+    parser.add_argument("--project", default="capstone")
    parser.add_argument("--scenario", default="default")
    parser.add_argument("--group", type=str)
    parser.add_argument("--tags", type=str)
    parser.add_argument("--backend", choices=["auto", "sb3"], default="auto")
    parser.add_argument("--algo", choices=["ppo", "a2c", "dqn", "qtable", "sac"])
    parser.add_argument("--seed", type=int)
    parser.add_argument("--total-timesteps", type=int)
    parser.add_argument("--model-dir", type=str)
    parser.add_argument("--log-freq", type=int)
    parser.add_argument("--checkpoint-interval", type=int)
    parser.add_argument("--device", type=str)
    parser.add_argument("--alpha", type=float)
    parser.add_argument("--N", type=int)
    parser.add_argument("--n-products", type=int)
    parser.add_argument("--lambda-coi", type=float)
    parser.add_argument("--info-value", type=float)
    parser.add_argument("--robust-radius", type=float)
    parser.add_argument("--robust-points", type=int)
    parser.add_argument("--robust-rollouts", type=int)
    parser.add_argument("--no-robust", action="store_true")
    parser.add_argument("--eta-ux", type=float)
    parser.add_argument("--reward-profit-weight", type=float)
    parser.add_argument("--revenue-weight", type=float)
    parser.add_argument("--price-low", type=float)
    parser.add_argument("--price-high", type=float)
    parser.add_argument("--action-levels", type=int)
    parser.add_argument("--action-scale-low", type=float)
    parser.add_argument("--action-scale-high", type=float)
    parser.add_argument("--max-steps", type=int)
    parser.add_argument("--margin-floor", type=float)
    parser.add_argument("--margin-floor-patience", type=int)
    parser.add_argument("--learning-rate", type=float)
    parser.add_argument("--gamma", type=float)
    parser.add_argument("--buffer-size", type=int)
    parser.add_argument("--batch-size", type=int)
    parser.add_argument("--tau", type=float)
    parser.add_argument("--train-freq", type=int)
    parser.add_argument("--learning-starts", type=int)
    parser.add_argument("--target-update-interval", type=int)
    parser.add_argument("--exploration-fraction", type=float)
    parser.add_argument("--exploration-final-eps", type=float)
    parser.add_argument("--n-steps", type=int)
    parser.add_argument("--n-epochs", type=int)
    parser.add_argument("--gae-lambda", type=float)
    parser.add_argument("--clip-range", type=float)
    parser.add_argument("--ent-coef", type=float)
    parser.add_argument("--q-lr", type=float)
    parser.add_argument("--q-bins", type=int)
    parser.add_argument("--eps-start", type=float)
    parser.add_argument("--eps-end", type=float)
    parser.add_argument("--eps-decay", type=float)
    parser.add_argument("--arch", type=str)
    parser.add_argument("--activation", type=str)
    parser.add_argument("--vf-coef", type=float)
    parser.add_argument("--max-grad-norm", type=float)
    parser.add_argument("--eval-freq", type=int)
    parser.add_argument("--eval-episodes", type=int)
    parser.add_argument("--sweep-agent", action="store_true")
    parser.add_argument("--sweep-id", type=str)
    parser.add_argument("--count", type=int, default=0)
    parser.add_argument("--offline", action="store_true")
    parser.add_argument("--no-wandb", action="store_true")
    return parser
-def make_env(cfg: dict):
+def _overrides_from_args(args: argparse.Namespace) -> dict[str, Any]:
-    from gymnasium.wrappers import FlattenObservation
+    backend = None if args.backend == "auto" else args.backend
    from .wrapper import PHANTOM
    from .lib.wrappers import EconomicMetricsWrapper
    env = PHANTOM(
        n_products=int(cfg["n_products"]),
        alpha=float(cfg["alpha"]),
        N=int(cfg["N"]),
        price_bounds=(float(cfg["price_low"]), float(cfg["price_high"])),
        lambda_coi=float(cfg["lambda_coi"]),
        robust_radius=float(cfg["robust_radius"]),
        robust_points=int(cfg["robust_points"]),
        info_value=float(cfg["info_value"]),
        action_levels=int(cfg["action_levels"]),
        action_scale_low=float(cfg["action_scale_low"]),
        action_scale_high=float(cfg["action_scale_high"]),
        max_steps=int(cfg.get("max_steps", 100)),
        margin_floor=float(cfg.get("margin_floor", 0.05)),
        margin_floor_patience=int(cfg.get("margin_floor_patience", 5)),
        render_mode=None,
    )
    env = EconomicMetricsWrapper(env)
    env = FlattenObservation(env)
    return env
 def _net_arch(name) -> list[int]:
    presets = {
        "tiny": [32, 32],
        "small": [64, 64],
        "medium": [128, 128],
        "large": [256, 256],
    }
    if isinstance(name, (list, tuple)):
        return [int(v) for v in name]
    s = str(name).lower().strip()
    if s in presets:
        return presets[s]
    if "x" in s:
        try:
            vals = [int(v) for v in s.split("x") if v]
            return vals if vals else presets["small"]
        except ValueError:
            return presets["small"]
    return presets["small"]
 def _activation(name):
    try:
        import torch.nn as nn
    except ImportError:
        return None
    return {
        "relu": nn.ReLU,
        "tanh": nn.Tanh,
        "elu": nn.ELU,
        "leaky_relu": nn.LeakyReLU,
    }.get(str(name).lower().strip(), nn.ReLU)
 def _policy_kwargs(cfg: dict) -> dict:
    """Build policy kwargs from ``cfg["arch"]`` and ``cfg["activation"]``.

    ``activation_fn`` is only included when ``_activation`` returns a class.
    """
    arch = _net_arch(cfg.get("arch", "small"))
    activation_cls = _activation(cfg.get("activation", "relu"))
    if activation_cls is None:
        return {"net_arch": arch}
    return {"net_arch": arch, "activation_fn": activation_cls}
 def _action(agent, obs, deterministic: bool = True):
    out = agent.predict(obs, deterministic=deterministic)
    a = out[0] if isinstance(out, tuple) else out
    if isinstance(a, np.ndarray) and a.size == 1:
        return int(a.reshape(-1)[0])
    return a
 def evaluate(agent, env, episodes: int) -> dict:
    """Roll out ``agent`` deterministically and summarise reward and revenue.

    Each episode runs until ``env.step`` reports termination or truncation.
    Per-step revenue is read from ``info["economics"]["revenue"]``, falling
    back to ``info["revenue"]`` and then ``0.0``.

    Returns:
        Mean and standard deviation of per-episode reward and revenue under
        the ``eval/*`` keys.
    """
    episode_rewards, episode_revenues = [], []
    for _episode in range(int(episodes)):
        obs, _reset_info = env.reset()
        reward_sum = 0.0
        revenue_sum = 0.0
        finished = False
        while not finished:
            action = _action(agent, obs, True)
            obs, reward, term, trunc, info = env.step(action)
            finished = term or trunc
            reward_sum += float(reward)
            economics = info.get("economics", {})
            revenue_sum += float(economics.get("revenue", info.get("revenue", 0.0)))
        episode_rewards.append(reward_sum)
        episode_revenues.append(revenue_sum)

    summary = {}
    summary["eval/reward"] = float(np.mean(episode_rewards))
    summary["eval/revenue"] = float(np.mean(episode_revenues))
    summary["eval/reward_std"] = float(np.std(episode_rewards))
    summary["eval/revenue_std"] = float(np.std(episode_revenues))
    return summary
 def build_model(cfg: dict, env):
    """Construct an untrained SB3 model for ``cfg["algo"]`` bound to ``env``.

    Supported values are ``"ppo"``, ``"a2c"`` and ``"dqn"``; every model uses
    ``"MlpPolicy"`` with the policy kwargs produced by ``_policy_kwargs``.

    Raises:
        ValueError: for ``"sac"`` and for any other unrecognised algo.
    """
    algo = cfg["algo"]
    policy_kwargs = _policy_kwargs(cfg)
    # Rejected up front with a dedicated message rather than the generic one below.
    if algo == "sac":
        raise ValueError("sac is not supported with the discrete core env")
    if algo == "ppo":
        return PPO(
            "MlpPolicy",
            env,
            verbose=1,
            policy_kwargs=policy_kwargs,
            seed=int(cfg["seed"]),
            learning_rate=float(cfg["learning_rate"]),
            n_steps=int(cfg["n_steps"]),
            batch_size=int(cfg["batch_size"]),
            n_epochs=int(cfg["n_epochs"]),
            gamma=float(cfg["gamma"]),
            gae_lambda=float(cfg["gae_lambda"]),
            clip_range=float(cfg["clip_range"]),
            ent_coef=float(cfg["ent_coef"]),
        )
    if algo == "a2c":
        return A2C(
            "MlpPolicy",
            env,
            verbose=1,
            policy_kwargs=policy_kwargs,
            seed=int(cfg["seed"]),
            learning_rate=float(cfg["learning_rate"]),
            # A2C reuses the shared n_steps setting scaled down by 32, never below 5.
            n_steps=max(5, int(cfg["n_steps"]) // 32),
            gamma=float(cfg["gamma"]),
            gae_lambda=float(cfg["gae_lambda"]),
            ent_coef=float(cfg["ent_coef"]),
        )
    if algo == "dqn":
        return DQN(
            "MlpPolicy",
            env,
            verbose=1,
            policy_kwargs=policy_kwargs,
            seed=int(cfg["seed"]),
            learning_rate=float(cfg["learning_rate"]),
            buffer_size=int(cfg["buffer_size"]),
            batch_size=int(cfg["batch_size"]),
            gamma=float(cfg["gamma"]),
            train_freq=int(cfg["train_freq"]),
            learning_starts=int(cfg["learning_starts"]),
            target_update_interval=int(cfg["target_update_interval"]),
            exploration_fraction=float(cfg["exploration_fraction"]),
            exploration_final_eps=float(cfg["exploration_final_eps"]),
        )
    raise ValueError(f"unsupported algo '{algo}'")
 def _sb3_model_cls(algo: str):
    if algo == "ppo":
        return PPO
    if algo == "a2c":
        return A2C
    if algo == "dqn":
        return DQN
    raise ValueError(f"unsupported algo '{algo}'")
 def train_qtable(cfg: dict) -> tuple[EventQTable, dict]:
    """Train an ``EventQTable`` agent for ``cfg["total_timesteps"]`` steps.

    Builds separate training and evaluation environments via ``make_env``,
    runs an epsilon-greedy loop, then evaluates the agent with ``evaluate``.

    Returns:
        ``(agent, metrics)`` where ``metrics`` is the ``evaluate`` result plus
        ``train/global_step``.
    """
    from .lib.discrete import EventQTable
    np.random.seed(int(cfg["seed"]))
    env = make_env(cfg)
    eval_env = make_env(cfg)
    agent = EventQTable(
        env.action_space.n,
        int(cfg["n_products"]),
        (float(cfg["price_low"]), float(cfg["price_high"])),
        lr=float(cfg["q_lr"]),
        gamma=float(cfg["gamma"]),
        n_bins=int(cfg["q_bins"]),
    )
    eps = float(cfg["eps_start"])
    # Only this first reset is seeded; later resets inside the loop pass no seed.
    obs, _ = env.reset(seed=int(cfg["seed"]))
    for t in range(int(cfg["total_timesteps"])):
        a, s = agent.act(obs, eps)
        nxt, reward, term, trunc, info = env.step(a)
        done = term or trunc
        agent.update(s, a, float(reward), agent.encode(nxt), done)
        # Multiplicative epsilon decay applied every step, floored at eps_end.
        eps = max(float(cfg["eps_end"]), eps * float(cfg["eps_decay"]))
        # Logs the current step's values (not averages) every log_freq steps.
        if HAS_WANDB and wandb.run and (t + 1) % int(cfg["log_freq"]) == 0:
            econ = info.get("economics", {})
            wandb.log(
                {
                    "train/reward": float(reward),
                    "train/revenue": float(econ.get("revenue", 0.0)),
                    "train/epsilon": float(eps),
                },
                step=t + 1,
            )
        obs = env.reset()[0] if done else nxt
    metrics = evaluate(agent, eval_env, int(cfg["eval_episodes"]))
    metrics["train/global_step"] = int(cfg["total_timesteps"])
    env.close()
    eval_env.close()
    return agent, metrics
 def train_sb3(cfg: dict) -> tuple[object, dict]:
    """Train (or resume) an SB3 model for ``cfg["algo"]`` and evaluate it.

    When a W&B run is active, the latest checkpoint artifact is looked up and,
    if one is found, the model is reloaded from it so that only the remaining
    timesteps are trained. The final model is saved under ``cfg["model_dir"]``.

    Returns:
        ``(model, metrics)`` where ``metrics`` is the ``evaluate`` result plus
        ``train/global_step``.

    Raises:
        ImportError: if stable-baselines3 is not available (``HAS_SB3`` false).
    """
    if not HAS_SB3:
        raise ImportError("stable-baselines3 is required for SB3 models")
    from .lib.callbacks import CheckpointArtifactCallback, MetricsCallback
    env = make_env(cfg)
    eval_env = make_env(cfg)
    env = Monitor(env)
    eval_env = Monitor(eval_env)
    model = build_model(cfg, env)
    resume_step = 0
    if HAS_WANDB and wandb.run is not None:
        sweep_id = getattr(wandb.run, "sweep_id", None)
        artifact_name = checkpoint_artifact_name(cfg, backend="sb3", sweep_id=sweep_id)
        checkpoint_file = f"phantom_{cfg['algo']}_checkpoint.zip"
        restored = download_latest_checkpoint(artifact_name, file_name=checkpoint_file)
        if restored is not None:
            checkpoint_path, metadata = restored
            # The freshly built model is discarded in favour of the checkpoint.
            model = _sb3_model_cls(cfg["algo"]).load(
                checkpoint_path.as_posix(), env=env
            )
            resume_step = int(metadata.get("step", getattr(model, "num_timesteps", 0)))
            # Take the larger of the loaded counter and the recorded step.
            model.num_timesteps = max(
                int(getattr(model, "num_timesteps", 0)), resume_step
            )
    cbs = [MetricsCallback(log_histograms=True, log_freq=int(cfg["log_freq"]))]
    cbs.append(
        CheckpointArtifactCallback(
            cfg,
            interval=int(cfg.get("checkpoint_interval", 10_000)),
        )
    )
    cbs.append(
        EvalCallback(
            eval_env,
            eval_freq=int(cfg["eval_freq"]),
            n_eval_episodes=int(cfg["eval_episodes"]),
            deterministic=True,
            verbose=0,
        )
    )
    target_steps = int(cfg["total_timesteps"])
    # Training is skipped entirely if the model already reached the target.
    remaining_steps = max(0, target_steps - int(getattr(model, "num_timesteps", 0)))
    if remaining_steps > 0:
        model.learn(
            total_timesteps=remaining_steps,
            callback=cbs,
            reset_num_timesteps=False,
        )
    model_path = Path(cfg["model_dir"])
    model_path.mkdir(parents=True, exist_ok=True)
    model.save(str(model_path / f"phantom_{cfg['algo']}"))
    metrics = evaluate(model, eval_env, int(cfg["eval_episodes"]))
    metrics["train/global_step"] = int(model.num_timesteps)
    env.close()
    eval_env.close()
    return model, metrics
 def train_once(cfg: dict) -> dict:
    """Run one training job with the backend selected by ``cfg``.

    ``cfg["use_jax"]`` selects the JAX trainer; otherwise ``"qtable"`` uses
    ``train_qtable`` and every other algo uses ``train_sb3``. The returned
    metrics gain ``sweep/score`` = eval reward + ``revenue_weight`` * eval
    revenue.

    Raises:
        ImportError: if the JAX backend is requested but unavailable or its
            trainer fails to import.
    """
    algo = cfg["algo"]
    if not cfg.get("use_jax"):
        trainer = train_qtable if algo == "qtable" else train_sb3
        _, metrics = trainer(cfg)
    else:
        if not JAX_AVAILABLE:
            raise ImportError(
                "JAX backend requested but JAX is not installed. "
                "Install engine/jax/requirements.txt and jax[tpu] for TPU runs."
            )
        try:
            from .jax.train import train_jax
        except Exception as exc:  # pragma: no cover
            raise ImportError(f"Failed to import JAX trainer: {exc}") from exc
        _, metrics = train_jax(cfg)

    eval_reward = metrics["eval/reward"]
    weighted_revenue = float(cfg["revenue_weight"]) * metrics["eval/revenue"]
    metrics["sweep/score"] = float(eval_reward + weighted_revenue)
    return metrics
 def run_wandb(
    project: str, overrides: dict, mode: str = "online", sweep_mode: bool = False
 ) -> dict:
    """Train once inside a W&B run and record the final metrics.

    Outside sweep mode, a JAX job running on a non-primary process (process
    index != 0 with more than one process) trains without opening a W&B run.
    In sweep mode the run config comes from W&B and ``overrides`` only fills
    keys the sweep did not set.

    Args:
        project: W&B project name (used only outside sweep mode).
        overrides: Config values supplied by the caller.
        mode: Value forwarded as ``wandb.init(mode=...)``.
        sweep_mode: Whether this call is executing under a sweep agent.

    Returns:
        The metrics dict produced by ``train_once``.

    Raises:
        ImportError: if wandb is unavailable (``HAS_WANDB`` false).
    """
    if not HAS_WANDB:
        raise ImportError("wandb is required for sweep runs")
    if not sweep_mode:
        pre_cfg = _cfg(overrides)
        if pre_cfg.get("use_jax"):
            # Only the process probe is best-effort. Training itself must stay
            # outside the try block: otherwise a failure in train_once would be
            # swallowed and the job would fall through to wandb.init and train
            # a second time.
            try:
                import jax
                is_secondary = jax.process_count() > 1 and jax.process_index() != 0
            except Exception:
                is_secondary = False
            if is_secondary:
                return train_once(pre_cfg)
    init_kwargs = {"mode": mode}
    if sweep_mode:
        run = wandb.init(**init_kwargs)
    else:
        run = wandb.init(project=project, config=overrides, **init_kwargs)
    try:
        cfg = _cfg(_wandb_cfg_dict())
        if sweep_mode:
            # Sweep-provided values win; overrides only fill the gaps.
            for k, v in overrides.items():
                if k not in wandb.config:
                    cfg[k] = v
        metrics = train_once(cfg)
        step = int(metrics.get("train/global_step", cfg["total_timesteps"]))
        wandb.log(metrics, step=step)
        for k, v in metrics.items():
            run.summary[k] = v
        return metrics
    finally:
        # Always close the run, including when training raised.
        if wandb.run is not None:
            wandb.finish()
 def run_local(overrides: dict) -> dict:
    """Train once without W&B and print the metrics as JSON.

    With the JAX backend, only the process whose ``jax.process_index()`` is 0
    prints; if that probe fails for any reason, printing still happens.
    """
    cfg = _cfg(overrides)
    metrics = train_once(cfg)

    is_primary = True
    if cfg.get("use_jax"):
        try:
            import jax
            is_primary = jax.process_index() == 0
        except Exception:
            is_primary = True
    if not is_primary:
        return metrics

    print(json.dumps(metrics, indent=2))
    # Single-line sentinel so tooling can extract the metrics from stdout.
    print("PHANTOM_METRICS:" + json.dumps(metrics))
    return metrics
 def main():
    p = argparse.ArgumentParser(description="PHANTOM training and W&B sweeps")
    p.add_argument("--project", default=DEFAULT_CFG["project"])
    p.add_argument("--algo", choices=["ppo", "a2c", "dqn", "qtable"])
    p.add_argument("--seed", type=int)
    p.add_argument("--total-timesteps", type=int)
    p.add_argument("--alpha", type=float)
    p.add_argument("--N", type=int)
    p.add_argument("--n-products", type=int)
    p.add_argument("--lambda-coi", type=float)
    p.add_argument("--info-value", type=float)
    p.add_argument("--robust-radius", type=float)
    p.add_argument("--robust-points", type=int)
    p.add_argument("--learning-rate", type=float)
    p.add_argument("--gamma", type=float)
    p.add_argument("--gae-lambda", type=float)
    p.add_argument("--clip-range", type=float)
    p.add_argument("--ent-coef", type=float)
    p.add_argument("--revenue-weight", type=float)
    p.add_argument("--price-low", type=float)
    p.add_argument("--price-high", type=float)
    p.add_argument("--action-levels", type=int)
    p.add_argument("--action-scale-low", type=float)
    p.add_argument("--action-scale-high", type=float)
    p.add_argument("--max-steps", type=int)
    p.add_argument("--margin-floor", type=float)
    p.add_argument("--margin-floor-patience", type=int)
    p.add_argument("--arch", type=str)
    p.add_argument("--activation", type=str)
    p.add_argument("--jax", action="store_true")
    p.add_argument("--jax-num-envs", type=int)
    p.add_argument("--jax-num-steps", type=int)
    p.add_argument("--jax-num-minibatches", type=int)
    p.add_argument("--jax-update-epochs", type=int)
    p.add_argument("--jax-anneal-lr", type=str)
    p.add_argument("--checkpoint-interval", type=int)
    p.add_argument("--sweep-agent", action="store_true")
    p.add_argument("--sweep-id", type=str)
    p.add_argument("--count", type=int, default=0)
    p.add_argument("--offline", action="store_true")
    p.add_argument("--no-wandb", action="store_true")
    args = p.parse_args()
    overrides = {
        "project": args.project,
        "backend": backend,
        "algo": args.algo,
        "seed": args.seed,
        "total_timesteps": args.total_timesteps,
        "model_dir": args.model_dir,
        "log_freq": args.log_freq,
        "checkpoint_interval": args.checkpoint_interval,
        "device": args.device,
        "alpha": args.alpha,
        "N": args.N,
        "n_products": args.n_products,
@@ -514,11 +135,10 @@ def main():
        "info_value": args.info_value,
        "robust_radius": args.robust_radius,
        "robust_points": args.robust_points,
-        "learning_rate": args.learning_rate,
+        "robust_rollouts": args.robust_rollouts,
-        "gamma": args.gamma,
+        "no_robust": args.no_robust,
-        "gae_lambda": args.gae_lambda,
+        "eta_ux": args.eta_ux,
-        "clip_range": args.clip_range,
+        "reward_profit_weight": args.reward_profit_weight,
        "ent_coef": args.ent_coef,
        "revenue_weight": args.revenue_weight,
        "price_low": args.price_low,
        "price_high": args.price_high,
@@ -528,40 +148,82 @@ def main():
        "max_steps": args.max_steps,
        "margin_floor": args.margin_floor,
        "margin_floor_patience": args.margin_floor_patience,
        "learning_rate": args.learning_rate,
        "gamma": args.gamma,
        "buffer_size": args.buffer_size,
        "batch_size": args.batch_size,
        "tau": args.tau,
        "train_freq": args.train_freq,
        "learning_starts": args.learning_starts,
        "target_update_interval": args.target_update_interval,
        "exploration_fraction": args.exploration_fraction,
        "exploration_final_eps": args.exploration_final_eps,
        "n_steps": args.n_steps,
        "n_epochs": args.n_epochs,
        "gae_lambda": args.gae_lambda,
        "clip_range": args.clip_range,
        "ent_coef": args.ent_coef,
        "q_lr": args.q_lr,
        "q_bins": args.q_bins,
        "eps_start": args.eps_start,
        "eps_end": args.eps_end,
        "eps_decay": args.eps_decay,
        "arch": args.arch,
        "activation": args.activation,
-        "use_jax": args.jax,
+        "vf_coef": args.vf_coef,
-        "jax_num_envs": args.jax_num_envs,
+        "max_grad_norm": args.max_grad_norm,
-        "jax_num_steps": args.jax_num_steps,
+        "eval_freq": args.eval_freq,
-        "jax_num_minibatches": args.jax_num_minibatches,
+        "eval_episodes": args.eval_episodes,
        "jax_update_epochs": args.jax_update_epochs,
        "checkpoint_interval": args.checkpoint_interval,
        "jax_anneal_lr": _truthy(args.jax_anneal_lr)
        if args.jax_anneal_lr is not None
        else None,
    }
-    overrides = {k: v for k, v in overrides.items() if v is not None}
+    return {key: value for key, value in overrides.items() if value is not None}
 def main(argv: list[str] | None = None) -> None:
    import sys
    configure_logging()
    raw_args = list(sys.argv[1:] if argv is None else argv)
    run_kind = _probe_run_kind(raw_args)
    if run_kind == "benchmark":
        run_benchmark_cli(_strip_run_kind(raw_args))
        return
    parser = _build_parser()
    args, unknown = parser.parse_known_args(raw_args)
    if unknown:
        raise ValueError(f"Unknown arguments for training mode: {' '.join(unknown)}")
    overrides = _overrides_from_args(args)
    scenario = str(args.scenario)
    group = args.group
    extra_tags = tuple(_parse_tags(args.tags))
    if args.sweep_agent:
-        if args.no_wandb:
+        run_sweep_agent(
-            raise ValueError("sweep agent requires wandb")
+            project=args.project,
-        if not args.sweep_id:
+            sweep_id=str(args.sweep_id or ""),
-            raise ValueError("--sweep-id is required with --sweep-agent")
+            count=int(args.count),
-        mode = "offline" if args.offline else "online"
+            offline=bool(args.offline),
-        wandb.agent(
+            no_wandb=bool(args.no_wandb),
-            args.sweep_id,
+            base_overrides=overrides,
-            function=lambda: run_wandb(
+            kind="sweep",
-                args.project, overrides, mode=mode, sweep_mode=True
+            scenario=scenario,
-            ),
+            group=group,
-            count=args.count if args.count > 0 else None,
+            extra_tags=extra_tags,
        )
        return
-    if args.no_wandb or not HAS_WANDB:
+    spec = TrainSpec.from_flat(overrides)
-        run_local(overrides)
+    run_train_once(
-        return
+        spec,
-
+        project=args.project,
-    run_wandb(args.project, overrides, mode="offline" if args.offline else "online")
+        offline=bool(args.offline),
        no_wandb=bool(args.no_wandb),
        kind="train",
        scenario=scenario,
        group=group,
        extra_tags=extra_tags,
    )
 if __name__ == "__main__":
--- a/engine/train_core.py
+++ b/engine/train_core.py
@@ -0,0 +1,40 @@
 from __future__ import annotations
 from dataclasses import dataclass
 from typing import Any
 from .spec import TrainSpec
 from .telemetry.metrics import canonicalize_metrics
@dataclass(frozen=True)
 class TrainResult:
    """Immutable bundle describing the outcome of one ``run_train`` call."""

    # The spec the run was launched with, passed through unchanged.
    spec: TrainSpec
    # Backend metrics after ``canonicalize_metrics`` has been applied.
    metrics: dict[str, Any]
    # Artifact locations; ``run_train`` only ever sets the "model/path" key,
    # and only when the backend reported it as a string.
    artifacts: dict[str, str]
    # Dict entries taken from the backend's "_train_events" metric.
    events: list[dict[str, Any]]
 def run_train(spec: TrainSpec) -> TrainResult:
    """Train one model described by ``spec`` and package the outcome.

    The spec is flattened to a dict and handed to the Q-table backend when
    ``spec.algorithm.name`` is ``"qtable"``, otherwise to the SB3 backend.
    The backend's raw metrics are split into training events, canonical
    metrics and artifact paths before being returned as a ``TrainResult``.
    """
    flat_cfg = spec.to_flat_dict()
    # The backend import happens inside the branch, so only the selected
    # backend module is loaded.
    if spec.algorithm.name == "qtable":
        from .backends.qtable import train_qtable as train_backend
    else:
        from .backends.sb3 import train_sb3 as train_backend
    _, backend_metrics = train_backend(flat_cfg)

    # Events are removed from the metrics dict before canonicalization;
    # anything that is not a dict is discarded.
    logged_events = backend_metrics.pop("_train_events", [])
    events = list(filter(lambda entry: isinstance(entry, dict), logged_events))

    metrics = canonicalize_metrics(backend_metrics, spec)

    saved_path = backend_metrics.get("model/path")
    artifacts: dict[str, str] = (
        {"model/path": saved_path} if isinstance(saved_path, str) else {}
    )
    return TrainResult(spec=spec, metrics=metrics, artifacts=artifacts, events=events)
--- a/engine/wrapper.py
+++ b/engine/wrapper.py
@@ -47,7 +47,10 @@ class PHANTOM(gym.Env):
        coi_window: int = 10,
        robust_radius: float = 0.0,
        robust_points: int = 5,
        robust_rollouts: int = 1,
        info_value: float = 1.0,
        eta_ux: float = 0.5,
        reward_profit_weight: float = 1.0,
        action_levels: int = 9,
        action_scale_low: float = 0.9,
        action_scale_high: float = 1.1,
@@ -74,7 +77,10 @@ class PHANTOM(gym.Env):
        self.agent_params = agent_params
        self.robust_radius = max(0.0, float(robust_radius))
        self.robust_points = max(1, int(robust_points))
        self.robust_rollouts = max(1, int(robust_rollouts))
        self.info_value = float(info_value)
        self.eta_ux = float(eta_ux)
        self.reward_profit_weight = float(reward_profit_weight)
        self.action_levels = max(2, int(action_levels))
        self._action_scales = np.linspace(
            float(action_scale_low), float(action_scale_high), self.action_levels
@@ -103,6 +109,12 @@ class PHANTOM(gym.Env):
                    shape=(n_products,),
                    dtype=np.float32,
                ),
                "signals": spaces.Box(
                    low=np.array([0.0, 0.0, 0.0, 0.0], dtype=np.float32),
                    high=np.array([1.0, 1.0, 1.0, 1.0], dtype=np.float32),
                    shape=(4,),
                    dtype=np.float32,
                ),
            }
        )
@@ -117,6 +129,8 @@ class PHANTOM(gym.Env):
        self._trajectories = []  # session trajectories for agent prob calculation
        self.baseline_prices = np.full(self.n_products, self.price_bounds[0])
        self._low_margin_streak = 0  # consecutive steps below margin_floor
        self._last_agent_prob = float(self.alpha)
        self._last_alpha_adv = float(self.alpha)
        # load behavioral models for agent probability estimation
        try:
@@ -129,7 +143,20 @@ class PHANTOM(gym.Env):
        demand_arr = np.array(
            [self._demand.get(i, 0.0) for i in range(self.n_products)], dtype=np.float32
        )
-        return {"demand": demand_arr, "prices": self._prices.astype(np.float32)}
+        signals = np.array(
            [
                float(np.clip(self._last_agent_prob, 0.0, 1.0)),
                float(np.clip(self._last_alpha_adv, 0.0, 1.0)),
                float(np.clip(self.nominal_alpha, 0.0, 1.0)),
                float(np.clip(self.robust_radius, 0.0, 1.0)),
            ],
            dtype=np.float32,
        )
        return {
            "demand": demand_arr,
            "prices": self._prices.astype(np.float32),
            "signals": signals,
        }
    def _set_market_mix(self, alpha: float):
        alpha = float(np.clip(alpha, 0.0, 1.0))
@@ -177,20 +204,42 @@ class PHANTOM(gym.Env):
            [demand.get(i, 0.0) for i in range(self.n_products)], dtype=float
        )
        revenue = float(np.dot(prices, demand_arr))
        floor_cost = float(np.dot(self.baseline_prices, demand_arr))
        profit = revenue - floor_cost
        purchases = extract_purchases(trajectories)
        coi_mix = compute_uplift_coi(prices, purchases, self.baseline_prices)
-        # multiplicative penalty so COI term scales with revenue magnitude
+
        coi_leakage = float(agent_prob * self.info_value)
-        discount = float(np.clip(1.0 - self.lambda_coi * coi_leakage, 0.0, 1.0))
+        info_budget = max(floor_cost, 1.0)
-        coi_penalty = revenue * (1.0 - discount)  # absolute penalty in revenue units
+        coi_penalty = self.lambda_coi * coi_leakage * info_budget
-        reward = revenue * discount
+
        if len(self._price_history) > 0:
            volatility = float(
                np.mean(
                    np.abs(prices - self._price_history[-1])
                    / np.maximum(self.baseline_prices, 1.0)
                )
            )
        else:
            volatility = 0.0
        ux_penalty = self.eta_ux * info_budget * volatility
        reward_revenue = self.reward_profit_weight * profit
        reward = reward_revenue - coi_penalty - ux_penalty
        return reward, {
            "revenue": revenue,
            "cost_floor": floor_cost,
            "profit": profit,
            "coi_mix": float(coi_mix),
            "coi_base": 0.0,
            "coi_leakage": coi_leakage,
            "coi_penalty": coi_penalty,
-            "coi_discount": discount,
+            "coi_info_budget": info_budget,
            "ux_penalty": ux_penalty,
            "volatility": volatility,
            "reward_revenue": reward_revenue,
            "reward_total": reward,
        }
    def _alpha_candidates(self) -> np.ndarray:
@@ -200,28 +249,26 @@ class PHANTOM(gym.Env):
        hi = min(1.0, self.nominal_alpha + self.robust_radius)
        return np.linspace(lo, hi, self.robust_points)
-    def _select_adversarial_alpha(
+    def _evaluate_candidate(self, alpha: float, prices: np.ndarray) -> float:
-        self, prices: np.ndarray
+        self._set_market_mix(alpha)
-    ) -> tuple[float, dict, list, float]:
+        rewards = []
-        """inner robust step: pick worst-case alpha and return its outcome directly to avoid double-sampling"""
+        for _ in range(self.robust_rollouts):
        candidates = self._alpha_candidates()
        best_alpha, worst_reward = float(candidates[0]), np.inf
        best_demand, best_trajectories, best_agent_prob = None, [], 0.0
        for alpha in candidates:
            self._set_market_mix(float(alpha))
            demand = self.market.act(prices)
            trajectories = list(self.market.last_trajectories)
            agent_prob = self._compute_agent_prob(trajectories)
            reward, _ = self._compute_reward(prices, demand, agent_prob, trajectories)
-            if reward < worst_reward:
+            rewards.append(float(reward))
-                worst_reward = reward
+        return float(np.mean(rewards)) if rewards else 0.0
-                best_alpha, best_demand, best_trajectories, best_agent_prob = (
+
-                    float(alpha),
+    def _select_adversarial_alpha(self, prices: np.ndarray) -> float:
-                    demand,
+        """inner robust step: evaluate candidates and pick worst-case alpha"""
-                    trajectories,
+        candidates = self._alpha_candidates()
-                    agent_prob,
+        evaluations = [
-                )
+            (float(alpha), self._evaluate_candidate(float(alpha), prices))
-        return best_alpha, best_demand, best_trajectories, best_agent_prob
+            for alpha in candidates
        ]
        best_alpha, _ = min(evaluations, key=lambda x: x[1])
        return best_alpha
    def _record_history(self):
        demand_arr = np.array(
@@ -244,19 +291,24 @@ class PHANTOM(gym.Env):
        self._low_margin_streak = 0
        self._demand_history, self._price_history, self._revenue_history = [], [], []
        self._trajectories = list(getattr(self.market, "last_trajectories", []))
        self._last_agent_prob = float(self.nominal_alpha)
        self._last_alpha_adv = float(self.nominal_alpha)
        self._record_history()
        return self._get_obs(), {}
    def step(self, action):
        self._prices = self._decode_action(action)
-        # inner robust step returns worst-case outcome directly, no re-sampling
+        alpha_adv = self._select_adversarial_alpha(self._prices)
        alpha_adv, self._demand, trajectories, agent_prob = (
            self._select_adversarial_alpha(self._prices)
        )
        self._set_market_mix(alpha_adv)
        self._platform_stub.set_prices(self._prices)
        self._step_count += 1
        self._demand = self.market.act(self._prices)
        trajectories = list(self.market.last_trajectories)
        agent_prob = self._compute_agent_prob(trajectories)
        self._trajectories.extend(trajectories)
        self._last_agent_prob = float(agent_prob)
        self._last_alpha_adv = float(alpha_adv)
        reward, metrics = self._compute_reward(
            self._prices, self._demand, agent_prob, trajectories
@@ -278,7 +330,9 @@ class PHANTOM(gym.Env):
            "step": self._step_count,
            "agent_prob": agent_prob,
            "alpha_adv": float(alpha_adv),
            "alpha_nominal": float(self.nominal_alpha),
            "wasserstein_radius": float(self.robust_radius),
            "robust_rollouts": int(self.robust_rollouts),
            **metrics,
            "raw_revenue": np.sum(
                self._prices
@@ -355,7 +409,7 @@ if __name__ == "__main__":
        def predict(self, obs, **kwargs):
            return self.env.action_space.sample(), None
-    wandb.init(project="phantom-pricing", config={"policy": "random", "alpha": 0.3})
+    wandb.init(project="capstone", config={"policy": "random", "alpha": 0.3})
    env = EconomicMetricsWrapper(PHANTOM(n_products=15, alpha=0.3, render_mode=None))
    model = RandomPolicy(env)
--- a/nx.json
+++ b/nx.json
@@ -0,0 +1,71 @@
 {
  "$schema": "./node_modules/nx/schemas/nx-schema.json",
  "useInferencePlugins": false,
  "defaultBase": "main",
  "namedInputs": {
    "sharedGlobals": [
      "{workspaceRoot}/nx.json",
      "{workspaceRoot}/package.json",
      "{workspaceRoot}/Makefile",
      "{workspaceRoot}/pyproject.toml",
      "{workspaceRoot}/docker-compose.yml"
    ],
    "default": [
      "{projectRoot}/**/*",
      "sharedGlobals"
    ],
    "production": [
      "default",
      "!{projectRoot}/node_modules/**/*",
      "!{projectRoot}/.next/**/*",
      "!{projectRoot}/test-results/**/*",
      "!{projectRoot}/build/**/*"
    ]
  },
  "targetDefaults": {
    "build": {
      "cache": true,
      "inputs": [
        "production",
        "^production"
      ]
    },
    "test": {
      "cache": false,
      "inputs": [
        "default",
        "^production"
      ]
    },
    "install": {
      "cache": false
    },
    "dev": {
      "cache": false
    },
    "start": {
      "cache": false
    },
    "watch": {
      "cache": false
    },
    "clean": {
      "cache": false
    },
    "train": {
      "cache": false
    },
    "benchmark": {
      "cache": false
    },
    "up": {
      "cache": false
    },
    "down": {
      "cache": false
    },
    "logs": {
      "cache": false
    }
  }
 }
--- a/package.json
+++ b/package.json
@@ -0,0 +1,29 @@
 {
  "name": "phantom-monorepo",
  "private": true,
  "workspaces": [
    "web",
    "tests/e2e"
  ],
  "scripts": {
    "nx": "nx",
    "projects": "nx show projects",
    "graph": "nx graph",
    "web:dev": "nx run web:dev",
    "web:build": "nx run web:build",
    "backend:server": "nx run backend-server:dev",
    "backend:provider": "nx run pricing-provider:dev",
    "backend:worker": "nx run backend-worker:dev",
    "paper:build": "nx run paper:build",
    "platform:up": "nx run platform:up",
    "platform:down": "nx run platform:down",
    "platform:logs": "nx run platform:logs",
    "research:test": "nx run research:test",
    "research:benchmark": "nx run research:benchmark",
    "research:benchmark:simple": "nx run research:benchmark-simple",
    "e2e:test": "nx run e2e:test"
  },
  "devDependencies": {
    "nx": "^20.4.0"
  }
 }
--- a/paper/defense/manim/render.py
+++ b/paper/defense/manim/render.py
@@ -0,0 +1,84 @@
 from __future__ import annotations
 import argparse
 import subprocess
 import sys
 from pathlib import Path
 from scenes import SCENE_ORDER
 def parse_args() -> argparse.Namespace:
    """Define the renderer's command-line options and parse the process arguments.

    Options: ``--quality`` (one of ql/qm/qh/qk, default qm), a repeatable
    ``--scene`` collected into ``scenes``, and the boolean switches
    ``--preview`` and ``--list``.
    """
    cli = argparse.ArgumentParser(description="Render thesis-defense Manim scenes")
    cli.add_argument(
        "--quality",
        choices=["ql", "qm", "qh", "qk"],
        default="qm",
        help="Manim quality preset",
    )
    cli.add_argument(
        "--scene",
        dest="scenes",
        action="append",
        help="Scene name; repeat flag to render many",
    )
    # Both remaining options are plain on/off switches.
    switches = (
        ("--preview", "Open video after each render"),
        ("--list", "List available scenes and exit"),
    )
    for flag, description in switches:
        cli.add_argument(flag, action="store_true", help=description)
    return cli.parse_args()
 def validate_requested(requested: list[str]) -> list[str]:
    """Check every requested scene name against ``SCENE_ORDER``.

    Returns the same ``requested`` list when all names are known.
    Raises ``ValueError`` naming the unknown scenes and the valid choices
    otherwise.
    """
    unknown = []
    for name in requested:
        if name not in SCENE_ORDER:
            unknown.append(name)
    if not unknown:
        return requested
    choices = ", ".join(SCENE_ORDER)
    raise ValueError(f"Unknown scenes: {', '.join(unknown)}. Choices: {choices}")
 def run_manim(scene_file: Path, scene_name: str, quality: str, preview: bool) -> None:
    """Render one scene by running ``python -m manim`` as a subprocess.

    The command runs with the scene file's directory as working directory
    and raises ``subprocess.CalledProcessError`` on a non-zero exit status.
    """
    preview_flag = ["-p"] if preview else []
    argv = [
        sys.executable,
        "-m",
        "manim",
        *preview_flag,
        f"-{quality}",
        str(scene_file),
        scene_name,
    ]
    subprocess.run(argv, cwd=scene_file.parent, check=True)
 def main() -> int:
    """Render the requested scenes (or all of them) and return an exit code.

    Returns:
        0 after a successful render or after ``--list``; 2 when a requested
        scene name is unknown or the render command cannot be started; the
        failing command's own return code when a render exits non-zero.
    """
    args = parse_args()
    if args.list:
        for scene in SCENE_ORDER:
            print(scene)
        return 0
    scene_file = Path(__file__).resolve().parent / "scenes.py"
    try:
        # Validation must happen inside the try block: validate_requested
        # raises ValueError for unknown names, and the handler below is what
        # turns that into a clean message and exit code 2 instead of a
        # traceback.
        scenes = validate_requested(args.scenes) if args.scenes else list(SCENE_ORDER)
        for scene_name in scenes:
            run_manim(
                scene_file=scene_file,
                scene_name=scene_name,
                quality=args.quality,
                preview=args.preview,
            )
    except FileNotFoundError:
        print(
            "manim executable not found. Install Manim in your Python environment.",
            file=sys.stderr,
        )
        return 2
    except ValueError as exc:
        print(str(exc), file=sys.stderr)
        return 2
    except subprocess.CalledProcessError as exc:
        # Propagate the render command's exit status to the caller.
        return exc.returncode
    return 0
 # Script entry point: exit the process with the code returned by main().
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/paper/defense/manim/requirements.txt
+++ b/paper/defense/manim/requirements.txt
@@ -0,0 +1,2 @@
 manim>=0.18,<1
 numpy>=1.24
--- a/paper/defense/manim/scenes.py
+++ b/paper/defense/manim/scenes.py
--- a/paper/project.json
+++ b/paper/project.json
@@ -0,0 +1,53 @@
 {
  "$schema": "../node_modules/nx/schemas/project-schema.json",
  "name": "paper",
  "projectType": "application",
  "sourceRoot": "paper/src",
  "targets": {
    "build": {
      "executor": "nx:run-commands",
      "outputs": [
        "{projectRoot}/build"
      ],
      "options": {
        "command": "bash scripts/nx_paper.sh build",
        "cwd": "."
      }
    },
    "watch": {
      "executor": "nx:run-commands",
      "options": {
        "command": "bash scripts/nx_paper.sh watch",
        "cwd": "."
      }
    },
    "clean": {
      "executor": "nx:run-commands",
      "options": {
        "command": "bash scripts/nx_paper.sh clean",
        "cwd": "."
      }
    },
    "wordcount": {
      "executor": "nx:run-commands",
      "options": {
        "command": "bash scripts/nx_paper.sh wordcount",
        "cwd": "."
      }
    },
    "build-arxiv": {
      "executor": "nx:run-commands",
      "outputs": [
        "{projectRoot}/build/main-arxiv.pdf"
      ],
      "options": {
        "command": "bash scripts/nx_paper.sh build-arxiv",
        "cwd": "."
      }
    }
  },
  "tags": [
    "scope:paper",
    "type:latex"
  ]
 }
--- a/paper/src/bib/references.bib
+++ b/paper/src/bib/references.bib
@@ -616,3 +616,17 @@ Volume: 21},
 	year = {2026},
 	file = {Snapshot:/home/velocitatem/Zotero/storage/N724QGF6/v4.html:text/html},
 }
@article{mann_test_1947,
 	title = {On a {Test} of {Whether} one of {Two} {Random} {Variables} is {Stochastically} {Larger} than the {Other}},
 	volume = {18},
 	url = {https://doi.org/10.1214/aoms/1177730491},
 	doi = {10.1214/aoms/1177730491},
 	abstract = {Let x and y be two random variables with continuous cumulative distribution functions f and g. A statistic U depending on the relative ranks of the x's and y's is proposed for testing the hypothesis f = g. Wilcoxon proposed an equivalent test in the Biometrics Bulletin, December, 1945, but gave only a few points of the distribution of his statistic. Under the hypothesis f = g the probability of obtaining a given U in a sample of n x's and m y's is the solution of a certain recurrence relation involving n and m. Using this recurrence relation tables have been computed giving the probability of U for samples up to n = m = 8. At this point the distribution is almost normal. From the recurrence relation explicit expressions for the mean, variance, and fourth moment are obtained. The 2rth moment is shown to have a certain form which enabled us to prove that the limit distribution is normal if m, n go to infinity in any arbitrary manner. The test is shown to be consistent with respect to the class of alternatives f(x) {\textgreater} g(x) for every x.},
 	number = {1},
 	journal = {The Annals of Mathematical Statistics},
 	author = {Mann, H. B. and Whitney, D. R.},
 	year = {1947},
 	note = {Publisher: Institute of Mathematical Statistics},
 	pages = {50 -- 60},
 }
--- a/paper/src/chapters/01-intro.tex
+++ b/paper/src/chapters/01-intro.tex
@@ -10,7 +10,7 @@
 In this paper we present an exploration and defense against the presence of new commercial entities in digitally powered platforms, preserving market equilibrium in the age of AI. This research establishes the following contributions: definition and formalization of non-human transactors in e-commerce platforms, development of a testing-ground for capturing the behavioral essence of these transactors across a large variety of digital systems, construction of a discriminative model (to prove separability) as a strong learner for downstream mitigation of contamination by non-human entities, translation of such learned separability into existing dynamic pricing machine learning loops, and finally establishment of a high-level KPI-affecting causal effect and cost-saving framework for the future of internet commerce in the presence of such non-human learners.
-This research effort touches a large variety of domains, spanning behavioral economics for understanding the rationality of behavior as theorized by the concept of homo economicus, agent-based modeling to translate our learned separability into disjoint dynamic pricing systems, reinforcement learning which serves as the SOTA for price-learners, and dynamic pricing and market equilibrium theory to understand the risks of possible supra-competitive pricing phenomena in cases of adversarial pricing systems driving the market out of equilibrium. \footnote{Given the rapid evolution of the field we acknowledge all developments with a cutoff set at the date of March 31st 2026.}
+This research effort touches a large variety of domains, spanning behavioral economics for understanding the rationality of behavior as theorized by the concept of homo economicus, agent-based modeling to translate our learned separability into disjoint dynamic pricing systems, reinforcement learning which serves as the SOTA for price-learners, and dynamic pricing and market equilibrium theory to understand the risks of possible supra-competitive pricing phenomena in cases of adversarial pricing systems driving the market out of equilibrium. \footnote{Given the rapid evolution of the field we acknowledge all developments with a cutoff set at the date of March 1st 2026.}
 \subsection{Motivation and Market Context}
--- a/paper/src/chapters/03-methodology.tex
+++ b/paper/src/chapters/03-methodology.tex
@@ -7,7 +7,7 @@ This section details the theoretical and practical framework developed to addres
 \subsection{Problem Formalization}
-We define a commercial environment where the platform interacts with a stream of sessions. Let $\mathcal{S}$ denote the set of all sessions. Each session $s \in \mathcal{S}$ is generated by an actor belonging to a latent class $Y_s \in \{H, A\}$, where $H$ denotes Human and $A$ denotes Agent.
+We define a commercial environment where the platform interacts with a stream of sessions. Let $\mathcal{S}$ denote the set of all sessions. Each session $s \in \mathcal{S}$ is generated by an actor belonging to a latent class $\theta_s \in \{H, A\}$, where $H$ denotes Human and $A$ denotes Agent.
 Each session produces a trajectory of observable events $\tau_s = (e_{s,1}, \ldots, e_{s,L_s})$. An event $e_{s,k}$ is a tuple defined as:
 \begin{equation}
@@ -148,7 +148,10 @@ Reproducible results are key to quality research platforms, this is taken into m
 \subsubsection{Online Dynamic Pricing}
 In order to collect data from actors under correct conditions we replicate a naive and simple dynamic pricing algorithm which runs in the background during the experiments.
-The dynamic pricing done is handled by a pipeline which computes a demand estimate on a per-product basis of a specific window of the data, defined by the period $T$ which by default is 5 minutes. This dynamic pricing pipeline computes a demand estimate vector $\hat{q} \in \mathbb{R}^N$ by a weighted sum of interactions for each product, it additionally computes a price elasticity vector $\hat{\epsilon}$ in the same dimensions as our demand. The final features matrix is of the size $N \times 2$ which we translate to a new price vector $\hat{p} \in \mathbb{R}^N$. The transformation that governs this dynamic pricing is a very simple surge-based pricing (a special case of our later defined policy $\pi$):
+Dynamic pricing is handled by a pipeline that computes a per-product demand estimate over a specific window of the data, defined by the period $T$, which by default is 5 minutes. The pipeline computes a demand estimate vector $\hat{q} \in \mathbb{R}^N$ as a weighted sum of interactions for each product; it additionally computes a price elasticity vector $\hat{\epsilon}$ of the same dimension as the demand vector. The resulting feature matrix has size $N \times 2$, which we translate into a new price vector $\hat{p} \in \mathbb{R}^N$.
 The transformation that governs this dynamic pricing is a very simple surge-based pricing (a special case of our later defined policy $\pi$):
 \begin{equation}
 \hat{p}_i = \begin{cases}
@@ -176,14 +179,14 @@ We start from a practical constraint: we do not have access to proprietary produ
 The interface is organized as a product catalog where each product belongs to a time-bounded price vector (for example, a daily pricing period). During each period we collect interaction data by instrumenting UI components and predefined action templates that are still customizable. This gives us control without losing realism.
 Since users act with motivations, we define a pool of tasks (jobs to be done) and assign tasks randomly to participants.
-% TODO: describe the task pool in detail here -- list the specific tasks used in the experiments
+The task pool is stored as a structured table with fields \texttt{id}, \texttt{created\_at}, \texttt{task\_name}, \texttt{task\_description}, and \texttt{task\_def\_of\_done}. We formulate the tasks as compact jobs-to-be-done rather than as strict click scripts, because the goal is to elicit realistic browsing and comparison behavior that captures the nuances of different people. In hotel mode the assigned tasks include \textit{Cheapest Room}, \textit{Cheapest Room w/ View}, \textit{MultiStep Cheapest Room}, \textit{The Digital Nomad (Executive)}, and \textit{The 3-Way Tradeoff (Desk + Quiet + Flexible)}. These prompts deliberately require critical thought in search, inspection of room details, comparison of amenities or images, return visits to the listing page, and a final booking decision, which together create a degree of cognitive load. In airline mode we use \textit{Last-Minute One-Way Flight}, where the actor must urgently travel to LAX from either SEA or JFK within the next 1--3 days, inspect at least a small set of candidate itineraries, and then book a reasonable earliest departure.
 A representative task is to find the cheapest feasible catalog item under explicit constraints while removing strict financial limits so we avoid trivial optimization behavior. Participants are also randomly assigned to one experimental platform mode (hotel or airline). Once assigned, they are dropped into the experiment with an actor ID. Under each experiment ID, we can observe multiple sessions across time and gather long interaction traces for the same actor.
 The human data collection involved 18 participants, all of whom provided explicit informed consent prior to their session. Participants had an average age of 21 years and were recruited from a university population. Alongside the 18 human sessions we ran 18 agent sessions of equivalent task scope, giving a balanced dataset of 36 labeled trajectories. Each participant was assigned a single platform mode and a single task drawn from the pool, and completed the session independently without guidance on navigation or pricing strategy.
 To evaluate quality and realism of the setup, we store both structured event logs and full interaction transcripts. This lets us combine quantitative analysis with transcript-level qualitative findings. The result is an isolated system where we can control the interaction process while preserving realistic behavior.
-Operationally, goals and experiment runs are tracked in PostgreSQL (goal table, run table, and assignment mapping). This data-acquisition phase is the first half of the methodology and is intentionally a disconnected component that feeds the later contributions. The second half uses collected behavioral traces to separate classes $y \in \{A,H\}$ with session-conditioned probability estimates, then injects those estimates into the pricing learner.
+Operationally, goals and experiment runs are tracked in PostgreSQL (goal table, run table, and assignment mapping). This data-acquisition phase is the first half of the methodology and is intentionally a disconnected component that feeds the later contributions. The second half uses collected behavioral traces to separate classes $\theta \in \{A,H\}$ with session-conditioned probability estimates, then injects those estimates into the pricing learner.
 Our process follows three stages: (1) observe and \textit{vectorize} behavioral interactions, (2) learn separability to characterize human versus agent patterns, and (3) use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator.
@@ -207,7 +210,7 @@ The simulator has multiple configurable factors. We design a multi-factor study
 % Power analysis plan: apply a two-sample Mann-Whitney U (or permutation test) on per-session (delta_H - delta_A) divergence scores comparing the human and agent groups. Compute minimum detectable effect size at alpha=0.05, power=0.8, given n=18 per group. Bootstrap confidence intervals on mean KL are a cleaner complement given the non-normality of divergence distributions.
 While this scale is generally expensive for reinforcement learning, we execute it on a large TPU cluster to make the sweep tractable.
-Our training budget is provisioned through TPU Research Cloud and spans 384 chips across TPU v4, v5e, and v6e generations, with a spot-heavy allocation plus an on-demand reserve. At peak BF16 throughput this corresponds to approximately 160 PFLOPS of aggregate compute, which makes repeated seeds, ablations, and sensitivity sweeps feasible within practical wall-clock limits. We allocate v6e capacity to the highest-intensity policy training jobs, use v5e for wider hyperparameter exploration where throughput-per-dollar is favorable, and reserve on-demand v4 capacity for runs that should not be interrupted.
+Our training budget is provisioned through TPU Research Cloud and spans 384 chips across TPU v4, v5e, and v6e generations, with a spot-heavy allocation plus an on-demand reserve. At peak BF16 throughput this corresponds to approximately 160\,PFLOPS of aggregate compute (derivation in Appendix~\ref{app:compute_budget}), which makes repeated seeds, ablations, and sensitivity sweeps feasible within practical wall-clock limits. We allocate v6e capacity to the highest-intensity policy training jobs, use v5e for wider hyperparameter exploration where throughput-per-dollar is favorable, and reserve on-demand v4 capacity for runs that should not be interrupted.
 \begin{table}[ht]
 \centering
@@ -281,7 +284,7 @@ $\mathcal{A}_{\text{filter}}$ & \texttt{search}, \texttt{filter\_date}, \texttt{
 \end{table}
 This partition enables the weight function $\omega$ from Eq.~\ref{eq:qhat} to assign category-specific signal strengths, with $\omega(\mathcal{A}_{\text{cart}}) > \omega(\mathcal{A}_{\text{dwell}}) > \omega(\mathcal{A}_{\text{nav}}) > \omega(\mathcal{A}_{\text{filter}})$ reflecting decreasing commitment.
-
+It is important to acknowledge that this creates a strong assumption in the weighting; we motivate the scale of each weight by the observed per-category divergence between the two behavioral profiles.
 In the simulator baseline this order is encoded with a compact fixed scale: cart $=4.0$, dwell $=2.0$, nav $=1.0$, filter $=0.5$. Unknown actions are mapped by prefix heuristics to the nearest category.
 The metadata record $\mu$ varies by action type. For product views, $\mu$ contains the observed price $p_{\text{obs}}$ and product attributes. For dwell events, $\mu$ includes the element text and accumulated hover duration. This heterogeneous structure is captured via a schema-on-read approach in our Kafka ingestion pipeline, where events are validated against type-specific schemas before storage.
@@ -289,16 +292,19 @@ The metadata record $\mu$ varies by action type. For product views, $\mu$ contai
 In addition to behavioral events, the platform logs price observations to a separate Kafka topic. Each price query generates a record $(i, p, \text{sid}, \phi, t)$ associating the product, displayed price, requesting session, platform mode, and timestamp. This dual-stream architecture enables joint analysis of price exposure and behavioral response.
 \subsection{Generative Contamination and Separability}
 To train a robust pricing learner, we need a simulator that can generate realistic interaction data under controlled contamination. We build this from Phantom data using a two-stage approach.
 \subsubsection{Ground-Truth Separability}
-Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels $y_s \in \{H, A\}$ are available as ground truth rather than as the output of a heuristic classifier. We therefore estimate separate transition kernels directly from each labeled partition $\mathcal{D}_H$ and $\mathcal{D}_A$, treating the resulting $\hat{\mathcal{T}}_H$ and $\hat{\mathcal{T}}_A$ as the ground-truth behavioral profiles for each class. We then ask a direct methodological question: are the kernels separable enough to justify downstream pricing control that depends on that separability?
+Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels $\theta_s \in \{H, A\}$ are available as ground truth rather than as the output of a heuristic classifier. We therefore estimate separate transition kernels directly from each labeled partition $\mathcal{D}_H$ and $\mathcal{D}_A$, treating the resulting $\hat{\mathcal{T}}_H$ and $\hat{\mathcal{T}}_A$ as the ground-truth behavioral profiles for each class. We then ask a direct methodological question: are the kernels separable enough to justify downstream pricing control that depends on that separability?
-To answer this, we compute average KL divergence between transition probability matrices. This statistic gives global separability and event-level diagnostics at the same time. In our balanced dataset (50\% human, 50\% agent), the average divergence is approximately $1.8$. To contextualize this divergence metric we compare with an intra-class comparison baseline of randomly selected transitions.
+To answer this, we compute per-session KL divergence scores against both class-level centroids. For each session $s$ in either partition, we fit a session-level event transition kernel $\hat{\mathcal{T}}_s$ from that session's trajectory alone, then compute its average KL divergence to the human centroid ($\Delta_{H,s}$) and to the agent centroid ($\Delta_{A,s}$). The per-session separability score is the gap $\Delta_{H,s} - \Delta_{A,s}$: a negative value indicates proximity to human behavior, a positive value indicates proximity to agent behavior.
-% To contextualize this figure a useful intra-class baseline is to randomly split D_H into two equal halves, estimate a kernel from each half, compute the same average KL statistic, and repeat for B bootstrap samples (e.g. B=100). The resulting null distribution (mean +/- std) gives the divergence expected purely from estimation noise at this sample size. A between-class KL substantially above this null confirms the separation is real and not a finite-sample artefact. In practice: for each of B splits, partition D_H 50/50 without replacement, run build_kernel() on each half, average the per-state KL values, and collect the B scores into a reference distribution to compare against the 1.8 figure.
+
 The normality assumption cannot be made for KL divergence distributions, which are right-skewed and bounded below by zero, so we do not use a Student's $t$-test. Instead we apply a Mann-Whitney $U$ test \parencite{mann_test_1947} on the per-session gap scores between the two groups. The Mann-Whitney test is a rank-based nonparametric test that compares the stochastic ordering of two independent samples without distributional assumptions, making it appropriate for small samples drawn from skewed populations. We report $U$, the exact two-sided $p$-value, and group-level descriptive statistics for the gap scores.
 \begin{definition}[Kullback-Leibler Divergence for Transition Distributions]
 Let $P_e$ and $Q_e$ be categorical distributions over destination states following event $e$, derived from human and agent trajectories respectively. The KL divergence between these distributions is:
@@ -341,9 +347,6 @@ To scale this to catalog-level pricing, we expand the base event transition matr
  \end{figure}
 \subsection{Second-Stage Classification}
 After contamination, we run a second classification stage. We remap events into a semantically aligned feature space, apply richer feature engineering, and retrain to obtain cleaner label probabilities across the full dataset. This classifier is then used directly in the reinforcement-learning reward structure.
 \subsection{Distributionally Robust Reinforcement Learning (DR-RL)}
 We formulate pricing as a Stackelberg game: the platform (leader) sets prices $p_t$, and the population (follower) responds through trajectories and demand. A useful intuition is that the platform behaves like a distorted mirror at a 45-degree angle: what it mirrors is population demand into an estimated demand proxy, and that proxy drives revenue.
@@ -378,6 +381,43 @@ For the current engine baseline, we use a compact inner-robust approximation by
 and we evaluate a small fixed grid in $\mathcal{A}_{\epsilon_\alpha}(\alpha_0)$ per step, selecting the worst-case candidate for the learner.
 % A proper Wasserstein ball implementation over the full demand distribution (rather than a scalar alpha interval) would use the POT library (Python Optimal Transport): compute W_2 between the empirical reference P_hat and each candidate Q using ot.emd2() or ot.sliced_wasserstein_distance() for scalability, then accept only candidates within epsilon. In practice the inner minimization becomes: candidates = [G(alpha) for alpha in linspace]; dists = [ot.emd2(p_hat, q, M) for q in candidates]; worst = candidates[argmin(reward[dists <= epsilon])]. The current grid-on-alpha approximation is a computationally cheap substitute; moving to a true Wasserstein ball would tighten the worst-case guarantee but requires specifying the ground metric M over the demand space.
 \subsubsection{Environment Setup for Dynamic Pricing}
 The complete pricing-demand-trajectory loop is illustrated in Figure~\ref{fig:oracle_flow}. The Oracle maps historical price and demand state to a new price vector, which is exposed to a distribution of demand curves. Each product generates trajectories weighted by behavioral kernels $\tau_\theta$, producing a full transition matrix $\tau'$ over sessions. Sampled trajectories $\{\tau_k\}$ are aggregated through the demand proxy function $Q(\cdot)$ to yield the next demand vector, which feeds back into the Oracle.
 \begin{figure}[ht]
 \centering
 \[
 \text{Oracle}(\vec{p}_{t-1},\vec{\hat{q}})\to
 \begin{pmatrix}
 p_0\\
 p_1\\
 \cdots\\
 p_N
 \end{pmatrix}
 \underrightarrow{d_i \sim \mathcal{N}_{\vec{p}}}
 \begin{pmatrix}d_0\\ d_1\\ \cdots \\ d_N\end{pmatrix}
 \underrightarrow{\vec{d}\times \tau_\theta \to \tau^\prime}
 \begin{bmatrix}
 0.01 & 0.02 & \cdots & 0.3 \\
 0.41 & 0.24 & \cdots & 0.0 \\
 \cdots & \cdots & \cdots & \cdots \\
 0.51 & 0.09 & \cdots & 0.1 \\
 \end{bmatrix}
 \underrightarrow{\tau_k \sim \tau^\prime}
 \{\tau_k\}_{k=0}^K \to \hat{Q}(\tau_k)
 \to \begin{pmatrix}
 \hat{q}_0 \\
 \hat{q}_1 \\
 \cdots \\
 \hat{q}_N \\
 \end{pmatrix}
 \to \text{Oracle}(\cdot)
 \]
 \caption{Oracle-based pricing loop: historical price and demand state map to a new price vector; each product samples demand curves from $\mathcal{N}_{\vec{p}}$; trajectories are generated by mixing demand with behavioral kernels $\tau_\theta$ into transition matrix $\tau'$; sampled trajectories $\{\tau_k\}$ aggregate through proxy $Q(\cdot)$ to yield updated demand $\vec{\hat{q}}$, closing the feedback loop.}
 \label{fig:oracle_flow}
 \end{figure}
 \subsubsection{The Min-Max Objective}
 The robust policy $\pi^*$ is obtained by solving the maximin problem:
 \begin{equation}
--- a/paper/src/chapters/04-results.tex
+++ b/paper/src/chapters/04-results.tex
@@ -6,17 +6,62 @@
    \label{fig:supra_heatmap}
 \end{figure}
 \subsection{Behavioral Analysis}
-Include markov chains of transition matrices, compare distributions (look at Divergence metrics)
+Separability between human and agent sessions is evaluated by computing per-session divergence gap scores $\Delta_{H,s} - \Delta_{A,s}$ and comparing the two groups with a Mann-Whitney $U$ test. Table~\ref{tab:divergence_significance} reports the group-level descriptive statistics for the gap scores and the test result.
 \begin{table}[ht]
 \centering
 \caption{Per-session divergence gap ($\Delta_H - \Delta_A$) by actor class with Mann-Whitney $U$ test.}
 \label{tab:divergence_significance}
 \begin{tabular}{lccc}
 \toprule
 Group & $n$ & Mean gap & Std \\
 \midrule
 Human sessions & 11 & $-3.3522$ & $2.6748$ \\
 Agent sessions & 6 & $+1.6482$ & $2.8349$ \\
 \midrule
 \multicolumn{4}{l}{Mann-Whitney $U = 2.0$, $p = 0.0006$ (two-sided)} \\
 \bottomrule
 \end{tabular}
 \end{table}
 The sign structure is consistent with the theoretical expectation: human sessions produce negative gap scores (closer to the human centroid, far from the agent centroid) while agent sessions produce positive gap scores (closer to the agent centroid). The two-sided $p$-value of $0.0006$ indicates near-complete rank separation between the groups at $n_H=11$, $n_A=6$, providing strong evidence that the transition kernels are separable enough to justify their use as a control signal in downstream pricing.
 \subsection{Experimental Outcomes}
-Align with defined objectives, show results and statistical significance (or not).
+To evaluate robustness contributions, we compare two policies on the same environment family: (i) robust pricing with COI-aware reward and adversarial contamination step, and (ii) non-robust baseline with revenue-only reward (\texttt{--no-robust}).
 We report two preliminary stages before the full factorial interpretation. First, we executed a short calibration run at $\alpha=0.3$ (2 evaluation episodes, 3000 training timesteps per tier) across \texttt{qtable}, \texttt{ppo}, \texttt{a2c}, and \texttt{dqn}. In that first run, \texttt{ppo} produced the highest objective score and revenue (objective $=3.76\mathrm{e}5$, revenue $=4.15\mathrm{e}5$), while the remaining tiers stayed lower in this small-budget regime. The corresponding price traces show a monotone escalation for \texttt{ppo} (mean price from $8.61\mathrm{e}1$ to $1.49\mathrm{e}2$), whereas \texttt{qtable}, \texttt{a2c}, and \texttt{dqn} remained nearly flat over the episode horizon. This confirms that the simulation loop is able to express policy-dependent pricing dynamics rather than collapsing into a single trajectory shape.
 Second, we launched an overnight paired benchmark over $\alpha \in \{0.00,0.15,0.30,0.45,0.60\}$ with 8 evaluation episodes and 8000 timesteps, comparing robust and non-robust settings at fixed seed/tier/contamination tuples. At the time of writing, two seeds (11 and 22) are complete and one additional seed is still running. We therefore frame the numbers below as an initial signal, not a final claim.
 \begin{table}[ht]
 \centering
 \caption{Early overnight aggregate over completed seeds ($n=2$; seeds 11 and 22).}
 \label{tab:pricing_benchmark}
 \begin{tabular}{lcccc}
 \toprule
 Mode & Mean objective score & Mean revenue & Mean COI level & Mean margin \\
 \midrule
 Robust & $3.41\mathrm{e}5$ & $3.80\mathrm{e}5$ & $1.08\mathrm{e}2$ & 0.901 \\
 Non-robust (\texttt{--no-robust}) & $3.91\mathrm{e}5$ & $4.18\mathrm{e}5$ & $1.11\mathrm{e}2$ & 0.906 \\
 \bottomrule
 \end{tabular}
 \end{table}
 At pair level (same seed, tier, and contamination), robust exceeds non-robust in $13/40$ configurations on objective score and in $16/40$ configurations on revenue. The current early evidence therefore suggests a conditional robustness effect: the defense is active and measurable, but not yet uniformly beneficial without further calibration.
 \subsection{Interpretation and Insights}
-Inference from given patterns and show key findings.
+The Mann-Whitney result ($U=2.0$, $p<0.001$) confirms that per-session divergence gaps separate the two actor classes with near-zero overlap in rank ordering. This is the condition required for separability to act as a useful control signal in the pricing loop rather than just an auxiliary classifier score.
 The first calibration and overnight runs additionally confirm three practical points aligned with the thesis mechanism. First, the control loop is reproducible end-to-end (training, evaluation, artifact generation) across algorithms and contamination levels. Second, policy class materially changes price trajectories and resulting COI/revenue profiles under identical environment settings. Third, objective improvements from robustness are regime-dependent in the current baseline, which is consistent with the thesis claim that contamination-aware pricing needs explicit calibration rather than a one-size-fits-all penalty.
 We also note that maximizing revenue in isolation can favor aggressive high-price behavior; even in these early runs, the non-robust aggregate shows slightly higher mean COI and margin. For this reason, all subsequent reporting in this thesis is interpreted on a multi-metric basis (objective, revenue, COI, and stability), and not by revenue alone.
 \subsection{Anomalies}
 In our initial runs, we observed an instability pocket in one completed run (A2C, robust, seed 11, $\alpha=0.30$) with a large performance drop relative to neighboring configurations. We retain this run in the preliminary summary to avoid survivorship bias and treat it as evidence that robustness sensitivity analysis is necessary before final conclusions.
--- a/paper/src/chapters/05-discussion.tex
+++ b/paper/src/chapters/05-discussion.tex
@@ -1,18 +1,20 @@
 \section{Discussion}
 \subsection{Transition to Agentic Market Microstructure}
 Our analysis of the interaction dynamics between the platform and non-human actors suggests that the current static pricing models are insufficient for an agent-mediated economy. If we assume a transition toward a direct revelation mechanism, where actors must reveal their true valuation of a good through bidding dynamics, we inevitably introduce significant stochasticity into the pricing system. Unlike traditional e-commerce where prices are relatively sticky, such a mechanism implies a high volatility characteristic of financial equity markets (without the fungibility, however).
-However, ecommerce commodities differ fundamentally from financial securities: they possess a hard floor defined by unit economics and reservation prices. The market might react enthusiastically to an iPhone priced at \$1, such a transaction is not permissible. The platform must establish an initial valuation anchor ($P_{0}$) defined by the marginal cost plus a target margin, around which the market price is permitted to fluctuate. We propose the introduction of GenAI Agents as Institutional Market Makers.
+However, ecommerce commodities differ fundamentally from financial securities: they possess a hard floor defined by unit economics and reservation prices. Although the market might react enthusiastically to an iPhone priced at \$1, such a transaction is not permissible. The platform must establish an initial valuation anchor ($P_{0}$) defined by the marginal cost plus a target margin, around which the market price is permitted to fluctuate. We float the introduction of GenAI Agents as Institutional Market Makers. As the arms race for greater autonomy of agentic systems intensifies, AI agents may become commercially viable enough that everyday users interact directly with them rather than with e-commerce platforms.
 This is also under the assumption of expected transactional capabilities being given to AI Agents.
 \subsection{Risk Assessment and Limitations}
-Acknowledge risks and constraints and data sizes.
+This technology is not without a darker side: ethical concerns arise from the idea of deploying black-box solutions that set prices based on behavioral attributes. Approaches such as universal behavioral profile modeling (UBPM), used in recommendation systems, are already very broadly utilized.
 With a system like this there is potential for strong drift given the rapid advance of agentic systems and user preference. Our intent behind adding the UX term into the reward shaping process was to further address the risk of degraded user experience. Looking deeper at the underlying methodology, reinforcement learning does not come without its complications, such as reward hacking and the frequent lack of interpretability, which is quite critical in systems that have a strong impact on the revenue of a company.
 \subsection{Implications of Findings}
--- a/paper/src/chapters/06-conclusion.tex
+++ b/paper/src/chapters/06-conclusion.tex
@@ -1,8 +1,11 @@
 \section{Conclusion}
 For our troubles, we now conclude that...
 \subsection{Summary of contributions}
-Restate the thesis and key findings with validation of research objectives.
+The author's contribution was not without the advice of many experienced experts in the field. We thank Marco Casalaina, VP Products, Core AI and AI Futurist at Microsoft, for the initial critical discussion on the topic of dynamic pricing systems and the spark which has led to this work. We thank Eugene Bykovets, PhD, for pointing out the parallels in blockchain systems and the complexity of anonymous interaction and understanding of intent. Importantly, we thank Alberto Martín Izquierdo, my academic advisor, for the support throughout and for taking on the challenge of this ambitious work. Many breakthroughs were thanks to numerous discussions with my peers on the topics covered here.
 We also thank the head of innovation at Amadeus for insight into the industry split on the topic of collapsing margins. Finally, we acknowledge the power and use of generative AI technologies for in-depth research, rapid prototyping, and surfacing of key topics and niches.
 \subsection{Future Works and Next Steps}
-Identify the research gaps here and potential business implications and setup of business + Proposed extensions and a long term agenda.
+During the eight months of research dedicated to this work, a plethora of opportunities and industry gaps were identified; sadly, the majority of these could not be addressed directly.
--- a/paper/src/main-genpop.tex
+++ b/paper/src/main-genpop.tex
@@ -0,0 +1,87 @@
 % -*- TeX-master: t -*-
 \documentclass[12pt,letterpaper]{article}
 \input{preamble}
 \begin{document}
 \begin{titlepage}
    \centering
    \includegraphics[width=\textwidth]{graphics/banner.png}\\[0.8cm]
    \LARGE\textbf{PHANTOM: Pricing Heuristics Against Non-human Transaction Orchestration Mechanisms}\\[0.5cm]
    \large\textit{General Public Edition}\\[0.3cm]
    \Large\textbf{Daniel Rösel}\\
    \large\textit{Bachelor of Computer Science \& Artificial Intelligence}\\[0.5cm]
    \Large\textit{Supervised by:}\\
    \Large\textbf{Alberto Martín Izquierdo}\\
    \large\textit{IE University, Madrid, Spain}\\[1cm]
    \large\today
 \end{titlepage}
 \begin{abstract}
 With the accelerated growth of Large Language Model agents in e-commerce, a novel adversarial dynamic emerges in digital markets. This paper addresses the vulnerability of dynamic pricing systems to AI intermediaries that decouple the information-gathering stage from transaction execution. By conducting reconnaissance in isolated sessions, agents circumvent the ``Cost of Information'' (COI), defined as the accumulated price premium typically extracted through demand expression estimators.
 We formally define this phenomenon and derive the Cost of Information Theorem, proving that as the saturation of independent, utility-maximizing agents increases, the platform's ability to sustain a COI converges to zero, rendering standard dynamic pricing mechanisms incentive-incompatible.
 To respond to this threat we propose a defensive framework which integrates behavioral economics with Adversarially Distributionally Robust Optimization (DRO). We introduce a custom e-commerce research platform built on hybrid Kappa-Lambda architecture, designed to capture and simulate high-fidelity controlled interaction trajectories. We further demonstrate through modeling that human and agent behaviors exhibit distinct transition probability kernels, enabling the construction of discriminative models based on Kullback-Leibler divergence.
 These behavioral signals serve as inputs for a Distributionally Robust Reinforcement Learning (DR-RL) agent. We formulate the pricing problem as a Stackelberg game where the learner optimizes against an ambiguity set of demand distributions defined by the Wasserstein distance. This approach allows the pricing policy to remain robust against non-stationary contamination without overfitting to deterministic demand curves. The research validates a mechanism for preserving margin integrity and market equilibrium in an agent-mediated economy, while minimizing degradation to the legitimate human user experience (UX).
 \end{abstract}
 \noindent\textbf{Keywords:} Dynamic Pricing, LLM Agents, Adversarial Machine Learning, E-commerce, Behavioral Detection, Reinforcement Learning
 \vspace{1em}
 \noindent\textbf{Acknowledgments:} This research was supported by the TPU Research Cloud program, which provided access to Google Cloud TPU accelerators (including TPU v4, v5e, and v6e).
 \vspace{1em}
 \noindent\textbf{Note to Readers:} This is a general public edition of the technical thesis. Mathematical formulas and complex algorithms have been translated into plain language explanations while preserving the complete narrative and all research findings.
 \clearpage
 \input{mirrors/genpop/01-intro}
 \input{mirrors/genpop/02-literature-review}
 \input{mirrors/genpop/03-methodology}
 \input{mirrors/genpop/04-results}
 \input{mirrors/genpop/05-discussion}
 \input{mirrors/genpop/06-conclusion}
 \printbibliography
 \clearpage
 \appendix
 \section{Terminology}
 \begin{description}
 \item[Agent A] An actor of non-human nature, powered by an LLM.
 \item[Human H] An individual human with some job to be done.
 \item[Actor] Defines a type of class which is either Agent or Human and has the capability to carry out actions on a web platform.
 \item[Platform] Any web-based platform which serves an interface to a collection of items that can be purchased, each at some price.
 \item[Behavioral Model] A mathematical model predicting what action comes after a series of prior actions.
 \item[LLM] Large Language Model served by some provider with the abstracted capability of tool calling.
 \item[TPU] Tensor Processing Unit which is a unique kind of chip architecture developed by Google.
 \item[Trajectory] Defined as a series of unspecified length, collecting data on states of some object over time.
 \item[Cost of Information (COI)] The average premium extracted above marginal cost due to information asymmetry.
 \item[Contamination Ratio] The proportion of agent sessions versus human sessions in the system.
 \item[Separability] The ability to distinguish between human and agent behavioral patterns.
 \end{description}
 \section{Aggregate Compute Budget Derivation}
 \label{app:compute_budget}
 The claimed peak throughput of approximately 160 PFLOPS (petaflops, a measure of computational power) follows from multiplying the per-chip peak performance by the number of chips in each allocation tier and summing across generations.
 \begin{table}[ht]
 \centering
 \caption{Per-generation contribution to aggregate throughput.}
 \label{tab:compute_derivation}
 \begin{tabular}{@{}lrrr@{}}
 \toprule
 \textbf{TPU Gen.} & \textbf{Chips} & \textbf{Peak per chip (TFLOPS)} & \textbf{Subtotal (TFLOPS)} \\
 \midrule
 v6e (Trillium) & 128 & 918 & $128 \times 918 = 117{,}504$ \\
 v5e            & 128 & 197 & $128 \times 197 = 25{,}216$  \\
 v4             &  64 & 275 & $64  \times 275 = 17{,}600$  \\
 \midrule
 \textbf{Total} & \textbf{320} & & $\mathbf{160{,}320}$ \\
 \bottomrule
 \end{tabular}
 \end{table}
 Converting to petaFLOPS: 160,320 TFLOPS equals approximately 160 PFLOPS. This is the theoretical peak under sustained arithmetic operations; realized throughput depends on memory bandwidth utilization and inter-chip communication overhead, but the figure serves as a useful upper bound for provisioning decisions.
 \end{document}
--- a/paper/src/main.tex
+++ b/paper/src/main.tex
@@ -45,7 +45,39 @@ These behavioral signals serve as inputs for a Distributionally Robust Reinforce
 \begin{description}
 \item[Agent $A$] An actor of non-human nature, powered by an LLM.
 \item[Human $H$] An individual human with some job to be done.
 \item[Actor $\theta$] Defines a type of class which is either Agent or Human and has the capability to carry out actions on a web platform.
 \item[Platform] Any web-based platform which serves an interface to a collection of items that can be purchased, each at some price $p_i$.
 \item[Behavioral Model] A mathematical model predicting what action comes after a series of prior actions.
 \item[LLM] Large Language Model served by some provider with the abstracted capability of tool calling.
 \item[TPU] Tensor Processing Unit which is a unique kind of chip architecture developed by Google.
 \item[Trajectory] Defined as a series of unspecified length, collecting data on states of some object over time.
 % TODO: maybe define other things in a similar succinct manner
 \end{description}
 \section{Aggregate Compute Budget Derivation}
 \label{app:compute_budget}
 The claimed peak throughput of approximately 160\,PFLOPS follows from multiplying the per-chip BF16 peak (from official Google Cloud TPU documentation) by the number of chips in each allocation tier and summing across generations.
 \begin{table}[ht]
 \centering
 \caption{Per-generation contribution to aggregate BF16 throughput.}
 \label{tab:compute_derivation}
 \begin{tabular}{@{}lrrr@{}}
 \toprule
 \textbf{TPU Gen.} & \textbf{Chips} & \textbf{Peak BF16/chip (TFLOPS)} & \textbf{Subtotal (TFLOPS)} \\
 \midrule
 v6e (Trillium) & 128 & 918 & $128 \times 918 = 117{,}504$ \\
 v5e            & 128 & 197 & $128 \times 197 = 25{,}216$  \\
 v4             &  64 & 275 & $64  \times 275 = 17{,}600$  \\
 \midrule
 \textbf{Total} & \textbf{320} & & $\mathbf{160{,}320}$ \\
 \bottomrule
 \end{tabular}
 \end{table}
 Converting to petaFLOPS: $160{,}320\;\text{TFLOPS} = 160.32\;\text{PFLOPS} \approx 160\;\text{PFLOPS}$. This is the theoretical peak under sustained BF16 arithmetic; realized throughput depends on memory bandwidth utilization and inter-chip communication overhead, but the figure serves as a useful upper bound for provisioning decisions.
 % \input{../build/concatenated_code}
 \end{document}
--- a/paper/src/mirrors/arxiv/INSTRUCTIONS.md
+++ b/paper/src/mirrors/arxiv/INSTRUCTIONS.md
@@ -0,0 +1,64 @@
 The source thesis lives in paper/src/ (main.tex + chapters/ + preamble.tex + bib/references.bib).
 Your job is to produce a self-contained arxiv-ready submission in this directory.
 ## What to produce
 A single compilable main.tex that reproduces the full thesis content and compiles
 cleanly with pdflatex + bibtex (no latexmk extras, no shell-escape). The output
 must pass arxiv's AutoTeX pipeline without manual intervention.
 ## arxiv constraints to satisfy
 1. Flat layout: all files referenced by \input or \includegraphics must sit in
   this directory or one level below (no ../../ relative paths). Copy or symlink
   chapters and graphics here, then rewrite \input paths accordingly.
 2. Bibliography: arxiv does not run biber. Use bibtex. The preamble currently
   loads biblatex with backend=bibtex — keep that, or switch to natbib +
   \bibliographystyle{plainnat} if biblatex causes trouble. Either way, include
   the pre-built .bbl file in the submission tarball (arxiv runs bibtex once but
   having the .bbl avoids failures).
 3. Packages: remove or replace anything arxiv's TeX Live snapshot may not carry.
   Known problematic ones in the current preamble:
   - newtxtext/newtxmath: fine on recent TeX Live, but have a fallback to
     \usepackage{times} + \usepackage{mathptmx} if the build fails.
   - algorithm2e: arxiv supports it, keep it.
   - cleveref: fine.
   - pgfplots: fine, but pin compat=1.18.
 4. No \include of generated code appendix: the concat_code.sh appendix that gets
   appended at build time is for the university submission only. Omit it here or
   replace with a short note pointing to the repository URL.
 5. Hyperref: keep it but add \usepackage[hidelinks]{hyperref} to suppress colored
   boxes, which look bad in arxiv's PDF renderer.
 6. Title / author block: use a normal \maketitle with full author name, affiliation,
   and date. Do not use the titlepage environment from the university version.
 7. Double spacing: remove \doublespacing (arxiv readers expect single spacing).
 8. Page headers: remove the fancyhdr block; arxiv adds its own stamp header.
 ## Minimal diff principle
 Preserve all content, section order, equations, theorems, figures, and tables from
 the original thesis exactly. Only make the structural changes listed above. Do not
 paraphrase, summarize, or rewrite prose. The mirror is a format adaptation, not
 an editorial one.
 ## How to verify
 From paper/src/mirrors/arxiv/ run:
  pdflatex main.tex
  bibtex main
  pdflatex main.tex
  pdflatex main.tex
 The build must complete without errors (warnings are acceptable). The resulting
 main.pdf should be visually equivalent to paper/build/main.pdf modulo formatting
 differences from removing double spacing and the titlepage.
 To build via make from the repo root:
  make pdf.arxiv
--- a/paper/src/mirrors/cais2026/main.tex
+++ b/paper/src/mirrors/cais2026/main.tex
@@ -0,0 +1,441 @@
 % CAIS 2026 submission — ACM sigconf double-column
 % 9 pages (excluding references and appendices)
 \documentclass[sigconf]{acmart}
 % math
 \usepackage{amsmath,amsthm}
 \newtheorem{theorem}{Theorem}
 \newtheorem{definition}{Definition}
 % figures / tables
 \usepackage{booktabs}
 \usepackage{subcaption}
 \usepackage{tikz}
 \usepackage{pgfplots}
 \pgfplotsset{compat=1.18}
 \usetikzlibrary{positioning, shapes, arrows.meta, fit, backgrounds}
 % algorithm
 \usepackage[ruled,vlined]{algorithm2e}
 % inline enumerations
 \usepackage[inline]{enumitem}
 % bibliography — acmart uses natbib + BibTeX by default (ACM-Reference-Format.bst)
 \bibliographystyle{ACM-Reference-Format}
 % remove ACM copyright block for submission draft
 \setcopyright{none}
 \settopmatter{printacmref=false}
 \renewcommand\footnotetextcopyrightpermission[1]{}
 \pagestyle{plain}
 \begin{document}
 \title{PHANTOM: A Compound System for Robust Dynamic Pricing\\under Agentic Traffic Contamination}
 % Double-blind submission: author information withheld for review
 \author{Anonymous Submission}
 \affiliation{}
 \email{}
 \begin{abstract}
 Dynamic pricing pipelines in e-commerce consume behavioral demand signals to set prices, but the growing presence of LLM-powered agents introduces a novel contamination vector: these agents decouple information gathering from transaction execution across isolated sessions, eroding the platform's pricing power.
 We present PHANTOM, a modular compound system that addresses this threat end-to-end. The system is composed of five orchestrated components: (1)~a configurable e-commerce research platform with dual-stream Kafka ingestion for behavioral and price-exposure events, (2)~a GOFAI-based weak labeling stage that partitions sessions into human and agent classes using rule-based predicates, (3)~a transition-kernel estimator that learns separable Markov models for each actor type and constructs a Contamination Generator for controlled simulation, (4)~a Distributionally Robust Reinforcement Learning policy that optimizes pricing under a Wasserstein ambiguity set conditioned on per-session divergence signals, and (5)~an Airflow-orchestrated pipeline that connects online data collection to offline policy training via Redis-backed model serving.
 We formally derive the Cost of Information Theorem, proving that standard pricing mechanisms become incentive-incompatible as agent query volume grows. The system architecture, interaction schema, and factorial experiment harness are designed for reproducibility and are released as open artifacts. We evaluate system-level tradeoffs between revenue protection, information leakage, and user-experience degradation through a three-objective reward structure.
 \end{abstract}
 \keywords{Compound AI Systems, Dynamic Pricing, LLM Agents, System Architecture, Behavioral Detection, Distributionally Robust Optimization, Evaluation Testbed}
 \maketitle
 %% ====================================================================
 %% 1  INTRODUCTION
 %% ====================================================================
 \section{Introduction}
 The current innovation boom in generative artificial intelligence and its applications to knowledge-based work tasks has brought many competing technologies for browser-use automation, with benchmarks and evaluations~\cite{xia_evaluation-driven_2025} motivating the development of capabilities focused on commercial research, understanding, and transaction execution~\cite{xie_osworld_2024}. The ``AI Agent'' market is forecasted to grow from around USD 5--8 billion in 2025 to USD 42--52 billion by 2030. This surge reflects adoption in e-commerce, customer service, and enterprise automation, where agents handle interactions previously done by humans, raising the question of how these systems should be designed for future robustness as well as how to maintain a competitive edge in the analytical components of e-commerce platforms~\cite{markntel_advisors_global_2025}.
 The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on. Dynamic pricing algorithms rely on directly translating demand features $q$ to new price assignments $\hat{p}$ across a catalogue of products of size $N$. When agent-driven reconnaissance traffic contaminates these demand signals, the pricing pipeline produces biased estimates that erode margins. This is not a single-model failure but a \textit{compound system} failure: the data ingestion, demand estimation, policy optimization, and model serving stages each propagate and amplify the contamination.
 Existing work treats bot detection and dynamic pricing as separate concerns. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for separating non-human reconnaissance from genuine human demand expression and integrating that separation into pricing heuristics without degrading legitimate user experience. This gap is what our contribution aims to address.
 \subsection{System-Level Contributions}
 We frame our contribution along the four CAIS pillars---architectural patterns, system optimization, engineering and operations, and evaluation---rather than as a standalone pricing algorithm:
 \begin{enumerate}
    \item \textbf{Architectural Pattern.} We present PHANTOM as a modular pipeline: ingestion (Kafka dual-stream) $\to$ weak labeling (GOFAI predicates) $\to$ transition-kernel estimation $\to$ contamination-aware robust policy (DR-RL) $\to$ model serving (Redis). Each component has a clean interface and can be independently replaced or extended.
    \item \textbf{System Optimization.} The pipeline includes a non-differentiable decision stage (the RL policy operating through a Stackelberg game simulator). We address cost--performance tradeoffs by designing a factorial experiment grid ($4\times4\times3\times2\times2$) executed on a TPU cluster, and define a three-objective reward that explicitly trades off revenue, leakage prevention, and user-experience preservation.
    \item \textbf{Engineering and Operations.} The system is deployed as a set of containerized microservices orchestrated by Airflow, with Kafka for event streaming, Redis for low-latency price serving, and PostgreSQL for experiment tracking. We describe the observability design including dual-stream event logging and session-level contamination monitoring.
    \item \textbf{Evaluation Testbed.} We release a configurable e-commerce platform (hotel and airline modes) with instrumented UI components, a pool of assignable tasks for controlled experiments, and a reproducible interaction schema. This testbed enables evaluation of pricing systems under controlled agentic contamination with metrics for stability, regret, revenue, and safety constraints.
 \end{enumerate}
 \subsection{Research Questions}
 This work addresses three core research questions:
 \begin{enumerate}
    \item[\textbf{RQ1}] \textit{Separability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting?
    \item[\textbf{RQ2}] \textit{Theoretical Impact}: What is the formal relationship between agent contamination levels and the erosion of pricing power in dynamic pricing systems?
    \item[\textbf{RQ3}] \textit{Robust Mitigation}: How can pricing policies be constructed to maintain margin integrity under unknown and non-stationary levels of agent contamination?
 \end{enumerate}
 %% ====================================================================
 %% 2  RELATED WORK
 %% ====================================================================
 \section{Related Work}
 An agent in the context of artificial intelligence is generally defined as anything that can reason and act upon observations of its environment and carry out actions through effectors. This definition by Russell and Norvig~\cite{russell_artificial_2021} is further developed in an economic context by Parkes and Wellman~\cite{parkes_economic_2015}, suggesting AI research attempts to construct a synthetic \textit{homo economicus}, which may also be termed \textit{machina economicus}. A specific class of this \textit{machina economicus}, the Large Language Model (LLM) agent, is defined as an autonomous system capable of achieving goals and adapting post-training, often without needing explicit code or fundamental model changes~\cite{xia_evaluation-driven_2025}.
 We must, however, acknowledge the limits of the current state of the art: the OSWorld benchmark of Xie et al.~\cite{xie_osworld_2024} shows that, on multi-modal tasks across desktop and web interaction modes, the top-performing model reaches only 12.24\% success, whereas humans reach 72\%. This weakness matters because it clarifies the near-term threat model: practical exploitation does not require a fully competent ``computer assistant'', only enough automation to perform high-volume reconnaissance actions that can contaminate behavioral signals.
 The introduction of these mediating actor entities into economic systems is further creating a threat of false-name bidding~\cite{yokoo_effect_2004}. Other research on pseudonyms in dynamic systems demonstrates whitewashing in AI agents which can ignore defensive mechanisms by re-entry with different identities~\cite{feldman_free-riding_2004}. The transaction costs related to information gathering and negotiation are bound to collapse towards zero as proposed by Shahidi et al.~\cite{shahidi_coasean_2025}, calling for a re-evaluation of the boundaries between firms and markets.
 Explorations of algorithmic collusion by LLMs~\cite{fish_algorithmic_2025} have demonstrated a cross-model tendency of market division with a strong sensitivity to instructions provided in the ``system prompt''. Our effort to combat contamination stems from research by Hardt et al.~\cite{hardt_strategic_2015} on strategic classification, in conjunction with Liu et al.~\cite{liu_contextual_2024} who demonstrate a linear regret if contamination is ignored. To bridge the gap between detection and robust pricing, we look at Distributionally Robust Optimization (DRO). As defined by Kuhn et al.~\cite{kuhn_wasserstein_2024}, DRO provides a framework for decision-making under ambiguity, where the true data distribution is unknown but lies within a ``Wasserstein ball'' of a target distribution. In our context, the ``ambiguity set'' represents the uncertainty introduced by agentic reconnaissance.
 In order to create an environment in which prices can be tested against a demand estimate generated by some behavioral model, we take inspiration from the architecture proposed by Ie et al.~\cite{ie_recsim_2019} in the RecSim platform built for recommendation systems. The contribution of RecSim enables researchers to better understand learning algorithms in fixed environments, a gap we identify as needing to be bridged within the space of dynamic pricing.
 %% ====================================================================
 %% 3  SYSTEM ARCHITECTURE
 %% ====================================================================
 \section{System Architecture}
 We present the PHANTOM system as a compound pipeline with five modules connected by clean interfaces. Each module is independently deployable and replaceable. The architecture is designed around two operational loops: an \textit{online loop} for data collection and price serving, and an \textit{offline loop} for policy training and evaluation.
 \subsection{Online Loop: Data Ingestion and Price Serving}
 The architecture begins with the deployed web-apps posting interaction data to our backend, which processes them and stores each ingested interaction in a Kafka cluster. This serves as our data reservoir, tracking each interaction and associating it with its session and, importantly, with the experiment it belongs to. Not only do we track the behavioral interactions, but our pricing provider micro-service, once called by the frontend, also reports the observed/queried price--product pair into Kafka. This dual-stream architecture enables joint analysis of price exposure and behavioral response.
 Each price query generates a record $(i, p, \text{sid}, \phi, t)$ associating the product, displayed price, requesting session, platform mode, and timestamp. The final stage of the pricing pipeline submits computed dynamic pricing results into a Redis database for quick updates which is then read by the pricing provider and displayed on the webapp.
 \subsection{Offline Loop: Policy Training}
 The Kafka cluster is subscribed to by our pipeline which is configured on a schedule in Airflow, with the possibility of manual trigger. The offline loop consumes collected trajectories, performs weak labeling and transition-kernel estimation (Section~\ref{sec:separability}), trains the DR-RL policy (Section~\ref{sec:drrl}) in a simulator, and pushes the resulting policy to Redis for the pricing provider to read.
 \subsection{Online Dynamic Pricing (Baseline)}
 In order to collect data from actors under correct conditions we replicate a naive and simple dynamic pricing algorithm which runs in the background during the experiments. The dynamic pricing pipeline computes a demand estimate vector $\hat{q} \in \mathbb{R}^N$ by a weighted sum of interactions for each product. The transformation that governs this baseline pricing is a surge-based policy:
 \begin{equation}
 \hat{p}_i = \begin{cases}
 p_{0,i} \cdot \lambda_{\text{surge}} & \text{if } \hat{q}_i \geq \theta_{\text{high}} \\
 p_{0,i} \cdot \lambda_{\text{disc}} & \text{if } \hat{q}_i \leq \theta_{\text{low}} \\
 p_{0,i} & \text{otherwise}
 \end{cases}
 \quad \forall i \in \{1, \ldots, N\}
 \end{equation}
 where $p_0 \in \mathbb{R}^N$ is the base price vector, $\theta_{\text{high}}, \theta_{\text{low}} \in \mathbb{R}$ are demand thresholds, and $\lambda_{\text{surge}} = 1.2$, $\lambda_{\text{disc}} = 0.9$ are multiplicative factors. This baseline is the component that the DR-RL policy replaces once trained.
 \subsection{Interaction Schema}
 We extend the basic event tuple $e_{s,k}$ to capture the full observational signal available to the platform:
 \begin{equation}
 e_{s,k} = \left( a_{s,k}, \, i_{s,k}, \, t_{s,k}, \, \mu_{s,k}, \, \delta_{s,k} \right)
 \end{equation}
 where $\mu_{s,k} \in \mathcal{M}$ is a metadata record containing action-specific context (e.g., price observed, filter parameters, element text), and $\delta_{s,k} \in \mathbb{R}_+$ is the dwell time in milliseconds for attention-based actions.
 The action space $\mathcal{A}$ is partitioned into four semantic categories based on the behavioral signal each action conveys:
 \begin{table}[t]
 \centering
 \small
 \caption{Action space partition $\mathcal{A} = \mathcal{A}_{\text{nav}} \cup \mathcal{A}_{\text{cart}} \cup \mathcal{A}_{\text{filter}} \cup \mathcal{A}_{\text{dwell}}$ with signal interpretation.}
 \label{tab:action_space}
 \begin{tabular}{@{}llll@{}}
 \toprule
 \textbf{Category} & \textbf{Actions} & \textbf{Signal} & $\boldsymbol{\omega}$ \\
 \midrule
 $\mathcal{A}_{\text{cart}}$ & add, remove, checkout, purchase & Purchase intent & High \\
 $\mathcal{A}_{\text{dwell}}$ & hover\_title, hover\_paragraph & Sustained attention & Medium \\
 $\mathcal{A}_{\text{nav}}$ & page\_view, view\_item, learn\_more & Discovery & Low \\
 $\mathcal{A}_{\text{filter}}$ & search, filter\_date, filter\_price, sort & Preference refinement & Lowest \\
 \bottomrule
 \end{tabular}
 \end{table}
 The metadata record $\mu$ varies by action type. This heterogeneous structure is captured via a schema-on-read approach in our Kafka ingestion pipeline, where events are validated against type-specific schemas before storage.
 %% ====================================================================
 %% 4  METHODOLOGY
 %% ====================================================================
 \section{Methodology: Pipeline Components}
 This section details the theoretical and practical framework behind each pipeline component. We formalize the problem environment, derive the \textit{Cost of Information} (COI) theorem that motivates the system design, describe the separability and contamination modules, and formulate the robust pricing policy.
 \subsection{Problem Formalization}
 We define a commercial environment where the platform interacts with a stream of sessions. Let $\mathcal{S}$ denote the set of all sessions. Each session $s \in \mathcal{S}$ is generated by an actor belonging to a latent class $Y_s \in \{H, A\}$, where $H$ denotes Human and $A$ denotes Agent.
 Each session produces a trajectory of observable events $\tau_s = (e_{s,1}, \ldots, e_{s,L_s})$. The platform does not directly observe the true underlying demand function $d(p)$. Instead, it observes a behavioral proxy $\hat{q}_t$, which is a composite signal derived from the mixture of actor types. We define the demand proxy for product $i$ at epoch $t$ as a weighted aggregation of events:
 \begin{equation}
 \label{eq:qhat}
 \hat{q}_{t,i} = \sum_{s \in \mathcal{S}_t} \sum_{k=1}^{L_s} \omega(a_{s,k}) \cdot \mathbb{1}[i_{s,k} = i]
 \end{equation}
 where $\omega: \mathcal{A} \to \mathbb{R}_+$ assigns weights to actions based on their signal strength regarding willingness to pay, with $\omega(\mathcal{A}_{\text{cart}}) > \omega(\mathcal{A}_{\text{dwell}}) > \omega(\mathcal{A}_{\text{nav}}) > \omega(\mathcal{A}_{\text{filter}})$ reflecting decreasing commitment (Table~\ref{tab:action_space}).
 The total observed demand is a stochastic process governed by the mixture:
 \begin{equation}
 \label{eq:mixture_demand}
 Q(p) = (1-\alpha) \cdot \mathbb{E}_{\theta \sim \mathcal{D}_H}[d(p; \theta)] + \alpha \cdot \mathbb{E}_{\theta \sim \mathcal{D}_A}[d(p; \theta)] + \epsilon_t
 \end{equation}
 where $\alpha \in [0, 1]$ represents the contamination parameter (proportion of agents) and $\epsilon_t$ is non-stationary market noise. This mixture is the source of the compound-system failure: each downstream component inherits the bias introduced by $\alpha$.
 \subsection{Cost of Information: Why Standard Pipelines Fail}
 The platform's pricing power comes from information asymmetry: users who express strong interest signals pay more than the base price. We quantify this markup as the \textit{Cost of Information} (COI), which represents the average premium extracted above marginal cost.
 \begin{definition}[Cost of Information]
 Let $\pi(\tau)$ be a pricing policy mapping interaction histories to prices. The COI is defined as:
 \begin{equation}
 \text{COI} = \mathbb{E}[P] - \underline{p}
 \end{equation}
 where $\mathbb{E}[P]$ is the expected price charged by the policy and $\underline{p}$ is the minimum viable price (marginal cost).
 \end{definition}
 We now formally demonstrate that any pricing pipeline without a contamination-aware component is structurally vulnerable.
 \begin{theorem}[COI Erosion in the Limit]
 Let $N$ be the number of independent, utility-maximizing agents querying the platform. Let $p_{(1)}$ be the first order statistic (minimum) of the prices offered to these agents. As $N \to \infty$, the Cost of Information converges to 0.
 \end{theorem}
 \begin{proof}
 Consider $N$ independent agents querying the platform, each receiving a price sample $p_i$ drawn from the pricing policy's distribution $F(p)$ with support $[\underline{p}, \bar{p}]$. A strategic agent conducting reconnaissance will select the minimum observed price: $p_{(1)} = \min(p_1, \ldots, p_N)$.
 The probability that the minimum price exceeds some threshold $t$ is:
 \begin{equation}
 P(p_{(1)} > t) = [1 - F(t)]^N
 \end{equation}
 For any price $t > \underline{p}$, the CDF satisfies $F(t) > 0$, so $1 - F(t) < 1$. The expected minimum price can be written as:
 \begin{equation}
 \mathbb{E}[p_{(1)}] = \underline{p} + \int_{\underline{p}}^{\bar{p}} [1 - F(t)]^N \, dt
 \end{equation}
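 Since the integrand is bounded by $1$ on the bounded interval $[\underline{p}, \bar{p}]$ and vanishes pointwise as $N \to \infty$ for all $t > \underline{p}$, the dominated convergence theorem implies that the integral converges to zero: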
 \begin{equation}
 \lim_{N \to \infty} \text{COI} = \lim_{N \to \infty} (\mathbb{E}[p_{(1)}] - \underline{p}) = 0
 \end{equation}
 \end{proof}
 This result is the theoretical motivation for the system design: it proves that standard pricing policies $\pi$ fail to extract surplus in the presence of large-scale agentic search, necessitating a contamination-aware component in the pipeline.
 \subsection{Module: Separability and Contamination Generation}
 \label{sec:separability}
 To train a robust pricing learner, we need a simulator that can generate realistic interaction data under controlled contamination. We build this from collected data using a two-stage approach.
 \subsubsection{GOFAI-Based Weak Labeling.}
 We use Good Old-Fashioned AI (GOFAI) heuristics to generate weak labels for separability. A set of rule-based predicates $\phi_j: \tau \to \{0,1\}$ partitions dataset $\mathcal{D}$ into high-confidence sets $\mathcal{D}_H$ and $\mathcal{D}_A$. We then estimate separate transition models for both groups and ask a direct methodological question: are the kernels separable enough to justify downstream pricing control that depends on that separability?
 To answer this, we compute average KL divergence between transition probability matrices. This statistic gives global separability and event-level diagnostics at the same time. In our balanced dataset (50\% human, 50\% agent), the average divergence is approximately $1.8$.
 \begin{definition}[KL Divergence for Transition Distributions]
 Let $P_e$ and $Q_e$ be categorical distributions over destination states following event $e$, derived from human and agent trajectories respectively. The KL divergence between these distributions is:
 \begin{equation}
  D_{\mathrm{KL}}(P_e \parallel Q_e) = \sum_{k \in \mathcal{S}_e} P_e(k) \log \frac{P_e(k)}{Q_e(k)}
 \end{equation}
 where $\mathcal{S}_e$ denotes the set of destination events that follow $e$ in the human trajectories.
 \end{definition}
 With these divergence features we train a contrastive model to estimate a weak agent probability $f(\tau)\in[0,1]$, which serves as the interface between the separability module and the downstream pricing policy.
 \subsubsection{Transition-Kernel Estimation and Contamination Generator.}
 \label{sec:tpe}
 For both subsets, we model session dynamics as an MDP and estimate transition kernel $\mathcal{T}$. For each actor type we estimate global kernels $\hat{\mathcal{T}}_A$ and $\hat{\mathcal{T}}_H$, then cluster into behavioral sub-kernels $\hat{\mathcal{T}}_y^i$ to avoid collapsing all behavior into one average profile. Transition probabilities are estimated by maximum likelihood:
 \begin{equation}
    \hat{P}(s' \mid s) = \frac{N(s, s')}{\sum_{k \in \mathcal{S}} N(s, k)}
 \end{equation}
 where $N(s, s')$ is the observed transition count. This allows us to construct a \textit{Contamination Generator} $\mathcal{G}(\alpha)$. Given a clean trajectory dataset, $\mathcal{G}$ injects synthetic agent trajectories sampled from $\hat{\mathcal{T}}_A$ until the effective mixing ratio reaches $\alpha$. The generator is the key enabler of the offline training loop: it produces the empirical distribution that centers the ambiguity set for robust optimization.
 \subsection{Module: Distributionally Robust Pricing Policy}
 \label{sec:drrl}
 We formalize the interaction between the dynamic pricing system and non-human actors as a \textit{Stackelberg Game} (Leader-Follower) with incomplete information. This framework captures the hierarchical nature of the problem: the Platform (Leader) sets a pricing policy, and the Actors (Followers)---both Humans and Agents---observe these prices and react strategically.
 \subsubsection{Players and Objectives.}
 Let $t \in \{1, \dots, T\}$ denote discrete time steps. At each step $t$, given a state $s_t \in \mathcal{S}$ (representing inventory, time of day, and historical interactions), the platform sets a price $p_t \in [p_{\min}, p_{\max}]$. The platform's goal is to maximize the cumulative revenue from genuine human transactions while mitigating the distortion caused by agent interactions.
 \begin{itemize}
    \item \textbf{The Human ($H$):} Acts as a \textit{myopic utility maximizer}. A human $i$ has a private valuation $v_i$ for the product. They execute a purchase decision $d_i \in \{0, 1\}$ based on the consumer surplus:
    \begin{equation}
        d_i(p_t) = \mathbb{I}(v_i - p_t \geq 0)
    \end{equation}
    \item \textbf{The Agent ($A$):} Acts as an \textit{information maximizer} (reconnaissance). The agent generates interaction events to estimate the platform's pricing function. The agent's reward function is defined by Information Gain:
    \begin{equation}
        R_A(p_t) = H(\mathcal{P}) - H(\mathcal{P} \mid p_t) - c_{\text{query}}
    \end{equation}
    where $H(\mathcal{P})$ is the entropy of the agent's belief regarding the price distribution, and $c_{\text{query}} \approx 0$ for LLMs.
 \end{itemize}
 \subsubsection{Contamination-Conditioned Demand.}
 Given a newly observed partial trajectory $\tau'$, we compute its empirical transition kernel $\hat{\mathcal{T}}'$ and measure divergence against the learned prototypes $\bar{\mathcal{T}}_H$ and $\bar{\mathcal{T}}_A$ from Section~\ref{sec:tpe}:
 \begin{align}
  \Delta_H(\tau') &= D_{\mathrm{KL}}(\hat{\mathcal{T}}^\prime \parallel \bar{\mathcal{T}}_H) \\
  \Delta_A(\tau') &= D_{\mathrm{KL}}(\hat{\mathcal{T}}^\prime \parallel \bar{\mathcal{T}}_A)
 \end{align}
 These divergence statistics serve as the operational connector between the separability module and the pricing policy. We define the per-session contamination estimate as:
 \begin{equation}
 \label{eq:alpha_hat}
    \hat{\alpha}(\tau') = \sigma\big(\beta(\Delta_H(\tau') - \Delta_A(\tau'))\big)
 \end{equation}
 where $\sigma$ is the logistic function and $\beta > 0$ is an inverse-temperature (sharpness) parameter. This maps separability directly into a scalar control input for the pricing objective.
 \subsubsection{Ambiguity Set Construction.}
 Because the contamination level $\alpha$ and demand shift are non-stationary, a point estimate of the demand distribution is insufficient. Let $\hat{P}_N$ denote the empirical reference distribution induced by the Contamination Generator $\mathcal{G}(\alpha)$. We define the Wasserstein ambiguity set:
 \begin{equation}
 \mathcal{U}_\epsilon(\hat{P}_N) = \left\{ Q \in \mathcal{P}(\Xi) : W_p(Q, \hat{P}_N) \le \epsilon \right\}
 \end{equation}
 where $W_p$ is the $p$-Wasserstein distance and $\epsilon > 0$ is the ambiguity radius. The choice of Wasserstein distance is deliberate: unlike $f$-divergence based ambiguity sets, Wasserstein balls do not require absolute continuity between the nominal and adversarial distributions~\cite{kuhn_wasserstein_2024}, which is critical when the adversary can introduce distributional ``black swans'' through novel agent behaviors not present in training data.
 \subsubsection{Robust Pricing Objective.}
 The platform seeks a policy $\pi^*$ that maximizes worst-case revenue over the ambiguity set while penalizing information leakage to suspected agents:
 \begin{equation}
 \label{eq:robust_policy}
 \pi^* = \arg \max_{\pi} \min_{Q \in \mathcal{U}_\epsilon} \; \mathbb{E}_{d \sim Q} \left[ R(p, d) - \lambda \cdot \text{COI}_{\text{leak}}(p, \tau') - \eta \cdot \text{UX}(\tau', p) \right]
 \end{equation}
 where $R(p, d) = p \cdot d$ is the revenue function.
 \begin{definition}[COI Leakage]
 The per-query information leakage cost is:
 \begin{equation}
 \text{COI}_{\text{leak}}(p,\tau') = \hat{\alpha}(\tau') \cdot \text{InfoValue}(p,\tau')
 \end{equation}
 where $\hat{\alpha}(\tau')$ is the session contamination estimate from Eq.~\ref{eq:alpha_hat}. The $\text{InfoValue}$ admits two instantiations:
 \begin{enumerate}
    \item \textbf{Query-tax surrogate:} $\text{InfoValue} = 1$. Each suspected agent query incurs a constant leakage cost, reflecting the COI Erosion Theorem where more queries drive COI to zero.
    \item \textbf{Revelation surrogate:} $\text{InfoValue} = -\log \pi(p \mid \tau')$. Rare or precise prices reveal more information about the policy's support and are more valuable to a reconnaissance agent.
 \end{enumerate}
 \end{definition}
 \begin{definition}[UX Index]
 The user-experience penalty $\text{UX}(\tau', p) \in [0, 1]$ measures degradation to legitimate human users from defensive pricing actions. A false-positive (human misclassified as agent) incurs a UX penalty proportional to the price distortion applied. The coefficient $\eta > 0$ governs the platform's tolerance for UX degradation relative to revenue protection.
 \end{definition}
 The three-term structure captures a fundamental trilemma in defensive pricing: revenue maximization, leakage prevention, and user-experience preservation. The parameters $\lambda$ and $\eta$ define the platform's operating point on the Pareto frontier between these objectives.
 %% ====================================================================
 %% 5  EVALUATION FRAMEWORK
 %% ====================================================================
 \section{Evaluation Framework}
 We present the evaluation design as a reproducible testbed for pricing systems under agentic contamination.
 \subsection{Platform as Testbed}
 We start from a practical constraint: we do not have access to proprietary production data. Because of that, we design our own fictional platform that still represents how commercial platforms work in the real world. We initially conducted a survey of the leading platforms of airlines and hotel booking sites to identify the specific interface patterns that effectively manage complex travel data. Our web framework defines a highly agnostic boilerplate which can be seeded with any data-modality, which we leverage to define a \texttt{hotel} and \texttt{airline} mode. Both modes are individually deployed via an environment level argument which adjusts the proxy routing with a custom middleware inside next.js to render only the desired mode.
 Since users act with motivations, we define a pool of tasks (jobs to be done) and assign tasks randomly to participants. A representative task is to find the cheapest feasible catalog item under explicit constraints while removing strict financial limits so we avoid trivial optimization behavior. Participants are also randomly assigned to one experimental platform mode (hotel or airline).
 To speak to realism, user interviews reported that the platform architecture mirrored standard booking interfaces and reduced the cognitive load required to learn the system. The dynamic pricing mechanism elicited immediate behavioral adjustments. Participants were sensitive to price volatility: sudden boosts triggered urgency and faster booking attempts, while large listing-to-final discrepancies triggered deeper comparison behavior.
 \subsection{Factorial Experiment Design}
 The simulator has multiple configurable factors, including valuation distributions, demand parametrization, contamination ratio, and policy settings. We therefore design a multi-factor study (current grid: $4\times4\times3\times2\times2$). While this scale is generally expensive for reinforcement learning, we execute it on a large TPU cluster to make the sweep tractable.
 Our training budget spans 384 TPU chips across v4, v5e, and v6e generations, distributed across Europe and U.S. regions with a spot-heavy mix and an on-demand reserve. At peak BF16 throughput this corresponds to roughly 160 PFLOPS of aggregate compute. We allocate v6e capacity to the heaviest policy training, use v5e for broad hyperparameter sweeps, and reserve on-demand v4 quota for runs that should not be preempted \parencite{noauthor_tpu_2026,noauthor_tpu_2025-1,noauthor_tpu_2025}.
 Our process follows three stages: (1)~observe and \textit{vectorize} behavioral interactions, (2)~learn separability to characterize human versus agent patterns, and (3)~use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator.
 Operationally, goals and experiment runs are tracked in PostgreSQL (goal table, run table, and assignment mapping). This data-acquisition phase is intentionally a disconnected component that feeds the later contributions.
 \subsection{Defensive Pricing Loop}
 Algorithm~\ref{alg:phantom_loop_clean} formalizes the end-to-end defensive pricing loop, integrating all pipeline components into the Stackelberg game structure.
 \begin{algorithm}[t]
 \caption{PHANTOM defensive pricing loop}
 \label{alg:phantom_loop_clean}
 \DontPrintSemicolon
 \SetKwInput{Input}{Input}
 \SetKwInput{Output}{Output}
 \Input{catalog size $N$; costs $c$; reference prices $p^{ref}$; behavior models $\bar T_H,\bar T_A$; action weights $\omega$; penalties $\lambda,\eta$; horizon $T$; sessions per step $M$}
 \Output{price/demand trajectory $\{(p_t,\hat Q_t,\hat\alpha_t)\}_{t=0}^{T-1}$}
 Initialize contamination estimate $\hat\alpha \leftarrow 0.2$\;
 \For{$t \leftarrow 0$ \KwTo $T-1$}{
  set $p_t \leftarrow \pi(\cdot)$
  and clip $p_t$ to a feasible range\;
  $\hat Q_t \leftarrow 0$, $\mathcal S_t \leftarrow \emptyset$\; \tcp{Observe sessions}
  \For{$m \leftarrow 1$ \KwTo $M$}{
    sample a session trajectory $\tau_m$ using $\bar T_H$ or $\bar T_A$\;
    $\hat Q_t \leftarrow \hat Q_t + \sum_{k}\omega(a_{m,k})$\;
    $\mathcal S_t \leftarrow \mathcal S_t \cup \{\tau_m\}$\;
  }
  \tcp{Estimate contamination from separability module}
  compute $\hat\alpha \leftarrow \frac{1}{M}\sum_{\tau\in\mathcal S_t} \Big[\sigma\big(\beta(\Delta_H(\tau)-\Delta_A(\tau))\big)\Big]$\;
  compute $J_t \leftarrow \text{Revenue}(p_t,\hat Q_t) - \lambda\cdot \text{COILeak}(\hat\alpha) - \eta\cdot \text{UX}(\hat\alpha)$\;
 }
 \end{algorithm}
 \subsection{System-Level Metrics}
 The characteristics of our evaluation environment can be summarized as:
 \begin{enumerate*}[label=(\roman*)]
 \item non-stationary demand with temporal noise $\epsilon_t$,
 \item contaminated behavioral signals from mixed human-agent traffic with unknown mixing ratio $\alpha$,
 \item partial observability where only demand proxies $\hat{q}$ are available, not true demand $d(\cdot)$,
 \item strategic actors capable of feature manipulation to influence pricing outcomes,
 \item session-based interactions modeled as POMDPs with trajectories $\tau_s$,
 \item low conversion probability for agents: $P(\text{purchase} \mid A) < P(\text{purchase} \mid H)$, and
 \item distributional uncertainty requiring robust optimization within Wasserstein ambiguity sets.
 \end{enumerate*}
 These properties define the evaluation surface: any pricing system deployed in this testbed must handle all seven simultaneously, making it a stress test for compound pipeline robustness rather than isolated component performance.
 %% ====================================================================
 %% 6  DISCUSSION
 %% ====================================================================
 \section{Discussion}
 \subsection{System Tradeoffs and Operational Considerations}
 The three-objective reward structure (Eq.~\ref{eq:robust_policy}) surfaces a fundamental operational tradeoff. Our analysis of the interaction dynamics between the platform and non-human actors suggests that the current static pricing models are insufficient for an agent-mediated economy. If we assume a transition toward a direct revelation mechanism, where actors must reveal their true valuation of a good through bidding dynamics, we inevitably introduce significant stochasticity into the pricing system.
 However, ecommerce commodities differ fundamentally from financial securities: they possess a hard floor defined by unit economics and reservation prices. The platform must establish an initial valuation anchor defined by the marginal cost plus a target margin, around which the market price is permitted to fluctuate. The parameters $\lambda$ and $\eta$ in the reward function are system-level knobs that must be tuned per deployment context: a platform with low false-positive tolerance (luxury retail) will set $\eta \gg \lambda$, while a platform under heavy bot attack (airline flash sales) will invert this.
 \subsection{Component Replaceability}
 The modular architecture is deliberately designed so that each component can be upgraded independently. The GOFAI weak labeling stage can be replaced by a neural classifier without changing the interface to the transition-kernel estimator. The surge-based baseline can be swapped for any policy that reads from Redis. The Wasserstein ambiguity set can be replaced by a KL-ball if absolute continuity can be assumed. This composability is the primary engineering contribution: the theoretical results (COI theorem, Stackelberg formulation) motivate and constrain the system design, but the system itself is the deployable artifact.
 \subsection{Limitations}
 The key stakeholders affected by the threat of increasing agent-driven traffic include online businesses and platform operators (especially in bot-heavy sectors like retail, travel, and financial services), their security, fraud, and engineering teams, end users whose accounts and data are exposed and whose experience degrades, regulators and legal stakeholders responding to breaches and fraud, and the attackers or bot operators driving the automation~\cite{imperva_rapid_2025}. We acknowledge that our testbed operates on synthetic and small-sample data rather than production traffic at scale. We also acknowledge the difficulty in similarly affected fields such as authorship, where Ganie~\cite{ganie_uncertainty_2025} demonstrates the theoretical limits of the distributional divergence between text authored by a human or large language model.
 %% ====================================================================
 %% 7  CONCLUSION
 %% ====================================================================
 \section{Conclusion}
 We have presented PHANTOM, a compound system for defending dynamic pricing pipelines against exploitation by LLM-powered agents. Rather than proposing a standalone pricing algorithm, we contribute a modular end-to-end architecture: ingestion, weak labeling, transition-kernel estimation, contamination-aware robust policy, and model serving, connected by clean interfaces and deployed as containerized microservices.
 We formally defined the Cost of Information and proved that as the saturation of independent agents increases, standard pricing mechanisms become incentive-incompatible---motivating the need for a contamination-aware component in any production pricing pipeline. We demonstrated that human and agent behaviors exhibit distinct transition probability kernels, enabling the construction of discriminative models based on Kullback-Leibler divergence. These behavioral signals serve as inputs for a Distributionally Robust Reinforcement Learning policy that formulates the pricing problem as a Stackelberg game, optimizing against an ambiguity set of demand distributions defined by the Wasserstein distance.
 The system architecture, interaction schema, configurable e-commerce testbed, and factorial experiment harness are designed for reproducibility and released as open artifacts. This is a very generic end-to-end mechanism which is applicable to a variety of different e-commerce tasks. We intentionally put emphasis on the development of this infrastructure to establish a reproducible framework for interaction and to minimize any noise.
 Future work includes full factorial evaluation of the DR-RL policy across contamination levels, online adaptation of the ambiguity radius $\epsilon$ as a function of live divergence estimates, extension to multi-agent market maker settings, and integration of the HAP protocol~\cite{dhir_http_2025} as an additional signal source for the separability module.
 %% ====================================================================
 %% REFERENCES
 %% ====================================================================
 \bibliography{references}
 \end{document}
--- a/paper/src/mirrors/genpop/01-intro.tex
+++ b/paper/src/mirrors/genpop/01-intro.tex
@@ -0,0 +1,62 @@
 %% General public mirror of 01-intro.tex
 \section{Introduction}
 In this paper we present an exploration and defense against the presence of new commercial entities in digitally powered platforms, preserving market equilibrium in the age of AI. This research establishes the following contributions: definition and formalization of non-human transactors in e-commerce platforms, development of a testing-ground for capturing the behavioral essence of these transactors across a large variety of digital systems, construction of a discriminative model (to prove separability) as a strong learner for downstream mitigation of contamination by non-human entities, translation of such learned separability into existing dynamic pricing machine learning loops, and finally establishment of a high-level KPI-affecting causal effect and cost-saving framework for the future of internet commerce in the presence of such non-human learners.
 This research effort touches a large variety of domains, spanning behavioral economics for understanding the rationality of behavior as theorized by the concept of homo economicus, agent-based modeling to translate our learned separability into disjoint dynamic pricing systems, reinforcement learning which serves as the SOTA for price-learners, and dynamic pricing and market equilibrium theory to understand the risks of possible supra-competitive pricing phenomena in cases of adversarial pricing systems driving the market out of equilibrium. \footnote{Given the rapid evolution of the field we acknowledge all developments with a cutoff set at the date of March 1st 2026.}
 \subsection{Motivation and Market Context}
 The current innovation boom in generative artificial intelligence and its applications to knowledge-based work tasks has brought many competing technologies for browser-use automation, with benchmarks and evaluations \parencite{xia_evaluation-driven_2025} motivating the development of capabilities focused on commercial research, understanding, and transaction execution \parencite{xie_osworld_2024}. The ``AI Agent'' market is forecasted to grow from around USD 5-8 billion in 2025 to USD 42-52 billion by 2030. This surge reflects adoption in e-commerce, customer service, and enterprise automation, where agents handle interactions previously done by humans, raising the question of how these systems should be designed for future robustness as well as how to maintain a competitive edge in the analytical components of e-commerce platforms \parencite{markntel_advisors_global_2025}.
 The key stakeholders affected by the threat of increasing agent-driven traffic include online businesses and platform operators (especially in bot-heavy sectors like retail, travel, and financial services), their security, fraud, and engineering teams, end users whose accounts and data are exposed and whose experience degrades, regulators and legal stakeholders responding to breaches and fraud, and the attackers or bot operators driving the automation \parencite{imperva_rapid_2025}.
 The industry has already seen legal action in cases like Amazon against Perplexity \parencite{ghaffary_amazon_2025}, stemming from the difficulty of identifying traffic from hybrid systems like the Comet browser. This paper explores such systems to better understand what the interaction data looks like and what it means for dynamic pricing and recommendation systems downstream. This observed impact indicates a need for prevention of secondary negative effects on the ``legacy'' systems which power modern revenue sources for many companies. Dynamic pricing algorithms rely on directly translating demand features (which we call demand signal q) to new price assignments (which we call estimated price) across a catalogue of products of size N. This opens opportunities to design a clean slate of digital market mechanisms that will shape the future of commerce in the age of artificial intelligence.
 \subsection{Solution Space Overview}
 Dynamic pricing systems, as presented by \textcite{mueller_low-rank_2019}, often deal with sparse low-rank data of demand signals which, combined with contamination from agents, creates complex interactions that impact pricing. To further complicate the problem, certain commercial settings such as the one presented by \textcite{amjad_censored_2017} must address the true demand of products under censored observations. This provides a formulation for handling demand in our case with multiple kinds of commercial mediators: we estimate total demand as the combination of agent-generated demand and true human demand, where agent demand and human demand represent two distinct populations with divergent objective functions.
 We formally define interaction data as coming from some actor which can either be an agent (A) or human (H). For purposes of this research, an agent is an algorithmic loop with the ability to access a web platform and perform actions such as clicks, scrolls, and input field fills. The loop terminates when the internal large language model judges the provided task definition as complete. A detailed breakdown can be found in the Agent Interaction Loop algorithm.
 \subsection{Research Questions}
 This dissertation is organized around one main research question and three supporting sub-questions:
 \begin{enumerate}
    \item[\textbf{Main RQ}] How can dynamic pricing systems preserve margin integrity when transaction orchestration is increasingly mediated by non-human agents?
    \item[\textbf{SQ1}] \textit{Separability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting?
    \item[\textbf{SQ2}] \textit{Theoretical Impact}: What is the formal relationship between agent contamination levels and the erosion of pricing power in dynamic pricing systems?
    \item[\textbf{SQ3}] \textit{Robust Mitigation}: How can pricing policies be constructed to maintain margin integrity under unknown and non-stationary levels of agent contamination?
 \end{enumerate}
 \begin{algorithm}[t]
 \DontPrintSemicolon
 \SetKwInput{Input}{Input}
 \SetKwInput{Output}{Output}
 \Input{Goal G, Platform URL u, LLM (large language model)}
 \Output{Task completion result r}
 Initialize browser instance with connection to platform URL\;
 Construct prompt from goal and URL\;
 Set done status to False\;
 \While{task is not done}{
    Observe current page state from browser\;
    Ask the language model what action to take next (click, scroll, fill, or navigate) based on the current state\;
    Execute that action in the browser to transition to the next state\;
    Ask the language model whether the goal has been achieved in this new state\;
 }
 Extract final result from terminal state\;
 \Return{result}\;
 \caption{AI Agent's Interaction Loop}
 \label{algagent-loop}
 \end{algorithm}
 The previously described goal of separability allows us to formulate a task which entails taking raw interaction data for either actor and creating a composite demand estimate. We propose a robust optimization objective defined in our methodology, transforming the pricing problem into a form of Distributionally Robust Optimization \parencite{kuhn_distributionally_2025} where the learner must guard against adversarial contamination in observed demand distributions. In this setting we must learn to make decisions that perform well without assuming a single estimated probability distribution, working instead with an ambiguity set of candidate distributions about which we have limited information. In our case, as stated, the demand is a mixture of distributions with a mixing parameter which is unknown and non-stationary.
--- a/paper/src/mirrors/genpop/02-literature-review.tex
+++ b/paper/src/mirrors/genpop/02-literature-review.tex
@@ -0,0 +1,59 @@
 \section{Literature Review}
 To better understand all facets of the existing work, we must start by exploring the nature of agents, agentic computer use and web automation, complementing that with economic reasoning and strategic interaction. The final surface to cover leads us to data-driven dynamic pricing under uncertainty. The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on. The former case of agents shopping is currently pending legal action in the case of \textcite{noauthor_amazoncom_2026}, which is currently being treated as a violation of the Computer Fraud and Abuse Act. The introduction of these mediating actor entities into economic systems is further creating a threat of false-name bidding \parencite{yokoo_effect_2004}, which prior research has explored in a trading context. Other research on pseudonyms in dynamic systems demonstrates whitewashing in AI agents which can ignore defensive mechanisms by re-entry with different identities \parencite{feldman_free-riding_2004}. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for separating non-human reconnaissance from genuine human demand expression and integrating that separation into pricing heuristics without degrading legitimate user experience (in our research tracked by the user-experience index). This gap is what our contribution aims to address, particularly for the aforementioned stakeholder groups.
 \subsection{Agent Taxonomy and Definitions}
 An agent in the context of artificial intelligence is generally defined by anything that can reason and act upon observations of its environments (collected through some sensory inputs) and carry out actions through effectors. Moreover, a rational agent is an entity that is capable of perceiving the world around them and taking actions to advance specified goals. This definition by \textcite{russell_artificial_2021} is further developed in an economic context by \textcite{parkes_economic_2015}, suggesting AI research attempts to construct a synthetic homo economicus (rational economic human), which may also be termed machina economicus (rational economic machine).
 A specific class or taxon of this machina economicus, the Large Language Model (LLM) agent, is defined as an autonomous system capable of achieving goals and adapting post-training, often without needing explicit code or fundamental model changes \parencite{xia_evaluation-driven_2025}.
 We must however acknowledge the current SOTA: the OSWORLD simulations by \textcite{xie_osworld_2024} have demonstrated that, on multi-modal tasks across desktop and web interaction modes, the top-performing agent reaches only 12.24\% success, whereas humans have a higher 72\% success rate; this is linked to the lack of grounding of these agents and their inability to handle unexpected errors. This weakness matters for this research because it clarifies the near-term threat model: practical exploitation does not require a fully competent ``computer assistant'', only enough automation to perform high-volume reconnaissance actions (search/filter/open product pages, probe availability/price boundaries) that can contaminate behavioral signals. With the expected growth of these capabilities, this threat only becomes more perilous to revenue management systems.
 We model an agent session as producing events with lower in-session conversion levels relative to humans. We state this as the assumption that the probability of purchase given an agent is less than the probability of purchase given a human, but with a potentially higher volatility in estimated demand, which we observe through the look-to-book metrics in our simulation.
 \subsection{Economic Agents: From Homo Economicus to Machina Economicus}
 Existing behavioral economic models tend to be criticized for the assumption of rational behavior, as is embodied in the term of homo economicus. The definition of a machina economicus by \textcite{parkes_economic_2015} is quite appropriate for our case, particularly because these assumptions of rationality have been argued to be a very adequate reference for AI research by \textcite{varian_economic_1995} due to its expected utility maximizing nature. For modeling this behavior, the trajectories of these agents can be formally defined to be partially observable Markov decision processes \parencite{xie_osworld_2024}. Agents are however not to be confused with web-bots which have previously been known as automated software applications or scrapers which are set with a purpose of carrying out specific tasks on the internet, without a higher level of internal judgement \parencite{imperva_rapid_2025}. In our research, we refer to this actor simply as an Agent belonging to the distribution A.
 This economic framing also helps separate two related but distinct phenomena: agents as buyers (changing market demand composition) and agents as information gatherers (changing the observed interactions used by pricing/recommendation systems). The thesis focuses on the second, where information acquisition strategically precedes purchase execution. We do not however dismiss the proposed expectation that existing economic systems serving humans will be populated by AIs across multiple channels and with various possibly misaligned goals, as stated by \textcite{parkes_economic_2015}.
 A HAP (HTTP Agent Profile) protocol has been developed as an internet draft by \textcite{dhir_http_2025} in an effort to separate agentic and human internet traffic, however the majority adoption by both the sellers and agent providers would be required for the implementation of such a solution.
 \subsection{Problem Evidence and Market Impact}
 The statistical issue of contamination in dynamic pricing systems that observe demand features as a means to update prices has been documented in various previous contexts. The airline industry (which has accounted for 24\% of observed disruptions) has seen malicious activity with a measurable impact on skewing key performance indicators by behavior visible in the look-to-book metrics. Excessive reconnaissance traffic inflates search volume without corresponding completed bookings, thereby skewing demand forecasts and disrupting dynamic pricing models. Demand proxies have also been observed to pose a significant threat to inventory management by creating artificial scarcity that distorts the demand-supply relationships in the enterprise model. Censored demand as shown by \textcite{amjad_censored_2017} can also be observed in low-bias demand under-estimation caused by a distortion effect coming from non-human traffic data \parencite{imperva_rapid_2025}.
 When dynamic pricing algorithms operate on highly contaminated or noisy data, the risk of creating inaccurate price inferences grows significantly. The emergent mitigation driven by un-informed reward and regret signals might lead to price suppression for sales continuity, which harms margins and results in revenue loss. Systems that poorly fit undesired behavior might result in price gouging, which calls for strong guardrails while preserving targeted business strategy \parencite{mullapudi_reinforcement_2025}.
 \subsection{Theoretical Foundations: Economic Parallels}
 Early hints of price exploration appear in the standard English auction analyzed by \textcite{varian_economic_1995}, in which prices are explored in a sequential manner, leading to a marginally different cost to the bidder than the reservation price of the seller. This is a setting in which there is no cost incurred by the buyer for their actions or exploring prices in the market. They propose that any agent responsible for the pricing of a good must be immune to dynamic strategies which might extract private information from a market. A key take-away which relates to the Vickrey auction mechanism (also called a direct mechanism) suggests that not only would defenses against such exploitation be necessary, but also the construction of a mechanism in which revelation of the true willingness to pay is the dominant strategy for commerce.
 Like in classical revenue-maximizing auctions \parencite{roughgarden_cs364a_2013} we assume that the human actor in our system has a private valuation v which we formally draw from intrinsically defined distributions. The important note here is that the agent proxy does not have a mechanism to convey this private information into the demand data which directly impacts the pricing systems.
 The key component of this mediation between agents and commercial platforms lies in the transaction costs related to information gathering and negotiation. As proposed by \textcite{shahidi_coasean_2025}, these costs are bound to collapse towards zero (which we demonstrate mathematically), calling for a re-evaluation of the boundaries between firms and markets. As argued by \textcite{coase_nature_1937}, market participation, and the time associated with that participation, is a critical part of the Coasean transaction cost logic, which includes the discovery of relevant pricing within a given market. This process of price discovery without the presence of AI Agents can be time consuming and resource intensive. To build on top of this work we provide a proof of optimal conditions theorised by Coase as an extension to AI-mediated markets.
 \subsection{Landscape of Existing Work}
 Explorations of algorithmic collusion by LLMs \parencite{fish_algorithmic_2025} have demonstrated a cross-model tendency of market division with a strong sensitivity to instructions provided in the ``system prompt''. If a dynamic pricing algorithm which is trained to respond to market signals learns to coordinate with competitor agents (or becomes manipulated by those agents), the market equilibrium is under threat of destabilization. This is particularly true for Q-learning pricing learners as demonstrated by \textcite{calvano_artificial_2018}.
 Our effort to combat contamination stems from research by \textcite{hardt_strategic_2015} on strategic classification, in conjunction with \textcite{liu_contextual_2024} who demonstrate a linear regret if contamination is ignored. The strategic classification adversarial effect comes from an effort to manipulate some representative features used in a learning pipeline, which can result in lower prices on loans or lower prices from dynamic pricing algorithms.
 To bridge the gap between detection and robust pricing, we look at work in Distributionally Robust Optimization (DRO). As defined by \textcite{kuhn_wasserstein_2024}, DRO provides a framework for decision-making under ambiguity, where the true data distribution is unknown but lies within a ``Wasserstein ball'' of a target distribution. In our context, the ``ambiguity set'' represents the uncertainty introduced by agentic reconnaissance. By optimizing for the worst-case distribution within this set, pricing mechanisms can become resilient to the distributional shifts such as the ones caused by non-human actors, effectively robustifying the revenue function against the contamination described in our problem statement.
 In order to create an environment in which prices can be tested against a demand estimate generated by some behavioral model, we take inspiration from the architecture proposed by \textcite{ie_recsim_2019} in the RecSim platform built for recommendation systems. By modeling the distinct user behavior as POMDPs we can generate faithful interactions which allow us to generalize, past the constraint which is also present in recommendation systems, of rarely having enough experience with an individual actor's interactions for good recommendations without generalization. The key inspiration comes from the user choice modeling which we translate to a user transition model for each distinct actor type (agent or human). We further consider the possibility of modeling our quantitative research platform using dynamic Bayesian networks for the sake of tractability within the system. The contribution of RecSim enables researchers to better understand learning algorithms in fixed environments, a gap we identify as needing to be bridged within the space of dynamic pricing.
 We also acknowledge the difficulty in similarly affected fields such as authorship, where \textcite{ganie_uncertainty_2025} demonstrate the theoretical limits of the distributional divergence between text authored by a human or large language model. Their approach of computing the divergence between two distributions demonstrates purely theoretically that no classifier can outperform random guessing on their particular task. This is yet another factor to take into consideration when exploring the potential mitigation strategies.
 The setting of our work is quite complex and covers a wide range of topics, each with its own set of issues that further complicate the task at hand. There is however promise in the field of reinforcement learning and adversarial robustness to combat these problems. We can summarize the characteristics learned from the review of our environment as:
 \begin{enumerate*}[label=(\roman*)]
 \item non-stationary demand with temporal noise,
 \item contaminated behavioral signals from mixed human-agent traffic with unknown mixing ratio,
 \item partial observability where only demand proxies are available, not true demand,
 \item strategic actors capable of feature manipulation to influence pricing outcomes,
 \item information asymmetry with private valuations drawn from unknown distributions,
 \item session-based interactions modeled as POMDPs with trajectories,
 \item low conversion probability for agents compared to humans,
 \item distributional uncertainty requiring robust optimization within Wasserstein ambiguity sets, and
 \item potential for adversarial exploitation through false-name bidding and identity whitewashing.
 \end{enumerate*}
--- a/paper/src/mirrors/genpop/03-methodology.tex
+++ b/paper/src/mirrors/genpop/03-methodology.tex
@@ -0,0 +1,338 @@
 \section{Methodology}
 This section details the theoretical and practical framework developed to address dynamic pricing under the influence of non-human actors. We begin by formalizing the problem environment and the nature of the actors. We then derive the \textit{Cost of Information} (COI) theorem, proving the erosion of pricing power in the limit of agent saturation. Following this, we outline our generative contamination strategy using GOFAI-driven separability and transition probability learning. Finally, we formulate the robust control problem as a Stackelberg game solved via Distributionally Robust Reinforcement Learning (DR-RL) with constructed ambiguity sets.
 \subsection{Problem Formalization}
 We define a commercial environment where the platform interacts with a stream of sessions. Each session belongs to the set of all sessions. Each session is generated by an actor belonging to a latent class, either Human (H) or Agent (A).
 Each session produces a trajectory of observable events. An event is a tuple containing:
 the action taken (e.g., view item, add to cart),
 the target item index,
 and the continuous timestamp.
 The platform does not directly observe the true underlying demand function. Instead, it observes a behavioral proxy, which is a composite signal derived from the mixture of actor types. We define the demand proxy for product i at epoch t as a weighted aggregation of events: for each session in a time period, we sum up all the events where a specific product was interacted with, and we weight those events by how strong a signal they provide about willingness to pay. For example, adding an item to a cart is a stronger signal than just viewing it.
 In the current engine implementation, we use the normalized variant of this proxy for each step: we scale the raw demand signal for each product to a percentage out of 100, distributing it proportionally across all products. This keeps the signal dense and directly usable in the simulator. The weights follow a fixed category-level ordering: cart actions have the highest weight, then dwell actions, then navigation, then filtering.
 \subsubsection{Actor Types and Demand Curves}
 We formalize the heterogeneity of actors by introducing a type space. An actor of class H or A is further parameterized by a type that determines their demand response function. This type is sampled from a distribution of possible demand curves. The total observed demand is a stochastic process governed by the mixture:
 Total observed demand equals a combination of human demand (weighted by one minus the contamination ratio) and agent demand (weighted by the contamination ratio), plus some temporal market noise. The contamination parameter represents the proportion of agents in the system and ranges from 0 to 1.
 \subsection{Cost of Information (COI) Framework}
 The platform's pricing power comes from information asymmetry: users who express strong interest signals pay more than the base price. We quantify this markup as the \textit{Cost of Information} (COI), which represents the average premium extracted above marginal cost. COI measures the revenue at risk when information asymmetry collapses.
 A top-level view in the current AI discourse is that sufficiently large productivity gains can induce vertical deflation through cost compression and supply expansion \parencite{rachitsky_marc_2026}. Our contribution is narrower and mechanism-level: even under long-run deflation, platform revenue still depends on short-run information costs to the user. We formalize that rent as the Cost of Information (COI) and study how agentic reconnaissance accelerates its erosion.
 \textbf{Definition: Cost of Information.} The COI is defined as the difference between the expected price charged by the pricing policy and the minimum viable price (marginal cost). In other words, COI measures how much extra revenue the platform extracts on average by observing user behavior, beyond what it would get if everyone paid the rock-bottom price.
 \begin{figure}[ht]
    \centering
    \begin{tikzpicture}[scale=1.2]
        % Define the Gaussian function: centered at 2
        \def\bellcurve(#1){1.5 * exp(-0.5*((#1-2)/0.6)^2)}
        % Draw the main axis
        \draw[->, thick] (0, 0) -- (4.5, 0) node[right] {price};
        \draw[->, thick] (0, 0) -- (0, 2) node[above] {Density};
        \draw[thick, smooth, samples=100] plot[domain=0:4] (\x, {\bellcurve(\x)});
        \node at (3.2, 1.2) {price distribution};
        % Define minimum price and average price
        \def\pmin{0.8}
        \def\mean{2}
        % Vertical lines
        \draw[dashed] (\pmin, 0) -- (\pmin, 2.0);
        \draw[dashed] (\mean, 0) -- (\mean, 2.0);
        % Labels on axis
        \node[below] at (\pmin, 0) {min price};
        \node[below] at (\mean, 0) {avg price};
        \draw[<->, thick, red] (\pmin, 2.0) -- (\mean, 2.0) node[midway, above] {COI};
    \end{tikzpicture}
    \caption{Illustration of the Cost of Information (COI). The COI is defined as the difference between the expected price realized by the policy and the minimum viable price.}
    \label{fig:coi_illustration}
 \end{figure}
 We now formally demonstrate that standard dynamic pricing mechanisms are not incentive-compatible with high-frequency agentic traffic. As the number of independent competitive agents querying the system grows, the platform's ability to sustain a COI vanishes.
 A fundamental assumption for our claim lies in the alignment of the AI agent through its prompt which has been demonstrated by \textcite{fish_algorithmic_2025} to cause strong collusive behavior under linguistic nudges. This assumption can be generalized to the human user asking the agent to research products with a minimizing objective.
 \textbf{Theorem: COI Erosion in the Limit.} Let N be the number of independent, utility-maximizing agents querying the platform. Let the minimum price be the lowest price offered to these agents. As N grows toward infinity, the Cost of Information converges to 0.
 \textbf{Proof sketch.} Consider N independent agents querying the platform, each receiving a price sample drawn from the pricing policy's distribution bounded by a minimum and maximum price. A strategic agent conducting reconnaissance will select the minimum observed price.
 The probability that the minimum price exceeds some threshold equals the probability that all sampled prices exceed that threshold. This can be written as a product: since the samples are independent, the chance that all N prices are above the threshold equals the chance that one price is above it, raised to the power N.
 For any price above the minimum, there is always some positive probability of seeing a lower price. So the probability that one sample exceeds the threshold is less than 1. When we raise a number less than 1 to higher and higher powers (as N grows), it decays exponentially toward zero.
 The expected minimum observed price can be written as the floor price plus an integral that captures the tail probability. As N grows, this tail probability vanishes for all prices above the floor, so the integral converges to zero. Therefore, as the number of agents increases, the expected minimum observed price approaches the floor price, and the Cost of Information (the difference between the expected price paid and the floor price) vanishes.
 This result proves that standard pricing policies fail to extract surplus in the presence of large-scale agentic search, necessitating a robust counter-mechanism.
 \subsection{System Architecture: Hybrid Kappa-Lambda Architecture}
 In order for our research to have grounding in interactions we built a robust e-commerce web-platform. We initially conducted a survey of the leading platforms of airlines and hotel booking sites to identify the specific interface patterns that effectively manage complex travel data. Our analysis revealed a clear industry standard: while both sectors rely on tabbed service selection and left-sidebar filtering to streamline navigation, they diverge in result presentation: airlines utilize visual date-price bars and multi-step wizards to optimize for logistical transparency, whereas hotel platforms leverage image-led cards and scarcity triggers to drive emotional engagement and urgency. Our web framework defines a highly agnostic boilerplate which can be seeded with any data-modality with an easy-to-tailor pattern, which we leverage to define a hotel and airline mode. Both modes are then individually deployed via an environment level argument which adjusts the proxy routing with a custom middleware inside next.js to render only the desired mode. The purpose of this was to create a baseline adaptable to any use-case or desired commercial application.
 The architecture of this platform begins with the deployed web-apps posting interaction data to our backend, which processes it and stores each ingested interaction into a Kafka cluster. This serves as our data reservoir, tracking and associating each interaction with its session and, importantly, with the experiment it belongs to. Not only do we track the behavioral interactions, but our pricing provider micro-service, once called by the frontend, also reports the observed/queried price-product pair into Kafka. This Kafka cluster is subscribed to by our pipeline, which is configured on a schedule in Airflow, with the possibility of manual trigger. The final stage of the pricing pipeline submits computed dynamic pricing results into a Redis database for quick updates, which is then read by the pricing provider and displayed on the webapp. This is a very generic end-to-end mechanism which is applicable to a variety of different e-commerce tasks. We intentionally put emphasis on the development of this infrastructure to establish a reproducible framework for interaction and to minimize any noise.
 \paragraph{Public Web Artifact} We transition the Kappa-like architecture of the data collection to a Lambda architecture for actual learning in a surrogate environment. This allows us to move faster on data which is provided and helps us create a feedback loop for production deployment. To support further research in this intersection of fields we release P4P \footnote{\url{https://github.com/velocitatem/p4p}} as a public repository providing the interaction layer of the PHANTOM framework. This provides a configurable storefront which can be tailored to any commercial setting with standardized session-level event tracking. We document the API adapters, that is, the schemas the framework expects for pricing providers and log ingestion services. The repository is intended for controlled experimentation and method replication rather than production commerce deployment.
 \subsubsection{DevOps Principles}
 Reproducible results are key to quality research platforms, and we keep this in mind when deploying and working with our research platform. From a deployment standpoint the platform can be deployed across a large variety of providers and can be run locally. When developing a new interaction modality apart from the ones that come out of the box, a simple template pattern can be followed. The middleware of the framework is designed to properly render the chosen modality from environment variables, so deployment of different or parallel versions of the software can be easily parametrized.
 \subsubsection{Online Dynamic Pricing}
 In order to collect data from actors under correct conditions we replicate a naive and simple dynamic pricing algorithm which runs in the background during the experiments.
 The dynamic pricing is handled by a pipeline which computes a demand estimate on a per-product basis over a specific window of the data, defined by the period T, which by default is 5 minutes. This dynamic pricing pipeline computes a demand estimate vector by a weighted sum of interactions for each product; it additionally computes a price elasticity vector with the same dimensions as our demand vector. The final feature matrix contains two columns for each product: demand and elasticity.
 The transformation that governs this dynamic pricing is a very simple surge-based pricing: for each product, if the estimated demand is high enough (above a surge threshold), we multiply the base price by a surge factor (typically 1.2). If demand is low enough (below a discount threshold), we multiply by a discount factor (typically 0.9). Otherwise, we keep the base price unchanged.
 This piecewise function enables rapid price adjustment in response to observed demand without requiring complex elasticity estimation or historical calibration, allowing us to expose actors within our experiments to a system with a dynamic component of pricing.
 \subsection{Experimental Design}
 We start from a practical constraint: we do not have access to proprietary production data. Because of that, we design our own fictional platform that still represents how commercial platforms work in the real world. The design comes from a survey of hotel and airline websites, where we extracted common interface components and used them as a high-level template for dynamic pricing environments.
 The interface is organized as a product catalog where each product belongs to a time-bounded price vector (for example, a daily pricing period). During each period we collect interaction data by instrumenting UI components and predefined action templates that are still customizable. This gives us control without losing realism.
 Since users act with motivations, we define a pool of tasks (jobs to be done) and assign tasks randomly to participants. The task pool is stored as a structured table with fields for task ID, creation timestamp, task name, description, and definition of done. We formulate the tasks as compact jobs-to-be-done rather than as strict click scripts, because the target is to elicit realistic browsing and comparison behavior which can capture nuance of different people. In hotel mode the assigned tasks include \textit{Cheapest Room}, \textit{Cheapest Room w/ View}, \textit{MultiStep Cheapest Room}, \textit{The Digital Nomad (Executive)}, and \textit{The 3-Way Tradeoff (Desk + Quiet + Flexible)}. These prompts deliberately require critical thought in search, inspection of room details, comparison of amenities or images, return visits to the listing page, and a final booking decision which create a degree of cognitive load. In airline mode we use \textit{Last-Minute One-Way Flight}, where the actor must urgently travel to LAX from either SEA or JFK within the next 1--3 days, inspect at least a small set of candidate itineraries, and then book a reasonable earliest departure.
 A representative task is to find the cheapest feasible catalog item under explicit constraints while removing strict financial limits so we avoid trivial optimization behavior. Participants are also randomly assigned to one experimental platform mode (hotel or airline). Once assigned, they are dropped into the experiment with an actor ID. Under each experiment ID, we can observe multiple sessions across time and gather long interaction traces for the same actor.
 The human data collection involved 18 participants, all of whom provided explicit informed consent prior to their session. Participants had an average age of 21 years and were recruited from a university population. Alongside the 18 human sessions we ran 18 agent sessions of equivalent task scope, giving a balanced dataset of 36 labeled trajectories. Each participant was assigned a single platform mode and a single task drawn from the pool, and completed the session independently without guidance on navigation or pricing strategy.
 To evaluate quality and realism of the setup, we store both structured event logs and full interaction transcripts. This lets us combine quantitative analysis with transcript-level qualitative findings. The result is an isolated system where we can control the interaction process while preserving realistic behavior.
 Operationally, goals and experiment runs are tracked in PostgreSQL. This data-acquisition phase is the first half of the methodology and is intentionally a disconnected component that feeds the later contributions. The second half uses collected behavioral traces to separate classes (agent vs human) with session-conditioned probability estimates, then injects those estimates into the pricing learner.
 Our process follows three stages: (1) observe and vectorize behavioral interactions, (2) learn separability to characterize human versus agent patterns, and (3) use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator.
 \begin{figure}[ht]
  \resizebox{\columnwidth}{!}{%
    \input{chapters/loop_figure.tex}
  }
  \caption{Overview of the Dynamic Pricing Tasks.}
 \end{figure}
 Our web platform (developed in similar spirit to RecSim \parencite{ie_recsim_2019}) gives us a controlled environment where tasks are assigned to human and agentic actors and then executed. Each actor receives a browser-level experiment identifier that may persist across multiple session IDs. We then group by experiment and extract session trajectories using the schema below.
 To speak to realism, user interviews reported that the platform architecture mirrored standard booking interfaces and reduced the cognitive load required to learn the system. One participant described the flow as ``intuitive'' and close to a ``normal'' transaction, suggesting observed behavior was primarily driven by pricing treatment rather than interface novelty.
 The dynamic pricing mechanism elicited immediate behavioral adjustments. Participants were sensitive to price volatility: sudden boosts triggered urgency and faster booking attempts, while large listing-to-final discrepancies triggered deeper comparison behavior. This is comforting because the controlled setup still produces commercially relevant interaction data.
 \subsubsection{Design of Training Factorial Study}
 The simulator has multiple configurable factors. We design a multi-factor study across five axes derived from the sweep configurations: (1) RL algorithm (PPO, A2C, DQN, Q-table; 4 levels), (2) contamination ratio sampled at four representative levels between 0.1 and 0.6, (3) robustness radius (3 levels), (4) COI penalty weight at two reference levels, and (5) pricing action granularity (two discretization settings for action levels); giving a grid of 192 configurations. Statistical power for the behavioral comparisons is determined by a two-sample test over per-session divergence scores.
 While this scale is generally expensive for reinforcement learning, we execute it on a large TPU cluster to make the sweep tractable.
 Our training budget is provisioned through TPU Research Cloud and spans 320 chips across TPU v4, v5e, and v6e generations, with a spot-heavy allocation plus an on-demand reserve. At peak throughput this corresponds to approximately 160 PFLOPS (petaflops, a measure of computational power), which makes repeated seeds, ablations, and sensitivity sweeps feasible within practical wall-clock limits. We allocate v6e capacity to the highest-intensity policy training jobs, use v5e for wider hyperparameter exploration where throughput-per-dollar is favorable, and reserve on-demand v4 capacity for runs that should not be interrupted.
 \begin{table}[ht]
 \centering
 \caption{Compact comparison of TPU generations used in the training stack.}
 \label{tab:tpu_specs}
 \begin{tabular}{@{}llll@{}}
 \toprule
 \textbf{Feature} & \textbf{TPU v4} & \textbf{TPU v5e} & \textbf{TPU v6e (Trillium)} \\
 \midrule
 Peak BF16 per chip (TFLOPS) & 275 & 197 & 918 \\
 HBM capacity per chip (GB) & 32 & 16 & 32 \\
 HBM bandwidth per chip (GB/s) & 1200 & 819 & 1600 \\
 TensorCores per chip & 2 & 1 & 1 \\
 Interconnect topology & 3D mesh/torus & 2D torus & 2D torus \\
 Max pod size (chips) & 4096 & 256 & 256 \\
 \bottomrule
 \end{tabular}
 \end{table}
 \begin{table}[ht]
 \centering
 \caption{TPU allocation used for the factorial study.}
 \label{tab:tpu_allocation}
 \begin{tabular}{@{}llll@{}}
 \toprule
 \textbf{TPU Type} & \textbf{Total Chips} & \textbf{Zone(s)} & \textbf{Provisioning} \\
 \midrule
 v6e & 128 (64 + 64) & europe-west4-a, us-east1-d & Spot \\
 v5e & 128 (64 + 64) & us-central1-a, europe-west4-b & Spot \\
 v4 & 64 (32 + 32) & us-central2-b & 32 Spot + 32 On-demand \\
 \bottomrule
 \end{tabular}
 \end{table}
 For connections from Madrid, we prioritize the europe-west4 allocation for latency-sensitive runs with the benefit of having the most grouped chips within a single region. This regional grouping is important for the deployment of our Kubernetes cluster which cannot span multiple regions. All sweep metadata, model checkpoints, and reward traces are logged in Weights \& Biases. Hardware specifications are from the official Google Cloud TPU documentation \parencite{noauthor_tpu_2026,noauthor_tpu_2025-1,noauthor_tpu_2025}.
 Design of training processes: we build the Docker image with layer caching in mind; to speed up Docker rebuilds as much as possible, we place the most volatile steps towards the end of the image build. What this means in practice is that dependency installations are isolated, so edits to source code do not trigger rebuilds of the dependency layers. Only if we update the entry point of a training sweep will Docker also rebuild the source-code copy stage.
 Due to the preemptible nature of spot capacity under the current demand for TPU chips, we settle for running on our on-demand allocation as the primary source of compute. The on-demand TPU pod of 32 chips spread across 4 virtual hosts creates a relatively unique parallelization setup. Despite our desire to use a traditional approach of clustering and perhaps deploying SLURM jobs of our sweep agent, the lack of predictability in provisioning each instance of a compute resource makes this a high-friction layer we do not want to add.
 \subsubsection{Interaction Schema}
 We extend the basic event tuple to capture the full observational signal available to the platform. An interaction event is defined as the extended tuple containing: action, item index, timestamp, metadata record, and dwell time in milliseconds.
 The metadata record contains action-specific context (e.g., price observed, filter parameters, element text). For product views, metadata contains the observed price and product attributes. For dwell events, metadata includes the element text and accumulated hover duration.
 A session is itself a structured record containing: session ID (UUID), optional experiment link, session start timestamp, platform mode (hotel or airline), user-agent string, and the trajectory of events.
 The action space is partitioned into four semantic categories based on the behavioral signal each action conveys:
 \begin{table}[ht]
 \centering
 \caption{Action space partition with signal interpretation.}
 \label{tab:action_space}
 \begin{tabular}{@{}llll@{}}
 \toprule
 \textbf{Category} & \textbf{Actions} & \textbf{Signal} & \textbf{Weight} \\
 \midrule
 Cart & add item, remove, checkout, purchase & Purchase intent & High \\
 Dwell & hover title, hover paragraph, hover link & Sustained attention & Medium \\
 Navigation & page view, view item, learn more & Discovery & Low \\
 Filter & search, filter date, filter price, sort & Preference refinement & Lowest \\
 \bottomrule
 \end{tabular}
 \end{table}
 This partition enables the weight function to assign category-specific signal strengths, with cart actions having the highest weight, followed by dwell, navigation, and filter in decreasing order of commitment.
 In the simulator baseline this order is encoded with a compact fixed scale: cart equals 4.0, dwell equals 2.0, navigation equals 1.0, filter equals 0.5. Unknown actions are mapped by prefix heuristics to the nearest category.
 In addition to behavioral events, the platform logs price observations to a separate Kafka topic. Each price query generates a record associating the product, displayed price, requesting session, platform mode, and timestamp. This dual-stream architecture enables joint analysis of price exposure and behavioral response.
 \subsection{Generative Contamination and Separability}
 To train a robust pricing learner, we need a simulator that can generate realistic interaction data under controlled contamination. We build this from Phantom data using a two-stage approach.
 \subsubsection{Ground-Truth Separability}
 Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels (human or agent) are available as ground truth rather than as the output of a heuristic classifier. We therefore estimate separate transition kernels directly from each labeled partition, treating the resulting human and agent kernels as the ground-truth behavioral profiles for each class. We then ask a direct methodological question: are the kernels separable enough to justify downstream pricing control that depends on that separability?
 To answer this, we compute per-session divergence scores against both class-level centroids. For each session in either partition, we fit a session-level event transition kernel from that session's trajectory alone, then compute its average divergence to the human centroid and to the agent centroid. The per-session separability score is the gap between these two divergences (divergence to the human centroid minus divergence to the agent centroid): a negative value indicates proximity to human behavior, a positive value indicates proximity to agent behavior.
 We cannot assume normal distributions for divergence scores, which are right-skewed and bounded below by zero, so we do not use a Student's t-test. Instead we apply a Mann-Whitney U test \parencite{mann_test_1947} on the per-session gap scores between the two groups. The Mann-Whitney test is a rank-based nonparametric test that compares the ordering of two independent samples without distributional assumptions, making it appropriate for small samples drawn from skewed populations.
 \textbf{Definition: Divergence for Transition Distributions.} Let P and Q be categorical distributions over destination states following an event, derived from human and agent trajectories respectively. The divergence between these distributions measures how different P is from Q by summing over all possible destination states: for each destination, we take the probability under P, multiply by the log of the ratio of P to Q, and sum all these contributions. Large contributions occur when P assigns high probability to states that Q assigns low probability to.
 To obtain this statistic, we aggregate transitions by triggering event and treat normalized outgoing probabilities as categorical distributions. We intersect shared event labels, then accumulate log-ratio contributions over shared destinations. Large contributions identify transitions where one actor class is difficult to mimic.
 With these divergence features we train a contrastive model to estimate a weak agent probability, which we later use as a weighting and control signal.
 \subsubsection{Transition Probability Estimation}
 \label{sec:tpe}
 For both subsets, we model session dynamics as a Markov process and estimate transition kernels. For each actor type we estimate global kernels for humans and agents, then cluster into behavioral sub-kernels to avoid collapsing all behavior into one average profile. Transition probabilities are estimated by maximum likelihood: the probability of transitioning from state s to state s' equals the number of times we observed that transition divided by the total number of times we left state s.
 This allows us to construct a Contamination Generator. Given a clean trajectory dataset, the generator injects synthetic agent trajectories sampled from the agent kernel until the effective mixing ratio reaches the desired contamination level.
 To scale this to catalog-level pricing, we expand the base event transition matrix into product-specific transitions using the current demand condition. In practice, we normalize the demand vector across products and use it to weight how much transition mass each product pair receives. Concretely, each cell of the base matrix becomes a block for N products, so the transition matrix grows substantially. Finally, we add generic states (homepage, login, checkout terminal states), which gives the full kernel size.
 \begin{figure}[ht]
    \centering
    \includegraphics[width=0.8\textwidth]{chapters/mdp_human.pdf}
    \caption{Markov Decision Process visualization illustrating the behavioral transition dynamics for \textbf{human} actions.}
    \label{fig:human_mdp_viz}
 \end{figure}
 \begin{figure}[ht]
    \centering
    \includegraphics[width=0.8\textwidth]{chapters/mdp_agent.pdf}
    \caption{Markov Decision Process visualization illustrating the behavioral transition dynamics for \textbf{agent} behavior profiles. The state space and transition probabilities are learned from observed session trajectories to enable generative contamination.}
    \label{fig:agent_mdp_viz}
  \end{figure}
 \subsection{Distributionally Robust Reinforcement Learning (DR-RL)}
 We formulate pricing as a Stackelberg game: the platform (leader) sets prices, and the population (follower) responds through trajectories and demand. A useful intuition is that the platform behaves like a distorted mirror at a 45-degree angle: what it mirrors is population demand into an estimated demand proxy, and that proxy drives revenue.
 Because contamination level and demand shift are non-stationary online, a simple error term is not enough. We therefore use a Distributionally Robust Optimization objective. For each newly observed trajectory generated by an unknown actor profile, we need a demand mapping conditioned on price and trajectory. For each trajectory, we compute its transition kernel and compare it with controlled baselines for humans and agents.
 We compute two divergence scores: divergence from the human baseline and divergence from the agent baseline. This yields two centroid-like heuristics that act as a session-level agent score in the engine. On a per-customer or use-case basis, a similar study should be done in order to obtain ground truth behavior models for humans and agents and their specific interaction with a given product's website.
 In implementation, we maintain an alternating game-history stack (our Limbo stack) and execute it explicitly every epoch with exactly two transitions: first the platform publishes a price vector (leader move), then the market responds with trajectory-derived demand (follower move).
 \subsubsection{Ambiguity Set Construction}
 We define an ambiguity set centered around our empirical reference distribution (derived from the generator). We utilize a distance metric to define the set of plausible demand distributions the agent might face: the ambiguity set contains all distributions that are statistically close to our observed training data but allows for adversarial shifts.
 For the current engine baseline, we use a compact approximation by applying ambiguity over contamination in a local interval around nominal contamination: we consider all contamination values within a small radius of the target contamination level, and we evaluate a small fixed grid in that interval per step, selecting the worst-case candidate for the learner.
 \subsubsection{Environment Setup for Dynamic Pricing}
 The complete pricing-demand-trajectory loop is illustrated in the oracle flow figure. The Oracle maps historical price and demand state to a new price vector, which is exposed to a distribution of demand curves. Each product generates trajectories weighted by behavioral kernels, producing a full transition matrix over sessions. Sampled trajectories are aggregated through the demand proxy function to yield the next demand vector, which feeds back into the Oracle.
 \begin{figure}[ht]
 \centering
 The oracle takes previous prices and demand, outputs new prices for each product. Each product's price generates demand curves from a distribution. Demand curves are combined with behavioral transition patterns to create a full transition matrix. We sample trajectories from this matrix, aggregate them through the demand proxy function, and get the next demand vector, which feeds back into the oracle.
 \caption{Oracle-based pricing loop: historical price and demand state map to a new price vector; each product samples demand curves; trajectories are generated by mixing demand with behavioral kernels into transition matrix; sampled trajectories aggregate through proxy to yield updated demand, closing the feedback loop.}
 \label{fig:oracle_flow}
 \end{figure}
 \subsubsection{The Min-Max Objective}
 The robust policy is obtained by solving the maximin problem: we want to maximize revenue in the worst-case scenario. Specifically, we choose a pricing policy that maximizes the minimum expected reward across all plausible demand distributions in the ambiguity set. The reward consists of revenue from sales minus a penalty for information leakage.
 In practice, we parameterize this with a session-level leakage term: information leakage equals the agent probability (how likely this session is from an agent) multiplied by the information value of the price quote. For the baseline engine, we use a constant query-tax surrogate: each suspected agent query incurs a fixed leakage cost.
 Another possible extension is to adapt the ambiguity radius online based on observed divergence, so the uncertainty set changes with live data. We keep this as future work and retain a fixed-radius setup.
 \subsubsection{Actor Implementation}
 In our simulation, the follower is implemented as a set of Actors. Each Actor is initialized with a type which samples a specific demand curve from the latent distribution. This formalization ensures that our agent does not overfit to a single deterministic demand function but learns a policy robust to the distributional uncertainty.
 Practical implementation of browser agents is a strongly evolving field with near-weekly releases of state-of-the-art architectures. In this thesis implementation we abstract that layer into trajectory generators learned from observed human/agent transition kernels.
 As part of reward engineering, we keep a UX factor (user experience, ranging from 0 to 1) as an auxiliary evaluation axis. In the current baseline it is not injected into the core reward; it is tracked separately to compare policy trade-offs.
 \begin{figure}[ht]
  \centering
  \resizebox{0.5\columnwidth}{!}{%
    \input{chapters/balance_figure.tex}
  }
  \caption{Introducing the UX index allows us to better distinguish the kind of impact different methods have and allows us to compare them on this Pareto-like scale.}
 \end{figure}
 We also consider taxation-like overlays for agent traffic under strategy-proof mechanism design (e.g., Vickrey-Clarke-Groves style rules). This remains an extension path and is not part of the main implementation in this thesis.
 \subsubsection{Pricing Mechanism Summary}
 We now present the complete pricing mechanism that integrates the behavioral separability, contamination estimation, and robust optimization components developed in the preceding sections. The defensive pricing loop algorithm formalizes the process as a Stackelberg game where the platform (leader) sets prices and the aggregate demand (follower) responds through observed session trajectories.
 \begin{algorithm}[t]
 \caption{PHANTOM defensive pricing loop}
 \label{alg:phantom_loop_clean}
 \DontPrintSemicolon
 \SetKwInput{Input}{Input}
 \SetKwInput{Output}{Output}
 \Input{catalog size N; action scale grid; nominal contamination; ambiguity radius; candidate count K; horizon T; sessions per step M; behavior kernels for humans and agents; event weights; COI penalty}
 \Output{trajectory of price, demand, and contamination over time}
 \For{each time step t from 0 to T-1}{
  observe previous demand and price\;
  choose discrete action from policy\;
  set new price by scaling previous price with chosen action, keeping within bounds\;
  define local ambiguity interval around nominal contamination\;
  \For{each candidate k from 1 to K}{
    set contamination level for this candidate from a uniform grid\;
    sample M sessions from mixture of human and agent behaviors weighted by contamination\;
    compute demand proxy by summing weighted actions across all sessions\;
    compute divergence scores and session agent score from transition kernel\;
    compute candidate reward as revenue minus COI leakage penalty\;
  }
  choose worst-case candidate (lowest reward), set contamination to that level\;
  set demand and reward to worst-case values\;
 }
 \end{algorithm}
 The algorithm operates in discrete epochs indexed by time. At each epoch, the platform applies one discrete multiplicative price action, the environment samples a batch of sessions, and demand is recomputed from weighted events. Robustness is implemented as an inner minimization over a small local grid of contamination candidates around nominal contamination, matching the current engine implementation. The history buffer enforces the alternating Stackelberg structure by preserving the temporal sequence of price publications and demand observations.
--- a/paper/src/mirrors/genpop/04-results.tex
+++ b/paper/src/mirrors/genpop/04-results.tex
@@ -0,0 +1,55 @@
 \section{Results}
 \begin{figure}[ht]
    \centering
    \input{chapters/figures/supra.tex}
    \caption{Evolution of price distributions over experiment steps. The heatmap illustrates the density of price offerings. This is an early baseline simulation which demonstrates supra-competitive price-setting in deep learning agents such as SAC as can be clearly seen by the high density at the highest available price.}
    \label{fig:supra_heatmap}
 \end{figure}
 \subsection{Behavioral Analysis}
 Separability between human and agent sessions is evaluated by computing per-session divergence gap scores (how much closer each session is to the human baseline versus the agent baseline) and comparing the two groups with a Mann-Whitney U test. The table below reports the group-level descriptive statistics for the gap scores and the test result.
 \begin{table}[ht]
 \centering
 \caption{Per-session divergence gap (distance to human baseline minus distance to agent baseline) by actor class with Mann-Whitney U test.}
 \label{tab:divergence_significance}
 \begin{tabular}{lccc}
 \toprule
 Group & n & Mean gap & Std \\
 \midrule
 Human sessions & 11 & $-3.3522$ & $2.6748$ \\
 Agent sessions & 6 & $+1.6482$ & $2.8349$ \\
 \midrule
 \multicolumn{4}{l}{Mann-Whitney $U = 2.0$, $p = 0.0006$ (two-sided)} \\
 \bottomrule
 \end{tabular}
 \end{table}
 The sign structure is consistent with the theoretical expectation: human sessions produce negative gap scores (closer to the human centroid, far from the agent centroid) while agent sessions produce positive gap scores (closer to the agent centroid). The two-sided p-value of 0.0006 (meaning that, if the two groups did not truly differ, a separation at least this strong would arise by chance only about 0.06\% of the time) indicates near-complete rank separation between the groups at n=11 humans and n=6 agents, providing strong evidence that the transition kernels are separable enough to justify their use as a control signal in downstream pricing.
 \subsection{Experimental Outcomes}
 To evaluate robustness contributions, we compare two policies on the same environment family: (i) robust pricing with COI-aware reward and adversarial contamination step, and (ii) non-robust baseline with revenue-only reward (no-robust flag).
 \begin{table}[ht]
 \centering
 \caption{Pricing policy benchmark for robust vs non-robust training.}
 \label{tab:pricing_benchmark}
 \begin{tabular}{lcccc}
 \toprule
 Policy & Eval reward & Eval revenue & COI leakage & Margin collapse rate \\
 \midrule
 Robust policy & \textit{TBD} & \textit{TBD} & \textit{TBD} & \textit{TBD} \\
 Non-robust baseline (\texttt{--no-robust}) & \textit{TBD} & \textit{TBD} & \textit{TBD} & \textit{TBD} \\
 \bottomrule
 \end{tabular}
 \end{table}
 This comparison isolates the effect of robustness terms from model capacity and optimization settings, and provides the benchmark needed for interpreting the value of COI-aware control.
 \subsection{Interpretation and Insights}
 The Mann-Whitney result (U=2.0, p less than 0.001) confirms that per-session divergence gaps separate the two actor classes with near-zero overlap in rank ordering. This is the condition required for separability to act as a useful control signal in the pricing loop rather than just an auxiliary classifier score.
 \subsection{Anomalies}
--- a/paper/src/mirrors/genpop/05-discussion.tex
+++ b/paper/src/mirrors/genpop/05-discussion.tex
@@ -0,0 +1,17 @@
 \section{Discussion}
 \subsection{Transition to Agentic Market Microstructure}
 Our analysis of the interaction dynamics between the platform and non-human actors suggests that the current static pricing models are insufficient for an agent-mediated economy. If we assume a transition toward a direct revelation mechanism, where actors must reveal their true valuation of a good through bidding dynamics, we inevitably introduce significant stochasticity into the pricing system. Unlike traditional e-commerce where prices are relatively sticky, such a mechanism implies a high volatility characteristic of financial equity markets (without the fungibility, however).
 However, ecommerce commodities differ fundamentally from financial securities: they possess a hard floor defined by unit economics and reservation prices. Although the market might react enthusiastically to an iPhone priced at \$1, such a transaction is not permissible. The platform must establish an initial valuation anchor defined by the marginal cost plus a target margin, around which the market price is permitted to fluctuate. We float the introduction of GenAI Agents as Institutional Market Makers. As the arms race for greater autonomy of agentic systems grows, the commercial viability of AI agents has the potential to disseminate into everyday users directly interacting with them rather than e-commerce platforms. This is also under the assumption of expected transactional capabilities being given to AI Agents.
 \subsection{Risk Assessment and Limitations}
 This technology does not come without a more bitter side: ethical concerns do arise from the idea of deploying black-box-like solutions to set prices based on behavioral attributes. Approaches like universal behavioral profile modeling (UBPM) used in recommendation systems are very broadly utilized.
 With a system like this there is potential for strong drift given the rapid advance of agentic systems and user preference. Our intent behind adding the UX term into the reward shaping process was to further address the risk of degraded user experience. Looking deeper at the underlying methodology, reinforcement learning does not come without its complications, such as reward hacking and often the lack of interpretability, which is quite critical in systems that have a strong impact on the revenue of a company.
 \subsection{Implications of Findings}
 Interpretation of results and alternative scenarios with broader market implications.
--- a/paper/src/mirrors/genpop/06-conclusion.tex
+++ b/paper/src/mirrors/genpop/06-conclusion.tex
@@ -0,0 +1,12 @@
 \section{Conclusion}
 For our troubles, we now conclude that...
 \subsection{Summary of contributions}
 The author's contribution was not without the advice of many experienced experts in the field. We thank (NAME), the director of innovation at Microsoft, for the initial critical discussion on the topic of dynamic pricing systems and the spark which has led to this work. We thank Eugene Bykovets for pointing out the parallels in blockchain systems and the complexity of anonymous interaction and understanding of intent. Importantly, we acknowledge the contributions of Alverto Martin, my academic advisor, for the support throughout and for taking on the challenge of this ambitious work. Many breakthroughs were thanks to numerous discussions with my peers on the topics covered here.
 A thanks to the head of innovation at Amadeus for insight into the industry split on the topic of collapsing margins. Finally we acknowledge the power and use of generative AI technologies for in depth research, rapid prototyping and surfacing of key topics and niches.
 \subsection{Future Works and Next Steps}
 During the eight months of research dedicated to this work, a plethora of opportunities and industry gaps were identified; sadly, a majority of them could not be addressed directly.
--- a/paper/src/mirrors/genpop/INSTRUCTIONS.md
+++ b/paper/src/mirrors/genpop/INSTRUCTIONS.md
@@ -0,0 +1 @@
 The defined paper in chapters/ is a complete technical thesis written with all math and software aspects. Your job is to review and fully read this thesis and synthesize a mirrored version of it which can be read by the general public. This version should preserve all language and phrasing of the original thesis (as well as the order); it should, however, remove any math formulas or complex algorithms. Instead, your job is to understand those formulas and explain them in-line for a normal person to understand. That said, anything that is written you should rewrite word-by-word unless it's something too technical; then you should create a minimal diff/adjustment which would replace the technical with a less technical version.
--- a/paper/src/mirrors/genpop/README.md
+++ b/paper/src/mirrors/genpop/README.md
@@ -0,0 +1,83 @@
 # General Public Mirror
 This directory contains a general-public edition of the PHANTOM thesis. Mathematical formulas and complex algorithms have been translated into plain language explanations while preserving the complete narrative and all research findings.
 ## Build Instructions
 ### Quick Build
 From the project root:
 ```bash
 make pdf.genpop
 ```
 ### Watch Mode (auto-rebuild on changes)
 ```bash
 make pdf.genpop.watch
 ```
 ### Manual Build
 ```bash
 cd paper/src
 latexmk -pdf -jobname=main-genpop -f -interaction=nonstopmode -outdir=../build main-genpop.tex
 ```
 ## Output Location
 The compiled PDF will be at:
 ```
 paper/build/main-genpop.pdf
 ```
 ## What's Different?
 ### Original Technical Version
 - Complex mathematical formulas and equations
 - Formal algorithmic pseudocode
 - Statistical notation and proofs
 - Assumes advanced math background
 ### General Public Version
 - Plain language explanations of formulas
 - Step-by-step algorithm descriptions
 - Intuitive statistical explanations
 - Accessible to non-technical readers
 ## Structure
 All mirrored chapters follow the same structure as the original:
 - `01-intro.tex` - Introduction
 - `02-literature-review.tex` - Literature Review
 - `03-methodology.tex` - Methodology (most heavily adapted)
 - `04-results.tex` - Results
 - `05-discussion.tex` - Discussion
 - `06-conclusion.tex` - Conclusion
 ## Translation Approach
 Following the instructions in `INSTRUCTIONS.md`, we:
 1. **Preserve** all language and phrasing from the original
 2. **Replace** mathematical formulas with inline plain-language explanations
 3. **Simplify** complex algorithms into readable step descriptions
 4. **Maintain** all citations, figures, tables, and narrative flow
 5. **Keep** technical terms when commonly understood
 6. **Explain** technical concepts inline for general readers
 ## Example Transformations
 **Mathematical formula:**
 ```latex
 \hat{q}_{t,i} = \sum_{s \in \mathcal{S}_t} \sum_{k=1}^{L_s} \omega(a_{s,k}) \cdot \mathbb{1}[i_{s,k} = i]
 ```
 **Becomes:**
 "for each session in a time period, we sum up all the events where a specific product was interacted with, and we weight those events by how strong a signal they provide about willingness to pay"
 **Proof notation:**
 ```latex
 P(p_{(1)} > t) = [1 - F(t)]^N \to 0
 ```
 **Becomes:**
 "When we raise a number less than 1 to higher and higher powers (as N grows), it decays exponentially toward zero"
--- a/scripts/nx_paper.sh
+++ b/scripts/nx_paper.sh
@@ -0,0 +1,57 @@
 #!/usr/bin/env bash
 # Paper task dispatcher: the first argument selects which paper command to run.
 set -euo pipefail
 cmd="${1:-}"
 # Enter paper/src and run latexmk for job "$1" (source file "$1.tex").
 # Any further arguments are extra latexmk flags placed before the shared ones.
 run_latexmk() {
   local job="$1"
   shift
   cd paper/src
   latexmk "$@" -pdf -jobname="$job" -f -interaction=nonstopmode -file-line-error -r ../.latexmkrc -outdir=../build "$job.tex"
 }
 case "$cmd" in
  build)
    # One-shot build of the main document; the code snapshot is refreshed first.
    mkdir -p paper/build
    bash paper/concat_code.sh
    run_latexmk main
    ;;
  watch)
    # Continuous preview (-pvc) of the main document.
    mkdir -p paper/build
    run_latexmk main -pvc
    ;;
  clean)
    # Let latexmk clean its own outputs (failure tolerated), then empty the build directory.
    cd paper/src
    latexmk -C -jobname=main -outdir=../build || true
    rm -rf ../build/*
    ;;
  wordcount)
    printf '%s\n' 'Counting words in main text (excluding appendix)...'
    texcount -nosub -total -sum -1 \
      paper/src/chapters/01-intro.tex \
      paper/src/chapters/02-literature-review.tex \
      paper/src/chapters/03-methodology.tex \
      paper/src/chapters/04-results.tex \
      paper/src/chapters/05-discussion.tex \
      paper/src/chapters/06-conclusion.tex
    ;;
  build-genpop)
    # One-shot build of the general-public mirror.
    mkdir -p paper/build
    run_latexmk main-genpop
    ;;
  watch-genpop)
    # Continuous preview (-pvc) of the general-public mirror.
    mkdir -p paper/build
    run_latexmk main-genpop -pvc
    ;;
  build-arxiv)
    # Classic pdflatex / bibtex / pdflatex x2 cycle, then copy the PDF into paper/build.
    mkdir -p paper/build
    cd paper/src/mirrors/arxiv
    pdflatex -interaction=nonstopmode -file-line-error main.tex
    bibtex main
    pdflatex -interaction=nonstopmode -file-line-error main.tex
    pdflatex -interaction=nonstopmode -file-line-error main.tex
    cp main.pdf ../../../build/main-arxiv.pdf
    ;;
  *)
    printf '%s\n' "Unknown paper command: $cmd" >&2
    exit 1
    ;;
 esac
--- a/scripts/nx_research.sh
+++ b/scripts/nx_research.sh
@@ -0,0 +1,127 @@
 #!/usr/bin/env bash
 # Research task dispatcher: the first argument selects the command to run.
 set -euo pipefail
 cmd="${1:-}"
 # File holding sweep/W&B settings; override with SWEEP_ENV_FILE.
 env_file="${SWEEP_ENV_FILE:-.env.sweep}"
 # Source $env_file (when it exists) with auto-export switched on, so every
 # variable it assigns is exported. A missing file or a failing source is ignored.
 load_sweep_env() {
  set -a
  if [ -f "$env_file" ]; then
    . "$env_file" || true
  fi
  set +a
 }
 # require_var NAME MESSAGE
 # Return normally when the variable called NAME is set and non-empty;
 # otherwise print MESSAGE to stderr and exit the script with status 1.
 require_var() {
  local name="$1" msg="$2"
  if [ -n "${!name:-}" ]; then
    return 0
  fi
  printf '%s\n' "$msg" >&2
  exit 1
 }
 # Dispatch on the requested command. The *_ARGS variables are expanded unquoted
 # on purpose so that their contents split into separate CLI arguments.
 case "$cmd" in
  # Create .venv if its python is missing, then install requirements.txt into it.
  install)
    [ -x .venv/bin/python ] || python3 -m venv .venv
    .venv/bin/python -m ensurepip --upgrade
    .venv/bin/python -m pip install -r requirements.txt
    ;;
  # Single training run; LOCAL_TRAIN_ARGS replaces the default PPO arguments.
  train)
    load_sweep_env
    require_var WANDB_API_KEY "WANDB_API_KEY required - set it in $env_file"
    WANDB_ENTITY="${WANDB_ENTITY:-}" \
    WANDB_PROJECT="${WANDB_PROJECT:-capstone}" \
    WANDB_API_KEY="$WANDB_API_KEY" \
      .venv/bin/python -m engine.train ${LOCAL_TRAIN_ARGS:---algo ppo --total-timesteps 50000}
    ;;
  # Local benchmark; WANDB_API_KEY is only required when LOCAL_BENCHMARK_ARGS
  # does not contain the --no-wandb flag.
  benchmark)
    load_sweep_env
    if [[ " ${LOCAL_BENCHMARK_ARGS:-} " != *" --no-wandb "* ]]; then
      require_var WANDB_API_KEY "WANDB_API_KEY required - set it in $env_file"
    fi
    WANDB_ENTITY="${WANDB_ENTITY:-}" \
    WANDB_PROJECT="${WANDB_PROJECT:-capstone}" \
    WANDB_API_KEY="${WANDB_API_KEY:-}" \
      .venv/bin/python -m engine.train --run-kind benchmark ${LOCAL_BENCHMARK_ARGS:---tiers static,surge,linear,qtable,ppo --alpha-values 0.0,0.3 --episodes 3 --total-timesteps 3000 --max-steps 40 --device cpu}
    ;;
  # Same shape as "benchmark" but driven by SIMPLE_BENCHMARK_ARGS, and with
  # PHANTOM_BENCHMARK_COMPARE_ROBUST defaulting to 1.
  benchmark-simple)
    load_sweep_env
    if [[ " ${SIMPLE_BENCHMARK_ARGS:-} " != *" --no-wandb "* ]]; then
      require_var WANDB_API_KEY "WANDB_API_KEY required - set it in $env_file"
    fi
    WANDB_ENTITY="${WANDB_ENTITY:-}" \
    WANDB_PROJECT="${WANDB_PROJECT:-capstone}" \
    WANDB_API_KEY="${WANDB_API_KEY:-}" \
    PHANTOM_BENCHMARK_COMPARE_ROBUST="${PHANTOM_BENCHMARK_COMPARE_ROBUST:-1}" \
      .venv/bin/python -m engine.train --run-kind benchmark ${SIMPLE_BENCHMARK_ARGS:---tiers qtable,ppo,dqn,a2c --alpha-values 0.0,0.15,0.3,0.45,0.6 --episodes 8 --total-timesteps 8000 --max-steps 40 --device cpu}
    ;;
  # Sweep agent for training; --count is only passed when AGENT_COUNT is set
  # and is not "0".
  train-agent)
    load_sweep_env
    require_var WANDB_API_KEY "WANDB_API_KEY required - set it in $env_file"
    require_var SWEEP_ID "SWEEP_ID required, e.g. SWEEP_ID=entity/project/id"
    args=(--sweep-agent --sweep-id "$SWEEP_ID")
    if [ -n "${AGENT_COUNT:-}" ] && [ "${AGENT_COUNT}" != "0" ]; then
      args+=(--count "$AGENT_COUNT")
    fi
    WANDB_ENTITY="${WANDB_ENTITY:-}" \
    WANDB_PROJECT="${WANDB_PROJECT:-capstone}" \
    WANDB_API_KEY="$WANDB_API_KEY" \
      .venv/bin/python -m engine.train "${args[@]}"
    ;;
  # Sweep agent in benchmark mode; BENCHMARK_AGENT_ARGS is appended when set.
  benchmark-agent)
    load_sweep_env
    require_var WANDB_API_KEY "WANDB_API_KEY required - set it in $env_file"
    require_var SWEEP_ID "SWEEP_ID required, e.g. SWEEP_ID=entity/project/id"
    args=(--sweep-agent --sweep-id "$SWEEP_ID")
    if [ -n "${AGENT_COUNT:-}" ] && [ "${AGENT_COUNT}" != "0" ]; then
      args+=(--count "$AGENT_COUNT")
    fi
    WANDB_ENTITY="${WANDB_ENTITY:-}" \
    WANDB_PROJECT="${WANDB_PROJECT:-capstone}" \
    WANDB_API_KEY="$WANDB_API_KEY" \
      .venv/bin/python -m engine.train --run-kind benchmark "${args[@]}" ${BENCHMARK_AGENT_ARGS:-}
    ;;
  # Validate the required settings, then hand off to scripts/wandb_agent_bootstrap.sh
  # with defaults filled in for the optional ones.
  train-bootstrap)
    load_sweep_env
    require_var WANDB_API_KEY "WANDB_API_KEY required - set it in $env_file"
    require_var GITHUB_TOKEN "GITHUB_TOKEN required - set it in $env_file"
    require_var REPO_URL "REPO_URL required, e.g. REPO_URL=https://github.com/org/repo.git"
    require_var SWEEP_ID "SWEEP_ID required, e.g. SWEEP_ID=entity/project/id"
    WANDB_API_KEY="$WANDB_API_KEY" \
    WANDB_ENTITY="${WANDB_ENTITY:-}" \
    WANDB_PROJECT="${WANDB_PROJECT:-capstone}" \
    GITHUB_TOKEN="$GITHUB_TOKEN" \
    REPO_URL="$REPO_URL" \
    BRANCH="${BRANCH:-main}" \
    WORKDIR="${WORKDIR:-$HOME/PHANTOM-agent}" \
    SWEEP_ID="$SWEEP_ID" \
    AGENT_COUNT="${AGENT_COUNT:-0}" \
    AGENT_LOOP="${AGENT_LOOP:-1}" \
    RETRY_SECONDS="${RETRY_SECONDS:-20}" \
      bash scripts/wandb_agent_bootstrap.sh
    ;;
  # Print the total line count of .ts and .py files below the current directory,
  # skipping any path that has a node_modules, .venv or venv component.
  stats)
    python3 - <<'PY'
 from pathlib import Path
 skip = {"node_modules", ".venv", "venv"}
 exts = {".ts", ".py"}
 total = 0
 for path in Path(".").rglob("*"):
    if not path.is_file() or path.suffix not in exts or any(part in skip for part in path.parts):
        continue
    text = path.read_text(errors="ignore")
    total += text.count("\n") + (1 if text and not text.endswith("\n") else 0)
 print(total)
 PY
    ;;
  # Build the "gpu" target of docker/Trainer.dockerfile and push it as
  # <image_ref>:gpu-latest; TRAIN_IMAGE_REF overrides the default repository.
  docker-train-publish)
    image_ref="${TRAIN_IMAGE_REF:-us-central1-docker.pkg.dev/phantom-trc/phantom/phantom-trainer}"
    docker build -f docker/Trainer.dockerfile --target gpu -t "$image_ref:gpu-latest" .
    docker push "$image_ref:gpu-latest"
    ;;
  # Anything else (including no argument) is an error.
  *)
    printf '%s\n' "Unknown research command: $cmd" >&2
    exit 1
    ;;
 esac
--- a/scripts/tpu_pod_run.sh
+++ b/scripts/tpu_pod_run.sh
@@ -1,32 +0,0 @@
 #!/usr/bin/env sh
 # Executed on each TPU pod worker via `gcloud tpu-vm scp` + `gcloud tpu-vm ssh --worker=all`.
 # Authenticates with Artifact Registry using the VM's service account metadata token,
 # pulls the TPU trainer image, then runs the W&B sweep agent inside Docker.
 # TPU chip devices (/dev/accel*) are exposed via --privileged + /dev volume mount.
 # Required env vars: WANDB_API_KEY, SWEEP_ID
 # Optional: AGENT_COUNT (default 1, 0 = run until sweep ends)
 set -eu
 # Validate required inputs up front: without this, `set -u` only trips at the
 # final `docker run`, i.e. after the registry login and the image pull.
 : "${WANDB_API_KEY:?WANDB_API_KEY is required}"
 : "${SWEEP_ID:?SWEEP_ID is required}"
 IMAGE="us-central1-docker.pkg.dev/phantom-trc/phantom/phantom-trainer:tpu-latest"
 AGENT_COUNT="${AGENT_COUNT:-1}"
 # use VM service account — no manual key needed on the pod
 TOKEN=$(curl -sf -H "Metadata-Flavor: Google" \
  "http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/token" \
  | python3 -c 'import sys, json; print(json.load(sys.stdin)["access_token"])')
 # The token is fed on stdin so it never appears on the docker command line.
 echo "$TOKEN" | sudo docker login -u oauth2accesstoken \
  --password-stdin https://us-central1-docker.pkg.dev
 sudo docker pull "$IMAGE"
 # --privileged + /dev mount gives the container access to /dev/accel* (TPU chips)
 # --network host lets JAX reach the other pod workers for distributed init
 sudo docker run --rm \
  --privileged \
  --network host \
  --volume /dev:/dev \
  -e WANDB_API_KEY="$WANDB_API_KEY" \
  -e SWEEP_ID="$SWEEP_ID" \
  -e AGENT_COUNT="$AGENT_COUNT" \
  "$IMAGE"
--- a/scripts/tpu_sync_repo.sh
+++ b/scripts/tpu_sync_repo.sh
@@ -1,83 +0,0 @@
 #!/usr/bin/env sh
 # Packs the local repository into a tarball, copies it to every worker of a
 # TPU VM and unpacks it into REMOTE_REPO_DIR (which is wiped first).
 # Required env vars: TPU_NAME
 # Optional: TPU_ZONE, TPU_PROJECT, LOCAL_REPO_DIR, REMOTE_REPO_DIR, ARCHIVE_PATH
 set -eu
 TPU_NAME="${TPU_NAME:?TPU_NAME is required}"
 TPU_ZONE="${TPU_ZONE:-us-central2-b}"
 TPU_PROJECT="${TPU_PROJECT:-phantom-trc}"
 LOCAL_REPO_DIR="${LOCAL_REPO_DIR:-$(pwd)}"
 REMOTE_REPO_DIR="${REMOTE_REPO_DIR:-/tmp/PHANTOM}"
 ARCHIVE_PATH="${ARCHIVE_PATH:-/tmp/phantom-sync.tgz}"
 FILE_LIST="$(mktemp /tmp/phantom-sync-files.XXXXXX)"
 CLEANUP_LIST=true
 # Runs on every exit path. Besides the temporary file list it also removes the
 # local archive, so a failed tar/scp/ssh no longer leaves the tarball behind.
 cleanup() {
  if [ "$CLEANUP_LIST" = "true" ]; then
    rm -f "$FILE_LIST"
  fi
  rm -f "$ARCHIVE_PATH"
 }
 trap cleanup EXIT
 if [ ! -d "$LOCAL_REPO_DIR" ]; then
  echo "local repo directory not found: $LOCAL_REPO_DIR"
  exit 1
 fi
 if git -C "$LOCAL_REPO_DIR" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  # Inside a git work tree: archive tracked files plus untracked, non-ignored
  # ones, after dropping entries under the skip prefixes below.
  git -C "$LOCAL_REPO_DIR" ls-files -co --exclude-standard > "$FILE_LIST"
  python3 - "$FILE_LIST" <<'PY'
 import sys
 from pathlib import Path
 file_list = Path(sys.argv[1])
 skip_prefixes = (
    "wandb/",
    ".venv/",
    "venv/",
    "node_modules/",
    ".next/",
    ".turbo/",
    "__pycache__/",
    ".mypy_cache/",
    ".pytest_cache/",
    ".ruff_cache/",
    "paper/build/",
    "tests/e2e/test-results/",
 )
 rows = file_list.read_text().splitlines()
 kept = [
    row
    for row in rows
    if row and not any(row == p.rstrip("/") or row.startswith(p) for p in skip_prefixes)
 ]
 file_list.write_text("\n".join(kept) + ("\n" if kept else ""))
 PY
  tar -czf "$ARCHIVE_PATH" -C "$LOCAL_REPO_DIR" -T "$FILE_LIST"
 else
  # Not a git work tree: archive the whole directory with explicit excludes.
  tar \
    --exclude-vcs \
    --exclude=".venv" --exclude="*/.venv" \
    --exclude="venv" --exclude="*/venv" \
    --exclude="node_modules" --exclude="*/node_modules" \
    --exclude=".next" --exclude="*/.next" \
    --exclude=".turbo" --exclude="*/.turbo" \
    --exclude="__pycache__" --exclude="*/__pycache__" \
    --exclude=".mypy_cache" --exclude="*/.mypy_cache" \
    --exclude=".pytest_cache" --exclude="*/.pytest_cache" \
    --exclude=".ruff_cache" --exclude="*/.ruff_cache" \
    --exclude="wandb" --exclude="*/wandb" \
    --exclude="paper/build" \
    --exclude="tests/e2e/test-results" \
    -czf "$ARCHIVE_PATH" \
    -C "$LOCAL_REPO_DIR" .
 fi
 gcloud compute tpus tpu-vm scp "$ARCHIVE_PATH" "$TPU_NAME:/tmp/phantom-sync.tgz" \
  --zone="$TPU_ZONE" --project="$TPU_PROJECT" --worker=all
 # Replace the remote checkout with the archive contents, then drop the remote tarball.
 gcloud compute tpus tpu-vm ssh "$TPU_NAME" \
  --zone="$TPU_ZONE" --project="$TPU_PROJECT" --worker=all \
  --command="rm -rf '$REMOTE_REPO_DIR' && mkdir -p '$REMOTE_REPO_DIR' && tar -xzf /tmp/phantom-sync.tgz -C '$REMOTE_REPO_DIR' && rm -f /tmp/phantom-sync.tgz"
 # The local archive is removed by the EXIT trap.
--- a/scripts/tpu_vm_sweep_agent.py
+++ b/scripts/tpu_vm_sweep_agent.py
@@ -1,183 +0,0 @@
 #!/usr/bin/env python3
 from __future__ import annotations
 import argparse
 import json
 import os
 import re
 import shlex
 import subprocess
 import time
 from pathlib import Path
 import wandb
 CLI_MAP: dict[str, str] = {
    "algo": "--algo",
    "total_timesteps": "--total-timesteps",
    "alpha": "--alpha",
    "N": "--N",
    "n_products": "--n-products",
    "lambda_coi": "--lambda-coi",
    "info_value": "--info-value",
    "robust_radius": "--robust-radius",
    "robust_points": "--robust-points",
    "learning_rate": "--learning-rate",
    "gamma": "--gamma",
    "gae_lambda": "--gae-lambda",
    "clip_range": "--clip-range",
    "ent_coef": "--ent-coef",
    "revenue_weight": "--revenue-weight",
    "max_steps": "--max-steps",
    "margin_floor": "--margin-floor",
    "margin_floor_patience": "--margin-floor-patience",
    "arch": "--arch",
    "activation": "--activation",
    "jax_num_envs": "--jax-num-envs",
    "jax_num_steps": "--jax-num-steps",
    "jax_num_minibatches": "--jax-num-minibatches",
    "jax_update_epochs": "--jax-update-epochs",
    "jax_anneal_lr": "--jax-anneal-lr",
    "checkpoint_interval": "--checkpoint-interval",
    "action_levels": "--action-levels",
    "action_scale_low": "--action-scale-low",
    "action_scale_high": "--action-scale-high",
 }
 def _to_cli_args(cfg: dict) -> str:
    parts: list[str] = ["--jax", "--no-wandb"]
    for key, flag in CLI_MAP.items():
        if key not in cfg:
            continue
        value = cfg[key]
        if value is None:
            continue
        if isinstance(value, bool):
            if key == "jax_anneal_lr":
                parts.extend([flag, "true" if value else "false"])
            elif value:
                parts.append(flag)
            continue
        parts.extend([flag, str(value)])
    return " ".join(shlex.quote(p) for p in parts)
 _SENTINEL = "PHANTOM_METRICS:"
 def _extract_metrics(output: str) -> dict:
    # fast path: look for the dedicated sentinel line emitted by run_local
    for line in output.splitlines():
        if line.startswith(_SENTINEL):
            try:
                return json.loads(line[len(_SENTINEL) :])
            except Exception:
                break
    # fallback: scan for any JSON block containing eval/sweep keys;
    # use greedy match to capture the largest possible block first
    for block in re.findall(r"\{[^{}]*\}", output):
        try:
            obj = json.loads(block)
        except Exception:
            continue
        if isinstance(obj, dict) and ("sweep/score" in obj or "eval/reward" in obj):
            return obj
    return {}
 def main() -> None:
    """Prepare the TPU workers once, then serve trials of a W&B sweep.

    Command-line options select the sweep, the TPU (name, zone, project, remote
    repo directory), an optional trial count and the local working directory in
    which ``make`` is invoked.

    Raises:
        RuntimeError: if the ``make train.tpu.vm.prepare`` step exits non-zero.
    """
    p = argparse.ArgumentParser(
        description="Run W&B sweep where each trial uses full TPU pod"
    )
    p.add_argument("--sweep-id", required=True)
    p.add_argument("--tpu-name", required=True)
    p.add_argument("--tpu-zone", default="us-central2-b")
    p.add_argument("--tpu-project", default="phantom-trc")
    p.add_argument("--tpu-repo-dir", default="/tmp/PHANTOM")
    p.add_argument("--count", type=int, default=0)
    # Default workdir is the parent of the directory containing this script.
    p.add_argument("--workdir", default=str(Path(__file__).resolve().parents[1]))
    args = p.parse_args()
    workdir = Path(args.workdir).resolve()
    env = os.environ.copy()
    # One-off preparation, run before any trial; its output is not captured.
    prepare_cmd = [
        "make",
        "train.tpu.vm.prepare",
        f"TPU_NAME={args.tpu_name}",
        f"TPU_ZONE={args.tpu_zone}",
        f"TPU_PROJECT={args.tpu_project}",
        f"TPU_REPO_DIR={args.tpu_repo_dir}",
    ]
    prepare = subprocess.run(
        prepare_cmd,
        cwd=workdir,
        env=env,
        text=True,
        capture_output=False,
        check=False,
    )
    if prepare.returncode != 0:
        raise RuntimeError("Failed to prepare TPU workers for sweep")
    def run_trial() -> None:
        """Run one sweep trial via ``make train.tpu.vm.run`` and record its result.

        The W&B config is turned into CLI arguments and handed to the make
        target through the ``LOCAL_TRAIN_ARGS`` environment variable. A non-zero
        exit code is stored in the run summary and raised as ``RuntimeError``.
        """
        run = None
        try:
            run = wandb.init()
            cfg = dict(wandb.config)
            cli_args = _to_cli_args(cfg)
            # Per-trial copy so the shared environment is never mutated.
            env_trial = dict(env)
            env_trial["LOCAL_TRAIN_ARGS"] = cli_args
            cmd = [
                "make",
                "train.tpu.vm.run",
                f"TPU_NAME={args.tpu_name}",
                f"TPU_ZONE={args.tpu_zone}",
                f"TPU_PROJECT={args.tpu_project}",
                f"TPU_REPO_DIR={args.tpu_repo_dir}",
            ]
            # Output is captured because metrics are parsed from stdout below.
            proc = subprocess.run(
                cmd,
                cwd=workdir,
                env=env_trial,
                text=True,
                capture_output=True,
                check=False,
            )
            # Echo the captured streams (both go to this process's stdout).
            if proc.stdout:
                print(proc.stdout)
            if proc.stderr:
                print(proc.stderr)
            if proc.returncode != 0:
                if run is not None:
                    run.summary["runner/exit_code"] = proc.returncode
                raise RuntimeError(f"TPU trial failed with exit code {proc.returncode}")
            metrics = _extract_metrics(proc.stdout)
            if metrics:
                # Metrics are both logged and mirrored into the run summary.
                wandb.log(metrics)
                for k, v in metrics.items():
                    run.summary[k] = v
            run.summary["runner/exit_code"] = 0
        except Exception:
            # Short pause before re-raising the original exception.
            time.sleep(2)
            raise
        finally:
            # Always close the run if one is still active.
            if run is not None and wandb.run is not None:
                wandb.finish()
    wandb.agent(
        args.sweep_id,
        function=run_trial,
        # A count of 0 or less is passed to wandb.agent as None.
        count=args.count if args.count > 0 else None,
    )
 if __name__ == "__main__":
    main()
--- a/scripts/tpu_vm_train.sh
+++ b/scripts/tpu_vm_train.sh
@@ -1,43 +0,0 @@
 #!/usr/bin/env sh
 # Prepare the Python environment inside the repo checkout and launch
 # `engine.train`.
 #
 # Environment overrides (defaults are assigned below):
 #   REPO_DIR                   checkout to run from
 #   PYTHON_BIN                 interpreter used for pip and for training
 #   TRAIN_ARGS                 arguments forwarded to `engine.train`
 #   EXTRA_PIP                  packages installed when the JAX stack is missing
 #   INSTALL_FULL_REQUIREMENTS  set to 1 to also install requirements.txt
 #   WANDB_API_KEY              when non-empty, training runs with wandb enabled
 set -eu
 REPO_DIR="${REPO_DIR:-$HOME/PHANTOM}"
 PYTHON_BIN="${PYTHON_BIN:-python3}"
 TRAIN_ARGS="${TRAIN_ARGS:---algo ppo --jax --total-timesteps 200000 --jax-num-envs 32 --jax-num-steps 128 --jax-num-minibatches 4 --jax-update-epochs 4}"
 EXTRA_PIP="${EXTRA_PIP:-flax optax distrax}"
 INSTALL_FULL_REQUIREMENTS="${INSTALL_FULL_REQUIREMENTS:-0}"
 if [ ! -d "$REPO_DIR" ]; then
  # Diagnostics go to stderr so they are not mixed into captured stdout.
  echo "repo directory not found: $REPO_DIR" >&2
  exit 1
 fi
 cd "$REPO_DIR"
 # Remove a `wandb` directory from the checkout if one exists.
 # NOTE(review): presumably so a stale local directory cannot shadow the
 # installed wandb package - confirm.
 if [ -d "wandb" ]; then
  rm -rf wandb
 fi
 # keep install idempotent and avoid re-installing jax/libtpu each run
 if [ "$INSTALL_FULL_REQUIREMENTS" = "1" ] && [ -f "requirements.txt" ]; then
  $PYTHON_BIN -m pip install -r requirements.txt
 fi
 # Install the JAX training stack only when it cannot be imported yet.
 # $EXTRA_PIP and $TRAIN_ARGS stay unquoted: they hold several
 # space-separated words that the shell must split into separate arguments.
 if ! $PYTHON_BIN -c 'import flax, optax, distrax' >/dev/null 2>&1; then
  if [ -f "engine/jax/requirements.txt" ]; then
    $PYTHON_BIN -m pip install -r engine/jax/requirements.txt
  fi
  $PYTHON_BIN -m pip install -U $EXTRA_PIP
 fi
 if [ -n "${WANDB_API_KEY:-}" ]; then
  # (Re)install wandb unless the importable module exposes a callable init().
  if ! $PYTHON_BIN -c 'import wandb; assert hasattr(wandb, "init") and callable(wandb.init)' >/dev/null 2>&1; then
    $PYTHON_BIN -m pip install -U wandb
  fi
  export WANDB_API_KEY
  exec $PYTHON_BIN -m engine.train $TRAIN_ARGS
 fi
 # No API key: run the same training with wandb logging disabled.
 exec $PYTHON_BIN -m engine.train $TRAIN_ARGS --no-wandb
--- a/sim/rl/behavior_loader/loader.py
+++ b/sim/rl/behavior_loader/loader.py
@@ -2,6 +2,7 @@ import os
 import json
 from pydantic import BaseModel as Base
 class PayloadModel(Base):
    sessionId: str
    experimentId: str | None
@@ -13,6 +14,7 @@ class PayloadModel(Base):
    userAgent: str
    ts: str
 class ValueModel(Base):
    payload: PayloadModel
    encoding: str
@@ -20,6 +22,7 @@ class ValueModel(Base):
    schemaId: int
    size: int
 class InteractionModel(Base):
    partitionID: int
    offset: int
@@ -30,14 +33,17 @@ class InteractionModel(Base):
    key: dict
    value: ValueModel
 def _is_admin(page: str | None) -> bool:
    return page is not None and page.startswith("/admin/")
 class Loader:
    def __init__(self, src_dir: str):
        self.src_dir = src_dir
        self.entries = os.listdir(src_dir)
-        if not self.entries: raise ValueError("empty directory")
+        if not self.entries:
            raise ValueError("empty directory")
        self.data = self._load_sessions()
    def _load_sessions(self) -> dict:
@@ -55,16 +61,21 @@ class Loader:
    def get_entries(self) -> tuple[list[str], int]:
        return self.entries, len(self.entries)
 class AgentLoader(Loader):
    def _load_sessions(self) -> dict:
        sessions = {}
        for entry in self.entries:
-            with open(f"{self.src_dir}/{entry}/int.json") as f:
+            path = f"{self.src_dir}/{entry}/int.json"
            if not os.path.isfile(path):
                continue
            with open(path) as f:
                raw = json.load(f)
            ints = [PayloadModel(**i) for i in raw]
            sessions[entry] = [i for i in ints if not _is_admin(i.page)]
        return sessions
 class JointLoader:
    def __init__(self, human_dir: str, agent_dir: str):
        self.human_loader = Loader(human_dir)
@@ -74,10 +85,14 @@ class JointLoader:
    def _merge(self) -> dict:
        return {
-            **{f"human_{sid}": [e.value.payload for e in evts]
+            **{
-               for sid, evts in self.human_loader.get_data().items()},
+                f"human_{sid}": [e.value.payload for e in evts]
-            **{f"agent_{sid}": evts
+                for sid, evts in self.human_loader.get_data().items()
-               for sid, evts in self.agent_loader.get_data().items()}
+            },
            **{
                f"agent_{sid}": evts
                for sid, evts in self.agent_loader.get_data().items()
            },
        }
    def get_data(self) -> dict:
@@ -86,12 +101,17 @@ class JointLoader:
    def get_entries(self) -> tuple[list[str], int]:
        return self.entries, len(self.entries)
 if __name__ == "__main__":
    agent_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
-    human_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
+    human_dir = (
        "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
    )
-    for name, cls, path in [("agent", AgentLoader, agent_dir),
+    for name, cls, path in [
        ("agent", AgentLoader, agent_dir),
        ("human", Loader, human_dir),
-                             ("joint", lambda d: JointLoader(human_dir, d), agent_dir)]:
+        ("joint", lambda d: JointLoader(human_dir, d), agent_dir),
    ]:
        ldr = cls(path) if name != "joint" else cls(agent_dir)
        print(f"Loaded {len(ldr.get_entries()[0])} {name} sessions")
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -11,7 +11,7 @@ from pathlib import Path
 # import lib utilities for optional use - models keep their own _state_repr for backwards compat
 # with the specific event structure (evt.value.payload)
-sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / 'lib'))
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "lib"))
 try:
    from lib.state import make_state_repr as lib_make_state_repr
    from lib.features import transition_histogram as lib_transition_histogram
@@ -37,7 +37,8 @@ class BehaviorModel:
    def _extract_sessions(self) -> List[List[str]]:
        trajs = []
        for evts in self.data.values():
-            if len(evts) < 2: continue
+            if len(evts) < 2:
                continue
            states = [self._state_repr(e) for e in sorted(evts, key=self._sort_key)]
            trajs.append(states)
        return trajs
@@ -59,8 +60,10 @@ class BehaviorModel:
        return rwd
    def _normalize_trans(self, cnts: Dict) -> Dict:
-        return {s: {s_n: cnt/sum(nxt.values()) for s_n, cnt in nxt.items()}
+        return {
-                for s, nxt in cnts.items()}
+            s: {s_n: cnt / sum(nxt.values()) for s_n, cnt in nxt.items()}
            for s, nxt in cnts.items()
        }
    def build_MDP(self) -> Dict:
        trajs = self._extract_sessions()
@@ -69,34 +72,40 @@ class BehaviorModel:
        state_rwd = self._calc_rewards(trajs)
        self.mdp = {
-            'states': sorted(states),
+            "states": sorted(states),
-            'num_states': len(states),
+            "num_states": len(states),
-            'transitions': trans_prob,
+            "transitions": trans_prob,
-            'state_values': {s: np.mean(r) for s, r in state_rwd.items()},
+            "state_values": {s: np.mean(r) for s, r in state_rwd.items()},
-            'state_rewards': state_rwd,
+            "state_rewards": state_rwd,
-            'trans_counts': trans_cnt,
+            "trans_counts": trans_cnt,
        }
        return self.mdp
    def transition_prob(self, s: str, s_next: str) -> float:
-        if not self.mdp: raise ValueError("build MDP first")
+        if not self.mdp:
-        return self.mdp['transitions'].get(s, {}).get(s_next, 0.0)
+            raise ValueError("build MDP first")
        return self.mdp["transitions"].get(s, {}).get(s_next, 0.0)
    def state_value(self, s: str) -> float:
-        if not self.mdp: raise ValueError("build MDP first")
+        if not self.mdp:
-        return self.mdp['state_values'].get(s, 0.0)
+            raise ValueError("build MDP first")
        return self.mdp["state_values"].get(s, 0.0)
    def sample_traj(self, start: str, max_len: int = 50) -> List[str]:
-        if not self.mdp: raise ValueError("build MDP first")
+        if not self.mdp:
            raise ValueError("build MDP first")
        path, curr = [start], start
        for _ in range(max_len):
-            nxt = self.mdp['transitions'].get(curr, {})
+            nxt = self.mdp["transitions"].get(curr, {})
-            if not nxt: break
+            if not nxt:
                break
            curr = np.random.choice(list(nxt.keys()), p=list(nxt.values()))
            path.append(curr)
        return path
-    def extract_trajectory_features(self, events: List, max_trans_dim: int = 50) -> np.ndarray:
+    def extract_trajectory_features(
        self, events: List, max_trans_dim: int = 50
    ) -> np.ndarray:
        """Convert trajectory to feature vector using MDP structure for contrastive learning"""
        if not self.mdp:
            self.build_MDP()
@@ -108,7 +117,11 @@ class BehaviorModel:
        trans_counts = defaultdict(int)
        for s, s_next in zip(states, states[1:]):
            trans_counts[(s, s_next)] += 1
-        all_trans = [(s, t) for s in self.mdp['states'] for t in self.mdp['transitions'].get(s, {}).keys()]
+        all_trans = [
            (s, t)
            for s in self.mdp["states"]
            for t in self.mdp["transitions"].get(s, {}).keys()
        ]
        trans_vec = [trans_counts.get(tr, 0) for tr in all_trans[:max_trans_dim]]
        trans_vec = trans_vec + [0] * (max_trans_dim - len(trans_vec))  # pad
        total_trans = sum(trans_counts.values()) or 1
@@ -116,11 +129,13 @@ class BehaviorModel:
        # state coverage ratio
        visited = set(states)
-        features.append(len(visited) / max(self.mdp['num_states'], 1))
+        features.append(len(visited) / max(self.mdp["num_states"], 1))
        # temporal entropy of transitions
        if len(states) > 1:
-            trans_probs = [self.transition_prob(s, s_n) for s, s_n in zip(states, states[1:])]
+            trans_probs = [
                self.transition_prob(s, s_n) for s, s_n in zip(states, states[1:])
            ]
            entropy = -sum(p * np.log(p + 1e-10) for p in trans_probs if p > 0)
            features.append(entropy / max(len(states), 1))
        else:
@@ -150,6 +165,7 @@ class AgentBehaviorModel(BehaviorModel):
    def _sort_key(self, evt):
        return evt.ts
 class JointBehaviorModel(BehaviorModel):
    def __init__(self, human_dir: str, agent_dir: str):
        self.loader = JointLoader(human_dir, agent_dir)
@@ -163,73 +179,187 @@ class JointBehaviorModel(BehaviorModel):
    def _sort_key(self, evt):
        return evt.ts
 def aggregate_event_transitions(mdp: Dict) -> Dict[str, Dict[str, float]]:
    evt_trans = defaultdict(lambda: defaultdict(float))
-    for s, trans in mdp['transitions'].items():
+    for s, trans in mdp["transitions"].items():
-        src = s.split('|')[2]
+        src = s.split("|")[2]
        for s_next, prob in trans.items():
-            dst = s_next.split('|')[2]
+            dst = s_next.split("|")[2]
            evt_trans[src][dst] += prob
    for src in evt_trans:
        total = sum(evt_trans[src].values())
        if total > 0:
-            evt_trans[src] = {dst: p/total for dst, p in evt_trans[src].items()}
+            evt_trans[src] = {dst: p / total for dst, p in evt_trans[src].items()}
    return dict(evt_trans)
-def visualize_mdp(model: BehaviorModel, threshold: float = 0.05, output: str = "mdp_graph",
+
-                  fmt: str = "svg", view: bool = False, export_dot: bool = False):
+def visualize_mdp(
-    if not model.mdp: raise ValueError("build MDP first")
+    model: BehaviorModel,
    threshold: float = 0.05,
    output: str = "mdp_graph",
    fmt: str = "svg",
    view: bool = False,
    export_dot: bool = False,
 ):
    if not model.mdp:
        raise ValueError("build MDP first")
    evt_trans = aggregate_event_transitions(model.mdp)
    g = graphviz.Digraph(format=fmt)
-    g.attr(rankdir='LR', size='30')
+    g.attr(rankdir="LR", size="30")
-    g.attr('node', shape='circle', width='1', height='1')
+    g.attr("node", shape="circle", width="1", height="1")
-    events = set(evt_trans.keys()) | {e for trans in evt_trans.values() for e in trans.keys()}
+    events = set(evt_trans.keys()) | {
        e for trans in evt_trans.values() for e in trans.keys()
    }
    for evt in events:
        g.node(evt)
    for src, dsts in evt_trans.items():
        for dst, prob in dsts.items():
            if prob > threshold:
-                g.edge(src, dst, label=f'{prob:.2f}')
+                g.edge(src, dst, label=f"{prob:.2f}")
    g.render(output, view=view, cleanup=True)
    print(f"Saved MDP graph to {output}.{fmt}")
    if export_dot:
-        with open(f"{output}.dot", 'w') as f:
+        with open(f"{output}.dot", "w") as f:
            f.write(g.source)
        print(f"Exported DOT source to {output}.dot")
    return g
 def kl_divergence(p: Dict[str, float], q: Dict[str, float]) -> float:
    """Smoothed KL divergence D(p || q), summed over the keys of ``p``.

    Each term is ``(p_k + eps) * log((p_k + eps) / (q_k + eps))`` with
    ``eps = 1e-10``. A key missing from ``q`` counts as probability 0.0, and
    keys that appear only in ``q`` do not contribute.
    """
    eps = 1e-10
    terms = []
    for key, mass in p.items():
        p_k = mass + eps
        q_k = q.get(key, 0.0) + eps
        terms.append(p_k * np.log(p_k / q_k))
    return sum(terms)
 def _build_subset_mdp(model: BehaviorModel, session_ids: List) -> Dict:
    """Build a transitions-only MDP from a subset of ``model``'s sessions.

    Session ids that are absent from ``model.data`` or that hold fewer than
    two events are skipped. The remaining sessions are ordered with
    ``model._sort_key`` and mapped to states with ``model._state_repr``.

    Returns:
        A dict with the single key ``"transitions"`` holding the normalised
        transition counts of the selected sessions.
    """

    def ordered_states(events):
        return [model._state_repr(evt) for evt in sorted(events, key=model._sort_key)]

    candidates = (model.data.get(sid, []) for sid in session_ids)
    trajectories = [ordered_states(events) for events in candidates if len(events) >= 2]
    counts, _ = model._calc_transitions(trajectories)
    return {"transitions": model._normalize_trans(counts)}
 def _avg_event_kl(
    src_evt: Dict[str, Dict[str, float]], dst_evt: Dict[str, Dict[str, float]]
 ) -> float:
    common = set(src_evt.keys()) & set(dst_evt.keys())
    if not common:
        return 0.0
    return float(np.mean([kl_divergence(src_evt[e], dst_evt[e]) for e in common]))
 def per_session_divergence(
    model: BehaviorModel,
    reference_evt: Dict[str, Dict[str, float]],
 ) -> List[float]:
    """Score each session of ``model`` against a reference event kernel.

    For every session with at least two events, an event-level transition
    kernel is built from that session alone and compared with
    ``reference_evt``: the score is the mean KL divergence over the source
    events both kernels share, or 0.0 when they share none.

    Sessions with fewer than two events are skipped, so the returned list can
    be shorter than ``model.data``; scores follow ``model.data`` iteration
    order.
    """
    scores = []
    for sid, evts in model.data.items():
        if len(evts) >= 2:
            session_kernel = aggregate_event_transitions(
                _build_subset_mdp(model, [sid])
            )
            # Same shared-event averaging as the split-vs-split comparison.
            scores.append(_avg_event_kl(session_kernel, reference_evt))
    return scores
 def bootstrap_intra_class_divergence(
    model: BehaviorModel,
    n_bootstrap: int = 100,
    seed: int = 42,
 ) -> Dict[str, float]:
    """Estimate within-class divergence from repeated random half splits.

    The session ids of ``model`` are shuffled ``n_bootstrap`` times with a
    generator seeded by ``seed``. Each shuffle is cut into a first part of
    ``n // 2`` sessions and a second part with the rest; an event-level kernel
    is built for each part and the two are compared with ``_avg_event_kl``.

    Returns:
        A dict with ``mean``, ``std``, ``q05``, ``q95`` of the split scores,
        the raw ``scores`` list, ``n_bootstrap``, ``num_sessions`` and an
        ``available`` flag. With fewer than two sessions nothing is sampled:
        the statistics are 0.0, ``scores`` is empty, ``n_bootstrap`` is 0 and
        ``available`` is False.
    """
    session_ids = list(model.data.keys())
    n = len(session_ids)

    def summary(mean, std, q05, q95, runs, values, available):
        return {
            "mean": mean,
            "std": std,
            "q05": q05,
            "q95": q95,
            "n_bootstrap": runs,
            "scores": values,
            "available": available,
            "num_sessions": int(n),
        }

    if n < 2:
        return summary(0.0, 0.0, 0.0, 0.0, 0, [], False)

    half = n // 2
    rng = np.random.default_rng(seed)
    scores = []
    for _ in range(n_bootstrap):
        # NOTE(review): permutation() returns a NumPy array, so the ids passed
        # on are NumPy scalars; lookups rely on them matching the dict keys.
        shuffled = rng.permutation(session_ids)
        first_mdp = _build_subset_mdp(model, list(shuffled[:half]))
        second_mdp = _build_subset_mdp(model, list(shuffled[half:]))
        first_kernel = aggregate_event_transitions(first_mdp)
        second_kernel = aggregate_event_transitions(second_mdp)
        scores.append(_avg_event_kl(first_kernel, second_kernel))

    arr = np.array(scores, dtype=float)
    return summary(
        float(np.mean(arr)),
        float(np.std(arr)),
        float(np.quantile(arr, 0.05)),
        float(np.quantile(arr, 0.95)),
        int(n_bootstrap),
        arr.tolist(),
        True,
    )
 if __name__ == "__main__":
    base_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments"
-    human_dir, agent_dir = f"{base_dir}/collected_data/", f"{base_dir}/agents/collected_data/"
+    human_dir, agent_dir = (
        f"{base_dir}/collected_data/",
        f"{base_dir}/agents/collected_data/",
    )
    human_model = BehaviorModel(human_dir)
    human_mdp = human_model.build_MDP()
-    print(f"Built MDP: {human_mdp['num_states']} states, "
+    print(
-          f"{sum(len(t) for t in human_mdp['transitions'].values())} transitions")
+        f"Built MDP: {human_mdp['num_states']} states, "
-    if not human_mdp['states']:
+        f"{sum(len(t) for t in human_mdp['transitions'].values())} transitions"
    )
    if not human_mdp["states"]:
        exit("No states found")
-    visualize_mdp(human_model, threshold=0.05, output="human_mdp_viz", fmt="pdf", export_dot=True)
+    visualize_mdp(
        human_model, threshold=0.05, output="human_mdp_viz", fmt="pdf", export_dot=True
    )
    agent_model = AgentBehaviorModel(agent_dir)
    agent_mdp = agent_model.build_MDP()
-    print(f"AGENT... Built MDP: {agent_mdp['num_states']} states, "
+    print(
-          f"{sum(len(t) for t in agent_mdp['transitions'].values())} transitions")
+        f"AGENT... Built MDP: {agent_mdp['num_states']} states, "
-    if not agent_mdp['states']:
+        f"{sum(len(t) for t in agent_mdp['transitions'].values())} transitions"
    )
    if not agent_mdp["states"]:
        exit("No states found")
-    visualize_mdp(agent_model, threshold=0.05, output="agent_mdp_viz", fmt="pdf", export_dot=True)
+    visualize_mdp(
        agent_model, threshold=0.05, output="agent_mdp_viz", fmt="pdf", export_dot=True
    )
    human_evt = aggregate_event_transitions(human_mdp)
    agent_evt = aggregate_event_transitions(agent_mdp)
@@ -239,8 +369,11 @@ if __name__ == "__main__":
    if not common:
        exit("No common event types for KL divergence analysis")
-    kl_divs = sorted([(e, kl_divergence(human_evt[e], agent_evt[e])) for e in common],
+    kl_divs = sorted(
-                     key=lambda x: x[1], reverse=True)
+        [(e, kl_divergence(human_evt[e], agent_evt[e])) for e in common],
        key=lambda x: x[1],
        reverse=True,
    )
    print(f"Average KL divergence: {np.mean([kl for _, kl in kl_divs]):.4f}")
    print("\nMost divergent event types:")
@@ -250,9 +383,88 @@ if __name__ == "__main__":
    print("\n=== Joint Model (Human + Agent Combined) ===")
    joint_model = JointBehaviorModel(human_dir, agent_dir)
    joint_mdp = joint_model.build_MDP()
-    print(f"Built joint MDP: {joint_mdp['num_states']} states, "
+    print(
-          f"{sum(len(t) for t in joint_mdp['transitions'].values())} transitions")
+        f"Built joint MDP: {joint_mdp['num_states']} states, "
-    if joint_mdp['states']:
+        f"{sum(len(t) for t in joint_mdp['transitions'].values())} transitions"
-        visualize_mdp(joint_model, threshold=0.05, output="joint_mdp_viz", fmt="pdf", export_dot=True)
+    )
    if joint_mdp["states"]:
        visualize_mdp(
            joint_model,
            threshold=0.05,
            output="joint_mdp_viz",
            fmt="pdf",
            export_dot=True,
        )
-    # TODO: setup intra class divergence as baseline for evaluating and adding significance to the divergence which we observe across class
+    inter_class_avg = float(np.mean([kl for _, kl in kl_divs]))
    human_intra = bootstrap_intra_class_divergence(
        human_model, n_bootstrap=100, seed=42
    )
    agent_intra = bootstrap_intra_class_divergence(
        agent_model, n_bootstrap=100, seed=43
    )
    pooled_scores = human_intra["scores"] + agent_intra["scores"]
    if not pooled_scores:
        pooled_scores = [0.0]
    pooled_null = np.array(pooled_scores, dtype=float)
    p_empirical = float(
        (np.sum(pooled_null >= inter_class_avg) + 1) / (len(pooled_null) + 1)
    )
    print("\nIntra-class KL bootstrap baseline:")
    if human_intra["available"]:
        print(
            f"  Human split KL: {human_intra['mean']:.4f} +- {human_intra['std']:.4f} "
            f"(5-95%: {human_intra['q05']:.4f}-{human_intra['q95']:.4f}, n_sessions={human_intra['num_sessions']})"
        )
    else:
        print(
            f"  Human split KL: unavailable (need >=2 sessions, got {human_intra['num_sessions']})"
        )
    if agent_intra["available"]:
        print(
            f"  Agent split KL: {agent_intra['mean']:.4f} +- {agent_intra['std']:.4f} "
            f"(5-95%: {agent_intra['q05']:.4f}-{agent_intra['q95']:.4f}, n_sessions={agent_intra['num_sessions']})"
        )
    else:
        print(
            f"  Agent split KL: unavailable (need >=2 sessions, got {agent_intra['num_sessions']})"
        )
    print(f"  Between-class KL: {inter_class_avg:.4f}")
    print(
        f"  Lift vs pooled intra mean: {inter_class_avg / max(float(np.mean(pooled_null)), 1e-10):.2f}x"
    )
    print(f"  Empirical p-value (inter > intra): {p_empirical:.4f}")
    # per-session divergence scores: delta_H - delta_A per session (positive means closer to agent behavior)
    from scipy.stats import mannwhitneyu
    human_dH = per_session_divergence(
        human_model, human_evt
    )  # human session vs human centroid
    human_dA = per_session_divergence(
        human_model, agent_evt
    )  # human session vs agent centroid
    agent_dH = per_session_divergence(
        agent_model, human_evt
    )  # agent session vs human centroid
    agent_dA = per_session_divergence(
        agent_model, agent_evt
    )  # agent session vs agent centroid
    # score = delta_H - delta_A: high means far from humans, close to agents
    n_h = min(len(human_dH), len(human_dA))
    n_a = min(len(agent_dH), len(agent_dA))
    human_diff = [human_dH[i] - human_dA[i] for i in range(n_h)]
    agent_diff = [agent_dH[i] - agent_dA[i] for i in range(n_a)]
    print(f"\nPer-session divergence gap (delta_H - delta_A):")
    print(
        f"  Human sessions (n={n_h}): mean={np.mean(human_diff):.4f}, std={np.std(human_diff):.4f}"
    )
    print(
        f"  Agent sessions (n={n_a}): mean={np.mean(agent_diff):.4f}, std={np.std(agent_diff):.4f}"
    )
    if n_h >= 2 and n_a >= 2:
        U, mw_p = mannwhitneyu(human_diff, agent_diff, alternative="two-sided")
        print(f"  Mann-Whitney U={U:.1f}, p={mw_p:.4f}")
    else:
        print("  Insufficient sessions for Mann-Whitney test")
--- a/tests/e2e/project.json
+++ b/tests/e2e/project.json
@@ -0,0 +1,60 @@
 {
  "$schema": "../../node_modules/nx/schemas/project-schema.json",
  "name": "e2e",
  "projectType": "application",
  "sourceRoot": "tests/e2e",
  "targets": {
    "install": {
      "executor": "nx:run-commands",
      "options": {
        "command": "npm install",
        "cwd": "tests/e2e"
      }
    },
    "test": {
      "executor": "nx:run-commands",
      "dependsOn": [
        "install"
      ],
      "outputs": [
        "{projectRoot}/test-results"
      ],
      "options": {
        "commands": [
          "npx playwright install chromium",
          "test -f .env || cp .env.example .env",
          "timeout 30 bash -c \"until curl -sf http://localhost:5000/health > /dev/null 2>&1; do sleep 1; done\" || (echo 'Backend not ready' && exit 1)",
          "timeout 30 bash -c \"until curl -sf http://localhost:3000 > /dev/null 2>&1; do sleep 1; done\" || (echo 'Web app not ready' && exit 1)",
          "timeout 30 bash -c \"until curl -sf http://localhost:8085/health > /dev/null 2>&1; do sleep 1; done\" || (echo 'Airflow not ready' && exit 1)",
          "npm test"
        ],
        "parallel": false,
        "cwd": "tests/e2e"
      }
    },
    "test-ui": {
      "executor": "nx:run-commands",
      "dependsOn": [
        "install"
      ],
      "options": {
        "command": "npm run test:ui",
        "cwd": "tests/e2e"
      }
    },
    "test-debug": {
      "executor": "nx:run-commands",
      "dependsOn": [
        "install"
      ],
      "options": {
        "command": "npm run test:debug",
        "cwd": "tests/e2e"
      }
    }
  },
  "tags": [
    "scope:test",
    "type:e2e"
  ]
 }
--- a/Show More
+++ b/Show More
		`@@ -0,0 +1 @@`
							`__all__ = ["evaluate", "make_env", "train_qtable", "train_sb3"]`
		`@@ -0,0 +1 @@`
							The defined paper in chapters/ is a complete technical thesis written with all math and software aspects. Your job is to review and fully read this thesis and synthesize a mirrored version of it which can be read by the general public. This version should preserve all language and phrasing of the original thesis (as well as the order); it should, however, remove any math formulas or complex algorithms. Instead, your job is to understand those formulas and explain them in-line for a normal person to understand. That said, anything that is written you should rewrite word-by-word unless it's something too technical; then you should create a minimal diff/adjustment which would replace the technical with a less technical version.