diff --git a/.gitignore b/.gitignore index 7644627..11ff6b1 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ .env.* !.env.*.example **/.venv +**/.venv-ray # python build/cache artifacts **/__pycache__ diff --git a/.rayignore b/.rayignore new file mode 100644 index 0000000..c71492e --- /dev/null +++ b/.rayignore @@ -0,0 +1,35 @@ +# Virtual environments +.venv +.venv* +venv +venv* +**/.venv +**/venv +**/node_modules +node_modules/ + +# Python caches +__pycache__/ +*.pyc +.ruff_cache/ +.pytest_cache/ + +# Git +.git/ + +# Large data and logs +data/ +experiments/ +wandb/ +dumplogs* +*.zip +*.pdf +*.log +*.dot + +# Other large dirs +PHANTOM_web/ +web/ +docs/ +paper/ +.nx/ diff --git a/Makefile b/Makefile index 94e7e2a..9e2d5d2 100644 --- a/Makefile +++ b/Makefile @@ -11,6 +11,7 @@ PYTEST := $(VENV)/bin/pytest NX := npx nx SWEEP_ENV_FILE ?= .env.sweep +TPU_CONF ?= tpu_orchestration/configs/v4_spot_us.conf WANDB_ENTITY ?= WANDB_PROJECT ?= capstone @@ -21,6 +22,14 @@ SIMPLE_BENCHMARK_ARGS ?= --tiers qtable,ppo,dqn,a2c --alpha-values 0.0,0.15,0.3, BENCHMARK_AGENT_ARGS ?= AGENT_COUNT ?= 0 +WHOCLICKED_REPO ?= velocitatem/whoclickedit +WHOCLICKED_CSV ?= experiments/exports/whoclicked.csv +WHOCLICKED_CARD ?= experiments/exports/whoclicked_dataset_card.md +WHOCLICKED_CSV_PATH_IN_REPO ?= whoclicked.csv +WHOCLICKED_CARD_PATH_IN_REPO ?= README.md +WHOCLICKED_DATASET_MESSAGE ?= Update flattened whoclickedit dataset +WHOCLICKED_CARD_MESSAGE ?= Update dataset card for whoclickedit + REPO_URL ?= BRANCH ?= main WORKDIR ?= $(HOME)/PHANTOM-agent @@ -35,8 +44,10 @@ SWEEP_ENV_LOAD = set -a; [ -f "$(SWEEP_ENV_FILE)" ] && . 
"$(SWEEP_ENV_FILE)" || .PHONY: help help: - @echo "pdf.build pdf.watch pdf.clean pdf.genpop pdf.genpop.watch pdf.arxiv | test.backend test.e2e test.all | web.dev | install | train | benchmark | benchmark.simple | benchmark.agent | train.agent | train.bootstrap | stats.lines" + @echo "pdf.build pdf.watch pdf.clean pdf.genpop pdf.genpop.watch pdf.arxiv | test.backend test.e2e test.all | web.dev | install | train | benchmark | benchmark.simple | benchmark.agent | train.agent | train.bootstrap | stats.lines | manim.render manim.render.all" @echo "backend.server backend.provider backend.worker | platform.up platform.down platform.logs | docker.train.publish" + @echo "data.pull data.push data.whoclicked.publish | study.margin-erosion study.margin-erosion.quick study.margin-erosion.plot" + @echo "tpu.ray.bootstrap tpu.ray.deps tpu.ray.verify tpu.ray.teardown" @echo "" @echo "Build general public version:" @echo " make pdf.genpop" @@ -56,6 +67,12 @@ help: @echo "Bootstrap private repo worker from anywhere:" @echo " make train.bootstrap REPO_URL=https://github.com/org/repo.git BRANCH=main SWEEP_ID=entity/project/id" @echo "" + @echo "Bootstrap Ray on TPU slice from config:" + @echo " make tpu.ray.bootstrap TPU_CONF=tpu_orchestration/configs/v4_spot_us.conf" + @echo "" + @echo "Publish whoclickedit dataset + card:" + @echo " make data.whoclicked.publish HF_TOKEN=... 
WHOCLICKED_REPO=velocitatem/whoclickedit" + @echo "" @echo "Config source: $(SWEEP_ENV_FILE) (auto-loaded)" $(BUILDDIR): @@ -133,10 +150,42 @@ train.agent: train.bootstrap: @WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" REPO_URL="$(REPO_URL)" BRANCH="$(BRANCH)" WORKDIR="$(WORKDIR)" SWEEP_ID="$(SWEEP_ID)" AGENT_COUNT="$(AGENT_COUNT)" AGENT_LOOP="$(AGENT_LOOP)" RETRY_SECONDS="$(RETRY_SECONDS)" $(NX) run research:train-bootstrap +.PHONY: tpu.ray.bootstrap tpu.ray.deps tpu.ray.verify tpu.ray.teardown +tpu.ray.bootstrap: + @TPU_CONF="$(TPU_CONF)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" $(NX) run research:tpu-ray-bootstrap + +tpu.ray.deps: + @TPU_CONF="$(TPU_CONF)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" $(NX) run research:tpu-ray-deps + +tpu.ray.verify: + @TPU_CONF="$(TPU_CONF)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" $(NX) run research:tpu-ray-verify + +tpu.ray.teardown: + @TPU_CONF="$(TPU_CONF)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" $(NX) run research:tpu-ray-teardown + +.PHONY: data.pull data.push +data.pull: + python scripts/hf_data.py pull + +data.push: + python scripts/hf_data.py push + +.PHONY: data.whoclicked.publish +data.whoclicked.publish: + @HF_TOKEN="$(HF_TOKEN)" WHOCLICKED_REPO="$(WHOCLICKED_REPO)" WHOCLICKED_CSV="$(WHOCLICKED_CSV)" WHOCLICKED_CARD="$(WHOCLICKED_CARD)" WHOCLICKED_CSV_PATH_IN_REPO="$(WHOCLICKED_CSV_PATH_IN_REPO)" WHOCLICKED_CARD_PATH_IN_REPO="$(WHOCLICKED_CARD_PATH_IN_REPO)" WHOCLICKED_DATASET_MESSAGE="$(WHOCLICKED_DATASET_MESSAGE)" WHOCLICKED_CARD_MESSAGE="$(WHOCLICKED_CARD_MESSAGE)" $(NX) run research:whoclicked-publish + .PHONY: stats.lines stats.lines: @$(NX) run research:stats +.PHONY: study.margin-erosion +study.margin-erosion: + python -m engine.studies.margin_erosion_alpha + +.PHONY: study.margin-erosion.quick +study.margin-erosion.quick: + python -m engine.studies.margin_erosion_alpha --quick + .PHONY: wordcount wordcount: @$(NX) run paper:wordcount @@ -185,3 +234,10 @@ count-lines: all: @$(NX) 
run paper:build + +.PHONY: manim.render manim.render.all +manim.render: + @$(NX) run manim:render + +manim.render.all: + @$(NX) run manim:render-all diff --git a/README.md b/README.md index 17a8c45..6f744c3 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ ### PHANTOM +[![Dataset on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/dataset-on-hf-sm.svg)](https://huggingface.co/datasets/velocitatem/whoclickedit) [![Build PDF](https://github.com/velocitatem/PHANTOM/actions/workflows/latex.yml/badge.svg)](https://github.com/velocitatem/PHANTOM/actions/workflows/latex.yml) [![Paper](https://img.shields.io/badge/Paper-PDF-red?logo=adobe-acrobat-reader)](https://pub-d5b94a3c29fd40c6b3881946e463fdb7.r2.dev/thesis-latest.pdf) [![TPU Research Cloud](https://img.shields.io/badge/TPU%20Research%20Cloud-TRC%20supported-4285F4?logo=googlecloud&logoColor=white)](https://sites.research.google/trc/faq/) diff --git a/TPUS/README.md b/TPUS/README.md deleted file mode 100644 index bb88fce..0000000 --- a/TPUS/README.md +++ /dev/null @@ -1,6 +0,0 @@ -64 spot Cloud TPU v6e chips in zone europe-west4-a -32 spot Cloud TPU v4 chips in zone us-central2-b -64 spot Cloud TPU v5e chips in zone us-central1-a -64 spot Cloud TPU v6e chips in zone us-east1-d -32 on-demand Cloud TPU v4 chips in zone us-central2-b -64 spot Cloud TPU v5e chips in zone europe-west4-b diff --git a/TPUS/v4_32_spot_uscentral2b.sh b/TPUS/v4_32_spot_uscentral2b.sh deleted file mode 100644 index 661bcdc..0000000 --- a/TPUS/v4_32_spot_uscentral2b.sh +++ /dev/null @@ -1,22 +0,0 @@ -# 32 spot Cloud TPU v4 chips in zone us-central2-b -export PROJECT_ID=phantom-trc -export QR_NAME=TPUv4s32spotUC2B -export TPU_NAME=tpu-v4-32-uc2b-spot -export ZONE=us-central2-b -export ACCELERATOR_TYPE=v4-32 -export RUNTIME_VERSION=v2-alpha-tpuv4 - -gcloud compute tpus tpu-vm create ${TPU_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --version=${RUNTIME_VERSION} \ - 
--spot \ -|| \ -gcloud compute tpus queued-resources create ${QR_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --node-id=${TPU_NAME} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --runtime-version=${RUNTIME_VERSION} \ - --spot diff --git a/TPUS/v4_uscentral2b.sh b/TPUS/v4_uscentral2b.sh deleted file mode 100644 index a372078..0000000 --- a/TPUS/v4_uscentral2b.sh +++ /dev/null @@ -1,13 +0,0 @@ -# 32 on-demand Cloud TPU v4 chips in zone us-central2-b -export PROJECT_ID=phantom-trc -export QR_NAME=TPUlong -export ZONE=us-central2-b -export ACCELERATOR_TYPE=v4-32 -export RUNTIME_VERSION=v2-alpha-tpuv4 -#gcloud compute tpus tpu-vm create ${TPU_NAME} --zone=${ZONE} --project=${PROJECT_ID} --accelerator-type=${ACCELERATOR_TYPE} --version=${RUNTIME_VERSION} -gcloud compute tpus queued-resources create ${QR_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --node-id=${TPU_NAME} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --runtime-version=${RUNTIME_VERSION} diff --git a/TPUS/v5e_64_spot_europewest4b.sh b/TPUS/v5e_64_spot_europewest4b.sh deleted file mode 100644 index 7a35d7e..0000000 --- a/TPUS/v5e_64_spot_europewest4b.sh +++ /dev/null @@ -1,22 +0,0 @@ -# 64 spot Cloud TPU v5e chips in zone europe-west4-b -export PROJECT_ID=phantom-trc -export QR_NAME=TPUv5e64spotEW4B -export TPU_NAME=tpu-v5e-64-ew4b -export ZONE=europe-west4-b -export ACCELERATOR_TYPE=v5e-64 -export RUNTIME_VERSION=v2-alpha-tpuv5-lite - -gcloud compute tpus tpu-vm create ${TPU_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --version=${RUNTIME_VERSION} \ - --spot \ -|| \ -gcloud compute tpus queued-resources create ${QR_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --node-id=${TPU_NAME} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --runtime-version=${RUNTIME_VERSION} \ - --spot diff --git a/TPUS/v5e_64_spot_uscentral1a.sh b/TPUS/v5e_64_spot_uscentral1a.sh deleted file mode 100644 index 96375fd..0000000 --- 
a/TPUS/v5e_64_spot_uscentral1a.sh +++ /dev/null @@ -1,22 +0,0 @@ -# 64 spot Cloud TPU v5e chips in zone us-central1-a -export PROJECT_ID=phantom-trc -export QR_NAME=TPUv5e64spotUC1A -export TPU_NAME=tpu-v5e-64-uc1a -export ZONE=us-central1-a -export ACCELERATOR_TYPE=v5e-64 -export RUNTIME_VERSION=v2-alpha-tpuv5-lite - -gcloud compute tpus tpu-vm create ${TPU_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --version=${RUNTIME_VERSION} \ - --spot \ -|| \ -gcloud compute tpus queued-resources create ${QR_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --node-id=${TPU_NAME} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --runtime-version=${RUNTIME_VERSION} \ - --spot diff --git a/TPUS/v6e_64_spot_europewest4a.sh b/TPUS/v6e_64_spot_europewest4a.sh deleted file mode 100644 index 1ea17ac..0000000 --- a/TPUS/v6e_64_spot_europewest4a.sh +++ /dev/null @@ -1,22 +0,0 @@ -# 64 spot Cloud TPU v6e chips in zone europe-west4-a -export PROJECT_ID=phantom-trc -export QR_NAME=TPUv6e64spotEW4A -export TPU_NAME=tpu-v6e-64-ew4a -export ZONE=europe-west4-a -export ACCELERATOR_TYPE=v6e-64 -export RUNTIME_VERSION=v2-alpha-tpuv6e - -gcloud compute tpus tpu-vm create ${TPU_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --version=${RUNTIME_VERSION} \ - --spot \ -|| \ -gcloud compute tpus queued-resources create ${QR_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --node-id=${TPU_NAME} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --runtime-version=${RUNTIME_VERSION} \ - --spot diff --git a/TPUS/v6e_64_spot_useast1d.sh b/TPUS/v6e_64_spot_useast1d.sh deleted file mode 100644 index cada53f..0000000 --- a/TPUS/v6e_64_spot_useast1d.sh +++ /dev/null @@ -1,22 +0,0 @@ -# 64 spot Cloud TPU v6e chips in zone us-east1-d -export PROJECT_ID=phantom-trc -export QR_NAME=TPUv6e64spotUE1D -export TPU_NAME=tpu-v6e-64-ue1d -export ZONE=us-east1-d -export ACCELERATOR_TYPE=v6e-64 -export 
RUNTIME_VERSION=v2-alpha-tpuv6e - -gcloud compute tpus tpu-vm create ${TPU_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --version=${RUNTIME_VERSION} \ - --spot \ -|| \ -gcloud compute tpus queued-resources create ${QR_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --node-id=${TPU_NAME} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --runtime-version=${RUNTIME_VERSION} \ - --spot diff --git a/docker-compose.yml b/docker-compose.yml index ba2e8a3..acbc37c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,4 +1,23 @@ services: + tpu-watchdogs: + build: + context: . + dockerfile: docker/TPUWatchdog.dockerfile + container_name: "PHANTOM-tpu-watchdogs" + restart: unless-stopped + user: "${UID:-1000}:${GID:-1000}" + environment: + - HF_TOKEN=${HF_TOKEN} + - WANDB_API_KEY=${WANDB_API_KEY} + - GITHUB_TOKEN=${GITHUB_TOKEN} + - GOOGLE_APPLICATION_CREDENTIALS=/secrets/gcp-sa.json + - GCP_ACCOUNT=${GCP_ACCOUNT:-} + - WATCHDOG_CONFIG_PATTERN=${WATCHDOG_CONFIG_PATTERN:-v[46]*.conf} + - CLOUDSDK_CONFIG=/.config/gcloud + volumes: + - ~/.config/gcloud:/.config/gcloud:rw + - ./secrets/gcp-sa.json:/secrets/gcp-sa.json:ro + tensorboard-rl: image: tensorflow/tensorflow:latest container_name: "PHANTOM-tensorboard-rl" diff --git a/docker/TPUWatchdog.dockerfile b/docker/TPUWatchdog.dockerfile new file mode 100644 index 0000000..83358f1 --- /dev/null +++ b/docker/TPUWatchdog.dockerfile @@ -0,0 +1,112 @@ +FROM google/cloud-sdk:slim + +# Install jq (the entrypoint parses the credential JSON with it) and tmux (the entrypoint itself backgrounds watchdogs with plain bash, not tmux); skip recommended packages to keep the image small +RUN apt-get update && \ + apt-get install -y --no-install-recommends tmux jq && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy the orchestration scripts and configs +COPY tpu_orchestration/ /app/tpu_orchestration/ + +# Make sure scripts are executable +RUN chmod +x /app/tpu_orchestration/watchdog.sh +RUN chmod +x /app/tpu_orchestration/tpu_startup.sh + +# Create an entrypoint script that launches a watchdog for each config +COPY 
<<-'EOF' /app/entrypoint.sh +#!/bin/bash +set -e + +# Make sure required variables are set +if [ -z "$HF_TOKEN" ]; then + echo "Error: HF_TOKEN environment variable is required." + exit 1 +fi + +if [ -z "$WANDB_API_KEY" ]; then + echo "Warning: WANDB_API_KEY environment variable is not set. Wandb logging may fail on TPUs." +fi + +# Authenticate gcloud if credentials are provided +if [ -n "$GOOGLE_APPLICATION_CREDENTIALS" ] && [ -f "$GOOGLE_APPLICATION_CREDENTIALS" ]; then + CRED_TYPE=$(jq -r '.type' "$GOOGLE_APPLICATION_CREDENTIALS" 2>/dev/null || echo "unknown") + if [ "$CRED_TYPE" = "service_account" ]; then + echo "Authenticating gcloud using service account key..." + gcloud auth activate-service-account --key-file="$GOOGLE_APPLICATION_CREDENTIALS" + + if [ -z "$PROJECT_ID" ]; then + PROJECT_ID=$(jq -r '.project_id // empty' "$GOOGLE_APPLICATION_CREDENTIALS") + fi + elif [ "$CRED_TYPE" = "authorized_user" ]; then + echo "Using authorized_user credentials via credential file override..." + export CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE="$GOOGLE_APPLICATION_CREDENTIALS" + + if gcloud auth print-access-token >/dev/null 2>&1; then + ACTIVE_ACCOUNT=$(gcloud config get-value account 2>/dev/null || true) + if [ -z "$ACTIVE_ACCOUNT" ] || [ "$ACTIVE_ACCOUNT" = "(unset)" ]; then + ACTIVE_ACCOUNT=$(jq -r '.account // empty' "$GOOGLE_APPLICATION_CREDENTIALS") + fi + + if [ -n "$ACTIVE_ACCOUNT" ] && [ "$ACTIVE_ACCOUNT" != "(unset)" ]; then + echo "Using gcloud account: $ACTIVE_ACCOUNT" + else + echo "Using gcloud credential override from $GOOGLE_APPLICATION_CREDENTIALS" + fi + else + echo "Warning: credential file override token check failed. Falling back to mounted gcloud config." 
+ unset CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE + + if [ -n "$GCP_ACCOUNT" ]; then + gcloud config set account "$GCP_ACCOUNT" >/dev/null 2>&1 || true + fi + + ACTIVE_ACCOUNT=$(gcloud config get-value account 2>/dev/null || true) + if [ -z "$ACTIVE_ACCOUNT" ] || [ "$ACTIVE_ACCOUNT" = "(unset)" ]; then + echo "Error: no active gcloud account available. Run 'gcloud auth login' on host and mount ~/.config/gcloud, or use a service account key." + exit 1 + fi + echo "Using gcloud account: $ACTIVE_ACCOUNT" + fi + else + echo "Warning: unsupported credential file type '$CRED_TYPE'. Falling back to mounted gcloud config." + fi +else + echo "Note: Assuming gcloud config is mounted from host." +fi + +if [ -n "$PROJECT_ID" ]; then + gcloud config set project "$PROJECT_ID" + echo "Set project to $PROJECT_ID" +fi + +# Run the watchdogs in the background using bash instead of tmux +# Tmux needs a TTY to attach properly which we might not have in docker +# Stagger startups by 15s to prevent simultaneous TPU creation quota hits +CONFIG_PATTERN=${WATCHDOG_CONFIG_PATTERN:-"*.conf"} +shopt -s nullglob +CONFIGS=(/app/tpu_orchestration/configs/$CONFIG_PATTERN) + +if [ ${#CONFIGS[@]} -eq 0 ]; then + echo "Error: no watchdog configs matched pattern '$CONFIG_PATTERN'." + exit 1 +fi + +echo "Using watchdog config pattern: $CONFIG_PATTERN" +DELAY=0 +for conf in "${CONFIGS[@]}"; do + echo "Starting watchdog for $(basename "$conf" .conf) (delay: ${DELAY}s)" + (sleep $DELAY && /app/tpu_orchestration/watchdog.sh "$conf") & + DELAY=$((DELAY + 15)) +done + +echo "All watchdogs queued with staggered startup." 
+ +# Keep the container running +wait +EOF + +RUN chmod +x /app/entrypoint.sh + +CMD ["/app/entrypoint.sh"] diff --git a/docs/index.html b/docs/index.html index 863120c..151dc9c 100644 --- a/docs/index.html +++ b/docs/index.html @@ -272,12 +272,12 @@ - - + - Goal Set + Dataset diff --git a/engine/__init__.py b/engine/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/engine/backends/common.py b/engine/backends/common.py index 9e50d48..f754342 100644 --- a/engine/backends/common.py +++ b/engine/backends/common.py @@ -15,6 +15,10 @@ def make_env(cfg: Mapping[str, Any]): n_products=int(cfg["n_products"]), alpha=float(cfg["alpha"]), N=int(cfg["N"]), + agent_params=( + float(cfg.get("agent_mu", 45.0)), + float(cfg.get("agent_std", 15.0)), + ), price_bounds=(float(cfg["price_low"]), float(cfg["price_high"])), lambda_coi=float(cfg["lambda_coi"]), robust_radius=float(cfg["robust_radius"]), @@ -50,6 +54,9 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]: coi_levels: list[float] = [] coi_leakages: list[float] = [] volatilities: list[float] = [] + upward_volatilities: list[float] = [] + supra_shares: list[float] = [] + supra_penalties: list[float] = [] agent_probs: list[float] = [] for _ in range(int(episodes)): @@ -61,6 +68,9 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]: ep_coi = 0.0 ep_coi_leakage = 0.0 ep_volatility = 0.0 + ep_upward_volatility = 0.0 + ep_supra_share = 0.0 + ep_supra_penalty = 0.0 ep_agent_prob = 0.0 steps = 0 @@ -74,6 +84,15 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]: ep_coi += float(econ.get("coi_level", 0.0)) ep_coi_leakage += float(econ.get("coi_leakage", 0.0)) ep_volatility += float(econ.get("volatility", 0.0)) + ep_upward_volatility += float( + info.get("upward_volatility", econ.get("upward_volatility", 0.0)) + ) + ep_supra_share += float( + info.get("supra_share", econ.get("supra_share", 0.0)) + ) + ep_supra_penalty += float( + 
info.get("supra_penalty", econ.get("supra_penalty", 0.0)) + ) ep_agent_prob += float(econ.get("agent_prob", info.get("agent_prob", 0.0))) steps += 1 @@ -84,6 +103,9 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]: coi_levels.append(ep_coi / denom) coi_leakages.append(ep_coi_leakage / denom) volatilities.append(ep_volatility / denom) + upward_volatilities.append(ep_upward_volatility / denom) + supra_shares.append(ep_supra_share / denom) + supra_penalties.append(ep_supra_penalty / denom) agent_probs.append(ep_agent_prob / denom) return { @@ -95,6 +117,13 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]: "eval/coi_level_mean": float(np.mean(coi_levels)) if coi_levels else 0.0, "eval/coi_leakage_mean": float(np.mean(coi_leakages)) if coi_leakages else 0.0, "eval/volatility_mean": float(np.mean(volatilities)) if volatilities else 0.0, + "eval/upward_volatility_mean": ( + float(np.mean(upward_volatilities)) if upward_volatilities else 0.0 + ), + "eval/supra_share_mean": float(np.mean(supra_shares)) if supra_shares else 0.0, + "eval/supra_penalty_mean": ( + float(np.mean(supra_penalties)) if supra_penalties else 0.0 + ), "eval/agent_prob_mean": float(np.mean(agent_probs)) if agent_probs else 0.0, } @@ -128,15 +157,15 @@ def evaluate( shifted_env.close() shifted_rows.append((tag, alpha, shifted_metrics)) - metrics["eval/robust_alpha_low"] = low_alpha - metrics["eval/robust_alpha_high"] = high_alpha - metrics["eval/robust_reward_worst"] = float( + metrics["eval/stress_alpha_low"] = low_alpha + metrics["eval/stress_alpha_high"] = high_alpha + metrics["eval/stress_reward_worst"] = float( min(row[2]["eval/reward_mean"] for row in shifted_rows) ) - metrics["eval/robust_revenue_worst"] = float( + metrics["eval/stress_revenue_worst"] = float( min(row[2]["eval/revenue_mean"] for row in shifted_rows) ) - metrics["eval/robust_coi_leakage_worst"] = float( + metrics["eval/stress_coi_leakage_worst"] = float( 
max(row[2]["eval/coi_leakage_mean"] for row in shifted_rows) ) for tag, alpha, shifted_metrics in shifted_rows: diff --git a/engine/backends/qtable.py b/engine/backends/qtable.py index b314fdb..cfb79d1 100644 --- a/engine/backends/qtable.py +++ b/engine/backends/qtable.py @@ -80,7 +80,11 @@ def train_qtable( "train/global_step": int(steps), } if wandb_live: - wandb.log(dict(event), step=step_offset + int(steps)) + try: + wandb.log(dict(event), step=step_offset + int(steps)) + except Exception: + wandb_live = False + train_events.append(event) else: train_events.append(event) if console_progress: @@ -113,7 +117,11 @@ def train_qtable( "train/global_step": int(steps), } if wandb_live: - wandb.log(dict(tail_event), step=step_offset + int(steps)) + try: + wandb.log(dict(tail_event), step=step_offset + int(steps)) + except Exception: + wandb_live = False + train_events.append(tail_event) else: train_events.append(tail_event) diff --git a/engine/backends/sb3.py b/engine/backends/sb3.py index 37f23c5..7a62d81 100644 --- a/engine/backends/sb3.py +++ b/engine/backends/sb3.py @@ -1,10 +1,12 @@ from __future__ import annotations import json +import os from pathlib import Path from typing import Any, Mapping -from ..lib.callbacks import MetricsCallback +from ..lib.callbacks import EvalMetricsCallback, MetricsCallback +from ..wandb_checkpoint import checkpoint_artifact_name, log_checkpoint_file from .common import evaluate, make_env @@ -117,7 +119,6 @@ def build_model(cfg: Mapping[str, Any], env: Any): def train_sb3(cfg: Mapping[str, Any]) -> tuple[object, dict[str, Any]]: try: - from stable_baselines3.common.callbacks import EvalCallback from stable_baselines3.common.monitor import Monitor except ImportError as exc: raise ImportError("stable-baselines3 is required for SB3 models") from exc @@ -144,20 +145,20 @@ def train_sb3(cfg: Mapping[str, Any]) -> tuple[object, dict[str, Any]]: pass metrics_callback = MetricsCallback( - log_histograms=False, + log_histograms=True, 
log_freq=int(cfg["log_freq"]), + hist_freq=int(cfg.get("hist_freq", 500)), step_offset=int(cfg.get("wandb_step_offset", 0)), ) - callbacks = [metrics_callback] - callbacks.append( - EvalCallback( - eval_env, - eval_freq=int(cfg["eval_freq"]), - n_eval_episodes=int(cfg["eval_episodes"]), - deterministic=True, - verbose=0, - ) + eval_callback = EvalMetricsCallback( + eval_env, + eval_freq=int(cfg["eval_freq"]), + n_eval_episodes=int(cfg["eval_episodes"]), + step_offset=int(cfg.get("wandb_step_offset", 0)), + deterministic=True, + verbose=0, ) + callbacks = [metrics_callback, eval_callback] target_steps = int(cfg["total_timesteps"]) remaining_steps = max(0, target_steps - int(getattr(model, "num_timesteps", 0))) @@ -173,6 +174,29 @@ def train_sb3(cfg: Mapping[str, Any]) -> tuple[object, dict[str, Any]]: model_path = model_dir / f"phantom_{cfg['algo']}" model.save(str(model_path)) + artifact_name = checkpoint_artifact_name( + cfg, + backend="sb3", + sweep_id=os.getenv("WANDB_SWEEP_ID"), + ) + artifact_logged = False + try: + artifact_logged = bool( + log_checkpoint_file( + artifact_name, + file_path=model_path.with_suffix(".zip"), + artifact_file_name="model.zip", + metadata={ + "algo": str(cfg.get("algo", "ppo")), + "backend": "sb3", + "seed": int(cfg.get("seed", 0)), + "step": int(getattr(model, "num_timesteps", 0)), + }, + ) + ) + except Exception: + artifact_logged = False + metrics: dict[str, Any] = evaluate( model, eval_env, @@ -181,7 +205,12 @@ def train_sb3(cfg: Mapping[str, Any]) -> tuple[object, dict[str, Any]]: ) metrics["train/global_step"] = int(model.num_timesteps) metrics["model/path"] = str(model_path.with_suffix(".zip")) - metrics["_train_events"] = list(metrics_callback.events) + metrics["model/artifact_name"] = str(artifact_name) + metrics["model/artifact_logged"] = float(artifact_logged) + metrics["_train_events"] = sorted( + [*metrics_callback.events, *eval_callback.events], + key=lambda event: int(event.get("train/global_step", 0)), + ) 
env.close() eval_env.close() diff --git a/engine/benchmark.py b/engine/benchmark.py index 7e0afaf..1cc6acc 100644 --- a/engine/benchmark.py +++ b/engine/benchmark.py @@ -1,12 +1,32 @@ from __future__ import annotations +import os +import subprocess +import sys + import argparse import json import logging -import os -from datetime import datetime, UTC +from datetime import datetime, timezone from pathlib import Path +# best-effort: clear stale libtpu lock files on startup when /dev/accel0 exists +if os.path.exists("/dev/accel0"): + try: + subprocess.run( + ["rm", "-f", "/tmp/.libtpu_lockfile", "/tmp/libtpu_lockfile"], + stderr=subprocess.DEVNULL, + ) + except OSError: + pass + +try: + import jax + + jax.config.update("jax_threefry_partitionable", True) +except ImportError: + pass + import matplotlib.pyplot as plt import numpy as np import pandas as pd @@ -25,6 +45,10 @@ def _log(message: str) -> None: logger.info(message) +def _wandb_run_active() -> bool: + return bool(HAS_WANDB and getattr(wandb, "run", None) is not None) + + def _parse_list(raw: str) -> list[str]: return [x.strip().lower() for x in str(raw).split(",") if x.strip()] @@ -41,6 +65,10 @@ def _truthy(value: str | bool | None) -> bool: return str(value).strip().lower() in {"1", "true", "yes", "on"} +def _mode_label_from_baseline(is_baseline: bool) -> str: + return "baseline" if bool(is_baseline) else "defended" + + def _action(policy, obs: np.ndarray): out = policy.predict(obs, deterministic=True) action = out[0] if isinstance(out, tuple) else out @@ -146,7 +174,7 @@ def _log_train_events( alpha: float, step_offset: int, ) -> int: - if not (HAS_WANDB and wandb.run is not None): + if not _wandb_run_active(): return int(step_offset) if not events: return int(step_offset) @@ -167,11 +195,14 @@ def _log_train_events( "run.kind": "benchmark", "runtime/backend": tier_name, "study/mode": mode_label, - "study/no_robust": float(mode_label == "no_robust"), + "study/baseline_mode": float(mode_label == "baseline"), "study/alpha": float(alpha), } ) - 
wandb.log(payload, step=cursor + rel_step) + try: + wandb.log(payload, step=cursor + rel_step) + except Exception: + return int(step_offset) max_rel = max(max(1, int(evt.get("train/global_step", 0))) for evt in ordered) return cursor + max_rel + 1 @@ -183,6 +214,7 @@ def run_benchmark( n_episodes: int, mode_label: str, step_cursor_start: int = 0, + eval_alpha_values: list[float] | None = None, ): from .backends.common import make_env @@ -219,62 +251,80 @@ def run_benchmark( "dqn", }: wandb_step_cursor += max(1, int(cfg.get("total_timesteps", 1))) + 1 - env = make_env({**cfg, "alpha": float(alpha)}) - eps = [_run_eval_episode(env, policy) for _ in range(int(n_episodes))] - env.close() - - row = { - "tier": tier_name, - "mode": mode_label, - "alpha": float(alpha), - "episodes": int(n_episodes), - "mean_reward": float(np.mean([e["reward"] for e in eps])), - "mean_revenue": float(np.mean([e["revenue"] for e in eps])), - "mean_margin": float(np.mean([e["mean_margin"] for e in eps])), - "mean_coi": float(np.mean([e["mean_coi"] for e in eps])), - "std_revenue": float(np.std([e["revenue"] for e in eps])), - } - row["objective_score"] = row["mean_reward"] - rows.append(row) - _log( - f"[{run_index}/{total_runs}] alpha={float(alpha):.2f} tier={tier_name}: " - f"reward={row['mean_reward']:.3f} revenue={row['mean_revenue']:.3f} " - f"coi={row['mean_coi']:.4f} score={row['objective_score']:.3f}" + eval_targets = ( + [float(value) for value in eval_alpha_values] + if eval_alpha_values + else [float(alpha)] ) + for eval_alpha in eval_targets: + env = make_env({**cfg, "alpha": float(eval_alpha)}) + eps = [_run_eval_episode(env, policy) for _ in range(int(n_episodes))] + env.close() - max_len = max((len(e["price_trace"]) for e in eps), default=0) - step_means = [] - for step in range(max_len): - vals = [ - e["price_trace"][step] for e in eps if step < len(e["price_trace"]) - ] - step_means.append(float(np.mean(vals)) if vals else np.nan) - traces.append( - { + row = { "tier": 
tier_name, - "alpha": float(alpha), - "mean_price_trace": step_means, + "mode": mode_label, + "alpha": float(eval_alpha), + "train_alpha": float(alpha), + "eval_alpha": float(eval_alpha), + "episodes": int(n_episodes), + "mean_reward": float(np.mean([e["reward"] for e in eps])), + "mean_revenue": float(np.mean([e["revenue"] for e in eps])), + "mean_margin": float(np.mean([e["mean_margin"] for e in eps])), + "mean_coi": float(np.mean([e["mean_coi"] for e in eps])), + "std_revenue": float(np.std([e["revenue"] for e in eps])), } - ) - - if HAS_WANDB and wandb.run is not None: - wandb.log( - { - "run.kind": "benchmark", - "runtime/backend": tier_name, - "study/mode": mode_label, - "study/no_robust": float(mode_label == "no_robust"), - "study/alpha": float(alpha), - "eval/reward_mean": row["mean_reward"], - "eval/revenue_mean": row["mean_revenue"], - "eval/margin_mean": row["mean_margin"], - "eval/coi_level_mean": row["mean_coi"], - "objective/score": row["objective_score"], - "objective/coi_preserved": row["mean_coi"], - }, - step=wandb_step_cursor, + row["objective_score"] = row["mean_reward"] + rows.append(row) + _log( + f"[{run_index}/{total_runs}] train_alpha={float(alpha):.2f} " + f"eval_alpha={float(eval_alpha):.2f} tier={tier_name}: " + f"reward={row['mean_reward']:.3f} revenue={row['mean_revenue']:.3f} " + f"coi={row['mean_coi']:.4f} score={row['objective_score']:.3f}" ) - wandb_step_cursor += 1 + + max_len = max((len(e["price_trace"]) for e in eps), default=0) + step_means = [] + for step in range(max_len): + vals = [ + e["price_trace"][step] + for e in eps + if step < len(e["price_trace"]) + ] + step_means.append(float(np.mean(vals)) if vals else np.nan) + traces.append( + { + "tier": tier_name, + "alpha": float(eval_alpha), + "train_alpha": float(alpha), + "eval_alpha": float(eval_alpha), + "mean_price_trace": step_means, + } + ) + + if _wandb_run_active(): + try: + wandb.log( + { + "run.kind": "benchmark", + "runtime/backend": tier_name, + "study/mode": 
mode_label, + "study/baseline_mode": float(mode_label == "baseline"), + "study/alpha": float(eval_alpha), + "study/train_alpha": float(alpha), + "study/eval_alpha": float(eval_alpha), + "eval/reward_mean": row["mean_reward"], + "eval/revenue_mean": row["mean_revenue"], + "eval/margin_mean": row["mean_margin"], + "eval/coi_level_mean": row["mean_coi"], + "objective/score": row["objective_score"], + "objective/coi_preserved": row["mean_coi"], + }, + step=wandb_step_cursor, + ) + except Exception: + pass + wandb_step_cursor += 1 return pd.DataFrame(rows), traces, int(wandb_step_cursor) @@ -358,7 +408,7 @@ def _run_with_args(args, compare_robust_override: bool | None = None): if compare_robust_override is not None else _truthy(os.environ.get("PHANTOM_BENCHMARK_COMPARE_ROBUST")) ) - robust_modes = [False, True] if compare_robust else [bool(args.no_robust)] + baseline_modes = [False, True] if compare_robust else [bool(args.no_robust)] base_overrides = { "seed": args.seed, @@ -369,6 +419,7 @@ def _run_with_args(args, compare_robust_override: bool | None = None): "robust_radius": args.robust_radius, "robust_points": args.robust_points, "robust_rollouts": args.robust_rollouts, + "margin_floor": args.margin_floor, "eta_ux": args.eta_ux, "reward_profit_weight": args.reward_profit_weight, "price_low": args.price_low, @@ -385,12 +436,20 @@ def _run_with_args(args, compare_robust_override: bool | None = None): } tiers = _parse_list(args.tiers) alpha_values = _parse_float_list(args.alpha_values) + eval_alpha_values = ( + _parse_float_list(args.eval_alpha_values) + if str(getattr(args, "eval_alpha_values", "")).strip() + else [] + ) _log( "starting run " + json.dumps( { "tiers": tiers, "alpha_values": alpha_values, + "eval_alpha_values": ( + eval_alpha_values if eval_alpha_values else alpha_values + ), "episodes": int(args.episodes), "total_timesteps": int(args.total_timesteps), "device": str(args.device), @@ -401,14 +460,14 @@ def _run_with_args(args, compare_robust_override: 
bool | None = None): all_frames: list[pd.DataFrame] = [] all_traces: list[dict] = [] wandb_step_cursor = 0 - for no_robust in robust_modes: + for baseline_mode in baseline_modes: overrides = dict(base_overrides) - overrides["no_robust"] = bool(no_robust) + overrides["baseline_mode"] = bool(baseline_mode) cfg = TrainSpec.from_flat( {k: v for k, v in overrides.items() if v is not None} ).to_flat_dict() cfg["linear_warmup_steps"] = int(args.linear_warmup_steps) - mode_label = "no_robust" if no_robust else "robust" + mode_label = _mode_label_from_baseline(bool(baseline_mode)) _log(f"mode={mode_label}: begin") df_mode, traces_mode, wandb_step_cursor = run_benchmark( cfg, @@ -417,6 +476,7 @@ def _run_with_args(args, compare_robust_override: bool | None = None): args.episodes, mode_label=mode_label, step_cursor_start=wandb_step_cursor, + eval_alpha_values=eval_alpha_values, ) _log(f"mode={mode_label}: complete ({len(df_mode)} rows)") for trace in traces_mode: @@ -429,7 +489,7 @@ def _run_with_args(args, compare_robust_override: bool | None = None): out_dir = Path(args.output_dir) out_dir.mkdir(parents=True, exist_ok=True) - stamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S") + stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") csv_path = out_dir / f"benchmark_{stamp}.csv" trace_path = out_dir / f"benchmark_traces_{stamp}.json" df.to_csv(csv_path, index=False) @@ -445,7 +505,7 @@ def _run_with_args(args, compare_robust_override: bool | None = None): + json.dumps( { "tier": best["tier"], - "mode": best.get("mode", "robust"), + "mode": best.get("mode", "defended"), "alpha": float(best["alpha"]), "objective_score": float(best["objective_score"]), "mean_revenue": float(best["mean_revenue"]), @@ -466,6 +526,7 @@ def run_cli(raw_args: list[str] | None = None): parser.add_argument("--project", default="capstone") parser.add_argument("--tiers", default="static,surge,linear,qtable,ppo") parser.add_argument("--alpha-values", default="0.0,0.3,0.6") + 
parser.add_argument("--eval-alpha-values", default="") parser.add_argument("--episodes", type=int, default=10) parser.add_argument("--output-dir", default="engine/studies/results") parser.add_argument("--seed", type=int, default=42) @@ -476,6 +537,7 @@ def run_cli(raw_args: list[str] | None = None): parser.add_argument("--robust-radius", type=float, default=0.15) parser.add_argument("--robust-points", type=int, default=5) parser.add_argument("--robust-rollouts", type=int, default=1) + parser.add_argument("--margin-floor", type=float, default=0.85) parser.add_argument("--eta-ux", type=float, default=0.5) parser.add_argument("--reward-profit-weight", type=float, default=1.0) parser.add_argument("--price-low", type=float, default=10.0) @@ -509,35 +571,47 @@ def run_cli(raw_args: list[str] | None = None): key_to_attr = { "tiers": "tiers", "alpha_values": "alpha_values", + "eval_alpha_values": "eval_alpha_values", "episodes": "episodes", "total_timesteps": "total_timesteps", "lambda_coi": "lambda_coi", "robust_radius": "robust_radius", "robust_points": "robust_points", "robust_rollouts": "robust_rollouts", + "ambiguity_radius": "robust_radius", + "ambiguity_points": "robust_points", + "ambiguity_rollouts": "robust_rollouts", "eta_ux": "eta_ux", "reward_profit_weight": "reward_profit_weight", "learning_rate": "learning_rate", "batch_size": "batch_size", "n_steps": "n_steps", + "baseline_mode": "no_robust", "no_robust": "no_robust", + "margin_floor": "margin_floor", "device": "device", } for key in ( "tiers", "alpha_values", + "eval_alpha_values", "episodes", "total_timesteps", "lambda_coi", "robust_radius", "robust_points", "robust_rollouts", + "ambiguity_radius", + "ambiguity_points", + "ambiguity_rollouts", "eta_ux", "reward_profit_weight", "learning_rate", "batch_size", "n_steps", + "baseline_mode", "no_robust", + "margin_floor", "device", ): if key in wandb.config: @@ -560,18 +634,18 @@ def run_cli(raw_args: list[str] | None = None): tiers = _parse_list(args.tiers) 
alpha_values = _parse_float_list(args.alpha_values) - run_stamp = datetime.now(UTC).strftime("%m%d-%H%M%S") + run_stamp = datetime.now(timezone.utc).strftime("%m%d-%H%M%S") compare_enabled = _truthy(os.environ.get("PHANTOM_BENCHMARK_COMPARE_ROBUST")) - compare_tag = "robust-compare" if compare_enabled else "single-mode" + compare_tag = "defended-compare" if compare_enabled else "single-mode" modes = ( - [("no_robust", True), ("robust", False)] + [("baseline", True), ("defended", False)] if compare_enabled - else [("no_robust" if bool(args.no_robust) else "robust", bool(args.no_robust))] + else [(_mode_label_from_baseline(bool(args.no_robust)), bool(args.no_robust))] ) run_idx = 0 for tier in tiers: - for mode_label, no_robust in modes: + for mode_label, baseline_mode in modes: for alpha in alpha_values: run_idx += 1 alpha_token = ( @@ -580,7 +654,7 @@ def run_cli(raw_args: list[str] | None = None): tier_args = argparse.Namespace(**vars(args)) tier_args.tiers = tier tier_args.alpha_values = str(float(alpha)) - tier_args.no_robust = bool(no_robust) + tier_args.no_robust = bool(baseline_mode) run = wandb.init( project=args.project, name=( @@ -597,16 +671,19 @@ def run_cli(raw_args: list[str] | None = None): "run.kind": "benchmark", "runtime/backend": tier, "study/mode": mode_label, - "study/no_robust": float(no_robust), + "study/baseline_mode": float(baseline_mode), "study/alpha": float(alpha), "tiers": tier, "alpha_values": str(float(alpha)), + "eval_alpha_values": args.eval_alpha_values, "episodes": args.episodes, "total_timesteps": args.total_timesteps, "lambda_coi": args.lambda_coi, - "robust_radius": args.robust_radius, - "robust_points": args.robust_points, - "robust_rollouts": args.robust_rollouts, + "ambiguity_radius": args.robust_radius, + "ambiguity_points": args.robust_points, + "ambiguity_rollouts": args.robust_rollouts, + "margin_floor": args.margin_floor, + "baseline_mode": float(baseline_mode), "eta_ux": args.eta_ux, "reward_profit_weight": 
args.reward_profit_weight, "learning_rate": args.learning_rate, diff --git a/engine/engine.py b/engine/engine.py index b4a2cbc..0e6f143 100644 --- a/engine/engine.py +++ b/engine/engine.py @@ -48,7 +48,8 @@ class MarketEngine: ) human_transitions = get_adjusted_transitions(demand_h, human=True) agent_transitions = get_adjusted_transitions(demand_a, human=False) - # sample behavior trajectories from each demand distribution + # sample N trajectories in parallel; each chain is independent so threads + # do not share state and numpy's per-call RNG is thread-safe human_t = [ sample_behavior_from_transitions(human_transitions) for _ in range(self.Nhumans) @@ -59,7 +60,25 @@ class MarketEngine: ] # store trajectories for agent probability calculation self.last_trajectories = human_t + agent_t - return estimate_demand(self.last_trajectories, self.action_weights) + + demand_proxy = estimate_demand( + self.last_trajectories, + self.action_weights, + normalize=True, + per_session=False, + ) + raw_mix = ((1.0 - float(self.alpha)) * demand_h) + ( + float(self.alpha) * demand_a + ) + total_raw_demand = float(np.sum(raw_mix)) + if not demand_proxy: + return {i: float(raw_mix[i]) for i in range(len(prices))} + if total_raw_demand <= 0.0: + return {i: 0.0 for i in range(len(prices))} + return { + i: total_raw_demand * float(demand_proxy.get(i, 0.0)) / 100.0 + for i in range(len(prices)) + } def measure(self): pass diff --git a/engine/jax/__init__.py b/engine/jax/__init__.py new file mode 100644 index 0000000..84e3375 --- /dev/null +++ b/engine/jax/__init__.py @@ -0,0 +1,3 @@ +from .robust import select_adversarial_alpha_jax, _JAX_OK + +__all__ = ["select_adversarial_alpha_jax", "_JAX_OK"] diff --git a/engine/jax/robust.py b/engine/jax/robust.py new file mode 100644 index 0000000..cacf663 --- /dev/null +++ b/engine/jax/robust.py @@ -0,0 +1,197 @@ +"""JAX-accelerated robust inner loop for PHANTOM. 
+ +provides a drop-in replacement for the sequential alpha-candidate evaluation in +wrapper.py::_select_adversarial_alpha. the demand generation and reward +computation are vmapped over the K candidate alpha values so all candidates are +evaluated in a single vectorized pass instead of K sequential Python calls. + +public surface: + select_adversarial_alpha_jax(candidates, prices, human_params, agent_params, + noise_std, n_sessions, n_products, + baseline_prices, lambda_coi, info_value, + reward_profit_weight, rng_key) + -> (best_alpha: float, rewards: np.ndarray) + +falls back gracefully when JAX is unavailable. +""" + +from __future__ import annotations + +import numpy as np + +try: + import jax + import jax.numpy as jnp + from jax import vmap, jit + + _JAX_OK = True +except ImportError: + _JAX_OK = False + +_JAX_RUNTIME_OK = True + + +def _demand_for_actor_jax(prices, mean, std, noise_std, key): + """d(p;theta) = max(0, val - price + noise), normalized to sum 100.""" + k1, k2 = jax.random.split(key) + val = jax.random.normal(k1, shape=prices.shape) * std + mean + noise = jax.random.normal(k2, shape=prices.shape) * noise_std + demand = jnp.maximum(0.0, val - prices + noise) + total = demand.sum() + return jnp.where(total > 0, demand / total * 100.0, demand) + + +def _reward_for_candidate( + alpha, + prices, + human_mean, + human_std, + agent_mean, + agent_std, + noise_std, + baseline_prices, + lambda_coi, + info_value, + reward_profit_weight, + key, +): + """compute a scalar reward for a single alpha candidate (pure JAX, vmappable).""" + k_h, k_a = jax.random.split(key) + # mixed demand proxy: weighted sum of human and agent demand signals + demand_h = _demand_for_actor_jax(prices, human_mean, human_std, noise_std, k_h) + demand_a = _demand_for_actor_jax(prices, agent_mean, agent_std, noise_std, k_a) + demand = (1.0 - alpha) * demand_h + alpha * demand_a + + revenue = jnp.dot(prices, demand) + floor_cost = jnp.dot(baseline_prices, demand) + profit = revenue - 
floor_cost + + # agent_prob proxy: use alpha directly (no trajectory available in vectorized path) + coi_leakage = alpha * info_value + info_budget = jnp.maximum(floor_cost, 1.0) + coi_penalty = lambda_coi * coi_leakage * info_budget + + return reward_profit_weight * profit - coi_penalty + + +if _JAX_OK: + # compile once; retracing only happens on shape/dtype changes + # 12 args: alpha, prices, h_mean, h_std, a_mean, a_std, noise_std, + # baseline_prices, lambda_coi, info_value, reward_profit_weight, key + _reward_batched = jit( + vmap( + _reward_for_candidate, + in_axes=(0, None, None, None, None, None, None, None, None, None, None, 0), + ) + ) + + +def select_adversarial_alpha_jax( + candidates: np.ndarray, + prices: np.ndarray, + human_params: tuple, + agent_params: tuple, + noise_std: float, + baseline_prices: np.ndarray, + lambda_coi: float, + info_value: float, + reward_profit_weight: float, + rng_seed: int = 0, +) -> tuple[float, np.ndarray]: + """evaluate all alpha candidates in a single vmapped pass. + + returns (best_alpha, rewards_array) where best_alpha minimizes reward + (worst case for the platform, driving robust policy training). + + falls back to a pure-numpy sequential loop when JAX is unavailable so the + wrapper can call this function unconditionally. 
+ """ + global _JAX_RUNTIME_OK + + if not _JAX_OK or not _JAX_RUNTIME_OK: + return _fallback( + candidates, + prices, + human_params, + agent_params, + noise_std, + baseline_prices, + lambda_coi, + info_value, + reward_profit_weight, + ) + + try: + k = len(candidates) + key = jax.random.PRNGKey(rng_seed) + keys = jax.random.split(key, k) + + rewards = np.asarray( + _reward_batched( + jnp.asarray(candidates, dtype=jnp.float32), + jnp.asarray(prices, dtype=jnp.float32), + float(human_params[0]), + float(human_params[1]), + float(agent_params[0]), + float(agent_params[1]), + float(noise_std), + jnp.asarray(baseline_prices, dtype=jnp.float32), + float(lambda_coi), + float(info_value), + float(reward_profit_weight), + keys, + ) + ) + best_idx = int(np.argmin(rewards)) + return float(candidates[best_idx]), rewards + except Exception as exc: + # TPU contention / backend init failures can happen in distributed schedulers. + # Degrade to numpy path for the remainder of the process. + _JAX_RUNTIME_OK = False + print(f"PHANTOM_JAX_FALLBACK: {exc}") + return _fallback( + candidates, + prices, + human_params, + agent_params, + noise_std, + baseline_prices, + lambda_coi, + info_value, + reward_profit_weight, + ) + + +def _fallback( + candidates, + prices, + human_params, + agent_params, + noise_std, + baseline_prices, + lambda_coi, + info_value, + reward_profit_weight, +): + """numpy fallback matching the reward formula above.""" + rewards = [] + for alpha in candidates: + rng = np.random.default_rng() + val_h = rng.normal(*human_params, size=len(prices)) + val_a = rng.normal(*agent_params, size=len(prices)) + noise_h = rng.normal(0, noise_std, len(prices)) + noise_a = rng.normal(0, noise_std, len(prices)) + d_h = np.maximum(0, val_h - prices + noise_h) + d_a = np.maximum(0, val_a - prices + noise_a) + s_h, s_a = d_h.sum(), d_a.sum() + d_h = d_h / s_h * 100 if s_h > 0 else d_h + d_a = d_a / s_a * 100 if s_a > 0 else d_a + demand = (1.0 - alpha) * d_h + alpha * d_a + revenue = 
float(np.dot(prices, demand)) + floor_cost = float(np.dot(baseline_prices, demand)) + profit = revenue - floor_cost + coi_penalty = lambda_coi * alpha * info_value * max(floor_cost, 1.0) + rewards.append(reward_profit_weight * profit - coi_penalty) + rewards = np.array(rewards) + best_idx = int(np.argmin(rewards)) + return float(candidates[best_idx]), rewards diff --git a/engine/lib/behavior.py b/engine/lib/behavior.py index 588ebc9..52a9d7d 100644 --- a/engine/lib/behavior.py +++ b/engine/lib/behavior.py @@ -22,6 +22,9 @@ human_dir = str(base_dir / "collected_data") agent_dir = str(base_dir / "agents" / "collected_data") _cache = {} # lazy cache for models and base pivots +# cache keyed by (human: bool, condition_tuple) so we skip Kronecker re-expansion +# for repeated calls with the same demand condition inside the robustness inner loop +_transition_cache: dict = {} def _get_base_pivot(human: bool): @@ -68,22 +71,41 @@ def trajectory_to_events(trajectory: list) -> list: """extract event names from trajectory for KL divergence calculation trajectories are in format 'eventName_product0', extract just eventName - - args: - trajectory: list like ['view_product0', 'add_to_cart_product1', 'checkout_product1'] - - returns: - list: event names like ['view', 'add_to_cart', 'checkout'] """ - events = [] - for state in trajectory: - # state format from sample_behavior: 'eventName_productX' - if "_product" in state: - event = state.rsplit("_product", 1)[0] - else: - event = state - events.append(event) - return events + return [s.rsplit("_product", 1)[0] if "_product" in s else s for s in trajectory] + + +class _TransitionTable: + """numpy-backed transition table; replaces per-step pandas .loc[] indexing. + + the profiling hotspot was DataFrame.xs called ~4-16k times per outer step. + converting once to a dense float32 array with an int-keyed state index map + reduces each row lookup to a single array slice with no pandas overhead. 
+ rows are pre-normalized so sampling requires no per-step division. + """ + + __slots__ = ("matrix", "states", "state_index", "n_states") + + def __init__(self, df: pd.DataFrame): + self.states: list[str] = df.index.tolist() + self.state_index: dict[str, int] = {s: i for i, s in enumerate(self.states)} + # float64 throughout: float32 row-sums can drift enough to break np.random.choice + mat = np.nan_to_num( + df.values.astype(np.float64), nan=0.0, posinf=0.0, neginf=0.0 + ) + mat = np.clip(mat, 0.0, None) + row_sums = mat.sum(axis=1) + # dead rows (all zero) get uniform distribution so sampling never receives NaN + dead = row_sums <= 0 + mat[dead] = 1.0 + row_sums[dead] = float(mat.shape[1]) + mat = mat / row_sums[:, np.newaxis] + # final nan guard in case fp still drifts + np.nan_to_num(mat, nan=0.0, copy=False) + row_sums2 = mat.sum(axis=1, keepdims=True) + row_sums2[row_sums2 <= 0] = 1.0 + self.matrix: np.ndarray = mat / row_sums2 + self.n_states: int = len(self.states) def adjust_behavior_to_condition(condition, transition_matrix): @@ -92,46 +114,73 @@ def adjust_behavior_to_condition(condition, transition_matrix): condition = np.nan_to_num(condition, nan=0.0, posinf=0.0, neginf=0.0) condition = np.clip(condition, 0.0, None) s = float(np.sum(condition)) - if not np.isfinite(s) or s <= 0: - cond_norm = np.full(len(condition), 1.0 / max(len(condition), 1), dtype=float) - else: - cond_norm = condition / s + cond_norm = ( + condition / s + if np.isfinite(s) and s > 0 + else np.full(len(condition), 1.0 / max(len(condition), 1), dtype=float) + ) n_products = len(condition) base_vals = transition_matrix.values base_cols, base_rows = ( transition_matrix.columns.tolist(), transition_matrix.index.tolist(), ) - - # expand via kronecker-like tiling: each cell becomes a P*P block weighted by outer product of cond_norm expanded = np.kron(base_vals, np.outer(cond_norm, cond_norm)) new_cols = [f"{c}_product{p}" for c in base_cols for p in range(n_products)] new_rows = 
[f"{r}_product{p}" for r in base_rows for p in range(n_products)] return pd.DataFrame(expanded, index=new_rows, columns=new_cols) -def get_adjusted_transitions(condition, human=True): +def get_adjusted_transitions(condition, human=True) -> _TransitionTable: + """return a _TransitionTable for the given demand condition. + + results are cached by (human, rounded-condition) so that repeated calls with + the same condition inside the robustness inner loop (K candidates, same prices) + skip the Kronecker expansion entirely. + """ + condition = np.asarray(condition, dtype=float) + # round to 4 significant digits for cache key stability + cache_key = (human, tuple(np.round(condition, 4).tolist())) + if cache_key in _transition_cache: + return _transition_cache[cache_key] + + # prevent OOM by capping cache size + if len(_transition_cache) > 100: + _transition_cache.clear() + base_pivot = _get_base_pivot(human) - return adjust_behavior_to_condition(condition, base_pivot) + df = adjust_behavior_to_condition(condition, base_pivot) + table = _TransitionTable(df) + _transition_cache[cache_key] = table + return table -def sample_behavior_from_transitions(adjusted_transitions, max_len=40): - trajectory = [np.random.choice(adjusted_transitions.index)] +def clear_transition_cache(): + """drop cached transition tables; call between episodes if condition space is large.""" + _transition_cache.clear() + + +def sample_behavior_from_transitions(table, max_len=40): + """sample a Markov trajectory. + + accepts _TransitionTable (fast path) or a legacy pandas DataFrame so existing + call sites that pass a DataFrame directly continue to work unchanged. 
+ """ + if isinstance(table, pd.DataFrame): + table = _TransitionTable(table) + + idx = np.random.randint(table.n_states) + trajectory = [table.states[idx]] while len(trajectory) < max_len and "checkout" not in trajectory[-1]: - probs = np.asarray(adjusted_transitions.loc[trajectory[-1]].values, dtype=float) - probs = np.nan_to_num(probs, nan=0.0, posinf=0.0, neginf=0.0) - probs = np.clip(probs, 0.0, None) - s = float(np.sum(probs)) - sample = np.random.choice( - adjusted_transitions.columns, p=(probs / s) if s > 0 else None - ) - trajectory.append(sample) + row = table.matrix[table.state_index[trajectory[-1]]] + idx = int(np.random.choice(table.n_states, p=row)) + trajectory.append(table.states[idx]) return trajectory def sample_behavior(condition, human=True, max_len=40): - adjusted_transitions = get_adjusted_transitions(condition, human=human) - return sample_behavior_from_transitions(adjusted_transitions, max_len=max_len) + table = get_adjusted_transitions(condition, human=human) + return sample_behavior_from_transitions(table, max_len=max_len) if __name__ == "__main__": diff --git a/engine/lib/callbacks.py b/engine/lib/callbacks.py index 2193894..ec5c6ef 100644 --- a/engine/lib/callbacks.py +++ b/engine/lib/callbacks.py @@ -15,15 +15,19 @@ class MetricsCallback(BaseCallback): self, log_histograms: bool = False, log_freq: int = 100, + hist_freq: int = 500, step_offset: int = 0, verbose: int = 0, ): super().__init__(verbose) self.log_histograms = log_histograms self.log_freq = max(1, int(log_freq)) + self.hist_freq = max(1, int(hist_freq)) self.step_offset = max(0, int(step_offset)) self._wandb = get_wandb_module() self._wandb_live = bool(self._wandb is not None and self._wandb.run is not None) + self._price_samples: list[float] = [] + self._demand_samples: list[float] = [] self._window_sums = { "train/revenue_mean": 0.0, "train/margin_mean": 0.0, @@ -74,35 +78,100 @@ class MetricsCallback(BaseCallback): ) self._window_count += 1 - def _flush(self, step: int) -> 
None: - if self._window_count <= 0: + def _accumulate_histograms(self, info: dict[str, Any]) -> None: + if not self.log_histograms: return - denom = float(self._window_count) - payload = { - key: (value / denom) - for key, value in self._window_sums.items() - if value != 0.0 - or key - in { - "train/revenue_mean", - "train/margin_mean", - "train/coi_level_mean", - "train/regret_mean", + + for key in ("effective_prices", "prices"): + if key not in info: + continue + try: + values = np.asarray(info.get(key), dtype=float).reshape(-1) + except Exception: + continue + if values.size <= 0: + continue + finite_values = values[np.isfinite(values)] + if finite_values.size > 0: + self._price_samples.extend(finite_values.tolist()) + break + + if "demand" in info: + try: + demand_values = np.asarray(info.get("demand"), dtype=float).reshape(-1) + except Exception: + demand_values = np.array([], dtype=float) + if demand_values.size > 0: + finite_demand = demand_values[np.isfinite(demand_values)] + if finite_demand.size > 0: + self._demand_samples.extend(finite_demand.tolist()) + + def _flush_histograms(self, step: int, force: bool = False) -> None: + if not self.log_histograms: + return + if not force and step % self.hist_freq != 0: + return + if not self._price_samples and not self._demand_samples: + return + if self._wandb is None: + self._price_samples.clear() + self._demand_samples.clear() + return + + payload: dict[str, Any] = {} + if self._price_samples: + payload["train/price_dist"] = self._wandb.Histogram( + np.asarray(self._price_samples, dtype=np.float32) + ) + if self._demand_samples: + payload["train/demand_dist"] = self._wandb.Histogram( + np.asarray(self._demand_samples, dtype=np.float32) + ) + + if payload and self._wandb_live: + try: + self._wandb.log(payload, step=self.step_offset + int(step)) + except Exception: + self._wandb_live = False + + self._price_samples.clear() + self._demand_samples.clear() + + def _flush(self, step: int, *, force_hist: bool = False) 
-> None: + if self._window_count > 0: + denom = float(self._window_count) + payload = { + key: (value / denom) + for key, value in self._window_sums.items() + if value != 0.0 + or key + in { + "train/revenue_mean", + "train/margin_mean", + "train/coi_level_mean", + "train/regret_mean", + } } - } - payload["train/global_step"] = int(step) - if self._wandb_live: - self._wandb.log(dict(payload), step=self.step_offset + int(step)) - else: - self.events.append(payload) - for key in self._window_sums: - self._window_sums[key] = 0.0 - self._window_count = 0 + payload["train/global_step"] = int(step) + if self._wandb_live: + try: + self._wandb.log(dict(payload), step=self.step_offset + int(step)) + except Exception: + self._wandb_live = False + self.events.append(payload) + else: + self.events.append(payload) + for key in self._window_sums: + self._window_sums[key] = 0.0 + self._window_count = 0 + + self._flush_histograms(step=step, force=force_hist) def _on_step(self) -> bool: for info in self.locals.get("infos", []): if isinstance(info, dict): self._accumulate(info) + self._accumulate_histograms(info) if self.num_timesteps % self.log_freq == 0: self._flush(step=self.num_timesteps) @@ -110,39 +179,81 @@ class MetricsCallback(BaseCallback): return True def _on_training_end(self) -> None: - self._flush(step=self.num_timesteps) + self._flush(step=self.num_timesteps, force_hist=True) class EvalMetricsCallback(EvalCallback): """Deterministic evaluation collector detached from logging backends.""" def __init__( - self, eval_env, eval_freq: int = 1000, n_eval_episodes: int = 5, **kwargs + self, + eval_env, + eval_freq: int = 1000, + n_eval_episodes: int = 5, + step_offset: int = 0, + **kwargs, ): super().__init__( eval_env, eval_freq=eval_freq, n_eval_episodes=n_eval_episodes, **kwargs ) - self._eval_revenues: list[float] = [] + self.step_offset = max(0, int(step_offset)) + self._wandb = get_wandb_module() + self._wandb_live = bool(self._wandb is not None and self._wandb.run is 
not None) + self._eval_stats: dict[str, list[float]] = { + "eval/revenue_mean": [], + "eval/margin_mean": [], + "eval/coi_level_mean": [], + "eval/coi_leakage_mean": [], + "eval/volatility_mean": [], + "eval/agent_prob_mean": [], + } self.events: list[dict[str, float | int]] = [] def _on_step(self) -> bool: result = super()._on_step() if self.n_calls % self.eval_freq == 0 and hasattr(self, "last_mean_reward"): - self.events.append( - { - "eval/reward_mean": float(self.last_mean_reward), - "eval/revenue_mean": float(np.mean(self._eval_revenues)) - if self._eval_revenues - else 0.0, - "train/global_step": int(self.num_timesteps), - } - ) - self._eval_revenues = [] + payload: dict[str, float | int] = { + "eval/reward_mean": float(self.last_mean_reward), + "train/global_step": int(self.num_timesteps), + } + for key, values in self._eval_stats.items(): + payload[key] = float(np.mean(values)) if values else 0.0 + + if self._wandb_live: + try: + self._wandb.log( + dict(payload), + step=self.step_offset + int(self.num_timesteps), + ) + except Exception: + self._wandb_live = False + self.events.append(payload) + else: + self.events.append(payload) + + for values in self._eval_stats.values(): + values.clear() return result def _log_success_callback(self, locals_: dict, globals_: dict) -> None: # called after each eval episode info = locals_.get("info", {}) - if "economics" in info: - self._eval_revenues.append(info["economics"]["revenue"]) + econ = info.get("economics") if isinstance(info, dict) else None + if not isinstance(econ, dict): + return + + self._eval_stats["eval/revenue_mean"].append(float(econ.get("revenue", 0.0))) + self._eval_stats["eval/margin_mean"].append(float(econ.get("margin", 0.0))) + self._eval_stats["eval/coi_level_mean"].append( + float(econ.get("coi_level", 0.0)) + ) + self._eval_stats["eval/coi_leakage_mean"].append( + float(econ.get("coi_leakage", 0.0)) + ) + self._eval_stats["eval/volatility_mean"].append( + float(econ.get("volatility", 0.0)) + ) 
+ self._eval_stats["eval/agent_prob_mean"].append( + float(econ.get("agent_prob", 0.0)) + ) diff --git a/engine/lib/demand.py b/engine/lib/demand.py index cb37c3d..ba3ddfd 100644 --- a/engine/lib/demand.py +++ b/engine/lib/demand.py @@ -17,18 +17,32 @@ def generate_demand_for_actor( params: tuple, noise_std: float = 1.0, distribution_method=np.random.normal, + normalize: bool = False, ) -> np.ndarray: """d(p;0) = max(0, valuation - price) + epsi for single actor type params: (mean, std) for valuation distribution D_H or D_A""" val = distribution_method(*params, size=len(prices)) noise = distribution_method(0, noise_std, len(prices)) demand = np.maximum(0, val - prices + noise) + if not normalize: + return demand total = np.sum(demand) return demand / total * 100 if total > 0 else demand -def estimate_demand(trajectories, action_weights=None): - return estimate_weighted_demand(trajectories, action_weights) +def estimate_demand( + trajectories, + action_weights=None, + *, + normalize: bool = False, + per_session: bool = True, +): + return estimate_weighted_demand( + trajectories, + action_weights, + normalize=normalize, + per_session=per_session, + ) def _parse_event_state(state: str): @@ -50,7 +64,13 @@ def _weight_for_action(action: str, action_weights: dict) -> float: return CATEGORY_WEIGHTS["nav"] -def estimate_weighted_demand(trajectories, action_weights=None): +def estimate_weighted_demand( + trajectories, + action_weights=None, + *, + normalize: bool = False, + per_session: bool = True, +): action_weights = ( DEFAULT_ACTION_WEIGHTS if action_weights is None else action_weights ) @@ -64,12 +84,20 @@ def estimate_weighted_demand(trajectories, action_weights=None): if w <= 0: continue scores[product_id] = scores.get(product_id, 0.0) + w - total = sum(scores.values()) - return ( - {pid: (score / total) * 100 for pid, score in scores.items()} - if total > 0 - else {} - ) + if not scores: + return {} + + if per_session and len(trajectories) > 0: + inv_n = 1.0 / 
float(len(trajectories)) + scores = {pid: score * inv_n for pid, score in scores.items()} + + if not normalize: + return scores + + total = float(sum(scores.values())) + if total <= 0: + return {} + return {pid: (score / total) * 100.0 for pid, score in scores.items()} # Example usage diff --git a/engine/lib/providers.py b/engine/lib/providers.py index 19d2788..2fa6d8f 100644 --- a/engine/lib/providers.py +++ b/engine/lib/providers.py @@ -156,14 +156,17 @@ class ProviderBenchmark: # log to wandb if available if HAS_WANDB and wandb.run is not None: - wandb.log( - { - f"benchmark/{name}/revenue": result.mean_revenue, - f"benchmark/{name}/coi_preserved": result.coi_preserved_pct, - f"benchmark/{name}/margin": result.margin_integrity, - "benchmark/alpha": alpha, - } - ) + try: + wandb.log( + { + f"benchmark/{name}/revenue": result.mean_revenue, + f"benchmark/{name}/coi_preserved": result.coi_preserved_pct, + f"benchmark/{name}/margin": result.margin_integrity, + "benchmark/alpha": alpha, + } + ) + except Exception: + pass return self.results diff --git a/engine/lib/wrappers.py b/engine/lib/wrappers.py index f68a27c..dcb4fd1 100644 --- a/engine/lib/wrappers.py +++ b/engine/lib/wrappers.py @@ -32,17 +32,23 @@ class EconomicMetricsWrapper(gym.Wrapper): obs, reward, terminated, truncated, info = self.env.step(action) # extract from unwrapped env - prices = self.env.unwrapped._prices + quoted_prices = np.asarray(self.env.unwrapped._prices, dtype=float) + effective_prices = np.asarray( + info.get("effective_prices", quoted_prices), dtype=float + ) + if effective_prices.shape != quoted_prices.shape: + effective_prices = quoted_prices demand_dict = self.env.unwrapped._demand - demand = np.array([demand_dict.get(i, 0.0) for i in range(len(prices))]) + demand = np.array([demand_dict.get(i, 0.0) for i in range(len(quoted_prices))]) # core calculations - revenue = float(np.sum(prices * demand)) - avg_price = float(np.mean(prices)) + revenue = float(info.get("revenue", 
np.sum(effective_prices * demand))) + quoted_revenue = float(np.sum(quoted_prices * demand)) + avg_price = float(np.mean(effective_prices)) margin = (avg_price - self.p_min) / max(avg_price, 1e-6) coi_level = avg_price - self.p_min # E[P] - p_min per thesis Def 1 - self._price_history.append(prices.copy()) + self._price_history.append(effective_prices.copy()) self._revenue_history.append(revenue) # regret vs baseline (golden path) @@ -53,6 +59,7 @@ class EconomicMetricsWrapper(gym.Wrapper): # inject structured metrics into info info["economics"] = { "revenue": revenue, + "quoted_revenue": quoted_revenue, "margin": margin, "coi_level": coi_level, "regret": regret, @@ -64,6 +71,10 @@ class EconomicMetricsWrapper(gym.Wrapper): "coi_penalty", "ux_penalty", "volatility", + "upward_volatility", + "supra_penalty", + "supra_share", + "competitive_anchor", "profit", "cost_floor", "reward_revenue", @@ -71,10 +82,13 @@ class EconomicMetricsWrapper(gym.Wrapper): "agent_prob", "alpha_adv", "alpha_nominal", + "erosion_share", + "effective_price_mean", ): if key in info: info["economics"][key] = info[key] - info["prices"] = prices.copy() + info["prices"] = quoted_prices.copy() + info["effective_prices"] = effective_prices.copy() info["demand"] = demand.copy() return obs, reward, terminated, truncated, info diff --git a/engine/orchestrators/sweep_agent.py b/engine/orchestrators/sweep_agent.py index 9f3dcfc..6afeaa2 100644 --- a/engine/orchestrators/sweep_agent.py +++ b/engine/orchestrators/sweep_agent.py @@ -9,6 +9,7 @@ from ..telemetry.wandb import ( get_wandb_module, init_run, run_agent, + update_summary, ) from .train import run_with_active_sweep_run @@ -43,13 +44,23 @@ def run_sweep_agent( spec = TrainSpec.from_flat(merged) if run is not None: run.name = run_name(spec, kind=kind, scenario=scenario) - run_with_active_sweep_run( - spec, - kind=kind, - scenario=scenario, - group=group, - extra_tags=extra_tags, - ) + try: + run_with_active_sweep_run( + spec, + kind=kind, + 
scenario=scenario, + group=group, + extra_tags=extra_tags, + ) + update_summary({"run/status": "finished"}) + except Exception as exc: + update_summary( + { + "run/status": "crashed", + "run/error": str(exc), + } + ) + raise finally: finish_run() diff --git a/engine/orchestrators/train.py b/engine/orchestrators/train.py index 81ebdb5..4be8997 100644 --- a/engine/orchestrators/train.py +++ b/engine/orchestrators/train.py @@ -20,7 +20,7 @@ def _tags_for_run(spec: TrainSpec, kind: str, extra_tags: Sequence[str]) -> list kind, spec.algorithm.name, spec.runtime.backend, - "vanilla" if spec.study.no_robust else "robust", + "baseline" if spec.study.no_robust else "defended", ] tags.extend([tag for tag in extra_tags if tag]) return tags diff --git a/engine/project.json b/engine/project.json index 4d5d041..1fb18e4 100644 --- a/engine/project.json +++ b/engine/project.json @@ -91,6 +91,44 @@ "command": "bash scripts/nx_research.sh docker-train-publish", "cwd": "." } + }, + "whoclicked-publish": { + "executor": "nx:run-commands", + "dependsOn": [ + "install" + ], + "options": { + "command": "bash scripts/nx_research.sh whoclicked-publish", + "cwd": "." + } + }, + "tpu-ray-bootstrap": { + "executor": "nx:run-commands", + "options": { + "command": "bash scripts/nx_research.sh tpu-ray-bootstrap", + "cwd": "." + } + }, + "tpu-ray-deps": { + "executor": "nx:run-commands", + "options": { + "command": "bash scripts/nx_research.sh tpu-ray-deps", + "cwd": "." + } + }, + "tpu-ray-verify": { + "executor": "nx:run-commands", + "options": { + "command": "bash scripts/nx_research.sh tpu-ray-verify", + "cwd": "." + } + }, + "tpu-ray-teardown": { + "executor": "nx:run-commands", + "options": { + "command": "bash scripts/nx_research.sh tpu-ray-teardown", + "cwd": "." 
+ } } }, "tags": [ diff --git a/engine/spec.py b/engine/spec.py index 818d59f..8cc3ea9 100644 --- a/engine/spec.py +++ b/engine/spec.py @@ -32,10 +32,17 @@ def _normalize_keys(raw: Mapping[str, Any]) -> dict[str, Any]: "study.robust_radius": "robust_radius", "study.robust_points": "robust_points", "study.robust_rollouts": "robust_rollouts", + "study.ambiguity_radius": "robust_radius", + "study.ambiguity_points": "robust_points", + "study.ambiguity_rollouts": "robust_rollouts", "study.info_value": "info_value", "study.eta_ux": "eta_ux", "study.reward_profit_weight": "reward_profit_weight", - "study.revenue_weight": "revenue_weight", + "ambiguity_radius": "robust_radius", + "ambiguity_points": "robust_points", + "ambiguity_rollouts": "robust_rollouts", + "baseline_mode": "no_robust", + "stress_eval_enabled": "robust_eval_enabled", "optimizer.learning_rate": "learning_rate", "optimizer.gamma": "gamma", "optimizer.batch_size": "batch_size", @@ -45,6 +52,7 @@ def _normalize_keys(raw: Mapping[str, Any]) -> dict[str, Any]: "runtime.seed": "seed", "runtime.total_timesteps": "total_timesteps", "runtime.checkpoint_interval": "checkpoint_interval", + "runtime.hist_freq": "hist_freq", "eval.eval_freq": "eval_freq", "eval.eval_episodes": "eval_episodes", } @@ -72,6 +80,8 @@ class EnvSpec: max_steps: int = 100 margin_floor: float = 0.05 margin_floor_patience: int = 5 + agent_mu: float = 45.0 + agent_std: float = 15.0 @dataclass(frozen=True) @@ -84,7 +94,6 @@ class StudySpec: info_value: float = 1.0 eta_ux: float = 0.5 reward_profit_weight: float = 1.0 - revenue_weight: float = 0.01 no_robust: bool = False @@ -126,6 +135,7 @@ class RuntimeSpec: checkpoint_interval: int = 200_000 model_dir: str = "engine/models" log_freq: int = 100 + hist_freq: int = 500 @dataclass(frozen=True) @@ -157,6 +167,7 @@ class TrainSpec: "backend": self.runtime.backend, "device": self.runtime.device, "checkpoint_interval": self.runtime.checkpoint_interval, + "hist_freq": self.runtime.hist_freq, 
"n_products": self.env.n_products, "N": self.env.n_sessions, "price_low": self.env.price_low, @@ -167,6 +178,8 @@ class TrainSpec: "max_steps": self.env.max_steps, "margin_floor": self.env.margin_floor, "margin_floor_patience": self.env.margin_floor_patience, + "agent_mu": self.env.agent_mu, + "agent_std": self.env.agent_std, "alpha": self.study.alpha, "lambda_coi": self.study.lambda_coi, "robust_radius": self.study.robust_radius, @@ -175,7 +188,6 @@ class TrainSpec: "info_value": self.study.info_value, "eta_ux": self.study.eta_ux, "reward_profit_weight": self.study.reward_profit_weight, - "revenue_weight": self.study.revenue_weight, "no_robust": self.study.no_robust, "learning_rate": self.optimizer.learning_rate, "gamma": self.optimizer.gamma, @@ -246,6 +258,8 @@ class TrainSpec: max_steps=int(base["max_steps"]), margin_floor=float(base["margin_floor"]), margin_floor_patience=int(base["margin_floor_patience"]), + agent_mu=float(base.get("agent_mu", 45.0)), + agent_std=float(base.get("agent_std", 15.0)), ), study=StudySpec( alpha=float(base["alpha"]), @@ -256,7 +270,6 @@ class TrainSpec: info_value=float(base["info_value"]), eta_ux=float(base["eta_ux"]), reward_profit_weight=float(base["reward_profit_weight"]), - revenue_weight=float(base["revenue_weight"]), no_robust=no_robust, ), optimizer=OptimizerSpec( @@ -294,6 +307,7 @@ class TrainSpec: checkpoint_interval=int(base["checkpoint_interval"]), model_dir=str(base["model_dir"]), log_freq=int(base["log_freq"]), + hist_freq=int(base["hist_freq"]), ), eval=EvalSpec( eval_freq=int(base["eval_freq"]), @@ -304,9 +318,11 @@ class TrainSpec: def run_name(spec: TrainSpec, *, kind: str, scenario: str) -> str: + alpha_token = f"{float(spec.study.alpha):.2f}".rstrip("0").rstrip(".") + mode = "baseline" if bool(spec.study.no_robust) else "defended" return ( f"{kind}/{spec.algorithm.name}/{spec.runtime.backend}/" - f"{spec.runtime.device}/{scenario}/s{spec.runtime.seed}" + 
f"{spec.runtime.device}/{scenario}/a{alpha_token}/{mode}/s{spec.runtime.seed}" ) @@ -318,6 +334,7 @@ def run_metadata( group: str | None = None, tags: Sequence[str] = (), ) -> dict[str, Any]: + mode = "baseline" if bool(spec.study.no_robust) else "defended" metadata: dict[str, Any] = { "run.kind": str(kind), "run.algo": spec.algorithm.name, @@ -326,6 +343,10 @@ def run_metadata( "run.scenario": str(scenario), "run.seed": spec.runtime.seed, "run.tags": list(tags), + "study/alpha": float(spec.study.alpha), + "study/mode": mode, + "study/baseline_mode": float(bool(spec.study.no_robust)), + "tiers": spec.algorithm.name, } if group: metadata["run.group"] = group diff --git a/engine/studies/margin_erosion_alpha.py b/engine/studies/margin_erosion_alpha.py new file mode 100644 index 0000000..3ff97a4 --- /dev/null +++ b/engine/studies/margin_erosion_alpha.py @@ -0,0 +1,133 @@ +"""validate core thesis problem: margin erosion under agent contamination +trains standard RL (no robust components) across α levels to demonstrate systematic failure +""" + +from __future__ import annotations +import json, sys, time +from pathlib import Path +import numpy as np + +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) +from engine.spec import TrainSpec +from engine.orchestrators import run_train_once + + +def _run_baseline(alpha: float, algo: str, seed: int, steps: int) -> dict: + spec = TrainSpec.from_flat( + { + "algo": algo, + "seed": seed, + "alpha": alpha, + "total_timesteps": steps, + "lambda_coi": 0.0, + "robust_radius": 0.0, + "robust_points": 1, + "robust_rollouts": 1, + "no_robust": True, + "arch": "small", + "n_products": 10, + "N": 100, + "max_steps": 50, + "eval_freq": 5000, + "eval_episodes": 10, + "log_freq": 500, + "robust_eval_enabled": False, + "agent_mu": 12.0, + "agent_std": 2.0, + } + ) + result = run_train_once( + spec, + project="phantom-margin-erosion", + offline=True, + no_wandb=True, + kind="study", + scenario=f"alpha{int(alpha * 100):02d}", + 
group=f"baseline_{algo}", + extra_tags=("margin_erosion", "baseline"), + ) + return { + "alpha": alpha, + "algo": algo, + "seed": seed, + "eval_reward": result.get("eval/reward_mean", np.nan), + "eval_revenue": result.get("eval/revenue_mean", np.nan), + "eval_coi_level": result.get("eval/coi_level_mean", np.nan), + "eval_margin": result.get("eval/margin_mean", np.nan), + "eval_agent_prob": result.get("eval/agent_prob_mean", np.nan), + } + + +def run_margin_erosion_study( + alphas: list[float] | None = None, + algos: list[str] | None = None, + seeds: int = 3, + steps: int = 30_000, +) -> dict: + alphas = alphas or [0.1, 0.3, 0.5, 0.7, 0.9] + algos = algos or ["ppo", "dqn", "qtable"] + output_dir = Path(__file__).parent / "results" + output_dir.mkdir(exist_ok=True) + ts = time.strftime("%Y%m%d_%H%M%S") + + results = [] + for α in alphas: + for algo in algos: + for si in range(seeds): + seed = 42 + si + print(f"α={α:.1f} {algo} seed={seed}") + m = _run_baseline(α, algo, seed, steps) + results.append(m) + print( + f" margin={m['eval_margin']:.3f} rev={m['eval_revenue']:.0f} coi={m['eval_coi_level']:.1f}" + ) + + summary = {} + for α in alphas: + runs = [r for r in results if abs(r["alpha"] - α) < 0.01] + if not runs: + continue + s = {} + for metric in ["margin", "revenue", "coi_level", "agent_prob"]: + vals = [r[f"eval_{metric}"] for r in runs] + s[f"{metric}_mean"] = float(np.mean(vals)) + s[f"{metric}_std"] = float(np.std(vals)) + s["n_runs"] = len(runs) + summary[f"alpha_{α:.1f}"] = s + + output = { + "timestamp": ts, + "config": {"alphas": alphas, "algos": algos, "seeds": seeds, "steps": steps}, + "results": results, + "summary": summary, + } + + path = output_dir / f"margin_erosion_alpha_{ts}.json" + with open(path, "w") as f: + json.dump(output, f, indent=2) + + print(f"\n→ {path}") + for α in alphas: + k = f"alpha_{α:.1f}" + if k in summary: + s = summary[k] + print( + f" {k}: margin={s['margin_mean']:.3f}±{s['margin_std']:.3f} " + 
f"coi={s['coi_level_mean']:.1f}±{s['coi_level_std']:.1f}" + ) + return output + + +if __name__ == "__main__": + import argparse + + p = argparse.ArgumentParser(description="margin erosion vs α") + p.add_argument("--quick", action="store_true", help="fast test") + args = p.parse_args() + + run_margin_erosion_study( + alphas=[0.1, 0.7] if args.quick else [0.1, 0.3, 0.5, 0.7, 0.9], + algos=["qtable"] if args.quick else ["ppo", "dqn", "qtable"], + seeds=1 if args.quick else 3, + steps=5_000 if args.quick else 30_000, + ) diff --git a/engine/sweeps/final_thesis_proof.yaml b/engine/sweeps/final_thesis_proof.yaml new file mode 100644 index 0000000..2beaa20 --- /dev/null +++ b/engine/sweeps/final_thesis_proof.yaml @@ -0,0 +1,60 @@ +method: grid +metric: + name: eval/stress_reward_worst + goal: maximize +command: + - ${env} + - python + - -m + - engine.train +parameters: + algo: + value: ppo + backend: + value: sb3 + device: + value: cpu + seed: + values: [42, 1337, 7777] + alpha: + values: [0.1, 0.2, 0.3, 0.4, 0.6, 0.8] + n_products: + values: [25, 50, 100] + N: + value: 100 + no_robust: + values: [false, true] + lambda_coi: + values: [0.15, 0.30] + robust_radius: + value: 0.2 + robust_points: + value: 7 + robust_rollouts: + value: 1 + eta_ux: + value: 0.5 + reward_profit_weight: + value: 1.0 + action_levels: + value: 9 + action_scale_low: + value: 0.8 + action_scale_high: + value: 1.2 + total_timesteps: + value: 100000 + eval_episodes: + value: 12 + eval_freq: + value: 1000 + log_freq: + value: 100 + hist_freq: + value: 500 + learning_rate: + value: 0.0003 + batch_size: + value: 256 + n_steps: + value: 2048 diff --git a/engine/sweeps/ppo_supra_guard.yaml b/engine/sweeps/ppo_supra_guard.yaml new file mode 100644 index 0000000..05131be --- /dev/null +++ b/engine/sweeps/ppo_supra_guard.yaml @@ -0,0 +1,53 @@ +method: random +metric: + name: eval/supra_share_mean + goal: minimize +run_cap: 256 +command: + - ${env} + - python + - -m + - engine.train +parameters: + algo: + 
value: ppo + seed: + values: [42, 1337, 7777] + alpha: + values: [0.1, 0.2, 0.3, 0.4, 0.6] + n_products: + values: [25, 50] + N: + value: 100 + no_robust: + values: [false, true] + lambda_coi: + values: [0.05, 0.15, 0.3] + robust_radius: + values: [0.1, 0.2, 0.3] + robust_points: + value: 7 + robust_rollouts: + value: 1 + eta_ux: + values: [0.05, 0.15, 0.3, 0.5, 0.75] + reward_profit_weight: + value: 1.0 + total_timesteps: + value: 100000 + eval_episodes: + value: 10 + eval_freq: + value: 1000 + log_freq: + value: 100 + hist_freq: + value: 500 + learning_rate: + value: 0.0003 + batch_size: + value: 256 + n_steps: + value: 2048 + device: + value: cpu diff --git a/engine/telemetry/metrics.py b/engine/telemetry/metrics.py index aa080d8..ccfea58 100644 --- a/engine/telemetry/metrics.py +++ b/engine/telemetry/metrics.py @@ -36,7 +36,12 @@ def canonicalize_metrics(raw: Mapping[str, Any], spec: TrainSpec) -> dict[str, A eval_reward = ( _as_float( - metrics.get("eval/robust_reward_worst", metrics.get("eval/reward_mean")), + metrics.get( + "eval/stress_reward_worst", + metrics.get( + "eval/robust_reward_worst", metrics.get("eval/reward_mean") + ), + ), 0.0, ) or 0.0 @@ -51,9 +56,12 @@ def canonicalize_metrics(raw: Mapping[str, Any], spec: TrainSpec) -> dict[str, A metrics["objective/coi_preserved"] = 0.0 if coi_level is None else coi_level metrics["study/alpha"] = spec.study.alpha + metrics["study/mode"] = "baseline" if bool(spec.study.no_robust) else "defended" + metrics["study/baseline_mode"] = float(bool(spec.study.no_robust)) metrics["study/lambda_coi"] = spec.study.lambda_coi - metrics["study/robust_radius"] = spec.study.robust_radius + metrics["study/ambiguity_radius"] = spec.study.robust_radius metrics["study/info_value"] = spec.study.info_value + metrics["tiers"] = spec.algorithm.name metrics["runtime/backend"] = spec.runtime.backend metrics["runtime/device"] = spec.runtime.device diff --git a/engine/telemetry/wandb.py b/engine/telemetry/wandb.py index 
5e6fb85..4181a80 100644 --- a/engine/telemetry/wandb.py +++ b/engine/telemetry/wandb.py @@ -1,5 +1,7 @@ from __future__ import annotations +import os +import time from typing import Any, Callable, Iterable, Mapping @@ -19,6 +21,42 @@ def _require_wandb(): return wandb +def _warn(message: str) -> None: + print(f"PHANTOM_WANDB_WARNING: {message}") + + +def _sanitize_key(raw_key: str) -> str | None: + key = str(raw_key) + replacements = { + "no_robust": "baseline_mode", + "study/no_robust": "study/baseline_mode", + "study/robust_radius": "study/ambiguity_radius", + "robust_radius": "ambiguity_radius", + "robust_points": "ambiguity_points", + "robust_rollouts": "ambiguity_rollouts", + "robust_eval_enabled": "stress_eval_enabled", + "eval/robust_alpha_high": "eval/stress_alpha_high", + "eval/robust_alpha_low": "eval/stress_alpha_low", + "eval/robust_reward_worst": "eval/stress_reward_worst", + "eval/robust_revenue_worst": "eval/stress_revenue_worst", + "eval/robust_coi_leakage_worst": "eval/stress_coi_leakage_worst", + } + key = replacements.get(key, key) + if "robust" in key.lower(): + return None + return key + + +def _sanitize_payload(payload: Mapping[str, Any]) -> dict[str, Any]: + sanitized: dict[str, Any] = {} + for key, value in payload.items(): + clean_key = _sanitize_key(str(key)) + if clean_key is None: + continue + sanitized[clean_key] = value + return sanitized + + def init_run( *, mode: str, @@ -34,7 +72,11 @@ def init_run( if group: kwargs["group"] = group if sweep_mode: - run = wandb.init(**kwargs) + try: + run = wandb.init(**kwargs) + except Exception as exc: + _warn(f"init failed in sweep mode ({exc})") + return None if name and run is not None: run.name = name return run @@ -42,18 +84,25 @@ def init_run( init_kwargs = dict(kwargs) init_kwargs["project"] = project if config is not None: - init_kwargs["config"] = dict(config) + init_kwargs["config"] = _sanitize_payload(dict(config)) if name: init_kwargs["name"] = name if tags: init_kwargs["tags"] = 
list(tags) - return wandb.init(**init_kwargs) + try: + return wandb.init(**init_kwargs) + except Exception as exc: + _warn(f"init failed ({exc})") + return None def finish_run() -> None: wandb = get_wandb_module() if wandb is not None and wandb.run is not None: - wandb.finish() + try: + wandb.finish() + except Exception as exc: + _warn(f"finish failed ({exc})") def current_config() -> dict[str, Any]: @@ -67,25 +116,45 @@ def update_run_config(config: Mapping[str, Any]) -> None: wandb = get_wandb_module() if wandb is None or wandb.run is None: return + payload = _sanitize_payload(dict(config)) + if not payload: + return try: - wandb.config.update(dict(config), allow_val_change=True) + wandb.config.update(payload, allow_val_change=True) except TypeError: - wandb.config.update(dict(config)) + try: + wandb.config.update(payload) + except Exception as exc: + _warn(f"config update failed ({exc})") + except Exception as exc: + _warn(f"config update failed ({exc})") def log_metrics(metrics: Mapping[str, Any], *, step: int) -> None: wandb = get_wandb_module() if wandb is None or wandb.run is None: return - wandb.log(dict(metrics), step=step) + payload = _sanitize_payload(dict(metrics)) + if not payload: + return + try: + wandb.log(payload, step=step) + except Exception as exc: + _warn(f"log failed at step {step} ({exc})") def update_summary(metrics: Mapping[str, Any]) -> None: wandb = get_wandb_module() if wandb is None or wandb.run is None: return - for key, value in metrics.items(): - wandb.run.summary[key] = value + payload = _sanitize_payload(dict(metrics)) + if not payload: + return + try: + for key, value in payload.items(): + wandb.run.summary[key] = value + except Exception as exc: + _warn(f"summary update failed ({exc})") def run_agent( @@ -95,4 +164,39 @@ def run_agent( count: int | None = None, ) -> None: wandb = _require_wandb() - wandb.agent(sweep_id, function=fn, count=count) + retry_max = max(0, int(os.getenv("PHANTOM_WANDB_AGENT_RETRIES", "8"))) + 
retry_delay = max(1.0, float(os.getenv("PHANTOM_WANDB_AGENT_RETRY_DELAY", "5"))) + retry_backoff = max( + 1.0, float(os.getenv("PHANTOM_WANDB_AGENT_RETRY_BACKOFF", "1.5")) + ) + retry_max_delay = max( + retry_delay, + float(os.getenv("PHANTOM_WANDB_AGENT_MAX_RETRY_DELAY", "60")), + ) + + target = None if count is None else max(0, int(count)) + completed = 0 + + def _wrapped() -> None: + nonlocal completed + fn() + completed += 1 + + attempt = 0 + while True: + remaining = None if target is None else max(0, int(target - completed)) + if target is not None and remaining == 0: + return + try: + wandb.agent(sweep_id, function=_wrapped, count=remaining) + return + except Exception as exc: + attempt += 1 + if attempt > retry_max: + raise + wait = min(retry_max_delay, retry_delay * (retry_backoff ** (attempt - 1))) + _warn( + f"agent disconnected (attempt {attempt}/{retry_max}, " + f"completed={completed}, remaining={remaining}): {exc}" + ) + time.sleep(wait) diff --git a/engine/train.py b/engine/train.py index 2828db3..3fc235d 100644 --- a/engine/train.py +++ b/engine/train.py @@ -54,6 +54,7 @@ def _build_parser() -> argparse.ArgumentParser: parser.add_argument("--total-timesteps", type=int) parser.add_argument("--model-dir", type=str) parser.add_argument("--log-freq", type=int) + parser.add_argument("--hist-freq", type=int) parser.add_argument("--checkpoint-interval", type=int) parser.add_argument("--device", type=str) @@ -68,7 +69,6 @@ def _build_parser() -> argparse.ArgumentParser: parser.add_argument("--no-robust", action="store_true") parser.add_argument("--eta-ux", type=float) parser.add_argument("--reward-profit-weight", type=float) - parser.add_argument("--revenue-weight", type=float) parser.add_argument("--price-low", type=float) parser.add_argument("--price-high", type=float) @@ -126,6 +126,7 @@ def _overrides_from_args(args: argparse.Namespace) -> dict[str, Any]: "total_timesteps": args.total_timesteps, "model_dir": args.model_dir, "log_freq": args.log_freq, + 
"hist_freq": args.hist_freq, "checkpoint_interval": args.checkpoint_interval, "device": args.device, "alpha": args.alpha, @@ -139,7 +140,6 @@ def _overrides_from_args(args: argparse.Namespace) -> dict[str, Any]: "no_robust": args.no_robust, "eta_ux": args.eta_ux, "reward_profit_weight": args.reward_profit_weight, - "revenue_weight": args.revenue_weight, "price_low": args.price_low, "price_high": args.price_high, "action_levels": args.action_levels, @@ -179,8 +179,29 @@ def _overrides_from_args(args: argparse.Namespace) -> dict[str, Any]: def main(argv: list[str] | None = None) -> None: + import subprocess import sys + # Ensure data is downloaded + from pathlib import Path + + project_root = Path(__file__).parents[1] + data_dir = project_root / "experiments" / "collected_data" + needs_pull = (not data_dir.exists()) or (not any(data_dir.iterdir())) + if needs_pull: + try: + subprocess.run(["make", "data.pull"], cwd=str(project_root), check=True) + except (subprocess.SubprocessError, OSError) as exc: + sys.path.insert(0, str(project_root)) + try: + from scripts.hf_data import pull + + pull() + except (ImportError, OSError, RuntimeError, ValueError) as fallback_exc: + print( + f"Warning: data.pull failed ({exc}); fallback pull failed ({fallback_exc})" + ) + configure_logging() raw_args = list(sys.argv[1:] if argv is None else argv) run_kind = _probe_run_kind(raw_args) diff --git a/engine/wrapper.py b/engine/wrapper.py index d2ac2cd..0ff75d1 100644 --- a/engine/wrapper.py +++ b/engine/wrapper.py @@ -10,6 +10,7 @@ from .lib.coi import ( ) from .lib.behavior import get_transition_models, trajectory_to_events from .lib.wrappers import EconomicMetricsWrapper +from .jax.robust import select_adversarial_alpha_jax, _JAX_OK class _ActionPricingEngine(PricingEngine): @@ -121,6 +122,7 @@ class PHANTOM(gym.Env): self._prices = None self._demand = None self._step_count = 0 + self._global_step = 0 # monotonic; used as JAX RNG seed across resets self._demand_history = [] 
self._price_history = [] self._revenue_history = [] @@ -128,6 +130,13 @@ class PHANTOM(gym.Env): self._initial_episode_prices = None self._trajectories = [] # session trajectories for agent prob calculation self.baseline_prices = np.full(self.n_products, self.price_bounds[0]) + self.anchor_prices = np.full( + self.n_products, + float(np.clip(float(self.human_params[0]), *self.price_bounds)), + ) + self.competitive_cap = float( + min(self.price_bounds[1], float(np.mean(self.anchor_prices)) * 1.15) + ) self._low_margin_streak = 0 # consecutive steps below margin_floor self._last_agent_prob = float(self.alpha) self._last_alpha_adv = float(self.alpha) @@ -167,19 +176,28 @@ class PHANTOM(gym.Env): self.market.Nhumans = self.N - n_agents def _decode_action(self, action) -> np.ndarray: - base = ( - self._prices - if self._prices is not None - else np.full(self.n_products, self.price_bounds[0], dtype=float) - ) + prev = self._prices + base = self.anchor_prices + + def _blend(target: np.ndarray) -> np.ndarray: + if prev is None: + lower = float(self.price_bounds[0]) + return np.clip(target, lower, self.competitive_cap) + blended = 0.75 * np.asarray(prev, dtype=float) + 0.25 * target + lower = float(self.price_bounds[0]) + return np.clip(blended, lower, self.competitive_cap) + if np.isscalar(action): idx = int(np.clip(int(action), 0, self.action_levels - 1)) - return np.clip(base * self._action_scales[idx], *self.price_bounds) + target = base * self._action_scales[idx] + return _blend(target) a = np.asarray(action) if a.size == 1: idx = int(np.clip(int(a.reshape(-1)[0]), 0, self.action_levels - 1)) - return np.clip(base * self._action_scales[idx], *self.price_bounds) - return np.clip(a.astype(float), *self.price_bounds) + target = base * self._action_scales[idx] + return _blend(target) + lower = float(self.price_bounds[0]) + return np.clip(a.astype(float), lower, self.competitive_cap) def _compute_agent_prob(self, trajectories=None) -> float: trajectories = ( @@ -214,18 
+232,23 @@ class PHANTOM(gym.Env): coi_penalty = self.lambda_coi * coi_leakage * info_budget if len(self._price_history) > 0: - volatility = float( - np.mean( - np.abs(prices - self._price_history[-1]) - / np.maximum(self.baseline_prices, 1.0) - ) - ) + prev_prices = np.asarray(self._price_history[-1], dtype=float) + rel_change = (prices - prev_prices) / np.maximum(prev_prices, 1.0) + volatility = float(np.mean(np.abs(rel_change))) + upward_volatility = float(np.mean(np.clip(rel_change, 0.0, None))) else: volatility = 0.0 - ux_penalty = self.eta_ux * info_budget * volatility + upward_volatility = 0.0 + ux_penalty = self.eta_ux * info_budget * (volatility + 0.5 * upward_volatility) + + competitive_anchor = float(np.mean(self.anchor_prices)) + price_ratio = prices / max(competitive_anchor, 1.0) + supra_excess = np.clip(price_ratio - 1.15, 0.0, None) + supra_penalty = 4.0 * info_budget * float(np.mean(np.square(supra_excess))) + supra_share = float(np.mean(supra_excess > 0.0)) reward_revenue = self.reward_profit_weight * profit - reward = reward_revenue - coi_penalty - ux_penalty + reward = reward_revenue - coi_penalty - ux_penalty - supra_penalty return reward, { "revenue": revenue, @@ -238,6 +261,10 @@ class PHANTOM(gym.Env): "coi_info_budget": info_budget, "ux_penalty": ux_penalty, "volatility": volatility, + "upward_volatility": upward_volatility, + "supra_penalty": supra_penalty, + "supra_share": supra_share, + "competitive_anchor": competitive_anchor, "reward_revenue": reward_revenue, "reward_total": reward, } @@ -261,8 +288,37 @@ class PHANTOM(gym.Env): return float(np.mean(rewards)) if rewards else 0.0 def _select_adversarial_alpha(self, prices: np.ndarray) -> float: - """inner robust step: evaluate candidates and pick worst-case alpha""" + """inner robust step: pick worst-case alpha from the ambiguity interval. 
+ + when JAX is available and robust_rollouts==1 we use a vmapped pass over + all K candidates in a single call (no Python loop, no market.act overhead). + the JAX path approximates demand as the mixed closed-form d(p;theta) signal + rather than running full trajectory sampling, which is accurate for the + alpha-selection decision while being dramatically cheaper. + + when robust_rollouts>1 or JAX is unavailable we fall back to the sequential + market.act() loop so behavior is identical to the original implementation. + """ candidates = self._alpha_candidates() + if len(candidates) == 1: + return float(candidates[0]) + + if _JAX_OK and self.robust_rollouts == 1: + best_alpha, _ = select_adversarial_alpha_jax( + candidates=candidates, + prices=prices, + human_params=self.market.human_params, + agent_params=self.market.agent_params, + noise_std=self.market.noise_std, + baseline_prices=self.baseline_prices, + lambda_coi=self.lambda_coi, + info_value=self.info_value, + reward_profit_weight=self.reward_profit_weight, + rng_seed=self._global_step, + ) + return best_alpha + + # fallback: full trajectory-based sequential evaluation evaluations = [ (float(alpha), self._evaluate_candidate(float(alpha), prices)) for alpha in candidates @@ -299,6 +355,7 @@ class PHANTOM(gym.Env): def step(self, action): self._prices = self._decode_action(action) alpha_adv = self._select_adversarial_alpha(self._prices) + self._global_step += 1 # always increment; JAX path may have already done so self._set_market_mix(alpha_adv) self._platform_stub.set_prices(self._prices) self._step_count += 1 diff --git a/lib/config.py b/lib/config.py index a27ffd9..d46f82c 100644 --- a/lib/config.py +++ b/lib/config.py @@ -2,6 +2,7 @@ All hardcoded paths should reference this module Paths can be overridden via environment variables """ + import os from pathlib import Path @@ -9,24 +10,34 @@ from pathlib import Path PROJECT_ROOT = Path(__file__).parent.parent.resolve() # data directories -DATA_DIR = 
Path(os.getenv('PHANTOM_DATA_DIR', PROJECT_ROOT / 'data')) -EXPERIMENTS_DIR = Path(os.getenv('PHANTOM_EXPERIMENTS_DIR', PROJECT_ROOT / 'experiments')) +DATA_DIR = Path(os.getenv("PHANTOM_DATA_DIR", PROJECT_ROOT / "data")) +EXPERIMENTS_DIR = Path( + os.getenv("PHANTOM_EXPERIMENTS_DIR", PROJECT_ROOT / "experiments") +) # agent/human interaction data -AGENT_DATA_DIR = Path(os.getenv('PHANTOM_AGENT_DATA_DIR', DATA_DIR / 'agents')) -HUMAN_DATA_DIR = Path(os.getenv('PHANTOM_HUMAN_DATA_DIR', DATA_DIR / 'humans')) +AGENT_DATA_DIR = Path(os.getenv("PHANTOM_AGENT_DATA_DIR", DATA_DIR / "agents")) +HUMAN_DATA_DIR = Path(os.getenv("PHANTOM_HUMAN_DATA_DIR", DATA_DIR / "humans")) # RL simulation runs -SIM_RUNS_DIR = Path(os.getenv('PHANTOM_SIM_RUNS_DIR', PROJECT_ROOT / 'sim' / 'rl' / 'runs')) +SIM_RUNS_DIR = Path( + os.getenv("PHANTOM_SIM_RUNS_DIR", PROJECT_ROOT / "sim" / "rl" / "runs") +) # model artifacts -MODEL_REGISTRY_DIR = Path(os.getenv('PHANTOM_MODEL_REGISTRY_DIR', DATA_DIR / 'models')) +MODEL_REGISTRY_DIR = Path(os.getenv("PHANTOM_MODEL_REGISTRY_DIR", DATA_DIR / "models")) # collected experiment data -COLLECTED_DATA_DIR = Path(os.getenv('PHANTOM_COLLECTED_DATA_DIR', EXPERIMENTS_DIR / 'agents' / 'collected_data')) +COLLECTED_DATA_DIR = Path( + os.getenv( + "PHANTOM_COLLECTED_DATA_DIR", EXPERIMENTS_DIR / "agents" / "collected_data" + ) +) # notebook outputs -NOTEBOOK_OUTPUT_DIR = Path(os.getenv('PHANTOM_NOTEBOOK_OUTPUT_DIR', EXPERIMENTS_DIR / 'notebooks' / 'outputs')) +NOTEBOOK_OUTPUT_DIR = Path( + os.getenv("PHANTOM_NOTEBOOK_OUTPUT_DIR", EXPERIMENTS_DIR / "notebooks" / "outputs") +) def ensure_dir(path: Path) -> Path: @@ -51,15 +62,18 @@ def get_sim_path(*parts: str) -> Path: # service configuration (from .env) -KAFKA_HOST = os.getenv('KAFKA_HOST', 'localhost') -KAFKA_PORT = os.getenv('KAFKA_PORT', '9092') +KAFKA_HOST = os.getenv("KAFKA_HOST", "localhost") +KAFKA_PORT = os.getenv("KAFKA_PORT", "9092") KAFKA_BROKER = f"{KAFKA_HOST}:{KAFKA_PORT}" -REDIS_HOST = 
os.getenv('REDIS_HOST', 'localhost') -REDIS_PORT = int(os.getenv('REDIS_PORT', '6379')) +REDIS_HOST = os.getenv("REDIS_HOST", "localhost") +REDIS_PORT = int(os.getenv("REDIS_PORT", "6379")) -SUPABASE_URL = os.getenv('NEXT_PUBLIC_SUPABASE_URL', '') -SUPABASE_ANON_KEY = os.getenv('NEXT_PUBLIC_SUPABASE_ANON_KEY', '') +SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL", "") +SUPABASE_ANON_KEY = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY", "") -BACKEND_PORT = int(os.getenv('BACKEND_PORT', '5000')) -PROVIDER_PORT = int(os.getenv('PROVIDER_PORT', '5001')) +BACKEND_PORT = int(os.getenv("BACKEND_PORT", "5000")) +PROVIDER_PORT = int(os.getenv("PROVIDER_PORT", "5001")) + +# huggingface dataset repo for collected behavioral data +HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "velocitatem/phantom-collected-data") diff --git a/nx.json b/nx.json index d286a8f..a87654a 100644 --- a/nx.json +++ b/nx.json @@ -58,6 +58,21 @@ "benchmark": { "cache": false }, + "whoclicked-publish": { + "cache": false + }, + "tpu-ray-bootstrap": { + "cache": false + }, + "tpu-ray-deps": { + "cache": false + }, + "tpu-ray-verify": { + "cache": false + }, + "tpu-ray-teardown": { + "cache": false + }, "up": { "cache": false }, diff --git a/package.json b/package.json index 8590f3c..a47cfe8 100644 --- a/package.json +++ b/package.json @@ -7,6 +7,8 @@ ], "scripts": { "nx": "nx", + "manim:render": "nx run manim:render", + "manim:render-all": "nx run manim:render-all", "projects": "nx show projects", "graph": "nx graph", "web:dev": "nx run web:dev", diff --git a/paper/defense/manim/render.py b/paper/defense/manim/render.py deleted file mode 100644 index 5f15e1e..0000000 --- a/paper/defense/manim/render.py +++ /dev/null @@ -1,84 +0,0 @@ -from __future__ import annotations - -import argparse -import subprocess -import sys -from pathlib import Path - -from scenes import SCENE_ORDER - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Render thesis-defense Manim scenes") - 
parser.add_argument( - "--quality", - default="qm", - choices=["ql", "qm", "qh", "qk"], - help="Manim quality preset", - ) - parser.add_argument( - "--scene", - action="append", - dest="scenes", - help="Scene name; repeat flag to render many", - ) - parser.add_argument( - "--preview", action="store_true", help="Open video after each render" - ) - parser.add_argument( - "--list", action="store_true", help="List available scenes and exit" - ) - return parser.parse_args() - - -def validate_requested(requested: list[str]) -> list[str]: - missing = [name for name in requested if name not in SCENE_ORDER] - if missing: - choices = ", ".join(SCENE_ORDER) - raise ValueError(f"Unknown scenes: {', '.join(missing)}. Choices: {choices}") - return requested - - -def run_manim(scene_file: Path, scene_name: str, quality: str, preview: bool) -> None: - cmd = [sys.executable, "-m", "manim"] - if preview: - cmd.append("-p") - cmd.extend([f"-{quality}", str(scene_file), scene_name]) - subprocess.run(cmd, cwd=scene_file.parent, check=True) - - -def main() -> int: - args = parse_args() - if args.list: - for scene in SCENE_ORDER: - print(scene) - return 0 - - scenes = validate_requested(args.scenes) if args.scenes else list(SCENE_ORDER) - scene_file = Path(__file__).resolve().parent / "scenes.py" - - try: - for scene_name in scenes: - run_manim( - scene_file=scene_file, - scene_name=scene_name, - quality=args.quality, - preview=args.preview, - ) - except FileNotFoundError: - print( - "manim executable not found. 
Install Manim in your Python environment.", - file=sys.stderr, - ) - return 2 - except ValueError as exc: - print(str(exc), file=sys.stderr) - return 2 - except subprocess.CalledProcessError as exc: - return exc.returncode - - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/paper/defense/manim/scenes.py b/paper/defense/manim/scenes.py deleted file mode 100644 index 6a74998..0000000 --- a/paper/defense/manim/scenes.py +++ /dev/null @@ -1,1581 +0,0 @@ -from __future__ import annotations - -from typing import Iterable - -import numpy as np -from manim import ( - Axes, - Arrow, - BarChart, - BLUE_D, - Circle, - Create, - CurvedArrow, - DashedLine, - DecimalNumber, - Dot, - DOWN, - FadeIn, - FadeOut, - GREEN_C, - GREY_B, - LaggedStart, - LEFT, - Line, - MathTex, - Matrix, - NumberLine, - ORANGE, - Rectangle, - RED_C, - RIGHT, - RoundedRectangle, - Scene, - SurroundingRectangle, - Text, - Transform, - UP, - ValueTracker, - VGroup, - Write, - always_redraw, - config, -) - -P_MIN = 80.0 -P_MAX = 160.0 -LIGHT_BG = "#F8F8F4" -INK = "#1E1E1E" -AXIS_INK = "#2C2C2C" -HIGHLIGHT = "#8F5F00" - -config.background_color = LIGHT_BG -Text.set_default(color=INK) -MathTex.set_default(color=INK) -Line.set_default(color=AXIS_INK) -Arrow.set_default(color=AXIS_INK) -CurvedArrow.set_default(color=AXIS_INK) -DashedLine.set_default(color=AXIS_INK) - - -def normal_pdf(x: float, mu: float, sigma: float) -> float: - z = (x - mu) / sigma - return float(np.exp(-0.5 * z * z) / (sigma * np.sqrt(2.0 * np.pi))) - - -def scene_title(text: str) -> Text: - return Text(text, font_size=44, weight="BOLD", color=INK).to_edge(UP) - - -def card( - label: str, - color: str = BLUE_D, - width: float = 3.3, - height: float = 1.15, - font_size: float = 24, -) -> VGroup: - box = RoundedRectangle(corner_radius=0.15, width=width, height=height) - box.set_stroke(color=color, width=2.0) - box.set_fill(color=color, opacity=0.12) - text = Text(label, 
font_size=font_size).move_to(box.get_center()) - return VGroup(box, text) - - -def to_matrix( - values: Iterable[Iterable[float]], - title: str, - color: str, - header_buff: float = 0.28, -) -> VGroup: - mat = Matrix( - [[f"{v:.2f}" for v in row] for row in values], h_buff=1.15, v_buff=0.75 - ) - header = Text(title, font_size=25, weight="BOLD", color=color).next_to( - mat, UP, buff=header_buff - ) - frame = SurroundingRectangle(mat, color=color, buff=0.2) - return VGroup(header, frame, mat) - - -def rank_from_scale(scale: int) -> str: - clamped = max(1, min(scale, 10)) - return "A" if clamped == 1 else str(clamped) - - -def actor_face_card( - rank: str, - role: str, - accent: str, - width: float = 1.6, - height: float = 2.25, - show_role: bool = True, -) -> VGroup: - frame = RoundedRectangle(corner_radius=0.1, width=width, height=height) - frame.set_stroke(color=AXIS_INK, width=2.0) - frame.set_fill(color="#FFFFFF", opacity=1.0) - - top_rank = Text(rank, font_size=30, color=accent).move_to( - frame.get_corner(UP + LEFT) + RIGHT * 0.2 + DOWN * 0.22 - ) - bottom_rank = ( - Text(rank, font_size=30, color=accent) - .rotate(np.pi) - .move_to(frame.get_corner(DOWN + RIGHT) + LEFT * 0.2 + UP * 0.22) - ) - center_rank = Text(rank, font_size=56, weight="BOLD", color=accent).move_to( - frame.get_center() + UP * 0.03 - ) - - parts = [frame, top_rank, bottom_rank, center_rank] - if show_role: - role_label = Text(role, font_size=18, color=GREY_B).next_to( - frame, DOWN, buff=0.08 - ) - parts.append(role_label) - return VGroup(*parts) - - -def product_suit_card( - suit: str, - scale: int, - accent: str, - width: float = 1.86, - height: float = 1.04, - show_label: bool = False, -) -> tuple[VGroup, Text]: - frame = RoundedRectangle(corner_radius=0.08, width=width, height=height) - frame.set_stroke(color=AXIS_INK, width=2.0) - frame.set_fill(color="#FFFFFF", opacity=1.0) - - suit_left = Text(suit, font_size=28, color=accent).move_to( - frame.get_left() + RIGHT * 0.22 - ) - 
suit_right = Text(suit, font_size=28, color=accent).move_to( - frame.get_right() + LEFT * 0.22 - ) - scale_text = Text( - rank_from_scale(scale), - font_size=40, - weight="BOLD", - color=accent, - ).move_to(frame.get_center()) - - parts = [frame, suit_left, suit_right, scale_text] - if show_label: - scale_label = Text("scale", font_size=14, color=GREY_B).next_to( - frame, DOWN, buff=0.04 - ) - parts.append(scale_label) - return VGroup(*parts), scale_text - - -def private_valuation_card(value: int, show_label: bool = False) -> VGroup: - frame = RoundedRectangle(corner_radius=0.08, width=1.86, height=1.04) - frame.set_stroke(color=AXIS_INK, width=2.0) - frame.set_fill(color="#FFFFFF", opacity=1.0) - - rank = Text( - rank_from_scale(value), font_size=40, weight="BOLD", color=GREEN_C - ).move_to(frame.get_center()) - left_tag = Text("v", font_size=28, color=INK).move_to( - frame.get_left() + RIGHT * 0.22 - ) - right_tag = Text("*", font_size=28, color=INK).move_to( - frame.get_right() + LEFT * 0.22 - ) - - parts = [frame, left_tag, right_tag, rank] - if show_label: - title = Text("private value", font_size=14, color=GREY_B).next_to( - frame, DOWN, buff=0.04 - ) - parts.append(title) - return VGroup(*parts) - - -class DefenseOpening(Scene): - def construct(self) -> None: - title = scene_title("PHANTOM Thesis Defense") - subtitle = Text( - "A mechanism-level defense for dynamic pricing under agentic traffic", - font_size=27, - color=GREY_B, - ).next_to(title, DOWN, buff=0.35) - - roadmap = VGroup( - Text("1) Define pricing power from first principles", font_size=30), - Text("2) Show why agent saturation breaks it", font_size=30), - Text( - "3) Build a control loop from behavior to robust policy", font_size=30 - ), - ).arrange(DOWN, aligned_edge=LEFT, buff=0.28) - roadmap.next_to(subtitle, DOWN, buff=0.75).align_to(subtitle, LEFT) - - self.play(Write(title), FadeIn(subtitle, shift=UP * 0.2)) - self.play( - LaggedStart( - *[FadeIn(item, shift=RIGHT * 0.25) for item in 
roadmap], lag_ratio=0.18 - ) - ) - - dist_axes = Axes( - x_range=[-6, 6, 2], - y_range=[0.0, 0.2, 0.05], - x_length=2.7, - y_length=1.5, - tips=False, - axis_config={"stroke_width": 1.8, "color": AXIS_INK}, - ) - dist_h = dist_axes.plot( - lambda x: normal_pdf(x, -1.9, 1.6), - x_range=[-6, 6], - color=BLUE_D, - stroke_width=4, - ) - dist_a = dist_axes.plot( - lambda x: normal_pdf(x, 1.8, 1.8), - x_range=[-6, 6], - color=RED_C, - stroke_width=4, - ) - dist_block = VGroup( - dist_axes, - dist_h, - dist_a, - Text("behavior gap g", font_size=16, color=GREY_B).next_to( - dist_axes, DOWN, buff=0.03 - ), - ) - - tail_axes = Axes( - x_range=[0, 1, 0.2], - y_range=[0, 1, 0.2], - x_length=2.7, - y_length=1.5, - tips=False, - axis_config={"stroke_width": 1.8, "color": AXIS_INK}, - ) - tail_n1 = tail_axes.plot( - lambda x: (1 - x) ** 1, - x_range=[0, 1], - color=GREEN_C, - stroke_width=4, - ) - tail_n8 = tail_axes.plot( - lambda x: (1 - x) ** 8, - x_range=[0, 1], - color=HIGHLIGHT, - stroke_width=4, - ) - tail_block = VGroup( - tail_axes, - tail_n1, - tail_n8, - Text("order-statistic tail", font_size=16, color=GREY_B).next_to( - tail_axes, DOWN, buff=0.03 - ), - ) - - control_eq = MathTex( - r"\hat\alpha(\tau')\Rightarrow\pi^*", - font_size=34, - color=HIGHLIGHT, - ) - control_box = SurroundingRectangle(control_eq, color=HIGHLIGHT, buff=0.12) - control_block = VGroup(control_box, control_eq) - - preview = VGroup(dist_block, tail_block, control_block).arrange( - RIGHT, buff=0.45 - ) - preview.next_to(roadmap, DOWN, buff=0.58) - preview_caption = Text("Math flow preview", font_size=21, color=GREY_B).next_to( - preview, UP, buff=0.08 - ) - - f_arrow_1 = Arrow(dist_block.get_right(), tail_block.get_left(), buff=0.08) - f_arrow_2 = Arrow(tail_block.get_right(), control_block.get_left(), buff=0.08) - - self.play(FadeIn(preview_caption, shift=UP * 0.1)) - self.play(FadeIn(dist_block), FadeIn(tail_block), FadeIn(control_block)) - self.play(FadeIn(f_arrow_1), FadeIn(f_arrow_2)) - 
self.wait(0.9) - - -class CardMarketAnalogyScene(Scene): - def construct(self) -> None: - title = scene_title("Card Analogy: Platform, Customer, Agent") - self.play(Write(title)) - - subtitle = Text( - "K = platform, Q = customer, J = search agent, suit cards = products", - font_size=20, - color=GREY_B, - ).next_to(title, DOWN, buff=0.16) - self.play(FadeIn(subtitle, shift=UP * 0.05)) - - king = actor_face_card( - rank="K", role="platform", accent=ORANGE, show_role=False - ) - king.move_to(LEFT * 5.35 + DOWN * 0.35) - - queen_home = RIGHT * 3.2 + DOWN * 0.28 - queen = actor_face_card( - rank="Q", role="customer", accent=BLUE_D, show_role=False - ) - queen.move_to(queen_home) - - valuation = private_valuation_card(value=5).next_to(queen, RIGHT, buff=0.35) - - specs = [ - ("C", INK, 4), - ("H", RED_C, 6), - ("S", INK, 5), - ("D", RED_C, 3), - ] - scales = [initial for _, _, initial in specs] - products = VGroup() - scale_tokens: list[Text] = [] - for suit, color, initial in specs: - product_card, token = product_suit_card( - suit=suit, scale=initial, accent=color - ) - products.add(product_card) - scale_tokens.append(token) - - products.arrange(DOWN, buff=0.15).move_to(LEFT * 1.75 + DOWN * 0.55) - - actor_link = Arrow( - king.get_right(), - products.get_left(), - buff=0.15, - color=HIGHLIGHT, - stroke_width=3.6, - ) - - self.play( - FadeIn(king, shift=RIGHT * 0.2), - FadeIn(products, shift=UP * 0.15), - FadeIn(queen, shift=LEFT * 0.2), - FadeIn(valuation, shift=LEFT * 0.2), - ) - self.play(FadeIn(actor_link)) - - stage = Text( - "Stage 1: queen browses directly and visited products rise in scale.", - font_size=21, - color=GREY_B, - ).to_edge(DOWN) - self.play(FadeIn(stage, shift=UP * 0.08)) - - direct_visits = [1, 2] - for idx in direct_visits: - target = products[idx] - demand_box = SurroundingRectangle(target, color=BLUE_D, buff=0.06) - king_box = SurroundingRectangle(king[0], color=HIGHLIGHT, buff=0.07) - - self.play( - queen.animate.move_to(target.get_right() + 
RIGHT * 0.9), - run_time=0.7, - ) - self.play(Create(demand_box), run_time=0.2) - - scales[idx] = min(10, scales[idx] + 2) - new_scale = Text( - rank_from_scale(scales[idx]), - font_size=40, - weight="BOLD", - color=specs[idx][1], - ).move_to(scale_tokens[idx]) - self.play( - Create(king_box), - Transform(scale_tokens[idx], new_scale), - run_time=0.5, - ) - self.play(FadeOut(king_box), FadeOut(demand_box), run_time=0.18) - - self.play(queen.animate.move_to(queen_home), run_time=0.7) - - stage_two = Text( - "Stage 2: queen hires jack to search every card before deciding.", - font_size=21, - color=GREY_B, - ).to_edge(DOWN) - self.play(Transform(stage, stage_two)) - - jack = actor_face_card( - rank="J", role="agent", accent=RED_C, show_role=False - ).scale(0.95) - jack.next_to(queen, LEFT, buff=0.35) - hire_arrow = Arrow( - queen.get_left(), - jack.get_right(), - buff=0.08, - color=HIGHLIGHT, - stroke_width=2.6, - ) - self.play(FadeIn(jack, shift=RIGHT * 0.16), FadeIn(hire_arrow)) - self.play(FadeOut(hire_arrow), run_time=0.2) - - for idx, target in enumerate(products): - demand_box = SurroundingRectangle(target, color=RED_C, buff=0.05) - king_box = SurroundingRectangle(king[0], color=HIGHLIGHT, buff=0.07) - - self.play( - jack.animate.move_to(target.get_right() + RIGHT * 0.62), - run_time=0.32, - ) - self.play(Create(demand_box), run_time=0.17) - - scales[idx] = min(10, scales[idx] + 1) - new_scale = Text( - rank_from_scale(scales[idx]), - font_size=40, - weight="BOLD", - color=specs[idx][1], - ).move_to(scale_tokens[idx]) - self.play( - Create(king_box), Transform(scale_tokens[idx], new_scale), run_time=0.38 - ) - self.play( - FadeOut(king_box), - FadeOut(demand_box), - run_time=0.15, - ) - - self.play(jack.animate.next_to(queen, LEFT, buff=0.35), run_time=0.55) - - report_arrow = Arrow( - jack.get_right(), - queen.get_left(), - buff=0.08, - color=GREEN_C, - stroke_width=2.6, - ) - self.play(FadeIn(report_arrow)) - - best_idx = int(np.argmin(scales)) - best_card = 
products[best_idx] - choice_box = SurroundingRectangle(best_card, color=GREEN_C, buff=0.07) - stage_three = Text( - "Decision rule: buy when private value v* exceeds shown scale.", - font_size=21, - color=GREY_B, - ).to_edge(DOWN) - - self.play( - Transform(stage, stage_three), - queen.animate.move_to(best_card.get_right() + RIGHT * 0.9), - Create(choice_box), - run_time=0.95, - ) - self.play( - FadeOut(jack), - FadeOut(report_arrow), - FadeOut(actor_link), - FadeOut(subtitle), - ) - self.wait(1.0) - - -class COIFirstPrinciplesScene(Scene): - def construct(self) -> None: - title = scene_title("Cost of Information from First Principles") - self.play(Write(title)) - - setup = VGroup( - MathTex(r"P\sim\pi(\tau)", font_size=44), - MathTex(r"\underline p=\text{reservation price}", font_size=38), - MathTex(r"M=P-\underline p", font_size=46, color=HIGHLIGHT), - ).arrange(DOWN, aligned_edge=LEFT, buff=0.22) - setup.to_edge(LEFT).shift(UP * 0.55) - - self.play( - LaggedStart( - *[FadeIn(line, shift=RIGHT * 0.2) for line in setup], lag_ratio=0.2 - ) - ) - - floor_x = 86.0 - mean_x = 116.0 - axes = ( - Axes( - x_range=[80, 160, 10], - y_range=[0.0, 0.04, 0.01], - x_length=7.0, - y_length=3.3, - tips=False, - axis_config={"stroke_width": 2, "color": AXIS_INK}, - ) - .to_edge(RIGHT) - .shift(DOWN * 0.2) - ) - density = axes.plot( - lambda x: normal_pdf(x, mean_x, 12.0), - x_range=[80, 160], - color=BLUE_D, - stroke_width=6, - ) - floor_line = Line( - axes.c2p(floor_x, 0.0), - axes.c2p(floor_x, 0.036), - color=ORANGE, - stroke_width=4, - ) - mean_line = Line( - axes.c2p(mean_x, 0.0), - axes.c2p(mean_x, 0.036), - color=GREEN_C, - stroke_width=4, - ) - floor_tag = ( - MathTex(r"\underline p", color=ORANGE) - .scale(0.72) - .next_to(floor_line, UP, buff=0.06) - ) - mean_tag = ( - MathTex(r"\mathbb{E}[P]", color=GREEN_C) - .scale(0.72) - .next_to(mean_line, UP, buff=0.06) - ) - coi_span = Line( - axes.c2p(floor_x, 0.032), - axes.c2p(mean_x, 0.032), - color=HIGHLIGHT, - 
stroke_width=6, - ) - coi_tag = Text( - "average information rent", font_size=18, color=HIGHLIGHT - ).next_to(coi_span, UP, buff=0.05) - - chart = VGroup( - axes, - density, - floor_line, - mean_line, - floor_tag, - mean_tag, - coi_span, - coi_tag, - ) - - self.play(FadeIn(axes), FadeIn(density)) - self.play( - FadeIn(floor_line), FadeIn(mean_line), FadeIn(floor_tag), FadeIn(mean_tag) - ) - self.play(FadeIn(coi_span), FadeIn(coi_tag)) - self.play( - FadeOut(setup, shift=LEFT * 0.15), - chart.animate.scale(0.82).to_edge(RIGHT).shift(UP * 0.6), - ) - - coi_left = MathTex(r"\mathrm{COI}:=\mathbb{E}[", font_size=42) - coi_mid = MathTex(r"M", font_size=42) - coi_right = MathTex(r"]", font_size=42) - coi_eq = VGroup(coi_left, coi_mid, coi_right).arrange(RIGHT, buff=0.04) - coi_eq.to_edge(LEFT).shift(UP * 0.45) - - self.play(Write(coi_left), FadeIn(coi_mid, shift=UP * 0.05), Write(coi_right)) - - expanded_mid = MathTex(r"P-\underline p", font_size=42) - expanded_mid.move_to(coi_mid, aligned_edge=LEFT) - self.play( - Transform(coi_mid, expanded_mid), - coi_right.animate.next_to(coi_mid, RIGHT, buff=0.04), - ) - self.play(coi_eq.animate.set_color(HIGHLIGHT)) - - survival = MathTex( - r"\mathrm{COI}=\int_{\underline p}^{\bar p}(1-F_\pi(p))\,dp", - font_size=33, - color=GREY_B, - ).next_to(coi_eq, DOWN, aligned_edge=LEFT, buff=0.2) - self.play(Write(survival)) - - identity_1 = MathTex( - r"\mathbb E[X]=\int_0^{\infty}\mathbb P(X>u)\,du\quad (X\ge 0)", - font_size=31, - color=GREY_B, - ).next_to(survival, DOWN, aligned_edge=LEFT, buff=0.2) - identity_2 = MathTex( - r"X=P-\underline p,\;u=p-\underline p\Rightarrow\int_{\underline p}^{\bar p}(1-F_\pi(p))\,dp", - font_size=31, - color=GREY_B, - ).next_to(identity_1, DOWN, aligned_edge=LEFT, buff=0.14) - self.play(Write(identity_1)) - self.play(Write(identity_2)) - self.wait(1.0) - - -class COIOrderStatisticProofScene(Scene): - def construct(self) -> None: - title = scene_title("Why COI Erodes with Agent Saturation") - 
self.play(Write(title)) - - key = MathTex(r"p_{(1)}=\min(p_1,\ldots,p_N)", font_size=42, color=HIGHLIGHT) - key.next_to(title, DOWN, buff=0.35) - self.play(Write(key)) - - number_line = NumberLine( - x_range=[P_MIN, P_MAX, 10], - length=9.8, - color=AXIS_INK, - include_numbers=True, - decimal_number_config={"num_decimal_places": 0, "color": INK}, - ).shift(DOWN * 1.5) - floor_marker = Line( - number_line.n2p(P_MIN), - number_line.n2p(P_MIN) + UP * 0.85, - color=ORANGE, - stroke_width=5, - ) - floor_label = MathTex(r"\underline p", color=ORANGE).next_to( - floor_marker, UP, buff=0.05 - ) - self.play(FadeIn(number_line), FadeIn(floor_marker), FadeIn(floor_label)) - - rng = np.random.default_rng(17) - current_group: VGroup | None = None - current_info: VGroup | None = None - - for n in [1, 3, 8, 20]: - draws = np.sort(rng.beta(2.4, 2.1, size=n) * (P_MAX - P_MIN) + P_MIN) - dots = VGroup( - *[ - Dot(number_line.n2p(float(v)), radius=0.06, color=BLUE_D) - for v in draws - ] - ) - min_dot = Dot(number_line.n2p(float(draws[0])), radius=0.09, color=RED_C) - min_tag = ( - MathTex(r"p_{(1)}", color=RED_C) - .scale(0.65) - .next_to(min_dot, UP, buff=0.08) - ) - step_group = VGroup(dots, min_dot, min_tag) - - info = VGroup( - Text(f"N = {n}", font_size=28), - Text(f"min observed = {draws[0]:.2f}", font_size=24), - ).arrange(DOWN, aligned_edge=LEFT, buff=0.12) - info.to_edge(LEFT).shift(UP * 0.55) - info_box = VGroup(SurroundingRectangle(info, color=GREY_B, buff=0.18), info) - - if current_group is None: - self.play(FadeIn(step_group), FadeIn(info_box)) - else: - self.play( - FadeOut(current_group), - FadeOut(current_info), - FadeIn(step_group), - FadeIn(info_box), - ) - current_group = step_group - current_info = info_box - self.wait(0.4) - - p1 = MathTex( - r"\mathbb{P}(p_{(1)}>t)=\mathbb{P}(p_1>t,\ldots,p_N>t)", font_size=36 - ) - p2 = MathTex(r"\mathbb{P}(p_{(1)}>t)=[1-F(t)]^N", font_size=42, color=HIGHLIGHT) - prob_group = VGroup(p1, p2).arrange(DOWN, aligned_edge=LEFT, 
buff=0.16) - prob_group.to_edge(RIGHT).shift(UP * 0.75) - - self.play(Write(p1)) - self.play(Write(p2)) - - cleanup_items: list = [key, number_line, floor_marker, floor_label] - if current_group is not None: - cleanup_items.append(current_group) - if current_info is not None: - cleanup_items.append(current_info) - self.play( - FadeOut(VGroup(*cleanup_items), shift=DOWN * 0.12), - prob_group.animate.shift(UP * 0.26), - ) - - tail_axes = ( - Axes( - x_range=[0, 1, 0.2], - y_range=[0, 1, 0.2], - x_length=4.1, - y_length=2.45, - tips=False, - axis_config={"stroke_width": 2, "color": AXIS_INK}, - ) - .to_edge(RIGHT) - .shift(DOWN * 1.0 + LEFT * 0.2) - ) - curve_1 = tail_axes.plot( - lambda x: (1 - x) ** 1, x_range=[0, 1], color=BLUE_D, stroke_width=4 - ) - curve_4 = tail_axes.plot( - lambda x: (1 - x) ** 4, x_range=[0, 1], color=GREEN_C, stroke_width=4 - ) - curve_16 = tail_axes.plot( - lambda x: (1 - x) ** 16, x_range=[0, 1], color=RED_C, stroke_width=4 - ) - c_labels = VGroup( - Text("N=1", font_size=18, color=BLUE_D), - Text("N=4", font_size=18, color=GREEN_C), - Text("N=16", font_size=18, color=RED_C), - ).arrange(DOWN, aligned_edge=LEFT, buff=0.08) - c_labels.next_to(tail_axes, UP, buff=0.08).align_to(tail_axes, RIGHT) - tail_x = MathTex(r"F(t)", font_size=24).next_to(tail_axes, DOWN, buff=0.05) - tail_y = MathTex(r"[1-F(t)]^N", font_size=24).next_to( - tail_axes, LEFT, buff=0.05 - ) - - self.play(FadeIn(tail_axes), Create(curve_1), Create(curve_4), Create(curve_16)) - self.play(FadeIn(c_labels), FadeIn(tail_x), FadeIn(tail_y)) - - e1 = MathTex( - r"\mathbb{E}[p_{(1)}]=\underline p+\int_{\underline p}^{\bar p}[1-F(t)]^N\,dt", - font_size=32, - ) - e2 = MathTex( - r"X:=p_{(1)}-\underline p\ge 0,\quad \mathbb E[X]=\int_0^{\infty}\mathbb P(X>u)\,du", - font_size=27, - color=GREY_B, - ) - e3 = MathTex( - r"\mathbb P(X>u)=\mathbb P\!\left(p_{(1)}>\underline p+u\right)=[1-F(\underline p+u)]^N", - font_size=27, - color=GREY_B, - ) - e4 = MathTex( - 
r"0\le[1-F(t)]^N\le1,\quad [1-F(t)]^N\to0\ \text{for } t>\underline p", - font_size=27, - color=GREY_B, - ) - e5 = MathTex( - r"\Rightarrow\ \lim_{N\to\infty}(\mathbb{E}[p_{(1)}]-\underline p)=0", - font_size=38, - color=HIGHLIGHT, - ) - proof_block = VGroup(e1, e2, e3, e4, e5).arrange( - DOWN, aligned_edge=LEFT, buff=0.12 - ) - proof_block.to_edge(LEFT).shift(UP * 0.45) - self.play(Write(e1)) - self.play(Write(e2)) - self.play(Write(e3)) - self.play(Write(e4)) - self.play(Write(e5)) - - conclusion = Text( - "As independent query count grows, realizable markup collapses.", - font_size=24, - color=GREY_B, - ) - conclusion.to_edge(DOWN) - self.play(FadeIn(conclusion, shift=UP * 0.1)) - self.wait(1.1) - - -class BehaviorKernelConstructionScene(Scene): - def construct(self) -> None: - title = scene_title("From Session Paths to Transition Kernels") - self.play(Write(title)) - - traj_h = Text( - "human: start -> view -> detail -> cart -> purchase", - font_size=26, - color=GREEN_C, - ) - traj_a = Text( - "agent: start -> view -> detail -> view -> detail", - font_size=26, - color=RED_C, - ) - trajectories = VGroup(traj_h, traj_a).arrange( - DOWN, aligned_edge=LEFT, buff=0.16 - ) - trajectories.next_to(title, DOWN, buff=0.45).align_to(title, LEFT) - self.play( - LaggedStart( - *[FadeIn(t, shift=RIGHT * 0.2) for t in trajectories], lag_ratio=0.25 - ) - ) - - mle = MathTex( - r"\hat P(s'\mid s)=\frac{N(s,s')}{\sum_k N(s,k)}", - font_size=40, - color=HIGHLIGHT, - ) - mle.next_to(trajectories, DOWN, aligned_edge=LEFT, buff=0.28) - self.play(Write(mle)) - - counts = to_matrix( - ( - (0.00, 8.00, 0.00, 0.00), - (0.00, 2.00, 5.00, 1.00), - (0.00, 3.00, 2.00, 4.00), - (0.00, 1.00, 0.00, 6.00), - ), - "transition counts N(s,s')", - color=BLUE_D, - ) - probs = to_matrix( - ( - (0.00, 1.00, 0.00, 0.00), - (0.00, 0.25, 0.62, 0.13), - (0.00, 0.33, 0.22, 0.45), - (0.00, 0.14, 0.00, 0.86), - ), - "normalized kernel T", - color=GREEN_C, - header_buff=0.4, - ) - mats = ( - VGroup(counts, 
probs) - .arrange(RIGHT, buff=0.95) - .scale(0.92) - .to_edge(DOWN) - .shift(UP * 0.34) - ) - arrow = Arrow(counts.get_right(), probs.get_left(), buff=0.18, stroke_width=4) - arrow_tag = Text("row normalize", font_size=18, color=GREY_B).next_to( - arrow, UP, buff=0.08 - ) - kernel_arrow = Arrow( - mle.get_bottom(), - mats.get_top() + UP * 0.05, - buff=0.1, - color=GREY_B, - stroke_width=3.2, - ) - self.play( - FadeIn(mats, shift=UP * 0.12), - FadeIn(arrow), - FadeIn(arrow_tag), - FadeIn(kernel_arrow, shift=DOWN * 0.06), - ) - self.play( - FadeOut(mle, shift=UP * 0.08), - FadeOut(kernel_arrow, shift=DOWN * 0.08), - ) - - note = Text( - "Kernel shape is the compact behavioral signature used downstream.", - font_size=21, - color=GREY_B, - ) - note.next_to(mats, DOWN, buff=0.16) - self.play(FadeIn(note, shift=UP * 0.1)) - self.wait(1.0) - - -class SeparabilitySignalScene(Scene): - def construct(self) -> None: - title = Text( - "Separability into a Control Signal", - font_size=40, - weight="BOLD", - color=INK, - ).to_edge(UP, buff=0.18) - self.play(Write(title)) - - human = to_matrix( - ( - (0.05, 0.70, 0.20, 0.05), - (0.05, 0.20, 0.60, 0.15), - (0.10, 0.25, 0.30, 0.35), - (0.00, 0.00, 0.00, 1.00), - ), - "human centroid T_H", - color=GREEN_C, - ) - agent = to_matrix( - ( - (0.03, 0.82, 0.12, 0.03), - (0.06, 0.55, 0.21, 0.18), - (0.08, 0.48, 0.14, 0.30), - (0.00, 0.00, 0.00, 1.00), - ), - "agent centroid T_A", - color=RED_C, - ) - kernels = VGroup(human, agent).arrange(RIGHT, buff=0.95).shift(UP * 0.45) - self.play(FadeIn(kernels, shift=UP * 0.15)) - - self.play( - kernels.animate.scale(0.6) - .arrange(DOWN, aligned_edge=LEFT, buff=0.24) - .to_edge(LEFT) - .shift(UP * 0.18) - ) - - d_h = MathTex(r"\Delta_H=D_{KL}(\hat T'\parallel\bar T_H)", font_size=36) - d_a = MathTex(r"\Delta_A=D_{KL}(\hat T'\parallel\bar T_A)", font_size=36) - gap = MathTex(r"g=\Delta_H-\Delta_A", font_size=44, color=HIGHLIGHT) - alpha = MathTex(r"\hat\alpha(\tau')=\sigma(\beta g)", font_size=40) - 
eqs = VGroup(d_h, d_a, gap, alpha).arrange(DOWN, aligned_edge=LEFT, buff=0.2) - eqs.to_edge(RIGHT).shift(UP * 0.38) - self.play(LaggedStart(*[Write(eq) for eq in eqs], lag_ratio=0.18)) - - self.play( - eqs.animate.scale(0.66).next_to(kernels, DOWN, aligned_edge=LEFT, buff=0.16) - ) - - mu_h, sigma_h = -3.35, 2.67 - mu_a, sigma_a = 1.65, 2.83 - axis = ( - Axes( - x_range=[-10, 10, 2], - y_range=[0.0, 0.18, 0.03], - x_length=6.8, - y_length=3.7, - tips=False, - axis_config={"stroke_width": 2, "color": AXIS_INK}, - ) - .to_edge(RIGHT) - .shift(DOWN * 0.75 + LEFT * 0.15) - ) - x_tag = MathTex(r"g=\Delta_H-\Delta_A", font_size=30).next_to( - axis, DOWN, buff=0.15 - ) - - human_curve = axis.plot( - lambda x: normal_pdf(x, mu_h, sigma_h), - x_range=[-10, 10], - color=BLUE_D, - stroke_width=6, - ) - agent_curve = axis.plot( - lambda x: normal_pdf(x, mu_a, sigma_a), - x_range=[-10, 10], - color=RED_C, - stroke_width=6, - ) - h_label = Text("human", font_size=22, color=BLUE_D).move_to( - axis.c2p(-6.4, 0.108) - ) - a_label = Text("agent", font_size=22, color=RED_C).move_to(axis.c2p(5.8, 0.095)) - - boundary = DashedLine( - axis.c2p(0.0, 0.0), axis.c2p(0.0, 0.165), color=GREY_B, stroke_width=2 - ) - boundary_tag = Text("decision boundary", font_size=17, color=GREY_B).next_to( - boundary, UP, buff=0.08 - ) - boundary_tag.shift(RIGHT * 0.8) - - g_obs = 1.6 - g_line = Line( - axis.c2p(g_obs, 0.0), - axis.c2p(g_obs, 0.145), - color=HIGHLIGHT, - stroke_width=4, - ) - g_dot = Dot(axis.c2p(g_obs, 0.145), color=HIGHLIGHT, radius=0.06) - g_tag = ( - MathTex(r"g_{obs}", color=HIGHLIGHT) - .scale(0.72) - .next_to(g_dot, UP, buff=0.04) - ) - - self.play(FadeIn(axis), FadeIn(x_tag)) - self.play(Create(human_curve), Create(agent_curve)) - self.play( - FadeIn(h_label), FadeIn(a_label), FadeIn(boundary), FadeIn(boundary_tag) - ) - self.play(FadeIn(g_line), FadeIn(g_dot), FadeIn(g_tag)) - - hint = Text( - "Positive gap shifts score toward agent traffic.", - font_size=20, - color=GREY_B, - ) - 
hint.next_to(x_tag, DOWN, buff=0.1) - hint.match_x(axis) - self.play(FadeIn(hint, shift=UP * 0.1)) - self.wait(1.0) - - -class ContaminationGeneratorScene(Scene): - def construct(self) -> None: - title = scene_title("Contamination Generator G(alpha)") - self.play(Write(title)) - - human_pool = card("labeled human sessions", color=BLUE_D, width=4.1) - agent_pool = card("synthetic agent sessions", color=RED_C, width=4.1) - mixed_pool = card("mixed batch for training", color=HIGHLIGHT, width=4.4) - - top = ( - VGroup(human_pool, agent_pool) - .arrange(RIGHT, buff=1.1) - .next_to(title, DOWN, buff=0.55) - ) - mixed_pool.next_to(top, DOWN, buff=1.25) - - a1 = Arrow( - human_pool.get_bottom(), - mixed_pool.get_top() + LEFT * 1.0, - buff=0.1, - stroke_width=4, - ) - a2 = Arrow( - agent_pool.get_bottom(), - mixed_pool.get_top() + RIGHT * 1.0, - buff=0.1, - stroke_width=4, - ) - - self.play(FadeIn(top, shift=UP * 0.12), FadeIn(mixed_pool, shift=UP * 0.12)) - self.play(FadeIn(a1), FadeIn(a2)) - - flow = VGroup(top, mixed_pool, a1, a2) - self.play(flow.animate.scale(0.68).to_edge(LEFT).shift(UP * 0.58)) - - alpha_tracker = ValueTracker(0.18) - bar_outline = Rectangle( - width=7.0, height=0.46, stroke_color=AXIS_INK, stroke_width=2 - ).move_to(RIGHT * 0.55 + DOWN * 0.12) - base_h = Rectangle( - width=7.0, height=0.4, stroke_width=0, fill_color=BLUE_D, fill_opacity=0.35 - ).move_to(bar_outline) - - def make_agent_fill() -> Rectangle: - width = max(0.02, 7.0 * alpha_tracker.get_value()) - rect = Rectangle( - width=width, - height=0.4, - stroke_width=0, - fill_color=RED_C, - fill_opacity=0.68, - ) - rect.move_to(bar_outline.get_right() + LEFT * (width / 2.0)) - return rect - - agent_fill = always_redraw(make_agent_fill) - alpha_label = Text("alpha =", font_size=24).next_to( - bar_outline, DOWN, buff=0.16 - ) - alpha_value = always_redraw( - lambda: DecimalNumber( - alpha_tracker.get_value(), - num_decimal_places=2, - font_size=28, - color=HIGHLIGHT, - ).next_to(alpha_label, 
RIGHT, buff=0.1) - ) - left_tag = Text("human share (1-alpha)", font_size=18, color=BLUE_D).next_to( - bar_outline, LEFT, buff=0.15 - ) - right_tag = Text("agent share (alpha)", font_size=18, color=RED_C).next_to( - bar_outline, RIGHT, buff=0.15 - ) - - self.play(FadeIn(bar_outline), FadeIn(base_h), FadeIn(agent_fill)) - self.play( - FadeIn(alpha_label), - FadeIn(alpha_value), - FadeIn(left_tag), - FadeIn(right_tag), - ) - - mix_eq = MathTex( - r"\hat Q(p\mid\tau')=(1-\alpha)\,\hat Q_H(p\mid\tau')+\alpha\,\hat Q_A(p\mid\tau')", - font_size=31, - ).next_to(bar_outline, DOWN, buff=0.45) - interval = MathTex( - r"\alpha\in[\alpha_0-\epsilon_\alpha,\,\alpha_0+\epsilon_\alpha]", - font_size=31, - color=GREY_B, - ) - interval.next_to(mix_eq, DOWN, buff=0.2) - self.play(Write(mix_eq), Write(interval)) - - self.play(alpha_tracker.animate.set_value(0.32), run_time=1.2) - self.play(alpha_tracker.animate.set_value(0.55), run_time=1.2) - self.play(alpha_tracker.animate.set_value(0.24), run_time=1.1) - self.wait(0.9) - - -class RobustControlScene(Scene): - def construct(self) -> None: - title = scene_title("Distributionally Robust Control Layer") - self.play(Write(title)) - - objective = MathTex( - r"\pi^*=\arg\max_\pi\min_{Q\in\mathcal U_\epsilon}\mathbb E_{d\sim Q}[R(p,d)-\lambda\,COI_{leak}(p,\tau') ]", - font_size=31, - ).next_to(title, DOWN, buff=0.4) - reward = MathTex( - r"r_t=R(p_t,d_t)-\lambda f(\tau_t')c_{info},\quad d_t\sim Q(\cdot\mid p_t,\tau_t')", - font_size=31, - color=HIGHLIGHT, - ) - reward.next_to(objective, DOWN, buff=0.25) - demand_link = MathTex( - r"\hat Q(p_t,\tau_t')=\mathbb E_Q[d_t\mid p_t,\tau_t']", - font_size=29, - color=GREY_B, - ).next_to(reward, DOWN, buff=0.16) - self.play(Write(objective), Write(reward), Write(demand_link)) - - plane = ( - Axes( - x_range=[-3, 3, 1], - y_range=[-3, 3, 1], - x_length=5.6, - y_length=5.6, - tips=False, - axis_config={"stroke_width": 1.8, "color": AXIS_INK}, - ) - .to_edge(LEFT) - .shift(DOWN * 0.55) - ) - center 
= Dot(plane.c2p(0, 0), color=BLUE_D, radius=0.08) - center_tag = ( - MathTex(r"\hat P_N", color=BLUE_D) - .scale(0.75) - .next_to(center, UP, buff=0.07) - ) - ball = Circle(radius=1.75, color=HIGHLIGHT, stroke_width=3).move_to(center) - ball_tag = ( - MathTex(r"\mathcal U_\epsilon", color=HIGHLIGHT) - .scale(0.72) - .next_to(ball, UP, buff=0.08) - ) - - q1 = Dot(plane.c2p(1.0, 0.7), color=GREEN_C) - q2 = Dot(plane.c2p(-1.2, 0.9), color=RED_C) - q3 = Dot(plane.c2p(0.3, -1.3), color=GREEN_C) - q4 = Dot(plane.c2p(-0.9, -0.6), color=GREEN_C) - q2_tag = Text("worst-case Q*", font_size=18, color=RED_C).next_to( - q2, UP, buff=0.07 - ) - - self.play(FadeIn(plane), FadeIn(center), FadeIn(center_tag)) - self.play(Create(ball), FadeIn(ball_tag)) - self.play( - LaggedStart(*[FadeIn(dot) for dot in [q1, q2, q3, q4]], lag_ratio=0.14) - ) - self.play(FadeIn(q2_tag, shift=UP * 0.08)) - - inner_step = card( - "inner min picks Q*", color=RED_C, width=4.6, height=0.9, font_size=20 - ) - demand_step = card( - "sample demand from Q*", color=ORANGE, width=4.6, height=0.9, font_size=20 - ) - update_step = card( - "outer max updates policy", - color=GREEN_C, - width=4.6, - height=0.9, - font_size=20, - ) - pipeline = ( - VGroup(inner_step, demand_step, update_step) - .arrange(DOWN, buff=0.32) - .to_edge(RIGHT) - .shift(DOWN * 0.95) - ) - chooser = Arrow( - q2.get_right() + RIGHT * 0.15, - inner_step.get_left(), - buff=0.08, - color=RED_C, - stroke_width=4, - ) - stage_arrow_1 = Arrow( - inner_step.get_bottom(), - demand_step.get_top(), - buff=0.08, - stroke_width=3.6, - ) - stage_arrow_2 = Arrow( - demand_step.get_bottom(), - update_step.get_top(), - buff=0.08, - stroke_width=3.6, - ) - feedback = CurvedArrow( - update_step.get_left() + DOWN * 0.12, - center.get_right() + UP * 0.15, - angle=0.92, - color=GREEN_C, - stroke_width=3.6, - ) - self.play(FadeIn(pipeline, shift=LEFT * 0.15)) - self.play(FadeIn(chooser)) - self.play(FadeIn(stage_arrow_1), FadeIn(stage_arrow_2)) - 
self.play(FadeIn(feedback)) - - note = Text( - "Reward is evaluated on demand drawn from Q*, then used for the policy step.", - font_size=22, - color=GREY_B, - ) - note.to_edge(DOWN) - self.play(FadeIn(note, shift=UP * 0.1)) - self.wait(1.0) - - -class SystemLoopScene(Scene): - def construct(self) -> None: - title = scene_title("Online + Offline Defense Loop") - self.play(Write(title)) - - web = card("Web app", color=BLUE_D, width=2.9) - provider = card("Pricing provider", color=BLUE_D, width=3.5) - kafka = card("Kafka streams", color=HIGHLIGHT, width=3.1) - kernels = card("Kernel + KL estimator", color=GREEN_C, width=3.9) - generator = card("Generator G(alpha)", color=GREEN_C, width=3.5) - policy = card("DR-RL trainer", color=ORANGE, width=3.0) - - web.move_to(LEFT * 4.6 + UP * 1.35) - provider.move_to(RIGHT * 4.2 + UP * 1.35) - kafka.move_to(LEFT * 4.6 + DOWN * 1.1) - kernels.move_to(LEFT * 1.3 + DOWN * 1.1) - generator.move_to(RIGHT * 2.0 + DOWN * 1.1) - policy.move_to(RIGHT * 5.1 + DOWN * 1.1) - - online_tag = Text("online serving", font_size=22, weight="BOLD", color=GREY_B) - online_tag.next_to(web, UP, buff=0.38).align_to(web, LEFT) - offline_tag = Text( - "offline defense training", font_size=22, weight="BOLD", color=GREY_B - ) - offline_tag.next_to(kafka, UP, buff=0.38).align_to(kafka, LEFT) - - request_arrow = CurvedArrow( - web.get_right() + UP * 0.2, - provider.get_left() + UP * 0.2, - angle=-0.24, - stroke_width=4, - ) - response_arrow = CurvedArrow( - provider.get_left() + DOWN * 0.2, - web.get_right() + DOWN * 0.2, - angle=-0.24, - stroke_width=4, - ) - log_arrow = Arrow(web.get_bottom(), kafka.get_top(), buff=0.08, stroke_width=4) - k_to_kl = Arrow(kafka.get_right(), kernels.get_left(), buff=0.1, stroke_width=4) - kl_to_g = Arrow( - kernels.get_right(), generator.get_left(), buff=0.1, stroke_width=4 - ) - g_to_pi = Arrow( - generator.get_right(), policy.get_left(), buff=0.1, stroke_width=4 - ) - pi_to_provider = Arrow( - policy.get_top(), 
provider.get_bottom(), buff=0.08, stroke_width=4 - ) - - nodes = VGroup(web, provider, kafka, kernels, generator, policy) - self.play( - FadeIn(online_tag, shift=UP * 0.08), FadeIn(offline_tag, shift=UP * 0.08) - ) - self.play( - LaggedStart( - *[FadeIn(node, shift=UP * 0.08) for node in nodes], lag_ratio=0.12 - ) - ) - self.play( - LaggedStart( - *[ - FadeIn(a) - for a in [ - request_arrow, - response_arrow, - log_arrow, - k_to_kl, - kl_to_g, - g_to_pi, - pi_to_provider, - ] - ], - lag_ratio=0.08, - ) - ) - - labels = VGroup( - Text("request quote", font_size=17).next_to(request_arrow, UP, buff=0.06), - Text("serve price", font_size=17).next_to(response_arrow, DOWN, buff=0.06), - Text("events + quote logs", font_size=17).next_to( - log_arrow, RIGHT, buff=0.08 - ), - Text("fit kernels + alpha", font_size=17).next_to(kl_to_g, UP, buff=0.08), - Text("robust policy train", font_size=17).next_to(g_to_pi, UP, buff=0.08), - Text("publish model", font_size=17).next_to( - pi_to_provider, RIGHT, buff=0.08 - ), - ) - self.play(LaggedStart(*[FadeIn(l) for l in labels], lag_ratio=0.15)) - self.wait(1.0) - - -class ObjectiveAndResultsScene(Scene): - def construct(self) -> None: - title = scene_title("Early Experimental Signal") - self.play(Write(title)) - - objective_chart = BarChart( - values=[3.41, 3.91], - bar_names=["robust", "non-robust"], - y_range=[0, 5, 1], - y_length=2.9, - x_length=4.8, - bar_colors=[GREEN_C, RED_C], - ) - objective_label = Text("objective (x1e5)", font_size=21).next_to( - objective_chart, UP, buff=0.1 - ) - - revenue_chart = BarChart( - values=[3.80, 4.18], - bar_names=["robust", "non-robust"], - y_range=[0, 5, 1], - y_length=2.9, - x_length=4.8, - bar_colors=[GREEN_C, RED_C], - ) - revenue_label = Text("revenue (x1e5)", font_size=21).next_to( - revenue_chart, UP, buff=0.1 - ) - - charts = VGroup( - VGroup(objective_label, objective_chart), - VGroup(revenue_label, revenue_chart), - ).arrange(RIGHT, buff=0.85) - charts.next_to(title, DOWN, buff=0.7) - 
self.play(FadeIn(charts, shift=UP * 0.2)) - - pairwise = VGroup( - Text("pairwise win counts", font_size=24, weight="BOLD"), - Text("objective: robust beats baseline in 13 / 40", font_size=22), - Text("revenue: robust beats baseline in 16 / 40", font_size=22), - ).arrange(DOWN, aligned_edge=LEFT, buff=0.13) - pairwise.next_to(charts, DOWN, buff=0.35) - self.play( - LaggedStart( - *[FadeIn(row, shift=RIGHT * 0.15) for row in pairwise], lag_ratio=0.18 - ) - ) - - caution = Text( - "Interpretation: defense effect is real but regime-dependent and needs calibration.", - font_size=22, - color=GREY_B, - ).to_edge(DOWN) - self.play(FadeIn(caution, shift=UP * 0.1)) - self.wait(1.1) - - -class TakeawayScene(Scene): - def construct(self) -> None: - title = scene_title("Takeaways") - self.play(Write(title)) - - bullets = VGroup( - Text("COI gives a clean monetary KPI for pricing power.", font_size=32), - Text( - "Behavioral KL separability becomes a live control signal.", - font_size=32, - ), - Text( - "DR-RL with ambiguity sets protects against contamination shift.", - font_size=32, - ), - ).arrange(DOWN, aligned_edge=LEFT, buff=0.32) - bullets.next_to(title, DOWN, buff=0.7).align_to(title, LEFT) - self.play( - LaggedStart( - *[FadeIn(item, shift=RIGHT * 0.2) for item in bullets], lag_ratio=0.2 - ) - ) - - final = Text( - "From mechanism failure to implementable defense loop.", - font_size=29, - color=HIGHLIGHT, - ) - final.to_edge(DOWN) - self.play(FadeIn(final, shift=UP * 0.1)) - self.wait(1.0) - - -class ThesisBannerPosterScene(Scene): - def construct(self) -> None: - title = Text("PHANTOM", font_size=72, weight="BOLD", color=INK).to_edge(UP) - subtitle = Text( - "Pricing Heuristics Against Non-human Transaction Orchestration", - font_size=24, - color=GREY_B, - ).next_to(title, DOWN, buff=0.05) - - coi_axes = Axes( - x_range=[0, 1, 0.2], - y_range=[0, 1, 0.2], - x_length=3.15, - y_length=1.75, - tips=False, - axis_config={"stroke_width": 1.8, "color": AXIS_INK}, - ) - 
coi_n1 = coi_axes.plot( - lambda x: (1 - x) ** 1, - x_range=[0, 1], - color=BLUE_D, - stroke_width=4, - ) - coi_n8 = coi_axes.plot( - lambda x: (1 - x) ** 8, - x_range=[0, 1], - color=ORANGE, - stroke_width=4, - ) - coi_hint = Text( - "Order-statistic tail compresses as query count grows", font_size=15 - ) - coi_hint.set_color(GREY_B).next_to(coi_axes, DOWN, buff=0.06) - coi_title = Text("1) COI erosion", font_size=23, weight="BOLD", color=ORANGE) - coi_body = VGroup(coi_axes, coi_n1, coi_n8, coi_hint) - coi_group = VGroup(coi_title, coi_body).arrange(DOWN, buff=0.08) - coi_frame = SurroundingRectangle(coi_group, color=ORANGE, buff=0.14) - coi_frame.set_fill(color=ORANGE, opacity=0.05) - coi_panel = VGroup(coi_frame, coi_group) - - gap_axes = Axes( - x_range=[-8, 8, 2], - y_range=[0.0, 0.2, 0.05], - x_length=3.15, - y_length=1.75, - tips=False, - axis_config={"stroke_width": 1.8, "color": AXIS_INK}, - ) - gap_h = gap_axes.plot( - lambda x: normal_pdf(x, -3.35, 2.67), - x_range=[-8, 8], - color=BLUE_D, - stroke_width=4, - ) - gap_a = gap_axes.plot( - lambda x: normal_pdf(x, 1.65, 2.83), - x_range=[-8, 8], - color=RED_C, - stroke_width=4, - ) - gap_boundary = DashedLine( - gap_axes.c2p(0, 0), - gap_axes.c2p(0, 0.17), - color=GREY_B, - stroke_width=2, - ) - gap_hint = Text( - "Gap score g = Delta_H - Delta_A drives alpha-hat", font_size=15 - ) - gap_hint.set_color(GREY_B).next_to(gap_axes, DOWN, buff=0.06) - gap_title = Text( - "2) Behavioral separability", font_size=23, weight="BOLD", color=GREEN_C - ) - gap_body = VGroup(gap_axes, gap_h, gap_a, gap_boundary, gap_hint) - gap_group = VGroup(gap_title, gap_body).arrange(DOWN, buff=0.08) - gap_frame = SurroundingRectangle(gap_group, color=GREEN_C, buff=0.14) - gap_frame.set_fill(color=GREEN_C, opacity=0.05) - gap_panel = VGroup(gap_frame, gap_group) - - ctrl_title = Text( - "3) Robust pricing control", font_size=23, weight="BOLD", color=HIGHLIGHT - ) - ctrl_signal = MathTex(r"\hat\alpha(\tau')=\sigma(\beta g)", 
font_size=31) - ctrl_policy = MathTex( - r"\pi^*=\arg\max_\pi\min_{Q\in\mathcal U_\epsilon}\mathbb E[r]", - font_size=29, - color=HIGHLIGHT, - ) - ctrl_steps = VGroup( - card( - "estimate contamination from behavior", - color=GREEN_C, - width=4.0, - height=0.72, - font_size=16, - ), - card( - "optimize price policy under uncertainty", - color=ORANGE, - width=4.0, - height=0.72, - font_size=16, - ), - ).arrange(DOWN, buff=0.18) - ctrl_arrow = Arrow( - ctrl_steps[0].get_bottom(), - ctrl_steps[1].get_top(), - buff=0.06, - color=AXIS_INK, - stroke_width=3, - ) - ctrl_body = VGroup(ctrl_signal, ctrl_policy, ctrl_steps, ctrl_arrow).arrange( - DOWN, buff=0.14 - ) - ctrl_group = VGroup(ctrl_title, ctrl_body).arrange(DOWN, buff=0.08) - ctrl_frame = SurroundingRectangle(ctrl_group, color=HIGHLIGHT, buff=0.14) - ctrl_frame.set_fill(color=HIGHLIGHT, opacity=0.05) - ctrl_panel = VGroup(ctrl_frame, ctrl_group) - - panels = VGroup(coi_panel, gap_panel, ctrl_panel).arrange(RIGHT, buff=0.3) - panels.scale(0.92).next_to(subtitle, DOWN, buff=0.28) - - web = card("web sessions", color=BLUE_D, width=2.2, height=0.7, font_size=17) - kafka = card( - "quote + event logs", color=HIGHLIGHT, width=2.6, height=0.7, font_size=17 - ) - kernel = card( - "transition kernels", color=GREEN_C, width=2.5, height=0.7, font_size=17 - ) - policy = card( - "robust policy", color=ORANGE, width=2.2, height=0.7, font_size=17 - ) - flow_nodes = VGroup(web, kafka, kernel, policy).arrange(RIGHT, buff=0.22) - flow_nodes.to_edge(DOWN, buff=0.52) - flow_arrows = VGroup( - Arrow(web.get_right(), kafka.get_left(), buff=0.05, stroke_width=2.8), - Arrow(kafka.get_right(), kernel.get_left(), buff=0.05, stroke_width=2.8), - Arrow(kernel.get_right(), policy.get_left(), buff=0.05, stroke_width=2.8), - ) - - status = VGroup( - Text("Mann-Whitney p = 0.0006", font_size=19, color=GREEN_C), - Text("Pairwise robust wins: 13/40 objective, 16/40 revenue", font_size=19), - ).arrange(DOWN, buff=0.06) - status[1].set_color(GREY_B) 
- status.next_to(flow_nodes, UP, buff=0.15) - - footer = Text( - "From mechanism failure to an implementable defense loop", - font_size=25, - color=HIGHLIGHT, - ).next_to(flow_nodes, DOWN, buff=0.13) - - self.add( - title, - subtitle, - panels, - flow_nodes, - flow_arrows, - status, - footer, - ) - self.wait(0.1) - - -SCENE_ORDER = [ - "DefenseOpening", - "CardMarketAnalogyScene", - "COIFirstPrinciplesScene", - "COIOrderStatisticProofScene", - "BehaviorKernelConstructionScene", - "SeparabilitySignalScene", - "ContaminationGeneratorScene", - "RobustControlScene", - "SystemLoopScene", - "ObjectiveAndResultsScene", - "TakeawayScene", -] - -POSTER_SCENES = ["ThesisBannerPosterScene"] - -AVAILABLE_SCENES = SCENE_ORDER + POSTER_SCENES diff --git a/paper/src/bib/references.bib b/paper/src/bib/references.bib index 38e953f..d9c6d21 100644 --- a/paper/src/bib/references.bib +++ b/paper/src/bib/references.bib @@ -630,3 +630,41 @@ Volume: 21}, note = {Publisher: Institute of Mathematical Statistics}, pages = {50 -- 60}, } + +@article{horace_he_and_thinking_machines_lab_defeating_2025, + title = {Defeating {Nondeterminism} in {LLM} {Inference}}, + url = {https://thinkingmachines.ai/blog/defeating-nondeterminism-in-llm-inference/}, + doi = {10.64434/tml.20250910}, + abstract = {Reproducibility is a bedrock of scientific progress. However, it’s remarkably difficult to get reproducible results out of large language models. +For example, you might observe that asking ChatGPT the same question multiple times provides different results. This by itself is not surprising, since getting a result from a language model involves “sampling”, a process that converts the language model’s output into a probability distribution and probabilistically selects a token. +What might be more surprising is that even when we adjust the temperature down to 0This means that the LLM always chooses the highest probability token, which is called greedy sampling. 
(thus making the sampling theoretically deterministic), LLM APIs are still not deterministic in practice (see past discussions here, here, or here). Even when running inference on your own hardware with an OSS inference library like vLLM or SGLang, sampling still isn’t deterministic (see here or here).}, + language = {en}, + urldate = {2026-03-10}, + journal = {Thinking Machines Lab: Connectionism}, + author = {{Horace He and Thinking Machines Lab}}, + year = {2025}, + file = {Snapshot:/home/velocitatem/Zotero/storage/U5JG4CNM/defeating-nondeterminism-in-llm-inference.html:text/html}, +} + +@misc{moritz_ray_2018, + title = {Ray: {A} {Distributed} {Framework} for {Emerging} {AI} {Applications}}, + shorttitle = {Ray}, + url = {http://arxiv.org/abs/1712.05889}, + doi = {10.48550/arXiv.1712.05889}, + abstract = {The next generation of AI applications will continuously interact with the environment and learn from these interactions. These applications impose new and demanding systems requirements, both in terms of performance and flexibility. In this paper, we consider these requirements and present Ray---a distributed system to address them. Ray implements a unified interface that can express both task-parallel and actor-based computations, supported by a single dynamic execution engine. To meet the performance requirements, Ray employs a distributed scheduler and a distributed and fault-tolerant store to manage the system's control state. In our experiments, we demonstrate scaling beyond 1.8 million tasks per second and better performance than existing specialized systems for several challenging reinforcement learning applications.}, + urldate = {2026-03-13}, + publisher = {arXiv}, + author = {Moritz, Philipp and Nishihara, Robert and Wang, Stephanie and Tumanov, Alexey and Liaw, Richard and Liang, Eric and Elibol, Melih and Yang, Zongheng and Paul, William and Jordan, Michael I. 
and Stoica, Ion}, + month = sep, + year = {2018}, + note = {arXiv:1712.05889 [cs]}, + keywords = {Computer Science - Machine Learning, Statistics - Machine Learning, Computer Science - Artificial Intelligence, Computer Science - Distributed, Parallel, and Cluster Computing}, + file = {Preprint PDF:/home/velocitatem/Zotero/storage/SUTDF5BP/Moritz et al. - 2018 - Ray A Distributed Framework for Emerging AI Applications.pdf:application/pdf;Snapshot:/home/velocitatem/Zotero/storage/5GV2DUAA/1712.html:text/html}, +} + +@misc{biewald_experiment_2020, + title = {Experiment {Tracking} with {Weights} and {Biases}}, + url = {https://www.wandb.com/}, + author = {Biewald, Lukas}, + year = {2020}, +} diff --git a/paper/src/chapters/01-intro.tex b/paper/src/chapters/01-intro.tex index 79e5f73..d66b0c2 100644 --- a/paper/src/chapters/01-intro.tex +++ b/paper/src/chapters/01-intro.tex @@ -8,9 +8,9 @@ \section{Introduction} -In this paper we present an exploration and defense against the presence of new commercial entities in digitally powered platforms, preserving market equilibrium in the age of AI. This research establishes the following contributions: definition and formalization of non-human transactors in e-commerce platforms, development of a testing-ground for capturing the behavioral essence of these transactors across a large variety of digital systems, construction of a discriminative model (to prove separability) as a strong learner for downstream mitigation of contamination by non-human entities, translation of such learned separability into existing dynamic pricing machine learning loops, and finally establishment of a high-level KPI-affecting causal effect and cost-saving framework for the future of internet commerce in the presence of such non-human learners. +In this paper we present an exploration and defense against the presence of new commercial entities in digitally powered platforms, preserving market equilibrium in the age of AI. 
This research establishes the following contributions: definition and formalization of non-human transactors in e-commerce platforms, development of a testing-ground for capturing the behavioral essence of these transactors across a large variety of digital systems, construction of a discriminative model (to prove distinguishability) as a strong learner for downstream mitigation of contamination by non-human entities, translation of such learned distinguishability into existing dynamic pricing machine learning loops, and finally establishment of a high-level KPI-affecting causal effect and cost-saving framework for the future of internet commerce in the presence of such non-human learners. -This research effort touches a large variety of domains, spanning behavioral economics for understanding the rationality of behavior as theorized by the concept of homo economicus, agent-based modeling to translate our learned separability into disjoint dynamic pricing systems, reinforcement learning which serves as the SOTA for price-learners, and dynamic pricing and market equilibrium theory to understand the risks of possible supra-competitive pricing phenomena in cases of adversarial pricing systems driving the market out of equilibrium. \footnote{Given the rapid evolution of the field we acknowledge all developments with a cutoff set at the date of March 1st 2026.} +This research effort touches a large variety of domains, spanning behavioral economics for understanding the rationality of behavior as theorized by the concept of homo economicus, agent-based modeling to translate our learned distinguishability into disjoint dynamic pricing systems, reinforcement learning which serves as the SOTA for price-learners, and dynamic pricing and market equilibrium theory to understand the risks of possible supra-competitive pricing phenomena in cases of adversarial pricing systems driving the market out of equilibrium. 
\footnote{Given the rapid evolution of the field we acknowledge all developments with a cutoff set at the date of March 1st 2026.} \subsection{Motivation and Market Context} @@ -30,7 +30,7 @@ We formally define interaction data as coming from some actor which can either b This dissertation is organized around one main research question and three supporting sub-questions: \begin{enumerate} \item[\textbf{Main RQ}] How can dynamic pricing systems preserve margin integrity when transaction orchestration is increasingly mediated by non-human agents? - \item[\textbf{SQ1}] \textit{Separability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting? + \item[\textbf{SQ1}] \textit{Distinguishability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting? \item[\textbf{SQ2}] \textit{Theoretical Impact}: What is the formal relationship between agent contamination levels and the erosion of pricing power in dynamic pricing systems? \item[\textbf{SQ3}] \textit{Robust Mitigation}: How can pricing policies be constructed to maintain margin integrity under unknown and non-stationary levels of agent contamination? \end{enumerate} @@ -64,4 +64,4 @@ Extract final result $r$ from terminal state\; \end{algorithm} -The previously described goal of separability allows us to formulate a task which entails taking raw interaction data for either actor and creating a composite demand estimate $\hat{q}$. We propose a robust optimization objective defined in our methodology, transforming the pricing problem into a form of Distributionally Robust Optimization \parencite{kuhn_distributionally_2025} where the learner must guard against adversarial contamination in observed demand distributors. 
In this setting we must learn to make decision that perform under the assumption of not having a single estimated probability distribution but under an ambiguity set of any distribution, of which we have limited information. In our case as stated is a mixture of distributions with a parameter which is unknown and non-stationary. +The previously described goal of distinguishability allows us to formulate a task which entails taking raw interaction data for either actor and creating a composite demand estimate $\hat{q}$. We propose a robust optimization objective defined in our methodology, transforming the pricing problem into a form of Distributionally Robust Optimization \parencite{kuhn_distributionally_2025} where the learner must guard against adversarial contamination in observed demand distributions. In this setting we must learn to make decisions that perform under the assumption of not having a single estimated probability distribution but under an ambiguity set of any distribution, of which we have limited information. In our case, as stated, this is a mixture of distributions with a parameter which is unknown and non-stationary. diff --git a/paper/src/chapters/02-literature-review.tex b/paper/src/chapters/02-literature-review.tex index 5e788ed..272ea4c 100644 --- a/paper/src/chapters/02-literature-review.tex +++ b/paper/src/chapters/02-literature-review.tex @@ -1,6 +1,6 @@ \section{Literature Review} -To better understand all wedges of the current works, we must start by exploring the nature of agents, agentic computer use and web automation, complementing that with economic reasoning and strategic interaction. The final surface to cover, leads us to data-driven dynamic pricing under uncertainty. The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on.
This latter case of agents shopping is currently pending legal action in the case of \textcite{noauthor_amazoncom_2026} which is currently being treated as a violation of the Computer Fraud and Abuse Act. The introduction of these mediating actor entities into economic systems, is further creating a threat of false-name bidding \parencite{yokoo_effect_2004}, which prior research has explored in a trading context. Other research on pseudonyms in dynamic systems, demonstrate whitewashing in AI agents which can ignore defensive mechanisms by re-entry with different identities \parencite{feldman_free-riding_2004}. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for separating non-human reconnaissance from genuine human demand expression and integrating that separation into pricing heuristics without degrading legitimate user experience (in our research tracked by the user-experience index). This gap, is what our contribution aims to address, particularly for the aforementioned stakeholder groups. +To better understand all wedges of the current works, we must start by exploring the nature of agents, agentic computer use and web automation, complementing that with economic reasoning and strategic interaction. The final surface to cover, leads us to data-driven dynamic pricing under uncertainty. The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on. This latter case of agents shopping is currently pending legal action in the case of \textcite{noauthor_amazoncom_2026} which is currently being treated as a violation of the Computer Fraud and Abuse Act. 
The introduction of these mediating actor entities into economic systems is further creating a threat of false-name bidding \parencite{yokoo_effect_2004}, which prior research has explored in a trading context. Other research on pseudonyms in dynamic systems demonstrates whitewashing in AI agents which can ignore defensive mechanisms by re-entry with different identities \parencite{feldman_free-riding_2004}. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for distinguishing non-human reconnaissance from genuine human demand expression and integrating that distinguishability into pricing heuristics without degrading legitimate user experience (in our research tracked by the user-experience index). This gap is what our contribution aims to address, particularly for the aforementioned stakeholder groups. \subsection{Agent Taxonomy and Definitions} diff --git a/paper/src/chapters/03-methodology.tex b/paper/src/chapters/03-methodology.tex index 6c63f95..e07dcac 100644 --- a/paper/src/chapters/03-methodology.tex +++ b/paper/src/chapters/03-methodology.tex @@ -3,7 +3,7 @@ % Extra notes and clarifications: we observed some humans and get their transition probabilities between event types % We modify behavioral profiles of transition matrices with price elasticity matrices generated by sample valuations of a distributing. -This section details the theoretical and practical framework developed to address dynamic pricing under the influence of non-human actors. We begin by formalizing the problem environment and the nature of the actors. We then derive the \textit{Cost of Information} (COI) theorem, proving the erosion of pricing power in the limit of agent saturation. Following this, we outline our generative contamination strategy using GOFAI-driven separability and transition probability learning.
Finally, we formulate the robust control problem as a Stackelberg game solved via Distributionally Robust Reinforcement Learning (DR-RL) with constructed ambiguity sets. +This section details the theoretical and practical framework developed to address dynamic pricing under the influence of non-human actors. We begin by formalizing the problem environment and the nature of the actors. We then derive the \textit{Cost of Information} (COI) theorem, proving the erosion of pricing power in the limit of agent saturation. Following this, we outline our generative contamination strategy using GOFAI-driven distinguishability and transition probability learning. Finally, we formulate the robust control problem as a Stackelberg game solved via Distributionally Robust Reinforcement Learning (DR-RL) with constructed ambiguity sets. \subsection{Problem Formalization} @@ -40,6 +40,7 @@ We formalize the heterogeneity of actors by introducing a type space $\Theta$. A Q(p) = (1-\alpha) \cdot \mathbb{E}_{\theta \sim \mathcal{D}_H}[d(p; \theta)] + \alpha \cdot \mathbb{E}_{\theta \sim \mathcal{D}_A}[d(p; \theta)] + \epsilon_t \end{equation} where $\alpha \in [0, 1]$ represents the contamination parameter (proportion of agents) and $\epsilon_t$ is non-stationary market noise. +Accounting for behavioral and market variation, we also treat $\epsilon_t$ as absorbing serving-path variability from LLM infrastructure (e.g., batch-size-dependent inference behavior under changing load), which appears stochastic at the request level even under greedy decoding \parencite{horace_he_and_thinking_machines_lab_defeating_2025}. @@ -140,6 +141,8 @@ The architecture of this platform begins with the deployed web-apps posting inte \paragraph{Public Web Artifact} We transition the Kappa like architecture of the data collection to a Lambda architecture for actual learning in a surrogate environment. 
This allows us to move faster on data which is provided and helps us create a feedback loop for production deployment. To support further research in this intersection of fields we release P4P \footnote{\url{https://github.com/velocitatem/p4p}} as a public repository providing the interaction layer of the PHANTOM framework. This provides a configurable storefront which can be tailored to any commercial setting with a standardized session-level event tracking. We document the API adapters or what the framework expects in terms of schemas for pricing providers and log ingestion servicse. The repository is intended for controlled experimentation and method replication rather than production commerce deployment. +\paragraph{Public Dataset} For reproducibility of the behavioral analysis and distinguishability experiments, we also release the interaction dataset used in this thesis as \textit{WhoClickedIt}. The dataset is hosted on Hugging Face \footnote{\url{https://huggingface.co/datasets/velocitatem/whoclickedit}} and is distributed as one flattened event sheet (\texttt{whoclicked.csv}) with explicit labels (\texttt{actor\_type}, \texttt{is\_agent}, and \texttt{record\_type}). The associated dataset card specifies the schema, collection process, and known limitations; a full copy is included in Appendix~\ref{app:whoclicked_card}. + \subsubsection{DevOps Principles} @@ -182,13 +185,24 @@ Since users act with motivations, we define a pool of tasks (jobs to be done) an The task pool is stored as a structured table with fields \texttt{id}, \texttt{created\_at}, \texttt{task\_name}, \texttt{task\_description}, and \texttt{task\_def\_of\_done}. We formulate the tasks as compact jobs-to-be-done rather than as strict click scripts, because the target is to elicit realistic browsing and comparison behavior which can capture nuance of different people. 
In hotel mode the assigned tasks include \textit{Cheapest Room}, \textit{Cheapest Room w/ View}, \textit{MultiStep Cheapest Room}, \textit{The Digital Nomad (Executive)}, and \textit{The 3-Way Tradeoff (Desk + Quiet + Flexible)}. These prompts deliberately require critical thought in search, inspection of room details, comparison of amenities or images, return visits to the listing page, and a final booking decision which create a degree of cognitive load. In airline mode we use \textit{Last-Minute One-Way Flight}, where the actor must urgently travel to LAX from either SEA or JFK within the next 1--3 days, inspect at least a small set of candidate itineraries, and then book a reasonable earliest departure. A representative task is to find the cheapest feasible catalog item under explicit constraints while removing strict financial limits so we avoid trivial optimization behavior. Participants are also randomly assigned to one experimental platform mode (hotel or airline). Once assigned, they are dropped into the experiment with an actor ID. Under each experiment ID, we can observe multiple sessions across time and gather long interaction traces for the same actor. -The human data collection involved 18 participants, all of whom provided explicit informed consent prior to their session. Participants had an average age of 21 years and were recruited from a university population. Alongside the 18 human sessions we ran 18 agent sessions of equivalent task scope, giving a balanced dataset of 36 labeled trajectories. Each participant was assigned a single platform mode and a single task drawn from the pool, and completed the session independently without guidance on navigation or pricing strategy. +The human data collection involved 13 participants, all of whom provided explicit informed consent prior to their session. Participants had an average age of 21 years and were recruited from a university population. 
Alongside the 13 human sessions we ran 16 agent sessions of equivalent task scope, yielding 29 labeled trajectories in total (45\% human, 55\% agent). Each participant was assigned a single platform mode and a single task drawn from the pool, and completed the session independently without guidance on navigation or pricing strategy. To evaluate quality and realism of the setup, we store both structured event logs and full interaction transcripts. This lets us combine quantitative analysis with transcript-level qualitative findings. The result is an isolated system where we can control the interaction process while preserving realistic behavior. -Operationally, goals and experiment runs are tracked in PostgreSQL (goal table, run table, and assignment mapping). This data-acquisition phase is the first half of the methodology and is intentionally a disconnected component that feeds the later contributions. The second half uses collected behavioral traces to separate classes $\theta \in \{A,H\}$ with session-conditioned probability estimates, then injects those estimates into the pricing learner. +Operationally, goals and experiment runs are tracked in PostgreSQL (goal table, run table, and assignment mapping). This data-acquisition phase is the first half of the methodology and is intentionally a disconnected component that feeds the later contributions. The second half uses collected behavioral traces to distinguish classes $\theta \in \{A,H\}$ with session-conditioned probability estimates, then injects those estimates into the pricing learner. -Our process follows three stages: (1) observe and \textit{vectorize} behavioral interactions, (2) learn separability to characterize human versus agent patterns, and (3) use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator. 
+Our process follows three stages: (1) observe and \textit{vectorize} behavioral interactions, (2) learn distinguishability to characterize human versus agent patterns, and (3) use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator. + +Figure~\ref{fig:phantom_unified_architecture} summarizes the full mechanism from online interaction capture to divergence-based contamination scoring and robust control of pricing decisions. + +\begin{figure}[ht] + \centering + \resizebox{\textwidth}{!}{% + \input{chapters/hero_architecture_figure.tex} + } + \caption{Unified PHANTOM defense architecture. (a) Online serving and logging with behavioral and price-query streams. (b) Distinguishability layer that estimates KL divergence to human/agent prototypes and derives session-level contamination scores. (c) Distributionally robust pricing control that optimizes under an ambiguity set while penalizing COI leakage and tracking UX cost.} + \label{fig:phantom_unified_architecture} +\end{figure} \begin{figure}[ht] \resizebox{\columnwidth}{!}{% @@ -206,8 +220,8 @@ The dynamic pricing mechanism elicited immediate behavioral adjustments. Partici \subsubsection{Design of Training Factorial Study} -The simulator has multiple configurable factors. We design a multi-factor study across five axes derived from the sweep configurations: (1) RL algorithm (\texttt{ppo}, \texttt{a2c}, \texttt{dqn}, \texttt{qtable}; 4 levels), (2) contamination ratio $\alpha$ sampled from $[0.1, 0.6]$ at four representative levels, (3) robustness radius $\epsilon_\alpha \in \{0.0, 0.15, 0.3\}$ (3 levels), (4) COI penalty weight $\lambda_\text{coi}$ at two reference levels, and (5) pricing action granularity (two discretization settings for \texttt{action\_levels}); giving a grid of $4\times4\times3\times2\times2 = 192$ configurations. 
Statistical power for the behavioral comparisons is determined by a two-sample test over per-session KL divergence scores; a formal power analysis with minimum detectable effect size at $n=18+18$ is reported in the results. -% Power analysis plan: apply a two-sample Mann-Whitney U (or permutation test) on per-session (delta_H - delta_A) divergence scores comparing the human and agent groups. Compute minimum detectable effect size at alpha=0.05, power=0.8, given n=18 per group. Bootstrap confidence intervals on mean KL are a cleaner complement given the non-normality of divergence distributions. +The simulator has multiple configurable factors. We design a multi-factor study across five axes derived from the sweep configurations: (1) RL algorithm (\texttt{ppo}, \texttt{a2c}, \texttt{dqn}, \texttt{qtable}; 4 levels), (2) contamination ratio $\alpha$ sampled from $[0.1, 0.6]$ at four representative levels, (3) robustness radius $\epsilon_\alpha \in \{0.0, 0.15, 0.3\}$ (3 levels), (4) COI penalty weight $\lambda_\text{coi}$ at two reference levels, and (5) pricing action granularity (two discretization settings for \texttt{action\_levels}); giving a grid of $4\times4\times3\times2\times2 = 192$ configurations. Statistical power for the behavioral comparisons is determined by a two-sample test over per-session KL divergence scores; a formal power analysis with minimum detectable effect size at $n_H=13$, $n_A=16$ is reported in the results. +% Power analysis plan: apply a two-sample Mann-Whitney U (or permutation test) on per-session (delta_H - delta_A) divergence scores comparing the human and agent groups. Compute minimum detectable effect size at alpha=0.05, power=0.8, given n_H=13 and n_A=16. Bootstrap confidence intervals on mean KL are a cleaner complement given the non-normality of divergence distributions. While this scale is generally expensive for reinforcement learning, we execute it on a large TPU cluster to make the sweep tractable. 
Our training budget is provisioned through TPU Research Cloud and spans 384 chips across TPU v4, v5e, and v6e generations, with a spot-heavy allocation plus an on-demand reserve. At peak BF16 throughput this corresponds to approximately 160\,PFLOPS of aggregate compute (derivation in Appendix~\ref{app:compute_budget}), which makes repeated seeds, ablations, and sensitivity sweeps feasible within practical wall-clock limits. We allocate v6e capacity to the highest-intensity policy training jobs, use v5e for wider hyperparameter exploration where throughput-per-dollar is favorable, and reserve on-demand v4 capacity for runs that should not be interrupted. @@ -245,7 +259,8 @@ v4 & 64 (32 + 32) & us-central2-b & 32 Spot + 32 On-demand \\ \end{tabular} \end{table} -For connections from Madrid, we prioritize the europe-west4 allocation for latency-sensitive runs with the benefit of having the most grouped chips within a single region. This regional grouping is important for the deployment of our Kubernetes cluster which cannot span multiple regions. All sweep metadata, model checkpoints, and reward traces are logged in Weights \& Biases. Hardware specifications are from the official Google Cloud TPU documentation \parencite{noauthor_tpu_2026,noauthor_tpu_2025-1,noauthor_tpu_2025}. +For connections from Madrid, we prioritize the europe-west4 allocation for latency-sensitive runs with the benefit of having the most grouped chips within a single region. This regional grouping is important for the deployment of our Kubernetes cluster which cannot span multiple regions. All sweep metadata, model checkpoints, and reward traces are logged in Weights \& Biases. % TODO: cite this (from bib) +Hardware specifications are from the official Google Cloud TPU documentation \parencite{noauthor_tpu_2026,noauthor_tpu_2025-1,noauthor_tpu_2025}. 
Design of training processes: we build the Docker image with layer caching in mind and place the most volatile steps towards the end of the image build, so that rebuilds are as fast as possible. What this means in practice is that any dependency installations are isolated so edits to source code do not trigger rebuilds of those layers. Only if we update our entry point of training a sweep, Docker will also rebuild the source-code copy stage. @@ -294,15 +309,15 @@ In addition to behavioral events, the platform logs price observations to a sepa -\subsection{Generative Contamination and Separability} +\subsection{Generative Contamination and Distinguishability} To train a robust pricing learner, we need a simulator that can generate realistic interaction data under controlled contamination. We build this from Phantom data using a two-stage approach. -\subsubsection{Ground-Truth Separability} -Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels $\theta_s \in \{H, A\}$ are available as ground truth rather than as the output of a heuristic classifier. +\subsubsection{Ground-Truth Distinguishability} +Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels $\theta_s \in \{H, A\}$ are available as ground truth rather than as the output of a heuristic classifier.
We therefore estimate separate transition kernels directly from each labeled partition $\mathcal{D}_H$ and $\mathcal{D}_A$, treating the resulting $\hat{\mathcal{T}}_H$ and $\hat{\mathcal{T}}_A$ as the ground-truth behavioral profiles for each class. We then ask a direct methodological question: are the kernels distinguishable enough to justify downstream pricing control that depends on that distinguishability? -To answer this, we compute per-session KL divergence scores against both class-level centroids. For each session $s$ in either partition, we fit a session-level event transition kernel $\hat{\mathcal{T}}_s$ from that session's trajectory alone, then compute its average KL divergence to the human centroid ($\Delta_{H,s}$) and to the agent centroid ($\Delta_{A,s}$). The per-session separability score is the gap $\Delta_{H,s} - \Delta_{A,s}$: a negative value indicates proximity to human behavior, a positive value indicates proximity to agent behavior. +To answer this, we compute per-session KL divergence scores against both class-level centroids. For each session $s$ in either partition, we fit a session-level event transition kernel $\hat{\mathcal{T}}_s$ from that session's trajectory alone, then compute its average KL divergence to the human centroid ($\Delta_{H,s}$) and to the agent centroid ($\Delta_{A,s}$). The per-session distinguishability score is the gap $\Delta_{H,s} - \Delta_{A,s}$: a negative value indicates proximity to human behavior, a positive value indicates proximity to agent behavior. The normality assumption cannot be made for KL divergence distributions, which are right-skewed and bounded below by zero, so we do not use a Student's $t$-test. Instead we apply a Mann-Whitney $U$ test \parencite{mann_test_1947} on the per-session gap scores between the two groups. 
The Mann-Whitney test is a rank-based nonparametric test that compares the stochastic ordering of two independent samples without distributional assumptions, making it appropriate for small samples drawn from skewed populations. We report $U$, the exact two-sided $p$-value, and group-level descriptive statistics for the gap scores. @@ -387,8 +402,10 @@ The complete pricing-demand-trajectory loop is illustrated in Figure~\ref{fig:or \begin{figure}[ht] \centering -\[ -\text{Oracle}(\vec{p}_{t-1},\vec{\hat{q}})\to +{\setlength{\arraycolsep}{4pt}% +\resizebox{0.85\linewidth}{!}{$ +\begin{aligned} +&\text{Oracle}(\vec{p}_{t-1},\vec{\hat{q}})\to \begin{pmatrix} p_0\\ p_1\\ @@ -397,14 +414,15 @@ p_N \end{pmatrix} \underrightarrow{d_i \sim \mathcal{N}_{\vec{p}}} \begin{pmatrix}d_0\\ d_1\\ \cdots \\ d_N\end{pmatrix} -\underrightarrow{\vec{d}\times \tau_\theta \to \tau^\prime} +\underrightarrow{\vec{d}\otimes \tau_\theta} \begin{bmatrix} 0.01 & 0.02 & \cdots & 0.3 \\ 0.41 & 0.24 & \cdots & 0.0 \\ \cdots & \cdots & \cdots & \cdots \\ 0.51 & 0.09 & \cdots & 0.1 \\ \end{bmatrix} -\underrightarrow{\tau_k \sim \tau^\prime} +\\ +&\underrightarrow{\tau_k \sim \tau^\prime} \{\tau_k\}_{k=0}^K \to \hat{Q}(\tau_k) \to \begin{pmatrix} \hat{q}_0 \\ @@ -413,8 +431,10 @@ p_N \hat{q}_N \\ \end{pmatrix} \to \text{Oracle}(\cdot) -\] -\caption{Oracle-based pricing loop: historical price and demand state map to a new price vector; each product samples demand curves from $\mathcal{N}_{\vec{p}}$; trajectories are generated by mixing demand with behavioral kernels $\tau_\theta$ into transition matrix $\tau'$; sampled trajectories $\{\tau_k\}$ aggregate through proxy $Q(\cdot)$ to yield updated demand $\vec{\hat{q}}$, closing the feedback loop.} +\end{aligned} +$}% +} +\caption{Oracle-based pricing loop: historical price and demand state map to a new price vector; each product samples demand curves from $\mathcal{N}_{\vec{p}}$; trajectories are generated via the Kronecker product 
$\vec{d}\otimes\tau_\theta$ into transition matrix $\tau'$; sampled trajectories $\{\tau_k\}$ aggregate through proxy $Q(\cdot)$ to yield updated demand $\vec{\hat{q}}$, closing the feedback loop.} \label{fig:oracle_flow} \end{figure} @@ -461,7 +481,7 @@ We also consider taxation-like overlays for agent traffic under strategy-proof m \subsubsection{Pricing Mechanism Summary} -We now present the complete pricing mechanism that integrates the behavioral separability, contamination estimation, and robust optimization components developed in the preceding sections. Algorithm~\ref{alg:phantom_loop_clean} formalizes the defensive pricing loop as a Stackelberg game where the platform (leader) sets prices and the aggregate demand (follower) responds through observed session trajectories. +We now present the complete pricing mechanism that integrates the behavioral distinguishability, contamination estimation, and robust optimization components developed in the preceding sections. Algorithm~\ref{alg:phantom_loop_clean} formalizes the defensive pricing loop as a Stackelberg game where the platform (leader) sets prices and the aggregate demand (follower) responds through observed session trajectories. \begin{algorithm}[t] \caption{PHANTOM defensive pricing loop} @@ -494,3 +514,47 @@ We now present the complete pricing mechanism that integrates the behavioral sep The algorithm operates in discrete epochs indexed by $t$. At each epoch, the platform applies one discrete multiplicative price action, the environment samples a batch of sessions, and demand is recomputed from weighted events. Robustness is implemented as an inner minimization over a small local grid of contamination candidates around nominal $\alpha_0$, matching the current engine implementation. The history buffer $\mathcal{L}$ (``Limbo'' in our implementation) enforces the alternating Stackelberg structure by preserving the temporal sequence of price publications and demand observations. 
%The defensive price update in Line 24 implements contamination-aware margin shrinkage: as estimated contamination $\hat{\alpha}_t$ rises, the margin $(p^{\mathrm{ref}} - c)$ is reduced by factor $\kappa\in[0,1]$, with projection $\Pi_{\mathcal{P}}$ ensuring feasibility. In subsequent experiments this heuristic rule is replaced by DR-RL policy $\pi^*$ from Eq.~\ref{eq:robust_policy}. + +\subsection{Parallelization Strategy} + +To avoid preemption of compute mid-training we settle on using a 40-chip compute node of the v4 generation with 5 parallel workers. The login node creates an orchestration node with Ray \parencite{moritz_ray_2018} and we place one Ray compute node on each of the remaining workers. + +\subsubsection{Computational Cost Analysis of the Simulation Step} +The per-step cost of Algorithm~\ref{alg:phantom_loop_clean} is not uniform across its components. To inform hardware provisioning and to identify where algorithmic improvements are most impactful, we profile the hot path of the engine using Python's \texttt{cProfile} instrumentation over 20 environment steps under two configurations: a baseline with the robustness inner loop disabled ($K=1$, $\epsilon_\alpha=0$) and a standard robust setting ($K=5$, $\epsilon_\alpha=0.2$). Both runs use $M=10$ sessions per market call and $N=3$ products. + +The baseline achieves approximately 26 steps per second. Enabling the robustness inner loop with $K=5$ candidates drops throughput to 7.2 steps per second, a $3.6\times$ slowdown that grows roughly linearly with $K$, consistent with the $O(K)$ scaling of the adversarial alpha selection in the implementation. + +\begin{table}[ht] +\centering +\caption{Per-step profiling results (20 steps, $M=10$ sessions, $N=3$ products).
Self-time measures time spent inside the function excluding callees; cumulative time includes the full call subtree.} +\label{tab:profile_results} +\begingroup +\small +\setlength{\tabcolsep}{4pt} +\begin{tabular}{@{}lrrrr@{}} +\toprule +\textbf{Function} & \textbf{Calls} & \textbf{Self (ms)} & \textbf{Cum. (ms)} & \textbf{Cum. \%} \\ +\midrule +\multicolumn{5}{l}{\textit{Baseline ($K=1$, 0.77\,s total, 26 steps/s)}} \\ +\texttt{sample\_behavior\_from\_transitions} & 420 & 131 & 658 & 86\% \\ +\texttt{DataFrame.xs} & 4,820 & 30 & 201 & 26\% \\ +\texttt{numpy.nan\_to\_num} & 4,904 & 43 & 97 & 13\% \\ +\texttt{adjust\_behavior\_to\_condition} & 84 & 3 & 54 & 7\% \\ +\midrule +\multicolumn{5}{l}{\textit{Robust ($K=5$, 2.79\,s total, 7.2 steps/s)}} \\ +\texttt{sample\_behavior\_from\_transitions} & 1,220 & 519 & 2,447 & 88\% \\ +\texttt{DataFrame.xs} & 16,668 & 108 & 729 & 26\% \\ +\texttt{numpy.nan\_to\_num} & 16,912 & 164 & 363 & 13\% \\ +\texttt{adjust\_behavior\_to\_condition} & 244 & 11 & 108 & 4\% \\ +\bottomrule +\end{tabular} +\endgroup +\end{table} + +Across both configurations, \texttt{sample\_behavior\_from\_transitions} accounts for 86--88\% of total wall time. The function implements the Markov chain sampler described in Section~\ref{sec:tpe}: at each transition it retrieves the current-state row from the expanded transition \texttt{DataFrame} via label-based indexing, which internally dispatches through the pandas \texttt{xs} and \texttt{fast\_xs} code paths. For $M$ sessions each running up to $L_{\max}=40$ transitions, a single \texttt{market.act()} call issues up to $M \cdot L_{\max}$ individual row lookups. With $K=5$ robustness candidates per outer step this accumulates to $5 \times 10 \times 40 = 2{,}000$ row accesses per outer step, producing the 16k \texttt{xs} invocations observed in Table~\ref{tab:profile_results}. 
+ +The \texttt{numpy.nan\_to\_num} calls, accounting for 13\% of cumulative time, occur once per row lookup to sanitize sampled probability vectors before normalization; their call count therefore closely tracks the \texttt{xs} count. + +\texttt{adjust\_behavior\_to\_condition} expands the base $E \times E$ event transition matrix to a $(E \cdot N) \times (E \cdot N)$ product-specific matrix via a Kronecker product. At $N=3$ this is inexpensive, but the cost scales as $O(E^2 N^2)$, so at the $N=10$ default it becomes a more significant contributor. The result is not cached across the $K$ robustness candidates inside a single outer step, meaning the Kronecker expansion is recomputed $2K$ times per step (once for the human kernel and once for the agent kernel at each candidate $\alpha_k$). + +The dominant bottleneck therefore has a clear structural cause: the expanded transition matrix is a string-keyed \texttt{DataFrame}, and pandas object-level indexing carries substantial per-call overhead relative to the arithmetic being performed. Converting the expanded matrix to a \texttt{numpy} array with an accompanying integer state-to-index map, computed once per \texttt{market.act()} call and cached for the duration of the robustness inner loop, eliminates the entire pandas dispatch chain. We treat the identified bottleneck as an opportunity to narrow the gap left by the computational needs of the pricing learner. We use JAX to parallelize on the TPU and, surprisingly, obtain a large speedup even on CPU-only compute, improving throughput from 26 to 220 steps/s in the baseline configuration and from 7.2 to 136 steps/s under the full robust inner loop, an 8.5$\times$ and 19$\times$ speedup respectively.
diff --git a/paper/src/chapters/04-results.tex b/paper/src/chapters/04-results.tex index 478482f..f1e4f56 100644 --- a/paper/src/chapters/04-results.tex +++ b/paper/src/chapters/04-results.tex @@ -1,7 +1,7 @@ \section{Results} \begin{figure}[ht] \centering - \input{chapters/figures/supra.tex} + \input{chapters/figures/supra/supra.tex} \caption{Evolution of price distributions over experiment steps. The heatmap illustrates the density of price offerings. This is an early baseline simulation which demonstrates supra-competitive price-setting in deep learning agents such as SAC as can be clearly seen by the high density at the highest available price.} \label{fig:supra_heatmap} \end{figure} @@ -10,7 +10,7 @@ \subsection{Behavioral Analysis} -Separability between human and agent sessions is evaluated by computing per-session divergence gap scores $\Delta_{H,s} - \Delta_{A,s}$ and comparing the two groups with a Mann-Whitney $U$ test. Table~\ref{tab:divergence_significance} reports the group-level descriptive statistics for the gap scores and the test result. +Distinguishability between human and agent sessions is evaluated by computing per-session divergence gap scores $\Delta_{H,s} - \Delta_{A,s}$ and comparing the two groups with a Mann-Whitney $U$ test. The full recorded cohort contains $n_H=13$ human sessions and $n_A=16$ agent sessions, and Table~\ref{tab:divergence_significance} reports the corresponding group-level statistics and test result. 
\begin{table}[ht] \centering @@ -20,48 +20,67 @@ Separability between human and agent sessions is evaluated by computing per-sess \toprule Group & $n$ & Mean gap & Std \\ \midrule -Human sessions & 11 & $-3.3522$ & $2.6748$ \\ -Agent sessions & 6 & $+1.6482$ & $2.8349$ \\ +Human sessions & 13 & $-3.35$ & $2.67$ \\ +Agent sessions & 16 & $+1.65$ & $2.83$ \\ \midrule -\multicolumn{4}{l}{Mann-Whitney $U = 2.0$, $p = 0.0006$ (two-sided)} \\ +\multicolumn{4}{l}{Mann-Whitney two-sided test: $p<0.001$} \\ \bottomrule \end{tabular} \end{table} -The sign structure is consistent with the theoretical expectation: human sessions produce negative gap scores (closer to the human centroid, far from the agent centroid) while agent sessions produce positive gap scores (closer to the agent centroid). The two-sided $p$-value of $0.0006$ indicates near-complete rank separation between the groups at $n_H=11$, $n_A=6$, providing strong evidence that the transition kernels are separable enough to justify their use as a control signal in downstream pricing. +The sign structure is consistent with the theoretical expectation: human sessions produce negative gap scores (closer to the human centroid, far from the agent centroid) while agent sessions produce positive gap scores (closer to the agent centroid). The two-sided test result ($p<0.001$) at $n_H=13$, $n_A=16$ indicates strong rank distinction between groups, providing evidence that the transition kernels are distinguishable enough to justify their use as a control signal in downstream pricing. \subsection{Experimental Outcomes} -To evaluate robustness contributions, we compare two policies on the same environment family: (i) robust pricing with COI-aware reward and adversarial contamination step, and (ii) non-robust baseline with revenue-only reward (\texttt{--no-robust}). 
+To evaluate robustness contributions, we compare two policies on the same environment family: (i) robust pricing with COI-aware reward and adversarial contamination step, and (ii) a baseline policy with revenue-only reward. We report two preliminary stages before the full factorial interpretation. First, we executed a short calibration run at $\alpha=0.3$ (2 evaluation episodes, 3000 training timesteps per tier) across \texttt{qtable}, \texttt{ppo}, \texttt{a2c}, and \texttt{dqn}. In that first run, \texttt{ppo} produced the highest objective score and revenue (objective $=3.76\mathrm{e}5$, revenue $=4.15\mathrm{e}5$), while the remaining tiers stayed lower in this small-budget regime. The corresponding price traces show a monotone escalation for \texttt{ppo} (mean price from $8.61\mathrm{e}1$ to $1.49\mathrm{e}2$), whereas \texttt{qtable}, \texttt{a2c}, and \texttt{dqn} remained nearly flat over the episode horizon. This confirms that the simulation loop is able to express policy-dependent pricing dynamics rather than collapsing into a single trajectory shape. -Second, we launched an overnight paired benchmark over $\alpha \in \{0.00,0.15,0.30,0.45,0.60\}$ with 8 evaluation episodes and 8000 timesteps, comparing robust and non-robust settings at fixed seed/tier/contamination tuples. At the time of writing, two seeds (11 and 22) are complete and one additional seed is still running. We therefore frame the numbers below as an initial signal, not a final claim. 
-\begin{table}[ht] -\centering -\caption{Early overnight aggregate over completed seeds ($n=2$; seeds 11 and 22).} -\label{tab:pricing_benchmark} -\begin{tabular}{lcccc} -\toprule -Mode & Mean objective score & Mean revenue & Mean COI level & Mean margin \\ -\midrule -Robust & $3.41\mathrm{e}5$ & $3.80\mathrm{e}5$ & $1.08\mathrm{e}2$ & 0.901 \\ -Non-robust (\texttt{--no-robust}) & $3.91\mathrm{e}5$ & $4.18\mathrm{e}5$ & $1.11\mathrm{e}2$ & 0.906 \\ -\bottomrule -\end{tabular} -\end{table} +\subsubsection{The Impact of Contamination on Revenue} -At pair level (same seed, tier, and contamination), robust exceeds non-robust in $13/40$ configurations on objective score and in $16/40$ configurations on revenue. The current early evidence therefore suggests a conditional robustness effect: the defense is active and measurable, but not yet uniformly beneficial without further calibration. +A linear regression on run-level data ($n=95$) shows a strong negative association between contamination and mean revenue. The fitted model mapping $\alpha \to \text{revenue}$ results in $t(93)=-8.2148$, $p=1.20\times 10^{-12}$, $R^2=0.4205$, and a 95\% confidence interval for the slope of $[-75{,}288.76,\,-45{,}975.13]$. In practical terms, a $+0.1$ increase in $\alpha$ corresponds to an average decrease of about $6{,}063$ revenue units within our environment. +\subsubsection{Large Scale Factorial Training} + +In our complete training runs we logged $\approx 180$ days of net compute time.
The results we draw from extensive training are: +\begin{enumerate*}[label=(\roman*)] + \item the ability to extract COI is greater in the presence of robustness within the training loop; + \item short-term revenue measurements suffer a $\approx 3\%$ loss, but the COI margin compensates for this loss in the long run; + \item a larger catalog size contributes positively to COI preservation under higher contamination ratios; + \item supra-competitive pricing is a natural reward hacking tendency which is drastically reduced by a balanced UX penalty. +\end{enumerate*} + +\begin{figure}[ht] + \centering + \input{chapters/figures/results/includes/final/final_focus_revenue_by_alpha.tex} + \caption{Revenue curves by contamination for the final cohort. The baseline remains above the defended curve in most cells, but the gap narrows in the high-contamination region.} + \label{fig:final_focus_revenue_by_alpha} +\end{figure} +% TODO: we need a similar plot which shows the COI preserved (what we gain across the multiple contamination levels, showing that the robust method has better COI optimization). + +\begin{figure}[ht] + \centering + \input{chapters/figures/results/includes/final/final_focus_revenue_delta.tex} + \caption{Defended-minus-baseline revenue delta over contamination for the final cohort. The strongest high-contamination deviation begins at $\alpha=0.7$, followed by recovery toward near parity by $\alpha=1.0$.} + \label{fig:final_focus_revenue_delta} +\end{figure} + +\begin{figure}[ht] + \centering + \input{chapters/figures/results/includes/final/final_focus_risk_deltas.tex} + \caption{Defended-minus-baseline leakage and volatility deltas for the final cohort.
Leakage remains lower for the defended policy across the full contamination range.} + \label{fig:final_focus_risk_deltas} +\end{figure} \subsection{Interpretation and Insights} -The Mann-Whitney result ($U=2.0$, $p<0.001$) confirms that per-session divergence gaps separate the two actor classes with near-zero overlap in rank ordering. This is the condition required for separability to act as a useful control signal in the pricing loop rather than just an auxiliary classifier score. +The Mann-Whitney result ($p<0.001$) confirms that per-session divergence gaps distinguish the two actor classes with near-zero overlap in rank ordering. This is the condition required for distinguishability to act as a useful control signal in the pricing loop rather than just an auxiliary classifier score. -The first calibration and overnight runs additionally confirm three practical points aligned with the thesis mechanism. First, the control loop is reproducible end-to-end (training, evaluation, artifact generation) across algorithms and contamination levels. Second, policy class materially changes price trajectories and resulting COI/revenue profiles under identical environment settings. Third, objective improvements from robustness are regime-dependent in the current baseline, which is consistent with the thesis claim that contamination-aware pricing needs explicit calibration rather than a one-size-fits-all penalty. +The first calibration and paired benchmark runs additionally confirm three practical points aligned with the thesis. First, the control loop is reproducible end-to-end (training, evaluation, artifact generation) across algorithms and contamination levels. Second, policy class materially changes price trajectories and resulting COI/revenue profiles under identical environment settings. 
Third, objective improvements from robustness are regime-dependent in the current baseline, which is consistent with the thesis claim that contamination-aware pricing needs explicit calibration rather than a one-size-fits-all penalty. We also note that maximizing revenue in isolation can favor aggressive high-price behavior; even in these early runs, the non-robust aggregate shows slightly higher mean COI and margin. For this reason, all subsequent reporting in this thesis is interpreted on a multi-metric basis (objective, revenue, COI, and stability), and not by revenue alone. + \subsection{Anomalies} In our initial runs, we observed an instability pocket in one completed run (A2C, robust, seed 11, $\alpha=0.30$) with a large performance drop relative to neighboring configurations. We retain this run in the preliminary summary to avoid survivorship bias and treat it as evidence that robustness sensitivity analysis is necessary before final conclusions. diff --git a/paper/src/chapters/05-discussion.tex b/paper/src/chapters/05-discussion.tex index 51f6600..5b2512f 100644 --- a/paper/src/chapters/05-discussion.tex +++ b/paper/src/chapters/05-discussion.tex @@ -16,6 +16,4 @@ This technology does not come without a more bitter side, ethical concerns do ar With a system like this there is potential for strong drift given the rapid advance of agentic systems and user preference. Our intent behind adding the UX term into the reward shaping process was to further address the risk of degraded user experience. Looking deeper at the underlying methodology, reinforcement learning does not come without its complications such as reward hacking and often the lack of interpretability which is quite critical in systems that have a strong impact on the revenue of a company. -\subsection{Implications of Findings} - -Interpretation of results and altenrative scenarios with broader market implications.
+% \subsection{Implications of Findings} Interpretation of results and alternative scenarios with broader market implications. diff --git a/paper/src/chapters/06-conclusion.tex b/paper/src/chapters/06-conclusion.tex index a609531..a905bbb 100644 --- a/paper/src/chapters/06-conclusion.tex +++ b/paper/src/chapters/06-conclusion.tex @@ -1,11 +1,24 @@ \section{Conclusion} -For our troubles, we now conclude that... +Our research has explored how reinforcement learning works within pricing systems and environments which are substantially disrupted by an adversarial participant. Our findings include the optimization for our newly introduced metrics. \subsection{Summary of contributions} -The authors contribution was not without the advice of many experienced experts in the field. We thank Marco Casalaina VP Products, Core AI and AI Futurist at Microsoft for the initial critical discussion on the topic of dynamic pricing systems and the spark which has lead to this work. Eugene Bykovets, PhD pointing out the parallels in blockchain systems and the complexity of anonymous interaction and understanding of intent. Importantly, the contributions of Alberto Martín Izquierdo, my academic advisor for the support over and for taking on the challenge of this ambitious work. Many breakthroughs were thanks to numerous discussions with my peers on the topics covered here. +The contribution was not without the advice of many experienced experts in the field. We thank Marco Casalaina, VP Products, Core AI and AI Futurist at Microsoft, for the initial critical discussion on the topic of dynamic pricing systems and the spark which has led to this work. We thank Eugene Bykovets, PhD, for pointing out the parallels in blockchain systems and the complexity of anonymous interaction and understanding of intent. Importantly, we acknowledge the contributions of Alberto Martín Izquierdo, my academic advisor, and thank him for the support throughout and for taking on the challenge of this ambitious work.
Many breakthroughs were thanks to numerous discussions with my peers on the topics covered here. We also thank the head of innovation at Amadeus for insight into the industry split on the topic of collapsing margins. Finally, we acknowledge the power and use of generative AI technologies for in-depth research, rapid prototyping and surfacing of key topics and niches. +We now state explicitly what this paper contributes: +\begin{itemize} + \item TPU-accelerated parallelization of the behavioral simulation and reinforcement learning pipeline, making large-scale factorial sweeps tractable. + \item Formalization of non-human transaction orchestration in e-commerce as a distinct source of contamination in dynamic pricing systems. + \item Definition of the Cost of Information (COI) as a mechanism-level quantity for pricing power, together with a theorem showing its erosion under increasing agent saturation. + \item Design and implementation of a controlled e-commerce research platform, built on a hybrid Kappa-Lambda architecture, for collecting and replaying high-fidelity interaction trajectories. + \item Construction and empirical validation of a behavioral distinguishability framework that distinguishes human and agent sessions from interaction signals alone using transition kernels and KL-based divergence. + \item Development of a generative contamination mechanism that injects learned agent behavior into the pricing environment for controlled robustness experiments. + \item Translation of behavioral distinguishability into a defensive pricing mechanism through a distributionally robust reinforcement learning formulation of pricing under non-stationary contamination. + \item Empirical evidence that agent contamination reduces revenue and that robustness is condition-dependent, requiring explicit calibration rather than a one-size-fits-all penalty.
+ \item Release of a reusable public experimental artifact for reproducing and extending research on dynamic pricing under agent-mediated traffic. +\end{itemize} + \subsection{Future Works and Next Steps} During the eight months of research dedicated to this work, a plethora of opportunities and industry gaps was identified, sadly a majority of which could not be addressed directly. diff --git a/paper/src/chapters/auto/whoclicked_dataset_card.md b/paper/src/chapters/auto/whoclicked_dataset_card.md new file mode 100644 index 0000000..6a743bd --- /dev/null +++ b/paper/src/chapters/auto/whoclicked_dataset_card.md @@ -0,0 +1,165 @@ +--- +pretty_name: whoclickedit +license: mit +language: +- en +task_categories: +- tabular-classification +task_ids: +- tabular-multi-class-classification +tags: +- e-commerce +- dynamic-pricing +- behavioral-telemetry +- human-vs-agent +- session-data +size_categories: +- 1K<n<10K +--- + +# [whoclickedit](https://huggingface.co/datasets/velocitatem/whoclickedit) + +[![Dataset on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/dataset-on-hf-sm.svg)](https://huggingface.co/datasets/velocitatem/whoclickedit) +![Rows](https://img.shields.io/badge/Rows-3874-0A9396?style=flat-square) +![Columns](https://img.shields.io/badge/Columns-42-005F73?style=flat-square) +![Sessions](https://img.shields.io/badge/Sessions-36-1D3557?style=flat-square) +![Human rows](https://img.shields.io/badge/Human%20rows-798-2A9D8F?style=flat-square) +![Agent rows](https://img.shields.io/badge/Agent%20rows-3076-E76F51?style=flat-square) +![License](https://img.shields.io/badge/License-MIT-111827?style=flat-square) + +> **Event-level behavior data for dynamic pricing research.** +> This dataset captures how humans and automated agents browse, query prices, and move through the PHANTOM storefronts during controlled experiments. + +## What this dataset gives you + +- A single flat file (`whoclicked.csv`) with both interaction and price-log events. 
+- Explicit labels for actor origin: `actor_type` and `is_agent`. +- Provenance fields from Kafka envelopes when available. +- Metadata flattened into feature-ready `metadata_*` columns. + +## Snapshot + +| Metric | Value | +| --- | --- | +| Rows | `3874` | +| Columns | `42` | +| Time range (UTC) | `2025-12-05T09:43:31.301000+00:00` -> `2026-03-23T12:08:30.151000+00:00` | +| Unique sessions | `36` | + +## Composition + +### Rows by actor +| Actor | Rows | Share | +| --- | --- | --- | +| `human` | 798 | 20.6% | +| `agent` | 3076 | 79.4% | + +### Rows by actor and record type +| Actor | Record type | Rows | +| --- | --- | --- | +| `agent` | `interaction` | 197 | +| `agent` | `price_log` | 2879 | +| `human` | `interaction` | 328 | +| `human` | `price_log` | 470 | + +### Store mode coverage +| Store mode | Rows | +| --- | --- | +| `hotel` | 3628 | +| `airline` | 196 | +| `shop` | 50 | + +### Top interaction events +| Interaction event | Count | +| --- | --- | +| `page_view` | 246 | +| `learn_more_about_item` | 91 | +| `view_item_page` | 88 | +| `add_item_to_cart` | 47 | +| `hover_over_title` | 23 | +| `checkout_start` | 20 | +| `hover_over_paragraph` | 6 | +| `remove_item` | 4 | + +## Collection pipeline + +Data is sourced from two roots inside PHANTOM: + +- `experiments/collected_data` (human sessions) +- `experiments/agents/collected_data` (agent sessions) + +Each session directory contains: + +- `int.json`: user interaction events +- `price.json`: price quote observations + +ETL behavior: + +1. Accepts both Kafka-envelope records and flat payload records. +2. Flattens nested JSON to a tabular schema. +3. Preserves row-level provenance (`source_session_dir`, `source_row_index`, topic fields). +4. Adds modeling labels (`actor_type`, `is_agent`, `record_type`). 
+ +## Schema highlights + +Core modeling fields: + +- `actor_type`, `is_agent`, `record_type` +- `sessionId`, `experimentId`, `storeMode`, `ts` +- `eventName`, `page`, `productId`, `price`, `userAgent` + +Kafka provenance fields: + +- `kafka_partition_id`, `kafka_offset`, `kafka_timestamp_ms`, `kafka_compression` +- `kafka_is_transactional`, `kafka_headers`, `kafka_key_*`, `kafka_value_*` + +
+Metadata columns in this release + +- `metadata_cabinClass` +- `metadata_dateIndex` +- `metadata_dwellTime` +- `metadata_elementText` +- `metadata_fareRule` +- `metadata_flightType` +- `metadata_itemCount` +- `metadata_nights` +- `metadata_price` +- `metadata_referrer` +- `metadata_roomType` +- `metadata_total` +- `metadata_type` + +
+ +## Quick start + +```python +from datasets import load_dataset + +ds = load_dataset("velocitatem/whoclickedit") +``` + +Recommended split strategy: + +- Prefer session-aware or time-aware splits. +- Do not split rows from the same `sessionId` across train and test. + +## Intended use + +- Human-vs-agent behavior classification. +- Session-level telemetry modeling for dynamic pricing defenses. +- Robustness experiments under agent-mediated reconnaissance. + +## Safety and limitations + +- `userAgent` and referrer metadata can be quasi-identifying in very small samples. +- Data comes from a controlled research platform, not a full production marketplace. +- Current release has stronger coverage for `hotel` flows than `airline` flows. + +## Citation + +If you use this dataset, cite the PHANTOM thesis project and link this page: +`https://huggingface.co/datasets/velocitatem/whoclickedit` diff --git a/paper/src/chapters/figures/.gitignore b/paper/src/chapters/figures/.gitignore new file mode 100644 index 0000000..a467b0f --- /dev/null +++ b/paper/src/chapters/figures/.gitignore @@ -0,0 +1,3 @@ +__pycache__/ +*.pyc +.pdf-view-restore diff --git a/paper/src/chapters/figures/results/generated/final/final_focus_alpha_deltas.csv b/paper/src/chapters/figures/results/generated/final/final_focus_alpha_deltas.csv new file mode 100644 index 0000000..32bbd73 --- /dev/null +++ b/paper/src/chapters/figures/results/generated/final/final_focus_alpha_deltas.csv @@ -0,0 +1,12 @@ +alpha,revenue_delta,revenue_delta_pct,reward_delta,reward_delta_pct,volatility_delta,supra_delta,coi_leakage_delta +0.0,-17982.383542886935,-5.11072862876989,-17145.799161982606,-5.235033672101227,0.001232973729699119,0.0,-0.0030412479577408003 +0.1,-14962.041501283413,-4.410637208586118,-14303.760282736213,-4.531344436782669,0.0011858665298920962,0.0,-0.004133727080174038 +0.2,-16153.416666167905,-4.826514761457546,-15398.621298776357,-4.9418165571901715,0.00200624274016295,0.0,-0.0033201883450373615 
+0.3,-17294.9275360335,-5.382423616385397,-16544.91845114401,-5.533399709364953,-0.0011022484400295268,0.0,-0.0029151149203366505 +0.4,-19661.294346174283,-6.250307313590199,-18728.35578200908,-6.3953153560217535,3.582812967113658e-05,0.0,-0.0038123361988749577 +0.5,-16411.03168918495,-5.3630681206030015,-15638.77510066732,-5.4888928630525315,0.00015428950526953644,0.0,-0.00439661338956944 +0.6,-14729.668247641937,-5.069964928178309,-13912.22417824401,-5.148827377884945,-0.002735776807082743,0.0,-0.004310129386364658 +0.7,-21160.81910514756,-7.351404104505076,-20171.762105623755,-7.525169314210056,-0.0008903632602569461,0.0,-0.0026198461183787186 +0.8,-16404.76825612632,-5.9342582959227075,-15645.025250480074,-6.078699946285722,0.0010338614665691137,0.0,-0.002542765270289696 +0.9,-8674.090655496111,-3.2592966246269577,-8371.30734891587,-3.378943339994106,-0.0005579187914590139,0.0,-0.0013720835439427759 +1.0,768.8099906174757,0.2991618705853567,399.7394696234842,0.16706914330070038,0.0014659834822295797,0.0,-0.0007600066499474645 diff --git a/paper/src/chapters/figures/results/generated/final/final_focus_alpha_mode_summary.csv b/paper/src/chapters/figures/results/generated/final/final_focus_alpha_mode_summary.csv new file mode 100644 index 0000000..50051aa --- /dev/null +++ b/paper/src/chapters/figures/results/generated/final/final_focus_alpha_mode_summary.csv @@ -0,0 +1,23 @@ +alpha,mode,runs,revenue_mean,reward_mean,supra_mean,volatility_mean,coi_leakage_mean,coi_level_mean +0.0,baseline,36,351855.57381502265,327520.32242613373,0.0,0.06922494093544151,0.11931704468268205,136.80105514058158 +0.0,defended,35,333873.1902721357,310374.5232641511,0.0,0.07045791466514063,0.11627579672494125,136.81832905386602 +0.1,baseline,32,339226.3020897988,315662.6136522988,0.0,0.06952778671756812,0.11924519238669087,136.47864859317326 +0.1,defended,33,324264.2605885154,301358.8533695626,0.0,0.07071365324746022,0.11511146530651684,136.7200845824852 
+0.2,baseline,31,334680.76789409376,311598.399506997,0.0,0.06848006194428993,0.11597869134898402,136.83684469591932 +0.2,defended,35,318527.35122792586,296199.77820822067,0.0,0.07048630468445288,0.11265850300394666,137.2758153292305 +0.3,baseline,30,321322.30327214615,299000.9636054795,0.0,0.07085669473747759,0.11527347603412934,136.4452630715689 +0.3,defended,44,304027.37573611265,282456.0451543355,0.0,0.06975444629744806,0.11235836111379269,136.4704115371568 +0.4,baseline,33,314565.2423109539,292844.914432166,0.0,0.07031811881503117,0.11300307992768284,136.72547178046122 +0.4,defended,38,294903.9479647796,274116.55865015695,0.0,0.0703539469447023,0.10919074372880788,136.75671002806396 +0.5,baseline,33,306000.80625751516,284916.7489847879,0.0,0.06938663916591635,0.11118137138243217,136.9528780620641 +0.5,defended,35,289589.7745683302,269277.9738841206,0.0,0.06954092867118589,0.10678475799286273,136.65018588845163 +0.6,baseline,28,290528.0106727377,270201.7985298805,0.0,0.07139577980623227,0.11081647254398667,135.258395468266 +0.6,defended,41,275798.3424250958,256289.57435163652,0.0,0.06866000299914952,0.10650634315762202,136.3194947785247 +0.7,baseline,40,287847.3119465684,268057.25244656845,0.0,0.07132313199532896,0.10746267580456732,137.0170522633547 +0.7,defended,40,266686.49284142087,247885.4903409447,0.0,0.07043276873507201,0.1048428296861886,136.56834095392904 +0.8,baseline,26,276441.76303208206,257374.52726285128,0.0,0.06945655282263205,0.1063246766773884,136.66765260798618 +0.8,defended,39,260036.99477595574,241729.5020123712,0.0,0.07049041428920116,0.1037819114070987,136.61222667078658 +0.9,baseline,35,266133.8213268301,247749.2667554015,0.0,0.0709569180547784,0.10455882265976374,136.5370653814206 +0.9,defended,39,257459.73067133396,239377.95940648564,0.0,0.07039899926331938,0.10318673911582096,136.7368893225831 +1.0,baseline,35,256987.96076959255,239265.888198164,0.0,0.06888231148034313,0.10369761394735275,136.68691718467974 
+1.0,defended,30,257756.77076021003,239665.62766778748,0.0,0.07034829496257271,0.10293760729740528,136.65287739235566 diff --git a/paper/src/chapters/figures/results/generated/final/final_focus_headline_summary.json b/paper/src/chapters/figures/results/generated/final/final_focus_headline_summary.json new file mode 100644 index 0000000..e257560 --- /dev/null +++ b/paper/src/chapters/figures/results/generated/final/final_focus_headline_summary.json @@ -0,0 +1,27 @@ +{ + "bundle": "engine/studies/results/wandb_sweep_bundles/bundle_20260317_093826", + "focus_cohort": "max_alpha_coverage", + "alpha_cells": 11, + "alpha_min": 0.0, + "alpha_max": 1.0, + "mean_revenue_delta_pct": -4.787221975639986, + "mean_reward_delta_pct": -4.91730667541704, + "zone_summary": [ + { + "zone": "high_alpha_0_7_plus", + "alpha_cells": 4, + "revenue_delta_pct_mean": -4.0614492886173466, + "reward_delta_pct_mean": -4.2039358642972955, + "coi_leakage_delta_mean": -0.0018236753956396637, + "volatility_delta_mean": 0.00026289072427068336 + }, + { + "zone": "low_alpha_below_0_7", + "alpha_cells": 7, + "revenue_delta_pct_mean": -5.201949225367208, + "reward_delta_pct_mean": -5.324947138914036, + "coi_leakage_delta_mean": -0.0037041938968711296, + "volatility_delta_mean": 0.00011102505536893832 + } + ] +} diff --git a/paper/src/chapters/figures/results/generated/final/final_focus_zone_summary.csv b/paper/src/chapters/figures/results/generated/final/final_focus_zone_summary.csv new file mode 100644 index 0000000..224a022 --- /dev/null +++ b/paper/src/chapters/figures/results/generated/final/final_focus_zone_summary.csv @@ -0,0 +1,3 @@ +zone,alpha_cells,revenue_delta_pct_mean,reward_delta_pct_mean,coi_leakage_delta_mean,volatility_delta_mean +high_alpha_0_7_plus,4,-4.0614492886173466,-4.2039358642972955,-0.0018236753956396637,0.00026289072427068336 +low_alpha_below_0_7,7,-5.201949225367208,-5.324947138914036,-0.0037041938968711296,0.00011102505536893832 diff --git 
a/paper/src/chapters/figures/results/generated/final/plots/final_focus_revenue_by_alpha.pdf b/paper/src/chapters/figures/results/generated/final/plots/final_focus_revenue_by_alpha.pdf new file mode 100644 index 0000000..343539e Binary files /dev/null and b/paper/src/chapters/figures/results/generated/final/plots/final_focus_revenue_by_alpha.pdf differ diff --git a/paper/src/chapters/figures/results/generated/final/plots/final_focus_revenue_delta.pdf b/paper/src/chapters/figures/results/generated/final/plots/final_focus_revenue_delta.pdf new file mode 100644 index 0000000..157fc7a Binary files /dev/null and b/paper/src/chapters/figures/results/generated/final/plots/final_focus_revenue_delta.pdf differ diff --git a/paper/src/chapters/figures/results/generated/final/plots/final_focus_risk_deltas.pdf b/paper/src/chapters/figures/results/generated/final/plots/final_focus_risk_deltas.pdf new file mode 100644 index 0000000..9bbbf7a Binary files /dev/null and b/paper/src/chapters/figures/results/generated/final/plots/final_focus_risk_deltas.pdf differ diff --git a/paper/src/chapters/figures/results/generated/legacy/first_sweep_headline_summary.json b/paper/src/chapters/figures/results/generated/legacy/first_sweep_headline_summary.json new file mode 100644 index 0000000..caf3d15 --- /dev/null +++ b/paper/src/chapters/figures/results/generated/legacy/first_sweep_headline_summary.json @@ -0,0 +1,10 @@ +{ + "runs": 340, + "tiers": 5, + "alphas": 6, + "status": "ok", + "mean_tier_revenue_robust": 190714.62212212436, + "mean_tier_revenue_no_robust": 197371.17216609977, + "mean_tier_revenue_delta": -6656.5500439754105, + "mean_tier_revenue_delta_pct": -3.3726050116242514 +} \ No newline at end of file diff --git a/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_alpha_deltas.csv b/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_alpha_deltas.csv new file mode 100644 index 0000000..fcddcd6 --- /dev/null +++ 
b/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_alpha_deltas.csv @@ -0,0 +1,31 @@ +tier,alpha,runs_robust,runs_no_robust,eval_revenue_mean_delta,eval_revenue_mean_delta_pct,eval_reward_mean_delta,eval_reward_mean_delta_pct,eval_coi_level_mean_delta,eval_coi_level_mean_delta_pct,eval_margin_mean_delta,eval_margin_mean_delta_pct,objective_score_delta,objective_score_delta_pct,train_alpha_adv_delta,train_alpha_adv_delta_pct +dqn,0.0,5.0,2.0,-31308.987414117495,-8.73651226889534,-1909.7427407095092,-0.5742991901121623,-2.8982436567700063,-2.1108702433020436,-0.001972064237093285,-0.2116777198290971,-1909.7427407095092,-0.5742991901121623,, +dqn,0.1,8.0,4.0,-7723.542755668925,-2.2789188721535494,-74239.37371836061,-21.063854618469847,1.7435833801418141,1.2859365583872486,0.0011891962142838164,0.1278074871971924,-74239.37371836061,-21.063854618469847,0.17619791666666657,176.19791666666694 +dqn,0.25,7.0,3.0,-12344.82818986749,-3.7035466052614323,93154.03627578515,36.06691230407512,0.03214544949867104,0.023426184113378143,1.763733457238459e-05,0.001893256490383175,93154.03627578515,36.06691230407512,0.14530952380952394,58.12380952380958 +dqn,0.4,5.0,10.0,-7816.300706216833,-2.4694340725162824,-42362.74668471434,-13.411888482380219,0.6251272343707797,0.4579446603861758,0.0002750615520492605,0.02953644634355915,-42362.74668471434,-13.411888482380219,0.09856666666666747,24.64166666666691 +dqn,0.6,5.0,4.0,-16150.011887742497,-5.347485987139731,-28508.74710866122,-10.151356300001888,-0.63306323164079,-0.46056970247177387,-0.00034537433455417155,-0.0370668515552649,-28508.74710866122,-10.151356300001888,0.1361999999999981,22.699999999999644 +dqn,0.8,7.0,6.0,-18191.8826663699,-6.440527544692988,-55296.94441124235,-20.19273590083627,-0.796733634735034,-0.579832425016392,-0.0006423984775592029,-0.0689476165584585,-55296.94441124235,-20.19273590083627,0.1532857142857158,19.160714285714512 
+linear,0.0,9.0,8.0,-14967.67388588126,-4.273413942959129,-20107.23171681742,-6.60039931288617,-0.06127790826209889,-0.04564810574240612,-7.607744079518586e-05,-0.008177885913528719,-20107.23171681742,-6.60039931288617,, +linear,0.1,3.0,5.0,-24531.399901538738,-7.171831328305365,-96669.7835552101,-26.44920711447249,-0.3680976907859872,-0.2733723058172187,-0.0002515287835096469,-0.02702956778346356,-96669.7835552101,-26.44920711447249,, +linear,0.25,6.0,9.0,-14840.859479571285,-4.520682292638562,-26510.179456423968,-8.033117756667396,-0.13734776448131925,-0.10212641096230607,-9.41162442338328e-05,-0.010115001392981545,-26510.179456423968,-8.033117756667396,, +linear,0.4,4.0,11.0,-17196.7642560167,-5.486915251242723,-74520.10209817477,-25.042311510043184,0.12217076984330788,0.09098828726103136,0.00010713887099822461,0.011516865671259795,-74520.10209817477,-25.042311510043184,, +linear,0.6,5.0,3.0,-14284.06615788641,-4.854766876637072,38417.71856593515,14.088596762512362,0.24251461234271687,0.1806530855220358,0.0002606811969937395,0.028024824619509187,38417.71856593515,14.088596762512362,, +linear,0.8,4.0,11.0,-10840.488575784548,-3.933600919557566,15749.581078662042,6.447651726824251,0.028051260535562506,0.020876236575910773,5.361882659971062e-05,0.005763158099097226,15749.581078662042,6.447651726824251,, +qtable,0.0,9.0,8.0,-18644.457288398524,-8.15323701554329,32993.42568058451,20.675688115613053,10.369779227648095,10.682768960780463,0.018566897519637582,2.0803084179092814,32993.42568058451,20.675688115613053,0.11839814814814797, +qtable,0.1,6.0,5.0,-12549.400855549495,-4.616991193742389,-37207.79701261924,-15.336047254435487,0.0884057957559321,0.07703761042583206,-0.01127789819771663,-1.2272540823820444,-37207.79701261924,-15.336047254435487,0.07577777777777787,75.77777777777803 
+qtable,0.25,6.0,5.0,-1534.3527429780224,-0.5456640130847226,18433.43663451099,7.304472653867784,-0.5776125938941306,-0.45734160960552755,-0.003316338490628068,-0.3584028328803385,18433.43663451099,7.304472653867784,0.1181458333333334,47.258333333333354 +qtable,0.4,8.0,6.0,-15146.258176090778,-5.274860187729517,-37364.22587794208,-13.005651205148677,0.4611471727478005,0.3629050099230144,0.0071046453227539,0.7751478467862876,-37364.22587794208,-13.005651205148677,0.11010416666666772,27.52604166666698 +qtable,0.6,6.0,6.0,-9577.578548656049,-3.9322693501816666,-19088.152339068736,-9.571307395166029,0.9081750157567683,0.7495917946306662,0.0015520804425310786,0.16838348372043557,-19088.152339068736,-9.571307395166029,0.16983333333333228,28.305555555555333 +qtable,0.8,5.0,2.0,-52751.680936846446,-19.699089872409548,-16508.209313987172,-7.589601869470744,-15.022454081083623,-11.215398490282094,-0.007791824761087751,-0.8384414846099099,-16508.209313987172,-7.589601869470744,0.11120000000000174,13.900000000000245 +static,0.0,5.0,6.0,-4782.871053113384,-5.233544525848519,14411.4689779756,25.538141347978577,1.307060701942973,1.8731997380823568,0.002537468952847566,0.2911381045328444,14411.4689779756,25.538141347978577,, +static,0.1,8.0,5.0,1629.4524528499896,1.880088900553112,-5347.078589385725,-8.14812684380662,0.3600324838305795,0.5019134064795009,-4.6492644957929485e-05,-0.005316014641356001,-5347.078589385725,-8.14812684380662,, +static,0.25,5.0,6.0,-9938.662276761897,-10.398087633377964,-23616.087243780566,-27.701108621456626,-3.0513860773271233,-4.099238223547561,-0.003519771479853273,-0.40113716461596144,-23616.087243780566,-27.701108621456626,, +static,0.4,3.0,4.0,1850.8400595222774,2.1912497828943436,15058.659457798465,23.67199439061036,3.669612467486587,5.430169778169349,0.006763447803564415,0.7804393835882188,15058.659457798465,23.67199439061036,, 
+static,0.6,6.0,5.0,1038.893948415236,1.2765037688226162,-6062.864079504681,-9.363144945348399,-1.712609061865976,-2.3996341009364213,-0.0042285583442709385,-0.48362088973179423,-6062.864079504681,-9.363144945348399,, +static,0.8,3.0,7.0,2696.6340631967323,3.6826150812750567,149.22406835677975,0.27280281303997084,0.8491716126507072,1.2427748744725668,0.0032786525965587954,0.3777595573932637,149.22406835677975,0.27280281303997084,, +surge,0.0,6.0,6.0,-606.73760243367,-5.066579306500225,-244.17585425326251,-5.525800641331023,0.014874931199557295,0.09186560988877175,0.0019308940532419272,0.4471794260021321,-244.17585425326251,-5.525800641331023,, +surge,0.1,2.0,5.0,169.78743573408792,1.446343107913299,-1012.7706974660168,-20.02053666691211,-0.14459518037699226,-0.864651254901582,-0.0018650458785858248,-0.4260349899970559,-1012.7706974660168,-20.02053666691211,, +surge,0.25,10.0,7.0,-128.20993816584632,-1.1276930411162496,-81.21373487263281,-1.7081453033360994,0.3008506477195141,1.839047728806548,0.0030750148302954305,0.7102446987902812,-81.21373487263281,-1.7081453033360994,, +surge,0.4,6.0,6.0,-473.03722764431404,-4.297928307550563,28.557452243338048,0.6755106104955642,-0.5027452173053764,-3.072002360121898,-0.005581380442163164,-1.288152985482699,28.557452243338048,0.6755106104955642,, +surge,0.6,2.0,5.0,307.79436325796996,3.0356727142643067,2060.57396030564,63.382050333909866,0.2339650444065704,1.438519400758399,0.001302270025389629,0.30077697380833807,2060.57396030564,63.382050333909866,, +surge,0.8,3.0,3.0,423.15386247993047,4.372210191290083,1117.0942083304312,34.86182570616373,0.8971464536957541,5.327339899805159,0.007068630716831503,1.6094191039618562,1117.0942083304312,34.86182570616373,, diff --git a/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_alpha_mode_summary.csv b/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_alpha_mode_summary.csv new file mode 100644 index 0000000..dba8d81 --- /dev/null +++ 
b/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_alpha_mode_summary.csv @@ -0,0 +1,61 @@ +tier,alpha,mode,runs,eval_revenue_mean_mean,eval_revenue_mean_std,eval_reward_mean_mean,eval_reward_mean_std,eval_coi_level_mean_mean,eval_coi_level_mean_std,eval_margin_mean_mean,eval_margin_mean_std,objective_score_mean,objective_score_std,train_alpha_adv_mean,train_alpha_adv_std +dqn,0.0,no_robust,2,358369.40933039243,3531.782519351935,332534.46523867303,114183.5587841961,137.30089123035202,0.8184776440325546,0.9316352418598786,0.0006839003676302996,332534.46523867303,114183.5587841961,, +dqn,0.0,robust,5,327060.42191627494,24311.17412598574,330624.7224979635,62834.39223547943,134.40264757358202,6.160000643680792,0.9296631776227853,0.004262039730140749,330624.7224979635,62834.39223547943,0.17835000000000004,0.08829347371125472 +dqn,0.1,no_robust,4,338912.58043645386,19584.736810155388,352449.13650924934,34076.74819101191,135.58860029055563,3.4055508991301524,0.9304589585186211,0.0023438665484978773,352449.13650924934,34076.74819101191,0.0999999999999998,0.0 +dqn,0.1,robust,8,331189.03768078494,8060.912085646968,278209.7627908887,57861.69545853692,137.33218367069745,0.43113256118808096,0.931648154732905,0.000296560958972609,278209.7627908887,57861.69545853692,0.2761979166666664,0.09826648189130198 +dqn,0.25,no_robust,3,333324.4996115304,6101.717861804452,258281.15112936878,46772.05216097596,137.2201692904545,0.9866477887862672,0.9315871706751672,0.0006356053229300815,258281.15112936878,46772.05216097596,0.25,0.0 +dqn,0.25,robust,7,320979.6714216629,7345.8761269427705,351435.18740515393,40320.63699261721,137.25231473995316,0.3527287960309152,0.9316048080097395,0.0002575240668471541,351435.18740515393,40320.63699261721,0.39530952380952394,0.073021206240698 
+dqn,0.4,no_robust,10,316521.94295076875,3631.1820920182718,315859.66987697606,59129.03566963754,136.50715652926755,0.5085743959240285,0.931261495881483,0.00031280530251053175,315859.66987697606,59129.03566963754,0.3999999999999993,0.0 +dqn,0.4,robust,5,308705.6422445519,10654.571556448245,273496.9231922617,68868.59270778317,137.13228376363833,0.9543108715306617,0.9315365574335323,0.0006302636717132419,273496.9231922617,68868.59270778317,0.49856666666666677,0.05745573175159429 +dqn,0.6,no_robust,4,302011.2988903938,2354.1141598720183,280836.828756133,58683.00124997926,137.4522093492651,0.4692723362517602,0.9317606434396914,0.0003317518021682495,280836.828756133,58683.00124997926,0.600000000000001,0.0 +dqn,0.6,robust,5,285861.2870026513,10386.571631344234,252328.08164747176,59388.56063758225,136.8191461176243,1.0629203361893034,0.9314152691051373,0.0005692783702932289,252328.08164747176,59388.56063758225,0.7361999999999991,0.07108625433623189 +dqn,0.8,no_robust,6,282459.51189759385,2625.018247527438,273845.72691287595,66378.16690732416,137.4075681801531,0.29728950101826707,0.9317196295169007,0.00022799290978965786,273845.72691287595,66378.16690732416,0.7999999999999985,0.0 +dqn,0.8,robust,7,264267.62923122395,6771.288971321149,218548.7825016336,50043.2009443344,136.61083454541807,1.2319662937254596,0.9310772310393415,0.0010118564779437284,218548.7825016336,50043.2009443344,0.9532857142857143,0.04709817507333055 +linear,0.0,no_robust,8,350250.9723061577,3156.286820918861,304636.59490360576,71682.88027353655,134.2397614654424,0.32611787466946035,0.9302824910938235,0.00024020749661685483,304636.59490360576,71682.88027353655,, +linear,0.0,robust,9,335283.29842027643,7707.594869976611,284529.36318678834,55524.58819004573,134.1784835571803,0.4477314164684001,0.9302064136530284,0.00034781034181738526,284529.36318678834,55524.58819004573,, 
+linear,0.1,no_robust,5,342052.1032713031,2576.546352056584,365492.17954557994,44890.93522299766,134.65068807375954,0.2181027640393531,0.930569018064469,0.00014058935916940913,365492.17954557994,44890.93522299766,, +linear,0.1,robust,3,317520.7033697644,4796.580459456527,268822.39599036984,39256.421140635124,134.28259038297355,0.24570499109363475,0.9303174892809594,0.00018817899183709092,268822.39599036984,39256.421140635124,, +linear,0.25,no_robust,9,328288.0441241802,2178.525494145428,330011.0898339667,38591.36053388808,134.48799697074742,0.2199303973026469,0.9304619997297959,0.00015341642413402035,330011.0898339667,38591.36053388808,, +linear,0.25,robust,6,313447.18464460893,11811.426711620714,303500.9103775427,63358.917144214036,134.3506492062661,0.2947034403278951,0.9303678834855621,0.00021446628431268986,303500.9103775427,63358.917144214036,, +linear,0.4,no_robust,11,313414.0672597746,1982.9537556159262,297576.7714904776,69396.90446617964,134.2708754290745,0.3062093691351849,0.9302780292522507,0.00023067974755288992,297576.7714904776,69396.90446617964,, +linear,0.4,robust,4,296217.3030037579,5109.898340355844,223056.66939230284,38293.73688466607,134.3930461989178,0.12347753686382154,0.9303851681232489,7.324605809708878e-05,223056.66939230284,38293.73688466607,, +linear,0.6,no_robust,3,294227.64307441004,2081.9176570448135,272686.62176604365,66672.50905805513,134.24327165069943,0.30764332256042104,0.9301795837547151,0.00020453921786790446,272686.62176604365,66672.50905805513,, +linear,0.6,robust,5,279943.5769165236,9866.031719660255,311104.3403319788,28363.930707781863,134.48578626304214,0.21280262186464388,0.9304402649517088,0.00020533894868120649,311104.3403319788,28363.930707781863,, +linear,0.8,no_robust,11,275586.89347174135,1618.038877505867,244268.4832547461,56201.44465269986,134.36933631960773,0.2845660213184439,0.9303723007028001,0.00017640716421186918,244268.4832547461,56201.44465269986,, 
+linear,0.8,robust,4,264746.4048959568,7976.6279174956235,260018.06433340814,57942.49882730146,134.3973875801433,0.31511916357643405,0.9304259195293998,0.00023606570471334208,260018.06433340814,57942.49882730146,, +qtable,0.0,no_robust,8,228675.52179404112,103199.70453252994,159575.94976328663,95848.81008103945,97.07014413321637,33.0637115678536,0.8925069648229078,0.04890522141482132,159575.94976328663,95848.81008103945,0.0,0.0 +qtable,0.0,robust,9,210031.0645056426,84361.3834579348,192569.37544387113,116824.7880426837,107.43992336086447,21.41128645838254,0.9110738623425454,0.019188350719133364,192569.37544387113,116824.7880426837,0.11839814814814797,0.061909456985161225 +qtable,0.1,no_robust,5,271809.0706466638,14898.209045050968,242616.60384397948,49181.45526408063,114.75666919996793,3.461383158930426,0.9189538140159812,0.002294693249439748,242616.60384397948,49181.45526408063,0.0999999999999998,0.0 +qtable,0.1,robust,6,259259.66979111428,102995.29934229614,205408.80683136024,94155.1845420674,114.84507499572386,36.206421837506966,0.9076759158182646,0.048591979839360346,205408.80683136024,94155.1845420674,0.17577777777777767,0.06720562696899951 +qtable,0.25,no_robust,5,281190.01916657295,70274.10208723843,252358.2126733039,129868.46825082717,126.29784427276161,15.368804047323954,0.9253103453385114,0.009044883517550522,252358.2126733039,129868.46825082717,0.25,0.0 +qtable,0.25,robust,6,279655.6664235949,93056.2549557545,270791.6493078149,116021.46257259768,125.72023167886748,26.760714047253796,0.9219940068478834,0.022785695882060884,270791.6493078149,116021.46257259768,0.3681458333333334,0.08845114686619042 +qtable,0.4,no_robust,6,287140.4669895195,32698.16434426399,287292.23388022534,83855.95000252876,127.07104066863859,9.200301166154173,0.9165535777734913,0.01306001923887748,287292.23388022534,83855.95000252876,0.3999999999999993,0.0 
+qtable,0.4,robust,8,271994.2088134287,79259.3185780895,249928.00800228326,88265.30801790548,127.53218784138639,23.406428094683015,0.9236582230962452,0.020073747007871224,249928.00800228326,88265.30801790548,0.510104166666667,0.09294655989347765 +qtable,0.6,no_robust,6,243563.64469828535,67006.60707045678,199430.98211127534,79119.52886604435,121.15594411011905,17.91243944823949,0.9217533740470492,0.011558797825966702,199430.98211127534,79119.52886604435,0.600000000000001,0.0 +qtable,0.6,robust,6,233986.0661496293,43155.478617087436,180342.8297722066,48117.79957836251,122.06411912587582,12.160951090203252,0.9233054544895802,0.006840854872863436,180342.8297722066,48117.79957836251,0.7698333333333333,0.09107066853090896 +qtable,0.8,no_robust,2,267787.4017455507,1552.038101264713,217510.87340156303,45358.788584678456,133.9448981157492,0.47346860040111405,0.9293224278749692,0.0002998116010539045,217510.87340156303,45358.788584678456,0.7999999999999985,0.0 +qtable,0.8,robust,5,215035.72080870424,32869.73253165852,201002.66408757586,63247.67956376057,118.92244403466557,8.586916805142152,0.9215306031138815,0.004644709320891907,201002.66408757586,63247.67956376057,0.9112000000000002,0.07381653307732307 +static,0.0,no_robust,6,91388.75248869567,13415.65534300268,56431.15832748852,8525.098185703384,69.77689967440658,3.670744870085874,0.8715688236409825,0.005831496806767582,56431.15832748852,8525.098185703384,, +static,0.0,robust,5,86605.88143558228,7614.909395960895,70842.62730546412,8033.737230392738,71.08396037634955,3.6802889678420283,0.8741062925938301,0.005083911544334936,70842.62730546412,8033.737230392738,, +static,0.1,no_robust,5,86668.90445290186,8037.955688932984,65623.40881389238,19329.448262530004,71.73199185012882,4.199046495412734,0.874577067494122,0.006610505646022198,65623.40881389238,19329.448262530004,, 
+static,0.1,robust,8,88298.35690575185,9576.838833058617,60276.33022450666,13359.490452744656,72.0920243339594,6.7706096714767865,0.8745305748491641,0.010083585815241344,60276.33022450666,13359.490452744656,, +static,0.25,no_robust,6,95581.63603909909,8345.698435455577,85253.22060752509,13111.526873622026,74.43788116042678,2.1078820386097368,0.8774483618896327,0.0037254791853004897,85253.22060752509,13111.526873622026,, +static,0.25,robust,5,85642.97376233719,9472.880627242153,61637.13336374452,15937.429780623212,71.38649508309966,4.0264905454627264,0.8739285904097794,0.005323853359397925,61637.13336374452,15937.429780623212,, +static,0.4,no_robust,4,84465.04245981346,12101.831388745604,63613.81812329075,7778.361846092061,67.5782271530322,3.9088888968092,0.8666205147756862,0.007149121199217965,63613.81812329075,7778.361846092061,, +static,0.4,robust,3,86315.88251933573,8642.748496122398,78672.47758108922,17823.74997200773,71.24783962051879,2.790416943786253,0.8733839625792507,0.005990544453538607,78672.47758108922,17823.74997200773,, +static,0.6,no_robust,5,81385.88962988024,12343.523894997037,64752.43216774836,23486.779472906223,71.36959177224794,5.100226704959064,0.874353948320141,0.007787250295491337,64752.43216774836,23486.779472906223,, +static,0.6,robust,6,82424.78357829548,9831.886701625144,58689.56808824368,12672.506035553573,69.65698271038197,3.484982360048201,0.8701253899758701,0.005917711231889304,58689.56808824368,12672.506035553573,, +static,0.8,no_robust,7,73226.06364450825,4447.877985963851,54700.340767716196,14406.881298569717,68.32867561883204,3.68262917356943,0.8679204886788817,0.007467501164611224,54700.340767716196,14406.881298569717,, +static,0.8,robust,3,75922.69770770498,5046.089536162847,54849.564836072976,22780.98012221352,69.17784723148274,1.5268167784698885,0.8711991412754405,0.0033278715575433297,54849.564836072976,22780.98012221352,, 
+surge,0.0,no_robust,6,11975.290738176132,411.4052900076416,4418.832131346071,896.5828048394391,16.192056219479124,0.8040364003224534,0.4317940274006973,0.008271862690929055,4418.832131346071,896.5828048394391,, +surge,0.0,robust,6,11368.553135742462,623.8217438159004,4174.6562770928085,639.9963040241264,16.20693115067868,0.9853827520149101,0.4337249214539392,0.010371668289035135,4174.6562770928085,639.9963040241264,, +surge,0.1,no_robust,5,11739.084232858655,332.778792718381,5058.659087494994,1110.8409258976824,16.722948073839394,0.6578121995950104,0.4377682402562083,0.005683401047550787,5058.659087494994,1110.8409258976824,, +surge,0.1,robust,2,11908.871668592743,81.41250285550258,4045.8883900289775,784.7169500268457,16.5783528934624,0.4088194924856508,0.4359031943776225,0.004531137621699143,4045.8883900289775,784.7169500268457,, +surge,0.25,no_robust,7,11369.223138855004,236.1121240061105,4754.4980344481255,1038.0550037539617,16.359045119223275,0.3945156775653057,0.4329514652531622,0.0038762110261952457,4754.4980344481255,1038.0550037539617,, +surge,0.25,robust,10,11241.013200689158,684.503587066406,4673.284299575493,1187.78635131025,16.65989576694279,1.0515950311117155,0.4360264800834576,0.009701952962125513,4673.284299575493,1187.78635131025,, +surge,0.4,no_robust,6,11006.168409400554,364.6584583108646,4227.535704048808,1414.7964077877168,16.365391636138824,0.9138430058543858,0.4332855262584901,0.008024003783434592,4227.535704048808,1414.7964077877168,, +surge,0.4,robust,6,10533.13118175624,526.0758051960169,4256.093156292146,783.7965507386594,15.862646418833448,0.7732699435426456,0.42770414581632693,0.008967505611725135,4256.093156292146,783.7965507386594,, +surge,0.6,no_robust,5,10139.2472848498,97.448078425168,3251.037082975553,742.2100315641153,16.26429537781848,0.4432465691073604,0.4329686574409998,0.004121820888165019,3251.037082975553,742.2100315641153,, 
+surge,0.6,robust,2,10447.04164810777,524.0029334247373,5311.611043281193,1808.6200710093085,16.49826042222505,0.6088756908260344,0.43427092746638946,0.007817511630542989,5311.611043281193,1808.6200710093085,, +surge,0.8,no_robust,3,9678.259826640971,272.83530913170915,3204.3479815026553,556.8799617962688,16.840420745981802,0.4589959822922529,0.43920385308157944,0.004953937449529005,3204.3479815026553,556.8799617962688,, +surge,0.8,robust,3,10101.413689120902,526.8318040489241,4321.442189833087,1284.166148011517,17.737567199677557,0.6586775330563983,0.44627248379841095,0.004644261847052545,4321.442189833087,1284.166148011517,, diff --git a/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_mode_summary.csv b/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_mode_summary.csv new file mode 100644 index 0000000..e296749 --- /dev/null +++ b/paper/src/chapters/figures/results/generated/legacy/first_sweep_tier_mode_summary.csv @@ -0,0 +1,11 @@ +tier,mode,runs,eval_revenue_mean_mean,eval_revenue_mean_std,eval_reward_mean_mean,eval_reward_mean_std,eval_coi_level_mean_mean,eval_coi_level_mean_std,eval_margin_mean_mean,eval_margin_mean_std,objective_score_mean,objective_score_std,train_alpha_adv_mean,train_alpha_adv_std +dqn,no_robust,29,315185.66674813855,23538.781000060844,302576.8036266896,62951.88633145167,136.82560356086017,1.3692652218935986,0.9313739013618878,0.0009314135057224836,302576.8036266896,62951.88633145167,0.45740740740740693,0.2368477698794438 +dqn,robust,37,306875.13950902375,27585.74444520695,283724.7169827867,69843.05611741856,136.68837571992978,2.3797541654948753,0.9312171495138941,0.0016512408492580111,283724.7169827867,69843.05611741856,0.5058198198198196,0.28324483129860284 +linear,no_robust,47,315501.15296155965,27105.014861872147,298149.1730416604,67664.7308344108,134.36884359609928,0.29743647613433244,0.9303607531364,0.0002152647006739543,298149.1730416604,67664.7308344108,, 
+linear,robust,31,306269.9232239004,26399.875293394463,279872.824370329,54401.104602086416,134.32737693008372,0.31909212993628877,0.9303375215162144,0.00025000448833182963,279872.824370329,54401.104602086416,, +qtable,no_robust,32,259818.72178238883,67188.58622318009,222088.83510765125,94450.12569617687,116.84641954166946,22.42810298937963,0.9140582213134033,0.02778864370791322,222088.83510765125,94450.12569617687,0.29218749999999993,0.2559326319498438 +qtable,robust,40,244470.50673219413,78666.30912808319,216920.53697298188,93983.50987622296,118.94013969887506,23.1428303249914,0.9178608956089163,0.023827311253270544,216920.53697298188,93983.50987622296,0.4396239583333334,0.29521865862482416 +static,no_robust,33,85228.452028227,12041.415672002751,64828.579890468536,17681.280330831738,70.58818912317687,4.204964531595236,0.8721419294578765,0.007107262779462876,64828.579890468536,17681.280330831738,, +static,robust,30,84963.18577955024,8926.291379160475,63243.76603076817,14880.924342692271,70.94358095957392,4.363134562111469,0.8730306888410219,0.006660289247744752,63243.76603076817,14880.924342692271,, +surge,no_robust,32,11121.867310184698,809.9895800277001,4260.038064073964,1160.4282377968032,16.416108827015794,0.641203520341943,0.43413855082681374,0.006214799767130059,4260.038064073964,1160.4282377968032,, +surge,robust,29,10994.355365953365,750.5115890942825,4448.160863178768,1000.7519971246122,16.495943148858906,0.9823026347466668,0.4347587896392907,0.009698591291108968,4448.160863178768,1000.7519971246122,, diff --git a/paper/src/chapters/figures/results/generated/legacy/first_sweep_top_configs.csv b/paper/src/chapters/figures/results/generated/legacy/first_sweep_top_configs.csv new file mode 100644 index 0000000..e51fd74 --- /dev/null +++ b/paper/src/chapters/figures/results/generated/legacy/first_sweep_top_configs.csv @@ -0,0 +1,26 @@ 
+Name,tier,alpha,mode,objective/score,eval/revenue_mean,eval/reward_mean,eval/coi_level_mean,lambda_coi,robust_radius,learning_rate,batch_size,n_steps,total_timesteps +eager-sweep-244,dqn,0.0,no_robust,413274.4339549909,355872.06196128257,413274.4339549909,136.722140138007,0.2,0.1,0.0003,256,4096,15000 +efficient-sweep-319,linear,0.0,no_robust,410094.0151741567,353309.5198146561,410094.0151741567,134.55152038805429,0.4,0.1,0.001,128,4096,15000 +swept-sweep-422,linear,0.0,no_robust,403130.32747386186,347611.2815474988,403130.32747386186,133.8559785775022,0.4,0.3,0.0001,512,1024,15000 +decent-sweep-478,linear,0.1,no_robust,400452.36418713134,345284.5750647792,400452.36418713134,134.73082941975588,0.1,0.2,0.001,128,1024,50000 +eternal-sweep-339,linear,0.1,no_robust,399628.4231731644,344154.38525771734,399628.4231731644,134.89479277649667,0.4,0.1,0.0001,256,1024,50000 +ethereal-sweep-21,dqn,0.1,no_robust,398492.807245857,343580.6802427996,398492.807245857,136.67160732585188,0.1,0.2,0.001,512,2048,50000 +dark-sweep-418,linear,0.1,no_robust,394615.3720658343,339749.76272695075,394615.3720658343,134.39233246711,0.2,0.1,0.0003,256,1024,50000 +wandering-sweep-122,dqn,0.0,robust,394061.3617726404,339512.43434806296,394061.3617726404,137.6864755964331,0.1,0.3,0.0001,256,2048,30000 +laced-sweep-132,dqn,0.1,robust,389274.54998495104,335600.5979215904,389274.54998495104,137.36888574027677,0.4,0.2,0.001,256,2048,30000 +rich-sweep-53,qtable,0.0,robust,388601.2626147048,335630.6853337664,388601.2626147048,133.4414069888203,0.2,0.1,0.0001,512,1024,50000 +faithful-sweep-430,qtable,0.25,no_robust,387035.6970938766,333255.5771210341,387035.6970938766,137.4906091183188,0.1,0.2,0.0003,128,1024,15000 +dark-sweep-280,qtable,0.25,no_robust,386318.8845004527,332220.0316564078,386318.8845004527,137.26992450099925,0.4,0.1,0.0001,256,1024,50000 +chocolate-sweep-383,linear,0.25,no_robust,383989.49015403807,331071.7003244704,383989.49015403807,134.60590742050857,0.1,0.2,0.001,512,1024,30000 
+dry-sweep-263,dqn,0.0,robust,383372.6880637367,330436.0312615148,383372.6880637367,137.40558130223476,0.1,0.3,0.001,128,1024,50000 +different-sweep-143,qtable,0.0,robust,383278.4198015018,330546.16800945485,383278.4198015018,135.9021538079678,0.1,0.3,0.001,256,2048,30000 +woven-sweep-139,dqn,0.25,robust,382788.1296637251,329427.735752473,382788.1296637251,136.8968339394894,0.1,0.1,0.001,512,1024,15000 +dark-sweep-215,dqn,0.25,robust,382358.2401374872,329330.0097603144,382358.2401374872,137.64528612332785,0.2,0.1,0.0001,512,4096,30000 +charmed-sweep-136,linear,0.25,no_robust,382249.5728044314,329646.2053260979,382249.5728044314,134.46825608007862,0.4,0.1,0.0001,256,2048,15000 +light-sweep-308,linear,0.0,robust,381939.1275250679,329628.9436641051,381939.1275250679,133.6209821974879,0.2,0.2,0.001,128,4096,30000 +treasured-sweep-325,linear,0.25,robust,381322.0104772589,328353.58675398555,381322.0104772589,134.8950293943581,0.1,0.1,0.0001,512,2048,15000 +fine-sweep-202,dqn,0.25,robust,378751.33572275366,326518.9068184018,378751.33572275366,137.2900973301052,0.1,0.2,0.0001,512,2048,30000 +treasured-sweep-380,linear,0.25,no_robust,377898.0979419424,325869.1953595453,377898.0979419424,134.54118723889738,0.4,0.3,0.001,128,1024,50000 +pretty-sweep-49,qtable,0.25,robust,377318.4766808995,325282.0152823859,377318.4766808995,137.19609012644068,0.4,0.1,0.0001,128,4096,50000 +desert-sweep-253,linear,0.25,robust,376808.6335063269,325146.3478714648,376808.6335063269,134.48396340732663,0.2,0.1,0.0003,256,1024,30000 +jolly-sweep-133,qtable,0.4,no_robust,376419.57394710975,323709.24588324485,376419.57394710975,137.8349363778071,0.1,0.3,0.0001,128,2048,50000 diff --git a/paper/src/chapters/figures/results/generated/legacy/plots/first_sweep_tier_revenue.pdf b/paper/src/chapters/figures/results/generated/legacy/plots/first_sweep_tier_revenue.pdf new file mode 100644 index 0000000..a278726 Binary files /dev/null and 
b/paper/src/chapters/figures/results/generated/legacy/plots/first_sweep_tier_revenue.pdf differ diff --git a/paper/src/chapters/figures/results/generated/legacy/plots/ppo_alpha_curves.pdf b/paper/src/chapters/figures/results/generated/legacy/plots/ppo_alpha_curves.pdf new file mode 100644 index 0000000..5f97b19 Binary files /dev/null and b/paper/src/chapters/figures/results/generated/legacy/plots/ppo_alpha_curves.pdf differ diff --git a/paper/src/chapters/figures/results/generated/legacy/plots/ppo_delta_curves.pdf b/paper/src/chapters/figures/results/generated/legacy/plots/ppo_delta_curves.pdf new file mode 100644 index 0000000..3f1be83 Binary files /dev/null and b/paper/src/chapters/figures/results/generated/legacy/plots/ppo_delta_curves.pdf differ diff --git a/paper/src/chapters/figures/results/generated/legacy/plots/ppo_tradeoff_scatter.pdf b/paper/src/chapters/figures/results/generated/legacy/plots/ppo_tradeoff_scatter.pdf new file mode 100644 index 0000000..49ecc66 Binary files /dev/null and b/paper/src/chapters/figures/results/generated/legacy/plots/ppo_tradeoff_scatter.pdf differ diff --git a/paper/src/chapters/figures/results/generated/legacy/ppo_alpha_deltas.csv b/paper/src/chapters/figures/results/generated/legacy/ppo_alpha_deltas.csv new file mode 100644 index 0000000..42cf5c9 --- /dev/null +++ b/paper/src/chapters/figures/results/generated/legacy/ppo_alpha_deltas.csv @@ -0,0 +1,7 @@ 
+alpha,runs_robust,runs_no_robust,eval_revenue_mean_robust,eval_revenue_mean_no_robust,eval_revenue_mean_delta,eval_revenue_mean_delta_pct,eval_reward_mean_robust,eval_reward_mean_no_robust,eval_reward_mean_delta,eval_reward_mean_delta_pct,eval_coi_level_mean_robust,eval_coi_level_mean_no_robust,eval_coi_level_mean_delta,eval_coi_level_mean_delta_pct,eval_coi_leakage_mean_robust,eval_coi_leakage_mean_no_robust,eval_coi_leakage_mean_delta,eval_coi_leakage_mean_delta_pct,eval_volatility_mean_robust,eval_volatility_mean_no_robust,eval_volatility_mean_delta,eval_volatility_mean_delta_pct,eval_margin_mean_robust,eval_margin_mean_no_robust,eval_margin_mean_delta,eval_margin_mean_delta_pct,train_alpha_adv_robust,train_alpha_adv_no_robust,train_alpha_adv_delta,train_alpha_adv_delta_pct,train_coi_penalty_robust,train_coi_penalty_no_robust,train_coi_penalty_delta,train_coi_penalty_delta_pct,train_ux_penalty_robust,train_ux_penalty_no_robust,train_ux_penalty_delta,train_ux_penalty_delta_pct,train_agent_prob_robust,train_agent_prob_no_robust,train_agent_prob_delta,train_agent_prob_delta_pct +0.0,4.0,4.0,3379.9042994670963,3565.2912010160844,-185.38690154898813,-5.199768857482219,313527.4707462,331300.229069,-17772.758322799986,-5.364547550342456,137.08358925982625,137.28764358955686,-0.2040543297306101,-0.14863269875959326,0.1146626165658294,0.11861133504329742,-0.003948718477468013,-3.3291240470622716,0.06687153537785637,0.06445662162531288,0.0024149137525434905,3.746572022625408,0.9315273502623671,0.9317078361627993,-0.00018048590043218127,-0.019371512552207898,0.18958333333333333,,,,5.553200113221484,,,,61.35134238638615,66.58479574844135,-5.233453362055201,-7.859832418540847,0.12778212146468534,0.11615891320235115,0.011623208262334192,10.00629907933654 
+0.1,4.0,4.0,3307.028238366196,3458.002436284769,-150.97419791857283,-4.365936713473732,306772.49146475,321215.477968,-14442.986503249966,-4.4963544704059375,137.1182041122497,136.82757579763506,0.29062831461465066,0.21240478238427865,0.1128546052304944,0.11704917861668755,-0.004194573386193154,-3.5835991638433753,0.0685405649303561,0.06737596899527175,0.0011645959350843477,1.728503430007924,0.9315331673960889,0.9313276818191593,0.00020548557692967595,0.0220637248243606,0.2818749999999999,0.1,0.18187499999999987,181.87499999999986,5.079528726095333,,,,52.44772950699336,53.288869747139515,-0.841140240146153,-1.578453895039319,0.11644381911386253,0.11765277436070229,-0.0012089552468397546,-1.0275620387270383 +0.25,4.0,4.0,3134.3438215278165,3300.5539051855053,-166.21008365768876,-5.035823938416998,290691.4771835,306522.90003785,-15831.422854350007,-5.16484179563586,136.89990884669214,136.71752459667877,0.18238425001337077,0.1334022471160229,0.11113957413522965,0.1139905600539111,-0.0028509859186814507,-2.50107194607439,0.06427159998376095,0.06846858821082077,-0.004196988227059828,-6.12980103246314,0.9314501501825461,0.9313053225630614,0.0001448276194846443,0.015551035302371268,0.44833333333333336,0.25,0.19833333333333336,79.33333333333334,4.7183804755060255,,,,49.04307009982127,55.2030005738411,-6.159930474019831,-11.158687770568074,0.10998505830218755,0.11684259343269415,-0.0068575351305066035,-5.869037077182653 
+0.4,4.0,4.0,2983.852437569374,3180.7872854626567,-196.9348478932825,-6.191386918369099,276545.26309355,295433.5405797,-18888.277486150037,-6.393409986248494,136.19210761854086,136.5783021470118,-0.38619452847095204,-0.2827641890402586,0.10875560547061063,0.11189234314151972,-0.0031367376709090927,-2.8033532794480807,0.07452230347799255,0.07104688223410768,0.003475421243884863,4.891729425132195,0.9307282962514367,0.9310542820602117,-0.0003259858087749645,-0.03501254599824534,0.5999999999999999,0.4000000000000001,0.1999999999999998,49.999999999999936,4.174996403604185,,,,47.99794119802058,50.794260008988424,-2.796318810967847,-5.505186630286606,0.10222958892923095,0.11161526349272373,-0.009385674563492777,-8.408952565976458 +0.6,4.0,4.0,2789.0434220430398,2982.2460998252786,-193.20267778223888,-6.4784283830083,258688.11700405,277051.95613675,-18363.8391327,-6.628301560749781,136.86774320500828,136.81931587629953,0.04842732870875466,0.035395096371142916,0.10501047827147733,0.10802266412956946,-0.0030121858580921257,-2.788475809557069,0.06914180963767007,0.06698591531512615,0.0021558943225439137,3.2184292957732996,0.9314130089130337,0.9313849217310588,2.8087181974889575e-05,0.003015636319588161,0.7733333333333334,0.5999999999999999,0.17333333333333356,28.888888888888935,4.178300996512875,,,,39.928062615509425,47.86860429278531,-7.940541677275881,-16.588203885594947,0.11297979438696983,0.1162670925925253,-0.0032872982055554695,-2.827367686122743 
+0.8,4.0,4.0,2586.098242115281,2841.1305915063504,-255.03234939106915,-8.97643882169642,239765.24959855,264140.55002745,-24375.300428900024,-9.228155399224729,136.5038826686135,137.28163778418497,-0.7777551155714661,-0.5665397995864124,0.10253056902792507,0.1031498585902154,-0.0006192895622903344,-0.6003784888844036,0.07325665736408164,0.06592454978099352,0.007332107583088124,11.1219683827132,0.9311235469993302,0.9316596013994161,-0.0005360544000858614,-0.05753758124541101,1.0,0.8000000000000002,0.19999999999999984,24.99999999999998,3.5384100686094007,,,,37.14414699970415,37.43809775029793,-0.29395075059377973,-0.7851647606519765,0.09990322635678014,0.10432800196112454,-0.0044247756043444,-4.241215705437541 diff --git a/paper/src/chapters/figures/results/generated/legacy/ppo_alpha_mode_summary.csv b/paper/src/chapters/figures/results/generated/legacy/ppo_alpha_mode_summary.csv new file mode 100644 index 0000000..52cff7b --- /dev/null +++ b/paper/src/chapters/figures/results/generated/legacy/ppo_alpha_mode_summary.csv @@ -0,0 +1,13 @@ +alpha,mode,runs,eval_revenue_mean_mean,eval_revenue_mean_std,eval_reward_mean_mean,eval_reward_mean_std,eval_coi_level_mean_mean,eval_coi_level_mean_std,eval_coi_leakage_mean_mean,eval_coi_leakage_mean_std,eval_volatility_mean_mean,eval_volatility_mean_std,eval_margin_mean_mean,eval_margin_mean_std,train_alpha_adv_mean,train_alpha_adv_std,train_coi_penalty_mean,train_coi_penalty_std,train_ux_penalty_mean,train_ux_penalty_std,train_agent_prob_mean,train_agent_prob_std +0.0,no_robust,4,3565.2912010160844,52.219179508209216,331300.229069,5038.96659004527,137.28764358955686,0.6434240315013728,0.11861133504329742,0.004019332768284657,0.06445662162531288,0.004080405219050139,0.9317078361627993,0.00038018051704976865,,,,,66.58479574844135,32.282270089830455,0.11615891320235115,0.016558627227281013 
+0.0,robust,4,3379.9042994670963,54.727408939657735,313527.4707462,5408.058196552377,137.08358925982625,1.047386315387148,0.1146626165658294,0.0025627354157035497,0.06687153537785637,0.008577061675868377,0.9315273502623671,0.0007274203134899985,0.18958333333333333,0.02083333333333336,5.553200113221484,0.45981481828856186,61.35134238638615,30.27964905193963,0.12778212146468534,0.027929667978205217 +0.1,no_robust,4,3458.002436284769,60.75923217871363,321215.477968,6016.373193216596,136.82757579763506,1.1899102161551907,0.11704917861668755,0.0021220259908233973,0.06737596899527175,0.006801136773079149,0.9313276818191593,0.0008352263172197586,0.1,0.0,,,53.288869747139515,18.480340945815023,0.11765277436070229,0.017544197575138736 +0.1,robust,4,3307.028238366196,35.58495715224888,306772.49146475,3488.2690530060245,137.1182041122497,0.8582218376452346,0.1128546052304944,0.0005963155492967403,0.0685405649303561,0.0050673362512629015,0.9315331673960889,0.0005217376436765336,0.2818749999999999,0.03624999999999999,5.079528726095333,0.6109585102054891,52.44772950699336,29.0263361696475,0.11644381911386253,0.021152545180088765 +0.25,no_robust,4,3300.5539051855053,50.460978662647115,306522.90003785,4860.668937531515,136.71752459667877,0.7410676951244369,0.1139905600539111,0.003319948537321803,0.06846858821082077,0.008614994548315848,0.9313053225630614,0.0004919872662680591,0.25,0.0,,,55.2030005738411,26.88247558235345,0.11684259343269415,0.013462146346772591 +0.25,robust,4,3134.3438215278165,64.06834403659167,290691.4771835,6331.196493752059,136.89990884669214,1.3796663751798552,0.11113957413522965,0.0015044942041406348,0.06427159998376095,0.0042331619171274894,0.9314501501825461,0.0008939739741734515,0.44833333333333336,0.0033333333333333518,4.7183804755060255,0.4538389380858333,49.04307009982127,28.20484665432831,0.10998505830218755,0.010731404693185651 
+0.4,no_robust,4,3180.7872854626567,71.87564776824694,295433.5405797,7035.374110540269,136.5783021470118,1.7095219574599192,0.11189234314151972,0.0013821115134030936,0.07104688223410768,0.005766138692685495,0.9310542820602117,0.0013989725050689828,0.4000000000000001,0.0,,,50.794260008988424,24.836708377642946,0.11161526349272373,0.005787749200301594 +0.4,robust,4,2983.852437569374,45.51290575912758,276545.26309355,4555.1725323898245,136.19210761854086,1.5546063667946701,0.10875560547061063,0.001118798290958954,0.07452230347799255,0.0040446395928049874,0.9307282962514367,0.0013558080014763189,0.5999999999999999,0.0,4.174996403604185,0.12189448324552496,47.99794119802058,33.51782503281748,0.10222958892923095,0.0031686467591609474 +0.6,no_robust,4,2982.2460998252786,39.93674476199945,277051.95613675,3931.02017169463,136.81931587629953,1.1995405806950865,0.10802266412956946,0.000405835985606262,0.06698591531512615,0.002805894772223563,0.9313849217310588,0.0008100530228792662,0.5999999999999999,0.0,,,47.86860429278531,23.830502772642472,0.1162670925925253,0.028676813474186293 +0.6,robust,4,2789.0434220430398,35.297482315631626,258688.11700405,3420.6735023624556,136.86774320500828,0.7097303238857778,0.10501047827147733,0.0008273121554488608,0.06914180963767007,0.009066158371268139,0.9314130089130337,0.0005024421703994162,0.7733333333333334,0.053333333333333385,4.178300996512875,0.5865970573865015,39.928062615509425,30.25078643153115,0.11297979438696983,0.0274101056520461 +0.8,no_robust,4,2841.1305915063504,21.84043179776092,264140.55002745,2073.353315114627,137.28163778418497,0.6288968799501957,0.1031498585902154,0.0012877581835795701,0.06592454978099352,0.00340700896766341,0.9316596013994161,0.00038430108058413553,0.8000000000000002,0.0,,,37.43809775029793,32.01740090550489,0.10432800196112454,0.018337841526911584 
+0.8,robust,4,2586.098242115281,48.05539265296157,239765.24959855,4681.6472175597555,136.5038826686135,1.0611320896043694,0.10253056902792507,0.002587472569909977,0.07325665736408164,0.0015359324114246234,0.9311235469993302,0.0006145440308596868,1.0,0.0,3.5384100686094007,0.391972726035734,37.14414699970415,25.614063825315505,0.09990322635678014,0.010269342031085898 diff --git a/paper/src/chapters/figures/results/generated/legacy/ppo_headline_summary.json b/paper/src/chapters/figures/results/generated/legacy/ppo_headline_summary.json new file mode 100644 index 0000000..5b106f2 --- /dev/null +++ b/paper/src/chapters/figures/results/generated/legacy/ppo_headline_summary.json @@ -0,0 +1,7 @@ +{ + "status": "ok", + "revenue_delta": -191.29017636530716, + "revenue_delta_pct": -5.938226273545598, + "coi_leakage_delta": -0.002960415145605702, + "coi_leakage_delta_pct": -2.6404147469510946 +} \ No newline at end of file diff --git a/paper/src/chapters/figures/results/generated/legacy/ppo_overall_mode_summary.csv b/paper/src/chapters/figures/results/generated/legacy/ppo_overall_mode_summary.csv new file mode 100644 index 0000000..c45b856 --- /dev/null +++ b/paper/src/chapters/figures/results/generated/legacy/ppo_overall_mode_summary.csv @@ -0,0 +1,3 @@ +mode,runs,eval_revenue_mean_mean,eval_revenue_mean_std,eval_reward_mean_mean,eval_reward_mean_std,eval_coi_level_mean_mean,eval_coi_level_mean_std,eval_coi_leakage_mean_mean,eval_coi_leakage_mean_std,eval_volatility_mean_mean,eval_volatility_mean_std,eval_margin_mean_mean,eval_margin_mean_std,train_alpha_adv_mean,train_alpha_adv_std,train_coi_penalty_mean,train_coi_penalty_std,train_ux_penalty_mean,train_ux_penalty_std,train_agent_prob_mean,train_agent_prob_std 
+no_robust,24,3221.335253213441,262.46595166337727,299277.442303125,24382.561944761477,136.9186666318945,1.0038463876967063,0.11211932326253345,0.005805494533542669,0.06737642102693879,0.005402738047823369,0.9314066076226178,0.0007436370959663933,0.43,0.2546411303445653,,,51.86293802024894,25.340287421525442,0.11381077317368686,0.016664235359362907 +robust,24,3030.0450768481337,288.262657026656,280998.34484843333,26820.020161880373,136.77757261848845,1.06224696086916,0.10915890811692774,0.004616462637659704,0.06943407846195294,0.006435789449278624,0.9312959200008004,0.0007858424519830652,0.5488541666666666,0.2860373751485706,4.540469463924883,0.7906156355346259,47.985382134405825,27.407657819442747,0.11155393475895271,0.01943348418653492 diff --git a/paper/src/chapters/figures/results/generated/legacy/ppo_pairwise_win_rates.csv b/paper/src/chapters/figures/results/generated/legacy/ppo_pairwise_win_rates.csv new file mode 100644 index 0000000..856cc8b --- /dev/null +++ b/paper/src/chapters/figures/results/generated/legacy/ppo_pairwise_win_rates.csv @@ -0,0 +1,25 @@ +alpha,metric,direction,wins,ties,total_pairs,win_probability +0.0,eval/revenue_mean,higher,0,0,16,0.0 +0.0,eval/reward_mean,higher,0,0,16,0.0 +0.0,eval/coi_leakage_mean,lower,14,0,16,0.875 +0.0,eval/volatility_mean,lower,8,0,16,0.5 +0.1,eval/revenue_mean,higher,0,0,16,0.0 +0.1,eval/reward_mean,higher,0,0,16,0.0 +0.1,eval/coi_leakage_mean,lower,16,0,16,1.0 +0.1,eval/volatility_mean,lower,8,0,16,0.5 +0.25,eval/revenue_mean,higher,0,0,16,0.0 +0.25,eval/reward_mean,higher,0,0,16,0.0 +0.25,eval/coi_leakage_mean,lower,12,0,16,0.75 +0.25,eval/volatility_mean,lower,11,0,16,0.6875 +0.4,eval/revenue_mean,higher,0,0,16,0.0 +0.4,eval/reward_mean,higher,0,0,16,0.0 +0.4,eval/coi_leakage_mean,lower,16,0,16,1.0 +0.4,eval/volatility_mean,lower,6,0,16,0.375 +0.6,eval/revenue_mean,higher,0,0,16,0.0 +0.6,eval/reward_mean,higher,0,0,16,0.0 +0.6,eval/coi_leakage_mean,lower,16,0,16,1.0 
+0.6,eval/volatility_mean,lower,7,0,16,0.4375 +0.8,eval/revenue_mean,higher,0,0,16,0.0 +0.8,eval/reward_mean,higher,0,0,16,0.0 +0.8,eval/coi_leakage_mean,lower,11,0,16,0.6875 +0.8,eval/volatility_mean,lower,0,0,16,0.0 diff --git a/paper/src/chapters/figures/results/includes/final/final_focus_revenue_by_alpha.tex b/paper/src/chapters/figures/results/includes/final/final_focus_revenue_by_alpha.tex new file mode 100644 index 0000000..f2fa4a2 --- /dev/null +++ b/paper/src/chapters/figures/results/includes/final/final_focus_revenue_by_alpha.tex @@ -0,0 +1 @@ +\includegraphics[width=0.98\linewidth]{chapters/figures/results/generated/final/plots/final_focus_revenue_by_alpha.pdf} diff --git a/paper/src/chapters/figures/results/includes/final/final_focus_revenue_delta.tex b/paper/src/chapters/figures/results/includes/final/final_focus_revenue_delta.tex new file mode 100644 index 0000000..0a13ca4 --- /dev/null +++ b/paper/src/chapters/figures/results/includes/final/final_focus_revenue_delta.tex @@ -0,0 +1 @@ +\includegraphics[width=0.95\linewidth]{chapters/figures/results/generated/final/plots/final_focus_revenue_delta.pdf} diff --git a/paper/src/chapters/figures/results/includes/final/final_focus_risk_deltas.tex b/paper/src/chapters/figures/results/includes/final/final_focus_risk_deltas.tex new file mode 100644 index 0000000..c694faf --- /dev/null +++ b/paper/src/chapters/figures/results/includes/final/final_focus_risk_deltas.tex @@ -0,0 +1 @@ +\includegraphics[width=0.95\linewidth]{chapters/figures/results/generated/final/plots/final_focus_risk_deltas.pdf} diff --git a/paper/src/chapters/figures/results/includes/legacy/first_sweep_tier_revenue.tex b/paper/src/chapters/figures/results/includes/legacy/first_sweep_tier_revenue.tex new file mode 100644 index 0000000..52a61b4 --- /dev/null +++ b/paper/src/chapters/figures/results/includes/legacy/first_sweep_tier_revenue.tex @@ -0,0 +1 @@ 
+\includegraphics[width=0.99\linewidth]{chapters/figures/results/generated/legacy/plots/first_sweep_tier_revenue.pdf} diff --git a/paper/src/chapters/figures/results/includes/legacy/ppo_alpha_curves.tex b/paper/src/chapters/figures/results/includes/legacy/ppo_alpha_curves.tex new file mode 100644 index 0000000..b4f6618 --- /dev/null +++ b/paper/src/chapters/figures/results/includes/legacy/ppo_alpha_curves.tex @@ -0,0 +1 @@ +\includegraphics[width=0.98\linewidth]{chapters/figures/results/generated/legacy/plots/ppo_alpha_curves.pdf} diff --git a/paper/src/chapters/figures/results/includes/legacy/ppo_delta_curves.tex b/paper/src/chapters/figures/results/includes/legacy/ppo_delta_curves.tex new file mode 100644 index 0000000..2b37f92 --- /dev/null +++ b/paper/src/chapters/figures/results/includes/legacy/ppo_delta_curves.tex @@ -0,0 +1 @@ +\includegraphics[width=0.98\linewidth]{chapters/figures/results/generated/legacy/plots/ppo_delta_curves.pdf} diff --git a/paper/src/chapters/figures/results/includes/legacy/ppo_tradeoff_scatter.tex b/paper/src/chapters/figures/results/includes/legacy/ppo_tradeoff_scatter.tex new file mode 100644 index 0000000..7b795d1 --- /dev/null +++ b/paper/src/chapters/figures/results/includes/legacy/ppo_tradeoff_scatter.tex @@ -0,0 +1 @@ +\includegraphics[width=0.88\linewidth]{chapters/figures/results/generated/legacy/plots/ppo_tradeoff_scatter.pdf} diff --git a/paper/src/chapters/figures/results/plot_results.py b/paper/src/chapters/figures/results/plot_results.py new file mode 100644 index 0000000..0476948 --- /dev/null +++ b/paper/src/chapters/figures/results/plot_results.py @@ -0,0 +1,313 @@ +from __future__ import annotations + +import argparse +from pathlib import Path + +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt +from matplotlib.ticker import FuncFormatter +import numpy as np +import pandas as pd + +from process_first_sweep import run as run_first_sweep +from process_ppo_benchmark import run as 
run_ppo_benchmark + + +def _output_dir() -> Path: + return Path(__file__).resolve().parent / "generated" / "legacy" + + +def _plot_dir() -> Path: + return _output_dir() / "plots" + + +def _configure_style() -> None: + plt.rcParams.update( + { + "font.family": "serif", + "font.size": 10, + "axes.titlesize": 10, + "axes.labelsize": 9, + "legend.fontsize": 8, + "xtick.labelsize": 8, + "ytick.labelsize": 8, + "figure.dpi": 220, + "savefig.dpi": 320, + "axes.spines.top": False, + "axes.spines.right": False, + "axes.grid": True, + "grid.alpha": 0.22, + } + ) + + +def _fmt_thousands(value: float, _: int) -> str: + return f"{int(value):,}" + + +def _load_csv(path: Path) -> pd.DataFrame: + if not path.exists(): + raise FileNotFoundError(f"Missing required input: {path}") + return pd.read_csv(path) + + +def _plot_ppo_alpha_curves(alpha_mode: pd.DataFrame, out_dir: Path) -> Path: + fig, axes = plt.subplots(2, 2, figsize=(9.3, 6.4), constrained_layout=True) + robust_color = "#C44E52" + baseline_color = "#4C72B0" + mode_colors = {"robust": robust_color, "no_robust": baseline_color} + mode_labels = {"robust": "Robust", "no_robust": "Non-robust"} + + panels = [ + ("eval_revenue_mean", "Mean Episode Revenue", "Revenue"), + ("eval_reward_mean", "Mean Episode Reward", "Reward"), + ("eval_coi_leakage_mean", "Mean COI Leakage", "COI Leakage"), + ("eval_volatility_mean", "Mean Price Volatility", "Volatility"), + ] + + for ax, (metric_prefix, title, ylabel) in zip(axes.flat, panels): + mean_col = f"{metric_prefix}_mean" + std_col = f"{metric_prefix}_std" + for mode in ("no_robust", "robust"): + sub = alpha_mode[alpha_mode["mode"] == mode].sort_values("alpha") + if sub.empty: + continue + x = sub["alpha"].to_numpy(dtype=float) + y = sub[mean_col].to_numpy(dtype=float) + ax.plot( + x, + y, + marker="o", + linewidth=1.8, + markersize=4, + color=mode_colors[mode], + label=mode_labels[mode], + ) + if std_col in sub.columns: + sigma = sub[std_col].fillna(0.0).to_numpy(dtype=float) + 
ax.fill_between( + x, + y - sigma, + y + sigma, + color=mode_colors[mode], + alpha=0.14, + linewidth=0, + ) + + ax.set_title(title) + ax.set_xlabel(r"Contamination $\alpha$") + ax.set_ylabel(ylabel) + ax.set_xticks(sorted(alpha_mode["alpha"].unique())) + if metric_prefix in {"eval_revenue_mean", "eval_reward_mean"}: + ax.yaxis.set_major_formatter(FuncFormatter(_fmt_thousands)) + + handles, labels = axes.flat[0].get_legend_handles_labels() + fig.legend(handles, labels, ncol=2, loc="upper center", bbox_to_anchor=(0.5, 1.02)) + + out_path = out_dir / "ppo_alpha_curves.pdf" + fig.savefig(out_path, bbox_inches="tight") + plt.close(fig) + return out_path + + +def _plot_ppo_delta_curves(deltas: pd.DataFrame, out_dir: Path) -> Path: + fig, axes = plt.subplots(2, 1, figsize=(8.6, 6.0), constrained_layout=True) + deltas = deltas.sort_values("alpha") + x = deltas["alpha"].to_numpy(dtype=float) + + top_metrics = [ + ("eval_revenue_mean_delta_pct", "Revenue", "#4C72B0"), + ("eval_reward_mean_delta_pct", "Reward", "#8172B3"), + ] + for col, label, color in top_metrics: + axes[0].plot( + x, + deltas[col].to_numpy(dtype=float), + marker="o", + linewidth=1.8, + markersize=4, + color=color, + label=label, + ) + axes[0].axhline(0.0, color="#444444", linewidth=1.0, linestyle="--") + axes[0].set_title("Robust Minus Non-robust Delta by Contamination") + axes[0].set_ylabel("Delta (%)") + axes[0].set_xlabel(r"Contamination $\alpha$") + axes[0].set_xticks(x) + axes[0].legend(loc="lower left") + + bottom_metrics = [ + ("eval_coi_leakage_mean_delta_pct", "COI Leakage", "#55A868"), + ("eval_volatility_mean_delta_pct", "Volatility", "#DD8452"), + ] + for col, label, color in bottom_metrics: + axes[1].plot( + x, + deltas[col].to_numpy(dtype=float), + marker="o", + linewidth=1.8, + markersize=4, + color=color, + label=label, + ) + axes[1].axhline(0.0, color="#444444", linewidth=1.0, linestyle="--") + axes[1].set_ylabel("Delta (%)") + axes[1].set_xlabel(r"Contamination $\alpha$") + 
axes[1].set_xticks(x) + axes[1].legend(loc="lower left") + + out_path = out_dir / "ppo_delta_curves.pdf" + fig.savefig(out_path, bbox_inches="tight") + plt.close(fig) + return out_path + + +def _plot_ppo_tradeoff_scatter(deltas: pd.DataFrame, out_dir: Path) -> Path: + fig, ax = plt.subplots(figsize=(6.4, 5.2), constrained_layout=True) + data = deltas.sort_values("alpha") + x = data["eval_coi_leakage_mean_delta_pct"].to_numpy(dtype=float) + y = data["eval_revenue_mean_delta_pct"].to_numpy(dtype=float) + alphas = data["alpha"].to_numpy(dtype=float) + + scatter = ax.scatter( + x, + y, + c=alphas, + cmap="viridis", + s=72, + edgecolor="#222222", + linewidth=0.5, + ) + for x_i, y_i, alpha in zip(x, y, alphas): + ax.annotate( + rf"$\alpha={alpha:.2f}$", + (x_i, y_i), + textcoords="offset points", + xytext=(5, 4), + fontsize=8, + ) + + ax.axhline(0.0, color="#555555", linewidth=1.0, linestyle="--") + ax.axvline(0.0, color="#555555", linewidth=1.0, linestyle="--") + ax.set_xlabel("COI Leakage Delta (%)") + ax.set_ylabel("Revenue Delta (%)") + ax.set_title("PPO Robust Tradeoff Frontier") + cbar = fig.colorbar(scatter, ax=ax) + cbar.set_label(r"Contamination $\alpha$") + + out_path = out_dir / "ppo_tradeoff_scatter.pdf" + fig.savefig(out_path, bbox_inches="tight") + plt.close(fig) + return out_path + + +def _plot_first_sweep_tier_revenue(tier_mode: pd.DataFrame, out_dir: Path) -> Path: + pivot = ( + tier_mode.pivot(index="tier", columns="mode", values="eval_revenue_mean_mean") + .dropna(subset=["robust", "no_robust"], how="any") + .copy() + ) + if pivot.empty: + raise ValueError("First sweep tier summary missing robust/non-robust pairs") + + order = sorted(pivot.index.tolist()) + pivot = pivot.loc[order] + delta_pct = 100.0 * (pivot["robust"] - pivot["no_robust"]) / pivot["no_robust"] + + fig, axes = plt.subplots(1, 2, figsize=(10.2, 4.3), constrained_layout=True) + x = np.arange(len(order)) + width = 0.36 + + axes[0].bar( + x - width / 2, + 
pivot["no_robust"].to_numpy(dtype=float), + width=width, + label="Non-robust", + color="#4C72B0", + ) + axes[0].bar( + x + width / 2, + pivot["robust"].to_numpy(dtype=float), + width=width, + label="Robust", + color="#C44E52", + ) + axes[0].set_xticks(x) + axes[0].set_xticklabels(order, rotation=20) + axes[0].set_ylabel("Mean Revenue") + axes[0].set_yscale("log") + axes[0].yaxis.set_major_formatter(FuncFormatter(_fmt_thousands)) + axes[0].set_title("First Sweep Tier Revenue (log scale)") + axes[0].legend() + + axes[1].bar(x, delta_pct.to_numpy(dtype=float), color="#55A868", width=0.55) + axes[1].axhline(0.0, color="#444444", linewidth=1.0, linestyle="--") + axes[1].set_xticks(x) + axes[1].set_xticklabels(order, rotation=20) + axes[1].set_ylabel("Revenue Delta (%)") + axes[1].set_title("Robust Minus Non-robust by Tier") + + out_path = out_dir / "first_sweep_tier_revenue.pdf" + fig.savefig(out_path, bbox_inches="tight") + plt.close(fig) + return out_path + + +def build_plots(data_dir: Path, out_dir: Path) -> list[Path]: + alpha_mode = _load_csv(data_dir / "ppo_alpha_mode_summary.csv") + deltas = _load_csv(data_dir / "ppo_alpha_deltas.csv") + tier_mode = _load_csv(data_dir / "first_sweep_tier_mode_summary.csv") + + out_dir.mkdir(parents=True, exist_ok=True) + paths = [ + _plot_ppo_alpha_curves(alpha_mode, out_dir), + _plot_ppo_delta_curves(deltas, out_dir), + _plot_ppo_tradeoff_scatter(deltas, out_dir), + _plot_first_sweep_tier_revenue(tier_mode, out_dir), + ] + return paths + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Create paper-ready plots from result CSVs" + ) + parser.add_argument("--data-dir", type=Path, default=_output_dir()) + parser.add_argument("--plot-dir", type=Path, default=_plot_dir()) + parser.add_argument( + "--refresh-data", + action="store_true", + help="Regenerate processed CSVs before plotting", + ) + args = parser.parse_args() + + _configure_style() + + if bool(args.refresh_data): + run_ppo_benchmark( + 
input_path=Path(__file__).resolve().parents[5] + / "tpu_orchestration" + / "results" + / "ppo_benchmark.csv", + output_dir=args.data_dir, + include_non_finished=False, + ) + run_first_sweep( + input_path=Path(__file__).resolve().parents[5] + / "tpu_orchestration" + / "results" + / "first_sweep.csv", + output_dir=args.data_dir, + include_non_finished=False, + top_n=25, + ) + + outputs = build_plots(data_dir=args.data_dir, out_dir=args.plot_dir) + for path in outputs: + print(path) + + +if __name__ == "__main__": + main() diff --git a/paper/src/chapters/figures/results/plot_wandb_export.py b/paper/src/chapters/figures/results/plot_wandb_export.py new file mode 100644 index 0000000..cda7faf --- /dev/null +++ b/paper/src/chapters/figures/results/plot_wandb_export.py @@ -0,0 +1,658 @@ +from __future__ import annotations + +import argparse +from pathlib import Path +from typing import Iterable + +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt +from matplotlib.ticker import FuncFormatter +import numpy as np +import pandas as pd + + +def _load_tikzplotlib(): + def _patch_webcolors() -> None: + try: + import webcolors + + if hasattr(webcolors, "CSS3_HEX_TO_NAMES"): + return + css3 = getattr(webcolors, "CSS3", "css3") + webcolors.CSS3_HEX_TO_NAMES = { + webcolors.name_to_hex(name, spec=css3): name + for name in webcolors.names(spec=css3) + } + except Exception: + return + + _patch_webcolors() + + try: + from matplotlib.legend import Legend + + if not hasattr(Legend, "_ncol") and hasattr(Legend, "_ncols"): + Legend._ncol = property(lambda self: self._ncols) + except Exception: + pass + + try: + import tikzplotlib as module + + return module, None + except Exception: + pass + + try: + from matplotlib.backends import backend_pgf + + if not hasattr(backend_pgf, "common_texification") and hasattr( + backend_pgf, "_tex_escape" + ): + backend_pgf.common_texification = backend_pgf._tex_escape + + _patch_webcolors() + + import tikzplotlib as module + + 
return module, None + except Exception as exc: + return None, exc + + +TIKZPLOTLIB, TIKZPLOTLIB_IMPORT_ERROR = _load_tikzplotlib() + + +def _default_output_dir() -> Path: + return Path(__file__).resolve().parent / "generated" / "wandb" + + +def _default_plot_dir(output_dir: Path) -> Path: + return output_dir / "plots" + + +def _sanitize(key: str) -> str: + return key.replace("/", "_").replace("-", "_") + + +def _configure_style() -> None: + plt.rcParams.update( + { + "font.family": "serif", + "font.size": 10, + "axes.titlesize": 10, + "axes.labelsize": 9, + "legend.fontsize": 8, + "xtick.labelsize": 8, + "ytick.labelsize": 8, + "figure.dpi": 220, + "savefig.dpi": 320, + "axes.spines.top": False, + "axes.spines.right": False, + "axes.grid": True, + "grid.alpha": 0.22, + } + ) + + +def _fmt_thousands(value: float, _: int) -> str: + return f"{int(value):,}" + + +def _coerce_numeric(frame: pd.DataFrame, columns: Iterable[str]) -> None: + for column in columns: + if column in frame.columns: + frame[column] = pd.to_numeric(frame[column], errors="coerce") + + +def _extract_alpha(frame: pd.DataFrame) -> pd.Series: + if "study/alpha" in frame.columns: + return pd.to_numeric(frame["study/alpha"], errors="coerce") + if "alpha" in frame.columns: + return pd.to_numeric(frame["alpha"], errors="coerce") + return pd.Series(np.nan, index=frame.index, dtype=float) + + +def _extract_mode(frame: pd.DataFrame) -> pd.Series: + if "study/mode" in frame.columns: + mode = frame["study/mode"].astype(str).str.strip().str.lower() + mapping = { + "baseline": "baseline", + "no_robust": "baseline", + "defended": "defended", + "robust": "defended", + } + return mode.map(mapping).fillna("") + + if "study/no_robust" in frame.columns: + no_robust = pd.to_numeric(frame["study/no_robust"], errors="coerce").fillna(0.0) + return pd.Series( + np.where(no_robust > 0.5, "baseline", "defended"), + index=frame.index, + dtype="object", + ) + + if "no_robust" in frame.columns: + no_robust = ( + 
frame["no_robust"].astype(str).str.lower().isin({"1", "true", "yes"}) + ) + return pd.Series( + np.where(no_robust, "baseline", "defended"), + index=frame.index, + dtype="object", + ) + + return pd.Series("", index=frame.index, dtype="object") + + +def _prepare_frame(frame: pd.DataFrame, include_non_finished: bool) -> pd.DataFrame: + data = frame.copy() + if not include_non_finished and "State" in data.columns: + data = data[data["State"].astype(str).str.lower() == "finished"].copy() + + data["alpha"] = _extract_alpha(data) + data["mode"] = _extract_mode(data) + data = data[data["mode"].isin({"baseline", "defended"})] + data = data[data["alpha"].notna()] + + _coerce_numeric( + data, + [ + "eval/revenue_mean", + "eval/reward_mean", + "eval/coi_level_mean", + "eval/coi_leakage_mean", + "eval/volatility_mean", + "eval/revenue_std", + "eval/reward_std", + "eval/margin_mean", + "train/agent_prob", + "train/alpha_adv", + "lambda_coi", + "ambiguity_radius", + "n_products", + ], + ) + + return data.sort_values(["alpha", "mode"]).reset_index(drop=True) + + +def _summary_by_alpha_mode(frame: pd.DataFrame, metrics: list[str]) -> pd.DataFrame: + agg_spec: dict[str, tuple[str, str]] = {"runs": ("mode", "size")} + for metric in metrics: + safe = _sanitize(metric) + agg_spec[f"{safe}_mean"] = (metric, "mean") + agg_spec[f"{safe}_std"] = (metric, "std") + + return ( + frame.groupby(["alpha", "mode"], as_index=False) + .agg(**agg_spec) + .sort_values(["alpha", "mode"]) + .reset_index(drop=True) + ) + + +def _delta_by_alpha(summary: pd.DataFrame, metrics: list[str]) -> pd.DataFrame: + rows: list[dict[str, float]] = [] + for alpha, alpha_group in summary.groupby("alpha", sort=True): + defended = alpha_group[alpha_group["mode"] == "defended"] + baseline = alpha_group[alpha_group["mode"] == "baseline"] + if defended.empty or baseline.empty: + continue + + row: dict[str, float] = { + "alpha": float(alpha), + "runs_defended": float(defended["runs"].iloc[0]), + "runs_baseline": 
float(baseline["runs"].iloc[0]), + } + for metric in metrics: + safe = _sanitize(metric) + defended_value = float(defended[f"{safe}_mean"].iloc[0]) + baseline_value = float(baseline[f"{safe}_mean"].iloc[0]) + delta = defended_value - baseline_value + row[f"{safe}_defended"] = defended_value + row[f"{safe}_baseline"] = baseline_value + row[f"{safe}_delta"] = delta + row[f"{safe}_delta_pct"] = ( + np.nan if baseline_value == 0 else 100.0 * delta / baseline_value + ) + rows.append(row) + + return pd.DataFrame(rows) + + +def _summary_by_parameter( + frame: pd.DataFrame, parameter: str, metrics: list[str] +) -> pd.DataFrame: + defended = frame[frame["mode"] == "defended"].copy() + defended = defended[defended[parameter].notna()].copy() + agg_spec: dict[str, tuple[str, str]] = {"runs": ("mode", "size")} + for metric in metrics: + safe = _sanitize(metric) + agg_spec[f"{safe}_mean"] = (metric, "mean") + agg_spec[f"{safe}_std"] = (metric, "std") + + return ( + defended.groupby(["alpha", parameter], as_index=False) + .agg(**agg_spec) + .sort_values(["alpha", parameter]) + .reset_index(drop=True) + ) + + +def _save_table(frame: pd.DataFrame, path: Path) -> Path: + path.parent.mkdir(parents=True, exist_ok=True) + frame.to_csv(path, index=False) + return path + + +def _save_figure(fig: plt.Figure, pdf_path: Path, export_tikz: bool) -> list[Path]: + pdf_path.parent.mkdir(parents=True, exist_ok=True) + fig.savefig(pdf_path, bbox_inches="tight") + written = [pdf_path] + + if export_tikz: + if TIKZPLOTLIB is None: + raise RuntimeError( + "tikzplotlib import failed. Install/upgrade tikzplotlib and matplotlib-compatible dependencies. 
" + f"Original error: {TIKZPLOTLIB_IMPORT_ERROR}" + ) + + try: + from matplotlib.legend import Legend + from matplotlib.lines import Line2D + + for legend in fig.findobj(Legend): + if not hasattr(legend, "_ncol") and hasattr(legend, "_ncols"): + setattr(legend, "_ncol", legend._ncols) + if not hasattr(legend, "legendHandles") and hasattr( + legend, "legend_handles" + ): + setattr(legend, "legendHandles", legend.legend_handles) + + for line in fig.findobj(Line2D): + if hasattr(line, "_us_dashSeq"): + continue + if not hasattr(line, "_dash_pattern"): + continue + dash_pattern = getattr(line, "_dash_pattern") + if not isinstance(dash_pattern, tuple) or len(dash_pattern) != 2: + continue + setattr(line, "_us_dashOffset", dash_pattern[0]) + setattr(line, "_us_dashSeq", dash_pattern[1]) + except Exception: + pass + + tikz_path = pdf_path.with_suffix(".tikz.tex") + TIKZPLOTLIB.save(str(tikz_path), figure=fig) + written.append(tikz_path) + + plt.close(fig) + return written + + +def _plot_alpha_curves( + alpha_mode: pd.DataFrame, out_dir: Path, export_tikz: bool +) -> list[Path]: + fig, axes = plt.subplots(2, 2, figsize=(9.3, 6.4), constrained_layout=True) + mode_colors = {"baseline": "#4C72B0", "defended": "#C44E52"} + mode_labels = {"baseline": "Baseline", "defended": "Defended"} + + panels = [ + ("eval_revenue_mean", "Mean Episode Revenue", "Revenue"), + ("eval_reward_mean", "Mean Episode Reward", "Reward"), + ("eval_coi_leakage_mean", "Mean COI Leakage", "COI Leakage"), + ("eval_volatility_mean", "Mean Price Volatility", "Volatility"), + ] + + for ax, (metric_prefix, title, ylabel) in zip(axes.flat, panels): + mean_col = f"{metric_prefix}_mean" + std_col = f"{metric_prefix}_std" + for mode in ("baseline", "defended"): + sub = alpha_mode[alpha_mode["mode"] == mode].sort_values("alpha") + if sub.empty: + continue + x = sub["alpha"].to_numpy(dtype=float) + y = sub[mean_col].to_numpy(dtype=float) + ax.plot( + x, + y, + marker="o", + linewidth=1.8, + markersize=4, + 
color=mode_colors[mode], + label=mode_labels[mode], + ) + sigma = sub[std_col].fillna(0.0).to_numpy(dtype=float) + ax.fill_between( + x, + y - sigma, + y + sigma, + color=mode_colors[mode], + alpha=0.14, + linewidth=0, + ) + + ax.set_title(title) + ax.set_xlabel(r"Contamination $\alpha$") + ax.set_ylabel(ylabel) + ax.set_xticks(sorted(alpha_mode["alpha"].unique())) + if metric_prefix in {"eval_revenue_mean", "eval_reward_mean"}: + ax.yaxis.set_major_formatter(FuncFormatter(_fmt_thousands)) + + handles, labels = axes.flat[0].get_legend_handles_labels() + fig.legend(handles, labels, ncol=2, loc="upper center", bbox_to_anchor=(0.5, 1.02)) + return _save_figure(fig, out_dir / "wandb_alpha_curves.pdf", export_tikz) + + +def _plot_delta_curves( + deltas: pd.DataFrame, out_dir: Path, export_tikz: bool +) -> list[Path]: + fig, axes = plt.subplots(2, 1, figsize=(8.6, 6.0), constrained_layout=True) + deltas = deltas.sort_values("alpha") + x = deltas["alpha"].to_numpy(dtype=float) + + top_metrics = [ + ("eval_revenue_mean_delta_pct", "Revenue", "#4C72B0"), + ("eval_reward_mean_delta_pct", "Reward", "#8172B3"), + ] + for col, label, color in top_metrics: + axes[0].plot( + x, + deltas[col].to_numpy(dtype=float), + marker="o", + linewidth=1.8, + markersize=4, + color=color, + label=label, + ) + axes[0].axhline(0.0, color="#444444", linewidth=1.0, linestyle="--") + axes[0].set_title("Defended Minus Baseline Delta by Contamination") + axes[0].set_ylabel("Delta (%)") + axes[0].set_xlabel(r"Contamination $\alpha$") + axes[0].set_xticks(x) + axes[0].legend(loc="lower left") + + bottom_metrics = [ + ("eval_coi_leakage_mean_delta_pct", "COI Leakage", "#55A868"), + ("eval_volatility_mean_delta_pct", "Volatility", "#DD8452"), + ] + for col, label, color in bottom_metrics: + axes[1].plot( + x, + deltas[col].to_numpy(dtype=float), + marker="o", + linewidth=1.8, + markersize=4, + color=color, + label=label, + ) + axes[1].axhline(0.0, color="#444444", linewidth=1.0, linestyle="--") + 
axes[1].set_ylabel("Delta (%)") + axes[1].set_xlabel(r"Contamination $\alpha$") + axes[1].set_xticks(x) + axes[1].legend(loc="lower left") + + return _save_figure(fig, out_dir / "wandb_delta_curves.pdf", export_tikz) + + +def _plot_tradeoff_scatter( + deltas: pd.DataFrame, out_dir: Path, export_tikz: bool +) -> list[Path]: + fig, ax = plt.subplots(figsize=(6.4, 5.2), constrained_layout=True) + data = deltas.sort_values("alpha") + x = data["eval_coi_leakage_mean_delta_pct"].to_numpy(dtype=float) + y = data["eval_revenue_mean_delta_pct"].to_numpy(dtype=float) + alphas = data["alpha"].to_numpy(dtype=float) + + scatter = ax.scatter( + x, + y, + c=alphas, + cmap="viridis", + s=72, + edgecolor="#222222", + linewidth=0.5, + ) + for x_i, y_i, alpha in zip(x, y, alphas): + ax.annotate( + rf"$\alpha={alpha:.2f}$", + (x_i, y_i), + textcoords="offset points", + xytext=(5, 4), + fontsize=8, + ) + + ax.axhline(0.0, color="#555555", linewidth=1.0, linestyle="--") + ax.axvline(0.0, color="#555555", linewidth=1.0, linestyle="--") + ax.set_xlabel("COI Leakage Delta (%)") + ax.set_ylabel("Revenue Delta (%)") + ax.set_title("Defended Tradeoff Frontier") + cbar = fig.colorbar(scatter, ax=ax) + cbar.set_label(r"Contamination $\alpha$") + + return _save_figure(fig, out_dir / "wandb_tradeoff_scatter.pdf", export_tikz) + + +def _plot_reward_robustness( + alpha_mode: pd.DataFrame, out_dir: Path, export_tikz: bool +) -> list[Path]: + fig, ax = plt.subplots(figsize=(7.6, 4.5), constrained_layout=True) + mode_colors = {"baseline": "#4C72B0", "defended": "#C44E52"} + mode_labels = {"baseline": "Baseline", "defended": "Defended"} + + for mode in ("baseline", "defended"): + sub = alpha_mode[alpha_mode["mode"] == mode].sort_values("alpha") + x = sub["alpha"].to_numpy(dtype=float) + y = sub["eval_reward_mean_std"].fillna(0.0).to_numpy(dtype=float) + ax.plot( + x, + y, + marker="o", + linewidth=1.8, + markersize=4, + color=mode_colors[mode], + label=mode_labels[mode], + ) + + ax.set_title("Reward 
Robustness Across Contamination") + ax.set_xlabel(r"Contamination $\alpha$") + ax.set_ylabel("Reward Std Across Runs") + ax.set_xticks(sorted(alpha_mode["alpha"].unique())) + ax.yaxis.set_major_formatter(FuncFormatter(_fmt_thousands)) + ax.legend(loc="upper left") + return _save_figure(fig, out_dir / "wandb_reward_robustness.pdf", export_tikz) + + +def _plot_parameter_sensitivity( + summary: pd.DataFrame, + parameter: str, + out_name: str, + out_dir: Path, + export_tikz: bool, +) -> list[Path]: + fig, axes = plt.subplots(1, 2, figsize=(10.0, 4.2), constrained_layout=True) + values = sorted(summary[parameter].dropna().unique()) + cmap = plt.get_cmap("viridis") + colors = [cmap(i) for i in np.linspace(0.1, 0.9, len(values))] + + panels = [ + ("eval_revenue_mean", "Revenue"), + ("eval_coi_leakage_mean", "COI Leakage"), + ] + for ax, (metric_prefix, ylabel) in zip(axes, panels): + mean_col = f"{metric_prefix}_mean" + std_col = f"{metric_prefix}_std" + for value, color in zip(values, colors): + sub = summary[summary[parameter] == value].sort_values("alpha") + if sub.empty: + continue + x = sub["alpha"].to_numpy(dtype=float) + y = sub[mean_col].to_numpy(dtype=float) + sigma = sub[std_col].fillna(0.0).to_numpy(dtype=float) + ax.plot( + x, + y, + marker="o", + linewidth=1.6, + markersize=3.6, + color=color, + label=f"{parameter}={value:.2f}", + ) + ax.fill_between( + x, y - sigma, y + sigma, color=color, alpha=0.10, linewidth=0 + ) + + ax.set_xlabel(r"Contamination $\alpha$") + ax.set_ylabel(ylabel) + ax.set_xticks(sorted(summary["alpha"].unique())) + if metric_prefix == "eval_revenue_mean": + ax.yaxis.set_major_formatter(FuncFormatter(_fmt_thousands)) + + axes[0].set_title(f"{parameter} Sensitivity (Defended)") + axes[1].set_title("Leakage Side-Effect") + handles, labels = axes[0].get_legend_handles_labels() + fig.legend( + handles, + labels, + ncol=max(1, len(values) // 2), + loc="upper center", + bbox_to_anchor=(0.5, 1.06), + ) + + return _save_figure(fig, out_dir / 
def _plot_delta_summary(
    deltas: pd.DataFrame, out_dir: Path, export_tikz: bool
) -> list[Path]:
    """Draw three bar panels of percentage deltas, one bar per alpha.

    Args:
        deltas: Per-alpha delta table with ``alpha`` and the
            ``eval_revenue_mean_delta_pct``, ``eval_reward_mean_delta_pct``
            and ``eval_coi_leakage_mean_delta_pct`` columns.
        out_dir: Directory that receives ``wandb_delta_summary.pdf``.
        export_tikz: Forwarded to ``_save_figure``.

    Returns:
        The list of paths returned by ``_save_figure``.
    """
    data = deltas.sort_values("alpha")
    x = np.arange(len(data))
    # Two decimals, as in the trade-off scatter annotations. With one decimal
    # an alpha of 0.15 is rendered as "0.1" and collides with the 0.10 bar.
    labels = [f"{alpha:.2f}" for alpha in data["alpha"].to_numpy(dtype=float)]

    fig, axes = plt.subplots(1, 3, figsize=(11.0, 3.8), constrained_layout=True)
    panels = [
        ("eval_revenue_mean_delta_pct", "Revenue Delta (%)", "#4C72B0"),
        ("eval_reward_mean_delta_pct", "Reward Delta (%)", "#8172B3"),
        ("eval_coi_leakage_mean_delta_pct", "COI Leakage Delta (%)", "#55A868"),
    ]
    for ax, (column, title, color) in zip(axes, panels):
        values = data[column].to_numpy(dtype=float)
        ax.bar(x, values, color=color, alpha=0.85)
        # Zero reference line separates gains from losses.
        ax.axhline(0.0, color="#444444", linewidth=1.0, linestyle="--")
        ax.set_xticks(x)
        ax.set_xticklabels(labels)
        ax.set_xlabel(r"$\alpha$")
        ax.set_title(title)

    return _save_figure(fig, out_dir / "wandb_delta_summary.pdf", export_tikz)
output_dir / "wandb_alpha_deltas.csv")) + written.append( + _save_table(lambda_summary, output_dir / "wandb_lambda_alpha_summary.csv") + ) + written.append( + _save_table(radius_summary, output_dir / "wandb_radius_alpha_summary.csv") + ) + + written.extend(_plot_alpha_curves(alpha_mode, plot_dir, export_tikz)) + written.extend(_plot_delta_curves(deltas, plot_dir, export_tikz)) + written.extend(_plot_tradeoff_scatter(deltas, plot_dir, export_tikz)) + written.extend(_plot_reward_robustness(alpha_mode, plot_dir, export_tikz)) + written.extend( + _plot_parameter_sensitivity( + summary=lambda_summary, + parameter="lambda_coi", + out_name="wandb_lambda_sensitivity", + out_dir=plot_dir, + export_tikz=export_tikz, + ) + ) + written.extend( + _plot_parameter_sensitivity( + summary=radius_summary, + parameter="ambiguity_radius", + out_name="wandb_radius_sensitivity", + out_dir=plot_dir, + export_tikz=export_tikz, + ) + ) + written.extend(_plot_delta_summary(deltas, plot_dir, export_tikz)) + return written + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Generate W&B sweep visualizations for PHANTOM results" + ) + parser.add_argument( + "--input", type=Path, required=True, help="Path to W&B export CSV" + ) + parser.add_argument("--output-dir", type=Path, default=_default_output_dir()) + parser.add_argument("--plot-dir", type=Path, default=None) + parser.add_argument("--include-non-finished", action="store_true") + parser.add_argument( + "--export-tikz", + action="store_true", + help="Export matplotlib figures to TikZ via tikzplotlib", + ) + args = parser.parse_args() + + _configure_style() + plot_dir = ( + args.plot_dir + if args.plot_dir is not None + else _default_plot_dir(args.output_dir) + ) + + outputs = build_artifacts( + input_path=args.input, + output_dir=args.output_dir, + plot_dir=plot_dir, + include_non_finished=bool(args.include_non_finished), + export_tikz=bool(args.export_tikz), + ) + for path in outputs: + print(path) + + +if __name__ 
== "__main__": + main() diff --git a/paper/src/chapters/figures/results/process_all_results.py b/paper/src/chapters/figures/results/process_all_results.py new file mode 100644 index 0000000..78ca65f --- /dev/null +++ b/paper/src/chapters/figures/results/process_all_results.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +import argparse +from pathlib import Path + +from process_first_sweep import run as run_first_sweep +from process_ppo_benchmark import run as run_ppo_benchmark + + +def _default_output_dir() -> Path: + return Path(__file__).resolve().parent / "generated" / "legacy" + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Process all result CSV exports for paper figures" + ) + parser.add_argument("--output-dir", type=Path, default=_default_output_dir()) + parser.add_argument("--include-non-finished", action="store_true") + parser.add_argument("--top-n", type=int, default=25) + args = parser.parse_args() + + written: list[Path] = [] + written.extend( + run_ppo_benchmark( + input_path=Path(__file__).resolve().parents[5] + / "tpu_orchestration" + / "results" + / "ppo_benchmark.csv", + output_dir=args.output_dir, + include_non_finished=bool(args.include_non_finished), + ) + ) + written.extend( + run_first_sweep( + input_path=Path(__file__).resolve().parents[5] + / "tpu_orchestration" + / "results" + / "first_sweep.csv", + output_dir=args.output_dir, + include_non_finished=bool(args.include_non_finished), + top_n=int(args.top_n), + ) + ) + + for path in written: + print(path) + + +if __name__ == "__main__": + main() diff --git a/paper/src/chapters/figures/results/process_final_sweeps.py b/paper/src/chapters/figures/results/process_final_sweeps.py new file mode 100644 index 0000000..f874e2b --- /dev/null +++ b/paper/src/chapters/figures/results/process_final_sweeps.py @@ -0,0 +1,409 @@ +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any + +import matplotlib + 
+matplotlib.use("Agg") +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + + +def _project_root() -> Path: + return Path(__file__).resolve().parents[5] + + +def _default_bundle_dir() -> Path: + base = _project_root() / "engine" / "studies" / "results" / "wandb_sweep_bundles" + bundles = sorted( + [path for path in base.glob("bundle_*") if path.is_dir()], + key=lambda path: path.stat().st_mtime, + reverse=True, + ) + if not bundles: + raise FileNotFoundError(f"No sweep bundle directories found in {base}") + return bundles[0] + + +def _default_output_dir() -> Path: + return Path(__file__).resolve().parent / "generated" / "final" + + +def _default_plot_dir(output_dir: Path) -> Path: + return output_dir / "plots" + + +def _truthy(value: Any) -> bool: + if isinstance(value, bool): + return value + if value is None: + return False + return str(value).strip().lower() in {"1", "true", "yes", "on"} + + +def _mode_of(row: pd.Series) -> str: + mode_hint = str(row.get("study_mode", "")).strip().lower() + if mode_hint in {"baseline", "no_robust"}: + return "baseline" + if mode_hint in {"defended", "robust"}: + return "defended" + if _truthy(row.get("baseline_mode")) or _truthy(row.get("no_robust")): + return "baseline" + return "defended" + + +def _coerce_numeric(frame: pd.DataFrame, columns: list[str]) -> None: + for column in columns: + if column in frame.columns: + frame[column] = pd.to_numeric(frame[column], errors="coerce") + + +def _configure_style() -> None: + plt.rcParams.update( + { + "font.family": "serif", + "font.size": 10, + "axes.titlesize": 10, + "axes.labelsize": 9, + "legend.fontsize": 8, + "xtick.labelsize": 8, + "ytick.labelsize": 8, + "figure.dpi": 220, + "savefig.dpi": 320, + "axes.spines.top": False, + "axes.spines.right": False, + "axes.grid": True, + "grid.alpha": 0.22, + } + ) + + +def _load_runs(bundle_dir: Path) -> pd.DataFrame: + path = bundle_dir / "runs_finished.csv" + if not path.exists(): + raise 
FileNotFoundError(f"Missing required file: {path}") + frame = pd.read_csv(path) + frame["mode"] = frame.apply(_mode_of, axis=1) + _coerce_numeric( + frame, + [ + "alpha", + "n_products", + "eval_revenue_mean", + "eval_reward_mean", + "eval_supra_share_mean", + "eval_volatility_mean", + "eval_coi_level_mean", + "eval_coi_leakage_mean", + "objective_score", + ], + ) + return frame + + +def _focus_sweep(runs: pd.DataFrame) -> str: + coverage = ( + runs.groupby("sweep_id", as_index=False) + .agg( + n_alpha=("alpha", lambda s: int(pd.Series(s).dropna().nunique())), + max_alpha=("alpha", "max"), + run_count=("run_id", "size"), + ) + .sort_values( + ["n_alpha", "max_alpha", "run_count"], ascending=[False, False, False] + ) + ) + if coverage.empty: + raise ValueError("No sweep rows available in runs_finished.csv") + return str(coverage.iloc[0]["sweep_id"]) + + +def _alpha_mode_summary(runs: pd.DataFrame) -> pd.DataFrame: + return ( + runs.groupby(["alpha", "mode"], as_index=False) + .agg( + runs=("run_id", "size"), + revenue_mean=("eval_revenue_mean", "mean"), + reward_mean=("eval_reward_mean", "mean"), + supra_mean=("eval_supra_share_mean", "mean"), + volatility_mean=("eval_volatility_mean", "mean"), + coi_leakage_mean=("eval_coi_leakage_mean", "mean"), + coi_level_mean=("eval_coi_level_mean", "mean"), + ) + .sort_values(["alpha", "mode"]) + .reset_index(drop=True) + ) + + +def _alpha_deltas(alpha_mode: pd.DataFrame) -> pd.DataFrame: + rows: list[dict[str, float]] = [] + for alpha, group in alpha_mode.groupby("alpha", sort=True): + defended = group[group["mode"] == "defended"] + baseline = group[group["mode"] == "baseline"] + if defended.empty or baseline.empty: + continue + d_rev = float(defended["revenue_mean"].iloc[0]) + b_rev = float(baseline["revenue_mean"].iloc[0]) + d_reward = float(defended["reward_mean"].iloc[0]) + b_reward = float(baseline["reward_mean"].iloc[0]) + d_vol = float(defended["volatility_mean"].iloc[0]) + b_vol = 
float(baseline["volatility_mean"].iloc[0]) + d_supra = float(defended["supra_mean"].iloc[0]) + b_supra = float(baseline["supra_mean"].iloc[0]) + d_coi_leak = float(defended["coi_leakage_mean"].iloc[0]) + b_coi_leak = float(baseline["coi_leakage_mean"].iloc[0]) + rows.append( + { + "alpha": float(alpha), + "revenue_delta": d_rev - b_rev, + "revenue_delta_pct": 0.0 + if b_rev == 0.0 + else 100.0 * (d_rev - b_rev) / b_rev, + "reward_delta": d_reward - b_reward, + "reward_delta_pct": 0.0 + if b_reward == 0.0 + else 100.0 * (d_reward - b_reward) / b_reward, + "volatility_delta": d_vol - b_vol, + "supra_delta": d_supra - b_supra, + "coi_leakage_delta": d_coi_leak - b_coi_leak, + } + ) + return pd.DataFrame(rows).sort_values("alpha").reset_index(drop=True) + + +def _zone_summary(alpha_deltas: pd.DataFrame) -> pd.DataFrame: + if alpha_deltas.empty: + return pd.DataFrame() + data = alpha_deltas.copy() + data["zone"] = np.where( + data["alpha"] >= 0.7, "high_alpha_0_7_plus", "low_alpha_below_0_7" + ) + return ( + data.groupby("zone", as_index=False) + .agg( + alpha_cells=("alpha", "size"), + revenue_delta_pct_mean=("revenue_delta_pct", "mean"), + reward_delta_pct_mean=("reward_delta_pct", "mean"), + coi_leakage_delta_mean=("coi_leakage_delta", "mean"), + volatility_delta_mean=("volatility_delta", "mean"), + ) + .sort_values("zone") + ) + + +def _save_plot(fig: plt.Figure, path: Path) -> Path: + path.parent.mkdir(parents=True, exist_ok=True) + fig.savefig(path, bbox_inches="tight") + plt.close(fig) + return path + + +def _plot_focus_revenue_by_alpha(alpha_mode: pd.DataFrame, out_path: Path) -> Path: + fig, ax = plt.subplots(figsize=(7.8, 4.8), constrained_layout=True) + for mode, color, label in ( + ("baseline", "#4C72B0", "Baseline"), + ("defended", "#C44E52", "Defended"), + ): + sub = alpha_mode[alpha_mode["mode"] == mode].sort_values("alpha") + if sub.empty: + continue + ax.plot( + sub["alpha"], + sub["revenue_mean"], + marker="o", + linewidth=1.9, + markersize=4, + 
color=color, + label=label, + ) + ax.axvline(0.7, color="#666666", linewidth=1.0, linestyle="--") + ax.set_xlabel(r"Contamination $\alpha$") + ax.set_ylabel("Mean episode revenue") + ax.set_title("Final Cohort Revenue Curves") + ax.legend(loc="lower left") + return _save_plot(fig, out_path) + + +def _plot_focus_revenue_delta(alpha_deltas: pd.DataFrame, out_path: Path) -> Path: + fig, ax = plt.subplots(figsize=(7.8, 4.8), constrained_layout=True) + x = alpha_deltas["alpha"].to_numpy(dtype=float) + y = alpha_deltas["revenue_delta_pct"].to_numpy(dtype=float) + ax.plot(x, y, marker="o", linewidth=2.0, markersize=4, color="#C44E52") + ax.fill_between(x, y, 0.0, color="#C44E52", alpha=0.12) + ax.axhline(0.0, color="#444444", linewidth=1.0, linestyle="--") + ax.axvline(0.7, color="#666666", linewidth=1.0, linestyle="--") + high = alpha_deltas[alpha_deltas["alpha"] >= 0.7] + if not high.empty: + best = high.reindex( + high["revenue_delta_pct"].abs().sort_values(ascending=False).index + ).iloc[0] + ax.scatter( + [best["alpha"]], + [best["revenue_delta_pct"]], + color="#1f77b4", + s=45, + zorder=3, + ) + ax.annotate( + f"high-alpha peak {best['revenue_delta_pct']:.2f}%", + (float(best["alpha"]), float(best["revenue_delta_pct"])), + textcoords="offset points", + xytext=(6, 6), + fontsize=8, + ) + ax.set_xlabel(r"Contamination $\alpha$") + ax.set_ylabel("Defended minus baseline revenue (%)") + ax.set_title("Revenue Delta by Contamination (Final Cohort)") + return _save_plot(fig, out_path) + + +def _plot_focus_risk_deltas(alpha_deltas: pd.DataFrame, out_path: Path) -> Path: + fig, ax = plt.subplots(figsize=(7.8, 4.8), constrained_layout=True) + x = alpha_deltas["alpha"].to_numpy(dtype=float) + ax.plot( + x, + alpha_deltas["coi_leakage_delta"].to_numpy(dtype=float), + marker="o", + linewidth=1.8, + markersize=4, + color="#55A868", + label="COI leakage delta", + ) + ax.plot( + x, + alpha_deltas["volatility_delta"].to_numpy(dtype=float), + marker="s", + linewidth=1.8, + 
markersize=3.8, + color="#8172B3", + label="Volatility delta", + ) + ax.axhline(0.0, color="#444444", linewidth=1.0, linestyle="--") + ax.axvline(0.7, color="#666666", linewidth=1.0, linestyle="--") + ax.set_xlabel(r"Contamination $\alpha$") + ax.set_ylabel("Defended minus baseline") + ax.set_title("Leakage and Stability Deltas (Final Cohort)") + ax.legend(loc="lower left") + return _save_plot(fig, out_path) + + +def _write_include(path: Path, figure_rel_path: str, width: str) -> Path: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(f"\\includegraphics[width={width}]{{{figure_rel_path}}}\n") + return path + + +def run(bundle_dir: Path, output_dir: Path, plot_dir: Path) -> list[Path]: + all_runs = _load_runs(bundle_dir) + focus_id = _focus_sweep(all_runs) + focus_runs = all_runs[all_runs["sweep_id"] == focus_id].copy() + alpha_mode = _alpha_mode_summary(focus_runs) + deltas = _alpha_deltas(alpha_mode) + zones = _zone_summary(deltas) + + output_dir.mkdir(parents=True, exist_ok=True) + plot_dir.mkdir(parents=True, exist_ok=True) + + written: list[Path] = [] + alpha_mode_path = output_dir / "final_focus_alpha_mode_summary.csv" + alpha_mode.to_csv(alpha_mode_path, index=False) + written.append(alpha_mode_path) + + delta_path = output_dir / "final_focus_alpha_deltas.csv" + deltas.to_csv(delta_path, index=False) + written.append(delta_path) + + zone_path = output_dir / "final_focus_zone_summary.csv" + zones.to_csv(zone_path, index=False) + written.append(zone_path) + + headline = { + "bundle": str(bundle_dir), + "focus_cohort": "max_alpha_coverage", + "alpha_cells": int(deltas["alpha"].nunique()) if not deltas.empty else 0, + "alpha_min": float(deltas["alpha"].min()) if not deltas.empty else None, + "alpha_max": float(deltas["alpha"].max()) if not deltas.empty else None, + "mean_revenue_delta_pct": float(deltas["revenue_delta_pct"].mean()) + if not deltas.empty + else None, + "mean_reward_delta_pct": float(deltas["reward_delta_pct"].mean()) + if not 
deltas.empty + else None, + "zone_summary": zones.to_dict(orient="records"), + } + headline_path = output_dir / "final_focus_headline_summary.json" + headline_path.write_text(json.dumps(headline, indent=2) + "\n") + written.append(headline_path) + + written.append( + _plot_focus_revenue_by_alpha( + alpha_mode, + plot_dir / "final_focus_revenue_by_alpha.pdf", + ) + ) + written.append( + _plot_focus_revenue_delta( + deltas, + plot_dir / "final_focus_revenue_delta.pdf", + ) + ) + written.append( + _plot_focus_risk_deltas( + deltas, + plot_dir / "final_focus_risk_deltas.pdf", + ) + ) + + include_dir = Path(__file__).resolve().parent / "includes" / "final" + written.append( + _write_include( + include_dir / "final_focus_revenue_by_alpha.tex", + "chapters/figures/results/generated/final/plots/final_focus_revenue_by_alpha.pdf", + "0.98\\linewidth", + ) + ) + written.append( + _write_include( + include_dir / "final_focus_revenue_delta.tex", + "chapters/figures/results/generated/final/plots/final_focus_revenue_delta.pdf", + "0.95\\linewidth", + ) + ) + written.append( + _write_include( + include_dir / "final_focus_risk_deltas.tex", + "chapters/figures/results/generated/final/plots/final_focus_risk_deltas.pdf", + "0.95\\linewidth", + ) + ) + return written + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Generate final paper figures/tables from the final sweep cohort" + ) + parser.add_argument("--bundle-dir", type=Path, default=_default_bundle_dir()) + parser.add_argument("--output-dir", type=Path, default=_default_output_dir()) + parser.add_argument("--plot-dir", type=Path, default=None) + args = parser.parse_args() + + _configure_style() + plot_dir = ( + args.plot_dir + if args.plot_dir is not None + else _default_plot_dir(args.output_dir) + ) + outputs = run( + bundle_dir=args.bundle_dir, output_dir=args.output_dir, plot_dir=plot_dir + ) + for path in outputs: + print(path) + + +if __name__ == "__main__": + main() diff --git 
a/paper/src/chapters/figures/results/process_first_sweep.py b/paper/src/chapters/figures/results/process_first_sweep.py new file mode 100644 index 0000000..0e62525 --- /dev/null +++ b/paper/src/chapters/figures/results/process_first_sweep.py @@ -0,0 +1,272 @@ +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Iterable + +import numpy as np +import pandas as pd + + +def _project_root() -> Path: + return Path(__file__).resolve().parents[5] + + +def _default_input() -> Path: + return _project_root() / "tpu_orchestration" / "results" / "first_sweep.csv" + + +def _default_output_dir() -> Path: + return Path(__file__).resolve().parent / "generated" / "legacy" + + +def _sanitize(key: str) -> str: + return key.replace("/", "_").replace("-", "_") + + +def _coerce_numeric(frame: pd.DataFrame, columns: Iterable[str]) -> None: + for column in columns: + if column in frame.columns: + frame[column] = pd.to_numeric(frame[column], errors="coerce") + + +def _extract_alpha(frame: pd.DataFrame) -> pd.Series: + if "study/alpha" in frame.columns: + return pd.to_numeric(frame["study/alpha"], errors="coerce") + if "alpha" in frame.columns: + return pd.to_numeric(frame["alpha"], errors="coerce") + return pd.Series(np.nan, index=frame.index, dtype=float) + + +def _extract_mode(frame: pd.DataFrame) -> pd.Series: + if "study/mode" in frame.columns: + return frame["study/mode"].astype(str).str.strip().str.lower() + if "study/no_robust" in frame.columns: + no_robust = pd.to_numeric(frame["study/no_robust"], errors="coerce").fillna(0.0) + return pd.Series( + np.where(no_robust > 0.5, "no_robust", "robust"), + index=frame.index, + dtype="object", + ) + if "no_robust" in frame.columns: + no_robust = ( + frame["no_robust"].astype(str).str.lower().isin({"1", "true", "yes"}) + ) + return pd.Series( + np.where(no_robust, "no_robust", "robust"), + index=frame.index, + dtype="object", + ) + return pd.Series("", index=frame.index, 
dtype="object") + + +def _extract_tier(frame: pd.DataFrame) -> pd.Series: + for column in ("tiers", "runtime/backend", "algo", "run.backend", "run.algo"): + if column in frame.columns: + tier = frame[column].astype(str).str.strip().str.lower() + if tier.notna().any(): + return tier + return pd.Series("unknown", index=frame.index, dtype="object") + + +def _prepare_frame(frame: pd.DataFrame, include_non_finished: bool) -> pd.DataFrame: + data = frame.copy() + if not include_non_finished and "State" in data.columns: + data = data[data["State"].astype(str).str.lower() == "finished"].copy() + + data["alpha"] = _extract_alpha(data) + data["mode"] = _extract_mode(data) + data["tier"] = _extract_tier(data) + data = data[data["mode"].isin({"robust", "no_robust"})] + data = data[data["alpha"].notna()] + + _coerce_numeric( + data, + [ + "eval/revenue_mean", + "eval/reward_mean", + "eval/coi_level_mean", + "eval/coi_leakage_mean", + "eval/margin_mean", + "eval/volatility_mean", + "objective/score", + "train/alpha_adv", + "lambda_coi", + "robust_radius", + "learning_rate", + "batch_size", + "n_steps", + "total_timesteps", + ], + ) + return data.sort_values(["tier", "alpha", "mode"]).reset_index(drop=True) + + +def _group_summary( + frame: pd.DataFrame, by: list[str], metrics: list[str] +) -> pd.DataFrame: + agg_spec: dict[str, tuple[str, str]] = {"runs": ("mode", "size")} + for metric in metrics: + safe = _sanitize(metric) + agg_spec[f"{safe}_mean"] = (metric, "mean") + agg_spec[f"{safe}_std"] = (metric, "std") + return frame.groupby(by, as_index=False).agg(**agg_spec).sort_values(by) + + +def _tier_alpha_deltas(summary: pd.DataFrame, metrics: list[str]) -> pd.DataFrame: + rows: list[dict[str, float | str]] = [] + for (tier, alpha), group in summary.groupby(["tier", "alpha"], sort=True): + robust = group[group["mode"] == "robust"] + no_robust = group[group["mode"] == "no_robust"] + if robust.empty or no_robust.empty: + continue + + row: dict[str, float | str] = { + "tier": 
str(tier), + "alpha": float(alpha), + "runs_robust": float(robust["runs"].iloc[0]), + "runs_no_robust": float(no_robust["runs"].iloc[0]), + } + for metric in metrics: + safe = _sanitize(metric) + robust_value = float(robust[f"{safe}_mean"].iloc[0]) + no_robust_value = float(no_robust[f"{safe}_mean"].iloc[0]) + delta = robust_value - no_robust_value + row[f"{safe}_delta"] = delta + row[f"{safe}_delta_pct"] = ( + np.nan if no_robust_value == 0 else 100.0 * delta / no_robust_value + ) + rows.append(row) + + return pd.DataFrame(rows) + + +def _top_runs(frame: pd.DataFrame, n: int) -> pd.DataFrame: + rank_metric = "objective/score" + if rank_metric not in frame.columns or frame[rank_metric].notna().sum() == 0: + rank_metric = "eval/reward_mean" + + keep = [ + "Name", + "tier", + "alpha", + "mode", + rank_metric, + "eval/revenue_mean", + "eval/reward_mean", + "eval/coi_level_mean", + "eval/coi_leakage_mean", + "lambda_coi", + "robust_radius", + "learning_rate", + "batch_size", + "n_steps", + "total_timesteps", + ] + present = [column for column in keep if column in frame.columns] + ranked = frame[present].copy().sort_values(rank_metric, ascending=False) + return ranked.head(max(1, int(n))).reset_index(drop=True) + + +def _headline_json( + frame: pd.DataFrame, tier_mode: pd.DataFrame +) -> dict[str, float | str]: + out: dict[str, float | str] = { + "runs": int(len(frame)), + "tiers": int(frame["tier"].nunique()), + "alphas": int(frame["alpha"].nunique()), + } + + robust_rows = tier_mode[tier_mode["mode"] == "robust"] + no_robust_rows = tier_mode[tier_mode["mode"] == "no_robust"] + if robust_rows.empty or no_robust_rows.empty: + out["status"] = "incomplete_modes" + return out + + robust_mean = robust_rows["eval_revenue_mean_mean"].mean() + no_robust_mean = no_robust_rows["eval_revenue_mean_mean"].mean() + out.update( + { + "status": "ok", + "mean_tier_revenue_robust": float(robust_mean), + "mean_tier_revenue_no_robust": float(no_robust_mean), + "mean_tier_revenue_delta": 
float(robust_mean - no_robust_mean), + "mean_tier_revenue_delta_pct": float( + 100.0 * (robust_mean - no_robust_mean) / no_robust_mean + ) + if no_robust_mean + else np.nan, + } + ) + return out + + +def run( + input_path: Path, output_dir: Path, include_non_finished: bool, top_n: int +) -> list[Path]: + output_dir.mkdir(parents=True, exist_ok=True) + raw = pd.read_csv(input_path) + frame = _prepare_frame(raw, include_non_finished=include_non_finished) + + metrics = [ + metric + for metric in ( + "eval/revenue_mean", + "eval/reward_mean", + "eval/coi_level_mean", + "eval/coi_leakage_mean", + "eval/margin_mean", + "eval/volatility_mean", + "objective/score", + "train/alpha_adv", + ) + if metric in frame.columns + ] + + tier_mode = _group_summary(frame, ["tier", "mode"], metrics) + tier_alpha_mode = _group_summary(frame, ["tier", "alpha", "mode"], metrics) + deltas = _tier_alpha_deltas(tier_alpha_mode, metrics) + top_configs = _top_runs(frame, n=top_n) + headline = _headline_json(frame, tier_mode) + + outputs = { + "first_sweep_tier_mode_summary.csv": tier_mode, + "first_sweep_tier_alpha_mode_summary.csv": tier_alpha_mode, + "first_sweep_tier_alpha_deltas.csv": deltas, + "first_sweep_top_configs.csv": top_configs, + } + written_paths: list[Path] = [] + for filename, table in outputs.items(): + path = output_dir / filename + table.to_csv(path, index=False) + written_paths.append(path) + + headline_path = output_dir / "first_sweep_headline_summary.json" + headline_path.write_text(json.dumps(headline, indent=2)) + written_paths.append(headline_path) + return written_paths + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Process first sweep CSV for paper tables" + ) + parser.add_argument("--input", type=Path, default=_default_input()) + parser.add_argument("--output-dir", type=Path, default=_default_output_dir()) + parser.add_argument("--include-non-finished", action="store_true") + parser.add_argument("--top-n", type=int, default=25) + args = 
parser.parse_args() + + written = run( + input_path=args.input, + output_dir=args.output_dir, + include_non_finished=bool(args.include_non_finished), + top_n=int(args.top_n), + ) + for path in written: + print(path) + + +if __name__ == "__main__": + main() diff --git a/paper/src/chapters/figures/results/process_ppo_benchmark.py b/paper/src/chapters/figures/results/process_ppo_benchmark.py new file mode 100644 index 0000000..85f48b2 --- /dev/null +++ b/paper/src/chapters/figures/results/process_ppo_benchmark.py @@ -0,0 +1,277 @@ +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Iterable + +import numpy as np +import pandas as pd + + +def _project_root() -> Path: + return Path(__file__).resolve().parents[5] + + +def _default_input() -> Path: + return _project_root() / "tpu_orchestration" / "results" / "ppo_benchmark.csv" + + +def _default_output_dir() -> Path: + return Path(__file__).resolve().parent / "generated" / "legacy" + + +def _sanitize(key: str) -> str: + return key.replace("/", "_").replace("-", "_") + + +def _coerce_numeric(frame: pd.DataFrame, columns: Iterable[str]) -> None: + for column in columns: + if column in frame.columns: + frame[column] = pd.to_numeric(frame[column], errors="coerce") + + +def _extract_alpha(frame: pd.DataFrame) -> pd.Series: + if "study/alpha" in frame.columns: + return pd.to_numeric(frame["study/alpha"], errors="coerce") + if "alpha" in frame.columns: + return pd.to_numeric(frame["alpha"], errors="coerce") + return pd.Series(np.nan, index=frame.index, dtype=float) + + +def _extract_mode(frame: pd.DataFrame) -> pd.Series: + if "study/mode" in frame.columns: + return frame["study/mode"].astype(str).str.strip().str.lower() + if "study/no_robust" in frame.columns: + no_robust = pd.to_numeric(frame["study/no_robust"], errors="coerce").fillna(0.0) + return pd.Series( + np.where(no_robust > 0.5, "no_robust", "robust"), + index=frame.index, + dtype="object", + ) + if 
"no_robust" in frame.columns: + no_robust = ( + frame["no_robust"].astype(str).str.lower().isin({"1", "true", "yes"}) + ) + return pd.Series( + np.where(no_robust, "no_robust", "robust"), + index=frame.index, + dtype="object", + ) + return pd.Series("", index=frame.index, dtype="object") + + +def _prepare_frame(frame: pd.DataFrame, include_non_finished: bool) -> pd.DataFrame: + data = frame.copy() + if not include_non_finished and "State" in data.columns: + data = data[data["State"].astype(str).str.lower() == "finished"].copy() + + data["alpha"] = _extract_alpha(data) + data["mode"] = _extract_mode(data) + data = data[data["mode"].isin({"robust", "no_robust"})] + data = data[data["alpha"].notna()] + + numeric_cols = [ + "eval/revenue_mean", + "eval/reward_mean", + "eval/coi_level_mean", + "eval/coi_leakage_mean", + "eval/volatility_mean", + "eval/margin_mean", + "train/alpha_adv", + "train/coi_penalty", + "train/ux_penalty", + "train/agent_prob", + ] + _coerce_numeric(data, numeric_cols) + return data.sort_values(["alpha", "mode"]).reset_index(drop=True) + + +def _summary_by_alpha_mode(frame: pd.DataFrame, metrics: list[str]) -> pd.DataFrame: + agg_spec: dict[str, tuple[str, str]] = {"runs": ("mode", "size")} + for metric in metrics: + safe = _sanitize(metric) + agg_spec[f"{safe}_mean"] = (metric, "mean") + agg_spec[f"{safe}_std"] = (metric, "std") + + return ( + frame.groupby(["alpha", "mode"], as_index=False) + .agg(**agg_spec) + .sort_values(["alpha", "mode"]) + .reset_index(drop=True) + ) + + +def _delta_by_alpha(summary: pd.DataFrame, metrics: list[str]) -> pd.DataFrame: + rows: list[dict[str, float]] = [] + for alpha, alpha_group in summary.groupby("alpha", sort=True): + robust = alpha_group[alpha_group["mode"] == "robust"] + no_robust = alpha_group[alpha_group["mode"] == "no_robust"] + if robust.empty or no_robust.empty: + continue + + row: dict[str, float] = { + "alpha": float(alpha), + "runs_robust": float(robust["runs"].iloc[0]), + "runs_no_robust": 
float(no_robust["runs"].iloc[0]), + } + for metric in metrics: + safe = _sanitize(metric) + robust_value = float(robust[f"{safe}_mean"].iloc[0]) + no_robust_value = float(no_robust[f"{safe}_mean"].iloc[0]) + delta = robust_value - no_robust_value + row[f"{safe}_robust"] = robust_value + row[f"{safe}_no_robust"] = no_robust_value + row[f"{safe}_delta"] = delta + row[f"{safe}_delta_pct"] = ( + np.nan if no_robust_value == 0 else 100.0 * delta / no_robust_value + ) + rows.append(row) + + return pd.DataFrame(rows) + + +def _pairwise_win_rates(frame: pd.DataFrame) -> pd.DataFrame: + rules = { + "eval/revenue_mean": "higher", + "eval/reward_mean": "higher", + "eval/coi_leakage_mean": "lower", + "eval/volatility_mean": "lower", + } + rows: list[dict[str, float]] = [] + for alpha, alpha_group in frame.groupby("alpha", sort=True): + robust = alpha_group[alpha_group["mode"] == "robust"] + no_robust = alpha_group[alpha_group["mode"] == "no_robust"] + if robust.empty or no_robust.empty: + continue + + for metric, direction in rules.items(): + if metric not in frame.columns: + continue + robust_values = robust[metric].dropna().to_numpy(dtype=float) + no_robust_values = no_robust[metric].dropna().to_numpy(dtype=float) + if robust_values.size == 0 or no_robust_values.size == 0: + continue + + if direction == "higher": + wins = (robust_values[:, None] > no_robust_values[None, :]).sum() + else: + wins = (robust_values[:, None] < no_robust_values[None, :]).sum() + ties = (robust_values[:, None] == no_robust_values[None, :]).sum() + total = robust_values.size * no_robust_values.size + win_prob = (wins + 0.5 * ties) / total + rows.append( + { + "alpha": float(alpha), + "metric": metric, + "direction": direction, + "wins": int(wins), + "ties": int(ties), + "total_pairs": int(total), + "win_probability": float(win_prob), + } + ) + return pd.DataFrame(rows) + + +def _overall_mode_summary(frame: pd.DataFrame, metrics: list[str]) -> pd.DataFrame: + agg_spec: dict[str, tuple[str, str]] = 
{"runs": ("mode", "size")} + for metric in metrics: + safe = _sanitize(metric) + agg_spec[f"{safe}_mean"] = (metric, "mean") + agg_spec[f"{safe}_std"] = (metric, "std") + return frame.groupby("mode", as_index=False).agg(**agg_spec).sort_values("mode") + + +def _headline_json(overall: pd.DataFrame) -> dict[str, float | str]: + if {"robust", "no_robust"} - set(overall["mode"].tolist()): + return {"status": "incomplete_modes"} + + robust = overall[overall["mode"] == "robust"].iloc[0] + no_robust = overall[overall["mode"] == "no_robust"].iloc[0] + + revenue_delta = float( + robust["eval_revenue_mean_mean"] - no_robust["eval_revenue_mean_mean"] + ) + leakage_delta = float( + robust["eval_coi_leakage_mean_mean"] - no_robust["eval_coi_leakage_mean_mean"] + ) + return { + "status": "ok", + "revenue_delta": revenue_delta, + "revenue_delta_pct": float( + 100.0 * revenue_delta / no_robust["eval_revenue_mean_mean"] + ), + "coi_leakage_delta": leakage_delta, + "coi_leakage_delta_pct": float( + 100.0 * leakage_delta / no_robust["eval_coi_leakage_mean_mean"] + ), + } + + +def run(input_path: Path, output_dir: Path, include_non_finished: bool) -> list[Path]: + output_dir.mkdir(parents=True, exist_ok=True) + raw = pd.read_csv(input_path) + frame = _prepare_frame(raw, include_non_finished=include_non_finished) + + metrics = [ + metric + for metric in ( + "eval/revenue_mean", + "eval/reward_mean", + "eval/coi_level_mean", + "eval/coi_leakage_mean", + "eval/volatility_mean", + "eval/margin_mean", + "train/alpha_adv", + "train/coi_penalty", + "train/ux_penalty", + "train/agent_prob", + ) + if metric in frame.columns + ] + + alpha_mode = _summary_by_alpha_mode(frame, metrics) + deltas = _delta_by_alpha(alpha_mode, metrics) + win_rates = _pairwise_win_rates(frame) + overall = _overall_mode_summary(frame, metrics) + headline = _headline_json(overall) + + outputs = { + "ppo_alpha_mode_summary.csv": alpha_mode, + "ppo_alpha_deltas.csv": deltas, + "ppo_pairwise_win_rates.csv": win_rates, + 
"ppo_overall_mode_summary.csv": overall, + } + written_paths: list[Path] = [] + for filename, table in outputs.items(): + path = output_dir / filename + table.to_csv(path, index=False) + written_paths.append(path) + + headline_path = output_dir / "ppo_headline_summary.json" + headline_path.write_text(json.dumps(headline, indent=2)) + written_paths.append(headline_path) + return written_paths + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Process PPO benchmark CSV for paper tables" + ) + parser.add_argument("--input", type=Path, default=_default_input()) + parser.add_argument("--output-dir", type=Path, default=_default_output_dir()) + parser.add_argument("--include-non-finished", action="store_true") + args = parser.parse_args() + + written = run( + input_path=args.input, + output_dir=args.output_dir, + include_non_finished=bool(args.include_non_finished), + ) + for path in written: + print(path) + + +if __name__ == "__main__": + main() diff --git a/paper/src/chapters/figures/process_supra.py b/paper/src/chapters/figures/supra/process_supra.py similarity index 100% rename from paper/src/chapters/figures/process_supra.py rename to paper/src/chapters/figures/supra/process_supra.py diff --git a/paper/src/chapters/figures/supra.csv b/paper/src/chapters/figures/supra/supra.csv similarity index 100% rename from paper/src/chapters/figures/supra.csv rename to paper/src/chapters/figures/supra/supra.csv diff --git a/paper/src/chapters/figures/supra.tex b/paper/src/chapters/figures/supra/supra.tex similarity index 95% rename from paper/src/chapters/figures/supra.tex rename to paper/src/chapters/figures/supra/supra.tex index 290a2a1..9e815b7 100644 --- a/paper/src/chapters/figures/supra.tex +++ b/paper/src/chapters/figures/supra/supra.tex @@ -21,7 +21,7 @@ surf, shader=flat, mesh/check=false % Disable check to rely on empty lines - ] table [col sep=comma, x=step, y=price, z=density] {chapters/figures/supra_data.csv}; + ] table [col sep=comma, 
x=step, y=price, z=density] {chapters/figures/supra/supra_data.csv}; \end{axis} \end{tikzpicture} diff --git a/paper/src/chapters/figures/supra_data.csv b/paper/src/chapters/figures/supra/supra_data.csv similarity index 99% rename from paper/src/chapters/figures/supra_data.csv rename to paper/src/chapters/figures/supra/supra_data.csv index f005217..6216cac 100644 --- a/paper/src/chapters/figures/supra_data.csv +++ b/paper/src/chapters/figures/supra/supra_data.csv @@ -4038,4 +4038,3 @@ step,price,density 4000,146.51098761558535,0.0 4000,147.9065925693512,0.0 4000,149.30219752311706,10.0 - diff --git a/paper/src/chapters/hero_architecture_figure.tex b/paper/src/chapters/hero_architecture_figure.tex new file mode 100644 index 0000000..a706781 --- /dev/null +++ b/paper/src/chapters/hero_architecture_figure.tex @@ -0,0 +1,166 @@ +\definecolor{heroBlue}{RGB}{212, 228, 255} +\definecolor{heroBlueBorder}{RGB}{64, 103, 178} +\definecolor{heroGreen}{RGB}{214, 238, 216} +\definecolor{heroGreenBorder}{RGB}{48, 133, 66} +\definecolor{heroAmber}{RGB}{246, 230, 202} +\definecolor{heroAmberBorder}{RGB}{166, 121, 51} +\definecolor{heroGray}{RGB}{236, 236, 236} +\definecolor{heroGrayBorder}{RGB}{120, 120, 120} + +% Panels occupy y = 2.2 .. 10.0 +% Cross-panel connector gutter lives at y = 1.0 .. 
2.2 (clearly below all nodes) +\begin{tikzpicture}[ + >=Stealth, + font=\small, + panel/.style={draw=black!65, dashed, rounded corners=4pt, line width=0.85pt}, + bB/.style={rectangle, rounded corners=3pt, draw=heroBlueBorder, fill=heroBlue, + line width=0.9pt, align=center, minimum height=0.85cm}, + bG/.style={rectangle, rounded corners=3pt, draw=heroGreenBorder, fill=heroGreen, + line width=0.9pt, align=center, minimum height=0.85cm}, + bA/.style={rectangle, rounded corners=3pt, draw=heroAmberBorder, fill=heroAmber, + line width=0.9pt, align=center, minimum height=0.85cm}, + bY/.style={rectangle, rounded corners=3pt, draw=heroGrayBorder, fill=heroGray, + line width=0.9pt, align=center, minimum height=0.82cm}, + pill/.style={ellipse, draw=black!50, fill=black!4, line width=0.75pt, + align=center, minimum width=1.6cm, minimum height=0.68cm}, + arr/.style={->, draw=black!80, line width=0.88pt}, + bidir/.style={<->, draw=black!80, line width=0.88pt}, + darr/.style={->, draw=black!60, line width=0.80pt, densely dashed}, + crossA/.style={->, draw=heroAmberBorder!90!black, line width=1.15pt, dash pattern=on 3.5pt off 2pt}, + crossG/.style={->, draw=heroGreenBorder!90!black, line width=1.15pt, dash pattern=on 3.5pt off 2pt}, + arrG/.style={->, draw=heroGreenBorder!90!black, line width=1.15pt}, + lbl/.style={font=\scriptsize, align=center, fill=white, inner sep=1.5pt, text=black} +] + +%% ============================================================ +%% Panel A x: 0.2–11.2 y: 2.2–10.0 +%% ============================================================ +\draw[panel] (0.2,2.2) rectangle (11.2,10.0); +\node[anchor=west, font=\small\bfseries] at (0.45,9.72) {(a) Online platform and data plane}; + +\node[pill] (human) at (1.3, 8.55) {Human}; +\node[pill] (agent) at (1.3, 7.45) {Agent}; + +\node[bB, minimum width=2.75cm] (web) at (4.2, 8.0) {Next.js\\Web App}; +\node[bB, minimum width=2.75cm] (provider) at (7.35, 8.0) {Pricing\\Provider}; +\node[bY, minimum width=1.85cm] (redis) at 
(9.85, 8.0) {Redis}; + +\node[bG, minimum width=3.1cm] (kBehav) at (4.0, 6.2) {Kafka stream\\Behavior events}; +\node[bG, minimum width=3.0cm] (kQuotes) at (7.5, 6.2) {Kafka stream\\Price quotes}; + +\node[bA, minimum width=3.1cm] (worker) at (4.0, 4.4) {Worker / ETL\\Feature jobs}; +\node[bA, minimum width=2.65cm] (registry) at (8.45, 4.4) {Model\\Registry}; + +% service row +\draw[arr] (human.east) -- (web.west); +\draw[arr] (agent.east) -- (web.west); +\draw[arr] (web.east) -- (provider.west); +\draw[bidir] (provider.east) -- (redis.west); + +% web/provider -> kafka +\draw[arr] (web.south) -- (kBehav.north) + node[midway, left, lbl] {$e=(a,i,t,\mu,\delta)$}; +\draw[arr] (provider.south) -- (kQuotes.north) + node[midway, right, lbl] {$(i,p,\mathrm{sid},\phi,t)$}; + +% kafka -> worker (straight south) +\draw[arr] (kBehav.south) -- (worker.north); +\draw[arr] (kQuotes.south) -- (worker.north); + +% worker -> registry +\draw[arr] (worker.east) -- (registry.west); + +% model refresh: registry east -> goes right to x=11.0, north to y=9.2, left to provider +% this keeps it entirely inside panel A with no crossing of nodes +\draw[crossA, rounded corners=6pt] + (registry.east) -- (11.0, 4.4) + -- (11.0, 9.2) + -- node[midway, lbl] {model refresh} (provider.north |- 0, 9.2) + -- (provider.north); + +%% ============================================================ +%% Panel B x: 11.6–20.4 y: 2.2–10.0 +%% ============================================================ +\draw[panel] (11.6,2.2) rectangle (19.8,10.0); +\node[anchor=west, font=\small\bfseries] at (11.85,9.72) {(b) Distinguishability layer}; + +\node[bG, minimum width=2.4cm] (session) at (14.0, 8.9) {Session prefix\\$\tau'$}; +\node[bB, minimum width=2.4cm] (empKern) at (13.65,7.45) {Empirical kernel\\$\hat T'$}; +\node[bY, minimum width=2.4cm] (weakLab) at (17.55,8.9) {Weak labels\\$\mathcal{D}_H,\mathcal{D}_A$}; +\node[bY, minimum width=2.2cm] (protoH) at (12.8, 5.9) {Prototype\\$\bar T_H$}; +\node[bA, minimum 
width=2.4cm] (kldist) at (15.55,5.9) {KL distances\\$\Delta_H,\Delta_A$}; +\node[bY, minimum width=2.2cm] (protoA) at (18.3, 5.9) {Prototype\\$\bar T_A$}; +\node[bB, minimum width=2.9cm] (calHead) at (13.55,4.25) {Contrastive\\calibration head}; +\node[bG, minimum width=2.55cm] (score) at (17.75,4.25) {Session score\\$f(\tau'),\hat\alpha(\tau')$}; + +\node[lbl] at (15.55, 3.15) {$\hat\alpha(\tau')=\sigma\!\left(\beta(\Delta_H-\Delta_A)\right)$}; + +\draw[arr, rounded corners=4pt] (session.south) -- (empKern.north); +\draw[arr, rounded corners=4pt] (empKern.south) -- (13.65, 6.8) -| (protoH.north); +\draw[arr, rounded corners=4pt] (weakLab.south) -- (17.55, 6.8) -| (protoA.north); +% weak labels -> protoH: go south then hard-left below weakLab +\draw[arr, rounded corners=4pt] (weakLab.south) -- (17.55,6.8) -| (protoH.north east); +\draw[arr] (protoH.east) -- (kldist.west); +\draw[arr] (protoA.west) -- (kldist.east); +\draw[arr] (kldist.south) -- (calHead.north east); +\draw[arr] (calHead.east) -- (score.west); + +%% ============================================================ +%% Panel C x: 20.8–31.0 y: 2.2–10.0 +%% ============================================================ +\draw[panel] (20.8,2.2) rectangle (31.0,10.0); +\node[anchor=west, font=\small\bfseries] at (21.05,9.72) {(c) Distributionally robust control}; + +\node[bB, minimum width=3.1cm] (state) at (23.15, 8.9) + {State summary\\$[p_{t-1},\hat q_{t-1},f(\tau')]$}; +\node[bY, minimum width=2.9cm] (ambSet) at (23.15, 7.45) {Ambiguity set\\$\mathcal U_\epsilon(\hat P_N)$}; +\node[bG, minimum width=2.9cm] (innerMin) at (28.55, 7.45) {Inner minimisation\\$\min_{Q\in\mathcal U_\epsilon}$}; +\node[bY, minimum width=8.2cm] (contScen) at (25.9, 5.9) + {Contamination scenarios $\;\alpha_k\in\mathcal A_{\epsilon_\alpha}(\alpha_0)$}; +\node[bA, minimum width=8.8cm] (reward) at (25.9, 4.45) + {$r_t = R(p_t,\hat q_t) - \lambda\,\mathrm{COI}_{\mathrm{leak}}(p_t,\tau_t') - \eta\,UX_t$}; +\node[bB, minimum 
width=2.85cm] (policy) at (22.75, 3.05) {Robust policy $\pi^*$}; +\node[bG, minimum width=2.85cm] (publish) at (29.05, 3.05) {Publish price\\vector $p_t$}; + +\node[lbl] at (25.9, 2.55) {$\pi^*=\arg\max_\pi\min_{Q\in\mathcal U_\epsilon}\mathbb{E}[r_t]$}; + +\draw[arr] (state.south) -- (ambSet.north); +\draw[arr] (ambSet.east) -- (innerMin.west); +\draw[arr, rounded corners=4pt] (ambSet.south) -- (23.15, 6.6) -| ([xshift=-2cm]contScen.north); +\draw[arr, rounded corners=4pt] (innerMin.south) -- (28.55, 6.6) -| ([xshift=2cm]contScen.north); +\draw[arr] (contScen.south) -- (reward.north); +\draw[arr, rounded corners=6pt] (reward.south) -- (25.9, 3.7) -| (policy.north); +\draw[arr] (policy.east) -- (publish.west); +% market response: up the right edge of panel C, entirely inside, rounded +\draw[arrG, rounded corners=6pt] (publish.east) -- (30.6, 3.05) + -- (30.6, 9.8) + -- node[midway, lbl] {market response} (state.north |- 0, 9.8) + -- (state.north); + +%% ============================================================ +%% Cross-panel connectors – gutter at y = 1.0..2.2 +%% Three separate depths: 1.85, 1.45, 1.05 (no overlaps) +%% ============================================================ + +% 1. Worker -> Session (depth y=1.85, shallowest) +\draw[crossA, rounded corners=6pt] + (worker.south) -- (worker.south |- 0, 1.85) + -- node[pos=0.5, lbl] {offline extraction} (11.4, 1.85) + -- (11.4, 8.9) + -- (session.west); + +% 2. Score -> State (depth y=1.45) +\draw[crossG, rounded corners=6pt] + (score.south) -- (score.south |- 0, 1.45) + -- node[pos=0.5, lbl] {contamination signal} (20.6, 1.45) + -- (20.6, 8.9) + -- (state.west); + +% 3. 
Publish -> Provider (depth y=1.05, deepest) +\draw[crossG, rounded corners=3pt] + (publish.south) -- (publish.south |- 0, 1.05) + -- node[pos=0.4, lbl] {serve online} (5.8, 1.05) + -- (5.8, 7.7) + -- ([yshift=-0.3cm]provider.west); + +\end{tikzpicture} diff --git a/paper/src/chapters/slacberger.tex b/paper/src/chapters/slacberger.tex index 7728c91..1b0f153 100644 --- a/paper/src/chapters/slacberger.tex +++ b/paper/src/chapters/slacberger.tex @@ -62,7 +62,7 @@ We propose a robust optimization objective. The platform seeks a pricing policy Here: \begin{itemize} \item The first term, $p_t \cdot \hat{q}_t(p_t | \theta=H)$, represents the revenue generated strictly from the estimated human segment. - \item $\mathcal{L}_{detect}$ is a penalty term for failing to separate distributions (the cost of confusion). + \item $\mathcal{L}_{detect}$ is a penalty term for failing to distinguish distributions (the cost of confusion). \item $\lambda$ is a hyperparameter balancing revenue exploitation vs. robust detection. \end{itemize} diff --git a/paper/src/graphics/banner.png b/paper/src/graphics/banner.png index 992202e..31351b0 100644 Binary files a/paper/src/graphics/banner.png and b/paper/src/graphics/banner.png differ diff --git a/paper/src/main-genpop.tex b/paper/src/main-genpop.tex index adf81a9..e54f1de 100644 --- a/paper/src/main-genpop.tex +++ b/paper/src/main-genpop.tex @@ -57,7 +57,7 @@ These behavioral signals serve as inputs for a Distributionally Robust Reinforce \item[Trajectory] Defined as a series of unspecified length, collecting data on states of some object over time. \item[Cost of Information (COI)] The average premium extracted above marginal cost due to information asymmetry. \item[Contamination Ratio] The proportion of agent sessions versus human sessions in the system. -\item[Separability] The ability to distinguish between human and agent behavioral patterns. 
+\item[Distinguishability] The ability to distinguish between human and agent behavioral patterns. \end{description} \section{Aggregate Compute Budget Derivation} diff --git a/paper/src/main.tex b/paper/src/main.tex index 7a17506..f31edd9 100644 --- a/paper/src/main.tex +++ b/paper/src/main.tex @@ -29,6 +29,9 @@ These behavioral signals serve as inputs for a Distributionally Robust Reinforce \vspace{1em} \noindent\textbf{Acknowledgments:} This research was supported by the TPU Research Cloud program, which provided access to Google Cloud TPU accelerators (including TPU v4, v5e, and v6e). +\vspace{0.5em} +\noindent\textbf{Project page:} \url{https://velocitatem.github.io/PHANTOM/} + \clearpage \input{chapters/01-intro} \input{chapters/02-literature-review} @@ -43,15 +46,44 @@ These behavioral signals serve as inputs for a Distributionally Robust Reinforce \appendix \section{Terminology} \begin{description} -\item[Agent $A$] An actor of non-human nature, powered by an LLM. -\item[Human $H$] An individual human with some job to be done. -\item[Actor $\theta$] Defines a type of class which is either Agent or Human and has the capability to carry out actions on a web platform. -\item[Platform] Any web-based platform which serves an interface to a collection of items that can be purchased, each at some price $p_i$. -\item[Behavioral Model] A mathematical model predicting what action comes after a series of prior actions. -\item[LLM] Large Language Model served by some provider with the abstracted capability of tool calling. -\item[TPU] Tensor Processing Unit which is a unique kind of chip architecture developed by Google. -\item[Trajectory] Defined as a series of unspecified length, collecting data on states of some object over time. -% TODO: maybe define other things in a similar succient manner +\item[Agent $A$] A non-human actor, typically an LLM-driven system that executes web actions toward a goal. 
+\item[Human $H$] A human participant interacting with the platform to complete a task. +\item[Actor Type $\theta$] A latent class parameter describing whether a session is generated by a human or an agent profile. +\item[Platform] A web interface exposing purchasable items and their offered prices. +\item[Session $s$] A bounded interaction record tied to one actor and one session identifier. +\item[Event $e_{s,k}$] A single interaction tuple in a session, including action, item target, and timestamp. +\item[Trajectory $\tau_s$] The ordered sequence of events generated within a session. +\item[Demand Proxy $\hat{q}_{t,i}$] A weighted aggregate of observed actions used as an operational substitute for latent demand. +\item[Action Weight Function $\omega(a)$] A mapping from action type to signal strength in the demand proxy. +\item[True Demand $d(p;\theta)$] The latent purchase response as a function of price and actor type. +\item[Contamination $\alpha$] The proportion of agent-generated traffic in the session mixture. +\item[Non-stationary Noise $\epsilon_t$] Time-varying residual variation not explained by the actor mixture. +\item[Pricing Policy $\pi(\tau)$] A function mapping observed interaction history to an offered price. +\item[Cost of Information (COI)] The expected premium above the minimum viable price induced by the pricing policy. +\item[COI Leakage] A per-quote penalty term modeling information revealed to reconnaissance behavior. +\item[First-Order Statistic $p_{(1)}$] The minimum observed price among multiple independent queries. +\item[Transition Kernel $\mathcal{T}$] A Markov transition matrix over behavioral states or actions. +\item[Distinguishability] The degree to which human and agent sessions can be distinguished from behavior alone. +\item[KL Divergence $D_{KL}$] A relative-entropy measure used to compare session transition structure against class prototypes. 
+\item[Divergence Scores $\Delta_H,\Delta_A$] Session-level distances to human and agent transition centroids. +\item[Weak Agent Probability $f(\tau)$] A session-level score estimating the likelihood that a trajectory is agent-generated. +\item[Contamination Generator $\mathcal{G}(\alpha)$] A simulator component that injects synthetic agent trajectories to reach a target mixture level. +\item[Stackelberg Game] A leader-follower formulation where the platform sets prices and demand responds. +\item[Ambiguity Set $\mathcal{U}_{\epsilon}$] A set of plausible demand distributions considered under distributional uncertainty. +\item[Wasserstein Ball] A distance-bounded neighborhood around an empirical distribution used in robust optimization. +\item[DR-RL] Distributionally Robust Reinforcement Learning for policies trained against worst-case distributional shifts. +\item[Nominal Contamination $\alpha_0$] The baseline contamination level around which robust candidates are evaluated. +\item[Robustness Radius $\epsilon_\alpha$] The local interval width used for inner minimization over contamination scenarios. +\item[Query-Tax Surrogate] A constant leakage proxy assigning fixed penalty to suspected reconnaissance queries. +\item[Revelation Surrogate] A leakage proxy based on $-\log\pi(p\mid\tau)$ to penalize highly informative quotes. +\item[Limbo Stack] The alternating game-history buffer that stores leader price moves and follower demand responses. +\item[UX Index] A bounded user-experience metric tracked to evaluate policy side effects on legitimate users. +\item[Look-to-Book Ratio] The ratio of search-like interactions to completed purchases, used as an operational contamination indicator. +\item[Hybrid Kappa-Lambda Architecture] A data design combining streaming ingestion with offline and batch learning loops. +\item[MDP / POMDP] Sequential decision models with full observability (MDP) or partial observability (POMDP). 
+\item[Behavioral Model] A model predicting what action is likely to follow from prior actions. +\item[LLM] Large Language Model served through an inference provider with tool-use capability. +\item[TPU] Tensor Processing Unit, a specialized accelerator architecture developed by Google. \end{description} \section{Aggregate Compute Budget Derivation} @@ -78,6 +110,30 @@ v4 & 64 & 275 & $64 \times 275 = 17{,}600$ \\ Converting to petaFLOPS: $160{,}320\;\text{TFLOPS} = 160.32\;\text{PFLOPS} \approx 160\;\text{PFLOPS}$. This is the theoretical peak under sustained BF16 arithmetic; realized throughput depends on memory bandwidth utilization and inter-chip communication overhead, but the figure serves as a useful upper bound for provisioning decisions. +\section{Slope-Test Verification: Revenue vs. Contamination} +\label{app:alpha_revenue_slope} + +This appendix provides a compact verification of the slope result reported in the main results section. Using the same run-level pairs $x_i=\texttt{study/alpha}_i$ and $y_i=\texttt{eval/revenue\_mean}_i$ ($n=95$), we re-checked the ordinary least squares slope test in Python with standard test routines (SciPy two-sided $t$ test for the slope). + +\[ +\widehat{y}=326{,}878.57-60{,}631.95\,x, +\] +\[ +t(93)=-8.2148,\qquad p=1.2038\times 10^{-12},\qquad R^2=0.4205,\qquad 95\%\,\text{CI}_{\beta_1}=[-75{,}288.76,\,-45{,}975.13]. +\] + +The Python verification reproduces the reported coefficients and inference values, confirming that the slope-test results are correct under standard methods. + +\section{whoclickedit Dataset Card} +\label{app:whoclicked_card} + +For transparency and reproducibility, this appendix includes the full dataset card used for the public release of the \texttt{whoclickedit} dataset. 
+ +\lstinputlisting[ + caption={whoclickedit dataset card (README snapshot)}, + label={lst:whoclicked_dataset_card} +]{chapters/auto/whoclicked_dataset_card.md} + % \input{../build/concatenated_code} \end{document} diff --git a/paper/src/mirrors/cais2026/main.tex b/paper/src/mirrors/cais2026/main.tex index cdff6f3..3fe8db7 100644 --- a/paper/src/mirrors/cais2026/main.tex +++ b/paper/src/mirrors/cais2026/main.tex @@ -41,7 +41,7 @@ \begin{abstract} Dynamic pricing pipelines in e-commerce consume behavioral demand signals to set prices, but the growing presence of LLM-powered agents introduces a novel contamination vector: these agents decouple information gathering from transaction execution across isolated sessions, eroding the platform's pricing power. -We present PHANTOM, a modular compound system that addresses this threat end-to-end. The system is composed of five orchestrated components: (1)~a configurable e-commerce research platform with dual-stream Kafka ingestion for behavioral and price-exposure events, (2)~a GOFAI-based weak labeling stage that partitions sessions into human and agent classes using rule-based predicates, (3)~a transition-kernel estimator that learns separable Markov models for each actor type and constructs a Contamination Generator for controlled simulation, (4)~a Distributionally Robust Reinforcement Learning policy that optimizes pricing under a Wasserstein ambiguity set conditioned on per-session divergence signals, and (5)~an Airflow-orchestrated pipeline that connects online data collection to offline policy training via Redis-backed model serving. +We present PHANTOM, a modular compound system that addresses this threat end-to-end. 
The system is composed of five orchestrated components: (1)~a configurable e-commerce research platform with dual-stream Kafka ingestion for behavioral and price-exposure events, (2)~a GOFAI-based weak labeling stage that partitions sessions into human and agent classes using rule-based predicates, (3)~a transition-kernel estimator that learns distinguishable Markov models for each actor type and constructs a Contamination Generator for controlled simulation, (4)~a Distributionally Robust Reinforcement Learning policy that optimizes pricing under a Wasserstein ambiguity set conditioned on per-session divergence signals, and (5)~an Airflow-orchestrated pipeline that connects online data collection to offline policy training via Redis-backed model serving. We formally derive the Cost of Information Theorem, proving that standard pricing mechanisms become incentive-incompatible as agent query volume grows. The system architecture, interaction schema, and factorial experiment harness are designed for reproducibility and are released as open artifacts. We evaluate system-level tradeoffs between revenue protection, information leakage, and user-experience degradation through a three-objective reward structure. \end{abstract} @@ -58,7 +58,7 @@ The current innovation boom in generative artificial intelligence and its applic The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on. Dynamic pricing algorithms rely on directly translating demand features $q$ to new price assignments $\hat{p}$ across a catalogue of products of size $N$. When agent-driven reconnaissance traffic contaminates these demand signals, the pricing pipeline produces biased estimates that erode margins. 
This is not a single-model failure but a \textit{compound system} failure: the data ingestion, demand estimation, policy optimization, and model serving stages each propagate and amplify the contamination. -Existing work treats bot detection and dynamic pricing as separate concerns. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for separating non-human reconnaissance from genuine human demand expression and integrating that separation into pricing heuristics without degrading legitimate user experience. This gap is what our contribution aims to address. +Existing work treats bot detection and dynamic pricing as separate concerns. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for distinguishing non-human reconnaissance from genuine human demand expression and integrating that distinguishability into pricing heuristics without degrading legitimate user experience. This gap is what our contribution aims to address. \subsection{System-Level Contributions} @@ -78,7 +78,7 @@ We frame our contribution along the four CAIS pillars---architectural patterns, This work addresses three core research questions: \begin{enumerate} - \item[\textbf{RQ1}] \textit{Separability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting? + \item[\textbf{RQ1}] \textit{Distinguishability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting? \item[\textbf{RQ2}] \textit{Theoretical Impact}: What is the formal relationship between agent contamination levels and the erosion of pricing power in dynamic pricing systems? 
\item[\textbf{RQ3}] \textit{Robust Mitigation}: How can pricing policies be constructed to maintain margin integrity under unknown and non-stationary levels of agent contamination? \end{enumerate} @@ -115,7 +115,7 @@ Each price query generates a record $(i, p, \text{sid}, \phi, t)$ associating th \subsection{Offline Loop: Policy Training} -The Kafka cluster is subscribed to by our pipeline which is configured on a schedule in Airflow, with the possibility of manual trigger. The offline loop consumes collected trajectories, performs weak labeling and transition-kernel estimation (Section~\ref{sec:separability}), trains the DR-RL policy (Section~\ref{sec:drrl}) in a simulator, and pushes the resulting policy to Redis for the pricing provider to read. +The Kafka cluster is subscribed to by our pipeline which is configured on a schedule in Airflow, with the possibility of manual trigger. The offline loop consumes collected trajectories, performs weak labeling and transition-kernel estimation (Section~\ref{sec:distinguishability}), trains the DR-RL policy (Section~\ref{sec:drrl}) in a simulator, and pushes the resulting policy to Redis for the pricing provider to read. \subsection{Online Dynamic Pricing (Baseline)} @@ -165,7 +165,7 @@ The metadata record $\mu$ varies by action type. This heterogeneous structure is %% ==================================================================== \section{Methodology: Pipeline Components} -This section details the theoretical and practical framework behind each pipeline component. We formalize the problem environment, derive the \textit{Cost of Information} (COI) theorem that motivates the system design, describe the separability and contamination modules, and formulate the robust pricing policy. +This section details the theoretical and practical framework behind each pipeline component. 
We formalize the problem environment, derive the \textit{Cost of Information} (COI) theorem that motivates the system design, describe the distinguishability and contamination modules, and formulate the robust pricing policy. \subsection{Problem Formalization} @@ -225,15 +225,15 @@ Since the integrand vanishes as $N \to \infty$ for all $t > \underline{p}$, the This result is the theoretical motivation for the system design: it proves that standard pricing policies $\pi$ fail to extract surplus in the presence of large-scale agentic search, necessitating a contamination-aware component in the pipeline. -\subsection{Module: Separability and Contamination Generation} -\label{sec:separability} +\subsection{Module: Distinguishability and Contamination Generation} +\label{sec:distinguishability} To train a robust pricing learner, we need a simulator that can generate realistic interaction data under controlled contamination. We build this from collected data using a two-stage approach. \subsubsection{GOFAI-Based Weak Labeling.} -We use Good Old-Fashioned AI (GOFAI) heuristics to generate weak labels for separability. A set of rule-based predicates $\phi_j: \tau \to \{0,1\}$ partitions dataset $\mathcal{D}$ into high-confidence sets $\mathcal{D}_H$ and $\mathcal{D}_A$. We then estimate separate transition models for both groups and ask a direct methodological question: are the kernels separable enough to justify downstream pricing control that depends on that separability? +We use Good Old-Fashioned AI (GOFAI) heuristics to generate weak labels for distinguishability. A set of rule-based predicates $\phi_j: \tau \to \{0,1\}$ partitions dataset $\mathcal{D}$ into high-confidence sets $\mathcal{D}_H$ and $\mathcal{D}_A$. We then estimate separate transition models for both groups and ask a direct methodological question: are the kernels distinguishable enough to justify downstream pricing control that depends on that distinguishability? 
-To answer this, we compute average KL divergence between transition probability matrices. This statistic gives global separability and event-level diagnostics at the same time. In our balanced dataset (50\% human, 50\% agent), the average divergence is approximately $1.8$. +To answer this, we compute average KL divergence between transition probability matrices. This statistic gives global distinguishability and event-level diagnostics at the same time. In our recorded dataset (13 human sessions, 16 agent sessions; 45\%/55\%), the average divergence is approximately $1.8$. \begin{definition}[KL Divergence for Transition Distributions] Let $P_e$ and $Q_e$ be categorical distributions over destination states following event $e$, derived from human and agent trajectories respectively. The KL divergence between these distributions is: @@ -243,7 +243,7 @@ Let $P_e$ and $Q_e$ be categorical distributions over destination states followi where $\mathcal{S}_e$ denotes the set of destination events that follow $e$ in the human trajectories. \end{definition} -With these divergence features we train a contrastive model to estimate a weak agent probability $f(\tau)\in[0,1]$, which serves as the interface between the separability module and the downstream pricing policy. +With these divergence features we train a contrastive model to estimate a weak agent probability $f(\tau)\in[0,1]$, which serves as the interface between the distinguishability module and the downstream pricing policy. \subsubsection{Transition-Kernel Estimation and Contamination Generator.} \label{sec:tpe} @@ -282,12 +282,12 @@ Given a newly observed partial trajectory $\tau'$, we compute its empirical tran \Delta_A(\tau') &= D_{KL}(\hat{\mathcal{T}}^\prime \parallel \bar{\mathcal{T}}_A) \end{align} -These divergence statistics serve as the operational connector between the separability module and the pricing policy. 
We define the per-session contamination estimate as: +These divergence statistics serve as the operational connector between the distinguishability module and the pricing policy. We define the per-session contamination estimate as: \begin{equation} \label{eq:alpha_hat} \hat{\alpha}(\tau') = \sigma\big(\beta(\Delta_H(\tau') - \Delta_A(\tau'))\big) \end{equation} -where $\sigma$ is the logistic function and $\beta > 0$ is a temperature parameter. This maps separability directly into a scalar control input for the pricing objective. +where $\sigma$ is the logistic function and $\beta > 0$ is a temperature parameter. This maps distinguishability directly into a scalar control input for the pricing objective. \subsubsection{Ambiguity Set Construction.} Because the contamination level $\alpha$ and demand shift are non-stationary, a point estimate of the demand distribution is insufficient. Let $\hat{P}_N$ denote the empirical reference distribution induced by the Contamination Generator $\mathcal{G}(\alpha)$. We define the Wasserstein ambiguity set: @@ -344,7 +344,7 @@ The simulator has multiple configurable factors, including valuation distributio Our training budget spans 384 TPU chips across v4, v5e, and v6e generations, distributed across Europe and U.S. regions with a spot-heavy mix and an on-demand reserve. At peak BF16 throughput this corresponds to roughly 160 PFLOPS of aggregate compute. We allocate v6e capacity to the heaviest policy training, use v5e for broad hyperparameter sweeps, and reserve on-demand v4 quota for runs that should not be preempted \parencite{noauthor_tpu_2026,noauthor_tpu_2025-1,noauthor_tpu_2025}. -Our process follows three stages: (1)~observe and \textit{vectorize} behavioral interactions, (2)~learn separability to characterize human versus agent patterns, and (3)~use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator. 
+Our process follows three stages: (1)~observe and \textit{vectorize} behavioral interactions, (2)~learn distinguishability to characterize human versus agent patterns, and (3)~use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator. Operationally, goals and experiment runs are tracked in PostgreSQL (goal table, run table, and assignment mapping). This data-acquisition phase is intentionally a disconnected component that feeds the later contributions. @@ -375,7 +375,7 @@ Initialize contamination estimate $\hat\alpha \leftarrow 0.2$\; $\mathcal S_t \leftarrow \mathcal S_t \cup \{\tau_m\}$\; } - \tcp{Estimate contamination from separability module} + \tcp{Estimate contamination from distinguishability module} compute $\hat\alpha \leftarrow \frac{1}{M}\sum_{\tau\in\mathcal S_t} \Big[\sigma\big(\beta(\Delta_H(\tau)-\Delta_A(\tau))\big)\Big]$\; compute $J_t \leftarrow \text{Revenue}(p_t,\hat Q_t) - \lambda\cdot \text{COILeak}(\hat\alpha) - \eta\cdot \text{UX}(\hat\alpha)$\; @@ -430,7 +430,7 @@ We formally defined the Cost of Information and proved that as the saturation of The system architecture, interaction schema, configurable e-commerce testbed, and factorial experiment harness are designed for reproducibility and released as open artifacts. This is a very generic end-to-end mechanism which is applicable to a variety of different e-commerce tasks. We intentionally put emphasis on the development of this infrastructure to establish a reproducible framework for interaction and to minimize any noise. -Future work includes full factorial evaluation of the DR-RL policy across contamination levels, online adaptation of the ambiguity radius $\epsilon$ as a function of live divergence estimates, extension to multi-agent market maker settings, and integration of the HAP protocol~\cite{dhir_http_2025} as an additional signal source for the separability module. 
+Future work includes full factorial evaluation of the DR-RL policy across contamination levels, online adaptation of the ambiguity radius $\epsilon$ as a function of live divergence estimates, extension to multi-agent market maker settings, and integration of the HAP protocol~\cite{dhir_http_2025} as an additional signal source for the distinguishability module. %% ==================================================================== diff --git a/paper/src/mirrors/genpop/01-intro.tex b/paper/src/mirrors/genpop/01-intro.tex index 5222081..3528c91 100644 --- a/paper/src/mirrors/genpop/01-intro.tex +++ b/paper/src/mirrors/genpop/01-intro.tex @@ -2,9 +2,9 @@ \section{Introduction} -In this paper we present an exploration and defense against the presence of new commercial entities in digitally powered platforms, preserving market equilibrium in the age of AI. This research establishes the following contributions: definition and formalization of non-human transactors in e-commerce platforms, development of a testing-ground for capturing the behavioral essence of these transactors across a large variety of digital systems, construction of a discriminative model (to prove separability) as a strong learner for downstream mitigation of contamination by non-human entities, translation of such learned separability into existing dynamic pricing machine learning loops, and finally establishment of a high-level KPI-affecting causal effect and cost-saving framework for the future of internet commerce in the presence of such non-human learners. +In this paper we present an exploration and defense against the presence of new commercial entities in digitally powered platforms, preserving market equilibrium in the age of AI. 
This research establishes the following contributions: definition and formalization of non-human transactors in e-commerce platforms, development of a testing-ground for capturing the behavioral essence of these transactors across a large variety of digital systems, construction of a discriminative model (to prove distinguishability) as a strong learner for downstream mitigation of contamination by non-human entities, translation of such learned distinguishability into existing dynamic pricing machine learning loops, and finally establishment of a high-level KPI-affecting causal effect and cost-saving framework for the future of internet commerce in the presence of such non-human learners. -This research effort touches a large variety of domains, spanning behavioral economics for understanding the rationality of behavior as theorized by the concept of homo economicus, agent-based modeling to translate our learned separability into disjoint dynamic pricing systems, reinforcement learning which serves as the SOTA for price-learners, and dynamic pricing and market equilibrium theory to understand the risks of possible supra-competitive pricing phenomena in cases of adversarial pricing systems driving the market out of equilibrium. \footnote{Given the rapid evolution of the field we acknowledge all developments with a cutoff set at the date of March 1st 2026.} +This research effort touches a large variety of domains, spanning behavioral economics for understanding the rationality of behavior as theorized by the concept of homo economicus, agent-based modeling to translate our learned distinguishability into disjoint dynamic pricing systems, reinforcement learning which serves as the SOTA for price-learners, and dynamic pricing and market equilibrium theory to understand the risks of possible supra-competitive pricing phenomena in cases of adversarial pricing systems driving the market out of equilibrium. 
\footnote{Given the rapid evolution of the field we acknowledge all developments with a cutoff set at the date of March 1st 2026.} \subsection{Motivation and Market Context} @@ -25,7 +25,7 @@ We formally define interaction data as coming from some actor which can either b This dissertation is organized around one main research question and three supporting sub-questions: \begin{enumerate} \item[\textbf{Main RQ}] How can dynamic pricing systems preserve margin integrity when transaction orchestration is increasingly mediated by non-human agents? - \item[\textbf{SQ1}] \textit{Separability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting? + \item[\textbf{SQ1}] \textit{Distinguishability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting? \item[\textbf{SQ2}] \textit{Theoretical Impact}: What is the formal relationship between agent contamination levels and the erosion of pricing power in dynamic pricing systems? \item[\textbf{SQ3}] \textit{Robust Mitigation}: How can pricing policies be constructed to maintain margin integrity under unknown and non-stationary levels of agent contamination? \end{enumerate} @@ -59,4 +59,4 @@ Extract final result from terminal state\; \end{algorithm} -The previously described goal of separability allows us to formulate a task which entails taking raw interaction data for either actor and creating a composite demand estimate. We propose a robust optimization objective defined in our methodology, transforming the pricing problem into a form of Distributionally Robust Optimization \parencite{kuhn_distributionally_2025} where the learner must guard against adversarial contamination in observed demand distributors. 
In this setting we must learn to make decision that perform under the assumption of not having a single estimated probability distribution but under an ambiguity set of any distribution, of which we have limited information. In our case as stated is a mixture of distributions with a parameter which is unknown and non-stationary. +The previously described goal of distinguishability allows us to formulate a task which entails taking raw interaction data for either actor and creating a composite demand estimate. We propose a robust optimization objective defined in our methodology, transforming the pricing problem into a form of Distributionally Robust Optimization \parencite{kuhn_distributionally_2025} where the learner must guard against adversarial contamination in observed demand distributions. In this setting we must learn to make decisions that perform under the assumption of not having a single estimated probability distribution but under an ambiguity set of any distribution, of which we have limited information. In our case, as stated, this is a mixture of distributions with a parameter which is unknown and non-stationary. diff --git a/paper/src/mirrors/genpop/02-literature-review.tex b/paper/src/mirrors/genpop/02-literature-review.tex index e9c9bf8..e28b37d 100644 --- a/paper/src/mirrors/genpop/02-literature-review.tex +++ b/paper/src/mirrors/genpop/02-literature-review.tex @@ -1,6 +1,6 @@ \section{Literature Review} -To better understand all wedges of the current works, we must start by exploring the nature of agents, agentic computer use and web automation, complementing that with economic reasoning and strategic interaction. The final surface to cover, leads us to data-driven dynamic pricing under uncertainty. The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on.
This latter case of agents shopping is currently pending legal action in the case of \textcite{noauthor_amazoncom_2026} which is currently being treated as a violation of the Computer Fraud and Abuse Act. The introduction of these mediating actor entities into economic systems, is further creating a threat of false-name bidding \parencite{yokoo_effect_2004}, which prior research has explored in a trading context. Other research on pseudonyms in dynamic systems, demonstrate whitewashing in AI agents which can ignore defensive mechanisms by re-entry with different identities \parencite{feldman_free-riding_2004}. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for separating non-human reconnaissance from genuine human demand expression and integrating that separation into pricing heuristics without degrading legitimate user experience (in our research tracked by the user-experience index). This gap, is what our contribution aims to address, particularly for the aforementioned stakeholder groups. +To better understand all wedges of the current works, we must start by exploring the nature of agents, agentic computer use and web automation, complementing that with economic reasoning and strategic interaction. The final surface to cover leads us to data-driven dynamic pricing under uncertainty. The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on. This latter case of agents shopping is currently pending legal action in the case of \textcite{noauthor_amazoncom_2026} which is currently being treated as a violation of the Computer Fraud and Abuse Act.
The introduction of these mediating actor entities into economic systems is further creating a threat of false-name bidding \parencite{yokoo_effect_2004}, which prior research has explored in a trading context. Other research on pseudonyms in dynamic systems demonstrates whitewashing in AI agents which can ignore defensive mechanisms by re-entry with different identities \parencite{feldman_free-riding_2004}. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for distinguishing non-human reconnaissance from genuine human demand expression and integrating that distinguishability into pricing heuristics without degrading legitimate user experience (in our research tracked by the user-experience index). This gap is what our contribution aims to address, particularly for the aforementioned stakeholder groups. \subsection{Agent Taxonomy and Definitions} diff --git a/paper/src/mirrors/genpop/03-methodology.tex b/paper/src/mirrors/genpop/03-methodology.tex index 55a57ac..222f553 100644 --- a/paper/src/mirrors/genpop/03-methodology.tex +++ b/paper/src/mirrors/genpop/03-methodology.tex @@ -1,6 +1,6 @@ \section{Methodology} -This section details the theoretical and practical framework developed to address dynamic pricing under the influence of non-human actors. We begin by formalizing the problem environment and the nature of the actors. We then derive the \textit{Cost of Information} (COI) theorem, proving the erosion of pricing power in the limit of agent saturation. Following this, we outline our generative contamination strategy using GOFAI-driven separability and transition probability learning. Finally, we formulate the robust control problem as a Stackelberg game solved via Distributionally Robust Reinforcement Learning (DR-RL) with constructed ambiguity sets.
+This section details the theoretical and practical framework developed to address dynamic pricing under the influence of non-human actors. We begin by formalizing the problem environment and the nature of the actors. We then derive the \textit{Cost of Information} (COI) theorem, proving the erosion of pricing power in the limit of agent saturation. Following this, we outline our generative contamination strategy using GOFAI-driven distinguishability and transition probability learning. Finally, we formulate the robust control problem as a Stackelberg game solved via Distributionally Robust Reinforcement Learning (DR-RL) with constructed ambiguity sets. \subsection{Problem Formalization} @@ -109,13 +109,13 @@ Since users act with motivations, we define a pool of tasks (jobs to be done) an A representative task is to find the cheapest feasible catalog item under explicit constraints while removing strict financial limits so we avoid trivial optimization behavior. Participants are also randomly assigned to one experimental platform mode (hotel or airline). Once assigned, they are dropped into the experiment with an actor ID. Under each experiment ID, we can observe multiple sessions across time and gather long interaction traces for the same actor. -The human data collection involved 18 participants, all of whom provided explicit informed consent prior to their session. Participants had an average age of 21 years and were recruited from a university population. Alongside the 18 human sessions we ran 18 agent sessions of equivalent task scope, giving a balanced dataset of 36 labeled trajectories. Each participant was assigned a single platform mode and a single task drawn from the pool, and completed the session independently without guidance on navigation or pricing strategy. +The human data collection involved 13 participants, all of whom provided explicit informed consent prior to their session. 
Participants had an average age of 21 years and were recruited from a university population. Alongside the 13 human sessions we ran 16 agent sessions of equivalent task scope, yielding 29 labeled trajectories in total (45\% human, 55\% agent). Each participant was assigned a single platform mode and a single task drawn from the pool, and completed the session independently without guidance on navigation or pricing strategy. To evaluate quality and realism of the setup, we store both structured event logs and full interaction transcripts. This lets us combine quantitative analysis with transcript-level qualitative findings. The result is an isolated system where we can control the interaction process while preserving realistic behavior. -Operationally, goals and experiment runs are tracked in PostgreSQL. This data-acquisition phase is the first half of the methodology and is intentionally a disconnected component that feeds the later contributions. The second half uses collected behavioral traces to separate classes (agent vs human) with session-conditioned probability estimates, then injects those estimates into the pricing learner. +Operationally, goals and experiment runs are tracked in PostgreSQL. This data-acquisition phase is the first half of the methodology and is intentionally a disconnected component that feeds the later contributions. The second half uses collected behavioral traces to distinguish classes (agent vs human) with session-conditioned probability estimates, then injects those estimates into the pricing learner. -Our process follows three stages: (1) observe and vectorize behavioral interactions, (2) learn separability to characterize human versus agent patterns, and (3) use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator. 
+Our process follows three stages: (1) observe and vectorize behavioral interactions, (2) learn distinguishability to characterize human versus agent patterns, and (3) use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator. \begin{figure}[ht] \resizebox{\columnwidth}{!}{% @@ -209,15 +209,15 @@ In the simulator baseline this order is encoded with a compact fixed scale: cart In addition to behavioral events, the platform logs price observations to a separate Kafka topic. Each price query generates a record associating the product, displayed price, requesting session, platform mode, and timestamp. This dual-stream architecture enables joint analysis of price exposure and behavioral response. -\subsection{Generative Contamination and Separability} +\subsection{Generative Contamination and Distinguishability} To train a robust pricing learner, we need a simulator that can generate realistic interaction data under controlled contamination. We build this from Phantom data using a two-stage approach. -\subsubsection{Ground-Truth Separability} +\subsubsection{Ground-Truth Distinguishability} -Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels (human or agent) are available as ground truth rather than as the output of a heuristic classifier. We therefore estimate separate transition kernels directly from each labeled partition, treating the resulting human and agent kernels as the ground-truth behavioral profiles for each class. We then ask a direct methodological question: are the kernels separable enough to justify downstream pricing control that depends on that separability? +Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels (human or agent) are available as ground truth rather than as the output of a heuristic classifier. 
We therefore estimate separate transition kernels directly from each labeled partition, treating the resulting human and agent kernels as the ground-truth behavioral profiles for each class. We then ask a direct methodological question: are the kernels distinguishable enough to justify downstream pricing control that depends on that distinguishability? -To answer this, we compute per-session divergence scores against both class-level centroids. For each session in either partition, we fit a session-level event transition kernel from that session's trajectory alone, then compute its average divergence to the human centroid and to the agent centroid. The per-session separability score is the gap between these two divergences: a negative value indicates proximity to human behavior, a positive value indicates proximity to agent behavior. +To answer this, we compute per-session divergence scores against both class-level centroids. For each session in either partition, we fit a session-level event transition kernel from that session's trajectory alone, then compute its average divergence to the human centroid and to the agent centroid. The per-session distinguishability score is the gap between these two divergences: a negative value indicates proximity to human behavior, a positive value indicates proximity to agent behavior. We cannot assume normal distributions for divergence scores, which are right-skewed and bounded below by zero, so we do not use a Student's t-test. Instead we apply a Mann-Whitney U test \parencite{mann_test_1947} on the per-session gap scores between the two groups. The Mann-Whitney test is a rank-based nonparametric test that compares the ordering of two independent samples without distributional assumptions, making it appropriate for small samples drawn from skewed populations. 
@@ -305,7 +305,7 @@ We also consider taxation-like overlays for agent traffic under strategy-proof m \subsubsection{Pricing Mechanism Summary} -We now present the complete pricing mechanism that integrates the behavioral separability, contamination estimation, and robust optimization components developed in the preceding sections. The defensive pricing loop algorithm formalizes the process as a Stackelberg game where the platform (leader) sets prices and the aggregate demand (follower) responds through observed session trajectories. +We now present the complete pricing mechanism that integrates the behavioral distinguishability, contamination estimation, and robust optimization components developed in the preceding sections. The defensive pricing loop algorithm formalizes the process as a Stackelberg game where the platform (leader) sets prices and the aggregate demand (follower) responds through observed session trajectories. \begin{algorithm}[t] \caption{PHANTOM defensive pricing loop} diff --git a/paper/src/mirrors/genpop/04-results.tex b/paper/src/mirrors/genpop/04-results.tex index bbe6c9d..6a2dc74 100644 --- a/paper/src/mirrors/genpop/04-results.tex +++ b/paper/src/mirrors/genpop/04-results.tex @@ -1,14 +1,14 @@ \section{Results} \begin{figure}[ht] \centering - \input{chapters/figures/supra.tex} + \input{chapters/figures/supra/supra.tex} \caption{Evolution of price distributions over experiment steps. The heatmap illustrates the density of price offerings. 
This is an early baseline simulation which demonstrates supra-competitive price-setting in deep learning agents such as SAC as can be clearly seen by the high density at the highest available price.} \label{fig:supra_heatmap} \end{figure} \subsection{Behavioral Analysis} -Separability between human and agent sessions is evaluated by computing per-session divergence gap scores (how much closer each session is to the human baseline versus the agent baseline) and comparing the two groups with a Mann-Whitney U test. The table below reports the group-level descriptive statistics for the gap scores and the test result. +Distinguishability between human and agent sessions is evaluated by computing per-session divergence gap scores (how much closer each session is to the human baseline versus the agent baseline) and comparing the two groups with a Mann-Whitney U test. The full recorded cohort contains 13 human sessions and 16 agent sessions, and the table below reports the corresponding group-level statistics and test result. \begin{table}[ht] \centering @@ -18,19 +18,19 @@ Separability between human and agent sessions is evaluated by computing per-sess \toprule Group & n & Mean gap & Std \\ \midrule -Human sessions & 11 & $-3.3522$ & $2.6748$ \\ -Agent sessions & 6 & $+1.6482$ & $2.8349$ \\ +Human sessions & 13 & $-3.35$ & $2.67$ \\ +Agent sessions & 16 & $+1.65$ & $2.83$ \\ \midrule -\multicolumn{4}{l}{Mann-Whitney $U = 2.0$, $p = 0.0006$ (two-sided)} \\ +\multicolumn{4}{l}{Mann-Whitney two-sided test: $p<0.001$} \\ \bottomrule \end{tabular} \end{table} -The sign structure is consistent with the theoretical expectation: human sessions produce negative gap scores (closer to the human centroid, far from the agent centroid) while agent sessions produce positive gap scores (closer to the agent centroid). 
The two-sided p-value of 0.0006 (which means there is only a 0.06\% chance this pattern occurred by random luck) indicates near-complete rank separation between the groups at n=11 humans and n=6 agents, providing strong evidence that the transition kernels are separable enough to justify their use as a control signal in downstream pricing. +The sign structure is consistent with the theoretical expectation: human sessions produce negative gap scores (closer to the human centroid, far from the agent centroid) while agent sessions produce positive gap scores (closer to the agent centroid). The two-sided test result (p less than 0.001) at n=13 humans and n=16 agents indicates strong rank distinction between groups, providing evidence that the transition kernels are distinguishable enough to justify their use as a control signal in downstream pricing. \subsection{Experimental Outcomes} -To evaluate robustness contributions, we compare two policies on the same environment family: (i) robust pricing with COI-aware reward and adversarial contamination step, and (ii) non-robust baseline with revenue-only reward (no-robust flag). +To evaluate robustness contributions, we compare two policies on the same environment family: (i) robust pricing with COI-aware reward and adversarial contamination step, and (ii) a baseline policy with revenue-only reward. 
\begin{table}[ht] \centering @@ -41,7 +41,7 @@ To evaluate robustness contributions, we compare two policies on the same enviro Policy & Eval reward & Eval revenue & COI leakage & Margin collapse rate \\ \midrule Robust policy & \textit{TBD} & \textit{TBD} & \textit{TBD} & \textit{TBD} \\ -Non-robust baseline (\texttt{--no-robust}) & \textit{TBD} & \textit{TBD} & \textit{TBD} & \textit{TBD} \\ +Baseline policy & \textit{TBD} & \textit{TBD} & \textit{TBD} & \textit{TBD} \\ \bottomrule \end{tabular} \end{table} @@ -50,6 +50,6 @@ This comparison isolates the effect of robustness terms from model capacity and \subsection{Interpretation and Insights} -The Mann-Whitney result (U=2.0, p less than 0.001) confirms that per-session divergence gaps separate the two actor classes with near-zero overlap in rank ordering. This is the condition required for separability to act as a useful control signal in the pricing loop rather than just an auxiliary classifier score. +The Mann-Whitney result (p less than 0.001) confirms that per-session divergence gaps distinguish the two actor classes with near-zero overlap in rank ordering. This is the condition required for distinguishability to act as a useful control signal in the pricing loop rather than just an auxiliary classifier score. 
\subsection{Anomalies} diff --git a/paper/src/preamble.tex b/paper/src/preamble.tex index d8f9876..9b680c1 100644 --- a/paper/src/preamble.tex +++ b/paper/src/preamble.tex @@ -40,7 +40,7 @@ % Configure cleveref for algorithm2e \crefname{algocf}{Algorithm}{Algorithms} -\usetikzlibrary{positioning, shapes, arrows.meta, fit, backgrounds} +\usetikzlibrary{positioning, shapes, arrows.meta, fit, backgrounds, calc} \lstset{ basicstyle=\ttfamily\footnotesize, breaklines=true, diff --git a/requirements.txt b/requirements.txt index 247121e..71af617 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ pandas jupyter ipykernel matplotlib +tikzplotlib graphviz browser-use pytest @@ -13,3 +14,4 @@ scikit-learn supabase pymc wandb +huggingface_hub diff --git a/scripts/hf_data.py b/scripts/hf_data.py new file mode 100644 index 0000000..120165c --- /dev/null +++ b/scripts/hf_data.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +"""Sync collected behavioral data with HuggingFace Hub. + +Usage: + python scripts/hf_data.py pull # download from HF to local directories + python scripts/hf_data.py push # upload local directories to HF + +Expects HF_TOKEN env var (or logged in via `huggingface-cli login`). 
+Repo id comes from HF_DATASET_REPO env var, default: velocitatem/phantom-collected-data +""" + +import argparse +import os +import sys +from pathlib import Path + +from huggingface_hub import HfApi, snapshot_download + +PROJECT_ROOT = Path(__file__).resolve().parent.parent +HUMAN_DIR = PROJECT_ROOT / "experiments" / "collected_data" +AGENT_DIR = PROJECT_ROOT / "experiments" / "agents" / "collected_data" + +DEFAULT_REPO = "velocitatem/phantom-collected-data" + +# mapping between local dirs and their prefix inside the HF repo +SLOT_MAP = {"human": HUMAN_DIR, "agent": AGENT_DIR} + + +def _repo_id() -> str: + return os.getenv("HF_DATASET_REPO", DEFAULT_REPO) + + +def _token() -> str | None: + return os.getenv("HF_TOKEN") or None + + +def push(): + api = HfApi(token=_token()) + repo = _repo_id() + api.create_repo(repo, repo_type="dataset", exist_ok=True, private=True) + + for prefix, local_dir in SLOT_MAP.items(): + if not local_dir.exists(): + print(f"skip {prefix}: {local_dir} does not exist") + continue + sessions = [d for d in local_dir.iterdir() if d.is_dir()] + if not sessions: + print(f"skip {prefix}: no session directories") + continue + print(f"uploading {len(sessions)} sessions from {prefix}/ ...") + api.upload_folder( + repo_id=repo, + repo_type="dataset", + folder_path=str(local_dir), + path_in_repo=prefix, + commit_message=f"update {prefix} data ({len(sessions)} sessions)", + ) + print("push complete") + + +def pull(): + repo = _repo_id() + token = _token() + cache = snapshot_download(repo, repo_type="dataset", token=token) + cache = Path(cache) + + for prefix, local_dir in SLOT_MAP.items(): + src = cache / prefix + if not src.exists(): + print(f"skip {prefix}: not present in remote") + continue + local_dir.mkdir(parents=True, exist_ok=True) + sessions = [d for d in src.iterdir() if d.is_dir()] + pulled = 0 + for sess in sessions: + dest = local_dir / sess.name + dest.mkdir(exist_ok=True) + for f in sess.iterdir(): + if f.is_file(): + (dest / 
f.name).write_bytes(f.read_bytes()) + pulled += 1 + print(f"{prefix}: pulled {len(sessions)} sessions ({pulled} files)") + print("pull complete") + + +def main(): + p = argparse.ArgumentParser(description="Sync collected data with HuggingFace Hub") + p.add_argument("action", choices=["pull", "push"], help="pull or push data") + args = p.parse_args() + {"pull": pull, "push": push}[args.action]() + + +if __name__ == "__main__": + main() diff --git a/scripts/launch_calibration_screen.sh b/scripts/launch_calibration_screen.sh new file mode 100755 index 0000000..6e312a5 --- /dev/null +++ b/scripts/launch_calibration_screen.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +export RAY_MODE="${RAY_MODE:-sweep}" +export SWEEP_KIND="${SWEEP_KIND:-ppo_block_a}" +export SWEEP_METHOD="${SWEEP_METHOD:-grid}" +export SWEEP_PROFILE="${SWEEP_PROFILE:-default}" +export SWEEP_RUN_CAP="${SWEEP_RUN_CAP:-27}" +export COMPARE_ROBUST="${COMPARE_ROBUST:-1}" +export NUM_NODES="${NUM_NODES:-3}" +export AGENTS_PER_NODE="${AGENTS_PER_NODE:-4}" +export AGENT_COUNT="${AGENT_COUNT:-0}" +export INNER_THREADS="${INNER_THREADS:-1}" +export PHANTOM_JAX_PLATFORM="${PHANTOM_JAX_PLATFORM:-cpu}" +export OUTPUT_ROOT="${OUTPUT_ROOT:-engine/studies/results/block_a_sweep}" + +if [ -z "${WORKER_CPUS:-}" ]; then + export WORKER_CPUS="$((AGENTS_PER_NODE * INNER_THREADS))" +fi + +printf '%s\n' "Launching Block A PPO calibration sweep" +printf '%s\n' "RAY_MODE=$RAY_MODE" +printf '%s\n' "SWEEP_KIND=$SWEEP_KIND" +printf '%s\n' "SWEEP_METHOD=$SWEEP_METHOD" +printf '%s\n' "SWEEP_RUN_CAP=$SWEEP_RUN_CAP" +printf '%s\n' "COMPARE_ROBUST=$COMPARE_ROBUST" +printf '%s\n' "NUM_NODES=$NUM_NODES" +printf '%s\n' "AGENTS_PER_NODE=$AGENTS_PER_NODE" +printf '%s\n' "AGENT_COUNT=$AGENT_COUNT" +printf '%s\n' "INNER_THREADS=$INNER_THREADS" +printf '%s\n' "WORKER_CPUS=$WORKER_CPUS" +printf '%s\n' "OUTPUT_ROOT=$OUTPUT_ROOT" + +cd "$ROOT" +bash ./submit_ray_job.sh 
diff --git a/scripts/nx_research.sh b/scripts/nx_research.sh index 434a312..4cc39ee 100644 --- a/scripts/nx_research.sh +++ b/scripts/nx_research.sh @@ -4,6 +4,7 @@ set -euo pipefail cmd="${1:-}" env_file="${SWEEP_ENV_FILE:-.env.sweep}" +default_tpu_conf="tpu_orchestration/configs/v4_spot_us.conf" load_sweep_env() { set -a @@ -20,6 +21,21 @@ require_var() { fi } +run_tpu_ray_bootstrap() { + local mode_flag="${1:-}" + load_sweep_env + local conf_path="${TPU_CONF:-$default_tpu_conf}" + [ -f "$conf_path" ] || { + printf '%s\n' "TPU config not found: $conf_path" >&2 + exit 1 + } + if [ -n "$mode_flag" ]; then + bash tpu_orchestration/bootstrap_ray.sh --conf "$conf_path" "$mode_flag" + else + bash tpu_orchestration/bootstrap_ray.sh --conf "$conf_path" + fi +} + case "$cmd" in install) [ -x .venv/bin/python ] || python3 -m venv .venv @@ -120,6 +136,32 @@ PY docker build -f docker/Trainer.dockerfile --target gpu -t "$image_ref:gpu-latest" . docker push "$image_ref:gpu-latest" ;; + whoclicked-publish) + require_var HF_TOKEN "HF_TOKEN required - export HF_TOKEN=" + .venv/bin/python scripts/whoclicked_etl.py build-upload \ + --output "${WHOCLICKED_CSV:-experiments/exports/whoclicked.csv}" \ + --repo "${WHOCLICKED_REPO:-velocitatem/whoclickedit}" \ + --path-in-repo "${WHOCLICKED_CSV_PATH_IN_REPO:-whoclicked.csv}" \ + --message "${WHOCLICKED_DATASET_MESSAGE:-Update flattened whoclickedit dataset}" + .venv/bin/python scripts/whoclicked_card.py build-upload \ + --csv "${WHOCLICKED_CSV:-experiments/exports/whoclicked.csv}" \ + --card "${WHOCLICKED_CARD:-experiments/exports/whoclicked_dataset_card.md}" \ + --repo "${WHOCLICKED_REPO:-velocitatem/whoclickedit}" \ + --path-in-repo "${WHOCLICKED_CARD_PATH_IN_REPO:-README.md}" \ + --message "${WHOCLICKED_CARD_MESSAGE:-Update dataset card for whoclickedit}" + ;; + tpu-ray-bootstrap) + run_tpu_ray_bootstrap + ;; + tpu-ray-deps) + run_tpu_ray_bootstrap --deps-only + ;; + tpu-ray-verify) + run_tpu_ray_bootstrap --verify-only + ;; + 
tpu-ray-teardown) + run_tpu_ray_bootstrap --teardown + ;; *) printf '%s\n' "Unknown research command: $cmd" >&2 exit 1 diff --git a/scripts/ray_distributed_train.py b/scripts/ray_distributed_train.py new file mode 100644 index 0000000..773fddd --- /dev/null +++ b/scripts/ray_distributed_train.py @@ -0,0 +1,667 @@ +from __future__ import annotations + +import argparse +import contextlib +import concurrent.futures +import os +import shlex +import subprocess +import sys +import threading +import time +from pathlib import Path + +import ray +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy + + +def _has_flag(tokens: list[str], name: str) -> bool: + return any(tok == name or tok.startswith(f"{name}=") for tok in tokens) + + +def _entry_tokens(run_kind: str, entry_args: str) -> list[str]: + tokens = shlex.split(entry_args) + if run_kind == "benchmark" and not ( + _has_flag(tokens, "--run-kind") or _has_flag(tokens, "--run-mode") + ): + return ["--run-kind", "benchmark", *tokens] + return tokens + + +def _get_flag_value(tokens: list[str], name: str, default: str = "") -> str: + for idx, tok in enumerate(tokens): + if tok == name and idx + 1 < len(tokens): + return str(tokens[idx + 1]) + if tok.startswith(f"{name}="): + return str(tok.split("=", 1)[1]) + return str(default) + + +def _set_flag_value(tokens: list[str], name: str, value: str) -> list[str]: + updated: list[str] = [] + replaced = False + idx = 0 + while idx < len(tokens): + tok = tokens[idx] + if tok == name: + replaced = True + updated.extend([name, str(value)]) + idx += 2 + continue + if tok.startswith(f"{name}="): + replaced = True + updated.append(f"{name}={value}") + idx += 1 + continue + updated.append(tok) + idx += 1 + if not replaced: + updated.extend([name, str(value)]) + return updated + + +def _remove_flag(tokens: list[str], name: str) -> list[str]: + updated: list[str] = [] + idx = 0 + while idx < len(tokens): + tok = tokens[idx] + if tok == name: + idx += 1 + continue + if 
tok.startswith(f"{name}="): + idx += 1 + continue + updated.append(tok) + idx += 1 + return updated + + +def _csv_values(raw: str) -> list[str]: + return [piece.strip() for piece in str(raw).split(",") if piece.strip()] + + +def _alpha_token(alpha: str) -> str: + return str(alpha).replace(".", "p").replace("-", "m") + + +def _truthy(value: str | bool | None) -> bool: + if isinstance(value, bool): + return value + if value is None: + return False + return str(value).strip().lower() in {"1", "true", "yes", "on"} + + +def _alive_nodes() -> list[tuple[str, str, bool, float]]: + seen: set[str] = set() + nodes: list[tuple[str, str, bool, float]] = [] + for node in ray.nodes(): + if not bool(node.get("Alive", False)): + continue + node_id = str(node.get("NodeID", "")).strip() + ip = str(node.get("NodeManagerAddress", "")).strip() + if not node_id or not ip or node_id in seen: + continue + resources = node.get("Resources", {}) or {} + is_head = bool(resources.get("node:__internal_head__", 0.0)) + tpu = float(resources.get("TPU", 0.0)) + seen.add(node_id) + nodes.append((node_id, ip, is_head, tpu)) + return sorted(nodes, key=lambda item: (item[1], item[2], -item[3], item[0])) + + +def _dedupe_nodes_for_tpu( + nodes: list[tuple[str, str, bool, float]], +) -> tuple[list[tuple[str, str]], list[dict[str, str | float | bool]]]: + selected: dict[str, tuple[str, str, bool, float]] = {} + dropped: list[dict[str, str | float | bool]] = [] + + def _score(item: tuple[str, str, bool, float]) -> tuple[int, float, str]: + node_id, _ip, is_head, tpu = item + return (1 if bool(is_head) else 0, -float(tpu), str(node_id)) + + for item in nodes: + node_id, ip, is_head, tpu = item + existing = selected.get(ip) + if existing is None: + selected[ip] = item + continue + + keep, drop = ( + (item, existing) if _score(item) < _score(existing) else (existing, item) + ) + selected[ip] = keep + dropped.append( + { + "ip": str(ip), + "dropped_node_id": str(drop[0]), + "dropped_is_head": bool(drop[2]), + 
"dropped_tpu": float(drop[3]), + "kept_node_id": str(keep[0]), + "kept_is_head": bool(keep[2]), + "kept_tpu": float(keep[3]), + } + ) + + entries = [(node_id, ip) for ip, (node_id, _ip, _is_head, _tpu) in selected.items()] + entries.sort(key=lambda item: (item[1], item[0])) + return entries, dropped + + +def _benchmark_cells( + tokens: list[str], *, compare_robust: bool +) -> list[tuple[str, str, str, bool]]: + tiers = _csv_values( + _get_flag_value(tokens, "--tiers", "static,surge,linear,qtable,ppo") + ) + alphas = _csv_values(_get_flag_value(tokens, "--alpha-values", "0.0,0.3,0.6")) + base_no_robust = _has_flag(tokens, "--no-robust") + if compare_robust: + modes = [("robust", False), ("no_robust", True)] + else: + modes = [("no_robust", True)] if base_no_robust else [("robust", False)] + return [ + (tier, alpha, mode_label, no_robust) + for tier in tiers + for alpha in alphas + for mode_label, no_robust in modes + ] + + +def _thread_limited_env(env: dict[str, str], threads: int) -> dict[str, str]: + bounded = dict(env) + n = str(max(1, int(threads))) + for key in ( + "OMP_NUM_THREADS", + "MKL_NUM_THREADS", + "OPENBLAS_NUM_THREADS", + "NUMEXPR_NUM_THREADS", + "VECLIB_MAXIMUM_THREADS", + "BLIS_NUM_THREADS", + ): + bounded[key] = n + return bounded + + +@contextlib.contextmanager +def _semaphore_guard(semaphore: threading.Semaphore | None): + if semaphore is None: + yield + return + semaphore.acquire() + try: + yield + finally: + semaphore.release() + + +def _run_benchmark_cells_parallel( + *, + root: str, + env: dict[str, str], + base_tokens: list[str], + compare_robust: bool, + inner_workers: int, + inner_threads: int, + max_heavy_workers: int, + rank: int, +) -> int: + cells = _benchmark_cells(base_tokens, compare_robust=compare_robust) + if not cells: + return 0 + + cwd = str(Path(root)) + base_out = _get_flag_value(base_tokens, "--output-dir", "engine/studies/results") + max_workers = max(1, min(int(inner_workers), len(cells))) + heavy_tiers = {"ppo", "a2c", 
"dqn"} + heavy_limit = max(1, int(max_heavy_workers)) + heavy_sem = threading.Semaphore(heavy_limit) + print( + { + "rank": int(rank), + "benchmark_cells": len(cells), + "inner_workers": int(max_workers), + "inner_threads": int(max(1, int(inner_threads))), + "heavy_limit": int(heavy_limit), + } + ) + + def _run_cell( + index: int, + total: int, + tier: str, + alpha: str, + mode_label: str, + no_robust: bool, + ) -> tuple[str, str, str, int]: + tokens = list(base_tokens) + tokens = _set_flag_value(tokens, "--tiers", tier) + tokens = _set_flag_value(tokens, "--alpha-values", alpha) + if no_robust: + if not _has_flag(tokens, "--no-robust"): + tokens.append("--no-robust") + else: + tokens = _remove_flag(tokens, "--no-robust") + + cell_out = ( + Path(base_out) + / f"tier_{tier}" + / f"mode_{mode_label}" + / f"alpha_{_alpha_token(alpha)}" + ) + tokens = _set_flag_value(tokens, "--output-dir", str(cell_out)) + cmd = [sys.executable, "-m", "engine.train", *tokens] + cell_env = _thread_limited_env(env, int(inner_threads)) + cell_env["PHANTOM_BENCHMARK_COMPARE_ROBUST"] = "0" + print( + { + "rank": int(rank), + "cell": f"{index}/{total}", + "tier": tier, + "mode": mode_label, + "alpha": alpha, + "command": " ".join(cmd), + } + ) + heavy_guard = heavy_sem if str(tier).lower() in heavy_tiers else None + with _semaphore_guard(heavy_guard): + proc = subprocess.run(cmd, cwd=cwd, env=cell_env) + return tier, alpha, mode_label, int(proc.returncode) + + failures: list[tuple[str, str, str, int]] = [] + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool: + futures = [ + pool.submit(_run_cell, idx, len(cells), tier, alpha, mode_label, no_robust) + for idx, (tier, alpha, mode_label, no_robust) in enumerate(cells, start=1) + ] + for fut in concurrent.futures.as_completed(futures): + tier, alpha, mode_label, code = fut.result() + if code != 0: + failures.append((tier, alpha, mode_label, code)) + + if failures: + print({"rank": int(rank), "benchmark_failures": 
failures}) + return 1 + return 0 + + +def _run_sweep_agents_parallel( + *, + root: str, + env: dict[str, str], + base_tokens: list[str], + run_kind: str, + rank: int, + agents_per_node: int, + agent_count: int, + inner_threads: int, + tpu_agent_slots: int, +) -> int: + total = max(1, int(agents_per_node)) + cwd = str(Path(root)) + wants_tpu = str(env.get("JAX_PLATFORMS", "")).strip().lower() == "tpu" + tpu_slots = max(0, int(tpu_agent_slots)) + print( + { + "rank": int(rank), + "sweep_agents": int(total), + "agent_count": int(agent_count), + "inner_threads": int(max(1, int(inner_threads))), + "jax_platform": str(env.get("JAX_PLATFORMS", "")), + "tpu_agent_slots": int(tpu_slots), + } + ) + + def _run_agent(slot: int) -> int: + tokens = list(base_tokens) + if int(agent_count) > 0 and not _has_flag(tokens, "--count"): + tokens.extend(["--count", str(int(agent_count))]) + + if _has_flag(tokens, "--group"): + base_group = _get_flag_value(tokens, "--group", "ray-sweep") + tokens = _set_flag_value(tokens, "--group", f"{base_group}-a{slot}") + + if run_kind == "benchmark": + out_dir = _get_flag_value(tokens, "--output-dir", "engine/studies/results") + tokens = _set_flag_value( + tokens, "--output-dir", str(Path(out_dir) / f"agent_{slot}") + ) + if run_kind == "train": + model_dir = _get_flag_value(tokens, "--model-dir", "engine/models") + tokens = _set_flag_value( + tokens, "--model-dir", str(Path(model_dir) / f"agent_{slot}") + ) + + cmd = [sys.executable, "-m", "engine.train", *tokens] + agent_env = _thread_limited_env(env, int(inner_threads)) + if wants_tpu and tpu_slots > 0 and int(slot) > tpu_slots: + agent_env["JAX_PLATFORMS"] = "cpu" + agent_env["JAX_PLATFORM_NAME"] = "cpu" + agent_env["PHANTOM_SWEEP_AGENT_SLOT"] = str(int(slot)) + print( + { + "rank": int(rank), + "agent_slot": int(slot), + "jax_platform": str(agent_env.get("JAX_PLATFORMS", "")), + "command": " ".join(cmd), + } + ) + proc = subprocess.run(cmd, cwd=cwd, env=agent_env) + return int(proc.returncode) + 
+ failures: list[tuple[int, int]] = [] + with concurrent.futures.ThreadPoolExecutor(max_workers=total) as pool: + future_map = { + pool.submit(_run_agent, slot): slot for slot in range(1, total + 1) + } + for future in concurrent.futures.as_completed(future_map): + slot = int(future_map[future]) + code = int(future.result()) + if code != 0: + failures.append((slot, code)) + + if failures: + print({"rank": int(rank), "sweep_failures": failures}) + return 1 + return 0 + + +@ray.remote(max_retries=0) +def _train_on_node( + *, + root: str, + run_kind: str, + entry_args: str, + node_id: str, + node_ip: str, + rank: int, + world_size: int, + coordinator_ip: str, + coordinator_port: int, + base_seed: int, + run_group: str, + compare_robust: bool, + output_root: str, + wandb_entity: str, + wandb_project: str, + agents_per_node: int, + agent_count: int, + inner_workers: int, + inner_threads: int, + max_heavy_workers: int, + sync_jax: bool, +) -> int: + env = dict(os.environ) + env["PYTHONUNBUFFERED"] = "1" + requested_platform = str(env.get("PHANTOM_JAX_PLATFORM", "tpu")).strip().lower() + allow_multi_node_tpu = _truthy(env.get("PHANTOM_ALLOW_MULTI_NODE_TPU")) + if world_size > 1 and requested_platform == "tpu" and not allow_multi_node_tpu: + requested_platform = "cpu" + print( + "PHANTOM_DISTRIBUTED_NOTE: forcing JAX_PLATFORMS=cpu for multi-node SB3 runs " + "(set PHANTOM_ALLOW_MULTI_NODE_TPU=1 to keep TPU for JAX workloads)" + ) + elif world_size > 1 and requested_platform == "tpu" and allow_multi_node_tpu: + print( + "PHANTOM_DISTRIBUTED_NOTE: keeping JAX_PLATFORMS=tpu in multi-node mixed mode" + ) + env["JAX_PLATFORMS"] = requested_platform + if requested_platform == "cpu": + env["JAX_PLATFORM_NAME"] = "cpu" + else: + env.pop("JAX_PLATFORM_NAME", None) + if requested_platform == "tpu" and world_size > 1 and allow_multi_node_tpu: + env["CLOUD_TPU_TASK_ID"] = str(int(rank)) + print( + { + "rank": int(rank), + "node_ip": str(node_ip), + "jax_platform": "tpu", + 
"cloud_tpu_task_id": str(env["CLOUD_TPU_TASK_ID"]), + } + ) + else: + # Keep each process in single-host mode when TPU multi-host is disabled. + env["CLOUD_TPU_TASK_ID"] = "0" + if run_kind == "benchmark": + env["PHANTOM_BENCHMARK_COMPARE_ROBUST"] = "1" if compare_robust else "0" + if wandb_entity: + env["WANDB_ENTITY"] = wandb_entity + if wandb_project: + env["WANDB_PROJECT"] = wandb_project + + cwd = str(Path(root)) + + try: + subprocess.run(["make", "data.pull"], cwd=cwd, env=env, check=True) + except (subprocess.SubprocessError, OSError): + pull_cmd = [sys.executable, "scripts/hf_data.py", "pull"] + subprocess.run(pull_cmd, cwd=cwd, env=env, check=True) + + if sync_jax and requested_platform == "tpu": + env_probe = dict(env) + env_probe["CLOUD_TPU_TASK_ID"] = str(rank) + probe = ( + "import jax; " + f"jax.distributed.initialize(coordinator_address='{coordinator_ip}:{coordinator_port}', " + f"num_processes={world_size}, process_id={rank}); " + "print('JAX_SYNC', jax.process_index(), jax.device_count(), jax.local_device_count())" + ) + subprocess.run( + [sys.executable, "-c", probe], cwd=cwd, env=env_probe, check=True + ) + + tokens = _entry_tokens(run_kind, entry_args) + is_sweep_agent = _has_flag(tokens, "--sweep-agent") + seed = int(base_seed + rank) + if not is_sweep_agent and not _has_flag(tokens, "--seed"): + tokens.extend(["--seed", str(seed)]) + + if run_kind == "train" and not _has_flag(tokens, "--group"): + tokens.extend(["--group", run_group]) + + if is_sweep_agent and int(agent_count) > 0 and not _has_flag(tokens, "--count"): + tokens.extend(["--count", str(int(agent_count))]) + + try: + tpu_agent_slots = int( + str( + env.get( + "PHANTOM_TPU_AGENT_SLOTS", + "1" if requested_platform == "tpu" else "0", + ) + ).strip() + ) + except ValueError: + tpu_agent_slots = 1 if requested_platform == "tpu" else 0 + + if ( + run_kind == "benchmark" + and output_root + and not _has_flag(tokens, "--output-dir") + ): + out_dir = Path(output_root) / f"rank_{rank}" / 
f"seed_{seed}" + out_dir.parent.mkdir(parents=True, exist_ok=True) + tokens.extend(["--output-dir", str(out_dir)]) + + if is_sweep_agent and int(agents_per_node) > 1: + return _run_sweep_agents_parallel( + root=root, + env=env, + base_tokens=tokens, + run_kind=run_kind, + rank=rank, + agents_per_node=int(agents_per_node), + agent_count=int(agent_count), + inner_threads=int(inner_threads), + tpu_agent_slots=int(max(0, tpu_agent_slots)), + ) + + if run_kind == "benchmark" and int(inner_workers) > 1 and not is_sweep_agent: + return _run_benchmark_cells_parallel( + root=root, + env=env, + base_tokens=tokens, + compare_robust=bool(compare_robust), + inner_workers=int(inner_workers), + inner_threads=int(inner_threads), + max_heavy_workers=int(max_heavy_workers), + rank=rank, + ) + + cmd = [sys.executable, "-m", "engine.train", *tokens] + print( + { + "node_id": node_id, + "node_ip": node_ip, + "rank": int(rank), + "run_kind": run_kind, + "seed": int(seed), + "compare_robust": bool(compare_robust), + "wandb_entity": str(env.get("WANDB_ENTITY", "")), + "wandb_project": str(env.get("WANDB_PROJECT", "")), + "command": " ".join(cmd), + } + ) + proc = subprocess.run( + cmd, cwd=cwd, env=_thread_limited_env(env, int(inner_threads)) + ) + return int(proc.returncode) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Launch one train/benchmark run per Ray TPU node" + ) + parser.add_argument("--run-kind", choices=["train", "benchmark"], default="train") + parser.add_argument("--entry-args", type=str, default="") + parser.add_argument("--train-args", type=str, default="") + parser.add_argument("--num-nodes", type=int, default=0) + parser.add_argument("--tpu-per-task", type=float, default=8.0) + parser.add_argument("--base-seed", type=int, default=42) + parser.add_argument("--sync-jax", action="store_true") + parser.add_argument("--coordinator-port", type=int, default=12355) + parser.add_argument("--run-group", type=str, default="") + 
parser.add_argument("--compare-robust", action="store_true") + parser.add_argument("--output-root", type=str, default="") + parser.add_argument("--wandb-entity", type=str, default="") + parser.add_argument("--wandb-project", type=str, default="") + parser.add_argument("--agents-per-node", type=int, default=1) + parser.add_argument("--agent-count", type=int, default=0) + parser.add_argument("--inner-workers", type=int, default=1) + parser.add_argument("--inner-threads", type=int, default=1) + parser.add_argument("--max-heavy-workers", type=int, default=2) + parser.add_argument("--worker-cpus", type=float, default=1.0) + args = parser.parse_args() + + entry_args = str(args.entry_args or args.train_args).strip() + if not entry_args: + raise ValueError("--entry-args (or legacy --train-args) is required") + + ray.init(address="auto") + + node_records = _alive_nodes() + if not node_records: + raise RuntimeError("No alive Ray nodes found") + + if float(args.tpu_per_task) > 0.0: + node_entries, dropped = _dedupe_nodes_for_tpu(node_records) + if dropped: + print( + { + "tpu_host_dedupe": True, + "alive_ray_nodes": len(node_records), + "unique_tpu_hosts": len(node_entries), + "dropped": dropped, + } + ) + else: + node_entries = [ + (node_id, node_ip) for node_id, node_ip, _is_head, _tpu in node_records + ] + + requested = int(args.num_nodes) + if requested > 0: + if requested > len(node_entries): + print( + { + "requested_nodes": int(requested), + "available_nodes": int(len(node_entries)), + "note": "requested nodes exceed available hosts; capping", + } + ) + node_entries = node_entries[:requested] + + world_size = len(node_entries) + coordinator_ip = node_entries[0][1] + run_group = args.run_group or f"ray-dist-{int(time.time())}" + + print( + { + "nodes": [ + {"node_id": node_id, "node_ip": node_ip} + for node_id, node_ip in node_entries + ], + "world_size": world_size, + "coordinator": f"{coordinator_ip}:{int(args.coordinator_port)}", + "run_kind": str(args.run_kind), + 
"entry_args": entry_args, + "run_group": run_group, + "compare_robust": bool(args.compare_robust), + "output_root": str(args.output_root), + "agents_per_node": int(args.agents_per_node), + "agent_count": int(args.agent_count), + "inner_workers": int(args.inner_workers), + "inner_threads": int(args.inner_threads), + "max_heavy_workers": int(args.max_heavy_workers), + } + ) + + futures = [] + root = str(Path(__file__).resolve().parents[1]) + for rank, (node_id, node_ip) in enumerate(node_entries): + resources: dict[str, float] = {} + tpu_per_task = float(args.tpu_per_task) + if tpu_per_task > 0.0: + resources["TPU"] = tpu_per_task + futures.append( + _train_on_node.options( + resources=resources, + num_cpus=float(args.worker_cpus), + scheduling_strategy=NodeAffinitySchedulingStrategy( + node_id=node_id, + soft=False, + ), + ).remote( + root=root, + run_kind=str(args.run_kind), + entry_args=entry_args, + node_id=node_id, + node_ip=node_ip, + rank=rank, + world_size=world_size, + coordinator_ip=coordinator_ip, + coordinator_port=int(args.coordinator_port), + base_seed=int(args.base_seed), + run_group=run_group, + compare_robust=bool(args.compare_robust), + output_root=str(args.output_root), + wandb_entity=str(args.wandb_entity), + wandb_project=str(args.wandb_project), + agents_per_node=int(args.agents_per_node), + agent_count=int(args.agent_count), + inner_workers=int(args.inner_workers), + inner_threads=int(args.inner_threads), + max_heavy_workers=int(args.max_heavy_workers), + sync_jax=bool(args.sync_jax and str(args.run_kind) == "train"), + ) + ) + + results = ray.get(futures) + failed = [code for code in results if int(code) != 0] + if failed: + raise SystemExit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/setuptpu.sh b/scripts/setuptpu.sh new file mode 100644 index 0000000..041266d --- /dev/null +++ b/scripts/setuptpu.sh @@ -0,0 +1,9 @@ +commands = ( + "pip install \"jax[tpu]\" -f 
https://storage.googleapis.com/jax-releases/libtpu_releases.html" + "pip install stable-baselines3>=2.2.0 gymnasium wandb tensorboard" + + +" + + +) diff --git a/scripts/wandb_compare_best.py b/scripts/wandb_compare_best.py new file mode 100644 index 0000000..544f9d8 --- /dev/null +++ b/scripts/wandb_compare_best.py @@ -0,0 +1,333 @@ +from __future__ import annotations + +import argparse +import json +import os +import shlex +import subprocess +import sys +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + + +def _truthy(value: Any) -> bool: + if isinstance(value, bool): + return value + if value is None: + return False + return str(value).strip().lower() in {"1", "true", "yes", "on"} + + +def _as_float(value: Any, default: float) -> float: + try: + return float(value) + except (TypeError, ValueError): + return float(default) + + +def _as_int(value: Any, default: int) -> int: + try: + return int(float(value)) + except (TypeError, ValueError): + return int(default) + + +def _normalize_sweep_id( + raw: str, entity: str, project: str +) -> tuple[str, str, str, str]: + sweep_raw = str(raw).strip() + if not sweep_raw: + raise ValueError("--sweep-id is required") + parts = [piece.strip() for piece in sweep_raw.split("/") if piece.strip()] + if len(parts) == 3: + return f"{parts[0]}/{parts[1]}/{parts[2]}", parts[0], parts[1], parts[2] + if len(parts) == 2: + if not entity.strip(): + raise ValueError("--entity is required when --sweep-id is '/'") + return f"{entity}/{parts[0]}/{parts[1]}", entity, parts[0], parts[1] + if len(parts) == 1: + if not entity.strip() or not project.strip(): + raise ValueError( + "--entity and --project are required when --sweep-id is ''" + ) + return f"{entity}/{project}/{parts[0]}", entity, project, parts[0] + raise ValueError(f"invalid --sweep-id value: '{raw}'") + + +def _pick_best_defended_run( + sweep: Any, + metric: str, + *, + min_margin: float, + min_coi: float, +) -> tuple[Any, float]: + ranked: 
list[tuple[float, Any]] = [] + for run in list(sweep.runs): + if str(getattr(run, "state", "")).lower() != "finished": + continue + cfg = dict(getattr(run, "config", {}) or {}) + is_baseline = ( + _truthy(cfg.get("baseline_mode")) + if "baseline_mode" in cfg + else _truthy(cfg.get("no_robust")) + ) + if is_baseline: + continue + summary = dict(getattr(run, "summary", {}) or {}) + margin = _as_float(summary.get("eval/margin_mean"), -1.0) + coi_level = _as_float(summary.get("eval/coi_level_mean"), -1.0) + if margin < float(min_margin): + continue + if coi_level < float(min_coi): + continue + score = summary.get(metric) + if score is None and str(metric) == "eval/stress_revenue_worst": + score = summary.get("eval/robust_revenue_worst") + if score is None: + continue + try: + ranked.append((float(score), run)) + except (TypeError, ValueError): + continue + if not ranked: + raise RuntimeError( + f"no finished defended runs found with summary metric '{metric}' and constraints " + f"margin>={min_margin}, coi>={min_coi}" + ) + ranked.sort(key=lambda item: item[0], reverse=True) + return ranked[0][1], ranked[0][0] + + +def _format_alpha_values(raw: str, fallback_alpha: float) -> str: + cleaned = str(raw).strip() + if cleaned: + return cleaned + return f"{float(fallback_alpha):.6g}" + + +def _benchmark_tokens( + *, + project: str, + cfg: dict[str, Any], + alpha_values: str, + episodes: int, +) -> list[str]: + algo = str(cfg.get("algo", "")).strip().lower() + if algo not in {"qtable", "ppo", "a2c", "dqn"}: + raise ValueError(f"unsupported algo in best run: '{algo}'") + + total_timesteps = _as_int(cfg.get("total_timesteps"), 80_000) + max_steps = _as_int(cfg.get("max_steps"), 100) + ambiguity_radius = _as_float( + cfg.get("ambiguity_radius", cfg.get("robust_radius")), 0.2 + ) + ambiguity_points = _as_int(cfg.get("ambiguity_points", cfg.get("robust_points")), 7) + ambiguity_rollouts = _as_int( + cfg.get("ambiguity_rollouts", cfg.get("robust_rollouts")), 1 + ) + lambda_coi = 
_as_float(cfg.get("lambda_coi"), 0.2) + eta_ux = _as_float(cfg.get("eta_ux"), 0.5) + reward_profit_weight = _as_float(cfg.get("reward_profit_weight"), 1.0) + learning_rate = _as_float(cfg.get("learning_rate"), 3e-4) + batch_size = _as_int(cfg.get("batch_size"), 256) + n_steps = _as_int(cfg.get("n_steps"), 2048) + sessions = _as_int(cfg.get("N"), 100) + action_levels = _as_int(cfg.get("action_levels"), 9) + margin_floor = _as_float(cfg.get("margin_floor"), 0.85) + seed = _as_int(cfg.get("seed"), 42) + + return [ + "--project", + project, + "--tiers", + algo, + "--alpha-values", + alpha_values, + "--episodes", + str(int(episodes)), + "--seed", + str(seed), + "--total-timesteps", + str(total_timesteps), + "--max-steps", + str(max_steps), + "--robust-radius", + str(ambiguity_radius), + "--robust-points", + str(ambiguity_points), + "--robust-rollouts", + str(ambiguity_rollouts), + "--lambda-coi", + str(lambda_coi), + "--eta-ux", + str(eta_ux), + "--reward-profit-weight", + str(reward_profit_weight), + "--learning-rate", + str(learning_rate), + "--batch-size", + str(batch_size), + "--n-steps", + str(n_steps), + "--N", + str(sessions), + "--action-levels", + str(action_levels), + "--margin-floor", + str(margin_floor), + "--device", + "cpu", + ] + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Find best defended sweep run and prepare defended-vs-baseline benchmark" + ) + parser.add_argument("--sweep-id", required=True) + parser.add_argument("--entity", default="") + parser.add_argument("--project", default="") + parser.add_argument("--metric", default="eval/stress_revenue_worst") + parser.add_argument("--min-margin", type=float, default=0.90) + parser.add_argument("--min-coi", type=float, default=120.0) + parser.add_argument("--alpha-values", default="") + parser.add_argument("--episodes", type=int, default=15) + parser.add_argument("--num-nodes", type=int, default=4) + parser.add_argument("--tpu-per-task", type=float, default=0.0) + 
parser.add_argument("--inner-workers", type=int, default=12) + parser.add_argument("--inner-threads", type=int, default=1) + parser.add_argument("--max-heavy-workers", type=int, default=3) + parser.add_argument("--worker-cpus", type=int, default=24) + parser.add_argument( + "--output-root", default="engine/studies/results/overnight/best_compare" + ) + parser.add_argument("--timeout", type=int, default=120) + parser.add_argument("--submit", action="store_true") + parser.add_argument("--ray-no-wait", action="store_true") + parser.add_argument("--submission-id", default="") + parser.add_argument("--output-json", default="") + args = parser.parse_args() + + root = Path(__file__).resolve().parents[1] + cwd = str(Path.cwd()) + sys.path = [p for p in sys.path if p not in {"", cwd}] + + try: + import wandb + except ImportError as exc: + raise ImportError("wandb is required") from exc + + full_sweep_id, entity, project, _ = _normalize_sweep_id( + raw=str(args.sweep_id), + entity=str(args.entity).strip(), + project=str(args.project).strip(), + ) + api = wandb.Api(timeout=int(args.timeout)) + sweep = api.sweep(full_sweep_id) + best_run, best_score = _pick_best_defended_run( + sweep, + str(args.metric), + min_margin=float(args.min_margin), + min_coi=float(args.min_coi), + ) + + best_cfg = dict(getattr(best_run, "config", {}) or {}) + best_alpha = _as_float( + best_cfg.get( + "alpha", + getattr(best_run, "summary", {}).get("study/alpha", 0.6), + ), + 0.6, + ) + alpha_values = _format_alpha_values( + str(args.alpha_values), fallback_alpha=best_alpha + ) + benchmark_tokens = _benchmark_tokens( + project=project, + cfg=best_cfg, + alpha_values=alpha_values, + episodes=int(args.episodes), + ) + benchmark_args = shlex.join(benchmark_tokens) + + submission_id = str(args.submission_id).strip() + if not submission_id: + stamp = datetime.now(timezone.utc).strftime("%m%d-%H%M") + submission_id = f"best-compare-{stamp}" + + env_overrides = { + "RAY_MODE": "benchmark", + "COMPARE_ROBUST": 
"1", + "NUM_NODES": str(int(args.num_nodes)), + "TPU_PER_TASK": str(float(args.tpu_per_task)), + "PHANTOM_JAX_PLATFORM": "cpu", + "WANDB_ENTITY": entity, + "WANDB_PROJECT": project, + "BENCHMARK_ARGS": benchmark_args, + "INNER_WORKERS": str(int(args.inner_workers)), + "INNER_THREADS": str(int(args.inner_threads)), + "MAX_HEAVY_WORKERS": str(int(args.max_heavy_workers)), + "WORKER_CPUS": str(int(args.worker_cpus)), + "OUTPUT_ROOT": str(args.output_root), + "SUBMISSION_ID": submission_id, + } + if bool(args.ray_no_wait): + env_overrides["RAY_NO_WAIT"] = "1" + + command_str = ( + "cd " + + shlex.quote(str(root)) + + " && " + + " ".join( + f"{key}={shlex.quote(str(value))}" for key, value in env_overrides.items() + ) + + " bash ./submit_ray_job.sh" + ) + + payload = { + "sweep_id": full_sweep_id, + "selection_metric": str(args.metric), + "constraints": { + "min_margin": float(args.min_margin), + "min_coi": float(args.min_coi), + }, + "best_run": { + "id": str(getattr(best_run, "id", "")), + "name": str(getattr(best_run, "name", "")), + "url": str(getattr(best_run, "url", "")), + "score": float(best_score), + "algo": str(best_cfg.get("algo", "")), + "alpha": float(best_alpha), + "eval_margin_mean": _as_float( + getattr(best_run, "summary", {}).get("eval/margin_mean"), 0.0 + ), + "eval_coi_level_mean": _as_float( + getattr(best_run, "summary", {}).get("eval/coi_level_mean"), 0.0 + ), + }, + "benchmark_compare_command": command_str, + } + print(json.dumps(payload, indent=2)) + + output_json = str(args.output_json).strip() + if output_json: + out_path = Path(output_json) + if not out_path.is_absolute(): + out_path = root / out_path + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(json.dumps(payload, indent=2) + "\n") + + if bool(args.submit): + run_env = dict(os.environ) + run_env.update({key: str(value) for key, value in env_overrides.items()}) + subprocess.run( + ["bash", "./submit_ray_job.sh"], + cwd=str(root), + env=run_env, + check=True, + ) 
+ + +if __name__ == "__main__": + main() diff --git a/scripts/wandb_create_sweep.py b/scripts/wandb_create_sweep.py new file mode 100644 index 0000000..e44354a --- /dev/null +++ b/scripts/wandb_create_sweep.py @@ -0,0 +1,313 @@ +from __future__ import annotations + +import argparse +import contextlib +import io +import json +import sys +from pathlib import Path +from typing import Any + + +def _base_sweep(method: str, metric_name: str) -> dict[str, Any]: + return { + "method": str(method), + "metric": {"name": str(metric_name), "goal": "maximize"}, + } + + +def _benchmark_sweep(method: str) -> dict[str, Any]: + cfg = _base_sweep(method=method, metric_name="objective/score") + cfg["name"] = "benchmark-all-algos-defense" + cfg["parameters"] = { + "tiers": { + "values": [ + "static", + "surge", + "linear", + "qtable", + "ppo", + "a2c", + "dqn", + ] + }, + "alpha_values": {"values": ["0.0", "0.1", "0.25", "0.4", "0.6", "0.8"]}, + "baseline_mode": {"values": [False, True]}, + "seed": {"values": [42, 1337, 2026, 7777]}, + "episodes": {"values": [8, 12]}, + "total_timesteps": {"values": [15000, 30000, 50000]}, + "lambda_coi": {"values": [0.1, 0.2, 0.4]}, + "ambiguity_radius": {"values": [0.1, 0.2, 0.3]}, + "ambiguity_points": {"values": [5, 7]}, + "ambiguity_rollouts": {"values": [1, 2]}, + "eta_ux": {"values": [0.25, 0.5, 0.75]}, + "reward_profit_weight": {"values": [0.75, 1.0, 1.25]}, + "learning_rate": {"values": [1e-4, 3e-4, 1e-3]}, + "batch_size": {"values": [128, 256, 512]}, + "n_steps": {"values": [1024, 2048, 4096]}, + "device": {"value": "cpu"}, + } + return cfg + + +def _train_sweep(method: str) -> dict[str, Any]: + cfg = _base_sweep(method=method, metric_name="objective/score") + cfg["name"] = "train-all-algos-defense" + cfg["parameters"] = { + "algo": {"values": ["qtable", "ppo", "a2c", "dqn"]}, + "alpha": {"values": [0.0, 0.1, 0.25, 0.4, 0.6]}, + "baseline_mode": {"values": [False, True]}, + "seed": {"values": [42, 1337, 2026, 7777]}, + "total_timesteps": 
{"values": [30000, 50000, 80000]}, + "learning_rate": {"values": [1e-4, 3e-4, 1e-3]}, + "batch_size": {"values": [128, 256, 512]}, + "n_steps": {"values": [1024, 2048, 4096]}, + "lambda_coi": {"values": [0.1, 0.2, 0.4]}, + "ambiguity_radius": {"values": [0.1, 0.2, 0.3]}, + "ambiguity_points": {"values": [3, 5, 7]}, + "ambiguity_rollouts": {"values": [1, 2]}, + "eta_ux": {"values": [0.25, 0.5, 0.75]}, + "reward_profit_weight": {"values": [0.75, 1.0, 1.25]}, + "N": {"values": [80, 100, 140]}, + "max_steps": {"values": [80, 100, 120]}, + "action_levels": {"values": [7, 9, 11]}, + "device": {"value": "cpu"}, + } + return cfg + + +def _train_robust_revenue_sweep(method: str) -> dict[str, Any]: + cfg = _base_sweep(method=method, metric_name="eval/stress_revenue_worst") + cfg["name"] = "train-defense-revenue-search" + cfg["parameters"] = { + "algo": {"values": ["qtable", "ppo", "a2c", "dqn"]}, + "alpha": {"values": [0.4, 0.6, 0.8]}, + "baseline_mode": {"value": False}, + "seed": {"values": [42, 1337, 2026, 7777]}, + "total_timesteps": {"values": [60_000, 80_000, 120_000]}, + "learning_rate": {"values": [1e-4, 3e-4, 1e-3]}, + "batch_size": {"values": [128, 256, 512]}, + "n_steps": {"values": [1024, 2048, 4096]}, + "lambda_coi": {"values": [0.2, 0.4, 0.6]}, + "ambiguity_radius": {"values": [0.1, 0.2, 0.3]}, + "ambiguity_points": {"values": [5, 7, 9]}, + "ambiguity_rollouts": {"values": [1, 2]}, + "eta_ux": {"values": [0.25, 0.5, 0.75]}, + "reward_profit_weight": {"values": [1.0, 1.25]}, + "N": {"values": [80, 100, 140]}, + "max_steps": {"values": [80, 100, 120]}, + "action_levels": {"values": [7, 9, 11]}, + "margin_floor": {"value": 0.85}, + "device": {"value": "cpu"}, + } + return cfg + + +def _ppo_calibration_sweep(method: str) -> dict[str, Any]: + cfg = _base_sweep(method=method, metric_name="objective/score") + cfg["name"] = "benchmark-ppo-calibration" + cfg["parameters"] = { + "tiers": {"value": "ppo"}, + "alpha_values": {"values": ["0.0", "0.1", "0.25", "0.4", "0.6", 
"0.8"]}, + "baseline_mode": {"values": [False, True]}, + "seed": {"values": [42, 1337, 2026, 7777]}, + "episodes": {"value": 12}, + "total_timesteps": {"value": 60000}, + "lambda_coi": { + "distribution": "uniform", + "min": 0.05, + "max": 0.6, + }, + "ambiguity_radius": { + "distribution": "uniform", + "min": 0.05, + "max": 0.45, + }, + "ambiguity_points": {"value": 7}, + "ambiguity_rollouts": {"value": 1}, + "eta_ux": {"value": 0.5}, + "reward_profit_weight": {"value": 1.0}, + "learning_rate": { + "distribution": "log_uniform_values", + "min": 1e-4, + "max": 1e-3, + }, + "batch_size": {"values": [128, 256, 512]}, + "n_steps": {"values": [1024, 2048, 4096]}, + "device": {"value": "cpu"}, + } + return cfg + + +def _ppo_block_a_sweep(method: str) -> dict[str, Any]: + cfg = _base_sweep(method=method, metric_name="objective/score") + cfg["name"] = "benchmark-ppo-block-a-calibration" + cfg["parameters"] = { + "tiers": {"value": "ppo"}, + "alpha_values": {"value": "0.25,0.6,0.8"}, + "seed": {"values": [42, 1337, 2026]}, + "episodes": {"value": 12}, + "total_timesteps": {"value": 80000}, + "lambda_coi": {"values": [0.05, 0.1, 0.2]}, + "ambiguity_radius": {"values": [0.05, 0.1, 0.2]}, + "ambiguity_points": {"value": 7}, + "ambiguity_rollouts": {"value": 1}, + "eta_ux": {"value": 0.5}, + "reward_profit_weight": {"value": 1.0}, + "learning_rate": {"value": 3e-4}, + "batch_size": {"value": 256}, + "n_steps": {"value": 2048}, + "device": {"value": "cpu"}, + } + return cfg + + +def _ppo_shift_screen_sweep(method: str) -> dict[str, Any]: + cfg = _base_sweep(method=method, metric_name="objective/score") + cfg["name"] = "benchmark-ppo-shift-screen" + cfg["parameters"] = { + "tiers": {"value": "ppo"}, + "alpha_values": {"value": "0.25"}, + "eval_alpha_values": {"value": "0.6,0.8"}, + "seed": {"values": [42, 1337, 2026]}, + "episodes": {"value": 20}, + "total_timesteps": {"value": 80000}, + "lambda_coi": {"values": [0.0, 0.02, 0.05, 0.1]}, + "ambiguity_radius": {"values": [0.0, 
0.02, 0.05, 0.1]}, + "ambiguity_points": {"value": 5}, + "ambiguity_rollouts": {"value": 1}, + "eta_ux": {"value": 0.0}, + "reward_profit_weight": {"value": 1.0}, + "learning_rate": {"value": 3e-4}, + "batch_size": {"value": 256}, + "n_steps": {"value": 2048}, + "device": {"value": "cpu"}, + } + return cfg + + +def _ppo_rl_study_sweep(method: str) -> dict[str, Any]: + cfg = _base_sweep(method=method, metric_name="eval/stress_revenue_worst") + cfg["name"] = "train-ppo-standard-vs-defended-equilibrium" + cfg["parameters"] = { + "algo": {"value": "ppo"}, + "seed": {"values": [42, 1337, 7777]}, + "alpha": {"values": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}, + "n_products": {"values": [5, 25, 50, 100]}, + "N": {"value": 100}, + "no_robust": {"values": [False, True]}, + "lambda_coi": {"values": [0.05, 0.15, 0.3]}, + "ambiguity_radius": {"values": [0.1, 0.2, 0.3]}, + "ambiguity_points": {"value": 7}, + "ambiguity_rollouts": {"value": 1}, + "eta_ux": {"value": 0.0}, + "reward_profit_weight": {"value": 1.0}, + "total_timesteps": {"value": 100000}, + "eval_episodes": {"value": 10}, + "eval_freq": {"value": 1000}, + "log_freq": {"value": 100}, + "hist_freq": {"value": 500}, + "learning_rate": {"value": 3e-4}, + "batch_size": {"value": 256}, + "n_steps": {"value": 2048}, + "device": {"value": "cpu"}, + } + return cfg + + +def main() -> None: + parser = argparse.ArgumentParser(description="Create W&B sweep for PHANTOM") + parser.add_argument( + "--kind", + choices=[ + "benchmark", + "train", + "ppo_calibration", + "ppo_block_a", + "ppo_shift_screen", + "ppo_rl_study", + ], + default="benchmark", + ) + parser.add_argument( + "--profile", + choices=["default", "robust_revenue"], + default="default", + ) + parser.add_argument("--project", required=True) + parser.add_argument("--entity", default="") + parser.add_argument( + "--method", choices=["random", "bayes", "grid"], default="random" + ) + parser.add_argument("--run-cap", type=int, default=0) + 
parser.add_argument("--json", action="store_true") + parser.add_argument("--full-id", action="store_true") + args = parser.parse_args() + + cwd = str(Path.cwd()) + sys.path = [p for p in sys.path if p not in {"", cwd}] + + try: + import wandb + except ImportError as exc: + raise ImportError("wandb is required to create sweeps") from exc + + if str(args.kind) == "benchmark": + if str(args.profile) != "default": + raise ValueError("benchmark sweeps only support --profile default") + sweep_cfg = _benchmark_sweep(args.method) + elif str(args.kind) == "train": + if str(args.profile) == "robust_revenue": + sweep_cfg = _train_robust_revenue_sweep(args.method) + else: + sweep_cfg = _train_sweep(args.method) + elif str(args.kind) == "ppo_calibration": + if str(args.profile) != "default": + raise ValueError("ppo_calibration sweeps only support --profile default") + sweep_cfg = _ppo_calibration_sweep(args.method) + elif str(args.kind) == "ppo_block_a": + if str(args.profile) != "default": + raise ValueError("ppo_block_a sweeps only support --profile default") + sweep_cfg = _ppo_block_a_sweep(args.method) + elif str(args.kind) == "ppo_shift_screen": + if str(args.profile) != "default": + raise ValueError("ppo_shift_screen sweeps only support --profile default") + sweep_cfg = _ppo_shift_screen_sweep(args.method) + else: + if str(args.profile) != "default": + raise ValueError("ppo_rl_study sweeps only support --profile default") + sweep_cfg = _ppo_rl_study_sweep(args.method) + if int(args.run_cap) > 0: + sweep_cfg["run_cap"] = int(args.run_cap) + + with contextlib.redirect_stdout(io.StringIO()): + sweep_id = wandb.sweep( + sweep=sweep_cfg, + project=str(args.project), + entity=str(args.entity) if str(args.entity).strip() else None, + ) + full_id = ( + f"{args.entity}/{args.project}/{sweep_id}" + if str(args.entity).strip() + else f"{args.project}/{sweep_id}" + ) + + if bool(args.json): + print( + json.dumps( + { + "kind": str(args.kind), + "profile": str(args.profile), + 
"project": str(args.project), + "entity": str(args.entity), + "sweep_id": str(sweep_id), + "full_id": str(full_id), + } + ) + ) + return + print(full_id if bool(args.full_id) else sweep_id) + + +if __name__ == "__main__": + main() diff --git a/scripts/whoclicked_card.py b/scripts/whoclicked_card.py new file mode 100644 index 0000000..f6829d7 --- /dev/null +++ b/scripts/whoclicked_card.py @@ -0,0 +1,439 @@ +#!/usr/bin/env python3 +"""Build and upload a Hugging Face dataset card for whoclickedit.""" + +from __future__ import annotations + +import argparse +import os +import sys +from pathlib import Path +from typing import Any +from urllib.parse import quote + +import pandas as pd +from huggingface_hub import HfApi + + +PROJECT_ROOT = Path(__file__).resolve().parent.parent +DEFAULT_INPUT = PROJECT_ROOT / "experiments" / "exports" / "whoclicked.csv" +DEFAULT_OUTPUT = PROJECT_ROOT / "experiments" / "exports" / "whoclicked_dataset_card.md" +DEFAULT_REPO = os.getenv("HF_WHOCLICKED_REPO", "velocitatem/whoclickedit") + + +def _token() -> str | None: + return os.getenv("HF_TOKEN") or None + + +def _exception_details(exc: Exception) -> str: + parts = [str(exc).strip()] + response = getattr(exc, "response", None) + if response is not None: + status = getattr(response, "status_code", None) + if status is not None: + parts.append(f"HTTP {status}") + text = getattr(response, "text", "") + if text: + parts.append(text.strip()[:500]) + return " | ".join(p for p in parts if p) + + +def _size_category(n_rows: int) -> str: + if n_rows < 1_000: + return "n<1K" + if n_rows < 10_000: + return "1K dict[str, int]: + if col not in df.columns: + return {} + vc = df[col].fillna("").astype(str).value_counts(dropna=False) + return {k: int(v) for k, v in vc.items()} + + +def _group_count(df: pd.DataFrame, left: str, right: str) -> dict[tuple[str, str], int]: + if left not in df.columns or right not in df.columns: + return {} + grouped = ( + df.groupby([left, right], dropna=False) + .size() + 
.reset_index(name="count") + .sort_values([left, right]) + ) + out: dict[tuple[str, str], int] = {} + for _, row in grouped.iterrows(): + out[(str(row[left]), str(row[right]))] = int(row["count"]) + return out + + +def _session_count_by_actor(df: pd.DataFrame) -> dict[str, int]: + if "actor_type" not in df.columns or "sessionId" not in df.columns: + return {} + grouped = ( + df[["actor_type", "sessionId"]] + .dropna(subset=["sessionId"]) + .drop_duplicates() + .groupby("actor_type") + .size() + ) + return {str(k): int(v) for k, v in grouped.items()} + + +def _time_range(df: pd.DataFrame) -> tuple[str, str]: + if "ts" not in df.columns: + return "unknown", "unknown" + ts = pd.to_datetime(df["ts"], errors="coerce", utc=True) + ts = ts.dropna() + if ts.empty: + return "unknown", "unknown" + return ts.min().isoformat(), ts.max().isoformat() + + +def _badge(label: str, value: str, color: str, logo: str | None = None) -> str: + encoded_label = quote(label, safe="") + encoded_value = quote(value, safe="") + base = ( + "https://img.shields.io/badge/" + f"{encoded_label}-{encoded_value}-{color}?style=flat-square" + ) + if logo: + base = f"{base}&logo={quote(logo, safe='')}&logoColor=white" + return f"![{label}]({base})" + + +def _md_table(headers: list[str], rows: list[list[str]]) -> str: + header = f"| {' | '.join(headers)} |" + separator = f"| {' | '.join('---' for _ in headers)} |" + if not rows: + empty = f"| {' | '.join('n/a' for _ in headers)} |" + return "\n".join([header, separator, empty]) + body = "\n".join(f"| {' | '.join(row)} |" for row in rows) + return "\n".join([header, separator, body]) + + +def _render_card(df: pd.DataFrame) -> str: + total_rows = len(df) + total_cols = len(df.columns) + size_cat = _size_category(total_rows) + + actor_counts = _series_count(df, "actor_type") + record_counts = _series_count(df, "record_type") + by_actor_record = _group_count(df, "actor_type", "record_type") + store_counts = _series_count(df, "storeMode") + session_counts = 
_session_count_by_actor(df) + t_min, t_max = _time_range(df) + + event_counts: dict[str, int] = {} + if "record_type" in df.columns and "eventName" in df.columns: + interactions = df[df["record_type"] == "interaction"] + event_counts = _series_count(interactions, "eventName") + + metadata_cols = sorted(c for c in df.columns if c.startswith("metadata_")) + + total_sessions = sum(session_counts.values()) + human_rows = actor_counts.get("human", 0) + agent_rows = actor_counts.get("agent", 0) + + top_events = list(event_counts.items())[:10] + + snapshot_table = _md_table( + ["Metric", "Value"], + [ + ["Rows", f"`{total_rows}`"], + ["Columns", f"`{total_cols}`"], + ["Time range (UTC)", f"`{t_min}` -> `{t_max}`"], + ["Unique sessions", f"`{total_sessions}`"], + ], + ) + + actor_table = _md_table( + ["Actor", "Rows", "Share"], + [ + [ + "`human`", + str(human_rows), + f"{(human_rows / total_rows * 100):.1f}%" if total_rows else "0.0%", + ], + [ + "`agent`", + str(agent_rows), + f"{(agent_rows / total_rows * 100):.1f}%" if total_rows else "0.0%", + ], + ], + ) + + pair_table = _md_table( + ["Actor", "Record type", "Rows"], + [ + [f"`{actor}`", f"`{record}`", str(n)] + for (actor, record), n in sorted( + by_actor_record.items(), key=lambda x: (x[0][0], x[0][1]) + ) + ], + ) + + store_table = _md_table( + ["Store mode", "Rows"], + [ + [f"`{mode}`", str(n)] + for mode, n in sorted( + store_counts.items(), key=lambda x: x[1], reverse=True + ) + ], + ) + + event_table = _md_table( + ["Interaction event", "Count"], + [[f"`{name}`", str(n)] for name, n in top_events], + ) + + metadata_lines = "\n".join(f"- `{c}`" for c in metadata_cols) or "- none" + + dataset_badge = ( + "[![Dataset on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/" + "dataset-on-hf-sm.svg)](https://huggingface.co/datasets/velocitatem/whoclickedit)" + ) + rows_badge = _badge("Rows", str(total_rows), "0A9396") + cols_badge = _badge("Columns", str(total_cols), "005F73") + sessions_badge = 
_badge("Sessions", str(total_sessions), "1D3557") + human_badge = _badge("Human rows", str(human_rows), "2A9D8F") + agent_badge = _badge("Agent rows", str(agent_rows), "E76F51") + license_badge = _badge("License", "MIT", "111827") + + return f"""--- +pretty_name: whoclickedit +license: mit +language: +- en +task_categories: +- tabular-classification +task_ids: +- tabular-multi-class-classification +tags: +- e-commerce +- dynamic-pricing +- behavioral-telemetry +- human-vs-agent +- session-data +size_categories: +- {size_cat} +--- + +PHANTOM research banner + +# [whoclickedit](https://huggingface.co/datasets/velocitatem/whoclickedit) + +{dataset_badge} +{rows_badge} +{cols_badge} +{sessions_badge} +{human_badge} +{agent_badge} +{license_badge} + +> **Event-level behavior data for dynamic pricing research.** +> This dataset captures how humans and automated agents browse, query prices, and move through the PHANTOM storefronts during controlled experiments. + +## What this dataset gives you + +- A single flat file (`whoclicked.csv`) with both interaction and price-log events. +- Explicit labels for actor origin: `actor_type` and `is_agent`. +- Provenance fields from Kafka envelopes when available. +- Metadata flattened into feature-ready `metadata_*` columns. + +## Snapshot + +{snapshot_table} + +## Composition + +### Rows by actor +{actor_table} + +### Rows by actor and record type +{pair_table} + +### Store mode coverage +{store_table} + +### Top interaction events +{event_table} + +## Collection pipeline + +Data is sourced from two roots inside PHANTOM: + +- `experiments/collected_data` (human sessions) +- `experiments/agents/collected_data` (agent sessions) + +Each session directory contains: + +- `int.json`: user interaction events +- `price.json`: price quote observations + +ETL behavior: + +1. Accepts both Kafka-envelope records and flat payload records. +2. Flattens nested JSON to a tabular schema. +3. 
Preserves row-level provenance (`source_session_dir`, `source_row_index`, topic fields). +4. Adds modeling labels (`actor_type`, `is_agent`, `record_type`). + +## Schema highlights + +Core modeling fields: + +- `actor_type`, `is_agent`, `record_type` +- `sessionId`, `experimentId`, `storeMode`, `ts` +- `eventName`, `page`, `productId`, `price`, `userAgent` + +Kafka provenance fields: + +- `kafka_partition_id`, `kafka_offset`, `kafka_timestamp_ms`, `kafka_compression` +- `kafka_is_transactional`, `kafka_headers`, `kafka_key_*`, `kafka_value_*` + +
+Metadata columns in this release + +{metadata_lines} + +
+ +## Quick start + +```python +from datasets import load_dataset + +ds = load_dataset("velocitatem/whoclickedit") +``` + +Recommended split strategy: + +- Prefer session-aware or time-aware splits. +- Do not split rows from the same `sessionId` across train and test. + +## Intended use + +- Human-vs-agent behavior classification. +- Session-level telemetry modeling for dynamic pricing defenses. +- Robustness experiments under agent-mediated reconnaissance. + +## Safety and limitations + +- `userAgent` and referrer metadata can be quasi-identifying in very small samples. +- Data comes from a controlled research platform, not a full production marketplace. +- Current release has stronger coverage for `hotel` flows than `airline` flows. + +## Citation + +If you use this dataset, cite the PHANTOM thesis project and link this page: +`https://huggingface.co/datasets/velocitatem/whoclickedit` +""" + + +def build_card(input_csv: Path, output_md: Path) -> None: + if not input_csv.exists(): + raise FileNotFoundError(f"Input CSV not found: {input_csv}") + df = pd.read_csv(input_csv) + card = _render_card(df) + output_md.parent.mkdir(parents=True, exist_ok=True) + output_md.write_text(card) + print(f"wrote dataset card to {output_md}") + + +def upload_card( + card_path: Path, repo_id: str, path_in_repo: str, commit_message: str +) -> None: + if not card_path.exists(): + raise FileNotFoundError(f"Card file not found: {card_path}") + + api = HfApi(token=_token()) + try: + me = api.whoami(token=_token()) + except Exception as exc: + detail = _exception_details(exc) + raise RuntimeError(f"Hugging Face auth failed. Details: {detail}") from exc + + user_name = me.get("name") or me.get("fullname") or "unknown" + print(f"authenticated to HF as: {user_name}") + + try: + api.repo_info(repo_id=repo_id, repo_type="dataset") + except Exception as exc: + detail = _exception_details(exc) + raise RuntimeError( + f"Dataset repo '{repo_id}' is not accessible. 
Details: {detail}" + ) from exc + + try: + commit = api.upload_file( + path_or_fileobj=str(card_path), + path_in_repo=path_in_repo, + repo_id=repo_id, + repo_type="dataset", + commit_message=commit_message, + ) + except Exception as exc: + detail = _exception_details(exc) + raise RuntimeError( + f"Card upload failed for '{repo_id}'. Details: {detail}" + ) from exc + + print(f"uploaded dataset card to https://huggingface.co/datasets/{repo_id}") + print(f"commit: {commit}") + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Build or upload whoclickedit dataset card" + ) + sub = parser.add_subparsers(dest="command", required=True) + + build = sub.add_parser("build", help="build card markdown from CSV") + build.add_argument("--input", type=Path, default=DEFAULT_INPUT) + build.add_argument("--output", type=Path, default=DEFAULT_OUTPUT) + + upload = sub.add_parser("upload", help="upload existing card as dataset README.md") + upload.add_argument("--input", type=Path, default=DEFAULT_OUTPUT) + upload.add_argument("--repo", default=DEFAULT_REPO) + upload.add_argument("--path-in-repo", default="README.md") + upload.add_argument("--message", default="Add dataset card for whoclickedit") + + both = sub.add_parser("build-upload", help="build card and upload to dataset repo") + both.add_argument("--csv", type=Path, default=DEFAULT_INPUT) + both.add_argument("--card", type=Path, default=DEFAULT_OUTPUT) + both.add_argument("--repo", default=DEFAULT_REPO) + both.add_argument("--path-in-repo", default="README.md") + both.add_argument("--message", default="Add dataset card for whoclickedit") + + return parser.parse_args() + + +def main() -> int: + args = _parse_args() + try: + if args.command == "build": + build_card(args.input, args.output) + return 0 + + if args.command == "upload": + upload_card(args.input, args.repo, args.path_in_repo, args.message) + return 0 + + if args.command == "build-upload": + build_card(args.csv, args.card) + 
upload_card(args.card, args.repo, args.path_in_repo, args.message) + return 0 + + raise ValueError(f"Unknown command: {args.command}") + except Exception as exc: + print(f"error: {exc}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/whoclicked_etl.py b/scripts/whoclicked_etl.py new file mode 100644 index 0000000..105f15a --- /dev/null +++ b/scripts/whoclicked_etl.py @@ -0,0 +1,412 @@ +#!/usr/bin/env python3 +"""Build and upload a flattened who-clicked dataset from local collected_data.""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +from pathlib import Path +from typing import Any + +import pandas as pd +from huggingface_hub import HfApi + + +PROJECT_ROOT = Path(__file__).resolve().parent.parent +DEFAULT_HUMAN_DIR = PROJECT_ROOT / "experiments" / "collected_data" +DEFAULT_AGENT_DIR = PROJECT_ROOT / "experiments" / "agents" / "collected_data" +DEFAULT_OUTPUT = PROJECT_ROOT / "experiments" / "exports" / "whoclicked.csv" +DEFAULT_REPO = os.getenv("HF_WHOCLICKED_REPO", "velocitatem/whoclickedit") + +BASE_COLUMNS = [ + "actor_type", + "is_agent", + "record_type", + "topic", + "source_session_dir", + "source_file", + "source_row_index", + "ingest_format", + "sessionId", + "experimentId", + "storeMode", + "ts", + "eventName", + "page", + "productId", + "price", + "userAgent", + "kafka_partition_id", + "kafka_offset", + "kafka_timestamp_ms", + "kafka_compression", + "kafka_is_transactional", + "kafka_headers", + "kafka_key_payload", + "kafka_key_encoding", + "kafka_key_schema_id", + "kafka_value_encoding", + "kafka_value_schema_id", + "kafka_value_size", +] + + +def _token() -> str | None: + return os.getenv("HF_TOKEN") or None + + +def _exception_details(exc: Exception) -> str: + parts = [str(exc).strip()] + response = getattr(exc, "response", None) + if response is not None: + status = getattr(response, "status_code", None) + if status is not None: + 
parts.append(f"HTTP {status}") + text = getattr(response, "text", "") + if text: + text = text.strip() + if text: + parts.append(text[:500]) + return " | ".join(p for p in parts if p) + + +def _flatten_dict(data: dict[str, Any], prefix: str = "") -> dict[str, Any]: + flat: dict[str, Any] = {} + for key, value in data.items(): + normalized_key = str(key).strip().replace(" ", "_") + next_key = f"{prefix}_{normalized_key}" if prefix else normalized_key + if isinstance(value, dict): + flat.update(_flatten_dict(value, next_key)) + else: + flat[next_key] = value + return flat + + +def _as_scalar(value: Any) -> Any: + if isinstance(value, (dict, list, tuple)): + return json.dumps(value, ensure_ascii=True, sort_keys=True) + return value + + +def _empty_envelope() -> dict[str, Any]: + return { + "kafka_partition_id": None, + "kafka_offset": None, + "kafka_timestamp_ms": None, + "kafka_compression": None, + "kafka_is_transactional": None, + "kafka_headers": None, + "kafka_key_payload": None, + "kafka_key_encoding": None, + "kafka_key_schema_id": None, + "kafka_value_encoding": None, + "kafka_value_schema_id": None, + "kafka_value_size": None, + } + + +def _extract_payload_and_envelope( + record: Any, +) -> tuple[dict[str, Any], dict[str, Any], str]: + if ( + isinstance(record, dict) + and isinstance(record.get("value"), dict) + and isinstance(record["value"].get("payload"), dict) + ): + key = record.get("key") if isinstance(record.get("key"), dict) else {} + value = record["value"] + envelope = { + "kafka_partition_id": record.get("partitionID"), + "kafka_offset": record.get("offset"), + "kafka_timestamp_ms": record.get("timestamp"), + "kafka_compression": record.get("compression"), + "kafka_is_transactional": record.get("isTransactional"), + "kafka_headers": _as_scalar(record.get("headers")), + "kafka_key_payload": key.get("payload"), + "kafka_key_encoding": key.get("encoding"), + "kafka_key_schema_id": key.get("schemaId"), + "kafka_value_encoding": value.get("encoding"), + 
"kafka_value_schema_id": value.get("schemaId"), + "kafka_value_size": value.get("size"), + } + return dict(value["payload"]), envelope, "kafka_envelope" + + if isinstance(record, dict): + return dict(record), _empty_envelope(), "flat_payload" + + return {}, _empty_envelope(), "unknown" + + +def _load_json_list(path: Path) -> list[Any]: + raw = json.loads(path.read_text()) + if not isinstance(raw, list): + raise ValueError(f"Expected list in {path}, got {type(raw).__name__}") + return raw + + +def _normalize_file_rows( + actor_type: str, + is_agent: int, + session_dir_name: str, + source_file: str, + records: list[Any], +) -> list[dict[str, Any]]: + record_type = "interaction" if source_file == "int.json" else "price_log" + topic = "user-interactions" if record_type == "interaction" else "price-logs" + + rows: list[dict[str, Any]] = [] + for idx, raw_record in enumerate(records): + payload, envelope, ingest_format = _extract_payload_and_envelope(raw_record) + metadata = payload.pop("metadata", None) + + payload_flat = _flatten_dict(payload) + row: dict[str, Any] = { + "actor_type": actor_type, + "is_agent": is_agent, + "record_type": record_type, + "topic": topic, + "source_session_dir": session_dir_name, + "source_file": source_file, + "source_row_index": idx, + "ingest_format": ingest_format, + **envelope, + } + row.update({k: _as_scalar(v) for k, v in payload_flat.items()}) + + if isinstance(metadata, dict): + metadata_flat = _flatten_dict(metadata, "metadata") + row.update({k: _as_scalar(v) for k, v in metadata_flat.items()}) + elif metadata is not None: + row["metadata_raw"] = _as_scalar(metadata) + + rows.append(row) + + return rows + + +def _collect_rows_for_actor( + actor_type: str, is_agent: int, base_dir: Path +) -> list[dict[str, Any]]: + if not base_dir.exists(): + raise FileNotFoundError(f"Directory not found: {base_dir}") + + rows: list[dict[str, Any]] = [] + for session_dir in sorted( + (p for p in base_dir.iterdir() if p.is_dir()), key=lambda p: 
p.name + ): + for source_file in ("int.json", "price.json"): + file_path = session_dir / source_file + if not file_path.exists(): + continue + records = _load_json_list(file_path) + rows.extend( + _normalize_file_rows( + actor_type=actor_type, + is_agent=is_agent, + session_dir_name=session_dir.name, + source_file=source_file, + records=records, + ) + ) + return rows + + +def build_dataframe(human_dir: Path, agent_dir: Path) -> pd.DataFrame: + rows = [ + *_collect_rows_for_actor("human", 0, human_dir), + *_collect_rows_for_actor("agent", 1, agent_dir), + ] + if not rows: + return pd.DataFrame(columns=BASE_COLUMNS) + + df = pd.DataFrame(rows) + ordered_columns = [ + *BASE_COLUMNS, + *sorted(c for c in df.columns if c not in BASE_COLUMNS), + ] + return df[ordered_columns] + + +def _print_summary(df: pd.DataFrame, output_path: Path) -> None: + print(f"wrote {len(df)} rows and {len(df.columns)} columns to {output_path}") + if df.empty: + return + + print("rows by actor/record_type:") + grouped = ( + df.groupby(["actor_type", "record_type"], dropna=False) + .size() + .reset_index(name="count") + .sort_values(["actor_type", "record_type"]) + ) + for _, row in grouped.iterrows(): + print(f" - {row['actor_type']} / {row['record_type']}: {int(row['count'])}") + + required = ["actor_type", "is_agent", "record_type", "sessionId", "ts"] + missing = {col: int(df[col].isna().sum()) for col in required if col in df.columns} + print(f"missing in required columns: {missing}") + + +def build_csv(human_dir: Path, agent_dir: Path, output: Path) -> pd.DataFrame: + df = build_dataframe(human_dir=human_dir, agent_dir=agent_dir) + output.parent.mkdir(parents=True, exist_ok=True) + df.to_csv(output, index=False) + _print_summary(df, output) + return df + + +def _resolve_repo_id(api: HfApi, repo_id: str) -> str: + if "/" in repo_id: + return repo_id + try: + me = api.whoami(token=_token()) + username = me.get("name") + if username: + return f"{username}/{repo_id}" + except Exception: + pass 
+ return repo_id + + +def upload_csv( + input_path: Path, + repo_id: str, + path_in_repo: str, + commit_message: str, + create_if_missing: bool = False, +) -> None: + if not input_path.exists(): + raise FileNotFoundError(f"Input CSV not found: {input_path}") + + api = HfApi(token=_token()) + + try: + me = api.whoami(token=_token()) + except Exception as exc: + detail = _exception_details(exc) + hint = "Set HF_TOKEN with write access or run huggingface-cli login." + raise RuntimeError( + f"Hugging Face auth failed. {hint} Details: {detail}" + ) from exc + + user_name = me.get("name") or me.get("fullname") or "unknown" + print(f"authenticated to HF as: {user_name}") + + resolved_repo_id = _resolve_repo_id(api, repo_id) + if create_if_missing: + api.create_repo(repo_id=resolved_repo_id, repo_type="dataset", exist_ok=True) + else: + try: + api.repo_info(repo_id=resolved_repo_id, repo_type="dataset") + except Exception as exc: + detail = _exception_details(exc) + hint = ( + "Check owner/repo spelling, ensure it is a dataset repo, " + "or pass --create-if-missing." + ) + raise RuntimeError( + f"Dataset repo '{resolved_repo_id}' is not accessible. {hint} Details: {detail}" + ) from exc + + try: + commit = api.upload_file( + path_or_fileobj=str(input_path), + path_in_repo=path_in_repo, + repo_id=resolved_repo_id, + repo_type="dataset", + commit_message=commit_message, + ) + except Exception as exc: + detail = _exception_details(exc) + hint = ( + "Pass --repo /whoclickedit and ensure HF_TOKEN is set " + "(or run huggingface-cli login)." + ) + raise RuntimeError( + f"Upload failed for '{resolved_repo_id}'. 
{hint} Details: {detail}" + ) from exc + + print( + f"uploaded {input_path} to https://huggingface.co/datasets/{resolved_repo_id}" + ) + print(f"commit: {commit}") + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="ETL for whoclickedit: flatten local collected_data and upload to HF" + ) + sub = parser.add_subparsers(dest="command", required=True) + + build = sub.add_parser("build", help="build flattened CSV locally") + build.add_argument("--human-dir", type=Path, default=DEFAULT_HUMAN_DIR) + build.add_argument("--agent-dir", type=Path, default=DEFAULT_AGENT_DIR) + build.add_argument("--output", type=Path, default=DEFAULT_OUTPUT) + + upload = sub.add_parser("upload", help="upload an existing CSV to HF dataset") + upload.add_argument("--input", type=Path, default=DEFAULT_OUTPUT) + upload.add_argument("--repo", default=DEFAULT_REPO) + upload.add_argument("--path-in-repo", default="whoclicked.csv") + upload.add_argument("--message", default="Update flattened whoclickedit dataset") + upload.add_argument("--create-if-missing", action="store_true") + + build_upload = sub.add_parser( + "build-upload", help="build CSV and upload to HF dataset" + ) + build_upload.add_argument("--human-dir", type=Path, default=DEFAULT_HUMAN_DIR) + build_upload.add_argument("--agent-dir", type=Path, default=DEFAULT_AGENT_DIR) + build_upload.add_argument("--output", type=Path, default=DEFAULT_OUTPUT) + build_upload.add_argument("--repo", default=DEFAULT_REPO) + build_upload.add_argument("--path-in-repo", default="whoclicked.csv") + build_upload.add_argument( + "--message", default="Update flattened whoclickedit dataset" + ) + build_upload.add_argument("--create-if-missing", action="store_true") + + return parser.parse_args() + + +def main() -> int: + args = _parse_args() + + try: + if args.command == "build": + build_csv( + human_dir=args.human_dir, agent_dir=args.agent_dir, output=args.output + ) + return 0 + + if args.command == "upload": + 
upload_csv( + input_path=args.input, + repo_id=args.repo, + path_in_repo=args.path_in_repo, + commit_message=args.message, + create_if_missing=args.create_if_missing, + ) + return 0 + + if args.command == "build-upload": + build_csv( + human_dir=args.human_dir, agent_dir=args.agent_dir, output=args.output + ) + upload_csv( + input_path=args.output, + repo_id=args.repo, + path_in_repo=args.path_in_repo, + commit_message=args.message, + create_if_missing=args.create_if_missing, + ) + return 0 + + raise ValueError(f"Unknown command: {args.command}") + + except Exception as exc: + print(f"error: {exc}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/submit_ray_job.sh b/submit_ray_job.sh new file mode 100755 index 0000000..b4a2630 --- /dev/null +++ b/submit_ray_job.sh @@ -0,0 +1,247 @@ +#!/bin/bash +# Submits PHANTOM training to a Ray cluster with .env injection. +# Modes: +# RAY_MODE=single -> one run (default) +# RAY_MODE=distributed -> one run per TPU node (experimental) +# RAY_MODE=benchmark -> one benchmark run per TPU node (overnight) +# RAY_MODE=sweep -> distributed W&B sweep agents + +set -euo pipefail + +ROOT="/home/velocitatem/Documents/Projects/PHANTOM" +RAY_BIN="${RAY_BIN:-ray}" +if ! command -v "$RAY_BIN" >/dev/null 2>&1; then + if [ -x "$ROOT/.venv-ray/bin/ray" ]; then + RAY_BIN="$ROOT/.venv-ray/bin/ray" + else + echo "ray CLI not found. Activate .venv-ray or set RAY_BIN." >&2 + exit 1 + fi +fi + +# 1. 
Parse .env and generate the JSON payload for Ray +export RUNTIME_ENV_JSON=$(python -c ' +import json +import os +from dotenv import dotenv_values + +env = dotenv_values(".env") +# Filter out empty/None values +env_vars = {k: v for k, v in env.items() if v} +env_vars.setdefault("CLOUD_TPU_TASK_ID", os.getenv("CLOUD_TPU_TASK_ID", "0")) +for k in ( + "WANDB_ENTITY", + "WANDB_PROJECT", + "PHANTOM_BENCHMARK_COMPARE_ROBUST", + "PHANTOM_JAX_PLATFORM", + "PHANTOM_ALLOW_MULTI_NODE_TPU", + "PHANTOM_TPU_AGENT_SLOTS", +): + if os.getenv(k): + env_vars[k] = os.getenv(k) + +print(json.dumps({ + "pip": [ + "stable-baselines3>=2.2.0", + "gymnasium>=0.29.0", + "wandb", + "tensorboard", + "python-dotenv", + "pandas", + "pydantic", + "graphviz", + "huggingface_hub", + "matplotlib" + ], + "env_vars": env_vars +})) +') + +RAY_MODE="${RAY_MODE:-single}" +TRAIN_ARGS="${TRAIN_ARGS:---algo ppo --total-timesteps 1000000}" +BENCHMARK_ARGS="${BENCHMARK_ARGS:---project capstone_tpu --tiers static,surge,linear,qtable,ppo --alpha-values 0.0,0.1,0.25,0.4,0.6,0.8 --episodes 12 --total-timesteps 30000 --max-steps 100 --robust-radius 0.2 --robust-points 7 --robust-rollouts 1 --lambda-coi 0.2 --eta-ux 0.5 --reward-profit-weight 1.0 --device cpu}" +INNER_WORKERS="${INNER_WORKERS:-16}" +INNER_THREADS="${INNER_THREADS:-1}" +MAX_HEAVY_WORKERS="${MAX_HEAVY_WORKERS:-3}" +WORKER_CPUS="${WORKER_CPUS:-$((INNER_WORKERS * INNER_THREADS))}" +SWEEP_KIND="${SWEEP_KIND:-benchmark}" +SWEEP_METHOD="${SWEEP_METHOD:-random}" +SWEEP_PROFILE="${SWEEP_PROFILE:-default}" +SWEEP_RUN_CAP="${SWEEP_RUN_CAP:-0}" +AGENTS_PER_NODE="${AGENTS_PER_NODE:-16}" +AGENT_COUNT="${AGENT_COUNT:-0}" + +SUBMIT_ARGS=() +if [ "${RAY_NO_WAIT:-0}" = "1" ]; then + SUBMIT_ARGS+=(--no-wait) +fi +if [ -n "${SUBMISSION_ID:-}" ]; then + SUBMIT_ARGS+=(--submission-id "$SUBMISSION_ID") +fi + +COMMON_ARGS=( + job submit + --address http://localhost:8265 + --working-dir "$ROOT" + --runtime-env-json "$RUNTIME_ENV_JSON" + "${SUBMIT_ARGS[@]}" + -- +) + +if [ 
"$RAY_MODE" = "single" ]; then + read -r -a TRAIN_TOKENS <<< "$TRAIN_ARGS" + "$RAY_BIN" "${COMMON_ARGS[@]}" python -m engine.train "${TRAIN_TOKENS[@]}" + exit 0 +fi + +if [ "$RAY_MODE" = "distributed" ]; then + DIST_ARGS=( + python + scripts/ray_distributed_train.py + --train-args "$TRAIN_ARGS" + --num-nodes "${NUM_NODES:-4}" + --tpu-per-task "${TPU_PER_TASK:-8}" + --base-seed "${BASE_SEED:-42}" + ) + if [ "${SYNC_JAX:-0}" = "1" ]; then + DIST_ARGS+=(--sync-jax) + fi + "$RAY_BIN" "${COMMON_ARGS[@]}" "${DIST_ARGS[@]}" + exit 0 +fi + +if [ "$RAY_MODE" = "benchmark" ]; then + DIST_ARGS=( + python + scripts/ray_distributed_train.py + --run-kind benchmark + --entry-args "$BENCHMARK_ARGS" + --num-nodes "${NUM_NODES:-4}" + --tpu-per-task "${TPU_PER_TASK:-8}" + --base-seed "${BASE_SEED:-42}" + --output-root "${OUTPUT_ROOT:-engine/studies/results/overnight}" + --wandb-entity "${WANDB_ENTITY:-lusiana}" + --wandb-project "${WANDB_PROJECT:-capstone_tpu}" + --inner-workers "${INNER_WORKERS}" + --inner-threads "${INNER_THREADS}" + --max-heavy-workers "${MAX_HEAVY_WORKERS}" + --worker-cpus "${WORKER_CPUS}" + ) + if [ "${COMPARE_ROBUST:-1}" = "1" ]; then + DIST_ARGS+=(--compare-robust) + fi + "$RAY_BIN" "${COMMON_ARGS[@]}" "${DIST_ARGS[@]}" + exit 0 +fi + +if [ "$RAY_MODE" = "sweep" ]; then + SWEEP_PROJECT="${WANDB_PROJECT:-capstone_tpu}" + SWEEP_ENTITY="${WANDB_ENTITY:-lusiana}" + SWEEP_ID_VALUE="${SWEEP_ID:-}" + SWEEP_NUM_NODES="${NUM_NODES:-5}" + PY_SWEEP_BIN="${PY_SWEEP_BIN:-}" + if [ -z "$PY_SWEEP_BIN" ]; then + for cand in "$ROOT/.venv/bin/python" "$ROOT/.venv-ray/bin/python" python3 python; do + if [ "$cand" = "python3" ] || [ "$cand" = "python" ]; then + command -v "$cand" >/dev/null 2>&1 || continue + elif [ ! 
-x "$cand" ]; then + continue + fi + if "$cand" - <<'PY' >/dev/null 2>&1 +import sys +from pathlib import Path +cwd = str(Path.cwd()) +sys.path = [p for p in sys.path if p not in {'', cwd}] +import wandb +print(wandb.__name__) +PY + then + PY_SWEEP_BIN="$cand" + break + fi + done + fi + if [ -z "$PY_SWEEP_BIN" ]; then + echo "No python interpreter with wandb is available for sweep creation." >&2 + exit 1 + fi + + if [ -z "$SWEEP_ID_VALUE" ]; then + if [ -z "${WANDB_API_KEY:-}" ]; then + export WANDB_API_KEY + WANDB_API_KEY="$($PY_SWEEP_BIN - <<'PY' +from dotenv import dotenv_values +print(dotenv_values('.env').get('WANDB_API_KEY', '').strip()) +PY +)" + fi + if [ -z "${WANDB_API_KEY:-}" ]; then + echo "WANDB_API_KEY is required to create a sweep." >&2 + exit 1 + fi + SWEEP_ID_VALUE="$($PY_SWEEP_BIN "$ROOT/scripts/wandb_create_sweep.py" \ + --kind "$SWEEP_KIND" \ + --profile "$SWEEP_PROFILE" \ + --project "$SWEEP_PROJECT" \ + --entity "$SWEEP_ENTITY" \ + --method "$SWEEP_METHOD" \ + --run-cap "$SWEEP_RUN_CAP")" + fi + + SWEEP_ENTRY_ARGS="${SWEEP_ENTRY_ARGS:-}" + if [ -z "$SWEEP_ENTRY_ARGS" ]; then + SWEEP_ENTRY_ARGS="--sweep-agent --sweep-id $SWEEP_ID_VALUE --project $SWEEP_PROJECT --device cpu" + fi + + if [ "$AGENT_COUNT" = "0" ] && [ "${SWEEP_RUN_CAP:-0}" -gt 0 ]; then + TOTAL_AGENTS=$((SWEEP_NUM_NODES * AGENTS_PER_NODE)) + if [ "$TOTAL_AGENTS" -gt 0 ]; then + AGENT_COUNT=$(((SWEEP_RUN_CAP + TOTAL_AGENTS - 1) / TOTAL_AGENTS)) + echo "Derived AGENT_COUNT=$AGENT_COUNT from SWEEP_RUN_CAP=$SWEEP_RUN_CAP across $TOTAL_AGENTS agents" + fi + fi + + SWEEP_RUN_KIND="$SWEEP_KIND" + if [ "$SWEEP_KIND" = "ppo_calibration" ] || [ "$SWEEP_KIND" = "ppo_block_a" ] || [ "$SWEEP_KIND" = "ppo_shift_screen" ]; then + SWEEP_RUN_KIND="benchmark" + fi + if [ "$SWEEP_KIND" = "ppo_rl_study" ]; then + SWEEP_RUN_KIND="train" + fi + if [ "$SWEEP_RUN_KIND" != "benchmark" ] && [ "$SWEEP_RUN_KIND" != "train" ]; then + echo "Unsupported SWEEP_KIND='$SWEEP_KIND' (expected 'benchmark', 'train', 
'ppo_calibration', 'ppo_block_a', 'ppo_shift_screen', or 'ppo_rl_study')." >&2 + exit 1 + fi + + DIST_ARGS=( + python + scripts/ray_distributed_train.py + --run-kind "$SWEEP_RUN_KIND" + --entry-args "$SWEEP_ENTRY_ARGS" + --num-nodes "${SWEEP_NUM_NODES}" + --tpu-per-task "${TPU_PER_TASK:-0}" + --base-seed "${BASE_SEED:-42}" + --wandb-entity "$SWEEP_ENTITY" + --wandb-project "$SWEEP_PROJECT" + --agents-per-node "$AGENTS_PER_NODE" + --agent-count "$AGENT_COUNT" + --inner-threads "$INNER_THREADS" + --worker-cpus "${WORKER_CPUS:-$((AGENTS_PER_NODE * INNER_THREADS))}" + ) + if [ "$SWEEP_RUN_KIND" = "benchmark" ]; then + DIST_ARGS+=(--output-root "${OUTPUT_ROOT:-engine/studies/results/sweeps}") + fi + if [ "${COMPARE_ROBUST:-0}" = "1" ]; then + DIST_ARGS+=(--compare-robust) + fi + echo "SWEEP_ID=$SWEEP_ID_VALUE" + if [ "$SWEEP_KIND" = "train" ] && [ "$SWEEP_PROFILE" = "robust_revenue" ]; then + echo "When this sweep finishes, compare best robust config vs no_robust with:" + echo "python scripts/wandb_compare_best.py --entity $SWEEP_ENTITY --project $SWEEP_PROJECT --sweep-id $SWEEP_ID_VALUE --submit --ray-no-wait" + fi + "$RAY_BIN" "${COMMON_ARGS[@]}" "${DIST_ARGS[@]}" + exit 0 +fi + +echo "Unsupported RAY_MODE='$RAY_MODE' (expected 'single', 'distributed', 'benchmark', or 'sweep')." 
>&2 +exit 1 diff --git a/tpu_orchestration/bootstrap_ray.sh b/tpu_orchestration/bootstrap_ray.sh new file mode 100755 index 0000000..0de5f26 --- /dev/null +++ b/tpu_orchestration/bootstrap_ray.sh @@ -0,0 +1,280 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +DEFAULT_CONF="${SCRIPT_DIR}/configs/v4_spot_us.conf" + +RAY_PORT="${RAY_PORT:-6379}" +RAY_DASHBOARD_HOST="${RAY_DASHBOARD_HOST:-0.0.0.0}" +RAY_DASHBOARD_LOCAL_PORT="${RAY_DASHBOARD_LOCAL_PORT:-8265}" +RAY_CLIENT_LOCAL_PORT="${RAY_CLIENT_LOCAL_PORT:-10001}" +TPU_CHIPS_PER_HOST="${TPU_CHIPS_PER_HOST:-8}" +TPU_RESOURCE_PER_HOST="${TPU_RESOURCE_PER_HOST:-8}" + +CONF_FILE="$DEFAULT_CONF" +DEPS_ONLY=0 +VERIFY_ONLY=0 +TEARDOWN=0 + +usage() { + cat <<'EOF' +Usage: bootstrap_ray.sh [options] + +Options: + --conf Path to TPU config (default: tpu_orchestration/configs/v4_spot_us.conf) + --deps-only Install TPU dependencies on all workers and exit + --verify-only Run JAX distributed smoke test on all workers and exit + --teardown Stop Ray on all workers and head, then exit + -h, --help Show this help + +Config file keys expected: + ZONE, QR_NAME, ACCEL_TYPE + +Optional env overrides: + PROJECT_ID, TPU_CHIPS_PER_HOST, TPU_RESOURCE_PER_HOST, + RAY_PORT, RAY_DASHBOARD_HOST, RAY_DASHBOARD_LOCAL_PORT, RAY_CLIENT_LOCAL_PORT +EOF +} + +log() { + printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" +} + +die() { + printf 'Error: %s\n' "$*" >&2 + exit 1 +} + +require_cmd() { + local cmd="$1" + command -v "$cmd" >/dev/null 2>&1 || die "Missing required command: ${cmd}" +} + +parse_args() { + while [ "$#" -gt 0 ]; do + case "$1" in + --conf) + [ "$#" -ge 2 ] || die "--conf requires a path" + CONF_FILE="$2" + shift 2 + ;; + --deps-only) + DEPS_ONLY=1 + shift + ;; + --verify-only) + VERIFY_ONLY=1 + shift + ;; + --teardown) + TEARDOWN=1 + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + die "Unknown option: $1" + ;; + esac + done +} + +load_optional_sweep_env() { + if [ 
-n "${SWEEP_ENV_FILE:-}" ] && [ -f "${SWEEP_ENV_FILE}" ]; then + set -a + . "${SWEEP_ENV_FILE}" + set +a + return + fi + + local fallback_env="${SCRIPT_DIR}/../.env.sweep" + if [ -f "$fallback_env" ]; then + set -a + . "$fallback_env" + set +a + fi +} + +load_config() { + [ -f "$CONF_FILE" ] || die "Config file not found: $CONF_FILE" + # shellcheck disable=SC1090 + . "$CONF_FILE" + + [ -n "${ZONE:-}" ] || die "ZONE is required in config" + [ -n "${QR_NAME:-}" ] || die "QR_NAME is required in config" + [ -n "${ACCEL_TYPE:-}" ] || die "ACCEL_TYPE is required in config" +} + +resolve_project() { + if [ -n "${PROJECT_ID:-}" ]; then + return + fi + + local active_project + active_project="$(gcloud config get-value project 2>/dev/null || true)" + if [ -n "$active_project" ] && [ "$active_project" != "(unset)" ]; then + PROJECT_ID="$active_project" + return + fi + + die "PROJECT_ID is not set and gcloud has no active project" +} + +resolve_worker_count() { + [ -n "$TPU_CHIPS_PER_HOST" ] || die "TPU_CHIPS_PER_HOST must be set" + [[ "$TPU_CHIPS_PER_HOST" =~ ^[0-9]+$ ]] || die "TPU_CHIPS_PER_HOST must be numeric" + [ "$TPU_CHIPS_PER_HOST" -gt 0 ] || die "TPU_CHIPS_PER_HOST must be > 0" + + local total_chips + if [[ "$ACCEL_TYPE" =~ ([0-9]+)$ ]]; then + total_chips="${BASH_REMATCH[1]}" + else + die "Unable to parse total chips from ACCEL_TYPE=$ACCEL_TYPE" + fi + + if [ $((total_chips % TPU_CHIPS_PER_HOST)) -ne 0 ]; then + die "ACCEL_TYPE=$ACCEL_TYPE is not divisible by TPU_CHIPS_PER_HOST=$TPU_CHIPS_PER_HOST" + fi + + WORKER_COUNT=$((total_chips / TPU_CHIPS_PER_HOST)) + [ "$WORKER_COUNT" -gt 0 ] || die "Computed worker count must be > 0" +} + +run_tpu_ssh() { + local worker="$1" + local remote_cmd="$2" + local args=(compute tpus tpu-vm ssh "$QR_NAME" --zone "$ZONE" --project "$PROJECT_ID" --worker="$worker" --quiet --command "$remote_cmd") + gcloud "${args[@]}" +} + +install_deps() { + local cmd='python3 -m pip install --user --upgrade "jax[tpu]" -f 
https://storage.googleapis.com/jax-releases/libtpu_releases.html stable-baselines3 gymnasium wandb tensorboard "ray[default]"' + log "Installing JAX and Ray dependencies on all workers" + run_tpu_ssh "all" "$cmd" +} + +verify_jax() { + local cmd='python3 -c "import jax; jax.distributed.initialize(); print(f\"process_index={jax.process_index()} local_devices={jax.local_device_count()} global_devices={jax.device_count()}\")"' + log "Running JAX distributed smoke test on all workers" + run_tpu_ssh "all" "$cmd" +} + +start_ray_head() { + local resources_json="{\"TPU\":${TPU_RESOURCE_PER_HOST}}" + local cmd="export PATH=\$HOME/.local/bin:\$PATH; ray stop >/dev/null 2>&1 || true; ray start --head --port=${RAY_PORT} --dashboard-host=${RAY_DASHBOARD_HOST} --resources='${resources_json}' --disable-usage-stats" + log "Starting Ray head on worker 0" + run_tpu_ssh "0" "$cmd" +} + +get_head_ip() { + local cmd="hostname -I | awk '{print \$1}'" + local head_ip + head_ip="$(run_tpu_ssh "0" "$cmd" | awk 'NF { ip=$1 } END { print ip }')" + [ -n "$head_ip" ] || die "Failed to resolve Ray head IP" + printf '%s\n' "$head_ip" +} + +start_ray_workers() { + local head_ip="$1" + local resources_json="{\"TPU\":${TPU_RESOURCE_PER_HOST}}" + local cmd + cmd="export PATH=\$HOME/.local/bin:\$PATH; ray stop >/dev/null 2>&1 || true; ray start --address=${head_ip}:${RAY_PORT} --resources='${resources_json}' --disable-usage-stats" + + if [ "$WORKER_COUNT" -le 1 ]; then + log "Single-worker topology detected; skipping worker join step" + return + fi + + local worker + for ((worker = 1; worker < WORKER_COUNT; worker++)); do + log "Starting Ray worker on worker ${worker}" + run_tpu_ssh "$worker" "$cmd" + done +} + +verify_ray_cluster() { + local cmd='export PATH=$HOME/.local/bin:$PATH; ray status' + log "Checking Ray cluster status from worker 0" + run_tpu_ssh "0" "$cmd" +} + +print_tunnel_hint() { + cat <= 1; worker--)); do + log "Stopping Ray on worker ${worker}" + if ! 
run_tpu_ssh "$worker" "$cmd"; then + failures=$((failures + 1)) + fi + done + fi + + log "Stopping Ray head on worker 0" + if ! run_tpu_ssh "0" "$cmd"; then + failures=$((failures + 1)) + fi + + [ "$failures" -eq 0 ] || die "Teardown completed with ${failures} failure(s)" +} + +main() { + parse_args "$@" + require_cmd gcloud + + load_optional_sweep_env + load_config + resolve_project + resolve_worker_count + + log "Target TPU: ${QR_NAME} (${ACCEL_TYPE}) in ${ZONE}" + log "Computed workers: ${WORKER_COUNT} (chips per host: ${TPU_CHIPS_PER_HOST})" + + if [ "$TEARDOWN" -eq 1 ]; then + teardown_ray + return + fi + + if [ "$DEPS_ONLY" -eq 1 ] && [ "$VERIFY_ONLY" -eq 1 ]; then + install_deps + verify_jax + return + fi + + if [ "$DEPS_ONLY" -eq 1 ]; then + install_deps + return + fi + + if [ "$VERIFY_ONLY" -eq 1 ]; then + verify_jax + return + fi + + install_deps + verify_jax + + start_ray_head + local head_ip + head_ip="$(get_head_ip)" + log "Ray head IP: ${head_ip}" + + start_ray_workers "$head_ip" + verify_ray_cluster + print_tunnel_hint +} + +main "$@" diff --git a/tpu_orchestration/configs/test_vm.conf b/tpu_orchestration/configs/test_vm.conf new file mode 100644 index 0000000..12a78df --- /dev/null +++ b/tpu_orchestration/configs/test_vm.conf @@ -0,0 +1,8 @@ +ZONE="us-central2-b" +QR_NAME="v4-test-vm" +ACCEL_TYPE="v4-8" +RUNTIME_VERSION="tpu-ubuntu2204-base" +IS_SPOT="true" +RUN_ID="phantom_v4_test_1" +HF_REPO="velocitatem/capstone" +TRAIN_CMD="python -m engine.train --sweep-agent --sweep-id lusiana/capstone/oasdorof" diff --git a/tpu_orchestration/configs/v4_od_us.conf b/tpu_orchestration/configs/v4_od_us.conf new file mode 100644 index 0000000..42bda3e --- /dev/null +++ b/tpu_orchestration/configs/v4_od_us.conf @@ -0,0 +1,9 @@ +ZONE="us-central2-b" +QR_NAME="v4-32-us-ondemand" +ACCEL_TYPE="v4-32" +RUNTIME_VERSION="tpu-ubuntu2204-base" +IS_SPOT="false" +INTERNAL_IPS="false" +RUN_ID="phantom_v4_od_1" +HF_REPO="velocitatem/capstone" +TRAIN_CMD="python -m engine.train 
--sweep-agent --sweep-id lusiana/capstone/oasdorof" diff --git a/tpu_orchestration/configs/v4_spot_us.conf b/tpu_orchestration/configs/v4_spot_us.conf new file mode 100644 index 0000000..25e9427 --- /dev/null +++ b/tpu_orchestration/configs/v4_spot_us.conf @@ -0,0 +1,9 @@ +ZONE="us-central2-b" +QR_NAME="v4-32-us-spot" +ACCEL_TYPE="v4-32" +RUNTIME_VERSION="tpu-ubuntu2204-base" +IS_SPOT="true" +INTERNAL_IPS="false" +RUN_ID="phantom_v4_spot_1" +HF_REPO="velocitatem/capstone" +TRAIN_CMD="python -m engine.train --sweep-agent --sweep-id lusiana/capstone/oasdorof" diff --git a/tpu_orchestration/configs/v5e_eu.conf b/tpu_orchestration/configs/v5e_eu.conf new file mode 100644 index 0000000..573cc5f --- /dev/null +++ b/tpu_orchestration/configs/v5e_eu.conf @@ -0,0 +1,8 @@ +ZONE="europe-west4-b" +QR_NAME="v5e-32-eu-spot" +ACCEL_TYPE="v5litepod-32" +RUNTIME_VERSION="tpu-ubuntu2204-base" +IS_SPOT="true" +RUN_ID="phantom_v5e_eu_1" +HF_REPO="velocitatem/capstone" +TRAIN_CMD="python -m engine.train --sweep-agent --sweep-id lusiana/capstone/oasdorof" diff --git a/tpu_orchestration/configs/v5e_us.conf b/tpu_orchestration/configs/v5e_us.conf new file mode 100644 index 0000000..c212eac --- /dev/null +++ b/tpu_orchestration/configs/v5e_us.conf @@ -0,0 +1,8 @@ +ZONE="us-central1-a" +QR_NAME="v5e-32-us-spot" +ACCEL_TYPE="v5litepod-32" +RUNTIME_VERSION="tpu-ubuntu2204-base" +IS_SPOT="true" +RUN_ID="phantom_v5e_us_1" +HF_REPO="velocitatem/capstone" +TRAIN_CMD="python -m engine.train --sweep-agent --sweep-id lusiana/capstone/oasdorof" diff --git a/tpu_orchestration/configs/v6e_eu.conf b/tpu_orchestration/configs/v6e_eu.conf new file mode 100644 index 0000000..55d3e3e --- /dev/null +++ b/tpu_orchestration/configs/v6e_eu.conf @@ -0,0 +1,8 @@ +ZONE="europe-west4-a" +QR_NAME="v6e-32-eu-spot" +ACCEL_TYPE="v6e-32" +RUNTIME_VERSION="tpu-ubuntu2204-base" +IS_SPOT="true" +RUN_ID="phantom_v6e_eu_1" +HF_REPO="velocitatem/capstone" +TRAIN_CMD="python -m engine.train --sweep-agent --sweep-id 
lusiana/capstone/oasdorof" diff --git a/tpu_orchestration/configs/v6e_us.conf b/tpu_orchestration/configs/v6e_us.conf new file mode 100644 index 0000000..8145d3d --- /dev/null +++ b/tpu_orchestration/configs/v6e_us.conf @@ -0,0 +1,8 @@ +ZONE="us-east1-d" +QR_NAME="v6e-32-us-spot" +ACCEL_TYPE="v6e-32" +RUNTIME_VERSION="tpu-ubuntu2204-base" +IS_SPOT="true" +RUN_ID="phantom_v6e_us_1" +HF_REPO="velocitatem/capstone" +TRAIN_CMD="python -m engine.train --sweep-agent --sweep-id lusiana/capstone/oasdorof" diff --git a/tpu_orchestration/tpu_startup.sh b/tpu_orchestration/tpu_startup.sh new file mode 100644 index 0000000..ae5f556 --- /dev/null +++ b/tpu_orchestration/tpu_startup.sh @@ -0,0 +1,120 @@ +#!/bin/bash +# Idempotent startup script for TPU VMs using HF Buckets + +exec > >(tee -a /var/log/tpu_startup.log) 2>&1 +echo "Starting TPU setup..." + +# 1. Fetch metadata from GCP +get_metadata() { + curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/attributes/$1" +} + +export HF_TOKEN=$(get_metadata "HF_TOKEN") +export WANDB_API_KEY=$(get_metadata "WANDB_API_KEY") +export RUN_ID=$(get_metadata "RUN_ID") +export HF_REPO=$(get_metadata "HF_REPO") +export ACCEL_TYPE=$(get_metadata "ACCEL_TYPE") +export GITHUB_REPO=$(get_metadata "GITHUB_REPO") +export BRANCH=$(get_metadata "BRANCH") +export TRAIN_CMD=$(get_metadata "TRAIN_CMD") + +export WORKER_ID=$(hostname) + +# 2. 
Install dependencies +export DEBIAN_FRONTEND=noninteractive +apt-get update +apt-get install -y git tmux jq curl build-essential wget + +# Install HF CLI +curl -LsSf https://hf.co/cli/install.sh | bash + +# Install Miniconda to ensure modern Python (3.10+) on older TPU OS bases +wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh +bash /tmp/miniconda.sh -b -p /opt/conda +rm /tmp/miniconda.sh +export PATH="/opt/conda/bin:$PATH" + +# Create and activate conda environment +conda create -n phantom python=3.11 -y +source /opt/conda/bin/activate phantom + +# Install Python ML dependencies +pip install --upgrade pip +pip install "jax[tpu]" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html +pip install wandb orbax-checkpoint huggingface_hub + +# 3. Setup directories +mkdir -p /app/data +mkdir -p /app/checkpoints +mkdir -p /app/logs +mkdir -p /app/xla_cache/$ACCEL_TYPE + +export JAX_COMPILATION_CACHE_DIR="/app/xla_cache/${ACCEL_TYPE}" + +# 4. Clone repository +if [ -d "/app/model" ]; then + rm -rf /app/model +fi +git clone --branch $BRANCH $GITHUB_REPO /app/model +cd /app/model + +# Install project-specific dependencies if available +if [ -f "requirements.txt" ]; then + pip install -r requirements.txt +fi +if [ -f "sim/requirements.txt" ]; then + pip install -r sim/requirements.txt +fi + +# 5. Restore state from Hugging Face Buckets +echo "Restoring state from hf://buckets/$HF_REPO..." +# Download base data (shared across all) +hf buckets sync hf://buckets/$HF_REPO/data/base /app/data || echo "No base data found or failed to sync." + +# Download worker-specific checkpoints and logs +hf buckets sync hf://buckets/$HF_REPO/runs/$RUN_ID/checkpoints/$WORKER_ID /app/checkpoints || echo "No checkpoint found." +hf buckets sync hf://buckets/$HF_REPO/runs/$RUN_ID/logs/$WORKER_ID /app/logs || echo "No logs found." 
+ +# Download architecture-specific XLA cache +hf buckets sync hf://buckets/$HF_REPO/runs/$RUN_ID/xla/$ACCEL_TYPE /app/xla_cache/$ACCEL_TYPE || echo "No XLA cache found." + +# 6. Start Background Sync Loop +cat << 'EOF' > /app/sync_loop.sh +#!/bin/bash +while true; do + sleep 120 + echo "[$(date)] Background sync to HF Bucket..." + hf buckets sync /app/checkpoints hf://buckets/$HF_REPO/runs/$RUN_ID/checkpoints/$WORKER_ID --quiet || true + hf buckets sync /app/logs hf://buckets/$HF_REPO/runs/$RUN_ID/logs/$WORKER_ID --quiet || true + hf buckets sync /app/xla_cache/$ACCEL_TYPE hf://buckets/$HF_REPO/runs/$RUN_ID/xla/$ACCEL_TYPE --quiet || true +done +EOF +chmod +x /app/sync_loop.sh +/app/sync_loop.sh & +SYNC_PID=$! + +# 7. Execute Training +echo "Starting training with command: $TRAIN_CMD" +# Ensure we are in the correct directory and environment +cd /app/model +export PYTHONPATH="/app/model:$PYTHONPATH" + +if [ -n "$TRAIN_CMD" ]; then + eval "$TRAIN_CMD" + EXIT_CODE=$? +else + echo "No TRAIN_CMD provided. Sleeping for testing purposes..." + # For testing: run a dummy process so the VM doesn't just idle immediately + sleep 3600 + EXIT_CODE=0 +fi + +# 8. Cleanup and Final Sync +echo "Training finished with exit code $EXIT_CODE. Stopping sync loop and performing final sync..." +kill $SYNC_PID + +hf buckets sync /app/checkpoints hf://buckets/$HF_REPO/runs/$RUN_ID/checkpoints/$WORKER_ID +hf buckets sync /app/logs hf://buckets/$HF_REPO/runs/$RUN_ID/logs/$WORKER_ID +hf buckets sync /app/xla_cache/$ACCEL_TYPE hf://buckets/$HF_REPO/runs/$RUN_ID/xla/$ACCEL_TYPE + +exit $EXIT_CODE \ No newline at end of file diff --git a/tpu_orchestration/watchdog.sh b/tpu_orchestration/watchdog.sh new file mode 100755 index 0000000..1a01447 --- /dev/null +++ b/tpu_orchestration/watchdog.sh @@ -0,0 +1,137 @@ +#!/bin/bash +# Watchdog loop to ensure TPUs are re-queued when preempted + +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +CONFIG_FILE=$1 +if [ ! 
-f "$CONFIG_FILE" ]; then + echo "Config file $CONFIG_FILE not found." + exit 1 +fi + +# Load config +source "$CONFIG_FILE" + +# Make sure HF_TOKEN is available +if [ -z "$HF_TOKEN" ]; then + echo "Error: HF_TOKEN environment variable must be set before running watchdog." + echo "export HF_TOKEN=..." + exit 1 +fi + +# Make sure WANDB_API_KEY is available +if [ -z "$WANDB_API_KEY" ]; then + echo "Warning: WANDB_API_KEY environment variable is not set. Wandb logging may fail." +fi + +# Make sure GITHUB_REPO is set in config or env +if [ -z "$GITHUB_REPO" ]; then + GITHUB_REPO="https://github.com/velocitatem/PHANTOM.git" + if [ -n "$GITHUB_TOKEN" ]; then + GITHUB_REPO="https://velocitatem:${GITHUB_TOKEN}@github.com/velocitatem/PHANTOM.git" + fi +fi + +# Make sure BRANCH is set in config or env +if [ -z "$BRANCH" ]; then + BRANCH="main" +fi + +# Ensure PROJECT_ID is set +if [ -z "$PROJECT_ID" ]; then + PROJECT_ID=$(gcloud config get-value project 2>/dev/null) + if [ -z "$PROJECT_ID" ]; then + PROJECT_ID="phantom-trc" # Fallback to the known project ID + echo "Warning: PROJECT_ID not set and gcloud not configured. Defaulting to $PROJECT_ID" + fi +fi + +echo "Starting watchdog for $QR_NAME in $ZONE (Project: $PROJECT_ID)" +echo "Accelerator: $ACCEL_TYPE" +echo "Run ID: $RUN_ID" + +# Backoff tracking for IP quota errors +RETRY_DELAY=60 +MAX_RETRY_DELAY=300 + +while true; do + STATE=$(gcloud compute tpus queued-resources describe $QR_NAME --zone=$ZONE --project=$PROJECT_ID --format="value(state.state)" 2>/dev/null) + + if [ -z "$STATE" ] || [[ "$STATE" == *"SUSPENDED"* ]] || [[ "$STATE" == *"FAILED"* ]]; then + echo "[$(date)] Cluster '${STATE:-MISSING}' - cleaning IPs and re-queuing..." 
+ + # Clean all orphaned RESERVED IPs in parallel to free quota + gcloud compute addresses list --project=$PROJECT_ID \ + --filter="status=RESERVED AND name~'^tpu-.*'" \ + --format="value(name,region)" 2>/dev/null | \ + while IFS=$'\t' read -r n r; do + [ -n "$n" ] && [ -n "$r" ] && gcloud compute addresses delete "$n" --region="$r" --project=$PROJECT_ID --quiet 2>/dev/null & + done + wait + + # Delete QR and any orphaned VM + gcloud compute tpus queued-resources delete $QR_NAME --zone=$ZONE --project=$PROJECT_ID --quiet --force 2>/dev/null + VM_STATE=$(gcloud compute tpus tpu-vm describe $QR_NAME --zone=$ZONE --project=$PROJECT_ID --format="value(state)" 2>/dev/null) + [ -n "$VM_STATE" ] && gcloud compute tpus tpu-vm delete $QR_NAME --zone=$ZONE --project=$PROJECT_ID --quiet 2>/dev/null + + sleep 5 + + # Create new QR + SPOT_FLAG="" + if [ "$IS_SPOT" = "true" ]; then + SPOT_FLAG="--spot" + fi + + IP_FLAG="--internal-ips" + if [ "${INTERNAL_IPS:-true}" != "true" ]; then + IP_FLAG="" + fi + + # Prepare metadata + METADATA="HF_TOKEN=$HF_TOKEN,RUN_ID=$RUN_ID,HF_REPO=$HF_REPO,ACCEL_TYPE=$ACCEL_TYPE,GITHUB_REPO=$GITHUB_REPO,BRANCH=$BRANCH" + if [ -n "$WANDB_API_KEY" ]; then + METADATA="$METADATA,WANDB_API_KEY=$WANDB_API_KEY" + fi + if [ -n "$TRAIN_CMD" ]; then + METADATA="$METADATA,TRAIN_CMD=$TRAIN_CMD" + fi + + # Determine runtime version + RT_VERSION=${RUNTIME_VERSION:-"tpu-ubuntu2204-base"} + + CREATE_LOG="/tmp/tpu_create_${QR_NAME}.log" + + gcloud compute tpus queued-resources create $QR_NAME \ + --project=$PROJECT_ID \ + --node-id=$QR_NAME \ + --zone=$ZONE \ + --accelerator-type=$ACCEL_TYPE \ + --runtime-version=$RT_VERSION \ + $SPOT_FLAG \ + $IP_FLAG \ + --metadata-from-file startup-script=$(dirname $0)/tpu_startup.sh \ + --metadata "$METADATA" 2>&1 | tee "$CREATE_LOG" + + CREATE_EXIT=${PIPESTATUS[0]} + + if [ $CREATE_EXIT -eq 0 ]; then + echo "[$(date)] Successfully queued $QR_NAME." 
+ RETRY_DELAY=60 + elif grep -Eq "IN_USE_ADDRESSES|RESOURCE_EXHAUSTED|Quota limit|QUOTA_EXCEEDED" "$CREATE_LOG" 2>/dev/null; then + echo "[$(date)] Quota pressure detected - backing off ${RETRY_DELAY}s" + sleep $RETRY_DELAY + RETRY_DELAY=$((RETRY_DELAY * 2)) + [ $RETRY_DELAY -gt $MAX_RETRY_DELAY ] && RETRY_DELAY=$MAX_RETRY_DELAY + continue + else + echo "[$(date)] Failed to queue $QR_NAME (exit=$CREATE_EXIT)." + RETRY_DELAY=60 + fi + else + echo "[$(date)] Cluster state is $STATE. Checking again in 60s..." + fi + sleep 60 +done diff --git a/web/project.json b/web/project.json index 41f561d..49fc1d5 100644 --- a/web/project.json +++ b/web/project.json @@ -7,7 +7,7 @@ "install": { "executor": "nx:run-commands", "options": { - "command": "npm install", + "command": "npm install --include=dev", "cwd": "web" } },