Merge pull request #55 from velocitatem/optimizing-runs

Enhance TPU orchestration and parallelization with benchmarks
This commit is contained in:
Daniel Alves Rösel
2026-03-23 15:15:35 +01:00
committed by GitHub
123 changed files with 7644 additions and 2152 deletions

1
.gitignore vendored
View File

@@ -3,6 +3,7 @@
.env.* .env.*
!.env.*.example !.env.*.example
**/.venv **/.venv
**/.venv-ray
# python build/cache artifacts # python build/cache artifacts
**/__pycache__ **/__pycache__

35
.rayignore Normal file
View File

@@ -0,0 +1,35 @@
# Virtual environments
.venv
.venv*
venv
venv*
**/.venv
**/venv
**/node_modules
node_modules/
# Python caches
__pycache__/
*.pyc
.ruff_cache/
.pytest_cache/
# Git
.git/
# Large data and logs
data/
experiments/
wandb/
dumplogs*
*.zip
*.pdf
*.log
*.dot
# Other large dirs
PHANTOM_web/
web/
docs/
paper/
.nx/

View File

@@ -11,6 +11,7 @@ PYTEST := $(VENV)/bin/pytest
NX := npx nx NX := npx nx
SWEEP_ENV_FILE ?= .env.sweep SWEEP_ENV_FILE ?= .env.sweep
TPU_CONF ?= tpu_orchestration/configs/v4_spot_us.conf
WANDB_ENTITY ?= WANDB_ENTITY ?=
WANDB_PROJECT ?= capstone WANDB_PROJECT ?= capstone
@@ -21,6 +22,14 @@ SIMPLE_BENCHMARK_ARGS ?= --tiers qtable,ppo,dqn,a2c --alpha-values 0.0,0.15,0.3,
BENCHMARK_AGENT_ARGS ?= BENCHMARK_AGENT_ARGS ?=
AGENT_COUNT ?= 0 AGENT_COUNT ?= 0
WHOCLICKED_REPO ?= velocitatem/whoclickedit
WHOCLICKED_CSV ?= experiments/exports/whoclicked.csv
WHOCLICKED_CARD ?= experiments/exports/whoclicked_dataset_card.md
WHOCLICKED_CSV_PATH_IN_REPO ?= whoclicked.csv
WHOCLICKED_CARD_PATH_IN_REPO ?= README.md
WHOCLICKED_DATASET_MESSAGE ?= Update flattened whoclickedit dataset
WHOCLICKED_CARD_MESSAGE ?= Update dataset card for whoclickedit
REPO_URL ?= REPO_URL ?=
BRANCH ?= main BRANCH ?= main
WORKDIR ?= $(HOME)/PHANTOM-agent WORKDIR ?= $(HOME)/PHANTOM-agent
@@ -35,8 +44,10 @@ SWEEP_ENV_LOAD = set -a; [ -f "$(SWEEP_ENV_FILE)" ] && . "$(SWEEP_ENV_FILE)" ||
.PHONY: help .PHONY: help
help: help:
@echo "pdf.build pdf.watch pdf.clean pdf.genpop pdf.genpop.watch pdf.arxiv | test.backend test.e2e test.all | web.dev | install | train | benchmark | benchmark.simple | benchmark.agent | train.agent | train.bootstrap | stats.lines" @echo "pdf.build pdf.watch pdf.clean pdf.genpop pdf.genpop.watch pdf.arxiv | test.backend test.e2e test.all | web.dev | install | train | benchmark | benchmark.simple | benchmark.agent | train.agent | train.bootstrap | stats.lines | manim.render manim.render.all"
@echo "backend.server backend.provider backend.worker | platform.up platform.down platform.logs | docker.train.publish" @echo "backend.server backend.provider backend.worker | platform.up platform.down platform.logs | docker.train.publish"
@echo "data.pull data.push data.whoclicked.publish | study.margin-erosion study.margin-erosion.quick study.margin-erosion.plot"
@echo "tpu.ray.bootstrap tpu.ray.deps tpu.ray.verify tpu.ray.teardown"
@echo "" @echo ""
@echo "Build general public version:" @echo "Build general public version:"
@echo " make pdf.genpop" @echo " make pdf.genpop"
@@ -56,6 +67,12 @@ help:
@echo "Bootstrap private repo worker from anywhere:" @echo "Bootstrap private repo worker from anywhere:"
@echo " make train.bootstrap REPO_URL=https://github.com/org/repo.git BRANCH=main SWEEP_ID=entity/project/id" @echo " make train.bootstrap REPO_URL=https://github.com/org/repo.git BRANCH=main SWEEP_ID=entity/project/id"
@echo "" @echo ""
@echo "Bootstrap Ray on TPU slice from config:"
@echo " make tpu.ray.bootstrap TPU_CONF=tpu_orchestration/configs/v4_spot_us.conf"
@echo ""
@echo "Publish whoclickedit dataset + card:"
@echo " make data.whoclicked.publish HF_TOKEN=... WHOCLICKED_REPO=velocitatem/whoclickedit"
@echo ""
@echo "Config source: $(SWEEP_ENV_FILE) (auto-loaded)" @echo "Config source: $(SWEEP_ENV_FILE) (auto-loaded)"
$(BUILDDIR): $(BUILDDIR):
@@ -133,10 +150,42 @@ train.agent:
train.bootstrap: train.bootstrap:
@WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" REPO_URL="$(REPO_URL)" BRANCH="$(BRANCH)" WORKDIR="$(WORKDIR)" SWEEP_ID="$(SWEEP_ID)" AGENT_COUNT="$(AGENT_COUNT)" AGENT_LOOP="$(AGENT_LOOP)" RETRY_SECONDS="$(RETRY_SECONDS)" $(NX) run research:train-bootstrap @WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" REPO_URL="$(REPO_URL)" BRANCH="$(BRANCH)" WORKDIR="$(WORKDIR)" SWEEP_ID="$(SWEEP_ID)" AGENT_COUNT="$(AGENT_COUNT)" AGENT_LOOP="$(AGENT_LOOP)" RETRY_SECONDS="$(RETRY_SECONDS)" $(NX) run research:train-bootstrap
# Ray-on-TPU lifecycle targets (bootstrap, deps, verify, teardown).
# Each target forwards TPU_CONF and SWEEP_ENV_FILE through the environment to
# the matching Nx target in the `research` project; the recipes are
# `@`-prefixed so the expanded command line is not echoed.
.PHONY: tpu.ray.bootstrap tpu.ray.deps tpu.ray.verify tpu.ray.teardown
# Runs research:tpu-ray-bootstrap for the TPU config selected by TPU_CONF.
tpu.ray.bootstrap:
@TPU_CONF="$(TPU_CONF)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" $(NX) run research:tpu-ray-bootstrap
# Runs research:tpu-ray-deps with the same TPU_CONF / SWEEP_ENV_FILE inputs.
tpu.ray.deps:
@TPU_CONF="$(TPU_CONF)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" $(NX) run research:tpu-ray-deps
# Runs research:tpu-ray-verify with the same TPU_CONF / SWEEP_ENV_FILE inputs.
tpu.ray.verify:
@TPU_CONF="$(TPU_CONF)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" $(NX) run research:tpu-ray-verify
# Runs research:tpu-ray-teardown with the same TPU_CONF / SWEEP_ENV_FILE inputs.
tpu.ray.teardown:
@TPU_CONF="$(TPU_CONF)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" $(NX) run research:tpu-ray-teardown
# Dataset sync helpers: both targets call scripts/hf_data.py with a single
# sub-command argument (`pull` or `push`).
# NOTE(review): these invoke whatever `python` is first on PATH rather than an
# interpreter under $(VENV) — confirm that is intended.
.PHONY: data.pull data.push
data.pull:
python scripts/hf_data.py pull
data.push:
python scripts/hf_data.py push
# Publishes the whoclickedit dataset by running the Nx target
# research:whoclicked-publish. HF_TOKEN and every WHOCLICKED_* make variable
# are passed to that target as environment variables of the same name.
# The recipe is `@`-prefixed, so the expanded HF_TOKEN value is not echoed by make.
.PHONY: data.whoclicked.publish
data.whoclicked.publish:
@HF_TOKEN="$(HF_TOKEN)" WHOCLICKED_REPO="$(WHOCLICKED_REPO)" WHOCLICKED_CSV="$(WHOCLICKED_CSV)" WHOCLICKED_CARD="$(WHOCLICKED_CARD)" WHOCLICKED_CSV_PATH_IN_REPO="$(WHOCLICKED_CSV_PATH_IN_REPO)" WHOCLICKED_CARD_PATH_IN_REPO="$(WHOCLICKED_CARD_PATH_IN_REPO)" WHOCLICKED_DATASET_MESSAGE="$(WHOCLICKED_DATASET_MESSAGE)" WHOCLICKED_CARD_MESSAGE="$(WHOCLICKED_CARD_MESSAGE)" $(NX) run research:whoclicked-publish
.PHONY: stats.lines .PHONY: stats.lines
stats.lines: stats.lines:
@$(NX) run research:stats @$(NX) run research:stats
# Runs the engine.studies.margin_erosion_alpha module with its default arguments.
# NOTE(review): uses the `python` found on PATH, not a venv interpreter — confirm.
.PHONY: study.margin-erosion
study.margin-erosion:
python -m engine.studies.margin_erosion_alpha
# Same module, invoked with the `--quick` flag.
.PHONY: study.margin-erosion.quick
study.margin-erosion.quick:
python -m engine.studies.margin_erosion_alpha --quick
.PHONY: wordcount .PHONY: wordcount
wordcount: wordcount:
@$(NX) run paper:wordcount @$(NX) run paper:wordcount
@@ -185,3 +234,10 @@ count-lines:
all: all:
@$(NX) run paper:build @$(NX) run paper:build
# Animation rendering targets; each delegates to an Nx target in the `manim` project.
.PHONY: manim.render manim.render.all
# Runs manim:render.
manim.render:
@$(NX) run manim:render
# Runs manim:render-all.
manim.render.all:
@$(NX) run manim:render-all

View File

@@ -2,6 +2,7 @@
### PHANTOM ### PHANTOM
[![Dataset on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/dataset-on-hf-sm.svg)](https://huggingface.co/datasets/velocitatem/whoclickedit)
[![Build PDF](https://github.com/velocitatem/PHANTOM/actions/workflows/latex.yml/badge.svg)](https://github.com/velocitatem/PHANTOM/actions/workflows/latex.yml) [![Build PDF](https://github.com/velocitatem/PHANTOM/actions/workflows/latex.yml/badge.svg)](https://github.com/velocitatem/PHANTOM/actions/workflows/latex.yml)
[![Paper](https://img.shields.io/badge/Paper-PDF-red?logo=adobe-acrobat-reader)](https://pub-d5b94a3c29fd40c6b3881946e463fdb7.r2.dev/thesis-latest.pdf) [![Paper](https://img.shields.io/badge/Paper-PDF-red?logo=adobe-acrobat-reader)](https://pub-d5b94a3c29fd40c6b3881946e463fdb7.r2.dev/thesis-latest.pdf)
[![TPU Research Cloud](https://img.shields.io/badge/TPU%20Research%20Cloud-TRC%20supported-4285F4?logo=googlecloud&logoColor=white)](https://sites.research.google/trc/faq/) [![TPU Research Cloud](https://img.shields.io/badge/TPU%20Research%20Cloud-TRC%20supported-4285F4?logo=googlecloud&logoColor=white)](https://sites.research.google/trc/faq/)

View File

@@ -1,6 +0,0 @@
64 spot Cloud TPU v6e chips in zone europe-west4-a
32 spot Cloud TPU v4 chips in zone us-central2-b
64 spot Cloud TPU v5e chips in zone us-central1-a
64 spot Cloud TPU v6e chips in zone us-east1-d
32 on-demand Cloud TPU v4 chips in zone us-central2-b
64 spot Cloud TPU v5e chips in zone europe-west4-b

View File

@@ -1,22 +0,0 @@
# 32 spot Cloud TPU v4 chips in zone us-central2-b
export PROJECT_ID=phantom-trc
export QR_NAME=TPUv4s32spotUC2B
export TPU_NAME=tpu-v4-32-uc2b-spot
export ZONE=us-central2-b
export ACCELERATOR_TYPE=v4-32
export RUNTIME_VERSION=v2-alpha-tpuv4
gcloud compute tpus tpu-vm create ${TPU_NAME} \
--project=${PROJECT_ID} \
--zone=${ZONE} \
--accelerator-type=${ACCELERATOR_TYPE} \
--version=${RUNTIME_VERSION} \
--spot \
|| \
gcloud compute tpus queued-resources create ${QR_NAME} \
--project=${PROJECT_ID} \
--zone=${ZONE} \
--node-id=${TPU_NAME} \
--accelerator-type=${ACCELERATOR_TYPE} \
--runtime-version=${RUNTIME_VERSION} \
--spot

View File

@@ -1,13 +0,0 @@
# 32 on-demand Cloud TPU v4 chips in zone us-central2-b
export PROJECT_ID=phantom-trc
export QR_NAME=TPUlong
export ZONE=us-central2-b
export ACCELERATOR_TYPE=v4-32
export RUNTIME_VERSION=v2-alpha-tpuv4
#gcloud compute tpus tpu-vm create ${TPU_NAME} --zone=${ZONE} --project=${PROJECT_ID} --accelerator-type=${ACCELERATOR_TYPE} --version=${RUNTIME_VERSION}
gcloud compute tpus queued-resources create ${QR_NAME} \
--project=${PROJECT_ID} \
--zone=${ZONE} \
--node-id=${TPU_NAME} \
--accelerator-type=${ACCELERATOR_TYPE} \
--runtime-version=${RUNTIME_VERSION}

View File

@@ -1,22 +0,0 @@
# 64 spot Cloud TPU v5e chips in zone europe-west4-b
export PROJECT_ID=phantom-trc
export QR_NAME=TPUv5e64spotEW4B
export TPU_NAME=tpu-v5e-64-ew4b
export ZONE=europe-west4-b
export ACCELERATOR_TYPE=v5e-64
export RUNTIME_VERSION=v2-alpha-tpuv5-lite
gcloud compute tpus tpu-vm create ${TPU_NAME} \
--project=${PROJECT_ID} \
--zone=${ZONE} \
--accelerator-type=${ACCELERATOR_TYPE} \
--version=${RUNTIME_VERSION} \
--spot \
|| \
gcloud compute tpus queued-resources create ${QR_NAME} \
--project=${PROJECT_ID} \
--zone=${ZONE} \
--node-id=${TPU_NAME} \
--accelerator-type=${ACCELERATOR_TYPE} \
--runtime-version=${RUNTIME_VERSION} \
--spot

View File

@@ -1,22 +0,0 @@
# 64 spot Cloud TPU v5e chips in zone us-central1-a
export PROJECT_ID=phantom-trc
export QR_NAME=TPUv5e64spotUC1A
export TPU_NAME=tpu-v5e-64-uc1a
export ZONE=us-central1-a
export ACCELERATOR_TYPE=v5e-64
export RUNTIME_VERSION=v2-alpha-tpuv5-lite
gcloud compute tpus tpu-vm create ${TPU_NAME} \
--project=${PROJECT_ID} \
--zone=${ZONE} \
--accelerator-type=${ACCELERATOR_TYPE} \
--version=${RUNTIME_VERSION} \
--spot \
|| \
gcloud compute tpus queued-resources create ${QR_NAME} \
--project=${PROJECT_ID} \
--zone=${ZONE} \
--node-id=${TPU_NAME} \
--accelerator-type=${ACCELERATOR_TYPE} \
--runtime-version=${RUNTIME_VERSION} \
--spot

View File

@@ -1,22 +0,0 @@
# 64 spot Cloud TPU v6e chips in zone europe-west4-a
export PROJECT_ID=phantom-trc
export QR_NAME=TPUv6e64spotEW4A
export TPU_NAME=tpu-v6e-64-ew4a
export ZONE=europe-west4-a
export ACCELERATOR_TYPE=v6e-64
export RUNTIME_VERSION=v2-alpha-tpuv6e
gcloud compute tpus tpu-vm create ${TPU_NAME} \
--project=${PROJECT_ID} \
--zone=${ZONE} \
--accelerator-type=${ACCELERATOR_TYPE} \
--version=${RUNTIME_VERSION} \
--spot \
|| \
gcloud compute tpus queued-resources create ${QR_NAME} \
--project=${PROJECT_ID} \
--zone=${ZONE} \
--node-id=${TPU_NAME} \
--accelerator-type=${ACCELERATOR_TYPE} \
--runtime-version=${RUNTIME_VERSION} \
--spot

View File

@@ -1,22 +0,0 @@
# 64 spot Cloud TPU v6e chips in zone us-east1-d
export PROJECT_ID=phantom-trc
export QR_NAME=TPUv6e64spotUE1D
export TPU_NAME=tpu-v6e-64-ue1d
export ZONE=us-east1-d
export ACCELERATOR_TYPE=v6e-64
export RUNTIME_VERSION=v2-alpha-tpuv6e
gcloud compute tpus tpu-vm create ${TPU_NAME} \
--project=${PROJECT_ID} \
--zone=${ZONE} \
--accelerator-type=${ACCELERATOR_TYPE} \
--version=${RUNTIME_VERSION} \
--spot \
|| \
gcloud compute tpus queued-resources create ${QR_NAME} \
--project=${PROJECT_ID} \
--zone=${ZONE} \
--node-id=${TPU_NAME} \
--accelerator-type=${ACCELERATOR_TYPE} \
--runtime-version=${RUNTIME_VERSION} \
--spot

View File

@@ -1,4 +1,23 @@
services: services:
# Long-running service that launches the TPU watchdog processes.
# The image is built from docker/TPUWatchdog.dockerfile with the repository
# root as build context.
tpu-watchdogs:
build:
context: .
dockerfile: docker/TPUWatchdog.dockerfile
container_name: "PHANTOM-tpu-watchdogs"
restart: unless-stopped
# Run as the invoking user's uid:gid, falling back to 1000:1000.
# NOTE(review): UID/GID are often shell variables that are not exported, in
# which case Compose would use the 1000 fallback — confirm on the target host.
user: "${UID:-1000}:${GID:-1000}"
environment:
# Tokens are taken from the host environment / .env at compose time.
- HF_TOKEN=${HF_TOKEN}
- WANDB_API_KEY=${WANDB_API_KEY}
- GITHUB_TOKEN=${GITHUB_TOKEN}
# Fixed in-container path; matches the read-only key mount under `volumes`.
- GOOGLE_APPLICATION_CREDENTIALS=/secrets/gcp-sa.json
# Optional; defaults to an empty string when unset.
- GCP_ACCOUNT=${GCP_ACCOUNT:-}
# Glob selecting which watchdog configs to start; defaults to v[46]*.conf.
- WATCHDOG_CONFIG_PATTERN=${WATCHDOG_CONFIG_PATTERN:-v[46]*.conf}
# Points gcloud at the mounted host configuration directory below.
- CLOUDSDK_CONFIG=/.config/gcloud
volumes:
# Host gcloud config, mounted read-write.
- ~/.config/gcloud:/.config/gcloud:rw
# Credential file, mounted read-only.
- ./secrets/gcp-sa.json:/secrets/gcp-sa.json:ro
tensorboard-rl: tensorboard-rl:
image: tensorflow/tensorflow:latest image: tensorflow/tensorflow:latest
container_name: "PHANTOM-tensorboard-rl" container_name: "PHANTOM-tensorboard-rl"

View File

@@ -0,0 +1,112 @@
# Image that starts one TPU watchdog process per orchestration config file.
# The entrypoint script is embedded below as a COPY here-document, which
# needs a builder that supports Dockerfile here-documents (BuildKit).
FROM google/cloud-sdk:slim
# Install jq (used by the entrypoint to parse the credential JSON) and tmux.
# NOTE(review): the entrypoint below starts watchdogs as plain background bash
# jobs, so tmux looks unused at runtime — confirm before dropping it.
RUN apt-get update && \
apt-get install -y tmux jq && \
rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy the orchestration scripts and configs
COPY tpu_orchestration/ /app/tpu_orchestration/
# Make sure scripts are executable
RUN chmod +x /app/tpu_orchestration/watchdog.sh
RUN chmod +x /app/tpu_orchestration/tpu_startup.sh
# Create an entrypoint script that launches a watchdog for each config.
# The quoted 'EOF' delimiter keeps $VARS literal at build time so they are
# expanded by bash when the container starts.
COPY <<-'EOF' /app/entrypoint.sh
#!/bin/bash
set -e
# Make sure required variables are set
# HF_TOKEN is mandatory: the container exits with status 1 without it.
if [ -z "$HF_TOKEN" ]; then
echo "Error: HF_TOKEN environment variable is required."
exit 1
fi
# WANDB_API_KEY is optional: only a warning is printed when it is missing.
if [ -z "$WANDB_API_KEY" ]; then
echo "Warning: WANDB_API_KEY environment variable is not set. Wandb logging may fail on TPUs."
fi
# Authenticate gcloud if credentials are provided
# The credential file's "type" field selects the flow:
#   service_account  -> gcloud auth activate-service-account
#   authorized_user  -> credential-file override, else the mounted gcloud config
#   anything else    -> warning, rely on the mounted gcloud config
if [ -n "$GOOGLE_APPLICATION_CREDENTIALS" ] && [ -f "$GOOGLE_APPLICATION_CREDENTIALS" ]; then
# Falls back to "unknown" when jq cannot read or parse the file.
CRED_TYPE=$(jq -r '.type' "$GOOGLE_APPLICATION_CREDENTIALS" 2>/dev/null || echo "unknown")
if [ "$CRED_TYPE" = "service_account" ]; then
echo "Authenticating gcloud using service account key..."
gcloud auth activate-service-account --key-file="$GOOGLE_APPLICATION_CREDENTIALS"
# Take the project from the key file only when PROJECT_ID was not supplied.
if [ -z "$PROJECT_ID" ]; then
PROJECT_ID=$(jq -r '.project_id // empty' "$GOOGLE_APPLICATION_CREDENTIALS")
fi
elif [ "$CRED_TYPE" = "authorized_user" ]; then
echo "Using authorized_user credentials via credential file override..."
export CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE="$GOOGLE_APPLICATION_CREDENTIALS"
# Probe the override by requesting an access token before relying on it.
if gcloud auth print-access-token >/dev/null 2>&1; then
ACTIVE_ACCOUNT=$(gcloud config get-value account 2>/dev/null || true)
if [ -z "$ACTIVE_ACCOUNT" ] || [ "$ACTIVE_ACCOUNT" = "(unset)" ]; then
ACTIVE_ACCOUNT=$(jq -r '.account // empty' "$GOOGLE_APPLICATION_CREDENTIALS")
fi
if [ -n "$ACTIVE_ACCOUNT" ] && [ "$ACTIVE_ACCOUNT" != "(unset)" ]; then
echo "Using gcloud account: $ACTIVE_ACCOUNT"
else
echo "Using gcloud credential override from $GOOGLE_APPLICATION_CREDENTIALS"
fi
else
# Override unusable: drop it and require an account in the mounted config.
echo "Warning: credential file override token check failed. Falling back to mounted gcloud config."
unset CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE
if [ -n "$GCP_ACCOUNT" ]; then
gcloud config set account "$GCP_ACCOUNT" >/dev/null 2>&1 || true
fi
ACTIVE_ACCOUNT=$(gcloud config get-value account 2>/dev/null || true)
if [ -z "$ACTIVE_ACCOUNT" ] || [ "$ACTIVE_ACCOUNT" = "(unset)" ]; then
echo "Error: no active gcloud account available. Run 'gcloud auth login' on host and mount ~/.config/gcloud, or use a service account key."
exit 1
fi
echo "Using gcloud account: $ACTIVE_ACCOUNT"
fi
else
echo "Warning: unsupported credential file type '$CRED_TYPE'. Falling back to mounted gcloud config."
fi
else
echo "Note: Assuming gcloud config is mounted from host."
fi
# Apply the project only when one was supplied or derived above.
if [ -n "$PROJECT_ID" ]; then
gcloud config set project "$PROJECT_ID"
echo "Set project to $PROJECT_ID"
fi
# Run the watchdogs in the background using bash instead of tmux
# Tmux needs a TTY to attach properly which we might not have in docker
# Stagger startups by 15s to prevent simultaneous TPU creation quota hits
# WATCHDOG_CONFIG_PATTERN is a glob relative to the configs directory;
# when unset, every *.conf file is selected.
CONFIG_PATTERN=${WATCHDOG_CONFIG_PATTERN:-"*.conf"}
# nullglob: an unmatched pattern expands to an empty array, not the literal text.
shopt -s nullglob
# $CONFIG_PATTERN is deliberately unquoted so the shell expands the glob.
CONFIGS=(/app/tpu_orchestration/configs/$CONFIG_PATTERN)
if [ ${#CONFIGS[@]} -eq 0 ]; then
echo "Error: no watchdog configs matched pattern '$CONFIG_PATTERN'."
exit 1
fi
echo "Using watchdog config pattern: $CONFIG_PATTERN"
DELAY=0
for conf in "${CONFIGS[@]}"; do
echo "Starting watchdog for $(basename "$conf" .conf) (delay: ${DELAY}s)"
# Each watchdog runs in its own background subshell after its delay.
(sleep $DELAY && /app/tpu_orchestration/watchdog.sh "$conf") &
DELAY=$((DELAY + 15))
done
echo "All watchdogs queued with staggered startup."
# Keep the container running
# `wait` blocks until every background watchdog job has exited.
wait
EOF
RUN chmod +x /app/entrypoint.sh
CMD ["/app/entrypoint.sh"]

View File

@@ -272,12 +272,12 @@
</span> </span>
<span class="link-block"> <span class="link-block">
<a href="goals/goals.csv" target="_blank" <a href="https://huggingface.co/datasets/velocitatem/whoclickedit" target="_blank"
class="external-link button is-normal is-rounded is-dark"> class="external-link button is-normal is-rounded is-dark">
<span class="icon"> <span class="icon">
<i class="fas fa-list"></i> <i class="fas fa-database"></i>
</span> </span>
<span>Goal Set</span> <span>Dataset</span>
</a> </a>
</span> </span>

0
engine/__init__.py Normal file
View File

View File

@@ -15,6 +15,10 @@ def make_env(cfg: Mapping[str, Any]):
n_products=int(cfg["n_products"]), n_products=int(cfg["n_products"]),
alpha=float(cfg["alpha"]), alpha=float(cfg["alpha"]),
N=int(cfg["N"]), N=int(cfg["N"]),
agent_params=(
float(cfg.get("agent_mu", 45.0)),
float(cfg.get("agent_std", 15.0)),
),
price_bounds=(float(cfg["price_low"]), float(cfg["price_high"])), price_bounds=(float(cfg["price_low"]), float(cfg["price_high"])),
lambda_coi=float(cfg["lambda_coi"]), lambda_coi=float(cfg["lambda_coi"]),
robust_radius=float(cfg["robust_radius"]), robust_radius=float(cfg["robust_radius"]),
@@ -50,6 +54,9 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]:
coi_levels: list[float] = [] coi_levels: list[float] = []
coi_leakages: list[float] = [] coi_leakages: list[float] = []
volatilities: list[float] = [] volatilities: list[float] = []
upward_volatilities: list[float] = []
supra_shares: list[float] = []
supra_penalties: list[float] = []
agent_probs: list[float] = [] agent_probs: list[float] = []
for _ in range(int(episodes)): for _ in range(int(episodes)):
@@ -61,6 +68,9 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]:
ep_coi = 0.0 ep_coi = 0.0
ep_coi_leakage = 0.0 ep_coi_leakage = 0.0
ep_volatility = 0.0 ep_volatility = 0.0
ep_upward_volatility = 0.0
ep_supra_share = 0.0
ep_supra_penalty = 0.0
ep_agent_prob = 0.0 ep_agent_prob = 0.0
steps = 0 steps = 0
@@ -74,6 +84,15 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]:
ep_coi += float(econ.get("coi_level", 0.0)) ep_coi += float(econ.get("coi_level", 0.0))
ep_coi_leakage += float(econ.get("coi_leakage", 0.0)) ep_coi_leakage += float(econ.get("coi_leakage", 0.0))
ep_volatility += float(econ.get("volatility", 0.0)) ep_volatility += float(econ.get("volatility", 0.0))
ep_upward_volatility += float(
info.get("upward_volatility", econ.get("upward_volatility", 0.0))
)
ep_supra_share += float(
info.get("supra_share", econ.get("supra_share", 0.0))
)
ep_supra_penalty += float(
info.get("supra_penalty", econ.get("supra_penalty", 0.0))
)
ep_agent_prob += float(econ.get("agent_prob", info.get("agent_prob", 0.0))) ep_agent_prob += float(econ.get("agent_prob", info.get("agent_prob", 0.0)))
steps += 1 steps += 1
@@ -84,6 +103,9 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]:
coi_levels.append(ep_coi / denom) coi_levels.append(ep_coi / denom)
coi_leakages.append(ep_coi_leakage / denom) coi_leakages.append(ep_coi_leakage / denom)
volatilities.append(ep_volatility / denom) volatilities.append(ep_volatility / denom)
upward_volatilities.append(ep_upward_volatility / denom)
supra_shares.append(ep_supra_share / denom)
supra_penalties.append(ep_supra_penalty / denom)
agent_probs.append(ep_agent_prob / denom) agent_probs.append(ep_agent_prob / denom)
return { return {
@@ -95,6 +117,13 @@ def _evaluate_env(agent: Any, env: Any, episodes: int) -> dict[str, float]:
"eval/coi_level_mean": float(np.mean(coi_levels)) if coi_levels else 0.0, "eval/coi_level_mean": float(np.mean(coi_levels)) if coi_levels else 0.0,
"eval/coi_leakage_mean": float(np.mean(coi_leakages)) if coi_leakages else 0.0, "eval/coi_leakage_mean": float(np.mean(coi_leakages)) if coi_leakages else 0.0,
"eval/volatility_mean": float(np.mean(volatilities)) if volatilities else 0.0, "eval/volatility_mean": float(np.mean(volatilities)) if volatilities else 0.0,
"eval/upward_volatility_mean": (
float(np.mean(upward_volatilities)) if upward_volatilities else 0.0
),
"eval/supra_share_mean": float(np.mean(supra_shares)) if supra_shares else 0.0,
"eval/supra_penalty_mean": (
float(np.mean(supra_penalties)) if supra_penalties else 0.0
),
"eval/agent_prob_mean": float(np.mean(agent_probs)) if agent_probs else 0.0, "eval/agent_prob_mean": float(np.mean(agent_probs)) if agent_probs else 0.0,
} }
@@ -128,15 +157,15 @@ def evaluate(
shifted_env.close() shifted_env.close()
shifted_rows.append((tag, alpha, shifted_metrics)) shifted_rows.append((tag, alpha, shifted_metrics))
metrics["eval/robust_alpha_low"] = low_alpha metrics["eval/stress_alpha_low"] = low_alpha
metrics["eval/robust_alpha_high"] = high_alpha metrics["eval/stress_alpha_high"] = high_alpha
metrics["eval/robust_reward_worst"] = float( metrics["eval/stress_reward_worst"] = float(
min(row[2]["eval/reward_mean"] for row in shifted_rows) min(row[2]["eval/reward_mean"] for row in shifted_rows)
) )
metrics["eval/robust_revenue_worst"] = float( metrics["eval/stress_revenue_worst"] = float(
min(row[2]["eval/revenue_mean"] for row in shifted_rows) min(row[2]["eval/revenue_mean"] for row in shifted_rows)
) )
metrics["eval/robust_coi_leakage_worst"] = float( metrics["eval/stress_coi_leakage_worst"] = float(
max(row[2]["eval/coi_leakage_mean"] for row in shifted_rows) max(row[2]["eval/coi_leakage_mean"] for row in shifted_rows)
) )
for tag, alpha, shifted_metrics in shifted_rows: for tag, alpha, shifted_metrics in shifted_rows:

View File

@@ -80,7 +80,11 @@ def train_qtable(
"train/global_step": int(steps), "train/global_step": int(steps),
} }
if wandb_live: if wandb_live:
try:
wandb.log(dict(event), step=step_offset + int(steps)) wandb.log(dict(event), step=step_offset + int(steps))
except Exception:
wandb_live = False
train_events.append(event)
else: else:
train_events.append(event) train_events.append(event)
if console_progress: if console_progress:
@@ -113,7 +117,11 @@ def train_qtable(
"train/global_step": int(steps), "train/global_step": int(steps),
} }
if wandb_live: if wandb_live:
try:
wandb.log(dict(tail_event), step=step_offset + int(steps)) wandb.log(dict(tail_event), step=step_offset + int(steps))
except Exception:
wandb_live = False
train_events.append(tail_event)
else: else:
train_events.append(tail_event) train_events.append(tail_event)

View File

@@ -1,10 +1,12 @@
from __future__ import annotations from __future__ import annotations
import json import json
import os
from pathlib import Path from pathlib import Path
from typing import Any, Mapping from typing import Any, Mapping
from ..lib.callbacks import MetricsCallback from ..lib.callbacks import EvalMetricsCallback, MetricsCallback
from ..wandb_checkpoint import checkpoint_artifact_name, log_checkpoint_file
from .common import evaluate, make_env from .common import evaluate, make_env
@@ -117,7 +119,6 @@ def build_model(cfg: Mapping[str, Any], env: Any):
def train_sb3(cfg: Mapping[str, Any]) -> tuple[object, dict[str, Any]]: def train_sb3(cfg: Mapping[str, Any]) -> tuple[object, dict[str, Any]]:
try: try:
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor from stable_baselines3.common.monitor import Monitor
except ImportError as exc: except ImportError as exc:
raise ImportError("stable-baselines3 is required for SB3 models") from exc raise ImportError("stable-baselines3 is required for SB3 models") from exc
@@ -144,20 +145,20 @@ def train_sb3(cfg: Mapping[str, Any]) -> tuple[object, dict[str, Any]]:
pass pass
metrics_callback = MetricsCallback( metrics_callback = MetricsCallback(
log_histograms=False, log_histograms=True,
log_freq=int(cfg["log_freq"]), log_freq=int(cfg["log_freq"]),
hist_freq=int(cfg.get("hist_freq", 500)),
step_offset=int(cfg.get("wandb_step_offset", 0)), step_offset=int(cfg.get("wandb_step_offset", 0)),
) )
callbacks = [metrics_callback] eval_callback = EvalMetricsCallback(
callbacks.append(
EvalCallback(
eval_env, eval_env,
eval_freq=int(cfg["eval_freq"]), eval_freq=int(cfg["eval_freq"]),
n_eval_episodes=int(cfg["eval_episodes"]), n_eval_episodes=int(cfg["eval_episodes"]),
step_offset=int(cfg.get("wandb_step_offset", 0)),
deterministic=True, deterministic=True,
verbose=0, verbose=0,
) )
) callbacks = [metrics_callback, eval_callback]
target_steps = int(cfg["total_timesteps"]) target_steps = int(cfg["total_timesteps"])
remaining_steps = max(0, target_steps - int(getattr(model, "num_timesteps", 0))) remaining_steps = max(0, target_steps - int(getattr(model, "num_timesteps", 0)))
@@ -173,6 +174,29 @@ def train_sb3(cfg: Mapping[str, Any]) -> tuple[object, dict[str, Any]]:
model_path = model_dir / f"phantom_{cfg['algo']}" model_path = model_dir / f"phantom_{cfg['algo']}"
model.save(str(model_path)) model.save(str(model_path))
artifact_name = checkpoint_artifact_name(
cfg,
backend="sb3",
sweep_id=os.getenv("WANDB_SWEEP_ID"),
)
artifact_logged = False
try:
artifact_logged = bool(
log_checkpoint_file(
artifact_name,
file_path=model_path.with_suffix(".zip"),
artifact_file_name="model.zip",
metadata={
"algo": str(cfg.get("algo", "ppo")),
"backend": "sb3",
"seed": int(cfg.get("seed", 0)),
"step": int(getattr(model, "num_timesteps", 0)),
},
)
)
except Exception:
artifact_logged = False
metrics: dict[str, Any] = evaluate( metrics: dict[str, Any] = evaluate(
model, model,
eval_env, eval_env,
@@ -181,7 +205,12 @@ def train_sb3(cfg: Mapping[str, Any]) -> tuple[object, dict[str, Any]]:
) )
metrics["train/global_step"] = int(model.num_timesteps) metrics["train/global_step"] = int(model.num_timesteps)
metrics["model/path"] = str(model_path.with_suffix(".zip")) metrics["model/path"] = str(model_path.with_suffix(".zip"))
metrics["_train_events"] = list(metrics_callback.events) metrics["model/artifact_name"] = str(artifact_name)
metrics["model/artifact_logged"] = float(artifact_logged)
metrics["_train_events"] = sorted(
[*metrics_callback.events, *eval_callback.events],
key=lambda event: int(event.get("train/global_step", 0)),
)
env.close() env.close()
eval_env.close() eval_env.close()

View File

@@ -1,12 +1,32 @@
from __future__ import annotations from __future__ import annotations
import os
import subprocess
import sys
import argparse import argparse
import json import json
import logging import logging
import os from datetime import datetime, timezone
from datetime import datetime, UTC
from pathlib import Path from pathlib import Path
# Clear stale TPU lock files on startup.
# Only attempted when /dev/accel0 exists (presumably the TPU device node —
# confirm); on other hosts this is a no-op.
if os.path.exists("/dev/accel0"):
    try:
        subprocess.run(
            ["rm", "-f", "/tmp/.libtpu_lockfile", "/tmp/libtpu_lockfile"],
            stderr=subprocess.DEVNULL,
            check=False,  # best effort: a non-zero exit from rm is ignored
        )
    except Exception:
        # Cleanup is optional (e.g. `rm` missing from PATH) and must never
        # block startup. Catch `Exception` rather than using a bare `except`
        # so KeyboardInterrupt and SystemExit still propagate.
        pass

# JAX is an optional dependency: when it is importable, enable the
# "jax_threefry_partitionable" config flag before anything else uses JAX.
try:
    import jax

    jax.config.update("jax_threefry_partitionable", True)
except ImportError:
    pass
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
import pandas as pd import pandas as pd
@@ -25,6 +45,10 @@ def _log(message: str) -> None:
logger.info(message) logger.info(message)
def _wandb_run_active() -> bool:
    """Report whether wandb is available and currently has an active run.

    Returns False when HAS_WANDB is falsy (without touching ``wandb``), or
    when ``wandb.run`` is missing or None.
    """
    if not HAS_WANDB:
        return False
    current_run = getattr(wandb, "run", None)
    return current_run is not None
def _parse_list(raw: str) -> list[str]: def _parse_list(raw: str) -> list[str]:
return [x.strip().lower() for x in str(raw).split(",") if x.strip()] return [x.strip().lower() for x in str(raw).split(",") if x.strip()]
@@ -41,6 +65,10 @@ def _truthy(value: str | bool | None) -> bool:
return str(value).strip().lower() in {"1", "true", "yes", "on"} return str(value).strip().lower() in {"1", "true", "yes", "on"}
def _mode_label_from_baseline(is_baseline: bool) -> str:
return "baseline" if bool(is_baseline) else "defended"
def _action(policy, obs: np.ndarray): def _action(policy, obs: np.ndarray):
out = policy.predict(obs, deterministic=True) out = policy.predict(obs, deterministic=True)
action = out[0] if isinstance(out, tuple) else out action = out[0] if isinstance(out, tuple) else out
@@ -146,7 +174,7 @@ def _log_train_events(
alpha: float, alpha: float,
step_offset: int, step_offset: int,
) -> int: ) -> int:
if not (HAS_WANDB and wandb.run is not None): if not _wandb_run_active():
return int(step_offset) return int(step_offset)
if not events: if not events:
return int(step_offset) return int(step_offset)
@@ -167,11 +195,14 @@ def _log_train_events(
"run.kind": "benchmark", "run.kind": "benchmark",
"runtime/backend": tier_name, "runtime/backend": tier_name,
"study/mode": mode_label, "study/mode": mode_label,
"study/no_robust": float(mode_label == "no_robust"), "study/baseline_mode": float(mode_label == "baseline"),
"study/alpha": float(alpha), "study/alpha": float(alpha),
} }
) )
try:
wandb.log(payload, step=cursor + rel_step) wandb.log(payload, step=cursor + rel_step)
except Exception:
return int(step_offset)
max_rel = max(max(1, int(evt.get("train/global_step", 0))) for evt in ordered) max_rel = max(max(1, int(evt.get("train/global_step", 0))) for evt in ordered)
return cursor + max_rel + 1 return cursor + max_rel + 1
@@ -183,6 +214,7 @@ def run_benchmark(
n_episodes: int, n_episodes: int,
mode_label: str, mode_label: str,
step_cursor_start: int = 0, step_cursor_start: int = 0,
eval_alpha_values: list[float] | None = None,
): ):
from .backends.common import make_env from .backends.common import make_env
@@ -219,14 +251,22 @@ def run_benchmark(
"dqn", "dqn",
}: }:
wandb_step_cursor += max(1, int(cfg.get("total_timesteps", 1))) + 1 wandb_step_cursor += max(1, int(cfg.get("total_timesteps", 1))) + 1
env = make_env({**cfg, "alpha": float(alpha)}) eval_targets = (
[float(value) for value in eval_alpha_values]
if eval_alpha_values
else [float(alpha)]
)
for eval_alpha in eval_targets:
env = make_env({**cfg, "alpha": float(eval_alpha)})
eps = [_run_eval_episode(env, policy) for _ in range(int(n_episodes))] eps = [_run_eval_episode(env, policy) for _ in range(int(n_episodes))]
env.close() env.close()
row = { row = {
"tier": tier_name, "tier": tier_name,
"mode": mode_label, "mode": mode_label,
"alpha": float(alpha), "alpha": float(eval_alpha),
"train_alpha": float(alpha),
"eval_alpha": float(eval_alpha),
"episodes": int(n_episodes), "episodes": int(n_episodes),
"mean_reward": float(np.mean([e["reward"] for e in eps])), "mean_reward": float(np.mean([e["reward"] for e in eps])),
"mean_revenue": float(np.mean([e["revenue"] for e in eps])), "mean_revenue": float(np.mean([e["revenue"] for e in eps])),
@@ -237,7 +277,8 @@ def run_benchmark(
row["objective_score"] = row["mean_reward"] row["objective_score"] = row["mean_reward"]
rows.append(row) rows.append(row)
_log( _log(
f"[{run_index}/{total_runs}] alpha={float(alpha):.2f} tier={tier_name}: " f"[{run_index}/{total_runs}] train_alpha={float(alpha):.2f} "
f"eval_alpha={float(eval_alpha):.2f} tier={tier_name}: "
f"reward={row['mean_reward']:.3f} revenue={row['mean_revenue']:.3f} " f"reward={row['mean_reward']:.3f} revenue={row['mean_revenue']:.3f} "
f"coi={row['mean_coi']:.4f} score={row['objective_score']:.3f}" f"coi={row['mean_coi']:.4f} score={row['objective_score']:.3f}"
) )
@@ -246,25 +287,32 @@ def run_benchmark(
step_means = [] step_means = []
for step in range(max_len): for step in range(max_len):
vals = [ vals = [
e["price_trace"][step] for e in eps if step < len(e["price_trace"]) e["price_trace"][step]
for e in eps
if step < len(e["price_trace"])
] ]
step_means.append(float(np.mean(vals)) if vals else np.nan) step_means.append(float(np.mean(vals)) if vals else np.nan)
traces.append( traces.append(
{ {
"tier": tier_name, "tier": tier_name,
"alpha": float(alpha), "alpha": float(eval_alpha),
"train_alpha": float(alpha),
"eval_alpha": float(eval_alpha),
"mean_price_trace": step_means, "mean_price_trace": step_means,
} }
) )
if HAS_WANDB and wandb.run is not None: if _wandb_run_active():
try:
wandb.log( wandb.log(
{ {
"run.kind": "benchmark", "run.kind": "benchmark",
"runtime/backend": tier_name, "runtime/backend": tier_name,
"study/mode": mode_label, "study/mode": mode_label,
"study/no_robust": float(mode_label == "no_robust"), "study/baseline_mode": float(mode_label == "baseline"),
"study/alpha": float(alpha), "study/alpha": float(eval_alpha),
"study/train_alpha": float(alpha),
"study/eval_alpha": float(eval_alpha),
"eval/reward_mean": row["mean_reward"], "eval/reward_mean": row["mean_reward"],
"eval/revenue_mean": row["mean_revenue"], "eval/revenue_mean": row["mean_revenue"],
"eval/margin_mean": row["mean_margin"], "eval/margin_mean": row["mean_margin"],
@@ -274,6 +322,8 @@ def run_benchmark(
}, },
step=wandb_step_cursor, step=wandb_step_cursor,
) )
except Exception:
pass
wandb_step_cursor += 1 wandb_step_cursor += 1
return pd.DataFrame(rows), traces, int(wandb_step_cursor) return pd.DataFrame(rows), traces, int(wandb_step_cursor)
@@ -358,7 +408,7 @@ def _run_with_args(args, compare_robust_override: bool | None = None):
if compare_robust_override is not None if compare_robust_override is not None
else _truthy(os.environ.get("PHANTOM_BENCHMARK_COMPARE_ROBUST")) else _truthy(os.environ.get("PHANTOM_BENCHMARK_COMPARE_ROBUST"))
) )
robust_modes = [False, True] if compare_robust else [bool(args.no_robust)] baseline_modes = [False, True] if compare_robust else [bool(args.no_robust)]
base_overrides = { base_overrides = {
"seed": args.seed, "seed": args.seed,
@@ -369,6 +419,7 @@ def _run_with_args(args, compare_robust_override: bool | None = None):
"robust_radius": args.robust_radius, "robust_radius": args.robust_radius,
"robust_points": args.robust_points, "robust_points": args.robust_points,
"robust_rollouts": args.robust_rollouts, "robust_rollouts": args.robust_rollouts,
"margin_floor": args.margin_floor,
"eta_ux": args.eta_ux, "eta_ux": args.eta_ux,
"reward_profit_weight": args.reward_profit_weight, "reward_profit_weight": args.reward_profit_weight,
"price_low": args.price_low, "price_low": args.price_low,
@@ -385,12 +436,20 @@ def _run_with_args(args, compare_robust_override: bool | None = None):
} }
tiers = _parse_list(args.tiers) tiers = _parse_list(args.tiers)
alpha_values = _parse_float_list(args.alpha_values) alpha_values = _parse_float_list(args.alpha_values)
eval_alpha_values = (
_parse_float_list(args.eval_alpha_values)
if str(getattr(args, "eval_alpha_values", "")).strip()
else []
)
_log( _log(
"starting run " "starting run "
+ json.dumps( + json.dumps(
{ {
"tiers": tiers, "tiers": tiers,
"alpha_values": alpha_values, "alpha_values": alpha_values,
"eval_alpha_values": (
eval_alpha_values if eval_alpha_values else alpha_values
),
"episodes": int(args.episodes), "episodes": int(args.episodes),
"total_timesteps": int(args.total_timesteps), "total_timesteps": int(args.total_timesteps),
"device": str(args.device), "device": str(args.device),
@@ -401,14 +460,14 @@ def _run_with_args(args, compare_robust_override: bool | None = None):
all_frames: list[pd.DataFrame] = [] all_frames: list[pd.DataFrame] = []
all_traces: list[dict] = [] all_traces: list[dict] = []
wandb_step_cursor = 0 wandb_step_cursor = 0
for no_robust in robust_modes: for baseline_mode in baseline_modes:
overrides = dict(base_overrides) overrides = dict(base_overrides)
overrides["no_robust"] = bool(no_robust) overrides["baseline_mode"] = bool(baseline_mode)
cfg = TrainSpec.from_flat( cfg = TrainSpec.from_flat(
{k: v for k, v in overrides.items() if v is not None} {k: v for k, v in overrides.items() if v is not None}
).to_flat_dict() ).to_flat_dict()
cfg["linear_warmup_steps"] = int(args.linear_warmup_steps) cfg["linear_warmup_steps"] = int(args.linear_warmup_steps)
mode_label = "no_robust" if no_robust else "robust" mode_label = _mode_label_from_baseline(bool(baseline_mode))
_log(f"mode={mode_label}: begin") _log(f"mode={mode_label}: begin")
df_mode, traces_mode, wandb_step_cursor = run_benchmark( df_mode, traces_mode, wandb_step_cursor = run_benchmark(
cfg, cfg,
@@ -417,6 +476,7 @@ def _run_with_args(args, compare_robust_override: bool | None = None):
args.episodes, args.episodes,
mode_label=mode_label, mode_label=mode_label,
step_cursor_start=wandb_step_cursor, step_cursor_start=wandb_step_cursor,
eval_alpha_values=eval_alpha_values,
) )
_log(f"mode={mode_label}: complete ({len(df_mode)} rows)") _log(f"mode={mode_label}: complete ({len(df_mode)} rows)")
for trace in traces_mode: for trace in traces_mode:
@@ -429,7 +489,7 @@ def _run_with_args(args, compare_robust_override: bool | None = None):
out_dir = Path(args.output_dir) out_dir = Path(args.output_dir)
out_dir.mkdir(parents=True, exist_ok=True) out_dir.mkdir(parents=True, exist_ok=True)
stamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S") stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
csv_path = out_dir / f"benchmark_{stamp}.csv" csv_path = out_dir / f"benchmark_{stamp}.csv"
trace_path = out_dir / f"benchmark_traces_{stamp}.json" trace_path = out_dir / f"benchmark_traces_{stamp}.json"
df.to_csv(csv_path, index=False) df.to_csv(csv_path, index=False)
@@ -445,7 +505,7 @@ def _run_with_args(args, compare_robust_override: bool | None = None):
+ json.dumps( + json.dumps(
{ {
"tier": best["tier"], "tier": best["tier"],
"mode": best.get("mode", "robust"), "mode": best.get("mode", "defended"),
"alpha": float(best["alpha"]), "alpha": float(best["alpha"]),
"objective_score": float(best["objective_score"]), "objective_score": float(best["objective_score"]),
"mean_revenue": float(best["mean_revenue"]), "mean_revenue": float(best["mean_revenue"]),
@@ -466,6 +526,7 @@ def run_cli(raw_args: list[str] | None = None):
parser.add_argument("--project", default="capstone") parser.add_argument("--project", default="capstone")
parser.add_argument("--tiers", default="static,surge,linear,qtable,ppo") parser.add_argument("--tiers", default="static,surge,linear,qtable,ppo")
parser.add_argument("--alpha-values", default="0.0,0.3,0.6") parser.add_argument("--alpha-values", default="0.0,0.3,0.6")
parser.add_argument("--eval-alpha-values", default="")
parser.add_argument("--episodes", type=int, default=10) parser.add_argument("--episodes", type=int, default=10)
parser.add_argument("--output-dir", default="engine/studies/results") parser.add_argument("--output-dir", default="engine/studies/results")
parser.add_argument("--seed", type=int, default=42) parser.add_argument("--seed", type=int, default=42)
@@ -476,6 +537,7 @@ def run_cli(raw_args: list[str] | None = None):
parser.add_argument("--robust-radius", type=float, default=0.15) parser.add_argument("--robust-radius", type=float, default=0.15)
parser.add_argument("--robust-points", type=int, default=5) parser.add_argument("--robust-points", type=int, default=5)
parser.add_argument("--robust-rollouts", type=int, default=1) parser.add_argument("--robust-rollouts", type=int, default=1)
parser.add_argument("--margin-floor", type=float, default=0.85)
parser.add_argument("--eta-ux", type=float, default=0.5) parser.add_argument("--eta-ux", type=float, default=0.5)
parser.add_argument("--reward-profit-weight", type=float, default=1.0) parser.add_argument("--reward-profit-weight", type=float, default=1.0)
parser.add_argument("--price-low", type=float, default=10.0) parser.add_argument("--price-low", type=float, default=10.0)
@@ -509,35 +571,47 @@ def run_cli(raw_args: list[str] | None = None):
key_to_attr = { key_to_attr = {
"tiers": "tiers", "tiers": "tiers",
"alpha_values": "alpha_values", "alpha_values": "alpha_values",
"eval_alpha_values": "eval_alpha_values",
"episodes": "episodes", "episodes": "episodes",
"total_timesteps": "total_timesteps", "total_timesteps": "total_timesteps",
"lambda_coi": "lambda_coi", "lambda_coi": "lambda_coi",
"robust_radius": "robust_radius", "robust_radius": "robust_radius",
"robust_points": "robust_points", "robust_points": "robust_points",
"robust_rollouts": "robust_rollouts", "robust_rollouts": "robust_rollouts",
"ambiguity_radius": "robust_radius",
"ambiguity_points": "robust_points",
"ambiguity_rollouts": "robust_rollouts",
"eta_ux": "eta_ux", "eta_ux": "eta_ux",
"reward_profit_weight": "reward_profit_weight", "reward_profit_weight": "reward_profit_weight",
"learning_rate": "learning_rate", "learning_rate": "learning_rate",
"batch_size": "batch_size", "batch_size": "batch_size",
"n_steps": "n_steps", "n_steps": "n_steps",
"baseline_mode": "no_robust",
"no_robust": "no_robust", "no_robust": "no_robust",
"margin_floor": "margin_floor",
"device": "device", "device": "device",
} }
for key in ( for key in (
"tiers", "tiers",
"alpha_values", "alpha_values",
"eval_alpha_values",
"episodes", "episodes",
"total_timesteps", "total_timesteps",
"lambda_coi", "lambda_coi",
"robust_radius", "robust_radius",
"robust_points", "robust_points",
"robust_rollouts", "robust_rollouts",
"ambiguity_radius",
"ambiguity_points",
"ambiguity_rollouts",
"eta_ux", "eta_ux",
"reward_profit_weight", "reward_profit_weight",
"learning_rate", "learning_rate",
"batch_size", "batch_size",
"n_steps", "n_steps",
"baseline_mode",
"no_robust", "no_robust",
"margin_floor",
"device", "device",
): ):
if key in wandb.config: if key in wandb.config:
@@ -560,18 +634,18 @@ def run_cli(raw_args: list[str] | None = None):
tiers = _parse_list(args.tiers) tiers = _parse_list(args.tiers)
alpha_values = _parse_float_list(args.alpha_values) alpha_values = _parse_float_list(args.alpha_values)
run_stamp = datetime.now(UTC).strftime("%m%d-%H%M%S") run_stamp = datetime.now(timezone.utc).strftime("%m%d-%H%M%S")
compare_enabled = _truthy(os.environ.get("PHANTOM_BENCHMARK_COMPARE_ROBUST")) compare_enabled = _truthy(os.environ.get("PHANTOM_BENCHMARK_COMPARE_ROBUST"))
compare_tag = "robust-compare" if compare_enabled else "single-mode" compare_tag = "defended-compare" if compare_enabled else "single-mode"
modes = ( modes = (
[("no_robust", True), ("robust", False)] [("baseline", True), ("defended", False)]
if compare_enabled if compare_enabled
else [("no_robust" if bool(args.no_robust) else "robust", bool(args.no_robust))] else [(_mode_label_from_baseline(bool(args.no_robust)), bool(args.no_robust))]
) )
run_idx = 0 run_idx = 0
for tier in tiers: for tier in tiers:
for mode_label, no_robust in modes: for mode_label, baseline_mode in modes:
for alpha in alpha_values: for alpha in alpha_values:
run_idx += 1 run_idx += 1
alpha_token = ( alpha_token = (
@@ -580,7 +654,7 @@ def run_cli(raw_args: list[str] | None = None):
tier_args = argparse.Namespace(**vars(args)) tier_args = argparse.Namespace(**vars(args))
tier_args.tiers = tier tier_args.tiers = tier
tier_args.alpha_values = str(float(alpha)) tier_args.alpha_values = str(float(alpha))
tier_args.no_robust = bool(no_robust) tier_args.no_robust = bool(baseline_mode)
run = wandb.init( run = wandb.init(
project=args.project, project=args.project,
name=( name=(
@@ -597,16 +671,19 @@ def run_cli(raw_args: list[str] | None = None):
"run.kind": "benchmark", "run.kind": "benchmark",
"runtime/backend": tier, "runtime/backend": tier,
"study/mode": mode_label, "study/mode": mode_label,
"study/no_robust": float(no_robust), "study/baseline_mode": float(baseline_mode),
"study/alpha": float(alpha), "study/alpha": float(alpha),
"tiers": tier, "tiers": tier,
"alpha_values": str(float(alpha)), "alpha_values": str(float(alpha)),
"eval_alpha_values": args.eval_alpha_values,
"episodes": args.episodes, "episodes": args.episodes,
"total_timesteps": args.total_timesteps, "total_timesteps": args.total_timesteps,
"lambda_coi": args.lambda_coi, "lambda_coi": args.lambda_coi,
"robust_radius": args.robust_radius, "ambiguity_radius": args.robust_radius,
"robust_points": args.robust_points, "ambiguity_points": args.robust_points,
"robust_rollouts": args.robust_rollouts, "ambiguity_rollouts": args.robust_rollouts,
"margin_floor": args.margin_floor,
"baseline_mode": float(baseline_mode),
"eta_ux": args.eta_ux, "eta_ux": args.eta_ux,
"reward_profit_weight": args.reward_profit_weight, "reward_profit_weight": args.reward_profit_weight,
"learning_rate": args.learning_rate, "learning_rate": args.learning_rate,

View File

@@ -48,7 +48,8 @@ class MarketEngine:
) )
human_transitions = get_adjusted_transitions(demand_h, human=True) human_transitions = get_adjusted_transitions(demand_h, human=True)
agent_transitions = get_adjusted_transitions(demand_a, human=False) agent_transitions = get_adjusted_transitions(demand_a, human=False)
# sample behavior trajectories from each demand distribution # sample N trajectories in parallel; each chain is independent so threads
# do not share state and numpy's per-call RNG is thread-safe
human_t = [ human_t = [
sample_behavior_from_transitions(human_transitions) sample_behavior_from_transitions(human_transitions)
for _ in range(self.Nhumans) for _ in range(self.Nhumans)
@@ -59,7 +60,25 @@ class MarketEngine:
] ]
# store trajectories for agent probability calculation # store trajectories for agent probability calculation
self.last_trajectories = human_t + agent_t self.last_trajectories = human_t + agent_t
return estimate_demand(self.last_trajectories, self.action_weights)
demand_proxy = estimate_demand(
self.last_trajectories,
self.action_weights,
normalize=True,
per_session=False,
)
raw_mix = ((1.0 - float(self.alpha)) * demand_h) + (
float(self.alpha) * demand_a
)
total_raw_demand = float(np.sum(raw_mix))
if not demand_proxy:
return {i: float(raw_mix[i]) for i in range(len(prices))}
if total_raw_demand <= 0.0:
return {i: 0.0 for i in range(len(prices))}
return {
i: total_raw_demand * float(demand_proxy.get(i, 0.0)) / 100.0
for i in range(len(prices))
}
def measure(self): def measure(self):
pass pass

3
engine/jax/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
from .robust import select_adversarial_alpha_jax, _JAX_OK
__all__ = ["select_adversarial_alpha_jax", "_JAX_OK"]

197
engine/jax/robust.py Normal file
View File

@@ -0,0 +1,197 @@
"""JAX-accelerated robust inner loop for PHANTOM.
provides a drop-in replacement for the sequential alpha-candidate evaluation in
wrapper.py::_select_adversarial_alpha. the demand generation and reward
computation are vmapped over the K candidate alpha values so all candidates are
evaluated in a single vectorized pass instead of K sequential Python calls.
public surface:
select_adversarial_alpha_jax(candidates, prices, human_params, agent_params,
noise_std, baseline_prices, lambda_coi, info_value,
reward_profit_weight, rng_seed=0)
-> (best_alpha: float, rewards: np.ndarray)
falls back gracefully when JAX is unavailable.
"""
from __future__ import annotations
import numpy as np
try:
import jax
import jax.numpy as jnp
from jax import vmap, jit
_JAX_OK = True
except ImportError:
_JAX_OK = False
_JAX_RUNTIME_OK = True
def _demand_for_actor_jax(prices, mean, std, noise_std, key):
"""d(p;theta) = max(0, val - price + noise), normalized to sum 100."""
k1, k2 = jax.random.split(key)
val = jax.random.normal(k1, shape=prices.shape) * std + mean
noise = jax.random.normal(k2, shape=prices.shape) * noise_std
demand = jnp.maximum(0.0, val - prices + noise)
total = demand.sum()
return jnp.where(total > 0, demand / total * 100.0, demand)
def _reward_for_candidate(
    alpha,
    prices,
    human_mean,
    human_std,
    agent_mean,
    agent_std,
    noise_std,
    baseline_prices,
    lambda_coi,
    info_value,
    reward_profit_weight,
    key,
):
    """Scalar reward for one alpha candidate, written in pure JAX so it can be vmapped.

    The demand is the alpha-weighted blend of a human and an agent demand
    sample; profit is revenue at ``prices`` minus the cost at
    ``baseline_prices``. The COI penalty uses ``alpha`` itself as the
    agent-probability proxy because no trajectory exists on this path.
    """
    human_key, agent_key = jax.random.split(key)
    human_demand = _demand_for_actor_jax(
        prices, human_mean, human_std, noise_std, human_key
    )
    agent_demand = _demand_for_actor_jax(
        prices, agent_mean, agent_std, noise_std, agent_key
    )
    mixed_demand = (1.0 - alpha) * human_demand + alpha * agent_demand

    floor_cost = jnp.dot(baseline_prices, mixed_demand)
    profit = jnp.dot(prices, mixed_demand) - floor_cost

    # Penalty scales with the floor cost, bounded below by 1.0.
    coi_penalty = lambda_coi * (alpha * info_value) * jnp.maximum(floor_cost, 1.0)
    return reward_profit_weight * profit - coi_penalty
if _JAX_OK:
    # Built once at import time. _reward_for_candidate takes 12 positional
    # arguments: alpha (first) and key (last) are mapped along axis 0, while
    # the ten arguments in between (prices, h_mean, h_std, a_mean, a_std,
    # noise_std, baseline_prices, lambda_coi, info_value,
    # reward_profit_weight) are shared by every candidate.
    _reward_batched = jit(
        vmap(_reward_for_candidate, in_axes=(0,) + (None,) * 10 + (0,))
    )
def select_adversarial_alpha_jax(
candidates: np.ndarray,
prices: np.ndarray,
human_params: tuple,
agent_params: tuple,
noise_std: float,
baseline_prices: np.ndarray,
lambda_coi: float,
info_value: float,
reward_profit_weight: float,
rng_seed: int = 0,
) -> tuple[float, np.ndarray]:
"""evaluate all alpha candidates in a single vmapped pass.
returns (best_alpha, rewards_array) where best_alpha minimizes reward
(worst case for the platform, driving robust policy training).
falls back to a pure-numpy sequential loop when JAX is unavailable so the
wrapper can call this function unconditionally.
"""
global _JAX_RUNTIME_OK
if not _JAX_OK or not _JAX_RUNTIME_OK:
return _fallback(
candidates,
prices,
human_params,
agent_params,
noise_std,
baseline_prices,
lambda_coi,
info_value,
reward_profit_weight,
)
try:
k = len(candidates)
key = jax.random.PRNGKey(rng_seed)
keys = jax.random.split(key, k)
rewards = np.asarray(
_reward_batched(
jnp.asarray(candidates, dtype=jnp.float32),
jnp.asarray(prices, dtype=jnp.float32),
float(human_params[0]),
float(human_params[1]),
float(agent_params[0]),
float(agent_params[1]),
float(noise_std),
jnp.asarray(baseline_prices, dtype=jnp.float32),
float(lambda_coi),
float(info_value),
float(reward_profit_weight),
keys,
)
)
best_idx = int(np.argmin(rewards))
return float(candidates[best_idx]), rewards
except Exception as exc:
# TPU contention / backend init failures can happen in distributed schedulers.
# Degrade to numpy path for the remainder of the process.
_JAX_RUNTIME_OK = False
print(f"PHANTOM_JAX_FALLBACK: {exc}")
return _fallback(
candidates,
prices,
human_params,
agent_params,
noise_std,
baseline_prices,
lambda_coi,
info_value,
reward_profit_weight,
)
def _fallback(
candidates,
prices,
human_params,
agent_params,
noise_std,
baseline_prices,
lambda_coi,
info_value,
reward_profit_weight,
):
"""numpy fallback matching the reward formula above."""
rewards = []
for alpha in candidates:
rng = np.random.default_rng()
val_h = rng.normal(*human_params, size=len(prices))
val_a = rng.normal(*agent_params, size=len(prices))
noise_h = rng.normal(0, noise_std, len(prices))
noise_a = rng.normal(0, noise_std, len(prices))
d_h = np.maximum(0, val_h - prices + noise_h)
d_a = np.maximum(0, val_a - prices + noise_a)
s_h, s_a = d_h.sum(), d_a.sum()
d_h = d_h / s_h * 100 if s_h > 0 else d_h
d_a = d_a / s_a * 100 if s_a > 0 else d_a
demand = (1.0 - alpha) * d_h + alpha * d_a
revenue = float(np.dot(prices, demand))
floor_cost = float(np.dot(baseline_prices, demand))
profit = revenue - floor_cost
coi_penalty = lambda_coi * alpha * info_value * max(floor_cost, 1.0)
rewards.append(reward_profit_weight * profit - coi_penalty)
rewards = np.array(rewards)
best_idx = int(np.argmin(rewards))
return float(candidates[best_idx]), rewards

View File

@@ -22,6 +22,9 @@ human_dir = str(base_dir / "collected_data")
agent_dir = str(base_dir / "agents" / "collected_data") agent_dir = str(base_dir / "agents" / "collected_data")
_cache = {} # lazy cache for models and base pivots _cache = {} # lazy cache for models and base pivots
# cache keyed by (human: bool, condition_tuple) so we skip Kronecker re-expansion
# for repeated calls with the same demand condition inside the robustness inner loop
_transition_cache: dict = {}
def _get_base_pivot(human: bool): def _get_base_pivot(human: bool):
@@ -68,22 +71,41 @@ def trajectory_to_events(trajectory: list) -> list:
"""extract event names from trajectory for KL divergence calculation """extract event names from trajectory for KL divergence calculation
trajectories are in format 'eventName_product0', extract just eventName trajectories are in format 'eventName_product0', extract just eventName
args:
trajectory: list like ['view_product0', 'add_to_cart_product1', 'checkout_product1']
returns:
list: event names like ['view', 'add_to_cart', 'checkout']
""" """
events = [] return [s.rsplit("_product", 1)[0] if "_product" in s else s for s in trajectory]
for state in trajectory:
# state format from sample_behavior: 'eventName_productX'
if "_product" in state: class _TransitionTable:
event = state.rsplit("_product", 1)[0] """numpy-backed transition table; replaces per-step pandas .loc[] indexing.
else:
event = state the profiling hotspot was DataFrame.xs called ~4-16k times per outer step.
events.append(event) converting once to a dense float32 array with an int-keyed state index map
return events reduces each row lookup to a single array slice with no pandas overhead.
rows are pre-normalized so sampling requires no per-step division.
"""
__slots__ = ("matrix", "states", "state_index", "n_states")
def __init__(self, df: pd.DataFrame):
self.states: list[str] = df.index.tolist()
self.state_index: dict[str, int] = {s: i for i, s in enumerate(self.states)}
# float64 throughout: float32 row-sums can drift enough to break np.random.choice
mat = np.nan_to_num(
df.values.astype(np.float64), nan=0.0, posinf=0.0, neginf=0.0
)
mat = np.clip(mat, 0.0, None)
row_sums = mat.sum(axis=1)
# dead rows (all zero) get uniform distribution so sampling never receives NaN
dead = row_sums <= 0
mat[dead] = 1.0
row_sums[dead] = float(mat.shape[1])
mat = mat / row_sums[:, np.newaxis]
# final nan guard in case fp still drifts
np.nan_to_num(mat, nan=0.0, copy=False)
row_sums2 = mat.sum(axis=1, keepdims=True)
row_sums2[row_sums2 <= 0] = 1.0
self.matrix: np.ndarray = mat / row_sums2
self.n_states: int = len(self.states)
def adjust_behavior_to_condition(condition, transition_matrix): def adjust_behavior_to_condition(condition, transition_matrix):
@@ -92,46 +114,73 @@ def adjust_behavior_to_condition(condition, transition_matrix):
condition = np.nan_to_num(condition, nan=0.0, posinf=0.0, neginf=0.0) condition = np.nan_to_num(condition, nan=0.0, posinf=0.0, neginf=0.0)
condition = np.clip(condition, 0.0, None) condition = np.clip(condition, 0.0, None)
s = float(np.sum(condition)) s = float(np.sum(condition))
if not np.isfinite(s) or s <= 0: cond_norm = (
cond_norm = np.full(len(condition), 1.0 / max(len(condition), 1), dtype=float) condition / s
else: if np.isfinite(s) and s > 0
cond_norm = condition / s else np.full(len(condition), 1.0 / max(len(condition), 1), dtype=float)
)
n_products = len(condition) n_products = len(condition)
base_vals = transition_matrix.values base_vals = transition_matrix.values
base_cols, base_rows = ( base_cols, base_rows = (
transition_matrix.columns.tolist(), transition_matrix.columns.tolist(),
transition_matrix.index.tolist(), transition_matrix.index.tolist(),
) )
# expand via kronecker-like tiling: each cell becomes a P*P block weighted by outer product of cond_norm
expanded = np.kron(base_vals, np.outer(cond_norm, cond_norm)) expanded = np.kron(base_vals, np.outer(cond_norm, cond_norm))
new_cols = [f"{c}_product{p}" for c in base_cols for p in range(n_products)] new_cols = [f"{c}_product{p}" for c in base_cols for p in range(n_products)]
new_rows = [f"{r}_product{p}" for r in base_rows for p in range(n_products)] new_rows = [f"{r}_product{p}" for r in base_rows for p in range(n_products)]
return pd.DataFrame(expanded, index=new_rows, columns=new_cols) return pd.DataFrame(expanded, index=new_rows, columns=new_cols)
def get_adjusted_transitions(condition, human=True): def get_adjusted_transitions(condition, human=True) -> _TransitionTable:
"""return a _TransitionTable for the given demand condition.
results are cached by (human, rounded-condition) so that repeated calls with
the same condition inside the robustness inner loop (K candidates, same prices)
skip the Kronecker expansion entirely.
"""
condition = np.asarray(condition, dtype=float)
# round to 4 decimal places for cache key stability
cache_key = (human, tuple(np.round(condition, 4).tolist()))
if cache_key in _transition_cache:
return _transition_cache[cache_key]
# prevent OOM by capping cache size
if len(_transition_cache) > 100:
_transition_cache.clear()
base_pivot = _get_base_pivot(human) base_pivot = _get_base_pivot(human)
return adjust_behavior_to_condition(condition, base_pivot) df = adjust_behavior_to_condition(condition, base_pivot)
table = _TransitionTable(df)
_transition_cache[cache_key] = table
return table
def sample_behavior_from_transitions(adjusted_transitions, max_len=40): def clear_transition_cache():
trajectory = [np.random.choice(adjusted_transitions.index)] """drop cached transition tables; call between episodes if condition space is large."""
_transition_cache.clear()
def sample_behavior_from_transitions(table, max_len=40):
"""sample a Markov trajectory.
accepts _TransitionTable (fast path) or a legacy pandas DataFrame so existing
call sites that pass a DataFrame directly continue to work unchanged.
"""
if isinstance(table, pd.DataFrame):
table = _TransitionTable(table)
idx = np.random.randint(table.n_states)
trajectory = [table.states[idx]]
while len(trajectory) < max_len and "checkout" not in trajectory[-1]: while len(trajectory) < max_len and "checkout" not in trajectory[-1]:
probs = np.asarray(adjusted_transitions.loc[trajectory[-1]].values, dtype=float) row = table.matrix[table.state_index[trajectory[-1]]]
probs = np.nan_to_num(probs, nan=0.0, posinf=0.0, neginf=0.0) idx = int(np.random.choice(table.n_states, p=row))
probs = np.clip(probs, 0.0, None) trajectory.append(table.states[idx])
s = float(np.sum(probs))
sample = np.random.choice(
adjusted_transitions.columns, p=(probs / s) if s > 0 else None
)
trajectory.append(sample)
return trajectory return trajectory
def sample_behavior(condition, human=True, max_len=40): def sample_behavior(condition, human=True, max_len=40):
adjusted_transitions = get_adjusted_transitions(condition, human=human) table = get_adjusted_transitions(condition, human=human)
return sample_behavior_from_transitions(adjusted_transitions, max_len=max_len) return sample_behavior_from_transitions(table, max_len=max_len)
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -15,15 +15,19 @@ class MetricsCallback(BaseCallback):
self, self,
log_histograms: bool = False, log_histograms: bool = False,
log_freq: int = 100, log_freq: int = 100,
hist_freq: int = 500,
step_offset: int = 0, step_offset: int = 0,
verbose: int = 0, verbose: int = 0,
): ):
super().__init__(verbose) super().__init__(verbose)
self.log_histograms = log_histograms self.log_histograms = log_histograms
self.log_freq = max(1, int(log_freq)) self.log_freq = max(1, int(log_freq))
self.hist_freq = max(1, int(hist_freq))
self.step_offset = max(0, int(step_offset)) self.step_offset = max(0, int(step_offset))
self._wandb = get_wandb_module() self._wandb = get_wandb_module()
self._wandb_live = bool(self._wandb is not None and self._wandb.run is not None) self._wandb_live = bool(self._wandb is not None and self._wandb.run is not None)
self._price_samples: list[float] = []
self._demand_samples: list[float] = []
self._window_sums = { self._window_sums = {
"train/revenue_mean": 0.0, "train/revenue_mean": 0.0,
"train/margin_mean": 0.0, "train/margin_mean": 0.0,
@@ -74,9 +78,67 @@ class MetricsCallback(BaseCallback):
) )
self._window_count += 1 self._window_count += 1
def _flush(self, step: int) -> None: def _accumulate_histograms(self, info: dict[str, Any]) -> None:
if self._window_count <= 0: if not self.log_histograms:
return return
for key in ("effective_prices", "prices"):
if key not in info:
continue
try:
values = np.asarray(info.get(key), dtype=float).reshape(-1)
except Exception:
continue
if values.size <= 0:
continue
finite_values = values[np.isfinite(values)]
if finite_values.size > 0:
self._price_samples.extend(finite_values.tolist())
break
if "demand" in info:
try:
demand_values = np.asarray(info.get("demand"), dtype=float).reshape(-1)
except Exception:
demand_values = np.array([], dtype=float)
if demand_values.size > 0:
finite_demand = demand_values[np.isfinite(demand_values)]
if finite_demand.size > 0:
self._demand_samples.extend(finite_demand.tolist())
def _flush_histograms(self, step: int, force: bool = False) -> None:
if not self.log_histograms:
return
if not force and step % self.hist_freq != 0:
return
if not self._price_samples and not self._demand_samples:
return
if self._wandb is None:
self._price_samples.clear()
self._demand_samples.clear()
return
payload: dict[str, Any] = {}
if self._price_samples:
payload["train/price_dist"] = self._wandb.Histogram(
np.asarray(self._price_samples, dtype=np.float32)
)
if self._demand_samples:
payload["train/demand_dist"] = self._wandb.Histogram(
np.asarray(self._demand_samples, dtype=np.float32)
)
if payload and self._wandb_live:
try:
self._wandb.log(payload, step=self.step_offset + int(step))
except Exception:
self._wandb_live = False
self._price_samples.clear()
self._demand_samples.clear()
def _flush(self, step: int, *, force_hist: bool = False) -> None:
if self._window_count > 0:
denom = float(self._window_count) denom = float(self._window_count)
payload = { payload = {
key: (value / denom) key: (value / denom)
@@ -92,17 +154,24 @@ class MetricsCallback(BaseCallback):
} }
payload["train/global_step"] = int(step) payload["train/global_step"] = int(step)
if self._wandb_live: if self._wandb_live:
try:
self._wandb.log(dict(payload), step=self.step_offset + int(step)) self._wandb.log(dict(payload), step=self.step_offset + int(step))
except Exception:
self._wandb_live = False
self.events.append(payload)
else: else:
self.events.append(payload) self.events.append(payload)
for key in self._window_sums: for key in self._window_sums:
self._window_sums[key] = 0.0 self._window_sums[key] = 0.0
self._window_count = 0 self._window_count = 0
self._flush_histograms(step=step, force=force_hist)
def _on_step(self) -> bool: def _on_step(self) -> bool:
for info in self.locals.get("infos", []): for info in self.locals.get("infos", []):
if isinstance(info, dict): if isinstance(info, dict):
self._accumulate(info) self._accumulate(info)
self._accumulate_histograms(info)
if self.num_timesteps % self.log_freq == 0: if self.num_timesteps % self.log_freq == 0:
self._flush(step=self.num_timesteps) self._flush(step=self.num_timesteps)
@@ -110,39 +179,81 @@ class MetricsCallback(BaseCallback):
return True return True
def _on_training_end(self) -> None: def _on_training_end(self) -> None:
self._flush(step=self.num_timesteps) self._flush(step=self.num_timesteps, force_hist=True)
class EvalMetricsCallback(EvalCallback): class EvalMetricsCallback(EvalCallback):
"""Deterministic evaluation collector detached from logging backends.""" """Deterministic evaluation collector detached from logging backends."""
def __init__( def __init__(
self, eval_env, eval_freq: int = 1000, n_eval_episodes: int = 5, **kwargs self,
eval_env,
eval_freq: int = 1000,
n_eval_episodes: int = 5,
step_offset: int = 0,
**kwargs,
): ):
super().__init__( super().__init__(
eval_env, eval_freq=eval_freq, n_eval_episodes=n_eval_episodes, **kwargs eval_env, eval_freq=eval_freq, n_eval_episodes=n_eval_episodes, **kwargs
) )
self._eval_revenues: list[float] = [] self.step_offset = max(0, int(step_offset))
self._wandb = get_wandb_module()
self._wandb_live = bool(self._wandb is not None and self._wandb.run is not None)
self._eval_stats: dict[str, list[float]] = {
"eval/revenue_mean": [],
"eval/margin_mean": [],
"eval/coi_level_mean": [],
"eval/coi_leakage_mean": [],
"eval/volatility_mean": [],
"eval/agent_prob_mean": [],
}
self.events: list[dict[str, float | int]] = [] self.events: list[dict[str, float | int]] = []
def _on_step(self) -> bool: def _on_step(self) -> bool:
result = super()._on_step() result = super()._on_step()
if self.n_calls % self.eval_freq == 0 and hasattr(self, "last_mean_reward"): if self.n_calls % self.eval_freq == 0 and hasattr(self, "last_mean_reward"):
self.events.append( payload: dict[str, float | int] = {
{
"eval/reward_mean": float(self.last_mean_reward), "eval/reward_mean": float(self.last_mean_reward),
"eval/revenue_mean": float(np.mean(self._eval_revenues))
if self._eval_revenues
else 0.0,
"train/global_step": int(self.num_timesteps), "train/global_step": int(self.num_timesteps),
} }
for key, values in self._eval_stats.items():
payload[key] = float(np.mean(values)) if values else 0.0
if self._wandb_live:
try:
self._wandb.log(
dict(payload),
step=self.step_offset + int(self.num_timesteps),
) )
self._eval_revenues = [] except Exception:
self._wandb_live = False
self.events.append(payload)
else:
self.events.append(payload)
for values in self._eval_stats.values():
values.clear()
return result return result
def _log_success_callback(self, locals_: dict, globals_: dict) -> None: def _log_success_callback(self, locals_: dict, globals_: dict) -> None:
# called after each eval episode # called after each eval episode
info = locals_.get("info", {}) info = locals_.get("info", {})
if "economics" in info: econ = info.get("economics") if isinstance(info, dict) else None
self._eval_revenues.append(info["economics"]["revenue"]) if not isinstance(econ, dict):
return
self._eval_stats["eval/revenue_mean"].append(float(econ.get("revenue", 0.0)))
self._eval_stats["eval/margin_mean"].append(float(econ.get("margin", 0.0)))
self._eval_stats["eval/coi_level_mean"].append(
float(econ.get("coi_level", 0.0))
)
self._eval_stats["eval/coi_leakage_mean"].append(
float(econ.get("coi_leakage", 0.0))
)
self._eval_stats["eval/volatility_mean"].append(
float(econ.get("volatility", 0.0))
)
self._eval_stats["eval/agent_prob_mean"].append(
float(econ.get("agent_prob", 0.0))
)

View File

@@ -17,18 +17,32 @@ def generate_demand_for_actor(
params: tuple, params: tuple,
noise_std: float = 1.0, noise_std: float = 1.0,
distribution_method=np.random.normal, distribution_method=np.random.normal,
normalize: bool = False,
) -> np.ndarray: ) -> np.ndarray:
"""d(p;0) = max(0, valuation - price) + epsi for single actor type """d(p;0) = max(0, valuation - price) + epsi for single actor type
params: (mean, std) for valuation distribution D_H or D_A""" params: (mean, std) for valuation distribution D_H or D_A"""
val = distribution_method(*params, size=len(prices)) val = distribution_method(*params, size=len(prices))
noise = distribution_method(0, noise_std, len(prices)) noise = distribution_method(0, noise_std, len(prices))
demand = np.maximum(0, val - prices + noise) demand = np.maximum(0, val - prices + noise)
if not normalize:
return demand
total = np.sum(demand) total = np.sum(demand)
return demand / total * 100 if total > 0 else demand return demand / total * 100 if total > 0 else demand
def estimate_demand(trajectories, action_weights=None): def estimate_demand(
return estimate_weighted_demand(trajectories, action_weights) trajectories,
action_weights=None,
*,
normalize: bool = False,
per_session: bool = True,
):
return estimate_weighted_demand(
trajectories,
action_weights,
normalize=normalize,
per_session=per_session,
)
def _parse_event_state(state: str): def _parse_event_state(state: str):
@@ -50,7 +64,13 @@ def _weight_for_action(action: str, action_weights: dict) -> float:
return CATEGORY_WEIGHTS["nav"] return CATEGORY_WEIGHTS["nav"]
def estimate_weighted_demand(trajectories, action_weights=None): def estimate_weighted_demand(
trajectories,
action_weights=None,
*,
normalize: bool = False,
per_session: bool = True,
):
action_weights = ( action_weights = (
DEFAULT_ACTION_WEIGHTS if action_weights is None else action_weights DEFAULT_ACTION_WEIGHTS if action_weights is None else action_weights
) )
@@ -64,12 +84,20 @@ def estimate_weighted_demand(trajectories, action_weights=None):
if w <= 0: if w <= 0:
continue continue
scores[product_id] = scores.get(product_id, 0.0) + w scores[product_id] = scores.get(product_id, 0.0) + w
total = sum(scores.values()) if not scores:
return ( return {}
{pid: (score / total) * 100 for pid, score in scores.items()}
if total > 0 if per_session and len(trajectories) > 0:
else {} inv_n = 1.0 / float(len(trajectories))
) scores = {pid: score * inv_n for pid, score in scores.items()}
if not normalize:
return scores
total = float(sum(scores.values()))
if total <= 0:
return {}
return {pid: (score / total) * 100.0 for pid, score in scores.items()}
# Example usage # Example usage

View File

@@ -156,6 +156,7 @@ class ProviderBenchmark:
# log to wandb if available # log to wandb if available
if HAS_WANDB and wandb.run is not None: if HAS_WANDB and wandb.run is not None:
try:
wandb.log( wandb.log(
{ {
f"benchmark/{name}/revenue": result.mean_revenue, f"benchmark/{name}/revenue": result.mean_revenue,
@@ -164,6 +165,8 @@ class ProviderBenchmark:
"benchmark/alpha": alpha, "benchmark/alpha": alpha,
} }
) )
except Exception:
pass
return self.results return self.results

View File

@@ -32,17 +32,23 @@ class EconomicMetricsWrapper(gym.Wrapper):
obs, reward, terminated, truncated, info = self.env.step(action) obs, reward, terminated, truncated, info = self.env.step(action)
# extract from unwrapped env # extract from unwrapped env
prices = self.env.unwrapped._prices quoted_prices = np.asarray(self.env.unwrapped._prices, dtype=float)
effective_prices = np.asarray(
info.get("effective_prices", quoted_prices), dtype=float
)
if effective_prices.shape != quoted_prices.shape:
effective_prices = quoted_prices
demand_dict = self.env.unwrapped._demand demand_dict = self.env.unwrapped._demand
demand = np.array([demand_dict.get(i, 0.0) for i in range(len(prices))]) demand = np.array([demand_dict.get(i, 0.0) for i in range(len(quoted_prices))])
# core calculations # core calculations
revenue = float(np.sum(prices * demand)) revenue = float(info.get("revenue", np.sum(effective_prices * demand)))
avg_price = float(np.mean(prices)) quoted_revenue = float(np.sum(quoted_prices * demand))
avg_price = float(np.mean(effective_prices))
margin = (avg_price - self.p_min) / max(avg_price, 1e-6) margin = (avg_price - self.p_min) / max(avg_price, 1e-6)
coi_level = avg_price - self.p_min # E[P] - p_min per thesis Def 1 coi_level = avg_price - self.p_min # E[P] - p_min per thesis Def 1
self._price_history.append(prices.copy()) self._price_history.append(effective_prices.copy())
self._revenue_history.append(revenue) self._revenue_history.append(revenue)
# regret vs baseline (golden path) # regret vs baseline (golden path)
@@ -53,6 +59,7 @@ class EconomicMetricsWrapper(gym.Wrapper):
# inject structured metrics into info # inject structured metrics into info
info["economics"] = { info["economics"] = {
"revenue": revenue, "revenue": revenue,
"quoted_revenue": quoted_revenue,
"margin": margin, "margin": margin,
"coi_level": coi_level, "coi_level": coi_level,
"regret": regret, "regret": regret,
@@ -64,6 +71,10 @@ class EconomicMetricsWrapper(gym.Wrapper):
"coi_penalty", "coi_penalty",
"ux_penalty", "ux_penalty",
"volatility", "volatility",
"upward_volatility",
"supra_penalty",
"supra_share",
"competitive_anchor",
"profit", "profit",
"cost_floor", "cost_floor",
"reward_revenue", "reward_revenue",
@@ -71,10 +82,13 @@ class EconomicMetricsWrapper(gym.Wrapper):
"agent_prob", "agent_prob",
"alpha_adv", "alpha_adv",
"alpha_nominal", "alpha_nominal",
"erosion_share",
"effective_price_mean",
): ):
if key in info: if key in info:
info["economics"][key] = info[key] info["economics"][key] = info[key]
info["prices"] = prices.copy() info["prices"] = quoted_prices.copy()
info["effective_prices"] = effective_prices.copy()
info["demand"] = demand.copy() info["demand"] = demand.copy()
return obs, reward, terminated, truncated, info return obs, reward, terminated, truncated, info

View File

@@ -9,6 +9,7 @@ from ..telemetry.wandb import (
get_wandb_module, get_wandb_module,
init_run, init_run,
run_agent, run_agent,
update_summary,
) )
from .train import run_with_active_sweep_run from .train import run_with_active_sweep_run
@@ -43,6 +44,7 @@ def run_sweep_agent(
spec = TrainSpec.from_flat(merged) spec = TrainSpec.from_flat(merged)
if run is not None: if run is not None:
run.name = run_name(spec, kind=kind, scenario=scenario) run.name = run_name(spec, kind=kind, scenario=scenario)
try:
run_with_active_sweep_run( run_with_active_sweep_run(
spec, spec,
kind=kind, kind=kind,
@@ -50,6 +52,15 @@ def run_sweep_agent(
group=group, group=group,
extra_tags=extra_tags, extra_tags=extra_tags,
) )
update_summary({"run/status": "finished"})
except Exception as exc:
update_summary(
{
"run/status": "crashed",
"run/error": str(exc),
}
)
raise
finally: finally:
finish_run() finish_run()

View File

@@ -20,7 +20,7 @@ def _tags_for_run(spec: TrainSpec, kind: str, extra_tags: Sequence[str]) -> list
kind, kind,
spec.algorithm.name, spec.algorithm.name,
spec.runtime.backend, spec.runtime.backend,
"vanilla" if spec.study.no_robust else "robust", "baseline" if spec.study.no_robust else "defended",
] ]
tags.extend([tag for tag in extra_tags if tag]) tags.extend([tag for tag in extra_tags if tag])
return tags return tags

View File

@@ -91,6 +91,44 @@
"command": "bash scripts/nx_research.sh docker-train-publish", "command": "bash scripts/nx_research.sh docker-train-publish",
"cwd": "." "cwd": "."
} }
},
"whoclicked-publish": {
"executor": "nx:run-commands",
"dependsOn": [
"install"
],
"options": {
"command": "bash scripts/nx_research.sh whoclicked-publish",
"cwd": "."
}
},
"tpu-ray-bootstrap": {
"executor": "nx:run-commands",
"options": {
"command": "bash scripts/nx_research.sh tpu-ray-bootstrap",
"cwd": "."
}
},
"tpu-ray-deps": {
"executor": "nx:run-commands",
"options": {
"command": "bash scripts/nx_research.sh tpu-ray-deps",
"cwd": "."
}
},
"tpu-ray-verify": {
"executor": "nx:run-commands",
"options": {
"command": "bash scripts/nx_research.sh tpu-ray-verify",
"cwd": "."
}
},
"tpu-ray-teardown": {
"executor": "nx:run-commands",
"options": {
"command": "bash scripts/nx_research.sh tpu-ray-teardown",
"cwd": "."
}
} }
}, },
"tags": [ "tags": [

View File

@@ -32,10 +32,17 @@ def _normalize_keys(raw: Mapping[str, Any]) -> dict[str, Any]:
"study.robust_radius": "robust_radius", "study.robust_radius": "robust_radius",
"study.robust_points": "robust_points", "study.robust_points": "robust_points",
"study.robust_rollouts": "robust_rollouts", "study.robust_rollouts": "robust_rollouts",
"study.ambiguity_radius": "robust_radius",
"study.ambiguity_points": "robust_points",
"study.ambiguity_rollouts": "robust_rollouts",
"study.info_value": "info_value", "study.info_value": "info_value",
"study.eta_ux": "eta_ux", "study.eta_ux": "eta_ux",
"study.reward_profit_weight": "reward_profit_weight", "study.reward_profit_weight": "reward_profit_weight",
"study.revenue_weight": "revenue_weight", "ambiguity_radius": "robust_radius",
"ambiguity_points": "robust_points",
"ambiguity_rollouts": "robust_rollouts",
"baseline_mode": "no_robust",
"stress_eval_enabled": "robust_eval_enabled",
"optimizer.learning_rate": "learning_rate", "optimizer.learning_rate": "learning_rate",
"optimizer.gamma": "gamma", "optimizer.gamma": "gamma",
"optimizer.batch_size": "batch_size", "optimizer.batch_size": "batch_size",
@@ -45,6 +52,7 @@ def _normalize_keys(raw: Mapping[str, Any]) -> dict[str, Any]:
"runtime.seed": "seed", "runtime.seed": "seed",
"runtime.total_timesteps": "total_timesteps", "runtime.total_timesteps": "total_timesteps",
"runtime.checkpoint_interval": "checkpoint_interval", "runtime.checkpoint_interval": "checkpoint_interval",
"runtime.hist_freq": "hist_freq",
"eval.eval_freq": "eval_freq", "eval.eval_freq": "eval_freq",
"eval.eval_episodes": "eval_episodes", "eval.eval_episodes": "eval_episodes",
} }
@@ -72,6 +80,8 @@ class EnvSpec:
max_steps: int = 100 max_steps: int = 100
margin_floor: float = 0.05 margin_floor: float = 0.05
margin_floor_patience: int = 5 margin_floor_patience: int = 5
agent_mu: float = 45.0
agent_std: float = 15.0
@dataclass(frozen=True) @dataclass(frozen=True)
@@ -84,7 +94,6 @@ class StudySpec:
info_value: float = 1.0 info_value: float = 1.0
eta_ux: float = 0.5 eta_ux: float = 0.5
reward_profit_weight: float = 1.0 reward_profit_weight: float = 1.0
revenue_weight: float = 0.01
no_robust: bool = False no_robust: bool = False
@@ -126,6 +135,7 @@ class RuntimeSpec:
checkpoint_interval: int = 200_000 checkpoint_interval: int = 200_000
model_dir: str = "engine/models" model_dir: str = "engine/models"
log_freq: int = 100 log_freq: int = 100
hist_freq: int = 500
@dataclass(frozen=True) @dataclass(frozen=True)
@@ -157,6 +167,7 @@ class TrainSpec:
"backend": self.runtime.backend, "backend": self.runtime.backend,
"device": self.runtime.device, "device": self.runtime.device,
"checkpoint_interval": self.runtime.checkpoint_interval, "checkpoint_interval": self.runtime.checkpoint_interval,
"hist_freq": self.runtime.hist_freq,
"n_products": self.env.n_products, "n_products": self.env.n_products,
"N": self.env.n_sessions, "N": self.env.n_sessions,
"price_low": self.env.price_low, "price_low": self.env.price_low,
@@ -167,6 +178,8 @@ class TrainSpec:
"max_steps": self.env.max_steps, "max_steps": self.env.max_steps,
"margin_floor": self.env.margin_floor, "margin_floor": self.env.margin_floor,
"margin_floor_patience": self.env.margin_floor_patience, "margin_floor_patience": self.env.margin_floor_patience,
"agent_mu": self.env.agent_mu,
"agent_std": self.env.agent_std,
"alpha": self.study.alpha, "alpha": self.study.alpha,
"lambda_coi": self.study.lambda_coi, "lambda_coi": self.study.lambda_coi,
"robust_radius": self.study.robust_radius, "robust_radius": self.study.robust_radius,
@@ -175,7 +188,6 @@ class TrainSpec:
"info_value": self.study.info_value, "info_value": self.study.info_value,
"eta_ux": self.study.eta_ux, "eta_ux": self.study.eta_ux,
"reward_profit_weight": self.study.reward_profit_weight, "reward_profit_weight": self.study.reward_profit_weight,
"revenue_weight": self.study.revenue_weight,
"no_robust": self.study.no_robust, "no_robust": self.study.no_robust,
"learning_rate": self.optimizer.learning_rate, "learning_rate": self.optimizer.learning_rate,
"gamma": self.optimizer.gamma, "gamma": self.optimizer.gamma,
@@ -246,6 +258,8 @@ class TrainSpec:
max_steps=int(base["max_steps"]), max_steps=int(base["max_steps"]),
margin_floor=float(base["margin_floor"]), margin_floor=float(base["margin_floor"]),
margin_floor_patience=int(base["margin_floor_patience"]), margin_floor_patience=int(base["margin_floor_patience"]),
agent_mu=float(base.get("agent_mu", 45.0)),
agent_std=float(base.get("agent_std", 15.0)),
), ),
study=StudySpec( study=StudySpec(
alpha=float(base["alpha"]), alpha=float(base["alpha"]),
@@ -256,7 +270,6 @@ class TrainSpec:
info_value=float(base["info_value"]), info_value=float(base["info_value"]),
eta_ux=float(base["eta_ux"]), eta_ux=float(base["eta_ux"]),
reward_profit_weight=float(base["reward_profit_weight"]), reward_profit_weight=float(base["reward_profit_weight"]),
revenue_weight=float(base["revenue_weight"]),
no_robust=no_robust, no_robust=no_robust,
), ),
optimizer=OptimizerSpec( optimizer=OptimizerSpec(
@@ -294,6 +307,7 @@ class TrainSpec:
checkpoint_interval=int(base["checkpoint_interval"]), checkpoint_interval=int(base["checkpoint_interval"]),
model_dir=str(base["model_dir"]), model_dir=str(base["model_dir"]),
log_freq=int(base["log_freq"]), log_freq=int(base["log_freq"]),
hist_freq=int(base["hist_freq"]),
), ),
eval=EvalSpec( eval=EvalSpec(
eval_freq=int(base["eval_freq"]), eval_freq=int(base["eval_freq"]),
@@ -304,9 +318,11 @@ class TrainSpec:
def run_name(spec: TrainSpec, *, kind: str, scenario: str) -> str: def run_name(spec: TrainSpec, *, kind: str, scenario: str) -> str:
alpha_token = f"{float(spec.study.alpha):.2f}".rstrip("0").rstrip(".")
mode = "baseline" if bool(spec.study.no_robust) else "defended"
return ( return (
f"{kind}/{spec.algorithm.name}/{spec.runtime.backend}/" f"{kind}/{spec.algorithm.name}/{spec.runtime.backend}/"
f"{spec.runtime.device}/{scenario}/s{spec.runtime.seed}" f"{spec.runtime.device}/{scenario}/a{alpha_token}/{mode}/s{spec.runtime.seed}"
) )
@@ -318,6 +334,7 @@ def run_metadata(
group: str | None = None, group: str | None = None,
tags: Sequence[str] = (), tags: Sequence[str] = (),
) -> dict[str, Any]: ) -> dict[str, Any]:
mode = "baseline" if bool(spec.study.no_robust) else "defended"
metadata: dict[str, Any] = { metadata: dict[str, Any] = {
"run.kind": str(kind), "run.kind": str(kind),
"run.algo": spec.algorithm.name, "run.algo": spec.algorithm.name,
@@ -326,6 +343,10 @@ def run_metadata(
"run.scenario": str(scenario), "run.scenario": str(scenario),
"run.seed": spec.runtime.seed, "run.seed": spec.runtime.seed,
"run.tags": list(tags), "run.tags": list(tags),
"study/alpha": float(spec.study.alpha),
"study/mode": mode,
"study/baseline_mode": float(bool(spec.study.no_robust)),
"tiers": spec.algorithm.name,
} }
if group: if group:
metadata["run.group"] = group metadata["run.group"] = group

View File

@@ -0,0 +1,133 @@
"""validate core thesis problem: margin erosion under agent contamination
trains standard RL (no robust components) across α levels to demonstrate systematic failure
"""
from __future__ import annotations
import json, sys, time
from pathlib import Path
import numpy as np
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from engine.spec import TrainSpec
from engine.orchestrators import run_train_once
def _run_baseline(alpha: float, algo: str, seed: int, steps: int) -> dict:
    """Train one baseline (non-robust) run and return its evaluation metrics.

    The spec disables every robust component visible here (``lambda_coi=0.0``,
    ``robust_radius=0.0``, ``no_robust=True``, ``robust_eval_enabled=False``)
    and the run is launched with ``offline=True`` and ``no_wandb=True``.

    Args:
        alpha: contamination level forwarded to the spec as ``alpha``.
        algo: algorithm name forwarded to the spec as ``algo``.
        seed: RNG seed forwarded to the spec as ``seed``.
        steps: value used for ``total_timesteps``.

    Returns:
        A flat dict with ``alpha``, ``algo``, ``seed`` and five ``eval_*``
        metrics read from the training result; a metric missing from the
        result is reported as NaN.
    """
    spec = TrainSpec.from_flat(
        {
            "algo": algo,
            "seed": seed,
            "alpha": alpha,
            "total_timesteps": steps,
            "lambda_coi": 0.0,
            "robust_radius": 0.0,
            "robust_points": 1,
            "robust_rollouts": 1,
            "no_robust": True,
            "arch": "small",
            "n_products": 10,
            "N": 100,
            "max_steps": 50,
            "eval_freq": 5000,
            "eval_episodes": 10,
            "log_freq": 500,
            "robust_eval_enabled": False,
            "agent_mu": 12.0,
            "agent_std": 2.0,
        }
    )
    # Round instead of truncating: alpha * 100 frequently lands a hair below
    # the intended integer in binary floating point (0.29 * 100 ==
    # 28.999999999999996), and int() alone would label that run "alpha28".
    alpha_pct = int(round(float(alpha) * 100))
    result = run_train_once(
        spec,
        project="phantom-margin-erosion",
        offline=True,
        no_wandb=True,
        kind="study",
        scenario=f"alpha{alpha_pct:02d}",
        group=f"baseline_{algo}",
        extra_tags=("margin_erosion", "baseline"),
    )
    # NOTE(review): assumes run_train_once returns a mapping with .get() — confirm.
    return {
        "alpha": alpha,
        "algo": algo,
        "seed": seed,
        "eval_reward": result.get("eval/reward_mean", np.nan),
        "eval_revenue": result.get("eval/revenue_mean", np.nan),
        "eval_coi_level": result.get("eval/coi_level_mean", np.nan),
        "eval_margin": result.get("eval/margin_mean", np.nan),
        "eval_agent_prob": result.get("eval/agent_prob_mean", np.nan),
    }
def run_margin_erosion_study(
    alphas: list[float] | None = None,
    algos: list[str] | None = None,
    seeds: int = 3,
    steps: int = 30_000,
) -> dict:
    """Run the baseline grid (alpha x algo x seed), summarise per alpha, save JSON.

    Seeds are ``42, 43, ...`` (``seeds`` of them). The full output is written to
    ``results/margin_erosion_alpha_<timestamp>.json`` next to this script and
    also returned. Falsy ``alphas`` / ``algos`` fall back to the built-in defaults.
    """
    if not alphas:
        alphas = [0.1, 0.3, 0.5, 0.7, 0.9]
    if not algos:
        algos = ["ppo", "dqn", "qtable"]

    results_dir = Path(__file__).parent / "results"
    results_dir.mkdir(exist_ok=True)
    # Timestamp is taken before training starts, so it marks the study start.
    stamp = time.strftime("%Y%m%d_%H%M%S")

    # Same visiting order as three nested loops: alpha outermost, seed innermost.
    grid = [
        (level, algo, 42 + offset)
        for level in alphas
        for algo in algos
        for offset in range(seeds)
    ]
    results = []
    for level, algo, seed in grid:
        print(f"α={level:.1f} {algo} seed={seed}")
        row = _run_baseline(level, algo, seed, steps)
        results.append(row)
        print(
            f" margin={row['eval_margin']:.3f} rev={row['eval_revenue']:.0f} coi={row['eval_coi_level']:.1f}"
        )

    # Aggregate mean/std of each metric over every run whose alpha matches.
    summary = {}
    for level in alphas:
        matched = [row for row in results if abs(row["alpha"] - level) < 0.01]
        if not matched:
            continue
        stats = {}
        for metric in ("margin", "revenue", "coi_level", "agent_prob"):
            column = [row[f"eval_{metric}"] for row in matched]
            stats[f"{metric}_mean"] = float(np.mean(column))
            stats[f"{metric}_std"] = float(np.std(column))
        stats["n_runs"] = len(matched)
        summary[f"alpha_{level:.1f}"] = stats

    output = {
        "timestamp": stamp,
        "config": {"alphas": alphas, "algos": algos, "seeds": seeds, "steps": steps},
        "results": results,
        "summary": summary,
    }

    path = results_dir / f"margin_erosion_alpha_{stamp}.json"
    with open(path, "w") as handle:
        json.dump(output, handle, indent=2)

    print(f"\n{path}")
    for level in alphas:
        label = f"alpha_{level:.1f}"
        stats = summary.get(label)
        if stats is None:
            continue
        print(
            f" {label}: margin={stats['margin_mean']:.3f}±{stats['margin_std']:.3f} "
            f"coi={stats['coi_level_mean']:.1f}±{stats['coi_level_std']:.1f}"
        )
    return output
if __name__ == "__main__":
    import argparse

    # CLI entry point: --quick swaps the full grid for a small smoke-test grid.
    parser = argparse.ArgumentParser(description="margin erosion vs α")
    parser.add_argument("--quick", action="store_true", help="fast test")
    quick = parser.parse_args().quick

    if quick:
        study_kwargs = {
            "alphas": [0.1, 0.7],
            "algos": ["qtable"],
            "seeds": 1,
            "steps": 5_000,
        }
    else:
        study_kwargs = {
            "alphas": [0.1, 0.3, 0.5, 0.7, 0.9],
            "algos": ["ppo", "dqn", "qtable"],
            "seeds": 3,
            "steps": 30_000,
        }
    run_margin_erosion_study(**study_kwargs)

View File

@@ -0,0 +1,60 @@
# Sweep definition: exhaustive grid that maximizes eval/stress_reward_worst.
# Varied axes: seed (3) x alpha (6) x n_products (3) x no_robust (2) x
# lambda_coi (2) = 216 combinations; every other parameter is pinned.
# NOTE(review): nesting was reconstructed from a flattened listing and the
# layout looks like a W&B sweep config — confirm against the original file.
method: grid
metric:
  name: eval/stress_reward_worst
  goal: maximize
# Each run is launched as `${env} python -m engine.train`.
command:
  - ${env}
  - python
  - -m
  - engine.train
parameters:
  algo:
    value: ppo
  backend:
    value: sb3
  device:
    value: cpu
  # --- swept axes ---
  seed:
    values: [42, 1337, 7777]
  alpha:
    values: [0.1, 0.2, 0.3, 0.4, 0.6, 0.8]
  n_products:
    values: [25, 50, 100]
  N:
    value: 100
  # Both settings of no_robust are run for every other combination.
  no_robust:
    values: [false, true]
  lambda_coi:
    values: [0.15, 0.30]
  # --- pinned study / action parameters ---
  robust_radius:
    value: 0.2
  robust_points:
    value: 7
  robust_rollouts:
    value: 1
  eta_ux:
    value: 0.5
  reward_profit_weight:
    value: 1.0
  action_levels:
    value: 9
  action_scale_low:
    value: 0.8
  action_scale_high:
    value: 1.2
  # --- pinned runtime / evaluation / optimizer parameters ---
  total_timesteps:
    value: 100000
  eval_episodes:
    value: 12
  eval_freq:
    value: 1000
  log_freq:
    value: 100
  hist_freq:
    value: 500
  learning_rate:
    value: 0.0003
  batch_size:
    value: 256
  n_steps:
    value: 2048

View File

@@ -0,0 +1,53 @@
# Sweep definition: random search that minimizes eval/supra_share_mean,
# capped at 256 runs.
# NOTE(review): nesting (including run_cap at top level) was reconstructed
# from a flattened listing and the layout looks like a W&B sweep config —
# confirm against the original file.
method: random
metric:
  name: eval/supra_share_mean
  goal: minimize
run_cap: 256
# Each run is launched as `${env} python -m engine.train`.
command:
  - ${env}
  - python
  - -m
  - engine.train
parameters:
  algo:
    value: ppo
  # --- sampled axes ---
  seed:
    values: [42, 1337, 7777]
  alpha:
    values: [0.1, 0.2, 0.3, 0.4, 0.6]
  n_products:
    values: [25, 50]
  N:
    value: 100
  # Both settings of no_robust are in the sampling pool.
  no_robust:
    values: [false, true]
  lambda_coi:
    values: [0.05, 0.15, 0.3]
  robust_radius:
    values: [0.1, 0.2, 0.3]
  robust_points:
    value: 7
  robust_rollouts:
    value: 1
  eta_ux:
    values: [0.05, 0.15, 0.3, 0.5, 0.75]
  reward_profit_weight:
    value: 1.0
  # --- pinned runtime / evaluation / optimizer parameters ---
  total_timesteps:
    value: 100000
  eval_episodes:
    value: 10
  eval_freq:
    value: 1000
  log_freq:
    value: 100
  hist_freq:
    value: 500
  learning_rate:
    value: 0.0003
  batch_size:
    value: 256
  n_steps:
    value: 2048
  device:
    value: cpu

View File

@@ -36,7 +36,12 @@ def canonicalize_metrics(raw: Mapping[str, Any], spec: TrainSpec) -> dict[str, A
eval_reward = ( eval_reward = (
_as_float( _as_float(
metrics.get("eval/robust_reward_worst", metrics.get("eval/reward_mean")), metrics.get(
"eval/stress_reward_worst",
metrics.get(
"eval/robust_reward_worst", metrics.get("eval/reward_mean")
),
),
0.0, 0.0,
) )
or 0.0 or 0.0
@@ -51,9 +56,12 @@ def canonicalize_metrics(raw: Mapping[str, Any], spec: TrainSpec) -> dict[str, A
metrics["objective/coi_preserved"] = 0.0 if coi_level is None else coi_level metrics["objective/coi_preserved"] = 0.0 if coi_level is None else coi_level
metrics["study/alpha"] = spec.study.alpha metrics["study/alpha"] = spec.study.alpha
metrics["study/mode"] = "baseline" if bool(spec.study.no_robust) else "defended"
metrics["study/baseline_mode"] = float(bool(spec.study.no_robust))
metrics["study/lambda_coi"] = spec.study.lambda_coi metrics["study/lambda_coi"] = spec.study.lambda_coi
metrics["study/robust_radius"] = spec.study.robust_radius metrics["study/ambiguity_radius"] = spec.study.robust_radius
metrics["study/info_value"] = spec.study.info_value metrics["study/info_value"] = spec.study.info_value
metrics["tiers"] = spec.algorithm.name
metrics["runtime/backend"] = spec.runtime.backend metrics["runtime/backend"] = spec.runtime.backend
metrics["runtime/device"] = spec.runtime.device metrics["runtime/device"] = spec.runtime.device

View File

@@ -1,5 +1,7 @@
from __future__ import annotations from __future__ import annotations
import os
import time
from typing import Any, Callable, Iterable, Mapping from typing import Any, Callable, Iterable, Mapping
@@ -19,6 +21,42 @@ def _require_wandb():
return wandb return wandb
def _warn(message: str) -> None:
print(f"PHANTOM_WANDB_WARNING: {message}")
def _sanitize_key(raw_key: str) -> str | None:
key = str(raw_key)
replacements = {
"no_robust": "baseline_mode",
"study/no_robust": "study/baseline_mode",
"study/robust_radius": "study/ambiguity_radius",
"robust_radius": "ambiguity_radius",
"robust_points": "ambiguity_points",
"robust_rollouts": "ambiguity_rollouts",
"robust_eval_enabled": "stress_eval_enabled",
"eval/robust_alpha_high": "eval/stress_alpha_high",
"eval/robust_alpha_low": "eval/stress_alpha_low",
"eval/robust_reward_worst": "eval/stress_reward_worst",
"eval/robust_revenue_worst": "eval/stress_revenue_worst",
"eval/robust_coi_leakage_worst": "eval/stress_coi_leakage_worst",
}
key = replacements.get(key, key)
if "robust" in key.lower():
return None
return key
def _sanitize_payload(payload: Mapping[str, Any]) -> dict[str, Any]:
    """Return a copy of ``payload`` with every key passed through ``_sanitize_key``.

    Entries whose key sanitizes to ``None`` are left out; values are carried
    over untouched. If two source keys map to the same sanitized key, the one
    iterated later wins.
    """
    renamed = ((_sanitize_key(str(name)), item) for name, item in payload.items())
    return {name: item for name, item in renamed if name is not None}
def init_run( def init_run(
*, *,
mode: str, mode: str,
@@ -34,7 +72,11 @@ def init_run(
if group: if group:
kwargs["group"] = group kwargs["group"] = group
if sweep_mode: if sweep_mode:
try:
run = wandb.init(**kwargs) run = wandb.init(**kwargs)
except Exception as exc:
_warn(f"init failed in sweep mode ({exc})")
return None
if name and run is not None: if name and run is not None:
run.name = name run.name = name
return run return run
@@ -42,18 +84,25 @@ def init_run(
init_kwargs = dict(kwargs) init_kwargs = dict(kwargs)
init_kwargs["project"] = project init_kwargs["project"] = project
if config is not None: if config is not None:
init_kwargs["config"] = dict(config) init_kwargs["config"] = _sanitize_payload(dict(config))
if name: if name:
init_kwargs["name"] = name init_kwargs["name"] = name
if tags: if tags:
init_kwargs["tags"] = list(tags) init_kwargs["tags"] = list(tags)
try:
return wandb.init(**init_kwargs) return wandb.init(**init_kwargs)
except Exception as exc:
_warn(f"init failed ({exc})")
return None
def finish_run() -> None: def finish_run() -> None:
wandb = get_wandb_module() wandb = get_wandb_module()
if wandb is not None and wandb.run is not None: if wandb is not None and wandb.run is not None:
try:
wandb.finish() wandb.finish()
except Exception as exc:
_warn(f"finish failed ({exc})")
def current_config() -> dict[str, Any]: def current_config() -> dict[str, Any]:
@@ -67,25 +116,45 @@ def update_run_config(config: Mapping[str, Any]) -> None:
wandb = get_wandb_module() wandb = get_wandb_module()
if wandb is None or wandb.run is None: if wandb is None or wandb.run is None:
return return
payload = _sanitize_payload(dict(config))
if not payload:
return
try: try:
wandb.config.update(dict(config), allow_val_change=True) wandb.config.update(payload, allow_val_change=True)
except TypeError: except TypeError:
wandb.config.update(dict(config)) try:
wandb.config.update(payload)
except Exception as exc:
_warn(f"config update failed ({exc})")
except Exception as exc:
_warn(f"config update failed ({exc})")
def log_metrics(metrics: Mapping[str, Any], *, step: int) -> None: def log_metrics(metrics: Mapping[str, Any], *, step: int) -> None:
wandb = get_wandb_module() wandb = get_wandb_module()
if wandb is None or wandb.run is None: if wandb is None or wandb.run is None:
return return
wandb.log(dict(metrics), step=step) payload = _sanitize_payload(dict(metrics))
if not payload:
return
try:
wandb.log(payload, step=step)
except Exception as exc:
_warn(f"log failed at step {step} ({exc})")
def update_summary(metrics: Mapping[str, Any]) -> None: def update_summary(metrics: Mapping[str, Any]) -> None:
wandb = get_wandb_module() wandb = get_wandb_module()
if wandb is None or wandb.run is None: if wandb is None or wandb.run is None:
return return
for key, value in metrics.items(): payload = _sanitize_payload(dict(metrics))
if not payload:
return
try:
for key, value in payload.items():
wandb.run.summary[key] = value wandb.run.summary[key] = value
except Exception as exc:
_warn(f"summary update failed ({exc})")
def run_agent( def run_agent(
@@ -95,4 +164,39 @@ def run_agent(
count: int | None = None, count: int | None = None,
) -> None: ) -> None:
wandb = _require_wandb() wandb = _require_wandb()
wandb.agent(sweep_id, function=fn, count=count) retry_max = max(0, int(os.getenv("PHANTOM_WANDB_AGENT_RETRIES", "8")))
retry_delay = max(1.0, float(os.getenv("PHANTOM_WANDB_AGENT_RETRY_DELAY", "5")))
retry_backoff = max(
1.0, float(os.getenv("PHANTOM_WANDB_AGENT_RETRY_BACKOFF", "1.5"))
)
retry_max_delay = max(
retry_delay,
float(os.getenv("PHANTOM_WANDB_AGENT_MAX_RETRY_DELAY", "60")),
)
target = None if count is None else max(0, int(count))
completed = 0
def _wrapped() -> None:
nonlocal completed
fn()
completed += 1
attempt = 0
while True:
remaining = None if target is None else max(0, int(target - completed))
if target is not None and remaining == 0:
return
try:
wandb.agent(sweep_id, function=_wrapped, count=remaining)
return
except Exception as exc:
attempt += 1
if attempt > retry_max:
raise
wait = min(retry_max_delay, retry_delay * (retry_backoff ** (attempt - 1)))
_warn(
f"agent disconnected (attempt {attempt}/{retry_max}, "
f"completed={completed}, remaining={remaining}): {exc}"
)
time.sleep(wait)

View File

@@ -54,6 +54,7 @@ def _build_parser() -> argparse.ArgumentParser:
parser.add_argument("--total-timesteps", type=int) parser.add_argument("--total-timesteps", type=int)
parser.add_argument("--model-dir", type=str) parser.add_argument("--model-dir", type=str)
parser.add_argument("--log-freq", type=int) parser.add_argument("--log-freq", type=int)
parser.add_argument("--hist-freq", type=int)
parser.add_argument("--checkpoint-interval", type=int) parser.add_argument("--checkpoint-interval", type=int)
parser.add_argument("--device", type=str) parser.add_argument("--device", type=str)
@@ -68,7 +69,6 @@ def _build_parser() -> argparse.ArgumentParser:
parser.add_argument("--no-robust", action="store_true") parser.add_argument("--no-robust", action="store_true")
parser.add_argument("--eta-ux", type=float) parser.add_argument("--eta-ux", type=float)
parser.add_argument("--reward-profit-weight", type=float) parser.add_argument("--reward-profit-weight", type=float)
parser.add_argument("--revenue-weight", type=float)
parser.add_argument("--price-low", type=float) parser.add_argument("--price-low", type=float)
parser.add_argument("--price-high", type=float) parser.add_argument("--price-high", type=float)
@@ -126,6 +126,7 @@ def _overrides_from_args(args: argparse.Namespace) -> dict[str, Any]:
"total_timesteps": args.total_timesteps, "total_timesteps": args.total_timesteps,
"model_dir": args.model_dir, "model_dir": args.model_dir,
"log_freq": args.log_freq, "log_freq": args.log_freq,
"hist_freq": args.hist_freq,
"checkpoint_interval": args.checkpoint_interval, "checkpoint_interval": args.checkpoint_interval,
"device": args.device, "device": args.device,
"alpha": args.alpha, "alpha": args.alpha,
@@ -139,7 +140,6 @@ def _overrides_from_args(args: argparse.Namespace) -> dict[str, Any]:
"no_robust": args.no_robust, "no_robust": args.no_robust,
"eta_ux": args.eta_ux, "eta_ux": args.eta_ux,
"reward_profit_weight": args.reward_profit_weight, "reward_profit_weight": args.reward_profit_weight,
"revenue_weight": args.revenue_weight,
"price_low": args.price_low, "price_low": args.price_low,
"price_high": args.price_high, "price_high": args.price_high,
"action_levels": args.action_levels, "action_levels": args.action_levels,
@@ -179,8 +179,29 @@ def _overrides_from_args(args: argparse.Namespace) -> dict[str, Any]:
def main(argv: list[str] | None = None) -> None: def main(argv: list[str] | None = None) -> None:
import subprocess
import sys import sys
# Ensure data is downloaded
from pathlib import Path
project_root = Path(__file__).parents[1]
data_dir = project_root / "experiments" / "collected_data"
needs_pull = (not data_dir.exists()) or (not any(data_dir.iterdir()))
if needs_pull:
try:
subprocess.run(["make", "data.pull"], cwd=str(project_root), check=True)
except (subprocess.SubprocessError, OSError) as exc:
sys.path.insert(0, str(project_root))
try:
from scripts.hf_data import pull
pull()
except (ImportError, OSError, RuntimeError, ValueError) as fallback_exc:
print(
f"Warning: data.pull failed ({exc}); fallback pull failed ({fallback_exc})"
)
configure_logging() configure_logging()
raw_args = list(sys.argv[1:] if argv is None else argv) raw_args = list(sys.argv[1:] if argv is None else argv)
run_kind = _probe_run_kind(raw_args) run_kind = _probe_run_kind(raw_args)

View File

@@ -10,6 +10,7 @@ from .lib.coi import (
) )
from .lib.behavior import get_transition_models, trajectory_to_events from .lib.behavior import get_transition_models, trajectory_to_events
from .lib.wrappers import EconomicMetricsWrapper from .lib.wrappers import EconomicMetricsWrapper
from .jax.robust import select_adversarial_alpha_jax, _JAX_OK
class _ActionPricingEngine(PricingEngine): class _ActionPricingEngine(PricingEngine):
@@ -121,6 +122,7 @@ class PHANTOM(gym.Env):
self._prices = None self._prices = None
self._demand = None self._demand = None
self._step_count = 0 self._step_count = 0
self._global_step = 0 # monotonic; used as JAX RNG seed across resets
self._demand_history = [] self._demand_history = []
self._price_history = [] self._price_history = []
self._revenue_history = [] self._revenue_history = []
@@ -128,6 +130,13 @@ class PHANTOM(gym.Env):
self._initial_episode_prices = None self._initial_episode_prices = None
self._trajectories = [] # session trajectories for agent prob calculation self._trajectories = [] # session trajectories for agent prob calculation
self.baseline_prices = np.full(self.n_products, self.price_bounds[0]) self.baseline_prices = np.full(self.n_products, self.price_bounds[0])
self.anchor_prices = np.full(
self.n_products,
float(np.clip(float(self.human_params[0]), *self.price_bounds)),
)
self.competitive_cap = float(
min(self.price_bounds[1], float(np.mean(self.anchor_prices)) * 1.15)
)
self._low_margin_streak = 0 # consecutive steps below margin_floor self._low_margin_streak = 0 # consecutive steps below margin_floor
self._last_agent_prob = float(self.alpha) self._last_agent_prob = float(self.alpha)
self._last_alpha_adv = float(self.alpha) self._last_alpha_adv = float(self.alpha)
@@ -167,19 +176,28 @@ class PHANTOM(gym.Env):
self.market.Nhumans = self.N - n_agents self.market.Nhumans = self.N - n_agents
def _decode_action(self, action) -> np.ndarray: def _decode_action(self, action) -> np.ndarray:
base = ( prev = self._prices
self._prices base = self.anchor_prices
if self._prices is not None
else np.full(self.n_products, self.price_bounds[0], dtype=float) def _blend(target: np.ndarray) -> np.ndarray:
) if prev is None:
lower = float(self.price_bounds[0])
return np.clip(target, lower, self.competitive_cap)
blended = 0.75 * np.asarray(prev, dtype=float) + 0.25 * target
lower = float(self.price_bounds[0])
return np.clip(blended, lower, self.competitive_cap)
if np.isscalar(action): if np.isscalar(action):
idx = int(np.clip(int(action), 0, self.action_levels - 1)) idx = int(np.clip(int(action), 0, self.action_levels - 1))
return np.clip(base * self._action_scales[idx], *self.price_bounds) target = base * self._action_scales[idx]
return _blend(target)
a = np.asarray(action) a = np.asarray(action)
if a.size == 1: if a.size == 1:
idx = int(np.clip(int(a.reshape(-1)[0]), 0, self.action_levels - 1)) idx = int(np.clip(int(a.reshape(-1)[0]), 0, self.action_levels - 1))
return np.clip(base * self._action_scales[idx], *self.price_bounds) target = base * self._action_scales[idx]
return np.clip(a.astype(float), *self.price_bounds) return _blend(target)
lower = float(self.price_bounds[0])
return np.clip(a.astype(float), lower, self.competitive_cap)
def _compute_agent_prob(self, trajectories=None) -> float: def _compute_agent_prob(self, trajectories=None) -> float:
trajectories = ( trajectories = (
@@ -214,18 +232,23 @@ class PHANTOM(gym.Env):
coi_penalty = self.lambda_coi * coi_leakage * info_budget coi_penalty = self.lambda_coi * coi_leakage * info_budget
if len(self._price_history) > 0: if len(self._price_history) > 0:
volatility = float( prev_prices = np.asarray(self._price_history[-1], dtype=float)
np.mean( rel_change = (prices - prev_prices) / np.maximum(prev_prices, 1.0)
np.abs(prices - self._price_history[-1]) volatility = float(np.mean(np.abs(rel_change)))
/ np.maximum(self.baseline_prices, 1.0) upward_volatility = float(np.mean(np.clip(rel_change, 0.0, None)))
)
)
else: else:
volatility = 0.0 volatility = 0.0
ux_penalty = self.eta_ux * info_budget * volatility upward_volatility = 0.0
ux_penalty = self.eta_ux * info_budget * (volatility + 0.5 * upward_volatility)
competitive_anchor = float(np.mean(self.anchor_prices))
price_ratio = prices / max(competitive_anchor, 1.0)
supra_excess = np.clip(price_ratio - 1.15, 0.0, None)
supra_penalty = 4.0 * info_budget * float(np.mean(np.square(supra_excess)))
supra_share = float(np.mean(supra_excess > 0.0))
reward_revenue = self.reward_profit_weight * profit reward_revenue = self.reward_profit_weight * profit
reward = reward_revenue - coi_penalty - ux_penalty reward = reward_revenue - coi_penalty - ux_penalty - supra_penalty
return reward, { return reward, {
"revenue": revenue, "revenue": revenue,
@@ -238,6 +261,10 @@ class PHANTOM(gym.Env):
"coi_info_budget": info_budget, "coi_info_budget": info_budget,
"ux_penalty": ux_penalty, "ux_penalty": ux_penalty,
"volatility": volatility, "volatility": volatility,
"upward_volatility": upward_volatility,
"supra_penalty": supra_penalty,
"supra_share": supra_share,
"competitive_anchor": competitive_anchor,
"reward_revenue": reward_revenue, "reward_revenue": reward_revenue,
"reward_total": reward, "reward_total": reward,
} }
@@ -261,8 +288,37 @@ class PHANTOM(gym.Env):
return float(np.mean(rewards)) if rewards else 0.0 return float(np.mean(rewards)) if rewards else 0.0
def _select_adversarial_alpha(self, prices: np.ndarray) -> float: def _select_adversarial_alpha(self, prices: np.ndarray) -> float:
"""inner robust step: evaluate candidates and pick worst-case alpha""" """inner robust step: pick worst-case alpha from the ambiguity interval.
when JAX is available and robust_rollouts==1 we use a vmapped pass over
all K candidates in a single call (no Python loop, no market.act overhead).
the JAX path approximates demand as the mixed closed-form d(p;theta) signal
rather than running full trajectory sampling, which is accurate for the
alpha-selection decision while being dramatically cheaper.
when robust_rollouts>1 or JAX is unavailable we fall back to the sequential
market.act() loop so behavior is identical to the original implementation.
"""
candidates = self._alpha_candidates() candidates = self._alpha_candidates()
if len(candidates) == 1:
return float(candidates[0])
if _JAX_OK and self.robust_rollouts == 1:
best_alpha, _ = select_adversarial_alpha_jax(
candidates=candidates,
prices=prices,
human_params=self.market.human_params,
agent_params=self.market.agent_params,
noise_std=self.market.noise_std,
baseline_prices=self.baseline_prices,
lambda_coi=self.lambda_coi,
info_value=self.info_value,
reward_profit_weight=self.reward_profit_weight,
rng_seed=self._global_step,
)
return best_alpha
# fallback: full trajectory-based sequential evaluation
evaluations = [ evaluations = [
(float(alpha), self._evaluate_candidate(float(alpha), prices)) (float(alpha), self._evaluate_candidate(float(alpha), prices))
for alpha in candidates for alpha in candidates
@@ -299,6 +355,7 @@ class PHANTOM(gym.Env):
def step(self, action): def step(self, action):
self._prices = self._decode_action(action) self._prices = self._decode_action(action)
alpha_adv = self._select_adversarial_alpha(self._prices) alpha_adv = self._select_adversarial_alpha(self._prices)
self._global_step += 1 # always increment; JAX path may have already done so
self._set_market_mix(alpha_adv) self._set_market_mix(alpha_adv)
self._platform_stub.set_prices(self._prices) self._platform_stub.set_prices(self._prices)
self._step_count += 1 self._step_count += 1

View File

@@ -2,6 +2,7 @@
All hardcoded paths should reference this module All hardcoded paths should reference this module
Paths can be overridden via environment variables Paths can be overridden via environment variables
""" """
import os import os
from pathlib import Path from pathlib import Path
@@ -9,24 +10,34 @@ from pathlib import Path
PROJECT_ROOT = Path(__file__).parent.parent.resolve() PROJECT_ROOT = Path(__file__).parent.parent.resolve()
# data directories # data directories
DATA_DIR = Path(os.getenv('PHANTOM_DATA_DIR', PROJECT_ROOT / 'data')) DATA_DIR = Path(os.getenv("PHANTOM_DATA_DIR", PROJECT_ROOT / "data"))
EXPERIMENTS_DIR = Path(os.getenv('PHANTOM_EXPERIMENTS_DIR', PROJECT_ROOT / 'experiments')) EXPERIMENTS_DIR = Path(
os.getenv("PHANTOM_EXPERIMENTS_DIR", PROJECT_ROOT / "experiments")
)
# agent/human interaction data # agent/human interaction data
AGENT_DATA_DIR = Path(os.getenv('PHANTOM_AGENT_DATA_DIR', DATA_DIR / 'agents')) AGENT_DATA_DIR = Path(os.getenv("PHANTOM_AGENT_DATA_DIR", DATA_DIR / "agents"))
HUMAN_DATA_DIR = Path(os.getenv('PHANTOM_HUMAN_DATA_DIR', DATA_DIR / 'humans')) HUMAN_DATA_DIR = Path(os.getenv("PHANTOM_HUMAN_DATA_DIR", DATA_DIR / "humans"))
# RL simulation runs # RL simulation runs
SIM_RUNS_DIR = Path(os.getenv('PHANTOM_SIM_RUNS_DIR', PROJECT_ROOT / 'sim' / 'rl' / 'runs')) SIM_RUNS_DIR = Path(
os.getenv("PHANTOM_SIM_RUNS_DIR", PROJECT_ROOT / "sim" / "rl" / "runs")
)
# model artifacts # model artifacts
MODEL_REGISTRY_DIR = Path(os.getenv('PHANTOM_MODEL_REGISTRY_DIR', DATA_DIR / 'models')) MODEL_REGISTRY_DIR = Path(os.getenv("PHANTOM_MODEL_REGISTRY_DIR", DATA_DIR / "models"))
# collected experiment data # collected experiment data
COLLECTED_DATA_DIR = Path(os.getenv('PHANTOM_COLLECTED_DATA_DIR', EXPERIMENTS_DIR / 'agents' / 'collected_data')) COLLECTED_DATA_DIR = Path(
os.getenv(
"PHANTOM_COLLECTED_DATA_DIR", EXPERIMENTS_DIR / "agents" / "collected_data"
)
)
# notebook outputs # notebook outputs
NOTEBOOK_OUTPUT_DIR = Path(os.getenv('PHANTOM_NOTEBOOK_OUTPUT_DIR', EXPERIMENTS_DIR / 'notebooks' / 'outputs')) NOTEBOOK_OUTPUT_DIR = Path(
os.getenv("PHANTOM_NOTEBOOK_OUTPUT_DIR", EXPERIMENTS_DIR / "notebooks" / "outputs")
)
def ensure_dir(path: Path) -> Path: def ensure_dir(path: Path) -> Path:
@@ -51,15 +62,18 @@ def get_sim_path(*parts: str) -> Path:
# service configuration (from .env) # service configuration (from .env)
KAFKA_HOST = os.getenv('KAFKA_HOST', 'localhost') KAFKA_HOST = os.getenv("KAFKA_HOST", "localhost")
KAFKA_PORT = os.getenv('KAFKA_PORT', '9092') KAFKA_PORT = os.getenv("KAFKA_PORT", "9092")
KAFKA_BROKER = f"{KAFKA_HOST}:{KAFKA_PORT}" KAFKA_BROKER = f"{KAFKA_HOST}:{KAFKA_PORT}"
REDIS_HOST = os.getenv('REDIS_HOST', 'localhost') REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
REDIS_PORT = int(os.getenv('REDIS_PORT', '6379')) REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
SUPABASE_URL = os.getenv('NEXT_PUBLIC_SUPABASE_URL', '') SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL", "")
SUPABASE_ANON_KEY = os.getenv('NEXT_PUBLIC_SUPABASE_ANON_KEY', '') SUPABASE_ANON_KEY = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY", "")
BACKEND_PORT = int(os.getenv('BACKEND_PORT', '5000')) BACKEND_PORT = int(os.getenv("BACKEND_PORT", "5000"))
PROVIDER_PORT = int(os.getenv('PROVIDER_PORT', '5001')) PROVIDER_PORT = int(os.getenv("PROVIDER_PORT", "5001"))
# huggingface dataset repo for collected behavioral data
HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "velocitatem/phantom-collected-data")

15
nx.json
View File

@@ -58,6 +58,21 @@
"benchmark": { "benchmark": {
"cache": false "cache": false
}, },
"whoclicked-publish": {
"cache": false
},
"tpu-ray-bootstrap": {
"cache": false
},
"tpu-ray-deps": {
"cache": false
},
"tpu-ray-verify": {
"cache": false
},
"tpu-ray-teardown": {
"cache": false
},
"up": { "up": {
"cache": false "cache": false
}, },

View File

@@ -7,6 +7,8 @@
], ],
"scripts": { "scripts": {
"nx": "nx", "nx": "nx",
"manim:render": "nx run manim:render",
"manim:render-all": "nx run manim:render-all",
"projects": "nx show projects", "projects": "nx show projects",
"graph": "nx graph", "graph": "nx graph",
"web:dev": "nx run web:dev", "web:dev": "nx run web:dev",

View File

@@ -1,84 +0,0 @@
from __future__ import annotations
import argparse
import subprocess
import sys
from pathlib import Path
from scenes import SCENE_ORDER
def parse_args() -> argparse.Namespace:
    """Parse the renderer's command-line options from ``sys.argv``.

    Returns:
        A namespace with four attributes:
        ``quality`` (one of ``ql``/``qm``/``qh``/``qk``; defaults to ``qm``),
        ``scenes`` (names collected from repeated ``--scene`` flags, or
        ``None`` when the flag was never given), ``preview`` and ``list``
        (both booleans, ``False`` unless the flag is present).
    """
    cli = argparse.ArgumentParser(description="Render thesis-defense Manim scenes")

    quality_presets = ["ql", "qm", "qh", "qk"]
    cli.add_argument(
        "--quality",
        choices=quality_presets,
        default="qm",
        help="Manim quality preset",
    )

    # Repeatable flag: every occurrence appends one name to ``scenes``.
    cli.add_argument(
        "--scene",
        dest="scenes",
        action="append",
        help="Scene name; repeat flag to render many",
    )

    # Plain boolean switches.
    for flag, description in (
        ("--preview", "Open video after each render"),
        ("--list", "List available scenes and exit"),
    ):
        cli.add_argument(flag, action="store_true", help=description)

    parsed = cli.parse_args()
    return parsed
def validate_requested(requested: list[str]) -> list[str]:
    """Check that every requested scene name is listed in ``SCENE_ORDER``.

    Args:
        requested: Scene names supplied by the caller.

    Returns:
        The same ``requested`` list, unchanged, when all names are known.

    Raises:
        ValueError: If one or more names are absent from ``SCENE_ORDER``;
            the message lists the unknown names and the valid choices.
    """
    unknown = []
    for name in requested:
        if name not in SCENE_ORDER:
            unknown.append(name)

    if not unknown:
        return requested

    choices = ", ".join(SCENE_ORDER)
    raise ValueError(f"Unknown scenes: {', '.join(unknown)}. Choices: {choices}")
def run_manim(scene_file: Path, scene_name: str, quality: str, preview: bool) -> None:
    """Render one scene by running ``python -m manim`` as a subprocess.

    Args:
        scene_file: Path to the Python file that defines the scene; its
            parent directory is used as the working directory.
        scene_name: Name of the scene to render.
        quality: Quality preset without the leading dash (passed as ``-<quality>``).
        preview: When true, ``-p`` is added so the result is opened after rendering.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero
            (``check=True``).
    """
    preview_flags = ["-p"] if preview else []
    command = [
        sys.executable,
        "-m",
        "manim",
        *preview_flags,
        f"-{quality}",
        str(scene_file),
        scene_name,
    ]
    subprocess.run(command, cwd=scene_file.parent, check=True)
def main() -> int:
    """Command-line entry point: list scenes or render the requested ones.

    Returns:
        Process exit status: ``0`` on success (or after ``--list``), ``2``
        when a requested scene name is unknown or the render command could
        not be started, otherwise the exit code of the failed render
        subprocess.
    """
    args = parse_args()
    if args.list:
        for scene in SCENE_ORDER:
            print(scene)
        return 0

    scene_file = Path(__file__).resolve().parent / "scenes.py"
    try:
        # Validation happens inside the ``try`` so an unknown scene name is
        # reported through the ``ValueError`` handler below (message on
        # stderr, exit status 2) instead of escaping as a traceback.
        scenes = validate_requested(args.scenes) if args.scenes else list(SCENE_ORDER)
        for scene_name in scenes:
            run_manim(
                scene_file=scene_file,
                scene_name=scene_name,
                quality=args.quality,
                preview=args.preview,
            )
    except FileNotFoundError:
        print(
            "manim executable not found. Install Manim in your Python environment.",
            file=sys.stderr,
        )
        return 2
    except ValueError as exc:
        print(str(exc), file=sys.stderr)
        return 2
    except subprocess.CalledProcessError as exc:
        # Propagate the renderer's own exit status to the caller.
        return exc.returncode
    return 0
if __name__ == "__main__":
raise SystemExit(main())

File diff suppressed because it is too large Load Diff

View File

@@ -630,3 +630,41 @@ Volume: 21},
note = {Publisher: Institute of Mathematical Statistics}, note = {Publisher: Institute of Mathematical Statistics},
pages = {50 -- 60}, pages = {50 -- 60},
} }
@article{horace_he_and_thinking_machines_lab_defeating_2025,
title = {Defeating {Nondeterminism} in {LLM} {Inference}},
url = {https://thinkingmachines.ai/blog/defeating-nondeterminism-in-llm-inference/},
doi = {10.64434/tml.20250910},
abstract = {Reproducibility is a bedrock of scientific progress. However, it's remarkably difficult to get reproducible results out of large language models.
For example, you might observe that asking ChatGPT the same question multiple times provides different results. This by itself is not surprising, since getting a result from a language model involves “sampling”, a process that converts the language model's output into a probability distribution and probabilistically selects a token.
What might be more surprising is that even when we adjust the temperature down to 0 (thus making the sampling theoretically deterministic; this means that the LLM always chooses the highest probability token, which is called greedy sampling), LLM APIs are still not deterministic in practice (see past discussions here, here, or here). Even when running inference on your own hardware with an OSS inference library like vLLM or SGLang, sampling still isn't deterministic (see here or here).},
language = {en},
urldate = {2026-03-10},
journal = {Thinking Machines Lab: Connectionism},
author = {He, Horace and {Thinking Machines Lab}},
year = {2025},
file = {Snapshot:/home/velocitatem/Zotero/storage/U5JG4CNM/defeating-nondeterminism-in-llm-inference.html:text/html},
}
@misc{moritz_ray_2018,
title = {Ray: {A} {Distributed} {Framework} for {Emerging} {AI} {Applications}},
shorttitle = {Ray},
url = {http://arxiv.org/abs/1712.05889},
doi = {10.48550/arXiv.1712.05889},
abstract = {The next generation of AI applications will continuously interact with the environment and learn from these interactions. These applications impose new and demanding systems requirements, both in terms of performance and flexibility. In this paper, we consider these requirements and present Ray---a distributed system to address them. Ray implements a unified interface that can express both task-parallel and actor-based computations, supported by a single dynamic execution engine. To meet the performance requirements, Ray employs a distributed scheduler and a distributed and fault-tolerant store to manage the system's control state. In our experiments, we demonstrate scaling beyond 1.8 million tasks per second and better performance than existing specialized systems for several challenging reinforcement learning applications.},
urldate = {2026-03-13},
publisher = {arXiv},
author = {Moritz, Philipp and Nishihara, Robert and Wang, Stephanie and Tumanov, Alexey and Liaw, Richard and Liang, Eric and Elibol, Melih and Yang, Zongheng and Paul, William and Jordan, Michael I. and Stoica, Ion},
month = sep,
year = {2018},
note = {arXiv:1712.05889 [cs]},
keywords = {Computer Science - Machine Learning, Statistics - Machine Learning, Computer Science - Artificial Intelligence, Computer Science - Distributed, Parallel, and Cluster Computing},
file = {Preprint PDF:/home/velocitatem/Zotero/storage/SUTDF5BP/Moritz et al. - 2018 - Ray A Distributed Framework for Emerging AI Applications.pdf:application/pdf;Snapshot:/home/velocitatem/Zotero/storage/5GV2DUAA/1712.html:text/html},
}
@misc{biewald_experiment_2020,
title = {Experiment {Tracking} with {Weights} and {Biases}},
url = {https://www.wandb.com/},
author = {Biewald, Lukas},
year = {2020},
}

View File

@@ -8,9 +8,9 @@
\section{Introduction} \section{Introduction}
In this paper we present an exploration and defense against the presence of new commercial entities in digitally powered platforms, preserving market equilibrium in the age of AI. This research establishes the following contributions: definition and formalization of non-human transactors in e-commerce platforms, development of a testing-ground for capturing the behavioral essence of these transactors across a large variety of digital systems, construction of a discriminative model (to prove separability) as a strong learner for downstream mitigation of contamination by non-human entities, translation of such learned separability into existing dynamic pricing machine learning loops, and finally establishment of a high-level KPI-affecting causal effect and cost-saving framework for the future of internet commerce in the presence of such non-human learners. In this paper we present an exploration and defense against the presence of new commercial entities in digitally powered platforms, preserving market equilibrium in the age of AI. This research establishes the following contributions: definition and formalization of non-human transactors in e-commerce platforms, development of a testing-ground for capturing the behavioral essence of these transactors across a large variety of digital systems, construction of a discriminative model (to prove distinguishability) as a strong learner for downstream mitigation of contamination by non-human entities, translation of such learned distinguishability into existing dynamic pricing machine learning loops, and finally establishment of a high-level KPI-affecting causal effect and cost-saving framework for the future of internet commerce in the presence of such non-human learners.
This research effort touches a large variety of domains, spanning behavioral economics for understanding the rationality of behavior as theorized by the concept of homo economicus, agent-based modeling to translate our learned separability into disjoint dynamic pricing systems, reinforcement learning which serves as the SOTA for price-learners, and dynamic pricing and market equilibrium theory to understand the risks of possible supra-competitive pricing phenomena in cases of adversarial pricing systems driving the market out of equilibrium. \footnote{Given the rapid evolution of the field we acknowledge all developments with a cutoff set at the date of March 1st 2026.} This research effort touches a large variety of domains, spanning behavioral economics for understanding the rationality of behavior as theorized by the concept of homo economicus, agent-based modeling to translate our learned distinguishability into disjoint dynamic pricing systems, reinforcement learning which serves as the SOTA for price-learners, and dynamic pricing and market equilibrium theory to understand the risks of possible supra-competitive pricing phenomena in cases of adversarial pricing systems driving the market out of equilibrium. \footnote{Given the rapid evolution of the field we acknowledge all developments with a cutoff set at the date of March 1st 2026.}
\subsection{Motivation and Market Context} \subsection{Motivation and Market Context}
@@ -30,7 +30,7 @@ We formally define interaction data as coming from some actor which can either b
This dissertation is organized around one main research question and three supporting sub-questions: This dissertation is organized around one main research question and three supporting sub-questions:
\begin{enumerate} \begin{enumerate}
\item[\textbf{Main RQ}] How can dynamic pricing systems preserve margin integrity when transaction orchestration is increasingly mediated by non-human agents? \item[\textbf{Main RQ}] How can dynamic pricing systems preserve margin integrity when transaction orchestration is increasingly mediated by non-human agents?
\item[\textbf{SQ1}] \textit{Separability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting? \item[\textbf{SQ1}] \textit{Distinguishability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting?
\item[\textbf{SQ2}] \textit{Theoretical Impact}: What is the formal relationship between agent contamination levels and the erosion of pricing power in dynamic pricing systems? \item[\textbf{SQ2}] \textit{Theoretical Impact}: What is the formal relationship between agent contamination levels and the erosion of pricing power in dynamic pricing systems?
\item[\textbf{SQ3}] \textit{Robust Mitigation}: How can pricing policies be constructed to maintain margin integrity under unknown and non-stationary levels of agent contamination? \item[\textbf{SQ3}] \textit{Robust Mitigation}: How can pricing policies be constructed to maintain margin integrity under unknown and non-stationary levels of agent contamination?
\end{enumerate} \end{enumerate}
@@ -64,4 +64,4 @@ Extract final result $r$ from terminal state\;
\end{algorithm} \end{algorithm}
The previously described goal of separability allows us to formulate a task which entails taking raw interaction data for either actor and creating a composite demand estimate $\hat{q}$. We propose a robust optimization objective defined in our methodology, transforming the pricing problem into a form of Distributionally Robust Optimization \parencite{kuhn_distributionally_2025} where the learner must guard against adversarial contamination in observed demand distributions. In this setting we must learn to make decisions that perform under the assumption of not having a single estimated probability distribution but under an ambiguity set of any distribution, of which we have limited information. In our case, as stated, this is a mixture of distributions with a parameter which is unknown and non-stationary. The previously described goal of distinguishability allows us to formulate a task which entails taking raw interaction data for either actor and creating a composite demand estimate $\hat{q}$. We propose a robust optimization objective defined in our methodology, transforming the pricing problem into a form of Distributionally Robust Optimization \parencite{kuhn_distributionally_2025} where the learner must guard against adversarial contamination in observed demand distributions. In this setting we must learn to make decisions that perform under the assumption of not having a single estimated probability distribution but under an ambiguity set of any distribution, of which we have limited information. In our case, as stated, this is a mixture of distributions with a parameter which is unknown and non-stationary.

View File

@@ -1,6 +1,6 @@
\section{Literature Review} \section{Literature Review}
To better understand all wedges of the current works, we must start by exploring the nature of agents, agentic computer use and web automation, complementing that with economic reasoning and strategic interaction. The final surface to cover, leads us to data-driven dynamic pricing under uncertainty. The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on. This latter case of agents shopping is currently pending legal action in the case of \textcite{noauthor_amazoncom_2026} which is currently being treated as a violation of the Computer Fraud and Abuse Act. The introduction of these mediating actor entities into economic systems, is further creating a threat of false-name bidding \parencite{yokoo_effect_2004}, which prior research has explored in a trading context. Other research on pseudonyms in dynamic systems, demonstrate whitewashing in AI agents which can ignore defensive mechanisms by re-entry with different identities \parencite{feldman_free-riding_2004}. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for separating non-human reconnaissance from genuine human demand expression and integrating that separation into pricing heuristics without degrading legitimate user experience (in our research tracked by the user-experience index). This gap, is what our contribution aims to address, particularly for the aforementioned stakeholder groups. To better understand all wedges of the current works, we must start by exploring the nature of agents, agentic computer use and web automation, complementing that with economic reasoning and strategic interaction. The final surface to cover, leads us to data-driven dynamic pricing under uncertainty. 
The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on. This latter case of agents shopping is currently pending legal action in the case of \textcite{noauthor_amazoncom_2026} which is currently being treated as a violation of the Computer Fraud and Abuse Act. The introduction of these mediating actor entities into economic systems is further creating a threat of false-name bidding \parencite{yokoo_effect_2004}, which prior research has explored in a trading context. Other research on pseudonyms in dynamic systems demonstrates whitewashing in AI agents which can ignore defensive mechanisms by re-entry with different identities \parencite{feldman_free-riding_2004}. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for distinguishing non-human reconnaissance from genuine human demand expression and integrating that distinguishability into pricing heuristics without degrading legitimate user experience (in our research tracked by the user-experience index). This gap is what our contribution aims to address, particularly for the aforementioned stakeholder groups.
\subsection{Agent Taxonomy and Definitions} \subsection{Agent Taxonomy and Definitions}

View File

@@ -3,7 +3,7 @@
% Extra notes and clarifications: we observed some humans and get their transition probabilities between event types % Extra notes and clarifications: we observed some humans and get their transition probabilities between event types
% We modify behavioral profiles of transition matrices with price elasticity matrices generated by sample valuations of a distributing. % We modify behavioral profiles of transition matrices with price elasticity matrices generated by sample valuations of a distributing.
This section details the theoretical and practical framework developed to address dynamic pricing under the influence of non-human actors. We begin by formalizing the problem environment and the nature of the actors. We then derive the \textit{Cost of Information} (COI) theorem, proving the erosion of pricing power in the limit of agent saturation. Following this, we outline our generative contamination strategy using GOFAI-driven separability and transition probability learning. Finally, we formulate the robust control problem as a Stackelberg game solved via Distributionally Robust Reinforcement Learning (DR-RL) with constructed ambiguity sets. This section details the theoretical and practical framework developed to address dynamic pricing under the influence of non-human actors. We begin by formalizing the problem environment and the nature of the actors. We then derive the \textit{Cost of Information} (COI) theorem, proving the erosion of pricing power in the limit of agent saturation. Following this, we outline our generative contamination strategy using GOFAI-driven distinguishability and transition probability learning. Finally, we formulate the robust control problem as a Stackelberg game solved via Distributionally Robust Reinforcement Learning (DR-RL) with constructed ambiguity sets.
\subsection{Problem Formalization} \subsection{Problem Formalization}
@@ -40,6 +40,7 @@ We formalize the heterogeneity of actors by introducing a type space $\Theta$. A
Q(p) = (1-\alpha) \cdot \mathbb{E}_{\theta \sim \mathcal{D}_H}[d(p; \theta)] + \alpha \cdot \mathbb{E}_{\theta \sim \mathcal{D}_A}[d(p; \theta)] + \epsilon_t Q(p) = (1-\alpha) \cdot \mathbb{E}_{\theta \sim \mathcal{D}_H}[d(p; \theta)] + \alpha \cdot \mathbb{E}_{\theta \sim \mathcal{D}_A}[d(p; \theta)] + \epsilon_t
\end{equation} \end{equation}
where $\alpha \in [0, 1]$ represents the contamination parameter (proportion of agents) and $\epsilon_t$ is non-stationary market noise. where $\alpha \in [0, 1]$ represents the contamination parameter (proportion of agents) and $\epsilon_t$ is non-stationary market noise.
Accounting for behavioral and market variation, we also treat $\epsilon_t$ as absorbing serving-path variability from LLM infrastructure (e.g., batch-size-dependent inference behavior under changing load), which appears stochastic at the request level even under greedy decoding \parencite{horace_he_and_thinking_machines_lab_defeating_2025}.
@@ -140,6 +141,8 @@ The architecture of this platform begins with the deployed web-apps posting inte
\paragraph{Public Web Artifact} We transition the Kappa-like architecture of the data collection to a Lambda architecture for actual learning in a surrogate environment. This allows us to move faster on data which is provided and helps us create a feedback loop for production deployment. To support further research in this intersection of fields we release P4P \footnote{\url{https://github.com/velocitatem/p4p}} as a public repository providing the interaction layer of the PHANTOM framework. This provides a configurable storefront which can be tailored to any commercial setting with standardized session-level event tracking. We document the API adapters or what the framework expects in terms of schemas for pricing providers and log ingestion services. The repository is intended for controlled experimentation and method replication rather than production commerce deployment. \paragraph{Public Web Artifact} We transition the Kappa-like architecture of the data collection to a Lambda architecture for actual learning in a surrogate environment. This allows us to move faster on data which is provided and helps us create a feedback loop for production deployment. To support further research in this intersection of fields we release P4P \footnote{\url{https://github.com/velocitatem/p4p}} as a public repository providing the interaction layer of the PHANTOM framework. This provides a configurable storefront which can be tailored to any commercial setting with standardized session-level event tracking. We document the API adapters or what the framework expects in terms of schemas for pricing providers and log ingestion services. The repository is intended for controlled experimentation and method replication rather than production commerce deployment.
\paragraph{Public Dataset} For reproducibility of the behavioral analysis and distinguishability experiments, we also release the interaction dataset used in this thesis as \textit{WhoClickedIt}. The dataset is hosted on Hugging Face \footnote{\url{https://huggingface.co/datasets/velocitatem/whoclickedit}} and is distributed as one flattened event sheet (\texttt{whoclicked.csv}) with explicit labels (\texttt{actor\_type}, \texttt{is\_agent}, and \texttt{record\_type}). The associated dataset card specifies the schema, collection process, and known limitations; a full copy is included in Appendix~\ref{app:whoclicked_card}.
\subsubsection{DevOps Principles} \subsubsection{DevOps Principles}
@@ -182,13 +185,24 @@ Since users act with motivations, we define a pool of tasks (jobs to be done) an
The task pool is stored as a structured table with fields \texttt{id}, \texttt{created\_at}, \texttt{task\_name}, \texttt{task\_description}, and \texttt{task\_def\_of\_done}. We formulate the tasks as compact jobs-to-be-done rather than as strict click scripts, because the target is to elicit realistic browsing and comparison behavior which can capture nuance of different people. In hotel mode the assigned tasks include \textit{Cheapest Room}, \textit{Cheapest Room w/ View}, \textit{MultiStep Cheapest Room}, \textit{The Digital Nomad (Executive)}, and \textit{The 3-Way Tradeoff (Desk + Quiet + Flexible)}. These prompts deliberately require critical thought in search, inspection of room details, comparison of amenities or images, return visits to the listing page, and a final booking decision which create a degree of cognitive load. In airline mode we use \textit{Last-Minute One-Way Flight}, where the actor must urgently travel to LAX from either SEA or JFK within the next 1--3 days, inspect at least a small set of candidate itineraries, and then book a reasonable earliest departure. The task pool is stored as a structured table with fields \texttt{id}, \texttt{created\_at}, \texttt{task\_name}, \texttt{task\_description}, and \texttt{task\_def\_of\_done}. We formulate the tasks as compact jobs-to-be-done rather than as strict click scripts, because the target is to elicit realistic browsing and comparison behavior which can capture nuance of different people. In hotel mode the assigned tasks include \textit{Cheapest Room}, \textit{Cheapest Room w/ View}, \textit{MultiStep Cheapest Room}, \textit{The Digital Nomad (Executive)}, and \textit{The 3-Way Tradeoff (Desk + Quiet + Flexible)}. These prompts deliberately require critical thought in search, inspection of room details, comparison of amenities or images, return visits to the listing page, and a final booking decision which create a degree of cognitive load. 
In airline mode we use \textit{Last-Minute One-Way Flight}, where the actor must urgently travel to LAX from either SEA or JFK within the next 1--3 days, inspect at least a small set of candidate itineraries, and then book a reasonable earliest departure.
A representative task is to find the cheapest feasible catalog item under explicit constraints while removing strict financial limits so we avoid trivial optimization behavior. Participants are also randomly assigned to one experimental platform mode (hotel or airline). Once assigned, they are dropped into the experiment with an actor ID. Under each experiment ID, we can observe multiple sessions across time and gather long interaction traces for the same actor. A representative task is to find the cheapest feasible catalog item under explicit constraints while removing strict financial limits so we avoid trivial optimization behavior. Participants are also randomly assigned to one experimental platform mode (hotel or airline). Once assigned, they are dropped into the experiment with an actor ID. Under each experiment ID, we can observe multiple sessions across time and gather long interaction traces for the same actor.
The human data collection involved 18 participants, all of whom provided explicit informed consent prior to their session. Participants had an average age of 21 years and were recruited from a university population. Alongside the 18 human sessions we ran 18 agent sessions of equivalent task scope, giving a balanced dataset of 36 labeled trajectories. Each participant was assigned a single platform mode and a single task drawn from the pool, and completed the session independently without guidance on navigation or pricing strategy. The human data collection involved 13 participants, all of whom provided explicit informed consent prior to their session. Participants had an average age of 21 years and were recruited from a university population. Alongside the 13 human sessions we ran 16 agent sessions of equivalent task scope, yielding 29 labeled trajectories in total (45\% human, 55\% agent). Each participant was assigned a single platform mode and a single task drawn from the pool, and completed the session independently without guidance on navigation or pricing strategy.
To evaluate quality and realism of the setup, we store both structured event logs and full interaction transcripts. This lets us combine quantitative analysis with transcript-level qualitative findings. The result is an isolated system where we can control the interaction process while preserving realistic behavior. To evaluate quality and realism of the setup, we store both structured event logs and full interaction transcripts. This lets us combine quantitative analysis with transcript-level qualitative findings. The result is an isolated system where we can control the interaction process while preserving realistic behavior.
Operationally, goals and experiment runs are tracked in PostgreSQL (goal table, run table, and assignment mapping). This data-acquisition phase is the first half of the methodology and is intentionally a disconnected component that feeds the later contributions. The second half uses collected behavioral traces to separate classes $\theta \in \{A,H\}$ with session-conditioned probability estimates, then injects those estimates into the pricing learner. Operationally, goals and experiment runs are tracked in PostgreSQL (goal table, run table, and assignment mapping). This data-acquisition phase is the first half of the methodology and is intentionally a disconnected component that feeds the later contributions. The second half uses collected behavioral traces to distinguish classes $\theta \in \{A,H\}$ with session-conditioned probability estimates, then injects those estimates into the pricing learner.
Our process follows three stages: (1) observe and \textit{vectorize} behavioral interactions, (2) learn separability to characterize human versus agent patterns, and (3) use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator. Our process follows three stages: (1) observe and \textit{vectorize} behavioral interactions, (2) learn distinguishability to characterize human versus agent patterns, and (3) use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator.
Figure~\ref{fig:phantom_unified_architecture} summarizes the full mechanism from online interaction capture to divergence-based contamination scoring and robust control of pricing decisions.
\begin{figure}[ht]
\centering
\resizebox{\textwidth}{!}{%
\input{chapters/hero_architecture_figure.tex}
}
\caption{Unified PHANTOM defense architecture. (a) Online serving and logging with behavioral and price-query streams. (b) Distinguishability layer that estimates KL divergence to human/agent prototypes and derives session-level contamination scores. (c) Distributionally robust pricing control that optimizes under an ambiguity set while penalizing COI leakage and tracking UX cost.}
\label{fig:phantom_unified_architecture}
\end{figure}
\begin{figure}[ht] \begin{figure}[ht]
\resizebox{\columnwidth}{!}{% \resizebox{\columnwidth}{!}{%
@@ -206,8 +220,8 @@ The dynamic pricing mechanism elicited immediate behavioral adjustments. Partici
\subsubsection{Design of Training Factorial Study} \subsubsection{Design of Training Factorial Study}
The simulator has multiple configurable factors. We design a multi-factor study across five axes derived from the sweep configurations: (1) RL algorithm (\texttt{ppo}, \texttt{a2c}, \texttt{dqn}, \texttt{qtable}; 4 levels), (2) contamination ratio $\alpha$ sampled from $[0.1, 0.6]$ at four representative levels, (3) robustness radius $\epsilon_\alpha \in \{0.0, 0.15, 0.3\}$ (3 levels), (4) COI penalty weight $\lambda_\text{coi}$ at two reference levels, and (5) pricing action granularity (two discretization settings for \texttt{action\_levels}); giving a grid of $4\times4\times3\times2\times2 = 192$ configurations. Statistical power for the behavioral comparisons is determined by a two-sample test over per-session KL divergence scores; a formal power analysis with minimum detectable effect size at $n=18+18$ is reported in the results. The simulator has multiple configurable factors. We design a multi-factor study across five axes derived from the sweep configurations: (1) RL algorithm (\texttt{ppo}, \texttt{a2c}, \texttt{dqn}, \texttt{qtable}; 4 levels), (2) contamination ratio $\alpha$ sampled from $[0.1, 0.6]$ at four representative levels, (3) robustness radius $\epsilon_\alpha \in \{0.0, 0.15, 0.3\}$ (3 levels), (4) COI penalty weight $\lambda_\text{coi}$ at two reference levels, and (5) pricing action granularity (two discretization settings for \texttt{action\_levels}); giving a grid of $4\times4\times3\times2\times2 = 192$ configurations. Statistical power for the behavioral comparisons is determined by a two-sample test over per-session KL divergence scores; a formal power analysis with minimum detectable effect size at $n_H=13$, $n_A=16$ is reported in the results.
% Power analysis plan: apply a two-sample Mann-Whitney U (or permutation test) on per-session (delta_H - delta_A) divergence scores comparing the human and agent groups. Compute minimum detectable effect size at alpha=0.05, power=0.8, given n=18 per group. Bootstrap confidence intervals on mean KL are a cleaner complement given the non-normality of divergence distributions. % Power analysis plan: apply a two-sample Mann-Whitney U (or permutation test) on per-session (delta_H - delta_A) divergence scores comparing the human and agent groups. Compute minimum detectable effect size at alpha=0.05, power=0.8, given n_H=13 and n_A=16. Bootstrap confidence intervals on mean KL are a cleaner complement given the non-normality of divergence distributions.
While this scale is generally expensive for reinforcement learning, we execute it on a large TPU cluster to make the sweep tractable. While this scale is generally expensive for reinforcement learning, we execute it on a large TPU cluster to make the sweep tractable.
Our training budget is provisioned through TPU Research Cloud and spans 384 chips across TPU v4, v5e, and v6e generations, with a spot-heavy allocation plus an on-demand reserve. At peak BF16 throughput this corresponds to approximately 160\,PFLOPS of aggregate compute (derivation in Appendix~\ref{app:compute_budget}), which makes repeated seeds, ablations, and sensitivity sweeps feasible within practical wall-clock limits. We allocate v6e capacity to the highest-intensity policy training jobs, use v5e for wider hyperparameter exploration where throughput-per-dollar is favorable, and reserve on-demand v4 capacity for runs that should not be interrupted. Our training budget is provisioned through TPU Research Cloud and spans 384 chips across TPU v4, v5e, and v6e generations, with a spot-heavy allocation plus an on-demand reserve. At peak BF16 throughput this corresponds to approximately 160\,PFLOPS of aggregate compute (derivation in Appendix~\ref{app:compute_budget}), which makes repeated seeds, ablations, and sensitivity sweeps feasible within practical wall-clock limits. We allocate v6e capacity to the highest-intensity policy training jobs, use v5e for wider hyperparameter exploration where throughput-per-dollar is favorable, and reserve on-demand v4 capacity for runs that should not be interrupted.
@@ -245,7 +259,8 @@ v4 & 64 (32 + 32) & us-central2-b & 32 Spot + 32 On-demand \\
\end{tabular} \end{tabular}
\end{table} \end{table}
For connections from Madrid, we prioritize the europe-west4 allocation for latency-sensitive runs with the benefit of having the most grouped chips within a single region. This regional grouping is important for the deployment of our Kubernetes cluster which cannot span multiple regions. All sweep metadata, model checkpoints, and reward traces are logged in Weights \& Biases. Hardware specifications are from the official Google Cloud TPU documentation \parencite{noauthor_tpu_2026,noauthor_tpu_2025-1,noauthor_tpu_2025}. For connections from Madrid, we prioritize the europe-west4 allocation for latency-sensitive runs with the benefit of having the most grouped chips within a single region. This regional grouping is important for the deployment of our Kubernetes cluster which cannot span multiple regions. All sweep metadata, model checkpoints, and reward traces are logged in Weights \& Biases. % TODO: cite this (from bib)
Hardware specifications are from the official Google Cloud TPU documentation \parencite{noauthor_tpu_2026,noauthor_tpu_2025-1,noauthor_tpu_2025}.
Design of training processes: we build the Docker image with layer caching in mind, placing the most volatile steps towards the end of the image build so that rebuilds are as fast as possible. What this means in practice is that any dependency installations are isolated so edits to source code do not trigger rebuilds. Only if we update the entry point for training a sweep will Docker also rebuild the source-code copy stage. Design of training processes: we build the Docker image with layer caching in mind, placing the most volatile steps towards the end of the image build so that rebuilds are as fast as possible. What this means in practice is that any dependency installations are isolated so edits to source code do not trigger rebuilds. Only if we update the entry point for training a sweep will Docker also rebuild the source-code copy stage.
@@ -294,15 +309,15 @@ In addition to behavioral events, the platform logs price observations to a sepa
\subsection{Generative Contamination and Separability} \subsection{Generative Contamination and Distinguishability}
To train a robust pricing learner, we need a simulator that can generate realistic interaction data under controlled contamination. We build this from Phantom data using a two-stage approach. To train a robust pricing learner, we need a simulator that can generate realistic interaction data under controlled contamination. We build this from Phantom data using a two-stage approach.
\subsubsection{Ground-Truth Separability} \subsubsection{Ground-Truth Distinguishability}
Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels $\theta_s \in \{H, A\}$ are available as ground truth rather than as the output of a heuristic classifier. We therefore estimate separate transition kernels directly from each labeled partition $\mathcal{D}_H$ and $\mathcal{D}_A$, treating the resulting $\hat{\mathcal{T}}_H$ and $\hat{\mathcal{T}}_A$ as the ground-truth behavioral profiles for each class. We then ask a direct methodological question: are the kernels separable enough to justify downstream pricing control that depends on that separability? Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels $\theta_s \in \{H, A\}$ are available as ground truth rather than as the output of a heuristic classifier. We therefore estimate separate transition kernels directly from each labeled partition $\mathcal{D}_H$ and $\mathcal{D}_A$, treating the resulting $\hat{\mathcal{T}}_H$ and $\hat{\mathcal{T}}_A$ as the ground-truth behavioral profiles for each class. We then ask a direct methodological question: are the kernels distinguishable enough to justify downstream pricing control that depends on that distinguishability?
To answer this, we compute per-session KL divergence scores against both class-level centroids. For each session $s$ in either partition, we fit a session-level event transition kernel $\hat{\mathcal{T}}_s$ from that session's trajectory alone, then compute its average KL divergence to the human centroid ($\Delta_{H,s}$) and to the agent centroid ($\Delta_{A,s}$). The per-session separability score is the gap $\Delta_{H,s} - \Delta_{A,s}$: a negative value indicates proximity to human behavior, a positive value indicates proximity to agent behavior. To answer this, we compute per-session KL divergence scores against both class-level centroids. For each session $s$ in either partition, we fit a session-level event transition kernel $\hat{\mathcal{T}}_s$ from that session's trajectory alone, then compute its average KL divergence to the human centroid ($\Delta_{H,s}$) and to the agent centroid ($\Delta_{A,s}$). The per-session distinguishability score is the gap $\Delta_{H,s} - \Delta_{A,s}$: a negative value indicates proximity to human behavior, a positive value indicates proximity to agent behavior.
The normality assumption cannot be made for KL divergence distributions, which are right-skewed and bounded below by zero, so we do not use a Student's $t$-test. Instead we apply a Mann-Whitney $U$ test \parencite{mann_test_1947} on the per-session gap scores between the two groups. The Mann-Whitney test is a rank-based nonparametric test that compares the stochastic ordering of two independent samples without distributional assumptions, making it appropriate for small samples drawn from skewed populations. We report $U$, the exact two-sided $p$-value, and group-level descriptive statistics for the gap scores. The normality assumption cannot be made for KL divergence distributions, which are right-skewed and bounded below by zero, so we do not use a Student's $t$-test. Instead we apply a Mann-Whitney $U$ test \parencite{mann_test_1947} on the per-session gap scores between the two groups. The Mann-Whitney test is a rank-based nonparametric test that compares the stochastic ordering of two independent samples without distributional assumptions, making it appropriate for small samples drawn from skewed populations. We report $U$, the exact two-sided $p$-value, and group-level descriptive statistics for the gap scores.
@@ -387,8 +402,10 @@ The complete pricing-demand-trajectory loop is illustrated in Figure~\ref{fig:or
\begin{figure}[ht] \begin{figure}[ht]
\centering \centering
\[ {\setlength{\arraycolsep}{4pt}%
\text{Oracle}(\vec{p}_{t-1},\vec{\hat{q}})\to \resizebox{0.85\linewidth}{!}{$
\begin{aligned}
&\text{Oracle}(\vec{p}_{t-1},\vec{\hat{q}})\to
\begin{pmatrix} \begin{pmatrix}
p_0\\ p_0\\
p_1\\ p_1\\
@@ -397,14 +414,15 @@ p_N
\end{pmatrix} \end{pmatrix}
\underrightarrow{d_i \sim \mathcal{N}_{\vec{p}}} \underrightarrow{d_i \sim \mathcal{N}_{\vec{p}}}
\begin{pmatrix}d_0\\ d_1\\ \cdots \\ d_N\end{pmatrix} \begin{pmatrix}d_0\\ d_1\\ \cdots \\ d_N\end{pmatrix}
\underrightarrow{\vec{d}\times \tau_\theta \to \tau^\prime} \underrightarrow{\vec{d}\otimes \tau_\theta}
\begin{bmatrix} \begin{bmatrix}
0.01 & 0.02 & \cdots & 0.3 \\ 0.01 & 0.02 & \cdots & 0.3 \\
0.41 & 0.24 & \cdots & 0.0 \\ 0.41 & 0.24 & \cdots & 0.0 \\
\cdots & \cdots & \cdots & \cdots \\ \cdots & \cdots & \cdots & \cdots \\
0.51 & 0.09 & \cdots & 0.1 \\ 0.51 & 0.09 & \cdots & 0.1 \\
\end{bmatrix} \end{bmatrix}
\underrightarrow{\tau_k \sim \tau^\prime} \\
&\underrightarrow{\tau_k \sim \tau^\prime}
\{\tau_k\}_{k=0}^K \to \hat{Q}(\tau_k) \{\tau_k\}_{k=0}^K \to \hat{Q}(\tau_k)
\to \begin{pmatrix} \to \begin{pmatrix}
\hat{q}_0 \\ \hat{q}_0 \\
@@ -413,8 +431,10 @@ p_N
\hat{q}_N \\ \hat{q}_N \\
\end{pmatrix} \end{pmatrix}
\to \text{Oracle}(\cdot) \to \text{Oracle}(\cdot)
\] \end{aligned}
\caption{Oracle-based pricing loop: historical price and demand state map to a new price vector; each product samples demand curves from $\mathcal{N}_{\vec{p}}$; trajectories are generated by mixing demand with behavioral kernels $\tau_\theta$ into transition matrix $\tau'$; sampled trajectories $\{\tau_k\}$ aggregate through proxy $Q(\cdot)$ to yield updated demand $\vec{\hat{q}}$, closing the feedback loop.} $}%
}
\caption{Oracle-based pricing loop: historical price and demand state map to a new price vector; each product samples demand curves from $\mathcal{N}_{\vec{p}}$; trajectories are generated via the Kronecker product $\vec{d}\otimes\tau_\theta$ into transition matrix $\tau'$; sampled trajectories $\{\tau_k\}$ aggregate through proxy $\hat{Q}(\cdot)$ to yield updated demand $\vec{\hat{q}}$, closing the feedback loop.}
\label{fig:oracle_flow} \label{fig:oracle_flow}
\end{figure} \end{figure}
@@ -461,7 +481,7 @@ We also consider taxation-like overlays for agent traffic under strategy-proof m
\subsubsection{Pricing Mechanism Summary} \subsubsection{Pricing Mechanism Summary}
We now present the complete pricing mechanism that integrates the behavioral separability, contamination estimation, and robust optimization components developed in the preceding sections. Algorithm~\ref{alg:phantom_loop_clean} formalizes the defensive pricing loop as a Stackelberg game where the platform (leader) sets prices and the aggregate demand (follower) responds through observed session trajectories. We now present the complete pricing mechanism that integrates the behavioral distinguishability, contamination estimation, and robust optimization components developed in the preceding sections. Algorithm~\ref{alg:phantom_loop_clean} formalizes the defensive pricing loop as a Stackelberg game where the platform (leader) sets prices and the aggregate demand (follower) responds through observed session trajectories.
\begin{algorithm}[t] \begin{algorithm}[t]
\caption{PHANTOM defensive pricing loop} \caption{PHANTOM defensive pricing loop}
@@ -494,3 +514,47 @@ We now present the complete pricing mechanism that integrates the behavioral sep
The algorithm operates in discrete epochs indexed by $t$. At each epoch, the platform applies one discrete multiplicative price action, the environment samples a batch of sessions, and demand is recomputed from weighted events. Robustness is implemented as an inner minimization over a small local grid of contamination candidates around nominal $\alpha_0$, matching the current engine implementation. The history buffer $\mathcal{L}$ (``Limbo'' in our implementation) enforces the alternating Stackelberg structure by preserving the temporal sequence of price publications and demand observations. The algorithm operates in discrete epochs indexed by $t$. At each epoch, the platform applies one discrete multiplicative price action, the environment samples a batch of sessions, and demand is recomputed from weighted events. Robustness is implemented as an inner minimization over a small local grid of contamination candidates around nominal $\alpha_0$, matching the current engine implementation. The history buffer $\mathcal{L}$ (``Limbo'' in our implementation) enforces the alternating Stackelberg structure by preserving the temporal sequence of price publications and demand observations.
%The defensive price update in Line 24 implements contamination-aware margin shrinkage: as estimated contamination $\hat{\alpha}_t$ rises, the margin $(p^{\mathrm{ref}} - c)$ is reduced by factor $\kappa\in[0,1]$, with projection $\Pi_{\mathcal{P}}$ ensuring feasibility. In subsequent experiments this heuristic rule is replaced by DR-RL policy $\pi^*$ from Eq.~\ref{eq:robust_policy}. %The defensive price update in Line 24 implements contamination-aware margin shrinkage: as estimated contamination $\hat{\alpha}_t$ rises, the margin $(p^{\mathrm{ref}} - c)$ is reduced by factor $\kappa\in[0,1]$, with projection $\Pi_{\mathcal{P}}$ ensuring feasibility. In subsequent experiments this heuristic rule is replaced by DR-RL policy $\pi^*$ from Eq.~\ref{eq:robust_policy}.
\subsection{Parallelization Strategy}
To avoid preemption of compute mid-training, we settle on a v4-generation, 40-chip compute node with 5 parallel workers. The login node creates an orchestration node with Ray \parencite{moritz_ray_2018}, and we distribute Ray compute nodes across the remaining workers, one per worker.
\subsubsection{Computational Cost Analysis of the Simulation Step}
The per-step cost of Algorithm~\ref{alg:phantom_loop_clean} is not uniform across its components. To inform hardware provisioning and to identify where algorithmic improvements are most impactful, we profile the hot path of the engine using Python's \texttt{cProfile} instrumentation over 20 environment steps under two configurations: a baseline with the robustness inner loop disabled ($K=1$, $\epsilon_\alpha=0$) and a standard robust setting ($K=5$, $\epsilon_\alpha=0.2$). Both runs use $M=10$ sessions per market call and $N=3$ products.
The baseline achieves approximately 26 steps per second. Enabling the robustness inner loop with $K=5$ candidates drops throughput to 7.2 steps per second, a $3.6\times$ slowdown for a $5\times$ increase in $K$, consistent with the $O(K)$ scaling of the adversarial alpha selection in the implementation.
\begin{table}[ht]
\centering
\caption{Per-step profiling results (20 steps, $M=10$ sessions, $N=3$ products). Self-time measures time spent inside the function excluding callees; cumulative time includes the full call subtree.}
\label{tab:profile_results}
\begingroup
\small
\setlength{\tabcolsep}{4pt}
\begin{tabular}{@{}lrrrr@{}}
\toprule
\textbf{Function} & \textbf{Calls} & \textbf{Self (ms)} & \textbf{Cum. (ms)} & \textbf{Cum. \%} \\
\midrule
\multicolumn{5}{l}{\textit{Baseline ($K=1$, 0.77\,s total, 26 steps/s)}} \\
\texttt{sample\_behavior\_from\_transitions} & 420 & 131 & 658 & 86\% \\
\texttt{DataFrame.xs} & 4,820 & 30 & 201 & 26\% \\
\texttt{numpy.nan\_to\_num} & 4,904 & 43 & 97 & 13\% \\
\texttt{adjust\_behavior\_to\_condition} & 84 & 3 & 54 & 7\% \\
\midrule
\multicolumn{5}{l}{\textit{Robust ($K=5$, 2.79\,s total, 7.2 steps/s)}} \\
\texttt{sample\_behavior\_from\_transitions} & 1,220 & 519 & 2,447 & 88\% \\
\texttt{DataFrame.xs} & 16,668 & 108 & 729 & 26\% \\
\texttt{numpy.nan\_to\_num} & 16,912 & 164 & 363 & 13\% \\
\texttt{adjust\_behavior\_to\_condition} & 244 & 11 & 108 & 4\% \\
\bottomrule
\end{tabular}
\endgroup
\end{table}
Across both configurations, \texttt{sample\_behavior\_from\_transitions} accounts for 86--88\% of total wall time. The function implements the Markov chain sampler described in Section~\ref{sec:tpe}: at each transition it retrieves the current-state row from the expanded transition \texttt{DataFrame} via label-based indexing, which internally dispatches through the pandas \texttt{xs} and \texttt{fast\_xs} code paths. For $M$ sessions each running up to $L_{\max}=40$ transitions, a single \texttt{market.act()} call issues up to $M \cdot L_{\max}$ individual row lookups. With $K=5$ robustness candidates per outer step this accumulates to $5 \times 10 \times 40 = 2{,}000$ row accesses per outer step, producing the 16k \texttt{xs} invocations observed in Table~\ref{tab:profile_results}.
The \texttt{numpy.nan\_to\_num} calls, accounting for 13\% of cumulative time, occur once per row lookup to sanitize sampled probability vectors before normalization; their call count therefore closely tracks the \texttt{xs} count.
\texttt{adjust\_behavior\_to\_condition} expands the base $E \times E$ event transition matrix to a $(E \cdot N) \times (E \cdot N)$ product-specific matrix via a Kronecker product. At $N=3$ this is inexpensive, but the cost scales as $O(E^2 N^2)$, so at the $N=10$ default it becomes a more significant contributor. The result is not cached across the $K$ robustness candidates inside a single outer step, meaning the Kronecker expansion is recomputed $2K$ times per step (once for the human kernel and once for the agent kernel at each candidate $\alpha_k$).
The dominant bottleneck therefore has a clear structural cause: the expanded transition matrix is a string-keyed \texttt{DataFrame}, and pandas object-level indexing carries substantial per-call overhead relative to the arithmetic being performed. Converting the expanded matrix to a \texttt{numpy} array with an accompanying integer state-to-index map, computed once per \texttt{market.act()} call and cached for the duration of the robustness inner loop, eliminates the entire pandas dispatch chain. We treat the identified bottleneck as an opportunity to close the gap left by the computational needs of the pricing learner. We make use of JAX to parallelize on the TPU and, surprisingly, obtain a large speedup even on CPU-only compute, improving throughput from 26 to 220 steps/s in the baseline configuration and from 7.2 to 136 steps/s under the full robust inner loop, an 8.5$\times$ and 19$\times$ speedup respectively.

View File

@@ -1,7 +1,7 @@
\section{Results} \section{Results}
\begin{figure}[ht] \begin{figure}[ht]
\centering \centering
\input{chapters/figures/supra.tex} \input{chapters/figures/supra/supra.tex}
\caption{Evolution of price distributions over experiment steps. The heatmap illustrates the density of price offerings. This is an early baseline simulation which demonstrates supra-competitive price-setting in deep learning agents such as SAC as can be clearly seen by the high density at the highest available price.} \caption{Evolution of price distributions over experiment steps. The heatmap illustrates the density of price offerings. This is an early baseline simulation which demonstrates supra-competitive price-setting in deep learning agents such as SAC as can be clearly seen by the high density at the highest available price.}
\label{fig:supra_heatmap} \label{fig:supra_heatmap}
\end{figure} \end{figure}
@@ -10,7 +10,7 @@
\subsection{Behavioral Analysis} \subsection{Behavioral Analysis}
Separability between human and agent sessions is evaluated by computing per-session divergence gap scores $\Delta_{H,s} - \Delta_{A,s}$ and comparing the two groups with a Mann-Whitney $U$ test. Table~\ref{tab:divergence_significance} reports the group-level descriptive statistics for the gap scores and the test result. Distinguishability between human and agent sessions is evaluated by computing per-session divergence gap scores $\Delta_{H,s} - \Delta_{A,s}$ and comparing the two groups with a Mann-Whitney $U$ test. The full recorded cohort contains $n_H=13$ human sessions and $n_A=16$ agent sessions, and Table~\ref{tab:divergence_significance} reports the corresponding group-level statistics and test result.
\begin{table}[ht] \begin{table}[ht]
\centering \centering
@@ -20,48 +20,67 @@ Separability between human and agent sessions is evaluated by computing per-sess
\toprule \toprule
Group & $n$ & Mean gap & Std \\ Group & $n$ & Mean gap & Std \\
\midrule \midrule
Human sessions & 11 & $-3.3522$ & $2.6748$ \\ Human sessions & 13 & $-3.35$ & $2.67$ \\
Agent sessions & 6 & $+1.6482$ & $2.8349$ \\ Agent sessions & 16 & $+1.65$ & $2.83$ \\
\midrule \midrule
\multicolumn{4}{l}{Mann-Whitney $U = 2.0$, $p = 0.0006$ (two-sided)} \\ \multicolumn{4}{l}{Mann-Whitney two-sided test: $p<0.001$} \\
\bottomrule \bottomrule
\end{tabular} \end{tabular}
\end{table} \end{table}
The sign structure is consistent with the theoretical expectation: human sessions produce negative gap scores (closer to the human centroid, far from the agent centroid) while agent sessions produce positive gap scores (closer to the agent centroid). The two-sided $p$-value of $0.0006$ indicates near-complete rank separation between the groups at $n_H=11$, $n_A=6$, providing strong evidence that the transition kernels are separable enough to justify their use as a control signal in downstream pricing. The sign structure is consistent with the theoretical expectation: human sessions produce negative gap scores (closer to the human centroid, far from the agent centroid) while agent sessions produce positive gap scores (closer to the agent centroid). The two-sided test result ($p<0.001$) at $n_H=13$, $n_A=16$ indicates strong rank distinction between groups, providing evidence that the transition kernels are distinguishable enough to justify their use as a control signal in downstream pricing.
\subsection{Experimental Outcomes} \subsection{Experimental Outcomes}
To evaluate robustness contributions, we compare two policies on the same environment family: (i) robust pricing with COI-aware reward and adversarial contamination step, and (ii) non-robust baseline with revenue-only reward (\texttt{--no-robust}). To evaluate robustness contributions, we compare two policies on the same environment family: (i) robust pricing with COI-aware reward and adversarial contamination step, and (ii) a baseline policy with revenue-only reward.
We report two preliminary stages before the full factorial interpretation. First, we executed a short calibration run at $\alpha=0.3$ (2 evaluation episodes, 3000 training timesteps per tier) across \texttt{qtable}, \texttt{ppo}, \texttt{a2c}, and \texttt{dqn}. In that first run, \texttt{ppo} produced the highest objective score and revenue (objective $=3.76\mathrm{e}5$, revenue $=4.15\mathrm{e}5$), while the remaining tiers stayed lower in this small-budget regime. The corresponding price traces show a monotone escalation for \texttt{ppo} (mean price from $8.61\mathrm{e}1$ to $1.49\mathrm{e}2$), whereas \texttt{qtable}, \texttt{a2c}, and \texttt{dqn} remained nearly flat over the episode horizon. This confirms that the simulation loop is able to express policy-dependent pricing dynamics rather than collapsing into a single trajectory shape. We report two preliminary stages before the full factorial interpretation. First, we executed a short calibration run at $\alpha=0.3$ (2 evaluation episodes, 3000 training timesteps per tier) across \texttt{qtable}, \texttt{ppo}, \texttt{a2c}, and \texttt{dqn}. In that first run, \texttt{ppo} produced the highest objective score and revenue (objective $=3.76\mathrm{e}5$, revenue $=4.15\mathrm{e}5$), while the remaining tiers stayed lower in this small-budget regime. The corresponding price traces show a monotone escalation for \texttt{ppo} (mean price from $8.61\mathrm{e}1$ to $1.49\mathrm{e}2$), whereas \texttt{qtable}, \texttt{a2c}, and \texttt{dqn} remained nearly flat over the episode horizon. This confirms that the simulation loop is able to express policy-dependent pricing dynamics rather than collapsing into a single trajectory shape.
Second, we launched an overnight paired benchmark over $\alpha \in \{0.00,0.15,0.30,0.45,0.60\}$ with 8 evaluation episodes and 8000 timesteps, comparing robust and non-robust settings at fixed seed/tier/contamination tuples. At the time of writing, two seeds (11 and 22) are complete and one additional seed is still running. We therefore frame the numbers below as an initial signal, not a final claim.
\begin{table}[ht] \subsubsection{The Impact of Contamination on Revenue}
A linear fit test on run-level data ($n=95$) shows a strong negative association between contamination and mean revenue. The fitted model mapping $\alpha \to \text{revenue}$ results in $t(93)=-8.2148$, $p=1.20\times 10^{-12}$, $R^2=0.4205$, and a 95\% confidence interval for the slope of $[-75{,}288.76,\,-45{,}975.13]$. In practical terms, a $+0.1$ increase in $\alpha$ corresponds to an average decrease of about $6{,}063$ revenue units within our environment.
\subsubsection{Large Scale Factorial Training}
In our complete training runs we logged $\approx 180$ days of net compute time. The results we draw from extensive training are:
\begin{enumerate*}[label=(\roman*)]
\item the ability to extract COI is greater in the presence of robustness within the training loop;
\item short-term revenue measurements suffer $\approx 3\%$ loss but COI margin compensates for this loss in the long run;
\item a larger catalog size contributes positively to COI preservation under higher contamination ratios; and
\item supra-competitive pricing is a natural reward hacking tendency which is drastically reduced by a balanced UX penalty.
\end{enumerate*}
\begin{figure}[ht]
\centering \centering
\caption{Early overnight aggregate over completed seeds ($n=2$; seeds 11 and 22).} \input{chapters/figures/results/includes/final/final_focus_revenue_by_alpha.tex}
\label{tab:pricing_benchmark} \caption{Revenue curves by contamination for the final cohort. The baseline remains above the defended curve in most cells, but the gap narrows in the high-contamination region.}
\begin{tabular}{lcccc} \label{fig:final_focus_revenue_by_alpha}
\toprule \end{figure}
Mode & Mean objective score & Mean revenue & Mean COI level & Mean margin \\ % TODO: we need a similar plot which shows the COI preserved (what we gain across the multiple contamination levels, showing that the robust method has better COI optimization.)
\midrule
Robust & $3.41\mathrm{e}5$ & $3.80\mathrm{e}5$ & $1.08\mathrm{e}2$ & 0.901 \\
Non-robust (\texttt{--no-robust}) & $3.91\mathrm{e}5$ & $4.18\mathrm{e}5$ & $1.11\mathrm{e}2$ & 0.906 \\
\bottomrule
\end{tabular}
\end{table}
At pair level (same seed, tier, and contamination), robust exceeds non-robust in $13/40$ configurations on objective score and in $16/40$ configurations on revenue. The current early evidence therefore suggests a conditional robustness effect: the defense is active and measurable, but not yet uniformly beneficial without further calibration. \begin{figure}[ht]
\centering
\input{chapters/figures/results/includes/final/final_focus_revenue_delta.tex}
\caption{Defended-minus-baseline revenue delta over contamination for the final cohort. The strongest high-contamination deviation begins at $\alpha=0.7$, followed by recovery toward near parity by $\alpha=1.0$.}
\label{fig:final_focus_revenue_delta}
\end{figure}
\begin{figure}[ht]
\centering
\input{chapters/figures/results/includes/final/final_focus_risk_deltas.tex}
\caption{Defended-minus-baseline leakage and volatility deltas for the final cohort. Leakage remains lower for the defended policy across the full contamination range.}
\label{fig:final_focus_risk_deltas}
\end{figure}
\subsection{Interpretation and Insights} \subsection{Interpretation and Insights}
The Mann-Whitney result ($U=2.0$, $p<0.001$) confirms that per-session divergence gaps separate the two actor classes with near-zero overlap in rank ordering. This is the condition required for separability to act as a useful control signal in the pricing loop rather than just an auxiliary classifier score. The Mann-Whitney result ($p<0.001$) confirms that per-session divergence gaps distinguish the two actor classes with near-zero overlap in rank ordering. This is the condition required for distinguishability to act as a useful control signal in the pricing loop rather than just an auxiliary classifier score.
The first calibration and overnight runs additionally confirm three practical points aligned with the thesis mechanism. First, the control loop is reproducible end-to-end (training, evaluation, artifact generation) across algorithms and contamination levels. Second, policy class materially changes price trajectories and resulting COI/revenue profiles under identical environment settings. Third, objective improvements from robustness are regime-dependent in the current baseline, which is consistent with the thesis claim that contamination-aware pricing needs explicit calibration rather than a one-size-fits-all penalty. The first calibration and paired benchmark runs additionally confirm three practical points aligned with the thesis. First, the control loop is reproducible end-to-end (training, evaluation, artifact generation) across algorithms and contamination levels. Second, policy class materially changes price trajectories and resulting COI/revenue profiles under identical environment settings. Third, objective improvements from robustness are regime-dependent in the current baseline, which is consistent with the thesis claim that contamination-aware pricing needs explicit calibration rather than a one-size-fits-all penalty.
We also note that maximizing revenue in isolation can favor aggressive high-price behavior; even in these early runs, the non-robust aggregate shows slightly higher mean COI and margin. For this reason, all subsequent reporting in this thesis is interpreted on a multi-metric basis (objective, revenue, COI, and stability), and not by revenue alone. We also note that maximizing revenue in isolation can favor aggressive high-price behavior; even in these early runs, the non-robust aggregate shows slightly higher mean COI and margin. For this reason, all subsequent reporting in this thesis is interpreted on a multi-metric basis (objective, revenue, COI, and stability), and not by revenue alone.
\subsection{Anomalies} \subsection{Anomalies}
In our initial runs, we observed an instability pocket in one completed run (A2C, robust, seed 11, $\alpha=0.30$) with a large performance drop relative to neighboring configurations. We retain this run in the preliminary summary to avoid survivorship bias and treat it as evidence that robustness sensitivity analysis is necessary before final conclusions. In our initial runs, we observed an instability pocket in one completed run (A2C, robust, seed 11, $\alpha=0.30$) with a large performance drop relative to neighboring configurations. We retain this run in the preliminary summary to avoid survivorship bias and treat it as evidence that robustness sensitivity analysis is necessary before final conclusions.

View File

@@ -16,6 +16,4 @@ This technology does not come without a more bitter side, ethical concerns do ar
With a system like this there is potential for strong drift given the rapid advance of agentic systems and user preference. Our intent behind adding the UX term into the reward shaping process was to further address the risk of degraded user experience. Looking deeper at the underlying methodology, reinforcement learning does not come without its complications such as reward hacking and often the lack of interpretability which is quite critical in systems that have a strong impact on the revenue of a company. With a system like this there is potential for strong drift given the rapid advance of agentic systems and user preference. Our intent behind adding the UX term into the reward shaping process was to further address the risk of degraded user experience. Looking deeper at the underlying methodology, reinforcement learning does not come without its complications such as reward hacking and often the lack of interpretability which is quite critical in systems that have a strong impact on the revenue of a company.
\subsection{Implications of Findings} % \subsection{Implications of Findings} Interpretation of results and alternative scenarios with broader market implications.
Interpretation of results and alternative scenarios with broader market implications.

View File

@@ -1,11 +1,24 @@
\section{Conclusion} \section{Conclusion}
For our troubles, we now conclude that... Our research has explored how reinforcement learning works within pricing systems and environments which are substantially disrupted by an adversarial participant. Our findings include the optimization for our newly introduced metrics.
\subsection{Summary of contributions} \subsection{Summary of contributions}
The author's contribution was not without the advice of many experienced experts in the field. We thank Marco Casalaina, VP Products, Core AI and AI Futurist at Microsoft, for the initial critical discussion on the topic of dynamic pricing systems and the spark which has led to this work. We thank Eugene Bykovets, PhD, for pointing out the parallels in blockchain systems and the complexity of anonymous interaction and understanding of intent. Importantly, we acknowledge the contributions of Alberto Martín Izquierdo, my academic advisor, for the support throughout and for taking on the challenge of this ambitious work. Many breakthroughs were thanks to numerous discussions with my peers on the topics covered here. The contribution was not without the advice of many experienced experts in the field. We thank Marco Casalaina, VP Products, Core AI and AI Futurist at Microsoft, for the initial critical discussion on the topic of dynamic pricing systems and the spark which has led to this work. We thank Eugene Bykovets, PhD, for pointing out the parallels in blockchain systems and the complexity of anonymous interaction and understanding of intent. Importantly, we acknowledge the contributions of Alberto Martín Izquierdo, my academic advisor, for the support throughout and for taking on the challenge of this ambitious work. Many breakthroughs were thanks to numerous discussions with my peers on the topics covered here.
A thanks to the head of innovation at Amadeus for insight into the industry split on the topic of collapsing margins. Finally we acknowledge the power and use of generative AI technologies for in depth research, rapid prototyping and surfacing of key topics and niches. A thanks to the head of innovation at Amadeus for insight into the industry split on the topic of collapsing margins. Finally we acknowledge the power and use of generative AI technologies for in depth research, rapid prototyping and surfacing of key topics and niches.
We now state explicitly what this paper contributes:
\begin{itemize}
\item TPU-accelerated parallelization of the behavioral simulation and reinforcement learning pipeline, making large-scale factorial sweeps tractable.
\item Formalization of non-human transaction orchestration in e-commerce as a distinct source of contamination in dynamic pricing systems.
\item Definition of the Cost of Information (COI) as a mechanism-level quantity for pricing power, together with a theorem showing its erosion under increasing agent saturation.
\item Design and implementation of a controlled e-commerce research platform, built on a hybrid Kappa-Lambda architecture, for collecting and replaying high-fidelity interaction trajectories.
\item Construction and empirical validation of a behavioral distinguishability framework that distinguishes human and agent sessions from interaction signals alone using transition kernels and KL-based divergence.
\item Development of a generative contamination mechanism that injects learned agent behavior into the pricing environment for controlled robustness experiments.
\item Translation of behavioral distinguishability into a defensive pricing mechanism through a distributionally robust reinforcement learning formulation of pricing under non-stationary contamination.
\item Empirical evidence that agent contamination reduces revenue and that robustness is condition-dependent, requiring explicit calibration rather than a one-size-fits-all penalty.
\item Release of a reusable public experimental artifact for reproducing and extending research on dynamic pricing under agent-mediated traffic.
\end{itemize}
\subsection{Future Works and Next Steps} \subsection{Future Works and Next Steps}
During the eight months of research dedicated to this work, a plethora of opportunities and industry gaps were identified, the majority of which sadly could not be addressed directly. During the eight months of research dedicated to this work, a plethora of opportunities and industry gaps were identified, the majority of which sadly could not be addressed directly.

View File

@@ -0,0 +1,165 @@
---
pretty_name: whoclickedit
license: mit
language:
- en
task_categories:
- tabular-classification
task_ids:
- tabular-multi-class-classification
tags:
- e-commerce
- dynamic-pricing
- behavioral-telemetry
- human-vs-agent
- session-data
size_categories:
- 1K<n<10K
---
<img align="right" width="280" src="https://raw.githubusercontent.com/velocitatem/PHANTOM/main/docs/static/images/banner.svg" alt="PHANTOM research banner" />
# [whoclickedit](https://huggingface.co/datasets/velocitatem/whoclickedit)
[![Dataset on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/dataset-on-hf-sm.svg)](https://huggingface.co/datasets/velocitatem/whoclickedit)
![Rows](https://img.shields.io/badge/Rows-3874-0A9396?style=flat-square)
![Columns](https://img.shields.io/badge/Columns-42-005F73?style=flat-square)
![Sessions](https://img.shields.io/badge/Sessions-36-1D3557?style=flat-square)
![Human rows](https://img.shields.io/badge/Human%20rows-798-2A9D8F?style=flat-square)
![Agent rows](https://img.shields.io/badge/Agent%20rows-3076-E76F51?style=flat-square)
![License](https://img.shields.io/badge/License-MIT-111827?style=flat-square)
> **Event-level behavior data for dynamic pricing research.**
> This dataset captures how humans and automated agents browse, query prices, and move through the PHANTOM storefronts during controlled experiments.
## What this dataset gives you
- A single flat file (`whoclicked.csv`) with both interaction and price-log events.
- Explicit labels for actor origin: `actor_type` and `is_agent`.
- Provenance fields from Kafka envelopes when available.
- Metadata flattened into feature-ready `metadata_*` columns.
## Snapshot
| Metric | Value |
| --- | --- |
| Rows | `3874` |
| Columns | `42` |
| Time range (UTC) | `2025-12-05T09:43:31.301000+00:00` -> `2026-03-23T12:08:30.151000+00:00` |
| Unique sessions | `36` |
## Composition
### Rows by actor
| Actor | Rows | Share |
| --- | --- | --- |
| `human` | 798 | 20.6% |
| `agent` | 3076 | 79.4% |
### Rows by actor and record type
| Actor | Record type | Rows |
| --- | --- | --- |
| `agent` | `interaction` | 197 |
| `agent` | `price_log` | 2879 |
| `human` | `interaction` | 328 |
| `human` | `price_log` | 470 |
### Store mode coverage
| Store mode | Rows |
| --- | --- |
| `hotel` | 3628 |
| `airline` | 196 |
| `shop` | 50 |
### Top interaction events
| Interaction event | Count |
| --- | --- |
| `page_view` | 246 |
| `learn_more_about_item` | 91 |
| `view_item_page` | 88 |
| `add_item_to_cart` | 47 |
| `hover_over_title` | 23 |
| `checkout_start` | 20 |
| `hover_over_paragraph` | 6 |
| `remove_item` | 4 |
## Collection pipeline
Data is sourced from two roots inside PHANTOM:
- `experiments/collected_data` (human sessions)
- `experiments/agents/collected_data` (agent sessions)
Each session directory contains:
- `int.json`: user interaction events
- `price.json`: price quote observations
ETL behavior:
1. Accepts both Kafka-envelope records and flat payload records.
2. Flattens nested JSON to a tabular schema.
3. Preserves row-level provenance (`source_session_dir`, `source_row_index`, topic fields).
4. Adds modeling labels (`actor_type`, `is_agent`, `record_type`).
## Schema highlights
Core modeling fields:
- `actor_type`, `is_agent`, `record_type`
- `sessionId`, `experimentId`, `storeMode`, `ts`
- `eventName`, `page`, `productId`, `price`, `userAgent`
Kafka provenance fields:
- `kafka_partition_id`, `kafka_offset`, `kafka_timestamp_ms`, `kafka_compression`
- `kafka_is_transactional`, `kafka_headers`, `kafka_key_*`, `kafka_value_*`
<details>
<summary>Metadata columns in this release</summary>
- `metadata_cabinClass`
- `metadata_dateIndex`
- `metadata_dwellTime`
- `metadata_elementText`
- `metadata_fareRule`
- `metadata_flightType`
- `metadata_itemCount`
- `metadata_nights`
- `metadata_price`
- `metadata_referrer`
- `metadata_roomType`
- `metadata_total`
- `metadata_type`
</details>
## Quick start
```python
from datasets import load_dataset
ds = load_dataset("velocitatem/whoclickedit")
```
Recommended split strategy:
- Prefer session-aware or time-aware splits.
- Do not split rows from the same `sessionId` across train and test.
## Intended use
- Human-vs-agent behavior classification.
- Session-level telemetry modeling for dynamic pricing defenses.
- Robustness experiments under agent-mediated reconnaissance.
## Safety and limitations
- `userAgent` and referrer metadata can be quasi-identifying in very small samples.
- Data comes from a controlled research platform, not a full production marketplace.
- Current release has stronger coverage for `hotel` flows than `airline` flows.
## Citation
If you use this dataset, cite the PHANTOM thesis project and link this page:
`https://huggingface.co/datasets/velocitatem/whoclickedit`

3
paper/src/chapters/figures/.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
__pycache__/
*.pyc
.pdf-view-restore

View File

@@ -0,0 +1,12 @@
alpha,revenue_delta,revenue_delta_pct,reward_delta,reward_delta_pct,volatility_delta,supra_delta,coi_leakage_delta
0.0,-17982.383542886935,-5.11072862876989,-17145.799161982606,-5.235033672101227,0.001232973729699119,0.0,-0.0030412479577408003
0.1,-14962.041501283413,-4.410637208586118,-14303.760282736213,-4.531344436782669,0.0011858665298920962,0.0,-0.004133727080174038
0.2,-16153.416666167905,-4.826514761457546,-15398.621298776357,-4.9418165571901715,0.00200624274016295,0.0,-0.0033201883450373615
0.3,-17294.9275360335,-5.382423616385397,-16544.91845114401,-5.533399709364953,-0.0011022484400295268,0.0,-0.0029151149203366505
0.4,-19661.294346174283,-6.250307313590199,-18728.35578200908,-6.3953153560217535,3.582812967113658e-05,0.0,-0.0038123361988749577
0.5,-16411.03168918495,-5.3630681206030015,-15638.77510066732,-5.4888928630525315,0.00015428950526953644,0.0,-0.00439661338956944
0.6,-14729.668247641937,-5.069964928178309,-13912.22417824401,-5.148827377884945,-0.002735776807082743,0.0,-0.004310129386364658
0.7,-21160.81910514756,-7.351404104505076,-20171.762105623755,-7.525169314210056,-0.0008903632602569461,0.0,-0.0026198461183787186
0.8,-16404.76825612632,-5.9342582959227075,-15645.025250480074,-6.078699946285722,0.0010338614665691137,0.0,-0.002542765270289696
0.9,-8674.090655496111,-3.2592966246269577,-8371.30734891587,-3.378943339994106,-0.0005579187914590139,0.0,-0.0013720835439427759
1.0,768.8099906174757,0.2991618705853567,399.7394696234842,0.16706914330070038,0.0014659834822295797,0.0,-0.0007600066499474645
1 alpha revenue_delta revenue_delta_pct reward_delta reward_delta_pct volatility_delta supra_delta coi_leakage_delta
2 0.0 -17982.383542886935 -5.11072862876989 -17145.799161982606 -5.235033672101227 0.001232973729699119 0.0 -0.0030412479577408003
3 0.1 -14962.041501283413 -4.410637208586118 -14303.760282736213 -4.531344436782669 0.0011858665298920962 0.0 -0.004133727080174038
4 0.2 -16153.416666167905 -4.826514761457546 -15398.621298776357 -4.9418165571901715 0.00200624274016295 0.0 -0.0033201883450373615
5 0.3 -17294.9275360335 -5.382423616385397 -16544.91845114401 -5.533399709364953 -0.0011022484400295268 0.0 -0.0029151149203366505
6 0.4 -19661.294346174283 -6.250307313590199 -18728.35578200908 -6.3953153560217535 3.582812967113658e-05 0.0 -0.0038123361988749577
7 0.5 -16411.03168918495 -5.3630681206030015 -15638.77510066732 -5.4888928630525315 0.00015428950526953644 0.0 -0.00439661338956944
8 0.6 -14729.668247641937 -5.069964928178309 -13912.22417824401 -5.148827377884945 -0.002735776807082743 0.0 -0.004310129386364658
9 0.7 -21160.81910514756 -7.351404104505076 -20171.762105623755 -7.525169314210056 -0.0008903632602569461 0.0 -0.0026198461183787186
10 0.8 -16404.76825612632 -5.9342582959227075 -15645.025250480074 -6.078699946285722 0.0010338614665691137 0.0 -0.002542765270289696
11 0.9 -8674.090655496111 -3.2592966246269577 -8371.30734891587 -3.378943339994106 -0.0005579187914590139 0.0 -0.0013720835439427759
12 1.0 768.8099906174757 0.2991618705853567 399.7394696234842 0.16706914330070038 0.0014659834822295797 0.0 -0.0007600066499474645

View File

@@ -0,0 +1,23 @@
alpha,mode,runs,revenue_mean,reward_mean,supra_mean,volatility_mean,coi_leakage_mean,coi_level_mean
0.0,baseline,36,351855.57381502265,327520.32242613373,0.0,0.06922494093544151,0.11931704468268205,136.80105514058158
0.0,defended,35,333873.1902721357,310374.5232641511,0.0,0.07045791466514063,0.11627579672494125,136.81832905386602
0.1,baseline,32,339226.3020897988,315662.6136522988,0.0,0.06952778671756812,0.11924519238669087,136.47864859317326
0.1,defended,33,324264.2605885154,301358.8533695626,0.0,0.07071365324746022,0.11511146530651684,136.7200845824852
0.2,baseline,31,334680.76789409376,311598.399506997,0.0,0.06848006194428993,0.11597869134898402,136.83684469591932
0.2,defended,35,318527.35122792586,296199.77820822067,0.0,0.07048630468445288,0.11265850300394666,137.2758153292305
0.3,baseline,30,321322.30327214615,299000.9636054795,0.0,0.07085669473747759,0.11527347603412934,136.4452630715689
0.3,defended,44,304027.37573611265,282456.0451543355,0.0,0.06975444629744806,0.11235836111379269,136.4704115371568
0.4,baseline,33,314565.2423109539,292844.914432166,0.0,0.07031811881503117,0.11300307992768284,136.72547178046122
0.4,defended,38,294903.9479647796,274116.55865015695,0.0,0.0703539469447023,0.10919074372880788,136.75671002806396
0.5,baseline,33,306000.80625751516,284916.7489847879,0.0,0.06938663916591635,0.11118137138243217,136.9528780620641
0.5,defended,35,289589.7745683302,269277.9738841206,0.0,0.06954092867118589,0.10678475799286273,136.65018588845163
0.6,baseline,28,290528.0106727377,270201.7985298805,0.0,0.07139577980623227,0.11081647254398667,135.258395468266
0.6,defended,41,275798.3424250958,256289.57435163652,0.0,0.06866000299914952,0.10650634315762202,136.3194947785247
0.7,baseline,40,287847.3119465684,268057.25244656845,0.0,0.07132313199532896,0.10746267580456732,137.0170522633547
0.7,defended,40,266686.49284142087,247885.4903409447,0.0,0.07043276873507201,0.1048428296861886,136.56834095392904
0.8,baseline,26,276441.76303208206,257374.52726285128,0.0,0.06945655282263205,0.1063246766773884,136.66765260798618
0.8,defended,39,260036.99477595574,241729.5020123712,0.0,0.07049041428920116,0.1037819114070987,136.61222667078658
0.9,baseline,35,266133.8213268301,247749.2667554015,0.0,0.0709569180547784,0.10455882265976374,136.5370653814206
0.9,defended,39,257459.73067133396,239377.95940648564,0.0,0.07039899926331938,0.10318673911582096,136.7368893225831
1.0,baseline,35,256987.96076959255,239265.888198164,0.0,0.06888231148034313,0.10369761394735275,136.68691718467974
1.0,defended,30,257756.77076021003,239665.62766778748,0.0,0.07034829496257271,0.10293760729740528,136.65287739235566
1 alpha mode runs revenue_mean reward_mean supra_mean volatility_mean coi_leakage_mean coi_level_mean
2 0.0 baseline 36 351855.57381502265 327520.32242613373 0.0 0.06922494093544151 0.11931704468268205 136.80105514058158
3 0.0 defended 35 333873.1902721357 310374.5232641511 0.0 0.07045791466514063 0.11627579672494125 136.81832905386602
4 0.1 baseline 32 339226.3020897988 315662.6136522988 0.0 0.06952778671756812 0.11924519238669087 136.47864859317326
5 0.1 defended 33 324264.2605885154 301358.8533695626 0.0 0.07071365324746022 0.11511146530651684 136.7200845824852
6 0.2 baseline 31 334680.76789409376 311598.399506997 0.0 0.06848006194428993 0.11597869134898402 136.83684469591932
7 0.2 defended 35 318527.35122792586 296199.77820822067 0.0 0.07048630468445288 0.11265850300394666 137.2758153292305
8 0.3 baseline 30 321322.30327214615 299000.9636054795 0.0 0.07085669473747759 0.11527347603412934 136.4452630715689
9 0.3 defended 44 304027.37573611265 282456.0451543355 0.0 0.06975444629744806 0.11235836111379269 136.4704115371568
10 0.4 baseline 33 314565.2423109539 292844.914432166 0.0 0.07031811881503117 0.11300307992768284 136.72547178046122
11 0.4 defended 38 294903.9479647796 274116.55865015695 0.0 0.0703539469447023 0.10919074372880788 136.75671002806396
12 0.5 baseline 33 306000.80625751516 284916.7489847879 0.0 0.06938663916591635 0.11118137138243217 136.9528780620641
13 0.5 defended 35 289589.7745683302 269277.9738841206 0.0 0.06954092867118589 0.10678475799286273 136.65018588845163
14 0.6 baseline 28 290528.0106727377 270201.7985298805 0.0 0.07139577980623227 0.11081647254398667 135.258395468266
15 0.6 defended 41 275798.3424250958 256289.57435163652 0.0 0.06866000299914952 0.10650634315762202 136.3194947785247
16 0.7 baseline 40 287847.3119465684 268057.25244656845 0.0 0.07132313199532896 0.10746267580456732 137.0170522633547
17 0.7 defended 40 266686.49284142087 247885.4903409447 0.0 0.07043276873507201 0.1048428296861886 136.56834095392904
18 0.8 baseline 26 276441.76303208206 257374.52726285128 0.0 0.06945655282263205 0.1063246766773884 136.66765260798618
19 0.8 defended 39 260036.99477595574 241729.5020123712 0.0 0.07049041428920116 0.1037819114070987 136.61222667078658
20 0.9 baseline 35 266133.8213268301 247749.2667554015 0.0 0.0709569180547784 0.10455882265976374 136.5370653814206
21 0.9 defended 39 257459.73067133396 239377.95940648564 0.0 0.07039899926331938 0.10318673911582096 136.7368893225831
22 1.0 baseline 35 256987.96076959255 239265.888198164 0.0 0.06888231148034313 0.10369761394735275 136.68691718467974
23 1.0 defended 30 257756.77076021003 239665.62766778748 0.0 0.07034829496257271 0.10293760729740528 136.65287739235566

View File

@@ -0,0 +1,27 @@
{
"bundle": "engine/studies/results/wandb_sweep_bundles/bundle_20260317_093826",
"focus_cohort": "max_alpha_coverage",
"alpha_cells": 11,
"alpha_min": 0.0,
"alpha_max": 1.0,
"mean_revenue_delta_pct": -4.787221975639986,
"mean_reward_delta_pct": -4.91730667541704,
"zone_summary": [
{
"zone": "high_alpha_0_7_plus",
"alpha_cells": 4,
"revenue_delta_pct_mean": -4.0614492886173466,
"reward_delta_pct_mean": -4.2039358642972955,
"coi_leakage_delta_mean": -0.0018236753956396637,
"volatility_delta_mean": 0.00026289072427068336
},
{
"zone": "low_alpha_below_0_7",
"alpha_cells": 7,
"revenue_delta_pct_mean": -5.201949225367208,
"reward_delta_pct_mean": -5.324947138914036,
"coi_leakage_delta_mean": -0.0037041938968711296,
"volatility_delta_mean": 0.00011102505536893832
}
]
}

View File

@@ -0,0 +1,3 @@
zone,alpha_cells,revenue_delta_pct_mean,reward_delta_pct_mean,coi_leakage_delta_mean,volatility_delta_mean
high_alpha_0_7_plus,4,-4.0614492886173466,-4.2039358642972955,-0.0018236753956396637,0.00026289072427068336
low_alpha_below_0_7,7,-5.201949225367208,-5.324947138914036,-0.0037041938968711296,0.00011102505536893832
1 zone alpha_cells revenue_delta_pct_mean reward_delta_pct_mean coi_leakage_delta_mean volatility_delta_mean
2 high_alpha_0_7_plus 4 -4.0614492886173466 -4.2039358642972955 -0.0018236753956396637 0.00026289072427068336
3 low_alpha_below_0_7 7 -5.201949225367208 -5.324947138914036 -0.0037041938968711296 0.00011102505536893832

View File

@@ -0,0 +1,10 @@
{
"runs": 340,
"tiers": 5,
"alphas": 6,
"status": "ok",
"mean_tier_revenue_robust": 190714.62212212436,
"mean_tier_revenue_no_robust": 197371.17216609977,
"mean_tier_revenue_delta": -6656.5500439754105,
"mean_tier_revenue_delta_pct": -3.3726050116242514
}

View File

@@ -0,0 +1,31 @@
tier,alpha,runs_robust,runs_no_robust,eval_revenue_mean_delta,eval_revenue_mean_delta_pct,eval_reward_mean_delta,eval_reward_mean_delta_pct,eval_coi_level_mean_delta,eval_coi_level_mean_delta_pct,eval_margin_mean_delta,eval_margin_mean_delta_pct,objective_score_delta,objective_score_delta_pct,train_alpha_adv_delta,train_alpha_adv_delta_pct
dqn,0.0,5.0,2.0,-31308.987414117495,-8.73651226889534,-1909.7427407095092,-0.5742991901121623,-2.8982436567700063,-2.1108702433020436,-0.001972064237093285,-0.2116777198290971,-1909.7427407095092,-0.5742991901121623,,
dqn,0.1,8.0,4.0,-7723.542755668925,-2.2789188721535494,-74239.37371836061,-21.063854618469847,1.7435833801418141,1.2859365583872486,0.0011891962142838164,0.1278074871971924,-74239.37371836061,-21.063854618469847,0.17619791666666657,176.19791666666694
dqn,0.25,7.0,3.0,-12344.82818986749,-3.7035466052614323,93154.03627578515,36.06691230407512,0.03214544949867104,0.023426184113378143,1.763733457238459e-05,0.001893256490383175,93154.03627578515,36.06691230407512,0.14530952380952394,58.12380952380958
dqn,0.4,5.0,10.0,-7816.300706216833,-2.4694340725162824,-42362.74668471434,-13.411888482380219,0.6251272343707797,0.4579446603861758,0.0002750615520492605,0.02953644634355915,-42362.74668471434,-13.411888482380219,0.09856666666666747,24.64166666666691
dqn,0.6,5.0,4.0,-16150.011887742497,-5.347485987139731,-28508.74710866122,-10.151356300001888,-0.63306323164079,-0.46056970247177387,-0.00034537433455417155,-0.0370668515552649,-28508.74710866122,-10.151356300001888,0.1361999999999981,22.699999999999644
dqn,0.8,7.0,6.0,-18191.8826663699,-6.440527544692988,-55296.94441124235,-20.19273590083627,-0.796733634735034,-0.579832425016392,-0.0006423984775592029,-0.0689476165584585,-55296.94441124235,-20.19273590083627,0.1532857142857158,19.160714285714512
linear,0.0,9.0,8.0,-14967.67388588126,-4.273413942959129,-20107.23171681742,-6.60039931288617,-0.06127790826209889,-0.04564810574240612,-7.607744079518586e-05,-0.008177885913528719,-20107.23171681742,-6.60039931288617,,
linear,0.1,3.0,5.0,-24531.399901538738,-7.171831328305365,-96669.7835552101,-26.44920711447249,-0.3680976907859872,-0.2733723058172187,-0.0002515287835096469,-0.02702956778346356,-96669.7835552101,-26.44920711447249,,
linear,0.25,6.0,9.0,-14840.859479571285,-4.520682292638562,-26510.179456423968,-8.033117756667396,-0.13734776448131925,-0.10212641096230607,-9.41162442338328e-05,-0.010115001392981545,-26510.179456423968,-8.033117756667396,,
linear,0.4,4.0,11.0,-17196.7642560167,-5.486915251242723,-74520.10209817477,-25.042311510043184,0.12217076984330788,0.09098828726103136,0.00010713887099822461,0.011516865671259795,-74520.10209817477,-25.042311510043184,,
linear,0.6,5.0,3.0,-14284.06615788641,-4.854766876637072,38417.71856593515,14.088596762512362,0.24251461234271687,0.1806530855220358,0.0002606811969937395,0.028024824619509187,38417.71856593515,14.088596762512362,,
linear,0.8,4.0,11.0,-10840.488575784548,-3.933600919557566,15749.581078662042,6.447651726824251,0.028051260535562506,0.020876236575910773,5.361882659971062e-05,0.005763158099097226,15749.581078662042,6.447651726824251,,
qtable,0.0,9.0,8.0,-18644.457288398524,-8.15323701554329,32993.42568058451,20.675688115613053,10.369779227648095,10.682768960780463,0.018566897519637582,2.0803084179092814,32993.42568058451,20.675688115613053,0.11839814814814797,
qtable,0.1,6.0,5.0,-12549.400855549495,-4.616991193742389,-37207.79701261924,-15.336047254435487,0.0884057957559321,0.07703761042583206,-0.01127789819771663,-1.2272540823820444,-37207.79701261924,-15.336047254435487,0.07577777777777787,75.77777777777803
qtable,0.25,6.0,5.0,-1534.3527429780224,-0.5456640130847226,18433.43663451099,7.304472653867784,-0.5776125938941306,-0.45734160960552755,-0.003316338490628068,-0.3584028328803385,18433.43663451099,7.304472653867784,0.1181458333333334,47.258333333333354
qtable,0.4,8.0,6.0,-15146.258176090778,-5.274860187729517,-37364.22587794208,-13.005651205148677,0.4611471727478005,0.3629050099230144,0.0071046453227539,0.7751478467862876,-37364.22587794208,-13.005651205148677,0.11010416666666772,27.52604166666698
qtable,0.6,6.0,6.0,-9577.578548656049,-3.9322693501816666,-19088.152339068736,-9.571307395166029,0.9081750157567683,0.7495917946306662,0.0015520804425310786,0.16838348372043557,-19088.152339068736,-9.571307395166029,0.16983333333333228,28.305555555555333
qtable,0.8,5.0,2.0,-52751.680936846446,-19.699089872409548,-16508.209313987172,-7.589601869470744,-15.022454081083623,-11.215398490282094,-0.007791824761087751,-0.8384414846099099,-16508.209313987172,-7.589601869470744,0.11120000000000174,13.900000000000245
static,0.0,5.0,6.0,-4782.871053113384,-5.233544525848519,14411.4689779756,25.538141347978577,1.307060701942973,1.8731997380823568,0.002537468952847566,0.2911381045328444,14411.4689779756,25.538141347978577,,
static,0.1,8.0,5.0,1629.4524528499896,1.880088900553112,-5347.078589385725,-8.14812684380662,0.3600324838305795,0.5019134064795009,-4.6492644957929485e-05,-0.005316014641356001,-5347.078589385725,-8.14812684380662,,
static,0.25,5.0,6.0,-9938.662276761897,-10.398087633377964,-23616.087243780566,-27.701108621456626,-3.0513860773271233,-4.099238223547561,-0.003519771479853273,-0.40113716461596144,-23616.087243780566,-27.701108621456626,,
static,0.4,3.0,4.0,1850.8400595222774,2.1912497828943436,15058.659457798465,23.67199439061036,3.669612467486587,5.430169778169349,0.006763447803564415,0.7804393835882188,15058.659457798465,23.67199439061036,,
static,0.6,6.0,5.0,1038.893948415236,1.2765037688226162,-6062.864079504681,-9.363144945348399,-1.712609061865976,-2.3996341009364213,-0.0042285583442709385,-0.48362088973179423,-6062.864079504681,-9.363144945348399,,
static,0.8,3.0,7.0,2696.6340631967323,3.6826150812750567,149.22406835677975,0.27280281303997084,0.8491716126507072,1.2427748744725668,0.0032786525965587954,0.3777595573932637,149.22406835677975,0.27280281303997084,,
surge,0.0,6.0,6.0,-606.73760243367,-5.066579306500225,-244.17585425326251,-5.525800641331023,0.014874931199557295,0.09186560988877175,0.0019308940532419272,0.4471794260021321,-244.17585425326251,-5.525800641331023,,
surge,0.1,2.0,5.0,169.78743573408792,1.446343107913299,-1012.7706974660168,-20.02053666691211,-0.14459518037699226,-0.864651254901582,-0.0018650458785858248,-0.4260349899970559,-1012.7706974660168,-20.02053666691211,,
surge,0.25,10.0,7.0,-128.20993816584632,-1.1276930411162496,-81.21373487263281,-1.7081453033360994,0.3008506477195141,1.839047728806548,0.0030750148302954305,0.7102446987902812,-81.21373487263281,-1.7081453033360994,,
surge,0.4,6.0,6.0,-473.03722764431404,-4.297928307550563,28.557452243338048,0.6755106104955642,-0.5027452173053764,-3.072002360121898,-0.005581380442163164,-1.288152985482699,28.557452243338048,0.6755106104955642,,
surge,0.6,2.0,5.0,307.79436325796996,3.0356727142643067,2060.57396030564,63.382050333909866,0.2339650444065704,1.438519400758399,0.001302270025389629,0.30077697380833807,2060.57396030564,63.382050333909866,,
surge,0.8,3.0,3.0,423.15386247993047,4.372210191290083,1117.0942083304312,34.86182570616373,0.8971464536957541,5.327339899805159,0.007068630716831503,1.6094191039618562,1117.0942083304312,34.86182570616373,,
1 tier alpha runs_robust runs_no_robust eval_revenue_mean_delta eval_revenue_mean_delta_pct eval_reward_mean_delta eval_reward_mean_delta_pct eval_coi_level_mean_delta eval_coi_level_mean_delta_pct eval_margin_mean_delta eval_margin_mean_delta_pct objective_score_delta objective_score_delta_pct train_alpha_adv_delta train_alpha_adv_delta_pct
2 dqn 0.0 5.0 2.0 -31308.987414117495 -8.73651226889534 -1909.7427407095092 -0.5742991901121623 -2.8982436567700063 -2.1108702433020436 -0.001972064237093285 -0.2116777198290971 -1909.7427407095092 -0.5742991901121623
3 dqn 0.1 8.0 4.0 -7723.542755668925 -2.2789188721535494 -74239.37371836061 -21.063854618469847 1.7435833801418141 1.2859365583872486 0.0011891962142838164 0.1278074871971924 -74239.37371836061 -21.063854618469847 0.17619791666666657 176.19791666666694
4 dqn 0.25 7.0 3.0 -12344.82818986749 -3.7035466052614323 93154.03627578515 36.06691230407512 0.03214544949867104 0.023426184113378143 1.763733457238459e-05 0.001893256490383175 93154.03627578515 36.06691230407512 0.14530952380952394 58.12380952380958
5 dqn 0.4 5.0 10.0 -7816.300706216833 -2.4694340725162824 -42362.74668471434 -13.411888482380219 0.6251272343707797 0.4579446603861758 0.0002750615520492605 0.02953644634355915 -42362.74668471434 -13.411888482380219 0.09856666666666747 24.64166666666691
6 dqn 0.6 5.0 4.0 -16150.011887742497 -5.347485987139731 -28508.74710866122 -10.151356300001888 -0.63306323164079 -0.46056970247177387 -0.00034537433455417155 -0.0370668515552649 -28508.74710866122 -10.151356300001888 0.1361999999999981 22.699999999999644
7 dqn 0.8 7.0 6.0 -18191.8826663699 -6.440527544692988 -55296.94441124235 -20.19273590083627 -0.796733634735034 -0.579832425016392 -0.0006423984775592029 -0.0689476165584585 -55296.94441124235 -20.19273590083627 0.1532857142857158 19.160714285714512
8 linear 0.0 9.0 8.0 -14967.67388588126 -4.273413942959129 -20107.23171681742 -6.60039931288617 -0.06127790826209889 -0.04564810574240612 -7.607744079518586e-05 -0.008177885913528719 -20107.23171681742 -6.60039931288617
9 linear 0.1 3.0 5.0 -24531.399901538738 -7.171831328305365 -96669.7835552101 -26.44920711447249 -0.3680976907859872 -0.2733723058172187 -0.0002515287835096469 -0.02702956778346356 -96669.7835552101 -26.44920711447249
10 linear 0.25 6.0 9.0 -14840.859479571285 -4.520682292638562 -26510.179456423968 -8.033117756667396 -0.13734776448131925 -0.10212641096230607 -9.41162442338328e-05 -0.010115001392981545 -26510.179456423968 -8.033117756667396
11 linear 0.4 4.0 11.0 -17196.7642560167 -5.486915251242723 -74520.10209817477 -25.042311510043184 0.12217076984330788 0.09098828726103136 0.00010713887099822461 0.011516865671259795 -74520.10209817477 -25.042311510043184
12 linear 0.6 5.0 3.0 -14284.06615788641 -4.854766876637072 38417.71856593515 14.088596762512362 0.24251461234271687 0.1806530855220358 0.0002606811969937395 0.028024824619509187 38417.71856593515 14.088596762512362
13 linear 0.8 4.0 11.0 -10840.488575784548 -3.933600919557566 15749.581078662042 6.447651726824251 0.028051260535562506 0.020876236575910773 5.361882659971062e-05 0.005763158099097226 15749.581078662042 6.447651726824251
14 qtable 0.0 9.0 8.0 -18644.457288398524 -8.15323701554329 32993.42568058451 20.675688115613053 10.369779227648095 10.682768960780463 0.018566897519637582 2.0803084179092814 32993.42568058451 20.675688115613053 0.11839814814814797
15 qtable 0.1 6.0 5.0 -12549.400855549495 -4.616991193742389 -37207.79701261924 -15.336047254435487 0.0884057957559321 0.07703761042583206 -0.01127789819771663 -1.2272540823820444 -37207.79701261924 -15.336047254435487 0.07577777777777787 75.77777777777803
16 qtable 0.25 6.0 5.0 -1534.3527429780224 -0.5456640130847226 18433.43663451099 7.304472653867784 -0.5776125938941306 -0.45734160960552755 -0.003316338490628068 -0.3584028328803385 18433.43663451099 7.304472653867784 0.1181458333333334 47.258333333333354
17 qtable 0.4 8.0 6.0 -15146.258176090778 -5.274860187729517 -37364.22587794208 -13.005651205148677 0.4611471727478005 0.3629050099230144 0.0071046453227539 0.7751478467862876 -37364.22587794208 -13.005651205148677 0.11010416666666772 27.52604166666698
18 qtable 0.6 6.0 6.0 -9577.578548656049 -3.9322693501816666 -19088.152339068736 -9.571307395166029 0.9081750157567683 0.7495917946306662 0.0015520804425310786 0.16838348372043557 -19088.152339068736 -9.571307395166029 0.16983333333333228 28.305555555555333
19 qtable 0.8 5.0 2.0 -52751.680936846446 -19.699089872409548 -16508.209313987172 -7.589601869470744 -15.022454081083623 -11.215398490282094 -0.007791824761087751 -0.8384414846099099 -16508.209313987172 -7.589601869470744 0.11120000000000174 13.900000000000245
20 static 0.0 5.0 6.0 -4782.871053113384 -5.233544525848519 14411.4689779756 25.538141347978577 1.307060701942973 1.8731997380823568 0.002537468952847566 0.2911381045328444 14411.4689779756 25.538141347978577
21 static 0.1 8.0 5.0 1629.4524528499896 1.880088900553112 -5347.078589385725 -8.14812684380662 0.3600324838305795 0.5019134064795009 -4.6492644957929485e-05 -0.005316014641356001 -5347.078589385725 -8.14812684380662
22 static 0.25 5.0 6.0 -9938.662276761897 -10.398087633377964 -23616.087243780566 -27.701108621456626 -3.0513860773271233 -4.099238223547561 -0.003519771479853273 -0.40113716461596144 -23616.087243780566 -27.701108621456626
23 static 0.4 3.0 4.0 1850.8400595222774 2.1912497828943436 15058.659457798465 23.67199439061036 3.669612467486587 5.430169778169349 0.006763447803564415 0.7804393835882188 15058.659457798465 23.67199439061036
24 static 0.6 6.0 5.0 1038.893948415236 1.2765037688226162 -6062.864079504681 -9.363144945348399 -1.712609061865976 -2.3996341009364213 -0.0042285583442709385 -0.48362088973179423 -6062.864079504681 -9.363144945348399
25 static 0.8 3.0 7.0 2696.6340631967323 3.6826150812750567 149.22406835677975 0.27280281303997084 0.8491716126507072 1.2427748744725668 0.0032786525965587954 0.3777595573932637 149.22406835677975 0.27280281303997084
26 surge 0.0 6.0 6.0 -606.73760243367 -5.066579306500225 -244.17585425326251 -5.525800641331023 0.014874931199557295 0.09186560988877175 0.0019308940532419272 0.4471794260021321 -244.17585425326251 -5.525800641331023
27 surge 0.1 2.0 5.0 169.78743573408792 1.446343107913299 -1012.7706974660168 -20.02053666691211 -0.14459518037699226 -0.864651254901582 -0.0018650458785858248 -0.4260349899970559 -1012.7706974660168 -20.02053666691211
28 surge 0.25 10.0 7.0 -128.20993816584632 -1.1276930411162496 -81.21373487263281 -1.7081453033360994 0.3008506477195141 1.839047728806548 0.0030750148302954305 0.7102446987902812 -81.21373487263281 -1.7081453033360994
29 surge 0.4 6.0 6.0 -473.03722764431404 -4.297928307550563 28.557452243338048 0.6755106104955642 -0.5027452173053764 -3.072002360121898 -0.005581380442163164 -1.288152985482699 28.557452243338048 0.6755106104955642
30 surge 0.6 2.0 5.0 307.79436325796996 3.0356727142643067 2060.57396030564 63.382050333909866 0.2339650444065704 1.438519400758399 0.001302270025389629 0.30077697380833807 2060.57396030564 63.382050333909866
31 surge 0.8 3.0 3.0 423.15386247993047 4.372210191290083 1117.0942083304312 34.86182570616373 0.8971464536957541 5.327339899805159 0.007068630716831503 1.6094191039618562 1117.0942083304312 34.86182570616373

View File

@@ -0,0 +1,61 @@
tier,alpha,mode,runs,eval_revenue_mean_mean,eval_revenue_mean_std,eval_reward_mean_mean,eval_reward_mean_std,eval_coi_level_mean_mean,eval_coi_level_mean_std,eval_margin_mean_mean,eval_margin_mean_std,objective_score_mean,objective_score_std,train_alpha_adv_mean,train_alpha_adv_std
dqn,0.0,no_robust,2,358369.40933039243,3531.782519351935,332534.46523867303,114183.5587841961,137.30089123035202,0.8184776440325546,0.9316352418598786,0.0006839003676302996,332534.46523867303,114183.5587841961,,
dqn,0.0,robust,5,327060.42191627494,24311.17412598574,330624.7224979635,62834.39223547943,134.40264757358202,6.160000643680792,0.9296631776227853,0.004262039730140749,330624.7224979635,62834.39223547943,0.17835000000000004,0.08829347371125472
dqn,0.1,no_robust,4,338912.58043645386,19584.736810155388,352449.13650924934,34076.74819101191,135.58860029055563,3.4055508991301524,0.9304589585186211,0.0023438665484978773,352449.13650924934,34076.74819101191,0.0999999999999998,0.0
dqn,0.1,robust,8,331189.03768078494,8060.912085646968,278209.7627908887,57861.69545853692,137.33218367069745,0.43113256118808096,0.931648154732905,0.000296560958972609,278209.7627908887,57861.69545853692,0.2761979166666664,0.09826648189130198
dqn,0.25,no_robust,3,333324.4996115304,6101.717861804452,258281.15112936878,46772.05216097596,137.2201692904545,0.9866477887862672,0.9315871706751672,0.0006356053229300815,258281.15112936878,46772.05216097596,0.25,0.0
dqn,0.25,robust,7,320979.6714216629,7345.8761269427705,351435.18740515393,40320.63699261721,137.25231473995316,0.3527287960309152,0.9316048080097395,0.0002575240668471541,351435.18740515393,40320.63699261721,0.39530952380952394,0.073021206240698
dqn,0.4,no_robust,10,316521.94295076875,3631.1820920182718,315859.66987697606,59129.03566963754,136.50715652926755,0.5085743959240285,0.931261495881483,0.00031280530251053175,315859.66987697606,59129.03566963754,0.3999999999999993,0.0
dqn,0.4,robust,5,308705.6422445519,10654.571556448245,273496.9231922617,68868.59270778317,137.13228376363833,0.9543108715306617,0.9315365574335323,0.0006302636717132419,273496.9231922617,68868.59270778317,0.49856666666666677,0.05745573175159429
dqn,0.6,no_robust,4,302011.2988903938,2354.1141598720183,280836.828756133,58683.00124997926,137.4522093492651,0.4692723362517602,0.9317606434396914,0.0003317518021682495,280836.828756133,58683.00124997926,0.600000000000001,0.0
dqn,0.6,robust,5,285861.2870026513,10386.571631344234,252328.08164747176,59388.56063758225,136.8191461176243,1.0629203361893034,0.9314152691051373,0.0005692783702932289,252328.08164747176,59388.56063758225,0.7361999999999991,0.07108625433623189
dqn,0.8,no_robust,6,282459.51189759385,2625.018247527438,273845.72691287595,66378.16690732416,137.4075681801531,0.29728950101826707,0.9317196295169007,0.00022799290978965786,273845.72691287595,66378.16690732416,0.7999999999999985,0.0
dqn,0.8,robust,7,264267.62923122395,6771.288971321149,218548.7825016336,50043.2009443344,136.61083454541807,1.2319662937254596,0.9310772310393415,0.0010118564779437284,218548.7825016336,50043.2009443344,0.9532857142857143,0.04709817507333055
linear,0.0,no_robust,8,350250.9723061577,3156.286820918861,304636.59490360576,71682.88027353655,134.2397614654424,0.32611787466946035,0.9302824910938235,0.00024020749661685483,304636.59490360576,71682.88027353655,,
linear,0.0,robust,9,335283.29842027643,7707.594869976611,284529.36318678834,55524.58819004573,134.1784835571803,0.4477314164684001,0.9302064136530284,0.00034781034181738526,284529.36318678834,55524.58819004573,,
linear,0.1,no_robust,5,342052.1032713031,2576.546352056584,365492.17954557994,44890.93522299766,134.65068807375954,0.2181027640393531,0.930569018064469,0.00014058935916940913,365492.17954557994,44890.93522299766,,
linear,0.1,robust,3,317520.7033697644,4796.580459456527,268822.39599036984,39256.421140635124,134.28259038297355,0.24570499109363475,0.9303174892809594,0.00018817899183709092,268822.39599036984,39256.421140635124,,
linear,0.25,no_robust,9,328288.0441241802,2178.525494145428,330011.0898339667,38591.36053388808,134.48799697074742,0.2199303973026469,0.9304619997297959,0.00015341642413402035,330011.0898339667,38591.36053388808,,
linear,0.25,robust,6,313447.18464460893,11811.426711620714,303500.9103775427,63358.917144214036,134.3506492062661,0.2947034403278951,0.9303678834855621,0.00021446628431268986,303500.9103775427,63358.917144214036,,
linear,0.4,no_robust,11,313414.0672597746,1982.9537556159262,297576.7714904776,69396.90446617964,134.2708754290745,0.3062093691351849,0.9302780292522507,0.00023067974755288992,297576.7714904776,69396.90446617964,,
linear,0.4,robust,4,296217.3030037579,5109.898340355844,223056.66939230284,38293.73688466607,134.3930461989178,0.12347753686382154,0.9303851681232489,7.324605809708878e-05,223056.66939230284,38293.73688466607,,
linear,0.6,no_robust,3,294227.64307441004,2081.9176570448135,272686.62176604365,66672.50905805513,134.24327165069943,0.30764332256042104,0.9301795837547151,0.00020453921786790446,272686.62176604365,66672.50905805513,,
linear,0.6,robust,5,279943.5769165236,9866.031719660255,311104.3403319788,28363.930707781863,134.48578626304214,0.21280262186464388,0.9304402649517088,0.00020533894868120649,311104.3403319788,28363.930707781863,,
linear,0.8,no_robust,11,275586.89347174135,1618.038877505867,244268.4832547461,56201.44465269986,134.36933631960773,0.2845660213184439,0.9303723007028001,0.00017640716421186918,244268.4832547461,56201.44465269986,,
linear,0.8,robust,4,264746.4048959568,7976.6279174956235,260018.06433340814,57942.49882730146,134.3973875801433,0.31511916357643405,0.9304259195293998,0.00023606570471334208,260018.06433340814,57942.49882730146,,
qtable,0.0,no_robust,8,228675.52179404112,103199.70453252994,159575.94976328663,95848.81008103945,97.07014413321637,33.0637115678536,0.8925069648229078,0.04890522141482132,159575.94976328663,95848.81008103945,0.0,0.0
qtable,0.0,robust,9,210031.0645056426,84361.3834579348,192569.37544387113,116824.7880426837,107.43992336086447,21.41128645838254,0.9110738623425454,0.019188350719133364,192569.37544387113,116824.7880426837,0.11839814814814797,0.061909456985161225
qtable,0.1,no_robust,5,271809.0706466638,14898.209045050968,242616.60384397948,49181.45526408063,114.75666919996793,3.461383158930426,0.9189538140159812,0.002294693249439748,242616.60384397948,49181.45526408063,0.0999999999999998,0.0
qtable,0.1,robust,6,259259.66979111428,102995.29934229614,205408.80683136024,94155.1845420674,114.84507499572386,36.206421837506966,0.9076759158182646,0.048591979839360346,205408.80683136024,94155.1845420674,0.17577777777777767,0.06720562696899951
qtable,0.25,no_robust,5,281190.01916657295,70274.10208723843,252358.2126733039,129868.46825082717,126.29784427276161,15.368804047323954,0.9253103453385114,0.009044883517550522,252358.2126733039,129868.46825082717,0.25,0.0
qtable,0.25,robust,6,279655.6664235949,93056.2549557545,270791.6493078149,116021.46257259768,125.72023167886748,26.760714047253796,0.9219940068478834,0.022785695882060884,270791.6493078149,116021.46257259768,0.3681458333333334,0.08845114686619042
qtable,0.4,no_robust,6,287140.4669895195,32698.16434426399,287292.23388022534,83855.95000252876,127.07104066863859,9.200301166154173,0.9165535777734913,0.01306001923887748,287292.23388022534,83855.95000252876,0.3999999999999993,0.0
qtable,0.4,robust,8,271994.2088134287,79259.3185780895,249928.00800228326,88265.30801790548,127.53218784138639,23.406428094683015,0.9236582230962452,0.020073747007871224,249928.00800228326,88265.30801790548,0.510104166666667,0.09294655989347765
qtable,0.6,no_robust,6,243563.64469828535,67006.60707045678,199430.98211127534,79119.52886604435,121.15594411011905,17.91243944823949,0.9217533740470492,0.011558797825966702,199430.98211127534,79119.52886604435,0.600000000000001,0.0
qtable,0.6,robust,6,233986.0661496293,43155.478617087436,180342.8297722066,48117.79957836251,122.06411912587582,12.160951090203252,0.9233054544895802,0.006840854872863436,180342.8297722066,48117.79957836251,0.7698333333333333,0.09107066853090896
qtable,0.8,no_robust,2,267787.4017455507,1552.038101264713,217510.87340156303,45358.788584678456,133.9448981157492,0.47346860040111405,0.9293224278749692,0.0002998116010539045,217510.87340156303,45358.788584678456,0.7999999999999985,0.0
qtable,0.8,robust,5,215035.72080870424,32869.73253165852,201002.66408757586,63247.67956376057,118.92244403466557,8.586916805142152,0.9215306031138815,0.004644709320891907,201002.66408757586,63247.67956376057,0.9112000000000002,0.07381653307732307
static,0.0,no_robust,6,91388.75248869567,13415.65534300268,56431.15832748852,8525.098185703384,69.77689967440658,3.670744870085874,0.8715688236409825,0.005831496806767582,56431.15832748852,8525.098185703384,,
static,0.0,robust,5,86605.88143558228,7614.909395960895,70842.62730546412,8033.737230392738,71.08396037634955,3.6802889678420283,0.8741062925938301,0.005083911544334936,70842.62730546412,8033.737230392738,,
static,0.1,no_robust,5,86668.90445290186,8037.955688932984,65623.40881389238,19329.448262530004,71.73199185012882,4.199046495412734,0.874577067494122,0.006610505646022198,65623.40881389238,19329.448262530004,,
static,0.1,robust,8,88298.35690575185,9576.838833058617,60276.33022450666,13359.490452744656,72.0920243339594,6.7706096714767865,0.8745305748491641,0.010083585815241344,60276.33022450666,13359.490452744656,,
static,0.25,no_robust,6,95581.63603909909,8345.698435455577,85253.22060752509,13111.526873622026,74.43788116042678,2.1078820386097368,0.8774483618896327,0.0037254791853004897,85253.22060752509,13111.526873622026,,
static,0.25,robust,5,85642.97376233719,9472.880627242153,61637.13336374452,15937.429780623212,71.38649508309966,4.0264905454627264,0.8739285904097794,0.005323853359397925,61637.13336374452,15937.429780623212,,
static,0.4,no_robust,4,84465.04245981346,12101.831388745604,63613.81812329075,7778.361846092061,67.5782271530322,3.9088888968092,0.8666205147756862,0.007149121199217965,63613.81812329075,7778.361846092061,,
static,0.4,robust,3,86315.88251933573,8642.748496122398,78672.47758108922,17823.74997200773,71.24783962051879,2.790416943786253,0.8733839625792507,0.005990544453538607,78672.47758108922,17823.74997200773,,
static,0.6,no_robust,5,81385.88962988024,12343.523894997037,64752.43216774836,23486.779472906223,71.36959177224794,5.100226704959064,0.874353948320141,0.007787250295491337,64752.43216774836,23486.779472906223,,
static,0.6,robust,6,82424.78357829548,9831.886701625144,58689.56808824368,12672.506035553573,69.65698271038197,3.484982360048201,0.8701253899758701,0.005917711231889304,58689.56808824368,12672.506035553573,,
static,0.8,no_robust,7,73226.06364450825,4447.877985963851,54700.340767716196,14406.881298569717,68.32867561883204,3.68262917356943,0.8679204886788817,0.007467501164611224,54700.340767716196,14406.881298569717,,
static,0.8,robust,3,75922.69770770498,5046.089536162847,54849.564836072976,22780.98012221352,69.17784723148274,1.5268167784698885,0.8711991412754405,0.0033278715575433297,54849.564836072976,22780.98012221352,,
surge,0.0,no_robust,6,11975.290738176132,411.4052900076416,4418.832131346071,896.5828048394391,16.192056219479124,0.8040364003224534,0.4317940274006973,0.008271862690929055,4418.832131346071,896.5828048394391,,
surge,0.0,robust,6,11368.553135742462,623.8217438159004,4174.6562770928085,639.9963040241264,16.20693115067868,0.9853827520149101,0.4337249214539392,0.010371668289035135,4174.6562770928085,639.9963040241264,,
surge,0.1,no_robust,5,11739.084232858655,332.778792718381,5058.659087494994,1110.8409258976824,16.722948073839394,0.6578121995950104,0.4377682402562083,0.005683401047550787,5058.659087494994,1110.8409258976824,,
surge,0.1,robust,2,11908.871668592743,81.41250285550258,4045.8883900289775,784.7169500268457,16.5783528934624,0.4088194924856508,0.4359031943776225,0.004531137621699143,4045.8883900289775,784.7169500268457,,
surge,0.25,no_robust,7,11369.223138855004,236.1121240061105,4754.4980344481255,1038.0550037539617,16.359045119223275,0.3945156775653057,0.4329514652531622,0.0038762110261952457,4754.4980344481255,1038.0550037539617,,
surge,0.25,robust,10,11241.013200689158,684.503587066406,4673.284299575493,1187.78635131025,16.65989576694279,1.0515950311117155,0.4360264800834576,0.009701952962125513,4673.284299575493,1187.78635131025,,
surge,0.4,no_robust,6,11006.168409400554,364.6584583108646,4227.535704048808,1414.7964077877168,16.365391636138824,0.9138430058543858,0.4332855262584901,0.008024003783434592,4227.535704048808,1414.7964077877168,,
surge,0.4,robust,6,10533.13118175624,526.0758051960169,4256.093156292146,783.7965507386594,15.862646418833448,0.7732699435426456,0.42770414581632693,0.008967505611725135,4256.093156292146,783.7965507386594,,
surge,0.6,no_robust,5,10139.2472848498,97.448078425168,3251.037082975553,742.2100315641153,16.26429537781848,0.4432465691073604,0.4329686574409998,0.004121820888165019,3251.037082975553,742.2100315641153,,
surge,0.6,robust,2,10447.04164810777,524.0029334247373,5311.611043281193,1808.6200710093085,16.49826042222505,0.6088756908260344,0.43427092746638946,0.007817511630542989,5311.611043281193,1808.6200710093085,,
surge,0.8,no_robust,3,9678.259826640971,272.83530913170915,3204.3479815026553,556.8799617962688,16.840420745981802,0.4589959822922529,0.43920385308157944,0.004953937449529005,3204.3479815026553,556.8799617962688,,
surge,0.8,robust,3,10101.413689120902,526.8318040489241,4321.442189833087,1284.166148011517,17.737567199677557,0.6586775330563983,0.44627248379841095,0.004644261847052545,4321.442189833087,1284.166148011517,,
1 tier alpha mode runs eval_revenue_mean_mean eval_revenue_mean_std eval_reward_mean_mean eval_reward_mean_std eval_coi_level_mean_mean eval_coi_level_mean_std eval_margin_mean_mean eval_margin_mean_std objective_score_mean objective_score_std train_alpha_adv_mean train_alpha_adv_std
2 dqn 0.0 no_robust 2 358369.40933039243 3531.782519351935 332534.46523867303 114183.5587841961 137.30089123035202 0.8184776440325546 0.9316352418598786 0.0006839003676302996 332534.46523867303 114183.5587841961
3 dqn 0.0 robust 5 327060.42191627494 24311.17412598574 330624.7224979635 62834.39223547943 134.40264757358202 6.160000643680792 0.9296631776227853 0.004262039730140749 330624.7224979635 62834.39223547943 0.17835000000000004 0.08829347371125472
4 dqn 0.1 no_robust 4 338912.58043645386 19584.736810155388 352449.13650924934 34076.74819101191 135.58860029055563 3.4055508991301524 0.9304589585186211 0.0023438665484978773 352449.13650924934 34076.74819101191 0.0999999999999998 0.0
5 dqn 0.1 robust 8 331189.03768078494 8060.912085646968 278209.7627908887 57861.69545853692 137.33218367069745 0.43113256118808096 0.931648154732905 0.000296560958972609 278209.7627908887 57861.69545853692 0.2761979166666664 0.09826648189130198
6 dqn 0.25 no_robust 3 333324.4996115304 6101.717861804452 258281.15112936878 46772.05216097596 137.2201692904545 0.9866477887862672 0.9315871706751672 0.0006356053229300815 258281.15112936878 46772.05216097596 0.25 0.0
7 dqn 0.25 robust 7 320979.6714216629 7345.8761269427705 351435.18740515393 40320.63699261721 137.25231473995316 0.3527287960309152 0.9316048080097395 0.0002575240668471541 351435.18740515393 40320.63699261721 0.39530952380952394 0.073021206240698
8 dqn 0.4 no_robust 10 316521.94295076875 3631.1820920182718 315859.66987697606 59129.03566963754 136.50715652926755 0.5085743959240285 0.931261495881483 0.00031280530251053175 315859.66987697606 59129.03566963754 0.3999999999999993 0.0
9 dqn 0.4 robust 5 308705.6422445519 10654.571556448245 273496.9231922617 68868.59270778317 137.13228376363833 0.9543108715306617 0.9315365574335323 0.0006302636717132419 273496.9231922617 68868.59270778317 0.49856666666666677 0.05745573175159429
10 dqn 0.6 no_robust 4 302011.2988903938 2354.1141598720183 280836.828756133 58683.00124997926 137.4522093492651 0.4692723362517602 0.9317606434396914 0.0003317518021682495 280836.828756133 58683.00124997926 0.600000000000001 0.0
11 dqn 0.6 robust 5 285861.2870026513 10386.571631344234 252328.08164747176 59388.56063758225 136.8191461176243 1.0629203361893034 0.9314152691051373 0.0005692783702932289 252328.08164747176 59388.56063758225 0.7361999999999991 0.07108625433623189
12 dqn 0.8 no_robust 6 282459.51189759385 2625.018247527438 273845.72691287595 66378.16690732416 137.4075681801531 0.29728950101826707 0.9317196295169007 0.00022799290978965786 273845.72691287595 66378.16690732416 0.7999999999999985 0.0
13 dqn 0.8 robust 7 264267.62923122395 6771.288971321149 218548.7825016336 50043.2009443344 136.61083454541807 1.2319662937254596 0.9310772310393415 0.0010118564779437284 218548.7825016336 50043.2009443344 0.9532857142857143 0.04709817507333055
14 linear 0.0 no_robust 8 350250.9723061577 3156.286820918861 304636.59490360576 71682.88027353655 134.2397614654424 0.32611787466946035 0.9302824910938235 0.00024020749661685483 304636.59490360576 71682.88027353655
15 linear 0.0 robust 9 335283.29842027643 7707.594869976611 284529.36318678834 55524.58819004573 134.1784835571803 0.4477314164684001 0.9302064136530284 0.00034781034181738526 284529.36318678834 55524.58819004573
16 linear 0.1 no_robust 5 342052.1032713031 2576.546352056584 365492.17954557994 44890.93522299766 134.65068807375954 0.2181027640393531 0.930569018064469 0.00014058935916940913 365492.17954557994 44890.93522299766
17 linear 0.1 robust 3 317520.7033697644 4796.580459456527 268822.39599036984 39256.421140635124 134.28259038297355 0.24570499109363475 0.9303174892809594 0.00018817899183709092 268822.39599036984 39256.421140635124
18 linear 0.25 no_robust 9 328288.0441241802 2178.525494145428 330011.0898339667 38591.36053388808 134.48799697074742 0.2199303973026469 0.9304619997297959 0.00015341642413402035 330011.0898339667 38591.36053388808
19 linear 0.25 robust 6 313447.18464460893 11811.426711620714 303500.9103775427 63358.917144214036 134.3506492062661 0.2947034403278951 0.9303678834855621 0.00021446628431268986 303500.9103775427 63358.917144214036
20 linear 0.4 no_robust 11 313414.0672597746 1982.9537556159262 297576.7714904776 69396.90446617964 134.2708754290745 0.3062093691351849 0.9302780292522507 0.00023067974755288992 297576.7714904776 69396.90446617964
21 linear 0.4 robust 4 296217.3030037579 5109.898340355844 223056.66939230284 38293.73688466607 134.3930461989178 0.12347753686382154 0.9303851681232489 7.324605809708878e-05 223056.66939230284 38293.73688466607
22 linear 0.6 no_robust 3 294227.64307441004 2081.9176570448135 272686.62176604365 66672.50905805513 134.24327165069943 0.30764332256042104 0.9301795837547151 0.00020453921786790446 272686.62176604365 66672.50905805513
23 linear 0.6 robust 5 279943.5769165236 9866.031719660255 311104.3403319788 28363.930707781863 134.48578626304214 0.21280262186464388 0.9304402649517088 0.00020533894868120649 311104.3403319788 28363.930707781863
24 linear 0.8 no_robust 11 275586.89347174135 1618.038877505867 244268.4832547461 56201.44465269986 134.36933631960773 0.2845660213184439 0.9303723007028001 0.00017640716421186918 244268.4832547461 56201.44465269986
25 linear 0.8 robust 4 264746.4048959568 7976.6279174956235 260018.06433340814 57942.49882730146 134.3973875801433 0.31511916357643405 0.9304259195293998 0.00023606570471334208 260018.06433340814 57942.49882730146
26 qtable 0.0 no_robust 8 228675.52179404112 103199.70453252994 159575.94976328663 95848.81008103945 97.07014413321637 33.0637115678536 0.8925069648229078 0.04890522141482132 159575.94976328663 95848.81008103945 0.0 0.0
27 qtable 0.0 robust 9 210031.0645056426 84361.3834579348 192569.37544387113 116824.7880426837 107.43992336086447 21.41128645838254 0.9110738623425454 0.019188350719133364 192569.37544387113 116824.7880426837 0.11839814814814797 0.061909456985161225
28 qtable 0.1 no_robust 5 271809.0706466638 14898.209045050968 242616.60384397948 49181.45526408063 114.75666919996793 3.461383158930426 0.9189538140159812 0.002294693249439748 242616.60384397948 49181.45526408063 0.0999999999999998 0.0
29 qtable 0.1 robust 6 259259.66979111428 102995.29934229614 205408.80683136024 94155.1845420674 114.84507499572386 36.206421837506966 0.9076759158182646 0.048591979839360346 205408.80683136024 94155.1845420674 0.17577777777777767 0.06720562696899951
30 qtable 0.25 no_robust 5 281190.01916657295 70274.10208723843 252358.2126733039 129868.46825082717 126.29784427276161 15.368804047323954 0.9253103453385114 0.009044883517550522 252358.2126733039 129868.46825082717 0.25 0.0
31 qtable 0.25 robust 6 279655.6664235949 93056.2549557545 270791.6493078149 116021.46257259768 125.72023167886748 26.760714047253796 0.9219940068478834 0.022785695882060884 270791.6493078149 116021.46257259768 0.3681458333333334 0.08845114686619042
32 qtable 0.4 no_robust 6 287140.4669895195 32698.16434426399 287292.23388022534 83855.95000252876 127.07104066863859 9.200301166154173 0.9165535777734913 0.01306001923887748 287292.23388022534 83855.95000252876 0.3999999999999993 0.0
33 qtable 0.4 robust 8 271994.2088134287 79259.3185780895 249928.00800228326 88265.30801790548 127.53218784138639 23.406428094683015 0.9236582230962452 0.020073747007871224 249928.00800228326 88265.30801790548 0.510104166666667 0.09294655989347765
34 qtable 0.6 no_robust 6 243563.64469828535 67006.60707045678 199430.98211127534 79119.52886604435 121.15594411011905 17.91243944823949 0.9217533740470492 0.011558797825966702 199430.98211127534 79119.52886604435 0.600000000000001 0.0
35 qtable 0.6 robust 6 233986.0661496293 43155.478617087436 180342.8297722066 48117.79957836251 122.06411912587582 12.160951090203252 0.9233054544895802 0.006840854872863436 180342.8297722066 48117.79957836251 0.7698333333333333 0.09107066853090896
36 qtable 0.8 no_robust 2 267787.4017455507 1552.038101264713 217510.87340156303 45358.788584678456 133.9448981157492 0.47346860040111405 0.9293224278749692 0.0002998116010539045 217510.87340156303 45358.788584678456 0.7999999999999985 0.0
37 qtable 0.8 robust 5 215035.72080870424 32869.73253165852 201002.66408757586 63247.67956376057 118.92244403466557 8.586916805142152 0.9215306031138815 0.004644709320891907 201002.66408757586 63247.67956376057 0.9112000000000002 0.07381653307732307
38 static 0.0 no_robust 6 91388.75248869567 13415.65534300268 56431.15832748852 8525.098185703384 69.77689967440658 3.670744870085874 0.8715688236409825 0.005831496806767582 56431.15832748852 8525.098185703384
39 static 0.0 robust 5 86605.88143558228 7614.909395960895 70842.62730546412 8033.737230392738 71.08396037634955 3.6802889678420283 0.8741062925938301 0.005083911544334936 70842.62730546412 8033.737230392738
40 static 0.1 no_robust 5 86668.90445290186 8037.955688932984 65623.40881389238 19329.448262530004 71.73199185012882 4.199046495412734 0.874577067494122 0.006610505646022198 65623.40881389238 19329.448262530004
41 static 0.1 robust 8 88298.35690575185 9576.838833058617 60276.33022450666 13359.490452744656 72.0920243339594 6.7706096714767865 0.8745305748491641 0.010083585815241344 60276.33022450666 13359.490452744656
42 static 0.25 no_robust 6 95581.63603909909 8345.698435455577 85253.22060752509 13111.526873622026 74.43788116042678 2.1078820386097368 0.8774483618896327 0.0037254791853004897 85253.22060752509 13111.526873622026
43 static 0.25 robust 5 85642.97376233719 9472.880627242153 61637.13336374452 15937.429780623212 71.38649508309966 4.0264905454627264 0.8739285904097794 0.005323853359397925 61637.13336374452 15937.429780623212
44 static 0.4 no_robust 4 84465.04245981346 12101.831388745604 63613.81812329075 7778.361846092061 67.5782271530322 3.9088888968092 0.8666205147756862 0.007149121199217965 63613.81812329075 7778.361846092061
45 static 0.4 robust 3 86315.88251933573 8642.748496122398 78672.47758108922 17823.74997200773 71.24783962051879 2.790416943786253 0.8733839625792507 0.005990544453538607 78672.47758108922 17823.74997200773
46 static 0.6 no_robust 5 81385.88962988024 12343.523894997037 64752.43216774836 23486.779472906223 71.36959177224794 5.100226704959064 0.874353948320141 0.007787250295491337 64752.43216774836 23486.779472906223
47 static 0.6 robust 6 82424.78357829548 9831.886701625144 58689.56808824368 12672.506035553573 69.65698271038197 3.484982360048201 0.8701253899758701 0.005917711231889304 58689.56808824368 12672.506035553573
48 static 0.8 no_robust 7 73226.06364450825 4447.877985963851 54700.340767716196 14406.881298569717 68.32867561883204 3.68262917356943 0.8679204886788817 0.007467501164611224 54700.340767716196 14406.881298569717
49 static 0.8 robust 3 75922.69770770498 5046.089536162847 54849.564836072976 22780.98012221352 69.17784723148274 1.5268167784698885 0.8711991412754405 0.0033278715575433297 54849.564836072976 22780.98012221352
50 surge 0.0 no_robust 6 11975.290738176132 411.4052900076416 4418.832131346071 896.5828048394391 16.192056219479124 0.8040364003224534 0.4317940274006973 0.008271862690929055 4418.832131346071 896.5828048394391
51 surge 0.0 robust 6 11368.553135742462 623.8217438159004 4174.6562770928085 639.9963040241264 16.20693115067868 0.9853827520149101 0.4337249214539392 0.010371668289035135 4174.6562770928085 639.9963040241264
52 surge 0.1 no_robust 5 11739.084232858655 332.778792718381 5058.659087494994 1110.8409258976824 16.722948073839394 0.6578121995950104 0.4377682402562083 0.005683401047550787 5058.659087494994 1110.8409258976824
53 surge 0.1 robust 2 11908.871668592743 81.41250285550258 4045.8883900289775 784.7169500268457 16.5783528934624 0.4088194924856508 0.4359031943776225 0.004531137621699143 4045.8883900289775 784.7169500268457
54 surge 0.25 no_robust 7 11369.223138855004 236.1121240061105 4754.4980344481255 1038.0550037539617 16.359045119223275 0.3945156775653057 0.4329514652531622 0.0038762110261952457 4754.4980344481255 1038.0550037539617
55 surge 0.25 robust 10 11241.013200689158 684.503587066406 4673.284299575493 1187.78635131025 16.65989576694279 1.0515950311117155 0.4360264800834576 0.009701952962125513 4673.284299575493 1187.78635131025
56 surge 0.4 no_robust 6 11006.168409400554 364.6584583108646 4227.535704048808 1414.7964077877168 16.365391636138824 0.9138430058543858 0.4332855262584901 0.008024003783434592 4227.535704048808 1414.7964077877168
57 surge 0.4 robust 6 10533.13118175624 526.0758051960169 4256.093156292146 783.7965507386594 15.862646418833448 0.7732699435426456 0.42770414581632693 0.008967505611725135 4256.093156292146 783.7965507386594
58 surge 0.6 no_robust 5 10139.2472848498 97.448078425168 3251.037082975553 742.2100315641153 16.26429537781848 0.4432465691073604 0.4329686574409998 0.004121820888165019 3251.037082975553 742.2100315641153
59 surge 0.6 robust 2 10447.04164810777 524.0029334247373 5311.611043281193 1808.6200710093085 16.49826042222505 0.6088756908260344 0.43427092746638946 0.007817511630542989 5311.611043281193 1808.6200710093085
60 surge 0.8 no_robust 3 9678.259826640971 272.83530913170915 3204.3479815026553 556.8799617962688 16.840420745981802 0.4589959822922529 0.43920385308157944 0.004953937449529005 3204.3479815026553 556.8799617962688
61 surge 0.8 robust 3 10101.413689120902 526.8318040489241 4321.442189833087 1284.166148011517 17.737567199677557 0.6586775330563983 0.44627248379841095 0.004644261847052545 4321.442189833087 1284.166148011517

View File

@@ -0,0 +1,11 @@
tier,mode,runs,eval_revenue_mean_mean,eval_revenue_mean_std,eval_reward_mean_mean,eval_reward_mean_std,eval_coi_level_mean_mean,eval_coi_level_mean_std,eval_margin_mean_mean,eval_margin_mean_std,objective_score_mean,objective_score_std,train_alpha_adv_mean,train_alpha_adv_std
dqn,no_robust,29,315185.66674813855,23538.781000060844,302576.8036266896,62951.88633145167,136.82560356086017,1.3692652218935986,0.9313739013618878,0.0009314135057224836,302576.8036266896,62951.88633145167,0.45740740740740693,0.2368477698794438
dqn,robust,37,306875.13950902375,27585.74444520695,283724.7169827867,69843.05611741856,136.68837571992978,2.3797541654948753,0.9312171495138941,0.0016512408492580111,283724.7169827867,69843.05611741856,0.5058198198198196,0.28324483129860284
linear,no_robust,47,315501.15296155965,27105.014861872147,298149.1730416604,67664.7308344108,134.36884359609928,0.29743647613433244,0.9303607531364,0.0002152647006739543,298149.1730416604,67664.7308344108,,
linear,robust,31,306269.9232239004,26399.875293394463,279872.824370329,54401.104602086416,134.32737693008372,0.31909212993628877,0.9303375215162144,0.00025000448833182963,279872.824370329,54401.104602086416,,
qtable,no_robust,32,259818.72178238883,67188.58622318009,222088.83510765125,94450.12569617687,116.84641954166946,22.42810298937963,0.9140582213134033,0.02778864370791322,222088.83510765125,94450.12569617687,0.29218749999999993,0.2559326319498438
qtable,robust,40,244470.50673219413,78666.30912808319,216920.53697298188,93983.50987622296,118.94013969887506,23.1428303249914,0.9178608956089163,0.023827311253270544,216920.53697298188,93983.50987622296,0.4396239583333334,0.29521865862482416
static,no_robust,33,85228.452028227,12041.415672002751,64828.579890468536,17681.280330831738,70.58818912317687,4.204964531595236,0.8721419294578765,0.007107262779462876,64828.579890468536,17681.280330831738,,
static,robust,30,84963.18577955024,8926.291379160475,63243.76603076817,14880.924342692271,70.94358095957392,4.363134562111469,0.8730306888410219,0.006660289247744752,63243.76603076817,14880.924342692271,,
surge,no_robust,32,11121.867310184698,809.9895800277001,4260.038064073964,1160.4282377968032,16.416108827015794,0.641203520341943,0.43413855082681374,0.006214799767130059,4260.038064073964,1160.4282377968032,,
surge,robust,29,10994.355365953365,750.5115890942825,4448.160863178768,1000.7519971246122,16.495943148858906,0.9823026347466668,0.4347587896392907,0.009698591291108968,4448.160863178768,1000.7519971246122,,
1 tier mode runs eval_revenue_mean_mean eval_revenue_mean_std eval_reward_mean_mean eval_reward_mean_std eval_coi_level_mean_mean eval_coi_level_mean_std eval_margin_mean_mean eval_margin_mean_std objective_score_mean objective_score_std train_alpha_adv_mean train_alpha_adv_std
2 dqn no_robust 29 315185.66674813855 23538.781000060844 302576.8036266896 62951.88633145167 136.82560356086017 1.3692652218935986 0.9313739013618878 0.0009314135057224836 302576.8036266896 62951.88633145167 0.45740740740740693 0.2368477698794438
3 dqn robust 37 306875.13950902375 27585.74444520695 283724.7169827867 69843.05611741856 136.68837571992978 2.3797541654948753 0.9312171495138941 0.0016512408492580111 283724.7169827867 69843.05611741856 0.5058198198198196 0.28324483129860284
4 linear no_robust 47 315501.15296155965 27105.014861872147 298149.1730416604 67664.7308344108 134.36884359609928 0.29743647613433244 0.9303607531364 0.0002152647006739543 298149.1730416604 67664.7308344108
5 linear robust 31 306269.9232239004 26399.875293394463 279872.824370329 54401.104602086416 134.32737693008372 0.31909212993628877 0.9303375215162144 0.00025000448833182963 279872.824370329 54401.104602086416
6 qtable no_robust 32 259818.72178238883 67188.58622318009 222088.83510765125 94450.12569617687 116.84641954166946 22.42810298937963 0.9140582213134033 0.02778864370791322 222088.83510765125 94450.12569617687 0.29218749999999993 0.2559326319498438
7 qtable robust 40 244470.50673219413 78666.30912808319 216920.53697298188 93983.50987622296 118.94013969887506 23.1428303249914 0.9178608956089163 0.023827311253270544 216920.53697298188 93983.50987622296 0.4396239583333334 0.29521865862482416
8 static no_robust 33 85228.452028227 12041.415672002751 64828.579890468536 17681.280330831738 70.58818912317687 4.204964531595236 0.8721419294578765 0.007107262779462876 64828.579890468536 17681.280330831738
9 static robust 30 84963.18577955024 8926.291379160475 63243.76603076817 14880.924342692271 70.94358095957392 4.363134562111469 0.8730306888410219 0.006660289247744752 63243.76603076817 14880.924342692271
10 surge no_robust 32 11121.867310184698 809.9895800277001 4260.038064073964 1160.4282377968032 16.416108827015794 0.641203520341943 0.43413855082681374 0.006214799767130059 4260.038064073964 1160.4282377968032
11 surge robust 29 10994.355365953365 750.5115890942825 4448.160863178768 1000.7519971246122 16.495943148858906 0.9823026347466668 0.4347587896392907 0.009698591291108968 4448.160863178768 1000.7519971246122

View File

@@ -0,0 +1,26 @@
Name,tier,alpha,mode,objective/score,eval/revenue_mean,eval/reward_mean,eval/coi_level_mean,lambda_coi,robust_radius,learning_rate,batch_size,n_steps,total_timesteps
eager-sweep-244,dqn,0.0,no_robust,413274.4339549909,355872.06196128257,413274.4339549909,136.722140138007,0.2,0.1,0.0003,256,4096,15000
efficient-sweep-319,linear,0.0,no_robust,410094.0151741567,353309.5198146561,410094.0151741567,134.55152038805429,0.4,0.1,0.001,128,4096,15000
swept-sweep-422,linear,0.0,no_robust,403130.32747386186,347611.2815474988,403130.32747386186,133.8559785775022,0.4,0.3,0.0001,512,1024,15000
decent-sweep-478,linear,0.1,no_robust,400452.36418713134,345284.5750647792,400452.36418713134,134.73082941975588,0.1,0.2,0.001,128,1024,50000
eternal-sweep-339,linear,0.1,no_robust,399628.4231731644,344154.38525771734,399628.4231731644,134.89479277649667,0.4,0.1,0.0001,256,1024,50000
ethereal-sweep-21,dqn,0.1,no_robust,398492.807245857,343580.6802427996,398492.807245857,136.67160732585188,0.1,0.2,0.001,512,2048,50000
dark-sweep-418,linear,0.1,no_robust,394615.3720658343,339749.76272695075,394615.3720658343,134.39233246711,0.2,0.1,0.0003,256,1024,50000
wandering-sweep-122,dqn,0.0,robust,394061.3617726404,339512.43434806296,394061.3617726404,137.6864755964331,0.1,0.3,0.0001,256,2048,30000
laced-sweep-132,dqn,0.1,robust,389274.54998495104,335600.5979215904,389274.54998495104,137.36888574027677,0.4,0.2,0.001,256,2048,30000
rich-sweep-53,qtable,0.0,robust,388601.2626147048,335630.6853337664,388601.2626147048,133.4414069888203,0.2,0.1,0.0001,512,1024,50000
faithful-sweep-430,qtable,0.25,no_robust,387035.6970938766,333255.5771210341,387035.6970938766,137.4906091183188,0.1,0.2,0.0003,128,1024,15000
dark-sweep-280,qtable,0.25,no_robust,386318.8845004527,332220.0316564078,386318.8845004527,137.26992450099925,0.4,0.1,0.0001,256,1024,50000
chocolate-sweep-383,linear,0.25,no_robust,383989.49015403807,331071.7003244704,383989.49015403807,134.60590742050857,0.1,0.2,0.001,512,1024,30000
dry-sweep-263,dqn,0.0,robust,383372.6880637367,330436.0312615148,383372.6880637367,137.40558130223476,0.1,0.3,0.001,128,1024,50000
different-sweep-143,qtable,0.0,robust,383278.4198015018,330546.16800945485,383278.4198015018,135.9021538079678,0.1,0.3,0.001,256,2048,30000
woven-sweep-139,dqn,0.25,robust,382788.1296637251,329427.735752473,382788.1296637251,136.8968339394894,0.1,0.1,0.001,512,1024,15000
dark-sweep-215,dqn,0.25,robust,382358.2401374872,329330.0097603144,382358.2401374872,137.64528612332785,0.2,0.1,0.0001,512,4096,30000
charmed-sweep-136,linear,0.25,no_robust,382249.5728044314,329646.2053260979,382249.5728044314,134.46825608007862,0.4,0.1,0.0001,256,2048,15000
light-sweep-308,linear,0.0,robust,381939.1275250679,329628.9436641051,381939.1275250679,133.6209821974879,0.2,0.2,0.001,128,4096,30000
treasured-sweep-325,linear,0.25,robust,381322.0104772589,328353.58675398555,381322.0104772589,134.8950293943581,0.1,0.1,0.0001,512,2048,15000
fine-sweep-202,dqn,0.25,robust,378751.33572275366,326518.9068184018,378751.33572275366,137.2900973301052,0.1,0.2,0.0001,512,2048,30000
treasured-sweep-380,linear,0.25,no_robust,377898.0979419424,325869.1953595453,377898.0979419424,134.54118723889738,0.4,0.3,0.001,128,1024,50000
pretty-sweep-49,qtable,0.25,robust,377318.4766808995,325282.0152823859,377318.4766808995,137.19609012644068,0.4,0.1,0.0001,128,4096,50000
desert-sweep-253,linear,0.25,robust,376808.6335063269,325146.3478714648,376808.6335063269,134.48396340732663,0.2,0.1,0.0003,256,1024,30000
jolly-sweep-133,qtable,0.4,no_robust,376419.57394710975,323709.24588324485,376419.57394710975,137.8349363778071,0.1,0.3,0.0001,128,2048,50000
1 Name tier alpha mode objective/score eval/revenue_mean eval/reward_mean eval/coi_level_mean lambda_coi robust_radius learning_rate batch_size n_steps total_timesteps
2 eager-sweep-244 dqn 0.0 no_robust 413274.4339549909 355872.06196128257 413274.4339549909 136.722140138007 0.2 0.1 0.0003 256 4096 15000
3 efficient-sweep-319 linear 0.0 no_robust 410094.0151741567 353309.5198146561 410094.0151741567 134.55152038805429 0.4 0.1 0.001 128 4096 15000
4 swept-sweep-422 linear 0.0 no_robust 403130.32747386186 347611.2815474988 403130.32747386186 133.8559785775022 0.4 0.3 0.0001 512 1024 15000
5 decent-sweep-478 linear 0.1 no_robust 400452.36418713134 345284.5750647792 400452.36418713134 134.73082941975588 0.1 0.2 0.001 128 1024 50000
6 eternal-sweep-339 linear 0.1 no_robust 399628.4231731644 344154.38525771734 399628.4231731644 134.89479277649667 0.4 0.1 0.0001 256 1024 50000
7 ethereal-sweep-21 dqn 0.1 no_robust 398492.807245857 343580.6802427996 398492.807245857 136.67160732585188 0.1 0.2 0.001 512 2048 50000
8 dark-sweep-418 linear 0.1 no_robust 394615.3720658343 339749.76272695075 394615.3720658343 134.39233246711 0.2 0.1 0.0003 256 1024 50000
9 wandering-sweep-122 dqn 0.0 robust 394061.3617726404 339512.43434806296 394061.3617726404 137.6864755964331 0.1 0.3 0.0001 256 2048 30000
10 laced-sweep-132 dqn 0.1 robust 389274.54998495104 335600.5979215904 389274.54998495104 137.36888574027677 0.4 0.2 0.001 256 2048 30000
11 rich-sweep-53 qtable 0.0 robust 388601.2626147048 335630.6853337664 388601.2626147048 133.4414069888203 0.2 0.1 0.0001 512 1024 50000
12 faithful-sweep-430 qtable 0.25 no_robust 387035.6970938766 333255.5771210341 387035.6970938766 137.4906091183188 0.1 0.2 0.0003 128 1024 15000
13 dark-sweep-280 qtable 0.25 no_robust 386318.8845004527 332220.0316564078 386318.8845004527 137.26992450099925 0.4 0.1 0.0001 256 1024 50000
14 chocolate-sweep-383 linear 0.25 no_robust 383989.49015403807 331071.7003244704 383989.49015403807 134.60590742050857 0.1 0.2 0.001 512 1024 30000
15 dry-sweep-263 dqn 0.0 robust 383372.6880637367 330436.0312615148 383372.6880637367 137.40558130223476 0.1 0.3 0.001 128 1024 50000
16 different-sweep-143 qtable 0.0 robust 383278.4198015018 330546.16800945485 383278.4198015018 135.9021538079678 0.1 0.3 0.001 256 2048 30000
17 woven-sweep-139 dqn 0.25 robust 382788.1296637251 329427.735752473 382788.1296637251 136.8968339394894 0.1 0.1 0.001 512 1024 15000
18 dark-sweep-215 dqn 0.25 robust 382358.2401374872 329330.0097603144 382358.2401374872 137.64528612332785 0.2 0.1 0.0001 512 4096 30000
19 charmed-sweep-136 linear 0.25 no_robust 382249.5728044314 329646.2053260979 382249.5728044314 134.46825608007862 0.4 0.1 0.0001 256 2048 15000
20 light-sweep-308 linear 0.0 robust 381939.1275250679 329628.9436641051 381939.1275250679 133.6209821974879 0.2 0.2 0.001 128 4096 30000
21 treasured-sweep-325 linear 0.25 robust 381322.0104772589 328353.58675398555 381322.0104772589 134.8950293943581 0.1 0.1 0.0001 512 2048 15000
22 fine-sweep-202 dqn 0.25 robust 378751.33572275366 326518.9068184018 378751.33572275366 137.2900973301052 0.1 0.2 0.0001 512 2048 30000
23 treasured-sweep-380 linear 0.25 no_robust 377898.0979419424 325869.1953595453 377898.0979419424 134.54118723889738 0.4 0.3 0.001 128 1024 50000
24 pretty-sweep-49 qtable 0.25 robust 377318.4766808995 325282.0152823859 377318.4766808995 137.19609012644068 0.4 0.1 0.0001 128 4096 50000
25 desert-sweep-253 linear 0.25 robust 376808.6335063269 325146.3478714648 376808.6335063269 134.48396340732663 0.2 0.1 0.0003 256 1024 30000
26 jolly-sweep-133 qtable 0.4 no_robust 376419.57394710975 323709.24588324485 376419.57394710975 137.8349363778071 0.1 0.3 0.0001 128 2048 50000

View File

@@ -0,0 +1,7 @@
alpha,runs_robust,runs_no_robust,eval_revenue_mean_robust,eval_revenue_mean_no_robust,eval_revenue_mean_delta,eval_revenue_mean_delta_pct,eval_reward_mean_robust,eval_reward_mean_no_robust,eval_reward_mean_delta,eval_reward_mean_delta_pct,eval_coi_level_mean_robust,eval_coi_level_mean_no_robust,eval_coi_level_mean_delta,eval_coi_level_mean_delta_pct,eval_coi_leakage_mean_robust,eval_coi_leakage_mean_no_robust,eval_coi_leakage_mean_delta,eval_coi_leakage_mean_delta_pct,eval_volatility_mean_robust,eval_volatility_mean_no_robust,eval_volatility_mean_delta,eval_volatility_mean_delta_pct,eval_margin_mean_robust,eval_margin_mean_no_robust,eval_margin_mean_delta,eval_margin_mean_delta_pct,train_alpha_adv_robust,train_alpha_adv_no_robust,train_alpha_adv_delta,train_alpha_adv_delta_pct,train_coi_penalty_robust,train_coi_penalty_no_robust,train_coi_penalty_delta,train_coi_penalty_delta_pct,train_ux_penalty_robust,train_ux_penalty_no_robust,train_ux_penalty_delta,train_ux_penalty_delta_pct,train_agent_prob_robust,train_agent_prob_no_robust,train_agent_prob_delta,train_agent_prob_delta_pct
0.0,4.0,4.0,3379.9042994670963,3565.2912010160844,-185.38690154898813,-5.199768857482219,313527.4707462,331300.229069,-17772.758322799986,-5.364547550342456,137.08358925982625,137.28764358955686,-0.2040543297306101,-0.14863269875959326,0.1146626165658294,0.11861133504329742,-0.003948718477468013,-3.3291240470622716,0.06687153537785637,0.06445662162531288,0.0024149137525434905,3.746572022625408,0.9315273502623671,0.9317078361627993,-0.00018048590043218127,-0.019371512552207898,0.18958333333333333,,,,5.553200113221484,,,,61.35134238638615,66.58479574844135,-5.233453362055201,-7.859832418540847,0.12778212146468534,0.11615891320235115,0.011623208262334192,10.00629907933654
0.1,4.0,4.0,3307.028238366196,3458.002436284769,-150.97419791857283,-4.365936713473732,306772.49146475,321215.477968,-14442.986503249966,-4.4963544704059375,137.1182041122497,136.82757579763506,0.29062831461465066,0.21240478238427865,0.1128546052304944,0.11704917861668755,-0.004194573386193154,-3.5835991638433753,0.0685405649303561,0.06737596899527175,0.0011645959350843477,1.728503430007924,0.9315331673960889,0.9313276818191593,0.00020548557692967595,0.0220637248243606,0.2818749999999999,0.1,0.18187499999999987,181.87499999999986,5.079528726095333,,,,52.44772950699336,53.288869747139515,-0.841140240146153,-1.578453895039319,0.11644381911386253,0.11765277436070229,-0.0012089552468397546,-1.0275620387270383
0.25,4.0,4.0,3134.3438215278165,3300.5539051855053,-166.21008365768876,-5.035823938416998,290691.4771835,306522.90003785,-15831.422854350007,-5.16484179563586,136.89990884669214,136.71752459667877,0.18238425001337077,0.1334022471160229,0.11113957413522965,0.1139905600539111,-0.0028509859186814507,-2.50107194607439,0.06427159998376095,0.06846858821082077,-0.004196988227059828,-6.12980103246314,0.9314501501825461,0.9313053225630614,0.0001448276194846443,0.015551035302371268,0.44833333333333336,0.25,0.19833333333333336,79.33333333333334,4.7183804755060255,,,,49.04307009982127,55.2030005738411,-6.159930474019831,-11.158687770568074,0.10998505830218755,0.11684259343269415,-0.0068575351305066035,-5.869037077182653
0.4,4.0,4.0,2983.852437569374,3180.7872854626567,-196.9348478932825,-6.191386918369099,276545.26309355,295433.5405797,-18888.277486150037,-6.393409986248494,136.19210761854086,136.5783021470118,-0.38619452847095204,-0.2827641890402586,0.10875560547061063,0.11189234314151972,-0.0031367376709090927,-2.8033532794480807,0.07452230347799255,0.07104688223410768,0.003475421243884863,4.891729425132195,0.9307282962514367,0.9310542820602117,-0.0003259858087749645,-0.03501254599824534,0.5999999999999999,0.4000000000000001,0.1999999999999998,49.999999999999936,4.174996403604185,,,,47.99794119802058,50.794260008988424,-2.796318810967847,-5.505186630286606,0.10222958892923095,0.11161526349272373,-0.009385674563492777,-8.408952565976458
0.6,4.0,4.0,2789.0434220430398,2982.2460998252786,-193.20267778223888,-6.4784283830083,258688.11700405,277051.95613675,-18363.8391327,-6.628301560749781,136.86774320500828,136.81931587629953,0.04842732870875466,0.035395096371142916,0.10501047827147733,0.10802266412956946,-0.0030121858580921257,-2.788475809557069,0.06914180963767007,0.06698591531512615,0.0021558943225439137,3.2184292957732996,0.9314130089130337,0.9313849217310588,2.8087181974889575e-05,0.003015636319588161,0.7733333333333334,0.5999999999999999,0.17333333333333356,28.888888888888935,4.178300996512875,,,,39.928062615509425,47.86860429278531,-7.940541677275881,-16.588203885594947,0.11297979438696983,0.1162670925925253,-0.0032872982055554695,-2.827367686122743
0.8,4.0,4.0,2586.098242115281,2841.1305915063504,-255.03234939106915,-8.97643882169642,239765.24959855,264140.55002745,-24375.300428900024,-9.228155399224729,136.5038826686135,137.28163778418497,-0.7777551155714661,-0.5665397995864124,0.10253056902792507,0.1031498585902154,-0.0006192895622903344,-0.6003784888844036,0.07325665736408164,0.06592454978099352,0.007332107583088124,11.1219683827132,0.9311235469993302,0.9316596013994161,-0.0005360544000858614,-0.05753758124541101,1.0,0.8000000000000002,0.19999999999999984,24.99999999999998,3.5384100686094007,,,,37.14414699970415,37.43809775029793,-0.29395075059377973,-0.7851647606519765,0.09990322635678014,0.10432800196112454,-0.0044247756043444,-4.241215705437541
1 alpha runs_robust runs_no_robust eval_revenue_mean_robust eval_revenue_mean_no_robust eval_revenue_mean_delta eval_revenue_mean_delta_pct eval_reward_mean_robust eval_reward_mean_no_robust eval_reward_mean_delta eval_reward_mean_delta_pct eval_coi_level_mean_robust eval_coi_level_mean_no_robust eval_coi_level_mean_delta eval_coi_level_mean_delta_pct eval_coi_leakage_mean_robust eval_coi_leakage_mean_no_robust eval_coi_leakage_mean_delta eval_coi_leakage_mean_delta_pct eval_volatility_mean_robust eval_volatility_mean_no_robust eval_volatility_mean_delta eval_volatility_mean_delta_pct eval_margin_mean_robust eval_margin_mean_no_robust eval_margin_mean_delta eval_margin_mean_delta_pct train_alpha_adv_robust train_alpha_adv_no_robust train_alpha_adv_delta train_alpha_adv_delta_pct train_coi_penalty_robust train_coi_penalty_no_robust train_coi_penalty_delta train_coi_penalty_delta_pct train_ux_penalty_robust train_ux_penalty_no_robust train_ux_penalty_delta train_ux_penalty_delta_pct train_agent_prob_robust train_agent_prob_no_robust train_agent_prob_delta train_agent_prob_delta_pct
2 0.0 4.0 4.0 3379.9042994670963 3565.2912010160844 -185.38690154898813 -5.199768857482219 313527.4707462 331300.229069 -17772.758322799986 -5.364547550342456 137.08358925982625 137.28764358955686 -0.2040543297306101 -0.14863269875959326 0.1146626165658294 0.11861133504329742 -0.003948718477468013 -3.3291240470622716 0.06687153537785637 0.06445662162531288 0.0024149137525434905 3.746572022625408 0.9315273502623671 0.9317078361627993 -0.00018048590043218127 -0.019371512552207898 0.18958333333333333 5.553200113221484 61.35134238638615 66.58479574844135 -5.233453362055201 -7.859832418540847 0.12778212146468534 0.11615891320235115 0.011623208262334192 10.00629907933654
3 0.1 4.0 4.0 3307.028238366196 3458.002436284769 -150.97419791857283 -4.365936713473732 306772.49146475 321215.477968 -14442.986503249966 -4.4963544704059375 137.1182041122497 136.82757579763506 0.29062831461465066 0.21240478238427865 0.1128546052304944 0.11704917861668755 -0.004194573386193154 -3.5835991638433753 0.0685405649303561 0.06737596899527175 0.0011645959350843477 1.728503430007924 0.9315331673960889 0.9313276818191593 0.00020548557692967595 0.0220637248243606 0.2818749999999999 0.1 0.18187499999999987 181.87499999999986 5.079528726095333 52.44772950699336 53.288869747139515 -0.841140240146153 -1.578453895039319 0.11644381911386253 0.11765277436070229 -0.0012089552468397546 -1.0275620387270383
4 0.25 4.0 4.0 3134.3438215278165 3300.5539051855053 -166.21008365768876 -5.035823938416998 290691.4771835 306522.90003785 -15831.422854350007 -5.16484179563586 136.89990884669214 136.71752459667877 0.18238425001337077 0.1334022471160229 0.11113957413522965 0.1139905600539111 -0.0028509859186814507 -2.50107194607439 0.06427159998376095 0.06846858821082077 -0.004196988227059828 -6.12980103246314 0.9314501501825461 0.9313053225630614 0.0001448276194846443 0.015551035302371268 0.44833333333333336 0.25 0.19833333333333336 79.33333333333334 4.7183804755060255 49.04307009982127 55.2030005738411 -6.159930474019831 -11.158687770568074 0.10998505830218755 0.11684259343269415 -0.0068575351305066035 -5.869037077182653
5 0.4 4.0 4.0 2983.852437569374 3180.7872854626567 -196.9348478932825 -6.191386918369099 276545.26309355 295433.5405797 -18888.277486150037 -6.393409986248494 136.19210761854086 136.5783021470118 -0.38619452847095204 -0.2827641890402586 0.10875560547061063 0.11189234314151972 -0.0031367376709090927 -2.8033532794480807 0.07452230347799255 0.07104688223410768 0.003475421243884863 4.891729425132195 0.9307282962514367 0.9310542820602117 -0.0003259858087749645 -0.03501254599824534 0.5999999999999999 0.4000000000000001 0.1999999999999998 49.999999999999936 4.174996403604185 47.99794119802058 50.794260008988424 -2.796318810967847 -5.505186630286606 0.10222958892923095 0.11161526349272373 -0.009385674563492777 -8.408952565976458
6 0.6 4.0 4.0 2789.0434220430398 2982.2460998252786 -193.20267778223888 -6.4784283830083 258688.11700405 277051.95613675 -18363.8391327 -6.628301560749781 136.86774320500828 136.81931587629953 0.04842732870875466 0.035395096371142916 0.10501047827147733 0.10802266412956946 -0.0030121858580921257 -2.788475809557069 0.06914180963767007 0.06698591531512615 0.0021558943225439137 3.2184292957732996 0.9314130089130337 0.9313849217310588 2.8087181974889575e-05 0.003015636319588161 0.7733333333333334 0.5999999999999999 0.17333333333333356 28.888888888888935 4.178300996512875 39.928062615509425 47.86860429278531 -7.940541677275881 -16.588203885594947 0.11297979438696983 0.1162670925925253 -0.0032872982055554695 -2.827367686122743
7 0.8 4.0 4.0 2586.098242115281 2841.1305915063504 -255.03234939106915 -8.97643882169642 239765.24959855 264140.55002745 -24375.300428900024 -9.228155399224729 136.5038826686135 137.28163778418497 -0.7777551155714661 -0.5665397995864124 0.10253056902792507 0.1031498585902154 -0.0006192895622903344 -0.6003784888844036 0.07325665736408164 0.06592454978099352 0.007332107583088124 11.1219683827132 0.9311235469993302 0.9316596013994161 -0.0005360544000858614 -0.05753758124541101 1.0 0.8000000000000002 0.19999999999999984 24.99999999999998 3.5384100686094007 37.14414699970415 37.43809775029793 -0.29395075059377973 -0.7851647606519765 0.09990322635678014 0.10432800196112454 -0.0044247756043444 -4.241215705437541

View File

@@ -0,0 +1,13 @@
alpha,mode,runs,eval_revenue_mean_mean,eval_revenue_mean_std,eval_reward_mean_mean,eval_reward_mean_std,eval_coi_level_mean_mean,eval_coi_level_mean_std,eval_coi_leakage_mean_mean,eval_coi_leakage_mean_std,eval_volatility_mean_mean,eval_volatility_mean_std,eval_margin_mean_mean,eval_margin_mean_std,train_alpha_adv_mean,train_alpha_adv_std,train_coi_penalty_mean,train_coi_penalty_std,train_ux_penalty_mean,train_ux_penalty_std,train_agent_prob_mean,train_agent_prob_std
0.0,no_robust,4,3565.2912010160844,52.219179508209216,331300.229069,5038.96659004527,137.28764358955686,0.6434240315013728,0.11861133504329742,0.004019332768284657,0.06445662162531288,0.004080405219050139,0.9317078361627993,0.00038018051704976865,,,,,66.58479574844135,32.282270089830455,0.11615891320235115,0.016558627227281013
0.0,robust,4,3379.9042994670963,54.727408939657735,313527.4707462,5408.058196552377,137.08358925982625,1.047386315387148,0.1146626165658294,0.0025627354157035497,0.06687153537785637,0.008577061675868377,0.9315273502623671,0.0007274203134899985,0.18958333333333333,0.02083333333333336,5.553200113221484,0.45981481828856186,61.35134238638615,30.27964905193963,0.12778212146468534,0.027929667978205217
0.1,no_robust,4,3458.002436284769,60.75923217871363,321215.477968,6016.373193216596,136.82757579763506,1.1899102161551907,0.11704917861668755,0.0021220259908233973,0.06737596899527175,0.006801136773079149,0.9313276818191593,0.0008352263172197586,0.1,0.0,,,53.288869747139515,18.480340945815023,0.11765277436070229,0.017544197575138736
0.1,robust,4,3307.028238366196,35.58495715224888,306772.49146475,3488.2690530060245,137.1182041122497,0.8582218376452346,0.1128546052304944,0.0005963155492967403,0.0685405649303561,0.0050673362512629015,0.9315331673960889,0.0005217376436765336,0.2818749999999999,0.03624999999999999,5.079528726095333,0.6109585102054891,52.44772950699336,29.0263361696475,0.11644381911386253,0.021152545180088765
0.25,no_robust,4,3300.5539051855053,50.460978662647115,306522.90003785,4860.668937531515,136.71752459667877,0.7410676951244369,0.1139905600539111,0.003319948537321803,0.06846858821082077,0.008614994548315848,0.9313053225630614,0.0004919872662680591,0.25,0.0,,,55.2030005738411,26.88247558235345,0.11684259343269415,0.013462146346772591
0.25,robust,4,3134.3438215278165,64.06834403659167,290691.4771835,6331.196493752059,136.89990884669214,1.3796663751798552,0.11113957413522965,0.0015044942041406348,0.06427159998376095,0.0042331619171274894,0.9314501501825461,0.0008939739741734515,0.44833333333333336,0.0033333333333333518,4.7183804755060255,0.4538389380858333,49.04307009982127,28.20484665432831,0.10998505830218755,0.010731404693185651
0.4,no_robust,4,3180.7872854626567,71.87564776824694,295433.5405797,7035.374110540269,136.5783021470118,1.7095219574599192,0.11189234314151972,0.0013821115134030936,0.07104688223410768,0.005766138692685495,0.9310542820602117,0.0013989725050689828,0.4000000000000001,0.0,,,50.794260008988424,24.836708377642946,0.11161526349272373,0.005787749200301594
0.4,robust,4,2983.852437569374,45.51290575912758,276545.26309355,4555.1725323898245,136.19210761854086,1.5546063667946701,0.10875560547061063,0.001118798290958954,0.07452230347799255,0.0040446395928049874,0.9307282962514367,0.0013558080014763189,0.5999999999999999,0.0,4.174996403604185,0.12189448324552496,47.99794119802058,33.51782503281748,0.10222958892923095,0.0031686467591609474
0.6,no_robust,4,2982.2460998252786,39.93674476199945,277051.95613675,3931.02017169463,136.81931587629953,1.1995405806950865,0.10802266412956946,0.000405835985606262,0.06698591531512615,0.002805894772223563,0.9313849217310588,0.0008100530228792662,0.5999999999999999,0.0,,,47.86860429278531,23.830502772642472,0.1162670925925253,0.028676813474186293
0.6,robust,4,2789.0434220430398,35.297482315631626,258688.11700405,3420.6735023624556,136.86774320500828,0.7097303238857778,0.10501047827147733,0.0008273121554488608,0.06914180963767007,0.009066158371268139,0.9314130089130337,0.0005024421703994162,0.7733333333333334,0.053333333333333385,4.178300996512875,0.5865970573865015,39.928062615509425,30.25078643153115,0.11297979438696983,0.0274101056520461
0.8,no_robust,4,2841.1305915063504,21.84043179776092,264140.55002745,2073.353315114627,137.28163778418497,0.6288968799501957,0.1031498585902154,0.0012877581835795701,0.06592454978099352,0.00340700896766341,0.9316596013994161,0.00038430108058413553,0.8000000000000002,0.0,,,37.43809775029793,32.01740090550489,0.10432800196112454,0.018337841526911584
0.8,robust,4,2586.098242115281,48.05539265296157,239765.24959855,4681.6472175597555,136.5038826686135,1.0611320896043694,0.10253056902792507,0.002587472569909977,0.07325665736408164,0.0015359324114246234,0.9311235469993302,0.0006145440308596868,1.0,0.0,3.5384100686094007,0.391972726035734,37.14414699970415,25.614063825315505,0.09990322635678014,0.010269342031085898
1 alpha mode runs eval_revenue_mean_mean eval_revenue_mean_std eval_reward_mean_mean eval_reward_mean_std eval_coi_level_mean_mean eval_coi_level_mean_std eval_coi_leakage_mean_mean eval_coi_leakage_mean_std eval_volatility_mean_mean eval_volatility_mean_std eval_margin_mean_mean eval_margin_mean_std train_alpha_adv_mean train_alpha_adv_std train_coi_penalty_mean train_coi_penalty_std train_ux_penalty_mean train_ux_penalty_std train_agent_prob_mean train_agent_prob_std
2 0.0 no_robust 4 3565.2912010160844 52.219179508209216 331300.229069 5038.96659004527 137.28764358955686 0.6434240315013728 0.11861133504329742 0.004019332768284657 0.06445662162531288 0.004080405219050139 0.9317078361627993 0.00038018051704976865 66.58479574844135 32.282270089830455 0.11615891320235115 0.016558627227281013
3 0.0 robust 4 3379.9042994670963 54.727408939657735 313527.4707462 5408.058196552377 137.08358925982625 1.047386315387148 0.1146626165658294 0.0025627354157035497 0.06687153537785637 0.008577061675868377 0.9315273502623671 0.0007274203134899985 0.18958333333333333 0.02083333333333336 5.553200113221484 0.45981481828856186 61.35134238638615 30.27964905193963 0.12778212146468534 0.027929667978205217
4 0.1 no_robust 4 3458.002436284769 60.75923217871363 321215.477968 6016.373193216596 136.82757579763506 1.1899102161551907 0.11704917861668755 0.0021220259908233973 0.06737596899527175 0.006801136773079149 0.9313276818191593 0.0008352263172197586 0.1 0.0 53.288869747139515 18.480340945815023 0.11765277436070229 0.017544197575138736
5 0.1 robust 4 3307.028238366196 35.58495715224888 306772.49146475 3488.2690530060245 137.1182041122497 0.8582218376452346 0.1128546052304944 0.0005963155492967403 0.0685405649303561 0.0050673362512629015 0.9315331673960889 0.0005217376436765336 0.2818749999999999 0.03624999999999999 5.079528726095333 0.6109585102054891 52.44772950699336 29.0263361696475 0.11644381911386253 0.021152545180088765
6 0.25 no_robust 4 3300.5539051855053 50.460978662647115 306522.90003785 4860.668937531515 136.71752459667877 0.7410676951244369 0.1139905600539111 0.003319948537321803 0.06846858821082077 0.008614994548315848 0.9313053225630614 0.0004919872662680591 0.25 0.0 55.2030005738411 26.88247558235345 0.11684259343269415 0.013462146346772591
7 0.25 robust 4 3134.3438215278165 64.06834403659167 290691.4771835 6331.196493752059 136.89990884669214 1.3796663751798552 0.11113957413522965 0.0015044942041406348 0.06427159998376095 0.0042331619171274894 0.9314501501825461 0.0008939739741734515 0.44833333333333336 0.0033333333333333518 4.7183804755060255 0.4538389380858333 49.04307009982127 28.20484665432831 0.10998505830218755 0.010731404693185651
8 0.4 no_robust 4 3180.7872854626567 71.87564776824694 295433.5405797 7035.374110540269 136.5783021470118 1.7095219574599192 0.11189234314151972 0.0013821115134030936 0.07104688223410768 0.005766138692685495 0.9310542820602117 0.0013989725050689828 0.4000000000000001 0.0 50.794260008988424 24.836708377642946 0.11161526349272373 0.005787749200301594
9 0.4 robust 4 2983.852437569374 45.51290575912758 276545.26309355 4555.1725323898245 136.19210761854086 1.5546063667946701 0.10875560547061063 0.001118798290958954 0.07452230347799255 0.0040446395928049874 0.9307282962514367 0.0013558080014763189 0.5999999999999999 0.0 4.174996403604185 0.12189448324552496 47.99794119802058 33.51782503281748 0.10222958892923095 0.0031686467591609474
10 0.6 no_robust 4 2982.2460998252786 39.93674476199945 277051.95613675 3931.02017169463 136.81931587629953 1.1995405806950865 0.10802266412956946 0.000405835985606262 0.06698591531512615 0.002805894772223563 0.9313849217310588 0.0008100530228792662 0.5999999999999999 0.0 47.86860429278531 23.830502772642472 0.1162670925925253 0.028676813474186293
11 0.6 robust 4 2789.0434220430398 35.297482315631626 258688.11700405 3420.6735023624556 136.86774320500828 0.7097303238857778 0.10501047827147733 0.0008273121554488608 0.06914180963767007 0.009066158371268139 0.9314130089130337 0.0005024421703994162 0.7733333333333334 0.053333333333333385 4.178300996512875 0.5865970573865015 39.928062615509425 30.25078643153115 0.11297979438696983 0.0274101056520461
12 0.8 no_robust 4 2841.1305915063504 21.84043179776092 264140.55002745 2073.353315114627 137.28163778418497 0.6288968799501957 0.1031498585902154 0.0012877581835795701 0.06592454978099352 0.00340700896766341 0.9316596013994161 0.00038430108058413553 0.8000000000000002 0.0 37.43809775029793 32.01740090550489 0.10432800196112454 0.018337841526911584
13 0.8 robust 4 2586.098242115281 48.05539265296157 239765.24959855 4681.6472175597555 136.5038826686135 1.0611320896043694 0.10253056902792507 0.002587472569909977 0.07325665736408164 0.0015359324114246234 0.9311235469993302 0.0006145440308596868 1.0 0.0 3.5384100686094007 0.391972726035734 37.14414699970415 25.614063825315505 0.09990322635678014 0.010269342031085898

View File

@@ -0,0 +1,7 @@
{
"status": "ok",
"revenue_delta": -191.29017636530716,
"revenue_delta_pct": -5.938226273545598,
"coi_leakage_delta": -0.002960415145605702,
"coi_leakage_delta_pct": -2.6404147469510946
}

View File

@@ -0,0 +1,3 @@
mode,runs,eval_revenue_mean_mean,eval_revenue_mean_std,eval_reward_mean_mean,eval_reward_mean_std,eval_coi_level_mean_mean,eval_coi_level_mean_std,eval_coi_leakage_mean_mean,eval_coi_leakage_mean_std,eval_volatility_mean_mean,eval_volatility_mean_std,eval_margin_mean_mean,eval_margin_mean_std,train_alpha_adv_mean,train_alpha_adv_std,train_coi_penalty_mean,train_coi_penalty_std,train_ux_penalty_mean,train_ux_penalty_std,train_agent_prob_mean,train_agent_prob_std
no_robust,24,3221.335253213441,262.46595166337727,299277.442303125,24382.561944761477,136.9186666318945,1.0038463876967063,0.11211932326253345,0.005805494533542669,0.06737642102693879,0.005402738047823369,0.9314066076226178,0.0007436370959663933,0.43,0.2546411303445653,,,51.86293802024894,25.340287421525442,0.11381077317368686,0.016664235359362907
robust,24,3030.0450768481337,288.262657026656,280998.34484843333,26820.020161880373,136.77757261848845,1.06224696086916,0.10915890811692774,0.004616462637659704,0.06943407846195294,0.006435789449278624,0.9312959200008004,0.0007858424519830652,0.5488541666666666,0.2860373751485706,4.540469463924883,0.7906156355346259,47.985382134405825,27.407657819442747,0.11155393475895271,0.01943348418653492
1 mode runs eval_revenue_mean_mean eval_revenue_mean_std eval_reward_mean_mean eval_reward_mean_std eval_coi_level_mean_mean eval_coi_level_mean_std eval_coi_leakage_mean_mean eval_coi_leakage_mean_std eval_volatility_mean_mean eval_volatility_mean_std eval_margin_mean_mean eval_margin_mean_std train_alpha_adv_mean train_alpha_adv_std train_coi_penalty_mean train_coi_penalty_std train_ux_penalty_mean train_ux_penalty_std train_agent_prob_mean train_agent_prob_std
2 no_robust 24 3221.335253213441 262.46595166337727 299277.442303125 24382.561944761477 136.9186666318945 1.0038463876967063 0.11211932326253345 0.005805494533542669 0.06737642102693879 0.005402738047823369 0.9314066076226178 0.0007436370959663933 0.43 0.2546411303445653 51.86293802024894 25.340287421525442 0.11381077317368686 0.016664235359362907
3 robust 24 3030.0450768481337 288.262657026656 280998.34484843333 26820.020161880373 136.77757261848845 1.06224696086916 0.10915890811692774 0.004616462637659704 0.06943407846195294 0.006435789449278624 0.9312959200008004 0.0007858424519830652 0.5488541666666666 0.2860373751485706 4.540469463924883 0.7906156355346259 47.985382134405825 27.407657819442747 0.11155393475895271 0.01943348418653492

View File

@@ -0,0 +1,25 @@
alpha,metric,direction,wins,ties,total_pairs,win_probability
0.0,eval/revenue_mean,higher,0,0,16,0.0
0.0,eval/reward_mean,higher,0,0,16,0.0
0.0,eval/coi_leakage_mean,lower,14,0,16,0.875
0.0,eval/volatility_mean,lower,8,0,16,0.5
0.1,eval/revenue_mean,higher,0,0,16,0.0
0.1,eval/reward_mean,higher,0,0,16,0.0
0.1,eval/coi_leakage_mean,lower,16,0,16,1.0
0.1,eval/volatility_mean,lower,8,0,16,0.5
0.25,eval/revenue_mean,higher,0,0,16,0.0
0.25,eval/reward_mean,higher,0,0,16,0.0
0.25,eval/coi_leakage_mean,lower,12,0,16,0.75
0.25,eval/volatility_mean,lower,11,0,16,0.6875
0.4,eval/revenue_mean,higher,0,0,16,0.0
0.4,eval/reward_mean,higher,0,0,16,0.0
0.4,eval/coi_leakage_mean,lower,16,0,16,1.0
0.4,eval/volatility_mean,lower,6,0,16,0.375
0.6,eval/revenue_mean,higher,0,0,16,0.0
0.6,eval/reward_mean,higher,0,0,16,0.0
0.6,eval/coi_leakage_mean,lower,16,0,16,1.0
0.6,eval/volatility_mean,lower,7,0,16,0.4375
0.8,eval/revenue_mean,higher,0,0,16,0.0
0.8,eval/reward_mean,higher,0,0,16,0.0
0.8,eval/coi_leakage_mean,lower,11,0,16,0.6875
0.8,eval/volatility_mean,lower,0,0,16,0.0
1 alpha metric direction wins ties total_pairs win_probability
2 0.0 eval/revenue_mean higher 0 0 16 0.0
3 0.0 eval/reward_mean higher 0 0 16 0.0
4 0.0 eval/coi_leakage_mean lower 14 0 16 0.875
5 0.0 eval/volatility_mean lower 8 0 16 0.5
6 0.1 eval/revenue_mean higher 0 0 16 0.0
7 0.1 eval/reward_mean higher 0 0 16 0.0
8 0.1 eval/coi_leakage_mean lower 16 0 16 1.0
9 0.1 eval/volatility_mean lower 8 0 16 0.5
10 0.25 eval/revenue_mean higher 0 0 16 0.0
11 0.25 eval/reward_mean higher 0 0 16 0.0
12 0.25 eval/coi_leakage_mean lower 12 0 16 0.75
13 0.25 eval/volatility_mean lower 11 0 16 0.6875
14 0.4 eval/revenue_mean higher 0 0 16 0.0
15 0.4 eval/reward_mean higher 0 0 16 0.0
16 0.4 eval/coi_leakage_mean lower 16 0 16 1.0
17 0.4 eval/volatility_mean lower 6 0 16 0.375
18 0.6 eval/revenue_mean higher 0 0 16 0.0
19 0.6 eval/reward_mean higher 0 0 16 0.0
20 0.6 eval/coi_leakage_mean lower 16 0 16 1.0
21 0.6 eval/volatility_mean lower 7 0 16 0.4375
22 0.8 eval/revenue_mean higher 0 0 16 0.0
23 0.8 eval/reward_mean higher 0 0 16 0.0
24 0.8 eval/coi_leakage_mean lower 11 0 16 0.6875
25 0.8 eval/volatility_mean lower 0 0 16 0.0

View File

@@ -0,0 +1 @@
\includegraphics[width=0.98\linewidth]{chapters/figures/results/generated/final/plots/final_focus_revenue_by_alpha.pdf}

View File

@@ -0,0 +1 @@
\includegraphics[width=0.95\linewidth]{chapters/figures/results/generated/final/plots/final_focus_revenue_delta.pdf}

View File

@@ -0,0 +1 @@
\includegraphics[width=0.95\linewidth]{chapters/figures/results/generated/final/plots/final_focus_risk_deltas.pdf}

View File

@@ -0,0 +1 @@
\includegraphics[width=0.99\linewidth]{chapters/figures/results/generated/legacy/plots/first_sweep_tier_revenue.pdf}

View File

@@ -0,0 +1 @@
\includegraphics[width=0.98\linewidth]{chapters/figures/results/generated/legacy/plots/ppo_alpha_curves.pdf}

View File

@@ -0,0 +1 @@
\includegraphics[width=0.98\linewidth]{chapters/figures/results/generated/legacy/plots/ppo_delta_curves.pdf}

View File

@@ -0,0 +1 @@
\includegraphics[width=0.88\linewidth]{chapters/figures/results/generated/legacy/plots/ppo_tradeoff_scatter.pdf}

View File

@@ -0,0 +1,313 @@
from __future__ import annotations
import argparse
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import numpy as np
import pandas as pd
from process_first_sweep import run as run_first_sweep
from process_ppo_benchmark import run as run_ppo_benchmark
def _output_dir() -> Path:
return Path(__file__).resolve().parent / "generated" / "legacy"
def _plot_dir() -> Path:
    """Return the default ``plots`` folder nested under the legacy output directory."""
    return _output_dir().joinpath("plots")
def _configure_style() -> None:
    """Apply the shared Matplotlib rcParams used by every figure in this script.

    Mutates the global ``plt.rcParams``; call once before creating figures.
    """
    typography = {
        "font.family": "serif",
        "font.size": 10,
        "axes.titlesize": 10,
        "axes.labelsize": 9,
        "legend.fontsize": 8,
        "xtick.labelsize": 8,
        "ytick.labelsize": 8,
    }
    resolution = {
        "figure.dpi": 220,
        "savefig.dpi": 320,
    }
    axes_chrome = {
        "axes.spines.top": False,
        "axes.spines.right": False,
        "axes.grid": True,
        "grid.alpha": 0.22,
    }
    plt.rcParams.update({**typography, **resolution, **axes_chrome})
def _fmt_thousands(value: float, _: int) -> str:
return f"{int(value):,}"
def _load_csv(path: Path) -> pd.DataFrame:
if not path.exists():
raise FileNotFoundError(f"Missing required input: {path}")
return pd.read_csv(path)
def _plot_ppo_alpha_curves(alpha_mode: pd.DataFrame, out_dir: Path) -> Path:
    """Draw a 2x2 grid of metric-vs-alpha curves for both training modes.

    Each panel plots the per-alpha mean of one metric for the ``no_robust`` and
    ``robust`` rows of ``alpha_mode``, with a +/- one-std band when the matching
    ``*_std`` column is present.

    Args:
        alpha_mode: Summary table with ``alpha``, ``mode`` and
            ``<metric>_mean`` / ``<metric>_std`` columns.
        out_dir: Directory that receives ``ppo_alpha_curves.pdf``.

    Returns:
        Path of the written PDF.
    """
    figure, grid = plt.subplots(2, 2, figsize=(9.3, 6.4), constrained_layout=True)

    # (mode key, line colour, legend label) in drawing order.
    series_styles = (
        ("no_robust", "#4C72B0", "Non-robust"),
        ("robust", "#C44E52", "Robust"),
    )
    # (metric column prefix, panel title, y-axis label)
    panel_specs = (
        ("eval_revenue_mean", "Mean Episode Revenue", "Revenue"),
        ("eval_reward_mean", "Mean Episode Reward", "Reward"),
        ("eval_coi_leakage_mean", "Mean COI Leakage", "COI Leakage"),
        ("eval_volatility_mean", "Mean Price Volatility", "Volatility"),
    )
    thousands_metrics = {"eval_revenue_mean", "eval_reward_mean"}

    for axis, (prefix, panel_title, y_label) in zip(grid.flat, panel_specs):
        mean_column = f"{prefix}_mean"
        std_column = f"{prefix}_std"

        for mode_key, colour, legend_label in series_styles:
            rows = alpha_mode[alpha_mode["mode"] == mode_key].sort_values("alpha")
            if rows.empty:
                continue
            alphas = rows["alpha"].to_numpy(dtype=float)
            means = rows[mean_column].to_numpy(dtype=float)
            axis.plot(
                alphas,
                means,
                marker="o",
                linewidth=1.8,
                markersize=4,
                color=colour,
                label=legend_label,
            )
            if std_column not in rows.columns:
                continue
            # Missing std values are treated as zero spread.
            spread = rows[std_column].fillna(0.0).to_numpy(dtype=float)
            axis.fill_between(
                alphas,
                means - spread,
                means + spread,
                color=colour,
                alpha=0.14,
                linewidth=0,
            )

        axis.set_title(panel_title)
        axis.set_xlabel(r"Contamination $\alpha$")
        axis.set_ylabel(y_label)
        axis.set_xticks(sorted(alpha_mode["alpha"].unique()))
        if prefix in thousands_metrics:
            axis.yaxis.set_major_formatter(FuncFormatter(_fmt_thousands))

    # One shared legend, taken from the first panel's line handles.
    legend_handles, legend_labels = grid.flat[0].get_legend_handles_labels()
    figure.legend(
        legend_handles,
        legend_labels,
        ncol=2,
        loc="upper center",
        bbox_to_anchor=(0.5, 1.02),
    )

    destination = out_dir / "ppo_alpha_curves.pdf"
    figure.savefig(destination, bbox_inches="tight")
    plt.close(figure)
    return destination
def _plot_ppo_delta_curves(deltas: pd.DataFrame, out_dir: Path) -> Path:
    """Plot percentage deltas against alpha in two stacked panels.

    The upper panel shows the revenue and reward deltas, the lower panel the
    COI-leakage and volatility deltas; both carry a dashed zero reference line.

    Args:
        deltas: Per-alpha table containing ``alpha`` and the four
            ``*_delta_pct`` columns plotted here.
        out_dir: Directory that receives ``ppo_delta_curves.pdf``.

    Returns:
        Path of the written PDF.
    """
    figure, (upper, lower) = plt.subplots(
        2, 1, figsize=(8.6, 6.0), constrained_layout=True
    )
    ordered = deltas.sort_values("alpha")
    alphas = ordered["alpha"].to_numpy(dtype=float)

    # (axis, optional title, ((column, legend label, colour), ...))
    panel_plan = (
        (
            upper,
            "Robust Minus Non-robust Delta by Contamination",
            (
                ("eval_revenue_mean_delta_pct", "Revenue", "#4C72B0"),
                ("eval_reward_mean_delta_pct", "Reward", "#8172B3"),
            ),
        ),
        (
            lower,
            None,
            (
                ("eval_coi_leakage_mean_delta_pct", "COI Leakage", "#55A868"),
                ("eval_volatility_mean_delta_pct", "Volatility", "#DD8452"),
            ),
        ),
    )

    for axis, heading, curves in panel_plan:
        for column, legend_label, colour in curves:
            axis.plot(
                alphas,
                ordered[column].to_numpy(dtype=float),
                marker="o",
                linewidth=1.8,
                markersize=4,
                color=colour,
                label=legend_label,
            )
        axis.axhline(0.0, color="#444444", linewidth=1.0, linestyle="--")
        if heading is not None:
            axis.set_title(heading)
        axis.set_ylabel("Delta (%)")
        axis.set_xlabel(r"Contamination $\alpha$")
        axis.set_xticks(alphas)
        axis.legend(loc="lower left")

    destination = out_dir / "ppo_delta_curves.pdf"
    figure.savefig(destination, bbox_inches="tight")
    plt.close(figure)
    return destination
def _plot_ppo_tradeoff_scatter(deltas: pd.DataFrame, out_dir: Path) -> Path:
    """Scatter revenue delta (%) against COI-leakage delta (%), coloured by alpha.

    Every point is annotated with its alpha value, and dashed lines mark the
    zero level on both axes.

    Args:
        deltas: Per-alpha table with ``alpha``,
            ``eval_coi_leakage_mean_delta_pct`` and
            ``eval_revenue_mean_delta_pct`` columns.
        out_dir: Directory that receives ``ppo_tradeoff_scatter.pdf``.

    Returns:
        Path of the written PDF.
    """
    figure, axis = plt.subplots(figsize=(6.4, 5.2), constrained_layout=True)

    ordered = deltas.sort_values("alpha")
    leakage_delta = ordered["eval_coi_leakage_mean_delta_pct"].to_numpy(dtype=float)
    revenue_delta = ordered["eval_revenue_mean_delta_pct"].to_numpy(dtype=float)
    alpha_values = ordered["alpha"].to_numpy(dtype=float)

    points = axis.scatter(
        leakage_delta,
        revenue_delta,
        c=alpha_values,
        cmap="viridis",
        s=72,
        edgecolor="#222222",
        linewidth=0.5,
    )
    for idx, level in enumerate(alpha_values):
        axis.annotate(
            rf"$\alpha={level:.2f}$",
            (leakage_delta[idx], revenue_delta[idx]),
            textcoords="offset points",
            xytext=(5, 4),
            fontsize=8,
        )

    # Zero reference lines: horizontal first, then vertical.
    for draw_reference in (axis.axhline, axis.axvline):
        draw_reference(0.0, color="#555555", linewidth=1.0, linestyle="--")

    axis.set_xlabel("COI Leakage Delta (%)")
    axis.set_ylabel("Revenue Delta (%)")
    axis.set_title("PPO Robust Tradeoff Frontier")
    colour_bar = figure.colorbar(points, ax=axis)
    colour_bar.set_label(r"Contamination $\alpha$")

    destination = out_dir / "ppo_tradeoff_scatter.pdf"
    figure.savefig(destination, bbox_inches="tight")
    plt.close(figure)
    return destination
def _plot_first_sweep_tier_revenue(tier_mode: pd.DataFrame, out_dir: Path) -> Path:
    """Compare robust and non-robust mean revenue per tier as bar charts.

    The left panel shows grouped bars on a log y-axis; the right panel shows
    the robust-minus-non-robust revenue difference as a percentage of the
    non-robust value. Tiers missing either mode are dropped.

    Args:
        tier_mode: Long-format table with ``tier``, ``mode`` and
            ``eval_revenue_mean_mean`` columns.
        out_dir: Directory that receives ``first_sweep_tier_revenue.pdf``.

    Returns:
        Path of the written PDF.

    Raises:
        ValueError: If no tier has both a robust and a non-robust row.
    """
    wide = tier_mode.pivot(
        index="tier", columns="mode", values="eval_revenue_mean_mean"
    )
    paired = wide.dropna(subset=["robust", "no_robust"], how="any").copy()
    if paired.empty:
        raise ValueError("First sweep tier summary missing robust/non-robust pairs")

    tiers = sorted(paired.index.tolist())
    paired = paired.loc[tiers]
    baseline_revenue = paired["no_robust"]
    robust_revenue = paired["robust"]
    delta_pct = 100.0 * (robust_revenue - baseline_revenue) / baseline_revenue

    figure, (revenue_ax, delta_ax) = plt.subplots(
        1, 2, figsize=(10.2, 4.3), constrained_layout=True
    )
    positions = np.arange(len(tiers))
    bar_width = 0.36
    half_width = bar_width / 2

    # Left panel: side-by-side bars per tier.
    revenue_ax.bar(
        positions - half_width,
        baseline_revenue.to_numpy(dtype=float),
        width=bar_width,
        label="Non-robust",
        color="#4C72B0",
    )
    revenue_ax.bar(
        positions + half_width,
        robust_revenue.to_numpy(dtype=float),
        width=bar_width,
        label="Robust",
        color="#C44E52",
    )
    revenue_ax.set_ylabel("Mean Revenue")
    # The formatter is installed after switching to log scale, as in the
    # original statement order.
    revenue_ax.set_yscale("log")
    revenue_ax.yaxis.set_major_formatter(FuncFormatter(_fmt_thousands))
    revenue_ax.set_title("First Sweep Tier Revenue (log scale)")
    revenue_ax.legend()

    # Right panel: relative revenue change per tier.
    delta_ax.bar(positions, delta_pct.to_numpy(dtype=float), color="#55A868", width=0.55)
    delta_ax.axhline(0.0, color="#444444", linewidth=1.0, linestyle="--")
    delta_ax.set_ylabel("Revenue Delta (%)")
    delta_ax.set_title("Robust Minus Non-robust by Tier")

    # Both panels share the same tier tick labels.
    for axis in (revenue_ax, delta_ax):
        axis.set_xticks(positions)
        axis.set_xticklabels(tiers, rotation=20)

    destination = out_dir / "first_sweep_tier_revenue.pdf"
    figure.savefig(destination, bbox_inches="tight")
    plt.close(figure)
    return destination
def build_plots(data_dir: Path, out_dir: Path) -> list[Path]:
    """Load the processed CSVs from ``data_dir`` and render all legacy figures.

    All three inputs are loaded before ``out_dir`` is created, so a missing
    input aborts before anything is written.

    Args:
        data_dir: Directory holding ``ppo_alpha_mode_summary.csv``,
            ``ppo_alpha_deltas.csv`` and ``first_sweep_tier_mode_summary.csv``.
        out_dir: Directory for the PDFs; created if absent.

    Returns:
        Paths of the generated PDFs, in rendering order.
    """
    alpha_mode, deltas, tier_mode = (
        _load_csv(data_dir / filename)
        for filename in (
            "ppo_alpha_mode_summary.csv",
            "ppo_alpha_deltas.csv",
            "first_sweep_tier_mode_summary.csv",
        )
    )
    out_dir.mkdir(parents=True, exist_ok=True)

    produced: list[Path] = []
    produced.append(_plot_ppo_alpha_curves(alpha_mode, out_dir))
    produced.append(_plot_ppo_delta_curves(deltas, out_dir))
    produced.append(_plot_ppo_tradeoff_scatter(deltas, out_dir))
    produced.append(_plot_first_sweep_tier_revenue(tier_mode, out_dir))
    return produced
def main() -> None:
    """Command-line entry point: optionally refresh the CSVs, then build plots.

    Flags:
        --data-dir: Directory with the processed CSVs (default: ``_output_dir()``).
        --plot-dir: Directory for the PDFs (default: ``_plot_dir()``).
        --refresh-data: Re-run both processing scripts before plotting.

    Prints the path of each generated figure, one per line.
    """
    cli = argparse.ArgumentParser(
        description="Create paper-ready plots from result CSVs"
    )
    cli.add_argument("--data-dir", type=Path, default=_output_dir())
    cli.add_argument("--plot-dir", type=Path, default=_plot_dir())
    cli.add_argument(
        "--refresh-data",
        action="store_true",
        help="Regenerate processed CSVs before plotting",
    )
    options = cli.parse_args()

    _configure_style()

    if options.refresh_data:
        # Raw exports are read from <ancestor 5 levels up>/tpu_orchestration/results.
        results_dir = (
            Path(__file__).resolve().parents[5] / "tpu_orchestration" / "results"
        )
        run_ppo_benchmark(
            input_path=results_dir / "ppo_benchmark.csv",
            output_dir=options.data_dir,
            include_non_finished=False,
        )
        run_first_sweep(
            input_path=results_dir / "first_sweep.csv",
            output_dir=options.data_dir,
            include_non_finished=False,
            top_n=25,
        )

    generated = build_plots(data_dir=options.data_dir, out_dir=options.plot_dir)
    for figure_path in generated:
        print(figure_path)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,658 @@
from __future__ import annotations
import argparse
from pathlib import Path
from typing import Iterable
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import numpy as np
import pandas as pd
def _load_tikzplotlib():
    """Try to import ``tikzplotlib``, patching missing attributes first.

    Returns:
        ``(module, None)`` when the import succeeds, otherwise ``(None, exc)``
        where ``exc`` is the exception raised by the final import attempt.
    """

    def _patch_webcolors() -> None:
        # Recreate ``webcolors.CSS3_HEX_TO_NAMES`` when the installed webcolors
        # lacks it. Any failure (including webcolors not being installed) is
        # ignored on purpose: this is a best-effort shim.
        # NOTE(review): presumably tikzplotlib reads this attribute - confirm.
        try:
            import webcolors

            if hasattr(webcolors, "CSS3_HEX_TO_NAMES"):
                return
            css3 = getattr(webcolors, "CSS3", "css3")
            webcolors.CSS3_HEX_TO_NAMES = {
                webcolors.name_to_hex(name, spec=css3): name
                for name in webcolors.names(spec=css3)
            }
        except Exception:
            return

    _patch_webcolors()
    # Expose ``Legend._ncol`` as a read-only alias of ``_ncols`` when only the
    # latter exists on the installed Matplotlib.
    try:
        from matplotlib.legend import Legend

        if not hasattr(Legend, "_ncol") and hasattr(Legend, "_ncols"):
            Legend._ncol = property(lambda self: self._ncols)
    except Exception:
        pass
    # First attempt: import with only the patches above applied.
    try:
        import tikzplotlib as module

        return module, None
    except Exception:
        pass
    # Second attempt: alias ``backend_pgf.common_texification`` to
    # ``_tex_escape`` when missing, re-apply the webcolors patch, and retry.
    try:
        from matplotlib.backends import backend_pgf

        if not hasattr(backend_pgf, "common_texification") and hasattr(
            backend_pgf, "_tex_escape"
        ):
            backend_pgf.common_texification = backend_pgf._tex_escape
        _patch_webcolors()
        import tikzplotlib as module

        return module, None
    except Exception as exc:
        return None, exc


# Resolved once at import time; TIKZPLOTLIB is None when the import failed and
# TIKZPLOTLIB_IMPORT_ERROR then holds the exception that caused it.
TIKZPLOTLIB, TIKZPLOTLIB_IMPORT_ERROR = _load_tikzplotlib()
def _default_output_dir() -> Path:
return Path(__file__).resolve().parent / "generated" / "wandb"
def _default_plot_dir(output_dir: Path) -> Path:
return output_dir / "plots"
def _sanitize(key: str) -> str:
return key.replace("/", "_").replace("-", "_")
def _configure_style() -> None:
    """Apply this script's Matplotlib rcParams (fonts, DPI, spines, grid).

    Mutates the global ``plt.rcParams``; call once before creating figures.
    """
    settings = {
        # Typography
        "font.family": "serif",
        "font.size": 10,
        "axes.titlesize": 10,
        "axes.labelsize": 9,
        "legend.fontsize": 8,
        "xtick.labelsize": 8,
        "ytick.labelsize": 8,
        # Resolution on screen and on save
        "figure.dpi": 220,
        "savefig.dpi": 320,
        # Axes decoration
        "axes.spines.top": False,
        "axes.spines.right": False,
        "axes.grid": True,
        "grid.alpha": 0.22,
    }
    plt.rcParams.update(settings)
def _fmt_thousands(value: float, _: int) -> str:
return f"{int(value):,}"
def _coerce_numeric(frame: pd.DataFrame, columns: Iterable[str]) -> None:
for column in columns:
if column in frame.columns:
frame[column] = pd.to_numeric(frame[column], errors="coerce")
def _extract_alpha(frame: pd.DataFrame) -> pd.Series:
if "study/alpha" in frame.columns:
return pd.to_numeric(frame["study/alpha"], errors="coerce")
if "alpha" in frame.columns:
return pd.to_numeric(frame["alpha"], errors="coerce")
return pd.Series(np.nan, index=frame.index, dtype=float)
def _extract_mode(frame: pd.DataFrame) -> pd.Series:
if "study/mode" in frame.columns:
mode = frame["study/mode"].astype(str).str.strip().str.lower()
mapping = {
"baseline": "baseline",
"no_robust": "baseline",
"defended": "defended",
"robust": "defended",
}
return mode.map(mapping).fillna("")
if "study/no_robust" in frame.columns:
no_robust = pd.to_numeric(frame["study/no_robust"], errors="coerce").fillna(0.0)
return pd.Series(
np.where(no_robust > 0.5, "baseline", "defended"),
index=frame.index,
dtype="object",
)
if "no_robust" in frame.columns:
no_robust = (
frame["no_robust"].astype(str).str.lower().isin({"1", "true", "yes"})
)
return pd.Series(
np.where(no_robust, "baseline", "defended"),
index=frame.index,
dtype="object",
)
return pd.Series("", index=frame.index, dtype="object")
def _prepare_frame(frame: pd.DataFrame, include_non_finished: bool) -> pd.DataFrame:
    """Normalise a raw run export into the table used for the summaries.

    Steps:
        1. Optionally keep only rows whose ``State`` is ``finished``
           (case-insensitive), when that column exists.
        2. Add ``alpha`` and ``mode`` columns via ``_extract_alpha`` and
           ``_extract_mode``.
        3. Drop rows whose mode is not ``baseline``/``defended`` or whose alpha
           is missing.
        4. Coerce the known metric and hyper-parameter columns to numeric.

    Args:
        frame: Raw run table; not modified.
        include_non_finished: When False, rows with a non-finished ``State``
            are removed.

    Returns:
        A new DataFrame sorted by ``alpha`` then ``mode``, with a fresh index.
    """
    data = frame.copy()
    if not include_non_finished and "State" in data.columns:
        data = data[data["State"].astype(str).str.lower() == "finished"].copy()
    data["alpha"] = _extract_alpha(data)
    data["mode"] = _extract_mode(data)
    usable = data["mode"].isin({"baseline", "defended"}) & data["alpha"].notna()
    # Take an explicit copy: `_coerce_numeric` assigns columns in place, and
    # doing that on a boolean-filtered slice raises SettingWithCopyWarning on
    # pandas versions without copy-on-write.
    data = data[usable].copy()
    _coerce_numeric(
        data,
        [
            "eval/revenue_mean",
            "eval/reward_mean",
            "eval/coi_level_mean",
            "eval/coi_leakage_mean",
            "eval/volatility_mean",
            "eval/revenue_std",
            "eval/reward_std",
            "eval/margin_mean",
            "train/agent_prob",
            "train/alpha_adv",
            "lambda_coi",
            "ambiguity_radius",
            "n_products",
        ],
    )
    return data.sort_values(["alpha", "mode"]).reset_index(drop=True)
def _summary_by_alpha_mode(frame: pd.DataFrame, metrics: list[str]) -> pd.DataFrame:
    """Aggregate run counts and per-metric mean/std for every (alpha, mode) pair.

    Args:
        frame: Prepared run table with ``alpha``, ``mode`` and metric columns.
        metrics: Metric column names to summarise.

    Returns:
        One row per (alpha, mode) with ``runs`` plus ``<sanitized>_mean`` and
        ``<sanitized>_std`` columns, sorted by alpha then mode.
    """
    named_aggs: dict[str, tuple[str, str]] = {"runs": ("mode", "size")}
    for metric in metrics:
        prefix = _sanitize(metric)
        for statistic in ("mean", "std"):
            named_aggs[f"{prefix}_{statistic}"] = (metric, statistic)

    group_keys = ["alpha", "mode"]
    aggregated = frame.groupby(group_keys, as_index=False).agg(**named_aggs)
    return aggregated.sort_values(group_keys).reset_index(drop=True)
def _delta_by_alpha(summary: pd.DataFrame, metrics: list[str]) -> pd.DataFrame:
    """Compute defended-minus-baseline differences of each metric per alpha.

    Alpha levels lacking either a ``defended`` or a ``baseline`` row are
    skipped. The percentage delta is NaN when the baseline mean is exactly 0.

    Args:
        summary: Output of ``_summary_by_alpha_mode``.
        metrics: Metric names whose ``<sanitized>_mean`` columns are compared.

    Returns:
        One row per alpha with run counts and, for every metric, the
        ``_defended``, ``_baseline``, ``_delta`` and ``_delta_pct`` columns.
    """

    def _first(part: pd.DataFrame, column: str) -> float:
        # First row's value of ``column`` as a plain float.
        return float(part[column].iloc[0])

    records: list[dict[str, float]] = []
    for alpha_level, group in summary.groupby("alpha", sort=True):
        defended_rows = group[group["mode"] == "defended"]
        baseline_rows = group[group["mode"] == "baseline"]
        if defended_rows.empty or baseline_rows.empty:
            continue

        record: dict[str, float] = {
            "alpha": float(alpha_level),
            "runs_defended": _first(defended_rows, "runs"),
            "runs_baseline": _first(baseline_rows, "runs"),
        }
        for metric in metrics:
            prefix = _sanitize(metric)
            mean_column = f"{prefix}_mean"
            treated = _first(defended_rows, mean_column)
            reference = _first(baseline_rows, mean_column)
            difference = treated - reference
            if reference == 0:
                relative = np.nan
            else:
                relative = 100.0 * difference / reference
            record[f"{prefix}_defended"] = treated
            record[f"{prefix}_baseline"] = reference
            record[f"{prefix}_delta"] = difference
            record[f"{prefix}_delta_pct"] = relative
        records.append(record)

    return pd.DataFrame(records)
def _summary_by_parameter(
    frame: pd.DataFrame, parameter: str, metrics: list[str]
) -> pd.DataFrame:
    """Summarise defended runs per (alpha, ``parameter``) combination.

    Only rows with ``mode == "defended"`` and a non-missing ``parameter``
    value are included.

    Args:
        frame: Prepared run table.
        parameter: Column name to group by alongside ``alpha``.
        metrics: Metric column names to summarise.

    Returns:
        One row per (alpha, parameter value) with ``runs`` plus
        ``<sanitized>_mean`` / ``<sanitized>_std`` columns, sorted by both keys.
    """
    subset = frame[frame["mode"] == "defended"].copy()
    subset = subset[subset[parameter].notna()].copy()

    named_aggs: dict[str, tuple[str, str]] = {"runs": ("mode", "size")}
    for metric in metrics:
        prefix = _sanitize(metric)
        for statistic in ("mean", "std"):
            named_aggs[f"{prefix}_{statistic}"] = (metric, statistic)

    group_keys = ["alpha", parameter]
    aggregated = subset.groupby(group_keys, as_index=False).agg(**named_aggs)
    return aggregated.sort_values(group_keys).reset_index(drop=True)
def _save_table(frame: pd.DataFrame, path: Path) -> Path:
    """Write ``frame`` to ``path`` as CSV without the index column.

    Parent directories are created as needed. Returns ``path`` unchanged.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    frame.to_csv(path, index=False)
    return path
def _save_figure(fig: plt.Figure, pdf_path: Path, export_tikz: bool) -> list[Path]:
    """Save ``fig`` as a PDF and optionally as a TikZ file, then close it.

    The figure is closed even when saving or the TikZ export raises, so a
    failed export does not leave the figure open in pyplot.

    Args:
        fig: Figure to write.
        pdf_path: Destination of the PDF; parent directories are created.
        export_tikz: When True, also write ``<pdf stem>.tikz.tex`` next to the
            PDF through tikzplotlib.

    Returns:
        Paths written, PDF first.

    Raises:
        RuntimeError: If ``export_tikz`` is True but tikzplotlib failed to
            import at module load.
    """
    try:
        pdf_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(pdf_path, bbox_inches="tight")
        written = [pdf_path]
        if export_tikz:
            if TIKZPLOTLIB is None:
                raise RuntimeError(
                    "tikzplotlib import failed. Install/upgrade tikzplotlib and matplotlib-compatible dependencies. "
                    f"Original error: {TIKZPLOTLIB_IMPORT_ERROR}"
                )
            # Best-effort compatibility shim: copy attributes from their
            # current names onto the older names when the latter are absent.
            # Failures are deliberately ignored; the export is still attempted.
            # NOTE(review): presumably tikzplotlib reads these legacy names - confirm.
            try:
                from matplotlib.legend import Legend
                from matplotlib.lines import Line2D

                for legend in fig.findobj(Legend):
                    if not hasattr(legend, "_ncol") and hasattr(legend, "_ncols"):
                        setattr(legend, "_ncol", legend._ncols)
                    if not hasattr(legend, "legendHandles") and hasattr(
                        legend, "legend_handles"
                    ):
                        setattr(legend, "legendHandles", legend.legend_handles)
                for line in fig.findobj(Line2D):
                    if hasattr(line, "_us_dashSeq"):
                        continue
                    if not hasattr(line, "_dash_pattern"):
                        continue
                    dash_pattern = getattr(line, "_dash_pattern")
                    # Only a 2-tuple is unpacked into (offset, sequence).
                    if not isinstance(dash_pattern, tuple) or len(dash_pattern) != 2:
                        continue
                    setattr(line, "_us_dashOffset", dash_pattern[0])
                    setattr(line, "_us_dashSeq", dash_pattern[1])
            except Exception:
                pass
            tikz_path = pdf_path.with_suffix(".tikz.tex")
            TIKZPLOTLIB.save(str(tikz_path), figure=fig)
            written.append(tikz_path)
        return written
    finally:
        # Always release the figure, including on the error paths above.
        plt.close(fig)
def _plot_alpha_curves(
    alpha_mode: pd.DataFrame, out_dir: Path, export_tikz: bool
) -> list[Path]:
    """Draw a 2x2 grid of per-mode metric curves against contamination alpha.

    Each panel plots the ``<stem>_mean`` column for the "baseline" and
    "defended" rows of ``alpha_mode`` and shades a band of plus/minus the
    matching ``<stem>_std`` column (missing std values count as 0). Modes
    without rows are skipped.

    Returns:
        Paths written by ``_save_figure`` for ``wandb_alpha_curves.pdf``.
    """
    # (value in the "mode" column, legend label, colour)
    series_specs = (
        ("baseline", "Baseline", "#4C72B0"),
        ("defended", "Defended", "#C44E52"),
    )
    # (summary column stem, panel title, y-axis label)
    panel_specs = (
        ("eval_revenue_mean", "Mean Episode Revenue", "Revenue"),
        ("eval_reward_mean", "Mean Episode Reward", "Reward"),
        ("eval_coi_leakage_mean", "Mean COI Leakage", "COI Leakage"),
        ("eval_volatility_mean", "Mean Price Volatility", "Volatility"),
    )
    # Panels whose y axis uses the thousands formatter.
    thousands_stems = {"eval_revenue_mean", "eval_reward_mean"}

    fig, axes = plt.subplots(2, 2, figsize=(9.3, 6.4), constrained_layout=True)
    for axis, (stem, title, ylabel) in zip(axes.flat, panel_specs):
        for mode, legend_label, colour in series_specs:
            rows = alpha_mode[alpha_mode["mode"] == mode].sort_values("alpha")
            if rows.empty:
                continue
            alphas = rows["alpha"].to_numpy(dtype=float)
            means = rows[f"{stem}_mean"].to_numpy(dtype=float)
            axis.plot(
                alphas,
                means,
                marker="o",
                linewidth=1.8,
                markersize=4,
                color=colour,
                label=legend_label,
            )
            spread = rows[f"{stem}_std"].fillna(0.0).to_numpy(dtype=float)
            axis.fill_between(
                alphas,
                means - spread,
                means + spread,
                color=colour,
                alpha=0.14,
                linewidth=0,
            )
        axis.set_title(title)
        axis.set_xlabel(r"Contamination $\alpha$")
        axis.set_ylabel(ylabel)
        axis.set_xticks(sorted(alpha_mode["alpha"].unique()))
        if stem in thousands_stems:
            axis.yaxis.set_major_formatter(FuncFormatter(_fmt_thousands))

    # A single shared legend, built from the first panel's handles.
    legend_handles, legend_labels = axes.flat[0].get_legend_handles_labels()
    fig.legend(
        legend_handles,
        legend_labels,
        ncol=2,
        loc="upper center",
        bbox_to_anchor=(0.5, 1.02),
    )
    return _save_figure(fig, out_dir / "wandb_alpha_curves.pdf", export_tikz)
def _plot_delta_curves(
    deltas: pd.DataFrame, out_dir: Path, export_tikz: bool
) -> list[Path]:
    """Plot percentage deltas against alpha in two stacked panels.

    The top panel shows the revenue and reward ``*_delta_pct`` columns, the
    bottom panel the COI-leakage and volatility ones. Both get a dashed zero
    line and a legend in the lower-left corner.

    Returns:
        Paths written by ``_save_figure`` for ``wandb_delta_curves.pdf``.
    """
    ordered = deltas.sort_values("alpha")
    alphas = ordered["alpha"].to_numpy(dtype=float)

    # Per panel: (title or None, ((column, legend label, colour), ...)).
    panel_specs = (
        (
            "Defended Minus Baseline Delta by Contamination",
            (
                ("eval_revenue_mean_delta_pct", "Revenue", "#4C72B0"),
                ("eval_reward_mean_delta_pct", "Reward", "#8172B3"),
            ),
        ),
        (
            None,
            (
                ("eval_coi_leakage_mean_delta_pct", "COI Leakage", "#55A868"),
                ("eval_volatility_mean_delta_pct", "Volatility", "#DD8452"),
            ),
        ),
    )

    fig, axes = plt.subplots(2, 1, figsize=(8.6, 6.0), constrained_layout=True)
    for axis, (title, curves) in zip(axes, panel_specs):
        for column, legend_label, colour in curves:
            axis.plot(
                alphas,
                ordered[column].to_numpy(dtype=float),
                marker="o",
                linewidth=1.8,
                markersize=4,
                color=colour,
                label=legend_label,
            )
        axis.axhline(0.0, color="#444444", linewidth=1.0, linestyle="--")
        if title is not None:
            axis.set_title(title)
        axis.set_ylabel("Delta (%)")
        axis.set_xlabel(r"Contamination $\alpha$")
        axis.set_xticks(alphas)
        axis.legend(loc="lower left")
    return _save_figure(fig, out_dir / "wandb_delta_curves.pdf", export_tikz)
def _plot_tradeoff_scatter(
    deltas: pd.DataFrame, out_dir: Path, export_tikz: bool
) -> list[Path]:
    """Scatter revenue delta (%) against COI-leakage delta (%), coloured by alpha.

    Every point is annotated with its alpha value; dashed lines mark zero on
    both axes and a colour bar shows the alpha scale.

    Returns:
        Paths written by ``_save_figure`` for ``wandb_tradeoff_scatter.pdf``.
    """
    ordered = deltas.sort_values("alpha")
    leakage_pct = ordered["eval_coi_leakage_mean_delta_pct"].to_numpy(dtype=float)
    revenue_pct = ordered["eval_revenue_mean_delta_pct"].to_numpy(dtype=float)
    alpha_values = ordered["alpha"].to_numpy(dtype=float)

    fig, ax = plt.subplots(figsize=(6.4, 5.2), constrained_layout=True)
    points = ax.scatter(
        leakage_pct,
        revenue_pct,
        c=alpha_values,
        cmap="viridis",
        s=72,
        edgecolor="#222222",
        linewidth=0.5,
    )
    for position, alpha_value in zip(zip(leakage_pct, revenue_pct), alpha_values):
        ax.annotate(
            rf"$\alpha={alpha_value:.2f}$",
            position,
            textcoords="offset points",
            xytext=(5, 4),
            fontsize=8,
        )

    # Zero reference lines on both axes.
    zero_line = {"color": "#555555", "linewidth": 1.0, "linestyle": "--"}
    ax.axhline(0.0, **zero_line)
    ax.axvline(0.0, **zero_line)
    ax.set_xlabel("COI Leakage Delta (%)")
    ax.set_ylabel("Revenue Delta (%)")
    ax.set_title("Defended Tradeoff Frontier")
    fig.colorbar(points, ax=ax).set_label(r"Contamination $\alpha$")
    return _save_figure(fig, out_dir / "wandb_tradeoff_scatter.pdf", export_tikz)
def _plot_reward_robustness(
    alpha_mode: pd.DataFrame, out_dir: Path, export_tikz: bool
) -> list[Path]:
    """Plot the across-run reward standard deviation against alpha for each mode.

    Reads ``eval_reward_mean_std`` for the "baseline" and "defended" rows;
    missing values are drawn as 0.

    Returns:
        Paths written by ``_save_figure`` for ``wandb_reward_robustness.pdf``.
    """
    # (value in the "mode" column, legend label, colour)
    series_specs = (
        ("baseline", "Baseline", "#4C72B0"),
        ("defended", "Defended", "#C44E52"),
    )
    fig, ax = plt.subplots(figsize=(7.6, 4.5), constrained_layout=True)
    for mode, legend_label, colour in series_specs:
        rows = alpha_mode[alpha_mode["mode"] == mode].sort_values("alpha")
        ax.plot(
            rows["alpha"].to_numpy(dtype=float),
            rows["eval_reward_mean_std"].fillna(0.0).to_numpy(dtype=float),
            marker="o",
            linewidth=1.8,
            markersize=4,
            color=colour,
            label=legend_label,
        )
    ax.set_title("Reward Robustness Across Contamination")
    ax.set_xlabel(r"Contamination $\alpha$")
    ax.set_ylabel("Reward Std Across Runs")
    ax.set_xticks(sorted(alpha_mode["alpha"].unique()))
    ax.yaxis.set_major_formatter(FuncFormatter(_fmt_thousands))
    ax.legend(loc="upper left")
    return _save_figure(fig, out_dir / "wandb_reward_robustness.pdf", export_tikz)
def _plot_parameter_sensitivity(
    summary: pd.DataFrame,
    parameter: str,
    out_name: str,
    out_dir: Path,
    export_tikz: bool,
) -> list[Path]:
    """Plot revenue and COI leakage against alpha, one curve per ``parameter`` value.

    Args:
        summary: Table with ``alpha``, ``parameter`` and the
            ``eval_revenue_mean_{mean,std}`` / ``eval_coi_leakage_mean_{mean,std}``
            columns.
        parameter: Column whose distinct non-missing values define the curves.
        out_name: File stem of the PDF written into ``out_dir``.
        out_dir: Directory receiving the figure.
        export_tikz: Forwarded to ``_save_figure``.

    Returns:
        Paths written by ``_save_figure``.
    """
    levels = sorted(summary[parameter].dropna().unique())
    # One viridis colour per level, sampled evenly between 0.1 and 0.9.
    palette = list(map(plt.get_cmap("viridis"), np.linspace(0.1, 0.9, len(levels))))
    # (summary column stem, y-axis label), left panel first.
    panel_specs = (
        ("eval_revenue_mean", "Revenue"),
        ("eval_coi_leakage_mean", "COI Leakage"),
    )

    fig, axes = plt.subplots(1, 2, figsize=(10.0, 4.2), constrained_layout=True)
    revenue_ax, leakage_ax = axes
    for axis, (stem, ylabel) in zip(axes, panel_specs):
        for level, colour in zip(levels, palette):
            rows = summary[summary[parameter] == level].sort_values("alpha")
            if rows.empty:
                continue
            alphas = rows["alpha"].to_numpy(dtype=float)
            means = rows[f"{stem}_mean"].to_numpy(dtype=float)
            spread = rows[f"{stem}_std"].fillna(0.0).to_numpy(dtype=float)
            axis.plot(
                alphas,
                means,
                marker="o",
                linewidth=1.6,
                markersize=3.6,
                color=colour,
                label=f"{parameter}={level:.2f}",
            )
            axis.fill_between(
                alphas,
                means - spread,
                means + spread,
                color=colour,
                alpha=0.10,
                linewidth=0,
            )
        axis.set_xlabel(r"Contamination $\alpha$")
        axis.set_ylabel(ylabel)
        axis.set_xticks(sorted(summary["alpha"].unique()))
        if stem == "eval_revenue_mean":
            axis.yaxis.set_major_formatter(FuncFormatter(_fmt_thousands))

    revenue_ax.set_title(f"{parameter} Sensitivity (Defended)")
    leakage_ax.set_title("Leakage Side-Effect")

    # Shared legend from the left panel; roughly two rows of entries.
    legend_handles, legend_labels = revenue_ax.get_legend_handles_labels()
    fig.legend(
        legend_handles,
        legend_labels,
        ncol=max(1, len(levels) // 2),
        loc="upper center",
        bbox_to_anchor=(0.5, 1.06),
    )
    return _save_figure(fig, out_dir / f"{out_name}.pdf", export_tikz)
def _plot_delta_summary(
    deltas: pd.DataFrame, out_dir: Path, export_tikz: bool
) -> list[Path]:
    """Draw three bar charts of percentage deltas (revenue, reward, COI leakage).

    Bars are placed at integer positions in ascending-alpha order and
    labelled with the alpha value.

    Args:
        deltas: Table with ``alpha`` and the three ``*_delta_pct`` columns.
        out_dir: Directory receiving ``wandb_delta_summary.pdf``.
        export_tikz: Forwarded to ``_save_figure``.

    Returns:
        Paths written by ``_save_figure``.
    """
    data = deltas.sort_values("alpha")
    x = np.arange(len(data))
    # ``:g`` keeps every significant digit of the alpha grid. The previous
    # ``:.1f`` rounded two-decimal alphas (e.g. 0.15) to one decimal, which
    # mislabelled bars and could produce duplicate tick labels.
    labels = [f"{alpha:g}" for alpha in data["alpha"].to_numpy(dtype=float)]
    fig, axes = plt.subplots(1, 3, figsize=(11.0, 3.8), constrained_layout=True)
    # (column, panel title, bar colour)
    panels = [
        ("eval_revenue_mean_delta_pct", "Revenue Delta (%)", "#4C72B0"),
        ("eval_reward_mean_delta_pct", "Reward Delta (%)", "#8172B3"),
        ("eval_coi_leakage_mean_delta_pct", "COI Leakage Delta (%)", "#55A868"),
    ]
    for ax, (column, title, color) in zip(axes, panels):
        values = data[column].to_numpy(dtype=float)
        ax.bar(x, values, color=color, alpha=0.85)
        ax.axhline(0.0, color="#444444", linewidth=1.0, linestyle="--")
        ax.set_xticks(x)
        ax.set_xticklabels(labels)
        ax.set_xlabel(r"$\alpha$")
        ax.set_title(title)
    return _save_figure(fig, out_dir / "wandb_delta_summary.pdf", export_tikz)
def build_artifacts(
    input_path: Path,
    output_dir: Path,
    plot_dir: Path,
    include_non_finished: bool,
    export_tikz: bool,
) -> list[Path]:
    """Read the export CSV and write all summary tables and figures.

    Args:
        input_path: CSV file to read.
        output_dir: Directory receiving the four summary CSVs.
        plot_dir: Directory receiving the figures.
        include_non_finished: Forwarded to ``_prepare_frame``.
        export_tikz: Forwarded to every plotting helper.

    Returns:
        Every written path: the tables first, then the figures in the order
        they are generated.
    """
    candidate_metrics = (
        "eval/revenue_mean",
        "eval/reward_mean",
        "eval/coi_level_mean",
        "eval/coi_leakage_mean",
        "eval/volatility_mean",
        "eval/margin_mean",
        "train/agent_prob",
        "train/alpha_adv",
    )
    frame = _prepare_frame(
        pd.read_csv(input_path), include_non_finished=include_non_finished
    )
    # Only metrics that exist in this export are summarised.
    metrics = [name for name in candidate_metrics if name in frame.columns]

    alpha_mode = _summary_by_alpha_mode(frame, metrics)
    deltas = _delta_by_alpha(alpha_mode, metrics)
    lambda_summary = _summary_by_parameter(frame, "lambda_coi", metrics)
    radius_summary = _summary_by_parameter(frame, "ambiguity_radius", metrics)

    for directory in (output_dir, plot_dir):
        directory.mkdir(parents=True, exist_ok=True)

    tables = (
        ("wandb_alpha_mode_summary.csv", alpha_mode),
        ("wandb_alpha_deltas.csv", deltas),
        ("wandb_lambda_alpha_summary.csv", lambda_summary),
        ("wandb_radius_alpha_summary.csv", radius_summary),
    )
    written: list[Path] = [
        _save_table(table, output_dir / filename) for filename, table in tables
    ]

    written += _plot_alpha_curves(alpha_mode, plot_dir, export_tikz)
    written += _plot_delta_curves(deltas, plot_dir, export_tikz)
    written += _plot_tradeoff_scatter(deltas, plot_dir, export_tikz)
    written += _plot_reward_robustness(alpha_mode, plot_dir, export_tikz)
    sensitivity_jobs = (
        (lambda_summary, "lambda_coi", "wandb_lambda_sensitivity"),
        (radius_summary, "ambiguity_radius", "wandb_radius_sensitivity"),
    )
    for summary, parameter, out_name in sensitivity_jobs:
        written += _plot_parameter_sensitivity(
            summary=summary,
            parameter=parameter,
            out_name=out_name,
            out_dir=plot_dir,
            export_tikz=export_tikz,
        )
    written += _plot_delta_summary(deltas, plot_dir, export_tikz)
    return written
def main() -> None:
    """CLI entry point: parse arguments, build all artifacts, print their paths.

    ``--plot-dir`` falls back to ``_default_plot_dir(output_dir)`` when omitted.
    """
    parser = argparse.ArgumentParser(
        description="Generate W&B sweep visualizations for PHANTOM results"
    )
    parser.add_argument(
        "--input", type=Path, required=True, help="Path to W&B export CSV"
    )
    parser.add_argument("--output-dir", type=Path, default=_default_output_dir())
    parser.add_argument("--plot-dir", type=Path, default=None)
    parser.add_argument("--include-non-finished", action="store_true")
    parser.add_argument(
        "--export-tikz",
        action="store_true",
        help="Export matplotlib figures to TikZ via tikzplotlib",
    )
    args = parser.parse_args()

    _configure_style()

    plot_dir = args.plot_dir
    if plot_dir is None:
        plot_dir = _default_plot_dir(args.output_dir)

    written_paths = build_artifacts(
        input_path=args.input,
        output_dir=args.output_dir,
        plot_dir=plot_dir,
        include_non_finished=bool(args.include_non_finished),
        export_tikz=bool(args.export_tikz),
    )
    for written_path in written_paths:
        print(written_path)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,51 @@
from __future__ import annotations
import argparse
from pathlib import Path
from process_first_sweep import run as run_first_sweep
from process_ppo_benchmark import run as run_ppo_benchmark
def _default_output_dir() -> Path:
return Path(__file__).resolve().parent / "generated" / "legacy"
def main() -> None:
    """CLI entry point: process both result CSV exports and print the written paths.

    Both inputs are read from ``tpu_orchestration/results`` under the
    directory five levels above this script; the PPO benchmark is processed
    first, then the first sweep.
    """
    parser = argparse.ArgumentParser(
        description="Process all result CSV exports for paper figures"
    )
    parser.add_argument("--output-dir", type=Path, default=_default_output_dir())
    parser.add_argument("--include-non-finished", action="store_true")
    parser.add_argument("--top-n", type=int, default=25)
    args = parser.parse_args()

    results_dir = Path(__file__).resolve().parents[5] / "tpu_orchestration" / "results"
    include_non_finished = bool(args.include_non_finished)

    written: list[Path] = []
    written += run_ppo_benchmark(
        input_path=results_dir / "ppo_benchmark.csv",
        output_dir=args.output_dir,
        include_non_finished=include_non_finished,
    )
    written += run_first_sweep(
        input_path=results_dir / "first_sweep.csv",
        output_dir=args.output_dir,
        include_non_finished=include_non_finished,
        top_n=int(args.top_n),
    )
    for written_path in written:
        print(written_path)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,409 @@
from __future__ import annotations
import argparse
import json
from pathlib import Path
from typing import Any
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def _project_root() -> Path:
    """Return the directory five levels above the folder containing this script."""
    script_path = Path(__file__).resolve()
    return script_path.parents[5]
def _default_bundle_dir() -> Path:
    """Return the most recently modified ``bundle_*`` directory.

    Searches ``engine/studies/results/wandb_sweep_bundles`` under the project
    root.

    Raises:
        FileNotFoundError: No ``bundle_*`` directory exists there.
    """
    base = _project_root() / "engine" / "studies" / "results" / "wandb_sweep_bundles"
    candidates = [entry for entry in base.glob("bundle_*") if entry.is_dir()]
    if not candidates:
        raise FileNotFoundError(f"No sweep bundle directories found in {base}")
    # Newest modification time wins.
    return max(candidates, key=lambda entry: entry.stat().st_mtime)
def _default_output_dir() -> Path:
return Path(__file__).resolve().parent / "generated" / "final"
def _default_plot_dir(output_dir: Path) -> Path:
return output_dir / "plots"
def _truthy(value: Any) -> bool:
if isinstance(value, bool):
return value
if value is None:
return False
return str(value).strip().lower() in {"1", "true", "yes", "on"}
def _mode_of(row: pd.Series) -> str:
mode_hint = str(row.get("study_mode", "")).strip().lower()
if mode_hint in {"baseline", "no_robust"}:
return "baseline"
if mode_hint in {"defended", "robust"}:
return "defended"
if _truthy(row.get("baseline_mode")) or _truthy(row.get("no_robust")):
return "baseline"
return "defended"
def _coerce_numeric(frame: pd.DataFrame, columns: list[str]) -> None:
for column in columns:
if column in frame.columns:
frame[column] = pd.to_numeric(frame[column], errors="coerce")
def _configure_style() -> None:
    """Apply the figure style (serif font, small text, light grid) to matplotlib's rcParams."""
    style_overrides = {
        # Typography
        "font.family": "serif",
        "font.size": 10,
        "axes.titlesize": 10,
        "axes.labelsize": 9,
        "legend.fontsize": 8,
        "xtick.labelsize": 8,
        "ytick.labelsize": 8,
        # Resolution
        "figure.dpi": 220,
        "savefig.dpi": 320,
        # Axes decoration
        "axes.spines.top": False,
        "axes.spines.right": False,
        "axes.grid": True,
        "grid.alpha": 0.22,
    }
    plt.rcParams.update(style_overrides)
def _load_runs(bundle_dir: Path) -> pd.DataFrame:
    """Load ``runs_finished.csv`` from ``bundle_dir``.

    Adds a ``mode`` column derived row by row with ``_mode_of`` and coerces
    the known metric columns to numeric.

    Raises:
        FileNotFoundError: The CSV does not exist.
    """
    csv_path = bundle_dir / "runs_finished.csv"
    if not csv_path.exists():
        raise FileNotFoundError(f"Missing required file: {csv_path}")

    runs = pd.read_csv(csv_path)
    runs["mode"] = runs.apply(_mode_of, axis=1)

    numeric_columns = [
        "alpha",
        "n_products",
        "eval_revenue_mean",
        "eval_reward_mean",
        "eval_supra_share_mean",
        "eval_volatility_mean",
        "eval_coi_level_mean",
        "eval_coi_leakage_mean",
        "objective_score",
    ]
    _coerce_numeric(runs, numeric_columns)
    return runs
def _focus_sweep(runs: pd.DataFrame) -> str:
coverage = (
runs.groupby("sweep_id", as_index=False)
.agg(
n_alpha=("alpha", lambda s: int(pd.Series(s).dropna().nunique())),
max_alpha=("alpha", "max"),
run_count=("run_id", "size"),
)
.sort_values(
["n_alpha", "max_alpha", "run_count"], ascending=[False, False, False]
)
)
if coverage.empty:
raise ValueError("No sweep rows available in runs_finished.csv")
return str(coverage.iloc[0]["sweep_id"])
def _alpha_mode_summary(runs: pd.DataFrame) -> pd.DataFrame:
return (
runs.groupby(["alpha", "mode"], as_index=False)
.agg(
runs=("run_id", "size"),
revenue_mean=("eval_revenue_mean", "mean"),
reward_mean=("eval_reward_mean", "mean"),
supra_mean=("eval_supra_share_mean", "mean"),
volatility_mean=("eval_volatility_mean", "mean"),
coi_leakage_mean=("eval_coi_leakage_mean", "mean"),
coi_level_mean=("eval_coi_level_mean", "mean"),
)
.sort_values(["alpha", "mode"])
.reset_index(drop=True)
)
def _alpha_deltas(alpha_mode: pd.DataFrame) -> pd.DataFrame:
rows: list[dict[str, float]] = []
for alpha, group in alpha_mode.groupby("alpha", sort=True):
defended = group[group["mode"] == "defended"]
baseline = group[group["mode"] == "baseline"]
if defended.empty or baseline.empty:
continue
d_rev = float(defended["revenue_mean"].iloc[0])
b_rev = float(baseline["revenue_mean"].iloc[0])
d_reward = float(defended["reward_mean"].iloc[0])
b_reward = float(baseline["reward_mean"].iloc[0])
d_vol = float(defended["volatility_mean"].iloc[0])
b_vol = float(baseline["volatility_mean"].iloc[0])
d_supra = float(defended["supra_mean"].iloc[0])
b_supra = float(baseline["supra_mean"].iloc[0])
d_coi_leak = float(defended["coi_leakage_mean"].iloc[0])
b_coi_leak = float(baseline["coi_leakage_mean"].iloc[0])
rows.append(
{
"alpha": float(alpha),
"revenue_delta": d_rev - b_rev,
"revenue_delta_pct": 0.0
if b_rev == 0.0
else 100.0 * (d_rev - b_rev) / b_rev,
"reward_delta": d_reward - b_reward,
"reward_delta_pct": 0.0
if b_reward == 0.0
else 100.0 * (d_reward - b_reward) / b_reward,
"volatility_delta": d_vol - b_vol,
"supra_delta": d_supra - b_supra,
"coi_leakage_delta": d_coi_leak - b_coi_leak,
}
)
return pd.DataFrame(rows).sort_values("alpha").reset_index(drop=True)
def _zone_summary(alpha_deltas: pd.DataFrame) -> pd.DataFrame:
if alpha_deltas.empty:
return pd.DataFrame()
data = alpha_deltas.copy()
data["zone"] = np.where(
data["alpha"] >= 0.7, "high_alpha_0_7_plus", "low_alpha_below_0_7"
)
return (
data.groupby("zone", as_index=False)
.agg(
alpha_cells=("alpha", "size"),
revenue_delta_pct_mean=("revenue_delta_pct", "mean"),
reward_delta_pct_mean=("reward_delta_pct", "mean"),
coi_leakage_delta_mean=("coi_leakage_delta", "mean"),
volatility_delta_mean=("volatility_delta", "mean"),
)
.sort_values("zone")
)
def _save_plot(fig: plt.Figure, path: Path) -> Path:
    """Save ``fig`` to ``path`` with a tight bounding box, close it, and return ``path``.

    Missing parent directories are created first.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    fig.savefig(path, bbox_inches="tight")
    plt.close(fig)
    return path
def _plot_focus_revenue_by_alpha(alpha_mode: pd.DataFrame, out_path: Path) -> Path:
    """Plot mean revenue against alpha for the baseline and defended modes.

    Modes with no rows are skipped; a dashed vertical line marks alpha = 0.7.

    Returns:
        ``out_path`` after the figure has been saved and closed.
    """
    # (value in the "mode" column, colour, legend label)
    series_specs = (
        ("baseline", "#4C72B0", "Baseline"),
        ("defended", "#C44E52", "Defended"),
    )
    fig, ax = plt.subplots(figsize=(7.8, 4.8), constrained_layout=True)
    for mode, colour, legend_label in series_specs:
        rows = alpha_mode[alpha_mode["mode"] == mode].sort_values("alpha")
        if not rows.empty:
            ax.plot(
                rows["alpha"],
                rows["revenue_mean"],
                marker="o",
                linewidth=1.9,
                markersize=4,
                color=colour,
                label=legend_label,
            )
    ax.axvline(0.7, color="#666666", linewidth=1.0, linestyle="--")
    ax.set_xlabel(r"Contamination $\alpha$")
    ax.set_ylabel("Mean episode revenue")
    ax.set_title("Final Cohort Revenue Curves")
    ax.legend(loc="lower left")
    return _save_plot(fig, out_path)
def _plot_focus_revenue_delta(alpha_deltas: pd.DataFrame, out_path: Path) -> Path:
    """Plot the revenue delta (%) against alpha and mark the high-alpha extreme.

    Among rows with alpha >= 0.7, the one with the largest absolute revenue
    delta is highlighted with a marker and an annotation.

    Returns:
        ``out_path`` after the figure has been saved and closed.
    """
    alphas = alpha_deltas["alpha"].to_numpy(dtype=float)
    revenue_pct = alpha_deltas["revenue_delta_pct"].to_numpy(dtype=float)

    fig, ax = plt.subplots(figsize=(7.8, 4.8), constrained_layout=True)
    ax.plot(
        alphas, revenue_pct, marker="o", linewidth=2.0, markersize=4, color="#C44E52"
    )
    ax.fill_between(alphas, revenue_pct, 0.0, color="#C44E52", alpha=0.12)
    ax.axhline(0.0, color="#444444", linewidth=1.0, linestyle="--")
    ax.axvline(0.7, color="#666666", linewidth=1.0, linestyle="--")

    high_alpha = alpha_deltas[alpha_deltas["alpha"] >= 0.7]
    if not high_alpha.empty:
        # Order the high-alpha rows by absolute delta, largest first.
        by_magnitude = (
            high_alpha["revenue_delta_pct"].abs().sort_values(ascending=False)
        )
        peak = high_alpha.reindex(by_magnitude.index).iloc[0]
        peak_alpha = peak["alpha"]
        peak_pct = peak["revenue_delta_pct"]
        ax.scatter([peak_alpha], [peak_pct], color="#1f77b4", s=45, zorder=3)
        ax.annotate(
            f"high-alpha peak {peak_pct:.2f}%",
            (float(peak_alpha), float(peak_pct)),
            textcoords="offset points",
            xytext=(6, 6),
            fontsize=8,
        )

    ax.set_xlabel(r"Contamination $\alpha$")
    ax.set_ylabel("Defended minus baseline revenue (%)")
    ax.set_title("Revenue Delta by Contamination (Final Cohort)")
    return _save_plot(fig, out_path)
def _plot_focus_risk_deltas(alpha_deltas: pd.DataFrame, out_path: Path) -> Path:
    """Plot the COI-leakage and volatility deltas against alpha on one axis.

    A dashed horizontal line marks zero and a dashed vertical line marks
    alpha = 0.7.

    Returns:
        ``out_path`` after the figure has been saved and closed.
    """
    alphas = alpha_deltas["alpha"].to_numpy(dtype=float)
    # (column, marker, marker size, colour, legend label)
    curve_specs = (
        ("coi_leakage_delta", "o", 4, "#55A868", "COI leakage delta"),
        ("volatility_delta", "s", 3.8, "#8172B3", "Volatility delta"),
    )

    fig, ax = plt.subplots(figsize=(7.8, 4.8), constrained_layout=True)
    for column, marker, marker_size, colour, legend_label in curve_specs:
        ax.plot(
            alphas,
            alpha_deltas[column].to_numpy(dtype=float),
            marker=marker,
            linewidth=1.8,
            markersize=marker_size,
            color=colour,
            label=legend_label,
        )
    ax.axhline(0.0, color="#444444", linewidth=1.0, linestyle="--")
    ax.axvline(0.7, color="#666666", linewidth=1.0, linestyle="--")
    ax.set_xlabel(r"Contamination $\alpha$")
    ax.set_ylabel("Defended minus baseline")
    ax.set_title("Leakage and Stability Deltas (Final Cohort)")
    ax.legend(loc="lower left")
    return _save_plot(fig, out_path)
def _write_include(path: Path, figure_rel_path: str, width: str) -> Path:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(f"\\includegraphics[width={width}]{{{figure_rel_path}}}\n")
return path
def run(bundle_dir: Path, output_dir: Path, plot_dir: Path) -> list[Path]:
    """Produce every final-cohort table, figure and LaTeX include for one bundle.

    The sweep chosen by ``_focus_sweep`` is summarised per (alpha, mode),
    turned into deltas and zone averages, and written as three CSVs, one
    headline JSON, three PDFs and three ``.tex`` include snippets.

    Returns:
        All written paths in the order CSVs, JSON, plots, includes.
    """
    all_runs = _load_runs(bundle_dir)
    focus_id = _focus_sweep(all_runs)
    focus_runs = all_runs[all_runs["sweep_id"] == focus_id].copy()
    alpha_mode = _alpha_mode_summary(focus_runs)
    deltas = _alpha_deltas(alpha_mode)
    zones = _zone_summary(deltas)

    for directory in (output_dir, plot_dir):
        directory.mkdir(parents=True, exist_ok=True)

    written: list[Path] = []

    tables = (
        ("final_focus_alpha_mode_summary.csv", alpha_mode),
        ("final_focus_alpha_deltas.csv", deltas),
        ("final_focus_zone_summary.csv", zones),
    )
    for filename, table in tables:
        table_path = output_dir / filename
        table.to_csv(table_path, index=False)
        written.append(table_path)

    # Headline figures are only defined when at least one alpha has both modes.
    has_deltas = not deltas.empty
    headline = {
        "bundle": str(bundle_dir),
        "focus_cohort": "max_alpha_coverage",
        "alpha_cells": int(deltas["alpha"].nunique()) if has_deltas else 0,
        "alpha_min": float(deltas["alpha"].min()) if has_deltas else None,
        "alpha_max": float(deltas["alpha"].max()) if has_deltas else None,
        "mean_revenue_delta_pct": (
            float(deltas["revenue_delta_pct"].mean()) if has_deltas else None
        ),
        "mean_reward_delta_pct": (
            float(deltas["reward_delta_pct"].mean()) if has_deltas else None
        ),
        "zone_summary": zones.to_dict(orient="records"),
    }
    headline_path = output_dir / "final_focus_headline_summary.json"
    headline_path.write_text(json.dumps(headline, indent=2) + "\n")
    written.append(headline_path)

    # (plot function, input table, file stem, include width)
    figure_jobs = (
        (
            _plot_focus_revenue_by_alpha,
            alpha_mode,
            "final_focus_revenue_by_alpha",
            "0.98\\linewidth",
        ),
        (
            _plot_focus_revenue_delta,
            deltas,
            "final_focus_revenue_delta",
            "0.95\\linewidth",
        ),
        (
            _plot_focus_risk_deltas,
            deltas,
            "final_focus_risk_deltas",
            "0.95\\linewidth",
        ),
    )
    for plotter, table, stem, _width in figure_jobs:
        written.append(plotter(table, plot_dir / f"{stem}.pdf"))

    include_dir = Path(__file__).resolve().parent / "includes" / "final"
    figure_prefix = "chapters/figures/results/generated/final/plots/"
    for _plotter, _table, stem, width in figure_jobs:
        written.append(
            _write_include(
                include_dir / f"{stem}.tex",
                f"{figure_prefix}{stem}.pdf",
                width,
            )
        )
    return written
def main() -> None:
    """CLI entry point: generate the final-cohort artifacts and print their paths.

    ``--bundle-dir`` defaults to the newest bundle found by
    ``_default_bundle_dir``. That lookup is deferred until after argument
    parsing: evaluating it as the argparse ``default`` ran it unconditionally,
    so the command raised ``FileNotFoundError`` (and ``--help`` failed) when no
    bundle existed, even if ``--bundle-dir`` was given explicitly.
    ``--plot-dir`` defaults to ``_default_plot_dir(output_dir)``.
    """
    parser = argparse.ArgumentParser(
        description="Generate final paper figures/tables from the final sweep cohort"
    )
    parser.add_argument("--bundle-dir", type=Path, default=None)
    parser.add_argument("--output-dir", type=Path, default=_default_output_dir())
    parser.add_argument("--plot-dir", type=Path, default=None)
    args = parser.parse_args()
    _configure_style()
    # Resolve the lazy defaults only when the user did not supply a value.
    bundle_dir = (
        args.bundle_dir if args.bundle_dir is not None else _default_bundle_dir()
    )
    plot_dir = (
        args.plot_dir
        if args.plot_dir is not None
        else _default_plot_dir(args.output_dir)
    )
    outputs = run(bundle_dir=bundle_dir, output_dir=args.output_dir, plot_dir=plot_dir)
    for path in outputs:
        print(path)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,272 @@
from __future__ import annotations
import argparse
import json
from pathlib import Path
from typing import Iterable
import numpy as np
import pandas as pd
def _project_root() -> Path:
    """Return the directory five levels above the folder containing this script."""
    script_path = Path(__file__).resolve()
    return script_path.parents[5]
def _default_input() -> Path:
    """Return ``tpu_orchestration/results/first_sweep.csv`` under the project root."""
    return _project_root().joinpath("tpu_orchestration", "results", "first_sweep.csv")
def _default_output_dir() -> Path:
return Path(__file__).resolve().parent / "generated" / "legacy"
def _sanitize(key: str) -> str:
return key.replace("/", "_").replace("-", "_")
def _coerce_numeric(frame: pd.DataFrame, columns: Iterable[str]) -> None:
for column in columns:
if column in frame.columns:
frame[column] = pd.to_numeric(frame[column], errors="coerce")
def _extract_alpha(frame: pd.DataFrame) -> pd.Series:
if "study/alpha" in frame.columns:
return pd.to_numeric(frame["study/alpha"], errors="coerce")
if "alpha" in frame.columns:
return pd.to_numeric(frame["alpha"], errors="coerce")
return pd.Series(np.nan, index=frame.index, dtype=float)
def _extract_mode(frame: pd.DataFrame) -> pd.Series:
if "study/mode" in frame.columns:
return frame["study/mode"].astype(str).str.strip().str.lower()
if "study/no_robust" in frame.columns:
no_robust = pd.to_numeric(frame["study/no_robust"], errors="coerce").fillna(0.0)
return pd.Series(
np.where(no_robust > 0.5, "no_robust", "robust"),
index=frame.index,
dtype="object",
)
if "no_robust" in frame.columns:
no_robust = (
frame["no_robust"].astype(str).str.lower().isin({"1", "true", "yes"})
)
return pd.Series(
np.where(no_robust, "no_robust", "robust"),
index=frame.index,
dtype="object",
)
return pd.Series("", index=frame.index, dtype="object")
def _extract_tier(frame: pd.DataFrame) -> pd.Series:
for column in ("tiers", "runtime/backend", "algo", "run.backend", "run.algo"):
if column in frame.columns:
tier = frame[column].astype(str).str.strip().str.lower()
if tier.notna().any():
return tier
return pd.Series("unknown", index=frame.index, dtype="object")
def _prepare_frame(frame: pd.DataFrame, include_non_finished: bool) -> pd.DataFrame:
    """Normalise a raw first-sweep export into an analysis-ready frame.

    Args:
        frame: Raw export; it is not modified.
        include_non_finished: When false and a ``State`` column exists, only
            rows whose state is "finished" (case-insensitive) are kept.

    Returns:
        A copy with derived ``alpha``, ``mode`` and ``tier`` columns,
        restricted to rows whose mode is "robust" or "no_robust" and whose
        alpha is not missing, with the known metric and hyper-parameter
        columns coerced to numeric, sorted by tier, alpha and mode.
    """
    data = frame.copy()
    if not include_non_finished and "State" in data.columns:
        data = data[data["State"].astype(str).str.lower() == "finished"].copy()
    data["alpha"] = _extract_alpha(data)
    data["mode"] = _extract_mode(data)
    data["tier"] = _extract_tier(data)
    usable = data["mode"].isin({"robust", "no_robust"}) & data["alpha"].notna()
    # Copy the filtered rows: _coerce_numeric assigns columns in place, which
    # on an uncopied slice triggers pandas' SettingWithCopyWarning.
    data = data[usable].copy()
    _coerce_numeric(
        data,
        [
            "eval/revenue_mean",
            "eval/reward_mean",
            "eval/coi_level_mean",
            "eval/coi_leakage_mean",
            "eval/margin_mean",
            "eval/volatility_mean",
            "objective/score",
            "train/alpha_adv",
            "lambda_coi",
            "robust_radius",
            "learning_rate",
            "batch_size",
            "n_steps",
            "total_timesteps",
        ],
    )
    return data.sort_values(["tier", "alpha", "mode"]).reset_index(drop=True)
def _group_summary(
frame: pd.DataFrame, by: list[str], metrics: list[str]
) -> pd.DataFrame:
agg_spec: dict[str, tuple[str, str]] = {"runs": ("mode", "size")}
for metric in metrics:
safe = _sanitize(metric)
agg_spec[f"{safe}_mean"] = (metric, "mean")
agg_spec[f"{safe}_std"] = (metric, "std")
return frame.groupby(by, as_index=False).agg(**agg_spec).sort_values(by)
def _tier_alpha_deltas(summary: pd.DataFrame, metrics: list[str]) -> pd.DataFrame:
rows: list[dict[str, float | str]] = []
for (tier, alpha), group in summary.groupby(["tier", "alpha"], sort=True):
robust = group[group["mode"] == "robust"]
no_robust = group[group["mode"] == "no_robust"]
if robust.empty or no_robust.empty:
continue
row: dict[str, float | str] = {
"tier": str(tier),
"alpha": float(alpha),
"runs_robust": float(robust["runs"].iloc[0]),
"runs_no_robust": float(no_robust["runs"].iloc[0]),
}
for metric in metrics:
safe = _sanitize(metric)
robust_value = float(robust[f"{safe}_mean"].iloc[0])
no_robust_value = float(no_robust[f"{safe}_mean"].iloc[0])
delta = robust_value - no_robust_value
row[f"{safe}_delta"] = delta
row[f"{safe}_delta_pct"] = (
np.nan if no_robust_value == 0 else 100.0 * delta / no_robust_value
)
rows.append(row)
return pd.DataFrame(rows)
def _top_runs(frame: pd.DataFrame, n: int) -> pd.DataFrame:
rank_metric = "objective/score"
if rank_metric not in frame.columns or frame[rank_metric].notna().sum() == 0:
rank_metric = "eval/reward_mean"
keep = [
"Name",
"tier",
"alpha",
"mode",
rank_metric,
"eval/revenue_mean",
"eval/reward_mean",
"eval/coi_level_mean",
"eval/coi_leakage_mean",
"lambda_coi",
"robust_radius",
"learning_rate",
"batch_size",
"n_steps",
"total_timesteps",
]
present = [column for column in keep if column in frame.columns]
ranked = frame[present].copy().sort_values(rank_metric, ascending=False)
return ranked.head(max(1, int(n))).reset_index(drop=True)
def _headline_json(
frame: pd.DataFrame, tier_mode: pd.DataFrame
) -> dict[str, float | str]:
out: dict[str, float | str] = {
"runs": int(len(frame)),
"tiers": int(frame["tier"].nunique()),
"alphas": int(frame["alpha"].nunique()),
}
robust_rows = tier_mode[tier_mode["mode"] == "robust"]
no_robust_rows = tier_mode[tier_mode["mode"] == "no_robust"]
if robust_rows.empty or no_robust_rows.empty:
out["status"] = "incomplete_modes"
return out
robust_mean = robust_rows["eval_revenue_mean_mean"].mean()
no_robust_mean = no_robust_rows["eval_revenue_mean_mean"].mean()
out.update(
{
"status": "ok",
"mean_tier_revenue_robust": float(robust_mean),
"mean_tier_revenue_no_robust": float(no_robust_mean),
"mean_tier_revenue_delta": float(robust_mean - no_robust_mean),
"mean_tier_revenue_delta_pct": float(
100.0 * (robust_mean - no_robust_mean) / no_robust_mean
)
if no_robust_mean
else np.nan,
}
)
return out
def run(
    input_path: Path, output_dir: Path, include_non_finished: bool, top_n: int
) -> list[Path]:
    """Process the first-sweep CSV into summary tables and a headline JSON.

    Args:
        input_path: CSV export to read.
        output_dir: Directory receiving the outputs (created if needed).
        include_non_finished: Forwarded to ``_prepare_frame``.
        top_n: Number of runs kept in the top-configs table.

    Returns:
        Paths of the four CSV tables followed by the headline JSON.
    """
    candidate_metrics = (
        "eval/revenue_mean",
        "eval/reward_mean",
        "eval/coi_level_mean",
        "eval/coi_leakage_mean",
        "eval/margin_mean",
        "eval/volatility_mean",
        "objective/score",
        "train/alpha_adv",
    )
    output_dir.mkdir(parents=True, exist_ok=True)
    frame = _prepare_frame(
        pd.read_csv(input_path), include_non_finished=include_non_finished
    )
    # Only metrics that exist in this export are summarised.
    metrics = [name for name in candidate_metrics if name in frame.columns]

    tier_mode = _group_summary(frame, ["tier", "mode"], metrics)
    tier_alpha_mode = _group_summary(frame, ["tier", "alpha", "mode"], metrics)
    deltas = _tier_alpha_deltas(tier_alpha_mode, metrics)
    top_configs = _top_runs(frame, n=top_n)
    headline = _headline_json(frame, tier_mode)

    tables = (
        ("first_sweep_tier_mode_summary.csv", tier_mode),
        ("first_sweep_tier_alpha_mode_summary.csv", tier_alpha_mode),
        ("first_sweep_tier_alpha_deltas.csv", deltas),
        ("first_sweep_top_configs.csv", top_configs),
    )
    written_paths: list[Path] = []
    for filename, table in tables:
        destination = output_dir / filename
        table.to_csv(destination, index=False)
        written_paths.append(destination)

    headline_path = output_dir / "first_sweep_headline_summary.json"
    headline_path.write_text(json.dumps(headline, indent=2))
    return written_paths + [headline_path]
def main() -> None:
    """CLI entry point: process the first-sweep CSV and print every written path."""
    parser = argparse.ArgumentParser(
        description="Process first sweep CSV for paper tables"
    )
    parser.add_argument("--input", type=Path, default=_default_input())
    parser.add_argument("--output-dir", type=Path, default=_default_output_dir())
    parser.add_argument("--include-non-finished", action="store_true")
    parser.add_argument("--top-n", type=int, default=25)
    options = parser.parse_args()

    written_paths = run(
        input_path=options.input,
        output_dir=options.output_dir,
        include_non_finished=bool(options.include_non_finished),
        top_n=int(options.top_n),
    )
    for written_path in written_paths:
        print(written_path)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,277 @@
from __future__ import annotations
import argparse
import json
from pathlib import Path
from typing import Iterable
import numpy as np
import pandas as pd
def _project_root() -> Path:
    """Return the directory five levels above the folder containing this script."""
    script_path = Path(__file__).resolve()
    return script_path.parents[5]
def _default_input() -> Path:
    """Return ``tpu_orchestration/results/ppo_benchmark.csv`` under the project root."""
    return _project_root().joinpath(
        "tpu_orchestration", "results", "ppo_benchmark.csv"
    )
def _default_output_dir() -> Path:
return Path(__file__).resolve().parent / "generated" / "legacy"
def _sanitize(key: str) -> str:
return key.replace("/", "_").replace("-", "_")
def _coerce_numeric(frame: pd.DataFrame, columns: Iterable[str]) -> None:
for column in columns:
if column in frame.columns:
frame[column] = pd.to_numeric(frame[column], errors="coerce")
def _extract_alpha(frame: pd.DataFrame) -> pd.Series:
if "study/alpha" in frame.columns:
return pd.to_numeric(frame["study/alpha"], errors="coerce")
if "alpha" in frame.columns:
return pd.to_numeric(frame["alpha"], errors="coerce")
return pd.Series(np.nan, index=frame.index, dtype=float)
def _extract_mode(frame: pd.DataFrame) -> pd.Series:
if "study/mode" in frame.columns:
return frame["study/mode"].astype(str).str.strip().str.lower()
if "study/no_robust" in frame.columns:
no_robust = pd.to_numeric(frame["study/no_robust"], errors="coerce").fillna(0.0)
return pd.Series(
np.where(no_robust > 0.5, "no_robust", "robust"),
index=frame.index,
dtype="object",
)
if "no_robust" in frame.columns:
no_robust = (
frame["no_robust"].astype(str).str.lower().isin({"1", "true", "yes"})
)
return pd.Series(
np.where(no_robust, "no_robust", "robust"),
index=frame.index,
dtype="object",
)
return pd.Series("", index=frame.index, dtype="object")
def _prepare_frame(frame: pd.DataFrame, include_non_finished: bool) -> pd.DataFrame:
    """Filter and normalise the raw benchmark export for analysis.

    Steps, in order:

    * work on a copy, so the caller's frame is never modified;
    * unless ``include_non_finished`` is true, keep only rows whose ``State``
      column equals ``"finished"`` (case-insensitive), when that column exists;
    * add canonical ``alpha`` and ``mode`` columns via ``_extract_alpha`` /
      ``_extract_mode``;
    * keep only rows whose mode is ``"robust"`` or ``"no_robust"`` and whose
      alpha is not NaN;
    * coerce the known metric columns that are present to numeric dtype.

    Returns:
        The prepared rows sorted by ``alpha`` then ``mode`` with a fresh
        0..n-1 index.
    """
    data = frame.copy()
    if not include_non_finished and "State" in data.columns:
        data = data[data["State"].astype(str).str.lower() == "finished"].copy()
    data["alpha"] = _extract_alpha(data)
    data["mode"] = _extract_mode(data)

    # Apply both row filters at once and take an explicit copy: the numeric
    # coercion below assigns columns in place, and doing that on the result
    # of a boolean selection is the chained-assignment pattern pandas warns
    # about (SettingWithCopyWarning).
    valid_rows = data["mode"].isin({"robust", "no_robust"}) & data["alpha"].notna()
    data = data[valid_rows].copy()

    numeric_cols = [
        "eval/revenue_mean",
        "eval/reward_mean",
        "eval/coi_level_mean",
        "eval/coi_leakage_mean",
        "eval/volatility_mean",
        "eval/margin_mean",
        "train/alpha_adv",
        "train/coi_penalty",
        "train/ux_penalty",
        "train/agent_prob",
    ]
    _coerce_numeric(data, numeric_cols)
    return data.sort_values(["alpha", "mode"]).reset_index(drop=True)
def _summary_by_alpha_mode(frame: pd.DataFrame, metrics: list[str]) -> pd.DataFrame:
agg_spec: dict[str, tuple[str, str]] = {"runs": ("mode", "size")}
for metric in metrics:
safe = _sanitize(metric)
agg_spec[f"{safe}_mean"] = (metric, "mean")
agg_spec[f"{safe}_std"] = (metric, "std")
return (
frame.groupby(["alpha", "mode"], as_index=False)
.agg(**agg_spec)
.sort_values(["alpha", "mode"])
.reset_index(drop=True)
)
def _delta_by_alpha(summary: pd.DataFrame, metrics: list[str]) -> pd.DataFrame:
rows: list[dict[str, float]] = []
for alpha, alpha_group in summary.groupby("alpha", sort=True):
robust = alpha_group[alpha_group["mode"] == "robust"]
no_robust = alpha_group[alpha_group["mode"] == "no_robust"]
if robust.empty or no_robust.empty:
continue
row: dict[str, float] = {
"alpha": float(alpha),
"runs_robust": float(robust["runs"].iloc[0]),
"runs_no_robust": float(no_robust["runs"].iloc[0]),
}
for metric in metrics:
safe = _sanitize(metric)
robust_value = float(robust[f"{safe}_mean"].iloc[0])
no_robust_value = float(no_robust[f"{safe}_mean"].iloc[0])
delta = robust_value - no_robust_value
row[f"{safe}_robust"] = robust_value
row[f"{safe}_no_robust"] = no_robust_value
row[f"{safe}_delta"] = delta
row[f"{safe}_delta_pct"] = (
np.nan if no_robust_value == 0 else 100.0 * delta / no_robust_value
)
rows.append(row)
return pd.DataFrame(rows)
def _pairwise_win_rates(frame: pd.DataFrame) -> pd.DataFrame:
rules = {
"eval/revenue_mean": "higher",
"eval/reward_mean": "higher",
"eval/coi_leakage_mean": "lower",
"eval/volatility_mean": "lower",
}
rows: list[dict[str, float]] = []
for alpha, alpha_group in frame.groupby("alpha", sort=True):
robust = alpha_group[alpha_group["mode"] == "robust"]
no_robust = alpha_group[alpha_group["mode"] == "no_robust"]
if robust.empty or no_robust.empty:
continue
for metric, direction in rules.items():
if metric not in frame.columns:
continue
robust_values = robust[metric].dropna().to_numpy(dtype=float)
no_robust_values = no_robust[metric].dropna().to_numpy(dtype=float)
if robust_values.size == 0 or no_robust_values.size == 0:
continue
if direction == "higher":
wins = (robust_values[:, None] > no_robust_values[None, :]).sum()
else:
wins = (robust_values[:, None] < no_robust_values[None, :]).sum()
ties = (robust_values[:, None] == no_robust_values[None, :]).sum()
total = robust_values.size * no_robust_values.size
win_prob = (wins + 0.5 * ties) / total
rows.append(
{
"alpha": float(alpha),
"metric": metric,
"direction": direction,
"wins": int(wins),
"ties": int(ties),
"total_pairs": int(total),
"win_probability": float(win_prob),
}
)
return pd.DataFrame(rows)
def _overall_mode_summary(frame: pd.DataFrame, metrics: list[str]) -> pd.DataFrame:
agg_spec: dict[str, tuple[str, str]] = {"runs": ("mode", "size")}
for metric in metrics:
safe = _sanitize(metric)
agg_spec[f"{safe}_mean"] = (metric, "mean")
agg_spec[f"{safe}_std"] = (metric, "std")
return frame.groupby("mode", as_index=False).agg(**agg_spec).sort_values("mode")
def _headline_json(overall: pd.DataFrame) -> dict[str, float | str]:
if {"robust", "no_robust"} - set(overall["mode"].tolist()):
return {"status": "incomplete_modes"}
robust = overall[overall["mode"] == "robust"].iloc[0]
no_robust = overall[overall["mode"] == "no_robust"].iloc[0]
revenue_delta = float(
robust["eval_revenue_mean_mean"] - no_robust["eval_revenue_mean_mean"]
)
leakage_delta = float(
robust["eval_coi_leakage_mean_mean"] - no_robust["eval_coi_leakage_mean_mean"]
)
return {
"status": "ok",
"revenue_delta": revenue_delta,
"revenue_delta_pct": float(
100.0 * revenue_delta / no_robust["eval_revenue_mean_mean"]
),
"coi_leakage_delta": leakage_delta,
"coi_leakage_delta_pct": float(
100.0 * leakage_delta / no_robust["eval_coi_leakage_mean_mean"]
),
}
def run(input_path: Path, output_dir: Path, include_non_finished: bool) -> list[Path]:
    """Read the benchmark CSV and write the derived tables and headline JSON.

    Args:
        input_path: CSV export to analyse.
        output_dir: Directory for the outputs; created (with parents) if it
            does not exist.
        include_non_finished: Passed through to ``_prepare_frame``.

    Returns:
        Paths of the files written: the four CSV tables in the order listed
        below, followed by ``ppo_headline_summary.json``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    prepared = _prepare_frame(
        pd.read_csv(input_path), include_non_finished=include_non_finished
    )

    candidate_metrics = (
        "eval/revenue_mean",
        "eval/reward_mean",
        "eval/coi_level_mean",
        "eval/coi_leakage_mean",
        "eval/volatility_mean",
        "eval/margin_mean",
        "train/alpha_adv",
        "train/coi_penalty",
        "train/ux_penalty",
        "train/agent_prob",
    )
    # Only aggregate metrics that actually exist in this export.
    metrics = [name for name in candidate_metrics if name in prepared.columns]

    # Compute everything before writing anything, so a failure in any
    # analysis step leaves no partial set of output files behind.
    alpha_mode = _summary_by_alpha_mode(prepared, metrics)
    deltas = _delta_by_alpha(alpha_mode, metrics)
    win_rates = _pairwise_win_rates(prepared)
    overall = _overall_mode_summary(prepared, metrics)
    headline = _headline_json(overall)

    tables = {
        "ppo_alpha_mode_summary.csv": alpha_mode,
        "ppo_alpha_deltas.csv": deltas,
        "ppo_pairwise_win_rates.csv": win_rates,
        "ppo_overall_mode_summary.csv": overall,
    }
    written_paths: list[Path] = []
    for filename, table in tables.items():
        destination = output_dir / filename
        table.to_csv(destination, index=False)
        written_paths.append(destination)

    headline_path = output_dir / "ppo_headline_summary.json"
    headline_path.write_text(json.dumps(headline, indent=2))
    return written_paths + [headline_path]
def main() -> None:
    """Command-line entry point: parse options, run the analysis, list outputs.

    Options are ``--input`` and ``--output-dir`` (paths, defaulting to
    ``_default_input()`` / ``_default_output_dir()``) and the
    ``--include-non-finished`` flag. Each written path is printed on its own
    line.
    """
    cli = argparse.ArgumentParser(
        description="Process PPO benchmark CSV for paper tables"
    )
    path_options = (
        ("--input", _default_input()),
        ("--output-dir", _default_output_dir()),
    )
    for flag, default in path_options:
        cli.add_argument(flag, type=Path, default=default)
    cli.add_argument("--include-non-finished", action="store_true")
    options = cli.parse_args()

    outputs = run(
        input_path=options.input,
        output_dir=options.output_dir,
        include_non_finished=bool(options.include_non_finished),
    )
    for written_path in outputs:
        print(written_path)


if __name__ == "__main__":
    main()

View File

@@ -21,7 +21,7 @@
surf, surf,
shader=flat, shader=flat,
mesh/check=false % Disable check to rely on empty lines mesh/check=false % Disable check to rely on empty lines
] table [col sep=comma, x=step, y=price, z=density] {chapters/figures/supra_data.csv}; ] table [col sep=comma, x=step, y=price, z=density] {chapters/figures/supra/supra_data.csv};
\end{axis} \end{axis}
\end{tikzpicture} \end{tikzpicture}

View File

@@ -4038,4 +4038,3 @@ step,price,density
4000,146.51098761558535,0.0 4000,146.51098761558535,0.0
4000,147.9065925693512,0.0 4000,147.9065925693512,0.0
4000,149.30219752311706,10.0 4000,149.30219752311706,10.0
1 step price density
4038
4039
4040

View File

@@ -0,0 +1,166 @@
\definecolor{heroBlue}{RGB}{212, 228, 255}
\definecolor{heroBlueBorder}{RGB}{64, 103, 178}
\definecolor{heroGreen}{RGB}{214, 238, 216}
\definecolor{heroGreenBorder}{RGB}{48, 133, 66}
\definecolor{heroAmber}{RGB}{246, 230, 202}
\definecolor{heroAmberBorder}{RGB}{166, 121, 51}
\definecolor{heroGray}{RGB}{236, 236, 236}
\definecolor{heroGrayBorder}{RGB}{120, 120, 120}
% Panels occupy y = 2.2 .. 10.0
% Cross-panel connector gutter lives at y = 1.0 .. 2.2 (clearly below all nodes)
\begin{tikzpicture}[
>=Stealth,
font=\small,
panel/.style={draw=black!65, dashed, rounded corners=4pt, line width=0.85pt},
bB/.style={rectangle, rounded corners=3pt, draw=heroBlueBorder, fill=heroBlue,
line width=0.9pt, align=center, minimum height=0.85cm},
bG/.style={rectangle, rounded corners=3pt, draw=heroGreenBorder, fill=heroGreen,
line width=0.9pt, align=center, minimum height=0.85cm},
bA/.style={rectangle, rounded corners=3pt, draw=heroAmberBorder, fill=heroAmber,
line width=0.9pt, align=center, minimum height=0.85cm},
bY/.style={rectangle, rounded corners=3pt, draw=heroGrayBorder, fill=heroGray,
line width=0.9pt, align=center, minimum height=0.82cm},
pill/.style={ellipse, draw=black!50, fill=black!4, line width=0.75pt,
align=center, minimum width=1.6cm, minimum height=0.68cm},
arr/.style={->, draw=black!80, line width=0.88pt},
bidir/.style={<->, draw=black!80, line width=0.88pt},
darr/.style={->, draw=black!60, line width=0.80pt, densely dashed},
crossA/.style={->, draw=heroAmberBorder!90!black, line width=1.15pt, dash pattern=on 3.5pt off 2pt},
crossG/.style={->, draw=heroGreenBorder!90!black, line width=1.15pt, dash pattern=on 3.5pt off 2pt},
arrG/.style={->, draw=heroGreenBorder!90!black, line width=1.15pt},
lbl/.style={font=\scriptsize, align=center, fill=white, inner sep=1.5pt, text=black}
]
%% ============================================================
%% Panel A   x: 0.2--11.2   y: 2.2--10.0
%% ============================================================
\draw[panel] (0.2,2.2) rectangle (11.2,10.0);
\node[anchor=west, font=\small\bfseries] at (0.45,9.72) {(a) Online platform and data plane};
\node[pill] (human) at (1.3, 8.55) {Human};
\node[pill] (agent) at (1.3, 7.45) {Agent};
\node[bB, minimum width=2.75cm] (web) at (4.2, 8.0) {Next.js\\Web App};
\node[bB, minimum width=2.75cm] (provider) at (7.35, 8.0) {Pricing\\Provider};
\node[bY, minimum width=1.85cm] (redis) at (9.85, 8.0) {Redis};
\node[bG, minimum width=3.1cm] (kBehav) at (4.0, 6.2) {Kafka stream\\Behavior events};
\node[bG, minimum width=3.0cm] (kQuotes) at (7.5, 6.2) {Kafka stream\\Price quotes};
\node[bA, minimum width=3.1cm] (worker) at (4.0, 4.4) {Worker / ETL\\Feature jobs};
\node[bA, minimum width=2.65cm] (registry) at (8.45, 4.4) {Model\\Registry};
% service row
\draw[arr] (human.east) -- (web.west);
\draw[arr] (agent.east) -- (web.west);
\draw[arr] (web.east) -- (provider.west);
\draw[bidir] (provider.east) -- (redis.west);
% web/provider -> kafka
\draw[arr] (web.south) -- (kBehav.north)
node[midway, left, lbl] {$e=(a,i,t,\mu,\delta)$};
\draw[arr] (provider.south) -- (kQuotes.north)
node[midway, right, lbl] {$(i,p,\mathrm{sid},\phi,t)$};
% kafka -> worker (straight south)
\draw[arr] (kBehav.south) -- (worker.north);
\draw[arr] (kQuotes.south) -- (worker.north);
% worker -> registry
\draw[arr] (worker.east) -- (registry.west);
% model refresh: registry east -> goes right to x=11.0, north to y=9.2, left to provider
% this keeps it entirely inside panel A with no crossing of nodes
\draw[crossA, rounded corners=6pt]
(registry.east) -- (11.0, 4.4)
-- (11.0, 9.2)
-- node[midway, lbl] {model refresh} (provider.north |- 0, 9.2)
-- (provider.north);
%% ============================================================
%% Panel B   x: 11.6--19.8   y: 2.2--10.0
%% ============================================================
\draw[panel] (11.6,2.2) rectangle (19.8,10.0);
\node[anchor=west, font=\small\bfseries] at (11.85,9.72) {(b) Distinguishability layer};
\node[bG, minimum width=2.4cm] (session) at (14.0, 8.9) {Session prefix\\$\tau'$};
\node[bB, minimum width=2.4cm] (empKern) at (13.65,7.45) {Empirical kernel\\$\hat T'$};
\node[bY, minimum width=2.4cm] (weakLab) at (17.55,8.9) {Weak labels\\$\mathcal{D}_H,\mathcal{D}_A$};
\node[bY, minimum width=2.2cm] (protoH) at (12.8, 5.9) {Prototype\\$\bar T_H$};
\node[bA, minimum width=2.4cm] (kldist) at (15.55,5.9) {KL distances\\$\Delta_H,\Delta_A$};
\node[bY, minimum width=2.2cm] (protoA) at (18.3, 5.9) {Prototype\\$\bar T_A$};
\node[bB, minimum width=2.9cm] (calHead) at (13.55,4.25) {Contrastive\\calibration head};
\node[bG, minimum width=2.55cm] (score) at (17.75,4.25) {Session score\\$f(\tau'),\hat\alpha(\tau')$};
\node[lbl] at (15.55, 3.15) {$\hat\alpha(\tau')=\sigma\!\left(\beta(\Delta_H-\Delta_A)\right)$};
\draw[arr, rounded corners=4pt] (session.south) -- (empKern.north);
\draw[arr, rounded corners=4pt] (empKern.south) -- (13.65, 6.8) -| (protoH.north);
\draw[arr, rounded corners=4pt] (weakLab.south) -- (17.55, 6.8) -| (protoA.north);
% weak labels -> protoH: go south then hard-left below weakLab
\draw[arr, rounded corners=4pt] (weakLab.south) -- (17.55,6.8) -| (protoH.north east);
\draw[arr] (protoH.east) -- (kldist.west);
\draw[arr] (protoA.west) -- (kldist.east);
\draw[arr] (kldist.south) -- (calHead.north east);
\draw[arr] (calHead.east) -- (score.west);
%% ============================================================
%% Panel C   x: 20.8--31.0   y: 2.2--10.0
%% ============================================================
\draw[panel] (20.8,2.2) rectangle (31.0,10.0);
\node[anchor=west, font=\small\bfseries] at (21.05,9.72) {(c) Distributionally robust control};
\node[bB, minimum width=3.1cm] (state) at (23.15, 8.9)
{State summary\\$[p_{t-1},\hat q_{t-1},f(\tau')]$};
\node[bY, minimum width=2.9cm] (ambSet) at (23.15, 7.45) {Ambiguity set\\$\mathcal U_\epsilon(\hat P_N)$};
\node[bG, minimum width=2.9cm] (innerMin) at (28.55, 7.45) {Inner minimisation\\$\min_{Q\in\mathcal U_\epsilon}$};
\node[bY, minimum width=8.2cm] (contScen) at (25.9, 5.9)
{Contamination scenarios $\;\alpha_k\in\mathcal A_{\epsilon_\alpha}(\alpha_0)$};
\node[bA, minimum width=8.8cm] (reward) at (25.9, 4.45)
{$r_t = R(p_t,\hat q_t) - \lambda\,\mathrm{COI}_{\mathrm{leak}}(p_t,\tau_t') - \eta\,UX_t$};
\node[bB, minimum width=2.85cm] (policy) at (22.75, 3.05) {Robust policy $\pi^*$};
\node[bG, minimum width=2.85cm] (publish) at (29.05, 3.05) {Publish price\\vector $p_t$};
\node[lbl] at (25.9, 2.55) {$\pi^*=\arg\max_\pi\min_{Q\in\mathcal U_\epsilon}\mathbb{E}[r_t]$};
\draw[arr] (state.south) -- (ambSet.north);
\draw[arr] (ambSet.east) -- (innerMin.west);
\draw[arr, rounded corners=4pt] (ambSet.south) -- (23.15, 6.6) -| ([xshift=-2cm]contScen.north);
\draw[arr, rounded corners=4pt] (innerMin.south) -- (28.55, 6.6) -| ([xshift=2cm]contScen.north);
\draw[arr] (contScen.south) -- (reward.north);
\draw[arr, rounded corners=6pt] (reward.south) -- (25.9, 3.7) -| (policy.north);
\draw[arr] (policy.east) -- (publish.west);
% market response: up the right edge of panel C, entirely inside, rounded
\draw[arrG, rounded corners=6pt] (publish.east) -- (30.6, 3.05)
-- (30.6, 9.8)
-- node[midway, lbl] {market response} (state.north |- 0, 9.8)
-- (state.north);
%% ============================================================
%% Cross-panel connectors -- gutter at y = 1.0..2.2
%% Three separate depths: 1.85, 1.45, 1.05 (no overlaps)
%% ============================================================
% 1. Worker -> Session (depth y=1.85, shallowest)
\draw[crossA, rounded corners=6pt]
(worker.south) -- (worker.south |- 0, 1.85)
-- node[pos=0.5, lbl] {offline extraction} (11.4, 1.85)
-- (11.4, 8.9)
-- (session.west);
% 2. Score -> State (depth y=1.45)
\draw[crossG, rounded corners=6pt]
(score.south) -- (score.south |- 0, 1.45)
-- node[pos=0.5, lbl] {contamination signal} (20.6, 1.45)
-- (20.6, 8.9)
-- (state.west);
% 3. Publish -> Provider (depth y=1.05, deepest)
\draw[crossG, rounded corners=3pt]
(publish.south) -- (publish.south |- 0, 1.05)
-- node[pos=0.4, lbl] {serve online} (5.8, 1.05)
-- (5.8, 7.7)
-- ([yshift=-0.3cm]provider.west);
\end{tikzpicture}

View File

@@ -62,7 +62,7 @@ We propose a robust optimization objective. The platform seeks a pricing policy
Here: Here:
\begin{itemize} \begin{itemize}
\item The first term, $p_t \cdot \hat{q}_t(p_t | \theta=H)$, represents the revenue generated strictly from the estimated human segment. \item The first term, $p_t \cdot \hat{q}_t(p_t | \theta=H)$, represents the revenue generated strictly from the estimated human segment.
\item $\mathcal{L}_{detect}$ is a penalty term for failing to separate distributions (the cost of confusion). \item $\mathcal{L}_{detect}$ is a penalty term for failing to distinguish distributions (the cost of confusion).
\item $\lambda$ is a hyperparameter balancing revenue exploitation vs. robust detection. \item $\lambda$ is a hyperparameter balancing revenue exploitation vs. robust detection.
\end{itemize} \end{itemize}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 84 KiB

After

Width:  |  Height:  |  Size: 324 KiB

View File

@@ -57,7 +57,7 @@ These behavioral signals serve as inputs for a Distributionally Robust Reinforce
\item[Trajectory] Defined as a series of unspecified length, collecting data on states of some object over time. \item[Trajectory] Defined as a series of unspecified length, collecting data on states of some object over time.
\item[Cost of Information (COI)] The average premium extracted above marginal cost due to information asymmetry. \item[Cost of Information (COI)] The average premium extracted above marginal cost due to information asymmetry.
\item[Contamination Ratio] The proportion of agent sessions versus human sessions in the system. \item[Contamination Ratio] The proportion of agent sessions versus human sessions in the system.
\item[Separability] The ability to distinguish between human and agent behavioral patterns. \item[Distinguishability] The ability to distinguish between human and agent behavioral patterns.
\end{description} \end{description}
\section{Aggregate Compute Budget Derivation} \section{Aggregate Compute Budget Derivation}

View File

@@ -29,6 +29,9 @@ These behavioral signals serve as inputs for a Distributionally Robust Reinforce
\vspace{1em} \vspace{1em}
\noindent\textbf{Acknowledgments:} This research was supported by the TPU Research Cloud program, which provided access to Google Cloud TPU accelerators (including TPU v4, v5e, and v6e). \noindent\textbf{Acknowledgments:} This research was supported by the TPU Research Cloud program, which provided access to Google Cloud TPU accelerators (including TPU v4, v5e, and v6e).
\vspace{0.5em}
\noindent\textbf{Project page:} \url{https://velocitatem.github.io/PHANTOM/}
\clearpage \clearpage
\input{chapters/01-intro} \input{chapters/01-intro}
\input{chapters/02-literature-review} \input{chapters/02-literature-review}
@@ -43,15 +46,44 @@ These behavioral signals serve as inputs for a Distributionally Robust Reinforce
\appendix \appendix
\section{Terminology} \section{Terminology}
\begin{description} \begin{description}
\item[Agent $A$] An actor of non-human nature, powered by an LLM. \item[Agent $A$] A non-human actor, typically an LLM-driven system that executes web actions toward a goal.
\item[Human $H$] An individual human with some job to be done. \item[Human $H$] A human participant interacting with the platform to complete a task.
\item[Actor $\theta$] Defines a type of class which is either Agent or Human and has the capability to carry out actions on a web platform. \item[Actor Type $\theta$] A latent class parameter describing whether a session is generated by a human or an agent profile.
\item[Platform] Any web-based platform which serves an interface to a collection of items that can be purchased, each at some price $p_i$. \item[Platform] A web interface exposing purchasable items and their offered prices.
\item[Behavioral Model] A mathematical model predicting what action comes after a series of prior actions. \item[Session $s$] A bounded interaction record tied to one actor and one session identifier.
\item[LLM] Large Language Model served by some provider with the abstracted capability of tool calling. \item[Event $e_{s,k}$] A single interaction tuple in a session, including action, item target, and timestamp.
\item[TPU] Tensor Processing Unit which is a unique kind of chip architecture developed by Google. \item[Trajectory $\tau_s$] The ordered sequence of events generated within a session.
\item[Trajectory] Defined as a series of unspecified length, collecting data on states of some object over time. \item[Demand Proxy $\hat{q}_{t,i}$] A weighted aggregate of observed actions used as an operational substitute for latent demand.
% TODO: maybe define other things in a similar succient manner \item[Action Weight Function $\omega(a)$] A mapping from action type to signal strength in the demand proxy.
\item[True Demand $d(p;\theta)$] The latent purchase response as a function of price and actor type.
\item[Contamination $\alpha$] The proportion of agent-generated traffic in the session mixture.
\item[Non-stationary Noise $\epsilon_t$] Time-varying residual variation not explained by the actor mixture.
\item[Pricing Policy $\pi(\tau)$] A function mapping observed interaction history to an offered price.
\item[Cost of Information (COI)] The expected premium above the minimum viable price induced by the pricing policy.
\item[COI Leakage] A per-quote penalty term modeling information revealed to reconnaissance behavior.
\item[First-Order Statistic $p_{(1)}$] The minimum observed price among multiple independent queries.
\item[Transition Kernel $\mathcal{T}$] A Markov transition matrix over behavioral states or actions.
\item[Distinguishability] The degree to which human and agent sessions can be distinguished from behavior alone.
\item[KL Divergence $D_{KL}$] A relative-entropy measure used to compare session transition structure against class prototypes.
\item[Divergence Scores $\Delta_H,\Delta_A$] Session-level distances to human and agent transition centroids.
\item[Weak Agent Probability $f(\tau)$] A session-level score estimating the likelihood that a trajectory is agent-generated.
\item[Contamination Generator $\mathcal{G}(\alpha)$] A simulator component that injects synthetic agent trajectories to reach a target mixture level.
\item[Stackelberg Game] A leader-follower formulation where the platform sets prices and demand responds.
\item[Ambiguity Set $\mathcal{U}_{\epsilon}$] A set of plausible demand distributions considered under distributional uncertainty.
\item[Wasserstein Ball] A distance-bounded neighborhood around an empirical distribution used in robust optimization.
\item[DR-RL] Distributionally Robust Reinforcement Learning for policies trained against worst-case distributional shifts.
\item[Nominal Contamination $\alpha_0$] The baseline contamination level around which robust candidates are evaluated.
\item[Robustness Radius $\epsilon_\alpha$] The local interval width used for inner minimization over contamination scenarios.
\item[Query-Tax Surrogate] A constant leakage proxy assigning fixed penalty to suspected reconnaissance queries.
\item[Revelation Surrogate] A leakage proxy based on $-\log\pi(p\mid\tau)$ to penalize highly informative quotes.
\item[Limbo Stack] The alternating game-history buffer that stores leader price moves and follower demand responses.
\item[UX Index] A bounded user-experience metric tracked to evaluate policy side effects on legitimate users.
\item[Look-to-Book Ratio] The ratio of search-like interactions to completed purchases, used as an operational contamination indicator.
\item[Hybrid Kappa-Lambda Architecture] A data design combining streaming ingestion with offline and batch learning loops.
\item[MDP / POMDP] Sequential decision models with full observability (MDP) or partial observability (POMDP).
\item[Behavioral Model] A model predicting what action is likely to follow from prior actions.
\item[LLM] Large Language Model served through an inference provider with tool-use capability.
\item[TPU] Tensor Processing Unit, a specialized accelerator architecture developed by Google.
\end{description} \end{description}
\section{Aggregate Compute Budget Derivation} \section{Aggregate Compute Budget Derivation}
@@ -78,6 +110,30 @@ v4 & 64 & 275 & $64 \times 275 = 17{,}600$ \\
Converting to petaFLOPS: $160{,}320\;\text{TFLOPS} = 160.32\;\text{PFLOPS} \approx 160\;\text{PFLOPS}$. This is the theoretical peak under sustained BF16 arithmetic; realized throughput depends on memory bandwidth utilization and inter-chip communication overhead, but the figure serves as a useful upper bound for provisioning decisions. Converting to petaFLOPS: $160{,}320\;\text{TFLOPS} = 160.32\;\text{PFLOPS} \approx 160\;\text{PFLOPS}$. This is the theoretical peak under sustained BF16 arithmetic; realized throughput depends on memory bandwidth utilization and inter-chip communication overhead, but the figure serves as a useful upper bound for provisioning decisions.
\section{Slope-Test Verification: Revenue vs. Contamination}
\label{app:alpha_revenue_slope}
This appendix provides a compact verification of the slope result reported in the main results section. Using the same run-level pairs $x_i=\texttt{study/alpha}_i$ and $y_i=\texttt{eval/revenue\_mean}_i$ ($n=95$), we re-checked the ordinary least squares slope test in Python with standard test routines (SciPy two-sided $t$ test for the slope).
\[
\widehat{y}=326{,}878.57-60{,}631.95\,x,
\]
\[
t(93)=-8.2148,\qquad p=1.2038\times 10^{-12},\qquad R^2=0.4205,\qquad 95\%\,\text{CI}_{\beta_1}=[-75{,}288.76,\,-45{,}975.13].
\]
The Python verification reproduces the reported coefficients and inference values, confirming that the slope-test results are correct under standard methods.
\section{whoclickedit Dataset Card}
\label{app:whoclicked_card}
For transparency and reproducibility, this appendix includes the full dataset card used for the public release of the \texttt{whoclickedit} dataset.
\lstinputlisting[
caption={whoclickedit dataset card (README snapshot)},
label={lst:whoclicked_dataset_card}
]{chapters/auto/whoclicked_dataset_card.md}
% \input{../build/concatenated_code} % \input{../build/concatenated_code}
\end{document} \end{document}

View File

@@ -41,7 +41,7 @@
\begin{abstract} \begin{abstract}
Dynamic pricing pipelines in e-commerce consume behavioral demand signals to set prices, but the growing presence of LLM-powered agents introduces a novel contamination vector: these agents decouple information gathering from transaction execution across isolated sessions, eroding the platform's pricing power. Dynamic pricing pipelines in e-commerce consume behavioral demand signals to set prices, but the growing presence of LLM-powered agents introduces a novel contamination vector: these agents decouple information gathering from transaction execution across isolated sessions, eroding the platform's pricing power.
We present PHANTOM, a modular compound system that addresses this threat end-to-end. The system is composed of five orchestrated components: (1)~a configurable e-commerce research platform with dual-stream Kafka ingestion for behavioral and price-exposure events, (2)~a GOFAI-based weak labeling stage that partitions sessions into human and agent classes using rule-based predicates, (3)~a transition-kernel estimator that learns separable Markov models for each actor type and constructs a Contamination Generator for controlled simulation, (4)~a Distributionally Robust Reinforcement Learning policy that optimizes pricing under a Wasserstein ambiguity set conditioned on per-session divergence signals, and (5)~an Airflow-orchestrated pipeline that connects online data collection to offline policy training via Redis-backed model serving. We present PHANTOM, a modular compound system that addresses this threat end-to-end. The system is composed of five orchestrated components: (1)~a configurable e-commerce research platform with dual-stream Kafka ingestion for behavioral and price-exposure events, (2)~a GOFAI-based weak labeling stage that partitions sessions into human and agent classes using rule-based predicates, (3)~a transition-kernel estimator that learns distinguishable Markov models for each actor type and constructs a Contamination Generator for controlled simulation, (4)~a Distributionally Robust Reinforcement Learning policy that optimizes pricing under a Wasserstein ambiguity set conditioned on per-session divergence signals, and (5)~an Airflow-orchestrated pipeline that connects online data collection to offline policy training via Redis-backed model serving.
We formally derive the Cost of Information Theorem, proving that standard pricing mechanisms become incentive-incompatible as agent query volume grows. The system architecture, interaction schema, and factorial experiment harness are designed for reproducibility and are released as open artifacts. We evaluate system-level tradeoffs between revenue protection, information leakage, and user-experience degradation through a three-objective reward structure. We formally derive the Cost of Information Theorem, proving that standard pricing mechanisms become incentive-incompatible as agent query volume grows. The system architecture, interaction schema, and factorial experiment harness are designed for reproducibility and are released as open artifacts. We evaluate system-level tradeoffs between revenue protection, information leakage, and user-experience degradation through a three-objective reward structure.
\end{abstract} \end{abstract}
@@ -58,7 +58,7 @@ The current innovation boom in generative artificial intelligence and its applic
The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on. Dynamic pricing algorithms rely on directly translating demand features $q$ to new price assignments $\hat{p}$ across a catalogue of products of size $N$. When agent-driven reconnaissance traffic contaminates these demand signals, the pricing pipeline produces biased estimates that erode margins. This is not a single-model failure but a \textit{compound system} failure: the data ingestion, demand estimation, policy optimization, and model serving stages each propagate and amplify the contamination. The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on. Dynamic pricing algorithms rely on directly translating demand features $q$ to new price assignments $\hat{p}$ across a catalogue of products of size $N$. When agent-driven reconnaissance traffic contaminates these demand signals, the pricing pipeline produces biased estimates that erode margins. This is not a single-model failure but a \textit{compound system} failure: the data ingestion, demand estimation, policy optimization, and model serving stages each propagate and amplify the contamination.
Existing work treats bot detection and dynamic pricing as separate concerns. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for separating non-human reconnaissance from genuine human demand expression and integrating that separation into pricing heuristics without degrading legitimate user experience. This gap is what our contribution aims to address. Existing work treats bot detection and dynamic pricing as separate concerns. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for distinguishing non-human reconnaissance from genuine human demand expression and integrating that distinguishability into pricing heuristics without degrading legitimate user experience. This gap is what our contribution aims to address.
\subsection{System-Level Contributions} \subsection{System-Level Contributions}
@@ -78,7 +78,7 @@ We frame our contribution along the four CAIS pillars---architectural patterns,
This work addresses three core research questions: This work addresses three core research questions:
\begin{enumerate} \begin{enumerate}
\item[\textbf{RQ1}] \textit{Separability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting? \item[\textbf{RQ1}] \textit{Distinguishability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting?
\item[\textbf{RQ2}] \textit{Theoretical Impact}: What is the formal relationship between agent contamination levels and the erosion of pricing power in dynamic pricing systems? \item[\textbf{RQ2}] \textit{Theoretical Impact}: What is the formal relationship between agent contamination levels and the erosion of pricing power in dynamic pricing systems?
\item[\textbf{RQ3}] \textit{Robust Mitigation}: How can pricing policies be constructed to maintain margin integrity under unknown and non-stationary levels of agent contamination? \item[\textbf{RQ3}] \textit{Robust Mitigation}: How can pricing policies be constructed to maintain margin integrity under unknown and non-stationary levels of agent contamination?
\end{enumerate} \end{enumerate}
@@ -115,7 +115,7 @@ Each price query generates a record $(i, p, \text{sid}, \phi, t)$ associating th
\subsection{Offline Loop: Policy Training} \subsection{Offline Loop: Policy Training}
Our pipeline, which runs on an Airflow schedule with the option of a manual trigger, subscribes to the Kafka cluster. The offline loop consumes collected trajectories, performs weak labeling and transition-kernel estimation (Section~\ref{sec:separability}), trains the DR-RL policy (Section~\ref{sec:drrl}) in a simulator, and pushes the resulting policy to Redis for the pricing provider to read. Our pipeline, which runs on an Airflow schedule with the option of a manual trigger, subscribes to the Kafka cluster. The offline loop consumes collected trajectories, performs weak labeling and transition-kernel estimation (Section~\ref{sec:distinguishability}), trains the DR-RL policy (Section~\ref{sec:drrl}) in a simulator, and pushes the resulting policy to Redis for the pricing provider to read.
\subsection{Online Dynamic Pricing (Baseline)} \subsection{Online Dynamic Pricing (Baseline)}
@@ -165,7 +165,7 @@ The metadata record $\mu$ varies by action type. This heterogeneous structure is
%% ==================================================================== %% ====================================================================
\section{Methodology: Pipeline Components} \section{Methodology: Pipeline Components}
This section details the theoretical and practical framework behind each pipeline component. We formalize the problem environment, derive the \textit{Cost of Information} (COI) theorem that motivates the system design, describe the separability and contamination modules, and formulate the robust pricing policy. This section details the theoretical and practical framework behind each pipeline component. We formalize the problem environment, derive the \textit{Cost of Information} (COI) theorem that motivates the system design, describe the distinguishability and contamination modules, and formulate the robust pricing policy.
\subsection{Problem Formalization} \subsection{Problem Formalization}
@@ -225,15 +225,15 @@ Since the integrand vanishes as $N \to \infty$ for all $t > \underline{p}$, the
This result is the theoretical motivation for the system design: it proves that standard pricing policies $\pi$ fail to extract surplus in the presence of large-scale agentic search, necessitating a contamination-aware component in the pipeline. This result is the theoretical motivation for the system design: it proves that standard pricing policies $\pi$ fail to extract surplus in the presence of large-scale agentic search, necessitating a contamination-aware component in the pipeline.
\subsection{Module: Separability and Contamination Generation} \subsection{Module: Distinguishability and Contamination Generation}
\label{sec:separability} \label{sec:distinguishability}
To train a robust pricing learner, we need a simulator that can generate realistic interaction data under controlled contamination. We build this from collected data using a two-stage approach. To train a robust pricing learner, we need a simulator that can generate realistic interaction data under controlled contamination. We build this from collected data using a two-stage approach.
\subsubsection{GOFAI-Based Weak Labeling.} \subsubsection{GOFAI-Based Weak Labeling.}
We use Good Old-Fashioned AI (GOFAI) heuristics to generate weak labels for separability. A set of rule-based predicates $\phi_j: \tau \to \{0,1\}$ partitions dataset $\mathcal{D}$ into high-confidence sets $\mathcal{D}_H$ and $\mathcal{D}_A$. We then estimate separate transition models for both groups and ask a direct methodological question: are the kernels separable enough to justify downstream pricing control that depends on that separability? We use Good Old-Fashioned AI (GOFAI) heuristics to generate weak labels for distinguishability. A set of rule-based predicates $\phi_j: \tau \to \{0,1\}$ partitions dataset $\mathcal{D}$ into high-confidence sets $\mathcal{D}_H$ and $\mathcal{D}_A$. We then estimate separate transition models for both groups and ask a direct methodological question: are the kernels distinguishable enough to justify downstream pricing control that depends on that distinguishability?
To answer this, we compute average KL divergence between transition probability matrices. This statistic gives global separability and event-level diagnostics at the same time. In our balanced dataset (50\% human, 50\% agent), the average divergence is approximately $1.8$. To answer this, we compute average KL divergence between transition probability matrices. This statistic gives global distinguishability and event-level diagnostics at the same time. In our recorded dataset (13 human sessions, 16 agent sessions; 45\%/55\%), the average divergence is approximately $1.8$.
\begin{definition}[KL Divergence for Transition Distributions] \begin{definition}[KL Divergence for Transition Distributions]
Let $P_e$ and $Q_e$ be categorical distributions over destination states following event $e$, derived from human and agent trajectories respectively. The KL divergence between these distributions is: Let $P_e$ and $Q_e$ be categorical distributions over destination states following event $e$, derived from human and agent trajectories respectively. The KL divergence between these distributions is:
@@ -243,7 +243,7 @@ Let $P_e$ and $Q_e$ be categorical distributions over destination states followi
where $\mathcal{S}_e$ denotes the set of destination events that follow $e$ in the human trajectories. where $\mathcal{S}_e$ denotes the set of destination events that follow $e$ in the human trajectories.
\end{definition} \end{definition}
With these divergence features we train a contrastive model to estimate a weak agent probability $f(\tau)\in[0,1]$, which serves as the interface between the separability module and the downstream pricing policy. With these divergence features we train a contrastive model to estimate a weak agent probability $f(\tau)\in[0,1]$, which serves as the interface between the distinguishability module and the downstream pricing policy.
\subsubsection{Transition-Kernel Estimation and Contamination Generator.} \subsubsection{Transition-Kernel Estimation and Contamination Generator.}
\label{sec:tpe} \label{sec:tpe}
@@ -282,12 +282,12 @@ Given a newly observed partial trajectory $\tau'$, we compute its empirical tran
\Delta_A(\tau') &= D_{KL}(\hat{\mathcal{T}}^\prime \parallel \bar{\mathcal{T}}_A) \Delta_A(\tau') &= D_{KL}(\hat{\mathcal{T}}^\prime \parallel \bar{\mathcal{T}}_A)
\end{align} \end{align}
These divergence statistics serve as the operational connector between the separability module and the pricing policy. We define the per-session contamination estimate as: These divergence statistics serve as the operational connector between the distinguishability module and the pricing policy. We define the per-session contamination estimate as:
\begin{equation} \begin{equation}
\label{eq:alpha_hat} \label{eq:alpha_hat}
\hat{\alpha}(\tau') = \sigma\big(\beta(\Delta_H(\tau') - \Delta_A(\tau'))\big) \hat{\alpha}(\tau') = \sigma\big(\beta(\Delta_H(\tau') - \Delta_A(\tau'))\big)
\end{equation} \end{equation}
where $\sigma$ is the logistic function and $\beta > 0$ is a temperature parameter. This maps separability directly into a scalar control input for the pricing objective. where $\sigma$ is the logistic function and $\beta > 0$ is a temperature parameter. This maps distinguishability directly into a scalar control input for the pricing objective.
\subsubsection{Ambiguity Set Construction.} \subsubsection{Ambiguity Set Construction.}
Because the contamination level $\alpha$ and demand shift are non-stationary, a point estimate of the demand distribution is insufficient. Let $\hat{P}_N$ denote the empirical reference distribution induced by the Contamination Generator $\mathcal{G}(\alpha)$. We define the Wasserstein ambiguity set: Because the contamination level $\alpha$ and demand shift are non-stationary, a point estimate of the demand distribution is insufficient. Let $\hat{P}_N$ denote the empirical reference distribution induced by the Contamination Generator $\mathcal{G}(\alpha)$. We define the Wasserstein ambiguity set:
@@ -344,7 +344,7 @@ The simulator has multiple configurable factors, including valuation distributio
Our training budget spans 384 TPU chips across v4, v5e, and v6e generations, distributed across Europe and U.S. regions with a spot-heavy mix and an on-demand reserve. At peak BF16 throughput this corresponds to roughly 160 PFLOPS of aggregate compute. We allocate v6e capacity to the heaviest policy training, use v5e for broad hyperparameter sweeps, and reserve on-demand v4 quota for runs that should not be preempted \parencite{noauthor_tpu_2026,noauthor_tpu_2025-1,noauthor_tpu_2025}. Our training budget spans 384 TPU chips across v4, v5e, and v6e generations, distributed across Europe and U.S. regions with a spot-heavy mix and an on-demand reserve. At peak BF16 throughput this corresponds to roughly 160 PFLOPS of aggregate compute. We allocate v6e capacity to the heaviest policy training, use v5e for broad hyperparameter sweeps, and reserve on-demand v4 quota for runs that should not be preempted \parencite{noauthor_tpu_2026,noauthor_tpu_2025-1,noauthor_tpu_2025}.
Our process follows three stages: (1)~observe and \textit{vectorize} behavioral interactions, (2)~learn separability to characterize human versus agent patterns, and (3)~use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator. Our process follows three stages: (1)~observe and \textit{vectorize} behavioral interactions, (2)~learn distinguishability to characterize human versus agent patterns, and (3)~use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator.
Operationally, goals and experiment runs are tracked in PostgreSQL (goal table, run table, and assignment mapping). This data-acquisition phase is intentionally a disconnected component that feeds the later contributions. Operationally, goals and experiment runs are tracked in PostgreSQL (goal table, run table, and assignment mapping). This data-acquisition phase is intentionally a disconnected component that feeds the later contributions.
@@ -375,7 +375,7 @@ Initialize contamination estimate $\hat\alpha \leftarrow 0.2$\;
$\mathcal S_t \leftarrow \mathcal S_t \cup \{\tau_m\}$\; $\mathcal S_t \leftarrow \mathcal S_t \cup \{\tau_m\}$\;
} }
\tcp{Estimate contamination from separability module} \tcp{Estimate contamination from distinguishability module}
compute $\hat\alpha \leftarrow \frac{1}{M}\sum_{\tau\in\mathcal S_t} \Big[\sigma\big(\beta(\Delta_H(\tau)-\Delta_A(\tau))\big)\Big]$\; compute $\hat\alpha \leftarrow \frac{1}{M}\sum_{\tau\in\mathcal S_t} \Big[\sigma\big(\beta(\Delta_H(\tau)-\Delta_A(\tau))\big)\Big]$\;
compute $J_t \leftarrow \text{Revenue}(p_t,\hat Q_t) - \lambda\cdot \text{COILeak}(\hat\alpha) - \eta\cdot \text{UX}(\hat\alpha)$\; compute $J_t \leftarrow \text{Revenue}(p_t,\hat Q_t) - \lambda\cdot \text{COILeak}(\hat\alpha) - \eta\cdot \text{UX}(\hat\alpha)$\;
@@ -430,7 +430,7 @@ We formally defined the Cost of Information and proved that as the saturation of
The system architecture, interaction schema, configurable e-commerce testbed, and factorial experiment harness are designed for reproducibility and released as open artifacts. This is a very generic end-to-end mechanism which is applicable to a variety of different e-commerce tasks. We intentionally put emphasis on the development of this infrastructure to establish a reproducible framework for interaction and to minimize any noise. The system architecture, interaction schema, configurable e-commerce testbed, and factorial experiment harness are designed for reproducibility and released as open artifacts. This is a very generic end-to-end mechanism which is applicable to a variety of different e-commerce tasks. We intentionally put emphasis on the development of this infrastructure to establish a reproducible framework for interaction and to minimize any noise.
Future work includes full factorial evaluation of the DR-RL policy across contamination levels, online adaptation of the ambiguity radius $\epsilon$ as a function of live divergence estimates, extension to multi-agent market maker settings, and integration of the HAP protocol~\cite{dhir_http_2025} as an additional signal source for the separability module. Future work includes full factorial evaluation of the DR-RL policy across contamination levels, online adaptation of the ambiguity radius $\epsilon$ as a function of live divergence estimates, extension to multi-agent market maker settings, and integration of the HAP protocol~\cite{dhir_http_2025} as an additional signal source for the distinguishability module.
%% ==================================================================== %% ====================================================================

View File

@@ -2,9 +2,9 @@
\section{Introduction} \section{Introduction}
In this paper we present an exploration and defense against the presence of new commercial entities in digitally powered platforms, preserving market equilibrium in the age of AI. This research establishes the following contributions: definition and formalization of non-human transactors in e-commerce platforms, development of a testing-ground for capturing the behavioral essence of these transactors across a large variety of digital systems, construction of a discriminative model (to prove separability) as a strong learner for downstream mitigation of contamination by non-human entities, translation of such learned separability into existing dynamic pricing machine learning loops, and finally establishment of a high-level KPI-affecting causal effect and cost-saving framework for the future of internet commerce in the presence of such non-human learners. In this paper we present an exploration and defense against the presence of new commercial entities in digitally powered platforms, preserving market equilibrium in the age of AI. This research establishes the following contributions: definition and formalization of non-human transactors in e-commerce platforms, development of a testing-ground for capturing the behavioral essence of these transactors across a large variety of digital systems, construction of a discriminative model (to prove distinguishability) as a strong learner for downstream mitigation of contamination by non-human entities, translation of such learned distinguishability into existing dynamic pricing machine learning loops, and finally establishment of a high-level KPI-affecting causal effect and cost-saving framework for the future of internet commerce in the presence of such non-human learners.
This research effort touches a large variety of domains, spanning behavioral economics for understanding the rationality of behavior as theorized by the concept of homo economicus, agent-based modeling to translate our learned separability into disjoint dynamic pricing systems, reinforcement learning which serves as the SOTA for price-learners, and dynamic pricing and market equilibrium theory to understand the risks of possible supra-competitive pricing phenomena in cases of adversarial pricing systems driving the market out of equilibrium. \footnote{Given the rapid evolution of the field we acknowledge all developments with a cutoff set at the date of March 1st 2026.} This research effort touches a large variety of domains, spanning behavioral economics for understanding the rationality of behavior as theorized by the concept of homo economicus, agent-based modeling to translate our learned distinguishability into disjoint dynamic pricing systems, reinforcement learning which serves as the SOTA for price-learners, and dynamic pricing and market equilibrium theory to understand the risks of possible supra-competitive pricing phenomena in cases of adversarial pricing systems driving the market out of equilibrium. \footnote{Given the rapid evolution of the field we acknowledge all developments with a cutoff set at the date of March 1st 2026.}
\subsection{Motivation and Market Context} \subsection{Motivation and Market Context}
@@ -25,7 +25,7 @@ We formally define interaction data as coming from some actor which can either b
This dissertation is organized around one main research question and three supporting sub-questions: This dissertation is organized around one main research question and three supporting sub-questions:
\begin{enumerate} \begin{enumerate}
\item[\textbf{Main RQ}] How can dynamic pricing systems preserve margin integrity when transaction orchestration is increasingly mediated by non-human agents? \item[\textbf{Main RQ}] How can dynamic pricing systems preserve margin integrity when transaction orchestration is increasingly mediated by non-human agents?
\item[\textbf{SQ1}] \textit{Separability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting? \item[\textbf{SQ1}] \textit{Distinguishability}: Can agent and human sessions be reliably distinguished from behavioral interaction signals alone, without relying on network-level or device fingerprinting?
\item[\textbf{SQ2}] \textit{Theoretical Impact}: What is the formal relationship between agent contamination levels and the erosion of pricing power in dynamic pricing systems? \item[\textbf{SQ2}] \textit{Theoretical Impact}: What is the formal relationship between agent contamination levels and the erosion of pricing power in dynamic pricing systems?
\item[\textbf{SQ3}] \textit{Robust Mitigation}: How can pricing policies be constructed to maintain margin integrity under unknown and non-stationary levels of agent contamination? \item[\textbf{SQ3}] \textit{Robust Mitigation}: How can pricing policies be constructed to maintain margin integrity under unknown and non-stationary levels of agent contamination?
\end{enumerate} \end{enumerate}
@@ -59,4 +59,4 @@ Extract final result from terminal state\;
\end{algorithm} \end{algorithm}
The previously described goal of separability allows us to formulate a task which entails taking raw interaction data for either actor and creating a composite demand estimate. We propose a robust optimization objective defined in our methodology, transforming the pricing problem into a form of Distributionally Robust Optimization \parencite{kuhn_distributionally_2025} where the learner must guard against adversarial contamination in observed demand distributions. In this setting we must learn to make decisions that perform well not under a single estimated probability distribution but under an ambiguity set of distributions, about which we have limited information. In our case, as stated, this is a mixture of distributions with a parameter which is unknown and non-stationary. The previously described goal of distinguishability allows us to formulate a task which entails taking raw interaction data for either actor and creating a composite demand estimate. We propose a robust optimization objective defined in our methodology, transforming the pricing problem into a form of Distributionally Robust Optimization \parencite{kuhn_distributionally_2025} where the learner must guard against adversarial contamination in observed demand distributions. In this setting we must learn to make decisions that perform well not under a single estimated probability distribution but under an ambiguity set of distributions, about which we have limited information. In our case, as stated, this is a mixture of distributions with a parameter which is unknown and non-stationary.

View File

@@ -1,6 +1,6 @@
\section{Literature Review} \section{Literature Review}
To better understand all wedges of the current works, we must start by exploring the nature of agents, agentic computer use and web automation, complementing that with economic reasoning and strategic interaction. The final surface to cover leads us to data-driven dynamic pricing under uncertainty. The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on. This latter case of agents shopping is currently pending legal action in the case of \textcite{noauthor_amazoncom_2026} which is currently being treated as a violation of the Computer Fraud and Abuse Act. The introduction of these mediating actor entities into economic systems is further creating a threat of false-name bidding \parencite{yokoo_effect_2004}, which prior research has explored in a trading context. Other research on pseudonyms in dynamic systems demonstrates whitewashing in AI agents which can ignore defensive mechanisms by re-entry with different identities \parencite{feldman_free-riding_2004}. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for separating non-human reconnaissance from genuine human demand expression and integrating that separation into pricing heuristics without degrading legitimate user experience (in our research tracked by the user-experience index). This gap is what our contribution aims to address, particularly for the aforementioned stakeholder groups. To better understand all wedges of the current works, we must start by exploring the nature of agents, agentic computer use and web automation, complementing that with economic reasoning and strategic interaction. The final surface to cover leads us to data-driven dynamic pricing under uncertainty. 
The key technical risk is not ``agents buying things'' per se, but agents shaping the behavioral and demand signals that downstream pricing systems consume and depend on. This latter case of agents shopping is currently pending legal action in the case of \textcite{noauthor_amazoncom_2026} which is currently being treated as a violation of the Computer Fraud and Abuse Act. The introduction of these mediating actor entities into economic systems is further creating a threat of false-name bidding \parencite{yokoo_effect_2004}, which prior research has explored in a trading context. Other research on pseudonyms in dynamic systems demonstrates whitewashing in AI agents which can ignore defensive mechanisms by re-entry with different identities \parencite{feldman_free-riding_2004}. Dynamic pricing assumes demand proxies are behaviorally meaningful, while bot detection aims at security and access control. The missing bridge is a principled framework for distinguishing non-human reconnaissance from genuine human demand expression and integrating that distinguishability into pricing heuristics without degrading legitimate user experience (in our research tracked by the user-experience index). This gap is what our contribution aims to address, particularly for the aforementioned stakeholder groups.
\subsection{Agent Taxonomy and Definitions} \subsection{Agent Taxonomy and Definitions}

View File

@@ -1,6 +1,6 @@
\section{Methodology} \section{Methodology}
This section details the theoretical and practical framework developed to address dynamic pricing under the influence of non-human actors. We begin by formalizing the problem environment and the nature of the actors. We then derive the \textit{Cost of Information} (COI) theorem, proving the erosion of pricing power in the limit of agent saturation. Following this, we outline our generative contamination strategy using GOFAI-driven separability and transition probability learning. Finally, we formulate the robust control problem as a Stackelberg game solved via Distributionally Robust Reinforcement Learning (DR-RL) with constructed ambiguity sets. This section details the theoretical and practical framework developed to address dynamic pricing under the influence of non-human actors. We begin by formalizing the problem environment and the nature of the actors. We then derive the \textit{Cost of Information} (COI) theorem, proving the erosion of pricing power in the limit of agent saturation. Following this, we outline our generative contamination strategy using GOFAI-driven distinguishability and transition probability learning. Finally, we formulate the robust control problem as a Stackelberg game solved via Distributionally Robust Reinforcement Learning (DR-RL) with constructed ambiguity sets.
\subsection{Problem Formalization} \subsection{Problem Formalization}
@@ -109,13 +109,13 @@ Since users act with motivations, we define a pool of tasks (jobs to be done) an
A representative task is to find the cheapest feasible catalog item under explicit constraints while removing strict financial limits so we avoid trivial optimization behavior. Participants are also randomly assigned to one experimental platform mode (hotel or airline). Once assigned, they are dropped into the experiment with an actor ID. Under each experiment ID, we can observe multiple sessions across time and gather long interaction traces for the same actor. A representative task is to find the cheapest feasible catalog item under explicit constraints while removing strict financial limits so we avoid trivial optimization behavior. Participants are also randomly assigned to one experimental platform mode (hotel or airline). Once assigned, they are dropped into the experiment with an actor ID. Under each experiment ID, we can observe multiple sessions across time and gather long interaction traces for the same actor.
The human data collection involved 18 participants, all of whom provided explicit informed consent prior to their session. Participants had an average age of 21 years and were recruited from a university population. Alongside the 18 human sessions we ran 18 agent sessions of equivalent task scope, giving a balanced dataset of 36 labeled trajectories. Each participant was assigned a single platform mode and a single task drawn from the pool, and completed the session independently without guidance on navigation or pricing strategy. The human data collection involved 13 participants, all of whom provided explicit informed consent prior to their session. Participants had an average age of 21 years and were recruited from a university population. Alongside the 13 human sessions we ran 16 agent sessions of equivalent task scope, yielding 29 labeled trajectories in total (45\% human, 55\% agent). Each participant was assigned a single platform mode and a single task drawn from the pool, and completed the session independently without guidance on navigation or pricing strategy.
To evaluate quality and realism of the setup, we store both structured event logs and full interaction transcripts. This lets us combine quantitative analysis with transcript-level qualitative findings. The result is an isolated system where we can control the interaction process while preserving realistic behavior. To evaluate quality and realism of the setup, we store both structured event logs and full interaction transcripts. This lets us combine quantitative analysis with transcript-level qualitative findings. The result is an isolated system where we can control the interaction process while preserving realistic behavior.
Operationally, goals and experiment runs are tracked in PostgreSQL. This data-acquisition phase is the first half of the methodology and is intentionally a disconnected component that feeds the later contributions. The second half uses collected behavioral traces to separate classes (agent vs human) with session-conditioned probability estimates, then injects those estimates into the pricing learner. Operationally, goals and experiment runs are tracked in PostgreSQL. This data-acquisition phase is the first half of the methodology and is intentionally a disconnected component that feeds the later contributions. The second half uses collected behavioral traces to distinguish classes (agent vs human) with session-conditioned probability estimates, then injects those estimates into the pricing learner.
Our process follows three stages: (1) observe and vectorize behavioral interactions, (2) learn separability to characterize human versus agent patterns, and (3) use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator. Our process follows three stages: (1) observe and vectorize behavioral interactions, (2) learn distinguishability to characterize human versus agent patterns, and (3) use the learned signal to train a defensive policy in a controlled dynamic-pricing simulator.
\begin{figure}[ht] \begin{figure}[ht]
\resizebox{\columnwidth}{!}{% \resizebox{\columnwidth}{!}{%
@@ -209,15 +209,15 @@ In the simulator baseline this order is encoded with a compact fixed scale: cart
In addition to behavioral events, the platform logs price observations to a separate Kafka topic. Each price query generates a record associating the product, displayed price, requesting session, platform mode, and timestamp. This dual-stream architecture enables joint analysis of price exposure and behavioral response. In addition to behavioral events, the platform logs price observations to a separate Kafka topic. Each price query generates a record associating the product, displayed price, requesting session, platform mode, and timestamp. This dual-stream architecture enables joint analysis of price exposure and behavioral response.
\subsection{Generative Contamination and Separability} \subsection{Generative Contamination and Distinguishability}
To train a robust pricing learner, we need a simulator that can generate realistic interaction data under controlled contamination. We build this from Phantom data using a two-stage approach. To train a robust pricing learner, we need a simulator that can generate realistic interaction data under controlled contamination. We build this from Phantom data using a two-stage approach.
\subsubsection{Ground-Truth Separability} \subsubsection{Ground-Truth Distinguishability}
Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels (human or agent) are available as ground truth rather than as the output of a heuristic classifier. We therefore estimate separate transition kernels directly from each labeled partition, treating the resulting human and agent kernels as the ground-truth behavioral profiles for each class. We then ask a direct methodological question: are the kernels separable enough to justify downstream pricing control that depends on that separability? Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels (human or agent) are available as ground truth rather than as the output of a heuristic classifier. We therefore estimate separate transition kernels directly from each labeled partition, treating the resulting human and agent kernels as the ground-truth behavioral profiles for each class. We then ask a direct methodological question: are the kernels distinguishable enough to justify downstream pricing control that depends on that distinguishability?
To answer this, we compute per-session divergence scores against both class-level centroids. For each session in either partition, we fit a session-level event transition kernel from that session's trajectory alone, then compute its average divergence to the human centroid and to the agent centroid. The per-session separability score is the gap between these two divergences: a negative value indicates proximity to human behavior, a positive value indicates proximity to agent behavior. To answer this, we compute per-session divergence scores against both class-level centroids. For each session in either partition, we fit a session-level event transition kernel from that session's trajectory alone, then compute its average divergence to the human centroid and to the agent centroid. The per-session distinguishability score is the gap between these two divergences: a negative value indicates proximity to human behavior, a positive value indicates proximity to agent behavior.
We cannot assume normal distributions for divergence scores, which are right-skewed and bounded below by zero, so we do not use a Student's t-test. Instead we apply a Mann-Whitney U test \parencite{mann_test_1947} on the per-session gap scores between the two groups. The Mann-Whitney test is a rank-based nonparametric test that compares the ordering of two independent samples without distributional assumptions, making it appropriate for small samples drawn from skewed populations. We cannot assume normal distributions for divergence scores, which are right-skewed and bounded below by zero, so we do not use a Student's t-test. Instead we apply a Mann-Whitney U test \parencite{mann_test_1947} on the per-session gap scores between the two groups. The Mann-Whitney test is a rank-based nonparametric test that compares the ordering of two independent samples without distributional assumptions, making it appropriate for small samples drawn from skewed populations.
@@ -305,7 +305,7 @@ We also consider taxation-like overlays for agent traffic under strategy-proof m
\subsubsection{Pricing Mechanism Summary} \subsubsection{Pricing Mechanism Summary}
We now present the complete pricing mechanism that integrates the behavioral separability, contamination estimation, and robust optimization components developed in the preceding sections. The defensive pricing loop algorithm formalizes the process as a Stackelberg game where the platform (leader) sets prices and the aggregate demand (follower) responds through observed session trajectories. We now present the complete pricing mechanism that integrates the behavioral distinguishability, contamination estimation, and robust optimization components developed in the preceding sections. The defensive pricing loop algorithm formalizes the process as a Stackelberg game where the platform (leader) sets prices and the aggregate demand (follower) responds through observed session trajectories.
\begin{algorithm}[t] \begin{algorithm}[t]
\caption{PHANTOM defensive pricing loop} \caption{PHANTOM defensive pricing loop}

View File

@@ -1,14 +1,14 @@
\section{Results} \section{Results}
\begin{figure}[ht] \begin{figure}[ht]
\centering \centering
\input{chapters/figures/supra.tex} \input{chapters/figures/supra/supra.tex}
\caption{Evolution of price distributions over experiment steps. The heatmap illustrates the density of price offerings. This is an early baseline simulation that demonstrates supra-competitive price-setting in deep learning agents such as SAC, as can be clearly seen from the high density at the highest available price.} \caption{Evolution of price distributions over experiment steps. The heatmap illustrates the density of price offerings. This is an early baseline simulation that demonstrates supra-competitive price-setting in deep learning agents such as SAC, as can be clearly seen from the high density at the highest available price.}
\label{fig:supra_heatmap} \label{fig:supra_heatmap}
\end{figure} \end{figure}
\subsection{Behavioral Analysis} \subsection{Behavioral Analysis}
Separability between human and agent sessions is evaluated by computing per-session divergence gap scores (how much closer each session is to the human baseline versus the agent baseline) and comparing the two groups with a Mann-Whitney U test. The table below reports the group-level descriptive statistics for the gap scores and the test result. Distinguishability between human and agent sessions is evaluated by computing per-session divergence gap scores (how much closer each session is to the human baseline versus the agent baseline) and comparing the two groups with a Mann-Whitney U test. The full recorded cohort contains 13 human sessions and 16 agent sessions, and the table below reports the corresponding group-level statistics and test result.
\begin{table}[ht] \begin{table}[ht]
\centering \centering
@@ -18,19 +18,19 @@ Separability between human and agent sessions is evaluated by computing per-sess
\toprule \toprule
Group & n & Mean gap & Std \\ Group & n & Mean gap & Std \\
\midrule \midrule
Human sessions & 11 & $-3.3522$ & $2.6748$ \\ Human sessions & 13 & $-3.35$ & $2.67$ \\
Agent sessions & 6 & $+1.6482$ & $2.8349$ \\ Agent sessions & 16 & $+1.65$ & $2.83$ \\
\midrule \midrule
\multicolumn{4}{l}{Mann-Whitney $U = 2.0$, $p = 0.0006$ (two-sided)} \\ \multicolumn{4}{l}{Mann-Whitney two-sided test: $p<0.001$} \\
\bottomrule \bottomrule
\end{tabular} \end{tabular}
\end{table} \end{table}
The sign structure is consistent with the theoretical expectation: human sessions produce negative gap scores (closer to the human centroid, far from the agent centroid) while agent sessions produce positive gap scores (closer to the agent centroid). The two-sided p-value of 0.0006 (that is, if both groups shared the same gap-score distribution, a rank separation at least this extreme would arise with a probability of only 0.06\%) indicates near-complete rank separation between the groups at $n=11$ humans and $n=6$ agents, providing strong evidence that the transition kernels are separable enough to justify their use as a control signal in downstream pricing. The sign structure is consistent with the theoretical expectation: human sessions produce negative gap scores (closer to the human centroid, far from the agent centroid) while agent sessions produce positive gap scores (closer to the agent centroid). The two-sided test result ($p < 0.001$) at $n=13$ humans and $n=16$ agents indicates strong rank distinction between groups, providing evidence that the transition kernels are distinguishable enough to justify their use as a control signal in downstream pricing.
\subsection{Experimental Outcomes} \subsection{Experimental Outcomes}
To evaluate robustness contributions, we compare two policies on the same environment family: (i) robust pricing with COI-aware reward and adversarial contamination step, and (ii) non-robust baseline with revenue-only reward (no-robust flag). To evaluate robustness contributions, we compare two policies on the same environment family: (i) robust pricing with COI-aware reward and adversarial contamination step, and (ii) a baseline policy with revenue-only reward.
\begin{table}[ht] \begin{table}[ht]
\centering \centering
@@ -41,7 +41,7 @@ To evaluate robustness contributions, we compare two policies on the same enviro
Policy & Eval reward & Eval revenue & COI leakage & Margin collapse rate \\ Policy & Eval reward & Eval revenue & COI leakage & Margin collapse rate \\
\midrule \midrule
Robust policy & \textit{TBD} & \textit{TBD} & \textit{TBD} & \textit{TBD} \\ Robust policy & \textit{TBD} & \textit{TBD} & \textit{TBD} & \textit{TBD} \\
Non-robust baseline (\texttt{--no-robust}) & \textit{TBD} & \textit{TBD} & \textit{TBD} & \textit{TBD} \\ Baseline policy & \textit{TBD} & \textit{TBD} & \textit{TBD} & \textit{TBD} \\
\bottomrule \bottomrule
\end{tabular} \end{tabular}
\end{table} \end{table}
@@ -50,6 +50,6 @@ This comparison isolates the effect of robustness terms from model capacity and
\subsection{Interpretation and Insights} \subsection{Interpretation and Insights}
The Mann-Whitney result ($U = 2.0$, $p < 0.001$) confirms that per-session divergence gaps separate the two actor classes with near-zero overlap in rank ordering. This is the condition required for separability to act as a useful control signal in the pricing loop rather than just an auxiliary classifier score. The Mann-Whitney result ($p < 0.001$) confirms that per-session divergence gaps distinguish the two actor classes with near-zero overlap in rank ordering. This is the condition required for distinguishability to act as a useful control signal in the pricing loop rather than just an auxiliary classifier score.
\subsection{Anomalies} \subsection{Anomalies}

Some files were not shown because too many files have changed in this diff Show More