updating computation power graph

This commit is contained in:
2026-03-08 14:22:54 +01:00
parent 17c128cbc0
commit 28dbcacd95
6 changed files with 142 additions and 114 deletions

130
Makefile
View File

@@ -54,160 +54,84 @@ $(BUILDDIR):
mkdir -p paper/$(BUILDDIR)
.PHONY: pdf.build
pdf.build: $(BUILDDIR)
@bash paper/concat_code.sh
@cd $(SRCDIR) && \
$(LATEXMK) -pdf -jobname=$(JOBNAME) -f \
-interaction=nonstopmode -file-line-error \
-r ../.latexmkrc \
-outdir=../$(BUILDDIR) $(TEX)
pdf.build:
@$(NX) run paper:build
.PHONY: pdf.watch
pdf.watch: $(BUILDDIR)
@cd $(SRCDIR) && \
$(LATEXMK) -pvc -pdf -jobname=$(JOBNAME) -f \
-interaction=nonstopmode -file-line-error \
-r ../.latexmkrc \
-outdir=../$(BUILDDIR) $(TEX)
pdf.watch:
@$(NX) run paper:watch
.PHONY: pdf.clean
pdf.clean:
@cd $(SRCDIR) && \
$(LATEXMK) -C -jobname=$(JOBNAME) -outdir=../$(BUILDDIR) || true
rm -rf paper/$(BUILDDIR)/*
@$(NX) run paper:clean
.PHONY: test.backend
test.backend: $(VENV)
$(PYTEST) -v
test.backend:
@$(NX) run research:test
.PHONY: test.e2e
test.e2e:
@cd tests/e2e && npm install
@cd tests/e2e && npx playwright install chromium
@test -f tests/e2e/.env || cp tests/e2e/.env.example tests/e2e/.env
@timeout 30 bash -c 'until curl -sf http://localhost:5000/health > /dev/null 2>&1; do sleep 1; done' || (echo "Backend not ready" && exit 1)
@timeout 30 bash -c 'until curl -sf http://localhost:3000 > /dev/null 2>&1; do sleep 1; done' || (echo "Web app not ready" && exit 1)
@timeout 30 bash -c 'until curl -sf http://localhost:8085/health > /dev/null 2>&1; do sleep 1; done' || (echo "Airflow not ready" && exit 1)
@cd tests/e2e && npm test
@$(NX) run e2e:test
.PHONY: test.all
test.all: test.backend test.e2e
test.all:
@$(NX) run-many -t test --projects=research,e2e --parallel=1
.PHONY: web.dev
web.dev:
@cd web && npm install && npm run dev
@$(NX) run web:dev
$(VENV):
python3 -m venv $(VENV)
$(PIP) install --upgrade pip
.PHONY: install
install: $(VENV)
$(PIP) install -r requirements.txt
install:
@$(NX) run research:install
.PHONY: train
train: install
@$(SWEEP_ENV_LOAD); test -n "$$WANDB_API_KEY" || (echo "WANDB_API_KEY required — set it in $(SWEEP_ENV_FILE)" && exit 1)
@$(SWEEP_ENV_LOAD); WANDB_API_KEY="$$WANDB_API_KEY" WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" \
$(PYTHON) -m engine.train $(LOCAL_TRAIN_ARGS)
train:
@WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" LOCAL_TRAIN_ARGS="$(LOCAL_TRAIN_ARGS)" $(NX) run research:train
.PHONY: train.agent
train.agent: install
@$(SWEEP_ENV_LOAD); test -n "$$WANDB_API_KEY" || (echo "WANDB_API_KEY required — set it in $(SWEEP_ENV_FILE)" && exit 1)
@test -n "$(SWEEP_ID)" || (echo "SWEEP_ID required, e.g. SWEEP_ID=entity/project/id" && exit 1)
@$(SWEEP_ENV_LOAD); WANDB_API_KEY="$$WANDB_API_KEY" WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" \
$(PYTHON) -m engine.train --sweep-agent --sweep-id "$(SWEEP_ID)" \
$(if $(filter-out 0,$(AGENT_COUNT)),--count $(AGENT_COUNT),)
train.agent:
@WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" SWEEP_ID="$(SWEEP_ID)" AGENT_COUNT="$(AGENT_COUNT)" $(NX) run research:train-agent
.PHONY: train.bootstrap
train.bootstrap:
@$(SWEEP_ENV_LOAD); test -n "$$WANDB_API_KEY" || (echo "WANDB_API_KEY required — set it in $(SWEEP_ENV_FILE)" && exit 1)
@$(SWEEP_ENV_LOAD); test -n "$$GITHUB_TOKEN" || (echo "GITHUB_TOKEN required — set it in $(SWEEP_ENV_FILE)" && exit 1)
@test -n "$(REPO_URL)" || (echo "REPO_URL required, e.g. REPO_URL=https://github.com/org/repo.git" && exit 1)
@test -n "$(SWEEP_ID)" || (echo "SWEEP_ID required, e.g. SWEEP_ID=entity/project/id" && exit 1)
@$(SWEEP_ENV_LOAD); \
WANDB_API_KEY="$$WANDB_API_KEY" \
WANDB_ENTITY="$(WANDB_ENTITY)" \
WANDB_PROJECT="$(WANDB_PROJECT)" \
GITHUB_TOKEN="$$GITHUB_TOKEN" \
REPO_URL="$(REPO_URL)" \
BRANCH="$(BRANCH)" \
WORKDIR="$(WORKDIR)" \
SWEEP_ID="$(SWEEP_ID)" \
AGENT_COUNT="$(AGENT_COUNT)" \
AGENT_LOOP="$(AGENT_LOOP)" \
RETRY_SECONDS="$(RETRY_SECONDS)" \
bash scripts/wandb_agent_bootstrap.sh
@WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" REPO_URL="$(REPO_URL)" BRANCH="$(BRANCH)" WORKDIR="$(WORKDIR)" SWEEP_ID="$(SWEEP_ID)" AGENT_COUNT="$(AGENT_COUNT)" AGENT_LOOP="$(AGENT_LOOP)" RETRY_SECONDS="$(RETRY_SECONDS)" $(NX) run research:train-bootstrap
.PHONY: stats.lines
stats.lines:
@find . \( -path '*/node_modules' -o -path '*/.venv' -o -path '*/venv' \) -prune -o \
\( -name "*.ts" -o -name "*.py" \) -type f -print0 | xargs -0 cat | wc -l
@$(NX) run research:stats
.PHONY: wordcount
wordcount:
@echo "Counting words in main text (excluding appendix)..."
@texcount -nosub -total -sum -1 \
$(SRCDIR)/chapters/01-intro.tex \
$(SRCDIR)/chapters/02-literature-review.tex \
$(SRCDIR)/chapters/03-methodology.tex \
$(SRCDIR)/chapters/04-results.tex \
$(SRCDIR)/chapters/05-discussion.tex \
$(SRCDIR)/chapters/06-conclusion.tex
@$(NX) run paper:wordcount
.PHONY: docker.train.publish
docker.train.publish:
docker build -f docker/Trainer.dockerfile --target gpu -t $(TRAIN_IMAGE_REF):gpu-latest .
docker push $(TRAIN_IMAGE_REF):gpu-latest
docker build -f docker/Trainer.dockerfile --target tpu -t $(TRAIN_IMAGE_REF):tpu-latest .
docker push $(TRAIN_IMAGE_REF):tpu-latest
@TRAIN_IMAGE_REF="$(TRAIN_IMAGE_REF)" $(NX) run research:docker-train-publish
.PHONY: train.tpu.pod
train.tpu.pod:
@test -n "$(TPU_NAME)" || (echo "TPU_NAME required, e.g. TPU_NAME=TPUlong" && exit 1)
@test -n "$(SWEEP_ID)" || (echo "SWEEP_ID required, e.g. SWEEP_ID=entity/project/id" && exit 1)
@$(SWEEP_ENV_LOAD); test -n "$$WANDB_API_KEY" || (echo "WANDB_API_KEY required — set it in $(SWEEP_ENV_FILE)" && exit 1)
gcloud compute tpus tpu-vm scp scripts/tpu_pod_run.sh $(TPU_NAME):/tmp/tpu_pod_run.sh \
--zone=$(TPU_ZONE) --project=$(TPU_PROJECT) --worker=all
@$(SWEEP_ENV_LOAD); \
gcloud compute tpus tpu-vm ssh $(TPU_NAME) \
--zone=$(TPU_ZONE) --project=$(TPU_PROJECT) --worker=all \
--command="WANDB_API_KEY='$$WANDB_API_KEY' SWEEP_ID='$(SWEEP_ID)' AGENT_COUNT='$(AGENT_COUNT)' sh /tmp/tpu_pod_run.sh"
@TPU_NAME="$(TPU_NAME)" TPU_ZONE="$(TPU_ZONE)" TPU_PROJECT="$(TPU_PROJECT)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" SWEEP_ID="$(SWEEP_ID)" AGENT_COUNT="$(AGENT_COUNT)" $(NX) run research:train-tpu-pod
.PHONY: train.tpu.vm.prepare
train.tpu.vm.prepare:
@test -n "$(TPU_NAME)" || (echo "TPU_NAME required, e.g. TPU_NAME=TPUlong" && exit 1)
TPU_NAME="$(TPU_NAME)" TPU_ZONE="$(TPU_ZONE)" TPU_PROJECT="$(TPU_PROJECT)" \
LOCAL_REPO_DIR="$(CURDIR)" REMOTE_REPO_DIR="$(TPU_REPO_DIR)" \
sh scripts/tpu_sync_repo.sh
gcloud compute tpus tpu-vm scp scripts/tpu_vm_train.sh $(TPU_NAME):/tmp/tpu_vm_train.sh \
--zone=$(TPU_ZONE) --project=$(TPU_PROJECT) --worker=all
@TPU_NAME="$(TPU_NAME)" TPU_ZONE="$(TPU_ZONE)" TPU_PROJECT="$(TPU_PROJECT)" TPU_REPO_DIR="$(TPU_REPO_DIR)" $(NX) run research:train-tpu-vm-prepare
.PHONY: train.tpu.vm.run
train.tpu.vm.run:
@test -n "$(TPU_NAME)" || (echo "TPU_NAME required, e.g. TPU_NAME=TPUlong" && exit 1)
@test -n "$(LOCAL_TRAIN_ARGS)" || (echo "LOCAL_TRAIN_ARGS required, e.g. --algo ppo --jax --total-timesteps 200000" && exit 1)
@$(SWEEP_ENV_LOAD); \
gcloud compute tpus tpu-vm ssh $(TPU_NAME) \
--zone=$(TPU_ZONE) --project=$(TPU_PROJECT) --worker=all \
--command="REPO_DIR='$(TPU_REPO_DIR)' TRAIN_ARGS='$(LOCAL_TRAIN_ARGS)' WANDB_API_KEY='$$WANDB_API_KEY' sh /tmp/tpu_vm_train.sh"
@TPU_NAME="$(TPU_NAME)" TPU_ZONE="$(TPU_ZONE)" TPU_PROJECT="$(TPU_PROJECT)" TPU_REPO_DIR="$(TPU_REPO_DIR)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" LOCAL_TRAIN_ARGS="$(LOCAL_TRAIN_ARGS)" $(NX) run research:train-tpu-vm-run
.PHONY: train.tpu.vm
train.tpu.vm: train.tpu.vm.prepare train.tpu.vm.run
train.tpu.vm:
@TPU_NAME="$(TPU_NAME)" TPU_ZONE="$(TPU_ZONE)" TPU_PROJECT="$(TPU_PROJECT)" TPU_REPO_DIR="$(TPU_REPO_DIR)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" LOCAL_TRAIN_ARGS="$(LOCAL_TRAIN_ARGS)" $(NX) run research:train-tpu-vm
.PHONY: train.tpu.vm.sweep
train.tpu.vm.sweep:
@test -n "$(TPU_NAME)" || (echo "TPU_NAME required, e.g. TPU_NAME=TPUlong" && exit 1)
@test -n "$(SWEEP_ID)" || (echo "SWEEP_ID required, e.g. SWEEP_ID=lusiana/phantom-pricing/abc123" && exit 1)
@$(SWEEP_ENV_LOAD); test -n "$$WANDB_API_KEY" || (echo "WANDB_API_KEY required — set it in $(SWEEP_ENV_FILE)" && exit 1)
@$(SWEEP_ENV_LOAD); WANDB_API_KEY="$$WANDB_API_KEY" \
python3 scripts/tpu_vm_sweep_agent.py \
--sweep-id "$(SWEEP_ID)" \
--tpu-name "$(TPU_NAME)" \
--tpu-zone "$(TPU_ZONE)" \
--tpu-project "$(TPU_PROJECT)" \
--tpu-repo-dir "$(TPU_REPO_DIR)" \
$(if $(filter-out 0,$(AGENT_COUNT)),--count $(AGENT_COUNT),)
@TPU_NAME="$(TPU_NAME)" TPU_ZONE="$(TPU_ZONE)" TPU_PROJECT="$(TPU_PROJECT)" TPU_REPO_DIR="$(TPU_REPO_DIR)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" SWEEP_ID="$(SWEEP_ID)" AGENT_COUNT="$(AGENT_COUNT)" $(NX) run research:train-tpu-vm-sweep
.PHONY: backend.server backend.provider backend.worker platform.up platform.down platform.logs
backend.server:

View File

@@ -7,7 +7,7 @@
"install": {
"executor": "nx:run-commands",
"options": {
"command": "make install",
"command": "bash scripts/nx_research.sh install",
"cwd": "."
}
},
@@ -17,7 +17,7 @@
"install"
],
"options": {
"command": "make test.backend",
"command": ".venv/bin/pytest -v",
"cwd": "."
}
},
@@ -27,14 +27,76 @@
"install"
],
"options": {
"command": "make train",
"command": "bash scripts/nx_research.sh train",
"cwd": "."
}
},
"train-agent": {
"executor": "nx:run-commands",
"dependsOn": [
"install"
],
"options": {
"command": "bash scripts/nx_research.sh train-agent",
"cwd": "."
}
},
"train-bootstrap": {
"executor": "nx:run-commands",
"options": {
"command": "bash scripts/nx_research.sh train-bootstrap",
"cwd": "."
}
},
"stats": {
"executor": "nx:run-commands",
"options": {
"command": "make stats.lines",
"command": "bash scripts/nx_research.sh stats",
"cwd": "."
}
},
"docker-train-publish": {
"executor": "nx:run-commands",
"options": {
"command": "bash scripts/nx_research.sh docker-train-publish",
"cwd": "."
}
},
"train-tpu-pod": {
"executor": "nx:run-commands",
"options": {
"command": "bash scripts/nx_research.sh train-tpu-pod",
"cwd": "."
}
},
"train-tpu-vm-prepare": {
"executor": "nx:run-commands",
"options": {
"command": "bash scripts/nx_research.sh train-tpu-vm-prepare",
"cwd": "."
}
},
"train-tpu-vm-run": {
"executor": "nx:run-commands",
"options": {
"command": "bash scripts/nx_research.sh train-tpu-vm-run",
"cwd": "."
}
},
"train-tpu-vm": {
"executor": "nx:run-commands",
"dependsOn": [
"train-tpu-vm-prepare"
],
"options": {
"command": "bash scripts/nx_research.sh train-tpu-vm-run",
"cwd": "."
}
},
"train-tpu-vm-sweep": {
"executor": "nx:run-commands",
"options": {
"command": "bash scripts/nx_research.sh train-tpu-vm-sweep",
"cwd": "."
}
}

View File

@@ -10,21 +10,28 @@
"{projectRoot}/build"
],
"options": {
"command": "make pdf.build",
"command": "bash scripts/nx_paper.sh build",
"cwd": "."
}
},
"watch": {
"executor": "nx:run-commands",
"options": {
"command": "make pdf.watch",
"command": "bash scripts/nx_paper.sh watch",
"cwd": "."
}
},
"clean": {
"executor": "nx:run-commands",
"options": {
"command": "make pdf.clean",
"command": "bash scripts/nx_paper.sh clean",
"cwd": "."
}
},
"wordcount": {
"executor": "nx:run-commands",
"options": {
"command": "bash scripts/nx_paper.sh wordcount",
"cwd": "."
}
}

View File

@@ -210,8 +210,7 @@ The simulator has multiple configurable factors. We design a multi-factor study
% Power analysis plan: apply a two-sample Mann-Whitney U (or permutation test) on per-session (delta_H - delta_A) divergence scores comparing the human and agent groups. Compute minimum detectable effect size at alpha=0.05, power=0.8, given n=18 per group. Bootstrap confidence intervals on mean KL are a cleaner complement given the non-normality of divergence distributions.
While this scale is generally expensive for reinforcement learning, we execute it on a large TPU cluster to make the sweep tractable.
% TODO: cite in the appendix the math to get to 160 petaflops of compute
Our training budget is provisioned through TPU Research Cloud and spans 384 chips across TPU v4, v5e, and v6e generations, with a spot-heavy allocation plus an on-demand reserve. At peak BF16 throughput this corresponds to approximately 160 PFLOPS of aggregate compute, which makes repeated seeds, ablations, and sensitivity sweeps feasible within practical wall-clock limits. We allocate v6e capacity to the highest-intensity policy training jobs, use v5e for wider hyperparameter exploration where throughput-per-dollar is favorable, and reserve on-demand v4 capacity for runs that should not be interrupted.
Our training budget is provisioned through TPU Research Cloud and spans 384 chips across TPU v4, v5e, and v6e generations, with a spot-heavy allocation plus an on-demand reserve. At peak BF16 throughput this corresponds to approximately 160\,PFLOPS of aggregate compute (derivation in Appendix~\ref{app:compute_budget}), which makes repeated seeds, ablations, and sensitivity sweeps feasible within practical wall-clock limits. We allocate v6e capacity to the highest-intensity policy training jobs, use v5e for wider hyperparameter exploration where throughput-per-dollar is favorable, and reserve on-demand v4 capacity for runs that should not be interrupted.
\begin{table}[ht]
\centering

View File

@@ -53,6 +53,31 @@ These behavioral signals serve as inputs for a Distributionally Robust Reinforce
\item[Trajectory] Defined as a series of unspecified length, collecting data on states of some object over time.
% TODO: maybe define other things in a similarly succinct manner
\end{description}
\section{Aggregate Compute Budget Derivation}
\label{app:compute_budget}
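The claimed peak throughput of approximately 160\,PFLOPS follows from multiplying the per-chip BF16 peak (from official Google Cloud TPU documentation) by the number of chips in each allocation tier and summing across generations.
% NOTE(review): the table below sums to 320 chips, whereas the main text states that the allocation spans 384 chips -- confirm which figure is correct, or state explicitly which chips are excluded from the peak-throughput derivation.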
\begin{table}[ht]
\centering
\caption{Per-generation contribution to aggregate BF16 throughput.}
\label{tab:compute_derivation}
\begin{tabular}{@{}lrrr@{}}
\toprule
\textbf{TPU Gen.} & \textbf{Chips} & \textbf{Peak BF16/chip (TFLOPS)} & \textbf{Subtotal (TFLOPS)} \\
\midrule
v6e (Trillium) & 128 & 918 & $128 \times 918 = 117{,}504$ \\
v5e & 128 & 197 & $128 \times 197 = 25{,}216$ \\
v4 & 64 & 275 & $64 \times 275 = 17{,}600$ \\
\midrule
\textbf{Total} & \textbf{320} & & $\mathbf{160{,}320}$ \\
\bottomrule
\end{tabular}
\end{table}
Converting to petaFLOPS: $160{,}320\;\text{TFLOPS} = 160.32\;\text{PFLOPS} \approx 160\;\text{PFLOPS}$. This is the theoretical peak under sustained BF16 arithmetic; realized throughput depends on memory bandwidth utilization and inter-chip communication overhead, but the figure serves as a useful upper bound for provisioning decisions.
% \input{../build/concatenated_code}
\end{document}

View File

@@ -13,12 +13,23 @@
},
"test": {
"executor": "nx:run-commands",
"dependsOn": [
"install"
],
"outputs": [
"{projectRoot}/test-results"
],
"options": {
"command": "make test.e2e",
"cwd": "."
"commands": [
"npx playwright install chromium",
"test -f .env || cp .env.example .env",
"timeout 30 bash -c \"until curl -sf http://localhost:5000/health > /dev/null 2>&1; do sleep 1; done\" || (echo 'Backend not ready' && exit 1)",
"timeout 30 bash -c \"until curl -sf http://localhost:3000 > /dev/null 2>&1; do sleep 1; done\" || (echo 'Web app not ready' && exit 1)",
"timeout 30 bash -c \"until curl -sf http://localhost:8085/health > /dev/null 2>&1; do sleep 1; done\" || (echo 'Airflow not ready' && exit 1)",
"npm test"
],
"parallel": false,
"cwd": "tests/e2e"
}
},
"test-ui": {