fixing models for gcp

This commit is contained in:
2026-02-17 16:54:55 +01:00
parent 802f31b4a1
commit 9acc998cc9
5 changed files with 497 additions and 193 deletions

116
Makefile
View File

@@ -13,8 +13,8 @@ TPU_ZONE ?= us-central2-b
# Defaults used when creating the TPU VM. Each is assigned with `?=`, so a
# value from the environment or the make command line takes precedence.
# TPU_CREATE_CMD passes type, runtime, project, network and subnetwork to
# `gcloud ... compute tpus tpu-vm create`.
TPU_TYPE ?= v4-32
TPU_RUNTIME ?= tpu-vm-v4-base
TPU_PROJECT ?= phantom-trc
# NOTE(review): TPU_NETWORK and TPU_SUBNETWORK are each assigned twice below.
# With `?=` only the first assignment of an unset variable takes effect, so
# read literally the `default` / `default-us-central2` values would win. The
# pairs look like the before/after lines of a diff -- confirm which pair is
# actually present in the Makefile.
TPU_NETWORK ?= default
TPU_SUBNETWORK ?= default-us-central2
TPU_NETWORK ?= tpu-network
TPU_SUBNETWORK ?= tpu-network
# One of `1 true TRUE yes YES` makes TPU_SPOT_FLAG expand to --spot.
TPU_USE_SPOT ?= 0
# Appended verbatim at the end of TPU_CREATE_CMD.
TPU_EXTRA_CREATE_FLAGS ?=
# NOTE(review): not referenced in the visible part of the file; presumably the
# remote working directory on the TPU VM -- confirm against the deploy targets.
TPU_WORKDIR ?= ~/PHANTOM
@@ -22,7 +22,29 @@ TPU_SYNC_PATHS ?= engine lib requirements.txt Makefile .env
# Overridable defaults for the TPU training/install targets.
# NOTE(review): none of these are referenced in the visible part of the file;
# their consumers are presumably the tpu.install / tpu.train style targets --
# confirm.
TPU_TRAIN_ARGS ?= --algo ppo --jax --total-timesteps 20000
TPU_JAX_WHEEL_URL ?= https://storage.googleapis.com/jax-releases/libtpu_releases.html
TPU_VENV ?= .venv-tpu
# NOTE(review): TPU_TRAIN_ENV is assigned twice. With `?=` the first assignment
# (WANDB_MODE=offline) is the one that takes effect if both lines are really in
# the file. They look like the before/after lines of a diff -- confirm that only
# the WANDB_MODE=online line is live.
TPU_TRAIN_ENV ?= PHANTOM_USE_JAX=1 WANDB_MODE=offline
TPU_TRAIN_ENV ?= PHANTOM_USE_JAX=1 WANDB_MODE=online
# ---------------------------------------------------------------------------
# Queued-sweep settings. All are `?=` defaults, so the environment or the make
# command line can override them. tpu.queue.sweep / tpu.queue.worker forward
# most of them to QUEUE_SCRIPT as environment variables with the `QUEUE_`
# part of the name dropped (e.g. TPU_QUEUE_KEEP_ALIVE -> TPU_KEEP_ALIVE).
# ---------------------------------------------------------------------------

# Script that is executed, the sweep it receives as its single argument
# (required, no default), and the value exported to it as AGENT_COUNT.
QUEUE_SCRIPT ?= scripts/queue_sweep.sh
SWEEP_ID ?=
SWEEP_COUNT ?= 5

# Zones iterated by tpu.queue.status and tpu.queue.clean.
TPU_QUEUE_ZONES ?= europe-west4-a us-central2-b us-central1-a us-east1-d europe-west4-b

# Filters. tpu.queue.sweep forwards TPU_QUEUE_FILTER_ZONE / TPU_QUEUE_FILTER_TYPE
# unchanged; tpu.queue.worker sends TPU_ZONE and TPU_QUEUE_TYPE in their place.
TPU_QUEUE_FILTER_ZONE ?=
TPU_QUEUE_FILTER_TYPE ?=
TPU_QUEUE_TYPE ?=

# Provisioning switches (forwarded as TPU_REUSE_EXISTING, TPU_KEEP_ALIVE,
# TPU_STRICT_QUOTA, TPU_DOWNSHIFT_ON_QUOTA).
TPU_QUEUE_REUSE_EXISTING ?= 1
TPU_QUEUE_KEEP_ALIVE ?= 1
TPU_QUEUE_STRICT_QUOTA ?= 0
TPU_QUEUE_DOWNSHIFT_ON_QUOTA ?= 1

# Execution and code sync (forwarded as TPU_EXECUTION_MODE, TPU_SYNC_METHOD,
# TPU_SKIP_SYNC). The *.docker targets force the execution mode to `docker`.
TPU_QUEUE_EXECUTION_MODE ?= venv
TPU_QUEUE_SYNC_METHOD ?= tar
TPU_QUEUE_SKIP_SYNC ?= 0

# Docker image used by the tpu.queue.*.docker and tpu.queue.docker.* targets,
# which refuse to run while it is empty.
TPU_QUEUE_DOCKER_IMAGE ?=
TPU_QUEUE_DOCKER_PULL ?= 1
TPU_QUEUE_DOCKER_AUTO_INSTALL ?= 1

# SSH. With TPU_QUEUE_AUTO_SSH_ADD=1, batch mode not "0" and the key file
# present, the queue targets run ssh-add on TPU_QUEUE_SSH_KEY_FILE when
# `ssh-add -l` reports failure.
TPU_QUEUE_SSH_KEY_FILE ?= $(HOME)/.ssh/google_compute_engine
TPU_QUEUE_SSH_BATCH_MODE ?= 1
TPU_QUEUE_SSH_CONNECT_TIMEOUT ?= 12
TPU_QUEUE_REQUIRE_SSH_AGENT ?= 1
TPU_QUEUE_AUTO_SSH_ADD ?= 1
# Expands to --spot when TPU_USE_SPOT is one of `1 true TRUE yes YES`, and to
# nothing otherwise. `:=` evaluates it once, while the Makefile is parsed.
TPU_SPOT_FLAG := $(if $(filter 1 true TRUE yes YES,$(TPU_USE_SPOT)),--spot)

# Complete `gcloud` invocation that creates the TPU VM. Kept recursive (`=`) so
# the TPU_* values are looked up when the command is used. The continuation
# lines collapse to single spaces, so the expanded text is one command line.
TPU_CREATE_CMD = gcloud --project="$(TPU_PROJECT)" \
  compute tpus tpu-vm create "$(TPU_NAME)" \
  --zone="$(TPU_ZONE)" \
  --accelerator-type="$(TPU_TYPE)" \
  --version="$(TPU_RUNTIME)" \
  --network="$(TPU_NETWORK)" \
  --subnetwork="$(TPU_SUBNETWORK)" \
  $(TPU_SPOT_FLAG) $(TPU_EXTRA_CREATE_FLAGS)
@@ -30,8 +52,13 @@ TPU_CREATE_CMD = gcloud --project="$(TPU_PROJECT)" compute tpus tpu-vm create "$
# Print a short overview of the available target families and the most common
# queued-sweep invocations. Every line is a silenced (`@`) echo.
# NOTE(review): the first two echo lines are identical except for the trailing
# "| tpu.queue.*". They look like the before/after lines of a diff; read
# literally, both would be printed -- confirm that only the second is live.
.PHONY: help
help:
@echo "pdf.build pdf.watch pdf.clean | test.backend test.e2e test.all | web.dev | install | stats.lines | tpu.*"
@echo "pdf.build pdf.watch pdf.clean | test.backend test.e2e test.all | web.dev | install | stats.lines | tpu.* | tpu.queue.*"
@echo "TPU presets: tpu.create.v4.ondemand | tpu.create.v4.spot"
@echo "Queued sweep: SWEEP_ID=entity/project/id make tpu.queue.sweep"
@echo "Queued sweep filters: TPU_QUEUE_FILTER_TYPE=v6e TPU_QUEUE_FILTER_ZONE=europe-west4-a"
@echo "Docker queue: make tpu.queue.sweep.docker TPU_QUEUE_DOCKER_IMAGE=gcr.io/<project>/<image>:tag"
@echo "Docker queue without sync: add TPU_QUEUE_SKIP_SYNC=1"
@echo "If SSH key is encrypted: run ssh-add ~/.ssh/google_compute_engine first"
# Create the build output directory (mkdir -p, so existing directories are fine).
# NOTE(review): the recipe creates paper/$(BUILDDIR), not the target path
# $(BUILDDIR) itself, so the target may never appear to exist from make's
# point of view -- confirm this is intended.
$(BUILDDIR):
mkdir -p paper/$(BUILDDIR)
@@ -104,11 +131,11 @@ tpu.check.zone:
# Preset: re-invoke make on tpu.create for an on-demand (TPU_USE_SPOT=0) v4-32
# in us-central2-b.
# NOTE(review): two $(MAKE) lines follow that differ only in TPU_SUBNETWORK
# (default-us-central2 vs tpu-network). They look like the before/after lines
# of a diff; read literally, tpu.create would be invoked twice -- confirm that
# only the tpu-network line is live.
.PHONY: tpu.create.v4.ondemand
tpu.create.v4.ondemand:
$(MAKE) tpu.create TPU_ZONE=us-central2-b TPU_TYPE=v4-32 TPU_USE_SPOT=0 TPU_SUBNETWORK=default-us-central2
$(MAKE) tpu.create TPU_ZONE=us-central2-b TPU_TYPE=v4-32 TPU_USE_SPOT=0 TPU_SUBNETWORK=tpu-network
# Preset: re-invoke make on tpu.create for a spot (TPU_USE_SPOT=1) v4-32 in
# us-central2-b.
# NOTE(review): two $(MAKE) lines follow that differ only in TPU_SUBNETWORK
# (default-us-central2 vs tpu-network). They look like the before/after lines
# of a diff; read literally, tpu.create would be invoked twice -- confirm that
# only the tpu-network line is live.
.PHONY: tpu.create.v4.spot
tpu.create.v4.spot:
$(MAKE) tpu.create TPU_ZONE=us-central2-b TPU_TYPE=v4-32 TPU_USE_SPOT=1 TPU_SUBNETWORK=default-us-central2
$(MAKE) tpu.create TPU_ZONE=us-central2-b TPU_TYPE=v4-32 TPU_USE_SPOT=1 TPU_SUBNETWORK=tpu-network
.PHONY: tpu.create
tpu.create: tpu.check.zone
@@ -179,6 +206,83 @@ tpu.bootstrap: tpu.ensure tpu.deploy tpu.install
# Delete the TPU VM named TPU_NAME in TPU_ZONE of project TPU_PROJECT.
# --quiet is passed so gcloud does not stop for interactive confirmation.
tpu.delete:
	gcloud --project="$(TPU_PROJECT)" \
		compute tpus tpu-vm delete "$(TPU_NAME)" \
		--zone="$(TPU_ZONE)" \
		--quiet
# ---------------------------------------------------------------------------
# Shared pieces of tpu.queue.sweep and tpu.queue.worker.
#
# The two targets used to carry identical copies of the validation, the SSH
# bootstrap and a ~20-variable environment prefix; only the two filter
# variables differ. The copies are factored out here so they cannot drift.
#
# The helpers are recursive (`=`) variables: they are expanded inside the
# recipe, so every `$$` reaches the shell as a single `$`. Backslash-newlines
# in a variable definition collapse to one space, which makes each helper a
# single shell fragment that can be spliced into the one-shell recipes below.
# ---------------------------------------------------------------------------

# Exit 1 with a message on stderr when SWEEP_ID (make variable) or
# WANDB_API_KEY (shell environment) is empty.
tpu_queue_preflight = \
  test -n "$(SWEEP_ID)" || { echo "SWEEP_ID is required, e.g. SWEEP_ID=entity/project/id" >&2; exit 1; }; \
  test -n "$$WANDB_API_KEY" || { echo "WANDB_API_KEY is required in your shell" >&2; exit 1; }

# Load TPU_QUEUE_SSH_KEY_FILE into an ssh-agent when all of these hold:
# TPU_QUEUE_AUTO_SSH_ADD is "1", TPU_QUEUE_SSH_BATCH_MODE is not "0", ssh-add
# is installed, the key file exists, and `ssh-add -l` reports failure.
# If SSH_AUTH_SOCK is empty an agent is started first.
# NOTE(review): an agent started here is never stopped by this Makefile, so it
# outlives the make run -- confirm whether QUEUE_SCRIPT relies on that before
# adding any cleanup.
tpu_queue_ssh_setup = \
  if [ "$(TPU_QUEUE_AUTO_SSH_ADD)" = "1" ] && [ "$(TPU_QUEUE_SSH_BATCH_MODE)" != "0" ] \
     && command -v ssh-add >/dev/null 2>&1 && [ -f "$(TPU_QUEUE_SSH_KEY_FILE)" ]; then \
    if ! ssh-add -l >/dev/null 2>&1; then \
      if [ -z "$$SSH_AUTH_SOCK" ] && command -v ssh-agent >/dev/null 2>&1; then \
        eval "$$(ssh-agent -s)" >/dev/null; \
      fi; \
      ssh-add "$(TPU_QUEUE_SSH_KEY_FILE)"; \
    fi; \
  fi

# Environment assignments placed in front of QUEUE_SCRIPT by both targets.
# The two TPU_QUEUE_FILTER_* variables are added by each target itself.
tpu_queue_env = \
  AGENT_COUNT="$(SWEEP_COUNT)" \
  PROJECT_ID="$(TPU_PROJECT)" \
  TPU_NETWORK="$(TPU_NETWORK)" \
  TPU_SUBNETWORK="$(TPU_SUBNETWORK)" \
  TPU_REUSE_EXISTING="$(TPU_QUEUE_REUSE_EXISTING)" \
  TPU_KEEP_ALIVE="$(TPU_QUEUE_KEEP_ALIVE)" \
  TPU_STRICT_QUOTA="$(TPU_QUEUE_STRICT_QUOTA)" \
  TPU_DOWNSHIFT_ON_QUOTA="$(TPU_QUEUE_DOWNSHIFT_ON_QUOTA)" \
  TPU_EXECUTION_MODE="$(TPU_QUEUE_EXECUTION_MODE)" \
  TPU_SYNC_METHOD="$(TPU_QUEUE_SYNC_METHOD)" \
  TPU_SKIP_SYNC="$(TPU_QUEUE_SKIP_SYNC)" \
  TPU_DOCKER_IMAGE="$(TPU_QUEUE_DOCKER_IMAGE)" \
  TPU_DOCKER_PULL="$(TPU_QUEUE_DOCKER_PULL)" \
  TPU_DOCKER_AUTO_INSTALL="$(TPU_QUEUE_DOCKER_AUTO_INSTALL)" \
  TPU_SSH_BATCH_MODE="$(TPU_QUEUE_SSH_BATCH_MODE)" \
  TPU_SSH_CONNECT_TIMEOUT="$(TPU_QUEUE_SSH_CONNECT_TIMEOUT)" \
  TPU_SSH_KEY_FILE="$(TPU_QUEUE_SSH_KEY_FILE)" \
  TPU_REQUIRE_SSH_AGENT="$(TPU_QUEUE_REQUIRE_SSH_AGENT)" \
  WANDB_API_KEY="$$WANDB_API_KEY"

# Run QUEUE_SCRIPT for SWEEP_ID, forwarding the user-supplied
# TPU_QUEUE_FILTER_ZONE / TPU_QUEUE_FILTER_TYPE unchanged.
# The whole recipe is one shell invocation, so an agent started by the SSH
# bootstrap is visible to the script.
.PHONY: tpu.queue.sweep
tpu.queue.sweep:
	@set -e; \
	$(tpu_queue_preflight); \
	$(tpu_queue_ssh_setup); \
	$(tpu_queue_env) \
	TPU_QUEUE_FILTER_ZONE="$(TPU_QUEUE_FILTER_ZONE)" \
	TPU_QUEUE_FILTER_TYPE="$(TPU_QUEUE_FILTER_TYPE)" \
	"$(QUEUE_SCRIPT)" "$(SWEEP_ID)"

# Same as tpu.queue.sweep, except that the filters sent to the script are
# TPU_ZONE (as TPU_QUEUE_FILTER_ZONE) and TPU_QUEUE_TYPE (as
# TPU_QUEUE_FILTER_TYPE).
.PHONY: tpu.queue.worker
tpu.queue.worker:
	@set -e; \
	$(tpu_queue_preflight); \
	$(tpu_queue_ssh_setup); \
	$(tpu_queue_env) \
	TPU_QUEUE_FILTER_ZONE="$(TPU_ZONE)" \
	TPU_QUEUE_FILTER_TYPE="$(TPU_QUEUE_TYPE)" \
	"$(QUEUE_SCRIPT)" "$(SWEEP_ID)"
# Run tpu.queue.sweep in a sub-make with TPU_QUEUE_EXECUTION_MODE forced to
# `docker`. Stops with exit status 1 when TPU_QUEUE_DOCKER_IMAGE is empty.
.PHONY: tpu.queue.sweep.docker
tpu.queue.sweep.docker:
	@[ -n "$(TPU_QUEUE_DOCKER_IMAGE)" ] || { echo "TPU_QUEUE_DOCKER_IMAGE is required"; exit 1; }
	@$(MAKE) TPU_QUEUE_EXECUTION_MODE=docker tpu.queue.sweep
# Run tpu.queue.worker in a sub-make with TPU_QUEUE_EXECUTION_MODE forced to
# `docker`. Stops with exit status 1 when TPU_QUEUE_DOCKER_IMAGE is empty.
.PHONY: tpu.queue.worker.docker
tpu.queue.worker.docker:
	@[ -n "$(TPU_QUEUE_DOCKER_IMAGE)" ] || { echo "TPU_QUEUE_DOCKER_IMAGE is required"; exit 1; }
	@$(MAKE) TPU_QUEUE_EXECUTION_MODE=docker tpu.queue.worker
# Build docker/TPUSweep.Dockerfile with the current directory as build context
# and tag the result TPU_QUEUE_DOCKER_IMAGE. Stops with exit status 1 when
# TPU_QUEUE_DOCKER_IMAGE is empty.
.PHONY: tpu.queue.docker.build
tpu.queue.docker.build:
	@[ -n "$(TPU_QUEUE_DOCKER_IMAGE)" ] || { echo "TPU_QUEUE_DOCKER_IMAGE is required"; exit 1; }
	docker build \
		--file docker/TPUSweep.Dockerfile \
		--tag "$(TPU_QUEUE_DOCKER_IMAGE)" \
		.
# Push the image named by TPU_QUEUE_DOCKER_IMAGE to its registry.
# Stops with exit status 1 when TPU_QUEUE_DOCKER_IMAGE is empty.
.PHONY: tpu.queue.docker.push
tpu.queue.docker.push:
	@[ -n "$(TPU_QUEUE_DOCKER_IMAGE)" ] || { echo "TPU_QUEUE_DOCKER_IMAGE is required"; exit 1; }
	docker push "$(TPU_QUEUE_DOCKER_IMAGE)"
# List queued TPU resources in every zone of TPU_QUEUE_ZONES.
# The `alpha` command group is used unless
# `gcloud compute tpus queued-resources list --help` succeeds, in which case
# the non-alpha group is used instead. A zone whose listing fails is reported
# and skipped; it does not abort the loop.
.PHONY: tpu.queue.status
tpu.queue.status:
	@set -e; \
	qr='gcloud --project=$(TPU_PROJECT) alpha compute tpus queued-resources'; \
	if gcloud compute tpus queued-resources list --help >/dev/null 2>&1; then \
		qr='gcloud --project=$(TPU_PROJECT) compute tpus queued-resources'; \
	fi; \
	for zone in $(TPU_QUEUE_ZONES); do \
		echo "--- $$zone ---"; \
		$$qr list --zone="$$zone" \
			|| echo "Skipping $$zone (unavailable or no permission)"; \
	done
# Delete, in every zone of TPU_QUEUE_ZONES, each queued resource whose short
# name starts with `qr-`.
#
# The non-alpha `queued-resources` command group is used when its `--help`
# probe succeeds, otherwise the `alpha` group.
#
# Listing errors for a zone are discarded (2>/dev/null) and the zone then
# simply yields no names, so an unavailable zone is skipped silently.
#
# Two safeguards compared with a plain `case "$$NAME" in qr-*)` match:
#   * `value(name)` may print either a bare id or a full resource path
#     (projects/.../queuedResources/<id>) -- NOTE(review): confirm which one
#     your gcloud version emits. The name is reduced to its last path
#     component first, so the `qr-*` match works in both cases.
#   * the delete command gets </dev/null so it cannot consume the remaining
#     names from the pipe feeding the `while read` loop.
.PHONY: tpu.queue.clean
tpu.queue.clean:
	@set -e; \
	if gcloud compute tpus queued-resources list --help >/dev/null 2>&1; then \
		QCMD='gcloud --project=$(TPU_PROJECT) compute tpus queued-resources'; \
	else \
		QCMD='gcloud --project=$(TPU_PROJECT) alpha compute tpus queued-resources'; \
	fi; \
	for ZONE in $(TPU_QUEUE_ZONES); do \
		$$QCMD list --zone="$$ZONE" --format='value(name)' 2>/dev/null | while read -r NAME; do \
			SHORT="$${NAME##*/}"; \
			case "$$SHORT" in \
				qr-*) echo "Deleting $$SHORT ($$ZONE)"; $$QCMD delete "$$SHORT" --zone="$$ZONE" --quiet </dev/null ;; \
			esac; \
		done; \
	done
.PHONY: stats.lines
stats.lines:
@find . \( -path '*/node_modules' -o -path '*/.venv' -o -path '*/venv' \) -prune -o \