From d3a4febfde3b36924d391a584c4d9943ac41cabb Mon Sep 17 00:00:00 2001 From: Daniel Rosel Date: Wed, 11 Mar 2026 20:49:28 +0100 Subject: [PATCH] tpu ready remodel --- Makefile | 9 ++++++- TPUS/README.md | 6 ----- TPUS/v4_32_spot_uscentral2b.sh | 22 --------------- TPUS/v4_uscentral2b.sh | 13 --------- TPUS/v5e_64_spot_europewest4b.sh | 22 --------------- TPUS/v5e_64_spot_uscentral1a.sh | 22 --------------- TPUS/v6e_64_spot_europewest4a.sh | 22 --------------- TPUS/v6e_64_spot_useast1d.sh | 22 --------------- docker-compose.yml | 15 +++++++++++ engine/engine.py | 14 +++------- engine/lib/behavior.py | 5 ++++ lib/config.py | 46 +++++++++++++++++++++----------- requirements.txt | 1 + 13 files changed, 63 insertions(+), 156 deletions(-) delete mode 100644 TPUS/README.md delete mode 100644 TPUS/v4_32_spot_uscentral2b.sh delete mode 100644 TPUS/v4_uscentral2b.sh delete mode 100644 TPUS/v5e_64_spot_europewest4b.sh delete mode 100644 TPUS/v5e_64_spot_uscentral1a.sh delete mode 100644 TPUS/v6e_64_spot_europewest4a.sh delete mode 100644 TPUS/v6e_64_spot_useast1d.sh diff --git a/Makefile b/Makefile index f0072a7..edb2a9a 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ SWEEP_ENV_LOAD = set -a; [ -f "$(SWEEP_ENV_FILE)" ] && . "$(SWEEP_ENV_FILE)" || help: @echo "pdf.build pdf.watch pdf.clean pdf.genpop pdf.genpop.watch pdf.arxiv | test.backend test.e2e test.all | web.dev | install | train | benchmark | benchmark.simple | benchmark.agent | train.agent | train.bootstrap | stats.lines" @echo "backend.server backend.provider backend.worker | platform.up platform.down platform.logs | docker.train.publish" - @echo "study.margin-erosion study.margin-erosion.quick study.margin-erosion.plot" + @echo "data.pull data.push | study.margin-erosion study.margin-erosion.quick study.margin-erosion.plot" @echo "" @echo "Build general public version:" @echo " make pdf.genpop" @@ -134,6 +134,13 @@ train.agent: train.bootstrap: @WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" REPO_URL="$(REPO_URL)" BRANCH="$(BRANCH)" WORKDIR="$(WORKDIR)" SWEEP_ID="$(SWEEP_ID)" AGENT_COUNT="$(AGENT_COUNT)" AGENT_LOOP="$(AGENT_LOOP)" RETRY_SECONDS="$(RETRY_SECONDS)" $(NX) run research:train-bootstrap +.PHONY: data.pull data.push +data.pull: + python scripts/hf_data.py pull + +data.push: + python scripts/hf_data.py push + .PHONY: stats.lines stats.lines: @$(NX) run research:stats diff --git a/TPUS/README.md b/TPUS/README.md deleted file mode 100644 index bb88fce..0000000 --- a/TPUS/README.md +++ /dev/null @@ -1,6 +0,0 @@ -64 spot Cloud TPU v6e chips in zone europe-west4-a -32 spot Cloud TPU v4 chips in zone us-central2-b -64 spot Cloud TPU v5e chips in zone us-central1-a -64 spot Cloud TPU v6e chips in zone us-east1-d -32 on-demand Cloud TPU v4 chips in zone us-central2-b -64 spot Cloud TPU v5e chips in zone europe-west4-b diff --git a/TPUS/v4_32_spot_uscentral2b.sh b/TPUS/v4_32_spot_uscentral2b.sh deleted file mode 100644 index 661bcdc..0000000 --- a/TPUS/v4_32_spot_uscentral2b.sh +++ /dev/null @@ -1,22 +0,0 @@ -# 32 spot Cloud TPU v4 chips in zone us-central2-b -export PROJECT_ID=phantom-trc -export QR_NAME=TPUv4s32spotUC2B -export TPU_NAME=tpu-v4-32-uc2b-spot -export ZONE=us-central2-b -export ACCELERATOR_TYPE=v4-32 -export RUNTIME_VERSION=v2-alpha-tpuv4 - -gcloud compute tpus tpu-vm create ${TPU_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --version=${RUNTIME_VERSION} \ - --spot \ -|| \ -gcloud compute tpus queued-resources create ${QR_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --node-id=${TPU_NAME} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --runtime-version=${RUNTIME_VERSION} \ - --spot diff --git a/TPUS/v4_uscentral2b.sh b/TPUS/v4_uscentral2b.sh deleted file mode 100644 index a372078..0000000 --- a/TPUS/v4_uscentral2b.sh +++ /dev/null @@ -1,13 +0,0 @@ -# 32 on-demand Cloud TPU v4 chips in zone us-central2-b -export PROJECT_ID=phantom-trc -export QR_NAME=TPUlong -export ZONE=us-central2-b -export ACCELERATOR_TYPE=v4-32 -export RUNTIME_VERSION=v2-alpha-tpuv4 -#gcloud compute tpus tpu-vm create ${TPU_NAME} --zone=${ZONE} --project=${PROJECT_ID} --accelerator-type=${ACCELERATOR_TYPE} --version=${RUNTIME_VERSION} -gcloud compute tpus queued-resources create ${QR_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --node-id=${TPU_NAME} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --runtime-version=${RUNTIME_VERSION} diff --git a/TPUS/v5e_64_spot_europewest4b.sh b/TPUS/v5e_64_spot_europewest4b.sh deleted file mode 100644 index 7a35d7e..0000000 --- a/TPUS/v5e_64_spot_europewest4b.sh +++ /dev/null @@ -1,22 +0,0 @@ -# 64 spot Cloud TPU v5e chips in zone europe-west4-b -export PROJECT_ID=phantom-trc -export QR_NAME=TPUv5e64spotEW4B -export TPU_NAME=tpu-v5e-64-ew4b -export ZONE=europe-west4-b -export ACCELERATOR_TYPE=v5e-64 -export RUNTIME_VERSION=v2-alpha-tpuv5-lite - -gcloud compute tpus tpu-vm create ${TPU_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --version=${RUNTIME_VERSION} \ - --spot \ -|| \ -gcloud compute tpus queued-resources create ${QR_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --node-id=${TPU_NAME} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --runtime-version=${RUNTIME_VERSION} \ - --spot diff --git a/TPUS/v5e_64_spot_uscentral1a.sh b/TPUS/v5e_64_spot_uscentral1a.sh deleted file mode 100644 index 96375fd..0000000 --- a/TPUS/v5e_64_spot_uscentral1a.sh +++ /dev/null @@ -1,22 +0,0 @@ -# 64 spot Cloud TPU v5e chips in zone us-central1-a -export PROJECT_ID=phantom-trc -export QR_NAME=TPUv5e64spotUC1A -export TPU_NAME=tpu-v5e-64-uc1a -export ZONE=us-central1-a -export ACCELERATOR_TYPE=v5e-64 -export RUNTIME_VERSION=v2-alpha-tpuv5-lite - -gcloud compute tpus tpu-vm create ${TPU_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --version=${RUNTIME_VERSION} \ - --spot \ -|| \ -gcloud compute tpus queued-resources create ${QR_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --node-id=${TPU_NAME} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --runtime-version=${RUNTIME_VERSION} \ - --spot diff --git a/TPUS/v6e_64_spot_europewest4a.sh b/TPUS/v6e_64_spot_europewest4a.sh deleted file mode 100644 index 1ea17ac..0000000 --- a/TPUS/v6e_64_spot_europewest4a.sh +++ /dev/null @@ -1,22 +0,0 @@ -# 64 spot Cloud TPU v6e chips in zone europe-west4-a -export PROJECT_ID=phantom-trc -export QR_NAME=TPUv6e64spotEW4A -export TPU_NAME=tpu-v6e-64-ew4a -export ZONE=europe-west4-a -export ACCELERATOR_TYPE=v6e-64 -export RUNTIME_VERSION=v2-alpha-tpuv6e - -gcloud compute tpus tpu-vm create ${TPU_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --version=${RUNTIME_VERSION} \ - --spot \ -|| \ -gcloud compute tpus queued-resources create ${QR_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --node-id=${TPU_NAME} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --runtime-version=${RUNTIME_VERSION} \ - --spot diff --git a/TPUS/v6e_64_spot_useast1d.sh b/TPUS/v6e_64_spot_useast1d.sh deleted file mode 100644 index cada53f..0000000 --- a/TPUS/v6e_64_spot_useast1d.sh +++ /dev/null @@ -1,22 +0,0 @@ -# 64 spot Cloud TPU v6e chips in zone us-east1-d -export PROJECT_ID=phantom-trc -export QR_NAME=TPUv6e64spotUE1D -export TPU_NAME=tpu-v6e-64-ue1d -export ZONE=us-east1-d -export ACCELERATOR_TYPE=v6e-64 -export RUNTIME_VERSION=v2-alpha-tpuv6e - -gcloud compute tpus tpu-vm create ${TPU_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --version=${RUNTIME_VERSION} \ - --spot \ -|| \ -gcloud compute tpus queued-resources create ${QR_NAME} \ - --project=${PROJECT_ID} \ - --zone=${ZONE} \ - --node-id=${TPU_NAME} \ - --accelerator-type=${ACCELERATOR_TYPE} \ - --runtime-version=${RUNTIME_VERSION} \ - --spot diff --git a/docker-compose.yml b/docker-compose.yml index ba2e8a3..c00f4e1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,4 +1,19 @@ services: + tpu-watchdogs: + build: + context: . + dockerfile: docker/TPUWatchdog.dockerfile + container_name: "PHANTOM-tpu-watchdogs" + restart: unless-stopped + user: "${UID:-1000}:${GID:-1000}" + environment: + - HF_TOKEN=${HF_TOKEN} + - WANDB_API_KEY=${WANDB_API_KEY} + - GITHUB_TOKEN=${GITHUB_TOKEN} + - CLOUDSDK_CONFIG=/.config/gcloud + volumes: + - ~/.config/gcloud:/.config/gcloud:rw + tensorboard-rl: image: tensorflow/tensorflow:latest container_name: "PHANTOM-tensorboard-rl" diff --git a/engine/engine.py b/engine/engine.py index 81a4da7..d548177 100644 --- a/engine/engine.py +++ b/engine/engine.py @@ -1,5 +1,4 @@ from sys import platform -from concurrent.futures import ThreadPoolExecutor import numpy as np from .lib.demand import generate_demand_for_actor, estimate_demand from .lib.behavior import get_adjusted_transitions, sample_behavior_from_transitions @@ -8,9 +7,6 @@ from logging import INFO, getLogger logger = getLogger(__name__) logger.setLevel(INFO) -# shared pool; reused across act() calls to avoid per-call thread-spawn overhead -_pool = ThreadPoolExecutor(max_workers=4) - class MarketEngine: """implements separate demand distributions for humans and agents per Section 3.1.1""" @@ -54,16 +50,14 @@ class MarketEngine: agent_transitions = get_adjusted_transitions(demand_a, human=False) # sample N trajectories in parallel; each chain is independent so threads # do not share state and numpy's per-call RNG is thread-safe - h_futs = [ - _pool.submit(sample_behavior_from_transitions, human_transitions) + human_t = [ + sample_behavior_from_transitions(human_transitions) for _ in range(self.Nhumans) ] - a_futs = [ - _pool.submit(sample_behavior_from_transitions, agent_transitions) + agent_t = [ + sample_behavior_from_transitions(agent_transitions) for _ in range(self.Nagents) ] - human_t = [f.result() for f in h_futs] - agent_t = [f.result() for f in a_futs] # store trajectories for agent probability calculation self.last_trajectories = human_t + agent_t return estimate_demand(self.last_trajectories, self.action_weights) diff --git a/engine/lib/behavior.py b/engine/lib/behavior.py index 5c96c27..52a9d7d 100644 --- a/engine/lib/behavior.py +++ b/engine/lib/behavior.py @@ -143,6 +143,11 @@ def get_adjusted_transitions(condition, human=True) -> _TransitionTable: cache_key = (human, tuple(np.round(condition, 4).tolist())) if cache_key in _transition_cache: return _transition_cache[cache_key] + + # prevent OOM by capping cache size + if len(_transition_cache) > 100: + _transition_cache.clear() + base_pivot = _get_base_pivot(human) df = adjust_behavior_to_condition(condition, base_pivot) table = _TransitionTable(df) diff --git a/lib/config.py b/lib/config.py index a27ffd9..d46f82c 100644 --- a/lib/config.py +++ b/lib/config.py @@ -2,6 +2,7 @@ All hardcoded paths should reference this module Paths can be overridden via environment variables """ + import os from pathlib import Path @@ -9,24 +10,34 @@ from pathlib import Path PROJECT_ROOT = Path(__file__).parent.parent.resolve() # data directories -DATA_DIR = Path(os.getenv('PHANTOM_DATA_DIR', PROJECT_ROOT / 'data')) -EXPERIMENTS_DIR = Path(os.getenv('PHANTOM_EXPERIMENTS_DIR', PROJECT_ROOT / 'experiments')) +DATA_DIR = Path(os.getenv("PHANTOM_DATA_DIR", PROJECT_ROOT / "data")) +EXPERIMENTS_DIR = Path( + os.getenv("PHANTOM_EXPERIMENTS_DIR", PROJECT_ROOT / "experiments") +) # agent/human interaction data -AGENT_DATA_DIR = Path(os.getenv('PHANTOM_AGENT_DATA_DIR', DATA_DIR / 'agents')) -HUMAN_DATA_DIR = Path(os.getenv('PHANTOM_HUMAN_DATA_DIR', DATA_DIR / 'humans')) +AGENT_DATA_DIR = Path(os.getenv("PHANTOM_AGENT_DATA_DIR", DATA_DIR / "agents")) +HUMAN_DATA_DIR = Path(os.getenv("PHANTOM_HUMAN_DATA_DIR", DATA_DIR / "humans")) # RL simulation runs -SIM_RUNS_DIR = Path(os.getenv('PHANTOM_SIM_RUNS_DIR', PROJECT_ROOT / 'sim' / 'rl' / 'runs')) +SIM_RUNS_DIR = Path( + os.getenv("PHANTOM_SIM_RUNS_DIR", PROJECT_ROOT / "sim" / "rl" / "runs") +) # model artifacts -MODEL_REGISTRY_DIR = Path(os.getenv('PHANTOM_MODEL_REGISTRY_DIR', DATA_DIR / 'models')) +MODEL_REGISTRY_DIR = Path(os.getenv("PHANTOM_MODEL_REGISTRY_DIR", DATA_DIR / "models")) # collected experiment data -COLLECTED_DATA_DIR = Path(os.getenv('PHANTOM_COLLECTED_DATA_DIR', EXPERIMENTS_DIR / 'agents' / 'collected_data')) +COLLECTED_DATA_DIR = Path( + os.getenv( + "PHANTOM_COLLECTED_DATA_DIR", EXPERIMENTS_DIR / "agents" / "collected_data" + ) +) # notebook outputs -NOTEBOOK_OUTPUT_DIR = Path(os.getenv('PHANTOM_NOTEBOOK_OUTPUT_DIR', EXPERIMENTS_DIR / 'notebooks' / 'outputs')) +NOTEBOOK_OUTPUT_DIR = Path( + os.getenv("PHANTOM_NOTEBOOK_OUTPUT_DIR", EXPERIMENTS_DIR / "notebooks" / "outputs") +) def ensure_dir(path: Path) -> Path: @@ -51,15 +62,18 @@ def get_sim_path(*parts: str) -> Path: # service configuration (from .env) -KAFKA_HOST = os.getenv('KAFKA_HOST', 'localhost') -KAFKA_PORT = os.getenv('KAFKA_PORT', '9092') +KAFKA_HOST = os.getenv("KAFKA_HOST", "localhost") +KAFKA_PORT = os.getenv("KAFKA_PORT", "9092") KAFKA_BROKER = f"{KAFKA_HOST}:{KAFKA_PORT}" -REDIS_HOST = os.getenv('REDIS_HOST', 'localhost') -REDIS_PORT = int(os.getenv('REDIS_PORT', '6379')) +REDIS_HOST = os.getenv("REDIS_HOST", "localhost") +REDIS_PORT = int(os.getenv("REDIS_PORT", "6379")) -SUPABASE_URL = os.getenv('NEXT_PUBLIC_SUPABASE_URL', '') -SUPABASE_ANON_KEY = os.getenv('NEXT_PUBLIC_SUPABASE_ANON_KEY', '') +SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL", "") +SUPABASE_ANON_KEY = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY", "") -BACKEND_PORT = int(os.getenv('BACKEND_PORT', '5000')) -PROVIDER_PORT = int(os.getenv('PROVIDER_PORT', '5001')) +BACKEND_PORT = int(os.getenv("BACKEND_PORT", "5000")) +PROVIDER_PORT = int(os.getenv("PROVIDER_PORT", "5001")) + +# huggingface dataset repo for collected behavioral data +HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "velocitatem/phantom-collected-data") diff --git a/requirements.txt b/requirements.txt index 247121e..c1a8686 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,4 @@ scikit-learn supabase pymc wandb +huggingface_hub