From d3a4febfde3b36924d391a584c4d9943ac41cabb Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 11 Mar 2026 20:49:28 +0100
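Subject: [PATCH] tpu ready remodel: drop TPUS/ scripts, add watchdog service and data targets

Delete the TPUS/ gcloud provisioning scripts and their README, add a
tpu-watchdogs service to docker-compose.yml, and add data.pull / data.push
Makefile targets that call scripts/hf_data.py. Sample engine trajectories
directly instead of through the shared thread pool, clear the transition
cache once it holds more than 100 entries, add the HF_DATASET_REPO setting
and the huggingface_hub requirement, and reformat lib/config.py.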

---
 Makefile                         |  9 ++++++-
 TPUS/README.md                   |  6 -----
 TPUS/v4_32_spot_uscentral2b.sh   | 22 ---------------
 TPUS/v4_uscentral2b.sh           | 13 ---------
 TPUS/v5e_64_spot_europewest4b.sh | 22 ---------------
 TPUS/v5e_64_spot_uscentral1a.sh  | 22 ---------------
 TPUS/v6e_64_spot_europewest4a.sh | 22 ---------------
 TPUS/v6e_64_spot_useast1d.sh     | 22 ---------------
 docker-compose.yml               | 15 +++++++++++
 engine/engine.py                 | 14 +++-------
 engine/lib/behavior.py           |  5 ++++
 lib/config.py                    | 46 +++++++++++++++++++++-----------
 requirements.txt                 |  1 +
 13 files changed, 63 insertions(+), 156 deletions(-)
 delete mode 100644 TPUS/README.md
 delete mode 100644 TPUS/v4_32_spot_uscentral2b.sh
 delete mode 100644 TPUS/v4_uscentral2b.sh
 delete mode 100644 TPUS/v5e_64_spot_europewest4b.sh
 delete mode 100644 TPUS/v5e_64_spot_uscentral1a.sh
 delete mode 100644 TPUS/v6e_64_spot_europewest4a.sh
 delete mode 100644 TPUS/v6e_64_spot_useast1d.sh

diff --git a/Makefile b/Makefile
index f0072a7..edb2a9a 100644
--- a/Makefile
+++ b/Makefile
@@ -37,7 +37,7 @@ SWEEP_ENV_LOAD = set -a; [ -f "$(SWEEP_ENV_FILE)" ] && . "$(SWEEP_ENV_FILE)" ||
 help:
 	@echo "pdf.build pdf.watch pdf.clean pdf.genpop pdf.genpop.watch pdf.arxiv | test.backend test.e2e test.all | web.dev | install | train | benchmark | benchmark.simple | benchmark.agent | train.agent | train.bootstrap | stats.lines"
 	@echo "backend.server backend.provider backend.worker | platform.up platform.down platform.logs | docker.train.publish"
-	@echo "study.margin-erosion study.margin-erosion.quick study.margin-erosion.plot"
+	@echo "data.pull data.push | study.margin-erosion study.margin-erosion.quick study.margin-erosion.plot"
 	@echo ""
 	@echo "Build general public version:"
 	@echo "  make pdf.genpop"
@@ -134,6 +134,13 @@ train.agent:
 train.bootstrap:
 	@WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" REPO_URL="$(REPO_URL)" BRANCH="$(BRANCH)" WORKDIR="$(WORKDIR)" SWEEP_ID="$(SWEEP_ID)" AGENT_COUNT="$(AGENT_COUNT)" AGENT_LOOP="$(AGENT_LOOP)" RETRY_SECONDS="$(RETRY_SECONDS)" $(NX) run research:train-bootstrap
 
+.PHONY: data.pull data.push  # both names are commands, not files to build
+data.pull:  # runs scripts/hf_data.py with the "pull" argument
+	python scripts/hf_data.py pull
+
+data.push:  # runs scripts/hf_data.py with the "push" argument
+	python scripts/hf_data.py push
+
 .PHONY: stats.lines
 stats.lines:
 	@$(NX) run research:stats
diff --git a/TPUS/README.md b/TPUS/README.md
deleted file mode 100644
index bb88fce..0000000
--- a/TPUS/README.md
+++ /dev/null
@@ -1,6 +0,0 @@
-64 spot Cloud TPU v6e chips in zone europe-west4-a
-32 spot Cloud TPU v4 chips in zone us-central2-b
-64 spot Cloud TPU v5e chips in zone us-central1-a
-64 spot Cloud TPU v6e chips in zone us-east1-d
-32 on-demand Cloud TPU v4 chips in zone us-central2-b
-64 spot Cloud TPU v5e chips in zone europe-west4-b
diff --git a/TPUS/v4_32_spot_uscentral2b.sh b/TPUS/v4_32_spot_uscentral2b.sh
deleted file mode 100644
index 661bcdc..0000000
--- a/TPUS/v4_32_spot_uscentral2b.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-# 32 spot Cloud TPU v4 chips in zone us-central2-b
-export PROJECT_ID=phantom-trc
-export QR_NAME=TPUv4s32spotUC2B
-export TPU_NAME=tpu-v4-32-uc2b-spot
-export ZONE=us-central2-b
-export ACCELERATOR_TYPE=v4-32
-export RUNTIME_VERSION=v2-alpha-tpuv4
-
-gcloud compute tpus tpu-vm create ${TPU_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --version=${RUNTIME_VERSION} \
-       --spot \
-|| \
-gcloud compute tpus queued-resources create ${QR_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --node-id=${TPU_NAME} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --runtime-version=${RUNTIME_VERSION} \
-       --spot
diff --git a/TPUS/v4_uscentral2b.sh b/TPUS/v4_uscentral2b.sh
deleted file mode 100644
index a372078..0000000
--- a/TPUS/v4_uscentral2b.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-# 32 on-demand Cloud TPU v4 chips in zone us-central2-b
-export PROJECT_ID=phantom-trc
-export QR_NAME=TPUlong
-export ZONE=us-central2-b
-export ACCELERATOR_TYPE=v4-32
-export RUNTIME_VERSION=v2-alpha-tpuv4
-#gcloud compute tpus tpu-vm create ${TPU_NAME}     --zone=${ZONE}     --project=${PROJECT_ID}     --accelerator-type=${ACCELERATOR_TYPE}     --version=${RUNTIME_VERSION}
-gcloud compute tpus queued-resources create ${QR_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --node-id=${TPU_NAME} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --runtime-version=${RUNTIME_VERSION}
diff --git a/TPUS/v5e_64_spot_europewest4b.sh b/TPUS/v5e_64_spot_europewest4b.sh
deleted file mode 100644
index 7a35d7e..0000000
--- a/TPUS/v5e_64_spot_europewest4b.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-# 64 spot Cloud TPU v5e chips in zone europe-west4-b
-export PROJECT_ID=phantom-trc
-export QR_NAME=TPUv5e64spotEW4B
-export TPU_NAME=tpu-v5e-64-ew4b
-export ZONE=europe-west4-b
-export ACCELERATOR_TYPE=v5e-64
-export RUNTIME_VERSION=v2-alpha-tpuv5-lite
-
-gcloud compute tpus tpu-vm create ${TPU_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --version=${RUNTIME_VERSION} \
-       --spot \
-|| \
-gcloud compute tpus queued-resources create ${QR_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --node-id=${TPU_NAME} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --runtime-version=${RUNTIME_VERSION} \
-       --spot
diff --git a/TPUS/v5e_64_spot_uscentral1a.sh b/TPUS/v5e_64_spot_uscentral1a.sh
deleted file mode 100644
index 96375fd..0000000
--- a/TPUS/v5e_64_spot_uscentral1a.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-# 64 spot Cloud TPU v5e chips in zone us-central1-a
-export PROJECT_ID=phantom-trc
-export QR_NAME=TPUv5e64spotUC1A
-export TPU_NAME=tpu-v5e-64-uc1a
-export ZONE=us-central1-a
-export ACCELERATOR_TYPE=v5e-64
-export RUNTIME_VERSION=v2-alpha-tpuv5-lite
-
-gcloud compute tpus tpu-vm create ${TPU_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --version=${RUNTIME_VERSION} \
-       --spot \
-|| \
-gcloud compute tpus queued-resources create ${QR_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --node-id=${TPU_NAME} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --runtime-version=${RUNTIME_VERSION} \
-       --spot
diff --git a/TPUS/v6e_64_spot_europewest4a.sh b/TPUS/v6e_64_spot_europewest4a.sh
deleted file mode 100644
index 1ea17ac..0000000
--- a/TPUS/v6e_64_spot_europewest4a.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-# 64 spot Cloud TPU v6e chips in zone europe-west4-a
-export PROJECT_ID=phantom-trc
-export QR_NAME=TPUv6e64spotEW4A
-export TPU_NAME=tpu-v6e-64-ew4a
-export ZONE=europe-west4-a
-export ACCELERATOR_TYPE=v6e-64
-export RUNTIME_VERSION=v2-alpha-tpuv6e
-
-gcloud compute tpus tpu-vm create ${TPU_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --version=${RUNTIME_VERSION} \
-       --spot \
-|| \
-gcloud compute tpus queued-resources create ${QR_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --node-id=${TPU_NAME} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --runtime-version=${RUNTIME_VERSION} \
-       --spot
diff --git a/TPUS/v6e_64_spot_useast1d.sh b/TPUS/v6e_64_spot_useast1d.sh
deleted file mode 100644
index cada53f..0000000
--- a/TPUS/v6e_64_spot_useast1d.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-# 64 spot Cloud TPU v6e chips in zone us-east1-d
-export PROJECT_ID=phantom-trc
-export QR_NAME=TPUv6e64spotUE1D
-export TPU_NAME=tpu-v6e-64-ue1d
-export ZONE=us-east1-d
-export ACCELERATOR_TYPE=v6e-64
-export RUNTIME_VERSION=v2-alpha-tpuv6e
-
-gcloud compute tpus tpu-vm create ${TPU_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --version=${RUNTIME_VERSION} \
-       --spot \
-|| \
-gcloud compute tpus queued-resources create ${QR_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --node-id=${TPU_NAME} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --runtime-version=${RUNTIME_VERSION} \
-       --spot
diff --git a/docker-compose.yml b/docker-compose.yml
index ba2e8a3..c00f4e1 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,4 +1,19 @@
 services:
+  tpu-watchdogs:  # image is built locally from the dockerfile below, not pulled
+    build:
+      context: .
+      dockerfile: docker/TPUWatchdog.dockerfile
+    container_name: "PHANTOM-tpu-watchdogs"
+    restart: unless-stopped
+    user: "${UID:-1000}:${GID:-1000}"  # falls back to 1000:1000 when UID/GID are unset or empty
+    environment:
+      - HF_TOKEN=${HF_TOKEN}  # the three tokens are interpolated by compose, not hard-coded here
+      - WANDB_API_KEY=${WANDB_API_KEY}
+      - GITHUB_TOKEN=${GITHUB_TOKEN}
+      - CLOUDSDK_CONFIG=/.config/gcloud  # same path as the container side of the volume below
+    volumes:
+      - ~/.config/gcloud:/.config/gcloud:rw  # host gcloud config directory, mounted read-write
+
   tensorboard-rl:
     image: tensorflow/tensorflow:latest
     container_name: "PHANTOM-tensorboard-rl"
diff --git a/engine/engine.py b/engine/engine.py
index 81a4da7..d548177 100644
--- a/engine/engine.py
+++ b/engine/engine.py
@@ -1,5 +1,4 @@
 from sys import platform
-from concurrent.futures import ThreadPoolExecutor
 import numpy as np
 from .lib.demand import generate_demand_for_actor, estimate_demand
 from .lib.behavior import get_adjusted_transitions, sample_behavior_from_transitions
@@ -8,9 +7,6 @@ from logging import INFO, getLogger
 logger = getLogger(__name__)
 logger.setLevel(INFO)
 
-# shared pool; reused across act() calls to avoid per-call thread-spawn overhead
-_pool = ThreadPoolExecutor(max_workers=4)
-
 
 class MarketEngine:
     """implements separate demand distributions for humans and agents per Section 3.1.1"""
@@ -54,16 +50,14 @@ class MarketEngine:
         agent_transitions = get_adjusted_transitions(demand_a, human=False)
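-        # sample N trajectories in parallel; each chain is independent so threads
-        # do not share state and numpy's per-call RNG is thread-safe
+        # sample N trajectories one after another; each chain is independent,
+        # so no state is shared between draws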
-        h_futs = [
-            _pool.submit(sample_behavior_from_transitions, human_transitions)
+        human_t = [
+            sample_behavior_from_transitions(human_transitions)
             for _ in range(self.Nhumans)
         ]
-        a_futs = [
-            _pool.submit(sample_behavior_from_transitions, agent_transitions)
+        agent_t = [
+            sample_behavior_from_transitions(agent_transitions)
             for _ in range(self.Nagents)
         ]
-        human_t = [f.result() for f in h_futs]
-        agent_t = [f.result() for f in a_futs]
         # store trajectories for agent probability calculation
         self.last_trajectories = human_t + agent_t
         return estimate_demand(self.last_trajectories, self.action_weights)
diff --git a/engine/lib/behavior.py b/engine/lib/behavior.py
index 5c96c27..52a9d7d 100644
--- a/engine/lib/behavior.py
+++ b/engine/lib/behavior.py
@@ -143,6 +143,11 @@ def get_adjusted_transitions(condition, human=True) -> _TransitionTable:
     cache_key = (human, tuple(np.round(condition, 4).tolist()))
     if cache_key in _transition_cache:
         return _transition_cache[cache_key]
+
+    # prevent OOM by capping cache size
+    if len(_transition_cache) > 100:
+        _transition_cache.clear()
+
     base_pivot = _get_base_pivot(human)
     df = adjust_behavior_to_condition(condition, base_pivot)
     table = _TransitionTable(df)
diff --git a/lib/config.py b/lib/config.py
index a27ffd9..d46f82c 100644
--- a/lib/config.py
+++ b/lib/config.py
@@ -2,6 +2,7 @@
 All hardcoded paths should reference this module
 Paths can be overridden via environment variables
 """
+
 import os
 from pathlib import Path
 
@@ -9,24 +10,34 @@ from pathlib import Path
 PROJECT_ROOT = Path(__file__).parent.parent.resolve()
 
 # data directories
-DATA_DIR = Path(os.getenv('PHANTOM_DATA_DIR', PROJECT_ROOT / 'data'))
-EXPERIMENTS_DIR = Path(os.getenv('PHANTOM_EXPERIMENTS_DIR', PROJECT_ROOT / 'experiments'))
+DATA_DIR = Path(os.getenv("PHANTOM_DATA_DIR", PROJECT_ROOT / "data"))
+EXPERIMENTS_DIR = Path(
+    os.getenv("PHANTOM_EXPERIMENTS_DIR", PROJECT_ROOT / "experiments")
+)
 
 # agent/human interaction data
-AGENT_DATA_DIR = Path(os.getenv('PHANTOM_AGENT_DATA_DIR', DATA_DIR / 'agents'))
-HUMAN_DATA_DIR = Path(os.getenv('PHANTOM_HUMAN_DATA_DIR', DATA_DIR / 'humans'))
+AGENT_DATA_DIR = Path(os.getenv("PHANTOM_AGENT_DATA_DIR", DATA_DIR / "agents"))
+HUMAN_DATA_DIR = Path(os.getenv("PHANTOM_HUMAN_DATA_DIR", DATA_DIR / "humans"))
 
 # RL simulation runs
-SIM_RUNS_DIR = Path(os.getenv('PHANTOM_SIM_RUNS_DIR', PROJECT_ROOT / 'sim' / 'rl' / 'runs'))
+SIM_RUNS_DIR = Path(
+    os.getenv("PHANTOM_SIM_RUNS_DIR", PROJECT_ROOT / "sim" / "rl" / "runs")
+)
 
 # model artifacts
-MODEL_REGISTRY_DIR = Path(os.getenv('PHANTOM_MODEL_REGISTRY_DIR', DATA_DIR / 'models'))
+MODEL_REGISTRY_DIR = Path(os.getenv("PHANTOM_MODEL_REGISTRY_DIR", DATA_DIR / "models"))
 
 # collected experiment data
-COLLECTED_DATA_DIR = Path(os.getenv('PHANTOM_COLLECTED_DATA_DIR', EXPERIMENTS_DIR / 'agents' / 'collected_data'))
+COLLECTED_DATA_DIR = Path(
+    os.getenv(
+        "PHANTOM_COLLECTED_DATA_DIR", EXPERIMENTS_DIR / "agents" / "collected_data"
+    )
+)
 
 # notebook outputs
-NOTEBOOK_OUTPUT_DIR = Path(os.getenv('PHANTOM_NOTEBOOK_OUTPUT_DIR', EXPERIMENTS_DIR / 'notebooks' / 'outputs'))
+NOTEBOOK_OUTPUT_DIR = Path(
+    os.getenv("PHANTOM_NOTEBOOK_OUTPUT_DIR", EXPERIMENTS_DIR / "notebooks" / "outputs")
+)
 
 
 def ensure_dir(path: Path) -> Path:
@@ -51,15 +62,18 @@ def get_sim_path(*parts: str) -> Path:
 
 
 # service configuration (from .env)
-KAFKA_HOST = os.getenv('KAFKA_HOST', 'localhost')
-KAFKA_PORT = os.getenv('KAFKA_PORT', '9092')
+KAFKA_HOST = os.getenv("KAFKA_HOST", "localhost")
+KAFKA_PORT = os.getenv("KAFKA_PORT", "9092")
 KAFKA_BROKER = f"{KAFKA_HOST}:{KAFKA_PORT}"
 
-REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
-REDIS_PORT = int(os.getenv('REDIS_PORT', '6379'))
+REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
+REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
 
-SUPABASE_URL = os.getenv('NEXT_PUBLIC_SUPABASE_URL', '')
-SUPABASE_ANON_KEY = os.getenv('NEXT_PUBLIC_SUPABASE_ANON_KEY', '')
+SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL", "")
+SUPABASE_ANON_KEY = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY", "")
 
-BACKEND_PORT = int(os.getenv('BACKEND_PORT', '5000'))
-PROVIDER_PORT = int(os.getenv('PROVIDER_PORT', '5001'))
+BACKEND_PORT = int(os.getenv("BACKEND_PORT", "5000"))
+PROVIDER_PORT = int(os.getenv("PROVIDER_PORT", "5001"))
+
+# huggingface dataset repo for collected behavioral data
+HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "velocitatem/phantom-collected-data")
diff --git a/requirements.txt b/requirements.txt
index 247121e..c1a8686 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,3 +13,4 @@ scikit-learn
 supabase
 pymc
 wandb
+huggingface_hub