tpu ready remodel

2026-07-15 17:43:36 +00:00 · 2026-03-11 20:49:28 +01:00
parent fa2dde8307
commit d3a4febfde
13 changed files with 63 additions and 156 deletions
--- a/9
+++ b/9
@@ -37,7 +37,7 @@ SWEEP_ENV_LOAD = set -a; [ -f "$(SWEEP_ENV_FILE)" ] && . "$(SWEEP_ENV_FILE)" ||
 help:
 	@echo "pdf.build pdf.watch pdf.clean pdf.genpop pdf.genpop.watch pdf.arxiv | test.backend test.e2e test.all | web.dev | install | train | benchmark | benchmark.simple | benchmark.agent | train.agent | train.bootstrap | stats.lines"
 	@echo "backend.server backend.provider backend.worker | platform.up platform.down platform.logs | docker.train.publish"
-	@echo "study.margin-erosion study.margin-erosion.quick study.margin-erosion.plot"
+	@echo "data.pull data.push | study.margin-erosion study.margin-erosion.quick study.margin-erosion.plot"
 	@echo ""
 	@echo "Build general public version:"
 	@echo "  make pdf.genpop"
@@ -134,6 +134,13 @@ train.agent:
 train.bootstrap:
 	@WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" REPO_URL="$(REPO_URL)" BRANCH="$(BRANCH)" WORKDIR="$(WORKDIR)" SWEEP_ID="$(SWEEP_ID)" AGENT_COUNT="$(AGENT_COUNT)" AGENT_LOOP="$(AGENT_LOOP)" RETRY_SECONDS="$(RETRY_SECONDS)" $(NX) run research:train-bootstrap

+.PHONY: data.pull data.push
+data.pull:
+	python scripts/hf_data.py pull
+
+data.push:
+	python scripts/hf_data.py push
+
 .PHONY: stats.lines
 stats.lines:
 	@$(NX) run research:stats
--- a/TPUS/README.md
+++ b/TPUS/README.md
@@ -1,6 +0,0 @@
-64 spot Cloud TPU v6e chips in zone europe-west4-a
-32 spot Cloud TPU v4 chips in zone us-central2-b
-64 spot Cloud TPU v5e chips in zone us-central1-a
-64 spot Cloud TPU v6e chips in zone us-east1-d
-32 on-demand Cloud TPU v4 chips in zone us-central2-b
-64 spot Cloud TPU v5e chips in zone europe-west4-b
--- a/TPUS/v4_32_spot_uscentral2b.sh
+++ b/TPUS/v4_32_spot_uscentral2b.sh
@@ -1,22 +0,0 @@
-# 32 spot Cloud TPU v4 chips in zone us-central2-b
-export PROJECT_ID=phantom-trc
-export QR_NAME=TPUv4s32spotUC2B
-export TPU_NAME=tpu-v4-32-uc2b-spot
-export ZONE=us-central2-b
-export ACCELERATOR_TYPE=v4-32
-export RUNTIME_VERSION=v2-alpha-tpuv4
-
-gcloud compute tpus tpu-vm create ${TPU_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --version=${RUNTIME_VERSION} \
-       --spot \
-|| \
-gcloud compute tpus queued-resources create ${QR_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --node-id=${TPU_NAME} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --runtime-version=${RUNTIME_VERSION} \
-       --spot
--- a/TPUS/v4_uscentral2b.sh
+++ b/TPUS/v4_uscentral2b.sh
@@ -1,13 +0,0 @@
-# 32 on-demand Cloud TPU v4 chips in zone us-central2-b
-export PROJECT_ID=phantom-trc
-export QR_NAME=TPUlong
-export ZONE=us-central2-b
-export ACCELERATOR_TYPE=v4-32
-export RUNTIME_VERSION=v2-alpha-tpuv4
-#gcloud compute tpus tpu-vm create ${TPU_NAME}     --zone=${ZONE}     --project=${PROJECT_ID}     --accelerator-type=${ACCELERATOR_TYPE}     --version=${RUNTIME_VERSION}
-gcloud compute tpus queued-resources create ${QR_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --node-id=${TPU_NAME} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --runtime-version=${RUNTIME_VERSION}
--- a/TPUS/v5e_64_spot_europewest4b.sh
+++ b/TPUS/v5e_64_spot_europewest4b.sh
@@ -1,22 +0,0 @@
-# 64 spot Cloud TPU v5e chips in zone europe-west4-b
-export PROJECT_ID=phantom-trc
-export QR_NAME=TPUv5e64spotEW4B
-export TPU_NAME=tpu-v5e-64-ew4b
-export ZONE=europe-west4-b
-export ACCELERATOR_TYPE=v5e-64
-export RUNTIME_VERSION=v2-alpha-tpuv5-lite
-
-gcloud compute tpus tpu-vm create ${TPU_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --version=${RUNTIME_VERSION} \
-       --spot \
-|| \
-gcloud compute tpus queued-resources create ${QR_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --node-id=${TPU_NAME} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --runtime-version=${RUNTIME_VERSION} \
-       --spot
--- a/TPUS/v5e_64_spot_uscentral1a.sh
+++ b/TPUS/v5e_64_spot_uscentral1a.sh
@@ -1,22 +0,0 @@
-# 64 spot Cloud TPU v5e chips in zone us-central1-a
-export PROJECT_ID=phantom-trc
-export QR_NAME=TPUv5e64spotUC1A
-export TPU_NAME=tpu-v5e-64-uc1a
-export ZONE=us-central1-a
-export ACCELERATOR_TYPE=v5e-64
-export RUNTIME_VERSION=v2-alpha-tpuv5-lite
-
-gcloud compute tpus tpu-vm create ${TPU_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --version=${RUNTIME_VERSION} \
-       --spot \
-|| \
-gcloud compute tpus queued-resources create ${QR_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --node-id=${TPU_NAME} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --runtime-version=${RUNTIME_VERSION} \
-       --spot
--- a/TPUS/v6e_64_spot_europewest4a.sh
+++ b/TPUS/v6e_64_spot_europewest4a.sh
@@ -1,22 +0,0 @@
-# 64 spot Cloud TPU v6e chips in zone europe-west4-a
-export PROJECT_ID=phantom-trc
-export QR_NAME=TPUv6e64spotEW4A
-export TPU_NAME=tpu-v6e-64-ew4a
-export ZONE=europe-west4-a
-export ACCELERATOR_TYPE=v6e-64
-export RUNTIME_VERSION=v2-alpha-tpuv6e
-
-gcloud compute tpus tpu-vm create ${TPU_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --version=${RUNTIME_VERSION} \
-       --spot \
-|| \
-gcloud compute tpus queued-resources create ${QR_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --node-id=${TPU_NAME} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --runtime-version=${RUNTIME_VERSION} \
-       --spot
--- a/TPUS/v6e_64_spot_useast1d.sh
+++ b/TPUS/v6e_64_spot_useast1d.sh
@@ -1,22 +0,0 @@
-# 64 spot Cloud TPU v6e chips in zone us-east1-d
-export PROJECT_ID=phantom-trc
-export QR_NAME=TPUv6e64spotUE1D
-export TPU_NAME=tpu-v6e-64-ue1d
-export ZONE=us-east1-d
-export ACCELERATOR_TYPE=v6e-64
-export RUNTIME_VERSION=v2-alpha-tpuv6e
-
-gcloud compute tpus tpu-vm create ${TPU_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --version=${RUNTIME_VERSION} \
-       --spot \
-|| \
-gcloud compute tpus queued-resources create ${QR_NAME} \
-       --project=${PROJECT_ID} \
-       --zone=${ZONE} \
-       --node-id=${TPU_NAME} \
-       --accelerator-type=${ACCELERATOR_TYPE} \
-       --runtime-version=${RUNTIME_VERSION} \
-       --spot
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,4 +1,19 @@
 services:
+  tpu-watchdogs:
+    build:
+      context: .
+      dockerfile: docker/TPUWatchdog.dockerfile
+    container_name: "PHANTOM-tpu-watchdogs"
+    restart: unless-stopped
+    user: "${UID:-1000}:${GID:-1000}"
+    environment:
+      - HF_TOKEN=${HF_TOKEN}
+      - WANDB_API_KEY=${WANDB_API_KEY}
+      - GITHUB_TOKEN=${GITHUB_TOKEN}
+      - CLOUDSDK_CONFIG=/.config/gcloud
+    volumes:
+      - ~/.config/gcloud:/.config/gcloud:rw
+
  tensorboard-rl:
    image: tensorflow/tensorflow:latest
    container_name: "PHANTOM-tensorboard-rl"
--- a/engine/engine.py
+++ b/engine/engine.py
@@ -1,5 +1,4 @@
 from sys import platform
-from concurrent.futures import ThreadPoolExecutor
 import numpy as np
 from .lib.demand import generate_demand_for_actor, estimate_demand
 from .lib.behavior import get_adjusted_transitions, sample_behavior_from_transitions
@@ -8,9 +7,6 @@ from logging import INFO, getLogger
 logger = getLogger(__name__)
 logger.setLevel(INFO)

-# shared pool; reused across act() calls to avoid per-call thread-spawn overhead
-_pool = ThreadPoolExecutor(max_workers=4)
-

 class MarketEngine:
    """implements separate demand distributions for humans and agents per Section 3.1.1"""
@@ -54,16 +50,14 @@ class MarketEngine:
        agent_transitions = get_adjusted_transitions(demand_a, human=False)
        # sample N trajectories in parallel; each chain is independent so threads
        # do not share state and numpy's per-call RNG is thread-safe
-        h_futs = [
-            _pool.submit(sample_behavior_from_transitions, human_transitions)
+        human_t = [
+            sample_behavior_from_transitions(human_transitions)
            for _ in range(self.Nhumans)
        ]
-        a_futs = [
-            _pool.submit(sample_behavior_from_transitions, agent_transitions)
+        agent_t = [
+            sample_behavior_from_transitions(agent_transitions)
            for _ in range(self.Nagents)
        ]
-        human_t = [f.result() for f in h_futs]
-        agent_t = [f.result() for f in a_futs]
        # store trajectories for agent probability calculation
        self.last_trajectories = human_t + agent_t
        return estimate_demand(self.last_trajectories, self.action_weights)
--- a/engine/lib/behavior.py
+++ b/engine/lib/behavior.py
@@ -143,6 +143,11 @@ def get_adjusted_transitions(condition, human=True) -> _TransitionTable:
    cache_key = (human, tuple(np.round(condition, 4).tolist()))
    if cache_key in _transition_cache:
        return _transition_cache[cache_key]
+
+    # prevent OOM: once the cache holds more than 100 entries, drop them all and start over
+    if len(_transition_cache) > 100:
+        _transition_cache.clear()
+
    base_pivot = _get_base_pivot(human)
    df = adjust_behavior_to_condition(condition, base_pivot)
    table = _TransitionTable(df)
--- a/lib/config.py
+++ b/lib/config.py
@@ -2,6 +2,7 @@
 All hardcoded paths should reference this module
 Paths can be overridden via environment variables
 """
+
 import os
 from pathlib import Path

@@ -9,24 +10,34 @@ from pathlib import Path
 PROJECT_ROOT = Path(__file__).parent.parent.resolve()

 # data directories
-DATA_DIR = Path(os.getenv('PHANTOM_DATA_DIR', PROJECT_ROOT / 'data'))
-EXPERIMENTS_DIR = Path(os.getenv('PHANTOM_EXPERIMENTS_DIR', PROJECT_ROOT / 'experiments'))
+DATA_DIR = Path(os.getenv("PHANTOM_DATA_DIR", PROJECT_ROOT / "data"))
+EXPERIMENTS_DIR = Path(
+    os.getenv("PHANTOM_EXPERIMENTS_DIR", PROJECT_ROOT / "experiments")
+)

 # agent/human interaction data
-AGENT_DATA_DIR = Path(os.getenv('PHANTOM_AGENT_DATA_DIR', DATA_DIR / 'agents'))
-HUMAN_DATA_DIR = Path(os.getenv('PHANTOM_HUMAN_DATA_DIR', DATA_DIR / 'humans'))
+AGENT_DATA_DIR = Path(os.getenv("PHANTOM_AGENT_DATA_DIR", DATA_DIR / "agents"))
+HUMAN_DATA_DIR = Path(os.getenv("PHANTOM_HUMAN_DATA_DIR", DATA_DIR / "humans"))

 # RL simulation runs
-SIM_RUNS_DIR = Path(os.getenv('PHANTOM_SIM_RUNS_DIR', PROJECT_ROOT / 'sim' / 'rl' / 'runs'))
+SIM_RUNS_DIR = Path(
+    os.getenv("PHANTOM_SIM_RUNS_DIR", PROJECT_ROOT / "sim" / "rl" / "runs")
+)

 # model artifacts
-MODEL_REGISTRY_DIR = Path(os.getenv('PHANTOM_MODEL_REGISTRY_DIR', DATA_DIR / 'models'))
+MODEL_REGISTRY_DIR = Path(os.getenv("PHANTOM_MODEL_REGISTRY_DIR", DATA_DIR / "models"))

 # collected experiment data
-COLLECTED_DATA_DIR = Path(os.getenv('PHANTOM_COLLECTED_DATA_DIR', EXPERIMENTS_DIR / 'agents' / 'collected_data'))
+COLLECTED_DATA_DIR = Path(
+    os.getenv(
+        "PHANTOM_COLLECTED_DATA_DIR", EXPERIMENTS_DIR / "agents" / "collected_data"
+    )
+)

 # notebook outputs
-NOTEBOOK_OUTPUT_DIR = Path(os.getenv('PHANTOM_NOTEBOOK_OUTPUT_DIR', EXPERIMENTS_DIR / 'notebooks' / 'outputs'))
+NOTEBOOK_OUTPUT_DIR = Path(
+    os.getenv("PHANTOM_NOTEBOOK_OUTPUT_DIR", EXPERIMENTS_DIR / "notebooks" / "outputs")
+)


 def ensure_dir(path: Path) -> Path:
@@ -51,15 +62,18 @@ def get_sim_path(*parts: str) -> Path:


 # service configuration (from .env)
-KAFKA_HOST = os.getenv('KAFKA_HOST', 'localhost')
-KAFKA_PORT = os.getenv('KAFKA_PORT', '9092')
+KAFKA_HOST = os.getenv("KAFKA_HOST", "localhost")
+KAFKA_PORT = os.getenv("KAFKA_PORT", "9092")
 KAFKA_BROKER = f"{KAFKA_HOST}:{KAFKA_PORT}"

-REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
-REDIS_PORT = int(os.getenv('REDIS_PORT', '6379'))
+REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
+REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))

-SUPABASE_URL = os.getenv('NEXT_PUBLIC_SUPABASE_URL', '')
-SUPABASE_ANON_KEY = os.getenv('NEXT_PUBLIC_SUPABASE_ANON_KEY', '')
+SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL", "")
+SUPABASE_ANON_KEY = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY", "")

-BACKEND_PORT = int(os.getenv('BACKEND_PORT', '5000'))
-PROVIDER_PORT = int(os.getenv('PROVIDER_PORT', '5001'))
+BACKEND_PORT = int(os.getenv("BACKEND_PORT", "5000"))
+PROVIDER_PORT = int(os.getenv("PROVIDER_PORT", "5001"))
+
+# huggingface dataset repo for collected behavioral data
+HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "velocitatem/phantom-collected-data")
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,3 +13,4 @@ scikit-learn
 supabase
 pymc
 wandb
+huggingface_hub