tpu ready remodel

This commit is contained in:
2026-03-11 20:49:28 +01:00
parent fa2dde8307
commit d3a4febfde
13 changed files with 63 additions and 156 deletions

View File

@@ -37,7 +37,7 @@ SWEEP_ENV_LOAD = set -a; [ -f "$(SWEEP_ENV_FILE)" ] && . "$(SWEEP_ENV_FILE)" ||
help: help:
@echo "pdf.build pdf.watch pdf.clean pdf.genpop pdf.genpop.watch pdf.arxiv | test.backend test.e2e test.all | web.dev | install | train | benchmark | benchmark.simple | benchmark.agent | train.agent | train.bootstrap | stats.lines" @echo "pdf.build pdf.watch pdf.clean pdf.genpop pdf.genpop.watch pdf.arxiv | test.backend test.e2e test.all | web.dev | install | train | benchmark | benchmark.simple | benchmark.agent | train.agent | train.bootstrap | stats.lines"
@echo "backend.server backend.provider backend.worker | platform.up platform.down platform.logs | docker.train.publish" @echo "backend.server backend.provider backend.worker | platform.up platform.down platform.logs | docker.train.publish"
@echo "study.margin-erosion study.margin-erosion.quick study.margin-erosion.plot" @echo "data.pull data.push | study.margin-erosion study.margin-erosion.quick study.margin-erosion.plot"
@echo "" @echo ""
@echo "Build general public version:" @echo "Build general public version:"
@echo " make pdf.genpop" @echo " make pdf.genpop"
@@ -134,6 +134,13 @@ train.agent:
train.bootstrap: train.bootstrap:
@WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" REPO_URL="$(REPO_URL)" BRANCH="$(BRANCH)" WORKDIR="$(WORKDIR)" SWEEP_ID="$(SWEEP_ID)" AGENT_COUNT="$(AGENT_COUNT)" AGENT_LOOP="$(AGENT_LOOP)" RETRY_SECONDS="$(RETRY_SECONDS)" $(NX) run research:train-bootstrap @WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" REPO_URL="$(REPO_URL)" BRANCH="$(BRANCH)" WORKDIR="$(WORKDIR)" SWEEP_ID="$(SWEEP_ID)" AGENT_COUNT="$(AGENT_COUNT)" AGENT_LOOP="$(AGENT_LOOP)" RETRY_SECONDS="$(RETRY_SECONDS)" $(NX) run research:train-bootstrap
# Dataset sync helpers: both targets run scripts/hf_data.py and pass the
# subcommand name ("pull" or "push") as its only argument.
# NOTE(review): presumably this syncs with a Hugging Face dataset repo; confirm in scripts/hf_data.py.
.PHONY: data.pull data.push
# invoke the script's "pull" subcommand
data.pull:
python scripts/hf_data.py pull
# invoke the script's "push" subcommand
data.push:
python scripts/hf_data.py push
.PHONY: stats.lines .PHONY: stats.lines
stats.lines: stats.lines:
@$(NX) run research:stats @$(NX) run research:stats

View File

@@ -1,6 +0,0 @@
64 spot Cloud TPU v6e chips in zone europe-west4-a
32 spot Cloud TPU v4 chips in zone us-central2-b
64 spot Cloud TPU v5e chips in zone us-central1-a
64 spot Cloud TPU v6e chips in zone us-east1-d
32 on-demand Cloud TPU v4 chips in zone us-central2-b
64 spot Cloud TPU v5e chips in zone europe-west4-b

View File

@@ -1,22 +0,0 @@
# Provision: 32 spot Cloud TPU v4 chips in zone us-central2-b
#
# Strategy: attempt a direct spot TPU VM creation; when that command fails,
# submit a spot queued-resource request for the same node id instead.
export PROJECT_ID=phantom-trc ZONE=us-central2-b
export QR_NAME=TPUv4s32spotUC2B TPU_NAME=tpu-v4-32-uc2b-spot
export ACCELERATOR_TYPE=v4-32 RUNTIME_VERSION=v2-alpha-tpuv4

if ! gcloud compute tpus tpu-vm create "${TPU_NAME}" \
    --project="${PROJECT_ID}" \
    --zone="${ZONE}" \
    --accelerator-type="${ACCELERATOR_TYPE}" \
    --version="${RUNTIME_VERSION}" \
    --spot
then
    # direct creation failed: fall back to the queued-resources API
    gcloud compute tpus queued-resources create "${QR_NAME}" \
        --project="${PROJECT_ID}" \
        --zone="${ZONE}" \
        --node-id="${TPU_NAME}" \
        --accelerator-type="${ACCELERATOR_TYPE}" \
        --runtime-version="${RUNTIME_VERSION}" \
        --spot
fi

View File

@@ -1,13 +0,0 @@
# 32 on-demand Cloud TPU v4 chips in zone us-central2-b
#
# Submits a queued-resource request without --spot (i.e. not a spot request).
#
# TPU_NAME was never assigned in this script, so --node-id expanded to an
# empty value unless the caller happened to export it beforehand. It now
# defaults to a name patterned on the sibling v4 spot script; a TPU_NAME
# already present in the environment still takes precedence.
export PROJECT_ID=phantom-trc
export QR_NAME=TPUlong
export TPU_NAME="${TPU_NAME:-tpu-v4-32-uc2b-ondemand}"
export ZONE=us-central2-b
export ACCELERATOR_TYPE=v4-32
export RUNTIME_VERSION=v2-alpha-tpuv4

# Direct (non-queued) alternative, kept disabled for reference:
#gcloud compute tpus tpu-vm create ${TPU_NAME} --zone=${ZONE} --project=${PROJECT_ID} --accelerator-type=${ACCELERATOR_TYPE} --version=${RUNTIME_VERSION}
gcloud compute tpus queued-resources create "${QR_NAME}" \
    --project="${PROJECT_ID}" \
    --zone="${ZONE}" \
    --node-id="${TPU_NAME}" \
    --accelerator-type="${ACCELERATOR_TYPE}" \
    --runtime-version="${RUNTIME_VERSION}"

View File

@@ -1,22 +0,0 @@
# Provision: 64 spot Cloud TPU v5e chips in zone europe-west4-b
#
# Strategy: attempt a direct spot TPU VM creation; when that command fails,
# submit a spot queued-resource request for the same node id instead.
# NOTE(review): Cloud TPU documentation names v5e accelerator types
# "v5litepod-N"; confirm that "v5e-64" is accepted by gcloud.
export PROJECT_ID=phantom-trc ZONE=europe-west4-b
export QR_NAME=TPUv5e64spotEW4B TPU_NAME=tpu-v5e-64-ew4b
export ACCELERATOR_TYPE=v5e-64 RUNTIME_VERSION=v2-alpha-tpuv5-lite

if ! gcloud compute tpus tpu-vm create "${TPU_NAME}" \
    --project="${PROJECT_ID}" \
    --zone="${ZONE}" \
    --accelerator-type="${ACCELERATOR_TYPE}" \
    --version="${RUNTIME_VERSION}" \
    --spot
then
    # direct creation failed: fall back to the queued-resources API
    gcloud compute tpus queued-resources create "${QR_NAME}" \
        --project="${PROJECT_ID}" \
        --zone="${ZONE}" \
        --node-id="${TPU_NAME}" \
        --accelerator-type="${ACCELERATOR_TYPE}" \
        --runtime-version="${RUNTIME_VERSION}" \
        --spot
fi

View File

@@ -1,22 +0,0 @@
# Provision: 64 spot Cloud TPU v5e chips in zone us-central1-a
#
# Strategy: attempt a direct spot TPU VM creation; when that command fails,
# submit a spot queued-resource request for the same node id instead.
# NOTE(review): Cloud TPU documentation names v5e accelerator types
# "v5litepod-N"; confirm that "v5e-64" is accepted by gcloud.
export PROJECT_ID=phantom-trc ZONE=us-central1-a
export QR_NAME=TPUv5e64spotUC1A TPU_NAME=tpu-v5e-64-uc1a
export ACCELERATOR_TYPE=v5e-64 RUNTIME_VERSION=v2-alpha-tpuv5-lite

if ! gcloud compute tpus tpu-vm create "${TPU_NAME}" \
    --project="${PROJECT_ID}" \
    --zone="${ZONE}" \
    --accelerator-type="${ACCELERATOR_TYPE}" \
    --version="${RUNTIME_VERSION}" \
    --spot
then
    # direct creation failed: fall back to the queued-resources API
    gcloud compute tpus queued-resources create "${QR_NAME}" \
        --project="${PROJECT_ID}" \
        --zone="${ZONE}" \
        --node-id="${TPU_NAME}" \
        --accelerator-type="${ACCELERATOR_TYPE}" \
        --runtime-version="${RUNTIME_VERSION}" \
        --spot
fi

View File

@@ -1,22 +0,0 @@
# Provision: 64 spot Cloud TPU v6e chips in zone europe-west4-a
#
# Strategy: attempt a direct spot TPU VM creation; when that command fails,
# submit a spot queued-resource request for the same node id instead.
export PROJECT_ID=phantom-trc ZONE=europe-west4-a
export QR_NAME=TPUv6e64spotEW4A TPU_NAME=tpu-v6e-64-ew4a
export ACCELERATOR_TYPE=v6e-64 RUNTIME_VERSION=v2-alpha-tpuv6e

if ! gcloud compute tpus tpu-vm create "${TPU_NAME}" \
    --project="${PROJECT_ID}" \
    --zone="${ZONE}" \
    --accelerator-type="${ACCELERATOR_TYPE}" \
    --version="${RUNTIME_VERSION}" \
    --spot
then
    # direct creation failed: fall back to the queued-resources API
    gcloud compute tpus queued-resources create "${QR_NAME}" \
        --project="${PROJECT_ID}" \
        --zone="${ZONE}" \
        --node-id="${TPU_NAME}" \
        --accelerator-type="${ACCELERATOR_TYPE}" \
        --runtime-version="${RUNTIME_VERSION}" \
        --spot
fi

View File

@@ -1,22 +0,0 @@
# Provision: 64 spot Cloud TPU v6e chips in zone us-east1-d
#
# Strategy: attempt a direct spot TPU VM creation; when that command fails,
# submit a spot queued-resource request for the same node id instead.
export PROJECT_ID=phantom-trc ZONE=us-east1-d
export QR_NAME=TPUv6e64spotUE1D TPU_NAME=tpu-v6e-64-ue1d
export ACCELERATOR_TYPE=v6e-64 RUNTIME_VERSION=v2-alpha-tpuv6e

if ! gcloud compute tpus tpu-vm create "${TPU_NAME}" \
    --project="${PROJECT_ID}" \
    --zone="${ZONE}" \
    --accelerator-type="${ACCELERATOR_TYPE}" \
    --version="${RUNTIME_VERSION}" \
    --spot
then
    # direct creation failed: fall back to the queued-resources API
    gcloud compute tpus queued-resources create "${QR_NAME}" \
        --project="${PROJECT_ID}" \
        --zone="${ZONE}" \
        --node-id="${TPU_NAME}" \
        --accelerator-type="${ACCELERATOR_TYPE}" \
        --runtime-version="${RUNTIME_VERSION}" \
        --spot
fi

View File

@@ -1,4 +1,19 @@
services: services:
# TPU watchdog service; its image is built locally from docker/TPUWatchdog.dockerfile
tpu-watchdogs:
build:
# build context is the directory containing this compose file
context: .
dockerfile: docker/TPUWatchdog.dockerfile
container_name: "PHANTOM-tpu-watchdogs"
# restart automatically unless the container was explicitly stopped
restart: unless-stopped
# run as UID:GID taken from the Compose environment, falling back to 1000:1000 when unset
user: "${UID:-1000}:${GID:-1000}"
environment:
# tokens are interpolated by Compose from the host environment / .env file
- HF_TOKEN=${HF_TOKEN}
- WANDB_API_KEY=${WANDB_API_KEY}
- GITHUB_TOKEN=${GITHUB_TOKEN}
# point the gcloud SDK at the config directory mounted under volumes
- CLOUDSDK_CONFIG=/.config/gcloud
volumes:
# mount the host user's gcloud config read-write at the path named by CLOUDSDK_CONFIG
- ~/.config/gcloud:/.config/gcloud:rw
tensorboard-rl: tensorboard-rl:
image: tensorflow/tensorflow:latest image: tensorflow/tensorflow:latest
container_name: "PHANTOM-tensorboard-rl" container_name: "PHANTOM-tensorboard-rl"

View File

@@ -1,5 +1,4 @@
from sys import platform from sys import platform
from concurrent.futures import ThreadPoolExecutor
import numpy as np import numpy as np
from .lib.demand import generate_demand_for_actor, estimate_demand from .lib.demand import generate_demand_for_actor, estimate_demand
from .lib.behavior import get_adjusted_transitions, sample_behavior_from_transitions from .lib.behavior import get_adjusted_transitions, sample_behavior_from_transitions
@@ -8,9 +7,6 @@ from logging import INFO, getLogger
logger = getLogger(__name__) logger = getLogger(__name__)
logger.setLevel(INFO) logger.setLevel(INFO)
# shared pool; reused across act() calls to avoid per-call thread-spawn overhead
_pool = ThreadPoolExecutor(max_workers=4)
class MarketEngine: class MarketEngine:
"""implements separate demand distributions for humans and agents per Section 3.1.1""" """implements separate demand distributions for humans and agents per Section 3.1.1"""
@@ -54,16 +50,14 @@ class MarketEngine:
agent_transitions = get_adjusted_transitions(demand_a, human=False) agent_transitions = get_adjusted_transitions(demand_a, human=False)
# sample N trajectories in parallel; each chain is independent so threads # sample N trajectories in parallel; each chain is independent so threads
# do not share state and numpy's per-call RNG is thread-safe # do not share state and numpy's per-call RNG is thread-safe
h_futs = [ human_t = [
_pool.submit(sample_behavior_from_transitions, human_transitions) sample_behavior_from_transitions(human_transitions)
for _ in range(self.Nhumans) for _ in range(self.Nhumans)
] ]
a_futs = [ agent_t = [
_pool.submit(sample_behavior_from_transitions, agent_transitions) sample_behavior_from_transitions(agent_transitions)
for _ in range(self.Nagents) for _ in range(self.Nagents)
] ]
human_t = [f.result() for f in h_futs]
agent_t = [f.result() for f in a_futs]
# store trajectories for agent probability calculation # store trajectories for agent probability calculation
self.last_trajectories = human_t + agent_t self.last_trajectories = human_t + agent_t
return estimate_demand(self.last_trajectories, self.action_weights) return estimate_demand(self.last_trajectories, self.action_weights)

View File

@@ -143,6 +143,11 @@ def get_adjusted_transitions(condition, human=True) -> _TransitionTable:
cache_key = (human, tuple(np.round(condition, 4).tolist())) cache_key = (human, tuple(np.round(condition, 4).tolist()))
if cache_key in _transition_cache: if cache_key in _transition_cache:
return _transition_cache[cache_key] return _transition_cache[cache_key]
# prevent OOM by capping cache size
if len(_transition_cache) > 100:
_transition_cache.clear()
base_pivot = _get_base_pivot(human) base_pivot = _get_base_pivot(human)
df = adjust_behavior_to_condition(condition, base_pivot) df = adjust_behavior_to_condition(condition, base_pivot)
table = _TransitionTable(df) table = _TransitionTable(df)

View File

@@ -2,6 +2,7 @@
All hardcoded paths should reference this module All hardcoded paths should reference this module
Paths can be overridden via environment variables Paths can be overridden via environment variables
""" """
import os import os
from pathlib import Path from pathlib import Path
@@ -9,24 +10,34 @@ from pathlib import Path
PROJECT_ROOT = Path(__file__).parent.parent.resolve() PROJECT_ROOT = Path(__file__).parent.parent.resolve()
# data directories # data directories
DATA_DIR = Path(os.getenv('PHANTOM_DATA_DIR', PROJECT_ROOT / 'data')) DATA_DIR = Path(os.getenv("PHANTOM_DATA_DIR", PROJECT_ROOT / "data"))
EXPERIMENTS_DIR = Path(os.getenv('PHANTOM_EXPERIMENTS_DIR', PROJECT_ROOT / 'experiments')) EXPERIMENTS_DIR = Path(
os.getenv("PHANTOM_EXPERIMENTS_DIR", PROJECT_ROOT / "experiments")
)
# agent/human interaction data # agent/human interaction data
AGENT_DATA_DIR = Path(os.getenv('PHANTOM_AGENT_DATA_DIR', DATA_DIR / 'agents')) AGENT_DATA_DIR = Path(os.getenv("PHANTOM_AGENT_DATA_DIR", DATA_DIR / "agents"))
HUMAN_DATA_DIR = Path(os.getenv('PHANTOM_HUMAN_DATA_DIR', DATA_DIR / 'humans')) HUMAN_DATA_DIR = Path(os.getenv("PHANTOM_HUMAN_DATA_DIR", DATA_DIR / "humans"))
# RL simulation runs # RL simulation runs
SIM_RUNS_DIR = Path(os.getenv('PHANTOM_SIM_RUNS_DIR', PROJECT_ROOT / 'sim' / 'rl' / 'runs')) SIM_RUNS_DIR = Path(
os.getenv("PHANTOM_SIM_RUNS_DIR", PROJECT_ROOT / "sim" / "rl" / "runs")
)
# model artifacts # model artifacts
MODEL_REGISTRY_DIR = Path(os.getenv('PHANTOM_MODEL_REGISTRY_DIR', DATA_DIR / 'models')) MODEL_REGISTRY_DIR = Path(os.getenv("PHANTOM_MODEL_REGISTRY_DIR", DATA_DIR / "models"))
# collected experiment data # collected experiment data
COLLECTED_DATA_DIR = Path(os.getenv('PHANTOM_COLLECTED_DATA_DIR', EXPERIMENTS_DIR / 'agents' / 'collected_data')) COLLECTED_DATA_DIR = Path(
os.getenv(
"PHANTOM_COLLECTED_DATA_DIR", EXPERIMENTS_DIR / "agents" / "collected_data"
)
)
# notebook outputs # notebook outputs
NOTEBOOK_OUTPUT_DIR = Path(os.getenv('PHANTOM_NOTEBOOK_OUTPUT_DIR', EXPERIMENTS_DIR / 'notebooks' / 'outputs')) NOTEBOOK_OUTPUT_DIR = Path(
os.getenv("PHANTOM_NOTEBOOK_OUTPUT_DIR", EXPERIMENTS_DIR / "notebooks" / "outputs")
)
def ensure_dir(path: Path) -> Path: def ensure_dir(path: Path) -> Path:
@@ -51,15 +62,18 @@ def get_sim_path(*parts: str) -> Path:
# service configuration (from .env) # service configuration (from .env)
KAFKA_HOST = os.getenv('KAFKA_HOST', 'localhost') KAFKA_HOST = os.getenv("KAFKA_HOST", "localhost")
KAFKA_PORT = os.getenv('KAFKA_PORT', '9092') KAFKA_PORT = os.getenv("KAFKA_PORT", "9092")
KAFKA_BROKER = f"{KAFKA_HOST}:{KAFKA_PORT}" KAFKA_BROKER = f"{KAFKA_HOST}:{KAFKA_PORT}"
REDIS_HOST = os.getenv('REDIS_HOST', 'localhost') REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
REDIS_PORT = int(os.getenv('REDIS_PORT', '6379')) REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
SUPABASE_URL = os.getenv('NEXT_PUBLIC_SUPABASE_URL', '') SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL", "")
SUPABASE_ANON_KEY = os.getenv('NEXT_PUBLIC_SUPABASE_ANON_KEY', '') SUPABASE_ANON_KEY = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY", "")
BACKEND_PORT = int(os.getenv('BACKEND_PORT', '5000')) BACKEND_PORT = int(os.getenv("BACKEND_PORT", "5000"))
PROVIDER_PORT = int(os.getenv('PROVIDER_PORT', '5001')) PROVIDER_PORT = int(os.getenv("PROVIDER_PORT", "5001"))
# huggingface dataset repo for collected behavioral data
HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "velocitatem/phantom-collected-data")

View File

@@ -13,3 +13,4 @@ scikit-learn
supabase supabase
pymc pymc
wandb wandb
huggingface_hub