tpu ready remodel

This commit is contained in:
2026-03-11 20:49:28 +01:00
parent fa2dde8307
commit d3a4febfde
13 changed files with 63 additions and 156 deletions

View File

@@ -37,7 +37,7 @@ SWEEP_ENV_LOAD = set -a; [ -f "$(SWEEP_ENV_FILE)" ] && . "$(SWEEP_ENV_FILE)" ||
help:
@echo "pdf.build pdf.watch pdf.clean pdf.genpop pdf.genpop.watch pdf.arxiv | test.backend test.e2e test.all | web.dev | install | train | benchmark | benchmark.simple | benchmark.agent | train.agent | train.bootstrap | stats.lines"
@echo "backend.server backend.provider backend.worker | platform.up platform.down platform.logs | docker.train.publish"
@echo "study.margin-erosion study.margin-erosion.quick study.margin-erosion.plot"
@echo "data.pull data.push | study.margin-erosion study.margin-erosion.quick study.margin-erosion.plot"
@echo ""
@echo "Build general public version:"
@echo " make pdf.genpop"
@@ -134,6 +134,13 @@ train.agent:
# train.bootstrap: run the research:train-bootstrap target through $(NX),
# forwarding the W&B, repository and sweep-agent settings below as environment
# variables of that single command.
train.bootstrap:
	@WANDB_ENTITY="$(WANDB_ENTITY)" \
	WANDB_PROJECT="$(WANDB_PROJECT)" \
	SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" \
	REPO_URL="$(REPO_URL)" \
	BRANCH="$(BRANCH)" \
	WORKDIR="$(WORKDIR)" \
	SWEEP_ID="$(SWEEP_ID)" \
	AGENT_COUNT="$(AGENT_COUNT)" \
	AGENT_LOOP="$(AGENT_LOOP)" \
	RETRY_SECONDS="$(RETRY_SECONDS)" \
	$(NX) run research:train-bootstrap
# data.pull / data.push: invoke scripts/hf_data.py with the matching
# sub-command. Both are commands rather than files, hence .PHONY.
# NOTE(review): presumably syncs the collected dataset with the Hugging Face
# dataset repo — confirm against scripts/hf_data.py.
.PHONY: data.pull data.push

data.pull:
	python scripts/hf_data.py pull

data.push:
	python scripts/hf_data.py push
.PHONY: stats.lines
stats.lines:
@$(NX) run research:stats

View File

@@ -1,6 +0,0 @@
64 spot Cloud TPU v6e chips in zone europe-west4-a
32 spot Cloud TPU v4 chips in zone us-central2-b
64 spot Cloud TPU v5e chips in zone us-central1-a
64 spot Cloud TPU v6e chips in zone us-east1-d
32 on-demand Cloud TPU v4 chips in zone us-central2-b
64 spot Cloud TPU v5e chips in zone europe-west4-b

View File

@@ -1,22 +0,0 @@
# 32 spot Cloud TPU v4 chips in zone us-central2-b
#
# First attempts to create the spot TPU VM directly. If that command fails
# (for any reason), a queued-resource request for the same node is submitted
# instead.
export PROJECT_ID=phantom-trc
export ZONE=us-central2-b
export ACCELERATOR_TYPE=v4-32
export RUNTIME_VERSION=v2-alpha-tpuv4
export TPU_NAME=tpu-v4-32-uc2b-spot
export QR_NAME=TPUv4s32spotUC2B

if ! gcloud compute tpus tpu-vm create "${TPU_NAME}" \
    --project="${PROJECT_ID}" \
    --zone="${ZONE}" \
    --accelerator-type="${ACCELERATOR_TYPE}" \
    --version="${RUNTIME_VERSION}" \
    --spot
then
    # fallback: queue the request under QR_NAME, targeting the same node id
    gcloud compute tpus queued-resources create "${QR_NAME}" \
        --project="${PROJECT_ID}" \
        --zone="${ZONE}" \
        --node-id="${TPU_NAME}" \
        --accelerator-type="${ACCELERATOR_TYPE}" \
        --runtime-version="${RUNTIME_VERSION}" \
        --spot
fi

View File

@@ -1,13 +0,0 @@
# 32 on-demand Cloud TPU v4 chips in zone us-central2-b
#
# Submits a queued-resource request (no --spot flag, i.e. on-demand capacity)
# for a v4-32 node.
export PROJECT_ID=phantom-trc
export QR_NAME=TPUlong
# TPU_NAME was referenced below but never set in this script, so --node-id
# expanded to an empty string unless the caller's environment happened to
# provide it. Keep any inherited value, otherwise fall back to a default.
# NOTE(review): the default follows the naming of the spot v4 script
# (tpu-v4-32-uc2b-spot) — confirm the intended node id.
export TPU_NAME="${TPU_NAME:-tpu-v4-32-uc2b-ondemand}"
export ZONE=us-central2-b
export ACCELERATOR_TYPE=v4-32
export RUNTIME_VERSION=v2-alpha-tpuv4
# direct (non-queued) creation, kept for reference:
#gcloud compute tpus tpu-vm create ${TPU_NAME} --zone=${ZONE} --project=${PROJECT_ID} --accelerator-type=${ACCELERATOR_TYPE} --version=${RUNTIME_VERSION}
gcloud compute tpus queued-resources create "${QR_NAME}" \
    --project="${PROJECT_ID}" \
    --zone="${ZONE}" \
    --node-id="${TPU_NAME}" \
    --accelerator-type="${ACCELERATOR_TYPE}" \
    --runtime-version="${RUNTIME_VERSION}"

View File

@@ -1,22 +0,0 @@
# 64 spot Cloud TPU v5e chips in zone europe-west4-b
#
# First attempts to create the spot TPU VM directly. If that command fails
# (for any reason), a queued-resource request for the same node is submitted
# instead.
export PROJECT_ID=phantom-trc
export ZONE=europe-west4-b
export ACCELERATOR_TYPE=v5e-64
export RUNTIME_VERSION=v2-alpha-tpuv5-lite
export TPU_NAME=tpu-v5e-64-ew4b
export QR_NAME=TPUv5e64spotEW4B

if ! gcloud compute tpus tpu-vm create "${TPU_NAME}" \
    --project="${PROJECT_ID}" \
    --zone="${ZONE}" \
    --accelerator-type="${ACCELERATOR_TYPE}" \
    --version="${RUNTIME_VERSION}" \
    --spot
then
    # fallback: queue the request under QR_NAME, targeting the same node id
    gcloud compute tpus queued-resources create "${QR_NAME}" \
        --project="${PROJECT_ID}" \
        --zone="${ZONE}" \
        --node-id="${TPU_NAME}" \
        --accelerator-type="${ACCELERATOR_TYPE}" \
        --runtime-version="${RUNTIME_VERSION}" \
        --spot
fi

View File

@@ -1,22 +0,0 @@
# 64 spot Cloud TPU v5e chips in zone us-central1-a
#
# First attempts to create the spot TPU VM directly. If that command fails
# (for any reason), a queued-resource request for the same node is submitted
# instead.
export PROJECT_ID=phantom-trc
export ZONE=us-central1-a
export ACCELERATOR_TYPE=v5e-64
export RUNTIME_VERSION=v2-alpha-tpuv5-lite
export TPU_NAME=tpu-v5e-64-uc1a
export QR_NAME=TPUv5e64spotUC1A

if ! gcloud compute tpus tpu-vm create "${TPU_NAME}" \
    --project="${PROJECT_ID}" \
    --zone="${ZONE}" \
    --accelerator-type="${ACCELERATOR_TYPE}" \
    --version="${RUNTIME_VERSION}" \
    --spot
then
    # fallback: queue the request under QR_NAME, targeting the same node id
    gcloud compute tpus queued-resources create "${QR_NAME}" \
        --project="${PROJECT_ID}" \
        --zone="${ZONE}" \
        --node-id="${TPU_NAME}" \
        --accelerator-type="${ACCELERATOR_TYPE}" \
        --runtime-version="${RUNTIME_VERSION}" \
        --spot
fi

View File

@@ -1,22 +0,0 @@
# 64 spot Cloud TPU v6e chips in zone europe-west4-a
#
# First attempts to create the spot TPU VM directly. If that command fails
# (for any reason), a queued-resource request for the same node is submitted
# instead.
export PROJECT_ID=phantom-trc
export ZONE=europe-west4-a
export ACCELERATOR_TYPE=v6e-64
export RUNTIME_VERSION=v2-alpha-tpuv6e
export TPU_NAME=tpu-v6e-64-ew4a
export QR_NAME=TPUv6e64spotEW4A

if ! gcloud compute tpus tpu-vm create "${TPU_NAME}" \
    --project="${PROJECT_ID}" \
    --zone="${ZONE}" \
    --accelerator-type="${ACCELERATOR_TYPE}" \
    --version="${RUNTIME_VERSION}" \
    --spot
then
    # fallback: queue the request under QR_NAME, targeting the same node id
    gcloud compute tpus queued-resources create "${QR_NAME}" \
        --project="${PROJECT_ID}" \
        --zone="${ZONE}" \
        --node-id="${TPU_NAME}" \
        --accelerator-type="${ACCELERATOR_TYPE}" \
        --runtime-version="${RUNTIME_VERSION}" \
        --spot
fi

View File

@@ -1,22 +0,0 @@
# 64 spot Cloud TPU v6e chips in zone us-east1-d
#
# First attempts to create the spot TPU VM directly. If that command fails
# (for any reason), a queued-resource request for the same node is submitted
# instead.
export PROJECT_ID=phantom-trc
export ZONE=us-east1-d
export ACCELERATOR_TYPE=v6e-64
export RUNTIME_VERSION=v2-alpha-tpuv6e
export TPU_NAME=tpu-v6e-64-ue1d
export QR_NAME=TPUv6e64spotUE1D

if ! gcloud compute tpus tpu-vm create "${TPU_NAME}" \
    --project="${PROJECT_ID}" \
    --zone="${ZONE}" \
    --accelerator-type="${ACCELERATOR_TYPE}" \
    --version="${RUNTIME_VERSION}" \
    --spot
then
    # fallback: queue the request under QR_NAME, targeting the same node id
    gcloud compute tpus queued-resources create "${QR_NAME}" \
        --project="${PROJECT_ID}" \
        --zone="${ZONE}" \
        --node-id="${TPU_NAME}" \
        --accelerator-type="${ACCELERATOR_TYPE}" \
        --runtime-version="${RUNTIME_VERSION}" \
        --spot
fi

View File

@@ -1,4 +1,19 @@
services:
  # tpu-watchdogs: image built locally from docker/TPUWatchdog.dockerfile,
  # restarted automatically unless explicitly stopped.
  tpu-watchdogs:
    container_name: "PHANTOM-tpu-watchdogs"
    build:
      context: .
      dockerfile: docker/TPUWatchdog.dockerfile
    restart: unless-stopped
    # run as UID:GID from the compose environment; each falls back to 1000
    # when unset or empty
    user: "${UID:-1000}:${GID:-1000}"
    environment:
      # tokens are passed through from the environment compose is started in
      HF_TOKEN: "${HF_TOKEN}"
      WANDB_API_KEY: "${WANDB_API_KEY}"
      GITHUB_TOKEN: "${GITHUB_TOKEN}"
      # point gcloud at the bind-mounted configuration directory below
      CLOUDSDK_CONFIG: /.config/gcloud
    volumes:
      # host gcloud configuration, mounted read-write
      - ~/.config/gcloud:/.config/gcloud:rw
tensorboard-rl:
image: tensorflow/tensorflow:latest
container_name: "PHANTOM-tensorboard-rl"

View File

@@ -1,5 +1,4 @@
from sys import platform
from concurrent.futures import ThreadPoolExecutor
import numpy as np
from .lib.demand import generate_demand_for_actor, estimate_demand
from .lib.behavior import get_adjusted_transitions, sample_behavior_from_transitions
@@ -8,9 +7,6 @@ from logging import INFO, getLogger
logger = getLogger(__name__)
logger.setLevel(INFO)
# shared pool; reused across act() calls to avoid per-call thread-spawn overhead
_pool = ThreadPoolExecutor(max_workers=4)
class MarketEngine:
"""implements separate demand distributions for humans and agents per Section 3.1.1"""
@@ -54,16 +50,14 @@ class MarketEngine:
agent_transitions = get_adjusted_transitions(demand_a, human=False)
# sample N trajectories in parallel; each chain is independent so threads
# do not share state and numpy's per-call RNG is thread-safe
h_futs = [
_pool.submit(sample_behavior_from_transitions, human_transitions)
human_t = [
sample_behavior_from_transitions(human_transitions)
for _ in range(self.Nhumans)
]
a_futs = [
_pool.submit(sample_behavior_from_transitions, agent_transitions)
agent_t = [
sample_behavior_from_transitions(agent_transitions)
for _ in range(self.Nagents)
]
human_t = [f.result() for f in h_futs]
agent_t = [f.result() for f in a_futs]
# store trajectories for agent probability calculation
self.last_trajectories = human_t + agent_t
return estimate_demand(self.last_trajectories, self.action_weights)

View File

@@ -143,6 +143,11 @@ def get_adjusted_transitions(condition, human=True) -> _TransitionTable:
cache_key = (human, tuple(np.round(condition, 4).tolist()))
if cache_key in _transition_cache:
return _transition_cache[cache_key]
# prevent OOM by capping cache size
if len(_transition_cache) > 100:
_transition_cache.clear()
base_pivot = _get_base_pivot(human)
df = adjust_behavior_to_condition(condition, base_pivot)
table = _TransitionTable(df)

View File

@@ -2,6 +2,7 @@
All hardcoded paths should reference this module
Paths can be overridden via environment variables
"""
import os
from pathlib import Path
@@ -9,24 +10,34 @@ from pathlib import Path
PROJECT_ROOT = Path(__file__).parent.parent.resolve()
# data directories
DATA_DIR = Path(os.getenv('PHANTOM_DATA_DIR', PROJECT_ROOT / 'data'))
EXPERIMENTS_DIR = Path(os.getenv('PHANTOM_EXPERIMENTS_DIR', PROJECT_ROOT / 'experiments'))
DATA_DIR = Path(os.getenv("PHANTOM_DATA_DIR", PROJECT_ROOT / "data"))
EXPERIMENTS_DIR = Path(
os.getenv("PHANTOM_EXPERIMENTS_DIR", PROJECT_ROOT / "experiments")
)
# agent/human interaction data
AGENT_DATA_DIR = Path(os.getenv('PHANTOM_AGENT_DATA_DIR', DATA_DIR / 'agents'))
HUMAN_DATA_DIR = Path(os.getenv('PHANTOM_HUMAN_DATA_DIR', DATA_DIR / 'humans'))
AGENT_DATA_DIR = Path(os.getenv("PHANTOM_AGENT_DATA_DIR", DATA_DIR / "agents"))
HUMAN_DATA_DIR = Path(os.getenv("PHANTOM_HUMAN_DATA_DIR", DATA_DIR / "humans"))
# RL simulation runs
SIM_RUNS_DIR = Path(os.getenv('PHANTOM_SIM_RUNS_DIR', PROJECT_ROOT / 'sim' / 'rl' / 'runs'))
SIM_RUNS_DIR = Path(
os.getenv("PHANTOM_SIM_RUNS_DIR", PROJECT_ROOT / "sim" / "rl" / "runs")
)
# model artifacts
MODEL_REGISTRY_DIR = Path(os.getenv('PHANTOM_MODEL_REGISTRY_DIR', DATA_DIR / 'models'))
MODEL_REGISTRY_DIR = Path(os.getenv("PHANTOM_MODEL_REGISTRY_DIR", DATA_DIR / "models"))
# collected experiment data
COLLECTED_DATA_DIR = Path(os.getenv('PHANTOM_COLLECTED_DATA_DIR', EXPERIMENTS_DIR / 'agents' / 'collected_data'))
COLLECTED_DATA_DIR = Path(
os.getenv(
"PHANTOM_COLLECTED_DATA_DIR", EXPERIMENTS_DIR / "agents" / "collected_data"
)
)
# notebook outputs
NOTEBOOK_OUTPUT_DIR = Path(os.getenv('PHANTOM_NOTEBOOK_OUTPUT_DIR', EXPERIMENTS_DIR / 'notebooks' / 'outputs'))
NOTEBOOK_OUTPUT_DIR = Path(
os.getenv("PHANTOM_NOTEBOOK_OUTPUT_DIR", EXPERIMENTS_DIR / "notebooks" / "outputs")
)
def ensure_dir(path: Path) -> Path:
@@ -51,15 +62,18 @@ def get_sim_path(*parts: str) -> Path:
# service configuration (from .env)
KAFKA_HOST = os.getenv('KAFKA_HOST', 'localhost')
KAFKA_PORT = os.getenv('KAFKA_PORT', '9092')
KAFKA_HOST = os.getenv("KAFKA_HOST", "localhost")
KAFKA_PORT = os.getenv("KAFKA_PORT", "9092")
KAFKA_BROKER = f"{KAFKA_HOST}:{KAFKA_PORT}"
REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
REDIS_PORT = int(os.getenv('REDIS_PORT', '6379'))
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
SUPABASE_URL = os.getenv('NEXT_PUBLIC_SUPABASE_URL', '')
SUPABASE_ANON_KEY = os.getenv('NEXT_PUBLIC_SUPABASE_ANON_KEY', '')
SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL", "")
SUPABASE_ANON_KEY = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY", "")
BACKEND_PORT = int(os.getenv('BACKEND_PORT', '5000'))
PROVIDER_PORT = int(os.getenv('PROVIDER_PORT', '5001'))
BACKEND_PORT = int(os.getenv("BACKEND_PORT", "5000"))
PROVIDER_PORT = int(os.getenv("PROVIDER_PORT", "5001"))
# huggingface dataset repo for collected behavioral data
HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "velocitatem/phantom-collected-data")

View File

@@ -13,3 +13,4 @@ scikit-learn
supabase
pymc
wandb
huggingface_hub