mirror of
https://github.com/velocitatem/PHANTOM.git
synced 2026-05-31 08:33:36 +00:00
tpu ready remodel
This commit is contained in:
9
Makefile
9
Makefile
@@ -37,7 +37,7 @@ SWEEP_ENV_LOAD = set -a; [ -f "$(SWEEP_ENV_FILE)" ] && . "$(SWEEP_ENV_FILE)" ||
|
||||
help:
|
||||
@echo "pdf.build pdf.watch pdf.clean pdf.genpop pdf.genpop.watch pdf.arxiv | test.backend test.e2e test.all | web.dev | install | train | benchmark | benchmark.simple | benchmark.agent | train.agent | train.bootstrap | stats.lines"
|
||||
@echo "backend.server backend.provider backend.worker | platform.up platform.down platform.logs | docker.train.publish"
|
||||
@echo "study.margin-erosion study.margin-erosion.quick study.margin-erosion.plot"
|
||||
@echo "data.pull data.push | study.margin-erosion study.margin-erosion.quick study.margin-erosion.plot"
|
||||
@echo ""
|
||||
@echo "Build general public version:"
|
||||
@echo " make pdf.genpop"
|
||||
@@ -134,6 +134,13 @@ train.agent:
|
||||
train.bootstrap:
|
||||
@WANDB_ENTITY="$(WANDB_ENTITY)" WANDB_PROJECT="$(WANDB_PROJECT)" SWEEP_ENV_FILE="$(SWEEP_ENV_FILE)" REPO_URL="$(REPO_URL)" BRANCH="$(BRANCH)" WORKDIR="$(WORKDIR)" SWEEP_ID="$(SWEEP_ID)" AGENT_COUNT="$(AGENT_COUNT)" AGENT_LOOP="$(AGENT_LOOP)" RETRY_SECONDS="$(RETRY_SECONDS)" $(NX) run research:train-bootstrap
|
||||
|
||||
.PHONY: data.pull data.push
|
||||
data.pull:
|
||||
python scripts/hf_data.py pull
|
||||
|
||||
data.push:
|
||||
python scripts/hf_data.py push
|
||||
|
||||
.PHONY: stats.lines
|
||||
stats.lines:
|
||||
@$(NX) run research:stats
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
64 spot Cloud TPU v6e chips in zone europe-west4-a
|
||||
32 spot Cloud TPU v4 chips in zone us-central2-b
|
||||
64 spot Cloud TPU v5e chips in zone us-central1-a
|
||||
64 spot Cloud TPU v6e chips in zone us-east1-d
|
||||
32 on-demand Cloud TPU v4 chips in zone us-central2-b
|
||||
64 spot Cloud TPU v5e chips in zone europe-west4-b
|
||||
@@ -1,22 +0,0 @@
|
||||
# 32 spot Cloud TPU v4 chips in zone us-central2-b
|
||||
export PROJECT_ID=phantom-trc
|
||||
export QR_NAME=TPUv4s32spotUC2B
|
||||
export TPU_NAME=tpu-v4-32-uc2b-spot
|
||||
export ZONE=us-central2-b
|
||||
export ACCELERATOR_TYPE=v4-32
|
||||
export RUNTIME_VERSION=v2-alpha-tpuv4
|
||||
|
||||
gcloud compute tpus tpu-vm create ${TPU_NAME} \
|
||||
--project=${PROJECT_ID} \
|
||||
--zone=${ZONE} \
|
||||
--accelerator-type=${ACCELERATOR_TYPE} \
|
||||
--version=${RUNTIME_VERSION} \
|
||||
--spot \
|
||||
|| \
|
||||
gcloud compute tpus queued-resources create ${QR_NAME} \
|
||||
--project=${PROJECT_ID} \
|
||||
--zone=${ZONE} \
|
||||
--node-id=${TPU_NAME} \
|
||||
--accelerator-type=${ACCELERATOR_TYPE} \
|
||||
--runtime-version=${RUNTIME_VERSION} \
|
||||
--spot
|
||||
@@ -1,13 +0,0 @@
|
||||
# 32 on-demand Cloud TPU v4 chips in zone us-central2-b
|
||||
export PROJECT_ID=phantom-trc
|
||||
export QR_NAME=TPUlong
|
||||
export ZONE=us-central2-b
|
||||
export ACCELERATOR_TYPE=v4-32
|
||||
export RUNTIME_VERSION=v2-alpha-tpuv4
|
||||
#gcloud compute tpus tpu-vm create ${TPU_NAME} --zone=${ZONE} --project=${PROJECT_ID} --accelerator-type=${ACCELERATOR_TYPE} --version=${RUNTIME_VERSION}
|
||||
gcloud compute tpus queued-resources create ${QR_NAME} \
|
||||
--project=${PROJECT_ID} \
|
||||
--zone=${ZONE} \
|
||||
--node-id=${TPU_NAME} \
|
||||
--accelerator-type=${ACCELERATOR_TYPE} \
|
||||
--runtime-version=${RUNTIME_VERSION}
|
||||
@@ -1,22 +0,0 @@
|
||||
# 64 spot Cloud TPU v5e chips in zone europe-west4-b
|
||||
export PROJECT_ID=phantom-trc
|
||||
export QR_NAME=TPUv5e64spotEW4B
|
||||
export TPU_NAME=tpu-v5e-64-ew4b
|
||||
export ZONE=europe-west4-b
|
||||
export ACCELERATOR_TYPE=v5e-64
|
||||
export RUNTIME_VERSION=v2-alpha-tpuv5-lite
|
||||
|
||||
gcloud compute tpus tpu-vm create ${TPU_NAME} \
|
||||
--project=${PROJECT_ID} \
|
||||
--zone=${ZONE} \
|
||||
--accelerator-type=${ACCELERATOR_TYPE} \
|
||||
--version=${RUNTIME_VERSION} \
|
||||
--spot \
|
||||
|| \
|
||||
gcloud compute tpus queued-resources create ${QR_NAME} \
|
||||
--project=${PROJECT_ID} \
|
||||
--zone=${ZONE} \
|
||||
--node-id=${TPU_NAME} \
|
||||
--accelerator-type=${ACCELERATOR_TYPE} \
|
||||
--runtime-version=${RUNTIME_VERSION} \
|
||||
--spot
|
||||
@@ -1,22 +0,0 @@
|
||||
# 64 spot Cloud TPU v5e chips in zone us-central1-a
|
||||
export PROJECT_ID=phantom-trc
|
||||
export QR_NAME=TPUv5e64spotUC1A
|
||||
export TPU_NAME=tpu-v5e-64-uc1a
|
||||
export ZONE=us-central1-a
|
||||
export ACCELERATOR_TYPE=v5e-64
|
||||
export RUNTIME_VERSION=v2-alpha-tpuv5-lite
|
||||
|
||||
gcloud compute tpus tpu-vm create ${TPU_NAME} \
|
||||
--project=${PROJECT_ID} \
|
||||
--zone=${ZONE} \
|
||||
--accelerator-type=${ACCELERATOR_TYPE} \
|
||||
--version=${RUNTIME_VERSION} \
|
||||
--spot \
|
||||
|| \
|
||||
gcloud compute tpus queued-resources create ${QR_NAME} \
|
||||
--project=${PROJECT_ID} \
|
||||
--zone=${ZONE} \
|
||||
--node-id=${TPU_NAME} \
|
||||
--accelerator-type=${ACCELERATOR_TYPE} \
|
||||
--runtime-version=${RUNTIME_VERSION} \
|
||||
--spot
|
||||
@@ -1,22 +0,0 @@
|
||||
# 64 spot Cloud TPU v6e chips in zone europe-west4-a
|
||||
export PROJECT_ID=phantom-trc
|
||||
export QR_NAME=TPUv6e64spotEW4A
|
||||
export TPU_NAME=tpu-v6e-64-ew4a
|
||||
export ZONE=europe-west4-a
|
||||
export ACCELERATOR_TYPE=v6e-64
|
||||
export RUNTIME_VERSION=v2-alpha-tpuv6e
|
||||
|
||||
gcloud compute tpus tpu-vm create ${TPU_NAME} \
|
||||
--project=${PROJECT_ID} \
|
||||
--zone=${ZONE} \
|
||||
--accelerator-type=${ACCELERATOR_TYPE} \
|
||||
--version=${RUNTIME_VERSION} \
|
||||
--spot \
|
||||
|| \
|
||||
gcloud compute tpus queued-resources create ${QR_NAME} \
|
||||
--project=${PROJECT_ID} \
|
||||
--zone=${ZONE} \
|
||||
--node-id=${TPU_NAME} \
|
||||
--accelerator-type=${ACCELERATOR_TYPE} \
|
||||
--runtime-version=${RUNTIME_VERSION} \
|
||||
--spot
|
||||
@@ -1,22 +0,0 @@
|
||||
# 64 spot Cloud TPU v6e chips in zone us-east1-d
|
||||
export PROJECT_ID=phantom-trc
|
||||
export QR_NAME=TPUv6e64spotUE1D
|
||||
export TPU_NAME=tpu-v6e-64-ue1d
|
||||
export ZONE=us-east1-d
|
||||
export ACCELERATOR_TYPE=v6e-64
|
||||
export RUNTIME_VERSION=v2-alpha-tpuv6e
|
||||
|
||||
gcloud compute tpus tpu-vm create ${TPU_NAME} \
|
||||
--project=${PROJECT_ID} \
|
||||
--zone=${ZONE} \
|
||||
--accelerator-type=${ACCELERATOR_TYPE} \
|
||||
--version=${RUNTIME_VERSION} \
|
||||
--spot \
|
||||
|| \
|
||||
gcloud compute tpus queued-resources create ${QR_NAME} \
|
||||
--project=${PROJECT_ID} \
|
||||
--zone=${ZONE} \
|
||||
--node-id=${TPU_NAME} \
|
||||
--accelerator-type=${ACCELERATOR_TYPE} \
|
||||
--runtime-version=${RUNTIME_VERSION} \
|
||||
--spot
|
||||
@@ -1,4 +1,19 @@
|
||||
services:
|
||||
tpu-watchdogs:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: docker/TPUWatchdog.dockerfile
|
||||
container_name: "PHANTOM-tpu-watchdogs"
|
||||
restart: unless-stopped
|
||||
user: "${UID:-1000}:${GID:-1000}"
|
||||
environment:
|
||||
- HF_TOKEN=${HF_TOKEN}
|
||||
- WANDB_API_KEY=${WANDB_API_KEY}
|
||||
- GITHUB_TOKEN=${GITHUB_TOKEN}
|
||||
- CLOUDSDK_CONFIG=/.config/gcloud
|
||||
volumes:
|
||||
- ~/.config/gcloud:/.config/gcloud:rw
|
||||
|
||||
tensorboard-rl:
|
||||
image: tensorflow/tensorflow:latest
|
||||
container_name: "PHANTOM-tensorboard-rl"
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
from sys import platform
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import numpy as np
|
||||
from .lib.demand import generate_demand_for_actor, estimate_demand
|
||||
from .lib.behavior import get_adjusted_transitions, sample_behavior_from_transitions
|
||||
@@ -8,9 +7,6 @@ from logging import INFO, getLogger
|
||||
logger = getLogger(__name__)
|
||||
logger.setLevel(INFO)
|
||||
|
||||
# shared pool; reused across act() calls to avoid per-call thread-spawn overhead
|
||||
_pool = ThreadPoolExecutor(max_workers=4)
|
||||
|
||||
|
||||
class MarketEngine:
|
||||
"""implements separate demand distributions for humans and agents per Section 3.1.1"""
|
||||
@@ -54,16 +50,14 @@ class MarketEngine:
|
||||
agent_transitions = get_adjusted_transitions(demand_a, human=False)
|
||||
# sample N trajectories in parallel; each chain is independent so threads
|
||||
# do not share state and numpy's per-call RNG is thread-safe
|
||||
h_futs = [
|
||||
_pool.submit(sample_behavior_from_transitions, human_transitions)
|
||||
human_t = [
|
||||
sample_behavior_from_transitions(human_transitions)
|
||||
for _ in range(self.Nhumans)
|
||||
]
|
||||
a_futs = [
|
||||
_pool.submit(sample_behavior_from_transitions, agent_transitions)
|
||||
agent_t = [
|
||||
sample_behavior_from_transitions(agent_transitions)
|
||||
for _ in range(self.Nagents)
|
||||
]
|
||||
human_t = [f.result() for f in h_futs]
|
||||
agent_t = [f.result() for f in a_futs]
|
||||
# store trajectories for agent probability calculation
|
||||
self.last_trajectories = human_t + agent_t
|
||||
return estimate_demand(self.last_trajectories, self.action_weights)
|
||||
|
||||
@@ -143,6 +143,11 @@ def get_adjusted_transitions(condition, human=True) -> _TransitionTable:
|
||||
cache_key = (human, tuple(np.round(condition, 4).tolist()))
|
||||
if cache_key in _transition_cache:
|
||||
return _transition_cache[cache_key]
|
||||
|
||||
# prevent OOM by capping cache size
|
||||
if len(_transition_cache) > 100:
|
||||
_transition_cache.clear()
|
||||
|
||||
base_pivot = _get_base_pivot(human)
|
||||
df = adjust_behavior_to_condition(condition, base_pivot)
|
||||
table = _TransitionTable(df)
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
All hardcoded paths should reference this module
|
||||
Paths can be overridden via environment variables
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
@@ -9,24 +10,34 @@ from pathlib import Path
|
||||
PROJECT_ROOT = Path(__file__).parent.parent.resolve()
|
||||
|
||||
# data directories
|
||||
DATA_DIR = Path(os.getenv('PHANTOM_DATA_DIR', PROJECT_ROOT / 'data'))
|
||||
EXPERIMENTS_DIR = Path(os.getenv('PHANTOM_EXPERIMENTS_DIR', PROJECT_ROOT / 'experiments'))
|
||||
DATA_DIR = Path(os.getenv("PHANTOM_DATA_DIR", PROJECT_ROOT / "data"))
|
||||
EXPERIMENTS_DIR = Path(
|
||||
os.getenv("PHANTOM_EXPERIMENTS_DIR", PROJECT_ROOT / "experiments")
|
||||
)
|
||||
|
||||
# agent/human interaction data
|
||||
AGENT_DATA_DIR = Path(os.getenv('PHANTOM_AGENT_DATA_DIR', DATA_DIR / 'agents'))
|
||||
HUMAN_DATA_DIR = Path(os.getenv('PHANTOM_HUMAN_DATA_DIR', DATA_DIR / 'humans'))
|
||||
AGENT_DATA_DIR = Path(os.getenv("PHANTOM_AGENT_DATA_DIR", DATA_DIR / "agents"))
|
||||
HUMAN_DATA_DIR = Path(os.getenv("PHANTOM_HUMAN_DATA_DIR", DATA_DIR / "humans"))
|
||||
|
||||
# RL simulation runs
|
||||
SIM_RUNS_DIR = Path(os.getenv('PHANTOM_SIM_RUNS_DIR', PROJECT_ROOT / 'sim' / 'rl' / 'runs'))
|
||||
SIM_RUNS_DIR = Path(
|
||||
os.getenv("PHANTOM_SIM_RUNS_DIR", PROJECT_ROOT / "sim" / "rl" / "runs")
|
||||
)
|
||||
|
||||
# model artifacts
|
||||
MODEL_REGISTRY_DIR = Path(os.getenv('PHANTOM_MODEL_REGISTRY_DIR', DATA_DIR / 'models'))
|
||||
MODEL_REGISTRY_DIR = Path(os.getenv("PHANTOM_MODEL_REGISTRY_DIR", DATA_DIR / "models"))
|
||||
|
||||
# collected experiment data
|
||||
COLLECTED_DATA_DIR = Path(os.getenv('PHANTOM_COLLECTED_DATA_DIR', EXPERIMENTS_DIR / 'agents' / 'collected_data'))
|
||||
COLLECTED_DATA_DIR = Path(
|
||||
os.getenv(
|
||||
"PHANTOM_COLLECTED_DATA_DIR", EXPERIMENTS_DIR / "agents" / "collected_data"
|
||||
)
|
||||
)
|
||||
|
||||
# notebook outputs
|
||||
NOTEBOOK_OUTPUT_DIR = Path(os.getenv('PHANTOM_NOTEBOOK_OUTPUT_DIR', EXPERIMENTS_DIR / 'notebooks' / 'outputs'))
|
||||
NOTEBOOK_OUTPUT_DIR = Path(
|
||||
os.getenv("PHANTOM_NOTEBOOK_OUTPUT_DIR", EXPERIMENTS_DIR / "notebooks" / "outputs")
|
||||
)
|
||||
|
||||
|
||||
def ensure_dir(path: Path) -> Path:
|
||||
@@ -51,15 +62,18 @@ def get_sim_path(*parts: str) -> Path:
|
||||
|
||||
|
||||
# service configuration (from .env)
|
||||
KAFKA_HOST = os.getenv('KAFKA_HOST', 'localhost')
|
||||
KAFKA_PORT = os.getenv('KAFKA_PORT', '9092')
|
||||
KAFKA_HOST = os.getenv("KAFKA_HOST", "localhost")
|
||||
KAFKA_PORT = os.getenv("KAFKA_PORT", "9092")
|
||||
KAFKA_BROKER = f"{KAFKA_HOST}:{KAFKA_PORT}"
|
||||
|
||||
REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
|
||||
REDIS_PORT = int(os.getenv('REDIS_PORT', '6379'))
|
||||
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
|
||||
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
|
||||
|
||||
SUPABASE_URL = os.getenv('NEXT_PUBLIC_SUPABASE_URL', '')
|
||||
SUPABASE_ANON_KEY = os.getenv('NEXT_PUBLIC_SUPABASE_ANON_KEY', '')
|
||||
SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL", "")
|
||||
SUPABASE_ANON_KEY = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY", "")
|
||||
|
||||
BACKEND_PORT = int(os.getenv('BACKEND_PORT', '5000'))
|
||||
PROVIDER_PORT = int(os.getenv('PROVIDER_PORT', '5001'))
|
||||
BACKEND_PORT = int(os.getenv("BACKEND_PORT", "5000"))
|
||||
PROVIDER_PORT = int(os.getenv("PROVIDER_PORT", "5001"))
|
||||
|
||||
# huggingface dataset repo for collected behavioral data
|
||||
HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "velocitatem/phantom-collected-data")
|
||||
|
||||
@@ -13,3 +13,4 @@ scikit-learn
|
||||
supabase
|
||||
pymc
|
||||
wandb
|
||||
huggingface_hub
|
||||
|
||||
Reference in New Issue
Block a user