mirror of
https://github.com/velocitatem/PHANTOM.git
synced 2026-05-31 16:43:36 +00:00
feat: tpu orchestrator
This commit is contained in:
8
tpu_orchestration/configs/test_vm.conf
Normal file
8
tpu_orchestration/configs/test_vm.conf
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
ZONE="us-central2-b"
|
||||||
|
QR_NAME="v4-test-vm"
|
||||||
|
ACCEL_TYPE="v4-8"
|
||||||
|
RUNTIME_VERSION="v2-alpha-tpuv4"
|
||||||
|
IS_SPOT="true"
|
||||||
|
RUN_ID="phantom_v4_test_1"
|
||||||
|
HF_REPO="velocitatem/capstone"
|
||||||
|
TRAIN_CMD="python -m engine.train --sweep-agent --sweep-id lusiana/capstone/oasdorof"
|
||||||
8
tpu_orchestration/configs/v4_od_us.conf
Normal file
8
tpu_orchestration/configs/v4_od_us.conf
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
ZONE="us-central2-b"
|
||||||
|
QR_NAME="v4-32-us-ondemand"
|
||||||
|
ACCEL_TYPE="v4-32"
|
||||||
|
RUNTIME_VERSION="v2-alpha-tpuv4"
|
||||||
|
IS_SPOT="false"
|
||||||
|
RUN_ID="phantom_v4_od_1"
|
||||||
|
HF_REPO="velocitatem/capstone"
|
||||||
|
TRAIN_CMD="python -m engine.train --sweep-agent --sweep-id lusiana/capstone/oasdorof"
|
||||||
8
tpu_orchestration/configs/v4_spot_us.conf
Normal file
8
tpu_orchestration/configs/v4_spot_us.conf
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
ZONE="us-central2-b"
|
||||||
|
QR_NAME="v4-32-us-spot"
|
||||||
|
ACCEL_TYPE="v4-32"
|
||||||
|
RUNTIME_VERSION="v2-alpha-tpuv4"
|
||||||
|
IS_SPOT="true"
|
||||||
|
RUN_ID="phantom_v4_spot_1"
|
||||||
|
HF_REPO="velocitatem/capstone"
|
||||||
|
TRAIN_CMD="python -m engine.train --sweep-agent --sweep-id lusiana/capstone/oasdorof"
|
||||||
8
tpu_orchestration/configs/v5e_eu.conf
Normal file
8
tpu_orchestration/configs/v5e_eu.conf
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
ZONE="europe-west4-b"
|
||||||
|
QR_NAME="v5e-64-eu-spot"
|
||||||
|
ACCEL_TYPE="v5litepod-64"
|
||||||
|
RUNTIME_VERSION="v2-alpha-tpuv5-lite"
|
||||||
|
IS_SPOT="true"
|
||||||
|
RUN_ID="phantom_v5e_eu_1"
|
||||||
|
HF_REPO="velocitatem/capstone"
|
||||||
|
TRAIN_CMD="python -m engine.train --sweep-agent --sweep-id lusiana/capstone/oasdorof"
|
||||||
8
tpu_orchestration/configs/v5e_us.conf
Normal file
8
tpu_orchestration/configs/v5e_us.conf
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
ZONE="us-central1-a"
|
||||||
|
QR_NAME="v5e-64-us-spot"
|
||||||
|
ACCEL_TYPE="v5litepod-64"
|
||||||
|
RUNTIME_VERSION="v2-alpha-tpuv5-lite"
|
||||||
|
IS_SPOT="true"
|
||||||
|
RUN_ID="phantom_v5e_us_1"
|
||||||
|
HF_REPO="velocitatem/capstone"
|
||||||
|
TRAIN_CMD="python -m engine.train --sweep-agent --sweep-id lusiana/capstone/oasdorof"
|
||||||
8
tpu_orchestration/configs/v6e_eu.conf
Normal file
8
tpu_orchestration/configs/v6e_eu.conf
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
ZONE="europe-west4-a"
|
||||||
|
QR_NAME="v6e-64-eu-spot"
|
||||||
|
ACCEL_TYPE="v6e-64"
|
||||||
|
RUNTIME_VERSION="v2-alpha-tpuv6e"
|
||||||
|
IS_SPOT="true"
|
||||||
|
RUN_ID="phantom_v6e_eu_1"
|
||||||
|
HF_REPO="velocitatem/capstone"
|
||||||
|
TRAIN_CMD="python -m engine.train --sweep-agent --sweep-id lusiana/capstone/oasdorof"
|
||||||
8
tpu_orchestration/configs/v6e_us.conf
Normal file
8
tpu_orchestration/configs/v6e_us.conf
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
ZONE="us-east1-d"
|
||||||
|
QR_NAME="v6e-64-us-spot"
|
||||||
|
ACCEL_TYPE="v6e-64"
|
||||||
|
RUNTIME_VERSION="v2-alpha-tpuv6e"
|
||||||
|
IS_SPOT="true"
|
||||||
|
RUN_ID="phantom_v6e_us_1"
|
||||||
|
HF_REPO="velocitatem/capstone"
|
||||||
|
TRAIN_CMD="python -m engine.train --sweep-agent --sweep-id lusiana/capstone/oasdorof"
|
||||||
117
tpu_orchestration/tpu_startup.sh
Normal file
117
tpu_orchestration/tpu_startup.sh
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Idempotent startup script for TPU VMs using HF Buckets
|
||||||
|
|
||||||
|
exec > >(tee -a /var/log/tpu_startup.log) 2>&1
|
||||||
|
echo "Starting TPU setup..."
|
||||||
|
|
||||||
|
# 1. Fetch metadata from GCP
|
||||||
|
#######################################
# Read one custom attribute of this instance from the GCP metadata server.
# Arguments: $1 - attribute name (e.g. RUN_ID)
# Outputs:   the attribute value on stdout; nothing when the request fails
# Returns:   curl's exit status (non-zero when the attribute cannot be read)
#######################################
get_metadata() {
  # -f makes curl fail on an HTTP error instead of printing the server's error
  # body. Without it a missing attribute (e.g. an optional TRAIN_CMD) would
  # come back as a non-empty error page and be treated as a real value.
  curl -sf -H "Metadata-Flavor: Google" \
    "http://metadata.google.internal/computeMetadata/v1/instance/attributes/$1"
}
|
||||||
|
|
||||||
|
# Pull each per-run setting from the instance metadata attributes and export it
# under the same name, so child processes (the background sync loop and the
# training command) see it in their environment.
for attr in HF_TOKEN WANDB_API_KEY RUN_ID HF_REPO ACCEL_TYPE GITHUB_REPO BRANCH TRAIN_CMD; do
  export "${attr}=$(get_metadata "${attr}")"
done
unset attr

# The VM's hostname doubles as this worker's identifier in bucket paths.
WORKER_ID=$(hostname)
export WORKER_ID
|
||||||
|
|
||||||
|
# 2. Install dependencies
|
||||||
|
export DEBIAN_FRONTEND=noninteractive
|
||||||
|
apt-get update
|
||||||
|
apt-get install -y git tmux jq curl build-essential wget
|
||||||
|
|
||||||
|
# Install HF CLI
|
||||||
|
curl -LsSf https://hf.co/cli/install.sh | bash
|
||||||
|
|
||||||
|
# Install Miniconda to ensure modern Python (3.10+) on older TPU OS bases
|
||||||
|
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
|
||||||
|
bash /tmp/miniconda.sh -b -p /opt/conda
|
||||||
|
rm /tmp/miniconda.sh
|
||||||
|
export PATH="/opt/conda/bin:$PATH"
|
||||||
|
|
||||||
|
# Create and activate conda environment
|
||||||
|
conda create -n phantom python=3.11 -y
|
||||||
|
source /opt/conda/bin/activate phantom
|
||||||
|
|
||||||
|
# Install Python ML dependencies
|
||||||
|
pip install --upgrade pip
|
||||||
|
pip install "jax[tpu]" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
|
||||||
|
pip install wandb orbax-checkpoint huggingface_hub
|
||||||
|
|
||||||
|
# 3. Setup directories
|
||||||
|
mkdir -p /app/data
|
||||||
|
mkdir -p /app/checkpoints
|
||||||
|
mkdir -p /app/logs
|
||||||
|
mkdir -p /app/xla_cache/$ACCEL_TYPE
|
||||||
|
|
||||||
|
export JAX_COMPILATION_CACHE_DIR="/app/xla_cache/${ACCEL_TYPE}"
|
||||||
|
|
||||||
|
# 4. Clone repository
|
||||||
|
# Always start from a fresh checkout; a leftover tree from a previous boot may
# be on a different commit.
rm -rf /app/model

# Only pass --branch when one was provided. An empty, unquoted $BRANCH would
# make git take the repository URL as the branch name.
clone_args=()
if [ -n "$BRANCH" ]; then
  clone_args+=(--branch "$BRANCH")
fi

# Do not echo $GITHUB_REPO in error messages: it may embed an access token.
if ! git clone "${clone_args[@]}" -- "$GITHUB_REPO" /app/model; then
  echo "Error: git clone failed (branch: ${BRANCH:-<default>})." >&2
  exit 1
fi

# Everything below (dependency install, training) assumes the repo root as cwd.
if ! cd /app/model; then
  echo "Error: cannot enter /app/model after clone." >&2
  exit 1
fi
|
||||||
|
|
||||||
|
# Install project-specific dependencies if available
|
||||||
|
if [ -f "requirements.txt" ]; then
|
||||||
|
pip install -r requirements.txt
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 5. Restore state from Hugging Face Buckets
|
||||||
|
echo "Restoring state from hf://buckets/$HF_REPO..."
|
||||||
|
# Download base data (shared across all)
|
||||||
|
hf buckets sync hf://buckets/$HF_REPO/data/base /app/data || echo "No base data found or failed to sync."
|
||||||
|
|
||||||
|
# Download worker-specific checkpoints and logs
|
||||||
|
hf buckets sync hf://buckets/$HF_REPO/runs/$RUN_ID/checkpoints/$WORKER_ID /app/checkpoints || echo "No checkpoint found."
|
||||||
|
hf buckets sync hf://buckets/$HF_REPO/runs/$RUN_ID/logs/$WORKER_ID /app/logs || echo "No logs found."
|
||||||
|
|
||||||
|
# Download architecture-specific XLA cache
|
||||||
|
hf buckets sync hf://buckets/$HF_REPO/runs/$RUN_ID/xla/$ACCEL_TYPE /app/xla_cache/$ACCEL_TYPE || echo "No XLA cache found."
|
||||||
|
|
||||||
|
# 6. Start Background Sync Loop
|
||||||
|
cat << 'EOF' > /app/sync_loop.sh
|
||||||
|
#!/bin/bash
|
||||||
|
while true; do
|
||||||
|
sleep 120
|
||||||
|
echo "[$(date)] Background sync to HF Bucket..."
|
||||||
|
hf buckets sync /app/checkpoints hf://buckets/$HF_REPO/runs/$RUN_ID/checkpoints/$WORKER_ID --quiet || true
|
||||||
|
hf buckets sync /app/logs hf://buckets/$HF_REPO/runs/$RUN_ID/logs/$WORKER_ID --quiet || true
|
||||||
|
hf buckets sync /app/xla_cache/$ACCEL_TYPE hf://buckets/$HF_REPO/runs/$RUN_ID/xla/$ACCEL_TYPE --quiet || true
|
||||||
|
done
|
||||||
|
EOF
|
||||||
|
chmod +x /app/sync_loop.sh
|
||||||
|
/app/sync_loop.sh &
|
||||||
|
SYNC_PID=$!
|
||||||
|
|
||||||
|
# 7. Execute Training
|
||||||
|
echo "Starting training with command: $TRAIN_CMD"
|
||||||
|
# Ensure we are in the correct directory and environment
|
||||||
|
cd /app/model
|
||||||
|
export PYTHONPATH="/app/model:$PYTHONPATH"
|
||||||
|
|
||||||
|
if [ -n "$TRAIN_CMD" ]; then
|
||||||
|
eval "$TRAIN_CMD"
|
||||||
|
EXIT_CODE=$?
|
||||||
|
else
|
||||||
|
echo "No TRAIN_CMD provided. Sleeping for testing purposes..."
|
||||||
|
# For testing: run a dummy process so the VM doesn't just idle immediately
|
||||||
|
sleep 3600
|
||||||
|
EXIT_CODE=0
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 8. Cleanup and Final Sync
|
||||||
|
echo "Training finished with exit code $EXIT_CODE. Stopping sync loop and performing final sync..."
|
||||||
|
kill $SYNC_PID
|
||||||
|
|
||||||
|
hf buckets sync /app/checkpoints hf://buckets/$HF_REPO/runs/$RUN_ID/checkpoints/$WORKER_ID
|
||||||
|
hf buckets sync /app/logs hf://buckets/$HF_REPO/runs/$RUN_ID/logs/$WORKER_ID
|
||||||
|
hf buckets sync /app/xla_cache/$ACCEL_TYPE hf://buckets/$HF_REPO/runs/$RUN_ID/xla/$ACCEL_TYPE
|
||||||
|
|
||||||
|
exit $EXIT_CODE
|
||||||
127
tpu_orchestration/watchdog.sh
Executable file
127
tpu_orchestration/watchdog.sh
Executable file
@@ -0,0 +1,127 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Watchdog loop to ensure TPUs are re-queued when preempted
|
||||||
|
|
||||||
|
if [ "$#" -ne 1 ]; then
|
||||||
|
echo "Usage: $0 <config_file>"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
CONFIG_FILE=$1
|
||||||
|
if [ ! -f "$CONFIG_FILE" ]; then
|
||||||
|
echo "Config file $CONFIG_FILE not found."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Load config
|
||||||
|
source "$CONFIG_FILE"
|
||||||
|
|
||||||
|
# Make sure HF_TOKEN is available
|
||||||
|
if [ -z "$HF_TOKEN" ]; then
|
||||||
|
echo "Error: HF_TOKEN environment variable must be set before running watchdog."
|
||||||
|
echo "export HF_TOKEN=..."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Make sure WANDB_API_KEY is available
|
||||||
|
if [ -z "$WANDB_API_KEY" ]; then
|
||||||
|
echo "Warning: WANDB_API_KEY environment variable is not set. Wandb logging may fail."
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Make sure GITHUB_REPO is set in config or env
|
||||||
|
if [ -z "$GITHUB_REPO" ]; then
|
||||||
|
GITHUB_REPO="https://github.com/velocitatem/PHANTOM.git"
|
||||||
|
if [ -n "$GITHUB_TOKEN" ]; then
|
||||||
|
GITHUB_REPO="https://velocitatem:${GITHUB_TOKEN}@github.com/velocitatem/PHANTOM.git"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Make sure BRANCH is set in config or env
|
||||||
|
if [ -z "$BRANCH" ]; then
|
||||||
|
BRANCH="main"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Ensure PROJECT_ID is set
|
||||||
|
if [ -z "$PROJECT_ID" ]; then
|
||||||
|
PROJECT_ID=$(gcloud config get-value project 2>/dev/null)
|
||||||
|
if [ -z "$PROJECT_ID" ]; then
|
||||||
|
PROJECT_ID="phantom-trc" # Fallback to the known project ID
|
||||||
|
echo "Warning: PROJECT_ID not set and gcloud not configured. Defaulting to $PROJECT_ID"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Starting watchdog for $QR_NAME in $ZONE (Project: $PROJECT_ID)"
|
||||||
|
echo "Accelerator: $ACCEL_TYPE"
|
||||||
|
echo "Run ID: $RUN_ID"
|
||||||
|
|
||||||
|
# Backoff tracking for IP quota errors
|
||||||
|
RETRY_DELAY=60
|
||||||
|
MAX_RETRY_DELAY=300
|
||||||
|
|
||||||
|
while true; do
|
||||||
|
STATE=$(gcloud compute tpus queued-resources describe $QR_NAME --zone=$ZONE --project=$PROJECT_ID --format="value(state)" 2>/dev/null)
|
||||||
|
|
||||||
|
if [ -z "$STATE" ] || [[ "$STATE" == *"SUSPENDED"* ]] || [[ "$STATE" == *"FAILED"* ]]; then
|
||||||
|
echo "[$(date)] Cluster '${STATE:-MISSING}' - cleaning IPs and re-queuing..."
|
||||||
|
|
||||||
|
# Clean all orphaned RESERVED IPs in parallel to free quota
|
||||||
|
# Feed the address list to the loop through process substitution, not a pipe.
# A piped `while` runs in a subshell, so the deletes it backgrounds are not
# children of this shell and the `wait` below would return immediately.
while IFS=$'\t' read -r addr_name addr_region; do
  if [ -z "$addr_name" ] || [ -z "$addr_region" ]; then
    continue
  fi
  gcloud compute addresses delete "$addr_name" --region="$addr_region" \
    --project="$PROJECT_ID" --quiet 2>/dev/null &
done < <(
  gcloud compute addresses list --project="$PROJECT_ID" \
    --filter="status=RESERVED AND name~'^tpu-.*'" \
    --format="value(name,region)" 2>/dev/null
)
# Block until every background delete has finished, so the addresses are
# released before the queued resource is re-created further down.
wait
|
||||||
|
|
||||||
|
# Delete QR and any orphaned VM
|
||||||
|
gcloud compute tpus queued-resources delete $QR_NAME --zone=$ZONE --project=$PROJECT_ID --quiet --force 2>/dev/null
|
||||||
|
VM_STATE=$(gcloud compute tpus tpu-vm describe $QR_NAME --zone=$ZONE --project=$PROJECT_ID --format="value(state)" 2>/dev/null)
|
||||||
|
[ -n "$VM_STATE" ] && gcloud compute tpus tpu-vm delete $QR_NAME --zone=$ZONE --project=$PROJECT_ID --quiet 2>/dev/null
|
||||||
|
|
||||||
|
sleep 5
|
||||||
|
|
||||||
|
# Create new QR
|
||||||
|
SPOT_FLAG=""
|
||||||
|
if [ "$IS_SPOT" = "true" ]; then
|
||||||
|
SPOT_FLAG="--spot"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Prepare metadata
|
||||||
|
METADATA="HF_TOKEN=$HF_TOKEN,RUN_ID=$RUN_ID,HF_REPO=$HF_REPO,ACCEL_TYPE=$ACCEL_TYPE,GITHUB_REPO=$GITHUB_REPO,BRANCH=$BRANCH"
|
||||||
|
if [ -n "$WANDB_API_KEY" ]; then
|
||||||
|
METADATA="$METADATA,WANDB_API_KEY=$WANDB_API_KEY"
|
||||||
|
fi
|
||||||
|
if [ -n "$TRAIN_CMD" ]; then
|
||||||
|
METADATA="$METADATA,TRAIN_CMD=$TRAIN_CMD"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Determine runtime version
|
||||||
|
RT_VERSION=${RUNTIME_VERSION:-"v2-alpha-tpuv4"}
|
||||||
|
|
||||||
|
# Submit the queued-resource request. Output is shown live and also kept in a
# log file so the failure reason can be inspected below.
CREATE_LOG="/tmp/tpu_create_${QR_NAME}.log"
# $SPOT_FLAG is intentionally unquoted: when empty it must expand to nothing.
# shellcheck disable=SC2086
gcloud compute tpus queued-resources create "$QR_NAME" \
  --project="$PROJECT_ID" \
  --node-id="$QR_NAME" \
  --zone="$ZONE" \
  --accelerator-type="$ACCEL_TYPE" \
  --runtime-version="$RT_VERSION" \
  $SPOT_FLAG \
  --metadata-from-file startup-script="$(dirname "$0")/tpu_startup.sh" \
  --metadata "$METADATA" 2>&1 | tee "$CREATE_LOG"
# After a pipeline, $? is tee's status (practically always 0). gcloud's own
# status is the first PIPESTATUS entry and must be read right away.
CREATE_STATUS=${PIPESTATUS[0]}

if [ "$CREATE_STATUS" -eq 0 ]; then
  echo "[$(date)] Successfully queued $QR_NAME."
  RETRY_DELAY=60
elif grep -q "IN_USE_ADDRESSES" "$CREATE_LOG" 2>/dev/null; then
  # Address quota exhausted: wait, double the delay (capped), and retry the
  # loop immediately instead of adding the regular 60s poll sleep on top.
  echo "[$(date)] IP quota hit - backing off ${RETRY_DELAY}s"
  sleep "$RETRY_DELAY"
  RETRY_DELAY=$((RETRY_DELAY * 2))
  if [ "$RETRY_DELAY" -gt "$MAX_RETRY_DELAY" ]; then
    RETRY_DELAY=$MAX_RETRY_DELAY
  fi
  continue
else
  echo "[$(date)] Failed to queue $QR_NAME."
  RETRY_DELAY=60
fi
|
||||||
|
else
|
||||||
|
echo "[$(date)] Cluster state is $STATE. Checking again in 60s..."
|
||||||
|
fi
|
||||||
|
sleep 60
|
||||||
|
done
|
||||||
Reference in New Issue
Block a user