diff --git a/tpu_orchestration/configs/test_vm.conf b/tpu_orchestration/configs/test_vm.conf
new file mode 100644
index 0000000..6c154ed
--- /dev/null
+++ b/tpu_orchestration/configs/test_vm.conf
@@ -0,0 +1,8 @@
+ZONE="us-central2-b"
+QR_NAME="v4-test-vm"
+ACCEL_TYPE="v4-8"
+RUNTIME_VERSION="v2-alpha-tpuv4"
+IS_SPOT="true"
+RUN_ID="phantom_v4_test_1"
+HF_REPO="velocitatem/capstone"
+TRAIN_CMD="python -m engine.train --sweep-agent --sweep-id lusiana/capstone/oasdorof"
diff --git a/tpu_orchestration/configs/v4_od_us.conf b/tpu_orchestration/configs/v4_od_us.conf
new file mode 100644
index 0000000..8739861
--- /dev/null
+++ b/tpu_orchestration/configs/v4_od_us.conf
@@ -0,0 +1,8 @@
+ZONE="us-central2-b"
+QR_NAME="v4-32-us-ondemand"
+ACCEL_TYPE="v4-32"
+RUNTIME_VERSION="v2-alpha-tpuv4"
+IS_SPOT="false"
+RUN_ID="phantom_v4_od_1"
+HF_REPO="velocitatem/capstone"
+TRAIN_CMD="python -m engine.train --sweep-agent --sweep-id lusiana/capstone/oasdorof"
\ No newline at end of file
diff --git a/tpu_orchestration/configs/v4_spot_us.conf b/tpu_orchestration/configs/v4_spot_us.conf
new file mode 100644
index 0000000..d0f9755
--- /dev/null
+++ b/tpu_orchestration/configs/v4_spot_us.conf
@@ -0,0 +1,8 @@
+ZONE="us-central2-b"
+QR_NAME="v4-32-us-spot"
+ACCEL_TYPE="v4-32"
+RUNTIME_VERSION="v2-alpha-tpuv4"
+IS_SPOT="true"
+RUN_ID="phantom_v4_spot_1"
+HF_REPO="velocitatem/capstone"
+TRAIN_CMD="python -m engine.train --sweep-agent --sweep-id lusiana/capstone/oasdorof"
\ No newline at end of file
diff --git a/tpu_orchestration/configs/v5e_eu.conf b/tpu_orchestration/configs/v5e_eu.conf
new file mode 100644
index 0000000..c40ee21
--- /dev/null
+++ b/tpu_orchestration/configs/v5e_eu.conf
@@ -0,0 +1,8 @@
+ZONE="europe-west4-b"
+QR_NAME="v5e-64-eu-spot"
+ACCEL_TYPE="v5litepod-64"
+RUNTIME_VERSION="v2-alpha-tpuv5-lite"
+IS_SPOT="true"
+RUN_ID="phantom_v5e_eu_1"
+HF_REPO="velocitatem/capstone"
+TRAIN_CMD="python -m engine.train --sweep-agent --sweep-id lusiana/capstone/oasdorof"
\ No newline at end of file
diff --git a/tpu_orchestration/configs/v5e_us.conf b/tpu_orchestration/configs/v5e_us.conf
new file mode 100644
index 0000000..0d44cd5
--- /dev/null
+++ b/tpu_orchestration/configs/v5e_us.conf
@@ -0,0 +1,8 @@
+ZONE="us-central1-a"
+QR_NAME="v5e-64-us-spot"
+ACCEL_TYPE="v5litepod-64"
+RUNTIME_VERSION="v2-alpha-tpuv5-lite"
+IS_SPOT="true"
+RUN_ID="phantom_v5e_us_1"
+HF_REPO="velocitatem/capstone"
+TRAIN_CMD="python -m engine.train --sweep-agent --sweep-id lusiana/capstone/oasdorof"
\ No newline at end of file
diff --git a/tpu_orchestration/configs/v6e_eu.conf b/tpu_orchestration/configs/v6e_eu.conf
new file mode 100644
index 0000000..5d29b8c
--- /dev/null
+++ b/tpu_orchestration/configs/v6e_eu.conf
@@ -0,0 +1,8 @@
+ZONE="europe-west4-a"
+QR_NAME="v6e-64-eu-spot"
+ACCEL_TYPE="v6e-64"
+RUNTIME_VERSION="v2-alpha-tpuv6e"
+IS_SPOT="true"
+RUN_ID="phantom_v6e_eu_1"
+HF_REPO="velocitatem/capstone"
+TRAIN_CMD="python -m engine.train --sweep-agent --sweep-id lusiana/capstone/oasdorof"
\ No newline at end of file
diff --git a/tpu_orchestration/configs/v6e_us.conf b/tpu_orchestration/configs/v6e_us.conf
new file mode 100644
index 0000000..f6cf423
--- /dev/null
+++ b/tpu_orchestration/configs/v6e_us.conf
@@ -0,0 +1,8 @@
+ZONE="us-east1-d"
+QR_NAME="v6e-64-us-spot"
+ACCEL_TYPE="v6e-64"
+RUNTIME_VERSION="v2-alpha-tpuv6e"
+IS_SPOT="true"
+RUN_ID="phantom_v6e_us_1"
+HF_REPO="velocitatem/capstone"
+TRAIN_CMD="python -m engine.train --sweep-agent --sweep-id lusiana/capstone/oasdorof"
\ No newline at end of file
diff --git a/tpu_orchestration/tpu_startup.sh b/tpu_orchestration/tpu_startup.sh
new file mode 100644
index 0000000..62b55a9
--- /dev/null
+++ b/tpu_orchestration/tpu_startup.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+# Startup script for TPU VMs using HF Buckets.
+# Installs the toolchain, restores run state from the bucket, runs TRAIN_CMD,
+# and syncs checkpoints, logs and the XLA cache back while and after it runs.
+
+# Mirror everything this script prints into a persistent log file.
+exec > >(tee -a /var/log/tpu_startup.log) 2>&1
+echo "Starting TPU setup..."
+
+# 1. Fetch metadata from GCP
+# Prints the value of instance attribute $1. With -f, curl prints nothing on an
+# HTTP error, so an attribute that was never set becomes an empty string rather
+# than the body of the error response.
+get_metadata() {
+    curl -sf -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/attributes/$1"
+}
+
+export HF_TOKEN=$(get_metadata "HF_TOKEN")
+export WANDB_API_KEY=$(get_metadata "WANDB_API_KEY")
+export RUN_ID=$(get_metadata "RUN_ID")
+export HF_REPO=$(get_metadata "HF_REPO")
+export ACCEL_TYPE=$(get_metadata "ACCEL_TYPE")
+export GITHUB_REPO=$(get_metadata "GITHUB_REPO")
+export BRANCH=$(get_metadata "BRANCH")
+export TRAIN_CMD=$(get_metadata "TRAIN_CMD")
+
+# The hostname keys this worker's checkpoint and log paths in the bucket.
+export WORKER_ID=$(hostname)
+
+# 2. Install dependencies
+export DEBIAN_FRONTEND=noninteractive
+apt-get update
+apt-get install -y git tmux jq curl build-essential wget
+
+# Install HF CLI
+curl -LsSf https://hf.co/cli/install.sh | bash
+
+# Install Miniconda to ensure modern Python (3.10+) on older TPU OS bases.
+# Skipped when /opt/conda is already populated: the installer refuses to write
+# into an existing prefix, which would make a re-run of this script fail here.
+if [ ! -x /opt/conda/bin/conda ]; then
+    wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
+    bash /tmp/miniconda.sh -b -p /opt/conda
+    rm /tmp/miniconda.sh
+fi
+export PATH="/opt/conda/bin:$PATH"
+
+# Create and activate conda environment
+conda create -n phantom python=3.11 -y
+source /opt/conda/bin/activate phantom
+
+# Install Python ML dependencies
+pip install --upgrade pip
+pip install "jax[tpu]" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
+pip install wandb orbax-checkpoint huggingface_hub
+
+# 3. Setup directories
+mkdir -p /app/data /app/checkpoints /app/logs "/app/xla_cache/$ACCEL_TYPE"
+
+export JAX_COMPILATION_CACHE_DIR="/app/xla_cache/${ACCEL_TYPE}"
+
+# 4. Clone repository (always from scratch so a re-run picks up the branch tip)
+rm -rf /app/model
+git clone --branch "$BRANCH" "$GITHUB_REPO" /app/model
+cd /app/model || { echo "Cannot enter /app/model (clone failed?). Aborting."; exit 1; }
+
+# Install project-specific dependencies if available
+if [ -f "requirements.txt" ]; then
+    pip install -r requirements.txt
+fi
+
+# 5. Restore state from Hugging Face Buckets
+# Each sync is best-effort: on a first run there is nothing to restore yet.
+echo "Restoring state from hf://buckets/$HF_REPO..."
+# Download base data (shared across all)
+hf buckets sync "hf://buckets/$HF_REPO/data/base" /app/data || echo "No base data found or failed to sync."
+
+# Download worker-specific checkpoints and logs
+hf buckets sync "hf://buckets/$HF_REPO/runs/$RUN_ID/checkpoints/$WORKER_ID" /app/checkpoints || echo "No checkpoint found."
+hf buckets sync "hf://buckets/$HF_REPO/runs/$RUN_ID/logs/$WORKER_ID" /app/logs || echo "No logs found."
+
+# Download architecture-specific XLA cache
+hf buckets sync "hf://buckets/$HF_REPO/runs/$RUN_ID/xla/$ACCEL_TYPE" "/app/xla_cache/$ACCEL_TYPE" || echo "No XLA cache found."
+
+# 6. Start Background Sync Loop
+# The delimiter is quoted, so the variables below are written literally and are
+# resolved when sync_loop.sh runs, from the values exported above.
+cat << 'EOF' > /app/sync_loop.sh
+#!/bin/bash
+while true; do
+    sleep 120
+    echo "[$(date)] Background sync to HF Bucket..."
+    hf buckets sync /app/checkpoints "hf://buckets/$HF_REPO/runs/$RUN_ID/checkpoints/$WORKER_ID" --quiet || true
+    hf buckets sync /app/logs "hf://buckets/$HF_REPO/runs/$RUN_ID/logs/$WORKER_ID" --quiet || true
+    hf buckets sync "/app/xla_cache/$ACCEL_TYPE" "hf://buckets/$HF_REPO/runs/$RUN_ID/xla/$ACCEL_TYPE" --quiet || true
+done
+EOF
+chmod +x /app/sync_loop.sh
+/app/sync_loop.sh &
+SYNC_PID=$!
+
+# 7. Execute Training
+echo "Starting training with command: $TRAIN_CMD"
+# Ensure we are in the correct directory and environment
+cd /app/model
+export PYTHONPATH="/app/model:$PYTHONPATH"
+
+if [ -n "$TRAIN_CMD" ]; then
+    # NOTE(review): TRAIN_CMD comes from instance metadata and is executed
+    # verbatim, so whoever can set metadata can run arbitrary commands here.
+    eval "$TRAIN_CMD"
+    EXIT_CODE=$?
+else
+    echo "No TRAIN_CMD provided. Sleeping for testing purposes..."
+    # For testing: run a dummy process so the VM doesn't just idle immediately
+    sleep 3600
+    EXIT_CODE=0
+fi
+
+# 8. Cleanup and Final Sync
+echo "Training finished with exit code $EXIT_CODE. Stopping sync loop and performing final sync..."
+kill "$SYNC_PID"
+
+hf buckets sync /app/checkpoints "hf://buckets/$HF_REPO/runs/$RUN_ID/checkpoints/$WORKER_ID"
+hf buckets sync /app/logs "hf://buckets/$HF_REPO/runs/$RUN_ID/logs/$WORKER_ID"
+hf buckets sync "/app/xla_cache/$ACCEL_TYPE" "hf://buckets/$HF_REPO/runs/$RUN_ID/xla/$ACCEL_TYPE"
+
+exit "$EXIT_CODE"
\ No newline at end of file
diff --git a/tpu_orchestration/watchdog.sh b/tpu_orchestration/watchdog.sh
new file mode 100755
index 0000000..2b103bc
--- /dev/null
+++ b/tpu_orchestration/watchdog.sh
@@ -0,0 +1,141 @@
+#!/bin/bash
+# Watchdog loop to ensure TPUs are re-queued when preempted.
+#
+# Usage: watchdog.sh <config_file>
+# The config file is sourced and is expected to define ZONE, QR_NAME,
+# ACCEL_TYPE, IS_SPOT, RUN_ID and HF_REPO, plus optionally RUNTIME_VERSION and
+# TRAIN_CMD. HF_TOKEN must be set; WANDB_API_KEY, GITHUB_REPO, GITHUB_TOKEN,
+# BRANCH and PROJECT_ID are optional.
+
+if [ "$#" -ne 1 ]; then
+    echo "Usage: $0 <config_file>"
+    exit 1
+fi
+
+CONFIG_FILE=$1
+if [ ! -f "$CONFIG_FILE" ]; then
+    echo "Config file $CONFIG_FILE not found."
+    exit 1
+fi
+
+# Load config
+source "$CONFIG_FILE"
+
+# Make sure HF_TOKEN is available
+if [ -z "$HF_TOKEN" ]; then
+    echo "Error: HF_TOKEN environment variable must be set before running watchdog."
+    echo "export HF_TOKEN=..."
+    exit 1
+fi
+
+# Make sure WANDB_API_KEY is available
+if [ -z "$WANDB_API_KEY" ]; then
+    echo "Warning: WANDB_API_KEY environment variable is not set. Wandb logging may fail."
+fi
+
+# Make sure GITHUB_REPO is set in config or env
+if [ -z "$GITHUB_REPO" ]; then
+    GITHUB_REPO="https://github.com/velocitatem/PHANTOM.git"
+    if [ -n "$GITHUB_TOKEN" ]; then
+        GITHUB_REPO="https://velocitatem:${GITHUB_TOKEN}@github.com/velocitatem/PHANTOM.git"
+    fi
+fi
+
+# Make sure BRANCH is set in config or env
+BRANCH=${BRANCH:-main}
+
+# Ensure PROJECT_ID is set
+if [ -z "$PROJECT_ID" ]; then
+    PROJECT_ID=$(gcloud config get-value project 2>/dev/null)
+    if [ -z "$PROJECT_ID" ]; then
+        PROJECT_ID="phantom-trc" # Fallback to the known project ID
+        echo "Warning: PROJECT_ID not set and gcloud not configured. Defaulting to $PROJECT_ID"
+    fi
+fi
+
+echo "Starting watchdog for $QR_NAME in $ZONE (Project: $PROJECT_ID)"
+echo "Accelerator: $ACCEL_TYPE"
+echo "Run ID: $RUN_ID"
+
+# Backoff tracking for IP quota errors
+RETRY_DELAY=60
+MAX_RETRY_DELAY=300
+
+CREATE_LOG="/tmp/tpu_create_${QR_NAME}.log"
+
+while true; do
+    STATE=$(gcloud compute tpus queued-resources describe "$QR_NAME" --zone="$ZONE" --project="$PROJECT_ID" --format="value(state)" 2>/dev/null)
+
+    # An empty state means the describe call printed nothing (no such QR, or the call failed).
+    if [ -z "$STATE" ] || [[ "$STATE" == *"SUSPENDED"* ]] || [[ "$STATE" == *"FAILED"* ]]; then
+        echo "[$(date)] Cluster '${STATE:-MISSING}' - cleaning IPs and re-queuing..."
+
+        # Clean all orphaned RESERVED IPs in parallel to free quota.
+        # The deletes are started and awaited inside one group: it runs as the
+        # pipeline's subshell, and a wait in the main shell would not see them.
+        gcloud compute addresses list --project="$PROJECT_ID" \
+            --filter="status=RESERVED AND name~'^tpu-.*'" \
+            --format="value(name,region)" 2>/dev/null | {
+            while IFS=$'\t' read -r n r; do
+                [ -n "$n" ] && [ -n "$r" ] && gcloud compute addresses delete "$n" --region="$r" --project="$PROJECT_ID" --quiet 2>/dev/null &
+            done
+            wait
+        }
+
+        # Delete QR and any orphaned VM
+        gcloud compute tpus queued-resources delete "$QR_NAME" --zone="$ZONE" --project="$PROJECT_ID" --quiet --force 2>/dev/null
+        VM_STATE=$(gcloud compute tpus tpu-vm describe "$QR_NAME" --zone="$ZONE" --project="$PROJECT_ID" --format="value(state)" 2>/dev/null)
+        [ -n "$VM_STATE" ] && gcloud compute tpus tpu-vm delete "$QR_NAME" --zone="$ZONE" --project="$PROJECT_ID" --quiet 2>/dev/null
+
+        sleep 5
+
+        # Create new QR
+        SPOT_FLAG=""
+        if [ "$IS_SPOT" = "true" ]; then
+            SPOT_FLAG="--spot"
+        fi
+
+        # Prepare metadata
+        # NOTE(review): the tokens end up on the gcloud command line below; confirm that is acceptable on this host.
+        METADATA="HF_TOKEN=$HF_TOKEN,RUN_ID=$RUN_ID,HF_REPO=$HF_REPO,ACCEL_TYPE=$ACCEL_TYPE,GITHUB_REPO=$GITHUB_REPO,BRANCH=$BRANCH"
+        if [ -n "$WANDB_API_KEY" ]; then
+            METADATA="$METADATA,WANDB_API_KEY=$WANDB_API_KEY"
+        fi
+        if [ -n "$TRAIN_CMD" ]; then
+            METADATA="$METADATA,TRAIN_CMD=$TRAIN_CMD"
+        fi
+
+        # Determine runtime version
+        RT_VERSION=${RUNTIME_VERSION:-"v2-alpha-tpuv4"}
+
+        # SPOT_FLAG is left unquoted on purpose so that an empty value adds no argument.
+        gcloud compute tpus queued-resources create "$QR_NAME" \
+            --project="$PROJECT_ID" \
+            --node-id="$QR_NAME" \
+            --zone="$ZONE" \
+            --accelerator-type="$ACCEL_TYPE" \
+            --runtime-version="$RT_VERSION" \
+            $SPOT_FLAG \
+            --metadata-from-file startup-script="$(dirname "$0")/tpu_startup.sh" \
+            --metadata "$METADATA" 2>&1 | tee "$CREATE_LOG"
+        # After a pipeline $? is tee's status; gcloud's is the first PIPESTATUS entry.
+        CREATE_STATUS=${PIPESTATUS[0]}
+
+        if [ "$CREATE_STATUS" -eq 0 ]; then
+            echo "[$(date)] Successfully queued $QR_NAME."
+            RETRY_DELAY=60
+        elif grep -q "IN_USE_ADDRESSES" "$CREATE_LOG" 2>/dev/null; then
+            echo "[$(date)] IP quota hit - backing off ${RETRY_DELAY}s"
+            sleep "$RETRY_DELAY"
+            RETRY_DELAY=$((RETRY_DELAY * 2))
+            [ "$RETRY_DELAY" -gt "$MAX_RETRY_DELAY" ] && RETRY_DELAY=$MAX_RETRY_DELAY
+            continue
+        else
+            echo "[$(date)] Failed to queue $QR_NAME."
+            RETRY_DELAY=60
+        fi
+    else
+        echo "[$(date)] Cluster state is $STATE. Checking again in 60s..."
+    fi
+    sleep 60
+done