cleaning manim and improving training setup

2026-07-15 17:43:36 +00:00 · 2026-03-12 00:22:46 +01:00
parent d748733231
commit 22e50aac4a
7 changed files with 94 additions and 1688 deletions
--- a/engine/benchmark.py
+++ b/engine/benchmark.py
@@ -1,12 +1,32 @@
 from __future__ import annotations

+import os
+import subprocess
+import sys
+
 import argparse
 import json
 import logging
-import os
 from datetime import datetime, UTC
 from pathlib import Path

+# Clear stale TPU locks on startup; only attempted when /dev/accel0 exists.
+if os.path.exists("/dev/accel0"):
+    try:
+        subprocess.run(
+            ["rm", "-f", "/tmp/.libtpu_lockfile", "/tmp/libtpu_lockfile"],
+            stderr=subprocess.DEVNULL,
+        )
+    except OSError:  # e.g. `rm` not launchable; a bare except would also trap Ctrl-C
+        pass  # best-effort cleanup: a failed removal must not block startup
+
+try:
+    import jax
+
+    jax.config.update("jax_threefry_partitionable", True)
+except ImportError:
+    pass
+
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
--- a/engine/jax/robust.py
+++ b/engine/jax/robust.py
@@ -28,6 +28,8 @@ try:
 except ImportError:
    _JAX_OK = False

+_JAX_RUNTIME_OK = True
+

 def _demand_for_actor_jax(prices, mean, std, noise_std, key):
    """d(p;theta) = max(0, val - price + noise), normalized to sum 100."""
@@ -104,7 +106,9 @@ def select_adversarial_alpha_jax(
    falls back to a pure-numpy sequential loop when JAX is unavailable so the
    wrapper can call this function unconditionally.
    """
-    if not _JAX_OK:
+    global _JAX_RUNTIME_OK
+
+    if not _JAX_OK or not _JAX_RUNTIME_OK:
        return _fallback(
            candidates,
            prices,
@@ -117,28 +121,45 @@ def select_adversarial_alpha_jax(
            reward_profit_weight,
        )

-    k = len(candidates)
-    key = jax.random.PRNGKey(rng_seed)
-    keys = jax.random.split(key, k)
+    try:
+        k = len(candidates)
+        key = jax.random.PRNGKey(rng_seed)
+        keys = jax.random.split(key, k)

-    rewards = np.asarray(
-        _reward_batched(
-            jnp.asarray(candidates, dtype=jnp.float32),
-            jnp.asarray(prices, dtype=jnp.float32),
-            float(human_params[0]),
-            float(human_params[1]),
-            float(agent_params[0]),
-            float(agent_params[1]),
-            float(noise_std),
-            jnp.asarray(baseline_prices, dtype=jnp.float32),
-            float(lambda_coi),
-            float(info_value),
-            float(reward_profit_weight),
-            keys,
+        rewards = np.asarray(
+            _reward_batched(
+                jnp.asarray(candidates, dtype=jnp.float32),
+                jnp.asarray(prices, dtype=jnp.float32),
+                float(human_params[0]),
+                float(human_params[1]),
+                float(agent_params[0]),
+                float(agent_params[1]),
+                float(noise_std),
+                jnp.asarray(baseline_prices, dtype=jnp.float32),
+                float(lambda_coi),
+                float(info_value),
+                float(reward_profit_weight),
+                keys,
+            )
+        )
+        best_idx = int(np.argmin(rewards))
+        return float(candidates[best_idx]), rewards
+    except Exception as exc:
+        # TPU contention / backend init failures can happen in distributed schedulers.
+        # Degrade to numpy path for the remainder of the process.
+        _JAX_RUNTIME_OK = False
+        print(f"PHANTOM_JAX_FALLBACK: {exc}")
+        return _fallback(
+            candidates,
+            prices,
+            human_params,
+            agent_params,
+            noise_std,
+            baseline_prices,
+            lambda_coi,
+            info_value,
+            reward_profit_weight,
        )
-    )
-    best_idx = int(np.argmin(rewards))
-    return float(candidates[best_idx]), rewards


 def _fallback(
--- a/engine/train.py
+++ b/engine/train.py
@@ -179,8 +179,29 @@ def _overrides_from_args(args: argparse.Namespace) -> dict[str, Any]:


 def main(argv: list[str] | None = None) -> None:
+    import subprocess
    import sys

+    # Ensure data is downloaded
+    from pathlib import Path
+
+    project_root = Path(__file__).parents[1]
+    data_dir = project_root / "experiments" / "collected_data"
+    needs_pull = (not data_dir.exists()) or (not any(data_dir.iterdir()))
+    if needs_pull:
+        try:
+            subprocess.run(["make", "data.pull"], cwd=str(project_root), check=True)
+        except (subprocess.SubprocessError, OSError) as exc:
+            sys.path.insert(0, str(project_root))
+            try:
+                from scripts.hf_data import pull
+
+                pull()
+            except (ImportError, OSError, RuntimeError, ValueError) as fallback_exc:
+                print(
+                    f"Warning: data.pull failed ({exc}); fallback pull failed ({fallback_exc})"
+                )
+
    configure_logging()
    raw_args = list(sys.argv[1:] if argv is None else argv)
    run_kind = _probe_run_kind(raw_args)