From cc24ac72f786befb33ce2df57622a948ef7382d0 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Sun, 8 Mar 2026 13:53:31 +0100
Subject: [PATCH] changed to new test method for significance

---
 .gitignore                            |  3 ++
 Makefile                              | 49 +++++++++++++++++++----
 backend/server/requirements.txt       | 12 +++---
 paper/src/bib/references.bib          | 14 +++++++
 paper/src/chapters/03-methodology.tex |  8 +---
 paper/src/chapters/04-results.tex     | 21 +++++-----
 sim/rl/behavior_loader/loader.py      | 40 ++++++++++++++-----
 sim/rl/behavior_loader/models.py      | 56 +++++++++++++++++++++++++++
 8 files changed, 162 insertions(+), 41 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8ae7e83..5dc3efe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,9 @@ phantom.egg-info/
 .nextstep
 .ignore-gitlogue
 .cloudflare
+.nx/
+node_modules/
+dist/
 
 # generated svg/graphics
 **/session_*.svg
diff --git a/Makefile b/Makefile
index c9203a4..fe5baca 100644
--- a/Makefile
+++ b/Makefile
@@ -8,6 +8,7 @@ VENV      := .venv
 PYTHON    := $(VENV)/bin/python
 PIP       := $(VENV)/bin/pip
 PYTEST    := $(VENV)/bin/pytest
+NX        := npx nx
 
 SWEEP_ENV_FILE ?= .env.sweep
 
@@ -36,7 +37,7 @@ SWEEP_ENV_LOAD = set -a; [ -f "$(SWEEP_ENV_FILE)" ] && . "$(SWEEP_ENV_FILE)" ||
 .PHONY: help
 help:
 	@echo "pdf.build pdf.watch pdf.clean | test.backend test.e2e test.all | web.dev | install | train | train.agent | train.bootstrap | train.tpu.pod | train.tpu.vm | train.tpu.vm.sweep | stats.lines"
-	@echo "docker.train.publish"
+	@echo "backend.server backend.provider backend.worker | platform.up platform.down platform.logs | docker.train.publish"
 	@echo ""
 	@echo "Local wandb run:"
 	@echo "  make train LOCAL_TRAIN_ARGS='--algo ppo --total-timesteps 50000'"
@@ -208,11 +209,43 @@ train.tpu.vm.sweep:
 		--tpu-repo-dir "$(TPU_REPO_DIR)" \
 		$(if $(filter-out 0,$(AGENT_COUNT)),--count $(AGENT_COUNT),)
 
+.PHONY: backend.server backend.provider backend.worker platform.up platform.down platform.logs
+backend.server: # thin wrapper: runs the Nx target backend-server:dev through NX (npx nx)
+	@$(NX) run backend-server:dev
+
+backend.provider: # thin wrapper: runs the Nx target pricing-provider:dev
+	@$(NX) run pricing-provider:dev
+
+backend.worker: # thin wrapper: runs the Nx target backend-worker:dev
+	@$(NX) run backend-worker:dev
+
+platform.up: # thin wrapper: runs the Nx target platform:up
+	@$(NX) run platform:up
+
+platform.down: # thin wrapper: runs the Nx target platform:down
+	@$(NX) run platform:down
+
+platform.logs: # thin wrapper: runs the Nx target platform:logs
+	@$(NX) run platform:logs
+
 .PHONY: pdf clean watch run.webapp test count-lines all
-pdf: pdf.build
-clean: pdf.clean
-watch: pdf.watch
-run.webapp: web.dev
-test: test.backend
-count-lines: stats.lines
-all: pdf.build
+pdf: # runs the Nx target paper:build (no longer an alias of pdf.build)
+	@$(NX) run paper:build
+
+clean: # runs the Nx target paper:clean (no longer an alias of pdf.clean)
+	@$(NX) run paper:clean
+
+watch: # runs the Nx target paper:watch (no longer an alias of pdf.watch)
+	@$(NX) run paper:watch
+
+run.webapp: # runs the Nx target web:dev (no longer an alias of web.dev)
+	@$(NX) run web:dev
+
+test: # runs the Nx target research:test (no longer an alias of test.backend)
+	@$(NX) run research:test
+
+count-lines: # runs the Nx target research:stats (no longer an alias of stats.lines)
+	@$(NX) run research:stats
+
+all: # same command as pdf: runs the Nx target paper:build
+	@$(NX) run paper:build
diff --git a/backend/server/requirements.txt b/backend/server/requirements.txt
index 6a49ae4..432203e 100644
--- a/backend/server/requirements.txt
+++ b/backend/server/requirements.txt
@@ -1,6 +1,6 @@
-fastapi==0.104.1
-uvicorn[standard]==0.24.0
-kafka-python==2.0.2
-pydantic==2.5.0
-python-dotenv==1.0.0
-supabase==2.9.1
+fastapi>=0.135,<0.136
+uvicorn[standard]>=0.41,<0.42
+kafka-python>=2.3,<2.4
+pydantic>=2.12,<3
+python-dotenv>=1.0,<2
+supabase>=2.28,<3
diff --git a/paper/src/bib/references.bib b/paper/src/bib/references.bib
index 5dc3352..38e953f 100644
--- a/paper/src/bib/references.bib
+++ b/paper/src/bib/references.bib
@@ -616,3 +616,17 @@ Volume: 21},
 	year = {2026},
 	file = {Snapshot:/home/velocitatem/Zotero/storage/N724QGF6/v4.html:text/html},
 }
+
+@article{mann_test_1947,
+	title = {On a {Test} of {Whether} one of {Two} {Random} {Variables} is {Stochastically} {Larger} than the {Other}},
+	volume = {18},
+	url = {https://doi.org/10.1214/aoms/1177730491},
+	doi = {10.1214/aoms/1177730491},
+	abstract = {Let x and y be two random variables with continuous cumulative distribution functions f and g. A statistic U depending on the relative ranks of the x's and y's is proposed for testing the hypothesis f = g. Wilcoxon proposed an equivalent test in the Biometrics Bulletin, December, 1945, but gave only a few points of the distribution of his statistic. Under the hypothesis f = g the probability of obtaining a given U in a sample of n x's and m y's is the solution of a certain recurrence relation involving n and m. Using this recurrence relation tables have been computed giving the probability of U for samples up to n = m = 8. At this point the distribution is almost normal. From the recurrence relation explicit expressions for the mean, variance, and fourth moment are obtained. The 2rth moment is shown to have a certain form which enabled us to prove that the limit distribution is normal if m, n go to infinity in any arbitrary manner. The test is shown to be consistent with respect to the class of alternatives f(x) {\textgreater} g(x) for every x.},
+	number = {1},
+	journal = {The Annals of Mathematical Statistics},
+	author = {Mann, H. B. and Whitney, D. R.},
+	year = {1947},
+	note = {Publisher: Institute of Mathematical Statistics},
+	pages = {50 -- 60},
+}
diff --git a/paper/src/chapters/03-methodology.tex b/paper/src/chapters/03-methodology.tex
index f667e5f..4e770b8 100644
--- a/paper/src/chapters/03-methodology.tex
+++ b/paper/src/chapters/03-methodology.tex
@@ -303,13 +303,9 @@ To train a robust pricing learner, we need a simulator that can generate realist
 \subsubsection{Ground-Truth Separability}
 Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels $\theta_s \in \{H, A\}$ are available as ground truth rather than as the output of a heuristic classifier. We therefore estimate separate transition kernels directly from each labeled partition $\mathcal{D}_H$ and $\mathcal{D}_A$, treating the resulting $\hat{\mathcal{T}}_H$ and $\hat{\mathcal{T}}_A$ as the ground-truth behavioral profiles for each class. We then ask a direct methodological question: are the kernels separable enough to justify downstream pricing control that depends on that separability?
 
-To answer this, we compute average KL divergence between transition probability matrices. This statistic gives global separability and event-level diagnostics at the same time. To test whether the observed between-class value exceeds finite-sample estimation noise, we compute an intra-class bootstrap baseline by repeatedly splitting $\mathcal{D}_H$ and $\mathcal{D}_A$ into two random halves, fitting a transition kernel on each half, and re-computing the same average KL statistic for each split.
+To answer this, we compute per-session KL divergence scores against both class-level centroids. For each session $s$ in either partition, we fit a session-level event transition kernel $\hat{\mathcal{T}}_s$ from that session's trajectory alone, then compute its average KL divergence to the human centroid ($\Delta_{H,s}$) and to the agent centroid ($\Delta_{A,s}$). The per-session separability score is the gap $\Delta_{H,s} - \Delta_{A,s}$: a negative value indicates proximity to human behavior, a positive value indicates proximity to agent behavior.
 
-Formally, for $B$ bootstrap splits per class we obtain reference samples $\{d_{H,b}^{\text{intra}}\}_{b=1}^B$ and $\{d_{A,b}^{\text{intra}}\}_{b=1}^B$, then compare the between-class divergence $d^{\text{inter}}$ against the pooled null distribution. We report pooled mean and variance, lift ratio $d^{\text{inter}}/\mathbb{E}[d^{\text{intra}}]$, and the empirical one-sided p-value
-\begin{equation}
-\hat p = \frac{1 + \sum_{j=1}^{2B}\mathbf{1}\{d_j^{\text{intra}} \ge d^{\text{inter}}\}}{2B + 1},
-\end{equation}
-which gives a direct significance check for separability before using divergence-derived centroid control signals in pricing.
+The underlying KL divergences are right-skewed and bounded below by zero, so the per-session gap scores built from them cannot be assumed to be normally distributed, and we therefore do not use a Student's $t$-test. Instead, we apply a Mann-Whitney $U$ test \parencite{mann_test_1947} to the per-session gap scores of the two groups. The Mann-Whitney test is a rank-based nonparametric test that compares the stochastic ordering of two independent samples without assuming a particular distributional form, making it appropriate for small samples drawn from skewed populations. We report $U$, the exact two-sided $p$-value, and group-level descriptive statistics for the gap scores.
 
 \begin{definition}[Kullback-Leibler Divergence for Transition Distributions]
 Let $P_e$ and $Q_e$ be categorical distributions over destination states following event $e$, derived from human and agent trajectories respectively. The KL divergence between these distributions is:
diff --git a/paper/src/chapters/04-results.tex b/paper/src/chapters/04-results.tex
index f541a55..675b722 100644
--- a/paper/src/chapters/04-results.tex
+++ b/paper/src/chapters/04-results.tex
@@ -10,26 +10,25 @@
 
 \subsection{Behavioral Analysis}
 
-The transition-kernel analysis is evaluated with both between-class divergence and an intra-class bootstrap null baseline. This allows us to separate real behavioral differences from finite-sample estimation noise and bias.
+Separability between human and agent sessions is evaluated by computing per-session divergence gap scores $\Delta_{H,s} - \Delta_{A,s}$ and comparing the two groups with a Mann-Whitney $U$ test. Table~\ref{tab:divergence_significance} reports the group-level descriptive statistics for the gap scores and the test result.
 
 \begin{table}[ht]
 \centering
-\caption{Divergence significance using intra-class bootstrap baseline (B=100 per class).}
+\caption{Per-session divergence gap ($\Delta_H - \Delta_A$) by actor class with Mann-Whitney $U$ test.}
 \label{tab:divergence_significance}
-\begin{tabular}{lcccc}
+\begin{tabular}{lccc}
 \toprule
-Metric & Mean KL & Std & 5\% quantile & 95\% quantile \\
+Group & $n$ & Mean gap & Std \\
 \midrule
-Between-class (Human vs Agent) & 5.3067 & -- & -- & -- \\
-Human intra-class split & 2.5271 & 1.2501 & 0.6845 & 4.6015 \\
-Agent intra-class split & 1.2065 & 1.2607 & 0.2177 & 4.2345 \\
+Human sessions & 11 & $-3.3522$ & $2.6748$ \\
+Agent sessions & 6 & $+1.6482$ & $2.8349$ \\
+\midrule
+\multicolumn{4}{l}{Mann-Whitney $U = 2.0$, $p = 0.0006$ (two-sided)} \\
 \bottomrule
 \end{tabular}
 \end{table}
 
-For this run ($n_H=11$, $n_A=7$, $B=100$), the empirical p-value is $0.0149$, both computed as defined in Section~\ref{sec:tpe}. This places the between-class divergence clearly above the intra-class null and supports the use of divergence-derived contamination signals in downstream pricing control.
-
-% TODO: instead could we do a simple t test to see the difference in the means in some way? That way we can yield a P value
+The sign structure is consistent with the theoretical expectation: human sessions produce negative gap scores on average (closer to the human centroid than to the agent centroid), while agent sessions produce positive gap scores on average (closer to the agent centroid). With $n_H=11$ and $n_A=6$, the statistic $U=2.0$ out of a maximum of $n_H n_A = 66$ indicates near-complete rank separation between the groups, and the two-sided $p$-value of $0.0006$ provides strong evidence that the transition kernels are separable enough to justify their use as a control signal in downstream pricing.
 
 
 \subsection{Experimental Outcomes}
@@ -54,6 +53,6 @@ This comparison isolates the effect of robustness terms from model capacity and
 
 
 \subsection{Interpretation and Insights}
-Between-class divergence substantially above the intra-class null indicates that the two actor classes are behaviorally separable at the transition-kernel level. In pricing experiments, this is the condition required for separability to act as a useful control signal rather than just an auxiliary classifier score.
+The Mann-Whitney result ($U=2.0$, $p<0.001$) confirms that per-session divergence gaps separate the two actor classes with near-zero overlap in rank ordering. This is the condition required for separability to act as a useful control signal in the pricing loop rather than just an auxiliary classifier score.
 
 \subsection{Anomalies}
diff --git a/sim/rl/behavior_loader/loader.py b/sim/rl/behavior_loader/loader.py
index 3336956..a335fce 100644
--- a/sim/rl/behavior_loader/loader.py
+++ b/sim/rl/behavior_loader/loader.py
@@ -2,6 +2,7 @@ import os
 import json
 from pydantic import BaseModel as Base
 
+
 class PayloadModel(Base):
     sessionId: str
     experimentId: str | None
@@ -13,6 +14,7 @@ class PayloadModel(Base):
     userAgent: str
     ts: str
 
+
 class ValueModel(Base):
     payload: PayloadModel
     encoding: str
@@ -20,6 +22,7 @@ class ValueModel(Base):
     schemaId: int
     size: int
 
+
 class InteractionModel(Base):
     partitionID: int
     offset: int
@@ -30,14 +33,17 @@ class InteractionModel(Base):
     key: dict
     value: ValueModel
 
+
 def _is_admin(page: str | None) -> bool:
     return page is not None and page.startswith("/admin/")
 
+
 class Loader:
     def __init__(self, src_dir: str):
         self.src_dir = src_dir
         self.entries = os.listdir(src_dir)
-        if not self.entries: raise ValueError("empty directory")
+        if not self.entries:
+            raise ValueError("empty directory")
         self.data = self._load_sessions()
 
     def _load_sessions(self) -> dict:
@@ -55,16 +61,21 @@ class Loader:
     def get_entries(self) -> tuple[list[str], int]:
         return self.entries, len(self.entries)
 
+
 class AgentLoader(Loader):
     def _load_sessions(self) -> dict:
         sessions = {}
         for entry in self.entries:
-            with open(f"{self.src_dir}/{entry}/int.json") as f:
+            path = f"{self.src_dir}/{entry}/int.json"  # JSON file parsed below into PayloadModel records
+            if not os.path.isfile(path):  # entry has no int.json: skip it instead of raising on open()
+                continue  # NOTE(review): skipped entries stay in self.entries, so get_entries() still counts them
+            with open(path) as f:
                 raw = json.load(f)
             ints = [PayloadModel(**i) for i in raw]
             sessions[entry] = [i for i in ints if not _is_admin(i.page)]
         return sessions
 
+
 class JointLoader:
     def __init__(self, human_dir: str, agent_dir: str):
         self.human_loader = Loader(human_dir)
@@ -74,10 +85,14 @@ class JointLoader:
 
     def _merge(self) -> dict:
         return {
-            **{f"human_{sid}": [e.value.payload for e in evts]
-               for sid, evts in self.human_loader.get_data().items()},
-            **{f"agent_{sid}": evts
-               for sid, evts in self.agent_loader.get_data().items()}
+            **{
+                f"human_{sid}": [e.value.payload for e in evts]
+                for sid, evts in self.human_loader.get_data().items()
+            },
+            **{
+                f"agent_{sid}": evts
+                for sid, evts in self.agent_loader.get_data().items()
+            },
         }
 
     def get_data(self) -> dict:
@@ -86,12 +101,17 @@ class JointLoader:
     def get_entries(self) -> tuple[list[str], int]:
         return self.entries, len(self.entries)
 
+
 if __name__ == "__main__":
     agent_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
-    human_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
+    human_dir = (
+        "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
+    )
 
-    for name, cls, path in [("agent", AgentLoader, agent_dir),
-                             ("human", Loader, human_dir),
-                             ("joint", lambda d: JointLoader(human_dir, d), agent_dir)]:
+    for name, cls, path in [
+        ("agent", AgentLoader, agent_dir),
+        ("human", Loader, human_dir),
+        ("joint", lambda d: JointLoader(human_dir, d), agent_dir),
+    ]:
         ldr = cls(path) if name != "joint" else cls(agent_dir)
         print(f"Loaded {len(ldr.get_entries()[0])} {name} sessions")
diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py
index c4ec78f..cb67cbf 100644
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -260,6 +260,29 @@ def _avg_event_kl(
     return float(np.mean([kl_divergence(src_evt[e], dst_evt[e]) for e in common]))
 
 
+def per_session_divergence(
+    model: BehaviorModel,
+    reference_evt: Dict[str, Dict[str, float]],
+) -> List[float]:
+    """Mean kl_divergence(session, reference) over shared event keys; one score per session that has >= 2 events."""
+    scores = []
+    for sid, evts in model.data.items():
+        if len(evts) < 2:  # skipped entirely: presumably too short to form a transition (confirm)
+            continue
+        subset_mdp = _build_subset_mdp(model, [sid])  # model restricted to this single session
+        sess_evt = aggregate_event_transitions(subset_mdp)
+        common = set(sess_evt.keys()) & set(reference_evt.keys())  # only event keys present on both sides
+        if not common:
+            scores.append(0.0)  # NOTE(review): no shared event is recorded as 0.0, same value as an identical match
+            continue
+        scores.append(
+            float(
+                np.mean([kl_divergence(sess_evt[e], reference_evt[e]) for e in common])
+            )
+        )
+    return scores  # order follows model.data; skipping does not depend on reference_evt
+
+
 def bootstrap_intra_class_divergence(
     model: BehaviorModel,
     n_bootstrap: int = 100,
@@ -412,3 +435,36 @@ if __name__ == "__main__":
         f"  Lift vs pooled intra mean: {inter_class_avg / max(float(np.mean(pooled_null)), 1e-10):.2f}x"
     )
     print(f"  Empirical p-value (inter > intra): {p_empirical:.4f}")
+
+    # per-session divergence scores: delta_H - delta_A per session (positive means closer to agent behavior)
+    from scipy.stats import mannwhitneyu
+
+    human_dH = per_session_divergence(
+        human_model, human_evt
+    )  # human session vs human centroid
+    human_dA = per_session_divergence(
+        human_model, agent_evt
+    )  # human session vs agent centroid
+    agent_dH = per_session_divergence(
+        agent_model, human_evt
+    )  # agent session vs human centroid
+    agent_dA = per_session_divergence(
+        agent_model, agent_evt
+    )  # agent session vs agent centroid
+    # score = delta_H - delta_A: high means far from humans, close to agents
+    n_h = min(len(human_dH), len(human_dA))
+    n_a = min(len(agent_dH), len(agent_dA))
+    human_diff = [human_dH[i] - human_dA[i] for i in range(n_h)]
+    agent_diff = [agent_dH[i] - agent_dA[i] for i in range(n_a)]
+    print(f"\nPer-session divergence gap (delta_H - delta_A):")
+    print(
+        f"  Human sessions (n={n_h}): mean={np.mean(human_diff):.4f}, std={np.std(human_diff):.4f}"
+    )
+    print(
+        f"  Agent sessions (n={n_a}): mean={np.mean(agent_diff):.4f}, std={np.std(agent_diff):.4f}"
+    )
+    if n_h >= 2 and n_a >= 2:
+        U, mw_p = mannwhitneyu(human_diff, agent_diff, alternative="two-sided")
+        print(f"  Mann-Whitney U={U:.1f}, p={mw_p:.4f}")
+    else:
+        print("  Insufficient sessions for Mann-Whitney test")