changed to new test method for significance

2026-07-15 17:43:36 +00:00 · 2026-03-08 13:53:31 +01:00
parent 4b89b64674
commit cc24ac72f7
8 changed files with 162 additions and 41 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,9 @@ phantom.egg-info/
 .nextstep
 .ignore-gitlogue
 .cloudflare
 .nx/
 node_modules/
 dist/
 # generated svg/graphics
 **/session_*.svg
--- a/49
+++ b/49
@@ -8,6 +8,7 @@ VENV      := .venv
 PYTHON    := $(VENV)/bin/python
 PIP       := $(VENV)/bin/pip
 PYTEST    := $(VENV)/bin/pytest
 NX        := npx nx
 SWEEP_ENV_FILE ?= .env.sweep
@@ -36,7 +37,7 @@ SWEEP_ENV_LOAD = set -a; [ -f "$(SWEEP_ENV_FILE)" ] && . "$(SWEEP_ENV_FILE)" ||
 .PHONY: help
 help:
 	@echo "pdf.build pdf.watch pdf.clean | test.backend test.e2e test.all | web.dev | install | train | train.agent | train.bootstrap | train.tpu.pod | train.tpu.vm | train.tpu.vm.sweep | stats.lines"
-	@echo "docker.train.publish"
+	@echo "backend.server backend.provider backend.worker | platform.up platform.down platform.logs | docker.train.publish"
 	@echo ""
 	@echo "Local wandb run:"
 	@echo "  make train LOCAL_TRAIN_ARGS='--algo ppo --total-timesteps 50000'"
@@ -208,11 +209,43 @@ train.tpu.vm.sweep:
 		--tpu-repo-dir "$(TPU_REPO_DIR)" \
 		$(if $(filter-out 0,$(AGENT_COUNT)),--count $(AGENT_COUNT),)
 # Backend and platform convenience targets. Each one forwards to a single
 # Nx project target through $(NX); the leading `@` keeps make from echoing
 # the command line. They are declared phony because none of them produces a
 # file with the target's name, so they must run on every invocation.
 .PHONY: backend.server backend.provider backend.worker platform.up platform.down platform.logs
 # Run the `dev` target of the Nx project `backend-server`.
 backend.server:
 	@$(NX) run backend-server:dev
 # Run the `dev` target of the Nx project `pricing-provider`.
 backend.provider:
 	@$(NX) run pricing-provider:dev
 # Run the `dev` target of the Nx project `backend-worker`.
 backend.worker:
 	@$(NX) run backend-worker:dev
 # Run the `up` target of the Nx project `platform`.
 platform.up:
 	@$(NX) run platform:up
 # Run the `down` target of the Nx project `platform`.
 platform.down:
 	@$(NX) run platform:down
 # Run the `logs` target of the Nx project `platform`.
 platform.logs:
 	@$(NX) run platform:logs
 .PHONY: pdf clean watch run.webapp test count-lines all
-pdf: pdf.build
+pdf:
-clean: pdf.clean
+	@$(NX) run paper:build
-watch: pdf.watch
+
-run.webapp: web.dev
+clean:
-test: test.backend
+	@$(NX) run paper:clean
-count-lines: stats.lines
+
-all: pdf.build
+watch:
 	@$(NX) run paper:watch
 run.webapp:
 	@$(NX) run web:dev
 test:
 	@$(NX) run research:test
 count-lines:
 	@$(NX) run research:stats
 all:
 	@$(NX) run paper:build
--- a/backend/server/requirements.txt
+++ b/backend/server/requirements.txt
@@ -1,6 +1,6 @@
-fastapi==0.104.1
+fastapi>=0.135,<0.136
-uvicorn[standard]==0.24.0
+uvicorn[standard]>=0.41,<0.42
-kafka-python==2.0.2
+kafka-python>=2.3,<2.4
-pydantic==2.5.0
+pydantic>=2.12,<3
-python-dotenv==1.0.0
+python-dotenv>=1.0,<2
-supabase==2.9.1
+supabase>=2.28,<3
--- a/paper/src/bib/references.bib
+++ b/paper/src/bib/references.bib
@@ -616,3 +616,17 @@ Volume: 21},
 	year = {2026},
 	file = {Snapshot:/home/velocitatem/Zotero/storage/N724QGF6/v4.html:text/html},
 }
@article{mann_test_1947,
 	title = {On a {Test} of {Whether} one of {Two} {Random} {Variables} is {Stochastically} {Larger} than the {Other}},
 	volume = {18},
 	url = {https://doi.org/10.1214/aoms/1177730491},
 	doi = {10.1214/aoms/1177730491},
 	abstract = {Let x and y be two random variables with continuous cumulative distribution functions f and g. A statistic U depending on the relative ranks of the x's and y's is proposed for testing the hypothesis f = g. Wilcoxon proposed an equivalent test in the Biometrics Bulletin, December, 1945, but gave only a few points of the distribution of his statistic. Under the hypothesis f = g the probability of obtaining a given U in a sample of n x's and m y's is the solution of a certain recurrence relation involving n and m. Using this recurrence relation tables have been computed giving the probability of U for samples up to n = m = 8. At this point the distribution is almost normal. From the recurrence relation explicit expressions for the mean, variance, and fourth moment are obtained. The 2rth moment is shown to have a certain form which enabled us to prove that the limit distribution is normal if m, n go to infinity in any arbitrary manner. The test is shown to be consistent with respect to the class of alternatives f(x) {\textgreater} g(x) for every x.},
 	number = {1},
 	journal = {The Annals of Mathematical Statistics},
 	author = {Mann, H. B. and Whitney, D. R.},
 	year = {1947},
 	note = {Publisher: Institute of Mathematical Statistics},
 	pages = {50 -- 60},
 }
--- a/paper/src/chapters/03-methodology.tex
+++ b/paper/src/chapters/03-methodology.tex
@@ -303,13 +303,9 @@ To train a robust pricing learner, we need a simulator that can generate realist
 \subsubsection{Ground-Truth Separability}
 Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels $\theta_s \in \{H, A\}$ are available as ground truth rather than as the output of a heuristic classifier. We therefore estimate separate transition kernels directly from each labeled partition $\mathcal{D}_H$ and $\mathcal{D}_A$, treating the resulting $\hat{\mathcal{T}}_H$ and $\hat{\mathcal{T}}_A$ as the ground-truth behavioral profiles for each class. We then ask a direct methodological question: are the kernels separable enough to justify downstream pricing control that depends on that separability?
-To answer this, we compute average KL divergence between transition probability matrices. This statistic gives global separability and event-level diagnostics at the same time. To test whether the observed between-class value exceeds finite-sample estimation noise, we compute an intra-class bootstrap baseline by repeatedly splitting $\mathcal{D}_H$ and $\mathcal{D}_A$ into two random halves, fitting a transition kernel on each half, and re-computing the same average KL statistic for each split.
+To answer this, we compute per-session KL divergence scores against both class-level centroids, i.e., the class kernels $\hat{\mathcal{T}}_H$ and $\hat{\mathcal{T}}_A$. For each session $s$ in either partition, we fit a session-level event transition kernel $\hat{\mathcal{T}}_s$ from that session's trajectory alone, then compute its average KL divergence to the human centroid ($\Delta_{H,s}$) and to the agent centroid ($\Delta_{A,s}$). The per-session separability score is the gap $\Delta_{H,s} - \Delta_{A,s}$: a negative value indicates that the session is closer to human behavior, and a positive value that it is closer to agent behavior.
-Formally, for $B$ bootstrap splits per class we obtain reference samples $\{d_{H,b}^{\text{intra}}\}_{b=1}^B$ and $\{d_{A,b}^{\text{intra}}\}_{b=1}^B$, then compare the between-class divergence $d^{\text{inter}}$ against the pooled null distribution. We report pooled mean and variance, lift ratio $d^{\text{inter}}/\mathbb{E}[d^{\text{intra}}]$, and the empirical one-sided p-value
+KL divergences are right-skewed and bounded below by zero, so neither they nor the gap scores derived from them can be assumed to be normally distributed, and we do not use a Student's $t$-test. Instead we apply a Mann-Whitney $U$ test \parencite{mann_test_1947} to the per-session gap scores of the two groups. The Mann-Whitney test is a rank-based nonparametric test that compares the stochastic ordering of two independent samples without assuming a particular distribution, making it appropriate for small samples drawn from skewed populations. We report $U$, the exact two-sided $p$-value, and group-level descriptive statistics for the gap scores. As a complementary check, we also compare the between-class average KL divergence $d^{\text{inter}}$ with the $2B$ pooled intra-class divergences $d_j^{\text{intra}}$, obtained from $B$ random half-splits of each class, using the empirical one-sided $p$-value
 \begin{equation}
 \hat p = \frac{1 + \sum_{j=1}^{2B}\mathbf{1}\{d_j^{\text{intra}} \ge d^{\text{inter}}\}}{2B + 1},
 \end{equation}
 which gives a direct significance check for separability before using divergence-derived centroid control signals in pricing.
 \begin{definition}[Kullback-Leibler Divergence for Transition Distributions]
 Let $P_e$ and $Q_e$ be categorical distributions over destination states following event $e$, derived from human and agent trajectories respectively. The KL divergence between these distributions is:
--- a/paper/src/chapters/04-results.tex
+++ b/paper/src/chapters/04-results.tex
@@ -10,26 +10,25 @@
 \subsection{Behavioral Analysis}
-The transition-kernel analysis is evaluated with both between-class divergence and an intra-class bootstrap null baseline. This allows us to separate real behavioral differences from finite-sample estimation noise and bias.
+Separability between human and agent sessions is evaluated by computing per-session divergence gap scores $\Delta_{H,s} - \Delta_{A,s}$ and comparing the two groups with a Mann-Whitney $U$ test. Table~\ref{tab:divergence_significance} reports the group-level descriptive statistics for the gap scores and the test result.
 \begin{table}[ht]
 \centering
-\caption{Divergence significance using intra-class bootstrap baseline (B=100 per class).}
+\caption{Per-session divergence gap ($\Delta_H - \Delta_A$) by actor class with Mann-Whitney $U$ test.}
 \label{tab:divergence_significance}
-\begin{tabular}{lcccc}
+\begin{tabular}{lccc}
 \toprule
-Metric & Mean KL & Std & 5\% quantile & 95\% quantile \\
+Group & $n$ & Mean gap & Std \\
 \midrule
-Between-class (Human vs Agent) & 5.3067 & -- & -- & -- \\
+Human sessions & 11 & $-3.3522$ & $2.6748$ \\
-Human intra-class split & 2.5271 & 1.2501 & 0.6845 & 4.6015 \\
+Agent sessions & 6 & $+1.6482$ & $2.8349$ \\
-Agent intra-class split & 1.2065 & 1.2607 & 0.2177 & 4.2345 \\
+\midrule
 \multicolumn{4}{l}{Mann-Whitney $U = 2.0$, $p = 0.0006$ (two-sided)} \\
 \bottomrule
 \end{tabular}
 \end{table}
-For this run ($n_H=11$, $n_A=7$, $B=100$), the empirical p-value is $0.0149$, both computed as defined in Section~\ref{sec:tpe}. This places the between-class divergence clearly above the intra-class null and supports the use of divergence-derived contamination signals in downstream pricing control.
+The sign structure is consistent with the theoretical expectation: human sessions have a negative mean gap (closer to the human centroid than to the agent centroid), while agent sessions have a positive mean gap (closer to the agent centroid). With $U = 2.0$ out of a maximum of $n_H n_A = 66$, the ranks of the two groups are almost completely separated, and the two-sided $p$-value of $0.0006$ at $n_H=11$, $n_A=6$ provides strong evidence that the transition kernels are separable enough to justify their use as a control signal in downstream pricing.
 % TODO: instead could we do a simple t test to see the difference in the means in some way? That way we can yield a P value
 \subsection{Experimental Outcomes}
@@ -54,6 +53,6 @@ This comparison isolates the effect of robustness terms from model capacity and
 \subsection{Interpretation and Insights}
-Between-class divergence substantially above the intra-class null indicates that the two actor classes are behaviorally separable at the transition-kernel level. In pricing experiments, this is the condition required for separability to act as a useful control signal rather than just an auxiliary classifier score.
+The Mann-Whitney result ($U=2.0$, $p<0.001$) confirms that per-session divergence gaps separate the two actor classes with near-zero overlap in rank ordering. This is the condition required for separability to act as a useful control signal in the pricing loop rather than just an auxiliary classifier score.
 \subsection{Anomalies}
--- a/sim/rl/behavior_loader/loader.py
+++ b/sim/rl/behavior_loader/loader.py
@@ -2,6 +2,7 @@ import os
 import json
 from pydantic import BaseModel as Base
 class PayloadModel(Base):
    sessionId: str
    experimentId: str | None
@@ -13,6 +14,7 @@ class PayloadModel(Base):
    userAgent: str
    ts: str
 class ValueModel(Base):
    payload: PayloadModel
    encoding: str
@@ -20,6 +22,7 @@ class ValueModel(Base):
    schemaId: int
    size: int
 class InteractionModel(Base):
    partitionID: int
    offset: int
@@ -30,14 +33,17 @@ class InteractionModel(Base):
    key: dict
    value: ValueModel
 def _is_admin(page: str | None) -> bool:
    return page is not None and page.startswith("/admin/")
 class Loader:
    def __init__(self, src_dir: str):
        self.src_dir = src_dir
        self.entries = os.listdir(src_dir)
-        if not self.entries: raise ValueError("empty directory")
+        if not self.entries:
            raise ValueError("empty directory")
        self.data = self._load_sessions()
    def _load_sessions(self) -> dict:
@@ -55,16 +61,21 @@ class Loader:
    def get_entries(self) -> tuple[list[str], int]:
        return self.entries, len(self.entries)
 class AgentLoader(Loader):
    def _load_sessions(self) -> dict:
        sessions = {}
        for entry in self.entries:
-            with open(f"{self.src_dir}/{entry}/int.json") as f:
+            path = f"{self.src_dir}/{entry}/int.json"
            if not os.path.isfile(path):
                continue
            with open(path) as f:
                raw = json.load(f)
            ints = [PayloadModel(**i) for i in raw]
            sessions[entry] = [i for i in ints if not _is_admin(i.page)]
        return sessions
 class JointLoader:
    def __init__(self, human_dir: str, agent_dir: str):
        self.human_loader = Loader(human_dir)
@@ -74,10 +85,14 @@ class JointLoader:
    def _merge(self) -> dict:
        return {
-            **{f"human_{sid}": [e.value.payload for e in evts]
+            **{
-               for sid, evts in self.human_loader.get_data().items()},
+                f"human_{sid}": [e.value.payload for e in evts]
-            **{f"agent_{sid}": evts
+                for sid, evts in self.human_loader.get_data().items()
-               for sid, evts in self.agent_loader.get_data().items()}
+            },
            **{
                f"agent_{sid}": evts
                for sid, evts in self.agent_loader.get_data().items()
            },
        }
    def get_data(self) -> dict:
@@ -86,12 +101,17 @@ class JointLoader:
    def get_entries(self) -> tuple[list[str], int]:
        return self.entries, len(self.entries)
 if __name__ == "__main__":
    agent_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
-    human_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
+    human_dir = (
        "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
    )
-    for name, cls, path in [("agent", AgentLoader, agent_dir),
+    for name, cls, path in [
        ("agent", AgentLoader, agent_dir),
        ("human", Loader, human_dir),
-                             ("joint", lambda d: JointLoader(human_dir, d), agent_dir)]:
+        ("joint", lambda d: JointLoader(human_dir, d), agent_dir),
    ]:
        ldr = cls(path) if name != "joint" else cls(agent_dir)
        print(f"Loaded {len(ldr.get_entries()[0])} {name} sessions")
--- a/sim/rl/behavior_loader/models.py
+++ b/sim/rl/behavior_loader/models.py
@@ -260,6 +260,29 @@ def _avg_event_kl(
    return float(np.mean([kl_divergence(src_evt[e], dst_evt[e]) for e in common]))
 def per_session_divergence(
    model: BehaviorModel,
    reference_evt: Dict[str, Dict[str, float]],
 ) -> List[float]:
    """Score each session in ``model`` by its mean event-level KL divergence to ``reference_evt``.

    Sessions with fewer than two events are skipped. A session that shares no
    event keys with the reference is scored 0.0 instead of being dropped.
    Returns one float per retained session, in ``model.data`` iteration order.
    """
    divergences: List[float] = []
    for session_id, events in model.data.items():
        if len(events) < 2:
            continue
        # Build the event-level transition distribution from this one session only.
        session_kernel = aggregate_event_transitions(
            _build_subset_mdp(model, [session_id])
        )
        shared = set(session_kernel.keys()) & set(reference_evt.keys())
        if shared:
            per_event_kl = [
                kl_divergence(session_kernel[e], reference_evt[e]) for e in shared
            ]
            divergences.append(float(np.mean(per_event_kl)))
        else:
            # No overlapping events: record 0.0 so the session still gets an entry.
            divergences.append(0.0)
    return divergences
 def bootstrap_intra_class_divergence(
    model: BehaviorModel,
    n_bootstrap: int = 100,
@@ -412,3 +435,36 @@ if __name__ == "__main__":
        f"  Lift vs pooled intra mean: {inter_class_avg / max(float(np.mean(pooled_null)), 1e-10):.2f}x"
    )
    print(f"  Empirical p-value (inter > intra): {p_empirical:.4f}")
    # per-session divergence scores: delta_H - delta_A per session (positive means closer to agent behavior)
    from scipy.stats import mannwhitneyu
    human_dH = per_session_divergence(
        human_model, human_evt
    )  # human session vs human centroid
    human_dA = per_session_divergence(
        human_model, agent_evt
    )  # human session vs agent centroid
    agent_dH = per_session_divergence(
        agent_model, human_evt
    )  # agent session vs human centroid
    agent_dA = per_session_divergence(
        agent_model, agent_evt
    )  # agent session vs agent centroid
    # score = delta_H - delta_A: high means far from humans, close to agents
    n_h = min(len(human_dH), len(human_dA))
    n_a = min(len(agent_dH), len(agent_dA))
    human_diff = [human_dH[i] - human_dA[i] for i in range(n_h)]
    agent_diff = [agent_dH[i] - agent_dA[i] for i in range(n_a)]
    print(f"\nPer-session divergence gap (delta_H - delta_A):")
    print(
        f"  Human sessions (n={n_h}): mean={np.mean(human_diff):.4f}, std={np.std(human_diff):.4f}"
    )
    print(
        f"  Agent sessions (n={n_a}): mean={np.mean(agent_diff):.4f}, std={np.std(agent_diff):.4f}"
    )
    if n_h >= 2 and n_a >= 2:
        U, mw_p = mannwhitneyu(human_diff, agent_diff, alternative="two-sided")
        print(f"  Mann-Whitney U={U:.1f}, p={mw_p:.4f}")
    else:
        print("  Insufficient sessions for Mann-Whitney test")