From cc24ac72f786befb33ce2df57622a948ef7382d0 Mon Sep 17 00:00:00 2001 From: Daniel Rosel Date: Sun, 8 Mar 2026 13:53:31 +0100 Subject: [PATCH] changed to new test method for significance --- .gitignore | 3 ++ Makefile | 49 +++++++++++++++++++---- backend/server/requirements.txt | 12 +++--- paper/src/bib/references.bib | 14 +++++++ paper/src/chapters/03-methodology.tex | 8 +--- paper/src/chapters/04-results.tex | 21 +++++----- sim/rl/behavior_loader/loader.py | 40 ++++++++++++++----- sim/rl/behavior_loader/models.py | 56 +++++++++++++++++++++++++++ 8 files changed, 162 insertions(+), 41 deletions(-) diff --git a/.gitignore b/.gitignore index 8ae7e83..5dc3efe 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,9 @@ phantom.egg-info/ .nextstep .ignore-gitlogue .cloudflare +.nx/ +node_modules/ +dist/ # generated svg/graphics **/session_*.svg diff --git a/Makefile b/Makefile index c9203a4..fe5baca 100644 --- a/Makefile +++ b/Makefile @@ -8,6 +8,7 @@ VENV := .venv PYTHON := $(VENV)/bin/python PIP := $(VENV)/bin/pip PYTEST := $(VENV)/bin/pytest +NX := npx nx SWEEP_ENV_FILE ?= .env.sweep @@ -36,7 +37,7 @@ SWEEP_ENV_LOAD = set -a; [ -f "$(SWEEP_ENV_FILE)" ] && . 
"$(SWEEP_ENV_FILE)" || .PHONY: help help: @echo "pdf.build pdf.watch pdf.clean | test.backend test.e2e test.all | web.dev | install | train | train.agent | train.bootstrap | train.tpu.pod | train.tpu.vm | train.tpu.vm.sweep | stats.lines" - @echo "docker.train.publish" + @echo "backend.server backend.provider backend.worker | platform.up platform.down platform.logs | docker.train.publish" @echo "" @echo "Local wandb run:" @echo " make train LOCAL_TRAIN_ARGS='--algo ppo --total-timesteps 50000'" @@ -208,11 +209,43 @@ train.tpu.vm.sweep: --tpu-repo-dir "$(TPU_REPO_DIR)" \ $(if $(filter-out 0,$(AGENT_COUNT)),--count $(AGENT_COUNT),) +.PHONY: backend.server backend.provider backend.worker platform.up platform.down platform.logs +backend.server: + @$(NX) run backend-server:dev + +backend.provider: + @$(NX) run pricing-provider:dev + +backend.worker: + @$(NX) run backend-worker:dev + +platform.up: + @$(NX) run platform:up + +platform.down: + @$(NX) run platform:down + +platform.logs: + @$(NX) run platform:logs + .PHONY: pdf clean watch run.webapp test count-lines all -pdf: pdf.build -clean: pdf.clean -watch: pdf.watch -run.webapp: web.dev -test: test.backend -count-lines: stats.lines -all: pdf.build +pdf: + @$(NX) run paper:build + +clean: + @$(NX) run paper:clean + +watch: + @$(NX) run paper:watch + +run.webapp: + @$(NX) run web:dev + +test: + @$(NX) run research:test + +count-lines: + @$(NX) run research:stats + +all: + @$(NX) run paper:build diff --git a/backend/server/requirements.txt b/backend/server/requirements.txt index 6a49ae4..432203e 100644 --- a/backend/server/requirements.txt +++ b/backend/server/requirements.txt @@ -1,6 +1,6 @@ -fastapi==0.104.1 -uvicorn[standard]==0.24.0 -kafka-python==2.0.2 -pydantic==2.5.0 -python-dotenv==1.0.0 -supabase==2.9.1 +fastapi>=0.135,<0.136 +uvicorn[standard]>=0.41,<0.42 +kafka-python>=2.3,<2.4 +pydantic>=2.12,<3 +python-dotenv>=1.0,<2 +supabase>=2.28,<3 diff --git a/paper/src/bib/references.bib b/paper/src/bib/references.bib 
index 5dc3352..38e953f 100644 --- a/paper/src/bib/references.bib +++ b/paper/src/bib/references.bib @@ -616,3 +616,17 @@ Volume: 21}, year = {2026}, file = {Snapshot:/home/velocitatem/Zotero/storage/N724QGF6/v4.html:text/html}, } + +@article{mann_test_1947, + title = {On a {Test} of {Whether} one of {Two} {Random} {Variables} is {Stochastically} {Larger} than the {Other}}, + volume = {18}, + url = {https://doi.org/10.1214/aoms/1177730491}, + doi = {10.1214/aoms/1177730491}, + abstract = {Let x and y be two random variables with continuous cumulative distribution functions f and g. A statistic U depending on the relative ranks of the x's and y's is proposed for testing the hypothesis f = g. Wilcoxon proposed an equivalent test in the Biometrics Bulletin, December, 1945, but gave only a few points of the distribution of his statistic. Under the hypothesis f = g the probability of obtaining a given U in a sample of n x's and m y's is the solution of a certain recurrence relation involving n and m. Using this recurrence relation tables have been computed giving the probability of U for samples up to n = m = 8. At this point the distribution is almost normal. From the recurrence relation explicit expressions for the mean, variance, and fourth moment are obtained. The 2rth moment is shown to have a certain form which enabled us to prove that the limit distribution is normal if m, n go to infinity in any arbitrary manner. The test is shown to be consistent with respect to the class of alternatives f(x) {\textgreater} g(x) for every x.}, + number = {1}, + journal = {The Annals of Mathematical Statistics}, + author = {Mann, H. B. and Whitney, D. 
R.}, + year = {1947}, + note = {Publisher: Institute of Mathematical Statistics}, + pages = {50 -- 60}, +} diff --git a/paper/src/chapters/03-methodology.tex b/paper/src/chapters/03-methodology.tex index f667e5f..4e770b8 100644 --- a/paper/src/chapters/03-methodology.tex +++ b/paper/src/chapters/03-methodology.tex @@ -303,13 +303,9 @@ To train a robust pricing learner, we need a simulator that can generate realist \subsubsection{Ground-Truth Separability} Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels $\theta_s \in \{H, A\}$ are available as ground truth rather than as the output of a heuristic classifier. We therefore estimate separate transition kernels directly from each labeled partition $\mathcal{D}_H$ and $\mathcal{D}_A$, treating the resulting $\hat{\mathcal{T}}_H$ and $\hat{\mathcal{T}}_A$ as the ground-truth behavioral profiles for each class. We then ask a direct methodological question: are the kernels separable enough to justify downstream pricing control that depends on that separability? -To answer this, we compute average KL divergence between transition probability matrices. This statistic gives global separability and event-level diagnostics at the same time. To test whether the observed between-class value exceeds finite-sample estimation noise, we compute an intra-class bootstrap baseline by repeatedly splitting $\mathcal{D}_H$ and $\mathcal{D}_A$ into two random halves, fitting a transition kernel on each half, and re-computing the same average KL statistic for each split. +To answer this, we compute per-session KL divergence scores against both class-level centroids. For each session $s$ in either partition, we fit a session-level event transition kernel $\hat{\mathcal{T}}_s$ from that session's trajectory alone, then compute its average KL divergence to the human centroid ($\Delta_{H,s}$) and to the agent centroid ($\Delta_{A,s}$). 
The per-session separability score is the gap $\Delta_{H,s} - \Delta_{A,s}$: a negative value indicates proximity to human behavior, a positive value indicates proximity to agent behavior. -Formally, for $B$ bootstrap splits per class we obtain reference samples $\{d_{H,b}^{\text{intra}}\}_{b=1}^B$ and $\{d_{A,b}^{\text{intra}}\}_{b=1}^B$, then compare the between-class divergence $d^{\text{inter}}$ against the pooled null distribution. We report pooled mean and variance, lift ratio $d^{\text{inter}}/\mathbb{E}[d^{\text{intra}}]$, and the empirical one-sided p-value -\begin{equation} -\hat p = \frac{1 + \sum_{j=1}^{2B}\mathbf{1}\{d_j^{\text{intra}} \ge d^{\text{inter}}\}}{2B + 1}, -\end{equation} -which gives a direct significance check for separability before using divergence-derived centroid control signals in pricing. +Normality cannot be assumed for these gap scores: they are differences of KL divergences, which are themselves right-skewed and bounded below by zero, and each group contains only a handful of sessions, so we do not use a Student's $t$-test. Instead we apply a Mann-Whitney $U$ test \parencite{mann_test_1947} on the per-session gap scores between the two groups. The Mann-Whitney test is a rank-based nonparametric test that compares the stochastic ordering of two independent samples without assuming a parametric form for their distributions, making it appropriate for small samples drawn from skewed populations. We report $U$, the exact two-sided $p$-value, and group-level descriptive statistics for the gap scores. \begin{definition}[Kullback-Leibler Divergence for Transition Distributions] Let $P_e$ and $Q_e$ be categorical distributions over destination states following event $e$, derived from human and agent trajectories respectively. 
The KL divergence between these distributions is: diff --git a/paper/src/chapters/04-results.tex b/paper/src/chapters/04-results.tex index f541a55..675b722 100644 --- a/paper/src/chapters/04-results.tex +++ b/paper/src/chapters/04-results.tex @@ -10,26 +10,25 @@ \subsection{Behavioral Analysis} -The transition-kernel analysis is evaluated with both between-class divergence and an intra-class bootstrap null baseline. This allows us to separate real behavioral differences from finite-sample estimation noise and bias. +Separability between human and agent sessions is evaluated by computing per-session divergence gap scores $\Delta_{H,s} - \Delta_{A,s}$ and comparing the two groups with a Mann-Whitney $U$ test. Table~\ref{tab:divergence_significance} reports the group-level descriptive statistics for the gap scores and the test result. \begin{table}[ht] \centering -\caption{Divergence significance using intra-class bootstrap baseline (B=100 per class).} +\caption{Per-session divergence gap ($\Delta_H - \Delta_A$) by actor class with Mann-Whitney $U$ test.} \label{tab:divergence_significance} -\begin{tabular}{lcccc} +\begin{tabular}{lccc} \toprule -Metric & Mean KL & Std & 5\% quantile & 95\% quantile \\ +Group & $n$ & Mean gap & Std \\ \midrule -Between-class (Human vs Agent) & 5.3067 & -- & -- & -- \\ -Human intra-class split & 2.5271 & 1.2501 & 0.6845 & 4.6015 \\ -Agent intra-class split & 1.2065 & 1.2607 & 0.2177 & 4.2345 \\ +Human sessions & 11 & $-3.3522$ & $2.6748$ \\ +Agent sessions & 6 & $+1.6482$ & $2.8349$ \\ +\midrule +\multicolumn{4}{l}{Mann-Whitney $U = 2.0$, $p = 0.0006$ (two-sided)} \\ \bottomrule \end{tabular} \end{table} -For this run ($n_H=11$, $n_A=7$, $B=100$), the empirical p-value is $0.0149$, both computed as defined in Section~\ref{sec:tpe}. This places the between-class divergence clearly above the intra-class null and supports the use of divergence-derived contamination signals in downstream pricing control. 
- -% TODO: instead could we do a simple t test to see the difference in the means in some way? That way we can yield a P value +The sign structure is consistent with the theoretical expectation: human sessions produce negative gap scores on average (closer to the human centroid than to the agent centroid) while agent sessions produce positive gap scores on average (closer to the agent centroid than to the human centroid). Absent ties, the statistic $U=2.0$ means that only 2 of the $n_H n_A = 66$ human--agent session pairs are ordered against this expectation, i.e.\ near-complete rank separation between the groups at $n_H=11$, $n_A=6$. The corresponding two-sided $p$-value of $0.0006$ provides strong evidence that the transition kernels are separable enough to justify their use as a control signal in downstream pricing. \subsection{Experimental Outcomes} @@ -54,6 +53,6 @@ This comparison isolates the effect of robustness terms from model capacity and \subsection{Interpretation and Insights} -Between-class divergence substantially above the intra-class null indicates that the two actor classes are behaviorally separable at the transition-kernel level. In pricing experiments, this is the condition required for separability to act as a useful control signal rather than just an auxiliary classifier score. +The Mann-Whitney result ($U=2.0$, $p<0.001$) confirms that per-session divergence gaps separate the two actor classes with near-zero overlap in rank ordering. This is the condition required for separability to act as a useful control signal in the pricing loop rather than just an auxiliary classifier score. 
\subsection{Anomalies} diff --git a/sim/rl/behavior_loader/loader.py b/sim/rl/behavior_loader/loader.py index 3336956..a335fce 100644 --- a/sim/rl/behavior_loader/loader.py +++ b/sim/rl/behavior_loader/loader.py @@ -2,6 +2,7 @@ import os import json from pydantic import BaseModel as Base + class PayloadModel(Base): sessionId: str experimentId: str | None @@ -13,6 +14,7 @@ class PayloadModel(Base): userAgent: str ts: str + class ValueModel(Base): payload: PayloadModel encoding: str @@ -20,6 +22,7 @@ class ValueModel(Base): schemaId: int size: int + class InteractionModel(Base): partitionID: int offset: int @@ -30,14 +33,17 @@ class InteractionModel(Base): key: dict value: ValueModel + def _is_admin(page: str | None) -> bool: return page is not None and page.startswith("/admin/") + class Loader: def __init__(self, src_dir: str): self.src_dir = src_dir self.entries = os.listdir(src_dir) - if not self.entries: raise ValueError("empty directory") + if not self.entries: + raise ValueError("empty directory") self.data = self._load_sessions() def _load_sessions(self) -> dict: @@ -55,16 +61,21 @@ class Loader: def get_entries(self) -> tuple[list[str], int]: return self.entries, len(self.entries) + class AgentLoader(Loader): def _load_sessions(self) -> dict: sessions = {} for entry in self.entries: - with open(f"{self.src_dir}/{entry}/int.json") as f: + path = f"{self.src_dir}/{entry}/int.json" + if not os.path.isfile(path): + continue + with open(path) as f: raw = json.load(f) ints = [PayloadModel(**i) for i in raw] sessions[entry] = [i for i in ints if not _is_admin(i.page)] return sessions + class JointLoader: def __init__(self, human_dir: str, agent_dir: str): self.human_loader = Loader(human_dir) @@ -74,10 +85,14 @@ class JointLoader: def _merge(self) -> dict: return { - **{f"human_{sid}": [e.value.payload for e in evts] - for sid, evts in self.human_loader.get_data().items()}, - **{f"agent_{sid}": evts - for sid, evts in self.agent_loader.get_data().items()} + **{ + 
f"human_{sid}": [e.value.payload for e in evts] + for sid, evts in self.human_loader.get_data().items() + }, + **{ + f"agent_{sid}": evts + for sid, evts in self.agent_loader.get_data().items() + }, } def get_data(self) -> dict: @@ -86,12 +101,17 @@ class JointLoader: def get_entries(self) -> tuple[list[str], int]: return self.entries, len(self.entries) + if __name__ == "__main__": agent_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/" - human_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/" + human_dir = ( + "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/" + ) - for name, cls, path in [("agent", AgentLoader, agent_dir), - ("human", Loader, human_dir), - ("joint", lambda d: JointLoader(human_dir, d), agent_dir)]: + for name, cls, path in [ + ("agent", AgentLoader, agent_dir), + ("human", Loader, human_dir), + ("joint", lambda d: JointLoader(human_dir, d), agent_dir), + ]: ldr = cls(path) if name != "joint" else cls(agent_dir) print(f"Loaded {len(ldr.get_entries()[0])} {name} sessions") diff --git a/sim/rl/behavior_loader/models.py b/sim/rl/behavior_loader/models.py index c4ec78f..cb67cbf 100644 --- a/sim/rl/behavior_loader/models.py +++ b/sim/rl/behavior_loader/models.py @@ -260,6 +260,29 @@ def _avg_event_kl( return float(np.mean([kl_divergence(src_evt[e], dst_evt[e]) for e in common])) +def per_session_divergence( + model: BehaviorModel, + reference_evt: Dict[str, Dict[str, float]], +) -> List[float]: + """KL from each session's event-level transition dist to a reference kernel. 
Returns one scalar per session.""" + scores = [] + for sid, evts in model.data.items(): + if len(evts) < 2: + continue + subset_mdp = _build_subset_mdp(model, [sid]) + sess_evt = aggregate_event_transitions(subset_mdp) + common = set(sess_evt.keys()) & set(reference_evt.keys()) + if not common: + scores.append(0.0) + continue + scores.append( + float( + np.mean([kl_divergence(sess_evt[e], reference_evt[e]) for e in common]) + ) + ) + return scores + + def bootstrap_intra_class_divergence( model: BehaviorModel, n_bootstrap: int = 100, @@ -412,3 +435,36 @@ if __name__ == "__main__": f" Lift vs pooled intra mean: {inter_class_avg / max(float(np.mean(pooled_null)), 1e-10):.2f}x" ) print(f" Empirical p-value (inter > intra): {p_empirical:.4f}") + + # per-session divergence scores: delta_H - delta_A per session (positive means closer to agent behavior) + from scipy.stats import mannwhitneyu + + human_dH = per_session_divergence( + human_model, human_evt + ) # human session vs human centroid + human_dA = per_session_divergence( + human_model, agent_evt + ) # human session vs agent centroid + agent_dH = per_session_divergence( + agent_model, human_evt + ) # agent session vs human centroid + agent_dA = per_session_divergence( + agent_model, agent_evt + ) # agent session vs agent centroid + # score = delta_H - delta_A: high means far from humans, close to agents + n_h = min(len(human_dH), len(human_dA)) + n_a = min(len(agent_dH), len(agent_dA)) + human_diff = [human_dH[i] - human_dA[i] for i in range(n_h)] + agent_diff = [agent_dH[i] - agent_dA[i] for i in range(n_a)] + print(f"\nPer-session divergence gap (delta_H - delta_A):") + print( + f" Human sessions (n={n_h}): mean={np.mean(human_diff):.4f}, std={np.std(human_diff):.4f}" + ) + print( + f" Agent sessions (n={n_a}): mean={np.mean(agent_diff):.4f}, std={np.std(agent_diff):.4f}" + ) + if n_h >= 2 and n_a >= 2: + U, mw_p = mannwhitneyu(human_diff, agent_diff, alternative="two-sided") + print(f" Mann-Whitney U={U:.1f}, 
p={mw_p:.4f}") + else: + print(" Insufficient sessions for Mann-Whitney test")