mirror of
https://github.com/velocitatem/PHANTOM.git
synced 2026-05-31 16:43:36 +00:00
changed to new test method for significance
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -18,6 +18,9 @@ phantom.egg-info/
|
||||
.nextstep
|
||||
.ignore-gitlogue
|
||||
.cloudflare
|
||||
.nx/
|
||||
node_modules/
|
||||
dist/
|
||||
|
||||
# generated svg/graphics
|
||||
**/session_*.svg
|
||||
|
||||
49
Makefile
49
Makefile
@@ -8,6 +8,7 @@ VENV := .venv
|
||||
PYTHON := $(VENV)/bin/python
|
||||
PIP := $(VENV)/bin/pip
|
||||
PYTEST := $(VENV)/bin/pytest
|
||||
NX := npx nx
|
||||
|
||||
SWEEP_ENV_FILE ?= .env.sweep
|
||||
|
||||
@@ -36,7 +37,7 @@ SWEEP_ENV_LOAD = set -a; [ -f "$(SWEEP_ENV_FILE)" ] && . "$(SWEEP_ENV_FILE)" ||
|
||||
.PHONY: help
|
||||
help:
|
||||
@echo "pdf.build pdf.watch pdf.clean | test.backend test.e2e test.all | web.dev | install | train | train.agent | train.bootstrap | train.tpu.pod | train.tpu.vm | train.tpu.vm.sweep | stats.lines"
|
||||
@echo "docker.train.publish"
|
||||
@echo "backend.server backend.provider backend.worker | platform.up platform.down platform.logs | docker.train.publish"
|
||||
@echo ""
|
||||
@echo "Local wandb run:"
|
||||
@echo " make train LOCAL_TRAIN_ARGS='--algo ppo --total-timesteps 50000'"
|
||||
@@ -208,11 +209,43 @@ train.tpu.vm.sweep:
|
||||
--tpu-repo-dir "$(TPU_REPO_DIR)" \
|
||||
$(if $(filter-out 0,$(AGENT_COUNT)),--count $(AGENT_COUNT),)
|
||||
|
||||
.PHONY: backend.server backend.provider backend.worker platform.up platform.down platform.logs
|
||||
backend.server:
|
||||
@$(NX) run backend-server:dev
|
||||
|
||||
backend.provider:
|
||||
@$(NX) run pricing-provider:dev
|
||||
|
||||
backend.worker:
|
||||
@$(NX) run backend-worker:dev
|
||||
|
||||
platform.up:
|
||||
@$(NX) run platform:up
|
||||
|
||||
platform.down:
|
||||
@$(NX) run platform:down
|
||||
|
||||
platform.logs:
|
||||
@$(NX) run platform:logs
|
||||
|
||||
.PHONY: pdf clean watch run.webapp test count-lines all
|
||||
pdf: pdf.build
|
||||
clean: pdf.clean
|
||||
watch: pdf.watch
|
||||
run.webapp: web.dev
|
||||
test: test.backend
|
||||
count-lines: stats.lines
|
||||
all: pdf.build
|
||||
pdf:
|
||||
@$(NX) run paper:build
|
||||
|
||||
clean:
|
||||
@$(NX) run paper:clean
|
||||
|
||||
watch:
|
||||
@$(NX) run paper:watch
|
||||
|
||||
run.webapp:
|
||||
@$(NX) run web:dev
|
||||
|
||||
test:
|
||||
@$(NX) run research:test
|
||||
|
||||
count-lines:
|
||||
@$(NX) run research:stats
|
||||
|
||||
all:
|
||||
@$(NX) run paper:build
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
fastapi==0.104.1
|
||||
uvicorn[standard]==0.24.0
|
||||
kafka-python==2.0.2
|
||||
pydantic==2.5.0
|
||||
python-dotenv==1.0.0
|
||||
supabase==2.9.1
|
||||
fastapi>=0.135,<0.136
|
||||
uvicorn[standard]>=0.41,<0.42
|
||||
kafka-python>=2.3,<2.4
|
||||
pydantic>=2.12,<3
|
||||
python-dotenv>=1.0,<2
|
||||
supabase>=2.28,<3
|
||||
|
||||
@@ -616,3 +616,17 @@ Volume: 21},
|
||||
year = {2026},
|
||||
file = {Snapshot:/home/velocitatem/Zotero/storage/N724QGF6/v4.html:text/html},
|
||||
}
|
||||
|
||||
@article{mann_test_1947,
|
||||
title = {On a {Test} of {Whether} one of {Two} {Random} {Variables} is {Stochastically} {Larger} than the {Other}},
|
||||
volume = {18},
|
||||
url = {https://doi.org/10.1214/aoms/1177730491},
|
||||
doi = {10.1214/aoms/1177730491},
|
||||
abstract = {Let x and y be two random variables with continuous cumulative distribution functions f and g. A statistic U depending on the relative ranks of the x's and y's is proposed for testing the hypothesis f = g. Wilcoxon proposed an equivalent test in the Biometrics Bulletin, December, 1945, but gave only a few points of the distribution of his statistic. Under the hypothesis f = g the probability of obtaining a given U in a sample of n x's and m y's is the solution of a certain recurrence relation involving n and m. Using this recurrence relation tables have been computed giving the probability of U for samples up to n = m = 8. At this point the distribution is almost normal. From the recurrence relation explicit expressions for the mean, variance, and fourth moment are obtained. The 2rth moment is shown to have a certain form which enabled us to prove that the limit distribution is normal if m, n go to infinity in any arbitrary manner. The test is shown to be consistent with respect to the class of alternatives f(x) {\textgreater} g(x) for every x.},
|
||||
number = {1},
|
||||
journal = {The Annals of Mathematical Statistics},
|
||||
author = {Mann, H. B. and Whitney, D. R.},
|
||||
year = {1947},
|
||||
note = {Publisher: Institute of Mathematical Statistics},
|
||||
pages = {50 -- 60},
|
||||
}
|
||||
|
||||
@@ -303,13 +303,9 @@ To train a robust pricing learner, we need a simulator that can generate realist
|
||||
\subsubsection{Ground-Truth Separability}
|
||||
Because sessions are collected under controlled experimental conditions where each actor is assigned a known type at the start of the trial, labels $\theta_s \in \{H, A\}$ are available as ground truth rather than as the output of a heuristic classifier. We therefore estimate separate transition kernels directly from each labeled partition $\mathcal{D}_H$ and $\mathcal{D}_A$, treating the resulting $\hat{\mathcal{T}}_H$ and $\hat{\mathcal{T}}_A$ as the ground-truth behavioral profiles for each class. We then ask a direct methodological question: are the kernels separable enough to justify downstream pricing control that depends on that separability?
|
||||
|
||||
To answer this, we compute average KL divergence between transition probability matrices. This statistic gives global separability and event-level diagnostics at the same time. To test whether the observed between-class value exceeds finite-sample estimation noise, we compute an intra-class bootstrap baseline by repeatedly splitting $\mathcal{D}_H$ and $\mathcal{D}_A$ into two random halves, fitting a transition kernel on each half, and re-computing the same average KL statistic for each split.
|
||||
To answer this, we compute per-session KL divergence scores against both class-level centroids. For each session $s$ in either partition, we fit a session-level event transition kernel $\hat{\mathcal{T}}_s$ from that session's trajectory alone, then compute its average KL divergence to the human centroid ($\Delta_{H,s}$) and to the agent centroid ($\Delta_{A,s}$). The per-session separability score is the gap $\Delta_{H,s} - \Delta_{A,s}$: a negative value indicates proximity to human behavior, a positive value indicates proximity to agent behavior.
|
||||
|
||||
Formally, for $B$ bootstrap splits per class we obtain reference samples $\{d_{H,b}^{\text{intra}}\}_{b=1}^B$ and $\{d_{A,b}^{\text{intra}}\}_{b=1}^B$, then compare the between-class divergence $d^{\text{inter}}$ against the pooled null distribution. We report pooled mean and variance, lift ratio $d^{\text{inter}}/\mathbb{E}[d^{\text{intra}}]$, and the empirical one-sided p-value
|
||||
\begin{equation}
|
||||
\hat p = \frac{1 + \sum_{j=1}^{2B}\mathbf{1}\{d_j^{\text{intra}} \ge d^{\text{inter}}\}}{2B + 1},
|
||||
\end{equation}
|
||||
which gives a direct significance check for separability before using divergence-derived centroid control signals in pricing.
|
||||
Per-session KL divergences are right-skewed and bounded below by zero, so normality cannot be assumed for them or for the gap scores derived from them; we therefore do not use a Student's $t$-test. Instead we apply a Mann-Whitney $U$ test \parencite{mann_test_1947} on the per-session gap scores between the two groups. The Mann-Whitney test is a rank-based nonparametric test that compares the stochastic ordering of two independent samples without assuming any particular distributional form, making it appropriate for small samples drawn from skewed populations. We report $U$, the exact two-sided $p$-value, and group-level descriptive statistics for the gap scores.
|
||||
|
||||
\begin{definition}[Kullback-Leibler Divergence for Transition Distributions]
|
||||
Let $P_e$ and $Q_e$ be categorical distributions over destination states following event $e$, derived from human and agent trajectories respectively. The KL divergence between these distributions is:
|
||||
|
||||
@@ -10,26 +10,25 @@
|
||||
|
||||
\subsection{Behavioral Analysis}
|
||||
|
||||
The transition-kernel analysis is evaluated with both between-class divergence and an intra-class bootstrap null baseline. This allows us to separate real behavioral differences from finite-sample estimation noise and bias.
|
||||
Separability between human and agent sessions is evaluated by computing per-session divergence gap scores $\Delta_{H,s} - \Delta_{A,s}$ and comparing the two groups with a Mann-Whitney $U$ test. Table~\ref{tab:divergence_significance} reports the group-level descriptive statistics for the gap scores and the test result.
|
||||
|
||||
\begin{table}[ht]
|
||||
\centering
|
||||
\caption{Divergence significance using intra-class bootstrap baseline (B=100 per class).}
|
||||
\caption{Per-session divergence gap ($\Delta_H - \Delta_A$) by actor class with Mann-Whitney $U$ test.}
|
||||
\label{tab:divergence_significance}
|
||||
\begin{tabular}{lcccc}
|
||||
\begin{tabular}{lccc}
|
||||
\toprule
|
||||
Metric & Mean KL & Std & 5\% quantile & 95\% quantile \\
|
||||
Group & $n$ & Mean gap & Std \\
|
||||
\midrule
|
||||
Between-class (Human vs Agent) & 5.3067 & -- & -- & -- \\
|
||||
Human intra-class split & 2.5271 & 1.2501 & 0.6845 & 4.6015 \\
|
||||
Agent intra-class split & 1.2065 & 1.2607 & 0.2177 & 4.2345 \\
|
||||
Human sessions & 11 & $-3.3522$ & $2.6748$ \\
|
||||
Agent sessions & 6 & $+1.6482$ & $2.8349$ \\
|
||||
\midrule
|
||||
\multicolumn{4}{l}{Mann-Whitney $U = 2.0$, $p = 0.0006$ (two-sided)} \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
For this run ($n_H=11$, $n_A=7$, $B=100$), the empirical p-value is $0.0149$, both computed as defined in Section~\ref{sec:tpe}. This places the between-class divergence clearly above the intra-class null and supports the use of divergence-derived contamination signals in downstream pricing control.
|
||||
|
||||
% TODO: instead could we do a simple t test to see the difference in the means in some way? That way we can yield a P value
|
||||
The sign structure is consistent with the theoretical expectation: on average, human sessions produce negative gap scores (closer to the human centroid than to the agent centroid) while agent sessions produce positive gap scores (closer to the agent centroid). With $U=2.0$ out of $n_H n_A = 66$ possible human--agent pairs ($n_H=11$, $n_A=6$), the rank separation between the groups is nearly complete, and the two-sided $p$-value of $0.0006$ provides strong evidence that the transition kernels are separable enough to justify their use as a control signal in downstream pricing.
|
||||
|
||||
|
||||
\subsection{Experimental Outcomes}
|
||||
@@ -54,6 +53,6 @@ This comparison isolates the effect of robustness terms from model capacity and
|
||||
|
||||
|
||||
\subsection{Interpretation and Insights}
|
||||
Between-class divergence substantially above the intra-class null indicates that the two actor classes are behaviorally separable at the transition-kernel level. In pricing experiments, this is the condition required for separability to act as a useful control signal rather than just an auxiliary classifier score.
|
||||
The Mann-Whitney result ($U=2.0$, $p<0.001$) confirms that per-session divergence gaps separate the two actor classes with near-zero overlap in rank ordering. This is the condition required for separability to act as a useful control signal in the pricing loop rather than just an auxiliary classifier score.
|
||||
|
||||
\subsection{Anomalies}
|
||||
|
||||
@@ -2,6 +2,7 @@ import os
|
||||
import json
|
||||
from pydantic import BaseModel as Base
|
||||
|
||||
|
||||
class PayloadModel(Base):
|
||||
sessionId: str
|
||||
experimentId: str | None
|
||||
@@ -13,6 +14,7 @@ class PayloadModel(Base):
|
||||
userAgent: str
|
||||
ts: str
|
||||
|
||||
|
||||
class ValueModel(Base):
|
||||
payload: PayloadModel
|
||||
encoding: str
|
||||
@@ -20,6 +22,7 @@ class ValueModel(Base):
|
||||
schemaId: int
|
||||
size: int
|
||||
|
||||
|
||||
class InteractionModel(Base):
|
||||
partitionID: int
|
||||
offset: int
|
||||
@@ -30,14 +33,17 @@ class InteractionModel(Base):
|
||||
key: dict
|
||||
value: ValueModel
|
||||
|
||||
|
||||
def _is_admin(page: str | None) -> bool:
|
||||
return page is not None and page.startswith("/admin/")
|
||||
|
||||
|
||||
class Loader:
|
||||
def __init__(self, src_dir: str):
|
||||
self.src_dir = src_dir
|
||||
self.entries = os.listdir(src_dir)
|
||||
if not self.entries: raise ValueError("empty directory")
|
||||
if not self.entries:
|
||||
raise ValueError("empty directory")
|
||||
self.data = self._load_sessions()
|
||||
|
||||
def _load_sessions(self) -> dict:
|
||||
@@ -55,16 +61,21 @@ class Loader:
|
||||
def get_entries(self) -> tuple[list[str], int]:
|
||||
return self.entries, len(self.entries)
|
||||
|
||||
|
||||
class AgentLoader(Loader):
|
||||
def _load_sessions(self) -> dict:
|
||||
sessions = {}
|
||||
for entry in self.entries:
|
||||
with open(f"{self.src_dir}/{entry}/int.json") as f:
|
||||
path = f"{self.src_dir}/{entry}/int.json"
|
||||
if not os.path.isfile(path):
|
||||
continue
|
||||
with open(path) as f:
|
||||
raw = json.load(f)
|
||||
ints = [PayloadModel(**i) for i in raw]
|
||||
sessions[entry] = [i for i in ints if not _is_admin(i.page)]
|
||||
return sessions
|
||||
|
||||
|
||||
class JointLoader:
|
||||
def __init__(self, human_dir: str, agent_dir: str):
|
||||
self.human_loader = Loader(human_dir)
|
||||
@@ -74,10 +85,14 @@ class JointLoader:
|
||||
|
||||
def _merge(self) -> dict:
|
||||
return {
|
||||
**{f"human_{sid}": [e.value.payload for e in evts]
|
||||
for sid, evts in self.human_loader.get_data().items()},
|
||||
**{f"agent_{sid}": evts
|
||||
for sid, evts in self.agent_loader.get_data().items()}
|
||||
**{
|
||||
f"human_{sid}": [e.value.payload for e in evts]
|
||||
for sid, evts in self.human_loader.get_data().items()
|
||||
},
|
||||
**{
|
||||
f"agent_{sid}": evts
|
||||
for sid, evts in self.agent_loader.get_data().items()
|
||||
},
|
||||
}
|
||||
|
||||
def get_data(self) -> dict:
|
||||
@@ -86,12 +101,17 @@ class JointLoader:
|
||||
def get_entries(self) -> tuple[list[str], int]:
|
||||
return self.entries, len(self.entries)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
agent_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/agents/collected_data/"
|
||||
human_dir = "/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
|
||||
human_dir = (
|
||||
"/home/velocitatem/Documents/Projects/PHANTOM/experiments/collected_data/"
|
||||
)
|
||||
|
||||
for name, cls, path in [("agent", AgentLoader, agent_dir),
|
||||
("human", Loader, human_dir),
|
||||
("joint", lambda d: JointLoader(human_dir, d), agent_dir)]:
|
||||
for name, cls, path in [
|
||||
("agent", AgentLoader, agent_dir),
|
||||
("human", Loader, human_dir),
|
||||
("joint", lambda d: JointLoader(human_dir, d), agent_dir),
|
||||
]:
|
||||
ldr = cls(path) if name != "joint" else cls(agent_dir)
|
||||
print(f"Loaded {len(ldr.get_entries()[0])} {name} sessions")
|
||||
|
||||
@@ -260,6 +260,29 @@ def _avg_event_kl(
|
||||
return float(np.mean([kl_divergence(src_evt[e], dst_evt[e]) for e in common]))
|
||||
|
||||
|
||||
def per_session_divergence(
    model: BehaviorModel,
    reference_evt: Dict[str, Dict[str, float]],
) -> List[float]:
    """Score each session of ``model`` against a reference event-level kernel.

    For every session in ``model.data`` holding at least two events, a
    single-session MDP is built, its event-level transitions are aggregated,
    and the score is the mean KL divergence (session -> reference) over the
    events present in both the session and ``reference_evt``.

    Sessions with fewer than two events are skipped, so the returned list can
    be shorter than ``model.data``. A session that shares no event with the
    reference contributes a score of ``0.0``.

    Returns:
        One float per retained session, in ``model.data`` iteration order.
    """
    divergences = []
    for session_id, events in model.data.items():
        # A single event (or none) yields no transition to compare.
        if len(events) < 2:
            continue
        session_evt = aggregate_event_transitions(
            _build_subset_mdp(model, [session_id])
        )
        shared = set(session_evt.keys()) & set(reference_evt.keys())
        if shared:
            per_event = [
                kl_divergence(session_evt[e], reference_evt[e]) for e in shared
            ]
            value = float(np.mean(per_event))
        else:
            # No overlapping events: fall back to a zero score.
            value = 0.0
        divergences.append(value)
    return divergences
|
||||
|
||||
|
||||
def bootstrap_intra_class_divergence(
|
||||
model: BehaviorModel,
|
||||
n_bootstrap: int = 100,
|
||||
@@ -412,3 +435,36 @@ if __name__ == "__main__":
|
||||
f" Lift vs pooled intra mean: {inter_class_avg / max(float(np.mean(pooled_null)), 1e-10):.2f}x"
|
||||
)
|
||||
print(f" Empirical p-value (inter > intra): {p_empirical:.4f}")
|
||||
|
||||
# per-session divergence scores: delta_H - delta_A per session (positive means closer to agent behavior)
|
||||
from scipy.stats import mannwhitneyu
|
||||
|
||||
human_dH = per_session_divergence(
|
||||
human_model, human_evt
|
||||
) # human session vs human centroid
|
||||
human_dA = per_session_divergence(
|
||||
human_model, agent_evt
|
||||
) # human session vs agent centroid
|
||||
agent_dH = per_session_divergence(
|
||||
agent_model, human_evt
|
||||
) # agent session vs human centroid
|
||||
agent_dA = per_session_divergence(
|
||||
agent_model, agent_evt
|
||||
) # agent session vs agent centroid
|
||||
# score = delta_H - delta_A: high means far from humans, close to agents
|
||||
n_h = min(len(human_dH), len(human_dA))
|
||||
n_a = min(len(agent_dH), len(agent_dA))
|
||||
human_diff = [human_dH[i] - human_dA[i] for i in range(n_h)]
|
||||
agent_diff = [agent_dH[i] - agent_dA[i] for i in range(n_a)]
|
||||
print(f"\nPer-session divergence gap (delta_H - delta_A):")
|
||||
print(
|
||||
f" Human sessions (n={n_h}): mean={np.mean(human_diff):.4f}, std={np.std(human_diff):.4f}"
|
||||
)
|
||||
print(
|
||||
f" Agent sessions (n={n_a}): mean={np.mean(agent_diff):.4f}, std={np.std(agent_diff):.4f}"
|
||||
)
|
||||
if n_h >= 2 and n_a >= 2:
|
||||
U, mw_p = mannwhitneyu(human_diff, agent_diff, alternative="two-sided")
|
||||
print(f" Mann-Whitney U={U:.1f}, p={mw_p:.4f}")
|
||||
else:
|
||||
print(" Insufficient sessions for Mann-Whitney test")
|
||||
|
||||
Reference in New Issue
Block a user