mirror of
https://github.com/velocitatem/PHANTOM.git
synced 2026-05-31 08:33:36 +00:00
166 lines
6.0 KiB
Python
166 lines
6.0 KiB
Python
import pytest
|
|
import pandas as pd
|
|
import numpy as np
|
|
from procesing.steps.session import (
|
|
TemporalFeatureStep,
|
|
BehavioralFeatureStep,
|
|
ProductFeatureStep,
|
|
UserAgentFeatureStep,
|
|
ExtractSessionFeaturesStep,
|
|
JoinLabelsStep,
|
|
ValidateDataStep,
|
|
)
|
|
|
|
|
|
# TemporalFeatureStep tests
|
|
def test_temporal_empty(pipeline_context):
    """An empty input frame yields an empty result that still has a ``sessionId`` column."""
    step = TemporalFeatureStep(pipeline_context)
    features = step.transform(pd.DataFrame())

    assert 'sessionId' in features.columns
    assert features.empty
|
|
|
|
|
|
def test_temporal_basic(pipeline_context, session_interactions):
    """Temporal output has the expected columns and counts every input interaction."""
    step = TemporalFeatureStep(pipeline_context)
    features = step.transform(session_interactions)

    for column in ('session_duration_sec', 'interaction_velocity', 'max_velocity_5min'):
        assert column in features.columns

    # Summed over all sessions, the per-session totals must equal the input row count.
    counted = features['total_interactions'].sum()
    assert counted == len(session_interactions)
|
|
|
|
|
|
def test_temporal_timeout(pipeline_context):
    """Two events separated by more than ``timeout_sec`` give a session duration of 0."""
    # Same session, timestamps one hour apart: far beyond the 900-second timeout below.
    timestamps = ['2025-01-01T10:00:00Z', '2025-01-01T11:00:00Z']
    events = pd.DataFrame({
        'sessionId': ['s1', 's1'],
        'ts': timestamps,
    })

    step = TemporalFeatureStep(pipeline_context, timeout_sec=900)
    features = step.transform(events)

    first_row = features.iloc[0]
    assert first_row['session_duration_sec'] == 0
|
|
|
|
|
|
# BehavioralFeatureStep tests
|
|
def test_behavioral_empty(pipeline_context):
    """An empty input frame still produces a result exposing ``sessionId``."""
    step = BehavioralFeatureStep(pipeline_context)
    features = step.transform(pd.DataFrame())

    assert 'sessionId' in features.columns
|
|
|
|
|
|
def test_behavioral_counts(pipeline_context, session_interactions):
    """Behavioral output has the count columns and its event totals match the input size."""
    step = BehavioralFeatureStep(pipeline_context)
    features = step.transform(session_interactions)

    for column in ('page_views', 'item_views', 'hover_events'):
        assert column in features.columns

    # Every input row should be accounted for in exactly one session's total.
    counted = features['total_events'].sum()
    assert counted == len(session_interactions)
|
|
|
|
|
|
def test_behavioral_hover_prefix(pipeline_context):
    """Both ``hover_over_*`` events of a single session are counted as hover events."""
    event_names = ['hover_over_custom', 'hover_over_button']
    events = pd.DataFrame({
        'sessionId': ['s1', 's1'],
        'eventName': event_names,
        'page': ['/products', '/products'],
    })

    step = BehavioralFeatureStep(pipeline_context)
    features = step.transform(events)

    first_row = features.iloc[0]
    assert first_row['hover_events'] == 2
|
|
|
|
|
|
# ProductFeatureStep tests
|
|
def test_product_empty(pipeline_context):
    """An empty input frame still produces a result exposing ``sessionId``."""
    step = ProductFeatureStep(pipeline_context)
    features = step.transform(pd.DataFrame())

    assert 'sessionId' in features.columns
|
|
|
|
|
|
def test_product_features(pipeline_context, session_interactions):
    """Product output has the expected columns and reports at least one viewed product."""
    step = ProductFeatureStep(pipeline_context)
    features = step.transform(session_interactions)

    for column in ('unique_products_viewed', 'price_range'):
        assert column in features.columns

    viewed_total = features['unique_products_viewed'].sum()
    assert viewed_total > 0
|
|
|
|
|
|
# UserAgentFeatureStep tests
|
|
def test_ua_empty(pipeline_context):
    """An empty input frame still produces a result exposing ``sessionId``."""
    step = UserAgentFeatureStep(pipeline_context)
    features = step.transform(pd.DataFrame())

    assert 'sessionId' in features.columns
|
|
|
|
|
|
def test_ua_headless_detection(pipeline_context):
    """A ``HeadlessChrome`` user agent is flagged headless; a regular Chrome one is not."""
    agents = pd.DataFrame({
        'sessionId': ['s1', 's2'],
        'userAgent': ['Mozilla/5.0 Chrome/120', 'HeadlessChrome/120'],
    })

    step = UserAgentFeatureStep(pipeline_context)
    features = step.transform(agents)
    assert 'is_headless' in features.columns

    # Map each session to its flag so the assertions do not depend on row order.
    flag_by_session = dict(zip(features['sessionId'], features['is_headless']))

    # Equality rather than identity/truthiness is kept on purpose: it is the
    # exact check the suite has always performed on these flag values.
    assert flag_by_session['s1'] == False  # noqa: E712
    assert flag_by_session['s2'] == True  # noqa: E712
|
|
|
|
|
|
def test_ua_browser_family(pipeline_context):
    """Firefox and Safari agents map to their family; an unrecognised one maps to ``Other``."""
    agents = pd.DataFrame({
        'sessionId': ['s1', 's2', 's3'],
        'userAgent': ['Mozilla/5.0 Firefox/120', 'Safari/605.1.15', 'Unknown'],
    })

    step = UserAgentFeatureStep(pipeline_context)
    features = step.transform(agents)

    # Map each session to its detected family so row order does not matter.
    family_by_session = dict(zip(features['sessionId'], features['browser_family']))

    expected_families = {'s1': 'Firefox', 's2': 'Safari', 's3': 'Other'}
    for session_id, family in expected_families.items():
        assert family_by_session[session_id] == family
|
|
|
|
|
|
def test_ua_automation_detection(pipeline_context):
    """A Selenium WebDriver agent is flagged as automation; a normal Chrome agent is not."""
    agents = pd.DataFrame({
        'sessionId': ['s1', 's2'],
        'userAgent': ['Selenium WebDriver', 'Normal Chrome/120'],
    })

    step = UserAgentFeatureStep(pipeline_context)
    features = step.transform(agents)

    # Map each session to its flag so the assertions do not depend on row order.
    flag_by_session = dict(zip(features['sessionId'], features['is_automation']))

    # Equality comparison retained deliberately; see the values, not their identity.
    assert flag_by_session['s1'] == True  # noqa: E712
    assert flag_by_session['s2'] == False  # noqa: E712
|
|
|
|
|
|
# ExtractSessionFeaturesStep tests
|
|
def test_extract_empty(pipeline_context):
    """Extracting session features from an empty frame gives an empty result."""
    step = ExtractSessionFeaturesStep(pipeline_context)
    features = step.transform(pd.DataFrame())

    assert features.empty
|
|
|
|
|
|
def test_extract_merges_all(pipeline_context, session_interactions):
    """The combined output carries one column from each feature group plus ``experimentId``."""
    step = ExtractSessionFeaturesStep(pipeline_context)
    features = step.transform(session_interactions)

    # One representative column per feature group checked by this suite.
    required_columns = (
        'session_duration_sec',
        'total_events',
        'unique_products_viewed',
        'is_headless',
    )
    for name in required_columns:
        assert name in features.columns

    assert 'experimentId' in features.columns
|
|
|
|
|
|
# JoinLabelsStep tests
|
|
def test_join_labels_tuple_input(pipeline_context):
    """A ``(features, experiments)`` tuple is accepted; a human-only experiment gives a non-agent label."""
    features = pd.DataFrame({
        'sessionId': ['s1'],
        'experimentId': ['exp1'],
        'total_events': [5],
    })
    experiments = pd.DataFrame({
        'id': ['exp1'],
        'xp_human_only': [True],
    })

    step = JoinLabelsStep(pipeline_context)
    labelled = step.transform((features, experiments))

    assert 'is_agent' in labelled.columns
    first_row = labelled.iloc[0]
    assert first_row['is_agent'] == False  # noqa: E712
|
|
|
|
|
|
def test_join_labels_empty_experiments(pipeline_context):
    """With an empty experiments frame, the ``is_agent`` label comes back as a missing value."""
    features = pd.DataFrame({
        'sessionId': ['s1'],
        'experimentId': ['exp1'],
    })
    no_experiments = pd.DataFrame()

    step = JoinLabelsStep(pipeline_context)
    labelled = step.transform((features, no_experiments))

    label = labelled.iloc[0]['is_agent']
    assert pd.isna(label)
|
|
|
|
|
|
# ValidateDataStep tests
|
|
def test_validate_empty(pipeline_context):
    """Validating an empty frame caches a report whose status is ``'empty'``."""
    step = ValidateDataStep(pipeline_context)
    step.transform(pd.DataFrame())

    # The step's return value is ignored; the report is read back from the context cache.
    cached_report = pipeline_context.get_cached('validation_report')
    assert cached_report['status'] == 'empty'
|
|
|
|
|
|
def test_validate_missing_cols(pipeline_context):
    """A frame without ``eventName`` is reported invalid, with that column listed as missing."""
    incomplete = pd.DataFrame({
        'sessionId': ['s1'],
        'ts': ['2025-01-01'],
    })

    step = ValidateDataStep(pipeline_context)
    step.transform(incomplete)

    # The step's return value is ignored; the report is read back from the context cache.
    cached_report = pipeline_context.get_cached('validation_report')
    assert cached_report['status'] == 'invalid'
    assert 'eventName' in cached_report['missing_cols']
|
|
|
|
|
|
def test_validate_valid(pipeline_context, session_interactions):
    """The fixture interactions validate as ``'valid'`` with a positive session count."""
    step = ValidateDataStep(pipeline_context)
    step.transform(session_interactions)

    # The step's return value is ignored; the report is read back from the context cache.
    cached_report = pipeline_context.get_cached('validation_report')
    assert cached_report['status'] == 'valid'
    assert cached_report['sessions'] > 0
|