Airflow addition (#28)

* introducing airflow to run pipeline * chore: updating dag with upload to registry * introducing complete provider (non refactored and noisy) * chore: removing old shit * generic pricing baselines * feature: super simple model registry (to be updated maybe third party OS software) * chore: refactoring the providers docker config and requirements * chore: refactored and broke down components (breaking) * exporting all * local pipeline execution working * fix: fixing import structures from nonrelativistic * chore: enables cross comm pickling with fully e2e pipeline compilation * docs: what the pipeline is like now * pipelines local running and pipeline high level definition * cleaning old pipeline and vectorization * leaked but fixing, not so important * test: started with pipeline step testing * chore: cleaning up provider of prices * test: extra tests with semantic meaning checks * migrating pricers * feature: introducing pricing predictors (pricers) * chore: e2e is done with new pipeline * extra session feature extraction * feature: experimental session pricer and metrics (vibe) * chore: redefined and connected pricers (#29)
2026-07-16 01:53:37 +00:00 · 2025-11-29 17:50:16 +01:00
parent 2a0e44ab24
commit ad9423bf59
49 changed files with 3642 additions and 619 deletions
--- a/experiments/procesing/tests/test_augement.py
+++ b/experiments/procesing/tests/test_augement.py
@@ -0,0 +1,45 @@
+import pytest
+import random
+import pandas as pd
+from procesing.steps import (
+    CreatePriceBucketsStep,
+    AugmentEventNamesStep
+)
+
+def test_bucketing(pipeline_context):  # checks that CreatePriceBucketsStep.transform adds a categorical 'price_bucket' column
+    step = CreatePriceBucketsStep(context=pipeline_context)
+
+    # 100 distinct integer prices sampled from [10, 1000); no seed is set here, so values vary per run
+    df = pd.DataFrame({
+        'metadata_price': random.sample(range(10, 1000), 100)
+    })
+    result = step.transform(df)
+    assert 'price_bucket' in result.columns
+    # the new column must carry a categorical dtype
+    assert isinstance(result['price_bucket'].dtype, pd.CategoricalDtype)
+    assert result['price_bucket'].nunique() == 3 # 3 buckets expected; presumably configured on the pipeline_context fixture - confirm
+    # every bucket must be populated and bucket sizes may differ by at most 10 rows
+    counts = result['price_bucket'].value_counts()
+    assert all(counts > 0)
+    assert counts.max() - counts.min() <= 10  # roughly equal distribution for 100 samples
+    # Empty input (no columns at all): the column must still be added and the result must have no rows
+    df = pd.DataFrame()
+    result = step.transform(df)
+    assert 'price_bucket' in result.columns
+    assert result.empty
+
+
+def test_augment_names(pipeline_context):  # checks AugmentEventNamesStep.transform rewrites 'eventName' only when productId and price_bucket are both set
+    df = pd.DataFrame({
+        'eventName': ['click', 'view', 'purchase'],
+        'productId': ['prod_1', 'prod_2', None],  # last row has no product
+        'price_bucket': ['PB_1', None, 'PB_3']  # middle row has no bucket
+    })
+    step = AugmentEventNamesStep(context=pipeline_context)
+    result = step.transform(df)
+    expected_event_names = [
+        'click_prod_1@PB_1',  # both present -> '<eventName>_<productId>@<price_bucket>'
+        'view',  # price_bucket missing -> name left unchanged
+        'purchase'  # productId missing -> name left unchanged
+    ]
+    assert result['eventName'].tolist() == expected_event_names