Airflow addition (#28)

* introducing airflow to run pipeline * chore: updating dag with upload to registry * introducing complete provider (non refactored and noisy) * chore: removing old shit * generic pricing baselines * feature: super simple model registry (to be updated maybe third party OS software) * chore: refactoring the providers docker config and requirements * chore: refactored and broke down components (braking * exporting all * local pipeline excution working * fix: fixing import structures from nonrelativistic * chore: enables cross comm pickling with fully e2e pipeline compilation * docs: what the pipeline is like now * pipelines local running and pipeline high level definition * cleaning old pipeline and vectorization * leaked but fixing, not so important * test: started with pipeline step testing * chore: cleaning up provider of prices * test: extra tests wit hsemantic meaning checks * migrating pricers * feature: introducing pricing predictors (pricers) * chore: e2e is done with new pipeline * extra session feature extraction * feature: experiemntal sessin pricer and metrics(vibe) * chore: redefined and connected pricers (#29)
2026-07-16 01:53:37 +00:00 · 2025-11-29 17:50:16 +01:00
parent 2a0e44ab24
commit ad9423bf59
49 changed files with 3642 additions and 619 deletions
--- a/experiments/procesing/steps/demand.py
+++ b/experiments/procesing/steps/demand.py
@@ -0,0 +1,61 @@
+import pandas as pd
+from procesing.steps.base import BaseContextStep
+
+class ComputeDemandStep(BaseContextStep):
+    """
+    Compute demand vector for a single time window or dataframe.
+    Input: single chunk dict OR raw dataframe
+    Output: demand dataframe with [productId, demand_score]
+    """
+
+    def transform(self, chunk):
+        # handle both chunk dict and raw dataframe
+        if isinstance(chunk, dict):
+            interactions = chunk['data']
+            window_meta = {k: v for k, v in chunk.items() if k != 'data'}
+        else:
+            interactions = chunk
+            window_meta = {}
+
+        products = self.context.products
+        unique_products = products['id'].unique()
+
+        # apply filters if configured
+        session_filter = self.context.config.get('session_filter')
+        experiment_filter = self.context.config.get('experiment_filter')
+
+        if session_filter and 'sessionId' in interactions.columns:
+            interactions = interactions[interactions['sessionId'] == session_filter]
+        if experiment_filter and 'experimentId' in interactions.columns:
+            interactions = interactions[interactions['experimentId'] == experiment_filter]
+
+        interactions_with_products = interactions.dropna(subset=['productId'])
+
+        if interactions_with_products.empty:
+            demand_df = pd.DataFrame({
+                'productId': unique_products,
+                'demand_score': 0
+            })
+        else:
+            # crosstab for simple demand count
+            demand_df = pd.crosstab(
+                interactions_with_products['productId'],
+                'count'
+            ).reindex(unique_products, fill_value=0).reset_index()
+            demand_df.columns = ['productId', 'demand_score']
+
+        # attach window metadata if present
+        if window_meta:
+            return {**window_meta, 'demand_vector': demand_df}
+        return demand_df
+
+
+class ComputeDemandForChunksStep(BaseContextStep):
+    """Apply ComputeDemandStep to list of chunks"""
+
+    def transform(self, chunks: list):
+        if not chunks:
+            return []
+
+        demand_step = ComputeDemandStep(self.context)
+        return [demand_step.transform(chunk) for chunk in chunks]