Airflow addition (#28)

* introducing airflow to run pipeline * chore: updating dag with upload to registry * introducing complete provider (non refactored and noisy) * chore: removing old shit * generic pricing baselines * feature: super simple model registry (to be updated maybe third party OS software) * chore: refactoring the providers docker config and requirements * chore: refactored and broke down components (breaking) * exporting all * local pipeline execution working * fix: fixing import structures from nonrelativistic * chore: enables cross comm pickling with fully e2e pipeline compilation * docs: what the pipeline is like now * pipelines local running and pipeline high level definition * cleaning old pipeline and vectorization * leaked but fixing, not so important * test: started with pipeline step testing * chore: cleaning up provider of prices * test: extra tests with semantic meaning checks * migrating pricers * feature: introducing pricing predictors (pricers) * chore: e2e is done with new pipeline * extra session feature extraction * feature: experimental session pricer and metrics (vibe) * chore: redefined and connected pricers (#29)
2026-07-16 01:53:37 +00:00 · 2025-11-29 17:50:16 +01:00
parent 2a0e44ab24
commit ad9423bf59
49 changed files with 3642 additions and 619 deletions
--- a/experiments/procesing/elasticity.py
+++ b/experiments/procesing/elasticity.py
@@ -130,25 +130,24 @@ class TemporalElasticityEstimator(BaseEstimator, TransformerMixin):

    def _build_product_timeseries(self, aligned_chunks):
        """Build time series [price, quantity] per product."""
-        series_by_product = {}
-
+        # vectorize chunk merging instead of iterating rows
+        all_merged = []
        for chunk in aligned_chunks:
-            demand_df = chunk['demand']
-            price_df = chunk['prices']
+            merged = chunk['demand'].merge(chunk['prices'], on='productId', how='inner')
+            merged['timestamp'] = chunk['window_start']
+            all_merged.append(merged[['productId', 'timestamp', 'price', 'demand_score']])

-            # merge on productId
-            merged = demand_df.merge(price_df, on='productId', how='inner')
+        if not all_merged:
+            return {}

-            for _, row in merged.iterrows():
-                pid = row['productId']
-                if pid not in series_by_product:
-                    series_by_product[pid] = []
-
-                series_by_product[pid].append({
-                    'timestamp': chunk['window_start'],
-                    'price': row['price'],
-                    'quantity': row['demand_score']
-                })
+        # concat all chunks and group by productId in one pass
+        combined = pd.concat(all_merged, ignore_index=True)
+        series_by_product = {
+            pid: group[['timestamp', 'price', 'demand_score']].rename(
+                columns={'demand_score': 'quantity'}
+            ).to_dict('records')
+            for pid, group in combined.groupby('productId')
+        }

        return series_by_product