Airflow addition (#28)

* introducing airflow to run pipeline * chore: updating dag with upload to registry * introducing complete provider (non refactored and noisy) * chore: removing old shit * generic pricing baselines * feature: super simple model registry (to be updated maybe third party OS software) * chore: refactoring the providers docker config and requirements * chore: refactored and broke down components (braking * exporting all * local pipeline execution working * fix: fixing import structures from nonrelativistic * chore: enables cross comm pickling with fully e2e pipeline compilation * docs: what the pipeline is like now * pipelines local running and pipeline high level definition * cleaning old pipeline and vectorization * leaked but fixing, not so important * test: started with pipeline step testing * chore: cleaning up provider of prices * test: extra tests with semantic meaning checks * migrating pricers * feature: introducing pricing predictors (pricers) * chore: e2e is done with new pipeline * extra session feature extraction * feature: experimental session pricer and metrics (vibe) * chore: redefined and connected pricers (#29)
2026-07-16 01:53:37 +00:00 · 2025-11-29 17:50:16 +01:00
parent 2a0e44ab24
commit ad9423bf59
49 changed files with 3642 additions and 619 deletions
--- a/experiments/procesing/providers/init.py
+++ b/experiments/procesing/providers/init.py
@@ -0,0 +1,5 @@
+from procesing.providers.base import DataProvider
+from procesing.providers.supabase import SupabaseProvider
+from procesing.providers.backend import BackendAPIProvider
+
+__all__ = ['DataProvider', 'SupabaseProvider', 'BackendAPIProvider']
--- a/experiments/procesing/providers/backend.py
+++ b/experiments/procesing/providers/backend.py
@@ -0,0 +1,19 @@
+import os
+import pandas as pd
+import requests
+from typing import List
+from procesing.providers.base import DataProvider
+
+class BackendAPIProvider(DataProvider):
+    """Data provider that reads Kafka topic dumps from the backend HTTP API.
+
+    Only ``fetch_kafka_topic`` is implemented in this class.
+    NOTE(review): the other ``DataProvider`` abstract methods are not
+    overridden here -- confirm how instances are meant to be constructed.
+    """
+
+    # Default request timeout in seconds. ``requests`` waits indefinitely
+    # when no timeout is passed, which would stall the caller on a hung backend.
+    timeout: float = 30.0
+
+    def __init__(self, backend_url: str = None, timeout: float = None):
+        """Store the backend base URL and an optional request timeout.
+
+        Args:
+            backend_url: Base URL of the backend. When falsy, the
+                ``BACKEND_URL`` environment variable is used, and failing
+                that ``http://localhost:5000``.
+            timeout: Seconds to wait for the backend. ``None`` keeps the
+                class-level default.
+        """
+        self.backend_url = backend_url or os.getenv("BACKEND_URL", "http://localhost:5000")
+        if timeout is not None:
+            self.timeout = timeout
+
+    def fetch_kafka_topic(self, topic: str) -> pd.DataFrame:
+        """Return the dumped records of ``topic`` as a DataFrame.
+
+        An empty DataFrame is returned when the JSON payload's ``success``
+        or ``data`` entry is missing or falsy.
+
+        Raises:
+            requests.HTTPError: If the backend answers with an error status.
+            requests.Timeout: If the backend does not respond within
+                ``self.timeout`` seconds.
+        """
+        # Hand the topic over as a query parameter so it gets URL-encoded
+        # instead of being spliced raw into the query string.
+        resp = requests.get(
+            f"{self.backend_url}/api/kafka/dump",
+            params={"topic": topic},
+            timeout=self.timeout,
+        )
+        resp.raise_for_status()
+        data = resp.json()
+
+        if not data.get('success') or not data.get('data'):
+            return pd.DataFrame()
+
+        return pd.DataFrame(data['data'])
--- a/experiments/procesing/providers/base.py
+++ b/experiments/procesing/providers/base.py
@@ -0,0 +1,21 @@
+from abc import ABC, abstractmethod
+from typing import List
+import pandas as pd
+
+class DataProvider(ABC):
+    """Contract that every data source has to satisfy.
+
+    Hiding data access behind this interface lets callers receive a provider
+    through dependency injection and substitute a fake one in tests.
+    """
+
+    @abstractmethod
+    def fetch_products(self, store_mode: str) -> pd.DataFrame:
+        """Return the product catalog belonging to ``store_mode``."""
+
+    @abstractmethod
+    def fetch_experiments(self, experiment_ids: List[str]) -> pd.DataFrame:
+        """Return experiment metadata for the requested ``experiment_ids``."""
+
+    @abstractmethod
+    def fetch_kafka_topic(self, topic: str) -> pd.DataFrame:
+        """Return the data of a Kafka ``topic``, obtained via the backend API."""
--- a/experiments/procesing/providers/supabase.py
+++ b/experiments/procesing/providers/supabase.py
@@ -0,0 +1,35 @@
+import os
+import pandas as pd
+import requests
+from typing import List
+from supabase import create_client, Client
+from procesing.providers.base import DataProvider
+from dotenv import load_dotenv
+
+class SupabaseProvider(DataProvider):
+    """Data provider that reads products and experiments from Supabase tables."""
+
+    def __init__(self, supabase_url: str = None, supabase_key: str = None):
+        """Create the Supabase client used by the fetch methods.
+
+        Explicit arguments take precedence. A falsy argument is replaced by
+        the ``NEXT_PUBLIC_SUPABASE_URL`` / ``NEXT_PUBLIC_SUPABASE_ANON_KEY``
+        environment variable, read after ``load_dotenv()`` has run.
+        """
+        load_dotenv()
+        if not supabase_url:
+            supabase_url = os.getenv("NEXT_PUBLIC_SUPABASE_URL")
+        if not supabase_key:
+            supabase_key = os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY")
+        self.supabase_url = supabase_url
+        self.supabase_key = supabase_key
+        self.supabase: Client = create_client(self.supabase_url, self.supabase_key)
+
+    def fetch_products(self, store_mode: str) -> pd.DataFrame:
+        """Load the selected columns of the ``<store_mode>_products`` table.
+
+        Returns an empty DataFrame when the query yields no rows.
+        """
+        columns = "id, room_type, date_index, metadata, availability"
+        query = self.supabase.table(f'{store_mode}_products').select(columns)
+        rows = query.execute().data
+        if rows:
+            return pd.DataFrame(rows)
+        return pd.DataFrame()
+
+    def fetch_experiments(self, experiment_ids: List[str]) -> pd.DataFrame:
+        """Load experiment rows (with their joined task fields) by id.
+
+        Returns an empty DataFrame, without querying, when ``experiment_ids``
+        is empty; also returns an empty DataFrame when no row matches.
+        """
+        # Nothing requested: skip the round trip entirely.
+        if not experiment_ids:
+            return pd.DataFrame()
+
+        columns = (
+            'id, subject_name, xp_human_only, xp_market_mode, xp_task_id, '
+            'task:tasks(task_name, task_description, task_def_of_done)'
+        )
+        query = self.supabase.table('experiments').select(columns).in_('id', experiment_ids)
+        rows = query.execute().data
+        if rows:
+            return pd.DataFrame(rows)
+        return pd.DataFrame()