Airflow addition (#28)

* introducing airflow to run pipeline

* chore: updating dag with upload to registry

* introducing complete provider (non refactored and noisy)

* chore: removing old shit

* generic pricing baselines

* feature: super simple model registry (to be updated maybe third party OS software)

* chore: refactoring the providers docker config and requirements

* chore: refactored and broke down components (breaking)

* exporting all

* local pipeline execution working

* fix: fixing import structures from nonrelativistic

* chore: enables cross comm pickling with fully e2e pipeline compilation

* docs: what the pipeline is like now

* pipelines local running and pipeline high level definition

* cleaning old pipeline and vectorization

* leaked but fixing, not so important

* test: started with pipeline step testing

* chore: cleaning up provider of prices

* test: extra tests with semantic meaning checks

* migrating pricers

* feature: introducing pricing predictors (pricers)

* chore: e2e is done with new pipeline

* extra session feature extraction

* feature: experimental session pricer and metrics (vibe)

* chore: redefined and connected pricers (#29)
This commit is contained in:
Daniel Alves Rösel
2025-11-29 17:50:16 +01:00
committed by GitHub
parent 2a0e44ab24
commit ad9423bf59
49 changed files with 3642 additions and 619 deletions

View File

@@ -0,0 +1,5 @@
# Package-level re-exports: the abstract provider interface plus the two
# concrete provider classes, so callers can import them from the package root.
from procesing.providers.base import DataProvider
from procesing.providers.supabase import SupabaseProvider
from procesing.providers.backend import BackendAPIProvider
# Names exposed by a star-import of this package.
__all__ = ['DataProvider', 'SupabaseProvider', 'BackendAPIProvider']

View File

@@ -0,0 +1,19 @@
import os
import pandas as pd
import requests
from typing import List
from procesing.providers.base import DataProvider
class BackendAPIProvider(DataProvider):
    """Data provider that reads Kafka topic dumps from the backend HTTP API.

    NOTE(review): this class only defines ``fetch_kafka_topic``. If
    ``DataProvider`` declares further abstract methods, instantiating this
    class directly raises ``TypeError`` -- confirm whether it is meant to be
    combined with another provider or needs the remaining methods.
    """

    # Class-level default so instances created without running __init__
    # (e.g. unpickled from an older version) still have a usable timeout.
    timeout = 60.0

    def __init__(self, backend_url: str = None, timeout: float = 60.0):
        """Configure the backend endpoint.

        Args:
            backend_url: Base URL of the backend. When falsy, the
                ``BACKEND_URL`` environment variable is used, falling back
                to ``http://localhost:5000``.
            timeout: Seconds to wait for the backend before
                ``requests`` raises a timeout error. Pass ``None`` to wait
                indefinitely (the previous behaviour).
        """
        self.backend_url = backend_url or os.getenv("BACKEND_URL", "http://localhost:5000")
        self.timeout = timeout

    def fetch_kafka_topic(self, topic: str) -> pd.DataFrame:
        """Fetch the dump of ``topic`` from ``/api/kafka/dump``.

        Args:
            topic: Topic name, sent as the ``topic`` query parameter.

        Returns:
            A DataFrame built from the response's ``data`` field, or an
            empty DataFrame when the response has a falsy ``success`` or
            ``data`` field.

        Raises:
            requests.HTTPError: If the backend answers with an error status.
            requests.Timeout: If the backend does not answer within
                ``self.timeout`` seconds.
        """
        # Strip a trailing slash so a configured "http://host/" does not
        # produce "http://host//api/...".
        endpoint = f"{self.backend_url.rstrip('/')}/api/kafka/dump"
        # Passing the topic via ``params`` lets requests URL-encode it;
        # interpolating it into the URL would break on '&', '#', spaces, etc.
        resp = requests.get(endpoint, params={"topic": topic}, timeout=self.timeout)
        resp.raise_for_status()
        payload = resp.json()
        if not payload.get('success') or not payload.get('data'):
            return pd.DataFrame()
        return pd.DataFrame(payload['data'])

View File

@@ -0,0 +1,21 @@
from abc import ABC, abstractmethod
from typing import List
import pandas as pd
class DataProvider(ABC):
    """Abstract data-access interface.

    Concrete providers implement the three fetch methods below; depending on
    this interface (rather than a concrete class) allows dependency injection
    and substituting a fake provider in tests.
    """

    @abstractmethod
    def fetch_products(self, store_mode: str) -> pd.DataFrame:
        """Return the product catalog for ``store_mode`` as a DataFrame."""

    @abstractmethod
    def fetch_experiments(self, experiment_ids: List[str]) -> pd.DataFrame:
        """Return experiment metadata for ``experiment_ids`` as a DataFrame."""

    @abstractmethod
    def fetch_kafka_topic(self, topic: str) -> pd.DataFrame:
        """Return the data of Kafka ``topic`` (via the backend API) as a DataFrame."""

View File

@@ -0,0 +1,35 @@
import os
import pandas as pd
import requests
from typing import List
from supabase import create_client, Client
from procesing.providers.base import DataProvider
from dotenv import load_dotenv
class SupabaseProvider(DataProvider):
    """Provider that reads products and experiments from Supabase tables.

    NOTE(review): ``fetch_kafka_topic`` is not defined in this class; if the
    base class declares it abstract, direct instantiation raises
    ``TypeError`` -- confirm how this provider is meant to be used.
    """

    def __init__(self,
                 supabase_url: str = None,
                 supabase_key: str = None):
        """Create the Supabase client.

        Args:
            supabase_url: Project URL. When falsy, read from the
                ``NEXT_PUBLIC_SUPABASE_URL`` environment variable.
            supabase_key: API key. When falsy, read from the
                ``NEXT_PUBLIC_SUPABASE_ANON_KEY`` environment variable.
        """
        # Runs before the os.getenv lookups so anything it loads is visible to them.
        load_dotenv()
        url = supabase_url or os.getenv("NEXT_PUBLIC_SUPABASE_URL")
        key = supabase_key or os.getenv("NEXT_PUBLIC_SUPABASE_ANON_KEY")
        self.supabase_url = url
        self.supabase_key = key
        self.supabase: Client = create_client(url, key)

    def fetch_products(self, store_mode: str) -> pd.DataFrame:
        """Return rows of the ``<store_mode>_products`` table.

        Selects the id, room_type, date_index, metadata and availability
        columns; returns an empty DataFrame when the query yields no rows.
        """
        table_name = f'{store_mode}_products'
        columns = "id, room_type, date_index, metadata, availability"
        result = self.supabase.table(table_name).select(columns).execute()
        if not result.data:
            return pd.DataFrame()
        return pd.DataFrame(result.data)

    def fetch_experiments(self, experiment_ids: List[str]) -> pd.DataFrame:
        """Return rows of the ``experiments`` table whose id is in ``experiment_ids``.

        Each row also carries the related ``tasks`` record under ``task``.
        Returns an empty DataFrame, without querying, when ``experiment_ids``
        is empty, and likewise when the query yields no rows.
        """
        if not experiment_ids:
            return pd.DataFrame()
        columns = (
            'id, subject_name, xp_human_only, xp_market_mode, xp_task_id, '
            'task:tasks(task_name, task_description, task_def_of_done)'
        )
        query = self.supabase.table('experiments').select(columns)
        result = query.in_('id', experiment_ids).execute()
        if not result.data:
            return pd.DataFrame()
        return pd.DataFrame(result.data)