Initial commit

This commit is contained in:
Daniel Alves Rösel
2026-04-02 18:47:14 +02:00
committed by GitHub
commit 90ad5e0260
94 changed files with 7797 additions and 0 deletions

0
ml/__init__.py Normal file
View File

View File

@@ -0,0 +1,6 @@
dataset_name: synthetic_classification
output_dir: ml/data/processed
train_samples: 2048
input_dim: 16
num_classes: 3
seed: 42

View File

@@ -0,0 +1,10 @@
input_dim: 16
hidden_dim: 64
num_classes: 3
learning_rate: 0.001
batch_size: 64
epochs: 5
log_every_n_steps: 20
seed: 42
tensorboard_dir: ml/tensorboard
weights_output: ml/models/weights/model.pt

5
ml/data/README.md Normal file
View File

@@ -0,0 +1,5 @@
# Data
Some thoughts on processing data: in a lot of cases you will not get the data in an S3 bucket or anything glamorous, and to do any modelling you need the data locally. But when you have, say, a 1 TB dataset, you may only want 10 GB of it locally; later you upload to a GPU-rich server, and there you will want all of the data. How can you manage this data well? What are the best practices?
Hugging Face lets you upload up to 300 GB of data into a dataset.

52
ml/data/etl.py Normal file
View File

@@ -0,0 +1,52 @@
import argparse
import json
from pathlib import Path
import torch
import yaml
def build_dataset(
    train_samples: int, input_dim: int, num_classes: int, seed: int
) -> dict[str, torch.Tensor]:
    """Create a reproducible synthetic classification dataset.

    Features are drawn from a standard normal distribution and labels are
    integers sampled from ``[0, num_classes)``. Both come from a single
    generator seeded with ``seed``, so equal arguments give equal tensors.

    Args:
        train_samples: Number of rows to generate.
        input_dim: Number of feature columns per row.
        num_classes: Exclusive upper bound for the integer labels.
        seed: Seed for the private random generator.

    Returns:
        A dict with ``"features"`` of shape ``(train_samples, input_dim)``
        and ``"labels"`` of shape ``(train_samples,)``.
    """
    rng = torch.Generator()
    rng.manual_seed(seed)

    # Features are sampled before labels; this order fixes the random stream.
    sample_matrix = torch.randn(train_samples, input_dim, generator=rng)
    class_ids = torch.randint(0, num_classes, (train_samples,), generator=rng)

    return dict(features=sample_matrix, labels=class_ids)
def main() -> None:
    """Build the synthetic dataset described by a YAML config and save it.

    Writes two files into the output directory: ``dataset.pt`` (the tensors
    returned by ``build_dataset``) and ``metadata.json`` (the settings used
    plus the dataset path).

    The output directory is resolved in this order: the ``--output`` CLI
    flag, the ``output_dir`` key of the config, then ``ml/data/processed``.

    Raises:
        KeyError: If a required key is missing from the config.
        OSError: If the config cannot be read or the outputs cannot be written.
    """
    parser = argparse.ArgumentParser(description="Build a synthetic training dataset")
    parser.add_argument("--config", default="ml/configs/data/default.yaml")
    parser.add_argument(
        "--output",
        default=None,
        help="Output directory (defaults to the config's output_dir)",
    )
    args = parser.parse_args()

    with open(args.config, "r", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)

    # CLI flag wins, then the config's own output_dir, then the built-in default.
    output_dir = Path(args.output or cfg.get("output_dir") or "ml/data/processed")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Parse the integer settings once so the dataset and its metadata agree.
    params = {
        key: int(cfg[key])
        for key in ("train_samples", "input_dim", "num_classes", "seed")
    }

    dataset = build_dataset(**params)
    dataset_path = output_dir / "dataset.pt"
    torch.save(dataset, dataset_path)

    metadata = {
        "dataset_name": cfg["dataset_name"],
        **params,
        "dataset_path": str(dataset_path),
    }
    with open(output_dir / "metadata.json", "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)


if __name__ == "__main__":
    main()

Binary file not shown.

View File

@@ -0,0 +1,8 @@
{
"dataset_name": "synthetic_classification",
"train_samples": 2048,
"input_dim": 16,
"num_classes": 3,
"seed": 42,
"dataset_path": "ml/data/processed/dataset.pt"
}

36
ml/inference.py Normal file
View File

@@ -0,0 +1,36 @@
import os
import torch
import torch.nn as nn
from fastapi import FastAPI
from pydantic import BaseModel
# TODO: Import model when ready
from models import * # TODO: SPECIFY
# Request body schema for POST /predict. No fields are declared yet.
# Documented with comments rather than a docstring so the generated schema
# description stays unchanged.
class InputData(BaseModel):
    ...
# Read the weights location at import time so a missing setting stops the
# service from starting instead of failing on the first request.
weights_path = os.environ.get("ML_LATEST_WEIGHTS_PATH")
if weights_path is None:
    raise RuntimeError("ML_LATEST_WEIGHTS_PATH not set")

# FastAPI app
app = FastAPI(
    title="ML Inference API",
    version="1.0.0",
)
# Liveness endpoint: always returns the same static payload.
# Comments are used instead of a docstring so the OpenAPI description of this
# route is left unchanged.
@app.get("/health")
def health_check():
    return dict(status="healthy", service="ml-inference")
# Placeholder prediction endpoint: the model is not wired in yet, so every
# request gets the constant prediction 0 regardless of the request body.
@app.post("/predict")
def predict(data: InputData):
    # TODO: x = torch.tensor([data.features], dtype=torch.float32)
    with torch.no_grad():
        # TODO: y = model(x)
        placeholder = torch.tensor(0)
    prediction = placeholder.tolist()
    return {"prediction": prediction}

15
ml/models/arch.py Normal file
View File

@@ -0,0 +1,15 @@
import torch
import torch.nn as nn
class Model(nn.Module):
    """Feed-forward classifier with one hidden ReLU layer.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of the hidden layer.
        num_classes: Number of output scores per sample.
    """

    def __init__(self, input_dim: int, hidden_dim: int, num_classes: int) -> None:
        super().__init__()
        # Layers are created in forward order. Their positions inside the
        # Sequential (0 and 2) become the parameter names in the state dict.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the unnormalised class scores for ``x``."""
        scores = self.net(x)
        return scores

98
ml/models/train.py Normal file
View File

@@ -0,0 +1,98 @@
import argparse
from pathlib import Path
import torch
import torch.nn as nn
import yaml
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.tensorboard import SummaryWriter
from alveslib import get_logger
from ml.models.arch import Model
logger = get_logger("ml-train")
class Trainer:
    """Train a model with Adam and cross-entropy, logging to TensorBoard.

    Args:
        model: Module to optimise; its parameters are handed to Adam.
        train_loader: Loader yielding ``(features, target)`` batches.
        learning_rate: Learning rate for the Adam optimiser.
        log_dir: Directory the TensorBoard ``SummaryWriter`` writes to.
        log_every_n_steps: Log the step loss on every batch whose index
            within the epoch is a multiple of this value. ``0`` disables
            per-step logging.
    """

    def __init__(
        self,
        model: nn.Module,
        train_loader: DataLoader,
        learning_rate: float,
        log_dir: str,
        log_every_n_steps: int,
    ) -> None:
        self.model = model
        self.train_loader = train_loader
        self.optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        self.criterion = nn.CrossEntropyLoss()
        self.writer = SummaryWriter(log_dir)
        # Global batch counter across epochs; used as the x-axis for step logs.
        self.step = 0
        self.log_every_n_steps = log_every_n_steps

    def train_epoch(self) -> float:
        """Run one pass over ``train_loader`` and return the mean batch loss.

        Returns:
            The sum of the batch losses divided by the number of batches,
            or ``0.0`` when the loader is empty.
        """
        self.model.train()
        total_loss = 0.0
        for batch_idx, (features, target) in enumerate(self.train_loader):
            self.optimizer.zero_grad()
            output = self.model(features)
            loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()

            # Read the scalar once and reuse it for the sum and the log.
            loss_value = loss.item()
            total_loss += loss_value

            # A zero interval would raise ZeroDivisionError in the modulo;
            # treat it as "per-step logging disabled" instead.
            if self.log_every_n_steps and batch_idx % self.log_every_n_steps == 0:
                self.writer.add_scalar("Loss/TrainStep", loss_value, self.step)
            self.step += 1

        # max(..., 1) avoids dividing by zero for an empty loader.
        return total_loss / max(len(self.train_loader), 1)

    def train(self, epochs: int) -> None:
        """Train for ``epochs`` epochs, logging the mean loss after each one.

        The TensorBoard writer is closed when training ends, including when
        an epoch raises, so buffered events are not lost.
        """
        try:
            for epoch in range(epochs):
                avg_loss = self.train_epoch()
                self.writer.add_scalar("Loss/TrainEpoch", avg_loss, epoch)
                logger.info(f"epoch={epoch + 1}/{epochs} avg_loss={avg_loss:.5f}")
        finally:
            self.writer.close()
def main() -> None:
    """Train the baseline model from a YAML config and save its weights.

    Loads the dataset file, builds a ``Model`` and a ``Trainer`` from the
    config values, trains for the configured number of epochs, and writes the
    model's state dict to disk.

    The weights path is resolved in this order: the ``--weights`` CLI flag,
    the ``weights_output`` key of the config, then
    ``ml/models/weights/model.pt``.

    Raises:
        KeyError: If a required key is missing from the config or dataset.
        OSError: If the config or dataset cannot be read, or the weights
            cannot be written.
    """
    parser = argparse.ArgumentParser(description="Train a baseline model")
    parser.add_argument("--config", default="ml/configs/train/default.yaml")
    parser.add_argument("--dataset", default="ml/data/processed/dataset.pt")
    parser.add_argument("--weights", default=None)
    args = parser.parse_args()

    with open(args.config, "r", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)

    torch.manual_seed(int(cfg["seed"]))

    # The dataset file is a plain dict of tensors, so restrict unpickling to
    # tensor data instead of allowing arbitrary objects from the file.
    dataset_blob = torch.load(args.dataset, map_location="cpu", weights_only=True)
    dataset = TensorDataset(dataset_blob["features"], dataset_blob["labels"])
    train_loader = DataLoader(dataset, batch_size=int(cfg["batch_size"]), shuffle=True)

    model = Model(
        input_dim=int(cfg["input_dim"]),
        hidden_dim=int(cfg["hidden_dim"]),
        num_classes=int(cfg["num_classes"]),
    )
    trainer = Trainer(
        model=model,
        train_loader=train_loader,
        learning_rate=float(cfg["learning_rate"]),
        log_dir=str(cfg["tensorboard_dir"]),
        log_every_n_steps=int(cfg["log_every_n_steps"]),
    )
    trainer.train(epochs=int(cfg["epochs"]))

    # CLI flag wins, then the config value, then the built-in default.
    weights_target = args.weights or cfg.get(
        "weights_output", "ml/models/weights/model.pt"
    )
    weights_path = Path(weights_target)
    weights_path.parent.mkdir(parents=True, exist_ok=True)
    torch.save(model.state_dict(), weights_path)
    logger.info(f"saved_weights={weights_path}")


if __name__ == "__main__":
    main()

70
ml/project.json Normal file
View File

@@ -0,0 +1,70 @@
{
"name": "ml",
"root": "ml",
"sourceRoot": "ml",
"projectType": "application",
"implicitDependencies": ["alveslib"],
"targets": {
"dev": {
"executor": "nx:run-commands",
"options": {
"cwd": "ml",
"command": "uvicorn inference:app --host 0.0.0.0 --port 8000 --reload"
}
},
"build": {
"executor": "nx:run-commands",
"options": {
"command": "uv run python -m compileall ml"
}
},
"etl": {
"executor": "nx:run-commands",
"cache": true,
"inputs": [
"default",
"{workspaceRoot}/ml/configs/data/**/*.yaml"
],
"outputs": ["{workspaceRoot}/ml/data/processed"],
"options": {
"command": "uv run python -m ml.data.etl --config ml/configs/data/default.yaml --output ml/data/processed"
}
},
"train": {
"executor": "nx:run-commands",
"cache": true,
"dependsOn": ["etl"],
"inputs": [
"default",
"{workspaceRoot}/ml/configs/train/**/*.yaml",
"{workspaceRoot}/ml/configs/data/**/*.yaml",
"{workspaceRoot}/ml/data/processed/**"
],
"outputs": [
"{workspaceRoot}/ml/models/weights",
"{workspaceRoot}/ml/tensorboard"
],
"options": {
"command": "uv run python -m ml.models.train --config ml/configs/train/default.yaml --dataset ml/data/processed/dataset.pt --weights ml/models/weights/model.pt"
}
},
"lint": {
"executor": "nx:run-commands",
"options": {
"command": "uv run ruff check ml"
}
},
"typecheck": {
"executor": "nx:run-commands",
"options": {
"command": "uv run mypy ml"
}
},
"test": {
"executor": "nx:run-commands",
"options": {
"command": "uv run pytest ml -v"
}
}
}
}