Initial commit

2026-07-15 19:03:38 +00:00 · 2026-04-02 18:47:14 +02:00
commit 90ad5e0260
94 changed files with 7797 additions and 0 deletions
--- a/ml/data/README.md
+++ b/ml/data/README.md
@@ -0,0 +1,5 @@
+# Data
+
+Some thoughts on processing data: in a lot of cases you will get data that is not in an S3 bucket or anything glamorous, and to do any modelling you need the data locally. But when you have, say, a 1 TB dataset, you may only want 10 GB of it locally, and then once you move to a GPU-rich server you will want all of the data there. How can you manage this data well? What are the best practices?
+
+Hugging Face lets you upload up to 300 GB of data into a dataset.
--- a/ml/data/etl.py
+++ b/ml/data/etl.py
@@ -0,0 +1,52 @@
+import argparse
+import json
+from pathlib import Path
+
+import torch
+import yaml
+
+
+def build_dataset(
+    train_samples: int, input_dim: int, num_classes: int, seed: int
+) -> dict[str, torch.Tensor]:
+    """Generate a reproducible synthetic classification dataset.
+
+    Args:
+        train_samples: Number of rows to generate.
+        input_dim: Number of feature columns per row.
+        num_classes: Exclusive upper bound for the integer labels.
+        seed: Seed for the dedicated random generator.
+
+    Returns:
+        A dict with ``"features"`` (``torch.randn`` draws of shape
+        ``(train_samples, input_dim)``) and ``"labels"`` (``torch.randint``
+        draws in ``[0, num_classes)`` of shape ``(train_samples,)``).
+    """
+    # A private generator keeps the draws independent of torch's global RNG.
+    rng = torch.Generator()
+    rng.manual_seed(seed)
+
+    # Order matters: features are drawn before labels from the same generator.
+    sample_shape = (train_samples, input_dim)
+    return {
+        "features": torch.randn(sample_shape, generator=rng),
+        "labels": torch.randint(
+            low=0, high=num_classes, size=(train_samples,), generator=rng
+        ),
+    }
+
+
+def main() -> None:
+    """Build the synthetic dataset described by a YAML config and save it.
+
+    Command-line options:
+        --config: Path to a YAML mapping providing ``dataset_name``,
+            ``train_samples``, ``input_dim``, ``num_classes`` and ``seed``.
+        --output: Directory that receives ``dataset.pt`` and
+            ``metadata.json``; it is created if it does not exist.
+
+    Raises:
+        TypeError: If the config file does not contain a top-level mapping.
+        KeyError: If a required config key is missing.
+        ValueError: If a numeric config value cannot be converted by ``int``.
+    """
+    parser = argparse.ArgumentParser(description="Build a synthetic training dataset")
+    parser.add_argument("--config", default="ml/configs/data/default.yaml")
+    parser.add_argument("--output", default="ml/data/processed")
+    args = parser.parse_args()
+
+    with open(args.config, "r", encoding="utf-8") as f:
+        cfg = yaml.safe_load(f)
+    # An empty file parses to None and a scalar/list is not subscriptable by
+    # key; report that clearly instead of failing on the first lookup.
+    if not isinstance(cfg, dict):
+        raise TypeError(
+            f"{args.config}: expected a YAML mapping, got {type(cfg).__name__}"
+        )
+
+    # Resolve every config value up front so that a missing or malformed key
+    # fails before anything is written to the output directory.
+    dataset_name = cfg["dataset_name"]
+    params = {
+        key: int(cfg[key])
+        for key in ("train_samples", "input_dim", "num_classes", "seed")
+    }
+
+    output_dir = Path(args.output)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    dataset_path = output_dir / "dataset.pt"
+    torch.save(build_dataset(**params), dataset_path)
+
+    # Key order: dataset name, generation parameters, then the saved path.
+    metadata = {
+        "dataset_name": dataset_name,
+        **params,
+        "dataset_path": str(dataset_path),
+    }
+    with open(output_dir / "metadata.json", "w", encoding="utf-8") as f:
+        json.dump(metadata, f, indent=2)
+
+
+if __name__ == "__main__":
+    main()
--- a/ml/data/processed/dataset.pt
+++ b/ml/data/processed/dataset.pt
--- a/ml/data/processed/metadata.json
+++ b/ml/data/processed/metadata.json
@@ -0,0 +1,8 @@
+{
+  "dataset_name": "synthetic_classification",
+  "train_samples": 2048,
+  "input_dim": 16,
+  "num_classes": 3,
+  "seed": 42,
+  "dataset_path": "ml/data/processed/dataset.pt"
+}