Initial commit

2026-07-15 19:03:38 +00:00 · 2026-04-02 18:47:14 +02:00
commit 90ad5e0260
94 changed files with 7797 additions and 0 deletions
--- a/alveslib/scraper.py
+++ b/alveslib/scraper.py
@@ -0,0 +1,70 @@
+import hashlib
+import pickle
+import os
+from pathlib import Path
+from seleniumbase import SB
+from bs4 import BeautifulSoup
+from typing import Optional
+
+class ScraperCache:
+    """Best-effort on-disk cache mapping a URL to a pickled parsed page.
+
+    Each entry is stored as ``<md5(url)>.pkl`` inside ``cache_dir``. Every
+    failure (missing entry, unreadable or corrupt file, object that cannot
+    be pickled) is treated as a cache miss / skipped write rather than an
+    error, so callers can always fall back to fetching the page again.
+
+    SECURITY NOTE: entries are read back with ``pickle.load``, which can
+    execute arbitrary code. Only point ``cache_dir`` at a directory that
+    untrusted parties cannot write to.
+    """
+
+    def __init__(self, cache_dir: str = ".scraper_cache"):
+        """Create the cache directory (and any missing parents) if needed."""
+        self.cache_dir = Path(cache_dir)
+        # parents=True so a nested location such as "var/cache/scraper"
+        # works on first use instead of raising FileNotFoundError.
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+    def _get_cache_key(self, url: str) -> str:
+        """Return the hex MD5 digest of *url*, used as the entry file name."""
+        # MD5 is used only to derive a file name, not for security. It is
+        # kept as-is so entries written by earlier runs stay readable.
+        return hashlib.md5(url.encode()).hexdigest()
+
+    def _get_cache_path(self, cache_key: str) -> Path:
+        """Return the path of the pickle file that stores *cache_key*."""
+        return self.cache_dir / f"{cache_key}.pkl"
+
+    def get(self, url: str) -> Optional["BeautifulSoup"]:
+        """Return the cached object for *url*, or ``None`` on a cache miss.
+
+        A missing, unreadable or corrupt entry all count as a miss.
+        """
+        cache_path = self._get_cache_path(self._get_cache_key(url))
+        try:
+            # EAFP: open directly instead of exists()+open(), which races
+            # with concurrent deletion of the entry.
+            with open(cache_path, 'rb') as f:
+                return pickle.load(f)
+        except Exception:
+            # Unpickling damaged data can raise almost any exception type,
+            # so catch Exception -- but not a bare ``except``, which would
+            # also swallow KeyboardInterrupt and SystemExit.
+            return None
+
+    def set(self, url: str, soup: "BeautifulSoup") -> None:
+        """Store *soup* under *url*; failures are silently skipped.
+
+        The entry is written to a temporary file and then renamed over the
+        final path, so a failed or interrupted dump never leaves a
+        truncated ``.pkl`` file for a later ``get`` to trip over.
+        """
+        cache_path = self._get_cache_path(self._get_cache_key(url))
+        # The PID in the name keeps concurrent processes from sharing a
+        # temporary file.
+        tmp_path = cache_path.with_name(f"{cache_path.name}.{os.getpid()}.tmp")
+        try:
+            with open(tmp_path, 'wb') as f:
+                pickle.dump(soup, f)
+            os.replace(tmp_path, cache_path)
+        except Exception:
+            # Caching is an optimisation only: drop the partial file and
+            # carry on. (Pickling can fail with PicklingError, OSError,
+            # RecursionError on deeply nested objects, ...)
+            try:
+                os.remove(tmp_path)
+            except OSError:
+                pass
+
+# Module-level (global) cache instance shared by every scrape_url() call.
+# It is built with the default directory ".scraper_cache", which
+# ScraperCache.__init__ creates as soon as this module is imported.
+_cache = ScraperCache()
+
+def scrape_url(url: str, use_cache: bool = True) -> BeautifulSoup:
+    """Return the page at *url* parsed with BeautifulSoup's 'html.parser'.
+
+    Args:
+        url: Address of the page to load.
+        use_cache: When true, return a previously cached result for *url*
+            if one exists, and store a freshly fetched result in the
+            module-level cache. When false, the cache is neither read nor
+            written.
+
+    Returns:
+        The parsed document, either from the cache or freshly fetched
+        through a SeleniumBase ``SB`` browser session.
+    """
+    if use_cache:
+        cached_soup = _cache.get(url)
+        # The cache signals a miss with None, so test for exactly that
+        # instead of relying on the truthiness of the cached object.
+        if cached_soup is not None:
+            return cached_soup
+
+    # Keep the browser open only for as long as it takes to grab the page
+    # source; parsing and caching do not need the live session.
+    with SB(test=True, uc=True) as sb:
+        sb.open(url)
+        html = sb.get_page_source()
+
+    soup = BeautifulSoup(html, 'html.parser')
+    if use_cache:
+        _cache.set(url, soup)
+    return soup
+
+
+
+if __name__ == "__main__":
+    # Manual smoke test: fetch a small public page twice. The second call
+    # is expected to be answered from the on-disk cache.
+    demo_url = "https://httpbin.org/html"
+
+    print("Testing scraper...")
+    page = scrape_url(demo_url)
+
+    title_tag = page.title
+    title_text = title_tag.text if title_tag else 'No title'
+    print(f"Title: {title_text}")
+
+    paragraph_count = len(page.find_all('p'))
+    print(f"Found {paragraph_count} paragraphs")
+
+    print("\nTesting cache...")
+    page_again = scrape_url(demo_url)
+    print("Cache test completed")