Files
cvfs/alveslib/scraper.py
Daniel Alves Rösel 90ad5e0260 Initial commit
2026-04-02 18:47:14 +02:00

71 lines
1.9 KiB
Python

import hashlib
import pickle
import os
from pathlib import Path
from seleniumbase import SB
from bs4 import BeautifulSoup
from typing import Optional
class ScraperCache:
def __init__(self, cache_dir: str = ".scraper_cache"):
self.cache_dir = Path(cache_dir)
self.cache_dir.mkdir(exist_ok=True)
def _get_cache_key(self, url: str) -> str:
return hashlib.md5(url.encode()).hexdigest()
def _get_cache_path(self, cache_key: str) -> Path:
return self.cache_dir / f"{cache_key}.pkl"
def get(self, url: str) -> Optional[BeautifulSoup]:
cache_key = self._get_cache_key(url)
cache_path = self._get_cache_path(cache_key)
if cache_path.exists():
try:
with open(cache_path, 'rb') as f:
return pickle.load(f)
except:
pass
return None
def set(self, url: str, soup: BeautifulSoup) -> None:
cache_key = self._get_cache_key(url)
cache_path = self._get_cache_path(cache_key)
try:
with open(cache_path, 'wb') as f:
pickle.dump(soup, f)
except:
pass
_cache = ScraperCache() # glob
def scrape_url(url: str, use_cache: bool = True) -> BeautifulSoup:
if use_cache:
cached_soup = _cache.get(url)
if cached_soup:
return cached_soup
with SB(test=True, uc=True) as sb:
sb.open(url)
html = sb.get_page_source()
soup = BeautifulSoup(html, 'html.parser')
if use_cache:
_cache.set(url, soup)
return soup
if __name__ == "__main__":
url = "https://httpbin.org/html"
print("Testing scraper...")
soup = scrape_url(url)
print(f"Title: {soup.title.text if soup.title else 'No title'}")
print(f"Found {len(soup.find_all('p'))} paragraphs")
print("\nTesting cache...")
soup2 = scrape_url(url)
print("Cache test completed")