cvfs/alveslib/scraper.py

import hashlib
import pickle
import os
from pathlib import Path
from seleniumbase import SB
from bs4 import BeautifulSoup
from typing import Optional

class ScraperCache:
    def __init__(self, cache_dir: str = ".scraper_cache"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)

    def _get_cache_key(self, url: str) -> str:
        return hashlib.md5(url.encode()).hexdigest()

    def _get_cache_path(self, cache_key: str) -> Path:
        return self.cache_dir / f"{cache_key}.pkl"

    def get(self, url: str) -> Optional[BeautifulSoup]:
        cache_key = self._get_cache_key(url)
        cache_path = self._get_cache_path(cache_key)

        if cache_path.exists():
            try:
                with open(cache_path, 'rb') as f:
                    return pickle.load(f)
            except:
                pass
        return None

    def set(self, url: str, soup: BeautifulSoup) -> None:
        cache_key = self._get_cache_key(url)
        cache_path = self._get_cache_path(cache_key)

        try:
            with open(cache_path, 'wb') as f:
                pickle.dump(soup, f)
        except:
            pass

_cache = ScraperCache() # glob

def scrape_url(url: str, use_cache: bool = True) -> BeautifulSoup:
    if use_cache:
        cached_soup = _cache.get(url)
        if cached_soup:
            return cached_soup

    with SB(test=True, uc=True) as sb:
        sb.open(url)
        html = sb.get_page_source()
        soup = BeautifulSoup(html, 'html.parser')

        if use_cache:
            _cache.set(url, soup)

        return soup


if __name__ == "__main__":
    url = "https://httpbin.org/html"
    print("Testing scraper...")
    soup = scrape_url(url)
    print(f"Title: {soup.title.text if soup.title else 'No title'}")
    print(f"Found {len(soup.find_all('p'))} paragraphs")
    print("\nTesting cache...")
    soup2 = scrape_url(url)
    print("Cache test completed")