New stuff

2026-07-16 03:13:35 +00:00 · 2023-03-02 18:16:59 +01:00
parent 1080868e60
commit 7923212cb8
2 changed files with 39 additions and 1 deletions
--- a/crawler.py
+++ b/crawler.py
@@ -8,12 +8,15 @@ import sqlite3
 import time
 import sys
-urls = ["https://glittery-croquembouche-c25561.netlify.app/"]
+urls = ["https://garlic-react.netlify.app/", "https://garlic-vue.netlify.app/"]
 # connect to the database
 conn = sqlite3.connect('garlic.db')
 c = conn.cursor()
 # clear the database
 # NOTE(review): this drops a table named "garlic", but the table created
 # below is "websites"; rows already stored in "websites" are not removed
 # by this statement — confirm which table name is intended.
 c.execute("DROP TABLE IF EXISTS garlic")
 # create the table if it does not exist
 c.execute('''CREATE TABLE IF NOT EXISTS websites (url text, content text)''')
--- a/scraper-bs.py
+++ b/scraper-bs.py
@@ -0,0 +1,35 @@
 # URLs to process; each one is fetched with get_html and printed by get_data.
 websites = ["https://garlic-vue.netlify.app/"]
 # Crawler for websites that also handles JavaScript-rendered pages.
 # It drives a real Chrome browser through Selenium.
 # Download the Chrome driver from https://chromedriver.chromium.org/downloads
 # and put it in the same folder as this script.
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from bs4 import BeautifulSoup
 import time
 # Path to the chromedriver binary; get_html passes it to webdriver.Chrome.
 # NOTE(review): this is an absolute, machine-specific path — confirm or
 # adjust it before running the script on another machine.
 driverPath = "/home/velocitatem/Documents/Projects/garlic/chromedriver"
 def get_html(url):
    """Return the page source of *url* as rendered by headless Chrome.

    A new Chrome driver is started for every call, the page is loaded,
    the function waits five seconds, and then the current page source is
    read. The driver is always shut down, even if loading the page fails.

    Args:
        url: Address of the page to load.

    Returns:
        The value of ``driver.page_source`` after the wait.

    Raises:
        Whatever ``webdriver.Chrome`` or ``driver.get`` raises; the browser
        is quit before the exception propagates.
    """
    options = Options()
    options.headless = True
    # NOTE(review): `executable_path` / `chrome_options` look like the legacy
    # Selenium keyword arguments — confirm they are still accepted by the
    # installed Selenium version.
    driver = webdriver.Chrome(executable_path=driverPath, chrome_options=options)
    try:
       driver.get(url)
       # Fixed pause so client-side scripts have time to populate the page.
       time.sleep(5)
       return driver.page_source
    finally:
       # Without this, a failed page load would leave the browser process running.
       driver.quit()
 def get_data(html):
    """Parse *html* with the "html.parser" backend and print its text."""
    page_text = BeautifulSoup(html, "html.parser").text
    # Write all text found on the page to standard output.
    print(page_text)
 # Fetch every configured page and print its text, one page at a time.
 for url in websites:
    get_data(get_html(url))