Reviewed patch. The blob hashes on the `index` lines come from the original
commit and no longer match the edited content below.
NOTE(review): blank context lines and the indentation of the CREATE TABLE
continuation line were reconstructed from a whitespace-collapsed copy of this
patch; verify them against crawler.py before applying.

diff --git a/crawler.py b/crawler.py
index 2506ca7..1725eb5 100644
--- a/crawler.py
+++ b/crawler.py
@@ -8,12 +8,15 @@ import sqlite3
 import time
 import sys
 
-urls = ["https://glittery-croquembouche-c25561.netlify.app/"]
+urls = ["https://garlic-react.netlify.app/", "https://garlic-vue.netlify.app/"]
 
 # connect to the database
 conn = sqlite3.connect('garlic.db')
 c = conn.cursor()
 
+# drop the table created below so each run starts from an empty table
+c.execute("DROP TABLE IF EXISTS websites")
+
 # create the table if it does not exist
 c.execute('''CREATE TABLE IF NOT EXISTS websites
              (url text, content text)''')
diff --git a/scraper-bs.py b/scraper-bs.py
new file mode 100644
index 0000000..121356f
--- /dev/null
+++ b/scraper-bs.py
@@ -0,0 +1,69 @@
+"""Print the rendered text of JavaScript-driven websites.
+
+Pages are loaded in headless Chrome through Selenium, and the resulting
+page source is handed to BeautifulSoup for text extraction.
+"""
+import os
+import time
+
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+
+websites = ["https://garlic-vue.netlify.app/"]
+
+# Download chromedriver from https://chromedriver.chromium.org/downloads.
+# Set CHROMEDRIVER_PATH to its location; the fallback below is the path
+# this script originally hard-coded.
+driverPath = os.environ.get(
+    "CHROMEDRIVER_PATH",
+    "/home/velocitatem/Documents/Projects/garlic/chromedriver",
+)
+
+
+def get_html(url, wait_seconds=5):
+    """Return the page source of *url* after loading it in headless Chrome.
+
+    Args:
+        url: Address to load.
+        wait_seconds: Fixed delay after navigation before the page source
+            is read, giving client-side scripts time to run.
+
+    Returns:
+        The page source reported by the driver.
+    """
+    options = Options()
+    # Pass the flag directly instead of setting the `headless` attribute;
+    # NOTE(review): that attribute is deprecated in recent Selenium 4.
+    options.add_argument("--headless")
+    # NOTE(review): `executable_path` is deprecated in Selenium 4 in favour
+    # of a Service object -- confirm the installed version before changing.
+    driver = webdriver.Chrome(executable_path=driverPath, options=options)
+    try:
+        driver.get(url)
+        time.sleep(wait_seconds)
+        return driver.page_source
+    finally:
+        # Always shut the browser down, even when loading the page fails,
+        # so no driver process is left running.
+        driver.quit()
+
+
+def get_data(html):
+    """Print the text content of *html* and return it.
+
+    Args:
+        html: HTML markup to parse.
+
+    Returns:
+        The text extracted by BeautifulSoup.
+    """
+    soup = BeautifulSoup(html, "html.parser")
+    text = soup.text
+    # print all the text on the page
+    print(text)
+    return text
+
+
+for url in websites:
+    get_data(get_html(url))