New stuff

This commit is contained in:
2023-03-02 18:16:59 +01:00
parent 1080868e60
commit 7923212cb8
2 changed files with 39 additions and 1 deletions

View File

@@ -8,12 +8,15 @@ import sqlite3
import time import time
import sys import sys
urls = ["https://glittery-croquembouche-c25561.netlify.app/"] urls = ["https://garlic-react.netlify.app/", "https://garlic-vue.netlify.app/"]
# connect to the database # connect to the database
conn = sqlite3.connect('garlic.db') conn = sqlite3.connect('garlic.db')
c = conn.cursor() c = conn.cursor()
# clear the database: drop the `websites` table (the one created just below),
# not a table named after the database file
c.execute("DROP TABLE IF EXISTS websites")
# create the table if it does not exist # create the table if it does not exist
c.execute('''CREATE TABLE IF NOT EXISTS websites (url text, content text)''') c.execute('''CREATE TABLE IF NOT EXISTS websites (url text, content text)''')

35
scraper-bs.py Normal file
View File

@@ -0,0 +1,35 @@
# Pages to scrape; the loop at the bottom of this script renders each one
# with get_html() and prints its text with get_data().
websites = ["https://garlic-vue.netlify.app/"]
# create a crawler for websites, that also supports javascript
# use selenium
# download the chrome driver from https://chromedriver.chromium.org/downloads
# and put it in the same folder as this script
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
# Path to the chromedriver binary handed to webdriver.Chrome in get_html().
# NOTE(review): this is a machine-specific absolute path — adjust per environment.
driverPath = "/home/velocitatem/Documents/Projects/garlic/chromedriver"
def get_html(url):
    """Return the HTML of *url* as rendered by headless Chrome.

    The page is loaded in a fresh headless Chrome instance driven by the
    chromedriver binary at ``driverPath``, so markup produced by client-side
    JavaScript is included in the result.

    Args:
        url: Address of the page to load.

    Returns:
        The page source reported by the browser after a fixed 5 second wait.

    Raises:
        Whatever Selenium raises if the browser cannot be started or the page
        cannot be loaded; the browser is still shut down in that case.
    """
    options = Options()
    options.headless = True
    # `options=` is the supported keyword; `chrome_options=` is deprecated.
    driver = webdriver.Chrome(executable_path=driverPath, options=options)
    try:
        driver.get(url)
        # Fixed pause to give the page's JavaScript time to finish rendering.
        time.sleep(5)
        return driver.page_source
    finally:
        # Always quit, even when loading fails, so no Chrome/chromedriver
        # processes are left running.
        driver.quit()
def get_data(html):
    """Parse *html* with BeautifulSoup and print all of the page's text."""
    page_text = BeautifulSoup(html, "html.parser").text
    # Dump every piece of text found on the page to stdout.
    print(page_text)
# Render every configured site in turn and print its text content.
for url in websites:
    rendered_html = get_html(url)
    get_data(rendered_html)