mirror of
https://github.com/velocitatem/garlic.git
synced 2026-05-31 16:53:37 +00:00
New stuff
This commit is contained in:
@@ -8,12 +8,15 @@ import sqlite3
|
|||||||
import time
|
import time
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
urls = ["https://glittery-croquembouche-c25561.netlify.app/"]
|
urls = ["https://garlic-react.netlify.app/", "https://garlic-vue.netlify.app/"]
|
||||||
|
|
||||||
# connect to the database
|
# connect to the database
|
||||||
conn = sqlite3.connect('garlic.db')
|
conn = sqlite3.connect('garlic.db')
|
||||||
c = conn.cursor()
|
c = conn.cursor()
|
||||||
|
|
||||||
|
# clear the database
|
||||||
|
# drop the table that is (re)created just below, so every run starts from an
# empty `websites` table; dropping `garlic` left the real data untouched
c.execute("DROP TABLE IF EXISTS websites")
|
||||||
|
|
||||||
# create the table if it does not exist
|
# create the table if it does not exist
|
||||||
c.execute('''CREATE TABLE IF NOT EXISTS websites (url text, content text)''')
|
c.execute('''CREATE TABLE IF NOT EXISTS websites (url text, content text)''')
|
||||||
|
|
||||||
|
|||||||
35
scraper-bs.py
Normal file
35
scraper-bs.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
websites = ["https://garlic-vue.netlify.app/"]
|
||||||
|
# create a crawler for websites, that also supports javascript
|
||||||
|
# use selenium
|
||||||
|
# download the chrome driver from https://chromedriver.chromium.org/downloads
|
||||||
|
# and put it in the same folder as this script
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import time
|
||||||
|
|
||||||
|
from pathlib import Path

# chromedriver is expected to sit in the same folder as this script (see the
# setup notes at the top of the file); resolving it relative to the script
# avoids a machine-specific absolute path
driverPath = str(Path(__file__).resolve().parent / "chromedriver")
|
||||||
|
|
||||||
|
|
||||||
|
def get_html(url):
    """Return the page source of *url* as rendered by headless Chrome.

    A fresh Chrome driver (located via ``driverPath``) is started for each
    call, the page is loaded, and after a fixed five-second pause the
    current ``driver.page_source`` is returned.

    The driver is always shut down, even when loading the page raises, so
    a failed fetch does not leave a browser process behind.
    """
    options = Options()
    options.headless = True
    driver = webdriver.Chrome(executable_path=driverPath, chrome_options=options)
    try:
        driver.get(url)
        # fixed pause before reading the DOM -- presumably to let the page's
        # JavaScript finish rendering (the crawler is meant to support JS)
        time.sleep(5)
        return driver.page_source
    finally:
        # runs on success and on error alike; page_source is evaluated
        # before this point, so the returned HTML is unaffected
        driver.quit()
|
||||||
|
|
||||||
|
|
||||||
|
def get_data(html):
    """Parse *html* with BeautifulSoup's ``html.parser`` and print its text."""
    document = BeautifulSoup(html, "html.parser")
    # all text content of the parsed page, markup removed
    page_text = document.text
    print(page_text)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# render every configured site in the browser, then print its text
for url in websites:
    html = get_html(url)
    get_data(html)
|
||||||
Reference in New Issue
Block a user