mirror of
https://github.com/velocitatem/garlic.git
synced 2026-05-31 08:43:36 +00:00
47 lines
1.4 KiB
Python
47 lines
1.4 KiB
Python
# this is a simple crawler that crawls the web and stores the website content in a database
|
|
# it is a simple crawler that does not use any advanced techniques
|
|
# it also scrapes the content of the website and stores it in the database
|
|
|
|
from urllib.request import urlopen
|
|
import re
|
|
import sqlite3
|
|
import time
|
|
import sys
|
|
|
|
urls = ["http://localhost:37593"]
|
|
|
|
# connect to the database
|
|
conn = sqlite3.connect('garlic.db')
|
|
c = conn.cursor()
|
|
|
|
# create the table if it does not exist
|
|
c.execute('''CREATE TABLE IF NOT EXISTS websites (url text, content text)''')
|
|
|
|
# create the index if it does not exist
|
|
c.execute('''CREATE INDEX IF NOT EXISTS url_index ON websites (url)''')
|
|
|
|
# commit the changes
|
|
conn.commit()
|
|
|
|
# loop through the urls
|
|
for url in urls:
|
|
# check if the url is already in the database
|
|
c.execute('''SELECT * FROM websites WHERE url = ?''', (url,))
|
|
if c.fetchone() is None:
|
|
# if the url is not in the database, then get the content
|
|
try:
|
|
content = urlopen(url).read().decode('utf-8')
|
|
# insert the url and the content into the database
|
|
c.execute('''INSERT INTO websites VALUES (?, ?)''', (url, content))
|
|
# commit the changes
|
|
conn.commit()
|
|
# find all the urls in the content
|
|
urls.extend(re.findall('''href=["'](.[^"']+)["']''', content))
|
|
except:
|
|
# if the url is not valid, then continue
|
|
continue
|
|
|
|
|
|
# close the connection
|
|
conn.close()
|