mirror of
https://github.com/velocitatem/garlic.git
synced 2026-05-31 08:43:36 +00:00
init
This commit is contained in:
6
README.md
Normal file
6
README.md
Normal file
@@ -0,0 +1,6 @@
|
||||
# Garlic 🧄🧛
|
||||
|
||||
Garlic is a simple, fast and secure way to protect your website from being scraped by bots.
|
||||
|
||||
# Why?
|
||||
+
|
||||
46
crawler.py
Normal file
46
crawler.py
Normal file
@@ -0,0 +1,46 @@
|
||||
# this is a simple crawler that crawls the web and stores the website content in a database
|
||||
# it is a simple crawler that does not use any advanced techniques
|
||||
# it also scrapes the content of the website and stores it in the database
|
||||
|
||||
from urllib.request import urlopen
|
||||
import re
|
||||
import sqlite3
|
||||
import time
|
||||
import sys
|
||||
|
||||
urls = ["http://localhost:37593"]
|
||||
|
||||
# connect to the database
|
||||
conn = sqlite3.connect('garlic.db')
|
||||
c = conn.cursor()
|
||||
|
||||
# create the table if it does not exist
|
||||
c.execute('''CREATE TABLE IF NOT EXISTS websites (url text, content text)''')
|
||||
|
||||
# create the index if it does not exist
|
||||
c.execute('''CREATE INDEX IF NOT EXISTS url_index ON websites (url)''')
|
||||
|
||||
# commit the changes
|
||||
conn.commit()
|
||||
|
||||
# loop through the urls
|
||||
for url in urls:
|
||||
# check if the url is already in the database
|
||||
c.execute('''SELECT * FROM websites WHERE url = ?''', (url,))
|
||||
if c.fetchone() is None:
|
||||
# if the url is not in the database, then get the content
|
||||
try:
|
||||
content = urlopen(url).read().decode('utf-8')
|
||||
# insert the url and the content into the database
|
||||
c.execute('''INSERT INTO websites VALUES (?, ?)''', (url, content))
|
||||
# commit the changes
|
||||
conn.commit()
|
||||
# find all the urls in the content
|
||||
urls.extend(re.findall('''href=["'](.[^"']+)["']''', content))
|
||||
except:
|
||||
# if the url is not valid, then continue
|
||||
continue
|
||||
|
||||
|
||||
# close the connection
|
||||
conn.close()
|
||||
Reference in New Issue
Block a user