"""Simple breadth-first web crawler for the Garlic project.

Starting from a list of seed URLs, the crawler downloads each page, stores
the URL and its decoded content in the ``websites`` table of a SQLite
database, and queues every ``href`` link found in the page.  It uses no
advanced techniques: no robots.txt handling, no rate limiting, no HTML
parser.
"""

from collections import deque
from contextlib import closing
from http.client import HTTPException
from typing import Callable, Iterable, List, Optional
from urllib.parse import urldefrag, urljoin, urlsplit
from urllib.request import urlopen
import re
import sqlite3
import sys
import time  # noqa: F401  (kept from the original file; currently unused)

# Crawl entry points and storage location.
SEED_URLS = ["http://localhost:37593"]
DB_PATH = "garlic.db"

# Seconds to wait for a server before giving up on a page; without a
# timeout a single stalled server would hang the whole crawl.
FETCH_TIMEOUT = 10.0

# Only follow web links.  urlopen() also understands file: and ftp: URLs,
# so following those from fetched HTML would let a page make the crawler
# read local files into the database.
ALLOWED_SCHEMES = frozenset({"http", "https"})

# Quoted href attribute values.  The value must be non-empty, but
# single-character targets such as "/" are accepted.
_HREF_RE = re.compile(r"""href=["']([^"']+)["']""", re.IGNORECASE)

# Errors that mean "this page could not be fetched or decoded": network and
# HTTP failures (URLError/HTTPError are OSError subclasses), malformed URLs
# and undecodable bodies (UnicodeDecodeError is a ValueError subclass), and
# low-level HTTP protocol errors.  Anything else (KeyboardInterrupt,
# sqlite3.Error, programming errors) is deliberately not caught.
_FETCH_ERRORS = (OSError, ValueError, HTTPException)


def init_db(conn: sqlite3.Connection) -> None:
    """Create the ``websites`` table and its URL index if they are missing."""
    conn.execute(
        "CREATE TABLE IF NOT EXISTS websites (url text, content text)"
    )
    conn.execute("CREATE INDEX IF NOT EXISTS url_index ON websites (url)")
    conn.commit()


def fetch_page(url: str, timeout: float = FETCH_TIMEOUT) -> str:
    """Download *url* and return its body decoded as UTF-8.

    Raises:
        ValueError: if the URL scheme is not http/https, or the body is not
            valid UTF-8 (``UnicodeDecodeError``).
        OSError: on network or HTTP errors (``URLError``/``HTTPError``).
    """
    if urlsplit(url).scheme not in ALLOWED_SCHEMES:
        raise ValueError(f"unsupported URL scheme: {url!r}")
    # The context manager closes the response even if read/decode fails.
    with urlopen(url, timeout=timeout) as response:
        return response.read().decode("utf-8")


def extract_links(content: str, base_url: str) -> List[str]:
    """Return the absolute http(s) URLs referenced by ``href`` in *content*.

    Relative links are resolved against *base_url*, fragments are dropped
    (they address the same document), and links with other schemes
    (``mailto:``, ``javascript:``, ``file:`` ...) are skipped.  Document
    order is preserved; duplicates are not removed.
    """
    links = []
    for href in _HREF_RE.findall(content):
        try:
            absolute = urldefrag(urljoin(base_url, href.strip())).url
            scheme = urlsplit(absolute).scheme
        except ValueError:
            # Malformed link (e.g. a broken IPv6 literal): ignore it.
            continue
        if scheme in ALLOWED_SCHEMES:
            links.append(absolute)
    return links


def crawl(
    seed_urls: Iterable[str],
    conn: sqlite3.Connection,
    fetch: Callable[[str], str] = fetch_page,
    max_pages: Optional[int] = None,
) -> int:
    """Crawl breadth-first from *seed_urls*, storing pages through *conn*.

    A URL that already has a row in ``websites`` is skipped without being
    fetched, so its links are not followed either.  Pages that cannot be
    fetched are reported on stderr and skipped.

    Args:
        seed_urls: URLs to start from; they are used exactly as given.
        conn: open SQLite connection whose schema was set up by ``init_db``.
        fetch: callable returning the text of a URL; replaceable for tests.
        max_pages: stop after storing this many pages (``None`` = no limit).

    Returns:
        The number of pages stored during this call.
    """
    queue = deque(seed_urls)
    # Every URL ever queued, so a link is scheduled at most once per run
    # even when many pages point at it or when fetching it fails.
    seen = set(queue)
    stored = 0

    while queue:
        if max_pages is not None and stored >= max_pages:
            break
        url = queue.popleft()

        already_stored = conn.execute(
            "SELECT 1 FROM websites WHERE url = ? LIMIT 1", (url,)
        ).fetchone()
        if already_stored is not None:
            continue

        try:
            content = fetch(url)
        except _FETCH_ERRORS as exc:
            print(f"skipping {url}: {exc}", file=sys.stderr)
            continue

        conn.execute("INSERT INTO websites VALUES (?, ?)", (url, content))
        # Commit per page so an interrupted crawl keeps what it fetched.
        conn.commit()
        stored += 1

        for link in extract_links(content, url):
            if link not in seen:
                seen.add(link)
                queue.append(link)

    return stored


def main() -> None:
    """Crawl from ``SEED_URLS`` into the database at ``DB_PATH``."""
    # closing() guarantees the connection is released even if the crawl
    # is interrupted or fails.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        init_db(conn)
        crawl(SEED_URLS, conn)


if __name__ == "__main__":
    main()