From 1060c988a97e568644d781572cafb151281c07a3 Mon Sep 17 00:00:00 2001
From: Daniel Rosel <daniel@alves.world>
Date: Wed, 1 Mar 2023 10:18:59 +0100
Subject: [PATCH] init

---
 README.md  |  6 ++++++
 crawler.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)
 create mode 100644 README.md
 create mode 100644 crawler.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a765f1a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,6 @@
+# Garlic 🧄🧛
+
+Garlic is a simple, fast and secure way to protect your website from being scraped by bots.
+
+# Why?
+
diff --git a/crawler.py b/crawler.py
new file mode 100644
index 0000000..a5b08d9
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,46 @@
+# a simple crawler: it starts from the seed urls, downloads each page and
+# stores the page content in a sqlite database (garlic.db)
+# it uses no advanced techniques; links are extracted with a regular expression
+
+from urllib.request import urlopen
+import re
+import sqlite3
+import time
+import sys
+
+from urllib.parse import urldefrag, urljoin  # used to resolve the hrefs found on a page
+
+# seed list; it doubles as the crawl queue and grows while the loop below runs
+urls = ["http://localhost:37593"]
+
+# connect to the database
+conn = sqlite3.connect('garlic.db')
+c = conn.cursor()
+
+# create the table and its url index if they do not exist yet
+c.execute('''CREATE TABLE IF NOT EXISTS websites (url text, content text)''')
+c.execute('''CREATE INDEX IF NOT EXISTS url_index ON websites (url)''')
+conn.commit()
+
+try:
+    for url in urls:
+        # skip urls that are already stored in the database
+        c.execute('''SELECT 1 FROM websites WHERE url = ?''', (url,))
+        if c.fetchone() is not None:
+            continue
+        try:
+            # the timeout keeps one unresponsive host from stalling the crawl forever
+            with urlopen(url, timeout=10) as response:
+                content = response.read().decode('utf-8')
+            c.execute('''INSERT INTO websites VALUES (?, ?)''', (url, content))
+            conn.commit()
+            # queue every link of the page as an absolute url without its #fragment
+            links = re.findall(r'''href=["']([^"']+)["']''', content)
+            urls.extend(urldefrag(urljoin(url, link)).url for link in links)
+        except Exception:
+            # best effort: skip pages that cannot be fetched, decoded or stored;
+            # unlike a bare except, Ctrl-C (KeyboardInterrupt) still stops the crawl
+            continue
+finally:
+    # close the connection even when the crawl is interrupted
+    conn.close()