From e0a4a26990bf4f004a2d0e38f0341f3ab2a3721b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tristan=20Dani=C3=ABl=20Maat?= <tm@tlater.net>
Date: Sat, 9 Apr 2022 23:06:46 +0100
Subject: [PATCH] Implement scrape utils

---
 utils/scrapeutils.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 utils/scrapeutils.py

diff --git a/utils/scrapeutils.py b/utils/scrapeutils.py
new file mode 100644
index 0000000..2b68039
--- /dev/null
+++ b/utils/scrapeutils.py
@@ -0,0 +1,27 @@
+"""Utility functions for scraping."""
+from typing import List
+
+import requests
+from bs4 import BeautifulSoup
+
+from .linkutils import Link
+
+
+def download_link_texts(
+    links: List[Link], class_: str, directory: str, encoding: str = None
+):
+    """Save each link's `class_` element text to `directory/{date}_{i}.txt`."""
+    for i, link in enumerate(links):
+        print(f"Downloading {link.url} ({i+1}/{len(links)})")
+        text = get_link_text(link.url, class_, encoding)  # `encoding` = page charset
+        with open(f"{directory}/{link.date}_{i}.txt", "w", encoding="utf-8") as f:
+            f.write(text)  # UTF-8, not the locale codec, so any text is writable
+
+
+def get_link_text(link: str, class_: str, encoding: str = None) -> str:
+    """Fetch `link` and return the stripped text of its first `class_` element."""
+    response = requests.get(link, timeout=30)  # requests waits forever by default
+    response.raise_for_status()  # do not scrape HTTP error pages as content
+    response.encoding = encoding or response.encoding  # optional charset override
+    soup = BeautifulSoup(response.text, "html.parser")
+    return soup.find(class_=class_).get_text().strip()