# Reconstructed from git patch e0a4a26990bf4f004a2d0e38f0341f3ab2a3721b
# Author:  Tristan Daniël Maat
# Date:    Sat, 9 Apr 2022 23:06:46 +0100
# Subject: [PATCH] Implement scrape utils (creates utils/scrapeutils.py)
"""Utility functions for scraping."""
from typing import List, Optional

import requests
from bs4 import BeautifulSoup

from .linkutils import Link

# Seconds to wait for the server before giving up. Without an explicit
# timeout, requests will wait indefinitely on a stalled connection.
DEFAULT_TIMEOUT = 30.0


def download_link_texts(
    links: List[Link],
    class_: str,
    directory: str,
    encoding: Optional[str] = None,
    timeout: float = DEFAULT_TIMEOUT,
) -> None:
    """Download the text of each link's matching HTML element into a directory.

    For every link, the page at ``link.url`` is fetched, the text of the
    element with class ``class_`` is extracted, and the result is written to
    ``{directory}/{link.date}_{i}.txt``, where ``i`` is the zero-based
    position of the link in ``links``. Progress is printed to stdout.

    Args:
        links: Links to download; each must provide ``url`` and ``date``.
        class_: CSS class of the HTML element whose text is saved.
        directory: Existing directory to write the text files to. It is not
            created by this function.
        encoding: Optional encoding used to decode the downloaded pages,
            overriding the one detected by requests.
        timeout: Seconds to wait for each HTTP request.

    Raises:
        requests.RequestException: If a page cannot be fetched or the server
            answers with an error status.
        AttributeError: If a page has no element with class ``class_``.
        OSError: If an output file cannot be written.
    """
    total = len(links)
    for i, link in enumerate(links):
        print(f"Downloading {link.url} ({i+1}/{total})")
        text = get_link_text(link.url, class_, encoding, timeout)
        # Always write UTF-8: scraped text routinely contains characters the
        # platform default encoding (e.g. cp1252 on Windows) cannot represent.
        with open(f"{directory}/{link.date}_{i}.txt", "w", encoding="utf-8") as f:
            f.write(text)


def get_link_text(
    link: str,
    class_: str,
    encoding: Optional[str] = None,
    timeout: float = DEFAULT_TIMEOUT,
) -> str:
    """Return the stripped text of the first element with a given class on a page.

    Args:
        link: URL of the page to fetch.
        class_: CSS class of the HTML element to extract; any tag matches.
        encoding: Optional encoding used to decode the response body,
            overriding the one detected by requests.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        The element's text with leading and trailing whitespace removed.

    Raises:
        requests.RequestException: If the page cannot be fetched or the
            server answers with an error status.
        AttributeError: If the page has no element with class ``class_``.
    """
    response = requests.get(link, timeout=timeout)
    # Fail on 4xx/5xx instead of parsing (and possibly saving) an error page.
    response.raise_for_status()
    if encoding:
        response.encoding = encoding
    soup = BeautifulSoup(response.text, "html.parser")
    element = soup.find(class_=class_)
    if element is None:
        # Kept as AttributeError (what the unguarded ``None.get_text()`` call
        # used to raise) so existing handlers keep working, but with a
        # message that says what actually went wrong.
        raise AttributeError(f"No element with class {class_!r} found at {link}")
    return element.get_text().strip()