Implement scrape utils
This commit is contained in:
parent
8c012a28b3
commit
e0a4a26990
27
utils/scrapeutils.py
Normal file
27
utils/scrapeutils.py
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
"""Utility functions for scraping."""
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from .linkutils import Link
|
||||||
|
|
||||||
|
|
||||||
|
def download_link_texts(
    links: List[Link], class_: str, directory: str, encoding: str = None
):
    """Download the text contained in HTML elements with the given class to a dir.

    Each link is fetched via ``get_link_text`` and the extracted text is
    written to ``{directory}/{link.date}_{i}.txt``.

    Args:
        links: Links to download.
        class_: CSS class of the element whose text is extracted.
        directory: Target directory (assumed to already exist).
        encoding: Optional response-encoding override, forwarded to
            ``get_link_text``; also used for the output file when given.
    """
    total = len(links)  # invariant across iterations; hoisted out of the loop
    for i, link in enumerate(links):
        print(f"Downloading {link.url} ({i+1}/{total})")
        text = get_link_text(link.url, class_, encoding)
        # "w" instead of "w+": the file is never read back. An explicit
        # encoding avoids UnicodeEncodeError on locale-default codecs
        # (e.g. cp1252) that cannot represent the downloaded text.
        out_path = f"{directory}/{link.date}_{i}.txt"
        with open(out_path, "w", encoding=encoding or "utf-8") as f:
            f.write(text)
|
||||||
|
|
||||||
|
|
||||||
|
def get_link_text(link: str, class_: str, encoding: str = None) -> str:
    """Get the text of the first element with a given classname on a webpage.

    Args:
        link: URL to fetch.
        class_: CSS class of the target element.
        encoding: If given, overrides the encoding used to decode the
            response body (useful when a server misreports its charset).

    Returns:
        The element's text with surrounding whitespace stripped.

    Raises:
        ValueError: If no element with the given class is found on the page.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    # A timeout prevents an unresponsive server from hanging the scrape forever.
    response = requests.get(link, timeout=30)
    if encoding:
        response.encoding = encoding
    soup = BeautifulSoup(response.text, "html.parser")
    element = soup.find(class_=class_)
    if element is None:
        # Fail with a clear message instead of AttributeError on None.
        raise ValueError(f"No element with class {class_!r} found at {link}")
    return element.get_text().strip()
|
Loading…
Reference in a new issue