scrape-yuanyuan/utils/scrapeutils.py

28 lines
912 B
Python
Raw Normal View History

2022-04-09 23:06:46 +01:00
"""Utility functions for scraping."""
from typing import List
import requests
from bs4 import BeautifulSoup
from .linkutils import Link
def download_link_texts(
    links: List[Link], class_: str, directory: str, encoding: str = None
):
    """Download the text of each link's matching HTML element into a directory.

    For every link, the page at ``link.url`` is fetched via ``get_link_text``
    and the extracted text is written to ``{directory}/{link.date}_{i}.txt``,
    where ``i`` is the zero-based position of the link in ``links``.

    Args:
        links: Links to download; each must expose ``url`` and ``date``.
        class_: CSS class of the HTML element whose text is extracted.
        directory: Existing directory the text files are written into.
        encoding: Optional encoding override used to decode the HTTP
            response. It does not affect the output files, which are always
            written as UTF-8.
    """
    total = len(links)
    for i, link in enumerate(links):
        print(f"Downloading {link.url} ({i+1}/{total})")
        text = get_link_text(link.url, class_, encoding)
        # Write UTF-8 explicitly: the locale default encoding may be unable to
        # represent the scraped text and would differ between machines.
        with open(f"{directory}/{link.date}_{i}.txt", "w", encoding="utf-8") as f:
            f.write(text)
def get_link_text(
    link: str, class_: str, encoding: str = None, timeout: float = 30.0
) -> str:
    """Get the stripped text of the first element with a given class on a webpage.

    Args:
        link: URL of the page to fetch.
        class_: CSS class of the element whose text is returned.
        encoding: Optional encoding override applied to the response before
            its body is decoded; when falsy, the encoding chosen by
            ``requests`` is used.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the scrape indefinitely.

    Returns:
        The element's text content with surrounding whitespace removed.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        AttributeError: If no element with class ``class_`` is on the page.
    """
    response = requests.get(link, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as content.
    response.raise_for_status()
    if encoding:
        response.encoding = encoding
    soup = BeautifulSoup(response.text, "html.parser")
    element = soup.find(class_=class_)
    if element is None:
        # The exception type is kept as AttributeError (what the original
        # raised implicitly via None.get_text()) so existing callers that
        # catch it keep working; only the message is made descriptive.
        raise AttributeError(
            f"No element with class {class_!r} found at {link}"
        )
    return element.get_text().strip()