scrape-yuanyuan/utils/scrapeutils.py

"""Utility functions for scraping."""
from typing import List

import requests
from bs4 import BeautifulSoup

from .linkutils import Link


def download_link_texts(
    links: List[Link], class_: str, directory: str, encoding: str = None
):
    """Download the text behind each link into a text file in ``directory``.

    For every link, the page at ``link.url`` is fetched with
    ``get_link_text`` and the extracted text is written to
    ``<directory>/<link.date>_<index>.txt``, where ``index`` is the link's
    zero-based position in ``links``. Progress is printed to stdout.

    Args:
        links: Links to download; each must expose ``url`` and ``date``.
        class_: Class name of the HTML element whose text is saved.
        directory: Directory that receives the ``.txt`` files. It is not
            created here, so it must already exist.
        encoding: Passed through to ``get_link_text`` to override the
            encoding used when decoding the downloaded page.

    Raises:
        OSError: If an output file cannot be opened or written.
    """
    total = len(links)
    for i, link in enumerate(links):
        print(f"Downloading {link.url} ({i+1}/{total})")
        text = get_link_text(link.url, class_, encoding)
        path = f"{directory}/{link.date}_{i}.txt"
        # Write UTF-8 explicitly: without it open() falls back to the
        # platform's locale encoding, which can fail on (or mangle)
        # non-ASCII page text and makes the output machine-dependent.
        with open(path, "w", encoding="utf-8") as f:
            f.write(text)


def get_link_text(
    link: str, class_: str, encoding: str = None, timeout: float = 30.0
) -> str:
    """Return the stripped text of the first element with a given class.

    The page at ``link`` is fetched, parsed with the ``html.parser``
    backend, and searched for the first element (of any tag) whose class
    matches ``class_``.

    Args:
        link: URL of the page to fetch.
        class_: Class name of the element whose text is wanted.
        encoding: If truthy, overrides the encoding used to decode the
            response body before parsing.
        timeout: Seconds to wait for the server before giving up, passed
            to ``requests.get``.

    Returns:
        The element's text with leading and trailing whitespace removed.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        AttributeError: If the page has no element with class ``class_``.
    """
    # Without a timeout a stalled server would block the scrape forever.
    response = requests.get(link, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page.
    response.raise_for_status()
    if encoding:
        response.encoding = encoding
    soup = BeautifulSoup(response.text, "html.parser")
    element = soup.find(class_=class_)
    if element is None:
        # AttributeError is what the unguarded ``None.get_text()`` call used
        # to raise; keep the type for existing callers but say what is wrong.
        raise AttributeError(
            f"No element with class {class_!r} found at {link}"
        )
    return element.get_text().strip()
Implement scrape utils 2022-04-09 23:06:46 +01:00			`"""Utility functions for scraping."""`
			`from typing import List`

			`import requests`
			`from bs4 import BeautifulSoup`

			`from .linkutils import Link`


			`def download_link_texts(`
			`links: List[Link], class_: str, directory: str, encoding: str = None`
			`):`
			`"""Download link texts contained HTML elements with the given class to a dir."""`
			`for i, link in enumerate(links):`
			`print(f"Downloading {link.url} ({i+1}/{len(links)})")`
			`text = get_link_text(link.url, class_, encoding)`
			`with open(f"{directory}/{link.date}_{i}.txt", "w+") as f:`
			`f.write(text)`


			`def get_link_text(link: str, class_: str, encoding: str = None) -> str:`
			`"""Get the text of a div with a given classname on a webpage."""`
			`request = requests.get(link)`
			`if encoding:`
			`request.encoding = encoding`
			`soup = BeautifulSoup(request.text, "html.parser")`
			`return soup.find(class_=class_).get_text().strip()`