scrape-yuanyuan/guangdong/scrape.py

"""Script to scrape article contents from https://wsjkw.gd.gov.cn.

Links aren't conveniently available, since the page that lists them is
rendered entirely using JS, so we need to run a fully-fledged JS
browser to get them.

We use `extract-urls.js` to extract the links beforehand and dump
them to a file, which this script then reads.
"""
import re
from typing import Tuple

import requests
from bs4 import BeautifulSoup

# Matches a date written as four digits, a dash, two digits, a dash and
# two digits (e.g. "2020-01-31"); used to pull the date out of an
# article's "date-row" element.
MATCH_DATE = re.compile(r"\d\d\d\d-\d\d-\d\d")


def main():
    """Download every article listed in ``links/links.txt`` and save it.

    Each non-blank line of the links file is treated as a URL that lacks
    its scheme. The line is prefixed with ``http:``, downloaded with
    ``download_link`` and the article text is written to
    ``articles-guangdong/{i}_{date}.txt``, where ``i`` is the zero-based
    line index in the links file and ``date`` is the date extracted from
    the page (or ``"unknown"``).
    """
    with open("links/links.txt", encoding="utf-8") as links_file:
        links = links_file.readlines()

    for i, raw_link in enumerate(links):
        # `readlines()` keeps the trailing newline; strip it so that it
        # does not end up inside the URL we request.
        link = raw_link.strip()
        if not link:
            # Skip blank lines instead of requesting the bare "http:".
            continue

        print(f"Downloading {link} ({i}/{len(links)})")

        # The links aren't formatted correctly: they lack the scheme
        # (presumably protocol-relative `//host/path` URLs -- verify
        # against the output of `extract-urls.js`), so prefix `http:`.
        link = f"http:{link}"
        date, text = download_link(link)

        # Write as UTF-8 explicitly so the article text does not depend
        # on the platform's default encoding.
        article_path = f"articles-guangdong/{i}_{date}.txt"
        with open(article_path, mode="w", encoding="utf-8") as article_file:
            article_file.write(text)


def download_link(link: str, timeout: float = 30.0) -> Tuple[str, str]:
    """Download the page at ``link`` and extract its article.

    Args:
        link: Full URL of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``(date, text)`` pair produced by ``extract_article``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    # Without a timeout a single stalled connection would hang the whole
    # scraping run.
    download = requests.get(link, timeout=timeout)
    # Fail loudly on error responses instead of parsing an error page as
    # if it were an article.
    download.raise_for_status()
    return extract_article(download.text)


def extract_article(website: str) -> Tuple[str, str]:
    """Extract the publication date and body text from an article page.

    Args:
        website: HTML source of the article page.

    Returns:
        A ``(date, text)`` tuple. ``date`` is the first ``MATCH_DATE``
        match found in the element with class ``date-row``, or
        ``"unknown"`` when that element is missing or contains no match.
        ``text`` is the whitespace-stripped text of the element with
        class ``article-content``, or an empty string when the page has
        no such element.
    """
    soup = BeautifulSoup(website, "html.parser")

    # Start from the fallback and only overwrite it on a successful match.
    date = "unknown"
    date_row = soup.find(class_="date-row")
    if date_row is not None:
        match = MATCH_DATE.search(date_row.get_text())
        if match:
            date = match.group(0)

    # `find` returns None when nothing matches; fall back to empty text
    # (like the "unknown" date) rather than crashing on `None.get_text()`.
    article = soup.find(class_="article-content")
    text = article.get_text().strip() if article is not None else ""
    return date, text


# Run the scraper only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()