"""Script to scrape article contents from https://wsjkw.gd.gov.cn.

Links aren't conveniently available, since the page that lists them is
rendered entirely using JS, so we need to run a fully-fledged JS browser
to get them. We use `extract-urls.js` to extract the links beforehand,
and dump them to a file, which we can extract here.
"""
import re
from typing import Tuple

import requests
from bs4 import BeautifulSoup

# Matches an ISO-style date such as 2021-03-15 inside the article header.
MATCH_DATE = re.compile(r"\d\d\d\d-\d\d-\d\d")

# Seconds to wait for the remote server before giving up on a request.
REQUEST_TIMEOUT = 30


def main():
    """Read all links from the set of links and dump their articles to files."""
    with open("links/links.txt", encoding="utf-8") as f:
        links = f.readlines()
    for i, link in enumerate(links):
        # readlines() keeps the trailing newline; strip it so the URL we
        # actually request is clean (previously only the log line was stripped).
        link = link.rstrip()
        print(f"Downloading {link} ({i + 1}/{len(links)})")
        # The links aren't formatted correctly, we need to prefix
        # them with `http`
        date, text = download_link(f"http:{link}")
        # Articles are Chinese text, so pin the encoding instead of relying
        # on the platform default.
        with open(f"articles-guangdong/{i}_{date}.txt", mode="w", encoding="utf-8") as f:
            f.write(text)


def download_link(link: str) -> Tuple[str, str]:
    """Download a link and return its (date, article text) pair.

    Raises:
        requests.RequestException: on network failure or timeout.
        ValueError: if the page has no recognizable article body.
    """
    # Without a timeout a stalled server would hang the scrape forever.
    download = requests.get(link, timeout=REQUEST_TIMEOUT)
    return extract_article(download.text)


def extract_article(website: str) -> Tuple[str, str]:
    """Extract an article from raw HTML.

    Args:
        website: The full HTML text of an article page.

    Returns:
        A tuple of (date, text), where date is ``YYYY-MM-DD`` or
        ``"unknown"`` when no date could be found.

    Raises:
        ValueError: if the page contains no ``article-content`` element.
    """
    soup = BeautifulSoup(website, "html.parser")

    date = "unknown"
    date_row = soup.find(class_="date-row")
    if date_row:
        match = MATCH_DATE.search(date_row.get_text())
        if match:
            date = match.group(0)

    content = soup.find(class_="article-content")
    if content is None:
        # Fail loudly instead of the opaque AttributeError the old code raised.
        raise ValueError("page has no 'article-content' element")
    text = content.get_text().strip()
    return date, text


if __name__ == "__main__":
    main()