scrape-yuanyuan/scrape.py

62 lines
1.6 KiB
Python
Raw Normal View History

2022-04-09 14:44:18 +01:00
"""Script to scrape article contents from https://wsjkw.gd.gov.cn.

The page that lists the article links is rendered entirely using JS, so
the links aren't conveniently available and we need to run a
fully-fledged JS browser to get them.

We use `extract-urls.js` to extract the links beforehand and dump them
to a file, which we read here.
"""
import re
from typing import Tuple
import requests
from bs4 import BeautifulSoup
# Matches a `dddd-dd-dd` run of digits such as `2022-04-09`; compiled once
# at import time so it can be reused for every article.
MATCH_DATE = re.compile(r"\d\d\d\d-\d\d-\d\d")
def main():
    """Read all links from the links file and dump their articles to files.

    Links are read from `links/links.txt`, one per line; blank lines are
    ignored. Each article is written to
    `articles-guangdong/{index}_{date}.txt`, where `index` is the
    zero-based position of the link among the non-blank lines and `date`
    is the value returned by `download_link`.
    """
    with open("links/links.txt", encoding="utf-8") as links_file:
        # Strip each line so the trailing newline does not end up inside
        # the URL, and drop blank lines so we never request a bare "http:".
        links = [line.strip() for line in links_file if line.strip()]

    for i, link in enumerate(links):
        print(f"Downloading {link} ({i}/{len(links)})")
        # The links aren't formatted correctly, we need to prefix
        # them with `http`
        url = f"http:{link}"
        date, text = download_link(url)
        # Write explicitly as UTF-8 so the output does not depend on the
        # platform's default encoding.
        out_path = f"articles-guangdong/{i}_{date}.txt"
        with open(out_path, mode="w", encoding="utf-8") as out:
            out.write(text)
def download_link(link: str) -> Tuple[str, str]:
    """Download the page at `link` and extract its article.

    Args:
        link: Full URL of the article page.

    Returns:
        The `(date, text)` pair produced by `extract_article` for the
        downloaded page.

    Raises:
        requests.RequestException: If the request fails, times out, or
            the server responds with an HTTP error status.
    """
    # Without a timeout `requests` can wait forever on a stalled server,
    # which would hang the whole scraping run.
    download = requests.get(link, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page as an article.
    download.raise_for_status()
    return extract_article(download.text)
def extract_article(website: str) -> Tuple[str, str]:
    """Extract the date and body text of an article from its HTML.

    Args:
        website: HTML source of the article page.

    Returns:
        A `(date, text)` tuple. `date` is the first substring matching
        `MATCH_DATE` inside the element with class `date-row`, or
        `"unknown"` when that element is missing or contains no match.
        `text` is the whitespace-stripped text of the element with class
        `article-content`, or an empty string when the page has no such
        element.
    """
    soup = BeautifulSoup(website, "html.parser")

    # Start from the fallback so every "not found" path shares one default.
    date = "unknown"
    date_row = soup.find(class_="date-row")
    if date_row is not None:
        match = MATCH_DATE.search(date_row.get_text())
        if match:
            date = match.group(0)

    # `find` returns None when nothing matches; guard against it rather
    # than crashing with an AttributeError on pages without the element.
    content = soup.find(class_="article-content")
    text = content.get_text().strip() if content is not None else ""
    return date, text
# Run the scraper only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()