"""Script to scrape article contents from https://wsjkw.gd.gov.cn.
|
|
|
|
Links aren't conveniently available, since the page that lists them is
|
|
rendered entirely using JS, so we need to run a fully-fledged JS
|
|
browser to get them.
|
|
|
|
We use `extract-urls.js` to extract the links beforehand, and dump
|
|
them to a file, which we can extract here.
|
|
"""
import re
from typing import Tuple

import requests
from bs4 import BeautifulSoup

# Article dates on the site appear as YYYY-MM-DD.
MATCH_DATE = re.compile(r"\d\d\d\d-\d\d-\d\d")


def main():
    """Read all links from links/links.txt and dump their articles to files."""
with open("links/links.txt") as f:
        links = f.readlines()

for i, link in enumerate(links):
        print(f"Downloading {link.rstrip()} ({i + 1}/{len(links)})")

        # The links are scheme-less (`//host/path`), so prefix them with
        # `http:` and strip the newline that readlines() left behind.
        link = f"http:{link.strip()}"
date, text = download_link(link)

        # Note: the articles-guangdong/ directory must already exist,
        # or open() raises FileNotFoundError.
        with open(f"articles-guangdong/{i}_{date}.txt", mode="w") as f:
            f.write(text)


def download_link(link: str) -> Tuple[str, str]:
    """Download a link and return the article's (date, text)."""
    # A timeout keeps a single dead link from hanging the whole run.
    download = requests.get(link, timeout=30)
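    # Hedge: if the server doesn't declare a charset, requests may
    # mis-decode `.text`; fall back to the detected encoding then.
    if not download.encoding:
        download.encoding = download.apparent_encoding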
    return extract_article(download.text)


def extract_article(website: str) -> Tuple[str, str]:
    """Extract the publication date and article text from a page's HTML."""
    # html.parser is the stdlib parser: slower than lxml but dependency-free.
    soup = BeautifulSoup(website, "html.parser")
    # The publication date sits in a "date-row" element; fall back to
    # "unknown" when the element or a recognizable date is missing.
    date_row = soup.find(class_="date-row")
    date = "unknown"
    if date_row:
        match = MATCH_DATE.search(date_row.get_text())
        if match:
            date = match.group(0)

    # Assumes every article page has an "article-content" element; if
    # one doesn't, this raises AttributeError and aborts the run.
    text = soup.find(class_="article-content").get_text().strip()
    return date, text


if __name__ == "__main__":
main()