From 340feaa7ed5ac65e771fb9397bbff8c915e7c14c Mon Sep 17 00:00:00 2001
From: Tristan Daniël Maat
Date: Sat, 9 Apr 2022 18:31:14 +0100
Subject: [PATCH] Implement scraping for qinghai

---
 qinghai/scrape.py | 69 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 qinghai/scrape.py

diff --git a/qinghai/scrape.py b/qinghai/scrape.py
new file mode 100644
index 0000000..db65ae4
--- /dev/null
+++ b/qinghai/scrape.py
@@ -0,0 +1,69 @@
r"""Script to scrape article contents from https://wsjkw.qinghai.gov.cn.