From e111c1f0816f03739aaf35685edc08c70b7c8e4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tristan=20Dani=C3=ABl=20Maat?=
Date: Sat, 9 Apr 2022 23:06:53 +0100
Subject: [PATCH] Implement shanxi scraping

---
 shanxi/scrape-articles.py | 19 +++++++++++++++
 shanxi/scrape-links.py    | 51 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)
 create mode 100644 shanxi/scrape-articles.py
 create mode 100644 shanxi/scrape-links.py

diff --git a/shanxi/scrape-articles.py b/shanxi/scrape-articles.py
new file mode 100644
index 0000000..0e85164
--- /dev/null
+++ b/shanxi/scrape-articles.py
@@ -0,0 +1,19 @@
+"""Script to scrape article text from http://sxwjw.shaanxi.gov.cn.
+
+Article contents are in a div with the class `message-box`.
+"""
+
+from utils.linkutils import read_links
+from utils.scrapeutils import download_link_texts
+
+
+def main():
+    """Collect and output article text."""
+    with open("articles-shanxi/links.csv", "r") as f:
+        links = read_links(f)
+
+    download_link_texts(links, "message-box", "articles-shanxi")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/shanxi/scrape-links.py b/shanxi/scrape-links.py
new file mode 100644
index 0000000..32a21d8
--- /dev/null
+++ b/shanxi/scrape-links.py
@@ -0,0 +1,51 @@
+r"""Script to scrape article links from http://sxwjw.shaanxi.gov.cn.
+
+Links are available from pages
+http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/index(_\d+)?.html.
+
+The page structure is almost exactly like that of ningxia; the only
+difference is the class name of the content div.
+
+Links are selected with `div.con-rt li > a`, dates with `div.con-rt li > span`.
+"""
+
+from typing import List
+
+import requests
+from bs4 import BeautifulSoup
+
+from utils.linkutils import Link, absolutize_link, dump_links
+
+PAGE_START = 2
+PAGE_END = 20
+PAGE_BASE = "http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/"
+
+
+def main():
+    """Collect and output article links."""
+    links = [
+        link
+        for page in range(PAGE_START - 1, PAGE_END)
+        for link in get_article_links(page)
+    ]
+
+    with open("articles-shanxi/links.csv", "w+") as f:
+        dump_links(links, f)
+
+
+def get_article_links(page: int) -> List[Link]:
+    """Get all links from a specific page."""
+    page_link = f"{PAGE_BASE}index_{page}.html"
+    soup = BeautifulSoup(requests.get(page_link).text, "html.parser")
+
+    link_nodes = soup.select("div.con-rt li > a")
+    date_nodes = soup.select("div.con-rt li > span")
+
+    return [
+        Link(absolutize_link(link.get("href"), PAGE_BASE), date.get_text())
+        for link, date in zip(link_nodes, date_nodes)
+    ]
+
+
+if __name__ == "__main__":
+    main()
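
Note (not part of the patch): both scripts import helpers from utils.linkutils and utils.scrapeutils that are not included in this diff. Purely as a reading aid, the sketch below shows one plausible shape for those helpers; every name and signature here is an assumption, and the repository's actual implementations may differ.

# HYPOTHETICAL sketch of the helpers the patch imports -- not part of the diff
# and not necessarily how utils.linkutils / utils.scrapeutils are implemented.
import csv
import os
from typing import IO, List, NamedTuple
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


class Link(NamedTuple):
    """A scraped article link plus its publication date."""

    url: str
    date: str


def absolutize_link(href: str, base: str) -> str:
    """Resolve a possibly relative href against the listing page's URL."""
    return urljoin(base, href)


def dump_links(links: List[Link], f: IO[str]) -> None:
    """Write links to a CSV file as (url, date) rows."""
    csv.writer(f).writerows(links)


def read_links(f: IO[str]) -> List[Link]:
    """Read back the links written by dump_links."""
    return [Link(url, date) for url, date in csv.reader(f)]


def download_link_texts(links: List[Link], css_class: str, out_dir: str) -> None:
    """Download each article and save the text of the div with `css_class`."""
    os.makedirs(out_dir, exist_ok=True)
    for i, link in enumerate(links):
        soup = BeautifulSoup(requests.get(link.url).text, "html.parser")
        content = soup.find("div", class_=css_class)
        if content is None:
            continue  # page without the expected content div
        with open(os.path.join(out_dir, f"{i}.txt"), "w", encoding="utf-8") as out:
            out.write(content.get_text())

In use, shanxi/scrape-links.py would be run first to produce articles-shanxi/links.csv, and shanxi/scrape-articles.py afterwards to download the article bodies into the same directory.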