Implement shanxi scraping

2022-04-09 23:06:53 +01:00 · 2022-04-09 23:06:53 +01:00 · e111c1f081
commit e111c1f081
parent e0a4a26990
2 changed files with 70 additions and 0 deletions
--- a/shanxi/scrape-articles.py
+++ b/shanxi/scrape-articles.py
@@ -0,0 +1,19 @@
+"""Script to scrape article text from http://sxwjw.shaanxi.gov.cn.
+
+Article contents are in a div with the class `message-box`.
+"""
+
+from utils.linkutils import read_links
+from utils.scrapeutils import download_link_texts
+
+
+def main():
+    """Collect and output article text."""
+    with open("articles-shanxi/links.csv", "r") as f:
+        links = read_links(f)
+
+    download_link_texts(links, "message-box", "articles-shanxi")
+
+
+if __name__ == "__main__":  # script entry point; skipped when imported
+    main()