Implement shanxi scraping

This commit is contained in:
Tristan Daniël Maat 2022-04-09 23:06:53 +01:00
parent e0a4a26990
commit e111c1f081
Signed by: tlater
GPG key ID: 49670FD774E43268
2 changed files with 70 additions and 0 deletions

19
shanxi/scrape-articles.py Normal file
View file

@@ -0,0 +1,19 @@
"""Script to scrape article text from http://sxwjw.shaanxi.gov.cn.
Article contents are in a div with the class `message-box`.
"""
from utils.linkutils import read_links
from utils.scrapeutils import download_link_texts
def main():
    """Read the saved article links and download each article's text.

    The links are loaded from ``articles-shanxi/links.csv`` with
    ``read_links`` and then passed to ``download_link_texts`` along with
    the ``message-box`` class name (the div that holds the article body on
    this site) and the ``articles-shanxi`` name (presumably the output
    directory -- confirm against ``utils.scrapeutils``).
    """
    target = "articles-shanxi"
    content_class = "message-box"

    with open("articles-shanxi/links.csv", "r") as links_file:
        article_links = read_links(links_file)
        # Download while the file is still open so this keeps working even
        # if read_links hands back a lazy iterator over the file.
        download_link_texts(article_links, content_class, target)
# Only start scraping when this file is executed as a script; importing it
# as a module must not trigger any file reads or downloads.
if __name__ == "__main__":
    main()