From e111c1f0816f03739aaf35685edc08c70b7c8e4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tristan=20Dani=C3=ABl=20Maat?=
Date: Sat, 9 Apr 2022 23:06:53 +0100
Subject: [PATCH] Implement shanxi scraping

---
 shanxi/scrape-articles.py | 19 +++++++++++++++
 shanxi/scrape-links.py    | 51 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)
 create mode 100644 shanxi/scrape-articles.py
 create mode 100644 shanxi/scrape-links.py

diff --git a/shanxi/scrape-articles.py b/shanxi/scrape-articles.py
new file mode 100644
index 0000000..0e85164
--- /dev/null
+++ b/shanxi/scrape-articles.py
@@ -0,0 +1,19 @@
+"""Script to scrape article text from http://sxwjw.shaanxi.gov.cn.
+
+Article contents are in a div with the class `message-box`.
+"""
+
+from utils.linkutils import read_links
+from utils.scrapeutils import download_link_texts
+
+
+def main():
+    """Collect and output article text."""
+    with open("articles-shanxi/links.csv", "r") as f:
+        links = read_links(f)
+
+    download_link_texts(links, "message-box", "articles-shanxi")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/shanxi/scrape-links.py b/shanxi/scrape-links.py
new file mode 100644
index 0000000..32a21d8
--- /dev/null
+++ b/shanxi/scrape-links.py
@@ -0,0 +1,51 @@
+r"""Script to scrape article links from http://sxwjw.shaanxi.gov.cn.
+
+Links are available from pages
+http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/index(_\d+)?.html.
+
+The page structure is almost exactly like that of ningxia; the only
+difference is the class name of the content div.
+
+Links are selected with `div.con-rt li > a`, dates with `div.con-rt li > span`.
+"""
+
+from typing import List
+
+import requests
+from bs4 import BeautifulSoup
+
+from utils.linkutils import Link, absolutize_link, dump_links
+
+PAGE_START = 2
+PAGE_END = 20
+PAGE_BASE = "http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/"
+
+
+def main():
+    """Collect and output article links."""
+    links = [
+        link
+        for page in range(PAGE_START - 1, PAGE_END)
+        for link in get_article_links(page)
+    ]
+
+    with open("articles-shanxi/links.csv", "w+") as f:
+        dump_links(links, f)
+
+
+def get_article_links(page: int) -> List[Link]:
+    """Get all links from a specific page."""
+    page_link = f"{PAGE_BASE}index_{page}.html"
+    soup = BeautifulSoup(requests.get(page_link).text, "html.parser")
+
+    link_nodes = soup.select("div.con-rt li > a")
+    date_nodes = soup.select("div.con-rt li > span")
+
+    return [
+        Link(absolutize_link(link.get("href"), PAGE_BASE), date.get_text())
+        for link, date in zip(link_nodes, date_nodes)
+    ]
+
+
+if __name__ == "__main__":
+    main()
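
Note (not part of the patch): both scripts import helpers from utils.linkutils and utils.scrapeutils that are not included in this diff. Purely as a reading aid, the sketch below shows one plausible shape for those helpers; every name and signature here is an assumption, and the repository's actual implementations may differ.

# HYPOTHETICAL sketch of the helpers the patch imports -- not part of the diff
# and not necessarily how utils.linkutils / utils.scrapeutils are implemented.
import csv
import os
from typing import IO, List, NamedTuple
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


class Link(NamedTuple):
    """A scraped article link plus its publication date."""

    url: str
    date: str


def absolutize_link(href: str, base: str) -> str:
    """Resolve a possibly relative href against the listing page's URL."""
    return urljoin(base, href)


def dump_links(links: List[Link], f: IO[str]) -> None:
    """Write links to a CSV file as (url, date) rows."""
    csv.writer(f).writerows(links)


def read_links(f: IO[str]) -> List[Link]:
    """Read back the links written by dump_links."""
    return [Link(url, date) for url, date in csv.reader(f)]


def download_link_texts(links: List[Link], css_class: str, out_dir: str) -> None:
    """Download each article and save the text of the div with `css_class`."""
    os.makedirs(out_dir, exist_ok=True)
    for i, link in enumerate(links):
        soup = BeautifulSoup(requests.get(link.url).text, "html.parser")
        content = soup.find("div", class_=css_class)
        if content is None:
            continue  # page without the expected content div
        with open(os.path.join(out_dir, f"{i}.txt"), "w", encoding="utf-8") as out:
            out.write(content.get_text())

In use, shanxi/scrape-links.py would be run first to produce articles-shanxi/links.csv, and shanxi/scrape-articles.py afterwards to download the article bodies into the same directory.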