From 340feaa7ed5ac65e771fb9397bbff8c915e7c14c Mon Sep 17 00:00:00 2001
From: Tristan Daniël Maat
Date: Sat, 9 Apr 2022 18:31:14 +0100
Subject: [PATCH] Implement scraping for qinghai

---
 qinghai/scrape.py | 69 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 qinghai/scrape.py

diff --git a/qinghai/scrape.py b/qinghai/scrape.py
new file mode 100644
index 0000000..db65ae4
--- /dev/null
+++ b/qinghai/scrape.py
@@ -0,0 +1,69 @@
r"""Script to scrape article contents from https://wsjkw.qinghai.gov.cn.