From 5c557bdb9d859c309b0316de94fd4ce5d4fa0018 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tristan=20Dani=C3=ABl=20Maat?=
Date: Sat, 9 Apr 2022 22:34:42 +0100
Subject: [PATCH] Implement scraping for ningxia

---
 ningxia/scrape-articles.py | 32 +++++++++++++++++++++
 ningxia/scrape-links.py    | 58 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)
 create mode 100644 ningxia/scrape-articles.py
 create mode 100644 ningxia/scrape-links.py

diff --git a/ningxia/scrape-articles.py b/ningxia/scrape-articles.py
new file mode 100644
index 0000000..b9541eb
--- /dev/null
+++ b/ningxia/scrape-articles.py
@@ -0,0 +1,32 @@
+r"""Script to scrape article text from http://wsjkw.nx.gov.cn.
+
+Article contents are in a div with the class `xl-content`.
+"""
+
+import requests
+from bs4 import BeautifulSoup
+
+from utils.linkutils import read_links
+
+
+def main():
+    """Collect and output article text."""
+    with open("articles-ningxia/links.txt", "r") as f:
+        links = read_links(f)
+
+    for i, link in enumerate(links):
+        print(f"Downloading {link.url} ({i + 1}/{len(links)})")
+        text = get_article_text(link.url)
+        with open(f"articles-ningxia/{link.date}_{i}.txt", "w+") as f:
+            f.write(text)
+
+
+def get_article_text(link: str) -> str:
+    """Download an article and return the text of its `xl-content` div."""
+    response = requests.get(link)
+    soup = BeautifulSoup(response.text, "html.parser")
+    return soup.find(class_="xl-content").get_text().strip()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ningxia/scrape-links.py b/ningxia/scrape-links.py
new file mode 100644
index 0000000..23091fd
--- /dev/null
+++ b/ningxia/scrape-links.py
@@ -0,0 +1,58 @@
+r"""Script to scrape article links from http://wsjkw.nx.gov.cn.
+
+Links are available from pages
+http://wsjkw.nx.gov.cn/xwzx_279/tzgg/index(_\d+)?.html.
+
+The page structure is a bit awkward: the articles sit in sub-lists of
+one big list, separated by decorative borders every few elements.
+
+The links can be selected with `div.gl-list li > a`.
+"""
+
+from typing import List
+
+import requests
+from bs4 import BeautifulSoup, Tag
+
+from utils.linkutils import Link, dump_links
+
+PAGE_START = 8
+PAGE_END = 44
+PAGE_BASE = "http://wsjkw.nx.gov.cn/xwzx_279/tzgg/"
+
+
+def main():
+    """Collect and output article links."""
+    links = [
+        link
+        for page in range(PAGE_START - 1, PAGE_END)
+        for link in get_article_links(page)
+    ]
+
+    with open("articles-ningxia/links.txt", "w+") as f:
+        dump_links(links, f)
+
+
+def get_article_links(page: int) -> List[Link]:
+    """Get all article Links (URL and date) from a specific page."""
+    page_link = f"{PAGE_BASE}index_{page}.html"
+    soup = BeautifulSoup(requests.get(page_link).text, "html.parser")
+
+    link_nodes = soup.select("div.gl-list li > a")
+    date_nodes = soup.select("div.gl-list li > span")
+
+    def parse_link(tag: Tag) -> str:
+        link: str = tag.get("href")
+        if link.startswith("./"):
+            link = PAGE_BASE + link[2:]
+
+        return link
+
+    return [
+        Link(parse_link(link), date.get_text()[1:-1])
+        for link, date in zip(link_nodes, date_nodes)
+    ]
+
+
+if __name__ == "__main__":
+    main()
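
Both scripts import Link, read_links and dump_links from a shared utils/linkutils.py
module that is not part of this patch. The sketch below is only an assumption about
that interface, inferred from how the two scripts use it; the module actually in the
repository may differ.

utils/linkutils.py (assumed interface):

# Hypothetical sketch of utils/linkutils.py -- not included in this patch.
# The Link type and the "url date" one-pair-per-line file format are
# assumptions inferred from scrape-links.py and scrape-articles.py above.
from typing import List, NamedTuple, TextIO


class Link(NamedTuple):
    """An article URL paired with its publication date."""

    url: str
    date: str


def dump_links(links: List[Link], f: TextIO) -> None:
    """Write one "url date" pair per line."""
    for link in links:
        f.write(f"{link.url} {link.date}\n")


def read_links(f: TextIO) -> List[Link]:
    """Read back links written by dump_links."""
    return [Link(*line.strip().split(" ", 1)) for line in f if line.strip()]

Under this assumption, scrape-links.py is run first to produce
articles-ningxia/links.txt, and scrape-articles.py then reads that file and writes
each article's text into the articles-ningxia/ directory.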