r"""Script to scrape article contents from http://wsjkw.qinghai.gov.cn.

Links are available from pages
http://wsjkw.qinghai.gov.cn/zwgk/xxgkml/index\d*.html.

Links are in the second href of elements with the class `xxgk_content_title`.
Dates are the first span of the same element.

Article contents are in a div with the class `page_text`.
"""
import csv
from typing import List, Tuple

import requests
from bs4 import BeautifulSoup, Tag

PAGE_START = 14
PAGE_END = 75
PAGE_BASE = "https://wsjkw.qinghai.gov.cn/zwgk/xxgkml"


def main():
    """Download articles.

    Collects the (link, date) pairs from every index page in the configured
    range, writes them to ``articles-qinghai/links.txt`` as CSV for
    reference, then saves the text of each article to
    ``articles-qinghai/<date>_<index>.txt``.

    The ``articles-qinghai`` directory must already exist.
    """
    # NOTE(review): the range covers index numbers PAGE_START - 1 through
    # PAGE_END - 1; presumably the site numbers its index pages one lower
    # than the page shown to the user -- confirm against the site.
    links = [
        link
        for page in range(PAGE_START - 1, PAGE_END)
        for link in get_article_links(page)
    ]

    # Write out the links for reference.
    # newline="" is what the csv module expects of the file object, and the
    # explicit encoding keeps the output independent of the locale default.
    with open(
        "articles-qinghai/links.txt", "w", newline="", encoding="utf-8"
    ) as f:
        writer = csv.writer(f)
        writer.writerow(["index", "link", "date"])
        for i, (url, date) in enumerate(links):
            # writerow takes a single row iterable, not separate arguments.
            writer.writerow([i, url, date])

    # Broken links
    #
    # 275 was available as an iframe, and is parsed separately in
    # scrape-iframe.py
    skipped = {210, 275, 453, 681, 703, 791, 871, 913, 914, 915}

    for i, (url, date) in enumerate(links):
        if i in skipped:
            continue

        print(f"Downloading {url} ({i}/{len(links)})")
        text = download_article_text(url)
        # Article text is Chinese, so pin the output encoding to UTF-8
        # instead of relying on the platform default.
        with open(
            f"articles-qinghai/{date}_{i}.txt", "w", encoding="utf-8"
        ) as f:
            f.write(text)


def get_article_links(page: int) -> List[Tuple[str, str]]:
    """Get all (article link, date) tuples from a specific page.

    Args:
        page: Number substituted into ``{PAGE_BASE}/index{page}.html``.

    Returns:
        One ``(link, date)`` tuple per element with the class
        ``xxgk_content_title`` found on the page, in document order.
    """
    print(f"Scraping page {page}")
    page_link = f"{PAGE_BASE}/index{page}.html"
    soup = BeautifulSoup(requests.get(page_link).text, "html.parser")
    titles = soup.find_all(class_="xxgk_content_title")

    def parse_title(title: Tag) -> Tuple[str, str]:
        """Extract (link, date) from one title element."""
        # The date is the text of the element's first <span>.
        date = title.span.get_text()
        # The article URL is the href of the element's second <a>.
        link = title.find_all("a")[1].get("href")
        return link, date

    return [parse_title(title) for title in titles]


def download_article_text(link: str) -> str:
    """Get the text of an article from its link.

    Args:
        link: URL of the article page.

    Returns:
        The stripped text content of the first element with the class
        ``page_text``.

    Raises:
        AttributeError: If the page has no element with the class
            ``page_text`` (``soup.find`` returns ``None``).
    """
    request = requests.get(link)
    request.encoding = "gbk"  # The website responds with the wrong encoding
    soup = BeautifulSoup(request.text, "html.parser")
    return soup.find(class_="page_text").get_text().strip()


if __name__ == "__main__":
    main()