Implement scraping for qinghai
This commit is contained in:
parent
6dace44412
commit
340feaa7ed
69
qinghai/scrape.py
Normal file
69
qinghai/scrape.py
Normal file
|
@ -0,0 +1,69 @@
|
||||||
|
r"""Script to scrape article contents from https://wsjkw.qinghai.gov.cn.
|
||||||
|
|
||||||
|
Links are available from pages
|
||||||
|
https://wsjkw.qinghai.gov.cn/zwgk/xxgkml/index\d*.html. Links are in
|
||||||
|
the second href of elements with the class `xxgk_content_title`. Dates
|
||||||
|
are the first span of the same element.
|
||||||
|
|
||||||
|
Article contents are in a div with the class `page_text`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import csv
|
||||||
|
from typing import List, Tuple
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
|
||||||
|
# First and last listing-page numbers to scrape (inclusive); the index
# pages are named index{N}.html under PAGE_BASE.
PAGE_START = 14
PAGE_END = 75
# Root URL of the site's information-disclosure listing section.
PAGE_BASE = "https://wsjkw.qinghai.gov.cn/zwgk/xxgkml"
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Download articles.

    Collects (link, date) tuples from every listing page, records them
    in a CSV index file, then downloads each article's text into
    ``articles-qinghai/``.
    """
    links = [
        link
        for page in range(PAGE_START - 1, PAGE_END)
        for link in get_article_links(page)
    ]

    # Write out the links for reference.  newline="" lets the csv
    # writer control line endings itself (per the csv module docs).
    with open("articles-qinghai/links.txt", "w+", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["index", "link", "date"])
        for i, link in enumerate(links):
            # BUG FIX: writerow takes a single iterable row, not
            # separate positional arguments.
            writer.writerow([i, link[0], link[1]])

    for i, link in enumerate(links):
        print(f"Downloading {link[0]} ({i}/{len(links)})")

        text = download_article_text(link[0])
        # BUG FIX: the file must be opened for writing — the original
        # used the default read mode, so f.write would raise.  utf-8
        # keeps the Chinese article text intact on any platform locale.
        with open(f"articles-qinghai/{link[1]}_{i}.txt", "w", encoding="utf-8") as f:
            f.write(text)
|
||||||
|
|
||||||
|
|
||||||
|
def get_article_links(page: int) -> List[Tuple[str, str]]:
    """Get all (article link, date) tuples from a specific page."""
    print(f"Scraping page {page}")

    page_link = f"{PAGE_BASE}/index{page}.html"
    response = requests.get(page_link)
    soup = BeautifulSoup(response.text, "html.parser")

    def parse_title(entry: Tag) -> Tuple[str, str]:
        # The date sits in the entry's first <span>; the article URL is
        # the href of its second <a>.
        published = entry.span.get_text()
        anchors = entry.find_all("a")
        return anchors[1].get("href"), published

    return [
        parse_title(entry)
        for entry in soup.find_all(class_="xxgk_content_title")
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def download_article_text(link: str) -> str:
    """Get the text of an article from its link."""
    response = requests.get(link)
    parsed = BeautifulSoup(response.text, "html.parser")
    # Article bodies live in a div with the class `page_text`.
    body = parsed.find(class_="page_text")
    return body.get_text().strip()
|
||||||
|
|
||||||
|
|
||||||
|
# Run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
Loading…
Reference in a new issue