# scrape-yuanyuan/qinghai/scrape.py

r"""Script to scrape article contents from http://wsjkw.qinghai.gov.cn.

Links are available from pages
http://wsjkw.qinghai.gov.cn/zwgk/xxgkml/index\d*.html. Links are in
the second href of elements with the class `xxgk_content_title`. Dates
are the first span of the same element.

Article contents are in a div with the class `page_text`.
"""

import csv
from typing import List, Tuple

import requests
from bs4 import BeautifulSoup, Tag

# Bounds of the index pages to scrape. main() requests index{n}.html for
# every n in range(PAGE_START - 1, PAGE_END).
# NOTE(review): the "- 1" suggests these are 1-based page numbers while the
# site's index files are numbered from 0 -- confirm against the site.
PAGE_START = 14
PAGE_END = 75
# Prefix that get_article_links() joins with "/index{page}.html".
PAGE_BASE = "https://wsjkw.qinghai.gov.cn/zwgk/xxgkml"


def main():
    """Download every article linked from the configured index pages.

    Collects the (link, date) pairs from each index page in
    range(PAGE_START - 1, PAGE_END), records them in
    ``articles-qinghai/links.txt`` as CSV, then downloads each article and
    saves its text to ``articles-qinghai/{date}_{index}.txt``.

    The ``articles-qinghai`` directory must already exist.
    """
    links = [
        link
        for page in range(PAGE_START - 1, PAGE_END)
        for link in get_article_links(page)
    ]

    # Write out the links for reference.
    # newline="" lets the csv module control line endings itself, and the
    # explicit encoding keeps the output independent of the platform locale.
    with open(
        "articles-qinghai/links.txt", "w+", newline="", encoding="utf-8"
    ) as f:
        writer = csv.writer(f)
        writer.writerow(["index", "link", "date"])
        for i, (link, date) in enumerate(links):
            # writerow() takes a single iterable holding the row's fields.
            writer.writerow([i, link, date])

    total = len(links)
    for i, (link, date) in enumerate(links):
        print(f"Downloading {link} ({i}/{total})")

        text = download_article_text(link)
        # Article text is non-ASCII, so do not rely on the locale's default
        # encoding when writing it out.
        with open(
            f"articles-qinghai/{date}_{i}.txt", "w+", encoding="utf-8"
        ) as f:
            f.write(text)


def get_article_links(page: int) -> List[Tuple[str, str]]:
    """Collect the (article link, date) pairs listed on one index page.

    Fetches ``{PAGE_BASE}/index{page}.html`` and reads every element with
    the class ``xxgk_content_title``.
    """
    print(f"Scraping page {page}")

    response = requests.get(f"{PAGE_BASE}/index{page}.html")
    document = BeautifulSoup(response.text, "html.parser")

    results = []
    for entry in document.find_all(class_="xxgk_content_title"):
        # The date is the text of the entry's first <span>; the article
        # URL is the href of its second <a>.
        published = entry.span.get_text()
        href = entry.find_all("a")[1].get("href")
        results.append((href, published))

    return results


def download_article_text(link: str) -> str:
    """Fetch an article page and return its body text, whitespace-stripped.

    The body is the first element with the class ``page_text``.
    """
    response = requests.get(link)
    # The website responds with the wrong encoding, so decode as GBK
    # regardless of what the response reports.
    response.encoding = "gbk"

    document = BeautifulSoup(response.text, "html.parser")
    body = document.find(class_="page_text")
    return body.get_text().strip()


# Run the scraper only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()
Don't mistakenly refer to https links 2022-04-09 19:03:52 +01:00			`r"""Script to scrape article contents from http://wsjkw.qinghai.gov.cn.`
Implement scraping for qinghai 2022-04-09 18:31:14 +01:00
			`Links are available from pages`
Don't mistakenly refer to https links 2022-04-09 19:03:52 +01:00			`http://wsjkw.qinghai.gov.cn/zwgk/xxgkml/index\d*.html. Links are in`
Implement scraping for qinghai 2022-04-09 18:31:14 +01:00			the second href of elements with the class `xxgk_content_title`. Dates
			`are the first span of the same element.`

			Article contents are in a div with the class `page_text`.
			`"""`

			`import csv`
			`from typing import List, Tuple`

			`import requests`
			`from bs4 import BeautifulSoup, Tag`

			`PAGE_START = 14`
			`PAGE_END = 75`
			`PAGE_BASE = "https://wsjkw.qinghai.gov.cn/zwgk/xxgkml"`


			`def main():`
			`"""Download articles."""`
			`links = [`
			`link`
			`for page in range(PAGE_START - 1, PAGE_END)`
			`for link in get_article_links(page)`
			`]`

			`# Write out the links for reference`
			`with open("articles-qinghai/links.txt", "w+") as f:`
			`writer = csv.writer(f)`
			`writer.writerow(["index", "link", "date"])`
			`for i, link in enumerate(links):`
			`writer.writerow(i, link[0], link[1])`

			`for i, link in enumerate(links):`
			`print(f"Downloading {link[0]} ({i}/{len(links)})")`

			`text = download_article_text(link[0])`
Fix missing write setting on file 2022-04-09 18:58:40 +01:00			`with open(f"articles-qinghai/{link[1]}_{i}.txt", "w+") as f:`
Implement scraping for qinghai 2022-04-09 18:31:14 +01:00			`f.write(text)`


			`def get_article_links(page: int) -> List[Tuple[str, str]]:`
			`"""Get all (article link, date) tuples from a specific page."""`
			`print(f"Scraping page {page}")`

			`page_link = f"{PAGE_BASE}/index{page}.html"`
			`soup = BeautifulSoup(requests.get(page_link).text, "html.parser")`
			`titles = soup.find_all(class_="xxgk_content_title")`

			`def parse_title(title: Tag) -> Tuple[str, str]:`
			`date = title.span.get_text()`
			`link = title.find_all("a")[1].get("href")`

			`return link, date`

			`return [parse_title(title) for title in titles]`


			`def download_article_text(link: str) -> str:`
			`"""Get the text of an article from its link."""`
Override reported text encoding for qinghai 2022-04-09 18:58:58 +01:00			`request = requests.get(link)`
			`request.encoding = "gbk" # The website responds with the wrong encoding`
			`soup = BeautifulSoup(request.text, "html.parser")`
Implement scraping for qinghai 2022-04-09 18:31:14 +01:00			`return soup.find(class_="page_text").get_text().strip()`


			`if __name__ == "__main__":`
			`main()`