r"""Script to scrape article contents from http://wsjkw.qinghai.gov.cn.

Article links are listed on the pages
http://wsjkw.qinghai.gov.cn/zwgk/xxgkml/index\d*.html. Each link is the
href of the second anchor in an element with the class
`xxgk_content_title`; the date is the text of the first span in the
same element.

Article contents are in a div with the class `page_text`.
"""

import csv
from typing import List, Tuple

import requests
from bs4 import BeautifulSoup, Tag

# main() walks range(PAGE_START - 1, PAGE_END), i.e. listing pages
# index13.html through index74.html.
PAGE_START = 14
PAGE_END = 75
PAGE_BASE = "https://wsjkw.qinghai.gov.cn/zwgk/xxgkml"


def main():
    """Download articles."""
    links = [
        link
        for page in range(PAGE_START - 1, PAGE_END)
        for link in get_article_links(page)
    ]

    # Write out the links for reference. The articles-qinghai directory
    # must already exist; newline="" is how the csv module expects its
    # output file to be opened.
    with open("articles-qinghai/links.txt", "w+", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["index", "link", "date"])
        for i, link in enumerate(links):
            # writerow takes a single iterable of fields, not one
            # positional argument per field.
            writer.writerow([i, link[0], link[1]])

    for i, link in enumerate(links):
        # Broken links
        #
        # 275 was available as an iframe, and is parsed separately in
        # scrape-iframe.py
        if i in (210, 275, 453, 681, 703, 791, 871, 913, 914, 915):
            continue

        print(f"Downloading {link[0]} ({i}/{len(links)})")

        text = download_article_text(link[0])
        # Encode explicitly: the article text is Chinese and the
        # platform's default encoding may not handle it.
        with open(f"articles-qinghai/{link[1]}_{i}.txt", "w+", encoding="utf-8") as f:
            f.write(text)


def get_article_links(page: int) -> List[Tuple[str, str]]:
    """Get all (article link, date) tuples from a specific page."""
    print(f"Scraping page {page}")

    page_link = f"{PAGE_BASE}/index{page}.html"
    soup = BeautifulSoup(requests.get(page_link).text, "html.parser")
    titles = soup.find_all(class_="xxgk_content_title")

    def parse_title(title: Tag) -> Tuple[str, str]:
        date = title.span.get_text()
        # The article link is the href of the second anchor in the entry.
        link = title.find_all("a")[1].get("href")
        return link, date

    return [parse_title(title) for title in titles]
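
# A minimal sanity check of the parsing logic, assuming markup shaped
# like the hypothetical fragment near the top of this file. It is not
# called by main(); run it by hand if the site's markup changes.
_SAMPLE_ENTRY = (
    '<div class="xxgk_content_title"><span>2022-01-01</span>'
    '<a href="#">x</a><a href="/zwgk/a.html">title</a></div>'
)


def _check_parsing() -> None:
    """Assert that (link, date) extraction works on _SAMPLE_ENTRY."""
    title = BeautifulSoup(_SAMPLE_ENTRY, "html.parser").find(
        class_="xxgk_content_title"
    )
    assert title.find_all("a")[1].get("href") == "/zwgk/a.html"
    assert title.span.get_text() == "2022-01-01"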


def download_article_text(link: str) -> str:
    """Get the text of an article from its link."""
    response = requests.get(link)
    response.encoding = "gbk"  # The website responds with the wrong encoding
    soup = BeautifulSoup(response.text, "html.parser")
    return soup.find(class_="page_text").get_text().strip()
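
# Hedged fallback, not used by main(): if "gbk" is ever wrong for a
# page, requests can instead sniff the encoding from the response body.
# apparent_encoding is slower but makes no assumption about the site.
def download_article_text_sniffed(link: str) -> str:
    """Variant of download_article_text that guesses the encoding."""
    response = requests.get(link)
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, "html.parser")
    return soup.find(class_="page_text").get_text().strip()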


if __name__ == "__main__":
    main()