Implement shanxi scraping

main
Tristan Daniël Maat 2022-04-09 23:06:53 +01:00
parent e0a4a26990
commit e111c1f081
Signed by: tlater
GPG Key ID: 49670FD774E43268
2 changed files with 70 additions and 0 deletions

19
shanxi/scrape-articles.py Normal file
View File

@ -0,0 +1,19 @@
"""Script to scrape article text from http://sxwjw.shaanxi.gov.cn.
Article contents are in a div with the class `message-box`.
"""
from utils.linkutils import read_links
from utils.scrapeutils import download_link_texts
def main():
    """Load the saved shanxi article links and download each article's text."""
    with open("articles-shanxi/links.csv", "r") as links_file:
        article_links = read_links(links_file)
        # Kept inside the with-block so the file is still open in case
        # read_links yields its results lazily.
        # "message-box" is the class of the div holding the article body.
        download_link_texts(article_links, "message-box", "articles-shanxi")
# Only start scraping when this file is run directly as a script.
if __name__ == "__main__":
    main()

51
shanxi/scrape-links.py Normal file
View File

@ -0,0 +1,51 @@
r"""Script to scrape article links from http://sxwjw.shaanxi.gov.cn.
Links are available from pages
http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/index(_\d+)?.html.
The page structure is almost exactly like that of ningxia. The only
difference is the class name of the content div; the link selector is
something like `div.con-rt li > a`.
"""
from typing import List
import requests
from bs4 import BeautifulSoup
from utils.linkutils import Link, absolutize_link, dump_links
PAGE_START = 2
PAGE_END = 20
PAGE_BASE = "http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/"
def main():
    """Gather the article links of every index page and write them to CSV."""
    collected = []
    # NOTE(review): this requests index_{PAGE_START - 1}.html up to
    # index_{PAGE_END - 1}.html; the unsuffixed index.html is never
    # fetched -- confirm that skipping it is intended.
    for page_number in range(PAGE_START - 1, PAGE_END):
        collected.extend(get_article_links(page_number))

    with open("articles-shanxi/links.csv", "w+") as csv_file:
        dump_links(collected, csv_file)
def get_article_links(page: int) -> List[Link]:
    """Get all article links listed on a specific index page.

    Args:
        page: Numeric suffix of the index page, i.e. the ``N`` in
            ``index_N.html``.

    Returns:
        One ``Link`` per (anchor, span) pair found in the list items under
        ``div.con-rt``. Each is built from the anchor's ``href``, made
        absolute against ``PAGE_BASE``, and the text of the matching span
        (presumably the publication date).

    Raises:
        requests.exceptions.RequestException: If the page cannot be
            fetched, including when the request times out.
    """
    # PAGE_BASE already ends in "/"; strip it before joining so the URL
    # does not contain a double slash.
    page_link = f"{PAGE_BASE.rstrip('/')}/index_{page}.html"

    # Without a timeout a stalled connection would hang the scrape forever.
    response = requests.get(page_link, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    link_nodes = soup.select("div.con-rt li > a")
    date_nodes = soup.select("div.con-rt li > span")

    # NOTE(review): anchors and spans are paired purely by position; a list
    # item missing either one would shift the pairing -- verify against the
    # live markup.
    return [
        Link(absolutize_link(link.get("href"), PAGE_BASE), date.get_text())
        for link, date in zip(link_nodes, date_nodes)
    ]
# Only collect links when this file is run directly as a script.
if __name__ == "__main__":
    main()