Compare commits

4 commits: 8820ce1b95 ... e111c1f081

Author | SHA1 | Date
---|---|---
Tristan Daniël Maat | e111c1f081 |
Tristan Daniël Maat | e0a4a26990 |
Tristan Daniël Maat | 8c012a28b3 |
Tristan Daniël Maat | ff7b03bc2b |
```diff
@@ -12,7 +12,7 @@ We need:
 : page 11-42 (actually 8-44?)
 
 [Shanxi](http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/index.html)
-: page 2-18
+: page 2-18 (actually 2-20?)
 
 [Xinjiang](http://wjw.xinjiang.gov.cn/hfpc/zcwj4/zfxxgk_gknrz_10.shtml)
 : page 10-20
```
shanxi/scrape-articles.py (new file, 19 lines)

```python
"""Script to scrape article text from http://sxwjw.shaanxi.gov.cn.

Article contents are in a div with the class `message-box`.
"""

from utils.linkutils import read_links
from utils.scrapeutils import download_link_texts


def main():
    """Collect and output article text."""
    with open("articles-shanxi/links.csv", "r") as f:
        links = read_links(f)

    download_link_texts(links, "message-box", "articles-shanxi")


if __name__ == "__main__":
    main()
```
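The script assumes articles-shanxi/links.csv was already written by shanxi/scrape-links.py (next file). Below is a minimal sketch of what read_links expects from that file, assuming the repository root is on PYTHONPATH; the header text and the URL are placeholders, and only the column order (index, URL, date) is implied by read_links in the utils/linkutils.py hunk further down.

```python
import io

from utils.linkutils import read_links

# Hypothetical two-line sample of articles-shanxi/links.csv: a header row
# (skipped by read_links) followed by index, url, date columns.
sample = io.StringIO(
    ",url,date\n"
    "0,http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/example.html,2019-01-01\n"
)
for link in read_links(sample):
    print(link.url, link.date)
```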
shanxi/scrape-links.py (new file, 51 lines)

```python
r"""Script to scrape article links from http://sxwjw.shaanxi.gov.cn.

Links are available from pages
http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/index(\d+_)?.html.

The page structure is almost exactly like that of ningxia. Only
difference is the class name of the content div.

Something like div.con-rt li > a
"""

from typing import List

import requests
from bs4 import BeautifulSoup

from utils.linkutils import Link, absolutize_link, dump_links

PAGE_START = 2
PAGE_END = 20
PAGE_BASE = "http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/"


def main():
    """Collect and output article links."""
    links = [
        link
        for page in range(PAGE_START - 1, PAGE_END)
        for link in get_article_links(page)
    ]

    with open("articles-shanxi/links.csv", "w+") as f:
        dump_links(links, f)


def get_article_links(page: int) -> List[Link]:
    """Get all links from a specific page."""
    page_link = f"{PAGE_BASE}/index_{page}.html"
    soup = BeautifulSoup(requests.get(page_link).text, "html.parser")

    link_nodes = soup.select("div.con-rt li > a")
    date_nodes = soup.select("div.con-rt li > span")

    return [
        Link(absolutize_link(link.get("href"), PAGE_BASE), date.get_text())
        for link, date in zip(link_nodes, date_nodes)
    ]


if __name__ == "__main__":
    main()
```
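A side note on the page arithmetic, since the hunk above says "page 2-18 (actually 2-20?)": the first listing page is presumably the plain index.html linked in that hunk, so range(PAGE_START - 1, PAGE_END) requests index_1.html through index_19.html, i.e. listing pages 2 to 20. A standalone sketch, with the constants copied from the script and no network access:

```python
# Mirrors the loop in shanxi/scrape-links.py to show which listing pages
# get fetched; the doubled slash comes from the trailing "/" in PAGE_BASE
# and is kept here exactly as the script builds it.
PAGE_START = 2
PAGE_END = 20
PAGE_BASE = "http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/"

for page in range(PAGE_START - 1, PAGE_END):
    print(f"{PAGE_BASE}/index_{page}.html")
# prints .../index_1.html up to .../index_19.html
```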
utils/linkutils.py

```diff
@@ -23,3 +23,10 @@ def read_links(f: TextIO) -> List[Link]:
     reader = csv.reader(f)
     next(reader)  # Skip the header
     return [Link(link[1], link[2]) for link in reader]
+
+
+def absolutize_link(link: str, page_base: str) -> str:
+    """Ensure we have an absolute url."""
+    if link.startswith("./"):
+        link = page_base + link[2:]
+    return link
```
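A quick behavioural sketch of the new absolutize_link helper; the relative path is a made-up example of the "./..." hrefs the listing pages apparently emit:

```python
from utils.linkutils import absolutize_link

PAGE_BASE = "http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/"

# hrefs starting with "./" get joined onto the listing page base ...
print(absolutize_link("./201901/example.html", PAGE_BASE))
# ... anything else, including already-absolute URLs, is returned unchanged.
print(absolutize_link("http://wjw.xinjiang.gov.cn/example.shtml", PAGE_BASE))
```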
utils/scrapeutils.py (new file, 27 lines)

```python
"""Utility functions for scraping."""
from typing import List

import requests
from bs4 import BeautifulSoup

from .linkutils import Link


def download_link_texts(
    links: List[Link], class_: str, directory: str, encoding: str = None
):
    """Download link texts contained HTML elements with the given class to a dir."""
    for i, link in enumerate(links):
        print(f"Downloading {link.url} ({i+1}/{len(links)})")
        text = get_link_text(link.url, class_, encoding)
        with open(f"{directory}/{link.date}_{i}.txt", "w+") as f:
            f.write(text)


def get_link_text(link: str, class_: str, encoding: str = None) -> str:
    """Get the text of a div with a given classname on a webpage."""
    request = requests.get(link)
    if encoding:
        request.encoding = encoding
    soup = BeautifulSoup(request.text, "html.parser")
    return soup.find(class_=class_).get_text().strip()
```
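Finally, a usage sketch for the two scrapeutils helpers, mirroring how shanxi/scrape-articles.py calls them. The article URL and date are placeholders, the output directory must already exist, and the encoding override is only worth passing when requests misdetects the page charset:

```python
from utils.linkutils import Link
from utils.scrapeutils import download_link_texts, get_link_text

# One hypothetical article; "message-box" is the content div class the
# Shanxi pages use, per shanxi/scrape-articles.py.
links = [Link("http://sxwjw.shaanxi.gov.cn/zfxxgk/example.html", "2019-01-01")]

# Writes articles-shanxi/2019-01-01_0.txt with the extracted text.
download_link_texts(links, "message-box", "articles-shanxi", encoding="utf-8")

# Or fetch one page's text without touching the filesystem.
text = get_link_text(links[0].url, "message-box")
```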