scrape-yuanyuan/shanxi/scrape-links.py

r"""Script to scrape article links from http://sxwjw.shaanxi.gov.cn.

Links are available from the listing pages
http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/index(_\d+)?.html.

The page structure is almost exactly like that of ningxia. The only
difference is the class name of the content div.

Something like div.con-rt li > a
"""

from typing import List

import requests
from bs4 import BeautifulSoup

from utils.linkutils import Link, absolutize_link, dump_links

# Listing-page bounds. main() iterates range(PAGE_START - 1, PAGE_END), so the
# index_<n>.html suffixes requested run from PAGE_START - 1 to PAGE_END - 1.
# NOTE(review): presumably these are 1-based page numbers with page 1 served
# as plain index.html (and therefore not fetched here) -- confirm on the site.
PAGE_START = 2
PAGE_END = 20
# Directory URL of the listing pages; also passed to absolutize_link() as the
# base for the hrefs found on those pages.
PAGE_BASE = "http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/"


def main():
    """Gather the article links of every listing page and dump them to CSV.

    Pages are requested for each index suffix in
    ``range(PAGE_START - 1, PAGE_END)``; the combined links are handed to
    ``dump_links`` together with the opened ``articles-shanxi/links.csv``.
    """
    collected = []
    for page_number in range(PAGE_START - 1, PAGE_END):
        collected.extend(get_article_links(page_number))

    # The output file is only opened once every page has been fetched.
    with open("articles-shanxi/links.csv", "w+") as out_file:
        dump_links(collected, out_file)


def get_article_links(page: int) -> List[Link]:
    """Get all links from a specific page.

    Args:
        page: Numeric suffix of the listing page; the document fetched is
            ``index_{page}.html`` under ``PAGE_BASE``.

    Returns:
        One ``Link`` per (anchor, span) pair found under ``div.con-rt li``,
        built from the absolutized ``href`` of the anchor and the text of the
        span (presumably the publication date -- confirm against the site).
        Empty when the selectors match nothing.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout. HTTP error statuses are not
            raised; whatever body comes back is parsed as-is.
    """
    # PAGE_BASE already ends with "/"; strip it so the URL contains a single
    # separator rather than ".../xzgfxwj//index_N.html".
    page_link = f"{PAGE_BASE.rstrip('/')}/index_{page}.html"

    # Bound the request so one stalled connection cannot hang the whole run.
    response = requests.get(page_link, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    link_nodes = soup.select("div.con-rt li > a")
    date_nodes = soup.select("div.con-rt li > span")

    # Anchors and spans are selected independently and paired by position;
    # zip() stops at the shorter of the two lists.
    return [
        Link(absolutize_link(link.get("href"), PAGE_BASE), date.get_text())
        for link, date in zip(link_nodes, date_nodes)
    ]


if __name__ == "__main__":
    main()