r"""Script to scrape article links from http://sxwjw.shaanxi.gov.cn.

Links are available from pages
http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/index(_\d+)?.html.

The page structure is almost exactly like that of ningxia. The only
difference is the class name of the content div; the article anchors are
matched by a selector like ``div.con-rt li > a``.
"""
from typing import List

import requests
from bs4 import BeautifulSoup

from utils.linkutils import Link, absolutize_link, dump_links

# Listing pages are requested as ``index_<n>.html``. main() iterates over
# range(PAGE_START - 1, PAGE_END), i.e. URL suffixes 1 through 19 with the
# values below.
# NOTE(review): presumably the unsuffixed ``index.html`` is page 1, which
# would make suffix n the (n + 1)-th page; it is not fetched here -- confirm.
PAGE_START = 2
PAGE_END = 20
PAGE_BASE = "http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/"

# Seconds to wait for the server before giving up on a listing page; without
# a timeout a stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 30


def main():
    """Collect article links from every listing page and write them to CSV.

    Pages are fetched sequentially and their links concatenated in page
    order, then handed to ``dump_links`` together with the output file.
    """
    links = [
        link
        for page in range(PAGE_START - 1, PAGE_END)
        for link in get_article_links(page)
    ]
    with open("articles-shanxi/links.csv", "w+") as f:
        dump_links(links, f)


def get_article_links(page: int) -> List[Link]:
    """Get all links from a specific listing page.

    Args:
        page: Numeric suffix of the listing page, used to build
            ``index_<page>.html`` under ``PAGE_BASE``.

    Returns:
        One ``Link`` per article entry, built from the anchor's ``href``
        (made absolute against ``PAGE_BASE``) and the text of the matching
        ``span`` element.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the request exceeds ``REQUEST_TIMEOUT``.
    """
    # PAGE_BASE already ends with "/", so no extra separator is inserted;
    # adding one would produce ".../xzgfxwj//index_<page>.html".
    page_link = f"{PAGE_BASE}index_{page}.html"
    response = requests.get(page_link, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, "html.parser")

    link_nodes = soup.select("div.con-rt li > a")
    date_nodes = soup.select("div.con-rt li > span")

    # Anchors and spans are paired positionally; zip() stops at the shorter
    # sequence, so an entry missing either element is silently dropped from
    # the tail.
    return [
        Link(absolutize_link(link.get("href"), PAGE_BASE), date.get_text())
        for link, date in zip(link_nodes, date_nodes)
    ]


if __name__ == "__main__":
    main()