r"""Script to scrape article links from http://wsjkw.nx.gov.cn. Links are available from pages http://wsjkw.nx.gov.cn/xwzx_279/tzgg/index(\d+_)?.html. The page structure is a bit difficult, it contains sub-lists of a big list that have separating borders every few elements. Something like div.gl-list li > a """ from typing import List import requests from bs4 import BeautifulSoup, Tag from utils.linkutils import Link, dump_links PAGE_START = 8 PAGE_END = 44 PAGE_BASE = "http://wsjkw.nx.gov.cn/xwzx_279/tzgg/" def main(): """Collect and output article links.""" links = [ link for page in range(PAGE_START - 1, PAGE_END) for link in get_article_links(page) ] with open("articles-ningxia/links.csv", "w+") as f: dump_links(links, f) def get_article_links(page: int) -> List[Link]: """Get all (article link, date) tuples from a specific page.""" page_link = f"{PAGE_BASE}/index_{page}.html" soup = BeautifulSoup(requests.get(page_link).text, "html.parser") link_nodes = soup.select("div.gl-list li > a") date_nodes = soup.select("div.gl-list li > span") def parse_link(tag: Tag) -> str: link: str = tag.get("href") if link.startswith("./"): link = PAGE_BASE + link[2:] return link return [ Link(parse_link(link), date.get_text()[1:-1]) for link, date in zip(link_nodes, date_nodes) ] if __name__ == "__main__": main()