scrape-yuanyuan/ningxia/scrape-links.py

59 lines
1.4 KiB
Python
Raw Normal View History

2022-04-09 22:34:42 +01:00
r"""Script to scrape article links from http://wsjkw.nx.gov.cn.
Links are available from pages
http://wsjkw.nx.gov.cn/xwzx_279/tzgg/index(\d+_)?.html.
The page structure is a bit difficult, it contains sub-lists of a big
list that have separating borders every few elements.
Something like div.gl-list li > a
"""
from typing import List
import requests
from bs4 import BeautifulSoup, Tag
from utils.linkutils import Link, dump_links
PAGE_START = 8
PAGE_END = 44
PAGE_BASE = "http://wsjkw.nx.gov.cn/xwzx_279/tzgg/"
def main():
"""Collect and output article links."""
links = [
link
for page in range(PAGE_START - 1, PAGE_END)
for link in get_article_links(page)
]
2022-04-09 22:36:40 +01:00
with open("articles-ningxia/links.csv", "w+") as f:
2022-04-09 22:34:42 +01:00
dump_links(links, f)
def get_article_links(page: int) -> List[Link]:
"""Get all (article link, date) tuples from a specific page."""
page_link = f"{PAGE_BASE}/index_{page}.html"
soup = BeautifulSoup(requests.get(page_link).text, "html.parser")
link_nodes = soup.select("div.gl-list li > a")
date_nodes = soup.select("div.gl-list li > span")
def parse_link(tag: Tag) -> str:
link: str = tag.get("href")
if link.startswith("./"):
link = PAGE_BASE + link[2:]
return link
return [
Link(parse_link(link), date.get_text()[1:-1])
for link, date in zip(link_nodes, date_nodes)
]
if __name__ == "__main__":
main()