52 lines
1.3 KiB
Python
52 lines
1.3 KiB
Python
r"""Script to scrape article links from http://sxwjw.shaanxi.gov.cn.
|
|
|
|
Links are available from the listing pages
http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/index(_\d+)?.html.

The page structure is almost exactly like that of ningxia. The only
difference is the class name of the content div: article links match a
selector like ``div.con-rt li > a``.
|
|
"""
|
|
|
|
from typing import List
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
from utils.linkutils import Link, absolutize_link, dump_links
|
|
|
|
# Listing-page bounds. main() requests index_{n}.html for every n in
# range(PAGE_START - 1, PAGE_END), i.e. suffixes 1 through 19 with the
# values below.
# NOTE(review): presumably the site numbers its second page index_1.html,
# which is why 1 is subtracted from PAGE_START -- confirm against the site.
PAGE_START: int = 2
PAGE_END: int = 20

# Directory URL of the listing pages; also passed to absolutize_link() as the
# base for the hrefs found on those pages. Note the trailing slash.
PAGE_BASE: str = "http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/"
|
|
|
|
|
|
def main():
    """Scrape every configured listing page and write the links to disk.

    Page numbers run from ``PAGE_START - 1`` up to, but not including,
    ``PAGE_END``. The links returned by ``get_article_links`` for each page
    are accumulated in page order and handed to ``dump_links`` together with
    the opened ``articles-shanxi/links.csv`` file.
    """
    collected = []
    for page_number in range(PAGE_START - 1, PAGE_END):
        # Each call returns the list of links found on one listing page.
        collected.extend(get_article_links(page_number))

    with open("articles-shanxi/links.csv", "w+") as out_file:
        dump_links(collected, out_file)
|
|
|
|
|
|
def get_article_links(page: int) -> List[Link]:
    """Get all article links from a specific listing page.

    Args:
        page: Numeric suffix of the listing page; the page requested is
            ``index_{page}.html`` under ``PAGE_BASE``.

    Returns:
        One ``Link`` for each ``<li>`` inside ``div.con-rt`` that has both a
        direct ``<a>`` child and a direct ``<span>`` child. The link is built
        from the absolutized ``href`` and the span's text (presumably the
        publication date -- confirm against the site markup).

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # PAGE_BASE already ends with "/"; strip it before joining so the request
    # URL does not contain a double slash.
    page_link = f"{PAGE_BASE.rstrip('/')}/index_{page}.html"
    # Without a timeout a single stalled connection would hang the whole run.
    response = requests.get(page_link, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    links = []
    # Pair the anchor and the span that belong to the SAME list item. Zipping
    # two independently selected lists would shift every following pair as
    # soon as one <li> lacked either element.
    for item in soup.select("div.con-rt li"):
        anchor = item.find("a", recursive=False)
        date = item.find("span", recursive=False)
        if anchor is None or date is None:
            continue
        links.append(
            Link(absolutize_link(anchor.get("href"), PAGE_BASE), date.get_text())
        )
    return links
|
|
|
|
|
|
# Run the scraper only when this file is executed as a script, so importing
# the module does not trigger any network or file I/O.
if __name__ == "__main__":
    main()
|