r"""Script to scrape article links from http://wsjkw.nx.gov.cn.
|
|
|
|
|
|
|
|
Links are available from pages
|
|
|
|
http://wsjkw.nx.gov.cn/xwzx_279/tzgg/index(\d+_)?.html.
|
|
|
|
|
|
|
|
The page structure is a bit difficult, it contains sub-lists of a big
|
|
|
|
list that have separating borders every few elements.
|
|
|
|
|
|
|
|
Something like div.gl-list li > a
|
|
|
|
"""

from typing import List

import requests
from bs4 import BeautifulSoup, Tag

from utils.linkutils import Link, dump_links

PAGE_START = 8
PAGE_END = 44
# The trailing slash matters: both listing-page URLs and relative article
# links are appended directly to PAGE_BASE.
PAGE_BASE = "http://wsjkw.nx.gov.cn/xwzx_279/tzgg/"


def main():
    """Collect and output article links."""
    # range(PAGE_START - 1, PAGE_END) walks listing pages 7..43, i.e.
    # index_7.html through index_43.html.
    links = [
        link
        for page in range(PAGE_START - 1, PAGE_END)
        for link in get_article_links(page)
    ]

    # Write the collected links out via utils.linkutils.dump_links; the
    # articles-ningxia/ directory is expected to exist already.
    with open("articles-ningxia/links.csv", "w+") as f:
        dump_links(links, f)


def get_article_links(page: int) -> List[Link]:
    """Get all (article link, date) tuples from a specific page."""
    # PAGE_BASE already ends with "/", so no extra slash is inserted here.
    page_link = f"{PAGE_BASE}index_{page}.html"
    soup = BeautifulSoup(requests.get(page_link).text, "html.parser")

    link_nodes = soup.select("div.gl-list li > a")
    date_nodes = soup.select("div.gl-list li > span")

    def parse_link(tag: Tag) -> str:
        link: str = tag.get("href")
        # Relative links ("./...") are resolved against the listing directory.
        if link.startswith("./"):
            link = PAGE_BASE + link[2:]

        return link
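
    # parse_link only handles the "./" prefix observed on these listing
    # pages; a more general alternative (an assumption, not the original
    # behaviour) would be urllib.parse.urljoin(page_link, tag.get("href")),
    # which also resolves "../" segments and absolute paths.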

    # Each date string appears wrapped in a single pair of bracket
    # characters; the [1:-1] slice strips them.
    return [
        Link(parse_link(link), date.get_text()[1:-1])
        for link, date in zip(link_nodes, date_nodes)
    ]


if __name__ == "__main__":
    main()