Implement scraping for ningxia

2022-04-09 22:34:42 +01:00 · 2022-04-09 22:34:42 +01:00 · 5c557bdb9d
commit 5c557bdb9d
parent 65bf00f452
2 changed files with 90 additions and 0 deletions
--- a/ningxia/scrape-articles.py
+++ b/ningxia/scrape-articles.py
@ -0,0 +1,32 @@
+r"""Script to scrape article text from http://wsjkw.nx.gov.cn.
+
+Article contents are in a div with the class `xl-content`.
+"""
+
+import requests
+from bs4 import BeautifulSoup
+
+from ..utils.linkutils import read_links
+
+
+def main():
+    """Collect and output article text."""
+    with open("articles-ningxia/links.txt", "r") as f:
+        links = read_links(f)
+
+    for i, link in enumerate(links):
+        print(f"Downloading {link.url} ({i}/{len(links)})")
+        text = get_article_text(link.url)
+        with open(f"articles-ningxia/{link.date}_{i}.txt", "w+") as f:
+            f.write(text)
+
+
+def get_article_text(link: str) -> str:
+    """Download article text."""
+    request = requests.get(link)
+    soup = BeautifulSoup(request.text, "html.parser")
+    return soup.find(class_="xl-content").get_text().strip()
+
+
+# Only run the scraper when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/ningxia/scrape-links.py
+++ b/ningxia/scrape-links.py
@ -0,0 +1,58 @@
+r"""Script to scrape article links from http://wsjkw.nx.gov.cn.
+
+Links are available from pages
+http://wsjkw.nx.gov.cn/xwzx_279/tzgg/index(_\d+)?.html.
+
+The page structure is a bit awkward: the big list is split into
+sub-lists, with a separating border every few elements.
+
+The links can be selected with something like `div.gl-list li > a`.
+"""
+
+from typing import List
+
+import requests
+from bs4 import BeautifulSoup, Tag
+
+from utils.linkutils import Link, dump_links
+
+# Listing pages to scrape: main() requests index_{PAGE_START - 1}.html
+# through index_{PAGE_END - 1}.html.
+PAGE_START = 8
+PAGE_END = 44
+# Base URL under which the listing pages live (note the trailing slash).
+PAGE_BASE = "http://wsjkw.nx.gov.cn/xwzx_279/tzgg/"
+
+
+def main():
+    """Collect and output article links."""
+    links = [
+        link
+        for page in range(PAGE_START - 1, PAGE_END)
+        for link in get_article_links(page)
+    ]
+
+    with open("articles-ningxia/links.txt", "w+") as f:
+        dump_links(links, f)
+
+
+def get_article_links(page: int) -> List[Link]:
+    """Get all (article link, date) tuples from a specific page."""
+    page_link = f"{PAGE_BASE}/index_{page}.html"
+    soup = BeautifulSoup(requests.get(page_link).text, "html.parser")
+
+    link_nodes = soup.select("div.gl-list li > a")
+    date_nodes = soup.select("div.gl-list li > span")
+
+    def parse_link(tag: Tag) -> str:
+        link: str = tag.get("href")
+        if link.startswith("./"):
+            link = PAGE_BASE + link[2:]
+
+        return link
+
+    return [
+        Link(parse_link(link), date.get_text()[1:-1])
+        for link, date in zip(link_nodes, date_nodes)
+    ]
+
+
+# Only run the scraper when this file is executed as a script.
+if __name__ == "__main__":
+    main()