Add ningxia file dump

Fix link file extension
Ignore article directories
2022-04-09 22:37:27 +01:00 · 2022-04-09 22:36:40 +01:00 · 2022-04-09 22:35:15 +01:00 · 2022-04-09 22:34:42 +01:00 · 2022-04-09 22:33:43 +01:00 · 2022-04-09 22:32:13 +01:00
9 changed files with 168 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__/
+articles-*/
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ We need:
 : page 14-75

 [Ningxia](http://wsjkw.nx.gov.cn/xwzx_279/tzgg/index.html)
-: page 11-42
+: page 11-42 (actually 8-44?)

 [Shanxi](http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/index.html)
 : page 2-18
--- a/flake.nix
+++ b/flake.nix
@@ -15,6 +15,10 @@
        pkgs = import nixpkgs {inherit system;};
      in {
        devShell = pkgs.mkShell {
+          shellHook = ''
+            export PYTHONPATH="$(pwd)"
+          '';
+
          nativeBuildInputs = with pkgs; [
            zip
            unzip
--- a/ningxia/README.md
+++ b/ningxia/README.md
@@ -0,0 +1,46 @@
+## Ningxia scraping
+
+Zip of full article dump: [articles-ningxia.zip](./articles-ningxia.zip).
+
+There are, once again, files that are likely just links to PDFs:
+
+```console
+.rw-r--r-- 264 tlater  9 Apr 22:20 ./2016-08-17_738.txt
+.rw-r--r-- 180 tlater  9 Apr 22:20 ./2016-08-17_739.txt
+.rw-r--r-- 201 tlater  9 Apr 22:19 ./2017-03-16_676.txt
+.rw-r--r-- 394 tlater  9 Apr 22:19 ./2017-04-13_666.txt
+.rw-r--r-- 326 tlater  9 Apr 22:19 ./2017-04-21_662.txt
+.rw-r--r-- 204 tlater  9 Apr 22:19 ./2017-05-16_655.txt
+.rw-r--r-- 316 tlater  9 Apr 22:19 ./2017-06-19_645.txt
+.rw-r--r-- 187 tlater  9 Apr 22:18 ./2017-09-15_607.txt
+.rw-r--r-- 171 tlater  9 Apr 22:18 ./2018-03-08_551.txt
+.rw-r--r-- 174 tlater  9 Apr 22:17 ./2018-05-25_517.txt
+.rw-r--r-- 143 tlater  9 Apr 22:17 ./2018-06-08_512.txt
+.rw-r--r-- 216 tlater  9 Apr 22:17 ./2018-07-13_504.txt
+.rw-r--r-- 131 tlater  9 Apr 22:17 ./2018-08-10_479.txt
+.rw-r--r-- 198 tlater  9 Apr 22:16 ./2018-12-20_385.txt
+.rw-r--r-- 300 tlater  9 Apr 22:15 ./2019-02-15_359.txt
+.rw-r--r-- 241 tlater  9 Apr 22:15 ./2019-04-17_331.txt
+.rw-r--r-- 209 tlater  9 Apr 22:15 ./2019-05-21_309.txt
+.rw-r--r-- 264 tlater  9 Apr 22:15 ./2019-06-11_306.txt
+.rw-r--r-- 325 tlater  9 Apr 22:15 ./2019-06-11_307.txt
+.rw-r--r-- 306 tlater  9 Apr 22:15 ./2019-07-22_286.txt
+.rw-r--r-- 131 tlater  9 Apr 22:14 ./2019-09-05_266.txt
+.rw-r--r-- 264 tlater  9 Apr 22:14 ./2019-09-09_265.txt
+.rw-r--r-- 177 tlater  9 Apr 22:14 ./2019-11-19_231.txt
+.rw-r--r-- 203 tlater  9 Apr 22:13 ./2020-02-01_158.txt
+.rw-r--r-- 204 tlater  9 Apr 22:13 ./2020-03-01_151.txt
+.rw-r--r-- 158 tlater  9 Apr 22:12 ./2020-04-01_125.txt
+.rw-r--r-- 131 tlater  9 Apr 22:13 ./2020-04-01_126.txt
+.rw-r--r-- 182 tlater  9 Apr 22:13 ./2020-04-01_127.txt
+.rw-r--r-- 176 tlater  9 Apr 22:12 ./2020-04-17_95.txt
+.rw-r--r-- 398 tlater  9 Apr 22:12 ./2020-04-17_96.txt
+.rw-r--r-- 174 tlater  9 Apr 22:12 ./2020-05-12_72.txt
+.rw-r--r-- 151 tlater  9 Apr 22:12 ./2020-06-04_63.txt
+.rw-r--r-- 137 tlater  9 Apr 22:12 ./2020-06-10_59.txt
+.rw-r--r-- 161 tlater  9 Apr 22:11 ./2020-07-10_46.txt
+.rw-r--r-- 206 tlater  9 Apr 22:11 ./2020-07-17_41.txt
+.rw-r--r-- 189 tlater  9 Apr 22:11 ./2020-09-04_33.txt
+.rw-r--r-- 156 tlater  9 Apr 22:11 ./2020-09-07_30.txt
+.rw-r--r-- 201 tlater  9 Apr 22:11 ./2020-10-01_15.txt
+```
--- a/ningxia/articles-ningxia.zip
+++ b/ningxia/articles-ningxia.zip
--- a/ningxia/scrape-articles.py
+++ b/ningxia/scrape-articles.py
@@ -0,0 +1,32 @@
+r"""Script to scrape article text from http://wsjkw.nx.gov.cn.
+
+Article contents are in a div with the class `xl-content`.
+"""
+
+import requests
+from bs4 import BeautifulSoup
+
+from utils.linkutils import read_links
+
+
+def main():
+    """Download every article listed in the link dump and save its text.
+
+    Reads ``articles-ningxia/links.csv`` (the file the link scraper
+    writes) and creates one ``articles-ningxia/<date>_<index>.txt`` file
+    per link, where ``<index>`` is the link's zero-based position in the
+    CSV.
+    """
+    # The link dump is written as links.csv, so read that same file.
+    # newline="" is what the csv module expects of files handed to it.
+    with open(
+        "articles-ningxia/links.csv", "r", encoding="utf-8", newline=""
+    ) as f:
+        links = read_links(f)
+
+    for i, link in enumerate(links):
+        # Progress is shown one-based; file names keep the zero-based index.
+        print(f"Downloading {link.url} ({i + 1}/{len(links)})")
+        text = get_article_text(link.url)
+        # Explicit encoding so article text is stored the same way
+        # regardless of the platform's default.
+        with open(
+            f"articles-ningxia/{link.date}_{i}.txt", "w", encoding="utf-8"
+        ) as out:
+            out.write(text)
+
+
+def get_article_text(link: str) -> str:
+    """Download an article page and return its text content.
+
+    Args:
+        link: URL of the article page.
+
+    Returns:
+        The whitespace-stripped text of the first element with the
+        class ``xl-content``.
+
+    Raises:
+        ValueError: If the page has no ``xl-content`` element.
+    """
+    response = requests.get(link)
+    soup = BeautifulSoup(response.text, "html.parser")
+    content = soup.find(class_="xl-content")
+    if content is None:
+        # find() returns None when nothing matches; fail with the URL
+        # rather than an opaque AttributeError on NoneType.
+        raise ValueError(f"No xl-content element found at {link}")
+    return content.get_text().strip()
+
+
+if __name__ == "__main__":
+    main()
--- a/ningxia/scrape-links.py
+++ b/ningxia/scrape-links.py
@@ -0,0 +1,58 @@
+r"""Script to scrape article links from http://wsjkw.nx.gov.cn.
+
+Links are available from pages
+http://wsjkw.nx.gov.cn/xwzx_279/tzgg/index(_\d+)?.html.
+
+The page structure is a bit difficult: it contains sub-lists of a big
+list that have separating borders every few elements.
+
+Something like div.gl-list li > a
+"""
+
+from typing import List
+
+import requests
+from bs4 import BeautifulSoup, Tag
+
+from utils.linkutils import Link, dump_links
+
+PAGE_START = 8
+PAGE_END = 44
+PAGE_BASE = "http://wsjkw.nx.gov.cn/xwzx_279/tzgg/"
+
+
+def main():
+    """Collect the article links of every page and dump them as CSV.
+
+    Fetches the index pages numbered ``PAGE_START - 1`` up to and
+    including ``PAGE_END - 1`` and writes all links found to
+    ``articles-ningxia/links.csv``.
+
+    NOTE(review): presumably the ``index_<n>.html`` number is one lower
+    than the page number shown on the site - confirm.
+    """
+    links = [
+        link
+        for page in range(PAGE_START - 1, PAGE_END)
+        for link in get_article_links(page)
+    ]
+
+    # newline="" is what the csv module expects of files handed to a
+    # writer; otherwise line endings can be translated a second time.
+    with open(
+        "articles-ningxia/links.csv", "w", encoding="utf-8", newline=""
+    ) as f:
+        dump_links(links, f)
+
+
+def get_article_links(page: int) -> List[Link]:
+    """Get the link and date of every article listed on one index page.
+
+    Args:
+        page: Number used in the ``index_<page>.html`` file name.
+
+    Returns:
+        One ``Link`` per ``div.gl-list li > a`` anchor, paired in
+        document order with the ``div.gl-list li > span`` nodes.
+    """
+    # Imported locally so the function carries its own dependency.
+    from urllib.parse import urljoin
+
+    # PAGE_BASE already ends in "/", so no extra slash is added here.
+    page_link = f"{PAGE_BASE}index_{page}.html"
+    soup = BeautifulSoup(requests.get(page_link).text, "html.parser")
+
+    link_nodes = soup.select("div.gl-list li > a")
+    date_nodes = soup.select("div.gl-list li > span")
+
+    def parse_link(tag: Tag) -> str:
+        # Resolve relative hrefs ("./x", "../x", "/x") against the page
+        # URL; absolute URLs pass through unchanged.
+        return urljoin(page_link, tag.get("href"))
+
+    # [1:-1] drops the first and last character of the date text
+    # (presumably surrounding brackets - confirm against the site).
+    return [
+        Link(parse_link(link), date.get_text()[1:-1])
+        for link, date in zip(link_nodes, date_nodes)
+    ]
+
+
+if __name__ == "__main__":
+    main()
--- a/utils/__init__.py
+++ b/utils/__init__.py
--- a/utils/linkutils.py
+++ b/utils/linkutils.py
@@ -0,0 +1,25 @@
+"""Utility functions for handling links."""
+import csv
+from typing import List, NamedTuple, TextIO
+
+
+class Link(NamedTuple):
+    """A type for links - contains its url and date.
+
+    Being a NamedTuple, the fields are also reachable by position:
+    ``link[0]`` is the url and ``link[1]`` is the date.
+    """
+
+    # Address the link points to.
+    url: str
+    # Date associated with the link, kept as an unparsed string.
+    date: str
+
+
+def dump_links(links: List[Link], f: TextIO):
+    """Write the given links to ``f`` as CSV.
+
+    A header row ``index,link,date`` comes first, followed by one row per
+    link holding its position in ``links``, its url and its date.
+    """
+    out = csv.writer(f)
+    header = ["index", "link", "date"]
+    out.writerow(header)
+    out.writerows(
+        [position, entry[0], entry[1]]
+        for position, entry in enumerate(links)
+    )
+
+
+def read_links(f: TextIO) -> List[Link]:
+    """Read links from a csv format.
+
+    Expects the layout written by ``dump_links``: a header row followed
+    by ``index,link,date`` rows. The index column is ignored.
+
+    Returns an empty list for an empty file; blank lines are skipped.
+    """
+    reader = csv.reader(f)
+    # Skip the header; the default keeps an empty file from raising
+    # StopIteration.
+    next(reader, None)
+    # csv.reader yields [] for blank lines, which have no columns to index.
+    return [Link(row[1], row[2]) for row in reader if row]
Author	SHA1	Message	Date
Tristan Daniël Maat	8820ce1b95	Add ningxia file dump	2022-04-09 22:37:27 +01:00
Tristan Daniël Maat	3f2b0245ec	Fix link file extension	2022-04-09 22:36:40 +01:00
Tristan Daniël Maat	ec77f0798b	Ignore article directories	2022-04-09 22:35:15 +01:00
Tristan Daniël Maat	5c557bdb9d	Implement scraping for ningxia	2022-04-09 22:34:42 +01:00
Tristan Daniël Maat	65bf00f452	Add linkutils	2022-04-09 22:33:43 +01:00
Tristan Daniël Maat	90b338945e	Update page numbers for ningxia	2022-04-09 22:32:13 +01:00