diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index 369c058..0000000
--- a/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-__pycache__/
-articles-*/
diff --git a/README.md b/Readme.md
similarity index 95%
rename from README.md
rename to Readme.md
index 4a8478f..06c4549 100644
--- a/README.md
+++ b/Readme.md
@@ -9,7 +9,7 @@ We need:
 : page 14-75
 
 [Ningxia](http://wsjkw.nx.gov.cn/xwzx_279/tzgg/index.html)
-: page 11-42 (actually 8-44?)
+: page 11-42
 
 [Shanxi](http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/index.html)
 : page 2-18
diff --git a/flake.nix b/flake.nix
index 5e902e9..1999251 100644
--- a/flake.nix
+++ b/flake.nix
@@ -15,10 +15,6 @@
       pkgs = import nixpkgs {inherit system;};
     in {
       devShell = pkgs.mkShell {
-        shellHook = ''
-          export PYTHONPATH="$(pwd)"
-        '';
-
         nativeBuildInputs = with pkgs; [
           zip
           unzip
diff --git a/ningxia/README.md b/ningxia/README.md
deleted file mode 100644
index 7daeaa2..0000000
--- a/ningxia/README.md
+++ /dev/null
@@ -1,46 +0,0 @@
-## Ningxia scraping
-
-Zip of full article dump: [articles-ningxia.zip](./articles-ningxia.zip).
-
-There are, once again, files that are likely just links to PDFs:
-
-```console
-.rw-r--r-- 264 tlater 9 Apr 22:20 ./2016-08-17_738.txt
-.rw-r--r-- 180 tlater 9 Apr 22:20 ./2016-08-17_739.txt
-.rw-r--r-- 201 tlater 9 Apr 22:19 ./2017-03-16_676.txt
-.rw-r--r-- 394 tlater 9 Apr 22:19 ./2017-04-13_666.txt
-.rw-r--r-- 326 tlater 9 Apr 22:19 ./2017-04-21_662.txt
-.rw-r--r-- 204 tlater 9 Apr 22:19 ./2017-05-16_655.txt
-.rw-r--r-- 316 tlater 9 Apr 22:19 ./2017-06-19_645.txt
-.rw-r--r-- 187 tlater 9 Apr 22:18 ./2017-09-15_607.txt
-.rw-r--r-- 171 tlater 9 Apr 22:18 ./2018-03-08_551.txt
-.rw-r--r-- 174 tlater 9 Apr 22:17 ./2018-05-25_517.txt
-.rw-r--r-- 143 tlater 9 Apr 22:17 ./2018-06-08_512.txt
-.rw-r--r-- 216 tlater 9 Apr 22:17 ./2018-07-13_504.txt
-.rw-r--r-- 131 tlater 9 Apr 22:17 ./2018-08-10_479.txt
-.rw-r--r-- 198 tlater 9 Apr 22:16 ./2018-12-20_385.txt
-.rw-r--r-- 300 tlater 9 Apr 22:15 ./2019-02-15_359.txt
-.rw-r--r-- 241 tlater 9 Apr 22:15 ./2019-04-17_331.txt
-.rw-r--r-- 209 tlater 9 Apr 22:15 ./2019-05-21_309.txt
-.rw-r--r-- 264 tlater 9 Apr 22:15 ./2019-06-11_306.txt
-.rw-r--r-- 325 tlater 9 Apr 22:15 ./2019-06-11_307.txt
-.rw-r--r-- 306 tlater 9 Apr 22:15 ./2019-07-22_286.txt
-.rw-r--r-- 131 tlater 9 Apr 22:14 ./2019-09-05_266.txt
-.rw-r--r-- 264 tlater 9 Apr 22:14 ./2019-09-09_265.txt
-.rw-r--r-- 177 tlater 9 Apr 22:14 ./2019-11-19_231.txt
-.rw-r--r-- 203 tlater 9 Apr 22:13 ./2020-02-01_158.txt
-.rw-r--r-- 204 tlater 9 Apr 22:13 ./2020-03-01_151.txt
-.rw-r--r-- 158 tlater 9 Apr 22:12 ./2020-04-01_125.txt
-.rw-r--r-- 131 tlater 9 Apr 22:13 ./2020-04-01_126.txt
-.rw-r--r-- 182 tlater 9 Apr 22:13 ./2020-04-01_127.txt
-.rw-r--r-- 176 tlater 9 Apr 22:12 ./2020-04-17_95.txt
-.rw-r--r-- 398 tlater 9 Apr 22:12 ./2020-04-17_96.txt
-.rw-r--r-- 174 tlater 9 Apr 22:12 ./2020-05-12_72.txt
-.rw-r--r-- 151 tlater 9 Apr 22:12 ./2020-06-04_63.txt
-.rw-r--r-- 137 tlater 9 Apr 22:12 ./2020-06-10_59.txt
-.rw-r--r-- 161 tlater 9 Apr 22:11 ./2020-07-10_46.txt
-.rw-r--r-- 206 tlater 9 Apr 22:11 ./2020-07-17_41.txt
-.rw-r--r-- 189 tlater 9 Apr 22:11 ./2020-09-04_33.txt
-.rw-r--r-- 156 tlater 9 Apr 22:11 ./2020-09-07_30.txt
-.rw-r--r-- 201 tlater 9 Apr 22:11 ./2020-10-01_15.txt
-```
diff --git a/ningxia/articles-ningxia.zip b/ningxia/articles-ningxia.zip
deleted file mode 100644
index b84f95a..0000000
Binary files a/ningxia/articles-ningxia.zip and /dev/null differ
diff --git a/ningxia/scrape-articles.py b/ningxia/scrape-articles.py
deleted file mode 100644
index b9541eb..0000000
--- a/ningxia/scrape-articles.py
+++ /dev/null
@@ -1,32 +0,0 @@
-r"""Script to scrape article text from http://wsjkw.nx.gov.cn.
-
-Article contents are in a div with the class `xl-content`.
-"""
-
-import requests
-from bs4 import BeautifulSoup
-
-from ..utils.linkutils import read_links
-
-
-def main():
-    """Collect and output article text."""
-    with open("articles-ningxia/links.txt", "r") as f:
-        links = read_links(f)
-
-    for i, link in enumerate(links):
-        print(f"Downloading {link.url} ({i}/{len(links)})")
-        text = get_article_text(link.url)
-        with open(f"articles-ningxia/{link.date}_{i}.txt", "w+") as f:
-            f.write(text)
-
-
-def get_article_text(link: str) -> str:
-    """Download article text."""
-    request = requests.get(link)
-    soup = BeautifulSoup(request.text, "html.parser")
-    return soup.find(class_="xl-content").get_text().strip()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/ningxia/scrape-links.py b/ningxia/scrape-links.py
deleted file mode 100644
index d5e05d5..0000000
--- a/ningxia/scrape-links.py
+++ /dev/null
@@ -1,58 +0,0 @@
-r"""Script to scrape article links from http://wsjkw.nx.gov.cn.
-
-Links are available from pages
-http://wsjkw.nx.gov.cn/xwzx_279/tzgg/index(\d+_)?.html.
-
-The page structure is a bit difficult, it contains sub-lists of a big
-list that have separating borders every few elements.
-
-Something like div.gl-list li > a
-"""
-
-from typing import List
-
-import requests
-from bs4 import BeautifulSoup, Tag
-
-from utils.linkutils import Link, dump_links
-
-PAGE_START = 8
-PAGE_END = 44
-PAGE_BASE = "http://wsjkw.nx.gov.cn/xwzx_279/tzgg/"
-
-
-def main():
-    """Collect and output article links."""
-    links = [
-        link
-        for page in range(PAGE_START - 1, PAGE_END)
-        for link in get_article_links(page)
-    ]
-
-    with open("articles-ningxia/links.csv", "w+") as f:
-        dump_links(links, f)
-
-
-def get_article_links(page: int) -> List[Link]:
-    """Get all (article link, date) tuples from a specific page."""
-    page_link = f"{PAGE_BASE}/index_{page}.html"
-    soup = BeautifulSoup(requests.get(page_link).text, "html.parser")
-
-    link_nodes = soup.select("div.gl-list li > a")
-    date_nodes = soup.select("div.gl-list li > span")
-
-    def parse_link(tag: Tag) -> str:
-        link: str = tag.get("href")
-        if link.startswith("./"):
-            link = PAGE_BASE + link[2:]
-
-        return link
-
-    return [
-        Link(parse_link(link), date.get_text()[1:-1])
-        for link, date in zip(link_nodes, date_nodes)
-    ]
-
-
-if __name__ == "__main__":
-    main()
diff --git a/utils/__init__.py b/utils/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/utils/linkutils.py b/utils/linkutils.py
deleted file mode 100644
index d572f97..0000000
--- a/utils/linkutils.py
+++ /dev/null
@@ -1,25 +0,0 @@
-"""Utility functions for handling links."""
-import csv
-from typing import List, NamedTuple, TextIO
-
-
-class Link(NamedTuple):
-    """A type for links - contains its url and date."""
-
-    url: str
-    date: str
-
-
-def dump_links(links: List[Link], f: TextIO):
-    """Dump links to a file in csv format."""
-    writer = csv.writer(f)
-    writer.writerow(["index", "link", "date"])
-    for i, link in enumerate(links):
-        writer.writerow([i, link[0], link[1]])
-
-
-def read_links(f: TextIO) -> List[Link]:
-    """Read links from a csv format."""
-    reader = csv.reader(f)
-    next(reader)  # Skip the header
-    return [Link(link[1], link[2]) for link in reader]
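With `utils/linkutils.py` gone, nothing left in the tree can read the `links.csv` dumps the scrapers wrote. If any of those dumps are still needed, the standard-library `csv` module is enough on its own; below is a minimal sketch, assuming the `index,link,date` header and column order written by the removed `dump_links`, and reusing the `articles-ningxia/links.csv` path from the deleted `scrape-links.py`. `read_links_csv` is a hypothetical helper, not part of the repository.

```python
import csv
from typing import List, NamedTuple


class Link(NamedTuple):
    """Mirror of the Link tuple from the removed utils/linkutils.py."""

    url: str
    date: str


def read_links_csv(path: str) -> List[Link]:
    """Read a links.csv produced by the removed dump_links helper.

    Assumes an "index,link,date" header row with the url in column 1
    and the date in column 2, as the deleted read_links() expected.
    """
    with open(path, newline="") as f:
        reader = csv.reader(f)
        next(reader)  # skip the "index,link,date" header
        return [Link(row[1], row[2]) for row in reader]


if __name__ == "__main__":
    # Path as written by the deleted scrape-links.py.
    for link in read_links_csv("articles-ningxia/links.csv"):
        print(link.date, link.url)
```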