diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..369c058
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__/
+articles-*/
diff --git a/Readme.md b/README.md
similarity index 95%
rename from Readme.md
rename to README.md
index 06c4549..4a8478f 100644
--- a/Readme.md
+++ b/README.md
@@ -9,7 +9,7 @@ We need:
 : page 14-75
 
 [Ningxia](http://wsjkw.nx.gov.cn/xwzx_279/tzgg/index.html)
-: page 11-42
+: page 11-42 (actually 8-44?)
 
 [Shanxi](http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/index.html)
 : page 2-18
diff --git a/flake.nix b/flake.nix
index 1999251..5e902e9 100644
--- a/flake.nix
+++ b/flake.nix
@@ -15,6 +15,10 @@
       pkgs = import nixpkgs {inherit system;};
     in {
       devShell = pkgs.mkShell {
+        shellHook = ''
+          export PYTHONPATH="$(pwd)"
+        '';
+
         nativeBuildInputs = with pkgs; [
           zip
           unzip
diff --git a/ningxia/README.md b/ningxia/README.md
new file mode 100644
index 0000000..7daeaa2
--- /dev/null
+++ b/ningxia/README.md
@@ -0,0 +1,46 @@
+## Ningxia scraping
+
+Zip of full article dump: [articles-ningxia.zip](./articles-ningxia.zip).
+
+There are, once again, files that are likely just links to PDFs:
+
+```console
+.rw-r--r-- 264 tlater 9 Apr 22:20 ./2016-08-17_738.txt
+.rw-r--r-- 180 tlater 9 Apr 22:20 ./2016-08-17_739.txt
+.rw-r--r-- 201 tlater 9 Apr 22:19 ./2017-03-16_676.txt
+.rw-r--r-- 394 tlater 9 Apr 22:19 ./2017-04-13_666.txt
+.rw-r--r-- 326 tlater 9 Apr 22:19 ./2017-04-21_662.txt
+.rw-r--r-- 204 tlater 9 Apr 22:19 ./2017-05-16_655.txt
+.rw-r--r-- 316 tlater 9 Apr 22:19 ./2017-06-19_645.txt
+.rw-r--r-- 187 tlater 9 Apr 22:18 ./2017-09-15_607.txt
+.rw-r--r-- 171 tlater 9 Apr 22:18 ./2018-03-08_551.txt
+.rw-r--r-- 174 tlater 9 Apr 22:17 ./2018-05-25_517.txt
+.rw-r--r-- 143 tlater 9 Apr 22:17 ./2018-06-08_512.txt
+.rw-r--r-- 216 tlater 9 Apr 22:17 ./2018-07-13_504.txt
+.rw-r--r-- 131 tlater 9 Apr 22:17 ./2018-08-10_479.txt
+.rw-r--r-- 198 tlater 9 Apr 22:16 ./2018-12-20_385.txt
+.rw-r--r-- 300 tlater 9 Apr 22:15 ./2019-02-15_359.txt
+.rw-r--r-- 241 tlater 9 Apr 22:15 ./2019-04-17_331.txt
+.rw-r--r-- 209 tlater 9 Apr 22:15 ./2019-05-21_309.txt
+.rw-r--r-- 264 tlater 9 Apr 22:15 ./2019-06-11_306.txt
+.rw-r--r-- 325 tlater 9 Apr 22:15 ./2019-06-11_307.txt
+.rw-r--r-- 306 tlater 9 Apr 22:15 ./2019-07-22_286.txt
+.rw-r--r-- 131 tlater 9 Apr 22:14 ./2019-09-05_266.txt
+.rw-r--r-- 264 tlater 9 Apr 22:14 ./2019-09-09_265.txt
+.rw-r--r-- 177 tlater 9 Apr 22:14 ./2019-11-19_231.txt
+.rw-r--r-- 203 tlater 9 Apr 22:13 ./2020-02-01_158.txt
+.rw-r--r-- 204 tlater 9 Apr 22:13 ./2020-03-01_151.txt
+.rw-r--r-- 158 tlater 9 Apr 22:12 ./2020-04-01_125.txt
+.rw-r--r-- 131 tlater 9 Apr 22:13 ./2020-04-01_126.txt
+.rw-r--r-- 182 tlater 9 Apr 22:13 ./2020-04-01_127.txt
+.rw-r--r-- 176 tlater 9 Apr 22:12 ./2020-04-17_95.txt
+.rw-r--r-- 398 tlater 9 Apr 22:12 ./2020-04-17_96.txt
+.rw-r--r-- 174 tlater 9 Apr 22:12 ./2020-05-12_72.txt
+.rw-r--r-- 151 tlater 9 Apr 22:12 ./2020-06-04_63.txt
+.rw-r--r-- 137 tlater 9 Apr 22:12 ./2020-06-10_59.txt
+.rw-r--r-- 161 tlater 9 Apr 22:11 ./2020-07-10_46.txt
+.rw-r--r-- 206 tlater 9 Apr 22:11 ./2020-07-17_41.txt
+.rw-r--r-- 189 tlater 9 Apr 22:11 ./2020-09-04_33.txt
+.rw-r--r-- 156 tlater 9 Apr 22:11 ./2020-09-07_30.txt
+.rw-r--r-- 201 tlater 9 Apr 22:11 ./2020-10-01_15.txt
+```
diff --git a/ningxia/articles-ningxia.zip b/ningxia/articles-ningxia.zip
new file mode 100644
index 0000000..b84f95a
Binary files /dev/null and b/ningxia/articles-ningxia.zip differ
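The suspiciously small dumps in the listing above are probably pages whose body is nothing but a link to a PDF attachment. A minimal sketch of how they could be flagged for manual follow-up, assuming a size cutoff of roughly 400 bytes (the cutoff, the helper name, and running it against `articles-ningxia/` are assumptions, not part of this commit):

```python
"""Hypothetical helper: flag article dumps that are likely just PDF stubs."""
from pathlib import Path
from typing import List

SIZE_CUTOFF = 400  # bytes; assumption based on the sizes in the listing above


def find_pdf_stubs(directory: str) -> List[Path]:
    """Return dump files so short they probably only contain a PDF link."""
    return sorted(
        path
        for path in Path(directory).glob("*.txt")
        if path.stat().st_size < SIZE_CUTOFF
    )


if __name__ == "__main__":
    for stub in find_pdf_stubs("articles-ningxia"):
        print(stub)
```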
diff --git a/ningxia/scrape-articles.py b/ningxia/scrape-articles.py
new file mode 100644
index 0000000..b9541eb
--- /dev/null
+++ b/ningxia/scrape-articles.py
@@ -0,0 +1,32 @@
+r"""Script to scrape article text from http://wsjkw.nx.gov.cn.
+
+Article contents are in a div with the class `xl-content`.
+"""
+
+import requests
+from bs4 import BeautifulSoup
+
+from utils.linkutils import read_links
+
+
+def main():
+    """Collect and output article text."""
+    with open("articles-ningxia/links.csv", "r") as f:
+        links = read_links(f)
+
+    for i, link in enumerate(links):
+        print(f"Downloading {link.url} ({i}/{len(links)})")
+        text = get_article_text(link.url)
+        with open(f"articles-ningxia/{link.date}_{i}.txt", "w+") as f:
+            f.write(text)
+
+
+def get_article_text(link: str) -> str:
+    """Download article text."""
+    request = requests.get(link)
+    soup = BeautifulSoup(request.text, "html.parser")
+    return soup.find(class_="xl-content").get_text().strip()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ningxia/scrape-links.py b/ningxia/scrape-links.py
new file mode 100644
index 0000000..d5e05d5
--- /dev/null
+++ b/ningxia/scrape-links.py
@@ -0,0 +1,58 @@
+r"""Script to scrape article links from http://wsjkw.nx.gov.cn.
+
+Links are available from pages
+http://wsjkw.nx.gov.cn/xwzx_279/tzgg/index(_\d+)?.html.
+
+The page structure is a bit difficult: it contains sub-lists of a big
+list that have separating borders every few elements.
+
+Something like div.gl-list li > a
+"""
+
+from typing import List
+
+import requests
+from bs4 import BeautifulSoup, Tag
+
+from utils.linkutils import Link, dump_links
+
+PAGE_START = 8
+PAGE_END = 44
+PAGE_BASE = "http://wsjkw.nx.gov.cn/xwzx_279/tzgg/"
+
+
+def main():
+    """Collect and output article links."""
+    links = [
+        link
+        for page in range(PAGE_START - 1, PAGE_END)
+        for link in get_article_links(page)
+    ]
+
+    with open("articles-ningxia/links.csv", "w+") as f:
+        dump_links(links, f)
+
+
+def get_article_links(page: int) -> List[Link]:
+    """Get all (article link, date) tuples from a specific page."""
+    page_link = f"{PAGE_BASE}index_{page}.html"
+    soup = BeautifulSoup(requests.get(page_link).text, "html.parser")
+
+    link_nodes = soup.select("div.gl-list li > a")
+    date_nodes = soup.select("div.gl-list li > span")
+
+    def parse_link(tag: Tag) -> str:
+        link: str = tag.get("href")
+        if link.startswith("./"):
+            link = PAGE_BASE + link[2:]
+
+        return link
+
+    return [
+        Link(parse_link(link), date.get_text()[1:-1])
+        for link, date in zip(link_nodes, date_nodes)
+    ]
+
+
+if __name__ == "__main__":
+    main()
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/utils/linkutils.py b/utils/linkutils.py
new file mode 100644
index 0000000..d572f97
--- /dev/null
+++ b/utils/linkutils.py
@@ -0,0 +1,25 @@
+"""Utility functions for handling links."""
+import csv
+from typing import List, NamedTuple, TextIO
+
+
+class Link(NamedTuple):
+    """A type for links - contains its url and date."""
+
+    url: str
+    date: str
+
+
+def dump_links(links: List[Link], f: TextIO):
+    """Dump links to a file in csv format."""
+    writer = csv.writer(f)
+    writer.writerow(["index", "link", "date"])
+    for i, link in enumerate(links):
+        writer.writerow([i, link.url, link.date])
+
+
+def read_links(f: TextIO) -> List[Link]:
+    """Read links from a csv format."""
+    reader = csv.reader(f)
+    next(reader)  # Skip the header
+    return [Link(row[1], row[2]) for row in reader]
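For reference, the two scripts only communicate through the CSV that `dump_links` writes and `read_links` parses, so a full run from the repo root (where the flake's `shellHook` points `PYTHONPATH`) is `python ningxia/scrape-links.py` followed by `python ningxia/scrape-articles.py`, assuming the `articles-ningxia/` directory already exists. A quick round-trip sketch of that CSV layer (the sample link is made up):

```python
import io

from utils.linkutils import Link, dump_links, read_links

# Hypothetical sample entry; any url/date pair works.
links = [Link("http://wsjkw.nx.gov.cn/xwzx_279/tzgg/example.html", "2020-10-01")]

buffer = io.StringIO()
dump_links(links, buffer)  # writes an "index,link,date" header plus one row

buffer.seek(0)
assert read_links(buffer) == links  # header skipped, fields map back onto Link
```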