Compare commits

..

6 commits

9 changed files with 168 additions and 1 deletions

2
.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
__pycache__/
articles-*/

View file

@ -9,7 +9,7 @@ We need:
: page 14-75
[Ningxia](http://wsjkw.nx.gov.cn/xwzx_279/tzgg/index.html)
: page 11-42
: page 11-42 (actually 8-44?)
[Shaanxi](http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/index.html)
: page 2-18

View file

@ -15,6 +15,10 @@
pkgs = import nixpkgs {inherit system;};
in {
devShell = pkgs.mkShell {
shellHook = ''
export PYTHONPATH="$(pwd)"
'';
nativeBuildInputs = with pkgs; [
zip
unzip

46
ningxia/README.md Normal file
View file

@ -0,0 +1,46 @@
## Ningxia scraping
Zip of full article dump: [articles-ningxia.zip](./articles-ningxia.zip).
There are, once again, files that are likely just links to PDFs:
```console
.rw-r--r-- 264 tlater 9 Apr 22:20 ./2016-08-17_738.txt
.rw-r--r-- 180 tlater 9 Apr 22:20 ./2016-08-17_739.txt
.rw-r--r-- 201 tlater 9 Apr 22:19 ./2017-03-16_676.txt
.rw-r--r-- 394 tlater 9 Apr 22:19 ./2017-04-13_666.txt
.rw-r--r-- 326 tlater 9 Apr 22:19 ./2017-04-21_662.txt
.rw-r--r-- 204 tlater 9 Apr 22:19 ./2017-05-16_655.txt
.rw-r--r-- 316 tlater 9 Apr 22:19 ./2017-06-19_645.txt
.rw-r--r-- 187 tlater 9 Apr 22:18 ./2017-09-15_607.txt
.rw-r--r-- 171 tlater 9 Apr 22:18 ./2018-03-08_551.txt
.rw-r--r-- 174 tlater 9 Apr 22:17 ./2018-05-25_517.txt
.rw-r--r-- 143 tlater 9 Apr 22:17 ./2018-06-08_512.txt
.rw-r--r-- 216 tlater 9 Apr 22:17 ./2018-07-13_504.txt
.rw-r--r-- 131 tlater 9 Apr 22:17 ./2018-08-10_479.txt
.rw-r--r-- 198 tlater 9 Apr 22:16 ./2018-12-20_385.txt
.rw-r--r-- 300 tlater 9 Apr 22:15 ./2019-02-15_359.txt
.rw-r--r-- 241 tlater 9 Apr 22:15 ./2019-04-17_331.txt
.rw-r--r-- 209 tlater 9 Apr 22:15 ./2019-05-21_309.txt
.rw-r--r-- 264 tlater 9 Apr 22:15 ./2019-06-11_306.txt
.rw-r--r-- 325 tlater 9 Apr 22:15 ./2019-06-11_307.txt
.rw-r--r-- 306 tlater 9 Apr 22:15 ./2019-07-22_286.txt
.rw-r--r-- 131 tlater 9 Apr 22:14 ./2019-09-05_266.txt
.rw-r--r-- 264 tlater 9 Apr 22:14 ./2019-09-09_265.txt
.rw-r--r-- 177 tlater 9 Apr 22:14 ./2019-11-19_231.txt
.rw-r--r-- 203 tlater 9 Apr 22:13 ./2020-02-01_158.txt
.rw-r--r-- 204 tlater 9 Apr 22:13 ./2020-03-01_151.txt
.rw-r--r-- 158 tlater 9 Apr 22:12 ./2020-04-01_125.txt
.rw-r--r-- 131 tlater 9 Apr 22:13 ./2020-04-01_126.txt
.rw-r--r-- 182 tlater 9 Apr 22:13 ./2020-04-01_127.txt
.rw-r--r-- 176 tlater 9 Apr 22:12 ./2020-04-17_95.txt
.rw-r--r-- 398 tlater 9 Apr 22:12 ./2020-04-17_96.txt
.rw-r--r-- 174 tlater 9 Apr 22:12 ./2020-05-12_72.txt
.rw-r--r-- 151 tlater 9 Apr 22:12 ./2020-06-04_63.txt
.rw-r--r-- 137 tlater 9 Apr 22:12 ./2020-06-10_59.txt
.rw-r--r-- 161 tlater 9 Apr 22:11 ./2020-07-10_46.txt
.rw-r--r-- 206 tlater 9 Apr 22:11 ./2020-07-17_41.txt
.rw-r--r-- 189 tlater 9 Apr 22:11 ./2020-09-04_33.txt
.rw-r--r-- 156 tlater 9 Apr 22:11 ./2020-09-07_30.txt
.rw-r--r-- 201 tlater 9 Apr 22:11 ./2020-10-01_15.txt
```

Binary file not shown.

View file

@ -0,0 +1,32 @@
r"""Script to scrape article text from http://wsjkw.nx.gov.cn.
Article contents are in a div with the class `xl-content`.
"""
import requests
from bs4 import BeautifulSoup

from utils.linkutils import read_links
def main():
    """Download every article in the link dump and save its text.

    Reads the link list from ``articles-ningxia/links.csv``, fetches each
    article with ``get_article_text`` and writes the result to
    ``articles-ningxia/{date}_{index}.txt``, where ``index`` is the
    article's position in the link list.
    """
    # The link scraper writes its dump to links.csv, so read that file.
    with open(
        "articles-ningxia/links.csv", "r", encoding="utf-8", newline=""
    ) as links_file:
        links = read_links(links_file)

    for i, link in enumerate(links):
        print(f"Downloading {link.url} ({i}/{len(links)})")
        text = get_article_text(link.url)
        # Pin the encoding so the output does not depend on the locale.
        with open(
            f"articles-ningxia/{link.date}_{i}.txt", "w+", encoding="utf-8"
        ) as article_file:
            article_file.write(text)
def get_article_text(link: str) -> str:
    """Download an article page and return the text of its content element.

    Args:
        link: URL of the article page.

    Returns:
        The whitespace-stripped text of the first element carrying the
        class ``xl-content``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page contains no ``xl-content`` element.
    """
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(link, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    content = soup.find(class_="xl-content")
    if content is None:
        # Fail with the offending URL instead of an opaque AttributeError.
        raise ValueError(f"No 'xl-content' element found at {link}")
    return content.get_text().strip()
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()

58
ningxia/scrape-links.py Normal file
View file

@ -0,0 +1,58 @@
r"""Script to scrape article links from http://wsjkw.nx.gov.cn.
Links are available from pages
http://wsjkw.nx.gov.cn/xwzx_279/tzgg/index(_\d+)?\.html.
The page structure is a bit awkward: it contains sub-lists of a big
list, with separating borders every few elements.
The links can be selected with something like `div.gl-list li > a`.
"""
from typing import List
import requests
from bs4 import BeautifulSoup, Tag
from utils.linkutils import Link, dump_links
# First and last listing page to scrape (inclusive). main() iterates
# range(PAGE_START - 1, PAGE_END), so the number placed in the URL is one
# lower than the page number given here.
PAGE_START = 8
PAGE_END = 44
# Directory URL of the listing pages; note that it ends with a slash.
PAGE_BASE = "http://wsjkw.nx.gov.cn/xwzx_279/tzgg/"
def main():
    """Collect article links from every listing page and dump them as CSV.

    Scrapes the pages selected by ``PAGE_START``/``PAGE_END`` with
    ``get_article_links`` and writes the combined result to
    ``articles-ningxia/links.csv``.
    """
    import os

    links = [
        link
        # PAGE_START/PAGE_END are shifted down by one to obtain the number
        # used in the page URL (presumably page N is index_{N-1}.html).
        for page in range(PAGE_START - 1, PAGE_END)
        for link in get_article_links(page)
    ]

    # The output directory is git-ignored, so it may not exist yet.
    os.makedirs("articles-ningxia", exist_ok=True)
    # newline="" is what the csv module asks for on files it writes to.
    with open(
        "articles-ningxia/links.csv", "w", encoding="utf-8", newline=""
    ) as f:
        dump_links(links, f)
def get_article_links(page: int) -> List[Link]:
    """Get all (article link, date) tuples from a specific listing page.

    Args:
        page: Number used in the listing page's file name, as in
            ``index_{page}.html``. ``0`` selects the plain ``index.html``.

    Returns:
        One ``Link`` per ``div.gl-list li > a`` anchor that has an href,
        paired by position with the ``div.gl-list li > span`` date nodes.
        Relative hrefs are resolved against the listing page's URL.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    from urllib.parse import urljoin

    # PAGE_BASE already ends in "/", so no extra separator is added here.
    # NOTE(review): mapping 0 to index.html assumes the first listing page
    # has no numeric suffix - confirm against the site.
    page_name = f"index_{page}.html" if page else "index.html"
    page_link = PAGE_BASE + page_name

    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(page_link, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    link_nodes = soup.select("div.gl-list li > a")
    date_nodes = soup.select("div.gl-list li > span")

    return [
        # urljoin resolves "./" and other relative forms against the page
        # URL and leaves absolute links untouched. The [1:-1] slice drops
        # the first and last character of the date text.
        Link(urljoin(page_link, anchor["href"]), date.get_text()[1:-1])
        for anchor, date in zip(link_nodes, date_nodes)
        # Anchors without an href cannot be followed; skip them.
        if anchor.get("href")
    ]
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()

0
utils/__init__.py Normal file
View file

25
utils/linkutils.py Normal file
View file

@ -0,0 +1,25 @@
"""Utility functions for handling links."""
import csv
from typing import List, NamedTuple, TextIO
class Link(NamedTuple):
    """A scraped link, stored as a ``(url, date)`` pair of strings."""

    # Address of the linked page.
    url: str
    # Date that belongs to the link, kept as text.
    date: str
def dump_links(links: List[Link], f: TextIO):
    """Write links to ``f`` as CSV, one numbered row per link.

    The first row is the header ``index,link,date``. Every following row
    holds a link's position in ``links``, then its url, then its date.
    """
    out = csv.writer(f)
    header = ["index", "link", "date"]
    rows = (
        [position, entry[0], entry[1]]
        for position, entry in enumerate(links)
    )
    out.writerow(header)
    out.writerows(rows)
def read_links(f: TextIO) -> List[Link]:
    """Read links from a CSV stream laid out as ``index,link,date``.

    The first row is treated as a header and discarded. Only the second
    and third columns of each remaining row are used; the index column
    is ignored.

    Args:
        f: Text stream containing the CSV data.

    Returns:
        One ``Link`` per non-blank data row, in file order. An empty
        stream or one holding only the header yields an empty list.
    """
    reader = csv.reader(f)
    # The default keeps next() from raising StopIteration on an empty file.
    next(reader, None)
    # csv.reader yields an empty list for a blank line; skip those rows.
    return [Link(row[1], row[2]) for row in reader if row]