Compare commits
6 commits: f7cf03d442 ... 8820ce1b95
Author | SHA1 | Date
---|---|---
Tristan Daniël Maat | 8820ce1b95 |
Tristan Daniël Maat | 3f2b0245ec |
Tristan Daniël Maat | ec77f0798b |
Tristan Daniël Maat | 5c557bdb9d |
Tristan Daniël Maat | 65bf00f452 |
Tristan Daniël Maat | 90b338945e |
.gitignore (vendored, Normal file, +2)

@@ -0,0 +1,2 @@
__pycache__/
articles-*/
@@ -9,7 +9,7 @@ We need:
 : page 14-75

 [Ningxia](http://wsjkw.nx.gov.cn/xwzx_279/tzgg/index.html)
-: page 11-42
+: page 11-42 (actually 8-44?)

 [Shanxi](http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/index.html)
 : page 2-18
@@ -15,6 +15,10 @@
     pkgs = import nixpkgs {inherit system;};
   in {
     devShell = pkgs.mkShell {
+      shellHook = ''
+        export PYTHONPATH="$(pwd)"
+      '';
+
       nativeBuildInputs = with pkgs; [
         zip
         unzip
ningxia/README.md (Normal file, +46)

@@ -0,0 +1,46 @@
## Ningxia scraping

Zip of full article dump: [articles-ningxia.zip](./articles-ningxia.zip).

There are, once again, files that are likely just links to PDFs:

```console
.rw-r--r-- 264 tlater 9 Apr 22:20 ./2016-08-17_738.txt
.rw-r--r-- 180 tlater 9 Apr 22:20 ./2016-08-17_739.txt
.rw-r--r-- 201 tlater 9 Apr 22:19 ./2017-03-16_676.txt
.rw-r--r-- 394 tlater 9 Apr 22:19 ./2017-04-13_666.txt
.rw-r--r-- 326 tlater 9 Apr 22:19 ./2017-04-21_662.txt
.rw-r--r-- 204 tlater 9 Apr 22:19 ./2017-05-16_655.txt
.rw-r--r-- 316 tlater 9 Apr 22:19 ./2017-06-19_645.txt
.rw-r--r-- 187 tlater 9 Apr 22:18 ./2017-09-15_607.txt
.rw-r--r-- 171 tlater 9 Apr 22:18 ./2018-03-08_551.txt
.rw-r--r-- 174 tlater 9 Apr 22:17 ./2018-05-25_517.txt
.rw-r--r-- 143 tlater 9 Apr 22:17 ./2018-06-08_512.txt
.rw-r--r-- 216 tlater 9 Apr 22:17 ./2018-07-13_504.txt
.rw-r--r-- 131 tlater 9 Apr 22:17 ./2018-08-10_479.txt
.rw-r--r-- 198 tlater 9 Apr 22:16 ./2018-12-20_385.txt
.rw-r--r-- 300 tlater 9 Apr 22:15 ./2019-02-15_359.txt
.rw-r--r-- 241 tlater 9 Apr 22:15 ./2019-04-17_331.txt
.rw-r--r-- 209 tlater 9 Apr 22:15 ./2019-05-21_309.txt
.rw-r--r-- 264 tlater 9 Apr 22:15 ./2019-06-11_306.txt
.rw-r--r-- 325 tlater 9 Apr 22:15 ./2019-06-11_307.txt
.rw-r--r-- 306 tlater 9 Apr 22:15 ./2019-07-22_286.txt
.rw-r--r-- 131 tlater 9 Apr 22:14 ./2019-09-05_266.txt
.rw-r--r-- 264 tlater 9 Apr 22:14 ./2019-09-09_265.txt
.rw-r--r-- 177 tlater 9 Apr 22:14 ./2019-11-19_231.txt
.rw-r--r-- 203 tlater 9 Apr 22:13 ./2020-02-01_158.txt
.rw-r--r-- 204 tlater 9 Apr 22:13 ./2020-03-01_151.txt
.rw-r--r-- 158 tlater 9 Apr 22:12 ./2020-04-01_125.txt
.rw-r--r-- 131 tlater 9 Apr 22:13 ./2020-04-01_126.txt
.rw-r--r-- 182 tlater 9 Apr 22:13 ./2020-04-01_127.txt
.rw-r--r-- 176 tlater 9 Apr 22:12 ./2020-04-17_95.txt
.rw-r--r-- 398 tlater 9 Apr 22:12 ./2020-04-17_96.txt
.rw-r--r-- 174 tlater 9 Apr 22:12 ./2020-05-12_72.txt
.rw-r--r-- 151 tlater 9 Apr 22:12 ./2020-06-04_63.txt
.rw-r--r-- 137 tlater 9 Apr 22:12 ./2020-06-10_59.txt
.rw-r--r-- 161 tlater 9 Apr 22:11 ./2020-07-10_46.txt
.rw-r--r-- 206 tlater 9 Apr 22:11 ./2020-07-17_41.txt
.rw-r--r-- 189 tlater 9 Apr 22:11 ./2020-09-04_33.txt
.rw-r--r-- 156 tlater 9 Apr 22:11 ./2020-09-07_30.txt
.rw-r--r-- 201 tlater 9 Apr 22:11 ./2020-10-01_15.txt
```
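The listing above suggests those stub articles are easy to flag by size alone. A minimal sketch of such a check (a hypothetical helper, not part of this changeset; the 400-byte cutoff is an assumption based on the sizes shown) could look like:

```python
from pathlib import Path
from typing import List

# Assumed cutoff: the stub articles listed above are all well under 400 bytes.
STUB_THRESHOLD = 400


def find_stub_articles(directory: str) -> List[Path]:
    """Return article files small enough that they are probably just PDF links."""
    return sorted(
        path
        for path in Path(directory).glob("*.txt")
        if path.stat().st_size < STUB_THRESHOLD
    )


if __name__ == "__main__":
    for path in find_stub_articles("articles-ningxia"):
        print(f"{path.stat().st_size:>6} {path}")
```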
ningxia/articles-ningxia.zip (Normal file, BIN)

Binary file not shown.
ningxia/scrape-articles.py (Normal file, +32)

@@ -0,0 +1,32 @@
r"""Script to scrape article text from http://wsjkw.nx.gov.cn.

Article contents are in a div with the class `xl-content`.
"""

import requests
from bs4 import BeautifulSoup

from ..utils.linkutils import read_links


def main():
    """Collect and output article text."""
    with open("articles-ningxia/links.txt", "r") as f:
        links = read_links(f)

    for i, link in enumerate(links):
        print(f"Downloading {link.url} ({i}/{len(links)})")
        text = get_article_text(link.url)
        with open(f"articles-ningxia/{link.date}_{i}.txt", "w+") as f:
            f.write(text)


def get_article_text(link: str) -> str:
    """Download article text."""
    request = requests.get(link)
    soup = BeautifulSoup(request.text, "html.parser")
    return soup.find(class_="xl-content").get_text().strip()


if __name__ == "__main__":
    main()
ningxia/scrape-links.py (Normal file, +58)

@@ -0,0 +1,58 @@
r"""Script to scrape article links from http://wsjkw.nx.gov.cn.

Links are available from pages
http://wsjkw.nx.gov.cn/xwzx_279/tzgg/index(\d+_)?.html.

The page structure is a bit difficult, it contains sub-lists of a big
list that have separating borders every few elements.

Something like div.gl-list li > a
"""

from typing import List

import requests
from bs4 import BeautifulSoup, Tag

from utils.linkutils import Link, dump_links

PAGE_START = 8
PAGE_END = 44
PAGE_BASE = "http://wsjkw.nx.gov.cn/xwzx_279/tzgg/"


def main():
    """Collect and output article links."""
    links = [
        link
        for page in range(PAGE_START - 1, PAGE_END)
        for link in get_article_links(page)
    ]

    with open("articles-ningxia/links.csv", "w+") as f:
        dump_links(links, f)


def get_article_links(page: int) -> List[Link]:
    """Get all (article link, date) tuples from a specific page."""
    page_link = f"{PAGE_BASE}/index_{page}.html"
    soup = BeautifulSoup(requests.get(page_link).text, "html.parser")

    link_nodes = soup.select("div.gl-list li > a")
    date_nodes = soup.select("div.gl-list li > span")

    def parse_link(tag: Tag) -> str:
        link: str = tag.get("href")
        if link.startswith("./"):
            link = PAGE_BASE + link[2:]

        return link

    return [
        Link(parse_link(link), date.get_text()[1:-1])
        for link, date in zip(link_nodes, date_nodes)
    ]


if __name__ == "__main__":
    main()
utils/__init__.py (Normal file, +0)
utils/linkutils.py (Normal file, +25)

@@ -0,0 +1,25 @@
"""Utility functions for handling links."""
import csv
from typing import List, NamedTuple, TextIO


class Link(NamedTuple):
    """A type for links - contains its url and date."""

    url: str
    date: str


def dump_links(links: List[Link], f: TextIO):
    """Dump links to a file in csv format."""
    writer = csv.writer(f)
    writer.writerow(["index", "link", "date"])
    for i, link in enumerate(links):
        writer.writerow([i, link[0], link[1]])


def read_links(f: TextIO) -> List[Link]:
    """Read links from a csv format."""
    reader = csv.reader(f)
    next(reader)  # Skip the header
    return [Link(link[1], link[2]) for link in reader]
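For reference, a quick round trip through these helpers shows the csv format they agree on. This is a sketch only, assuming the repository root is on PYTHONPATH (as the shellHook change above arranges) so that `utils.linkutils` is importable; the link itself is made up purely for illustration:

```python
import io

from utils.linkutils import Link, dump_links, read_links

# A single made-up link, just to illustrate the round trip.
links = [Link("http://wsjkw.nx.gov.cn/xwzx_279/tzgg/example.html", "2020-10-01")]

# Write the links out and read them back in memory; the header row is
# ["index", "link", "date"], and read_links drops the index column again.
buffer = io.StringIO()
dump_links(links, buffer)
buffer.seek(0)
assert read_links(buffer) == links
```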