Compare commits
No commits in common. "8820ce1b954e408f94f421ac8889709b66b7e5e7" and "f7cf03d4422ee37fd71b8a02b99031f302a68cde" have entirely different histories.
8820ce1b95 ... f7cf03d442
.gitignore (vendored)
@@ -1,2 +0,0 @@
-__pycache__/
-articles-*/
@@ -9,7 +9,7 @@ We need:
 : page 14-75
 
 [Ningxia](http://wsjkw.nx.gov.cn/xwzx_279/tzgg/index.html)
-: page 11-42 (actually 8-44?)
+: page 11-42
 
 [Shanxi](http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/index.html)
 : page 2-18
@@ -15,10 +15,6 @@
       pkgs = import nixpkgs {inherit system;};
     in {
       devShell = pkgs.mkShell {
-        shellHook = ''
-          export PYTHONPATH="$(pwd)"
-        '';
-
         nativeBuildInputs = with pkgs; [
           zip
           unzip
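The removed shellHook exported PYTHONPATH so the scripts below could import `utils.linkutils` as a top-level module. A minimal Python-side equivalent, assuming the scripts are run from the repository root (this snippet is illustrative, not part of the diff):

```python
import os
import sys

# Equivalent of the removed `export PYTHONPATH="$(pwd)"`: make modules
# under the current working directory (e.g. utils.linkutils) importable.
sys.path.insert(0, os.getcwd())
```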
@@ -1,46 +0,0 @@
-## Ningxia scraping
-
-Zip of full article dump: [articles-ningxia.zip](./articles-ningxia.zip).
-
-There are, once again, files that are likely just links to PDFs:
-
-```console
-.rw-r--r-- 264 tlater 9 Apr 22:20 ./2016-08-17_738.txt
-.rw-r--r-- 180 tlater 9 Apr 22:20 ./2016-08-17_739.txt
-.rw-r--r-- 201 tlater 9 Apr 22:19 ./2017-03-16_676.txt
-.rw-r--r-- 394 tlater 9 Apr 22:19 ./2017-04-13_666.txt
-.rw-r--r-- 326 tlater 9 Apr 22:19 ./2017-04-21_662.txt
-.rw-r--r-- 204 tlater 9 Apr 22:19 ./2017-05-16_655.txt
-.rw-r--r-- 316 tlater 9 Apr 22:19 ./2017-06-19_645.txt
-.rw-r--r-- 187 tlater 9 Apr 22:18 ./2017-09-15_607.txt
-.rw-r--r-- 171 tlater 9 Apr 22:18 ./2018-03-08_551.txt
-.rw-r--r-- 174 tlater 9 Apr 22:17 ./2018-05-25_517.txt
-.rw-r--r-- 143 tlater 9 Apr 22:17 ./2018-06-08_512.txt
-.rw-r--r-- 216 tlater 9 Apr 22:17 ./2018-07-13_504.txt
-.rw-r--r-- 131 tlater 9 Apr 22:17 ./2018-08-10_479.txt
-.rw-r--r-- 198 tlater 9 Apr 22:16 ./2018-12-20_385.txt
-.rw-r--r-- 300 tlater 9 Apr 22:15 ./2019-02-15_359.txt
-.rw-r--r-- 241 tlater 9 Apr 22:15 ./2019-04-17_331.txt
-.rw-r--r-- 209 tlater 9 Apr 22:15 ./2019-05-21_309.txt
-.rw-r--r-- 264 tlater 9 Apr 22:15 ./2019-06-11_306.txt
-.rw-r--r-- 325 tlater 9 Apr 22:15 ./2019-06-11_307.txt
-.rw-r--r-- 306 tlater 9 Apr 22:15 ./2019-07-22_286.txt
-.rw-r--r-- 131 tlater 9 Apr 22:14 ./2019-09-05_266.txt
-.rw-r--r-- 264 tlater 9 Apr 22:14 ./2019-09-09_265.txt
-.rw-r--r-- 177 tlater 9 Apr 22:14 ./2019-11-19_231.txt
-.rw-r--r-- 203 tlater 9 Apr 22:13 ./2020-02-01_158.txt
-.rw-r--r-- 204 tlater 9 Apr 22:13 ./2020-03-01_151.txt
-.rw-r--r-- 158 tlater 9 Apr 22:12 ./2020-04-01_125.txt
-.rw-r--r-- 131 tlater 9 Apr 22:13 ./2020-04-01_126.txt
-.rw-r--r-- 182 tlater 9 Apr 22:13 ./2020-04-01_127.txt
-.rw-r--r-- 176 tlater 9 Apr 22:12 ./2020-04-17_95.txt
-.rw-r--r-- 398 tlater 9 Apr 22:12 ./2020-04-17_96.txt
-.rw-r--r-- 174 tlater 9 Apr 22:12 ./2020-05-12_72.txt
-.rw-r--r-- 151 tlater 9 Apr 22:12 ./2020-06-04_63.txt
-.rw-r--r-- 137 tlater 9 Apr 22:12 ./2020-06-10_59.txt
-.rw-r--r-- 161 tlater 9 Apr 22:11 ./2020-07-10_46.txt
-.rw-r--r-- 206 tlater 9 Apr 22:11 ./2020-07-17_41.txt
-.rw-r--r-- 189 tlater 9 Apr 22:11 ./2020-09-04_33.txt
-.rw-r--r-- 156 tlater 9 Apr 22:11 ./2020-09-07_30.txt
-.rw-r--r-- 201 tlater 9 Apr 22:11 ./2020-10-01_15.txt
-```
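The sizes in the deleted listing suggest the stubs can be separated from real articles by a simple byte threshold. A minimal sketch, assuming anything under ~450 bytes is a PDF-link stub rather than article text; the cutoff is a guess based on the listing (stubs range from 131 to 398 bytes), not something the dump specifies:

```python
from pathlib import Path

# Assumed cutoff: the listed stub files top out at 398 bytes.
STUB_THRESHOLD = 450

for path in sorted(Path("articles-ningxia").glob("*.txt")):
    if path.stat().st_size < STUB_THRESHOLD:
        print(f"probable PDF-link stub: {path}")
```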
Binary file not shown.
@@ -1,32 +0,0 @@
-r"""Script to scrape article text from http://wsjkw.nx.gov.cn.
-
-Article contents are in a div with the class `xl-content`.
-"""
-
-import requests
-from bs4 import BeautifulSoup
-
-from ..utils.linkutils import read_links
-
-
-def main():
-    """Collect and output article text."""
-    with open("articles-ningxia/links.csv", "r") as f:
-        links = read_links(f)
-
-    for i, link in enumerate(links):
-        print(f"Downloading {link.url} ({i}/{len(links)})")
-        text = get_article_text(link.url)
-        with open(f"articles-ningxia/{link.date}_{i}.txt", "w+") as f:
-            f.write(text)
-
-
-def get_article_text(link: str) -> str:
-    """Download article text."""
-    request = requests.get(link)
-    soup = BeautifulSoup(request.text, "html.parser")
-    return soup.find(class_="xl-content").get_text().strip()
-
-
-if __name__ == "__main__":
-    main()
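A toy illustration of the extraction step in `get_article_text`: BeautifulSoup's `find(class_="xl-content")` pulls the article body out of the page markup. The HTML below is invented for the example; real pages on wsjkw.nx.gov.cn are more elaborate:

```python
from bs4 import BeautifulSoup

# Invented sample markup with the class the scraper targets.
html = '<div class="xl-content"><p>  通知内容  </p></div>'

soup = BeautifulSoup(html, "html.parser")
print(soup.find(class_="xl-content").get_text().strip())  # -> 通知内容
```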
@@ -1,58 +0,0 @@
-r"""Script to scrape article links from http://wsjkw.nx.gov.cn.
-
-Links are available from pages
-http://wsjkw.nx.gov.cn/xwzx_279/tzgg/index(_\d+)?.html.
-
-The page structure is a bit difficult: it contains sub-lists of a big
-list, with separating borders every few elements.
-
-Something like div.gl-list li > a
-"""
-
-from typing import List
-
-import requests
-from bs4 import BeautifulSoup, Tag
-
-from utils.linkutils import Link, dump_links
-
-PAGE_START = 8
-PAGE_END = 44
-PAGE_BASE = "http://wsjkw.nx.gov.cn/xwzx_279/tzgg/"
-
-
-def main():
-    """Collect and output article links."""
-    links = [
-        link
-        for page in range(PAGE_START - 1, PAGE_END)
-        for link in get_article_links(page)
-    ]
-
-    with open("articles-ningxia/links.csv", "w+") as f:
-        dump_links(links, f)
-
-
-def get_article_links(page: int) -> List[Link]:
-    """Get all (article link, date) tuples from a specific page."""
-    page_link = f"{PAGE_BASE}index_{page}.html"
-    soup = BeautifulSoup(requests.get(page_link).text, "html.parser")
-
-    link_nodes = soup.select("div.gl-list li > a")
-    date_nodes = soup.select("div.gl-list li > span")
-
-    def parse_link(tag: Tag) -> str:
-        link: str = tag.get("href")
-        if link.startswith("./"):
-            link = PAGE_BASE + link[2:]
-
-        return link
-
-    return [
-        Link(parse_link(link), date.get_text()[1:-1])
-        for link, date in zip(link_nodes, date_nodes)
-    ]
-
-
-if __name__ == "__main__":
-    main()
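The selector pairing in `get_article_links` only holds if every `li` contains exactly one `a` and one `span`, in matching order. A toy check under that assumption; the sample HTML is invented, and the `[1:-1]` slice presumes the dates are wrapped in brackets:

```python
from bs4 import BeautifulSoup

# Invented sample of the assumed div.gl-list structure.
html = """
<div class="gl-list"><ul>
  <li><a href="./202010/t20201001_15.html">通知</a><span>[2020-10-01]</span></li>
</ul></div>
"""

soup = BeautifulSoup(html, "html.parser")
link_nodes = soup.select("div.gl-list li > a")
date_nodes = soup.select("div.gl-list li > span")
for a, span in zip(link_nodes, date_nodes):
    print(a.get("href"), span.get_text()[1:-1])
# -> ./202010/t20201001_15.html 2020-10-01
```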
@@ -1,25 +0,0 @@
-"""Utility functions for handling links."""
-import csv
-from typing import List, NamedTuple, TextIO
-
-
-class Link(NamedTuple):
-    """A type for links - contains its url and date."""
-
-    url: str
-    date: str
-
-
-def dump_links(links: List[Link], f: TextIO):
-    """Dump links to a file in csv format."""
-    writer = csv.writer(f)
-    writer.writerow(["index", "link", "date"])
-    for i, link in enumerate(links):
-        writer.writerow([i, link.url, link.date])
-
-
-def read_links(f: TextIO) -> List[Link]:
-    """Read links from a csv format."""
-    reader = csv.reader(f)
-    next(reader)  # Skip the header
-    return [Link(link[1], link[2]) for link in reader]
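A round-trip sanity check for `dump_links`/`read_links` using an in-memory buffer. It assumes `utils.linkutils` is importable (hence the PYTHONPATH note above), and the URL is a made-up placeholder:

```python
import io

from utils.linkutils import Link, dump_links, read_links

# Write one link out in csv format, then read it back.
buf = io.StringIO()
dump_links([Link("http://wsjkw.nx.gov.cn/example.html", "2020-10-01")], buf)

buf.seek(0)
assert read_links(buf) == [Link("http://wsjkw.nx.gov.cn/example.html", "2020-10-01")]
```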