Compare commits
No commits in common. "f9aab0628e392667c47f286d906dca4c51842814" and "3602c569669586d5e0dd89341e9c3ce1f57125c7" have entirely different histories.
f9aab0628e
...
3602c56966
|
@ -1,64 +0,0 @@
|
|||
## Shanxi scraping
|
||||
|
||||
This time, articles 254-283, which come from the last few pages, are inaccessible.
|
||||
|
||||
Zip of full article dump: [articles-shanxi.zip](./articles-shanxi.zip).
|
||||
|
||||
There are, once again, files that are likely just links to PDFs:
|
||||
|
||||
```console
|
||||
.rw-r--r-- 328 tlater 9 Apr 23:37 ./2017-03-29_239.txt
|
||||
.rw-r--r-- 301 tlater 9 Apr 23:36 ./2017-06-08_229.txt
|
||||
.rw-r--r-- 286 tlater 9 Apr 23:36 ./2017-06-20_223.txt
|
||||
.rw-r--r-- 310 tlater 9 Apr 23:36 ./2017-07-12_218.txt
|
||||
.rw-r--r-- 242 tlater 9 Apr 23:35 ./2017-11-17_198.txt
|
||||
.rw-r--r-- 298 tlater 9 Apr 23:34 ./2018-04-27_156.txt
|
||||
.rw-r--r-- 288 tlater 9 Apr 23:34 ./2018-04-27_157.txt
|
||||
.rw-r--r-- 298 tlater 9 Apr 23:34 ./2018-04-27_158.txt
|
||||
.rw-r--r-- 322 tlater 9 Apr 23:33 ./2018-05-15_149.txt
|
||||
.rw-r--r-- 352 tlater 9 Apr 23:34 ./2018-05-15_150.txt
|
||||
.rw-r--r-- 334 tlater 9 Apr 23:34 ./2018-05-15_151.txt
|
||||
.rw-r--r-- 128 tlater 9 Apr 23:33 ./2018-09-25_135.txt
|
||||
.rw-r--r-- 293 tlater 9 Apr 23:33 ./2018-11-16_128.txt
|
||||
.rw-r--r-- 332 tlater 9 Apr 23:33 ./2018-11-28_124.txt
|
||||
.rw-r--r-- 298 tlater 9 Apr 23:32 ./2019-01-15_115.txt
|
||||
.rw-r--r-- 277 tlater 9 Apr 23:33 ./2019-01-15_116.txt
|
||||
.rw-r--r-- 364 tlater 9 Apr 23:31 ./2019-05-30_83.txt
|
||||
.rw-r--r-- 313 tlater 9 Apr 23:31 ./2019-05-30_85.txt
|
||||
.rw-r--r-- 352 tlater 9 Apr 23:32 ./2019-05-30_92.txt
|
||||
.rw-r--r-- 295 tlater 9 Apr 23:32 ./2019-05-30_93.txt
|
||||
.rw-r--r-- 388 tlater 9 Apr 23:32 ./2019-05-30_94.txt
|
||||
.rw-r--r-- 358 tlater 9 Apr 23:32 ./2019-05-30_96.txt
|
||||
.rw-r--r-- 382 tlater 9 Apr 23:31 ./2019-05-31_77.txt
|
||||
.rw-r--r-- 361 tlater 9 Apr 23:31 ./2019-05-31_80.txt
|
||||
.rw-r--r-- 250 tlater 9 Apr 23:31 ./2019-07-15_68.txt
|
||||
.rw-r--r-- 395 tlater 9 Apr 23:29 ./2019-10-21_59.txt
|
||||
.rw-r--r-- 278 tlater 9 Apr 23:30 ./2019-10-21_61.txt
|
||||
.rw-r--r-- 380 tlater 9 Apr 23:29 ./2019-11-05_58.txt
|
||||
.rw-r--r-- 329 tlater 9 Apr 23:29 ./2019-12-04_56.txt
|
||||
.rw-r--r-- 341 tlater 9 Apr 23:29 ./2019-12-05_55.txt
|
||||
.rw-r--r-- 363 tlater 9 Apr 23:29 ./2019-12-20_45.txt
|
||||
.rw-r--r-- 339 tlater 9 Apr 23:29 ./2019-12-20_46.txt
|
||||
.rw-r--r-- 338 tlater 9 Apr 23:29 ./2020-03-27_44.txt
|
||||
.rw-r--r-- 373 tlater 9 Apr 23:29 ./2020-05-06_37.txt
|
||||
.rw-r--r-- 338 tlater 9 Apr 23:29 ./2020-05-06_38.txt
|
||||
.rw-r--r-- 349 tlater 9 Apr 23:29 ./2020-05-06_39.txt
|
||||
.rw-r--r-- 386 tlater 9 Apr 23:29 ./2020-05-20_34.txt
|
||||
.rw-r--r-- 315 tlater 9 Apr 23:29 ./2020-06-29_32.txt
|
||||
.rw-r--r-- 321 tlater 9 Apr 23:29 ./2020-07-02_31.txt
|
||||
.rw-r--r-- 350 tlater 9 Apr 23:29 ./2020-07-13_29.txt
|
||||
.rw-r--r-- 380 tlater 9 Apr 23:28 ./2020-07-23_14.txt
|
||||
.rw-r--r-- 362 tlater 9 Apr 23:28 ./2020-07-23_15.txt
|
||||
.rw-r--r-- 374 tlater 9 Apr 23:28 ./2020-07-23_16.txt
|
||||
.rw-r--r-- 392 tlater 9 Apr 23:28 ./2020-07-23_19.txt
|
||||
.rw-r--r-- 392 tlater 9 Apr 23:28 ./2020-07-23_24.txt
|
||||
.rw-r--r-- 266 tlater 9 Apr 23:28 ./2020-08-04_13.txt
|
||||
.rw-r--r-- 206 tlater 9 Apr 23:28 ./2020-08-19_10.txt
|
||||
.rw-r--r-- 302 tlater 9 Apr 23:28 ./2020-08-21_9.txt
|
||||
.rw-r--r-- 258 tlater 9 Apr 23:28 ./2020-09-01_6.txt
|
||||
.rw-r--r-- 284 tlater 9 Apr 23:28 ./2020-09-01_7.txt
|
||||
.rw-r--r-- 326 tlater 9 Apr 23:28 ./2020-09-01_8.txt
|
||||
.rw-r--r-- 194 tlater 9 Apr 23:28 ./2020-09-14_4.txt
|
||||
.rw-r--r-- 332 tlater 9 Apr 23:28 ./2020-10-09_3.txt
|
||||
.rw-r--r-- 315 tlater 9 Apr 23:28 ./2020-10-12_1.txt
|
||||
```
|
Binary file not shown.
|
@ -12,9 +12,6 @@ def main():
|
|||
with open("articles-shanxi/links.csv", "r") as f:
|
||||
links = read_links(f)
|
||||
|
||||
# Broken links
|
||||
links = links[:254]
|
||||
|
||||
download_link_texts(links, "message-box", "articles-shanxi", encoding="UTF-8")
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue