diff --git a/shanxi/README.md b/shanxi/README.md new file mode 100644 index 0000000..4cf6b71 --- /dev/null +++ b/shanxi/README.md @@ -0,0 +1,64 @@ +## Shanxi scraping + +This time, the articles from the last few pages (254-283) are inaccessible. + +Zip of full article dump: [articles-shanxi.zip](./articles-shanxi.zip). + +There are, once again, files that are likely just links to PDFs: + +```console +.rw-r--r-- 328 tlater 9 Apr 23:37 ./2017-03-29_239.txt +.rw-r--r-- 301 tlater 9 Apr 23:36 ./2017-06-08_229.txt +.rw-r--r-- 286 tlater 9 Apr 23:36 ./2017-06-20_223.txt +.rw-r--r-- 310 tlater 9 Apr 23:36 ./2017-07-12_218.txt +.rw-r--r-- 242 tlater 9 Apr 23:35 ./2017-11-17_198.txt +.rw-r--r-- 298 tlater 9 Apr 23:34 ./2018-04-27_156.txt +.rw-r--r-- 288 tlater 9 Apr 23:34 ./2018-04-27_157.txt +.rw-r--r-- 298 tlater 9 Apr 23:34 ./2018-04-27_158.txt +.rw-r--r-- 322 tlater 9 Apr 23:33 ./2018-05-15_149.txt +.rw-r--r-- 352 tlater 9 Apr 23:34 ./2018-05-15_150.txt +.rw-r--r-- 334 tlater 9 Apr 23:34 ./2018-05-15_151.txt +.rw-r--r-- 128 tlater 9 Apr 23:33 ./2018-09-25_135.txt +.rw-r--r-- 293 tlater 9 Apr 23:33 ./2018-11-16_128.txt +.rw-r--r-- 332 tlater 9 Apr 23:33 ./2018-11-28_124.txt +.rw-r--r-- 298 tlater 9 Apr 23:32 ./2019-01-15_115.txt +.rw-r--r-- 277 tlater 9 Apr 23:33 ./2019-01-15_116.txt +.rw-r--r-- 364 tlater 9 Apr 23:31 ./2019-05-30_83.txt +.rw-r--r-- 313 tlater 9 Apr 23:31 ./2019-05-30_85.txt +.rw-r--r-- 352 tlater 9 Apr 23:32 ./2019-05-30_92.txt +.rw-r--r-- 295 tlater 9 Apr 23:32 ./2019-05-30_93.txt +.rw-r--r-- 388 tlater 9 Apr 23:32 ./2019-05-30_94.txt +.rw-r--r-- 358 tlater 9 Apr 23:32 ./2019-05-30_96.txt +.rw-r--r-- 382 tlater 9 Apr 23:31 ./2019-05-31_77.txt +.rw-r--r-- 361 tlater 9 Apr 23:31 ./2019-05-31_80.txt +.rw-r--r-- 250 tlater 9 Apr 23:31 ./2019-07-15_68.txt +.rw-r--r-- 395 tlater 9 Apr 23:29 ./2019-10-21_59.txt +.rw-r--r-- 278 tlater 9 Apr 23:30 ./2019-10-21_61.txt +.rw-r--r-- 380 tlater 9 Apr 23:29 ./2019-11-05_58.txt +.rw-r--r-- 329 tlater 9 Apr 23:29 
./2019-12-04_56.txt +.rw-r--r-- 341 tlater 9 Apr 23:29 ./2019-12-05_55.txt +.rw-r--r-- 363 tlater 9 Apr 23:29 ./2019-12-20_45.txt +.rw-r--r-- 339 tlater 9 Apr 23:29 ./2019-12-20_46.txt +.rw-r--r-- 338 tlater 9 Apr 23:29 ./2020-03-27_44.txt +.rw-r--r-- 373 tlater 9 Apr 23:29 ./2020-05-06_37.txt +.rw-r--r-- 338 tlater 9 Apr 23:29 ./2020-05-06_38.txt +.rw-r--r-- 349 tlater 9 Apr 23:29 ./2020-05-06_39.txt +.rw-r--r-- 386 tlater 9 Apr 23:29 ./2020-05-20_34.txt +.rw-r--r-- 315 tlater 9 Apr 23:29 ./2020-06-29_32.txt +.rw-r--r-- 321 tlater 9 Apr 23:29 ./2020-07-02_31.txt +.rw-r--r-- 350 tlater 9 Apr 23:29 ./2020-07-13_29.txt +.rw-r--r-- 380 tlater 9 Apr 23:28 ./2020-07-23_14.txt +.rw-r--r-- 362 tlater 9 Apr 23:28 ./2020-07-23_15.txt +.rw-r--r-- 374 tlater 9 Apr 23:28 ./2020-07-23_16.txt +.rw-r--r-- 392 tlater 9 Apr 23:28 ./2020-07-23_19.txt +.rw-r--r-- 392 tlater 9 Apr 23:28 ./2020-07-23_24.txt +.rw-r--r-- 266 tlater 9 Apr 23:28 ./2020-08-04_13.txt +.rw-r--r-- 206 tlater 9 Apr 23:28 ./2020-08-19_10.txt +.rw-r--r-- 302 tlater 9 Apr 23:28 ./2020-08-21_9.txt +.rw-r--r-- 258 tlater 9 Apr 23:28 ./2020-09-01_6.txt +.rw-r--r-- 284 tlater 9 Apr 23:28 ./2020-09-01_7.txt +.rw-r--r-- 326 tlater 9 Apr 23:28 ./2020-09-01_8.txt +.rw-r--r-- 194 tlater 9 Apr 23:28 ./2020-09-14_4.txt +.rw-r--r-- 332 tlater 9 Apr 23:28 ./2020-10-09_3.txt +.rw-r--r-- 315 tlater 9 Apr 23:28 ./2020-10-12_1.txt +``` diff --git a/shanxi/articles-shanxi.zip b/shanxi/articles-shanxi.zip new file mode 100644 index 0000000..9091462 Binary files /dev/null and b/shanxi/articles-shanxi.zip differ