diff --git a/shanxi/README.md b/shanxi/README.md deleted file mode 100644 index 4cf6b71..0000000 --- a/shanxi/README.md +++ /dev/null @@ -1,64 +0,0 @@ -## Shanxi scraping - -This time, articles from the last few pages are inaccessible, 254-283. - -Zip of full article dump: [articles-shanxi.zip](./articles-shanxi.zip). - -There are, once again, files that are likely just links to PDFs: - -```console -.rw-r--r-- 328 tlater 9 Apr 23:37 ./2017-03-29_239.txt -.rw-r--r-- 301 tlater 9 Apr 23:36 ./2017-06-08_229.txt -.rw-r--r-- 286 tlater 9 Apr 23:36 ./2017-06-20_223.txt -.rw-r--r-- 310 tlater 9 Apr 23:36 ./2017-07-12_218.txt -.rw-r--r-- 242 tlater 9 Apr 23:35 ./2017-11-17_198.txt -.rw-r--r-- 298 tlater 9 Apr 23:34 ./2018-04-27_156.txt -.rw-r--r-- 288 tlater 9 Apr 23:34 ./2018-04-27_157.txt -.rw-r--r-- 298 tlater 9 Apr 23:34 ./2018-04-27_158.txt -.rw-r--r-- 322 tlater 9 Apr 23:33 ./2018-05-15_149.txt -.rw-r--r-- 352 tlater 9 Apr 23:34 ./2018-05-15_150.txt -.rw-r--r-- 334 tlater 9 Apr 23:34 ./2018-05-15_151.txt -.rw-r--r-- 128 tlater 9 Apr 23:33 ./2018-09-25_135.txt -.rw-r--r-- 293 tlater 9 Apr 23:33 ./2018-11-16_128.txt -.rw-r--r-- 332 tlater 9 Apr 23:33 ./2018-11-28_124.txt -.rw-r--r-- 298 tlater 9 Apr 23:32 ./2019-01-15_115.txt -.rw-r--r-- 277 tlater 9 Apr 23:33 ./2019-01-15_116.txt -.rw-r--r-- 364 tlater 9 Apr 23:31 ./2019-05-30_83.txt -.rw-r--r-- 313 tlater 9 Apr 23:31 ./2019-05-30_85.txt -.rw-r--r-- 352 tlater 9 Apr 23:32 ./2019-05-30_92.txt -.rw-r--r-- 295 tlater 9 Apr 23:32 ./2019-05-30_93.txt -.rw-r--r-- 388 tlater 9 Apr 23:32 ./2019-05-30_94.txt -.rw-r--r-- 358 tlater 9 Apr 23:32 ./2019-05-30_96.txt -.rw-r--r-- 382 tlater 9 Apr 23:31 ./2019-05-31_77.txt -.rw-r--r-- 361 tlater 9 Apr 23:31 ./2019-05-31_80.txt -.rw-r--r-- 250 tlater 9 Apr 23:31 ./2019-07-15_68.txt -.rw-r--r-- 395 tlater 9 Apr 23:29 ./2019-10-21_59.txt -.rw-r--r-- 278 tlater 9 Apr 23:30 ./2019-10-21_61.txt -.rw-r--r-- 380 tlater 9 Apr 23:29 ./2019-11-05_58.txt -.rw-r--r-- 329 tlater 9 Apr 23:29 ./2019-12-04_56.txt -.rw-r--r-- 341 tlater 9 Apr 23:29 ./2019-12-05_55.txt -.rw-r--r-- 363 tlater 9 Apr 23:29 ./2019-12-20_45.txt -.rw-r--r-- 339 tlater 9 Apr 23:29 ./2019-12-20_46.txt -.rw-r--r-- 338 tlater 9 Apr 23:29 ./2020-03-27_44.txt -.rw-r--r-- 373 tlater 9 Apr 23:29 ./2020-05-06_37.txt -.rw-r--r-- 338 tlater 9 Apr 23:29 ./2020-05-06_38.txt -.rw-r--r-- 349 tlater 9 Apr 23:29 ./2020-05-06_39.txt -.rw-r--r-- 386 tlater 9 Apr 23:29 ./2020-05-20_34.txt -.rw-r--r-- 315 tlater 9 Apr 23:29 ./2020-06-29_32.txt -.rw-r--r-- 321 tlater 9 Apr 23:29 ./2020-07-02_31.txt -.rw-r--r-- 350 tlater 9 Apr 23:29 ./2020-07-13_29.txt -.rw-r--r-- 380 tlater 9 Apr 23:28 ./2020-07-23_14.txt -.rw-r--r-- 362 tlater 9 Apr 23:28 ./2020-07-23_15.txt -.rw-r--r-- 374 tlater 9 Apr 23:28 ./2020-07-23_16.txt -.rw-r--r-- 392 tlater 9 Apr 23:28 ./2020-07-23_19.txt -.rw-r--r-- 392 tlater 9 Apr 23:28 ./2020-07-23_24.txt -.rw-r--r-- 266 tlater 9 Apr 23:28 ./2020-08-04_13.txt -.rw-r--r-- 206 tlater 9 Apr 23:28 ./2020-08-19_10.txt -.rw-r--r-- 302 tlater 9 Apr 23:28 ./2020-08-21_9.txt -.rw-r--r-- 258 tlater 9 Apr 23:28 ./2020-09-01_6.txt -.rw-r--r-- 284 tlater 9 Apr 23:28 ./2020-09-01_7.txt -.rw-r--r-- 326 tlater 9 Apr 23:28 ./2020-09-01_8.txt -.rw-r--r-- 194 tlater 9 Apr 23:28 ./2020-09-14_4.txt -.rw-r--r-- 332 tlater 9 Apr 23:28 ./2020-10-09_3.txt -.rw-r--r-- 315 tlater 9 Apr 23:28 ./2020-10-12_1.txt -``` diff --git a/shanxi/articles-shanxi.zip b/shanxi/articles-shanxi.zip deleted file mode 100644 index 9091462..0000000 Binary files a/shanxi/articles-shanxi.zip and /dev/null differ diff --git a/shanxi/scrape-articles.py b/shanxi/scrape-articles.py index 8896064..63fb55c 100644 --- a/shanxi/scrape-articles.py +++ b/shanxi/scrape-articles.py @@ -12,9 +12,6 @@ def main(): with open("articles-shanxi/links.csv", "r") as f: links = read_links(f) - # Broken links - links = links[:254] - download_link_texts(links, "message-box", "articles-shanxi", encoding="UTF-8")