From 06dabf8c03461094d16632e5efefd9c49ba8db74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tristan=20Dani=C3=ABl=20Maat?= Date: Sat, 9 Apr 2022 19:30:41 +0100 Subject: [PATCH 1/3] Work around other broken links --- qinghai/scrape.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/qinghai/scrape.py b/qinghai/scrape.py index 812973a..f6b8025 100644 --- a/qinghai/scrape.py +++ b/qinghai/scrape.py @@ -35,8 +35,11 @@ def main(): writer.writerow(i, link[0], link[1]) for i, link in enumerate(links): - # Broken link - if i == 210: + # Broken links + # + # 275 was available as an iframe, and is parsed separately in + # scrape-iframe.py + if i in (210, 275, 453, 681, 703, 791, 871, 913, 914, 915): continue print(f"Downloading {link[0]} ({i}/{len(links)})") From 4a1cbbe4525d44a74fd4fc7adc45eaf6e28662e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tristan=20Dani=C3=ABl=20Maat?= Date: Sat, 9 Apr 2022 19:30:55 +0100 Subject: [PATCH 2/3] Document the result of the dump --- qinghai/README.md | 53 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 qinghai/README.md diff --git a/qinghai/README.md b/qinghai/README.md new file mode 100644 index 0000000..18cea8b --- /dev/null +++ b/qinghai/README.md @@ -0,0 +1,53 @@ +## Qinghai scraping + +A few links don't exist anymore. They have the indexes 210, 453, 681, +703, 791, 871, 913, 914, 915 in `links.csv`. 
+ +There are a few small files again, mostly pdf links: + +```console +.rw-r--r-- 101 tlater 9 Apr 19:26 ./2016-09-28_923.txt +.rw-r--r-- 133 tlater 9 Apr 19:26 ./2016-09-28_924.txt +.rw-r--r-- 116 tlater 9 Apr 19:26 ./2016-09-28_925.txt +.rw-r--r-- 147 tlater 9 Apr 19:26 ./2016-09-28_926.txt +.rw-r--r-- 111 tlater 9 Apr 19:23 ./2017-03-16_838.txt +.rw-r--r-- 36 tlater 9 Apr 19:20 ./2017-07-07_745.txt +.rw-r--r-- 82 tlater 9 Apr 19:17 ./2017-08-14_723.txt +.rw-r--r-- 211 tlater 9 Apr 19:17 ./2017-09-12_704.txt +.rw-r--r-- 97 tlater 9 Apr 19:14 ./2017-11-15_587.txt +.rw-r--r-- 156 tlater 9 Apr 19:13 ./2017-11-20_580.txt +.rw-r--r-- 283 tlater 9 Apr 19:13 ./2017-11-23_575.txt +.rw-r--r-- 39 tlater 9 Apr 19:13 ./2017-12-29_566.txt +.rw-r--r-- 39 tlater 9 Apr 19:13 ./2018-01-12_561.txt +.rw-r--r-- 165 tlater 9 Apr 19:12 ./2018-05-30_505.txt +.rw-r--r-- 145 tlater 9 Apr 19:12 ./2018-05-30_507.txt +.rw-r--r-- 391 tlater 9 Apr 19:11 ./2018-07-25_475.txt +.rw-r--r-- 158 tlater 9 Apr 19:11 ./2018-09-13_467.txt +.rw-r--r-- 204 tlater 9 Apr 19:04 ./2020-03-09_254.txt +.rw-r--r-- 124 tlater 9 Apr 19:04 ./2020-03-18_248.txt +.rw-r--r-- 228 tlater 9 Apr 19:04 ./2020-03-20_245.txt +.rw-r--r-- 186 tlater 9 Apr 19:03 ./2020-04-01_221.txt +.rw-r--r-- 67 tlater 9 Apr 19:02 ./2020-04-21_208.txt +.rw-r--r-- 174 tlater 9 Apr 19:01 ./2020-04-30_194.txt +.rw-r--r-- 147 tlater 9 Apr 19:01 ./2020-05-08_186.txt +.rw-r--r-- 189 tlater 9 Apr 19:01 ./2020-05-12_182.txt +.rw-r--r-- 82 tlater 9 Apr 19:01 ./2020-05-15_180.txt +.rw-r--r-- 119 tlater 9 Apr 19:00 ./2020-06-04_139.txt +.rw-r--r-- 201 tlater 9 Apr 19:00 ./2020-07-01_114.txt +.rw-r--r-- 113 tlater 9 Apr 18:59 ./2020-07-20_90.txt +.rw-r--r-- 115 tlater 9 Apr 18:59 ./2020-07-21_86.txt +.rw-r--r-- 99 tlater 9 Apr 18:58 ./2020-08-27_36.txt +.rw-r--r-- 99 tlater 9 Apr 18:58 ./2020-08-27_37.txt +.rw-r--r-- 130 tlater 9 Apr 18:58 ./2020-08-27_38.txt +.rw-r--r-- 130 tlater 9 Apr 18:58 ./2020-08-27_39.txt +.rw-r--r-- 190 tlater 9 Apr 18:58 
./2020-08-27_40.txt +.rw-r--r-- 190 tlater 9 Apr 18:58 ./2020-08-27_41.txt +.rw-r--r-- 184 tlater 9 Apr 18:58 ./2020-08-27_42.txt +.rw-r--r-- 184 tlater 9 Apr 18:58 ./2020-08-27_43.txt +.rw-r--r-- 127 tlater 9 Apr 18:58 ./2020-08-27_44.txt +.rw-r--r-- 127 tlater 9 Apr 18:58 ./2020-08-27_45.txt +.rw-r--r-- 94 tlater 9 Apr 18:58 ./2020-08-27_46.txt +.rw-r--r-- 94 tlater 9 Apr 18:58 ./2020-08-27_47.txt +.rw-r--r-- 88 tlater 9 Apr 18:58 ./2020-09-12_20.txt +.rw-r--r-- 200 tlater 9 Apr 18:57 ./2020-09-21_11.txt +``` From 7fc2a23d82ba85799743e8133fd7a9ee305f1e19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tristan=20Dani=C3=ABl=20Maat?= Date: Sat, 9 Apr 2022 19:31:09 +0100 Subject: [PATCH 3/3] Handle special case 275 --- qinghai/2020-02-08_275.html | 125 ++++++++++++++++++++++++++++++++++++ qinghai/scrape-iframe.py | 24 +++++++ 2 files changed, 149 insertions(+) create mode 100644 qinghai/2020-02-08_275.html create mode 100644 qinghai/scrape-iframe.py diff --git a/qinghai/2020-02-08_275.html b/qinghai/2020-02-08_275.html new file mode 100644 index 0000000..f5ee8b5 --- /dev/null +++ b/qinghai/2020-02-08_275.html @@ -0,0 +1,125 @@ + + + + + + + + + + + + + + + + + + + + + + +国家卫生健康委关于新型冠状病毒肺炎暂命名事宜的通知 + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+

通知公告

+
您现在所在位置: +首页 > 新型冠状病毒肺炎疫情防控 > 通知公告 +
+
+
+
+
国家卫生健康委关于新型冠状病毒肺炎暂命名事宜的通知
+
+
+ 发布时间: + 2020-02-08 + +来源: + 医政医管局 + + + +
+
+
+

国卫医函〔2020〕42号 
+

+

+

各省、自治区、直辖市人民政府,新疆生产建设兵团,国务院应对新型冠状病毒肺炎疫情联防联控机制成员:
+  现决定将“新型冠状病毒感染的肺炎”暂命名为“新型冠状病毒肺炎”,简称“新冠肺炎”;英文名称为“Novel Coronavirus Pneumonia”,简称“NCP”。
+

+

+

+

国家卫生健康委

+

2020年2月7日

+

+

  (信息公开形式:主动公开)

+
+
+ +
+ + 分享到   +
+
+
+
+ +
+ + +
+
+
+

地址:北京市西城区西直门外南路1号  邮编:100044  电话:010-68792114  ICP备案编号:京ICP备18052910号  京公网安备 11010202000005号

+

中华人民共和国国家卫生健康委员会  版权所有  技术支持:国家卫生健康委员会统计信息中心  网站标识码:bm24000006

+
+
+
+normal
\ No newline at end of file
diff --git a/qinghai/scrape-iframe.py b/qinghai/scrape-iframe.py
new file mode 100644
index 0000000..a9a6f6d
--- /dev/null
+++ b/qinghai/scrape-iframe.py
@@ -0,0 +1,24 @@
+"""Script to scrape contents from a specific article.
+
+This is for
+http://www.nhc.gov.cn/xcs/zhengcwj/202002/18c1bb43965a4492907957875de02ae7.shtml.
+
+For whatever reason, this article is implemented as an iframe, so it
+requires downloading with a full-featured browser. It's just one
+though, so let's parse it.
+"""
+
+from bs4 import BeautifulSoup
+
+
+def main():
+    """Write the text of the saved page's "w1024" element to a .txt (UTF-8 I/O)."""
+    with open("2020-02-08_275.html", encoding="utf-8") as f:
+        soup = BeautifulSoup(f.read(), "html.parser")
+    text = soup.find(class_="w1024").get_text().strip()
+    with open("articles-qinghai/2020-02-08_275.txt", "w", encoding="utf-8") as f:
+        f.write(text)
+
+
+if __name__ == "__main__":
+    main()