diff --git a/qinghai/2020-02-08_275.html b/qinghai/2020-02-08_275.html
deleted file mode 100644
index f5ee8b5..0000000
--- a/qinghai/2020-02-08_275.html
+++ /dev/null
@@ -1,125 +0,0 @@
-国家卫生健康委关于新型冠状病毒肺炎暂命名事宜的通知
-发布时间: 2020-02-08  来源: 医政医管局
-国卫医函〔2020〕42号
-各省、自治区、直辖市人民政府,新疆生产建设兵团,国务院应对新型冠状病毒肺炎疫情联防联控机制成员:
-  现决定将“新型冠状病毒感染的肺炎”暂命名为“新型冠状病毒肺炎”,简称“新冠肺炎”;英文名称为“Novel Coronavirus Pneumonia”,简称“NCP”。
-国家卫生健康委
-2020年2月7日
-  (信息公开形式:主动公开)
\ No newline at end of file
diff --git a/qinghai/README.md b/qinghai/README.md
deleted file mode 100644
index 18cea8b..0000000
--- a/qinghai/README.md
+++ /dev/null
@@ -1,53 +0,0 @@
-## Qinghai scraping
-
-A few links no longer exist. Their indexes in `links.csv` are 210, 453,
-681, 703, 791, 871, 913, 914, 915.
-
-There are again a few small files, most of them just links to PDFs:
-
-```console
-.rw-r--r-- 101 tlater 9 Apr 19:26 ./2016-09-28_923.txt
-.rw-r--r-- 133 tlater 9 Apr 19:26 ./2016-09-28_924.txt
-.rw-r--r-- 116 tlater 9 Apr 19:26 ./2016-09-28_925.txt
-.rw-r--r-- 147 tlater 9 Apr 19:26 ./2016-09-28_926.txt
-.rw-r--r-- 111 tlater 9 Apr 19:23 ./2017-03-16_838.txt
-.rw-r--r-- 36 tlater 9 Apr 19:20 ./2017-07-07_745.txt
-.rw-r--r-- 82 tlater 9 Apr 19:17 ./2017-08-14_723.txt
-.rw-r--r-- 211 tlater 9 Apr 19:17 ./2017-09-12_704.txt
-.rw-r--r-- 97 tlater 9 Apr 19:14 ./2017-11-15_587.txt
-.rw-r--r-- 156 tlater 9 Apr 19:13 ./2017-11-20_580.txt
-.rw-r--r-- 283 tlater 9 Apr 19:13 ./2017-11-23_575.txt
-.rw-r--r-- 39 tlater 9 Apr 19:13 ./2017-12-29_566.txt
-.rw-r--r-- 39 tlater 9 Apr 19:13 ./2018-01-12_561.txt
-.rw-r--r-- 165 tlater 9 Apr 19:12 ./2018-05-30_505.txt
-.rw-r--r-- 145 tlater 9 Apr 19:12 ./2018-05-30_507.txt
-.rw-r--r-- 391 tlater 9 Apr 19:11 ./2018-07-25_475.txt
-.rw-r--r-- 158 tlater 9 Apr 19:11 ./2018-09-13_467.txt
-.rw-r--r-- 204 tlater 9 Apr 19:04 ./2020-03-09_254.txt
-.rw-r--r-- 124 tlater 9 Apr 19:04 ./2020-03-18_248.txt
-.rw-r--r-- 228 tlater 9 Apr 19:04 ./2020-03-20_245.txt
-.rw-r--r-- 186 tlater 9 Apr 19:03 ./2020-04-01_221.txt
-.rw-r--r-- 67 tlater 9 Apr 19:02 ./2020-04-21_208.txt
-.rw-r--r-- 174 tlater 9 Apr 19:01 ./2020-04-30_194.txt
-.rw-r--r-- 147 tlater 9 Apr 19:01 ./2020-05-08_186.txt
-.rw-r--r-- 189 tlater 9 Apr 19:01 ./2020-05-12_182.txt
-.rw-r--r-- 82 tlater 9 Apr 19:01 ./2020-05-15_180.txt
-.rw-r--r-- 119 tlater 9 Apr 19:00 ./2020-06-04_139.txt
-.rw-r--r-- 201 tlater 9 Apr 19:00 ./2020-07-01_114.txt
-.rw-r--r-- 113 tlater 9 Apr 18:59 ./2020-07-20_90.txt
-.rw-r--r-- 115 tlater 9 Apr 18:59 ./2020-07-21_86.txt
-.rw-r--r-- 99 tlater 9 Apr 18:58 ./2020-08-27_36.txt
-.rw-r--r-- 99 tlater 9 Apr 18:58 ./2020-08-27_37.txt
-.rw-r--r-- 130 tlater 9 Apr 18:58 ./2020-08-27_38.txt
-.rw-r--r-- 130 tlater 9 Apr 18:58 ./2020-08-27_39.txt
-.rw-r--r-- 190 tlater 9 Apr 18:58 ./2020-08-27_40.txt
-.rw-r--r-- 190 tlater 9 Apr 18:58 ./2020-08-27_41.txt
-.rw-r--r-- 184 tlater 9 Apr 18:58 ./2020-08-27_42.txt
-.rw-r--r-- 184 tlater 9 Apr 18:58 ./2020-08-27_43.txt
-.rw-r--r-- 127 tlater 9 Apr 18:58 ./2020-08-27_44.txt
-.rw-r--r-- 127 tlater 9 Apr 18:58 ./2020-08-27_45.txt
-.rw-r--r-- 94 tlater 9 Apr 18:58 ./2020-08-27_46.txt
-.rw-r--r-- 94 tlater 9 Apr 18:58 ./2020-08-27_47.txt
-.rw-r--r-- 88 tlater 9 Apr 18:58 ./2020-09-12_20.txt
-.rw-r--r-- 200 tlater 9 Apr 18:57 ./2020-09-21_11.txt
-```
diff --git a/qinghai/scrape-iframe.py b/qinghai/scrape-iframe.py
deleted file mode 100644
index a9a6f6d..0000000
--- a/qinghai/scrape-iframe.py
+++ /dev/null
@@ -1,24 +0,0 @@
-"""Script to scrape contents from a specific article.
-
-This is for
-http://www.nhc.gov.cn/xcs/zhengcwj/202002/18c1bb43965a4492907957875de02ae7.shtml.
-
-For whatever reason, this article is implemented as an iframe, so it
-requires downloading with a full-featured browser. It's only this one,
-though, so we parse it separately here.
-""" - -from bs4 import BeautifulSoup - - -def main(): - """Scrape html site.""" - with open("2020-02-08_275.html", "r") as f: - soup = BeautifulSoup(f.read(), "html.parser") - text = soup.find(class_="w1024").get_text().strip() - with open("articles-qinghai/2020-02-08_275.txt", "w+") as f: - f.write(text) - - -if __name__ == "__main__": - main() diff --git a/qinghai/scrape.py b/qinghai/scrape.py index f6b8025..812973a 100644 --- a/qinghai/scrape.py +++ b/qinghai/scrape.py @@ -35,11 +35,8 @@ def main(): writer.writerow(i, link[0], link[1]) for i, link in enumerate(links): - # Broken links - # - # 275 was available as an iframe, and is parsed separately in - # scrape-iframe.py - if i in (210, 275, 453, 681, 703, 791, 871, 913, 914, 915): + # Broken link + if i == 210: continue print(f"Downloading {link[0]} ({i}/{len(links)})")