Compare commits
3 commits
8cb72464b4
...
7fc2a23d82
Author | SHA1 | Date | |
---|---|---|---|
Tristan Daniël Maat | 7fc2a23d82 | ||
Tristan Daniël Maat | 4a1cbbe452 | ||
Tristan Daniël Maat | 06dabf8c03 |
125
qinghai/2020-02-08_275.html
Normal file
125
qinghai/2020-02-08_275.html
Normal file
|
@ -0,0 +1,125 @@
|
|||
<!DOCTYPE html><html><head>
|
||||
<link rel="icon" data-savepage-href="http://www.nhc.gov.cn/favicon.ico" href="">
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||
<meta charset="utf-8">
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1.0, user-scalable=no">
|
||||
|
||||
<meta name="SiteName" content="宣传司">
|
||||
<meta name="SiteDomain" content="">
|
||||
<meta name="SiteIDCode" content="">
|
||||
<meta name="ColumnName" content="通知公告">
|
||||
<meta name="ColumnDescription" content="">
|
||||
<meta name="ColumnKeywords" content="">
|
||||
<meta name="ColumnType" content="">
|
||||
<meta name="ArticleTitle" content="国家卫生健康委关于新型冠状病毒肺炎暂命名事宜的通知">
|
||||
<meta name="PubDate" content="2020-04-22 09:34:47">
|
||||
<meta name="Keywords" content="">
|
||||
<meta name="Description" content="">
|
||||
|
||||
<meta name="others" content="页面生成时间 2020-04-22 09:34:47">
|
||||
<meta name="template,templategroup,version" content="adb7c6acf2db4cc19c609ea5121b3d79,default,1.1">
|
||||
<title>国家卫生健康委关于新型冠状病毒肺炎暂命名事宜的通知</title>
|
||||
<link href="/xcs/xhtml/css/publics.css" type="text/css" rel="stylesheet">
|
||||
<link rel="stylesheet" type="text/css" href="/xcs/xhtml/css/xxgzbd.css">
|
||||
<link href="/xcs/xhtml/css/list.css" type="text/css" rel="stylesheet">
|
||||
|
||||
<!--[if lt IE 9]><script r='m'>document.createElement("section")</script><![endif]--><script data-savepage-type="" type="text/plain" data-savepage-src="/xcs/xhtml/js/jquery-1.8.3.min.js"></script>
|
||||
<script data-savepage-type="text/javascript" type="text/plain"></script>
|
||||
<script data-savepage-type="" type="text/plain"></script>
|
||||
<script data-savepage-type="" type="text/plain" data-savepage-src="http://bdimg.share.baidu.com/static/api/js/share.js?v=89860593.js?cdnversion=458201"></script><style type="text/css">@media print {
|
||||
.TridactylStatusIndicator {
|
||||
display: none !important;
|
||||
}
|
||||
}</style><link rel="stylesheet" href="http://bdimg.share.baidu.com/static/api/css/share_style0_16.css?v=8105b07e.css">
|
||||
<style id="savepage-cssvariables">
|
||||
:root {
|
||||
--savepage-url-5: url(data:image/png;base64,);
|
||||
--savepage-url-6: url(data:image/png;base64,);
|
||||
--savepage-url-10: url(data:image/png;base64,);
|
||||
}
|
||||
</style>
|
||||
<script id="savepage-shadowloader" type="application/javascript">
|
||||
"use strict";
|
||||
window.addEventListener("DOMContentLoaded",
|
||||
function(event) {
|
||||
savepage_ShadowLoader(5);
|
||||
},false);
|
||||
function savepage_ShadowLoader(c){createShadowDOMs(0,document.documentElement);function createShadowDOMs(a,b){var i;if(b.localName=="iframe"||b.localName=="frame"){if(a<c){try{if(b.contentDocument.documentElement!=null){createShadowDOMs(a+1,b.contentDocument.documentElement)}}catch(e){}}}else{if(b.children.length>=1&&b.children[0].localName=="template"&&b.children[0].hasAttribute("data-savepage-shadowroot")){b.attachShadow({mode:"open"}).appendChild(b.children[0].content);b.removeChild(b.children[0]);for(i=0;i<b.shadowRoot.children.length;i++)if(b.shadowRoot.children[i]!=null)createShadowDOMs(a,b.shadowRoot.children[i])}for(i=0;i<b.children.length;i++)if(b.children[i]!=null)createShadowDOMs(a,b.children[i])}}}
|
||||
</script>
|
||||
<meta name="savepage-url" content="http://www.nhc.gov.cn/xcs/zhengcwj/202002/18c1bb43965a4492907957875de02ae7.shtml">
|
||||
<meta name="savepage-title" content="国家卫生健康委关于新型冠状病毒肺炎暂命名事宜的通知">
|
||||
<meta name="savepage-pubdate" content="Unknown">
|
||||
<meta name="savepage-from" content="http://www.nhc.gov.cn/xcs/zhengcwj/202002/18c1bb43965a4492907957875de02ae7.shtml">
|
||||
<meta name="savepage-date" content="Sat Apr 09 2022 19:11:51 GMT+0100 (British Summer Time)">
|
||||
<meta name="savepage-state" content="Basic Items; Retain cross-origin frames; Merge CSS images; Remove unsaved URLs; Load lazy images in existing content; Max frame depth = 5; Max resource size = 50MB; Max resource time = 10s;">
|
||||
<meta name="savepage-version" content="27.3">
|
||||
<meta name="savepage-comments" content="">
|
||||
</head><iframe class="cleanslate hidden" data-savepage-src="moz-extension://4c36391b-a5be-4bdf-a527-2faf208ce26c/static/commandline.html" src="" id="cmdline_iframe" loading="lazy" style="height: 0px !important;" data-savepage-key="0-0"></iframe>
|
||||
|
||||
<body>
|
||||
<script data-savepage-type="" type="text/plain" language="JavaScript"></script>
|
||||
<script data-savepage-type="text/javascript" type="text/plain" charset="utf-8" id="kpyfx_js_id_10006654" data-savepage-src="//fxsjcj.kaipuyun.cn/count/10006654/10006654.js"></script>
|
||||
|
||||
<div class="banner"><img data-savepage-currentsrc="http://www.nhc.gov.cn/xcs/xhtml/images/gzbdbanner.png" data-savepage-src="/xcs/xhtml/images/gzbdbanner.png" src="" class="bannerimg"></div>
|
||||
<div class="banner banner_ydd"><img data-savepage-currentsrc="http://www.nhc.gov.cn/xcs/xhtml/images/gzbdbanner_ydd.png" data-savepage-src="/xcs/xhtml/images/gzbdbanner_ydd.png" src="" class="bannerimg"></div>
|
||||
|
||||
<div class="w1024 mb50">
|
||||
<div class="index_title">
|
||||
<h3 class="index_title_h3 fl">通知公告</h3>
|
||||
<div class="fr loc">您现在所在位置:
|
||||
<a data-savepage-href="/xcs/new_index.shtml" href="http://www.nhc.gov.cn/xcs/new_index.shtml" target="_parent">首页</a> > <a data-savepage-href="/xcs/xxgzbd/gzbd_index.shtml" href="http://www.nhc.gov.cn/xcs/xxgzbd/gzbd_index.shtml" target="_parent">新型冠状病毒肺炎疫情防控</a> > <a data-savepage-href="/xcs/zhengcwj/list_gzbd.shtml" href="http://www.nhc.gov.cn/xcs/zhengcwj/list_gzbd.shtml" target="_parent"><span>通知公告</span></a>
|
||||
</div>
|
||||
<div class="clear"></div>
|
||||
</div>
|
||||
<div class="list">
|
||||
<div class="tit">国家卫生健康委关于新型冠状病毒肺炎暂命名事宜的通知</div>
|
||||
<div class="source">
|
||||
<div class="fr"><span><a href="#" onclick="changeSize('16px');return false;"><img data-savepage-currentsrc="http://www.nhc.gov.cn/xcs/xhtml/images/sm.jpg" data-savepage-src="/xcs/xhtml/images/sm.jpg" src=""></a></span><span><a href="#" onclick="changeSize('20px');return false;"><img data-savepage-currentsrc="http://www.nhc.gov.cn/xcs/xhtml/images/big.jpg" data-savepage-src="/xcs/xhtml/images/big.jpg" src=""></a></span><span><a href="javascript:window.print()"><img data-savepage-currentsrc="http://www.nhc.gov.cn/xcs/xhtml/images/dys.jpg" data-savepage-src="/xcs/xhtml/images/dys.jpg" src=""></a></span></div>
|
||||
<span>发布时间:
|
||||
2020-02-08
|
||||
</span>
|
||||
<span class="mr">来源:
|
||||
医政医管局
|
||||
</span>
|
||||
|
||||
|
||||
<div class="clear"></div>
|
||||
</div>
|
||||
<div class="con" id="xw_box">
|
||||
<p style="text-align: right; line-height: 1.5; font-family: 仿宋, 仿宋_GB2312; font-size: 20px;">国卫医函〔2020〕42号 <br>
|
||||
</p>
|
||||
<p style="line-height: 1.5; font-family: 仿宋, 仿宋_GB2312; font-size: 20px;"> </p>
|
||||
<p style="text-align: justify; line-height: 1.5; font-family: 仿宋, 仿宋_GB2312; font-size: 20px;">各省、自治区、直辖市人民政府,新疆生产建设兵团,国务院应对新型冠状病毒肺炎疫情联防联控机制成员:<br>
|
||||
现决定将“新型冠状病毒感染的肺炎”暂命名为“新型冠状病毒肺炎”,简称“新冠肺炎”;英文名称为“Novel Coronavirus Pneumonia”,简称“NCP”。<br>
|
||||
</p>
|
||||
<p style="text-align: justify; line-height: 1.5; font-family: 仿宋, 仿宋_GB2312; font-size: 20px;"> </p>
|
||||
<p style="text-align: justify; line-height: 1.5; font-family: 仿宋, 仿宋_GB2312; font-size: 20px;"> </p>
|
||||
<p style="text-align: right; line-height: 1.5; font-family: 仿宋, 仿宋_GB2312; font-size: 20px;">国家卫生健康委</p>
|
||||
<p style="text-align: right; line-height: 1.5; font-family: 仿宋, 仿宋_GB2312; font-size: 20px;">2020年2月7日</p>
|
||||
<p style="text-align: justify; line-height: 1.5; font-family: 仿宋, 仿宋_GB2312; font-size: 20px;"> </p>
|
||||
<p style="text-align: justify; line-height: 1.5; font-family: 仿宋, 仿宋_GB2312; font-size: 20px;"> (信息公开形式:主动公开)</p>
|
||||
<div class="fx fr" style="font-size: 20px;">
|
||||
<div class="bdsharebuttonbox fr bdshare-button-style0-16" data-bd-bind="1649527526839" style="font-size: 20px;">
|
||||
<a href="#" class="bds_more" data-cmd="more"></a><a href="#" class="bds_qzone" data-cmd="qzone" title="分享到QQ空间"></a><a href="#" class="bds_tsina" data-cmd="tsina" title="分享到新浪微博"></a><a href="#" class="bds_tqq" data-cmd="tqq" title="分享到腾讯微博"></a><a href="#" class="bds_renren" data-cmd="renren" title="分享到人人网"></a><a href="#" class="bds_weixin" data-cmd="weixin" title="分享到微信"></a>
|
||||
</div>
|
||||
<script data-savepage-type="" type="text/plain"></script>
|
||||
分享到
|
||||
</div>
|
||||
<div class="clear" style="font-size: 20px;"></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<div class="ttbg">
|
||||
<div class="w1200 bottom">
|
||||
<div class="dzjg fl"><a href="http://bszs.conac.cn/sitename?method=show&id=095F1802D253655FE053022819AC11D8" target="_blank"><img data-savepage-currentsrc="http://www.nhc.gov.cn/xcs/xhtml/images/logo_foot.png" data-savepage-src="/xcs/xhtml/images/logo_foot.png" src="" width="46" height=""></a></div>
|
||||
<p>地址:北京市西城区西直门外南路1号 邮编:100044 电话:010-68792114 ICP备案编号:京ICP备18052910号 京公网安备 11010202000005号</p>
|
||||
<p>中华人民共和国国家卫生健康委员会 版权所有 技术支持:国家卫生健康委员会统计信息中心 网站标识码:bm24000006</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<span class="cleanslate TridactylStatusIndicator TridactylModenormal">normal</span></body></html>
|
53
qinghai/README.md
Normal file
53
qinghai/README.md
Normal file
|
@ -0,0 +1,53 @@
|
|||
## Qinghai scraping
|
||||
|
||||
A few links don't exist anymore. They have the indexes 210, 453, 681,
|
||||
703, 791, 871, 913, 914, 915 in `links.csv`.
|
||||
|
||||
There are a few small files again, mostly PDF links:
|
||||
|
||||
```console
|
||||
.rw-r--r-- 101 tlater 9 Apr 19:26 ./2016-09-28_923.txt
|
||||
.rw-r--r-- 133 tlater 9 Apr 19:26 ./2016-09-28_924.txt
|
||||
.rw-r--r-- 116 tlater 9 Apr 19:26 ./2016-09-28_925.txt
|
||||
.rw-r--r-- 147 tlater 9 Apr 19:26 ./2016-09-28_926.txt
|
||||
.rw-r--r-- 111 tlater 9 Apr 19:23 ./2017-03-16_838.txt
|
||||
.rw-r--r-- 36 tlater 9 Apr 19:20 ./2017-07-07_745.txt
|
||||
.rw-r--r-- 82 tlater 9 Apr 19:17 ./2017-08-14_723.txt
|
||||
.rw-r--r-- 211 tlater 9 Apr 19:17 ./2017-09-12_704.txt
|
||||
.rw-r--r-- 97 tlater 9 Apr 19:14 ./2017-11-15_587.txt
|
||||
.rw-r--r-- 156 tlater 9 Apr 19:13 ./2017-11-20_580.txt
|
||||
.rw-r--r-- 283 tlater 9 Apr 19:13 ./2017-11-23_575.txt
|
||||
.rw-r--r-- 39 tlater 9 Apr 19:13 ./2017-12-29_566.txt
|
||||
.rw-r--r-- 39 tlater 9 Apr 19:13 ./2018-01-12_561.txt
|
||||
.rw-r--r-- 165 tlater 9 Apr 19:12 ./2018-05-30_505.txt
|
||||
.rw-r--r-- 145 tlater 9 Apr 19:12 ./2018-05-30_507.txt
|
||||
.rw-r--r-- 391 tlater 9 Apr 19:11 ./2018-07-25_475.txt
|
||||
.rw-r--r-- 158 tlater 9 Apr 19:11 ./2018-09-13_467.txt
|
||||
.rw-r--r-- 204 tlater 9 Apr 19:04 ./2020-03-09_254.txt
|
||||
.rw-r--r-- 124 tlater 9 Apr 19:04 ./2020-03-18_248.txt
|
||||
.rw-r--r-- 228 tlater 9 Apr 19:04 ./2020-03-20_245.txt
|
||||
.rw-r--r-- 186 tlater 9 Apr 19:03 ./2020-04-01_221.txt
|
||||
.rw-r--r-- 67 tlater 9 Apr 19:02 ./2020-04-21_208.txt
|
||||
.rw-r--r-- 174 tlater 9 Apr 19:01 ./2020-04-30_194.txt
|
||||
.rw-r--r-- 147 tlater 9 Apr 19:01 ./2020-05-08_186.txt
|
||||
.rw-r--r-- 189 tlater 9 Apr 19:01 ./2020-05-12_182.txt
|
||||
.rw-r--r-- 82 tlater 9 Apr 19:01 ./2020-05-15_180.txt
|
||||
.rw-r--r-- 119 tlater 9 Apr 19:00 ./2020-06-04_139.txt
|
||||
.rw-r--r-- 201 tlater 9 Apr 19:00 ./2020-07-01_114.txt
|
||||
.rw-r--r-- 113 tlater 9 Apr 18:59 ./2020-07-20_90.txt
|
||||
.rw-r--r-- 115 tlater 9 Apr 18:59 ./2020-07-21_86.txt
|
||||
.rw-r--r-- 99 tlater 9 Apr 18:58 ./2020-08-27_36.txt
|
||||
.rw-r--r-- 99 tlater 9 Apr 18:58 ./2020-08-27_37.txt
|
||||
.rw-r--r-- 130 tlater 9 Apr 18:58 ./2020-08-27_38.txt
|
||||
.rw-r--r-- 130 tlater 9 Apr 18:58 ./2020-08-27_39.txt
|
||||
.rw-r--r-- 190 tlater 9 Apr 18:58 ./2020-08-27_40.txt
|
||||
.rw-r--r-- 190 tlater 9 Apr 18:58 ./2020-08-27_41.txt
|
||||
.rw-r--r-- 184 tlater 9 Apr 18:58 ./2020-08-27_42.txt
|
||||
.rw-r--r-- 184 tlater 9 Apr 18:58 ./2020-08-27_43.txt
|
||||
.rw-r--r-- 127 tlater 9 Apr 18:58 ./2020-08-27_44.txt
|
||||
.rw-r--r-- 127 tlater 9 Apr 18:58 ./2020-08-27_45.txt
|
||||
.rw-r--r-- 94 tlater 9 Apr 18:58 ./2020-08-27_46.txt
|
||||
.rw-r--r-- 94 tlater 9 Apr 18:58 ./2020-08-27_47.txt
|
||||
.rw-r--r-- 88 tlater 9 Apr 18:58 ./2020-09-12_20.txt
|
||||
.rw-r--r-- 200 tlater 9 Apr 18:57 ./2020-09-21_11.txt
|
||||
```
|
24
qinghai/scrape-iframe.py
Normal file
24
qinghai/scrape-iframe.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
"""Script to scrape contents from a specific article.
|
||||
|
||||
This is for
|
||||
http://www.nhc.gov.cn/xcs/zhengcwj/202002/18c1bb43965a4492907957875de02ae7.shtml.
|
||||
|
||||
For whatever reason, this article is implemented as an iframe, so
|
||||
requires downloading with a full-featured browser. It's just one
|
||||
though, so let's parse it.
|
||||
"""
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def main():
    """Extract the article text from the browser-saved iframe page.

    Reads ``2020-02-08_275.html`` from the current working directory,
    takes the text of the first element carrying the ``w1024`` class,
    strips surrounding whitespace and writes the result to
    ``articles-qinghai/2020-02-08_275.txt``.

    Raises:
        RuntimeError: If the page has no element with class ``w1024``.
        OSError: If the input cannot be read or the output cannot be
            written (e.g. the ``articles-qinghai`` directory is missing).
    """
    # The saved page declares charset=utf-8 and contains Chinese text, so
    # be explicit instead of relying on the platform's default encoding.
    with open("2020-02-08_275.html", "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    # The "w1024" container wraps the breadcrumb, title, metadata and body
    # of the article in the saved page.
    container = soup.find(class_="w1024")
    if container is None:
        raise RuntimeError(
            'No element with class "w1024" found in 2020-02-08_275.html'
        )
    text = container.get_text().strip()

    # Plain "w" is enough: the file is only written, never read back.
    with open(
        "articles-qinghai/2020-02-08_275.txt", "w", encoding="utf-8"
    ) as f:
        f.write(text)


# Only scrape when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
@ -35,8 +35,11 @@ def main():
|
|||
writer.writerow(i, link[0], link[1])
|
||||
|
||||
for i, link in enumerate(links):
|
||||
# Broken link
|
||||
if i == 210:
|
||||
# Broken links
|
||||
#
|
||||
# 275 was available as an iframe, and is parsed separately in
|
||||
# scrape-iframe.py
|
||||
if i in (210, 275, 453, 681, 703, 791, 871, 913, 914, 915):
|
||||
continue
|
||||
|
||||
print(f"Downloading {link[0]} ({i}/{len(links)})")
|
||||
|
|
Loading…
Reference in a new issue