25 lines
670 B
Python
25 lines
670 B
Python
|
"""Script to scrape contents from a specific article.
|
||
|
|
||
|
This is for
|
||
|
http://www.nhc.gov.cn/xcs/zhengcwj/202002/18c1bb43965a4492907957875de02ae7.shtml.
|
||
|
|
||
|
For whatever reason, this article is implemented as an iframe, so it
requires downloading with a full-featured browser. It's just one
article, though, so let's parse it.
|
||
|
"""
|
||
|
|
||
|
from bs4 import BeautifulSoup
|
||
|
|
||
|
|
||
|
def main(
    source_path="2020-02-08_275.html",
    output_path="articles-qinghai/2020-02-08_275.txt",
):
    """Extract the article text from a saved HTML page and write it to disk.

    The text is taken from the first element whose class is ``w1024``,
    with leading and trailing whitespace stripped.

    Args:
        source_path: Path of the saved HTML file to read.
        output_path: Path of the plain-text file to write. Its parent
            directory must already exist.

    Raises:
        ValueError: If the page contains no element with class ``w1024``.
        OSError: If either file cannot be opened.
    """
    # Read raw bytes so BeautifulSoup detects the document's own encoding
    # rather than decoding with the platform default, which can mis-decode
    # or fail on non-ASCII pages.
    with open(source_path, "rb") as source:
        soup = BeautifulSoup(source.read(), "html.parser")

    container = soup.find(class_="w1024")
    if container is None:
        # find() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on None.
        raise ValueError(
            f"no element with class 'w1024' found in {source_path!r}"
        )
    text = container.get_text().strip()

    # Write UTF-8 explicitly so the output does not depend on the locale.
    with open(output_path, "w", encoding="utf-8") as out:
        out.write(text)
|
||
|
|
||
|
|
||
|
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|