scrape-yuanyuan/qinghai/scrape-iframe.py

25 lines
670 B
Python
Raw Normal View History

2022-04-09 19:31:09 +01:00
"""Script to scrape contents from a specific article.
This is for
http://www.nhc.gov.cn/xcs/zhengcwj/202002/18c1bb43965a4492907957875de02ae7.shtml.
For whatever reason, this article is embedded as an iframe, so the
page has to be downloaded with a full-featured browser first. It is
the only such article, so this one-off script parses the saved copy.
"""
from bs4 import BeautifulSoup
def main():
    """Extract the article text from the saved page and write it to disk.

    Reads ``2020-02-08_275.html`` from the current working directory,
    takes the text of the first element whose class is ``w1024``, strips
    surrounding whitespace and writes the result to
    ``articles-qinghai/2020-02-08_275.txt``. The output directory must
    already exist.

    Raises:
        ValueError: If the page contains no element with class ``w1024``.
    """
    # Read raw bytes so BeautifulSoup detects the document's own encoding
    # instead of depending on the platform's default text encoding.
    with open("2020-02-08_275.html", "rb") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    container = soup.find(class_="w1024")
    if container is None:
        # Fail with a clear message rather than an AttributeError on None.
        raise ValueError(
            'no element with class "w1024" found in 2020-02-08_275.html'
        )
    text = container.get_text().strip()

    # Write UTF-8 explicitly so the output does not vary with the locale;
    # plain "w" is enough because the file is never read back.
    with open(
        "articles-qinghai/2020-02-08_275.txt", "w", encoding="utf-8"
    ) as f:
        f.write(text)
if __name__ == "__main__":
    # Run the scrape only when executed as a script, not when imported.
    main()