# scrape-yuanyuan/qinghai/scrape-iframe.py

"""Script to scrape contents from a specific article.

This is for
http://www.nhc.gov.cn/xcs/zhengcwj/202002/18c1bb43965a4492907957875de02ae7.shtml.

For whatever reason, this article is implemented as an iframe, so it
requires downloading with a full-featured browser. There is only one
such article, though, so let's parse it from a locally saved copy.
"""

from bs4 import BeautifulSoup


def main(
    input_path="2020-02-08_275.html",
    output_path="articles-qinghai/2020-02-08_275.txt",
):
    """Extract the article text from a saved HTML page and write it out.

    The page is parsed with BeautifulSoup, the text of the first element
    carrying the CSS class ``w1024`` is extracted and stripped of
    surrounding whitespace, and the result is written to ``output_path``.

    Args:
        input_path: Path of the saved HTML page to parse.
        output_path: Path of the text file to write. Its parent directory
            must already exist; it is not created here.

    Raises:
        ValueError: If the page contains no element with class ``w1024``.
    """
    # Be explicit about the encoding: the platform default is not UTF-8
    # everywhere, and the article text is non-ASCII.
    # NOTE(review): assumes the saved page is UTF-8 -- adjust if the
    # browser saved it in a different encoding.
    with open(input_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the following .get_text() call.
    container = soup.find(class_="w1024")
    if container is None:
        raise ValueError(
            f"no element with class 'w1024' found in {input_path!r}"
        )
    text = container.get_text().strip()

    # Plain "w" is enough: the file is only written, never read back.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(text)


# Run the scrape only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()