scrape-yuanyuan/qinghai/scrape-iframe.py

"""Script to scrape contents from a specific article.

This is for
http://www.nhc.gov.cn/xcs/zhengcwj/202002/18c1bb43965a4492907957875de02ae7.shtml.

For whatever reason, this article is implemented as an iframe, so it
has to be downloaded with a full-featured browser first. It's just one
article though, so let's parse the saved copy here.
"""

from bs4 import BeautifulSoup


def main():
    """Extract the article text from the saved HTML page and write it out.

    Reads the local file ``2020-02-08_275.html``, takes the text of the
    first element with class ``w1024``, strips surrounding whitespace and
    writes the result to ``articles-qinghai/2020-02-08_275.txt``.

    Raises:
        ValueError: If the page has no element with class ``w1024``.
        OSError: Propagated from ``open()`` when a file cannot be opened.
    """
    # Pin the encoding instead of relying on the platform default, which is
    # not UTF-8 everywhere and would make the result locale-dependent.
    # NOTE(review): assumes the saved page is UTF-8 -- confirm.
    with open("2020-02-08_275.html", "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the chained call.
    container = soup.find(class_="w1024")
    if container is None:
        raise ValueError(
            'no element with class "w1024" found in 2020-02-08_275.html'
        )
    text = container.get_text().strip()

    # The file is only written, never read back, so plain "w" is enough.
    with open(
        "articles-qinghai/2020-02-08_275.txt", "w", encoding="utf-8"
    ) as f:
        f.write(text)


# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
Handle special case 275 2022-04-09 19:31:09 +01:00			`"""Script to scrape contents from a specific article.`

			`This is for`
			`http://www.nhc.gov.cn/xcs/zhengcwj/202002/18c1bb43965a4492907957875de02ae7.shtml.`

			`For whatever reason, this article is implemented as an iframe, so`
			`requires downloading with a full-featured browser. It's just one`
			`though, so let's parse it.`
			`"""`

			`from bs4 import BeautifulSoup`


			`def main():`
			`"""Scrape html site."""`
			`with open("2020-02-08_275.html", "r") as f:`
			`soup = BeautifulSoup(f.read(), "html.parser")`
			`text = soup.find(class_="w1024").get_text().strip()`
			`with open("articles-qinghai/2020-02-08_275.txt", "w+") as f:`
			`f.write(text)`


			`if __name__ == "__main__":`
			`main()`