"""Script to scrape contents from a specific article. This is for http://www.nhc.gov.cn/xcs/zhengcwj/202002/18c1bb43965a4492907957875de02ae7.shtml. For whatever reason, this article is implemented as an iframe, so requires downloading with a full-featured browser. It's just one though, so let's parse it. """ from bs4 import BeautifulSoup def main(): """Scrape html site.""" with open("2020-02-08_275.html", "r") as f: soup = BeautifulSoup(f.read(), "html.parser") text = soup.find(class_="w1024").get_text().strip() with open("articles-qinghai/2020-02-08_275.txt", "w+") as f: f.write(text) if __name__ == "__main__": main()