r"""
Script to scrape article text from http://wsjkw.nx.gov.cn.

Article contents are in a div with the class `xl-content`.
"""

import requests
from bs4 import BeautifulSoup

from ..utils.linkutils import read_links


def main():
    """Collect and output article text.

    Reads the link list from ``articles-ningxia/links.txt`` via
    ``read_links`` and, for every link, downloads the article and writes
    its text to ``articles-ningxia/<date>_<index>.txt``, where ``<index>``
    is the zero-based position of the link in the list.
    """
    # Explicit UTF-8 so behavior does not depend on the platform locale.
    with open("articles-ningxia/links.txt", "r", encoding="utf-8") as links_file:
        links = read_links(links_file)

    total = len(links)
    for i, link in enumerate(links):
        # Progress is shown one-based (1/N .. N/N); the file name keeps the
        # zero-based index so previously generated names stay the same.
        print(f"Downloading {link.url} ({i + 1}/{total})")
        text = get_article_text(link.url)
        # The article text is Chinese, so the output encoding must be able
        # to represent it regardless of the system default.
        with open(
            f"articles-ningxia/{link.date}_{i}.txt", "w", encoding="utf-8"
        ) as article_file:
            article_file.write(text)


def get_article_text(link: str, *, timeout: float = 30.0) -> str:
    """Download article text.

    Args:
        link: URL of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the first element with class ``xl-content``, with
        leading and trailing whitespace stripped.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page has no element with class ``xl-content``.
    """
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(link, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as an article.
    response.raise_for_status()

    # When the Content-Type header carries no charset, requests falls back to
    # ISO-8859-1 for text responses, which garbles Chinese text. In that case
    # let requests detect the encoding from the body instead.
    if response.encoding and response.encoding.lower() == "iso-8859-1":
        response.encoding = response.apparent_encoding

    soup = BeautifulSoup(response.text, "html.parser")
    content = soup.find(class_="xl-content")
    if content is None:
        raise ValueError(f"No element with class 'xl-content' found at {link}")
    return content.get_text().strip()


if __name__ == "__main__":
    main()