scrape-yuanyuan/ningxia/scrape-articles.py

33 lines
858 B
Python
Raw Normal View History

2022-04-09 22:34:42 +01:00
r"""Script to scrape article text from http://wsjkw.nx.gov.cn.
Article contents are in a div with the class `xl-content`.
"""
import requests
from bs4 import BeautifulSoup
from ..utils.linkutils import read_links
def main():
    """Collect and output article text.

    Reads the link list from ``articles-ningxia/links.txt`` with
    ``read_links``, downloads every linked article with
    ``get_article_text`` and writes each article's text to
    ``articles-ningxia/<date>_<index>.txt``.
    """
    with open("articles-ningxia/links.txt", "r") as links_file:
        links = read_links(links_file)

    total = len(links)
    for i, link in enumerate(links):
        # The counter is zero-based and doubles as the filename suffix.
        print(f"Downloading {link.url} ({i}/{total})")
        text = get_article_text(link.url)

        # Write as UTF-8 explicitly: the platform default encoding may be
        # unable to represent the scraped text and would raise
        # UnicodeEncodeError (or silently produce locale-specific files).
        out_path = f"articles-ningxia/{link.date}_{i}.txt"
        with open(out_path, "w", encoding="utf-8") as out_file:
            out_file.write(text)
def get_article_text(link: str, timeout: float = 30.0) -> str:
    """Download article text.

    Args:
        link: URL of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the first element with class ``xl-content``, with
        leading and trailing whitespace removed.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page contains no ``xl-content`` element.
    """
    # Without a timeout a stalled server would block the whole run forever.
    response = requests.get(link, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as an article.
    response.raise_for_status()

    # When the Content-Type header carries no charset, requests decodes text
    # responses as ISO-8859-1, which garbles non-Latin pages. Fall back to
    # the encoding detected from the body in that case only.
    if "charset" not in response.headers.get("Content-Type", "").lower():
        response.encoding = response.apparent_encoding

    soup = BeautifulSoup(response.text, "html.parser")
    content = soup.find(class_="xl-content")
    if content is None:
        raise ValueError(f"No element with class 'xl-content' found at {link}")
    return content.get_text().strip()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()