33 lines
858 B
Python
33 lines
858 B
Python
|
r"""Script to scrape article text from http://wsjkw.nx.gov.cn.

Article contents are in a div with the class `xl-content`.
"""
|
||
|
|
||
|
import requests
|
||
|
from bs4 import BeautifulSoup
|
||
|
|
||
|
from ..utils.linkutils import read_links
|
||
|
|
||
|
|
||
|
def main():
    """Collect and output article text.

    Reads the links from ``articles-ningxia/links.txt`` via ``read_links``,
    downloads the text of each linked article, and writes it to
    ``articles-ningxia/<date>_<index>.txt``, where ``<index>`` is the
    zero-based position of the link in the list.
    """
    with open("articles-ningxia/links.txt", "r") as links_file:
        links = read_links(links_file)

    total = len(links)
    for i, link in enumerate(links):
        # Show a 1-based counter so the last item reads (total/total);
        # the file name below keeps the 0-based index.
        print(f"Downloading {link.url} ({i + 1}/{total})")
        text = get_article_text(link.url)
        # Write UTF-8 explicitly: the platform default encoding may be unable
        # to represent the article text. Plain "w" suffices because the file
        # is never read back here.
        with open(
            f"articles-ningxia/{link.date}_{i}.txt", "w", encoding="utf-8"
        ) as out_file:
            out_file.write(text)
|
||
|
|
||
|
|
||
|
def get_article_text(link: str) -> str:
    """Download an article page and return its text.

    The article body is taken from the first element with the class
    ``xl-content``.

    Args:
        link: URL of the article page.

    Returns:
        The text of the article container with leading and trailing
        whitespace removed.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page has no element with the class ``xl-content``.
    """
    # A timeout keeps one unresponsive server from hanging the whole run.
    response = requests.get(link, timeout=30)
    response.raise_for_status()

    # When the server declares no charset, requests falls back to ISO-8859-1
    # for text responses, which garbles non-Latin pages. Use the encoding
    # detected from the body in that case only.
    if "charset" not in response.headers.get("Content-Type", "").lower():
        response.encoding = response.apparent_encoding

    soup = BeautifulSoup(response.text, "html.parser")
    content = soup.find(class_="xl-content")
    if content is None:
        # Fail with a message naming the page instead of an opaque
        # AttributeError on None.
        raise ValueError(f"No element with class 'xl-content' found at {link}")
    return content.get_text().strip()
|
||
|
|
||
|
|
||
|
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|