scrape-yuanyuan/ningxia/scrape-articles.py

r"""Script to scrape article text from http://wsjkw.nx.gov.cn.

Article contents are in a div with the class `xl-content`.
"""

import requests
from bs4 import BeautifulSoup

from ..utils.linkutils import read_links


def main():
    """Download every article listed in ``articles-ningxia/links.txt``.

    The links file is parsed with ``read_links``. For each link the article
    text is fetched with ``get_article_text`` and written to
    ``articles-ningxia/<link.date>_<index>.txt``, where ``index`` is the
    zero-based position of the link in the list.
    """
    # NOTE(review): the links file is still read with the platform default
    # encoding, as before; confirm how it is written before pinning one.
    with open("articles-ningxia/links.txt", "r") as f:
        links = read_links(f)

    total = len(links)
    for i, link in enumerate(links):
        # Progress is displayed one-based so the last item reads N/N; the
        # output filename keeps the zero-based index.
        print(f"Downloading {link.url} ({i + 1}/{total})")
        text = get_article_text(link.url)
        out_path = f"articles-ningxia/{link.date}_{i}.txt"
        # Article text is Chinese: write UTF-8 explicitly instead of relying
        # on the platform default encoding, which may not be able to encode it.
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(text)


def get_article_text(link: str, *, timeout: float = 30.0) -> str:
    """Download an article page and return its text.

    Args:
        link: URL of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the first element with the class ``xl-content``, with
        leading and trailing whitespace stripped.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page contains no element with the class
            ``xl-content``.
    """
    # Without a timeout a stalled server would block the whole run forever.
    response = requests.get(link, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as an article.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    content = soup.find(class_="xl-content")
    if content is None:
        raise ValueError(f"No element with class 'xl-content' found at {link}")
    return content.get_text().strip()


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Implement scraping for ningxia 2022-04-09 22:34:42 +01:00			`r"""Script to scrape article text from http://wsjkw.nx.gov.cn.`

			Article contents are in a div with the class `xl-content`.
			`"""`

			`import requests`
			`from bs4 import BeautifulSoup`

			`from ..utils.linkutils import read_links`


			`def main():`
			`"""Collect and output article text."""`
			`with open("articles-ningxia/links.txt", "r") as f:`
			`links = read_links(f)`

			`for i, link in enumerate(links):`
			`print(f"Downloading {link.url} ({i}/{len(links)})")`
			`text = get_article_text(link.url)`
			`with open(f"articles-ningxia/{link.date}_{i}.txt", "w+") as f:`
			`f.write(text)`


			`def get_article_text(link: str) -> str:`
			`"""Download article text."""`
			`request = requests.get(link)`
			`soup = BeautifulSoup(request.text, "html.parser")`
			`return soup.find(class_="xl-content").get_text().strip()`


			`if __name__ == "__main__":`
			`main()`