diff --git a/README.md b/README.md
index a94e6fe..4a8478f 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ We need:
 : page 11-42 (actually 8-44?)
 
 [Shanxi](http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/index.html)
-: page 2-18 (actually 2-20?)
+: page 2-18
 
 [Xinjiang](http://wjw.xinjiang.gov.cn/hfpc/zcwj4/zfxxgk_gknrz_10.shtml)
 : page 10-20
diff --git a/shanxi/scrape-articles.py b/shanxi/scrape-articles.py
deleted file mode 100644
index 0e85164..0000000
--- a/shanxi/scrape-articles.py
+++ /dev/null
@@ -1,19 +0,0 @@
-"""Script to scrape article text from http://sxwjw.shaanxi.gov.cn.
-
-Article contents are in a div with the class `message-box`.
-"""
-
-from utils.linkutils import read_links
-from utils.scrapeutils import download_link_texts
-
-
-def main():
-    """Collect and output article text."""
-    with open("articles-shanxi/links.csv", "r") as f:
-        links = read_links(f)
-
-    download_link_texts(links, "message-box", "articles-shanxi")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/shanxi/scrape-links.py b/shanxi/scrape-links.py
deleted file mode 100644
index 32a21d8..0000000
--- a/shanxi/scrape-links.py
+++ /dev/null
@@ -1,51 +0,0 @@
-r"""Script to scrape article links from http://sxwjw.shaanxi.gov.cn.
-
-Links are available from pages
-http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/index(\d+_)?.html.
-
-The page structure is almost exactly like that of ningxia. Only
-difference is the class name of the content div.
-
-Something like div.con-rt li > a
-"""
-
-from typing import List
-
-import requests
-from bs4 import BeautifulSoup
-
-from utils.linkutils import Link, absolutize_link, dump_links
-
-PAGE_START = 2
-PAGE_END = 20
-PAGE_BASE = "http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/"
-
-
-def main():
-    """Collect and output article links."""
-    links = [
-        link
-        for page in range(PAGE_START - 1, PAGE_END)
-        for link in get_article_links(page)
-    ]
-
-    with open("articles-shanxi/links.csv", "w+") as f:
-        dump_links(links, f)
-
-
-def get_article_links(page: int) -> List[Link]:
-    """Get all links from a specific page."""
-    page_link = f"{PAGE_BASE}/index_{page}.html"
-    soup = BeautifulSoup(requests.get(page_link).text, "html.parser")
-
-    link_nodes = soup.select("div.con-rt li > a")
-    date_nodes = soup.select("div.con-rt li > span")
-
-    return [
-        Link(absolutize_link(link.get("href"), PAGE_BASE), date.get_text())
-        for link, date in zip(link_nodes, date_nodes)
-    ]
-
-
-if __name__ == "__main__":
-    main()
diff --git a/utils/linkutils.py b/utils/linkutils.py
index 8763366..d572f97 100644
--- a/utils/linkutils.py
+++ b/utils/linkutils.py
@@ -23,10 +23,3 @@ def read_links(f: TextIO) -> List[Link]:
     reader = csv.reader(f)
     next(reader)  # Skip the header
     return [Link(link[1], link[2]) for link in reader]
-
-
-def absolutize_link(link: str, page_base: str) -> str:
-    """Ensure we have an absolute url."""
-    if link.startswith("./"):
-        link = page_base + link[2:]
-    return link
diff --git a/utils/scrapeutils.py b/utils/scrapeutils.py
deleted file mode 100644
index 2b68039..0000000
--- a/utils/scrapeutils.py
+++ /dev/null
@@ -1,27 +0,0 @@
-"""Utility functions for scraping."""
-from typing import List
-
-import requests
-from bs4 import BeautifulSoup
-
-from .linkutils import Link
-
-
-def download_link_texts(
-    links: List[Link], class_: str, directory: str, encoding: str = None
-):
-    """Download link texts contained HTML elements with the given class to a dir."""
-    for i, link in enumerate(links):
-        print(f"Downloading {link.url} ({i+1}/{len(links)})")
-        text = get_link_text(link.url, class_, encoding)
-        with open(f"{directory}/{link.date}_{i}.txt", "w+") as f:
-            f.write(text)
-
-
-def get_link_text(link: str, class_: str, encoding: str = None) -> str:
-    """Get the text of a div with a given classname on a webpage."""
-    request = requests.get(link)
-    if encoding:
-        request.encoding = encoding
-    soup = BeautifulSoup(request.text, "html.parser")
-    return soup.find(class_=class_).get_text().strip()
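Note (not part of the diff above): if the Shanxi link scrape ever needs to be reproduced after this removal, the sketch below shows roughly what the deleted `shanxi/scrape-links.py` did, rewritten to stand alone. It is a hypothetical reference, not the project's code: the `div.con-rt li > a` / `div.con-rt li > span` selectors are taken from the deleted script's docstring, and `urllib.parse.urljoin` is assumed as a stand-in for the removed `absolutize_link` helper.

```python
# Hypothetical reference only -- not part of this change. Roughly reproduces
# the deleted shanxi/scrape-links.py without the removed utils helpers.
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

PAGE_BASE = "http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/"


def get_article_links(page: int) -> list[tuple[str, str]]:
    """Return (url, date) pairs from one paginated index page."""
    soup = BeautifulSoup(
        requests.get(urljoin(PAGE_BASE, f"index_{page}.html")).text, "html.parser"
    )
    link_nodes = soup.select("div.con-rt li > a")  # selector from the old docstring
    date_nodes = soup.select("div.con-rt li > span")
    # urljoin resolves "./..."-style relative hrefs against PAGE_BASE, which is
    # what the deleted absolutize_link helper did, and it also leaves
    # already-absolute URLs untouched.
    return [
        (urljoin(PAGE_BASE, a.get("href")), span.get_text().strip())
        for a, span in zip(link_nodes, date_nodes)
    ]
```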