Compare commits

4 commits: 8820ce1b95 ... e111c1f081

Author | SHA1 | Date
---|---|---
Tristan Daniël Maat | e111c1f081 |
Tristan Daniël Maat | e0a4a26990 |
Tristan Daniël Maat | 8c012a28b3 |
Tristan Daniël Maat | ff7b03bc2b |
```diff
@@ -12,7 +12,7 @@ We need:
 : page 11-42 (actually 8-44?)
 
 [Shanxi](http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/index.html)
-: page 2-18
+: page 2-18 (actually 2-20?)
 
 [Xinjiang](http://wjw.xinjiang.gov.cn/hfpc/zcwj4/zfxxgk_gknrz_10.shtml)
 : page 10-20
```
shanxi/scrape-articles.py (new file, 19 lines)

```python
"""Script to scrape article text from http://sxwjw.shaanxi.gov.cn.

Article contents are in a div with the class `message-box`.
"""

from utils.linkutils import read_links
from utils.scrapeutils import download_link_texts


def main():
    """Collect and output article text."""
    with open("articles-shanxi/links.csv", "r") as f:
        links = read_links(f)

    download_link_texts(links, "message-box", "articles-shanxi")


if __name__ == "__main__":
    main()
```
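The script assumes articles-shanxi/links.csv was already written by shanxi/scrape-links.py (next file). Below is a minimal sketch of what read_links expects from that file, assuming the repository root is on PYTHONPATH; the header text and the URL are placeholders, and only the column order (index, URL, date) is implied by read_links in the utils/linkutils.py hunk further down.

```python
import io

from utils.linkutils import read_links

# Hypothetical two-line sample of articles-shanxi/links.csv: a header row
# (skipped by read_links) followed by index, url, date columns.
sample = io.StringIO(
    ",url,date\n"
    "0,http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/example.html,2019-01-01\n"
)
for link in read_links(sample):
    print(link.url, link.date)
```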
shanxi/scrape-links.py (new file, 51 lines)

```python
r"""Script to scrape article links from http://sxwjw.shaanxi.gov.cn.

Links are available from pages
http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/index(\d+_)?.html.

The page structure is almost exactly like that of ningxia. Only
difference is the class name of the content div.

Something like div.con-rt li > a
"""

from typing import List

import requests
from bs4 import BeautifulSoup

from utils.linkutils import Link, absolutize_link, dump_links

PAGE_START = 2
PAGE_END = 20
PAGE_BASE = "http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/"


def main():
    """Collect and output article links."""
    links = [
        link
        for page in range(PAGE_START - 1, PAGE_END)
        for link in get_article_links(page)
    ]

    with open("articles-shanxi/links.csv", "w+") as f:
        dump_links(links, f)


def get_article_links(page: int) -> List[Link]:
    """Get all links from a specific page."""
    page_link = f"{PAGE_BASE}/index_{page}.html"
    soup = BeautifulSoup(requests.get(page_link).text, "html.parser")

    link_nodes = soup.select("div.con-rt li > a")
    date_nodes = soup.select("div.con-rt li > span")

    return [
        Link(absolutize_link(link.get("href"), PAGE_BASE), date.get_text())
        for link, date in zip(link_nodes, date_nodes)
    ]


if __name__ == "__main__":
    main()
```
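A side note on the page arithmetic, since the hunk above says "page 2-18 (actually 2-20?)": the first listing page is presumably the plain index.html linked in that hunk, so range(PAGE_START - 1, PAGE_END) requests index_1.html through index_19.html, i.e. listing pages 2 to 20. A standalone sketch, with the constants copied from the script and no network access:

```python
# Mirrors the loop in shanxi/scrape-links.py to show which listing pages
# get fetched; the doubled slash comes from the trailing "/" in PAGE_BASE
# and is kept here exactly as the script builds it.
PAGE_START = 2
PAGE_END = 20
PAGE_BASE = "http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/"

for page in range(PAGE_START - 1, PAGE_END):
    print(f"{PAGE_BASE}/index_{page}.html")
# prints .../index_1.html up to .../index_19.html
```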
utils/linkutils.py

```diff
@@ -23,3 +23,10 @@ def read_links(f: TextIO) -> List[Link]:
     reader = csv.reader(f)
     next(reader)  # Skip the header
     return [Link(link[1], link[2]) for link in reader]
+
+
+def absolutize_link(link: str, page_base: str) -> str:
+    """Ensure we have an absolute url."""
+    if link.startswith("./"):
+        link = page_base + link[2:]
+    return link
```
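A quick behavioural sketch of the new absolutize_link helper; the relative path is a made-up example of the "./..." hrefs the listing pages apparently emit:

```python
from utils.linkutils import absolutize_link

PAGE_BASE = "http://sxwjw.shaanxi.gov.cn/zfxxgk/fdzdgknr/zcwj/xzgfxwj/"

# hrefs starting with "./" get joined onto the listing page base ...
print(absolutize_link("./201901/example.html", PAGE_BASE))
# ... anything else, including already-absolute URLs, is returned unchanged.
print(absolutize_link("http://wjw.xinjiang.gov.cn/example.shtml", PAGE_BASE))
```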
utils/scrapeutils.py (new file, 27 lines)

```python
"""Utility functions for scraping."""
from typing import List

import requests
from bs4 import BeautifulSoup

from .linkutils import Link


def download_link_texts(
    links: List[Link], class_: str, directory: str, encoding: str = None
):
    """Download link texts contained HTML elements with the given class to a dir."""
    for i, link in enumerate(links):
        print(f"Downloading {link.url} ({i+1}/{len(links)})")
        text = get_link_text(link.url, class_, encoding)
        with open(f"{directory}/{link.date}_{i}.txt", "w+") as f:
            f.write(text)


def get_link_text(link: str, class_: str, encoding: str = None) -> str:
    """Get the text of a div with a given classname on a webpage."""
    request = requests.get(link)
    if encoding:
        request.encoding = encoding
    soup = BeautifulSoup(request.text, "html.parser")
    return soup.find(class_=class_).get_text().strip()
```
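Finally, a usage sketch for the two scrapeutils helpers, mirroring how shanxi/scrape-articles.py calls them. The article URL and date are placeholders, the output directory must already exist, and the encoding override is only worth passing when requests misdetects the page charset:

```python
from utils.linkutils import Link
from utils.scrapeutils import download_link_texts, get_link_text

# One hypothetical article; "message-box" is the content div class the
# Shanxi pages use, per shanxi/scrape-articles.py.
links = [Link("http://sxwjw.shaanxi.gov.cn/zfxxgk/example.html", "2019-01-01")]

# Writes articles-shanxi/2019-01-01_0.txt with the extracted text.
download_link_texts(links, "message-box", "articles-shanxi", encoding="utf-8")

# Or fetch one page's text without touching the filesystem.
text = get_link_text(links[0].url, "message-box")
```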