Implement scraping for ningxia
parent 65bf00f452
commit 5c557bdb9d
32  ningxia/scrape-articles.py  Normal file
@@ -0,0 +1,32 @@
r"""Script to scrape article text from http://wsjkw.nx.gov.cn.
|
||||||
|
|
||||||
|
Article contents are in a div with the class `xl-content`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from ..utils.linkutils import read_links
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
"""Collect and output article text."""
|
||||||
|
with open("articles-ningxia/links.txt", "r") as f:
|
||||||
|
links = read_links(f)
|
||||||
|
|
||||||
|
for i, link in enumerate(links):
|
||||||
|
print(f"Downloading {link.url} ({i}/{len(links)})")
|
||||||
|
text = get_article_text(link.url)
|
||||||
|
with open(f"articles-ningxia/{link.date}_{i}.txt", "w+") as f:
|
||||||
|
f.write(text)
|
||||||
|
|
||||||
|
|
||||||
|
def get_article_text(link: str) -> str:
|
||||||
|
"""Download article text."""
|
||||||
|
request = requests.get(link)
|
||||||
|
soup = BeautifulSoup(request.text, "html.parser")
|
||||||
|
return soup.find(class_="xl-content").get_text().strip()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
58  ningxia/scrape-links.py  Normal file
@@ -0,0 +1,58 @@
r"""Script to scrape article links from http://wsjkw.nx.gov.cn.
|
||||||
|
|
||||||
|
Links are available from pages
|
||||||
|
http://wsjkw.nx.gov.cn/xwzx_279/tzgg/index(\d+_)?.html.
|
||||||
|
|
||||||
|
The page structure is a bit difficult, it contains sub-lists of a big
|
||||||
|
list that have separating borders every few elements.
|
||||||
|
|
||||||
|
Something like div.gl-list li > a
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
|
||||||
|
from utils.linkutils import Link, dump_links
|
||||||
|
|
||||||
|
PAGE_START = 8
|
||||||
|
PAGE_END = 44
|
||||||
|
PAGE_BASE = "http://wsjkw.nx.gov.cn/xwzx_279/tzgg/"
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
"""Collect and output article links."""
|
||||||
|
links = [
|
||||||
|
link
|
||||||
|
for page in range(PAGE_START - 1, PAGE_END)
|
||||||
|
for link in get_article_links(page)
|
||||||
|
]
|
||||||
|
|
||||||
|
with open("articles-ningxia/links.txt", "w+") as f:
|
||||||
|
dump_links(links, f)
|
||||||
|
|
||||||
|
|
||||||
|
def get_article_links(page: int) -> List[Link]:
|
||||||
|
"""Get all (article link, date) tuples from a specific page."""
|
||||||
|
page_link = f"{PAGE_BASE}/index_{page}.html"
|
||||||
|
soup = BeautifulSoup(requests.get(page_link).text, "html.parser")
|
||||||
|
|
||||||
|
link_nodes = soup.select("div.gl-list li > a")
|
||||||
|
date_nodes = soup.select("div.gl-list li > span")
|
||||||
|
|
||||||
|
def parse_link(tag: Tag) -> str:
|
||||||
|
link: str = tag.get("href")
|
||||||
|
if link.startswith("./"):
|
||||||
|
link = PAGE_BASE + link[2:]
|
||||||
|
|
||||||
|
return link
|
||||||
|
|
||||||
|
return [
|
||||||
|
Link(parse_link(link), date.get_text()[1:-1])
|
||||||
|
for link, date in zip(link_nodes, date_nodes)
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
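Both scripts import from utils.linkutils, which is not part of this commit. A minimal sketch of the interface they appear to assume: the field names follow the link.url / link.date accesses in scrape-articles.py and the Link(url, date) constructor call in scrape-links.py, while the one-link-per-line file format is only a guess, not the actual module:

from typing import List, NamedTuple, TextIO


class Link(NamedTuple):
    url: str
    date: str


def dump_links(links: List[Link], f: TextIO) -> None:
    # Hypothetical format: one "url date" pair per line.
    for link in links:
        f.write(f"{link.url} {link.date}\n")


def read_links(f: TextIO) -> List[Link]:
    # Inverse of dump_links: rebuild a Link from each non-empty line.
    return [Link(*line.split()) for line in f if line.strip()]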