Implement shanxi scraping
This commit is contained in:
parent
e0a4a26990
commit
e111c1f081
2 changed files with 70 additions and 0 deletions
19
shanxi/scrape-articles.py
Normal file
19
shanxi/scrape-articles.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
"""Script to scrape article text from http://sxwjw.shaanxi.gov.cn.
|
||||
|
||||
Article contents are in a div with the class `message-box`.
|
||||
"""
|
||||
|
||||
from utils.linkutils import read_links
|
||||
from utils.scrapeutils import download_link_texts
|
||||
|
||||
|
||||
def main():
    """Collect and output article text.

    Loads the link list from ``articles-shanxi/links.csv`` through
    ``read_links`` and passes it to ``download_link_texts`` together with
    the ``message-box`` class name and the ``articles-shanxi`` directory
    name.
    """
    # The links file is only held open while read_links consumes it.
    with open("articles-shanxi/links.csv", "r") as links_file:
        article_links = read_links(links_file)

    # NOTE(review): presumably "message-box" selects the element holding the
    # article body and "articles-shanxi" is where output is written --
    # confirm against utils.scrapeutils.download_link_texts.
    download_link_texts(article_links, "message-box", "articles-shanxi")
|
||||
|
||||
|
||||
# Run the scraper only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|
Loading…
Add table
Add a link
Reference in a new issue