diff --git a/qinghai/scrape.py b/qinghai/scrape.py index 561d92d..812973a 100644 --- a/qinghai/scrape.py +++ b/qinghai/scrape.py @@ -1,7 +1,7 @@ -r"""Script to scrape article contents from https://wsjkw.qinghai.gov.cn. +r"""Script to scrape article contents from http://wsjkw.qinghai.gov.cn. Links are available from pages -https://wsjkw.qinghai.gov.cn/zwgk/xxgkml/index\d*.html. Links are in +http://wsjkw.qinghai.gov.cn/zwgk/xxgkml/index\d*.html. Links are in the second href of elements with the class `xxgk_content_title`. Dates are the first span of the same element. @@ -35,6 +35,10 @@ def main(): writer.writerow(i, link[0], link[1]) for i, link in enumerate(links): + # Broken link + if i == 210: + continue + print(f"Downloading {link[0]} ({i}/{len(links)})") text = download_article_text(link[0])