diff --git a/qinghai/scrape.py b/qinghai/scrape.py index 812973a..561d92d 100644 --- a/qinghai/scrape.py +++ b/qinghai/scrape.py @@ -1,7 +1,7 @@ -r"""Script to scrape article contents from http://wsjkw.qinghai.gov.cn. +r"""Script to scrape article contents from https://wsjkw.qinghai.gov.cn. Links are available from pages -http://wsjkw.qinghai.gov.cn/zwgk/xxgkml/index\d*.html. Links are in +https://wsjkw.qinghai.gov.cn/zwgk/xxgkml/index\d*.html. Links are in the second href of elements with the class `xxgk_content_title`. Dates are the first span of the same element. @@ -35,10 +35,6 @@ def main(): writer.writerow(i, link[0], link[1]) for i, link in enumerate(links): - # Broken link - if i == 210: - continue - print(f"Downloading {link[0]} ({i}/{len(links)})") text = download_article_text(link[0])