From a66fbc83aaf83c61076706bd836e5b54df848d70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tristan=20Dani=C3=ABl=20Maat?= <tm@tlater.net> Date: Sat, 9 Apr 2022 19:03:52 +0100 Subject: [PATCH 1/2] Don't mistakenly refer to https links --- qinghai/scrape.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qinghai/scrape.py b/qinghai/scrape.py index 561d92d..081ed02 100644 --- a/qinghai/scrape.py +++ b/qinghai/scrape.py @@ -1,7 +1,7 @@ -r"""Script to scrape article contents from https://wsjkw.qinghai.gov.cn. +r"""Script to scrape article contents from http://wsjkw.qinghai.gov.cn. Links are available from pages -https://wsjkw.qinghai.gov.cn/zwgk/xxgkml/index\d*.html. Links are in +http://wsjkw.qinghai.gov.cn/zwgk/xxgkml/index\d*.html. Links are in the second href of elements with the class `xxgk_content_title`. Dates are the first span of the same element. From 8cb72464b45c8b059a52d629e5037405aff70e08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tristan=20Dani=C3=ABl=20Maat?= <tm@tlater.net> Date: Sat, 9 Apr 2022 19:04:20 +0100 Subject: [PATCH 2/2] Work around broken link --- qinghai/scrape.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/qinghai/scrape.py b/qinghai/scrape.py index 081ed02..812973a 100644 --- a/qinghai/scrape.py +++ b/qinghai/scrape.py @@ -35,6 +35,10 @@ def main(): writer.writerow(i, link[0], link[1]) for i, link in enumerate(links): + # Broken link + if i == 210: + continue + print(f"Downloading {link[0]} ({i}/{len(links)})") text = download_article_text(link[0])