From e61a31154f650e73f1fa8e7a23ede718f9489980 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tristan=20Dani=C3=ABl=20Maat?= <tm@tlater.net> Date: Sat, 9 Apr 2022 18:58:40 +0100 Subject: [PATCH 1/2] Fix missing write setting on file --- qinghai/scrape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qinghai/scrape.py b/qinghai/scrape.py index db65ae4..b465a53 100644 --- a/qinghai/scrape.py +++ b/qinghai/scrape.py @@ -38,7 +38,7 @@ def main(): print(f"Downloading {link[0]} ({i}/{len(links)})") text = download_article_text(link[0]) - with open(f"articles-qinghai/{link[1]}_{i}.txt") as f: + with open(f"articles-qinghai/{link[1]}_{i}.txt", "w+") as f: f.write(text) From 3858d2a5561e23d2f5e85b4887db03b996fe35a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tristan=20Dani=C3=ABl=20Maat?= <tm@tlater.net> Date: Sat, 9 Apr 2022 18:58:58 +0100 Subject: [PATCH 2/2] Override reported text encoding for qinghai --- qinghai/scrape.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/qinghai/scrape.py b/qinghai/scrape.py index b465a53..561d92d 100644 --- a/qinghai/scrape.py +++ b/qinghai/scrape.py @@ -61,7 +61,9 @@ def get_article_links(page: int) -> List[Tuple[str, str]]: def download_article_text(link: str) -> str: """Get the text of an article from its link.""" - soup = BeautifulSoup(requests.get(link).text, "html.parser") + request = requests.get(link) + request.encoding = "gbk" # The website responds with the wrong encoding + soup = BeautifulSoup(request.text, "html.parser") return soup.find(class_="page_text").get_text().strip()