From 3858d2a5561e23d2f5e85b4887db03b996fe35a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tristan=20Dani=C3=ABl=20Maat?=
Date: Sat, 9 Apr 2022 18:58:58 +0100
Subject: [PATCH] Override reported text encoding for qinghai

---
 qinghai/scrape.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/qinghai/scrape.py b/qinghai/scrape.py
index b465a53..561d92d 100644
--- a/qinghai/scrape.py
+++ b/qinghai/scrape.py
@@ -61,7 +61,9 @@ def get_article_links(page: int) -> List[Tuple[str, str]]:
 
 def download_article_text(link: str) -> str:
     """Get the text of an article from its link."""
-    soup = BeautifulSoup(requests.get(link).text, "html.parser")
+    request = requests.get(link)
+    request.encoding = "gbk"  # The website responds with the wrong encoding
+    soup = BeautifulSoup(request.text, "html.parser")
     return soup.find(class_="page_text").get_text().strip()
 
 