Compare commits
No commits in common. "3858d2a5561e23d2f5e85b4887db03b996fe35a7" and "340feaa7ed5ac65e771fb9397bbff8c915e7c14c" have entirely different histories.
3858d2a556
...
340feaa7ed
|
@ -38,7 +38,7 @@ def main():
|
||||||
print(f"Downloading {link[0]} ({i}/{len(links)})")
|
print(f"Downloading {link[0]} ({i}/{len(links)})")
|
||||||
|
|
||||||
text = download_article_text(link[0])
|
text = download_article_text(link[0])
|
||||||
with open(f"articles-qinghai/{link[1]}_{i}.txt", "w+") as f:
|
with open(f"articles-qinghai/{link[1]}_{i}.txt") as f:
|
||||||
f.write(text)
|
f.write(text)
|
||||||
|
|
||||||
|
|
||||||
|
@ -61,9 +61,7 @@ def get_article_links(page: int) -> List[Tuple[str, str]]:
|
||||||
|
|
||||||
def download_article_text(link: str) -> str:
|
def download_article_text(link: str) -> str:
|
||||||
"""Get the text of an article from its link."""
|
"""Get the text of an article from its link."""
|
||||||
request = requests.get(link)
|
soup = BeautifulSoup(requests.get(link).text, "html.parser")
|
||||||
request.encoding = "gbk" # The website responds with the wrong encoding
|
|
||||||
soup = BeautifulSoup(request.text, "html.parser")
|
|
||||||
return soup.find(class_="page_text").get_text().strip()
|
return soup.find(class_="page_text").get_text().strip()
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue