Structure the project a bit better

Tristan Daniël Maat 2022-04-09 16:50:15 +01:00
parent 4c73ace62d
commit dcb665cde4
Signed by: tlater
GPG key ID: 49670FD774E43268
3 changed files with 0 additions and 0 deletions

49
guangdong/extract-urls.js Normal file

@@ -0,0 +1,49 @@
/**
 * Extract article links from https://wsjkw.gd.gov.cn.
 *
 * Used to provide links for scrape.py, which is used to actually
 * extract page contents.
 *
 * @module extract-urls
 */

/**
 * Download a string from the current webpage.
 *
 * @param {string} text - The text to download
 * @param {string} fileType - The file type to download - text/plain is just a .txt
 * @param {string} fileName - The filename to give the file
 */
function downloadString(text, fileType, fileName) {
    var blob = new Blob([text], { type: fileType });

    var a = document.createElement('a');
    a.download = fileName;
    a.href = URL.createObjectURL(blob);
    a.dataset.downloadurl = [fileType, a.download, a.href].join(':');
    a.style.display = "none";

    document.body.appendChild(a);
    a.click();
    document.body.removeChild(a);

    setTimeout(function() { URL.revokeObjectURL(a.href); }, 1500);
}

/**
 * Gather download links from https://wsjkw.gd.gov.cn.
 *
 * The actual page scraping is done in python, but these links are
 * only available once the webpage has fully executed its own JS, so
 * we grab them all with devtools in a little script.
 */
function gatherLinks() {
    let links = [];

    for (const link of document.getElementsByClassName("article-list__item-title")) {
        links.push(link.children[0].getAttributeNode("href").value);
    }

    return links;
}

// Actually execute the download
downloadString(gatherLinks().join("\n") + "\n", "text/plain", "links-" + document.getElementById("select-page").selectedIndex + ".txt");
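
Note: the snippet above writes one links-<pageIndex>.txt file per results page, while scrape.py below reads a single links/links.txt. The merge step is not part of this commit; a minimal Python sketch of what it could look like, with the file names, glob pattern, and sort order as assumptions, is:

    # Hypothetical helper, not part of the commit: merge the per-page
    # link dumps produced by extract-urls.js into the file scrape.py reads.
    from pathlib import Path

    # Sort links-0.txt, links-1.txt, ... by their page index (assumed naming).
    pages = sorted(Path(".").glob("links-*.txt"),
                   key=lambda p: int(p.stem.split("-")[1]))

    with open("links/links.txt", "w") as out:
        for page in pages:
            out.write(page.read_text())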

1751
guangdong/links/links.txt Normal file

File diff suppressed because it is too large

61
guangdong/scrape.py Normal file

@@ -0,0 +1,61 @@
"""Script to scrape article contents from https://wsjkw.gd.gov.cn.
Links aren't conveniently available, since the page that lists them is
rendered entirely using JS, so we need to run a fully-fledged JS
browser to get them.
We use `extract-urls.js` to extract the links beforehand, and dump
them to a file, which we can extract here.
"""
import re
from typing import Tuple
import requests
from bs4 import BeautifulSoup
MATCH_DATE = re.compile(r"\d\d\d\d-\d\d-\d\d")
def main():
"""Read all links from the set of links and dump their articles to files."""
with open("links/links.txt") as f:
links = f.readlines()
for i, link in enumerate(links):
print(f"Downloading {link.rstrip()} ({i}/{len(links)})")
# The links aren't formatted correctly, we need to prefix
# them with `http`
link = f"http:{link}"
date, text = download_link(link)
with open(f"articles-guangdong/{i}_{date}.txt", mode="w+") as f:
f.write(text)
def download_link(link: str) -> Tuple[str, str]:
"""Download a link."""
download = requests.get(link)
return extract_article(download.text)
def extract_article(website: str) -> Tuple[str, str]:
"""Extract an article."""
soup = BeautifulSoup(website, "html.parser")
date = soup.find(class_="date-row")
if date:
match = re.search(MATCH_DATE, date.get_text())
if match:
date = match.group(0)
else:
date = "unknown"
else:
date = "unknown"
text = soup.find(class_="article-content").get_text().strip()
return date, text
if __name__ == "__main__":
main()
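
For reference, a minimal usage sketch of extract_article on a hand-written HTML fragment; the fragment and the import path are assumptions for illustration, not taken from the scraped site:

    # Hypothetical check; assumes scrape.py is importable as a module named `scrape`.
    from scrape import extract_article

    sample = """
    <div class="date-row">发布日期：2022-04-09</div>
    <div class="article-content">  Example article body.  </div>
    """

    date, text = extract_article(sample)
    print(date)  # -> 2022-04-09
    print(text)  # -> Example article body.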