Initial commit

This commit is contained in:
Tristan Daniël Maat 2022-04-09 14:44:18 +01:00
commit 4c73ace62d
Signed by: tlater
GPG key ID: 49670FD774E43268
5 changed files with 1934 additions and 0 deletions

49
extract-urls.js Normal file
View file

@ -0,0 +1,49 @@
/**
* Extract article links from https://wsjkw.gd.gov.cn.
*
* Used to provide links for scrape.py, which is used to actually
* extract page contents.
*
* @module extract-urls
*/
/**
 * Offer a string to the user as a file download.
 *
 * The text is wrapped in a Blob and exposed through a temporary object
 * URL; a hidden anchor pointing at that URL is added to the page,
 * clicked and removed again. The object URL is revoked 1.5 seconds
 * later.
 *
 * @param {string} text - Contents of the file to download
 * @param {string} fileType - MIME type of the contents - text/plain is just a .txt
 * @param {string} fileName - Name suggested for the downloaded file
 */
function downloadString(text, fileType, fileName) {
  const payload = new Blob([text], { type: fileType });
  const anchor = document.createElement('a');

  anchor.download = fileName;
  anchor.href = URL.createObjectURL(payload);
  // Record "type:name:url" on the element itself.
  anchor.dataset.downloadurl = [fileType, anchor.download, anchor.href].join(':');
  anchor.style.display = "none";

  // The anchor only has to be attached to the document for the click.
  const { body } = document;
  body.appendChild(anchor);
  anchor.click();
  body.removeChild(anchor);

  // Release the object URL after a delay rather than immediately.
  setTimeout(() => URL.revokeObjectURL(anchor.href), 1500);
}
/**
 * Collect the article links shown on the current https://wsjkw.gd.gov.cn page.
 *
 * The links only exist once the page has finished running its own JS,
 * so they are read from the live DOM with a small devtools script; the
 * actual page scraping is done in python.
 *
 * @returns {string[]} The href attribute value of the first child of
 *   every element with class article-list__item-title, in the order the
 *   collection yields them
 */
function gatherLinks() {
  const titles = document.getElementsByClassName("article-list__item-title");
  return Array.from(
    titles,
    (title) => title.children[0].getAttributeNode("href").value,
  );
}
// Actually execute the download: one link per line, saved to a file
// named after the index selected in the "select-page" element.
{
  const linkList = gatherLinks().join("\n") + "\n";
  const pageIndex = document.getElementById("select-page").selectedIndex;
  downloadString(linkList, "text/plain", `links-${pageIndex}.txt`);
}

40
flake.lock Normal file
View file

@ -0,0 +1,40 @@
{
"nodes": {
"flake-utils": {
"locked": {
"lastModified": 1648297722,
"narHash": "sha256-W+qlPsiZd8F3XkzXOzAoR+mpFqzm3ekQkJNa+PIh1BQ=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "0f8662f1319ad6abf89b3380dd2722369fc51ade",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1649117019,
"narHash": "sha256-ID7nw/8MDgqj/cbJ0wy6AtQ9wp58hSnE6+weZwuHnso=",
"path": "/nix/store/9dmig1pv9njj5kswvs8yvw3qp6b81zkd-source",
"rev": "ccb90fb9e11459aeaf83cc28d5f8910816d90dd0",
"type": "path"
},
"original": {
"id": "nixpkgs",
"type": "indirect"
}
},
"root": {
"inputs": {
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs"
}
}
},
"root": "root",
"version": 7
}

33
flake.nix Normal file
View file

@ -0,0 +1,33 @@
# Flake providing a development shell with the Python tooling used by
# scrape.py.
{
description = "Website scraping to help with stuff";
# Only flake-utils is declared explicitly. nixpkgs is taken as an
# argument of `outputs` without being declared here; flake.lock records
# it as an indirect input with id "nixpkgs".
inputs = {
flake-utils.url = "github:numtide/flake-utils";
};
outputs = {
self,
nixpkgs,
flake-utils,
}:
# The function passed to eachDefaultSystem receives a `system` argument
# and returns the outputs built from nixpkgs imported for that system.
flake-utils.lib.eachDefaultSystem (
system: let
pkgs = import nixpkgs {inherit system;};
in {
# Development shell containing a single Python 3.9 environment.
devShell = pkgs.mkShell {
nativeBuildInputs = with pkgs; [
(python39.withPackages (pypkgs:
with pypkgs; [
# Imported by scrape.py (bs4 and requests).
beautifulsoup4
requests
# Not imported by scrape.py; presumably editor/LSP tooling.
python-lsp-server
python-lsp-black
pylsp-mypy
pyls-isort
]))
];
};
}
);
}

1751
links/links.txt Normal file

File diff suppressed because it is too large Load diff

61
scrape.py Normal file
View file

@ -0,0 +1,61 @@
"""Script to scrape article contents from https://wsjkw.gd.gov.cn.
Links aren't conveniently available, since the page that lists them is
rendered entirely using JS, so we need to run a fully-fledged JS
browser to get them.
We use `extract-urls.js` to extract the links beforehand and dump
them to a file, which we then read here.
"""
import re
from typing import Tuple
import requests
from bs4 import BeautifulSoup
# Matches a date written as four digits, two digits and two digits
# separated by hyphens (e.g. 2022-04-09). Only the shape is checked, not
# whether the month or day is in range.
MATCH_DATE = re.compile(r"\d\d\d\d-\d\d-\d\d")
def main():
    """Read all links from the set of links and dump their articles to files.

    Links are read from ``links/links.txt``, one per line; blank lines are
    ignored. Each article is written to
    ``articles-guangdong/{index}_{date}.txt``, where ``index`` is the
    zero-based position of the link among the non-blank lines. The output
    directory is created if it does not exist yet.
    """
    import os

    with open("links/links.txt", encoding="utf-8") as link_file:
        # Strip the line terminator so it does not end up inside the
        # requested URL, and skip blank lines, which would otherwise
        # turn into a request for the bare string "http:".
        links = [line.strip() for line in link_file if line.strip()]

    # Without this, the first write below fails on a fresh checkout.
    os.makedirs("articles-guangdong", exist_ok=True)

    for i, link in enumerate(links):
        print(f"Downloading {link} ({i}/{len(links)})")

        # The links aren't formatted correctly, we need to prefix
        # them with `http`
        date, text = download_link(f"http:{link}")

        # Write UTF-8 explicitly: the default encoding depends on the
        # locale and may be unable to represent the article text.
        with open(
            f"articles-guangdong/{i}_{date}.txt", mode="w+", encoding="utf-8"
        ) as article_file:
            article_file.write(text)
def download_link(link: str, timeout: float = 30.0) -> Tuple[str, str]:
    """Download the page at a link and extract its article.

    Args:
        link: URL of the article page, including the scheme.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the run forever.

    Returns:
        The ``(date, text)`` pair produced by ``extract_article`` for the
        downloaded page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    download = requests.get(link, timeout=timeout)
    # Fail loudly on 4xx/5xx responses instead of handing an error page to
    # the article extractor.
    download.raise_for_status()
    return extract_article(download.text)
def extract_article(website: str) -> Tuple[str, str]:
    """Extract the date and body text of an article from its HTML.

    Args:
        website: HTML source of the article page.

    Returns:
        A ``(date, text)`` tuple. ``date`` is the first match of
        ``MATCH_DATE`` inside the element with class ``date-row``, or
        ``"unknown"`` when that element is missing or contains no match.
        ``text`` is the whitespace-stripped text of the element with class
        ``article-content``.

    Raises:
        ValueError: If the page has no element with class
            ``article-content``.
    """
    soup = BeautifulSoup(website, "html.parser")

    # Fall back to "unknown" unless a date can actually be found.
    date = "unknown"
    date_row = soup.find(class_="date-row")
    if date_row is not None:
        match = MATCH_DATE.search(date_row.get_text())
        if match:
            date = match.group(0)

    content = soup.find(class_="article-content")
    if content is None:
        # `find` returns None when nothing matches; report that clearly
        # instead of failing with an AttributeError on `None.get_text`.
        raise ValueError("page contains no element with class 'article-content'")

    return date, content.get_text().strip()
# Run the scraper only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()