Initial commit
commit 4c73ace62d
extract-urls.js (new file, 49 lines)
@@ -0,0 +1,49 @@
/**
 * Extract article links from https://wsjkw.gd.gov.cn.
 *
 * Used to provide links for scrape.py, which is used to actually
 * extract page contents.
 *
 * @module extract-urls
 */

/**
 * Download a string from the current webpage.
 *
 * @param {string} text - The text to download
 * @param {string} fileType - The file type to download - text/plain is just a .txt
 * @param {string} fileName - The filename to give the file
 */
function downloadString(text, fileType, fileName) {
    var blob = new Blob([text], { type: fileType });

    var a = document.createElement('a');
    a.download = fileName;
    a.href = URL.createObjectURL(blob);
    a.dataset.downloadurl = [fileType, a.download, a.href].join(':');
    a.style.display = "none";
    document.body.appendChild(a);
    a.click();
    document.body.removeChild(a);
    setTimeout(function() { URL.revokeObjectURL(a.href); }, 1500);
}

/**
 * Gather download links from https://wsjkw.gd.gov.cn.
 *
 * The actual page scraping is done in python, but these links are
 * only available once the webpage has fully executed its own JS, so
 * we grab them all with devtools in a little script.
 */
function gatherLinks() {
    let links = [];

    for (const link of document.getElementsByClassName("article-list__item-title")) {
        links.push(link.children[0].getAttributeNode("href").value);
    }

    return links;
}

// Actually execute the download
downloadString(gatherLinks().join("\n") + "\n", "text/plain", "links-" + document.getElementById("select-page").selectedIndex + ".txt");
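How the two scripts are meant to chain together (an inference from the code above, not something this commit spells out): extract-urls.js is pasted into the browser devtools console on each page of the article list, and every run downloads a links-<page>.txt file. Those per-page files then have to be combined into the links/links.txt that scrape.py reads; a minimal sketch of that step, using a hypothetical merge helper that is not part of this commit:

# merge_links.py - hypothetical helper, not part of this commit.
# Concatenates the per-page links-N.txt files saved by extract-urls.js
# into the links/links.txt file that scrape.py reads.
from pathlib import Path

pages = sorted(Path(".").glob("links-*.txt"), key=lambda p: int(p.stem.split("-")[1]))
with open("links/links.txt", "w") as out:
    for page in pages:
        out.write(page.read_text())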
flake.lock (new file, 40 lines)
@@ -0,0 +1,40 @@
{
  "nodes": {
    "flake-utils": {
      "locked": {
        "lastModified": 1648297722,
        "narHash": "sha256-W+qlPsiZd8F3XkzXOzAoR+mpFqzm3ekQkJNa+PIh1BQ=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "0f8662f1319ad6abf89b3380dd2722369fc51ade",
        "type": "github"
      },
      "original": {
        "owner": "numtide",
        "repo": "flake-utils",
        "type": "github"
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1649117019,
        "narHash": "sha256-ID7nw/8MDgqj/cbJ0wy6AtQ9wp58hSnE6+weZwuHnso=",
        "path": "/nix/store/9dmig1pv9njj5kswvs8yvw3qp6b81zkd-source",
        "rev": "ccb90fb9e11459aeaf83cc28d5f8910816d90dd0",
        "type": "path"
      },
      "original": {
        "id": "nixpkgs",
        "type": "indirect"
      }
    },
    "root": {
      "inputs": {
        "flake-utils": "flake-utils",
        "nixpkgs": "nixpkgs"
      }
    }
  },
  "root": "root",
  "version": 7
}
flake.nix (new file, 33 lines)
@@ -0,0 +1,33 @@
{
  description = "Website scraping to help with stuff";

  inputs = {
    flake-utils.url = "github:numtide/flake-utils";
  };

  outputs = {
    self,
    nixpkgs,
    flake-utils,
  }:
    flake-utils.lib.eachDefaultSystem (
      system: let
        pkgs = import nixpkgs {inherit system;};
      in {
        devShell = pkgs.mkShell {
          nativeBuildInputs = with pkgs; [
            (python39.withPackages (pypkgs:
              with pypkgs; [
                beautifulsoup4
                requests

                python-lsp-server
                python-lsp-black
                pylsp-mypy
                pyls-isort
              ]))
          ];
        };
      }
    );
}
links/links.txt (new file, 1751 lines)
File diff suppressed because it is too large.
scrape.py (new file, 61 lines)
@@ -0,0 +1,61 @@
"""Script to scrape article contents from https://wsjkw.gd.gov.cn.

Links aren't conveniently available, since the page that lists them is
rendered entirely using JS, so we need to run a fully-fledged JS
browser to get them.

We use `extract-urls.js` to extract the links beforehand, and dump
them to a file, which we can extract here.
"""
import re
from typing import Tuple

import requests
from bs4 import BeautifulSoup

MATCH_DATE = re.compile(r"\d\d\d\d-\d\d-\d\d")


def main():
    """Read all links from the set of links and dump their articles to files."""
    with open("links/links.txt") as f:
        links = f.readlines()

    for i, link in enumerate(links):
        print(f"Downloading {link.rstrip()} ({i}/{len(links)})")

        # The links aren't formatted correctly, we need to prefix
        # them with `http`
        link = f"http:{link}"
        date, text = download_link(link)

        with open(f"articles-guangdong/{i}_{date}.txt", mode="w+") as f:
            f.write(text)


def download_link(link: str) -> Tuple[str, str]:
    """Download a link."""
    download = requests.get(link)
    return extract_article(download.text)


def extract_article(website: str) -> Tuple[str, str]:
    """Extract an article."""
    soup = BeautifulSoup(website, "html.parser")
    date = soup.find(class_="date-row")

    if date:
        match = re.search(MATCH_DATE, date.get_text())
        if match:
            date = match.group(0)
        else:
            date = "unknown"
    else:
        date = "unknown"

    text = soup.find(class_="article-content").get_text().strip()
    return date, text


if __name__ == "__main__":
    main()
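One practical note on main(): it writes each article into an articles-guangdong/ directory, but no such directory is part of this commit, so the first write would raise FileNotFoundError unless the directory is created beforehand. A minimal setup step (sketched here, not in the original script):

# Hypothetical setup step before running scrape.py: make sure the
# output directory for the article dumps exists.
from pathlib import Path

Path("articles-guangdong").mkdir(exist_ok=True)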