Initial commit
This commit is contained in:
commit
4c73ace62d
49
extract-urls.js
Normal file
49
extract-urls.js
Normal file
|
@ -0,0 +1,49 @@
|
|||
/**
|
||||
* Extract article links from https://wsjkw.gd.gov.cn.
|
||||
*
|
||||
* Used to provide links for scrape.py, which is used to actually
|
||||
* extract page contents.
|
||||
*
|
||||
* @module extract-urls
|
||||
*/
|
||||
|
||||
/**
 * Trigger a browser download of an in-memory string.
 *
 * The text is wrapped in a Blob, exposed through an object URL, and
 * "clicked" through a hidden anchor element that is attached to the
 * document body only for the duration of the click.
 *
 * @param {string} text - The text to download
 * @param {string} fileType - The MIME type of the download - text/plain is just a .txt
 * @param {string} fileName - The filename to give the file
 */
function downloadString(text, fileType, fileName) {
    const objectUrl = URL.createObjectURL(new Blob([text], { type: fileType }));

    const anchor = document.createElement('a');
    anchor.style.display = "none";
    anchor.download = fileName;
    anchor.href = objectUrl;
    anchor.dataset.downloadurl = [fileType, anchor.download, anchor.href].join(':');

    // The anchor only needs to be in the document while it is clicked.
    document.body.appendChild(anchor);
    anchor.click();
    document.body.removeChild(anchor);

    // Release the object URL after 1.5 s rather than immediately
    // (presumably to give the download time to start).
    setTimeout(() => { URL.revokeObjectURL(anchor.href); }, 1500);
}
|
||||
|
||||
/**
 * Gather download links from https://wsjkw.gd.gov.cn.
 *
 * The actual page scraping is done in python, but these links are
 * only available once the webpage has fully executed its own JS, so
 * we grab them all with devtools in a little script.
 *
 * For every element with the class "article-list__item-title", the
 * `href` attribute of its first child element is collected. Title
 * elements that have no child element, or whose first child has no
 * `href` attribute, are skipped instead of aborting the whole run.
 *
 * @returns {string[]} The collected `href` values, in document order
 */
function gatherLinks() {
    const links = [];

    for (const title of document.getElementsByClassName("article-list__item-title")) {
        const firstChild = title.firstElementChild;
        // getAttribute returns null when the attribute is absent.
        const href = firstChild ? firstChild.getAttribute("href") : null;

        if (href !== null) {
            links.push(href);
        }
    }

    return links;
}
|
||||
|
||||
// Actually execute the download: one link per line, saved under a
// name that includes the selected index of the "select-page" element.
// Wrapped in a function so no extra top-level names are declared.
(function () {
    const linkText = gatherLinks().join("\n") + "\n";
    const pageIndex = document.getElementById("select-page").selectedIndex;

    downloadString(linkText, "text/plain", `links-${pageIndex}.txt`);
})();
|
40
flake.lock
Normal file
40
flake.lock
Normal file
|
@ -0,0 +1,40 @@
|
|||
{
|
||||
"nodes": {
|
||||
"flake-utils": {
|
||||
"locked": {
|
||||
"lastModified": 1648297722,
|
||||
"narHash": "sha256-W+qlPsiZd8F3XkzXOzAoR+mpFqzm3ekQkJNa+PIh1BQ=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "0f8662f1319ad6abf89b3380dd2722369fc51ade",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1649117019,
|
||||
"narHash": "sha256-ID7nw/8MDgqj/cbJ0wy6AtQ9wp58hSnE6+weZwuHnso=",
|
||||
"path": "/nix/store/9dmig1pv9njj5kswvs8yvw3qp6b81zkd-source",
|
||||
"rev": "ccb90fb9e11459aeaf83cc28d5f8910816d90dd0",
|
||||
"type": "path"
|
||||
},
|
||||
"original": {
|
||||
"id": "nixpkgs",
|
||||
"type": "indirect"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"flake-utils": "flake-utils",
|
||||
"nixpkgs": "nixpkgs"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
33
flake.nix
Normal file
33
flake.nix
Normal file
|
@ -0,0 +1,33 @@
|
|||
{
  description = "Website scraping to help with stuff";

  inputs = {
    flake-utils.url = "github:numtide/flake-utils";
  };

  # `nixpkgs` is only named as an argument here; it is not declared
  # under `inputs` above.
  outputs = {
    self,
    nixpkgs,
    flake-utils,
  }:
    flake-utils.lib.eachDefaultSystem (
      system: let
        pkgs = import nixpkgs {inherit system;};

        # Python 3.9 interpreter bundled with everything the shell needs.
        pythonEnv = with pkgs;
          python39.withPackages (ps:
            with ps; [
              # Libraries
              beautifulsoup4
              requests

              # Language-server tooling
              python-lsp-server
              python-lsp-black
              pylsp-mypy
              pyls-isort
            ]);
      in {
        # NOTE(review): newer Nix releases prefer `devShells.default`;
        # the existing output name is kept unchanged here.
        devShell = pkgs.mkShell {
          nativeBuildInputs = [pythonEnv];
        };
      }
    );
}
|
1751
links/links.txt
Normal file
1751
links/links.txt
Normal file
File diff suppressed because it is too large
Load diff
61
scrape.py
Normal file
61
scrape.py
Normal file
|
@ -0,0 +1,61 @@
|
|||
"""Script to scrape article contents from https://wsjkw.gd.gov.cn.
|
||||
|
||||
Links aren't conveniently available, since the page that lists them is
|
||||
rendered entirely using JS, so we need to run a fully-fledged JS
|
||||
browser to get them.
|
||||
|
||||
We use `extract-urls.js` to extract the links beforehand and dump
them to a file, which we read here.
|
||||
"""
|
||||
import re
|
||||
from typing import Tuple
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Matches a date written as four digits, two digits, two digits,
# separated by hyphens (e.g. ``2022-04-05``).
MATCH_DATE = re.compile(r"\d{4}-\d{2}-\d{2}")
|
||||
|
||||
|
||||
def main():
    """Read all links from the set of links and dump their articles to files.

    Every non-blank line of ``links/links.txt`` is treated as one link.
    Article number ``i`` (zero-based, counting non-blank lines only) is
    written to ``articles-guangdong/{i}_{date}.txt``, where ``date`` is
    whatever `download_link` reports for that page. The output
    directory is created if it does not exist yet.
    """
    import os  # local import so this function does not rely on the file header

    with open("links/links.txt", encoding="utf-8") as links_file:
        # Drop the trailing newline of each line (it must not end up in
        # the URL) and ignore blank lines entirely.
        links = [line.strip() for line in links_file if line.strip()]

    os.makedirs("articles-guangdong", exist_ok=True)

    for i, link in enumerate(links):
        # The progress counter is 1-based; the file index stays 0-based.
        print(f"Downloading {link} ({i + 1}/{len(links)})")

        # The links aren't formatted correctly, we need to prefix
        # them with `http`
        date, text = download_link(f"http:{link}")

        # Write UTF-8 explicitly instead of relying on the platform's
        # default encoding.
        out_path = f"articles-guangdong/{i}_{date}.txt"
        with open(out_path, mode="w", encoding="utf-8") as article_file:
            article_file.write(text)
|
||||
|
||||
|
||||
def download_link(link: str) -> Tuple[str, str]:
    """Download the page at *link* and extract its article.

    Surrounding whitespace (such as a trailing newline left over from
    reading the link out of a file) is removed before the request.

    Returns:
        The ``(date, text)`` pair produced by `extract_article` for the
        downloaded page.

    Raises:
        requests.exceptions.Timeout: If the server does not respond
            within 30 seconds.
        requests.exceptions.HTTPError: If the server answers with a
            4xx/5xx status, so error pages are not parsed as articles.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    download = requests.get(link.strip(), timeout=30)
    download.raise_for_status()
    return extract_article(download.text)
|
||||
|
||||
|
||||
def extract_article(website: str) -> Tuple[str, str]:
    """Extract an article's date and body text from a page's HTML.

    Args:
        website: The HTML source of the article page.

    Returns:
        A ``(date, text)`` tuple. ``date`` is the first match of
        ``MATCH_DATE`` inside the element with class ``date-row``, or
        ``"unknown"`` when that element is missing or contains no such
        match. ``text`` is the stripped text of the element with class
        ``article-content``, or an empty string when the page has no
        such element.
    """
    soup = BeautifulSoup(website, "html.parser")

    date = "unknown"
    date_row = soup.find(class_="date-row")
    if date_row is not None:
        match = MATCH_DATE.search(date_row.get_text())
        if match:
            date = match.group(0)

    # `find` returns None when nothing matches; return empty text in
    # that case instead of failing with an AttributeError.
    content = soup.find(class_="article-content")
    text = content.get_text().strip() if content is not None else ""

    return date, text
|
||||
|
||||
|
||||
# Run the scraper only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()
|
Loading…
Reference in a new issue