Compare commits

...

3 Commits

5 changed files with 35 additions and 10 deletions

23
Readme.md Normal file
View File

@ -0,0 +1,23 @@
# Province article scraping
A couple of scripts to scrape article text from various provinces for
a text analysis university course.
We need:
Qinghai
: pages 14-75
Ningxia
: pages 11-42
Shanxi
: pages 2-18
Xinjiang
: pages 10-20
The websites all have subtle differences, so each province simply gets
its own folder of scripts (the scripts are simple enough that there's no
need for deduplication or anything complex). Written in Python/JS where
necessary for educational purposes.

View File

@ -16,6 +16,8 @@
in { in {
devShell = pkgs.mkShell { devShell = pkgs.mkShell {
nativeBuildInputs = with pkgs; [ nativeBuildInputs = with pkgs; [
nodePackages.typescript-language-server
(python39.withPackages (pypkgs: (python39.withPackages (pypkgs:
with pypkgs; [ with pypkgs; [
beautifulsoup4 beautifulsoup4

View File

@ -15,17 +15,17 @@
* @param {string} fileName - The filename to give the file * @param {string} fileName - The filename to give the file
*/ */
function downloadString(text, fileType, fileName) {
  // Wrap the text in a Blob so the browser can expose it through a
  // temporary object URL that an anchor element can point at.
  const blob = new Blob([text], { type: fileType });
  const objectUrl = URL.createObjectURL(blob);

  // Build a hidden anchor carrying the download attribute; clicking it
  // programmatically makes the browser save the blob under `fileName`.
  const a = document.createElement('a');
  a.download = fileName;
  a.href = objectUrl;
  a.dataset.downloadurl = [fileType, a.download, a.href].join(':');
  a.style.display = "none";

  // The anchor only needs to be attached to the document for the click.
  document.body.appendChild(a);
  a.click();
  document.body.removeChild(a);

  // Release the object URL after a short delay rather than immediately,
  // so the download triggered by the click has had a chance to start.
  // The captured URL is used so the revoke does not depend on re-reading
  // the (already detached) anchor's href.
  setTimeout(() => {
    URL.revokeObjectURL(objectUrl);
  }, 1500);
}
/** /**