Compare commits

...

3 commits

5 changed files with 35 additions and 10 deletions

23
Readme.md Normal file
View file

@ -0,0 +1,23 @@
# Province article scraping
A couple of scripts to scrape article text from various provinces for
a text analysis university course.
We need:
Qinghai
: page 14-75
Ningxia
: page 11-42
Shanxi
: page 2-18
Xinjiang
: page 10-20
The websites all have subtle differences, so each province simply has its
own folder of scripts (the scripts are simple enough that there's no need
for deduplication or anything complex). Written in Python/JS where
necessary for educational purposes.

View file

@ -16,6 +16,8 @@
in {
devShell = pkgs.mkShell {
nativeBuildInputs = with pkgs; [
nodePackages.typescript-language-server
(python39.withPackages (pypkgs:
with pypkgs; [
beautifulsoup4

View file

@ -15,17 +15,17 @@
* @param {string} fileName - The filename to give the file
*/
function downloadString(text, fileType, fileName) {
    // Wrap the text in a Blob so the browser can expose it through a
    // temporary object URL.
    var blob = new Blob([text], { type: fileType });
    var url = URL.createObjectURL(blob);

    // A hidden anchor carrying the `download` attribute makes the browser
    // save the target under `fileName` when it is clicked.
    var a = document.createElement('a');
    a.download = fileName;
    a.href = url;
    a.dataset.downloadurl = [fileType, a.download, a.href].join(':');
    a.style.display = "none";

    // The anchor is attached only for the duration of the synthetic click.
    document.body.appendChild(a);
    a.click();
    document.body.removeChild(a);

    // Release the object URL after a delay rather than immediately after
    // the click. Exactly one URL is created per call and exactly that URL
    // is revoked.
    setTimeout(function() { URL.revokeObjectURL(url); }, 1500);
}
/**