Compare commits
3 commits
4c73ace62d
...
9030da9a0c
Author | SHA1 | Date | |
---|---|---|---|
Tristan Daniël Maat | 9030da9a0c | ||
Tristan Daniël Maat | 60d7eec53f | ||
Tristan Daniël Maat | dcb665cde4 |
23
Readme.md
Normal file
23
Readme.md
Normal file
|
@ -0,0 +1,23 @@
|
|||
# Province article scraping
|
||||
|
||||
A couple of scripts to scrape article text from various provinces for
|
||||
a text analysis university course.
|
||||
|
||||
We need:
|
||||
|
||||
Qinghai
|
||||
: pages 14-75
|
||||
|
||||
Ningxia
|
||||
: pages 11-42
|
||||
|
||||
Shanxi
|
||||
: pages 2-18
|
||||
|
||||
Xinjiang
|
||||
: pages 10-20
|
||||
|
||||
The websites all have subtle differences, so there's simply a folder +
|
||||
scripts for each (the scripts are simple enough that there's no need
|
||||
for deduplication or anything complex). Written in Python/JS where
|
||||
necessary for educational purposes.
|
|
@ -16,6 +16,8 @@
|
|||
in {
|
||||
devShell = pkgs.mkShell {
|
||||
nativeBuildInputs = with pkgs; [
|
||||
nodePackages.typescript-language-server
|
||||
|
||||
(python39.withPackages (pypkgs:
|
||||
with pypkgs; [
|
||||
beautifulsoup4
|
||||
|
|
|
@ -15,17 +15,17 @@
|
|||
* @param {string} fileName - The filename to give the file
|
||||
*/
|
||||
function downloadString(text, fileType, fileName) {
  // Offers `text` to the user as a file download named `fileName`: the text
  // is wrapped in a Blob of MIME type `fileType`, exposed through an object
  // URL, and a temporary hidden <a download> link pointing at it is clicked.
  const blob = new Blob([text], { type: fileType });

  // Keep the URL in a local so the deferred revoke below releases exactly
  // the URL created here, independent of what happens to the anchor later.
  const objectUrl = URL.createObjectURL(blob);

  const a = document.createElement('a');
  a.download = fileName;
  a.href = objectUrl;
  a.dataset.downloadurl = [fileType, a.download, a.href].join(':');
  a.style.display = "none";

  // The link is only attached to the document for the duration of the click.
  document.body.appendChild(a);
  a.click();
  document.body.removeChild(a);

  // Revoke after a delay rather than immediately, so the object URL is still
  // valid while the download triggered by click() gets started.
  setTimeout(() => { URL.revokeObjectURL(objectUrl); }, 1500);
}
|
||||
|
||||
/**
|
Loading…
Reference in a new issue