Compare commits
No commits in common. "9030da9a0c34f4547070b8dae7160966fea341a9" and "4c73ace62d0c65b094ec002c4a0575012a635628" have entirely different histories.
9030da9a0c
...
4c73ace62d
23
Readme.md
23
Readme.md
|
@ -1,23 +0,0 @@
|
||||||
# Province article scraping
|
|
||||||
|
|
||||||
A couple of scripts to scrape article text from various provinces for
|
|
||||||
a text analysis university course.
|
|
||||||
|
|
||||||
We need:
|
|
||||||
|
|
||||||
Qinghai
|
|
||||||
: page 14-75
|
|
||||||
|
|
||||||
Ningxia
|
|
||||||
: page 11-42
|
|
||||||
|
|
||||||
Shanxi
|
|
||||||
: page 2-18
|
|
||||||
|
|
||||||
Xinjiang
|
|
||||||
: page 10-20
|
|
||||||
|
|
||||||
The websites all have subtle differences, so there's simply a folder and
|
|
||||||
scripts for each (the scripts are simple enough that there's no need
|
|
||||||
for deduplication or anything complex). Written in Python/JS where
|
|
||||||
necessary for educational purposes.
|
|
|
@ -15,17 +15,17 @@
|
||||||
* @param {string} fileName - The filename to give the file
|
* @param {string} fileName - The filename to give the file
|
||||||
*/
|
*/
|
||||||
function downloadString(text, fileType, fileName) {
|
function downloadString(text, fileType, fileName) {
|
||||||
var blob = new Blob([text], { type: fileType });
|
var blob = new Blob([text], { type: fileType });
|
||||||
|
|
||||||
var a = document.createElement('a');
|
var a = document.createElement('a');
|
||||||
a.download = fileName;
|
a.download = fileName;
|
||||||
a.href = URL.createObjectURL(blob);
|
a.href = URL.createObjectURL(blob);
|
||||||
a.dataset.downloadurl = [fileType, a.download, a.href].join(':');
|
a.dataset.downloadurl = [fileType, a.download, a.href].join(':');
|
||||||
a.style.display = "none";
|
a.style.display = "none";
|
||||||
document.body.appendChild(a);
|
document.body.appendChild(a);
|
||||||
a.click();
|
a.click();
|
||||||
document.body.removeChild(a);
|
document.body.removeChild(a);
|
||||||
setTimeout(function() { URL.revokeObjectURL(a.href); }, 1500);
|
setTimeout(function() { URL.revokeObjectURL(a.href); }, 1500);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
|
@ -16,8 +16,6 @@
|
||||||
in {
|
in {
|
||||||
devShell = pkgs.mkShell {
|
devShell = pkgs.mkShell {
|
||||||
nativeBuildInputs = with pkgs; [
|
nativeBuildInputs = with pkgs; [
|
||||||
nodePackages.typescript-language-server
|
|
||||||
|
|
||||||
(python39.withPackages (pypkgs:
|
(python39.withPackages (pypkgs:
|
||||||
with pypkgs; [
|
with pypkgs; [
|
||||||
beautifulsoup4
|
beautifulsoup4
|
||||||
|
|
Loading…
Reference in a new issue