diff --git a/Readme.md b/Readme.md new file mode 100644 index 0000000..51ef5c8 --- /dev/null +++ b/Readme.md @@ -0,0 +1,23 @@ +# Province article scraping + +A couple of scripts to scrape article text from various provinces for +a text analysis university course. + +We need: + +Qinghai +: pages 14-75 + +Ningxia +: pages 11-42 + +Shanxi +: pages 2-18 + +Xinjiang +: pages 10-20 + +The websites all have subtle differences, so there's simply a folder + +scripts for each (the scripts are simple enough that there's no need +for deduplication or anything complex). Written in Python/JS where +necessary for educational purposes. diff --git a/flake.nix b/flake.nix index d801027..5a4daa6 100644 --- a/flake.nix +++ b/flake.nix @@ -16,6 +16,8 @@ in { devShell = pkgs.mkShell { nativeBuildInputs = with pkgs; [ + nodePackages.typescript-language-server + (python39.withPackages (pypkgs: with pypkgs; [ beautifulsoup4 diff --git a/extract-urls.js b/guangdong/extract-urls.js similarity index 73% rename from extract-urls.js rename to guangdong/extract-urls.js index 7081ecb..94e2824 100644 --- a/extract-urls.js +++ b/guangdong/extract-urls.js @@ -15,17 +15,17 @@ * @param {string} fileName - The filename to give the file */ function downloadString(text, fileType, fileName) { - var blob = new Blob([text], { type: fileType }); + var blob = new Blob([text], { type: fileType }); - var a = document.createElement('a'); - a.download = fileName; - a.href = URL.createObjectURL(blob); - a.dataset.downloadurl = [fileType, a.download, a.href].join(':'); - a.style.display = "none"; - document.body.appendChild(a); - a.click(); - document.body.removeChild(a); - setTimeout(function() { URL.revokeObjectURL(a.href); }, 1500); + var a = document.createElement('a'); + a.download = fileName; + a.href = URL.createObjectURL(blob); + a.dataset.downloadurl = [fileType, a.download, a.href].join(':'); + a.style.display = "none"; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + 
setTimeout(function() { URL.revokeObjectURL(a.href); }, 1500); } /** diff --git a/links/links.txt b/guangdong/links/links.txt similarity index 100% rename from links/links.txt rename to guangdong/links/links.txt diff --git a/scrape.py b/guangdong/scrape.py similarity index 100% rename from scrape.py rename to guangdong/scrape.py