From dcb665cde4c1a8b69df75568590d24bdcd010587 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tristan=20Dani=C3=ABl=20Maat?= <tm@tlater.net>
Date: Sat, 9 Apr 2022 16:50:15 +0100
Subject: [PATCH 1/3] Structure the project a bit better

---
 extract-urls.js => guangdong/extract-urls.js | 0
 {links => guangdong/links}/links.txt         | 0
 scrape.py => guangdong/scrape.py             | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 rename extract-urls.js => guangdong/extract-urls.js (100%)
 rename {links => guangdong/links}/links.txt (100%)
 rename scrape.py => guangdong/scrape.py (100%)

diff --git a/extract-urls.js b/guangdong/extract-urls.js
similarity index 100%
rename from extract-urls.js
rename to guangdong/extract-urls.js
diff --git a/links/links.txt b/guangdong/links/links.txt
similarity index 100%
rename from links/links.txt
rename to guangdong/links/links.txt
diff --git a/scrape.py b/guangdong/scrape.py
similarity index 100%
rename from scrape.py
rename to guangdong/scrape.py

From 60d7eec53f9c2cf0a5ccf4c895a9b6a960c2ebcc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tristan=20Dani=C3=ABl=20Maat?= <tm@tlater.net>
Date: Sat, 9 Apr 2022 17:43:37 +0100
Subject: [PATCH 2/3] Add typescript-language-server

---
 flake.nix                 |  2 ++
 guangdong/extract-urls.js | 20 ++++++++++----------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/flake.nix b/flake.nix
index d801027..5a4daa6 100644
--- a/flake.nix
+++ b/flake.nix
@@ -16,6 +16,8 @@
     in {
       devShell = pkgs.mkShell {
         nativeBuildInputs = with pkgs; [
+          nodePackages.typescript-language-server
+
           (python39.withPackages (pypkgs: with pypkgs;
             [
               beautifulsoup4
diff --git a/guangdong/extract-urls.js b/guangdong/extract-urls.js
index 7081ecb..94e2824 100644
--- a/guangdong/extract-urls.js
+++ b/guangdong/extract-urls.js
@@ -15,17 +15,17 @@
  * @param {string} fileName - The filename to give the file
  */
 function downloadString(text, fileType, fileName) {
-  var blob = new Blob([text], { type: fileType });
+    var blob = new Blob([text], { type: fileType });
 
-  var a = document.createElement('a');
-  a.download = fileName;
-  a.href = URL.createObjectURL(blob);
-  a.dataset.downloadurl = [fileType, a.download, a.href].join(':');
-  a.style.display = "none";
-  document.body.appendChild(a);
-  a.click();
-  document.body.removeChild(a);
-  setTimeout(function() { URL.revokeObjectURL(a.href); }, 1500);
+    var a = document.createElement('a');
+    a.download = fileName;
+    a.href = URL.createObjectURL(blob);
+    a.dataset.downloadurl = [fileType, a.download, a.href].join(':');
+    a.style.display = "none";
+    document.body.appendChild(a);
+    a.click();
+    document.body.removeChild(a);
+    setTimeout(function() { URL.revokeObjectURL(a.href); }, 1500);
 }
 
 /**

From 9030da9a0c34f4547070b8dae7160966fea341a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tristan=20Dani=C3=ABl=20Maat?= <tm@tlater.net>
Date: Sat, 9 Apr 2022 17:43:47 +0100
Subject: [PATCH 3/3] Add Readme

---
 Readme.md | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 Readme.md

diff --git a/Readme.md b/Readme.md
new file mode 100644
index 0000000..51ef5c8
--- /dev/null
+++ b/Readme.md
@@ -0,0 +1,23 @@
+# Province article scraping
+
+A couple of scripts to scrape article text from various provinces for
+a text analysis university course.
+
+We need:
+
+Qinghai
+: page 14-75
+
+Ningxia
+: page 11-42
+
+Shanxi
+: page 2-18
+
+Xinjiang
+: page 10-20
+
+The websites all have subtle differences, so there's simply a folder of
+scripts for each (the scripts are simple enough that there's no need
+for deduplication or anything complex). Written in python/js where
+necessary for educational purposes.