Initial commit

2022-04-09 14:44:18 +01:00 · 2022-04-09 14:44:18 +01:00 · 4c73ace62d
commit 4c73ace62d
5 changed files with 1934 additions and 0 deletions
--- a/extract-urls.js
+++ b/extract-urls.js
@ -0,0 +1,49 @@
+/**
+ * Extract article links from https://wsjkw.gd.gov.cn.
+ *
+ * Used to provide links for scrape.py, which is used to actually
+ * extract page contents.
+ *
+ * @module extract-urls
+ */
+
+/**
+ * Download a string from the current webpage.
+ *
+ * @param {string} text - The text to download
+ * @param {string} fileType - The file type to download - text/plain is just a .txt
+ * @param {string} fileName - The filename to give the file
+ */
+function downloadString(text, fileType, fileName) {
+  var blob = new Blob([text], { type: fileType });
+
+  var a = document.createElement('a');
+  a.download = fileName;
+  a.href = URL.createObjectURL(blob);
+  a.dataset.downloadurl = [fileType, a.download, a.href].join(':');
+  a.style.display = "none";
+  document.body.appendChild(a);
+  a.click();
+  document.body.removeChild(a);
+  setTimeout(function() { URL.revokeObjectURL(a.href); }, 1500);
+}
+
+/**
+ * Gather download links from https://wsjkw.gd.gov.cn.
+ *
+ * The actual page scraping is done in python, but these links are
+ * only available once the webpage has fully executed its own JS, so
+ * we grab them all with devtools in a little script.
+ */
+function gatherLinks() {
+    let links = [];
+
+    for (const link of document.getElementsByClassName("article-list__item-title")) {
+        links.push(link.children[0].getAttributeNode("href").value);
+    }
+
+    return links;
+}
+
+// Actually execute the download
+downloadString(gatherLinks().join("\n") + "\n", "text/plain", "links-" + document.getElementById("select-page").selectedIndex + ".txt");
--- a/flake.lock
+++ b/flake.lock
@ -0,0 +1,40 @@
+{
+  "nodes": {
+    "flake-utils": {
+      "locked": {
+        "lastModified": 1648297722,
+        "narHash": "sha256-W+qlPsiZd8F3XkzXOzAoR+mpFqzm3ekQkJNa+PIh1BQ=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "0f8662f1319ad6abf89b3380dd2722369fc51ade",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1649117019,
+        "narHash": "sha256-ID7nw/8MDgqj/cbJ0wy6AtQ9wp58hSnE6+weZwuHnso=",
+        "path": "/nix/store/9dmig1pv9njj5kswvs8yvw3qp6b81zkd-source",
+        "rev": "ccb90fb9e11459aeaf83cc28d5f8910816d90dd0",
+        "type": "path"
+      },
+      "original": {
+        "id": "nixpkgs",
+        "type": "indirect"
+      }
+    },
+    "root": {
+      "inputs": {
+        "flake-utils": "flake-utils",
+        "nixpkgs": "nixpkgs"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
--- a/flake.nix
+++ b/flake.nix
@ -0,0 +1,33 @@
+# Nix flake that provides a development shell for the scraping scripts.
+{
+  description = "Website scraping to help with stuff";
+
+  # NOTE(review): `nixpkgs` is consumed by `outputs` below but is not
+  # declared here; flake.lock records it as an "indirect" input - confirm
+  # that relying on this is intended.
+  inputs = {
+    flake-utils.url = "github:numtide/flake-utils";
+  };
+
+  outputs = {
+    self,
+    nixpkgs,
+    flake-utils,
+  }:
+    # Produce the same outputs for each of flake-utils' default systems.
+    flake-utils.lib.eachDefaultSystem (
+      system: let
+        pkgs = import nixpkgs {inherit system;};
+      in {
+        # Shell with a Python 3.9 interpreter carrying the packages below.
+        devShell = pkgs.mkShell {
+          nativeBuildInputs = with pkgs; [
+            (python39.withPackages (pypkgs:
+              with pypkgs; [
+                # Libraries imported by scrape.py
+                beautifulsoup4
+                requests
+
+                # Editor / language-server tooling
+                python-lsp-server
+                python-lsp-black
+                pylsp-mypy
+                pyls-isort
+              ]))
+          ];
+        };
+      }
+    );
+}
--- a/links/links.txt
+++ b/links/links.txt
--- a/scrape.py
+++ b/scrape.py
@ -0,0 +1,61 @@
+"""Script to scrape article contents from https://wsjkw.gd.gov.cn.
+
+Links aren't conveniently available: the page that lists them is
+rendered entirely using JS, so we need to run a fully-fledged JS
+browser to get them.
+
+We use `extract-urls.js` to extract the links beforehand and dump
+them to a file, which we then read here.
+"""
+import re
+from typing import Tuple
+
+import requests
+from bs4 import BeautifulSoup
+
+# Matches a date written as four digits, two digits and two digits
+# separated by hyphens (e.g. 2022-04-09).
+MATCH_DATE = re.compile(r"\d\d\d\d-\d\d-\d\d")
+
+
+def main():
+    """Read all links from the set of links and dump their articles to files."""
+    with open("links/links.txt") as f:
+        links = f.readlines()
+
+    for i, link in enumerate(links):
+        print(f"Downloading {link.rstrip()} ({i}/{len(links)})")
+
+        # The links aren't formatted correctly, we need to prefix
+        # them with `http`
+        link = f"http:{link}"
+        date, text = download_link(link)
+
+        with open(f"articles-guangdong/{i}_{date}.txt", mode="w+") as f:
+            f.write(text)
+
+
+def download_link(link: str) -> Tuple[str, str]:
+    """Download a link."""
+    download = requests.get(link)
+    return extract_article(download.text)
+
+
+def extract_article(website: str) -> Tuple[str, str]:
+    """Extract an article."""
+    soup = BeautifulSoup(website, "html.parser")
+    date = soup.find(class_="date-row")
+
+    if date:
+        match = re.search(MATCH_DATE, date.get_text())
+        if match:
+            date = match.group(0)
+        else:
+            date = "unknown"
+    else:
+        date = "unknown"
+
+    text = soup.find(class_="article-content").get_text().strip()
+    return date, text
+
+
+# Only start scraping when run as a script, not when imported as a module.
+if __name__ == "__main__":
+    main()