Initial commit
commit 4c73ace62d
extract-urls.js (new file, 49 lines)
@@ -0,0 +1,49 @@
/**
 * Extract article links from https://wsjkw.gd.gov.cn.
 *
 * Used to provide links for scrape.py, which is used to actually
 * extract page contents.
 *
 * @module extract-urls
 */

/**
 * Download a string from the current webpage.
 *
 * @param {string} text - The text to download
 * @param {string} fileType - The file type to download - text/plain is just a .txt
 * @param {string} fileName - The filename to give the file
 */
function downloadString(text, fileType, fileName) {
    var blob = new Blob([text], { type: fileType });

    var a = document.createElement('a');
    a.download = fileName;
    a.href = URL.createObjectURL(blob);
    a.dataset.downloadurl = [fileType, a.download, a.href].join(':');
    a.style.display = "none";
    document.body.appendChild(a);
    a.click();
    document.body.removeChild(a);
    setTimeout(function() { URL.revokeObjectURL(a.href); }, 1500);
}

/**
 * Gather download links from https://wsjkw.gd.gov.cn.
 *
 * The actual page scraping is done in python, but these links are
 * only available once the webpage has fully executed its own JS, so
 * we grab them all with devtools in a little script.
 */
function gatherLinks() {
    let links = [];

    for (const link of document.getElementsByClassName("article-list__item-title")) {
        links.push(link.children[0].getAttributeNode("href").value);
    }

    return links;
}

// Actually execute the download
downloadString(gatherLinks().join("\n") + "\n", "text/plain", "links-" + document.getElementById("select-page").selectedIndex + ".txt");
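How the two scripts are meant to chain together (an inference from the code above, not something this commit spells out): extract-urls.js is pasted into the browser devtools console on each page of the article list, and every run downloads a links-<page>.txt file. Those per-page files then have to be combined into the links/links.txt that scrape.py reads; a minimal sketch of that step, using a hypothetical merge helper that is not part of this commit:

# merge_links.py - hypothetical helper, not part of this commit.
# Concatenates the per-page links-N.txt files saved by extract-urls.js
# into the links/links.txt file that scrape.py reads.
from pathlib import Path

pages = sorted(Path(".").glob("links-*.txt"), key=lambda p: int(p.stem.split("-")[1]))
with open("links/links.txt", "w") as out:
    for page in pages:
        out.write(page.read_text())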
flake.lock (new file, 40 lines)
@@ -0,0 +1,40 @@
{
  "nodes": {
    "flake-utils": {
      "locked": {
        "lastModified": 1648297722,
        "narHash": "sha256-W+qlPsiZd8F3XkzXOzAoR+mpFqzm3ekQkJNa+PIh1BQ=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "0f8662f1319ad6abf89b3380dd2722369fc51ade",
        "type": "github"
      },
      "original": {
        "owner": "numtide",
        "repo": "flake-utils",
        "type": "github"
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1649117019,
        "narHash": "sha256-ID7nw/8MDgqj/cbJ0wy6AtQ9wp58hSnE6+weZwuHnso=",
        "path": "/nix/store/9dmig1pv9njj5kswvs8yvw3qp6b81zkd-source",
        "rev": "ccb90fb9e11459aeaf83cc28d5f8910816d90dd0",
        "type": "path"
      },
      "original": {
        "id": "nixpkgs",
        "type": "indirect"
      }
    },
    "root": {
      "inputs": {
        "flake-utils": "flake-utils",
        "nixpkgs": "nixpkgs"
      }
    }
  },
  "root": "root",
  "version": 7
}
flake.nix (new file, 33 lines)
@@ -0,0 +1,33 @@
{
  description = "Website scraping to help with stuff";

  inputs = {
    flake-utils.url = "github:numtide/flake-utils";
  };

  outputs = {
    self,
    nixpkgs,
    flake-utils,
  }:
    flake-utils.lib.eachDefaultSystem (
      system: let
        pkgs = import nixpkgs {inherit system;};
      in {
        devShell = pkgs.mkShell {
          nativeBuildInputs = with pkgs; [
            (python39.withPackages (pypkgs:
              with pypkgs; [
                beautifulsoup4
                requests

                python-lsp-server
                python-lsp-black
                pylsp-mypy
                pyls-isort
              ]))
          ];
        };
      }
    );
}
links/links.txt (new file, 1751 lines)
File diff suppressed because it is too large.
scrape.py (new file, 61 lines)
@@ -0,0 +1,61 @@
"""Script to scrape article contents from https://wsjkw.gd.gov.cn.

Links aren't conveniently available, since the page that lists them is
rendered entirely using JS, so we need to run a fully-fledged JS
browser to get them.

We use `extract-urls.js` to extract the links beforehand, and dump
them to a file, which we can extract here.
"""
import re
from typing import Tuple

import requests
from bs4 import BeautifulSoup

MATCH_DATE = re.compile(r"\d\d\d\d-\d\d-\d\d")


def main():
    """Read all links from the set of links and dump their articles to files."""
    with open("links/links.txt") as f:
        links = f.readlines()

    for i, link in enumerate(links):
        print(f"Downloading {link.rstrip()} ({i}/{len(links)})")

        # The links aren't formatted correctly, we need to prefix
        # them with `http`
        link = f"http:{link}"
        date, text = download_link(link)

        with open(f"articles-guangdong/{i}_{date}.txt", mode="w+") as f:
            f.write(text)


def download_link(link: str) -> Tuple[str, str]:
    """Download a link."""
    download = requests.get(link)
    return extract_article(download.text)


def extract_article(website: str) -> Tuple[str, str]:
    """Extract an article."""
    soup = BeautifulSoup(website, "html.parser")
    date = soup.find(class_="date-row")

    if date:
        match = re.search(MATCH_DATE, date.get_text())
        if match:
            date = match.group(0)
        else:
            date = "unknown"
    else:
        date = "unknown"

    text = soup.find(class_="article-content").get_text().strip()
    return date, text


if __name__ == "__main__":
    main()
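One practical note on main(): it writes each article into an articles-guangdong/ directory, but no such directory is part of this commit, so the first write would raise FileNotFoundError unless the directory is created beforehand. A minimal setup step (sketched here, not in the original script):

# Hypothetical setup step before running scrape.py: make sure the
# output directory for the article dumps exists.
from pathlib import Path

Path("articles-guangdong").mkdir(exist_ok=True)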