Structure the project a bit better

Tristan Daniël Maat 2022-04-09 16:50:15 +01:00
parent 4c73ace62d
commit dcb665cde4
Signed by: tlater
GPG key ID: 49670FD774E43268
3 changed files with 0 additions and 0 deletions

49
guangdong/extract-urls.js Normal file

@@ -0,0 +1,49 @@
/**
 * Extract article links from https://wsjkw.gd.gov.cn.
 *
 * Used to provide links for scrape.py, which is used to actually
 * extract page contents.
 *
 * @module extract-urls
 */

/**
 * Download a string from the current webpage.
 *
 * @param {string} text - The text to download
 * @param {string} fileType - The file type to download - text/plain is just a .txt
 * @param {string} fileName - The filename to give the file
 */
function downloadString(text, fileType, fileName) {
    var blob = new Blob([text], { type: fileType });

    var a = document.createElement('a');
    a.download = fileName;
    a.href = URL.createObjectURL(blob);
    a.dataset.downloadurl = [fileType, a.download, a.href].join(':');
    a.style.display = "none";

    document.body.appendChild(a);
    a.click();
    document.body.removeChild(a);

    setTimeout(function() { URL.revokeObjectURL(a.href); }, 1500);
}

/**
 * Gather download links from https://wsjkw.gd.gov.cn.
 *
 * The actual page scraping is done in python, but these links are
 * only available once the webpage has fully executed its own JS, so
 * we grab them all with devtools in a little script.
 */
function gatherLinks() {
    let links = [];

    for (const link of document.getElementsByClassName("article-list__item-title")) {
        links.push(link.children[0].getAttributeNode("href").value);
    }

    return links;
}

// Actually execute the download
downloadString(gatherLinks().join("\n") + "\n", "text/plain", "links-" + document.getElementById("select-page").selectedIndex + ".txt");
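
Note: the snippet above writes one links-<pageIndex>.txt file per results page, while scrape.py below reads a single links/links.txt. The merge step is not part of this commit; a minimal Python sketch of what it could look like, with the file names, glob pattern, and sort order as assumptions, is:

    # Hypothetical helper, not part of the commit: merge the per-page
    # link dumps produced by extract-urls.js into the file scrape.py reads.
    from pathlib import Path

    # Sort links-0.txt, links-1.txt, ... by their page index (assumed naming).
    pages = sorted(Path(".").glob("links-*.txt"),
                   key=lambda p: int(p.stem.split("-")[1]))

    with open("links/links.txt", "w") as out:
        for page in pages:
            out.write(page.read_text())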

1751
guangdong/links/links.txt Normal file

File diff suppressed because it is too large

61
guangdong/scrape.py Normal file

@@ -0,0 +1,61 @@
"""Script to scrape article contents from https://wsjkw.gd.gov.cn.
Links aren't conveniently available, since the page that lists them is
rendered entirely using JS, so we need to run a fully-fledged JS
browser to get them.
We use `extract-urls.js` to extract the links beforehand, and dump
them to a file, which we can extract here.
"""
import re
from typing import Tuple
import requests
from bs4 import BeautifulSoup
MATCH_DATE = re.compile(r"\d\d\d\d-\d\d-\d\d")
def main():
"""Read all links from the set of links and dump their articles to files."""
with open("links/links.txt") as f:
links = f.readlines()
for i, link in enumerate(links):
print(f"Downloading {link.rstrip()} ({i}/{len(links)})")
# The links aren't formatted correctly, we need to prefix
# them with `http`
link = f"http:{link}"
date, text = download_link(link)
with open(f"articles-guangdong/{i}_{date}.txt", mode="w+") as f:
f.write(text)
def download_link(link: str) -> Tuple[str, str]:
"""Download a link."""
download = requests.get(link)
return extract_article(download.text)
def extract_article(website: str) -> Tuple[str, str]:
"""Extract an article."""
soup = BeautifulSoup(website, "html.parser")
date = soup.find(class_="date-row")
if date:
match = re.search(MATCH_DATE, date.get_text())
if match:
date = match.group(0)
else:
date = "unknown"
else:
date = "unknown"
text = soup.find(class_="article-content").get_text().strip()
return date, text
if __name__ == "__main__":
main()
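
For reference, a minimal usage sketch of extract_article on a hand-written HTML fragment; the fragment and the import path are assumptions for illustration, not taken from the scraped site:

    # Hypothetical check; assumes scrape.py is importable as a module named `scrape`.
    from scrape import extract_article

    sample = """
    <div class="date-row">发布日期：2022-04-09</div>
    <div class="article-content">  Example article body.  </div>
    """

    date, text = extract_article(sample)
    print(date)  # -> 2022-04-09
    print(text)  # -> Example article body.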