// scrape-yuanyuan/guangdong/extract-urls.js

/**
 * Extract article links from https://wsjkw.gd.gov.cn.
 *
 * Provides the link list consumed by scrape.py, which does the
 * actual extraction of page contents.
 *
 * @module extract-urls
 */
/**
 * Download a string as a file from the current webpage.
 *
 * @param {string} text - The text to download
 * @param {string} fileType - The MIME type of the file; text/plain yields a plain .txt
 * @param {string} fileName - The filename to give the file
 */
function downloadString(text, fileType, fileName) {
  const blob = new Blob([text], { type: fileType });
  // Build a hidden <a download> link pointing at the blob, click it to
  // trigger the browser's download prompt, then clean up.
  const a = document.createElement("a");
  a.download = fileName;
  a.href = URL.createObjectURL(blob);
  a.dataset.downloadurl = [fileType, a.download, a.href].join(":");
  a.style.display = "none";
  document.body.appendChild(a);
  a.click();
  document.body.removeChild(a);
  // Release the object URL once the download has had time to start.
  setTimeout(function () { URL.revokeObjectURL(a.href); }, 1500);
}
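// Usage sketch (run in the devtools console; "hello.txt" is just an
// illustrative filename, not part of the scraper):
//   downloadString("hello\n", "text/plain", "hello.txt");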
/**
 * Gather article links from https://wsjkw.gd.gov.cn.
 *
 * The actual page scraping is done in Python, but these links only
 * exist in the DOM once the page has finished running its own JS, so
 * we grab them all with a little devtools script.
 */
function gatherLinks() {
  const links = [];
  for (const item of document.getElementsByClassName("article-list__item-title")) {
    // Each title element wraps a single <a>; read its href attribute as
    // written in the markup.
    links.push(item.children[0].getAttributeNode("href").value);
  }
  return links;
}
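// Assumed markup per entry (inferred from the selector above, not verified
// against the live site):
//   <element class="article-list__item-title"><a href="...">article title</a></element>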
// Actually execute the download. The current index of the #select-page
// dropdown is baked into the filename (e.g. links-3.txt) so that re-running
// this snippet on each page of the article list produces distinct files.
downloadString(
  gatherLinks().join("\n") + "\n",
  "text/plain",
  "links-" + document.getElementById("select-page").selectedIndex + ".txt"
);