scrape-yuanyuan/guangdong/extract-urls.js

/**
 * Extract article links from https://wsjkw.gd.gov.cn.
 *
 * Used to provide links for scrape.py, which is used to actually
 * extract page contents.
 *
 * @module extract-urls
 */

/**
 * Save a string as a file by simulating a click on a temporary link.
 *
 * The text is wrapped in a Blob, exposed through an object URL, and
 * attached to a hidden `<a download>` element that is clicked
 * programmatically and then removed from the page again.
 *
 * @param {string} text - The text to download
 * @param {string} fileType - MIME type of the file, e.g. text/plain for a .txt
 * @param {string} fileName - The filename to give the file
 */
function downloadString(text, fileType, fileName) {
    const payload = new Blob([text], { type: fileType });
    const anchor = document.createElement('a');

    anchor.download = fileName;
    anchor.href = URL.createObjectURL(payload);
    // "type:name:url" descriptor stored as the data-downloadurl attribute.
    anchor.dataset.downloadurl = [fileType, anchor.download, anchor.href].join(':');
    anchor.style.display = "none";

    // The anchor only has to be in the document while it is clicked.
    const { body } = document;
    body.appendChild(anchor);
    anchor.click();
    body.removeChild(anchor);

    // Release the object URL after a short delay rather than immediately,
    // so the download started by the click is not cut off.
    setTimeout(() => URL.revokeObjectURL(anchor.href), 1500);
}

/**
 * Collect the article links shown on the current https://wsjkw.gd.gov.cn page.
 *
 * The page contents themselves are scraped in python, but the links
 * only exist after the site's own JS has run, so this is meant to be
 * executed from devtools on the rendered page.
 *
 * @returns {string[]} The raw `href` attribute value of the first child
 *     of every element with class `article-list__item-title`, in
 *     document order.
 */
function gatherLinks() {
    const titles = document.getElementsByClassName("article-list__item-title");

    // Each title element is expected to wrap its link as the first child.
    return Array.from(
        titles,
        (title) => title.children[0].getAttributeNode("href").value
    );
}

// Run the export: one link per line with a trailing newline, saved as
// links-<N>.txt where N is the selectedIndex of the #select-page element
// (presumably the site's page selector, so each page gets its own file).
downloadString(
    `${gatherLinks().join("\n")}\n`,
    "text/plain",
    `links-${document.getElementById("select-page").selectedIndex}.txt`
);
Initial commit 2022-04-09 14:44:18 +01:00			`/**`
			`* Extract article links from https://wsjkw.gd.gov.cn.`
			`*`
			`* Used to provide links for scrape.py, which is used to actually`
			`* extract page contents.`
			`*`
			`* @module extract-urls`
			`*/`

			`/**`
			`* Download a string from the current webpage.`
			`*`
			`* @param {string} text - The text to download`
			`* @param {string} fileType - The file type to download - text/plain is just a .txt`
			`* @param {string} fileName - The filename to give the file`
			`*/`
			`function downloadString(text, fileType, fileName) {`
Add typescript-language-server 2022-04-09 17:43:37 +01:00			`var blob = new Blob([text], { type: fileType });`
Initial commit 2022-04-09 14:44:18 +01:00
Add typescript-language-server 2022-04-09 17:43:37 +01:00			`var a = document.createElement('a');`
			`a.download = fileName;`
			`a.href = URL.createObjectURL(blob);`
			`a.dataset.downloadurl = [fileType, a.download, a.href].join(':');`
			`a.style.display = "none";`
			`document.body.appendChild(a);`
			`a.click();`
			`document.body.removeChild(a);`
			`setTimeout(function() { URL.revokeObjectURL(a.href); }, 1500);`
Initial commit 2022-04-09 14:44:18 +01:00			`}`

			`/**`
			`* Gather download links from https://wsjkw.gd.gov.cn.`
			`*`
			`* The actual page scraping is done in python, but these links are`
			`* only available once the webpage has fully executed its own JS, so`
			`* we grab them all with devtools in a little script.`
			`*/`
			`function gatherLinks() {`
			`let links = [];`

			`for (const link of document.getElementsByClassName("article-list__item-title")) {`
			`links.push(link.children[0].getAttributeNode("href").value);`
			`}`

			`return links;`
			`}`

			`// Actually execute the download`
			`downloadString(gatherLinks().join("\n") + "\n", "text/plain", "links-" + document.getElementById("select-page").selectedIndex + ".txt");`