scrape-yuanyuan/guangdong/extract-urls.js

/**
* Extract article links from https://wsjkw.gd.gov.cn.
*
* Used to provide links for scrape.py, which does the actual
* extraction of page contents.
*
* @module extract-urls
*/
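
// Usage sketch: open an article-list page on https://wsjkw.gd.gov.cn in a
// browser, wait for the list to finish rendering, then paste this script
// into the devtools console; it downloads a links-N.txt file for the page
// currently selected in the pager.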
/**
* Download a string from the current webpage.
*
* @param {string} text - The text to download
* @param {string} fileType - The MIME type of the download; text/plain yields a plain .txt file
* @param {string} fileName - The filename to save the download as
*/
function downloadString(text, fileType, fileName) {
  const blob = new Blob([text], { type: fileType });
  const a = document.createElement("a");
  a.download = fileName;
  a.href = URL.createObjectURL(blob);
  a.dataset.downloadurl = [fileType, a.download, a.href].join(":");
  a.style.display = "none";
  document.body.appendChild(a);
  a.click();
  document.body.removeChild(a);
  setTimeout(function () { URL.revokeObjectURL(a.href); }, 1500);
}
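
// A minimal usage sketch for downloadString with hypothetical values,
// runnable from any page's devtools console:
//
//   downloadString("line one\nline two\n", "text/plain", "example.txt");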
/**
* Gather download links from https://wsjkw.gd.gov.cn.
*
* The actual page scraping is done in Python, but these links are
* only available once the webpage has fully executed its own JS, so
* we grab them all with a little script run in the devtools console.
*/
function gatherLinks() {
  const links = [];
  for (const link of document.getElementsByClassName("article-list__item-title")) {
    links.push(link.children[0].getAttributeNode("href").value);
  }
  return links;
}
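
// Assumed markup, inferred from the selector above rather than verified
// against the live site: each matched element wraps a single anchor, e.g.
//
//   <div class="article-list__item-title"><a href="/content/post_123.html">Title</a></div>
//
// (href value hypothetical), which is why link.children[0] is the <a>
// whose href we collect.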
// Actually execute the download, naming the file after the currently
// selected page index.
downloadString(
  gatherLinks().join("\n") + "\n",
  "text/plain",
  "links-" + document.getElementById("select-page").selectedIndex + ".txt"
);
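
// The pager index is baked into the filename so that re-running this script
// once per page yields links-0.txt, links-1.txt, and so on for scrape.py to
// consume. A sketch of advancing the pager between runs (assumes the site
// re-renders the list on a "change" event, which is not verified here):
//
//   const pager = document.getElementById("select-page");
//   pager.selectedIndex += 1;
//   pager.dispatchEvent(new Event("change"));
//   // ...wait for the article list to reload, then re-run this script.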