2022-04-09 14:44:18 +01:00
|
|
|
/**
 * Extract article links from https://wsjkw.gd.gov.cn.
 *
 * Provides the links consumed by scrape.py, which does the actual
 * extraction of page contents.
 *
 * @module extract-urls
 */
|
|
|
|
|
|
|
|
/**
 * Download a string from the current webpage as a file.
 *
 * Wraps the text in a Blob, points a temporary hidden anchor at an object
 * URL for that Blob, and clicks the anchor programmatically so the browser
 * saves it under the requested name.
 *
 * @param {string} text - The text to download
 * @param {string} fileType - The file type to download - text/plain is just a .txt
 * @param {string} fileName - The filename to give the file
 */
function downloadString(text, fileType, fileName) {
  const blob = new Blob([text], { type: fileType });
  // Keep the URL in a local so the delayed cleanup revokes exactly the URL
  // that was created, independent of the anchor element.
  const objectUrl = URL.createObjectURL(blob);

  const a = document.createElement('a');
  a.download = fileName;
  a.href = objectUrl;
  a.dataset.downloadurl = [fileType, a.download, a.href].join(':');
  a.style.display = "none";
  document.body.appendChild(a);

  try {
    a.click();
  } finally {
    // Clean up even if click() throws, so no hidden anchor or object URL
    // is left behind.
    document.body.removeChild(a);
    // Revoke after a delay rather than immediately, leaving the browser
    // time to start the download from the URL.
    setTimeout(() => { URL.revokeObjectURL(objectUrl); }, 1500);
  }
}
|
|
|
|
|
|
|
|
/**
 * Gather download links from https://wsjkw.gd.gov.cn.
 *
 * The actual page scraping is done in python, but these links are
 * only available once the webpage has fully executed its own JS, so
 * we grab them all with devtools in a little script.
 *
 * Each element with class "article-list__item-title" contributes the
 * "href" attribute of its first child element. Titles that have no child
 * element, or whose first child has no "href" attribute, are skipped
 * rather than aborting the whole run.
 *
 * @returns {string[]} The collected href attribute values, in the order
 *   the title elements are returned by the document
 */
function gatherLinks() {
  const links = [];

  for (const title of document.getElementsByClassName("article-list__item-title")) {
    const anchor = title.children[0];
    // getAttribute yields null when the attribute is absent.
    const href = anchor ? anchor.getAttribute("href") : null;
    if (href === null) {
      continue;
    }
    links.push(href);
  }

  return links;
}
|
|
|
|
|
|
|
|
// Actually execute the download
|
|
|
|
// Write the gathered links, one per line with a trailing newline, to a text
// file whose name carries the selectedIndex of the "select-page" element.
// Scoped in a block so the temporaries do not become top-level bindings.
{
  const linkLines = `${gatherLinks().join("\n")}\n`;
  const pageIndex = document.getElementById("select-page").selectedIndex;
  downloadString(linkLines, "text/plain", `links-${pageIndex}.txt`);
}
|