/**
 * Extract article links from https://wsjkw.gd.gov.cn.
 *
 * Used to provide links for scrape.py, which is used to actually
 * extract page contents.
 *
 * @module extract-urls
 */

/**
 * Download a string from the current webpage.
 *
 * Wraps the text in a Blob, points a hidden anchor at an object URL
 * for it, clicks the anchor to trigger the browser's download, then
 * removes the anchor and (after a delay) revokes the object URL.
 *
 * @param {string} text - The text to download
 * @param {string} fileType - The MIME type to download - text/plain is just a .txt
 * @param {string} fileName - The filename to give the file
 */
function downloadString(text, fileType, fileName) {
  const blob = new Blob([text], { type: fileType });
  const a = document.createElement('a');
  a.download = fileName;
  a.href = URL.createObjectURL(blob);
  a.dataset.downloadurl = [fileType, a.download, a.href].join(':');
  a.style.display = 'none';
  document.body.appendChild(a);
  a.click();
  document.body.removeChild(a);
  // Revoke only after a delay so the browser has time to start the download.
  setTimeout(() => URL.revokeObjectURL(a.href), 1500);
}

/**
 * Gather download links from https://wsjkw.gd.gov.cn.
 *
 * The actual page scraping is done in python, but these links are
 * only available once the webpage has fully executed its own JS, so
 * we grab them all with devtools in a little script.
 *
 * @returns {string[]} The href attribute of each article-list item's
 *     first child element (null for an item whose child lacks one).
 */
function gatherLinks() {
  const links = [];
  for (const item of document.getElementsByClassName('article-list__item-title')) {
    // getAttribute returns the raw href value; unlike
    // getAttributeNode(...).value it does not throw when the
    // attribute is missing.
    links.push(item.children[0].getAttribute('href'));
  }
  return links;
}

// Actually execute the download, naming the file after the currently
// selected page index so successive pages produce distinct files.
downloadString(
  `${gatherLinks().join('\n')}\n`,
  'text/plain',
  `links-${document.getElementById('select-page').selectedIndex}.txt`,
);