// ==UserScript==
// @name Crawler base on SingleFile
// @author Mark
// @description Download site in single file automatically
// @license MIT
// @version 0.0.18
// @match https://*/*
// @run-at document-idle
// @grant GM.setValue
// @grant GM.getValue
// @grant GM.xmlHttpRequest
// @grant GM_registerMenuCommand
// @grant unsafeWindow
// @require https://update.greasyfork.org/scripts/483730/1305396/gm-fetch.js
// @require https://openuserjs.org/src/libs/sizzle/GM_config.js
// @connect *
// @noframes
// @namespace https://greasyfork.org/users/1106595
// ==/UserScript==
const REPORT_ADDRESS = "https://crawler-hit.deno.dev/api/update";
const PAGE_LOADING_TIME = 7;
const ERROR_RELOAD_TIME = 10;
const ERROR_RELOAD_LONG_TIME = 60;
const NEXT_TASK_WAITING_TIME = 10;
const NO_TASK_WAITING_TIME = 90;
const CF_CHALLENGE_WAITING_TIME = 20;
const QUICK_SLEEP_TIME = 5;
const DOMAIN_REG = /^(https?):\/\/([^\s\/?\.#]+\.?)+$/;
const TASK_MAX_RETRY_TIMES = 3;
const TIME_POINT_TYPES = {
PREPARE_START: "prepareStart",
TASK_LOADED: "taskLoaded",
TASK_REPORTED: "taskReported",
PRESIGN_INDEX: "presignIndex",
PRESIGN_SINGLEFILE: "presignSinglefile",
SINGLE_FILE_SUCCESS: "singleFileSuccess",
INDEX_FILE_UPLOADED: "indexFileUploaded",
SINGLE_FILE_UPLOADED: "singleFileUploaded",
VALIDATE_FAILED: "validateFailed",
};
let gmc = new GM_config({
id: "CrawlerConfig",
title: "Crawler setting",
fields: {
Name: {
label: "Name",
type: "text",
},
Password: {
label: "Password",
type: "text",
},
taskInterval: {
label: "Task Interval (s)",
type: "int",
default: NEXT_TASK_WAITING_TIME,
},
taskMaxRetryTimes: {
label: "Task Max Retry Times",
type: "int",
default: TASK_MAX_RETRY_TIMES,
},
preferServer: {
label: "Prefer preSign Server",
type: "text",
},
reportServer: {
label: "Report Server",
type: "text",
default: REPORT_ADDRESS,
},
},
events: {
init: function () {
// runs after initialization completes
},
save: function () {
// runs after values are saved
console.log("save", this.get("Name"), this.get("Password"));
this.close();
},
},
});
const crawlerUtil = {
addScript: (url) => {
const s = document.createElement("script");
s.src = url;
s.onerror = (evt) => {
setTimeout(() => {
addScript(url);
}, 2000);
};
document.body.append(s);
},
addScriptByText: async (url, cache = false, retry = 0) => {
const s = document.createElement("script");
s.dataset.crawler = "true";
const scriptCache = (await GM.getValue("scriptCache")) || {};
if (cache && scriptCache[url]) {
s.innerHTML = scriptCache[url];
document.body.append(s);
return true;
}
try {
const res = await GM.xmlHttpRequest({
url: url,
method: "GET",
});
const text = res.responseText;
if (cache) {
scriptCache[url] = text;
GM.setValue("scriptCache", scriptCache);
}
s.innerHTML = text;
document.body.append(s);
return true;
} catch (error) {
if (retry > 3) {
return false;
}
await sleep(2);
return await addScriptByText(url, retry + 1);
}
},
getPreSignUrl: async (doi, fileName, name, pass, preferServer = "") => {
const configServer = DOMAIN_REG.test(preferServer) ? [preferServer] : [];
const preSignSevers = configServer.concat([
"https://minio-presign.hzc.pub",
"https://minio-presign-ali.hzc.pub",
"https://chem-brain-minio.deno.dev",
]);
async function getPreSignUrlFromServer(serverIndex = 0) {
try {
return await (
await GM_fetch(
`${preSignSevers[serverIndex]}/api/presignedPutObject?doi=${doi}&file_name=${fileName}&account=${name}&pass=${pass}`
)
).json();
} catch (error) {
if (!preSignSevers[serverIndex + 1]) {
return { reload: true };
}
return await getPreSignUrlFromServer(serverIndex + 1);
}
}
const preSignRes = await getPreSignUrlFromServer();
if (preSignRes.reload) {
return "RELOAD";
}
const url = preSignRes?.url;
return url || null;
},
uploader: async (url, content) => {
const mime = "application/gzip"
const gzip_data = pako.gzip(content, { level: 9 });
const upload_blob = new Blob([gzip_data], { type: mime });
return await GM.xmlHttpRequest({
method: "PUT",
url,
headers: {
"Content-Type": mime,
"Content-Length": upload_blob.size,
},
data: upload_blob,
});
},
downloadFile: (data, fileName) => {
const a = document.createElement("a");
document.body.appendChild(a);
a.style = "display: none";
const blob = new Blob([data], {
type: "application/octet-stream",
});
const url = window.URL.createObjectURL(blob);
a.href = url;
a.download = fileName;
a.click();
window.URL.revokeObjectURL(url);
},
generateClientId: () => (1e6 * Math.random()).toString(32).replace(".", ""),
sleep: (duration) => {
return new Promise((res, rej) => {
setTimeout(() => res(), duration * 1000);
});
},
};
// main function
(function () {
"use strict";
const {
addScript,
addScriptByText,
generateClientId,
uploader,
downloadFile,
getPreSignUrl,
sleep,
} = crawlerUtil;
const dependenciesInit = async () => {
await addScriptByText(
"https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-bootstrap.js",
true
);
await addScriptByText(
"https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-hooks-frames.js",
true
);
await addScriptByText(
"https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-frames.js",
true
);
await addScriptByText(
"https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file.js",
true
);
await addScriptByText(
"https://cdn.jsdelivr.net/gh/IKKEM-Lin/crawler-base-on-singlefile/config.js"
);
await addScriptByText(
"https://cdn.jsdelivr.net/gh/IKKEM-Lin/crawler-base-on-singlefile/validator.js"
);
await addScriptByText(
"https://cdn.jsdelivr.net/npm/pako@2.1.0/dist/pako.min.js"
);
return () => {
document.querySelectorAll("script[data-crawler='true']").forEach((el) => {
el.parentElement.removeChild(el);
});
};
};
const pureHTMLCleaner = (document) => {
document.querySelectorAll("script").forEach((el) => {
el.parentElement.removeChild(el);
});
document.querySelectorAll("style").forEach((el) => {
el.parentElement.removeChild(el);
});
};
// Overwrite fetch function to bypass CORS
window.unsafeWindow.fetch = async (...args) => {
return await fetch(...args).catch(async (err) => {
return await GM_fetch(...args);
});
};
async function reload(waiting = 60, message = "") {
console.warn(`%c${message}, reload ${waiting}s later`, printStyle);
await sleep(waiting);
location.reload();
}
function readFile(accept = "", multiple = false) {
const inputEl = document.createElement("input");
inputEl.setAttribute("type", "file");
inputEl.setAttribute("accept", accept);
inputEl.setAttribute("multiple", !!multiple);
return new Promise((resolve, reject) => {
inputEl.addEventListener("change", (e) => {
resolve(multiple ? inputEl.files : inputEl.files[0]);
window.removeEventListener("click", onWindowClick, true);
});
document.body.append(inputEl);
inputEl.click();
const onWindowClick = () => {
if (!inputEl.value) {
reject(new Error("用户取消选择"));
}
window.removeEventListener("click", onWindowClick, true);
};
setTimeout(() => {
window.addEventListener("click", onWindowClick, true);
}, 100);
});
}
function AddImportBtn() {
const btnWrapImport = document.createElement("div");
btnWrapImport.id = "CRAWLER_ID";
btnWrapImport.innerHTML = `<button style="padding: 4px 8px;position: fixed;bottom: 40%;right: 8px;border-radius: 4px;background-color: #224466;color: #fff;">Import</button>`;
const importBtn = btnWrapImport.querySelector("button");
importBtn.onclick = async () => {
if (
!window.confirm(
"The data in browser will be clear up. Please make sure you have to do this !!!"
)
) {
return;
}
const file = await readFile(".json");
const reader = new FileReader();
reader.onload = (event) => {
const json = JSON.parse(event.target.result);
// console.log({json}, 'json')
// this.importFromBackUp.bind(this)(json);
if (
json instanceof Array &&
json.every((item) => item.doi && item.validator)
) {
GM.setValue("tasks", json);
location.reload();
} else {
alert(
"Please upload json file like [{doi: string, validator: string, ...}]"
);
}
};
reader.readAsText(file);
};
document.body.appendChild(btnWrapImport);
return () => {
const importBtn = document.getElementById("CRAWLER_ID");
if (importBtn) {
importBtn.parentElement.removeChild(importBtn);
}
};
}
GM_registerMenuCommand("Download", async () => {
const taskData = await GM.getValue("tasks");
const waitingTasks = taskData.filter(
(task) =>
!task.downloaded &&
task.validated === undefined &&
validators[task.validator]
);
const now = new Date();
downloadFile(
JSON.stringify(taskData),
`${now.getFullYear()}-${
now.getMonth() + 1
}-${now.getDate()}-${now.getHours()}${now.getMinutes()}${now.getSeconds()}-${
taskData.length
}-${taskData.length - waitingTasks.length}.json`
);
});
GM_registerMenuCommand("Config", async () => {
gmc.open();
});
const printStyle = "color: blue;background-color: #ccc;font-size: 20px";
const prepareNextTask = async (nextDoi) => {
const taskInterval = gmc.get("taskInterval") || NEXT_TASK_WAITING_TIME;
if (nextDoi) {
console.log(
`%cStart next task ${taskInterval}s later...`,
printStyle,
nextDoi
);
await sleep(taskInterval);
const taskData = await GM.getValue("tasks");
const task = taskData.find((task) => task.doi === nextDoi);
await saveTaskTimepoint(TIME_POINT_TYPES.PREPARE_START, task, taskData);
location.href = nextDoi;
} else {
await reload(NO_TASK_WAITING_TIME, "No tasks waiting");
}
};
let lasestTimepoint = 0;
const saveTaskTimepoint = async (pointName, task, taskData) => {
if (pointName === TIME_POINT_TYPES.PREPARE_START) {
task[`timePoint_${pointName}`] = new Date().valueOf()
}
else {
if (lasestTimepoint == 0) {
lasestTimepoint = task[`timePoint_${TIME_POINT_TYPES.PREPARE_START}`] || 0;
}
if (lasestTimepoint == 0) {
task[`timePoint_${pointName}`] = 0;
} else {
task[`timePoint_${pointName}`] = new Date().valueOf() - lasestTimepoint;
}
lasestTimepoint = new Date().valueOf();
}
await GM.setValue("tasks", taskData);
};
const checkRetry = async (task, taskData, nextDoi) => {
const taskMaxRetryTimes = gmc.get("taskMaxRetryTimes") || TASK_MAX_RETRY_TIMES;
const retryTimes = task.retryTimes || 0;
let result = true;
if (retryTimes >= taskMaxRetryTimes) {
console.log(`%cTask have been retry ${taskMaxRetryTimes} times! ${task.doi}`, printStyle);
task.validated = false;
task.updateTime = new Date().valueOf();
await prepareNextTask(nextDoi);
result = false;
} else {
task.retryTimes = retryTimes + 1;
}
await GM.setValue("tasks", taskData);
return result;
}
async function start() {
console.log(new Date());
const importBtnHandler = AddImportBtn();
let clientId = await GM.getValue("clientId");
if (typeof clientId !== "string" || !clientId) {
clientId = generateClientId();
await GM.setValue("clientId", clientId);
}
// ---------------------------- Script dependencies handler -----------------------------------------------------
const dependenciesHandler = await dependenciesInit();
if (!singlefile || !singlefile.getPageData) {
await reload(ERROR_RELOAD_TIME, `singlefile error! ${currentTask.doi}`);
return;
}
if (!(validators && DEFAULT_CONFIG)) {
await reload(
ERROR_RELOAD_TIME,
"Can not get validators or DEFAULT_CONFIG"
);
return;
}
// ---------------------------- Get Task -----------------------------------------------------
const taskData = await GM.getValue("tasks");
let tasks = taskData || [];
// find task which not downloaded and not validated before
const waitingTasks = tasks.filter(
(task) =>
!task.downloaded &&
task.validated === undefined &&
validators[task.validator]
);
console.log(
`%cTry to get tasks firstly(${waitingTasks.length} / ${tasks.length}):`,
printStyle,
tasks
);
if (!waitingTasks.length) {
await reload(NO_TASK_WAITING_TIME, "No tasks waiting");
return;
}
const invalidatedTasks = tasks.filter((task) => task.validated === false);
const doneTasks = tasks
.filter((task) => task.downloaded)
.sort((a, b) => (a.updateTime > b.updateTime ? -1 : 1));
const previousDay = new Date().valueOf() - 24 * 3600 * 1000;
const last24hDoneTasks = doneTasks.filter(
(task) => task.updateTime > previousDay
);
const lastDoneTime = new Date(doneTasks[0]?.updateTime);
const currentTask = waitingTasks[0];
const nextTask = waitingTasks[1] || {};
await saveTaskTimepoint(TIME_POINT_TYPES.TASK_LOADED, currentTask, tasks);
const updateCurrentTask = async (isSuccess) => {
currentTask.validated = isSuccess;
currentTask.updateTime = new Date().valueOf();
await GM.setValue("tasks", tasks);
};
// ---------------------------- Report progress -----------------------------------------------------
const reportUrl = gmc.get("reportServer") || REPORT_ADDRESS;
const reportTip = `Last download time: ${lastDoneTime.toLocaleString()}
Speed: ${last24hDoneTasks.length} / last 24h`;
GM.xmlHttpRequest({
url: reportUrl,
method: "POST",
headers: { "Content-Type": "application/json" },
data: JSON.stringify({
account: clientId,
invalidate_count: invalidatedTasks.length,
done_count: doneTasks.length,
queue_count: waitingTasks.length,
tip: reportTip,
}),
})
.then((res) => {
console.log("Report successfully", { res });
})
.finally(() => {
saveTaskTimepoint(TIME_POINT_TYPES.TASK_REPORTED, currentTask, tasks);
});
// -------------------------- Detect Cloudflare challenge -------------------------------------------------------
await sleep(PAGE_LOADING_TIME);
if (document.getElementById("challenge-form")) {
console.log(`%cCloudflare challenge! ${currentTask.doi}`, printStyle);
await sleep(CF_CHALLENGE_WAITING_TIME);
currentTask.cloudflareBlock = true;
await updateCurrentTask(false);
await prepareNextTask(nextTask.doi);
return;
}
// bypass els institution check
if (document.querySelector('.sec-A #bdd-els-close')) {
const elsCloseBtn = document.querySelector('.sec-A #bdd-els-close');
elsCloseBtn.click();
}
// ---------------------------- validated task ------------------------------------------------
const doi = currentTask.doi.replace("https://doi.org/", "").toLowerCase();
const doiFixed = doi.replaceAll("/", "_");
const validator = validators[currentTask.validator];
let name = "";
let pass = "";
let preferServer = "";
try {
name = gmc.get("Name");
pass = gmc.get("Password");
preferServer = gmc.get("preferServer");
if (!name || !pass) {
throw new Error();
}
} catch (err) {
console.error(
`%cMiss name or password. Please input in config panel.`,
printStyle
);
return;
}
const indexUrl = await getPreSignUrl(doiFixed, `_.html.gz`, name, pass, preferServer);
await saveTaskTimepoint(TIME_POINT_TYPES.PRESIGN_INDEX, currentTask, tasks);
const singlefileUrl = await getPreSignUrl(
doiFixed,
`_.sf.html.gz`,
name,
pass,
preferServer
);
await saveTaskTimepoint(
TIME_POINT_TYPES.PRESIGN_SINGLEFILE,
currentTask,
tasks
);
if (indexUrl === "RELOAD" || singlefileUrl === "RELOAD") {
await reload(
ERROR_RELOAD_LONG_TIME,
"Minio PreSignUrl error, please check url or account"
);
return;
}
if (!indexUrl && !singlefileUrl) {
console.error("%cFile existed!!!", printStyle, currentTask.doi);
await updateCurrentTask(false);
await prepareNextTask(nextTask.doi);
return;
} else {
const old_index = await getPreSignUrl(doiFixed, `_.html`, name, pass, preferServer);
const old_singlefileUrl = await getPreSignUrl(
doiFixed,
`_.sf.html`,
name,
pass,
preferServer
);
if (!old_index && !old_singlefileUrl) {
console.error("%cFile existed!!!", printStyle, currentTask.doi);
await updateCurrentTask(false);
await prepareNextTask(nextTask.doi);
return;
}
}
// --------------------------- Page validate ------------------------------------------------------
if (!document.body.textContent.toLowerCase().includes(doi)) {
console.log(
`%cURL not match, will redirect to ${currentTask.doi} 5s later`,
printStyle
);
await sleep(QUICK_SLEEP_TIME);
if(await checkRetry(currentTask, tasks, nextTask.doi)){
location.href = currentTask.doi;
}
return;
}
if (validator(document)) {
console.log(
"%cValidate successfully! Downloading page...",
printStyle,
waitingTasks,
tasks
);
importBtnHandler();
// repair special page
if (typeof documentFixer[currentTask.validator] === "function") {
documentFixer[currentTask.validator](document);
}
try {
const data = await singlefile.getPageData(DEFAULT_CONFIG);
await saveTaskTimepoint(
TIME_POINT_TYPES.SINGLE_FILE_SUCCESS,
currentTask,
tasks
);
// downloadFile(data.content, `${doiFixed}.singlefile.html`);
// downloadFile(document.body.parentElement.outerHTML, `${doiFixed}.html`);
if (singlefileUrl) {
await uploader(singlefileUrl, data.content);
await saveTaskTimepoint(
TIME_POINT_TYPES.SINGLE_FILE_UPLOADED,
currentTask,
tasks
);
}
if (indexUrl) {
dependenciesHandler();
pureHTMLCleaner(document);
await uploader(indexUrl, document.body.parentElement.outerHTML);
await saveTaskTimepoint(
TIME_POINT_TYPES.INDEX_FILE_UPLOADED,
currentTask,
tasks
);
}
console.log("%cUpload successfully!", printStyle);
currentTask.downloaded = true;
await updateCurrentTask(true);
} catch (error) {
console.error(error);
if (await checkRetry(currentTask, tasks, nextTask.doi)) {
await reload(
ERROR_RELOAD_TIME,
`singlefile or upload error! ${currentTask.doi}`
);
}
return;
}
} else {
console.log(`%cValidate failed! ${currentTask.doi}`, printStyle);
await saveTaskTimepoint(
TIME_POINT_TYPES.VALIDATE_FAILED,
currentTask,
tasks
);
await updateCurrentTask(false);
}
// --------------------------- Prepare next task ------------------------------------------------------
await prepareNextTask(nextTask.doi);
}
start();
})();