Crawler base on SingleFile

Download site in single file automatically

// ==UserScript==
// @name              Crawler base on SingleFile
// @author            Mark
// @description       Download site in single file automatically
// @license           MIT
// @version           0.0.18
// @match             https://*/*
// @run-at            document-idle
// @grant GM.setValue
// @grant GM.getValue
// @grant GM.xmlHttpRequest
// @grant GM_registerMenuCommand
// @grant unsafeWindow
// @require     https://update.greasyfork.org/scripts/483730/1305396/gm-fetch.js
// @require     https://openuserjs.org/src/libs/sizzle/GM_config.js
// @connect *
// @noframes
// @namespace https://greasyfork.org/users/1106595
// ==/UserScript==

const REPORT_ADDRESS = "https://crawler-hit.deno.dev/api/update";
const PAGE_LOADING_TIME = 7;
const ERROR_RELOAD_TIME = 10;
const ERROR_RELOAD_LONG_TIME = 60;
const NEXT_TASK_WAITING_TIME = 10;

const NO_TASK_WAITING_TIME = 90;
const CF_CHALLENGE_WAITING_TIME = 20;
const QUICK_SLEEP_TIME = 5;
const DOMAIN_REG = /^(https?):\/\/([^\s\/?\.#]+\.?)+$/;
const TASK_MAX_RETRY_TIMES = 3;
const TIME_POINT_TYPES = {
  PREPARE_START: "prepareStart",
  TASK_LOADED: "taskLoaded",
  TASK_REPORTED: "taskReported",
  PRESIGN_INDEX: "presignIndex",
  PRESIGN_SINGLEFILE: "presignSinglefile",
  SINGLE_FILE_SUCCESS: "singleFileSuccess",
  INDEX_FILE_UPLOADED: "indexFileUploaded",
  SINGLE_FILE_UPLOADED: "singleFileUploaded",
  VALIDATE_FAILED: "validateFailed",
};
let gmc = new GM_config({
  id: "CrawlerConfig",
  title: "Crawler setting",
  fields: {
    Name: {
      label: "Name",
      type: "text",
    },
    Password: {
      label: "Password",
      type: "text",
    },
    taskInterval: {
      label: "Task Interval (s)",
      type: "int",
      default: NEXT_TASK_WAITING_TIME,
    },
    taskMaxRetryTimes: {
      label: "Task Max Retry Times",
      type: "int",
      default: TASK_MAX_RETRY_TIMES,
    },
    preferServer: {
      label: "Prefer preSign Server",
      type: "text",
    },
    reportServer: {
      label: "Report Server",
      type: "text",
      default: REPORT_ADDRESS,
    },
  },
  events: {
    init: function () {
      // runs after initialization completes
    },
    save: function () {
      // runs after values are saved
      console.log("save", this.get("Name"), this.get("Password"));
      this.close();
    },
  },
});

const crawlerUtil = {
  addScript: (url) => {
    const s = document.createElement("script");
    s.src = url;
    s.onerror = (evt) => {
      setTimeout(() => {
        addScript(url);
      }, 2000);
    };
    document.body.append(s);
  },

  addScriptByText: async (url, cache = false, retry = 0) => {
    const s = document.createElement("script");
    s.dataset.crawler = "true";
    const scriptCache = (await GM.getValue("scriptCache")) || {};
    if (cache && scriptCache[url]) {
      s.innerHTML = scriptCache[url];
      document.body.append(s);
      return true;
    }
    try {
      const res = await GM.xmlHttpRequest({
        url: url,
        method: "GET",
      });

      const text = res.responseText;
      if (cache) {
        scriptCache[url] = text;
        GM.setValue("scriptCache", scriptCache);
      }
      s.innerHTML = text;
      document.body.append(s);
      return true;
    } catch (error) {
      if (retry > 3) {
        return false;
      }
      await sleep(2);
      return await addScriptByText(url, retry + 1);
    }
  },

  getPreSignUrl: async (doi, fileName, name, pass, preferServer = "") => {
    const configServer = DOMAIN_REG.test(preferServer) ? [preferServer] : [];
    const preSignSevers = configServer.concat([
      "https://minio-presign.hzc.pub",
      "https://minio-presign-ali.hzc.pub",
      "https://chem-brain-minio.deno.dev",
    ]);
    async function getPreSignUrlFromServer(serverIndex = 0) {
      try {
        return await (
          await GM_fetch(
            `${preSignSevers[serverIndex]}/api/presignedPutObject?doi=${doi}&file_name=${fileName}&account=${name}&pass=${pass}`
          )
        ).json();
      } catch (error) {
        if (!preSignSevers[serverIndex + 1]) {
          return { reload: true };
        }
        return await getPreSignUrlFromServer(serverIndex + 1);
      }
    }

    const preSignRes = await getPreSignUrlFromServer();
    if (preSignRes.reload) {
      return "RELOAD";
    }

    const url = preSignRes?.url;
    return url || null;
  },

  uploader: async (url, content) => {
    const mime = "application/gzip"
    const gzip_data = pako.gzip(content, { level: 9 });
    const upload_blob = new Blob([gzip_data], { type: mime });

    return await GM.xmlHttpRequest({
      method: "PUT",
      url,
      headers: {
        "Content-Type": mime,
        "Content-Length": upload_blob.size,
      },
      data: upload_blob,
    });
  },

  downloadFile: (data, fileName) => {
    const a = document.createElement("a");
    document.body.appendChild(a);
    a.style = "display: none";
    const blob = new Blob([data], {
      type: "application/octet-stream",
    });
    const url = window.URL.createObjectURL(blob);
    a.href = url;
    a.download = fileName;
    a.click();
    window.URL.revokeObjectURL(url);
  },

  generateClientId: () => (1e6 * Math.random()).toString(32).replace(".", ""),

  sleep: (duration) => {
    return new Promise((res, rej) => {
      setTimeout(() => res(), duration * 1000);
    });
  },
};

// main function
(function () {
  "use strict";
  const {
    addScript,
    addScriptByText,
    generateClientId,
    uploader,
    downloadFile,
    getPreSignUrl,
    sleep,
  } = crawlerUtil;

  const dependenciesInit = async () => {
    await addScriptByText(
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-bootstrap.js",
      true
    );
    await addScriptByText(
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-hooks-frames.js",
      true
    );
    await addScriptByText(
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-frames.js",
      true
    );
    await addScriptByText(
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file.js",
      true
    );

    await addScriptByText(
      "https://cdn.jsdelivr.net/gh/IKKEM-Lin/crawler-base-on-singlefile/config.js"
    );
    await addScriptByText(
      "https://cdn.jsdelivr.net/gh/IKKEM-Lin/crawler-base-on-singlefile/validator.js"
    );
    await addScriptByText(
      "https://cdn.jsdelivr.net/npm/pako@2.1.0/dist/pako.min.js"
    );
    return () => {
      document.querySelectorAll("script[data-crawler='true']").forEach((el) => {
        el.parentElement.removeChild(el);
      });
    };
  };

  const pureHTMLCleaner = (document) => {
    document.querySelectorAll("script").forEach((el) => {
      el.parentElement.removeChild(el);
    });
    document.querySelectorAll("style").forEach((el) => {
      el.parentElement.removeChild(el);
    });
  };

  // Overwrite fetch function to bypass CORS
  window.unsafeWindow.fetch = async (...args) => {
    return await fetch(...args).catch(async (err) => {
      return await GM_fetch(...args);
    });
  };

  async function reload(waiting = 60, message = "") {
    console.warn(`%c${message}, reload ${waiting}s later`, printStyle);
    await sleep(waiting);
    location.reload();
  }

  function readFile(accept = "", multiple = false) {
    const inputEl = document.createElement("input");
    inputEl.setAttribute("type", "file");
    inputEl.setAttribute("accept", accept);
    inputEl.setAttribute("multiple", !!multiple);
    return new Promise((resolve, reject) => {
      inputEl.addEventListener("change", (e) => {
        resolve(multiple ? inputEl.files : inputEl.files[0]);
        window.removeEventListener("click", onWindowClick, true);
      });
      document.body.append(inputEl);
      inputEl.click();

      const onWindowClick = () => {
        if (!inputEl.value) {
          reject(new Error("用户取消选择"));
        }
        window.removeEventListener("click", onWindowClick, true);
      };
      setTimeout(() => {
        window.addEventListener("click", onWindowClick, true);
      }, 100);
    });
  }

  function AddImportBtn() {
    const btnWrapImport = document.createElement("div");
    btnWrapImport.id = "CRAWLER_ID";
    btnWrapImport.innerHTML = `<button style="padding: 4px 8px;position: fixed;bottom: 40%;right: 8px;border-radius: 4px;background-color: #224466;color: #fff;">Import</button>`;
    const importBtn = btnWrapImport.querySelector("button");
    importBtn.onclick = async () => {
      if (
        !window.confirm(
          "The data in browser will be clear up. Please make sure you have to do this !!!"
        )
      ) {
        return;
      }
      const file = await readFile(".json");
      const reader = new FileReader();

      reader.onload = (event) => {
        const json = JSON.parse(event.target.result);
        // console.log({json}, 'json')
        // this.importFromBackUp.bind(this)(json);
        if (
          json instanceof Array &&
          json.every((item) => item.doi && item.validator)
        ) {
          GM.setValue("tasks", json);
          location.reload();
        } else {
          alert(
            "Please upload json file like [{doi: string, validator: string, ...}]"
          );
        }
      };

      reader.readAsText(file);
    };
    document.body.appendChild(btnWrapImport);
    return () => {
      const importBtn = document.getElementById("CRAWLER_ID");
      if (importBtn) {
        importBtn.parentElement.removeChild(importBtn);
      }
    };
  }

  GM_registerMenuCommand("Download", async () => {
    const taskData = await GM.getValue("tasks");
    const waitingTasks = taskData.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );
    const now = new Date();
    downloadFile(
      JSON.stringify(taskData),
      `${now.getFullYear()}-${
        now.getMonth() + 1
      }-${now.getDate()}-${now.getHours()}${now.getMinutes()}${now.getSeconds()}-${
        taskData.length
      }-${taskData.length - waitingTasks.length}.json`
    );
  });

  GM_registerMenuCommand("Config", async () => {
    gmc.open();
  });

  const printStyle = "color: blue;background-color: #ccc;font-size: 20px";

  const prepareNextTask = async (nextDoi) => {
    const taskInterval = gmc.get("taskInterval") || NEXT_TASK_WAITING_TIME;
    if (nextDoi) {
      console.log(
        `%cStart next task ${taskInterval}s later...`,
        printStyle,
        nextDoi
      );
      await sleep(taskInterval);
      const taskData = await GM.getValue("tasks");
      const task = taskData.find((task) => task.doi === nextDoi);
      await saveTaskTimepoint(TIME_POINT_TYPES.PREPARE_START, task, taskData);
      location.href = nextDoi;
    } else {
      await reload(NO_TASK_WAITING_TIME, "No tasks waiting");
    }
  };

  let lasestTimepoint = 0;
  const saveTaskTimepoint = async (pointName, task, taskData) => {
    if (pointName === TIME_POINT_TYPES.PREPARE_START) {
      task[`timePoint_${pointName}`] = new Date().valueOf()
    }
    else {
      if (lasestTimepoint == 0) {
        lasestTimepoint = task[`timePoint_${TIME_POINT_TYPES.PREPARE_START}`] || 0;
      }
      if (lasestTimepoint == 0) {
        task[`timePoint_${pointName}`] = 0;
      } else {
        task[`timePoint_${pointName}`] = new Date().valueOf() - lasestTimepoint;
      }
      lasestTimepoint = new Date().valueOf();
    }
    await GM.setValue("tasks", taskData);
  };

  const checkRetry = async (task, taskData, nextDoi) => {
    const taskMaxRetryTimes = gmc.get("taskMaxRetryTimes") || TASK_MAX_RETRY_TIMES;
    const retryTimes = task.retryTimes || 0;
    let result = true;
    if (retryTimes >= taskMaxRetryTimes) {
      console.log(`%cTask have been retry ${taskMaxRetryTimes} times! ${task.doi}`, printStyle);
      task.validated = false;
      task.updateTime = new Date().valueOf();
      await prepareNextTask(nextDoi);
      result = false;
    } else {
      task.retryTimes = retryTimes + 1;
    }
    await GM.setValue("tasks", taskData);
    return result;
  }

  async function start() {
    console.log(new Date());

    const importBtnHandler = AddImportBtn();

    let clientId = await GM.getValue("clientId");
    if (typeof clientId !== "string" || !clientId) {
      clientId = generateClientId();
      await GM.setValue("clientId", clientId);
    }

    // ---------------------------- Script dependencies handler -----------------------------------------------------
    const dependenciesHandler = await dependenciesInit();

    if (!singlefile || !singlefile.getPageData) {
      await reload(ERROR_RELOAD_TIME, `singlefile error! ${currentTask.doi}`);
      return;
    }

    if (!(validators && DEFAULT_CONFIG)) {
      await reload(
        ERROR_RELOAD_TIME,
        "Can not get validators or DEFAULT_CONFIG"
      );
      return;
    }

    // ---------------------------- Get Task -----------------------------------------------------
    const taskData = await GM.getValue("tasks");
    let tasks = taskData || [];

    // find task which not downloaded and not validated before
    const waitingTasks = tasks.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );
    console.log(
      `%cTry to get tasks firstly(${waitingTasks.length} / ${tasks.length}):`,
      printStyle,
      tasks
    );

    if (!waitingTasks.length) {
      await reload(NO_TASK_WAITING_TIME, "No tasks waiting");
      return;
    }

    const invalidatedTasks = tasks.filter((task) => task.validated === false);
    const doneTasks = tasks
      .filter((task) => task.downloaded)
      .sort((a, b) => (a.updateTime > b.updateTime ? -1 : 1));
    const previousDay = new Date().valueOf() - 24 * 3600 * 1000;
    const last24hDoneTasks = doneTasks.filter(
      (task) => task.updateTime > previousDay
    );

    const lastDoneTime = new Date(doneTasks[0]?.updateTime);
    const currentTask = waitingTasks[0];
    const nextTask = waitingTasks[1] || {};
    await saveTaskTimepoint(TIME_POINT_TYPES.TASK_LOADED, currentTask, tasks);

    const updateCurrentTask = async (isSuccess) => {
      currentTask.validated = isSuccess;
      currentTask.updateTime = new Date().valueOf();
      await GM.setValue("tasks", tasks);
    };

    // ---------------------------- Report progress -----------------------------------------------------

    const reportUrl = gmc.get("reportServer") || REPORT_ADDRESS;
    const reportTip = `Last download time: ${lastDoneTime.toLocaleString()}
      Speed: ${last24hDoneTasks.length} / last 24h`;
    GM.xmlHttpRequest({
      url: reportUrl,
      method: "POST",
      headers: { "Content-Type": "application/json" },
      data: JSON.stringify({
        account: clientId,
        invalidate_count: invalidatedTasks.length,
        done_count: doneTasks.length,
        queue_count: waitingTasks.length,
        tip: reportTip,
      }),
    })
      .then((res) => {
        console.log("Report successfully", { res });
      })
      .finally(() => {
        saveTaskTimepoint(TIME_POINT_TYPES.TASK_REPORTED, currentTask, tasks);
      });


    // -------------------------- Detect Cloudflare challenge -------------------------------------------------------
    await sleep(PAGE_LOADING_TIME);
    if (document.getElementById("challenge-form")) {
      console.log(`%cCloudflare challenge! ${currentTask.doi}`, printStyle);
      await sleep(CF_CHALLENGE_WAITING_TIME);
      currentTask.cloudflareBlock = true;
      await updateCurrentTask(false);
      await prepareNextTask(nextTask.doi);
      return;
    }
    // bypass els institution check
    if (document.querySelector('.sec-A #bdd-els-close')) {
      const elsCloseBtn = document.querySelector('.sec-A #bdd-els-close');
      elsCloseBtn.click();
    }

    // ---------------------------- validated task ------------------------------------------------

    const doi = currentTask.doi.replace("https://doi.org/", "").toLowerCase();
    const doiFixed = doi.replaceAll("/", "_");
    const validator = validators[currentTask.validator];

    let name = "";
    let pass = "";
    let preferServer = "";
    try {
      name = gmc.get("Name");
      pass = gmc.get("Password");
      preferServer = gmc.get("preferServer");
      if (!name || !pass) {
        throw new Error();
      }
    } catch (err) {
      console.error(
        `%cMiss name or password. Please input in config panel.`,
        printStyle
      );
      return;
    }

    const indexUrl = await getPreSignUrl(doiFixed, `_.html.gz`, name, pass, preferServer);
    await saveTaskTimepoint(TIME_POINT_TYPES.PRESIGN_INDEX, currentTask, tasks);
    const singlefileUrl = await getPreSignUrl(
      doiFixed,
      `_.sf.html.gz`,
      name,
      pass,
      preferServer
    );
    await saveTaskTimepoint(
      TIME_POINT_TYPES.PRESIGN_SINGLEFILE,
      currentTask,
      tasks
    );
    if (indexUrl === "RELOAD" || singlefileUrl === "RELOAD") {
      await reload(
        ERROR_RELOAD_LONG_TIME,
        "Minio PreSignUrl error, please check url or account"
      );
      return;
    }
    if (!indexUrl && !singlefileUrl) {
      console.error("%cFile existed!!!", printStyle, currentTask.doi);
      await updateCurrentTask(false);
      await prepareNextTask(nextTask.doi);
      return;
    } else { 
      const old_index = await getPreSignUrl(doiFixed, `_.html`, name, pass, preferServer);
      const old_singlefileUrl = await getPreSignUrl(
        doiFixed,
        `_.sf.html`,
        name,
        pass,
        preferServer
      );
      if (!old_index && !old_singlefileUrl) {
        console.error("%cFile existed!!!", printStyle, currentTask.doi);
        await updateCurrentTask(false);
        await prepareNextTask(nextTask.doi);
        return;
      }
    } 

    // --------------------------- Page validate ------------------------------------------------------
    if (!document.body.textContent.toLowerCase().includes(doi)) {
      console.log(
        `%cURL not match, will redirect to ${currentTask.doi} 5s later`,
        printStyle
      );
      await sleep(QUICK_SLEEP_TIME);
      if(await checkRetry(currentTask, tasks, nextTask.doi)){
        location.href = currentTask.doi;
      }
      return;
    }
    if (validator(document)) {
      console.log(
        "%cValidate successfully! Downloading page...",
        printStyle,
        waitingTasks,
        tasks
      );
      importBtnHandler();
      // repair special page
      if (typeof documentFixer[currentTask.validator] === "function") {
        documentFixer[currentTask.validator](document);
      }
      try {
        const data = await singlefile.getPageData(DEFAULT_CONFIG);
        await saveTaskTimepoint(
          TIME_POINT_TYPES.SINGLE_FILE_SUCCESS,
          currentTask,
          tasks
        );
        // downloadFile(data.content, `${doiFixed}.singlefile.html`);
        // downloadFile(document.body.parentElement.outerHTML, `${doiFixed}.html`);
        if (singlefileUrl) {
          await uploader(singlefileUrl, data.content);
          await saveTaskTimepoint(
            TIME_POINT_TYPES.SINGLE_FILE_UPLOADED,
            currentTask,
            tasks
          );
        }
        if (indexUrl) {
          dependenciesHandler();
          pureHTMLCleaner(document);
          await uploader(indexUrl, document.body.parentElement.outerHTML);
          await saveTaskTimepoint(
            TIME_POINT_TYPES.INDEX_FILE_UPLOADED,
            currentTask,
            tasks
          );
        }
        console.log("%cUpload successfully!", printStyle);
        currentTask.downloaded = true;
        await updateCurrentTask(true);
      } catch (error) {
        console.error(error);
        if (await checkRetry(currentTask, tasks, nextTask.doi)) {
          await reload(
            ERROR_RELOAD_TIME,
            `singlefile or upload error! ${currentTask.doi}`
          );
        }
        return;
      }
    } else {
      console.log(`%cValidate failed! ${currentTask.doi}`, printStyle);
      await saveTaskTimepoint(
        TIME_POINT_TYPES.VALIDATE_FAILED,
        currentTask,
        tasks
      );
      await updateCurrentTask(false);
    }

    // --------------------------- Prepare next task ------------------------------------------------------
    await prepareNextTask(nextTask.doi);
  }

  start();
})();