Crawler base on SingleFile

Download site in single file automatically

Versión del día 20/12/2023. Echa un vistazo a la versión más reciente.

Tendrás que instalar una extensión para tu navegador como Tampermonkey, Greasemonkey o Violentmonkey si quieres utilizar este script.

You will need to install an extension such as Tampermonkey to install this script.

Necesitarás instalar una extensión como Tampermonkey o Violentmonkey para instalar este script.

Necesitarás instalar una extensión como Tampermonkey o Userscripts para instalar este script.

Necesitará instalar una extensión como Tampermonkey para instalar este script.

Necesitarás instalar una extensión para administrar scripts de usuario si quieres instalar este script.

(Ya tengo un administrador de scripts de usuario, déjame instalarlo)

Necesitará instalar una extensión como Stylus para instalar este estilo.

Necesitará instalar una extensión como Stylus para instalar este estilo.

Necesitará instalar una extensión como Stylus para instalar este estilo.

Necesitará instalar una extensión del gestor de estilos de usuario para instalar este estilo.

Necesitará instalar una extensión del gestor de estilos de usuario para instalar este estilo.

Necesitará instalar una extensión del gestor de estilos de usuario para instalar este estilo.

(Ya tengo un administrador de estilos de usuario, déjame instalarlo)

// ==UserScript==
// @name              Crawler base on SingleFile
// @author            Mark
// @description       Download site in single file automatically
// @license           MIT
// @version           0.0.2
// @match             https://*/*
// @run-at            document-idle
// @grant GM.setValue
// @grant GM.getValue
// @grant GM.xmlHttpRequest
// @grant GM_registerMenuCommand
// @grant unsafeWindow
// @noframes
// @namespace https://greasyfork.org/users/1106595
// ==/UserScript==

// Options object for SingleFile. In this file it is only read once: it is
// passed unchanged to `singlefile.getPageData(DEFAULT_CONFIG)` inside start().
// NOTE(review): the meaning of each individual option is defined by the
// SingleFile library, not by this script — check the SingleFile docs before
// changing a value.

// False when the user agent string matches /Mobile.*Firefox/, true otherwise.
const BACKGROUND_SAVE_SUPPORTED = !/Mobile.*Firefox/.test(navigator.userAgent);
const DEFAULT_CONFIG = {
  removeHiddenElements: true,
  removeUnusedStyles: true,
  removeUnusedFonts: true,
  removeFrames: false,
  compressHTML: true,
  compressCSS: false,
  loadDeferredImages: true,
  loadDeferredImagesMaxIdleTime: 1500,
  loadDeferredImagesBlockCookies: false,
  loadDeferredImagesBlockStorage: false,
  loadDeferredImagesKeepZoomLevel: false,
  loadDeferredImagesDispatchScrollEvent: false,
  loadDeferredImagesBeforeFrames: false,
  filenameTemplate:
    "%if-empty<{page-title}|No title> ({date-locale} {time-locale}).{filename-extension}",
  infobarTemplate: "",
  includeInfobar: false,
  confirmInfobarContent: false,
  autoClose: false,
  confirmFilename: false,
  filenameConflictAction: "uniquify",
  filenameMaxLength: 192,
  filenameMaxLengthUnit: "bytes",
  filenameReplacedCharacters: [
    "~",
    "+",
    "\\\\",
    "?",
    "%",
    "*",
    ":",
    "|",
    '"',
    "<",
    ">",
    "\x00-\x1f",
    "\x7F",
  ],
  filenameReplacementCharacter: "_",
  replaceEmojisInFilename: false,
  saveFilenameTemplateData: false,
  contextMenuEnabled: true,
  tabMenuEnabled: true,
  browserActionMenuEnabled: true,
  shadowEnabled: true,
  logsEnabled: true,
  progressBarEnabled: true,
  maxResourceSizeEnabled: false,
  maxResourceSize: 10,
  displayInfobar: true,
  displayStats: false,
  // Follows the user-agent check above.
  backgroundSave: BACKGROUND_SAVE_SUPPORTED,
  defaultEditorMode: "normal",
  applySystemTheme: true,
  autoSaveDelay: 1,
  autoSaveLoad: false,
  autoSaveUnload: false,
  autoSaveLoadOrUnload: true,
  autoSaveDiscard: false,
  autoSaveRemove: false,
  autoSaveRepeat: false,
  autoSaveRepeatDelay: 10,
  removeAlternativeFonts: true,
  removeAlternativeMedias: true,
  removeAlternativeImages: true,
  groupDuplicateImages: true,
  // 512 * 1024 = 524288
  maxSizeDuplicateImages: 512 * 1024,
  saveRawPage: false,
  saveToClipboard: false,
  addProof: false,
  saveToGDrive: false,
  saveToDropbox: false,
  saveWithWebDAV: false,
  webDAVURL: "",
  webDAVUser: "",
  webDAVPassword: "",
  saveToGitHub: false,
  githubToken: "",
  githubUser: "",
  githubRepository: "SingleFile-Archives",
  githubBranch: "main",
  saveWithCompanion: false,
  forceWebAuthFlow: false,
  resolveFragmentIdentifierURLs: false,
  userScriptEnabled: false,
  openEditor: false,
  openSavedPage: false,
  autoOpenEditor: false,
  saveCreatedBookmarks: false,
  allowedBookmarkFolders: [],
  ignoredBookmarkFolders: [],
  replaceBookmarkURL: true,
  saveFavicon: true,
  includeBOM: false,
  warnUnsavedPage: true,
  displayInfobarInEditor: false,
  compressContent: false,
  createRootDirectory: false,
  selfExtractingArchive: true,
  extractDataFromPage: true,
  preventAppendedData: false,
  insertTextBody: false,
  autoSaveExternalSave: false,
  insertMetaNoIndex: false,
  insertMetaCSP: true,
  passReferrerOnError: false,
  password: "",
  insertSingleFileComment: true,
  removeSavedDate: false,
  blockMixedContent: false,
  saveOriginalURLs: false,
  // One Accept header value per resource kind.
  acceptHeaders: {
    font: "application/font-woff2;q=1.0,application/font-woff;q=0.9,*/*;q=0.8",
    image: "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    stylesheet: "text/css,*/*;q=0.1",
    script: "*/*",
    document: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    video:
      "video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5",
    audio:
      "audio/webm,audio/ogg,audio/wav,audio/*;q=0.9,application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5",
  },
  moveStylesInHead: false,
  networkTimeout: 0,
  woleetKey: "",
  // Images, stylesheets and fonts are not blocked; scripts, videos and audios are.
  blockImages: false,
  blockStylesheets: false,
  blockFonts: false,
  blockScripts: true,
  blockVideos: true,
  blockAudios: true,
  _migratedTemplateFormat: true,
};

// Page validators, one entry per publisher site. A task's `validator` field
// selects the entry to run. Each entry receives a Document-like object and
// yields a truthy value only when the page's summary/abstract selector AND at
// least one of its body-paragraph selectors match.
// NOTE(review): the numeric keys look like DOI registrant codes — confirm.
const validators = {
  1002: (doc) => {
    const summaryNode = doc.querySelector(
      ".article__body .abstract-group .article-section__abstract .article-section__content"
    );
    if (!summaryNode) {
      return summaryNode;
    }
    const paragraphs = doc.querySelectorAll(
      ".article__body .article-section__full :where(.article-section__content > p, .article-section__sub-content > p)"
    );
    return paragraphs.length > 0;
  },
  1016: (doc) => {
    // Two alternative page layouts for the summary, three for the body.
    const summaryNode =
      doc.querySelector("div.abstract.author > div") ||
      doc.querySelector('[data-left-hand-nav="Summary"]');
    if (!summaryNode) {
      return summaryNode;
    }
    const bodySelectors = [
      "div#body > div:first-child > section[id^=s] p[id^=p]",
      "div#body > div:first-child  :where(section[id^=aep-section] > p, section[id^=aep-section] div > p)",
      "[id^='sec'] .section-paragraph",
    ];
    return bodySelectors.some(
      (selector) => doc.querySelectorAll(selector).length > 0
    );
  },
  3390: (doc) => {
    const summaryNode = doc.querySelector("#html-abstract .html-p");
    if (!summaryNode) {
      return summaryNode;
    }
    return doc.querySelectorAll("article .html-body .html-p").length > 0;
  },
  1039: (doc) => {
    const summaryNode = doc.querySelector("article .capsule__text");
    if (!summaryNode) {
      return summaryNode;
    }
    return doc.querySelectorAll("#pnlArticleContentLoaded > p").length > 0;
  },
  1021: (doc) => {
    const summaryNode = doc.querySelector("p.articleBody_abstractText");
    if (!summaryNode) {
      return summaryNode;
    }
    const bodySelectors = ["div.NLM_p", ".article_content-left > p"];
    return bodySelectors.some(
      (selector) => doc.querySelectorAll(selector).length > 0
    );
  },
  1038: (doc) => {
    const summaryNode = doc.querySelector("#Abs1-content");
    if (!summaryNode) {
      return summaryNode;
    }
    const paragraphs = doc.querySelectorAll(
      "article .main-content .c-article-section__content > p"
    );
    return paragraphs.length > 0;
  },
  1007: (doc) => {
    const hasSummary = doc.querySelectorAll("#Abs1-content p").length > 0;
    if (!hasSummary) {
      return false;
    }
    const paragraphs = doc.querySelectorAll(
      ".main-content .c-article-section__content > p"
    );
    return paragraphs.length > 0;
  },
  1088: (doc) => {
    const hasSummary =
      doc.querySelectorAll(".wd-jnl-art-abstract > p").length > 0;
    if (!hasSummary) {
      return false;
    }
    // Body paragraphs may sit up to three `.article-text` levels deep.
    const paragraphs = doc.querySelectorAll(`:where( 
        div[itemprop="articleBody"] >  p, 
        div[itemprop="articleBody"] > .article-text > p, 
        div[itemprop="articleBody"] > .article-text > .article-text > p,
        div[itemprop="articleBody"] > .article-text > .article-text > .article-text > p)
    `);
    return paragraphs.length > 0;
  },
  1063: (doc) => {
    const hasSummary =
      doc.querySelectorAll("#ContentTab .abstract p").length > 0;
    if (!hasSummary) {
      return false;
    }
    const paragraphs = doc.querySelectorAll(
      "#ContentTab .article-section-wrapper > p"
    );
    return paragraphs.length > 0;
  },
  1149: (doc) => {
    const hasSummary =
      doc.querySelectorAll(".wd-jnl-art-abstract > p").length > 0;
    if (!hasSummary) {
      return false;
    }
    // Same page structure as entry 1088.
    const paragraphs = doc.querySelectorAll(`:where( 
      div[itemprop="articleBody"] >  p, 
      div[itemprop="articleBody"] > .article-text > p, 
      div[itemprop="articleBody"] > .article-text > .article-text > p,
      div[itemprop="articleBody"] > .article-text > .article-text > .article-text > p)
  `);
    return paragraphs.length > 0;
  },
};

/**
 * Load an external script by appending a <script src="..."> element to the
 * end of document.body.
 *
 * @param {string} url - Address assigned to the element's `src`.
 */
const addScript = (url) => {
  const scriptEl = Object.assign(document.createElement("script"), {
    src: url,
  });
  document.body.append(scriptEl);
};

// main function
(function () {
  "use strict";

  // Inject the SingleFile helper scripts, in this order: bootstrap,
  // frame hooks, frames.
  for (const url of [
    "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-bootstrap.js",
    "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-hooks-frames.js",
    "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file-frames.js",
  ]) {
    addScript(url);
  }
  // Override the page's fetch so that requests which fail (typically because
  // of CORS) are retried through a permissive proxy.
  /** The proxy at "fetch-url2.deno.dev" runs the following code:
   * 
    serve((req: Request) => handleRequest(req));

    async function handleRequest(req: Request) {
        const url = req.url;
        const finalUrl = url && url.split("?url=")[1];
        if (!finalUrl) {
            return new Response(url + " no match '?url='");
        }
        const res = await fetch(finalUrl);
        return new Response(res.body, {
        headers: {
            ...res.headers,
            "Access-Control-Allow-Origin": "*",
            "Access-Control-Expose-Headers":
            "Request-Context,api-supported-versions,Content-Length,Date,Server",
        },
        });
    }
   **/
  // Keep a reference to the fetch that exists BEFORE the override. If the
  // script's global object and unsafeWindow are the same object, calling the
  // ambient `fetch` from inside the wrapper would call the wrapper itself.
  const nativeFetch = fetch;
  // The proxy takes everything after "?url=" verbatim (see the code above), so
  // the target must be appended as-is and must NOT be percent-encoded.
  const CORS_PROXY_PREFIX = "https://fetch-url2.deno.dev?url=";

  /**
   * Replacement for the page's fetch.
   *
   * - Calls made with a single argument are passed straight through.
   * - Calls made with two or more arguments are retried once through the
   *   proxy when the direct request rejects. The first argument may be a
   *   string, a URL, or a Request-like object with a `url` property; relative
   *   addresses are resolved against the current page before being proxied.
   *
   * @param {...*} args - Same arguments as the standard fetch().
   * @returns {Promise<Response>} Direct response, or the proxied one on fallback.
   */
  window.unsafeWindow.fetch = async (...args) => {
    if (args.length <= 1) {
      return nativeFetch(...args);
    }
    try {
      return await nativeFetch(...args);
    } catch (err) {
      const [input, ...otherArgs] = args;
      const rawUrl =
        typeof input === "string" ? input : (input?.url ?? String(input));
      let target;
      try {
        // The proxy cannot resolve page-relative addresses itself.
        target = new URL(rawUrl, location.href).href;
      } catch {
        // Not a usable address: surface the original failure unchanged.
        throw err;
      }
      console.warn(`fetch failed for ${target}, retrying through proxy`, err);
      return nativeFetch(CORS_PROXY_PREFIX + target, ...otherArgs);
    }
  };

  /**
   * Offer `data` to the user as a file download named `fileName`.
   *
   * A temporary hidden <a download> pointing at a Blob URL is clicked
   * programmatically. The anchor is removed again afterwards so that it does
   * not pile up in the page (and does not end up in HTML later serialized
   * from the document).
   *
   * @param {BlobPart} data - Content of the file.
   * @param {string} fileName - Suggested file name.
   */
  const downloadFile = (data, fileName) => {
    const blob = new Blob([data], {
      type: "application/octet-stream",
    });
    const url = window.URL.createObjectURL(blob);
    const a = document.createElement("a");
    a.style.display = "none";
    a.href = url;
    a.download = fileName;
    document.body.appendChild(a);
    try {
      a.click();
    } finally {
      // Always undo the DOM change and release the Blob URL.
      a.remove();
      window.URL.revokeObjectURL(url);
    }
  };

  /**
   * Pause helper for async code.
   *
   * @param {number} duration - Delay in seconds.
   * @returns {Promise<void>} Resolves (with no value) once the delay elapsed.
   */
  const sleep = (duration) => {
    const milliseconds = duration * 1000;
    return new Promise((resolve) => {
      setTimeout(() => {
        resolve();
      }, milliseconds);
    });
  };

  /**
   * Log a styled warning, wait, then reload the current page.
   *
   * @param {number} [waiting=60] - Seconds to wait before reloading.
   * @param {string} [message=""] - Reason shown in the console warning.
   */
  async function reload(waiting = 60, message = "") {
    const notice = `%c${message}, reload ${waiting}s later`;
    console.warn(notice, printStyle);
    await sleep(waiting);
    location.reload();
  }

  /**
   * Open the browser's file picker and resolve with the user's choice.
   *
   * A temporary, hidden <input type="file"> is added to the page and clicked
   * programmatically; it is removed again once the promise settles.
   *
   * @param {string} [accept=""] - Value for the input's `accept` attribute.
   * @param {boolean} [multiple=false] - Allow picking several files.
   * @returns {Promise<File|FileList>} The first file, or the whole FileList
   *   when `multiple` is truthy. Rejects with an Error when the picker is
   *   dismissed without a selection.
   */
  function readFile(accept = "", multiple = false) {
    const inputEl = document.createElement("input");
    inputEl.setAttribute("type", "file");
    inputEl.setAttribute("accept", accept);
    // `multiple` is a boolean attribute: its mere presence enables it, so
    // setAttribute("multiple", false) would switch multi-select ON.
    inputEl.multiple = Boolean(multiple);
    inputEl.style.display = "none";

    return new Promise((resolve, reject) => {
      let armTimer = null;

      const cleanup = () => {
        clearTimeout(armTimer);
        window.removeEventListener("click", onWindowClick, true);
        inputEl.remove();
      };

      const rejectCancelled = () => {
        cleanup();
        reject(new Error("用户取消选择"));
      };

      // Fallback cancel detection: the first click on the page after the
      // dialog closed with nothing selected is treated as a cancellation.
      function onWindowClick() {
        window.removeEventListener("click", onWindowClick, true);
        if (!inputEl.value) {
          rejectCancelled();
        }
      }

      inputEl.addEventListener("change", () => {
        resolve(multiple ? inputEl.files : inputEl.files[0]);
        cleanup();
      });
      // Browsers that support it fire `cancel` when the dialog is dismissed.
      inputEl.addEventListener("cancel", rejectCancelled);

      document.body.append(inputEl);
      inputEl.click();

      // Armed slightly later so the click that opened the picker is not
      // mistaken for the "cancel" click.
      armTimer = setTimeout(() => {
        window.addEventListener("click", onWindowClick, true);
      }, 100);
    });
  }

  /**
   * Add a fixed-position "Import" button to the page.
   *
   * Clicking it asks for confirmation, lets the user pick a .json file and,
   * when the file holds an array of objects that each have `doi` and
   * `validator`, stores that array under the "tasks" key and reloads the
   * page. Anything else produces an alert describing the expected shape.
   */
  function AddImportBtn() {
    const btnWrapImport = document.createElement("div");
    btnWrapImport.id = "CRAWLER_ID";
    btnWrapImport.innerHTML = `<button style="padding: 4px 8px;position: fixed;bottom: 40%;right: 8px;border-radius: 4px;background-color: #224466;color: #fff;">Import</button>`;
    const importBtn = btnWrapImport.querySelector("button");
    importBtn.onclick = async () => {
      if (
        !window.confirm(
          "The data in browser will be clear up. Please make sure you have to do this !!!"
        )
      ) {
        return;
      }

      let file;
      try {
        file = await readFile(".json");
      } catch (error) {
        // readFile rejects when the picker is dismissed; nothing to import.
        console.warn("Import aborted:", error);
        return;
      }

      const formatHint =
        "Please upload json file like [{doi: string, validator: string, ...}]";
      let json;
      try {
        json = JSON.parse(await file.text());
      } catch (error) {
        console.error(error);
        alert(formatHint);
        return;
      }

      const isTaskList =
        Array.isArray(json) &&
        json.every((item) => item?.doi && item?.validator);
      if (!isTaskList) {
        alert(formatHint);
        return;
      }

      // Wait for the write to finish: reloading first could drop the import.
      await GM.setValue("tasks", json);
      location.reload();
    };
    document.body.appendChild(btnWrapImport);
  }

  /**
   * Detach the element with id "CRAWLER_ID" (the Import button wrapper) from
   * its parent. Does nothing when no such element exists.
   */
  function removeImportBtn() {
    const wrapper = document.getElementById("CRAWLER_ID");
    if (!wrapper) {
      return;
    }
    wrapper.parentElement.removeChild(wrapper);
  }

  /**
   * Menu command "Download": export the stored task list as a JSON file.
   *
   * File name: YYYY-MM-DD-HHMMSS-<total>-<finished>.json, where <finished>
   * is the total minus the tasks that are still waiting (not downloaded, not
   * yet validated, and with a known validator). Date and time parts are
   * zero-padded so that the unseparated time is unambiguous.
   */
  GM_registerMenuCommand("Download", async () => {
    // Nothing may have been stored yet; treat that as an empty list.
    const taskData = (await GM.getValue("tasks")) || [];
    const waitingTasks = taskData.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );
    const now = new Date();
    const pad = (value) => String(value).padStart(2, "0");
    const datePart = [
      now.getFullYear(),
      pad(now.getMonth() + 1),
      pad(now.getDate()),
    ].join("-");
    const timePart = [now.getHours(), now.getMinutes(), now.getSeconds()]
      .map(pad)
      .join("");
    const finishedCount = taskData.length - waitingTasks.length;
    downloadFile(
      JSON.stringify(taskData),
      `${datePart}-${timePart}-${taskData.length}-${finishedCount}.json`
    );
  });

  // CSS applied to the script's `%c` console messages.
  const printStyle = "color: blue;background-color: #ccc;font-size: 20px";

  /**
   * POST a progress summary to https://crawler-hit.deno.dev/api/update.
   *
   * Best effort and fire-and-forget: the caller does not wait for it, and a
   * failure (including a script manager whose GM.xmlHttpRequest throws or
   * does not return a promise) is only logged.
   *
   * @param {object} summary - JSON-serializable payload.
   */
  function reportProgress(summary) {
    Promise.resolve()
      .then(() =>
        GM.xmlHttpRequest({
          url: "https://crawler-hit.deno.dev/api/update",
          method: "POST",
          headers: { "Content-Type": "application/json" },
          data: JSON.stringify(summary),
        })
      )
      .then((res) => {
        console.log({ res });
      })
      .catch((error) => {
        console.warn("Progress report failed", error);
      });
  }

  /**
   * One crawl step, run on every page load.
   *
   * 1. Show the Import button, wait, and inject the main SingleFile script.
   * 2. Load the stored tasks and report progress.
   * 3. With no waiting task: reload after 90 s.
   * 4. Otherwise take the first waiting task. If the page does not mention
   *    its DOI, navigate to it. If the page passes the task's validator, save
   *    it (SingleFile output plus raw HTML) and mark the task done; if not,
   *    mark it as failed validation.
   * 5. Persist the tasks and move on to the next waiting task, or reload
   *    after 60 s when there is none.
   */
  async function start() {
    console.log(new Date());
    AddImportBtn();
    await sleep(7);
    addScript(
      "https://cdn.jsdelivr.net/gh/gildas-lormeau/SingleFile-MV3/lib/single-file.js"
    );
    const taskData = await GM.getValue("tasks");
    const tasks = taskData || [];

    // Tasks that were neither downloaded nor validated before and whose
    // validator is known.
    const waitingTasks = tasks.filter(
      (task) =>
        !task.downloaded &&
        task.validated === undefined &&
        validators[task.validator]
    );
    console.log(
      `%cTry to get tasks firstly(${waitingTasks.length} / ${tasks.length}):`,
      printStyle,
      tasks
    );

    // ---------------------------- Report progress -----------------------------------------------------

    const invalidatedTasks = tasks.filter((task) => task.validated === false);
    const doneTasks = tasks.filter((task) => task.downloaded);
    // Most recent download time; tasks without a usable updateTime are skipped.
    const lastDoneTimestamp = doneTasks.reduce((latest, task) => {
      const time = new Date(task.updateTime).getTime();
      return !Number.isNaN(time) && time > latest ? time : latest;
    }, -Infinity);
    const lastDoneLabel = Number.isFinite(lastDoneTimestamp)
      ? new Date(lastDoneTimestamp).toLocaleString()
      : "N/A";
    reportProgress({
      account: "开发机",
      invalidate_count: invalidatedTasks.length,
      done_count: doneTasks.length,
      queue_count: waitingTasks.length,
      tip: `Last download time: ${lastDoneLabel}`,
    });

    if (!waitingTasks.length) {
      await reload(90, "No tasks waiting");
      return;
    }

    // -------------------------- Detect Cloudflare challenge -------------------------------------------------------
    await sleep(10);
    const currentTask = waitingTasks[0];
    const doi = currentTask.doi.replace("https://doi.org/", "").toLowerCase();
    const validator = validators[currentTask.validator];
    if (document.getElementById("challenge-form")) {
      // A challenge page is recorded as a failed validation and flagged, so
      // the task drops out of the waiting list.
      console.log(`%cCloudflare challenge! ${currentTask.doi}`, printStyle);
      await sleep(20);
      currentTask.validated = false;
      currentTask.cloudflareBlock = true;
    }

    // --------------------------- Page validate ------------------------------------------------------
    if (
      !currentTask.cloudflareBlock &&
      !document.body.textContent.toLowerCase().includes(doi)
    ) {
      // The current page is not the task's article: go there first. Nothing
      // is persisted on this path.
      console.log(
        `%cURL not match, will redirect to ${currentTask.doi} 5s later`,
        printStyle
      );
      await sleep(5);
      location.href = currentTask.doi;
      return;
    }
    if (!currentTask.cloudflareBlock && validator(document)) {
      console.log(
        "%cValidate successfully! Downloading page...",
        printStyle,
        waitingTasks,
        tasks
      );
      // Removed before saving so the button is not part of the saved page.
      removeImportBtn();
      try {
        const data = await singlefile.getPageData(DEFAULT_CONFIG);
        downloadFile(
          data.content,
          `${doi.replaceAll("/", "_")}.singlefile.html`
        );
        downloadFile(
          document.body.parentElement.outerHTML,
          `${doi.replaceAll("/", "_")}.html`
        );
        currentTask.downloaded = true;
        currentTask.validated = true;
        currentTask.updateTime = new Date().valueOf();
      } catch (error) {
        // Task state is NOT persisted here, so the same task is retried
        // after the reload.
        console.error(error);
        await reload(10, `singlefile error! ${currentTask.doi}`);
        return;
      }
    } else {
      console.log(`%cValidate failed! ${currentTask.doi}`, printStyle);
      currentTask.validated = false;
    }

    await GM.setValue("tasks", tasks);

    // --------------------------- Prepare next task ------------------------------------------------------
    const nextTask = waitingTasks[1];
    if (nextTask) {
      console.log(
        `%cStart next task 10s later...`,
        printStyle,
        nextTask.doi,
        tasks
      );
      await sleep(10);
      location.href = nextTask.doi;
    } else {
      await reload(60, "No tasks waiting");
    }
  }

  // start() is async: log a failure instead of leaving an unhandled rejection.
  start().catch((error) => {
    console.error(error);
  });
})();