Sequential Link Content Finder

Collect links from an index page, visit them sequentially, and save URLs whose page text matches a target string.

Na nainštalovanie skriptu si budete musieť nainštalovať rozšírenie, ako napríklad Tampermonkey, Greasemonkey alebo Violentmonkey.

Na inštaláciu tohto skriptu je potrebné nainštalovať rozšírenie, ako napríklad Tampermonkey.

Na nainštalovanie skriptu si budete musieť nainštalovať rozšírenie, ako napríklad Tampermonkey alebo Violentmonkey.

Na nainštalovanie skriptu si budete musieť nainštalovať rozšírenie, ako napríklad Tampermonkey alebo Userscripts.

Na inštaláciu tohto skriptu je potrebné nainštalovať rozšírenie, ako napríklad Tampermonkey.

Na inštaláciu tohto skriptu je potrebné nainštalovať rozšírenie správcu používateľských skriptov.

(Už mám správcu používateľských skriptov, nechajte ma ho nainštalovať!)

Advertisement:

Na inštaláciu tohto štýlu je potrebné nainštalovať rozšírenie, ako napríklad Stylus.

Na inštaláciu tohto štýlu je potrebné nainštalovať rozšírenie, ako napríklad Stylus.

Na inštaláciu tohto štýlu je potrebné nainštalovať rozšírenie, ako napríklad Stylus.

Na inštaláciu tohto štýlu je potrebné nainštalovať rozšírenie správcu používateľských štýlov.

Na inštaláciu tohto štýlu je potrebné nainštalovať rozšírenie správcu používateľských štýlov.

Na inštaláciu tohto štýlu je potrebné nainštalovať rozšírenie správcu používateľských štýlov.

(Už mám správcu používateľských štýlov, nechajte ma ho nainštalovať!)

Advertisement:

// ==UserScript==
// @name         Sequential Link Content Finder
// @namespace    https://example.com/
// @version      1.0.0
// @description  Collect links from an index page, visit them sequentially, and save URLs whose page text matches a target string.
// @match        https://example.com/*
// @grant        GM.getValue
// @grant        GM.setValue
// @grant        GM.deleteValue
// @grant        GM.registerMenuCommand
// @run-at       document-idle
// ==/UserScript==

(async function () {
  "use strict";

  // GM storage key under which the scan state object is persisted.
  const STATE_STORAGE_KEY = "sequential-link-content-finder-state-v1";
  // GM storage key under which the user-edited config is persisted.
  const CONFIG_STORAGE_KEY = "sequential-link-content-finder-config-v1";

  // Fallback values for every config field; loadConfig() overlays the saved
  // config on top of these.
  const DEFAULT_CONFIG = {
    // RegExp source tested against location.href to recognize the index page.
    indexUrlPattern: "^https://example\\.com/index",
    // RegExp source tested against each absolute link URL found on the index page.
    linkHrefPattern: "/target-pages/",
    // Literal substring searched for in the text of each visited page.
    targetText: "Text to search for",
    // Delay in milliseconds before a visited page's text is read.
    waitAfterLoadMs: 2000,
    // Upper bound on the number of URLs collected from the index page.
    maxUrls: 500,
  };

  /**
   * Write a message to the console, tagged with this script's prefix.
   *
   * @param {...*} args - Values forwarded to console.log after the prefix.
   */
  function log(...args) {
    const taggedArgs = ["[LinkContentFinder]", ...args];
    console.log(...taggedArgs);
  }

  /**
   * Create a promise that settles after a timer of `ms` milliseconds fires.
   *
   * @param {number} ms - Delay handed to setTimeout.
   * @returns {Promise<void>} Resolves once the timer fires.
   */
  function sleep(ms) {
    return new Promise((wake) => {
      setTimeout(wake, ms);
    });
  }

  /**
   * Read the saved config from GM storage and overlay it on DEFAULT_CONFIG.
   *
   * @returns {Promise<object>} A new object; saved fields win over defaults.
   */
  async function loadConfig() {
    const storedConfig = await GM.getValue(CONFIG_STORAGE_KEY);
    // Nothing (or a falsy value) stored means "defaults only".
    const overrides = storedConfig ? storedConfig : {};
    return { ...DEFAULT_CONFIG, ...overrides };
  }

  /**
   * Persist the given config object under CONFIG_STORAGE_KEY.
   *
   * @param {object} config - Config to store.
   * @returns {Promise<void>} Settles when the storage write settles.
   */
  async function saveConfig(config) {
    const pendingWrite = GM.setValue(CONFIG_STORAGE_KEY, config);
    await pendingWrite;
  }

  /**
   * Delete the saved config so that loadConfig() yields the defaults again.
   *
   * @returns {Promise<void>} Settles when the storage delete settles.
   */
  async function resetConfig() {
    const pendingDelete = GM.deleteValue(CONFIG_STORAGE_KEY);
    await pendingDelete;
  }

  /**
   * Read the persisted scan state.
   *
   * @returns {Promise<*>} The stored value, or null when nothing truthy is stored.
   */
  async function loadState() {
    const storedState = await GM.getValue(STATE_STORAGE_KEY);
    return storedState ? storedState : null;
  }

  /**
   * Persist the given scan state under STATE_STORAGE_KEY.
   *
   * @param {object} state - Scan state to store.
   * @returns {Promise<void>} Settles when the storage write settles.
   */
  async function saveState(state) {
    const pendingWrite = GM.setValue(STATE_STORAGE_KEY, state);
    await pendingWrite;
  }

  /**
   * Delete the persisted scan state.
   *
   * @returns {Promise<void>} Settles when the storage delete settles.
   */
  async function clearState() {
    const pendingDelete = GM.deleteValue(STATE_STORAGE_KEY);
    await pendingDelete;
  }

  /**
   * Resolve an href (absolute or relative) to an absolute URL string.
   *
   * Relative hrefs are resolved against the document's base URL rather than
   * location.href, so a <base href="..."> element on the page is honored the
   * same way the browser honors it when the link is followed. Without a
   * <base> element, document.baseURI is the document's own URL.
   *
   * @param {string} href - Raw href value, e.g. from getAttribute("href").
   * @returns {string|null} The absolute URL, or null when it cannot be parsed.
   */
  function normalizeUrl(href) {
    try {
      return new URL(href, document.baseURI).href;
    } catch {
      // The URL constructor throws on unparsable input; callers filter out null.
      return null;
    }
  }

  /**
   * Compile a RegExp from its source text (no flags).
   *
   * @param {string} patternText - Body of the regular expression.
   * @param {string} label - Name of the config field, used in the error message.
   * @returns {RegExp} The compiled expression.
   * @throws {Error} When the pattern is invalid; the message is prefixed with
   *   `label` and the original error is kept as `cause`.
   */
  function compileRegExp(patternText, label) {
    try {
      return new RegExp(patternText);
    } catch (error) {
      // Keep the original error reachable so its type and stack are not lost.
      throw new Error(`${label} is invalid RegExp: ${error.message}`, {
        cause: error,
      });
    }
  }

  /**
   * Non-throwing structural check of a config object.
   *
   * Requires three string fields, a finite non-negative waitAfterLoadMs, an
   * integer maxUrls of at least 1 (both after Number() coercion), and two
   * patterns that compile.
   *
   * @param {*} config - Candidate config.
   * @returns {boolean} true when every check passes.
   */
  function isValidConfig(config) {
    if (!config || typeof config !== "object") return false;

    const stringFields = ["indexUrlPattern", "linkHrefPattern", "targetText"];
    if (stringFields.some((field) => typeof config[field] !== "string")) {
      return false;
    }

    const waitMs = Number(config.waitAfterLoadMs);
    if (!Number.isFinite(waitMs) || waitMs < 0) return false;

    const urlLimit = Number(config.maxUrls);
    if (!Number.isInteger(urlLimit) || urlLimit < 1) return false;

    try {
      compileRegExp(config.indexUrlPattern, "INDEX_URL_PATTERN");
      compileRegExp(config.linkHrefPattern, "LINK_HREF_PATTERN");
    } catch {
      return false;
    }

    return true;
  }

  /**
   * Throwing counterpart of the config check, with user-facing messages.
   *
   * Checks run in a fixed order: both patterns, then the target text, then
   * the wait time, then the URL limit; the first failure is the one thrown.
   *
   * @param {object} config - Config to validate.
   * @throws {Error} Describing the first invalid field.
   */
  function validateConfig(config) {
    compileRegExp(config.indexUrlPattern, "INDEX_URL_PATTERN");
    compileRegExp(config.linkHrefPattern, "LINK_HREF_PATTERN");

    if (!config.targetText) {
      throw new Error("TARGET_TEXT is empty.");
    }

    const waitMs = Number(config.waitAfterLoadMs);
    const isWaitValid = Number.isFinite(waitMs) && waitMs >= 0;
    if (!isWaitValid) {
      throw new Error("WAIT_AFTER_LOAD_MS must be 0 or greater.");
    }

    const urlLimit = Number(config.maxUrls);
    const isLimitValid = Number.isInteger(urlLimit) && urlLimit >= 1;
    if (!isLimitValid) {
      throw new Error("MAX_URLS must be an integer greater than 0.");
    }
  }

  /**
   * Structural check for a persisted scan state, running or finished.
   *
   * - active === true: the scan is in progress and may be auto-resumed.
   * - active === false: the scan is finished and kept only to show results;
   *   such a state must carry a string finishedAt and have currentIndex equal
   *   to the queue length.
   *
   * @param {*} state - Value read from storage.
   * @returns {boolean} true when every field has the expected shape.
   */
  function isValidStoredState(state) {
    if (!state || typeof state !== "object") return false;

    const isNonEmptyString = (value) => typeof value === "string" && value !== "";
    const { active, indexUrl, queue, currentIndex, matchedUrls } = state;

    if (typeof active !== "boolean") return false;
    if (!isNonEmptyString(indexUrl)) return false;

    const hasUsableQueue =
      Array.isArray(queue) &&
      queue.length > 0 &&
      queue.every((url) => isNonEmptyString(url));
    if (!hasUsableQueue) return false;

    // currentIndex may equal queue.length: that is the "everything processed" position.
    const isIndexInRange =
      Number.isInteger(currentIndex) &&
      currentIndex >= 0 &&
      currentIndex <= queue.length;
    if (!isIndexInRange) return false;

    const hasUsableMatches =
      Array.isArray(matchedUrls) &&
      matchedUrls.every((url) => isNonEmptyString(url));
    if (!hasUsableMatches) return false;

    if (!isValidConfig(state.configSnapshot)) return false;
    if (typeof state.startedAt !== "string") return false;

    if (active) return true;

    return (
      typeof state.finishedAt === "string" && currentIndex === queue.length
    );
  }

  /**
   * Tell whether a stored state is both well-formed and still running.
   *
   * @param {*} state - Value read from storage.
   * @returns {boolean} true only for a valid state whose active flag is true.
   */
  function isActiveScanState(state) {
    if (!isValidStoredState(state)) return false;
    return state.active === true;
  }

  /**
   * Tell whether a URL belongs to the scan described by `state`: either the
   * index page itself or one of the queued pages (exact string match).
   *
   * @param {string} url - URL to look up.
   * @param {*} state - Scan state; anything without an array queue yields false.
   * @returns {boolean} true when the URL is the index URL or is in the queue.
   */
  function isUrlInScanScope(url, state) {
    const hasQueue = Boolean(state) && Array.isArray(state.queue);
    if (!hasQueue) return false;

    if (url === state.indexUrl) return true;
    return state.queue.includes(url);
  }

  /**
   * Gather the absolute URLs of all links on the current page that match
   * config.linkHrefPattern, de-duplicated in document order and capped at
   * config.maxUrls.
   *
   * @param {object} config - Needs linkHrefPattern and maxUrls.
   * @returns {string[]} Matching URLs, first occurrence first.
   * @throws {Error} When linkHrefPattern does not compile.
   */
  function collectLinksFromIndexPage(config) {
    const hrefMatcher = compileRegExp(
      config.linkHrefPattern,
      "LINK_HREF_PATTERN"
    );

    // A Set keeps insertion order, so the first occurrence of each URL wins.
    const uniqueUrls = new Set();

    for (const anchor of document.querySelectorAll("a[href]")) {
      const absoluteUrl = normalizeUrl(anchor.getAttribute("href"));

      if (absoluteUrl && hrefMatcher.test(absoluteUrl)) {
        uniqueUrls.add(absoluteUrl);
      }
    }

    const urlLimit = Number(config.maxUrls);
    return [...uniqueUrls].slice(0, urlLimit);
  }

  /**
   * Start a new scan from the current page.
   *
   * Steps: deal with any previously stored state (an invalid one is dropped
   * silently, a valid one only after the user confirms), validate the saved
   * config, check that the current page is the index page, collect the
   * matching links, persist a fresh active state and navigate to the first URL.
   * Every failure is reported to the user and ends the function early.
   *
   * @returns {Promise<void>}
   */
  async function startScan() {
    // Shared reporting for errors thrown by validation / link collection.
    const reportFailure = (error) => {
      alert(error.message);
      log(error);
    };

    const previousState = await loadState();

    if (previousState) {
      if (!isValidStoredState(previousState)) {
        log("Invalid existing scan state found. Clearing it:", previousState);
      } else {
        const isRunning = previousState.active === true;

        const promptLines = isRunning
          ? [
              "An active scan state already exists.",
              "",
              `Total URLs: ${previousState.queue.length}`,
              `Processed URLs: ${previousState.currentIndex}`,
              "",
              "Discard the existing state and start a new scan?",
            ]
          : [
              "A previous scan result is saved.",
              "",
              `Total URLs: ${previousState.queue.length}`,
              `Matched URLs: ${previousState.matchedUrls.length}`,
              "",
              "Discard the previous result and start a new scan?",
            ];

        if (!confirm(promptLines.join("\n"))) {
          log(
            isRunning
              ? "Start canceled because an active scan state exists."
              : "Start canceled because a completed scan result exists."
          );
          return;
        }
      }

      // Reached for an invalid state and for a confirmed discard alike.
      await clearState();
    }

    const config = await loadConfig();

    try {
      validateConfig(config);
    } catch (error) {
      reportFailure(error);
      return;
    }

    const indexUrlRegExp = compileRegExp(
      config.indexUrlPattern,
      "INDEX_URL_PATTERN"
    );
    const pageUrl = location.href;

    if (!indexUrlRegExp.test(pageUrl)) {
      log("This page is not recognized as the index page:", pageUrl);

      const notIndexMessage = [
        "This page is not recognized as the index page.",
        "",
        `Current URL: ${pageUrl}`,
        `INDEX_URL_PATTERN: ${config.indexUrlPattern}`,
      ].join("\n");

      alert(notIndexMessage);
      return;
    }

    let urls;

    try {
      urls = collectLinksFromIndexPage(config);
    } catch (error) {
      reportFailure(error);
      return;
    }

    if (urls.length === 0) {
      log("No matching links found.");

      const noLinksMessage = [
        "No matching links were found.",
        "",
        `LINK_HREF_PATTERN: ${config.linkHrefPattern}`,
      ].join("\n");

      alert(noLinksMessage);
      return;
    }

    const [firstUrl] = urls;

    // The config is snapshotted into the state so later config edits do not
    // affect a scan that is already running.
    await saveState({
      active: true,
      indexUrl: pageUrl,
      queue: urls,
      currentIndex: 0,
      matchedUrls: [],
      configSnapshot: config,
      startedAt: new Date().toISOString(),
      finishedAt: null,
    });

    log("Scan started.");
    log("Config:", config);
    log("Collected URLs:", urls.length);
    log("First URL:", firstUrl);

    location.href = firstUrl;
  }

  /**
   * Process the current page as one step of a running scan.
   *
   * Waits config.waitAfterLoadMs, checks whether the page text contains the
   * target text, records the current URL on a match, advances the position,
   * persists the state and then navigates to the next queued URL, or back to
   * the index page once the queue is exhausted.
   *
   * The stored state is re-read after the wait. If it was cleared, replaced by
   * a different scan, or advanced in the meantime (for example by "Emergency
   * stop scan" or "Reset scan state"), this step stops without saving or
   * navigating, so that the stale in-memory copy cannot resurrect the scan.
   *
   * @param {object} state - Active scan state loaded when the page started.
   * @returns {Promise<void>}
   */
  async function continueScanOnTargetPage(state) {
    if (!isActiveScanState(state)) {
      log("Invalid active scan state before processing target page. Clearing state:", state);
      await clearState();
      return;
    }

    const config = state.configSnapshot;
    const currentUrl = location.href;
    const expectedUrl = state.queue[state.currentIndex];

    log(`Processing ${state.currentIndex + 1}/${state.queue.length}`);
    log("Current URL:", currentUrl);
    log("Expected URL:", expectedUrl);

    await sleep(Number(config.waitAfterLoadMs));

    // The wait is the window in which the user can stop or reset the scan;
    // storage is the source of truth, not the copy loaded before the wait.
    const latestState = await loadState();
    const isSameScanStep =
      isActiveScanState(latestState) &&
      latestState.startedAt === state.startedAt &&
      latestState.currentIndex === state.currentIndex;

    if (!isSameScanStep) {
      log("Scan state was cleared or changed while waiting. Stopping without saving.");
      return;
    }

    const pageText = document.body ? document.body.innerText : "";
    const matched = pageText.includes(config.targetText);

    if (matched) {
      if (!state.matchedUrls.includes(currentUrl)) {
        state.matchedUrls.push(currentUrl);
      }
      log("Matched:", currentUrl);
    } else {
      log("Not matched:", currentUrl);
    }

    state.currentIndex += 1;

    if (state.currentIndex >= state.queue.length) {
      // Queue exhausted: keep the state, flagged as finished, for result display.
      state.active = false;
      state.currentIndex = state.queue.length;
      state.finishedAt = new Date().toISOString();

      await saveState(state);

      log("Scan finished.");
      log("Matched URLs:", state.matchedUrls);
      console.table(state.matchedUrls);

      alert(
        [
          "Scan completed.",
          `Total URLs: ${state.queue.length}`,
          `Matched URLs: ${state.matchedUrls.length}`,
          "",
          "The result has been saved to Tampermonkey storage.",
          "Use the Tampermonkey menu item “Show scan results” to view it.",
        ].join("\n")
      );

      location.href = state.indexUrl;
      return;
    }

    // Persist progress before navigating so the next page resumes from here.
    await saveState(state);

    const nextUrl = state.queue[state.currentIndex];

    if (typeof nextUrl !== "string" || !nextUrl) {
      log("Invalid next URL. Clearing scan state:", nextUrl);
      await clearState();
      return;
    }

    log("Next URL:", nextUrl);

    location.href = nextUrl;
  }

  /**
   * Show a summary of the stored scan (running or completed) in an alert and
   * print the matched URLs with console.table.
   *
   * @returns {Promise<void>}
   */
  async function showResults() {
    const savedState = await loadState();

    if (!savedState) {
      log("No saved state.");
      alert("No saved state was found.");
      return;
    }

    if (!isValidStoredState(savedState)) {
      log("Invalid saved state found:", savedState);
      alert("The saved state is invalid. Run “Reset scan state” if necessary.");
      return;
    }

    console.table(savedState.matchedUrls || []);

    const statusLabel = savedState.active ? "Running" : "Completed";
    const summary = [
      `Status: ${statusLabel}`,
      `Total URLs: ${savedState.queue.length}`,
      `Processed URLs: ${savedState.currentIndex}`,
      `Matched URLs: ${savedState.matchedUrls.length}`,
      "",
      "See console.table output for details.",
    ].join("\n");

    alert(summary);
  }

  /**
   * Menu action: delete the stored scan state and confirm it to the user.
   *
   * @returns {Promise<void>}
   */
  async function resetScanState() {
    const logMessage = "State cleared.";
    const userMessage = "The scan state has been cleared.";

    await clearState();

    log(logMessage);
    alert(userMessage);
  }

  /**
   * Menu/panel action: delete the stored scan state to halt a running scan,
   * then tell the user it was cleared.
   *
   * @returns {Promise<void>}
   */
  async function emergencyStopScan() {
    const logMessage = "Emergency stop: scan state cleared.";
    const userMessage =
      "The scan state has been cleared. Reload the page if necessary.";

    await clearState();

    log(logMessage);
    alert(userMessage);
  }

  /**
   * Toggle the floating config panel.
   *
   * If the panel is already in the document it is removed (together with its
   * stylesheet) and the function returns. Otherwise the panel is built, filled
   * with the current config and wired to its buttons: Save, Save & Start,
   * Reset Config, Emergency Stop and close.
   *
   * The stylesheet carries its own id so that every close path, including the
   * toggle performed by a later call, can remove it; otherwise each open/close
   * cycle would leave one more <style> element behind.
   *
   * @returns {Promise<void>}
   */
  async function showConfigPanel() {
    const PANEL_ID = "lc-finder-config-panel";
    const STYLE_ID = "lc-finder-config-panel-style";

    const existing = document.getElementById(PANEL_ID);
    if (existing) {
      existing.remove();
      document.getElementById(STYLE_ID)?.remove();
      return;
    }

    // Drop a stylesheet orphaned by an earlier close before adding a new one.
    document.getElementById(STYLE_ID)?.remove();

    const config = await loadConfig();

    const panel = document.createElement("div");
    panel.id = PANEL_ID;

    // Static markup only; config values are assigned through .value below, so
    // no stored text is ever parsed as HTML.
    panel.innerHTML = `
      <div class="lc-finder-header">
        <strong>Link Content Finder</strong>
        <button type="button" data-action="close">×</button>
      </div>

      <label>
        INDEX_URL_PATTERN
        <input type="text" data-field="indexUrlPattern">
      </label>

      <label>
        LINK_HREF_PATTERN
        <input type="text" data-field="linkHrefPattern">
      </label>

      <label>
        TARGET_TEXT
        <textarea data-field="targetText" rows="4"></textarea>
      </label>

      <label>
        WAIT_AFTER_LOAD_MS
        <input type="number" data-field="waitAfterLoadMs" min="0" step="100">
      </label>

      <label>
        MAX_URLS
        <input type="number" data-field="maxUrls" min="1" step="1">
      </label>

      <div class="lc-finder-actions">
        <button type="button" data-action="save">Save</button>
        <button type="button" data-action="save-and-start">Save & Start</button>
        <button type="button" data-action="reset-config">Reset Config</button>
        <button type="button" data-action="emergency-stop">Emergency Stop</button>
      </div>

      <div class="lc-finder-help">
        <p>
          Enter the body of the regular expression, not a JavaScript regex literal.
        </p>
        <p>
          Example: <code>^https://example\\.com/index</code>
        </p>
        <p>
          Example: <code>/articles/\\d+</code>
        </p>
      </div>
    `;

    const style = document.createElement("style");
    style.id = STYLE_ID;
    style.textContent = `
      #lc-finder-config-panel {
        position: fixed;
        right: 16px;
        bottom: 16px;
        z-index: 999999;
        width: 420px;
        max-width: calc(100vw - 32px);
        padding: 16px;
        box-sizing: border-box;
        background: #fff;
        color: #222;
        border: 1px solid #ccc;
        border-radius: 8px;
        box-shadow: 0 8px 24px rgba(0, 0, 0, 0.2);
        font-family: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
        font-size: 13px;
        line-height: 1.4;
      }

      #lc-finder-config-panel .lc-finder-header {
        display: flex;
        justify-content: space-between;
        align-items: center;
        margin-bottom: 12px;
      }

      #lc-finder-config-panel .lc-finder-header button {
        font-size: 18px;
        line-height: 1;
      }

      #lc-finder-config-panel label {
        display: block;
        margin: 10px 0;
        font-weight: 600;
      }

      #lc-finder-config-panel input,
      #lc-finder-config-panel textarea {
        display: block;
        width: 100%;
        box-sizing: border-box;
        margin-top: 4px;
        padding: 6px 8px;
        border: 1px solid #aaa;
        border-radius: 4px;
        font: 13px ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
      }

      #lc-finder-config-panel textarea {
        resize: vertical;
      }

      #lc-finder-config-panel .lc-finder-actions {
        display: flex;
        gap: 8px;
        flex-wrap: wrap;
        margin-top: 12px;
      }

      #lc-finder-config-panel button {
        cursor: pointer;
        padding: 6px 10px;
        border: 1px solid #999;
        border-radius: 4px;
        background: #f7f7f7;
        color: #222;
      }

      #lc-finder-config-panel button:hover {
        background: #eee;
      }

      #lc-finder-config-panel .lc-finder-help {
        margin-top: 12px;
        color: #555;
        font-size: 12px;
      }

      #lc-finder-config-panel .lc-finder-help p {
        margin: 4px 0;
      }

      #lc-finder-config-panel code {
        font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
      }
    `;

    document.documentElement.appendChild(style);
    document.body.appendChild(panel);

    panel.querySelector('[data-field="indexUrlPattern"]').value =
      config.indexUrlPattern;
    panel.querySelector('[data-field="linkHrefPattern"]').value =
      config.linkHrefPattern;
    panel.querySelector('[data-field="targetText"]').value = config.targetText;
    panel.querySelector('[data-field="waitAfterLoadMs"]').value =
      config.waitAfterLoadMs;
    panel.querySelector('[data-field="maxUrls"]').value = config.maxUrls;

    // Remove the panel and its stylesheet together.
    function closePanel() {
      panel.remove();
      style.remove();
    }

    // Build a config object from the current field values. Patterns are
    // trimmed; the target text is kept verbatim; numeric fields go through Number().
    function readConfigFromPanel() {
      return {
        indexUrlPattern: panel
          .querySelector('[data-field="indexUrlPattern"]')
          .value.trim(),
        linkHrefPattern: panel
          .querySelector('[data-field="linkHrefPattern"]')
          .value.trim(),
        targetText: panel.querySelector('[data-field="targetText"]').value,
        waitAfterLoadMs: Number(
          panel.querySelector('[data-field="waitAfterLoadMs"]').value
        ),
        maxUrls: Number(panel.querySelector('[data-field="maxUrls"]').value),
      };
    }

    // Validate and persist the panel's values; validateConfig throws on bad
    // input and the click handler below reports that to the user.
    async function saveFromPanel() {
      const nextConfig = readConfigFromPanel();
      validateConfig(nextConfig);
      await saveConfig(nextConfig);
      log("Config saved:", nextConfig);
      alert("Config saved.");
      return nextConfig;
    }

    // One delegated listener handles every button through its data-action.
    panel.addEventListener("click", async (event) => {
      const action = event.target?.dataset?.action;
      if (!action) return;

      try {
        if (action === "close") {
          closePanel();
          return;
        }

        if (action === "save") {
          await saveFromPanel();
          return;
        }

        if (action === "save-and-start") {
          await saveFromPanel();
          closePanel();
          await startScan();
          return;
        }

        if (action === "reset-config") {
          const confirmed = confirm("Reset the config to the default values?");
          if (!confirmed) return;

          await resetConfig();
          closePanel();
          // Reopen so the fields show the default values again.
          await showConfigPanel();
          return;
        }

        if (action === "emergency-stop") {
          await emergencyStopScan();
          return;
        }
      } catch (error) {
        log(error);
        alert(error.message);
      }
    });
  }

  // Userscript-manager menu entries, registered in display order.
  const menuCommands = [
    ["Show config panel", showConfigPanel],
    ["Start link content scan", startScan],
    ["Show scan results", showResults],
    ["Reset scan state", resetScanState],
    ["Emergency stop scan", emergencyStopScan],
  ];

  for (const [caption, handler] of menuCommands) {
    GM.registerMenuCommand(caption, handler);
  }

  /**
   * Auto-resume guard.
   *
   * This script never starts a new scan automatically on page load.
   * Completed states are kept for result display.
   * Only active states are eligible for auto-resume.
   *
   * Order of the checks below: no state -> idle; invalid state -> cleared;
   * completed state -> left alone; URL outside the scan scope -> left alone;
   * index page -> jump to the next queued URL (or mark the scan finished);
   * any other in-scope page -> processed as a target page.
   */
  const state = await loadState();

  if (!state) {
    log("Idle. Use the Tampermonkey menu: Show config panel or Start link content scan.");
    return;
  }

  if (!isValidStoredState(state)) {
    log("Invalid scan state found. Clearing state:", state);
    await clearState();
    return;
  }

  if (state.active === false) {
    log("Completed scan result exists. Not auto-resuming.");
    return;
  }

  // NOTE(review): isValidStoredState() above already requires a boolean
  // `active` and the previous branch returned for `false`, so this guard looks
  // unreachable — confirm before removing.
  if (!isActiveScanState(state)) {
    log("No active scan state. Doing nothing.");
    return;
  }

  // Pages that are neither the recorded index URL nor in the queue are left
  // untouched while a scan is active.
  if (!isUrlInScanScope(location.href, state)) {
    log(
      "Active scan state exists, but the current URL is outside the scan scope. Doing nothing.",
      location.href
    );
    return;
  }

  // Use the config captured when the scan started, not the currently saved one.
  const config = state.configSnapshot;

  let indexUrlRegExp;

  // NOTE(review): isValidStoredState() calls isValidConfig(), which compiles
  // this same pattern, so the catch branch looks like a defensive fallback only.
  try {
    indexUrlRegExp = compileRegExp(
      config.indexUrlPattern,
      "INDEX_URL_PATTERN"
    );
  } catch (error) {
    log("Invalid config snapshot. Clearing scan state:", error);
    await clearState();
    alert(
      [
        "The saved scan state's config is invalid, so the state was cleared.",
        "",
        error.message,
      ].join("\n")
    );
    return;
  }

  // On the index page: nothing to scan here, only navigation or finalization.
  if (indexUrlRegExp.test(location.href)) {
    if (state.currentIndex < state.queue.length) {
      const nextUrl = state.queue[state.currentIndex];

      if (typeof nextUrl !== "string" || !nextUrl) {
        log("Invalid next URL. Clearing scan state:", nextUrl);
        await clearState();
        return;
      }

      log("Resuming from index page. Next URL:", nextUrl);
      location.href = nextUrl;
    } else {
      // currentIndex has reached the end of the queue while the state is still
      // flagged active: finalize it in place.
      log("Active state is complete but not marked as finished. Marking it as completed.");

      state.active = false;
      state.currentIndex = state.queue.length;
      state.finishedAt = new Date().toISOString();

      await saveState(state);
    }

    return;
  }

  // In-scope page that is not the index page: treat it as a target page.
  await continueScanOnTargetPage(state);
})();