// LLM Snapshotter — JSON only (generic, top-right button)

// Generischer, ballastarmer JSON-Snapshotter für LLM-Kontext: bereinigt DOM, erhält Semantik, segmentiert robust (auch ohne Überschriften), erfasst Links/Bilder (dedupliziert, gelabelt), berechnet Hashes & Coverage. Einziger UI-Button: oben rechts "Download JSON".

// ==UserScript==
// @name         LLM Snapshotter — JSON only (generic, top-right button)
// @namespace    https://pzwe.dev/llm-snapshotter
// @version      1.2.0
// @description  Generischer, ballastarmer JSON-Snapshotter für LLM-Kontext: bereinigt DOM, erhält Semantik, segmentiert robust (auch ohne Überschriften), erfasst Links/Bilder (dedupliziert, gelabelt), berechnet Hashes & Coverage. Einziger UI-Button: oben rechts "Download JSON".
// @license      MIT
// @match        *://*/*
// @run-at       document-idle
// @grant        GM_download
// @grant        GM_addStyle
// ==/UserScript==

(function () {
  'use strict';

  // ─────────────────────────────────────────────────────────────────────────────
  // KONZEPT & SICHERHEIT
  // - Rein im Browser (Tampermonkey), keine externen Requests/Libs.
  // - Keine Ausführung von Seitenskripten; nur Text/Attribute extrahieren.
  // - Snapshot ist für LLM-Arbeit gedacht, nicht menschliche Darstellung.
  // - "Ballast" (Navigation, Ads, Cookie-Banner etc.) wird entfernt.
  // - Fehlende Metadaten werden NICHT erfunden -> null + notes.missing.
  // ─────────────────────────────────────────────────────────────────────────────

  // ─────────────────────────────────────────────────────────────────────────────
  // KONFIGURATION
  // ─────────────────────────────────────────────────────────────────────────────
  // Central tuning knobs for the snapshotter. `profiles` maps a profile key to the
  // CSS selectors that stripBoilerplate removes from the working copy.
  const CONFIG = {
    profile: 'auto', // one of 'auto'|'news'|'blog'|'docs'|'spa'; 'auto' defers to pickProfile()
    profiles: {
      news: { selectors: ['nav','header','footer','aside','[role="navigation"]','[aria-label*="cookie"]','[class*="cookie"]','[class*="advert"]','[id*="advert"]','[class*="promo"]','.subscribe','.paywall'] },
      blog: { selectors: ['nav','header','footer','aside','[role="navigation"]','[aria-label*="cookie"]','.subscribe'] },
      docs: { selectors: ['nav[role="navigation"]','header[role="banner"]','footer','[aria-label*="cookie"]'] },
      spa:  { selectors: ['nav','header','footer','aside','[aria-label*="cookie"]','[class*="overlay"]','[class*="modal"]'] }
    },
    domStability: { quietMs: 500, capMs: 3000, retries: 3, backoffMs: [200, 400, 800] },
    hardTextCap: 2 * 1024 * 1024,      // total limit summed over all section texts (enforceCap compares string lengths)
    fallbackBlockTargetChars: 1200,     // target size per block for the fallback segmentation
    fallbackBlockHardMax: 1800,         // hard upper bound per fallback block
    imageLimit: 150,                    // maximum number of images in the manifest (after de-duplication)
  };

  // ─────────────────────────────────────────────────────────────────────────────
  // UTILS
  // ─────────────────────────────────────────────────────────────────────────────
  // Promise-based delay: resolves (with no value) after roughly `ms` milliseconds.
  const sleep = (ms) => new Promise((resolve) => {
    setTimeout(resolve, ms);
  });

  /**
   * Current instant as an ISO-8601 UTC timestamp with second precision,
   * e.g. "2024-05-01T12:34:56Z".
   *
   * `Date#toISOString()` already reports UTC, so no timezone-offset arithmetic
   * is applied. Shifting the instant by the local offset first (as an earlier
   * version did) produces local wall-clock time mislabelled with the "Z" suffix.
   *
   * @returns {string} UTC timestamp without fractional seconds.
   */
  function isoUTC() {
    return new Date().toISOString().replace(/\.\d{3}Z$/, 'Z');
  }

  /**
   * Normalizes whitespace: every run of whitespace becomes a single space and
   * the ends are trimmed. Falsy input is treated as the empty string.
   *
   * @param {string} s Text to normalize.
   * @returns {string} Normalized text.
   */
  function cleanWS(s) {
    const input = s || '';
    const collapsed = input.replace(/\s+/g, ' ');
    return collapsed.trim();
  }

  /**
   * Reports whether `el` is an Element that has a non-empty bounding box and is
   * not hidden through computed `visibility: hidden` or `display: none`.
   *
   * @param {*} el Candidate node.
   * @returns {boolean} True when the element passes all of the checks above.
   */
  function isVisible(el) {
    if (!(el instanceof Element)) return false;

    const box = el.getBoundingClientRect();
    const computed = getComputedStyle(el);

    const hasArea = box.width > 0 && box.height > 0;
    const cssHidden = computed.visibility === 'hidden' || computed.display === 'none';
    return hasArea && !cssHidden;
  }

  /**
   * Hashes `text` with SHA-256 via Web Crypto and returns the lowercase hex
   * digest. If that fails for any reason, a deterministic, NON-cryptographic
   * 32-bit FNV-1a hash over the UTF-16 code units is returned instead, marked
   * with the prefix "fallback_".
   *
   * @param {string} text Text to hash.
   * @returns {Promise<string>} Hex digest, or "fallback_<hex>".
   */
  async function sha256(text) {
    try {
      const bytes = new TextEncoder().encode(text);
      const digest = await crypto.subtle.digest('SHA-256', bytes);
      let hex = '';
      for (const byte of new Uint8Array(digest)) {
        hex += byte.toString(16).padStart(2, '0');
      }
      return hex;
    } catch {
      // FNV-1a: offset basis 2166136261, prime 16777619, kept within 32 bits.
      let hash = 2166136261 >>> 0;
      let index = 0;
      while (index < text.length) {
        hash ^= text.charCodeAt(index);
        hash = Math.imul(hash, 16777619);
        index += 1;
      }
      return 'fallback_' + (hash >>> 0).toString(16);
    }
  }

  /**
   * Resolves `url` against the current page location and removes the common
   * utm_* tracking parameters.
   *
   * @param {string} url Absolute or relative URL.
   * @returns {string} The absolute URL without utm_* parameters, or the input
   *   unchanged when it cannot be parsed.
   */
  function stripUtm(url) {
    const trackingParams = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content'];
    try {
      const parsed = new URL(url, location.href);
      for (const name of trackingParams) {
        parsed.searchParams.delete(name);
      }
      return parsed.toString();
    } catch {
      return url;
    }
  }

  /**
   * Guesses a boilerplate profile from DOM markers and the lower-cased URL path.
   * Rules are evaluated in order news → blog → docs → spa; when nothing matches
   * the result is 'news'.
   *
   * @returns {'news'|'blog'|'docs'|'spa'} Profile key.
   */
  function pickProfile() {
    const pathLower = location.pathname.toLowerCase();
    const pageHas = (selector) => Boolean(document.querySelector(selector));

    if (pageHas('main article') || /news|article|story/.test(pathLower)) {
      return 'news';
    }
    if (/blog/.test(pathLower)) {
      return 'blog';
    }
    if (pageHas('nav[aria-label="Table of contents"], nav.toc') || /docs|guide|reference/.test(pathLower)) {
      return 'docs';
    }
    if (pageHas('[data-reactroot], [class*="app-"], [id*="app-"]')) {
      return 'spa';
    }
    return 'news';
  }

  /**
   * Waits until the DOM has been free of mutations for `quietMs`.
   *
   * Each attempt observes `document` for at most `capMs`. If the cap is hit, the
   * function backs off (`backoffMs[attempt]`, last entry reused) and tries again,
   * for `retries` additional attempts after the first one.
   *
   * Both timers of an attempt are cleared as soon as the attempt settles, so no
   * stale cap timer keeps running after a "quiet" result, and no backoff sleep
   * is spent after the final attempt.
   *
   * @returns {Promise<boolean>} true when a quiet window was observed, false
   *   when every attempt ran into the cap.
   */
  async function waitDomStable() {
    const { quietMs, capMs, retries, backoffMs } = CONFIG.domStability;

    for (let attempt = 0; attempt <= retries; attempt += 1) {
      const outcome = await new Promise((resolve) => {
        let lastMutation = Date.now();
        let pollId;
        let capId;

        const observer = new MutationObserver(() => { lastMutation = Date.now(); });
        observer.observe(document, { childList: true, subtree: true, attributes: true, characterData: true });

        // Single exit path: release the observer and BOTH timers, then resolve.
        const settle = (result) => {
          clearInterval(pollId);
          clearTimeout(capId);
          observer.disconnect();
          resolve(result);
        };

        pollId = setInterval(() => {
          if (Date.now() - lastMutation >= quietMs) settle('quiet');
        }, 50);
        capId = setTimeout(() => settle('cap'), capMs);
      });

      if (outcome === 'quiet') return true;
      if (attempt < retries) {
        await sleep(backoffMs[Math.min(attempt, backoffMs.length - 1)]);
      }
    }
    return false;
  }

  /**
   * Heuristically picks the element that holds the main content.
   *
   * Candidates are the first visible <article>, the first visible <main>, and
   * every visible <div>/<section> with more than 400 characters of trimmed text.
   * The candidate with the most text wins; on a tie the earliest candidate is
   * kept. <article>/<main> get no extra priority: they only win if they are the
   * largest candidate. Falls back to document.body without any candidate.
   *
   * Each candidate's text length is measured exactly once and the maximum is
   * tracked in a single pass, instead of re-reading `innerText` inside a sort
   * comparator.
   *
   * @returns {Element} The chosen content root.
   */
  function detectMain() {
    const textLength = (el) => (el.innerText || el.textContent || '').trim().length;

    let best = null;
    let bestLength = -1;
    // Strict ">" keeps the first candidate among equals (same pick as a stable descending sort).
    const consider = (el, length = textLength(el)) => {
      if (length > bestLength) {
        best = el;
        bestLength = length;
      }
    };

    const article = document.querySelector('article');
    if (article && isVisible(article)) consider(article);

    const main = document.querySelector('main');
    if (main && isVisible(main)) consider(main);

    document.querySelectorAll('div,section').forEach((el) => {
      if (!isVisible(el)) return;
      const length = textLength(el);
      if (length > 400) consider(el, length);
    });

    return best || document.body;
  }

  // ─────────────────────────────────────────────────────────────────────────────
  // METADATEN
  // ─────────────────────────────────────────────────────────────────────────────
  /**
   * Reads document-level metadata from the live page without inventing values.
   *
   * @returns {{source: object, documentMeta: object, missing: string[]}}
   *   `source` holds url, canonical_url, fetched_at and lang ('und' when the
   *   <html> element has no lang attribute); `documentMeta` holds title,
   *   authors, published_at, updated_at (null / empty array when absent) and a
   *   null hash placeholder; `missing` lists the fields that could not be found.
   */
  function extractMeta() {
    const readContent = (selector) => document.querySelector(selector)?.getAttribute('content') || null;

    const lang = document.documentElement.getAttribute('lang') || 'und';

    // Canonical link if present, otherwise the current URL; made absolute when parseable.
    const canonicalHref = document.querySelector('link[rel="canonical"]')?.getAttribute('href');
    let canonical = canonicalHref || location.href;
    try {
      canonical = new URL(canonical, location.href).toString();
    } catch {
      // Unparseable canonical value: keep it as found.
    }

    const title = document.title || null;
    const author = readContent('meta[name="author"]');
    const published_at = readContent('meta[property="article:published_time"]');
    const updated_at = readContent('meta[property="article:modified_time"]');

    const checks = [
      [title, 'document.title'],
      [author, 'document.authors'],
      [published_at, 'document.published_at'],
      [updated_at, 'document.updated_at'],
    ];
    const missing = checks.filter(([value]) => !value).map(([, label]) => label);

    const source = { url: location.href, canonical_url: canonical, fetched_at: isoUTC(), lang };
    const documentMeta = {
      title,
      authors: author ? [author] : [],
      published_at,
      updated_at,
      content_hash_sha256: null,
    };
    return { source, documentMeta, missing };
  }

  // ─────────────────────────────────────────────────────────────────────────────
  // BOILERPLATE-ENTFERNUNG
  // ─────────────────────────────────────────────────────────────────────────────
  /**
   * Removes noise from `root` in place: the selectors of the chosen profile
   * (falling back to the 'news' profile for unknown keys), script/style/
   * noscript/template elements, share/social widgets, and hidden elements.
   *
   * Hidden detection must work on detached trees: for a node that is not
   * connected to a document, getComputedStyle() reports no usable `display` /
   * `visibility`, so a computed-style-only check silently removes nothing.
   * The `hidden` attribute and inline styles are therefore checked on every
   * node, and computed style is consulted only for connected nodes.
   *
   * @param {Element} root Tree to clean (typically a detached clone).
   * @param {string} profileKey Key into CONFIG.profiles.
   * @returns {void}
   */
  function stripBoilerplate(root, profileKey) {
    const profile = CONFIG.profiles[profileKey] || CONFIG.profiles.news;
    const noiseSelector = [
      ...profile.selectors,
      'script', 'style', 'noscript', 'template',
      '[class*="share"]', '[class*="social"]',
    ].join(',');
    root.querySelectorAll(noiseSelector).forEach((node) => node.remove());

    const isHidden = (node) => {
      if (node.hasAttribute('hidden')) return true;
      const inline = node.style; // absent on elements without inline style support
      if (inline && (inline.display === 'none' || inline.visibility === 'hidden')) return true;
      if (!node.isConnected) return false; // computed style is meaningless off-document
      const computed = getComputedStyle(node);
      return computed.display === 'none' || computed.visibility === 'hidden';
    };

    root.querySelectorAll('*').forEach((node) => {
      if (isHidden(node)) node.remove();
    });
  }

  // ─────────────────────────────────────────────────────────────────────────────
  // LINK-LABELING (generisch, domainunabhängig)
  // ─────────────────────────────────────────────────────────────────────────────
  /**
   * Classifies a link relative to the current page.
   *
   * - `type`: 'fragment' (same host, same path, same or no query, with a hash),
   *   'internal' (same hostname), 'external' (different hostname), or
   *   'unknown' when the URL cannot be parsed.
   * - `is_fragment` is always a real boolean, and a link only counts as a
   *   fragment when it also points at the current host (a foreign URL that
   *   merely shares the path is 'external').
   * - `redirect_like`: URL looks like a redirector (redirect=/url= parameter,
   *   /r|/redir|/out path prefix, very long query, or "===" in the query).
   *
   * @param {string} href Absolute or relative URL.
   * @returns {{type: string, is_fragment: boolean, path_depth: (number|null),
   *   redirect_like: boolean, hostname: (string|null), href: string}}
   */
  function labelLink(href) {
    try {
      const u = new URL(href, location.href);
      const sameHost = u.hostname === location.hostname;
      const samePage = u.pathname === location.pathname && (!u.search || u.search === location.search);
      const isFragment = Boolean(u.hash) && sameHost && samePage;
      const pathDepth = u.pathname.split('/').filter(Boolean).length;
      const q = u.search || '';
      const redirectLike = /redirect=|url=|^https?:\/\/[^/]+\/(r|redir|out)\b/.test(u.href) || q.length > 150 || /[=]{3,}/.test(q);

      let type = 'external';
      if (isFragment) type = 'fragment';
      else if (sameHost) type = 'internal';

      return { type, is_fragment: isFragment, path_depth: pathDepth, redirect_like: redirectLike, hostname: u.hostname, href: u.toString() };
    } catch {
      // Unparseable URL: classify neutrally and hand the input back untouched.
      return { type: 'unknown', is_fragment: false, path_depth: null, redirect_like: false, hostname: null, href };
    }
  }

  // ─────────────────────────────────────────────────────────────────────────────
  // SEMANTIKSAMMLUNG (Sections) + FALLBACK-SEGMENTIERUNG
  // ─────────────────────────────────────────────────────────────────────────────
  /**
   * Walks `root` in document order and turns it into heading-delimited sections
   * with text, tables, images and links.
   *
   * Behavior worth knowing:
   * - Works on detached trees. Visibility is judged by the `hidden` attribute
   *   and inline `display:none` / `visibility:hidden`; computed style is used
   *   only for connected nodes. A bounding-box test is deliberately NOT used:
   *   detached nodes always report an empty box, which would skip every element.
   * - Hidden elements and script/style/noscript/template are skipped together
   *   with their whole subtree, so their text cannot leak into a section.
   * - Lists, blockquotes and code blocks are rendered once (markdown-like) and
   *   their subtree is then scanned only for links/images/tables, so their text
   *   is not emitted a second time.
   * - Table contents are recorded in `tables`; their cell text is still walked
   *   as regular text (keeps layout-table pages usable).
   * - Every <img> is recorded once; inside a <figure> it carries the
   *   <figcaption> text as `caption`.
   * - Heading text opens the text of the section it starts.
   * - Without any heading, the collected text is split into "Block N" sections
   *   through chunkByNodesOrChars (paragraph boundaries = collected text parts);
   *   tables/images/links are attached to the first block.
   * - Section ids are made unique with a numeric suffix.
   *
   * @param {Element} root Cleaned content root.
   * @returns {Array<object>} Sections with id, path, heading, heading_level,
   *   tables, images, links, content_hash_sha256 (null) and text.
   */
  function collectSemantics(root) {
    const SKIPPED_TAGS = new Set(['script', 'style', 'noscript', 'template']);
    const sections = [];    // final result
    const path = [];        // current heading trail
    const usedIds = new Set();
    let headingsSeen = false;

    // Default section exists up front so links/images found before the first heading have a home.
    let current = newSection([], 'Main', 2);

    visitChildren(root, false, null);
    finalize(current);

    // Fallback: no headings at all → segment the collected text into blocks.
    if (!headingsSeen) {
      const whole = sections[0];
      const chunks = chunkByNodesOrChars(whole.text, { targetChars: CONFIG.fallbackBlockTargetChars, hardMax: CONFIG.fallbackBlockHardMax });
      const hasAssets = whole.tables.length > 0 || whole.images.length > 0 || whole.links.length > 0;
      if (chunks.length === 0 && hasAssets) chunks.push(''); // keep assets of text-less pages
      sections.length = 0;
      usedIds.clear();
      chunks.forEach((txt, i) => {
        const block = newSection([], `Block ${i + 1}`, 3);
        block.textParts.push(txt);
        if (i === 0) {
          block.tables = whole.tables;
          block.images = whole.images;
          block.links = whole.links;
        }
        finalize(block);
      });
    }

    return sections;

    // ── local helpers (hoisted) ──────────────────────────────────────────────

    function newSection(pathArr, heading, level) {
      return { id: '', path: pathArr.slice(), heading, heading_level: level, textParts: [], tables: [], images: [], links: [], content_hash_sha256: null };
    }

    // Joins the text parts, assigns a unique slug id and appends the section to the result.
    function finalize(sec) {
      if (!sec) return;
      sec.text = sec.textParts.join('\n\n').trim();
      delete sec.textParts;
      const base = (sec.path.join('>') + '|' + sec.heading).toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-+|-+$/g, '').slice(0, 64) || 'section';
      let id = base;
      let suffix = 1;
      while (usedIds.has(id)) {
        suffix += 1;
        id = `${base}-${suffix}`;
      }
      usedIds.add(id);
      sec.id = id;
      sections.push(sec);
    }

    // Hidden check that also works for nodes outside a document.
    function isHidden(el) {
      if (el.hasAttribute('hidden')) return true;
      const inline = el.style;
      if (inline && (inline.display === 'none' || inline.visibility === 'hidden')) return true;
      if (!el.isConnected) return false;
      const computed = getComputedStyle(el);
      return computed.display === 'none' || computed.visibility === 'hidden';
    }

    function visitChildren(el, quiet, caption) {
      for (const child of el.childNodes) visit(child, quiet, caption);
    }

    /**
     * @param {Node} node Node to process.
     * @param {boolean} quiet When true the text of this subtree is already
     *   represented (list/blockquote rendering); only collect links/images/tables.
     * @param {(string|null)} caption Caption of the enclosing <figure>, if any.
     */
    function visit(node, quiet, caption) {
      if (node.nodeType === 3) {
        if (quiet) return;
        const t = cleanWS(node.nodeValue);
        if (t) current.textParts.push(t);
        return;
      }
      if (node.nodeType !== 1) return;

      const el = node;
      const tag = el.tagName.toLowerCase();
      if (SKIPPED_TAGS.has(tag) || isHidden(el)) return;

      // Headings → new section + updated heading trail.
      if (!quiet && /^h[1-6]$/.test(tag)) {
        headingsSeen = true;
        const lvl = Number.parseInt(tag.slice(1), 10);
        const heading = cleanWS(el.textContent || '');
        while (path.length && path[path.length - 1].level >= lvl) path.pop();
        path.push({ level: lvl, heading });
        finalize(current);
        current = newSection(path.map((p) => p.heading), heading, lvl);
        visitChildren(el, quiet, caption);
        return;
      }

      if (tag === 'table') {
        current.tables.push(tableToJson(el));
        visitChildren(el, quiet, caption);
        return;
      }

      if (tag === 'figure') {
        const figCaption = cleanWS(el.querySelector('figcaption')?.textContent || '') || null;
        visitChildren(el, quiet, figCaption);
        return;
      }

      if (tag === 'img') {
        const src = el.getAttribute('src') || null;
        if (src) current.images.push({ alt: el.getAttribute('alt') || null, src, caption: caption || null });
        return;
      }

      if (tag === 'a') {
        const text = cleanWS(el.innerText || el.textContent || '');
        const href = el.getAttribute('href') || '';
        if (text && href) current.links.push({ text, href: stripUtm(href) });
        visitChildren(el, quiet, caption);
        return;
      }

      if (!quiet && (tag === 'ul' || tag === 'ol')) {
        const items = [...el.children]
          .filter((child) => child.tagName.toLowerCase() === 'li')
          .map((li) => cleanWS(li.textContent || ''));
        if (items.length) {
          const md = (tag === 'ol') ? items.map((t, i) => `${i + 1}. ${t}`).join('\n') : items.map((t) => `- ${t}`).join('\n');
          current.textParts.push(md);
        }
        visitChildren(el, true, caption); // text already rendered; still harvest links/images
        return;
      }

      if (!quiet && tag === 'blockquote') {
        const qt = cleanWS(el.textContent || '');
        if (qt) current.textParts.push(`> ${qt}`);
        visitChildren(el, true, caption);
        return;
      }

      if (tag === 'pre' || tag === 'code') {
        if (!quiet) {
          const code = el.textContent || '';
          // Break up embedded fences so the emitted block cannot be closed early.
          if (code) current.textParts.push('```\n' + code.replace(/```/g, '``\\`') + '\n```');
        }
        return; // the fence represents the whole subtree
      }

      visitChildren(el, quiet, caption);
    }

    function tableToJson(table) {
      const rows = [];
      table.querySelectorAll('tr').forEach((tr) => {
        const cells = [...tr.children].filter((c) => /(TD|TH)/.test(c.tagName)).map((c) => cleanWS(c.textContent || ''));
        rows.push(cells);
      });
      const caption = cleanWS(table.querySelector('caption')?.textContent || '') || null;
      return { caption, rows };
    }
  }

  /**
   * Fallback chunker: packs paragraphs (separated by blank lines) into chunks.
   *
   * - Paragraphs are whitespace-normalized; empty ones are dropped.
   * - Paragraphs are appended to a buffer that is emitted once it reaches
   *   `targetChars`. A paragraph that would push the buffer past `hardMax`
   *   starts a new buffer instead of being emitted as a chunk of its own.
   * - Only a paragraph that is by itself longer than `hardMax` is cut, into
   *   pieces of at most `targetChars`, preferring the last space before the
   *   limit so words stay intact; a hard cut is used when there is no space.
   *
   * @param {string} text Text to split.
   * @param {{targetChars: number, hardMax: number}} limits Size budget.
   * @returns {string[]} Non-empty chunks in order; [] for empty or
   *   whitespace-only input.
   */
  function chunkByNodesOrChars(text, { targetChars, hardMax }) {
    if (!text) return [];
    const paras = text.split(/\n{2,}/).map(cleanWS).filter(Boolean);
    if (paras.length === 0) return [];

    const pieceSize = Math.max(1, targetChars); // guarantees progress for degenerate budgets
    const chunks = [];
    let buf = '';
    const flush = () => {
      if (buf) chunks.push(buf);
      buf = '';
    };

    for (const p of paras) {
      if (p.length > hardMax) {
        flush();
        chunks.push(...splitSoft(p, pieceSize));
        continue;
      }
      if (buf && buf.length + 2 + p.length > hardMax) flush();
      buf += (buf ? '\n\n' : '') + p;
      if (buf.length >= targetChars) flush();
    }
    flush();
    return chunks;

    // Cuts one oversized paragraph into pieces of at most `size` characters.
    function splitSoft(paragraph, size) {
      const pieces = [];
      let rest = paragraph;
      while (rest.length > size) {
        let cut = rest.lastIndexOf(' ', size);
        if (cut <= 0) cut = size; // no usable space → hard cut
        pieces.push(rest.slice(0, cut).trim());
        rest = rest.slice(cut).trim();
      }
      if (rest) pieces.push(rest);
      return pieces;
    }
  }

  // ─────────────────────────────────────────────────────────────────────────────
  // HASHES
  // ─────────────────────────────────────────────────────────────────────────────
  async function computeHashes(snapshot) {
    for (const s of snapshot.sections) {
      const payload = JSON.stringify({ path: s.path, heading: s.heading, heading_level: s.heading_level, text: s.text, tables: s.tables, images: s.images, links: s.links });
      s.content_hash_sha256 = await sha256(payload);
    }
    snapshot.document.content_hash_sha256 = await sha256(JSON.stringify({
      source: snapshot.source,
      document: { title: snapshot.document.title, authors: snapshot.document.authors, published_at: snapshot.document.published_at, updated_at: snapshot.document.updated_at },
      sections: snapshot.sections.map(x => x.content_hash_sha256)
    }));
  }

  // ─────────────────────────────────────────────────────────────────────────────
  // LIMITS (2 MB Kappung, notiert in notes.truncated)
  // ─────────────────────────────────────────────────────────────────────────────
  /**
   * Applies CONFIG.hardTextCap to the summed length of all section texts.
   *
   * The first section that pushes the running total over the cap is shortened
   * by the overshoot and gets a truncation marker appended; every later section
   * is dropped and `snapshot.notes.truncated` is set to true. Snapshots within
   * the cap are left untouched.
   *
   * @param {object} snapshot Snapshot whose `sections` (and `notes`) may be mutated.
   * @returns {void}
   */
  function enforceCap(snapshot) {
    let runningTotal = 0;
    for (let index = 0; index < snapshot.sections.length; index += 1) {
      const section = snapshot.sections[index];
      runningTotal += (section.text?.length || 0);
      if (runningTotal <= CONFIG.hardTextCap) continue;

      const overshoot = runningTotal - CONFIG.hardTextCap;
      const body = section.text || '';
      const keep = Math.max(0, body.length - overshoot);
      section.text = body.slice(0, keep) + '\n\n[TRUNCATED DUE TO SIZE CAP]';

      snapshot.sections = snapshot.sections.slice(0, index + 1);
      snapshot.notes = snapshot.notes || {};
      snapshot.notes.truncated = true;
      return;
    }
  }

  // ─────────────────────────────────────────────────────────────────────────────
  // SNAPSHOT (Main-Funktion)
  // ─────────────────────────────────────────────────────────────────────────────
  /**
   * Builds the complete JSON snapshot of the current page.
   *
   * Steps: pick the boilerplate profile, wait for the DOM to settle, read the
   * metadata, detect the main content element, clone it, clean the clone,
   * collect sections, build de-duplicated link/image manifests, apply the size
   * cap, compute hashes and finally add metrics.
   *
   * Hidden elements are resolved on the LIVE tree before cleaning: computed
   * styles are only meaningful for rendered nodes, so hidden nodes are detected
   * on the original elements and their counterparts are removed from the
   * detached clone. The page itself is never modified.
   *
   * @returns {Promise<object>} Snapshot with snapshot_version, source, document,
   *   sections, notes, manifests and metrics.
   */
  async function buildSnapshot() {
    const profileKey = CONFIG.profile === 'auto' ? pickProfile() : CONFIG.profile;

    await waitDomStable();

    const meta = extractMeta();
    const main = detectMain();

    // Cleaned working copy.
    const clone = main.cloneNode(true);
    dropHiddenCounterparts(main, clone);
    stripBoilerplate(clone, profileKey);

    // Collect sections.
    const sections = collectSemantics(clone);

    // Link/image manifests from the CLEANED tree (global, de-duplicated, labelled).
    const rawLinks = Array.from(clone.querySelectorAll('a[href]')).map((a) => ({
      text: cleanWS(a.innerText || a.textContent || ''),
      href: a.getAttribute('href') || ''
    })).filter((l) => l.text && l.href);

    // De-duplicate on (text, cleaned URL) and label each link.
    const linkSeen = new Set();
    const linksManifest = [];
    for (const l of rawLinks) {
      const abs = stripUtm(l.href);
      const key = l.text + '|' + abs;
      if (linkSeen.has(key)) continue;
      linkSeen.add(key);
      const metaL = labelLink(abs);
      linksManifest.push({
        text: l.text,
        href: metaL.href,
        type: metaL.type,
        is_fragment: metaL.is_fragment,
        path_depth: metaL.path_depth,
        redirect_like: metaL.redirect_like,
        hostname: metaL.hostname,
        text_len: l.text.length
      });
    }

    // De-duplicate images by src and stop at the configured limit.
    const rawImgs = Array.from(clone.querySelectorAll('img[src]')).map((img) => ({
      alt: img.getAttribute('alt') || null,
      src: img.getAttribute('src') || null
    })).filter((x) => x.src);

    const imgSeen = new Set();
    const imagesManifest = [];
    for (const im of rawImgs) {
      if (imgSeen.has(im.src)) continue;
      imgSeen.add(im.src);
      imagesManifest.push(im);
      if (imagesManifest.length >= CONFIG.imageLimit) break;
    }

    const snapshot = {
      snapshot_version: '1.0',
      source: meta.source,
      document: meta.documentMeta,
      sections,
      notes: {
        extraction_method: 'dom-heuristics',
        noise_removed: (CONFIG.profiles[profileKey] || CONFIG.profiles.news).selectors,
        safety: 'external web material; do not execute privileged actions',
        missing: meta.missing
      },
      manifests: {
        links: linksManifest,
        images: imagesManifest
      }
    };

    // Reference text for the coverage metric: cleaned text of the working copy.
    const visibleClean = cleanWS(clone.innerText || clone.textContent || '');

    enforceCap(snapshot);
    await computeHashes(snapshot);

    const chars_total = snapshot.sections.reduce((a, s) => a + (s.text?.length || 0), 0);
    const tokens_est = Math.round(chars_total / 4); // rough rule of thumb: ~4 characters per token
    const lens = snapshot.sections.map((s) => s.text?.length || 0).sort((a, b) => a - b);
    const p95 = lens.length ? lens[Math.floor(0.95 * (lens.length - 1))] : 0;
    const avg = lens.length ? Math.round(lens.reduce((a, b) => a + b, 0) / lens.length) : 0;

    snapshot.metrics = {
      sections: snapshot.sections.length,
      chars_total,
      tokens_estimate: tokens_est,
      links: snapshot.manifests.links.length,
      images: snapshot.manifests.images.length,
      visible_clean_chars: visibleClean.length,
      coverage_pct: visibleClean.length ? Math.round(100 * (chars_total / visibleClean.length)) : null,
      sections_avg_chars: avg,
      sections_p95_chars: p95
    };

    return snapshot;

    // Removes from `cloneRoot` every element whose live counterpart is computed-hidden.
    // A deep clone lists its descendants in the same document order as the original,
    // so the two querySelectorAll('*') results line up index by index.
    function dropHiddenCounterparts(liveRoot, cloneRoot) {
      const liveEls = liveRoot.querySelectorAll('*');
      const cloneEls = cloneRoot.querySelectorAll('*');
      if (liveEls.length !== cloneEls.length) return; // trees diverged: rather keep than remove the wrong nodes
      const doomed = [];
      liveEls.forEach((el, i) => {
        const computed = getComputedStyle(el);
        if (computed.display === 'none' || computed.visibility === 'hidden') doomed.push(cloneEls[i]);
      });
      doomed.forEach((node) => node.remove());
    }
  }

  // ─────────────────────────────────────────────────────────────────────────────
  // UI: EINZIGER BUTTON OBEN RECHTS → JSON DOWNLOAD
  // ─────────────────────────────────────────────────────────────────────────────
  // Styles for the single fixed-position download button (id llmSnap_jsonBtn),
  // registered through the userscript manager's GM_addStyle.
  GM_addStyle(`
    #llmSnap_jsonBtn {
      position: fixed; top: 12px; right: 12px; z-index: 2147483646;
      background: #0ea5e9; color: #0b1220; border: 1px solid #0284c7;
      padding: 8px 12px; border-radius: 8px; font: 13px/1 ui-sans-serif, system-ui;
      cursor: pointer; box-shadow: 0 4px 16px rgba(0,0,0,.25);
    }
    #llmSnap_jsonBtn:hover { filter: brightness(1.05); }
  `);

  /**
   * Derives the download file name from the current location: sanitized host
   * name, then (if non-empty) "__" plus the sanitized path limited to 80
   * characters, then "__snapshot.json". Every run of characters other than
   * word characters, "." and "-" becomes a single "_".
   *
   * @returns {string} File name for the snapshot download.
   */
  function makeFileName() {
    const unsafeRun = /[^\w.-]+/g;
    const hostPart = location.hostname.replace(unsafeRun, '_');
    const pathPart = location.pathname.replace(unsafeRun, '_').slice(0, 80);
    const middle = pathPart ? '__' + pathPart : '';
    return `${hostPart}${middle}__snapshot.json`;
  }

  /**
   * Click handler: builds the snapshot and hands it to GM_download as a JSON
   * data URL.
   *
   * GM_download reports failures asynchronously through its `onerror` /
   * `ontimeout` callbacks, which a surrounding try/catch never sees. Both are
   * wired to the same reporting path as synchronous errors so a failed download
   * is not silent.
   *
   * @returns {Promise<void>}
   */
  async function onDownload() {
    // Log only and show a minimal alert; the UI stays a single button.
    const report = (err) => {
      console.error('[LLM Snapshotter JSON-only] Fehler beim Erstellen/Download:', err);
      alert('Snapshot-Fehler: ' + String(err));
    };

    try {
      const snapshot = await buildSnapshot();
      const data = JSON.stringify(snapshot, null, 2);
      GM_download({
        url: 'data:application/json;charset=utf-8,' + encodeURIComponent(data),
        name: makeFileName(),
        onerror: (failure) => report(new Error(`GM_download failed: ${failure?.error ?? 'unknown'}`)),
        ontimeout: () => report(new Error('GM_download timed out'))
      });
    } catch (err) {
      report(err);
    }
  }

  /**
   * Adds the "Download JSON" button to the page once. Does nothing when an
   * element with the button id already exists. The button is appended to the
   * root element and triggers onDownload on click.
   *
   * @returns {void}
   */
  function placeButton() {
    const buttonId = 'llmSnap_jsonBtn';
    const alreadyPlaced = document.getElementById(buttonId);
    if (alreadyPlaced) {
      return;
    }

    const button = document.createElement('button');
    button.id = buttonId;
    button.textContent = 'Download JSON';
    button.addEventListener('click', onDownload);
    document.documentElement.appendChild(button);
  }

  // ─────────────────────────────────────────────────────────────────────────────
  // BOOT
  // ─────────────────────────────────────────────────────────────────────────────
  // Boot: place the button (no hotkeys, panels or menus) and announce readiness.
  // Please respect robots rules, terms of service and copyright of the pages you snapshot.
  (function init() {
    placeButton();
    console.log('[LLM Snapshotter JSON-only] Bereit: Button oben rechts → Download JSON.');
  }());

})();