您需要先安装一个扩展,例如 篡改猴、Greasemonkey 或 暴力猴,之后才能安装此脚本。
您需要先安装一个扩展,例如 篡改猴 或 暴力猴,之后才能安装此脚本。
您需要先安装一个扩展,例如 篡改猴 或 暴力猴,之后才能安装此脚本。
您需要先安装一个扩展,例如 篡改猴 或 Userscripts ,之后才能安装此脚本。
您需要先安装一款用户脚本管理器扩展,例如 Tampermonkey,才能安装此脚本。
您需要先安装用户脚本管理器扩展后才能安装此脚本。
Generischer, ballastarmer JSON-Snapshotter für LLM-Kontext: bereinigt DOM, erhält Semantik, segmentiert robust (auch ohne Überschriften), erfasst Links/Bilder (dedupliziert, gelabelt), berechnet Hashes & Coverage. Einziger UI-Button: oben rechts "Download JSON".
// ==UserScript==
// @name         LLM Snapshotter — JSON only (generic, top-right button)
// @namespace    https://pzwe.dev/llm-snapshotter
// @version      1.2.0
// @description  Generischer, ballastarmer JSON-Snapshotter für LLM-Kontext: bereinigt DOM, erhält Semantik, segmentiert robust (auch ohne Überschriften), erfasst Links/Bilder (dedupliziert, gelabelt), berechnet Hashes & Coverage. Einziger UI-Button: oben rechts "Download JSON".
// @license      MIT
// @match        *://*/*
// @run-at       document-idle
// @noframes
// @grant        GM_download
// @grant        GM_addStyle
// ==/UserScript==

(function () {
  'use strict';

  // ───────────────────────────────────────────────────────────────────────────
  // CONCEPT & SAFETY
  // - Runs purely in the browser (userscript manager); no external requests/libs.
  // - Never executes page scripts; only text and attributes are extracted.
  // - The snapshot targets LLM consumption, not human presentation.
  // - "Ballast" (navigation, ads, cookie banners, ...) is removed.
  // - Missing metadata is NOT invented -> null + notes.missing.
  // - @noframes: the button is only injected into the top-level document,
  //   not into every (ad/tracker) iframe matched by *://*/*.
  // ───────────────────────────────────────────────────────────────────────────

  // ───────────────────────────────────────────────────────────────────────────
  // CONFIGURATION
  // ───────────────────────────────────────────────────────────────────────────
  const CONFIG = {
    profile: 'auto', // 'auto'|'news'|'blog'|'docs'|'spa'
    profiles: {
      news: { selectors: ['nav', 'header', 'footer', 'aside', '[role="navigation"]', '[aria-label*="cookie"]', '[class*="cookie"]', '[class*="advert"]', '[id*="advert"]', '[class*="promo"]', '.subscribe', '.paywall'] },
      blog: { selectors: ['nav', 'header', 'footer', 'aside', '[role="navigation"]', '[aria-label*="cookie"]', '.subscribe'] },
      docs: { selectors: ['nav[role="navigation"]', 'header[role="banner"]', 'footer', '[aria-label*="cookie"]'] },
      spa: { selectors: ['nav', 'header', 'footer', 'aside', '[aria-label*="cookie"]', '[class*="overlay"]', '[class*="modal"]'] },
    },
    domStability: { quietMs: 500, capMs: 3000, retries: 3, backoffMs: [200, 400, 800] },
    hardTextCap: 2 * 1024 * 1024, // overall limit (in characters) across all sections
    fallbackBlockTargetChars: 1200, // target size for fallback segmentation
    fallbackBlockHardMax: 1800, // hard upper bound per fallback block
    imageLimit: 150, // maximum number of images in the manifest (after dedupe)
  };

  // Tags that are never walked for content.
  const SKIP_TAGS = new Set(['script', 'style', 'noscript', 'template']);

  // Tag-based block heuristic; only used when computed styles from the live
  // DOM are not available for the clone (see cloneVisible).
  const BLOCK_TAGS = new Set([
    'address', 'article', 'aside', 'dd', 'details', 'dialog', 'div', 'dl', 'dt',
    'fieldset', 'figcaption', 'figure', 'footer', 'form', 'header', 'li', 'main',
    'nav', 'p', 'section', 'summary', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr',
  ]);

  // ───────────────────────────────────────────────────────────────────────────
  // UTILS
  // ───────────────────────────────────────────────────────────────────────────

  /** Resolves after `ms` milliseconds. */
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  /**
   * Current time as an ISO-8601 UTC timestamp without milliseconds.
   * `toISOString()` is already UTC; shifting by the timezone offset (as the
   * previous version did) produced local time mislabelled with a `Z` suffix.
   */
  function isoUTC() {
    return new Date().toISOString().replace(/\.\d{3}Z$/, 'Z');
  }

  /** Collapses all whitespace runs to single spaces and trims. Null-safe. */
  function cleanWS(s) {
    return (s || '').replace(/\s+/g, ' ').trim();
  }

  /**
   * True when `el` is an attached element with a non-empty layout box that is
   * neither `display: none` nor `visibility: hidden`. Only meaningful for
   * elements in the live document (detached nodes have no layout).
   */
  function isVisible(el) {
    if (!(el instanceof Element)) return false;
    const rect = el.getBoundingClientRect();
    const cs = getComputedStyle(el);
    return rect.width > 0 && rect.height > 0 && cs.visibility !== 'hidden' && cs.display !== 'none';
  }

  /**
   * Hex SHA-256 of `text`. Falls back to a deterministic, NON-cryptographic
   * FNV-1a style hash (prefixed `fallback_`) when `crypto.subtle` is not
   * usable, e.g. on insecure origins.
   */
  async function sha256(text) {
    try {
      const digest = await crypto.subtle.digest('SHA-256', new TextEncoder().encode(text));
      return [...new Uint8Array(digest)].map((b) => b.toString(16).padStart(2, '0')).join('');
    } catch {
      let h = 2166136261 >>> 0;
      for (let i = 0; i < text.length; i++) {
        h ^= text.charCodeAt(i);
        h = Math.imul(h, 16777619);
      }
      return 'fallback_' + (h >>> 0).toString(16);
    }
  }

  /** Resolves `url` against the page URL and drops utm_* tracking parameters. */
  function stripUtm(url) {
    try {
      const u = new URL(url, location.href);
      ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content'].forEach((p) => u.searchParams.delete(p));
      return u.toString();
    } catch {
      return url; // unparsable: keep the raw value
    }
  }

  /** Resolves `url` against the page URL; returns the input if unparsable. */
  function absUrl(url) {
    try {
      return new URL(url, location.href).toString();
    } catch {
      return url;
    }
  }

  /** Picks a boilerplate profile from DOM hints and the URL path. */
  function pickProfile() {
    const p = location.pathname.toLowerCase();
    if (document.querySelector('main article') || /news|article|story/.test(p)) return 'news';
    if (/blog/.test(p)) return 'blog';
    if (document.querySelector('nav[aria-label="Table of contents"], nav.toc') || /docs|guide|reference/.test(p)) return 'docs';
    if (document.querySelector('[data-reactroot], [class*="app-"], [id*="app-"]')) return 'spa';
    return 'news';
  }

  /**
   * Waits for the DOM to settle: resolves `true` once no mutation was seen for
   * `quietMs`, otherwise gives up on an attempt after `capMs` and retries with
   * backoff. Resolves `false` when every attempt hit the cap.
   */
  async function waitDomStable() {
    const { quietMs, capMs, retries, backoffMs } = CONFIG.domStability;
    for (let attempt = 0; attempt <= retries; attempt++) {
      const outcome = await new Promise((resolve) => {
        let last = Date.now();
        let pollTimer = 0;
        let capTimer = 0;
        const observer = new MutationObserver(() => { last = Date.now(); });
        // Single exit point so that BOTH timers and the observer are released,
        // whichever of the two conditions fires first.
        const settle = (result) => {
          clearInterval(pollTimer);
          clearTimeout(capTimer);
          observer.disconnect();
          resolve(result);
        };
        observer.observe(document, { childList: true, subtree: true, attributes: true, characterData: true });
        pollTimer = setInterval(() => {
          if (Date.now() - last >= quietMs) settle('quiet');
        }, 50);
        capTimer = setTimeout(() => settle('cap'), capMs);
      });
      if (outcome === 'quiet') return true;
      // No point in backing off after the last attempt.
      if (attempt < retries) await sleep(backoffMs[Math.min(attempt, backoffMs.length - 1)]);
    }
    return false;
  }

  /**
   * Heuristic main-content detection.
   * Preference order: a single substantial top-level <article>, then a
   * substantial <main>/[role=main], then the largest visible div/section,
   * then any visible semantic container, then <body>.
   * (Previously the largest text block always won, which in practice is the
   * outermost wrapper div, so article/main were effectively never chosen.)
   */
  function detectMain() {
    const MIN_CHARS = 400;
    const textLen = (el) => (el.innerText || el.textContent || '').trim().length;
    const substantial = (el) => isVisible(el) && textLen(el) > MIN_CHARS;

    // Nested <article>s (e.g. comments) do not count as separate candidates.
    const articles = [...document.querySelectorAll('article')]
      .filter((el) => !el.parentElement?.closest('article'))
      .filter(substantial);
    if (articles.length === 1) return articles[0];

    const semanticSel = 'main, [role="main"]';
    const main = [...document.querySelectorAll(semanticSel)].find(substantial);
    if (main) return main;

    // Text length is measured once per candidate (innerText forces layout).
    let best = null;
    let bestLen = MIN_CHARS;
    for (const el of document.querySelectorAll('div,section')) {
      if (!isVisible(el)) continue;
      const len = textLen(el);
      if (len > bestLen) {
        best = el;
        bestLen = len;
      }
    }
    if (best) return best;

    const anySemantic = [...document.querySelectorAll(`${semanticSel}, article`)].find(isVisible);
    return anySemantic || document.body;
  }

  // ───────────────────────────────────────────────────────────────────────────
  // METADATA
  // ───────────────────────────────────────────────────────────────────────────

  /**
   * Reads source/document metadata from the page.
   * @returns {{source: object, documentMeta: object, missing: string[]}}
   *   `missing` lists the fields that could not be found (never invented).
   */
  function extractMeta() {
    const metaContent = (sel) => document.querySelector(sel)?.getAttribute('content') || null;

    const lang = document.documentElement.getAttribute('lang') || 'und';
    let canonical = document.querySelector('link[rel="canonical"]')?.getAttribute('href') || location.href;
    try {
      canonical = new URL(canonical, location.href).toString();
    } catch {
      // keep the raw canonical value if it cannot be parsed
    }

    const title = document.title || null;
    const author = metaContent('meta[name="author"]');
    const published_at = metaContent('meta[property="article:published_time"]');
    const updated_at = metaContent('meta[property="article:modified_time"]');

    const missing = [];
    if (!title) missing.push('document.title');
    if (!author) missing.push('document.authors');
    if (!published_at) missing.push('document.published_at');
    if (!updated_at) missing.push('document.updated_at');

    return {
      source: { url: location.href, canonical_url: canonical, fetched_at: isoUTC(), lang },
      documentMeta: {
        title,
        authors: author ? [author] : [],
        published_at,
        updated_at,
        content_hash_sha256: null,
      },
      missing,
    };
  }

  // ───────────────────────────────────────────────────────────────────────────
  // WORKING COPY & BOILERPLATE REMOVAL
  // ───────────────────────────────────────────────────────────────────────────

  /**
   * Deep-clones `liveRoot` and removes every element that is hidden in the
   * live document (`display: none` / `visibility: hidden`).
   *
   * Computed styles are read from the LIVE elements and mapped onto the clone
   * by document order: a detached clone has no layout, so querying styles or
   * bounding boxes on it yields nothing useful.
   *
   * @returns {{clone: Element, blockEls: (WeakSet<Element>|null)}}
   *   `blockEls` holds the clone elements whose live counterpart is rendered
   *   as a block-like box; `null` if live and clone trees could not be aligned.
   */
  function cloneVisible(liveRoot) {
    const clone = liveRoot.cloneNode(true);
    const liveEls = liveRoot.querySelectorAll('*');
    const cloneEls = clone.querySelectorAll('*');
    if (liveEls.length !== cloneEls.length) return { clone, blockEls: null };

    const blockEls = new WeakSet();
    const hidden = [];
    for (let i = 0; i < liveEls.length; i++) {
      const cs = getComputedStyle(liveEls[i]);
      if (cs.display === 'none' || cs.visibility === 'hidden') {
        hidden.push(cloneEls[i]);
      } else if (!/^(inline|contents|ruby)/.test(cs.display)) {
        blockEls.add(cloneEls[i]);
      }
    }
    hidden.forEach((n) => n.remove());
    return { clone, blockEls };
  }

  /**
   * Removes profile-specific boilerplate plus scripts/styles/share widgets
   * from `root` (in place). Hidden elements are handled by cloneVisible.
   */
  function stripBoilerplate(root, profileKey) {
    const profileSelectors = (CONFIG.profiles[profileKey] || CONFIG.profiles.news).selectors;
    const selector = [...profileSelectors, 'script', 'style', 'noscript', 'template', '[class*="share"]', '[class*="social"]'].join(',');
    root.querySelectorAll(selector).forEach((n) => n.remove());
  }

  // ───────────────────────────────────────────────────────────────────────────
  // LINK LABELING (generic, domain independent)
  // ───────────────────────────────────────────────────────────────────────────

  /**
   * Classifies a link relative to the current page.
   * @returns {{type: string, is_fragment: boolean, path_depth: (number|null),
   *            redirect_like: boolean, hostname: (string|null), href: string}}
   */
  function labelLink(href) {
    try {
      const u = new URL(href, location.href);
      const sameHost = u.hostname === location.hostname;
      // Boolean(): `u.hash && ...` alone would leak '' into the JSON output.
      const isFragment = Boolean(u.hash) && u.pathname === location.pathname && (!u.search || u.search === location.search);
      const pathDepth = u.pathname.split('/').filter(Boolean).length;
      const q = u.search || '';
      const redirectLike = /redirect=|url=|^https?:\/\/[^/]+\/(r|redir|out)\b/.test(u.href) || q.length > 150 || /[=]{3,}/.test(q);
      const type = isFragment ? 'fragment' : (sameHost ? 'internal' : 'external');
      return { type, is_fragment: isFragment, path_depth: pathDepth, redirect_like: redirectLike, hostname: u.hostname, href: u.toString() };
    } catch {
      // broken URLs: classify neutrally
      return { type: 'unknown', is_fragment: false, path_depth: null, redirect_like: false, hostname: null, href };
    }
  }

  // ───────────────────────────────────────────────────────────────────────────
  // SEMANTIC COLLECTION (sections) + FALLBACK SEGMENTATION
  // ───────────────────────────────────────────────────────────────────────────

  /**
   * Walks the cleaned (detached) `root` and groups its content into sections
   * delimited by headings.
   *
   * - Inline text is buffered and emitted as one paragraph per block element,
   *   instead of one paragraph per text node.
   * - Structured containers (table, list, blockquote, pre, figure with image,
   *   heading) are emitted once and NOT walked again, so their text is not
   *   duplicated; links/images inside them are still collected.
   * - Without any heading the text is segmented into "Block N" sections; the
   *   tables/images/links found are attached to the first block.
   *
   * @param {Element} root cleaned working copy
   * @param {WeakSet<Element>|null} [blockEls] elements known to render as
   *   blocks (from cloneVisible); falls back to a tag-name heuristic if null
   * @returns {object[]} finalized sections
   */
  function collectSemantics(root, blockEls = null) {
    const sections = [];
    const usedIds = new Set();
    const path = []; // stack of { level, heading }
    let headingsSeen = false;
    let inlineBuf = '';

    // Implicit default section so content before the first heading has a home.
    const implicitMain = newSection([], 'Main', 2);
    let current = implicitMain;

    visitChildren(root);
    flushInline();

    if (headingsSeen) {
      finalize(current);
      return sections;
    }

    // Fallback: no headings → segment the collected paragraphs into blocks.
    const chunks = chunkByNodesOrChars(current.textParts, {
      targetChars: CONFIG.fallbackBlockTargetChars,
      hardMax: CONFIG.fallbackBlockHardMax,
    });
    const hasAssets = current.tables.length || current.images.length || current.links.length;
    if (!chunks.length && hasAssets) chunks.push('');
    chunks.forEach((txt, i) => {
      const block = newSection([], `Block ${i + 1}`, 3);
      if (txt) block.textParts.push(txt);
      if (i === 0) {
        block.tables = current.tables;
        block.images = current.images;
        block.links = current.links;
      }
      finalize(block);
    });
    return sections;

    // ── local helpers ────────────────────────────────────────────────────────

    function newSection(pathArr, heading, level) {
      return { path: pathArr.slice(), heading, heading_level: level, textParts: [], tables: [], images: [], links: [] };
    }

    function hasContent(sec) {
      return Boolean(sec.textParts.length || sec.tables.length || sec.images.length || sec.links.length);
    }

    /** Emits the buffered inline text as one paragraph of the current section. */
    function flushInline() {
      const text = cleanWS(inlineBuf);
      inlineBuf = '';
      if (text) current.textParts.push(text);
    }

    /** Converts a working section into its output shape with a unique id. */
    function finalize(sec) {
      const base = (sec.path.join('>') + '|' + sec.heading)
        .toLowerCase()
        .replace(/[^a-z0-9]+/g, '-')
        .replace(/^-+|-+$/g, '')
        .slice(0, 64) || 'section';
      // Repeated or non-ASCII headings slug to the same base; keep ids unique.
      let id = base;
      for (let n = 2; usedIds.has(id); n++) id = `${base}-${n}`;
      usedIds.add(id);
      sections.push({
        id,
        path: sec.path,
        heading: sec.heading,
        heading_level: sec.heading_level,
        text: sec.textParts.join('\n\n').trim(),
        tables: sec.tables,
        images: sec.images,
        links: sec.links,
        content_hash_sha256: null,
      });
    }

    function isBlock(el, tag) {
      return blockEls ? blockEls.has(el) : BLOCK_TAGS.has(tag);
    }

    function recordLink(a) {
      const text = cleanWS(a.innerText || a.textContent || '');
      const href = a.getAttribute('href') || '';
      if (text && href) current.links.push({ text, href: stripUtm(href) });
    }

    function recordImage(img, caption) {
      const src = img.getAttribute('src');
      if (!src) return;
      // Absolute URL, consistent with how link hrefs are emitted.
      current.images.push({ alt: img.getAttribute('alt') || null, src: absUrl(src), caption });
    }

    /** Collects links and images nested in a container that is not walked. */
    function collectAssets(container, { images = true } = {}) {
      container.querySelectorAll('a[href]').forEach(recordLink);
      if (images) container.querySelectorAll('img').forEach((img) => recordImage(img, null));
    }

    function tableToJson(table) {
      // table.rows / tr.cells exclude rows and cells of nested tables.
      const rows = [...table.rows].map((tr) => [...tr.cells].map((c) => cleanWS(c.textContent || '')));
      const caption = cleanWS(table.querySelector('caption')?.textContent || '') || null;
      return { caption, rows };
    }

    function visitChildren(parent) {
      for (const child of parent.childNodes) {
        if (child.nodeType === Node.TEXT_NODE) inlineBuf += child.nodeValue;
        else if (child.nodeType === Node.ELEMENT_NODE) visitElement(child);
      }
    }

    function visitElement(el) {
      const tag = el.tagName.toLowerCase();
      if (SKIP_TAGS.has(tag)) return;

      // Heading → close the running section, open a new one, update the path.
      if (/^h[1-6]$/.test(tag)) {
        flushInline();
        headingsSeen = true;
        const level = Number.parseInt(tag.slice(1), 10);
        const heading = cleanWS(el.textContent || '');
        while (path.length && path[path.length - 1].level >= level) path.pop();
        path.push({ level, heading });
        // An untouched implicit "Main" section is noise; drop it.
        if (current !== implicitMain || hasContent(current)) finalize(current);
        current = newSection(path.map((p) => p.heading), heading, level);
        collectAssets(el);
        return;
      }

      if (tag === 'br') {
        inlineBuf += ' ';
        return;
      }
      if (tag === 'hr') {
        flushInline();
        return;
      }
      if (tag === 'table') {
        flushInline();
        current.tables.push(tableToJson(el));
        collectAssets(el);
        return;
      }
      if (tag === 'img') {
        recordImage(el, null);
        return;
      }
      if (tag === 'figure' && el.querySelector('img')) {
        flushInline();
        const caption = cleanWS(el.querySelector('figcaption')?.textContent || '') || null;
        el.querySelectorAll('img').forEach((img) => recordImage(img, caption));
        collectAssets(el, { images: false });
        return;
      }
      if (tag === 'ul' || tag === 'ol') {
        flushInline();
        const items = [...el.querySelectorAll(':scope > li')].map((li) => cleanWS(li.textContent || '')).filter(Boolean);
        if (items.length) {
          const md = tag === 'ol'
            ? items.map((t, i) => `${i + 1}. ${t}`).join('\n')
            : items.map((t) => `- ${t}`).join('\n');
          current.textParts.push(md);
        }
        collectAssets(el);
        return;
      }
      if (tag === 'blockquote') {
        flushInline();
        const quote = cleanWS(el.textContent || '');
        if (quote) current.textParts.push(`> ${quote}`);
        collectAssets(el);
        return;
      }
      if (tag === 'pre') {
        flushInline();
        const code = el.textContent || '';
        // Neutralize nested fences so the emitted block cannot be closed early.
        if (code.trim()) current.textParts.push('```\n' + code.replace(/```/g, '``\\`') + '\n```');
        return;
      }
      if (tag === 'code') {
        // <code> inside <pre> never gets here, so this is inline code.
        const code = cleanWS(el.textContent || '');
        if (code) inlineBuf += '`' + code + '`';
        return;
      }

      // Links are inline: record them, then keep walking so their text stays
      // part of the surrounding paragraph.
      if (tag === 'a') recordLink(el);

      const block = isBlock(el, tag);
      if (block) flushInline();
      visitChildren(el);
      if (block) flushInline();
    }
  }

  /**
   * Cuts `text` into pieces of at most `limit` characters, preferring to cut
   * at whitespace in the second half of the window over cutting mid-word.
   */
  function splitLong(text, limit) {
    const size = Math.max(1, limit);
    const pieces = [];
    let rest = text;
    while (rest.length > size) {
      let cut = Math.max(rest.lastIndexOf(' ', size), rest.lastIndexOf('\n', size));
      if (cut < size / 2) cut = size; // no reasonable boundary → hard cut
      pieces.push(rest.slice(0, cut).trim());
      rest = rest.slice(cut).trim();
    }
    if (rest) pieces.push(rest);
    return pieces.filter(Boolean);
  }

  /**
   * Fallback chunker: packs paragraphs into chunks of roughly `targetChars`,
   * never exceeding `hardMax` by merging; only a paragraph that is larger
   * than `hardMax` on its own is split.
   *
   * @param {string|string[]} text raw text (paragraphs separated by blank
   *   lines, each whitespace-normalized) or a ready-made list of paragraphs
   *   (kept verbatim apart from trimming, so code/list formatting survives)
   * @returns {string[]} non-empty chunks
   */
  function chunkByNodesOrChars(text, { targetChars, hardMax }) {
    const paras = Array.isArray(text)
      ? text.map((p) => String(p).trim()).filter(Boolean)
      : String(text || '').split(/\n{2,}/).map(cleanWS).filter(Boolean);
    if (!paras.length) return [];

    const chunks = [];
    let buf = '';
    const flush = () => {
      if (buf) chunks.push(buf);
      buf = '';
    };

    for (const p of paras) {
      if (p.length > hardMax) {
        flush();
        chunks.push(...splitLong(p, targetChars));
        continue;
      }
      // Adding p would overflow: close the running chunk, start a new one.
      if (buf && buf.length + 2 + p.length > hardMax) flush();
      buf = buf ? `${buf}\n\n${p}` : p;
      if (buf.length >= targetChars) flush();
    }
    flush();
    return chunks;
  }

  // ───────────────────────────────────────────────────────────────────────────
  // HASHES
  // ───────────────────────────────────────────────────────────────────────────

  /**
   * Fills `content_hash_sha256` on every section and on the document.
   * The document hash deliberately excludes `source.fetched_at`; otherwise it
   * would differ on every download even for unchanged content.
   */
  async function computeHashes(snapshot) {
    await Promise.all(snapshot.sections.map(async (s) => {
      const payload = JSON.stringify({
        path: s.path,
        heading: s.heading,
        heading_level: s.heading_level,
        text: s.text,
        tables: s.tables,
        images: s.images,
        links: s.links,
      });
      s.content_hash_sha256 = await sha256(payload);
    }));

    const { url, canonical_url, lang } = snapshot.source;
    const { title, authors, published_at, updated_at } = snapshot.document;
    snapshot.document.content_hash_sha256 = await sha256(JSON.stringify({
      source: { url, canonical_url, lang },
      document: { title, authors, published_at, updated_at },
      sections: snapshot.sections.map((x) => x.content_hash_sha256),
    }));
  }

  // ───────────────────────────────────────────────────────────────────────────
  // LIMITS (size cap, recorded in notes.truncated)
  // ───────────────────────────────────────────────────────────────────────────

  /**
   * Enforces CONFIG.hardTextCap on the summed section text: the section that
   * crosses the cap is cut and marked, all following sections are dropped.
   */
  function enforceCap(snapshot) {
    let total = 0;
    for (let i = 0; i < snapshot.sections.length; i++) {
      const sec = snapshot.sections[i];
      const text = sec.text || '';
      total += text.length;
      if (total <= CONFIG.hardTextCap) continue;

      const over = total - CONFIG.hardTextCap;
      sec.text = text.slice(0, Math.max(0, text.length - over)) + '\n\n[TRUNCATED DUE TO SIZE CAP]';
      snapshot.sections = snapshot.sections.slice(0, i + 1);
      snapshot.notes = snapshot.notes || {};
      snapshot.notes.truncated = true;
      return;
    }
  }

  // ───────────────────────────────────────────────────────────────────────────
  // SNAPSHOT (main function)
  // ───────────────────────────────────────────────────────────────────────────

  /** Builds the complete snapshot object for the current page. */
  async function buildSnapshot() {
    const profileKey = CONFIG.profile === 'auto' ? pickProfile() : CONFIG.profile;
    await waitDomStable();

    const meta = extractMeta();
    const main = detectMain();

    // Cleaned working copy: hidden elements out first (needs the live DOM),
    // then selector-based boilerplate.
    const { clone, blockEls } = cloneVisible(main);
    stripBoilerplate(clone, profileKey);

    const sections = collectSemantics(clone, blockEls);

    // Global link manifest from the CLEANED DOM: deduplicated and labeled.
    const linkSeen = new Set();
    const linksManifest = [];
    for (const a of clone.querySelectorAll('a[href]')) {
      const text = cleanWS(a.innerText || a.textContent || '');
      const rawHref = a.getAttribute('href') || '';
      if (!text || !rawHref) continue;
      const abs = stripUtm(rawHref);
      const key = text + '|' + abs;
      if (linkSeen.has(key)) continue;
      linkSeen.add(key);
      const label = labelLink(abs);
      linksManifest.push({
        text,
        href: label.href,
        type: label.type,
        is_fragment: label.is_fragment,
        path_depth: label.path_depth,
        redirect_like: label.redirect_like,
        hostname: label.hostname,
        text_len: text.length,
      });
    }

    // Global image manifest: deduplicated by absolute URL and capped.
    const imgSeen = new Set();
    const imagesManifest = [];
    for (const img of clone.querySelectorAll('img[src]')) {
      if (imagesManifest.length >= CONFIG.imageLimit) break;
      const rawSrc = img.getAttribute('src');
      if (!rawSrc) continue;
      const src = absUrl(rawSrc);
      if (imgSeen.has(src)) continue;
      imgSeen.add(src);
      imagesManifest.push({ alt: img.getAttribute('alt') || null, src });
    }

    const snapshot = {
      snapshot_version: '1.0',
      source: meta.source,
      document: meta.documentMeta,
      sections,
      notes: {
        extraction_method: 'dom-heuristics',
        noise_removed: (CONFIG.profiles[profileKey] || CONFIG.profiles.news).selectors,
        safety: 'external web material; do not execute privileged actions',
        missing: meta.missing,
      },
      manifests: { links: linksManifest, images: imagesManifest },
    };

    // Coverage: cleaned text of the working copy vs. text kept in sections.
    const visibleClean = cleanWS(clone.innerText || clone.textContent || '');

    enforceCap(snapshot);
    await computeHashes(snapshot);

    const lens = snapshot.sections.map((s) => s.text?.length || 0).sort((a, b) => a - b);
    const chars_total = lens.reduce((a, b) => a + b, 0);
    const tokens_est = Math.round(chars_total / 4); // rough rule of thumb
    const p95 = lens.length ? lens[Math.floor(0.95 * (lens.length - 1))] : 0;
    const avg = lens.length ? Math.round(chars_total / lens.length) : 0;

    snapshot.metrics = {
      sections: snapshot.sections.length,
      chars_total,
      tokens_estimate: tokens_est,
      links: snapshot.manifests.links.length,
      images: snapshot.manifests.images.length,
      visible_clean_chars: visibleClean.length,
      coverage_pct: visibleClean.length ? Math.round(100 * (chars_total / visibleClean.length)) : null,
      sections_avg_chars: avg,
      sections_p95_chars: p95,
    };
    return snapshot;
  }

  // ───────────────────────────────────────────────────────────────────────────
  // UI: SINGLE BUTTON TOP RIGHT → JSON DOWNLOAD
  // ───────────────────────────────────────────────────────────────────────────
  GM_addStyle(`
    #llmSnap_jsonBtn {
      position: fixed; top: 12px; right: 12px; z-index: 2147483646;
      background: #0ea5e9; color: #0b1220; border: 1px solid #0284c7;
      padding: 8px 12px; border-radius: 8px; font: 13px/1 ui-sans-serif, system-ui;
      cursor: pointer; box-shadow: 0 4px 16px rgba(0,0,0,.25);
    }
    #llmSnap_jsonBtn:hover { filter: brightness(1.05); }
    #llmSnap_jsonBtn:disabled { opacity: .6; cursor: progress; }
  `);

  /** Builds a file name from host and path, restricted to safe characters. */
  function makeFileName() {
    const host = location.hostname.replace(/[^\w.-]+/g, '_');
    const path = location.pathname
      .replace(/[^\w.-]+/g, '_')
      .replace(/^_+|_+$/g, '') // "/" alone must not yield a dangling "_"
      .slice(0, 80);
    return `${host}${path ? '__' + path : ''}__snapshot.json`;
  }

  /** Plain-browser download via a temporary Blob URL and <a download>. */
  function saveViaAnchor(data, name) {
    const blobUrl = URL.createObjectURL(new Blob([data], { type: 'application/json;charset=utf-8' }));
    const a = document.createElement('a');
    a.href = blobUrl;
    a.download = name;
    a.style.display = 'none';
    document.documentElement.appendChild(a);
    a.click();
    a.remove();
    // Give the browser time to start the download before releasing the blob.
    setTimeout(() => URL.revokeObjectURL(blobUrl), 60000);
  }

  /**
   * Saves `data` as `name` through GM_download; falls back to an anchor
   * download if GM_download is unavailable or reports an error/timeout
   * (e.g. file extension not whitelisted in the userscript manager).
   */
  function saveJson(data, name) {
    if (typeof GM_download !== 'function') {
      saveViaAnchor(data, name);
      return;
    }
    GM_download({
      url: 'data:application/json;charset=utf-8,' + encodeURIComponent(data),
      name,
      onerror: () => saveViaAnchor(data, name),
      ontimeout: () => saveViaAnchor(data, name),
    });
  }

  /** Click handler body: build the snapshot and hand it to the download. */
  async function onDownload() {
    try {
      const snapshot = await buildSnapshot();
      saveJson(JSON.stringify(snapshot, null, 2), makeFileName());
    } catch (err) {
      // Log only; the UI stays minimal.
      console.error('[LLM Snapshotter JSON-only] Fehler beim Erstellen/Download:', err);
      alert('Snapshot-Fehler: ' + String(err));
    }
  }

  /** Injects the single download button (idempotent). */
  function placeButton() {
    if (document.getElementById('llmSnap_jsonBtn')) return;
    const btn = document.createElement('button');
    btn.id = 'llmSnap_jsonBtn';
    btn.textContent = 'Download JSON';
    btn.addEventListener('click', async () => {
      // Building can take seconds (DOM settle wait); prevent parallel runs.
      if (btn.disabled) return;
      btn.disabled = true;
      try {
        await onDownload();
      } finally {
        btn.disabled = false;
      }
    });
    document.documentElement.appendChild(btn);
  }

  // ───────────────────────────────────────────────────────────────────────────
  // BOOT
  // ───────────────────────────────────────────────────────────────────────────
  (function init() {
    placeButton(); // button only, NO hotkeys/panels/menus
    // Please respect robots/ToS/copyright of the visited sites.
    console.log('[LLM Snapshotter JSON-only] Bereit: Button oben rechts → Download JSON.');
  })();
})();