// ==UserScript==
// @name LLM Snapshotter — JSON only (generic, top-right button)
// @namespace https://pzwe.dev/llm-snapshotter
// @version 1.2.0
// @description Generischer, ballastarmer JSON-Snapshotter für LLM-Kontext: bereinigt DOM, erhält Semantik, segmentiert robust (auch ohne Überschriften), erfasst Links/Bilder (dedupliziert, gelabelt), berechnet Hashes & Coverage. Einziger UI-Button: oben rechts "Download JSON".
// @license MIT
// @match *://*/*
// @run-at document-idle
// @grant GM_download
// @grant GM_addStyle
// ==/UserScript==
(function () {
'use strict';
// ─────────────────────────────────────────────────────────────────────────────
// KONZEPT & SICHERHEIT
// - Rein im Browser (Tampermonkey), keine externen Requests/Libs.
// - Keine Ausführung von Seitenskripten; nur Text/Attribute extrahieren.
// - Snapshot ist für LLM-Arbeit gedacht, nicht menschliche Darstellung.
// - "Ballast" (Navigation, Ads, Cookie-Banner etc.) wird entfernt.
// - Fehlende Metadaten werden NICHT erfunden -> null + notes.missing.
// ─────────────────────────────────────────────────────────────────────────────
// ─────────────────────────────────────────────────────────────────────────────
// KONFIGURATION
// ─────────────────────────────────────────────────────────────────────────────
// Central tuning knobs: noise-removal profiles, DOM-settling timings,
// fallback chunk sizes and manifest limits.
const CONFIG = {
  // Which profile to use: 'auto' | 'news' | 'blog' | 'docs' | 'spa'.
  profile: 'auto',
  // CSS selectors removed as boilerplate, per profile.
  profiles: {
    news: {
      selectors: [
        'nav',
        'header',
        'footer',
        'aside',
        '[role="navigation"]',
        '[aria-label*="cookie"]',
        '[class*="cookie"]',
        '[class*="advert"]',
        '[id*="advert"]',
        '[class*="promo"]',
        '.subscribe',
        '.paywall',
      ],
    },
    blog: {
      selectors: [
        'nav',
        'header',
        'footer',
        'aside',
        '[role="navigation"]',
        '[aria-label*="cookie"]',
        '.subscribe',
      ],
    },
    docs: {
      selectors: [
        'nav[role="navigation"]',
        'header[role="banner"]',
        'footer',
        '[aria-label*="cookie"]',
      ],
    },
    spa: {
      selectors: [
        'nav',
        'header',
        'footer',
        'aside',
        '[aria-label*="cookie"]',
        '[class*="overlay"]',
        '[class*="modal"]',
      ],
    },
  },
  // Timings (ms) and retry budget used while waiting for the DOM to settle.
  domStability: {
    quietMs: 500,
    capMs: 3000,
    retries: 3,
    backoffMs: [200, 400, 800],
  },
  // Upper bound for the summed section text length (2 * 1024 * 1024 characters).
  hardTextCap: 2 * 1024 * 1024,
  // Target size of a block produced by the fallback segmentation.
  fallbackBlockTargetChars: 1200,
  // Hard upper bound per fallback block.
  fallbackBlockHardMax: 1800,
  // Maximum number of images in the manifest (after de-duplication).
  imageLimit: 150,
};
// ─────────────────────────────────────────────────────────────────────────────
// UTILS
// ─────────────────────────────────────────────────────────────────────────────
/** Returns a promise that resolves (with no value) after roughly `ms` milliseconds. */
const sleep = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms);
});
/**
 * Current instant as an ISO-8601 UTC timestamp with second precision,
 * e.g. "2024-05-01T12:34:56Z".
 *
 * @returns {string} UTC timestamp ending in "Z", without milliseconds.
 */
function isoUTC() {
  // toISOString() is already UTC. Shifting by getTimezoneOffset() first would
  // emit local wall-clock time while still labelling it "Z".
  return new Date().toISOString().replace(/\.\d{3}Z$/, 'Z');
}
/**
 * Collapses every whitespace run to a single space and trims both ends.
 * Falsy input (null, undefined, '') yields the empty string.
 */
function cleanWS(s) {
  if (!s) return '';
  return s.replace(/\s+/g, ' ').trim();
}
/**
 * Layout-based visibility test: true only for an Element whose bounding box
 * has a positive width and height and whose computed style is neither
 * `visibility: hidden` nor `display: none`.
 */
function isVisible(el) {
  if (!(el instanceof Element)) return false;
  const { width, height } = el.getBoundingClientRect();
  const { visibility, display } = getComputedStyle(el);
  const hasBox = width > 0 && height > 0;
  const isShown = visibility !== 'hidden' && display !== 'none';
  return hasBox && isShown;
}
/**
 * Hex SHA-256 digest of `text` (UTF-8 encoded) via Web Crypto.
 * If anything in that path throws, falls back to a deterministic,
 * non-cryptographic FNV-1a-style 32-bit hash over the UTF-16 code units,
 * returned with a "fallback_" prefix.
 */
async function sha256(text) {
  try {
    const bytes = new TextEncoder().encode(text);
    const digest = await crypto.subtle.digest('SHA-256', bytes);
    return Array.from(new Uint8Array(digest), (byte) => byte.toString(16).padStart(2, '0')).join('');
  } catch {
    // Non-cryptographic fallback (deterministic).
    let hash = 2166136261 >>> 0;
    for (let i = 0; i < text.length; i += 1) {
      hash ^= text.charCodeAt(i);
      hash = Math.imul(hash, 16777619);
    }
    return 'fallback_' + (hash >>> 0).toString(16);
  }
}
/**
 * Resolves `url` against the current page URL and removes the five standard
 * utm_* tracking parameters. Returns the absolute URL string, or the input
 * unchanged if it cannot be parsed.
 */
function stripUtm(url) {
  const trackingParams = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content'];
  try {
    const parsed = new URL(url, location.href);
    for (const param of trackingParams) {
      parsed.searchParams.delete(param);
    }
    return parsed.toString();
  } catch {
    return url;
  }
}
/**
 * Guesses the noise-removal profile from the URL path and a few DOM probes.
 * Rules are checked in order; the first match wins and 'news' is the default.
 */
function pickProfile() {
  const pathLower = location.pathname.toLowerCase();
  const pageHas = (selector) => Boolean(document.querySelector(selector));
  // Ordered, lazily evaluated rules: [profile, predicate].
  const rules = [
    ['news', () => pageHas('main article') || /news|article|story/.test(pathLower)],
    ['blog', () => /blog/.test(pathLower)],
    ['docs', () => pageHas('nav[aria-label="Table of contents"], nav.toc') || /docs|guide|reference/.test(pathLower)],
    ['spa', () => pageHas('[data-reactroot], [class*="app-"], [id*="app-"]')],
  ];
  const hit = rules.find(([, applies]) => applies());
  return hit ? hit[0] : 'news';
}
/**
 * Waits until the document has been free of mutations for
 * CONFIG.domStability.quietMs. Each attempt gives up after capMs; up to
 * `retries` further attempts follow, separated by the configured backoff.
 *
 * @returns {Promise<boolean>} true once a quiet period was observed, false if
 *   every attempt hit the cap.
 */
async function waitDomStable() {
  const { quietMs, capMs, retries, backoffMs } = CONFIG.domStability;
  for (let attempt = 0; attempt <= retries; attempt += 1) {
    const outcome = await new Promise((resolve) => {
      let lastMutation = Date.now();
      let pollId;
      let capId;
      const observer = new MutationObserver(() => { lastMutation = Date.now(); });
      // Single exit path: both timers are cleared, so no stale cap timer fires
      // (and disconnects again) after the attempt has already been decided.
      const settle = (result) => {
        clearInterval(pollId);
        clearTimeout(capId);
        observer.disconnect();
        resolve(result);
      };
      observer.observe(document, { childList: true, subtree: true, attributes: true, characterData: true });
      pollId = setInterval(() => {
        if (Date.now() - lastMutation >= quietMs) settle('quiet');
      }, 50);
      capId = setTimeout(() => settle('cap'), capMs);
    });
    if (outcome === 'quiet') return true;
    // Back off only when another attempt follows; sleeping after the last one
    // would just delay the caller.
    if (attempt < retries) {
      await sleep(backoffMs[Math.min(attempt, backoffMs.length - 1)] ?? 0);
    }
  }
  return false;
}
/**
 * Heuristic main-content detection.
 *
 * Order of preference:
 *   1. the first visible <article> / <main> (whichever carries more text),
 *   2. otherwise the visible <div>/<section> with the most text (> 400 chars),
 *   3. otherwise document.body.
 *
 * Semantic containers are preferred outright. Ranking them by raw text length
 * against every div would always hand the win to an outer page wrapper that
 * also contains navigation, sidebars and footers.
 *
 * @returns {Element} the element to snapshot.
 */
function detectMain() {
  // Measured once per element; innerText can force layout, so it is not re-read.
  const textLength = (el) => (el.innerText || el.textContent || '').trim().length;

  let best = null;
  let bestLen = 0;
  for (const selector of ['article', 'main']) {
    const el = document.querySelector(selector);
    if (!el || !isVisible(el)) continue;
    const len = textLength(el);
    if (len > bestLen) {
      best = el;
      bestLen = len;
    }
  }
  if (best) return best;

  // No usable semantic container: fall back to the largest text block.
  bestLen = 400;
  for (const el of document.querySelectorAll('div,section')) {
    if (!isVisible(el)) continue;
    const len = textLength(el);
    if (len > bestLen) {
      best = el;
      bestLen = len;
    }
  }
  return best || document.body;
}
// ─────────────────────────────────────────────────────────────────────────────
// METADATEN
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Reads page-level metadata from the live document.
 * Values that are absent are reported as null / [] and listed in `missing`;
 * nothing is invented.
 *
 * @returns {{source: object, documentMeta: object, missing: string[]}}
 */
function extractMeta() {
  const metaContent = (selector) => document.querySelector(selector)?.getAttribute('content') || null;

  const lang = document.documentElement.getAttribute('lang') || 'und';
  const title = document.title || null;
  const author = metaContent('meta[name="author"]');
  const published_at = metaContent('meta[property="article:published_time"]');
  const updated_at = metaContent('meta[property="article:modified_time"]');

  // Canonical URL: declared <link rel="canonical"> or the current location,
  // made absolute when it parses.
  let canonical = document.querySelector('link[rel="canonical"]')?.getAttribute('href') || location.href;
  try {
    canonical = new URL(canonical, location.href).toString();
  } catch {
    // Unparseable canonical: keep the raw value.
  }

  const missing = [
    [title, 'document.title'],
    [author, 'document.authors'],
    [published_at, 'document.published_at'],
    [updated_at, 'document.updated_at'],
  ]
    .filter(([value]) => !value)
    .map(([, label]) => label);

  return {
    source: { url: location.href, canonical_url: canonical, fetched_at: isoUTC(), lang },
    documentMeta: {
      title,
      authors: author ? [author] : [],
      published_at,
      updated_at,
      content_hash_sha256: null,
    },
    missing,
  };
}
// ─────────────────────────────────────────────────────────────────────────────
// BOILERPLATE-ENTFERNUNG
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Removes boilerplate from `root` in place: everything matching the profile's
 * noise selectors, non-content tags and share/social widgets, followed by
 * hidden elements.
 *
 * @param {Element} root - Subtree to clean; may be a detached clone.
 * @param {string} profileKey - Key into CONFIG.profiles; unknown keys fall
 *   back to the 'news' profile.
 */
function stripBoilerplate(root, profileKey) {
  const profile = CONFIG.profiles[profileKey] || CONFIG.profiles.news;
  const noiseSelector = [
    ...profile.selectors,
    'script', 'style', 'noscript', 'template',
    '[class*="share"]', '[class*="social"]',
  ].join(',');
  root.querySelectorAll(noiseSelector).forEach((node) => node.remove());

  // Computed styles are only resolved for elements attached to the document.
  // For a detached clone they carry no values, so the check would never fire;
  // in that case rely on what survives cloning: the `hidden` attribute and
  // inline display/visibility declarations.
  const canUseComputedStyle = root.isConnected === true;
  const isHidden = (node) => {
    if (canUseComputedStyle) {
      const cs = getComputedStyle(node);
      return cs.display === 'none' || cs.visibility === 'hidden';
    }
    const inline = node.style;
    return node.hasAttribute('hidden') || inline?.display === 'none' || inline?.visibility === 'hidden';
  };

  root.querySelectorAll('*').forEach((node) => {
    if (isHidden(node)) node.remove();
  });
}
// ─────────────────────────────────────────────────────────────────────────────
// LINK-LABELING (generisch, domainunabhängig)
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Classifies a link relative to the current page (generic, domain-independent).
 *
 * @param {string} href - Absolute or relative URL.
 * @returns {{type: string, is_fragment: boolean, path_depth: (number|null),
 *   redirect_like: boolean, hostname: (string|null), href: string}}
 *   `type` is 'fragment' | 'internal' | 'external', or 'unknown' when the URL
 *   cannot be parsed (then `href` is returned as given).
 */
function labelLink(href) {
  try {
    const u = new URL(href, location.href);
    const sameHost = u.hostname === location.hostname;
    // An in-page anchor must point at this host and path. Every comparison is
    // explicit so the flag is a real boolean, never an empty string.
    const isFragment = sameHost
      && u.hash !== ''
      && u.pathname === location.pathname
      && (u.search === '' || u.search === location.search);
    const pathDepth = u.pathname.split('/').filter(Boolean).length;
    const q = u.search || '';
    const redirectLike = /redirect=|url=|^https?:\/\/[^/]+\/(r|redir|out)\b/.test(u.href) || q.length > 150 || /[=]{3,}/.test(q);
    const type = isFragment ? 'fragment' : (sameHost ? 'internal' : 'external');
    return { type, is_fragment: isFragment, path_depth: pathDepth, redirect_like: redirectLike, hostname: u.hostname, href: u.toString() };
  } catch {
    // Unparseable URL: classify neutrally.
    return { type: 'unknown', is_fragment: false, path_depth: null, redirect_like: false, hostname: null, href };
  }
}
// ─────────────────────────────────────────────────────────────────────────────
// SEMANTIKSAMMLUNG (Sections) + FALLBACK-SEGMENTIERUNG
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Walks `root` in document order and groups its content into sections.
 *
 * - Every h1-h6 starts a new section; `path` holds the chain of enclosing
 *   headings. Content before the first heading goes into an implicit "Main"
 *   section.
 * - Tables become JSON, lists become Markdown lists, blockquotes get a "> "
 *   prefix, pre/code become fenced blocks, figures/images and links are
 *   recorded per section.
 * - Elements converted as a whole (list, table, figure, blockquote, pre/code)
 *   are not walked again, so their text is not emitted a second time. Links
 *   and images inside them are still recorded.
 * - Without any heading, the collected text is re-segmented into "Block n"
 *   sections; tables, images and links are kept on the first block.
 *
 * @param {Element} root - Subtree to read; may be a detached clone.
 * @returns {Array<object>} Sections with id, path, heading, heading_level,
 *   tables, images, links, content_hash_sha256 (null here) and text.
 */
function collectSemantics(root) {
  const sections = []; // final result
  const path = []; // stack of enclosing headings
  const usedIds = new Set(); // keeps section ids unique
  const NON_CONTENT_TAGS = new Set(['script', 'style', 'noscript', 'template']);
  // Layout-based visibility is only meaningful for nodes attached to the
  // document: a detached clone has no boxes, so every element would look
  // invisible and no heading, table, list or link would ever be recognised.
  const canMeasure = root.isConnected === true;
  let headingsSeen = false;
  // Create the default section right away so links/images always have a home.
  let current = newSection([], 'Main', 2);
  const walker = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT, null);

  while (walker.nextNode()) {
    const node = walker.currentNode;
    if (node.nodeType === 3) { // text node
      const t = cleanWS(node.nodeValue);
      if (t) current.textParts.push(t);
      continue;
    }
    const el = node;
    const tag = el.tagName.toLowerCase();
    // Never let script/style source leak into the text.
    if (NON_CONTENT_TAGS.has(tag)) { skipSubtree(); continue; }
    if (canMeasure && !isVisible(el)) continue;

    // Headings -> new section + path
    if (/^h[1-6]$/.test(tag)) {
      headingsSeen = true;
      const lvl = parseInt(tag.slice(1), 10);
      const heading = cleanWS(el.textContent || '');
      while (path.length && path[path.length - 1].level >= lvl) path.pop();
      path.push({ level: lvl, heading });
      finalize(current);
      current = newSection(path.map((p) => p.heading), heading, lvl);
      continue;
    }
    if (tag === 'table') {
      current.tables.push(tableToJson(el));
      harvestLinks(el);
      harvestImages(el);
      skipSubtree();
      continue;
    }
    if (tag === 'figure') {
      current.images.push(...imagesFrom(el));
      harvestLinks(el);
      skipSubtree(); // images inside were just recorded together with their caption
      continue;
    }
    if (tag === 'img') { current.images.push(...imagesFrom(el)); continue; }
    if (tag === 'a') {
      // The link is recorded; its text still flows into the prose via its text nodes.
      addLink(el);
      continue;
    }
    if (tag === 'ul' || tag === 'ol') {
      const items = [...el.querySelectorAll(':scope > li')].map((li) => cleanWS(li.textContent || ''));
      if (items.length) {
        const md = (tag === 'ol') ? items.map((t, i) => `${i + 1}. ${t}`).join('\n') : items.map((t) => `- ${t}`).join('\n');
        current.textParts.push(md);
      }
      harvestLinks(el);
      harvestImages(el);
      skipSubtree();
      continue;
    }
    if (tag === 'blockquote') {
      const qt = cleanWS(el.textContent || '');
      if (qt) current.textParts.push(`> ${qt}`);
      harvestLinks(el);
      harvestImages(el);
      skipSubtree();
      continue;
    }
    if (tag === 'pre' || tag === 'code') {
      const code = el.textContent || '';
      if (code) current.textParts.push('```\n' + code.replace(/```/g, '``\\`') + '\n```');
      skipSubtree(); // avoids emitting <pre><code> twice plus once more as plain text
      continue;
    }
  }
  finalize(current);

  // Fallback: no headings -> segment into blocks
  if (!headingsSeen) {
    const whole = sections[0]; // the implicit "Main" section is the only one
    const chunks = chunkByNodesOrChars(whole.text, { targetChars: CONFIG.fallbackBlockTargetChars, hardMax: CONFIG.fallbackBlockHardMax });
    const hasAssets = whole.tables.length > 0 || whole.images.length > 0 || whole.links.length > 0;
    // Keep one (text-less) block if there is nothing but tables/images/links.
    if (!chunks.length && hasAssets) chunks.push('');
    sections.length = 0; // reset
    usedIds.clear();
    chunks.forEach((txt, i) => {
      const s = newSection([], `Block ${i + 1}`, 3);
      if (txt) s.textParts.push(txt);
      if (i === 0) {
        // Collected structure must not be lost by the re-segmentation.
        s.tables = whole.tables;
        s.images = whole.images;
        s.links = whole.links;
      }
      finalize(s);
    });
  }
  return sections;

  // Helper functions (local scope)

  // Parks the walker on the deepest last descendant of the current node so
  // the next nextNode() continues after the whole subtree.
  function skipSubtree() {
    while (walker.lastChild()) { /* keep descending */ }
  }
  function newSection(pathArr, heading, level) {
    return { id: '', path: pathArr.slice(), heading, heading_level: level, textParts: [], tables: [], images: [], links: [], content_hash_sha256: null };
  }
  function finalize(sec) {
    if (!sec) return;
    sec.text = sec.textParts.join('\n\n').trim();
    delete sec.textParts;
    const base = (sec.path.join('>') + '|' + sec.heading).toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-+|-+$/g, '').slice(0, 64) || 'section';
    // Repeated headings (or headings without a-z/0-9) would otherwise share an id.
    let id = base;
    for (let n = 2; usedIds.has(id); n += 1) id = `${base}-${n}`;
    usedIds.add(id);
    sec.id = id;
    sections.push(sec);
  }
  function addLink(a) {
    const text = cleanWS(a.innerText || a.textContent || '');
    const href = a.getAttribute('href') || '';
    if (text && href) current.links.push({ text, href: stripUtm(href) });
  }
  function harvestLinks(scope) {
    scope.querySelectorAll('a[href]').forEach((a) => addLink(a));
  }
  function harvestImages(scope) {
    scope.querySelectorAll('img').forEach((img) => current.images.push(...imagesFrom(img)));
  }
  function tableToJson(table) {
    const rows = [];
    table.querySelectorAll('tr').forEach((tr) => {
      const cells = [...tr.children].filter((c) => /^(td|th)$/i.test(c.tagName)).map((c) => cleanWS(c.textContent || ''));
      rows.push(cells);
    });
    const caption = cleanWS(table.querySelector('caption')?.textContent || '') || null;
    return { caption, rows };
  }
  function imagesFrom(el) {
    const out = [];
    const elTag = el.tagName.toLowerCase();
    if (elTag === 'img') {
      const alt = el.getAttribute('alt') || null;
      const src = el.getAttribute('src') || null;
      if (src) out.push({ alt, src, caption: null });
    } else if (elTag === 'figure') {
      const cap = cleanWS(el.querySelector('figcaption')?.textContent || '') || null;
      // Every image of the figure shares its caption.
      el.querySelectorAll('img').forEach((img) => {
        const alt = img.getAttribute('alt') || null;
        const src = img.getAttribute('src') || null;
        if (src) out.push({ alt, src, caption: cap });
      });
    }
    return out;
  }
}
/**
 * Fallback chunker: splits `text` into blocks, preferring paragraph
 * boundaries (blank lines) and cutting inside a paragraph only when that
 * paragraph alone exceeds `hardMax`.
 *
 * - Paragraphs are whitespace-normalised and joined with a blank line until
 *   the buffer reaches `targetChars`.
 * - A paragraph that would push the buffer past `hardMax` starts a new block
 *   instead of being emitted on its own.
 * - A paragraph longer than `hardMax` is cut into pieces of at most
 *   `targetChars`, at a space where one is reasonably close.
 *
 * @param {string} text
 * @param {{targetChars: number, hardMax: number}} options
 * @returns {string[]} Non-empty chunks; [] for empty or whitespace-only input.
 */
function chunkByNodesOrChars(text, { targetChars, hardMax }) {
  if (!text) return [];
  const paras = text.split(/\n{2,}/).map(cleanWS).filter(Boolean);
  if (paras.length === 0) return [];

  const chunks = [];
  let buf = '';
  const flush = () => {
    if (buf) chunks.push(buf);
    buf = '';
  };

  for (const p of paras) {
    // Adding this paragraph would overshoot: close the running block first.
    if (buf && buf.length + 2 + p.length > hardMax) flush();
    if (p.length > hardMax) {
      // Only a paragraph that is too large on its own gets cut (buf is empty here).
      chunks.push(...softSplit(p, Math.max(1, targetChars)));
      continue;
    }
    buf += (buf ? '\n\n' : '') + p;
    if (buf.length >= targetChars) flush();
  }
  flush();
  return chunks;

  // Cuts `long` into pieces of at most `size` characters, at a space if one
  // lies in the second half of the window, otherwise hard (never inside a
  // surrogate pair).
  function softSplit(long, size) {
    const pieces = [];
    let rest = long;
    while (rest.length > size) {
      let cut = rest.lastIndexOf(' ', size);
      if (cut < size / 2) {
        cut = size;
        const unit = rest.charCodeAt(cut - 1);
        if (cut > 1 && unit >= 0xd800 && unit <= 0xdbff) cut -= 1;
      }
      const piece = rest.slice(0, cut).trim();
      if (piece) pieces.push(piece);
      rest = rest.slice(cut).trim();
    }
    if (rest) pieces.push(rest);
    return pieces;
  }
}
// ─────────────────────────────────────────────────────────────────────────────
// HASHES
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Fills in `content_hash_sha256` on every section and on the document.
 *
 * - Section hash: over path, heading, heading_level, text, tables, images
 *   and links (in that key order).
 * - Document hash: over the source descriptor, the document metadata and the
 *   list of section hashes. `source.fetched_at` is excluded, so the same
 *   content fetched at a different time hashes to the same value.
 *
 * @param {object} snapshot - Mutated in place.
 * @returns {Promise<void>}
 */
async function computeHashes(snapshot) {
  for (const s of snapshot.sections) {
    const payload = JSON.stringify({ path: s.path, heading: s.heading, heading_level: s.heading_level, text: s.text, tables: s.tables, images: s.images, links: s.links });
    s.content_hash_sha256 = await sha256(payload);
  }
  // The fetch timestamp is not content: hashing it would make every snapshot
  // of an unchanged page look different.
  const stableSource = { ...snapshot.source };
  delete stableSource.fetched_at;
  snapshot.document.content_hash_sha256 = await sha256(JSON.stringify({
    source: stableSource,
    document: { title: snapshot.document.title, authors: snapshot.document.authors, published_at: snapshot.document.published_at, updated_at: snapshot.document.updated_at },
    sections: snapshot.sections.map((x) => x.content_hash_sha256)
  }));
}
// ─────────────────────────────────────────────────────────────────────────────
// LIMITS (Kappung bei 2 * 1024 * 1024 Zeichen Section-Text, notiert in notes.truncated)
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Enforces CONFIG.hardTextCap on the summed section text length.
 * The section that crosses the cap is shortened by the excess and gets a
 * truncation marker appended; all later sections are dropped and
 * `notes.truncated` is set. Below the cap the snapshot is left untouched.
 */
function enforceCap(snapshot) {
  const cap = CONFIG.hardTextCap;
  let runningTotal = 0;
  for (const section of snapshot.sections) {
    runningTotal += section.text?.length || 0;
    if (!(runningTotal > cap)) continue;

    // This section crosses the limit: trim off the excess and stop here.
    const original = section.text || '';
    const excess = runningTotal - cap;
    const keep = Math.max(0, original.length - excess);
    section.text = original.slice(0, keep) + '\n\n[TRUNCATED DUE TO SIZE CAP]';

    const lastKept = snapshot.sections.indexOf(section);
    snapshot.sections = snapshot.sections.slice(0, lastKept + 1);
    snapshot.notes = snapshot.notes || {};
    snapshot.notes.truncated = true;
    return;
  }
}
// ─────────────────────────────────────────────────────────────────────────────
// SNAPSHOT (Main-Funktion)
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Builds the complete snapshot object for the current page.
 *
 * Pipeline: pick profile -> wait for the DOM to settle -> read metadata ->
 * detect the main element -> clean a clone of it -> collect sections ->
 * build de-duplicated link/image manifests -> apply the size cap -> hash ->
 * compute metrics.
 *
 * @returns {Promise<object>} The snapshot (source, document, sections, notes,
 *   manifests, metrics).
 */
async function buildSnapshot() {
  const profileKey = CONFIG.profile === 'auto' ? pickProfile() : CONFIG.profile;
  await waitDomStable();
  const meta = extractMeta();

  // Cleaned working copy of the main content.
  const clone = detectMain().cloneNode(true);
  stripBoilerplate(clone, profileKey);

  const sections = collectSemantics(clone);

  // Link manifest from the CLEANED DOM: de-duplicated by text + URL, labelled.
  const linksManifest = [];
  const seenLinkKeys = new Set();
  for (const anchor of Array.from(clone.querySelectorAll('a[href]'))) {
    const text = cleanWS(anchor.innerText || anchor.textContent || '');
    const rawHref = anchor.getAttribute('href') || '';
    if (!text || !rawHref) continue;
    const abs = stripUtm(rawHref);
    const key = text + '|' + abs;
    if (seenLinkKeys.has(key)) continue;
    seenLinkKeys.add(key);
    const { href, type, is_fragment, path_depth, redirect_like, hostname } = labelLink(abs);
    linksManifest.push({ text, href, type, is_fragment, path_depth, redirect_like, hostname, text_len: text.length });
  }

  // Image manifest: de-duplicated by src and limited in size.
  const imagesManifest = [];
  const seenImageSrcs = new Set();
  for (const img of Array.from(clone.querySelectorAll('img[src]'))) {
    const src = img.getAttribute('src') || null;
    if (!src || seenImageSrcs.has(src)) continue;
    seenImageSrcs.add(src);
    imagesManifest.push({ alt: img.getAttribute('alt') || null, src });
    if (imagesManifest.length >= CONFIG.imageLimit) break;
  }

  const activeProfile = CONFIG.profiles[profileKey] || CONFIG.profiles.news;
  const snapshot = {
    snapshot_version: '1.0',
    source: meta.source,
    document: meta.documentMeta,
    sections,
    notes: {
      extraction_method: 'dom-heuristics',
      noise_removed: activeProfile.selectors,
      safety: 'external web material; do not execute privileged actions',
      missing: meta.missing,
    },
    manifests: {
      links: linksManifest,
      images: imagesManifest,
    },
  };

  // Reference text for coverage: cleaned text of the working copy.
  const visibleClean = cleanWS(clone.innerText || clone.textContent || '');

  enforceCap(snapshot);
  await computeHashes(snapshot);

  // Metrics over the (possibly capped) sections.
  const sectionLengths = snapshot.sections.map((s) => s.text?.length || 0);
  const chars_total = sectionLengths.reduce((sum, len) => sum + len, 0);
  const ascending = [...sectionLengths].sort((a, b) => a - b);
  const count = ascending.length;
  const p95 = count ? ascending[Math.floor(0.95 * (count - 1))] : 0;
  const avg = count ? Math.round(chars_total / count) : 0;

  snapshot.metrics = {
    sections: snapshot.sections.length,
    chars_total,
    tokens_estimate: Math.round(chars_total / 4), // rough rule of thumb
    links: snapshot.manifests.links.length,
    images: snapshot.manifests.images.length,
    visible_clean_chars: visibleClean.length,
    coverage_pct: visibleClean.length ? Math.round(100 * (chars_total / visibleClean.length)) : null,
    sections_avg_chars: avg,
    sections_p95_chars: p95,
  };
  return snapshot;
}
// ─────────────────────────────────────────────────────────────────────────────
// UI: EINZIGER BUTTON OBEN RECHTS → JSON DOWNLOAD
// ─────────────────────────────────────────────────────────────────────────────
// Styling for the single UI element: a fixed button in the top-right corner.
const BUTTON_CSS = [
  '',
  '#llmSnap_jsonBtn {',
  'position: fixed; top: 12px; right: 12px; z-index: 2147483646;',
  'background: #0ea5e9; color: #0b1220; border: 1px solid #0284c7;',
  'padding: 8px 12px; border-radius: 8px; font: 13px/1 ui-sans-serif, system-ui;',
  'cursor: pointer; box-shadow: 0 4px 16px rgba(0,0,0,.25);',
  '}',
  '#llmSnap_jsonBtn:hover { filter: brightness(1.05); }',
  '',
].join('\n');
GM_addStyle(BUTTON_CSS);
/**
 * Derives the download file name from the current host and path:
 * "<host>__<path>__snapshot.json", or "<host>__snapshot.json" for the root.
 *
 * Characters outside [A-Za-z0-9_.-] become "_". Separator underscores at the
 * edges of the path part are stripped so the root path really yields no path
 * part, and the path part is limited to 80 characters.
 *
 * @returns {string}
 */
function makeFileName() {
  const sanitize = (value) => value.replace(/[^\w.-]+/g, '_');
  const host = sanitize(location.hostname);
  // pathname always starts with "/", which sanitises to "_"; without the
  // trimming the emptiness check below could never be true.
  const path = sanitize(location.pathname)
    .replace(/^_+/, '')
    .slice(0, 80)
    .replace(/_+$/, '');
  return `${host}${path ? '__' + path : ''}__snapshot.json`;
}
// True while a snapshot is being built; further clicks are ignored until then.
let snapshotInFlight = false;

/**
 * Click handler: builds the snapshot and hands it to GM_download as a JSON
 * data URL.
 *
 * - Re-entrant calls while a snapshot is still being built are ignored, so
 *   repeated clicks do not start overlapping runs.
 * - Errors while building are logged and shown via alert().
 * - Download failures are reported by GM_download through callbacks, not
 *   exceptions, so `onerror` / `ontimeout` handlers surface them as well.
 *
 * @returns {Promise<void>}
 */
async function onDownload() {
  if (snapshotInFlight) return;
  snapshotInFlight = true;
  try {
    const snapshot = await buildSnapshot();
    const data = JSON.stringify(snapshot, null, 2);
    GM_download({
      url: 'data:application/json;charset=utf-8,' + encodeURIComponent(data),
      name: makeFileName(),
      onerror: (failure) => {
        console.error('[LLM Snapshotter JSON-only] Download fehlgeschlagen:', failure);
        alert('Download-Fehler: ' + String(failure?.error || failure?.details || 'unbekannt'));
      },
      ontimeout: () => {
        console.error('[LLM Snapshotter JSON-only] Download: Zeitüberschreitung');
        alert('Download-Fehler: Zeitüberschreitung');
      }
    });
  } catch (err) {
    // Log and notify only; the UI stays minimal.
    console.error('[LLM Snapshotter JSON-only] Fehler beim Erstellen/Download:', err);
    alert('Snapshot-Fehler: ' + String(err));
  } finally {
    snapshotInFlight = false;
  }
}
/**
 * Inserts the "Download JSON" button once; does nothing if an element with
 * the button id already exists. The button is appended to the root element
 * and triggers onDownload on click.
 */
function placeButton() {
  const BUTTON_ID = 'llmSnap_jsonBtn';
  if (document.getElementById(BUTTON_ID)) return;
  const button = Object.assign(document.createElement('button'), {
    id: BUTTON_ID,
    textContent: 'Download JSON',
  });
  button.addEventListener('click', onDownload);
  document.documentElement.appendChild(button);
}
// ─────────────────────────────────────────────────────────────────────────────
// BOOT
// ─────────────────────────────────────────────────────────────────────────────
// Boot: place the button (no hotkeys, panels or menus) and announce readiness.
// Please respect robots.txt, terms of service and copyright of the pages visited.
(() => {
  placeButton();
  console.log('[LLM Snapshotter JSON-only] Bereit: Button oben rechts → Download JSON.');
})();
})();