Clastify PDF Exporter with OCR

Exports Clastify canvases to a PDF. When OCR is enabled, the user can draw boxes over content that should be excluded from OCR, remove boxes individually, and scroll while annotating; the script also applies an automatic OCR exclusion zone over the Clastify logo and removes dynamic banners.

このスクリプトの質問や評価の投稿はこちら通報はこちらへお寄せください
// ==UserScript==
// @name         Clastify PDF Exporter with OCR
// @namespace    https://www.youtube.com/watch?v=dQw4w9WgXcQ
// @version      1.0
// @description  Export Clastify canvases into a PDF, allowing user to draw boxes for content to exclude from OCR, with individual removal, scrolling, Clastify logo OCR exclusion zone, and dynamic banner removal.
// @match        https://www.clastify.com/*
// @grant        none
// @license      MIT
// ==/UserScript==

(function () {
  'use strict';

  // --- Config ---
  // Tunables for the PDF export, the OCR post-processing and the annotation UI.
  const DEFAULT_DEBUG_MODE = false;      // Initial value of the PDF export debug flag (show all OCR text/boxes)
  let debugPdfExport = DEFAULT_DEBUG_MODE; // Current debug flag; flipped at runtime by the Debug toolbar button
  const PAGE_MODE = 'native';           // 'fit' = A4 with scaling; 'native' = page size equals canvas size
  const CALIBRATION_SIZE = 1.4;       // multiplier for font size (vertical fit)
  const BASELINE_OFFSET = -0.2;       // fraction of bbox height to shift baseline
  const ZOOM_PERCENT = 100;           // force zoom for max-quality canvases
  const SCROLL_STEP = 600;            // Fallback auto-scroll step (px) used when no canvas is available to measure
  const SCROLL_DELAY_MS = 400;        // Pause after each auto-scroll step during export
  const RENDER_WAIT_MS = 1500;        // NOTE(review): presumably a render settle delay; not referenced in the visible part of the file
  const LOW_CONFIDENCE_THRESHOLD = 60; // Tesseract words below this confidence are 'low confidence'
  const ANNOTATION_Z_INDEX = 10000;    // Z-index for annotation overlays; toolbar/panel elements use this value + 10
  const EXPORT_SCALE = 1.75;           // Scale factor for high-res export buffer

  // New OCR Refinement Config
  const OUTLIER_Y_DEVIATION_FACTOR = 0.2; // Max allowed deviation from median Y-coordinates (as a fraction of median word height) for a word to count as an 'inlier'
  const LINE_HEIGHT_BUFFER_MULTIPLIER = 1.15; // Multiplier to slightly increase calculated line height for ascenders/descenders
  const MAX_LINE_HEIGHT_FACTOR_OF_MEDIAN_WORD_HEIGHT = 1.8; // Cap effectiveHeight to prevent extreme cases (e.g. 1.8x median word height)
  const FONT_SIZE_TOLERANCE = 0.85; // Factor: if text width < target * tolerance, grow; if > target / tolerance, shrink.
  const FONT_SIZE_MAX_GROWTH_FACTOR = 1.1; // Max factor to grow font size beyond initial height-based calculation
  const SPACE_FONT_SIZE_RATIO_TO_WORD = 0.5; // Font size of inserted spaces relative to the word's font size
  const SCROLL_DEBOUNCE_DELAY = 200; // Milliseconds to wait before executing scroll-triggered functions

  // This defines a rectangular area in the top-right corner of each canvas
  // that will be automatically excluded from OCR processing.
  // Coordinates are relative to the canvas's natural (unscaled) width and height.
  // The zone's left edge is computed as canvas.width - x_offset_from_right.
  const UPPER_RIGHT_EXCLUSION = {
    x_offset_from_right: 200, // Zone starts 200px in from the right edge of the canvas
    y_offset_from_top: 0,     // 0px from the top edge of the canvas
    width: 200,               // 200px wide
    height: 80,               // 80px tall
    description: 'Logo OCR Exclusion' // For debug logs if needed
  };

  // --- Global Annotation Storage ---
  // Stores annotations per original canvas (page).
  // Each entry in the array corresponds to a canvas index.
  // Each canvas's annotations is an array of { x, y, width, height, type: 'user_exclude_ocr', id: unique ID },
  // with x/y/width/height expressed in the canvas's natural (unscaled) pixel space.
  // Kept on `window` and reused if it already exists, so previously stored annotations are not discarded.
  window.clastifyPageAnnotations = window.clastifyPageAnnotations || [];
  let currentAnnotationType = 'user_exclude_ocr'; // Type tag stamped on every newly drawn annotation

  let isOcrEnabled = false; // Toggled by the OCR toolbar button; gates Tesseract loading and the OCR-only buttons
  let isExporting = false;  // True while an export is running; set to false by Cancel to abort it


  // --- UI Elements ---
  // Shared layout metrics (CSS pixels) for the fixed-position toolbar buttons.
  const BUTTON_SIZE = 140;
  const BUTTON_GAP = 10;

  // Green "Save as PDF" button pinned to the top-right corner of the viewport.
  const savePdfBtn = document.createElement('button');
  savePdfBtn.textContent = '💾 Save as PDF';
  Object.assign(savePdfBtn.style, {
    position: 'fixed',
    top: `${BUTTON_GAP}px`,
    right: `${BUTTON_GAP}px`,
    zIndex: ANNOTATION_Z_INDEX + 10,
    width: `${BUTTON_SIZE}px`,
    padding: '8px 12px',
    background: '#28a745',
    color: '#fff',
    border: 'none',
    borderRadius: '4px',
    cursor: 'pointer',
    boxShadow: '0 2px 6px rgba(0,0,0,0.2)',
    display: 'block',
    textAlign: 'center',
  });
  document.body.appendChild(savePdfBtn);

  // OCR on/off toggle, placed immediately to the left of the save button.
  const ocrToggleBtn = document.createElement('button');
  Object.assign(ocrToggleBtn.style, {
    position: 'fixed',
    top: `${BUTTON_GAP}px`,
    right: `${BUTTON_GAP * 2 + BUTTON_SIZE}px`,
    zIndex: ANNOTATION_Z_INDEX + 10,
    width: `${BUTTON_SIZE}px`,
    padding: '8px 12px',
    background: '#dc3545',
    color: '#fff',
    border: 'none',
    borderRadius: '4px',
    cursor: 'pointer',
    boxShadow: '0 2px 6px rgba(0,0,0,0.2)',
    display: 'block',
    textAlign: 'center',
  });

  /** Sync the OCR toggle's label and colour with the current `isOcrEnabled` flag. */
  function updateOcrBtn() {
    if (isOcrEnabled) {
      ocrToggleBtn.textContent = '📄OCR: On';
      ocrToggleBtn.style.background = '#28a745';
    } else {
      ocrToggleBtn.textContent = '📄OCR: Off';
      ocrToggleBtn.style.background = '#dc3545';
    }
  }
  updateOcrBtn();

  // Clicking flips OCR and shows/hides the buttons that only make sense with OCR on.
  ocrToggleBtn.addEventListener('click', () => {
    isOcrEnabled = !isOcrEnabled;
    updateOcrBtn();

    const ocrOnlyDisplay = isOcrEnabled ? 'block' : 'none';
    debugToggleBtn.style.display = ocrOnlyDisplay;
    annotateBtn.style.display = ocrOnlyDisplay;
    if (!isOcrEnabled) return;

    // Re-anchor the annotate button directly beneath the OCR toggle.
    annotateBtn.style.top = `${2 * BUTTON_GAP + 34}px`;
    annotateBtn.style.right = ocrToggleBtn.style.right;
  });
  document.body.appendChild(ocrToggleBtn);

  // Yellow "Exclude OCR" button; starts hidden and sits one row below the OCR toggle.
  const annotateBtn = document.createElement('button');
  annotateBtn.textContent = '✍️ Exclude OCR';
  Object.assign(annotateBtn.style, {
    position: 'fixed',
    top: `${2 * BUTTON_GAP + 34}px`,
    right: `${BUTTON_GAP * 2 + BUTTON_SIZE}px`,
    zIndex: ANNOTATION_Z_INDEX + 10,
    width: `${BUTTON_SIZE}px`,
    padding: '8px 12px',
    background: '#ffc107',
    color: '#333',
    border: 'none',
    borderRadius: '4px',
    cursor: 'pointer',
    boxShadow: '0 2px 6px rgba(0,0,0,0.2)',
    display: 'none',
    textAlign: 'center',
  });
  document.body.appendChild(annotateBtn);

  // Debug toggle, third button from the right; starts hidden.
  // Its background colour is set by updateDebugBtn() rather than in the static style.
  const debugToggleBtn = document.createElement('button');
  Object.assign(debugToggleBtn.style, {
    position: 'fixed',
    top: `${BUTTON_GAP}px`,
    right: `${BUTTON_GAP * 3 + BUTTON_SIZE * 2}px`,
    zIndex: ANNOTATION_Z_INDEX + 10,
    width: `${BUTTON_SIZE}px`,
    padding: '8px 12px',
    color: '#fff',
    border: 'none',
    borderRadius: '4px',
    cursor: 'pointer',
    boxShadow: '0 2px 6px rgba(0,0,0,0.2)',
    display: 'none',
    textAlign: 'center',
  });

  /** Sync the debug toggle's label and colour with the current `debugPdfExport` flag. */
  function updateDebugBtn() {
    const stateLabel = debugPdfExport ? 'On' : 'Off';
    debugToggleBtn.textContent = `🔧 Debug: ${stateLabel}`;
    if (debugPdfExport) {
      debugToggleBtn.style.background = '#28a745';
    } else {
      debugToggleBtn.style.background = '#dc3545';
    }
  }
  updateDebugBtn();

  debugToggleBtn.addEventListener('click', () => {
    debugPdfExport = !debugPdfExport;
    updateDebugBtn();
  });
  document.body.appendChild(debugToggleBtn);

  // Red "Cancel" button; occupies the save button's slot and starts hidden.
  const cancelBtn = document.createElement('button');
  cancelBtn.textContent = 'Cancel';
  Object.assign(cancelBtn.style, {
    position: 'fixed',
    top: '10px',
    right: `${BUTTON_GAP}px`,
    zIndex: ANNOTATION_Z_INDEX + 10,
    width: `${BUTTON_SIZE}px`,
    padding: '8px 12px',
    background: '#dc3545',
    color: '#fff',
    border: 'none',
    borderRadius: '4px',
    cursor: 'pointer',
    boxShadow: '0 2px 6px rgba(0,0,0,0.2)',
    display: 'none',
    textAlign: 'center',
  });

  // Cancelling clears the export flag, hides the progress box and restores the toolbar.
  cancelBtn.addEventListener('click', () => {
    isExporting = false;
    hideProgress();

    const ocrOnlyDisplay = isOcrEnabled ? 'block' : 'none';
    cancelBtn.style.display = 'none';
    savePdfBtn.style.display = 'block';
    ocrToggleBtn.style.display = 'block';
    debugToggleBtn.style.display = ocrOnlyDisplay;
    annotateBtn.style.display = ocrOnlyDisplay;
    if (!isOcrEnabled) return;

    // Re-anchor the annotate button directly beneath the OCR toggle.
    annotateBtn.style.top = `${2 * BUTTON_GAP + 34}px`;
    annotateBtn.style.right = ocrToggleBtn.style.right;
  });
  document.body.appendChild(cancelBtn);

  // Black status box in the bottom-right corner; hidden until updateProgress() shows it.
  // `pre-line` lets "\n" in progress messages render as line breaks.
  const progressBox = document.createElement('div');
  Object.assign(progressBox.style, {
    position: 'fixed',
    bottom: '10px',
    right: '10px',
    zIndex: ANNOTATION_Z_INDEX + 10,
    padding: '8px 12px',
    background: '#000',
    color: '#fff',
    borderRadius: '4px',
    fontSize: '13px',
    fontFamily: 'monospace',
    display: 'none',
    whiteSpace: 'pre-line',
  });
  document.body.appendChild(progressBox);

  // --- Annotation Control Panel ---
  // Compact fixed-width panel in the top-left corner, shown only while annotating.
  const annotationPanel = document.createElement('div');
  Object.assign(annotationPanel.style, {
    position: 'fixed',
    top: '10px',
    left: '10px',
    zIndex: ANNOTATION_Z_INDEX + 10,
    background: '#333',
    color: '#fff',
    padding: '8px',
    borderRadius: '6px',
    boxShadow: '0 2px 8px rgba(0,0,0,0.3)',
    display: 'none',
    fontFamily: 'sans-serif',
    fontSize: '12px',
    lineHeight: '1.4',
    width: '220px',
  });
  // Static markup: a heading, usage instructions and the two action buttons.
  annotationPanel.innerHTML = `
    <h4 style="margin-top:0; margin-bottom: 8px; color: #ffc107; font-size: 14px;">OCR Exclusion</h4>
    <p style="margin-bottom: 10px; font-size: 11px;">Draw rectangles over content you don't want to be OCR'd. It is a good idea to draw over any images, diagrams, and mathematical equations, as they confuse the OCR engine.</p>
    <div style="margin-bottom: 10px;">
        <button id="clearAllAnnotationsBtn" style="background:#dc3545; color:#fff; border:none; padding:5px 8px; border-radius:3px; cursor:pointer; width:100%; font-size: 12px;">Remove All Annotations</button>
    </div>
    <div>
        <button id="finishAnnotationBtn" style="background:#007bff; color:#fff; border:none; padding:6px 10px; border-radius:3px; cursor:pointer; width:100%; font-size: 13px;">Finish Annotating</button>
    </div>
  `;
  document.body.appendChild(annotationPanel);

  // Handles to the panel's two buttons, looked up within the panel itself.
  const clearAllAnnotationsBtn = annotationPanel.querySelector('#clearAllAnnotationsBtn');
  const finishAnnotationBtn = annotationPanel.querySelector('#finishAnnotationBtn');


  /**
   * Show the progress box and replace its text.
   * @param {string} msg - Status text to display.
   */
  function updateProgress(msg) {
    const { style } = progressBox;
    style.display = 'block';
    progressBox.textContent = msg;
  }

  /** Hide the progress box; its last message text is left in place. */
  function hideProgress() {
    Object.assign(progressBox.style, { display: 'none' });
  }

  // --- Load libraries dynamically ---
  /**
   * Inject a <script> element into <head> and wait for it to load.
   *
   * @param {string} src - URL of the script to load.
   * @returns {Promise<Event>} Resolves with the `load` event once the script has
   *   loaded; rejects with an Error if the script fails to load, so callers do
   *   not wait forever on a blocked or unreachable URL.
   */
  function loadScript(src) {
    return new Promise((resolve, reject) => {
      const s = document.createElement('script');
      s.src = src;
      s.onload = resolve;
      // Without this handler a failed load would leave the promise pending forever.
      s.onerror = () => reject(new Error(`Failed to load script: ${src}`));
      document.head.appendChild(s);
    });
  }

  /**
   * Load the third-party libraries the export needs, skipping any already
   * present on `window`. jsPDF is always required; Tesseract.js is fetched only
   * when OCR is enabled. Progress messages are shown while loading.
   * @returns {Promise<void>}
   */
  async function loadLibs() {
    const jsPdfMissing = !window.jspdf;
    if (jsPdfMissing) {
      updateProgress('Loading jsPDF library...');
      await loadScript('https://cdnjs.cloudflare.com/ajax/libs/jspdf/2.5.1/jspdf.umd.min.js');
    }

    // Evaluated after the jsPDF load so it reflects the state at this point.
    const tesseractNeeded = isOcrEnabled && !window.Tesseract;
    if (tesseractNeeded) {
      updateProgress('Loading Tesseract.js library...');
      await loadScript('https://cdn.jsdelivr.net/npm/tesseract.js@5/dist/tesseract.min.js');
    }

    updateProgress('Libraries loaded.');
  }

  // --- Filename helpers ---
  /**
   * Turn an arbitrary title into a string usable as a filename.
   *
   * Characters not allowed in common filesystems (< > : " / \ | ? *) are removed
   * and each run of whitespace becomes a single underscore.
   *
   * @param {*} str - Desired name; falsy values fall back to 'document', and
   *   other non-string values are converted with String().
   * @returns {string} The sanitized name. Never empty: if nothing survives
   *   sanitization (e.g. the input was only forbidden characters), 'document'
   *   is returned instead.
   */
  function sanitizeFilename(str) {
    const cleaned = String(str || 'document')
      .replace(/[<>:"/\\|?*]+/g, '')
      .replace(/\s+/g, '_');
    // A title made only of forbidden characters would otherwise yield ''.
    return cleaned || 'document';
  }

  /**
   * Read the page title with a trailing "| Clastify" suffix (any letter case)
   * and surrounding whitespace removed.
   * @returns {string} The cleaned title; empty when the page has no title.
   */
  function getDocTitle() {
    const rawTitle = document.title || '';
    const withoutSiteSuffix = rawTitle.replace(/\s*\|\s*Clastify\s*$/i, '');
    return withoutSiteSuffix.trim();
  }

  // --- High-res export buffer creation ---
  /**
   * Create an off-screen canvas that is `scale` times the size of
   * `originalCanvas` and draw the original onto it, stretched to fill.
   * @param {HTMLCanvasElement} originalCanvas - Canvas to copy from.
   * @param {number} scale - Multiplier applied to both width and height.
   * @returns {HTMLCanvasElement} The new, scaled canvas.
   */
  function createHighResCanvas(originalCanvas, scale) {
    const buffer = document.createElement('canvas');
    buffer.width = originalCanvas.width * scale;
    buffer.height = originalCanvas.height * scale;

    // Read the dimensions back from the buffer so the draw matches its actual size.
    const context = buffer.getContext('2d');
    context.drawImage(originalCanvas, 0, 0, buffer.width, buffer.height);

    return buffer;
  }

  // --- Annotation Mode Logic ---
  // Mutable state shared by the annotation-mode functions and mouse handlers below.
  let isAnnotating = false; // True after enterAnnotationMode(), false after exitAnnotationMode()
  let currentOverlay = null; // The overlay div a drag started on (set on mousedown, cleared on mouseup)
  let activeCanvasIndex = -1; // Index of the canvas whose overlay is hovered or being drawn on; -1 when none
  let startX, startY; // Mouse down coordinates relative to the overlay
  let drawingRect = null; // The dashed rectangle element for the in-progress drag, or null when not drawing
  let annotationIdCounter = 0; // Incremented for each saved annotation to build ids of the form `manual-<n>`

  /**
   * Switch the page into annotation mode.
   *
   * Hides the toolbar, shows the annotation panel and places one transparent
   * overlay on top of every <canvas> in the document. Each overlay shows the
   * annotations already stored for its page and captures the mouse events used
   * to draw new exclusion rectangles. If the page has no canvases, the user is
   * alerted and annotation mode is left again immediately.
   */
  function enterAnnotationMode() {
    isAnnotating = true;

    // Swap the toolbar for the annotation panel.
    [savePdfBtn, ocrToggleBtn, debugToggleBtn, annotateBtn].forEach((btn) => {
      btn.style.display = 'none';
    });
    annotationPanel.style.display = 'block';
    cancelBtn.style.display = 'none';

    const canvases = [...document.querySelectorAll('canvas')];
    if (canvases.length === 0) {
      alert('No canvases found for annotation. Scroll the document first.');
      exitAnnotationMode();
      return;
    }

    // Make sure every canvas has an annotation list (existing lists are kept).
    const store = window.clastifyPageAnnotations;
    while (store.length < canvases.length) {
      store.push([]);
    }

    for (const [index, canvas] of canvases.entries()) {
      const shown = canvas.getBoundingClientRect();

      const overlay = document.createElement('div');
      Object.assign(overlay.style, {
        position: 'absolute',
        // Viewport position + scroll offset = document position, so the
        // overlay stays glued to its canvas while the page scrolls.
        left: `${shown.left + window.scrollX}px`,
        top: `${shown.top + window.scrollY}px`,
        width: `${shown.width}px`,
        height: `${shown.height}px`,
        zIndex: ANNOTATION_Z_INDEX,
        cursor: 'crosshair',
        pointerEvents: 'auto',
      });
      overlay.dataset.canvasIndex = index;
      overlay.classList.add('clastify-annotation-overlay');

      // Show whatever was drawn on this page before.
      renderAnnotationsOnOverlay(overlay, index, canvas.width, canvas.height, shown.width, shown.height);

      overlay.addEventListener('mousedown', startDrawing);
      overlay.addEventListener('mousemove', drawRectangle);
      overlay.addEventListener('mouseup', stopDrawing);
      overlay.addEventListener('mouseenter', handleOverlayMouseEnter);
      overlay.addEventListener('mouseleave', handleOverlayMouseLeave);
      document.body.appendChild(overlay);
    }
  }

  /**
   * Leave annotation mode: restore the toolbar (OCR-only buttons are shown only
   * when OCR is enabled), hide the annotation panel and the cancel button, and
   * remove every annotation overlay together with its listeners.
   *
   * Any rectangle still mid-drag is discarded as well. A drag can be left
   * unfinished when the mouse button is released outside an overlay; without
   * this reset the stale `drawingRect` would make the next mousedown be ignored
   * and the next mouseup operate on an overlay that is no longer in the page.
   */
  function exitAnnotationMode() {
    isAnnotating = false;
    savePdfBtn.style.display = 'block';
    ocrToggleBtn.style.display = 'block';
    if (isOcrEnabled) {
      debugToggleBtn.style.display = 'block';
      annotateBtn.style.display = 'block';
      // Re-anchor the annotate button directly beneath the OCR toggle.
      annotateBtn.style.top = `${2 * BUTTON_GAP + 34}px`;
      annotateBtn.style.right = ocrToggleBtn.style.right;
    } else {
      debugToggleBtn.style.display = 'none';
      annotateBtn.style.display = 'none';
    }
    annotationPanel.style.display = 'none';
    cancelBtn.style.display = 'none';

    document.querySelectorAll('.clastify-annotation-overlay').forEach(overlay => {
      overlay.removeEventListener('mousedown', startDrawing);
      overlay.removeEventListener('mousemove', drawRectangle);
      overlay.removeEventListener('mouseup', stopDrawing);
      overlay.removeEventListener('mouseenter', handleOverlayMouseEnter);
      overlay.removeEventListener('mouseleave', handleOverlayMouseLeave);
      overlay.remove();
    });

    // Drop all per-drag state; the in-progress rectangle (if any) lived inside
    // an overlay that has just been removed.
    drawingRect = null;
    currentOverlay = null;
    activeCanvasIndex = -1; // Reset active index
  }

  /**
   * mouseenter handler for an annotation overlay: records which canvas the
   * pointer is currently over, read from the overlay's `data-canvas-index`.
   * @param {MouseEvent} event - Event whose `currentTarget` is the overlay.
   */
  function handleOverlayMouseEnter(event) {
    const { canvasIndex } = event.currentTarget.dataset;
    activeCanvasIndex = Number.parseInt(canvasIndex, 10);
  }

  /** mouseleave handler for an annotation overlay: no canvas is hovered any more. */
  function handleOverlayMouseLeave() {
    const NO_CANVAS = -1;
    activeCanvasIndex = NO_CANVAS;
  }

  /**
   * Delete one annotation from a page and, if that page's overlay and canvas
   * are both present in the document, redraw the overlay.
   * @param {number} pageIndex - Index of the page (canvas) the annotation belongs to.
   * @param {string} annotationId - `id` of the annotation to remove.
   */
  function removeAnnotation(pageIndex, annotationId) {
    const store = window.clastifyPageAnnotations;
    store[pageIndex] = store[pageIndex].filter((ann) => ann.id !== annotationId);

    const canvas = document.querySelectorAll('canvas')[pageIndex];
    const overlay = document.querySelector(`.clastify-annotation-overlay[data-canvas-index="${pageIndex}"]`);
    if (!overlay || !canvas) return;

    // Natural size comes from the canvas attributes, displayed size from layout.
    const shown = canvas.getBoundingClientRect();
    renderAnnotationsOnOverlay(overlay, pageIndex, canvas.width, canvas.height, shown.width, shown.height);
  }


  /**
   * Redraw every stored annotation of a page onto its overlay.
   *
   * The overlay's content is cleared first. Annotations are stored in the
   * canvas's natural pixel space and are scaled here to the overlay's displayed
   * size. Each rectangle gets a small round "X" button that removes it.
   *
   * @param {HTMLElement} overlay - Overlay element to draw into.
   * @param {number} pageIndex - Index into `window.clastifyPageAnnotations`.
   * @param {number} canvasNaturalWidth - Canvas width in its own pixel space.
   * @param {number} canvasNaturalHeight - Canvas height in its own pixel space.
   * @param {number} overlayDisplayWidth - Displayed width of the overlay.
   * @param {number} overlayDisplayHeight - Displayed height of the overlay.
   */
  function renderAnnotationsOnOverlay(overlay, pageIndex, canvasNaturalWidth, canvasNaturalHeight, overlayDisplayWidth, overlayDisplayHeight) {
    overlay.innerHTML = '';

    // Natural-to-displayed scale factors; the same for every rectangle.
    const scaleX = overlayDisplayWidth / canvasNaturalWidth;
    const scaleY = overlayDisplayHeight / canvasNaturalHeight;

    for (const ann of window.clastifyPageAnnotations[pageIndex] || []) {
      const box = document.createElement('div');
      Object.assign(box.style, {
        position: 'absolute',
        left: `${ann.x * scaleX}px`,
        top: `${ann.y * scaleY}px`,
        width: `${ann.width * scaleX}px`,
        height: `${ann.height * scaleY}px`,
        opacity: '0.7',
        // The box itself must not swallow the mouse events used for drawing.
        pointerEvents: 'none',
        border: `2px solid #9c27b0`,
      });

      const closeBtn = document.createElement('button');
      closeBtn.textContent = 'X';
      Object.assign(closeBtn.style, {
        position: 'absolute',
        top: '-10px',
        right: '-10px',
        background: '#dc3545',
        color: 'white',
        border: 'none',
        borderRadius: '50%',
        width: '20px',
        height: '20px',
        fontSize: '12px',
        lineHeight: '10px',
        textAlign: 'center',
        cursor: 'pointer',
        // Re-enable pointer events so the button stays clickable inside the inert box.
        pointerEvents: 'auto',
        zIndex: ANNOTATION_Z_INDEX + 1,
      });
      closeBtn.addEventListener('click', (clickEvent) => {
        clickEvent.stopPropagation(); // keep the click from reaching the overlay
        removeAnnotation(pageIndex, ann.id);
      });

      box.appendChild(closeBtn);
      overlay.appendChild(box);
    }
  }

  /**
   * Convert a mouse event's viewport position into coordinates relative to the
   * top-left corner of an overlay.
   * @param {MouseEvent} event - Event providing `clientX` / `clientY`.
   * @param {Element} overlay - Element the coordinates are made relative to.
   * @returns {{x: number, y: number}} Offset from the overlay's top-left corner.
   */
  function getOverlayRelativeCoords(event, overlay) {
    const { left, top } = overlay.getBoundingClientRect();
    return {
      x: event.clientX - left,
      y: event.clientY - top,
    };
  }

  /**
   * mousedown handler for an annotation overlay: begins a new rectangle.
   *
   * Ignored for anything but the left mouse button, and while another
   * rectangle is still being drawn. Remembers the overlay and the starting
   * point, then adds a zero-sized dashed rectangle at that point.
   * @param {MouseEvent} event - Event whose `currentTarget` is the overlay.
   */
  function startDrawing(event) {
    const isLeftButton = event.button === 0;
    if (!isLeftButton || drawingRect) return;

    currentOverlay = event.currentTarget;
    activeCanvasIndex = parseInt(currentOverlay.dataset.canvasIndex, 10);

    const origin = getOverlayRelativeCoords(event, currentOverlay);
    startX = origin.x;
    startY = origin.y;

    drawingRect = document.createElement('div');
    Object.assign(drawingRect.style, {
      position: 'absolute',
      border: `2px dashed #9c27b0`,
      opacity: '0.7',
      pointerEvents: 'none',
      left: `${origin.x}px`,
      top: `${origin.y}px`,
      width: '0px',
      height: '0px',
    });
    currentOverlay.appendChild(drawingRect);
  }

  /**
   * mousemove handler for an annotation overlay: stretches the in-progress
   * rectangle between the starting point and the current pointer position.
   * Does nothing when no rectangle is being drawn.
   * @param {MouseEvent} event - Current mouse event.
   */
  function drawRectangle(event) {
    if (!drawingRect) return;

    const { x: pointerX, y: pointerY } = getOverlayRelativeCoords(event, currentOverlay);

    // The drag may go in any direction, so take the smaller coordinate as the
    // origin and the absolute difference as the size.
    Object.assign(drawingRect.style, {
      left: `${Math.min(startX, pointerX)}px`,
      top: `${Math.min(startY, pointerY)}px`,
      width: `${Math.abs(pointerX - startX)}px`,
      height: `${Math.abs(pointerY - startY)}px`,
    });
  }

  /**
   * mouseup handler for an annotation overlay: finishes the rectangle being
   * drawn and, if it is large enough, stores it as an annotation.
   *
   * The page is taken from the overlay the drag STARTED on (its
   * `data-canvas-index`), not from `activeCanvasIndex`: the hover handlers
   * change that variable whenever the pointer crosses overlays mid-drag, which
   * would file the rectangle under the wrong page or index -1.
   *
   * Drag state is reset before any further work so that an early exit can never
   * leave a stale `drawingRect` behind. If the page's canvas can no longer be
   * found, or the overlay has no displayed size to scale against, the
   * rectangle is discarded.
   *
   * Coordinates are converted from overlay space to the canvas's natural pixel
   * space; rectangles of 5px or less in either dimension are discarded.
   *
   * @param {MouseEvent} event - The mouseup event.
   */
  function stopDrawing(event) {
    if (!drawingRect) return;

    const overlay = currentOverlay;
    const finishedRect = drawingRect;
    const pageIndex = parseInt(overlay.dataset.canvasIndex, 10);

    // Reset shared state first; everything below works on the local copies.
    drawingRect = null;
    currentOverlay = null;
    activeCanvasIndex = -1;

    const canvas = document.querySelectorAll('canvas')[pageIndex];
    // Current displayed dimensions of the overlay.
    const overlayRect = overlay.getBoundingClientRect();
    const overlayDisplayWidth = overlayRect.width;
    const overlayDisplayHeight = overlayRect.height;

    if (!canvas || !(overlayDisplayWidth > 0) || !(overlayDisplayHeight > 0)) {
      finishedRect.remove();
      return;
    }

    const coords = getOverlayRelativeCoords(event, overlay);
    const endX = coords.x;
    const endY = coords.y;

    const scaleX = canvas.width / overlayDisplayWidth;   // natural width vs displayed width
    const scaleY = canvas.height / overlayDisplayHeight; // natural height vs displayed height

    const x = Math.round(Math.min(startX, endX) * scaleX);
    const y = Math.round(Math.min(startY, endY) * scaleY);
    const width = Math.round(Math.abs(endX - startX) * scaleX);
    const height = Math.round(Math.abs(endY - startY) * scaleY);

    if (width > 5 && height > 5) { // Only save if a reasonable size
      annotationIdCounter++;
      if (!window.clastifyPageAnnotations[pageIndex]) {
        window.clastifyPageAnnotations[pageIndex] = [];
      }
      window.clastifyPageAnnotations[pageIndex].push({ x, y, width, height, type: currentAnnotationType, id: `manual-${annotationIdCounter}` });
      // Rerender the overlay to show the final, non-dashed rectangle with 'X' button
      renderAnnotationsOnOverlay(overlay, pageIndex, canvas.width, canvas.height, overlayDisplayWidth, overlayDisplayHeight);
    } else {
      finishedRect.remove(); // Remove small, insignificant draws
    }
  }

  // --- Bounding box utility functions (needed for annotation overlap check) ---
  /**
   * Normalize a word-like object into corner coordinates.
   *
   * Corners are read from `wObj.bbox` when present. Otherwise the top-left
   * falls back to `x`/`y`, then `left`/`top`, then 0, and the bottom-right is
   * derived from the top-left plus `width`/`height` (0 when absent).
   * @param {object} wObj - Object carrying a `bbox` and/or flat position fields.
   * @returns {{x0: number, y0: number, x1: number, y1: number}} Corner coordinates.
   */
  function getBbox(wObj) {
    const box = wObj.bbox;

    const x0 = box?.x0 ?? wObj.x ?? wObj.left ?? 0;
    const y0 = box?.y0 ?? wObj.y ?? wObj.top ?? 0;

    const fallbackX1 = x0 + (wObj.width ?? 0);
    const fallbackY1 = y0 + (wObj.height ?? 0);

    return {
      x0,
      y0,
      x1: box?.x1 ?? fallbackX1,
      y1: box?.y1 ?? fallbackY1,
    };
  }

  /**
   * Build an identifier for an OCR word that does not depend on the structure
   * of its parent line object.
   *
   * When the word carries a truthy `word_id`, the id is `p<page>-<word_id>`.
   * Otherwise it combines the page index, up to 20 characters of the word's
   * text (whitespace runs replaced by `_`) and the word's bounding-box corners.
   * @param {number} pageIndex - Index of the page the word is on.
   * @param {object} wordObj - OCR word object.
   * @returns {string} The generated id.
   */
  function generateRobustWordId(pageIndex, wordObj) {
    const pagePrefix = `p${pageIndex}`;

    const nativeId = wordObj.word_id;
    if (nativeId) {
      return `${pagePrefix}-${nativeId}`;
    }

    // Fallback: text snippet plus bounding-box corners.
    const corners = getBbox(wordObj);
    const rawText = wordObj.text || '';
    const textSnippet = rawText.substring(0, 20).replace(/\s+/g, '_');
    return `${pagePrefix}-${textSnippet}-${corners.x0}-${corners.y0}-${corners.x1}-${corners.y1}`;
  }

  /**
   * Median of a list of numbers.
   * @param {number[]} arr - Values in any order; the array is not modified.
   * @returns {number} The middle value, the mean of the two middle values for
   *   an even count, or 0 for an empty array.
   */
  function getMedian(arr) {
    const count = arr.length;
    if (count === 0) return 0;

    // Sort a copy numerically so the caller's array keeps its order.
    const ordered = [...arr].sort((lhs, rhs) => lhs - rhs);
    const upperMiddle = Math.floor(count / 2);

    if (count % 2 !== 0) {
      return ordered[upperMiddle];
    }
    return (ordered[upperMiddle - 1] + ordered[upperMiddle]) / 2;
  }


  // --- Main export function (modified to use annotations) ---
  async function exportPDF() {
    isExporting = true;

    // Hide all buttons and show cancel
    savePdfBtn.style.display = 'none';
    ocrToggleBtn.style.display = 'none';
    debugToggleBtn.style.display = 'none';
    annotateBtn.style.display = 'none';
    annotationPanel.style.display = 'none';
    cancelBtn.style.display = 'block';

    await loadLibs();
    const { jsPDF } = window.jspdf;

    if (!isExporting) return;

    // Step 2: Auto-scroll to trigger lazy loading
    updateProgress('Scrolling to load all pages...');
    const firstCanvas = document.querySelector('canvas');
    let dynamicStep = SCROLL_STEP; // fallback
    if (firstCanvas) {
      dynamicStep = firstCanvas.getBoundingClientRect().height + 20;
    }
    for (let y = 0; y < document.body.scrollHeight; y += dynamicStep) {
      if (!isExporting) break;
      window.scrollTo(0, y);
      await new Promise(r => setTimeout(r, SCROLL_DELAY_MS));
    }
    window.scrollTo(0, 0);

    if (!isExporting) {
      return;
    }

    // Step 3: Collect canvases
    const canvases = Array.from(document.querySelectorAll('canvas'));
    if (canvases.length === 0) {
      alert('No canvases found. Scroll the document and try again.');
      isExporting = false;
      cancelBtn.style.display = 'none';
      savePdfBtn.style.display = 'block';
      ocrToggleBtn.style.display = 'block';
      debugToggleBtn.style.display = isOcrEnabled ? 'block' : 'none';
      annotateBtn.style.display = isOcrEnabled ? 'block' : 'none';
      if (isOcrEnabled) {
        annotateBtn.style.top = `${2 * BUTTON_GAP + 34}px`;
        annotateBtn.style.right = ocrToggleBtn.style.right;
      }
      return;
    }

    // Ensure annotations array matches canvases length, fill with empty arrays if needed
    if (window.clastifyPageAnnotations.length < canvases.length) {
        for (let k = window.clastifyPageAnnotations.length; k < canvases.length; k++) {
            window.clastifyPageAnnotations.push([]);
        }
    }

    if (!isExporting) {
      return;
    }

    // Step 4: Create PDF
    let pdf;
    if (PAGE_MODE === 'fit') {
      pdf = new jsPDF({ unit: 'px', format: 'a4' });
    } else {
      const first = canvases[0];
      pdf = new jsPDF({ unit: 'px', format: [first.width, first.height] });
    }

    let invisibleState = null;
    if (isOcrEnabled) {
      const hasGState = pdf.GState ? true : false;
      invisibleState = hasGState ? new pdf.GState({ opacity: 0 }) : null;
    }

    for (let i = 0; i < canvases.length; i++) {
      if (!isExporting) break;
      const c = canvases[i];
      updateProgress(`Processing page ${i + 1} of ${canvases.length}...\n(Rendering image)`);

      // Create high-res buffer for export
      const highResCanvas = createHighResCanvas(c, EXPORT_SCALE);

      let pageWidth, pageHeight, xOffset, yOffset, imgW, imgH, ratio;

      if (PAGE_MODE === 'fit') {
        pageWidth = pdf.internal.pageSize.getWidth();
        pageHeight = pdf.internal.pageSize.getHeight();
        ratio = Math.min(pageWidth / c.width, pageHeight / c.height);
        imgW = Math.round(c.width * ratio);
        imgH = Math.round(c.height * ratio);
        xOffset = Math.round((pageWidth - imgW) / 2);
        yOffset = Math.round((pageHeight - imgH) / 2);

        if (i > 0) pdf.addPage();
        pdf.addImage(highResCanvas.toDataURL('image/png'), 'PNG', xOffset, yOffset, imgW, imgH);
      } else { // 'native' mode
        if (i > 0) pdf.addPage([c.width, c.height], 'portrait');
        pageWidth = c.width; pageHeight = c.height; ratio = 1;
        imgW = c.width; imgH = c.height;
        xOffset = 0; yOffset = 0;
        pdf.addImage(highResCanvas.toDataURL('image/png'), 'PNG', 0, 0, c.width, c.height);
      }

      if (!isOcrEnabled) continue;

      if (!isExporting) break;

      // --- Process Annotations ---
      const pageAnnotations = window.clastifyPageAnnotations[i] || [];

      // Calculate fixed upper-right exclusion zone for the *current canvas*
      const fixedExclusionX = c.width - UPPER_RIGHT_EXCLUSION.x_offset_from_right;
      const fixedExclusionY = UPPER_RIGHT_EXCLUSION.y_offset_from_top;
      const fixedExclusionWidth = UPPER_RIGHT_EXCLUSION.width;
      const fixedExclusionHeight = UPPER_RIGHT_EXCLUSION.height;

      const fixedExclusionBbox = {
          x: fixedExclusionX,
          y: fixedExclusionY,
          width: fixedExclusionWidth,
          height: fixedExclusionHeight
      };

      // Step 5: OCR this page (only for non-annotated regions), using line-based height
      let result;
      try {
          updateProgress(`Processing page ${i + 1} of ${canvases.length}...\n(Tesseract OCR running)`);
          result = await Tesseract.recognize(c, 'eng', {
              logger: m => { // Add a logger to see Tesseract's internal progress
                  if (m.status === 'recognizing') {
                      updateProgress(`Page ${i + 1}/${canvases.length}: OCR progress: ${(m.progress * 100).toFixed(1)}%`);
                  } else if (m.status === 'loading tesseract core' || m.status === 'initializing tesseract' || m.status === 'loading language traineddata') {
                      updateProgress(`Page ${i + 1}/${canvases.length}: OCR status: ${m.status}`);
                  }
              }
          });
          if (!result || !result.data) {
              throw new Error("Tesseract.recognize returned empty or invalid result data.");
          }
          updateProgress(`Processing page ${i + 1} of ${canvases.length}...\n(OCR complete)`);
      } catch (error) {
          console.error(`Tesseract OCR failed for page ${i + 1}:`, error);
          updateProgress(`ERROR: OCR failed for page ${i + 1}. Check console for details. Aborting.`);
          hideProgress();
          isExporting = false;
          cancelBtn.style.display = 'none';
          savePdfBtn.style.display = 'block';
          ocrToggleBtn.style.display = 'block';
          debugToggleBtn.style.display = isOcrEnabled ? 'block' : 'none';
          annotateBtn.style.display = isOcrEnabled ? 'block' : 'none';
          if (isOcrEnabled) {
            annotateBtn.style.top = `${2 * BUTTON_GAP + 34}px`;
            annotateBtn.style.right = ocrToggleBtn.style.right;
          }
          return; // Stop the export process
      }

      if (!isExporting) break;


      const lines = result.data.lines || [];

      // Map to store consistent height, top Y, and potentially adjusted width for words
      const wordFormattingInfoMap = new Map();

      // First pass: Iterate through Tesseract lines to determine robust line heights and prepare for overlap resolution
      for (const line of lines) {
        if (!line.words || line.words.length === 0) continue;

        let lineTopY = line.bbox.y0; // Default to Tesseract's line bbox
        let effectiveHeight = line.bbox.y1 - line.bbox.y0; // Default to Tesseract's line bbox height

        const highConfidenceWordsDetails = []; // To store {wObj, bbox} for robust height calculation

        // Collect high-confidence, non-annotated words for robust height calculation
        for (const wObj of line.words) {
            const bbox = getBbox(wObj); // Tesseract bbox in canvas coords
            const wBbox = { x: bbox.x0, y: bbox.y0, width: bbox.x1 - bbox.x0, height: bbox.y1 - bbox.y0 };

            const isOverlappingAnnotation = pageAnnotations.some(ann => {
                return wBbox.x < ann.x + ann.width &&
                       wBbox.x + wBbox.width > ann.x &&
                       wBbox.y < ann.y + ann.height &&
                       wBbox.y + wBbox.height > ann.y;
            }) || ( // Also check against the fixed upper-right exclusion zone
                wBbox.x < fixedExclusionBbox.x + fixedExclusionBbox.width &&
                wBbox.x + wBbox.width > fixedExclusionBbox.x &&
                wBbox.y < fixedExclusionBbox.y + fixedExclusionBbox.height &&
                wBbox.y + wBbox.height > fixedExclusionBbox.y
            );


            if (!isOverlappingAnnotation && wObj.confidence >= LOW_CONFIDENCE_THRESHOLD) {
                highConfidenceWordsDetails.push({ wObj, bbox });
            }
        }

        // Determine robust line height based on high confidence words
        if (highConfidenceWordsDetails.length > 0) {
            const y0s = highConfidenceWordsDetails.map(d => d.bbox.y0);
            const y1s = highConfidenceWordsDetails.map(d => d.bbox.y1);
            const heights = highConfidenceWordsDetails.map(d => d.bbox.y1 - d.bbox.y0);

            const medianY0 = getMedian(y0s);
            const medianY1 = getMedian(y1s);
            const medianWordHeight = getMedian(heights);

            // Filter outliers based on deviation from median Y
            const filteredWordsDetails = highConfidenceWordsDetails.filter(d => {
                const bbox = d.bbox;
                // A word is an outlier if its top or bottom deviates too much from the median line top/bottom
                return Math.abs(bbox.y0 - medianY0) <= (medianWordHeight * OUTLIER_Y_DEVIATION_FACTOR) &&
                       Math.abs(bbox.y1 - medianY1) <= (medianWordHeight * OUTLIER_Y_DEVIATION_FACTOR);
            });

            if (filteredWordsDetails.length > 0) {
                let finalMinY0 = Infinity;
                let finalMaxY1 = -Infinity;
                for (const d of filteredWordsDetails) {
                    finalMinY0 = Math.min(finalMinY0, d.bbox.y0);
                    finalMaxY1 = Math.max(finalMaxY1, d.bbox.y1);
                }
                lineTopY = finalMinY0;
                effectiveHeight = finalMaxY1 - finalMinY0;
            }
            // If all high-confidence words were filtered as outliers, fall back to line.bbox (already default)
        }

        // Apply a buffer to the effective height
        effectiveHeight *= LINE_HEIGHT_BUFFER_MULTIPLIER;

        // Apply a maximum cap to effectiveHeight to prevent excessively tall boxes from anomalies
        if (highConfidenceWordsDetails.length > 0) {
             const medianInitialWordHeight = getMedian(highConfidenceWordsDetails.map(d => d.bbox.y1 - d.bbox.y0));
             effectiveHeight = Math.min(effectiveHeight, medianInitialWordHeight * MAX_LINE_HEIGHT_FACTOR_OF_MEDIAN_WORD_HEIGHT);
        }

        // Prepare words for overlap resolution and final formatting info storage
        const wordsForOverlapResolution = []; // Only high-confidence for this

        for (const wObj of line.words) {
            const bbox = getBbox(wObj);
            const wBbox = { x: bbox.x0, y: bbox.y0, width: bbox.x1 - bbox.x0, height: bbox.y1 - bbox.y0 };

            const isOverlappingAnnotation = pageAnnotations.some(ann => {
                return wBbox.x < ann.x + ann.width &&
                       wBbox.x + wBbox.width > ann.x &&
                       wBbox.y < ann.y + ann.height &&
                       wBbox.y + wBbox.height > ann.y;
            }) || ( // Also check against the fixed upper-right exclusion zone
                wBbox.x < fixedExclusionBbox.x + fixedExclusionBbox.width &&
                wBbox.x + wBbox.width > fixedExclusionBbox.x &&
                wBbox.y < fixedExclusionBbox.y + fixedExclusionBbox.height &&
                wBbox.y + wBbox.height > fixedExclusionBbox.y
            );

            if (!isOverlappingAnnotation) {
                const wordId = generateRobustWordId(i, wObj);
                wordFormattingInfoMap.set(wordId, {
                    height: effectiveHeight,
                    y0_line: lineTopY,
                    originalWidth: wBbox.width,
                    adjustedWidth: wBbox.width, // Initialize adjusted width
                    isHighConfidence: wObj.confidence >= LOW_CONFIDENCE_THRESHOLD,
                    // Store the original word object's x0 for later use in drawing
                    originalX0: wBbox.x
                });
                if (wObj.confidence >= LOW_CONFIDENCE_THRESHOLD) {
                    wordsForOverlapResolution.push({ wordId, x0: wBbox.x, x1: wBbox.x + wBbox.width });
                }
            }
        }

        // Resolve horizontal overlaps for high-confidence words in this line
        wordsForOverlapResolution.sort((a, b) => a.x0 - b.x0); // Sort by x-start

        for (let k = 0; k < wordsForOverlapResolution.length - 1; k++) {
            const currentWord = wordsForOverlapResolution[k];
            const nextWord = wordsForOverlapResolution[k + 1];

            // Check for overlap
            if (currentWord.x1 > nextWord.x0) {
                const formatInfoCurrent = wordFormattingInfoMap.get(currentWord.wordId);
                // Adjust currentWord's width to end at nextWord's start
                const newAdjustedWidth = nextWord.x0 - currentWord.x0;
                if (formatInfoCurrent && newAdjustedWidth > 0) { // Ensure width doesn't become negative
                    formatInfoCurrent.adjustedWidth = newAdjustedWidth;
                    // Update the x1 in the temp array for potential future overlaps with next words
                    currentWord.x1 = nextWord.x0;
                }
            }
        }
      }

      if (!isExporting) break;

      // Step 6: Draw OCR text and bounding boxes for non-annotated regions
      if (!debugPdfExport && invisibleState) {
        pdf.setGState(invisibleState);
      }
      pdf.setFont('helvetica', 'normal');

      // Iterate through blocks, paragraphs, then lines, then words within lines, to correctly place spaces
      for (const block of result.data.blocks || []) {
          for (const paragraph of block.paragraphs || []) {
              for (const line of paragraph.lines || []) {
                  const lineWordsToDraw = [];
                  for (const wObj of line.words || []) {
                      const wordId = generateRobustWordId(i, wObj);
                      if (wordFormattingInfoMap.has(wordId)) {
                          lineWordsToDraw.push(wObj); // Push the original word object
                      }
                  }

                  // Now process words for this specific line, adding spaces between them
                  for (let k = 0; k < lineWordsToDraw.length; k++) {
                      const wObj = lineWordsToDraw[k];
                      const txt = (wObj.text || '').trim();
                      if (!txt) continue;

                      const wordId = generateRobustWordId(i, wObj);
                      const formatInfo = wordFormattingInfoMap.get(wordId);

                      if (formatInfo === undefined) {
                          console.warn(`Formatting info not found for word: "${wObj.text}" (ID: ${wordId}). Skipping word.`);
                          continue;
                      }

                      const x0 = formatInfo.originalX0; // Use the stored original X0
                      const effectiveHeight = formatInfo.height;
                      const lineTopY = formatInfo.y0_line;
                      const rectWidthForText = formatInfo.adjustedWidth;

                      // Calculate initial font size based on effectiveHeight (vertical fit)
                      let fontSize = Math.max(1, effectiveHeight * ratio * CALIBRATION_SIZE);
                      pdf.setFontSize(fontSize); // Set temporarily to measure text width

                      // Adjust font size based on horizontal fit
                      const textWidthMeasured = pdf.getTextWidth(txt);
                      const targetWidthInPdfUnits = rectWidthForText * ratio;

                      if (textWidthMeasured > targetWidthInPdfUnits * (1 + (1 - FONT_SIZE_TOLERANCE))) { // Text overflows significantly
                          fontSize = fontSize * (targetWidthInPdfUnits / textWidthMeasured) * FONT_SIZE_TOLERANCE; // Shrink
                      } else if (textWidthMeasured < targetWidthInPdfUnits * FONT_SIZE_TOLERANCE && textWidthMeasured > 0) { // Text underfills significantly
                          let proposedFontSize = fontSize * (targetWidthInPdfUnits / textWidthMeasured) * (2 - FONT_SIZE_TOLERANCE); // Grow
                          // Cap the growth
                          fontSize = Math.min(proposedFontSize, fontSize * FONT_SIZE_MAX_GROWTH_FACTOR);
                      }
                      // Ensure font size doesn't go below 1px
                      fontSize = Math.max(1, fontSize);


                      const px = xOffset + x0 * ratio; // x-coordinate for text baseline (word's original x position)
                      const pyBaseline = yOffset + (lineTopY + effectiveHeight) * ratio + (effectiveHeight * ratio * BASELINE_OFFSET);

                      const rectX = xOffset + x0 * ratio;
                      const rectY = yOffset + lineTopY * ratio;
                      const rectHeight = effectiveHeight * ratio;

                      if (debugPdfExport) {
                          if (!formatInfo.isHighConfidence) {
                              pdf.setDrawColor(255, 165, 0); // Orange for low-confidence boxes (non-annotated)
                              pdf.setTextColor(255, 0, 255); // Magenta for low-confidence text (non-annotated)
                          } else {
                              pdf.setDrawColor(0, 0, 255); // Blue for high-confidence boxes (non-annotated)
                              pdf.setTextColor(255, 0, 0); // Bright red for high-confidence text (non-annotated)
                          }
                          pdf.setLineWidth(0.5);
                          pdf.rect(rectX, rectY, rectWidthForText * ratio, rectHeight); // Use rectWidthForText
                      } else {
                          if (!invisibleState) {
                              pdf.setTextColor(255, 255, 255);
                          }
                      }

                      pdf.setFontSize(fontSize);
                      pdf.text(txt, px, pyBaseline);

                      // --- Add Space Box ---
                      if (k < lineWordsToDraw.length - 1) { // If not the last word in the line
                          const nextWordInLine = lineWordsToDraw[k + 1];
                          const currentWordEndInCanvasUnits = x0 + rectWidthForText;
                          const nextWordStartInCanvasUnits = (nextWordInLine.bbox?.x0 ?? nextWordInLine.x ?? nextWordInLine.left ?? 0);

                          const availableSpaceWidthInCanvasUnits = nextWordStartInCanvasUnits - currentWordEndInCanvasUnits;

                          if (availableSpaceWidthInCanvasUnits > 0.5) { // Only add if there's a meaningful gap
                              const spaceChar = ' ';
                              const spaceXInPdfUnits = xOffset + currentWordEndInCanvasUnits * ratio;
                              const spaceYBaselineInPdfUnits = pyBaseline; // Same baseline as the words
                              const spaceWidthInPdfUnits = availableSpaceWidthInCanvasUnits * ratio;

                              // Use a small font size for space, relative to the word's font size
                              const smallSpaceFontSize = Math.max(1, fontSize * SPACE_FONT_SIZE_RATIO_TO_WORD);

                              pdf.setFontSize(smallSpaceFontSize);
                              const measuredSpaceWidth = pdf.getTextWidth(spaceChar);

                              let finalSpaceFontSize = smallSpaceFontSize;
                              // Attempt to fit space text within the available gap, but don't grow too much
                              if (measuredSpaceWidth > spaceWidthInPdfUnits) {
                                  finalSpaceFontSize = smallSpaceFontSize * (spaceWidthInPdfUnits / measuredSpaceWidth) * FONT_SIZE_TOLERANCE;
                              } else if (measuredSpaceWidth < spaceWidthInPdfUnits * FONT_SIZE_TOLERANCE && measuredSpaceWidth > 0) {
                                  let proposedSpaceFontSize = smallSpaceFontSize * (spaceWidthInPdfUnits / measuredSpaceWidth) * (2 - FONT_SIZE_TOLERANCE);
                                  finalSpaceFontSize = Math.min(proposedSpaceFontSize, smallSpaceFontSize * FONT_SIZE_MAX_GROWTH_FACTOR);
                              }
                              finalSpaceFontSize = Math.max(1, finalSpaceFontSize);

                              pdf.setFontSize(finalSpaceFontSize);

                              if (debugPdfExport) {
                                  pdf.setDrawColor(255, 0, 255); // Magenta for space box
                                  pdf.setTextColor(255, 0, 255); // Magenta for space text
                                  // Draw space box using the line's calculated height for visual consistency
                                  pdf.rect(spaceXInPdfUnits, rectY, spaceWidthInPdfUnits, rectHeight);
                              } else {
                                  if (!invisibleState) {
                                      pdf.setTextColor(255, 255, 255); // White for invisible text if no GState
                                  }
                              }
                              pdf.text(spaceChar, spaceXInPdfUnits, spaceYBaselineInPdfUnits);

                              // Reset text color for the next word (if debug mode)
                              if (debugPdfExport) {
                                  if (formatInfo.isHighConfidence) {
                                      pdf.setTextColor(255, 0, 0); // Bright red for high-confidence text
                                  } else {
                                      pdf.setTextColor(255, 0, 255); // Magenta for low-confidence text
                                  }
                              }
                          }
                      }
                  }
              }
          }
      }

      if (!isExporting) break;

      // --- Draw Annotated Regions in PDF for Debug/Visual Confirmation ---
      if (debugPdfExport) {
        // Draw the fixed upper-right exclusion zone for debug purposes
        pdf.setDrawColor(255, 0, 0); // Red for fixed exclusion
        pdf.setLineWidth(1);
        pdf.rect(xOffset + fixedExclusionX * ratio,
                 yOffset + fixedExclusionY * ratio,
                 fixedExclusionWidth * ratio,
                 fixedExclusionHeight * ratio);
        pdf.setFontSize(10);
        pdf.setTextColor(255, 0, 0);
        pdf.text("FIXED OCR EXCLUSION", xOffset + fixedExclusionX * ratio + 2, yOffset + fixedExclusionY * ratio + 10);


        pageAnnotations.forEach(ann => {
          const rectX = xOffset + ann.x * ratio;
          const rectY = yOffset + ann.y * ratio;
          const rectWidth = ann.width * ratio;
          const rectHeight = ann.height * ratio;

          // Unified color for user-drawn exclusions
          if (ann.type === 'user_exclude_ocr') {
              pdf.setDrawColor(156, 39, 176); // Purple for user-drawn
              pdf.setLineWidth(2);
              pdf.rect(rectX, rectY, rectWidth, rectHeight);
              pdf.setFontSize(10);
              pdf.setTextColor(156, 39, 176);
              pdf.text("USER EXCLUDE OCR", rectX + 2, rectY + 10);
          }
        });
      }

      if (!debugPdfExport && invisibleState) {
        pdf.setGState(new pdf.GState({ opacity: 1 }));
      }
    }

    if (!isExporting) {
      return;
    }

    // Step 7: Save PDF
    updateProgress("Saving PDF...");
    const title = sanitizeFilename(getDocTitle());
    pdf.save(`clastify_${title}.pdf`);

    hideProgress();
    isExporting = false;
    cancelBtn.style.display = 'none';
    savePdfBtn.style.display = 'block';
    ocrToggleBtn.style.display = 'block';
    debugToggleBtn.style.display = isOcrEnabled ? 'block' : 'none';
    annotateBtn.style.display = isOcrEnabled ? 'block' : 'none';
    if (isOcrEnabled) {
      annotateBtn.style.top = `${2 * BUTTON_GAP + 34}px`;
      annotateBtn.style.right = ocrToggleBtn.style.right;
    }
    // No alert, just close the mode.
  }


  // --- Event Listeners ---
  // Wire the save-PDF and annotate buttons to their click handlers, in that order.
  for (const [button, onClick] of [
    [savePdfBtn, exportPDF],
    [annotateBtn, enterAnnotationMode],
  ]) {
    button.addEventListener('click', onClick);
  }


  // Clear-all button: after the user confirms, replace every entry of
  // window.clastifyPageAnnotations with an empty list and redraw each
  // annotation overlay.
  clearAllAnnotationsBtn.addEventListener('click', () => {
    if (!confirm('Are you sure you want to clear ALL manual annotations across all pages?')) {
        return;
    }

    // map() keeps the array length, so there is still one (now empty) list per
    // existing entry after the reset.
    window.clastifyPageAnnotations = window.clastifyPageAnnotations.map(() => []);

    // Query the canvases once; the lookup below is then a plain index access
    // instead of a fresh DOM query for every overlay.
    const canvases = document.querySelectorAll('canvas');

    document.querySelectorAll('.clastify-annotation-overlay').forEach(overlay => {
        const index = parseInt(overlay.dataset.canvasIndex, 10);
        const canvas = canvases[index];
        if (!canvas) {
            // Missing/invalid data-canvas-index (NaN) or the canvas no longer
            // exists: skip this overlay instead of throwing, so the remaining
            // overlays are still redrawn.
            return;
        }
        // Pass both the canvas' intrinsic size and its displayed size.
        const canvasRect = canvas.getBoundingClientRect();
        renderAnnotationsOnOverlay(overlay, index, canvas.width, canvas.height, canvasRect.width, canvasRect.height);
    });
  });


  // Finish button: the handler only calls exitAnnotationMode() and shows no
  // alert of its own. The wrapper keeps the click event from being passed on.
  finishAnnotationBtn.addEventListener('click', function onFinishAnnotationClick() {
    exitAnnotationMode();
  });

  // --- Dynamic Banner Removal and Debounce ---
  // Timer id of the pending banner sweep. The scroll listener clears it and
  // schedules a new one on every scroll event; undefined until the first scroll.
  let debounceTimer;

  /**
   * Removes every element matching `img.MuiBox-root[class*="css"]` from the page.
   *
   * For each match, the parent element is removed when there is one (taking the
   * image with it); otherwise the image itself is removed. Every match is logged
   * to the console before removal.
   */
  function removeBannersDynamically() {
      for (const bannerImg of document.querySelectorAll('img.MuiBox-root[class*="css"]')) {
          console.log('Dynamically removing banner:', bannerImg);
          // Prefer the wrapping element; fall back to the image when it has no parent.
          const nodeToRemove = bannerImg.parentElement || bannerImg;
          nodeToRemove.remove();
      }
  }

  // Debounced banner sweep: every scroll event cancels the pending timer and
  // schedules a new one, so removeBannersDynamically runs only once scrolling
  // has paused for SCROLL_DEBOUNCE_DELAY milliseconds.
  window.addEventListener('scroll', function scheduleBannerSweep() {
    clearTimeout(debounceTimer);
    debounceTimer = setTimeout(removeBannersDynamically, SCROLL_DEBOUNCE_DELAY);
  });

  // Initial sweep for banners that are already on the page when the script loads.
  // If the document has finished parsing, sweep immediately; otherwise wait for
  // DOMContentLoaded so the elements exist before they are queried.
  if (document.readyState !== 'loading') {
    removeBannersDynamically();
  } else {
    document.addEventListener('DOMContentLoaded', removeBannersDynamically);
  }


})();