HathiTrust Text Export UI + seq/num

Export selected HathiTrust Text-only pages to TXT with seq/num mode

Vous devrez installer une extension telle que Tampermonkey, Greasemonkey ou Violentmonkey pour installer ce script.

Vous devrez installer une extension telle que Tampermonkey pour installer ce script.

Vous devrez installer une extension telle que Tampermonkey ou Violentmonkey pour installer ce script.

Vous devrez installer une extension telle que Tampermonkey ou Userscripts pour installer ce script.

Vous devrez installer une extension telle que Tampermonkey pour installer ce script.

Vous devrez installer une extension de gestionnaire de script utilisateur pour installer ce script.

(J'ai déjà un gestionnaire de scripts utilisateur, laissez-moi l'installer !)

Vous devrez installer une extension telle que Stylus pour installer ce style.

Vous devrez installer une extension telle que Stylus pour installer ce style.

Vous devrez installer une extension telle que Stylus pour installer ce style.

Vous devrez installer une extension du gestionnaire de style pour utilisateur pour installer ce style.

Vous devrez installer une extension du gestionnaire de style pour utilisateur pour installer ce style.

Vous devrez installer une extension du gestionnaire de style pour utilisateur pour installer ce style.

(J'ai déjà un gestionnaire de style utilisateur, laissez-moi l'installer!)

// ==UserScript==
// @name         HathiTrust Text Export UI + seq/num
// @namespace    AdoreJc
// @version      1.0
// @description  Export selected HathiTrust Text-only pages to TXT with seq/num mode
// @supportURL   https://github.com/AdoreJc/HathiTrust-Text-Export/issues
// @license      MIT
// @match        https://babel.hathitrust.org/cgi/ssd*
// @grant        GM_download
// ==/UserScript==

(function () {
    'use strict';

    function sleep(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
    }

    function cleanTextFromNode(node) {
        return node.textContent
            .replace(/\r/g, '')
            .replace(/[ \t]+\n/g, '\n')
            .replace(/\n[ \t]+/g, '\n')
            .replace(/\n{3,}/g, '\n\n')
            .trim();
    }

    function getSeqFromUrlOrDoc(doc, currentUrl) {
        const url = new URL(currentUrl, location.origin);

        const seqFromQuery = url.searchParams.get('seq');
        if (seqFromQuery && /^\d+$/.test(seqFromQuery)) {
            return parseInt(seqFromQuery, 10);
        }

        const hashMatch = url.hash.match(/seq(\d+)/i);
        if (hashMatch) {
            return parseInt(hashMatch[1], 10);
        }

        const h2 = doc.querySelector('#mdpContentContainer h2[id^="seq"]');
        if (h2 && h2.id) {
            const m = h2.id.match(/^seq(\d+)$/i);
            if (m) return parseInt(m[1], 10);
        }

        return null;
    }

    function getNumFromTitle(title) {
        const m = title.match(/Page\s+(\d+)/i);
        return m ? parseInt(m[1], 10) : null;
    }

    function parsePageFromDocument(doc, currentUrl) {
        const container = doc.querySelector('#mdpContentContainer');
        if (!container) {
            throw new Error('Cannot find #mdpContentContainer');
        }

        const h2 = container.querySelector('h2');
        if (!h2) {
            throw new Error('Cannot find page title h2');
        }

        const title = h2.textContent.trim();
        const num = getNumFromTitle(title);
        const seq = getSeqFromUrlOrDoc(doc, currentUrl);

        const mdpPage = container.querySelector('#mdpPage');
        if (!mdpPage) {
            throw new Error('Cannot find #mdpPage');
        }

        const paragraphs = [...mdpPage.querySelectorAll('p')];
        const bodyParts = [];

        for (const p of paragraphs) {
            const raw = p.textContent.replace(/\s+/g, ' ').trim();
            if (!raw) continue;
            if (/Previous Page|Next Page|Return to top/i.test(raw)) continue;

            const cleaned = cleanTextFromNode(p);
            if (cleaned) bodyParts.push(cleaned);
        }

        const bodyText = bodyParts.join('\n\n').trim();

        let nextUrl = null;
        const nextLink = [...doc.querySelectorAll('a')].find(a =>
            /Next Page/i.test(a.textContent)
        );
        if (nextLink && nextLink.getAttribute('href')) {
            nextUrl = new URL(nextLink.getAttribute('href'), currentUrl).href;
        }

        return { seq, num, title, bodyText, nextUrl };
    }

    function getCurrentValue(pageInfo, mode) {
        return mode === 'seq' ? pageInfo.seq : pageInfo.num;
    }

    async function fetchDocument(url) {
        const res = await fetch(url, {
            method: 'GET',
            credentials: 'include'
        });

        if (!res.ok) {
            throw new Error(`HTTP ${res.status}: ${url}`);
        }

        const html = await res.text();
        return new DOMParser().parseFromString(html, 'text/html');
    }

    function getSafeBookTitle() {
        const h1 = document.querySelector('#mdpPageHeader h1');
        const raw = h1 ? h1.textContent.trim() : 'HathiTrust_Text_Export';
        return raw
            .replace(/[\\/:*?"<>|]+/g, '_')
            .replace(/\s+/g, '_')
            .slice(0, 120);
    }

    function downloadText(filename, text) {
        const blob = new Blob([text], { type: 'text/plain;charset=utf-8' });
        const blobUrl = URL.createObjectURL(blob);

        GM_download({
            url: blobUrl,
            name: filename,
            saveAs: true,
            onload: () => {
                setTimeout(() => URL.revokeObjectURL(blobUrl), 5000);
            },
            onerror: (e) => {
                console.error('Download failed', e);
                alert('Download failed. Check the console for details.');
            }
        });
    }

    async function runExport(startValue, endValue, mode, btn, statusEl) {
        const results = [];
        const visited = new Set();

        let doc = document;
        let currentUrl = location.href;

        while (true) {
            if (visited.has(currentUrl)) {
                console.warn('Duplicate page detected, stopping:', currentUrl);
                break;
            }
            visited.add(currentUrl);

            const pageInfo = parsePageFromDocument(doc, currentUrl);
            const currentValue = getCurrentValue(pageInfo, mode);

            const progressText = currentValue == null
                ? `Exporting... ${mode.toUpperCase()} N/A`
                : `Exporting... ${mode} ${currentValue}`;

            btn.textContent = progressText;
            if (statusEl) {
                statusEl.textContent = `Processing: ${pageInfo.title} | seq=${pageInfo.seq ?? 'N/A'} | num=${pageInfo.num ?? 'N/A'}`;
            }

            console.log(`Parsed: ${pageInfo.title} | num=${pageInfo.num} | seq=${pageInfo.seq}`);

            if (currentValue === null || currentValue === undefined) {
                if (mode === 'num') {
                    console.warn('Current page has no usable num, skipped:', pageInfo.title);
                }
            } else if (currentValue >= startValue && currentValue <= endValue) {
                results.push(`${pageInfo.title}\n\n${pageInfo.bodyText}`);
            }

            if (currentValue !== null && currentValue !== undefined && currentValue >= endValue) {
                break;
            }

            if (!pageInfo.nextUrl) {
                console.warn('Next Page not found, stopping early');
                break;
            }

            await sleep(1500);
            doc = await fetchDocument(pageInfo.nextUrl);
            currentUrl = pageInfo.nextUrl;
        }

        if (!results.length) {
            alert('No content was captured.');
            return;
        }

        const outputName = `${getSafeBookTitle()}_${mode}_${startValue}_${endValue}.txt`;
        const finalText = results.join('\n\n' + '='.repeat(80) + '\n\n');
        downloadText(outputName, finalText);

        btn.textContent = 'Export TXT';
        if (statusEl) {
            statusEl.textContent = `Done. Exported ${results.length} page(s).`;
        }
        alert(`Done: mode=${mode}, exported ${results.length} page(s).`);
    }

    function addUI() {
        if (document.getElementById('ht-export-panel')) return;

        let currentSeq = '';
        let currentNum = '';

        try {
            const info = parsePageFromDocument(document, location.href);
            currentSeq = info.seq ?? '';
            currentNum = info.num ?? '';
        } catch (e) {
            console.warn('Failed to read current page info', e);
        }

        const defaultMode = currentNum !== '' ? 'num' : 'seq';
        const defaultValue = defaultMode === 'num' ? currentNum : currentSeq;

        const panel = document.createElement('div');
        panel.id = 'ht-export-panel';

        Object.assign(panel.style, {
            position: 'fixed',
            right: '20px',
            bottom: '20px',
            zIndex: '999999',
            background: '#fff',
            border: '1px solid #ccc',
            borderRadius: '10px',
            padding: '12px',
            boxShadow: '0 2px 10px rgba(0,0,0,0.2)',
            fontSize: '14px',
            fontFamily: 'Arial, sans-serif',
            minWidth: '280px'
        });

        panel.innerHTML = `
            <div style="font-weight:bold; margin-bottom:8px;">HathiTrust TXT Export</div>
            <div style="margin-bottom:4px;">Current num: ${currentNum === '' ? 'N/A' : currentNum}</div>
            <div style="margin-bottom:8px;">Current seq: ${currentSeq === '' ? 'N/A' : currentSeq}</div>

            <label style="display:block; margin-bottom:8px;">
                Mode:
                <select id="ht-mode" style="width:100%; margin-top:4px; box-sizing:border-box;">
                    <option value="num" ${defaultMode === 'num' ? 'selected' : ''}>num (book page)</option>
                    <option value="seq" ${defaultMode === 'seq' ? 'selected' : ''}>seq (scan sequence)</option>
                </select>
            </label>

            <label style="display:block; margin-bottom:6px;">
                Start:
                <input id="ht-start-page" type="number" value="${defaultValue}" style="width:100%; margin-top:4px; box-sizing:border-box;">
            </label>

            <label style="display:block; margin-bottom:10px;">
                End:
                <input id="ht-end-page" type="number" value="${defaultValue}" style="width:100%; margin-top:4px; box-sizing:border-box;">
            </label>

            <button id="ht-export-btn" style="
                width:100%;
                padding:8px 10px;
                background:#2563eb;
                color:#fff;
                border:none;
                border-radius:8px;
                cursor:pointer;
            ">Export TXT</button>

            <div id="ht-export-status" style="
                margin-top:8px;
                font-size:12px;
                color:#444;
                line-height:1.4;
                word-break:break-word;
            ">Ready.</div>
        `;

        document.body.appendChild(panel);

        const btn = document.getElementById('ht-export-btn');
        const modeSelect = document.getElementById('ht-mode');
        const startInput = document.getElementById('ht-start-page');
        const endInput = document.getElementById('ht-end-page');
        const statusEl = document.getElementById('ht-export-status');

        modeSelect.addEventListener('change', () => {
            const mode = modeSelect.value;
            const val = mode === 'num' ? currentNum : currentSeq;
            startInput.value = val;
            endInput.value = val;
        });

        btn.addEventListener('click', async () => {
            const mode = modeSelect.value;
            const startValue = parseInt(startInput.value, 10);
            const endValue = parseInt(endInput.value, 10);

            if (!Number.isInteger(startValue) || !Number.isInteger(endValue)) {
                alert('Please enter valid numbers.');
                return;
            }

            if (startValue > endValue) {
                alert('Start cannot be greater than End.');
                return;
            }

            btn.disabled = true;
            btn.textContent = 'Starting...';
            statusEl.textContent = 'Preparing export...';

            try {
                await runExport(startValue, endValue, mode, btn, statusEl);
            } catch (err) {
                console.error(err);
                alert('Export failed: ' + err.message);
                statusEl.textContent = 'Export failed: ' + err.message;
            } finally {
                btn.disabled = false;
                btn.textContent = 'Export TXT';
            }
        });
    }

    window.addEventListener('load', addUI);
})();