HathiTrust Text Export UI + seq/num

Export selected HathiTrust Text-only pages to TXT with seq/num mode

You will need to install an extension such as Tampermonkey, Greasemonkey or Violentmonkey to install this script.

You will need to install an extension such as Tampermonkey to install this script.

You will need to install an extension such as Tampermonkey or Violentmonkey to install this script.

You will need to install an extension such as Tampermonkey or Userscripts to install this script.

You will need to install an extension such as Tampermonkey to install this script.

You will need to install a user script manager extension to install this script.

(I already have a user script manager, let me install it!)

You will need to install an extension such as Stylus to install this style.

You will need to install an extension such as Stylus to install this style.

You will need to install an extension such as Stylus to install this style.

You will need to install a user style manager extension to install this style.

You will need to install a user style manager extension to install this style.

You will need to install a user style manager extension to install this style.

(I already have a user style manager, let me install it!)

// ==UserScript==
// @name         HathiTrust Text Export UI + seq/num
// @namespace    AdoreJc
// @version      1.0
// @description  Export selected HathiTrust Text-only pages to TXT with seq/num mode
// @supportURL   https://github.com/AdoreJc/HathiTrust-Text-Export/issues
// @license      MIT
// @match        https://babel.hathitrust.org/cgi/ssd*
// @grant        GM_download
// ==/UserScript==

(function () {
    'use strict';

    function sleep(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
    }

    function cleanTextFromNode(node) {
        return node.textContent
            .replace(/\r/g, '')
            .replace(/[ \t]+\n/g, '\n')
            .replace(/\n[ \t]+/g, '\n')
            .replace(/\n{3,}/g, '\n\n')
            .trim();
    }

    function getSeqFromUrlOrDoc(doc, currentUrl) {
        const url = new URL(currentUrl, location.origin);

        const seqFromQuery = url.searchParams.get('seq');
        if (seqFromQuery && /^\d+$/.test(seqFromQuery)) {
            return parseInt(seqFromQuery, 10);
        }

        const hashMatch = url.hash.match(/seq(\d+)/i);
        if (hashMatch) {
            return parseInt(hashMatch[1], 10);
        }

        const h2 = doc.querySelector('#mdpContentContainer h2[id^="seq"]');
        if (h2 && h2.id) {
            const m = h2.id.match(/^seq(\d+)$/i);
            if (m) return parseInt(m[1], 10);
        }

        return null;
    }

    function getNumFromTitle(title) {
        const m = title.match(/Page\s+(\d+)/i);
        return m ? parseInt(m[1], 10) : null;
    }

    function parsePageFromDocument(doc, currentUrl) {
        const container = doc.querySelector('#mdpContentContainer');
        if (!container) {
            throw new Error('Cannot find #mdpContentContainer');
        }

        const h2 = container.querySelector('h2');
        if (!h2) {
            throw new Error('Cannot find page title h2');
        }

        const title = h2.textContent.trim();
        const num = getNumFromTitle(title);
        const seq = getSeqFromUrlOrDoc(doc, currentUrl);

        const mdpPage = container.querySelector('#mdpPage');
        if (!mdpPage) {
            throw new Error('Cannot find #mdpPage');
        }

        const paragraphs = [...mdpPage.querySelectorAll('p')];
        const bodyParts = [];

        for (const p of paragraphs) {
            const raw = p.textContent.replace(/\s+/g, ' ').trim();
            if (!raw) continue;
            if (/Previous Page|Next Page|Return to top/i.test(raw)) continue;

            const cleaned = cleanTextFromNode(p);
            if (cleaned) bodyParts.push(cleaned);
        }

        const bodyText = bodyParts.join('\n\n').trim();

        let nextUrl = null;
        const nextLink = [...doc.querySelectorAll('a')].find(a =>
            /Next Page/i.test(a.textContent)
        );
        if (nextLink && nextLink.getAttribute('href')) {
            nextUrl = new URL(nextLink.getAttribute('href'), currentUrl).href;
        }

        return { seq, num, title, bodyText, nextUrl };
    }

    function getCurrentValue(pageInfo, mode) {
        return mode === 'seq' ? pageInfo.seq : pageInfo.num;
    }

    async function fetchDocument(url) {
        const res = await fetch(url, {
            method: 'GET',
            credentials: 'include'
        });

        if (!res.ok) {
            throw new Error(`HTTP ${res.status}: ${url}`);
        }

        const html = await res.text();
        return new DOMParser().parseFromString(html, 'text/html');
    }

    function getSafeBookTitle() {
        const h1 = document.querySelector('#mdpPageHeader h1');
        const raw = h1 ? h1.textContent.trim() : 'HathiTrust_Text_Export';
        return raw
            .replace(/[\\/:*?"<>|]+/g, '_')
            .replace(/\s+/g, '_')
            .slice(0, 120);
    }

    function downloadText(filename, text) {
        const blob = new Blob([text], { type: 'text/plain;charset=utf-8' });
        const blobUrl = URL.createObjectURL(blob);

        GM_download({
            url: blobUrl,
            name: filename,
            saveAs: true,
            onload: () => {
                setTimeout(() => URL.revokeObjectURL(blobUrl), 5000);
            },
            onerror: (e) => {
                console.error('Download failed', e);
                alert('Download failed. Check the console for details.');
            }
        });
    }

    async function runExport(startValue, endValue, mode, btn, statusEl) {
        const results = [];
        const visited = new Set();

        let doc = document;
        let currentUrl = location.href;

        while (true) {
            if (visited.has(currentUrl)) {
                console.warn('Duplicate page detected, stopping:', currentUrl);
                break;
            }
            visited.add(currentUrl);

            const pageInfo = parsePageFromDocument(doc, currentUrl);
            const currentValue = getCurrentValue(pageInfo, mode);

            const progressText = currentValue == null
                ? `Exporting... ${mode.toUpperCase()} N/A`
                : `Exporting... ${mode} ${currentValue}`;

            btn.textContent = progressText;
            if (statusEl) {
                statusEl.textContent = `Processing: ${pageInfo.title} | seq=${pageInfo.seq ?? 'N/A'} | num=${pageInfo.num ?? 'N/A'}`;
            }

            console.log(`Parsed: ${pageInfo.title} | num=${pageInfo.num} | seq=${pageInfo.seq}`);

            if (currentValue === null || currentValue === undefined) {
                if (mode === 'num') {
                    console.warn('Current page has no usable num, skipped:', pageInfo.title);
                }
            } else if (currentValue >= startValue && currentValue <= endValue) {
                results.push(`${pageInfo.title}\n\n${pageInfo.bodyText}`);
            }

            if (currentValue !== null && currentValue !== undefined && currentValue >= endValue) {
                break;
            }

            if (!pageInfo.nextUrl) {
                console.warn('Next Page not found, stopping early');
                break;
            }

            await sleep(1500);
            doc = await fetchDocument(pageInfo.nextUrl);
            currentUrl = pageInfo.nextUrl;
        }

        if (!results.length) {
            alert('No content was captured.');
            return;
        }

        const outputName = `${getSafeBookTitle()}_${mode}_${startValue}_${endValue}.txt`;
        const finalText = results.join('\n\n' + '='.repeat(80) + '\n\n');
        downloadText(outputName, finalText);

        btn.textContent = 'Export TXT';
        if (statusEl) {
            statusEl.textContent = `Done. Exported ${results.length} page(s).`;
        }
        alert(`Done: mode=${mode}, exported ${results.length} page(s).`);
    }

    function addUI() {
        if (document.getElementById('ht-export-panel')) return;

        let currentSeq = '';
        let currentNum = '';

        try {
            const info = parsePageFromDocument(document, location.href);
            currentSeq = info.seq ?? '';
            currentNum = info.num ?? '';
        } catch (e) {
            console.warn('Failed to read current page info', e);
        }

        const defaultMode = currentNum !== '' ? 'num' : 'seq';
        const defaultValue = defaultMode === 'num' ? currentNum : currentSeq;

        const panel = document.createElement('div');
        panel.id = 'ht-export-panel';

        Object.assign(panel.style, {
            position: 'fixed',
            right: '20px',
            bottom: '20px',
            zIndex: '999999',
            background: '#fff',
            border: '1px solid #ccc',
            borderRadius: '10px',
            padding: '12px',
            boxShadow: '0 2px 10px rgba(0,0,0,0.2)',
            fontSize: '14px',
            fontFamily: 'Arial, sans-serif',
            minWidth: '280px'
        });

        panel.innerHTML = `
            <div style="font-weight:bold; margin-bottom:8px;">HathiTrust TXT Export</div>
            <div style="margin-bottom:4px;">Current num: ${currentNum === '' ? 'N/A' : currentNum}</div>
            <div style="margin-bottom:8px;">Current seq: ${currentSeq === '' ? 'N/A' : currentSeq}</div>

            <label style="display:block; margin-bottom:8px;">
                Mode:
                <select id="ht-mode" style="width:100%; margin-top:4px; box-sizing:border-box;">
                    <option value="num" ${defaultMode === 'num' ? 'selected' : ''}>num (book page)</option>
                    <option value="seq" ${defaultMode === 'seq' ? 'selected' : ''}>seq (scan sequence)</option>
                </select>
            </label>

            <label style="display:block; margin-bottom:6px;">
                Start:
                <input id="ht-start-page" type="number" value="${defaultValue}" style="width:100%; margin-top:4px; box-sizing:border-box;">
            </label>

            <label style="display:block; margin-bottom:10px;">
                End:
                <input id="ht-end-page" type="number" value="${defaultValue}" style="width:100%; margin-top:4px; box-sizing:border-box;">
            </label>

            <button id="ht-export-btn" style="
                width:100%;
                padding:8px 10px;
                background:#2563eb;
                color:#fff;
                border:none;
                border-radius:8px;
                cursor:pointer;
            ">Export TXT</button>

            <div id="ht-export-status" style="
                margin-top:8px;
                font-size:12px;
                color:#444;
                line-height:1.4;
                word-break:break-word;
            ">Ready.</div>
        `;

        document.body.appendChild(panel);

        const btn = document.getElementById('ht-export-btn');
        const modeSelect = document.getElementById('ht-mode');
        const startInput = document.getElementById('ht-start-page');
        const endInput = document.getElementById('ht-end-page');
        const statusEl = document.getElementById('ht-export-status');

        modeSelect.addEventListener('change', () => {
            const mode = modeSelect.value;
            const val = mode === 'num' ? currentNum : currentSeq;
            startInput.value = val;
            endInput.value = val;
        });

        btn.addEventListener('click', async () => {
            const mode = modeSelect.value;
            const startValue = parseInt(startInput.value, 10);
            const endValue = parseInt(endInput.value, 10);

            if (!Number.isInteger(startValue) || !Number.isInteger(endValue)) {
                alert('Please enter valid numbers.');
                return;
            }

            if (startValue > endValue) {
                alert('Start cannot be greater than End.');
                return;
            }

            btn.disabled = true;
            btn.textContent = 'Starting...';
            statusEl.textContent = 'Preparing export...';

            try {
                await runExport(startValue, endValue, mode, btn, statusEl);
            } catch (err) {
                console.error(err);
                alert('Export failed: ' + err.message);
                statusEl.textContent = 'Export failed: ' + err.message;
            } finally {
                btn.disabled = false;
                btn.textContent = 'Export TXT';
            }
        });
    }

    window.addEventListener('load', addUI);
})();