Wenku Doc Downloader

下载“百度文库”文档,导出txt或pdf。“豆丁网”“爱问共享资料”(新浪文档)文档导出pdf。在文档页面最底部有蓝/绿色长方形按钮,说明脚本生效了,否则就没有生效。

Stan na 08-12-2021. Zobacz najnowszą wersję.

Aby zainstalować ten skrypt, wymagana jest instalacja jednego z następujących rozszerzeń: Tampermonkey, Greasemonkey lub Violentmonkey.

Aby zainstalować ten skrypt, wymagana jest instalacja jednego z następujących rozszerzeń: Tampermonkey, Violentmonkey.

Aby zainstalować ten skrypt, wymagana jest instalacja jednego z następujących rozszerzeń: Tampermonkey, Violentmonkey.

Aby zainstalować ten skrypt, wymagana będzie instalacja rozszerzenia Tampermonkey lub Userscripts.

You will need to install an extension such as Tampermonkey to install this script.

Aby zainstalować ten skrypt, musisz zainstalować rozszerzenie menedżera skryptów użytkownika.

(Mam już menedżera skryptów użytkownika, pozwól mi to zainstalować!)

You will need to install an extension such as Stylus to install this style.

You will need to install an extension such as Stylus to install this style.

You will need to install an extension such as Stylus to install this style.

Będziesz musiał zainstalować rozszerzenie menedżera stylów użytkownika, aby zainstalować ten styl.

Będziesz musiał zainstalować rozszerzenie menedżera stylów użytkownika, aby zainstalować ten styl.

Musisz zainstalować rozszerzenie menedżera stylów użytkownika, aby zainstalować ten styl.

(Mam już menedżera stylów użytkownika, pozwól mi to zainstalować!)

// ==UserScript==
// @name         Wenku Doc Downloader
// @namespace    http://tampermonkey.net/
// @version      1.0
// @description  下载“百度文库”文档,导出txt或pdf。“豆丁网”“爱问共享资料”(新浪文档)文档导出pdf。在文档页面最底部有蓝/绿色长方形按钮,说明脚本生效了,否则就没有生效。
// @author       [email protected]
// @match        https://wenku.baidu.com/view/*
// @match        https://www.docin.com/p-*
// @match        https://ishare.iask.sina.com.cn/f/*
// @icon         https://www.google.com/s2/favicons?domain=limestart.cn
// @grant        none
// @license      GPL-3.0-only
// @create       2021-11-22
// @note         修复了纯图片文档报错的bug(程序里有个字符串写错了【笑哭】)
// @note         更新了百度文库上所有文档类型直接导出PDF的功能(纯图片文档更推荐使用【图片下载合并器】)
// ==/UserScript==

/*
*  附属功能函数部分
*/

/**
 * Wrap `content` in a Blob and trigger a browser download for it.
 *
 * @param {string} fileName - File name assigned to the anchor's `download` attribute.
 * @param {*} content - Payload passed to the Blob constructor as its only part.
 */
function createAndDownloadFile(fileName, content) {
    var blob = new Blob([content]);
    var objectUrl = URL.createObjectURL(blob);

    var aTag = document.createElement('a');
    aTag.download = fileName;
    aTag.href = objectUrl;
    aTag.click();

    // revokeObjectURL takes the URL string returned by createObjectURL; the
    // previous code passed the Blob itself, so nothing was ever released.
    // The release is deferred as a precaution so it cannot race the download
    // that click() has just requested.
    setTimeout(function () {
        URL.revokeObjectURL(objectUrl);
    }, 1000);
}

/**
 * Tidy up text taken from a text-only document.
 *
 * The text goes through four replacements in a fixed order. The first three
 * are non-global, so each affects at most the first match; only the last
 * one is applied to every occurrence.
 *
 * NOTE(review): the first pattern reads like a damaged look-ahead, and the
 * third pattern is compiled but never applied. The intended clean-up cannot
 * be determined from this file, so the behaviour is kept exactly as found.
 *
 * @param {string} text - Raw text to clean up.
 * @returns {string} The cleaned text.
 */
function formatText(text){
    var markerPattern = new RegExp(" [(]?=[\u4e00-\u9fa5] [)]");
    var blankAfterMarked = new RegExp("(?<=TEMP[\u4e00-\u9fa5]) ");
    // Compiled for parity with the original code; not used below.
    var blankBetweenChars = new RegExp("(?<=[\u4e00-\u9fa5]) (?=[\u4e00-\u9fa5])");

    return text
        .replace(markerPattern, "TEMP")   // first match becomes the TEMP marker
        .replace(blankAfterMarked, "")    // drop the blank after "TEMP" + one CJK character
        .replace("TEMP", "")              // remove the first TEMP marker
        .replace(/ /g, " ");
}

/**
 * Tidy up text taken from a document that mixes pictures and text.
 *
 * Every step is applied to all occurrences, in this order:
 *   1. a run of two or more blanks becomes a line break;
 *   2. a blank directly before a line break is dropped;
 *   3. a line break followed by optional blanks and further line breaks
 *      collapses to a single line break;
 *   4. blanks around a line break are trimmed (at least one blank must
 *      follow the break for the pattern to apply);
 *   5. blank characters are normalised to a plain space;
 *   6. the text "精选文档" delimited by a blank or line break on both sides
 *      is replaced by a line break, and repeated line breaks collapse.
 *
 * Fixes over the previous version: step 6 called replace() without a
 * replacement value, which inserted the literal text "undefined"; steps 1, 3
 * and 4 used non-global patterns and therefore only handled the first match.
 *
 * @param {string} text - Raw text to clean up.
 * @returns {string} The cleaned text.
 */
function formatText2(text) {
    var blank_runs = new RegExp("[  ]{2,}", "g");
    var content_1 = text.replace(blank_runs, "\n");

    var content_2 = content_1.replace(/[  ]\n/g, "\n");

    var empty_lines = new RegExp("\n[   ]*\n*\n", "g");
    var content_3 = content_2.replace(empty_lines, "\n");

    var padded_breaks = new RegExp(" *\n * ", "g");
    var content_4 = content_3.replace(padded_breaks, "\n");

    var content_5 = content_4.replace(/[  ]/g, " ");

    // The delimiters are consumed by the match, so a line break is put back
    // to keep the neighbouring text separated.
    var final_content = content_5
        .replace(/[ \n]精选文档[ \n]/g, "\n")
        .replace(/\n{2,}/g, "\n");

    return final_content;
}

/**
 * Classify the document shown on the current page.
 *
 * The source type is read from the class name of the first child of the
 * first ".doc-title-wrap" element. It is then combined with the number of
 * ".reader-pic-item", ".reader-word-layer" and ".ppt-image-wrap" elements on
 * the page to pick a category.
 *
 * @returns {string|Object} One of the category strings
 *   ("doc-only-pic", "doc-only-word", "doc-pic-word", "pdf-pic-title",
 *   "pdf-only-pic", "pdf-only-word", "pdf-pic-word", "ppt",
 *   "excel-only-word", "excel-only-pic", "txt"); "safe_quit" when the title
 *   element cannot be read (an alert is shown first); or, when no rule
 *   matches, an object holding the detected type and the three counts.
 */
function detectType() {
    var file_type;
    try {
        var doc_title_wrap = document.getElementsByClassName("doc-title-wrap")[0];
        file_type = doc_title_wrap.children[0].className;
    }
    catch (e) {
        alert("请刷新页面以激活该按钮。\n先点击【导出pdf】橙色按钮后该按钮将无法使用。");
        return "safe_quit";
    }

    // Map the class name to a known source type; the first keyword found wins.
    // Unknown class names are kept verbatim so they show up in the diagnostic
    // object below. (The old "txt" test had a misplaced parenthesis and
    // classified almost every unknown class name as "txt".)
    var known_types = ["word", "ppt", "excel", "pdf", "txt"];
    var type = file_type;
    for (var known of known_types) {
        if (file_type.indexOf(known) !== -1) {
            type = known;
            break;
        }
    }

    // A count of 0 means the page has no element of that kind.
    var pic_nums = document.getElementsByClassName("reader-pic-item").length;
    var word_nums = document.getElementsByClassName("reader-word-layer").length;
    var ppt_img_nums = document.getElementsByClassName("ppt-image-wrap").length;

    // Combine the source type with the picture / text counts.
    if (type === "word" && !word_nums && pic_nums) {
        // doc: pictures only
        return "doc-only-pic";
    }
    else if (type === "word" && word_nums > 2 && pic_nums <= 1) {
        // doc: text only
        return "doc-only-word";
    }
    else if (type === "word" && pic_nums > 2 && word_nums > 2) {
        // doc: pictures and text mixed
        return "doc-pic-word";
    }
    else if (type === "pdf" && pic_nums > 2 && word_nums === 1) {
        // pdf: a single text block (title) followed by pictures
        return "pdf-pic-title";
    }
    else if (type === "pdf" && !word_nums && pic_nums) {
        // pdf: pictures only
        return "pdf-only-pic";
    }
    else if (type === "pdf" && !pic_nums && word_nums > 1) {
        // pdf: text only
        return "pdf-only-word";
    }
    else if (type === "pdf" && word_nums > 2 && pic_nums > 1) {
        // pdf: pictures and text mixed
        return "pdf-pic-word";
    }
    else if ((type === "ppt" && ppt_img_nums > 2) || (type === "pdf" && !word_nums && !pic_nums && ppt_img_nums)) {
        // ppt with at least 3 slide images, or a pdf made only of slide images
        return "ppt";
    }
    else if (type === "excel" && pic_nums && word_nums > 2) {
        // excel: contains selectable text
        return "excel-only-word";
    }
    else if (type === "excel" && pic_nums && !word_nums) {
        // excel: pictures only
        return "excel-only-pic";
    }
    else if (type === "txt") {
        // txt: text only
        return "txt";
    }
    else {
        // No rule matched: hand back the raw facts for a bug report.
        return {"源文档类型": type,
                "图形数量": pic_nums,
                "文字块数量": word_nums,
                "ppt纯图形页面数量": ppt_img_nums};
    }
}

/**
 * Best-effort removal of a single element.
 *
 * Any failure (for example `element` being undefined because a lookup found
 * nothing) is swallowed; the only trace is an empty console line.
 *
 * @param {*} element - Object expected to expose a `remove()` method.
 */
function tryToRemoveElement(element) {
    try {
        element.remove();
    } catch (removalError) {
        console.log();
    }
}

/**
 * Best-effort removal of every element in several element lists.
 *
 * @param {Iterable} elem_list_box - Lists of elements, e.g.
 *   [list1, list2, ...]; falsy entries are skipped.
 */
function tryToRemoveSameElem(elem_list_box) {
    for (var elem_list of elem_list_box) {
        if (!elem_list) {
            continue;
        }
        // Callers in this file pass the result of getElementsByClassName,
        // which is a live collection: it shrinks while elements are removed,
        // so iterating it directly skipped every other element. Work on a
        // snapshot instead.
        var snapshot = Array.from(elem_list);
        for (var elem of snapshot) {
            try {
                elem.remove();
            }
            catch(e) {
                // Removal is best effort; a failing element must not stop the rest.
                console.log();
            }
        }
    }
}

/**
 * Ask the user for a left-margin percentage and apply it to the document.
 *
 * The value is applied as `marginLeft` on the first element that carries
 * `class_name`.
 *
 * @param {string} class_name - Class of the element to shift.
 * @param {string} default_offset - Value pre-filled in the prompt.
 * @returns {boolean} true when an integer from 0 to 59 was entered and
 *   applied; false when the prompt was cancelled or the input was rejected
 *   (a rejected input also shows an explanatory alert).
 */
function centerDoc(class_name, default_offset) {
    var doc_main = document.getElementsByClassName(class_name)[0];
    var offset = window.prompt("请输入偏移百分位:", default_offset);

    // prompt() yields null when the dialog is cancelled; treat that as
    // "abort" instead of crashing on `offset.length`.
    if (offset === null) {
        return false;
    }

    // Accept a single digit, or two digits from 10 to 59.
    if (/^(?:[0-9]|[1-5][0-9])$/.test(offset)) {
        doc_main.style.marginLeft = offset + "%";
        return true;
    }

    alert("请输入一个正整数,范围在0至59之间,用来使文档居中\n(不同文档偏移量不同,所以需要手动调整)");
    return false;
}

/*
*  主要功能函数部分
*/

// Module-level counter, initialised to 0.
// NOTE(review): nothing in the visible source reads or updates it - confirm before removing.
var docin_counter = 0;

/**
 * Strip a docin.com document page down to the document and print it.
 *
 * Sequence: click the "#j_select" tool (failures ignored), remove page
 * chrome and ad boxes, shift the ".main" container, hide the export button,
 * show a hint, open the print dialog, then show the button again.
 */
function printPageDocin() {
    const firstByClass = (name) => document.getElementsByClassName(name)[0];

    // Switch to the pointer/select tool; pages without it are fine.
    try {
        document.getElementById("j_select").click();
    } catch (clickError) {
        console.log();
    }

    // Look every element up first, then remove them in the same order.
    const clutter = [
        firstByClass("doc_header_mod"),
        firstByClass("head_wrapper"),
        firstByClass("aside"),
        document.getElementById("docinShareSlider"),
        firstByClass("no_more_mod"),
        document.getElementById("likeToo"),
        firstByClass("tools_bottom_bar"),
        firstByClass("page_crubms"),
        document.getElementById("jControlDivRecomm"),
        firstByClass("backToTop"),
    ];
    clutter.forEach((node) => {
        tryToRemoveElement(node);
    });

    // Ad boxes are removed as a group.
    tryToRemoveSameElem([document.getElementsByClassName("adBox")]);

    // Centre the document with a fixed offset.
    firstByClass("main").style.marginLeft = "6%";

    // Keep the export button out of the printout; restore it afterwards.
    const exportButton = firstByClass("save-doc-btn");
    exportButton.style.display = "none";
    alert("如果预览时有空白页,请取消打印\n请上下滚动页面,确保每页内容都加载完成\n如果文档中有广告,请取消打印,再点一次按钮\n最多不超过2次,应该没有广告了");
    window.print();
    exportButton.style.removeProperty("display");
}

/**
 * Strip an ishare.iask.sina.com.cn document page down to the document and
 * print it.
 *
 * Sequence: remove page chrome and ad containers, show a hint, let the user
 * choose the horizontal offset via centerDoc, then hide the export button,
 * open the print dialog and show the button again. Nothing is printed when
 * centerDoc reports failure.
 */
function printPageiShare() {
    const firstByClass = (name) => document.getElementsByClassName(name)[0];

    // Look every element up first, then remove them in the same order.
    const clutter = [
        firstByClass("detail-topbanner"),
        firstByClass("new-detail-header"),
        document.getElementById("fix-right"),
        firstByClass("loginRedPacket-dialog"),
        firstByClass("fixed-right-full"),
        firstByClass("website-footer"),
        firstByClass("guess-you-like-warpper"),
        firstByClass("detail-top-box"),
        firstByClass("reader-fullScreen"),
        firstByClass("endof-trial-reading"),
    ];
    // The breadcrumb is removed through the parent of its arrow; when the
    // arrow is missing the slot stays undefined, as before.
    let breadcrumb;
    try {
        breadcrumb = firstByClass("crumb-arrow").parentElement;
    } catch (lookupError) {
        console.log();
    }
    clutter.push(
        breadcrumb,
        firstByClass("copyright-container"),
        firstByClass("state-bottom")
    );
    clutter.forEach((node) => {
        tryToRemoveElement(node);
    });

    // Ad containers are removed as a group.
    tryToRemoveSameElem([document.getElementsByClassName("adv-container")]);

    // Let the user centre the document; stop when the input is rejected.
    alert("建议使用:\n偏移量:18\n缩放:默认\n如果预览中有广告,就取消打印\n再点一次按钮,预览中应该就没有广告了");
    const centred = centerDoc("doc-main", "18");
    if (!centred) {
        return;
    }

    // Keep the export button out of the printout; restore it afterwards.
    const exportButton = firstByClass("save-doc-btn");
    exportButton.style.display = "none";
    window.print();
    exportButton.style.removeProperty("display");
}

/**
 * Strip a Baidu Wenku document page down to the document and print it.
 *
 * Sequence: remove page chrome and three groups of same-class elements,
 * show a hint, let the user choose the horizontal offset via centerDoc,
 * then hide the button section, open the print dialog and show the section
 * again. Nothing is printed when centerDoc reports failure.
 */
function printPageBaidu() {
    const firstByClass = (name) => document.getElementsByClassName(name)[0];

    // Single elements: look all of them up, then remove in order.
    const chrome = [
        firstByClass("header-wrapper"),
        document.getElementById("right-wrapper-id"),
        firstByClass("reader-topbar"),
        firstByClass("try-end-fold-page"),
    ];
    chrome.forEach((node) => {
        tryToRemoveElement(node);
    });

    // Groups of elements sharing a class.
    const lazyBlocks = document.getElementsByClassName("lazy-load");
    const windowedOnly = document.getElementsByClassName("no-full-screen");
    const adBlocks = document.getElementsByClassName("hx-warp");
    tryToRemoveSameElem([lazyBlocks, adBlocks, windowedOnly]);

    // Let the user centre the document; stop when the input is rejected.
    alert("建议使用:\n偏移量:0\n缩放:118%");
    const centred = centerDoc("left-wrapper", "0");
    if (!centred) {
        return;
    }

    // Keep the script's buttons out of the printout; restore them afterwards.
    const buttonBar = firstByClass("btns_section");
    buttonBar.style.display = "none";
    window.print();
    buttonBar.style.removeProperty("display");
}

/**
 * Create the orange "export pdf" button and append it to the first
 * ".btns_section" element. Clicking it runs printPageBaidu.
 */
function createSaveHtmlBtn() {
    const pdfButton = document.createElement("button");
    pdfButton.setAttribute("class", "save-html-btn");
    pdfButton.textContent = "导出pdf";

    // Appearance.
    Object.assign(pdfButton.style, {
        height: "25px",
        width: "15%",
        marginLeft: "0.2%",
        backgroundColor: "orange",
        border: "none",
        fontWeight: "bold",
        borderRadius: "10%",
    });

    // Behaviour and placement.
    pdfButton.onclick = printPageBaidu;
    document.getElementsByClassName("btns_section")[0].appendChild(pdfButton);
}

/**
 * Click handler for the "expand" button on ishare pages.
 *
 * While the first ".red-color" element still carries the "continue reading"
 * text it is clicked to load more of the document. Once that text is gone
 * the user is told the document is fully expanded, the export button is
 * revealed and the expand button is hidden.
 */
function readAlliShare() {
    const expandLink = document.getElementsByClassName("red-color")[0];
    const linkText = expandLink.textContent;

    if (linkText.search("点击可继续阅读") !== -1) {
        // More content can still be loaded.
        expandLink.click();
        return;
    }

    alert("文档已经完全展开,可以导出");

    // Swap the visible button: export in, expand out.
    const expandButton = document.getElementsByClassName("init-btn")[0];
    const exportButton = document.getElementsByClassName("save-doc-btn")[0];
    exportButton.style.removeProperty("display");
    expandButton.style.display = "none";
}

/**
 * Click handler for the "expand" button on Baidu Wenku pages.
 *
 * While a ".read-all" element exists it is clicked to load more of the
 * document. Once it is gone the user is told the document is fully
 * expanded, the document is classified with detectType, the orange pdf
 * button is added, and the button matching the detected category is
 * labelled as recommended. Finally the export button is revealed and the
 * expand button hidden.
 *
 * NOTE(review): the contact address in the error message looks mangled
 * ("[email protected]"); it is kept as found.
 *
 * @returns {undefined}
 */
function readAll() {
    const expandLink = document.getElementsByClassName("read-all")[0];
    if (expandLink) {
        // More content can still be loaded.
        expandLink.click();
        return;
    }

    alert("文档已经完全展开,可以导出");

    let category;
    try {
        category = detectType();
    } catch (detectError) {
        alert("未知/特殊文档类型,例如学术文献,暂不支持下载\n也可与作者反馈或联系:\[email protected]");
        return undefined;
    }

    const expandButton = document.getElementsByClassName("init-btn")[0];
    const exportButton = document.getElementsByClassName("save-doc-btn")[0];

    // Make room for, then add, the pdf export button.
    exportButton.style.width = "34.8%";
    createSaveHtmlBtn();

    const textOnly = ["doc-only-word", "pdf-only-word"];
    const pictureOnly = ["doc-only-pic", "pdf-pic-title", "ppt", "pdf-only-pic", "excel-only-pic"];

    if (textOnly.includes(category)) {
        // Text-only documents: recommend the plain-text export.
        exportButton.textContent += "(推荐)";
    } else if (pictureOnly.includes(category)) {
        // Picture-only documents: recommend exporting the picture links.
        exportButton.textContent = "导出图片链接来合并为PDF(推荐)";
    } else {
        // Everything else: recommend the pdf export.
        const pdfButton = document.getElementsByClassName("save-html-btn")[0];
        pdfButton.textContent += "(推荐)";
    }

    // Swap the visible button: export in, expand out.
    exportButton.style.removeProperty("display");
    expandButton.style.display = "none";
}

/**
 * Export the picture links of a picture-based document.
 *
 * For every ".reader-pic-item" element the link is cut out of its inline
 * `style` attribute: everything after "url(" and before
 * ");background-position". Double quotes are stripped from the first link
 * only. The links are joined with line breaks and downloaded as "urls.csv".
 */
function savePDFData() {
    const beforeLink = new RegExp(": ?url[(]");
    const afterLink = new RegExp("[)]; ?background-position");

    const pictures = document.getElementsByClassName("reader-pic-item");
    const links = Array.from(pictures, (picture) => {
        const inlineStyle = picture.getAttribute("style");
        return inlineStyle.split(beforeLink)[1].split(afterLink)[0];
    });

    links[0] = links[0].replace(/"/g, "");
    createAndDownloadFile("urls.csv", links.join("\n"));
}

/**
 * Export the text of a text-based document.
 *
 * The text of every ".reader-word-layer" element is concatenated without a
 * separator, passed through formatText and downloaded as "纯文本文档.txt".
 */
function saveDocData() {
    const layers = document.getElementsByClassName("reader-word-layer");
    const rawText = Array.from(layers, (layer) => layer.textContent).join("");
    createAndDownloadFile("纯文本文档.txt", formatText(rawText));
}

/**
 * Export the picture links of a slide document.
 *
 * The `src` of the first child of every ".ppt-image-wrap" element is
 * collected; the links are joined with line breaks and downloaded as
 * "urls.csv".
 */
function savePPTData() {
    const slides = document.getElementsByClassName("ppt-image-wrap");
    const links = Array.from(slides, (slide) => slide.children[0].src);
    createAndDownloadFile("urls.csv", links.join("\n"));
}

/**
 * Export a spreadsheet document as a picture link plus its text.
 *
 * The picture link is the `background-image` value of the first
 * ".reader-pic-item" element with its first five and last two characters
 * cut off. The text of all ".reader-word-layer" elements is joined with line
 * breaks. Both parts go into one file, "图片地址和表格内容.txt".
 */
function saveExcelData() {
    // Picture of the table.
    const tablePicture = document.getElementsByClassName("reader-pic-item")[0];
    const backgroundValue = tablePicture.style.getPropertyValue("background-image");
    const pictureLink = backgroundValue.slice(5, -2);

    // Text inside the table, one block per line, with blanks normalised.
    const layers = document.getElementsByClassName("reader-word-layer");
    const tableText = Array.from(layers, (layer) => layer.textContent)
        .join("\n")
        .replace(/ /g, " ");

    // Header line, link and text, separated by empty lines.
    const parts = ["表格图形链接如下(复制到浏览器中打开):", pictureLink, tableText];
    createAndDownloadFile("图片地址和表格内容.txt", parts.join("\n\n"));
}

/**
 * Export only the text of a document that mixes pictures and text.
 *
 * The text of every ".reader-word-layer" element is concatenated without a
 * separator, passed through formatText2 and downloaded as "纯文本文档.txt".
 */
function saveDocAndPicData() {
    const layers = document.getElementsByClassName("reader-word-layer");
    const rawText = Array.from(layers, (layer) => layer.textContent).join("");
    createAndDownloadFile("纯文本文档.txt", formatText2(rawText));
}

/**
 * Export a plain-text document.
 *
 * The text of every ".p-txt" element is concatenated without a separator
 * and downloaded unchanged as "纯文本文档.txt".
 */
function saveTxtData() {
    const paragraphs = document.getElementsByClassName("p-txt");
    const wholeText = Array.from(paragraphs, (paragraph) => paragraph.textContent).join("");
    createAndDownloadFile("纯文本文档.txt", wholeText);
}

/**
 * Click handler for the export button on Baidu Wenku pages: classify the
 * document with detectType and run the matching exporter.
 *
 * "safe_quit" ends the call silently. Any other unrecognised result is
 * reported to the user together with its enumerable properties.
 *
 * NOTE(review): the contact address in the message looks mangled
 * ("[email protected]"); it is kept as found.
 */
function saveData() {
    const category = detectType();

    switch (category) {
        // Picture-only documents: export the picture links.
        case "doc-only-pic":
        case "pdf-pic-title":
        case "pdf-only-pic":
        case "excel-only-pic":
            savePDFData();
            return;

        // Text-heavy, non-table documents: export the text.
        case "doc-only-word":
        case "doc-pic-word":
        case "pdf-only-word":
        case "pdf-pic-word":
            saveDocData();
            return;

        // Slides: export the slide picture links.
        case "ppt":
            savePPTData();
            return;

        // Spreadsheets with text: export picture link plus text.
        case "excel-only-word":
            saveExcelData();
            return;

        // Plain text: export as is.
        case "txt":
            saveTxtData();
            return;

        // detectType already informed the user.
        case "safe_quit":
            return;

        default: {
            const details = [];
            for (const key in category) {
                details.push(key + " : " + category[key]);
            }
            alert("未知处理类型,请反馈或联系作者:\[email protected]\n" + details.join("\n"));
        }
    }
}

/**
 * Create the two initial buttons and add them to the page.
 *
 * The first button (class "init-btn", blue, labelled "展开文档") is visible;
 * the second (class "save-doc-btn", green, no label yet) starts hidden. Both
 * are wrapped in a <section class="btns_section"> appended to the body.
 *
 * @returns {Array} The pair [expand button, export button].
 */
function create2btns() {
    // Styling both buttons have in common.
    const sharedLook = {
        height: "25px",
        width: "50%",
        marginLeft: "25%",
        border: "none",
        color: "white",
        fontWeight: "bold",
    };

    const expandButton = document.createElement("button");
    const exportButton = document.createElement("button");

    expandButton.setAttribute("class", "init-btn");
    Object.assign(expandButton.style, sharedLook, { backgroundColor: "blue" });
    expandButton.textContent = "展开文档";

    exportButton.setAttribute("class", "save-doc-btn");
    Object.assign(exportButton.style, sharedLook, { backgroundColor: "green", display: "none" });

    // Wrap both buttons and attach the wrapper to the page.
    const wrapper = document.createElement("section");
    wrapper.setAttribute("class", "btns_section");
    [expandButton, exportButton].forEach((button) => {
        wrapper.appendChild(button);
    });
    document.body.appendChild(wrapper);

    return [expandButton, exportButton];
}

/*
*  主函数部分
*/

/**
 * Set the script up on a Baidu Wenku page.
 *
 * Creates the two buttons, labels the export button, wires the expand
 * button to readAll and the export button to saveData, and appends a
 * print-only style rule that forces `body` to `display:block`.
 */
function baiduWenku() {
    const [expandButton, exportButton] = create2btns();
    exportButton.textContent = "导出纯文本";
    expandButton.onclick = readAll;
    exportButton.onclick = saveData;

    // Lift the page's print restriction.
    const printSheet = document.createElement("style");
    printSheet.type = "text/css";
    printSheet.innerHTML = `@media print {
        body{
            display:block;
        }
    }`;
    document.getElementsByTagName("head")[0].appendChild(printSheet);
}

/**
 * Set the script up on a docin.com page.
 *
 * Only the export button is kept: the expand button is removed right away,
 * the export button is labelled, made visible and wired to printPageDocin.
 */
function docin() {
    const [expandButton, printButton] = create2btns();

    // This site needs no expand step.
    expandButton.remove();

    printButton.textContent = "打印页面到PDF";
    printButton.style.removeProperty("display");
    printButton.onclick = printPageDocin;
}

/**
 * Set the script up on an ishare.iask.sina.com.cn page.
 *
 * Creates the two buttons, labels the export button, wires the expand
 * button to readAlliShare and the export button to printPageiShare, and
 * removes the first ".detail-fixed" element (the bottom download bar) when
 * the page has one.
 */
function ishare() {
    // Create buttons 1 and 2.
    var [btn_1, btn_2] = create2btns();
    btn_2.textContent = "打印页面到PDF";

    // Bind the handlers.
    btn_1.onclick = readAlliShare;
    btn_2.onclick = printPageiShare;

    // Remove the bottom download bar. The lookup yields undefined on pages
    // without the bar; calling remove() unconditionally used to end the
    // set-up with a TypeError.
    var detailfixed = document.getElementsByClassName("detail-fixed")[0];
    if (detailfixed) {
        detailfixed.remove();
    }
}

/**
 * Entry point: run the set-up routine that matches the current host.
 * Unknown hosts only produce a console message.
 */
function main() {
    switch (window.location.host) {
        case "wenku.baidu.com":
            baiduWenku();
            break;
        case "www.docin.com":
            docin();
            break;
        case "ishare.iask.sina.com.cn":
            ishare();
            break;
        default:
            console.log("匹配到了无效网页");
    }
}

// Run the host dispatch once, as soon as the script is evaluated.
main();