// 中华人民共和国中央人民政府【数据爬取和下载】

// www.gov.cn

// ==UserScript==
// @name         中华人民共和国中央人民政府【数据爬取和下载】
// @namespace    http://tampermonkey.net/
// @version      2024-01-08
// @description  www.gov.cn
// @author       You
// @match        https://sousuo.www.gov.cn/sousuo/search.shtml?*
// @icon         https://www.google.com/s2/favicons?sz=64&domain=www.gov.cn
// @grant        none
// @license MIT
// ==/UserScript==

/**
 * Save text as a file by clicking a temporary, hidden download link.
 *
 * @param {string} content  Text to write into the downloaded file.
 * @param {string} filename Name suggested to the browser for the file.
 */
var funDownload = function (content, filename) {
    // Wrap the text in a Blob and expose it through a temporary object URL.
    var blob = new Blob([content]);
    var blobUrl = URL.createObjectURL(blob);

    // Hidden link whose only job is to trigger the download.
    var eleLink = document.createElement('a');
    eleLink.download = filename;
    eleLink.style.display = 'none';
    eleLink.href = blobUrl;

    // The link has to be attached to the document while it is clicked.
    document.body.appendChild(eleLink);
    try {
        eleLink.click();
    } finally {
        // Always clean up the link, even if the click throws.
        document.body.removeChild(eleLink);
        // Object URLs keep their Blob alive until revoked. Release it after a
        // delay so the browser has had time to start reading the data.
        setTimeout(function () {
            URL.revokeObjectURL(blobUrl);
        }, 10000);
    }
};
/**
 * Flatten a field that may hold a list into a single cell value.
 * Arrays are joined with '@'; anything else is stringified; falsy values
 * become an empty string.
 *
 * @param {*} value Raw field value from a record.
 * @returns {string} Cell text.
 */
function joinMultiValue(value) {
    const normalized = value || '';
    return Array.isArray(normalized) ? normalized.join('@') : normalized.toString();
}

/**
 * Column definitions shared by the on-page preview table and the CSV export.
 * Each entry is [column title, getter(record, rowIndex)].
 */
let tableMapping = [
    // 1-based row number.
    ["#", (l, ind) => ind + 1],
    // Date part of `time` (text before the first space); empty when `time` is absent.
    // Title corrected from "发布事件" (publish event) to "发布时间" (publish time).
    ["发布时间", l => String(l.time ?? '').split(' ')[0]],
    ["标签", l => l.label],
    ["类型编号", l => l.documentType],
    ["来源编号", l => l.source],
    ["类型", l => joinMultiValue(l.type)],
    ["链接", l => l.url],
    ["标题", l => l.title_no_tag],
    ["发布代码", l => l.pubcode],
    ["文档类型", l => l.document || ''],
    ["文档ID", l => l.documentId || ''],
    ["general", l => l.general || ''],
    ["agencies", l => joinMultiValue(l.agencies)],
    ["Issue", l => joinMultiValue(l.Issue)],
    ["摘要", l => l.summary],
    ["内容", l => l.content],
];
/**
 * Full-screen overlay that shows a short status message.
 * The element is created hidden and appended to <body> on construction.
 */
class Loading {
    constructor() {
        this.dom = document.createElement('div');
        // The rgb() colour needs its `background:` property name; a bare value
        // is an invalid declaration that the browser silently drops.
        this.dom.setAttribute('style', 'width:100vw;height:100vh;position:fixed;left:0;top:0;display:none;background:rgb(144 144 144 / 26%);color: black;text-align: center;line-height: 150px;font-size: 40px;');
        document.body.append(this.dom);
    }

    /**
     * Make the overlay visible with the given message.
     * @param {string} [text=''] Message to display; defaults to empty so a
     *   call without arguments does not render the word "undefined".
     */
    show(text = '') {
        this.dom.innerText = text;
        this.dom.style.display = 'block';
    }

    /** Hide the overlay. */
    hide() {
        this.dom.style.display = 'none';
    }
};
/**
 * Crawler control panel: start/download buttons, a progress line and a
 * preview table of the most recently fetched page.
 * The panel is appended to `.o_body` on construction.
 */
class SpiderDom {
    /**
     * @param {{start: Function, getCacheData: Function, downloadCacheData: Function}} action
     *   Callbacks invoked by the panel's buttons.
     */
    constructor(action) {
        this.action = {
            start() {
                action.start();
            },
            getCacheData() {
                return action.getCacheData();
            },
            downloadCacheData() {
                action.downloadCacheData();
            }
        };
        let dom = this.buildDom();
        $('.o_body').append(dom);
        this.initDomEvent();
    }

    /**
     * Escape a value for safe insertion into HTML text.
     * `null`/`undefined` become an empty string.
     * @param {*} value
     * @returns {string}
     */
    static escapeHtml(value) {
        if (value === undefined || value === null) {
            return '';
        }
        return String(value)
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;')
            .replace(/"/g, '&quot;')
            .replace(/'/g, '&#39;');
    }

    /** Build the panel markup as a jQuery object. */
    buildDom() {
        return $(`<div style="display: block;flex: 1;background: #dfdfdf;">
<div id="spider_action">
<button id="spider_action_start" style="background: #025293;border-radius: 3px;color: #ffffff;cursor: pointer;padding: 5px;">开始爬取</button>
<!--<button id="spider_action_cache">从缓存读取</button>-->
<button id="spider_action_download" style="background: #025293;border-radius: 3px;color: #ffffff;cursor: pointer;padding: 5px;">下载</button>
</div>
<!--<div id="spider_search_info"></div>-->
<div id="spider_process"></div>
<div id="spider_result"></div>
</div>`);
    }

    /** Bind the panel buttons to the injected actions. */
    initDomEvent() {
        $("#spider_action_start").on('click', () => {
            this.action.start();
        });
        // The "cache" button is commented out in buildDom(), so this handler
        // only fires if that markup is re-enabled.
        $("#spider_action_cache").on('click', () => {
            this.action.getCacheData();
        });
        $("#spider_action_download").on('click', () => {
            this.action.downloadCacheData();
        });
    }

    /**
     * Render one page of records into #spider_result as a table.
     * @param {Array<Object>} list Records of the current page.
     */
    drawCurrentSpiderData(list) {
        const headCells = tableMapping
            .map(([title]) => `<th>${SpiderDom.escapeHtml(title)}</th>`)
            .join('');
        const bodyRows = list.map((record, ind) => {
            // Values come from the remote response, so escape them instead of
            // letting them be parsed as markup.
            const cells = tableMapping
                .map(([, getValue]) => `<td>${SpiderDom.escapeHtml(getValue(record, ind))}</td>`)
                .join('');
            return `<tr>${cells}</tr>`;
        }).join('\n');
        // thead/tbody must live inside a <table>; set directly on a <div> the
        // HTML parser throws the table tags away and only loose text remains.
        $("#spider_result").html(`<table><thead><tr>${headCells}</tr></thead><tbody>
${bodyRows}
</tbody></table>`);
    }

    /**
     * Update the progress line and preview after a page response arrives.
     * @param {Object} requestData  Options of the request (unused here).
     * @param {Object} responseData Response carrying `result.data.pager` and
     *   `result.data.middle.list`.
     */
    renderAfterPageRequest(requestData, responseData) {
        let pageInfo = responseData.result.data.pager;
        let list = responseData.result.data.middle.list;
        // Records on earlier pages plus the ones on this page.
        let currentSpiderCount = (pageInfo.pageNo - 1) * pageInfo.pageSize + list.length;
        $("#spider_process").text(`总共有 ${pageInfo.total} 条记录,已经爬取 ${currentSpiderCount}`);
        this.drawCurrentSpiderData(list);
    }

    /** Show the "crawl finished" message in the progress line. */
    finish() {
        $("#spider_process").text("爬取完成");
    }
};
/**
 * Holds crawled records in memory, mirrors them to localStorage and exports
 * them as CSV.
 */
class Cache {
    constructor() {
        this.key = `spider_cache_key`;
        this.cache = [];
    }

    /**
     * Append records and persist the whole cache.
     * @param {Array<Object>} list Records to add.
     */
    addData(list) {
        for (const record of list) {
            this.cache.push(record);
        }
        try {
            localStorage.setItem(this.key, JSON.stringify(this.cache));
        } catch (err) {
            // Storage can reject the write (e.g. quota exceeded). Persisting is
            // best-effort: the in-memory copy is still complete, so do not let
            // the error abort the caller.
            console.warn('spider cache: could not persist to localStorage', err);
        }
    }

    /**
     * Return the in-memory records, or the persisted ones when memory is empty.
     * Unparseable or non-array persisted data yields an empty list.
     * @returns {Array<Object>}
     */
    getData() {
        if (this.cache.length) {
            return this.cache;
        }
        const raw = localStorage.getItem(this.key);
        if (!raw) {
            return [];
        }
        try {
            const parsed = JSON.parse(raw);
            return Array.isArray(parsed) ? parsed : [];
        } catch (err) {
            console.warn('spider cache: ignoring unreadable persisted data', err);
            return [];
        }
    }

    /**
     * Export all records as a UTF-8 CSV (with BOM) named after the current
     * timestamp.
     */
    download() {
        // Quote every cell and double embedded quotes so the original text
        // survives a round trip. Only null/undefined become empty, so
        // legitimate values such as 0 are kept.
        const toCell = (value) => {
            const text = value === undefined || value === null ? '' : String(value);
            return `"${text.replace(/"/g, '""')}"`;
        };
        const head = tableMapping.map(([title]) => toCell(title)).join(',');
        const content = this.getData()
            .map((line, ind) => tableMapping.map(([, getValue]) => toCell(getValue(line, ind))).join(','))
            .join('\r\n');
        // The leading BOM lets spreadsheet applications detect UTF-8.
        funDownload(`\uFEFF${head}\r\n${content}`, `${new Date().getTime()}.csv`);
    }
}
/**
 * Drives the page-by-page crawl: a timer asks the site's own pager
 * (`window.listPager.goPageFn`) for the next page whenever no request is
 * outstanding, and `callback()` is expected to be called with each response.
 */
class RequestPageData {
    /** Values returned by go(). StartRequest is declared but not used in this class. */
    static Status = {
        Over: 0,
        Requesting: 1,
        StartRequest: 2,
    }

    constructor() {
        this.currentPage = 0;
        // Placeholder paging info; replaced by the pager of each response in callback().
        this.pager = {
            "pageCount": 181,
            "total": 1810,
            "pageNo": 181,
            "pageSize": 10,
            "totalPage": 181,
        };
        this.beforeRequestHandle = [];
        this.afterRequestHandle = [];
        this.overRequestHandle = [];
        // True while a page request is outstanding (set in go(), cleared in callback()).
        this.padding = false;
        // Timer id of the running crawl, or null when idle.
        this.requestId = null;
    }

    /**
     * Register a handler called as cb(requestData, responseData) after each response.
     * @returns {RequestPageData} this, for chaining.
     */
    addAfterRequestHandle(cb) {
        this.afterRequestHandle.push(cb);
        return this;
    }

    /**
     * Register a handler called as cb(pageNumber) just before a page is requested.
     * @returns {RequestPageData} this, for chaining.
     */
    addBeforeRequestHandle(cb) {
        this.beforeRequestHandle.push(cb);
        return this;
    }

    /**
     * Register a handler called with no arguments when the crawl has run past the last page.
     * @returns {RequestPageData} this, for chaining.
     */
    addOverRequestHandle(cb) {
        this.overRequestHandle.push(cb);
        return this;
    }

    /**
     * Start crawling from page 1. Does nothing while a crawl is already running.
     * Polls every 500 ms and requests the next page once the previous one has
     * been answered.
     */
    start() {
        if (this.requestId) {
            return;
        }
        this.currentPage = 0;
        this.requestId = setInterval(() => {
            if (this.go() !== RequestPageData.Status.Over) {
                return;
            }
            // Stop the timer and mark the crawler idle BEFORE notifying, so a
            // throwing handler cannot leave the timer ticking and a later
            // start() is able to run again.
            clearInterval(this.requestId);
            this.requestId = null;
            this.overRequestHandle.forEach(cb => cb());
        }, 500);
    }

    /**
     * Request a page unless one is already outstanding.
     * @param {number} [page] Page to fetch; when omitted the next page is used.
     * @returns {number} Status.Requesting while a request is (now) outstanding,
     *   Status.Over when the target page lies beyond the last page.
     */
    go(page) {
        if (this.padding) {
            return RequestPageData.Status.Requesting;
        }
        if (page) {
            this.currentPage = page;
        } else {
            this.currentPage++;
        }
        // Page 1 is always requested: the real page count is only known after
        // the first response.
        if (this.currentPage === 1 || this.currentPage <= this.pager.totalPage) {
            this.padding = true;
            this.beforeRequestHandle.forEach(cb => cb(this.currentPage));
            window.listPager.goPageFn(this.currentPage);
            return RequestPageData.Status.Requesting;
        } else {
            return RequestPageData.Status.Over;
        }
    }

    /**
     * Feed a page response back in: refresh paging info, run the after-request
     * handlers and mark the request as finished.
     * @param {Object} requestData  Options of the request that produced the response.
     * @param {Object} responseData Response carrying `result.data.pager`.
     */
    callback(requestData, responseData) {
        try {
            const pager = responseData.result.data.pager;
            this.pager = {
                ...pager,
                totalPage: Math.ceil(pager.total / pager.pageSize)
            };
            this.afterRequestHandle.forEach(cb => cb(requestData, responseData));
        } finally {
            // Release the in-flight flag even when a handler throws; otherwise
            // go() would report "Requesting" forever and the crawl would stall.
            this.padding = false;
        }
    }
}

/**
 * Create the crawler parts (cache, overlay, pager driver, control panel) and
 * wire them together.
 *
 * @returns {{hideResult: Function, loading: Loading, redirectSuccess: Function}}
 *   `hideResult()` hides the site's own result widgets, `loading` is the
 *   overlay, and `redirectSuccess(requestData, responseData)` feeds an
 *   intercepted search response into the crawler.
 */
function init() {
    const cache = new Cache();
    const loading = new Loading();
    const requestPageData = new RequestPageData();
    let hideResultId = false;

    const spiderDom = new SpiderDom({
        start() {
            requestPageData.start();
        },
        getCacheData() {
            return cache.getData();
        },
        downloadCacheData() {
            cache.download();
            // todo
        }
    });

    requestPageData
        .addAfterRequestHandle(spiderDom.renderAfterPageRequest.bind(spiderDom))
        .addAfterRequestHandle((req, rep) => {
            cache.addData(rep.result.data.middle.list);
        })

        // Every intercepted response is cached, including ones that arrive
        // before a crawl starts. Empty the in-memory cache when the crawl
        // requests its first page so those records are not stored twice.
        .addBeforeRequestHandle(page => {
            if (page === 1) {
                cache.cache.length = 0;
            }
        })
        .addBeforeRequestHandle(page => loading.show(`正在获取数据 [page=${page}]`))

        .addOverRequestHandle(spiderDom.finish.bind(spiderDom))
        .addOverRequestHandle(() => loading.hide());

    // Hide the site's pager, result list and iframe containers, and stop the
    // search column from stretching. Elements that are not on the page (yet)
    // are skipped instead of throwing.
    const hideSiteResults = () => {
        $('div.js_listPager.listPager').each((idx, e) => {
            e.style.display = 'none';
        });
        $('div.main_content.basic_result.js_result').each((idx, e) => {
            e.style.display = 'none';
        });
        $('iframe').each((idx, e) => {
            if (e.parentElement) {
                e.parentElement.style.display = 'none';
            }
        });
        const searchMain = $('.search-main')[0];
        if (searchMain) {
            searchMain.style.flex = 'unset';
        }
    };

    return {
        /**
         * Hide the site's own widgets now and again every 2 s (the page may
         * re-show or re-create them). Repeated calls start only one timer.
         */
        hideResult() {
            if (hideResultId) {
                return;
            }
            hideSiteResults();
            hideResultId = setInterval(hideSiteResults, 2000);
        },
        loading,
        redirectSuccess(requestData, responseData) {
            requestPageData.callback(requestData, responseData);
        },
    };
}


(function () {
    // Only requests to this endpoint (with dataType "json") are treated as search requests.
    const SEARCH_API_PREFIX = 'https://sousuoht.www.gov.cn/athena/forward';

    /**
     * Set up the crawler and replace jQuery's ajax with a wrapper that routes
     * search responses to the crawler instead of the site's own success
     * callback. All other requests pass through unchanged.
     */
    function inj() {
        const obj = init();
        const originalAjax = jQuery.ajax;
        // Keeps the untouched jQuery.ajax reachable after it has been replaced.
        window.ajaxInj = function (...args) {
            return originalAjax.apply(jQuery, args);
        };
        obj.hideResult();

        const patchedAjax = function (...args) {
            // jQuery accepts both $.ajax(settings) and $.ajax(url, settings).
            const urlFirst = typeof args[0] === 'string';
            const opt = (urlFirst ? args[1] : args[0]) || {};
            const url = urlFirst ? args[0] : opt.url;
            const isSearchRequest = typeof url === 'string'
                && url.startsWith(SEARCH_API_PREFIX)
                && opt.dataType === 'json';
            if (!isSearchRequest) {
                return window.ajaxInj(...args);
            }

            opt._go_ = true;
            // Drop the site's own success callback; the crawler consumes the response instead.
            opt.success = null;
            const request = window.ajaxInj(...args);
            request.fail((jqXHR, textStatus, errorThrown) => {
                // No response reaches the crawler for a failed request, so at
                // least report it and take the overlay down so the page stays usable.
                console.error('search request failed:', textStatus, errorThrown);
                obj.loading.hide();
            });
            return request.then(ret => {
                try {
                    obj.redirectSuccess(opt, ret);
                } finally {
                    obj.loading.hide();
                }
            });
        };

        // Install on both names so the two stay consistent even when `$` and
        // `jQuery` are not the same object.
        jQuery.ajax = patchedAjax;
        $.ajax = patchedAjax;
    }

    // jQuery is loaded by the page itself; poll until it is available, then inject once.
    const pollId = setInterval(() => {
        if (window.$ && window.$.ajax && window.jQuery) {
            inj();
            clearInterval(pollId);
        }
    }, 500);
})();