Greasy Fork is available in English.

中华人民共和国中央人民政府【数据爬取和下载】

www.gov.cn

このスクリプトの質問や評価の投稿はこちら、通報はこちらへお寄せください。
// ==UserScript==
// @name         中华人民共和国中央人民政府【数据爬取和下载】
// @namespace    http://tampermonkey.net/
// @version      2024-01-08
// @description  www.gov.cn
// @author       You
// @match        https://sousuo.www.gov.cn/sousuo/search.shtml?*
// @icon         https://www.google.com/s2/favicons?sz=64&domain=www.gov.cn
// @grant        none
// @license MIT
// ==/UserScript==

/**
 * Saves the given content as a file by clicking a temporary, hidden
 * download link that points at a Blob object URL.
 *
 * @param content  Content to save (anything accepted as a Blob part).
 * @param filename File name suggested to the browser for the download.
 */
var funDownload = function (content, filename) {
    // Wrap the content in a Blob and expose it through a temporary object URL.
    var blob = new Blob([content]);
    var blobUrl = URL.createObjectURL(blob);

    // Build the hidden download link.
    var eleLink = document.createElement('a');
    eleLink.download = filename;
    eleLink.style.display = 'none';
    eleLink.href = blobUrl;

    document.body.appendChild(eleLink);
    try {
        // Trigger the download.
        eleLink.click();
    } finally {
        // Always detach the link again, even if click() throws.
        document.body.removeChild(eleLink);
        // Object URLs keep their Blob alive until revoked, so release it.
        // The revoke is deferred rather than done synchronously so that it
        // cannot race with the browser picking up the URL for the download.
        setTimeout(function () {
            URL.revokeObjectURL(blobUrl);
        }, 1000);
    }
};
/**
 * Renders a field that may hold either a single value or a list of values.
 * Lists are joined with '@'; a missing or falsy value becomes ''.
 *
 * @param value Raw field value taken from a result record.
 * @returns {string} Text representation of the field.
 */
const joinMultiValue = (value) => {
    const present = value || '';
    return Array.isArray(present) ? present.join('@') : present.toString();
};

/**
 * Column definitions shared by the on-page result table and the CSV export.
 * Each entry is [column header, extractor], where the extractor receives a
 * result record and its zero-based index and returns the cell value.
 *
 * NOTE(review): the header "发布事件" reads "publish event" while the value is
 * the date part of `time`; "发布时间" may have been intended. It is left as-is
 * so the exported column name stays stable.
 */
let tableMapping = [
    ["#", (l, ind) => ind + 1],
    // Keep only the part before the first space; a record without `time`
    // yields '' instead of throwing and aborting the whole render/export.
    ["发布事件", l => String(l.time ?? '').split(' ')[0]],
    ["标签", l => l.label],
    ["类型编号", l => l.documentType],
    ["来源编号", l => l.source],
    ["类型", l => joinMultiValue(l.type)],
    ["链接", l => l.url],
    ["标题", l => l.title_no_tag],
    ["发布代码", l => l.pubcode],
    ["文档类型", l => l.document || ''],
    ["文档ID", l => l.documentId || ''],
    ["general", l => l.general || ''],
    ["agencies", l => joinMultiValue(l.agencies)],
    ["Issue", l => joinMultiValue(l.Issue)],
    ["摘要", l => l.summary],
    ["内容", l => l.content],
];
/**
 * Full-viewport overlay used to display a status message.
 * The overlay element is created hidden and appended to document.body.
 */
class Loading {
    constructor() {
        this.dom = document.createElement('div');
        // The colour value needs its `background` property name; a bare
        // `rgb(...)` is not a valid declaration and is dropped by the CSS
        // parser, which left the overlay without any backdrop.
        this.dom.setAttribute('style', 'width:100vw;height:100vh;position:fixed;left:0;top:0;display:none;background:rgb(144 144 144 / 26%);color: black;text-align: center;line-height: 150px;font-size: 40px;');
        document.body.append(this.dom);
    }

    /**
     * Shows the overlay with the given message.
     * @param text Message to display; defaults to an empty string so that a
     *             call without arguments does not render "undefined".
     */
    show(text = '') {
        this.dom.innerText = text;
        this.dom.style.display = 'block';
    }

    /** Hides the overlay. */
    hide() {
        this.dom.style.display = 'none';
    }
}
/**
 * Control panel appended to the page's `.o_body` element: a start button, a
 * download button, a progress line and a table showing the most recently
 * fetched page of results.
 */
class SpiderDom {
    /**
     * @param action Object providing start(), getCacheData() and
     *               downloadCacheData(); the buttons delegate to these.
     */
    constructor(action) {
        this.action = {
            start() {
                action.start();
            },
            getCacheData() {
                return action.getCacheData();
            },
            downloadCacheData() {
                action.downloadCacheData();
            }
        };
        let dom = this.buildDom();
        $('.o_body').append(dom);
        this.initDomEvent();
    }

    /**
     * Converts a value to text that is safe to interpolate into HTML.
     * null/undefined become an empty string.
     *
     * @param value Any cell or header value.
     * @returns {string} HTML-escaped text.
     */
    static escapeHtml(value) {
        return String(value ?? '')
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;')
            .replace(/"/g, '&quot;')
            .replace(/'/g, '&#39;');
    }

    /** Builds the (not yet attached) panel markup as a jQuery object. */
    buildDom() {
        return $(`<div style="display: block;flex: 1;background: #dfdfdf;">
<div id="spider_action">
<button id="spider_action_start" style="background: #025293;border-radius: 3px;color: #ffffff;cursor: pointer;padding: 5px;">开始爬取</button>
<!--<button id="spider_action_cache">从缓存读取</button>-->
<button id="spider_action_download" style="background: #025293;border-radius: 3px;color: #ffffff;cursor: pointer;padding: 5px;">下载</button>
</div>
<!--<div id="spider_search_info"></div>-->
<div id="spider_process"></div>
<div id="spider_result"></div>
</div>`);
    }

    /** Wires the panel buttons to the injected actions. */
    initDomEvent() {
        $("#spider_action_start").on('click', () => {
            this.action.start();
        });
        // The matching button is commented out in buildDom(), so this handler
        // only becomes reachable if that markup is re-enabled.
        $("#spider_action_cache").on('click', () => {
            this.action.getCacheData();
        });
        $("#spider_action_download").on('click', () => {
            this.action.downloadCacheData();
        });
    }

    /**
     * Renders the given records into #spider_result, one column per
     * tableMapping entry.
     *
     * @param list Records of the page that was just fetched.
     */
    drawCurrentSpiderData(list) {
        const headCells = tableMapping
            .map(tm => `<th>${SpiderDom.escapeHtml(tm[0])}</th>`)
            .join('');
        // Values come from the remote search response, so they are escaped
        // before being placed into markup.
        const bodyRows = list.map((l, ind) => {
            const cells = tableMapping
                .map(tm => `<td>${SpiderDom.escapeHtml(tm[1](l, ind))}</td>`)
                .join('');
            return `<tr>${cells}</tr>`;
        }).join('\n');
        // thead/tbody must live inside a <table>; written straight into the
        // <div> the HTML parser discards the table tags and keeps only text.
        $("#spider_result").html(`<table><thead><tr>${headCells}</tr></thead><tbody>
${bodyRows}
</tbody></table>`);
    }

    /**
     * Updates the progress line and the result table after a page response.
     *
     * @param requestData  Options of the request that produced the response.
     * @param responseData Response; reads result.data.pager and
     *                     result.data.middle.list.
     */
    renderAfterPageRequest(requestData, responseData) {
        let pageInfo = responseData.result.data.pager;
        let list = responseData.result.data.middle.list;
        // Records on all earlier pages plus the records on this page.
        let currentSpiderCount = (pageInfo.pageNo - 1) * pageInfo.pageSize + list.length;
        $("#spider_process").text(`总共有 ${pageInfo.total} 条记录,已经爬取 ${currentSpiderCount}`);
        this.drawCurrentSpiderData(list);
    }

    /** Replaces the progress line with the completion message. */
    finish() {
        $("#spider_process").text("爬取完成");
    }
}
/**
 * Accumulates fetched records in memory, mirrors them to localStorage under a
 * fixed key, and exports them as a CSV file.
 */
class Cache {
    constructor() {
        this.key = `spider_cache_key`;
        this.cache = [];
    }

    /**
     * Appends records to the in-memory list and persists the whole list.
     *
     * @param list Records to append.
     */
    addData(list) {
        for (const item of list) {
            this.cache.push(item);
        }
        try {
            localStorage.setItem(this.key, JSON.stringify(this.cache));
        } catch (e) {
            // setItem throws when the storage quota is exceeded. Persistence
            // is only a convenience copy, so keep the in-memory data and
            // report the problem instead of failing the caller.
            console.warn('[spider] could not persist cache to localStorage:', e);
        }
    }

    /**
     * Returns the in-memory records; when there are none, falls back to the
     * list persisted in localStorage.
     *
     * @returns {Array} Cached records, or [] when nothing usable is stored.
     */
    getData() {
        if (this.cache.length) {
            return this.cache;
        }
        const raw = localStorage.getItem(this.key);
        if (!raw) {
            return [];
        }
        try {
            const parsed = JSON.parse(raw);
            // download() maps over the result, so anything but an array is unusable.
            return Array.isArray(parsed) ? parsed : [];
        } catch (e) {
            console.warn('[spider] ignoring unreadable cache in localStorage:', e);
            return [];
        }
    }

    /**
     * Exports the cached records as a CSV file named "<timestamp>.csv".
     * The content starts with a byte-order mark, uses CRLF line endings and
     * has one column per tableMapping entry.
     */
    download() {
        const data = this.getData();
        // Quote every field and double embedded quotes so the original text
        // survives the export. Only null/undefined become empty; 0 and false
        // are kept.
        const quote = (value) => `"${String(value ?? '').replace(/"/g, '""')}"`;
        const head = tableMapping.map(col => quote(col[0])).join(',');
        const content = data
            .map((line, ind) => tableMapping.map(col => quote(col[1](line, ind))).join(','))
            .join('\r\n');
        funDownload("\uFEFF" + head + "\r\n" + content, (new Date().getTime()) + ".csv");
    }
}
/**
 * Drives a page-by-page crawl: a 500 ms timer asks the page's own pager
 * (window.listPager.goPageFn) for the next page whenever no request is in
 * flight, and callback() must be invoked with each response to continue.
 */
class RequestPageData {
    /** Values returned by go(). */
    static Status = {
        Over: 0,
        Requesting: 1,
        StartRequest: 2,
    }

    constructor() {
        this.currentPage = 0;
        // Placeholder pager; replaced by the real one in callback() after the
        // first response. Page 1 is always requested regardless of it.
        this.pager = {
            "pageCount": 181,
            "total": 1810,
            "pageNo": 181,
            "pageSize": 10,
            "totalPage": 181,
        };
        this.beforeRequestHandle = [];
        this.afterRequestHandle = [];
        this.overRequestHandle = [];
        // true while a page request is in flight (set in go(), cleared in callback()).
        this.padding = false;
        // Interval id of the running crawl, or null when idle.
        this.requestId = null;
    }

    /** Registers cb(requestData, responseData), run after each response. Chainable. */
    addAfterRequestHandle(cb) {
        this.afterRequestHandle.push(cb);
        return this;
    }

    /** Registers cb(page), run right before a page is requested. Chainable. */
    addBeforeRequestHandle(cb) {
        this.beforeRequestHandle.push(cb);
        return this;
    }

    /** Registers cb(), run once when the crawl has passed the last page. Chainable. */
    addOverRequestHandle(cb) {
        this.overRequestHandle.push(cb);
        return this;
    }

    /** Starts a crawl from page 1; does nothing while a crawl is running. */
    start() {
        if (this.requestId) {
            return;
        }
        this.currentPage = 0;
        this.requestId = setInterval(() => {
            const status = this.go();
            if (status !== RequestPageData.Status.Over) {
                return;
            }
            // Stop the timer and forget its id BEFORE notifying: a throwing
            // handler can then no longer leave the interval running, and
            // start() is able to launch another crawl later.
            clearInterval(this.requestId);
            this.requestId = null;
            this.overRequestHandle.forEach(cb => cb());
        }, 500);
    }

    /**
     * Requests one page unless a request is already in flight.
     *
     * @param page Optional page number to jump to; otherwise the next page.
     * @returns {number} Status.Requesting while a request is (now) in flight,
     *                   Status.Over when the page lies past the last one.
     */
    go(page) {
        if (this.padding) {
            return RequestPageData.Status.Requesting;
        }
        if (page) {
            this.currentPage = page;
        } else {
            this.currentPage++;
        }
        if (this.currentPage === 1 || this.currentPage <= this.pager.totalPage) {
            this.padding = true;
            this.beforeRequestHandle.forEach(cb => cb(this.currentPage));
            window.listPager.goPageFn(this.currentPage);
            return RequestPageData.Status.Requesting;
        } else {
            return RequestPageData.Status.Over;
        }
    }

    /**
     * Feeds a page response back in: refreshes the pager, runs the
     * after-request handlers and allows the next page to be requested.
     *
     * @param requestData  Options of the request that produced the response.
     * @param responseData Response; reads result.data.pager (total, pageSize).
     */
    callback(requestData, responseData) {
        const pager = responseData.result.data.pager;
        this.pager = {
            ...pager,
            totalPage: Math.ceil(pager.total / pager.pageSize)
        };
        try {
            this.afterRequestHandle.forEach(cb => cb(requestData, responseData));
        } finally {
            // Release the in-flight flag even when a handler throws;
            // otherwise go() reports Requesting forever and the crawl stalls.
            this.padding = false;
        }
    }
}

/**
 * Creates the cache, loading overlay, crawler and control panel, and wires
 * them together.
 *
 * @returns {{hideResult: Function, loading: Loading, redirectSuccess: Function}}
 *   hideResult() hides the site's own result UI (and keeps it hidden),
 *   loading is the overlay instance, and redirectSuccess(requestData,
 *   responseData) hands a search response to the crawler.
 */
function init() {
    let cache = new Cache();
    let loading = new Loading();
    let hideResultId = false;
    let requestPageData = new RequestPageData();
    let spiderDom = new SpiderDom({
        start() {
            requestPageData.start();
        },
        getCacheData() {
            return cache.getData();
        },
        downloadCacheData() {
            cache.download();
            // todo
        }
    });
    requestPageData
        // After each response: refresh the panel, then store the records.
        .addAfterRequestHandle(spiderDom.renderAfterPageRequest.bind(spiderDom))
        .addAfterRequestHandle((req, rep) => {
            cache.addData(rep.result.data.middle.list);
        })

        .addBeforeRequestHandle(page => loading.show(`正在获取数据 [page=${page}]`))

        .addOverRequestHandle(spiderDom.finish.bind(spiderDom))
        .addOverRequestHandle(() => loading.hide());

    return {
        hideResult() {
            // Only one hiding timer is ever installed.
            if (hideResultId) {
                return;
            }
            function hide() {
                $('div.js_listPager.listPager').each((idx, e) => { e.style.display = 'none'; });
                $('div.main_content.basic_result.js_result').each((idx, e) => { e.style.display = 'none'; });
                $('iframe').each((idx, e) => { e.parentElement.style.display = 'none'; });
                // The element may be absent; indexing an empty match and
                // touching .style would throw, abort the whole injection
                // and throw again on every timer tick.
                const searchMain = $('.search-main')[0];
                if (searchMain) {
                    searchMain.style.flex = 'unset';
                }
            }
            hide();
            // Re-apply periodically: the elements are hidden again every 2 s.
            hideResultId = setInterval(() => {
                hide();
            }, 2000);
        },
        loading,
        redirectSuccess(requestData, responseData) {
            requestPageData.callback(requestData, responseData);
        },
    };
}


(function() {
    /**
     * Sets up the spider UI and replaces $.ajax with a wrapper that routes
     * JSON responses of the search endpoint to the spider instead of the
     * page's own success handler. All other requests pass through unchanged.
     */
    function inj() {
        let obj = init();
        const ajax = jQuery.ajax;
        // Forwards to the original jQuery.ajax saved above (exposed on window).
        window.ajaxInj = function (...args) {
            return ajax.apply(jQuery, args);
        };
        obj.hideResult();

        delete jQuery.ajax;
        delete $.ajax;

        $.ajax = function (...args) {
            // jQuery.ajax accepts either (settings) or (url, settings).
            const [first, second] = args;
            const opt = (typeof first === 'object' && first !== null) ? first : second;
            const targetUrl = typeof first === 'string' ? first : opt?.url;
            const isSearchRequest = typeof targetUrl === 'string'
                && targetUrl.startsWith('https://sousuoht.www.gov.cn/athena/forward')
                && opt?.dataType === 'json';

            if (!isSearchRequest) {
                return window.ajaxInj(...args);
            }

            opt._go_ = true;
            // Suppress the page's own handler; the spider consumes the response.
            opt.success = null;
            const request = window.ajaxInj(...args);
            // Attached with .fail() so the rejection still reaches the caller.
            // NOTE(review): a failed request is only logged here; nothing
            // tells the crawler about it, so a running crawl presumably
            // stays paused on that page — confirm whether a retry is wanted.
            request.fail((jqXHR, textStatus, errorThrown) => {
                console.error('[spider] search request failed:', textStatus, errorThrown);
                obj.loading.hide();
            });
            return request.then(ret => {
                try {
                    obj.redirectSuccess(opt, ret);
                } finally {
                    // Hide the overlay even if processing the response throws.
                    obj.loading.hide();
                }
            });
        };
    }

    // Wait until the page has loaded jQuery, then inject exactly once.
    let id = setInterval(() => {
        if (window.$ && window.$.ajax && window.jQuery) {
            // Stop polling first: if inj() throws, it must not be re-run
            // (and re-append the panel) on every subsequent tick.
            clearInterval(id);
            inj();
        }
    }, 500);
})();