提取 DOM 下的链接并下载为 JSON

爬取 class 为 fff 的 DOM 元素内所有链接所指向的页面内容,并下载为 JSON 文件。

// ==UserScript==
// @name        提取 DOM 下的链接并下载为 JSON
// @version     1.1
// @description 爬取某个class被标记fff的 DOM 元素下面的所有链接的内容,并下载为 JSON 文件。
// @namespace    http://your-namespace.com
// @author       cjm
// @match        http://*/*
// @match        https://*/*
// @grant        GM_registerMenuCommand
// @grant        GM_xmlhttpRequest
// @license MIT
// ==/UserScript==

(function() {
    'use strict';

    /**
     * Menu-command entry point: gather every link found under elements with
     * class "fff", fetch each linked page, reduce it to plain text and download
     * the result as "<host>.json".
     *
     * The output has the shape `{ links: [{ link, text }] }`; `text` is `null`
     * for a page whose content could not be retrieved.
     *
     * @returns {Promise<void>} Settles once the download has been triggered or
     *     the failure has been logged; it never rejects.
     */
    function scrapeLinks() {
        const uniqueLinks = collectUniqueLinks();
        console.log(uniqueLinks);

        return Promise.all(uniqueLinks.map((link) => getLinkContent(link)))
            .then((contentArray) => {
                const parsedContentArray = contentArray.map(({ link, content }) => ({
                    link,
                    // A failed fetch reports null content. Parsing it would coerce it to
                    // the literal string "null", so keep it as null instead.
                    text: content == null ? null : extractPageText(content),
                }));

                const jsonContent = { links: parsedContentArray };
                console.log(jsonContent);

                // Tab-indented so the downloaded file is readable by hand.
                const jsonString = JSON.stringify(jsonContent, null, '\t');
                downloadFile(jsonString, `${window.location.host}.json`);
            })
            .catch((err) => {
                // Surface the failure instead of leaving an unhandled rejection.
                console.error('Link scraping failed:', err);
            });
    }

    /**
     * Collect the distinct http(s) URLs of all anchors inside ".fff" elements.
     *
     * @returns {string[]} URLs without their fragment, in first-seen order.
     */
    function collectUniqueLinks() {
        const links = [...document.querySelectorAll('.fff')]
            .flatMap((element) => [...element.querySelectorAll('a')])
            .map((anchor) => anchor.href)
            // Only http(s) targets can be fetched; this drops empty hrefs,
            // mailto:, javascript: and similar schemes.
            .filter((href) => /^https?:\/\//i.test(href))
            // Strip the fragment so "#section" variants of one page collapse together.
            .map((href) => href.split('#', 1)[0]);

        return [...new Set(links)];
    }

    /**
     * Parse an HTML string and return the text content of its body with
     * redundant whitespace collapsed.
     *
     * @param {string} html - Raw HTML of a fetched page.
     * @returns {string} The normalized body text.
     */
    function extractPageText(html) {
        const doc = new DOMParser().parseFromString(html, 'text/html');

        return doc.body.textContent
            // Turn whitespace-only lines into a bare newline.
            .replace(/^\s+$/gm, '\n')
            // Collapse runs of newlines into a single newline.
            .replace(/\n+/g, '\n')
            // Collapse runs of tabs into a single tab.
            .replace(/\t+/g, '\t');
    }

    /**
     * Fetch a URL with GM_xmlhttpRequest and report its body.
     *
     * The returned promise never rejects. Any outcome other than an HTTP 200
     * response (other status codes, network errors, aborts, timeouts, or the
     * request function throwing) resolves with `content: null`, so a single
     * bad link cannot stall or fail a `Promise.all` over many links.
     *
     * @param {string} link - URL to request with GET.
     * @returns {Promise<{link: string, content: (string|null)}>} The requested
     *     URL together with the response text, or `null` on failure.
     */
    function getLinkContent(link) {
        return new Promise((resolve) => {
            const resolveEmpty = () => resolve({ link, content: null });

            try {
                GM_xmlhttpRequest({
                    method: 'GET',
                    url: link,
                    onload: (response) => {
                        if (response.status === 200) {
                            resolve({ link, content: response.responseText });
                        } else {
                            resolveEmpty();
                        }
                    },
                    // Without these handlers a failed request would leave the
                    // promise pending forever.
                    onerror: resolveEmpty,
                    onabort: resolveEmpty,
                    ontimeout: resolveEmpty,
                });
            } catch (err) {
                // The request could not even be started; treat it like a failed fetch.
                resolveEmpty();
            }
        });
    }


    // Register the scraper as a userscript menu entry. The label tells the user to
    // add class "fff" to the elements to crawl before clicking the command.
    const menuLabel = '要爬的元素加入class为fff后,点击开始爬取链接';
    GM_registerMenuCommand(menuLabel, scrapeLinks);


    /**
     * Trigger a browser download of `content` as a JSON file.
     *
     * A temporary anchor pointing at an object URL is appended to the page,
     * clicked and removed again.
     *
     * @param {string} content - File contents, stored with MIME type application/json.
     * @param {string} filename - Name suggested to the browser for the saved file.
     * @returns {void}
     */
    function downloadFile(content, filename) {
        const blob = new Blob([content], { type: 'application/json' });
        const url = URL.createObjectURL(blob);
        const a = document.createElement('a');
        a.href = url;
        a.download = filename;
        document.body.appendChild(a);
        try {
            a.click();
        } finally {
            // Clean up even if click() throws so no stray anchor stays in the page.
            document.body.removeChild(a);
            // Release the blob once the click has been dispatched. Revocation is
            // deferred to a later task so it does not run in the same turn as the click.
            setTimeout(() => URL.revokeObjectURL(url), 0);
        }
    }
})();