Extract links from frame

§

Posted: 2022-08-02

Report comment

Testing site: http://konjo.sub.jp/o-kami/img/main/gij/girl/00.htm
This code only extracts links from a portion of what is seen on the page (This is the edited version of the twitter link extractor, to extract links from general sites):

// ==UserScript==
// @name         Extract any links from any site
// @namespace    any site
// @version      0.2
// @description  try to take over the world!
// @include      *
// @exclude      https://twitter.com/*
// @exclude      https://www.smwcentral.net/*
// @exclude      https://github.com/*
// @grant        none
// ==/UserScript==


(function() {
    'use strict';

    // Every URL that has already been logged. Exposed as window.allLink so the
    // collected set can be inspected from the browser console.
    const all = window.allLink = new Set();

    /**
     * Converts the value of an element's `href`/`src` property to a URL string.
     *
     * HTML elements expose these properties as strings, but SVG elements
     * (matched by the `[href]` selector as well) expose an SVGAnimatedString
     * object, on which calling `.replace()` would throw.
     *
     * @param {*} value - The raw property value read from the element.
     * @returns {string} The URL text, or '' when no URL is available.
     */
    function toUrlText(value) {
        if (typeof value === 'string') {
            return value;
        }
        if (value && typeof value.baseVal === 'string' && value.baseVal !== '') {
            try {
                return new URL(value.baseVal, document.baseURI).href;
            } catch (err) {
                // Not resolvable against the page URL: report the raw attribute text instead.
                return value.baseVal;
            }
        }
        return '';
    }

    /**
     * Extracts the target of every `url(...)` in a CSS value.
     *
     * Handles quoted and unquoted forms and multiple layers; values without
     * any `url(...)` (for example `none` or a gradient) yield an empty list.
     *
     * @param {string} cssValue - A CSS value such as `style.backgroundImage`.
     * @returns {string[]} The URLs found, in order of appearance.
     */
    function extractCssUrls(cssValue) {
        const found = [];
        const pattern = /url\(\s*(["']?)(.*?)\1\s*\)/g;
        let match = pattern.exec(cssValue);
        while (match !== null) {
            if (match[2] !== '') {
                found.push(match[2]);
            }
            match = pattern.exec(cssValue);
        }
        return found;
    }

    /**
     * Logs a URL once. The leading "h" of http/https is dropped in the logged
     * text (the stored entry in `all` keeps the full URL).
     *
     * @param {string} url - The URL to record; empty strings are ignored.
     */
    function report(url) {
        if (url === '' || all.has(url)) {
            return;
        }
        all.add(url);
        console.log(url.replace(/^http/, "ttp"));
    }

    /**
     * Scans the current document and logs every URL not seen before:
     * `href` and `src` attributes (fragment removed) and inline-style
     * background images.
     */
    function getLink() {
        for (const element of document.querySelectorAll('[href]')) { // links
            // Strip the fragment before de-duplicating so "#a" and "#b" variants log once.
            report(toUrlText(element.href).replace(/#.*$/, ""));
        }
        for (const element of document.querySelectorAll('[src]')) { // images, scripts, frames, ...
            report(toUrlText(element.src).replace(/#.*$/, ""));
        }
        for (const element of document.querySelectorAll('[style]')) { // inline background images
            const inlineStyle = element.style;
            if (!inlineStyle) {
                continue;
            }
            extractCssUrls(inlineStyle.backgroundImage).forEach(report);
        }
    }

    getLink();
    // Re-scan on scroll so content added while scrolling is picked up too.
    window.addEventListener('scroll', getLink);
})();

If you inspect the page, you'll notice that it uses a <frameset> tag and actually loads multiple HTML files together. Could someone write code based on this script that extracts links not just from the main HTML file, but also from all (and I mean ALL) of the other HTML files that were loaded?

Scripter113

§

Posted: 2022-08-02

Edited: 2022-08-02

Report comment

You'll notice that this removes the h in http/https. That is because firefox's console log and browser log truncate URLs' text when copied and/or saved as a text file, replacing the middle portion of the substring text with ellipsis, which invalidates the URL.

Scripter113

§

Posted: 2022-08-13

Report comment

Nevermind. Use window.frames:

(function() {
	'use strict';
	const all = window.allLink = new Set();
	/**
	 * Logs every not-yet-seen URL found in a document: the `href` of each
	 * <a> element, the `src` of each <img> element, and every `url(...)` in
	 * each element's inline `background-image`.
	 *
	 * Candidates are normalized and validated with FormatURL; accepted URLs
	 * are stored in the shared `all` set so each one is logged only once.
	 * The logged text drops the leading "h" of http/https.
	 *
	 * @param {Document} PageDocument - The document (main window or frame) to scan.
	 */
	function getLink(PageDocument) {
		// Validates, de-duplicates and logs a single candidate URL.
		const Report = (RawURL, StripFragment) => {
			const [Formatted, IsValid] = FormatURL(RawURL);
			if (!IsValid) {
				return;
			}
			// Remove the fragment BEFORE the duplicate check so that "page#a" and
			// "page#b" do not print the same line twice.
			const URLText = StripFragment ? Formatted.replace(/#.*$/, "") : Formatted;
			if (URLText === "" || all.has(URLText)) {
				return;
			}
			all.add(URLText);
			console.log(URLText.replace(/^http/, "ttp"));
		};

		for (const Anchor of PageDocument.getElementsByTagName('a')) { //"a href" links
			Report(Anchor.href, true);
		}
		for (const Image of PageDocument.getElementsByTagName('img')) { //Images
			Report(Image.src, true);
		}
		for (const Element of PageDocument.getElementsByTagName('*')) { //Background images
			const InlineStyle = Element.style;
			if (!InlineStyle) {
				continue; // element type without an inline style object
			}
			// A background-image value may hold zero, one or several url(...) layers
			// (or none at all, e.g. a gradient), so extract each one explicitly
			// instead of slicing fixed character positions.
			const URLPattern = /url\(\s*(["']?)(.*?)\1\s*\)/g;
			let Match = URLPattern.exec(InlineStyle.backgroundImage);
			while (Match !== null) {
				// Fragments are kept here: they can select a part of the image resource.
				Report(Match[2], false);
				Match = URLPattern.exec(InlineStyle.backgroundImage);
			}
		}
	}
	
	/**
	 * Validates a candidate URL string and gives it an explicit scheme.
	 *
	 * Rejected (second element `false`): non-strings, the empty string, the
	 * CSS keyword "none", and `javascript:` URLs (any letter case).
	 *
	 * Accepted values are trimmed and then:
	 *  - leading slashes are replaced by "https://" (so "//host/x" becomes
	 *    "https://host/x");
	 *  - values that already carry a scheme ("http:", "https:", "mailto:",
	 *    "data:", ...) are returned unchanged;
	 *  - anything else, including "host:port/..." forms, gets "https://"
	 *    prepended.
	 *
	 * @param {string} RawURL - The candidate URL text.
	 * @returns {Array} A two-element array `[URLText, IsValid]`. For rejected
	 *   input the first element is the input exactly as passed in.
	 */
	function FormatURL(RawURL) {
		// Non-strings cannot be URLs; report them as invalid instead of throwing.
		if (typeof RawURL !== 'string') {
			return [RawURL, false];
		}
		const Trimmed = RawURL.trim();
		if (Trimmed === '' || Trimmed === 'none' || /^javascript:/i.test(Trimmed)) {
			return [RawURL, false];
		}
		if (/^\/+/.test(Trimmed)) {
			return [Trimmed.replace(/^\/+/, 'https://'), true];
		}
		// A scheme is letters/digits/+/./- followed by ":". The lookahead keeps
		// "host:8080/path" from being mistaken for a scheme.
		if (/^[a-z][a-z0-9+.-]*:(?!\d+(?:[\/?#]|$))/i.test(Trimmed)) {
			return [Trimmed, true];
		}
		return ['https://' + Trimmed, true];
	}
	
	//Code that executes when the MAIN WINDOW loads the page
	//Please note that this does not reflect the loading of subwindows when you open links in a way that does not reload the main window
	//Since this executes ONCE when the main window loads.

	window.addEventListener('load',getLink.bind(null, document)); //Get links on the main window when page finishes loading
	window.addEventListener('load', (event) => {
		let CurrentDocument = document
		window.addEventListener('scroll',getLink.bind(null, CurrentDocument)); //Get links on the main window when scrolling (when page loads as you scroll; infinute scroll)
		if (window.frames.length) { //Loop through every window and extract their links too (NOTE: will not extract recursively)
			for (let i=0;i

Scripter113

§

Posted: 2022-08-13

Report comment

However, is this safe? As in, I wrote this, and I am worried that there may be a vulnerability in there. Can someone vet this?

hacker09

§

Posted: 2022-08-18

Report comment

Your whole code isn't there, but it looks good.

No one would really try to hack your script haha, especially if you don't publish it.

Greasy Fork

Extract links from frame

Post reply