// ==UserScript==
// @name Eza's Universal Scraper
// @namespace https://inkbunny.net/ezalias
// @description Gather all images from any page, on command
// @license MIT
// @license Public domain / no rights reserved
// @include *
// @version 1.8
// @noframes
// @grant GM_registerMenuCommand
// ==/UserScript==
// Grab all images and image links - create simple html list of links - insert list at top of page.
// My apologies to anyone reading this; it is a hot mess. But published beats perfect.
// Aha: there IS multi-splitting, using regexes as the delimiter. E.g. "Hello awesome, world!".split(/[\s,]+/); for splitting on spaces and commas.
// Stop scraping. Use document object model.
// Try to find videos, too, like http://www.pornhub.com/view_video.php?viewkey=1098897412 - the mp4's in the source, but not linked except for subscribers
// Imgur does some stupid crap with fake loading / unloading. That'd be fine for memory's sake - but the links also disappear. DownThemAll breaks and this only lists presently-onscreen images.
// Images are still under "content" inside <meta property="og:image"> tags.
// Tempted to just split(http) and filter for URLs with image-filetype extensions.
// List Pixiv links because 'select -> open all in tabs' doesn't work anymore. (Eventually fix in Smoothener.)
// Consider setting custom colors, at least for visited links. (Done.)
// Added NewGrounds because their gallery view is sort of terrible.
// For text files: document.body.innerHTML = document.body.innerHTML.split( /[\n ]/ ).filter( s => s.match( '//' ) ).map( s => '<a href=' + s + '>' + s + '</a><br>' )
// Support MP4s.
// I'm considering getting rid of the link whitelist. Linking all image still goes first, and has purpose - but why not support all ordered links in gallery-like sites?
// This thought leads back to old considerations of scraping all my various subscription pages on multiple websites. With text links, it's tractable.
// The main obstacle, then as now, is domain origin policy. It -should- help that a script for this could run on each of those domains.
// Hide visited links? display:none does not work. Neither does visibility:hidden. Uh... huh.
// https://stackoverflow.com/questions/20074015/a-visited-img-display-none - fuck's sake. "For privacy reasons," it's broken on purpose.
// So there's probably no workaround like unlinking those links. Anything that could tell the site which other sites you've visited is a vulnerability. Shoot.
// 'Open unvisited links' would serve the same purpose.
// booru.org
// Highlight MP4s. Give them a special class or some shit.
// This occasionally gets 'script running too long' errors... and on weird sites. Like DuckDuckGo. Which is bizarre, since I don't think it runs anything at all until triggered. Even then there's no live HTML collection left spinning away. They get Array.from'd in a hurry.
// Menuless access a la Gallery Swallower: insert a DOM element in the upper-left corner, thin enough it won't interfere with sensible website design. When clicked it changes classes. Put an interval in this script to check the length of a live HTML collection containing that class. When it's nonzero, clear the interval and show_links().
// Ideally this is a button that slides out or has on-hover text indicating what the hell it does. It'll be useless on a touchscreen, but come on.
// I commented on HF and "Link all images" appeared in my comment text. The button appeared in the comment iframe and I shrugged it off. This is cause to add //@noframes, obviously, but more importantly it makes me wonder what the fuck HF's comment code is doing and whether that's a vulnerability.
// Given the appearance, consider releasing as "Eza's Scraping Notch." But then Universal Scraper is more descriptive.
/*
. direct image links from 'eza's universal scraper' are so great on sites like Mastodon. no lag. no endless loading. I want that to work on pixiv, which is all lag and loading, because they use lazy js for fucking everything.
https://i.pximg.net/c/250x250_80_a2/img-master/img/2020/05/17/13/37/33/81630978_p0_square1200.jpg
https://i.pximg.net/img-original/img/2020/05/17/13/37/33/81630978_p0.png
. so I'd have to link manga pages normally, or detect page count... but single images are trivial.
*/
// Linux Mint addition: oh hey, e621 has trivial thumbnails.
// https://static1.e621.net/data/preview/dc/07/dc07f1901445c35ab83c052493ec39d2.jpg
// https://static1.e621.net/data/dc/07/dc07f1901445c35ab83c052493ec39d2.png
// Fuck, file extension!
// https://art.ngfiles.com/medium_views/1464000/1464020_bluebreed_veronica.png?f1602728027
// https://art.ngfiles.com/images/1464000/1464020_bluebreed_veronica.png?f1602727984
// Change numbering: build forward list as numbered links / HTML strings, then reverse that list, so reversed numbers count back down.
// Entry points. There are two ways to run show_links(): the userscript manager's menu, and a small on-page button.
// The menu API is only called when it exists. An unguarded call throws a ReferenceError on managers without
// GM_registerMenuCommand, which would abort the script before the fallback button below is ever created.
if( typeof GM_registerMenuCommand === 'function' ) {
	GM_registerMenuCommand( "Link all images at top of page", show_links );
}
// Put button on page, since menu is missing on later userscript plugins.
var trigger = document.createElement( 'button' );
// The button sits mostly offscreen (left:-85px of a 90px width) so it should not interfere with sensible page layouts.
// Every property is spelled out inline because the host page's own CSS also applies to this button.
// Getting it to slide onscreen on hover would require a proper <style> element, since :hover is a pseudo-selector.
trigger.style = "position: absolute; width: 90px; height: 30px; left:-85px; top: 5px; background-color:#303020; text-align: center; display: inline-block; border:1px solid #8080A0; cursor:pointer; line-height: 20px; color:#8080A0; font-family:Arial; font-size:10px; text-decoration:none; z-index: 1000000; overflow:hidden;";
// Text is WIP. Ideally work the word "visible" in there, since it's non-obvious.
trigger.innerText = "Link all images";
trigger.title = "Link all images at top of page";
trigger.className = "ezas_unclicked_button";
// Immediate visible change, idempotent: the click only hides the button and swaps its class.
// The actual work is started by the polling interval below, which watches for that class.
trigger.onclick = function(){ this.style = 'display:none;'; this.className = 'ezas_clicked_button'; };
document.body.appendChild( trigger );
// Injecting code into the page is nontrivial - ironically because function.toString is fragile - so just look for a change in the page.
// getElementsByClassName returns a live collection, so its length becomes nonzero as soon as the button has been clicked.
var button_check = document.getElementsByClassName( 'ezas_clicked_button' );
var fake_event = setInterval( function() {
	if( button_check.length > 0 ) {
		clearInterval( fake_event ); // Fire once, then stop polling.
		show_links();
	}
}, 500 ); // 500 ms is an important threshold for action; anything slower feels unresponsive after the button disappears.
// Build a numbered list of every URL returned by get_links() and insert it at the top of the page.
// Takes no arguments and returns nothing; its only effect is the markup prepended to document.body.
// The list is wrapped in <span class='scraped'> so the link colors below apply only to the generated links.
function show_links () {
	// Escape a value for use inside the generated markup, both as a single-quoted attribute and as text.
	// URLs come from the page (including <meta content>), so quotes or angle brackets must not break the list.
	const escape_html = function( text ) {
		return String( text )
			.replace( /&/g, '&amp;' ) // Ampersands first, so the entities added below are not escaped twice.
			.replace( /</g, '&lt;' )
			.replace( />/g, '&gt;' )
			.replace( /"/g, '&quot;' )
			.replace( /'/g, '&#39;' );
	};
	const links = get_links();
	links.reverse(); // Reverse order. Should probably be an option instead of hardcoded, but this is already a hacky little thing.
	let block = "";
	block += "<style> .scraped a { color: #1BA; } .scraped a:visited { color: #A1A; } </style>";
	// MP4 links get their own class and colors; the ':has-text' selector tried previously is not valid CSS.
	block += "<style> .scraped a.mp4 { color: #BFA; } .scraped a.mp4:visited { color: #F1A; } </style>";
	block += "<span class='scraped'> ";
	for( let n = 0; n < links.length; n++ ) {
		if( n !== 0 && n % 10 === 0 ) { block += "<br>"; } // Blank line after every ten entries.
		const url = String( links[n] );
		const css_class = url.includes( '.mp4' ) ? 'universal mp4' : 'universal';
		block += "" + n + " <a class='" + css_class + "' href='" + escape_html( url ) + "'>" + escape_html( url ) + "</a> <br>\n";
	}
	block += "</span>";
	// Insert without re-parsing the rest of the page: assigning document.body.innerHTML rebuilds every
	// existing element, which drops the page's event listeners and state.
	document.body.insertAdjacentHTML( 'afterbegin', block );
}
// Collect the URLs worth listing from the current page.
// Returns an array of strings: the de-duplicated URL list, a visible separator, then the same list mirrored
// (separator included, so it appears twice in the middle).
// Order of the first half:
//   1. <a href> and <meta content> values that contain a whitelisted fragment (and not "#comments"),
//   2. every <img>, <video> and <source> src, unfiltered - because they're images / media,
//   3. <a href> values containing .jpg / .png / .gif (skipped on baraag.net),
//   4. full-size NewGrounds URLs derived from any 'medium_views' preview URL gathered so far.
// When a URL occurs more than once, only its last occurrence is kept.
function get_links() {
	// Fragments are matched as plain substrings. Passing them to String.match would treat them as regular
	// expressions, where '.' matches any character (so ".jpg" would also match "myjpg").
	const whitelist = [ ".jpg", ".jpeg", ".png", ".gif", ".mp3", ".mp4",
		"en/artworks/", // Pixiv
		"/s/", // IB
		"/view/", // FA - obviating NewGrounds, actually - and P34
		// "/", // General purpose? Ech, breaks Mastodon. Maybe sort.
		"s=view", // Gelbooru
		"/pictures/", // HF
		"/post/show/", // e296
		"/posts/", // Also e296, for pools?
		"/art/", // DeviantArt... have to exclude #comments, see below
		"/artworks/", // Pixiv
		"/picture.php", // HA
		"artstation.com/projects/", // ArtStation
		"/art/view/" ]; // NewGrounds
	const image_types = [ '.jpg', '.png', '.gif' ];
	const separator = '----- ----- ----- ----- ----- ----- -----'; // Visible separator
	const contains_any = ( url, fragments ) => fragments.some( fragment => url.includes( fragment ) );
	// Read one property from every element with the given tag name, keeping only non-empty strings.
	// This skips <video> / <source> elements without a src, and SVG <a> elements, whose href is an object.
	const collect = ( tag, property ) => Array.from( document.getElementsByTagName( tag ) )
		.map( element => element[ property ] )
		.filter( value => typeof value === 'string' && value !== '' );

	const anchors = collect( 'a', 'href' );
	// Whitelisted links go first. <meta content="url"> is included because Imgur keeps image URLs there.
	let urls = anchors.concat( collect( 'meta', 'content' ) )
		.filter( u => contains_any( u, whitelist ) ) // If any whitelisted item matches this URL, keep this URL.
		.filter( u => ! u.includes( "#comments" ) );
	// TODO: derive direct Pixiv image URLs from thumbnail URLs. Blocked because the original's file extension is unknown:
	// https://i.pximg.net/c/250x250_80_a2/img-master/img/2020/05/17/13/37/33/81630978_p0_square1200.jpg
	// https://i.pximg.net/img-original/img/2020/05/17/13/37/33/81630978_p0.png

	// Embedded media, unfiltered. <source> covers <video> tags that keep their URL in a child element.
	urls = urls.concat( collect( 'img', 'src' ), collect( 'video', 'src' ), collect( 'source', 'src' ) );
	// Bare image links. Skipped on Baraag because the resulting order is wrong there; the reason is not known.
	if( document.domain !== "baraag.net" ) {
		urls = urls.concat( anchors.filter( u => contains_any( u, image_types ) ) );
	}
	// Promote NewGrounds previews to full-size images - note, probably ignores secondary images.
	// https://art.ngfiles.com/medium_views/1464000/1464020_bluebreed_veronica.png?f1602728027
	// https://art.ngfiles.com/images/1464000/1464020_bluebreed_veronica.png?f1602727984
	// TODO: NewGrounds thumbnails (art.ngfiles.com/thumbnails/...) would need the image's alt text to rebuild the filename.
	urls = urls.concat(
		urls.filter( u => u.includes( 'art.ngfiles.com/medium_views' ) )
			.map( u => u.replace( 'medium_views', 'images' ) )
	);
	// Remove duplicates, keeping the last occurrence of each URL. Walking backwards with a Set is linear
	// and, unlike a nested loop that stops before index 0, also de-duplicates the first element.
	const seen = new Set();
	const unique = [];
	for( let n = urls.length - 1; n >= 0; n-- ) {
		if( ! seen.has( urls[n] ) ) {
			seen.add( urls[n] );
			unique.push( urls[n] );
		}
	}
	unique.reverse(); // Collected back-to-front; restore document order.
	unique.push( separator );
	// Append a mirrored copy. reverse() works in place, so it is applied to a copy of the list.
	return unique.concat( Array.from( unique ).reverse() );
}
// Earlier experiment, kept for reference: return the page's <a> elements whose href contains ".jpg".
// Returns an array of elements (not URL strings).
// The previous for...in loop also visited non-index keys of the live collection such as 'length', and its
// 'delete link' removed nothing, so the collection came back unfiltered whenever it did not throw.
function get_links3() {
	return Array.from( document.getElementsByTagName( 'a' ) )
		// href is not a string on SVG <a> elements; skip those rather than throw.
		.filter( link => typeof link.href === 'string' && link.href.indexOf( ".jpg" ) > 0 );
}
// Earlier experiment, kept for reference: return the collection of <img> elements on the page.
// The return value is whatever document.getElementsByTagName( 'img' ) yields, unchanged.
// This used to walk the <a> collection first, indexed by the number of images. That threw whenever the
// page had more images than links, and its 'delete' calls never changed what was returned, so the loop is gone.
// NOTE(review): the intent was apparently to return images plus image links (a commented-out
// 'return srcs.concat( links )' suggested as much); get_links() does that job now.
function get_links2() {
	return document.getElementsByTagName( 'img' );
}
// Earliest approach, kept for reference: scrape image URLs straight out of the page's HTML text.
// Splits document.body.innerHTML on 'href=' and then on 'src=', reads the attribute value that starts each
// piece, and keeps values containing .jpg / .jpeg / .png / .gif.
// Returns an array of URL strings, hrefs before srcs, with duplicates removed (last occurrence kept).
function get_links1 () {
	const html = document.body.innerHTML;
	const image_types = [ ".jpg", ".jpeg", ".png", ".gif" ];
	// An attribute value is either double-quoted, single-quoted, or a bare run ending at whitespace or '>'.
	const attribute_value = /^\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/;
	const links = [];
	[ 'href=', 'src=' ].forEach( marker => {
		// The first piece is the text before the first attribute, not a URL, so it is skipped.
		html.split( marker ).slice( 1 ).forEach( piece => {
			const found = piece.match( attribute_value );
			if( ! found ) { return; }
			// innerHTML serializes '&' inside attribute values as '&amp;'; undo that to get the real URL.
			const url = ( found[1] || found[2] || found[3] || '' ).replace( /&amp;/g, '&' );
			// Remove non-images.
			if( image_types.some( type => url.indexOf( type ) > 0 ) ) { links.push( url ); }
		} );
	} );
	// Remove duplicates, keeping the last occurrence of each URL (index 0 included).
	const seen = new Set();
	const unique = [];
	for( let n = links.length - 1; n >= 0; n-- ) {
		if( ! seen.has( links[n] ) ) {
			seen.add( links[n] );
			unique.push( links[n] );
		}
	}
	return unique.reverse(); // Collected back-to-front; restore original order.
}