Avito Scraping Assistant

Assistente per navigare e scaricare dati da Avito con gestione manuale dei CAPTCHA e supporto corretto per caratteri cirillici

// ==UserScript==
// @name         Avito Scraping Assistant
// @namespace    https://danielfragomeli.com/
// @version      1.2
// @description  Assistente per navigare e scaricare dati da Avito con gestione manuale dei CAPTCHA e supporto corretto per caratteri cirillici
// @author       dan098
// @match        *://*.avito.ru/*
// @grant        GM_setValue
// @grant        GM_getValue
// @grant        GM_deleteValue
// @grant        GM_listValues
// @grant        GM_setClipboard
// @grant        GM_xmlhttpRequest
// @grant        unsafeWindow
// @license      MIT
// @require      https://cdnjs.cloudflare.com/ajax/libs/jquery/3.6.0/jquery.min.js
// @require      https://cdnjs.cloudflare.com/ajax/libs/PapaParse/5.3.2/papaparse.min.js
// ==/UserScript==

(function() {
    'use strict';

    // CSS rules for the assistant's floating control panel
    const panelStyles = `
        #avito-scraper-panel {
            position: fixed;
            bottom: 10px;
            right: 10px;
            background-color: #fff;
            border: 2px solid #0078d7;
            border-radius: 5px;
            padding: 10px;
            width: 400px;
            z-index: 10000;
            font-family: Arial, sans-serif;
            box-shadow: 0 0 10px rgba(0,0,0,0.2);
        }
        #avito-scraper-panel h3 {
            margin-top: 0;
            margin-bottom: 10px;
            color: #0078d7;
            border-bottom: 1px solid #eee;
            padding-bottom: 5px;
        }
        #avito-urls {
            width: 100%;
            height: 100px;
            margin-bottom: 10px;
            resize: vertical;
        }
        #avito-status {
            margin: 10px 0;
            padding: 5px;
            border-radius: 3px;
            background-color: #f0f0f0;
        }
        .avito-button {
            background-color: #0078d7;
            color: white;
            border: none;
            padding: 8px 12px;
            margin: 5px 5px 5px 0;
            border-radius: 3px;
            cursor: pointer;
        }
        .avito-button:hover {
            background-color: #005a9e;
        }
        .avito-button:disabled {
            background-color: #cccccc;
            cursor: not-allowed;
        }
        .progress-bar {
            height: 15px;
            background-color: #e0e0e0;
            border-radius: 5px;
            margin: 10px 0;
        }
        .progress-fill {
            height: 100%;
            background-color: #0078d7;
            border-radius: 5px;
            width: 0%;
            transition: width 0.3s;
        }
        .captcha-alert {
            color: #d61e00;
            font-weight: bold;
            margin-top: 5px;
            display: none;
        }
        .controls-row {
            display: flex;
            justify-content: space-between;
            align-items: center;
        }
        .pause-button {
            background-color: #ff9800;
        }
        .pause-button:hover {
            background-color: #e68a00;
        }
        .loading-indicator {
            color: blue;
            font-style: italic;
        }
        #current-url-container {
            margin: 10px 0;
            border: 1px solid #ddd;
            padding: 5px;
            border-radius: 3px;
            word-break: break-all;
            font-size: 12px;
        }
        #extracted-preview {
            margin: 10px 0;
            max-height: 100px;
            overflow-y: auto;
            border: 1px solid #ddd;
            padding: 5px;
            font-size: 12px;
            background-color: #f9f9f9;
        }
    `;

    // Wrap the rules in a <style> element and attach it to the page's <head>
    const styleTag = Object.assign(document.createElement('style'), {
        textContent: panelStyles,
    });
    document.head.appendChild(styleTag);

    // Markup of the control panel: URL input, progress/status area and action buttons
    const panelMarkup = `
        <h3>Avito Scraping Assistant</h3>
        <div>
            <textarea id="avito-urls" placeholder="Inserisci gli URL da visitare, uno per riga"></textarea>
            <div class="controls-row">
                <button id="load-urls" class="avito-button">Carica URLs</button>
                <button id="load-file" class="avito-button">Carica da File</button>
                <input type="file" id="url-file" style="display: none;" accept=".txt,.csv">
            </div>
        </div>
        <div id="current-url-container" style="display: none;">
            <strong>URL Corrente:</strong> <span id="current-url"></span>
        </div>
        <div class="progress-bar">
            <div class="progress-fill" id="progress-fill"></div>
        </div>
        <div id="avito-status">Stato: Pronto</div>
        <div class="captcha-alert" id="captcha-alert">⚠️ CAPTCHA rilevato! Risolvilo manualmente, poi clicca "Continua"</div>
        <div class="loading-indicator" id="loading-indicator" style="display: none;">Caricamento pagina in corso...</div>
        <div id="extracted-preview" style="display: none;"></div>
        <div>
            <button id="start-scraping" class="avito-button" disabled>Inizia</button>
            <button id="extract-data" class="avito-button" disabled>Estrai Dati</button>
            <button id="next-url" class="avito-button" disabled>Prossimo</button>
            <button id="pause-resume" class="avito-button pause-button" disabled>Pausa</button>
        </div>
        <div>
            <button id="export-csv" class="avito-button" disabled>Esporta CSV</button>
            <button id="clear-data" class="avito-button" disabled>Cancella Dati</button>
            <button id="view-data" class="avito-button" disabled>Visualizza Dati</button>
        </div>
    `;

    // Create the panel container, fill it with the markup and attach it to the page body
    const controlPanel = document.createElement('div');
    controlPanel.id = 'avito-scraper-panel';
    controlPanel.innerHTML = panelMarkup;
    document.body.appendChild(controlPanel);

    // Scraping session state: the URL queue, the position within it and the
    // records extracted so far. The state is persisted with GM_setValue /
    // GM_getValue and mirrored onto the control panel's buttons, progress bar
    // and preview area.
    class ScrapingState {
        constructor() {
            this.urls = [];
            this.currentIndex = 0;
            this.extractedData = [];
            this.isPaused = false;
            this.isLoading = false;
            this.loadState();
        }

        /**
         * Escapes the characters that are significant in HTML so that scraped
         * page text can be interpolated into markup without being parsed as tags.
         * @param {*} value - Value to escape; converted with String().
         * @returns {string} The escaped text.
         */
        static escapeHtml(value) {
            const entities = {
                '&': '&amp;',
                '<': '&lt;',
                '>': '&gt;',
                '"': '&quot;',
                "'": '&#39;',
            };
            return String(value).replace(/[&<>"']/g, (ch) => entities[ch]);
        }

        /**
         * Parses a JSON string read from storage, tolerating missing or corrupt values.
         * @param {?string} raw - Stored JSON text, or a falsy value when nothing was saved.
         * @returns {Array} The parsed array, or an empty array when the value is
         *     absent, is not valid JSON or is not an array.
         */
        static parseStoredArray(raw) {
            if (!raw) {
                return [];
            }
            try {
                const parsed = JSON.parse(raw);
                return Array.isArray(parsed) ? parsed : [];
            } catch (e) {
                // A corrupt stored value must not prevent the whole script from starting
                console.error('Stato salvato non valido, verrà ignorato:', e);
                return [];
            }
        }

        /**
         * Restores URLs, current index and extracted records from storage and
         * refreshes the panel accordingly.
         */
        loadState() {
            const savedUrls = GM_getValue('avitoUrls', null);
            const savedIndex = GM_getValue('avitoCurrentIndex', 0);
            const savedData = GM_getValue('avitoExtractedData', null);

            this.urls = ScrapingState.parseStoredArray(savedUrls)
                .filter((url) => typeof url === 'string');
            if (savedUrls) {
                document.getElementById('avito-urls').value = this.urls.join('\n');
            }

            // Keep the restored index inside the restored list
            const lastIndex = Math.max(this.urls.length - 1, 0);
            this.currentIndex = Number.isInteger(savedIndex) && savedIndex >= 0
                ? Math.min(savedIndex, lastIndex)
                : 0;

            this.extractedData = ScrapingState.parseStoredArray(savedData)
                .filter((record) => record !== null && typeof record === 'object');

            this.updateButtonStates();
            this.updateProgress();
        }

        /** Persists URLs, current index and extracted records. */
        saveState() {
            GM_setValue('avitoUrls', JSON.stringify(this.urls));
            GM_setValue('avitoCurrentIndex', this.currentIndex);
            GM_setValue('avitoExtractedData', JSON.stringify(this.extractedData));
        }

        /**
         * Replaces the URL queue with the non-blank entries of urlArray.
         * @param {string[]} urlArray - Candidate URLs; blank entries are dropped.
         */
        setUrls(urlArray) {
            this.urls = urlArray
                .map((url) => String(url).trim())
                .filter((url) => url !== '');
            // A position left over from a longer list would make getCurrentUrl() return null
            if (this.currentIndex >= this.urls.length) {
                this.currentIndex = 0;
            }
            this.saveState();
            this.updateButtonStates();
            this.updateProgress();
        }

        /**
         * @returns {?string} The URL at the current position, or null when the
         *     position is past the end of the queue.
         */
        getCurrentUrl() {
            if (this.currentIndex < this.urls.length) {
                return this.urls[this.currentIndex];
            }
            return null;
        }

        /**
         * Advances to the next URL when one exists.
         * @returns {boolean} true if the position moved, false when already on the last URL.
         */
        moveToNext() {
            if (this.currentIndex < this.urls.length - 1) {
                this.currentIndex++;
                this.saveState();
                this.updateProgress();
                return true;
            }
            return false;
        }

        /**
         * Appends one extracted record, persists it and shows its preview.
         * @param {Object} data - Record produced by the page extraction.
         */
        addExtractedData(data) {
            this.extractedData.push(data);
            this.saveState();
            this.updateDataButtons();

            // Show a preview of the record that was just stored
            this.showDataPreview(data);
        }

        /**
         * Renders a short preview (seller, city, price) of a record in the panel.
         * @param {Object} data - Record whose key fields are displayed.
         */
        showDataPreview(data) {
            const previewEl = document.getElementById('extracted-preview');
            previewEl.style.display = 'block';

            // Values come from the scraped page, so escape them before building markup
            const esc = ScrapingState.escapeHtml;
            const preview = `
                <strong>Dati estratti:</strong><br>
                Venditore: ${esc(data['seller name'])}<br>
                Città: ${esc(data['Seller City'])}<br>
                Prezzo: ${esc(data.Price)}<br>
                <small>(Dati salvati correttamente)</small>
            `;

            previewEl.innerHTML = preview;
        }

        /** Updates the progress bar, the status line and the current-URL box. */
        updateProgress() {
            const progressElement = document.getElementById('progress-fill');
            const percent = this.urls.length > 0 ? (this.currentIndex / this.urls.length) * 100 : 0;
            progressElement.style.width = `${percent}%`;

            const statusElement = document.getElementById('avito-status');
            statusElement.textContent = `Stato: ${this.currentIndex}/${this.urls.length} URL processati`;

            const currentUrlContainer = document.getElementById('current-url-container');
            const currentUrlSpan = document.getElementById('current-url');
            const currentUrl = this.getCurrentUrl();

            if (currentUrl) {
                currentUrlContainer.style.display = 'block';
                currentUrlSpan.textContent = currentUrl;
            } else {
                currentUrlContainer.style.display = 'none';
            }
        }

        /** Enables the export / clear / view buttons only while records exist. */
        updateDataButtons() {
            const hasNoData = this.extractedData.length === 0;
            for (const id of ['export-csv', 'clear-data', 'view-data']) {
                document.getElementById(id).disabled = hasNoData;
            }
        }

        /** Recomputes the enabled/disabled state of every panel button. */
        updateButtonStates() {
            const startButton = document.getElementById('start-scraping');
            const extractButton = document.getElementById('extract-data');
            const nextButton = document.getElementById('next-url');
            const pauseButton = document.getElementById('pause-resume');

            startButton.disabled = this.urls.length === 0;

            // Guard against a null URL: String.prototype.includes(null) would search for "null"
            const currentUrl = this.getCurrentUrl();
            const isCurrentPage = currentUrl !== null && window.location.href.includes(currentUrl);
            extractButton.disabled = !isCurrentPage || this.isLoading;
            nextButton.disabled = this.currentIndex >= this.urls.length - 1 || this.isLoading;
            pauseButton.disabled = this.urls.length === 0;
            this.updateDataButtons();
        }

        /**
         * Records whether a navigation is in progress and refreshes the UI.
         * @param {boolean} isLoading - true while the next page is loading.
         */
        setLoading(isLoading) {
            this.isLoading = isLoading;
            const loadingIndicator = document.getElementById('loading-indicator');
            loadingIndicator.style.display = isLoading ? 'block' : 'none';
            this.updateButtonStates();
        }

        /** Discards all extracted records and hides the preview. */
        clearData() {
            this.extractedData = [];
            this.saveState();
            this.updateDataButtons();
            document.getElementById('extracted-preview').style.display = 'none';
        }
    }

    // Scraping state instance; the constructor calls loadState(), which reads
    // any previously saved values through GM_getValue
    const state = new ScrapingState();

    /**
     * Extracts the listing fields from the current page via fixed XPaths,
     * stores the resulting record in the scraping state and updates the
     * status line and the "next" button.
     *
     * Fields whose XPath does not match keep the placeholder 'N/A'.
     *
     * @returns {Object} The record that was stored.
     */
    function extractDataFromPage() {
        const data = {
            link: window.location.href,
            'seller name': 'N/A',
            'shop link': 'N/A',
            marketplace: 'Avito',
            product: 'Jewelry',
            image: 'N/A',
            description: 'N/A',
            'Seller City': 'N/A',
            Price: 'N/A',
            'Data scaricamento': new Date().toISOString().split('T')[0],
            // NOTE(review): hard-coded month — confirm whether it should follow the download date
            month: '2025-03',
            Brand: 'Roberto Coin',
            Country: 'Russia',
            'Type of platform': 'Marketplace'
        };

        // Absolute XPaths of the fields on the listing page
        const xpaths = {
            'seller name': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[2]/div/div/div/div[3]/div[2]/div/div/div/div[1]/div/div[1]/div/div[1]/div/div/div/h3/a",
            'shop link': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[2]/div/div/div/div[3]/div[2]/div/div/div/div[1]/div/div[1]/div/div[1]/div/div/div/h3/a",
            'image': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[1]/div[2]/div[1]/div[1]/div/div/div/div/div[3]/img",
            'description': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[1]/div[2]/div[2]/div/div/div/p[1]",
            'Seller City': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/div[1]/div/p[1]/span",
            'Price': "/html/body/div[1]/div/div[3]/div[1]/div/div[2]/div[3]/div/div[2]/div/div/div/div[1]/div/div[1]/div/div/div/span/span/span[1]"
        };

        const siteOrigin = 'https://www.avito.ru';

        // Turns the anchor's href into an absolute link; a missing href stays 'N/A'
        // instead of being prefixed with the domain.
        const resolveShopLink = (href) => {
            if (!href) {
                return 'N/A';
            }
            if (href.startsWith('http')) {
                return href;
            }
            if (href.startsWith('//')) {
                return `https:${href}`;
            }
            return href.startsWith('/') ? `${siteOrigin}${href}` : `${siteOrigin}/${href}`;
        };

        // Evaluate each XPath; one failing field must not abort the others
        for (const [field, xpath] of Object.entries(xpaths)) {
            try {
                const element = document.evaluate(
                    xpath,
                    document,
                    null,
                    XPathResult.FIRST_ORDERED_NODE_TYPE,
                    null
                ).singleNodeValue;

                if (!element) {
                    continue;
                }

                if (field === 'shop link') {
                    data[field] = resolveShopLink(element.getAttribute('href'));
                } else if (field === 'image') {
                    data[field] = element.getAttribute('src') || 'N/A';
                } else {
                    data[field] = element.textContent.trim() || 'N/A';
                }
            } catch (e) {
                console.error(`Errore nell'estrazione di ${field}:`, e);
            }
        }

        state.addExtractedData(data);

        // Report success in the status line
        const statusElement = document.getElementById('avito-status');
        statusElement.textContent = `Stato: Dati estratti da ${state.currentIndex + 1}/${state.urls.length} URL`;

        // Enable "next" only while further URLs remain
        document.getElementById('next-url').disabled = state.currentIndex >= state.urls.length - 1;

        return data;
    }

    /**
     * Heuristically detects a CAPTCHA / firewall page.
     *
     * Elements and text belonging to this script's own control panel are
     * ignored: the panel contains a div with class and id "captcha-alert" and
     * the word "CAPTCHA", which would otherwise make every check positive.
     *
     * @returns {boolean} true when a CAPTCHA-like element or keyword is found
     *     in the page outside the control panel.
     */
    function detectCaptcha() {
        const ownPanel = document.getElementById('avito-scraper-panel');
        const isOwnElement = (el) => Boolean(ownPanel) && ownPanel.contains(el);

        // Simplified selector list: adapt it to the actual Avito CAPTCHA markup
        const captchaSelectors = [
            'iframe[src*="captcha"]',
            'div[class*="captcha"]',
            'div[id*="captcha"]',
            'input[name*="captcha"]',
            'img[src*="captcha"]',
            'form[action*="captcha"]',
            'div.firewall-container'  // Avito firewall page
        ];

        for (const selector of captchaSelectors) {
            const matches = Array.from(document.querySelectorAll(selector));
            if (matches.some((el) => !isOwnElement(el))) {
                return true;
            }
        }

        // Also look for CAPTCHA / block wording in the visible page text,
        // after removing the text contributed by the control panel itself
        let pageText = document.body.innerText || '';
        const panelText = ownPanel ? (ownPanel.innerText || '') : '';
        if (panelText) {
            pageText = pageText.replace(panelText, '');
        }
        pageText = pageText.toLowerCase();

        const captchaTexts = ['captcha', 'проверка', 'безопасность', 'подтвердите', 'robot', 'человек'];
        return captchaTexts.some((text) => pageText.includes(text));
    }

    // Button event handlers

    // Splits raw text into trimmed, non-empty lines (one URL per line)
    const parseUrlLines = (rawText) => rawText
        .split('\n')
        .map((line) => line.trim())
        .filter((line) => line !== '');

    // Stores a parsed URL list and enables "Inizia" only when it is non-empty
    const applyUrlList = (urlList) => {
        state.setUrls(urlList);
        document.getElementById('start-scraping').disabled = urlList.length === 0;
    };

    // Flags the state as loading and navigates the tab to the given URL
    const navigateTo = (targetUrl) => {
        state.setLoading(true);
        window.location.href = targetUrl;
    };

    // "Carica URLs": read the list typed into the textarea
    document.getElementById('load-urls').addEventListener('click', () => {
        const typedText = document.getElementById('avito-urls').value;
        applyUrlList(parseUrlLines(typedText));
    });

    // "Carica da File": forward the click to the hidden file input
    document.getElementById('load-file').addEventListener('click', () => {
        document.getElementById('url-file').click();
    });

    // File chosen: copy its text into the textarea and load it as the URL list
    document.getElementById('url-file').addEventListener('change', (changeEvent) => {
        const chosenFile = changeEvent.target.files[0];
        if (!chosenFile) {
            return;
        }
        const fileReader = new FileReader();
        fileReader.onload = (loadEvent) => {
            const fileText = loadEvent.target.result;
            document.getElementById('avito-urls').value = fileText;
            applyUrlList(parseUrlLines(fileText));
        };
        fileReader.readAsText(chosenFile);
    });

    // "Inizia": open the URL at the current position
    document.getElementById('start-scraping').addEventListener('click', () => {
        const firstUrl = state.getCurrentUrl();
        if (firstUrl) {
            navigateTo(firstUrl);
        }
    });

    // "Estrai Dati": extract from the current page and make sure the preview is visible
    document.getElementById('extract-data').addEventListener('click', () => {
        extractDataFromPage();
        document.getElementById('extracted-preview').style.display = 'block';
    });

    // "Prossimo": advance in the queue and navigate, or report completion
    document.getElementById('next-url').addEventListener('click', () => {
        if (!state.moveToNext()) {
            alert('Hai completato tutti gli URL!');
            return;
        }
        const upcomingUrl = state.getCurrentUrl();
        if (upcomingUrl) {
            navigateTo(upcomingUrl);
        }
    });

    // "Pausa" / "Riprendi": toggle the paused flag and relabel the button
    document.getElementById('pause-resume').addEventListener('click', () => {
        state.isPaused = !state.isPaused;
        const toggleButton = document.getElementById('pause-resume');
        if (state.isPaused) {
            toggleButton.textContent = 'Riprendi';
        } else {
            toggleButton.textContent = 'Pausa';
        }
    });

    /**
     * Exports the extracted records as a UTF-8 CSV download.
     *
     * The CSV is produced with PapaParse, prefixed with a BOM so that Excel and
     * similar programs recognise UTF-8 (needed for Cyrillic text), and offered
     * through a temporary blob URL that is revoked after the download starts.
     * Shows an alert and does nothing when there are no records.
     */
    function exportCSV() {
        if (state.extractedData.length === 0) {
            alert('Nessun dato da esportare.');
            return;
        }

        const csvConfig = {
            quotes: true,  // Always quote values to protect special characters
            quoteChar: '"',
            escapeChar: '"',
            delimiter: ",",
            header: true,
            newline: "\r\n",
            skipEmptyLines: false
        };

        const csv = Papa.unparse(state.extractedData, csvConfig);

        // Byte Order Mark so spreadsheet programs detect UTF-8
        const BOM = "\uFEFF";
        const csvWithBOM = BOM + csv;

        const blob = new Blob([csvWithBOM], { type: 'text/csv;charset=utf-8;' });
        const url = URL.createObjectURL(blob);

        // Trigger the download through a temporary link
        const link = document.createElement('a');
        link.href = url;
        link.setAttribute('download', `avito_data_${new Date().toISOString().split('T')[0]}.csv`);
        document.body.appendChild(link);
        link.click();
        document.body.removeChild(link);

        // Release the blob URL once the click has been dispatched; without this
        // every export keeps its blob alive for the lifetime of the page
        setTimeout(() => URL.revokeObjectURL(url), 0);
    }

    // "Esporta CSV": download the extracted records
    document.getElementById('export-csv').addEventListener('click', exportCSV);

    // "Cancella Dati": discard the extracted records after confirmation
    document.getElementById('clear-data').addEventListener('click', () => {
        if (confirm('Sei sicuro di voler cancellare tutti i dati estratti?')) {
            state.clearData();
            const statusElement = document.getElementById('avito-status');
            statusElement.textContent = 'Stato: Dati cancellati';
        }
    });

    // "Visualizza Dati": show the extracted records as a table in a popup window
    document.getElementById('view-data').addEventListener('click', () => {
        if (state.extractedData.length === 0) {
            alert('Nessun dato da visualizzare.');
            return;
        }

        // Record values are scraped page text: escape them before writing markup
        const htmlEntities = {
            '&': '&amp;',
            '<': '&lt;',
            '>': '&gt;',
            '"': '&quot;',
            "'": '&#39;',
        };
        const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => htmlEntities[ch]);

        // window.open returns null when the browser blocks the popup
        const dataWindow = window.open('', 'Dati Estratti', 'width=800,height=600');
        if (!dataWindow) {
            alert('Impossibile aprire la finestra dei dati: consenti i popup per questo sito.');
            return;
        }

        // Columns come from the first record; each row is read by column name
        const columns = Object.keys(state.extractedData[0]);
        const headerCells = columns.map((key) => `<th>${escapeHtml(key)}</th>`).join('');
        const bodyRows = state.extractedData.map((record) => {
            const cells = columns
                .map((key) => `<td>${escapeHtml(record[key] === undefined ? '' : record[key])}</td>`)
                .join('');
            return `
                                <tr>
                                    ${cells}
                                </tr>
                            `;
        }).join('');

        dataWindow.document.write(`
            <html>
            <head>
                <title>Dati Estratti da Avito</title>
                <meta charset="UTF-8">
                <style>
                    body { font-family: Arial, sans-serif; margin: 20px; }
                    table { border-collapse: collapse; width: 100%; }
                    th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
                    th { background-color: #f2f2f2; }
                    tr:nth-child(even) { background-color: #f9f9f9; }
                    .container { max-width: 100%; overflow-x: auto; }
                </style>
            </head>
            <body>
                <h2>Dati Estratti da Avito (${state.extractedData.length} record)</h2>
                <div class="container">
                    <table>
                        <thead>
                            <tr>
                                ${headerCells}
                            </tr>
                        </thead>
                        <tbody>
                            ${bodyRows}
                        </tbody>
                    </table>
                </div>
            </body>
            </html>
        `);
        // Finish the document so the popup stops loading and a later click
        // replaces the table instead of appending to the open stream
        dataWindow.document.close();
    });

    /**
     * Periodic check of the page: toggles the CAPTCHA alert and, once the page
     * has finished loading without a CAPTCHA, clears the loading flag and
     * enables extraction when the page is the URL currently being processed.
     */
    function checkPageStatus() {
        const alertBox = document.getElementById('captcha-alert');
        const captchaPresent = detectCaptcha();
        alertBox.style.display = captchaPresent ? 'block' : 'none';

        if (captchaPresent) {
            // The user has to solve it by hand, so the page is no longer "loading"
            state.setLoading(false);
            return;
        }

        // Nothing more to do until the document has fully loaded
        if (document.readyState !== 'complete') {
            return;
        }

        state.setLoading(false);

        // Enable extraction only on the page matching the queue's current URL
        const expectedUrl = state.getCurrentUrl();
        const onExpectedPage = Boolean(expectedUrl) && window.location.href.includes(expectedUrl);
        if (onExpectedPage) {
            document.getElementById('extract-data').disabled = false;
        }
    }

    // Runs shortly after the window's load event: clears the loading flag,
    // refreshes the buttons and enables extraction on the page that matches
    // the queue's current URL
    const refreshAfterLoad = () => {
        state.setLoading(false);
        state.updateButtonStates();

        const targetUrl = state.getCurrentUrl();
        if (!targetUrl || !window.location.href.includes(targetUrl)) {
            return;
        }
        document.getElementById('extract-data').disabled = false;
    };

    // Check the page status once per second
    setInterval(checkPageStatus, 1000);

    // After the page has fully loaded, wait a little so late elements are in place
    window.addEventListener('load', () => {
        setTimeout(refreshAfterLoad, 1500);
    });

    console.log('Avito Scraping Assistant avviato con supporto UTF-8');
})();