大众点评评论

获取大众点评网页评论,解决动态字体加密

// ==UserScript==
// @name         大众点评评论
// @namespace    http://tampermonkey.net/
// @version      0.11
// @description  获取大众点评网页评论,解决动态字体加密
// @author       You
// @match        http://www.dianping.com/shop*
// @match        https://www.dianping.com/shop*
// @icon         https://www.google.com/s2/favicons?domain=dianping.com
// @require      https://greasyfork.org/scripts/435146-html2canvas-132/code/html2canvas132.js?version=986217
// @require      https://unpkg.com/tesseract.js@2.1.0/dist/tesseract.min.js
// @grant        GM.xmlHttpRequest
// ==/UserScript==
/* global html2canvas Tesseract */

// console.log(GM_info);
// Selector of the controls that are clicked (and then hidden) before collecting,
// so that folded reviews are expanded.
const moreBtnClass = '.fold';
// Selector of the matching collapse controls; they are only hidden, never clicked.
const lessBtnClass = '.unfold';
// Selector of the elements whose content is read as one review each.
const commentClass = '.review-words';
// Selector of the site's own "next page" link, clicked by the script's paging button.
const nextBtnClass = '.NextPage';
// HTTP endpoint that renderCmt POSTs a base64 screenshot to for OCR.
const ppocrUrl = 'https://www.paddlepaddle.org.cn/paddlehub-api/image_classification/chinese_ocr_db_crnn_mobile';

(function() {
  'use strict';
  // Shorthand for document.querySelectorAll, pre-bound to the document.
  const $ = document.querySelectorAll.bind(document);

  /**
   * Renders a comment element to a canvas with html2canvas and sends the
   * resulting PNG (base64, without the data-URL prefix) to the OCR endpoint.
   *
   * The canvas is 1.2x the element's size and shifted slightly up/left so the
   * text is captured with a margin around it.
   *
   * @param {HTMLElement} elm - element to screenshot and recognise.
   * @returns {Promise<string>} the recognised text fragments joined together.
   *   Rejects when rendering fails, when the request errors or times out, or
   *   when the response does not have the expected `result[0].data[]` shape
   *   (previously all of these left the promise pending forever).
   */
  const renderCmt = async elm => {
    const canvas = await html2canvas(elm, {
      allowTaint: true,
      scale: 1,
      useCORS: true,
      width: elm.offsetWidth * 1.2,
      height: elm.offsetHeight * 1.2,
      x: -elm.offsetWidth * 0.1,
      y: -elm.offsetHeight * 0.16
    });
    // Keep only the base64 payload of "data:image/png;base64,<payload>".
    const data = canvas.toDataURL().split(',')[1];

    // GM.xmlHttpRequest is callback based, so it is adapted to a promise here.
    return new Promise((resolve, reject) => {
      GM.xmlHttpRequest({
        method: 'POST',
        url: ppocrUrl,
        responseType: 'json',
        headers: {
          'Content-Type': 'application/json'
        },
        data: JSON.stringify({ image: data }),
        onload: response => {
          try {
            const fragments = response.response.result[0].data;
            resolve(fragments.map(r => r.text).join(''));
          } catch (err) {
            // A throw inside this callback would otherwise be lost and the
            // promise would never settle.
            reject(new Error('Unexpected OCR response', { cause: err }));
          }
        },
        onerror: () => reject(new Error('OCR request failed')),
        ontimeout: () => reject(new Error('OCR request timed out')),
        onabort: () => reject(new Error('OCR request aborted'))
      });
    });
  };

  /**
   * Runs OCR (via renderCmt) on every comment element of the page in parallel.
   *
   * Side effect: every <img> inside a comment element is removed from the DOM
   * before the element is rendered.
   *
   * @returns {Promise<string[]>} one recognised text per comment, in page
   *   order. Rejects as soon as any single recognition fails.
   */
  const getAllCommentCanvas = async () => {
    const tasks = [...$(commentClass)].map(cmt => {
      // img.remove() detaches the image from its real parent; the previous
      // cmt.removeChild(img) threw for images nested deeper than one level.
      cmt.querySelectorAll('img').forEach(img => img.remove());
      return renderCmt(cmt);
    });
    return Promise.all(tasks);
  };

  /**
   * Decodes the review texts of the current page without OCR, by resolving the
   * glyph classes the page uses in place of plain characters.
   *
   * Steps:
   *  1. Collect the s3plus.meituan.net stylesheet URLs found in the serialized
   *     page markup.
   *  2. From each stylesheet read the `.cls{background:-Xpx -Ypx;}` rules and
   *     the `[class^="prefix"] ... url(<svg>)` rules.
   *  3. From each referenced SVG read the `<text x=".." y="..">row</text>` rows.
   *  4. Resolve every class to one character: the row is selected by
   *     Y + vertical offset, the column by X / glyph width.
   *  5. Walk each comment's child nodes, replacing glyph elements by their
   *     character and keeping text nodes as they are.
   *
   * Side effect: every <img> inside a comment element is removed from the DOM.
   *
   * Files that cannot be downloaded are treated as empty, so their glyphs
   * decode to an empty string rather than aborting the whole run.
   *
   * @returns {Promise<string[]>} one trimmed text per comment, in page order.
   */
  const getResult = async () => {
    // Column pitch used to turn an X offset into a character index.
    // NOTE(review): assumes every sprite uses 14px glyphs — confirm.
    const GLYPH_WIDTH = 14;
    const documentTxt = new XMLSerializer().serializeToString(document);

    // Absolute URLs of the stylesheets referenced by the page.
    const getCssUrl = () => {
      const links = documentTxt.matchAll(/href=\"(\/\/s3plus\.meituan\.net\/v1\/.*?)\"/g);
      return [...links].map(link => 'https:' + link[1]);
    };

    // Matches of [whole, classPrefix, svgUrl] for every sprite rule in a stylesheet.
    const getSvgUrl = (content) => {
      return [...content.matchAll(/\[class\^=\"(.*?)\"\].*?url\((\/\/s3plus.meituan.net\/v1\/.*?)\)/g)];
    };

    // Downloads a text file; resolves to '' on any failure so that callers
    // never hang on a request that does not complete normally.
    const getFileViaUrl = url => {
      return new Promise(resolve => {
        const fail = () => resolve('');
        GM.xmlHttpRequest({
          method: 'GET',
          url: url,
          responseType: 'text',
          headers: {
            'Content-Type': 'text/css'
          },
          onload: response => {
            const ok = response.status === 200 && typeof response.response === 'string';
            resolve(ok ? response.response : '');
          },
          onerror: fail,
          ontimeout: fail,
          onabort: fail
        });
      });
    };

    // class name -> [x, y] background offsets (absolute values, rounded).
    const cssNameMap = new Map();
    let svgUrls = [];
    for (const cssUrl of getCssUrl()) {
      const cssContent = await getFileViaUrl(cssUrl);
      // The leading dot is escaped and the name restricted to identifier
      // characters, so a match can no longer start inside a preceding
      // `[class^=...]{...}` rule and swallow the first glyph class after it.
      const rules = cssContent.matchAll(/\.([\w-]+)\{background:-([\d.]+)px -([\d.]+)px;\}/g);
      for (const [, cls, x, y] of rules) {
        cssNameMap.set(cls, [+Number(x).toFixed(0), +Number(y).toFixed(0)]);
      }
      svgUrls = [...getSvgUrl(cssContent), ...svgUrls];
    }

    // class prefix -> { fontHeightOffset, rowByY } for each sprite SVG.
    const svgMap = new Map();
    for (const [, prefix, svgUrl] of svgUrls) {
      const svgContent = await getFileViaUrl(svgUrl);
      const rows = [...svgContent.matchAll(/<text x=\".*?\" y=\"(.*?)\">(.*?)<\/text>/mg)];
      // Distance between a class's Y offset and the `y` attribute of its row;
      // selected by the colour code that appears in the SVG.
      // NOTE(review): 23 / 15 look like empirically determined values — confirm.
      let fontHeightOffset = 0;
      if (svgContent.includes('#333')) fontHeightOffset = 23;
      if (svgContent.includes('#666')) fontHeightOffset = 15;
      // `y` attribute (as written in the SVG) -> characters of that row.
      const rowByY = new Map(rows.map(row => [row[1], row[2]]));
      svgMap.set(prefix, { fontHeightOffset, rowByY });
    }

    // class name -> decoded character ('' when it cannot be resolved).
    const prefixes = [...svgMap.keys()];
    const glyphByClass = new Map();
    for (const [cls, [locX, locY]] of cssNameMap) {
      // `[class^="prefix"]` is a starts-with selector, so match it the same way.
      const font = svgMap.get(prefixes.find(prefix => cls.startsWith(prefix)));
      const rowText = font ? font.rowByY.get(String(locY + font.fontHeightOffset)) : undefined;
      glyphByClass.set(cls, rowText?.[Math.floor(locX / GLYPH_WIDTH)] ?? '');
    }

    return [...document.querySelectorAll(commentClass)].map(cmt => {
      // img.remove() also works for images that are not direct children.
      cmt.querySelectorAll('img').forEach(img => img.remove());
      let text = '';
      for (const node of [...cmt.childNodes]) {
        const cls = node.className;
        if (!cls) {
          // Text nodes and class-less elements contribute their text as is.
          text += node.textContent;
        } else if (glyphByClass.has(cls)) {
          text += glyphByClass.get(cls);
        }
        // Elements with any other class are skipped on purpose.
      }
      return text.trim();
    });
  };

  /**
   * Shows the collected texts in a fixed, scrollable panel in the bottom-left
   * corner of the page. Every call appends a new panel to document.body.
   *
   * Each entry is inserted as plain text (textContent), never as HTML, because
   * the entries are scraped review text and may contain `<`, `&` or markup.
   *
   * @param {Array<string>} pics - texts to display, one row per entry.
   */
  const showResult = (pics) => {
    const panel = document.createElement('p');
    pics.forEach(text => {
      const row = document.createElement('div');
      row.style.marginTop = '20px';
      row.textContent = text;
      panel.appendChild(row);
    });
    Object.assign(panel.style, {
      position: 'fixed',
      width: '600px',
      height: '600px',
      left: '10px',
      bottom: '20px',
      padding: '20px',
      background: '#61ffff',
      overflow: 'auto'
    });
    document.body.appendChild(panel);
  };

  // Floating controls added to the page: one button starts the collection,
  // the other forwards a click to the site's own "next page" link.
  const btn = document.createElement('button');
  const next = document.createElement('button');

  btn.textContent = '开始采集';
  btn.style.position = 'fixed';
  btn.style.right = '20px';
  btn.style.bottom = '80px';

  next.textContent = '下一页';
  next.style.position = 'fixed';
  next.style.right = '20px';
  next.style.bottom = '120px';

  document.body.appendChild(btn);
  document.body.appendChild(next);

  btn.onclick = async () => {
    // Expand every folded review, then hide both the expand and the collapse
    // controls.
    $(moreBtnClass).forEach(b => {
      b.click();
      b.style.opacity = 0;
    });
    $(lessBtnClass).forEach(l => {
      l.style.opacity = 0;
    });
    try {
      showResult(await getResult());
    } catch (err) {
      // Without this the failure would only surface as an unhandled promise
      // rejection of the async click handler.
      console.error('Failed to collect comments', err);
    }
  };

  next.onclick = () => {
    const nextBtn = $(nextBtnClass)[0];
    if (nextBtn) nextBtn.click();
  };

})();