Greasy Fork is available in English.

bufferEncoding

guess the encoding from buffer

Este script não deve ser instalado diretamente. Ele é uma biblioteca para outros scripts, que devem incluí-la por meio da diretiva de metadados // @require https://update.greasyfork.org/scripts/462180/1163789/bufferEncoding.js

  1. // ==UserScript==
  2. // @name bufferEncoding
  3. // @namespace http://tampermonkey.net/
  4. // @version 0.1
  5. // @description guess the encoding from buffer
  6. // @author You
  7. // @include *
  8. // ==/UserScript==
  9. (function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.languageEncoding = f()}})(function(){var define,module,exports;return (function(){function r(e,n,t){function o(i,f){if(!n[i]){if(!e[i]){var c="function"==typeof require&&require;if(!f&&c)return c(i,!0);if(u)return u(i,!0);var a=new Error("Cannot find module '"+i+"'");throw a.code="MODULE_NOT_FOUND",a}var p=n[i]={exports:{}};e[i][0].call(p.exports,function(r){var n=e[i][1][r];return o(n||r)},p,p.exports,r,e,n,t)}return n[i].exports}for(var u="function"==typeof require&&require,i=0;i<t.length;i++)o(t[i]);return o}return r})()({1:[function(require,module,exports){
  10. const byteOrderMarks = require("../config/byteOrderMarkObject.js");
  11.  
  12. module.exports = (uInt8Start) => {
  13. for (const element of byteOrderMarks) {
  14. if (element.regex.test(uInt8Start)) return element.encoding;
  15. }
  16.  
  17. return null;
  18. };
  19.  
  20. },{"../config/byteOrderMarkObject.js":6}],2:[function(require,module,exports){
  21. module.exports = (content) => {
  22. for (let b = 0; b < content.length; b++) {
  23. // If ? is encountered it's definitely not utf8!
  24. if (content[b] === "�") {
  25. return false;
  26. }
  27. }
  28. return true;
  29. }
  30. },{}],3:[function(require,module,exports){
  31. const countAllMatches = require("./processing-content/countAllMatches.js");
  32. const calculateConfidenceScore = require("./processing-content/calculateConfidenceScore.js");
  33. const byteOrderMarkObject = require("../config/byteOrderMarkObject.js");
  34.  
  35. module.exports = (data, fileInfo) => {
  36. data.languageArr = countAllMatches(data, fileInfo.encoding);
  37.  
  38. fileInfo.language = data.languageArr.reduce((acc, val) =>
  39. acc.count > val.count ? acc : val
  40. ).name;
  41.  
  42. // "pos" gives us the position in the language array that has the most matches
  43. data.pos = data.languageArr.findIndex(
  44. (elem) => elem.name === fileInfo.language
  45. );
  46.  
  47. // Determine the encoding
  48. if (!fileInfo.encoding) {
  49. fileInfo.encoding = data.languageArr[data.pos].encoding;
  50. }
  51.  
  52. const calculations = calculateConfidenceScore(data, fileInfo);
  53.  
  54. if (fileInfo.confidence.encoding) {
  55. fileInfo.confidence.language = calculations;
  56. } else {
  57. fileInfo.confidence.encoding = calculations;
  58. fileInfo.confidence.language = calculations;
  59. }
  60.  
  61. // Edge case, when no matches were found
  62. if (!data.languageArr[data.pos].count) {
  63. fileInfo.language = null;
  64. fileInfo.confidence.language = null;
  65.  
  66. if (!byteOrderMarkObject.some(obj => obj.encoding === fileInfo.encoding)) {
  67. fileInfo.encoding = null;
  68. fileInfo.confidence.encoding = null;
  69. }
  70. }
  71.  
  72. return fileInfo;
  73. };
  74.  
  75. },{"../config/byteOrderMarkObject.js":6,"./processing-content/calculateConfidenceScore.js":4,"./processing-content/countAllMatches.js":5}],4:[function(require,module,exports){
  76. module.exports = (data, fileInfo) => {
  77. const charRegex = new RegExp(
  78. /\d|\n|\s|\-|\.|\,|\:|\;|\?|\!|\<|\>|\[|\]|\{|\}|\&|\=|\|/,
  79. "g"
  80. );
  81. const totalCharacters = data.content.replace(charRegex, "").length;
  82. const langArr = data.languageArr;
  83. const pos = data.pos;
  84.  
  85. const secondLanguage = langArr.reduce((acc, val) => {
  86. if (acc.name === fileInfo.language) return val;
  87. if (val.name === fileInfo.language) return acc;
  88.  
  89. return acc.count >= val.count ? acc : val;
  90. });
  91.  
  92. const languageRatio =
  93. langArr[pos].count / (secondLanguage.count + langArr[pos].count);
  94. const characterWordRatio = langArr[pos].count / totalCharacters;
  95.  
  96. let lowerLimit = null;
  97. let upperLimit = null;
  98. const multiplier = 0.8;
  99.  
  100. if (fileInfo.encoding === "UTF-8" || fileInfo.encoding === "UTF-16LE") {
  101. lowerLimit = langArr[pos].utfFrequency
  102. ? langArr[pos].utfFrequency.low * multiplier
  103. : null;
  104. upperLimit = langArr[pos].utfFrequency
  105. ? (langArr[pos].utfFrequency.low + langArr[pos].utfFrequency.high) / 2
  106. : null;
  107. } else {
  108. lowerLimit = langArr[pos].isoFrequency
  109. ? langArr[pos].isoFrequency.low * multiplier
  110. : null;
  111. upperLimit = langArr[pos].isoFrequency
  112. ? (langArr[pos].isoFrequency.low + langArr[pos].isoFrequency.high) / 2
  113. : null;
  114. }
  115.  
  116. let confidenceScore;
  117.  
  118. if (!lowerLimit || !upperLimit) {
  119. confidenceScore = null;
  120. } else if (characterWordRatio >= upperLimit) {
  121. confidenceScore = 1;
  122. } else if (characterWordRatio > lowerLimit) {
  123. const range = upperLimit - lowerLimit;
  124. const surplus = characterWordRatio - lowerLimit;
  125. const confidenceRaisePercentage = surplus / range;
  126. const confidenceRaise = (1 - languageRatio) * confidenceRaisePercentage;
  127. confidenceScore = Number((languageRatio + confidenceRaise).toFixed(2));
  128. } else {
  129. confidenceScore = Number(
  130. (languageRatio * (characterWordRatio / lowerLimit)).toFixed(2)
  131. );
  132. }
  133.  
  134. return confidenceScore;
  135. };
  136.  
  137. },{}],5:[function(require,module,exports){
  138. const languageArr = require("../../config/languageObject.js");
  139.  
  140. module.exports = (data, encoding) => {
  141. const newLanguageArr = [];
  142.  
  143. // Cloning the language array and making sure that "count" has no reference to "languageArr"!
  144. languageArr.forEach((obj) => {
  145. const updatedLangObj = {};
  146. Object.keys(obj).forEach((key) => {
  147. if (key !== "count") {
  148. updatedLangObj[key] = obj[key];
  149. } else {
  150. updatedLangObj.count = 0;
  151. }
  152. });
  153. newLanguageArr.push(updatedLangObj);
  154. });
  155.  
  156. const regex = encoding ? "utfRegex" : "isoRegex";
  157.  
  158. // Populating the count property of the language array
  159. newLanguageArr.forEach((lang) => {
  160. if (lang[regex]) {
  161. const matches = data.content.match(lang[regex]);
  162.  
  163. if (matches) lang.count = matches.length;
  164. }
  165. });
  166.  
  167. return newLanguageArr;
  168. };
  169.  
  170. },{"../../config/languageObject.js":7}],6:[function(require,module,exports){
  171. module.exports = [
  172. {
  173. encoding: "UTF-EBCDIC",
  174. regex: new RegExp("221 115 102 115"),
  175. },
  176. {
  177. encoding: "GB-18030",
  178. regex: new RegExp("132 49 149 51"),
  179. },
  180. {
  181. encoding: "UTF-32LE",
  182. regex: new RegExp("255 254 0 0"),
  183. },
  184. {
  185. encoding: "UTF-32BE",
  186. regex: new RegExp("0 0 254 255"),
  187. },
  188. {
  189. encoding: "UTF-8",
  190. regex: new RegExp("239 187 191"),
  191. },
  192. {
  193. encoding: "UTF-7",
  194. regex: new RegExp("43 47 118"),
  195. },
  196. {
  197. encoding: "UTF-1",
  198. regex: new RegExp("247 100 76"),
  199. },
  200. {
  201. encoding: "SCSU",
  202. regex: new RegExp("14 254 255"),
  203. },
  204. {
  205. encoding: "BOCU-1",
  206. regex: new RegExp("251 238 40"),
  207. },
  208. {
  209. encoding: "UTF-16BE",
  210. regex: new RegExp("254 255"),
  211. },
  212. {
  213. encoding: "UTF-16LE",
  214. regex: new RegExp("255 254"),
  215. },
  216. ];
  217.  
  218. },{}],7:[function(require,module,exports){
  219. const flag = "gi";
  220.  
  221. const sharedRegex = {
  222. czech: new RegExp(/jsem|jsi/, flag),
  223. hungarian: new RegExp(/\snem\s/, flag),
  224. slovak: new RegExp(/poriadku|myslím|\ssme\s/, flag),
  225. slovenian: new RegExp(/\skaj\s|lahko|zdaj/, flag),
  226. albanian: new RegExp(/nuk/, flag),
  227. english: new RegExp(/ the /, flag),
  228. french: new RegExp(/c'est/, flag),
  229. portuguese: new RegExp(/ não /, flag),
  230. spanish: new RegExp(/estaba|\smuy\s|siempre|ahora/, flag),
  231. german: new RegExp(/\sdas\s/, flag),
  232. italian: new RegExp(/\sche\s/, flag),
  233. danish: new RegExp(/hvad|noget/, flag),
  234. norwegian: new RegExp(/deg/, flag),
  235. swedish: new RegExp(/ jag /, flag),
  236. dutch: new RegExp(/ het /, flag),
  237. finnish: new RegExp(/hän/, flag),
  238. "serbo-croatian": new RegExp(/ sam | kako /, flag),
  239. estonian: new RegExp(/\sseda\s|\spole\s|midagi/, flag),
  240. icelandic: new RegExp(/Það/, flag),
  241. "malay-indonesian": new RegExp(/tidak/, flag),
  242. turkish: new RegExp(/ bir /, flag),
  243. lithuanian: new RegExp(/taip|\stai\s/, flag),
  244. bengali: new RegExp(/এটা/, flag),
  245. hindi: new RegExp(/हैं/, flag),
  246. urdu: new RegExp(/ایک/, flag),
  247. vietnamese: new RegExp(/ không /, flag)
  248. };
  249.  
  250. const sharedFrequency = {
  251. polish: { low: 0.004355, high: 0.005102 },
  252. czech: { low: 0.004433, high: 0.007324 },
  253. hungarian: { low: 0.004994, high: 0.005183 },
  254. romanian: { low: 0.003319, high: 0.004190 },
  255. slovak: { low: 0.001736, high: 0.002557 },
  256. slovenian: { low: 0.004111, high: 0.004959 },
  257. albanian: { low: 0.003773, high: 0.007313 },
  258. ukrainian: { low: 0.002933, high: 0.005389 },
  259. english: { low: 0.004679, high: 0.007580 },
  260. french: { low: 0.003016, high: 0.004825 },
  261. portuguese: { low: 0.003406, high: 0.005032 },
  262. spanish: { low: 0.002348, high: 0.002881 },
  263. german: { low: 0.004044, high: 0.004391 },
  264. italian: { low: 0.003889, high: 0.005175 },
  265. danish: { low: 0.003630, high: 0.004189 },
  266. norwegian: { low: 0.002410, high: 0.003918 },
  267. swedish: { low: 0.004916, high: 0.007221 },
  268. dutch: { low: 0.003501, high: 0.004150 },
  269. finnish: { low: 0.003308, high: 0.005135 },
  270. "serbo-croatian": { low: 0.002568, high: 0.005182 },
  271. estonian: { low: 0.002892, high: 0.003963 },
  272. icelandic: { low: 0.004366, high: 0.004366 },
  273. "malay-indonesian": { low: 0.002825, high: 0.003932 },
  274. greek: { low: 0.003440, high: 0.004862 },
  275. turkish: { low: 0.002915, high: 0.004588 },
  276. hebrew: { low: 0.003663, high: 0.004666 },
  277. lithuanian: { low: 0.003277, high: 0.003768 },
  278. bengali: { low: 0.003155, high: 0.005236 },
  279. hindi: { low: 0.004159, high: 0.006478 },
  280. urdu: { low: 0.004118, high: 0.005851 },
  281. vietnamese: { low: 0.003387, high: 0.005191 }
  282. };
  283.  
  284. module.exports = [
  285. {
  286. name: "polish",
  287. count: 0,
  288. utfRegex: new RegExp(/się/, flag),
  289. isoRegex: new RegExp(/siê/, flag),
  290. encoding: "CP1250",
  291. utfFrequency: sharedFrequency.polish,
  292. isoFrequency: sharedFrequency.polish
  293. },
  294. {
  295. name: "czech",
  296. count: 0,
  297. utfRegex: sharedRegex.czech,
  298. isoRegex: sharedRegex.czech,
  299. encoding: "CP1250",
  300. utfFrequency: sharedFrequency.czech,
  301. isoFrequency: sharedFrequency.czech
  302. },
  303. {
  304. name: "hungarian",
  305. count: 0,
  306. utfRegex: sharedRegex.hungarian,
  307. isoRegex: sharedRegex.hungarian,
  308. encoding: "CP1250",
  309. utfFrequency: sharedFrequency.hungarian,
  310. isoFrequency: sharedFrequency.hungarian
  311. },
  312. {
  313. name: "romanian",
  314. count: 0,
  315. utfRegex: new RegExp(/sunt|eşti/, flag),
  316. isoRegex: new RegExp(/sunt|eºti/, flag),
  317. encoding: "CP1250",
  318. utfFrequency: sharedFrequency.romanian,
  319. isoFrequency: sharedFrequency.romanian
  320. },
  321. {
  322. name: "slovak",
  323. count: 0,
  324. utfRegex: sharedRegex.slovak,
  325. isoRegex: sharedRegex.slovak,
  326. encoding: "CP1250",
  327. utfFrequency: sharedFrequency.slovak,
  328. isoFrequency: sharedFrequency.slovak
  329. },
  330. {
  331. name: "slovenian",
  332. count: 0,
  333. utfRegex: sharedRegex.slovenian,
  334. isoRegex: sharedRegex.slovenian,
  335. encoding: "CP1250",
  336. utfFrequency: sharedFrequency.slovenian,
  337. isoFrequency: sharedFrequency.slovenian
  338. },
  339. {
  340. name: "albanian",
  341. count: 0,
  342. utfRegex: sharedRegex.albanian,
  343. isoRegex: sharedRegex.albanian,
  344. encoding: "CP1250",
  345. utfFrequency: sharedFrequency.albanian,
  346. isoFrequency: sharedFrequency.albanian
  347. },
  348. {
  349. name: "russian",
  350. count: 0,
  351. utfRegex: new RegExp(/что/, flag),
  352. isoRegex: new RegExp(/÷òî/, flag),
  353. encoding: "CP1251",
  354. utfFrequency: { low: 0.004965, high: 0.005341 },
  355. isoFrequency: { low: 0.003884, high: 0.003986 }
  356. },
  357. {
  358. name: "ukrainian",
  359. count: 0,
  360. utfRegex: new RegExp(/він|але/, flag),
  361. isoRegex: new RegExp(/â³í|àëå/, flag),
  362. encoding: "CP1251",
  363. utfFrequency: sharedFrequency.ukrainian,
  364. isoFrequency: sharedFrequency.ukrainian
  365. },
  366. {
  367. name: "bulgarian",
  368. count: 0,
  369. utfRegex: new RegExp(/това|какво/, flag),
  370. isoRegex: new RegExp(/òîâà|äîáðå|êaêâo/, flag),
  371. encoding: "CP1251",
  372. utfFrequency: { low: 0.005225, high: 0.005628 },
  373. isoFrequency: { low: 0.002767, high: 0.004951 }
  374. },
  375. {
  376. name: "english",
  377. count: 0,
  378. utfRegex: sharedRegex.english,
  379. isoRegex: sharedRegex.english,
  380. encoding: "CP1252",
  381. utfFrequency: sharedFrequency.english,
  382. isoFrequency: sharedFrequency.english
  383. },
  384. {
  385. name: "french",
  386. count: 0,
  387. utfRegex: sharedRegex.french,
  388. isoRegex: sharedRegex.french,
  389. encoding: "CP1252",
  390. utfFrequency: sharedFrequency.french,
  391. isoFrequency: sharedFrequency.french
  392. },
  393. {
  394. name: "portuguese",
  395. count: 0,
  396. utfRegex: sharedRegex.portuguese,
  397. isoRegex: sharedRegex.portuguese,
  398. encoding: "CP1252",
  399. utfFrequency: sharedFrequency.portuguese,
  400. isoFrequency: sharedFrequency.portuguese
  401. },
  402. {
  403. name: "spanish",
  404. count: 0,
  405. utfRegex: sharedRegex.spanish,
  406. isoRegex: sharedRegex.spanish,
  407. encoding: "CP1252",
  408. utfFrequency: sharedFrequency.spanish,
  409. isoFrequency: sharedFrequency.spanish
  410. },
  411. {
  412. name: "german",
  413. count: 0,
  414. utfRegex: sharedRegex.german,
  415. isoRegex: sharedRegex.german,
  416. encoding: "CP1252",
  417. utfFrequency: sharedFrequency.german,
  418. isoFrequency: sharedFrequency.german
  419. },
  420. {
  421. name: "italian",
  422. count: 0,
  423. utfRegex: sharedRegex.italian,
  424. isoRegex: sharedRegex.italian,
  425. encoding: "CP1252",
  426. utfFrequency: sharedFrequency.italian,
  427. isoFrequency: sharedFrequency.italian
  428. },
  429. {
  430. name: "danish",
  431. count: 0,
  432. utfRegex: sharedRegex.danish,
  433. isoRegex: sharedRegex.danish,
  434. encoding: "CP1252",
  435. utfFrequency: sharedFrequency.danish,
  436. isoFrequency: sharedFrequency.danish
  437. },
  438. {
  439. name: "norwegian",
  440. count: 0,
  441. utfRegex: sharedRegex.norwegian,
  442. isoRegex: sharedRegex.norwegian,
  443. encoding: "CP1252",
  444. utfFrequency: sharedFrequency.norwegian,
  445. isoFrequency: sharedFrequency.norwegian
  446. },
  447. {
  448. name: "swedish",
  449. count: 0,
  450. utfRegex: sharedRegex.swedish,
  451. isoRegex: sharedRegex.swedish,
  452. encoding: "CP1252",
  453. utfFrequency: sharedFrequency.swedish,
  454. isoFrequency: sharedFrequency.swedish
  455. },
  456. {
  457. name: "dutch",
  458. count: 0,
  459. utfRegex: sharedRegex.dutch,
  460. isoRegex: sharedRegex.dutch,
  461. encoding: "CP1252",
  462. utfFrequency: sharedFrequency.dutch,
  463. isoFrequency: sharedFrequency.dutch
  464. },
  465. {
  466. name: "finnish",
  467. count: 0,
  468. utfRegex: sharedRegex.finnish,
  469. isoRegex: sharedRegex.finnish,
  470. encoding: "CP1252",
  471. utfFrequency: sharedFrequency.finnish,
  472. isoFrequency: sharedFrequency.finnish
  473. },
  474. {
  475. name: "serbo-croatian",
  476. count: 0,
  477. utfRegex: sharedRegex["serbo-croatian"],
  478. isoRegex: sharedRegex["serbo-croatian"],
  479. encoding: "CP1252",
  480. utfFrequency: sharedFrequency["serbo-croatian"],
  481. isoFrequency: sharedFrequency["serbo-croatian"]
  482. },
  483. {
  484. name: "estonian",
  485. count: 0,
  486. utfRegex: sharedRegex.estonian,
  487. isoRegex: sharedRegex.estonian,
  488. encoding: "CP1252",
  489. utfFrequency: sharedFrequency.estonian,
  490. isoFrequency: sharedFrequency.estonian
  491. },
  492. {
  493. name: "icelandic",
  494. count: 0,
  495. utfRegex: sharedRegex.icelandic,
  496. isoRegex: sharedRegex.icelandic,
  497. encoding: "CP1252",
  498. utfFrequency: sharedFrequency.icelandic,
  499. isoFrequency: sharedFrequency.icelandic
  500. },
  501. {
  502. name: "malay-indonesian",
  503. count: 0,
  504. utfRegex: sharedRegex["malay-indonesian"],
  505. isoRegex: sharedRegex["malay-indonesian"],
  506. encoding: "CP1252",
  507. utfFrequency: sharedFrequency["malay-indonesian"],
  508. isoFrequency: sharedFrequency["malay-indonesian"]
  509. },
  510. {
  511. name: "greek",
  512. count: 0,
  513. utfRegex: new RegExp(/είναι/, flag),
  514. isoRegex: new RegExp(/åßíáé/, flag),
  515. encoding: "CP1253",
  516. utfFrequency: sharedFrequency.greek,
  517. isoFrequency: sharedFrequency.greek
  518. },
  519. {
  520. name: "turkish",
  521. count: 0,
  522. utfRegex: sharedRegex.turkish,
  523. isoRegex: sharedRegex.turkish,
  524. encoding: "CP1254",
  525. utfFrequency: sharedFrequency.turkish,
  526. isoFrequency: sharedFrequency.turkish
  527. },
  528. {
  529. name: "hebrew",
  530. count: 0,
  531. utfRegex: new RegExp(/אתה/, flag),
  532. isoRegex: new RegExp(/àúä/, flag),
  533. encoding: "CP1255",
  534. utfFrequency: sharedFrequency.hebrew,
  535. isoFrequency: sharedFrequency.hebrew
  536. },
  537. {
  538. name: "arabic",
  539. count: 0,
  540. utfRegex: new RegExp(/هذا/, flag),
  541. isoRegex: new RegExp(/åðç/, flag),
  542. encoding: "CP1256",
  543. utfFrequency: { low: 0.003522, high: 0.004348 },
  544. isoFrequency: { low: 0.003773, high: 0.005559 }
  545. },
  546. {
  547. name: "farsi-persian",
  548. count: 0,
  549. utfRegex: new RegExp(/اون/, flag),
  550. isoRegex: new RegExp(/çíä/, flag),
  551. encoding: "CP1256",
  552. utfFrequency: { low: 0.002761, high: 0.004856 },
  553. isoFrequency: { low: 0.003010, high: 0.006646 }
  554. },
  555. {
  556. name: "lithuanian",
  557. count: 0,
  558. utfRegex: sharedRegex.lithuanian,
  559. isoRegex: sharedRegex.lithuanian,
  560. encoding: "CP1257",
  561. utfFrequency: sharedFrequency.lithuanian,
  562. isoFrequency: sharedFrequency.lithuanian
  563. },
  564. {
  565. name: "chinese-simplified",
  566. count: 0,
  567. utfRegex: new RegExp(/么/, flag),
  568. isoRegex: new RegExp(/´ó|¶¯|Å®/, flag),
  569. encoding: "GB18030",
  570. utfFrequency: { low: 0.009567, high: 0.011502 },
  571. isoFrequency: { low: 0.003137, high: 0.005009 }
  572. },
  573. {
  574. name: "chinese-traditional",
  575. count: 0,
  576. utfRegex: new RegExp(/們/, flag),
  577. isoRegex: new RegExp(/¦b/, flag),
  578. encoding: "BIG5",
  579. utfFrequency: { low: 0.012484, high: 0.014964 },
  580. isoFrequency: { low: 0.005063, high: 0.005822 }
  581. },
  582. {
  583. name: "japanese",
  584. count: 0,
  585. utfRegex: new RegExp(/ど/, flag),
  586. isoRegex: new RegExp(/‚»|‚Á‚Ä/, flag),
  587. encoding: "Shift-JIS",
  588. utfFrequency: { low: 0.004257, high: 0.006585 },
  589. isoFrequency: { low: 0.004286, high: 0.004653 }
  590. },
  591. {
  592. name: "korean",
  593. count: 0,
  594. utfRegex: new RegExp(/도/, flag),
  595. isoRegex: new RegExp(/àö¾î|å¾ß|¡¼­/, flag),
  596. encoding: "EUC-KR",
  597. utfFrequency: { low: 0.010910, high: 0.013670 },
  598. isoFrequency: { low: 0.004118, high: 0.004961 }
  599. },
  600. {
  601. name: "thai",
  602. count: 0,
  603. utfRegex: new RegExp(/แฮร์รี่|พอตเตอร์/, flag),
  604. isoRegex: new RegExp(/áîãìãõè|¾íµàµíãì­/, flag),
  605. encoding: "TIS-620",
  606. utfFrequency: { low: 0.003194, high: 0.003468 },
  607. isoFrequency: { low: 0.002091, high: 0.002303 }
  608. },
  609. // The following languages don't seem to have their own encoding
  610. // Subtitle files in these languages seem to almost exclusively use UTF encoding.
  611. {
  612. name: "bengali",
  613. count: 0,
  614. utfRegex: sharedRegex.bengali,
  615. isoRegex: sharedRegex.bengali,
  616. utfFrequency: sharedFrequency.bengali,
  617. isoFrequency: sharedFrequency.bengali
  618. },
  619. {
  620. name: "hindi",
  621. count: 0,
  622. utfRegex: sharedRegex.hindi,
  623. isoRegex: sharedRegex.hindi,
  624. utfFrequency: sharedFrequency.hindi,
  625. isoFrequency: sharedFrequency.hindi
  626. },
  627. {
  628. name: "urdu",
  629. count: 0,
  630. utfRegex: sharedRegex.urdu,
  631. isoRegex: sharedRegex.urdu,
  632. utfFrequency: sharedFrequency.urdu,
  633. isoFrequency: sharedFrequency.urdu
  634. },
  635. {
  636. name: "vietnamese",
  637. count: 0,
  638. utfRegex: sharedRegex.vietnamese,
  639. isoRegex: sharedRegex.vietnamese,
  640. utfFrequency: sharedFrequency.vietnamese,
  641. isoFrequency: sharedFrequency.vietnamese
  642. },
  643. ];
  644. },{}],8:[function(require,module,exports){
  645. const checkUTF = require("./components/checkUTF.js");
  646. const processContent = require("./components/processContent.js");
  647. const checkByteOrderMark = require("./components/checkByteOrderMark.js");
  648.  
  649. module.exports = (buffer) => {
  650. return new Promise((resolve, reject) => {
  651. const bufferInfo = {
  652. encoding: null,
  653. language: null,
  654. confidence: {
  655. encoding: null,
  656. language: null,
  657. },
  658. };
  659. const data = {};
  660.  
  661. // Check the byte order mark!
  662. const byteOrderMarkBuffer = new FileReader();
  663.  
  664. byteOrderMarkBuffer.onload = () => {
  665. const uInt8String = new Uint8Array(byteOrderMarkBuffer.result).slice(0, 4).join(" ");
  666. const byteOrderMark = checkByteOrderMark(uInt8String);
  667.  
  668. if (byteOrderMark) {
  669. bufferInfo.encoding = byteOrderMark;
  670. bufferInfo.confidence.encoding = 1;
  671.  
  672. const byteOrderMarkReader = new FileReader();
  673.  
  674. byteOrderMarkReader.onload = () => {
  675. data.content = byteOrderMarkReader.result;
  676. resolve(processContent(data, bufferInfo));
  677. };
  678.  
  679. byteOrderMarkReader.onerror = (err) => {
  680. reject(err);
  681. };
  682.  
  683. byteOrderMarkReader.readAsArrayBuffer(buffer, bufferInfo.encoding);
  684. } else {
  685. // Read with UTF-8 first, then with ISO-8859-1
  686. const utfReader = new FileReader();
  687.  
  688. utfReader.onload = () => {
  689. const utfContent = utfReader.result;
  690.  
  691. const utf8 = checkUTF(utfContent);
  692.  
  693. if (utf8) {
  694. bufferInfo.encoding = "UTF-8";
  695. bufferInfo.confidence.encoding = 1;
  696. }
  697.  
  698. if (utf8) {
  699. data.content = utfContent;
  700. resolve(processContent(data, bufferInfo));
  701. } else {
  702. const isoReader = new FileReader();
  703.  
  704. isoReader.onload = () => {
  705. data.content = isoReader.result;
  706. resolve(processContent(data, bufferInfo));
  707. };
  708.  
  709. isoReader.readAsText(buffer, "ISO-8859-1");
  710. }
  711. };
  712.  
  713. utfReader.onerror = (err) => {
  714. reject(err);
  715. };
  716.  
  717. utfReader.readAsText(buffer, "UTF-8");
  718. }
  719. };
  720.  
  721. byteOrderMarkBuffer.onerror = (err) => {
  722. reject(err);
  723. };
  724.  
  725. byteOrderMarkBuffer.readAsArrayBuffer(buffer);
  726. });
  727. };
  728.  
  729. },{"./components/checkByteOrderMark.js":1,"./components/checkUTF.js":2,"./components/processContent.js":3}]},{},[8])(8)
  730. });