lemmatizer

lemmatizer 单词原型查找

Bu betik doğrudan kurulamaz. Diğer betiklerin şu meta yönergesiyle dahil ettiği bir kütüphanedir: // @require https://update.greasyfork.org/scripts/534389/1598598/lemmatizer.js

Bu betiği kurabilmeniz için Tampermonkey, Greasemonkey ya da Violentmonkey gibi bir kullanıcı betiği eklentisini kurmanız gerekmektedir.

Bu betiği yüklemek için Tampermonkey gibi bir uzantı yüklemeniz gerekir.

Bu betiği kurabilmeniz için Tampermonkey ya da Violentmonkey gibi bir kullanıcı betiği eklentisini kurmanız gerekmektedir.

Bu betiği kurabilmeniz için Tampermonkey ya da Userscripts gibi bir kullanıcı betiği eklentisini kurmanız gerekmektedir.

Bu betiği indirebilmeniz için ayrıca Tampermonkey gibi bir eklenti kurmanız gerekmektedir.

Bu komut dosyasını yüklemek için bir kullanıcı komut dosyası yöneticisi uzantısı yüklemeniz gerekecek.

(Zaten bir kullanıcı komut dosyası yöneticim var, kurmama izin verin!)

Bu stili yüklemek için Stylus gibi bir uzantı yüklemeniz gerekir.

Bu stili yüklemek için Stylus gibi bir uzantı kurmanız gerekir.

Bu stili yükleyebilmek için Stylus gibi bir uzantı yüklemeniz gerekir.

Bu stili yüklemek için bir kullanıcı stili yöneticisi uzantısı yüklemeniz gerekir.

Bu stili yüklemek için bir kullanıcı stili yöneticisi uzantısı kurmanız gerekir.

Bu stili yükleyebilmek için bir kullanıcı stili yöneticisi uzantısı yüklemeniz gerekir.

(Zaten bir user-style yöneticim var, yükleyeyim!)

/*
* JavaScript Lemmatizer v0.0.2
* https://github.com/takafumir/javascript-lemmatizer
* MIT License
* by Takafumi Yamano
* adapted by xing
*/

// Polyfill String#endsWith only on engines that really lack it.
// The guard has to inspect the prototype: `String.endsWith` (a static) never
// exists, so testing it would unconditionally replace the native method with
// the simplified fallback below, which ignores the optional end-position
// argument.
if (typeof String.prototype.endsWith !== "function") {
    String.prototype.endsWith = function (suffix) {
        // A match can only start at (length - suffix.length); anything later
        // would not fit, so "found at or after that index" means "is a suffix".
        return this.indexOf(suffix, this.length - suffix.length) !== -1;
    };
}

// Lemmatizer constructor
// wn_files {
//     noun: [
//         'index.noun.json',
//         'noun.exc.json'
//     ],
//     verb: [
//         'index.verb.json',
//         'verb.exc.json'
//     ],
//     adj: [
//         'index.adj.json',
//         'adj.exc.json'
//     ],
//     adv: [
//         'index.adv.json',
//         'adv.exc.json'
//     ]
// }
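// wn_data maps each file name above to its JSON text, e.g. {
//     'index.adj.json': '["able", ...]',            // JSON array of lemmas
//     'adj.exc.json': '[["airiest","airy"], ...]',  // JSON array of [inflected form, lemma] pairs
// }
// (dictionary files: see the dict/ directory of https://github.com/takafumir/javascript-lemmatizer)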
/**
 * Lemmatizer constructor.
 *
 * @param {Object} wn_data   dictionary payloads keyed by file name; kept as `this.files`
 * @param {Object} [wn_files] for each part of speech, a two-item array:
 *                            [word-list file name, exception-list file name]
 */
const Lemmatizer = function (wn_data, wn_files = {
    noun: ['index.noun.json', 'noun.exc.json'],
    verb: ['index.verb.json', 'verb.exc.json'],
    adj: ['index.adj.json', 'adj.exc.json'],
    adv: ['index.adv.json', 'adv.exc.json']
}) {
    // adjectives and adverbs use the same suffix rules; each gets its own array
    const comparison_rules = () => [
        ['er', ''],
        ['est', ''],
        ['er', 'e'],
        ['est', 'e'],
        ['ier', 'y'],
        ['iest', 'y']
    ];

    this.files = wn_data;

    // [suffix to strip, replacement] pairs tried for every part of speech
    this.morphological_substitutions = {
        noun: [['ies', 'y'], ['ves', 'f'], ['men', 'man']],
        verb: [
            ['ies', 'y'], ['ied', 'y'],
            ['cked', 'c'], ['cked', 'ck'],
            ['able', 'e'], ['able', ''],
            ['ability', 'e'], ['ability', '']
        ],
        adj: comparison_rules(),
        adv: comparison_rules()
    };

    this.data = {};
    this.wordlists = {};
    this.exceptions = {};
    this.exceptionsEx = {};

    // one empty lookup table per part of speech
    Object.keys(this.morphological_substitutions).forEach((category) => {
        this.wordlists[category] = {};
        this.exceptions[category] = {};
        this.exceptionsEx[category] = {};
    });

    // first pass: hand every configured file pair to load_wordnet_files
    for (const pos in wn_files) {
        const pair = wn_files[pos];
        this.load_wordnet_files(pos, pair[0], pair[1]);
    }

    // second pass: once everything is loaded, build the lookup tables
    for (const pos in wn_files) {
        this.setup_dic_data(pos);
    }
};

// Lemmatizer properties
Lemmatizer.prototype = {
    form: '',    // surface form currently being analysed (set by lemmas())
    idx: '_idx', // suffix of the this.data key that holds a word list
    exc: '_exc', // suffix of the this.data key that holds an exception list
    lems: [], // -> [ ["lemma1", "verb"], ["lemma2", "noun"]... ]; replaced on every lemmas() call

    // **************************************************
    // public
    // **************************************************

    // Return an Array of ["lemma", "pos"] pairs
    // like [ ["lemma1", "verb"], ["lemma2", "noun"]... ]
    //
    // form: the inflected word to look up
    // pos:  optional; one of 'verb', 'noun', 'adj', 'adv'. When omitted every
    //       part of speech is tried.
    // Returns undefined (after logging a warning) when pos is given but is not
    // one of the four supported values. When nothing is found the form itself
    // is returned, paired with pos (or '' when pos was omitted).
    lemmas: function (form, pos) {
        const parts = ['verb', 'noun', 'adj', 'adv'];
        this.lems = [];
        this.form = form;

        if (pos && parts.indexOf(pos) === -1) {
            console.log("warning: pos must be 'verb' or 'noun' or 'adj' or 'adv'.");
            return;
        }

        if (!pos) {
            parts.forEach(part => this.irregular_bases(part));
            parts.forEach(part => this.regular_bases(part));

            // when lemma not found and the form is included in wordlists.
            if (this.is_lemma_empty()) {
                parts
                    // own-property test: names inherited from Object.prototype
                    // (e.g. "constructor") are not dictionary entries
                    .filter(part => Object.prototype.hasOwnProperty.call(this.wordlists[part], form) &&
                        this.wordlists[part][form])
                    .forEach(part => this.lems.push([form, part]));
            }
            // when lemma not found and the form is not included in wordlists.
            if (this.is_lemma_empty()) {
                this.lems.push([form, '']);
            }
        } else {
            this.base_forms(pos);
            if (this.is_lemma_empty()) {
                this.lems.push([form, pos]);
            }
        }

        // sort to verb -> noun -> adv -> adj (descending by pos name).
        // Equal pos must compare as 0 so the comparator is consistent and
        // entries of the same pos keep their discovery order.
        return this.uniq_lemmas(this.lems).sort((a, b) => {
            if (a[1] === b[1]) {
                return 0;
            }
            return a[1] > b[1] ? -1 : 1;
        });
    },

    // return only uniq lemmas without pos like [ 'high' ] or [ 'leave', 'leaf' ]
    // An unsupported pos yields [] (lemmas() returns undefined in that case).
    only_lemmas: function (form, pos) {
        const pairs = this.lemmas(form, pos) || [];
        return [...new Set(pairs.map(pair => pair[0]))];
    },

    // return the ["lemma", "pos"] pairs for form across every part of speech
    only_lemmas_withPos: function (form) {
        // lemmas() already removed duplicate pairs; hand back a fresh array
        return [...this.lemmas(form)];
    },


    // **************************************************
    // private
    // The following properties(methods) are only used by
    // Lemmatizer inside, so don't call them from outside.
    // **************************************************
    is_lemma_empty: function () {
        return this.lems.length === 0;
    },

    // true when word is stored in the word list of pos
    // (setup_dic_data stores every word under its own name)
    in_wordlist: function (pos, word) {
        const entry = this.wordlists[pos][word];
        return Boolean(entry) && entry === word;
    },

    // set up dictionary data
    // pos: part of speech; list / exc: file names of its word list and exception list
    load_wordnet_files: function (pos, list, exc) {
        this.open_file(pos + this.idx, list);
        this.open_file(pos + this.exc, exc);
    },

    // Fill wordlists / exceptions / exceptionsEx for pos from the parsed data.
    //   wordlists[pos][word]          = word
    //   exceptions[pos][inflected]    = lemma
    //   exceptionsEx[pos][lemma]      = [inflected, ...] (reverse index)
    setup_dic_data: function (pos) {
        const words = this.wordlists[pos];
        const forward = this.exceptions[pos];
        const reverse = this.exceptionsEx[pos];

        this.fetch_data(pos + this.idx).forEach(word => {
            words[word] = word;
        });

        this.fetch_data(pos + this.exc).forEach(item => {
            const inflected = item[0];
            const lemma = item[1];
            forward[inflected] = lemma;
            if (Object.prototype.hasOwnProperty.call(reverse, lemma)) {
                reverse[lemma].push(inflected);
            } else {
                reverse[lemma] = [inflected];
            }
        });
    },

    // look up the JSON text registered for file and store it parsed under key
    open_file: function (key, file) {
        this.store_data(key, this.files[file]);
    },

    // data is JSON text
    store_data: function (key, data) {
        this.data[key] = JSON.parse(data);
    },

    fetch_data: function (key) {
        return this.data[key];
    },
    // end of set up dictionary data

    base_forms: function (pos) {
        this.irregular_bases(pos);
        this.regular_bases(pos);
    },

    // build array lemmas(this.lems) like [ [lemma1, "verb"], [lemma2, "noun"]... ]
    // from the exception list of pos
    irregular_bases: function (pos) {
        const table = this.exceptions[pos];
        // only real dictionary entries count, not members inherited from Object.prototype
        if (!Object.prototype.hasOwnProperty.call(table, this.form)) {
            return;
        }
        const base = table[this.form];
        if (base && base !== this.form) {
            this.lems.push([base, pos]);
        }
    },

    // build array lemmas(this.lems) like [ [lemma1, "verb"], [lemma2, "noun"]... ]
    // from the suffix rules of pos
    regular_bases: function (pos) {
        let bases = null;
        // bases -> [ [lemma1, lemma2, lemma3...], pos ]
        switch (pos) {
            case 'verb':
                bases = this.possible_verb_bases();
                break;
            case 'noun':
                bases = this.possible_noun_bases();
                break;
            case 'adj':
            case 'adv':
                bases = this.possible_adj_adv_bases(pos);
                break;
            default:
                break;
        }
        if (bases) {
            this.check_lemmas(bases);
        }
    },

    // check if possible bases are include in lemma wordlists and push
    check_lemmas: function (bases) {
        // bases -> [ [lemma1, lemma2, lemma3...], pos ]
        const candidates = bases[0];
        const pos = bases[1];
        candidates.forEach(lemma => {
            if (this.in_wordlist(pos, lemma)) {
                this.lems.push([lemma, pos]);
            }
        });
    },

    // candidate verb lemmas for this.form -> [ [candidate, ...], 'verb' ]
    possible_verb_bases: function () {
        const form = this.form;
        const lemmas = [];

        if (this.ends_with_es()) {
            // goes -> go
            const verb_base = form.slice(0, -2);
            lemmas.push(verb_base);
            if (!this.in_wordlist('verb', verb_base)) {
                // opposes -> oppose
                lemmas.push(form.slice(0, -1));
            }
        } else if (this.ends_with_verb_vowel_ys()) {
            // annoys -> annoy
            lemmas.push(form.slice(0, -1));
        } else if (form.endsWith('ed') && this.double_consonant('ed')) {
            // Must be tested before the generic 'ed' rule (as the 'ing' pair
            // below does), otherwise this branch can never be reached.
            // dragged -> drag
            lemmas.push(form.slice(0, -3));
            // added -> add
            lemmas.push(form.slice(0, -2));
            // pirouetted -> pirouette
            lemmas.push(form.slice(0, -2) + 'e');
        } else if (form.endsWith('ed') && !form.endsWith('ied') && !form.endsWith('cked')) {
            // saved -> save
            const past_base = form.slice(0, -1);
            lemmas.push(past_base);
            if (!this.in_wordlist('verb', past_base)) {
                // talked -> talk, but not push like coded -> cod
                lemmas.push(form.slice(0, -2));
            }
        } else if (form.endsWith('ing') && this.double_consonant('ing')) {
            // dragging -> drag
            lemmas.push(form.slice(0, -4));
            // adding -> add
            lemmas.push(form.slice(0, -3));
            // pirouetting -> pirouette
            lemmas.push(form.slice(0, -3) + 'e');
        } else if (form.endsWith('ing') && !this.exceptions['verb'][form]) {
            // coding -> code
            const ing_base = form.slice(0, -3) + 'e';
            lemmas.push(ing_base);
            if (!this.in_wordlist('verb', ing_base)) {
                // talking -> talk, but not push like coding -> cod
                lemmas.push(form.slice(0, -3));
            }
        } else if (form.endsWith('able') && this.double_consonant('able')) {
            lemmas.push(form.slice(0, -5));
        } else if (form.endsWith('ability') && this.double_consonant('ability')) {
            lemmas.push(form.slice(0, -8));
        } else if (form.endsWith('s')) {
            lemmas.push(form.slice(0, -1));
        }

        this.morphological_substitutions["verb"].forEach(([morpho, origin]) => {
            if (form.endsWith(morpho)) {
                lemmas.push(form.slice(0, -(morpho.length)) + origin);
            }
        });

        // the form itself may already be a lemma
        lemmas.push(form);

        return [lemmas, 'verb'];
    },

    // candidate noun lemmas for this.form -> [ [candidate, ...], 'noun' ]
    possible_noun_bases: function () {
        const form = this.form;
        const lemmas = [];

        if (this.ends_with_es()) {
            // watches -> watch
            const noun_base = form.slice(0, -2);
            lemmas.push(noun_base);
            if (!this.in_wordlist('noun', noun_base)) {
                // horses -> horse
                lemmas.push(form.slice(0, -1));
            }
        } else if (form.endsWith('s')) {
            lemmas.push(form.slice(0, -1));
        }

        this.morphological_substitutions["noun"].forEach(([morpho, origin]) => {
            if (form.endsWith(morpho)) {
                lemmas.push(form.slice(0, -(morpho.length)) + origin);
            }
        });

        // to push a word like 'us' as it is
        lemmas.push(form);

        return [lemmas, 'noun'];
    },

    // candidate adjective / adverb lemmas for this.form -> [ [candidate, ...], pos ]
    possible_adj_adv_bases: function (pos) {
        const form = this.form;
        const lemmas = [];

        if (form.endsWith('est') && this.double_consonant('est')) {
            // biggest -> big
            lemmas.push(form.slice(0, -4));
        } else if (form.endsWith('er') && this.double_consonant('er')) {
            // bigger -> big
            lemmas.push(form.slice(0, -3));
        }

        this.morphological_substitutions[pos].forEach(([morpho, origin]) => {
            if (form.endsWith(morpho)) {
                lemmas.push(form.slice(0, -(morpho.length)) + origin);
            }
        });

        // to push a word like 'after' as it is
        lemmas.push(form);

        return [lemmas, pos];
    },

    // true when the three letters in front of suffix are
    // vowel + doubled consonant, for like bigger -> big
    double_consonant: function (suffix) {
        const form = this.form;
        // length after removing suffix from form
        const len = form.length - suffix.length;
        // out-of-range indexes give undefined, which is_vowel treats as "not a vowel"
        return this.is_vowel(form[len - 3]) && !this.is_vowel(form[len - 2]) && form[len - 2] === form[len - 1];
    },

    is_vowel: function (letter) {
        return ["a", "e", "i", "o", "u"].indexOf(letter) > -1;
    },

    // [ ["leave", "verb"], ["leaf", "noun"], ["leave", "verb"], ["leave", "noun"] ];
    // -> [ ["leave", "verb"], ["leaf", "noun"], ["leave", "noun"] ];
    // Lemmas of a single character are dropped as well.
    uniq_lemmas: function (lemmas) {
        const unique = [];
        lemmas.forEach(pair => {
            if (pair[0].length > 1 && !this.is_include(unique, pair)) {
                unique.push(pair);
            }
        });
        return unique;
    },

    // true when lemmas already holds a pair equal to target (same lemma, same pos)
    is_include: function (lemmas, target) {
        return lemmas.some(pair => pair[0] === target[0] && pair[1] === target[1]);
    },

    // true when this.form ends in a sibilant-style '-es' ending
    ends_with_es: function () {
        const form = this.form;
        return ['ches', 'shes', 'oes', 'ses', 'xes', 'zes'].some(end => form.endsWith(end));
    },

    // true when this.form ends in vowel + 'ys'
    ends_with_verb_vowel_ys: function () {
        const form = this.form;
        return ['ays', 'eys', 'iys', 'oys', 'uys'].some(end => form.endsWith(end));
    },

    // Reverse lookup in the exception lists: given a lemma (the parameter is
    // named pos for historical reasons) return, for every part of speech that
    // has irregular forms of it, a pair [ [inflected, ...], partOfSpeech ].
    // Like the original, it also overwrites this.form with the argument.
    irregularConjugationOrPluralities: function (pos) {
        this.form = pos;
        const found = [];
        // plain Array methods only: no dependency on a non-standard Array helper
        Object.keys(this.exceptionsEx).forEach(part => {
            const table = this.exceptionsEx[part];
            if (Object.prototype.hasOwnProperty.call(table, pos)) {
                found.push([table[pos], part]);
            }
        });
        return found;
    }
};