js中比较文本相似性的四种算法

js中比较文本相似性的四种算法

js中比较文本相似性的四种算法

今天给大家介绍一个js中不同算法的文本相似性检测功能的实现

1、Levenshtein算法(编辑距离)

指的是两个字符串之间,由一个转换成另一个所需的最少编辑操作次数。

许可的编辑操作包括将一个字符替换成另一个字符,插入一个字符,删除一个字符。

编辑距离的算法是首先由俄国科学家Levenshtein提出的,故又叫Levenshtein Distance。

示例如下,可在线直接运行,点击代码右上角

<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>BFW NEW PAGE</title>
    <script id="bfwone" data="dep=jquery.17&err=0" type="text/javascript" src="http://repo.bfw.wiki/bfwrepo/js/bfwone.js"></script>
    <script type="text/javascript">
        LevenshteinDistance = function(a, b) {
            if (a.length == 0) return b.length;
            if (b.length == 0) return a.length;

            var matrix = [];

            // increment along the first column of each row
            var i;
            for (i = 0; i <= b.length; i++) {
                matrix[i] = [i];
            }

            // increment each column in the first row
            var j;
            for (j = 0; j <= a.length; j++) {
                matrix[0][j] = j;
            }

            // Fill in the rest of the matrix
            for (i = 1; i <= b.length; i++) {
                for (j = 1; j <= a.length; j++) {
                    if (b.charAt(i-1) == a.charAt(j-1)) {
                        matrix[i][j] = matrix[i-1][j-1];
                    } else {
                        matrix[i][j] = Math.min(matrix[i-1][j-1] + 1, // substitution
                            Math.min(matrix[i][j-1] + 1, // insertion
                                matrix[i-1][j] + 1)); // deletion
                    }
                }
            }

            return matrix[b.length][a.length];
        };
        bready(function() {
            $("#comparebtn").click(function() {
                alert((10-LevenshteinDistance($("#left-words").val(), $("#right-words").val()))/10);
            });
        });
    </script>
</head>
<body>
    <div>
        <textarea id="left-words"></textarea>
        <textarea id="right-words"></textarea>
    </div>
    <button id="comparebtn">检测</button>
</body>
</html>

2、三角比较算法

三角比较法就是讲单词按照3个一组拆分成连续的序列,例如 hello 我们可以拆成hel  ell  llo三个连续的序列,两个单词hollo,我们拆成hol oll llo,我们发现拆完后最后一组是相似的,那么他们的相似值等于1/字母数5=20%

也就是相似性,ji代码如下

(function () {
    TrigramIndex = function (inputPhrases) {
        function asTrigrams(phrase, callback) {
            var rawData = "  ".concat(phrase, "  ");
            for (var i = rawData.length - 3; i >= 0; i = i - 1)
                callback.call(this, rawData.slice(i, i + 3));
        };

        var instance = {
            phrases: [],
            trigramIndex: [],

            index: function (phrase) {
                if (!phrase || phrase === "" || this.phrases.indexOf(phrase) >= 0) return;
                var phraseIndex = this.phrases.push(phrase) - 1;
                asTrigrams.call(this, phrase, function (trigram) {
                    var phrasesForTrigram = this.trigramIndex[trigram];
                    if (!phrasesForTrigram) phrasesForTrigram = [];
                    if (phrasesForTrigram.indexOf(phraseIndex) < 0) phrasesForTrigram.push(phraseIndex);
                    this.trigramIndex[trigram] = phrasesForTrigram;
                });
            },

            find: function (phrase) {
                var phraseMatches = [];
                var trigramsInPhrase = 0;
                asTrigrams.call(this, phrase, function (trigram) {
                    var phrasesForTrigram = this.trigramIndex[trigram];
                    trigramsInPhrase += 1;
                    if (phrasesForTrigram)
                        for (var j in phrasesForTrigram) {
                            phraseIndex = phrasesForTrigram[j];
                            if (!phraseMatches[phraseIndex]) phraseMatches[phraseIndex] = 0;
                            phraseMatches[phraseIndex] += 1;
                        }
                });
                var result = [];
                for (var i in phraseMatches)
                    result.push({ phrase: this.phrases[i], matches: phraseMatches[i] });

                result.sort(function (a, b) {
                    var diff = b.matches - a.matches;
                    return diff;// == 0 ? a.phrase.localeCompare(b.phrase) : diff;
                });
                return result;
            }
        };
        for (var i in inputPhrases)
            instance.index(inputPhrases[i]);
        return instance;
    };
})();

3、余弦算法

通过比较两个句子之间的余弦相似性可以作为它们的矢量表示的点积。有多种方法可以将句子/段落表示为向量。

similarity
= cos(a,b)
= dotproduct(a,b) / ( norm(a) * norm(b) )
= a.b / ||a|| * ||b||

就是讲句子分成单独的词语,然后比较每个单词在句子中出现的次数,形成一个矢量数据,然后余弦一下就好了

js代码如下

(function () {
    
    function termFreqMap(str) {
        var words = str.split(' ');
        var termFreq = {};
        words.forEach(function(w) {
            termFreq[w] = (termFreq[w] || 0) + 1;
        });
        return termFreq;
    }

    function addKeysToDict(map, dict) {
        for (var key in map) {
            dict[key] = true;
        }
    }

    function termFreqMapToVector(map, dict) {
        var termFreqVector = [];
        for (var term in dict) {
            termFreqVector.push(map[term] || 0);
        }
        return termFreqVector;
    }

    function vecDotProduct(vecA, vecB) {
        var product = 0;
        for (var i = 0; i < vecA.length; i++) {
            product += vecA[i] * vecB[i];
        }
        return product;
    }

    function vecMagnitude(vec) {
        var sum = 0;
        for (var i = 0; i < vec.length; i++) {
            sum += vec[i] * vec[i];
        }
        return Math.sqrt(sum);
    }

    function cosineSimilarity(vecA, vecB) {
        return vecDotProduct(vecA, vecB) / (vecMagnitude(vecA) * vecMagnitude(vecB));
    }

    Cosinesimilarity = function textCosineSimilarity(strA, strB) {
        var termFreqA = termFreqMap(strA);
        var termFreqB = termFreqMap(strB);

        var dict = {};
        addKeysToDict(termFreqA, dict);
        addKeysToDict(termFreqB, dict);

        var termFreqVecA = termFreqMapToVector(termFreqA, dict);
        var termFreqVecB = termFreqMapToVector(termFreqB, dict);

        return cosineSimilarity(termFreqVecA, termFreqVecB);
    }
})();

4、Jaro Winkler算法

js中比较文本相似性的四种算法

js代码如下

(function () {
        JaroWrinker  = function (s1, s2) {
        var m = 0;

        // Exit early if either are empty.
        if ( s1.length === 0 || s2.length === 0 ) {
            return 0;
        }

        // Exit early if they're an exact match.
        if ( s1 === s2 ) {
            return 1;
        }

        var range     = (Math.floor(Math.max(s1.length, s2.length) / 2)) - 1,
            s1Matches = new Array(s1.length),
            s2Matches = new Array(s2.length);

        for ( i = 0; i < s1.length; i++ ) {
            var low  = (i >= range) ? i - range : 0,
                high = (i + range <= s2.length) ? (i + range) : (s2.length - 1);

            for ( j = low; j <= high; j++ ) {
            if ( s1Matches[i] !== true && s2Matches[j] !== true && s1[i] === s2[j] ) {
                ++m;
                s1Matches[i] = s2Matches[j] = true;
                break;
            }
            }
        }

        // Exit early if no matches were found.
        if ( m === 0 ) {
            return 0;
        }

        // Count the transpositions.
        var k = n_trans = 0;

        for ( i = 0; i < s1.length; i++ ) {
            if ( s1Matches[i] === true ) {
            for ( j = k; j < s2.length; j++ ) {
                if ( s2Matches[j] === true ) {
                k = j + 1;
                break;
                }
            }

            if ( s1[i] !== s2[j] ) {
                ++n_trans;
            }
            }
        }

        var weight = (m / s1.length + m / s2.length + (m - (n_trans / 2)) / m) / 3,
            l      = 0,
            p      = 0.1;

        if ( weight > 0.7 ) {
            while ( s1[l] === s2[l] && l < 4 ) {
            ++l;
            }

            weight = weight + l * p * (1 - weight);
        }

        return weight;
    }
})();



{{collectdata}}

网友评论0