Mercurial > dedupe
view EditDistance.cpp @ 41:e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
| author | Tom Fredrik Blenning Klaussen <bfg@blenning.no> |
|---|---|
| date | Sun, 09 Sep 2012 15:17:00 +0200 |
| parents | f711ddb56ae7 |
| children | 4c283daa42c7 |
line wrap: on
line source
#include "EditDistance.hpp" #include "CompileTimeConstants.h" #include "ConfigurationProcessing.hpp" #include <boost/numeric/ublas/matrix.hpp> #define CharComparer(A, B) (QChar(A) == QChar(B)) EditDistance::cacheType* EditDistance::cache = 0; //EditDistance::cacheType EditDistance::cache; QString EditDistance::removeDiacritics(QString in) { QString out; foreach(QChar c, in) { if (c.decompositionTag() == QChar::NoDecomposition) { out.append(c); } else { QString tmp = c.decomposition(); out.append(tmp[0]); } } return out; } int EditDistance::Compute(QString a, QString b, bool remove) { if (remove) { a = removeDiacritics(a); b = removeDiacritics(b); } if ( a == b) return 0; OrderedPair<UniqueString> lup(a, b); if (cache == 0) { QString cacheLocation = processSetupVariables(EDITDISTANCE_CACHE_LOCATION); EditDistance::cache = new cacheType(cacheLocation, "EditLUT"); } boost::optional<int> res = cache->value(lup); if (res) return *res; uint s1 = a.size(); uint s2 = b.size(); // Allocate distance matrix boost::numeric::ublas::matrix<int> d(s1 + 1, s2 + 1); d.clear(); // Compute distance for (uint i = 0; i <= s1; ++i) d(i, 0) = i; for (uint j = 0; j <= s2; ++j) d(0, j) = j; for (uint i = 1; i <= s1; ++i) { for (uint j = 1; j <= s2; ++j) { if (CharComparer(a.at(i - 1), b.at(j - 1))) { // No change required d(i, j) = d(i - 1, j - 1); } else { d(i, j) = std::min(d(i - 1, j) + 1, // Deletion std::min(d(i, j - 1) + 1, // Insertion d(i - 1, j - 1) + 1)); // Substitution } } } // Return final value int retVal = d(s1, s2); cache->insert(lup, retVal); return retVal; }
