Mercurial > dedupe
view EditDistance.cpp @ 16:06166d6c083b
Add configuration processing.
Cache DB values
Add a custom RBTree to save space.
Track multiple DB connections properly.
More testing.
Add ValueExistsException.
| author | Tom Fredrik Blenning Klaussen <bfg@blenning.no> |
|---|---|
| date | Tue, 28 Aug 2012 18:58:02 +0200 |
| parents | b5943e4bf676 |
| children | fda70a362ed5 |
line wrap: on
line source
#include "EditDistance.hpp" #include "CompileTimeConstants.h" #include "ConfigurationProcessing.hpp" #include <QtCore/QList> #define CharComparer(A, B) (QChar(A) == QChar(B)) EditDistance::cacheType* EditDistance::cache = 0; //EditDistance::cacheType EditDistance::cache; QString EditDistance::removeDiacritics(QString in) { QString out; foreach(QChar c, in) { if (c.decompositionTag() == QChar::NoDecomposition) { out.append(c); } else { QString tmp = c.decomposition(); out.append(tmp[0]); } } return out; } int EditDistance::Compute(QString a, QString b, bool remove) { if (remove) { a = removeDiacritics(a); b = removeDiacritics(b); } OrderedPair<UniqueString> lup(a, b); if (cache == 0) { QString cacheLocation = processSetupVariables(EDITDISTANCE_CACHE_LOCATION); EditDistance::cache = new cacheType(cacheLocation, "EditLUT"); } boost::optional<int> res = cache->value(lup); if (res) return *res; // Allocate distance matrix QList<QList<int> > d; QList<int> temp; for (int i=0;i<b.size()+1;i++) { temp.append(0); } for (int i=0;i<a.size()+1;i++) { d.append(temp); } #if 0 // Get character comparer CharComparer isEqual = (ignoreCase) ? (CharComparer)CharCompareIgnoreCase : CharCompare; #endif // Compute distance for (int i = 0; i <= a.size(); i++) d[i][0] = i; for (int j = 0; j <= b.size(); j++) d[0][j] = j; for (int i = 1; i <= a.size(); i++) { for (int j = 1; j <= b.size(); j++) { if (CharComparer(a.at(i-1), b.at(j - 1))) { // No change required d[i][j] = d[i - 1][j - 1]; } else { d[i][ j] = std::min(d[i - 1][ j] + 1, // Deletion std::min(d[i][ j - 1] + 1, // Insertion d[i - 1][ j - 1] + 1)); // Substitution } } } // Return final value int retVal = d[a.size()][ b.size()]; cache->insert(lup, retVal); return retVal; }
