Mercurial > dedupe
comparison EditDistance.cpp @ 41:e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
| author | Tom Fredrik Blenning Klaussen <bfg@blenning.no> |
|---|---|
| date | Sun, 09 Sep 2012 15:17:00 +0200 |
| parents | f711ddb56ae7 |
| children | 4c283daa42c7 |
comparison
equal
deleted
inserted
replaced
| 40:f711ddb56ae7 | 41:e0898020af08 |
|---|---|
| 1 #include "EditDistance.hpp" | 1 #include "EditDistance.hpp" |
| 2 | 2 |
| 3 #include "CompileTimeConstants.h" | 3 #include "CompileTimeConstants.h" |
| 4 #include "ConfigurationProcessing.hpp" | 4 #include "ConfigurationProcessing.hpp" |
| 5 | 5 |
| 6 #include <QtCore/QList> | 6 #include <boost/numeric/ublas/matrix.hpp> |
| 7 | 7 |
| 8 #define CharComparer(A, B) (QChar(A) == QChar(B)) | 8 #define CharComparer(A, B) (QChar(A) == QChar(B)) |
| 9 | 9 |
| 10 EditDistance::cacheType* EditDistance::cache = 0; | 10 EditDistance::cacheType* EditDistance::cache = 0; |
| 11 //EditDistance::cacheType EditDistance::cache; | 11 //EditDistance::cacheType EditDistance::cache; |
| 29 if (remove) { | 29 if (remove) { |
| 30 a = removeDiacritics(a); | 30 a = removeDiacritics(a); |
| 31 b = removeDiacritics(b); | 31 b = removeDiacritics(b); |
| 32 } | 32 } |
| 33 | 33 |
| 34 if ( a == b) | |
| 35 return 0; | |
| 36 | |
| 34 OrderedPair<UniqueString> lup(a, b); | 37 OrderedPair<UniqueString> lup(a, b); |
| 35 | 38 |
| 36 if (cache == 0) { | 39 if (cache == 0) { |
| 37 QString cacheLocation = processSetupVariables(EDITDISTANCE_CACHE_LOCATION); | 40 QString cacheLocation = processSetupVariables(EDITDISTANCE_CACHE_LOCATION); |
| 38 EditDistance::cache = new cacheType(cacheLocation, "EditLUT"); | 41 EditDistance::cache = new cacheType(cacheLocation, "EditLUT"); |
| 39 } | 42 } |
| 40 boost::optional<int> res = cache->value(lup); | 43 boost::optional<int> res = cache->value(lup); |
| 41 if (res) | 44 if (res) |
| 42 return *res; | 45 return *res; |
| 43 | 46 |
| 47 uint s1 = a.size(); | |
| 48 uint s2 = b.size(); | |
| 44 | 49 |
| 45 // Allocate distance matrix | 50 // Allocate distance matrix |
| 46 QList<QList<int> > d; | 51 boost::numeric::ublas::matrix<int> d(s1 + 1, s2 + 1); |
| 47 QList<int> temp; | 52 d.clear(); |
| 48 for (int i=0;i<b.size()+1;i++) { | 53 |
| 49 temp.append(0); | 54 // Compute distance |
| 55 for (uint i = 0; i <= s1; ++i) | |
| 56 d(i, 0) = i; | |
| 57 for (uint j = 0; j <= s2; ++j) | |
| 58 d(0, j) = j; | |
| 59 for (uint i = 1; i <= s1; ++i) { | |
| 60 for (uint j = 1; j <= s2; ++j) { | |
| 61 if (CharComparer(a.at(i - 1), b.at(j - 1))) { | |
| 62 // No change required | |
| 63 d(i, j) = d(i - 1, j - 1); | |
| 64 } | |
| 65 else { | |
| 66 d(i, j) = | |
| 67 std::min(d(i - 1, j) + 1, // Deletion | |
| 68 std::min(d(i, j - 1) + 1, // Insertion | |
| 69 d(i - 1, j - 1) + 1)); // Substitution | |
| 70 } | |
| 71 } | |
| 50 } | 72 } |
| 51 for (int i=0;i<a.size()+1;i++) { | |
| 52 d.append(temp); | |
| 53 } | |
| 54 #if 0 | |
| 55 // Get character comparer | |
| 56 CharComparer isEqual = (ignoreCase) ? | |
| 57 (CharComparer)CharCompareIgnoreCase : CharCompare; | |
| 58 #endif | |
| 59 // Compute distance | |
| 60 for (int i = 0; i <= a.size(); i++) | |
| 61 d[i][0] = i; | |
| 62 for (int j = 0; j <= b.size(); j++) | |
| 63 d[0][j] = j; | |
| 64 for (int i = 1; i <= a.size(); i++) | |
| 65 { | |
| 66 for (int j = 1; j <= b.size(); j++) | |
| 67 { | |
| 68 if (CharComparer(a.at(i-1), b.at(j - 1))) | |
| 69 { | |
| 70 // No change required | |
| 71 d[i][j] = d[i - 1][j - 1]; | |
| 72 } | |
| 73 else | |
| 74 { | |
| 75 d[i][ j] = | |
| 76 std::min(d[i - 1][ j] + 1, // Deletion | |
| 77 std::min(d[i][ j - 1] + 1, // Insertion | |
| 78 d[i - 1][ j - 1] + 1)); // Substitution | |
| 79 } | |
| 80 } | |
| 81 } | |
| 82 | 73 |
| 83 // Return final value | 74 // Return final value |
| 84 int retVal = d[a.size()][ b.size()]; | 75 int retVal = d(s1, s2); |
| 85 cache->insert(lup, retVal); | 76 cache->insert(lup, retVal); |
| 86 | 77 |
| 87 return retVal; | 78 return retVal; |
| 88 } | 79 } |
