Mercurial > dedupe
changeset 41:e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
| author | Tom Fredrik Blenning Klaussen <bfg@blenning.no> |
|---|---|
| date | Sun, 09 Sep 2012 15:17:00 +0200 |
| parents | f711ddb56ae7 |
| children | 4c283daa42c7 |
| files | EditDistance.cpp |
| diffstat | 1 files changed, 27 insertions(+), 36 deletions(-) [+] |
line wrap: on
line diff
--- a/EditDistance.cpp Fri Sep 07 13:32:33 2012 +0200 +++ b/EditDistance.cpp Sun Sep 09 15:17:00 2012 +0200 @@ -3,7 +3,7 @@ #include "CompileTimeConstants.h" #include "ConfigurationProcessing.hpp" -#include <QtCore/QList> +#include <boost/numeric/ublas/matrix.hpp> #define CharComparer(A, B) (QChar(A) == QChar(B)) @@ -31,6 +31,9 @@ b = removeDiacritics(b); } + if ( a == b) + return 0; + OrderedPair<UniqueString> lup(a, b); if (cache == 0) { @@ -41,47 +44,35 @@ if (res) return *res; + uint s1 = a.size(); + uint s2 = b.size(); // Allocate distance matrix - QList<QList<int> > d; - QList<int> temp; - for (int i=0;i<b.size()+1;i++) { - temp.append(0); - } - for (int i=0;i<a.size()+1;i++) { - d.append(temp); - } -#if 0 - // Get character comparer - CharComparer isEqual = (ignoreCase) ? - (CharComparer)CharCompareIgnoreCase : CharCompare; -#endif + boost::numeric::ublas::matrix<int> d(s1 + 1, s2 + 1); + d.clear(); + // Compute distance - for (int i = 0; i <= a.size(); i++) - d[i][0] = i; - for (int j = 0; j <= b.size(); j++) - d[0][j] = j; - for (int i = 1; i <= a.size(); i++) - { - for (int j = 1; j <= b.size(); j++) - { - if (CharComparer(a.at(i-1), b.at(j - 1))) - { - // No change required - d[i][j] = d[i - 1][j - 1]; - } - else - { - d[i][ j] = - std::min(d[i - 1][ j] + 1, // Deletion - std::min(d[i][ j - 1] + 1, // Insertion - d[i - 1][ j - 1] + 1)); // Substitution - } - } + for (uint i = 0; i <= s1; ++i) + d(i, 0) = i; + for (uint j = 0; j <= s2; ++j) + d(0, j) = j; + for (uint i = 1; i <= s1; ++i) { + for (uint j = 1; j <= s2; ++j) { + if (CharComparer(a.at(i - 1), b.at(j - 1))) { + // No change required + d(i, j) = d(i - 1, j - 1); + } + else { + d(i, j) = + std::min(d(i - 1, j) + 1, // Deletion + std::min(d(i, j - 1) + 1, // Insertion + d(i - 1, j - 1) + 1)); // Substitution + } } + } // Return final value - int retVal = d[a.size()][ b.size()]; + int retVal = d(s1, s2); cache->insert(lup, retVal); return retVal;
