Mercurial > dedupe
comparison EditDistance.cpp @ 0:a3834af36579
Working with memory backend.
| author | Tom Fredrik Blenning Klaussen <bfg@blenning.no> |
|---|---|
| date | Mon, 20 Aug 2012 15:49:48 +0200 |
| parents | |
| children | b5943e4bf676 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:a3834af36579 |
|---|---|
| 1 #include "EditDistance.hpp" | |
| 2 | |
| 3 #include <QtCore/QList> | |
| 4 | |
| 5 #define CharComparer(A, B) (QChar(A) == QChar(B)) | |
| 6 | |
| 7 EditDistance::cacheType EditDistance::cache; | |
| 8 | |
| 9 QString EditDistance::removeDiacritics(QString in) | |
| 10 { | |
| 11 QString out; | |
| 12 foreach(QChar c, in) { | |
| 13 if (c.decompositionTag() == QChar::NoDecomposition) { | |
| 14 out.append(c); | |
| 15 } | |
| 16 else { | |
| 17 QString tmp = c.decomposition(); | |
| 18 out.append(tmp[0]); | |
| 19 } | |
| 20 } | |
| 21 return out; | |
| 22 } | |
| 23 | |
| 24 int EditDistance::Compute(QString a, QString b, bool remove) { | |
| 25 if (remove) { | |
| 26 a=removeDiacritics(a); | |
| 27 b=removeDiacritics(b); | |
| 28 } | |
| 29 | |
| 30 OrderedPair<QString> lup(a, b); | |
| 31 | |
| 32 boost::optional<int> res = cache.value(lup); | |
| 33 if (res) | |
| 34 return *res; | |
| 35 | |
| 36 | |
| 37 // Allocate distance matrix | |
| 38 QList<QList<int> > d; | |
| 39 QList<int> temp; | |
| 40 for (int i=0;i<b.size()+1;i++) { | |
| 41 temp.append(0); | |
| 42 } | |
| 43 for (int i=0;i<a.size()+1;i++) { | |
| 44 d.append(temp); | |
| 45 } | |
| 46 #if 0 | |
| 47 // Get character comparer | |
| 48 CharComparer isEqual = (ignoreCase) ? | |
| 49 (CharComparer)CharCompareIgnoreCase : CharCompare; | |
| 50 #endif | |
| 51 // Compute distance | |
| 52 for (int i = 0; i <= a.size(); i++) | |
| 53 d[i][0] = i; | |
| 54 for (int j = 0; j <= b.size(); j++) | |
| 55 d[0][j] = j; | |
| 56 for (int i = 1; i <= a.size(); i++) | |
| 57 { | |
| 58 for (int j = 1; j <= b.size(); j++) | |
| 59 { | |
| 60 if (CharComparer(a.at(i-1), b.at(j - 1))) | |
| 61 { | |
| 62 // No change required | |
| 63 d[i][j] = d[i - 1][j - 1]; | |
| 64 } | |
| 65 else | |
| 66 { | |
| 67 d[i][ j] = | |
| 68 std::min(d[i - 1][ j] + 1, // Deletion | |
| 69 std::min(d[i][ j - 1] + 1, // Insertion | |
| 70 d[i - 1][ j - 1] + 1)); // Substitution | |
| 71 } | |
| 72 } | |
| 73 } | |
| 74 | |
| 75 // Return final value | |
| 76 int retVal = d[a.size()][ b.size()]; | |
| 77 cache.insert(lup, retVal); | |
| 78 | |
| 79 return retVal; | |
| 80 } |
