comparison EditDistance.cpp @ 41:e0898020af08

Faster computation of EditDistance, by more efficient datastructure.
author Tom Fredrik Blenning Klaussen <bfg@blenning.no>
date Sun, 09 Sep 2012 15:17:00 +0200
parents f711ddb56ae7
children 4c283daa42c7
comparison
equal deleted inserted replaced
40:f711ddb56ae7 41:e0898020af08
1 #include "EditDistance.hpp" 1 #include "EditDistance.hpp"
2 2
3 #include "CompileTimeConstants.h" 3 #include "CompileTimeConstants.h"
4 #include "ConfigurationProcessing.hpp" 4 #include "ConfigurationProcessing.hpp"
5 5
6 #include <QtCore/QList> 6 #include <boost/numeric/ublas/matrix.hpp>
7 7
8 #define CharComparer(A, B) (QChar(A) == QChar(B)) 8 #define CharComparer(A, B) (QChar(A) == QChar(B))
9 9
10 EditDistance::cacheType* EditDistance::cache = 0; 10 EditDistance::cacheType* EditDistance::cache = 0;
11 //EditDistance::cacheType EditDistance::cache; 11 //EditDistance::cacheType EditDistance::cache;
29 if (remove) { 29 if (remove) {
30 a = removeDiacritics(a); 30 a = removeDiacritics(a);
31 b = removeDiacritics(b); 31 b = removeDiacritics(b);
32 } 32 }
33 33
34 if ( a == b)
35 return 0;
36
34 OrderedPair<UniqueString> lup(a, b); 37 OrderedPair<UniqueString> lup(a, b);
35 38
36 if (cache == 0) { 39 if (cache == 0) {
37 QString cacheLocation = processSetupVariables(EDITDISTANCE_CACHE_LOCATION); 40 QString cacheLocation = processSetupVariables(EDITDISTANCE_CACHE_LOCATION);
38 EditDistance::cache = new cacheType(cacheLocation, "EditLUT"); 41 EditDistance::cache = new cacheType(cacheLocation, "EditLUT");
39 } 42 }
40 boost::optional<int> res = cache->value(lup); 43 boost::optional<int> res = cache->value(lup);
41 if (res) 44 if (res)
42 return *res; 45 return *res;
43 46
47 uint s1 = a.size();
48 uint s2 = b.size();
44 49
45 // Allocate distance matrix 50 // Allocate distance matrix
46 QList<QList<int> > d; 51 boost::numeric::ublas::matrix<int> d(s1 + 1, s2 + 1);
47 QList<int> temp; 52 d.clear();
48 for (int i=0;i<b.size()+1;i++) { 53
49 temp.append(0); 54 // Compute distance
55 for (uint i = 0; i <= s1; ++i)
56 d(i, 0) = i;
57 for (uint j = 0; j <= s2; ++j)
58 d(0, j) = j;
59 for (uint i = 1; i <= s1; ++i) {
60 for (uint j = 1; j <= s2; ++j) {
61 if (CharComparer(a.at(i - 1), b.at(j - 1))) {
62 // No change required
63 d(i, j) = d(i - 1, j - 1);
64 }
65 else {
66 d(i, j) =
67 std::min(d(i - 1, j) + 1, // Deletion
68 std::min(d(i, j - 1) + 1, // Insertion
69 d(i - 1, j - 1) + 1)); // Substitution
70 }
71 }
50 } 72 }
51 for (int i=0;i<a.size()+1;i++) {
52 d.append(temp);
53 }
54 #if 0
55 // Get character comparer
56 CharComparer isEqual = (ignoreCase) ?
57 (CharComparer)CharCompareIgnoreCase : CharCompare;
58 #endif
59 // Compute distance
60 for (int i = 0; i <= a.size(); i++)
61 d[i][0] = i;
62 for (int j = 0; j <= b.size(); j++)
63 d[0][j] = j;
64 for (int i = 1; i <= a.size(); i++)
65 {
66 for (int j = 1; j <= b.size(); j++)
67 {
68 if (CharComparer(a.at(i-1), b.at(j - 1)))
69 {
70 // No change required
71 d[i][j] = d[i - 1][j - 1];
72 }
73 else
74 {
75 d[i][ j] =
76 std::min(d[i - 1][ j] + 1, // Deletion
77 std::min(d[i][ j - 1] + 1, // Insertion
78 d[i - 1][ j - 1] + 1)); // Substitution
79 }
80 }
81 }
82 73
83 // Return final value 74 // Return final value
84 int retVal = d[a.size()][ b.size()]; 75 int retVal = d(s1, s2);
85 cache->insert(lup, retVal); 76 cache->insert(lup, retVal);
86 77
87 return retVal; 78 return retVal;
88 } 79 }