Mercurial > dedupe
annotate EditDistance.cpp @ 41:e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
| author | Tom Fredrik Blenning Klaussen <bfg@blenning.no> |
|---|---|
| date | Sun, 09 Sep 2012 15:17:00 +0200 |
| parents | f711ddb56ae7 |
| children | 4c283daa42c7 |
| rev | line source |
|---|---|
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
1 #include "EditDistance.hpp" |
|
40
f711ddb56ae7
Sort up includes.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
34
diff
changeset
|
2 |
|
16
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
3 #include "CompileTimeConstants.h" |
|
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
4 #include "ConfigurationProcessing.hpp" |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
5 |
|
41
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
6 #include <boost/numeric/ublas/matrix.hpp> |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
7 |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
8 #define CharComparer(A, B) (QChar(A) == QChar(B)) |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
9 |
|
16
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
10 EditDistance::cacheType* EditDistance::cache = 0; |
|
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
11 //EditDistance::cacheType EditDistance::cache; |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
12 |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
13 QString EditDistance::removeDiacritics(QString in) |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
14 { |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
15 QString out; |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
16 foreach(QChar c, in) { |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
17 if (c.decompositionTag() == QChar::NoDecomposition) { |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
18 out.append(c); |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
19 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
20 else { |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
21 QString tmp = c.decomposition(); |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
22 out.append(tmp[0]); |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
23 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
24 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
25 return out; |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
26 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
27 |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
28 int EditDistance::Compute(QString a, QString b, bool remove) { |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
29 if (remove) { |
|
9
b5943e4bf676
Fix up header includes.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
0
diff
changeset
|
30 a = removeDiacritics(a); |
|
b5943e4bf676
Fix up header includes.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
0
diff
changeset
|
31 b = removeDiacritics(b); |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
32 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
33 |
|
41
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
34 if ( a == b) |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
35 return 0; |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
36 |
|
16
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
37 OrderedPair<UniqueString> lup(a, b); |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
38 |
|
16
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
39 if (cache == 0) { |
|
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
40 QString cacheLocation = processSetupVariables(EDITDISTANCE_CACHE_LOCATION); |
|
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
41 EditDistance::cache = new cacheType(cacheLocation, "EditLUT"); |
|
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
42 } |
|
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
43 boost::optional<int> res = cache->value(lup); |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
44 if (res) |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
45 return *res; |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
46 |
|
41
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
47 uint s1 = a.size(); |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
48 uint s2 = b.size(); |
|
34
fda70a362ed5
Remove whitespace.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
16
diff
changeset
|
49 |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
50 // Allocate distance matrix |
|
41
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
51 boost::numeric::ublas::matrix<int> d(s1 + 1, s2 + 1); |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
52 d.clear(); |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
53 |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
54 // Compute distance |
|
41
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
55 for (uint i = 0; i <= s1; ++i) |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
56 d(i, 0) = i; |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
57 for (uint j = 0; j <= s2; ++j) |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
58 d(0, j) = j; |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
59 for (uint i = 1; i <= s1; ++i) { |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
60 for (uint j = 1; j <= s2; ++j) { |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
61 if (CharComparer(a.at(i - 1), b.at(j - 1))) { |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
62 // No change required |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
63 d(i, j) = d(i - 1, j - 1); |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
64 } |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
65 else { |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
66 d(i, j) = |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
67 std::min(d(i - 1, j) + 1, // Deletion |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
68 std::min(d(i, j - 1) + 1, // Insertion |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
69 d(i - 1, j - 1) + 1)); // Substitution |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
70 } |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
71 } |
|
41
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
72 } |
|
34
fda70a362ed5
Remove whitespace.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
16
diff
changeset
|
73 |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
74 // Return final value |
|
41
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
75 int retVal = d(s1, s2); |
|
16
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
76 cache->insert(lup, retVal); |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
77 |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
78 return retVal; |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
79 } |
