Mercurial > dedupe
annotate EditDistance.cpp @ 77:a827f3687c4a
Compile fix Linux, wrong capitalization.
| author | Tom Fredrik Blenning Klaussen <bfg@bfgconsult.no> |
|---|---|
| date | Sat, 16 Feb 2013 19:00:54 +0100 |
| parents | 4c283daa42c7 |
| children | 9744ec195be3 |
| rev | line source |
|---|---|
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
1 #include "EditDistance.hpp" |
|
40
f711ddb56ae7
Sort up includes.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
34
diff
changeset
|
2 |
|
16
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
3 #include "CompileTimeConstants.h" |
|
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
4 #include "ConfigurationProcessing.hpp" |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
5 |
|
41
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
6 #include <boost/numeric/ublas/matrix.hpp> |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
7 |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
8 #define CharComparer(A, B) (QChar(A) == QChar(B)) |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
9 |
|
16
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
10 EditDistance::cacheType* EditDistance::cache = 0; |
|
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
11 //EditDistance::cacheType EditDistance::cache; |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
12 |
|
42
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
13 void EditDistance::removeDiacriticsNoCopy(QString& in) |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
14 { |
|
42
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
15 for(QString::iterator c = in.begin(); |
|
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
16 c != in.end(); ++c) { |
|
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
17 if (c->decompositionTag() != QChar::NoDecomposition) { |
|
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
18 QString tmp = c->decomposition(); |
|
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
19 *c = tmp[0]; |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
20 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
21 } |
|
42
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
22 } |
|
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
23 |
|
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
24 QString EditDistance::removeDiacritics(const QString& in) |
|
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
25 { |
|
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
26 QString out = in; |
|
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
27 removeDiacriticsNoCopy(out); |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
28 return out; |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
29 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
30 |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
31 int EditDistance::Compute(QString a, QString b, bool remove) { |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
32 if (remove) { |
|
42
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
33 removeDiacriticsNoCopy(a); |
|
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
34 removeDiacriticsNoCopy(b); |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
35 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
36 |
|
41
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
37 if ( a == b) |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
38 return 0; |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
39 |
|
16
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
40 OrderedPair<UniqueString> lup(a, b); |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
41 |
|
16
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
42 if (cache == 0) { |
|
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
43 QString cacheLocation = processSetupVariables(EDITDISTANCE_CACHE_LOCATION); |
|
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
44 EditDistance::cache = new cacheType(cacheLocation, "EditLUT"); |
|
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
45 } |
|
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
46 boost::optional<int> res = cache->value(lup); |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
47 if (res) |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
48 return *res; |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
49 |
|
41
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
50 uint s1 = a.size(); |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
51 uint s2 = b.size(); |
|
34
fda70a362ed5
Remove whitespace.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
16
diff
changeset
|
52 |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
53 // Allocate distance matrix |
|
41
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
54 boost::numeric::ublas::matrix<int> d(s1 + 1, s2 + 1); |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
55 d.clear(); |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
56 |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
57 // Compute distance |
|
41
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
58 for (uint i = 0; i <= s1; ++i) |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
59 d(i, 0) = i; |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
60 for (uint j = 0; j <= s2; ++j) |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
61 d(0, j) = j; |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
62 for (uint i = 1; i <= s1; ++i) { |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
63 for (uint j = 1; j <= s2; ++j) { |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
64 if (CharComparer(a.at(i - 1), b.at(j - 1))) { |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
65 // No change required |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
66 d(i, j) = d(i - 1, j - 1); |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
67 } |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
68 else { |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
69 d(i, j) = |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
70 std::min(d(i - 1, j) + 1, // Deletion |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
71 std::min(d(i, j - 1) + 1, // Insertion |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
72 d(i - 1, j - 1) + 1)); // Substitution |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
73 } |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
74 } |
|
41
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
75 } |
|
34
fda70a362ed5
Remove whitespace.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
16
diff
changeset
|
76 |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
77 // Return final value |
|
41
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
78 int retVal = d(s1, s2); |
|
16
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
79 cache->insert(lup, retVal); |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
80 |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
81 return retVal; |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
82 } |
