Mercurial > dedupe
annotate EditDistance.cpp @ 115:404795616b1e default tip
Added a lot of common files to ignore
| author | Tom Fredrik Blenning Klaussen <bfg@bfgconsult.no> |
|---|---|
| date | Sat, 25 Mar 2017 17:43:57 +0100 |
| parents | 9744ec195be3 |
| children |
| rev | line source |
|---|---|
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
1 #include "EditDistance.hpp" |
|
40
f711ddb56ae7
Sort up includes.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
34
diff
changeset
|
2 |
|
41
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
3 #include <boost/numeric/ublas/matrix.hpp> |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
4 |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
5 #define CharComparer(A, B) (QChar(A) == QChar(B)) |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
6 |
|
42
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
7 void EditDistance::removeDiacriticsNoCopy(QString& in) |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
8 { |
|
42
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
9 for(QString::iterator c = in.begin(); |
|
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
10 c != in.end(); ++c) { |
|
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
11 if (c->decompositionTag() != QChar::NoDecomposition) { |
|
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
12 QString tmp = c->decomposition(); |
|
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
13 *c = tmp[0]; |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
14 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
15 } |
|
42
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
16 } |
|
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
17 |
|
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
18 QString EditDistance::removeDiacritics(const QString& in) |
|
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
19 { |
|
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
20 QString out = in; |
|
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
21 removeDiacriticsNoCopy(out); |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
22 return out; |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
23 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
24 |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
25 int EditDistance::Compute(QString a, QString b, bool remove) { |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
26 if (remove) { |
|
42
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
27 removeDiacriticsNoCopy(a); |
|
4c283daa42c7
Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
41
diff
changeset
|
28 removeDiacriticsNoCopy(b); |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
29 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
30 |
|
41
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
31 uint s1 = a.size(); |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
32 uint s2 = b.size(); |
|
34
fda70a362ed5
Remove whitespace.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
16
diff
changeset
|
33 |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
34 // Allocate distance matrix |
|
41
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
35 boost::numeric::ublas::matrix<int> d(s1 + 1, s2 + 1); |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
36 d.clear(); |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
37 |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
38 // Compute distance |
|
41
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
39 for (uint i = 0; i <= s1; ++i) |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
40 d(i, 0) = i; |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
41 for (uint j = 0; j <= s2; ++j) |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
42 d(0, j) = j; |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
43 for (uint i = 1; i <= s1; ++i) { |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
44 for (uint j = 1; j <= s2; ++j) { |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
45 if (CharComparer(a.at(i - 1), b.at(j - 1))) { |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
46 // No change required |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
47 d(i, j) = d(i - 1, j - 1); |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
48 } |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
49 else { |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
50 d(i, j) = |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
51 std::min(d(i - 1, j) + 1, // Deletion |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
52 std::min(d(i, j - 1) + 1, // Insertion |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
53 d(i - 1, j - 1) + 1)); // Substitution |
|
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
54 } |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
55 } |
|
41
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
56 } |
|
34
fda70a362ed5
Remove whitespace.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
16
diff
changeset
|
57 |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
58 // Return final value |
|
41
e0898020af08
Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
40
diff
changeset
|
59 int retVal = d(s1, s2); |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
60 |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
61 return retVal; |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
62 } |
