annotate EditDistance.cpp @ 78:9744ec195be3

Encapsulate EditDistance with caching.
author Tom Fredrik Blenning Klaussen <bfg@bfgconsult.no>
date Thu, 10 Oct 2013 01:07:52 +0200
parents 4c283daa42c7
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
1 #include "EditDistance.hpp"
40
f711ddb56ae7 Sort up includes.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 34
diff changeset
2
41
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
3 #include <boost/numeric/ublas/matrix.hpp>
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
4
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
5 #define CharComparer(A, B) (QChar(A) == QChar(B))
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
6
42
4c283daa42c7 Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 41
diff changeset
7 void EditDistance::removeDiacriticsNoCopy(QString& in)
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
8 {
42
4c283daa42c7 Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 41
diff changeset
9 for(QString::iterator c = in.begin();
4c283daa42c7 Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 41
diff changeset
10 c != in.end(); ++c) {
4c283daa42c7 Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 41
diff changeset
11 if (c->decompositionTag() != QChar::NoDecomposition) {
4c283daa42c7 Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 41
diff changeset
12 QString tmp = c->decomposition();
4c283daa42c7 Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 41
diff changeset
13 *c = tmp[0];
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
14 }
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
15 }
42
4c283daa42c7 Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 41
diff changeset
16 }
4c283daa42c7 Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 41
diff changeset
17
4c283daa42c7 Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 41
diff changeset
18 QString EditDistance::removeDiacritics(const QString& in)
4c283daa42c7 Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 41
diff changeset
19 {
4c283daa42c7 Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 41
diff changeset
20 QString out = in;
4c283daa42c7 Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 41
diff changeset
21 removeDiacriticsNoCopy(out);
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
22 return out;
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
23 }
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
24
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
25 int EditDistance::Compute(QString a, QString b, bool remove) {
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
26 if (remove) {
42
4c283daa42c7 Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 41
diff changeset
27 removeDiacriticsNoCopy(a);
4c283daa42c7 Optimize diacritics removal.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 41
diff changeset
28 removeDiacriticsNoCopy(b);
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
29 }
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
30
41
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
31 uint s1 = a.size();
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
32 uint s2 = b.size();
34
fda70a362ed5 Remove whitespace.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 16
diff changeset
33
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
34 // Allocate distance matrix
41
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
35 boost::numeric::ublas::matrix<int> d(s1 + 1, s2 + 1);
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
36 d.clear();
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
37
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
38 // Compute distance
41
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
39 for (uint i = 0; i <= s1; ++i)
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
40 d(i, 0) = i;
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
41 for (uint j = 0; j <= s2; ++j)
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
42 d(0, j) = j;
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
43 for (uint i = 1; i <= s1; ++i) {
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
44 for (uint j = 1; j <= s2; ++j) {
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
45 if (CharComparer(a.at(i - 1), b.at(j - 1))) {
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
46 // No change required
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
47 d(i, j) = d(i - 1, j - 1);
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
48 }
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
49 else {
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
50 d(i, j) =
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
51 std::min(d(i - 1, j) + 1, // Deletion
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
52 std::min(d(i, j - 1) + 1, // Insertion
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
53 d(i - 1, j - 1) + 1)); // Substitution
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
54 }
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
55 }
41
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
56 }
34
fda70a362ed5 Remove whitespace.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 16
diff changeset
57
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
58 // Return final value
41
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
59 int retVal = d(s1, s2);
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
60
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
61 return retVal;
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
62 }