annotate EditDistance.cpp @ 41:e0898020af08

Faster computation of EditDistance, by more efficient datastructure.
author Tom Fredrik Blenning Klaussen <bfg@blenning.no>
date Sun, 09 Sep 2012 15:17:00 +0200
parents f711ddb56ae7
children 4c283daa42c7
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
1 #include "EditDistance.hpp"
40
f711ddb56ae7 Sort up includes.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 34
diff changeset
2
16
06166d6c083b Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 9
diff changeset
3 #include "CompileTimeConstants.h"
06166d6c083b Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 9
diff changeset
4 #include "ConfigurationProcessing.hpp"
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
5
41
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
6 #include <boost/numeric/ublas/matrix.hpp>
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
7
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
8 #define CharComparer(A, B) (QChar(A) == QChar(B))
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
9
16
06166d6c083b Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 9
diff changeset
10 EditDistance::cacheType* EditDistance::cache = 0;
06166d6c083b Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 9
diff changeset
11 //EditDistance::cacheType EditDistance::cache;
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
12
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
13 QString EditDistance::removeDiacritics(QString in)
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
14 {
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
15 QString out;
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
16 foreach(QChar c, in) {
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
17 if (c.decompositionTag() == QChar::NoDecomposition) {
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
18 out.append(c);
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
19 }
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
20 else {
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
21 QString tmp = c.decomposition();
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
22 out.append(tmp[0]);
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
23 }
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
24 }
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
25 return out;
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
26 }
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
27
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
28 int EditDistance::Compute(QString a, QString b, bool remove) {
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
29 if (remove) {
9
b5943e4bf676 Fix up header includes.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 0
diff changeset
30 a = removeDiacritics(a);
b5943e4bf676 Fix up header includes.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 0
diff changeset
31 b = removeDiacritics(b);
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
32 }
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
33
41
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
34 if ( a == b)
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
35 return 0;
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
36
16
06166d6c083b Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 9
diff changeset
37 OrderedPair<UniqueString> lup(a, b);
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
38
16
06166d6c083b Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 9
diff changeset
39 if (cache == 0) {
06166d6c083b Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 9
diff changeset
40 QString cacheLocation = processSetupVariables(EDITDISTANCE_CACHE_LOCATION);
06166d6c083b Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 9
diff changeset
41 EditDistance::cache = new cacheType(cacheLocation, "EditLUT");
06166d6c083b Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 9
diff changeset
42 }
06166d6c083b Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 9
diff changeset
43 boost::optional<int> res = cache->value(lup);
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
44 if (res)
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
45 return *res;
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
46
41
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
47 uint s1 = a.size();
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
48 uint s2 = b.size();
34
fda70a362ed5 Remove whitespace.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 16
diff changeset
49
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
50 // Allocate distance matrix
41
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
51 boost::numeric::ublas::matrix<int> d(s1 + 1, s2 + 1);
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
52 d.clear();
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
53
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
54 // Compute distance
41
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
55 for (uint i = 0; i <= s1; ++i)
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
56 d(i, 0) = i;
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
57 for (uint j = 0; j <= s2; ++j)
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
58 d(0, j) = j;
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
59 for (uint i = 1; i <= s1; ++i) {
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
60 for (uint j = 1; j <= s2; ++j) {
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
61 if (CharComparer(a.at(i - 1), b.at(j - 1))) {
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
62 // No change required
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
63 d(i, j) = d(i - 1, j - 1);
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
64 }
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
65 else {
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
66 d(i, j) =
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
67 std::min(d(i - 1, j) + 1, // Deletion
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
68 std::min(d(i, j - 1) + 1, // Insertion
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
69 d(i - 1, j - 1) + 1)); // Substitution
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
70 }
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
71 }
41
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
72 }
34
fda70a362ed5 Remove whitespace.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 16
diff changeset
73
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
74 // Return final value
41
e0898020af08 Faster computation of EditDistance, by more efficient datastructure.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 40
diff changeset
75 int retVal = d(s1, s2);
16
06166d6c083b Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents: 9
diff changeset
76 cache->insert(lup, retVal);
0
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
77
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
78 return retVal;
a3834af36579 Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff changeset
79 }