Mercurial > dedupe
annotate EditDistance.cpp @ 38:7905fa8a3f1b
Test for empty string.
| author | Tom Fredrik Blenning Klaussen <bfg@blenning.no> |
|---|---|
| date | Fri, 07 Sep 2012 13:07:46 +0200 |
| parents | fda70a362ed5 |
| children | f711ddb56ae7 |
| rev | line source |
|---|---|
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
1 #include "EditDistance.hpp" |
|
16
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
2 #include "CompileTimeConstants.h" |
|
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
3 #include "ConfigurationProcessing.hpp" |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
4 |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
5 #include <QtCore/QList> |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
6 |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
7 #define CharComparer(A, B) (QChar(A) == QChar(B)) |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
8 |
|
16
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
9 EditDistance::cacheType* EditDistance::cache = 0; |
|
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
10 //EditDistance::cacheType EditDistance::cache; |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
11 |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
12 QString EditDistance::removeDiacritics(QString in) |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
13 { |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
14 QString out; |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
15 foreach(QChar c, in) { |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
16 if (c.decompositionTag() == QChar::NoDecomposition) { |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
17 out.append(c); |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
18 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
19 else { |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
20 QString tmp = c.decomposition(); |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
21 out.append(tmp[0]); |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
22 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
23 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
24 return out; |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
25 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
26 |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
27 int EditDistance::Compute(QString a, QString b, bool remove) { |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
28 if (remove) { |
|
9
b5943e4bf676
Fix up header includes.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
0
diff
changeset
|
29 a = removeDiacritics(a); |
|
b5943e4bf676
Fix up header includes.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
0
diff
changeset
|
30 b = removeDiacritics(b); |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
31 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
32 |
|
16
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
33 OrderedPair<UniqueString> lup(a, b); |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
34 |
|
16
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
35 if (cache == 0) { |
|
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
36 QString cacheLocation = processSetupVariables(EDITDISTANCE_CACHE_LOCATION); |
|
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
37 EditDistance::cache = new cacheType(cacheLocation, "EditLUT"); |
|
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
38 } |
|
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
39 boost::optional<int> res = cache->value(lup); |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
40 if (res) |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
41 return *res; |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
42 |
|
34
fda70a362ed5
Remove whitespace.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
16
diff
changeset
|
43 |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
44 // Allocate distance matrix |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
45 QList<QList<int> > d; |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
46 QList<int> temp; |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
47 for (int i=0;i<b.size()+1;i++) { |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
48 temp.append(0); |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
49 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
50 for (int i=0;i<a.size()+1;i++) { |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
51 d.append(temp); |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
52 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
53 #if 0 |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
54 // Get character comparer |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
55 CharComparer isEqual = (ignoreCase) ? |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
56 (CharComparer)CharCompareIgnoreCase : CharCompare; |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
57 #endif |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
58 // Compute distance |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
59 for (int i = 0; i <= a.size(); i++) |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
60 d[i][0] = i; |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
61 for (int j = 0; j <= b.size(); j++) |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
62 d[0][j] = j; |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
63 for (int i = 1; i <= a.size(); i++) |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
64 { |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
65 for (int j = 1; j <= b.size(); j++) |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
66 { |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
67 if (CharComparer(a.at(i-1), b.at(j - 1))) |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
68 { |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
69 // No change required |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
70 d[i][j] = d[i - 1][j - 1]; |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
71 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
72 else |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
73 { |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
74 d[i][ j] = |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
75 std::min(d[i - 1][ j] + 1, // Deletion |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
76 std::min(d[i][ j - 1] + 1, // Insertion |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
77 d[i - 1][ j - 1] + 1)); // Substitution |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
78 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
79 } |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
80 } |
|
34
fda70a362ed5
Remove whitespace.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
16
diff
changeset
|
81 |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
82 // Return final value |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
83 int retVal = d[a.size()][ b.size()]; |
|
16
06166d6c083b
Add configuration processing.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
9
diff
changeset
|
84 cache->insert(lup, retVal); |
|
0
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
85 |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
86 return retVal; |
|
a3834af36579
Working with memory backend.
Tom Fredrik Blenning Klaussen <bfg@blenning.no>
parents:
diff
changeset
|
87 } |
